Diffstat (limited to 'arch/x86')
-rw-r--r--  arch/x86/ia32/ia32entry.S | 2
-rw-r--r--  arch/x86/include/asm/amd_iommu_types.h | 3
-rw-r--r--  arch/x86/include/asm/kvm.h | 17
-rw-r--r--  arch/x86/include/asm/kvm_emulate.h | 46
-rw-r--r--  arch/x86/include/asm/kvm_host.h | 71
-rw-r--r--  arch/x86/include/asm/lguest_hcall.h | 29
-rw-r--r--  arch/x86/include/asm/svm.h | 9
-rw-r--r--  arch/x86/kernel/amd_iommu.c | 20
-rw-r--r--  arch/x86/kernel/amd_iommu_init.c | 48
-rw-r--r--  arch/x86/kernel/aperture_64.c | 15
-rw-r--r--  arch/x86/kernel/crash.c | 6
-rw-r--r--  arch/x86/kernel/dumpstack.h | 8
-rw-r--r--  arch/x86/kernel/pci-gart_64.c | 3
-rw-r--r--  arch/x86/kvm/emulate.c | 1247
-rw-r--r--  arch/x86/kvm/i8259.c | 53
-rw-r--r--  arch/x86/kvm/irq.h | 1
-rw-r--r--  arch/x86/kvm/kvm_timer.h | 4
-rw-r--r--  arch/x86/kvm/mmu.c | 198
-rw-r--r--  arch/x86/kvm/mmutrace.h | 13
-rw-r--r--  arch/x86/kvm/paging_tmpl.h | 37
-rw-r--r--  arch/x86/kvm/svm.c | 916
-rw-r--r--  arch/x86/kvm/timer.c | 3
-rw-r--r--  arch/x86/kvm/trace.h | 165
-rw-r--r--  arch/x86/kvm/vmx.c | 297
-rw-r--r--  arch/x86/kvm/x86.c | 1506
-rw-r--r--  arch/x86/kvm/x86.h | 7
-rw-r--r--  arch/x86/lguest/boot.c | 61
-rw-r--r--  arch/x86/lguest/i386_head.S | 2
28 files changed, 2764 insertions(+), 2023 deletions(-)
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 59b4556a5b92..e790bc1fbfa3 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -626,7 +626,7 @@ ia32_sys_call_table:
 	.quad stub32_sigreturn
 	.quad stub32_clone		/* 120 */
 	.quad sys_setdomainname
-	.quad sys_uname
+	.quad sys_newuname
 	.quad sys_modify_ldt
 	.quad compat_sys_adjtimex
 	.quad sys32_mprotect		/* 125 */
diff --git a/arch/x86/include/asm/amd_iommu_types.h b/arch/x86/include/asm/amd_iommu_types.h
index ba19ad4c47d0..86a0ff0aeac7 100644
--- a/arch/x86/include/asm/amd_iommu_types.h
+++ b/arch/x86/include/asm/amd_iommu_types.h
@@ -21,6 +21,7 @@
 #define _ASM_X86_AMD_IOMMU_TYPES_H
 
 #include <linux/types.h>
+#include <linux/mutex.h>
 #include <linux/list.h>
 #include <linux/spinlock.h>
 
@@ -140,6 +141,7 @@
 
 /* constants to configure the command buffer */
 #define CMD_BUFFER_SIZE    8192
+#define CMD_BUFFER_UNINITIALIZED 1
 #define CMD_BUFFER_ENTRIES 512
 #define MMIO_CMD_SIZE_SHIFT 56
 #define MMIO_CMD_SIZE_512 (0x9ULL << MMIO_CMD_SIZE_SHIFT)
@@ -237,6 +239,7 @@ struct protection_domain {
 	struct list_head list;  /* for list of all protection domains */
 	struct list_head dev_list; /* List of all devices in this domain */
 	spinlock_t lock;	/* mostly used to lock the page table*/
+	struct mutex api_lock;	/* protect page tables in the iommu-api path */
 	u16 id;			/* the domain id written to the device table */
 	int mode;		/* paging mode (0-6 levels) */
 	u64 *pt_root;		/* page table root pointer */
diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h
index f46b79f6c16c..ff90055c7f0b 100644
--- a/arch/x86/include/asm/kvm.h
+++ b/arch/x86/include/asm/kvm.h
@@ -21,6 +21,7 @@
 #define __KVM_HAVE_PIT_STATE2
 #define __KVM_HAVE_XEN_HVM
 #define __KVM_HAVE_VCPU_EVENTS
+#define __KVM_HAVE_DEBUGREGS
 
 /* Architectural interrupt line count. */
 #define KVM_NR_INTERRUPTS 256
@@ -257,6 +258,11 @@ struct kvm_reinject_control {
 /* When set in flags, include corresponding fields on KVM_SET_VCPU_EVENTS */
 #define KVM_VCPUEVENT_VALID_NMI_PENDING	0x00000001
 #define KVM_VCPUEVENT_VALID_SIPI_VECTOR	0x00000002
+#define KVM_VCPUEVENT_VALID_SHADOW	0x00000004
+
+/* Interrupt shadow states */
+#define KVM_X86_SHADOW_INT_MOV_SS	0x01
+#define KVM_X86_SHADOW_INT_STI		0x02
 
 /* for KVM_GET/SET_VCPU_EVENTS */
 struct kvm_vcpu_events {
@@ -271,7 +277,7 @@ struct kvm_vcpu_events {
 		__u8 injected;
 		__u8 nr;
 		__u8 soft;
-		__u8 pad;
+		__u8 shadow;
 	} interrupt;
 	struct {
 		__u8 injected;
@@ -284,4 +290,13 @@ struct kvm_vcpu_events {
 	__u32 reserved[10];
 };
 
+/* for KVM_GET/SET_DEBUGREGS */
+struct kvm_debugregs {
+	__u64 db[4];
+	__u64 dr6;
+	__u64 dr7;
+	__u64 flags;
+	__u64 reserved[9];
+};
+
 #endif /* _ASM_X86_KVM_H */
diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index 7a6f54fa13ba..0b2729bf2070 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -11,6 +11,8 @@
 #ifndef _ASM_X86_KVM_X86_EMULATE_H
 #define _ASM_X86_KVM_X86_EMULATE_H
 
+#include <asm/desc_defs.h>
+
 struct x86_emulate_ctxt;
 
 /*
@@ -63,6 +65,15 @@ struct x86_emulate_ops {
 			unsigned int bytes, struct kvm_vcpu *vcpu, u32 *error);
 
 	/*
+	 * write_std: Write bytes of standard (non-emulated/special) memory.
+	 *            Used for descriptor writing.
+	 *  @addr:  [IN ] Linear address to which to write.
+	 *  @val:   [OUT] Value write to memory, zero-extended to 'u_long'.
+	 *  @bytes: [IN ] Number of bytes to write to memory.
+	 */
+	int (*write_std)(unsigned long addr, void *val,
+			 unsigned int bytes, struct kvm_vcpu *vcpu, u32 *error);
+	/*
 	 * fetch: Read bytes of standard (non-emulated/special) memory.
 	 *        Used for instruction fetch.
 	 * @addr:  [IN ] Linear address from which to read.
@@ -109,6 +120,23 @@ struct x86_emulate_ops {
 			unsigned int bytes,
 			struct kvm_vcpu *vcpu);
 
+	int (*pio_in_emulated)(int size, unsigned short port, void *val,
+			       unsigned int count, struct kvm_vcpu *vcpu);
+
+	int (*pio_out_emulated)(int size, unsigned short port, const void *val,
+				unsigned int count, struct kvm_vcpu *vcpu);
+
+	bool (*get_cached_descriptor)(struct desc_struct *desc,
+				      int seg, struct kvm_vcpu *vcpu);
+	void (*set_cached_descriptor)(struct desc_struct *desc,
+				      int seg, struct kvm_vcpu *vcpu);
+	u16 (*get_segment_selector)(int seg, struct kvm_vcpu *vcpu);
+	void (*set_segment_selector)(u16 sel, int seg, struct kvm_vcpu *vcpu);
+	void (*get_gdt)(struct desc_ptr *dt, struct kvm_vcpu *vcpu);
+	ulong (*get_cr)(int cr, struct kvm_vcpu *vcpu);
+	void (*set_cr)(int cr, ulong val, struct kvm_vcpu *vcpu);
+	int (*cpl)(struct kvm_vcpu *vcpu);
+	void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags);
 };
 
 /* Type, address-of, and value of an instruction's operand. */
@@ -124,6 +152,12 @@ struct fetch_cache {
 	unsigned long end;
 };
 
+struct read_cache {
+	u8 data[1024];
+	unsigned long pos;
+	unsigned long end;
+};
+
 struct decode_cache {
 	u8 twobyte;
 	u8 b;
@@ -139,7 +173,7 @@ struct decode_cache {
 	u8 seg_override;
 	unsigned int d;
 	unsigned long regs[NR_VCPU_REGS];
-	unsigned long eip, eip_orig;
+	unsigned long eip;
 	/* modrm */
 	u8 modrm;
 	u8 modrm_mod;
@@ -151,16 +185,15 @@ struct decode_cache {
 	void *modrm_ptr;
 	unsigned long modrm_val;
 	struct fetch_cache fetch;
+	struct read_cache io_read;
 };
 
-#define X86_SHADOW_INT_MOV_SS  1
-#define X86_SHADOW_INT_STI     2
-
 struct x86_emulate_ctxt {
 	/* Register state before/after emulation. */
 	struct kvm_vcpu *vcpu;
 
 	unsigned long eflags;
+	unsigned long eip; /* eip before instruction emulation */
 	/* Emulated execution mode, represented by an X86EMUL_MODE value. */
 	int mode;
 	u32 cs_base;
@@ -168,6 +201,7 @@ struct x86_emulate_ctxt {
 	/* interruptibility state, as a result of execution of STI or MOV SS */
 	int interruptibility;
 
+	bool restart; /* restart string instruction after writeback */
 	/* decode cache */
 	struct decode_cache decode;
 };
@@ -194,5 +228,9 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt,
 		    struct x86_emulate_ops *ops);
 int x86_emulate_insn(struct x86_emulate_ctxt *ctxt,
 		     struct x86_emulate_ops *ops);
+int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
+			 struct x86_emulate_ops *ops,
+			 u16 tss_selector, int reason,
+			 bool has_error_code, u32 error_code);
 
 #endif /* _ASM_X86_KVM_X86_EMULATE_H */
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 06d9e79ca37d..d47d087568fe 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -171,14 +171,13 @@ struct kvm_pte_chain {
 union kvm_mmu_page_role {
 	unsigned word;
 	struct {
-		unsigned glevels:4;
 		unsigned level:4;
+		unsigned cr4_pae:1;
 		unsigned quadrant:2;
 		unsigned pad_for_nice_hex_output:6;
 		unsigned direct:1;
 		unsigned access:3;
 		unsigned invalid:1;
-		unsigned cr4_pge:1;
 		unsigned nxe:1;
 	};
 };
@@ -187,8 +186,6 @@ struct kvm_mmu_page {
 	struct list_head link;
 	struct hlist_node hash_link;
 
-	struct list_head oos_link;
-
 	/*
 	 * The following two entries are used to key the shadow page in the
 	 * hash table.
@@ -204,9 +201,9 @@ struct kvm_mmu_page {
 	 * in this shadow page.
 	 */
 	DECLARE_BITMAP(slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS);
-	int multimapped;         /* More than one parent_pte? */
-	int root_count;          /* Currently serving as active root */
+	bool multimapped;        /* More than one parent_pte? */
 	bool unsync;
+	int root_count;          /* Currently serving as active root */
 	unsigned int unsync_children;
 	union {
 		u64 *parent_pte;               /* !multimapped */
@@ -224,14 +221,9 @@ struct kvm_pv_mmu_op_buffer {
 
 struct kvm_pio_request {
 	unsigned long count;
-	int cur_count;
-	gva_t guest_gva;
 	int in;
 	int port;
 	int size;
-	int string;
-	int down;
-	int rep;
 };
 
 /*
@@ -362,8 +354,8 @@ struct kvm_vcpu_arch {
 	u64 *mce_banks;
 
 	/* used for guest single stepping over the given code position */
-	u16 singlestep_cs;
 	unsigned long singlestep_rip;
+
 	/* fields used by HYPER-V emulation */
 	u64 hv_vapic;
 };
@@ -389,6 +381,7 @@ struct kvm_arch {
 	unsigned int n_free_mmu_pages;
 	unsigned int n_requested_mmu_pages;
 	unsigned int n_alloc_mmu_pages;
+	atomic_t invlpg_counter;
 	struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
 	/*
 	 * Hash table of struct kvm_mmu_page.
@@ -461,11 +454,6 @@ struct kvm_vcpu_stat {
 	u32 nmi_injections;
 };
 
-struct descriptor_table {
-	u16 limit;
-	unsigned long base;
-} __attribute__((packed));
-
 struct kvm_x86_ops {
 	int (*cpu_has_kvm_support)(void);          /* __init */
 	int (*disabled_by_bios)(void);             /* __init */
@@ -503,12 +491,11 @@ struct kvm_x86_ops {
 	void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3);
 	void (*set_cr4)(struct kvm_vcpu *vcpu, unsigned long cr4);
 	void (*set_efer)(struct kvm_vcpu *vcpu, u64 efer);
-	void (*get_idt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt);
-	void (*set_idt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt);
-	void (*get_gdt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt);
-	void (*set_gdt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt);
-	int (*get_dr)(struct kvm_vcpu *vcpu, int dr, unsigned long *dest);
-	int (*set_dr)(struct kvm_vcpu *vcpu, int dr, unsigned long value);
+	void (*get_idt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
+	void (*set_idt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
+	void (*get_gdt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
+	void (*set_gdt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
+	void (*set_dr7)(struct kvm_vcpu *vcpu, unsigned long value);
 	void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg);
 	unsigned long (*get_rflags)(struct kvm_vcpu *vcpu);
 	void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags);
@@ -587,23 +574,14 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
 void kvm_report_emulation_failure(struct kvm_vcpu *cvpu, const char *context);
 void realmode_lgdt(struct kvm_vcpu *vcpu, u16 size, unsigned long address);
 void realmode_lidt(struct kvm_vcpu *vcpu, u16 size, unsigned long address);
-void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
-		   unsigned long *rflags);
 
-unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr);
-void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long value,
-		     unsigned long *rflags);
 void kvm_enable_efer_bits(u64);
 int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *data);
 int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
 
 struct x86_emulate_ctxt;
 
-int kvm_emulate_pio(struct kvm_vcpu *vcpu, int in,
-		    int size, unsigned port);
-int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, int in,
-			   int size, unsigned long count, int down,
-			   gva_t address, int rep, unsigned port);
+int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port);
 void kvm_emulate_cpuid(struct kvm_vcpu *vcpu);
 int kvm_emulate_halt(struct kvm_vcpu *vcpu);
 int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address);
@@ -616,12 +594,15 @@ int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr,
 void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
 int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg);
 
-int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason);
+int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason,
+		    bool has_error_code, u32 error_code);
 
 void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
 void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3);
 void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
 void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8);
+int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val);
+int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val);
 unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu);
 void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw);
 void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l);
@@ -649,8 +630,6 @@ int emulator_write_emulated(unsigned long addr,
 			    unsigned int bytes,
 			    struct kvm_vcpu *vcpu);
 
-unsigned long segment_base(u16 selector);
-
 void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu);
 void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 		       const u8 *new, int bytes,
@@ -675,7 +654,6 @@ void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva);
 void kvm_enable_tdp(void);
 void kvm_disable_tdp(void);
 
-int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3);
 int complete_pio(struct kvm_vcpu *vcpu);
 bool kvm_check_iopl(struct kvm_vcpu *vcpu);
 
@@ -724,23 +702,6 @@ static inline void kvm_load_ldt(u16 sel)
 	asm("lldt %0" : : "rm"(sel));
 }
 
-static inline void kvm_get_idt(struct descriptor_table *table)
-{
-	asm("sidt %0" : "=m"(*table));
-}
-
-static inline void kvm_get_gdt(struct descriptor_table *table)
-{
-	asm("sgdt %0" : "=m"(*table));
-}
-
-static inline unsigned long kvm_read_tr_base(void)
-{
-	u16 tr;
-	asm("str %0" : "=g"(tr));
-	return segment_base(tr);
-}
-
 #ifdef CONFIG_X86_64
 static inline unsigned long read_msr(unsigned long msr)
 {
@@ -826,4 +787,6 @@ int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
 void kvm_define_shared_msr(unsigned index, u32 msr);
 void kvm_set_shared_msr(unsigned index, u64 val, u64 mask);
 
+bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip);
+
 #endif /* _ASM_X86_KVM_HOST_H */
diff --git a/arch/x86/include/asm/lguest_hcall.h b/arch/x86/include/asm/lguest_hcall.h
index ba0eed8aa1a6..b60f2924c413 100644
--- a/arch/x86/include/asm/lguest_hcall.h
+++ b/arch/x86/include/asm/lguest_hcall.h
@@ -28,22 +28,39 @@
 
 #ifndef __ASSEMBLY__
 #include <asm/hw_irq.h>
-#include <asm/kvm_para.h>
 
 /*G:030
  * But first, how does our Guest contact the Host to ask for privileged
  * operations? There are two ways: the direct way is to make a "hypercall",
  * to make requests of the Host Itself.
 *
- * We use the KVM hypercall mechanism, though completely different hypercall
- * numbers. Seventeen hypercalls are available: the hypercall number is put in
- * the %eax register, and the arguments (when required) are placed in %ebx,
- * %ecx, %edx and %esi. If a return value makes sense, it's returned in %eax.
+ * Our hypercall mechanism uses the highest unused trap code (traps 32 and
+ * above are used by real hardware interrupts). Seventeen hypercalls are
+ * available: the hypercall number is put in the %eax register, and the
+ * arguments (when required) are placed in %ebx, %ecx, %edx and %esi.
+ * If a return value makes sense, it's returned in %eax.
 *
 * Grossly invalid calls result in Sudden Death at the hands of the vengeful
 * Host, rather than returning failure. This reflects Winston Churchill's
 * definition of a gentleman: "someone who is only rude intentionally".
-:*/
+ */
+static inline unsigned long
+hcall(unsigned long call,
+      unsigned long arg1, unsigned long arg2, unsigned long arg3,
+      unsigned long arg4)
+{
+	/* "int" is the Intel instruction to trigger a trap. */
+	asm volatile("int $" __stringify(LGUEST_TRAP_ENTRY)
+		     /* The call in %eax (aka "a") might be overwritten */
+		     : "=a"(call)
+		     /* The arguments are in %eax, %ebx, %ecx, %edx & %esi */
+		     : "a"(call), "b"(arg1), "c"(arg2), "d"(arg3), "S"(arg4)
+		     /* "memory" means this might write somewhere in memory.
+		      * This isn't true for all calls, but it's safe to tell
+		      * gcc that it might happen so it doesn't get clever. */
+		     : "memory");
+	return call;
+}
 
 /* Can't use our min() macro here: needs to be a constant */
 #define LGUEST_IRQS (NR_IRQS < 32 ? NR_IRQS: 32)
diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index 38638cd2fa4c..0e831059ac5a 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -81,7 +81,9 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
 	u32 event_inj_err;
 	u64 nested_cr3;
 	u64 lbr_ctl;
-	u8 reserved_5[832];
+	u64 reserved_5;
+	u64 next_rip;
+	u8 reserved_6[816];
 };
 
 
@@ -115,6 +117,10 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
 #define SVM_IOIO_SIZE_MASK (7 << SVM_IOIO_SIZE_SHIFT)
 #define SVM_IOIO_ASIZE_MASK (7 << SVM_IOIO_ASIZE_SHIFT)
 
+#define SVM_VM_CR_VALID_MASK	0x001fULL
+#define SVM_VM_CR_SVM_LOCK_MASK 0x0008ULL
+#define SVM_VM_CR_SVM_DIS_MASK  0x0010ULL
+
 struct __attribute__ ((__packed__)) vmcb_seg {
 	u16 selector;
 	u16 attrib;
@@ -238,6 +244,7 @@ struct __attribute__ ((__packed__)) vmcb {
 
 #define SVM_EXITINFOSHIFT_TS_REASON_IRET 36
 #define SVM_EXITINFOSHIFT_TS_REASON_JMP 38
+#define SVM_EXITINFOSHIFT_TS_HAS_ERROR_CODE 44
 
 #define SVM_EXIT_READ_CR0	0x000
 #define SVM_EXIT_READ_CR3	0x003
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index f3dadb571d9b..f854d89b7edf 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -118,7 +118,7 @@ static bool check_device(struct device *dev)
 		return false;
 
 	/* No device or no PCI device */
-	if (!dev || dev->bus != &pci_bus_type)
+	if (dev->bus != &pci_bus_type)
 		return false;
 
 	devid = get_device_id(dev);
@@ -392,6 +392,7 @@ static int __iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd)
 	u32 tail, head;
 	u8 *target;
 
+	WARN_ON(iommu->cmd_buf_size & CMD_BUFFER_UNINITIALIZED);
 	tail = readl(iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
 	target = iommu->cmd_buf + tail;
 	memcpy_toio(target, cmd, sizeof(*cmd));
@@ -2186,7 +2187,7 @@ static void prealloc_protection_domains(void)
 	struct dma_ops_domain *dma_dom;
 	u16 devid;
 
-	while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
+	for_each_pci_dev(dev) {
 
 		/* Do we handle this device? */
 		if (!check_device(&dev->dev))
@@ -2298,7 +2299,7 @@ static void cleanup_domain(struct protection_domain *domain)
 	list_for_each_entry_safe(dev_data, next, &domain->dev_list, list) {
 		struct device *dev = dev_data->dev;
 
-		do_detach(dev);
+		__detach_device(dev);
 		atomic_set(&dev_data->bind, 0);
 	}
 
@@ -2327,6 +2328,7 @@ static struct protection_domain *protection_domain_alloc(void)
 		return NULL;
 
 	spin_lock_init(&domain->lock);
+	mutex_init(&domain->api_lock);
 	domain->id = domain_id_alloc();
 	if (!domain->id)
 		goto out_err;
@@ -2379,9 +2381,7 @@ static void amd_iommu_domain_destroy(struct iommu_domain *dom)
 
 	free_pagetable(domain);
 
-	domain_id_free(domain->id);
-
-	kfree(domain);
+	protection_domain_free(domain);
 
 	dom->priv = NULL;
 }
@@ -2456,6 +2456,8 @@ static int amd_iommu_map_range(struct iommu_domain *dom,
 	iova  &= PAGE_MASK;
 	paddr &= PAGE_MASK;
 
+	mutex_lock(&domain->api_lock);
+
 	for (i = 0; i < npages; ++i) {
 		ret = iommu_map_page(domain, iova, paddr, prot, PM_MAP_4k);
 		if (ret)
@@ -2465,6 +2467,8 @@ static int amd_iommu_map_range(struct iommu_domain *dom,
 		paddr += PAGE_SIZE;
 	}
 
+	mutex_unlock(&domain->api_lock);
+
 	return 0;
 }
 
@@ -2477,12 +2481,16 @@ static void amd_iommu_unmap_range(struct iommu_domain *dom,
 
 	iova  &= PAGE_MASK;
 
+	mutex_lock(&domain->api_lock);
+
 	for (i = 0; i < npages; ++i) {
 		iommu_unmap_page(domain, iova, PM_MAP_4k);
 		iova  += PAGE_SIZE;
 	}
 
 	iommu_flush_tlb_pde(domain);
+
+	mutex_unlock(&domain->api_lock);
 }
 
 static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom,
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index 42f5350b908f..6360abf993d4 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -138,9 +138,9 @@ int amd_iommus_present;
 bool amd_iommu_np_cache __read_mostly;
 
 /*
- * Set to true if ACPI table parsing and hardware intialization went properly
+ * The ACPI table parsing functions set this variable on an error
 */
-static bool amd_iommu_initialized;
+static int __initdata amd_iommu_init_err;
 
 /*
 * List of protection domains - used during resume
@@ -391,9 +391,11 @@ static int __init find_last_devid_acpi(struct acpi_table_header *table)
 	 */
 	for (i = 0; i < table->length; ++i)
 		checksum += p[i];
-	if (checksum != 0)
+	if (checksum != 0) {
 		/* ACPI table corrupt */
-		return -ENODEV;
+		amd_iommu_init_err = -ENODEV;
+		return 0;
+	}
 
 	p += IVRS_HEADER_LENGTH;
 
@@ -436,7 +438,7 @@ static u8 * __init alloc_command_buffer(struct amd_iommu *iommu)
 	if (cmd_buf == NULL)
 		return NULL;
 
-	iommu->cmd_buf_size = CMD_BUFFER_SIZE;
+	iommu->cmd_buf_size = CMD_BUFFER_SIZE | CMD_BUFFER_UNINITIALIZED;
 
 	return cmd_buf;
 }
@@ -472,12 +474,13 @@ static void iommu_enable_command_buffer(struct amd_iommu *iommu)
 		    &entry, sizeof(entry));
 
 	amd_iommu_reset_cmd_buffer(iommu);
+	iommu->cmd_buf_size &= ~(CMD_BUFFER_UNINITIALIZED);
 }
 
 static void __init free_command_buffer(struct amd_iommu *iommu)
 {
 	free_pages((unsigned long)iommu->cmd_buf,
-		   get_order(iommu->cmd_buf_size));
+		   get_order(iommu->cmd_buf_size & ~(CMD_BUFFER_UNINITIALIZED)));
 }
 
 /* allocates the memory where the IOMMU will log its events to */
@@ -920,11 +923,16 @@ static int __init init_iommu_all(struct acpi_table_header *table)
 				    h->mmio_phys);
 
 			iommu = kzalloc(sizeof(struct amd_iommu), GFP_KERNEL);
-			if (iommu == NULL)
-				return -ENOMEM;
+			if (iommu == NULL) {
+				amd_iommu_init_err = -ENOMEM;
+				return 0;
+			}
+
 			ret = init_iommu_one(iommu, h);
-			if (ret)
-				return ret;
+			if (ret) {
+				amd_iommu_init_err = ret;
+				return 0;
+			}
 			break;
 		default:
 			break;
@@ -934,8 +942,6 @@ static int __init init_iommu_all(struct acpi_table_header *table)
 	}
 	WARN_ON(p != end);
 
-	amd_iommu_initialized = true;
-
 	return 0;
 }
 
@@ -1211,6 +1217,10 @@ static int __init amd_iommu_init(void)
 	if (acpi_table_parse("IVRS", find_last_devid_acpi) != 0)
 		return -ENODEV;
 
+	ret = amd_iommu_init_err;
+	if (ret)
+		goto out;
+
 	dev_table_size     = tbl_size(DEV_TABLE_ENTRY_SIZE);
 	alias_table_size   = tbl_size(ALIAS_TABLE_ENTRY_SIZE);
 	rlookup_table_size = tbl_size(RLOOKUP_TABLE_ENTRY_SIZE);
@@ -1270,12 +1280,19 @@ static int __init amd_iommu_init(void)
 	if (acpi_table_parse("IVRS", init_iommu_all) != 0)
 		goto free;
 
-	if (!amd_iommu_initialized)
+	if (amd_iommu_init_err) {
+		ret = amd_iommu_init_err;
 		goto free;
+	}
 
 	if (acpi_table_parse("IVRS", init_memory_definitions) != 0)
 		goto free;
 
+	if (amd_iommu_init_err) {
+		ret = amd_iommu_init_err;
+		goto free;
+	}
+
 	ret = sysdev_class_register(&amd_iommu_sysdev_class);
 	if (ret)
 		goto free;
@@ -1288,6 +1305,8 @@ static int __init amd_iommu_init(void)
 	if (ret)
 		goto free;
 
+	enable_iommus();
+
 	if (iommu_pass_through)
 		ret = amd_iommu_init_passthrough();
 	else
@@ -1300,8 +1319,6 @@ static int __init amd_iommu_init(void)
 
 	amd_iommu_init_notifier();
 
-	enable_iommus();
-
 	if (iommu_pass_through)
 		goto out;
 
@@ -1315,6 +1332,7 @@ out:
 	return ret;
 
 free:
+	disable_iommus();
 
 	amd_iommu_uninit_devices();
 
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index 3704997e8b25..b5d8b0bcf235 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -393,6 +393,7 @@ void __init gart_iommu_hole_init(void)
 	for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) {
 		int bus;
 		int dev_base, dev_limit;
+		u32 ctl;
 
 		bus = bus_dev_ranges[i].bus;
 		dev_base = bus_dev_ranges[i].dev_base;
@@ -406,7 +407,19 @@ void __init gart_iommu_hole_init(void)
 			gart_iommu_aperture = 1;
 			x86_init.iommu.iommu_init = gart_iommu_init;
 
-			aper_order = (read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL) >> 1) & 7;
+			ctl = read_pci_config(bus, slot, 3,
+					      AMD64_GARTAPERTURECTL);
+
+			/*
+			 * Before we do anything else disable the GART. It may
+			 * still be enabled if we boot into a crash-kernel here.
+			 * Reconfiguring the GART while it is enabled could have
+			 * unknown side-effects.
+			 */
+			ctl &= ~GARTEN;
+			write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, ctl);
+
+			aper_order = (ctl >> 1) & 7;
 			aper_size = (32 * 1024 * 1024) << aper_order;
 			aper_base = read_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE) & 0x7fff;
 			aper_base <<= 25;
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index a4849c10a77e..ebd4c51d096a 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -27,7 +27,6 @@
 #include <asm/cpu.h>
 #include <asm/reboot.h>
 #include <asm/virtext.h>
-#include <asm/x86_init.h>
 
 #if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC)
 
@@ -103,10 +102,5 @@ void native_machine_crash_shutdown(struct pt_regs *regs)
 #ifdef CONFIG_HPET_TIMER
 	hpet_disable();
 #endif
-
-#ifdef CONFIG_X86_64
-	x86_platform.iommu_shutdown();
-#endif
-
 	crash_save_cpu(regs, safe_smp_processor_id());
 }
diff --git a/arch/x86/kernel/dumpstack.h b/arch/x86/kernel/dumpstack.h
index e39e77168a37..e1a93be4fd44 100644
--- a/arch/x86/kernel/dumpstack.h
+++ b/arch/x86/kernel/dumpstack.h
@@ -14,6 +14,8 @@
 #define get_bp(bp) asm("movq %%rbp, %0" : "=r" (bp) :)
 #endif
 
+#include <linux/uaccess.h>
+
 extern void
 show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
 		unsigned long *stack, unsigned long bp, char *log_lvl);
@@ -42,8 +44,10 @@ static inline unsigned long rewind_frame_pointer(int n)
 	get_bp(frame);
 
 #ifdef CONFIG_FRAME_POINTER
-	while (n--)
-		frame = frame->next_frame;
+	while (n--) {
+		if (probe_kernel_address(&frame->next_frame, frame))
+			break;
+	}
 #endif
 
 	return (unsigned long)frame;
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c
index 68cd24f9deae..0f7f130caa67 100644
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/pci-gart_64.c
@@ -565,6 +565,9 @@ static void enable_gart_translations(void)
 
 		enable_gart_translation(dev, __pa(agp_gatt_table));
 	}
+
+	/* Flush the GART-TLB to remove stale entries */
+	k8_flush_garts();
 }
 
 /*
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 4dade6ac0827..5ac0bb465ed6 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -33,6 +33,7 @@
33#include <asm/kvm_emulate.h> 33#include <asm/kvm_emulate.h>
34 34
35#include "x86.h" 35#include "x86.h"
36#include "tss.h"
36 37
37/* 38/*
38 * Opcode effective-address decode tables. 39 * Opcode effective-address decode tables.
@@ -50,6 +51,8 @@
50#define DstReg (2<<1) /* Register operand. */ 51#define DstReg (2<<1) /* Register operand. */
51#define DstMem (3<<1) /* Memory operand. */ 52#define DstMem (3<<1) /* Memory operand. */
52#define DstAcc (4<<1) /* Destination Accumulator */ 53#define DstAcc (4<<1) /* Destination Accumulator */
54#define DstDI (5<<1) /* Destination is in ES:(E)DI */
55#define DstMem64 (6<<1) /* 64bit memory operand */
53#define DstMask (7<<1) 56#define DstMask (7<<1)
54/* Source operand type. */ 57/* Source operand type. */
55#define SrcNone (0<<4) /* No source operand. */ 58#define SrcNone (0<<4) /* No source operand. */
@@ -63,6 +66,7 @@
63#define SrcOne (7<<4) /* Implied '1' */ 66#define SrcOne (7<<4) /* Implied '1' */
64#define SrcImmUByte (8<<4) /* 8-bit unsigned immediate operand. */ 67#define SrcImmUByte (8<<4) /* 8-bit unsigned immediate operand. */
65#define SrcImmU (9<<4) /* Immediate operand, unsigned */ 68#define SrcImmU (9<<4) /* Immediate operand, unsigned */
69#define SrcSI (0xa<<4) /* Source is in the DS:RSI */
66#define SrcMask (0xf<<4) 70#define SrcMask (0xf<<4)
67/* Generic ModRM decode. */ 71/* Generic ModRM decode. */
68#define ModRM (1<<8) 72#define ModRM (1<<8)
@@ -85,6 +89,9 @@
85#define Src2ImmByte (2<<29) 89#define Src2ImmByte (2<<29)
86#define Src2One (3<<29) 90#define Src2One (3<<29)
87#define Src2Imm16 (4<<29) 91#define Src2Imm16 (4<<29)
92#define Src2Mem16 (5<<29) /* Used for Ep encoding. First argument has to be
93 in memory and second argument is located
94 immediately after the first one in memory. */
88#define Src2Mask (7<<29) 95#define Src2Mask (7<<29)
89 96
90enum { 97enum {
@@ -147,8 +154,8 @@ static u32 opcode_table[256] = {
147 0, 0, 0, 0, 154 0, 0, 0, 0,
148 /* 0x68 - 0x6F */ 155 /* 0x68 - 0x6F */
149 SrcImm | Mov | Stack, 0, SrcImmByte | Mov | Stack, 0, 156 SrcImm | Mov | Stack, 0, SrcImmByte | Mov | Stack, 0,
150 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* insb, insw/insd */ 157 DstDI | ByteOp | Mov | String, DstDI | Mov | String, /* insb, insw/insd */
151 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* outsb, outsw/outsd */ 158 SrcSI | ByteOp | ImplicitOps | String, SrcSI | ImplicitOps | String, /* outsb, outsw/outsd */
152 /* 0x70 - 0x77 */ 159 /* 0x70 - 0x77 */
153 SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte, 160 SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte,
154 SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte, 161 SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte,
@@ -173,12 +180,12 @@ static u32 opcode_table[256] = {
173 /* 0xA0 - 0xA7 */ 180 /* 0xA0 - 0xA7 */
174 ByteOp | DstReg | SrcMem | Mov | MemAbs, DstReg | SrcMem | Mov | MemAbs, 181 ByteOp | DstReg | SrcMem | Mov | MemAbs, DstReg | SrcMem | Mov | MemAbs,
175 ByteOp | DstMem | SrcReg | Mov | MemAbs, DstMem | SrcReg | Mov | MemAbs, 182 ByteOp | DstMem | SrcReg | Mov | MemAbs, DstMem | SrcReg | Mov | MemAbs,
176 ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String, 183 ByteOp | SrcSI | DstDI | Mov | String, SrcSI | DstDI | Mov | String,
177 ByteOp | ImplicitOps | String, ImplicitOps | String, 184 ByteOp | SrcSI | DstDI | String, SrcSI | DstDI | String,
178 /* 0xA8 - 0xAF */ 185 /* 0xA8 - 0xAF */
179 0, 0, ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String, 186 0, 0, ByteOp | DstDI | Mov | String, DstDI | Mov | String,
180 ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String, 187 ByteOp | SrcSI | DstAcc | Mov | String, SrcSI | DstAcc | Mov | String,
181 ByteOp | ImplicitOps | String, ImplicitOps | String, 188 ByteOp | DstDI | String, DstDI | String,
182 /* 0xB0 - 0xB7 */ 189 /* 0xB0 - 0xB7 */
183 ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov, 190 ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov,
184 ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov, 191 ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov,
@@ -204,13 +211,13 @@ static u32 opcode_table[256] = {
204 0, 0, 0, 0, 0, 0, 0, 0, 211 0, 0, 0, 0, 0, 0, 0, 0,
205 /* 0xE0 - 0xE7 */ 212 /* 0xE0 - 0xE7 */
206 0, 0, 0, 0, 213 0, 0, 0, 0,
207 ByteOp | SrcImmUByte, SrcImmUByte, 214 ByteOp | SrcImmUByte | DstAcc, SrcImmUByte | DstAcc,
208 ByteOp | SrcImmUByte, SrcImmUByte, 215 ByteOp | SrcImmUByte | DstAcc, SrcImmUByte | DstAcc,
209 /* 0xE8 - 0xEF */ 216 /* 0xE8 - 0xEF */
210 SrcImm | Stack, SrcImm | ImplicitOps, 217 SrcImm | Stack, SrcImm | ImplicitOps,
211 SrcImmU | Src2Imm16 | No64, SrcImmByte | ImplicitOps, 218 SrcImmU | Src2Imm16 | No64, SrcImmByte | ImplicitOps,
212 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, 219 SrcNone | ByteOp | DstAcc, SrcNone | DstAcc,
213 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, 220 SrcNone | ByteOp | DstAcc, SrcNone | DstAcc,
214 /* 0xF0 - 0xF7 */ 221 /* 0xF0 - 0xF7 */
215 0, 0, 0, 0, 222 0, 0, 0, 0,
216 ImplicitOps | Priv, ImplicitOps, Group | Group3_Byte, Group | Group3, 223 ImplicitOps | Priv, ImplicitOps, Group | Group3_Byte, Group | Group3,
@@ -343,7 +350,8 @@ static u32 group_table[] = {
343 [Group5*8] = 350 [Group5*8] =
344 DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM, 351 DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM,
345 SrcMem | ModRM | Stack, 0, 352 SrcMem | ModRM | Stack, 0,
346 SrcMem | ModRM | Stack, 0, SrcMem | ModRM | Stack, 0, 353 SrcMem | ModRM | Stack, SrcMem | ModRM | Src2Mem16 | ImplicitOps,
354 SrcMem | ModRM | Stack, 0,
347 [Group7*8] = 355 [Group7*8] =
348 0, 0, ModRM | SrcMem | Priv, ModRM | SrcMem | Priv, 356 0, 0, ModRM | SrcMem | Priv, ModRM | SrcMem | Priv,
349 SrcNone | ModRM | DstMem | Mov, 0, 357 SrcNone | ModRM | DstMem | Mov, 0,
@@ -353,14 +361,14 @@ static u32 group_table[] = {
353 DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM | Lock, 361 DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM | Lock,
354 DstMem | SrcImmByte | ModRM | Lock, DstMem | SrcImmByte | ModRM | Lock, 362 DstMem | SrcImmByte | ModRM | Lock, DstMem | SrcImmByte | ModRM | Lock,
355 [Group9*8] = 363 [Group9*8] =
356 0, ImplicitOps | ModRM | Lock, 0, 0, 0, 0, 0, 0, 364 0, DstMem64 | ModRM | Lock, 0, 0, 0, 0, 0, 0,
357}; 365};
358 366
359static u32 group2_table[] = { 367static u32 group2_table[] = {
360 [Group7*8] = 368 [Group7*8] =
361 SrcNone | ModRM | Priv, 0, 0, SrcNone | ModRM, 369 SrcNone | ModRM | Priv, 0, 0, SrcNone | ModRM | Priv,
362 SrcNone | ModRM | DstMem | Mov, 0, 370 SrcNone | ModRM | DstMem | Mov, 0,
363 SrcMem16 | ModRM | Mov, 0, 371 SrcMem16 | ModRM | Mov | Priv, 0,
364 [Group9*8] = 372 [Group9*8] =
365 0, 0, 0, 0, 0, 0, 0, 0, 373 0, 0, 0, 0, 0, 0, 0, 0,
366}; 374};
@@ -562,7 +570,7 @@ static u32 group2_table[] = {
562#define insn_fetch(_type, _size, _eip) \ 570#define insn_fetch(_type, _size, _eip) \
563({ unsigned long _x; \ 571({ unsigned long _x; \
564 rc = do_insn_fetch(ctxt, ops, (_eip), &_x, (_size)); \ 572 rc = do_insn_fetch(ctxt, ops, (_eip), &_x, (_size)); \
565 if (rc != 0) \ 573 if (rc != X86EMUL_CONTINUE) \
566 goto done; \ 574 goto done; \
567 (_eip) += (_size); \ 575 (_eip) += (_size); \
568 (_type)_x; \ 576 (_type)_x; \
@@ -638,40 +646,40 @@ static unsigned long ss_base(struct x86_emulate_ctxt *ctxt)
638 646
639static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt, 647static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt,
640 struct x86_emulate_ops *ops, 648 struct x86_emulate_ops *ops,
641 unsigned long linear, u8 *dest) 649 unsigned long eip, u8 *dest)
642{ 650{
643 struct fetch_cache *fc = &ctxt->decode.fetch; 651 struct fetch_cache *fc = &ctxt->decode.fetch;
644 int rc; 652 int rc;
645 int size; 653 int size, cur_size;
646 654
647 if (linear < fc->start || linear >= fc->end) { 655 if (eip == fc->end) {
648 size = min(15UL, PAGE_SIZE - offset_in_page(linear)); 656 cur_size = fc->end - fc->start;
649 rc = ops->fetch(linear, fc->data, size, ctxt->vcpu, NULL); 657 size = min(15UL - cur_size, PAGE_SIZE - offset_in_page(eip));
650 if (rc) 658 rc = ops->fetch(ctxt->cs_base + eip, fc->data + cur_size,
659 size, ctxt->vcpu, NULL);
660 if (rc != X86EMUL_CONTINUE)
651 return rc; 661 return rc;
652 fc->start = linear; 662 fc->end += size;
653 fc->end = linear + size;
654 } 663 }
655 *dest = fc->data[linear - fc->start]; 664 *dest = fc->data[eip - fc->start];
656 return 0; 665 return X86EMUL_CONTINUE;
657} 666}
658 667
659static int do_insn_fetch(struct x86_emulate_ctxt *ctxt, 668static int do_insn_fetch(struct x86_emulate_ctxt *ctxt,
660 struct x86_emulate_ops *ops, 669 struct x86_emulate_ops *ops,
661 unsigned long eip, void *dest, unsigned size) 670 unsigned long eip, void *dest, unsigned size)
662{ 671{
663 int rc = 0; 672 int rc;
664 673
665 /* x86 instructions are limited to 15 bytes. */ 674 /* x86 instructions are limited to 15 bytes. */
666 if (eip + size - ctxt->decode.eip_orig > 15) 675 if (eip + size - ctxt->eip > 15)
667 return X86EMUL_UNHANDLEABLE; 676 return X86EMUL_UNHANDLEABLE;
668 eip += ctxt->cs_base;
669 while (size--) { 677 while (size--) {
670 rc = do_fetch_insn_byte(ctxt, ops, eip++, dest++); 678 rc = do_fetch_insn_byte(ctxt, ops, eip++, dest++);
671 if (rc) 679 if (rc != X86EMUL_CONTINUE)
672 return rc; 680 return rc;
673 } 681 }
674 return 0; 682 return X86EMUL_CONTINUE;
675} 683}
676 684
677/* 685/*
@@ -702,7 +710,7 @@ static int read_descriptor(struct x86_emulate_ctxt *ctxt,
702 *address = 0; 710 *address = 0;
703 rc = ops->read_std((unsigned long)ptr, (unsigned long *)size, 2, 711 rc = ops->read_std((unsigned long)ptr, (unsigned long *)size, 2,
704 ctxt->vcpu, NULL); 712 ctxt->vcpu, NULL);
705 if (rc) 713 if (rc != X86EMUL_CONTINUE)
706 return rc; 714 return rc;
707 rc = ops->read_std((unsigned long)ptr + 2, address, op_bytes, 715 rc = ops->read_std((unsigned long)ptr + 2, address, op_bytes,
708 ctxt->vcpu, NULL); 716 ctxt->vcpu, NULL);
@@ -782,7 +790,7 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
782 struct decode_cache *c = &ctxt->decode; 790 struct decode_cache *c = &ctxt->decode;
783 u8 sib; 791 u8 sib;
784 int index_reg = 0, base_reg = 0, scale; 792 int index_reg = 0, base_reg = 0, scale;
785 int rc = 0; 793 int rc = X86EMUL_CONTINUE;
786 794
787 if (c->rex_prefix) { 795 if (c->rex_prefix) {
788 c->modrm_reg = (c->rex_prefix & 4) << 1; /* REX.R */ 796 c->modrm_reg = (c->rex_prefix & 4) << 1; /* REX.R */
@@ -895,7 +903,7 @@ static int decode_abs(struct x86_emulate_ctxt *ctxt,
895 struct x86_emulate_ops *ops) 903 struct x86_emulate_ops *ops)
896{ 904{
897 struct decode_cache *c = &ctxt->decode; 905 struct decode_cache *c = &ctxt->decode;
898 int rc = 0; 906 int rc = X86EMUL_CONTINUE;
899 907
900 switch (c->ad_bytes) { 908 switch (c->ad_bytes) {
901 case 2: 909 case 2:
@@ -916,14 +924,18 @@ int
916x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) 924x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
917{ 925{
918 struct decode_cache *c = &ctxt->decode; 926 struct decode_cache *c = &ctxt->decode;
919 int rc = 0; 927 int rc = X86EMUL_CONTINUE;
920 int mode = ctxt->mode; 928 int mode = ctxt->mode;
921 int def_op_bytes, def_ad_bytes, group; 929 int def_op_bytes, def_ad_bytes, group;
922 930
923 /* Shadow copy of register state. Committed on successful emulation. */
924 931
932 /* we cannot decode insn before we complete previous rep insn */
933 WARN_ON(ctxt->restart);
934
935 /* Shadow copy of register state. Committed on successful emulation. */
925 memset(c, 0, sizeof(struct decode_cache)); 936 memset(c, 0, sizeof(struct decode_cache));
926 c->eip = c->eip_orig = kvm_rip_read(ctxt->vcpu); 937 c->eip = ctxt->eip;
938 c->fetch.start = c->fetch.end = c->eip;
927 ctxt->cs_base = seg_base(ctxt, VCPU_SREG_CS); 939 ctxt->cs_base = seg_base(ctxt, VCPU_SREG_CS);
928 memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs); 940 memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
929 941
@@ -1015,11 +1027,6 @@ done_prefixes:
1015 } 1027 }
1016 } 1028 }
1017 1029
1018 if (mode == X86EMUL_MODE_PROT64 && (c->d & No64)) {
1019 kvm_report_emulation_failure(ctxt->vcpu, "invalid x86/64 instruction");
1020 return -1;
1021 }
1022
1023 if (c->d & Group) { 1030 if (c->d & Group) {
1024 group = c->d & GroupMask; 1031 group = c->d & GroupMask;
1025 c->modrm = insn_fetch(u8, 1, c->eip); 1032 c->modrm = insn_fetch(u8, 1, c->eip);
@@ -1046,7 +1053,7 @@ done_prefixes:
1046 rc = decode_modrm(ctxt, ops); 1053 rc = decode_modrm(ctxt, ops);
1047 else if (c->d & MemAbs) 1054 else if (c->d & MemAbs)
1048 rc = decode_abs(ctxt, ops); 1055 rc = decode_abs(ctxt, ops);
1049 if (rc) 1056 if (rc != X86EMUL_CONTINUE)
1050 goto done; 1057 goto done;
1051 1058
1052 if (!c->has_seg_override) 1059 if (!c->has_seg_override)
@@ -1057,6 +1064,10 @@ done_prefixes:
1057 1064
1058 if (c->ad_bytes != 8) 1065 if (c->ad_bytes != 8)
1059 c->modrm_ea = (u32)c->modrm_ea; 1066 c->modrm_ea = (u32)c->modrm_ea;
1067
1068 if (c->rip_relative)
1069 c->modrm_ea += c->eip;
1070
1060 /* 1071 /*
1061 * Decode and fetch the source operand: register, memory 1072 * Decode and fetch the source operand: register, memory
1062 * or immediate. 1073 * or immediate.
@@ -1091,6 +1102,8 @@ done_prefixes:
1091 break; 1102 break;
1092 } 1103 }
1093 c->src.type = OP_MEM; 1104 c->src.type = OP_MEM;
1105 c->src.ptr = (unsigned long *)c->modrm_ea;
1106 c->src.val = 0;
1094 break; 1107 break;
1095 case SrcImm: 1108 case SrcImm:
1096 case SrcImmU: 1109 case SrcImmU:
@@ -1139,6 +1152,14 @@ done_prefixes:
1139 c->src.bytes = 1; 1152 c->src.bytes = 1;
1140 c->src.val = 1; 1153 c->src.val = 1;
1141 break; 1154 break;
1155 case SrcSI:
1156 c->src.type = OP_MEM;
1157 c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1158 c->src.ptr = (unsigned long *)
1159 register_address(c, seg_override_base(ctxt, c),
1160 c->regs[VCPU_REGS_RSI]);
1161 c->src.val = 0;
1162 break;
1142 } 1163 }
1143 1164
1144 /* 1165 /*
@@ -1168,6 +1189,12 @@ done_prefixes:
1168 c->src2.bytes = 1; 1189 c->src2.bytes = 1;
1169 c->src2.val = 1; 1190 c->src2.val = 1;
1170 break; 1191 break;
1192 case Src2Mem16:
1193 c->src2.type = OP_MEM;
1194 c->src2.bytes = 2;
1195 c->src2.ptr = (unsigned long *)(c->modrm_ea + c->src.bytes);
1196 c->src2.val = 0;
1197 break;
1171 } 1198 }
1172 1199
1173 /* Decode and fetch the destination operand: register or memory. */ 1200 /* Decode and fetch the destination operand: register or memory. */
@@ -1180,6 +1207,7 @@ done_prefixes:
1180 c->twobyte && (c->b == 0xb6 || c->b == 0xb7)); 1207 c->twobyte && (c->b == 0xb6 || c->b == 0xb7));
1181 break; 1208 break;
1182 case DstMem: 1209 case DstMem:
1210 case DstMem64:
1183 if ((c->d & ModRM) && c->modrm_mod == 3) { 1211 if ((c->d & ModRM) && c->modrm_mod == 3) {
1184 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; 1212 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1185 c->dst.type = OP_REG; 1213 c->dst.type = OP_REG;
@@ -1188,12 +1216,24 @@ done_prefixes:
1188 break; 1216 break;
1189 } 1217 }
1190 c->dst.type = OP_MEM; 1218 c->dst.type = OP_MEM;
1219 c->dst.ptr = (unsigned long *)c->modrm_ea;
1220 if ((c->d & DstMask) == DstMem64)
1221 c->dst.bytes = 8;
1222 else
1223 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1224 c->dst.val = 0;
1225 if (c->d & BitOp) {
1226 unsigned long mask = ~(c->dst.bytes * 8 - 1);
1227
1228 c->dst.ptr = (void *)c->dst.ptr +
1229 (c->src.val & mask) / 8;
1230 }
1191 break; 1231 break;
1192 case DstAcc: 1232 case DstAcc:
1193 c->dst.type = OP_REG; 1233 c->dst.type = OP_REG;
1194 c->dst.bytes = c->op_bytes; 1234 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1195 c->dst.ptr = &c->regs[VCPU_REGS_RAX]; 1235 c->dst.ptr = &c->regs[VCPU_REGS_RAX];
1196 switch (c->op_bytes) { 1236 switch (c->dst.bytes) {
1197 case 1: 1237 case 1:
1198 c->dst.val = *(u8 *)c->dst.ptr; 1238 c->dst.val = *(u8 *)c->dst.ptr;
1199 break; 1239 break;
@@ -1203,18 +1243,248 @@ done_prefixes:
1203 case 4: 1243 case 4:
1204 c->dst.val = *(u32 *)c->dst.ptr; 1244 c->dst.val = *(u32 *)c->dst.ptr;
1205 break; 1245 break;
1246 case 8:
1247 c->dst.val = *(u64 *)c->dst.ptr;
1248 break;
1206 } 1249 }
1207 c->dst.orig_val = c->dst.val; 1250 c->dst.orig_val = c->dst.val;
1208 break; 1251 break;
1252 case DstDI:
1253 c->dst.type = OP_MEM;
1254 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1255 c->dst.ptr = (unsigned long *)
1256 register_address(c, es_base(ctxt),
1257 c->regs[VCPU_REGS_RDI]);
1258 c->dst.val = 0;
1259 break;
1209 } 1260 }
1210 1261
1211 if (c->rip_relative)
1212 c->modrm_ea += c->eip;
1213
1214done: 1262done:
1215 return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0; 1263 return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
1216} 1264}
1217 1265
1266static int pio_in_emulated(struct x86_emulate_ctxt *ctxt,
1267 struct x86_emulate_ops *ops,
1268 unsigned int size, unsigned short port,
1269 void *dest)
1270{
1271 struct read_cache *rc = &ctxt->decode.io_read;
1272
1273 if (rc->pos == rc->end) { /* refill pio read ahead */
1274 struct decode_cache *c = &ctxt->decode;
1275 unsigned int in_page, n;
1276 unsigned int count = c->rep_prefix ?
1277 address_mask(c, c->regs[VCPU_REGS_RCX]) : 1;
1278 in_page = (ctxt->eflags & EFLG_DF) ?
1279 offset_in_page(c->regs[VCPU_REGS_RDI]) :
1280 PAGE_SIZE - offset_in_page(c->regs[VCPU_REGS_RDI]);
1281 n = min(min(in_page, (unsigned int)sizeof(rc->data)) / size,
1282 count);
1283 if (n == 0)
1284 n = 1;
1285 rc->pos = rc->end = 0;
1286 if (!ops->pio_in_emulated(size, port, rc->data, n, ctxt->vcpu))
1287 return 0;
1288 rc->end = n * size;
1289 }
1290
1291 memcpy(dest, rc->data + rc->pos, size);
1292 rc->pos += size;
1293 return 1;
1294}
1295
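The pio_in_emulated() helper above batches a REP IN's port accesses: when its cache is drained it asks the backend for as many elements as fit in the current page and in the buffer, then serves later iterations from memory. Below is a standalone user-space sketch of that consume-or-refill pattern; fill_from_port() and the other names are illustrative, not the kernel's API.

    #include <stdio.h>
    #include <string.h>

    struct read_cache_sketch {
        unsigned char data[1024];
        unsigned int pos, end;
    };

    /* Stand-in backend: pretends to read 'n' elements of 'size' bytes. */
    static unsigned int fill_from_port(unsigned char *buf, unsigned int size,
                                       unsigned int n)
    {
        memset(buf, 0xab, (size_t)size * n);    /* fake device data */
        return n;
    }

    static void cached_port_read(struct read_cache_sketch *rc, unsigned int size,
                                 unsigned int remaining, void *dest)
    {
        if (rc->pos == rc->end) {               /* buffer empty: refill it */
            unsigned int n = sizeof(rc->data) / size;

            if (n > remaining)
                n = remaining;
            if (n == 0)
                n = 1;
            rc->pos = 0;
            rc->end = fill_from_port(rc->data, size, n) * size;
        }
        memcpy(dest, rc->data + rc->pos, size); /* serve one element */
        rc->pos += size;
    }

    int main(void)
    {
        struct read_cache_sketch rc = { .pos = 0, .end = 0 };
        unsigned short v;
        unsigned int i;

        for (i = 0; i < 4; i++) {
            cached_port_read(&rc, sizeof(v), 4 - i, &v);
            printf("iteration %u read 0x%x\n", i, v);
        }
        return 0;
    }
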
1296static u32 desc_limit_scaled(struct desc_struct *desc)
1297{
1298 u32 limit = get_desc_limit(desc);
1299
1300 return desc->g ? (limit << 12) | 0xfff : limit;
1301}
1302
1303static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt,
1304 struct x86_emulate_ops *ops,
1305 u16 selector, struct desc_ptr *dt)
1306{
1307 if (selector & 1 << 2) {
1308 struct desc_struct desc;
1309 memset (dt, 0, sizeof *dt);
1310 if (!ops->get_cached_descriptor(&desc, VCPU_SREG_LDTR, ctxt->vcpu))
1311 return;
1312
1313 dt->size = desc_limit_scaled(&desc); /* what if limit > 65535? */
1314 dt->address = get_desc_base(&desc);
1315 } else
1316 ops->get_gdt(dt, ctxt->vcpu);
1317}
1318
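For reference, desc_limit_scaled() and get_descriptor_table_ptr() above lean on two architectural rules: bit 2 of a selector (TI) picks the LDT over the GDT and its low two bits are the RPL, while a descriptor whose granularity bit is set expresses its 20-bit limit in 4 KiB units. A small standalone illustration with made-up values:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint16_t selector = 0x2b;                /* example selector value */
        unsigned index    = selector >> 3;       /* descriptor table index */
        int use_ldt       = selector & (1 << 2); /* TI bit: LDT if set */
        unsigned rpl      = selector & 3;        /* requested privilege level */

        uint32_t raw_limit = 0xfffff;            /* 20-bit limit field */
        int g = 1;                               /* granularity: 4 KiB units */
        uint32_t limit = g ? (raw_limit << 12) | 0xfff : raw_limit;

        printf("index=%u table=%s rpl=%u scaled limit=0x%x\n",
               index, use_ldt ? "LDT" : "GDT", rpl, (unsigned)limit);
        return 0;
    }
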
1319/* allowed only for 8-byte segment descriptors */
1320static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt,
1321 struct x86_emulate_ops *ops,
1322 u16 selector, struct desc_struct *desc)
1323{
1324 struct desc_ptr dt;
1325 u16 index = selector >> 3;
1326 int ret;
1327 u32 err;
1328 ulong addr;
1329
1330 get_descriptor_table_ptr(ctxt, ops, selector, &dt);
1331
1332 if (dt.size < index * 8 + 7) {
1333 kvm_inject_gp(ctxt->vcpu, selector & 0xfffc);
1334 return X86EMUL_PROPAGATE_FAULT;
1335 }
1336 addr = dt.address + index * 8;
1337 ret = ops->read_std(addr, desc, sizeof *desc, ctxt->vcpu, &err);
1338 if (ret == X86EMUL_PROPAGATE_FAULT)
1339 kvm_inject_page_fault(ctxt->vcpu, addr, err);
1340
1341 return ret;
1342}
1343
1344/* allowed only for 8-byte segment descriptors */
1345static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt,
1346 struct x86_emulate_ops *ops,
1347 u16 selector, struct desc_struct *desc)
1348{
1349 struct desc_ptr dt;
1350 u16 index = selector >> 3;
1351 u32 err;
1352 ulong addr;
1353 int ret;
1354
1355 get_descriptor_table_ptr(ctxt, ops, selector, &dt);
1356
1357 if (dt.size < index * 8 + 7) {
1358 kvm_inject_gp(ctxt->vcpu, selector & 0xfffc);
1359 return X86EMUL_PROPAGATE_FAULT;
1360 }
1361
1362 addr = dt.address + index * 8;
1363 ret = ops->write_std(addr, desc, sizeof *desc, ctxt->vcpu, &err);
1364 if (ret == X86EMUL_PROPAGATE_FAULT)
1365 kvm_inject_page_fault(ctxt->vcpu, addr, err);
1366
1367 return ret;
1368}
1369
1370static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
1371 struct x86_emulate_ops *ops,
1372 u16 selector, int seg)
1373{
1374 struct desc_struct seg_desc;
1375 u8 dpl, rpl, cpl;
1376 unsigned err_vec = GP_VECTOR;
1377 u32 err_code = 0;
1378 bool null_selector = !(selector & ~0x3); /* 0000-0003 are null */
1379 int ret;
1380
1381 memset(&seg_desc, 0, sizeof seg_desc);
1382
1383 if ((seg <= VCPU_SREG_GS && ctxt->mode == X86EMUL_MODE_VM86)
1384 || ctxt->mode == X86EMUL_MODE_REAL) {
1385 /* set real mode segment descriptor */
1386 set_desc_base(&seg_desc, selector << 4);
1387 set_desc_limit(&seg_desc, 0xffff);
1388 seg_desc.type = 3;
1389 seg_desc.p = 1;
1390 seg_desc.s = 1;
1391 goto load;
1392 }
1393
1394 /* NULL selector is not valid for TR, CS and SS */
1395 if ((seg == VCPU_SREG_CS || seg == VCPU_SREG_SS || seg == VCPU_SREG_TR)
1396 && null_selector)
1397 goto exception;
1398
1399 /* TR should be in GDT only */
1400 if (seg == VCPU_SREG_TR && (selector & (1 << 2)))
1401 goto exception;
1402
1403 if (null_selector) /* for NULL selector skip all following checks */
1404 goto load;
1405
1406 ret = read_segment_descriptor(ctxt, ops, selector, &seg_desc);
1407 if (ret != X86EMUL_CONTINUE)
1408 return ret;
1409
1410 err_code = selector & 0xfffc;
1411 err_vec = GP_VECTOR;
1412
1413 /* can't load system descriptor into segment selector */
1414 if (seg <= VCPU_SREG_GS && !seg_desc.s)
1415 goto exception;
1416
1417 if (!seg_desc.p) {
1418 err_vec = (seg == VCPU_SREG_SS) ? SS_VECTOR : NP_VECTOR;
1419 goto exception;
1420 }
1421
1422 rpl = selector & 3;
1423 dpl = seg_desc.dpl;
1424 cpl = ops->cpl(ctxt->vcpu);
1425
1426 switch (seg) {
1427 case VCPU_SREG_SS:
1428 /*
1429 * segment is not a writable data segment or segment
1430 * selector's RPL != CPL or segment descriptor's DPL != CPL
1431 */
1432 if (rpl != cpl || (seg_desc.type & 0xa) != 0x2 || dpl != cpl)
1433 goto exception;
1434 break;
1435 case VCPU_SREG_CS:
1436 if (!(seg_desc.type & 8))
1437 goto exception;
1438
1439 if (seg_desc.type & 4) {
1440 /* conforming */
1441 if (dpl > cpl)
1442 goto exception;
1443 } else {
1444 /* nonconforming */
1445 if (rpl > cpl || dpl != cpl)
1446 goto exception;
1447 }
1448 /* CS(RPL) <- CPL */
1449 selector = (selector & 0xfffc) | cpl;
1450 break;
1451 case VCPU_SREG_TR:
1452 if (seg_desc.s || (seg_desc.type != 1 && seg_desc.type != 9))
1453 goto exception;
1454 break;
1455 case VCPU_SREG_LDTR:
1456 if (seg_desc.s || seg_desc.type != 2)
1457 goto exception;
1458 break;
1459 default: /* DS, ES, FS, or GS */
1460 /*
1461 * segment is not a data or readable code segment or
1462 * ((segment is a data or nonconforming code segment)
1463 * and (both RPL and CPL > DPL))
1464 */
1465 if ((seg_desc.type & 0xa) == 0x8 ||
1466 (((seg_desc.type & 0xc) != 0xc) &&
1467 (rpl > dpl && cpl > dpl)))
1468 goto exception;
1469 break;
1470 }
1471
1472 if (seg_desc.s) {
1473 /* mark segment as accessed */
1474 seg_desc.type |= 1;
1475 ret = write_segment_descriptor(ctxt, ops, selector, &seg_desc);
1476 if (ret != X86EMUL_CONTINUE)
1477 return ret;
1478 }
1479load:
1480 ops->set_segment_selector(selector, seg, ctxt->vcpu);
1481 ops->set_cached_descriptor(&seg_desc, seg, ctxt->vcpu);
1482 return X86EMUL_CONTINUE;
1483exception:
1484 kvm_queue_exception_e(ctxt->vcpu, err_vec, err_code);
1485 return X86EMUL_PROPAGATE_FAULT;
1486}
1487
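The CS case in load_segment_descriptor() above separates conforming from nonconforming code segments: a conforming segment only needs DPL <= CPL, a nonconforming one needs RPL <= CPL and DPL == CPL. A minimal standalone model of just that check (not the emulator's code):

    #include <stdbool.h>
    #include <stdio.h>

    /* type is the 4-bit descriptor type: bit 3 = code, bit 2 = conforming */
    static bool cs_load_ok(unsigned type, unsigned rpl, unsigned dpl, unsigned cpl)
    {
        if (!(type & 8))                    /* must be a code segment */
            return false;
        if (type & 4)                       /* conforming */
            return dpl <= cpl;
        return rpl <= cpl && dpl == cpl;    /* nonconforming */
    }

    int main(void)
    {
        printf("%d\n", cs_load_ok(0xb, 3, 3, 3));   /* ring-3 code at CPL 3: ok */
        printf("%d\n", cs_load_ok(0xb, 3, 0, 3));   /* DPL 0 from CPL 3: rejected */
        return 0;
    }
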
1218static inline void emulate_push(struct x86_emulate_ctxt *ctxt) 1488static inline void emulate_push(struct x86_emulate_ctxt *ctxt)
1219{ 1489{
1220 struct decode_cache *c = &ctxt->decode; 1490 struct decode_cache *c = &ctxt->decode;
@@ -1251,7 +1521,7 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt,
1251 int rc; 1521 int rc;
1252 unsigned long val, change_mask; 1522 unsigned long val, change_mask;
1253 int iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT; 1523 int iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT;
1254 int cpl = kvm_x86_ops->get_cpl(ctxt->vcpu); 1524 int cpl = ops->cpl(ctxt->vcpu);
1255 1525
1256 rc = emulate_pop(ctxt, ops, &val, len); 1526 rc = emulate_pop(ctxt, ops, &val, len);
1257 if (rc != X86EMUL_CONTINUE) 1527 if (rc != X86EMUL_CONTINUE)
@@ -1306,10 +1576,10 @@ static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt,
1306 int rc; 1576 int rc;
1307 1577
1308 rc = emulate_pop(ctxt, ops, &selector, c->op_bytes); 1578 rc = emulate_pop(ctxt, ops, &selector, c->op_bytes);
1309 if (rc != 0) 1579 if (rc != X86EMUL_CONTINUE)
1310 return rc; 1580 return rc;
1311 1581
1312 rc = kvm_load_segment_descriptor(ctxt->vcpu, (u16)selector, seg); 1582 rc = load_segment_descriptor(ctxt, ops, (u16)selector, seg);
1313 return rc; 1583 return rc;
1314} 1584}
1315 1585
@@ -1332,7 +1602,7 @@ static int emulate_popa(struct x86_emulate_ctxt *ctxt,
1332 struct x86_emulate_ops *ops) 1602 struct x86_emulate_ops *ops)
1333{ 1603{
1334 struct decode_cache *c = &ctxt->decode; 1604 struct decode_cache *c = &ctxt->decode;
1335 int rc = 0; 1605 int rc = X86EMUL_CONTINUE;
1336 int reg = VCPU_REGS_RDI; 1606 int reg = VCPU_REGS_RDI;
1337 1607
1338 while (reg >= VCPU_REGS_RAX) { 1608 while (reg >= VCPU_REGS_RAX) {
@@ -1343,7 +1613,7 @@ static int emulate_popa(struct x86_emulate_ctxt *ctxt,
1343 } 1613 }
1344 1614
1345 rc = emulate_pop(ctxt, ops, &c->regs[reg], c->op_bytes); 1615 rc = emulate_pop(ctxt, ops, &c->regs[reg], c->op_bytes);
1346 if (rc != 0) 1616 if (rc != X86EMUL_CONTINUE)
1347 break; 1617 break;
1348 --reg; 1618 --reg;
1349 } 1619 }
@@ -1354,12 +1624,8 @@ static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt,
1354 struct x86_emulate_ops *ops) 1624 struct x86_emulate_ops *ops)
1355{ 1625{
1356 struct decode_cache *c = &ctxt->decode; 1626 struct decode_cache *c = &ctxt->decode;
1357 int rc;
1358 1627
1359 rc = emulate_pop(ctxt, ops, &c->dst.val, c->dst.bytes); 1628 return emulate_pop(ctxt, ops, &c->dst.val, c->dst.bytes);
1360 if (rc != 0)
1361 return rc;
1362 return 0;
1363} 1629}
1364 1630
1365static inline void emulate_grp2(struct x86_emulate_ctxt *ctxt) 1631static inline void emulate_grp2(struct x86_emulate_ctxt *ctxt)
@@ -1395,7 +1661,6 @@ static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt,
1395 struct x86_emulate_ops *ops) 1661 struct x86_emulate_ops *ops)
1396{ 1662{
1397 struct decode_cache *c = &ctxt->decode; 1663 struct decode_cache *c = &ctxt->decode;
1398 int rc = 0;
1399 1664
1400 switch (c->modrm_reg) { 1665 switch (c->modrm_reg) {
1401 case 0 ... 1: /* test */ 1666 case 0 ... 1: /* test */
@@ -1408,11 +1673,9 @@ static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt,
1408 emulate_1op("neg", c->dst, ctxt->eflags); 1673 emulate_1op("neg", c->dst, ctxt->eflags);
1409 break; 1674 break;
1410 default: 1675 default:
1411 DPRINTF("Cannot emulate %02x\n", c->b); 1676 return 0;
1412 rc = X86EMUL_UNHANDLEABLE;
1413 break;
1414 } 1677 }
1415 return rc; 1678 return 1;
1416} 1679}
1417 1680
1418static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt, 1681static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt,
@@ -1442,20 +1705,14 @@ static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt,
1442 emulate_push(ctxt); 1705 emulate_push(ctxt);
1443 break; 1706 break;
1444 } 1707 }
1445 return 0; 1708 return X86EMUL_CONTINUE;
1446} 1709}
1447 1710
1448static inline int emulate_grp9(struct x86_emulate_ctxt *ctxt, 1711static inline int emulate_grp9(struct x86_emulate_ctxt *ctxt,
1449 struct x86_emulate_ops *ops, 1712 struct x86_emulate_ops *ops)
1450 unsigned long memop)
1451{ 1713{
1452 struct decode_cache *c = &ctxt->decode; 1714 struct decode_cache *c = &ctxt->decode;
1453 u64 old, new; 1715 u64 old = c->dst.orig_val;
1454 int rc;
1455
1456 rc = ops->read_emulated(memop, &old, 8, ctxt->vcpu);
1457 if (rc != X86EMUL_CONTINUE)
1458 return rc;
1459 1716
1460 if (((u32) (old >> 0) != (u32) c->regs[VCPU_REGS_RAX]) || 1717 if (((u32) (old >> 0) != (u32) c->regs[VCPU_REGS_RAX]) ||
1461 ((u32) (old >> 32) != (u32) c->regs[VCPU_REGS_RDX])) { 1718 ((u32) (old >> 32) != (u32) c->regs[VCPU_REGS_RDX])) {
@@ -1463,17 +1720,13 @@ static inline int emulate_grp9(struct x86_emulate_ctxt *ctxt,
1463 c->regs[VCPU_REGS_RAX] = (u32) (old >> 0); 1720 c->regs[VCPU_REGS_RAX] = (u32) (old >> 0);
1464 c->regs[VCPU_REGS_RDX] = (u32) (old >> 32); 1721 c->regs[VCPU_REGS_RDX] = (u32) (old >> 32);
1465 ctxt->eflags &= ~EFLG_ZF; 1722 ctxt->eflags &= ~EFLG_ZF;
1466
1467 } else { 1723 } else {
1468 new = ((u64)c->regs[VCPU_REGS_RCX] << 32) | 1724 c->dst.val = ((u64)c->regs[VCPU_REGS_RCX] << 32) |
1469 (u32) c->regs[VCPU_REGS_RBX]; 1725 (u32) c->regs[VCPU_REGS_RBX];
1470 1726
1471 rc = ops->cmpxchg_emulated(memop, &old, &new, 8, ctxt->vcpu);
1472 if (rc != X86EMUL_CONTINUE)
1473 return rc;
1474 ctxt->eflags |= EFLG_ZF; 1727 ctxt->eflags |= EFLG_ZF;
1475 } 1728 }
1476 return 0; 1729 return X86EMUL_CONTINUE;
1477} 1730}
1478 1731
1479static int emulate_ret_far(struct x86_emulate_ctxt *ctxt, 1732static int emulate_ret_far(struct x86_emulate_ctxt *ctxt,
@@ -1484,14 +1737,14 @@ static int emulate_ret_far(struct x86_emulate_ctxt *ctxt,
1484 unsigned long cs; 1737 unsigned long cs;
1485 1738
1486 rc = emulate_pop(ctxt, ops, &c->eip, c->op_bytes); 1739 rc = emulate_pop(ctxt, ops, &c->eip, c->op_bytes);
1487 if (rc) 1740 if (rc != X86EMUL_CONTINUE)
1488 return rc; 1741 return rc;
1489 if (c->op_bytes == 4) 1742 if (c->op_bytes == 4)
1490 c->eip = (u32)c->eip; 1743 c->eip = (u32)c->eip;
1491 rc = emulate_pop(ctxt, ops, &cs, c->op_bytes); 1744 rc = emulate_pop(ctxt, ops, &cs, c->op_bytes);
1492 if (rc) 1745 if (rc != X86EMUL_CONTINUE)
1493 return rc; 1746 return rc;
1494 rc = kvm_load_segment_descriptor(ctxt->vcpu, (u16)cs, VCPU_SREG_CS); 1747 rc = load_segment_descriptor(ctxt, ops, (u16)cs, VCPU_SREG_CS);
1495 return rc; 1748 return rc;
1496} 1749}
1497 1750
@@ -1544,7 +1797,7 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt,
1544 default: 1797 default:
1545 break; 1798 break;
1546 } 1799 }
1547 return 0; 1800 return X86EMUL_CONTINUE;
1548} 1801}
1549 1802
1550static void toggle_interruptibility(struct x86_emulate_ctxt *ctxt, u32 mask) 1803static void toggle_interruptibility(struct x86_emulate_ctxt *ctxt, u32 mask)
@@ -1598,8 +1851,11 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt)
1598 u64 msr_data; 1851 u64 msr_data;
1599 1852
1600 /* syscall is not available in real mode */ 1853 /* syscall is not available in real mode */
1601 if (ctxt->mode == X86EMUL_MODE_REAL || ctxt->mode == X86EMUL_MODE_VM86) 1854 if (ctxt->mode == X86EMUL_MODE_REAL ||
1602 return X86EMUL_UNHANDLEABLE; 1855 ctxt->mode == X86EMUL_MODE_VM86) {
1856 kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
1857 return X86EMUL_PROPAGATE_FAULT;
1858 }
1603 1859
1604 setup_syscalls_segments(ctxt, &cs, &ss); 1860 setup_syscalls_segments(ctxt, &cs, &ss);
1605 1861
@@ -1649,14 +1905,16 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt)
1649 /* inject #GP if in real mode */ 1905 /* inject #GP if in real mode */
1650 if (ctxt->mode == X86EMUL_MODE_REAL) { 1906 if (ctxt->mode == X86EMUL_MODE_REAL) {
1651 kvm_inject_gp(ctxt->vcpu, 0); 1907 kvm_inject_gp(ctxt->vcpu, 0);
1652 return X86EMUL_UNHANDLEABLE; 1908 return X86EMUL_PROPAGATE_FAULT;
1653 } 1909 }
1654 1910
1655 /* XXX sysenter/sysexit have not been tested in 64bit mode. 1911 /* XXX sysenter/sysexit have not been tested in 64bit mode.
1656 * Therefore, we inject an #UD. 1912 * Therefore, we inject an #UD.
1657 */ 1913 */
1658 if (ctxt->mode == X86EMUL_MODE_PROT64) 1914 if (ctxt->mode == X86EMUL_MODE_PROT64) {
1659 return X86EMUL_UNHANDLEABLE; 1915 kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
1916 return X86EMUL_PROPAGATE_FAULT;
1917 }
1660 1918
1661 setup_syscalls_segments(ctxt, &cs, &ss); 1919 setup_syscalls_segments(ctxt, &cs, &ss);
1662 1920
@@ -1711,7 +1969,7 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt)
1711 if (ctxt->mode == X86EMUL_MODE_REAL || 1969 if (ctxt->mode == X86EMUL_MODE_REAL ||
1712 ctxt->mode == X86EMUL_MODE_VM86) { 1970 ctxt->mode == X86EMUL_MODE_VM86) {
1713 kvm_inject_gp(ctxt->vcpu, 0); 1971 kvm_inject_gp(ctxt->vcpu, 0);
1714 return X86EMUL_UNHANDLEABLE; 1972 return X86EMUL_PROPAGATE_FAULT;
1715 } 1973 }
1716 1974
1717 setup_syscalls_segments(ctxt, &cs, &ss); 1975 setup_syscalls_segments(ctxt, &cs, &ss);
@@ -1756,7 +2014,8 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt)
1756 return X86EMUL_CONTINUE; 2014 return X86EMUL_CONTINUE;
1757} 2015}
1758 2016
1759static bool emulator_bad_iopl(struct x86_emulate_ctxt *ctxt) 2017static bool emulator_bad_iopl(struct x86_emulate_ctxt *ctxt,
2018 struct x86_emulate_ops *ops)
1760{ 2019{
1761 int iopl; 2020 int iopl;
1762 if (ctxt->mode == X86EMUL_MODE_REAL) 2021 if (ctxt->mode == X86EMUL_MODE_REAL)
@@ -1764,7 +2023,7 @@ static bool emulator_bad_iopl(struct x86_emulate_ctxt *ctxt)
1764 if (ctxt->mode == X86EMUL_MODE_VM86) 2023 if (ctxt->mode == X86EMUL_MODE_VM86)
1765 return true; 2024 return true;
1766 iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT; 2025 iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT;
1767 return kvm_x86_ops->get_cpl(ctxt->vcpu) > iopl; 2026 return ops->cpl(ctxt->vcpu) > iopl;
1768} 2027}
1769 2028
1770static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt, 2029static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt,
@@ -1801,22 +2060,419 @@ static bool emulator_io_permited(struct x86_emulate_ctxt *ctxt,
1801 struct x86_emulate_ops *ops, 2060 struct x86_emulate_ops *ops,
1802 u16 port, u16 len) 2061 u16 port, u16 len)
1803{ 2062{
1804 if (emulator_bad_iopl(ctxt)) 2063 if (emulator_bad_iopl(ctxt, ops))
1805 if (!emulator_io_port_access_allowed(ctxt, ops, port, len)) 2064 if (!emulator_io_port_access_allowed(ctxt, ops, port, len))
1806 return false; 2065 return false;
1807 return true; 2066 return true;
1808} 2067}
1809 2068
2069static u32 get_cached_descriptor_base(struct x86_emulate_ctxt *ctxt,
2070 struct x86_emulate_ops *ops,
2071 int seg)
2072{
2073 struct desc_struct desc;
2074 if (ops->get_cached_descriptor(&desc, seg, ctxt->vcpu))
2075 return get_desc_base(&desc);
2076 else
2077 return ~0;
2078}
2079
2080static void save_state_to_tss16(struct x86_emulate_ctxt *ctxt,
2081 struct x86_emulate_ops *ops,
2082 struct tss_segment_16 *tss)
2083{
2084 struct decode_cache *c = &ctxt->decode;
2085
2086 tss->ip = c->eip;
2087 tss->flag = ctxt->eflags;
2088 tss->ax = c->regs[VCPU_REGS_RAX];
2089 tss->cx = c->regs[VCPU_REGS_RCX];
2090 tss->dx = c->regs[VCPU_REGS_RDX];
2091 tss->bx = c->regs[VCPU_REGS_RBX];
2092 tss->sp = c->regs[VCPU_REGS_RSP];
2093 tss->bp = c->regs[VCPU_REGS_RBP];
2094 tss->si = c->regs[VCPU_REGS_RSI];
2095 tss->di = c->regs[VCPU_REGS_RDI];
2096
2097 tss->es = ops->get_segment_selector(VCPU_SREG_ES, ctxt->vcpu);
2098 tss->cs = ops->get_segment_selector(VCPU_SREG_CS, ctxt->vcpu);
2099 tss->ss = ops->get_segment_selector(VCPU_SREG_SS, ctxt->vcpu);
2100 tss->ds = ops->get_segment_selector(VCPU_SREG_DS, ctxt->vcpu);
2101 tss->ldt = ops->get_segment_selector(VCPU_SREG_LDTR, ctxt->vcpu);
2102}
2103
2104static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt,
2105 struct x86_emulate_ops *ops,
2106 struct tss_segment_16 *tss)
2107{
2108 struct decode_cache *c = &ctxt->decode;
2109 int ret;
2110
2111 c->eip = tss->ip;
2112 ctxt->eflags = tss->flag | 2;
2113 c->regs[VCPU_REGS_RAX] = tss->ax;
2114 c->regs[VCPU_REGS_RCX] = tss->cx;
2115 c->regs[VCPU_REGS_RDX] = tss->dx;
2116 c->regs[VCPU_REGS_RBX] = tss->bx;
2117 c->regs[VCPU_REGS_RSP] = tss->sp;
2118 c->regs[VCPU_REGS_RBP] = tss->bp;
2119 c->regs[VCPU_REGS_RSI] = tss->si;
2120 c->regs[VCPU_REGS_RDI] = tss->di;
2121
2122 /*
2123 * SDM says that segment selectors are loaded before segment
2124 * descriptors
2125 */
2126 ops->set_segment_selector(tss->ldt, VCPU_SREG_LDTR, ctxt->vcpu);
2127 ops->set_segment_selector(tss->es, VCPU_SREG_ES, ctxt->vcpu);
2128 ops->set_segment_selector(tss->cs, VCPU_SREG_CS, ctxt->vcpu);
2129 ops->set_segment_selector(tss->ss, VCPU_SREG_SS, ctxt->vcpu);
2130 ops->set_segment_selector(tss->ds, VCPU_SREG_DS, ctxt->vcpu);
2131
2132 /*
2133 * Now load segment descriptors. If a fault happens at this stage
2134 * it is handled in the context of the new task
2135 */
2136 ret = load_segment_descriptor(ctxt, ops, tss->ldt, VCPU_SREG_LDTR);
2137 if (ret != X86EMUL_CONTINUE)
2138 return ret;
2139 ret = load_segment_descriptor(ctxt, ops, tss->es, VCPU_SREG_ES);
2140 if (ret != X86EMUL_CONTINUE)
2141 return ret;
2142 ret = load_segment_descriptor(ctxt, ops, tss->cs, VCPU_SREG_CS);
2143 if (ret != X86EMUL_CONTINUE)
2144 return ret;
2145 ret = load_segment_descriptor(ctxt, ops, tss->ss, VCPU_SREG_SS);
2146 if (ret != X86EMUL_CONTINUE)
2147 return ret;
2148 ret = load_segment_descriptor(ctxt, ops, tss->ds, VCPU_SREG_DS);
2149 if (ret != X86EMUL_CONTINUE)
2150 return ret;
2151
2152 return X86EMUL_CONTINUE;
2153}
2154
2155static int task_switch_16(struct x86_emulate_ctxt *ctxt,
2156 struct x86_emulate_ops *ops,
2157 u16 tss_selector, u16 old_tss_sel,
2158 ulong old_tss_base, struct desc_struct *new_desc)
2159{
2160 struct tss_segment_16 tss_seg;
2161 int ret;
2162 u32 err, new_tss_base = get_desc_base(new_desc);
2163
2164 ret = ops->read_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
2165 &err);
2166 if (ret == X86EMUL_PROPAGATE_FAULT) {
2167 /* FIXME: need to provide precise fault address */
2168 kvm_inject_page_fault(ctxt->vcpu, old_tss_base, err);
2169 return ret;
2170 }
2171
2172 save_state_to_tss16(ctxt, ops, &tss_seg);
2173
2174 ret = ops->write_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
2175 &err);
2176 if (ret == X86EMUL_PROPAGATE_FAULT) {
2177 /* FIXME: need to provide precise fault address */
2178 kvm_inject_page_fault(ctxt->vcpu, old_tss_base, err);
2179 return ret;
2180 }
2181
2182 ret = ops->read_std(new_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
2183 &err);
2184 if (ret == X86EMUL_PROPAGATE_FAULT) {
2185 /* FIXME: need to provide precise fault address */
2186 kvm_inject_page_fault(ctxt->vcpu, new_tss_base, err);
2187 return ret;
2188 }
2189
2190 if (old_tss_sel != 0xffff) {
2191 tss_seg.prev_task_link = old_tss_sel;
2192
2193 ret = ops->write_std(new_tss_base,
2194 &tss_seg.prev_task_link,
2195 sizeof tss_seg.prev_task_link,
2196 ctxt->vcpu, &err);
2197 if (ret == X86EMUL_PROPAGATE_FAULT) {
2198 /* FIXME: need to provide precise fault address */
2199 kvm_inject_page_fault(ctxt->vcpu, new_tss_base, err);
2200 return ret;
2201 }
2202 }
2203
2204 return load_state_from_tss16(ctxt, ops, &tss_seg);
2205}
2206
2207static void save_state_to_tss32(struct x86_emulate_ctxt *ctxt,
2208 struct x86_emulate_ops *ops,
2209 struct tss_segment_32 *tss)
2210{
2211 struct decode_cache *c = &ctxt->decode;
2212
2213 tss->cr3 = ops->get_cr(3, ctxt->vcpu);
2214 tss->eip = c->eip;
2215 tss->eflags = ctxt->eflags;
2216 tss->eax = c->regs[VCPU_REGS_RAX];
2217 tss->ecx = c->regs[VCPU_REGS_RCX];
2218 tss->edx = c->regs[VCPU_REGS_RDX];
2219 tss->ebx = c->regs[VCPU_REGS_RBX];
2220 tss->esp = c->regs[VCPU_REGS_RSP];
2221 tss->ebp = c->regs[VCPU_REGS_RBP];
2222 tss->esi = c->regs[VCPU_REGS_RSI];
2223 tss->edi = c->regs[VCPU_REGS_RDI];
2224
2225 tss->es = ops->get_segment_selector(VCPU_SREG_ES, ctxt->vcpu);
2226 tss->cs = ops->get_segment_selector(VCPU_SREG_CS, ctxt->vcpu);
2227 tss->ss = ops->get_segment_selector(VCPU_SREG_SS, ctxt->vcpu);
2228 tss->ds = ops->get_segment_selector(VCPU_SREG_DS, ctxt->vcpu);
2229 tss->fs = ops->get_segment_selector(VCPU_SREG_FS, ctxt->vcpu);
2230 tss->gs = ops->get_segment_selector(VCPU_SREG_GS, ctxt->vcpu);
2231 tss->ldt_selector = ops->get_segment_selector(VCPU_SREG_LDTR, ctxt->vcpu);
2232}
2233
2234static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
2235 struct x86_emulate_ops *ops,
2236 struct tss_segment_32 *tss)
2237{
2238 struct decode_cache *c = &ctxt->decode;
2239 int ret;
2240
2241 ops->set_cr(3, tss->cr3, ctxt->vcpu);
2242 c->eip = tss->eip;
2243 ctxt->eflags = tss->eflags | 2;
2244 c->regs[VCPU_REGS_RAX] = tss->eax;
2245 c->regs[VCPU_REGS_RCX] = tss->ecx;
2246 c->regs[VCPU_REGS_RDX] = tss->edx;
2247 c->regs[VCPU_REGS_RBX] = tss->ebx;
2248 c->regs[VCPU_REGS_RSP] = tss->esp;
2249 c->regs[VCPU_REGS_RBP] = tss->ebp;
2250 c->regs[VCPU_REGS_RSI] = tss->esi;
2251 c->regs[VCPU_REGS_RDI] = tss->edi;
2252
2253 /*
2254 * SDM says that segment selectors are loaded before segment
2255 * descriptors
2256 */
2257 ops->set_segment_selector(tss->ldt_selector, VCPU_SREG_LDTR, ctxt->vcpu);
2258 ops->set_segment_selector(tss->es, VCPU_SREG_ES, ctxt->vcpu);
2259 ops->set_segment_selector(tss->cs, VCPU_SREG_CS, ctxt->vcpu);
2260 ops->set_segment_selector(tss->ss, VCPU_SREG_SS, ctxt->vcpu);
2261 ops->set_segment_selector(tss->ds, VCPU_SREG_DS, ctxt->vcpu);
2262 ops->set_segment_selector(tss->fs, VCPU_SREG_FS, ctxt->vcpu);
2263 ops->set_segment_selector(tss->gs, VCPU_SREG_GS, ctxt->vcpu);
2264
2265 /*
2266 * Now load segment descriptors. If a fault happens at this stage
2267 * it is handled in the context of the new task
2268 */
2269 ret = load_segment_descriptor(ctxt, ops, tss->ldt_selector, VCPU_SREG_LDTR);
2270 if (ret != X86EMUL_CONTINUE)
2271 return ret;
2272 ret = load_segment_descriptor(ctxt, ops, tss->es, VCPU_SREG_ES);
2273 if (ret != X86EMUL_CONTINUE)
2274 return ret;
2275 ret = load_segment_descriptor(ctxt, ops, tss->cs, VCPU_SREG_CS);
2276 if (ret != X86EMUL_CONTINUE)
2277 return ret;
2278 ret = load_segment_descriptor(ctxt, ops, tss->ss, VCPU_SREG_SS);
2279 if (ret != X86EMUL_CONTINUE)
2280 return ret;
2281 ret = load_segment_descriptor(ctxt, ops, tss->ds, VCPU_SREG_DS);
2282 if (ret != X86EMUL_CONTINUE)
2283 return ret;
2284 ret = load_segment_descriptor(ctxt, ops, tss->fs, VCPU_SREG_FS);
2285 if (ret != X86EMUL_CONTINUE)
2286 return ret;
2287 ret = load_segment_descriptor(ctxt, ops, tss->gs, VCPU_SREG_GS);
2288 if (ret != X86EMUL_CONTINUE)
2289 return ret;
2290
2291 return X86EMUL_CONTINUE;
2292}
2293
2294static int task_switch_32(struct x86_emulate_ctxt *ctxt,
2295 struct x86_emulate_ops *ops,
2296 u16 tss_selector, u16 old_tss_sel,
2297 ulong old_tss_base, struct desc_struct *new_desc)
2298{
2299 struct tss_segment_32 tss_seg;
2300 int ret;
2301 u32 err, new_tss_base = get_desc_base(new_desc);
2302
2303 ret = ops->read_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
2304 &err);
2305 if (ret == X86EMUL_PROPAGATE_FAULT) {
2306 /* FIXME: need to provide precise fault address */
2307 kvm_inject_page_fault(ctxt->vcpu, old_tss_base, err);
2308 return ret;
2309 }
2310
2311 save_state_to_tss32(ctxt, ops, &tss_seg);
2312
2313 ret = ops->write_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
2314 &err);
2315 if (ret == X86EMUL_PROPAGATE_FAULT) {
2316 /* FIXME: need to provide precise fault address */
2317 kvm_inject_page_fault(ctxt->vcpu, old_tss_base, err);
2318 return ret;
2319 }
2320
2321 ret = ops->read_std(new_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
2322 &err);
2323 if (ret == X86EMUL_PROPAGATE_FAULT) {
2324 /* FIXME: need to provide precise fault address */
2325 kvm_inject_page_fault(ctxt->vcpu, new_tss_base, err);
2326 return ret;
2327 }
2328
2329 if (old_tss_sel != 0xffff) {
2330 tss_seg.prev_task_link = old_tss_sel;
2331
2332 ret = ops->write_std(new_tss_base,
2333 &tss_seg.prev_task_link,
2334 sizeof tss_seg.prev_task_link,
2335 ctxt->vcpu, &err);
2336 if (ret == X86EMUL_PROPAGATE_FAULT) {
2337 /* FIXME: need to provide precise fault address */
2338 kvm_inject_page_fault(ctxt->vcpu, new_tss_base, err);
2339 return ret;
2340 }
2341 }
2342
2343 return load_state_from_tss32(ctxt, ops, &tss_seg);
2344}
2345
2346static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
2347 struct x86_emulate_ops *ops,
2348 u16 tss_selector, int reason,
2349 bool has_error_code, u32 error_code)
2350{
2351 struct desc_struct curr_tss_desc, next_tss_desc;
2352 int ret;
2353 u16 old_tss_sel = ops->get_segment_selector(VCPU_SREG_TR, ctxt->vcpu);
2354 ulong old_tss_base =
2355 get_cached_descriptor_base(ctxt, ops, VCPU_SREG_TR);
2356 u32 desc_limit;
2357
2358 /* FIXME: old_tss_base == ~0 ? */
2359
2360 ret = read_segment_descriptor(ctxt, ops, tss_selector, &next_tss_desc);
2361 if (ret != X86EMUL_CONTINUE)
2362 return ret;
2363 ret = read_segment_descriptor(ctxt, ops, old_tss_sel, &curr_tss_desc);
2364 if (ret != X86EMUL_CONTINUE)
2365 return ret;
2366
2367 /* FIXME: check that next_tss_desc is tss */
2368
2369 if (reason != TASK_SWITCH_IRET) {
2370 if ((tss_selector & 3) > next_tss_desc.dpl ||
2371 ops->cpl(ctxt->vcpu) > next_tss_desc.dpl) {
2372 kvm_inject_gp(ctxt->vcpu, 0);
2373 return X86EMUL_PROPAGATE_FAULT;
2374 }
2375 }
2376
2377 desc_limit = desc_limit_scaled(&next_tss_desc);
2378 if (!next_tss_desc.p ||
2379 ((desc_limit < 0x67 && (next_tss_desc.type & 8)) ||
2380 desc_limit < 0x2b)) {
2381 kvm_queue_exception_e(ctxt->vcpu, TS_VECTOR,
2382 tss_selector & 0xfffc);
2383 return X86EMUL_PROPAGATE_FAULT;
2384 }
2385
2386 if (reason == TASK_SWITCH_IRET || reason == TASK_SWITCH_JMP) {
2387 curr_tss_desc.type &= ~(1 << 1); /* clear busy flag */
2388 write_segment_descriptor(ctxt, ops, old_tss_sel,
2389 &curr_tss_desc);
2390 }
2391
2392 if (reason == TASK_SWITCH_IRET)
2393 ctxt->eflags = ctxt->eflags & ~X86_EFLAGS_NT;
2394
2395 /* set back link to prev task only if NT bit is set in eflags
2396 note that old_tss_sel is not used after this point */
2397 if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE)
2398 old_tss_sel = 0xffff;
2399
2400 if (next_tss_desc.type & 8)
2401 ret = task_switch_32(ctxt, ops, tss_selector, old_tss_sel,
2402 old_tss_base, &next_tss_desc);
2403 else
2404 ret = task_switch_16(ctxt, ops, tss_selector, old_tss_sel,
2405 old_tss_base, &next_tss_desc);
2406 if (ret != X86EMUL_CONTINUE)
2407 return ret;
2408
2409 if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE)
2410 ctxt->eflags = ctxt->eflags | X86_EFLAGS_NT;
2411
2412 if (reason != TASK_SWITCH_IRET) {
2413 next_tss_desc.type |= (1 << 1); /* set busy flag */
2414 write_segment_descriptor(ctxt, ops, tss_selector,
2415 &next_tss_desc);
2416 }
2417
2418 ops->set_cr(0, ops->get_cr(0, ctxt->vcpu) | X86_CR0_TS, ctxt->vcpu);
2419 ops->set_cached_descriptor(&next_tss_desc, VCPU_SREG_TR, ctxt->vcpu);
2420 ops->set_segment_selector(tss_selector, VCPU_SREG_TR, ctxt->vcpu);
2421
2422 if (has_error_code) {
2423 struct decode_cache *c = &ctxt->decode;
2424
2425 c->op_bytes = c->ad_bytes = (next_tss_desc.type & 8) ? 4 : 2;
2426 c->lock_prefix = 0;
2427 c->src.val = (unsigned long) error_code;
2428 emulate_push(ctxt);
2429 }
2430
2431 return ret;
2432}
2433
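Besides copying register state, emulator_do_task_switch() above keeps the TSS descriptor busy bits and the back link consistent: the outgoing descriptor loses its busy bit on IRET or JMP, the incoming one gains it unless the switch is an IRET, and the previous task link is written only for CALL and gate switches. A compact model of that bookkeeping with illustrative constants:

    #include <stdio.h>

    enum reason { SWITCH_CALL, SWITCH_IRET, SWITCH_JMP, SWITCH_GATE };

    int main(void)
    {
        unsigned curr_type = 0xb;       /* 32-bit busy TSS descriptor type */
        unsigned next_type = 0x9;       /* 32-bit available TSS descriptor type */
        enum reason why = SWITCH_CALL;
        int write_back_link;

        if (why == SWITCH_IRET || why == SWITCH_JMP)
            curr_type &= ~(1u << 1);    /* old task is no longer busy */
        if (why != SWITCH_IRET)
            next_type |= 1u << 1;       /* new task becomes busy */
        write_back_link = (why == SWITCH_CALL || why == SWITCH_GATE);

        printf("curr=0x%x next=0x%x back link=%d\n",
               curr_type, next_type, write_back_link);
        return 0;
    }
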
2434int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
2435 struct x86_emulate_ops *ops,
2436 u16 tss_selector, int reason,
2437 bool has_error_code, u32 error_code)
2438{
2439 struct decode_cache *c = &ctxt->decode;
2440 int rc;
2441
2442 memset(c, 0, sizeof(struct decode_cache));
2443 c->eip = ctxt->eip;
2444 memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
2445 c->dst.type = OP_NONE;
2446
2447 rc = emulator_do_task_switch(ctxt, ops, tss_selector, reason,
2448 has_error_code, error_code);
2449
2450 if (rc == X86EMUL_CONTINUE) {
2451 memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs);
2452 kvm_rip_write(ctxt->vcpu, c->eip);
2453 rc = writeback(ctxt, ops);
2454 }
2455
2456 return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
2457}
2458
2459static void string_addr_inc(struct x86_emulate_ctxt *ctxt, unsigned long base,
2460 int reg, struct operand *op)
2461{
2462 struct decode_cache *c = &ctxt->decode;
2463 int df = (ctxt->eflags & EFLG_DF) ? -1 : 1;
2464
2465 register_address_increment(c, &c->regs[reg], df * op->bytes);
2466 op->ptr = (unsigned long *)register_address(c, base, c->regs[reg]);
2467}
2468
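string_addr_inc() above steps RSI or RDI by the operand size in the direction selected by EFLAGS.DF. The same rule in isolation, as a user-space sketch with illustrative names:

    #include <stdio.h>

    static void string_step(unsigned long *reg, int df_set, int op_bytes)
    {
        *reg += df_set ? -(long)op_bytes : (long)op_bytes;
    }

    int main(void)
    {
        unsigned long rsi = 0x1000;

        string_step(&rsi, 0, 4);    /* DF clear: forward to 0x1004 */
        string_step(&rsi, 1, 4);    /* DF set: back to 0x1000 */
        printf("rsi=0x%lx\n", rsi);
        return 0;
    }
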
1810int 2469int
1811x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) 2470x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1812{ 2471{
1813 unsigned long memop = 0;
1814 u64 msr_data; 2472 u64 msr_data;
1815 unsigned long saved_eip = 0;
1816 struct decode_cache *c = &ctxt->decode; 2473 struct decode_cache *c = &ctxt->decode;
1817 unsigned int port; 2474 int rc = X86EMUL_CONTINUE;
1818 int io_dir_in; 2475 int saved_dst_type = c->dst.type;
1819 int rc = 0;
1820 2476
1821 ctxt->interruptibility = 0; 2477 ctxt->interruptibility = 0;
1822 2478
@@ -1826,26 +2482,30 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1826 */ 2482 */
1827 2483
1828 memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs); 2484 memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
1829 saved_eip = c->eip; 2485
2486 if (ctxt->mode == X86EMUL_MODE_PROT64 && (c->d & No64)) {
2487 kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
2488 goto done;
2489 }
1830 2490
1831 /* LOCK prefix is allowed only with some instructions */ 2491 /* LOCK prefix is allowed only with some instructions */
1832 if (c->lock_prefix && !(c->d & Lock)) { 2492 if (c->lock_prefix && (!(c->d & Lock) || c->dst.type != OP_MEM)) {
1833 kvm_queue_exception(ctxt->vcpu, UD_VECTOR); 2493 kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
1834 goto done; 2494 goto done;
1835 } 2495 }
1836 2496
1837 /* Privileged instruction can be executed only in CPL=0 */ 2497 /* Privileged instruction can be executed only in CPL=0 */
1838 if ((c->d & Priv) && kvm_x86_ops->get_cpl(ctxt->vcpu)) { 2498 if ((c->d & Priv) && ops->cpl(ctxt->vcpu)) {
1839 kvm_inject_gp(ctxt->vcpu, 0); 2499 kvm_inject_gp(ctxt->vcpu, 0);
1840 goto done; 2500 goto done;
1841 } 2501 }
1842 2502
1843 if (((c->d & ModRM) && (c->modrm_mod != 3)) || (c->d & MemAbs))
1844 memop = c->modrm_ea;
1845
1846 if (c->rep_prefix && (c->d & String)) { 2503 if (c->rep_prefix && (c->d & String)) {
2504 ctxt->restart = true;
1847 /* All REP prefixes have the same first termination condition */ 2505 /* All REP prefixes have the same first termination condition */
1848 if (c->regs[VCPU_REGS_RCX] == 0) { 2506 if (address_mask(c, c->regs[VCPU_REGS_RCX]) == 0) {
2507 string_done:
2508 ctxt->restart = false;
1849 kvm_rip_write(ctxt->vcpu, c->eip); 2509 kvm_rip_write(ctxt->vcpu, c->eip);
1850 goto done; 2510 goto done;
1851 } 2511 }
@@ -1857,25 +2517,18 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1857 * - if REPNE/REPNZ and ZF = 1 then done 2517 * - if REPNE/REPNZ and ZF = 1 then done
1858 */ 2518 */
1859 if ((c->b == 0xa6) || (c->b == 0xa7) || 2519 if ((c->b == 0xa6) || (c->b == 0xa7) ||
1860 (c->b == 0xae) || (c->b == 0xaf)) { 2520 (c->b == 0xae) || (c->b == 0xaf)) {
1861 if ((c->rep_prefix == REPE_PREFIX) && 2521 if ((c->rep_prefix == REPE_PREFIX) &&
1862 ((ctxt->eflags & EFLG_ZF) == 0)) { 2522 ((ctxt->eflags & EFLG_ZF) == 0))
1863 kvm_rip_write(ctxt->vcpu, c->eip); 2523 goto string_done;
1864 goto done;
1865 }
1866 if ((c->rep_prefix == REPNE_PREFIX) && 2524 if ((c->rep_prefix == REPNE_PREFIX) &&
1867 ((ctxt->eflags & EFLG_ZF) == EFLG_ZF)) { 2525 ((ctxt->eflags & EFLG_ZF) == EFLG_ZF))
1868 kvm_rip_write(ctxt->vcpu, c->eip); 2526 goto string_done;
1869 goto done;
1870 }
1871 } 2527 }
1872 c->regs[VCPU_REGS_RCX]--; 2528 c->eip = ctxt->eip;
1873 c->eip = kvm_rip_read(ctxt->vcpu);
1874 } 2529 }
1875 2530
1876 if (c->src.type == OP_MEM) { 2531 if (c->src.type == OP_MEM) {
1877 c->src.ptr = (unsigned long *)memop;
1878 c->src.val = 0;
1879 rc = ops->read_emulated((unsigned long)c->src.ptr, 2532 rc = ops->read_emulated((unsigned long)c->src.ptr,
1880 &c->src.val, 2533 &c->src.val,
1881 c->src.bytes, 2534 c->src.bytes,
@@ -1885,29 +2538,25 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1885 c->src.orig_val = c->src.val; 2538 c->src.orig_val = c->src.val;
1886 } 2539 }
1887 2540
2541 if (c->src2.type == OP_MEM) {
2542 rc = ops->read_emulated((unsigned long)c->src2.ptr,
2543 &c->src2.val,
2544 c->src2.bytes,
2545 ctxt->vcpu);
2546 if (rc != X86EMUL_CONTINUE)
2547 goto done;
2548 }
2549
1888 if ((c->d & DstMask) == ImplicitOps) 2550 if ((c->d & DstMask) == ImplicitOps)
1889 goto special_insn; 2551 goto special_insn;
1890 2552
1891 2553
1892 if (c->dst.type == OP_MEM) { 2554 if ((c->dst.type == OP_MEM) && !(c->d & Mov)) {
1893 c->dst.ptr = (unsigned long *)memop; 2555 /* optimisation - avoid slow emulated read if Mov */
1894 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; 2556 rc = ops->read_emulated((unsigned long)c->dst.ptr, &c->dst.val,
1895 c->dst.val = 0; 2557 c->dst.bytes, ctxt->vcpu);
1896 if (c->d & BitOp) { 2558 if (rc != X86EMUL_CONTINUE)
1897 unsigned long mask = ~(c->dst.bytes * 8 - 1); 2559 goto done;
1898
1899 c->dst.ptr = (void *)c->dst.ptr +
1900 (c->src.val & mask) / 8;
1901 }
1902 if (!(c->d & Mov)) {
1903 /* optimisation - avoid slow emulated read */
1904 rc = ops->read_emulated((unsigned long)c->dst.ptr,
1905 &c->dst.val,
1906 c->dst.bytes,
1907 ctxt->vcpu);
1908 if (rc != X86EMUL_CONTINUE)
1909 goto done;
1910 }
1911 } 2560 }
1912 c->dst.orig_val = c->dst.val; 2561 c->dst.orig_val = c->dst.val;
1913 2562
@@ -1926,7 +2575,7 @@ special_insn:
1926 break; 2575 break;
1927 case 0x07: /* pop es */ 2576 case 0x07: /* pop es */
1928 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_ES); 2577 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_ES);
1929 if (rc != 0) 2578 if (rc != X86EMUL_CONTINUE)
1930 goto done; 2579 goto done;
1931 break; 2580 break;
1932 case 0x08 ... 0x0d: 2581 case 0x08 ... 0x0d:
@@ -1945,7 +2594,7 @@ special_insn:
1945 break; 2594 break;
1946 case 0x17: /* pop ss */ 2595 case 0x17: /* pop ss */
1947 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_SS); 2596 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_SS);
1948 if (rc != 0) 2597 if (rc != X86EMUL_CONTINUE)
1949 goto done; 2598 goto done;
1950 break; 2599 break;
1951 case 0x18 ... 0x1d: 2600 case 0x18 ... 0x1d:
@@ -1957,7 +2606,7 @@ special_insn:
1957 break; 2606 break;
1958 case 0x1f: /* pop ds */ 2607 case 0x1f: /* pop ds */
1959 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_DS); 2608 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_DS);
1960 if (rc != 0) 2609 if (rc != X86EMUL_CONTINUE)
1961 goto done; 2610 goto done;
1962 break; 2611 break;
1963 case 0x20 ... 0x25: 2612 case 0x20 ... 0x25:
@@ -1988,7 +2637,7 @@ special_insn:
1988 case 0x58 ... 0x5f: /* pop reg */ 2637 case 0x58 ... 0x5f: /* pop reg */
1989 pop_instruction: 2638 pop_instruction:
1990 rc = emulate_pop(ctxt, ops, &c->dst.val, c->op_bytes); 2639 rc = emulate_pop(ctxt, ops, &c->dst.val, c->op_bytes);
1991 if (rc != 0) 2640 if (rc != X86EMUL_CONTINUE)
1992 goto done; 2641 goto done;
1993 break; 2642 break;
1994 case 0x60: /* pusha */ 2643 case 0x60: /* pusha */
@@ -1996,7 +2645,7 @@ special_insn:
1996 break; 2645 break;
1997 case 0x61: /* popa */ 2646 case 0x61: /* popa */
1998 rc = emulate_popa(ctxt, ops); 2647 rc = emulate_popa(ctxt, ops);
1999 if (rc != 0) 2648 if (rc != X86EMUL_CONTINUE)
2000 goto done; 2649 goto done;
2001 break; 2650 break;
2002 case 0x63: /* movsxd */ 2651 case 0x63: /* movsxd */
@@ -2010,47 +2659,29 @@ special_insn:
2010 break; 2659 break;
2011 case 0x6c: /* insb */ 2660 case 0x6c: /* insb */
2012 case 0x6d: /* insw/insd */ 2661 case 0x6d: /* insw/insd */
2662 c->dst.bytes = min(c->dst.bytes, 4u);
2013 if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX], 2663 if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX],
2014 (c->d & ByteOp) ? 1 : c->op_bytes)) { 2664 c->dst.bytes)) {
2015 kvm_inject_gp(ctxt->vcpu, 0); 2665 kvm_inject_gp(ctxt->vcpu, 0);
2016 goto done; 2666 goto done;
2017 } 2667 }
2018 if (kvm_emulate_pio_string(ctxt->vcpu, 2668 if (!pio_in_emulated(ctxt, ops, c->dst.bytes,
2019 1, 2669 c->regs[VCPU_REGS_RDX], &c->dst.val))
2020 (c->d & ByteOp) ? 1 : c->op_bytes, 2670 goto done; /* IO is needed, skip writeback */
2021 c->rep_prefix ? 2671 break;
2022 address_mask(c, c->regs[VCPU_REGS_RCX]) : 1,
2023 (ctxt->eflags & EFLG_DF),
2024 register_address(c, es_base(ctxt),
2025 c->regs[VCPU_REGS_RDI]),
2026 c->rep_prefix,
2027 c->regs[VCPU_REGS_RDX]) == 0) {
2028 c->eip = saved_eip;
2029 return -1;
2030 }
2031 return 0;
2032 case 0x6e: /* outsb */ 2672 case 0x6e: /* outsb */
2033 case 0x6f: /* outsw/outsd */ 2673 case 0x6f: /* outsw/outsd */
2674 c->src.bytes = min(c->src.bytes, 4u);
2034 if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX], 2675 if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX],
2035 (c->d & ByteOp) ? 1 : c->op_bytes)) { 2676 c->src.bytes)) {
2036 kvm_inject_gp(ctxt->vcpu, 0); 2677 kvm_inject_gp(ctxt->vcpu, 0);
2037 goto done; 2678 goto done;
2038 } 2679 }
2039 if (kvm_emulate_pio_string(ctxt->vcpu, 2680 ops->pio_out_emulated(c->src.bytes, c->regs[VCPU_REGS_RDX],
2040 0, 2681 &c->src.val, 1, ctxt->vcpu);
2041 (c->d & ByteOp) ? 1 : c->op_bytes, 2682
2042 c->rep_prefix ? 2683 c->dst.type = OP_NONE; /* nothing to writeback */
2043 address_mask(c, c->regs[VCPU_REGS_RCX]) : 1, 2684 break;
2044 (ctxt->eflags & EFLG_DF),
2045 register_address(c,
2046 seg_override_base(ctxt, c),
2047 c->regs[VCPU_REGS_RSI]),
2048 c->rep_prefix,
2049 c->regs[VCPU_REGS_RDX]) == 0) {
2050 c->eip = saved_eip;
2051 return -1;
2052 }
2053 return 0;
2054 case 0x70 ... 0x7f: /* jcc (short) */ 2685 case 0x70 ... 0x7f: /* jcc (short) */
2055 if (test_cc(c->b, ctxt->eflags)) 2686 if (test_cc(c->b, ctxt->eflags))
2056 jmp_rel(c, c->src.val); 2687 jmp_rel(c, c->src.val);
@@ -2107,12 +2738,11 @@ special_insn:
2107 case 0x8c: { /* mov r/m, sreg */ 2738 case 0x8c: { /* mov r/m, sreg */
2108 struct kvm_segment segreg; 2739 struct kvm_segment segreg;
2109 2740
2110 if (c->modrm_reg <= 5) 2741 if (c->modrm_reg <= VCPU_SREG_GS)
2111 kvm_get_segment(ctxt->vcpu, &segreg, c->modrm_reg); 2742 kvm_get_segment(ctxt->vcpu, &segreg, c->modrm_reg);
2112 else { 2743 else {
2113 printk(KERN_INFO "0x8c: Invalid segreg in modrm byte 0x%02x\n", 2744 kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
2114 c->modrm); 2745 goto done;
2115 goto cannot_emulate;
2116 } 2746 }
2117 c->dst.val = segreg.selector; 2747 c->dst.val = segreg.selector;
2118 break; 2748 break;
@@ -2132,16 +2762,16 @@ special_insn:
2132 } 2762 }
2133 2763
2134 if (c->modrm_reg == VCPU_SREG_SS) 2764 if (c->modrm_reg == VCPU_SREG_SS)
2135 toggle_interruptibility(ctxt, X86_SHADOW_INT_MOV_SS); 2765 toggle_interruptibility(ctxt, KVM_X86_SHADOW_INT_MOV_SS);
2136 2766
2137 rc = kvm_load_segment_descriptor(ctxt->vcpu, sel, c->modrm_reg); 2767 rc = load_segment_descriptor(ctxt, ops, sel, c->modrm_reg);
2138 2768
2139 c->dst.type = OP_NONE; /* Disable writeback. */ 2769 c->dst.type = OP_NONE; /* Disable writeback. */
2140 break; 2770 break;
2141 } 2771 }
2142 case 0x8f: /* pop (sole member of Grp1a) */ 2772 case 0x8f: /* pop (sole member of Grp1a) */
2143 rc = emulate_grp1a(ctxt, ops); 2773 rc = emulate_grp1a(ctxt, ops);
2144 if (rc != 0) 2774 if (rc != X86EMUL_CONTINUE)
2145 goto done; 2775 goto done;
2146 break; 2776 break;
2147 case 0x90: /* nop / xchg r8,rax */ 2777 case 0x90: /* nop / xchg r8,rax */
@@ -2175,89 +2805,16 @@ special_insn:
2175 c->dst.val = (unsigned long)c->regs[VCPU_REGS_RAX]; 2805 c->dst.val = (unsigned long)c->regs[VCPU_REGS_RAX];
2176 break; 2806 break;
2177 case 0xa4 ... 0xa5: /* movs */ 2807 case 0xa4 ... 0xa5: /* movs */
2178 c->dst.type = OP_MEM; 2808 goto mov;
2179 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
2180 c->dst.ptr = (unsigned long *)register_address(c,
2181 es_base(ctxt),
2182 c->regs[VCPU_REGS_RDI]);
2183 rc = ops->read_emulated(register_address(c,
2184 seg_override_base(ctxt, c),
2185 c->regs[VCPU_REGS_RSI]),
2186 &c->dst.val,
2187 c->dst.bytes, ctxt->vcpu);
2188 if (rc != X86EMUL_CONTINUE)
2189 goto done;
2190 register_address_increment(c, &c->regs[VCPU_REGS_RSI],
2191 (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
2192 : c->dst.bytes);
2193 register_address_increment(c, &c->regs[VCPU_REGS_RDI],
2194 (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
2195 : c->dst.bytes);
2196 break;
2197 case 0xa6 ... 0xa7: /* cmps */ 2809 case 0xa6 ... 0xa7: /* cmps */
2198 c->src.type = OP_NONE; /* Disable writeback. */
2199 c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
2200 c->src.ptr = (unsigned long *)register_address(c,
2201 seg_override_base(ctxt, c),
2202 c->regs[VCPU_REGS_RSI]);
2203 rc = ops->read_emulated((unsigned long)c->src.ptr,
2204 &c->src.val,
2205 c->src.bytes,
2206 ctxt->vcpu);
2207 if (rc != X86EMUL_CONTINUE)
2208 goto done;
2209
2210 c->dst.type = OP_NONE; /* Disable writeback. */ 2810 c->dst.type = OP_NONE; /* Disable writeback. */
2211 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
2212 c->dst.ptr = (unsigned long *)register_address(c,
2213 es_base(ctxt),
2214 c->regs[VCPU_REGS_RDI]);
2215 rc = ops->read_emulated((unsigned long)c->dst.ptr,
2216 &c->dst.val,
2217 c->dst.bytes,
2218 ctxt->vcpu);
2219 if (rc != X86EMUL_CONTINUE)
2220 goto done;
2221
2222 DPRINTF("cmps: mem1=0x%p mem2=0x%p\n", c->src.ptr, c->dst.ptr); 2811 DPRINTF("cmps: mem1=0x%p mem2=0x%p\n", c->src.ptr, c->dst.ptr);
2223 2812 goto cmp;
2224 emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags);
2225
2226 register_address_increment(c, &c->regs[VCPU_REGS_RSI],
2227 (ctxt->eflags & EFLG_DF) ? -c->src.bytes
2228 : c->src.bytes);
2229 register_address_increment(c, &c->regs[VCPU_REGS_RDI],
2230 (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
2231 : c->dst.bytes);
2232
2233 break;
2234 case 0xaa ... 0xab: /* stos */ 2813 case 0xaa ... 0xab: /* stos */
2235 c->dst.type = OP_MEM;
2236 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
2237 c->dst.ptr = (unsigned long *)register_address(c,
2238 es_base(ctxt),
2239 c->regs[VCPU_REGS_RDI]);
2240 c->dst.val = c->regs[VCPU_REGS_RAX]; 2814 c->dst.val = c->regs[VCPU_REGS_RAX];
2241 register_address_increment(c, &c->regs[VCPU_REGS_RDI],
2242 (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
2243 : c->dst.bytes);
2244 break; 2815 break;
2245 case 0xac ... 0xad: /* lods */ 2816 case 0xac ... 0xad: /* lods */
2246 c->dst.type = OP_REG; 2817 goto mov;
2247 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
2248 c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX];
2249 rc = ops->read_emulated(register_address(c,
2250 seg_override_base(ctxt, c),
2251 c->regs[VCPU_REGS_RSI]),
2252 &c->dst.val,
2253 c->dst.bytes,
2254 ctxt->vcpu);
2255 if (rc != X86EMUL_CONTINUE)
2256 goto done;
2257 register_address_increment(c, &c->regs[VCPU_REGS_RSI],
2258 (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
2259 : c->dst.bytes);
2260 break;
2261 case 0xae ... 0xaf: /* scas */ 2818 case 0xae ... 0xaf: /* scas */
2262 DPRINTF("Urk! I don't handle SCAS.\n"); 2819 DPRINTF("Urk! I don't handle SCAS.\n");
2263 goto cannot_emulate; 2820 goto cannot_emulate;
@@ -2277,7 +2834,7 @@ special_insn:
2277 break; 2834 break;
2278 case 0xcb: /* ret far */ 2835 case 0xcb: /* ret far */
2279 rc = emulate_ret_far(ctxt, ops); 2836 rc = emulate_ret_far(ctxt, ops);
2280 if (rc) 2837 if (rc != X86EMUL_CONTINUE)
2281 goto done; 2838 goto done;
2282 break; 2839 break;
2283 case 0xd0 ... 0xd1: /* Grp2 */ 2840 case 0xd0 ... 0xd1: /* Grp2 */
@@ -2290,14 +2847,10 @@ special_insn:
2290 break; 2847 break;
2291 case 0xe4: /* inb */ 2848 case 0xe4: /* inb */
2292 case 0xe5: /* in */ 2849 case 0xe5: /* in */
2293 port = c->src.val; 2850 goto do_io_in;
2294 io_dir_in = 1;
2295 goto do_io;
2296 case 0xe6: /* outb */ 2851 case 0xe6: /* outb */
2297 case 0xe7: /* out */ 2852 case 0xe7: /* out */
2298 port = c->src.val; 2853 goto do_io_out;
2299 io_dir_in = 0;
2300 goto do_io;
2301 case 0xe8: /* call (near) */ { 2854 case 0xe8: /* call (near) */ {
2302 long int rel = c->src.val; 2855 long int rel = c->src.val;
2303 c->src.val = (unsigned long) c->eip; 2856 c->src.val = (unsigned long) c->eip;
@@ -2308,8 +2861,9 @@ special_insn:
2308 case 0xe9: /* jmp rel */ 2861 case 0xe9: /* jmp rel */
2309 goto jmp; 2862 goto jmp;
2310 case 0xea: /* jmp far */ 2863 case 0xea: /* jmp far */
2311 if (kvm_load_segment_descriptor(ctxt->vcpu, c->src2.val, 2864 jump_far:
2312 VCPU_SREG_CS)) 2865 if (load_segment_descriptor(ctxt, ops, c->src2.val,
2866 VCPU_SREG_CS))
2313 goto done; 2867 goto done;
2314 2868
2315 c->eip = c->src.val; 2869 c->eip = c->src.val;
@@ -2321,25 +2875,29 @@ special_insn:
2321 break; 2875 break;
2322 case 0xec: /* in al,dx */ 2876 case 0xec: /* in al,dx */
2323 case 0xed: /* in (e/r)ax,dx */ 2877 case 0xed: /* in (e/r)ax,dx */
2324 port = c->regs[VCPU_REGS_RDX]; 2878 c->src.val = c->regs[VCPU_REGS_RDX];
2325 io_dir_in = 1; 2879 do_io_in:
2326 goto do_io; 2880 c->dst.bytes = min(c->dst.bytes, 4u);
2881 if (!emulator_io_permited(ctxt, ops, c->src.val, c->dst.bytes)) {
2882 kvm_inject_gp(ctxt->vcpu, 0);
2883 goto done;
2884 }
2885 if (!pio_in_emulated(ctxt, ops, c->dst.bytes, c->src.val,
2886 &c->dst.val))
2887 goto done; /* IO is needed */
2888 break;
2327 case 0xee: /* out al,dx */ 2889 case 0xee: /* out al,dx */
2328 case 0xef: /* out (e/r)ax,dx */ 2890 case 0xef: /* out (e/r)ax,dx */
2329 port = c->regs[VCPU_REGS_RDX]; 2891 c->src.val = c->regs[VCPU_REGS_RDX];
2330 io_dir_in = 0; 2892 do_io_out:
2331 do_io: 2893 c->dst.bytes = min(c->dst.bytes, 4u);
2332 if (!emulator_io_permited(ctxt, ops, port, 2894 if (!emulator_io_permited(ctxt, ops, c->src.val, c->dst.bytes)) {
2333 (c->d & ByteOp) ? 1 : c->op_bytes)) {
2334 kvm_inject_gp(ctxt->vcpu, 0); 2895 kvm_inject_gp(ctxt->vcpu, 0);
2335 goto done; 2896 goto done;
2336 } 2897 }
2337 if (kvm_emulate_pio(ctxt->vcpu, io_dir_in, 2898 ops->pio_out_emulated(c->dst.bytes, c->src.val, &c->dst.val, 1,
2338 (c->d & ByteOp) ? 1 : c->op_bytes, 2899 ctxt->vcpu);
2339 port) != 0) { 2900 c->dst.type = OP_NONE; /* Disable writeback. */
2340 c->eip = saved_eip;
2341 goto cannot_emulate;
2342 }
2343 break; 2901 break;
2344 case 0xf4: /* hlt */ 2902 case 0xf4: /* hlt */
2345 ctxt->vcpu->arch.halt_request = 1; 2903 ctxt->vcpu->arch.halt_request = 1;
@@ -2350,16 +2908,15 @@ special_insn:
2350 c->dst.type = OP_NONE; /* Disable writeback. */ 2908 c->dst.type = OP_NONE; /* Disable writeback. */
2351 break; 2909 break;
2352 case 0xf6 ... 0xf7: /* Grp3 */ 2910 case 0xf6 ... 0xf7: /* Grp3 */
2353 rc = emulate_grp3(ctxt, ops); 2911 if (!emulate_grp3(ctxt, ops))
2354 if (rc != 0) 2912 goto cannot_emulate;
2355 goto done;
2356 break; 2913 break;
2357 case 0xf8: /* clc */ 2914 case 0xf8: /* clc */
2358 ctxt->eflags &= ~EFLG_CF; 2915 ctxt->eflags &= ~EFLG_CF;
2359 c->dst.type = OP_NONE; /* Disable writeback. */ 2916 c->dst.type = OP_NONE; /* Disable writeback. */
2360 break; 2917 break;
2361 case 0xfa: /* cli */ 2918 case 0xfa: /* cli */
2362 if (emulator_bad_iopl(ctxt)) 2919 if (emulator_bad_iopl(ctxt, ops))
2363 kvm_inject_gp(ctxt->vcpu, 0); 2920 kvm_inject_gp(ctxt->vcpu, 0);
2364 else { 2921 else {
2365 ctxt->eflags &= ~X86_EFLAGS_IF; 2922 ctxt->eflags &= ~X86_EFLAGS_IF;
@@ -2367,10 +2924,10 @@ special_insn:
2367 } 2924 }
2368 break; 2925 break;
2369 case 0xfb: /* sti */ 2926 case 0xfb: /* sti */
2370 if (emulator_bad_iopl(ctxt)) 2927 if (emulator_bad_iopl(ctxt, ops))
2371 kvm_inject_gp(ctxt->vcpu, 0); 2928 kvm_inject_gp(ctxt->vcpu, 0);
2372 else { 2929 else {
2373 toggle_interruptibility(ctxt, X86_SHADOW_INT_STI); 2930 toggle_interruptibility(ctxt, KVM_X86_SHADOW_INT_STI);
2374 ctxt->eflags |= X86_EFLAGS_IF; 2931 ctxt->eflags |= X86_EFLAGS_IF;
2375 c->dst.type = OP_NONE; /* Disable writeback. */ 2932 c->dst.type = OP_NONE; /* Disable writeback. */
2376 } 2933 }
@@ -2383,28 +2940,55 @@ special_insn:
2383 ctxt->eflags |= EFLG_DF; 2940 ctxt->eflags |= EFLG_DF;
2384 c->dst.type = OP_NONE; /* Disable writeback. */ 2941 c->dst.type = OP_NONE; /* Disable writeback. */
2385 break; 2942 break;
2386 case 0xfe ... 0xff: /* Grp4/Grp5 */ 2943 case 0xfe: /* Grp4 */
2944 grp45:
2387 rc = emulate_grp45(ctxt, ops); 2945 rc = emulate_grp45(ctxt, ops);
2388 if (rc != 0) 2946 if (rc != X86EMUL_CONTINUE)
2389 goto done; 2947 goto done;
2390 break; 2948 break;
2949 case 0xff: /* Grp5 */
2950 if (c->modrm_reg == 5)
2951 goto jump_far;
2952 goto grp45;
2391 } 2953 }
2392 2954
2393writeback: 2955writeback:
2394 rc = writeback(ctxt, ops); 2956 rc = writeback(ctxt, ops);
2395 if (rc != 0) 2957 if (rc != X86EMUL_CONTINUE)
2396 goto done; 2958 goto done;
2397 2959
2960 /*
2961 * restore dst type in case the decoding will be reused
2962 * (happens for string instructions)
2963 */
2964 c->dst.type = saved_dst_type;
2965
2966 if ((c->d & SrcMask) == SrcSI)
2967 string_addr_inc(ctxt, seg_override_base(ctxt, c), VCPU_REGS_RSI,
2968 &c->src);
2969
2970 if ((c->d & DstMask) == DstDI)
2971 string_addr_inc(ctxt, es_base(ctxt), VCPU_REGS_RDI, &c->dst);
2972
2973 if (c->rep_prefix && (c->d & String)) {
2974 struct read_cache *rc = &ctxt->decode.io_read;
2975 register_address_increment(c, &c->regs[VCPU_REGS_RCX], -1);
2976 /*
2977 * Re-enter guest when pio read ahead buffer is empty or,
2978 * if it is not used, after every 1024 iterations.
2979 */
2980 if ((rc->end == 0 && !(c->regs[VCPU_REGS_RCX] & 0x3ff)) ||
2981 (rc->end != 0 && rc->end == rc->pos))
2982 ctxt->restart = false;
2983 }
2984
2398 /* Commit shadow register state. */ 2985 /* Commit shadow register state. */
2399 memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs); 2986 memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs);
2400 kvm_rip_write(ctxt->vcpu, c->eip); 2987 kvm_rip_write(ctxt->vcpu, c->eip);
2988 ops->set_rflags(ctxt->vcpu, ctxt->eflags);
2401 2989
2402done: 2990done:
2403 if (rc == X86EMUL_UNHANDLEABLE) { 2991 return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
2404 c->eip = saved_eip;
2405 return -1;
2406 }
2407 return 0;
2408 2992
2409twobyte_insn: 2993twobyte_insn:
2410 switch (c->b) { 2994 switch (c->b) {
@@ -2418,18 +3002,18 @@ twobyte_insn:
2418 goto cannot_emulate; 3002 goto cannot_emulate;
2419 3003
2420 rc = kvm_fix_hypercall(ctxt->vcpu); 3004 rc = kvm_fix_hypercall(ctxt->vcpu);
2421 if (rc) 3005 if (rc != X86EMUL_CONTINUE)
2422 goto done; 3006 goto done;
2423 3007
2424 /* Let the processor re-execute the fixed hypercall */ 3008 /* Let the processor re-execute the fixed hypercall */
2425 c->eip = kvm_rip_read(ctxt->vcpu); 3009 c->eip = ctxt->eip;
2426 /* Disable writeback. */ 3010 /* Disable writeback. */
2427 c->dst.type = OP_NONE; 3011 c->dst.type = OP_NONE;
2428 break; 3012 break;
2429 case 2: /* lgdt */ 3013 case 2: /* lgdt */
2430 rc = read_descriptor(ctxt, ops, c->src.ptr, 3014 rc = read_descriptor(ctxt, ops, c->src.ptr,
2431 &size, &address, c->op_bytes); 3015 &size, &address, c->op_bytes);
2432 if (rc) 3016 if (rc != X86EMUL_CONTINUE)
2433 goto done; 3017 goto done;
2434 realmode_lgdt(ctxt->vcpu, size, address); 3018 realmode_lgdt(ctxt->vcpu, size, address);
2435 /* Disable writeback. */ 3019 /* Disable writeback. */
@@ -2440,7 +3024,7 @@ twobyte_insn:
2440 switch (c->modrm_rm) { 3024 switch (c->modrm_rm) {
2441 case 1: 3025 case 1:
2442 rc = kvm_fix_hypercall(ctxt->vcpu); 3026 rc = kvm_fix_hypercall(ctxt->vcpu);
2443 if (rc) 3027 if (rc != X86EMUL_CONTINUE)
2444 goto done; 3028 goto done;
2445 break; 3029 break;
2446 default: 3030 default:
@@ -2450,7 +3034,7 @@ twobyte_insn:
2450 rc = read_descriptor(ctxt, ops, c->src.ptr, 3034 rc = read_descriptor(ctxt, ops, c->src.ptr,
2451 &size, &address, 3035 &size, &address,
2452 c->op_bytes); 3036 c->op_bytes);
2453 if (rc) 3037 if (rc != X86EMUL_CONTINUE)
2454 goto done; 3038 goto done;
2455 realmode_lidt(ctxt->vcpu, size, address); 3039 realmode_lidt(ctxt->vcpu, size, address);
2456 } 3040 }
@@ -2459,15 +3043,18 @@ twobyte_insn:
2459 break; 3043 break;
2460 case 4: /* smsw */ 3044 case 4: /* smsw */
2461 c->dst.bytes = 2; 3045 c->dst.bytes = 2;
2462 c->dst.val = realmode_get_cr(ctxt->vcpu, 0); 3046 c->dst.val = ops->get_cr(0, ctxt->vcpu);
2463 break; 3047 break;
2464 case 6: /* lmsw */ 3048 case 6: /* lmsw */
2465 realmode_lmsw(ctxt->vcpu, (u16)c->src.val, 3049 ops->set_cr(0, (ops->get_cr(0, ctxt->vcpu) & ~0x0ful) |
2466 &ctxt->eflags); 3050 (c->src.val & 0x0f), ctxt->vcpu);
2467 c->dst.type = OP_NONE; 3051 c->dst.type = OP_NONE;
2468 break; 3052 break;
3053 case 5: /* not defined */
3054 kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
3055 goto done;
2469 case 7: /* invlpg*/ 3056 case 7: /* invlpg*/
2470 emulate_invlpg(ctxt->vcpu, memop); 3057 emulate_invlpg(ctxt->vcpu, c->modrm_ea);
2471 /* Disable writeback. */ 3058 /* Disable writeback. */
2472 c->dst.type = OP_NONE; 3059 c->dst.type = OP_NONE;
2473 break; 3060 break;
@@ -2493,54 +3080,54 @@ twobyte_insn:
2493 c->dst.type = OP_NONE; 3080 c->dst.type = OP_NONE;
2494 break; 3081 break;
2495 case 0x20: /* mov cr, reg */ 3082 case 0x20: /* mov cr, reg */
2496 if (c->modrm_mod != 3) 3083 switch (c->modrm_reg) {
2497 goto cannot_emulate; 3084 case 1:
2498 c->regs[c->modrm_rm] = 3085 case 5 ... 7:
2499 realmode_get_cr(ctxt->vcpu, c->modrm_reg); 3086 case 9 ... 15:
3087 kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
3088 goto done;
3089 }
3090 c->regs[c->modrm_rm] = ops->get_cr(c->modrm_reg, ctxt->vcpu);
2500 c->dst.type = OP_NONE; /* no writeback */ 3091 c->dst.type = OP_NONE; /* no writeback */
2501 break; 3092 break;
2502 case 0x21: /* mov from dr to reg */ 3093 case 0x21: /* mov from dr to reg */
2503 if (c->modrm_mod != 3) 3094 if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) &&
2504 goto cannot_emulate; 3095 (c->modrm_reg == 4 || c->modrm_reg == 5)) {
2505 rc = emulator_get_dr(ctxt, c->modrm_reg, &c->regs[c->modrm_rm]); 3096 kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
2506 if (rc) 3097 goto done;
2507 goto cannot_emulate; 3098 }
3099 emulator_get_dr(ctxt, c->modrm_reg, &c->regs[c->modrm_rm]);
2508 c->dst.type = OP_NONE; /* no writeback */ 3100 c->dst.type = OP_NONE; /* no writeback */
2509 break; 3101 break;
2510 case 0x22: /* mov reg, cr */ 3102 case 0x22: /* mov reg, cr */
2511 if (c->modrm_mod != 3) 3103 ops->set_cr(c->modrm_reg, c->modrm_val, ctxt->vcpu);
2512 goto cannot_emulate;
2513 realmode_set_cr(ctxt->vcpu,
2514 c->modrm_reg, c->modrm_val, &ctxt->eflags);
2515 c->dst.type = OP_NONE; 3104 c->dst.type = OP_NONE;
2516 break; 3105 break;
2517 case 0x23: /* mov from reg to dr */ 3106 case 0x23: /* mov from reg to dr */
2518 if (c->modrm_mod != 3) 3107 if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) &&
2519 goto cannot_emulate; 3108 (c->modrm_reg == 4 || c->modrm_reg == 5)) {
2520 rc = emulator_set_dr(ctxt, c->modrm_reg, 3109 kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
2521 c->regs[c->modrm_rm]); 3110 goto done;
2522 if (rc) 3111 }
2523 goto cannot_emulate; 3112 emulator_set_dr(ctxt, c->modrm_reg, c->regs[c->modrm_rm]);
2524 c->dst.type = OP_NONE; /* no writeback */ 3113 c->dst.type = OP_NONE; /* no writeback */
2525 break; 3114 break;
2526 case 0x30: 3115 case 0x30:
2527 /* wrmsr */ 3116 /* wrmsr */
2528 msr_data = (u32)c->regs[VCPU_REGS_RAX] 3117 msr_data = (u32)c->regs[VCPU_REGS_RAX]
2529 | ((u64)c->regs[VCPU_REGS_RDX] << 32); 3118 | ((u64)c->regs[VCPU_REGS_RDX] << 32);
2530 rc = kvm_set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data); 3119 if (kvm_set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data)) {
2531 if (rc) {
2532 kvm_inject_gp(ctxt->vcpu, 0); 3120 kvm_inject_gp(ctxt->vcpu, 0);
2533 c->eip = kvm_rip_read(ctxt->vcpu); 3121 goto done;
2534 } 3122 }
2535 rc = X86EMUL_CONTINUE; 3123 rc = X86EMUL_CONTINUE;
2536 c->dst.type = OP_NONE; 3124 c->dst.type = OP_NONE;
2537 break; 3125 break;
2538 case 0x32: 3126 case 0x32:
2539 /* rdmsr */ 3127 /* rdmsr */
2540 rc = kvm_get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data); 3128 if (kvm_get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data)) {
2541 if (rc) {
2542 kvm_inject_gp(ctxt->vcpu, 0); 3129 kvm_inject_gp(ctxt->vcpu, 0);
2543 c->eip = kvm_rip_read(ctxt->vcpu); 3130 goto done;
2544 } else { 3131 } else {
2545 c->regs[VCPU_REGS_RAX] = (u32)msr_data; 3132 c->regs[VCPU_REGS_RAX] = (u32)msr_data;
2546 c->regs[VCPU_REGS_RDX] = msr_data >> 32; 3133 c->regs[VCPU_REGS_RDX] = msr_data >> 32;
@@ -2577,7 +3164,7 @@ twobyte_insn:
2577 break; 3164 break;
2578 case 0xa1: /* pop fs */ 3165 case 0xa1: /* pop fs */
2579 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_FS); 3166 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_FS);
2580 if (rc != 0) 3167 if (rc != X86EMUL_CONTINUE)
2581 goto done; 3168 goto done;
2582 break; 3169 break;
2583 case 0xa3: 3170 case 0xa3:
@@ -2596,7 +3183,7 @@ twobyte_insn:
2596 break; 3183 break;
2597 case 0xa9: /* pop gs */ 3184 case 0xa9: /* pop gs */
2598 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_GS); 3185 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_GS);
2599 if (rc != 0) 3186 if (rc != X86EMUL_CONTINUE)
2600 goto done; 3187 goto done;
2601 break; 3188 break;
2602 case 0xab: 3189 case 0xab:
@@ -2668,16 +3255,14 @@ twobyte_insn:
2668 (u64) c->src.val; 3255 (u64) c->src.val;
2669 break; 3256 break;
2670 case 0xc7: /* Grp9 (cmpxchg8b) */ 3257 case 0xc7: /* Grp9 (cmpxchg8b) */
2671 rc = emulate_grp9(ctxt, ops, memop); 3258 rc = emulate_grp9(ctxt, ops);
2672 if (rc != 0) 3259 if (rc != X86EMUL_CONTINUE)
2673 goto done; 3260 goto done;
2674 c->dst.type = OP_NONE;
2675 break; 3261 break;
2676 } 3262 }
2677 goto writeback; 3263 goto writeback;
2678 3264
2679cannot_emulate: 3265cannot_emulate:
2680 DPRINTF("Cannot emulate %02x\n", c->b); 3266 DPRINTF("Cannot emulate %02x\n", c->b);
2681 c->eip = saved_eip;
2682 return -1; 3267 return -1;
2683} 3268}
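
The twobyte lmsw/smsw and mov-cr/mov-dr cases above now go through the emulator's get_cr/set_cr callbacks instead of the realmode_* helpers. As a reminder of what the new lmsw code computes: only the low four bits of CR0 (PE, MP, EM, TS) are taken from the source operand and the rest of CR0 is preserved. A stand-alone sketch of that masking (illustrative helper, not a kernel function):

/* lmsw as emulated above: merge only CR0 bits 3:0 from the source. */
static unsigned long lmsw_merge_cr0(unsigned long cur_cr0, unsigned short src)
{
        return (cur_cr0 & ~0x0ful) | (src & 0x0f);
}
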
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index a790fa128a9f..93825ff3338f 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -33,6 +33,29 @@
33#include <linux/kvm_host.h> 33#include <linux/kvm_host.h>
34#include "trace.h" 34#include "trace.h"
35 35
36static void pic_lock(struct kvm_pic *s)
37 __acquires(&s->lock)
38{
39 raw_spin_lock(&s->lock);
40}
41
42static void pic_unlock(struct kvm_pic *s)
43 __releases(&s->lock)
44{
45 bool wakeup = s->wakeup_needed;
46 struct kvm_vcpu *vcpu;
47
48 s->wakeup_needed = false;
49
50 raw_spin_unlock(&s->lock);
51
52 if (wakeup) {
53 vcpu = s->kvm->bsp_vcpu;
54 if (vcpu)
55 kvm_vcpu_kick(vcpu);
56 }
57}
58
36static void pic_clear_isr(struct kvm_kpic_state *s, int irq) 59static void pic_clear_isr(struct kvm_kpic_state *s, int irq)
37{ 60{
38 s->isr &= ~(1 << irq); 61 s->isr &= ~(1 << irq);
@@ -45,19 +68,19 @@ static void pic_clear_isr(struct kvm_kpic_state *s, int irq)
45 * Other interrupt may be delivered to PIC while lock is dropped but 68 * Other interrupt may be delivered to PIC while lock is dropped but
46 * it should be safe since PIC state is already updated at this stage. 69 * it should be safe since PIC state is already updated at this stage.
47 */ 70 */
48 raw_spin_unlock(&s->pics_state->lock); 71 pic_unlock(s->pics_state);
49 kvm_notify_acked_irq(s->pics_state->kvm, SELECT_PIC(irq), irq); 72 kvm_notify_acked_irq(s->pics_state->kvm, SELECT_PIC(irq), irq);
50 raw_spin_lock(&s->pics_state->lock); 73 pic_lock(s->pics_state);
51} 74}
52 75
53void kvm_pic_clear_isr_ack(struct kvm *kvm) 76void kvm_pic_clear_isr_ack(struct kvm *kvm)
54{ 77{
55 struct kvm_pic *s = pic_irqchip(kvm); 78 struct kvm_pic *s = pic_irqchip(kvm);
56 79
57 raw_spin_lock(&s->lock); 80 pic_lock(s);
58 s->pics[0].isr_ack = 0xff; 81 s->pics[0].isr_ack = 0xff;
59 s->pics[1].isr_ack = 0xff; 82 s->pics[1].isr_ack = 0xff;
60 raw_spin_unlock(&s->lock); 83 pic_unlock(s);
61} 84}
62 85
63/* 86/*
@@ -158,9 +181,9 @@ static void pic_update_irq(struct kvm_pic *s)
158 181
159void kvm_pic_update_irq(struct kvm_pic *s) 182void kvm_pic_update_irq(struct kvm_pic *s)
160{ 183{
161 raw_spin_lock(&s->lock); 184 pic_lock(s);
162 pic_update_irq(s); 185 pic_update_irq(s);
163 raw_spin_unlock(&s->lock); 186 pic_unlock(s);
164} 187}
165 188
166int kvm_pic_set_irq(void *opaque, int irq, int level) 189int kvm_pic_set_irq(void *opaque, int irq, int level)
@@ -168,14 +191,14 @@ int kvm_pic_set_irq(void *opaque, int irq, int level)
168 struct kvm_pic *s = opaque; 191 struct kvm_pic *s = opaque;
169 int ret = -1; 192 int ret = -1;
170 193
171 raw_spin_lock(&s->lock); 194 pic_lock(s);
172 if (irq >= 0 && irq < PIC_NUM_PINS) { 195 if (irq >= 0 && irq < PIC_NUM_PINS) {
173 ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, level); 196 ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, level);
174 pic_update_irq(s); 197 pic_update_irq(s);
175 trace_kvm_pic_set_irq(irq >> 3, irq & 7, s->pics[irq >> 3].elcr, 198 trace_kvm_pic_set_irq(irq >> 3, irq & 7, s->pics[irq >> 3].elcr,
176 s->pics[irq >> 3].imr, ret == 0); 199 s->pics[irq >> 3].imr, ret == 0);
177 } 200 }
178 raw_spin_unlock(&s->lock); 201 pic_unlock(s);
179 202
180 return ret; 203 return ret;
181} 204}
@@ -205,7 +228,7 @@ int kvm_pic_read_irq(struct kvm *kvm)
205 int irq, irq2, intno; 228 int irq, irq2, intno;
206 struct kvm_pic *s = pic_irqchip(kvm); 229 struct kvm_pic *s = pic_irqchip(kvm);
207 230
208 raw_spin_lock(&s->lock); 231 pic_lock(s);
209 irq = pic_get_irq(&s->pics[0]); 232 irq = pic_get_irq(&s->pics[0]);
210 if (irq >= 0) { 233 if (irq >= 0) {
211 pic_intack(&s->pics[0], irq); 234 pic_intack(&s->pics[0], irq);
@@ -230,7 +253,7 @@ int kvm_pic_read_irq(struct kvm *kvm)
230 intno = s->pics[0].irq_base + irq; 253 intno = s->pics[0].irq_base + irq;
231 } 254 }
232 pic_update_irq(s); 255 pic_update_irq(s);
233 raw_spin_unlock(&s->lock); 256 pic_unlock(s);
234 257
235 return intno; 258 return intno;
236} 259}
@@ -444,7 +467,7 @@ static int picdev_write(struct kvm_io_device *this,
444 printk(KERN_ERR "PIC: non byte write\n"); 467 printk(KERN_ERR "PIC: non byte write\n");
445 return 0; 468 return 0;
446 } 469 }
447 raw_spin_lock(&s->lock); 470 pic_lock(s);
448 switch (addr) { 471 switch (addr) {
449 case 0x20: 472 case 0x20:
450 case 0x21: 473 case 0x21:
@@ -457,7 +480,7 @@ static int picdev_write(struct kvm_io_device *this,
457 elcr_ioport_write(&s->pics[addr & 1], addr, data); 480 elcr_ioport_write(&s->pics[addr & 1], addr, data);
458 break; 481 break;
459 } 482 }
460 raw_spin_unlock(&s->lock); 483 pic_unlock(s);
461 return 0; 484 return 0;
462} 485}
463 486
@@ -474,7 +497,7 @@ static int picdev_read(struct kvm_io_device *this,
474 printk(KERN_ERR "PIC: non byte read\n"); 497 printk(KERN_ERR "PIC: non byte read\n");
475 return 0; 498 return 0;
476 } 499 }
477 raw_spin_lock(&s->lock); 500 pic_lock(s);
478 switch (addr) { 501 switch (addr) {
479 case 0x20: 502 case 0x20:
480 case 0x21: 503 case 0x21:
@@ -488,7 +511,7 @@ static int picdev_read(struct kvm_io_device *this,
488 break; 511 break;
489 } 512 }
490 *(unsigned char *)val = data; 513 *(unsigned char *)val = data;
491 raw_spin_unlock(&s->lock); 514 pic_unlock(s);
492 return 0; 515 return 0;
493} 516}
494 517
@@ -505,7 +528,7 @@ static void pic_irq_request(void *opaque, int level)
505 s->output = level; 528 s->output = level;
506 if (vcpu && level && (s->pics[0].isr_ack & (1 << irq))) { 529 if (vcpu && level && (s->pics[0].isr_ack & (1 << irq))) {
507 s->pics[0].isr_ack &= ~(1 << irq); 530 s->pics[0].isr_ack &= ~(1 << irq);
508 kvm_vcpu_kick(vcpu); 531 s->wakeup_needed = true;
509 } 532 }
510} 533}
511 534
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index 34b15915754d..cd1f362f413d 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -63,6 +63,7 @@ struct kvm_kpic_state {
63 63
64struct kvm_pic { 64struct kvm_pic {
65 raw_spinlock_t lock; 65 raw_spinlock_t lock;
66 bool wakeup_needed;
66 unsigned pending_acks; 67 unsigned pending_acks;
67 struct kvm *kvm; 68 struct kvm *kvm;
68 struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */ 69 struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */
diff --git a/arch/x86/kvm/kvm_timer.h b/arch/x86/kvm/kvm_timer.h
index 55c7524dda54..64bc6ea78d90 100644
--- a/arch/x86/kvm/kvm_timer.h
+++ b/arch/x86/kvm/kvm_timer.h
@@ -10,9 +10,7 @@ struct kvm_timer {
10}; 10};
11 11
12struct kvm_timer_ops { 12struct kvm_timer_ops {
13 bool (*is_periodic)(struct kvm_timer *); 13 bool (*is_periodic)(struct kvm_timer *);
14}; 14};
15 15
16
17enum hrtimer_restart kvm_timer_fn(struct hrtimer *data); 16enum hrtimer_restart kvm_timer_fn(struct hrtimer *data);
18
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 48aeee8eefb0..ddfa8658fb6d 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -148,7 +148,6 @@ module_param(oos_shadow, bool, 0644);
148 148
149#include <trace/events/kvm.h> 149#include <trace/events/kvm.h>
150 150
151#undef TRACE_INCLUDE_FILE
152#define CREATE_TRACE_POINTS 151#define CREATE_TRACE_POINTS
153#include "mmutrace.h" 152#include "mmutrace.h"
154 153
@@ -174,12 +173,7 @@ struct kvm_shadow_walk_iterator {
174 shadow_walk_okay(&(_walker)); \ 173 shadow_walk_okay(&(_walker)); \
175 shadow_walk_next(&(_walker))) 174 shadow_walk_next(&(_walker)))
176 175
177 176typedef int (*mmu_parent_walk_fn) (struct kvm_mmu_page *sp);
178struct kvm_unsync_walk {
179 int (*entry) (struct kvm_mmu_page *sp, struct kvm_unsync_walk *walk);
180};
181
182typedef int (*mmu_parent_walk_fn) (struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp);
183 177
184static struct kmem_cache *pte_chain_cache; 178static struct kmem_cache *pte_chain_cache;
185static struct kmem_cache *rmap_desc_cache; 179static struct kmem_cache *rmap_desc_cache;
@@ -327,7 +321,6 @@ static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache,
327 page = alloc_page(GFP_KERNEL); 321 page = alloc_page(GFP_KERNEL);
328 if (!page) 322 if (!page)
329 return -ENOMEM; 323 return -ENOMEM;
330 set_page_private(page, 0);
331 cache->objects[cache->nobjs++] = page_address(page); 324 cache->objects[cache->nobjs++] = page_address(page);
332 } 325 }
333 return 0; 326 return 0;
@@ -438,9 +431,9 @@ static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn)
438 int i; 431 int i;
439 432
440 gfn = unalias_gfn(kvm, gfn); 433 gfn = unalias_gfn(kvm, gfn);
434 slot = gfn_to_memslot_unaliased(kvm, gfn);
441 for (i = PT_DIRECTORY_LEVEL; 435 for (i = PT_DIRECTORY_LEVEL;
442 i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { 436 i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
443 slot = gfn_to_memslot_unaliased(kvm, gfn);
444 write_count = slot_largepage_idx(gfn, slot, i); 437 write_count = slot_largepage_idx(gfn, slot, i);
445 *write_count -= 1; 438 *write_count -= 1;
446 WARN_ON(*write_count < 0); 439 WARN_ON(*write_count < 0);
@@ -654,7 +647,6 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
654static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte) 647static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte)
655{ 648{
656 struct kvm_rmap_desc *desc; 649 struct kvm_rmap_desc *desc;
657 struct kvm_rmap_desc *prev_desc;
658 u64 *prev_spte; 650 u64 *prev_spte;
659 int i; 651 int i;
660 652
@@ -666,7 +658,6 @@ static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte)
666 return NULL; 658 return NULL;
667 } 659 }
668 desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); 660 desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
669 prev_desc = NULL;
670 prev_spte = NULL; 661 prev_spte = NULL;
671 while (desc) { 662 while (desc) {
672 for (i = 0; i < RMAP_EXT && desc->sptes[i]; ++i) { 663 for (i = 0; i < RMAP_EXT && desc->sptes[i]; ++i) {
@@ -794,7 +785,7 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
794 int retval = 0; 785 int retval = 0;
795 struct kvm_memslots *slots; 786 struct kvm_memslots *slots;
796 787
797 slots = rcu_dereference(kvm->memslots); 788 slots = kvm_memslots(kvm);
798 789
799 for (i = 0; i < slots->nmemslots; i++) { 790 for (i = 0; i < slots->nmemslots; i++) {
800 struct kvm_memory_slot *memslot = &slots->memslots[i]; 791 struct kvm_memory_slot *memslot = &slots->memslots[i];
@@ -925,7 +916,6 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
925 sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE); 916 sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
926 set_page_private(virt_to_page(sp->spt), (unsigned long)sp); 917 set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
927 list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages); 918 list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
928 INIT_LIST_HEAD(&sp->oos_link);
929 bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS); 919 bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS);
930 sp->multimapped = 0; 920 sp->multimapped = 0;
931 sp->parent_pte = parent_pte; 921 sp->parent_pte = parent_pte;
@@ -1009,8 +999,7 @@ static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
1009} 999}
1010 1000
1011 1001
1012static void mmu_parent_walk(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, 1002static void mmu_parent_walk(struct kvm_mmu_page *sp, mmu_parent_walk_fn fn)
1013 mmu_parent_walk_fn fn)
1014{ 1003{
1015 struct kvm_pte_chain *pte_chain; 1004 struct kvm_pte_chain *pte_chain;
1016 struct hlist_node *node; 1005 struct hlist_node *node;
@@ -1019,8 +1008,8 @@ static void mmu_parent_walk(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
1019 1008
1020 if (!sp->multimapped && sp->parent_pte) { 1009 if (!sp->multimapped && sp->parent_pte) {
1021 parent_sp = page_header(__pa(sp->parent_pte)); 1010 parent_sp = page_header(__pa(sp->parent_pte));
1022 fn(vcpu, parent_sp); 1011 fn(parent_sp);
1023 mmu_parent_walk(vcpu, parent_sp, fn); 1012 mmu_parent_walk(parent_sp, fn);
1024 return; 1013 return;
1025 } 1014 }
1026 hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link) 1015 hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
@@ -1028,8 +1017,8 @@ static void mmu_parent_walk(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
1028 if (!pte_chain->parent_ptes[i]) 1017 if (!pte_chain->parent_ptes[i])
1029 break; 1018 break;
1030 parent_sp = page_header(__pa(pte_chain->parent_ptes[i])); 1019 parent_sp = page_header(__pa(pte_chain->parent_ptes[i]));
1031 fn(vcpu, parent_sp); 1020 fn(parent_sp);
1032 mmu_parent_walk(vcpu, parent_sp, fn); 1021 mmu_parent_walk(parent_sp, fn);
1033 } 1022 }
1034} 1023}
1035 1024
@@ -1066,16 +1055,15 @@ static void kvm_mmu_update_parents_unsync(struct kvm_mmu_page *sp)
1066 } 1055 }
1067} 1056}
1068 1057
1069static int unsync_walk_fn(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) 1058static int unsync_walk_fn(struct kvm_mmu_page *sp)
1070{ 1059{
1071 kvm_mmu_update_parents_unsync(sp); 1060 kvm_mmu_update_parents_unsync(sp);
1072 return 1; 1061 return 1;
1073} 1062}
1074 1063
1075static void kvm_mmu_mark_parents_unsync(struct kvm_vcpu *vcpu, 1064static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp)
1076 struct kvm_mmu_page *sp)
1077{ 1065{
1078 mmu_parent_walk(vcpu, sp, unsync_walk_fn); 1066 mmu_parent_walk(sp, unsync_walk_fn);
1079 kvm_mmu_update_parents_unsync(sp); 1067 kvm_mmu_update_parents_unsync(sp);
1080} 1068}
1081 1069
@@ -1209,7 +1197,7 @@ static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp);
1209 1197
1210static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) 1198static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
1211{ 1199{
1212 if (sp->role.glevels != vcpu->arch.mmu.root_level) { 1200 if (sp->role.cr4_pae != !!is_pae(vcpu)) {
1213 kvm_mmu_zap_page(vcpu->kvm, sp); 1201 kvm_mmu_zap_page(vcpu->kvm, sp);
1214 return 1; 1202 return 1;
1215 } 1203 }
@@ -1331,6 +1319,8 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
1331 role = vcpu->arch.mmu.base_role; 1319 role = vcpu->arch.mmu.base_role;
1332 role.level = level; 1320 role.level = level;
1333 role.direct = direct; 1321 role.direct = direct;
1322 if (role.direct)
1323 role.cr4_pae = 0;
1334 role.access = access; 1324 role.access = access;
1335 if (vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) { 1325 if (vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) {
1336 quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level)); 1326 quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
@@ -1351,7 +1341,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
1351 mmu_page_add_parent_pte(vcpu, sp, parent_pte); 1341 mmu_page_add_parent_pte(vcpu, sp, parent_pte);
1352 if (sp->unsync_children) { 1342 if (sp->unsync_children) {
1353 set_bit(KVM_REQ_MMU_SYNC, &vcpu->requests); 1343 set_bit(KVM_REQ_MMU_SYNC, &vcpu->requests);
1354 kvm_mmu_mark_parents_unsync(vcpu, sp); 1344 kvm_mmu_mark_parents_unsync(sp);
1355 } 1345 }
1356 trace_kvm_mmu_get_page(sp, false); 1346 trace_kvm_mmu_get_page(sp, false);
1357 return sp; 1347 return sp;
@@ -1490,8 +1480,8 @@ static int mmu_zap_unsync_children(struct kvm *kvm,
1490 for_each_sp(pages, sp, parents, i) { 1480 for_each_sp(pages, sp, parents, i) {
1491 kvm_mmu_zap_page(kvm, sp); 1481 kvm_mmu_zap_page(kvm, sp);
1492 mmu_pages_clear_parents(&parents); 1482 mmu_pages_clear_parents(&parents);
1483 zapped++;
1493 } 1484 }
1494 zapped += pages.nr;
1495 kvm_mmu_pages_init(parent, &parents, &pages); 1485 kvm_mmu_pages_init(parent, &parents, &pages);
1496 } 1486 }
1497 1487
@@ -1542,14 +1532,16 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages)
1542 */ 1532 */
1543 1533
1544 if (used_pages > kvm_nr_mmu_pages) { 1534 if (used_pages > kvm_nr_mmu_pages) {
1545 while (used_pages > kvm_nr_mmu_pages) { 1535 while (used_pages > kvm_nr_mmu_pages &&
1536 !list_empty(&kvm->arch.active_mmu_pages)) {
1546 struct kvm_mmu_page *page; 1537 struct kvm_mmu_page *page;
1547 1538
1548 page = container_of(kvm->arch.active_mmu_pages.prev, 1539 page = container_of(kvm->arch.active_mmu_pages.prev,
1549 struct kvm_mmu_page, link); 1540 struct kvm_mmu_page, link);
1550 kvm_mmu_zap_page(kvm, page); 1541 used_pages -= kvm_mmu_zap_page(kvm, page);
1551 used_pages--; 1542 used_pages--;
1552 } 1543 }
1544 kvm_nr_mmu_pages = used_pages;
1553 kvm->arch.n_free_mmu_pages = 0; 1545 kvm->arch.n_free_mmu_pages = 0;
1554 } 1546 }
1555 else 1547 else
@@ -1571,13 +1563,14 @@ static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
1571 r = 0; 1563 r = 0;
1572 index = kvm_page_table_hashfn(gfn); 1564 index = kvm_page_table_hashfn(gfn);
1573 bucket = &kvm->arch.mmu_page_hash[index]; 1565 bucket = &kvm->arch.mmu_page_hash[index];
1566restart:
1574 hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) 1567 hlist_for_each_entry_safe(sp, node, n, bucket, hash_link)
1575 if (sp->gfn == gfn && !sp->role.direct) { 1568 if (sp->gfn == gfn && !sp->role.direct) {
1576 pgprintk("%s: gfn %lx role %x\n", __func__, gfn, 1569 pgprintk("%s: gfn %lx role %x\n", __func__, gfn,
1577 sp->role.word); 1570 sp->role.word);
1578 r = 1; 1571 r = 1;
1579 if (kvm_mmu_zap_page(kvm, sp)) 1572 if (kvm_mmu_zap_page(kvm, sp))
1580 n = bucket->first; 1573 goto restart;
1581 } 1574 }
1582 return r; 1575 return r;
1583} 1576}
@@ -1591,12 +1584,14 @@ static void mmu_unshadow(struct kvm *kvm, gfn_t gfn)
1591 1584
1592 index = kvm_page_table_hashfn(gfn); 1585 index = kvm_page_table_hashfn(gfn);
1593 bucket = &kvm->arch.mmu_page_hash[index]; 1586 bucket = &kvm->arch.mmu_page_hash[index];
1587restart:
1594 hlist_for_each_entry_safe(sp, node, nn, bucket, hash_link) { 1588 hlist_for_each_entry_safe(sp, node, nn, bucket, hash_link) {
1595 if (sp->gfn == gfn && !sp->role.direct 1589 if (sp->gfn == gfn && !sp->role.direct
1596 && !sp->role.invalid) { 1590 && !sp->role.invalid) {
1597 pgprintk("%s: zap %lx %x\n", 1591 pgprintk("%s: zap %lx %x\n",
1598 __func__, gfn, sp->role.word); 1592 __func__, gfn, sp->role.word);
1599 kvm_mmu_zap_page(kvm, sp); 1593 if (kvm_mmu_zap_page(kvm, sp))
1594 goto restart;
1600 } 1595 }
1601 } 1596 }
1602} 1597}
@@ -1623,20 +1618,6 @@ static void mmu_convert_notrap(struct kvm_mmu_page *sp)
1623 } 1618 }
1624} 1619}
1625 1620
1626struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva)
1627{
1628 struct page *page;
1629
1630 gpa_t gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL);
1631
1632 if (gpa == UNMAPPED_GVA)
1633 return NULL;
1634
1635 page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
1636
1637 return page;
1638}
1639
1640/* 1621/*
1641 * The function is based on mtrr_type_lookup() in 1622 * The function is based on mtrr_type_lookup() in
1642 * arch/x86/kernel/cpu/mtrr/generic.c 1623 * arch/x86/kernel/cpu/mtrr/generic.c
@@ -1762,7 +1743,7 @@ static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
1762 ++vcpu->kvm->stat.mmu_unsync; 1743 ++vcpu->kvm->stat.mmu_unsync;
1763 sp->unsync = 1; 1744 sp->unsync = 1;
1764 1745
1765 kvm_mmu_mark_parents_unsync(vcpu, sp); 1746 kvm_mmu_mark_parents_unsync(sp);
1766 1747
1767 mmu_convert_notrap(sp); 1748 mmu_convert_notrap(sp);
1768 return 0; 1749 return 0;
@@ -2296,13 +2277,19 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level)
2296 /* no rsvd bits for 2 level 4K page table entries */ 2277 /* no rsvd bits for 2 level 4K page table entries */
2297 context->rsvd_bits_mask[0][1] = 0; 2278 context->rsvd_bits_mask[0][1] = 0;
2298 context->rsvd_bits_mask[0][0] = 0; 2279 context->rsvd_bits_mask[0][0] = 0;
2280 context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0];
2281
2282 if (!is_pse(vcpu)) {
2283 context->rsvd_bits_mask[1][1] = 0;
2284 break;
2285 }
2286
2299 if (is_cpuid_PSE36()) 2287 if (is_cpuid_PSE36())
2300 /* 36bits PSE 4MB page */ 2288 /* 36bits PSE 4MB page */
2301 context->rsvd_bits_mask[1][1] = rsvd_bits(17, 21); 2289 context->rsvd_bits_mask[1][1] = rsvd_bits(17, 21);
2302 else 2290 else
2303 /* 32 bits PSE 4MB page */ 2291 /* 32 bits PSE 4MB page */
2304 context->rsvd_bits_mask[1][1] = rsvd_bits(13, 21); 2292 context->rsvd_bits_mask[1][1] = rsvd_bits(13, 21);
2305 context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[1][0];
2306 break; 2293 break;
2307 case PT32E_ROOT_LEVEL: 2294 case PT32E_ROOT_LEVEL:
2308 context->rsvd_bits_mask[0][2] = 2295 context->rsvd_bits_mask[0][2] =
@@ -2315,7 +2302,7 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level)
2315 context->rsvd_bits_mask[1][1] = exb_bit_rsvd | 2302 context->rsvd_bits_mask[1][1] = exb_bit_rsvd |
2316 rsvd_bits(maxphyaddr, 62) | 2303 rsvd_bits(maxphyaddr, 62) |
2317 rsvd_bits(13, 20); /* large page */ 2304 rsvd_bits(13, 20); /* large page */
2318 context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[1][0]; 2305 context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0];
2319 break; 2306 break;
2320 case PT64_ROOT_LEVEL: 2307 case PT64_ROOT_LEVEL:
2321 context->rsvd_bits_mask[0][3] = exb_bit_rsvd | 2308 context->rsvd_bits_mask[0][3] = exb_bit_rsvd |
@@ -2333,7 +2320,7 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level)
2333 context->rsvd_bits_mask[1][1] = exb_bit_rsvd | 2320 context->rsvd_bits_mask[1][1] = exb_bit_rsvd |
2334 rsvd_bits(maxphyaddr, 51) | 2321 rsvd_bits(maxphyaddr, 51) |
2335 rsvd_bits(13, 20); /* large page */ 2322 rsvd_bits(13, 20); /* large page */
2336 context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[1][0]; 2323 context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0];
2337 break; 2324 break;
2338 } 2325 }
2339} 2326}
@@ -2435,7 +2422,7 @@ static int init_kvm_softmmu(struct kvm_vcpu *vcpu)
2435 else 2422 else
2436 r = paging32_init_context(vcpu); 2423 r = paging32_init_context(vcpu);
2437 2424
2438 vcpu->arch.mmu.base_role.glevels = vcpu->arch.mmu.root_level; 2425 vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu);
2439 2426
2440 return r; 2427 return r;
2441} 2428}
@@ -2524,7 +2511,7 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
2524 } 2511 }
2525 2512
2526 ++vcpu->kvm->stat.mmu_pte_updated; 2513 ++vcpu->kvm->stat.mmu_pte_updated;
2527 if (sp->role.glevels == PT32_ROOT_LEVEL) 2514 if (!sp->role.cr4_pae)
2528 paging32_update_pte(vcpu, sp, spte, new); 2515 paging32_update_pte(vcpu, sp, spte, new);
2529 else 2516 else
2530 paging64_update_pte(vcpu, sp, spte, new); 2517 paging64_update_pte(vcpu, sp, spte, new);
@@ -2559,36 +2546,11 @@ static bool last_updated_pte_accessed(struct kvm_vcpu *vcpu)
2559} 2546}
2560 2547
2561static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, 2548static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
2562 const u8 *new, int bytes) 2549 u64 gpte)
2563{ 2550{
2564 gfn_t gfn; 2551 gfn_t gfn;
2565 int r;
2566 u64 gpte = 0;
2567 pfn_t pfn; 2552 pfn_t pfn;
2568 2553
2569 if (bytes != 4 && bytes != 8)
2570 return;
2571
2572 /*
2573 * Assume that the pte write on a page table of the same type
2574 * as the current vcpu paging mode. This is nearly always true
2575 * (might be false while changing modes). Note it is verified later
2576 * by update_pte().
2577 */
2578 if (is_pae(vcpu)) {
2579 /* Handle a 32-bit guest writing two halves of a 64-bit gpte */
2580 if ((bytes == 4) && (gpa % 4 == 0)) {
2581 r = kvm_read_guest(vcpu->kvm, gpa & ~(u64)7, &gpte, 8);
2582 if (r)
2583 return;
2584 memcpy((void *)&gpte + (gpa % 8), new, 4);
2585 } else if ((bytes == 8) && (gpa % 8 == 0)) {
2586 memcpy((void *)&gpte, new, 8);
2587 }
2588 } else {
2589 if ((bytes == 4) && (gpa % 4 == 0))
2590 memcpy((void *)&gpte, new, 4);
2591 }
2592 if (!is_present_gpte(gpte)) 2554 if (!is_present_gpte(gpte))
2593 return; 2555 return;
2594 gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; 2556 gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
@@ -2637,10 +2599,46 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
2637 int flooded = 0; 2599 int flooded = 0;
2638 int npte; 2600 int npte;
2639 int r; 2601 int r;
2602 int invlpg_counter;
2640 2603
2641 pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes); 2604 pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);
2642 mmu_guess_page_from_pte_write(vcpu, gpa, new, bytes); 2605
2606 invlpg_counter = atomic_read(&vcpu->kvm->arch.invlpg_counter);
2607
2608 /*
2609 * Assume that the pte write on a page table of the same type
2610 * as the current vcpu paging mode. This is nearly always true
2611 * (might be false while changing modes). Note it is verified later
2612 * by update_pte().
2613 */
2614 if ((is_pae(vcpu) && bytes == 4) || !new) {
2615 /* Handle a 32-bit guest writing two halves of a 64-bit gpte */
2616 if (is_pae(vcpu)) {
2617 gpa &= ~(gpa_t)7;
2618 bytes = 8;
2619 }
2620 r = kvm_read_guest(vcpu->kvm, gpa, &gentry, min(bytes, 8));
2621 if (r)
2622 gentry = 0;
2623 new = (const u8 *)&gentry;
2624 }
2625
2626 switch (bytes) {
2627 case 4:
2628 gentry = *(const u32 *)new;
2629 break;
2630 case 8:
2631 gentry = *(const u64 *)new;
2632 break;
2633 default:
2634 gentry = 0;
2635 break;
2636 }
2637
2638 mmu_guess_page_from_pte_write(vcpu, gpa, gentry);
2643 spin_lock(&vcpu->kvm->mmu_lock); 2639 spin_lock(&vcpu->kvm->mmu_lock);
2640 if (atomic_read(&vcpu->kvm->arch.invlpg_counter) != invlpg_counter)
2641 gentry = 0;
2644 kvm_mmu_access_page(vcpu, gfn); 2642 kvm_mmu_access_page(vcpu, gfn);
2645 kvm_mmu_free_some_pages(vcpu); 2643 kvm_mmu_free_some_pages(vcpu);
2646 ++vcpu->kvm->stat.mmu_pte_write; 2644 ++vcpu->kvm->stat.mmu_pte_write;
@@ -2659,10 +2657,12 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
2659 } 2657 }
2660 index = kvm_page_table_hashfn(gfn); 2658 index = kvm_page_table_hashfn(gfn);
2661 bucket = &vcpu->kvm->arch.mmu_page_hash[index]; 2659 bucket = &vcpu->kvm->arch.mmu_page_hash[index];
2660
2661restart:
2662 hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) { 2662 hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) {
2663 if (sp->gfn != gfn || sp->role.direct || sp->role.invalid) 2663 if (sp->gfn != gfn || sp->role.direct || sp->role.invalid)
2664 continue; 2664 continue;
2665 pte_size = sp->role.glevels == PT32_ROOT_LEVEL ? 4 : 8; 2665 pte_size = sp->role.cr4_pae ? 8 : 4;
2666 misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1); 2666 misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
2667 misaligned |= bytes < 4; 2667 misaligned |= bytes < 4;
2668 if (misaligned || flooded) { 2668 if (misaligned || flooded) {
@@ -2679,14 +2679,14 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
2679 pgprintk("misaligned: gpa %llx bytes %d role %x\n", 2679 pgprintk("misaligned: gpa %llx bytes %d role %x\n",
2680 gpa, bytes, sp->role.word); 2680 gpa, bytes, sp->role.word);
2681 if (kvm_mmu_zap_page(vcpu->kvm, sp)) 2681 if (kvm_mmu_zap_page(vcpu->kvm, sp))
2682 n = bucket->first; 2682 goto restart;
2683 ++vcpu->kvm->stat.mmu_flooded; 2683 ++vcpu->kvm->stat.mmu_flooded;
2684 continue; 2684 continue;
2685 } 2685 }
2686 page_offset = offset; 2686 page_offset = offset;
2687 level = sp->role.level; 2687 level = sp->role.level;
2688 npte = 1; 2688 npte = 1;
2689 if (sp->role.glevels == PT32_ROOT_LEVEL) { 2689 if (!sp->role.cr4_pae) {
2690 page_offset <<= 1; /* 32->64 */ 2690 page_offset <<= 1; /* 32->64 */
2691 /* 2691 /*
2692 * A 32-bit pde maps 4MB while the shadow pdes map 2692 * A 32-bit pde maps 4MB while the shadow pdes map
@@ -2704,20 +2704,11 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
2704 continue; 2704 continue;
2705 } 2705 }
2706 spte = &sp->spt[page_offset / sizeof(*spte)]; 2706 spte = &sp->spt[page_offset / sizeof(*spte)];
2707 if ((gpa & (pte_size - 1)) || (bytes < pte_size)) {
2708 gentry = 0;
2709 r = kvm_read_guest_atomic(vcpu->kvm,
2710 gpa & ~(u64)(pte_size - 1),
2711 &gentry, pte_size);
2712 new = (const void *)&gentry;
2713 if (r < 0)
2714 new = NULL;
2715 }
2716 while (npte--) { 2707 while (npte--) {
2717 entry = *spte; 2708 entry = *spte;
2718 mmu_pte_write_zap_pte(vcpu, sp, spte); 2709 mmu_pte_write_zap_pte(vcpu, sp, spte);
2719 if (new) 2710 if (gentry)
2720 mmu_pte_write_new_pte(vcpu, sp, spte, new); 2711 mmu_pte_write_new_pte(vcpu, sp, spte, &gentry);
2721 mmu_pte_write_flush_tlb(vcpu, entry, *spte); 2712 mmu_pte_write_flush_tlb(vcpu, entry, *spte);
2722 ++spte; 2713 ++spte;
2723 } 2714 }
@@ -2897,10 +2888,11 @@ void kvm_mmu_zap_all(struct kvm *kvm)
2897 struct kvm_mmu_page *sp, *node; 2888 struct kvm_mmu_page *sp, *node;
2898 2889
2899 spin_lock(&kvm->mmu_lock); 2890 spin_lock(&kvm->mmu_lock);
2891restart:
2900 list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) 2892 list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link)
2901 if (kvm_mmu_zap_page(kvm, sp)) 2893 if (kvm_mmu_zap_page(kvm, sp))
2902 node = container_of(kvm->arch.active_mmu_pages.next, 2894 goto restart;
2903 struct kvm_mmu_page, link); 2895
2904 spin_unlock(&kvm->mmu_lock); 2896 spin_unlock(&kvm->mmu_lock);
2905 2897
2906 kvm_flush_remote_tlbs(kvm); 2898 kvm_flush_remote_tlbs(kvm);
@@ -3008,7 +3000,8 @@ unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm)
3008 unsigned int nr_pages = 0; 3000 unsigned int nr_pages = 0;
3009 struct kvm_memslots *slots; 3001 struct kvm_memslots *slots;
3010 3002
3011 slots = rcu_dereference(kvm->memslots); 3003 slots = kvm_memslots(kvm);
3004
3012 for (i = 0; i < slots->nmemslots; i++) 3005 for (i = 0; i < slots->nmemslots; i++)
3013 nr_pages += slots->memslots[i].npages; 3006 nr_pages += slots->memslots[i].npages;
3014 3007
@@ -3171,8 +3164,7 @@ static gva_t canonicalize(gva_t gva)
3171} 3164}
3172 3165
3173 3166
3174typedef void (*inspect_spte_fn) (struct kvm *kvm, struct kvm_mmu_page *sp, 3167typedef void (*inspect_spte_fn) (struct kvm *kvm, u64 *sptep);
3175 u64 *sptep);
3176 3168
3177static void __mmu_spte_walk(struct kvm *kvm, struct kvm_mmu_page *sp, 3169static void __mmu_spte_walk(struct kvm *kvm, struct kvm_mmu_page *sp,
3178 inspect_spte_fn fn) 3170 inspect_spte_fn fn)
@@ -3188,7 +3180,7 @@ static void __mmu_spte_walk(struct kvm *kvm, struct kvm_mmu_page *sp,
3188 child = page_header(ent & PT64_BASE_ADDR_MASK); 3180 child = page_header(ent & PT64_BASE_ADDR_MASK);
3189 __mmu_spte_walk(kvm, child, fn); 3181 __mmu_spte_walk(kvm, child, fn);
3190 } else 3182 } else
3191 fn(kvm, sp, &sp->spt[i]); 3183 fn(kvm, &sp->spt[i]);
3192 } 3184 }
3193 } 3185 }
3194} 3186}
@@ -3279,11 +3271,13 @@ static void audit_mappings(struct kvm_vcpu *vcpu)
3279 3271
3280static int count_rmaps(struct kvm_vcpu *vcpu) 3272static int count_rmaps(struct kvm_vcpu *vcpu)
3281{ 3273{
3274 struct kvm *kvm = vcpu->kvm;
3275 struct kvm_memslots *slots;
3282 int nmaps = 0; 3276 int nmaps = 0;
3283 int i, j, k, idx; 3277 int i, j, k, idx;
3284 3278
3285 idx = srcu_read_lock(&kvm->srcu); 3279 idx = srcu_read_lock(&kvm->srcu);
3286 slots = rcu_dereference(kvm->memslots); 3280 slots = kvm_memslots(kvm);
3287 for (i = 0; i < KVM_MEMORY_SLOTS; ++i) { 3281 for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
3288 struct kvm_memory_slot *m = &slots->memslots[i]; 3282 struct kvm_memory_slot *m = &slots->memslots[i];
3289 struct kvm_rmap_desc *d; 3283 struct kvm_rmap_desc *d;
@@ -3312,7 +3306,7 @@ static int count_rmaps(struct kvm_vcpu *vcpu)
3312 return nmaps; 3306 return nmaps;
3313} 3307}
3314 3308
3315void inspect_spte_has_rmap(struct kvm *kvm, struct kvm_mmu_page *sp, u64 *sptep) 3309void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
3316{ 3310{
3317 unsigned long *rmapp; 3311 unsigned long *rmapp;
3318 struct kvm_mmu_page *rev_sp; 3312 struct kvm_mmu_page *rev_sp;
@@ -3328,14 +3322,14 @@ void inspect_spte_has_rmap(struct kvm *kvm, struct kvm_mmu_page *sp, u64 *sptep)
3328 printk(KERN_ERR "%s: no memslot for gfn %ld\n", 3322 printk(KERN_ERR "%s: no memslot for gfn %ld\n",
3329 audit_msg, gfn); 3323 audit_msg, gfn);
3330 printk(KERN_ERR "%s: index %ld of sp (gfn=%lx)\n", 3324 printk(KERN_ERR "%s: index %ld of sp (gfn=%lx)\n",
3331 audit_msg, sptep - rev_sp->spt, 3325 audit_msg, (long int)(sptep - rev_sp->spt),
3332 rev_sp->gfn); 3326 rev_sp->gfn);
3333 dump_stack(); 3327 dump_stack();
3334 return; 3328 return;
3335 } 3329 }
3336 3330
3337 rmapp = gfn_to_rmap(kvm, rev_sp->gfns[sptep - rev_sp->spt], 3331 rmapp = gfn_to_rmap(kvm, rev_sp->gfns[sptep - rev_sp->spt],
3338 is_large_pte(*sptep)); 3332 rev_sp->role.level);
3339 if (!*rmapp) { 3333 if (!*rmapp) {
3340 if (!printk_ratelimit()) 3334 if (!printk_ratelimit())
3341 return; 3335 return;
@@ -3370,7 +3364,7 @@ static void check_writable_mappings_rmap(struct kvm_vcpu *vcpu)
3370 continue; 3364 continue;
3371 if (!(ent & PT_WRITABLE_MASK)) 3365 if (!(ent & PT_WRITABLE_MASK))
3372 continue; 3366 continue;
3373 inspect_spte_has_rmap(vcpu->kvm, sp, &pt[i]); 3367 inspect_spte_has_rmap(vcpu->kvm, &pt[i]);
3374 } 3368 }
3375 } 3369 }
3376 return; 3370 return;
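
Several bucket and list walks in mmu.c above are rewritten around a restart: label because kvm_mmu_zap_page() may remove pages other than the one passed to it, which can invalidate the next pointer cached by the _safe iterators. The sketch below shows the same idea on a plain singly linked list; zap_node() is a hypothetical stand-in whose deletions may cascade, just like the zap above:

#include <stdlib.h>

struct node { struct node *next; int key; };

/*
 * Stand-in for kvm_mmu_zap_page(): frees 'victim' and every other node
 * with the same key; returns nonzero if the deletion cascaded beyond
 * 'victim'.
 */
static int zap_node(struct node **head, struct node *victim)
{
        struct node **pp = head;
        int key = victim->key, cascaded = 0;

        while (*pp) {
                struct node *n = *pp;

                if (n->key == key) {
                        cascaded += (n != victim);
                        *pp = n->next;
                        free(n);
                        continue;
                }
                pp = &n->next;
        }
        return cascaded;
}

static void zap_all_with_key(struct node **head, int key)
{
        struct node *n, *next;

restart:
        for (n = *head; n; n = next) {
                next = n->next;         /* "safe" walk: cache the next node */
                if (n->key != key)
                        continue;
                if (zap_node(head, n))  /* deletion cascaded, so the cached */
                        goto restart;   /* 'next' may already be stale      */
        }
}
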
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
index 3e4a5c6ca2a9..bc4f7f0be2b1 100644
--- a/arch/x86/kvm/mmutrace.h
+++ b/arch/x86/kvm/mmutrace.h
@@ -6,8 +6,6 @@
6 6
7#undef TRACE_SYSTEM 7#undef TRACE_SYSTEM
8#define TRACE_SYSTEM kvmmmu 8#define TRACE_SYSTEM kvmmmu
9#define TRACE_INCLUDE_PATH .
10#define TRACE_INCLUDE_FILE mmutrace
11 9
12#define KVM_MMU_PAGE_FIELDS \ 10#define KVM_MMU_PAGE_FIELDS \
13 __field(__u64, gfn) \ 11 __field(__u64, gfn) \
@@ -30,14 +28,14 @@
30 \ 28 \
31 role.word = __entry->role; \ 29 role.word = __entry->role; \
32 \ 30 \
33 trace_seq_printf(p, "sp gfn %llx %u/%u q%u%s %s%s %spge" \ 31 trace_seq_printf(p, "sp gfn %llx %u%s q%u%s %s%s" \
34 " %snxe root %u %s%c", \ 32 " %snxe root %u %s%c", \
35 __entry->gfn, role.level, role.glevels, \ 33 __entry->gfn, role.level, \
34 role.cr4_pae ? " pae" : "", \
36 role.quadrant, \ 35 role.quadrant, \
37 role.direct ? " direct" : "", \ 36 role.direct ? " direct" : "", \
38 access_str[role.access], \ 37 access_str[role.access], \
39 role.invalid ? " invalid" : "", \ 38 role.invalid ? " invalid" : "", \
40 role.cr4_pge ? "" : "!", \
41 role.nxe ? "" : "!", \ 39 role.nxe ? "" : "!", \
42 __entry->root_count, \ 40 __entry->root_count, \
43 __entry->unsync ? "unsync" : "sync", 0); \ 41 __entry->unsync ? "unsync" : "sync", 0); \
@@ -216,5 +214,10 @@ TRACE_EVENT(
216 214
217#endif /* _TRACE_KVMMMU_H */ 215#endif /* _TRACE_KVMMMU_H */
218 216
217#undef TRACE_INCLUDE_PATH
218#define TRACE_INCLUDE_PATH .
219#undef TRACE_INCLUDE_FILE
220#define TRACE_INCLUDE_FILE mmutrace
221
219/* This part must be outside protection */ 222/* This part must be outside protection */
220#include <trace/define_trace.h> 223#include <trace/define_trace.h>
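
The mmutrace.h change moves TRACE_INCLUDE_PATH and TRACE_INCLUDE_FILE from the top of the header down to the tail, next to the include of <trace/define_trace.h> that consumes them; keeping them in the unprotected tail also lets mmu.c drop its #undef TRACE_INCLUDE_FILE workaround, as seen in the mmu.c hunk above. Roughly, the tail of such a tracepoint header then follows the usual convention (sketch of the layout, not a new file):

#endif /* _TRACE_KVMMMU_H */

#undef TRACE_INCLUDE_PATH
#define TRACE_INCLUDE_PATH .
#undef TRACE_INCLUDE_FILE
#define TRACE_INCLUDE_FILE mmutrace

/* This part must be outside protection */
#include <trace/define_trace.h>
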
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 81eab9a50e6a..d0cc07eb6eda 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -170,7 +170,7 @@ walk:
170 goto access_error; 170 goto access_error;
171 171
172#if PTTYPE == 64 172#if PTTYPE == 64
173 if (fetch_fault && is_nx(vcpu) && (pte & PT64_NX_MASK)) 173 if (fetch_fault && (pte & PT64_NX_MASK))
174 goto access_error; 174 goto access_error;
175#endif 175#endif
176 176
@@ -190,10 +190,10 @@ walk:
190 190
191 if ((walker->level == PT_PAGE_TABLE_LEVEL) || 191 if ((walker->level == PT_PAGE_TABLE_LEVEL) ||
192 ((walker->level == PT_DIRECTORY_LEVEL) && 192 ((walker->level == PT_DIRECTORY_LEVEL) &&
193 (pte & PT_PAGE_SIZE_MASK) && 193 is_large_pte(pte) &&
194 (PTTYPE == 64 || is_pse(vcpu))) || 194 (PTTYPE == 64 || is_pse(vcpu))) ||
195 ((walker->level == PT_PDPE_LEVEL) && 195 ((walker->level == PT_PDPE_LEVEL) &&
196 (pte & PT_PAGE_SIZE_MASK) && 196 is_large_pte(pte) &&
197 is_long_mode(vcpu))) { 197 is_long_mode(vcpu))) {
198 int lvl = walker->level; 198 int lvl = walker->level;
199 199
@@ -258,11 +258,17 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
258 pt_element_t gpte; 258 pt_element_t gpte;
259 unsigned pte_access; 259 unsigned pte_access;
260 pfn_t pfn; 260 pfn_t pfn;
261 u64 new_spte;
261 262
262 gpte = *(const pt_element_t *)pte; 263 gpte = *(const pt_element_t *)pte;
263 if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) { 264 if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) {
264 if (!is_present_gpte(gpte)) 265 if (!is_present_gpte(gpte)) {
265 __set_spte(spte, shadow_notrap_nonpresent_pte); 266 if (page->unsync)
267 new_spte = shadow_trap_nonpresent_pte;
268 else
269 new_spte = shadow_notrap_nonpresent_pte;
270 __set_spte(spte, new_spte);
271 }
266 return; 272 return;
267 } 273 }
268 pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte); 274 pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);
@@ -457,6 +463,7 @@ out_unlock:
457static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) 463static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
458{ 464{
459 struct kvm_shadow_walk_iterator iterator; 465 struct kvm_shadow_walk_iterator iterator;
466 gpa_t pte_gpa = -1;
460 int level; 467 int level;
461 u64 *sptep; 468 u64 *sptep;
462 int need_flush = 0; 469 int need_flush = 0;
@@ -470,6 +477,10 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
470 if (level == PT_PAGE_TABLE_LEVEL || 477 if (level == PT_PAGE_TABLE_LEVEL ||
471 ((level == PT_DIRECTORY_LEVEL && is_large_pte(*sptep))) || 478 ((level == PT_DIRECTORY_LEVEL && is_large_pte(*sptep))) ||
472 ((level == PT_PDPE_LEVEL && is_large_pte(*sptep)))) { 479 ((level == PT_PDPE_LEVEL && is_large_pte(*sptep)))) {
480 struct kvm_mmu_page *sp = page_header(__pa(sptep));
481
482 pte_gpa = (sp->gfn << PAGE_SHIFT);
483 pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t);
473 484
474 if (is_shadow_present_pte(*sptep)) { 485 if (is_shadow_present_pte(*sptep)) {
475 rmap_remove(vcpu->kvm, sptep); 486 rmap_remove(vcpu->kvm, sptep);
@@ -487,7 +498,17 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
487 498
488 if (need_flush) 499 if (need_flush)
489 kvm_flush_remote_tlbs(vcpu->kvm); 500 kvm_flush_remote_tlbs(vcpu->kvm);
501
502 atomic_inc(&vcpu->kvm->arch.invlpg_counter);
503
490 spin_unlock(&vcpu->kvm->mmu_lock); 504 spin_unlock(&vcpu->kvm->mmu_lock);
505
506 if (pte_gpa == -1)
507 return;
508
509 if (mmu_topup_memory_caches(vcpu))
510 return;
511 kvm_mmu_pte_write(vcpu, pte_gpa, NULL, sizeof(pt_element_t), 0);
491} 512}
492 513
493static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access, 514static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access,
@@ -551,12 +572,15 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
551{ 572{
552 int i, offset, nr_present; 573 int i, offset, nr_present;
553 bool reset_host_protection; 574 bool reset_host_protection;
575 gpa_t first_pte_gpa;
554 576
555 offset = nr_present = 0; 577 offset = nr_present = 0;
556 578
557 if (PTTYPE == 32) 579 if (PTTYPE == 32)
558 offset = sp->role.quadrant << PT64_LEVEL_BITS; 580 offset = sp->role.quadrant << PT64_LEVEL_BITS;
559 581
582 first_pte_gpa = gfn_to_gpa(sp->gfn) + offset * sizeof(pt_element_t);
583
560 for (i = 0; i < PT64_ENT_PER_PAGE; i++) { 584 for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
561 unsigned pte_access; 585 unsigned pte_access;
562 pt_element_t gpte; 586 pt_element_t gpte;
@@ -566,8 +590,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
566 if (!is_shadow_present_pte(sp->spt[i])) 590 if (!is_shadow_present_pte(sp->spt[i]))
567 continue; 591 continue;
568 592
569 pte_gpa = gfn_to_gpa(sp->gfn); 593 pte_gpa = first_pte_gpa + i * sizeof(pt_element_t);
570 pte_gpa += (i+offset) * sizeof(pt_element_t);
571 594
572 if (kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &gpte, 595 if (kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &gpte,
573 sizeof(pt_element_t))) 596 sizeof(pt_element_t)))
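
The invlpg path above now computes the guest PTE address of the entry being flushed, bumps kvm->arch.invlpg_counter while mmu_lock is still held, and calls kvm_mmu_pte_write() only after dropping the lock; the write path in mmu.c samples that counter before its unlocked guest read and discards the prefetched entry if the counter has moved by the time it retakes mmu_lock. A compact sketch of that handshake with simplified locking and stand-in read/apply helpers (all names here are illustrative):

#include <pthread.h>
#include <stdatomic.h>
#include <stdint.h>

static pthread_mutex_t mmu_lock = PTHREAD_MUTEX_INITIALIZER;
static atomic_uint invlpg_counter;

static uint64_t guest_pte;      /* stand-in for guest page-table memory */
static uint64_t shadow_pte;     /* stand-in for the shadow entry        */

static uint64_t read_pte_unlocked(void) { return guest_pte; }
static void apply_pte(uint64_t gentry) { shadow_pte = gentry; }

/* invlpg side: the counter is bumped while mmu_lock is still held. */
static void note_invlpg(void)
{
        pthread_mutex_lock(&mmu_lock);
        /* ... drop the shadow entry for the flushed address ... */
        atomic_fetch_add(&invlpg_counter, 1);
        pthread_mutex_unlock(&mmu_lock);
}

/* pte-write side: unlocked prefetch, validated against the counter. */
static void pte_write(void)
{
        unsigned int seen = atomic_load(&invlpg_counter);
        uint64_t gentry = read_pte_unlocked();  /* no lock held here */

        pthread_mutex_lock(&mmu_lock);
        if (atomic_load(&invlpg_counter) != seen)
                gentry = 0;                     /* raced with invlpg: discard */
        if (gentry)
                apply_pte(gentry);
        pthread_mutex_unlock(&mmu_lock);
}
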
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 445c59411ed0..ab78eb8ba899 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -44,10 +44,11 @@ MODULE_LICENSE("GPL");
44#define SEG_TYPE_LDT 2 44#define SEG_TYPE_LDT 2
45#define SEG_TYPE_BUSY_TSS16 3 45#define SEG_TYPE_BUSY_TSS16 3
46 46
47#define SVM_FEATURE_NPT (1 << 0) 47#define SVM_FEATURE_NPT (1 << 0)
48#define SVM_FEATURE_LBRV (1 << 1) 48#define SVM_FEATURE_LBRV (1 << 1)
49#define SVM_FEATURE_SVML (1 << 2) 49#define SVM_FEATURE_SVML (1 << 2)
50#define SVM_FEATURE_PAUSE_FILTER (1 << 10) 50#define SVM_FEATURE_NRIP (1 << 3)
51#define SVM_FEATURE_PAUSE_FILTER (1 << 10)
51 52
52#define NESTED_EXIT_HOST 0 /* Exit handled on host level */ 53#define NESTED_EXIT_HOST 0 /* Exit handled on host level */
53#define NESTED_EXIT_DONE 1 /* Exit caused nested vmexit */ 54#define NESTED_EXIT_DONE 1 /* Exit caused nested vmexit */
@@ -70,6 +71,7 @@ struct kvm_vcpu;
70struct nested_state { 71struct nested_state {
71 struct vmcb *hsave; 72 struct vmcb *hsave;
72 u64 hsave_msr; 73 u64 hsave_msr;
74 u64 vm_cr_msr;
73 u64 vmcb; 75 u64 vmcb;
74 76
75 /* These are the merged vectors */ 77 /* These are the merged vectors */
@@ -77,6 +79,7 @@ struct nested_state {
77 79
78 /* gpa pointers to the real vectors */ 80 /* gpa pointers to the real vectors */
79 u64 vmcb_msrpm; 81 u64 vmcb_msrpm;
82 u64 vmcb_iopm;
80 83
81 /* A VMEXIT is required but not yet emulated */ 84 /* A VMEXIT is required but not yet emulated */
82 bool exit_required; 85 bool exit_required;
@@ -91,6 +94,9 @@ struct nested_state {
91 94
92}; 95};
93 96
97#define MSRPM_OFFSETS 16
98static u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
99
94struct vcpu_svm { 100struct vcpu_svm {
95 struct kvm_vcpu vcpu; 101 struct kvm_vcpu vcpu;
96 struct vmcb *vmcb; 102 struct vmcb *vmcb;
@@ -110,13 +116,39 @@ struct vcpu_svm {
110 struct nested_state nested; 116 struct nested_state nested;
111 117
112 bool nmi_singlestep; 118 bool nmi_singlestep;
119
120 unsigned int3_injected;
121 unsigned long int3_rip;
122};
123
124#define MSR_INVALID 0xffffffffU
125
126static struct svm_direct_access_msrs {
127 u32 index; /* Index of the MSR */
128 bool always; /* True if intercept is always on */
129} direct_access_msrs[] = {
130 { .index = MSR_K6_STAR, .always = true },
131 { .index = MSR_IA32_SYSENTER_CS, .always = true },
132#ifdef CONFIG_X86_64
133 { .index = MSR_GS_BASE, .always = true },
134 { .index = MSR_FS_BASE, .always = true },
135 { .index = MSR_KERNEL_GS_BASE, .always = true },
136 { .index = MSR_LSTAR, .always = true },
137 { .index = MSR_CSTAR, .always = true },
138 { .index = MSR_SYSCALL_MASK, .always = true },
139#endif
140 { .index = MSR_IA32_LASTBRANCHFROMIP, .always = false },
141 { .index = MSR_IA32_LASTBRANCHTOIP, .always = false },
142 { .index = MSR_IA32_LASTINTFROMIP, .always = false },
143 { .index = MSR_IA32_LASTINTTOIP, .always = false },
144 { .index = MSR_INVALID, .always = false },
113}; 145};
114 146
115/* enable NPT for AMD64 and X86 with PAE */ 147/* enable NPT for AMD64 and X86 with PAE */
116#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) 148#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
117static bool npt_enabled = true; 149static bool npt_enabled = true;
118#else 150#else
119static bool npt_enabled = false; 151static bool npt_enabled;
120#endif 152#endif
121static int npt = 1; 153static int npt = 1;
122 154
@@ -129,6 +161,7 @@ static void svm_flush_tlb(struct kvm_vcpu *vcpu);
129static void svm_complete_interrupts(struct vcpu_svm *svm); 161static void svm_complete_interrupts(struct vcpu_svm *svm);
130 162
131static int nested_svm_exit_handled(struct vcpu_svm *svm); 163static int nested_svm_exit_handled(struct vcpu_svm *svm);
164static int nested_svm_intercept(struct vcpu_svm *svm);
132static int nested_svm_vmexit(struct vcpu_svm *svm); 165static int nested_svm_vmexit(struct vcpu_svm *svm);
133static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, 166static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
134 bool has_error_code, u32 error_code); 167 bool has_error_code, u32 error_code);
@@ -163,8 +196,8 @@ static unsigned long iopm_base;
163struct kvm_ldttss_desc { 196struct kvm_ldttss_desc {
164 u16 limit0; 197 u16 limit0;
165 u16 base0; 198 u16 base0;
166 unsigned base1 : 8, type : 5, dpl : 2, p : 1; 199 unsigned base1:8, type:5, dpl:2, p:1;
167 unsigned limit1 : 4, zero0 : 3, g : 1, base2 : 8; 200 unsigned limit1:4, zero0:3, g:1, base2:8;
168 u32 base3; 201 u32 base3;
169 u32 zero1; 202 u32 zero1;
170} __attribute__((packed)); 203} __attribute__((packed));
@@ -194,6 +227,27 @@ static u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000};
194#define MSRS_RANGE_SIZE 2048 227#define MSRS_RANGE_SIZE 2048
195#define MSRS_IN_RANGE (MSRS_RANGE_SIZE * 8 / 2) 228#define MSRS_IN_RANGE (MSRS_RANGE_SIZE * 8 / 2)
196 229
230static u32 svm_msrpm_offset(u32 msr)
231{
232 u32 offset;
233 int i;
234
235 for (i = 0; i < NUM_MSR_MAPS; i++) {
236 if (msr < msrpm_ranges[i] ||
237 msr >= msrpm_ranges[i] + MSRS_IN_RANGE)
238 continue;
239
240 offset = (msr - msrpm_ranges[i]) / 4; /* 4 msrs per u8 */
241 offset += (i * MSRS_RANGE_SIZE); /* add range offset */
242
243 /* Now we have the u8 offset - but need the u32 offset */
244 return offset / 4;
245 }
246
247 /* MSR not in any range */
248 return MSR_INVALID;
249}
250
197#define MAX_INST_SIZE 15 251#define MAX_INST_SIZE 15
198 252
199static inline u32 svm_has(u32 feat) 253static inline u32 svm_has(u32 feat)
@@ -213,7 +267,7 @@ static inline void stgi(void)
213 267
214static inline void invlpga(unsigned long addr, u32 asid) 268static inline void invlpga(unsigned long addr, u32 asid)
215{ 269{
216 asm volatile (__ex(SVM_INVLPGA) :: "a"(addr), "c"(asid)); 270 asm volatile (__ex(SVM_INVLPGA) : : "a"(addr), "c"(asid));
217} 271}
218 272
219static inline void force_new_asid(struct kvm_vcpu *vcpu) 273static inline void force_new_asid(struct kvm_vcpu *vcpu)
@@ -235,23 +289,6 @@ static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
235 vcpu->arch.efer = efer; 289 vcpu->arch.efer = efer;
236} 290}
237 291
238static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
239 bool has_error_code, u32 error_code)
240{
241 struct vcpu_svm *svm = to_svm(vcpu);
242
243 /* If we are within a nested VM we'd better #VMEXIT and let the
244 guest handle the exception */
245 if (nested_svm_check_exception(svm, nr, has_error_code, error_code))
246 return;
247
248 svm->vmcb->control.event_inj = nr
249 | SVM_EVTINJ_VALID
250 | (has_error_code ? SVM_EVTINJ_VALID_ERR : 0)
251 | SVM_EVTINJ_TYPE_EXEPT;
252 svm->vmcb->control.event_inj_err = error_code;
253}
254
255static int is_external_interrupt(u32 info) 292static int is_external_interrupt(u32 info)
256{ 293{
257 info &= SVM_EVTINJ_TYPE_MASK | SVM_EVTINJ_VALID; 294 info &= SVM_EVTINJ_TYPE_MASK | SVM_EVTINJ_VALID;
@@ -264,7 +301,7 @@ static u32 svm_get_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
264 u32 ret = 0; 301 u32 ret = 0;
265 302
266 if (svm->vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) 303 if (svm->vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK)
267 ret |= X86_SHADOW_INT_STI | X86_SHADOW_INT_MOV_SS; 304 ret |= KVM_X86_SHADOW_INT_STI | KVM_X86_SHADOW_INT_MOV_SS;
268 return ret & mask; 305 return ret & mask;
269} 306}
270 307
@@ -283,6 +320,9 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
283{ 320{
284 struct vcpu_svm *svm = to_svm(vcpu); 321 struct vcpu_svm *svm = to_svm(vcpu);
285 322
323 if (svm->vmcb->control.next_rip != 0)
324 svm->next_rip = svm->vmcb->control.next_rip;
325
286 if (!svm->next_rip) { 326 if (!svm->next_rip) {
287 if (emulate_instruction(vcpu, 0, 0, EMULTYPE_SKIP) != 327 if (emulate_instruction(vcpu, 0, 0, EMULTYPE_SKIP) !=
288 EMULATE_DONE) 328 EMULATE_DONE)
@@ -297,6 +337,41 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
297 svm_set_interrupt_shadow(vcpu, 0); 337 svm_set_interrupt_shadow(vcpu, 0);
298} 338}
299 339
340static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
341 bool has_error_code, u32 error_code)
342{
343 struct vcpu_svm *svm = to_svm(vcpu);
344
345 /*
346 * If we are within a nested VM we'd better #VMEXIT and let the guest
347 * handle the exception
348 */
349 if (nested_svm_check_exception(svm, nr, has_error_code, error_code))
350 return;
351
352 if (nr == BP_VECTOR && !svm_has(SVM_FEATURE_NRIP)) {
353 unsigned long rip, old_rip = kvm_rip_read(&svm->vcpu);
354
355 /*
356 * For guest debugging where we have to reinject #BP if some
357 * INT3 is guest-owned:
358 * Emulate nRIP by moving RIP forward. Will fail if injection
359 * raises a fault that is not intercepted. Still better than
360 * failing in all cases.
361 */
362 skip_emulated_instruction(&svm->vcpu);
363 rip = kvm_rip_read(&svm->vcpu);
364 svm->int3_rip = rip + svm->vmcb->save.cs.base;
365 svm->int3_injected = rip - old_rip;
366 }
367
368 svm->vmcb->control.event_inj = nr
369 | SVM_EVTINJ_VALID
370 | (has_error_code ? SVM_EVTINJ_VALID_ERR : 0)
371 | SVM_EVTINJ_TYPE_EXEPT;
372 svm->vmcb->control.event_inj_err = error_code;
373}
374
300static int has_svm(void) 375static int has_svm(void)
301{ 376{
302 const char *msg; 377 const char *msg;
@@ -319,7 +394,7 @@ static int svm_hardware_enable(void *garbage)
319 394
320 struct svm_cpu_data *sd; 395 struct svm_cpu_data *sd;
321 uint64_t efer; 396 uint64_t efer;
322 struct descriptor_table gdt_descr; 397 struct desc_ptr gdt_descr;
323 struct desc_struct *gdt; 398 struct desc_struct *gdt;
324 int me = raw_smp_processor_id(); 399 int me = raw_smp_processor_id();
325 400
@@ -344,8 +419,8 @@ static int svm_hardware_enable(void *garbage)
344 sd->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1; 419 sd->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1;
345 sd->next_asid = sd->max_asid + 1; 420 sd->next_asid = sd->max_asid + 1;
346 421
347 kvm_get_gdt(&gdt_descr); 422 native_store_gdt(&gdt_descr);
348 gdt = (struct desc_struct *)gdt_descr.base; 423 gdt = (struct desc_struct *)gdt_descr.address;
349 sd->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS); 424 sd->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS);
350 425
351 wrmsrl(MSR_EFER, efer | EFER_SVME); 426 wrmsrl(MSR_EFER, efer | EFER_SVME);
@@ -391,42 +466,98 @@ err_1:
391 466
392} 467}
393 468
469static bool valid_msr_intercept(u32 index)
470{
471 int i;
472
473 for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++)
474 if (direct_access_msrs[i].index == index)
475 return true;
476
477 return false;
478}
479
394static void set_msr_interception(u32 *msrpm, unsigned msr, 480static void set_msr_interception(u32 *msrpm, unsigned msr,
395 int read, int write) 481 int read, int write)
396{ 482{
483 u8 bit_read, bit_write;
484 unsigned long tmp;
485 u32 offset;
486
487 /*
488 * If this warning triggers extend the direct_access_msrs list at the
489 * beginning of the file
490 */
491 WARN_ON(!valid_msr_intercept(msr));
492
493 offset = svm_msrpm_offset(msr);
494 bit_read = 2 * (msr & 0x0f);
495 bit_write = 2 * (msr & 0x0f) + 1;
496 tmp = msrpm[offset];
497
498 BUG_ON(offset == MSR_INVALID);
499
500 read ? clear_bit(bit_read, &tmp) : set_bit(bit_read, &tmp);
501 write ? clear_bit(bit_write, &tmp) : set_bit(bit_write, &tmp);
502
503 msrpm[offset] = tmp;
504}
505
506static void svm_vcpu_init_msrpm(u32 *msrpm)
507{
397 int i; 508 int i;
398 509
399 for (i = 0; i < NUM_MSR_MAPS; i++) { 510 memset(msrpm, 0xff, PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER));
400 if (msr >= msrpm_ranges[i] && 511
401 msr < msrpm_ranges[i] + MSRS_IN_RANGE) { 512 for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
402 u32 msr_offset = (i * MSRS_IN_RANGE + msr - 513 if (!direct_access_msrs[i].always)
403 msrpm_ranges[i]) * 2; 514 continue;
404 515
405 u32 *base = msrpm + (msr_offset / 32); 516 set_msr_interception(msrpm, direct_access_msrs[i].index, 1, 1);
406 u32 msr_shift = msr_offset % 32; 517 }
407 u32 mask = ((write) ? 0 : 2) | ((read) ? 0 : 1); 518}
408 *base = (*base & ~(0x3 << msr_shift)) | 519
409 (mask << msr_shift); 520static void add_msr_offset(u32 offset)
521{
522 int i;
523
524 for (i = 0; i < MSRPM_OFFSETS; ++i) {
525
526 /* Offset already in list? */
527 if (msrpm_offsets[i] == offset)
410 return; 528 return;
411 } 529
530 /* Slot used by another offset? */
531 if (msrpm_offsets[i] != MSR_INVALID)
532 continue;
533
534 /* Add offset to list */
535 msrpm_offsets[i] = offset;
536
537 return;
412 } 538 }
539
540 /*
541 * If this BUG triggers the msrpm_offsets table has an overflow. Just
542 * increase MSRPM_OFFSETS in this case.
543 */
413 BUG(); 544 BUG();
414} 545}
415 546
416static void svm_vcpu_init_msrpm(u32 *msrpm) 547static void init_msrpm_offsets(void)
417{ 548{
418 memset(msrpm, 0xff, PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER)); 549 int i;
419 550
420#ifdef CONFIG_X86_64 551 memset(msrpm_offsets, 0xff, sizeof(msrpm_offsets));
421 set_msr_interception(msrpm, MSR_GS_BASE, 1, 1); 552
422 set_msr_interception(msrpm, MSR_FS_BASE, 1, 1); 553 for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
423 set_msr_interception(msrpm, MSR_KERNEL_GS_BASE, 1, 1); 554 u32 offset;
424 set_msr_interception(msrpm, MSR_LSTAR, 1, 1); 555
425 set_msr_interception(msrpm, MSR_CSTAR, 1, 1); 556 offset = svm_msrpm_offset(direct_access_msrs[i].index);
426 set_msr_interception(msrpm, MSR_SYSCALL_MASK, 1, 1); 557 BUG_ON(offset == MSR_INVALID);
427#endif 558
428 set_msr_interception(msrpm, MSR_K6_STAR, 1, 1); 559 add_msr_offset(offset);
429 set_msr_interception(msrpm, MSR_IA32_SYSENTER_CS, 1, 1); 560 }
430} 561}
431 562
432static void svm_enable_lbrv(struct vcpu_svm *svm) 563static void svm_enable_lbrv(struct vcpu_svm *svm)
@@ -467,6 +598,8 @@ static __init int svm_hardware_setup(void)
467 memset(iopm_va, 0xff, PAGE_SIZE * (1 << IOPM_ALLOC_ORDER)); 598 memset(iopm_va, 0xff, PAGE_SIZE * (1 << IOPM_ALLOC_ORDER));
468 iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT; 599 iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT;
469 600
601 init_msrpm_offsets();
602
470 if (boot_cpu_has(X86_FEATURE_NX)) 603 if (boot_cpu_has(X86_FEATURE_NX))
471 kvm_enable_efer_bits(EFER_NX); 604 kvm_enable_efer_bits(EFER_NX);
472 605
@@ -523,7 +656,7 @@ static void init_seg(struct vmcb_seg *seg)
523{ 656{
524 seg->selector = 0; 657 seg->selector = 0;
525 seg->attrib = SVM_SELECTOR_P_MASK | SVM_SELECTOR_S_MASK | 658 seg->attrib = SVM_SELECTOR_P_MASK | SVM_SELECTOR_S_MASK |
526 SVM_SELECTOR_WRITE_MASK; /* Read/Write Data Segment */ 659 SVM_SELECTOR_WRITE_MASK; /* Read/Write Data Segment */
527 seg->limit = 0xffff; 660 seg->limit = 0xffff;
528 seg->base = 0; 661 seg->base = 0;
529} 662}
@@ -543,16 +676,16 @@ static void init_vmcb(struct vcpu_svm *svm)
543 676
544 svm->vcpu.fpu_active = 1; 677 svm->vcpu.fpu_active = 1;
545 678
546 control->intercept_cr_read = INTERCEPT_CR0_MASK | 679 control->intercept_cr_read = INTERCEPT_CR0_MASK |
547 INTERCEPT_CR3_MASK | 680 INTERCEPT_CR3_MASK |
548 INTERCEPT_CR4_MASK; 681 INTERCEPT_CR4_MASK;
549 682
550 control->intercept_cr_write = INTERCEPT_CR0_MASK | 683 control->intercept_cr_write = INTERCEPT_CR0_MASK |
551 INTERCEPT_CR3_MASK | 684 INTERCEPT_CR3_MASK |
552 INTERCEPT_CR4_MASK | 685 INTERCEPT_CR4_MASK |
553 INTERCEPT_CR8_MASK; 686 INTERCEPT_CR8_MASK;
554 687
555 control->intercept_dr_read = INTERCEPT_DR0_MASK | 688 control->intercept_dr_read = INTERCEPT_DR0_MASK |
556 INTERCEPT_DR1_MASK | 689 INTERCEPT_DR1_MASK |
557 INTERCEPT_DR2_MASK | 690 INTERCEPT_DR2_MASK |
558 INTERCEPT_DR3_MASK | 691 INTERCEPT_DR3_MASK |
@@ -561,7 +694,7 @@ static void init_vmcb(struct vcpu_svm *svm)
561 INTERCEPT_DR6_MASK | 694 INTERCEPT_DR6_MASK |
562 INTERCEPT_DR7_MASK; 695 INTERCEPT_DR7_MASK;
563 696
564 control->intercept_dr_write = INTERCEPT_DR0_MASK | 697 control->intercept_dr_write = INTERCEPT_DR0_MASK |
565 INTERCEPT_DR1_MASK | 698 INTERCEPT_DR1_MASK |
566 INTERCEPT_DR2_MASK | 699 INTERCEPT_DR2_MASK |
567 INTERCEPT_DR3_MASK | 700 INTERCEPT_DR3_MASK |
@@ -575,7 +708,7 @@ static void init_vmcb(struct vcpu_svm *svm)
575 (1 << MC_VECTOR); 708 (1 << MC_VECTOR);
576 709
577 710
578 control->intercept = (1ULL << INTERCEPT_INTR) | 711 control->intercept = (1ULL << INTERCEPT_INTR) |
579 (1ULL << INTERCEPT_NMI) | 712 (1ULL << INTERCEPT_NMI) |
580 (1ULL << INTERCEPT_SMI) | 713 (1ULL << INTERCEPT_SMI) |
581 (1ULL << INTERCEPT_SELECTIVE_CR0) | 714 (1ULL << INTERCEPT_SELECTIVE_CR0) |
@@ -636,7 +769,8 @@ static void init_vmcb(struct vcpu_svm *svm)
636 save->rip = 0x0000fff0; 769 save->rip = 0x0000fff0;
637 svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip; 770 svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip;
638 771
639 /* This is the guest-visible cr0 value. 772 /*
773 * This is the guest-visible cr0 value.
640 * svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0. 774 * svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0.
641 */ 775 */
642 svm->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET; 776 svm->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET;
@@ -706,30 +840,30 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
706 if (err) 840 if (err)
707 goto free_svm; 841 goto free_svm;
708 842
843 err = -ENOMEM;
709 page = alloc_page(GFP_KERNEL); 844 page = alloc_page(GFP_KERNEL);
710 if (!page) { 845 if (!page)
711 err = -ENOMEM;
712 goto uninit; 846 goto uninit;
713 }
714 847
715 err = -ENOMEM;
716 msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER); 848 msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER);
717 if (!msrpm_pages) 849 if (!msrpm_pages)
718 goto uninit; 850 goto free_page1;
719 851
720 nested_msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER); 852 nested_msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER);
721 if (!nested_msrpm_pages) 853 if (!nested_msrpm_pages)
722 goto uninit; 854 goto free_page2;
723
724 svm->msrpm = page_address(msrpm_pages);
725 svm_vcpu_init_msrpm(svm->msrpm);
726 855
727 hsave_page = alloc_page(GFP_KERNEL); 856 hsave_page = alloc_page(GFP_KERNEL);
728 if (!hsave_page) 857 if (!hsave_page)
729 goto uninit; 858 goto free_page3;
859
730 svm->nested.hsave = page_address(hsave_page); 860 svm->nested.hsave = page_address(hsave_page);
731 861
862 svm->msrpm = page_address(msrpm_pages);
863 svm_vcpu_init_msrpm(svm->msrpm);
864
732 svm->nested.msrpm = page_address(nested_msrpm_pages); 865 svm->nested.msrpm = page_address(nested_msrpm_pages);
866 svm_vcpu_init_msrpm(svm->nested.msrpm);
733 867
734 svm->vmcb = page_address(page); 868 svm->vmcb = page_address(page);
735 clear_page(svm->vmcb); 869 clear_page(svm->vmcb);
@@ -744,6 +878,12 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
744 878
745 return &svm->vcpu; 879 return &svm->vcpu;
746 880
881free_page3:
882 __free_pages(nested_msrpm_pages, MSRPM_ALLOC_ORDER);
883free_page2:
884 __free_pages(msrpm_pages, MSRPM_ALLOC_ORDER);
885free_page1:
886 __free_page(page);
747uninit: 887uninit:
748 kvm_vcpu_uninit(&svm->vcpu); 888 kvm_vcpu_uninit(&svm->vcpu);
749free_svm: 889free_svm:
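The reworked allocation path above follows the usual kernel error-handling idiom: allocate in order and, on failure, jump to a label that frees only what already exists, unwinding in reverse. A stand-alone sketch of that pattern under hypothetical names:

        #include <stdlib.h>

        struct ctx { void *a, *b, *c; };

        static struct ctx *ctx_create(void)
        {
                struct ctx *ctx = malloc(sizeof(*ctx));
                if (!ctx)
                        return NULL;

                ctx->a = malloc(64);
                if (!ctx->a)
                        goto free_ctx;
                ctx->b = malloc(64);
                if (!ctx->b)
                        goto free_a;    /* unwind only what exists so far */
                ctx->c = malloc(64);
                if (!ctx->c)
                        goto free_b;

                return ctx;

        free_b:
                free(ctx->b);
        free_a:
                free(ctx->a);
        free_ctx:
                free(ctx);
                return NULL;
        }

        int main(void)
        {
                struct ctx *ctx = ctx_create();
                if (ctx) {
                        free(ctx->c);
                        free(ctx->b);
                        free(ctx->a);
                        free(ctx);
                }
                return 0;
        }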
@@ -877,7 +1017,8 @@ static void svm_get_segment(struct kvm_vcpu *vcpu,
877 var->db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1; 1017 var->db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1;
878 var->g = (s->attrib >> SVM_SELECTOR_G_SHIFT) & 1; 1018 var->g = (s->attrib >> SVM_SELECTOR_G_SHIFT) & 1;
879 1019
880 /* AMD's VMCB does not have an explicit unusable field, so emulate it 1020 /*
1021 * AMD's VMCB does not have an explicit unusable field, so emulate it
881 * for cross vendor migration purposes by "not present" 1022 * for cross vendor migration purposes by "not present"
882 */ 1023 */
883 var->unusable = !var->present || (var->type == 0); 1024 var->unusable = !var->present || (var->type == 0);
@@ -913,7 +1054,8 @@ static void svm_get_segment(struct kvm_vcpu *vcpu,
913 var->type |= 0x1; 1054 var->type |= 0x1;
914 break; 1055 break;
915 case VCPU_SREG_SS: 1056 case VCPU_SREG_SS:
916 /* On AMD CPUs sometimes the DB bit in the segment 1057 /*
1058 * On AMD CPUs sometimes the DB bit in the segment
917 * descriptor is left as 1, although the whole segment has 1059 * descriptor is left as 1, although the whole segment has
918 * been made unusable. Clear it here to pass an Intel VMX 1060 * been made unusable. Clear it here to pass an Intel VMX
919 * entry check when cross vendor migrating. 1061 * entry check when cross vendor migrating.
@@ -931,36 +1073,36 @@ static int svm_get_cpl(struct kvm_vcpu *vcpu)
931 return save->cpl; 1073 return save->cpl;
932} 1074}
933 1075
934static void svm_get_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) 1076static void svm_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
935{ 1077{
936 struct vcpu_svm *svm = to_svm(vcpu); 1078 struct vcpu_svm *svm = to_svm(vcpu);
937 1079
938 dt->limit = svm->vmcb->save.idtr.limit; 1080 dt->size = svm->vmcb->save.idtr.limit;
939 dt->base = svm->vmcb->save.idtr.base; 1081 dt->address = svm->vmcb->save.idtr.base;
940} 1082}
941 1083
942static void svm_set_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) 1084static void svm_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
943{ 1085{
944 struct vcpu_svm *svm = to_svm(vcpu); 1086 struct vcpu_svm *svm = to_svm(vcpu);
945 1087
946 svm->vmcb->save.idtr.limit = dt->limit; 1088 svm->vmcb->save.idtr.limit = dt->size;
947 svm->vmcb->save.idtr.base = dt->base ; 1089 svm->vmcb->save.idtr.base = dt->address ;
948} 1090}
949 1091
950static void svm_get_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) 1092static void svm_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
951{ 1093{
952 struct vcpu_svm *svm = to_svm(vcpu); 1094 struct vcpu_svm *svm = to_svm(vcpu);
953 1095
954 dt->limit = svm->vmcb->save.gdtr.limit; 1096 dt->size = svm->vmcb->save.gdtr.limit;
955 dt->base = svm->vmcb->save.gdtr.base; 1097 dt->address = svm->vmcb->save.gdtr.base;
956} 1098}
957 1099
958static void svm_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) 1100static void svm_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
959{ 1101{
960 struct vcpu_svm *svm = to_svm(vcpu); 1102 struct vcpu_svm *svm = to_svm(vcpu);
961 1103
962 svm->vmcb->save.gdtr.limit = dt->limit; 1104 svm->vmcb->save.gdtr.limit = dt->size;
963 svm->vmcb->save.gdtr.base = dt->base ; 1105 svm->vmcb->save.gdtr.base = dt->address ;
964} 1106}
965 1107
966static void svm_decache_cr0_guest_bits(struct kvm_vcpu *vcpu) 1108static void svm_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
@@ -973,6 +1115,7 @@ static void svm_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
973 1115
974static void update_cr0_intercept(struct vcpu_svm *svm) 1116static void update_cr0_intercept(struct vcpu_svm *svm)
975{ 1117{
1118 struct vmcb *vmcb = svm->vmcb;
976 ulong gcr0 = svm->vcpu.arch.cr0; 1119 ulong gcr0 = svm->vcpu.arch.cr0;
977 u64 *hcr0 = &svm->vmcb->save.cr0; 1120 u64 *hcr0 = &svm->vmcb->save.cr0;
978 1121
@@ -984,11 +1127,25 @@ static void update_cr0_intercept(struct vcpu_svm *svm)
984 1127
985 1128
986 if (gcr0 == *hcr0 && svm->vcpu.fpu_active) { 1129 if (gcr0 == *hcr0 && svm->vcpu.fpu_active) {
987 svm->vmcb->control.intercept_cr_read &= ~INTERCEPT_CR0_MASK; 1130 vmcb->control.intercept_cr_read &= ~INTERCEPT_CR0_MASK;
988 svm->vmcb->control.intercept_cr_write &= ~INTERCEPT_CR0_MASK; 1131 vmcb->control.intercept_cr_write &= ~INTERCEPT_CR0_MASK;
1132 if (is_nested(svm)) {
1133 struct vmcb *hsave = svm->nested.hsave;
1134
1135 hsave->control.intercept_cr_read &= ~INTERCEPT_CR0_MASK;
1136 hsave->control.intercept_cr_write &= ~INTERCEPT_CR0_MASK;
1137 vmcb->control.intercept_cr_read |= svm->nested.intercept_cr_read;
1138 vmcb->control.intercept_cr_write |= svm->nested.intercept_cr_write;
1139 }
989 } else { 1140 } else {
990 svm->vmcb->control.intercept_cr_read |= INTERCEPT_CR0_MASK; 1141 svm->vmcb->control.intercept_cr_read |= INTERCEPT_CR0_MASK;
991 svm->vmcb->control.intercept_cr_write |= INTERCEPT_CR0_MASK; 1142 svm->vmcb->control.intercept_cr_write |= INTERCEPT_CR0_MASK;
1143 if (is_nested(svm)) {
1144 struct vmcb *hsave = svm->nested.hsave;
1145
1146 hsave->control.intercept_cr_read |= INTERCEPT_CR0_MASK;
1147 hsave->control.intercept_cr_write |= INTERCEPT_CR0_MASK;
1148 }
992 } 1149 }
993} 1150}
994 1151
@@ -996,6 +1153,27 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
996{ 1153{
997 struct vcpu_svm *svm = to_svm(vcpu); 1154 struct vcpu_svm *svm = to_svm(vcpu);
998 1155
1156 if (is_nested(svm)) {
1157 /*
1158 * We are here because we run in nested mode, the host kvm
1159 * intercepts cr0 writes but the l1 hypervisor does not.
1160 * But the L1 hypervisor may intercept selective cr0 writes.
1161 * This needs to be checked here.
1162 */
1163 unsigned long old, new;
1164
1165 /* Remove bits that would trigger a real cr0 write intercept */
1166 old = vcpu->arch.cr0 & SVM_CR0_SELECTIVE_MASK;
1167 new = cr0 & SVM_CR0_SELECTIVE_MASK;
1168
1169 if (old == new) {
1170 /* cr0 write with ts and mp unchanged */
1171 svm->vmcb->control.exit_code = SVM_EXIT_CR0_SEL_WRITE;
1172 if (nested_svm_exit_handled(svm) == NESTED_EXIT_DONE)
1173 return;
1174 }
1175 }
1176
999#ifdef CONFIG_X86_64 1177#ifdef CONFIG_X86_64
1000 if (vcpu->arch.efer & EFER_LME) { 1178 if (vcpu->arch.efer & EFER_LME) {
1001 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) { 1179 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
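The nested check added to svm_set_cr0() above forwards the write to L1 as SVM_EXIT_CR0_SEL_WRITE only when none of the selectively intercepted CR0 bits change. A small sketch of that comparison, assuming SVM_CR0_SELECTIVE_MASK covers CR0.TS and CR0.MP:

        #include <stdbool.h>
        #include <stdio.h>

        #define X86_CR0_MP (1UL << 1)
        #define X86_CR0_TS (1UL << 3)
        /* Mirrors SVM_CR0_SELECTIVE_MASK as used above (assumption: TS | MP). */
        #define CR0_SELECTIVE_MASK (X86_CR0_TS | X86_CR0_MP)

        /*
         * True when a CR0 write leaves TS and MP unchanged, i.e. the case the
         * code above reports to L1 as a selective CR0 write.
         */
        static bool cr0_write_is_selective(unsigned long old_cr0, unsigned long new_cr0)
        {
                return (old_cr0 & CR0_SELECTIVE_MASK) == (new_cr0 & CR0_SELECTIVE_MASK);
        }

        int main(void)
        {
                printf("%d\n", cr0_write_is_selective(0x80000033, 0x80000033)); /* 1 */
                printf("%d\n", cr0_write_is_selective(0x00000031, 0x00000039)); /* 0: TS toggled */
                return 0;
        }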
@@ -1129,70 +1307,11 @@ static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)
1129 svm->vmcb->control.asid = sd->next_asid++; 1307 svm->vmcb->control.asid = sd->next_asid++;
1130} 1308}
1131 1309
1132static int svm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *dest) 1310static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value)
1133{
1134 struct vcpu_svm *svm = to_svm(vcpu);
1135
1136 switch (dr) {
1137 case 0 ... 3:
1138 *dest = vcpu->arch.db[dr];
1139 break;
1140 case 4:
1141 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
1142 return EMULATE_FAIL; /* will re-inject UD */
1143 /* fall through */
1144 case 6:
1145 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
1146 *dest = vcpu->arch.dr6;
1147 else
1148 *dest = svm->vmcb->save.dr6;
1149 break;
1150 case 5:
1151 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
1152 return EMULATE_FAIL; /* will re-inject UD */
1153 /* fall through */
1154 case 7:
1155 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
1156 *dest = vcpu->arch.dr7;
1157 else
1158 *dest = svm->vmcb->save.dr7;
1159 break;
1160 }
1161
1162 return EMULATE_DONE;
1163}
1164
1165static int svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value)
1166{ 1311{
1167 struct vcpu_svm *svm = to_svm(vcpu); 1312 struct vcpu_svm *svm = to_svm(vcpu);
1168 1313
1169 switch (dr) { 1314 svm->vmcb->save.dr7 = value;
1170 case 0 ... 3:
1171 vcpu->arch.db[dr] = value;
1172 if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
1173 vcpu->arch.eff_db[dr] = value;
1174 break;
1175 case 4:
1176 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
1177 return EMULATE_FAIL; /* will re-inject UD */
1178 /* fall through */
1179 case 6:
1180 vcpu->arch.dr6 = (value & DR6_VOLATILE) | DR6_FIXED_1;
1181 break;
1182 case 5:
1183 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
1184 return EMULATE_FAIL; /* will re-inject UD */
1185 /* fall through */
1186 case 7:
1187 vcpu->arch.dr7 = (value & DR7_VOLATILE) | DR7_FIXED_1;
1188 if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
1189 svm->vmcb->save.dr7 = vcpu->arch.dr7;
1190 vcpu->arch.switch_db_regs = (value & DR7_BP_EN_MASK);
1191 }
1192 break;
1193 }
1194
1195 return EMULATE_DONE;
1196} 1315}
1197 1316
1198static int pf_interception(struct vcpu_svm *svm) 1317static int pf_interception(struct vcpu_svm *svm)
@@ -1229,7 +1348,7 @@ static int db_interception(struct vcpu_svm *svm)
1229 } 1348 }
1230 1349
1231 if (svm->vcpu.guest_debug & 1350 if (svm->vcpu.guest_debug &
1232 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)){ 1351 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) {
1233 kvm_run->exit_reason = KVM_EXIT_DEBUG; 1352 kvm_run->exit_reason = KVM_EXIT_DEBUG;
1234 kvm_run->debug.arch.pc = 1353 kvm_run->debug.arch.pc =
1235 svm->vmcb->save.cs.base + svm->vmcb->save.rip; 1354 svm->vmcb->save.cs.base + svm->vmcb->save.rip;
@@ -1263,7 +1382,22 @@ static int ud_interception(struct vcpu_svm *svm)
1263static void svm_fpu_activate(struct kvm_vcpu *vcpu) 1382static void svm_fpu_activate(struct kvm_vcpu *vcpu)
1264{ 1383{
1265 struct vcpu_svm *svm = to_svm(vcpu); 1384 struct vcpu_svm *svm = to_svm(vcpu);
1266 svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR); 1385 u32 excp;
1386
1387 if (is_nested(svm)) {
1388 u32 h_excp, n_excp;
1389
1390 h_excp = svm->nested.hsave->control.intercept_exceptions;
1391 n_excp = svm->nested.intercept_exceptions;
1392 h_excp &= ~(1 << NM_VECTOR);
1393 excp = h_excp | n_excp;
1394 } else {
1395 excp = svm->vmcb->control.intercept_exceptions;
1396 excp &= ~(1 << NM_VECTOR);
1397 }
1398
1399 svm->vmcb->control.intercept_exceptions = excp;
1400
1267 svm->vcpu.fpu_active = 1; 1401 svm->vcpu.fpu_active = 1;
1268 update_cr0_intercept(svm); 1402 update_cr0_intercept(svm);
1269} 1403}
@@ -1304,29 +1438,23 @@ static int shutdown_interception(struct vcpu_svm *svm)
1304 1438
1305static int io_interception(struct vcpu_svm *svm) 1439static int io_interception(struct vcpu_svm *svm)
1306{ 1440{
1441 struct kvm_vcpu *vcpu = &svm->vcpu;
1307 u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */ 1442 u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */
1308 int size, in, string; 1443 int size, in, string;
1309 unsigned port; 1444 unsigned port;
1310 1445
1311 ++svm->vcpu.stat.io_exits; 1446 ++svm->vcpu.stat.io_exits;
1312
1313 svm->next_rip = svm->vmcb->control.exit_info_2;
1314
1315 string = (io_info & SVM_IOIO_STR_MASK) != 0; 1447 string = (io_info & SVM_IOIO_STR_MASK) != 0;
1316
1317 if (string) {
1318 if (emulate_instruction(&svm->vcpu,
1319 0, 0, 0) == EMULATE_DO_MMIO)
1320 return 0;
1321 return 1;
1322 }
1323
1324 in = (io_info & SVM_IOIO_TYPE_MASK) != 0; 1448 in = (io_info & SVM_IOIO_TYPE_MASK) != 0;
1449 if (string || in)
1450 return !(emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DO_MMIO);
1451
1325 port = io_info >> 16; 1452 port = io_info >> 16;
1326 size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT; 1453 size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT;
1327 1454 svm->next_rip = svm->vmcb->control.exit_info_2;
1328 skip_emulated_instruction(&svm->vcpu); 1455 skip_emulated_instruction(&svm->vcpu);
1329 return kvm_emulate_pio(&svm->vcpu, in, size, port); 1456
1457 return kvm_fast_pio_out(vcpu, size, port);
1330} 1458}
1331 1459
1332static int nmi_interception(struct vcpu_svm *svm) 1460static int nmi_interception(struct vcpu_svm *svm)
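The reshuffled io_interception() above still starts by decoding exit_info_1. A sketch of that decode; the bit positions are stated here as assumptions about the SVM IOIO exit-info layout, not taken from this patch:

        #include <stdint.h>
        #include <stdio.h>

        /* Assumed IOIO exit_info_1 layout: */
        #define IOIO_TYPE_IN    (1u << 0)       /* 1 = IN, 0 = OUT */
        #define IOIO_STR        (1u << 2)       /* string instruction (INS/OUTS) */
        #define IOIO_SIZE_MASK  (7u << 4)       /* operand size in bytes, one-hot: 1/2/4 */
        #define IOIO_SIZE_SHIFT 4
        #define IOIO_PORT_SHIFT 16

        struct ioio { unsigned port, size; int in, string; };

        static struct ioio decode_ioio(uint32_t io_info)
        {
                struct ioio d;

                d.in     = !!(io_info & IOIO_TYPE_IN);
                d.string = !!(io_info & IOIO_STR);
                d.size   = (io_info & IOIO_SIZE_MASK) >> IOIO_SIZE_SHIFT;
                d.port   = io_info >> IOIO_PORT_SHIFT;
                return d;
        }

        int main(void)
        {
                /* OUT to port 0x3f8, 1-byte operand, non-string */
                struct ioio d = decode_ioio((0x3f8u << IOIO_PORT_SHIFT) | (1u << IOIO_SIZE_SHIFT));

                printf("port=%#x size=%u in=%d string=%d\n", d.port, d.size, d.in, d.string);
                return 0;
        }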
@@ -1379,6 +1507,8 @@ static int nested_svm_check_permissions(struct vcpu_svm *svm)
1379static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, 1507static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
1380 bool has_error_code, u32 error_code) 1508 bool has_error_code, u32 error_code)
1381{ 1509{
1510 int vmexit;
1511
1382 if (!is_nested(svm)) 1512 if (!is_nested(svm))
1383 return 0; 1513 return 0;
1384 1514
@@ -1387,21 +1517,28 @@ static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
1387 svm->vmcb->control.exit_info_1 = error_code; 1517 svm->vmcb->control.exit_info_1 = error_code;
1388 svm->vmcb->control.exit_info_2 = svm->vcpu.arch.cr2; 1518 svm->vmcb->control.exit_info_2 = svm->vcpu.arch.cr2;
1389 1519
1390 return nested_svm_exit_handled(svm); 1520 vmexit = nested_svm_intercept(svm);
1521 if (vmexit == NESTED_EXIT_DONE)
1522 svm->nested.exit_required = true;
1523
1524 return vmexit;
1391} 1525}
1392 1526
1393static inline int nested_svm_intr(struct vcpu_svm *svm) 1527/* This function returns true if it is safe to enable the irq window */
1528static inline bool nested_svm_intr(struct vcpu_svm *svm)
1394{ 1529{
1395 if (!is_nested(svm)) 1530 if (!is_nested(svm))
1396 return 0; 1531 return true;
1397 1532
1398 if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK)) 1533 if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK))
1399 return 0; 1534 return true;
1400 1535
1401 if (!(svm->vcpu.arch.hflags & HF_HIF_MASK)) 1536 if (!(svm->vcpu.arch.hflags & HF_HIF_MASK))
1402 return 0; 1537 return false;
1403 1538
1404 svm->vmcb->control.exit_code = SVM_EXIT_INTR; 1539 svm->vmcb->control.exit_code = SVM_EXIT_INTR;
1540 svm->vmcb->control.exit_info_1 = 0;
1541 svm->vmcb->control.exit_info_2 = 0;
1405 1542
1406 if (svm->nested.intercept & 1ULL) { 1543 if (svm->nested.intercept & 1ULL) {
1407 /* 1544 /*
@@ -1412,21 +1549,40 @@ static inline int nested_svm_intr(struct vcpu_svm *svm)
1412 */ 1549 */
1413 svm->nested.exit_required = true; 1550 svm->nested.exit_required = true;
1414 trace_kvm_nested_intr_vmexit(svm->vmcb->save.rip); 1551 trace_kvm_nested_intr_vmexit(svm->vmcb->save.rip);
1415 return 1; 1552 return false;
1416 } 1553 }
1417 1554
1418 return 0; 1555 return true;
1556}
1557
1558/* This function returns true if it is safe to enable the nmi window */
1559static inline bool nested_svm_nmi(struct vcpu_svm *svm)
1560{
1561 if (!is_nested(svm))
1562 return true;
1563
1564 if (!(svm->nested.intercept & (1ULL << INTERCEPT_NMI)))
1565 return true;
1566
1567 svm->vmcb->control.exit_code = SVM_EXIT_NMI;
1568 svm->nested.exit_required = true;
1569
1570 return false;
1419} 1571}
1420 1572
1421static void *nested_svm_map(struct vcpu_svm *svm, u64 gpa, enum km_type idx) 1573static void *nested_svm_map(struct vcpu_svm *svm, u64 gpa, struct page **_page)
1422{ 1574{
1423 struct page *page; 1575 struct page *page;
1424 1576
1577 might_sleep();
1578
1425 page = gfn_to_page(svm->vcpu.kvm, gpa >> PAGE_SHIFT); 1579 page = gfn_to_page(svm->vcpu.kvm, gpa >> PAGE_SHIFT);
1426 if (is_error_page(page)) 1580 if (is_error_page(page))
1427 goto error; 1581 goto error;
1428 1582
1429 return kmap_atomic(page, idx); 1583 *_page = page;
1584
1585 return kmap(page);
1430 1586
1431error: 1587error:
1432 kvm_release_page_clean(page); 1588 kvm_release_page_clean(page);
@@ -1435,61 +1591,55 @@ error:
1435 return NULL; 1591 return NULL;
1436} 1592}
1437 1593
1438static void nested_svm_unmap(void *addr, enum km_type idx) 1594static void nested_svm_unmap(struct page *page)
1439{ 1595{
1440 struct page *page; 1596 kunmap(page);
1597 kvm_release_page_dirty(page);
1598}
1441 1599
1442 if (!addr) 1600static int nested_svm_intercept_ioio(struct vcpu_svm *svm)
1443 return; 1601{
1602 unsigned port;
1603 u8 val, bit;
1604 u64 gpa;
1444 1605
1445 page = kmap_atomic_to_page(addr); 1606 if (!(svm->nested.intercept & (1ULL << INTERCEPT_IOIO_PROT)))
1607 return NESTED_EXIT_HOST;
1446 1608
1447 kunmap_atomic(addr, idx); 1609 port = svm->vmcb->control.exit_info_1 >> 16;
1448 kvm_release_page_dirty(page); 1610 gpa = svm->nested.vmcb_iopm + (port / 8);
1611 bit = port % 8;
1612 val = 0;
1613
1614 if (kvm_read_guest(svm->vcpu.kvm, gpa, &val, 1))
1615 val &= (1 << bit);
1616
1617 return val ? NESTED_EXIT_DONE : NESTED_EXIT_HOST;
1449} 1618}
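nested_svm_intercept_ioio() above consults the L1 I/O permission bitmap one byte at a time: one bit per port, eight ports per byte. A minimal sketch of that lookup:

        #include <stdbool.h>
        #include <stdint.h>
        #include <stdio.h>

        /*
         * The IOPM holds one bit per I/O port, so a port's permission bit
         * lives at byte (port / 8), bit (port % 8).
         */
        static bool iopm_port_intercepted(const uint8_t *iopm, unsigned port)
        {
                return iopm[port / 8] & (1u << (port % 8));
        }

        int main(void)
        {
                uint8_t iopm[8192] = { 0 };

                iopm[0x3f8 / 8] |= 1u << (0x3f8 % 8);   /* intercept port 0x3f8 */

                printf("%d %d\n",
                       iopm_port_intercepted(iopm, 0x3f8),      /* 1 */
                       iopm_port_intercepted(iopm, 0x80));      /* 0 */
                return 0;
        }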
1450 1619
1451static bool nested_svm_exit_handled_msr(struct vcpu_svm *svm) 1620static int nested_svm_exit_handled_msr(struct vcpu_svm *svm)
1452{ 1621{
1453 u32 param = svm->vmcb->control.exit_info_1 & 1; 1622 u32 offset, msr, value;
1454 u32 msr = svm->vcpu.arch.regs[VCPU_REGS_RCX]; 1623 int write, mask;
1455 bool ret = false;
1456 u32 t0, t1;
1457 u8 *msrpm;
1458 1624
1459 if (!(svm->nested.intercept & (1ULL << INTERCEPT_MSR_PROT))) 1625 if (!(svm->nested.intercept & (1ULL << INTERCEPT_MSR_PROT)))
1460 return false; 1626 return NESTED_EXIT_HOST;
1461
1462 msrpm = nested_svm_map(svm, svm->nested.vmcb_msrpm, KM_USER0);
1463 1627
1464 if (!msrpm) 1628 msr = svm->vcpu.arch.regs[VCPU_REGS_RCX];
1465 goto out; 1629 offset = svm_msrpm_offset(msr);
1630 write = svm->vmcb->control.exit_info_1 & 1;
1631 mask = 1 << ((2 * (msr & 0xf)) + write);
1466 1632
1467 switch (msr) { 1633 if (offset == MSR_INVALID)
1468 case 0 ... 0x1fff: 1634 return NESTED_EXIT_DONE;
1469 t0 = (msr * 2) % 8;
1470 t1 = msr / 8;
1471 break;
1472 case 0xc0000000 ... 0xc0001fff:
1473 t0 = (8192 + msr - 0xc0000000) * 2;
1474 t1 = (t0 / 8);
1475 t0 %= 8;
1476 break;
1477 case 0xc0010000 ... 0xc0011fff:
1478 t0 = (16384 + msr - 0xc0010000) * 2;
1479 t1 = (t0 / 8);
1480 t0 %= 8;
1481 break;
1482 default:
1483 ret = true;
1484 goto out;
1485 }
1486 1635
1487 ret = msrpm[t1] & ((1 << param) << t0); 1636 /* Offset is in 32 bit units but we need it in 8 bit units */
1637 offset *= 4;
1488 1638
1489out: 1639 if (kvm_read_guest(svm->vcpu.kvm, svm->nested.vmcb_msrpm + offset, &value, 4))
1490 nested_svm_unmap(msrpm, KM_USER0); 1640 return NESTED_EXIT_DONE;
1491 1641
1492 return ret; 1642 return (value & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST;
1493} 1643}
1494 1644
1495static int nested_svm_exit_special(struct vcpu_svm *svm) 1645static int nested_svm_exit_special(struct vcpu_svm *svm)
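The rewritten nested_svm_exit_handled_msr() above no longer maps the whole guest MSRPM; it computes which 32-bit word covers the MSR and reads only that word from guest memory. A userspace sketch of the arithmetic it relies on, with two permission bits per MSR (read bit, then write bit) across three MSR ranges; the range bases and sizes are assumptions mirroring svm_msrpm_offset() from this patch:

        #include <stdint.h>
        #include <stdio.h>

        #define MSRPM_OFFSET_INVALID 0xffffffffU

        /* Assumed MSRPM layout: three 0x2000-MSR ranges, 2 bits per MSR. */
        static const uint32_t range_base[] = { 0x00000000, 0xc0000000, 0xc0010000 };
        #define MSRS_IN_RANGE   0x2000u         /* MSRs covered per range */
        #define RANGE_BYTES     0x800u          /* 0x2000 MSRs * 2 bits / 8 */

        /* Offset of the u32 word holding the permission bits for 'msr'. */
        static uint32_t msrpm_word_offset(uint32_t msr)
        {
                for (unsigned i = 0; i < 3; i++) {
                        if (msr < range_base[i] || msr >= range_base[i] + MSRS_IN_RANGE)
                                continue;
                        /* byte offset in the bitmap (4 MSRs per byte), then u32 units */
                        return (i * RANGE_BYTES + (msr - range_base[i]) / 4) / 4;
                }
                return MSRPM_OFFSET_INVALID;
        }

        /* Bit inside that word: 2 bits per MSR, read bit first, then write bit. */
        static uint32_t msrpm_bit(uint32_t msr, int write)
        {
                return 1u << (2 * (msr & 0xf) + write);
        }

        int main(void)
        {
                uint32_t msr = 0xc0000080;      /* EFER, second range */

                printf("word %u, write mask %#x\n",
                       msrpm_word_offset(msr), msrpm_bit(msr, 1));
                return 0;
        }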
@@ -1500,16 +1650,19 @@ static int nested_svm_exit_special(struct vcpu_svm *svm)
1500 case SVM_EXIT_INTR: 1650 case SVM_EXIT_INTR:
1501 case SVM_EXIT_NMI: 1651 case SVM_EXIT_NMI:
1502 return NESTED_EXIT_HOST; 1652 return NESTED_EXIT_HOST;
1503 /* For now we are always handling NPFs when using them */
1504 case SVM_EXIT_NPF: 1653 case SVM_EXIT_NPF:
1654 /* For now we are always handling NPFs when using them */
1505 if (npt_enabled) 1655 if (npt_enabled)
1506 return NESTED_EXIT_HOST; 1656 return NESTED_EXIT_HOST;
1507 break; 1657 break;
1508 /* When we're shadowing, trap PFs */
1509 case SVM_EXIT_EXCP_BASE + PF_VECTOR: 1658 case SVM_EXIT_EXCP_BASE + PF_VECTOR:
1659 /* When we're shadowing, trap PFs */
1510 if (!npt_enabled) 1660 if (!npt_enabled)
1511 return NESTED_EXIT_HOST; 1661 return NESTED_EXIT_HOST;
1512 break; 1662 break;
1663 case SVM_EXIT_EXCP_BASE + NM_VECTOR:
1664 nm_interception(svm);
1665 break;
1513 default: 1666 default:
1514 break; 1667 break;
1515 } 1668 }
@@ -1520,7 +1673,7 @@ static int nested_svm_exit_special(struct vcpu_svm *svm)
1520/* 1673/*
1521 * If this function returns true, this #vmexit was already handled 1674 * If this function returns true, this #vmexit was already handled
1522 */ 1675 */
1523static int nested_svm_exit_handled(struct vcpu_svm *svm) 1676static int nested_svm_intercept(struct vcpu_svm *svm)
1524{ 1677{
1525 u32 exit_code = svm->vmcb->control.exit_code; 1678 u32 exit_code = svm->vmcb->control.exit_code;
1526 int vmexit = NESTED_EXIT_HOST; 1679 int vmexit = NESTED_EXIT_HOST;
@@ -1529,6 +1682,9 @@ static int nested_svm_exit_handled(struct vcpu_svm *svm)
1529 case SVM_EXIT_MSR: 1682 case SVM_EXIT_MSR:
1530 vmexit = nested_svm_exit_handled_msr(svm); 1683 vmexit = nested_svm_exit_handled_msr(svm);
1531 break; 1684 break;
1685 case SVM_EXIT_IOIO:
1686 vmexit = nested_svm_intercept_ioio(svm);
1687 break;
1532 case SVM_EXIT_READ_CR0 ... SVM_EXIT_READ_CR8: { 1688 case SVM_EXIT_READ_CR0 ... SVM_EXIT_READ_CR8: {
1533 u32 cr_bits = 1 << (exit_code - SVM_EXIT_READ_CR0); 1689 u32 cr_bits = 1 << (exit_code - SVM_EXIT_READ_CR0);
1534 if (svm->nested.intercept_cr_read & cr_bits) 1690 if (svm->nested.intercept_cr_read & cr_bits)
@@ -1566,9 +1722,17 @@ static int nested_svm_exit_handled(struct vcpu_svm *svm)
1566 } 1722 }
1567 } 1723 }
1568 1724
1569 if (vmexit == NESTED_EXIT_DONE) { 1725 return vmexit;
1726}
1727
1728static int nested_svm_exit_handled(struct vcpu_svm *svm)
1729{
1730 int vmexit;
1731
1732 vmexit = nested_svm_intercept(svm);
1733
1734 if (vmexit == NESTED_EXIT_DONE)
1570 nested_svm_vmexit(svm); 1735 nested_svm_vmexit(svm);
1571 }
1572 1736
1573 return vmexit; 1737 return vmexit;
1574} 1738}
@@ -1610,6 +1774,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
1610 struct vmcb *nested_vmcb; 1774 struct vmcb *nested_vmcb;
1611 struct vmcb *hsave = svm->nested.hsave; 1775 struct vmcb *hsave = svm->nested.hsave;
1612 struct vmcb *vmcb = svm->vmcb; 1776 struct vmcb *vmcb = svm->vmcb;
1777 struct page *page;
1613 1778
1614 trace_kvm_nested_vmexit_inject(vmcb->control.exit_code, 1779 trace_kvm_nested_vmexit_inject(vmcb->control.exit_code,
1615 vmcb->control.exit_info_1, 1780 vmcb->control.exit_info_1,
@@ -1617,10 +1782,13 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
1617 vmcb->control.exit_int_info, 1782 vmcb->control.exit_int_info,
1618 vmcb->control.exit_int_info_err); 1783 vmcb->control.exit_int_info_err);
1619 1784
1620 nested_vmcb = nested_svm_map(svm, svm->nested.vmcb, KM_USER0); 1785 nested_vmcb = nested_svm_map(svm, svm->nested.vmcb, &page);
1621 if (!nested_vmcb) 1786 if (!nested_vmcb)
1622 return 1; 1787 return 1;
1623 1788
1789 /* Exit nested SVM mode */
1790 svm->nested.vmcb = 0;
1791
1624 /* Give the current vmcb to the guest */ 1792 /* Give the current vmcb to the guest */
1625 disable_gif(svm); 1793 disable_gif(svm);
1626 1794
@@ -1630,9 +1798,13 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
1630 nested_vmcb->save.ds = vmcb->save.ds; 1798 nested_vmcb->save.ds = vmcb->save.ds;
1631 nested_vmcb->save.gdtr = vmcb->save.gdtr; 1799 nested_vmcb->save.gdtr = vmcb->save.gdtr;
1632 nested_vmcb->save.idtr = vmcb->save.idtr; 1800 nested_vmcb->save.idtr = vmcb->save.idtr;
1801 nested_vmcb->save.cr0 = kvm_read_cr0(&svm->vcpu);
1633 if (npt_enabled) 1802 if (npt_enabled)
1634 nested_vmcb->save.cr3 = vmcb->save.cr3; 1803 nested_vmcb->save.cr3 = vmcb->save.cr3;
1804 else
1805 nested_vmcb->save.cr3 = svm->vcpu.arch.cr3;
1635 nested_vmcb->save.cr2 = vmcb->save.cr2; 1806 nested_vmcb->save.cr2 = vmcb->save.cr2;
1807 nested_vmcb->save.cr4 = svm->vcpu.arch.cr4;
1636 nested_vmcb->save.rflags = vmcb->save.rflags; 1808 nested_vmcb->save.rflags = vmcb->save.rflags;
1637 nested_vmcb->save.rip = vmcb->save.rip; 1809 nested_vmcb->save.rip = vmcb->save.rip;
1638 nested_vmcb->save.rsp = vmcb->save.rsp; 1810 nested_vmcb->save.rsp = vmcb->save.rsp;
@@ -1704,10 +1876,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
1704 svm->vmcb->save.cpl = 0; 1876 svm->vmcb->save.cpl = 0;
1705 svm->vmcb->control.exit_int_info = 0; 1877 svm->vmcb->control.exit_int_info = 0;
1706 1878
1707 /* Exit nested SVM mode */ 1879 nested_svm_unmap(page);
1708 svm->nested.vmcb = 0;
1709
1710 nested_svm_unmap(nested_vmcb, KM_USER0);
1711 1880
1712 kvm_mmu_reset_context(&svm->vcpu); 1881 kvm_mmu_reset_context(&svm->vcpu);
1713 kvm_mmu_load(&svm->vcpu); 1882 kvm_mmu_load(&svm->vcpu);
@@ -1717,19 +1886,33 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
1717 1886
1718static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm) 1887static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
1719{ 1888{
1720 u32 *nested_msrpm; 1889 /*
1890 * This function merges the msr permission bitmaps of kvm and the
1891 * nested vmcb. It is optimized in that it only merges the parts where
1892 * the kvm msr permission bitmap may contain zero bits
1893 */
1721 int i; 1894 int i;
1722 1895
1723 nested_msrpm = nested_svm_map(svm, svm->nested.vmcb_msrpm, KM_USER0); 1896 if (!(svm->nested.intercept & (1ULL << INTERCEPT_MSR_PROT)))
1724 if (!nested_msrpm) 1897 return true;
1725 return false;
1726 1898
1727 for (i=0; i< PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER) / 4; i++) 1899 for (i = 0; i < MSRPM_OFFSETS; i++) {
1728 svm->nested.msrpm[i] = svm->msrpm[i] | nested_msrpm[i]; 1900 u32 value, p;
1901 u64 offset;
1729 1902
1730 svm->vmcb->control.msrpm_base_pa = __pa(svm->nested.msrpm); 1903 if (msrpm_offsets[i] == 0xffffffff)
1904 break;
1905
1906 p = msrpm_offsets[i];
1907 offset = svm->nested.vmcb_msrpm + (p * 4);
1908
1909 if (kvm_read_guest(svm->vcpu.kvm, offset, &value, 4))
1910 return false;
1911
1912 svm->nested.msrpm[p] = svm->msrpm[p] | value;
1913 }
1731 1914
1732 nested_svm_unmap(nested_msrpm, KM_USER0); 1915 svm->vmcb->control.msrpm_base_pa = __pa(svm->nested.msrpm);
1733 1916
1734 return true; 1917 return true;
1735} 1918}
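nested_svm_vmrun_msrpm() above merges the two permission bitmaps only at the word offsets recorded by init_msrpm_offsets(), since every other word in KVM's own bitmap already denies access. A small sketch of that sparse OR-merge, with illustrative sizes:

        #include <stddef.h>
        #include <stdint.h>

        #define OFFSET_INVALID 0xffffffffU

        /*
         * Merge the guest's permission words into the host's, but only at the
         * word offsets the host is known to leave as something other than
         * all-ones (a set bit means "intercept").
         */
        static void merge_msrpm(uint32_t *merged, const uint32_t *host,
                                const uint32_t *guest, const uint32_t *offsets,
                                size_t nr_offsets)
        {
                for (size_t i = 0; i < nr_offsets; i++) {
                        uint32_t p = offsets[i];

                        if (p == OFFSET_INVALID)        /* end of the tracked list */
                                break;
                        merged[p] = host[p] | guest[p];
                }
        }

        int main(void)
        {
                uint32_t host[4]    = { 0xffffffff, 0x0000000c, 0xffffffff, 0xffffffff };
                uint32_t guest[4]   = { 0x00000001, 0x00000003, 0x00000000, 0x00000000 };
                uint32_t merged[4]  = { 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff };
                uint32_t offsets[]  = { 1, OFFSET_INVALID };

                merge_msrpm(merged, host, guest, offsets, 2);
                return merged[1] == 0x0000000f ? 0 : 1;
        }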
@@ -1739,26 +1922,34 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
1739 struct vmcb *nested_vmcb; 1922 struct vmcb *nested_vmcb;
1740 struct vmcb *hsave = svm->nested.hsave; 1923 struct vmcb *hsave = svm->nested.hsave;
1741 struct vmcb *vmcb = svm->vmcb; 1924 struct vmcb *vmcb = svm->vmcb;
1925 struct page *page;
1926 u64 vmcb_gpa;
1927
1928 vmcb_gpa = svm->vmcb->save.rax;
1742 1929
1743 nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, KM_USER0); 1930 nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page);
1744 if (!nested_vmcb) 1931 if (!nested_vmcb)
1745 return false; 1932 return false;
1746 1933
1747 /* nested_vmcb is our indicator if nested SVM is activated */ 1934 trace_kvm_nested_vmrun(svm->vmcb->save.rip - 3, vmcb_gpa,
1748 svm->nested.vmcb = svm->vmcb->save.rax;
1749
1750 trace_kvm_nested_vmrun(svm->vmcb->save.rip - 3, svm->nested.vmcb,
1751 nested_vmcb->save.rip, 1935 nested_vmcb->save.rip,
1752 nested_vmcb->control.int_ctl, 1936 nested_vmcb->control.int_ctl,
1753 nested_vmcb->control.event_inj, 1937 nested_vmcb->control.event_inj,
1754 nested_vmcb->control.nested_ctl); 1938 nested_vmcb->control.nested_ctl);
1755 1939
1940 trace_kvm_nested_intercepts(nested_vmcb->control.intercept_cr_read,
1941 nested_vmcb->control.intercept_cr_write,
1942 nested_vmcb->control.intercept_exceptions,
1943 nested_vmcb->control.intercept);
1944
1756 /* Clear internal status */ 1945 /* Clear internal status */
1757 kvm_clear_exception_queue(&svm->vcpu); 1946 kvm_clear_exception_queue(&svm->vcpu);
1758 kvm_clear_interrupt_queue(&svm->vcpu); 1947 kvm_clear_interrupt_queue(&svm->vcpu);
1759 1948
1760 /* Save the old vmcb, so we don't need to pick what we save, but 1949 /*
1761 can restore everything when a VMEXIT occurs */ 1950 * Save the old vmcb, so we don't need to pick what we save, but can
1951 * restore everything when a VMEXIT occurs
1952 */
1762 hsave->save.es = vmcb->save.es; 1953 hsave->save.es = vmcb->save.es;
1763 hsave->save.cs = vmcb->save.cs; 1954 hsave->save.cs = vmcb->save.cs;
1764 hsave->save.ss = vmcb->save.ss; 1955 hsave->save.ss = vmcb->save.ss;
@@ -1798,14 +1989,17 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
1798 if (npt_enabled) { 1989 if (npt_enabled) {
1799 svm->vmcb->save.cr3 = nested_vmcb->save.cr3; 1990 svm->vmcb->save.cr3 = nested_vmcb->save.cr3;
1800 svm->vcpu.arch.cr3 = nested_vmcb->save.cr3; 1991 svm->vcpu.arch.cr3 = nested_vmcb->save.cr3;
1801 } else { 1992 } else
1802 kvm_set_cr3(&svm->vcpu, nested_vmcb->save.cr3); 1993 kvm_set_cr3(&svm->vcpu, nested_vmcb->save.cr3);
1803 kvm_mmu_reset_context(&svm->vcpu); 1994
1804 } 1995 /* Guest paging mode is active - reset mmu */
1996 kvm_mmu_reset_context(&svm->vcpu);
1997
1805 svm->vmcb->save.cr2 = svm->vcpu.arch.cr2 = nested_vmcb->save.cr2; 1998 svm->vmcb->save.cr2 = svm->vcpu.arch.cr2 = nested_vmcb->save.cr2;
1806 kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, nested_vmcb->save.rax); 1999 kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, nested_vmcb->save.rax);
1807 kvm_register_write(&svm->vcpu, VCPU_REGS_RSP, nested_vmcb->save.rsp); 2000 kvm_register_write(&svm->vcpu, VCPU_REGS_RSP, nested_vmcb->save.rsp);
1808 kvm_register_write(&svm->vcpu, VCPU_REGS_RIP, nested_vmcb->save.rip); 2001 kvm_register_write(&svm->vcpu, VCPU_REGS_RIP, nested_vmcb->save.rip);
2002
1809 /* In case we don't even reach vcpu_run, the fields are not updated */ 2003 /* In case we don't even reach vcpu_run, the fields are not updated */
1810 svm->vmcb->save.rax = nested_vmcb->save.rax; 2004 svm->vmcb->save.rax = nested_vmcb->save.rax;
1811 svm->vmcb->save.rsp = nested_vmcb->save.rsp; 2005 svm->vmcb->save.rsp = nested_vmcb->save.rsp;
@@ -1814,22 +2008,8 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
1814 svm->vmcb->save.dr6 = nested_vmcb->save.dr6; 2008 svm->vmcb->save.dr6 = nested_vmcb->save.dr6;
1815 svm->vmcb->save.cpl = nested_vmcb->save.cpl; 2009 svm->vmcb->save.cpl = nested_vmcb->save.cpl;
1816 2010
1817 /* We don't want a nested guest to be more powerful than the guest, 2011 svm->nested.vmcb_msrpm = nested_vmcb->control.msrpm_base_pa & ~0x0fffULL;
1818 so all intercepts are ORed */ 2012 svm->nested.vmcb_iopm = nested_vmcb->control.iopm_base_pa & ~0x0fffULL;
1819 svm->vmcb->control.intercept_cr_read |=
1820 nested_vmcb->control.intercept_cr_read;
1821 svm->vmcb->control.intercept_cr_write |=
1822 nested_vmcb->control.intercept_cr_write;
1823 svm->vmcb->control.intercept_dr_read |=
1824 nested_vmcb->control.intercept_dr_read;
1825 svm->vmcb->control.intercept_dr_write |=
1826 nested_vmcb->control.intercept_dr_write;
1827 svm->vmcb->control.intercept_exceptions |=
1828 nested_vmcb->control.intercept_exceptions;
1829
1830 svm->vmcb->control.intercept |= nested_vmcb->control.intercept;
1831
1832 svm->nested.vmcb_msrpm = nested_vmcb->control.msrpm_base_pa;
1833 2013
1834 /* cache intercepts */ 2014 /* cache intercepts */
1835 svm->nested.intercept_cr_read = nested_vmcb->control.intercept_cr_read; 2015 svm->nested.intercept_cr_read = nested_vmcb->control.intercept_cr_read;
@@ -1846,13 +2026,40 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
1846 else 2026 else
1847 svm->vcpu.arch.hflags &= ~HF_VINTR_MASK; 2027 svm->vcpu.arch.hflags &= ~HF_VINTR_MASK;
1848 2028
2029 if (svm->vcpu.arch.hflags & HF_VINTR_MASK) {
2030 /* We only want the cr8 intercept bits of the guest */
2031 svm->vmcb->control.intercept_cr_read &= ~INTERCEPT_CR8_MASK;
2032 svm->vmcb->control.intercept_cr_write &= ~INTERCEPT_CR8_MASK;
2033 }
2034
2035 /*
2036 * We don't want a nested guest to be more powerful than the guest, so
2037 * all intercepts are ORed
2038 */
2039 svm->vmcb->control.intercept_cr_read |=
2040 nested_vmcb->control.intercept_cr_read;
2041 svm->vmcb->control.intercept_cr_write |=
2042 nested_vmcb->control.intercept_cr_write;
2043 svm->vmcb->control.intercept_dr_read |=
2044 nested_vmcb->control.intercept_dr_read;
2045 svm->vmcb->control.intercept_dr_write |=
2046 nested_vmcb->control.intercept_dr_write;
2047 svm->vmcb->control.intercept_exceptions |=
2048 nested_vmcb->control.intercept_exceptions;
2049
2050 svm->vmcb->control.intercept |= nested_vmcb->control.intercept;
2051
2052 svm->vmcb->control.lbr_ctl = nested_vmcb->control.lbr_ctl;
1849 svm->vmcb->control.int_vector = nested_vmcb->control.int_vector; 2053 svm->vmcb->control.int_vector = nested_vmcb->control.int_vector;
1850 svm->vmcb->control.int_state = nested_vmcb->control.int_state; 2054 svm->vmcb->control.int_state = nested_vmcb->control.int_state;
1851 svm->vmcb->control.tsc_offset += nested_vmcb->control.tsc_offset; 2055 svm->vmcb->control.tsc_offset += nested_vmcb->control.tsc_offset;
1852 svm->vmcb->control.event_inj = nested_vmcb->control.event_inj; 2056 svm->vmcb->control.event_inj = nested_vmcb->control.event_inj;
1853 svm->vmcb->control.event_inj_err = nested_vmcb->control.event_inj_err; 2057 svm->vmcb->control.event_inj_err = nested_vmcb->control.event_inj_err;
1854 2058
1855 nested_svm_unmap(nested_vmcb, KM_USER0); 2059 nested_svm_unmap(page);
2060
2061 /* nested_vmcb is our indicator if nested SVM is activated */
2062 svm->nested.vmcb = vmcb_gpa;
1856 2063
1857 enable_gif(svm); 2064 enable_gif(svm);
1858 2065
@@ -1878,6 +2085,7 @@ static void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb)
1878static int vmload_interception(struct vcpu_svm *svm) 2085static int vmload_interception(struct vcpu_svm *svm)
1879{ 2086{
1880 struct vmcb *nested_vmcb; 2087 struct vmcb *nested_vmcb;
2088 struct page *page;
1881 2089
1882 if (nested_svm_check_permissions(svm)) 2090 if (nested_svm_check_permissions(svm))
1883 return 1; 2091 return 1;
@@ -1885,12 +2093,12 @@ static int vmload_interception(struct vcpu_svm *svm)
1885 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; 2093 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
1886 skip_emulated_instruction(&svm->vcpu); 2094 skip_emulated_instruction(&svm->vcpu);
1887 2095
1888 nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, KM_USER0); 2096 nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page);
1889 if (!nested_vmcb) 2097 if (!nested_vmcb)
1890 return 1; 2098 return 1;
1891 2099
1892 nested_svm_vmloadsave(nested_vmcb, svm->vmcb); 2100 nested_svm_vmloadsave(nested_vmcb, svm->vmcb);
1893 nested_svm_unmap(nested_vmcb, KM_USER0); 2101 nested_svm_unmap(page);
1894 2102
1895 return 1; 2103 return 1;
1896} 2104}
@@ -1898,6 +2106,7 @@ static int vmload_interception(struct vcpu_svm *svm)
1898static int vmsave_interception(struct vcpu_svm *svm) 2106static int vmsave_interception(struct vcpu_svm *svm)
1899{ 2107{
1900 struct vmcb *nested_vmcb; 2108 struct vmcb *nested_vmcb;
2109 struct page *page;
1901 2110
1902 if (nested_svm_check_permissions(svm)) 2111 if (nested_svm_check_permissions(svm))
1903 return 1; 2112 return 1;
@@ -1905,12 +2114,12 @@ static int vmsave_interception(struct vcpu_svm *svm)
1905 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; 2114 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
1906 skip_emulated_instruction(&svm->vcpu); 2115 skip_emulated_instruction(&svm->vcpu);
1907 2116
1908 nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, KM_USER0); 2117 nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page);
1909 if (!nested_vmcb) 2118 if (!nested_vmcb)
1910 return 1; 2119 return 1;
1911 2120
1912 nested_svm_vmloadsave(svm->vmcb, nested_vmcb); 2121 nested_svm_vmloadsave(svm->vmcb, nested_vmcb);
1913 nested_svm_unmap(nested_vmcb, KM_USER0); 2122 nested_svm_unmap(page);
1914 2123
1915 return 1; 2124 return 1;
1916} 2125}
@@ -2013,6 +2222,8 @@ static int task_switch_interception(struct vcpu_svm *svm)
2013 svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_TYPE_MASK; 2222 svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_TYPE_MASK;
2014 uint32_t idt_v = 2223 uint32_t idt_v =
2015 svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_VALID; 2224 svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_VALID;
2225 bool has_error_code = false;
2226 u32 error_code = 0;
2016 2227
2017 tss_selector = (u16)svm->vmcb->control.exit_info_1; 2228 tss_selector = (u16)svm->vmcb->control.exit_info_1;
2018 2229
@@ -2033,6 +2244,12 @@ static int task_switch_interception(struct vcpu_svm *svm)
2033 svm->vcpu.arch.nmi_injected = false; 2244 svm->vcpu.arch.nmi_injected = false;
2034 break; 2245 break;
2035 case SVM_EXITINTINFO_TYPE_EXEPT: 2246 case SVM_EXITINTINFO_TYPE_EXEPT:
2247 if (svm->vmcb->control.exit_info_2 &
2248 (1ULL << SVM_EXITINFOSHIFT_TS_HAS_ERROR_CODE)) {
2249 has_error_code = true;
2250 error_code =
2251 (u32)svm->vmcb->control.exit_info_2;
2252 }
2036 kvm_clear_exception_queue(&svm->vcpu); 2253 kvm_clear_exception_queue(&svm->vcpu);
2037 break; 2254 break;
2038 case SVM_EXITINTINFO_TYPE_INTR: 2255 case SVM_EXITINTINFO_TYPE_INTR:
@@ -2049,7 +2266,14 @@ static int task_switch_interception(struct vcpu_svm *svm)
2049 (int_vec == OF_VECTOR || int_vec == BP_VECTOR))) 2266 (int_vec == OF_VECTOR || int_vec == BP_VECTOR)))
2050 skip_emulated_instruction(&svm->vcpu); 2267 skip_emulated_instruction(&svm->vcpu);
2051 2268
2052 return kvm_task_switch(&svm->vcpu, tss_selector, reason); 2269 if (kvm_task_switch(&svm->vcpu, tss_selector, reason,
2270 has_error_code, error_code) == EMULATE_FAIL) {
2271 svm->vcpu.run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
2272 svm->vcpu.run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
2273 svm->vcpu.run->internal.ndata = 0;
2274 return 0;
2275 }
2276 return 1;
2053} 2277}
2054 2278
2055static int cpuid_interception(struct vcpu_svm *svm) 2279static int cpuid_interception(struct vcpu_svm *svm)
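The two task-switch hunks above pull an optional error code out of exit_info_2 before handing the switch to kvm_task_switch(). A sketch of that extraction, with the flag's bit position given as an assumption mirroring SVM_EXITINFOSHIFT_TS_HAS_ERROR_CODE:

        #include <stdbool.h>
        #include <stdint.h>
        #include <stdio.h>

        /* Assumed bit position of the "has error code" flag. */
        #define TS_HAS_ERROR_CODE_SHIFT 44

        /*
         * On a task-switch #VMEXIT, exit_info_2 flags whether the exception
         * that caused the switch pushed an error code; the low 32 bits then
         * hold that error code.
         */
        static bool ts_error_code(uint64_t exit_info_2, uint32_t *error_code)
        {
                if (!(exit_info_2 & (1ull << TS_HAS_ERROR_CODE_SHIFT)))
                        return false;
                *error_code = (uint32_t)exit_info_2;
                return true;
        }

        int main(void)
        {
                uint64_t info = (1ull << TS_HAS_ERROR_CODE_SHIFT) | 0x18;
                uint32_t ec;

                if (ts_error_code(info, &ec))
                        printf("error code %#x\n", ec);  /* 0x18 */
                return 0;
        }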
@@ -2140,9 +2364,11 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
2140 case MSR_IA32_SYSENTER_ESP: 2364 case MSR_IA32_SYSENTER_ESP:
2141 *data = svm->sysenter_esp; 2365 *data = svm->sysenter_esp;
2142 break; 2366 break;
2143 /* Nobody will change the following 5 values in the VMCB so 2367 /*
2144 we can safely return them on rdmsr. They will always be 0 2368 * Nobody will change the following 5 values in the VMCB so we can
2145 until LBRV is implemented. */ 2369 * safely return them on rdmsr. They will always be 0 until LBRV is
2370 * implemented.
2371 */
2146 case MSR_IA32_DEBUGCTLMSR: 2372 case MSR_IA32_DEBUGCTLMSR:
2147 *data = svm->vmcb->save.dbgctl; 2373 *data = svm->vmcb->save.dbgctl;
2148 break; 2374 break;
@@ -2162,7 +2388,7 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
2162 *data = svm->nested.hsave_msr; 2388 *data = svm->nested.hsave_msr;
2163 break; 2389 break;
2164 case MSR_VM_CR: 2390 case MSR_VM_CR:
2165 *data = 0; 2391 *data = svm->nested.vm_cr_msr;
2166 break; 2392 break;
2167 case MSR_IA32_UCODE_REV: 2393 case MSR_IA32_UCODE_REV:
2168 *data = 0x01000065; 2394 *data = 0x01000065;
@@ -2192,6 +2418,31 @@ static int rdmsr_interception(struct vcpu_svm *svm)
2192 return 1; 2418 return 1;
2193} 2419}
2194 2420
2421static int svm_set_vm_cr(struct kvm_vcpu *vcpu, u64 data)
2422{
2423 struct vcpu_svm *svm = to_svm(vcpu);
2424 int svm_dis, chg_mask;
2425
2426 if (data & ~SVM_VM_CR_VALID_MASK)
2427 return 1;
2428
2429 chg_mask = SVM_VM_CR_VALID_MASK;
2430
2431 if (svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK)
2432 chg_mask &= ~(SVM_VM_CR_SVM_LOCK_MASK | SVM_VM_CR_SVM_DIS_MASK);
2433
2434 svm->nested.vm_cr_msr &= ~chg_mask;
2435 svm->nested.vm_cr_msr |= (data & chg_mask);
2436
2437 svm_dis = svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK;
2438
2439 /* check for svm_disable while efer.svme is set */
2440 if (svm_dis && (vcpu->arch.efer & EFER_SVME))
2441 return 1;
2442
2443 return 0;
2444}
2445
2195static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data) 2446static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
2196{ 2447{
2197 struct vcpu_svm *svm = to_svm(vcpu); 2448 struct vcpu_svm *svm = to_svm(vcpu);
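svm_set_vm_cr(), added above, emulates writes to the VM_CR MSR: reserved bits fault, and once SVMDIS is set the LOCK and SVMDIS bits can no longer be changed. A sketch of that masking; the bit positions and the EFER.SVME check are written out here as assumptions:

        #include <stdbool.h>
        #include <stdint.h>
        #include <stdio.h>

        /* Assumed VM_CR bit layout: LOCK = bit 3, SVMDIS = bit 4. */
        #define VM_CR_LOCK    (1ull << 3)
        #define VM_CR_SVMDIS  (1ull << 4)
        #define VM_CR_VALID   0x1full
        #define EFER_SVME     (1ull << 12)

        /* Returns false (i.e. the write should #GP) on an invalid write. */
        static bool vm_cr_write(uint64_t *vm_cr, uint64_t data, uint64_t efer)
        {
                uint64_t chg_mask = VM_CR_VALID;

                if (data & ~VM_CR_VALID)
                        return false;                   /* reserved bit set */

                if (*vm_cr & VM_CR_SVMDIS)              /* SVMDIS already locked in */
                        chg_mask &= ~(VM_CR_LOCK | VM_CR_SVMDIS);

                *vm_cr = (*vm_cr & ~chg_mask) | (data & chg_mask);

                /* Disabling SVM while the guest still has EFER.SVME set is rejected. */
                if ((*vm_cr & VM_CR_SVMDIS) && (efer & EFER_SVME))
                        return false;

                return true;
        }

        int main(void)
        {
                uint64_t vm_cr = 0;

                printf("%d\n", vm_cr_write(&vm_cr, VM_CR_SVMDIS, 0));   /* 1 */
                printf("%d\n", vm_cr_write(&vm_cr, 0, 0));              /* SVMDIS stays set */
                printf("%#llx\n", (unsigned long long)vm_cr);           /* 0x10 */
                return 0;
        }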
@@ -2258,6 +2509,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
2258 svm->nested.hsave_msr = data; 2509 svm->nested.hsave_msr = data;
2259 break; 2510 break;
2260 case MSR_VM_CR: 2511 case MSR_VM_CR:
2512 return svm_set_vm_cr(vcpu, data);
2261 case MSR_VM_IGNNE: 2513 case MSR_VM_IGNNE:
2262 pr_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data); 2514 pr_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data);
2263 break; 2515 break;
@@ -2321,16 +2573,16 @@ static int pause_interception(struct vcpu_svm *svm)
2321} 2573}
2322 2574
2323static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = { 2575static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = {
2324 [SVM_EXIT_READ_CR0] = emulate_on_interception, 2576 [SVM_EXIT_READ_CR0] = emulate_on_interception,
2325 [SVM_EXIT_READ_CR3] = emulate_on_interception, 2577 [SVM_EXIT_READ_CR3] = emulate_on_interception,
2326 [SVM_EXIT_READ_CR4] = emulate_on_interception, 2578 [SVM_EXIT_READ_CR4] = emulate_on_interception,
2327 [SVM_EXIT_READ_CR8] = emulate_on_interception, 2579 [SVM_EXIT_READ_CR8] = emulate_on_interception,
2328 [SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception, 2580 [SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception,
2329 [SVM_EXIT_WRITE_CR0] = emulate_on_interception, 2581 [SVM_EXIT_WRITE_CR0] = emulate_on_interception,
2330 [SVM_EXIT_WRITE_CR3] = emulate_on_interception, 2582 [SVM_EXIT_WRITE_CR3] = emulate_on_interception,
2331 [SVM_EXIT_WRITE_CR4] = emulate_on_interception, 2583 [SVM_EXIT_WRITE_CR4] = emulate_on_interception,
2332 [SVM_EXIT_WRITE_CR8] = cr8_write_interception, 2584 [SVM_EXIT_WRITE_CR8] = cr8_write_interception,
2333 [SVM_EXIT_READ_DR0] = emulate_on_interception, 2585 [SVM_EXIT_READ_DR0] = emulate_on_interception,
2334 [SVM_EXIT_READ_DR1] = emulate_on_interception, 2586 [SVM_EXIT_READ_DR1] = emulate_on_interception,
2335 [SVM_EXIT_READ_DR2] = emulate_on_interception, 2587 [SVM_EXIT_READ_DR2] = emulate_on_interception,
2336 [SVM_EXIT_READ_DR3] = emulate_on_interception, 2588 [SVM_EXIT_READ_DR3] = emulate_on_interception,
@@ -2349,15 +2601,14 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = {
2349 [SVM_EXIT_EXCP_BASE + DB_VECTOR] = db_interception, 2601 [SVM_EXIT_EXCP_BASE + DB_VECTOR] = db_interception,
2350 [SVM_EXIT_EXCP_BASE + BP_VECTOR] = bp_interception, 2602 [SVM_EXIT_EXCP_BASE + BP_VECTOR] = bp_interception,
2351 [SVM_EXIT_EXCP_BASE + UD_VECTOR] = ud_interception, 2603 [SVM_EXIT_EXCP_BASE + UD_VECTOR] = ud_interception,
2352 [SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception, 2604 [SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception,
2353 [SVM_EXIT_EXCP_BASE + NM_VECTOR] = nm_interception, 2605 [SVM_EXIT_EXCP_BASE + NM_VECTOR] = nm_interception,
2354 [SVM_EXIT_EXCP_BASE + MC_VECTOR] = mc_interception, 2606 [SVM_EXIT_EXCP_BASE + MC_VECTOR] = mc_interception,
2355 [SVM_EXIT_INTR] = intr_interception, 2607 [SVM_EXIT_INTR] = intr_interception,
2356 [SVM_EXIT_NMI] = nmi_interception, 2608 [SVM_EXIT_NMI] = nmi_interception,
2357 [SVM_EXIT_SMI] = nop_on_interception, 2609 [SVM_EXIT_SMI] = nop_on_interception,
2358 [SVM_EXIT_INIT] = nop_on_interception, 2610 [SVM_EXIT_INIT] = nop_on_interception,
2359 [SVM_EXIT_VINTR] = interrupt_window_interception, 2611 [SVM_EXIT_VINTR] = interrupt_window_interception,
2360 /* [SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception, */
2361 [SVM_EXIT_CPUID] = cpuid_interception, 2612 [SVM_EXIT_CPUID] = cpuid_interception,
2362 [SVM_EXIT_IRET] = iret_interception, 2613 [SVM_EXIT_IRET] = iret_interception,
2363 [SVM_EXIT_INVD] = emulate_on_interception, 2614 [SVM_EXIT_INVD] = emulate_on_interception,
@@ -2365,7 +2616,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = {
2365 [SVM_EXIT_HLT] = halt_interception, 2616 [SVM_EXIT_HLT] = halt_interception,
2366 [SVM_EXIT_INVLPG] = invlpg_interception, 2617 [SVM_EXIT_INVLPG] = invlpg_interception,
2367 [SVM_EXIT_INVLPGA] = invlpga_interception, 2618 [SVM_EXIT_INVLPGA] = invlpga_interception,
2368 [SVM_EXIT_IOIO] = io_interception, 2619 [SVM_EXIT_IOIO] = io_interception,
2369 [SVM_EXIT_MSR] = msr_interception, 2620 [SVM_EXIT_MSR] = msr_interception,
2370 [SVM_EXIT_TASK_SWITCH] = task_switch_interception, 2621 [SVM_EXIT_TASK_SWITCH] = task_switch_interception,
2371 [SVM_EXIT_SHUTDOWN] = shutdown_interception, 2622 [SVM_EXIT_SHUTDOWN] = shutdown_interception,
@@ -2388,7 +2639,7 @@ static int handle_exit(struct kvm_vcpu *vcpu)
2388 struct kvm_run *kvm_run = vcpu->run; 2639 struct kvm_run *kvm_run = vcpu->run;
2389 u32 exit_code = svm->vmcb->control.exit_code; 2640 u32 exit_code = svm->vmcb->control.exit_code;
2390 2641
2391 trace_kvm_exit(exit_code, svm->vmcb->save.rip); 2642 trace_kvm_exit(exit_code, vcpu);
2392 2643
2393 if (unlikely(svm->nested.exit_required)) { 2644 if (unlikely(svm->nested.exit_required)) {
2394 nested_svm_vmexit(svm); 2645 nested_svm_vmexit(svm);
@@ -2506,6 +2757,9 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
2506{ 2757{
2507 struct vcpu_svm *svm = to_svm(vcpu); 2758 struct vcpu_svm *svm = to_svm(vcpu);
2508 2759
2760 if (is_nested(svm) && (vcpu->arch.hflags & HF_VINTR_MASK))
2761 return;
2762
2509 if (irr == -1) 2763 if (irr == -1)
2510 return; 2764 return;
2511 2765
@@ -2563,13 +2817,13 @@ static void enable_irq_window(struct kvm_vcpu *vcpu)
2563{ 2817{
2564 struct vcpu_svm *svm = to_svm(vcpu); 2818 struct vcpu_svm *svm = to_svm(vcpu);
2565 2819
2566 nested_svm_intr(svm); 2820 /*
2567 2821 * In case GIF=0 we can't rely on the CPU to tell us when GIF becomes
2568 /* In case GIF=0 we can't rely on the CPU to tell us when 2822 * 1, because that's a separate STGI/VMRUN intercept. The next time we
2569 * GIF becomes 1, because that's a separate STGI/VMRUN intercept. 2823 * get that intercept, this function will be called again though and
2570 * The next time we get that intercept, this function will be 2824 * we'll get the vintr intercept.
2571 * called again though and we'll get the vintr intercept. */ 2825 */
2572 if (gif_set(svm)) { 2826 if (gif_set(svm) && nested_svm_intr(svm)) {
2573 svm_set_vintr(svm); 2827 svm_set_vintr(svm);
2574 svm_inject_irq(svm, 0x0); 2828 svm_inject_irq(svm, 0x0);
2575 } 2829 }
@@ -2583,12 +2837,15 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu)
2583 == HF_NMI_MASK) 2837 == HF_NMI_MASK)
2584 return; /* IRET will cause a vm exit */ 2838 return; /* IRET will cause a vm exit */
2585 2839
2586 /* Something prevents NMI from been injected. Single step over 2840 /*
2587 possible problem (IRET or exception injection or interrupt 2841 * Something prevents NMI from being injected. Single step over possible
2588 shadow) */ 2842 * problem (IRET or exception injection or interrupt shadow)
2589 svm->nmi_singlestep = true; 2843 */
2590 svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF); 2844 if (gif_set(svm) && nested_svm_nmi(svm)) {
2591 update_db_intercept(vcpu); 2845 svm->nmi_singlestep = true;
2846 svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
2847 update_db_intercept(vcpu);
2848 }
2592} 2849}
2593 2850
2594static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr) 2851static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr)
@@ -2609,6 +2866,9 @@ static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu)
2609{ 2866{
2610 struct vcpu_svm *svm = to_svm(vcpu); 2867 struct vcpu_svm *svm = to_svm(vcpu);
2611 2868
2869 if (is_nested(svm) && (vcpu->arch.hflags & HF_VINTR_MASK))
2870 return;
2871
2612 if (!(svm->vmcb->control.intercept_cr_write & INTERCEPT_CR8_MASK)) { 2872 if (!(svm->vmcb->control.intercept_cr_write & INTERCEPT_CR8_MASK)) {
2613 int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK; 2873 int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK;
2614 kvm_set_cr8(vcpu, cr8); 2874 kvm_set_cr8(vcpu, cr8);
@@ -2620,6 +2880,9 @@ static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu)
2620 struct vcpu_svm *svm = to_svm(vcpu); 2880 struct vcpu_svm *svm = to_svm(vcpu);
2621 u64 cr8; 2881 u64 cr8;
2622 2882
2883 if (is_nested(svm) && (vcpu->arch.hflags & HF_VINTR_MASK))
2884 return;
2885
2623 cr8 = kvm_get_cr8(vcpu); 2886 cr8 = kvm_get_cr8(vcpu);
2624 svm->vmcb->control.int_ctl &= ~V_TPR_MASK; 2887 svm->vmcb->control.int_ctl &= ~V_TPR_MASK;
2625 svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK; 2888 svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK;
@@ -2630,6 +2893,9 @@ static void svm_complete_interrupts(struct vcpu_svm *svm)
2630 u8 vector; 2893 u8 vector;
2631 int type; 2894 int type;
2632 u32 exitintinfo = svm->vmcb->control.exit_int_info; 2895 u32 exitintinfo = svm->vmcb->control.exit_int_info;
2896 unsigned int3_injected = svm->int3_injected;
2897
2898 svm->int3_injected = 0;
2633 2899
2634 if (svm->vcpu.arch.hflags & HF_IRET_MASK) 2900 if (svm->vcpu.arch.hflags & HF_IRET_MASK)
2635 svm->vcpu.arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK); 2901 svm->vcpu.arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK);
@@ -2649,12 +2915,21 @@ static void svm_complete_interrupts(struct vcpu_svm *svm)
2649 svm->vcpu.arch.nmi_injected = true; 2915 svm->vcpu.arch.nmi_injected = true;
2650 break; 2916 break;
2651 case SVM_EXITINTINFO_TYPE_EXEPT: 2917 case SVM_EXITINTINFO_TYPE_EXEPT:
2652 /* In case of software exception do not reinject an exception
2653 vector, but re-execute and instruction instead */
2654 if (is_nested(svm)) 2918 if (is_nested(svm))
2655 break; 2919 break;
2656 if (kvm_exception_is_soft(vector)) 2920 /*
2921 * In case of software exceptions, do not reinject the vector,
2922 * but re-execute the instruction instead. Rewind RIP first
2923 * if we emulated INT3 before.
2924 */
2925 if (kvm_exception_is_soft(vector)) {
2926 if (vector == BP_VECTOR && int3_injected &&
2927 kvm_is_linear_rip(&svm->vcpu, svm->int3_rip))
2928 kvm_rip_write(&svm->vcpu,
2929 kvm_rip_read(&svm->vcpu) -
2930 int3_injected);
2657 break; 2931 break;
2932 }
2658 if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) { 2933 if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) {
2659 u32 err = svm->vmcb->control.exit_int_info_err; 2934 u32 err = svm->vmcb->control.exit_int_info_err;
2660 kvm_queue_exception_e(&svm->vcpu, vector, err); 2935 kvm_queue_exception_e(&svm->vcpu, vector, err);
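The reinjection change above rewinds RIP when an injected INT3 never delivered: the instruction was already skipped at injection time, so RIP must move back by the injected length before the guest re-executes it. A simplified sketch; the kernel compares the linear RIP via kvm_is_linear_rip(), for which a plain equality stands in here:

        #include <stdint.h>
        #include <stdio.h>

        struct vcpu_state {
                uint64_t rip;
                uint64_t int3_rip;      /* RIP recorded when the #BP was injected */
                unsigned int3_injected; /* instruction length we skipped, 0 if none */
        };

        /* Undo the skip if the injected #BP never made it into the guest. */
        static void rewind_int3(struct vcpu_state *v)
        {
                if (v->int3_injected && v->rip == v->int3_rip)
                        v->rip -= v->int3_injected;
                v->int3_injected = 0;
        }

        int main(void)
        {
                struct vcpu_state v = {
                        .rip = 0x1001, .int3_rip = 0x1001, .int3_injected = 1,
                };

                rewind_int3(&v);
                printf("%#llx\n", (unsigned long long)v.rip);   /* 0x1000 */
                return 0;
        }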
@@ -2875,24 +3150,24 @@ static void svm_cpuid_update(struct kvm_vcpu *vcpu)
2875} 3150}
2876 3151
2877static const struct trace_print_flags svm_exit_reasons_str[] = { 3152static const struct trace_print_flags svm_exit_reasons_str[] = {
2878 { SVM_EXIT_READ_CR0, "read_cr0" }, 3153 { SVM_EXIT_READ_CR0, "read_cr0" },
2879 { SVM_EXIT_READ_CR3, "read_cr3" }, 3154 { SVM_EXIT_READ_CR3, "read_cr3" },
2880 { SVM_EXIT_READ_CR4, "read_cr4" }, 3155 { SVM_EXIT_READ_CR4, "read_cr4" },
2881 { SVM_EXIT_READ_CR8, "read_cr8" }, 3156 { SVM_EXIT_READ_CR8, "read_cr8" },
2882 { SVM_EXIT_WRITE_CR0, "write_cr0" }, 3157 { SVM_EXIT_WRITE_CR0, "write_cr0" },
2883 { SVM_EXIT_WRITE_CR3, "write_cr3" }, 3158 { SVM_EXIT_WRITE_CR3, "write_cr3" },
2884 { SVM_EXIT_WRITE_CR4, "write_cr4" }, 3159 { SVM_EXIT_WRITE_CR4, "write_cr4" },
2885 { SVM_EXIT_WRITE_CR8, "write_cr8" }, 3160 { SVM_EXIT_WRITE_CR8, "write_cr8" },
2886 { SVM_EXIT_READ_DR0, "read_dr0" }, 3161 { SVM_EXIT_READ_DR0, "read_dr0" },
2887 { SVM_EXIT_READ_DR1, "read_dr1" }, 3162 { SVM_EXIT_READ_DR1, "read_dr1" },
2888 { SVM_EXIT_READ_DR2, "read_dr2" }, 3163 { SVM_EXIT_READ_DR2, "read_dr2" },
2889 { SVM_EXIT_READ_DR3, "read_dr3" }, 3164 { SVM_EXIT_READ_DR3, "read_dr3" },
2890 { SVM_EXIT_WRITE_DR0, "write_dr0" }, 3165 { SVM_EXIT_WRITE_DR0, "write_dr0" },
2891 { SVM_EXIT_WRITE_DR1, "write_dr1" }, 3166 { SVM_EXIT_WRITE_DR1, "write_dr1" },
2892 { SVM_EXIT_WRITE_DR2, "write_dr2" }, 3167 { SVM_EXIT_WRITE_DR2, "write_dr2" },
2893 { SVM_EXIT_WRITE_DR3, "write_dr3" }, 3168 { SVM_EXIT_WRITE_DR3, "write_dr3" },
2894 { SVM_EXIT_WRITE_DR5, "write_dr5" }, 3169 { SVM_EXIT_WRITE_DR5, "write_dr5" },
2895 { SVM_EXIT_WRITE_DR7, "write_dr7" }, 3170 { SVM_EXIT_WRITE_DR7, "write_dr7" },
2896 { SVM_EXIT_EXCP_BASE + DB_VECTOR, "DB excp" }, 3171 { SVM_EXIT_EXCP_BASE + DB_VECTOR, "DB excp" },
2897 { SVM_EXIT_EXCP_BASE + BP_VECTOR, "BP excp" }, 3172 { SVM_EXIT_EXCP_BASE + BP_VECTOR, "BP excp" },
2898 { SVM_EXIT_EXCP_BASE + UD_VECTOR, "UD excp" }, 3173 { SVM_EXIT_EXCP_BASE + UD_VECTOR, "UD excp" },
@@ -2941,8 +3216,10 @@ static void svm_fpu_deactivate(struct kvm_vcpu *vcpu)
2941{ 3216{
2942 struct vcpu_svm *svm = to_svm(vcpu); 3217 struct vcpu_svm *svm = to_svm(vcpu);
2943 3218
2944 update_cr0_intercept(svm);
2945 svm->vmcb->control.intercept_exceptions |= 1 << NM_VECTOR; 3219 svm->vmcb->control.intercept_exceptions |= 1 << NM_VECTOR;
3220 if (is_nested(svm))
3221 svm->nested.hsave->control.intercept_exceptions |= 1 << NM_VECTOR;
3222 update_cr0_intercept(svm);
2946} 3223}
2947 3224
2948static struct kvm_x86_ops svm_x86_ops = { 3225static struct kvm_x86_ops svm_x86_ops = {
@@ -2981,8 +3258,7 @@ static struct kvm_x86_ops svm_x86_ops = {
2981 .set_idt = svm_set_idt, 3258 .set_idt = svm_set_idt,
2982 .get_gdt = svm_get_gdt, 3259 .get_gdt = svm_get_gdt,
2983 .set_gdt = svm_set_gdt, 3260 .set_gdt = svm_set_gdt,
2984 .get_dr = svm_get_dr, 3261 .set_dr7 = svm_set_dr7,
2985 .set_dr = svm_set_dr,
2986 .cache_reg = svm_cache_reg, 3262 .cache_reg = svm_cache_reg,
2987 .get_rflags = svm_get_rflags, 3263 .get_rflags = svm_get_rflags,
2988 .set_rflags = svm_set_rflags, 3264 .set_rflags = svm_set_rflags,
diff --git a/arch/x86/kvm/timer.c b/arch/x86/kvm/timer.c
index eea40439066c..4ddadb1a5ffe 100644
--- a/arch/x86/kvm/timer.c
+++ b/arch/x86/kvm/timer.c
@@ -12,7 +12,8 @@ static int __kvm_timer_fn(struct kvm_vcpu *vcpu, struct kvm_timer *ktimer)
12 /* 12 /*
13 * There is a race window between reading and incrementing, but we do 13 * There is a race window between reading and incrementing, but we do
14 * not care about potentially loosing timer events in the !reinject 14 * not care about potentially loosing timer events in the !reinject
15 * case anyway. 15 * case anyway. Note: KVM_REQ_PENDING_TIMER is implicitly checked
16 * in vcpu_enter_guest.
16 */ 17 */
17 if (ktimer->reinject || !atomic_read(&ktimer->pending)) { 18 if (ktimer->reinject || !atomic_read(&ktimer->pending)) {
18 atomic_inc(&ktimer->pending); 19 atomic_inc(&ktimer->pending);
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 6ad30a29f044..a6544b8e7c0f 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -5,8 +5,6 @@
5 5
6#undef TRACE_SYSTEM 6#undef TRACE_SYSTEM
7#define TRACE_SYSTEM kvm 7#define TRACE_SYSTEM kvm
8#define TRACE_INCLUDE_PATH arch/x86/kvm
9#define TRACE_INCLUDE_FILE trace
10 8
11/* 9/*
12 * Tracepoint for guest mode entry. 10 * Tracepoint for guest mode entry.
@@ -184,8 +182,8 @@ TRACE_EVENT(kvm_apic,
184 * Tracepoint for kvm guest exit: 182 * Tracepoint for kvm guest exit:
185 */ 183 */
186TRACE_EVENT(kvm_exit, 184TRACE_EVENT(kvm_exit,
187 TP_PROTO(unsigned int exit_reason, unsigned long guest_rip), 185 TP_PROTO(unsigned int exit_reason, struct kvm_vcpu *vcpu),
188 TP_ARGS(exit_reason, guest_rip), 186 TP_ARGS(exit_reason, vcpu),
189 187
190 TP_STRUCT__entry( 188 TP_STRUCT__entry(
191 __field( unsigned int, exit_reason ) 189 __field( unsigned int, exit_reason )
@@ -194,7 +192,7 @@ TRACE_EVENT(kvm_exit,
194 192
195 TP_fast_assign( 193 TP_fast_assign(
196 __entry->exit_reason = exit_reason; 194 __entry->exit_reason = exit_reason;
197 __entry->guest_rip = guest_rip; 195 __entry->guest_rip = kvm_rip_read(vcpu);
198 ), 196 ),
199 197
200 TP_printk("reason %s rip 0x%lx", 198 TP_printk("reason %s rip 0x%lx",
@@ -221,6 +219,38 @@ TRACE_EVENT(kvm_inj_virq,
221 TP_printk("irq %u", __entry->irq) 219 TP_printk("irq %u", __entry->irq)
222); 220);
223 221
222#define EXS(x) { x##_VECTOR, "#" #x }
223
224#define kvm_trace_sym_exc \
225 EXS(DE), EXS(DB), EXS(BP), EXS(OF), EXS(BR), EXS(UD), EXS(NM), \
226 EXS(DF), EXS(TS), EXS(NP), EXS(SS), EXS(GP), EXS(PF), \
227 EXS(MF), EXS(MC)
228
229/*
230 * Tracepoint for kvm interrupt injection:
231 */
232TRACE_EVENT(kvm_inj_exception,
233 TP_PROTO(unsigned exception, bool has_error, unsigned error_code),
234 TP_ARGS(exception, has_error, error_code),
235
236 TP_STRUCT__entry(
237 __field( u8, exception )
238 __field( u8, has_error )
239 __field( u32, error_code )
240 ),
241
242 TP_fast_assign(
243 __entry->exception = exception;
244 __entry->has_error = has_error;
245 __entry->error_code = error_code;
246 ),
247
248 TP_printk("%s (0x%x)",
249 __print_symbolic(__entry->exception, kvm_trace_sym_exc),
250 /* FIXME: don't print error_code if not present */
251 __entry->has_error ? __entry->error_code : 0)
252);
253
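The kvm_inj_exception tracepoint added above takes the vector, an error-code-present flag and the error code. It is presumably fired from the common exception-injection path in x86.c; the call site itself is not part of the hunks shown, but it would look roughly like this (field names match the vcpu->arch.exception state used elsewhere in this diff):

    /* Hypothetical call-site sketch. */
    trace_kvm_inj_exception(vcpu->arch.exception.nr,
                            vcpu->arch.exception.has_error_code,
                            vcpu->arch.exception.error_code);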
224/* 254/*
225 * Tracepoint for page fault. 255 * Tracepoint for page fault.
226 */ 256 */
@@ -413,12 +443,34 @@ TRACE_EVENT(kvm_nested_vmrun,
413 ), 443 ),
414 444
415 TP_printk("rip: 0x%016llx vmcb: 0x%016llx nrip: 0x%016llx int_ctl: 0x%08x " 445 TP_printk("rip: 0x%016llx vmcb: 0x%016llx nrip: 0x%016llx int_ctl: 0x%08x "
416 "event_inj: 0x%08x npt: %s\n", 446 "event_inj: 0x%08x npt: %s",
417 __entry->rip, __entry->vmcb, __entry->nested_rip, 447 __entry->rip, __entry->vmcb, __entry->nested_rip,
418 __entry->int_ctl, __entry->event_inj, 448 __entry->int_ctl, __entry->event_inj,
419 __entry->npt ? "on" : "off") 449 __entry->npt ? "on" : "off")
420); 450);
421 451
452TRACE_EVENT(kvm_nested_intercepts,
453 TP_PROTO(__u16 cr_read, __u16 cr_write, __u32 exceptions, __u64 intercept),
454 TP_ARGS(cr_read, cr_write, exceptions, intercept),
455
456 TP_STRUCT__entry(
457 __field( __u16, cr_read )
458 __field( __u16, cr_write )
459 __field( __u32, exceptions )
460 __field( __u64, intercept )
461 ),
462
463 TP_fast_assign(
464 __entry->cr_read = cr_read;
465 __entry->cr_write = cr_write;
466 __entry->exceptions = exceptions;
467 __entry->intercept = intercept;
468 ),
469
470 TP_printk("cr_read: %04x cr_write: %04x excp: %08x intercept: %016llx",
471 __entry->cr_read, __entry->cr_write, __entry->exceptions,
472 __entry->intercept)
473);
422/* 474/*
423 * Tracepoint for #VMEXIT while nested 475 * Tracepoint for #VMEXIT while nested
424 */ 476 */
@@ -447,7 +499,7 @@ TRACE_EVENT(kvm_nested_vmexit,
447 __entry->exit_int_info_err = exit_int_info_err; 499 __entry->exit_int_info_err = exit_int_info_err;
448 ), 500 ),
449 TP_printk("rip: 0x%016llx reason: %s ext_inf1: 0x%016llx " 501 TP_printk("rip: 0x%016llx reason: %s ext_inf1: 0x%016llx "
450 "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x\n", 502 "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x",
451 __entry->rip, 503 __entry->rip,
452 ftrace_print_symbols_seq(p, __entry->exit_code, 504 ftrace_print_symbols_seq(p, __entry->exit_code,
453 kvm_x86_ops->exit_reasons_str), 505 kvm_x86_ops->exit_reasons_str),
@@ -482,7 +534,7 @@ TRACE_EVENT(kvm_nested_vmexit_inject,
482 ), 534 ),
483 535
484 TP_printk("reason: %s ext_inf1: 0x%016llx " 536 TP_printk("reason: %s ext_inf1: 0x%016llx "
485 "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x\n", 537 "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x",
486 ftrace_print_symbols_seq(p, __entry->exit_code, 538 ftrace_print_symbols_seq(p, __entry->exit_code,
487 kvm_x86_ops->exit_reasons_str), 539 kvm_x86_ops->exit_reasons_str),
488 __entry->exit_info1, __entry->exit_info2, 540 __entry->exit_info1, __entry->exit_info2,
@@ -504,7 +556,7 @@ TRACE_EVENT(kvm_nested_intr_vmexit,
504 __entry->rip = rip 556 __entry->rip = rip
505 ), 557 ),
506 558
507 TP_printk("rip: 0x%016llx\n", __entry->rip) 559 TP_printk("rip: 0x%016llx", __entry->rip)
508); 560);
509 561
510/* 562/*
@@ -526,7 +578,7 @@ TRACE_EVENT(kvm_invlpga,
526 __entry->address = address; 578 __entry->address = address;
527 ), 579 ),
528 580
529 TP_printk("rip: 0x%016llx asid: %d address: 0x%016llx\n", 581 TP_printk("rip: 0x%016llx asid: %d address: 0x%016llx",
530 __entry->rip, __entry->asid, __entry->address) 582 __entry->rip, __entry->asid, __entry->address)
531); 583);
532 584
@@ -547,11 +599,102 @@ TRACE_EVENT(kvm_skinit,
547 __entry->slb = slb; 599 __entry->slb = slb;
548 ), 600 ),
549 601
550 TP_printk("rip: 0x%016llx slb: 0x%08x\n", 602 TP_printk("rip: 0x%016llx slb: 0x%08x",
551 __entry->rip, __entry->slb) 603 __entry->rip, __entry->slb)
552); 604);
553 605
606#define __print_insn(insn, ilen) ({ \
607 int i; \
608 const char *ret = p->buffer + p->len; \
609 \
610 for (i = 0; i < ilen; ++i) \
611 trace_seq_printf(p, " %02x", insn[i]); \
612 trace_seq_printf(p, "%c", 0); \
613 ret; \
614 })
615
616#define KVM_EMUL_INSN_F_CR0_PE (1 << 0)
617#define KVM_EMUL_INSN_F_EFL_VM (1 << 1)
618#define KVM_EMUL_INSN_F_CS_D (1 << 2)
619#define KVM_EMUL_INSN_F_CS_L (1 << 3)
620
621#define kvm_trace_symbol_emul_flags \
622 { 0, "real" }, \
623 { KVM_EMUL_INSN_F_CR0_PE \
624 | KVM_EMUL_INSN_F_EFL_VM, "vm16" }, \
625 { KVM_EMUL_INSN_F_CR0_PE, "prot16" }, \
626 { KVM_EMUL_INSN_F_CR0_PE \
627 | KVM_EMUL_INSN_F_CS_D, "prot32" }, \
628 { KVM_EMUL_INSN_F_CR0_PE \
629 | KVM_EMUL_INSN_F_CS_L, "prot64" }
630
631#define kei_decode_mode(mode) ({ \
632 u8 flags = 0xff; \
633 switch (mode) { \
634 case X86EMUL_MODE_REAL: \
635 flags = 0; \
636 break; \
637 case X86EMUL_MODE_VM86: \
638 flags = KVM_EMUL_INSN_F_EFL_VM; \
639 break; \
640 case X86EMUL_MODE_PROT16: \
641 flags = KVM_EMUL_INSN_F_CR0_PE; \
642 break; \
643 case X86EMUL_MODE_PROT32: \
644 flags = KVM_EMUL_INSN_F_CR0_PE \
645 | KVM_EMUL_INSN_F_CS_D; \
646 break; \
647 case X86EMUL_MODE_PROT64: \
648 flags = KVM_EMUL_INSN_F_CR0_PE \
649 | KVM_EMUL_INSN_F_CS_L; \
650 break; \
651 } \
652 flags; \
653 })
654
655TRACE_EVENT(kvm_emulate_insn,
656 TP_PROTO(struct kvm_vcpu *vcpu, __u8 failed),
657 TP_ARGS(vcpu, failed),
658
659 TP_STRUCT__entry(
660 __field( __u64, rip )
661 __field( __u32, csbase )
662 __field( __u8, len )
663 __array( __u8, insn, 15 )
664 __field( __u8, flags )
665 __field( __u8, failed )
666 ),
667
668 TP_fast_assign(
669 __entry->rip = vcpu->arch.emulate_ctxt.decode.fetch.start;
670 __entry->csbase = kvm_x86_ops->get_segment_base(vcpu, VCPU_SREG_CS);
671 __entry->len = vcpu->arch.emulate_ctxt.decode.eip
672 - vcpu->arch.emulate_ctxt.decode.fetch.start;
673 memcpy(__entry->insn,
674 vcpu->arch.emulate_ctxt.decode.fetch.data,
675 15);
676 __entry->flags = kei_decode_mode(vcpu->arch.emulate_ctxt.mode);
677 __entry->failed = failed;
678 ),
679
680 TP_printk("%x:%llx:%s (%s)%s",
681 __entry->csbase, __entry->rip,
682 __print_insn(__entry->insn, __entry->len),
683 __print_symbolic(__entry->flags,
684 kvm_trace_symbol_emul_flags),
685 __entry->failed ? " failed" : ""
686 )
687 );
688
689#define trace_kvm_emulate_insn_start(vcpu) trace_kvm_emulate_insn(vcpu, 0)
690#define trace_kvm_emulate_insn_failed(vcpu) trace_kvm_emulate_insn(vcpu, 1)
691
554#endif /* _TRACE_KVM_H */ 692#endif /* _TRACE_KVM_H */
555 693
694#undef TRACE_INCLUDE_PATH
695#define TRACE_INCLUDE_PATH arch/x86/kvm
696#undef TRACE_INCLUDE_FILE
697#define TRACE_INCLUDE_FILE trace
698
556/* This part must be outside protection */ 699/* This part must be outside protection */
557#include <trace/define_trace.h> 700#include <trace/define_trace.h>
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 82be6dac3d25..54c0035a63f0 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -77,6 +77,8 @@ module_param(emulate_invalid_guest_state, bool, S_IRUGO);
77#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE) 77#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE)
78#define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE) 78#define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE)
79 79
80#define RMODE_GUEST_OWNED_EFLAGS_BITS (~(X86_EFLAGS_IOPL | X86_EFLAGS_VM))
81
80/* 82/*
81 * These 2 parameters are used to config the controls for Pause-Loop Exiting: 83 * These 2 parameters are used to config the controls for Pause-Loop Exiting:
82 * ple_gap: upper bound on the amount of time between two successive 84 * ple_gap: upper bound on the amount of time between two successive
@@ -131,7 +133,7 @@ struct vcpu_vmx {
131 } host_state; 133 } host_state;
132 struct { 134 struct {
133 int vm86_active; 135 int vm86_active;
134 u8 save_iopl; 136 ulong save_rflags;
135 struct kvm_save_segment { 137 struct kvm_save_segment {
136 u16 selector; 138 u16 selector;
137 unsigned long base; 139 unsigned long base;
@@ -232,56 +234,56 @@ static const u32 vmx_msr_index[] = {
232}; 234};
233#define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index) 235#define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index)
234 236
235static inline int is_page_fault(u32 intr_info) 237static inline bool is_page_fault(u32 intr_info)
236{ 238{
237 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | 239 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
238 INTR_INFO_VALID_MASK)) == 240 INTR_INFO_VALID_MASK)) ==
239 (INTR_TYPE_HARD_EXCEPTION | PF_VECTOR | INTR_INFO_VALID_MASK); 241 (INTR_TYPE_HARD_EXCEPTION | PF_VECTOR | INTR_INFO_VALID_MASK);
240} 242}
241 243
242static inline int is_no_device(u32 intr_info) 244static inline bool is_no_device(u32 intr_info)
243{ 245{
244 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | 246 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
245 INTR_INFO_VALID_MASK)) == 247 INTR_INFO_VALID_MASK)) ==
246 (INTR_TYPE_HARD_EXCEPTION | NM_VECTOR | INTR_INFO_VALID_MASK); 248 (INTR_TYPE_HARD_EXCEPTION | NM_VECTOR | INTR_INFO_VALID_MASK);
247} 249}
248 250
249static inline int is_invalid_opcode(u32 intr_info) 251static inline bool is_invalid_opcode(u32 intr_info)
250{ 252{
251 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | 253 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
252 INTR_INFO_VALID_MASK)) == 254 INTR_INFO_VALID_MASK)) ==
253 (INTR_TYPE_HARD_EXCEPTION | UD_VECTOR | INTR_INFO_VALID_MASK); 255 (INTR_TYPE_HARD_EXCEPTION | UD_VECTOR | INTR_INFO_VALID_MASK);
254} 256}
255 257
256static inline int is_external_interrupt(u32 intr_info) 258static inline bool is_external_interrupt(u32 intr_info)
257{ 259{
258 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK)) 260 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
259 == (INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK); 261 == (INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK);
260} 262}
261 263
262static inline int is_machine_check(u32 intr_info) 264static inline bool is_machine_check(u32 intr_info)
263{ 265{
264 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | 266 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
265 INTR_INFO_VALID_MASK)) == 267 INTR_INFO_VALID_MASK)) ==
266 (INTR_TYPE_HARD_EXCEPTION | MC_VECTOR | INTR_INFO_VALID_MASK); 268 (INTR_TYPE_HARD_EXCEPTION | MC_VECTOR | INTR_INFO_VALID_MASK);
267} 269}
268 270
269static inline int cpu_has_vmx_msr_bitmap(void) 271static inline bool cpu_has_vmx_msr_bitmap(void)
270{ 272{
271 return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_USE_MSR_BITMAPS; 273 return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_USE_MSR_BITMAPS;
272} 274}
273 275
274static inline int cpu_has_vmx_tpr_shadow(void) 276static inline bool cpu_has_vmx_tpr_shadow(void)
275{ 277{
276 return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW; 278 return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW;
277} 279}
278 280
279static inline int vm_need_tpr_shadow(struct kvm *kvm) 281static inline bool vm_need_tpr_shadow(struct kvm *kvm)
280{ 282{
281 return (cpu_has_vmx_tpr_shadow()) && (irqchip_in_kernel(kvm)); 283 return (cpu_has_vmx_tpr_shadow()) && (irqchip_in_kernel(kvm));
282} 284}
283 285
284static inline int cpu_has_secondary_exec_ctrls(void) 286static inline bool cpu_has_secondary_exec_ctrls(void)
285{ 287{
286 return vmcs_config.cpu_based_exec_ctrl & 288 return vmcs_config.cpu_based_exec_ctrl &
287 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; 289 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
@@ -301,80 +303,80 @@ static inline bool cpu_has_vmx_flexpriority(void)
301 303
302static inline bool cpu_has_vmx_ept_execute_only(void) 304static inline bool cpu_has_vmx_ept_execute_only(void)
303{ 305{
304 return !!(vmx_capability.ept & VMX_EPT_EXECUTE_ONLY_BIT); 306 return vmx_capability.ept & VMX_EPT_EXECUTE_ONLY_BIT;
305} 307}
306 308
307static inline bool cpu_has_vmx_eptp_uncacheable(void) 309static inline bool cpu_has_vmx_eptp_uncacheable(void)
308{ 310{
309 return !!(vmx_capability.ept & VMX_EPTP_UC_BIT); 311 return vmx_capability.ept & VMX_EPTP_UC_BIT;
310} 312}
311 313
312static inline bool cpu_has_vmx_eptp_writeback(void) 314static inline bool cpu_has_vmx_eptp_writeback(void)
313{ 315{
314 return !!(vmx_capability.ept & VMX_EPTP_WB_BIT); 316 return vmx_capability.ept & VMX_EPTP_WB_BIT;
315} 317}
316 318
317static inline bool cpu_has_vmx_ept_2m_page(void) 319static inline bool cpu_has_vmx_ept_2m_page(void)
318{ 320{
319 return !!(vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT); 321 return vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT;
320} 322}
321 323
322static inline bool cpu_has_vmx_ept_1g_page(void) 324static inline bool cpu_has_vmx_ept_1g_page(void)
323{ 325{
324 return !!(vmx_capability.ept & VMX_EPT_1GB_PAGE_BIT); 326 return vmx_capability.ept & VMX_EPT_1GB_PAGE_BIT;
325} 327}
326 328
327static inline int cpu_has_vmx_invept_individual_addr(void) 329static inline bool cpu_has_vmx_invept_individual_addr(void)
328{ 330{
329 return !!(vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT); 331 return vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT;
330} 332}
331 333
332static inline int cpu_has_vmx_invept_context(void) 334static inline bool cpu_has_vmx_invept_context(void)
333{ 335{
334 return !!(vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT); 336 return vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT;
335} 337}
336 338
337static inline int cpu_has_vmx_invept_global(void) 339static inline bool cpu_has_vmx_invept_global(void)
338{ 340{
339 return !!(vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT); 341 return vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT;
340} 342}
341 343
342static inline int cpu_has_vmx_ept(void) 344static inline bool cpu_has_vmx_ept(void)
343{ 345{
344 return vmcs_config.cpu_based_2nd_exec_ctrl & 346 return vmcs_config.cpu_based_2nd_exec_ctrl &
345 SECONDARY_EXEC_ENABLE_EPT; 347 SECONDARY_EXEC_ENABLE_EPT;
346} 348}
347 349
348static inline int cpu_has_vmx_unrestricted_guest(void) 350static inline bool cpu_has_vmx_unrestricted_guest(void)
349{ 351{
350 return vmcs_config.cpu_based_2nd_exec_ctrl & 352 return vmcs_config.cpu_based_2nd_exec_ctrl &
351 SECONDARY_EXEC_UNRESTRICTED_GUEST; 353 SECONDARY_EXEC_UNRESTRICTED_GUEST;
352} 354}
353 355
354static inline int cpu_has_vmx_ple(void) 356static inline bool cpu_has_vmx_ple(void)
355{ 357{
356 return vmcs_config.cpu_based_2nd_exec_ctrl & 358 return vmcs_config.cpu_based_2nd_exec_ctrl &
357 SECONDARY_EXEC_PAUSE_LOOP_EXITING; 359 SECONDARY_EXEC_PAUSE_LOOP_EXITING;
358} 360}
359 361
360static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm) 362static inline bool vm_need_virtualize_apic_accesses(struct kvm *kvm)
361{ 363{
362 return flexpriority_enabled && irqchip_in_kernel(kvm); 364 return flexpriority_enabled && irqchip_in_kernel(kvm);
363} 365}
364 366
365static inline int cpu_has_vmx_vpid(void) 367static inline bool cpu_has_vmx_vpid(void)
366{ 368{
367 return vmcs_config.cpu_based_2nd_exec_ctrl & 369 return vmcs_config.cpu_based_2nd_exec_ctrl &
368 SECONDARY_EXEC_ENABLE_VPID; 370 SECONDARY_EXEC_ENABLE_VPID;
369} 371}
370 372
371static inline int cpu_has_vmx_rdtscp(void) 373static inline bool cpu_has_vmx_rdtscp(void)
372{ 374{
373 return vmcs_config.cpu_based_2nd_exec_ctrl & 375 return vmcs_config.cpu_based_2nd_exec_ctrl &
374 SECONDARY_EXEC_RDTSCP; 376 SECONDARY_EXEC_RDTSCP;
375} 377}
376 378
377static inline int cpu_has_virtual_nmis(void) 379static inline bool cpu_has_virtual_nmis(void)
378{ 380{
379 return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS; 381 return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS;
380} 382}
@@ -598,11 +600,11 @@ static void reload_tss(void)
598 /* 600 /*
599 * VT restores TR but not its size. Useless. 601 * VT restores TR but not its size. Useless.
600 */ 602 */
601 struct descriptor_table gdt; 603 struct desc_ptr gdt;
602 struct desc_struct *descs; 604 struct desc_struct *descs;
603 605
604 kvm_get_gdt(&gdt); 606 native_store_gdt(&gdt);
605 descs = (void *)gdt.base; 607 descs = (void *)gdt.address;
606 descs[GDT_ENTRY_TSS].type = 9; /* available TSS */ 608 descs[GDT_ENTRY_TSS].type = 9; /* available TSS */
607 load_TR_desc(); 609 load_TR_desc();
608} 610}
@@ -632,6 +634,43 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
632 return true; 634 return true;
633} 635}
634 636
637static unsigned long segment_base(u16 selector)
638{
639 struct desc_ptr gdt;
640 struct desc_struct *d;
641 unsigned long table_base;
642 unsigned long v;
643
644 if (!(selector & ~3))
645 return 0;
646
647 native_store_gdt(&gdt);
648 table_base = gdt.address;
649
650 if (selector & 4) { /* from ldt */
651 u16 ldt_selector = kvm_read_ldt();
652
653 if (!(ldt_selector & ~3))
654 return 0;
655
656 table_base = segment_base(ldt_selector);
657 }
658 d = (struct desc_struct *)(table_base + (selector & ~7));
659 v = get_desc_base(d);
660#ifdef CONFIG_X86_64
661 if (d->s == 0 && (d->type == 2 || d->type == 9 || d->type == 11))
662 v |= ((unsigned long)((struct ldttss_desc64 *)d)->base3) << 32;
663#endif
664 return v;
665}
666
667static inline unsigned long kvm_read_tr_base(void)
668{
669 u16 tr;
670 asm("str %0" : "=g"(tr));
671 return segment_base(tr);
672}
673
635static void vmx_save_host_state(struct kvm_vcpu *vcpu) 674static void vmx_save_host_state(struct kvm_vcpu *vcpu)
636{ 675{
637 struct vcpu_vmx *vmx = to_vmx(vcpu); 676 struct vcpu_vmx *vmx = to_vmx(vcpu);
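The segment_base() helper moved into vmx.c relies only on the standard selector layout: bits 1:0 hold the RPL, bit 2 selects LDT vs. GDT, and selector & ~7 is the byte offset of the 8-byte descriptor within the chosen table; on 64-bit, system descriptors (LDT/TSS types 2, 9, 11) contribute a base3 upper half. A worked example, purely for illustration:

    /*
     * selector = 0x2b (binary 0010 1011):
     *   RPL               = 0x2b & 3  = 3     (requestor privilege level 3)
     *   table indicator   = 0x2b & 4  = 0     (GDT, so no recursive LDT lookup)
     *   descriptor offset = 0x2b & ~7 = 0x28  (GDT entry index 5)
     * segment_base() returns get_desc_base() of that entry, OR-ing in
     * base3 << 32 on x86-64 when the entry is an LDT/TSS descriptor.
     */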
@@ -756,7 +795,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
756 } 795 }
757 796
758 if (vcpu->cpu != cpu) { 797 if (vcpu->cpu != cpu) {
759 struct descriptor_table dt; 798 struct desc_ptr dt;
760 unsigned long sysenter_esp; 799 unsigned long sysenter_esp;
761 800
762 vcpu->cpu = cpu; 801 vcpu->cpu = cpu;
@@ -765,8 +804,8 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
765 * processors. 804 * processors.
766 */ 805 */
767 vmcs_writel(HOST_TR_BASE, kvm_read_tr_base()); /* 22.2.4 */ 806 vmcs_writel(HOST_TR_BASE, kvm_read_tr_base()); /* 22.2.4 */
768 kvm_get_gdt(&dt); 807 native_store_gdt(&dt);
769 vmcs_writel(HOST_GDTR_BASE, dt.base); /* 22.2.4 */ 808 vmcs_writel(HOST_GDTR_BASE, dt.address); /* 22.2.4 */
770 809
771 rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp); 810 rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
772 vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */ 811 vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
@@ -818,18 +857,23 @@ static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu)
818 857
819static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) 858static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
820{ 859{
821 unsigned long rflags; 860 unsigned long rflags, save_rflags;
822 861
823 rflags = vmcs_readl(GUEST_RFLAGS); 862 rflags = vmcs_readl(GUEST_RFLAGS);
824 if (to_vmx(vcpu)->rmode.vm86_active) 863 if (to_vmx(vcpu)->rmode.vm86_active) {
825 rflags &= ~(unsigned long)(X86_EFLAGS_IOPL | X86_EFLAGS_VM); 864 rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
865 save_rflags = to_vmx(vcpu)->rmode.save_rflags;
866 rflags |= save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
867 }
826 return rflags; 868 return rflags;
827} 869}
828 870
829static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) 871static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
830{ 872{
831 if (to_vmx(vcpu)->rmode.vm86_active) 873 if (to_vmx(vcpu)->rmode.vm86_active) {
874 to_vmx(vcpu)->rmode.save_rflags = rflags;
832 rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; 875 rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
876 }
833 vmcs_writel(GUEST_RFLAGS, rflags); 877 vmcs_writel(GUEST_RFLAGS, rflags);
834} 878}
835 879
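Instead of saving only the IOPL field, the real-mode emulation state now keeps a full copy of the guest's RFLAGS. RMODE_GUEST_OWNED_EFLAGS_BITS is ~(X86_EFLAGS_IOPL | X86_EFLAGS_VM), so the read path takes everything except IOPL/VM from the hardware value and those two fields from rmode.save_rflags, while the write path records the unmodified value before forcing IOPL=3 and VM=1 into the VMCS. A worked example:

    /*
     * Guest (emulated real mode) writes rflags = 0x00000202 (IF=1, IOPL=0, VM=0):
     *   vmx_set_rflags: rmode.save_rflags = 0x00000202
     *                   GUEST_RFLAGS      = 0x00023202 (IOPL=3, VM=1 forced)
     *   vmx_get_rflags: 0x00023202 & ~(IOPL|VM) = 0x00000202
     *                 | 0x00000202 &  (IOPL|VM) = 0x00000000
     *                   returned value          = 0x00000202, as the guest wrote it
     */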
@@ -839,9 +883,9 @@ static u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
839 int ret = 0; 883 int ret = 0;
840 884
841 if (interruptibility & GUEST_INTR_STATE_STI) 885 if (interruptibility & GUEST_INTR_STATE_STI)
842 ret |= X86_SHADOW_INT_STI; 886 ret |= KVM_X86_SHADOW_INT_STI;
843 if (interruptibility & GUEST_INTR_STATE_MOV_SS) 887 if (interruptibility & GUEST_INTR_STATE_MOV_SS)
844 ret |= X86_SHADOW_INT_MOV_SS; 888 ret |= KVM_X86_SHADOW_INT_MOV_SS;
845 889
846 return ret & mask; 890 return ret & mask;
847} 891}
@@ -853,9 +897,9 @@ static void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
853 897
854 interruptibility &= ~(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS); 898 interruptibility &= ~(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS);
855 899
856 if (mask & X86_SHADOW_INT_MOV_SS) 900 if (mask & KVM_X86_SHADOW_INT_MOV_SS)
857 interruptibility |= GUEST_INTR_STATE_MOV_SS; 901 interruptibility |= GUEST_INTR_STATE_MOV_SS;
858 if (mask & X86_SHADOW_INT_STI) 902 else if (mask & KVM_X86_SHADOW_INT_STI)
859 interruptibility |= GUEST_INTR_STATE_STI; 903 interruptibility |= GUEST_INTR_STATE_STI;
860 904
861 if ((interruptibility != interruptibility_old)) 905 if ((interruptibility != interruptibility_old))
@@ -1483,8 +1527,8 @@ static void enter_pmode(struct kvm_vcpu *vcpu)
1483 vmcs_write32(GUEST_TR_AR_BYTES, vmx->rmode.tr.ar); 1527 vmcs_write32(GUEST_TR_AR_BYTES, vmx->rmode.tr.ar);
1484 1528
1485 flags = vmcs_readl(GUEST_RFLAGS); 1529 flags = vmcs_readl(GUEST_RFLAGS);
1486 flags &= ~(X86_EFLAGS_IOPL | X86_EFLAGS_VM); 1530 flags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
1487 flags |= (vmx->rmode.save_iopl << IOPL_SHIFT); 1531 flags |= vmx->rmode.save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
1488 vmcs_writel(GUEST_RFLAGS, flags); 1532 vmcs_writel(GUEST_RFLAGS, flags);
1489 1533
1490 vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) | 1534 vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) |
@@ -1514,7 +1558,7 @@ static gva_t rmode_tss_base(struct kvm *kvm)
1514 struct kvm_memslots *slots; 1558 struct kvm_memslots *slots;
1515 gfn_t base_gfn; 1559 gfn_t base_gfn;
1516 1560
1517 slots = rcu_dereference(kvm->memslots); 1561 slots = kvm_memslots(kvm);
1518 base_gfn = kvm->memslots->memslots[0].base_gfn + 1562 base_gfn = kvm->memslots->memslots[0].base_gfn +
1519 kvm->memslots->memslots[0].npages - 3; 1563 kvm->memslots->memslots[0].npages - 3;
1520 return base_gfn << PAGE_SHIFT; 1564 return base_gfn << PAGE_SHIFT;
@@ -1557,8 +1601,7 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
1557 vmcs_write32(GUEST_TR_AR_BYTES, 0x008b); 1601 vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
1558 1602
1559 flags = vmcs_readl(GUEST_RFLAGS); 1603 flags = vmcs_readl(GUEST_RFLAGS);
1560 vmx->rmode.save_iopl 1604 vmx->rmode.save_rflags = flags;
1561 = (flags & X86_EFLAGS_IOPL) >> IOPL_SHIFT;
1562 1605
1563 flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; 1606 flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
1564 1607
@@ -1928,28 +1971,28 @@ static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
1928 *l = (ar >> 13) & 1; 1971 *l = (ar >> 13) & 1;
1929} 1972}
1930 1973
1931static void vmx_get_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) 1974static void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
1932{ 1975{
1933 dt->limit = vmcs_read32(GUEST_IDTR_LIMIT); 1976 dt->size = vmcs_read32(GUEST_IDTR_LIMIT);
1934 dt->base = vmcs_readl(GUEST_IDTR_BASE); 1977 dt->address = vmcs_readl(GUEST_IDTR_BASE);
1935} 1978}
1936 1979
1937static void vmx_set_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) 1980static void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
1938{ 1981{
1939 vmcs_write32(GUEST_IDTR_LIMIT, dt->limit); 1982 vmcs_write32(GUEST_IDTR_LIMIT, dt->size);
1940 vmcs_writel(GUEST_IDTR_BASE, dt->base); 1983 vmcs_writel(GUEST_IDTR_BASE, dt->address);
1941} 1984}
1942 1985
1943static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) 1986static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
1944{ 1987{
1945 dt->limit = vmcs_read32(GUEST_GDTR_LIMIT); 1988 dt->size = vmcs_read32(GUEST_GDTR_LIMIT);
1946 dt->base = vmcs_readl(GUEST_GDTR_BASE); 1989 dt->address = vmcs_readl(GUEST_GDTR_BASE);
1947} 1990}
1948 1991
1949static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) 1992static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
1950{ 1993{
1951 vmcs_write32(GUEST_GDTR_LIMIT, dt->limit); 1994 vmcs_write32(GUEST_GDTR_LIMIT, dt->size);
1952 vmcs_writel(GUEST_GDTR_BASE, dt->base); 1995 vmcs_writel(GUEST_GDTR_BASE, dt->address);
1953} 1996}
1954 1997
1955static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg) 1998static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg)
@@ -2290,6 +2333,16 @@ static void allocate_vpid(struct vcpu_vmx *vmx)
2290 spin_unlock(&vmx_vpid_lock); 2333 spin_unlock(&vmx_vpid_lock);
2291} 2334}
2292 2335
2336static void free_vpid(struct vcpu_vmx *vmx)
2337{
2338 if (!enable_vpid)
2339 return;
2340 spin_lock(&vmx_vpid_lock);
2341 if (vmx->vpid != 0)
2342 __clear_bit(vmx->vpid, vmx_vpid_bitmap);
2343 spin_unlock(&vmx_vpid_lock);
2344}
2345
2293static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, u32 msr) 2346static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, u32 msr)
2294{ 2347{
2295 int f = sizeof(unsigned long); 2348 int f = sizeof(unsigned long);
@@ -2328,7 +2381,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
2328 u32 junk; 2381 u32 junk;
2329 u64 host_pat, tsc_this, tsc_base; 2382 u64 host_pat, tsc_this, tsc_base;
2330 unsigned long a; 2383 unsigned long a;
2331 struct descriptor_table dt; 2384 struct desc_ptr dt;
2332 int i; 2385 int i;
2333 unsigned long kvm_vmx_return; 2386 unsigned long kvm_vmx_return;
2334 u32 exec_control; 2387 u32 exec_control;
@@ -2409,8 +2462,8 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
2409 2462
2410 vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */ 2463 vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */
2411 2464
2412 kvm_get_idt(&dt); 2465 native_store_idt(&dt);
2413 vmcs_writel(HOST_IDTR_BASE, dt.base); /* 22.2.4 */ 2466 vmcs_writel(HOST_IDTR_BASE, dt.address); /* 22.2.4 */
2414 2467
2415 asm("mov $.Lkvm_vmx_return, %0" : "=r"(kvm_vmx_return)); 2468 asm("mov $.Lkvm_vmx_return, %0" : "=r"(kvm_vmx_return));
2416 vmcs_writel(HOST_RIP, kvm_vmx_return); /* 22.2.5 */ 2469 vmcs_writel(HOST_RIP, kvm_vmx_return); /* 22.2.5 */
@@ -2942,22 +2995,20 @@ static int handle_io(struct kvm_vcpu *vcpu)
2942 int size, in, string; 2995 int size, in, string;
2943 unsigned port; 2996 unsigned port;
2944 2997
2945 ++vcpu->stat.io_exits;
2946 exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 2998 exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
2947 string = (exit_qualification & 16) != 0; 2999 string = (exit_qualification & 16) != 0;
3000 in = (exit_qualification & 8) != 0;
2948 3001
2949 if (string) { 3002 ++vcpu->stat.io_exits;
2950 if (emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DO_MMIO)
2951 return 0;
2952 return 1;
2953 }
2954 3003
2955 size = (exit_qualification & 7) + 1; 3004 if (string || in)
2956 in = (exit_qualification & 8) != 0; 3005 return !(emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DO_MMIO);
2957 port = exit_qualification >> 16;
2958 3006
3007 port = exit_qualification >> 16;
3008 size = (exit_qualification & 7) + 1;
2959 skip_emulated_instruction(vcpu); 3009 skip_emulated_instruction(vcpu);
2960 return kvm_emulate_pio(vcpu, in, size, port); 3010
3011 return kvm_fast_pio_out(vcpu, size, port);
2961} 3012}
2962 3013
2963static void 3014static void
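The rewritten handle_io() decodes the same exit-qualification fields as before: bits 2:0 give size-1, bit 3 the direction (IN), bit 4 string (INS/OUTS), and bits 31:16 the port. String and IN accesses still go through the emulator; the remaining case, a plain OUT, is forwarded to the new kvm_fast_pio_out() after skipping the instruction. For example:

    /*
     * "out %al, $0x79" exits with exit_qualification q = 0x00790000:
     *   size   = (q & 7) + 1 = 1 byte
     *   in     =  q & 8      = 0  (OUT)
     *   string =  q & 16     = 0  (not INS/OUTS)
     *   port   =  q >> 16    = 0x79
     * so the handler calls kvm_fast_pio_out(vcpu, 1, 0x79).
     */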
@@ -3048,19 +3099,9 @@ static int handle_cr(struct kvm_vcpu *vcpu)
3048 return 0; 3099 return 0;
3049} 3100}
3050 3101
3051static int check_dr_alias(struct kvm_vcpu *vcpu)
3052{
3053 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) {
3054 kvm_queue_exception(vcpu, UD_VECTOR);
3055 return -1;
3056 }
3057 return 0;
3058}
3059
3060static int handle_dr(struct kvm_vcpu *vcpu) 3102static int handle_dr(struct kvm_vcpu *vcpu)
3061{ 3103{
3062 unsigned long exit_qualification; 3104 unsigned long exit_qualification;
3063 unsigned long val;
3064 int dr, reg; 3105 int dr, reg;
3065 3106
3066 /* Do not handle if the CPL > 0, will trigger GP on re-entry */ 3107 /* Do not handle if the CPL > 0, will trigger GP on re-entry */
@@ -3095,67 +3136,20 @@ static int handle_dr(struct kvm_vcpu *vcpu)
3095 dr = exit_qualification & DEBUG_REG_ACCESS_NUM; 3136 dr = exit_qualification & DEBUG_REG_ACCESS_NUM;
3096 reg = DEBUG_REG_ACCESS_REG(exit_qualification); 3137 reg = DEBUG_REG_ACCESS_REG(exit_qualification);
3097 if (exit_qualification & TYPE_MOV_FROM_DR) { 3138 if (exit_qualification & TYPE_MOV_FROM_DR) {
3098 switch (dr) { 3139 unsigned long val;
3099 case 0 ... 3: 3140 if (!kvm_get_dr(vcpu, dr, &val))
3100 val = vcpu->arch.db[dr]; 3141 kvm_register_write(vcpu, reg, val);
3101 break; 3142 } else
3102 case 4: 3143 kvm_set_dr(vcpu, dr, vcpu->arch.regs[reg]);
3103 if (check_dr_alias(vcpu) < 0)
3104 return 1;
3105 /* fall through */
3106 case 6:
3107 val = vcpu->arch.dr6;
3108 break;
3109 case 5:
3110 if (check_dr_alias(vcpu) < 0)
3111 return 1;
3112 /* fall through */
3113 default: /* 7 */
3114 val = vcpu->arch.dr7;
3115 break;
3116 }
3117 kvm_register_write(vcpu, reg, val);
3118 } else {
3119 val = vcpu->arch.regs[reg];
3120 switch (dr) {
3121 case 0 ... 3:
3122 vcpu->arch.db[dr] = val;
3123 if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
3124 vcpu->arch.eff_db[dr] = val;
3125 break;
3126 case 4:
3127 if (check_dr_alias(vcpu) < 0)
3128 return 1;
3129 /* fall through */
3130 case 6:
3131 if (val & 0xffffffff00000000ULL) {
3132 kvm_inject_gp(vcpu, 0);
3133 return 1;
3134 }
3135 vcpu->arch.dr6 = (val & DR6_VOLATILE) | DR6_FIXED_1;
3136 break;
3137 case 5:
3138 if (check_dr_alias(vcpu) < 0)
3139 return 1;
3140 /* fall through */
3141 default: /* 7 */
3142 if (val & 0xffffffff00000000ULL) {
3143 kvm_inject_gp(vcpu, 0);
3144 return 1;
3145 }
3146 vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1;
3147 if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
3148 vmcs_writel(GUEST_DR7, vcpu->arch.dr7);
3149 vcpu->arch.switch_db_regs =
3150 (val & DR7_BP_EN_MASK);
3151 }
3152 break;
3153 }
3154 }
3155 skip_emulated_instruction(vcpu); 3144 skip_emulated_instruction(vcpu);
3156 return 1; 3145 return 1;
3157} 3146}
3158 3147
3148static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val)
3149{
3150 vmcs_writel(GUEST_DR7, val);
3151}
3152
3159static int handle_cpuid(struct kvm_vcpu *vcpu) 3153static int handle_cpuid(struct kvm_vcpu *vcpu)
3160{ 3154{
3161 kvm_emulate_cpuid(vcpu); 3155 kvm_emulate_cpuid(vcpu);
@@ -3287,6 +3281,8 @@ static int handle_task_switch(struct kvm_vcpu *vcpu)
3287{ 3281{
3288 struct vcpu_vmx *vmx = to_vmx(vcpu); 3282 struct vcpu_vmx *vmx = to_vmx(vcpu);
3289 unsigned long exit_qualification; 3283 unsigned long exit_qualification;
3284 bool has_error_code = false;
3285 u32 error_code = 0;
3290 u16 tss_selector; 3286 u16 tss_selector;
3291 int reason, type, idt_v; 3287 int reason, type, idt_v;
3292 3288
@@ -3309,6 +3305,13 @@ static int handle_task_switch(struct kvm_vcpu *vcpu)
3309 kvm_clear_interrupt_queue(vcpu); 3305 kvm_clear_interrupt_queue(vcpu);
3310 break; 3306 break;
3311 case INTR_TYPE_HARD_EXCEPTION: 3307 case INTR_TYPE_HARD_EXCEPTION:
3308 if (vmx->idt_vectoring_info &
3309 VECTORING_INFO_DELIVER_CODE_MASK) {
3310 has_error_code = true;
3311 error_code =
3312 vmcs_read32(IDT_VECTORING_ERROR_CODE);
3313 }
3314 /* fall through */
3312 case INTR_TYPE_SOFT_EXCEPTION: 3315 case INTR_TYPE_SOFT_EXCEPTION:
3313 kvm_clear_exception_queue(vcpu); 3316 kvm_clear_exception_queue(vcpu);
3314 break; 3317 break;
@@ -3323,8 +3326,13 @@ static int handle_task_switch(struct kvm_vcpu *vcpu)
3323 type != INTR_TYPE_NMI_INTR)) 3326 type != INTR_TYPE_NMI_INTR))
3324 skip_emulated_instruction(vcpu); 3327 skip_emulated_instruction(vcpu);
3325 3328
3326 if (!kvm_task_switch(vcpu, tss_selector, reason)) 3329 if (kvm_task_switch(vcpu, tss_selector, reason,
3330 has_error_code, error_code) == EMULATE_FAIL) {
3331 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
3332 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
3333 vcpu->run->internal.ndata = 0;
3327 return 0; 3334 return 0;
3335 }
3328 3336
3329 /* clear all local breakpoint enable flags */ 3337 /* clear all local breakpoint enable flags */
3330 vmcs_writel(GUEST_DR7, vmcs_readl(GUEST_DR7) & ~55); 3338 vmcs_writel(GUEST_DR7, vmcs_readl(GUEST_DR7) & ~55);
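When the task switch cannot be emulated, the exit is now surfaced to userspace as an internal error rather than being dropped. A minimal userspace-side sketch of how a VMM might report it after KVM_RUN (function and variable names are illustrative; struct kvm_run and the constants come from <linux/kvm.h>, fprintf from <stdio.h>):

    /* Userspace sketch only. */
    static void report_task_switch_failure(struct kvm_run *run)
    {
            if (run->exit_reason == KVM_EXIT_INTERNAL_ERROR &&
                run->internal.suberror == KVM_INTERNAL_ERROR_EMULATION)
                    fprintf(stderr, "emulation failed, ndata=%u\n",
                            run->internal.ndata);
    }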
@@ -3569,7 +3577,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
3569 u32 exit_reason = vmx->exit_reason; 3577 u32 exit_reason = vmx->exit_reason;
3570 u32 vectoring_info = vmx->idt_vectoring_info; 3578 u32 vectoring_info = vmx->idt_vectoring_info;
3571 3579
3572 trace_kvm_exit(exit_reason, kvm_rip_read(vcpu)); 3580 trace_kvm_exit(exit_reason, vcpu);
3573 3581
3574 /* If guest state is invalid, start emulating */ 3582 /* If guest state is invalid, start emulating */
3575 if (vmx->emulation_required && emulate_invalid_guest_state) 3583 if (vmx->emulation_required && emulate_invalid_guest_state)
@@ -3918,10 +3926,7 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
3918{ 3926{
3919 struct vcpu_vmx *vmx = to_vmx(vcpu); 3927 struct vcpu_vmx *vmx = to_vmx(vcpu);
3920 3928
3921 spin_lock(&vmx_vpid_lock); 3929 free_vpid(vmx);
3922 if (vmx->vpid != 0)
3923 __clear_bit(vmx->vpid, vmx_vpid_bitmap);
3924 spin_unlock(&vmx_vpid_lock);
3925 vmx_free_vmcs(vcpu); 3930 vmx_free_vmcs(vcpu);
3926 kfree(vmx->guest_msrs); 3931 kfree(vmx->guest_msrs);
3927 kvm_vcpu_uninit(vcpu); 3932 kvm_vcpu_uninit(vcpu);
@@ -3983,6 +3988,7 @@ free_msrs:
3983uninit_vcpu: 3988uninit_vcpu:
3984 kvm_vcpu_uninit(&vmx->vcpu); 3989 kvm_vcpu_uninit(&vmx->vcpu);
3985free_vcpu: 3990free_vcpu:
3991 free_vpid(vmx);
3986 kmem_cache_free(kvm_vcpu_cache, vmx); 3992 kmem_cache_free(kvm_vcpu_cache, vmx);
3987 return ERR_PTR(err); 3993 return ERR_PTR(err);
3988} 3994}
@@ -4149,6 +4155,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
4149 .set_idt = vmx_set_idt, 4155 .set_idt = vmx_set_idt,
4150 .get_gdt = vmx_get_gdt, 4156 .get_gdt = vmx_get_gdt,
4151 .set_gdt = vmx_set_gdt, 4157 .set_gdt = vmx_set_gdt,
4158 .set_dr7 = vmx_set_dr7,
4152 .cache_reg = vmx_cache_reg, 4159 .cache_reg = vmx_cache_reg,
4153 .get_rflags = vmx_get_rflags, 4160 .get_rflags = vmx_get_rflags,
4154 .set_rflags = vmx_set_rflags, 4161 .set_rflags = vmx_set_rflags,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 21b9b6aa3e88..848c814e8c3c 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -42,7 +42,7 @@
42#include <linux/slab.h> 42#include <linux/slab.h>
43#include <linux/perf_event.h> 43#include <linux/perf_event.h>
44#include <trace/events/kvm.h> 44#include <trace/events/kvm.h>
45#undef TRACE_INCLUDE_FILE 45
46#define CREATE_TRACE_POINTS 46#define CREATE_TRACE_POINTS
47#include "trace.h" 47#include "trace.h"
48 48
@@ -224,34 +224,6 @@ static void drop_user_return_notifiers(void *ignore)
224 kvm_on_user_return(&smsr->urn); 224 kvm_on_user_return(&smsr->urn);
225} 225}
226 226
227unsigned long segment_base(u16 selector)
228{
229 struct descriptor_table gdt;
230 struct desc_struct *d;
231 unsigned long table_base;
232 unsigned long v;
233
234 if (selector == 0)
235 return 0;
236
237 kvm_get_gdt(&gdt);
238 table_base = gdt.base;
239
240 if (selector & 4) { /* from ldt */
241 u16 ldt_selector = kvm_read_ldt();
242
243 table_base = segment_base(ldt_selector);
244 }
245 d = (struct desc_struct *)(table_base + (selector & ~7));
246 v = get_desc_base(d);
247#ifdef CONFIG_X86_64
248 if (d->s == 0 && (d->type == 2 || d->type == 9 || d->type == 11))
249 v |= ((unsigned long)((struct ldttss_desc64 *)d)->base3) << 32;
250#endif
251 return v;
252}
253EXPORT_SYMBOL_GPL(segment_base);
254
255u64 kvm_get_apic_base(struct kvm_vcpu *vcpu) 227u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
256{ 228{
257 if (irqchip_in_kernel(vcpu->kvm)) 229 if (irqchip_in_kernel(vcpu->kvm))
@@ -434,8 +406,6 @@ void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
434 406
435#ifdef CONFIG_X86_64 407#ifdef CONFIG_X86_64
436 if (cr0 & 0xffffffff00000000UL) { 408 if (cr0 & 0xffffffff00000000UL) {
437 printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n",
438 cr0, kvm_read_cr0(vcpu));
439 kvm_inject_gp(vcpu, 0); 409 kvm_inject_gp(vcpu, 0);
440 return; 410 return;
441 } 411 }
@@ -444,14 +414,11 @@ void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
444 cr0 &= ~CR0_RESERVED_BITS; 414 cr0 &= ~CR0_RESERVED_BITS;
445 415
446 if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) { 416 if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) {
447 printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n");
448 kvm_inject_gp(vcpu, 0); 417 kvm_inject_gp(vcpu, 0);
449 return; 418 return;
450 } 419 }
451 420
452 if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) { 421 if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) {
453 printk(KERN_DEBUG "set_cr0: #GP, set PG flag "
454 "and a clear PE flag\n");
455 kvm_inject_gp(vcpu, 0); 422 kvm_inject_gp(vcpu, 0);
456 return; 423 return;
457 } 424 }
@@ -462,15 +429,11 @@ void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
462 int cs_db, cs_l; 429 int cs_db, cs_l;
463 430
464 if (!is_pae(vcpu)) { 431 if (!is_pae(vcpu)) {
465 printk(KERN_DEBUG "set_cr0: #GP, start paging "
466 "in long mode while PAE is disabled\n");
467 kvm_inject_gp(vcpu, 0); 432 kvm_inject_gp(vcpu, 0);
468 return; 433 return;
469 } 434 }
470 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); 435 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
471 if (cs_l) { 436 if (cs_l) {
472 printk(KERN_DEBUG "set_cr0: #GP, start paging "
473 "in long mode while CS.L == 1\n");
474 kvm_inject_gp(vcpu, 0); 437 kvm_inject_gp(vcpu, 0);
475 return; 438 return;
476 439
@@ -478,8 +441,6 @@ void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
478 } else 441 } else
479#endif 442#endif
480 if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3)) { 443 if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3)) {
481 printk(KERN_DEBUG "set_cr0: #GP, pdptrs "
482 "reserved bits\n");
483 kvm_inject_gp(vcpu, 0); 444 kvm_inject_gp(vcpu, 0);
484 return; 445 return;
485 } 446 }
@@ -487,7 +448,6 @@ void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
487 } 448 }
488 449
489 kvm_x86_ops->set_cr0(vcpu, cr0); 450 kvm_x86_ops->set_cr0(vcpu, cr0);
490 vcpu->arch.cr0 = cr0;
491 451
492 kvm_mmu_reset_context(vcpu); 452 kvm_mmu_reset_context(vcpu);
493 return; 453 return;
@@ -506,34 +466,28 @@ void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
506 unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE; 466 unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE;
507 467
508 if (cr4 & CR4_RESERVED_BITS) { 468 if (cr4 & CR4_RESERVED_BITS) {
509 printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n");
510 kvm_inject_gp(vcpu, 0); 469 kvm_inject_gp(vcpu, 0);
511 return; 470 return;
512 } 471 }
513 472
514 if (is_long_mode(vcpu)) { 473 if (is_long_mode(vcpu)) {
515 if (!(cr4 & X86_CR4_PAE)) { 474 if (!(cr4 & X86_CR4_PAE)) {
516 printk(KERN_DEBUG "set_cr4: #GP, clearing PAE while "
517 "in long mode\n");
518 kvm_inject_gp(vcpu, 0); 475 kvm_inject_gp(vcpu, 0);
519 return; 476 return;
520 } 477 }
521 } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE) 478 } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE)
522 && ((cr4 ^ old_cr4) & pdptr_bits) 479 && ((cr4 ^ old_cr4) & pdptr_bits)
523 && !load_pdptrs(vcpu, vcpu->arch.cr3)) { 480 && !load_pdptrs(vcpu, vcpu->arch.cr3)) {
524 printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n");
525 kvm_inject_gp(vcpu, 0); 481 kvm_inject_gp(vcpu, 0);
526 return; 482 return;
527 } 483 }
528 484
529 if (cr4 & X86_CR4_VMXE) { 485 if (cr4 & X86_CR4_VMXE) {
530 printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n");
531 kvm_inject_gp(vcpu, 0); 486 kvm_inject_gp(vcpu, 0);
532 return; 487 return;
533 } 488 }
534 kvm_x86_ops->set_cr4(vcpu, cr4); 489 kvm_x86_ops->set_cr4(vcpu, cr4);
535 vcpu->arch.cr4 = cr4; 490 vcpu->arch.cr4 = cr4;
536 vcpu->arch.mmu.base_role.cr4_pge = (cr4 & X86_CR4_PGE) && !tdp_enabled;
537 kvm_mmu_reset_context(vcpu); 491 kvm_mmu_reset_context(vcpu);
538} 492}
539EXPORT_SYMBOL_GPL(kvm_set_cr4); 493EXPORT_SYMBOL_GPL(kvm_set_cr4);
@@ -548,21 +502,16 @@ void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
548 502
549 if (is_long_mode(vcpu)) { 503 if (is_long_mode(vcpu)) {
550 if (cr3 & CR3_L_MODE_RESERVED_BITS) { 504 if (cr3 & CR3_L_MODE_RESERVED_BITS) {
551 printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n");
552 kvm_inject_gp(vcpu, 0); 505 kvm_inject_gp(vcpu, 0);
553 return; 506 return;
554 } 507 }
555 } else { 508 } else {
556 if (is_pae(vcpu)) { 509 if (is_pae(vcpu)) {
557 if (cr3 & CR3_PAE_RESERVED_BITS) { 510 if (cr3 & CR3_PAE_RESERVED_BITS) {
558 printk(KERN_DEBUG
559 "set_cr3: #GP, reserved bits\n");
560 kvm_inject_gp(vcpu, 0); 511 kvm_inject_gp(vcpu, 0);
561 return; 512 return;
562 } 513 }
563 if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) { 514 if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) {
564 printk(KERN_DEBUG "set_cr3: #GP, pdptrs "
565 "reserved bits\n");
566 kvm_inject_gp(vcpu, 0); 515 kvm_inject_gp(vcpu, 0);
567 return; 516 return;
568 } 517 }
@@ -594,7 +543,6 @@ EXPORT_SYMBOL_GPL(kvm_set_cr3);
594void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) 543void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
595{ 544{
596 if (cr8 & CR8_RESERVED_BITS) { 545 if (cr8 & CR8_RESERVED_BITS) {
597 printk(KERN_DEBUG "set_cr8: #GP, reserved bits 0x%lx\n", cr8);
598 kvm_inject_gp(vcpu, 0); 546 kvm_inject_gp(vcpu, 0);
599 return; 547 return;
600 } 548 }
@@ -614,6 +562,80 @@ unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
614} 562}
615EXPORT_SYMBOL_GPL(kvm_get_cr8); 563EXPORT_SYMBOL_GPL(kvm_get_cr8);
616 564
565int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
566{
567 switch (dr) {
568 case 0 ... 3:
569 vcpu->arch.db[dr] = val;
570 if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
571 vcpu->arch.eff_db[dr] = val;
572 break;
573 case 4:
574 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) {
575 kvm_queue_exception(vcpu, UD_VECTOR);
576 return 1;
577 }
578 /* fall through */
579 case 6:
580 if (val & 0xffffffff00000000ULL) {
581 kvm_inject_gp(vcpu, 0);
582 return 1;
583 }
584 vcpu->arch.dr6 = (val & DR6_VOLATILE) | DR6_FIXED_1;
585 break;
586 case 5:
587 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) {
588 kvm_queue_exception(vcpu, UD_VECTOR);
589 return 1;
590 }
591 /* fall through */
592 default: /* 7 */
593 if (val & 0xffffffff00000000ULL) {
594 kvm_inject_gp(vcpu, 0);
595 return 1;
596 }
597 vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1;
598 if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
599 kvm_x86_ops->set_dr7(vcpu, vcpu->arch.dr7);
600 vcpu->arch.switch_db_regs = (val & DR7_BP_EN_MASK);
601 }
602 break;
603 }
604
605 return 0;
606}
607EXPORT_SYMBOL_GPL(kvm_set_dr);
608
609int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
610{
611 switch (dr) {
612 case 0 ... 3:
613 *val = vcpu->arch.db[dr];
614 break;
615 case 4:
616 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) {
617 kvm_queue_exception(vcpu, UD_VECTOR);
618 return 1;
619 }
620 /* fall through */
621 case 6:
622 *val = vcpu->arch.dr6;
623 break;
624 case 5:
625 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) {
626 kvm_queue_exception(vcpu, UD_VECTOR);
627 return 1;
628 }
629 /* fall through */
630 default: /* 7 */
631 *val = vcpu->arch.dr7;
632 break;
633 }
634
635 return 0;
636}
637EXPORT_SYMBOL_GPL(kvm_get_dr);
638
617static inline u32 bit(int bitno) 639static inline u32 bit(int bitno)
618{ 640{
619 return 1 << (bitno & 31); 641 return 1 << (bitno & 31);
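kvm_set_dr()/kvm_get_dr() centralize the checks both vendor exit handlers used to open-code: DR4/DR5 alias DR6/DR7 only while CR4.DE is clear (otherwise #UD), DR6/DR7 writes with bits 63:32 set take #GP, and accepted DR6 values are masked with the volatile bits and OR-ed with the fixed-1 bits. A few worked cases:

    /*
     * kvm_set_dr(vcpu, 5, val) with CR4.DE = 1
     *     -> kvm_queue_exception(vcpu, UD_VECTOR), returns 1
     * kvm_set_dr(vcpu, 6, 0x100000000ULL)          (bits 63:32 set)
     *     -> kvm_inject_gp(vcpu, 0), returns 1
     * kvm_set_dr(vcpu, 6, val)                     (valid 32-bit value)
     *     -> vcpu->arch.dr6 = (val & DR6_VOLATILE) | DR6_FIXED_1, returns 0
     */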
@@ -650,15 +672,12 @@ static u32 emulated_msrs[] = {
650static void set_efer(struct kvm_vcpu *vcpu, u64 efer) 672static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
651{ 673{
652 if (efer & efer_reserved_bits) { 674 if (efer & efer_reserved_bits) {
653 printk(KERN_DEBUG "set_efer: 0x%llx #GP, reserved bits\n",
654 efer);
655 kvm_inject_gp(vcpu, 0); 675 kvm_inject_gp(vcpu, 0);
656 return; 676 return;
657 } 677 }
658 678
659 if (is_paging(vcpu) 679 if (is_paging(vcpu)
660 && (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME)) { 680 && (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME)) {
661 printk(KERN_DEBUG "set_efer: #GP, change LME while paging\n");
662 kvm_inject_gp(vcpu, 0); 681 kvm_inject_gp(vcpu, 0);
663 return; 682 return;
664 } 683 }
@@ -668,7 +687,6 @@ static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
668 687
669 feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); 688 feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
670 if (!feat || !(feat->edx & bit(X86_FEATURE_FXSR_OPT))) { 689 if (!feat || !(feat->edx & bit(X86_FEATURE_FXSR_OPT))) {
671 printk(KERN_DEBUG "set_efer: #GP, enable FFXSR w/o CPUID capability\n");
672 kvm_inject_gp(vcpu, 0); 690 kvm_inject_gp(vcpu, 0);
673 return; 691 return;
674 } 692 }
@@ -679,7 +697,6 @@ static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
679 697
680 feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); 698 feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
681 if (!feat || !(feat->ecx & bit(X86_FEATURE_SVM))) { 699 if (!feat || !(feat->ecx & bit(X86_FEATURE_SVM))) {
682 printk(KERN_DEBUG "set_efer: #GP, enable SVM w/o SVM\n");
683 kvm_inject_gp(vcpu, 0); 700 kvm_inject_gp(vcpu, 0);
684 return; 701 return;
685 } 702 }
@@ -968,9 +985,13 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 data)
968 if (msr >= MSR_IA32_MC0_CTL && 985 if (msr >= MSR_IA32_MC0_CTL &&
969 msr < MSR_IA32_MC0_CTL + 4 * bank_num) { 986 msr < MSR_IA32_MC0_CTL + 4 * bank_num) {
970 u32 offset = msr - MSR_IA32_MC0_CTL; 987 u32 offset = msr - MSR_IA32_MC0_CTL;
971 /* only 0 or all 1s can be written to IA32_MCi_CTL */ 988 /* only 0 or all 1s can be written to IA32_MCi_CTL
 989 * some Linux kernels, though, clear bit 10 in bank 4 to
 990 * work around a BIOS/GART TBL issue on AMD K8s; ignore
 991 * this to avoid an uncaught #GP in the guest
992 */
972 if ((offset & 0x3) == 0 && 993 if ((offset & 0x3) == 0 &&
973 data != 0 && data != ~(u64)0) 994 data != 0 && (data | (1 << 10)) != ~(u64)0)
974 return -1; 995 return -1;
975 vcpu->arch.mce_banks[offset] = data; 996 vcpu->arch.mce_banks[offset] = data;
976 break; 997 break;
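A worked example of the relaxed IA32_MCi_CTL check: previously only 0 and all-ones were accepted, so a Linux guest clearing bit 10 of MC4_CTL (the K8 GART TBL walk quirk mentioned in the comment) took an unexpected #GP. OR-ing bit 10 back in before the comparison lets exactly that pattern through:

    /*
     * data = 0xfffffffffffffbffULL              (all ones except bit 10)
     *   data | (1 << 10) = 0xffffffffffffffffULL == ~(u64)0 -> accepted,
     *   stored in vcpu->arch.mce_banks[offset]
     * data = 0xfffffffffffff7ffULL              (bit 11 cleared instead)
     *   data | (1 << 10) != ~(u64)0             -> still rejected (-1)
     */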
@@ -1114,6 +1135,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1114 break; 1135 break;
1115 case MSR_K7_HWCR: 1136 case MSR_K7_HWCR:
1116 data &= ~(u64)0x40; /* ignore flush filter disable */ 1137 data &= ~(u64)0x40; /* ignore flush filter disable */
1138 data &= ~(u64)0x100; /* ignore ignne emulation enable */
1117 if (data != 0) { 1139 if (data != 0) {
1118 pr_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n", 1140 pr_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n",
1119 data); 1141 data);
@@ -1572,6 +1594,7 @@ int kvm_dev_ioctl_check_extension(long ext)
1572 case KVM_CAP_HYPERV_VAPIC: 1594 case KVM_CAP_HYPERV_VAPIC:
1573 case KVM_CAP_HYPERV_SPIN: 1595 case KVM_CAP_HYPERV_SPIN:
1574 case KVM_CAP_PCI_SEGMENT: 1596 case KVM_CAP_PCI_SEGMENT:
1597 case KVM_CAP_DEBUGREGS:
1575 case KVM_CAP_X86_ROBUST_SINGLESTEP: 1598 case KVM_CAP_X86_ROBUST_SINGLESTEP:
1576 r = 1; 1599 r = 1;
1577 break; 1600 break;
@@ -2124,14 +2147,20 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
2124{ 2147{
2125 vcpu_load(vcpu); 2148 vcpu_load(vcpu);
2126 2149
2127 events->exception.injected = vcpu->arch.exception.pending; 2150 events->exception.injected =
2151 vcpu->arch.exception.pending &&
2152 !kvm_exception_is_soft(vcpu->arch.exception.nr);
2128 events->exception.nr = vcpu->arch.exception.nr; 2153 events->exception.nr = vcpu->arch.exception.nr;
2129 events->exception.has_error_code = vcpu->arch.exception.has_error_code; 2154 events->exception.has_error_code = vcpu->arch.exception.has_error_code;
2130 events->exception.error_code = vcpu->arch.exception.error_code; 2155 events->exception.error_code = vcpu->arch.exception.error_code;
2131 2156
2132 events->interrupt.injected = vcpu->arch.interrupt.pending; 2157 events->interrupt.injected =
2158 vcpu->arch.interrupt.pending && !vcpu->arch.interrupt.soft;
2133 events->interrupt.nr = vcpu->arch.interrupt.nr; 2159 events->interrupt.nr = vcpu->arch.interrupt.nr;
2134 events->interrupt.soft = vcpu->arch.interrupt.soft; 2160 events->interrupt.soft = 0;
2161 events->interrupt.shadow =
2162 kvm_x86_ops->get_interrupt_shadow(vcpu,
2163 KVM_X86_SHADOW_INT_MOV_SS | KVM_X86_SHADOW_INT_STI);
2135 2164
2136 events->nmi.injected = vcpu->arch.nmi_injected; 2165 events->nmi.injected = vcpu->arch.nmi_injected;
2137 events->nmi.pending = vcpu->arch.nmi_pending; 2166 events->nmi.pending = vcpu->arch.nmi_pending;
@@ -2140,7 +2169,8 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
2140 events->sipi_vector = vcpu->arch.sipi_vector; 2169 events->sipi_vector = vcpu->arch.sipi_vector;
2141 2170
2142 events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING 2171 events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING
2143 | KVM_VCPUEVENT_VALID_SIPI_VECTOR); 2172 | KVM_VCPUEVENT_VALID_SIPI_VECTOR
2173 | KVM_VCPUEVENT_VALID_SHADOW);
2144 2174
2145 vcpu_put(vcpu); 2175 vcpu_put(vcpu);
2146} 2176}
@@ -2149,7 +2179,8 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
2149 struct kvm_vcpu_events *events) 2179 struct kvm_vcpu_events *events)
2150{ 2180{
2151 if (events->flags & ~(KVM_VCPUEVENT_VALID_NMI_PENDING 2181 if (events->flags & ~(KVM_VCPUEVENT_VALID_NMI_PENDING
2152 | KVM_VCPUEVENT_VALID_SIPI_VECTOR)) 2182 | KVM_VCPUEVENT_VALID_SIPI_VECTOR
2183 | KVM_VCPUEVENT_VALID_SHADOW))
2153 return -EINVAL; 2184 return -EINVAL;
2154 2185
2155 vcpu_load(vcpu); 2186 vcpu_load(vcpu);
@@ -2164,6 +2195,9 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
2164 vcpu->arch.interrupt.soft = events->interrupt.soft; 2195 vcpu->arch.interrupt.soft = events->interrupt.soft;
2165 if (vcpu->arch.interrupt.pending && irqchip_in_kernel(vcpu->kvm)) 2196 if (vcpu->arch.interrupt.pending && irqchip_in_kernel(vcpu->kvm))
2166 kvm_pic_clear_isr_ack(vcpu->kvm); 2197 kvm_pic_clear_isr_ack(vcpu->kvm);
2198 if (events->flags & KVM_VCPUEVENT_VALID_SHADOW)
2199 kvm_x86_ops->set_interrupt_shadow(vcpu,
2200 events->interrupt.shadow);
2167 2201
2168 vcpu->arch.nmi_injected = events->nmi.injected; 2202 vcpu->arch.nmi_injected = events->nmi.injected;
2169 if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING) 2203 if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING)
@@ -2178,6 +2212,36 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
2178 return 0; 2212 return 0;
2179} 2213}
2180 2214
2215static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu,
2216 struct kvm_debugregs *dbgregs)
2217{
2218 vcpu_load(vcpu);
2219
2220 memcpy(dbgregs->db, vcpu->arch.db, sizeof(vcpu->arch.db));
2221 dbgregs->dr6 = vcpu->arch.dr6;
2222 dbgregs->dr7 = vcpu->arch.dr7;
2223 dbgregs->flags = 0;
2224
2225 vcpu_put(vcpu);
2226}
2227
2228static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
2229 struct kvm_debugregs *dbgregs)
2230{
2231 if (dbgregs->flags)
2232 return -EINVAL;
2233
2234 vcpu_load(vcpu);
2235
2236 memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db));
2237 vcpu->arch.dr6 = dbgregs->dr6;
2238 vcpu->arch.dr7 = dbgregs->dr7;
2239
2240 vcpu_put(vcpu);
2241
2242 return 0;
2243}
2244
2181long kvm_arch_vcpu_ioctl(struct file *filp, 2245long kvm_arch_vcpu_ioctl(struct file *filp,
2182 unsigned int ioctl, unsigned long arg) 2246 unsigned int ioctl, unsigned long arg)
2183{ 2247{
@@ -2356,6 +2420,29 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
2356 r = kvm_vcpu_ioctl_x86_set_vcpu_events(vcpu, &events); 2420 r = kvm_vcpu_ioctl_x86_set_vcpu_events(vcpu, &events);
2357 break; 2421 break;
2358 } 2422 }
2423 case KVM_GET_DEBUGREGS: {
2424 struct kvm_debugregs dbgregs;
2425
2426 kvm_vcpu_ioctl_x86_get_debugregs(vcpu, &dbgregs);
2427
2428 r = -EFAULT;
2429 if (copy_to_user(argp, &dbgregs,
2430 sizeof(struct kvm_debugregs)))
2431 break;
2432 r = 0;
2433 break;
2434 }
2435 case KVM_SET_DEBUGREGS: {
2436 struct kvm_debugregs dbgregs;
2437
2438 r = -EFAULT;
2439 if (copy_from_user(&dbgregs, argp,
2440 sizeof(struct kvm_debugregs)))
2441 break;
2442
2443 r = kvm_vcpu_ioctl_x86_set_debugregs(vcpu, &dbgregs);
2444 break;
2445 }
2359 default: 2446 default:
2360 r = -EINVAL; 2447 r = -EINVAL;
2361 } 2448 }
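The new KVM_GET_DEBUGREGS/KVM_SET_DEBUGREGS vcpu ioctls expose db[0..3], dr6, dr7 and a flags word that must be zero on set. A minimal userspace sketch (struct kvm_debugregs is added to asm/kvm.h in this same series and its exact layout is assumed here; vcpu_fd is an already-created vcpu file descriptor, used purely for illustration):

    /* Userspace sketch; needs <sys/ioctl.h>, <err.h> and <linux/kvm.h>. */
    struct kvm_debugregs dbg;

    if (ioctl(vcpu_fd, KVM_GET_DEBUGREGS, &dbg) < 0)
            err(1, "KVM_GET_DEBUGREGS");
    dbg.dr7 |= 0x1;        /* e.g. enable local breakpoint 0 (address in db[0]) */
    dbg.flags = 0;         /* non-zero flags make the kernel return -EINVAL */
    if (ioctl(vcpu_fd, KVM_SET_DEBUGREGS, &dbg) < 0)
            err(1, "KVM_SET_DEBUGREGS");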
@@ -2409,7 +2496,7 @@ gfn_t unalias_gfn_instantiation(struct kvm *kvm, gfn_t gfn)
2409 struct kvm_mem_alias *alias; 2496 struct kvm_mem_alias *alias;
2410 struct kvm_mem_aliases *aliases; 2497 struct kvm_mem_aliases *aliases;
2411 2498
2412 aliases = rcu_dereference(kvm->arch.aliases); 2499 aliases = kvm_aliases(kvm);
2413 2500
2414 for (i = 0; i < aliases->naliases; ++i) { 2501 for (i = 0; i < aliases->naliases; ++i) {
2415 alias = &aliases->aliases[i]; 2502 alias = &aliases->aliases[i];
@@ -2428,7 +2515,7 @@ gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
2428 struct kvm_mem_alias *alias; 2515 struct kvm_mem_alias *alias;
2429 struct kvm_mem_aliases *aliases; 2516 struct kvm_mem_aliases *aliases;
2430 2517
2431 aliases = rcu_dereference(kvm->arch.aliases); 2518 aliases = kvm_aliases(kvm);
2432 2519
2433 for (i = 0; i < aliases->naliases; ++i) { 2520 for (i = 0; i < aliases->naliases; ++i) {
2434 alias = &aliases->aliases[i]; 2521 alias = &aliases->aliases[i];
@@ -2636,8 +2723,9 @@ static int kvm_vm_ioctl_reinject(struct kvm *kvm,
2636int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, 2723int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
2637 struct kvm_dirty_log *log) 2724 struct kvm_dirty_log *log)
2638{ 2725{
2639 int r, n, i; 2726 int r, i;
2640 struct kvm_memory_slot *memslot; 2727 struct kvm_memory_slot *memslot;
2728 unsigned long n;
2641 unsigned long is_dirty = 0; 2729 unsigned long is_dirty = 0;
2642 unsigned long *dirty_bitmap = NULL; 2730 unsigned long *dirty_bitmap = NULL;
2643 2731
@@ -2652,7 +2740,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
2652 if (!memslot->dirty_bitmap) 2740 if (!memslot->dirty_bitmap)
2653 goto out; 2741 goto out;
2654 2742
2655 n = ALIGN(memslot->npages, BITS_PER_LONG) / 8; 2743 n = kvm_dirty_bitmap_bytes(memslot);
2656 2744
2657 r = -ENOMEM; 2745 r = -ENOMEM;
2658 dirty_bitmap = vmalloc(n); 2746 dirty_bitmap = vmalloc(n);
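The open-coded bitmap size is replaced by kvm_dirty_bitmap_bytes(). Judging by the expression it replaces, the helper presumably computes the same value, i.e. the memslot's page count rounded up to a multiple of BITS_PER_LONG and converted to bytes:

    /* Assumed shape of the helper, equivalent to the removed expression. */
    static inline unsigned long kvm_dirty_bitmap_bytes(struct kvm_memory_slot *memslot)
    {
            return ALIGN(memslot->npages, BITS_PER_LONG) / 8;
    }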
@@ -2822,11 +2910,13 @@ long kvm_arch_vm_ioctl(struct file *filp,
2822 r = -EFAULT; 2910 r = -EFAULT;
2823 if (copy_from_user(&irq_event, argp, sizeof irq_event)) 2911 if (copy_from_user(&irq_event, argp, sizeof irq_event))
2824 goto out; 2912 goto out;
2913 r = -ENXIO;
2825 if (irqchip_in_kernel(kvm)) { 2914 if (irqchip_in_kernel(kvm)) {
2826 __s32 status; 2915 __s32 status;
2827 status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, 2916 status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
2828 irq_event.irq, irq_event.level); 2917 irq_event.irq, irq_event.level);
2829 if (ioctl == KVM_IRQ_LINE_STATUS) { 2918 if (ioctl == KVM_IRQ_LINE_STATUS) {
2919 r = -EFAULT;
2830 irq_event.status = status; 2920 irq_event.status = status;
2831 if (copy_to_user(argp, &irq_event, 2921 if (copy_to_user(argp, &irq_event,
2832 sizeof irq_event)) 2922 sizeof irq_event))
@@ -3042,6 +3132,18 @@ static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v)
3042 return kvm_io_bus_read(vcpu->kvm, KVM_MMIO_BUS, addr, len, v); 3132 return kvm_io_bus_read(vcpu->kvm, KVM_MMIO_BUS, addr, len, v);
3043} 3133}
3044 3134
3135static void kvm_set_segment(struct kvm_vcpu *vcpu,
3136 struct kvm_segment *var, int seg)
3137{
3138 kvm_x86_ops->set_segment(vcpu, var, seg);
3139}
3140
3141void kvm_get_segment(struct kvm_vcpu *vcpu,
3142 struct kvm_segment *var, int seg)
3143{
3144 kvm_x86_ops->get_segment(vcpu, var, seg);
3145}
3146
3045gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) 3147gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, u32 *error)
3046{ 3148{
3047 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; 3149 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
@@ -3122,14 +3224,17 @@ static int kvm_read_guest_virt_system(gva_t addr, void *val, unsigned int bytes,
3122 return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, error); 3224 return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, error);
3123} 3225}
3124 3226
3125static int kvm_write_guest_virt(gva_t addr, void *val, unsigned int bytes, 3227static int kvm_write_guest_virt_system(gva_t addr, void *val,
3126 struct kvm_vcpu *vcpu, u32 *error) 3228 unsigned int bytes,
3229 struct kvm_vcpu *vcpu,
3230 u32 *error)
3127{ 3231{
3128 void *data = val; 3232 void *data = val;
3129 int r = X86EMUL_CONTINUE; 3233 int r = X86EMUL_CONTINUE;
3130 3234
3131 while (bytes) { 3235 while (bytes) {
3132 gpa_t gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, error); 3236 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr,
3237 PFERR_WRITE_MASK, error);
3133 unsigned offset = addr & (PAGE_SIZE-1); 3238 unsigned offset = addr & (PAGE_SIZE-1);
3134 unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset); 3239 unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset);
3135 int ret; 3240 int ret;
@@ -3152,7 +3257,6 @@ out:
3152 return r; 3257 return r;
3153} 3258}
3154 3259
3155
3156static int emulator_read_emulated(unsigned long addr, 3260static int emulator_read_emulated(unsigned long addr,
3157 void *val, 3261 void *val,
3158 unsigned int bytes, 3262 unsigned int bytes,
@@ -3255,9 +3359,9 @@ mmio:
3255} 3359}
3256 3360
3257int emulator_write_emulated(unsigned long addr, 3361int emulator_write_emulated(unsigned long addr,
3258 const void *val, 3362 const void *val,
3259 unsigned int bytes, 3363 unsigned int bytes,
3260 struct kvm_vcpu *vcpu) 3364 struct kvm_vcpu *vcpu)
3261{ 3365{
3262 /* Crossing a page boundary? */ 3366 /* Crossing a page boundary? */
3263 if (((addr + bytes - 1) ^ addr) & PAGE_MASK) { 3367 if (((addr + bytes - 1) ^ addr) & PAGE_MASK) {
@@ -3275,45 +3379,150 @@ int emulator_write_emulated(unsigned long addr,
3275} 3379}
3276EXPORT_SYMBOL_GPL(emulator_write_emulated); 3380EXPORT_SYMBOL_GPL(emulator_write_emulated);
3277 3381
3382#define CMPXCHG_TYPE(t, ptr, old, new) \
3383 (cmpxchg((t *)(ptr), *(t *)(old), *(t *)(new)) == *(t *)(old))
3384
3385#ifdef CONFIG_X86_64
3386# define CMPXCHG64(ptr, old, new) CMPXCHG_TYPE(u64, ptr, old, new)
3387#else
3388# define CMPXCHG64(ptr, old, new) \
3389 (cmpxchg64((u64 *)(ptr), *(u64 *)(old), *(u64 *)(new)) == *(u64 *)(old))
3390#endif
3391
3278static int emulator_cmpxchg_emulated(unsigned long addr, 3392static int emulator_cmpxchg_emulated(unsigned long addr,
3279 const void *old, 3393 const void *old,
3280 const void *new, 3394 const void *new,
3281 unsigned int bytes, 3395 unsigned int bytes,
3282 struct kvm_vcpu *vcpu) 3396 struct kvm_vcpu *vcpu)
3283{ 3397{
3284 printk_once(KERN_WARNING "kvm: emulating exchange as write\n"); 3398 gpa_t gpa;
3285#ifndef CONFIG_X86_64 3399 struct page *page;
3286 /* guests cmpxchg8b have to be emulated atomically */ 3400 char *kaddr;
3287 if (bytes == 8) { 3401 bool exchanged;
3288 gpa_t gpa;
3289 struct page *page;
3290 char *kaddr;
3291 u64 val;
3292 3402
3293 gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, NULL); 3403 /* guests cmpxchg8b have to be emulated atomically */
3404 if (bytes > 8 || (bytes & (bytes - 1)))
3405 goto emul_write;
3294 3406
3295 if (gpa == UNMAPPED_GVA || 3407 gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, NULL);
3296 (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
3297 goto emul_write;
3298 3408
3299 if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK)) 3409 if (gpa == UNMAPPED_GVA ||
3300 goto emul_write; 3410 (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
3411 goto emul_write;
3301 3412
3302 val = *(u64 *)new; 3413 if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK))
3414 goto emul_write;
3303 3415
3304 page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); 3416 page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
3305 3417
3306 kaddr = kmap_atomic(page, KM_USER0); 3418 kaddr = kmap_atomic(page, KM_USER0);
3307 set_64bit((u64 *)(kaddr + offset_in_page(gpa)), val); 3419 kaddr += offset_in_page(gpa);
3308 kunmap_atomic(kaddr, KM_USER0); 3420 switch (bytes) {
3309 kvm_release_page_dirty(page); 3421 case 1:
3422 exchanged = CMPXCHG_TYPE(u8, kaddr, old, new);
3423 break;
3424 case 2:
3425 exchanged = CMPXCHG_TYPE(u16, kaddr, old, new);
3426 break;
3427 case 4:
3428 exchanged = CMPXCHG_TYPE(u32, kaddr, old, new);
3429 break;
3430 case 8:
3431 exchanged = CMPXCHG64(kaddr, old, new);
3432 break;
3433 default:
3434 BUG();
3310 } 3435 }
3436 kunmap_atomic(kaddr, KM_USER0);
3437 kvm_release_page_dirty(page);
3438
3439 if (!exchanged)
3440 return X86EMUL_CMPXCHG_FAILED;
3441
3442 kvm_mmu_pte_write(vcpu, gpa, new, bytes, 1);
3443
3444 return X86EMUL_CONTINUE;
3445
3311emul_write: 3446emul_write:
3312#endif 3447 printk_once(KERN_WARNING "kvm: emulating exchange as write\n");
3313 3448
3314 return emulator_write_emulated(addr, new, bytes, vcpu); 3449 return emulator_write_emulated(addr, new, bytes, vcpu);
3315} 3450}
3316 3451
3452static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
3453{
3454 /* TODO: String I/O for in kernel device */
3455 int r;
3456
3457 if (vcpu->arch.pio.in)
3458 r = kvm_io_bus_read(vcpu->kvm, KVM_PIO_BUS, vcpu->arch.pio.port,
3459 vcpu->arch.pio.size, pd);
3460 else
3461 r = kvm_io_bus_write(vcpu->kvm, KVM_PIO_BUS,
3462 vcpu->arch.pio.port, vcpu->arch.pio.size,
3463 pd);
3464 return r;
3465}
3466
3467
3468static int emulator_pio_in_emulated(int size, unsigned short port, void *val,
3469 unsigned int count, struct kvm_vcpu *vcpu)
3470{
3471 if (vcpu->arch.pio.count)
3472 goto data_avail;
3473
3474 trace_kvm_pio(1, port, size, 1);
3475
3476 vcpu->arch.pio.port = port;
3477 vcpu->arch.pio.in = 1;
3478 vcpu->arch.pio.count = count;
3479 vcpu->arch.pio.size = size;
3480
3481 if (!kernel_pio(vcpu, vcpu->arch.pio_data)) {
3482 data_avail:
3483 memcpy(val, vcpu->arch.pio_data, size * count);
3484 vcpu->arch.pio.count = 0;
3485 return 1;
3486 }
3487
3488 vcpu->run->exit_reason = KVM_EXIT_IO;
3489 vcpu->run->io.direction = KVM_EXIT_IO_IN;
3490 vcpu->run->io.size = size;
3491 vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
3492 vcpu->run->io.count = count;
3493 vcpu->run->io.port = port;
3494
3495 return 0;
3496}
3497
3498static int emulator_pio_out_emulated(int size, unsigned short port,
3499 const void *val, unsigned int count,
3500 struct kvm_vcpu *vcpu)
3501{
3502 trace_kvm_pio(0, port, size, 1);
3503
3504 vcpu->arch.pio.port = port;
3505 vcpu->arch.pio.in = 0;
3506 vcpu->arch.pio.count = count;
3507 vcpu->arch.pio.size = size;
3508
3509 memcpy(vcpu->arch.pio_data, val, size * count);
3510
3511 if (!kernel_pio(vcpu, vcpu->arch.pio_data)) {
3512 vcpu->arch.pio.count = 0;
3513 return 1;
3514 }
3515
3516 vcpu->run->exit_reason = KVM_EXIT_IO;
3517 vcpu->run->io.direction = KVM_EXIT_IO_OUT;
3518 vcpu->run->io.size = size;
3519 vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
3520 vcpu->run->io.count = count;
3521 vcpu->run->io.port = port;
3522
3523 return 0;
3524}
3525
3317static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg) 3526static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
3318{ 3527{
3319 return kvm_x86_ops->get_segment_base(vcpu, seg); 3528 return kvm_x86_ops->get_segment_base(vcpu, seg);
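The rewritten emulator_cmpxchg_emulated() above maps the guest page and dispatches on operand width (1, 2, 4 or 8 bytes) so the exchange really is atomic, only falling back to the old emulate-as-write path for unsupported sizes. A minimal user-space sketch of the same width-dispatch idea, using the GCC/Clang __sync_bool_compare_and_swap builtin in place of the kernel's cmpxchg()/cmpxchg64(); the names below are illustrative, not the kernel's:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Width-dispatching compare-and-swap, mirroring the CMPXCHG_TYPE/CMPXCHG64
 * switch added in emulator_cmpxchg_emulated().  __sync_bool_compare_and_swap
 * stands in for the kernel's cmpxchg(); it returns true only if *ptr still
 * held the old value and was replaced by the new one. */
#define CAS_TYPE(t, ptr, oldp, newp) \
	__sync_bool_compare_and_swap((t *)(ptr), *(const t *)(oldp), *(const t *)(newp))

static bool cas_emulated(void *ptr, const void *old, const void *new, unsigned int bytes)
{
	/* Only naturally sized operands up to 8 bytes can be swapped atomically;
	 * anything else would have to fall back to the emulate-as-write path. */
	if (bytes == 0 || bytes > 8 || (bytes & (bytes - 1)))
		return false;

	switch (bytes) {
	case 1: return CAS_TYPE(uint8_t,  ptr, old, new);
	case 2: return CAS_TYPE(uint16_t, ptr, old, new);
	case 4: return CAS_TYPE(uint32_t, ptr, old, new);
	default: return CAS_TYPE(uint64_t, ptr, old, new);
	}
}

int main(void)
{
	uint32_t word = 0x1234, old = 0x1234, new = 0x5678;

	printf("exchanged=%d word=%#x\n",
	       cas_emulated(&word, &old, &new, sizeof(word)), word);
	return 0;
}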
@@ -3334,14 +3543,14 @@ int emulate_clts(struct kvm_vcpu *vcpu)
3334 3543
3335int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest) 3544int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest)
3336{ 3545{
3337 return kvm_x86_ops->get_dr(ctxt->vcpu, dr, dest); 3546 return kvm_get_dr(ctxt->vcpu, dr, dest);
3338} 3547}
3339 3548
3340int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value) 3549int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value)
3341{ 3550{
3342 unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U; 3551 unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U;
3343 3552
3344 return kvm_x86_ops->set_dr(ctxt->vcpu, dr, value & mask); 3553 return kvm_set_dr(ctxt->vcpu, dr, value & mask);
3345} 3554}
3346 3555
3347void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context) 3556void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
@@ -3362,12 +3571,167 @@ void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
3362} 3571}
3363EXPORT_SYMBOL_GPL(kvm_report_emulation_failure); 3572EXPORT_SYMBOL_GPL(kvm_report_emulation_failure);
3364 3573
3574static u64 mk_cr_64(u64 curr_cr, u32 new_val)
3575{
3576 return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
3577}
3578
3579static unsigned long emulator_get_cr(int cr, struct kvm_vcpu *vcpu)
3580{
3581 unsigned long value;
3582
3583 switch (cr) {
3584 case 0:
3585 value = kvm_read_cr0(vcpu);
3586 break;
3587 case 2:
3588 value = vcpu->arch.cr2;
3589 break;
3590 case 3:
3591 value = vcpu->arch.cr3;
3592 break;
3593 case 4:
3594 value = kvm_read_cr4(vcpu);
3595 break;
3596 case 8:
3597 value = kvm_get_cr8(vcpu);
3598 break;
3599 default:
3600 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
3601 return 0;
3602 }
3603
3604 return value;
3605}
3606
3607static void emulator_set_cr(int cr, unsigned long val, struct kvm_vcpu *vcpu)
3608{
3609 switch (cr) {
3610 case 0:
3611 kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val));
3612 break;
3613 case 2:
3614 vcpu->arch.cr2 = val;
3615 break;
3616 case 3:
3617 kvm_set_cr3(vcpu, val);
3618 break;
3619 case 4:
3620 kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val));
3621 break;
3622 case 8:
3623 kvm_set_cr8(vcpu, val & 0xfUL);
3624 break;
3625 default:
3626 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
3627 }
3628}
3629
3630static int emulator_get_cpl(struct kvm_vcpu *vcpu)
3631{
3632 return kvm_x86_ops->get_cpl(vcpu);
3633}
3634
3635static void emulator_get_gdt(struct desc_ptr *dt, struct kvm_vcpu *vcpu)
3636{
3637 kvm_x86_ops->get_gdt(vcpu, dt);
3638}
3639
3640static bool emulator_get_cached_descriptor(struct desc_struct *desc, int seg,
3641 struct kvm_vcpu *vcpu)
3642{
3643 struct kvm_segment var;
3644
3645 kvm_get_segment(vcpu, &var, seg);
3646
3647 if (var.unusable)
3648 return false;
3649
3650 if (var.g)
3651 var.limit >>= 12;
3652 set_desc_limit(desc, var.limit);
3653 set_desc_base(desc, (unsigned long)var.base);
3654 desc->type = var.type;
3655 desc->s = var.s;
3656 desc->dpl = var.dpl;
3657 desc->p = var.present;
3658 desc->avl = var.avl;
3659 desc->l = var.l;
3660 desc->d = var.db;
3661 desc->g = var.g;
3662
3663 return true;
3664}
3665
3666static void emulator_set_cached_descriptor(struct desc_struct *desc, int seg,
3667 struct kvm_vcpu *vcpu)
3668{
3669 struct kvm_segment var;
3670
3671 /* needed to preserve selector */
3672 kvm_get_segment(vcpu, &var, seg);
3673
3674 var.base = get_desc_base(desc);
3675 var.limit = get_desc_limit(desc);
3676 if (desc->g)
3677 var.limit = (var.limit << 12) | 0xfff;
3678 var.type = desc->type;
3679 var.present = desc->p;
3680 var.dpl = desc->dpl;
3681 var.db = desc->d;
3682 var.s = desc->s;
3683 var.l = desc->l;
3684 var.g = desc->g;
3685 var.avl = desc->avl;
3686 var.present = desc->p;
3687 var.unusable = !var.present;
3688 var.padding = 0;
3689
3690 kvm_set_segment(vcpu, &var, seg);
3691 return;
3692}
3693
3694static u16 emulator_get_segment_selector(int seg, struct kvm_vcpu *vcpu)
3695{
3696 struct kvm_segment kvm_seg;
3697
3698 kvm_get_segment(vcpu, &kvm_seg, seg);
3699 return kvm_seg.selector;
3700}
3701
3702static void emulator_set_segment_selector(u16 sel, int seg,
3703 struct kvm_vcpu *vcpu)
3704{
3705 struct kvm_segment kvm_seg;
3706
3707 kvm_get_segment(vcpu, &kvm_seg, seg);
3708 kvm_seg.selector = sel;
3709 kvm_set_segment(vcpu, &kvm_seg, seg);
3710}
3711
3712static void emulator_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
3713{
3714 kvm_x86_ops->set_rflags(vcpu, rflags);
3715}
3716
3365static struct x86_emulate_ops emulate_ops = { 3717static struct x86_emulate_ops emulate_ops = {
3366 .read_std = kvm_read_guest_virt_system, 3718 .read_std = kvm_read_guest_virt_system,
3719 .write_std = kvm_write_guest_virt_system,
3367 .fetch = kvm_fetch_guest_virt, 3720 .fetch = kvm_fetch_guest_virt,
3368 .read_emulated = emulator_read_emulated, 3721 .read_emulated = emulator_read_emulated,
3369 .write_emulated = emulator_write_emulated, 3722 .write_emulated = emulator_write_emulated,
3370 .cmpxchg_emulated = emulator_cmpxchg_emulated, 3723 .cmpxchg_emulated = emulator_cmpxchg_emulated,
3724 .pio_in_emulated = emulator_pio_in_emulated,
3725 .pio_out_emulated = emulator_pio_out_emulated,
3726 .get_cached_descriptor = emulator_get_cached_descriptor,
3727 .set_cached_descriptor = emulator_set_cached_descriptor,
3728 .get_segment_selector = emulator_get_segment_selector,
3729 .set_segment_selector = emulator_set_segment_selector,
3730 .get_gdt = emulator_get_gdt,
3731 .get_cr = emulator_get_cr,
3732 .set_cr = emulator_set_cr,
3733 .cpl = emulator_get_cpl,
3734 .set_rflags = emulator_set_rflags,
3371}; 3735};
3372 3736
3373static void cache_all_regs(struct kvm_vcpu *vcpu) 3737static void cache_all_regs(struct kvm_vcpu *vcpu)
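emulator_get_cached_descriptor() and emulator_set_cached_descriptor() in the hunk above translate between struct kvm_segment, whose limit is byte-granular, and struct desc_struct, where a set granularity (G) bit means the stored limit counts 4 KiB pages. A small stand-alone sketch of that scaling, with illustrative helper names rather than the kernel's get_desc_limit()/set_desc_limit():

#include <stdint.h>
#include <stdio.h>

/* Illustrative helpers (not the kernel's): with the granularity (G) bit set,
 * the 20-bit descriptor limit field counts 4 KiB pages, so converting to and
 * from a byte-granular limit is the same ">>= 12" / "(<< 12) | 0xfff" done by
 * the cached-descriptor callbacks above. */
static uint32_t limit_to_desc_field(uint32_t byte_limit, int g)
{
	return g ? byte_limit >> 12 : byte_limit;	/* store in 4 KiB units */
}

static uint32_t desc_field_to_limit(uint32_t field, int g)
{
	return g ? (field << 12) | 0xfff : field;	/* back to bytes (inclusive) */
}

int main(void)
{
	uint32_t bytes = 0xffffffff;			/* flat 4 GiB segment */
	uint32_t field = limit_to_desc_field(bytes, 1);

	printf("descriptor field %#x -> byte limit %#x\n",
	       field, desc_field_to_limit(field, 1));
	return 0;
}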
@@ -3398,14 +3762,14 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
3398 cache_all_regs(vcpu); 3762 cache_all_regs(vcpu);
3399 3763
3400 vcpu->mmio_is_write = 0; 3764 vcpu->mmio_is_write = 0;
3401 vcpu->arch.pio.string = 0;
3402 3765
3403 if (!(emulation_type & EMULTYPE_NO_DECODE)) { 3766 if (!(emulation_type & EMULTYPE_NO_DECODE)) {
3404 int cs_db, cs_l; 3767 int cs_db, cs_l;
3405 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); 3768 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
3406 3769
3407 vcpu->arch.emulate_ctxt.vcpu = vcpu; 3770 vcpu->arch.emulate_ctxt.vcpu = vcpu;
3408 vcpu->arch.emulate_ctxt.eflags = kvm_get_rflags(vcpu); 3771 vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu);
3772 vcpu->arch.emulate_ctxt.eip = kvm_rip_read(vcpu);
3409 vcpu->arch.emulate_ctxt.mode = 3773 vcpu->arch.emulate_ctxt.mode =
3410 (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL : 3774 (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL :
3411 (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM) 3775 (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM)
@@ -3414,6 +3778,7 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
3414 ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; 3778 ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
3415 3779
3416 r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); 3780 r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
3781 trace_kvm_emulate_insn_start(vcpu);
3417 3782
3418 /* Only allow emulation of specific instructions on #UD 3783 /* Only allow emulation of specific instructions on #UD
3419 * (namely VMMCALL, sysenter, sysexit, syscall)*/ 3784 * (namely VMMCALL, sysenter, sysexit, syscall)*/
@@ -3446,6 +3811,7 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
3446 ++vcpu->stat.insn_emulation; 3811 ++vcpu->stat.insn_emulation;
3447 if (r) { 3812 if (r) {
3448 ++vcpu->stat.insn_emulation_fail; 3813 ++vcpu->stat.insn_emulation_fail;
3814 trace_kvm_emulate_insn_failed(vcpu);
3449 if (kvm_mmu_unprotect_page_virt(vcpu, cr2)) 3815 if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
3450 return EMULATE_DONE; 3816 return EMULATE_DONE;
3451 return EMULATE_FAIL; 3817 return EMULATE_FAIL;
@@ -3457,16 +3823,20 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
3457 return EMULATE_DONE; 3823 return EMULATE_DONE;
3458 } 3824 }
3459 3825
3826restart:
3460 r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); 3827 r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
3461 shadow_mask = vcpu->arch.emulate_ctxt.interruptibility; 3828 shadow_mask = vcpu->arch.emulate_ctxt.interruptibility;
3462 3829
3463 if (r == 0) 3830 if (r == 0)
3464 kvm_x86_ops->set_interrupt_shadow(vcpu, shadow_mask); 3831 kvm_x86_ops->set_interrupt_shadow(vcpu, shadow_mask);
3465 3832
3466 if (vcpu->arch.pio.string) 3833 if (vcpu->arch.pio.count) {
3834 if (!vcpu->arch.pio.in)
3835 vcpu->arch.pio.count = 0;
3467 return EMULATE_DO_MMIO; 3836 return EMULATE_DO_MMIO;
3837 }
3468 3838
3469 if ((r || vcpu->mmio_is_write) && run) { 3839 if (r || vcpu->mmio_is_write) {
3470 run->exit_reason = KVM_EXIT_MMIO; 3840 run->exit_reason = KVM_EXIT_MMIO;
3471 run->mmio.phys_addr = vcpu->mmio_phys_addr; 3841 run->mmio.phys_addr = vcpu->mmio_phys_addr;
3472 memcpy(run->mmio.data, vcpu->mmio_data, 8); 3842 memcpy(run->mmio.data, vcpu->mmio_data, 8);
@@ -3476,222 +3846,41 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
3476 3846
3477 if (r) { 3847 if (r) {
3478 if (kvm_mmu_unprotect_page_virt(vcpu, cr2)) 3848 if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
3479 return EMULATE_DONE; 3849 goto done;
3480 if (!vcpu->mmio_needed) { 3850 if (!vcpu->mmio_needed) {
3851 ++vcpu->stat.insn_emulation_fail;
3852 trace_kvm_emulate_insn_failed(vcpu);
3481 kvm_report_emulation_failure(vcpu, "mmio"); 3853 kvm_report_emulation_failure(vcpu, "mmio");
3482 return EMULATE_FAIL; 3854 return EMULATE_FAIL;
3483 } 3855 }
3484 return EMULATE_DO_MMIO; 3856 return EMULATE_DO_MMIO;
3485 } 3857 }
3486 3858
3487 kvm_set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
3488
3489 if (vcpu->mmio_is_write) { 3859 if (vcpu->mmio_is_write) {
3490 vcpu->mmio_needed = 0; 3860 vcpu->mmio_needed = 0;
3491 return EMULATE_DO_MMIO; 3861 return EMULATE_DO_MMIO;
3492 } 3862 }
3493 3863
3494 return EMULATE_DONE; 3864done:
3495} 3865 if (vcpu->arch.exception.pending)
3496EXPORT_SYMBOL_GPL(emulate_instruction); 3866 vcpu->arch.emulate_ctxt.restart = false;
3497
3498static int pio_copy_data(struct kvm_vcpu *vcpu)
3499{
3500 void *p = vcpu->arch.pio_data;
3501 gva_t q = vcpu->arch.pio.guest_gva;
3502 unsigned bytes;
3503 int ret;
3504 u32 error_code;
3505
3506 bytes = vcpu->arch.pio.size * vcpu->arch.pio.cur_count;
3507 if (vcpu->arch.pio.in)
3508 ret = kvm_write_guest_virt(q, p, bytes, vcpu, &error_code);
3509 else
3510 ret = kvm_read_guest_virt(q, p, bytes, vcpu, &error_code);
3511
3512 if (ret == X86EMUL_PROPAGATE_FAULT)
3513 kvm_inject_page_fault(vcpu, q, error_code);
3514
3515 return ret;
3516}
3517
3518int complete_pio(struct kvm_vcpu *vcpu)
3519{
3520 struct kvm_pio_request *io = &vcpu->arch.pio;
3521 long delta;
3522 int r;
3523 unsigned long val;
3524
3525 if (!io->string) {
3526 if (io->in) {
3527 val = kvm_register_read(vcpu, VCPU_REGS_RAX);
3528 memcpy(&val, vcpu->arch.pio_data, io->size);
3529 kvm_register_write(vcpu, VCPU_REGS_RAX, val);
3530 }
3531 } else {
3532 if (io->in) {
3533 r = pio_copy_data(vcpu);
3534 if (r)
3535 goto out;
3536 }
3537
3538 delta = 1;
3539 if (io->rep) {
3540 delta *= io->cur_count;
3541 /*
3542 * The size of the register should really depend on
3543 * current address size.
3544 */
3545 val = kvm_register_read(vcpu, VCPU_REGS_RCX);
3546 val -= delta;
3547 kvm_register_write(vcpu, VCPU_REGS_RCX, val);
3548 }
3549 if (io->down)
3550 delta = -delta;
3551 delta *= io->size;
3552 if (io->in) {
3553 val = kvm_register_read(vcpu, VCPU_REGS_RDI);
3554 val += delta;
3555 kvm_register_write(vcpu, VCPU_REGS_RDI, val);
3556 } else {
3557 val = kvm_register_read(vcpu, VCPU_REGS_RSI);
3558 val += delta;
3559 kvm_register_write(vcpu, VCPU_REGS_RSI, val);
3560 }
3561 }
3562out:
3563 io->count -= io->cur_count;
3564 io->cur_count = 0;
3565
3566 return 0;
3567}
3568 3867
3569static int kernel_pio(struct kvm_vcpu *vcpu, void *pd) 3868 if (vcpu->arch.emulate_ctxt.restart)
3570{ 3869 goto restart;
3571 /* TODO: String I/O for in kernel device */
3572 int r;
3573 3870
3574 if (vcpu->arch.pio.in) 3871 return EMULATE_DONE;
3575 r = kvm_io_bus_read(vcpu->kvm, KVM_PIO_BUS, vcpu->arch.pio.port,
3576 vcpu->arch.pio.size, pd);
3577 else
3578 r = kvm_io_bus_write(vcpu->kvm, KVM_PIO_BUS,
3579 vcpu->arch.pio.port, vcpu->arch.pio.size,
3580 pd);
3581 return r;
3582}
3583
3584static int pio_string_write(struct kvm_vcpu *vcpu)
3585{
3586 struct kvm_pio_request *io = &vcpu->arch.pio;
3587 void *pd = vcpu->arch.pio_data;
3588 int i, r = 0;
3589
3590 for (i = 0; i < io->cur_count; i++) {
3591 if (kvm_io_bus_write(vcpu->kvm, KVM_PIO_BUS,
3592 io->port, io->size, pd)) {
3593 r = -EOPNOTSUPP;
3594 break;
3595 }
3596 pd += io->size;
3597 }
3598 return r;
3599}
3600
3601int kvm_emulate_pio(struct kvm_vcpu *vcpu, int in, int size, unsigned port)
3602{
3603 unsigned long val;
3604
3605 trace_kvm_pio(!in, port, size, 1);
3606
3607 vcpu->run->exit_reason = KVM_EXIT_IO;
3608 vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
3609 vcpu->run->io.size = vcpu->arch.pio.size = size;
3610 vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
3611 vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = 1;
3612 vcpu->run->io.port = vcpu->arch.pio.port = port;
3613 vcpu->arch.pio.in = in;
3614 vcpu->arch.pio.string = 0;
3615 vcpu->arch.pio.down = 0;
3616 vcpu->arch.pio.rep = 0;
3617
3618 if (!vcpu->arch.pio.in) {
3619 val = kvm_register_read(vcpu, VCPU_REGS_RAX);
3620 memcpy(vcpu->arch.pio_data, &val, 4);
3621 }
3622
3623 if (!kernel_pio(vcpu, vcpu->arch.pio_data)) {
3624 complete_pio(vcpu);
3625 return 1;
3626 }
3627 return 0;
3628} 3872}
3629EXPORT_SYMBOL_GPL(kvm_emulate_pio); 3873EXPORT_SYMBOL_GPL(emulate_instruction);
3630 3874
3631int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, int in, 3875int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port)
3632 int size, unsigned long count, int down,
3633 gva_t address, int rep, unsigned port)
3634{ 3876{
3635 unsigned now, in_page; 3877 unsigned long val = kvm_register_read(vcpu, VCPU_REGS_RAX);
3636 int ret = 0; 3878 int ret = emulator_pio_out_emulated(size, port, &val, 1, vcpu);
3637 3879 /* do not return to emulator after return from userspace */
3638 trace_kvm_pio(!in, port, size, count); 3880 vcpu->arch.pio.count = 0;
3639
3640 vcpu->run->exit_reason = KVM_EXIT_IO;
3641 vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
3642 vcpu->run->io.size = vcpu->arch.pio.size = size;
3643 vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
3644 vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = count;
3645 vcpu->run->io.port = vcpu->arch.pio.port = port;
3646 vcpu->arch.pio.in = in;
3647 vcpu->arch.pio.string = 1;
3648 vcpu->arch.pio.down = down;
3649 vcpu->arch.pio.rep = rep;
3650
3651 if (!count) {
3652 kvm_x86_ops->skip_emulated_instruction(vcpu);
3653 return 1;
3654 }
3655
3656 if (!down)
3657 in_page = PAGE_SIZE - offset_in_page(address);
3658 else
3659 in_page = offset_in_page(address) + size;
3660 now = min(count, (unsigned long)in_page / size);
3661 if (!now)
3662 now = 1;
3663 if (down) {
3664 /*
3665 * String I/O in reverse. Yuck. Kill the guest, fix later.
3666 */
3667 pr_unimpl(vcpu, "guest string pio down\n");
3668 kvm_inject_gp(vcpu, 0);
3669 return 1;
3670 }
3671 vcpu->run->io.count = now;
3672 vcpu->arch.pio.cur_count = now;
3673
3674 if (vcpu->arch.pio.cur_count == vcpu->arch.pio.count)
3675 kvm_x86_ops->skip_emulated_instruction(vcpu);
3676
3677 vcpu->arch.pio.guest_gva = address;
3678
3679 if (!vcpu->arch.pio.in) {
3680 /* string PIO write */
3681 ret = pio_copy_data(vcpu);
3682 if (ret == X86EMUL_PROPAGATE_FAULT)
3683 return 1;
3684 if (ret == 0 && !pio_string_write(vcpu)) {
3685 complete_pio(vcpu);
3686 if (vcpu->arch.pio.count == 0)
3687 ret = 1;
3688 }
3689 }
3690 /* no string PIO read support yet */
3691
3692 return ret; 3881 return ret;
3693} 3882}
3694EXPORT_SYMBOL_GPL(kvm_emulate_pio_string); 3883EXPORT_SYMBOL_GPL(kvm_fast_pio_out);
3695 3884
3696static void bounce_off(void *info) 3885static void bounce_off(void *info)
3697{ 3886{
@@ -4014,85 +4203,20 @@ int kvm_fix_hypercall(struct kvm_vcpu *vcpu)
4014 return emulator_write_emulated(rip, instruction, 3, vcpu); 4203 return emulator_write_emulated(rip, instruction, 3, vcpu);
4015} 4204}
4016 4205
4017static u64 mk_cr_64(u64 curr_cr, u32 new_val)
4018{
4019 return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
4020}
4021
4022void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base) 4206void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
4023{ 4207{
4024 struct descriptor_table dt = { limit, base }; 4208 struct desc_ptr dt = { limit, base };
4025 4209
4026 kvm_x86_ops->set_gdt(vcpu, &dt); 4210 kvm_x86_ops->set_gdt(vcpu, &dt);
4027} 4211}
4028 4212
4029void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base) 4213void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
4030{ 4214{
4031 struct descriptor_table dt = { limit, base }; 4215 struct desc_ptr dt = { limit, base };
4032 4216
4033 kvm_x86_ops->set_idt(vcpu, &dt); 4217 kvm_x86_ops->set_idt(vcpu, &dt);
4034} 4218}
4035 4219
4036void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
4037 unsigned long *rflags)
4038{
4039 kvm_lmsw(vcpu, msw);
4040 *rflags = kvm_get_rflags(vcpu);
4041}
4042
4043unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
4044{
4045 unsigned long value;
4046
4047 switch (cr) {
4048 case 0:
4049 value = kvm_read_cr0(vcpu);
4050 break;
4051 case 2:
4052 value = vcpu->arch.cr2;
4053 break;
4054 case 3:
4055 value = vcpu->arch.cr3;
4056 break;
4057 case 4:
4058 value = kvm_read_cr4(vcpu);
4059 break;
4060 case 8:
4061 value = kvm_get_cr8(vcpu);
4062 break;
4063 default:
4064 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
4065 return 0;
4066 }
4067
4068 return value;
4069}
4070
4071void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val,
4072 unsigned long *rflags)
4073{
4074 switch (cr) {
4075 case 0:
4076 kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val));
4077 *rflags = kvm_get_rflags(vcpu);
4078 break;
4079 case 2:
4080 vcpu->arch.cr2 = val;
4081 break;
4082 case 3:
4083 kvm_set_cr3(vcpu, val);
4084 break;
4085 case 4:
4086 kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val));
4087 break;
4088 case 8:
4089 kvm_set_cr8(vcpu, val & 0xfUL);
4090 break;
4091 default:
4092 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
4093 }
4094}
4095
4096static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i) 4220static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i)
4097{ 4221{
4098 struct kvm_cpuid_entry2 *e = &vcpu->arch.cpuid_entries[i]; 4222 struct kvm_cpuid_entry2 *e = &vcpu->arch.cpuid_entries[i];
@@ -4156,9 +4280,13 @@ int cpuid_maxphyaddr(struct kvm_vcpu *vcpu)
4156{ 4280{
4157 struct kvm_cpuid_entry2 *best; 4281 struct kvm_cpuid_entry2 *best;
4158 4282
4283 best = kvm_find_cpuid_entry(vcpu, 0x80000000, 0);
4284 if (!best || best->eax < 0x80000008)
4285 goto not_found;
4159 best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0); 4286 best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0);
4160 if (best) 4287 if (best)
4161 return best->eax & 0xff; 4288 return best->eax & 0xff;
4289not_found:
4162 return 36; 4290 return 36;
4163} 4291}
4164 4292
@@ -4272,6 +4400,9 @@ static void inject_pending_event(struct kvm_vcpu *vcpu)
4272{ 4400{
4273 /* try to reinject previous events if any */ 4401 /* try to reinject previous events if any */
4274 if (vcpu->arch.exception.pending) { 4402 if (vcpu->arch.exception.pending) {
4403 trace_kvm_inj_exception(vcpu->arch.exception.nr,
4404 vcpu->arch.exception.has_error_code,
4405 vcpu->arch.exception.error_code);
4275 kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr, 4406 kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr,
4276 vcpu->arch.exception.has_error_code, 4407 vcpu->arch.exception.has_error_code,
4277 vcpu->arch.exception.error_code); 4408 vcpu->arch.exception.error_code);
@@ -4532,24 +4663,17 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
4532 if (!irqchip_in_kernel(vcpu->kvm)) 4663 if (!irqchip_in_kernel(vcpu->kvm))
4533 kvm_set_cr8(vcpu, kvm_run->cr8); 4664 kvm_set_cr8(vcpu, kvm_run->cr8);
4534 4665
4535 if (vcpu->arch.pio.cur_count) { 4666 if (vcpu->arch.pio.count || vcpu->mmio_needed ||
4536 r = complete_pio(vcpu); 4667 vcpu->arch.emulate_ctxt.restart) {
4537 if (r) 4668 if (vcpu->mmio_needed) {
4538 goto out; 4669 memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
4539 } 4670 vcpu->mmio_read_completed = 1;
4540 if (vcpu->mmio_needed) { 4671 vcpu->mmio_needed = 0;
4541 memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8); 4672 }
4542 vcpu->mmio_read_completed = 1;
4543 vcpu->mmio_needed = 0;
4544
4545 vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); 4673 vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
4546 r = emulate_instruction(vcpu, vcpu->arch.mmio_fault_cr2, 0, 4674 r = emulate_instruction(vcpu, 0, 0, EMULTYPE_NO_DECODE);
4547 EMULTYPE_NO_DECODE);
4548 srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); 4675 srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
4549 if (r == EMULATE_DO_MMIO) { 4676 if (r == EMULATE_DO_MMIO) {
4550 /*
4551 * Read-modify-write. Back to userspace.
4552 */
4553 r = 0; 4677 r = 0;
4554 goto out; 4678 goto out;
4555 } 4679 }
@@ -4632,12 +4756,6 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
4632 return 0; 4756 return 0;
4633} 4757}
4634 4758
4635void kvm_get_segment(struct kvm_vcpu *vcpu,
4636 struct kvm_segment *var, int seg)
4637{
4638 kvm_x86_ops->get_segment(vcpu, var, seg);
4639}
4640
4641void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) 4759void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
4642{ 4760{
4643 struct kvm_segment cs; 4761 struct kvm_segment cs;
@@ -4651,7 +4769,7 @@ EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits);
4651int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, 4769int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
4652 struct kvm_sregs *sregs) 4770 struct kvm_sregs *sregs)
4653{ 4771{
4654 struct descriptor_table dt; 4772 struct desc_ptr dt;
4655 4773
4656 vcpu_load(vcpu); 4774 vcpu_load(vcpu);
4657 4775
@@ -4666,11 +4784,11 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
4666 kvm_get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); 4784 kvm_get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
4667 4785
4668 kvm_x86_ops->get_idt(vcpu, &dt); 4786 kvm_x86_ops->get_idt(vcpu, &dt);
4669 sregs->idt.limit = dt.limit; 4787 sregs->idt.limit = dt.size;
4670 sregs->idt.base = dt.base; 4788 sregs->idt.base = dt.address;
4671 kvm_x86_ops->get_gdt(vcpu, &dt); 4789 kvm_x86_ops->get_gdt(vcpu, &dt);
4672 sregs->gdt.limit = dt.limit; 4790 sregs->gdt.limit = dt.size;
4673 sregs->gdt.base = dt.base; 4791 sregs->gdt.base = dt.address;
4674 4792
4675 sregs->cr0 = kvm_read_cr0(vcpu); 4793 sregs->cr0 = kvm_read_cr0(vcpu);
4676 sregs->cr2 = vcpu->arch.cr2; 4794 sregs->cr2 = vcpu->arch.cr2;
@@ -4709,559 +4827,33 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
4709 return 0; 4827 return 0;
4710} 4828}
4711 4829
4712static void kvm_set_segment(struct kvm_vcpu *vcpu, 4830int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason,
4713 struct kvm_segment *var, int seg) 4831 bool has_error_code, u32 error_code)
4714{
4715 kvm_x86_ops->set_segment(vcpu, var, seg);
4716}
4717
4718static void seg_desct_to_kvm_desct(struct desc_struct *seg_desc, u16 selector,
4719 struct kvm_segment *kvm_desct)
4720{
4721 kvm_desct->base = get_desc_base(seg_desc);
4722 kvm_desct->limit = get_desc_limit(seg_desc);
4723 if (seg_desc->g) {
4724 kvm_desct->limit <<= 12;
4725 kvm_desct->limit |= 0xfff;
4726 }
4727 kvm_desct->selector = selector;
4728 kvm_desct->type = seg_desc->type;
4729 kvm_desct->present = seg_desc->p;
4730 kvm_desct->dpl = seg_desc->dpl;
4731 kvm_desct->db = seg_desc->d;
4732 kvm_desct->s = seg_desc->s;
4733 kvm_desct->l = seg_desc->l;
4734 kvm_desct->g = seg_desc->g;
4735 kvm_desct->avl = seg_desc->avl;
4736 if (!selector)
4737 kvm_desct->unusable = 1;
4738 else
4739 kvm_desct->unusable = 0;
4740 kvm_desct->padding = 0;
4741}
4742
4743static void get_segment_descriptor_dtable(struct kvm_vcpu *vcpu,
4744 u16 selector,
4745 struct descriptor_table *dtable)
4746{
4747 if (selector & 1 << 2) {
4748 struct kvm_segment kvm_seg;
4749
4750 kvm_get_segment(vcpu, &kvm_seg, VCPU_SREG_LDTR);
4751
4752 if (kvm_seg.unusable)
4753 dtable->limit = 0;
4754 else
4755 dtable->limit = kvm_seg.limit;
4756 dtable->base = kvm_seg.base;
4757 }
4758 else
4759 kvm_x86_ops->get_gdt(vcpu, dtable);
4760}
4761
4762/* allowed just for 8 bytes segments */
4763static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
4764 struct desc_struct *seg_desc)
4765{
4766 struct descriptor_table dtable;
4767 u16 index = selector >> 3;
4768 int ret;
4769 u32 err;
4770 gva_t addr;
4771
4772 get_segment_descriptor_dtable(vcpu, selector, &dtable);
4773
4774 if (dtable.limit < index * 8 + 7) {
4775 kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc);
4776 return X86EMUL_PROPAGATE_FAULT;
4777 }
4778 addr = dtable.base + index * 8;
4779 ret = kvm_read_guest_virt_system(addr, seg_desc, sizeof(*seg_desc),
4780 vcpu, &err);
4781 if (ret == X86EMUL_PROPAGATE_FAULT)
4782 kvm_inject_page_fault(vcpu, addr, err);
4783
4784 return ret;
4785}
4786
4787/* allowed just for 8 bytes segments */
4788static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
4789 struct desc_struct *seg_desc)
4790{
4791 struct descriptor_table dtable;
4792 u16 index = selector >> 3;
4793
4794 get_segment_descriptor_dtable(vcpu, selector, &dtable);
4795
4796 if (dtable.limit < index * 8 + 7)
4797 return 1;
4798 return kvm_write_guest_virt(dtable.base + index*8, seg_desc, sizeof(*seg_desc), vcpu, NULL);
4799}
4800
4801static gpa_t get_tss_base_addr_write(struct kvm_vcpu *vcpu,
4802 struct desc_struct *seg_desc)
4803{
4804 u32 base_addr = get_desc_base(seg_desc);
4805
4806 return kvm_mmu_gva_to_gpa_write(vcpu, base_addr, NULL);
4807}
4808
4809static gpa_t get_tss_base_addr_read(struct kvm_vcpu *vcpu,
4810 struct desc_struct *seg_desc)
4811{
4812 u32 base_addr = get_desc_base(seg_desc);
4813
4814 return kvm_mmu_gva_to_gpa_read(vcpu, base_addr, NULL);
4815}
4816
4817static u16 get_segment_selector(struct kvm_vcpu *vcpu, int seg)
4818{
4819 struct kvm_segment kvm_seg;
4820
4821 kvm_get_segment(vcpu, &kvm_seg, seg);
4822 return kvm_seg.selector;
4823}
4824
4825static int kvm_load_realmode_segment(struct kvm_vcpu *vcpu, u16 selector, int seg)
4826{
4827 struct kvm_segment segvar = {
4828 .base = selector << 4,
4829 .limit = 0xffff,
4830 .selector = selector,
4831 .type = 3,
4832 .present = 1,
4833 .dpl = 3,
4834 .db = 0,
4835 .s = 1,
4836 .l = 0,
4837 .g = 0,
4838 .avl = 0,
4839 .unusable = 0,
4840 };
4841 kvm_x86_ops->set_segment(vcpu, &segvar, seg);
4842 return X86EMUL_CONTINUE;
4843}
4844
4845static int is_vm86_segment(struct kvm_vcpu *vcpu, int seg)
4846{ 4832{
4847 return (seg != VCPU_SREG_LDTR) && 4833 int cs_db, cs_l, ret;
4848 (seg != VCPU_SREG_TR) && 4834 cache_all_regs(vcpu);
4849 (kvm_get_rflags(vcpu) & X86_EFLAGS_VM);
4850}
4851
4852int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg)
4853{
4854 struct kvm_segment kvm_seg;
4855 struct desc_struct seg_desc;
4856 u8 dpl, rpl, cpl;
4857 unsigned err_vec = GP_VECTOR;
4858 u32 err_code = 0;
4859 bool null_selector = !(selector & ~0x3); /* 0000-0003 are null */
4860 int ret;
4861 4835
4862 if (is_vm86_segment(vcpu, seg) || !is_protmode(vcpu)) 4836 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
4863 return kvm_load_realmode_segment(vcpu, selector, seg);
4864 4837
4865 /* NULL selector is not valid for TR, CS and SS */ 4838 vcpu->arch.emulate_ctxt.vcpu = vcpu;
4866 if ((seg == VCPU_SREG_CS || seg == VCPU_SREG_SS || seg == VCPU_SREG_TR) 4839 vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu);
4867 && null_selector) 4840 vcpu->arch.emulate_ctxt.eip = kvm_rip_read(vcpu);
4868 goto exception; 4841 vcpu->arch.emulate_ctxt.mode =
4842 (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL :
4843 (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM)
4844 ? X86EMUL_MODE_VM86 : cs_l
4845 ? X86EMUL_MODE_PROT64 : cs_db
4846 ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
4869 4847
4870 /* TR should be in GDT only */ 4848 ret = emulator_task_switch(&vcpu->arch.emulate_ctxt, &emulate_ops,
4871 if (seg == VCPU_SREG_TR && (selector & (1 << 2))) 4849 tss_selector, reason, has_error_code,
4872 goto exception; 4850 error_code);
4873 4851
4874 ret = load_guest_segment_descriptor(vcpu, selector, &seg_desc);
4875 if (ret) 4852 if (ret)
4876 return ret; 4853 return EMULATE_FAIL;
4877
4878 seg_desct_to_kvm_desct(&seg_desc, selector, &kvm_seg);
4879
4880 if (null_selector) { /* for NULL selector skip all following checks */
4881 kvm_seg.unusable = 1;
4882 goto load;
4883 }
4884
4885 err_code = selector & 0xfffc;
4886 err_vec = GP_VECTOR;
4887
4888	/* can't load system descriptor into segment selector */
4889 if (seg <= VCPU_SREG_GS && !kvm_seg.s)
4890 goto exception;
4891
4892 if (!kvm_seg.present) {
4893 err_vec = (seg == VCPU_SREG_SS) ? SS_VECTOR : NP_VECTOR;
4894 goto exception;
4895 }
4896
4897 rpl = selector & 3;
4898 dpl = kvm_seg.dpl;
4899 cpl = kvm_x86_ops->get_cpl(vcpu);
4900
4901 switch (seg) {
4902 case VCPU_SREG_SS:
4903 /*
4904 * segment is not a writable data segment or segment
4905 * selector's RPL != CPL or segment selector's RPL != CPL
4906 */
4907 if (rpl != cpl || (kvm_seg.type & 0xa) != 0x2 || dpl != cpl)
4908 goto exception;
4909 break;
4910 case VCPU_SREG_CS:
4911 if (!(kvm_seg.type & 8))
4912 goto exception;
4913
4914 if (kvm_seg.type & 4) {
4915 /* conforming */
4916 if (dpl > cpl)
4917 goto exception;
4918 } else {
4919 /* nonconforming */
4920 if (rpl > cpl || dpl != cpl)
4921 goto exception;
4922 }
4923 /* CS(RPL) <- CPL */
4924 selector = (selector & 0xfffc) | cpl;
4925 break;
4926 case VCPU_SREG_TR:
4927 if (kvm_seg.s || (kvm_seg.type != 1 && kvm_seg.type != 9))
4928 goto exception;
4929 break;
4930 case VCPU_SREG_LDTR:
4931 if (kvm_seg.s || kvm_seg.type != 2)
4932 goto exception;
4933 break;
4934 default: /* DS, ES, FS, or GS */
4935 /*
4936 * segment is not a data or readable code segment or
4937 * ((segment is a data or nonconforming code segment)
4938 * and (both RPL and CPL > DPL))
4939 */
4940 if ((kvm_seg.type & 0xa) == 0x8 ||
4941 (((kvm_seg.type & 0xc) != 0xc) && (rpl > dpl && cpl > dpl)))
4942 goto exception;
4943 break;
4944 }
4945
4946 if (!kvm_seg.unusable && kvm_seg.s) {
4947 /* mark segment as accessed */
4948 kvm_seg.type |= 1;
4949 seg_desc.type |= 1;
4950 save_guest_segment_descriptor(vcpu, selector, &seg_desc);
4951 }
4952load:
4953 kvm_set_segment(vcpu, &kvm_seg, seg);
4954 return X86EMUL_CONTINUE;
4955exception:
4956 kvm_queue_exception_e(vcpu, err_vec, err_code);
4957 return X86EMUL_PROPAGATE_FAULT;
4958}
4959
4960static void save_state_to_tss32(struct kvm_vcpu *vcpu,
4961 struct tss_segment_32 *tss)
4962{
4963 tss->cr3 = vcpu->arch.cr3;
4964 tss->eip = kvm_rip_read(vcpu);
4965 tss->eflags = kvm_get_rflags(vcpu);
4966 tss->eax = kvm_register_read(vcpu, VCPU_REGS_RAX);
4967 tss->ecx = kvm_register_read(vcpu, VCPU_REGS_RCX);
4968 tss->edx = kvm_register_read(vcpu, VCPU_REGS_RDX);
4969 tss->ebx = kvm_register_read(vcpu, VCPU_REGS_RBX);
4970 tss->esp = kvm_register_read(vcpu, VCPU_REGS_RSP);
4971 tss->ebp = kvm_register_read(vcpu, VCPU_REGS_RBP);
4972 tss->esi = kvm_register_read(vcpu, VCPU_REGS_RSI);
4973 tss->edi = kvm_register_read(vcpu, VCPU_REGS_RDI);
4974 tss->es = get_segment_selector(vcpu, VCPU_SREG_ES);
4975 tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS);
4976 tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS);
4977 tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS);
4978 tss->fs = get_segment_selector(vcpu, VCPU_SREG_FS);
4979 tss->gs = get_segment_selector(vcpu, VCPU_SREG_GS);
4980 tss->ldt_selector = get_segment_selector(vcpu, VCPU_SREG_LDTR);
4981}
4982
4983static void kvm_load_segment_selector(struct kvm_vcpu *vcpu, u16 sel, int seg)
4984{
4985 struct kvm_segment kvm_seg;
4986 kvm_get_segment(vcpu, &kvm_seg, seg);
4987 kvm_seg.selector = sel;
4988 kvm_set_segment(vcpu, &kvm_seg, seg);
4989}
4990
4991static int load_state_from_tss32(struct kvm_vcpu *vcpu,
4992 struct tss_segment_32 *tss)
4993{
4994 kvm_set_cr3(vcpu, tss->cr3);
4995
4996 kvm_rip_write(vcpu, tss->eip);
4997 kvm_set_rflags(vcpu, tss->eflags | 2);
4998
4999 kvm_register_write(vcpu, VCPU_REGS_RAX, tss->eax);
5000 kvm_register_write(vcpu, VCPU_REGS_RCX, tss->ecx);
5001 kvm_register_write(vcpu, VCPU_REGS_RDX, tss->edx);
5002 kvm_register_write(vcpu, VCPU_REGS_RBX, tss->ebx);
5003 kvm_register_write(vcpu, VCPU_REGS_RSP, tss->esp);
5004 kvm_register_write(vcpu, VCPU_REGS_RBP, tss->ebp);
5005 kvm_register_write(vcpu, VCPU_REGS_RSI, tss->esi);
5006 kvm_register_write(vcpu, VCPU_REGS_RDI, tss->edi);
5007
5008 /*
5009 * SDM says that segment selectors are loaded before segment
5010 * descriptors
5011 */
5012 kvm_load_segment_selector(vcpu, tss->ldt_selector, VCPU_SREG_LDTR);
5013 kvm_load_segment_selector(vcpu, tss->es, VCPU_SREG_ES);
5014 kvm_load_segment_selector(vcpu, tss->cs, VCPU_SREG_CS);
5015 kvm_load_segment_selector(vcpu, tss->ss, VCPU_SREG_SS);
5016 kvm_load_segment_selector(vcpu, tss->ds, VCPU_SREG_DS);
5017 kvm_load_segment_selector(vcpu, tss->fs, VCPU_SREG_FS);
5018 kvm_load_segment_selector(vcpu, tss->gs, VCPU_SREG_GS);
5019
5020 /*
5021	 * Now load segment descriptors. If a fault happens at this stage
5022	 * it is handled in the context of the new task
5023 */
5024 if (kvm_load_segment_descriptor(vcpu, tss->ldt_selector, VCPU_SREG_LDTR))
5025 return 1;
5026
5027 if (kvm_load_segment_descriptor(vcpu, tss->es, VCPU_SREG_ES))
5028 return 1;
5029
5030 if (kvm_load_segment_descriptor(vcpu, tss->cs, VCPU_SREG_CS))
5031 return 1;
5032
5033 if (kvm_load_segment_descriptor(vcpu, tss->ss, VCPU_SREG_SS))
5034 return 1;
5035
5036 if (kvm_load_segment_descriptor(vcpu, tss->ds, VCPU_SREG_DS))
5037 return 1;
5038
5039 if (kvm_load_segment_descriptor(vcpu, tss->fs, VCPU_SREG_FS))
5040 return 1;
5041
5042 if (kvm_load_segment_descriptor(vcpu, tss->gs, VCPU_SREG_GS))
5043 return 1;
5044 return 0;
5045}
5046
5047static void save_state_to_tss16(struct kvm_vcpu *vcpu,
5048 struct tss_segment_16 *tss)
5049{
5050 tss->ip = kvm_rip_read(vcpu);
5051 tss->flag = kvm_get_rflags(vcpu);
5052 tss->ax = kvm_register_read(vcpu, VCPU_REGS_RAX);
5053 tss->cx = kvm_register_read(vcpu, VCPU_REGS_RCX);
5054 tss->dx = kvm_register_read(vcpu, VCPU_REGS_RDX);
5055 tss->bx = kvm_register_read(vcpu, VCPU_REGS_RBX);
5056 tss->sp = kvm_register_read(vcpu, VCPU_REGS_RSP);
5057 tss->bp = kvm_register_read(vcpu, VCPU_REGS_RBP);
5058 tss->si = kvm_register_read(vcpu, VCPU_REGS_RSI);
5059 tss->di = kvm_register_read(vcpu, VCPU_REGS_RDI);
5060
5061 tss->es = get_segment_selector(vcpu, VCPU_SREG_ES);
5062 tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS);
5063 tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS);
5064 tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS);
5065 tss->ldt = get_segment_selector(vcpu, VCPU_SREG_LDTR);
5066}
5067
5068static int load_state_from_tss16(struct kvm_vcpu *vcpu,
5069 struct tss_segment_16 *tss)
5070{
5071 kvm_rip_write(vcpu, tss->ip);
5072 kvm_set_rflags(vcpu, tss->flag | 2);
5073 kvm_register_write(vcpu, VCPU_REGS_RAX, tss->ax);
5074 kvm_register_write(vcpu, VCPU_REGS_RCX, tss->cx);
5075 kvm_register_write(vcpu, VCPU_REGS_RDX, tss->dx);
5076 kvm_register_write(vcpu, VCPU_REGS_RBX, tss->bx);
5077 kvm_register_write(vcpu, VCPU_REGS_RSP, tss->sp);
5078 kvm_register_write(vcpu, VCPU_REGS_RBP, tss->bp);
5079 kvm_register_write(vcpu, VCPU_REGS_RSI, tss->si);
5080 kvm_register_write(vcpu, VCPU_REGS_RDI, tss->di);
5081
5082 /*
5083 * SDM says that segment selectors are loaded before segment
5084 * descriptors
5085 */
5086 kvm_load_segment_selector(vcpu, tss->ldt, VCPU_SREG_LDTR);
5087 kvm_load_segment_selector(vcpu, tss->es, VCPU_SREG_ES);
5088 kvm_load_segment_selector(vcpu, tss->cs, VCPU_SREG_CS);
5089 kvm_load_segment_selector(vcpu, tss->ss, VCPU_SREG_SS);
5090 kvm_load_segment_selector(vcpu, tss->ds, VCPU_SREG_DS);
5091
5092 /*
5093	 * Now load segment descriptors. If a fault happens at this stage
5094	 * it is handled in the context of the new task
5095 */
5096 if (kvm_load_segment_descriptor(vcpu, tss->ldt, VCPU_SREG_LDTR))
5097 return 1;
5098
5099 if (kvm_load_segment_descriptor(vcpu, tss->es, VCPU_SREG_ES))
5100 return 1;
5101
5102 if (kvm_load_segment_descriptor(vcpu, tss->cs, VCPU_SREG_CS))
5103 return 1;
5104
5105 if (kvm_load_segment_descriptor(vcpu, tss->ss, VCPU_SREG_SS))
5106 return 1;
5107
5108 if (kvm_load_segment_descriptor(vcpu, tss->ds, VCPU_SREG_DS))
5109 return 1;
5110 return 0;
5111}
5112
5113static int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector,
5114 u16 old_tss_sel, u32 old_tss_base,
5115 struct desc_struct *nseg_desc)
5116{
5117 struct tss_segment_16 tss_segment_16;
5118 int ret = 0;
5119
5120 if (kvm_read_guest(vcpu->kvm, old_tss_base, &tss_segment_16,
5121 sizeof tss_segment_16))
5122 goto out;
5123
5124 save_state_to_tss16(vcpu, &tss_segment_16);
5125
5126 if (kvm_write_guest(vcpu->kvm, old_tss_base, &tss_segment_16,
5127 sizeof tss_segment_16))
5128 goto out;
5129
5130 if (kvm_read_guest(vcpu->kvm, get_tss_base_addr_read(vcpu, nseg_desc),
5131 &tss_segment_16, sizeof tss_segment_16))
5132 goto out;
5133
5134 if (old_tss_sel != 0xffff) {
5135 tss_segment_16.prev_task_link = old_tss_sel;
5136
5137 if (kvm_write_guest(vcpu->kvm,
5138 get_tss_base_addr_write(vcpu, nseg_desc),
5139 &tss_segment_16.prev_task_link,
5140 sizeof tss_segment_16.prev_task_link))
5141 goto out;
5142 }
5143
5144 if (load_state_from_tss16(vcpu, &tss_segment_16))
5145 goto out;
5146
5147 ret = 1;
5148out:
5149 return ret;
5150}
5151
5152static int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector,
5153 u16 old_tss_sel, u32 old_tss_base,
5154 struct desc_struct *nseg_desc)
5155{
5156 struct tss_segment_32 tss_segment_32;
5157 int ret = 0;
5158
5159 if (kvm_read_guest(vcpu->kvm, old_tss_base, &tss_segment_32,
5160 sizeof tss_segment_32))
5161 goto out;
5162
5163 save_state_to_tss32(vcpu, &tss_segment_32);
5164
5165 if (kvm_write_guest(vcpu->kvm, old_tss_base, &tss_segment_32,
5166 sizeof tss_segment_32))
5167 goto out;
5168
5169 if (kvm_read_guest(vcpu->kvm, get_tss_base_addr_read(vcpu, nseg_desc),
5170 &tss_segment_32, sizeof tss_segment_32))
5171 goto out;
5172
5173 if (old_tss_sel != 0xffff) {
5174 tss_segment_32.prev_task_link = old_tss_sel;
5175
5176 if (kvm_write_guest(vcpu->kvm,
5177 get_tss_base_addr_write(vcpu, nseg_desc),
5178 &tss_segment_32.prev_task_link,
5179 sizeof tss_segment_32.prev_task_link))
5180 goto out;
5181 }
5182
5183 if (load_state_from_tss32(vcpu, &tss_segment_32))
5184 goto out;
5185
5186 ret = 1;
5187out:
5188 return ret;
5189}
5190
5191int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
5192{
5193 struct kvm_segment tr_seg;
5194 struct desc_struct cseg_desc;
5195 struct desc_struct nseg_desc;
5196 int ret = 0;
5197 u32 old_tss_base = get_segment_base(vcpu, VCPU_SREG_TR);
5198 u16 old_tss_sel = get_segment_selector(vcpu, VCPU_SREG_TR);
5199
5200 old_tss_base = kvm_mmu_gva_to_gpa_write(vcpu, old_tss_base, NULL);
5201
5202 /* FIXME: Handle errors. Failure to read either TSS or their
5203 * descriptors should generate a pagefault.
5204 */
5205 if (load_guest_segment_descriptor(vcpu, tss_selector, &nseg_desc))
5206 goto out;
5207
5208 if (load_guest_segment_descriptor(vcpu, old_tss_sel, &cseg_desc))
5209 goto out;
5210
5211 if (reason != TASK_SWITCH_IRET) {
5212 int cpl;
5213
5214 cpl = kvm_x86_ops->get_cpl(vcpu);
5215 if ((tss_selector & 3) > nseg_desc.dpl || cpl > nseg_desc.dpl) {
5216 kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
5217 return 1;
5218 }
5219 }
5220 4854
5221 if (!nseg_desc.p || get_desc_limit(&nseg_desc) < 0x67) { 4855 kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
5222 kvm_queue_exception_e(vcpu, TS_VECTOR, tss_selector & 0xfffc); 4856 return EMULATE_DONE;
5223 return 1;
5224 }
5225
5226 if (reason == TASK_SWITCH_IRET || reason == TASK_SWITCH_JMP) {
5227 cseg_desc.type &= ~(1 << 1); //clear the B flag
5228 save_guest_segment_descriptor(vcpu, old_tss_sel, &cseg_desc);
5229 }
5230
5231 if (reason == TASK_SWITCH_IRET) {
5232 u32 eflags = kvm_get_rflags(vcpu);
5233 kvm_set_rflags(vcpu, eflags & ~X86_EFLAGS_NT);
5234 }
5235
5236	/* set back link to prev task only if NT bit is set in eflags
5237	   note that old_tss_sel is not used after this point */
5238 if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE)
5239 old_tss_sel = 0xffff;
5240
5241 if (nseg_desc.type & 8)
5242 ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_sel,
5243 old_tss_base, &nseg_desc);
5244 else
5245 ret = kvm_task_switch_16(vcpu, tss_selector, old_tss_sel,
5246 old_tss_base, &nseg_desc);
5247
5248 if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE) {
5249 u32 eflags = kvm_get_rflags(vcpu);
5250 kvm_set_rflags(vcpu, eflags | X86_EFLAGS_NT);
5251 }
5252
5253 if (reason != TASK_SWITCH_IRET) {
5254 nseg_desc.type |= (1 << 1);
5255 save_guest_segment_descriptor(vcpu, tss_selector,
5256 &nseg_desc);
5257 }
5258
5259 kvm_x86_ops->set_cr0(vcpu, kvm_read_cr0(vcpu) | X86_CR0_TS);
5260 seg_desct_to_kvm_desct(&nseg_desc, tss_selector, &tr_seg);
5261 tr_seg.type = 11;
5262 kvm_set_segment(vcpu, &tr_seg, VCPU_SREG_TR);
5263out:
5264 return ret;
5265} 4857}
5266EXPORT_SYMBOL_GPL(kvm_task_switch); 4858EXPORT_SYMBOL_GPL(kvm_task_switch);
5267 4859
@@ -5270,15 +4862,15 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
5270{ 4862{
5271 int mmu_reset_needed = 0; 4863 int mmu_reset_needed = 0;
5272 int pending_vec, max_bits; 4864 int pending_vec, max_bits;
5273 struct descriptor_table dt; 4865 struct desc_ptr dt;
5274 4866
5275 vcpu_load(vcpu); 4867 vcpu_load(vcpu);
5276 4868
5277 dt.limit = sregs->idt.limit; 4869 dt.size = sregs->idt.limit;
5278 dt.base = sregs->idt.base; 4870 dt.address = sregs->idt.base;
5279 kvm_x86_ops->set_idt(vcpu, &dt); 4871 kvm_x86_ops->set_idt(vcpu, &dt);
5280 dt.limit = sregs->gdt.limit; 4872 dt.size = sregs->gdt.limit;
5281 dt.base = sregs->gdt.base; 4873 dt.address = sregs->gdt.base;
5282 kvm_x86_ops->set_gdt(vcpu, &dt); 4874 kvm_x86_ops->set_gdt(vcpu, &dt);
5283 4875
5284 vcpu->arch.cr2 = sregs->cr2; 4876 vcpu->arch.cr2 = sregs->cr2;
@@ -5377,11 +4969,9 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
5377 vcpu->arch.switch_db_regs = (vcpu->arch.dr7 & DR7_BP_EN_MASK); 4969 vcpu->arch.switch_db_regs = (vcpu->arch.dr7 & DR7_BP_EN_MASK);
5378 } 4970 }
5379 4971
5380 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) { 4972 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
5381 vcpu->arch.singlestep_cs = 4973 vcpu->arch.singlestep_rip = kvm_rip_read(vcpu) +
5382 get_segment_selector(vcpu, VCPU_SREG_CS); 4974 get_segment_base(vcpu, VCPU_SREG_CS);
5383 vcpu->arch.singlestep_rip = kvm_rip_read(vcpu);
5384 }
5385 4975
5386 /* 4976 /*
5387 * Trigger an rflags update that will inject or remove the trace 4977 * Trigger an rflags update that will inject or remove the trace
@@ -5872,13 +5462,22 @@ int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu)
5872 return kvm_x86_ops->interrupt_allowed(vcpu); 5462 return kvm_x86_ops->interrupt_allowed(vcpu);
5873} 5463}
5874 5464
5465bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip)
5466{
5467 unsigned long current_rip = kvm_rip_read(vcpu) +
5468 get_segment_base(vcpu, VCPU_SREG_CS);
5469
5470 return current_rip == linear_rip;
5471}
5472EXPORT_SYMBOL_GPL(kvm_is_linear_rip);
5473
5875unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu) 5474unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu)
5876{ 5475{
5877 unsigned long rflags; 5476 unsigned long rflags;
5878 5477
5879 rflags = kvm_x86_ops->get_rflags(vcpu); 5478 rflags = kvm_x86_ops->get_rflags(vcpu);
5880 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) 5479 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
5881 rflags &= ~(unsigned long)(X86_EFLAGS_TF | X86_EFLAGS_RF); 5480 rflags &= ~X86_EFLAGS_TF;
5882 return rflags; 5481 return rflags;
5883} 5482}
5884EXPORT_SYMBOL_GPL(kvm_get_rflags); 5483EXPORT_SYMBOL_GPL(kvm_get_rflags);
@@ -5886,10 +5485,8 @@ EXPORT_SYMBOL_GPL(kvm_get_rflags);
5886void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) 5485void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
5887{ 5486{
5888 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP && 5487 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP &&
5889 vcpu->arch.singlestep_cs == 5488 kvm_is_linear_rip(vcpu, vcpu->arch.singlestep_rip))
5890 get_segment_selector(vcpu, VCPU_SREG_CS) && 5489 rflags |= X86_EFLAGS_TF;
5891 vcpu->arch.singlestep_rip == kvm_rip_read(vcpu))
5892 rflags |= X86_EFLAGS_TF | X86_EFLAGS_RF;
5893 kvm_x86_ops->set_rflags(vcpu, rflags); 5490 kvm_x86_ops->set_rflags(vcpu, rflags);
5894} 5491}
5895EXPORT_SYMBOL_GPL(kvm_set_rflags); 5492EXPORT_SYMBOL_GPL(kvm_set_rflags);
@@ -5905,3 +5502,4 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit_inject);
5905EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intr_vmexit); 5502EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intr_vmexit);
5906EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_invlpga); 5503EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_invlpga);
5907EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_skinit); 5504EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_skinit);
5505EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intercepts);
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index b7a404722d2b..f4b54458285b 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -65,6 +65,13 @@ static inline int is_paging(struct kvm_vcpu *vcpu)
65 return kvm_read_cr0_bits(vcpu, X86_CR0_PG); 65 return kvm_read_cr0_bits(vcpu, X86_CR0_PG);
66} 66}
67 67
68static inline struct kvm_mem_aliases *kvm_aliases(struct kvm *kvm)
69{
70 return rcu_dereference_check(kvm->arch.aliases,
71 srcu_read_lock_held(&kvm->srcu)
72 || lockdep_is_held(&kvm->slots_lock));
73}
74
68void kvm_before_handle_nmi(struct kvm_vcpu *vcpu); 75void kvm_before_handle_nmi(struct kvm_vcpu *vcpu);
69void kvm_after_handle_nmi(struct kvm_vcpu *vcpu); 76void kvm_after_handle_nmi(struct kvm_vcpu *vcpu);
70 77
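The new kvm_aliases() helper funnels every reader through one accessor that combines the RCU dereference with a lockdep check that either the SRCU read side or kvm->slots_lock is held. A user-space sketch of that accessor pattern, with C11 atomics and a plain assert standing in for rcu_dereference_check(); every name here is illustrative:

#include <assert.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Readers reach the shared table only through one accessor that combines the
 * dereference with an assertion about the locking rule, which is the shape of
 * kvm_aliases().  C11 atomics and a thread-local flag stand in for RCU and
 * srcu_read_lock_held(). */
struct mem_aliases { int naliases; };

static _Atomic(struct mem_aliases *) aliases_ptr;
static _Thread_local bool read_side_locked;

static struct mem_aliases *get_aliases(void)
{
	assert(read_side_locked);	/* the "_check" half of rcu_dereference_check() */
	return atomic_load_explicit(&aliases_ptr, memory_order_acquire);
}

int main(void)
{
	static struct mem_aliases table = { .naliases = 3 };

	atomic_store_explicit(&aliases_ptr, &table, memory_order_release);
	read_side_locked = true;	/* e.g. taken around the lookup */
	printf("naliases = %d\n", get_aliases()->naliases);
	return 0;
}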
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index 7e59dc1d3fc2..2bdf628066bd 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -115,7 +115,7 @@ static void async_hcall(unsigned long call, unsigned long arg1,
115 local_irq_save(flags); 115 local_irq_save(flags);
116 if (lguest_data.hcall_status[next_call] != 0xFF) { 116 if (lguest_data.hcall_status[next_call] != 0xFF) {
117 /* Table full, so do normal hcall which will flush table. */ 117 /* Table full, so do normal hcall which will flush table. */
118 kvm_hypercall4(call, arg1, arg2, arg3, arg4); 118 hcall(call, arg1, arg2, arg3, arg4);
119 } else { 119 } else {
120 lguest_data.hcalls[next_call].arg0 = call; 120 lguest_data.hcalls[next_call].arg0 = call;
121 lguest_data.hcalls[next_call].arg1 = arg1; 121 lguest_data.hcalls[next_call].arg1 = arg1;
@@ -145,46 +145,45 @@ static void async_hcall(unsigned long call, unsigned long arg1,
145 * So, when we're in lazy mode, we call async_hcall() to store the call for 145 * So, when we're in lazy mode, we call async_hcall() to store the call for
146 * future processing: 146 * future processing:
147 */ 147 */
148static void lazy_hcall1(unsigned long call, 148static void lazy_hcall1(unsigned long call, unsigned long arg1)
149 unsigned long arg1)
150{ 149{
151 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) 150 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE)
152 kvm_hypercall1(call, arg1); 151 hcall(call, arg1, 0, 0, 0);
153 else 152 else
154 async_hcall(call, arg1, 0, 0, 0); 153 async_hcall(call, arg1, 0, 0, 0);
155} 154}
156 155
157/* You can imagine what lazy_hcall2, 3 and 4 look like. :*/ 156/* You can imagine what lazy_hcall2, 3 and 4 look like. :*/
158static void lazy_hcall2(unsigned long call, 157static void lazy_hcall2(unsigned long call,
159 unsigned long arg1, 158 unsigned long arg1,
160 unsigned long arg2) 159 unsigned long arg2)
161{ 160{
162 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) 161 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE)
163 kvm_hypercall2(call, arg1, arg2); 162 hcall(call, arg1, arg2, 0, 0);
164 else 163 else
165 async_hcall(call, arg1, arg2, 0, 0); 164 async_hcall(call, arg1, arg2, 0, 0);
166} 165}
167 166
168static void lazy_hcall3(unsigned long call, 167static void lazy_hcall3(unsigned long call,
169 unsigned long arg1, 168 unsigned long arg1,
170 unsigned long arg2, 169 unsigned long arg2,
171 unsigned long arg3) 170 unsigned long arg3)
172{ 171{
173 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) 172 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE)
174 kvm_hypercall3(call, arg1, arg2, arg3); 173 hcall(call, arg1, arg2, arg3, 0);
175 else 174 else
176 async_hcall(call, arg1, arg2, arg3, 0); 175 async_hcall(call, arg1, arg2, arg3, 0);
177} 176}
178 177
179#ifdef CONFIG_X86_PAE 178#ifdef CONFIG_X86_PAE
180static void lazy_hcall4(unsigned long call, 179static void lazy_hcall4(unsigned long call,
181 unsigned long arg1, 180 unsigned long arg1,
182 unsigned long arg2, 181 unsigned long arg2,
183 unsigned long arg3, 182 unsigned long arg3,
184 unsigned long arg4) 183 unsigned long arg4)
185{ 184{
186 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) 185 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE)
187 kvm_hypercall4(call, arg1, arg2, arg3, arg4); 186 hcall(call, arg1, arg2, arg3, arg4);
188 else 187 else
189 async_hcall(call, arg1, arg2, arg3, arg4); 188 async_hcall(call, arg1, arg2, arg3, arg4);
190} 189}
@@ -196,13 +195,13 @@ static void lazy_hcall4(unsigned long call,
196:*/ 195:*/
197static void lguest_leave_lazy_mmu_mode(void) 196static void lguest_leave_lazy_mmu_mode(void)
198{ 197{
199 kvm_hypercall0(LHCALL_FLUSH_ASYNC); 198 hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0, 0);
200 paravirt_leave_lazy_mmu(); 199 paravirt_leave_lazy_mmu();
201} 200}
202 201
203static void lguest_end_context_switch(struct task_struct *next) 202static void lguest_end_context_switch(struct task_struct *next)
204{ 203{
205 kvm_hypercall0(LHCALL_FLUSH_ASYNC); 204 hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0, 0);
206 paravirt_end_context_switch(next); 205 paravirt_end_context_switch(next);
207} 206}
208 207
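Putting the pieces together: while the guest sits in a lazy CPU or MMU section, the lazy_hcall helpers above push work into the async ring instead of trapping for every operation, and the two exit hooks here issue LHCALL_FLUSH_ASYNC so the Host catches up before anything can depend on the deferred state. Roughly, for the MMU case (a comment-only sketch using the generic paravirt hooks):

	/*
	 *	arch_enter_lazy_mmu_mode();		// PARAVIRT_LAZY_MMU: start batching
	 *	set_pte_at(mm, addr, ptep, pte);	// -> lazy_hcall*() -> async ring
	 *	set_pte_at(mm, addr2, ptep2, pte2);	// -> queued, still no trap
	 *	arch_leave_lazy_mmu_mode();		// -> lguest_leave_lazy_mmu_mode()
	 *						//    -> hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0, 0)
	 */
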
@@ -286,7 +285,7 @@ static void lguest_write_idt_entry(gate_desc *dt,
286 /* Keep the local copy up to date. */ 285 /* Keep the local copy up to date. */
287 native_write_idt_entry(dt, entrynum, g); 286 native_write_idt_entry(dt, entrynum, g);
288 /* Tell Host about this new entry. */ 287 /* Tell Host about this new entry. */
289 kvm_hypercall3(LHCALL_LOAD_IDT_ENTRY, entrynum, desc[0], desc[1]); 288 hcall(LHCALL_LOAD_IDT_ENTRY, entrynum, desc[0], desc[1], 0);
290} 289}
291 290
292/* 291/*
@@ -300,7 +299,7 @@ static void lguest_load_idt(const struct desc_ptr *desc)
300 struct desc_struct *idt = (void *)desc->address; 299 struct desc_struct *idt = (void *)desc->address;
301 300
302 for (i = 0; i < (desc->size+1)/8; i++) 301 for (i = 0; i < (desc->size+1)/8; i++)
303 kvm_hypercall3(LHCALL_LOAD_IDT_ENTRY, i, idt[i].a, idt[i].b); 302 hcall(LHCALL_LOAD_IDT_ENTRY, i, idt[i].a, idt[i].b, 0);
304} 303}
305 304
306/* 305/*
@@ -321,7 +320,7 @@ static void lguest_load_gdt(const struct desc_ptr *desc)
321 struct desc_struct *gdt = (void *)desc->address; 320 struct desc_struct *gdt = (void *)desc->address;
322 321
323 for (i = 0; i < (desc->size+1)/8; i++) 322 for (i = 0; i < (desc->size+1)/8; i++)
324 kvm_hypercall3(LHCALL_LOAD_GDT_ENTRY, i, gdt[i].a, gdt[i].b); 323 hcall(LHCALL_LOAD_GDT_ENTRY, i, gdt[i].a, gdt[i].b, 0);
325} 324}
326 325
327/* 326/*
@@ -334,8 +333,8 @@ static void lguest_write_gdt_entry(struct desc_struct *dt, int entrynum,
334{ 333{
335 native_write_gdt_entry(dt, entrynum, desc, type); 334 native_write_gdt_entry(dt, entrynum, desc, type);
336 /* Tell Host about this new entry. */ 335 /* Tell Host about this new entry. */
337 kvm_hypercall3(LHCALL_LOAD_GDT_ENTRY, entrynum, 336 hcall(LHCALL_LOAD_GDT_ENTRY, entrynum,
338 dt[entrynum].a, dt[entrynum].b); 337 dt[entrynum].a, dt[entrynum].b, 0);
339} 338}
340 339
341/* 340/*
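The GDT/IDT paths above all follow the same convention: a descriptor is 8 bytes, the desc_ptr size field is the table size in bytes minus one (hence the (desc->size + 1) / 8 entry count), and each entry is shipped to the Host as its two 32-bit halves, desc_struct.a and desc_struct.b. A minimal, standalone sketch of that split for a raw 64-bit descriptor value:

	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		/* A made-up three-entry table and one classic descriptor:
		 * a flat 4 GiB code segment. */
		uint64_t desc = 0x00cf9a000000ffffULL;
		uint16_t size = 3 * 8 - 1;		/* desc_ptr-style limit: bytes - 1 */

		unsigned int entries = (size + 1) / 8;	/* same math as the loops above */
		unsigned int a = (unsigned int)desc;	/* low 32 bits  -> desc_struct.a */
		unsigned int b = desc >> 32;		/* high 32 bits -> desc_struct.b */

		printf("%u entries; entry 0 as a=%#x b=%#x\n", entries, a, b);
		return 0;
	}
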
@@ -931,7 +930,7 @@ static int lguest_clockevent_set_next_event(unsigned long delta,
931 } 930 }
932 931
933 /* Please wake us this far in the future. */ 932 /* Please wake us this far in the future. */
934 kvm_hypercall1(LHCALL_SET_CLOCKEVENT, delta); 933 hcall(LHCALL_SET_CLOCKEVENT, delta, 0, 0, 0);
935 return 0; 934 return 0;
936} 935}
937 936
@@ -942,7 +941,7 @@ static void lguest_clockevent_set_mode(enum clock_event_mode mode,
942 case CLOCK_EVT_MODE_UNUSED: 941 case CLOCK_EVT_MODE_UNUSED:
943 case CLOCK_EVT_MODE_SHUTDOWN: 942 case CLOCK_EVT_MODE_SHUTDOWN:
944 /* A 0 argument shuts the clock down. */ 943 /* A 0 argument shuts the clock down. */
945 kvm_hypercall0(LHCALL_SET_CLOCKEVENT); 944 hcall(LHCALL_SET_CLOCKEVENT, 0, 0, 0, 0);
946 break; 945 break;
947 case CLOCK_EVT_MODE_ONESHOT: 946 case CLOCK_EVT_MODE_ONESHOT:
948 /* This is what we expect. */ 947 /* This is what we expect. */
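Both clockevent hooks reduce to the same Host call: a non-zero argument asks for a one-shot interrupt that many ticks in the future, and a zero argument switches the clock off, which is what the SHUTDOWN/UNUSED cases above rely on. In sketch form, with delta being whatever the clockevents core hands the guest:

	hcall(LHCALL_SET_CLOCKEVENT, delta, 0, 0, 0);	/* wake me in 'delta' ticks */
	hcall(LHCALL_SET_CLOCKEVENT, 0, 0, 0, 0);	/* 0 = shut the clock down */
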
@@ -1100,7 +1099,7 @@ static void set_lguest_basic_apic_ops(void)
1100/* STOP! Until an interrupt comes in. */ 1099/* STOP! Until an interrupt comes in. */
1101static void lguest_safe_halt(void) 1100static void lguest_safe_halt(void)
1102{ 1101{
1103 kvm_hypercall0(LHCALL_HALT); 1102 hcall(LHCALL_HALT, 0, 0, 0, 0);
1104} 1103}
1105 1104
1106/* 1105/*
@@ -1112,8 +1111,8 @@ static void lguest_safe_halt(void)
1112 */ 1111 */
1113static void lguest_power_off(void) 1112static void lguest_power_off(void)
1114{ 1113{
1115 kvm_hypercall2(LHCALL_SHUTDOWN, __pa("Power down"), 1114 hcall(LHCALL_SHUTDOWN, __pa("Power down"),
1116 LGUEST_SHUTDOWN_POWEROFF); 1115 LGUEST_SHUTDOWN_POWEROFF, 0, 0);
1117} 1116}
1118 1117
1119/* 1118/*
@@ -1123,7 +1122,7 @@ static void lguest_power_off(void)
1123 */ 1122 */
1124static int lguest_panic(struct notifier_block *nb, unsigned long l, void *p) 1123static int lguest_panic(struct notifier_block *nb, unsigned long l, void *p)
1125{ 1124{
1126 kvm_hypercall2(LHCALL_SHUTDOWN, __pa(p), LGUEST_SHUTDOWN_POWEROFF); 1125 hcall(LHCALL_SHUTDOWN, __pa(p), LGUEST_SHUTDOWN_POWEROFF, 0, 0);
1127 /* The hcall won't return, but to keep gcc happy, we're "done". */ 1126 /* The hcall won't return, but to keep gcc happy, we're "done". */
1128 return NOTIFY_DONE; 1127 return NOTIFY_DONE;
1129} 1128}
@@ -1162,7 +1161,7 @@ static __init int early_put_chars(u32 vtermno, const char *buf, int count)
1162 len = sizeof(scratch) - 1; 1161 len = sizeof(scratch) - 1;
1163 scratch[len] = '\0'; 1162 scratch[len] = '\0';
1164 memcpy(scratch, buf, len); 1163 memcpy(scratch, buf, len);
1165 kvm_hypercall1(LHCALL_NOTIFY, __pa(scratch)); 1164 hcall(LHCALL_NOTIFY, __pa(scratch), 0, 0, 0);
1166 1165
1167 /* This routine returns the number of bytes actually written. */ 1166 /* This routine returns the number of bytes actually written. */
1168 return len; 1167 return len;
@@ -1174,7 +1173,7 @@ static __init int early_put_chars(u32 vtermno, const char *buf, int count)
1174 */ 1173 */
1175static void lguest_restart(char *reason) 1174static void lguest_restart(char *reason)
1176{ 1175{
1177 kvm_hypercall2(LHCALL_SHUTDOWN, __pa(reason), LGUEST_SHUTDOWN_RESTART); 1176 hcall(LHCALL_SHUTDOWN, __pa(reason), LGUEST_SHUTDOWN_RESTART, 0, 0);
1178} 1177}
1179 1178
1180/*G:050 1179/*G:050
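One convention runs through the shutdown, panic, restart and early-console paths above: buffers are handed to the Host as guest-physical addresses via __pa(), which is the form the Host expects for these calls (string literals and the scratch buffer live in the kernel's direct mapping, so __pa() is valid on them), together with an LGUEST_SHUTDOWN_* action code where one applies. An illustrative call, with a made-up reason string:

	/* Ask the Host to restart us, pointing it at the physical address
	 * of a NUL-terminated reason string it can report. */
	hcall(LHCALL_SHUTDOWN, __pa("restarting for illustration"),
	      LGUEST_SHUTDOWN_RESTART, 0, 0);
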
diff --git a/arch/x86/lguest/i386_head.S b/arch/x86/lguest/i386_head.S
index 27eac0faee48..4f420c2f2d55 100644
--- a/arch/x86/lguest/i386_head.S
+++ b/arch/x86/lguest/i386_head.S
@@ -32,7 +32,7 @@ ENTRY(lguest_entry)
32 */ 32 */
33 movl $LHCALL_LGUEST_INIT, %eax 33 movl $LHCALL_LGUEST_INIT, %eax
34 movl $lguest_data - __PAGE_OFFSET, %ebx 34 movl $lguest_data - __PAGE_OFFSET, %ebx
35 .byte 0x0f,0x01,0xc1 /* KVM_HYPERCALL */ 35 int $LGUEST_TRAP_ENTRY
36 36
37 /* Set up the initial stack so we can run C code. */ 37 /* Set up the initial stack so we can run C code. */
38 movl $(init_thread_union+THREAD_SIZE),%esp 38 movl $(init_thread_union+THREAD_SIZE),%esp
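For context, the three bytes dropped here, 0x0f 0x01 0xc1, encode VMCALL, the KVM hypercall instruction; the boot stub now traps to the Host with the same int $LGUEST_TRAP_ENTRY software interrupt the rest of lguest uses. The C-level hcall() that boot.c now calls presumably wraps that same trap; a hedged sketch of what such a wrapper can look like (the register bindings are an assumption here, not copied from the header):

	/* Sketch only; a real version would sit beside the LHCALL_*
	 * definitions and need <linux/stringify.h> for __stringify(). */
	static inline unsigned long
	hcall(unsigned long call, unsigned long arg1, unsigned long arg2,
	      unsigned long arg3, unsigned long arg4)
	{
		/* %eax carries the call number in and the result out;
		 * the remaining register choices are illustrative. */
		asm volatile("int $" __stringify(LGUEST_TRAP_ENTRY)
			     : "=a"(call)
			     : "a"(call), "d"(arg1), "b"(arg2), "c"(arg3), "S"(arg4)
			     : "memory");
		return call;
	}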