aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/include/asm/kvm.h17
-rw-r--r--arch/x86/include/asm/kvm_emulate.h46
-rw-r--r--arch/x86/include/asm/kvm_host.h80
-rw-r--r--arch/x86/include/asm/kvm_para.h13
-rw-r--r--arch/x86/include/asm/msr-index.h5
-rw-r--r--arch/x86/include/asm/pvclock-abi.h4
-rw-r--r--arch/x86/include/asm/pvclock.h1
-rw-r--r--arch/x86/include/asm/svm.h9
-rw-r--r--arch/x86/include/asm/vmx.h12
-rw-r--r--arch/x86/kernel/kvmclock.c56
-rw-r--r--arch/x86/kernel/pvclock.c37
-rw-r--r--arch/x86/kernel/tboot.c1
-rw-r--r--arch/x86/kvm/emulate.c1247
-rw-r--r--arch/x86/kvm/i8259.c53
-rw-r--r--arch/x86/kvm/irq.h1
-rw-r--r--arch/x86/kvm/kvm_timer.h4
-rw-r--r--arch/x86/kvm/mmu.c225
-rw-r--r--arch/x86/kvm/mmutrace.h84
-rw-r--r--arch/x86/kvm/paging_tmpl.h46
-rw-r--r--arch/x86/kvm/svm.c944
-rw-r--r--arch/x86/kvm/timer.c3
-rw-r--r--arch/x86/kvm/trace.h165
-rw-r--r--arch/x86/kvm/vmx.c378
-rw-r--r--arch/x86/kvm/x86.c1599
-rw-r--r--arch/x86/kvm/x86.h7
25 files changed, 2987 insertions, 2050 deletions
diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h
index f46b79f6c16c..ff90055c7f0b 100644
--- a/arch/x86/include/asm/kvm.h
+++ b/arch/x86/include/asm/kvm.h
@@ -21,6 +21,7 @@
21#define __KVM_HAVE_PIT_STATE2 21#define __KVM_HAVE_PIT_STATE2
22#define __KVM_HAVE_XEN_HVM 22#define __KVM_HAVE_XEN_HVM
23#define __KVM_HAVE_VCPU_EVENTS 23#define __KVM_HAVE_VCPU_EVENTS
24#define __KVM_HAVE_DEBUGREGS
24 25
25/* Architectural interrupt line count. */ 26/* Architectural interrupt line count. */
26#define KVM_NR_INTERRUPTS 256 27#define KVM_NR_INTERRUPTS 256
@@ -257,6 +258,11 @@ struct kvm_reinject_control {
257/* When set in flags, include corresponding fields on KVM_SET_VCPU_EVENTS */ 258/* When set in flags, include corresponding fields on KVM_SET_VCPU_EVENTS */
258#define KVM_VCPUEVENT_VALID_NMI_PENDING 0x00000001 259#define KVM_VCPUEVENT_VALID_NMI_PENDING 0x00000001
259#define KVM_VCPUEVENT_VALID_SIPI_VECTOR 0x00000002 260#define KVM_VCPUEVENT_VALID_SIPI_VECTOR 0x00000002
261#define KVM_VCPUEVENT_VALID_SHADOW 0x00000004
262
263/* Interrupt shadow states */
264#define KVM_X86_SHADOW_INT_MOV_SS 0x01
265#define KVM_X86_SHADOW_INT_STI 0x02
260 266
261/* for KVM_GET/SET_VCPU_EVENTS */ 267/* for KVM_GET/SET_VCPU_EVENTS */
262struct kvm_vcpu_events { 268struct kvm_vcpu_events {
@@ -271,7 +277,7 @@ struct kvm_vcpu_events {
271 __u8 injected; 277 __u8 injected;
272 __u8 nr; 278 __u8 nr;
273 __u8 soft; 279 __u8 soft;
274 __u8 pad; 280 __u8 shadow;
275 } interrupt; 281 } interrupt;
276 struct { 282 struct {
277 __u8 injected; 283 __u8 injected;
@@ -284,4 +290,13 @@ struct kvm_vcpu_events {
284 __u32 reserved[10]; 290 __u32 reserved[10];
285}; 291};
286 292
293/* for KVM_GET/SET_DEBUGREGS */
294struct kvm_debugregs {
295 __u64 db[4];
296 __u64 dr6;
297 __u64 dr7;
298 __u64 flags;
299 __u64 reserved[9];
300};
301
287#endif /* _ASM_X86_KVM_H */ 302#endif /* _ASM_X86_KVM_H */
diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index 7a6f54fa13ba..0b2729bf2070 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -11,6 +11,8 @@
11#ifndef _ASM_X86_KVM_X86_EMULATE_H 11#ifndef _ASM_X86_KVM_X86_EMULATE_H
12#define _ASM_X86_KVM_X86_EMULATE_H 12#define _ASM_X86_KVM_X86_EMULATE_H
13 13
14#include <asm/desc_defs.h>
15
14struct x86_emulate_ctxt; 16struct x86_emulate_ctxt;
15 17
16/* 18/*
@@ -63,6 +65,15 @@ struct x86_emulate_ops {
63 unsigned int bytes, struct kvm_vcpu *vcpu, u32 *error); 65 unsigned int bytes, struct kvm_vcpu *vcpu, u32 *error);
64 66
65 /* 67 /*
68 * write_std: Write bytes of standard (non-emulated/special) memory.
69 * Used for descriptor writing.
70 * @addr: [IN ] Linear address to which to write.
71 * @val: [OUT] Value write to memory, zero-extended to 'u_long'.
72 * @bytes: [IN ] Number of bytes to write to memory.
73 */
74 int (*write_std)(unsigned long addr, void *val,
75 unsigned int bytes, struct kvm_vcpu *vcpu, u32 *error);
76 /*
66 * fetch: Read bytes of standard (non-emulated/special) memory. 77 * fetch: Read bytes of standard (non-emulated/special) memory.
67 * Used for instruction fetch. 78 * Used for instruction fetch.
68 * @addr: [IN ] Linear address from which to read. 79 * @addr: [IN ] Linear address from which to read.
@@ -109,6 +120,23 @@ struct x86_emulate_ops {
109 unsigned int bytes, 120 unsigned int bytes,
110 struct kvm_vcpu *vcpu); 121 struct kvm_vcpu *vcpu);
111 122
123 int (*pio_in_emulated)(int size, unsigned short port, void *val,
124 unsigned int count, struct kvm_vcpu *vcpu);
125
126 int (*pio_out_emulated)(int size, unsigned short port, const void *val,
127 unsigned int count, struct kvm_vcpu *vcpu);
128
129 bool (*get_cached_descriptor)(struct desc_struct *desc,
130 int seg, struct kvm_vcpu *vcpu);
131 void (*set_cached_descriptor)(struct desc_struct *desc,
132 int seg, struct kvm_vcpu *vcpu);
133 u16 (*get_segment_selector)(int seg, struct kvm_vcpu *vcpu);
134 void (*set_segment_selector)(u16 sel, int seg, struct kvm_vcpu *vcpu);
135 void (*get_gdt)(struct desc_ptr *dt, struct kvm_vcpu *vcpu);
136 ulong (*get_cr)(int cr, struct kvm_vcpu *vcpu);
137 void (*set_cr)(int cr, ulong val, struct kvm_vcpu *vcpu);
138 int (*cpl)(struct kvm_vcpu *vcpu);
139 void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags);
112}; 140};
113 141
114/* Type, address-of, and value of an instruction's operand. */ 142/* Type, address-of, and value of an instruction's operand. */
@@ -124,6 +152,12 @@ struct fetch_cache {
124 unsigned long end; 152 unsigned long end;
125}; 153};
126 154
155struct read_cache {
156 u8 data[1024];
157 unsigned long pos;
158 unsigned long end;
159};
160
127struct decode_cache { 161struct decode_cache {
128 u8 twobyte; 162 u8 twobyte;
129 u8 b; 163 u8 b;
@@ -139,7 +173,7 @@ struct decode_cache {
139 u8 seg_override; 173 u8 seg_override;
140 unsigned int d; 174 unsigned int d;
141 unsigned long regs[NR_VCPU_REGS]; 175 unsigned long regs[NR_VCPU_REGS];
142 unsigned long eip, eip_orig; 176 unsigned long eip;
143 /* modrm */ 177 /* modrm */
144 u8 modrm; 178 u8 modrm;
145 u8 modrm_mod; 179 u8 modrm_mod;
@@ -151,16 +185,15 @@ struct decode_cache {
151 void *modrm_ptr; 185 void *modrm_ptr;
152 unsigned long modrm_val; 186 unsigned long modrm_val;
153 struct fetch_cache fetch; 187 struct fetch_cache fetch;
188 struct read_cache io_read;
154}; 189};
155 190
156#define X86_SHADOW_INT_MOV_SS 1
157#define X86_SHADOW_INT_STI 2
158
159struct x86_emulate_ctxt { 191struct x86_emulate_ctxt {
160 /* Register state before/after emulation. */ 192 /* Register state before/after emulation. */
161 struct kvm_vcpu *vcpu; 193 struct kvm_vcpu *vcpu;
162 194
163 unsigned long eflags; 195 unsigned long eflags;
196 unsigned long eip; /* eip before instruction emulation */
164 /* Emulated execution mode, represented by an X86EMUL_MODE value. */ 197 /* Emulated execution mode, represented by an X86EMUL_MODE value. */
165 int mode; 198 int mode;
166 u32 cs_base; 199 u32 cs_base;
@@ -168,6 +201,7 @@ struct x86_emulate_ctxt {
168 /* interruptibility state, as a result of execution of STI or MOV SS */ 201 /* interruptibility state, as a result of execution of STI or MOV SS */
169 int interruptibility; 202 int interruptibility;
170 203
204 bool restart; /* restart string instruction after writeback */
171 /* decode cache */ 205 /* decode cache */
172 struct decode_cache decode; 206 struct decode_cache decode;
173}; 207};
@@ -194,5 +228,9 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt,
194 struct x86_emulate_ops *ops); 228 struct x86_emulate_ops *ops);
195int x86_emulate_insn(struct x86_emulate_ctxt *ctxt, 229int x86_emulate_insn(struct x86_emulate_ctxt *ctxt,
196 struct x86_emulate_ops *ops); 230 struct x86_emulate_ops *ops);
231int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
232 struct x86_emulate_ops *ops,
233 u16 tss_selector, int reason,
234 bool has_error_code, u32 error_code);
197 235
198#endif /* _ASM_X86_KVM_X86_EMULATE_H */ 236#endif /* _ASM_X86_KVM_X86_EMULATE_H */
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 06d9e79ca37d..76f5483cffec 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -171,15 +171,15 @@ struct kvm_pte_chain {
171union kvm_mmu_page_role { 171union kvm_mmu_page_role {
172 unsigned word; 172 unsigned word;
173 struct { 173 struct {
174 unsigned glevels:4;
175 unsigned level:4; 174 unsigned level:4;
175 unsigned cr4_pae:1;
176 unsigned quadrant:2; 176 unsigned quadrant:2;
177 unsigned pad_for_nice_hex_output:6; 177 unsigned pad_for_nice_hex_output:6;
178 unsigned direct:1; 178 unsigned direct:1;
179 unsigned access:3; 179 unsigned access:3;
180 unsigned invalid:1; 180 unsigned invalid:1;
181 unsigned cr4_pge:1;
182 unsigned nxe:1; 181 unsigned nxe:1;
182 unsigned cr0_wp:1;
183 }; 183 };
184}; 184};
185 185
@@ -187,8 +187,6 @@ struct kvm_mmu_page {
187 struct list_head link; 187 struct list_head link;
188 struct hlist_node hash_link; 188 struct hlist_node hash_link;
189 189
190 struct list_head oos_link;
191
192 /* 190 /*
193 * The following two entries are used to key the shadow page in the 191 * The following two entries are used to key the shadow page in the
194 * hash table. 192 * hash table.
@@ -204,9 +202,9 @@ struct kvm_mmu_page {
204 * in this shadow page. 202 * in this shadow page.
205 */ 203 */
206 DECLARE_BITMAP(slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS); 204 DECLARE_BITMAP(slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS);
207 int multimapped; /* More than one parent_pte? */ 205 bool multimapped; /* More than one parent_pte? */
208 int root_count; /* Currently serving as active root */
209 bool unsync; 206 bool unsync;
207 int root_count; /* Currently serving as active root */
210 unsigned int unsync_children; 208 unsigned int unsync_children;
211 union { 209 union {
212 u64 *parent_pte; /* !multimapped */ 210 u64 *parent_pte; /* !multimapped */
@@ -224,14 +222,9 @@ struct kvm_pv_mmu_op_buffer {
224 222
225struct kvm_pio_request { 223struct kvm_pio_request {
226 unsigned long count; 224 unsigned long count;
227 int cur_count;
228 gva_t guest_gva;
229 int in; 225 int in;
230 int port; 226 int port;
231 int size; 227 int size;
232 int string;
233 int down;
234 int rep;
235}; 228};
236 229
237/* 230/*
@@ -320,6 +313,7 @@ struct kvm_vcpu_arch {
320 struct kvm_queued_exception { 313 struct kvm_queued_exception {
321 bool pending; 314 bool pending;
322 bool has_error_code; 315 bool has_error_code;
316 bool reinject;
323 u8 nr; 317 u8 nr;
324 u32 error_code; 318 u32 error_code;
325 } exception; 319 } exception;
@@ -362,8 +356,8 @@ struct kvm_vcpu_arch {
362 u64 *mce_banks; 356 u64 *mce_banks;
363 357
364 /* used for guest single stepping over the given code position */ 358 /* used for guest single stepping over the given code position */
365 u16 singlestep_cs;
366 unsigned long singlestep_rip; 359 unsigned long singlestep_rip;
360
367 /* fields used by HYPER-V emulation */ 361 /* fields used by HYPER-V emulation */
368 u64 hv_vapic; 362 u64 hv_vapic;
369}; 363};
@@ -389,6 +383,7 @@ struct kvm_arch {
389 unsigned int n_free_mmu_pages; 383 unsigned int n_free_mmu_pages;
390 unsigned int n_requested_mmu_pages; 384 unsigned int n_requested_mmu_pages;
391 unsigned int n_alloc_mmu_pages; 385 unsigned int n_alloc_mmu_pages;
386 atomic_t invlpg_counter;
392 struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES]; 387 struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
393 /* 388 /*
394 * Hash table of struct kvm_mmu_page. 389 * Hash table of struct kvm_mmu_page.
@@ -461,11 +456,6 @@ struct kvm_vcpu_stat {
461 u32 nmi_injections; 456 u32 nmi_injections;
462}; 457};
463 458
464struct descriptor_table {
465 u16 limit;
466 unsigned long base;
467} __attribute__((packed));
468
469struct kvm_x86_ops { 459struct kvm_x86_ops {
470 int (*cpu_has_kvm_support)(void); /* __init */ 460 int (*cpu_has_kvm_support)(void); /* __init */
471 int (*disabled_by_bios)(void); /* __init */ 461 int (*disabled_by_bios)(void); /* __init */
@@ -503,12 +493,11 @@ struct kvm_x86_ops {
503 void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3); 493 void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3);
504 void (*set_cr4)(struct kvm_vcpu *vcpu, unsigned long cr4); 494 void (*set_cr4)(struct kvm_vcpu *vcpu, unsigned long cr4);
505 void (*set_efer)(struct kvm_vcpu *vcpu, u64 efer); 495 void (*set_efer)(struct kvm_vcpu *vcpu, u64 efer);
506 void (*get_idt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt); 496 void (*get_idt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
507 void (*set_idt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt); 497 void (*set_idt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
508 void (*get_gdt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt); 498 void (*get_gdt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
509 void (*set_gdt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt); 499 void (*set_gdt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
510 int (*get_dr)(struct kvm_vcpu *vcpu, int dr, unsigned long *dest); 500 void (*set_dr7)(struct kvm_vcpu *vcpu, unsigned long value);
511 int (*set_dr)(struct kvm_vcpu *vcpu, int dr, unsigned long value);
512 void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg); 501 void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg);
513 unsigned long (*get_rflags)(struct kvm_vcpu *vcpu); 502 unsigned long (*get_rflags)(struct kvm_vcpu *vcpu);
514 void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags); 503 void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags);
@@ -527,7 +516,8 @@ struct kvm_x86_ops {
527 void (*set_irq)(struct kvm_vcpu *vcpu); 516 void (*set_irq)(struct kvm_vcpu *vcpu);
528 void (*set_nmi)(struct kvm_vcpu *vcpu); 517 void (*set_nmi)(struct kvm_vcpu *vcpu);
529 void (*queue_exception)(struct kvm_vcpu *vcpu, unsigned nr, 518 void (*queue_exception)(struct kvm_vcpu *vcpu, unsigned nr,
530 bool has_error_code, u32 error_code); 519 bool has_error_code, u32 error_code,
520 bool reinject);
531 int (*interrupt_allowed)(struct kvm_vcpu *vcpu); 521 int (*interrupt_allowed)(struct kvm_vcpu *vcpu);
532 int (*nmi_allowed)(struct kvm_vcpu *vcpu); 522 int (*nmi_allowed)(struct kvm_vcpu *vcpu);
533 bool (*get_nmi_mask)(struct kvm_vcpu *vcpu); 523 bool (*get_nmi_mask)(struct kvm_vcpu *vcpu);
@@ -541,6 +531,8 @@ struct kvm_x86_ops {
541 int (*get_lpage_level)(void); 531 int (*get_lpage_level)(void);
542 bool (*rdtscp_supported)(void); 532 bool (*rdtscp_supported)(void);
543 533
534 void (*set_supported_cpuid)(u32 func, struct kvm_cpuid_entry2 *entry);
535
544 const struct trace_print_flags *exit_reasons_str; 536 const struct trace_print_flags *exit_reasons_str;
545}; 537};
546 538
@@ -587,23 +579,14 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
587void kvm_report_emulation_failure(struct kvm_vcpu *cvpu, const char *context); 579void kvm_report_emulation_failure(struct kvm_vcpu *cvpu, const char *context);
588void realmode_lgdt(struct kvm_vcpu *vcpu, u16 size, unsigned long address); 580void realmode_lgdt(struct kvm_vcpu *vcpu, u16 size, unsigned long address);
589void realmode_lidt(struct kvm_vcpu *vcpu, u16 size, unsigned long address); 581void realmode_lidt(struct kvm_vcpu *vcpu, u16 size, unsigned long address);
590void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
591 unsigned long *rflags);
592 582
593unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr);
594void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long value,
595 unsigned long *rflags);
596void kvm_enable_efer_bits(u64); 583void kvm_enable_efer_bits(u64);
597int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *data); 584int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *data);
598int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data); 585int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
599 586
600struct x86_emulate_ctxt; 587struct x86_emulate_ctxt;
601 588
602int kvm_emulate_pio(struct kvm_vcpu *vcpu, int in, 589int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port);
603 int size, unsigned port);
604int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, int in,
605 int size, unsigned long count, int down,
606 gva_t address, int rep, unsigned port);
607void kvm_emulate_cpuid(struct kvm_vcpu *vcpu); 590void kvm_emulate_cpuid(struct kvm_vcpu *vcpu);
608int kvm_emulate_halt(struct kvm_vcpu *vcpu); 591int kvm_emulate_halt(struct kvm_vcpu *vcpu);
609int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address); 592int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address);
@@ -616,12 +599,15 @@ int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr,
616void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); 599void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
617int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg); 600int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg);
618 601
619int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason); 602int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason,
603 bool has_error_code, u32 error_code);
620 604
621void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); 605void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
622void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3); 606void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3);
623void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); 607void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
624void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8); 608void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8);
609int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val);
610int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val);
625unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu); 611unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu);
626void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw); 612void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw);
627void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l); 613void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l);
@@ -634,6 +620,8 @@ void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
634 620
635void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr); 621void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr);
636void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code); 622void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code);
623void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr);
624void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code);
637void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long cr2, 625void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long cr2,
638 u32 error_code); 626 u32 error_code);
639bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl); 627bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl);
@@ -649,8 +637,6 @@ int emulator_write_emulated(unsigned long addr,
649 unsigned int bytes, 637 unsigned int bytes,
650 struct kvm_vcpu *vcpu); 638 struct kvm_vcpu *vcpu);
651 639
652unsigned long segment_base(u16 selector);
653
654void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu); 640void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu);
655void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, 641void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
656 const u8 *new, int bytes, 642 const u8 *new, int bytes,
@@ -675,7 +661,6 @@ void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva);
675void kvm_enable_tdp(void); 661void kvm_enable_tdp(void);
676void kvm_disable_tdp(void); 662void kvm_disable_tdp(void);
677 663
678int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3);
679int complete_pio(struct kvm_vcpu *vcpu); 664int complete_pio(struct kvm_vcpu *vcpu);
680bool kvm_check_iopl(struct kvm_vcpu *vcpu); 665bool kvm_check_iopl(struct kvm_vcpu *vcpu);
681 666
@@ -724,23 +709,6 @@ static inline void kvm_load_ldt(u16 sel)
724 asm("lldt %0" : : "rm"(sel)); 709 asm("lldt %0" : : "rm"(sel));
725} 710}
726 711
727static inline void kvm_get_idt(struct descriptor_table *table)
728{
729 asm("sidt %0" : "=m"(*table));
730}
731
732static inline void kvm_get_gdt(struct descriptor_table *table)
733{
734 asm("sgdt %0" : "=m"(*table));
735}
736
737static inline unsigned long kvm_read_tr_base(void)
738{
739 u16 tr;
740 asm("str %0" : "=g"(tr));
741 return segment_base(tr);
742}
743
744#ifdef CONFIG_X86_64 712#ifdef CONFIG_X86_64
745static inline unsigned long read_msr(unsigned long msr) 713static inline unsigned long read_msr(unsigned long msr)
746{ 714{
@@ -826,4 +794,6 @@ int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
826void kvm_define_shared_msr(unsigned index, u32 msr); 794void kvm_define_shared_msr(unsigned index, u32 msr);
827void kvm_set_shared_msr(unsigned index, u64 val, u64 mask); 795void kvm_set_shared_msr(unsigned index, u64 val, u64 mask);
828 796
797bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip);
798
829#endif /* _ASM_X86_KVM_HOST_H */ 799#endif /* _ASM_X86_KVM_HOST_H */
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index ffae1420e7d7..05eba5e9a8e8 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -16,10 +16,23 @@
16#define KVM_FEATURE_CLOCKSOURCE 0 16#define KVM_FEATURE_CLOCKSOURCE 0
17#define KVM_FEATURE_NOP_IO_DELAY 1 17#define KVM_FEATURE_NOP_IO_DELAY 1
18#define KVM_FEATURE_MMU_OP 2 18#define KVM_FEATURE_MMU_OP 2
19/* This indicates that the new set of kvmclock msrs
20 * are available. The use of 0x11 and 0x12 is deprecated
21 */
22#define KVM_FEATURE_CLOCKSOURCE2 3
23
24/* The last 8 bits are used to indicate how to interpret the flags field
25 * in pvclock structure. If no bits are set, all flags are ignored.
26 */
27#define KVM_FEATURE_CLOCKSOURCE_STABLE_BIT 24
19 28
20#define MSR_KVM_WALL_CLOCK 0x11 29#define MSR_KVM_WALL_CLOCK 0x11
21#define MSR_KVM_SYSTEM_TIME 0x12 30#define MSR_KVM_SYSTEM_TIME 0x12
22 31
32/* Custom MSRs falls in the range 0x4b564d00-0x4b564dff */
33#define MSR_KVM_WALL_CLOCK_NEW 0x4b564d00
34#define MSR_KVM_SYSTEM_TIME_NEW 0x4b564d01
35
23#define KVM_MAX_MMU_OP_BATCH 32 36#define KVM_MAX_MMU_OP_BATCH 32
24 37
25/* Operations for KVM_HC_MMU_OP */ 38/* Operations for KVM_HC_MMU_OP */
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index bc473acfa7f9..f9324851eba0 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -202,8 +202,9 @@
202#define MSR_IA32_EBL_CR_POWERON 0x0000002a 202#define MSR_IA32_EBL_CR_POWERON 0x0000002a
203#define MSR_IA32_FEATURE_CONTROL 0x0000003a 203#define MSR_IA32_FEATURE_CONTROL 0x0000003a
204 204
205#define FEATURE_CONTROL_LOCKED (1<<0) 205#define FEATURE_CONTROL_LOCKED (1<<0)
206#define FEATURE_CONTROL_VMXON_ENABLED (1<<2) 206#define FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX (1<<1)
207#define FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX (1<<2)
207 208
208#define MSR_IA32_APICBASE 0x0000001b 209#define MSR_IA32_APICBASE 0x0000001b
209#define MSR_IA32_APICBASE_BSP (1<<8) 210#define MSR_IA32_APICBASE_BSP (1<<8)
diff --git a/arch/x86/include/asm/pvclock-abi.h b/arch/x86/include/asm/pvclock-abi.h
index 6d93508f2626..35f2d1948ada 100644
--- a/arch/x86/include/asm/pvclock-abi.h
+++ b/arch/x86/include/asm/pvclock-abi.h
@@ -29,7 +29,8 @@ struct pvclock_vcpu_time_info {
29 u64 system_time; 29 u64 system_time;
30 u32 tsc_to_system_mul; 30 u32 tsc_to_system_mul;
31 s8 tsc_shift; 31 s8 tsc_shift;
32 u8 pad[3]; 32 u8 flags;
33 u8 pad[2];
33} __attribute__((__packed__)); /* 32 bytes */ 34} __attribute__((__packed__)); /* 32 bytes */
34 35
35struct pvclock_wall_clock { 36struct pvclock_wall_clock {
@@ -38,5 +39,6 @@ struct pvclock_wall_clock {
38 u32 nsec; 39 u32 nsec;
39} __attribute__((__packed__)); 40} __attribute__((__packed__));
40 41
42#define PVCLOCK_TSC_STABLE_BIT (1 << 0)
41#endif /* __ASSEMBLY__ */ 43#endif /* __ASSEMBLY__ */
42#endif /* _ASM_X86_PVCLOCK_ABI_H */ 44#endif /* _ASM_X86_PVCLOCK_ABI_H */
diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h
index 53235fd5f8ce..cd02f324aa6b 100644
--- a/arch/x86/include/asm/pvclock.h
+++ b/arch/x86/include/asm/pvclock.h
@@ -6,6 +6,7 @@
6 6
7/* some helper functions for xen and kvm pv clock sources */ 7/* some helper functions for xen and kvm pv clock sources */
8cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src); 8cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src);
9void pvclock_set_flags(u8 flags);
9unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src); 10unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src);
10void pvclock_read_wallclock(struct pvclock_wall_clock *wall, 11void pvclock_read_wallclock(struct pvclock_wall_clock *wall,
11 struct pvclock_vcpu_time_info *vcpu, 12 struct pvclock_vcpu_time_info *vcpu,
diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index 38638cd2fa4c..0e831059ac5a 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -81,7 +81,9 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
81 u32 event_inj_err; 81 u32 event_inj_err;
82 u64 nested_cr3; 82 u64 nested_cr3;
83 u64 lbr_ctl; 83 u64 lbr_ctl;
84 u8 reserved_5[832]; 84 u64 reserved_5;
85 u64 next_rip;
86 u8 reserved_6[816];
85}; 87};
86 88
87 89
@@ -115,6 +117,10 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
115#define SVM_IOIO_SIZE_MASK (7 << SVM_IOIO_SIZE_SHIFT) 117#define SVM_IOIO_SIZE_MASK (7 << SVM_IOIO_SIZE_SHIFT)
116#define SVM_IOIO_ASIZE_MASK (7 << SVM_IOIO_ASIZE_SHIFT) 118#define SVM_IOIO_ASIZE_MASK (7 << SVM_IOIO_ASIZE_SHIFT)
117 119
120#define SVM_VM_CR_VALID_MASK 0x001fULL
121#define SVM_VM_CR_SVM_LOCK_MASK 0x0008ULL
122#define SVM_VM_CR_SVM_DIS_MASK 0x0010ULL
123
118struct __attribute__ ((__packed__)) vmcb_seg { 124struct __attribute__ ((__packed__)) vmcb_seg {
119 u16 selector; 125 u16 selector;
120 u16 attrib; 126 u16 attrib;
@@ -238,6 +244,7 @@ struct __attribute__ ((__packed__)) vmcb {
238 244
239#define SVM_EXITINFOSHIFT_TS_REASON_IRET 36 245#define SVM_EXITINFOSHIFT_TS_REASON_IRET 36
240#define SVM_EXITINFOSHIFT_TS_REASON_JMP 38 246#define SVM_EXITINFOSHIFT_TS_REASON_JMP 38
247#define SVM_EXITINFOSHIFT_TS_HAS_ERROR_CODE 44
241 248
242#define SVM_EXIT_READ_CR0 0x000 249#define SVM_EXIT_READ_CR0 0x000
243#define SVM_EXIT_READ_CR3 0x003 250#define SVM_EXIT_READ_CR3 0x003
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index fb9a080740ec..9e6779f7cf2d 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -25,6 +25,8 @@
25 * 25 *
26 */ 26 */
27 27
28#include <linux/types.h>
29
28/* 30/*
29 * Definitions of Primary Processor-Based VM-Execution Controls. 31 * Definitions of Primary Processor-Based VM-Execution Controls.
30 */ 32 */
@@ -120,6 +122,8 @@ enum vmcs_field {
120 GUEST_IA32_DEBUGCTL_HIGH = 0x00002803, 122 GUEST_IA32_DEBUGCTL_HIGH = 0x00002803,
121 GUEST_IA32_PAT = 0x00002804, 123 GUEST_IA32_PAT = 0x00002804,
122 GUEST_IA32_PAT_HIGH = 0x00002805, 124 GUEST_IA32_PAT_HIGH = 0x00002805,
125 GUEST_IA32_EFER = 0x00002806,
126 GUEST_IA32_EFER_HIGH = 0x00002807,
123 GUEST_PDPTR0 = 0x0000280a, 127 GUEST_PDPTR0 = 0x0000280a,
124 GUEST_PDPTR0_HIGH = 0x0000280b, 128 GUEST_PDPTR0_HIGH = 0x0000280b,
125 GUEST_PDPTR1 = 0x0000280c, 129 GUEST_PDPTR1 = 0x0000280c,
@@ -130,6 +134,8 @@ enum vmcs_field {
130 GUEST_PDPTR3_HIGH = 0x00002811, 134 GUEST_PDPTR3_HIGH = 0x00002811,
131 HOST_IA32_PAT = 0x00002c00, 135 HOST_IA32_PAT = 0x00002c00,
132 HOST_IA32_PAT_HIGH = 0x00002c01, 136 HOST_IA32_PAT_HIGH = 0x00002c01,
137 HOST_IA32_EFER = 0x00002c02,
138 HOST_IA32_EFER_HIGH = 0x00002c03,
133 PIN_BASED_VM_EXEC_CONTROL = 0x00004000, 139 PIN_BASED_VM_EXEC_CONTROL = 0x00004000,
134 CPU_BASED_VM_EXEC_CONTROL = 0x00004002, 140 CPU_BASED_VM_EXEC_CONTROL = 0x00004002,
135 EXCEPTION_BITMAP = 0x00004004, 141 EXCEPTION_BITMAP = 0x00004004,
@@ -394,6 +400,10 @@ enum vmcs_field {
394#define ASM_VMX_INVEPT ".byte 0x66, 0x0f, 0x38, 0x80, 0x08" 400#define ASM_VMX_INVEPT ".byte 0x66, 0x0f, 0x38, 0x80, 0x08"
395#define ASM_VMX_INVVPID ".byte 0x66, 0x0f, 0x38, 0x81, 0x08" 401#define ASM_VMX_INVVPID ".byte 0x66, 0x0f, 0x38, 0x81, 0x08"
396 402
397 403struct vmx_msr_entry {
404 u32 index;
405 u32 reserved;
406 u64 value;
407} __aligned(16);
398 408
399#endif 409#endif
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index feaeb0d3aa4f..eb9b76c716c2 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -29,6 +29,8 @@
29#define KVM_SCALE 22 29#define KVM_SCALE 22
30 30
31static int kvmclock = 1; 31static int kvmclock = 1;
32static int msr_kvm_system_time = MSR_KVM_SYSTEM_TIME;
33static int msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK;
32 34
33static int parse_no_kvmclock(char *arg) 35static int parse_no_kvmclock(char *arg)
34{ 36{
@@ -54,7 +56,8 @@ static unsigned long kvm_get_wallclock(void)
54 56
55 low = (int)__pa_symbol(&wall_clock); 57 low = (int)__pa_symbol(&wall_clock);
56 high = ((u64)__pa_symbol(&wall_clock) >> 32); 58 high = ((u64)__pa_symbol(&wall_clock) >> 32);
57 native_write_msr(MSR_KVM_WALL_CLOCK, low, high); 59
60 native_write_msr(msr_kvm_wall_clock, low, high);
58 61
59 vcpu_time = &get_cpu_var(hv_clock); 62 vcpu_time = &get_cpu_var(hv_clock);
60 pvclock_read_wallclock(&wall_clock, vcpu_time, &ts); 63 pvclock_read_wallclock(&wall_clock, vcpu_time, &ts);
@@ -130,7 +133,8 @@ static int kvm_register_clock(char *txt)
130 high = ((u64)__pa(&per_cpu(hv_clock, cpu)) >> 32); 133 high = ((u64)__pa(&per_cpu(hv_clock, cpu)) >> 32);
131 printk(KERN_INFO "kvm-clock: cpu %d, msr %x:%x, %s\n", 134 printk(KERN_INFO "kvm-clock: cpu %d, msr %x:%x, %s\n",
132 cpu, high, low, txt); 135 cpu, high, low, txt);
133 return native_write_msr_safe(MSR_KVM_SYSTEM_TIME, low, high); 136
137 return native_write_msr_safe(msr_kvm_system_time, low, high);
134} 138}
135 139
136#ifdef CONFIG_X86_LOCAL_APIC 140#ifdef CONFIG_X86_LOCAL_APIC
@@ -165,14 +169,14 @@ static void __init kvm_smp_prepare_boot_cpu(void)
165#ifdef CONFIG_KEXEC 169#ifdef CONFIG_KEXEC
166static void kvm_crash_shutdown(struct pt_regs *regs) 170static void kvm_crash_shutdown(struct pt_regs *regs)
167{ 171{
168 native_write_msr_safe(MSR_KVM_SYSTEM_TIME, 0, 0); 172 native_write_msr(msr_kvm_system_time, 0, 0);
169 native_machine_crash_shutdown(regs); 173 native_machine_crash_shutdown(regs);
170} 174}
171#endif 175#endif
172 176
173static void kvm_shutdown(void) 177static void kvm_shutdown(void)
174{ 178{
175 native_write_msr_safe(MSR_KVM_SYSTEM_TIME, 0, 0); 179 native_write_msr(msr_kvm_system_time, 0, 0);
176 native_machine_shutdown(); 180 native_machine_shutdown();
177} 181}
178 182
@@ -181,27 +185,37 @@ void __init kvmclock_init(void)
181 if (!kvm_para_available()) 185 if (!kvm_para_available())
182 return; 186 return;
183 187
184 if (kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)) { 188 if (kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE2)) {
185 if (kvm_register_clock("boot clock")) 189 msr_kvm_system_time = MSR_KVM_SYSTEM_TIME_NEW;
186 return; 190 msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK_NEW;
187 pv_time_ops.sched_clock = kvm_clock_read; 191 } else if (!(kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)))
188 x86_platform.calibrate_tsc = kvm_get_tsc_khz; 192 return;
189 x86_platform.get_wallclock = kvm_get_wallclock; 193
190 x86_platform.set_wallclock = kvm_set_wallclock; 194 printk(KERN_INFO "kvm-clock: Using msrs %x and %x",
195 msr_kvm_system_time, msr_kvm_wall_clock);
196
197 if (kvm_register_clock("boot clock"))
198 return;
199 pv_time_ops.sched_clock = kvm_clock_read;
200 x86_platform.calibrate_tsc = kvm_get_tsc_khz;
201 x86_platform.get_wallclock = kvm_get_wallclock;
202 x86_platform.set_wallclock = kvm_set_wallclock;
191#ifdef CONFIG_X86_LOCAL_APIC 203#ifdef CONFIG_X86_LOCAL_APIC
192 x86_cpuinit.setup_percpu_clockev = 204 x86_cpuinit.setup_percpu_clockev =
193 kvm_setup_secondary_clock; 205 kvm_setup_secondary_clock;
194#endif 206#endif
195#ifdef CONFIG_SMP 207#ifdef CONFIG_SMP
196 smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu; 208 smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
197#endif 209#endif
198 machine_ops.shutdown = kvm_shutdown; 210 machine_ops.shutdown = kvm_shutdown;
199#ifdef CONFIG_KEXEC 211#ifdef CONFIG_KEXEC
200 machine_ops.crash_shutdown = kvm_crash_shutdown; 212 machine_ops.crash_shutdown = kvm_crash_shutdown;
201#endif 213#endif
202 kvm_get_preset_lpj(); 214 kvm_get_preset_lpj();
203 clocksource_register(&kvm_clock); 215 clocksource_register(&kvm_clock);
204 pv_info.paravirt_enabled = 1; 216 pv_info.paravirt_enabled = 1;
205 pv_info.name = "KVM"; 217 pv_info.name = "KVM";
206 } 218
219 if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT))
220 pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT);
207} 221}
diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
index 03801f2f761f..239427ca02af 100644
--- a/arch/x86/kernel/pvclock.c
+++ b/arch/x86/kernel/pvclock.c
@@ -31,8 +31,16 @@ struct pvclock_shadow_time {
31 u32 tsc_to_nsec_mul; 31 u32 tsc_to_nsec_mul;
32 int tsc_shift; 32 int tsc_shift;
33 u32 version; 33 u32 version;
34 u8 flags;
34}; 35};
35 36
37static u8 valid_flags __read_mostly = 0;
38
39void pvclock_set_flags(u8 flags)
40{
41 valid_flags = flags;
42}
43
36/* 44/*
37 * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction, 45 * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction,
38 * yielding a 64-bit result. 46 * yielding a 64-bit result.
@@ -91,6 +99,7 @@ static unsigned pvclock_get_time_values(struct pvclock_shadow_time *dst,
91 dst->system_timestamp = src->system_time; 99 dst->system_timestamp = src->system_time;
92 dst->tsc_to_nsec_mul = src->tsc_to_system_mul; 100 dst->tsc_to_nsec_mul = src->tsc_to_system_mul;
93 dst->tsc_shift = src->tsc_shift; 101 dst->tsc_shift = src->tsc_shift;
102 dst->flags = src->flags;
94 rmb(); /* test version after fetching data */ 103 rmb(); /* test version after fetching data */
95 } while ((src->version & 1) || (dst->version != src->version)); 104 } while ((src->version & 1) || (dst->version != src->version));
96 105
@@ -109,11 +118,14 @@ unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src)
109 return pv_tsc_khz; 118 return pv_tsc_khz;
110} 119}
111 120
121static atomic64_t last_value = ATOMIC64_INIT(0);
122
112cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src) 123cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
113{ 124{
114 struct pvclock_shadow_time shadow; 125 struct pvclock_shadow_time shadow;
115 unsigned version; 126 unsigned version;
116 cycle_t ret, offset; 127 cycle_t ret, offset;
128 u64 last;
117 129
118 do { 130 do {
119 version = pvclock_get_time_values(&shadow, src); 131 version = pvclock_get_time_values(&shadow, src);
@@ -123,6 +135,31 @@ cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
123 barrier(); 135 barrier();
124 } while (version != src->version); 136 } while (version != src->version);
125 137
138 if ((valid_flags & PVCLOCK_TSC_STABLE_BIT) &&
139 (shadow.flags & PVCLOCK_TSC_STABLE_BIT))
140 return ret;
141
142 /*
143 * Assumption here is that last_value, a global accumulator, always goes
144 * forward. If we are less than that, we should not be much smaller.
145 * We assume there is an error marging we're inside, and then the correction
146 * does not sacrifice accuracy.
147 *
148 * For reads: global may have changed between test and return,
149 * but this means someone else updated poked the clock at a later time.
150 * We just need to make sure we are not seeing a backwards event.
151 *
152 * For updates: last_value = ret is not enough, since two vcpus could be
153 * updating at the same time, and one of them could be slightly behind,
154 * making the assumption that last_value always go forward fail to hold.
155 */
156 last = atomic64_read(&last_value);
157 do {
158 if (ret < last)
159 return last;
160 last = atomic64_cmpxchg(&last_value, last, ret);
161 } while (unlikely(last != ret));
162
126 return ret; 163 return ret;
127} 164}
128 165
diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c
index cc2c60474fd0..c2f1b26141e2 100644
--- a/arch/x86/kernel/tboot.c
+++ b/arch/x86/kernel/tboot.c
@@ -46,6 +46,7 @@
46 46
47/* Global pointer to shared data; NULL means no measured launch. */ 47/* Global pointer to shared data; NULL means no measured launch. */
48struct tboot *tboot __read_mostly; 48struct tboot *tboot __read_mostly;
49EXPORT_SYMBOL(tboot);
49 50
50/* timeout for APs (in secs) to enter wait-for-SIPI state during shutdown */ 51/* timeout for APs (in secs) to enter wait-for-SIPI state during shutdown */
51#define AP_WAIT_TIMEOUT 1 52#define AP_WAIT_TIMEOUT 1
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 4dade6ac0827..5ac0bb465ed6 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -33,6 +33,7 @@
33#include <asm/kvm_emulate.h> 33#include <asm/kvm_emulate.h>
34 34
35#include "x86.h" 35#include "x86.h"
36#include "tss.h"
36 37
37/* 38/*
38 * Opcode effective-address decode tables. 39 * Opcode effective-address decode tables.
@@ -50,6 +51,8 @@
50#define DstReg (2<<1) /* Register operand. */ 51#define DstReg (2<<1) /* Register operand. */
51#define DstMem (3<<1) /* Memory operand. */ 52#define DstMem (3<<1) /* Memory operand. */
52#define DstAcc (4<<1) /* Destination Accumulator */ 53#define DstAcc (4<<1) /* Destination Accumulator */
54#define DstDI (5<<1) /* Destination is in ES:(E)DI */
55#define DstMem64 (6<<1) /* 64bit memory operand */
53#define DstMask (7<<1) 56#define DstMask (7<<1)
54/* Source operand type. */ 57/* Source operand type. */
55#define SrcNone (0<<4) /* No source operand. */ 58#define SrcNone (0<<4) /* No source operand. */
@@ -63,6 +66,7 @@
63#define SrcOne (7<<4) /* Implied '1' */ 66#define SrcOne (7<<4) /* Implied '1' */
64#define SrcImmUByte (8<<4) /* 8-bit unsigned immediate operand. */ 67#define SrcImmUByte (8<<4) /* 8-bit unsigned immediate operand. */
65#define SrcImmU (9<<4) /* Immediate operand, unsigned */ 68#define SrcImmU (9<<4) /* Immediate operand, unsigned */
69#define SrcSI (0xa<<4) /* Source is in the DS:RSI */
66#define SrcMask (0xf<<4) 70#define SrcMask (0xf<<4)
67/* Generic ModRM decode. */ 71/* Generic ModRM decode. */
68#define ModRM (1<<8) 72#define ModRM (1<<8)
@@ -85,6 +89,9 @@
85#define Src2ImmByte (2<<29) 89#define Src2ImmByte (2<<29)
86#define Src2One (3<<29) 90#define Src2One (3<<29)
87#define Src2Imm16 (4<<29) 91#define Src2Imm16 (4<<29)
92#define Src2Mem16 (5<<29) /* Used for Ep encoding. First argument has to be
93 in memory and second argument is located
94 immediately after the first one in memory. */
88#define Src2Mask (7<<29) 95#define Src2Mask (7<<29)
89 96
90enum { 97enum {
@@ -147,8 +154,8 @@ static u32 opcode_table[256] = {
147 0, 0, 0, 0, 154 0, 0, 0, 0,
148 /* 0x68 - 0x6F */ 155 /* 0x68 - 0x6F */
149 SrcImm | Mov | Stack, 0, SrcImmByte | Mov | Stack, 0, 156 SrcImm | Mov | Stack, 0, SrcImmByte | Mov | Stack, 0,
150 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* insb, insw/insd */ 157 DstDI | ByteOp | Mov | String, DstDI | Mov | String, /* insb, insw/insd */
151 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* outsb, outsw/outsd */ 158 SrcSI | ByteOp | ImplicitOps | String, SrcSI | ImplicitOps | String, /* outsb, outsw/outsd */
152 /* 0x70 - 0x77 */ 159 /* 0x70 - 0x77 */
153 SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte, 160 SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte,
154 SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte, 161 SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte,
@@ -173,12 +180,12 @@ static u32 opcode_table[256] = {
173 /* 0xA0 - 0xA7 */ 180 /* 0xA0 - 0xA7 */
174 ByteOp | DstReg | SrcMem | Mov | MemAbs, DstReg | SrcMem | Mov | MemAbs, 181 ByteOp | DstReg | SrcMem | Mov | MemAbs, DstReg | SrcMem | Mov | MemAbs,
175 ByteOp | DstMem | SrcReg | Mov | MemAbs, DstMem | SrcReg | Mov | MemAbs, 182 ByteOp | DstMem | SrcReg | Mov | MemAbs, DstMem | SrcReg | Mov | MemAbs,
176 ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String, 183 ByteOp | SrcSI | DstDI | Mov | String, SrcSI | DstDI | Mov | String,
177 ByteOp | ImplicitOps | String, ImplicitOps | String, 184 ByteOp | SrcSI | DstDI | String, SrcSI | DstDI | String,
178 /* 0xA8 - 0xAF */ 185 /* 0xA8 - 0xAF */
179 0, 0, ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String, 186 0, 0, ByteOp | DstDI | Mov | String, DstDI | Mov | String,
180 ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String, 187 ByteOp | SrcSI | DstAcc | Mov | String, SrcSI | DstAcc | Mov | String,
181 ByteOp | ImplicitOps | String, ImplicitOps | String, 188 ByteOp | DstDI | String, DstDI | String,
182 /* 0xB0 - 0xB7 */ 189 /* 0xB0 - 0xB7 */
183 ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov, 190 ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov,
184 ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov, 191 ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov,
@@ -204,13 +211,13 @@ static u32 opcode_table[256] = {
204 0, 0, 0, 0, 0, 0, 0, 0, 211 0, 0, 0, 0, 0, 0, 0, 0,
205 /* 0xE0 - 0xE7 */ 212 /* 0xE0 - 0xE7 */
206 0, 0, 0, 0, 213 0, 0, 0, 0,
207 ByteOp | SrcImmUByte, SrcImmUByte, 214 ByteOp | SrcImmUByte | DstAcc, SrcImmUByte | DstAcc,
208 ByteOp | SrcImmUByte, SrcImmUByte, 215 ByteOp | SrcImmUByte | DstAcc, SrcImmUByte | DstAcc,
209 /* 0xE8 - 0xEF */ 216 /* 0xE8 - 0xEF */
210 SrcImm | Stack, SrcImm | ImplicitOps, 217 SrcImm | Stack, SrcImm | ImplicitOps,
211 SrcImmU | Src2Imm16 | No64, SrcImmByte | ImplicitOps, 218 SrcImmU | Src2Imm16 | No64, SrcImmByte | ImplicitOps,
212 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, 219 SrcNone | ByteOp | DstAcc, SrcNone | DstAcc,
213 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, 220 SrcNone | ByteOp | DstAcc, SrcNone | DstAcc,
214 /* 0xF0 - 0xF7 */ 221 /* 0xF0 - 0xF7 */
215 0, 0, 0, 0, 222 0, 0, 0, 0,
216 ImplicitOps | Priv, ImplicitOps, Group | Group3_Byte, Group | Group3, 223 ImplicitOps | Priv, ImplicitOps, Group | Group3_Byte, Group | Group3,
@@ -343,7 +350,8 @@ static u32 group_table[] = {
343 [Group5*8] = 350 [Group5*8] =
344 DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM, 351 DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM,
345 SrcMem | ModRM | Stack, 0, 352 SrcMem | ModRM | Stack, 0,
346 SrcMem | ModRM | Stack, 0, SrcMem | ModRM | Stack, 0, 353 SrcMem | ModRM | Stack, SrcMem | ModRM | Src2Mem16 | ImplicitOps,
354 SrcMem | ModRM | Stack, 0,
347 [Group7*8] = 355 [Group7*8] =
348 0, 0, ModRM | SrcMem | Priv, ModRM | SrcMem | Priv, 356 0, 0, ModRM | SrcMem | Priv, ModRM | SrcMem | Priv,
349 SrcNone | ModRM | DstMem | Mov, 0, 357 SrcNone | ModRM | DstMem | Mov, 0,
@@ -353,14 +361,14 @@ static u32 group_table[] = {
353 DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM | Lock, 361 DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM | Lock,
354 DstMem | SrcImmByte | ModRM | Lock, DstMem | SrcImmByte | ModRM | Lock, 362 DstMem | SrcImmByte | ModRM | Lock, DstMem | SrcImmByte | ModRM | Lock,
355 [Group9*8] = 363 [Group9*8] =
356 0, ImplicitOps | ModRM | Lock, 0, 0, 0, 0, 0, 0, 364 0, DstMem64 | ModRM | Lock, 0, 0, 0, 0, 0, 0,
357}; 365};
358 366
359static u32 group2_table[] = { 367static u32 group2_table[] = {
360 [Group7*8] = 368 [Group7*8] =
361 SrcNone | ModRM | Priv, 0, 0, SrcNone | ModRM, 369 SrcNone | ModRM | Priv, 0, 0, SrcNone | ModRM | Priv,
362 SrcNone | ModRM | DstMem | Mov, 0, 370 SrcNone | ModRM | DstMem | Mov, 0,
363 SrcMem16 | ModRM | Mov, 0, 371 SrcMem16 | ModRM | Mov | Priv, 0,
364 [Group9*8] = 372 [Group9*8] =
365 0, 0, 0, 0, 0, 0, 0, 0, 373 0, 0, 0, 0, 0, 0, 0, 0,
366}; 374};
@@ -562,7 +570,7 @@ static u32 group2_table[] = {
562#define insn_fetch(_type, _size, _eip) \ 570#define insn_fetch(_type, _size, _eip) \
563({ unsigned long _x; \ 571({ unsigned long _x; \
564 rc = do_insn_fetch(ctxt, ops, (_eip), &_x, (_size)); \ 572 rc = do_insn_fetch(ctxt, ops, (_eip), &_x, (_size)); \
565 if (rc != 0) \ 573 if (rc != X86EMUL_CONTINUE) \
566 goto done; \ 574 goto done; \
567 (_eip) += (_size); \ 575 (_eip) += (_size); \
568 (_type)_x; \ 576 (_type)_x; \
@@ -638,40 +646,40 @@ static unsigned long ss_base(struct x86_emulate_ctxt *ctxt)
638 646
639static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt, 647static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt,
640 struct x86_emulate_ops *ops, 648 struct x86_emulate_ops *ops,
641 unsigned long linear, u8 *dest) 649 unsigned long eip, u8 *dest)
642{ 650{
643 struct fetch_cache *fc = &ctxt->decode.fetch; 651 struct fetch_cache *fc = &ctxt->decode.fetch;
644 int rc; 652 int rc;
645 int size; 653 int size, cur_size;
646 654
647 if (linear < fc->start || linear >= fc->end) { 655 if (eip == fc->end) {
648 size = min(15UL, PAGE_SIZE - offset_in_page(linear)); 656 cur_size = fc->end - fc->start;
649 rc = ops->fetch(linear, fc->data, size, ctxt->vcpu, NULL); 657 size = min(15UL - cur_size, PAGE_SIZE - offset_in_page(eip));
650 if (rc) 658 rc = ops->fetch(ctxt->cs_base + eip, fc->data + cur_size,
659 size, ctxt->vcpu, NULL);
660 if (rc != X86EMUL_CONTINUE)
651 return rc; 661 return rc;
652 fc->start = linear; 662 fc->end += size;
653 fc->end = linear + size;
654 } 663 }
655 *dest = fc->data[linear - fc->start]; 664 *dest = fc->data[eip - fc->start];
656 return 0; 665 return X86EMUL_CONTINUE;
657} 666}
658 667
659static int do_insn_fetch(struct x86_emulate_ctxt *ctxt, 668static int do_insn_fetch(struct x86_emulate_ctxt *ctxt,
660 struct x86_emulate_ops *ops, 669 struct x86_emulate_ops *ops,
661 unsigned long eip, void *dest, unsigned size) 670 unsigned long eip, void *dest, unsigned size)
662{ 671{
663 int rc = 0; 672 int rc;
664 673
665 /* x86 instructions are limited to 15 bytes. */ 674 /* x86 instructions are limited to 15 bytes. */
666 if (eip + size - ctxt->decode.eip_orig > 15) 675 if (eip + size - ctxt->eip > 15)
667 return X86EMUL_UNHANDLEABLE; 676 return X86EMUL_UNHANDLEABLE;
668 eip += ctxt->cs_base;
669 while (size--) { 677 while (size--) {
670 rc = do_fetch_insn_byte(ctxt, ops, eip++, dest++); 678 rc = do_fetch_insn_byte(ctxt, ops, eip++, dest++);
671 if (rc) 679 if (rc != X86EMUL_CONTINUE)
672 return rc; 680 return rc;
673 } 681 }
674 return 0; 682 return X86EMUL_CONTINUE;
675} 683}
676 684
677/* 685/*
@@ -702,7 +710,7 @@ static int read_descriptor(struct x86_emulate_ctxt *ctxt,
702 *address = 0; 710 *address = 0;
703 rc = ops->read_std((unsigned long)ptr, (unsigned long *)size, 2, 711 rc = ops->read_std((unsigned long)ptr, (unsigned long *)size, 2,
704 ctxt->vcpu, NULL); 712 ctxt->vcpu, NULL);
705 if (rc) 713 if (rc != X86EMUL_CONTINUE)
706 return rc; 714 return rc;
707 rc = ops->read_std((unsigned long)ptr + 2, address, op_bytes, 715 rc = ops->read_std((unsigned long)ptr + 2, address, op_bytes,
708 ctxt->vcpu, NULL); 716 ctxt->vcpu, NULL);
@@ -782,7 +790,7 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
782 struct decode_cache *c = &ctxt->decode; 790 struct decode_cache *c = &ctxt->decode;
783 u8 sib; 791 u8 sib;
784 int index_reg = 0, base_reg = 0, scale; 792 int index_reg = 0, base_reg = 0, scale;
785 int rc = 0; 793 int rc = X86EMUL_CONTINUE;
786 794
787 if (c->rex_prefix) { 795 if (c->rex_prefix) {
788 c->modrm_reg = (c->rex_prefix & 4) << 1; /* REX.R */ 796 c->modrm_reg = (c->rex_prefix & 4) << 1; /* REX.R */
@@ -895,7 +903,7 @@ static int decode_abs(struct x86_emulate_ctxt *ctxt,
895 struct x86_emulate_ops *ops) 903 struct x86_emulate_ops *ops)
896{ 904{
897 struct decode_cache *c = &ctxt->decode; 905 struct decode_cache *c = &ctxt->decode;
898 int rc = 0; 906 int rc = X86EMUL_CONTINUE;
899 907
900 switch (c->ad_bytes) { 908 switch (c->ad_bytes) {
901 case 2: 909 case 2:
@@ -916,14 +924,18 @@ int
916x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) 924x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
917{ 925{
918 struct decode_cache *c = &ctxt->decode; 926 struct decode_cache *c = &ctxt->decode;
919 int rc = 0; 927 int rc = X86EMUL_CONTINUE;
920 int mode = ctxt->mode; 928 int mode = ctxt->mode;
921 int def_op_bytes, def_ad_bytes, group; 929 int def_op_bytes, def_ad_bytes, group;
922 930
923 /* Shadow copy of register state. Committed on successful emulation. */
924 931
932 /* we cannot decode insn before we complete previous rep insn */
933 WARN_ON(ctxt->restart);
934
935 /* Shadow copy of register state. Committed on successful emulation. */
925 memset(c, 0, sizeof(struct decode_cache)); 936 memset(c, 0, sizeof(struct decode_cache));
926 c->eip = c->eip_orig = kvm_rip_read(ctxt->vcpu); 937 c->eip = ctxt->eip;
938 c->fetch.start = c->fetch.end = c->eip;
927 ctxt->cs_base = seg_base(ctxt, VCPU_SREG_CS); 939 ctxt->cs_base = seg_base(ctxt, VCPU_SREG_CS);
928 memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs); 940 memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
929 941
@@ -1015,11 +1027,6 @@ done_prefixes:
1015 } 1027 }
1016 } 1028 }
1017 1029
1018 if (mode == X86EMUL_MODE_PROT64 && (c->d & No64)) {
1019 kvm_report_emulation_failure(ctxt->vcpu, "invalid x86/64 instruction");
1020 return -1;
1021 }
1022
1023 if (c->d & Group) { 1030 if (c->d & Group) {
1024 group = c->d & GroupMask; 1031 group = c->d & GroupMask;
1025 c->modrm = insn_fetch(u8, 1, c->eip); 1032 c->modrm = insn_fetch(u8, 1, c->eip);
@@ -1046,7 +1053,7 @@ done_prefixes:
1046 rc = decode_modrm(ctxt, ops); 1053 rc = decode_modrm(ctxt, ops);
1047 else if (c->d & MemAbs) 1054 else if (c->d & MemAbs)
1048 rc = decode_abs(ctxt, ops); 1055 rc = decode_abs(ctxt, ops);
1049 if (rc) 1056 if (rc != X86EMUL_CONTINUE)
1050 goto done; 1057 goto done;
1051 1058
1052 if (!c->has_seg_override) 1059 if (!c->has_seg_override)
@@ -1057,6 +1064,10 @@ done_prefixes:
1057 1064
1058 if (c->ad_bytes != 8) 1065 if (c->ad_bytes != 8)
1059 c->modrm_ea = (u32)c->modrm_ea; 1066 c->modrm_ea = (u32)c->modrm_ea;
1067
1068 if (c->rip_relative)
1069 c->modrm_ea += c->eip;
1070
1060 /* 1071 /*
1061 * Decode and fetch the source operand: register, memory 1072 * Decode and fetch the source operand: register, memory
1062 * or immediate. 1073 * or immediate.
@@ -1091,6 +1102,8 @@ done_prefixes:
1091 break; 1102 break;
1092 } 1103 }
1093 c->src.type = OP_MEM; 1104 c->src.type = OP_MEM;
1105 c->src.ptr = (unsigned long *)c->modrm_ea;
1106 c->src.val = 0;
1094 break; 1107 break;
1095 case SrcImm: 1108 case SrcImm:
1096 case SrcImmU: 1109 case SrcImmU:
@@ -1139,6 +1152,14 @@ done_prefixes:
1139 c->src.bytes = 1; 1152 c->src.bytes = 1;
1140 c->src.val = 1; 1153 c->src.val = 1;
1141 break; 1154 break;
1155 case SrcSI:
1156 c->src.type = OP_MEM;
1157 c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1158 c->src.ptr = (unsigned long *)
1159 register_address(c, seg_override_base(ctxt, c),
1160 c->regs[VCPU_REGS_RSI]);
1161 c->src.val = 0;
1162 break;
1142 } 1163 }
1143 1164
1144 /* 1165 /*
@@ -1168,6 +1189,12 @@ done_prefixes:
1168 c->src2.bytes = 1; 1189 c->src2.bytes = 1;
1169 c->src2.val = 1; 1190 c->src2.val = 1;
1170 break; 1191 break;
1192 case Src2Mem16:
1193 c->src2.type = OP_MEM;
1194 c->src2.bytes = 2;
1195 c->src2.ptr = (unsigned long *)(c->modrm_ea + c->src.bytes);
1196 c->src2.val = 0;
1197 break;
1171 } 1198 }
1172 1199
1173 /* Decode and fetch the destination operand: register or memory. */ 1200 /* Decode and fetch the destination operand: register or memory. */
@@ -1180,6 +1207,7 @@ done_prefixes:
1180 c->twobyte && (c->b == 0xb6 || c->b == 0xb7)); 1207 c->twobyte && (c->b == 0xb6 || c->b == 0xb7));
1181 break; 1208 break;
1182 case DstMem: 1209 case DstMem:
1210 case DstMem64:
1183 if ((c->d & ModRM) && c->modrm_mod == 3) { 1211 if ((c->d & ModRM) && c->modrm_mod == 3) {
1184 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; 1212 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1185 c->dst.type = OP_REG; 1213 c->dst.type = OP_REG;
@@ -1188,12 +1216,24 @@ done_prefixes:
1188 break; 1216 break;
1189 } 1217 }
1190 c->dst.type = OP_MEM; 1218 c->dst.type = OP_MEM;
1219 c->dst.ptr = (unsigned long *)c->modrm_ea;
1220 if ((c->d & DstMask) == DstMem64)
1221 c->dst.bytes = 8;
1222 else
1223 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1224 c->dst.val = 0;
1225 if (c->d & BitOp) {
1226 unsigned long mask = ~(c->dst.bytes * 8 - 1);
1227
1228 c->dst.ptr = (void *)c->dst.ptr +
1229 (c->src.val & mask) / 8;
1230 }
1191 break; 1231 break;
1192 case DstAcc: 1232 case DstAcc:
1193 c->dst.type = OP_REG; 1233 c->dst.type = OP_REG;
1194 c->dst.bytes = c->op_bytes; 1234 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1195 c->dst.ptr = &c->regs[VCPU_REGS_RAX]; 1235 c->dst.ptr = &c->regs[VCPU_REGS_RAX];
1196 switch (c->op_bytes) { 1236 switch (c->dst.bytes) {
1197 case 1: 1237 case 1:
1198 c->dst.val = *(u8 *)c->dst.ptr; 1238 c->dst.val = *(u8 *)c->dst.ptr;
1199 break; 1239 break;
@@ -1203,18 +1243,248 @@ done_prefixes:
1203 case 4: 1243 case 4:
1204 c->dst.val = *(u32 *)c->dst.ptr; 1244 c->dst.val = *(u32 *)c->dst.ptr;
1205 break; 1245 break;
1246 case 8:
1247 c->dst.val = *(u64 *)c->dst.ptr;
1248 break;
1206 } 1249 }
1207 c->dst.orig_val = c->dst.val; 1250 c->dst.orig_val = c->dst.val;
1208 break; 1251 break;
1252 case DstDI:
1253 c->dst.type = OP_MEM;
1254 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1255 c->dst.ptr = (unsigned long *)
1256 register_address(c, es_base(ctxt),
1257 c->regs[VCPU_REGS_RDI]);
1258 c->dst.val = 0;
1259 break;
1209 } 1260 }
1210 1261
1211 if (c->rip_relative)
1212 c->modrm_ea += c->eip;
1213
1214done: 1262done:
1215 return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0; 1263 return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
1216} 1264}
1217 1265
1266static int pio_in_emulated(struct x86_emulate_ctxt *ctxt,
1267 struct x86_emulate_ops *ops,
1268 unsigned int size, unsigned short port,
1269 void *dest)
1270{
1271 struct read_cache *rc = &ctxt->decode.io_read;
1272
1273 if (rc->pos == rc->end) { /* refill pio read ahead */
1274 struct decode_cache *c = &ctxt->decode;
1275 unsigned int in_page, n;
1276 unsigned int count = c->rep_prefix ?
1277 address_mask(c, c->regs[VCPU_REGS_RCX]) : 1;
1278 in_page = (ctxt->eflags & EFLG_DF) ?
1279 offset_in_page(c->regs[VCPU_REGS_RDI]) :
1280 PAGE_SIZE - offset_in_page(c->regs[VCPU_REGS_RDI]);
1281 n = min(min(in_page, (unsigned int)sizeof(rc->data)) / size,
1282 count);
1283 if (n == 0)
1284 n = 1;
1285 rc->pos = rc->end = 0;
1286 if (!ops->pio_in_emulated(size, port, rc->data, n, ctxt->vcpu))
1287 return 0;
1288 rc->end = n * size;
1289 }
1290
1291 memcpy(dest, rc->data + rc->pos, size);
1292 rc->pos += size;
1293 return 1;
1294}
1295
1296static u32 desc_limit_scaled(struct desc_struct *desc)
1297{
1298 u32 limit = get_desc_limit(desc);
1299
1300 return desc->g ? (limit << 12) | 0xfff : limit;
1301}
1302
1303static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt,
1304 struct x86_emulate_ops *ops,
1305 u16 selector, struct desc_ptr *dt)
1306{
1307 if (selector & 1 << 2) {
1308 struct desc_struct desc;
1309 memset (dt, 0, sizeof *dt);
1310 if (!ops->get_cached_descriptor(&desc, VCPU_SREG_LDTR, ctxt->vcpu))
1311 return;
1312
1313 dt->size = desc_limit_scaled(&desc); /* what if limit > 65535? */
1314 dt->address = get_desc_base(&desc);
1315 } else
1316 ops->get_gdt(dt, ctxt->vcpu);
1317}
1318
1319/* allowed just for 8 bytes segments */
1320static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt,
1321 struct x86_emulate_ops *ops,
1322 u16 selector, struct desc_struct *desc)
1323{
1324 struct desc_ptr dt;
1325 u16 index = selector >> 3;
1326 int ret;
1327 u32 err;
1328 ulong addr;
1329
1330 get_descriptor_table_ptr(ctxt, ops, selector, &dt);
1331
1332 if (dt.size < index * 8 + 7) {
1333 kvm_inject_gp(ctxt->vcpu, selector & 0xfffc);
1334 return X86EMUL_PROPAGATE_FAULT;
1335 }
1336 addr = dt.address + index * 8;
1337 ret = ops->read_std(addr, desc, sizeof *desc, ctxt->vcpu, &err);
1338 if (ret == X86EMUL_PROPAGATE_FAULT)
1339 kvm_inject_page_fault(ctxt->vcpu, addr, err);
1340
1341 return ret;
1342}
1343
1344/* allowed just for 8 bytes segments */
1345static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt,
1346 struct x86_emulate_ops *ops,
1347 u16 selector, struct desc_struct *desc)
1348{
1349 struct desc_ptr dt;
1350 u16 index = selector >> 3;
1351 u32 err;
1352 ulong addr;
1353 int ret;
1354
1355 get_descriptor_table_ptr(ctxt, ops, selector, &dt);
1356
1357 if (dt.size < index * 8 + 7) {
1358 kvm_inject_gp(ctxt->vcpu, selector & 0xfffc);
1359 return X86EMUL_PROPAGATE_FAULT;
1360 }
1361
1362 addr = dt.address + index * 8;
1363 ret = ops->write_std(addr, desc, sizeof *desc, ctxt->vcpu, &err);
1364 if (ret == X86EMUL_PROPAGATE_FAULT)
1365 kvm_inject_page_fault(ctxt->vcpu, addr, err);
1366
1367 return ret;
1368}
1369
1370static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
1371 struct x86_emulate_ops *ops,
1372 u16 selector, int seg)
1373{
1374 struct desc_struct seg_desc;
1375 u8 dpl, rpl, cpl;
1376 unsigned err_vec = GP_VECTOR;
1377 u32 err_code = 0;
1378 bool null_selector = !(selector & ~0x3); /* 0000-0003 are null */
1379 int ret;
1380
1381 memset(&seg_desc, 0, sizeof seg_desc);
1382
1383 if ((seg <= VCPU_SREG_GS && ctxt->mode == X86EMUL_MODE_VM86)
1384 || ctxt->mode == X86EMUL_MODE_REAL) {
1385 /* set real mode segment descriptor */
1386 set_desc_base(&seg_desc, selector << 4);
1387 set_desc_limit(&seg_desc, 0xffff);
1388 seg_desc.type = 3;
1389 seg_desc.p = 1;
1390 seg_desc.s = 1;
1391 goto load;
1392 }
1393
1394 /* NULL selector is not valid for TR, CS and SS */
1395 if ((seg == VCPU_SREG_CS || seg == VCPU_SREG_SS || seg == VCPU_SREG_TR)
1396 && null_selector)
1397 goto exception;
1398
1399 /* TR should be in GDT only */
1400 if (seg == VCPU_SREG_TR && (selector & (1 << 2)))
1401 goto exception;
1402
1403 if (null_selector) /* for NULL selector skip all following checks */
1404 goto load;
1405
1406 ret = read_segment_descriptor(ctxt, ops, selector, &seg_desc);
1407 if (ret != X86EMUL_CONTINUE)
1408 return ret;
1409
1410 err_code = selector & 0xfffc;
1411 err_vec = GP_VECTOR;
1412
1413 /* can't load system descriptor into segment selecor */
1414 if (seg <= VCPU_SREG_GS && !seg_desc.s)
1415 goto exception;
1416
1417 if (!seg_desc.p) {
1418 err_vec = (seg == VCPU_SREG_SS) ? SS_VECTOR : NP_VECTOR;
1419 goto exception;
1420 }
1421
1422 rpl = selector & 3;
1423 dpl = seg_desc.dpl;
1424 cpl = ops->cpl(ctxt->vcpu);
1425
1426 switch (seg) {
1427 case VCPU_SREG_SS:
1428 /*
1429 * segment is not a writable data segment or segment
1430 * selector's RPL != CPL or segment selector's RPL != CPL
1431 */
1432 if (rpl != cpl || (seg_desc.type & 0xa) != 0x2 || dpl != cpl)
1433 goto exception;
1434 break;
1435 case VCPU_SREG_CS:
1436 if (!(seg_desc.type & 8))
1437 goto exception;
1438
1439 if (seg_desc.type & 4) {
1440 /* conforming */
1441 if (dpl > cpl)
1442 goto exception;
1443 } else {
1444 /* nonconforming */
1445 if (rpl > cpl || dpl != cpl)
1446 goto exception;
1447 }
1448 /* CS(RPL) <- CPL */
1449 selector = (selector & 0xfffc) | cpl;
1450 break;
1451 case VCPU_SREG_TR:
1452 if (seg_desc.s || (seg_desc.type != 1 && seg_desc.type != 9))
1453 goto exception;
1454 break;
1455 case VCPU_SREG_LDTR:
1456 if (seg_desc.s || seg_desc.type != 2)
1457 goto exception;
1458 break;
1459 default: /* DS, ES, FS, or GS */
1460 /*
1461 * segment is not a data or readable code segment or
1462 * ((segment is a data or nonconforming code segment)
1463 * and (both RPL and CPL > DPL))
1464 */
1465 if ((seg_desc.type & 0xa) == 0x8 ||
1466 (((seg_desc.type & 0xc) != 0xc) &&
1467 (rpl > dpl && cpl > dpl)))
1468 goto exception;
1469 break;
1470 }
1471
1472 if (seg_desc.s) {
1473 /* mark segment as accessed */
1474 seg_desc.type |= 1;
1475 ret = write_segment_descriptor(ctxt, ops, selector, &seg_desc);
1476 if (ret != X86EMUL_CONTINUE)
1477 return ret;
1478 }
1479load:
1480 ops->set_segment_selector(selector, seg, ctxt->vcpu);
1481 ops->set_cached_descriptor(&seg_desc, seg, ctxt->vcpu);
1482 return X86EMUL_CONTINUE;
1483exception:
1484 kvm_queue_exception_e(ctxt->vcpu, err_vec, err_code);
1485 return X86EMUL_PROPAGATE_FAULT;
1486}
1487
1218static inline void emulate_push(struct x86_emulate_ctxt *ctxt) 1488static inline void emulate_push(struct x86_emulate_ctxt *ctxt)
1219{ 1489{
1220 struct decode_cache *c = &ctxt->decode; 1490 struct decode_cache *c = &ctxt->decode;
@@ -1251,7 +1521,7 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt,
1251 int rc; 1521 int rc;
1252 unsigned long val, change_mask; 1522 unsigned long val, change_mask;
1253 int iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT; 1523 int iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT;
1254 int cpl = kvm_x86_ops->get_cpl(ctxt->vcpu); 1524 int cpl = ops->cpl(ctxt->vcpu);
1255 1525
1256 rc = emulate_pop(ctxt, ops, &val, len); 1526 rc = emulate_pop(ctxt, ops, &val, len);
1257 if (rc != X86EMUL_CONTINUE) 1527 if (rc != X86EMUL_CONTINUE)
@@ -1306,10 +1576,10 @@ static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt,
1306 int rc; 1576 int rc;
1307 1577
1308 rc = emulate_pop(ctxt, ops, &selector, c->op_bytes); 1578 rc = emulate_pop(ctxt, ops, &selector, c->op_bytes);
1309 if (rc != 0) 1579 if (rc != X86EMUL_CONTINUE)
1310 return rc; 1580 return rc;
1311 1581
1312 rc = kvm_load_segment_descriptor(ctxt->vcpu, (u16)selector, seg); 1582 rc = load_segment_descriptor(ctxt, ops, (u16)selector, seg);
1313 return rc; 1583 return rc;
1314} 1584}
1315 1585
@@ -1332,7 +1602,7 @@ static int emulate_popa(struct x86_emulate_ctxt *ctxt,
1332 struct x86_emulate_ops *ops) 1602 struct x86_emulate_ops *ops)
1333{ 1603{
1334 struct decode_cache *c = &ctxt->decode; 1604 struct decode_cache *c = &ctxt->decode;
1335 int rc = 0; 1605 int rc = X86EMUL_CONTINUE;
1336 int reg = VCPU_REGS_RDI; 1606 int reg = VCPU_REGS_RDI;
1337 1607
1338 while (reg >= VCPU_REGS_RAX) { 1608 while (reg >= VCPU_REGS_RAX) {
@@ -1343,7 +1613,7 @@ static int emulate_popa(struct x86_emulate_ctxt *ctxt,
1343 } 1613 }
1344 1614
1345 rc = emulate_pop(ctxt, ops, &c->regs[reg], c->op_bytes); 1615 rc = emulate_pop(ctxt, ops, &c->regs[reg], c->op_bytes);
1346 if (rc != 0) 1616 if (rc != X86EMUL_CONTINUE)
1347 break; 1617 break;
1348 --reg; 1618 --reg;
1349 } 1619 }
@@ -1354,12 +1624,8 @@ static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt,
1354 struct x86_emulate_ops *ops) 1624 struct x86_emulate_ops *ops)
1355{ 1625{
1356 struct decode_cache *c = &ctxt->decode; 1626 struct decode_cache *c = &ctxt->decode;
1357 int rc;
1358 1627
1359 rc = emulate_pop(ctxt, ops, &c->dst.val, c->dst.bytes); 1628 return emulate_pop(ctxt, ops, &c->dst.val, c->dst.bytes);
1360 if (rc != 0)
1361 return rc;
1362 return 0;
1363} 1629}
1364 1630
1365static inline void emulate_grp2(struct x86_emulate_ctxt *ctxt) 1631static inline void emulate_grp2(struct x86_emulate_ctxt *ctxt)
@@ -1395,7 +1661,6 @@ static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt,
1395 struct x86_emulate_ops *ops) 1661 struct x86_emulate_ops *ops)
1396{ 1662{
1397 struct decode_cache *c = &ctxt->decode; 1663 struct decode_cache *c = &ctxt->decode;
1398 int rc = 0;
1399 1664
1400 switch (c->modrm_reg) { 1665 switch (c->modrm_reg) {
1401 case 0 ... 1: /* test */ 1666 case 0 ... 1: /* test */
@@ -1408,11 +1673,9 @@ static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt,
1408 emulate_1op("neg", c->dst, ctxt->eflags); 1673 emulate_1op("neg", c->dst, ctxt->eflags);
1409 break; 1674 break;
1410 default: 1675 default:
1411 DPRINTF("Cannot emulate %02x\n", c->b); 1676 return 0;
1412 rc = X86EMUL_UNHANDLEABLE;
1413 break;
1414 } 1677 }
1415 return rc; 1678 return 1;
1416} 1679}
1417 1680
1418static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt, 1681static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt,
@@ -1442,20 +1705,14 @@ static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt,
1442 emulate_push(ctxt); 1705 emulate_push(ctxt);
1443 break; 1706 break;
1444 } 1707 }
1445 return 0; 1708 return X86EMUL_CONTINUE;
1446} 1709}
1447 1710
1448static inline int emulate_grp9(struct x86_emulate_ctxt *ctxt, 1711static inline int emulate_grp9(struct x86_emulate_ctxt *ctxt,
1449 struct x86_emulate_ops *ops, 1712 struct x86_emulate_ops *ops)
1450 unsigned long memop)
1451{ 1713{
1452 struct decode_cache *c = &ctxt->decode; 1714 struct decode_cache *c = &ctxt->decode;
1453 u64 old, new; 1715 u64 old = c->dst.orig_val;
1454 int rc;
1455
1456 rc = ops->read_emulated(memop, &old, 8, ctxt->vcpu);
1457 if (rc != X86EMUL_CONTINUE)
1458 return rc;
1459 1716
1460 if (((u32) (old >> 0) != (u32) c->regs[VCPU_REGS_RAX]) || 1717 if (((u32) (old >> 0) != (u32) c->regs[VCPU_REGS_RAX]) ||
1461 ((u32) (old >> 32) != (u32) c->regs[VCPU_REGS_RDX])) { 1718 ((u32) (old >> 32) != (u32) c->regs[VCPU_REGS_RDX])) {
@@ -1463,17 +1720,13 @@ static inline int emulate_grp9(struct x86_emulate_ctxt *ctxt,
1463 c->regs[VCPU_REGS_RAX] = (u32) (old >> 0); 1720 c->regs[VCPU_REGS_RAX] = (u32) (old >> 0);
1464 c->regs[VCPU_REGS_RDX] = (u32) (old >> 32); 1721 c->regs[VCPU_REGS_RDX] = (u32) (old >> 32);
1465 ctxt->eflags &= ~EFLG_ZF; 1722 ctxt->eflags &= ~EFLG_ZF;
1466
1467 } else { 1723 } else {
1468 new = ((u64)c->regs[VCPU_REGS_RCX] << 32) | 1724 c->dst.val = ((u64)c->regs[VCPU_REGS_RCX] << 32) |
1469 (u32) c->regs[VCPU_REGS_RBX]; 1725 (u32) c->regs[VCPU_REGS_RBX];
1470 1726
1471 rc = ops->cmpxchg_emulated(memop, &old, &new, 8, ctxt->vcpu);
1472 if (rc != X86EMUL_CONTINUE)
1473 return rc;
1474 ctxt->eflags |= EFLG_ZF; 1727 ctxt->eflags |= EFLG_ZF;
1475 } 1728 }
1476 return 0; 1729 return X86EMUL_CONTINUE;
1477} 1730}
1478 1731
1479static int emulate_ret_far(struct x86_emulate_ctxt *ctxt, 1732static int emulate_ret_far(struct x86_emulate_ctxt *ctxt,
@@ -1484,14 +1737,14 @@ static int emulate_ret_far(struct x86_emulate_ctxt *ctxt,
1484 unsigned long cs; 1737 unsigned long cs;
1485 1738
1486 rc = emulate_pop(ctxt, ops, &c->eip, c->op_bytes); 1739 rc = emulate_pop(ctxt, ops, &c->eip, c->op_bytes);
1487 if (rc) 1740 if (rc != X86EMUL_CONTINUE)
1488 return rc; 1741 return rc;
1489 if (c->op_bytes == 4) 1742 if (c->op_bytes == 4)
1490 c->eip = (u32)c->eip; 1743 c->eip = (u32)c->eip;
1491 rc = emulate_pop(ctxt, ops, &cs, c->op_bytes); 1744 rc = emulate_pop(ctxt, ops, &cs, c->op_bytes);
1492 if (rc) 1745 if (rc != X86EMUL_CONTINUE)
1493 return rc; 1746 return rc;
1494 rc = kvm_load_segment_descriptor(ctxt->vcpu, (u16)cs, VCPU_SREG_CS); 1747 rc = load_segment_descriptor(ctxt, ops, (u16)cs, VCPU_SREG_CS);
1495 return rc; 1748 return rc;
1496} 1749}
1497 1750
@@ -1544,7 +1797,7 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt,
1544 default: 1797 default:
1545 break; 1798 break;
1546 } 1799 }
1547 return 0; 1800 return X86EMUL_CONTINUE;
1548} 1801}
1549 1802
1550static void toggle_interruptibility(struct x86_emulate_ctxt *ctxt, u32 mask) 1803static void toggle_interruptibility(struct x86_emulate_ctxt *ctxt, u32 mask)
@@ -1598,8 +1851,11 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt)
1598 u64 msr_data; 1851 u64 msr_data;
1599 1852
1600 /* syscall is not available in real mode */ 1853 /* syscall is not available in real mode */
1601 if (ctxt->mode == X86EMUL_MODE_REAL || ctxt->mode == X86EMUL_MODE_VM86) 1854 if (ctxt->mode == X86EMUL_MODE_REAL ||
1602 return X86EMUL_UNHANDLEABLE; 1855 ctxt->mode == X86EMUL_MODE_VM86) {
1856 kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
1857 return X86EMUL_PROPAGATE_FAULT;
1858 }
1603 1859
1604 setup_syscalls_segments(ctxt, &cs, &ss); 1860 setup_syscalls_segments(ctxt, &cs, &ss);
1605 1861
@@ -1649,14 +1905,16 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt)
1649 /* inject #GP if in real mode */ 1905 /* inject #GP if in real mode */
1650 if (ctxt->mode == X86EMUL_MODE_REAL) { 1906 if (ctxt->mode == X86EMUL_MODE_REAL) {
1651 kvm_inject_gp(ctxt->vcpu, 0); 1907 kvm_inject_gp(ctxt->vcpu, 0);
1652 return X86EMUL_UNHANDLEABLE; 1908 return X86EMUL_PROPAGATE_FAULT;
1653 } 1909 }
1654 1910
1655 /* XXX sysenter/sysexit have not been tested in 64bit mode. 1911 /* XXX sysenter/sysexit have not been tested in 64bit mode.
1656 * Therefore, we inject an #UD. 1912 * Therefore, we inject an #UD.
1657 */ 1913 */
1658 if (ctxt->mode == X86EMUL_MODE_PROT64) 1914 if (ctxt->mode == X86EMUL_MODE_PROT64) {
1659 return X86EMUL_UNHANDLEABLE; 1915 kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
1916 return X86EMUL_PROPAGATE_FAULT;
1917 }
1660 1918
1661 setup_syscalls_segments(ctxt, &cs, &ss); 1919 setup_syscalls_segments(ctxt, &cs, &ss);
1662 1920
@@ -1711,7 +1969,7 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt)
1711 if (ctxt->mode == X86EMUL_MODE_REAL || 1969 if (ctxt->mode == X86EMUL_MODE_REAL ||
1712 ctxt->mode == X86EMUL_MODE_VM86) { 1970 ctxt->mode == X86EMUL_MODE_VM86) {
1713 kvm_inject_gp(ctxt->vcpu, 0); 1971 kvm_inject_gp(ctxt->vcpu, 0);
1714 return X86EMUL_UNHANDLEABLE; 1972 return X86EMUL_PROPAGATE_FAULT;
1715 } 1973 }
1716 1974
1717 setup_syscalls_segments(ctxt, &cs, &ss); 1975 setup_syscalls_segments(ctxt, &cs, &ss);
@@ -1756,7 +2014,8 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt)
1756 return X86EMUL_CONTINUE; 2014 return X86EMUL_CONTINUE;
1757} 2015}
1758 2016
1759static bool emulator_bad_iopl(struct x86_emulate_ctxt *ctxt) 2017static bool emulator_bad_iopl(struct x86_emulate_ctxt *ctxt,
2018 struct x86_emulate_ops *ops)
1760{ 2019{
1761 int iopl; 2020 int iopl;
1762 if (ctxt->mode == X86EMUL_MODE_REAL) 2021 if (ctxt->mode == X86EMUL_MODE_REAL)
@@ -1764,7 +2023,7 @@ static bool emulator_bad_iopl(struct x86_emulate_ctxt *ctxt)
1764 if (ctxt->mode == X86EMUL_MODE_VM86) 2023 if (ctxt->mode == X86EMUL_MODE_VM86)
1765 return true; 2024 return true;
1766 iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT; 2025 iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT;
1767 return kvm_x86_ops->get_cpl(ctxt->vcpu) > iopl; 2026 return ops->cpl(ctxt->vcpu) > iopl;
1768} 2027}
1769 2028
1770static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt, 2029static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt,
@@ -1801,22 +2060,419 @@ static bool emulator_io_permited(struct x86_emulate_ctxt *ctxt,
1801 struct x86_emulate_ops *ops, 2060 struct x86_emulate_ops *ops,
1802 u16 port, u16 len) 2061 u16 port, u16 len)
1803{ 2062{
1804 if (emulator_bad_iopl(ctxt)) 2063 if (emulator_bad_iopl(ctxt, ops))
1805 if (!emulator_io_port_access_allowed(ctxt, ops, port, len)) 2064 if (!emulator_io_port_access_allowed(ctxt, ops, port, len))
1806 return false; 2065 return false;
1807 return true; 2066 return true;
1808} 2067}
1809 2068
2069static u32 get_cached_descriptor_base(struct x86_emulate_ctxt *ctxt,
2070 struct x86_emulate_ops *ops,
2071 int seg)
2072{
2073 struct desc_struct desc;
2074 if (ops->get_cached_descriptor(&desc, seg, ctxt->vcpu))
2075 return get_desc_base(&desc);
2076 else
2077 return ~0;
2078}
2079
2080static void save_state_to_tss16(struct x86_emulate_ctxt *ctxt,
2081 struct x86_emulate_ops *ops,
2082 struct tss_segment_16 *tss)
2083{
2084 struct decode_cache *c = &ctxt->decode;
2085
2086 tss->ip = c->eip;
2087 tss->flag = ctxt->eflags;
2088 tss->ax = c->regs[VCPU_REGS_RAX];
2089 tss->cx = c->regs[VCPU_REGS_RCX];
2090 tss->dx = c->regs[VCPU_REGS_RDX];
2091 tss->bx = c->regs[VCPU_REGS_RBX];
2092 tss->sp = c->regs[VCPU_REGS_RSP];
2093 tss->bp = c->regs[VCPU_REGS_RBP];
2094 tss->si = c->regs[VCPU_REGS_RSI];
2095 tss->di = c->regs[VCPU_REGS_RDI];
2096
2097 tss->es = ops->get_segment_selector(VCPU_SREG_ES, ctxt->vcpu);
2098 tss->cs = ops->get_segment_selector(VCPU_SREG_CS, ctxt->vcpu);
2099 tss->ss = ops->get_segment_selector(VCPU_SREG_SS, ctxt->vcpu);
2100 tss->ds = ops->get_segment_selector(VCPU_SREG_DS, ctxt->vcpu);
2101 tss->ldt = ops->get_segment_selector(VCPU_SREG_LDTR, ctxt->vcpu);
2102}
2103
2104static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt,
2105 struct x86_emulate_ops *ops,
2106 struct tss_segment_16 *tss)
2107{
2108 struct decode_cache *c = &ctxt->decode;
2109 int ret;
2110
2111 c->eip = tss->ip;
2112 ctxt->eflags = tss->flag | 2;
2113 c->regs[VCPU_REGS_RAX] = tss->ax;
2114 c->regs[VCPU_REGS_RCX] = tss->cx;
2115 c->regs[VCPU_REGS_RDX] = tss->dx;
2116 c->regs[VCPU_REGS_RBX] = tss->bx;
2117 c->regs[VCPU_REGS_RSP] = tss->sp;
2118 c->regs[VCPU_REGS_RBP] = tss->bp;
2119 c->regs[VCPU_REGS_RSI] = tss->si;
2120 c->regs[VCPU_REGS_RDI] = tss->di;
2121
2122 /*
2123 * SDM says that segment selectors are loaded before segment
2124 * descriptors
2125 */
2126 ops->set_segment_selector(tss->ldt, VCPU_SREG_LDTR, ctxt->vcpu);
2127 ops->set_segment_selector(tss->es, VCPU_SREG_ES, ctxt->vcpu);
2128 ops->set_segment_selector(tss->cs, VCPU_SREG_CS, ctxt->vcpu);
2129 ops->set_segment_selector(tss->ss, VCPU_SREG_SS, ctxt->vcpu);
2130 ops->set_segment_selector(tss->ds, VCPU_SREG_DS, ctxt->vcpu);
2131
2132 /*
2133 * Now load segment descriptors. If fault happenes at this stage
2134 * it is handled in a context of new task
2135 */
2136 ret = load_segment_descriptor(ctxt, ops, tss->ldt, VCPU_SREG_LDTR);
2137 if (ret != X86EMUL_CONTINUE)
2138 return ret;
2139 ret = load_segment_descriptor(ctxt, ops, tss->es, VCPU_SREG_ES);
2140 if (ret != X86EMUL_CONTINUE)
2141 return ret;
2142 ret = load_segment_descriptor(ctxt, ops, tss->cs, VCPU_SREG_CS);
2143 if (ret != X86EMUL_CONTINUE)
2144 return ret;
2145 ret = load_segment_descriptor(ctxt, ops, tss->ss, VCPU_SREG_SS);
2146 if (ret != X86EMUL_CONTINUE)
2147 return ret;
2148 ret = load_segment_descriptor(ctxt, ops, tss->ds, VCPU_SREG_DS);
2149 if (ret != X86EMUL_CONTINUE)
2150 return ret;
2151
2152 return X86EMUL_CONTINUE;
2153}
2154
2155static int task_switch_16(struct x86_emulate_ctxt *ctxt,
2156 struct x86_emulate_ops *ops,
2157 u16 tss_selector, u16 old_tss_sel,
2158 ulong old_tss_base, struct desc_struct *new_desc)
2159{
2160 struct tss_segment_16 tss_seg;
2161 int ret;
2162 u32 err, new_tss_base = get_desc_base(new_desc);
2163
2164 ret = ops->read_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
2165 &err);
2166 if (ret == X86EMUL_PROPAGATE_FAULT) {
2167 /* FIXME: need to provide precise fault address */
2168 kvm_inject_page_fault(ctxt->vcpu, old_tss_base, err);
2169 return ret;
2170 }
2171
2172 save_state_to_tss16(ctxt, ops, &tss_seg);
2173
2174 ret = ops->write_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
2175 &err);
2176 if (ret == X86EMUL_PROPAGATE_FAULT) {
2177 /* FIXME: need to provide precise fault address */
2178 kvm_inject_page_fault(ctxt->vcpu, old_tss_base, err);
2179 return ret;
2180 }
2181
2182 ret = ops->read_std(new_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
2183 &err);
2184 if (ret == X86EMUL_PROPAGATE_FAULT) {
2185 /* FIXME: need to provide precise fault address */
2186 kvm_inject_page_fault(ctxt->vcpu, new_tss_base, err);
2187 return ret;
2188 }
2189
2190 if (old_tss_sel != 0xffff) {
2191 tss_seg.prev_task_link = old_tss_sel;
2192
2193 ret = ops->write_std(new_tss_base,
2194 &tss_seg.prev_task_link,
2195 sizeof tss_seg.prev_task_link,
2196 ctxt->vcpu, &err);
2197 if (ret == X86EMUL_PROPAGATE_FAULT) {
2198 /* FIXME: need to provide precise fault address */
2199 kvm_inject_page_fault(ctxt->vcpu, new_tss_base, err);
2200 return ret;
2201 }
2202 }
2203
2204 return load_state_from_tss16(ctxt, ops, &tss_seg);
2205}
2206
2207static void save_state_to_tss32(struct x86_emulate_ctxt *ctxt,
2208 struct x86_emulate_ops *ops,
2209 struct tss_segment_32 *tss)
2210{
2211 struct decode_cache *c = &ctxt->decode;
2212
2213 tss->cr3 = ops->get_cr(3, ctxt->vcpu);
2214 tss->eip = c->eip;
2215 tss->eflags = ctxt->eflags;
2216 tss->eax = c->regs[VCPU_REGS_RAX];
2217 tss->ecx = c->regs[VCPU_REGS_RCX];
2218 tss->edx = c->regs[VCPU_REGS_RDX];
2219 tss->ebx = c->regs[VCPU_REGS_RBX];
2220 tss->esp = c->regs[VCPU_REGS_RSP];
2221 tss->ebp = c->regs[VCPU_REGS_RBP];
2222 tss->esi = c->regs[VCPU_REGS_RSI];
2223 tss->edi = c->regs[VCPU_REGS_RDI];
2224
2225 tss->es = ops->get_segment_selector(VCPU_SREG_ES, ctxt->vcpu);
2226 tss->cs = ops->get_segment_selector(VCPU_SREG_CS, ctxt->vcpu);
2227 tss->ss = ops->get_segment_selector(VCPU_SREG_SS, ctxt->vcpu);
2228 tss->ds = ops->get_segment_selector(VCPU_SREG_DS, ctxt->vcpu);
2229 tss->fs = ops->get_segment_selector(VCPU_SREG_FS, ctxt->vcpu);
2230 tss->gs = ops->get_segment_selector(VCPU_SREG_GS, ctxt->vcpu);
2231 tss->ldt_selector = ops->get_segment_selector(VCPU_SREG_LDTR, ctxt->vcpu);
2232}
2233
2234static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
2235 struct x86_emulate_ops *ops,
2236 struct tss_segment_32 *tss)
2237{
2238 struct decode_cache *c = &ctxt->decode;
2239 int ret;
2240
2241 ops->set_cr(3, tss->cr3, ctxt->vcpu);
2242 c->eip = tss->eip;
2243 ctxt->eflags = tss->eflags | 2;
2244 c->regs[VCPU_REGS_RAX] = tss->eax;
2245 c->regs[VCPU_REGS_RCX] = tss->ecx;
2246 c->regs[VCPU_REGS_RDX] = tss->edx;
2247 c->regs[VCPU_REGS_RBX] = tss->ebx;
2248 c->regs[VCPU_REGS_RSP] = tss->esp;
2249 c->regs[VCPU_REGS_RBP] = tss->ebp;
2250 c->regs[VCPU_REGS_RSI] = tss->esi;
2251 c->regs[VCPU_REGS_RDI] = tss->edi;
2252
2253 /*
2254 * SDM says that segment selectors are loaded before segment
2255 * descriptors
2256 */
2257 ops->set_segment_selector(tss->ldt_selector, VCPU_SREG_LDTR, ctxt->vcpu);
2258 ops->set_segment_selector(tss->es, VCPU_SREG_ES, ctxt->vcpu);
2259 ops->set_segment_selector(tss->cs, VCPU_SREG_CS, ctxt->vcpu);
2260 ops->set_segment_selector(tss->ss, VCPU_SREG_SS, ctxt->vcpu);
2261 ops->set_segment_selector(tss->ds, VCPU_SREG_DS, ctxt->vcpu);
2262 ops->set_segment_selector(tss->fs, VCPU_SREG_FS, ctxt->vcpu);
2263 ops->set_segment_selector(tss->gs, VCPU_SREG_GS, ctxt->vcpu);
2264
2265 /*
2266 * Now load segment descriptors. If fault happenes at this stage
2267 * it is handled in a context of new task
2268 */
2269 ret = load_segment_descriptor(ctxt, ops, tss->ldt_selector, VCPU_SREG_LDTR);
2270 if (ret != X86EMUL_CONTINUE)
2271 return ret;
2272 ret = load_segment_descriptor(ctxt, ops, tss->es, VCPU_SREG_ES);
2273 if (ret != X86EMUL_CONTINUE)
2274 return ret;
2275 ret = load_segment_descriptor(ctxt, ops, tss->cs, VCPU_SREG_CS);
2276 if (ret != X86EMUL_CONTINUE)
2277 return ret;
2278 ret = load_segment_descriptor(ctxt, ops, tss->ss, VCPU_SREG_SS);
2279 if (ret != X86EMUL_CONTINUE)
2280 return ret;
2281 ret = load_segment_descriptor(ctxt, ops, tss->ds, VCPU_SREG_DS);
2282 if (ret != X86EMUL_CONTINUE)
2283 return ret;
2284 ret = load_segment_descriptor(ctxt, ops, tss->fs, VCPU_SREG_FS);
2285 if (ret != X86EMUL_CONTINUE)
2286 return ret;
2287 ret = load_segment_descriptor(ctxt, ops, tss->gs, VCPU_SREG_GS);
2288 if (ret != X86EMUL_CONTINUE)
2289 return ret;
2290
2291 return X86EMUL_CONTINUE;
2292}
2293
2294static int task_switch_32(struct x86_emulate_ctxt *ctxt,
2295 struct x86_emulate_ops *ops,
2296 u16 tss_selector, u16 old_tss_sel,
2297 ulong old_tss_base, struct desc_struct *new_desc)
2298{
2299 struct tss_segment_32 tss_seg;
2300 int ret;
2301 u32 err, new_tss_base = get_desc_base(new_desc);
2302
2303 ret = ops->read_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
2304 &err);
2305 if (ret == X86EMUL_PROPAGATE_FAULT) {
2306 /* FIXME: need to provide precise fault address */
2307 kvm_inject_page_fault(ctxt->vcpu, old_tss_base, err);
2308 return ret;
2309 }
2310
2311 save_state_to_tss32(ctxt, ops, &tss_seg);
2312
2313 ret = ops->write_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
2314 &err);
2315 if (ret == X86EMUL_PROPAGATE_FAULT) {
2316 /* FIXME: need to provide precise fault address */
2317 kvm_inject_page_fault(ctxt->vcpu, old_tss_base, err);
2318 return ret;
2319 }
2320
2321 ret = ops->read_std(new_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
2322 &err);
2323 if (ret == X86EMUL_PROPAGATE_FAULT) {
2324 /* FIXME: need to provide precise fault address */
2325 kvm_inject_page_fault(ctxt->vcpu, new_tss_base, err);
2326 return ret;
2327 }
2328
2329 if (old_tss_sel != 0xffff) {
2330 tss_seg.prev_task_link = old_tss_sel;
2331
2332 ret = ops->write_std(new_tss_base,
2333 &tss_seg.prev_task_link,
2334 sizeof tss_seg.prev_task_link,
2335 ctxt->vcpu, &err);
2336 if (ret == X86EMUL_PROPAGATE_FAULT) {
2337 /* FIXME: need to provide precise fault address */
2338 kvm_inject_page_fault(ctxt->vcpu, new_tss_base, err);
2339 return ret;
2340 }
2341 }
2342
2343 return load_state_from_tss32(ctxt, ops, &tss_seg);
2344}
2345
2346static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
2347 struct x86_emulate_ops *ops,
2348 u16 tss_selector, int reason,
2349 bool has_error_code, u32 error_code)
2350{
2351 struct desc_struct curr_tss_desc, next_tss_desc;
2352 int ret;
2353 u16 old_tss_sel = ops->get_segment_selector(VCPU_SREG_TR, ctxt->vcpu);
2354 ulong old_tss_base =
2355 get_cached_descriptor_base(ctxt, ops, VCPU_SREG_TR);
2356 u32 desc_limit;
2357
2358 /* FIXME: old_tss_base == ~0 ? */
2359
2360 ret = read_segment_descriptor(ctxt, ops, tss_selector, &next_tss_desc);
2361 if (ret != X86EMUL_CONTINUE)
2362 return ret;
2363 ret = read_segment_descriptor(ctxt, ops, old_tss_sel, &curr_tss_desc);
2364 if (ret != X86EMUL_CONTINUE)
2365 return ret;
2366
2367 /* FIXME: check that next_tss_desc is tss */
2368
2369 if (reason != TASK_SWITCH_IRET) {
2370 if ((tss_selector & 3) > next_tss_desc.dpl ||
2371 ops->cpl(ctxt->vcpu) > next_tss_desc.dpl) {
2372 kvm_inject_gp(ctxt->vcpu, 0);
2373 return X86EMUL_PROPAGATE_FAULT;
2374 }
2375 }
2376
2377 desc_limit = desc_limit_scaled(&next_tss_desc);
2378 if (!next_tss_desc.p ||
2379 ((desc_limit < 0x67 && (next_tss_desc.type & 8)) ||
2380 desc_limit < 0x2b)) {
2381 kvm_queue_exception_e(ctxt->vcpu, TS_VECTOR,
2382 tss_selector & 0xfffc);
2383 return X86EMUL_PROPAGATE_FAULT;
2384 }
2385
2386 if (reason == TASK_SWITCH_IRET || reason == TASK_SWITCH_JMP) {
2387 curr_tss_desc.type &= ~(1 << 1); /* clear busy flag */
2388 write_segment_descriptor(ctxt, ops, old_tss_sel,
2389 &curr_tss_desc);
2390 }
2391
2392 if (reason == TASK_SWITCH_IRET)
2393 ctxt->eflags = ctxt->eflags & ~X86_EFLAGS_NT;
2394
2395 /* set back link to prev task only if NT bit is set in eflags
2396 note that old_tss_sel is not used afetr this point */
2397 if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE)
2398 old_tss_sel = 0xffff;
2399
2400 if (next_tss_desc.type & 8)
2401 ret = task_switch_32(ctxt, ops, tss_selector, old_tss_sel,
2402 old_tss_base, &next_tss_desc);
2403 else
2404 ret = task_switch_16(ctxt, ops, tss_selector, old_tss_sel,
2405 old_tss_base, &next_tss_desc);
2406 if (ret != X86EMUL_CONTINUE)
2407 return ret;
2408
2409 if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE)
2410 ctxt->eflags = ctxt->eflags | X86_EFLAGS_NT;
2411
2412 if (reason != TASK_SWITCH_IRET) {
2413 next_tss_desc.type |= (1 << 1); /* set busy flag */
2414 write_segment_descriptor(ctxt, ops, tss_selector,
2415 &next_tss_desc);
2416 }
2417
2418 ops->set_cr(0, ops->get_cr(0, ctxt->vcpu) | X86_CR0_TS, ctxt->vcpu);
2419 ops->set_cached_descriptor(&next_tss_desc, VCPU_SREG_TR, ctxt->vcpu);
2420 ops->set_segment_selector(tss_selector, VCPU_SREG_TR, ctxt->vcpu);
2421
2422 if (has_error_code) {
2423 struct decode_cache *c = &ctxt->decode;
2424
2425 c->op_bytes = c->ad_bytes = (next_tss_desc.type & 8) ? 4 : 2;
2426 c->lock_prefix = 0;
2427 c->src.val = (unsigned long) error_code;
2428 emulate_push(ctxt);
2429 }
2430
2431 return ret;
2432}
2433
2434int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
2435 struct x86_emulate_ops *ops,
2436 u16 tss_selector, int reason,
2437 bool has_error_code, u32 error_code)
2438{
2439 struct decode_cache *c = &ctxt->decode;
2440 int rc;
2441
2442 memset(c, 0, sizeof(struct decode_cache));
2443 c->eip = ctxt->eip;
2444 memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
2445 c->dst.type = OP_NONE;
2446
2447 rc = emulator_do_task_switch(ctxt, ops, tss_selector, reason,
2448 has_error_code, error_code);
2449
2450 if (rc == X86EMUL_CONTINUE) {
2451 memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs);
2452 kvm_rip_write(ctxt->vcpu, c->eip);
2453 rc = writeback(ctxt, ops);
2454 }
2455
2456 return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
2457}
2458
2459static void string_addr_inc(struct x86_emulate_ctxt *ctxt, unsigned long base,
2460 int reg, struct operand *op)
2461{
2462 struct decode_cache *c = &ctxt->decode;
2463 int df = (ctxt->eflags & EFLG_DF) ? -1 : 1;
2464
2465 register_address_increment(c, &c->regs[reg], df * op->bytes);
2466 op->ptr = (unsigned long *)register_address(c, base, c->regs[reg]);
2467}
2468
1810int 2469int
1811x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) 2470x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1812{ 2471{
1813 unsigned long memop = 0;
1814 u64 msr_data; 2472 u64 msr_data;
1815 unsigned long saved_eip = 0;
1816 struct decode_cache *c = &ctxt->decode; 2473 struct decode_cache *c = &ctxt->decode;
1817 unsigned int port; 2474 int rc = X86EMUL_CONTINUE;
1818 int io_dir_in; 2475 int saved_dst_type = c->dst.type;
1819 int rc = 0;
1820 2476
1821 ctxt->interruptibility = 0; 2477 ctxt->interruptibility = 0;
1822 2478
@@ -1826,26 +2482,30 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1826 */ 2482 */
1827 2483
1828 memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs); 2484 memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
1829 saved_eip = c->eip; 2485
2486 if (ctxt->mode == X86EMUL_MODE_PROT64 && (c->d & No64)) {
2487 kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
2488 goto done;
2489 }
1830 2490
1831 /* LOCK prefix is allowed only with some instructions */ 2491 /* LOCK prefix is allowed only with some instructions */
1832 if (c->lock_prefix && !(c->d & Lock)) { 2492 if (c->lock_prefix && (!(c->d & Lock) || c->dst.type != OP_MEM)) {
1833 kvm_queue_exception(ctxt->vcpu, UD_VECTOR); 2493 kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
1834 goto done; 2494 goto done;
1835 } 2495 }
1836 2496
1837 /* Privileged instruction can be executed only in CPL=0 */ 2497 /* Privileged instruction can be executed only in CPL=0 */
1838 if ((c->d & Priv) && kvm_x86_ops->get_cpl(ctxt->vcpu)) { 2498 if ((c->d & Priv) && ops->cpl(ctxt->vcpu)) {
1839 kvm_inject_gp(ctxt->vcpu, 0); 2499 kvm_inject_gp(ctxt->vcpu, 0);
1840 goto done; 2500 goto done;
1841 } 2501 }
1842 2502
1843 if (((c->d & ModRM) && (c->modrm_mod != 3)) || (c->d & MemAbs))
1844 memop = c->modrm_ea;
1845
1846 if (c->rep_prefix && (c->d & String)) { 2503 if (c->rep_prefix && (c->d & String)) {
2504 ctxt->restart = true;
1847 /* All REP prefixes have the same first termination condition */ 2505 /* All REP prefixes have the same first termination condition */
1848 if (c->regs[VCPU_REGS_RCX] == 0) { 2506 if (address_mask(c, c->regs[VCPU_REGS_RCX]) == 0) {
2507 string_done:
2508 ctxt->restart = false;
1849 kvm_rip_write(ctxt->vcpu, c->eip); 2509 kvm_rip_write(ctxt->vcpu, c->eip);
1850 goto done; 2510 goto done;
1851 } 2511 }
@@ -1857,25 +2517,18 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1857 * - if REPNE/REPNZ and ZF = 1 then done 2517 * - if REPNE/REPNZ and ZF = 1 then done
1858 */ 2518 */
1859 if ((c->b == 0xa6) || (c->b == 0xa7) || 2519 if ((c->b == 0xa6) || (c->b == 0xa7) ||
1860 (c->b == 0xae) || (c->b == 0xaf)) { 2520 (c->b == 0xae) || (c->b == 0xaf)) {
1861 if ((c->rep_prefix == REPE_PREFIX) && 2521 if ((c->rep_prefix == REPE_PREFIX) &&
1862 ((ctxt->eflags & EFLG_ZF) == 0)) { 2522 ((ctxt->eflags & EFLG_ZF) == 0))
1863 kvm_rip_write(ctxt->vcpu, c->eip); 2523 goto string_done;
1864 goto done;
1865 }
1866 if ((c->rep_prefix == REPNE_PREFIX) && 2524 if ((c->rep_prefix == REPNE_PREFIX) &&
1867 ((ctxt->eflags & EFLG_ZF) == EFLG_ZF)) { 2525 ((ctxt->eflags & EFLG_ZF) == EFLG_ZF))
1868 kvm_rip_write(ctxt->vcpu, c->eip); 2526 goto string_done;
1869 goto done;
1870 }
1871 } 2527 }
1872 c->regs[VCPU_REGS_RCX]--; 2528 c->eip = ctxt->eip;
1873 c->eip = kvm_rip_read(ctxt->vcpu);
1874 } 2529 }
1875 2530
1876 if (c->src.type == OP_MEM) { 2531 if (c->src.type == OP_MEM) {
1877 c->src.ptr = (unsigned long *)memop;
1878 c->src.val = 0;
1879 rc = ops->read_emulated((unsigned long)c->src.ptr, 2532 rc = ops->read_emulated((unsigned long)c->src.ptr,
1880 &c->src.val, 2533 &c->src.val,
1881 c->src.bytes, 2534 c->src.bytes,
@@ -1885,29 +2538,25 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1885 c->src.orig_val = c->src.val; 2538 c->src.orig_val = c->src.val;
1886 } 2539 }
1887 2540
2541 if (c->src2.type == OP_MEM) {
2542 rc = ops->read_emulated((unsigned long)c->src2.ptr,
2543 &c->src2.val,
2544 c->src2.bytes,
2545 ctxt->vcpu);
2546 if (rc != X86EMUL_CONTINUE)
2547 goto done;
2548 }
2549
1888 if ((c->d & DstMask) == ImplicitOps) 2550 if ((c->d & DstMask) == ImplicitOps)
1889 goto special_insn; 2551 goto special_insn;
1890 2552
1891 2553
1892 if (c->dst.type == OP_MEM) { 2554 if ((c->dst.type == OP_MEM) && !(c->d & Mov)) {
1893 c->dst.ptr = (unsigned long *)memop; 2555 /* optimisation - avoid slow emulated read if Mov */
1894 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; 2556 rc = ops->read_emulated((unsigned long)c->dst.ptr, &c->dst.val,
1895 c->dst.val = 0; 2557 c->dst.bytes, ctxt->vcpu);
1896 if (c->d & BitOp) { 2558 if (rc != X86EMUL_CONTINUE)
1897 unsigned long mask = ~(c->dst.bytes * 8 - 1); 2559 goto done;
1898
1899 c->dst.ptr = (void *)c->dst.ptr +
1900 (c->src.val & mask) / 8;
1901 }
1902 if (!(c->d & Mov)) {
1903 /* optimisation - avoid slow emulated read */
1904 rc = ops->read_emulated((unsigned long)c->dst.ptr,
1905 &c->dst.val,
1906 c->dst.bytes,
1907 ctxt->vcpu);
1908 if (rc != X86EMUL_CONTINUE)
1909 goto done;
1910 }
1911 } 2560 }
1912 c->dst.orig_val = c->dst.val; 2561 c->dst.orig_val = c->dst.val;
1913 2562
@@ -1926,7 +2575,7 @@ special_insn:
1926 break; 2575 break;
1927 case 0x07: /* pop es */ 2576 case 0x07: /* pop es */
1928 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_ES); 2577 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_ES);
1929 if (rc != 0) 2578 if (rc != X86EMUL_CONTINUE)
1930 goto done; 2579 goto done;
1931 break; 2580 break;
1932 case 0x08 ... 0x0d: 2581 case 0x08 ... 0x0d:
@@ -1945,7 +2594,7 @@ special_insn:
1945 break; 2594 break;
1946 case 0x17: /* pop ss */ 2595 case 0x17: /* pop ss */
1947 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_SS); 2596 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_SS);
1948 if (rc != 0) 2597 if (rc != X86EMUL_CONTINUE)
1949 goto done; 2598 goto done;
1950 break; 2599 break;
1951 case 0x18 ... 0x1d: 2600 case 0x18 ... 0x1d:
@@ -1957,7 +2606,7 @@ special_insn:
1957 break; 2606 break;
1958 case 0x1f: /* pop ds */ 2607 case 0x1f: /* pop ds */
1959 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_DS); 2608 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_DS);
1960 if (rc != 0) 2609 if (rc != X86EMUL_CONTINUE)
1961 goto done; 2610 goto done;
1962 break; 2611 break;
1963 case 0x20 ... 0x25: 2612 case 0x20 ... 0x25:
@@ -1988,7 +2637,7 @@ special_insn:
1988 case 0x58 ... 0x5f: /* pop reg */ 2637 case 0x58 ... 0x5f: /* pop reg */
1989 pop_instruction: 2638 pop_instruction:
1990 rc = emulate_pop(ctxt, ops, &c->dst.val, c->op_bytes); 2639 rc = emulate_pop(ctxt, ops, &c->dst.val, c->op_bytes);
1991 if (rc != 0) 2640 if (rc != X86EMUL_CONTINUE)
1992 goto done; 2641 goto done;
1993 break; 2642 break;
1994 case 0x60: /* pusha */ 2643 case 0x60: /* pusha */
@@ -1996,7 +2645,7 @@ special_insn:
1996 break; 2645 break;
1997 case 0x61: /* popa */ 2646 case 0x61: /* popa */
1998 rc = emulate_popa(ctxt, ops); 2647 rc = emulate_popa(ctxt, ops);
1999 if (rc != 0) 2648 if (rc != X86EMUL_CONTINUE)
2000 goto done; 2649 goto done;
2001 break; 2650 break;
2002 case 0x63: /* movsxd */ 2651 case 0x63: /* movsxd */
@@ -2010,47 +2659,29 @@ special_insn:
2010 break; 2659 break;
2011 case 0x6c: /* insb */ 2660 case 0x6c: /* insb */
2012 case 0x6d: /* insw/insd */ 2661 case 0x6d: /* insw/insd */
2662 c->dst.bytes = min(c->dst.bytes, 4u);
2013 if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX], 2663 if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX],
2014 (c->d & ByteOp) ? 1 : c->op_bytes)) { 2664 c->dst.bytes)) {
2015 kvm_inject_gp(ctxt->vcpu, 0); 2665 kvm_inject_gp(ctxt->vcpu, 0);
2016 goto done; 2666 goto done;
2017 } 2667 }
2018 if (kvm_emulate_pio_string(ctxt->vcpu, 2668 if (!pio_in_emulated(ctxt, ops, c->dst.bytes,
2019 1, 2669 c->regs[VCPU_REGS_RDX], &c->dst.val))
2020 (c->d & ByteOp) ? 1 : c->op_bytes, 2670 goto done; /* IO is needed, skip writeback */
2021 c->rep_prefix ? 2671 break;
2022 address_mask(c, c->regs[VCPU_REGS_RCX]) : 1,
2023 (ctxt->eflags & EFLG_DF),
2024 register_address(c, es_base(ctxt),
2025 c->regs[VCPU_REGS_RDI]),
2026 c->rep_prefix,
2027 c->regs[VCPU_REGS_RDX]) == 0) {
2028 c->eip = saved_eip;
2029 return -1;
2030 }
2031 return 0;
2032 case 0x6e: /* outsb */ 2672 case 0x6e: /* outsb */
2033 case 0x6f: /* outsw/outsd */ 2673 case 0x6f: /* outsw/outsd */
2674 c->src.bytes = min(c->src.bytes, 4u);
2034 if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX], 2675 if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX],
2035 (c->d & ByteOp) ? 1 : c->op_bytes)) { 2676 c->src.bytes)) {
2036 kvm_inject_gp(ctxt->vcpu, 0); 2677 kvm_inject_gp(ctxt->vcpu, 0);
2037 goto done; 2678 goto done;
2038 } 2679 }
2039 if (kvm_emulate_pio_string(ctxt->vcpu, 2680 ops->pio_out_emulated(c->src.bytes, c->regs[VCPU_REGS_RDX],
2040 0, 2681 &c->src.val, 1, ctxt->vcpu);
2041 (c->d & ByteOp) ? 1 : c->op_bytes, 2682
2042 c->rep_prefix ? 2683 c->dst.type = OP_NONE; /* nothing to writeback */
2043 address_mask(c, c->regs[VCPU_REGS_RCX]) : 1, 2684 break;
2044 (ctxt->eflags & EFLG_DF),
2045 register_address(c,
2046 seg_override_base(ctxt, c),
2047 c->regs[VCPU_REGS_RSI]),
2048 c->rep_prefix,
2049 c->regs[VCPU_REGS_RDX]) == 0) {
2050 c->eip = saved_eip;
2051 return -1;
2052 }
2053 return 0;
2054 case 0x70 ... 0x7f: /* jcc (short) */ 2685 case 0x70 ... 0x7f: /* jcc (short) */
2055 if (test_cc(c->b, ctxt->eflags)) 2686 if (test_cc(c->b, ctxt->eflags))
2056 jmp_rel(c, c->src.val); 2687 jmp_rel(c, c->src.val);
@@ -2107,12 +2738,11 @@ special_insn:
2107 case 0x8c: { /* mov r/m, sreg */ 2738 case 0x8c: { /* mov r/m, sreg */
2108 struct kvm_segment segreg; 2739 struct kvm_segment segreg;
2109 2740
2110 if (c->modrm_reg <= 5) 2741 if (c->modrm_reg <= VCPU_SREG_GS)
2111 kvm_get_segment(ctxt->vcpu, &segreg, c->modrm_reg); 2742 kvm_get_segment(ctxt->vcpu, &segreg, c->modrm_reg);
2112 else { 2743 else {
2113 printk(KERN_INFO "0x8c: Invalid segreg in modrm byte 0x%02x\n", 2744 kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
2114 c->modrm); 2745 goto done;
2115 goto cannot_emulate;
2116 } 2746 }
2117 c->dst.val = segreg.selector; 2747 c->dst.val = segreg.selector;
2118 break; 2748 break;
@@ -2132,16 +2762,16 @@ special_insn:
2132 } 2762 }
2133 2763
2134 if (c->modrm_reg == VCPU_SREG_SS) 2764 if (c->modrm_reg == VCPU_SREG_SS)
2135 toggle_interruptibility(ctxt, X86_SHADOW_INT_MOV_SS); 2765 toggle_interruptibility(ctxt, KVM_X86_SHADOW_INT_MOV_SS);
2136 2766
2137 rc = kvm_load_segment_descriptor(ctxt->vcpu, sel, c->modrm_reg); 2767 rc = load_segment_descriptor(ctxt, ops, sel, c->modrm_reg);
2138 2768
2139 c->dst.type = OP_NONE; /* Disable writeback. */ 2769 c->dst.type = OP_NONE; /* Disable writeback. */
2140 break; 2770 break;
2141 } 2771 }
2142 case 0x8f: /* pop (sole member of Grp1a) */ 2772 case 0x8f: /* pop (sole member of Grp1a) */
2143 rc = emulate_grp1a(ctxt, ops); 2773 rc = emulate_grp1a(ctxt, ops);
2144 if (rc != 0) 2774 if (rc != X86EMUL_CONTINUE)
2145 goto done; 2775 goto done;
2146 break; 2776 break;
2147 case 0x90: /* nop / xchg r8,rax */ 2777 case 0x90: /* nop / xchg r8,rax */
@@ -2175,89 +2805,16 @@ special_insn:
2175 c->dst.val = (unsigned long)c->regs[VCPU_REGS_RAX]; 2805 c->dst.val = (unsigned long)c->regs[VCPU_REGS_RAX];
2176 break; 2806 break;
2177 case 0xa4 ... 0xa5: /* movs */ 2807 case 0xa4 ... 0xa5: /* movs */
2178 c->dst.type = OP_MEM; 2808 goto mov;
2179 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
2180 c->dst.ptr = (unsigned long *)register_address(c,
2181 es_base(ctxt),
2182 c->regs[VCPU_REGS_RDI]);
2183 rc = ops->read_emulated(register_address(c,
2184 seg_override_base(ctxt, c),
2185 c->regs[VCPU_REGS_RSI]),
2186 &c->dst.val,
2187 c->dst.bytes, ctxt->vcpu);
2188 if (rc != X86EMUL_CONTINUE)
2189 goto done;
2190 register_address_increment(c, &c->regs[VCPU_REGS_RSI],
2191 (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
2192 : c->dst.bytes);
2193 register_address_increment(c, &c->regs[VCPU_REGS_RDI],
2194 (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
2195 : c->dst.bytes);
2196 break;
2197 case 0xa6 ... 0xa7: /* cmps */ 2809 case 0xa6 ... 0xa7: /* cmps */
2198 c->src.type = OP_NONE; /* Disable writeback. */
2199 c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
2200 c->src.ptr = (unsigned long *)register_address(c,
2201 seg_override_base(ctxt, c),
2202 c->regs[VCPU_REGS_RSI]);
2203 rc = ops->read_emulated((unsigned long)c->src.ptr,
2204 &c->src.val,
2205 c->src.bytes,
2206 ctxt->vcpu);
2207 if (rc != X86EMUL_CONTINUE)
2208 goto done;
2209
2210 c->dst.type = OP_NONE; /* Disable writeback. */ 2810 c->dst.type = OP_NONE; /* Disable writeback. */
2211 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
2212 c->dst.ptr = (unsigned long *)register_address(c,
2213 es_base(ctxt),
2214 c->regs[VCPU_REGS_RDI]);
2215 rc = ops->read_emulated((unsigned long)c->dst.ptr,
2216 &c->dst.val,
2217 c->dst.bytes,
2218 ctxt->vcpu);
2219 if (rc != X86EMUL_CONTINUE)
2220 goto done;
2221
2222 DPRINTF("cmps: mem1=0x%p mem2=0x%p\n", c->src.ptr, c->dst.ptr); 2811 DPRINTF("cmps: mem1=0x%p mem2=0x%p\n", c->src.ptr, c->dst.ptr);
2223 2812 goto cmp;
2224 emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags);
2225
2226 register_address_increment(c, &c->regs[VCPU_REGS_RSI],
2227 (ctxt->eflags & EFLG_DF) ? -c->src.bytes
2228 : c->src.bytes);
2229 register_address_increment(c, &c->regs[VCPU_REGS_RDI],
2230 (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
2231 : c->dst.bytes);
2232
2233 break;
2234 case 0xaa ... 0xab: /* stos */ 2813 case 0xaa ... 0xab: /* stos */
2235 c->dst.type = OP_MEM;
2236 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
2237 c->dst.ptr = (unsigned long *)register_address(c,
2238 es_base(ctxt),
2239 c->regs[VCPU_REGS_RDI]);
2240 c->dst.val = c->regs[VCPU_REGS_RAX]; 2814 c->dst.val = c->regs[VCPU_REGS_RAX];
2241 register_address_increment(c, &c->regs[VCPU_REGS_RDI],
2242 (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
2243 : c->dst.bytes);
2244 break; 2815 break;
2245 case 0xac ... 0xad: /* lods */ 2816 case 0xac ... 0xad: /* lods */
2246 c->dst.type = OP_REG; 2817 goto mov;
2247 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
2248 c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX];
2249 rc = ops->read_emulated(register_address(c,
2250 seg_override_base(ctxt, c),
2251 c->regs[VCPU_REGS_RSI]),
2252 &c->dst.val,
2253 c->dst.bytes,
2254 ctxt->vcpu);
2255 if (rc != X86EMUL_CONTINUE)
2256 goto done;
2257 register_address_increment(c, &c->regs[VCPU_REGS_RSI],
2258 (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
2259 : c->dst.bytes);
2260 break;
2261 case 0xae ... 0xaf: /* scas */ 2818 case 0xae ... 0xaf: /* scas */
2262 DPRINTF("Urk! I don't handle SCAS.\n"); 2819 DPRINTF("Urk! I don't handle SCAS.\n");
2263 goto cannot_emulate; 2820 goto cannot_emulate;
@@ -2277,7 +2834,7 @@ special_insn:
2277 break; 2834 break;
2278 case 0xcb: /* ret far */ 2835 case 0xcb: /* ret far */
2279 rc = emulate_ret_far(ctxt, ops); 2836 rc = emulate_ret_far(ctxt, ops);
2280 if (rc) 2837 if (rc != X86EMUL_CONTINUE)
2281 goto done; 2838 goto done;
2282 break; 2839 break;
2283 case 0xd0 ... 0xd1: /* Grp2 */ 2840 case 0xd0 ... 0xd1: /* Grp2 */
@@ -2290,14 +2847,10 @@ special_insn:
2290 break; 2847 break;
2291 case 0xe4: /* inb */ 2848 case 0xe4: /* inb */
2292 case 0xe5: /* in */ 2849 case 0xe5: /* in */
2293 port = c->src.val; 2850 goto do_io_in;
2294 io_dir_in = 1;
2295 goto do_io;
2296 case 0xe6: /* outb */ 2851 case 0xe6: /* outb */
2297 case 0xe7: /* out */ 2852 case 0xe7: /* out */
2298 port = c->src.val; 2853 goto do_io_out;
2299 io_dir_in = 0;
2300 goto do_io;
2301 case 0xe8: /* call (near) */ { 2854 case 0xe8: /* call (near) */ {
2302 long int rel = c->src.val; 2855 long int rel = c->src.val;
2303 c->src.val = (unsigned long) c->eip; 2856 c->src.val = (unsigned long) c->eip;
@@ -2308,8 +2861,9 @@ special_insn:
2308 case 0xe9: /* jmp rel */ 2861 case 0xe9: /* jmp rel */
2309 goto jmp; 2862 goto jmp;
2310 case 0xea: /* jmp far */ 2863 case 0xea: /* jmp far */
2311 if (kvm_load_segment_descriptor(ctxt->vcpu, c->src2.val, 2864 jump_far:
2312 VCPU_SREG_CS)) 2865 if (load_segment_descriptor(ctxt, ops, c->src2.val,
2866 VCPU_SREG_CS))
2313 goto done; 2867 goto done;
2314 2868
2315 c->eip = c->src.val; 2869 c->eip = c->src.val;
@@ -2321,25 +2875,29 @@ special_insn:
2321 break; 2875 break;
2322 case 0xec: /* in al,dx */ 2876 case 0xec: /* in al,dx */
2323 case 0xed: /* in (e/r)ax,dx */ 2877 case 0xed: /* in (e/r)ax,dx */
2324 port = c->regs[VCPU_REGS_RDX]; 2878 c->src.val = c->regs[VCPU_REGS_RDX];
2325 io_dir_in = 1; 2879 do_io_in:
2326 goto do_io; 2880 c->dst.bytes = min(c->dst.bytes, 4u);
2881 if (!emulator_io_permited(ctxt, ops, c->src.val, c->dst.bytes)) {
2882 kvm_inject_gp(ctxt->vcpu, 0);
2883 goto done;
2884 }
2885 if (!pio_in_emulated(ctxt, ops, c->dst.bytes, c->src.val,
2886 &c->dst.val))
2887 goto done; /* IO is needed */
2888 break;
2327 case 0xee: /* out al,dx */ 2889 case 0xee: /* out al,dx */
2328 case 0xef: /* out (e/r)ax,dx */ 2890 case 0xef: /* out (e/r)ax,dx */
2329 port = c->regs[VCPU_REGS_RDX]; 2891 c->src.val = c->regs[VCPU_REGS_RDX];
2330 io_dir_in = 0; 2892 do_io_out:
2331 do_io: 2893 c->dst.bytes = min(c->dst.bytes, 4u);
2332 if (!emulator_io_permited(ctxt, ops, port, 2894 if (!emulator_io_permited(ctxt, ops, c->src.val, c->dst.bytes)) {
2333 (c->d & ByteOp) ? 1 : c->op_bytes)) {
2334 kvm_inject_gp(ctxt->vcpu, 0); 2895 kvm_inject_gp(ctxt->vcpu, 0);
2335 goto done; 2896 goto done;
2336 } 2897 }
2337 if (kvm_emulate_pio(ctxt->vcpu, io_dir_in, 2898 ops->pio_out_emulated(c->dst.bytes, c->src.val, &c->dst.val, 1,
2338 (c->d & ByteOp) ? 1 : c->op_bytes, 2899 ctxt->vcpu);
2339 port) != 0) { 2900 c->dst.type = OP_NONE; /* Disable writeback. */
2340 c->eip = saved_eip;
2341 goto cannot_emulate;
2342 }
2343 break; 2901 break;
2344 case 0xf4: /* hlt */ 2902 case 0xf4: /* hlt */
2345 ctxt->vcpu->arch.halt_request = 1; 2903 ctxt->vcpu->arch.halt_request = 1;
@@ -2350,16 +2908,15 @@ special_insn:
2350 c->dst.type = OP_NONE; /* Disable writeback. */ 2908 c->dst.type = OP_NONE; /* Disable writeback. */
2351 break; 2909 break;
2352 case 0xf6 ... 0xf7: /* Grp3 */ 2910 case 0xf6 ... 0xf7: /* Grp3 */
2353 rc = emulate_grp3(ctxt, ops); 2911 if (!emulate_grp3(ctxt, ops))
2354 if (rc != 0) 2912 goto cannot_emulate;
2355 goto done;
2356 break; 2913 break;
2357 case 0xf8: /* clc */ 2914 case 0xf8: /* clc */
2358 ctxt->eflags &= ~EFLG_CF; 2915 ctxt->eflags &= ~EFLG_CF;
2359 c->dst.type = OP_NONE; /* Disable writeback. */ 2916 c->dst.type = OP_NONE; /* Disable writeback. */
2360 break; 2917 break;
2361 case 0xfa: /* cli */ 2918 case 0xfa: /* cli */
2362 if (emulator_bad_iopl(ctxt)) 2919 if (emulator_bad_iopl(ctxt, ops))
2363 kvm_inject_gp(ctxt->vcpu, 0); 2920 kvm_inject_gp(ctxt->vcpu, 0);
2364 else { 2921 else {
2365 ctxt->eflags &= ~X86_EFLAGS_IF; 2922 ctxt->eflags &= ~X86_EFLAGS_IF;
@@ -2367,10 +2924,10 @@ special_insn:
2367 } 2924 }
2368 break; 2925 break;
2369 case 0xfb: /* sti */ 2926 case 0xfb: /* sti */
2370 if (emulator_bad_iopl(ctxt)) 2927 if (emulator_bad_iopl(ctxt, ops))
2371 kvm_inject_gp(ctxt->vcpu, 0); 2928 kvm_inject_gp(ctxt->vcpu, 0);
2372 else { 2929 else {
2373 toggle_interruptibility(ctxt, X86_SHADOW_INT_STI); 2930 toggle_interruptibility(ctxt, KVM_X86_SHADOW_INT_STI);
2374 ctxt->eflags |= X86_EFLAGS_IF; 2931 ctxt->eflags |= X86_EFLAGS_IF;
2375 c->dst.type = OP_NONE; /* Disable writeback. */ 2932 c->dst.type = OP_NONE; /* Disable writeback. */
2376 } 2933 }
@@ -2383,28 +2940,55 @@ special_insn:
2383 ctxt->eflags |= EFLG_DF; 2940 ctxt->eflags |= EFLG_DF;
2384 c->dst.type = OP_NONE; /* Disable writeback. */ 2941 c->dst.type = OP_NONE; /* Disable writeback. */
2385 break; 2942 break;
2386 case 0xfe ... 0xff: /* Grp4/Grp5 */ 2943 case 0xfe: /* Grp4 */
2944 grp45:
2387 rc = emulate_grp45(ctxt, ops); 2945 rc = emulate_grp45(ctxt, ops);
2388 if (rc != 0) 2946 if (rc != X86EMUL_CONTINUE)
2389 goto done; 2947 goto done;
2390 break; 2948 break;
2949 case 0xff: /* Grp5 */
2950 if (c->modrm_reg == 5)
2951 goto jump_far;
2952 goto grp45;
2391 } 2953 }
2392 2954
2393writeback: 2955writeback:
2394 rc = writeback(ctxt, ops); 2956 rc = writeback(ctxt, ops);
2395 if (rc != 0) 2957 if (rc != X86EMUL_CONTINUE)
2396 goto done; 2958 goto done;
2397 2959
2960 /*
2961 * restore dst type in case the decoding will be reused
2962 * (happens for string instruction )
2963 */
2964 c->dst.type = saved_dst_type;
2965
2966 if ((c->d & SrcMask) == SrcSI)
2967 string_addr_inc(ctxt, seg_override_base(ctxt, c), VCPU_REGS_RSI,
2968 &c->src);
2969
2970 if ((c->d & DstMask) == DstDI)
2971 string_addr_inc(ctxt, es_base(ctxt), VCPU_REGS_RDI, &c->dst);
2972
2973 if (c->rep_prefix && (c->d & String)) {
2974 struct read_cache *rc = &ctxt->decode.io_read;
2975 register_address_increment(c, &c->regs[VCPU_REGS_RCX], -1);
2976 /*
2977 * Re-enter guest when pio read ahead buffer is empty or,
2978 * if it is not used, after each 1024 iteration.
2979 */
2980 if ((rc->end == 0 && !(c->regs[VCPU_REGS_RCX] & 0x3ff)) ||
2981 (rc->end != 0 && rc->end == rc->pos))
2982 ctxt->restart = false;
2983 }
2984
2398 /* Commit shadow register state. */ 2985 /* Commit shadow register state. */
2399 memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs); 2986 memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs);
2400 kvm_rip_write(ctxt->vcpu, c->eip); 2987 kvm_rip_write(ctxt->vcpu, c->eip);
2988 ops->set_rflags(ctxt->vcpu, ctxt->eflags);
2401 2989
2402done: 2990done:
2403 if (rc == X86EMUL_UNHANDLEABLE) { 2991 return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
2404 c->eip = saved_eip;
2405 return -1;
2406 }
2407 return 0;
2408 2992
2409twobyte_insn: 2993twobyte_insn:
2410 switch (c->b) { 2994 switch (c->b) {
@@ -2418,18 +3002,18 @@ twobyte_insn:
2418 goto cannot_emulate; 3002 goto cannot_emulate;
2419 3003
2420 rc = kvm_fix_hypercall(ctxt->vcpu); 3004 rc = kvm_fix_hypercall(ctxt->vcpu);
2421 if (rc) 3005 if (rc != X86EMUL_CONTINUE)
2422 goto done; 3006 goto done;
2423 3007
2424 /* Let the processor re-execute the fixed hypercall */ 3008 /* Let the processor re-execute the fixed hypercall */
2425 c->eip = kvm_rip_read(ctxt->vcpu); 3009 c->eip = ctxt->eip;
2426 /* Disable writeback. */ 3010 /* Disable writeback. */
2427 c->dst.type = OP_NONE; 3011 c->dst.type = OP_NONE;
2428 break; 3012 break;
2429 case 2: /* lgdt */ 3013 case 2: /* lgdt */
2430 rc = read_descriptor(ctxt, ops, c->src.ptr, 3014 rc = read_descriptor(ctxt, ops, c->src.ptr,
2431 &size, &address, c->op_bytes); 3015 &size, &address, c->op_bytes);
2432 if (rc) 3016 if (rc != X86EMUL_CONTINUE)
2433 goto done; 3017 goto done;
2434 realmode_lgdt(ctxt->vcpu, size, address); 3018 realmode_lgdt(ctxt->vcpu, size, address);
2435 /* Disable writeback. */ 3019 /* Disable writeback. */
@@ -2440,7 +3024,7 @@ twobyte_insn:
2440 switch (c->modrm_rm) { 3024 switch (c->modrm_rm) {
2441 case 1: 3025 case 1:
2442 rc = kvm_fix_hypercall(ctxt->vcpu); 3026 rc = kvm_fix_hypercall(ctxt->vcpu);
2443 if (rc) 3027 if (rc != X86EMUL_CONTINUE)
2444 goto done; 3028 goto done;
2445 break; 3029 break;
2446 default: 3030 default:
@@ -2450,7 +3034,7 @@ twobyte_insn:
2450 rc = read_descriptor(ctxt, ops, c->src.ptr, 3034 rc = read_descriptor(ctxt, ops, c->src.ptr,
2451 &size, &address, 3035 &size, &address,
2452 c->op_bytes); 3036 c->op_bytes);
2453 if (rc) 3037 if (rc != X86EMUL_CONTINUE)
2454 goto done; 3038 goto done;
2455 realmode_lidt(ctxt->vcpu, size, address); 3039 realmode_lidt(ctxt->vcpu, size, address);
2456 } 3040 }
@@ -2459,15 +3043,18 @@ twobyte_insn:
2459 break; 3043 break;
2460 case 4: /* smsw */ 3044 case 4: /* smsw */
2461 c->dst.bytes = 2; 3045 c->dst.bytes = 2;
2462 c->dst.val = realmode_get_cr(ctxt->vcpu, 0); 3046 c->dst.val = ops->get_cr(0, ctxt->vcpu);
2463 break; 3047 break;
2464 case 6: /* lmsw */ 3048 case 6: /* lmsw */
2465 realmode_lmsw(ctxt->vcpu, (u16)c->src.val, 3049 ops->set_cr(0, (ops->get_cr(0, ctxt->vcpu) & ~0x0ful) |
2466 &ctxt->eflags); 3050 (c->src.val & 0x0f), ctxt->vcpu);
2467 c->dst.type = OP_NONE; 3051 c->dst.type = OP_NONE;
2468 break; 3052 break;
3053 case 5: /* not defined */
3054 kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
3055 goto done;
2469 case 7: /* invlpg*/ 3056 case 7: /* invlpg*/
2470 emulate_invlpg(ctxt->vcpu, memop); 3057 emulate_invlpg(ctxt->vcpu, c->modrm_ea);
2471 /* Disable writeback. */ 3058 /* Disable writeback. */
2472 c->dst.type = OP_NONE; 3059 c->dst.type = OP_NONE;
2473 break; 3060 break;
@@ -2493,54 +3080,54 @@ twobyte_insn:
2493 c->dst.type = OP_NONE; 3080 c->dst.type = OP_NONE;
2494 break; 3081 break;
2495 case 0x20: /* mov cr, reg */ 3082 case 0x20: /* mov cr, reg */
2496 if (c->modrm_mod != 3) 3083 switch (c->modrm_reg) {
2497 goto cannot_emulate; 3084 case 1:
2498 c->regs[c->modrm_rm] = 3085 case 5 ... 7:
2499 realmode_get_cr(ctxt->vcpu, c->modrm_reg); 3086 case 9 ... 15:
3087 kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
3088 goto done;
3089 }
3090 c->regs[c->modrm_rm] = ops->get_cr(c->modrm_reg, ctxt->vcpu);
2500 c->dst.type = OP_NONE; /* no writeback */ 3091 c->dst.type = OP_NONE; /* no writeback */
2501 break; 3092 break;
2502 case 0x21: /* mov from dr to reg */ 3093 case 0x21: /* mov from dr to reg */
2503 if (c->modrm_mod != 3) 3094 if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) &&
2504 goto cannot_emulate; 3095 (c->modrm_reg == 4 || c->modrm_reg == 5)) {
2505 rc = emulator_get_dr(ctxt, c->modrm_reg, &c->regs[c->modrm_rm]); 3096 kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
2506 if (rc) 3097 goto done;
2507 goto cannot_emulate; 3098 }
3099 emulator_get_dr(ctxt, c->modrm_reg, &c->regs[c->modrm_rm]);
2508 c->dst.type = OP_NONE; /* no writeback */ 3100 c->dst.type = OP_NONE; /* no writeback */
2509 break; 3101 break;
2510 case 0x22: /* mov reg, cr */ 3102 case 0x22: /* mov reg, cr */
2511 if (c->modrm_mod != 3) 3103 ops->set_cr(c->modrm_reg, c->modrm_val, ctxt->vcpu);
2512 goto cannot_emulate;
2513 realmode_set_cr(ctxt->vcpu,
2514 c->modrm_reg, c->modrm_val, &ctxt->eflags);
2515 c->dst.type = OP_NONE; 3104 c->dst.type = OP_NONE;
2516 break; 3105 break;
2517 case 0x23: /* mov from reg to dr */ 3106 case 0x23: /* mov from reg to dr */
2518 if (c->modrm_mod != 3) 3107 if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) &&
2519 goto cannot_emulate; 3108 (c->modrm_reg == 4 || c->modrm_reg == 5)) {
2520 rc = emulator_set_dr(ctxt, c->modrm_reg, 3109 kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
2521 c->regs[c->modrm_rm]); 3110 goto done;
2522 if (rc) 3111 }
2523 goto cannot_emulate; 3112 emulator_set_dr(ctxt, c->modrm_reg, c->regs[c->modrm_rm]);
2524 c->dst.type = OP_NONE; /* no writeback */ 3113 c->dst.type = OP_NONE; /* no writeback */
2525 break; 3114 break;
2526 case 0x30: 3115 case 0x30:
2527 /* wrmsr */ 3116 /* wrmsr */
2528 msr_data = (u32)c->regs[VCPU_REGS_RAX] 3117 msr_data = (u32)c->regs[VCPU_REGS_RAX]
2529 | ((u64)c->regs[VCPU_REGS_RDX] << 32); 3118 | ((u64)c->regs[VCPU_REGS_RDX] << 32);
2530 rc = kvm_set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data); 3119 if (kvm_set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data)) {
2531 if (rc) {
2532 kvm_inject_gp(ctxt->vcpu, 0); 3120 kvm_inject_gp(ctxt->vcpu, 0);
2533 c->eip = kvm_rip_read(ctxt->vcpu); 3121 goto done;
2534 } 3122 }
2535 rc = X86EMUL_CONTINUE; 3123 rc = X86EMUL_CONTINUE;
2536 c->dst.type = OP_NONE; 3124 c->dst.type = OP_NONE;
2537 break; 3125 break;
2538 case 0x32: 3126 case 0x32:
2539 /* rdmsr */ 3127 /* rdmsr */
2540 rc = kvm_get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data); 3128 if (kvm_get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data)) {
2541 if (rc) {
2542 kvm_inject_gp(ctxt->vcpu, 0); 3129 kvm_inject_gp(ctxt->vcpu, 0);
2543 c->eip = kvm_rip_read(ctxt->vcpu); 3130 goto done;
2544 } else { 3131 } else {
2545 c->regs[VCPU_REGS_RAX] = (u32)msr_data; 3132 c->regs[VCPU_REGS_RAX] = (u32)msr_data;
2546 c->regs[VCPU_REGS_RDX] = msr_data >> 32; 3133 c->regs[VCPU_REGS_RDX] = msr_data >> 32;
@@ -2577,7 +3164,7 @@ twobyte_insn:
2577 break; 3164 break;
2578 case 0xa1: /* pop fs */ 3165 case 0xa1: /* pop fs */
2579 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_FS); 3166 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_FS);
2580 if (rc != 0) 3167 if (rc != X86EMUL_CONTINUE)
2581 goto done; 3168 goto done;
2582 break; 3169 break;
2583 case 0xa3: 3170 case 0xa3:
@@ -2596,7 +3183,7 @@ twobyte_insn:
2596 break; 3183 break;
2597 case 0xa9: /* pop gs */ 3184 case 0xa9: /* pop gs */
2598 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_GS); 3185 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_GS);
2599 if (rc != 0) 3186 if (rc != X86EMUL_CONTINUE)
2600 goto done; 3187 goto done;
2601 break; 3188 break;
2602 case 0xab: 3189 case 0xab:
@@ -2668,16 +3255,14 @@ twobyte_insn:
2668 (u64) c->src.val; 3255 (u64) c->src.val;
2669 break; 3256 break;
2670 case 0xc7: /* Grp9 (cmpxchg8b) */ 3257 case 0xc7: /* Grp9 (cmpxchg8b) */
2671 rc = emulate_grp9(ctxt, ops, memop); 3258 rc = emulate_grp9(ctxt, ops);
2672 if (rc != 0) 3259 if (rc != X86EMUL_CONTINUE)
2673 goto done; 3260 goto done;
2674 c->dst.type = OP_NONE;
2675 break; 3261 break;
2676 } 3262 }
2677 goto writeback; 3263 goto writeback;
2678 3264
2679cannot_emulate: 3265cannot_emulate:
2680 DPRINTF("Cannot emulate %02x\n", c->b); 3266 DPRINTF("Cannot emulate %02x\n", c->b);
2681 c->eip = saved_eip;
2682 return -1; 3267 return -1;
2683} 3268}
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index a790fa128a9f..93825ff3338f 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -33,6 +33,29 @@
33#include <linux/kvm_host.h> 33#include <linux/kvm_host.h>
34#include "trace.h" 34#include "trace.h"
35 35
36static void pic_lock(struct kvm_pic *s)
37 __acquires(&s->lock)
38{
39 raw_spin_lock(&s->lock);
40}
41
42static void pic_unlock(struct kvm_pic *s)
43 __releases(&s->lock)
44{
45 bool wakeup = s->wakeup_needed;
46 struct kvm_vcpu *vcpu;
47
48 s->wakeup_needed = false;
49
50 raw_spin_unlock(&s->lock);
51
52 if (wakeup) {
53 vcpu = s->kvm->bsp_vcpu;
54 if (vcpu)
55 kvm_vcpu_kick(vcpu);
56 }
57}
58
36static void pic_clear_isr(struct kvm_kpic_state *s, int irq) 59static void pic_clear_isr(struct kvm_kpic_state *s, int irq)
37{ 60{
38 s->isr &= ~(1 << irq); 61 s->isr &= ~(1 << irq);
@@ -45,19 +68,19 @@ static void pic_clear_isr(struct kvm_kpic_state *s, int irq)
45 * Other interrupt may be delivered to PIC while lock is dropped but 68 * Other interrupt may be delivered to PIC while lock is dropped but
46 * it should be safe since PIC state is already updated at this stage. 69 * it should be safe since PIC state is already updated at this stage.
47 */ 70 */
48 raw_spin_unlock(&s->pics_state->lock); 71 pic_unlock(s->pics_state);
49 kvm_notify_acked_irq(s->pics_state->kvm, SELECT_PIC(irq), irq); 72 kvm_notify_acked_irq(s->pics_state->kvm, SELECT_PIC(irq), irq);
50 raw_spin_lock(&s->pics_state->lock); 73 pic_lock(s->pics_state);
51} 74}
52 75
53void kvm_pic_clear_isr_ack(struct kvm *kvm) 76void kvm_pic_clear_isr_ack(struct kvm *kvm)
54{ 77{
55 struct kvm_pic *s = pic_irqchip(kvm); 78 struct kvm_pic *s = pic_irqchip(kvm);
56 79
57 raw_spin_lock(&s->lock); 80 pic_lock(s);
58 s->pics[0].isr_ack = 0xff; 81 s->pics[0].isr_ack = 0xff;
59 s->pics[1].isr_ack = 0xff; 82 s->pics[1].isr_ack = 0xff;
60 raw_spin_unlock(&s->lock); 83 pic_unlock(s);
61} 84}
62 85
63/* 86/*
@@ -158,9 +181,9 @@ static void pic_update_irq(struct kvm_pic *s)
158 181
159void kvm_pic_update_irq(struct kvm_pic *s) 182void kvm_pic_update_irq(struct kvm_pic *s)
160{ 183{
161 raw_spin_lock(&s->lock); 184 pic_lock(s);
162 pic_update_irq(s); 185 pic_update_irq(s);
163 raw_spin_unlock(&s->lock); 186 pic_unlock(s);
164} 187}
165 188
166int kvm_pic_set_irq(void *opaque, int irq, int level) 189int kvm_pic_set_irq(void *opaque, int irq, int level)
@@ -168,14 +191,14 @@ int kvm_pic_set_irq(void *opaque, int irq, int level)
168 struct kvm_pic *s = opaque; 191 struct kvm_pic *s = opaque;
169 int ret = -1; 192 int ret = -1;
170 193
171 raw_spin_lock(&s->lock); 194 pic_lock(s);
172 if (irq >= 0 && irq < PIC_NUM_PINS) { 195 if (irq >= 0 && irq < PIC_NUM_PINS) {
173 ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, level); 196 ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, level);
174 pic_update_irq(s); 197 pic_update_irq(s);
175 trace_kvm_pic_set_irq(irq >> 3, irq & 7, s->pics[irq >> 3].elcr, 198 trace_kvm_pic_set_irq(irq >> 3, irq & 7, s->pics[irq >> 3].elcr,
176 s->pics[irq >> 3].imr, ret == 0); 199 s->pics[irq >> 3].imr, ret == 0);
177 } 200 }
178 raw_spin_unlock(&s->lock); 201 pic_unlock(s);
179 202
180 return ret; 203 return ret;
181} 204}
@@ -205,7 +228,7 @@ int kvm_pic_read_irq(struct kvm *kvm)
205 int irq, irq2, intno; 228 int irq, irq2, intno;
206 struct kvm_pic *s = pic_irqchip(kvm); 229 struct kvm_pic *s = pic_irqchip(kvm);
207 230
208 raw_spin_lock(&s->lock); 231 pic_lock(s);
209 irq = pic_get_irq(&s->pics[0]); 232 irq = pic_get_irq(&s->pics[0]);
210 if (irq >= 0) { 233 if (irq >= 0) {
211 pic_intack(&s->pics[0], irq); 234 pic_intack(&s->pics[0], irq);
@@ -230,7 +253,7 @@ int kvm_pic_read_irq(struct kvm *kvm)
230 intno = s->pics[0].irq_base + irq; 253 intno = s->pics[0].irq_base + irq;
231 } 254 }
232 pic_update_irq(s); 255 pic_update_irq(s);
233 raw_spin_unlock(&s->lock); 256 pic_unlock(s);
234 257
235 return intno; 258 return intno;
236} 259}
@@ -444,7 +467,7 @@ static int picdev_write(struct kvm_io_device *this,
444 printk(KERN_ERR "PIC: non byte write\n"); 467 printk(KERN_ERR "PIC: non byte write\n");
445 return 0; 468 return 0;
446 } 469 }
447 raw_spin_lock(&s->lock); 470 pic_lock(s);
448 switch (addr) { 471 switch (addr) {
449 case 0x20: 472 case 0x20:
450 case 0x21: 473 case 0x21:
@@ -457,7 +480,7 @@ static int picdev_write(struct kvm_io_device *this,
457 elcr_ioport_write(&s->pics[addr & 1], addr, data); 480 elcr_ioport_write(&s->pics[addr & 1], addr, data);
458 break; 481 break;
459 } 482 }
460 raw_spin_unlock(&s->lock); 483 pic_unlock(s);
461 return 0; 484 return 0;
462} 485}
463 486
@@ -474,7 +497,7 @@ static int picdev_read(struct kvm_io_device *this,
474 printk(KERN_ERR "PIC: non byte read\n"); 497 printk(KERN_ERR "PIC: non byte read\n");
475 return 0; 498 return 0;
476 } 499 }
477 raw_spin_lock(&s->lock); 500 pic_lock(s);
478 switch (addr) { 501 switch (addr) {
479 case 0x20: 502 case 0x20:
480 case 0x21: 503 case 0x21:
@@ -488,7 +511,7 @@ static int picdev_read(struct kvm_io_device *this,
488 break; 511 break;
489 } 512 }
490 *(unsigned char *)val = data; 513 *(unsigned char *)val = data;
491 raw_spin_unlock(&s->lock); 514 pic_unlock(s);
492 return 0; 515 return 0;
493} 516}
494 517
@@ -505,7 +528,7 @@ static void pic_irq_request(void *opaque, int level)
505 s->output = level; 528 s->output = level;
506 if (vcpu && level && (s->pics[0].isr_ack & (1 << irq))) { 529 if (vcpu && level && (s->pics[0].isr_ack & (1 << irq))) {
507 s->pics[0].isr_ack &= ~(1 << irq); 530 s->pics[0].isr_ack &= ~(1 << irq);
508 kvm_vcpu_kick(vcpu); 531 s->wakeup_needed = true;
509 } 532 }
510} 533}
511 534
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index 34b15915754d..cd1f362f413d 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -63,6 +63,7 @@ struct kvm_kpic_state {
63 63
64struct kvm_pic { 64struct kvm_pic {
65 raw_spinlock_t lock; 65 raw_spinlock_t lock;
66 bool wakeup_needed;
66 unsigned pending_acks; 67 unsigned pending_acks;
67 struct kvm *kvm; 68 struct kvm *kvm;
68 struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */ 69 struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */
diff --git a/arch/x86/kvm/kvm_timer.h b/arch/x86/kvm/kvm_timer.h
index 55c7524dda54..64bc6ea78d90 100644
--- a/arch/x86/kvm/kvm_timer.h
+++ b/arch/x86/kvm/kvm_timer.h
@@ -10,9 +10,7 @@ struct kvm_timer {
10}; 10};
11 11
12struct kvm_timer_ops { 12struct kvm_timer_ops {
13 bool (*is_periodic)(struct kvm_timer *); 13 bool (*is_periodic)(struct kvm_timer *);
14}; 14};
15 15
16
17enum hrtimer_restart kvm_timer_fn(struct hrtimer *data); 16enum hrtimer_restart kvm_timer_fn(struct hrtimer *data);
18
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 19a8906bcaa2..81563e76e28f 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -148,7 +148,6 @@ module_param(oos_shadow, bool, 0644);
148 148
149#include <trace/events/kvm.h> 149#include <trace/events/kvm.h>
150 150
151#undef TRACE_INCLUDE_FILE
152#define CREATE_TRACE_POINTS 151#define CREATE_TRACE_POINTS
153#include "mmutrace.h" 152#include "mmutrace.h"
154 153
@@ -174,12 +173,7 @@ struct kvm_shadow_walk_iterator {
174 shadow_walk_okay(&(_walker)); \ 173 shadow_walk_okay(&(_walker)); \
175 shadow_walk_next(&(_walker))) 174 shadow_walk_next(&(_walker)))
176 175
177 176typedef int (*mmu_parent_walk_fn) (struct kvm_mmu_page *sp);
178struct kvm_unsync_walk {
179 int (*entry) (struct kvm_mmu_page *sp, struct kvm_unsync_walk *walk);
180};
181
182typedef int (*mmu_parent_walk_fn) (struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp);
183 177
184static struct kmem_cache *pte_chain_cache; 178static struct kmem_cache *pte_chain_cache;
185static struct kmem_cache *rmap_desc_cache; 179static struct kmem_cache *rmap_desc_cache;
@@ -223,7 +217,7 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
223} 217}
224EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes); 218EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
225 219
226static int is_write_protection(struct kvm_vcpu *vcpu) 220static bool is_write_protection(struct kvm_vcpu *vcpu)
227{ 221{
228 return kvm_read_cr0_bits(vcpu, X86_CR0_WP); 222 return kvm_read_cr0_bits(vcpu, X86_CR0_WP);
229} 223}
@@ -327,7 +321,6 @@ static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache,
327 page = alloc_page(GFP_KERNEL); 321 page = alloc_page(GFP_KERNEL);
328 if (!page) 322 if (!page)
329 return -ENOMEM; 323 return -ENOMEM;
330 set_page_private(page, 0);
331 cache->objects[cache->nobjs++] = page_address(page); 324 cache->objects[cache->nobjs++] = page_address(page);
332 } 325 }
333 return 0; 326 return 0;
@@ -438,9 +431,9 @@ static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn)
438 int i; 431 int i;
439 432
440 gfn = unalias_gfn(kvm, gfn); 433 gfn = unalias_gfn(kvm, gfn);
434 slot = gfn_to_memslot_unaliased(kvm, gfn);
441 for (i = PT_DIRECTORY_LEVEL; 435 for (i = PT_DIRECTORY_LEVEL;
442 i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { 436 i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
443 slot = gfn_to_memslot_unaliased(kvm, gfn);
444 write_count = slot_largepage_idx(gfn, slot, i); 437 write_count = slot_largepage_idx(gfn, slot, i);
445 *write_count -= 1; 438 *write_count -= 1;
446 WARN_ON(*write_count < 0); 439 WARN_ON(*write_count < 0);
@@ -654,7 +647,6 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
654static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte) 647static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte)
655{ 648{
656 struct kvm_rmap_desc *desc; 649 struct kvm_rmap_desc *desc;
657 struct kvm_rmap_desc *prev_desc;
658 u64 *prev_spte; 650 u64 *prev_spte;
659 int i; 651 int i;
660 652
@@ -666,7 +658,6 @@ static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte)
666 return NULL; 658 return NULL;
667 } 659 }
668 desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); 660 desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
669 prev_desc = NULL;
670 prev_spte = NULL; 661 prev_spte = NULL;
671 while (desc) { 662 while (desc) {
672 for (i = 0; i < RMAP_EXT && desc->sptes[i]; ++i) { 663 for (i = 0; i < RMAP_EXT && desc->sptes[i]; ++i) {
@@ -794,7 +785,7 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
794 int retval = 0; 785 int retval = 0;
795 struct kvm_memslots *slots; 786 struct kvm_memslots *slots;
796 787
797 slots = rcu_dereference(kvm->memslots); 788 slots = kvm_memslots(kvm);
798 789
799 for (i = 0; i < slots->nmemslots; i++) { 790 for (i = 0; i < slots->nmemslots; i++) {
800 struct kvm_memory_slot *memslot = &slots->memslots[i]; 791 struct kvm_memory_slot *memslot = &slots->memslots[i];
@@ -925,7 +916,6 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
925 sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE); 916 sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
926 set_page_private(virt_to_page(sp->spt), (unsigned long)sp); 917 set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
927 list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages); 918 list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
928 INIT_LIST_HEAD(&sp->oos_link);
929 bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS); 919 bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS);
930 sp->multimapped = 0; 920 sp->multimapped = 0;
931 sp->parent_pte = parent_pte; 921 sp->parent_pte = parent_pte;
@@ -1009,8 +999,7 @@ static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
1009} 999}
1010 1000
1011 1001
1012static void mmu_parent_walk(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, 1002static void mmu_parent_walk(struct kvm_mmu_page *sp, mmu_parent_walk_fn fn)
1013 mmu_parent_walk_fn fn)
1014{ 1003{
1015 struct kvm_pte_chain *pte_chain; 1004 struct kvm_pte_chain *pte_chain;
1016 struct hlist_node *node; 1005 struct hlist_node *node;
@@ -1019,8 +1008,8 @@ static void mmu_parent_walk(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
1019 1008
1020 if (!sp->multimapped && sp->parent_pte) { 1009 if (!sp->multimapped && sp->parent_pte) {
1021 parent_sp = page_header(__pa(sp->parent_pte)); 1010 parent_sp = page_header(__pa(sp->parent_pte));
1022 fn(vcpu, parent_sp); 1011 fn(parent_sp);
1023 mmu_parent_walk(vcpu, parent_sp, fn); 1012 mmu_parent_walk(parent_sp, fn);
1024 return; 1013 return;
1025 } 1014 }
1026 hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link) 1015 hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
@@ -1028,8 +1017,8 @@ static void mmu_parent_walk(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
1028 if (!pte_chain->parent_ptes[i]) 1017 if (!pte_chain->parent_ptes[i])
1029 break; 1018 break;
1030 parent_sp = page_header(__pa(pte_chain->parent_ptes[i])); 1019 parent_sp = page_header(__pa(pte_chain->parent_ptes[i]));
1031 fn(vcpu, parent_sp); 1020 fn(parent_sp);
1032 mmu_parent_walk(vcpu, parent_sp, fn); 1021 mmu_parent_walk(parent_sp, fn);
1033 } 1022 }
1034} 1023}
1035 1024
@@ -1066,16 +1055,15 @@ static void kvm_mmu_update_parents_unsync(struct kvm_mmu_page *sp)
1066 } 1055 }
1067} 1056}
1068 1057
1069static int unsync_walk_fn(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) 1058static int unsync_walk_fn(struct kvm_mmu_page *sp)
1070{ 1059{
1071 kvm_mmu_update_parents_unsync(sp); 1060 kvm_mmu_update_parents_unsync(sp);
1072 return 1; 1061 return 1;
1073} 1062}
1074 1063
1075static void kvm_mmu_mark_parents_unsync(struct kvm_vcpu *vcpu, 1064static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp)
1076 struct kvm_mmu_page *sp)
1077{ 1065{
1078 mmu_parent_walk(vcpu, sp, unsync_walk_fn); 1066 mmu_parent_walk(sp, unsync_walk_fn);
1079 kvm_mmu_update_parents_unsync(sp); 1067 kvm_mmu_update_parents_unsync(sp);
1080} 1068}
1081 1069
@@ -1201,6 +1189,7 @@ static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn)
1201static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp) 1189static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
1202{ 1190{
1203 WARN_ON(!sp->unsync); 1191 WARN_ON(!sp->unsync);
1192 trace_kvm_mmu_sync_page(sp);
1204 sp->unsync = 0; 1193 sp->unsync = 0;
1205 --kvm->stat.mmu_unsync; 1194 --kvm->stat.mmu_unsync;
1206} 1195}
@@ -1209,12 +1198,11 @@ static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp);
1209 1198
1210static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) 1199static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
1211{ 1200{
1212 if (sp->role.glevels != vcpu->arch.mmu.root_level) { 1201 if (sp->role.cr4_pae != !!is_pae(vcpu)) {
1213 kvm_mmu_zap_page(vcpu->kvm, sp); 1202 kvm_mmu_zap_page(vcpu->kvm, sp);
1214 return 1; 1203 return 1;
1215 } 1204 }
1216 1205
1217 trace_kvm_mmu_sync_page(sp);
1218 if (rmap_write_protect(vcpu->kvm, sp->gfn)) 1206 if (rmap_write_protect(vcpu->kvm, sp->gfn))
1219 kvm_flush_remote_tlbs(vcpu->kvm); 1207 kvm_flush_remote_tlbs(vcpu->kvm);
1220 kvm_unlink_unsync_page(vcpu->kvm, sp); 1208 kvm_unlink_unsync_page(vcpu->kvm, sp);
@@ -1331,6 +1319,8 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
1331 role = vcpu->arch.mmu.base_role; 1319 role = vcpu->arch.mmu.base_role;
1332 role.level = level; 1320 role.level = level;
1333 role.direct = direct; 1321 role.direct = direct;
1322 if (role.direct)
1323 role.cr4_pae = 0;
1334 role.access = access; 1324 role.access = access;
1335 if (vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) { 1325 if (vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) {
1336 quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level)); 1326 quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
@@ -1351,7 +1341,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
1351 mmu_page_add_parent_pte(vcpu, sp, parent_pte); 1341 mmu_page_add_parent_pte(vcpu, sp, parent_pte);
1352 if (sp->unsync_children) { 1342 if (sp->unsync_children) {
1353 set_bit(KVM_REQ_MMU_SYNC, &vcpu->requests); 1343 set_bit(KVM_REQ_MMU_SYNC, &vcpu->requests);
1354 kvm_mmu_mark_parents_unsync(vcpu, sp); 1344 kvm_mmu_mark_parents_unsync(sp);
1355 } 1345 }
1356 trace_kvm_mmu_get_page(sp, false); 1346 trace_kvm_mmu_get_page(sp, false);
1357 return sp; 1347 return sp;
@@ -1573,13 +1563,14 @@ static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
1573 r = 0; 1563 r = 0;
1574 index = kvm_page_table_hashfn(gfn); 1564 index = kvm_page_table_hashfn(gfn);
1575 bucket = &kvm->arch.mmu_page_hash[index]; 1565 bucket = &kvm->arch.mmu_page_hash[index];
1566restart:
1576 hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) 1567 hlist_for_each_entry_safe(sp, node, n, bucket, hash_link)
1577 if (sp->gfn == gfn && !sp->role.direct) { 1568 if (sp->gfn == gfn && !sp->role.direct) {
1578 pgprintk("%s: gfn %lx role %x\n", __func__, gfn, 1569 pgprintk("%s: gfn %lx role %x\n", __func__, gfn,
1579 sp->role.word); 1570 sp->role.word);
1580 r = 1; 1571 r = 1;
1581 if (kvm_mmu_zap_page(kvm, sp)) 1572 if (kvm_mmu_zap_page(kvm, sp))
1582 n = bucket->first; 1573 goto restart;
1583 } 1574 }
1584 return r; 1575 return r;
1585} 1576}
@@ -1593,13 +1584,14 @@ static void mmu_unshadow(struct kvm *kvm, gfn_t gfn)
1593 1584
1594 index = kvm_page_table_hashfn(gfn); 1585 index = kvm_page_table_hashfn(gfn);
1595 bucket = &kvm->arch.mmu_page_hash[index]; 1586 bucket = &kvm->arch.mmu_page_hash[index];
1587restart:
1596 hlist_for_each_entry_safe(sp, node, nn, bucket, hash_link) { 1588 hlist_for_each_entry_safe(sp, node, nn, bucket, hash_link) {
1597 if (sp->gfn == gfn && !sp->role.direct 1589 if (sp->gfn == gfn && !sp->role.direct
1598 && !sp->role.invalid) { 1590 && !sp->role.invalid) {
1599 pgprintk("%s: zap %lx %x\n", 1591 pgprintk("%s: zap %lx %x\n",
1600 __func__, gfn, sp->role.word); 1592 __func__, gfn, sp->role.word);
1601 if (kvm_mmu_zap_page(kvm, sp)) 1593 if (kvm_mmu_zap_page(kvm, sp))
1602 nn = bucket->first; 1594 goto restart;
1603 } 1595 }
1604 } 1596 }
1605} 1597}
@@ -1626,20 +1618,6 @@ static void mmu_convert_notrap(struct kvm_mmu_page *sp)
1626 } 1618 }
1627} 1619}
1628 1620
1629struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva)
1630{
1631 struct page *page;
1632
1633 gpa_t gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL);
1634
1635 if (gpa == UNMAPPED_GVA)
1636 return NULL;
1637
1638 page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
1639
1640 return page;
1641}
1642
1643/* 1621/*
1644 * The function is based on mtrr_type_lookup() in 1622 * The function is based on mtrr_type_lookup() in
1645 * arch/x86/kernel/cpu/mtrr/generic.c 1623 * arch/x86/kernel/cpu/mtrr/generic.c
@@ -1752,7 +1730,6 @@ static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
1752 struct kvm_mmu_page *s; 1730 struct kvm_mmu_page *s;
1753 struct hlist_node *node, *n; 1731 struct hlist_node *node, *n;
1754 1732
1755 trace_kvm_mmu_unsync_page(sp);
1756 index = kvm_page_table_hashfn(sp->gfn); 1733 index = kvm_page_table_hashfn(sp->gfn);
1757 bucket = &vcpu->kvm->arch.mmu_page_hash[index]; 1734 bucket = &vcpu->kvm->arch.mmu_page_hash[index];
1758 /* don't unsync if pagetable is shadowed with multiple roles */ 1735 /* don't unsync if pagetable is shadowed with multiple roles */
@@ -1762,10 +1739,11 @@ static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
1762 if (s->role.word != sp->role.word) 1739 if (s->role.word != sp->role.word)
1763 return 1; 1740 return 1;
1764 } 1741 }
1742 trace_kvm_mmu_unsync_page(sp);
1765 ++vcpu->kvm->stat.mmu_unsync; 1743 ++vcpu->kvm->stat.mmu_unsync;
1766 sp->unsync = 1; 1744 sp->unsync = 1;
1767 1745
1768 kvm_mmu_mark_parents_unsync(vcpu, sp); 1746 kvm_mmu_mark_parents_unsync(sp);
1769 1747
1770 mmu_convert_notrap(sp); 1748 mmu_convert_notrap(sp);
1771 return 0; 1749 return 0;
@@ -2081,21 +2059,23 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
2081 hpa_t root = vcpu->arch.mmu.root_hpa; 2059 hpa_t root = vcpu->arch.mmu.root_hpa;
2082 2060
2083 ASSERT(!VALID_PAGE(root)); 2061 ASSERT(!VALID_PAGE(root));
2084 if (tdp_enabled)
2085 direct = 1;
2086 if (mmu_check_root(vcpu, root_gfn)) 2062 if (mmu_check_root(vcpu, root_gfn))
2087 return 1; 2063 return 1;
2064 if (tdp_enabled) {
2065 direct = 1;
2066 root_gfn = 0;
2067 }
2068 spin_lock(&vcpu->kvm->mmu_lock);
2088 sp = kvm_mmu_get_page(vcpu, root_gfn, 0, 2069 sp = kvm_mmu_get_page(vcpu, root_gfn, 0,
2089 PT64_ROOT_LEVEL, direct, 2070 PT64_ROOT_LEVEL, direct,
2090 ACC_ALL, NULL); 2071 ACC_ALL, NULL);
2091 root = __pa(sp->spt); 2072 root = __pa(sp->spt);
2092 ++sp->root_count; 2073 ++sp->root_count;
2074 spin_unlock(&vcpu->kvm->mmu_lock);
2093 vcpu->arch.mmu.root_hpa = root; 2075 vcpu->arch.mmu.root_hpa = root;
2094 return 0; 2076 return 0;
2095 } 2077 }
2096 direct = !is_paging(vcpu); 2078 direct = !is_paging(vcpu);
2097 if (tdp_enabled)
2098 direct = 1;
2099 for (i = 0; i < 4; ++i) { 2079 for (i = 0; i < 4; ++i) {
2100 hpa_t root = vcpu->arch.mmu.pae_root[i]; 2080 hpa_t root = vcpu->arch.mmu.pae_root[i];
2101 2081
@@ -2111,11 +2091,18 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
2111 root_gfn = 0; 2091 root_gfn = 0;
2112 if (mmu_check_root(vcpu, root_gfn)) 2092 if (mmu_check_root(vcpu, root_gfn))
2113 return 1; 2093 return 1;
2094 if (tdp_enabled) {
2095 direct = 1;
2096 root_gfn = i << 30;
2097 }
2098 spin_lock(&vcpu->kvm->mmu_lock);
2114 sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, 2099 sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
2115 PT32_ROOT_LEVEL, direct, 2100 PT32_ROOT_LEVEL, direct,
2116 ACC_ALL, NULL); 2101 ACC_ALL, NULL);
2117 root = __pa(sp->spt); 2102 root = __pa(sp->spt);
2118 ++sp->root_count; 2103 ++sp->root_count;
2104 spin_unlock(&vcpu->kvm->mmu_lock);
2105
2119 vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK; 2106 vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK;
2120 } 2107 }
2121 vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root); 2108 vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
@@ -2299,13 +2286,19 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level)
2299 /* no rsvd bits for 2 level 4K page table entries */ 2286 /* no rsvd bits for 2 level 4K page table entries */
2300 context->rsvd_bits_mask[0][1] = 0; 2287 context->rsvd_bits_mask[0][1] = 0;
2301 context->rsvd_bits_mask[0][0] = 0; 2288 context->rsvd_bits_mask[0][0] = 0;
2289 context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0];
2290
2291 if (!is_pse(vcpu)) {
2292 context->rsvd_bits_mask[1][1] = 0;
2293 break;
2294 }
2295
2302 if (is_cpuid_PSE36()) 2296 if (is_cpuid_PSE36())
2303 /* 36bits PSE 4MB page */ 2297 /* 36bits PSE 4MB page */
2304 context->rsvd_bits_mask[1][1] = rsvd_bits(17, 21); 2298 context->rsvd_bits_mask[1][1] = rsvd_bits(17, 21);
2305 else 2299 else
2306 /* 32 bits PSE 4MB page */ 2300 /* 32 bits PSE 4MB page */
2307 context->rsvd_bits_mask[1][1] = rsvd_bits(13, 21); 2301 context->rsvd_bits_mask[1][1] = rsvd_bits(13, 21);
2308 context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[1][0];
2309 break; 2302 break;
2310 case PT32E_ROOT_LEVEL: 2303 case PT32E_ROOT_LEVEL:
2311 context->rsvd_bits_mask[0][2] = 2304 context->rsvd_bits_mask[0][2] =
@@ -2318,7 +2311,7 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level)
2318 context->rsvd_bits_mask[1][1] = exb_bit_rsvd | 2311 context->rsvd_bits_mask[1][1] = exb_bit_rsvd |
2319 rsvd_bits(maxphyaddr, 62) | 2312 rsvd_bits(maxphyaddr, 62) |
2320 rsvd_bits(13, 20); /* large page */ 2313 rsvd_bits(13, 20); /* large page */
2321 context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[1][0]; 2314 context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0];
2322 break; 2315 break;
2323 case PT64_ROOT_LEVEL: 2316 case PT64_ROOT_LEVEL:
2324 context->rsvd_bits_mask[0][3] = exb_bit_rsvd | 2317 context->rsvd_bits_mask[0][3] = exb_bit_rsvd |
@@ -2336,7 +2329,7 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level)
2336 context->rsvd_bits_mask[1][1] = exb_bit_rsvd | 2329 context->rsvd_bits_mask[1][1] = exb_bit_rsvd |
2337 rsvd_bits(maxphyaddr, 51) | 2330 rsvd_bits(maxphyaddr, 51) |
2338 rsvd_bits(13, 20); /* large page */ 2331 rsvd_bits(13, 20); /* large page */
2339 context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[1][0]; 2332 context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0];
2340 break; 2333 break;
2341 } 2334 }
2342} 2335}
@@ -2438,7 +2431,8 @@ static int init_kvm_softmmu(struct kvm_vcpu *vcpu)
2438 else 2431 else
2439 r = paging32_init_context(vcpu); 2432 r = paging32_init_context(vcpu);
2440 2433
2441 vcpu->arch.mmu.base_role.glevels = vcpu->arch.mmu.root_level; 2434 vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu);
2435 vcpu->arch.mmu.base_role.cr0_wp = is_write_protection(vcpu);
2442 2436
2443 return r; 2437 return r;
2444} 2438}
@@ -2478,7 +2472,9 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
2478 goto out; 2472 goto out;
2479 spin_lock(&vcpu->kvm->mmu_lock); 2473 spin_lock(&vcpu->kvm->mmu_lock);
2480 kvm_mmu_free_some_pages(vcpu); 2474 kvm_mmu_free_some_pages(vcpu);
2475 spin_unlock(&vcpu->kvm->mmu_lock);
2481 r = mmu_alloc_roots(vcpu); 2476 r = mmu_alloc_roots(vcpu);
2477 spin_lock(&vcpu->kvm->mmu_lock);
2482 mmu_sync_roots(vcpu); 2478 mmu_sync_roots(vcpu);
2483 spin_unlock(&vcpu->kvm->mmu_lock); 2479 spin_unlock(&vcpu->kvm->mmu_lock);
2484 if (r) 2480 if (r)
@@ -2527,7 +2523,7 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
2527 } 2523 }
2528 2524
2529 ++vcpu->kvm->stat.mmu_pte_updated; 2525 ++vcpu->kvm->stat.mmu_pte_updated;
2530 if (sp->role.glevels == PT32_ROOT_LEVEL) 2526 if (!sp->role.cr4_pae)
2531 paging32_update_pte(vcpu, sp, spte, new); 2527 paging32_update_pte(vcpu, sp, spte, new);
2532 else 2528 else
2533 paging64_update_pte(vcpu, sp, spte, new); 2529 paging64_update_pte(vcpu, sp, spte, new);
@@ -2562,36 +2558,11 @@ static bool last_updated_pte_accessed(struct kvm_vcpu *vcpu)
2562} 2558}
2563 2559
2564static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, 2560static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
2565 const u8 *new, int bytes) 2561 u64 gpte)
2566{ 2562{
2567 gfn_t gfn; 2563 gfn_t gfn;
2568 int r;
2569 u64 gpte = 0;
2570 pfn_t pfn; 2564 pfn_t pfn;
2571 2565
2572 if (bytes != 4 && bytes != 8)
2573 return;
2574
2575 /*
2576 * Assume that the pte write on a page table of the same type
2577 * as the current vcpu paging mode. This is nearly always true
2578 * (might be false while changing modes). Note it is verified later
2579 * by update_pte().
2580 */
2581 if (is_pae(vcpu)) {
2582 /* Handle a 32-bit guest writing two halves of a 64-bit gpte */
2583 if ((bytes == 4) && (gpa % 4 == 0)) {
2584 r = kvm_read_guest(vcpu->kvm, gpa & ~(u64)7, &gpte, 8);
2585 if (r)
2586 return;
2587 memcpy((void *)&gpte + (gpa % 8), new, 4);
2588 } else if ((bytes == 8) && (gpa % 8 == 0)) {
2589 memcpy((void *)&gpte, new, 8);
2590 }
2591 } else {
2592 if ((bytes == 4) && (gpa % 4 == 0))
2593 memcpy((void *)&gpte, new, 4);
2594 }
2595 if (!is_present_gpte(gpte)) 2566 if (!is_present_gpte(gpte))
2596 return; 2567 return;
2597 gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; 2568 gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
@@ -2640,10 +2611,46 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
2640 int flooded = 0; 2611 int flooded = 0;
2641 int npte; 2612 int npte;
2642 int r; 2613 int r;
2614 int invlpg_counter;
2643 2615
2644 pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes); 2616 pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);
2645 mmu_guess_page_from_pte_write(vcpu, gpa, new, bytes); 2617
2618 invlpg_counter = atomic_read(&vcpu->kvm->arch.invlpg_counter);
2619
2620 /*
2621 * Assume that the pte write on a page table of the same type
2622 * as the current vcpu paging mode. This is nearly always true
2623 * (might be false while changing modes). Note it is verified later
2624 * by update_pte().
2625 */
2626 if ((is_pae(vcpu) && bytes == 4) || !new) {
2627 /* Handle a 32-bit guest writing two halves of a 64-bit gpte */
2628 if (is_pae(vcpu)) {
2629 gpa &= ~(gpa_t)7;
2630 bytes = 8;
2631 }
2632 r = kvm_read_guest(vcpu->kvm, gpa, &gentry, min(bytes, 8));
2633 if (r)
2634 gentry = 0;
2635 new = (const u8 *)&gentry;
2636 }
2637
2638 switch (bytes) {
2639 case 4:
2640 gentry = *(const u32 *)new;
2641 break;
2642 case 8:
2643 gentry = *(const u64 *)new;
2644 break;
2645 default:
2646 gentry = 0;
2647 break;
2648 }
2649
2650 mmu_guess_page_from_pte_write(vcpu, gpa, gentry);
2646 spin_lock(&vcpu->kvm->mmu_lock); 2651 spin_lock(&vcpu->kvm->mmu_lock);
2652 if (atomic_read(&vcpu->kvm->arch.invlpg_counter) != invlpg_counter)
2653 gentry = 0;
2647 kvm_mmu_access_page(vcpu, gfn); 2654 kvm_mmu_access_page(vcpu, gfn);
2648 kvm_mmu_free_some_pages(vcpu); 2655 kvm_mmu_free_some_pages(vcpu);
2649 ++vcpu->kvm->stat.mmu_pte_write; 2656 ++vcpu->kvm->stat.mmu_pte_write;
@@ -2662,10 +2669,12 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
2662 } 2669 }
2663 index = kvm_page_table_hashfn(gfn); 2670 index = kvm_page_table_hashfn(gfn);
2664 bucket = &vcpu->kvm->arch.mmu_page_hash[index]; 2671 bucket = &vcpu->kvm->arch.mmu_page_hash[index];
2672
2673restart:
2665 hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) { 2674 hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) {
2666 if (sp->gfn != gfn || sp->role.direct || sp->role.invalid) 2675 if (sp->gfn != gfn || sp->role.direct || sp->role.invalid)
2667 continue; 2676 continue;
2668 pte_size = sp->role.glevels == PT32_ROOT_LEVEL ? 4 : 8; 2677 pte_size = sp->role.cr4_pae ? 8 : 4;
2669 misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1); 2678 misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
2670 misaligned |= bytes < 4; 2679 misaligned |= bytes < 4;
2671 if (misaligned || flooded) { 2680 if (misaligned || flooded) {
@@ -2682,14 +2691,14 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
2682 pgprintk("misaligned: gpa %llx bytes %d role %x\n", 2691 pgprintk("misaligned: gpa %llx bytes %d role %x\n",
2683 gpa, bytes, sp->role.word); 2692 gpa, bytes, sp->role.word);
2684 if (kvm_mmu_zap_page(vcpu->kvm, sp)) 2693 if (kvm_mmu_zap_page(vcpu->kvm, sp))
2685 n = bucket->first; 2694 goto restart;
2686 ++vcpu->kvm->stat.mmu_flooded; 2695 ++vcpu->kvm->stat.mmu_flooded;
2687 continue; 2696 continue;
2688 } 2697 }
2689 page_offset = offset; 2698 page_offset = offset;
2690 level = sp->role.level; 2699 level = sp->role.level;
2691 npte = 1; 2700 npte = 1;
2692 if (sp->role.glevels == PT32_ROOT_LEVEL) { 2701 if (!sp->role.cr4_pae) {
2693 page_offset <<= 1; /* 32->64 */ 2702 page_offset <<= 1; /* 32->64 */
2694 /* 2703 /*
2695 * A 32-bit pde maps 4MB while the shadow pdes map 2704 * A 32-bit pde maps 4MB while the shadow pdes map
@@ -2707,20 +2716,11 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
2707 continue; 2716 continue;
2708 } 2717 }
2709 spte = &sp->spt[page_offset / sizeof(*spte)]; 2718 spte = &sp->spt[page_offset / sizeof(*spte)];
2710 if ((gpa & (pte_size - 1)) || (bytes < pte_size)) {
2711 gentry = 0;
2712 r = kvm_read_guest_atomic(vcpu->kvm,
2713 gpa & ~(u64)(pte_size - 1),
2714 &gentry, pte_size);
2715 new = (const void *)&gentry;
2716 if (r < 0)
2717 new = NULL;
2718 }
2719 while (npte--) { 2719 while (npte--) {
2720 entry = *spte; 2720 entry = *spte;
2721 mmu_pte_write_zap_pte(vcpu, sp, spte); 2721 mmu_pte_write_zap_pte(vcpu, sp, spte);
2722 if (new) 2722 if (gentry)
2723 mmu_pte_write_new_pte(vcpu, sp, spte, new); 2723 mmu_pte_write_new_pte(vcpu, sp, spte, &gentry);
2724 mmu_pte_write_flush_tlb(vcpu, entry, *spte); 2724 mmu_pte_write_flush_tlb(vcpu, entry, *spte);
2725 ++spte; 2725 ++spte;
2726 } 2726 }
@@ -2900,22 +2900,23 @@ void kvm_mmu_zap_all(struct kvm *kvm)
2900 struct kvm_mmu_page *sp, *node; 2900 struct kvm_mmu_page *sp, *node;
2901 2901
2902 spin_lock(&kvm->mmu_lock); 2902 spin_lock(&kvm->mmu_lock);
2903restart:
2903 list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) 2904 list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link)
2904 if (kvm_mmu_zap_page(kvm, sp)) 2905 if (kvm_mmu_zap_page(kvm, sp))
2905 node = container_of(kvm->arch.active_mmu_pages.next, 2906 goto restart;
2906 struct kvm_mmu_page, link); 2907
2907 spin_unlock(&kvm->mmu_lock); 2908 spin_unlock(&kvm->mmu_lock);
2908 2909
2909 kvm_flush_remote_tlbs(kvm); 2910 kvm_flush_remote_tlbs(kvm);
2910} 2911}
2911 2912
2912static void kvm_mmu_remove_one_alloc_mmu_page(struct kvm *kvm) 2913static int kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm)
2913{ 2914{
2914 struct kvm_mmu_page *page; 2915 struct kvm_mmu_page *page;
2915 2916
2916 page = container_of(kvm->arch.active_mmu_pages.prev, 2917 page = container_of(kvm->arch.active_mmu_pages.prev,
2917 struct kvm_mmu_page, link); 2918 struct kvm_mmu_page, link);
2918 kvm_mmu_zap_page(kvm, page); 2919 return kvm_mmu_zap_page(kvm, page) + 1;
2919} 2920}
2920 2921
2921static int mmu_shrink(int nr_to_scan, gfp_t gfp_mask) 2922static int mmu_shrink(int nr_to_scan, gfp_t gfp_mask)
@@ -2927,7 +2928,7 @@ static int mmu_shrink(int nr_to_scan, gfp_t gfp_mask)
2927 spin_lock(&kvm_lock); 2928 spin_lock(&kvm_lock);
2928 2929
2929 list_for_each_entry(kvm, &vm_list, vm_list) { 2930 list_for_each_entry(kvm, &vm_list, vm_list) {
2930 int npages, idx; 2931 int npages, idx, freed_pages;
2931 2932
2932 idx = srcu_read_lock(&kvm->srcu); 2933 idx = srcu_read_lock(&kvm->srcu);
2933 spin_lock(&kvm->mmu_lock); 2934 spin_lock(&kvm->mmu_lock);
@@ -2935,8 +2936,8 @@ static int mmu_shrink(int nr_to_scan, gfp_t gfp_mask)
2935 kvm->arch.n_free_mmu_pages; 2936 kvm->arch.n_free_mmu_pages;
2936 cache_count += npages; 2937 cache_count += npages;
2937 if (!kvm_freed && nr_to_scan > 0 && npages > 0) { 2938 if (!kvm_freed && nr_to_scan > 0 && npages > 0) {
2938 kvm_mmu_remove_one_alloc_mmu_page(kvm); 2939 freed_pages = kvm_mmu_remove_some_alloc_mmu_pages(kvm);
2939 cache_count--; 2940 cache_count -= freed_pages;
2940 kvm_freed = kvm; 2941 kvm_freed = kvm;
2941 } 2942 }
2942 nr_to_scan--; 2943 nr_to_scan--;
@@ -3011,7 +3012,8 @@ unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm)
3011 unsigned int nr_pages = 0; 3012 unsigned int nr_pages = 0;
3012 struct kvm_memslots *slots; 3013 struct kvm_memslots *slots;
3013 3014
3014 slots = rcu_dereference(kvm->memslots); 3015 slots = kvm_memslots(kvm);
3016
3015 for (i = 0; i < slots->nmemslots; i++) 3017 for (i = 0; i < slots->nmemslots; i++)
3016 nr_pages += slots->memslots[i].npages; 3018 nr_pages += slots->memslots[i].npages;
3017 3019
@@ -3174,8 +3176,7 @@ static gva_t canonicalize(gva_t gva)
3174} 3176}
3175 3177
3176 3178
3177typedef void (*inspect_spte_fn) (struct kvm *kvm, struct kvm_mmu_page *sp, 3179typedef void (*inspect_spte_fn) (struct kvm *kvm, u64 *sptep);
3178 u64 *sptep);
3179 3180
3180static void __mmu_spte_walk(struct kvm *kvm, struct kvm_mmu_page *sp, 3181static void __mmu_spte_walk(struct kvm *kvm, struct kvm_mmu_page *sp,
3181 inspect_spte_fn fn) 3182 inspect_spte_fn fn)
@@ -3191,7 +3192,7 @@ static void __mmu_spte_walk(struct kvm *kvm, struct kvm_mmu_page *sp,
3191 child = page_header(ent & PT64_BASE_ADDR_MASK); 3192 child = page_header(ent & PT64_BASE_ADDR_MASK);
3192 __mmu_spte_walk(kvm, child, fn); 3193 __mmu_spte_walk(kvm, child, fn);
3193 } else 3194 } else
3194 fn(kvm, sp, &sp->spt[i]); 3195 fn(kvm, &sp->spt[i]);
3195 } 3196 }
3196 } 3197 }
3197} 3198}
@@ -3282,11 +3283,13 @@ static void audit_mappings(struct kvm_vcpu *vcpu)
3282 3283
3283static int count_rmaps(struct kvm_vcpu *vcpu) 3284static int count_rmaps(struct kvm_vcpu *vcpu)
3284{ 3285{
3286 struct kvm *kvm = vcpu->kvm;
3287 struct kvm_memslots *slots;
3285 int nmaps = 0; 3288 int nmaps = 0;
3286 int i, j, k, idx; 3289 int i, j, k, idx;
3287 3290
3288 idx = srcu_read_lock(&kvm->srcu); 3291 idx = srcu_read_lock(&kvm->srcu);
3289 slots = rcu_dereference(kvm->memslots); 3292 slots = kvm_memslots(kvm);
3290 for (i = 0; i < KVM_MEMORY_SLOTS; ++i) { 3293 for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
3291 struct kvm_memory_slot *m = &slots->memslots[i]; 3294 struct kvm_memory_slot *m = &slots->memslots[i];
3292 struct kvm_rmap_desc *d; 3295 struct kvm_rmap_desc *d;
@@ -3315,7 +3318,7 @@ static int count_rmaps(struct kvm_vcpu *vcpu)
3315 return nmaps; 3318 return nmaps;
3316} 3319}
3317 3320
3318void inspect_spte_has_rmap(struct kvm *kvm, struct kvm_mmu_page *sp, u64 *sptep) 3321void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
3319{ 3322{
3320 unsigned long *rmapp; 3323 unsigned long *rmapp;
3321 struct kvm_mmu_page *rev_sp; 3324 struct kvm_mmu_page *rev_sp;
@@ -3331,14 +3334,14 @@ void inspect_spte_has_rmap(struct kvm *kvm, struct kvm_mmu_page *sp, u64 *sptep)
3331 printk(KERN_ERR "%s: no memslot for gfn %ld\n", 3334 printk(KERN_ERR "%s: no memslot for gfn %ld\n",
3332 audit_msg, gfn); 3335 audit_msg, gfn);
3333 printk(KERN_ERR "%s: index %ld of sp (gfn=%lx)\n", 3336 printk(KERN_ERR "%s: index %ld of sp (gfn=%lx)\n",
3334 audit_msg, sptep - rev_sp->spt, 3337 audit_msg, (long int)(sptep - rev_sp->spt),
3335 rev_sp->gfn); 3338 rev_sp->gfn);
3336 dump_stack(); 3339 dump_stack();
3337 return; 3340 return;
3338 } 3341 }
3339 3342
3340 rmapp = gfn_to_rmap(kvm, rev_sp->gfns[sptep - rev_sp->spt], 3343 rmapp = gfn_to_rmap(kvm, rev_sp->gfns[sptep - rev_sp->spt],
3341 is_large_pte(*sptep)); 3344 rev_sp->role.level);
3342 if (!*rmapp) { 3345 if (!*rmapp) {
3343 if (!printk_ratelimit()) 3346 if (!printk_ratelimit())
3344 return; 3347 return;
@@ -3373,7 +3376,7 @@ static void check_writable_mappings_rmap(struct kvm_vcpu *vcpu)
3373 continue; 3376 continue;
3374 if (!(ent & PT_WRITABLE_MASK)) 3377 if (!(ent & PT_WRITABLE_MASK))
3375 continue; 3378 continue;
3376 inspect_spte_has_rmap(vcpu->kvm, sp, &pt[i]); 3379 inspect_spte_has_rmap(vcpu->kvm, &pt[i]);
3377 } 3380 }
3378 } 3381 }
3379 return; 3382 return;
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
index 3e4a5c6ca2a9..42f07b1bfbc9 100644
--- a/arch/x86/kvm/mmutrace.h
+++ b/arch/x86/kvm/mmutrace.h
@@ -6,14 +6,12 @@
6 6
7#undef TRACE_SYSTEM 7#undef TRACE_SYSTEM
8#define TRACE_SYSTEM kvmmmu 8#define TRACE_SYSTEM kvmmmu
9#define TRACE_INCLUDE_PATH .
10#define TRACE_INCLUDE_FILE mmutrace
11 9
12#define KVM_MMU_PAGE_FIELDS \ 10#define KVM_MMU_PAGE_FIELDS \
13 __field(__u64, gfn) \ 11 __field(__u64, gfn) \
14 __field(__u32, role) \ 12 __field(__u32, role) \
15 __field(__u32, root_count) \ 13 __field(__u32, root_count) \
16 __field(__u32, unsync) 14 __field(bool, unsync)
17 15
18#define KVM_MMU_PAGE_ASSIGN(sp) \ 16#define KVM_MMU_PAGE_ASSIGN(sp) \
19 __entry->gfn = sp->gfn; \ 17 __entry->gfn = sp->gfn; \
@@ -30,14 +28,14 @@
30 \ 28 \
31 role.word = __entry->role; \ 29 role.word = __entry->role; \
32 \ 30 \
33 trace_seq_printf(p, "sp gfn %llx %u/%u q%u%s %s%s %spge" \ 31 trace_seq_printf(p, "sp gfn %llx %u%s q%u%s %s%s" \
34 " %snxe root %u %s%c", \ 32 " %snxe root %u %s%c", \
35 __entry->gfn, role.level, role.glevels, \ 33 __entry->gfn, role.level, \
34 role.cr4_pae ? " pae" : "", \
36 role.quadrant, \ 35 role.quadrant, \
37 role.direct ? " direct" : "", \ 36 role.direct ? " direct" : "", \
38 access_str[role.access], \ 37 access_str[role.access], \
39 role.invalid ? " invalid" : "", \ 38 role.invalid ? " invalid" : "", \
40 role.cr4_pge ? "" : "!", \
41 role.nxe ? "" : "!", \ 39 role.nxe ? "" : "!", \
42 __entry->root_count, \ 40 __entry->root_count, \
43 __entry->unsync ? "unsync" : "sync", 0); \ 41 __entry->unsync ? "unsync" : "sync", 0); \
@@ -94,15 +92,15 @@ TRACE_EVENT(
94 TP_printk("pte %llx level %u", __entry->pte, __entry->level) 92 TP_printk("pte %llx level %u", __entry->pte, __entry->level)
95); 93);
96 94
97/* We set a pte accessed bit */ 95DECLARE_EVENT_CLASS(kvm_mmu_set_bit_class,
98TRACE_EVENT( 96
99 kvm_mmu_set_accessed_bit,
100 TP_PROTO(unsigned long table_gfn, unsigned index, unsigned size), 97 TP_PROTO(unsigned long table_gfn, unsigned index, unsigned size),
98
101 TP_ARGS(table_gfn, index, size), 99 TP_ARGS(table_gfn, index, size),
102 100
103 TP_STRUCT__entry( 101 TP_STRUCT__entry(
104 __field(__u64, gpa) 102 __field(__u64, gpa)
105 ), 103 ),
106 104
107 TP_fast_assign( 105 TP_fast_assign(
108 __entry->gpa = ((u64)table_gfn << PAGE_SHIFT) 106 __entry->gpa = ((u64)table_gfn << PAGE_SHIFT)
@@ -112,22 +110,20 @@ TRACE_EVENT(
112 TP_printk("gpa %llx", __entry->gpa) 110 TP_printk("gpa %llx", __entry->gpa)
113); 111);
114 112
115/* We set a pte dirty bit */ 113/* We set a pte accessed bit */
116TRACE_EVENT( 114DEFINE_EVENT(kvm_mmu_set_bit_class, kvm_mmu_set_accessed_bit,
117 kvm_mmu_set_dirty_bit, 115
118 TP_PROTO(unsigned long table_gfn, unsigned index, unsigned size), 116 TP_PROTO(unsigned long table_gfn, unsigned index, unsigned size),
119 TP_ARGS(table_gfn, index, size),
120 117
121 TP_STRUCT__entry( 118 TP_ARGS(table_gfn, index, size)
122 __field(__u64, gpa) 119);
123 ),
124 120
125 TP_fast_assign( 121/* We set a pte dirty bit */
126 __entry->gpa = ((u64)table_gfn << PAGE_SHIFT) 122DEFINE_EVENT(kvm_mmu_set_bit_class, kvm_mmu_set_dirty_bit,
127 + index * size;
128 ),
129 123
130 TP_printk("gpa %llx", __entry->gpa) 124 TP_PROTO(unsigned long table_gfn, unsigned index, unsigned size),
125
126 TP_ARGS(table_gfn, index, size)
131); 127);
132 128
133TRACE_EVENT( 129TRACE_EVENT(
@@ -166,55 +162,45 @@ TRACE_EVENT(
166 __entry->created ? "new" : "existing") 162 __entry->created ? "new" : "existing")
167); 163);
168 164
169TRACE_EVENT( 165DECLARE_EVENT_CLASS(kvm_mmu_page_class,
170 kvm_mmu_sync_page, 166
171 TP_PROTO(struct kvm_mmu_page *sp), 167 TP_PROTO(struct kvm_mmu_page *sp),
172 TP_ARGS(sp), 168 TP_ARGS(sp),
173 169
174 TP_STRUCT__entry( 170 TP_STRUCT__entry(
175 KVM_MMU_PAGE_FIELDS 171 KVM_MMU_PAGE_FIELDS
176 ), 172 ),
177 173
178 TP_fast_assign( 174 TP_fast_assign(
179 KVM_MMU_PAGE_ASSIGN(sp) 175 KVM_MMU_PAGE_ASSIGN(sp)
180 ), 176 ),
181 177
182 TP_printk("%s", KVM_MMU_PAGE_PRINTK()) 178 TP_printk("%s", KVM_MMU_PAGE_PRINTK())
183); 179);
184 180
185TRACE_EVENT( 181DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_sync_page,
186 kvm_mmu_unsync_page,
187 TP_PROTO(struct kvm_mmu_page *sp), 182 TP_PROTO(struct kvm_mmu_page *sp),
188 TP_ARGS(sp),
189
190 TP_STRUCT__entry(
191 KVM_MMU_PAGE_FIELDS
192 ),
193 183
194 TP_fast_assign( 184 TP_ARGS(sp)
195 KVM_MMU_PAGE_ASSIGN(sp)
196 ),
197
198 TP_printk("%s", KVM_MMU_PAGE_PRINTK())
199); 185);
200 186
201TRACE_EVENT( 187DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_unsync_page,
202 kvm_mmu_zap_page,
203 TP_PROTO(struct kvm_mmu_page *sp), 188 TP_PROTO(struct kvm_mmu_page *sp),
204 TP_ARGS(sp),
205 189
206 TP_STRUCT__entry( 190 TP_ARGS(sp)
207 KVM_MMU_PAGE_FIELDS 191);
208 ),
209 192
210 TP_fast_assign( 193DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_zap_page,
211 KVM_MMU_PAGE_ASSIGN(sp) 194 TP_PROTO(struct kvm_mmu_page *sp),
212 ),
213 195
214 TP_printk("%s", KVM_MMU_PAGE_PRINTK()) 196 TP_ARGS(sp)
215); 197);
216
217#endif /* _TRACE_KVMMMU_H */ 198#endif /* _TRACE_KVMMMU_H */
218 199
200#undef TRACE_INCLUDE_PATH
201#define TRACE_INCLUDE_PATH .
202#undef TRACE_INCLUDE_FILE
203#define TRACE_INCLUDE_FILE mmutrace
204
219/* This part must be outside protection */ 205/* This part must be outside protection */
220#include <trace/define_trace.h> 206#include <trace/define_trace.h>
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 81eab9a50e6a..89d66ca4d87c 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -170,7 +170,7 @@ walk:
170 goto access_error; 170 goto access_error;
171 171
172#if PTTYPE == 64 172#if PTTYPE == 64
173 if (fetch_fault && is_nx(vcpu) && (pte & PT64_NX_MASK)) 173 if (fetch_fault && (pte & PT64_NX_MASK))
174 goto access_error; 174 goto access_error;
175#endif 175#endif
176 176
@@ -190,10 +190,10 @@ walk:
190 190
191 if ((walker->level == PT_PAGE_TABLE_LEVEL) || 191 if ((walker->level == PT_PAGE_TABLE_LEVEL) ||
192 ((walker->level == PT_DIRECTORY_LEVEL) && 192 ((walker->level == PT_DIRECTORY_LEVEL) &&
193 (pte & PT_PAGE_SIZE_MASK) && 193 is_large_pte(pte) &&
194 (PTTYPE == 64 || is_pse(vcpu))) || 194 (PTTYPE == 64 || is_pse(vcpu))) ||
195 ((walker->level == PT_PDPE_LEVEL) && 195 ((walker->level == PT_PDPE_LEVEL) &&
196 (pte & PT_PAGE_SIZE_MASK) && 196 is_large_pte(pte) &&
197 is_long_mode(vcpu))) { 197 is_long_mode(vcpu))) {
198 int lvl = walker->level; 198 int lvl = walker->level;
199 199
@@ -258,11 +258,17 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
258 pt_element_t gpte; 258 pt_element_t gpte;
259 unsigned pte_access; 259 unsigned pte_access;
260 pfn_t pfn; 260 pfn_t pfn;
261 u64 new_spte;
261 262
262 gpte = *(const pt_element_t *)pte; 263 gpte = *(const pt_element_t *)pte;
263 if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) { 264 if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) {
264 if (!is_present_gpte(gpte)) 265 if (!is_present_gpte(gpte)) {
265 __set_spte(spte, shadow_notrap_nonpresent_pte); 266 if (page->unsync)
267 new_spte = shadow_trap_nonpresent_pte;
268 else
269 new_spte = shadow_notrap_nonpresent_pte;
270 __set_spte(spte, new_spte);
271 }
266 return; 272 return;
267 } 273 }
268 pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte); 274 pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);
@@ -457,6 +463,7 @@ out_unlock:
457static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) 463static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
458{ 464{
459 struct kvm_shadow_walk_iterator iterator; 465 struct kvm_shadow_walk_iterator iterator;
466 gpa_t pte_gpa = -1;
460 int level; 467 int level;
461 u64 *sptep; 468 u64 *sptep;
462 int need_flush = 0; 469 int need_flush = 0;
@@ -467,9 +474,16 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
467 level = iterator.level; 474 level = iterator.level;
468 sptep = iterator.sptep; 475 sptep = iterator.sptep;
469 476
470 if (level == PT_PAGE_TABLE_LEVEL || 477 if (is_last_spte(*sptep, level)) {
471 ((level == PT_DIRECTORY_LEVEL && is_large_pte(*sptep))) || 478 struct kvm_mmu_page *sp = page_header(__pa(sptep));
472 ((level == PT_PDPE_LEVEL && is_large_pte(*sptep)))) { 479 int offset, shift;
480
481 shift = PAGE_SHIFT -
482 (PT_LEVEL_BITS - PT64_LEVEL_BITS) * level;
483 offset = sp->role.quadrant << shift;
484
485 pte_gpa = (sp->gfn << PAGE_SHIFT) + offset;
486 pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t);
473 487
474 if (is_shadow_present_pte(*sptep)) { 488 if (is_shadow_present_pte(*sptep)) {
475 rmap_remove(vcpu->kvm, sptep); 489 rmap_remove(vcpu->kvm, sptep);
@@ -487,7 +501,17 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
487 501
488 if (need_flush) 502 if (need_flush)
489 kvm_flush_remote_tlbs(vcpu->kvm); 503 kvm_flush_remote_tlbs(vcpu->kvm);
504
505 atomic_inc(&vcpu->kvm->arch.invlpg_counter);
506
490 spin_unlock(&vcpu->kvm->mmu_lock); 507 spin_unlock(&vcpu->kvm->mmu_lock);
508
509 if (pte_gpa == -1)
510 return;
511
512 if (mmu_topup_memory_caches(vcpu))
513 return;
514 kvm_mmu_pte_write(vcpu, pte_gpa, NULL, sizeof(pt_element_t), 0);
491} 515}
492 516
493static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access, 517static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access,
@@ -551,12 +575,15 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
551{ 575{
552 int i, offset, nr_present; 576 int i, offset, nr_present;
553 bool reset_host_protection; 577 bool reset_host_protection;
578 gpa_t first_pte_gpa;
554 579
555 offset = nr_present = 0; 580 offset = nr_present = 0;
556 581
557 if (PTTYPE == 32) 582 if (PTTYPE == 32)
558 offset = sp->role.quadrant << PT64_LEVEL_BITS; 583 offset = sp->role.quadrant << PT64_LEVEL_BITS;
559 584
585 first_pte_gpa = gfn_to_gpa(sp->gfn) + offset * sizeof(pt_element_t);
586
560 for (i = 0; i < PT64_ENT_PER_PAGE; i++) { 587 for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
561 unsigned pte_access; 588 unsigned pte_access;
562 pt_element_t gpte; 589 pt_element_t gpte;
@@ -566,8 +593,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
566 if (!is_shadow_present_pte(sp->spt[i])) 593 if (!is_shadow_present_pte(sp->spt[i]))
567 continue; 594 continue;
568 595
569 pte_gpa = gfn_to_gpa(sp->gfn); 596 pte_gpa = first_pte_gpa + i * sizeof(pt_element_t);
570 pte_gpa += (i+offset) * sizeof(pt_element_t);
571 597
572 if (kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &gpte, 598 if (kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &gpte,
573 sizeof(pt_element_t))) 599 sizeof(pt_element_t)))
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 737361fcd503..96dc232bfc56 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -44,10 +44,11 @@ MODULE_LICENSE("GPL");
44#define SEG_TYPE_LDT 2 44#define SEG_TYPE_LDT 2
45#define SEG_TYPE_BUSY_TSS16 3 45#define SEG_TYPE_BUSY_TSS16 3
46 46
47#define SVM_FEATURE_NPT (1 << 0) 47#define SVM_FEATURE_NPT (1 << 0)
48#define SVM_FEATURE_LBRV (1 << 1) 48#define SVM_FEATURE_LBRV (1 << 1)
49#define SVM_FEATURE_SVML (1 << 2) 49#define SVM_FEATURE_SVML (1 << 2)
50#define SVM_FEATURE_PAUSE_FILTER (1 << 10) 50#define SVM_FEATURE_NRIP (1 << 3)
51#define SVM_FEATURE_PAUSE_FILTER (1 << 10)
51 52
52#define NESTED_EXIT_HOST 0 /* Exit handled on host level */ 53#define NESTED_EXIT_HOST 0 /* Exit handled on host level */
53#define NESTED_EXIT_DONE 1 /* Exit caused nested vmexit */ 54#define NESTED_EXIT_DONE 1 /* Exit caused nested vmexit */
@@ -70,6 +71,7 @@ struct kvm_vcpu;
70struct nested_state { 71struct nested_state {
71 struct vmcb *hsave; 72 struct vmcb *hsave;
72 u64 hsave_msr; 73 u64 hsave_msr;
74 u64 vm_cr_msr;
73 u64 vmcb; 75 u64 vmcb;
74 76
75 /* These are the merged vectors */ 77 /* These are the merged vectors */
@@ -77,6 +79,7 @@ struct nested_state {
77 79
78 /* gpa pointers to the real vectors */ 80 /* gpa pointers to the real vectors */
79 u64 vmcb_msrpm; 81 u64 vmcb_msrpm;
82 u64 vmcb_iopm;
80 83
81 /* A VMEXIT is required but not yet emulated */ 84 /* A VMEXIT is required but not yet emulated */
82 bool exit_required; 85 bool exit_required;
@@ -91,6 +94,9 @@ struct nested_state {
91 94
92}; 95};
93 96
97#define MSRPM_OFFSETS 16
98static u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
99
94struct vcpu_svm { 100struct vcpu_svm {
95 struct kvm_vcpu vcpu; 101 struct kvm_vcpu vcpu;
96 struct vmcb *vmcb; 102 struct vmcb *vmcb;
@@ -110,13 +116,39 @@ struct vcpu_svm {
110 struct nested_state nested; 116 struct nested_state nested;
111 117
112 bool nmi_singlestep; 118 bool nmi_singlestep;
119
120 unsigned int3_injected;
121 unsigned long int3_rip;
122};
123
124#define MSR_INVALID 0xffffffffU
125
126static struct svm_direct_access_msrs {
127 u32 index; /* Index of the MSR */
128 bool always; /* True if intercept is always on */
129} direct_access_msrs[] = {
130 { .index = MSR_K6_STAR, .always = true },
131 { .index = MSR_IA32_SYSENTER_CS, .always = true },
132#ifdef CONFIG_X86_64
133 { .index = MSR_GS_BASE, .always = true },
134 { .index = MSR_FS_BASE, .always = true },
135 { .index = MSR_KERNEL_GS_BASE, .always = true },
136 { .index = MSR_LSTAR, .always = true },
137 { .index = MSR_CSTAR, .always = true },
138 { .index = MSR_SYSCALL_MASK, .always = true },
139#endif
140 { .index = MSR_IA32_LASTBRANCHFROMIP, .always = false },
141 { .index = MSR_IA32_LASTBRANCHTOIP, .always = false },
142 { .index = MSR_IA32_LASTINTFROMIP, .always = false },
143 { .index = MSR_IA32_LASTINTTOIP, .always = false },
144 { .index = MSR_INVALID, .always = false },
113}; 145};
114 146
115/* enable NPT for AMD64 and X86 with PAE */ 147/* enable NPT for AMD64 and X86 with PAE */
116#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) 148#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
117static bool npt_enabled = true; 149static bool npt_enabled = true;
118#else 150#else
119static bool npt_enabled = false; 151static bool npt_enabled;
120#endif 152#endif
121static int npt = 1; 153static int npt = 1;
122 154
@@ -129,6 +161,7 @@ static void svm_flush_tlb(struct kvm_vcpu *vcpu);
129static void svm_complete_interrupts(struct vcpu_svm *svm); 161static void svm_complete_interrupts(struct vcpu_svm *svm);
130 162
131static int nested_svm_exit_handled(struct vcpu_svm *svm); 163static int nested_svm_exit_handled(struct vcpu_svm *svm);
164static int nested_svm_intercept(struct vcpu_svm *svm);
132static int nested_svm_vmexit(struct vcpu_svm *svm); 165static int nested_svm_vmexit(struct vcpu_svm *svm);
133static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, 166static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
134 bool has_error_code, u32 error_code); 167 bool has_error_code, u32 error_code);
@@ -163,8 +196,8 @@ static unsigned long iopm_base;
163struct kvm_ldttss_desc { 196struct kvm_ldttss_desc {
164 u16 limit0; 197 u16 limit0;
165 u16 base0; 198 u16 base0;
166 unsigned base1 : 8, type : 5, dpl : 2, p : 1; 199 unsigned base1:8, type:5, dpl:2, p:1;
167 unsigned limit1 : 4, zero0 : 3, g : 1, base2 : 8; 200 unsigned limit1:4, zero0:3, g:1, base2:8;
168 u32 base3; 201 u32 base3;
169 u32 zero1; 202 u32 zero1;
170} __attribute__((packed)); 203} __attribute__((packed));
@@ -194,6 +227,27 @@ static u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000};
194#define MSRS_RANGE_SIZE 2048 227#define MSRS_RANGE_SIZE 2048
195#define MSRS_IN_RANGE (MSRS_RANGE_SIZE * 8 / 2) 228#define MSRS_IN_RANGE (MSRS_RANGE_SIZE * 8 / 2)
196 229
230static u32 svm_msrpm_offset(u32 msr)
231{
232 u32 offset;
233 int i;
234
235 for (i = 0; i < NUM_MSR_MAPS; i++) {
236 if (msr < msrpm_ranges[i] ||
237 msr >= msrpm_ranges[i] + MSRS_IN_RANGE)
238 continue;
239
240 offset = (msr - msrpm_ranges[i]) / 4; /* 4 msrs per u8 */
241 offset += (i * MSRS_RANGE_SIZE); /* add range offset */
242
243 /* Now we have the u8 offset - but need the u32 offset */
244 return offset / 4;
245 }
246
247 /* MSR not in any range */
248 return MSR_INVALID;
249}
250
197#define MAX_INST_SIZE 15 251#define MAX_INST_SIZE 15
198 252
199static inline u32 svm_has(u32 feat) 253static inline u32 svm_has(u32 feat)
@@ -213,7 +267,7 @@ static inline void stgi(void)
213 267
214static inline void invlpga(unsigned long addr, u32 asid) 268static inline void invlpga(unsigned long addr, u32 asid)
215{ 269{
216 asm volatile (__ex(SVM_INVLPGA) :: "a"(addr), "c"(asid)); 270 asm volatile (__ex(SVM_INVLPGA) : : "a"(addr), "c"(asid));
217} 271}
218 272
219static inline void force_new_asid(struct kvm_vcpu *vcpu) 273static inline void force_new_asid(struct kvm_vcpu *vcpu)
@@ -235,23 +289,6 @@ static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
235 vcpu->arch.efer = efer; 289 vcpu->arch.efer = efer;
236} 290}
237 291
238static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
239 bool has_error_code, u32 error_code)
240{
241 struct vcpu_svm *svm = to_svm(vcpu);
242
243 /* If we are within a nested VM we'd better #VMEXIT and let the
244 guest handle the exception */
245 if (nested_svm_check_exception(svm, nr, has_error_code, error_code))
246 return;
247
248 svm->vmcb->control.event_inj = nr
249 | SVM_EVTINJ_VALID
250 | (has_error_code ? SVM_EVTINJ_VALID_ERR : 0)
251 | SVM_EVTINJ_TYPE_EXEPT;
252 svm->vmcb->control.event_inj_err = error_code;
253}
254
255static int is_external_interrupt(u32 info) 292static int is_external_interrupt(u32 info)
256{ 293{
257 info &= SVM_EVTINJ_TYPE_MASK | SVM_EVTINJ_VALID; 294 info &= SVM_EVTINJ_TYPE_MASK | SVM_EVTINJ_VALID;
@@ -264,7 +301,7 @@ static u32 svm_get_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
264 u32 ret = 0; 301 u32 ret = 0;
265 302
266 if (svm->vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) 303 if (svm->vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK)
267 ret |= X86_SHADOW_INT_STI | X86_SHADOW_INT_MOV_SS; 304 ret |= KVM_X86_SHADOW_INT_STI | KVM_X86_SHADOW_INT_MOV_SS;
268 return ret & mask; 305 return ret & mask;
269} 306}
270 307
@@ -283,6 +320,9 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
283{ 320{
284 struct vcpu_svm *svm = to_svm(vcpu); 321 struct vcpu_svm *svm = to_svm(vcpu);
285 322
323 if (svm->vmcb->control.next_rip != 0)
324 svm->next_rip = svm->vmcb->control.next_rip;
325
286 if (!svm->next_rip) { 326 if (!svm->next_rip) {
287 if (emulate_instruction(vcpu, 0, 0, EMULTYPE_SKIP) != 327 if (emulate_instruction(vcpu, 0, 0, EMULTYPE_SKIP) !=
288 EMULATE_DONE) 328 EMULATE_DONE)
@@ -297,6 +337,43 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
297 svm_set_interrupt_shadow(vcpu, 0); 337 svm_set_interrupt_shadow(vcpu, 0);
298} 338}
299 339
340static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
341 bool has_error_code, u32 error_code,
342 bool reinject)
343{
344 struct vcpu_svm *svm = to_svm(vcpu);
345
346 /*
347 * If we are within a nested VM we'd better #VMEXIT and let the guest
348 * handle the exception
349 */
350 if (!reinject &&
351 nested_svm_check_exception(svm, nr, has_error_code, error_code))
352 return;
353
354 if (nr == BP_VECTOR && !svm_has(SVM_FEATURE_NRIP)) {
355 unsigned long rip, old_rip = kvm_rip_read(&svm->vcpu);
356
357 /*
358 * For guest debugging where we have to reinject #BP if some
359 * INT3 is guest-owned:
360 * Emulate nRIP by moving RIP forward. Will fail if injection
361 * raises a fault that is not intercepted. Still better than
362 * failing in all cases.
363 */
364 skip_emulated_instruction(&svm->vcpu);
365 rip = kvm_rip_read(&svm->vcpu);
366 svm->int3_rip = rip + svm->vmcb->save.cs.base;
367 svm->int3_injected = rip - old_rip;
368 }
369
370 svm->vmcb->control.event_inj = nr
371 | SVM_EVTINJ_VALID
372 | (has_error_code ? SVM_EVTINJ_VALID_ERR : 0)
373 | SVM_EVTINJ_TYPE_EXEPT;
374 svm->vmcb->control.event_inj_err = error_code;
375}
376
300static int has_svm(void) 377static int has_svm(void)
301{ 378{
302 const char *msg; 379 const char *msg;
@@ -319,7 +396,7 @@ static int svm_hardware_enable(void *garbage)
319 396
320 struct svm_cpu_data *sd; 397 struct svm_cpu_data *sd;
321 uint64_t efer; 398 uint64_t efer;
322 struct descriptor_table gdt_descr; 399 struct desc_ptr gdt_descr;
323 struct desc_struct *gdt; 400 struct desc_struct *gdt;
324 int me = raw_smp_processor_id(); 401 int me = raw_smp_processor_id();
325 402
@@ -344,8 +421,8 @@ static int svm_hardware_enable(void *garbage)
344 sd->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1; 421 sd->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1;
345 sd->next_asid = sd->max_asid + 1; 422 sd->next_asid = sd->max_asid + 1;
346 423
347 kvm_get_gdt(&gdt_descr); 424 native_store_gdt(&gdt_descr);
348 gdt = (struct desc_struct *)gdt_descr.base; 425 gdt = (struct desc_struct *)gdt_descr.address;
349 sd->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS); 426 sd->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS);
350 427
351 wrmsrl(MSR_EFER, efer | EFER_SVME); 428 wrmsrl(MSR_EFER, efer | EFER_SVME);
@@ -391,42 +468,98 @@ err_1:
391 468
392} 469}
393 470
471static bool valid_msr_intercept(u32 index)
472{
473 int i;
474
475 for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++)
476 if (direct_access_msrs[i].index == index)
477 return true;
478
479 return false;
480}
481
394static void set_msr_interception(u32 *msrpm, unsigned msr, 482static void set_msr_interception(u32 *msrpm, unsigned msr,
395 int read, int write) 483 int read, int write)
396{ 484{
485 u8 bit_read, bit_write;
486 unsigned long tmp;
487 u32 offset;
488
489 /*
490 * If this warning triggers extend the direct_access_msrs list at the
491 * beginning of the file
492 */
493 WARN_ON(!valid_msr_intercept(msr));
494
495 offset = svm_msrpm_offset(msr);
496 bit_read = 2 * (msr & 0x0f);
497 bit_write = 2 * (msr & 0x0f) + 1;
498 tmp = msrpm[offset];
499
500 BUG_ON(offset == MSR_INVALID);
501
502 read ? clear_bit(bit_read, &tmp) : set_bit(bit_read, &tmp);
503 write ? clear_bit(bit_write, &tmp) : set_bit(bit_write, &tmp);
504
505 msrpm[offset] = tmp;
506}
507
508static void svm_vcpu_init_msrpm(u32 *msrpm)
509{
397 int i; 510 int i;
398 511
399 for (i = 0; i < NUM_MSR_MAPS; i++) { 512 memset(msrpm, 0xff, PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER));
400 if (msr >= msrpm_ranges[i] && 513
401 msr < msrpm_ranges[i] + MSRS_IN_RANGE) { 514 for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
402 u32 msr_offset = (i * MSRS_IN_RANGE + msr - 515 if (!direct_access_msrs[i].always)
403 msrpm_ranges[i]) * 2; 516 continue;
404 517
405 u32 *base = msrpm + (msr_offset / 32); 518 set_msr_interception(msrpm, direct_access_msrs[i].index, 1, 1);
406 u32 msr_shift = msr_offset % 32; 519 }
407 u32 mask = ((write) ? 0 : 2) | ((read) ? 0 : 1); 520}
408 *base = (*base & ~(0x3 << msr_shift)) | 521
409 (mask << msr_shift); 522static void add_msr_offset(u32 offset)
523{
524 int i;
525
526 for (i = 0; i < MSRPM_OFFSETS; ++i) {
527
528 /* Offset already in list? */
529 if (msrpm_offsets[i] == offset)
410 return; 530 return;
411 } 531
532 /* Slot used by another offset? */
533 if (msrpm_offsets[i] != MSR_INVALID)
534 continue;
535
536 /* Add offset to list */
537 msrpm_offsets[i] = offset;
538
539 return;
412 } 540 }
541
542 /*
543 * If this BUG triggers the msrpm_offsets table has an overflow. Just
544 * increase MSRPM_OFFSETS in this case.
545 */
413 BUG(); 546 BUG();
414} 547}
415 548
416static void svm_vcpu_init_msrpm(u32 *msrpm) 549static void init_msrpm_offsets(void)
417{ 550{
418 memset(msrpm, 0xff, PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER)); 551 int i;
419 552
420#ifdef CONFIG_X86_64 553 memset(msrpm_offsets, 0xff, sizeof(msrpm_offsets));
421 set_msr_interception(msrpm, MSR_GS_BASE, 1, 1); 554
422 set_msr_interception(msrpm, MSR_FS_BASE, 1, 1); 555 for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
423 set_msr_interception(msrpm, MSR_KERNEL_GS_BASE, 1, 1); 556 u32 offset;
424 set_msr_interception(msrpm, MSR_LSTAR, 1, 1); 557
425 set_msr_interception(msrpm, MSR_CSTAR, 1, 1); 558 offset = svm_msrpm_offset(direct_access_msrs[i].index);
426 set_msr_interception(msrpm, MSR_SYSCALL_MASK, 1, 1); 559 BUG_ON(offset == MSR_INVALID);
427#endif 560
428 set_msr_interception(msrpm, MSR_K6_STAR, 1, 1); 561 add_msr_offset(offset);
429 set_msr_interception(msrpm, MSR_IA32_SYSENTER_CS, 1, 1); 562 }
430} 563}
431 564
432static void svm_enable_lbrv(struct vcpu_svm *svm) 565static void svm_enable_lbrv(struct vcpu_svm *svm)
@@ -467,6 +600,8 @@ static __init int svm_hardware_setup(void)
467 memset(iopm_va, 0xff, PAGE_SIZE * (1 << IOPM_ALLOC_ORDER)); 600 memset(iopm_va, 0xff, PAGE_SIZE * (1 << IOPM_ALLOC_ORDER));
468 iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT; 601 iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT;
469 602
603 init_msrpm_offsets();
604
470 if (boot_cpu_has(X86_FEATURE_NX)) 605 if (boot_cpu_has(X86_FEATURE_NX))
471 kvm_enable_efer_bits(EFER_NX); 606 kvm_enable_efer_bits(EFER_NX);
472 607
@@ -523,7 +658,7 @@ static void init_seg(struct vmcb_seg *seg)
523{ 658{
524 seg->selector = 0; 659 seg->selector = 0;
525 seg->attrib = SVM_SELECTOR_P_MASK | SVM_SELECTOR_S_MASK | 660 seg->attrib = SVM_SELECTOR_P_MASK | SVM_SELECTOR_S_MASK |
526 SVM_SELECTOR_WRITE_MASK; /* Read/Write Data Segment */ 661 SVM_SELECTOR_WRITE_MASK; /* Read/Write Data Segment */
527 seg->limit = 0xffff; 662 seg->limit = 0xffff;
528 seg->base = 0; 663 seg->base = 0;
529} 664}
@@ -543,16 +678,16 @@ static void init_vmcb(struct vcpu_svm *svm)
543 678
544 svm->vcpu.fpu_active = 1; 679 svm->vcpu.fpu_active = 1;
545 680
546 control->intercept_cr_read = INTERCEPT_CR0_MASK | 681 control->intercept_cr_read = INTERCEPT_CR0_MASK |
547 INTERCEPT_CR3_MASK | 682 INTERCEPT_CR3_MASK |
548 INTERCEPT_CR4_MASK; 683 INTERCEPT_CR4_MASK;
549 684
550 control->intercept_cr_write = INTERCEPT_CR0_MASK | 685 control->intercept_cr_write = INTERCEPT_CR0_MASK |
551 INTERCEPT_CR3_MASK | 686 INTERCEPT_CR3_MASK |
552 INTERCEPT_CR4_MASK | 687 INTERCEPT_CR4_MASK |
553 INTERCEPT_CR8_MASK; 688 INTERCEPT_CR8_MASK;
554 689
555 control->intercept_dr_read = INTERCEPT_DR0_MASK | 690 control->intercept_dr_read = INTERCEPT_DR0_MASK |
556 INTERCEPT_DR1_MASK | 691 INTERCEPT_DR1_MASK |
557 INTERCEPT_DR2_MASK | 692 INTERCEPT_DR2_MASK |
558 INTERCEPT_DR3_MASK | 693 INTERCEPT_DR3_MASK |
@@ -561,7 +696,7 @@ static void init_vmcb(struct vcpu_svm *svm)
561 INTERCEPT_DR6_MASK | 696 INTERCEPT_DR6_MASK |
562 INTERCEPT_DR7_MASK; 697 INTERCEPT_DR7_MASK;
563 698
564 control->intercept_dr_write = INTERCEPT_DR0_MASK | 699 control->intercept_dr_write = INTERCEPT_DR0_MASK |
565 INTERCEPT_DR1_MASK | 700 INTERCEPT_DR1_MASK |
566 INTERCEPT_DR2_MASK | 701 INTERCEPT_DR2_MASK |
567 INTERCEPT_DR3_MASK | 702 INTERCEPT_DR3_MASK |
@@ -575,7 +710,7 @@ static void init_vmcb(struct vcpu_svm *svm)
575 (1 << MC_VECTOR); 710 (1 << MC_VECTOR);
576 711
577 712
578 control->intercept = (1ULL << INTERCEPT_INTR) | 713 control->intercept = (1ULL << INTERCEPT_INTR) |
579 (1ULL << INTERCEPT_NMI) | 714 (1ULL << INTERCEPT_NMI) |
580 (1ULL << INTERCEPT_SMI) | 715 (1ULL << INTERCEPT_SMI) |
581 (1ULL << INTERCEPT_SELECTIVE_CR0) | 716 (1ULL << INTERCEPT_SELECTIVE_CR0) |
@@ -636,7 +771,8 @@ static void init_vmcb(struct vcpu_svm *svm)
636 save->rip = 0x0000fff0; 771 save->rip = 0x0000fff0;
637 svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip; 772 svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip;
638 773
639 /* This is the guest-visible cr0 value. 774 /*
775 * This is the guest-visible cr0 value.
640 * svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0. 776 * svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0.
641 */ 777 */
642 svm->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET; 778 svm->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET;
@@ -729,6 +865,7 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
729 svm_vcpu_init_msrpm(svm->msrpm); 865 svm_vcpu_init_msrpm(svm->msrpm);
730 866
731 svm->nested.msrpm = page_address(nested_msrpm_pages); 867 svm->nested.msrpm = page_address(nested_msrpm_pages);
868 svm_vcpu_init_msrpm(svm->nested.msrpm);
732 869
733 svm->vmcb = page_address(page); 870 svm->vmcb = page_address(page);
734 clear_page(svm->vmcb); 871 clear_page(svm->vmcb);
@@ -882,7 +1019,8 @@ static void svm_get_segment(struct kvm_vcpu *vcpu,
882 var->db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1; 1019 var->db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1;
883 var->g = (s->attrib >> SVM_SELECTOR_G_SHIFT) & 1; 1020 var->g = (s->attrib >> SVM_SELECTOR_G_SHIFT) & 1;
884 1021
885 /* AMD's VMCB does not have an explicit unusable field, so emulate it 1022 /*
1023 * AMD's VMCB does not have an explicit unusable field, so emulate it
886 * for cross vendor migration purposes by "not present" 1024 * for cross vendor migration purposes by "not present"
887 */ 1025 */
888 var->unusable = !var->present || (var->type == 0); 1026 var->unusable = !var->present || (var->type == 0);
@@ -918,7 +1056,8 @@ static void svm_get_segment(struct kvm_vcpu *vcpu,
918 var->type |= 0x1; 1056 var->type |= 0x1;
919 break; 1057 break;
920 case VCPU_SREG_SS: 1058 case VCPU_SREG_SS:
921 /* On AMD CPUs sometimes the DB bit in the segment 1059 /*
1060 * On AMD CPUs sometimes the DB bit in the segment
922 * descriptor is left as 1, although the whole segment has 1061 * descriptor is left as 1, although the whole segment has
923 * been made unusable. Clear it here to pass an Intel VMX 1062 * been made unusable. Clear it here to pass an Intel VMX
924 * entry check when cross vendor migrating. 1063 * entry check when cross vendor migrating.
@@ -936,36 +1075,36 @@ static int svm_get_cpl(struct kvm_vcpu *vcpu)
936 return save->cpl; 1075 return save->cpl;
937} 1076}
938 1077
939static void svm_get_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) 1078static void svm_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
940{ 1079{
941 struct vcpu_svm *svm = to_svm(vcpu); 1080 struct vcpu_svm *svm = to_svm(vcpu);
942 1081
943 dt->limit = svm->vmcb->save.idtr.limit; 1082 dt->size = svm->vmcb->save.idtr.limit;
944 dt->base = svm->vmcb->save.idtr.base; 1083 dt->address = svm->vmcb->save.idtr.base;
945} 1084}
946 1085
947static void svm_set_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) 1086static void svm_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
948{ 1087{
949 struct vcpu_svm *svm = to_svm(vcpu); 1088 struct vcpu_svm *svm = to_svm(vcpu);
950 1089
951 svm->vmcb->save.idtr.limit = dt->limit; 1090 svm->vmcb->save.idtr.limit = dt->size;
952 svm->vmcb->save.idtr.base = dt->base ; 1091 svm->vmcb->save.idtr.base = dt->address ;
953} 1092}
954 1093
955static void svm_get_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) 1094static void svm_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
956{ 1095{
957 struct vcpu_svm *svm = to_svm(vcpu); 1096 struct vcpu_svm *svm = to_svm(vcpu);
958 1097
959 dt->limit = svm->vmcb->save.gdtr.limit; 1098 dt->size = svm->vmcb->save.gdtr.limit;
960 dt->base = svm->vmcb->save.gdtr.base; 1099 dt->address = svm->vmcb->save.gdtr.base;
961} 1100}
962 1101
963static void svm_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) 1102static void svm_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
964{ 1103{
965 struct vcpu_svm *svm = to_svm(vcpu); 1104 struct vcpu_svm *svm = to_svm(vcpu);
966 1105
967 svm->vmcb->save.gdtr.limit = dt->limit; 1106 svm->vmcb->save.gdtr.limit = dt->size;
968 svm->vmcb->save.gdtr.base = dt->base ; 1107 svm->vmcb->save.gdtr.base = dt->address ;
969} 1108}
970 1109
971static void svm_decache_cr0_guest_bits(struct kvm_vcpu *vcpu) 1110static void svm_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
@@ -978,6 +1117,7 @@ static void svm_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
978 1117
979static void update_cr0_intercept(struct vcpu_svm *svm) 1118static void update_cr0_intercept(struct vcpu_svm *svm)
980{ 1119{
1120 struct vmcb *vmcb = svm->vmcb;
981 ulong gcr0 = svm->vcpu.arch.cr0; 1121 ulong gcr0 = svm->vcpu.arch.cr0;
982 u64 *hcr0 = &svm->vmcb->save.cr0; 1122 u64 *hcr0 = &svm->vmcb->save.cr0;
983 1123
@@ -989,11 +1129,25 @@ static void update_cr0_intercept(struct vcpu_svm *svm)
989 1129
990 1130
991 if (gcr0 == *hcr0 && svm->vcpu.fpu_active) { 1131 if (gcr0 == *hcr0 && svm->vcpu.fpu_active) {
992 svm->vmcb->control.intercept_cr_read &= ~INTERCEPT_CR0_MASK; 1132 vmcb->control.intercept_cr_read &= ~INTERCEPT_CR0_MASK;
993 svm->vmcb->control.intercept_cr_write &= ~INTERCEPT_CR0_MASK; 1133 vmcb->control.intercept_cr_write &= ~INTERCEPT_CR0_MASK;
1134 if (is_nested(svm)) {
1135 struct vmcb *hsave = svm->nested.hsave;
1136
1137 hsave->control.intercept_cr_read &= ~INTERCEPT_CR0_MASK;
1138 hsave->control.intercept_cr_write &= ~INTERCEPT_CR0_MASK;
1139 vmcb->control.intercept_cr_read |= svm->nested.intercept_cr_read;
1140 vmcb->control.intercept_cr_write |= svm->nested.intercept_cr_write;
1141 }
994 } else { 1142 } else {
995 svm->vmcb->control.intercept_cr_read |= INTERCEPT_CR0_MASK; 1143 svm->vmcb->control.intercept_cr_read |= INTERCEPT_CR0_MASK;
996 svm->vmcb->control.intercept_cr_write |= INTERCEPT_CR0_MASK; 1144 svm->vmcb->control.intercept_cr_write |= INTERCEPT_CR0_MASK;
1145 if (is_nested(svm)) {
1146 struct vmcb *hsave = svm->nested.hsave;
1147
1148 hsave->control.intercept_cr_read |= INTERCEPT_CR0_MASK;
1149 hsave->control.intercept_cr_write |= INTERCEPT_CR0_MASK;
1150 }
997 } 1151 }
998} 1152}
999 1153
@@ -1001,6 +1155,27 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
1001{ 1155{
1002 struct vcpu_svm *svm = to_svm(vcpu); 1156 struct vcpu_svm *svm = to_svm(vcpu);
1003 1157
1158 if (is_nested(svm)) {
1159 /*
1160 * We are here because we run in nested mode, the host kvm
1161 * intercepts cr0 writes but the l1 hypervisor does not.
1162 * But the L1 hypervisor may intercept selective cr0 writes.
1163 * This needs to be checked here.
1164 */
1165 unsigned long old, new;
1166
1167 /* Remove bits that would trigger a real cr0 write intercept */
1168 old = vcpu->arch.cr0 & SVM_CR0_SELECTIVE_MASK;
1169 new = cr0 & SVM_CR0_SELECTIVE_MASK;
1170
1171 if (old == new) {
1172 /* cr0 write with ts and mp unchanged */
1173 svm->vmcb->control.exit_code = SVM_EXIT_CR0_SEL_WRITE;
1174 if (nested_svm_exit_handled(svm) == NESTED_EXIT_DONE)
1175 return;
1176 }
1177 }
1178
1004#ifdef CONFIG_X86_64 1179#ifdef CONFIG_X86_64
1005 if (vcpu->arch.efer & EFER_LME) { 1180 if (vcpu->arch.efer & EFER_LME) {
1006 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) { 1181 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
@@ -1134,70 +1309,11 @@ static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)
1134 svm->vmcb->control.asid = sd->next_asid++; 1309 svm->vmcb->control.asid = sd->next_asid++;
1135} 1310}
1136 1311
1137static int svm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *dest) 1312static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value)
1138{ 1313{
1139 struct vcpu_svm *svm = to_svm(vcpu); 1314 struct vcpu_svm *svm = to_svm(vcpu);
1140 1315
1141 switch (dr) { 1316 svm->vmcb->save.dr7 = value;
1142 case 0 ... 3:
1143 *dest = vcpu->arch.db[dr];
1144 break;
1145 case 4:
1146 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
1147 return EMULATE_FAIL; /* will re-inject UD */
1148 /* fall through */
1149 case 6:
1150 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
1151 *dest = vcpu->arch.dr6;
1152 else
1153 *dest = svm->vmcb->save.dr6;
1154 break;
1155 case 5:
1156 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
1157 return EMULATE_FAIL; /* will re-inject UD */
1158 /* fall through */
1159 case 7:
1160 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
1161 *dest = vcpu->arch.dr7;
1162 else
1163 *dest = svm->vmcb->save.dr7;
1164 break;
1165 }
1166
1167 return EMULATE_DONE;
1168}
1169
1170static int svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value)
1171{
1172 struct vcpu_svm *svm = to_svm(vcpu);
1173
1174 switch (dr) {
1175 case 0 ... 3:
1176 vcpu->arch.db[dr] = value;
1177 if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
1178 vcpu->arch.eff_db[dr] = value;
1179 break;
1180 case 4:
1181 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
1182 return EMULATE_FAIL; /* will re-inject UD */
1183 /* fall through */
1184 case 6:
1185 vcpu->arch.dr6 = (value & DR6_VOLATILE) | DR6_FIXED_1;
1186 break;
1187 case 5:
1188 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
1189 return EMULATE_FAIL; /* will re-inject UD */
1190 /* fall through */
1191 case 7:
1192 vcpu->arch.dr7 = (value & DR7_VOLATILE) | DR7_FIXED_1;
1193 if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
1194 svm->vmcb->save.dr7 = vcpu->arch.dr7;
1195 vcpu->arch.switch_db_regs = (value & DR7_BP_EN_MASK);
1196 }
1197 break;
1198 }
1199
1200 return EMULATE_DONE;
1201} 1317}
1202 1318
1203static int pf_interception(struct vcpu_svm *svm) 1319static int pf_interception(struct vcpu_svm *svm)
@@ -1234,7 +1350,7 @@ static int db_interception(struct vcpu_svm *svm)
1234 } 1350 }
1235 1351
1236 if (svm->vcpu.guest_debug & 1352 if (svm->vcpu.guest_debug &
1237 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)){ 1353 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) {
1238 kvm_run->exit_reason = KVM_EXIT_DEBUG; 1354 kvm_run->exit_reason = KVM_EXIT_DEBUG;
1239 kvm_run->debug.arch.pc = 1355 kvm_run->debug.arch.pc =
1240 svm->vmcb->save.cs.base + svm->vmcb->save.rip; 1356 svm->vmcb->save.cs.base + svm->vmcb->save.rip;
@@ -1268,7 +1384,22 @@ static int ud_interception(struct vcpu_svm *svm)
1268static void svm_fpu_activate(struct kvm_vcpu *vcpu) 1384static void svm_fpu_activate(struct kvm_vcpu *vcpu)
1269{ 1385{
1270 struct vcpu_svm *svm = to_svm(vcpu); 1386 struct vcpu_svm *svm = to_svm(vcpu);
1271 svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR); 1387 u32 excp;
1388
1389 if (is_nested(svm)) {
1390 u32 h_excp, n_excp;
1391
1392 h_excp = svm->nested.hsave->control.intercept_exceptions;
1393 n_excp = svm->nested.intercept_exceptions;
1394 h_excp &= ~(1 << NM_VECTOR);
1395 excp = h_excp | n_excp;
1396 } else {
1397 excp = svm->vmcb->control.intercept_exceptions;
1398 excp &= ~(1 << NM_VECTOR);
1399 }
1400
1401 svm->vmcb->control.intercept_exceptions = excp;
1402
1272 svm->vcpu.fpu_active = 1; 1403 svm->vcpu.fpu_active = 1;
1273 update_cr0_intercept(svm); 1404 update_cr0_intercept(svm);
1274} 1405}
@@ -1309,29 +1440,23 @@ static int shutdown_interception(struct vcpu_svm *svm)
1309 1440
1310static int io_interception(struct vcpu_svm *svm) 1441static int io_interception(struct vcpu_svm *svm)
1311{ 1442{
1443 struct kvm_vcpu *vcpu = &svm->vcpu;
1312 u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */ 1444 u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */
1313 int size, in, string; 1445 int size, in, string;
1314 unsigned port; 1446 unsigned port;
1315 1447
1316 ++svm->vcpu.stat.io_exits; 1448 ++svm->vcpu.stat.io_exits;
1317
1318 svm->next_rip = svm->vmcb->control.exit_info_2;
1319
1320 string = (io_info & SVM_IOIO_STR_MASK) != 0; 1449 string = (io_info & SVM_IOIO_STR_MASK) != 0;
1321
1322 if (string) {
1323 if (emulate_instruction(&svm->vcpu,
1324 0, 0, 0) == EMULATE_DO_MMIO)
1325 return 0;
1326 return 1;
1327 }
1328
1329 in = (io_info & SVM_IOIO_TYPE_MASK) != 0; 1450 in = (io_info & SVM_IOIO_TYPE_MASK) != 0;
1451 if (string || in)
1452 return !(emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DO_MMIO);
1453
1330 port = io_info >> 16; 1454 port = io_info >> 16;
1331 size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT; 1455 size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT;
1332 1456 svm->next_rip = svm->vmcb->control.exit_info_2;
1333 skip_emulated_instruction(&svm->vcpu); 1457 skip_emulated_instruction(&svm->vcpu);
1334 return kvm_emulate_pio(&svm->vcpu, in, size, port); 1458
1459 return kvm_fast_pio_out(vcpu, size, port);
1335} 1460}
1336 1461
1337static int nmi_interception(struct vcpu_svm *svm) 1462static int nmi_interception(struct vcpu_svm *svm)
@@ -1384,6 +1509,8 @@ static int nested_svm_check_permissions(struct vcpu_svm *svm)
1384static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, 1509static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
1385 bool has_error_code, u32 error_code) 1510 bool has_error_code, u32 error_code)
1386{ 1511{
1512 int vmexit;
1513
1387 if (!is_nested(svm)) 1514 if (!is_nested(svm))
1388 return 0; 1515 return 0;
1389 1516
@@ -1392,21 +1519,28 @@ static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
1392 svm->vmcb->control.exit_info_1 = error_code; 1519 svm->vmcb->control.exit_info_1 = error_code;
1393 svm->vmcb->control.exit_info_2 = svm->vcpu.arch.cr2; 1520 svm->vmcb->control.exit_info_2 = svm->vcpu.arch.cr2;
1394 1521
1395 return nested_svm_exit_handled(svm); 1522 vmexit = nested_svm_intercept(svm);
1523 if (vmexit == NESTED_EXIT_DONE)
1524 svm->nested.exit_required = true;
1525
1526 return vmexit;
1396} 1527}
1397 1528
1398static inline int nested_svm_intr(struct vcpu_svm *svm) 1529/* This function returns true if it is save to enable the irq window */
1530static inline bool nested_svm_intr(struct vcpu_svm *svm)
1399{ 1531{
1400 if (!is_nested(svm)) 1532 if (!is_nested(svm))
1401 return 0; 1533 return true;
1402 1534
1403 if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK)) 1535 if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK))
1404 return 0; 1536 return true;
1405 1537
1406 if (!(svm->vcpu.arch.hflags & HF_HIF_MASK)) 1538 if (!(svm->vcpu.arch.hflags & HF_HIF_MASK))
1407 return 0; 1539 return false;
1408 1540
1409 svm->vmcb->control.exit_code = SVM_EXIT_INTR; 1541 svm->vmcb->control.exit_code = SVM_EXIT_INTR;
1542 svm->vmcb->control.exit_info_1 = 0;
1543 svm->vmcb->control.exit_info_2 = 0;
1410 1544
1411 if (svm->nested.intercept & 1ULL) { 1545 if (svm->nested.intercept & 1ULL) {
1412 /* 1546 /*
@@ -1417,21 +1551,40 @@ static inline int nested_svm_intr(struct vcpu_svm *svm)
1417 */ 1551 */
1418 svm->nested.exit_required = true; 1552 svm->nested.exit_required = true;
1419 trace_kvm_nested_intr_vmexit(svm->vmcb->save.rip); 1553 trace_kvm_nested_intr_vmexit(svm->vmcb->save.rip);
1420 return 1; 1554 return false;
1421 } 1555 }
1422 1556
1423 return 0; 1557 return true;
1558}
1559
1560/* This function returns true if it is save to enable the nmi window */
1561static inline bool nested_svm_nmi(struct vcpu_svm *svm)
1562{
1563 if (!is_nested(svm))
1564 return true;
1565
1566 if (!(svm->nested.intercept & (1ULL << INTERCEPT_NMI)))
1567 return true;
1568
1569 svm->vmcb->control.exit_code = SVM_EXIT_NMI;
1570 svm->nested.exit_required = true;
1571
1572 return false;
1424} 1573}
1425 1574
1426static void *nested_svm_map(struct vcpu_svm *svm, u64 gpa, enum km_type idx) 1575static void *nested_svm_map(struct vcpu_svm *svm, u64 gpa, struct page **_page)
1427{ 1576{
1428 struct page *page; 1577 struct page *page;
1429 1578
1579 might_sleep();
1580
1430 page = gfn_to_page(svm->vcpu.kvm, gpa >> PAGE_SHIFT); 1581 page = gfn_to_page(svm->vcpu.kvm, gpa >> PAGE_SHIFT);
1431 if (is_error_page(page)) 1582 if (is_error_page(page))
1432 goto error; 1583 goto error;
1433 1584
1434 return kmap_atomic(page, idx); 1585 *_page = page;
1586
1587 return kmap(page);
1435 1588
1436error: 1589error:
1437 kvm_release_page_clean(page); 1590 kvm_release_page_clean(page);
@@ -1440,61 +1593,55 @@ error:
1440 return NULL; 1593 return NULL;
1441} 1594}
1442 1595
1443static void nested_svm_unmap(void *addr, enum km_type idx) 1596static void nested_svm_unmap(struct page *page)
1444{ 1597{
1445 struct page *page; 1598 kunmap(page);
1599 kvm_release_page_dirty(page);
1600}
1446 1601
1447 if (!addr) 1602static int nested_svm_intercept_ioio(struct vcpu_svm *svm)
1448 return; 1603{
1604 unsigned port;
1605 u8 val, bit;
1606 u64 gpa;
1449 1607
1450 page = kmap_atomic_to_page(addr); 1608 if (!(svm->nested.intercept & (1ULL << INTERCEPT_IOIO_PROT)))
1609 return NESTED_EXIT_HOST;
1451 1610
1452 kunmap_atomic(addr, idx); 1611 port = svm->vmcb->control.exit_info_1 >> 16;
1453 kvm_release_page_dirty(page); 1612 gpa = svm->nested.vmcb_iopm + (port / 8);
1613 bit = port % 8;
1614 val = 0;
1615
1616 if (kvm_read_guest(svm->vcpu.kvm, gpa, &val, 1))
1617 val &= (1 << bit);
1618
1619 return val ? NESTED_EXIT_DONE : NESTED_EXIT_HOST;
1454} 1620}
1455 1621
1456static bool nested_svm_exit_handled_msr(struct vcpu_svm *svm) 1622static int nested_svm_exit_handled_msr(struct vcpu_svm *svm)
1457{ 1623{
1458 u32 param = svm->vmcb->control.exit_info_1 & 1; 1624 u32 offset, msr, value;
1459 u32 msr = svm->vcpu.arch.regs[VCPU_REGS_RCX]; 1625 int write, mask;
1460 bool ret = false;
1461 u32 t0, t1;
1462 u8 *msrpm;
1463 1626
1464 if (!(svm->nested.intercept & (1ULL << INTERCEPT_MSR_PROT))) 1627 if (!(svm->nested.intercept & (1ULL << INTERCEPT_MSR_PROT)))
1465 return false; 1628 return NESTED_EXIT_HOST;
1466 1629
1467 msrpm = nested_svm_map(svm, svm->nested.vmcb_msrpm, KM_USER0); 1630 msr = svm->vcpu.arch.regs[VCPU_REGS_RCX];
1631 offset = svm_msrpm_offset(msr);
1632 write = svm->vmcb->control.exit_info_1 & 1;
1633 mask = 1 << ((2 * (msr & 0xf)) + write);
1468 1634
1469 if (!msrpm) 1635 if (offset == MSR_INVALID)
1470 goto out; 1636 return NESTED_EXIT_DONE;
1471 1637
1472 switch (msr) { 1638 /* Offset is in 32 bit units but need in 8 bit units */
1473 case 0 ... 0x1fff: 1639 offset *= 4;
1474 t0 = (msr * 2) % 8;
1475 t1 = msr / 8;
1476 break;
1477 case 0xc0000000 ... 0xc0001fff:
1478 t0 = (8192 + msr - 0xc0000000) * 2;
1479 t1 = (t0 / 8);
1480 t0 %= 8;
1481 break;
1482 case 0xc0010000 ... 0xc0011fff:
1483 t0 = (16384 + msr - 0xc0010000) * 2;
1484 t1 = (t0 / 8);
1485 t0 %= 8;
1486 break;
1487 default:
1488 ret = true;
1489 goto out;
1490 }
1491 1640
1492 ret = msrpm[t1] & ((1 << param) << t0); 1641 if (kvm_read_guest(svm->vcpu.kvm, svm->nested.vmcb_msrpm + offset, &value, 4))
1493 1642 return NESTED_EXIT_DONE;
1494out:
1495 nested_svm_unmap(msrpm, KM_USER0);
1496 1643
1497 return ret; 1644 return (value & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST;
1498} 1645}
1499 1646
1500static int nested_svm_exit_special(struct vcpu_svm *svm) 1647static int nested_svm_exit_special(struct vcpu_svm *svm)
@@ -1504,17 +1651,21 @@ static int nested_svm_exit_special(struct vcpu_svm *svm)
1504 switch (exit_code) { 1651 switch (exit_code) {
1505 case SVM_EXIT_INTR: 1652 case SVM_EXIT_INTR:
1506 case SVM_EXIT_NMI: 1653 case SVM_EXIT_NMI:
1654 case SVM_EXIT_EXCP_BASE + MC_VECTOR:
1507 return NESTED_EXIT_HOST; 1655 return NESTED_EXIT_HOST;
1508 /* For now we are always handling NPFs when using them */
1509 case SVM_EXIT_NPF: 1656 case SVM_EXIT_NPF:
1657 /* For now we are always handling NPFs when using them */
1510 if (npt_enabled) 1658 if (npt_enabled)
1511 return NESTED_EXIT_HOST; 1659 return NESTED_EXIT_HOST;
1512 break; 1660 break;
1513 /* When we're shadowing, trap PFs */
1514 case SVM_EXIT_EXCP_BASE + PF_VECTOR: 1661 case SVM_EXIT_EXCP_BASE + PF_VECTOR:
1662 /* When we're shadowing, trap PFs */
1515 if (!npt_enabled) 1663 if (!npt_enabled)
1516 return NESTED_EXIT_HOST; 1664 return NESTED_EXIT_HOST;
1517 break; 1665 break;
1666 case SVM_EXIT_EXCP_BASE + NM_VECTOR:
1667 nm_interception(svm);
1668 break;
1518 default: 1669 default:
1519 break; 1670 break;
1520 } 1671 }
@@ -1525,7 +1676,7 @@ static int nested_svm_exit_special(struct vcpu_svm *svm)
1525/* 1676/*
1526 * If this function returns true, this #vmexit was already handled 1677 * If this function returns true, this #vmexit was already handled
1527 */ 1678 */
1528static int nested_svm_exit_handled(struct vcpu_svm *svm) 1679static int nested_svm_intercept(struct vcpu_svm *svm)
1529{ 1680{
1530 u32 exit_code = svm->vmcb->control.exit_code; 1681 u32 exit_code = svm->vmcb->control.exit_code;
1531 int vmexit = NESTED_EXIT_HOST; 1682 int vmexit = NESTED_EXIT_HOST;
@@ -1534,6 +1685,9 @@ static int nested_svm_exit_handled(struct vcpu_svm *svm)
1534 case SVM_EXIT_MSR: 1685 case SVM_EXIT_MSR:
1535 vmexit = nested_svm_exit_handled_msr(svm); 1686 vmexit = nested_svm_exit_handled_msr(svm);
1536 break; 1687 break;
1688 case SVM_EXIT_IOIO:
1689 vmexit = nested_svm_intercept_ioio(svm);
1690 break;
1537 case SVM_EXIT_READ_CR0 ... SVM_EXIT_READ_CR8: { 1691 case SVM_EXIT_READ_CR0 ... SVM_EXIT_READ_CR8: {
1538 u32 cr_bits = 1 << (exit_code - SVM_EXIT_READ_CR0); 1692 u32 cr_bits = 1 << (exit_code - SVM_EXIT_READ_CR0);
1539 if (svm->nested.intercept_cr_read & cr_bits) 1693 if (svm->nested.intercept_cr_read & cr_bits)
@@ -1564,6 +1718,10 @@ static int nested_svm_exit_handled(struct vcpu_svm *svm)
1564 vmexit = NESTED_EXIT_DONE; 1718 vmexit = NESTED_EXIT_DONE;
1565 break; 1719 break;
1566 } 1720 }
1721 case SVM_EXIT_ERR: {
1722 vmexit = NESTED_EXIT_DONE;
1723 break;
1724 }
1567 default: { 1725 default: {
1568 u64 exit_bits = 1ULL << (exit_code - SVM_EXIT_INTR); 1726 u64 exit_bits = 1ULL << (exit_code - SVM_EXIT_INTR);
1569 if (svm->nested.intercept & exit_bits) 1727 if (svm->nested.intercept & exit_bits)
@@ -1571,9 +1729,17 @@ static int nested_svm_exit_handled(struct vcpu_svm *svm)
1571 } 1729 }
1572 } 1730 }
1573 1731
1574 if (vmexit == NESTED_EXIT_DONE) { 1732 return vmexit;
1733}
1734
1735static int nested_svm_exit_handled(struct vcpu_svm *svm)
1736{
1737 int vmexit;
1738
1739 vmexit = nested_svm_intercept(svm);
1740
1741 if (vmexit == NESTED_EXIT_DONE)
1575 nested_svm_vmexit(svm); 1742 nested_svm_vmexit(svm);
1576 }
1577 1743
1578 return vmexit; 1744 return vmexit;
1579} 1745}
@@ -1615,6 +1781,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
1615 struct vmcb *nested_vmcb; 1781 struct vmcb *nested_vmcb;
1616 struct vmcb *hsave = svm->nested.hsave; 1782 struct vmcb *hsave = svm->nested.hsave;
1617 struct vmcb *vmcb = svm->vmcb; 1783 struct vmcb *vmcb = svm->vmcb;
1784 struct page *page;
1618 1785
1619 trace_kvm_nested_vmexit_inject(vmcb->control.exit_code, 1786 trace_kvm_nested_vmexit_inject(vmcb->control.exit_code,
1620 vmcb->control.exit_info_1, 1787 vmcb->control.exit_info_1,
@@ -1622,10 +1789,13 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
1622 vmcb->control.exit_int_info, 1789 vmcb->control.exit_int_info,
1623 vmcb->control.exit_int_info_err); 1790 vmcb->control.exit_int_info_err);
1624 1791
1625 nested_vmcb = nested_svm_map(svm, svm->nested.vmcb, KM_USER0); 1792 nested_vmcb = nested_svm_map(svm, svm->nested.vmcb, &page);
1626 if (!nested_vmcb) 1793 if (!nested_vmcb)
1627 return 1; 1794 return 1;
1628 1795
1796 /* Exit nested SVM mode */
1797 svm->nested.vmcb = 0;
1798
1629 /* Give the current vmcb to the guest */ 1799 /* Give the current vmcb to the guest */
1630 disable_gif(svm); 1800 disable_gif(svm);
1631 1801
@@ -1635,9 +1805,10 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
1635 nested_vmcb->save.ds = vmcb->save.ds; 1805 nested_vmcb->save.ds = vmcb->save.ds;
1636 nested_vmcb->save.gdtr = vmcb->save.gdtr; 1806 nested_vmcb->save.gdtr = vmcb->save.gdtr;
1637 nested_vmcb->save.idtr = vmcb->save.idtr; 1807 nested_vmcb->save.idtr = vmcb->save.idtr;
1638 if (npt_enabled) 1808 nested_vmcb->save.cr0 = kvm_read_cr0(&svm->vcpu);
1639 nested_vmcb->save.cr3 = vmcb->save.cr3; 1809 nested_vmcb->save.cr3 = svm->vcpu.arch.cr3;
1640 nested_vmcb->save.cr2 = vmcb->save.cr2; 1810 nested_vmcb->save.cr2 = vmcb->save.cr2;
1811 nested_vmcb->save.cr4 = svm->vcpu.arch.cr4;
1641 nested_vmcb->save.rflags = vmcb->save.rflags; 1812 nested_vmcb->save.rflags = vmcb->save.rflags;
1642 nested_vmcb->save.rip = vmcb->save.rip; 1813 nested_vmcb->save.rip = vmcb->save.rip;
1643 nested_vmcb->save.rsp = vmcb->save.rsp; 1814 nested_vmcb->save.rsp = vmcb->save.rsp;
@@ -1709,10 +1880,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
1709 svm->vmcb->save.cpl = 0; 1880 svm->vmcb->save.cpl = 0;
1710 svm->vmcb->control.exit_int_info = 0; 1881 svm->vmcb->control.exit_int_info = 0;
1711 1882
1712 /* Exit nested SVM mode */ 1883 nested_svm_unmap(page);
1713 svm->nested.vmcb = 0;
1714
1715 nested_svm_unmap(nested_vmcb, KM_USER0);
1716 1884
1717 kvm_mmu_reset_context(&svm->vcpu); 1885 kvm_mmu_reset_context(&svm->vcpu);
1718 kvm_mmu_load(&svm->vcpu); 1886 kvm_mmu_load(&svm->vcpu);
@@ -1722,19 +1890,33 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
1722 1890
1723static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm) 1891static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
1724{ 1892{
1725 u32 *nested_msrpm; 1893 /*
1894 * This function merges the msr permission bitmaps of kvm and the
1895 * nested vmcb. It is omptimized in that it only merges the parts where
1896 * the kvm msr permission bitmap may contain zero bits
1897 */
1726 int i; 1898 int i;
1727 1899
1728 nested_msrpm = nested_svm_map(svm, svm->nested.vmcb_msrpm, KM_USER0); 1900 if (!(svm->nested.intercept & (1ULL << INTERCEPT_MSR_PROT)))
1729 if (!nested_msrpm) 1901 return true;
1730 return false;
1731 1902
1732 for (i=0; i< PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER) / 4; i++) 1903 for (i = 0; i < MSRPM_OFFSETS; i++) {
1733 svm->nested.msrpm[i] = svm->msrpm[i] | nested_msrpm[i]; 1904 u32 value, p;
1905 u64 offset;
1734 1906
1735 svm->vmcb->control.msrpm_base_pa = __pa(svm->nested.msrpm); 1907 if (msrpm_offsets[i] == 0xffffffff)
1908 break;
1909
1910 p = msrpm_offsets[i];
1911 offset = svm->nested.vmcb_msrpm + (p * 4);
1912
1913 if (kvm_read_guest(svm->vcpu.kvm, offset, &value, 4))
1914 return false;
1915
1916 svm->nested.msrpm[p] = svm->msrpm[p] | value;
1917 }
1736 1918
1737 nested_svm_unmap(nested_msrpm, KM_USER0); 1919 svm->vmcb->control.msrpm_base_pa = __pa(svm->nested.msrpm);
1738 1920
1739 return true; 1921 return true;
1740} 1922}
@@ -1744,26 +1926,34 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
1744 struct vmcb *nested_vmcb; 1926 struct vmcb *nested_vmcb;
1745 struct vmcb *hsave = svm->nested.hsave; 1927 struct vmcb *hsave = svm->nested.hsave;
1746 struct vmcb *vmcb = svm->vmcb; 1928 struct vmcb *vmcb = svm->vmcb;
1929 struct page *page;
1930 u64 vmcb_gpa;
1747 1931
1748 nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, KM_USER0); 1932 vmcb_gpa = svm->vmcb->save.rax;
1933
1934 nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page);
1749 if (!nested_vmcb) 1935 if (!nested_vmcb)
1750 return false; 1936 return false;
1751 1937
1752 /* nested_vmcb is our indicator if nested SVM is activated */ 1938 trace_kvm_nested_vmrun(svm->vmcb->save.rip - 3, vmcb_gpa,
1753 svm->nested.vmcb = svm->vmcb->save.rax;
1754
1755 trace_kvm_nested_vmrun(svm->vmcb->save.rip - 3, svm->nested.vmcb,
1756 nested_vmcb->save.rip, 1939 nested_vmcb->save.rip,
1757 nested_vmcb->control.int_ctl, 1940 nested_vmcb->control.int_ctl,
1758 nested_vmcb->control.event_inj, 1941 nested_vmcb->control.event_inj,
1759 nested_vmcb->control.nested_ctl); 1942 nested_vmcb->control.nested_ctl);
1760 1943
1944 trace_kvm_nested_intercepts(nested_vmcb->control.intercept_cr_read,
1945 nested_vmcb->control.intercept_cr_write,
1946 nested_vmcb->control.intercept_exceptions,
1947 nested_vmcb->control.intercept);
1948
1761 /* Clear internal status */ 1949 /* Clear internal status */
1762 kvm_clear_exception_queue(&svm->vcpu); 1950 kvm_clear_exception_queue(&svm->vcpu);
1763 kvm_clear_interrupt_queue(&svm->vcpu); 1951 kvm_clear_interrupt_queue(&svm->vcpu);
1764 1952
1765 /* Save the old vmcb, so we don't need to pick what we save, but 1953 /*
1766 can restore everything when a VMEXIT occurs */ 1954 * Save the old vmcb, so we don't need to pick what we save, but can
1955 * restore everything when a VMEXIT occurs
1956 */
1767 hsave->save.es = vmcb->save.es; 1957 hsave->save.es = vmcb->save.es;
1768 hsave->save.cs = vmcb->save.cs; 1958 hsave->save.cs = vmcb->save.cs;
1769 hsave->save.ss = vmcb->save.ss; 1959 hsave->save.ss = vmcb->save.ss;
@@ -1803,14 +1993,17 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
1803 if (npt_enabled) { 1993 if (npt_enabled) {
1804 svm->vmcb->save.cr3 = nested_vmcb->save.cr3; 1994 svm->vmcb->save.cr3 = nested_vmcb->save.cr3;
1805 svm->vcpu.arch.cr3 = nested_vmcb->save.cr3; 1995 svm->vcpu.arch.cr3 = nested_vmcb->save.cr3;
1806 } else { 1996 } else
1807 kvm_set_cr3(&svm->vcpu, nested_vmcb->save.cr3); 1997 kvm_set_cr3(&svm->vcpu, nested_vmcb->save.cr3);
1808 kvm_mmu_reset_context(&svm->vcpu); 1998
1809 } 1999 /* Guest paging mode is active - reset mmu */
2000 kvm_mmu_reset_context(&svm->vcpu);
2001
1810 svm->vmcb->save.cr2 = svm->vcpu.arch.cr2 = nested_vmcb->save.cr2; 2002 svm->vmcb->save.cr2 = svm->vcpu.arch.cr2 = nested_vmcb->save.cr2;
1811 kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, nested_vmcb->save.rax); 2003 kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, nested_vmcb->save.rax);
1812 kvm_register_write(&svm->vcpu, VCPU_REGS_RSP, nested_vmcb->save.rsp); 2004 kvm_register_write(&svm->vcpu, VCPU_REGS_RSP, nested_vmcb->save.rsp);
1813 kvm_register_write(&svm->vcpu, VCPU_REGS_RIP, nested_vmcb->save.rip); 2005 kvm_register_write(&svm->vcpu, VCPU_REGS_RIP, nested_vmcb->save.rip);
2006
1814 /* In case we don't even reach vcpu_run, the fields are not updated */ 2007 /* In case we don't even reach vcpu_run, the fields are not updated */
1815 svm->vmcb->save.rax = nested_vmcb->save.rax; 2008 svm->vmcb->save.rax = nested_vmcb->save.rax;
1816 svm->vmcb->save.rsp = nested_vmcb->save.rsp; 2009 svm->vmcb->save.rsp = nested_vmcb->save.rsp;
@@ -1819,22 +2012,8 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
1819 svm->vmcb->save.dr6 = nested_vmcb->save.dr6; 2012 svm->vmcb->save.dr6 = nested_vmcb->save.dr6;
1820 svm->vmcb->save.cpl = nested_vmcb->save.cpl; 2013 svm->vmcb->save.cpl = nested_vmcb->save.cpl;
1821 2014
1822 /* We don't want a nested guest to be more powerful than the guest, 2015 svm->nested.vmcb_msrpm = nested_vmcb->control.msrpm_base_pa & ~0x0fffULL;
1823 so all intercepts are ORed */ 2016 svm->nested.vmcb_iopm = nested_vmcb->control.iopm_base_pa & ~0x0fffULL;
1824 svm->vmcb->control.intercept_cr_read |=
1825 nested_vmcb->control.intercept_cr_read;
1826 svm->vmcb->control.intercept_cr_write |=
1827 nested_vmcb->control.intercept_cr_write;
1828 svm->vmcb->control.intercept_dr_read |=
1829 nested_vmcb->control.intercept_dr_read;
1830 svm->vmcb->control.intercept_dr_write |=
1831 nested_vmcb->control.intercept_dr_write;
1832 svm->vmcb->control.intercept_exceptions |=
1833 nested_vmcb->control.intercept_exceptions;
1834
1835 svm->vmcb->control.intercept |= nested_vmcb->control.intercept;
1836
1837 svm->nested.vmcb_msrpm = nested_vmcb->control.msrpm_base_pa;
1838 2017
1839 /* cache intercepts */ 2018 /* cache intercepts */
1840 svm->nested.intercept_cr_read = nested_vmcb->control.intercept_cr_read; 2019 svm->nested.intercept_cr_read = nested_vmcb->control.intercept_cr_read;
@@ -1851,13 +2030,43 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
1851 else 2030 else
1852 svm->vcpu.arch.hflags &= ~HF_VINTR_MASK; 2031 svm->vcpu.arch.hflags &= ~HF_VINTR_MASK;
1853 2032
2033 if (svm->vcpu.arch.hflags & HF_VINTR_MASK) {
2034 /* We only want the cr8 intercept bits of the guest */
2035 svm->vmcb->control.intercept_cr_read &= ~INTERCEPT_CR8_MASK;
2036 svm->vmcb->control.intercept_cr_write &= ~INTERCEPT_CR8_MASK;
2037 }
2038
2039 /* We don't want to see VMMCALLs from a nested guest */
2040 svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_VMMCALL);
2041
2042 /*
2043 * We don't want a nested guest to be more powerful than the guest, so
2044 * all intercepts are ORed
2045 */
2046 svm->vmcb->control.intercept_cr_read |=
2047 nested_vmcb->control.intercept_cr_read;
2048 svm->vmcb->control.intercept_cr_write |=
2049 nested_vmcb->control.intercept_cr_write;
2050 svm->vmcb->control.intercept_dr_read |=
2051 nested_vmcb->control.intercept_dr_read;
2052 svm->vmcb->control.intercept_dr_write |=
2053 nested_vmcb->control.intercept_dr_write;
2054 svm->vmcb->control.intercept_exceptions |=
2055 nested_vmcb->control.intercept_exceptions;
2056
2057 svm->vmcb->control.intercept |= nested_vmcb->control.intercept;
2058
2059 svm->vmcb->control.lbr_ctl = nested_vmcb->control.lbr_ctl;
1854 svm->vmcb->control.int_vector = nested_vmcb->control.int_vector; 2060 svm->vmcb->control.int_vector = nested_vmcb->control.int_vector;
1855 svm->vmcb->control.int_state = nested_vmcb->control.int_state; 2061 svm->vmcb->control.int_state = nested_vmcb->control.int_state;
1856 svm->vmcb->control.tsc_offset += nested_vmcb->control.tsc_offset; 2062 svm->vmcb->control.tsc_offset += nested_vmcb->control.tsc_offset;
1857 svm->vmcb->control.event_inj = nested_vmcb->control.event_inj; 2063 svm->vmcb->control.event_inj = nested_vmcb->control.event_inj;
1858 svm->vmcb->control.event_inj_err = nested_vmcb->control.event_inj_err; 2064 svm->vmcb->control.event_inj_err = nested_vmcb->control.event_inj_err;
1859 2065
1860 nested_svm_unmap(nested_vmcb, KM_USER0); 2066 nested_svm_unmap(page);
2067
2068 /* nested_vmcb is our indicator if nested SVM is activated */
2069 svm->nested.vmcb = vmcb_gpa;
1861 2070
1862 enable_gif(svm); 2071 enable_gif(svm);
1863 2072
@@ -1883,6 +2092,7 @@ static void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb)
1883static int vmload_interception(struct vcpu_svm *svm) 2092static int vmload_interception(struct vcpu_svm *svm)
1884{ 2093{
1885 struct vmcb *nested_vmcb; 2094 struct vmcb *nested_vmcb;
2095 struct page *page;
1886 2096
1887 if (nested_svm_check_permissions(svm)) 2097 if (nested_svm_check_permissions(svm))
1888 return 1; 2098 return 1;
@@ -1890,12 +2100,12 @@ static int vmload_interception(struct vcpu_svm *svm)
1890 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; 2100 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
1891 skip_emulated_instruction(&svm->vcpu); 2101 skip_emulated_instruction(&svm->vcpu);
1892 2102
1893 nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, KM_USER0); 2103 nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page);
1894 if (!nested_vmcb) 2104 if (!nested_vmcb)
1895 return 1; 2105 return 1;
1896 2106
1897 nested_svm_vmloadsave(nested_vmcb, svm->vmcb); 2107 nested_svm_vmloadsave(nested_vmcb, svm->vmcb);
1898 nested_svm_unmap(nested_vmcb, KM_USER0); 2108 nested_svm_unmap(page);
1899 2109
1900 return 1; 2110 return 1;
1901} 2111}
@@ -1903,6 +2113,7 @@ static int vmload_interception(struct vcpu_svm *svm)
1903static int vmsave_interception(struct vcpu_svm *svm) 2113static int vmsave_interception(struct vcpu_svm *svm)
1904{ 2114{
1905 struct vmcb *nested_vmcb; 2115 struct vmcb *nested_vmcb;
2116 struct page *page;
1906 2117
1907 if (nested_svm_check_permissions(svm)) 2118 if (nested_svm_check_permissions(svm))
1908 return 1; 2119 return 1;
@@ -1910,12 +2121,12 @@ static int vmsave_interception(struct vcpu_svm *svm)
1910 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; 2121 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
1911 skip_emulated_instruction(&svm->vcpu); 2122 skip_emulated_instruction(&svm->vcpu);
1912 2123
1913 nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, KM_USER0); 2124 nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page);
1914 if (!nested_vmcb) 2125 if (!nested_vmcb)
1915 return 1; 2126 return 1;
1916 2127
1917 nested_svm_vmloadsave(svm->vmcb, nested_vmcb); 2128 nested_svm_vmloadsave(svm->vmcb, nested_vmcb);
1918 nested_svm_unmap(nested_vmcb, KM_USER0); 2129 nested_svm_unmap(page);
1919 2130
1920 return 1; 2131 return 1;
1921} 2132}
@@ -2018,6 +2229,8 @@ static int task_switch_interception(struct vcpu_svm *svm)
2018 svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_TYPE_MASK; 2229 svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_TYPE_MASK;
2019 uint32_t idt_v = 2230 uint32_t idt_v =
2020 svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_VALID; 2231 svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_VALID;
2232 bool has_error_code = false;
2233 u32 error_code = 0;
2021 2234
2022 tss_selector = (u16)svm->vmcb->control.exit_info_1; 2235 tss_selector = (u16)svm->vmcb->control.exit_info_1;
2023 2236
@@ -2038,6 +2251,12 @@ static int task_switch_interception(struct vcpu_svm *svm)
2038 svm->vcpu.arch.nmi_injected = false; 2251 svm->vcpu.arch.nmi_injected = false;
2039 break; 2252 break;
2040 case SVM_EXITINTINFO_TYPE_EXEPT: 2253 case SVM_EXITINTINFO_TYPE_EXEPT:
2254 if (svm->vmcb->control.exit_info_2 &
2255 (1ULL << SVM_EXITINFOSHIFT_TS_HAS_ERROR_CODE)) {
2256 has_error_code = true;
2257 error_code =
2258 (u32)svm->vmcb->control.exit_info_2;
2259 }
2041 kvm_clear_exception_queue(&svm->vcpu); 2260 kvm_clear_exception_queue(&svm->vcpu);
2042 break; 2261 break;
2043 case SVM_EXITINTINFO_TYPE_INTR: 2262 case SVM_EXITINTINFO_TYPE_INTR:
@@ -2054,7 +2273,14 @@ static int task_switch_interception(struct vcpu_svm *svm)
2054 (int_vec == OF_VECTOR || int_vec == BP_VECTOR))) 2273 (int_vec == OF_VECTOR || int_vec == BP_VECTOR)))
2055 skip_emulated_instruction(&svm->vcpu); 2274 skip_emulated_instruction(&svm->vcpu);
2056 2275
2057 return kvm_task_switch(&svm->vcpu, tss_selector, reason); 2276 if (kvm_task_switch(&svm->vcpu, tss_selector, reason,
2277 has_error_code, error_code) == EMULATE_FAIL) {
2278 svm->vcpu.run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
2279 svm->vcpu.run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
2280 svm->vcpu.run->internal.ndata = 0;
2281 return 0;
2282 }
2283 return 1;
2058} 2284}
2059 2285
2060static int cpuid_interception(struct vcpu_svm *svm) 2286static int cpuid_interception(struct vcpu_svm *svm)
@@ -2145,9 +2371,11 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
2145 case MSR_IA32_SYSENTER_ESP: 2371 case MSR_IA32_SYSENTER_ESP:
2146 *data = svm->sysenter_esp; 2372 *data = svm->sysenter_esp;
2147 break; 2373 break;
2148 /* Nobody will change the following 5 values in the VMCB so 2374 /*
2149 we can safely return them on rdmsr. They will always be 0 2375 * Nobody will change the following 5 values in the VMCB so we can
2150 until LBRV is implemented. */ 2376 * safely return them on rdmsr. They will always be 0 until LBRV is
2377 * implemented.
2378 */
2151 case MSR_IA32_DEBUGCTLMSR: 2379 case MSR_IA32_DEBUGCTLMSR:
2152 *data = svm->vmcb->save.dbgctl; 2380 *data = svm->vmcb->save.dbgctl;
2153 break; 2381 break;
@@ -2167,7 +2395,7 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
2167 *data = svm->nested.hsave_msr; 2395 *data = svm->nested.hsave_msr;
2168 break; 2396 break;
2169 case MSR_VM_CR: 2397 case MSR_VM_CR:
2170 *data = 0; 2398 *data = svm->nested.vm_cr_msr;
2171 break; 2399 break;
2172 case MSR_IA32_UCODE_REV: 2400 case MSR_IA32_UCODE_REV:
2173 *data = 0x01000065; 2401 *data = 0x01000065;
@@ -2197,6 +2425,31 @@ static int rdmsr_interception(struct vcpu_svm *svm)
2197 return 1; 2425 return 1;
2198} 2426}
2199 2427
2428static int svm_set_vm_cr(struct kvm_vcpu *vcpu, u64 data)
2429{
2430 struct vcpu_svm *svm = to_svm(vcpu);
2431 int svm_dis, chg_mask;
2432
2433 if (data & ~SVM_VM_CR_VALID_MASK)
2434 return 1;
2435
2436 chg_mask = SVM_VM_CR_VALID_MASK;
2437
2438 if (svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK)
2439 chg_mask &= ~(SVM_VM_CR_SVM_LOCK_MASK | SVM_VM_CR_SVM_DIS_MASK);
2440
2441 svm->nested.vm_cr_msr &= ~chg_mask;
2442 svm->nested.vm_cr_msr |= (data & chg_mask);
2443
2444 svm_dis = svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK;
2445
2446 /* check for svm_disable while efer.svme is set */
2447 if (svm_dis && (vcpu->arch.efer & EFER_SVME))
2448 return 1;
2449
2450 return 0;
2451}
2452
2200static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data) 2453static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
2201{ 2454{
2202 struct vcpu_svm *svm = to_svm(vcpu); 2455 struct vcpu_svm *svm = to_svm(vcpu);
@@ -2263,6 +2516,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
2263 svm->nested.hsave_msr = data; 2516 svm->nested.hsave_msr = data;
2264 break; 2517 break;
2265 case MSR_VM_CR: 2518 case MSR_VM_CR:
2519 return svm_set_vm_cr(vcpu, data);
2266 case MSR_VM_IGNNE: 2520 case MSR_VM_IGNNE:
2267 pr_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data); 2521 pr_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data);
2268 break; 2522 break;
@@ -2326,16 +2580,16 @@ static int pause_interception(struct vcpu_svm *svm)
2326} 2580}
2327 2581
2328static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = { 2582static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = {
2329 [SVM_EXIT_READ_CR0] = emulate_on_interception, 2583 [SVM_EXIT_READ_CR0] = emulate_on_interception,
2330 [SVM_EXIT_READ_CR3] = emulate_on_interception, 2584 [SVM_EXIT_READ_CR3] = emulate_on_interception,
2331 [SVM_EXIT_READ_CR4] = emulate_on_interception, 2585 [SVM_EXIT_READ_CR4] = emulate_on_interception,
2332 [SVM_EXIT_READ_CR8] = emulate_on_interception, 2586 [SVM_EXIT_READ_CR8] = emulate_on_interception,
2333 [SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception, 2587 [SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception,
2334 [SVM_EXIT_WRITE_CR0] = emulate_on_interception, 2588 [SVM_EXIT_WRITE_CR0] = emulate_on_interception,
2335 [SVM_EXIT_WRITE_CR3] = emulate_on_interception, 2589 [SVM_EXIT_WRITE_CR3] = emulate_on_interception,
2336 [SVM_EXIT_WRITE_CR4] = emulate_on_interception, 2590 [SVM_EXIT_WRITE_CR4] = emulate_on_interception,
2337 [SVM_EXIT_WRITE_CR8] = cr8_write_interception, 2591 [SVM_EXIT_WRITE_CR8] = cr8_write_interception,
2338 [SVM_EXIT_READ_DR0] = emulate_on_interception, 2592 [SVM_EXIT_READ_DR0] = emulate_on_interception,
2339 [SVM_EXIT_READ_DR1] = emulate_on_interception, 2593 [SVM_EXIT_READ_DR1] = emulate_on_interception,
2340 [SVM_EXIT_READ_DR2] = emulate_on_interception, 2594 [SVM_EXIT_READ_DR2] = emulate_on_interception,
2341 [SVM_EXIT_READ_DR3] = emulate_on_interception, 2595 [SVM_EXIT_READ_DR3] = emulate_on_interception,
@@ -2354,15 +2608,14 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = {
2354 [SVM_EXIT_EXCP_BASE + DB_VECTOR] = db_interception, 2608 [SVM_EXIT_EXCP_BASE + DB_VECTOR] = db_interception,
2355 [SVM_EXIT_EXCP_BASE + BP_VECTOR] = bp_interception, 2609 [SVM_EXIT_EXCP_BASE + BP_VECTOR] = bp_interception,
2356 [SVM_EXIT_EXCP_BASE + UD_VECTOR] = ud_interception, 2610 [SVM_EXIT_EXCP_BASE + UD_VECTOR] = ud_interception,
2357 [SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception, 2611 [SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception,
2358 [SVM_EXIT_EXCP_BASE + NM_VECTOR] = nm_interception, 2612 [SVM_EXIT_EXCP_BASE + NM_VECTOR] = nm_interception,
2359 [SVM_EXIT_EXCP_BASE + MC_VECTOR] = mc_interception, 2613 [SVM_EXIT_EXCP_BASE + MC_VECTOR] = mc_interception,
2360 [SVM_EXIT_INTR] = intr_interception, 2614 [SVM_EXIT_INTR] = intr_interception,
2361 [SVM_EXIT_NMI] = nmi_interception, 2615 [SVM_EXIT_NMI] = nmi_interception,
2362 [SVM_EXIT_SMI] = nop_on_interception, 2616 [SVM_EXIT_SMI] = nop_on_interception,
2363 [SVM_EXIT_INIT] = nop_on_interception, 2617 [SVM_EXIT_INIT] = nop_on_interception,
2364 [SVM_EXIT_VINTR] = interrupt_window_interception, 2618 [SVM_EXIT_VINTR] = interrupt_window_interception,
2365 /* [SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception, */
2366 [SVM_EXIT_CPUID] = cpuid_interception, 2619 [SVM_EXIT_CPUID] = cpuid_interception,
2367 [SVM_EXIT_IRET] = iret_interception, 2620 [SVM_EXIT_IRET] = iret_interception,
2368 [SVM_EXIT_INVD] = emulate_on_interception, 2621 [SVM_EXIT_INVD] = emulate_on_interception,
@@ -2370,7 +2623,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = {
2370 [SVM_EXIT_HLT] = halt_interception, 2623 [SVM_EXIT_HLT] = halt_interception,
2371 [SVM_EXIT_INVLPG] = invlpg_interception, 2624 [SVM_EXIT_INVLPG] = invlpg_interception,
2372 [SVM_EXIT_INVLPGA] = invlpga_interception, 2625 [SVM_EXIT_INVLPGA] = invlpga_interception,
2373 [SVM_EXIT_IOIO] = io_interception, 2626 [SVM_EXIT_IOIO] = io_interception,
2374 [SVM_EXIT_MSR] = msr_interception, 2627 [SVM_EXIT_MSR] = msr_interception,
2375 [SVM_EXIT_TASK_SWITCH] = task_switch_interception, 2628 [SVM_EXIT_TASK_SWITCH] = task_switch_interception,
2376 [SVM_EXIT_SHUTDOWN] = shutdown_interception, 2629 [SVM_EXIT_SHUTDOWN] = shutdown_interception,
@@ -2393,7 +2646,12 @@ static int handle_exit(struct kvm_vcpu *vcpu)
2393 struct kvm_run *kvm_run = vcpu->run; 2646 struct kvm_run *kvm_run = vcpu->run;
2394 u32 exit_code = svm->vmcb->control.exit_code; 2647 u32 exit_code = svm->vmcb->control.exit_code;
2395 2648
2396 trace_kvm_exit(exit_code, svm->vmcb->save.rip); 2649 trace_kvm_exit(exit_code, vcpu);
2650
2651 if (!(svm->vmcb->control.intercept_cr_write & INTERCEPT_CR0_MASK))
2652 vcpu->arch.cr0 = svm->vmcb->save.cr0;
2653 if (npt_enabled)
2654 vcpu->arch.cr3 = svm->vmcb->save.cr3;
2397 2655
2398 if (unlikely(svm->nested.exit_required)) { 2656 if (unlikely(svm->nested.exit_required)) {
2399 nested_svm_vmexit(svm); 2657 nested_svm_vmexit(svm);
@@ -2422,11 +2680,6 @@ static int handle_exit(struct kvm_vcpu *vcpu)
2422 2680
2423 svm_complete_interrupts(svm); 2681 svm_complete_interrupts(svm);
2424 2682
2425 if (!(svm->vmcb->control.intercept_cr_write & INTERCEPT_CR0_MASK))
2426 vcpu->arch.cr0 = svm->vmcb->save.cr0;
2427 if (npt_enabled)
2428 vcpu->arch.cr3 = svm->vmcb->save.cr3;
2429
2430 if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) { 2683 if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) {
2431 kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; 2684 kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
2432 kvm_run->fail_entry.hardware_entry_failure_reason 2685 kvm_run->fail_entry.hardware_entry_failure_reason
@@ -2511,6 +2764,9 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
2511{ 2764{
2512 struct vcpu_svm *svm = to_svm(vcpu); 2765 struct vcpu_svm *svm = to_svm(vcpu);
2513 2766
2767 if (is_nested(svm) && (vcpu->arch.hflags & HF_VINTR_MASK))
2768 return;
2769
2514 if (irr == -1) 2770 if (irr == -1)
2515 return; 2771 return;
2516 2772
@@ -2522,8 +2778,12 @@ static int svm_nmi_allowed(struct kvm_vcpu *vcpu)
2522{ 2778{
2523 struct vcpu_svm *svm = to_svm(vcpu); 2779 struct vcpu_svm *svm = to_svm(vcpu);
2524 struct vmcb *vmcb = svm->vmcb; 2780 struct vmcb *vmcb = svm->vmcb;
2525 return !(vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) && 2781 int ret;
2526 !(svm->vcpu.arch.hflags & HF_NMI_MASK); 2782 ret = !(vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) &&
2783 !(svm->vcpu.arch.hflags & HF_NMI_MASK);
2784 ret = ret && gif_set(svm) && nested_svm_nmi(svm);
2785
2786 return ret;
2527} 2787}
2528 2788
2529static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu) 2789static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu)
@@ -2568,13 +2828,13 @@ static void enable_irq_window(struct kvm_vcpu *vcpu)
2568{ 2828{
2569 struct vcpu_svm *svm = to_svm(vcpu); 2829 struct vcpu_svm *svm = to_svm(vcpu);
2570 2830
2571 nested_svm_intr(svm); 2831 /*
2572 2832 * In case GIF=0 we can't rely on the CPU to tell us when GIF becomes
2573 /* In case GIF=0 we can't rely on the CPU to tell us when 2833 * 1, because that's a separate STGI/VMRUN intercept. The next time we
2574 * GIF becomes 1, because that's a separate STGI/VMRUN intercept. 2834 * get that intercept, this function will be called again though and
2575 * The next time we get that intercept, this function will be 2835 * we'll get the vintr intercept.
2576 * called again though and we'll get the vintr intercept. */ 2836 */
2577 if (gif_set(svm)) { 2837 if (gif_set(svm) && nested_svm_intr(svm)) {
2578 svm_set_vintr(svm); 2838 svm_set_vintr(svm);
2579 svm_inject_irq(svm, 0x0); 2839 svm_inject_irq(svm, 0x0);
2580 } 2840 }
@@ -2588,9 +2848,10 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu)
2588 == HF_NMI_MASK) 2848 == HF_NMI_MASK)
2589 return; /* IRET will cause a vm exit */ 2849 return; /* IRET will cause a vm exit */
2590 2850
2591 /* Something prevents NMI from been injected. Single step over 2851 /*
2592 possible problem (IRET or exception injection or interrupt 2852 * Something prevents NMI from been injected. Single step over possible
2593 shadow) */ 2853 * problem (IRET or exception injection or interrupt shadow)
2854 */
2594 svm->nmi_singlestep = true; 2855 svm->nmi_singlestep = true;
2595 svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF); 2856 svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
2596 update_db_intercept(vcpu); 2857 update_db_intercept(vcpu);
@@ -2614,6 +2875,9 @@ static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu)
2614{ 2875{
2615 struct vcpu_svm *svm = to_svm(vcpu); 2876 struct vcpu_svm *svm = to_svm(vcpu);
2616 2877
2878 if (is_nested(svm) && (vcpu->arch.hflags & HF_VINTR_MASK))
2879 return;
2880
2617 if (!(svm->vmcb->control.intercept_cr_write & INTERCEPT_CR8_MASK)) { 2881 if (!(svm->vmcb->control.intercept_cr_write & INTERCEPT_CR8_MASK)) {
2618 int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK; 2882 int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK;
2619 kvm_set_cr8(vcpu, cr8); 2883 kvm_set_cr8(vcpu, cr8);
@@ -2625,6 +2889,9 @@ static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu)
2625 struct vcpu_svm *svm = to_svm(vcpu); 2889 struct vcpu_svm *svm = to_svm(vcpu);
2626 u64 cr8; 2890 u64 cr8;
2627 2891
2892 if (is_nested(svm) && (vcpu->arch.hflags & HF_VINTR_MASK))
2893 return;
2894
2628 cr8 = kvm_get_cr8(vcpu); 2895 cr8 = kvm_get_cr8(vcpu);
2629 svm->vmcb->control.int_ctl &= ~V_TPR_MASK; 2896 svm->vmcb->control.int_ctl &= ~V_TPR_MASK;
2630 svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK; 2897 svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK;
@@ -2635,6 +2902,9 @@ static void svm_complete_interrupts(struct vcpu_svm *svm)
2635 u8 vector; 2902 u8 vector;
2636 int type; 2903 int type;
2637 u32 exitintinfo = svm->vmcb->control.exit_int_info; 2904 u32 exitintinfo = svm->vmcb->control.exit_int_info;
2905 unsigned int3_injected = svm->int3_injected;
2906
2907 svm->int3_injected = 0;
2638 2908
2639 if (svm->vcpu.arch.hflags & HF_IRET_MASK) 2909 if (svm->vcpu.arch.hflags & HF_IRET_MASK)
2640 svm->vcpu.arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK); 2910 svm->vcpu.arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK);
@@ -2654,18 +2924,25 @@ static void svm_complete_interrupts(struct vcpu_svm *svm)
2654 svm->vcpu.arch.nmi_injected = true; 2924 svm->vcpu.arch.nmi_injected = true;
2655 break; 2925 break;
2656 case SVM_EXITINTINFO_TYPE_EXEPT: 2926 case SVM_EXITINTINFO_TYPE_EXEPT:
2657 /* In case of software exception do not reinject an exception 2927 /*
2658 vector, but re-execute and instruction instead */ 2928 * In case of software exceptions, do not reinject the vector,
2659 if (is_nested(svm)) 2929 * but re-execute the instruction instead. Rewind RIP first
2660 break; 2930 * if we emulated INT3 before.
2661 if (kvm_exception_is_soft(vector)) 2931 */
2932 if (kvm_exception_is_soft(vector)) {
2933 if (vector == BP_VECTOR && int3_injected &&
2934 kvm_is_linear_rip(&svm->vcpu, svm->int3_rip))
2935 kvm_rip_write(&svm->vcpu,
2936 kvm_rip_read(&svm->vcpu) -
2937 int3_injected);
2662 break; 2938 break;
2939 }
2663 if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) { 2940 if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) {
2664 u32 err = svm->vmcb->control.exit_int_info_err; 2941 u32 err = svm->vmcb->control.exit_int_info_err;
2665 kvm_queue_exception_e(&svm->vcpu, vector, err); 2942 kvm_requeue_exception_e(&svm->vcpu, vector, err);
2666 2943
2667 } else 2944 } else
2668 kvm_queue_exception(&svm->vcpu, vector); 2945 kvm_requeue_exception(&svm->vcpu, vector);
2669 break; 2946 break;
2670 case SVM_EXITINTINFO_TYPE_INTR: 2947 case SVM_EXITINTINFO_TYPE_INTR:
2671 kvm_queue_interrupt(&svm->vcpu, vector, false); 2948 kvm_queue_interrupt(&svm->vcpu, vector, false);
@@ -2688,6 +2965,10 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
2688 u16 gs_selector; 2965 u16 gs_selector;
2689 u16 ldt_selector; 2966 u16 ldt_selector;
2690 2967
2968 svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
2969 svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
2970 svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];
2971
2691 /* 2972 /*
2692 * A vmexit emulation is required before the vcpu can be executed 2973 * A vmexit emulation is required before the vcpu can be executed
2693 * again. 2974 * again.
@@ -2695,10 +2976,6 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
2695 if (unlikely(svm->nested.exit_required)) 2976 if (unlikely(svm->nested.exit_required))
2696 return; 2977 return;
2697 2978
2698 svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
2699 svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
2700 svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];
2701
2702 pre_svm_run(svm); 2979 pre_svm_run(svm);
2703 2980
2704 sync_lapic_to_cr8(vcpu); 2981 sync_lapic_to_cr8(vcpu);
@@ -2879,25 +3156,39 @@ static void svm_cpuid_update(struct kvm_vcpu *vcpu)
2879{ 3156{
2880} 3157}
2881 3158
3159static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
3160{
3161 switch (func) {
3162 case 0x8000000A:
3163 entry->eax = 1; /* SVM revision 1 */
3164 entry->ebx = 8; /* Lets support 8 ASIDs in case we add proper
3165 ASID emulation to nested SVM */
3166 entry->ecx = 0; /* Reserved */
3167 entry->edx = 0; /* Do not support any additional features */
3168
3169 break;
3170 }
3171}
3172
2882static const struct trace_print_flags svm_exit_reasons_str[] = { 3173static const struct trace_print_flags svm_exit_reasons_str[] = {
2883 { SVM_EXIT_READ_CR0, "read_cr0" }, 3174 { SVM_EXIT_READ_CR0, "read_cr0" },
2884 { SVM_EXIT_READ_CR3, "read_cr3" }, 3175 { SVM_EXIT_READ_CR3, "read_cr3" },
2885 { SVM_EXIT_READ_CR4, "read_cr4" }, 3176 { SVM_EXIT_READ_CR4, "read_cr4" },
2886 { SVM_EXIT_READ_CR8, "read_cr8" }, 3177 { SVM_EXIT_READ_CR8, "read_cr8" },
2887 { SVM_EXIT_WRITE_CR0, "write_cr0" }, 3178 { SVM_EXIT_WRITE_CR0, "write_cr0" },
2888 { SVM_EXIT_WRITE_CR3, "write_cr3" }, 3179 { SVM_EXIT_WRITE_CR3, "write_cr3" },
2889 { SVM_EXIT_WRITE_CR4, "write_cr4" }, 3180 { SVM_EXIT_WRITE_CR4, "write_cr4" },
2890 { SVM_EXIT_WRITE_CR8, "write_cr8" }, 3181 { SVM_EXIT_WRITE_CR8, "write_cr8" },
2891 { SVM_EXIT_READ_DR0, "read_dr0" }, 3182 { SVM_EXIT_READ_DR0, "read_dr0" },
2892 { SVM_EXIT_READ_DR1, "read_dr1" }, 3183 { SVM_EXIT_READ_DR1, "read_dr1" },
2893 { SVM_EXIT_READ_DR2, "read_dr2" }, 3184 { SVM_EXIT_READ_DR2, "read_dr2" },
2894 { SVM_EXIT_READ_DR3, "read_dr3" }, 3185 { SVM_EXIT_READ_DR3, "read_dr3" },
2895 { SVM_EXIT_WRITE_DR0, "write_dr0" }, 3186 { SVM_EXIT_WRITE_DR0, "write_dr0" },
2896 { SVM_EXIT_WRITE_DR1, "write_dr1" }, 3187 { SVM_EXIT_WRITE_DR1, "write_dr1" },
2897 { SVM_EXIT_WRITE_DR2, "write_dr2" }, 3188 { SVM_EXIT_WRITE_DR2, "write_dr2" },
2898 { SVM_EXIT_WRITE_DR3, "write_dr3" }, 3189 { SVM_EXIT_WRITE_DR3, "write_dr3" },
2899 { SVM_EXIT_WRITE_DR5, "write_dr5" }, 3190 { SVM_EXIT_WRITE_DR5, "write_dr5" },
2900 { SVM_EXIT_WRITE_DR7, "write_dr7" }, 3191 { SVM_EXIT_WRITE_DR7, "write_dr7" },
2901 { SVM_EXIT_EXCP_BASE + DB_VECTOR, "DB excp" }, 3192 { SVM_EXIT_EXCP_BASE + DB_VECTOR, "DB excp" },
2902 { SVM_EXIT_EXCP_BASE + BP_VECTOR, "BP excp" }, 3193 { SVM_EXIT_EXCP_BASE + BP_VECTOR, "BP excp" },
2903 { SVM_EXIT_EXCP_BASE + UD_VECTOR, "UD excp" }, 3194 { SVM_EXIT_EXCP_BASE + UD_VECTOR, "UD excp" },
@@ -2946,8 +3237,10 @@ static void svm_fpu_deactivate(struct kvm_vcpu *vcpu)
2946{ 3237{
2947 struct vcpu_svm *svm = to_svm(vcpu); 3238 struct vcpu_svm *svm = to_svm(vcpu);
2948 3239
2949 update_cr0_intercept(svm);
2950 svm->vmcb->control.intercept_exceptions |= 1 << NM_VECTOR; 3240 svm->vmcb->control.intercept_exceptions |= 1 << NM_VECTOR;
3241 if (is_nested(svm))
3242 svm->nested.hsave->control.intercept_exceptions |= 1 << NM_VECTOR;
3243 update_cr0_intercept(svm);
2951} 3244}
2952 3245
2953static struct kvm_x86_ops svm_x86_ops = { 3246static struct kvm_x86_ops svm_x86_ops = {
@@ -2986,8 +3279,7 @@ static struct kvm_x86_ops svm_x86_ops = {
2986 .set_idt = svm_set_idt, 3279 .set_idt = svm_set_idt,
2987 .get_gdt = svm_get_gdt, 3280 .get_gdt = svm_get_gdt,
2988 .set_gdt = svm_set_gdt, 3281 .set_gdt = svm_set_gdt,
2989 .get_dr = svm_get_dr, 3282 .set_dr7 = svm_set_dr7,
2990 .set_dr = svm_set_dr,
2991 .cache_reg = svm_cache_reg, 3283 .cache_reg = svm_cache_reg,
2992 .get_rflags = svm_get_rflags, 3284 .get_rflags = svm_get_rflags,
2993 .set_rflags = svm_set_rflags, 3285 .set_rflags = svm_set_rflags,
@@ -3023,12 +3315,14 @@ static struct kvm_x86_ops svm_x86_ops = {
3023 .cpuid_update = svm_cpuid_update, 3315 .cpuid_update = svm_cpuid_update,
3024 3316
3025 .rdtscp_supported = svm_rdtscp_supported, 3317 .rdtscp_supported = svm_rdtscp_supported,
3318
3319 .set_supported_cpuid = svm_set_supported_cpuid,
3026}; 3320};
3027 3321
3028static int __init svm_init(void) 3322static int __init svm_init(void)
3029{ 3323{
3030 return kvm_init(&svm_x86_ops, sizeof(struct vcpu_svm), 3324 return kvm_init(&svm_x86_ops, sizeof(struct vcpu_svm),
3031 THIS_MODULE); 3325 __alignof__(struct vcpu_svm), THIS_MODULE);
3032} 3326}
3033 3327
3034static void __exit svm_exit(void) 3328static void __exit svm_exit(void)
diff --git a/arch/x86/kvm/timer.c b/arch/x86/kvm/timer.c
index eea40439066c..4ddadb1a5ffe 100644
--- a/arch/x86/kvm/timer.c
+++ b/arch/x86/kvm/timer.c
@@ -12,7 +12,8 @@ static int __kvm_timer_fn(struct kvm_vcpu *vcpu, struct kvm_timer *ktimer)
12 /* 12 /*
13 * There is a race window between reading and incrementing, but we do 13 * There is a race window between reading and incrementing, but we do
14 * not care about potentially loosing timer events in the !reinject 14 * not care about potentially loosing timer events in the !reinject
15 * case anyway. 15 * case anyway. Note: KVM_REQ_PENDING_TIMER is implicitly checked
16 * in vcpu_enter_guest.
16 */ 17 */
17 if (ktimer->reinject || !atomic_read(&ktimer->pending)) { 18 if (ktimer->reinject || !atomic_read(&ktimer->pending)) {
18 atomic_inc(&ktimer->pending); 19 atomic_inc(&ktimer->pending);
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 6ad30a29f044..a6544b8e7c0f 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -5,8 +5,6 @@
5 5
6#undef TRACE_SYSTEM 6#undef TRACE_SYSTEM
7#define TRACE_SYSTEM kvm 7#define TRACE_SYSTEM kvm
8#define TRACE_INCLUDE_PATH arch/x86/kvm
9#define TRACE_INCLUDE_FILE trace
10 8
11/* 9/*
12 * Tracepoint for guest mode entry. 10 * Tracepoint for guest mode entry.
@@ -184,8 +182,8 @@ TRACE_EVENT(kvm_apic,
184 * Tracepoint for kvm guest exit: 182 * Tracepoint for kvm guest exit:
185 */ 183 */
186TRACE_EVENT(kvm_exit, 184TRACE_EVENT(kvm_exit,
187 TP_PROTO(unsigned int exit_reason, unsigned long guest_rip), 185 TP_PROTO(unsigned int exit_reason, struct kvm_vcpu *vcpu),
188 TP_ARGS(exit_reason, guest_rip), 186 TP_ARGS(exit_reason, vcpu),
189 187
190 TP_STRUCT__entry( 188 TP_STRUCT__entry(
191 __field( unsigned int, exit_reason ) 189 __field( unsigned int, exit_reason )
@@ -194,7 +192,7 @@ TRACE_EVENT(kvm_exit,
194 192
195 TP_fast_assign( 193 TP_fast_assign(
196 __entry->exit_reason = exit_reason; 194 __entry->exit_reason = exit_reason;
197 __entry->guest_rip = guest_rip; 195 __entry->guest_rip = kvm_rip_read(vcpu);
198 ), 196 ),
199 197
200 TP_printk("reason %s rip 0x%lx", 198 TP_printk("reason %s rip 0x%lx",
@@ -221,6 +219,38 @@ TRACE_EVENT(kvm_inj_virq,
221 TP_printk("irq %u", __entry->irq) 219 TP_printk("irq %u", __entry->irq)
222); 220);
223 221
222#define EXS(x) { x##_VECTOR, "#" #x }
223
224#define kvm_trace_sym_exc \
225 EXS(DE), EXS(DB), EXS(BP), EXS(OF), EXS(BR), EXS(UD), EXS(NM), \
226 EXS(DF), EXS(TS), EXS(NP), EXS(SS), EXS(GP), EXS(PF), \
227 EXS(MF), EXS(MC)
228
229/*
230 * Tracepoint for kvm interrupt injection:
231 */
232TRACE_EVENT(kvm_inj_exception,
233 TP_PROTO(unsigned exception, bool has_error, unsigned error_code),
234 TP_ARGS(exception, has_error, error_code),
235
236 TP_STRUCT__entry(
237 __field( u8, exception )
238 __field( u8, has_error )
239 __field( u32, error_code )
240 ),
241
242 TP_fast_assign(
243 __entry->exception = exception;
244 __entry->has_error = has_error;
245 __entry->error_code = error_code;
246 ),
247
248 TP_printk("%s (0x%x)",
249 __print_symbolic(__entry->exception, kvm_trace_sym_exc),
250 /* FIXME: don't print error_code if not present */
251 __entry->has_error ? __entry->error_code : 0)
252);
253
224/* 254/*
225 * Tracepoint for page fault. 255 * Tracepoint for page fault.
226 */ 256 */
@@ -413,12 +443,34 @@ TRACE_EVENT(kvm_nested_vmrun,
413 ), 443 ),
414 444
415 TP_printk("rip: 0x%016llx vmcb: 0x%016llx nrip: 0x%016llx int_ctl: 0x%08x " 445 TP_printk("rip: 0x%016llx vmcb: 0x%016llx nrip: 0x%016llx int_ctl: 0x%08x "
416 "event_inj: 0x%08x npt: %s\n", 446 "event_inj: 0x%08x npt: %s",
417 __entry->rip, __entry->vmcb, __entry->nested_rip, 447 __entry->rip, __entry->vmcb, __entry->nested_rip,
418 __entry->int_ctl, __entry->event_inj, 448 __entry->int_ctl, __entry->event_inj,
419 __entry->npt ? "on" : "off") 449 __entry->npt ? "on" : "off")
420); 450);
421 451
452TRACE_EVENT(kvm_nested_intercepts,
453 TP_PROTO(__u16 cr_read, __u16 cr_write, __u32 exceptions, __u64 intercept),
454 TP_ARGS(cr_read, cr_write, exceptions, intercept),
455
456 TP_STRUCT__entry(
457 __field( __u16, cr_read )
458 __field( __u16, cr_write )
459 __field( __u32, exceptions )
460 __field( __u64, intercept )
461 ),
462
463 TP_fast_assign(
464 __entry->cr_read = cr_read;
465 __entry->cr_write = cr_write;
466 __entry->exceptions = exceptions;
467 __entry->intercept = intercept;
468 ),
469
470 TP_printk("cr_read: %04x cr_write: %04x excp: %08x intercept: %016llx",
471 __entry->cr_read, __entry->cr_write, __entry->exceptions,
472 __entry->intercept)
473);
422/* 474/*
423 * Tracepoint for #VMEXIT while nested 475 * Tracepoint for #VMEXIT while nested
424 */ 476 */
@@ -447,7 +499,7 @@ TRACE_EVENT(kvm_nested_vmexit,
447 __entry->exit_int_info_err = exit_int_info_err; 499 __entry->exit_int_info_err = exit_int_info_err;
448 ), 500 ),
449 TP_printk("rip: 0x%016llx reason: %s ext_inf1: 0x%016llx " 501 TP_printk("rip: 0x%016llx reason: %s ext_inf1: 0x%016llx "
450 "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x\n", 502 "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x",
451 __entry->rip, 503 __entry->rip,
452 ftrace_print_symbols_seq(p, __entry->exit_code, 504 ftrace_print_symbols_seq(p, __entry->exit_code,
453 kvm_x86_ops->exit_reasons_str), 505 kvm_x86_ops->exit_reasons_str),
@@ -482,7 +534,7 @@ TRACE_EVENT(kvm_nested_vmexit_inject,
482 ), 534 ),
483 535
484 TP_printk("reason: %s ext_inf1: 0x%016llx " 536 TP_printk("reason: %s ext_inf1: 0x%016llx "
485 "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x\n", 537 "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x",
486 ftrace_print_symbols_seq(p, __entry->exit_code, 538 ftrace_print_symbols_seq(p, __entry->exit_code,
487 kvm_x86_ops->exit_reasons_str), 539 kvm_x86_ops->exit_reasons_str),
488 __entry->exit_info1, __entry->exit_info2, 540 __entry->exit_info1, __entry->exit_info2,
@@ -504,7 +556,7 @@ TRACE_EVENT(kvm_nested_intr_vmexit,
504 __entry->rip = rip 556 __entry->rip = rip
505 ), 557 ),
506 558
507 TP_printk("rip: 0x%016llx\n", __entry->rip) 559 TP_printk("rip: 0x%016llx", __entry->rip)
508); 560);
509 561
510/* 562/*
@@ -526,7 +578,7 @@ TRACE_EVENT(kvm_invlpga,
526 __entry->address = address; 578 __entry->address = address;
527 ), 579 ),
528 580
529 TP_printk("rip: 0x%016llx asid: %d address: 0x%016llx\n", 581 TP_printk("rip: 0x%016llx asid: %d address: 0x%016llx",
530 __entry->rip, __entry->asid, __entry->address) 582 __entry->rip, __entry->asid, __entry->address)
531); 583);
532 584
@@ -547,11 +599,102 @@ TRACE_EVENT(kvm_skinit,
547 __entry->slb = slb; 599 __entry->slb = slb;
548 ), 600 ),
549 601
550 TP_printk("rip: 0x%016llx slb: 0x%08x\n", 602 TP_printk("rip: 0x%016llx slb: 0x%08x",
551 __entry->rip, __entry->slb) 603 __entry->rip, __entry->slb)
552); 604);
553 605
606#define __print_insn(insn, ilen) ({ \
607 int i; \
608 const char *ret = p->buffer + p->len; \
609 \
610 for (i = 0; i < ilen; ++i) \
611 trace_seq_printf(p, " %02x", insn[i]); \
612 trace_seq_printf(p, "%c", 0); \
613 ret; \
614 })
615
616#define KVM_EMUL_INSN_F_CR0_PE (1 << 0)
617#define KVM_EMUL_INSN_F_EFL_VM (1 << 1)
618#define KVM_EMUL_INSN_F_CS_D (1 << 2)
619#define KVM_EMUL_INSN_F_CS_L (1 << 3)
620
621#define kvm_trace_symbol_emul_flags \
622 { 0, "real" }, \
623 { KVM_EMUL_INSN_F_CR0_PE \
624 | KVM_EMUL_INSN_F_EFL_VM, "vm16" }, \
625 { KVM_EMUL_INSN_F_CR0_PE, "prot16" }, \
626 { KVM_EMUL_INSN_F_CR0_PE \
627 | KVM_EMUL_INSN_F_CS_D, "prot32" }, \
628 { KVM_EMUL_INSN_F_CR0_PE \
629 | KVM_EMUL_INSN_F_CS_L, "prot64" }
630
631#define kei_decode_mode(mode) ({ \
632 u8 flags = 0xff; \
633 switch (mode) { \
634 case X86EMUL_MODE_REAL: \
635 flags = 0; \
636 break; \
637 case X86EMUL_MODE_VM86: \
638 flags = KVM_EMUL_INSN_F_EFL_VM; \
639 break; \
640 case X86EMUL_MODE_PROT16: \
641 flags = KVM_EMUL_INSN_F_CR0_PE; \
642 break; \
643 case X86EMUL_MODE_PROT32: \
644 flags = KVM_EMUL_INSN_F_CR0_PE \
645 | KVM_EMUL_INSN_F_CS_D; \
646 break; \
647 case X86EMUL_MODE_PROT64: \
648 flags = KVM_EMUL_INSN_F_CR0_PE \
649 | KVM_EMUL_INSN_F_CS_L; \
650 break; \
651 } \
652 flags; \
653 })
654
655TRACE_EVENT(kvm_emulate_insn,
656 TP_PROTO(struct kvm_vcpu *vcpu, __u8 failed),
657 TP_ARGS(vcpu, failed),
658
659 TP_STRUCT__entry(
660 __field( __u64, rip )
661 __field( __u32, csbase )
662 __field( __u8, len )
663 __array( __u8, insn, 15 )
664 __field( __u8, flags )
665 __field( __u8, failed )
666 ),
667
668 TP_fast_assign(
669 __entry->rip = vcpu->arch.emulate_ctxt.decode.fetch.start;
670 __entry->csbase = kvm_x86_ops->get_segment_base(vcpu, VCPU_SREG_CS);
671 __entry->len = vcpu->arch.emulate_ctxt.decode.eip
672 - vcpu->arch.emulate_ctxt.decode.fetch.start;
673 memcpy(__entry->insn,
674 vcpu->arch.emulate_ctxt.decode.fetch.data,
675 15);
676 __entry->flags = kei_decode_mode(vcpu->arch.emulate_ctxt.mode);
677 __entry->failed = failed;
678 ),
679
680 TP_printk("%x:%llx:%s (%s)%s",
681 __entry->csbase, __entry->rip,
682 __print_insn(__entry->insn, __entry->len),
683 __print_symbolic(__entry->flags,
684 kvm_trace_symbol_emul_flags),
685 __entry->failed ? " failed" : ""
686 )
687 );
688
689#define trace_kvm_emulate_insn_start(vcpu) trace_kvm_emulate_insn(vcpu, 0)
690#define trace_kvm_emulate_insn_failed(vcpu) trace_kvm_emulate_insn(vcpu, 1)
691
554#endif /* _TRACE_KVM_H */ 692#endif /* _TRACE_KVM_H */
555 693
694#undef TRACE_INCLUDE_PATH
695#define TRACE_INCLUDE_PATH arch/x86/kvm
696#undef TRACE_INCLUDE_FILE
697#define TRACE_INCLUDE_FILE trace
698
556/* This part must be outside protection */ 699/* This part must be outside protection */
557#include <trace/define_trace.h> 700#include <trace/define_trace.h>
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index edca080407a5..859a01a07dbf 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -27,6 +27,7 @@
27#include <linux/moduleparam.h> 27#include <linux/moduleparam.h>
28#include <linux/ftrace_event.h> 28#include <linux/ftrace_event.h>
29#include <linux/slab.h> 29#include <linux/slab.h>
30#include <linux/tboot.h>
30#include "kvm_cache_regs.h" 31#include "kvm_cache_regs.h"
31#include "x86.h" 32#include "x86.h"
32 33
@@ -98,6 +99,8 @@ module_param(ple_gap, int, S_IRUGO);
98static int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW; 99static int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW;
99module_param(ple_window, int, S_IRUGO); 100module_param(ple_window, int, S_IRUGO);
100 101
102#define NR_AUTOLOAD_MSRS 1
103
101struct vmcs { 104struct vmcs {
102 u32 revision_id; 105 u32 revision_id;
103 u32 abort; 106 u32 abort;
@@ -125,6 +128,11 @@ struct vcpu_vmx {
125 u64 msr_guest_kernel_gs_base; 128 u64 msr_guest_kernel_gs_base;
126#endif 129#endif
127 struct vmcs *vmcs; 130 struct vmcs *vmcs;
131 struct msr_autoload {
132 unsigned nr;
133 struct vmx_msr_entry guest[NR_AUTOLOAD_MSRS];
134 struct vmx_msr_entry host[NR_AUTOLOAD_MSRS];
135 } msr_autoload;
128 struct { 136 struct {
129 int loaded; 137 int loaded;
130 u16 fs_sel, gs_sel, ldt_sel; 138 u16 fs_sel, gs_sel, ldt_sel;
@@ -234,56 +242,56 @@ static const u32 vmx_msr_index[] = {
234}; 242};
235#define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index) 243#define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index)
236 244
237static inline int is_page_fault(u32 intr_info) 245static inline bool is_page_fault(u32 intr_info)
238{ 246{
239 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | 247 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
240 INTR_INFO_VALID_MASK)) == 248 INTR_INFO_VALID_MASK)) ==
241 (INTR_TYPE_HARD_EXCEPTION | PF_VECTOR | INTR_INFO_VALID_MASK); 249 (INTR_TYPE_HARD_EXCEPTION | PF_VECTOR | INTR_INFO_VALID_MASK);
242} 250}
243 251
244static inline int is_no_device(u32 intr_info) 252static inline bool is_no_device(u32 intr_info)
245{ 253{
246 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | 254 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
247 INTR_INFO_VALID_MASK)) == 255 INTR_INFO_VALID_MASK)) ==
248 (INTR_TYPE_HARD_EXCEPTION | NM_VECTOR | INTR_INFO_VALID_MASK); 256 (INTR_TYPE_HARD_EXCEPTION | NM_VECTOR | INTR_INFO_VALID_MASK);
249} 257}
250 258
251static inline int is_invalid_opcode(u32 intr_info) 259static inline bool is_invalid_opcode(u32 intr_info)
252{ 260{
253 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | 261 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
254 INTR_INFO_VALID_MASK)) == 262 INTR_INFO_VALID_MASK)) ==
255 (INTR_TYPE_HARD_EXCEPTION | UD_VECTOR | INTR_INFO_VALID_MASK); 263 (INTR_TYPE_HARD_EXCEPTION | UD_VECTOR | INTR_INFO_VALID_MASK);
256} 264}
257 265
258static inline int is_external_interrupt(u32 intr_info) 266static inline bool is_external_interrupt(u32 intr_info)
259{ 267{
260 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK)) 268 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
261 == (INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK); 269 == (INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK);
262} 270}
263 271
264static inline int is_machine_check(u32 intr_info) 272static inline bool is_machine_check(u32 intr_info)
265{ 273{
266 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | 274 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
267 INTR_INFO_VALID_MASK)) == 275 INTR_INFO_VALID_MASK)) ==
268 (INTR_TYPE_HARD_EXCEPTION | MC_VECTOR | INTR_INFO_VALID_MASK); 276 (INTR_TYPE_HARD_EXCEPTION | MC_VECTOR | INTR_INFO_VALID_MASK);
269} 277}
270 278
271static inline int cpu_has_vmx_msr_bitmap(void) 279static inline bool cpu_has_vmx_msr_bitmap(void)
272{ 280{
273 return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_USE_MSR_BITMAPS; 281 return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_USE_MSR_BITMAPS;
274} 282}
275 283
276static inline int cpu_has_vmx_tpr_shadow(void) 284static inline bool cpu_has_vmx_tpr_shadow(void)
277{ 285{
278 return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW; 286 return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW;
279} 287}
280 288
281static inline int vm_need_tpr_shadow(struct kvm *kvm) 289static inline bool vm_need_tpr_shadow(struct kvm *kvm)
282{ 290{
283 return (cpu_has_vmx_tpr_shadow()) && (irqchip_in_kernel(kvm)); 291 return (cpu_has_vmx_tpr_shadow()) && (irqchip_in_kernel(kvm));
284} 292}
285 293
286static inline int cpu_has_secondary_exec_ctrls(void) 294static inline bool cpu_has_secondary_exec_ctrls(void)
287{ 295{
288 return vmcs_config.cpu_based_exec_ctrl & 296 return vmcs_config.cpu_based_exec_ctrl &
289 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; 297 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
@@ -303,80 +311,80 @@ static inline bool cpu_has_vmx_flexpriority(void)
303 311
304static inline bool cpu_has_vmx_ept_execute_only(void) 312static inline bool cpu_has_vmx_ept_execute_only(void)
305{ 313{
306 return !!(vmx_capability.ept & VMX_EPT_EXECUTE_ONLY_BIT); 314 return vmx_capability.ept & VMX_EPT_EXECUTE_ONLY_BIT;
307} 315}
308 316
309static inline bool cpu_has_vmx_eptp_uncacheable(void) 317static inline bool cpu_has_vmx_eptp_uncacheable(void)
310{ 318{
311 return !!(vmx_capability.ept & VMX_EPTP_UC_BIT); 319 return vmx_capability.ept & VMX_EPTP_UC_BIT;
312} 320}
313 321
314static inline bool cpu_has_vmx_eptp_writeback(void) 322static inline bool cpu_has_vmx_eptp_writeback(void)
315{ 323{
316 return !!(vmx_capability.ept & VMX_EPTP_WB_BIT); 324 return vmx_capability.ept & VMX_EPTP_WB_BIT;
317} 325}
318 326
319static inline bool cpu_has_vmx_ept_2m_page(void) 327static inline bool cpu_has_vmx_ept_2m_page(void)
320{ 328{
321 return !!(vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT); 329 return vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT;
322} 330}
323 331
324static inline bool cpu_has_vmx_ept_1g_page(void) 332static inline bool cpu_has_vmx_ept_1g_page(void)
325{ 333{
326 return !!(vmx_capability.ept & VMX_EPT_1GB_PAGE_BIT); 334 return vmx_capability.ept & VMX_EPT_1GB_PAGE_BIT;
327} 335}
328 336
329static inline int cpu_has_vmx_invept_individual_addr(void) 337static inline bool cpu_has_vmx_invept_individual_addr(void)
330{ 338{
331 return !!(vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT); 339 return vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT;
332} 340}
333 341
334static inline int cpu_has_vmx_invept_context(void) 342static inline bool cpu_has_vmx_invept_context(void)
335{ 343{
336 return !!(vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT); 344 return vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT;
337} 345}
338 346
339static inline int cpu_has_vmx_invept_global(void) 347static inline bool cpu_has_vmx_invept_global(void)
340{ 348{
341 return !!(vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT); 349 return vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT;
342} 350}
343 351
344static inline int cpu_has_vmx_ept(void) 352static inline bool cpu_has_vmx_ept(void)
345{ 353{
346 return vmcs_config.cpu_based_2nd_exec_ctrl & 354 return vmcs_config.cpu_based_2nd_exec_ctrl &
347 SECONDARY_EXEC_ENABLE_EPT; 355 SECONDARY_EXEC_ENABLE_EPT;
348} 356}
349 357
350static inline int cpu_has_vmx_unrestricted_guest(void) 358static inline bool cpu_has_vmx_unrestricted_guest(void)
351{ 359{
352 return vmcs_config.cpu_based_2nd_exec_ctrl & 360 return vmcs_config.cpu_based_2nd_exec_ctrl &
353 SECONDARY_EXEC_UNRESTRICTED_GUEST; 361 SECONDARY_EXEC_UNRESTRICTED_GUEST;
354} 362}
355 363
356static inline int cpu_has_vmx_ple(void) 364static inline bool cpu_has_vmx_ple(void)
357{ 365{
358 return vmcs_config.cpu_based_2nd_exec_ctrl & 366 return vmcs_config.cpu_based_2nd_exec_ctrl &
359 SECONDARY_EXEC_PAUSE_LOOP_EXITING; 367 SECONDARY_EXEC_PAUSE_LOOP_EXITING;
360} 368}
361 369
362static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm) 370static inline bool vm_need_virtualize_apic_accesses(struct kvm *kvm)
363{ 371{
364 return flexpriority_enabled && irqchip_in_kernel(kvm); 372 return flexpriority_enabled && irqchip_in_kernel(kvm);
365} 373}
366 374
367static inline int cpu_has_vmx_vpid(void) 375static inline bool cpu_has_vmx_vpid(void)
368{ 376{
369 return vmcs_config.cpu_based_2nd_exec_ctrl & 377 return vmcs_config.cpu_based_2nd_exec_ctrl &
370 SECONDARY_EXEC_ENABLE_VPID; 378 SECONDARY_EXEC_ENABLE_VPID;
371} 379}
372 380
373static inline int cpu_has_vmx_rdtscp(void) 381static inline bool cpu_has_vmx_rdtscp(void)
374{ 382{
375 return vmcs_config.cpu_based_2nd_exec_ctrl & 383 return vmcs_config.cpu_based_2nd_exec_ctrl &
376 SECONDARY_EXEC_RDTSCP; 384 SECONDARY_EXEC_RDTSCP;
377} 385}
378 386
379static inline int cpu_has_virtual_nmis(void) 387static inline bool cpu_has_virtual_nmis(void)
380{ 388{
381 return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS; 389 return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS;
382} 390}
@@ -595,16 +603,56 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
595 vmcs_write32(EXCEPTION_BITMAP, eb); 603 vmcs_write32(EXCEPTION_BITMAP, eb);
596} 604}
597 605
606static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr)
607{
608 unsigned i;
609 struct msr_autoload *m = &vmx->msr_autoload;
610
611 for (i = 0; i < m->nr; ++i)
612 if (m->guest[i].index == msr)
613 break;
614
615 if (i == m->nr)
616 return;
617 --m->nr;
618 m->guest[i] = m->guest[m->nr];
619 m->host[i] = m->host[m->nr];
620 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->nr);
621 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->nr);
622}
623
624static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
625 u64 guest_val, u64 host_val)
626{
627 unsigned i;
628 struct msr_autoload *m = &vmx->msr_autoload;
629
630 for (i = 0; i < m->nr; ++i)
631 if (m->guest[i].index == msr)
632 break;
633
634 if (i == m->nr) {
635 ++m->nr;
636 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->nr);
637 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->nr);
638 }
639
640 m->guest[i].index = msr;
641 m->guest[i].value = guest_val;
642 m->host[i].index = msr;
643 m->host[i].value = host_val;
644}
645
598static void reload_tss(void) 646static void reload_tss(void)
599{ 647{
600 /* 648 /*
601 * VT restores TR but not its size. Useless. 649 * VT restores TR but not its size. Useless.
602 */ 650 */
603 struct descriptor_table gdt; 651 struct desc_ptr gdt;
604 struct desc_struct *descs; 652 struct desc_struct *descs;
605 653
606 kvm_get_gdt(&gdt); 654 native_store_gdt(&gdt);
607 descs = (void *)gdt.base; 655 descs = (void *)gdt.address;
608 descs[GDT_ENTRY_TSS].type = 9; /* available TSS */ 656 descs[GDT_ENTRY_TSS].type = 9; /* available TSS */
609 load_TR_desc(); 657 load_TR_desc();
610} 658}
@@ -631,9 +679,57 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
631 guest_efer |= host_efer & ignore_bits; 679 guest_efer |= host_efer & ignore_bits;
632 vmx->guest_msrs[efer_offset].data = guest_efer; 680 vmx->guest_msrs[efer_offset].data = guest_efer;
633 vmx->guest_msrs[efer_offset].mask = ~ignore_bits; 681 vmx->guest_msrs[efer_offset].mask = ~ignore_bits;
682
683 clear_atomic_switch_msr(vmx, MSR_EFER);
684 /* On ept, can't emulate nx, and must switch nx atomically */
685 if (enable_ept && ((vmx->vcpu.arch.efer ^ host_efer) & EFER_NX)) {
686 guest_efer = vmx->vcpu.arch.efer;
687 if (!(guest_efer & EFER_LMA))
688 guest_efer &= ~EFER_LME;
689 add_atomic_switch_msr(vmx, MSR_EFER, guest_efer, host_efer);
690 return false;
691 }
692
634 return true; 693 return true;
635} 694}
636 695
696static unsigned long segment_base(u16 selector)
697{
698 struct desc_ptr gdt;
699 struct desc_struct *d;
700 unsigned long table_base;
701 unsigned long v;
702
703 if (!(selector & ~3))
704 return 0;
705
706 native_store_gdt(&gdt);
707 table_base = gdt.address;
708
709 if (selector & 4) { /* from ldt */
710 u16 ldt_selector = kvm_read_ldt();
711
712 if (!(ldt_selector & ~3))
713 return 0;
714
715 table_base = segment_base(ldt_selector);
716 }
717 d = (struct desc_struct *)(table_base + (selector & ~7));
718 v = get_desc_base(d);
719#ifdef CONFIG_X86_64
720 if (d->s == 0 && (d->type == 2 || d->type == 9 || d->type == 11))
721 v |= ((unsigned long)((struct ldttss_desc64 *)d)->base3) << 32;
722#endif
723 return v;
724}
725
726static inline unsigned long kvm_read_tr_base(void)
727{
728 u16 tr;
729 asm("str %0" : "=g"(tr));
730 return segment_base(tr);
731}
732
637static void vmx_save_host_state(struct kvm_vcpu *vcpu) 733static void vmx_save_host_state(struct kvm_vcpu *vcpu)
638{ 734{
639 struct vcpu_vmx *vmx = to_vmx(vcpu); 735 struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -758,7 +854,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
758 } 854 }
759 855
760 if (vcpu->cpu != cpu) { 856 if (vcpu->cpu != cpu) {
761 struct descriptor_table dt; 857 struct desc_ptr dt;
762 unsigned long sysenter_esp; 858 unsigned long sysenter_esp;
763 859
764 vcpu->cpu = cpu; 860 vcpu->cpu = cpu;
@@ -767,8 +863,8 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
767 * processors. 863 * processors.
768 */ 864 */
769 vmcs_writel(HOST_TR_BASE, kvm_read_tr_base()); /* 22.2.4 */ 865 vmcs_writel(HOST_TR_BASE, kvm_read_tr_base()); /* 22.2.4 */
770 kvm_get_gdt(&dt); 866 native_store_gdt(&dt);
771 vmcs_writel(HOST_GDTR_BASE, dt.base); /* 22.2.4 */ 867 vmcs_writel(HOST_GDTR_BASE, dt.address); /* 22.2.4 */
772 868
773 rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp); 869 rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
774 vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */ 870 vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
@@ -846,9 +942,9 @@ static u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
846 int ret = 0; 942 int ret = 0;
847 943
848 if (interruptibility & GUEST_INTR_STATE_STI) 944 if (interruptibility & GUEST_INTR_STATE_STI)
849 ret |= X86_SHADOW_INT_STI; 945 ret |= KVM_X86_SHADOW_INT_STI;
850 if (interruptibility & GUEST_INTR_STATE_MOV_SS) 946 if (interruptibility & GUEST_INTR_STATE_MOV_SS)
851 ret |= X86_SHADOW_INT_MOV_SS; 947 ret |= KVM_X86_SHADOW_INT_MOV_SS;
852 948
853 return ret & mask; 949 return ret & mask;
854} 950}
@@ -860,9 +956,9 @@ static void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
860 956
861 interruptibility &= ~(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS); 957 interruptibility &= ~(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS);
862 958
863 if (mask & X86_SHADOW_INT_MOV_SS) 959 if (mask & KVM_X86_SHADOW_INT_MOV_SS)
864 interruptibility |= GUEST_INTR_STATE_MOV_SS; 960 interruptibility |= GUEST_INTR_STATE_MOV_SS;
865 if (mask & X86_SHADOW_INT_STI) 961 else if (mask & KVM_X86_SHADOW_INT_STI)
866 interruptibility |= GUEST_INTR_STATE_STI; 962 interruptibility |= GUEST_INTR_STATE_STI;
867 963
868 if ((interruptibility != interruptibility_old)) 964 if ((interruptibility != interruptibility_old))
@@ -882,7 +978,8 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
882} 978}
883 979
884static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, 980static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
885 bool has_error_code, u32 error_code) 981 bool has_error_code, u32 error_code,
982 bool reinject)
886{ 983{
887 struct vcpu_vmx *vmx = to_vmx(vcpu); 984 struct vcpu_vmx *vmx = to_vmx(vcpu);
888 u32 intr_info = nr | INTR_INFO_VALID_MASK; 985 u32 intr_info = nr | INTR_INFO_VALID_MASK;
@@ -1176,9 +1273,16 @@ static __init int vmx_disabled_by_bios(void)
1176 u64 msr; 1273 u64 msr;
1177 1274
1178 rdmsrl(MSR_IA32_FEATURE_CONTROL, msr); 1275 rdmsrl(MSR_IA32_FEATURE_CONTROL, msr);
1179 return (msr & (FEATURE_CONTROL_LOCKED | 1276 if (msr & FEATURE_CONTROL_LOCKED) {
1180 FEATURE_CONTROL_VMXON_ENABLED)) 1277 if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX)
1181 == FEATURE_CONTROL_LOCKED; 1278 && tboot_enabled())
1279 return 1;
1280 if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX)
1281 && !tboot_enabled())
1282 return 1;
1283 }
1284
1285 return 0;
1182 /* locked but not enabled */ 1286 /* locked but not enabled */
1183} 1287}
1184 1288
@@ -1186,21 +1290,23 @@ static int hardware_enable(void *garbage)
1186{ 1290{
1187 int cpu = raw_smp_processor_id(); 1291 int cpu = raw_smp_processor_id();
1188 u64 phys_addr = __pa(per_cpu(vmxarea, cpu)); 1292 u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
1189 u64 old; 1293 u64 old, test_bits;
1190 1294
1191 if (read_cr4() & X86_CR4_VMXE) 1295 if (read_cr4() & X86_CR4_VMXE)
1192 return -EBUSY; 1296 return -EBUSY;
1193 1297
1194 INIT_LIST_HEAD(&per_cpu(vcpus_on_cpu, cpu)); 1298 INIT_LIST_HEAD(&per_cpu(vcpus_on_cpu, cpu));
1195 rdmsrl(MSR_IA32_FEATURE_CONTROL, old); 1299 rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
1196 if ((old & (FEATURE_CONTROL_LOCKED | 1300
1197 FEATURE_CONTROL_VMXON_ENABLED)) 1301 test_bits = FEATURE_CONTROL_LOCKED;
1198 != (FEATURE_CONTROL_LOCKED | 1302 test_bits |= FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
1199 FEATURE_CONTROL_VMXON_ENABLED)) 1303 if (tboot_enabled())
1304 test_bits |= FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX;
1305
1306 if ((old & test_bits) != test_bits) {
1200 /* enable and lock */ 1307 /* enable and lock */
1201 wrmsrl(MSR_IA32_FEATURE_CONTROL, old | 1308 wrmsrl(MSR_IA32_FEATURE_CONTROL, old | test_bits);
1202 FEATURE_CONTROL_LOCKED | 1309 }
1203 FEATURE_CONTROL_VMXON_ENABLED);
1204 write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */ 1310 write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */
1205 asm volatile (ASM_VMX_VMXON_RAX 1311 asm volatile (ASM_VMX_VMXON_RAX
1206 : : "a"(&phys_addr), "m"(phys_addr) 1312 : : "a"(&phys_addr), "m"(phys_addr)
@@ -1521,7 +1627,7 @@ static gva_t rmode_tss_base(struct kvm *kvm)
1521 struct kvm_memslots *slots; 1627 struct kvm_memslots *slots;
1522 gfn_t base_gfn; 1628 gfn_t base_gfn;
1523 1629
1524 slots = rcu_dereference(kvm->memslots); 1630 slots = kvm_memslots(kvm);
1525 base_gfn = kvm->memslots->memslots[0].base_gfn + 1631 base_gfn = kvm->memslots->memslots[0].base_gfn +
1526 kvm->memslots->memslots[0].npages - 3; 1632 kvm->memslots->memslots[0].npages - 3;
1527 return base_gfn << PAGE_SHIFT; 1633 return base_gfn << PAGE_SHIFT;
@@ -1649,6 +1755,7 @@ static void exit_lmode(struct kvm_vcpu *vcpu)
1649 vmcs_write32(VM_ENTRY_CONTROLS, 1755 vmcs_write32(VM_ENTRY_CONTROLS,
1650 vmcs_read32(VM_ENTRY_CONTROLS) 1756 vmcs_read32(VM_ENTRY_CONTROLS)
1651 & ~VM_ENTRY_IA32E_MODE); 1757 & ~VM_ENTRY_IA32E_MODE);
1758 vmx_set_efer(vcpu, vcpu->arch.efer);
1652} 1759}
1653 1760
1654#endif 1761#endif
@@ -1934,28 +2041,28 @@ static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
1934 *l = (ar >> 13) & 1; 2041 *l = (ar >> 13) & 1;
1935} 2042}
1936 2043
1937static void vmx_get_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) 2044static void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
1938{ 2045{
1939 dt->limit = vmcs_read32(GUEST_IDTR_LIMIT); 2046 dt->size = vmcs_read32(GUEST_IDTR_LIMIT);
1940 dt->base = vmcs_readl(GUEST_IDTR_BASE); 2047 dt->address = vmcs_readl(GUEST_IDTR_BASE);
1941} 2048}
1942 2049
1943static void vmx_set_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) 2050static void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
1944{ 2051{
1945 vmcs_write32(GUEST_IDTR_LIMIT, dt->limit); 2052 vmcs_write32(GUEST_IDTR_LIMIT, dt->size);
1946 vmcs_writel(GUEST_IDTR_BASE, dt->base); 2053 vmcs_writel(GUEST_IDTR_BASE, dt->address);
1947} 2054}
1948 2055
1949static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) 2056static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
1950{ 2057{
1951 dt->limit = vmcs_read32(GUEST_GDTR_LIMIT); 2058 dt->size = vmcs_read32(GUEST_GDTR_LIMIT);
1952 dt->base = vmcs_readl(GUEST_GDTR_BASE); 2059 dt->address = vmcs_readl(GUEST_GDTR_BASE);
1953} 2060}
1954 2061
1955static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) 2062static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
1956{ 2063{
1957 vmcs_write32(GUEST_GDTR_LIMIT, dt->limit); 2064 vmcs_write32(GUEST_GDTR_LIMIT, dt->size);
1958 vmcs_writel(GUEST_GDTR_BASE, dt->base); 2065 vmcs_writel(GUEST_GDTR_BASE, dt->address);
1959} 2066}
1960 2067
1961static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg) 2068static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg)
@@ -2296,6 +2403,16 @@ static void allocate_vpid(struct vcpu_vmx *vmx)
2296 spin_unlock(&vmx_vpid_lock); 2403 spin_unlock(&vmx_vpid_lock);
2297} 2404}
2298 2405
2406static void free_vpid(struct vcpu_vmx *vmx)
2407{
2408 if (!enable_vpid)
2409 return;
2410 spin_lock(&vmx_vpid_lock);
2411 if (vmx->vpid != 0)
2412 __clear_bit(vmx->vpid, vmx_vpid_bitmap);
2413 spin_unlock(&vmx_vpid_lock);
2414}
2415
2299static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, u32 msr) 2416static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, u32 msr)
2300{ 2417{
2301 int f = sizeof(unsigned long); 2418 int f = sizeof(unsigned long);
@@ -2334,7 +2451,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
2334 u32 junk; 2451 u32 junk;
2335 u64 host_pat, tsc_this, tsc_base; 2452 u64 host_pat, tsc_this, tsc_base;
2336 unsigned long a; 2453 unsigned long a;
2337 struct descriptor_table dt; 2454 struct desc_ptr dt;
2338 int i; 2455 int i;
2339 unsigned long kvm_vmx_return; 2456 unsigned long kvm_vmx_return;
2340 u32 exec_control; 2457 u32 exec_control;
@@ -2415,14 +2532,16 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
2415 2532
2416 vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */ 2533 vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */
2417 2534
2418 kvm_get_idt(&dt); 2535 native_store_idt(&dt);
2419 vmcs_writel(HOST_IDTR_BASE, dt.base); /* 22.2.4 */ 2536 vmcs_writel(HOST_IDTR_BASE, dt.address); /* 22.2.4 */
2420 2537
2421 asm("mov $.Lkvm_vmx_return, %0" : "=r"(kvm_vmx_return)); 2538 asm("mov $.Lkvm_vmx_return, %0" : "=r"(kvm_vmx_return));
2422 vmcs_writel(HOST_RIP, kvm_vmx_return); /* 22.2.5 */ 2539 vmcs_writel(HOST_RIP, kvm_vmx_return); /* 22.2.5 */
2423 vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0); 2540 vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
2424 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0); 2541 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
2542 vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host));
2425 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0); 2543 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
2544 vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest));
2426 2545
2427 rdmsr(MSR_IA32_SYSENTER_CS, host_sysenter_cs, junk); 2546 rdmsr(MSR_IA32_SYSENTER_CS, host_sysenter_cs, junk);
2428 vmcs_write32(HOST_IA32_SYSENTER_CS, host_sysenter_cs); 2547 vmcs_write32(HOST_IA32_SYSENTER_CS, host_sysenter_cs);
@@ -2947,22 +3066,20 @@ static int handle_io(struct kvm_vcpu *vcpu)
2947 int size, in, string; 3066 int size, in, string;
2948 unsigned port; 3067 unsigned port;
2949 3068
2950 ++vcpu->stat.io_exits;
2951 exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 3069 exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
2952 string = (exit_qualification & 16) != 0; 3070 string = (exit_qualification & 16) != 0;
3071 in = (exit_qualification & 8) != 0;
2953 3072
2954 if (string) { 3073 ++vcpu->stat.io_exits;
2955 if (emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DO_MMIO)
2956 return 0;
2957 return 1;
2958 }
2959 3074
2960 size = (exit_qualification & 7) + 1; 3075 if (string || in)
2961 in = (exit_qualification & 8) != 0; 3076 return !(emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DO_MMIO);
2962 port = exit_qualification >> 16;
2963 3077
3078 port = exit_qualification >> 16;
3079 size = (exit_qualification & 7) + 1;
2964 skip_emulated_instruction(vcpu); 3080 skip_emulated_instruction(vcpu);
2965 return kvm_emulate_pio(vcpu, in, size, port); 3081
3082 return kvm_fast_pio_out(vcpu, size, port);
2966} 3083}
2967 3084
2968static void 3085static void
@@ -3053,19 +3170,9 @@ static int handle_cr(struct kvm_vcpu *vcpu)
3053 return 0; 3170 return 0;
3054} 3171}
3055 3172
3056static int check_dr_alias(struct kvm_vcpu *vcpu)
3057{
3058 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) {
3059 kvm_queue_exception(vcpu, UD_VECTOR);
3060 return -1;
3061 }
3062 return 0;
3063}
3064
3065static int handle_dr(struct kvm_vcpu *vcpu) 3173static int handle_dr(struct kvm_vcpu *vcpu)
3066{ 3174{
3067 unsigned long exit_qualification; 3175 unsigned long exit_qualification;
3068 unsigned long val;
3069 int dr, reg; 3176 int dr, reg;
3070 3177
3071 /* Do not handle if the CPL > 0, will trigger GP on re-entry */ 3178 /* Do not handle if the CPL > 0, will trigger GP on re-entry */
@@ -3100,67 +3207,20 @@ static int handle_dr(struct kvm_vcpu *vcpu)
3100 dr = exit_qualification & DEBUG_REG_ACCESS_NUM; 3207 dr = exit_qualification & DEBUG_REG_ACCESS_NUM;
3101 reg = DEBUG_REG_ACCESS_REG(exit_qualification); 3208 reg = DEBUG_REG_ACCESS_REG(exit_qualification);
3102 if (exit_qualification & TYPE_MOV_FROM_DR) { 3209 if (exit_qualification & TYPE_MOV_FROM_DR) {
3103 switch (dr) { 3210 unsigned long val;
3104 case 0 ... 3: 3211 if (!kvm_get_dr(vcpu, dr, &val))
3105 val = vcpu->arch.db[dr]; 3212 kvm_register_write(vcpu, reg, val);
3106 break; 3213 } else
3107 case 4: 3214 kvm_set_dr(vcpu, dr, vcpu->arch.regs[reg]);
3108 if (check_dr_alias(vcpu) < 0)
3109 return 1;
3110 /* fall through */
3111 case 6:
3112 val = vcpu->arch.dr6;
3113 break;
3114 case 5:
3115 if (check_dr_alias(vcpu) < 0)
3116 return 1;
3117 /* fall through */
3118 default: /* 7 */
3119 val = vcpu->arch.dr7;
3120 break;
3121 }
3122 kvm_register_write(vcpu, reg, val);
3123 } else {
3124 val = vcpu->arch.regs[reg];
3125 switch (dr) {
3126 case 0 ... 3:
3127 vcpu->arch.db[dr] = val;
3128 if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
3129 vcpu->arch.eff_db[dr] = val;
3130 break;
3131 case 4:
3132 if (check_dr_alias(vcpu) < 0)
3133 return 1;
3134 /* fall through */
3135 case 6:
3136 if (val & 0xffffffff00000000ULL) {
3137 kvm_inject_gp(vcpu, 0);
3138 return 1;
3139 }
3140 vcpu->arch.dr6 = (val & DR6_VOLATILE) | DR6_FIXED_1;
3141 break;
3142 case 5:
3143 if (check_dr_alias(vcpu) < 0)
3144 return 1;
3145 /* fall through */
3146 default: /* 7 */
3147 if (val & 0xffffffff00000000ULL) {
3148 kvm_inject_gp(vcpu, 0);
3149 return 1;
3150 }
3151 vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1;
3152 if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
3153 vmcs_writel(GUEST_DR7, vcpu->arch.dr7);
3154 vcpu->arch.switch_db_regs =
3155 (val & DR7_BP_EN_MASK);
3156 }
3157 break;
3158 }
3159 }
3160 skip_emulated_instruction(vcpu); 3215 skip_emulated_instruction(vcpu);
3161 return 1; 3216 return 1;
3162} 3217}
3163 3218
3219static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val)
3220{
3221 vmcs_writel(GUEST_DR7, val);
3222}
3223
3164static int handle_cpuid(struct kvm_vcpu *vcpu) 3224static int handle_cpuid(struct kvm_vcpu *vcpu)
3165{ 3225{
3166 kvm_emulate_cpuid(vcpu); 3226 kvm_emulate_cpuid(vcpu);
@@ -3292,6 +3352,8 @@ static int handle_task_switch(struct kvm_vcpu *vcpu)
3292{ 3352{
3293 struct vcpu_vmx *vmx = to_vmx(vcpu); 3353 struct vcpu_vmx *vmx = to_vmx(vcpu);
3294 unsigned long exit_qualification; 3354 unsigned long exit_qualification;
3355 bool has_error_code = false;
3356 u32 error_code = 0;
3295 u16 tss_selector; 3357 u16 tss_selector;
3296 int reason, type, idt_v; 3358 int reason, type, idt_v;
3297 3359
@@ -3314,6 +3376,13 @@ static int handle_task_switch(struct kvm_vcpu *vcpu)
3314 kvm_clear_interrupt_queue(vcpu); 3376 kvm_clear_interrupt_queue(vcpu);
3315 break; 3377 break;
3316 case INTR_TYPE_HARD_EXCEPTION: 3378 case INTR_TYPE_HARD_EXCEPTION:
3379 if (vmx->idt_vectoring_info &
3380 VECTORING_INFO_DELIVER_CODE_MASK) {
3381 has_error_code = true;
3382 error_code =
3383 vmcs_read32(IDT_VECTORING_ERROR_CODE);
3384 }
3385 /* fall through */
3317 case INTR_TYPE_SOFT_EXCEPTION: 3386 case INTR_TYPE_SOFT_EXCEPTION:
3318 kvm_clear_exception_queue(vcpu); 3387 kvm_clear_exception_queue(vcpu);
3319 break; 3388 break;
@@ -3328,8 +3397,13 @@ static int handle_task_switch(struct kvm_vcpu *vcpu)
3328 type != INTR_TYPE_NMI_INTR)) 3397 type != INTR_TYPE_NMI_INTR))
3329 skip_emulated_instruction(vcpu); 3398 skip_emulated_instruction(vcpu);
3330 3399
3331 if (!kvm_task_switch(vcpu, tss_selector, reason)) 3400 if (kvm_task_switch(vcpu, tss_selector, reason,
3401 has_error_code, error_code) == EMULATE_FAIL) {
3402 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
3403 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
3404 vcpu->run->internal.ndata = 0;
3332 return 0; 3405 return 0;
3406 }
3333 3407
3334 /* clear all local breakpoint enable flags */ 3408 /* clear all local breakpoint enable flags */
3335 vmcs_writel(GUEST_DR7, vmcs_readl(GUEST_DR7) & ~55); 3409 vmcs_writel(GUEST_DR7, vmcs_readl(GUEST_DR7) & ~55);
@@ -3574,7 +3648,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
3574 u32 exit_reason = vmx->exit_reason; 3648 u32 exit_reason = vmx->exit_reason;
3575 u32 vectoring_info = vmx->idt_vectoring_info; 3649 u32 vectoring_info = vmx->idt_vectoring_info;
3576 3650
3577 trace_kvm_exit(exit_reason, kvm_rip_read(vcpu)); 3651 trace_kvm_exit(exit_reason, vcpu);
3578 3652
3579 /* If guest state is invalid, start emulating */ 3653 /* If guest state is invalid, start emulating */
3580 if (vmx->emulation_required && emulate_invalid_guest_state) 3654 if (vmx->emulation_required && emulate_invalid_guest_state)
@@ -3923,10 +3997,7 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
3923{ 3997{
3924 struct vcpu_vmx *vmx = to_vmx(vcpu); 3998 struct vcpu_vmx *vmx = to_vmx(vcpu);
3925 3999
3926 spin_lock(&vmx_vpid_lock); 4000 free_vpid(vmx);
3927 if (vmx->vpid != 0)
3928 __clear_bit(vmx->vpid, vmx_vpid_bitmap);
3929 spin_unlock(&vmx_vpid_lock);
3930 vmx_free_vmcs(vcpu); 4001 vmx_free_vmcs(vcpu);
3931 kfree(vmx->guest_msrs); 4002 kfree(vmx->guest_msrs);
3932 kvm_vcpu_uninit(vcpu); 4003 kvm_vcpu_uninit(vcpu);
@@ -3988,6 +4059,7 @@ free_msrs:
3988uninit_vcpu: 4059uninit_vcpu:
3989 kvm_vcpu_uninit(&vmx->vcpu); 4060 kvm_vcpu_uninit(&vmx->vcpu);
3990free_vcpu: 4061free_vcpu:
4062 free_vpid(vmx);
3991 kmem_cache_free(kvm_vcpu_cache, vmx); 4063 kmem_cache_free(kvm_vcpu_cache, vmx);
3992 return ERR_PTR(err); 4064 return ERR_PTR(err);
3993} 4065}
@@ -4118,6 +4190,10 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
4118 } 4190 }
4119} 4191}
4120 4192
4193static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
4194{
4195}
4196
4121static struct kvm_x86_ops vmx_x86_ops = { 4197static struct kvm_x86_ops vmx_x86_ops = {
4122 .cpu_has_kvm_support = cpu_has_kvm_support, 4198 .cpu_has_kvm_support = cpu_has_kvm_support,
4123 .disabled_by_bios = vmx_disabled_by_bios, 4199 .disabled_by_bios = vmx_disabled_by_bios,
@@ -4154,6 +4230,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
4154 .set_idt = vmx_set_idt, 4230 .set_idt = vmx_set_idt,
4155 .get_gdt = vmx_get_gdt, 4231 .get_gdt = vmx_get_gdt,
4156 .set_gdt = vmx_set_gdt, 4232 .set_gdt = vmx_set_gdt,
4233 .set_dr7 = vmx_set_dr7,
4157 .cache_reg = vmx_cache_reg, 4234 .cache_reg = vmx_cache_reg,
4158 .get_rflags = vmx_get_rflags, 4235 .get_rflags = vmx_get_rflags,
4159 .set_rflags = vmx_set_rflags, 4236 .set_rflags = vmx_set_rflags,
@@ -4189,6 +4266,8 @@ static struct kvm_x86_ops vmx_x86_ops = {
4189 .cpuid_update = vmx_cpuid_update, 4266 .cpuid_update = vmx_cpuid_update,
4190 4267
4191 .rdtscp_supported = vmx_rdtscp_supported, 4268 .rdtscp_supported = vmx_rdtscp_supported,
4269
4270 .set_supported_cpuid = vmx_set_supported_cpuid,
4192}; 4271};
4193 4272
4194static int __init vmx_init(void) 4273static int __init vmx_init(void)
@@ -4236,7 +4315,8 @@ static int __init vmx_init(void)
4236 4315
4237 set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */ 4316 set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */
4238 4317
4239 r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx), THIS_MODULE); 4318 r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx),
4319 __alignof__(struct vcpu_vmx), THIS_MODULE);
4240 if (r) 4320 if (r)
4241 goto out3; 4321 goto out3;
4242 4322
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index dd9bc8fb81ab..05d571f6f196 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -42,7 +42,7 @@
42#include <linux/slab.h> 42#include <linux/slab.h>
43#include <linux/perf_event.h> 43#include <linux/perf_event.h>
44#include <trace/events/kvm.h> 44#include <trace/events/kvm.h>
45#undef TRACE_INCLUDE_FILE 45
46#define CREATE_TRACE_POINTS 46#define CREATE_TRACE_POINTS
47#include "trace.h" 47#include "trace.h"
48 48
@@ -224,34 +224,6 @@ static void drop_user_return_notifiers(void *ignore)
224 kvm_on_user_return(&smsr->urn); 224 kvm_on_user_return(&smsr->urn);
225} 225}
226 226
227unsigned long segment_base(u16 selector)
228{
229 struct descriptor_table gdt;
230 struct desc_struct *d;
231 unsigned long table_base;
232 unsigned long v;
233
234 if (selector == 0)
235 return 0;
236
237 kvm_get_gdt(&gdt);
238 table_base = gdt.base;
239
240 if (selector & 4) { /* from ldt */
241 u16 ldt_selector = kvm_read_ldt();
242
243 table_base = segment_base(ldt_selector);
244 }
245 d = (struct desc_struct *)(table_base + (selector & ~7));
246 v = get_desc_base(d);
247#ifdef CONFIG_X86_64
248 if (d->s == 0 && (d->type == 2 || d->type == 9 || d->type == 11))
249 v |= ((unsigned long)((struct ldttss_desc64 *)d)->base3) << 32;
250#endif
251 return v;
252}
253EXPORT_SYMBOL_GPL(segment_base);
254
255u64 kvm_get_apic_base(struct kvm_vcpu *vcpu) 227u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
256{ 228{
257 if (irqchip_in_kernel(vcpu->kvm)) 229 if (irqchip_in_kernel(vcpu->kvm))
@@ -293,7 +265,8 @@ static int exception_class(int vector)
293} 265}
294 266
295static void kvm_multiple_exception(struct kvm_vcpu *vcpu, 267static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
296 unsigned nr, bool has_error, u32 error_code) 268 unsigned nr, bool has_error, u32 error_code,
269 bool reinject)
297{ 270{
298 u32 prev_nr; 271 u32 prev_nr;
299 int class1, class2; 272 int class1, class2;
@@ -304,6 +277,7 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
304 vcpu->arch.exception.has_error_code = has_error; 277 vcpu->arch.exception.has_error_code = has_error;
305 vcpu->arch.exception.nr = nr; 278 vcpu->arch.exception.nr = nr;
306 vcpu->arch.exception.error_code = error_code; 279 vcpu->arch.exception.error_code = error_code;
280 vcpu->arch.exception.reinject = reinject;
307 return; 281 return;
308 } 282 }
309 283
@@ -332,10 +306,16 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
332 306
333void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr) 307void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)
334{ 308{
335 kvm_multiple_exception(vcpu, nr, false, 0); 309 kvm_multiple_exception(vcpu, nr, false, 0, false);
336} 310}
337EXPORT_SYMBOL_GPL(kvm_queue_exception); 311EXPORT_SYMBOL_GPL(kvm_queue_exception);
338 312
313void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr)
314{
315 kvm_multiple_exception(vcpu, nr, false, 0, true);
316}
317EXPORT_SYMBOL_GPL(kvm_requeue_exception);
318
339void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr, 319void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr,
340 u32 error_code) 320 u32 error_code)
341{ 321{
@@ -352,10 +332,16 @@ EXPORT_SYMBOL_GPL(kvm_inject_nmi);
352 332
353void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code) 333void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
354{ 334{
355 kvm_multiple_exception(vcpu, nr, true, error_code); 335 kvm_multiple_exception(vcpu, nr, true, error_code, false);
356} 336}
357EXPORT_SYMBOL_GPL(kvm_queue_exception_e); 337EXPORT_SYMBOL_GPL(kvm_queue_exception_e);
358 338
339void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
340{
341 kvm_multiple_exception(vcpu, nr, true, error_code, true);
342}
343EXPORT_SYMBOL_GPL(kvm_requeue_exception_e);
344
359/* 345/*
360 * Checks if cpl <= required_cpl; if true, return true. Otherwise queue 346 * Checks if cpl <= required_cpl; if true, return true. Otherwise queue
361 * a #GP and return false. 347 * a #GP and return false.
@@ -476,7 +462,6 @@ void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
476 } 462 }
477 463
478 kvm_x86_ops->set_cr0(vcpu, cr0); 464 kvm_x86_ops->set_cr0(vcpu, cr0);
479 vcpu->arch.cr0 = cr0;
480 465
481 kvm_mmu_reset_context(vcpu); 466 kvm_mmu_reset_context(vcpu);
482 return; 467 return;
@@ -485,7 +470,7 @@ EXPORT_SYMBOL_GPL(kvm_set_cr0);
485 470
486void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw) 471void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
487{ 472{
488 kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0ful) | (msw & 0x0f)); 473 kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0eul) | (msw & 0x0f));
489} 474}
490EXPORT_SYMBOL_GPL(kvm_lmsw); 475EXPORT_SYMBOL_GPL(kvm_lmsw);
491 476
@@ -517,7 +502,6 @@ void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
517 } 502 }
518 kvm_x86_ops->set_cr4(vcpu, cr4); 503 kvm_x86_ops->set_cr4(vcpu, cr4);
519 vcpu->arch.cr4 = cr4; 504 vcpu->arch.cr4 = cr4;
520 vcpu->arch.mmu.base_role.cr4_pge = (cr4 & X86_CR4_PGE) && !tdp_enabled;
521 kvm_mmu_reset_context(vcpu); 505 kvm_mmu_reset_context(vcpu);
522} 506}
523EXPORT_SYMBOL_GPL(kvm_set_cr4); 507EXPORT_SYMBOL_GPL(kvm_set_cr4);
@@ -592,6 +576,80 @@ unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
592} 576}
593EXPORT_SYMBOL_GPL(kvm_get_cr8); 577EXPORT_SYMBOL_GPL(kvm_get_cr8);
594 578
579int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
580{
581 switch (dr) {
582 case 0 ... 3:
583 vcpu->arch.db[dr] = val;
584 if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
585 vcpu->arch.eff_db[dr] = val;
586 break;
587 case 4:
588 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) {
589 kvm_queue_exception(vcpu, UD_VECTOR);
590 return 1;
591 }
592 /* fall through */
593 case 6:
594 if (val & 0xffffffff00000000ULL) {
595 kvm_inject_gp(vcpu, 0);
596 return 1;
597 }
598 vcpu->arch.dr6 = (val & DR6_VOLATILE) | DR6_FIXED_1;
599 break;
600 case 5:
601 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) {
602 kvm_queue_exception(vcpu, UD_VECTOR);
603 return 1;
604 }
605 /* fall through */
606 default: /* 7 */
607 if (val & 0xffffffff00000000ULL) {
608 kvm_inject_gp(vcpu, 0);
609 return 1;
610 }
611 vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1;
612 if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
613 kvm_x86_ops->set_dr7(vcpu, vcpu->arch.dr7);
614 vcpu->arch.switch_db_regs = (val & DR7_BP_EN_MASK);
615 }
616 break;
617 }
618
619 return 0;
620}
621EXPORT_SYMBOL_GPL(kvm_set_dr);
622
623int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
624{
625 switch (dr) {
626 case 0 ... 3:
627 *val = vcpu->arch.db[dr];
628 break;
629 case 4:
630 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) {
631 kvm_queue_exception(vcpu, UD_VECTOR);
632 return 1;
633 }
634 /* fall through */
635 case 6:
636 *val = vcpu->arch.dr6;
637 break;
638 case 5:
639 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) {
640 kvm_queue_exception(vcpu, UD_VECTOR);
641 return 1;
642 }
643 /* fall through */
644 default: /* 7 */
645 *val = vcpu->arch.dr7;
646 break;
647 }
648
649 return 0;
650}
651EXPORT_SYMBOL_GPL(kvm_get_dr);
652
595static inline u32 bit(int bitno) 653static inline u32 bit(int bitno)
596{ 654{
597 return 1 << (bitno & 31); 655 return 1 << (bitno & 31);
@@ -606,9 +664,10 @@ static inline u32 bit(int bitno)
606 * kvm-specific. Those are put in the beginning of the list. 664 * kvm-specific. Those are put in the beginning of the list.
607 */ 665 */
608 666
609#define KVM_SAVE_MSRS_BEGIN 5 667#define KVM_SAVE_MSRS_BEGIN 7
610static u32 msrs_to_save[] = { 668static u32 msrs_to_save[] = {
611 MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, 669 MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
670 MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
612 HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL, 671 HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
613 HV_X64_MSR_APIC_ASSIST_PAGE, 672 HV_X64_MSR_APIC_ASSIST_PAGE,
614 MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, 673 MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
@@ -625,48 +684,42 @@ static u32 emulated_msrs[] = {
625 MSR_IA32_MISC_ENABLE, 684 MSR_IA32_MISC_ENABLE,
626}; 685};
627 686
628static void set_efer(struct kvm_vcpu *vcpu, u64 efer) 687static int set_efer(struct kvm_vcpu *vcpu, u64 efer)
629{ 688{
630 if (efer & efer_reserved_bits) { 689 if (efer & efer_reserved_bits)
631 kvm_inject_gp(vcpu, 0); 690 return 1;
632 return;
633 }
634 691
635 if (is_paging(vcpu) 692 if (is_paging(vcpu)
636 && (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME)) { 693 && (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME))
637 kvm_inject_gp(vcpu, 0); 694 return 1;
638 return;
639 }
640 695
641 if (efer & EFER_FFXSR) { 696 if (efer & EFER_FFXSR) {
642 struct kvm_cpuid_entry2 *feat; 697 struct kvm_cpuid_entry2 *feat;
643 698
644 feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); 699 feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
645 if (!feat || !(feat->edx & bit(X86_FEATURE_FXSR_OPT))) { 700 if (!feat || !(feat->edx & bit(X86_FEATURE_FXSR_OPT)))
646 kvm_inject_gp(vcpu, 0); 701 return 1;
647 return;
648 }
649 } 702 }
650 703
651 if (efer & EFER_SVME) { 704 if (efer & EFER_SVME) {
652 struct kvm_cpuid_entry2 *feat; 705 struct kvm_cpuid_entry2 *feat;
653 706
654 feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); 707 feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
655 if (!feat || !(feat->ecx & bit(X86_FEATURE_SVM))) { 708 if (!feat || !(feat->ecx & bit(X86_FEATURE_SVM)))
656 kvm_inject_gp(vcpu, 0); 709 return 1;
657 return;
658 }
659 } 710 }
660 711
661 kvm_x86_ops->set_efer(vcpu, efer);
662
663 efer &= ~EFER_LMA; 712 efer &= ~EFER_LMA;
664 efer |= vcpu->arch.efer & EFER_LMA; 713 efer |= vcpu->arch.efer & EFER_LMA;
665 714
715 kvm_x86_ops->set_efer(vcpu, efer);
716
666 vcpu->arch.efer = efer; 717 vcpu->arch.efer = efer;
667 718
668 vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled; 719 vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled;
669 kvm_mmu_reset_context(vcpu); 720 kvm_mmu_reset_context(vcpu);
721
722 return 0;
670} 723}
671 724
672void kvm_enable_efer_bits(u64 mask) 725void kvm_enable_efer_bits(u64 mask)
@@ -696,14 +749,22 @@ static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
696 749
697static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) 750static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
698{ 751{
699 static int version; 752 int version;
753 int r;
700 struct pvclock_wall_clock wc; 754 struct pvclock_wall_clock wc;
701 struct timespec boot; 755 struct timespec boot;
702 756
703 if (!wall_clock) 757 if (!wall_clock)
704 return; 758 return;
705 759
706 version++; 760 r = kvm_read_guest(kvm, wall_clock, &version, sizeof(version));
761 if (r)
762 return;
763
764 if (version & 1)
765 ++version; /* first time write, random junk */
766
767 ++version;
707 768
708 kvm_write_guest(kvm, wall_clock, &version, sizeof(version)); 769 kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
709 770
@@ -796,6 +857,8 @@ static void kvm_write_guest_time(struct kvm_vcpu *v)
796 vcpu->hv_clock.system_time = ts.tv_nsec + 857 vcpu->hv_clock.system_time = ts.tv_nsec +
797 (NSEC_PER_SEC * (u64)ts.tv_sec) + v->kvm->arch.kvmclock_offset; 858 (NSEC_PER_SEC * (u64)ts.tv_sec) + v->kvm->arch.kvmclock_offset;
798 859
860 vcpu->hv_clock.flags = 0;
861
799 /* 862 /*
800 * The interface expects us to write an even number signaling that the 863 * The interface expects us to write an even number signaling that the
801 * update is finished. Since the guest won't see the intermediate 864 * update is finished. Since the guest won't see the intermediate
@@ -1087,10 +1150,10 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1087{ 1150{
1088 switch (msr) { 1151 switch (msr) {
1089 case MSR_EFER: 1152 case MSR_EFER:
1090 set_efer(vcpu, data); 1153 return set_efer(vcpu, data);
1091 break;
1092 case MSR_K7_HWCR: 1154 case MSR_K7_HWCR:
1093 data &= ~(u64)0x40; /* ignore flush filter disable */ 1155 data &= ~(u64)0x40; /* ignore flush filter disable */
1156 data &= ~(u64)0x100; /* ignore ignne emulation enable */
1094 if (data != 0) { 1157 if (data != 0) {
1095 pr_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n", 1158 pr_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n",
1096 data); 1159 data);
@@ -1133,10 +1196,12 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1133 case MSR_IA32_MISC_ENABLE: 1196 case MSR_IA32_MISC_ENABLE:
1134 vcpu->arch.ia32_misc_enable_msr = data; 1197 vcpu->arch.ia32_misc_enable_msr = data;
1135 break; 1198 break;
1199 case MSR_KVM_WALL_CLOCK_NEW:
1136 case MSR_KVM_WALL_CLOCK: 1200 case MSR_KVM_WALL_CLOCK:
1137 vcpu->kvm->arch.wall_clock = data; 1201 vcpu->kvm->arch.wall_clock = data;
1138 kvm_write_wall_clock(vcpu->kvm, data); 1202 kvm_write_wall_clock(vcpu->kvm, data);
1139 break; 1203 break;
1204 case MSR_KVM_SYSTEM_TIME_NEW:
1140 case MSR_KVM_SYSTEM_TIME: { 1205 case MSR_KVM_SYSTEM_TIME: {
1141 if (vcpu->arch.time_page) { 1206 if (vcpu->arch.time_page) {
1142 kvm_release_page_dirty(vcpu->arch.time_page); 1207 kvm_release_page_dirty(vcpu->arch.time_page);
@@ -1408,9 +1473,11 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1408 data = vcpu->arch.efer; 1473 data = vcpu->arch.efer;
1409 break; 1474 break;
1410 case MSR_KVM_WALL_CLOCK: 1475 case MSR_KVM_WALL_CLOCK:
1476 case MSR_KVM_WALL_CLOCK_NEW:
1411 data = vcpu->kvm->arch.wall_clock; 1477 data = vcpu->kvm->arch.wall_clock;
1412 break; 1478 break;
1413 case MSR_KVM_SYSTEM_TIME: 1479 case MSR_KVM_SYSTEM_TIME:
1480 case MSR_KVM_SYSTEM_TIME_NEW:
1414 data = vcpu->arch.time; 1481 data = vcpu->arch.time;
1415 break; 1482 break;
1416 case MSR_IA32_P5_MC_ADDR: 1483 case MSR_IA32_P5_MC_ADDR:
@@ -1549,6 +1616,7 @@ int kvm_dev_ioctl_check_extension(long ext)
1549 case KVM_CAP_HYPERV_VAPIC: 1616 case KVM_CAP_HYPERV_VAPIC:
1550 case KVM_CAP_HYPERV_SPIN: 1617 case KVM_CAP_HYPERV_SPIN:
1551 case KVM_CAP_PCI_SEGMENT: 1618 case KVM_CAP_PCI_SEGMENT:
1619 case KVM_CAP_DEBUGREGS:
1552 case KVM_CAP_X86_ROBUST_SINGLESTEP: 1620 case KVM_CAP_X86_ROBUST_SINGLESTEP:
1553 r = 1; 1621 r = 1;
1554 break; 1622 break;
@@ -1769,6 +1837,7 @@ static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
1769{ 1837{
1770 int r; 1838 int r;
1771 1839
1840 vcpu_load(vcpu);
1772 r = -E2BIG; 1841 r = -E2BIG;
1773 if (cpuid->nent < vcpu->arch.cpuid_nent) 1842 if (cpuid->nent < vcpu->arch.cpuid_nent)
1774 goto out; 1843 goto out;
@@ -1780,6 +1849,7 @@ static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
1780 1849
1781out: 1850out:
1782 cpuid->nent = vcpu->arch.cpuid_nent; 1851 cpuid->nent = vcpu->arch.cpuid_nent;
1852 vcpu_put(vcpu);
1783 return r; 1853 return r;
1784} 1854}
1785 1855
@@ -1910,6 +1980,24 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
1910 } 1980 }
1911 break; 1981 break;
1912 } 1982 }
1983 case KVM_CPUID_SIGNATURE: {
1984 char signature[12] = "KVMKVMKVM\0\0";
1985 u32 *sigptr = (u32 *)signature;
1986 entry->eax = 0;
1987 entry->ebx = sigptr[0];
1988 entry->ecx = sigptr[1];
1989 entry->edx = sigptr[2];
1990 break;
1991 }
1992 case KVM_CPUID_FEATURES:
1993 entry->eax = (1 << KVM_FEATURE_CLOCKSOURCE) |
1994 (1 << KVM_FEATURE_NOP_IO_DELAY) |
1995 (1 << KVM_FEATURE_CLOCKSOURCE2) |
1996 (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT);
1997 entry->ebx = 0;
1998 entry->ecx = 0;
1999 entry->edx = 0;
2000 break;
1913 case 0x80000000: 2001 case 0x80000000:
1914 entry->eax = min(entry->eax, 0x8000001a); 2002 entry->eax = min(entry->eax, 0x8000001a);
1915 break; 2003 break;
@@ -1918,6 +2006,9 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
1918 entry->ecx &= kvm_supported_word6_x86_features; 2006 entry->ecx &= kvm_supported_word6_x86_features;
1919 break; 2007 break;
1920 } 2008 }
2009
2010 kvm_x86_ops->set_supported_cpuid(function, entry);
2011
1921 put_cpu(); 2012 put_cpu();
1922} 2013}
1923 2014
@@ -1953,6 +2044,23 @@ static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
1953 for (func = 0x80000001; func <= limit && nent < cpuid->nent; ++func) 2044 for (func = 0x80000001; func <= limit && nent < cpuid->nent; ++func)
1954 do_cpuid_ent(&cpuid_entries[nent], func, 0, 2045 do_cpuid_ent(&cpuid_entries[nent], func, 0,
1955 &nent, cpuid->nent); 2046 &nent, cpuid->nent);
2047
2048
2049
2050 r = -E2BIG;
2051 if (nent >= cpuid->nent)
2052 goto out_free;
2053
2054 do_cpuid_ent(&cpuid_entries[nent], KVM_CPUID_SIGNATURE, 0, &nent,
2055 cpuid->nent);
2056
2057 r = -E2BIG;
2058 if (nent >= cpuid->nent)
2059 goto out_free;
2060
2061 do_cpuid_ent(&cpuid_entries[nent], KVM_CPUID_FEATURES, 0, &nent,
2062 cpuid->nent);
2063
1956 r = -E2BIG; 2064 r = -E2BIG;
1957 if (nent >= cpuid->nent) 2065 if (nent >= cpuid->nent)
1958 goto out_free; 2066 goto out_free;
@@ -2032,6 +2140,7 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
2032 int r; 2140 int r;
2033 unsigned bank_num = mcg_cap & 0xff, bank; 2141 unsigned bank_num = mcg_cap & 0xff, bank;
2034 2142
2143 vcpu_load(vcpu);
2035 r = -EINVAL; 2144 r = -EINVAL;
2036 if (!bank_num || bank_num >= KVM_MAX_MCE_BANKS) 2145 if (!bank_num || bank_num >= KVM_MAX_MCE_BANKS)
2037 goto out; 2146 goto out;
@@ -2046,6 +2155,7 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
2046 for (bank = 0; bank < bank_num; bank++) 2155 for (bank = 0; bank < bank_num; bank++)
2047 vcpu->arch.mce_banks[bank*4] = ~(u64)0; 2156 vcpu->arch.mce_banks[bank*4] = ~(u64)0;
2048out: 2157out:
2158 vcpu_put(vcpu);
2049 return r; 2159 return r;
2050} 2160}
2051 2161
@@ -2105,14 +2215,20 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
2105{ 2215{
2106 vcpu_load(vcpu); 2216 vcpu_load(vcpu);
2107 2217
2108 events->exception.injected = vcpu->arch.exception.pending; 2218 events->exception.injected =
2219 vcpu->arch.exception.pending &&
2220 !kvm_exception_is_soft(vcpu->arch.exception.nr);
2109 events->exception.nr = vcpu->arch.exception.nr; 2221 events->exception.nr = vcpu->arch.exception.nr;
2110 events->exception.has_error_code = vcpu->arch.exception.has_error_code; 2222 events->exception.has_error_code = vcpu->arch.exception.has_error_code;
2111 events->exception.error_code = vcpu->arch.exception.error_code; 2223 events->exception.error_code = vcpu->arch.exception.error_code;
2112 2224
2113 events->interrupt.injected = vcpu->arch.interrupt.pending; 2225 events->interrupt.injected =
2226 vcpu->arch.interrupt.pending && !vcpu->arch.interrupt.soft;
2114 events->interrupt.nr = vcpu->arch.interrupt.nr; 2227 events->interrupt.nr = vcpu->arch.interrupt.nr;
2115 events->interrupt.soft = vcpu->arch.interrupt.soft; 2228 events->interrupt.soft = 0;
2229 events->interrupt.shadow =
2230 kvm_x86_ops->get_interrupt_shadow(vcpu,
2231 KVM_X86_SHADOW_INT_MOV_SS | KVM_X86_SHADOW_INT_STI);
2116 2232
2117 events->nmi.injected = vcpu->arch.nmi_injected; 2233 events->nmi.injected = vcpu->arch.nmi_injected;
2118 events->nmi.pending = vcpu->arch.nmi_pending; 2234 events->nmi.pending = vcpu->arch.nmi_pending;
@@ -2121,7 +2237,8 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
2121 events->sipi_vector = vcpu->arch.sipi_vector; 2237 events->sipi_vector = vcpu->arch.sipi_vector;
2122 2238
2123 events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING 2239 events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING
2124 | KVM_VCPUEVENT_VALID_SIPI_VECTOR); 2240 | KVM_VCPUEVENT_VALID_SIPI_VECTOR
2241 | KVM_VCPUEVENT_VALID_SHADOW);
2125 2242
2126 vcpu_put(vcpu); 2243 vcpu_put(vcpu);
2127} 2244}
@@ -2130,7 +2247,8 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
2130 struct kvm_vcpu_events *events) 2247 struct kvm_vcpu_events *events)
2131{ 2248{
2132 if (events->flags & ~(KVM_VCPUEVENT_VALID_NMI_PENDING 2249 if (events->flags & ~(KVM_VCPUEVENT_VALID_NMI_PENDING
2133 | KVM_VCPUEVENT_VALID_SIPI_VECTOR)) 2250 | KVM_VCPUEVENT_VALID_SIPI_VECTOR
2251 | KVM_VCPUEVENT_VALID_SHADOW))
2134 return -EINVAL; 2252 return -EINVAL;
2135 2253
2136 vcpu_load(vcpu); 2254 vcpu_load(vcpu);
@@ -2145,6 +2263,9 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
2145 vcpu->arch.interrupt.soft = events->interrupt.soft; 2263 vcpu->arch.interrupt.soft = events->interrupt.soft;
2146 if (vcpu->arch.interrupt.pending && irqchip_in_kernel(vcpu->kvm)) 2264 if (vcpu->arch.interrupt.pending && irqchip_in_kernel(vcpu->kvm))
2147 kvm_pic_clear_isr_ack(vcpu->kvm); 2265 kvm_pic_clear_isr_ack(vcpu->kvm);
2266 if (events->flags & KVM_VCPUEVENT_VALID_SHADOW)
2267 kvm_x86_ops->set_interrupt_shadow(vcpu,
2268 events->interrupt.shadow);
2148 2269
2149 vcpu->arch.nmi_injected = events->nmi.injected; 2270 vcpu->arch.nmi_injected = events->nmi.injected;
2150 if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING) 2271 if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING)
@@ -2159,6 +2280,36 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
2159 return 0; 2280 return 0;
2160} 2281}
2161 2282
2283static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu,
2284 struct kvm_debugregs *dbgregs)
2285{
2286 vcpu_load(vcpu);
2287
2288 memcpy(dbgregs->db, vcpu->arch.db, sizeof(vcpu->arch.db));
2289 dbgregs->dr6 = vcpu->arch.dr6;
2290 dbgregs->dr7 = vcpu->arch.dr7;
2291 dbgregs->flags = 0;
2292
2293 vcpu_put(vcpu);
2294}
2295
2296static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
2297 struct kvm_debugregs *dbgregs)
2298{
2299 if (dbgregs->flags)
2300 return -EINVAL;
2301
2302 vcpu_load(vcpu);
2303
2304 memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db));
2305 vcpu->arch.dr6 = dbgregs->dr6;
2306 vcpu->arch.dr7 = dbgregs->dr7;
2307
2308 vcpu_put(vcpu);
2309
2310 return 0;
2311}
2312
2162long kvm_arch_vcpu_ioctl(struct file *filp, 2313long kvm_arch_vcpu_ioctl(struct file *filp,
2163 unsigned int ioctl, unsigned long arg) 2314 unsigned int ioctl, unsigned long arg)
2164{ 2315{
@@ -2313,7 +2464,9 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
2313 r = -EFAULT; 2464 r = -EFAULT;
2314 if (copy_from_user(&mce, argp, sizeof mce)) 2465 if (copy_from_user(&mce, argp, sizeof mce))
2315 goto out; 2466 goto out;
2467 vcpu_load(vcpu);
2316 r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce); 2468 r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce);
2469 vcpu_put(vcpu);
2317 break; 2470 break;
2318 } 2471 }
2319 case KVM_GET_VCPU_EVENTS: { 2472 case KVM_GET_VCPU_EVENTS: {
@@ -2337,6 +2490,29 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
2337 r = kvm_vcpu_ioctl_x86_set_vcpu_events(vcpu, &events); 2490 r = kvm_vcpu_ioctl_x86_set_vcpu_events(vcpu, &events);
2338 break; 2491 break;
2339 } 2492 }
2493 case KVM_GET_DEBUGREGS: {
2494 struct kvm_debugregs dbgregs;
2495
2496 kvm_vcpu_ioctl_x86_get_debugregs(vcpu, &dbgregs);
2497
2498 r = -EFAULT;
2499 if (copy_to_user(argp, &dbgregs,
2500 sizeof(struct kvm_debugregs)))
2501 break;
2502 r = 0;
2503 break;
2504 }
2505 case KVM_SET_DEBUGREGS: {
2506 struct kvm_debugregs dbgregs;
2507
2508 r = -EFAULT;
2509 if (copy_from_user(&dbgregs, argp,
2510 sizeof(struct kvm_debugregs)))
2511 break;
2512
2513 r = kvm_vcpu_ioctl_x86_set_debugregs(vcpu, &dbgregs);
2514 break;
2515 }
2340 default: 2516 default:
2341 r = -EINVAL; 2517 r = -EINVAL;
2342 } 2518 }
@@ -2390,7 +2566,7 @@ gfn_t unalias_gfn_instantiation(struct kvm *kvm, gfn_t gfn)
2390 struct kvm_mem_alias *alias; 2566 struct kvm_mem_alias *alias;
2391 struct kvm_mem_aliases *aliases; 2567 struct kvm_mem_aliases *aliases;
2392 2568
2393 aliases = rcu_dereference(kvm->arch.aliases); 2569 aliases = kvm_aliases(kvm);
2394 2570
2395 for (i = 0; i < aliases->naliases; ++i) { 2571 for (i = 0; i < aliases->naliases; ++i) {
2396 alias = &aliases->aliases[i]; 2572 alias = &aliases->aliases[i];
@@ -2409,7 +2585,7 @@ gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
2409 struct kvm_mem_alias *alias; 2585 struct kvm_mem_alias *alias;
2410 struct kvm_mem_aliases *aliases; 2586 struct kvm_mem_aliases *aliases;
2411 2587
2412 aliases = rcu_dereference(kvm->arch.aliases); 2588 aliases = kvm_aliases(kvm);
2413 2589
2414 for (i = 0; i < aliases->naliases; ++i) { 2590 for (i = 0; i < aliases->naliases; ++i) {
2415 alias = &aliases->aliases[i]; 2591 alias = &aliases->aliases[i];
@@ -2804,11 +2980,13 @@ long kvm_arch_vm_ioctl(struct file *filp,
2804 r = -EFAULT; 2980 r = -EFAULT;
2805 if (copy_from_user(&irq_event, argp, sizeof irq_event)) 2981 if (copy_from_user(&irq_event, argp, sizeof irq_event))
2806 goto out; 2982 goto out;
2983 r = -ENXIO;
2807 if (irqchip_in_kernel(kvm)) { 2984 if (irqchip_in_kernel(kvm)) {
2808 __s32 status; 2985 __s32 status;
2809 status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, 2986 status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
2810 irq_event.irq, irq_event.level); 2987 irq_event.irq, irq_event.level);
2811 if (ioctl == KVM_IRQ_LINE_STATUS) { 2988 if (ioctl == KVM_IRQ_LINE_STATUS) {
2989 r = -EFAULT;
2812 irq_event.status = status; 2990 irq_event.status = status;
2813 if (copy_to_user(argp, &irq_event, 2991 if (copy_to_user(argp, &irq_event,
2814 sizeof irq_event)) 2992 sizeof irq_event))
@@ -3024,6 +3202,18 @@ static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v)
3024 return kvm_io_bus_read(vcpu->kvm, KVM_MMIO_BUS, addr, len, v); 3202 return kvm_io_bus_read(vcpu->kvm, KVM_MMIO_BUS, addr, len, v);
3025} 3203}
3026 3204
3205static void kvm_set_segment(struct kvm_vcpu *vcpu,
3206 struct kvm_segment *var, int seg)
3207{
3208 kvm_x86_ops->set_segment(vcpu, var, seg);
3209}
3210
3211void kvm_get_segment(struct kvm_vcpu *vcpu,
3212 struct kvm_segment *var, int seg)
3213{
3214 kvm_x86_ops->get_segment(vcpu, var, seg);
3215}
3216
3027gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) 3217gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, u32 *error)
3028{ 3218{
3029 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; 3219 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
@@ -3104,14 +3294,17 @@ static int kvm_read_guest_virt_system(gva_t addr, void *val, unsigned int bytes,
3104 return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, error); 3294 return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, error);
3105} 3295}
3106 3296
3107static int kvm_write_guest_virt(gva_t addr, void *val, unsigned int bytes, 3297static int kvm_write_guest_virt_system(gva_t addr, void *val,
3108 struct kvm_vcpu *vcpu, u32 *error) 3298 unsigned int bytes,
3299 struct kvm_vcpu *vcpu,
3300 u32 *error)
3109{ 3301{
3110 void *data = val; 3302 void *data = val;
3111 int r = X86EMUL_CONTINUE; 3303 int r = X86EMUL_CONTINUE;
3112 3304
3113 while (bytes) { 3305 while (bytes) {
3114 gpa_t gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, error); 3306 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr,
3307 PFERR_WRITE_MASK, error);
3115 unsigned offset = addr & (PAGE_SIZE-1); 3308 unsigned offset = addr & (PAGE_SIZE-1);
3116 unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset); 3309 unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset);
3117 int ret; 3310 int ret;
@@ -3134,7 +3327,6 @@ out:
3134 return r; 3327 return r;
3135} 3328}
3136 3329
3137
3138static int emulator_read_emulated(unsigned long addr, 3330static int emulator_read_emulated(unsigned long addr,
3139 void *val, 3331 void *val,
3140 unsigned int bytes, 3332 unsigned int bytes,
@@ -3237,9 +3429,9 @@ mmio:
3237} 3429}
3238 3430
3239int emulator_write_emulated(unsigned long addr, 3431int emulator_write_emulated(unsigned long addr,
3240 const void *val, 3432 const void *val,
3241 unsigned int bytes, 3433 unsigned int bytes,
3242 struct kvm_vcpu *vcpu) 3434 struct kvm_vcpu *vcpu)
3243{ 3435{
3244 /* Crossing a page boundary? */ 3436 /* Crossing a page boundary? */
3245 if (((addr + bytes - 1) ^ addr) & PAGE_MASK) { 3437 if (((addr + bytes - 1) ^ addr) & PAGE_MASK) {
@@ -3257,45 +3449,150 @@ int emulator_write_emulated(unsigned long addr,
3257} 3449}
3258EXPORT_SYMBOL_GPL(emulator_write_emulated); 3450EXPORT_SYMBOL_GPL(emulator_write_emulated);
3259 3451
3452#define CMPXCHG_TYPE(t, ptr, old, new) \
3453 (cmpxchg((t *)(ptr), *(t *)(old), *(t *)(new)) == *(t *)(old))
3454
3455#ifdef CONFIG_X86_64
3456# define CMPXCHG64(ptr, old, new) CMPXCHG_TYPE(u64, ptr, old, new)
3457#else
3458# define CMPXCHG64(ptr, old, new) \
3459 (cmpxchg64((u64 *)(ptr), *(u64 *)(old), *(u64 *)(new)) == *(u64 *)(old))
3460#endif
3461
3260static int emulator_cmpxchg_emulated(unsigned long addr, 3462static int emulator_cmpxchg_emulated(unsigned long addr,
3261 const void *old, 3463 const void *old,
3262 const void *new, 3464 const void *new,
3263 unsigned int bytes, 3465 unsigned int bytes,
3264 struct kvm_vcpu *vcpu) 3466 struct kvm_vcpu *vcpu)
3265{ 3467{
3266 printk_once(KERN_WARNING "kvm: emulating exchange as write\n"); 3468 gpa_t gpa;
3267#ifndef CONFIG_X86_64 3469 struct page *page;
3268 /* guests cmpxchg8b have to be emulated atomically */ 3470 char *kaddr;
3269 if (bytes == 8) { 3471 bool exchanged;
3270 gpa_t gpa;
3271 struct page *page;
3272 char *kaddr;
3273 u64 val;
3274 3472
3275 gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, NULL); 3473 /* guests cmpxchg8b have to be emulated atomically */
3474 if (bytes > 8 || (bytes & (bytes - 1)))
3475 goto emul_write;
3276 3476
3277 if (gpa == UNMAPPED_GVA || 3477 gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, NULL);
3278 (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
3279 goto emul_write;
3280 3478
3281 if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK)) 3479 if (gpa == UNMAPPED_GVA ||
3282 goto emul_write; 3480 (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
3481 goto emul_write;
3283 3482
3284 val = *(u64 *)new; 3483 if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK))
3484 goto emul_write;
3285 3485
3286 page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); 3486 page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
3287 3487
3288 kaddr = kmap_atomic(page, KM_USER0); 3488 kaddr = kmap_atomic(page, KM_USER0);
3289 set_64bit((u64 *)(kaddr + offset_in_page(gpa)), val); 3489 kaddr += offset_in_page(gpa);
3290 kunmap_atomic(kaddr, KM_USER0); 3490 switch (bytes) {
3291 kvm_release_page_dirty(page); 3491 case 1:
3492 exchanged = CMPXCHG_TYPE(u8, kaddr, old, new);
3493 break;
3494 case 2:
3495 exchanged = CMPXCHG_TYPE(u16, kaddr, old, new);
3496 break;
3497 case 4:
3498 exchanged = CMPXCHG_TYPE(u32, kaddr, old, new);
3499 break;
3500 case 8:
3501 exchanged = CMPXCHG64(kaddr, old, new);
3502 break;
3503 default:
3504 BUG();
3292 } 3505 }
3506 kunmap_atomic(kaddr, KM_USER0);
3507 kvm_release_page_dirty(page);
3508
3509 if (!exchanged)
3510 return X86EMUL_CMPXCHG_FAILED;
3511
3512 kvm_mmu_pte_write(vcpu, gpa, new, bytes, 1);
3513
3514 return X86EMUL_CONTINUE;
3515
3293emul_write: 3516emul_write:
3294#endif 3517 printk_once(KERN_WARNING "kvm: emulating exchange as write\n");
3295 3518
3296 return emulator_write_emulated(addr, new, bytes, vcpu); 3519 return emulator_write_emulated(addr, new, bytes, vcpu);
3297} 3520}
3298 3521
3522static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
3523{
3524 /* TODO: String I/O for in kernel device */
3525 int r;
3526
3527 if (vcpu->arch.pio.in)
3528 r = kvm_io_bus_read(vcpu->kvm, KVM_PIO_BUS, vcpu->arch.pio.port,
3529 vcpu->arch.pio.size, pd);
3530 else
3531 r = kvm_io_bus_write(vcpu->kvm, KVM_PIO_BUS,
3532 vcpu->arch.pio.port, vcpu->arch.pio.size,
3533 pd);
3534 return r;
3535}
3536
3537
3538static int emulator_pio_in_emulated(int size, unsigned short port, void *val,
3539 unsigned int count, struct kvm_vcpu *vcpu)
3540{
3541 if (vcpu->arch.pio.count)
3542 goto data_avail;
3543
3544 trace_kvm_pio(1, port, size, 1);
3545
3546 vcpu->arch.pio.port = port;
3547 vcpu->arch.pio.in = 1;
3548 vcpu->arch.pio.count = count;
3549 vcpu->arch.pio.size = size;
3550
3551 if (!kernel_pio(vcpu, vcpu->arch.pio_data)) {
3552 data_avail:
3553 memcpy(val, vcpu->arch.pio_data, size * count);
3554 vcpu->arch.pio.count = 0;
3555 return 1;
3556 }
3557
3558 vcpu->run->exit_reason = KVM_EXIT_IO;
3559 vcpu->run->io.direction = KVM_EXIT_IO_IN;
3560 vcpu->run->io.size = size;
3561 vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
3562 vcpu->run->io.count = count;
3563 vcpu->run->io.port = port;
3564
3565 return 0;
3566}
3567
3568static int emulator_pio_out_emulated(int size, unsigned short port,
3569 const void *val, unsigned int count,
3570 struct kvm_vcpu *vcpu)
3571{
3572 trace_kvm_pio(0, port, size, 1);
3573
3574 vcpu->arch.pio.port = port;
3575 vcpu->arch.pio.in = 0;
3576 vcpu->arch.pio.count = count;
3577 vcpu->arch.pio.size = size;
3578
3579 memcpy(vcpu->arch.pio_data, val, size * count);
3580
3581 if (!kernel_pio(vcpu, vcpu->arch.pio_data)) {
3582 vcpu->arch.pio.count = 0;
3583 return 1;
3584 }
3585
3586 vcpu->run->exit_reason = KVM_EXIT_IO;
3587 vcpu->run->io.direction = KVM_EXIT_IO_OUT;
3588 vcpu->run->io.size = size;
3589 vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
3590 vcpu->run->io.count = count;
3591 vcpu->run->io.port = port;
3592
3593 return 0;
3594}
3595
3299static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg) 3596static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
3300{ 3597{
3301 return kvm_x86_ops->get_segment_base(vcpu, seg); 3598 return kvm_x86_ops->get_segment_base(vcpu, seg);
@@ -3316,14 +3613,14 @@ int emulate_clts(struct kvm_vcpu *vcpu)
3316 3613
3317int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest) 3614int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest)
3318{ 3615{
3319 return kvm_x86_ops->get_dr(ctxt->vcpu, dr, dest); 3616 return kvm_get_dr(ctxt->vcpu, dr, dest);
3320} 3617}
3321 3618
3322int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value) 3619int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value)
3323{ 3620{
3324 unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U; 3621 unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U;
3325 3622
3326 return kvm_x86_ops->set_dr(ctxt->vcpu, dr, value & mask); 3623 return kvm_set_dr(ctxt->vcpu, dr, value & mask);
3327} 3624}
3328 3625
3329void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context) 3626void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
@@ -3344,12 +3641,167 @@ void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
3344} 3641}
3345EXPORT_SYMBOL_GPL(kvm_report_emulation_failure); 3642EXPORT_SYMBOL_GPL(kvm_report_emulation_failure);
3346 3643
3644static u64 mk_cr_64(u64 curr_cr, u32 new_val)
3645{
3646 return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
3647}
3648
3649static unsigned long emulator_get_cr(int cr, struct kvm_vcpu *vcpu)
3650{
3651 unsigned long value;
3652
3653 switch (cr) {
3654 case 0:
3655 value = kvm_read_cr0(vcpu);
3656 break;
3657 case 2:
3658 value = vcpu->arch.cr2;
3659 break;
3660 case 3:
3661 value = vcpu->arch.cr3;
3662 break;
3663 case 4:
3664 value = kvm_read_cr4(vcpu);
3665 break;
3666 case 8:
3667 value = kvm_get_cr8(vcpu);
3668 break;
3669 default:
3670 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
3671 return 0;
3672 }
3673
3674 return value;
3675}
3676
3677static void emulator_set_cr(int cr, unsigned long val, struct kvm_vcpu *vcpu)
3678{
3679 switch (cr) {
3680 case 0:
3681 kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val));
3682 break;
3683 case 2:
3684 vcpu->arch.cr2 = val;
3685 break;
3686 case 3:
3687 kvm_set_cr3(vcpu, val);
3688 break;
3689 case 4:
3690 kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val));
3691 break;
3692 case 8:
3693 kvm_set_cr8(vcpu, val & 0xfUL);
3694 break;
3695 default:
3696 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
3697 }
3698}
3699
3700static int emulator_get_cpl(struct kvm_vcpu *vcpu)
3701{
3702 return kvm_x86_ops->get_cpl(vcpu);
3703}
3704
3705static void emulator_get_gdt(struct desc_ptr *dt, struct kvm_vcpu *vcpu)
3706{
3707 kvm_x86_ops->get_gdt(vcpu, dt);
3708}
3709
3710static bool emulator_get_cached_descriptor(struct desc_struct *desc, int seg,
3711 struct kvm_vcpu *vcpu)
3712{
3713 struct kvm_segment var;
3714
3715 kvm_get_segment(vcpu, &var, seg);
3716
3717 if (var.unusable)
3718 return false;
3719
3720 if (var.g)
3721 var.limit >>= 12;
3722 set_desc_limit(desc, var.limit);
3723 set_desc_base(desc, (unsigned long)var.base);
3724 desc->type = var.type;
3725 desc->s = var.s;
3726 desc->dpl = var.dpl;
3727 desc->p = var.present;
3728 desc->avl = var.avl;
3729 desc->l = var.l;
3730 desc->d = var.db;
3731 desc->g = var.g;
3732
3733 return true;
3734}
3735
3736static void emulator_set_cached_descriptor(struct desc_struct *desc, int seg,
3737 struct kvm_vcpu *vcpu)
3738{
3739 struct kvm_segment var;
3740
3741 /* needed to preserve selector */
3742 kvm_get_segment(vcpu, &var, seg);
3743
3744 var.base = get_desc_base(desc);
3745 var.limit = get_desc_limit(desc);
3746 if (desc->g)
3747 var.limit = (var.limit << 12) | 0xfff;
3748 var.type = desc->type;
3749 var.present = desc->p;
3750 var.dpl = desc->dpl;
3751 var.db = desc->d;
3752 var.s = desc->s;
3753 var.l = desc->l;
3754 var.g = desc->g;
3755 var.avl = desc->avl;
3756 var.present = desc->p;
3757 var.unusable = !var.present;
3758 var.padding = 0;
3759
3760 kvm_set_segment(vcpu, &var, seg);
3761 return;
3762}
3763
3764static u16 emulator_get_segment_selector(int seg, struct kvm_vcpu *vcpu)
3765{
3766 struct kvm_segment kvm_seg;
3767
3768 kvm_get_segment(vcpu, &kvm_seg, seg);
3769 return kvm_seg.selector;
3770}
3771
3772static void emulator_set_segment_selector(u16 sel, int seg,
3773 struct kvm_vcpu *vcpu)
3774{
3775 struct kvm_segment kvm_seg;
3776
3777 kvm_get_segment(vcpu, &kvm_seg, seg);
3778 kvm_seg.selector = sel;
3779 kvm_set_segment(vcpu, &kvm_seg, seg);
3780}
3781
3782static void emulator_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
3783{
3784 kvm_x86_ops->set_rflags(vcpu, rflags);
3785}
3786
3347static struct x86_emulate_ops emulate_ops = { 3787static struct x86_emulate_ops emulate_ops = {
3348 .read_std = kvm_read_guest_virt_system, 3788 .read_std = kvm_read_guest_virt_system,
3789 .write_std = kvm_write_guest_virt_system,
3349 .fetch = kvm_fetch_guest_virt, 3790 .fetch = kvm_fetch_guest_virt,
3350 .read_emulated = emulator_read_emulated, 3791 .read_emulated = emulator_read_emulated,
3351 .write_emulated = emulator_write_emulated, 3792 .write_emulated = emulator_write_emulated,
3352 .cmpxchg_emulated = emulator_cmpxchg_emulated, 3793 .cmpxchg_emulated = emulator_cmpxchg_emulated,
3794 .pio_in_emulated = emulator_pio_in_emulated,
3795 .pio_out_emulated = emulator_pio_out_emulated,
3796 .get_cached_descriptor = emulator_get_cached_descriptor,
3797 .set_cached_descriptor = emulator_set_cached_descriptor,
3798 .get_segment_selector = emulator_get_segment_selector,
3799 .set_segment_selector = emulator_set_segment_selector,
3800 .get_gdt = emulator_get_gdt,
3801 .get_cr = emulator_get_cr,
3802 .set_cr = emulator_set_cr,
3803 .cpl = emulator_get_cpl,
3804 .set_rflags = emulator_set_rflags,
3353}; 3805};
3354 3806
3355static void cache_all_regs(struct kvm_vcpu *vcpu) 3807static void cache_all_regs(struct kvm_vcpu *vcpu)
@@ -3380,14 +3832,14 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
3380 cache_all_regs(vcpu); 3832 cache_all_regs(vcpu);
3381 3833
3382 vcpu->mmio_is_write = 0; 3834 vcpu->mmio_is_write = 0;
3383 vcpu->arch.pio.string = 0;
3384 3835
3385 if (!(emulation_type & EMULTYPE_NO_DECODE)) { 3836 if (!(emulation_type & EMULTYPE_NO_DECODE)) {
3386 int cs_db, cs_l; 3837 int cs_db, cs_l;
3387 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); 3838 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
3388 3839
3389 vcpu->arch.emulate_ctxt.vcpu = vcpu; 3840 vcpu->arch.emulate_ctxt.vcpu = vcpu;
3390 vcpu->arch.emulate_ctxt.eflags = kvm_get_rflags(vcpu); 3841 vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu);
3842 vcpu->arch.emulate_ctxt.eip = kvm_rip_read(vcpu);
3391 vcpu->arch.emulate_ctxt.mode = 3843 vcpu->arch.emulate_ctxt.mode =
3392 (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL : 3844 (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL :
3393 (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM) 3845 (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM)
@@ -3396,6 +3848,7 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
3396 ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; 3848 ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
3397 3849
3398 r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); 3850 r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
3851 trace_kvm_emulate_insn_start(vcpu);
3399 3852
3400 /* Only allow emulation of specific instructions on #UD 3853 /* Only allow emulation of specific instructions on #UD
3401 * (namely VMMCALL, sysenter, sysexit, syscall)*/ 3854 * (namely VMMCALL, sysenter, sysexit, syscall)*/
@@ -3428,6 +3881,7 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
3428 ++vcpu->stat.insn_emulation; 3881 ++vcpu->stat.insn_emulation;
3429 if (r) { 3882 if (r) {
3430 ++vcpu->stat.insn_emulation_fail; 3883 ++vcpu->stat.insn_emulation_fail;
3884 trace_kvm_emulate_insn_failed(vcpu);
3431 if (kvm_mmu_unprotect_page_virt(vcpu, cr2)) 3885 if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
3432 return EMULATE_DONE; 3886 return EMULATE_DONE;
3433 return EMULATE_FAIL; 3887 return EMULATE_FAIL;
@@ -3439,16 +3893,20 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
3439 return EMULATE_DONE; 3893 return EMULATE_DONE;
3440 } 3894 }
3441 3895
3896restart:
3442 r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); 3897 r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
3443 shadow_mask = vcpu->arch.emulate_ctxt.interruptibility; 3898 shadow_mask = vcpu->arch.emulate_ctxt.interruptibility;
3444 3899
3445 if (r == 0) 3900 if (r == 0)
3446 kvm_x86_ops->set_interrupt_shadow(vcpu, shadow_mask); 3901 kvm_x86_ops->set_interrupt_shadow(vcpu, shadow_mask);
3447 3902
3448 if (vcpu->arch.pio.string) 3903 if (vcpu->arch.pio.count) {
3904 if (!vcpu->arch.pio.in)
3905 vcpu->arch.pio.count = 0;
3449 return EMULATE_DO_MMIO; 3906 return EMULATE_DO_MMIO;
3907 }
3450 3908
3451 if ((r || vcpu->mmio_is_write) && run) { 3909 if (r || vcpu->mmio_is_write) {
3452 run->exit_reason = KVM_EXIT_MMIO; 3910 run->exit_reason = KVM_EXIT_MMIO;
3453 run->mmio.phys_addr = vcpu->mmio_phys_addr; 3911 run->mmio.phys_addr = vcpu->mmio_phys_addr;
3454 memcpy(run->mmio.data, vcpu->mmio_data, 8); 3912 memcpy(run->mmio.data, vcpu->mmio_data, 8);
@@ -3458,222 +3916,41 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
3458 3916
3459 if (r) { 3917 if (r) {
3460 if (kvm_mmu_unprotect_page_virt(vcpu, cr2)) 3918 if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
3461 return EMULATE_DONE; 3919 goto done;
3462 if (!vcpu->mmio_needed) { 3920 if (!vcpu->mmio_needed) {
3921 ++vcpu->stat.insn_emulation_fail;
3922 trace_kvm_emulate_insn_failed(vcpu);
3463 kvm_report_emulation_failure(vcpu, "mmio"); 3923 kvm_report_emulation_failure(vcpu, "mmio");
3464 return EMULATE_FAIL; 3924 return EMULATE_FAIL;
3465 } 3925 }
3466 return EMULATE_DO_MMIO; 3926 return EMULATE_DO_MMIO;
3467 } 3927 }
3468 3928
3469 kvm_set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
3470
3471 if (vcpu->mmio_is_write) { 3929 if (vcpu->mmio_is_write) {
3472 vcpu->mmio_needed = 0; 3930 vcpu->mmio_needed = 0;
3473 return EMULATE_DO_MMIO; 3931 return EMULATE_DO_MMIO;
3474 } 3932 }
3475 3933
3476 return EMULATE_DONE; 3934done:
3477} 3935 if (vcpu->arch.exception.pending)
3478EXPORT_SYMBOL_GPL(emulate_instruction); 3936 vcpu->arch.emulate_ctxt.restart = false;
3479
3480static int pio_copy_data(struct kvm_vcpu *vcpu)
3481{
3482 void *p = vcpu->arch.pio_data;
3483 gva_t q = vcpu->arch.pio.guest_gva;
3484 unsigned bytes;
3485 int ret;
3486 u32 error_code;
3487
3488 bytes = vcpu->arch.pio.size * vcpu->arch.pio.cur_count;
3489 if (vcpu->arch.pio.in)
3490 ret = kvm_write_guest_virt(q, p, bytes, vcpu, &error_code);
3491 else
3492 ret = kvm_read_guest_virt(q, p, bytes, vcpu, &error_code);
3493
3494 if (ret == X86EMUL_PROPAGATE_FAULT)
3495 kvm_inject_page_fault(vcpu, q, error_code);
3496
3497 return ret;
3498}
3499
3500int complete_pio(struct kvm_vcpu *vcpu)
3501{
3502 struct kvm_pio_request *io = &vcpu->arch.pio;
3503 long delta;
3504 int r;
3505 unsigned long val;
3506
3507 if (!io->string) {
3508 if (io->in) {
3509 val = kvm_register_read(vcpu, VCPU_REGS_RAX);
3510 memcpy(&val, vcpu->arch.pio_data, io->size);
3511 kvm_register_write(vcpu, VCPU_REGS_RAX, val);
3512 }
3513 } else {
3514 if (io->in) {
3515 r = pio_copy_data(vcpu);
3516 if (r)
3517 goto out;
3518 }
3519
3520 delta = 1;
3521 if (io->rep) {
3522 delta *= io->cur_count;
3523 /*
3524 * The size of the register should really depend on
3525 * current address size.
3526 */
3527 val = kvm_register_read(vcpu, VCPU_REGS_RCX);
3528 val -= delta;
3529 kvm_register_write(vcpu, VCPU_REGS_RCX, val);
3530 }
3531 if (io->down)
3532 delta = -delta;
3533 delta *= io->size;
3534 if (io->in) {
3535 val = kvm_register_read(vcpu, VCPU_REGS_RDI);
3536 val += delta;
3537 kvm_register_write(vcpu, VCPU_REGS_RDI, val);
3538 } else {
3539 val = kvm_register_read(vcpu, VCPU_REGS_RSI);
3540 val += delta;
3541 kvm_register_write(vcpu, VCPU_REGS_RSI, val);
3542 }
3543 }
3544out:
3545 io->count -= io->cur_count;
3546 io->cur_count = 0;
3547
3548 return 0;
3549}
3550
3551static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
3552{
3553 /* TODO: String I/O for in kernel device */
3554 int r;
3555
3556 if (vcpu->arch.pio.in)
3557 r = kvm_io_bus_read(vcpu->kvm, KVM_PIO_BUS, vcpu->arch.pio.port,
3558 vcpu->arch.pio.size, pd);
3559 else
3560 r = kvm_io_bus_write(vcpu->kvm, KVM_PIO_BUS,
3561 vcpu->arch.pio.port, vcpu->arch.pio.size,
3562 pd);
3563 return r;
3564}
3565 3937
3566static int pio_string_write(struct kvm_vcpu *vcpu) 3938 if (vcpu->arch.emulate_ctxt.restart)
3567{ 3939 goto restart;
3568 struct kvm_pio_request *io = &vcpu->arch.pio;
3569 void *pd = vcpu->arch.pio_data;
3570 int i, r = 0;
3571 3940
3572 for (i = 0; i < io->cur_count; i++) { 3941 return EMULATE_DONE;
3573 if (kvm_io_bus_write(vcpu->kvm, KVM_PIO_BUS,
3574 io->port, io->size, pd)) {
3575 r = -EOPNOTSUPP;
3576 break;
3577 }
3578 pd += io->size;
3579 }
3580 return r;
3581}
3582
3583int kvm_emulate_pio(struct kvm_vcpu *vcpu, int in, int size, unsigned port)
3584{
3585 unsigned long val;
3586
3587 trace_kvm_pio(!in, port, size, 1);
3588
3589 vcpu->run->exit_reason = KVM_EXIT_IO;
3590 vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
3591 vcpu->run->io.size = vcpu->arch.pio.size = size;
3592 vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
3593 vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = 1;
3594 vcpu->run->io.port = vcpu->arch.pio.port = port;
3595 vcpu->arch.pio.in = in;
3596 vcpu->arch.pio.string = 0;
3597 vcpu->arch.pio.down = 0;
3598 vcpu->arch.pio.rep = 0;
3599
3600 if (!vcpu->arch.pio.in) {
3601 val = kvm_register_read(vcpu, VCPU_REGS_RAX);
3602 memcpy(vcpu->arch.pio_data, &val, 4);
3603 }
3604
3605 if (!kernel_pio(vcpu, vcpu->arch.pio_data)) {
3606 complete_pio(vcpu);
3607 return 1;
3608 }
3609 return 0;
3610} 3942}
3611EXPORT_SYMBOL_GPL(kvm_emulate_pio); 3943EXPORT_SYMBOL_GPL(emulate_instruction);
3612 3944
3613int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, int in, 3945int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port)
3614 int size, unsigned long count, int down,
3615 gva_t address, int rep, unsigned port)
3616{ 3946{
3617 unsigned now, in_page; 3947 unsigned long val = kvm_register_read(vcpu, VCPU_REGS_RAX);
3618 int ret = 0; 3948 int ret = emulator_pio_out_emulated(size, port, &val, 1, vcpu);
3619 3949 /* do not return to emulator after return from userspace */
3620 trace_kvm_pio(!in, port, size, count); 3950 vcpu->arch.pio.count = 0;
3621
3622 vcpu->run->exit_reason = KVM_EXIT_IO;
3623 vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
3624 vcpu->run->io.size = vcpu->arch.pio.size = size;
3625 vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
3626 vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = count;
3627 vcpu->run->io.port = vcpu->arch.pio.port = port;
3628 vcpu->arch.pio.in = in;
3629 vcpu->arch.pio.string = 1;
3630 vcpu->arch.pio.down = down;
3631 vcpu->arch.pio.rep = rep;
3632
3633 if (!count) {
3634 kvm_x86_ops->skip_emulated_instruction(vcpu);
3635 return 1;
3636 }
3637
3638 if (!down)
3639 in_page = PAGE_SIZE - offset_in_page(address);
3640 else
3641 in_page = offset_in_page(address) + size;
3642 now = min(count, (unsigned long)in_page / size);
3643 if (!now)
3644 now = 1;
3645 if (down) {
3646 /*
3647 * String I/O in reverse. Yuck. Kill the guest, fix later.
3648 */
3649 pr_unimpl(vcpu, "guest string pio down\n");
3650 kvm_inject_gp(vcpu, 0);
3651 return 1;
3652 }
3653 vcpu->run->io.count = now;
3654 vcpu->arch.pio.cur_count = now;
3655
3656 if (vcpu->arch.pio.cur_count == vcpu->arch.pio.count)
3657 kvm_x86_ops->skip_emulated_instruction(vcpu);
3658
3659 vcpu->arch.pio.guest_gva = address;
3660
3661 if (!vcpu->arch.pio.in) {
3662 /* string PIO write */
3663 ret = pio_copy_data(vcpu);
3664 if (ret == X86EMUL_PROPAGATE_FAULT)
3665 return 1;
3666 if (ret == 0 && !pio_string_write(vcpu)) {
3667 complete_pio(vcpu);
3668 if (vcpu->arch.pio.count == 0)
3669 ret = 1;
3670 }
3671 }
3672 /* no string PIO read support yet */
3673
3674 return ret; 3951 return ret;
3675} 3952}
3676EXPORT_SYMBOL_GPL(kvm_emulate_pio_string); 3953EXPORT_SYMBOL_GPL(kvm_fast_pio_out);
3677 3954
3678static void bounce_off(void *info) 3955static void bounce_off(void *info)
3679{ 3956{
@@ -3996,85 +4273,20 @@ int kvm_fix_hypercall(struct kvm_vcpu *vcpu)
3996 return emulator_write_emulated(rip, instruction, 3, vcpu); 4273 return emulator_write_emulated(rip, instruction, 3, vcpu);
3997} 4274}
3998 4275
3999static u64 mk_cr_64(u64 curr_cr, u32 new_val)
4000{
4001 return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
4002}
4003
4004void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base) 4276void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
4005{ 4277{
4006 struct descriptor_table dt = { limit, base }; 4278 struct desc_ptr dt = { limit, base };
4007 4279
4008 kvm_x86_ops->set_gdt(vcpu, &dt); 4280 kvm_x86_ops->set_gdt(vcpu, &dt);
4009} 4281}
4010 4282
4011void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base) 4283void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
4012{ 4284{
4013 struct descriptor_table dt = { limit, base }; 4285 struct desc_ptr dt = { limit, base };
4014 4286
4015 kvm_x86_ops->set_idt(vcpu, &dt); 4287 kvm_x86_ops->set_idt(vcpu, &dt);
4016} 4288}
4017 4289
4018void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
4019 unsigned long *rflags)
4020{
4021 kvm_lmsw(vcpu, msw);
4022 *rflags = kvm_get_rflags(vcpu);
4023}
4024
4025unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
4026{
4027 unsigned long value;
4028
4029 switch (cr) {
4030 case 0:
4031 value = kvm_read_cr0(vcpu);
4032 break;
4033 case 2:
4034 value = vcpu->arch.cr2;
4035 break;
4036 case 3:
4037 value = vcpu->arch.cr3;
4038 break;
4039 case 4:
4040 value = kvm_read_cr4(vcpu);
4041 break;
4042 case 8:
4043 value = kvm_get_cr8(vcpu);
4044 break;
4045 default:
4046 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
4047 return 0;
4048 }
4049
4050 return value;
4051}
4052
4053void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val,
4054 unsigned long *rflags)
4055{
4056 switch (cr) {
4057 case 0:
4058 kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val));
4059 *rflags = kvm_get_rflags(vcpu);
4060 break;
4061 case 2:
4062 vcpu->arch.cr2 = val;
4063 break;
4064 case 3:
4065 kvm_set_cr3(vcpu, val);
4066 break;
4067 case 4:
4068 kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val));
4069 break;
4070 case 8:
4071 kvm_set_cr8(vcpu, val & 0xfUL);
4072 break;
4073 default:
4074 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
4075 }
4076}
4077
4078static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i) 4290static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i)
4079{ 4291{
4080 struct kvm_cpuid_entry2 *e = &vcpu->arch.cpuid_entries[i]; 4292 struct kvm_cpuid_entry2 *e = &vcpu->arch.cpuid_entries[i];
@@ -4138,9 +4350,13 @@ int cpuid_maxphyaddr(struct kvm_vcpu *vcpu)
4138{ 4350{
4139 struct kvm_cpuid_entry2 *best; 4351 struct kvm_cpuid_entry2 *best;
4140 4352
4353 best = kvm_find_cpuid_entry(vcpu, 0x80000000, 0);
4354 if (!best || best->eax < 0x80000008)
4355 goto not_found;
4141 best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0); 4356 best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0);
4142 if (best) 4357 if (best)
4143 return best->eax & 0xff; 4358 return best->eax & 0xff;
4359not_found:
4144 return 36; 4360 return 36;
4145} 4361}
4146 4362
@@ -4254,9 +4470,13 @@ static void inject_pending_event(struct kvm_vcpu *vcpu)
4254{ 4470{
4255 /* try to reinject previous events if any */ 4471 /* try to reinject previous events if any */
4256 if (vcpu->arch.exception.pending) { 4472 if (vcpu->arch.exception.pending) {
4473 trace_kvm_inj_exception(vcpu->arch.exception.nr,
4474 vcpu->arch.exception.has_error_code,
4475 vcpu->arch.exception.error_code);
4257 kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr, 4476 kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr,
4258 vcpu->arch.exception.has_error_code, 4477 vcpu->arch.exception.has_error_code,
4259 vcpu->arch.exception.error_code); 4478 vcpu->arch.exception.error_code,
4479 vcpu->arch.exception.reinject);
4260 return; 4480 return;
4261 } 4481 }
4262 4482
@@ -4486,7 +4706,6 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
4486 } 4706 }
4487 4707
4488 srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); 4708 srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
4489 post_kvm_run_save(vcpu);
4490 4709
4491 vapic_exit(vcpu); 4710 vapic_exit(vcpu);
4492 4711
@@ -4514,26 +4733,17 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
4514 if (!irqchip_in_kernel(vcpu->kvm)) 4733 if (!irqchip_in_kernel(vcpu->kvm))
4515 kvm_set_cr8(vcpu, kvm_run->cr8); 4734 kvm_set_cr8(vcpu, kvm_run->cr8);
4516 4735
4517 if (vcpu->arch.pio.cur_count) { 4736 if (vcpu->arch.pio.count || vcpu->mmio_needed ||
4518 vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); 4737 vcpu->arch.emulate_ctxt.restart) {
4519 r = complete_pio(vcpu); 4738 if (vcpu->mmio_needed) {
4520 srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); 4739 memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
4521 if (r) 4740 vcpu->mmio_read_completed = 1;
4522 goto out; 4741 vcpu->mmio_needed = 0;
4523 } 4742 }
4524 if (vcpu->mmio_needed) {
4525 memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
4526 vcpu->mmio_read_completed = 1;
4527 vcpu->mmio_needed = 0;
4528
4529 vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); 4743 vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
4530 r = emulate_instruction(vcpu, vcpu->arch.mmio_fault_cr2, 0, 4744 r = emulate_instruction(vcpu, 0, 0, EMULTYPE_NO_DECODE);
4531 EMULTYPE_NO_DECODE);
4532 srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); 4745 srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
4533 if (r == EMULATE_DO_MMIO) { 4746 if (r == EMULATE_DO_MMIO) {
4534 /*
4535 * Read-modify-write. Back to userspace.
4536 */
4537 r = 0; 4747 r = 0;
4538 goto out; 4748 goto out;
4539 } 4749 }
@@ -4545,6 +4755,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
4545 r = __vcpu_run(vcpu); 4755 r = __vcpu_run(vcpu);
4546 4756
4547out: 4757out:
4758 post_kvm_run_save(vcpu);
4548 if (vcpu->sigset_active) 4759 if (vcpu->sigset_active)
4549 sigprocmask(SIG_SETMASK, &sigsaved, NULL); 4760 sigprocmask(SIG_SETMASK, &sigsaved, NULL);
4550 4761
@@ -4616,12 +4827,6 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
4616 return 0; 4827 return 0;
4617} 4828}
4618 4829
4619void kvm_get_segment(struct kvm_vcpu *vcpu,
4620 struct kvm_segment *var, int seg)
4621{
4622 kvm_x86_ops->get_segment(vcpu, var, seg);
4623}
4624
4625void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) 4830void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
4626{ 4831{
4627 struct kvm_segment cs; 4832 struct kvm_segment cs;
@@ -4635,7 +4840,7 @@ EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits);
4635int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, 4840int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
4636 struct kvm_sregs *sregs) 4841 struct kvm_sregs *sregs)
4637{ 4842{
4638 struct descriptor_table dt; 4843 struct desc_ptr dt;
4639 4844
4640 vcpu_load(vcpu); 4845 vcpu_load(vcpu);
4641 4846
@@ -4650,11 +4855,11 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
4650 kvm_get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); 4855 kvm_get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
4651 4856
4652 kvm_x86_ops->get_idt(vcpu, &dt); 4857 kvm_x86_ops->get_idt(vcpu, &dt);
4653 sregs->idt.limit = dt.limit; 4858 sregs->idt.limit = dt.size;
4654 sregs->idt.base = dt.base; 4859 sregs->idt.base = dt.address;
4655 kvm_x86_ops->get_gdt(vcpu, &dt); 4860 kvm_x86_ops->get_gdt(vcpu, &dt);
4656 sregs->gdt.limit = dt.limit; 4861 sregs->gdt.limit = dt.size;
4657 sregs->gdt.base = dt.base; 4862 sregs->gdt.base = dt.address;
4658 4863
4659 sregs->cr0 = kvm_read_cr0(vcpu); 4864 sregs->cr0 = kvm_read_cr0(vcpu);
4660 sregs->cr2 = vcpu->arch.cr2; 4865 sregs->cr2 = vcpu->arch.cr2;
@@ -4693,563 +4898,33 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
4693 return 0; 4898 return 0;
4694} 4899}
4695 4900
4696static void kvm_set_segment(struct kvm_vcpu *vcpu, 4901int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason,
4697 struct kvm_segment *var, int seg) 4902 bool has_error_code, u32 error_code)
4698{
4699 kvm_x86_ops->set_segment(vcpu, var, seg);
4700}
4701
4702static void seg_desct_to_kvm_desct(struct desc_struct *seg_desc, u16 selector,
4703 struct kvm_segment *kvm_desct)
4704{
4705 kvm_desct->base = get_desc_base(seg_desc);
4706 kvm_desct->limit = get_desc_limit(seg_desc);
4707 if (seg_desc->g) {
4708 kvm_desct->limit <<= 12;
4709 kvm_desct->limit |= 0xfff;
4710 }
4711 kvm_desct->selector = selector;
4712 kvm_desct->type = seg_desc->type;
4713 kvm_desct->present = seg_desc->p;
4714 kvm_desct->dpl = seg_desc->dpl;
4715 kvm_desct->db = seg_desc->d;
4716 kvm_desct->s = seg_desc->s;
4717 kvm_desct->l = seg_desc->l;
4718 kvm_desct->g = seg_desc->g;
4719 kvm_desct->avl = seg_desc->avl;
4720 if (!selector)
4721 kvm_desct->unusable = 1;
4722 else
4723 kvm_desct->unusable = 0;
4724 kvm_desct->padding = 0;
4725}
4726
4727static void get_segment_descriptor_dtable(struct kvm_vcpu *vcpu,
4728 u16 selector,
4729 struct descriptor_table *dtable)
4730{
4731 if (selector & 1 << 2) {
4732 struct kvm_segment kvm_seg;
4733
4734 kvm_get_segment(vcpu, &kvm_seg, VCPU_SREG_LDTR);
4735
4736 if (kvm_seg.unusable)
4737 dtable->limit = 0;
4738 else
4739 dtable->limit = kvm_seg.limit;
4740 dtable->base = kvm_seg.base;
4741 }
4742 else
4743 kvm_x86_ops->get_gdt(vcpu, dtable);
4744}
4745
4746/* allowed just for 8 bytes segments */
4747static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
4748 struct desc_struct *seg_desc)
4749{
4750 struct descriptor_table dtable;
4751 u16 index = selector >> 3;
4752 int ret;
4753 u32 err;
4754 gva_t addr;
4755
4756 get_segment_descriptor_dtable(vcpu, selector, &dtable);
4757
4758 if (dtable.limit < index * 8 + 7) {
4759 kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc);
4760 return X86EMUL_PROPAGATE_FAULT;
4761 }
4762 addr = dtable.base + index * 8;
4763 ret = kvm_read_guest_virt_system(addr, seg_desc, sizeof(*seg_desc),
4764 vcpu, &err);
4765 if (ret == X86EMUL_PROPAGATE_FAULT)
4766 kvm_inject_page_fault(vcpu, addr, err);
4767
4768 return ret;
4769}
4770
4771/* allowed just for 8 bytes segments */
4772static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
4773 struct desc_struct *seg_desc)
4774{
4775 struct descriptor_table dtable;
4776 u16 index = selector >> 3;
4777
4778 get_segment_descriptor_dtable(vcpu, selector, &dtable);
4779
4780 if (dtable.limit < index * 8 + 7)
4781 return 1;
4782 return kvm_write_guest_virt(dtable.base + index*8, seg_desc, sizeof(*seg_desc), vcpu, NULL);
4783}
4784
4785static gpa_t get_tss_base_addr_write(struct kvm_vcpu *vcpu,
4786 struct desc_struct *seg_desc)
4787{
4788 u32 base_addr = get_desc_base(seg_desc);
4789
4790 return kvm_mmu_gva_to_gpa_write(vcpu, base_addr, NULL);
4791}
4792
4793static gpa_t get_tss_base_addr_read(struct kvm_vcpu *vcpu,
4794 struct desc_struct *seg_desc)
4795{
4796 u32 base_addr = get_desc_base(seg_desc);
4797
4798 return kvm_mmu_gva_to_gpa_read(vcpu, base_addr, NULL);
4799}
4800
4801static u16 get_segment_selector(struct kvm_vcpu *vcpu, int seg)
4802{
4803 struct kvm_segment kvm_seg;
4804
4805 kvm_get_segment(vcpu, &kvm_seg, seg);
4806 return kvm_seg.selector;
4807}
4808
4809static int kvm_load_realmode_segment(struct kvm_vcpu *vcpu, u16 selector, int seg)
4810{
4811 struct kvm_segment segvar = {
4812 .base = selector << 4,
4813 .limit = 0xffff,
4814 .selector = selector,
4815 .type = 3,
4816 .present = 1,
4817 .dpl = 3,
4818 .db = 0,
4819 .s = 1,
4820 .l = 0,
4821 .g = 0,
4822 .avl = 0,
4823 .unusable = 0,
4824 };
4825 kvm_x86_ops->set_segment(vcpu, &segvar, seg);
4826 return X86EMUL_CONTINUE;
4827}
4828
4829static int is_vm86_segment(struct kvm_vcpu *vcpu, int seg)
4830{ 4903{
4831 return (seg != VCPU_SREG_LDTR) && 4904 int cs_db, cs_l, ret;
4832 (seg != VCPU_SREG_TR) && 4905 cache_all_regs(vcpu);
4833 (kvm_get_rflags(vcpu) & X86_EFLAGS_VM);
4834}
4835
4836int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg)
4837{
4838 struct kvm_segment kvm_seg;
4839 struct desc_struct seg_desc;
4840 u8 dpl, rpl, cpl;
4841 unsigned err_vec = GP_VECTOR;
4842 u32 err_code = 0;
4843 bool null_selector = !(selector & ~0x3); /* 0000-0003 are null */
4844 int ret;
4845 4906
4846 if (is_vm86_segment(vcpu, seg) || !is_protmode(vcpu)) 4907 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
4847 return kvm_load_realmode_segment(vcpu, selector, seg);
4848 4908
4849 /* NULL selector is not valid for TR, CS and SS */ 4909 vcpu->arch.emulate_ctxt.vcpu = vcpu;
4850 if ((seg == VCPU_SREG_CS || seg == VCPU_SREG_SS || seg == VCPU_SREG_TR) 4910 vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu);
4851 && null_selector) 4911 vcpu->arch.emulate_ctxt.eip = kvm_rip_read(vcpu);
4852 goto exception; 4912 vcpu->arch.emulate_ctxt.mode =
4913 (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL :
4914 (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM)
4915 ? X86EMUL_MODE_VM86 : cs_l
4916 ? X86EMUL_MODE_PROT64 : cs_db
4917 ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
4853 4918
4854 /* TR should be in GDT only */ 4919 ret = emulator_task_switch(&vcpu->arch.emulate_ctxt, &emulate_ops,
4855 if (seg == VCPU_SREG_TR && (selector & (1 << 2))) 4920 tss_selector, reason, has_error_code,
4856 goto exception; 4921 error_code);
4857 4922
4858 ret = load_guest_segment_descriptor(vcpu, selector, &seg_desc);
4859 if (ret) 4923 if (ret)
4860 return ret; 4924 return EMULATE_FAIL;
4861
4862 seg_desct_to_kvm_desct(&seg_desc, selector, &kvm_seg);
4863
4864 if (null_selector) { /* for NULL selector skip all following checks */
4865 kvm_seg.unusable = 1;
4866 goto load;
4867 }
4868
4869 err_code = selector & 0xfffc;
4870 err_vec = GP_VECTOR;
4871
4872 /* can't load system descriptor into segment selecor */
4873 if (seg <= VCPU_SREG_GS && !kvm_seg.s)
4874 goto exception;
4875
4876 if (!kvm_seg.present) {
4877 err_vec = (seg == VCPU_SREG_SS) ? SS_VECTOR : NP_VECTOR;
4878 goto exception;
4879 }
4880
4881 rpl = selector & 3;
4882 dpl = kvm_seg.dpl;
4883 cpl = kvm_x86_ops->get_cpl(vcpu);
4884
4885 switch (seg) {
4886 case VCPU_SREG_SS:
4887 /*
4888 * segment is not a writable data segment or segment
4889 * selector's RPL != CPL or segment selector's RPL != CPL
4890 */
4891 if (rpl != cpl || (kvm_seg.type & 0xa) != 0x2 || dpl != cpl)
4892 goto exception;
4893 break;
4894 case VCPU_SREG_CS:
4895 if (!(kvm_seg.type & 8))
4896 goto exception;
4897
4898 if (kvm_seg.type & 4) {
4899 /* conforming */
4900 if (dpl > cpl)
4901 goto exception;
4902 } else {
4903 /* nonconforming */
4904 if (rpl > cpl || dpl != cpl)
4905 goto exception;
4906 }
4907 /* CS(RPL) <- CPL */
4908 selector = (selector & 0xfffc) | cpl;
4909 break;
4910 case VCPU_SREG_TR:
4911 if (kvm_seg.s || (kvm_seg.type != 1 && kvm_seg.type != 9))
4912 goto exception;
4913 break;
4914 case VCPU_SREG_LDTR:
4915 if (kvm_seg.s || kvm_seg.type != 2)
4916 goto exception;
4917 break;
4918 default: /* DS, ES, FS, or GS */
4919 /*
4920 * segment is not a data or readable code segment or
4921 * ((segment is a data or nonconforming code segment)
4922 * and (both RPL and CPL > DPL))
4923 */
4924 if ((kvm_seg.type & 0xa) == 0x8 ||
4925 (((kvm_seg.type & 0xc) != 0xc) && (rpl > dpl && cpl > dpl)))
4926 goto exception;
4927 break;
4928 }
4929
4930 if (!kvm_seg.unusable && kvm_seg.s) {
4931 /* mark segment as accessed */
4932 kvm_seg.type |= 1;
4933 seg_desc.type |= 1;
4934 save_guest_segment_descriptor(vcpu, selector, &seg_desc);
4935 }
4936load:
4937 kvm_set_segment(vcpu, &kvm_seg, seg);
4938 return X86EMUL_CONTINUE;
4939exception:
4940 kvm_queue_exception_e(vcpu, err_vec, err_code);
4941 return X86EMUL_PROPAGATE_FAULT;
4942}
4943
4944static void save_state_to_tss32(struct kvm_vcpu *vcpu,
4945 struct tss_segment_32 *tss)
4946{
4947 tss->cr3 = vcpu->arch.cr3;
4948 tss->eip = kvm_rip_read(vcpu);
4949 tss->eflags = kvm_get_rflags(vcpu);
4950 tss->eax = kvm_register_read(vcpu, VCPU_REGS_RAX);
4951 tss->ecx = kvm_register_read(vcpu, VCPU_REGS_RCX);
4952 tss->edx = kvm_register_read(vcpu, VCPU_REGS_RDX);
4953 tss->ebx = kvm_register_read(vcpu, VCPU_REGS_RBX);
4954 tss->esp = kvm_register_read(vcpu, VCPU_REGS_RSP);
4955 tss->ebp = kvm_register_read(vcpu, VCPU_REGS_RBP);
4956 tss->esi = kvm_register_read(vcpu, VCPU_REGS_RSI);
4957 tss->edi = kvm_register_read(vcpu, VCPU_REGS_RDI);
4958 tss->es = get_segment_selector(vcpu, VCPU_SREG_ES);
4959 tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS);
4960 tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS);
4961 tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS);
4962 tss->fs = get_segment_selector(vcpu, VCPU_SREG_FS);
4963 tss->gs = get_segment_selector(vcpu, VCPU_SREG_GS);
4964 tss->ldt_selector = get_segment_selector(vcpu, VCPU_SREG_LDTR);
4965}
4966
4967static void kvm_load_segment_selector(struct kvm_vcpu *vcpu, u16 sel, int seg)
4968{
4969 struct kvm_segment kvm_seg;
4970 kvm_get_segment(vcpu, &kvm_seg, seg);
4971 kvm_seg.selector = sel;
4972 kvm_set_segment(vcpu, &kvm_seg, seg);
4973}
4974
4975static int load_state_from_tss32(struct kvm_vcpu *vcpu,
4976 struct tss_segment_32 *tss)
4977{
4978 kvm_set_cr3(vcpu, tss->cr3);
4979
4980 kvm_rip_write(vcpu, tss->eip);
4981 kvm_set_rflags(vcpu, tss->eflags | 2);
4982
4983 kvm_register_write(vcpu, VCPU_REGS_RAX, tss->eax);
4984 kvm_register_write(vcpu, VCPU_REGS_RCX, tss->ecx);
4985 kvm_register_write(vcpu, VCPU_REGS_RDX, tss->edx);
4986 kvm_register_write(vcpu, VCPU_REGS_RBX, tss->ebx);
4987 kvm_register_write(vcpu, VCPU_REGS_RSP, tss->esp);
4988 kvm_register_write(vcpu, VCPU_REGS_RBP, tss->ebp);
4989 kvm_register_write(vcpu, VCPU_REGS_RSI, tss->esi);
4990 kvm_register_write(vcpu, VCPU_REGS_RDI, tss->edi);
4991
4992 /*
4993 * SDM says that segment selectors are loaded before segment
4994 * descriptors
4995 */
4996 kvm_load_segment_selector(vcpu, tss->ldt_selector, VCPU_SREG_LDTR);
4997 kvm_load_segment_selector(vcpu, tss->es, VCPU_SREG_ES);
4998 kvm_load_segment_selector(vcpu, tss->cs, VCPU_SREG_CS);
4999 kvm_load_segment_selector(vcpu, tss->ss, VCPU_SREG_SS);
5000 kvm_load_segment_selector(vcpu, tss->ds, VCPU_SREG_DS);
5001 kvm_load_segment_selector(vcpu, tss->fs, VCPU_SREG_FS);
5002 kvm_load_segment_selector(vcpu, tss->gs, VCPU_SREG_GS);
5003
5004 /*
5005 * Now load segment descriptors. If fault happenes at this stage
5006 * it is handled in a context of new task
5007 */
5008 if (kvm_load_segment_descriptor(vcpu, tss->ldt_selector, VCPU_SREG_LDTR))
5009 return 1;
5010
5011 if (kvm_load_segment_descriptor(vcpu, tss->es, VCPU_SREG_ES))
5012 return 1;
5013 4925
5014 if (kvm_load_segment_descriptor(vcpu, tss->cs, VCPU_SREG_CS)) 4926 kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
5015 return 1; 4927 return EMULATE_DONE;
5016
5017 if (kvm_load_segment_descriptor(vcpu, tss->ss, VCPU_SREG_SS))
5018 return 1;
5019
5020 if (kvm_load_segment_descriptor(vcpu, tss->ds, VCPU_SREG_DS))
5021 return 1;
5022
5023 if (kvm_load_segment_descriptor(vcpu, tss->fs, VCPU_SREG_FS))
5024 return 1;
5025
5026 if (kvm_load_segment_descriptor(vcpu, tss->gs, VCPU_SREG_GS))
5027 return 1;
5028 return 0;
5029}
5030
5031static void save_state_to_tss16(struct kvm_vcpu *vcpu,
5032 struct tss_segment_16 *tss)
5033{
5034 tss->ip = kvm_rip_read(vcpu);
5035 tss->flag = kvm_get_rflags(vcpu);
5036 tss->ax = kvm_register_read(vcpu, VCPU_REGS_RAX);
5037 tss->cx = kvm_register_read(vcpu, VCPU_REGS_RCX);
5038 tss->dx = kvm_register_read(vcpu, VCPU_REGS_RDX);
5039 tss->bx = kvm_register_read(vcpu, VCPU_REGS_RBX);
5040 tss->sp = kvm_register_read(vcpu, VCPU_REGS_RSP);
5041 tss->bp = kvm_register_read(vcpu, VCPU_REGS_RBP);
5042 tss->si = kvm_register_read(vcpu, VCPU_REGS_RSI);
5043 tss->di = kvm_register_read(vcpu, VCPU_REGS_RDI);
5044
5045 tss->es = get_segment_selector(vcpu, VCPU_SREG_ES);
5046 tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS);
5047 tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS);
5048 tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS);
5049 tss->ldt = get_segment_selector(vcpu, VCPU_SREG_LDTR);
5050}
5051
5052static int load_state_from_tss16(struct kvm_vcpu *vcpu,
5053 struct tss_segment_16 *tss)
5054{
5055 kvm_rip_write(vcpu, tss->ip);
5056 kvm_set_rflags(vcpu, tss->flag | 2);
5057 kvm_register_write(vcpu, VCPU_REGS_RAX, tss->ax);
5058 kvm_register_write(vcpu, VCPU_REGS_RCX, tss->cx);
5059 kvm_register_write(vcpu, VCPU_REGS_RDX, tss->dx);
5060 kvm_register_write(vcpu, VCPU_REGS_RBX, tss->bx);
5061 kvm_register_write(vcpu, VCPU_REGS_RSP, tss->sp);
5062 kvm_register_write(vcpu, VCPU_REGS_RBP, tss->bp);
5063 kvm_register_write(vcpu, VCPU_REGS_RSI, tss->si);
5064 kvm_register_write(vcpu, VCPU_REGS_RDI, tss->di);
5065
5066 /*
5067 * SDM says that segment selectors are loaded before segment
5068 * descriptors
5069 */
5070 kvm_load_segment_selector(vcpu, tss->ldt, VCPU_SREG_LDTR);
5071 kvm_load_segment_selector(vcpu, tss->es, VCPU_SREG_ES);
5072 kvm_load_segment_selector(vcpu, tss->cs, VCPU_SREG_CS);
5073 kvm_load_segment_selector(vcpu, tss->ss, VCPU_SREG_SS);
5074 kvm_load_segment_selector(vcpu, tss->ds, VCPU_SREG_DS);
5075
5076 /*
5077 * Now load segment descriptors. If fault happenes at this stage
5078 * it is handled in a context of new task
5079 */
5080 if (kvm_load_segment_descriptor(vcpu, tss->ldt, VCPU_SREG_LDTR))
5081 return 1;
5082
5083 if (kvm_load_segment_descriptor(vcpu, tss->es, VCPU_SREG_ES))
5084 return 1;
5085
5086 if (kvm_load_segment_descriptor(vcpu, tss->cs, VCPU_SREG_CS))
5087 return 1;
5088
5089 if (kvm_load_segment_descriptor(vcpu, tss->ss, VCPU_SREG_SS))
5090 return 1;
5091
5092 if (kvm_load_segment_descriptor(vcpu, tss->ds, VCPU_SREG_DS))
5093 return 1;
5094 return 0;
5095}
5096
5097static int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector,
5098 u16 old_tss_sel, u32 old_tss_base,
5099 struct desc_struct *nseg_desc)
5100{
5101 struct tss_segment_16 tss_segment_16;
5102 int ret = 0;
5103
5104 if (kvm_read_guest(vcpu->kvm, old_tss_base, &tss_segment_16,
5105 sizeof tss_segment_16))
5106 goto out;
5107
5108 save_state_to_tss16(vcpu, &tss_segment_16);
5109
5110 if (kvm_write_guest(vcpu->kvm, old_tss_base, &tss_segment_16,
5111 sizeof tss_segment_16))
5112 goto out;
5113
5114 if (kvm_read_guest(vcpu->kvm, get_tss_base_addr_read(vcpu, nseg_desc),
5115 &tss_segment_16, sizeof tss_segment_16))
5116 goto out;
5117
5118 if (old_tss_sel != 0xffff) {
5119 tss_segment_16.prev_task_link = old_tss_sel;
5120
5121 if (kvm_write_guest(vcpu->kvm,
5122 get_tss_base_addr_write(vcpu, nseg_desc),
5123 &tss_segment_16.prev_task_link,
5124 sizeof tss_segment_16.prev_task_link))
5125 goto out;
5126 }
5127
5128 if (load_state_from_tss16(vcpu, &tss_segment_16))
5129 goto out;
5130
5131 ret = 1;
5132out:
5133 return ret;
5134}
5135
5136static int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector,
5137 u16 old_tss_sel, u32 old_tss_base,
5138 struct desc_struct *nseg_desc)
5139{
5140 struct tss_segment_32 tss_segment_32;
5141 int ret = 0;
5142
5143 if (kvm_read_guest(vcpu->kvm, old_tss_base, &tss_segment_32,
5144 sizeof tss_segment_32))
5145 goto out;
5146
5147 save_state_to_tss32(vcpu, &tss_segment_32);
5148
5149 if (kvm_write_guest(vcpu->kvm, old_tss_base, &tss_segment_32,
5150 sizeof tss_segment_32))
5151 goto out;
5152
5153 if (kvm_read_guest(vcpu->kvm, get_tss_base_addr_read(vcpu, nseg_desc),
5154 &tss_segment_32, sizeof tss_segment_32))
5155 goto out;
5156
5157 if (old_tss_sel != 0xffff) {
5158 tss_segment_32.prev_task_link = old_tss_sel;
5159
5160 if (kvm_write_guest(vcpu->kvm,
5161 get_tss_base_addr_write(vcpu, nseg_desc),
5162 &tss_segment_32.prev_task_link,
5163 sizeof tss_segment_32.prev_task_link))
5164 goto out;
5165 }
5166
5167 if (load_state_from_tss32(vcpu, &tss_segment_32))
5168 goto out;
5169
5170 ret = 1;
5171out:
5172 return ret;
5173}
5174
5175int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
5176{
5177 struct kvm_segment tr_seg;
5178 struct desc_struct cseg_desc;
5179 struct desc_struct nseg_desc;
5180 int ret = 0;
5181 u32 old_tss_base = get_segment_base(vcpu, VCPU_SREG_TR);
5182 u16 old_tss_sel = get_segment_selector(vcpu, VCPU_SREG_TR);
5183 u32 desc_limit;
5184
5185 old_tss_base = kvm_mmu_gva_to_gpa_write(vcpu, old_tss_base, NULL);
5186
5187 /* FIXME: Handle errors. Failure to read either TSS or their
5188 * descriptors should generate a pagefault.
5189 */
5190 if (load_guest_segment_descriptor(vcpu, tss_selector, &nseg_desc))
5191 goto out;
5192
5193 if (load_guest_segment_descriptor(vcpu, old_tss_sel, &cseg_desc))
5194 goto out;
5195
5196 if (reason != TASK_SWITCH_IRET) {
5197 int cpl;
5198
5199 cpl = kvm_x86_ops->get_cpl(vcpu);
5200 if ((tss_selector & 3) > nseg_desc.dpl || cpl > nseg_desc.dpl) {
5201 kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
5202 return 1;
5203 }
5204 }
5205
5206 desc_limit = get_desc_limit(&nseg_desc);
5207 if (!nseg_desc.p ||
5208 ((desc_limit < 0x67 && (nseg_desc.type & 8)) ||
5209 desc_limit < 0x2b)) {
5210 kvm_queue_exception_e(vcpu, TS_VECTOR, tss_selector & 0xfffc);
5211 return 1;
5212 }
5213
5214 if (reason == TASK_SWITCH_IRET || reason == TASK_SWITCH_JMP) {
5215 cseg_desc.type &= ~(1 << 1); //clear the B flag
5216 save_guest_segment_descriptor(vcpu, old_tss_sel, &cseg_desc);
5217 }
5218
5219 if (reason == TASK_SWITCH_IRET) {
5220 u32 eflags = kvm_get_rflags(vcpu);
5221 kvm_set_rflags(vcpu, eflags & ~X86_EFLAGS_NT);
5222 }
5223
5224 /* set back link to prev task only if NT bit is set in eflags
5225 note that old_tss_sel is not used afetr this point */
5226 if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE)
5227 old_tss_sel = 0xffff;
5228
5229 if (nseg_desc.type & 8)
5230 ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_sel,
5231 old_tss_base, &nseg_desc);
5232 else
5233 ret = kvm_task_switch_16(vcpu, tss_selector, old_tss_sel,
5234 old_tss_base, &nseg_desc);
5235
5236 if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE) {
5237 u32 eflags = kvm_get_rflags(vcpu);
5238 kvm_set_rflags(vcpu, eflags | X86_EFLAGS_NT);
5239 }
5240
5241 if (reason != TASK_SWITCH_IRET) {
5242 nseg_desc.type |= (1 << 1);
5243 save_guest_segment_descriptor(vcpu, tss_selector,
5244 &nseg_desc);
5245 }
5246
5247 kvm_x86_ops->set_cr0(vcpu, kvm_read_cr0(vcpu) | X86_CR0_TS);
5248 seg_desct_to_kvm_desct(&nseg_desc, tss_selector, &tr_seg);
5249 tr_seg.type = 11;
5250 kvm_set_segment(vcpu, &tr_seg, VCPU_SREG_TR);
5251out:
5252 return ret;
5253} 4928}
5254EXPORT_SYMBOL_GPL(kvm_task_switch); 4929EXPORT_SYMBOL_GPL(kvm_task_switch);
5255 4930
@@ -5258,15 +4933,15 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
5258{ 4933{
5259 int mmu_reset_needed = 0; 4934 int mmu_reset_needed = 0;
5260 int pending_vec, max_bits; 4935 int pending_vec, max_bits;
5261 struct descriptor_table dt; 4936 struct desc_ptr dt;
5262 4937
5263 vcpu_load(vcpu); 4938 vcpu_load(vcpu);
5264 4939
5265 dt.limit = sregs->idt.limit; 4940 dt.size = sregs->idt.limit;
5266 dt.base = sregs->idt.base; 4941 dt.address = sregs->idt.base;
5267 kvm_x86_ops->set_idt(vcpu, &dt); 4942 kvm_x86_ops->set_idt(vcpu, &dt);
5268 dt.limit = sregs->gdt.limit; 4943 dt.size = sregs->gdt.limit;
5269 dt.base = sregs->gdt.base; 4944 dt.address = sregs->gdt.base;
5270 kvm_x86_ops->set_gdt(vcpu, &dt); 4945 kvm_x86_ops->set_gdt(vcpu, &dt);
5271 4946
5272 vcpu->arch.cr2 = sregs->cr2; 4947 vcpu->arch.cr2 = sregs->cr2;
@@ -5365,11 +5040,9 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
5365 vcpu->arch.switch_db_regs = (vcpu->arch.dr7 & DR7_BP_EN_MASK); 5040 vcpu->arch.switch_db_regs = (vcpu->arch.dr7 & DR7_BP_EN_MASK);
5366 } 5041 }
5367 5042
5368 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) { 5043 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
5369 vcpu->arch.singlestep_cs = 5044 vcpu->arch.singlestep_rip = kvm_rip_read(vcpu) +
5370 get_segment_selector(vcpu, VCPU_SREG_CS); 5045 get_segment_base(vcpu, VCPU_SREG_CS);
5371 vcpu->arch.singlestep_rip = kvm_rip_read(vcpu);
5372 }
5373 5046
5374 /* 5047 /*
5375 * Trigger an rflags update that will inject or remove the trace 5048 * Trigger an rflags update that will inject or remove the trace
@@ -5860,13 +5533,22 @@ int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu)
5860 return kvm_x86_ops->interrupt_allowed(vcpu); 5533 return kvm_x86_ops->interrupt_allowed(vcpu);
5861} 5534}
5862 5535
5536bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip)
5537{
5538 unsigned long current_rip = kvm_rip_read(vcpu) +
5539 get_segment_base(vcpu, VCPU_SREG_CS);
5540
5541 return current_rip == linear_rip;
5542}
5543EXPORT_SYMBOL_GPL(kvm_is_linear_rip);
5544
5863unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu) 5545unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu)
5864{ 5546{
5865 unsigned long rflags; 5547 unsigned long rflags;
5866 5548
5867 rflags = kvm_x86_ops->get_rflags(vcpu); 5549 rflags = kvm_x86_ops->get_rflags(vcpu);
5868 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) 5550 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
5869 rflags &= ~(unsigned long)(X86_EFLAGS_TF | X86_EFLAGS_RF); 5551 rflags &= ~X86_EFLAGS_TF;
5870 return rflags; 5552 return rflags;
5871} 5553}
5872EXPORT_SYMBOL_GPL(kvm_get_rflags); 5554EXPORT_SYMBOL_GPL(kvm_get_rflags);
@@ -5874,10 +5556,8 @@ EXPORT_SYMBOL_GPL(kvm_get_rflags);
5874void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) 5556void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
5875{ 5557{
5876 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP && 5558 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP &&
5877 vcpu->arch.singlestep_cs == 5559 kvm_is_linear_rip(vcpu, vcpu->arch.singlestep_rip))
5878 get_segment_selector(vcpu, VCPU_SREG_CS) && 5560 rflags |= X86_EFLAGS_TF;
5879 vcpu->arch.singlestep_rip == kvm_rip_read(vcpu))
5880 rflags |= X86_EFLAGS_TF | X86_EFLAGS_RF;
5881 kvm_x86_ops->set_rflags(vcpu, rflags); 5561 kvm_x86_ops->set_rflags(vcpu, rflags);
5882} 5562}
5883EXPORT_SYMBOL_GPL(kvm_set_rflags); 5563EXPORT_SYMBOL_GPL(kvm_set_rflags);
@@ -5893,3 +5573,4 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit_inject);
5893EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intr_vmexit); 5573EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intr_vmexit);
5894EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_invlpga); 5574EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_invlpga);
5895EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_skinit); 5575EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_skinit);
5576EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intercepts);
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index b7a404722d2b..f4b54458285b 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -65,6 +65,13 @@ static inline int is_paging(struct kvm_vcpu *vcpu)
65 return kvm_read_cr0_bits(vcpu, X86_CR0_PG); 65 return kvm_read_cr0_bits(vcpu, X86_CR0_PG);
66} 66}
67 67
68static inline struct kvm_mem_aliases *kvm_aliases(struct kvm *kvm)
69{
70 return rcu_dereference_check(kvm->arch.aliases,
71 srcu_read_lock_held(&kvm->srcu)
72 || lockdep_is_held(&kvm->slots_lock));
73}
74
68void kvm_before_handle_nmi(struct kvm_vcpu *vcpu); 75void kvm_before_handle_nmi(struct kvm_vcpu *vcpu);
69void kvm_after_handle_nmi(struct kvm_vcpu *vcpu); 76void kvm_after_handle_nmi(struct kvm_vcpu *vcpu);
70 77