Diffstat (limited to 'drivers/kvm')
-rw-r--r--  drivers/kvm/Kconfig              |   33
-rw-r--r--  drivers/kvm/Makefile             |   10
-rw-r--r--  drivers/kvm/kvm.h                |  551
-rw-r--r--  drivers/kvm/kvm_main.c           | 1935
-rw-r--r--  drivers/kvm/kvm_svm.h            |   44
-rw-r--r--  drivers/kvm/kvm_vmx.h            |   14
-rw-r--r--  drivers/kvm/mmu.c                |  699
-rw-r--r--  drivers/kvm/paging_tmpl.h        |  397
-rw-r--r--  drivers/kvm/segment_descriptor.h |   17
-rw-r--r--  drivers/kvm/svm.c                | 1677
-rw-r--r--  drivers/kvm/svm.h                |  315
-rw-r--r--  drivers/kvm/vmx.c                | 2002
-rw-r--r--  drivers/kvm/vmx.h                |  296
-rw-r--r--  drivers/kvm/x86_emulate.c        | 1409
-rw-r--r--  drivers/kvm/x86_emulate.h        |  185
15 files changed, 9584 insertions(+), 0 deletions(-)
diff --git a/drivers/kvm/Kconfig b/drivers/kvm/Kconfig
new file mode 100644
index 000000000000..36412e90f09b
--- /dev/null
+++ b/drivers/kvm/Kconfig
@@ -0,0 +1,33 @@
1 | # | ||
2 | # KVM configuration | ||
3 | # | ||
4 | config KVM | ||
5 | tristate "Kernel-based Virtual Machine (KVM) support" | ||
6 | depends on X86 && EXPERIMENTAL | ||
7 | ---help--- | ||
8 | Support hosting fully virtualized guest machines using hardware | ||
9 | virtualization extensions. You will need a fairly recent | ||
10 | processor equipped with virtualization extensions. You will also | ||
11 | need to select one or more of the processor modules below. | ||
12 | |||
13 | This module provides access to the hardware capabilities through | ||
14 | a character device node named /dev/kvm. | ||
15 | |||
16 | To compile this as a module, choose M here: the module | ||
17 | will be called kvm. | ||
18 | |||
19 | If unsure, say N. | ||
20 | |||
21 | config KVM_INTEL | ||
22 | tristate "KVM for Intel processors support" | ||
23 | depends on KVM | ||
24 | ---help--- | ||
25 | Provides support for KVM on Intel processors equipped with the VT | ||
26 | extensions. | ||
27 | |||
28 | config KVM_AMD | ||
29 | tristate "KVM for AMD processors support" | ||
30 | depends on KVM | ||
31 | ---help--- | ||
32 | Provides support for KVM on AMD processors equipped with the AMD-V | ||
33 | (SVM) extensions. | ||
diff --git a/drivers/kvm/Makefile b/drivers/kvm/Makefile
new file mode 100644
index 000000000000..c0a789fa9d65
--- /dev/null
+++ b/drivers/kvm/Makefile
@@ -0,0 +1,10 @@
1 | # | ||
2 | # Makefile for Kernel-based Virtual Machine module | ||
3 | # | ||
4 | |||
5 | kvm-objs := kvm_main.o mmu.o x86_emulate.o | ||
6 | obj-$(CONFIG_KVM) += kvm.o | ||
7 | kvm-intel-objs = vmx.o | ||
8 | obj-$(CONFIG_KVM_INTEL) += kvm-intel.o | ||
9 | kvm-amd-objs = svm.o | ||
10 | obj-$(CONFIG_KVM_AMD) += kvm-amd.o | ||
diff --git a/drivers/kvm/kvm.h b/drivers/kvm/kvm.h
new file mode 100644
index 000000000000..5785d0870ab6
--- /dev/null
+++ b/drivers/kvm/kvm.h
@@ -0,0 +1,551 @@
1 | #ifndef __KVM_H | ||
2 | #define __KVM_H | ||
3 | |||
4 | /* | ||
5 | * This work is licensed under the terms of the GNU GPL, version 2. See | ||
6 | * the COPYING file in the top-level directory. | ||
7 | */ | ||
8 | |||
9 | #include <linux/types.h> | ||
10 | #include <linux/list.h> | ||
11 | #include <linux/mutex.h> | ||
12 | #include <linux/spinlock.h> | ||
13 | #include <linux/mm.h> | ||
14 | |||
15 | #include "vmx.h" | ||
16 | #include <linux/kvm.h> | ||
17 | |||
18 | #define CR0_PE_MASK (1ULL << 0) | ||
19 | #define CR0_TS_MASK (1ULL << 3) | ||
20 | #define CR0_NE_MASK (1ULL << 5) | ||
21 | #define CR0_WP_MASK (1ULL << 16) | ||
22 | #define CR0_NW_MASK (1ULL << 29) | ||
23 | #define CR0_CD_MASK (1ULL << 30) | ||
24 | #define CR0_PG_MASK (1ULL << 31) | ||
25 | |||
26 | #define CR3_WPT_MASK (1ULL << 3) | ||
27 | #define CR3_PCD_MASK (1ULL << 4) | ||
28 | |||
29 | #define CR3_RESEVED_BITS 0x07ULL | ||
30 | #define CR3_L_MODE_RESEVED_BITS (~((1ULL << 40) - 1) | 0x0fe7ULL) | ||
31 | #define CR3_FLAGS_MASK ((1ULL << 5) - 1) | ||
32 | |||
33 | #define CR4_VME_MASK (1ULL << 0) | ||
34 | #define CR4_PSE_MASK (1ULL << 4) | ||
35 | #define CR4_PAE_MASK (1ULL << 5) | ||
36 | #define CR4_PGE_MASK (1ULL << 7) | ||
37 | #define CR4_VMXE_MASK (1ULL << 13) | ||
38 | |||
39 | #define KVM_GUEST_CR0_MASK \ | ||
40 | (CR0_PG_MASK | CR0_PE_MASK | CR0_WP_MASK | CR0_NE_MASK \ | ||
41 | | CR0_NW_MASK | CR0_CD_MASK) | ||
42 | #define KVM_VM_CR0_ALWAYS_ON \ | ||
43 | (CR0_PG_MASK | CR0_PE_MASK | CR0_WP_MASK | CR0_NE_MASK) | ||
44 | #define KVM_GUEST_CR4_MASK \ | ||
45 | (CR4_PSE_MASK | CR4_PAE_MASK | CR4_PGE_MASK | CR4_VMXE_MASK | CR4_VME_MASK) | ||
46 | #define KVM_PMODE_VM_CR4_ALWAYS_ON (CR4_VMXE_MASK | CR4_PAE_MASK) | ||
47 | #define KVM_RMODE_VM_CR4_ALWAYS_ON (CR4_VMXE_MASK | CR4_PAE_MASK | CR4_VME_MASK) | ||
48 | |||
49 | #define INVALID_PAGE (~(hpa_t)0) | ||
50 | #define UNMAPPED_GVA (~(gpa_t)0) | ||
51 | |||
52 | #define KVM_MAX_VCPUS 1 | ||
53 | #define KVM_MEMORY_SLOTS 4 | ||
54 | #define KVM_NUM_MMU_PAGES 256 | ||
55 | |||
56 | #define FX_IMAGE_SIZE 512 | ||
57 | #define FX_IMAGE_ALIGN 16 | ||
58 | #define FX_BUF_SIZE (2 * FX_IMAGE_SIZE + FX_IMAGE_ALIGN) | ||
59 | |||
60 | #define DE_VECTOR 0 | ||
61 | #define DF_VECTOR 8 | ||
62 | #define TS_VECTOR 10 | ||
63 | #define NP_VECTOR 11 | ||
64 | #define SS_VECTOR 12 | ||
65 | #define GP_VECTOR 13 | ||
66 | #define PF_VECTOR 14 | ||
67 | |||
68 | #define SELECTOR_TI_MASK (1 << 2) | ||
69 | #define SELECTOR_RPL_MASK 0x03 | ||
70 | |||
71 | #define IOPL_SHIFT 12 | ||
72 | |||
73 | /* | ||
74 | * Address types: | ||
75 | * | ||
76 | * gva - guest virtual address | ||
77 | * gpa - guest physical address | ||
78 | * gfn - guest frame number | ||
79 | * hva - host virtual address | ||
80 | * hpa - host physical address | ||
81 | * hfn - host frame number | ||
82 | */ | ||
83 | |||
84 | typedef unsigned long gva_t; | ||
85 | typedef u64 gpa_t; | ||
86 | typedef unsigned long gfn_t; | ||
87 | |||
88 | typedef unsigned long hva_t; | ||
89 | typedef u64 hpa_t; | ||
90 | typedef unsigned long hfn_t; | ||
91 | |||
92 | struct kvm_mmu_page { | ||
93 | struct list_head link; | ||
94 | hpa_t page_hpa; | ||
95 | unsigned long slot_bitmap; /* One bit set per slot which has memory | ||
96 | * in this shadow page. | ||
97 | */ | ||
98 | int global; /* Set if all ptes in this page are global */ | ||
99 | u64 *parent_pte; | ||
100 | }; | ||
101 | |||
102 | struct vmcs { | ||
103 | u32 revision_id; | ||
104 | u32 abort; | ||
105 | char data[0]; | ||
106 | }; | ||
107 | |||
108 | #define vmx_msr_entry kvm_msr_entry | ||
109 | |||
110 | struct kvm_vcpu; | ||
111 | |||
112 | /* | ||
113 | * x86 supports 3 paging modes (4-level 64-bit, 3-level 64-bit, and 2-level | ||
114 | * 32-bit). The kvm_mmu structure abstracts the details of the current mmu | ||
115 | * mode. | ||
116 | */ | ||
117 | struct kvm_mmu { | ||
118 | void (*new_cr3)(struct kvm_vcpu *vcpu); | ||
119 | int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err); | ||
120 | void (*inval_page)(struct kvm_vcpu *vcpu, gva_t gva); | ||
121 | void (*free)(struct kvm_vcpu *vcpu); | ||
122 | gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva); | ||
123 | hpa_t root_hpa; | ||
124 | int root_level; | ||
125 | int shadow_root_level; | ||
126 | }; | ||
127 | |||
128 | struct kvm_guest_debug { | ||
129 | int enabled; | ||
130 | unsigned long bp[4]; | ||
131 | int singlestep; | ||
132 | }; | ||
133 | |||
134 | enum { | ||
135 | VCPU_REGS_RAX = 0, | ||
136 | VCPU_REGS_RCX = 1, | ||
137 | VCPU_REGS_RDX = 2, | ||
138 | VCPU_REGS_RBX = 3, | ||
139 | VCPU_REGS_RSP = 4, | ||
140 | VCPU_REGS_RBP = 5, | ||
141 | VCPU_REGS_RSI = 6, | ||
142 | VCPU_REGS_RDI = 7, | ||
143 | #ifdef __x86_64__ | ||
144 | VCPU_REGS_R8 = 8, | ||
145 | VCPU_REGS_R9 = 9, | ||
146 | VCPU_REGS_R10 = 10, | ||
147 | VCPU_REGS_R11 = 11, | ||
148 | VCPU_REGS_R12 = 12, | ||
149 | VCPU_REGS_R13 = 13, | ||
150 | VCPU_REGS_R14 = 14, | ||
151 | VCPU_REGS_R15 = 15, | ||
152 | #endif | ||
153 | NR_VCPU_REGS | ||
154 | }; | ||
155 | |||
156 | enum { | ||
157 | VCPU_SREG_CS, | ||
158 | VCPU_SREG_DS, | ||
159 | VCPU_SREG_ES, | ||
160 | VCPU_SREG_FS, | ||
161 | VCPU_SREG_GS, | ||
162 | VCPU_SREG_SS, | ||
163 | VCPU_SREG_TR, | ||
164 | VCPU_SREG_LDTR, | ||
165 | }; | ||
166 | |||
167 | struct kvm_vcpu { | ||
168 | struct kvm *kvm; | ||
169 | union { | ||
170 | struct vmcs *vmcs; | ||
171 | struct vcpu_svm *svm; | ||
172 | }; | ||
173 | struct mutex mutex; | ||
174 | int cpu; | ||
175 | int launched; | ||
176 | unsigned long irq_summary; /* bit vector: 1 per word in irq_pending */ | ||
177 | #define NR_IRQ_WORDS KVM_IRQ_BITMAP_SIZE(unsigned long) | ||
178 | unsigned long irq_pending[NR_IRQ_WORDS]; | ||
179 | unsigned long regs[NR_VCPU_REGS]; /* for rsp: vcpu_load_rsp_rip() */ | ||
180 | unsigned long rip; /* needs vcpu_load_rsp_rip() */ | ||
181 | |||
182 | unsigned long cr0; | ||
183 | unsigned long cr2; | ||
184 | unsigned long cr3; | ||
185 | unsigned long cr4; | ||
186 | unsigned long cr8; | ||
187 | u64 shadow_efer; | ||
188 | u64 apic_base; | ||
189 | int nmsrs; | ||
190 | struct vmx_msr_entry *guest_msrs; | ||
191 | struct vmx_msr_entry *host_msrs; | ||
192 | |||
193 | struct list_head free_pages; | ||
194 | struct kvm_mmu_page page_header_buf[KVM_NUM_MMU_PAGES]; | ||
195 | struct kvm_mmu mmu; | ||
196 | |||
197 | struct kvm_guest_debug guest_debug; | ||
198 | |||
199 | char fx_buf[FX_BUF_SIZE]; | ||
200 | char *host_fx_image; | ||
201 | char *guest_fx_image; | ||
202 | |||
203 | int mmio_needed; | ||
204 | int mmio_read_completed; | ||
205 | int mmio_is_write; | ||
206 | int mmio_size; | ||
207 | unsigned char mmio_data[8]; | ||
208 | gpa_t mmio_phys_addr; | ||
209 | |||
210 | struct { | ||
211 | int active; | ||
212 | u8 save_iopl; | ||
213 | struct kvm_save_segment { | ||
214 | u16 selector; | ||
215 | unsigned long base; | ||
216 | u32 limit; | ||
217 | u32 ar; | ||
218 | } tr, es, ds, fs, gs; | ||
219 | } rmode; | ||
220 | }; | ||
221 | |||
222 | struct kvm_memory_slot { | ||
223 | gfn_t base_gfn; | ||
224 | unsigned long npages; | ||
225 | unsigned long flags; | ||
226 | struct page **phys_mem; | ||
227 | unsigned long *dirty_bitmap; | ||
228 | }; | ||
229 | |||
230 | struct kvm { | ||
231 | spinlock_t lock; /* protects everything except vcpus */ | ||
232 | int nmemslots; | ||
233 | struct kvm_memory_slot memslots[KVM_MEMORY_SLOTS]; | ||
234 | struct list_head active_mmu_pages; | ||
235 | struct kvm_vcpu vcpus[KVM_MAX_VCPUS]; | ||
236 | int memory_config_version; | ||
237 | int busy; | ||
238 | }; | ||
239 | |||
240 | struct kvm_stat { | ||
241 | u32 pf_fixed; | ||
242 | u32 pf_guest; | ||
243 | u32 tlb_flush; | ||
244 | u32 invlpg; | ||
245 | |||
246 | u32 exits; | ||
247 | u32 io_exits; | ||
248 | u32 mmio_exits; | ||
249 | u32 signal_exits; | ||
250 | u32 irq_exits; | ||
251 | }; | ||
252 | |||
253 | struct descriptor_table { | ||
254 | u16 limit; | ||
255 | unsigned long base; | ||
256 | } __attribute__((packed)); | ||
257 | |||
258 | struct kvm_arch_ops { | ||
259 | int (*cpu_has_kvm_support)(void); /* __init */ | ||
260 | int (*disabled_by_bios)(void); /* __init */ | ||
261 | void (*hardware_enable)(void *dummy); /* __init */ | ||
262 | void (*hardware_disable)(void *dummy); | ||
263 | int (*hardware_setup)(void); /* __init */ | ||
264 | void (*hardware_unsetup)(void); /* __exit */ | ||
265 | |||
266 | int (*vcpu_create)(struct kvm_vcpu *vcpu); | ||
267 | void (*vcpu_free)(struct kvm_vcpu *vcpu); | ||
268 | |||
269 | struct kvm_vcpu *(*vcpu_load)(struct kvm_vcpu *vcpu); | ||
270 | void (*vcpu_put)(struct kvm_vcpu *vcpu); | ||
271 | |||
272 | int (*set_guest_debug)(struct kvm_vcpu *vcpu, | ||
273 | struct kvm_debug_guest *dbg); | ||
274 | int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata); | ||
275 | int (*set_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 data); | ||
276 | u64 (*get_segment_base)(struct kvm_vcpu *vcpu, int seg); | ||
277 | void (*get_segment)(struct kvm_vcpu *vcpu, | ||
278 | struct kvm_segment *var, int seg); | ||
279 | void (*set_segment)(struct kvm_vcpu *vcpu, | ||
280 | struct kvm_segment *var, int seg); | ||
281 | int (*is_long_mode)(struct kvm_vcpu *vcpu); | ||
282 | void (*get_cs_db_l_bits)(struct kvm_vcpu *vcpu, int *db, int *l); | ||
283 | void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0); | ||
284 | void (*set_cr0_no_modeswitch)(struct kvm_vcpu *vcpu, | ||
285 | unsigned long cr0); | ||
286 | void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3); | ||
287 | void (*set_cr4)(struct kvm_vcpu *vcpu, unsigned long cr4); | ||
288 | void (*set_efer)(struct kvm_vcpu *vcpu, u64 efer); | ||
289 | void (*get_idt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt); | ||
290 | void (*set_idt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt); | ||
291 | void (*get_gdt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt); | ||
292 | void (*set_gdt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt); | ||
293 | unsigned long (*get_dr)(struct kvm_vcpu *vcpu, int dr); | ||
294 | void (*set_dr)(struct kvm_vcpu *vcpu, int dr, unsigned long value, | ||
295 | int *exception); | ||
296 | void (*cache_regs)(struct kvm_vcpu *vcpu); | ||
297 | void (*decache_regs)(struct kvm_vcpu *vcpu); | ||
298 | unsigned long (*get_rflags)(struct kvm_vcpu *vcpu); | ||
299 | void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags); | ||
300 | |||
301 | void (*invlpg)(struct kvm_vcpu *vcpu, gva_t addr); | ||
302 | void (*tlb_flush)(struct kvm_vcpu *vcpu); | ||
303 | void (*inject_page_fault)(struct kvm_vcpu *vcpu, | ||
304 | unsigned long addr, u32 err_code); | ||
305 | |||
306 | void (*inject_gp)(struct kvm_vcpu *vcpu, unsigned err_code); | ||
307 | |||
308 | int (*run)(struct kvm_vcpu *vcpu, struct kvm_run *run); | ||
309 | int (*vcpu_setup)(struct kvm_vcpu *vcpu); | ||
310 | void (*skip_emulated_instruction)(struct kvm_vcpu *vcpu); | ||
311 | }; | ||
312 | |||
313 | extern struct kvm_stat kvm_stat; | ||
314 | extern struct kvm_arch_ops *kvm_arch_ops; | ||
315 | |||
316 | #define kvm_printf(kvm, fmt ...) printk(KERN_DEBUG fmt) | ||
317 | #define vcpu_printf(vcpu, fmt...) kvm_printf(vcpu->kvm, fmt) | ||
318 | |||
319 | int kvm_init_arch(struct kvm_arch_ops *ops, struct module *module); | ||
320 | void kvm_exit_arch(void); | ||
321 | |||
322 | void kvm_mmu_destroy(struct kvm_vcpu *vcpu); | ||
323 | int kvm_mmu_init(struct kvm_vcpu *vcpu); | ||
324 | |||
325 | int kvm_mmu_reset_context(struct kvm_vcpu *vcpu); | ||
326 | void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot); | ||
327 | |||
328 | hpa_t gpa_to_hpa(struct kvm_vcpu *vcpu, gpa_t gpa); | ||
329 | #define HPA_MSB ((sizeof(hpa_t) * 8) - 1) | ||
330 | #define HPA_ERR_MASK ((hpa_t)1 << HPA_MSB) | ||
331 | static inline int is_error_hpa(hpa_t hpa) { return hpa >> HPA_MSB; } | ||
332 | hpa_t gva_to_hpa(struct kvm_vcpu *vcpu, gva_t gva); | ||
333 | |||
334 | void kvm_emulator_want_group7_invlpg(void); | ||
335 | |||
336 | extern hpa_t bad_page_address; | ||
337 | |||
338 | static inline struct page *gfn_to_page(struct kvm_memory_slot *slot, gfn_t gfn) | ||
339 | { | ||
340 | return slot->phys_mem[gfn - slot->base_gfn]; | ||
341 | } | ||
342 | |||
343 | struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn); | ||
344 | void mark_page_dirty(struct kvm *kvm, gfn_t gfn); | ||
345 | |||
346 | enum emulation_result { | ||
347 | EMULATE_DONE, /* no further processing */ | ||
348 | EMULATE_DO_MMIO, /* kvm_run filled with mmio request */ | ||
349 | EMULATE_FAIL, /* can't emulate this instruction */ | ||
350 | }; | ||
351 | |||
352 | int emulate_instruction(struct kvm_vcpu *vcpu, struct kvm_run *run, | ||
353 | unsigned long cr2, u16 error_code); | ||
354 | void realmode_lgdt(struct kvm_vcpu *vcpu, u16 size, unsigned long address); | ||
355 | void realmode_lidt(struct kvm_vcpu *vcpu, u16 size, unsigned long address); | ||
356 | void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw, | ||
357 | unsigned long *rflags); | ||
358 | |||
359 | unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr); | ||
360 | void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long value, | ||
361 | unsigned long *rflags); | ||
362 | |||
363 | struct x86_emulate_ctxt; | ||
364 | |||
365 | int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address); | ||
366 | int emulate_clts(struct kvm_vcpu *vcpu); | ||
367 | int emulator_get_dr(struct x86_emulate_ctxt* ctxt, int dr, | ||
368 | unsigned long *dest); | ||
369 | int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, | ||
370 | unsigned long value); | ||
371 | |||
372 | void set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); | ||
373 | void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr0); | ||
374 | void set_cr4(struct kvm_vcpu *vcpu, unsigned long cr0); | ||
375 | void set_cr8(struct kvm_vcpu *vcpu, unsigned long cr0); | ||
376 | void lmsw(struct kvm_vcpu *vcpu, unsigned long msw); | ||
377 | |||
378 | #ifdef __x86_64__ | ||
379 | void set_efer(struct kvm_vcpu *vcpu, u64 efer); | ||
380 | #endif | ||
381 | |||
382 | void fx_init(struct kvm_vcpu *vcpu); | ||
383 | |||
384 | void load_msrs(struct vmx_msr_entry *e, int n); | ||
385 | void save_msrs(struct vmx_msr_entry *e, int n); | ||
386 | void kvm_resched(struct kvm_vcpu *vcpu); | ||
387 | |||
388 | int kvm_read_guest(struct kvm_vcpu *vcpu, | ||
389 | gva_t addr, | ||
390 | unsigned long size, | ||
391 | void *dest); | ||
392 | |||
393 | int kvm_write_guest(struct kvm_vcpu *vcpu, | ||
394 | gva_t addr, | ||
395 | unsigned long size, | ||
396 | void *data); | ||
397 | |||
398 | unsigned long segment_base(u16 selector); | ||
399 | |||
400 | static inline struct page *_gfn_to_page(struct kvm *kvm, gfn_t gfn) | ||
401 | { | ||
402 | struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn); | ||
403 | return (slot) ? slot->phys_mem[gfn - slot->base_gfn] : NULL; | ||
404 | } | ||
405 | |||
406 | static inline int is_pae(struct kvm_vcpu *vcpu) | ||
407 | { | ||
408 | return vcpu->cr4 & CR4_PAE_MASK; | ||
409 | } | ||
410 | |||
411 | static inline int is_pse(struct kvm_vcpu *vcpu) | ||
412 | { | ||
413 | return vcpu->cr4 & CR4_PSE_MASK; | ||
414 | } | ||
415 | |||
416 | static inline int is_paging(struct kvm_vcpu *vcpu) | ||
417 | { | ||
418 | return vcpu->cr0 & CR0_PG_MASK; | ||
419 | } | ||
420 | |||
421 | static inline int memslot_id(struct kvm *kvm, struct kvm_memory_slot *slot) | ||
422 | { | ||
423 | return slot - kvm->memslots; | ||
424 | } | ||
425 | |||
426 | static inline struct kvm_mmu_page *page_header(hpa_t shadow_page) | ||
427 | { | ||
428 | struct page *page = pfn_to_page(shadow_page >> PAGE_SHIFT); | ||
429 | |||
430 | return (struct kvm_mmu_page *)page->private; | ||
431 | } | ||
432 | |||
433 | static inline u16 read_fs(void) | ||
434 | { | ||
435 | u16 seg; | ||
436 | asm ("mov %%fs, %0" : "=g"(seg)); | ||
437 | return seg; | ||
438 | } | ||
439 | |||
440 | static inline u16 read_gs(void) | ||
441 | { | ||
442 | u16 seg; | ||
443 | asm ("mov %%gs, %0" : "=g"(seg)); | ||
444 | return seg; | ||
445 | } | ||
446 | |||
447 | static inline u16 read_ldt(void) | ||
448 | { | ||
449 | u16 ldt; | ||
450 | asm ("sldt %0" : "=g"(ldt)); | ||
451 | return ldt; | ||
452 | } | ||
453 | |||
454 | static inline void load_fs(u16 sel) | ||
455 | { | ||
456 | asm ("mov %0, %%fs" : : "rm"(sel)); | ||
457 | } | ||
458 | |||
459 | static inline void load_gs(u16 sel) | ||
460 | { | ||
461 | asm ("mov %0, %%gs" : : "rm"(sel)); | ||
462 | } | ||
463 | |||
464 | #ifndef load_ldt | ||
465 | static inline void load_ldt(u16 sel) | ||
466 | { | ||
467 | asm ("lldt %0" : : "g"(sel)); | ||
468 | } | ||
469 | #endif | ||
470 | |||
471 | static inline void get_idt(struct descriptor_table *table) | ||
472 | { | ||
473 | asm ("sidt %0" : "=m"(*table)); | ||
474 | } | ||
475 | |||
476 | static inline void get_gdt(struct descriptor_table *table) | ||
477 | { | ||
478 | asm ("sgdt %0" : "=m"(*table)); | ||
479 | } | ||
480 | |||
481 | static inline unsigned long read_tr_base(void) | ||
482 | { | ||
483 | u16 tr; | ||
484 | asm ("str %0" : "=g"(tr)); | ||
485 | return segment_base(tr); | ||
486 | } | ||
487 | |||
488 | #ifdef __x86_64__ | ||
489 | static inline unsigned long read_msr(unsigned long msr) | ||
490 | { | ||
491 | u64 value; | ||
492 | |||
493 | rdmsrl(msr, value); | ||
494 | return value; | ||
495 | } | ||
496 | #endif | ||
497 | |||
498 | static inline void fx_save(void *image) | ||
499 | { | ||
500 | asm ("fxsave (%0)":: "r" (image)); | ||
501 | } | ||
502 | |||
503 | static inline void fx_restore(void *image) | ||
504 | { | ||
505 | asm ("fxrstor (%0)":: "r" (image)); | ||
506 | } | ||
507 | |||
508 | static inline void fpu_init(void) | ||
509 | { | ||
510 | asm ("finit"); | ||
511 | } | ||
512 | |||
513 | static inline u32 get_rdx_init_val(void) | ||
514 | { | ||
515 | return 0x600; /* P6 family */ | ||
516 | } | ||
517 | |||
518 | #define ASM_VMX_VMCLEAR_RAX ".byte 0x66, 0x0f, 0xc7, 0x30" | ||
519 | #define ASM_VMX_VMLAUNCH ".byte 0x0f, 0x01, 0xc2" | ||
520 | #define ASM_VMX_VMRESUME ".byte 0x0f, 0x01, 0xc3" | ||
521 | #define ASM_VMX_VMPTRLD_RAX ".byte 0x0f, 0xc7, 0x30" | ||
522 | #define ASM_VMX_VMREAD_RDX_RAX ".byte 0x0f, 0x78, 0xd0" | ||
523 | #define ASM_VMX_VMWRITE_RAX_RDX ".byte 0x0f, 0x79, 0xd0" | ||
524 | #define ASM_VMX_VMWRITE_RSP_RDX ".byte 0x0f, 0x79, 0xd4" | ||
525 | #define ASM_VMX_VMXOFF ".byte 0x0f, 0x01, 0xc4" | ||
526 | #define ASM_VMX_VMXON_RAX ".byte 0xf3, 0x0f, 0xc7, 0x30" | ||
527 | |||
528 | #define MSR_IA32_TIME_STAMP_COUNTER 0x010 | ||
529 | |||
530 | #define TSS_IOPB_BASE_OFFSET 0x66 | ||
531 | #define TSS_BASE_SIZE 0x68 | ||
532 | #define TSS_IOPB_SIZE (65536 / 8) | ||
533 | #define TSS_REDIRECTION_SIZE (256 / 8) | ||
534 | #define RMODE_TSS_SIZE (TSS_BASE_SIZE + TSS_REDIRECTION_SIZE + TSS_IOPB_SIZE + 1) | ||
535 | |||
536 | #ifdef __x86_64__ | ||
537 | |||
538 | /* | ||
539 | * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64. Therefore | ||
540 | * we need to allocate shadow page tables in the first 4GB of memory, which | ||
541 | * happens to fit the DMA32 zone. | ||
542 | */ | ||
543 | #define GFP_KVM_MMU (GFP_KERNEL | __GFP_DMA32) | ||
544 | |||
545 | #else | ||
546 | |||
547 | #define GFP_KVM_MMU GFP_KERNEL | ||
548 | |||
549 | #endif | ||
550 | |||
551 | #endif | ||
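Before moving on to kvm_main.c, a short editorial aside: the address-type comment and helpers above (gva_t/gpa_t/gfn_t/hpa_t, gva_to_hpa(), gpa_to_hpa(), is_error_hpa()) are easier to follow with a concrete fragment. The function below is an illustrative sketch only, not part of the commit; its name is hypothetical, and it relies on nothing beyond the declarations in kvm.h plus a loaded vcpu.

/* Illustrative only -- not part of the commit.  A guest virtual address
 * (gva) is translated to a guest physical address (gpa) by the shadow
 * MMU, the gpa maps to a guest frame number (gfn) by dropping the page
 * offset, and gva_to_hpa() yields a host physical address (hpa) whose
 * top bit encodes failure (see HPA_ERR_MASK / is_error_hpa()).
 */
static void example_translate(struct kvm_vcpu *vcpu, gva_t gva)
{
	hpa_t hpa = gva_to_hpa(vcpu, gva);           /* gva -> hpa */
	gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, gva); /* gva -> gpa */
	gfn_t gfn = gpa >> PAGE_SHIFT;               /* gpa -> frame number */

	if (is_error_hpa(hpa))                       /* HPA_ERR_MASK set */
		return;                              /* gva not mapped */

	printk(KERN_DEBUG "gva %lx -> gpa %llx (gfn %lx) -> hpa %llx\n",
	       gva, (unsigned long long)gpa, gfn, (unsigned long long)hpa);
}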
diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c
new file mode 100644
index 000000000000..b6b8a41b5ec8
--- /dev/null
+++ b/drivers/kvm/kvm_main.c
@@ -0,0 +1,1935 @@
1 | /* | ||
2 | * Kernel-based Virtual Machine driver for Linux | ||
3 | * | ||
4 | * This module enables machines with Intel VT-x extensions to run virtual | ||
5 | * machines without emulation or binary translation. | ||
6 | * | ||
7 | * Copyright (C) 2006 Qumranet, Inc. | ||
8 | * | ||
9 | * Authors: | ||
10 | * Avi Kivity <avi@qumranet.com> | ||
11 | * Yaniv Kamay <yaniv@qumranet.com> | ||
12 | * | ||
13 | * This work is licensed under the terms of the GNU GPL, version 2. See | ||
14 | * the COPYING file in the top-level directory. | ||
15 | * | ||
16 | */ | ||
17 | |||
18 | #include "kvm.h" | ||
19 | |||
20 | #include <linux/kvm.h> | ||
21 | #include <linux/module.h> | ||
22 | #include <linux/errno.h> | ||
23 | #include <asm/processor.h> | ||
24 | #include <linux/percpu.h> | ||
25 | #include <linux/gfp.h> | ||
26 | #include <asm/msr.h> | ||
27 | #include <linux/mm.h> | ||
28 | #include <linux/miscdevice.h> | ||
29 | #include <linux/vmalloc.h> | ||
30 | #include <asm/uaccess.h> | ||
31 | #include <linux/reboot.h> | ||
32 | #include <asm/io.h> | ||
33 | #include <linux/debugfs.h> | ||
34 | #include <linux/highmem.h> | ||
35 | #include <linux/file.h> | ||
36 | #include <asm/desc.h> | ||
37 | |||
38 | #include "x86_emulate.h" | ||
39 | #include "segment_descriptor.h" | ||
40 | |||
41 | MODULE_AUTHOR("Qumranet"); | ||
42 | MODULE_LICENSE("GPL"); | ||
43 | |||
44 | struct kvm_arch_ops *kvm_arch_ops; | ||
45 | struct kvm_stat kvm_stat; | ||
46 | EXPORT_SYMBOL_GPL(kvm_stat); | ||
47 | |||
48 | static struct kvm_stats_debugfs_item { | ||
49 | const char *name; | ||
50 | u32 *data; | ||
51 | struct dentry *dentry; | ||
52 | } debugfs_entries[] = { | ||
53 | { "pf_fixed", &kvm_stat.pf_fixed }, | ||
54 | { "pf_guest", &kvm_stat.pf_guest }, | ||
55 | { "tlb_flush", &kvm_stat.tlb_flush }, | ||
56 | { "invlpg", &kvm_stat.invlpg }, | ||
57 | { "exits", &kvm_stat.exits }, | ||
58 | { "io_exits", &kvm_stat.io_exits }, | ||
59 | { "mmio_exits", &kvm_stat.mmio_exits }, | ||
60 | { "signal_exits", &kvm_stat.signal_exits }, | ||
61 | { "irq_exits", &kvm_stat.irq_exits }, | ||
62 | { 0, 0 } | ||
63 | }; | ||
64 | |||
65 | static struct dentry *debugfs_dir; | ||
66 | |||
67 | #define MAX_IO_MSRS 256 | ||
68 | |||
69 | #define CR0_RESEVED_BITS 0xffffffff1ffaffc0ULL | ||
70 | #define LMSW_GUEST_MASK 0x0eULL | ||
71 | #define CR4_RESEVED_BITS (~((1ULL << 11) - 1)) | ||
72 | #define CR8_RESEVED_BITS (~0x0fULL) | ||
73 | #define EFER_RESERVED_BITS 0xfffffffffffff2fe | ||
74 | |||
75 | struct vmx_msr_entry *find_msr_entry(struct kvm_vcpu *vcpu, u32 msr) | ||
76 | { | ||
77 | int i; | ||
78 | |||
79 | for (i = 0; i < vcpu->nmsrs; ++i) | ||
80 | if (vcpu->guest_msrs[i].index == msr) | ||
81 | return &vcpu->guest_msrs[i]; | ||
82 | return 0; | ||
83 | } | ||
84 | EXPORT_SYMBOL_GPL(find_msr_entry); | ||
85 | |||
86 | #ifdef __x86_64__ | ||
87 | // LDT or TSS descriptor in the GDT. 16 bytes. | ||
88 | struct segment_descriptor_64 { | ||
89 | struct segment_descriptor s; | ||
90 | u32 base_higher; | ||
91 | u32 pad_zero; | ||
92 | }; | ||
93 | |||
94 | #endif | ||
95 | |||
96 | unsigned long segment_base(u16 selector) | ||
97 | { | ||
98 | struct descriptor_table gdt; | ||
99 | struct segment_descriptor *d; | ||
100 | unsigned long table_base; | ||
101 | typedef unsigned long ul; | ||
102 | unsigned long v; | ||
103 | |||
104 | if (selector == 0) | ||
105 | return 0; | ||
106 | |||
107 | asm ("sgdt %0" : "=m"(gdt)); | ||
108 | table_base = gdt.base; | ||
109 | |||
110 | if (selector & 4) { /* from ldt */ | ||
111 | u16 ldt_selector; | ||
112 | |||
113 | asm ("sldt %0" : "=g"(ldt_selector)); | ||
114 | table_base = segment_base(ldt_selector); | ||
115 | } | ||
116 | d = (struct segment_descriptor *)(table_base + (selector & ~7)); | ||
117 | v = d->base_low | ((ul)d->base_mid << 16) | ((ul)d->base_high << 24); | ||
118 | #ifdef __x86_64__ | ||
119 | if (d->system == 0 | ||
120 | && (d->type == 2 || d->type == 9 || d->type == 11)) | ||
121 | v |= ((ul)((struct segment_descriptor_64 *)d)->base_higher) << 32; | ||
122 | #endif | ||
123 | return v; | ||
124 | } | ||
125 | EXPORT_SYMBOL_GPL(segment_base); | ||
126 | |||
127 | int kvm_read_guest(struct kvm_vcpu *vcpu, | ||
128 | gva_t addr, | ||
129 | unsigned long size, | ||
130 | void *dest) | ||
131 | { | ||
132 | unsigned char *host_buf = dest; | ||
133 | unsigned long req_size = size; | ||
134 | |||
135 | while (size) { | ||
136 | hpa_t paddr; | ||
137 | unsigned now; | ||
138 | unsigned offset; | ||
139 | hva_t guest_buf; | ||
140 | |||
141 | paddr = gva_to_hpa(vcpu, addr); | ||
142 | |||
143 | if (is_error_hpa(paddr)) | ||
144 | break; | ||
145 | |||
146 | guest_buf = (hva_t)kmap_atomic( | ||
147 | pfn_to_page(paddr >> PAGE_SHIFT), | ||
148 | KM_USER0); | ||
149 | offset = addr & ~PAGE_MASK; | ||
150 | guest_buf |= offset; | ||
151 | now = min(size, PAGE_SIZE - offset); | ||
152 | memcpy(host_buf, (void*)guest_buf, now); | ||
153 | host_buf += now; | ||
154 | addr += now; | ||
155 | size -= now; | ||
156 | kunmap_atomic((void *)(guest_buf & PAGE_MASK), KM_USER0); | ||
157 | } | ||
158 | return req_size - size; | ||
159 | } | ||
160 | EXPORT_SYMBOL_GPL(kvm_read_guest); | ||
161 | |||
162 | int kvm_write_guest(struct kvm_vcpu *vcpu, | ||
163 | gva_t addr, | ||
164 | unsigned long size, | ||
165 | void *data) | ||
166 | { | ||
167 | unsigned char *host_buf = data; | ||
168 | unsigned long req_size = size; | ||
169 | |||
170 | while (size) { | ||
171 | hpa_t paddr; | ||
172 | unsigned now; | ||
173 | unsigned offset; | ||
174 | hva_t guest_buf; | ||
175 | |||
176 | paddr = gva_to_hpa(vcpu, addr); | ||
177 | |||
178 | if (is_error_hpa(paddr)) | ||
179 | break; | ||
180 | |||
181 | guest_buf = (hva_t)kmap_atomic( | ||
182 | pfn_to_page(paddr >> PAGE_SHIFT), KM_USER0); | ||
183 | offset = addr & ~PAGE_MASK; | ||
184 | guest_buf |= offset; | ||
185 | now = min(size, PAGE_SIZE - offset); | ||
186 | memcpy((void*)guest_buf, host_buf, now); | ||
187 | host_buf += now; | ||
188 | addr += now; | ||
189 | size -= now; | ||
190 | kunmap_atomic((void *)(guest_buf & PAGE_MASK), KM_USER0); | ||
191 | } | ||
192 | return req_size - size; | ||
193 | } | ||
194 | EXPORT_SYMBOL_GPL(kvm_write_guest); | ||
195 | |||
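An aside on the two helpers just above: kvm_read_guest() and kvm_write_guest() walk the guest page tables one page at a time and return the number of bytes actually copied, so a caller must treat a short count as a partially unmapped range. A minimal in-kernel usage sketch follows (illustrative only, not part of the commit; the helper name is hypothetical).

/* Illustrative only: fetch two bytes at the guest's linearized rip and
 * check for the 0x0f 0x01 two-byte escape.  A short read means the
 * guest page is not mapped, so the check simply gives up.
 */
static int example_peek_opcode(struct kvm_vcpu *vcpu, gva_t rip_linear)
{
	u8 opcodes[2];

	if (kvm_read_guest(vcpu, rip_linear, 2, opcodes) != 2)
		return 0;               /* guest page not mapped */
	return opcodes[0] == 0x0f && opcodes[1] == 0x01;
}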
196 | static int vcpu_slot(struct kvm_vcpu *vcpu) | ||
197 | { | ||
198 | return vcpu - vcpu->kvm->vcpus; | ||
199 | } | ||
200 | |||
201 | /* | ||
202 | * Switches to specified vcpu, until a matching vcpu_put() | ||
203 | */ | ||
204 | static struct kvm_vcpu *vcpu_load(struct kvm *kvm, int vcpu_slot) | ||
205 | { | ||
206 | struct kvm_vcpu *vcpu = &kvm->vcpus[vcpu_slot]; | ||
207 | |||
208 | mutex_lock(&vcpu->mutex); | ||
209 | if (unlikely(!vcpu->vmcs)) { | ||
210 | mutex_unlock(&vcpu->mutex); | ||
211 | return 0; | ||
212 | } | ||
213 | return kvm_arch_ops->vcpu_load(vcpu); | ||
214 | } | ||
215 | |||
216 | static void vcpu_put(struct kvm_vcpu *vcpu) | ||
217 | { | ||
218 | kvm_arch_ops->vcpu_put(vcpu); | ||
219 | put_cpu(); | ||
220 | mutex_unlock(&vcpu->mutex); | ||
221 | } | ||
222 | |||
223 | static int kvm_dev_open(struct inode *inode, struct file *filp) | ||
224 | { | ||
225 | struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL); | ||
226 | int i; | ||
227 | |||
228 | if (!kvm) | ||
229 | return -ENOMEM; | ||
230 | |||
231 | spin_lock_init(&kvm->lock); | ||
232 | INIT_LIST_HEAD(&kvm->active_mmu_pages); | ||
233 | for (i = 0; i < KVM_MAX_VCPUS; ++i) { | ||
234 | struct kvm_vcpu *vcpu = &kvm->vcpus[i]; | ||
235 | |||
236 | mutex_init(&vcpu->mutex); | ||
237 | vcpu->mmu.root_hpa = INVALID_PAGE; | ||
238 | INIT_LIST_HEAD(&vcpu->free_pages); | ||
239 | } | ||
240 | filp->private_data = kvm; | ||
241 | return 0; | ||
242 | } | ||
243 | |||
244 | /* | ||
245 | * Free any memory in @free but not in @dont. | ||
246 | */ | ||
247 | static void kvm_free_physmem_slot(struct kvm_memory_slot *free, | ||
248 | struct kvm_memory_slot *dont) | ||
249 | { | ||
250 | int i; | ||
251 | |||
252 | if (!dont || free->phys_mem != dont->phys_mem) | ||
253 | if (free->phys_mem) { | ||
254 | for (i = 0; i < free->npages; ++i) | ||
255 | __free_page(free->phys_mem[i]); | ||
256 | vfree(free->phys_mem); | ||
257 | } | ||
258 | |||
259 | if (!dont || free->dirty_bitmap != dont->dirty_bitmap) | ||
260 | vfree(free->dirty_bitmap); | ||
261 | |||
262 | free->phys_mem = 0; | ||
263 | free->npages = 0; | ||
264 | free->dirty_bitmap = 0; | ||
265 | } | ||
266 | |||
267 | static void kvm_free_physmem(struct kvm *kvm) | ||
268 | { | ||
269 | int i; | ||
270 | |||
271 | for (i = 0; i < kvm->nmemslots; ++i) | ||
272 | kvm_free_physmem_slot(&kvm->memslots[i], 0); | ||
273 | } | ||
274 | |||
275 | static void kvm_free_vcpu(struct kvm_vcpu *vcpu) | ||
276 | { | ||
277 | kvm_arch_ops->vcpu_free(vcpu); | ||
278 | kvm_mmu_destroy(vcpu); | ||
279 | } | ||
280 | |||
281 | static void kvm_free_vcpus(struct kvm *kvm) | ||
282 | { | ||
283 | unsigned int i; | ||
284 | |||
285 | for (i = 0; i < KVM_MAX_VCPUS; ++i) | ||
286 | kvm_free_vcpu(&kvm->vcpus[i]); | ||
287 | } | ||
288 | |||
289 | static int kvm_dev_release(struct inode *inode, struct file *filp) | ||
290 | { | ||
291 | struct kvm *kvm = filp->private_data; | ||
292 | |||
293 | kvm_free_vcpus(kvm); | ||
294 | kvm_free_physmem(kvm); | ||
295 | kfree(kvm); | ||
296 | return 0; | ||
297 | } | ||
298 | |||
299 | static void inject_gp(struct kvm_vcpu *vcpu) | ||
300 | { | ||
301 | kvm_arch_ops->inject_gp(vcpu, 0); | ||
302 | } | ||
303 | |||
304 | static int pdptrs_have_reserved_bits_set(struct kvm_vcpu *vcpu, | ||
305 | unsigned long cr3) | ||
306 | { | ||
307 | gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT; | ||
308 | unsigned offset = (cr3 & (PAGE_SIZE-1)) >> 5; | ||
309 | int i; | ||
310 | u64 pdpte; | ||
311 | u64 *pdpt; | ||
312 | struct kvm_memory_slot *memslot; | ||
313 | |||
314 | spin_lock(&vcpu->kvm->lock); | ||
315 | memslot = gfn_to_memslot(vcpu->kvm, pdpt_gfn); | ||
316 | /* FIXME: !memslot - emulate? 0xff? */ | ||
317 | pdpt = kmap_atomic(gfn_to_page(memslot, pdpt_gfn), KM_USER0); | ||
318 | |||
319 | for (i = 0; i < 4; ++i) { | ||
320 | pdpte = pdpt[offset + i]; | ||
321 | if ((pdpte & 1) && (pdpte & 0xfffffff0000001e6ull)) | ||
322 | break; | ||
323 | } | ||
324 | |||
325 | kunmap_atomic(pdpt, KM_USER0); | ||
326 | spin_unlock(&vcpu->kvm->lock); | ||
327 | |||
328 | return i != 4; | ||
329 | } | ||
330 | |||
331 | void set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) | ||
332 | { | ||
333 | if (cr0 & CR0_RESEVED_BITS) { | ||
334 | printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n", | ||
335 | cr0, vcpu->cr0); | ||
336 | inject_gp(vcpu); | ||
337 | return; | ||
338 | } | ||
339 | |||
340 | if ((cr0 & CR0_NW_MASK) && !(cr0 & CR0_CD_MASK)) { | ||
341 | printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n"); | ||
342 | inject_gp(vcpu); | ||
343 | return; | ||
344 | } | ||
345 | |||
346 | if ((cr0 & CR0_PG_MASK) && !(cr0 & CR0_PE_MASK)) { | ||
347 | printk(KERN_DEBUG "set_cr0: #GP, set PG flag " | ||
348 | "and a clear PE flag\n"); | ||
349 | inject_gp(vcpu); | ||
350 | return; | ||
351 | } | ||
352 | |||
353 | if (!is_paging(vcpu) && (cr0 & CR0_PG_MASK)) { | ||
354 | #ifdef __x86_64__ | ||
355 | if ((vcpu->shadow_efer & EFER_LME)) { | ||
356 | int cs_db, cs_l; | ||
357 | |||
358 | if (!is_pae(vcpu)) { | ||
359 | printk(KERN_DEBUG "set_cr0: #GP, start paging " | ||
360 | "in long mode while PAE is disabled\n"); | ||
361 | inject_gp(vcpu); | ||
362 | return; | ||
363 | } | ||
364 | kvm_arch_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); | ||
365 | if (cs_l) { | ||
366 | printk(KERN_DEBUG "set_cr0: #GP, start paging " | ||
367 | "in long mode while CS.L == 1\n"); | ||
368 | inject_gp(vcpu); | ||
369 | return; | ||
370 | |||
371 | } | ||
372 | } else | ||
373 | #endif | ||
374 | if (is_pae(vcpu) && | ||
375 | pdptrs_have_reserved_bits_set(vcpu, vcpu->cr3)) { | ||
376 | printk(KERN_DEBUG "set_cr0: #GP, pdptrs " | ||
377 | "reserved bits\n"); | ||
378 | inject_gp(vcpu); | ||
379 | return; | ||
380 | } | ||
381 | |||
382 | } | ||
383 | |||
384 | kvm_arch_ops->set_cr0(vcpu, cr0); | ||
385 | vcpu->cr0 = cr0; | ||
386 | |||
387 | spin_lock(&vcpu->kvm->lock); | ||
388 | kvm_mmu_reset_context(vcpu); | ||
389 | spin_unlock(&vcpu->kvm->lock); | ||
390 | return; | ||
391 | } | ||
392 | EXPORT_SYMBOL_GPL(set_cr0); | ||
393 | |||
394 | void lmsw(struct kvm_vcpu *vcpu, unsigned long msw) | ||
395 | { | ||
396 | set_cr0(vcpu, (vcpu->cr0 & ~0x0ful) | (msw & 0x0f)); | ||
397 | } | ||
398 | EXPORT_SYMBOL_GPL(lmsw); | ||
399 | |||
400 | void set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) | ||
401 | { | ||
402 | if (cr4 & CR4_RESEVED_BITS) { | ||
403 | printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n"); | ||
404 | inject_gp(vcpu); | ||
405 | return; | ||
406 | } | ||
407 | |||
408 | if (kvm_arch_ops->is_long_mode(vcpu)) { | ||
409 | if (!(cr4 & CR4_PAE_MASK)) { | ||
410 | printk(KERN_DEBUG "set_cr4: #GP, clearing PAE while " | ||
411 | "in long mode\n"); | ||
412 | inject_gp(vcpu); | ||
413 | return; | ||
414 | } | ||
415 | } else if (is_paging(vcpu) && !is_pae(vcpu) && (cr4 & CR4_PAE_MASK) | ||
416 | && pdptrs_have_reserved_bits_set(vcpu, vcpu->cr3)) { | ||
417 | printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n"); | ||
418 | inject_gp(vcpu); | ||
419 | } | ||
420 | |||
421 | if (cr4 & CR4_VMXE_MASK) { | ||
422 | printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n"); | ||
423 | inject_gp(vcpu); | ||
424 | return; | ||
425 | } | ||
426 | kvm_arch_ops->set_cr4(vcpu, cr4); | ||
427 | spin_lock(&vcpu->kvm->lock); | ||
428 | kvm_mmu_reset_context(vcpu); | ||
429 | spin_unlock(&vcpu->kvm->lock); | ||
430 | } | ||
431 | EXPORT_SYMBOL_GPL(set_cr4); | ||
432 | |||
433 | void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) | ||
434 | { | ||
435 | if (kvm_arch_ops->is_long_mode(vcpu)) { | ||
436 | if ( cr3 & CR3_L_MODE_RESEVED_BITS) { | ||
437 | printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n"); | ||
438 | inject_gp(vcpu); | ||
439 | return; | ||
440 | } | ||
441 | } else { | ||
442 | if (cr3 & CR3_RESEVED_BITS) { | ||
443 | printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n"); | ||
444 | inject_gp(vcpu); | ||
445 | return; | ||
446 | } | ||
447 | if (is_paging(vcpu) && is_pae(vcpu) && | ||
448 | pdptrs_have_reserved_bits_set(vcpu, cr3)) { | ||
449 | printk(KERN_DEBUG "set_cr3: #GP, pdptrs " | ||
450 | "reserved bits\n"); | ||
451 | inject_gp(vcpu); | ||
452 | return; | ||
453 | } | ||
454 | } | ||
455 | |||
456 | vcpu->cr3 = cr3; | ||
457 | spin_lock(&vcpu->kvm->lock); | ||
458 | vcpu->mmu.new_cr3(vcpu); | ||
459 | spin_unlock(&vcpu->kvm->lock); | ||
460 | } | ||
461 | EXPORT_SYMBOL_GPL(set_cr3); | ||
462 | |||
463 | void set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) | ||
464 | { | ||
465 | if ( cr8 & CR8_RESEVED_BITS) { | ||
466 | printk(KERN_DEBUG "set_cr8: #GP, reserved bits 0x%lx\n", cr8); | ||
467 | inject_gp(vcpu); | ||
468 | return; | ||
469 | } | ||
470 | vcpu->cr8 = cr8; | ||
471 | } | ||
472 | EXPORT_SYMBOL_GPL(set_cr8); | ||
473 | |||
474 | void fx_init(struct kvm_vcpu *vcpu) | ||
475 | { | ||
476 | struct __attribute__ ((__packed__)) fx_image_s { | ||
477 | u16 control; //fcw | ||
478 | u16 status; //fsw | ||
479 | u16 tag; // ftw | ||
480 | u16 opcode; //fop | ||
481 | u64 ip; // fpu ip | ||
482 | u64 operand;// fpu dp | ||
483 | u32 mxcsr; | ||
484 | u32 mxcsr_mask; | ||
485 | |||
486 | } *fx_image; | ||
487 | |||
488 | fx_save(vcpu->host_fx_image); | ||
489 | fpu_init(); | ||
490 | fx_save(vcpu->guest_fx_image); | ||
491 | fx_restore(vcpu->host_fx_image); | ||
492 | |||
493 | fx_image = (struct fx_image_s *)vcpu->guest_fx_image; | ||
494 | fx_image->mxcsr = 0x1f80; | ||
495 | memset(vcpu->guest_fx_image + sizeof(struct fx_image_s), | ||
496 | 0, FX_IMAGE_SIZE - sizeof(struct fx_image_s)); | ||
497 | } | ||
498 | EXPORT_SYMBOL_GPL(fx_init); | ||
499 | |||
500 | /* | ||
501 | * Creates some virtual cpus. Good luck creating more than one. | ||
502 | */ | ||
503 | static int kvm_dev_ioctl_create_vcpu(struct kvm *kvm, int n) | ||
504 | { | ||
505 | int r; | ||
506 | struct kvm_vcpu *vcpu; | ||
507 | |||
508 | r = -EINVAL; | ||
509 | if (n < 0 || n >= KVM_MAX_VCPUS) | ||
510 | goto out; | ||
511 | |||
512 | vcpu = &kvm->vcpus[n]; | ||
513 | |||
514 | mutex_lock(&vcpu->mutex); | ||
515 | |||
516 | if (vcpu->vmcs) { | ||
517 | mutex_unlock(&vcpu->mutex); | ||
518 | return -EEXIST; | ||
519 | } | ||
520 | |||
521 | vcpu->host_fx_image = (char*)ALIGN((hva_t)vcpu->fx_buf, | ||
522 | FX_IMAGE_ALIGN); | ||
523 | vcpu->guest_fx_image = vcpu->host_fx_image + FX_IMAGE_SIZE; | ||
524 | |||
525 | vcpu->cpu = -1; /* First load will set up TR */ | ||
526 | vcpu->kvm = kvm; | ||
527 | r = kvm_arch_ops->vcpu_create(vcpu); | ||
528 | if (r < 0) | ||
529 | goto out_free_vcpus; | ||
530 | |||
531 | kvm_arch_ops->vcpu_load(vcpu); | ||
532 | |||
533 | r = kvm_arch_ops->vcpu_setup(vcpu); | ||
534 | if (r >= 0) | ||
535 | r = kvm_mmu_init(vcpu); | ||
536 | |||
537 | vcpu_put(vcpu); | ||
538 | |||
539 | if (r < 0) | ||
540 | goto out_free_vcpus; | ||
541 | |||
542 | return 0; | ||
543 | |||
544 | out_free_vcpus: | ||
545 | kvm_free_vcpu(vcpu); | ||
546 | mutex_unlock(&vcpu->mutex); | ||
547 | out: | ||
548 | return r; | ||
549 | } | ||
550 | |||
551 | /* | ||
552 | * Allocate some memory and give it an address in the guest physical address | ||
553 | * space. | ||
554 | * | ||
555 | * Discontiguous memory is allowed, mostly for framebuffers. | ||
556 | */ | ||
557 | static int kvm_dev_ioctl_set_memory_region(struct kvm *kvm, | ||
558 | struct kvm_memory_region *mem) | ||
559 | { | ||
560 | int r; | ||
561 | gfn_t base_gfn; | ||
562 | unsigned long npages; | ||
563 | unsigned long i; | ||
564 | struct kvm_memory_slot *memslot; | ||
565 | struct kvm_memory_slot old, new; | ||
566 | int memory_config_version; | ||
567 | |||
568 | r = -EINVAL; | ||
569 | /* General sanity checks */ | ||
570 | if (mem->memory_size & (PAGE_SIZE - 1)) | ||
571 | goto out; | ||
572 | if (mem->guest_phys_addr & (PAGE_SIZE - 1)) | ||
573 | goto out; | ||
574 | if (mem->slot >= KVM_MEMORY_SLOTS) | ||
575 | goto out; | ||
576 | if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr) | ||
577 | goto out; | ||
578 | |||
579 | memslot = &kvm->memslots[mem->slot]; | ||
580 | base_gfn = mem->guest_phys_addr >> PAGE_SHIFT; | ||
581 | npages = mem->memory_size >> PAGE_SHIFT; | ||
582 | |||
583 | if (!npages) | ||
584 | mem->flags &= ~KVM_MEM_LOG_DIRTY_PAGES; | ||
585 | |||
586 | raced: | ||
587 | spin_lock(&kvm->lock); | ||
588 | |||
589 | memory_config_version = kvm->memory_config_version; | ||
590 | new = old = *memslot; | ||
591 | |||
592 | new.base_gfn = base_gfn; | ||
593 | new.npages = npages; | ||
594 | new.flags = mem->flags; | ||
595 | |||
596 | /* Disallow changing a memory slot's size. */ | ||
597 | r = -EINVAL; | ||
598 | if (npages && old.npages && npages != old.npages) | ||
599 | goto out_unlock; | ||
600 | |||
601 | /* Check for overlaps */ | ||
602 | r = -EEXIST; | ||
603 | for (i = 0; i < KVM_MEMORY_SLOTS; ++i) { | ||
604 | struct kvm_memory_slot *s = &kvm->memslots[i]; | ||
605 | |||
606 | if (s == memslot) | ||
607 | continue; | ||
608 | if (!((base_gfn + npages <= s->base_gfn) || | ||
609 | (base_gfn >= s->base_gfn + s->npages))) | ||
610 | goto out_unlock; | ||
611 | } | ||
612 | /* | ||
613 | * Do memory allocations outside lock. memory_config_version will | ||
614 | * detect any races. | ||
615 | */ | ||
616 | spin_unlock(&kvm->lock); | ||
617 | |||
618 | /* Deallocate if slot is being removed */ | ||
619 | if (!npages) | ||
620 | new.phys_mem = 0; | ||
621 | |||
622 | /* Free page dirty bitmap if unneeded */ | ||
623 | if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES)) | ||
624 | new.dirty_bitmap = 0; | ||
625 | |||
626 | r = -ENOMEM; | ||
627 | |||
628 | /* Allocate if a slot is being created */ | ||
629 | if (npages && !new.phys_mem) { | ||
630 | new.phys_mem = vmalloc(npages * sizeof(struct page *)); | ||
631 | |||
632 | if (!new.phys_mem) | ||
633 | goto out_free; | ||
634 | |||
635 | memset(new.phys_mem, 0, npages * sizeof(struct page *)); | ||
636 | for (i = 0; i < npages; ++i) { | ||
637 | new.phys_mem[i] = alloc_page(GFP_HIGHUSER | ||
638 | | __GFP_ZERO); | ||
639 | if (!new.phys_mem[i]) | ||
640 | goto out_free; | ||
641 | } | ||
642 | } | ||
643 | |||
644 | /* Allocate page dirty bitmap if needed */ | ||
645 | if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) { | ||
646 | unsigned dirty_bytes = ALIGN(npages, BITS_PER_LONG) / 8; | ||
647 | |||
648 | new.dirty_bitmap = vmalloc(dirty_bytes); | ||
649 | if (!new.dirty_bitmap) | ||
650 | goto out_free; | ||
651 | memset(new.dirty_bitmap, 0, dirty_bytes); | ||
652 | } | ||
653 | |||
654 | spin_lock(&kvm->lock); | ||
655 | |||
656 | if (memory_config_version != kvm->memory_config_version) { | ||
657 | spin_unlock(&kvm->lock); | ||
658 | kvm_free_physmem_slot(&new, &old); | ||
659 | goto raced; | ||
660 | } | ||
661 | |||
662 | r = -EAGAIN; | ||
663 | if (kvm->busy) | ||
664 | goto out_unlock; | ||
665 | |||
666 | if (mem->slot >= kvm->nmemslots) | ||
667 | kvm->nmemslots = mem->slot + 1; | ||
668 | |||
669 | *memslot = new; | ||
670 | ++kvm->memory_config_version; | ||
671 | |||
672 | spin_unlock(&kvm->lock); | ||
673 | |||
674 | for (i = 0; i < KVM_MAX_VCPUS; ++i) { | ||
675 | struct kvm_vcpu *vcpu; | ||
676 | |||
677 | vcpu = vcpu_load(kvm, i); | ||
678 | if (!vcpu) | ||
679 | continue; | ||
680 | kvm_mmu_reset_context(vcpu); | ||
681 | vcpu_put(vcpu); | ||
682 | } | ||
683 | |||
684 | kvm_free_physmem_slot(&old, &new); | ||
685 | return 0; | ||
686 | |||
687 | out_unlock: | ||
688 | spin_unlock(&kvm->lock); | ||
689 | out_free: | ||
690 | kvm_free_physmem_slot(&new, &old); | ||
691 | out: | ||
692 | return r; | ||
693 | } | ||
694 | |||
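For orientation, the ioctl handler above is what userspace reaches through KVM_SET_MEMORY_REGION. The fragment below is an illustrative userspace sketch, not part of the commit; it assumes the KVM_SET_MEMORY_REGION ioctl number and the struct kvm_memory_region layout from the accompanying include/linux/kvm.h, which is outside this diffstat. In this early API the /dev/kvm file descriptor itself represents the virtual machine.

/* Illustrative userspace sketch only -- the ioctl name and the struct
 * layout are assumed from include/linux/kvm.h, not shown in this diff.
 * The kernel side allocates and owns the backing pages itself.
 */
#include <fcntl.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

int example_setup_ram(void)
{
	int kvm_fd = open("/dev/kvm", O_RDWR);
	struct kvm_memory_region ram = {
		.slot            = 0,
		.flags           = KVM_MEM_LOG_DIRTY_PAGES,
		.guest_phys_addr = 0,
		.memory_size     = 16 << 20,	/* 16 MB, page aligned */
	};

	if (kvm_fd < 0)
		return -1;
	return ioctl(kvm_fd, KVM_SET_MEMORY_REGION, &ram);
}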
695 | /* | ||
696 | * Get (and clear) the dirty memory log for a memory slot. | ||
697 | */ | ||
698 | static int kvm_dev_ioctl_get_dirty_log(struct kvm *kvm, | ||
699 | struct kvm_dirty_log *log) | ||
700 | { | ||
701 | struct kvm_memory_slot *memslot; | ||
702 | int r, i; | ||
703 | int n; | ||
704 | unsigned long any = 0; | ||
705 | |||
706 | spin_lock(&kvm->lock); | ||
707 | |||
708 | /* | ||
709 | * Prevent changes to guest memory configuration even while the lock | ||
710 | * is not taken. | ||
711 | */ | ||
712 | ++kvm->busy; | ||
713 | spin_unlock(&kvm->lock); | ||
714 | r = -EINVAL; | ||
715 | if (log->slot >= KVM_MEMORY_SLOTS) | ||
716 | goto out; | ||
717 | |||
718 | memslot = &kvm->memslots[log->slot]; | ||
719 | r = -ENOENT; | ||
720 | if (!memslot->dirty_bitmap) | ||
721 | goto out; | ||
722 | |||
723 | n = ALIGN(memslot->npages, 8) / 8; | ||
724 | |||
725 | for (i = 0; !any && i < n; ++i) | ||
726 | any = memslot->dirty_bitmap[i]; | ||
727 | |||
728 | r = -EFAULT; | ||
729 | if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n)) | ||
730 | goto out; | ||
731 | |||
732 | |||
733 | if (any) { | ||
734 | spin_lock(&kvm->lock); | ||
735 | kvm_mmu_slot_remove_write_access(kvm, log->slot); | ||
736 | spin_unlock(&kvm->lock); | ||
737 | memset(memslot->dirty_bitmap, 0, n); | ||
738 | for (i = 0; i < KVM_MAX_VCPUS; ++i) { | ||
739 | struct kvm_vcpu *vcpu = vcpu_load(kvm, i); | ||
740 | |||
741 | if (!vcpu) | ||
742 | continue; | ||
743 | kvm_arch_ops->tlb_flush(vcpu); | ||
744 | vcpu_put(vcpu); | ||
745 | } | ||
746 | } | ||
747 | |||
748 | r = 0; | ||
749 | |||
750 | out: | ||
751 | spin_lock(&kvm->lock); | ||
752 | --kvm->busy; | ||
753 | spin_unlock(&kvm->lock); | ||
754 | return r; | ||
755 | } | ||
756 | |||
757 | struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn) | ||
758 | { | ||
759 | int i; | ||
760 | |||
761 | for (i = 0; i < kvm->nmemslots; ++i) { | ||
762 | struct kvm_memory_slot *memslot = &kvm->memslots[i]; | ||
763 | |||
764 | if (gfn >= memslot->base_gfn | ||
765 | && gfn < memslot->base_gfn + memslot->npages) | ||
766 | return memslot; | ||
767 | } | ||
768 | return 0; | ||
769 | } | ||
770 | EXPORT_SYMBOL_GPL(gfn_to_memslot); | ||
771 | |||
772 | void mark_page_dirty(struct kvm *kvm, gfn_t gfn) | ||
773 | { | ||
774 | int i; | ||
775 | struct kvm_memory_slot *memslot = 0; | ||
776 | unsigned long rel_gfn; | ||
777 | |||
778 | for (i = 0; i < kvm->nmemslots; ++i) { | ||
779 | memslot = &kvm->memslots[i]; | ||
780 | |||
781 | if (gfn >= memslot->base_gfn | ||
782 | && gfn < memslot->base_gfn + memslot->npages) { | ||
783 | |||
784 | if (!memslot || !memslot->dirty_bitmap) | ||
785 | return; | ||
786 | |||
787 | rel_gfn = gfn - memslot->base_gfn; | ||
788 | |||
789 | /* avoid RMW */ | ||
790 | if (!test_bit(rel_gfn, memslot->dirty_bitmap)) | ||
791 | set_bit(rel_gfn, memslot->dirty_bitmap); | ||
792 | return; | ||
793 | } | ||
794 | } | ||
795 | } | ||
796 | |||
797 | static int emulator_read_std(unsigned long addr, | ||
798 | unsigned long *val, | ||
799 | unsigned int bytes, | ||
800 | struct x86_emulate_ctxt *ctxt) | ||
801 | { | ||
802 | struct kvm_vcpu *vcpu = ctxt->vcpu; | ||
803 | void *data = val; | ||
804 | |||
805 | while (bytes) { | ||
806 | gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr); | ||
807 | unsigned offset = addr & (PAGE_SIZE-1); | ||
808 | unsigned tocopy = min(bytes, (unsigned)PAGE_SIZE - offset); | ||
809 | unsigned long pfn; | ||
810 | struct kvm_memory_slot *memslot; | ||
811 | void *page; | ||
812 | |||
813 | if (gpa == UNMAPPED_GVA) | ||
814 | return X86EMUL_PROPAGATE_FAULT; | ||
815 | pfn = gpa >> PAGE_SHIFT; | ||
816 | memslot = gfn_to_memslot(vcpu->kvm, pfn); | ||
817 | if (!memslot) | ||
818 | return X86EMUL_UNHANDLEABLE; | ||
819 | page = kmap_atomic(gfn_to_page(memslot, pfn), KM_USER0); | ||
820 | |||
821 | memcpy(data, page + offset, tocopy); | ||
822 | |||
823 | kunmap_atomic(page, KM_USER0); | ||
824 | |||
825 | bytes -= tocopy; | ||
826 | data += tocopy; | ||
827 | addr += tocopy; | ||
828 | } | ||
829 | |||
830 | return X86EMUL_CONTINUE; | ||
831 | } | ||
832 | |||
833 | static int emulator_write_std(unsigned long addr, | ||
834 | unsigned long val, | ||
835 | unsigned int bytes, | ||
836 | struct x86_emulate_ctxt *ctxt) | ||
837 | { | ||
838 | printk(KERN_ERR "emulator_write_std: addr %lx n %d\n", | ||
839 | addr, bytes); | ||
840 | return X86EMUL_UNHANDLEABLE; | ||
841 | } | ||
842 | |||
843 | static int emulator_read_emulated(unsigned long addr, | ||
844 | unsigned long *val, | ||
845 | unsigned int bytes, | ||
846 | struct x86_emulate_ctxt *ctxt) | ||
847 | { | ||
848 | struct kvm_vcpu *vcpu = ctxt->vcpu; | ||
849 | |||
850 | if (vcpu->mmio_read_completed) { | ||
851 | memcpy(val, vcpu->mmio_data, bytes); | ||
852 | vcpu->mmio_read_completed = 0; | ||
853 | return X86EMUL_CONTINUE; | ||
854 | } else if (emulator_read_std(addr, val, bytes, ctxt) | ||
855 | == X86EMUL_CONTINUE) | ||
856 | return X86EMUL_CONTINUE; | ||
857 | else { | ||
858 | gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr); | ||
859 | if (gpa == UNMAPPED_GVA) | ||
860 | return vcpu_printf(vcpu, "not present\n"), X86EMUL_PROPAGATE_FAULT; | ||
861 | vcpu->mmio_needed = 1; | ||
862 | vcpu->mmio_phys_addr = gpa; | ||
863 | vcpu->mmio_size = bytes; | ||
864 | vcpu->mmio_is_write = 0; | ||
865 | |||
866 | return X86EMUL_UNHANDLEABLE; | ||
867 | } | ||
868 | } | ||
869 | |||
870 | static int emulator_write_emulated(unsigned long addr, | ||
871 | unsigned long val, | ||
872 | unsigned int bytes, | ||
873 | struct x86_emulate_ctxt *ctxt) | ||
874 | { | ||
875 | struct kvm_vcpu *vcpu = ctxt->vcpu; | ||
876 | gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr); | ||
877 | |||
878 | if (gpa == UNMAPPED_GVA) | ||
879 | return X86EMUL_PROPAGATE_FAULT; | ||
880 | |||
881 | vcpu->mmio_needed = 1; | ||
882 | vcpu->mmio_phys_addr = gpa; | ||
883 | vcpu->mmio_size = bytes; | ||
884 | vcpu->mmio_is_write = 1; | ||
885 | memcpy(vcpu->mmio_data, &val, bytes); | ||
886 | |||
887 | return X86EMUL_CONTINUE; | ||
888 | } | ||
889 | |||
890 | static int emulator_cmpxchg_emulated(unsigned long addr, | ||
891 | unsigned long old, | ||
892 | unsigned long new, | ||
893 | unsigned int bytes, | ||
894 | struct x86_emulate_ctxt *ctxt) | ||
895 | { | ||
896 | static int reported; | ||
897 | |||
898 | if (!reported) { | ||
899 | reported = 1; | ||
900 | printk(KERN_WARNING "kvm: emulating exchange as write\n"); | ||
901 | } | ||
902 | return emulator_write_emulated(addr, new, bytes, ctxt); | ||
903 | } | ||
904 | |||
905 | static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg) | ||
906 | { | ||
907 | return kvm_arch_ops->get_segment_base(vcpu, seg); | ||
908 | } | ||
909 | |||
910 | int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address) | ||
911 | { | ||
912 | spin_lock(&vcpu->kvm->lock); | ||
913 | vcpu->mmu.inval_page(vcpu, address); | ||
914 | spin_unlock(&vcpu->kvm->lock); | ||
915 | kvm_arch_ops->invlpg(vcpu, address); | ||
916 | return X86EMUL_CONTINUE; | ||
917 | } | ||
918 | |||
919 | int emulate_clts(struct kvm_vcpu *vcpu) | ||
920 | { | ||
921 | unsigned long cr0 = vcpu->cr0; | ||
922 | |||
923 | cr0 &= ~CR0_TS_MASK; | ||
924 | kvm_arch_ops->set_cr0(vcpu, cr0); | ||
925 | return X86EMUL_CONTINUE; | ||
926 | } | ||
927 | |||
928 | int emulator_get_dr(struct x86_emulate_ctxt* ctxt, int dr, unsigned long *dest) | ||
929 | { | ||
930 | struct kvm_vcpu *vcpu = ctxt->vcpu; | ||
931 | |||
932 | switch (dr) { | ||
933 | case 0 ... 3: | ||
934 | *dest = kvm_arch_ops->get_dr(vcpu, dr); | ||
935 | return X86EMUL_CONTINUE; | ||
936 | default: | ||
937 | printk(KERN_DEBUG "%s: unexpected dr %u\n", | ||
938 | __FUNCTION__, dr); | ||
939 | return X86EMUL_UNHANDLEABLE; | ||
940 | } | ||
941 | } | ||
942 | |||
943 | int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value) | ||
944 | { | ||
945 | unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U; | ||
946 | int exception; | ||
947 | |||
948 | kvm_arch_ops->set_dr(ctxt->vcpu, dr, value & mask, &exception); | ||
949 | if (exception) { | ||
950 | /* FIXME: better handling */ | ||
951 | return X86EMUL_UNHANDLEABLE; | ||
952 | } | ||
953 | return X86EMUL_CONTINUE; | ||
954 | } | ||
955 | |||
956 | static void report_emulation_failure(struct x86_emulate_ctxt *ctxt) | ||
957 | { | ||
958 | static int reported; | ||
959 | u8 opcodes[4]; | ||
960 | unsigned long rip = ctxt->vcpu->rip; | ||
961 | unsigned long rip_linear; | ||
962 | |||
963 | rip_linear = rip + get_segment_base(ctxt->vcpu, VCPU_SREG_CS); | ||
964 | |||
965 | if (reported) | ||
966 | return; | ||
967 | |||
968 | emulator_read_std(rip_linear, (void *)opcodes, 4, ctxt); | ||
969 | |||
970 | printk(KERN_ERR "emulation failed but !mmio_needed?" | ||
971 | " rip %lx %02x %02x %02x %02x\n", | ||
972 | rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]); | ||
973 | reported = 1; | ||
974 | } | ||
975 | |||
976 | struct x86_emulate_ops emulate_ops = { | ||
977 | .read_std = emulator_read_std, | ||
978 | .write_std = emulator_write_std, | ||
979 | .read_emulated = emulator_read_emulated, | ||
980 | .write_emulated = emulator_write_emulated, | ||
981 | .cmpxchg_emulated = emulator_cmpxchg_emulated, | ||
982 | }; | ||
983 | |||
984 | int emulate_instruction(struct kvm_vcpu *vcpu, | ||
985 | struct kvm_run *run, | ||
986 | unsigned long cr2, | ||
987 | u16 error_code) | ||
988 | { | ||
989 | struct x86_emulate_ctxt emulate_ctxt; | ||
990 | int r; | ||
991 | int cs_db, cs_l; | ||
992 | |||
993 | kvm_arch_ops->cache_regs(vcpu); | ||
994 | |||
995 | kvm_arch_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); | ||
996 | |||
997 | emulate_ctxt.vcpu = vcpu; | ||
998 | emulate_ctxt.eflags = kvm_arch_ops->get_rflags(vcpu); | ||
999 | emulate_ctxt.cr2 = cr2; | ||
1000 | emulate_ctxt.mode = (emulate_ctxt.eflags & X86_EFLAGS_VM) | ||
1001 | ? X86EMUL_MODE_REAL : cs_l | ||
1002 | ? X86EMUL_MODE_PROT64 : cs_db | ||
1003 | ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; | ||
1004 | |||
1005 | if (emulate_ctxt.mode == X86EMUL_MODE_PROT64) { | ||
1006 | emulate_ctxt.cs_base = 0; | ||
1007 | emulate_ctxt.ds_base = 0; | ||
1008 | emulate_ctxt.es_base = 0; | ||
1009 | emulate_ctxt.ss_base = 0; | ||
1010 | } else { | ||
1011 | emulate_ctxt.cs_base = get_segment_base(vcpu, VCPU_SREG_CS); | ||
1012 | emulate_ctxt.ds_base = get_segment_base(vcpu, VCPU_SREG_DS); | ||
1013 | emulate_ctxt.es_base = get_segment_base(vcpu, VCPU_SREG_ES); | ||
1014 | emulate_ctxt.ss_base = get_segment_base(vcpu, VCPU_SREG_SS); | ||
1015 | } | ||
1016 | |||
1017 | emulate_ctxt.gs_base = get_segment_base(vcpu, VCPU_SREG_GS); | ||
1018 | emulate_ctxt.fs_base = get_segment_base(vcpu, VCPU_SREG_FS); | ||
1019 | |||
1020 | vcpu->mmio_is_write = 0; | ||
1021 | r = x86_emulate_memop(&emulate_ctxt, &emulate_ops); | ||
1022 | |||
1023 | if ((r || vcpu->mmio_is_write) && run) { | ||
1024 | run->mmio.phys_addr = vcpu->mmio_phys_addr; | ||
1025 | memcpy(run->mmio.data, vcpu->mmio_data, 8); | ||
1026 | run->mmio.len = vcpu->mmio_size; | ||
1027 | run->mmio.is_write = vcpu->mmio_is_write; | ||
1028 | } | ||
1029 | |||
1030 | if (r) { | ||
1031 | if (!vcpu->mmio_needed) { | ||
1032 | report_emulation_failure(&emulate_ctxt); | ||
1033 | return EMULATE_FAIL; | ||
1034 | } | ||
1035 | return EMULATE_DO_MMIO; | ||
1036 | } | ||
1037 | |||
1038 | kvm_arch_ops->decache_regs(vcpu); | ||
1039 | kvm_arch_ops->set_rflags(vcpu, emulate_ctxt.eflags); | ||
1040 | |||
1041 | if (vcpu->mmio_is_write) | ||
1042 | return EMULATE_DO_MMIO; | ||
1043 | |||
1044 | return EMULATE_DONE; | ||
1045 | } | ||
1046 | EXPORT_SYMBOL_GPL(emulate_instruction); | ||
1047 | |||
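A brief aside on how emulate_instruction() is meant to be consumed: the enum emulation_result in kvm.h distinguishes a completed instruction, an MMIO access that userspace must satisfy, and outright failure. The handler below is a hypothetical sketch, not part of the commit; the real consumers are the exit handlers in vmx.c and svm.c, which do not appear in this excerpt.

/* Illustrative only: a hypothetical page-fault exit handler dispatching
 * on enum emulation_result.  Returning 1 keeps the guest running;
 * returning 0 hands control back to userspace.
 */
static int example_handle_pf(struct kvm_vcpu *vcpu, struct kvm_run *run,
			     unsigned long cr2, u16 error_code)
{
	switch (emulate_instruction(vcpu, run, cr2, error_code)) {
	case EMULATE_DONE:
		return 1;	/* instruction fully emulated in kernel */
	case EMULATE_DO_MMIO:
		/* run->mmio has been filled in by emulate_instruction() */
		return 0;	/* let userspace satisfy the access */
	case EMULATE_FAIL:
	default:
		return 0;	/* give up and report to userspace */
	}
}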
1048 | static u64 mk_cr_64(u64 curr_cr, u32 new_val) | ||
1049 | { | ||
1050 | return (curr_cr & ~((1ULL << 32) - 1)) | new_val; | ||
1051 | } | ||
1052 | |||
1053 | void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base) | ||
1054 | { | ||
1055 | struct descriptor_table dt = { limit, base }; | ||
1056 | |||
1057 | kvm_arch_ops->set_gdt(vcpu, &dt); | ||
1058 | } | ||
1059 | |||
1060 | void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base) | ||
1061 | { | ||
1062 | struct descriptor_table dt = { limit, base }; | ||
1063 | |||
1064 | kvm_arch_ops->set_idt(vcpu, &dt); | ||
1065 | } | ||
1066 | |||
1067 | void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw, | ||
1068 | unsigned long *rflags) | ||
1069 | { | ||
1070 | lmsw(vcpu, msw); | ||
1071 | *rflags = kvm_arch_ops->get_rflags(vcpu); | ||
1072 | } | ||
1073 | |||
1074 | unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr) | ||
1075 | { | ||
1076 | switch (cr) { | ||
1077 | case 0: | ||
1078 | return vcpu->cr0; | ||
1079 | case 2: | ||
1080 | return vcpu->cr2; | ||
1081 | case 3: | ||
1082 | return vcpu->cr3; | ||
1083 | case 4: | ||
1084 | return vcpu->cr4; | ||
1085 | default: | ||
1086 | vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr); | ||
1087 | return 0; | ||
1088 | } | ||
1089 | } | ||
1090 | |||
1091 | void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val, | ||
1092 | unsigned long *rflags) | ||
1093 | { | ||
1094 | switch (cr) { | ||
1095 | case 0: | ||
1096 | set_cr0(vcpu, mk_cr_64(vcpu->cr0, val)); | ||
1097 | *rflags = kvm_arch_ops->get_rflags(vcpu); | ||
1098 | break; | ||
1099 | case 2: | ||
1100 | vcpu->cr2 = val; | ||
1101 | break; | ||
1102 | case 3: | ||
1103 | set_cr3(vcpu, val); | ||
1104 | break; | ||
1105 | case 4: | ||
1106 | set_cr4(vcpu, mk_cr_64(vcpu->cr4, val)); | ||
1107 | break; | ||
1108 | default: | ||
1109 | vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr); | ||
1110 | } | ||
1111 | } | ||
1112 | |||
1113 | /* | ||
1114 | * Reads an msr value (of 'msr_index') into 'pdata'. | ||
1115 | * Returns 0 on success, non-0 otherwise. | ||
1116 | * Assumes vcpu_load() was already called. | ||
1117 | */ | ||
1118 | static int get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) | ||
1119 | { | ||
1120 | return kvm_arch_ops->get_msr(vcpu, msr_index, pdata); | ||
1121 | } | ||
1122 | |||
1123 | #ifdef __x86_64__ | ||
1124 | |||
1125 | void set_efer(struct kvm_vcpu *vcpu, u64 efer) | ||
1126 | { | ||
1127 | struct vmx_msr_entry *msr; | ||
1128 | |||
1129 | if (efer & EFER_RESERVED_BITS) { | ||
1130 | printk(KERN_DEBUG "set_efer: 0x%llx #GP, reserved bits\n", | ||
1131 | efer); | ||
1132 | inject_gp(vcpu); | ||
1133 | return; | ||
1134 | } | ||
1135 | |||
1136 | if (is_paging(vcpu) | ||
1137 | && (vcpu->shadow_efer & EFER_LME) != (efer & EFER_LME)) { | ||
1138 | printk(KERN_DEBUG "set_efer: #GP, change LME while paging\n"); | ||
1139 | inject_gp(vcpu); | ||
1140 | return; | ||
1141 | } | ||
1142 | |||
1143 | efer &= ~EFER_LMA; | ||
1144 | efer |= vcpu->shadow_efer & EFER_LMA; | ||
1145 | |||
1146 | vcpu->shadow_efer = efer; | ||
1147 | |||
1148 | msr = find_msr_entry(vcpu, MSR_EFER); | ||
1149 | |||
1150 | if (!(efer & EFER_LMA)) | ||
1151 | efer &= ~EFER_LME; | ||
1152 | msr->data = efer; | ||
1153 | } | ||
1154 | EXPORT_SYMBOL_GPL(set_efer); | ||
1155 | |||
1156 | #endif | ||
1157 | |||
1158 | /* | ||
1159 | * Writes an msr value into the appropriate "register". | ||
1160 | * Returns 0 on success, non-0 otherwise. | ||
1161 | * Assumes vcpu_load() was already called. | ||
1162 | */ | ||
1163 | static int set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) | ||
1164 | { | ||
1165 | return kvm_arch_ops->set_msr(vcpu, msr_index, data); | ||
1166 | } | ||
1167 | |||
1168 | void kvm_resched(struct kvm_vcpu *vcpu) | ||
1169 | { | ||
1170 | vcpu_put(vcpu); | ||
1171 | cond_resched(); | ||
1172 | /* Cannot fail - no vcpu unplug yet. */ | ||
1173 | vcpu_load(vcpu->kvm, vcpu_slot(vcpu)); | ||
1174 | } | ||
1175 | EXPORT_SYMBOL_GPL(kvm_resched); | ||
1176 | |||
1177 | void load_msrs(struct vmx_msr_entry *e, int n) | ||
1178 | { | ||
1179 | int i; | ||
1180 | |||
1181 | for (i = 0; i < n; ++i) | ||
1182 | wrmsrl(e[i].index, e[i].data); | ||
1183 | } | ||
1184 | EXPORT_SYMBOL_GPL(load_msrs); | ||
1185 | |||
1186 | void save_msrs(struct vmx_msr_entry *e, int n) | ||
1187 | { | ||
1188 | int i; | ||
1189 | |||
1190 | for (i = 0; i < n; ++i) | ||
1191 | rdmsrl(e[i].index, e[i].data); | ||
1192 | } | ||
1193 | EXPORT_SYMBOL_GPL(save_msrs); | ||
1194 | |||
1195 | static int kvm_dev_ioctl_run(struct kvm *kvm, struct kvm_run *kvm_run) | ||
1196 | { | ||
1197 | struct kvm_vcpu *vcpu; | ||
1198 | int r; | ||
1199 | |||
1200 | if (kvm_run->vcpu < 0 || kvm_run->vcpu >= KVM_MAX_VCPUS) | ||
1201 | return -EINVAL; | ||
1202 | |||
1203 | vcpu = vcpu_load(kvm, kvm_run->vcpu); | ||
1204 | if (!vcpu) | ||
1205 | return -ENOENT; | ||
1206 | |||
1207 | if (kvm_run->emulated) { | ||
1208 | kvm_arch_ops->skip_emulated_instruction(vcpu); | ||
1209 | kvm_run->emulated = 0; | ||
1210 | } | ||
1211 | |||
1212 | if (kvm_run->mmio_completed) { | ||
1213 | memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8); | ||
1214 | vcpu->mmio_read_completed = 1; | ||
1215 | } | ||
1216 | |||
1217 | vcpu->mmio_needed = 0; | ||
1218 | |||
1219 | r = kvm_arch_ops->run(vcpu, kvm_run); | ||
1220 | |||
1221 | vcpu_put(vcpu); | ||
1222 | return r; | ||
1223 | } | ||
1224 | |||
1225 | static int kvm_dev_ioctl_get_regs(struct kvm *kvm, struct kvm_regs *regs) | ||
1226 | { | ||
1227 | struct kvm_vcpu *vcpu; | ||
1228 | |||
1229 | if (regs->vcpu < 0 || regs->vcpu >= KVM_MAX_VCPUS) | ||
1230 | return -EINVAL; | ||
1231 | |||
1232 | vcpu = vcpu_load(kvm, regs->vcpu); | ||
1233 | if (!vcpu) | ||
1234 | return -ENOENT; | ||
1235 | |||
1236 | kvm_arch_ops->cache_regs(vcpu); | ||
1237 | |||
1238 | regs->rax = vcpu->regs[VCPU_REGS_RAX]; | ||
1239 | regs->rbx = vcpu->regs[VCPU_REGS_RBX]; | ||
1240 | regs->rcx = vcpu->regs[VCPU_REGS_RCX]; | ||
1241 | regs->rdx = vcpu->regs[VCPU_REGS_RDX]; | ||
1242 | regs->rsi = vcpu->regs[VCPU_REGS_RSI]; | ||
1243 | regs->rdi = vcpu->regs[VCPU_REGS_RDI]; | ||
1244 | regs->rsp = vcpu->regs[VCPU_REGS_RSP]; | ||
1245 | regs->rbp = vcpu->regs[VCPU_REGS_RBP]; | ||
1246 | #ifdef __x86_64__ | ||
1247 | regs->r8 = vcpu->regs[VCPU_REGS_R8]; | ||
1248 | regs->r9 = vcpu->regs[VCPU_REGS_R9]; | ||
1249 | regs->r10 = vcpu->regs[VCPU_REGS_R10]; | ||
1250 | regs->r11 = vcpu->regs[VCPU_REGS_R11]; | ||
1251 | regs->r12 = vcpu->regs[VCPU_REGS_R12]; | ||
1252 | regs->r13 = vcpu->regs[VCPU_REGS_R13]; | ||
1253 | regs->r14 = vcpu->regs[VCPU_REGS_R14]; | ||
1254 | regs->r15 = vcpu->regs[VCPU_REGS_R15]; | ||
1255 | #endif | ||
1256 | |||
1257 | regs->rip = vcpu->rip; | ||
1258 | regs->rflags = kvm_arch_ops->get_rflags(vcpu); | ||
1259 | |||
1260 | /* | ||
1261 | * Don't leak debug flags in case they were set for guest debugging | ||
1262 | */ | ||
1263 | if (vcpu->guest_debug.enabled && vcpu->guest_debug.singlestep) | ||
1264 | regs->rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF); | ||
1265 | |||
1266 | vcpu_put(vcpu); | ||
1267 | |||
1268 | return 0; | ||
1269 | } | ||
1270 | |||
1271 | static int kvm_dev_ioctl_set_regs(struct kvm *kvm, struct kvm_regs *regs) | ||
1272 | { | ||
1273 | struct kvm_vcpu *vcpu; | ||
1274 | |||
1275 | if (regs->vcpu < 0 || regs->vcpu >= KVM_MAX_VCPUS) | ||
1276 | return -EINVAL; | ||
1277 | |||
1278 | vcpu = vcpu_load(kvm, regs->vcpu); | ||
1279 | if (!vcpu) | ||
1280 | return -ENOENT; | ||
1281 | |||
1282 | vcpu->regs[VCPU_REGS_RAX] = regs->rax; | ||
1283 | vcpu->regs[VCPU_REGS_RBX] = regs->rbx; | ||
1284 | vcpu->regs[VCPU_REGS_RCX] = regs->rcx; | ||
1285 | vcpu->regs[VCPU_REGS_RDX] = regs->rdx; | ||
1286 | vcpu->regs[VCPU_REGS_RSI] = regs->rsi; | ||
1287 | vcpu->regs[VCPU_REGS_RDI] = regs->rdi; | ||
1288 | vcpu->regs[VCPU_REGS_RSP] = regs->rsp; | ||
1289 | vcpu->regs[VCPU_REGS_RBP] = regs->rbp; | ||
1290 | #ifdef __x86_64__ | ||
1291 | vcpu->regs[VCPU_REGS_R8] = regs->r8; | ||
1292 | vcpu->regs[VCPU_REGS_R9] = regs->r9; | ||
1293 | vcpu->regs[VCPU_REGS_R10] = regs->r10; | ||
1294 | vcpu->regs[VCPU_REGS_R11] = regs->r11; | ||
1295 | vcpu->regs[VCPU_REGS_R12] = regs->r12; | ||
1296 | vcpu->regs[VCPU_REGS_R13] = regs->r13; | ||
1297 | vcpu->regs[VCPU_REGS_R14] = regs->r14; | ||
1298 | vcpu->regs[VCPU_REGS_R15] = regs->r15; | ||
1299 | #endif | ||
1300 | |||
1301 | vcpu->rip = regs->rip; | ||
1302 | kvm_arch_ops->set_rflags(vcpu, regs->rflags); | ||
1303 | |||
1304 | kvm_arch_ops->decache_regs(vcpu); | ||
1305 | |||
1306 | vcpu_put(vcpu); | ||
1307 | |||
1308 | return 0; | ||
1309 | } | ||
1310 | |||
1311 | static void get_segment(struct kvm_vcpu *vcpu, | ||
1312 | struct kvm_segment *var, int seg) | ||
1313 | { | ||
1314 | return kvm_arch_ops->get_segment(vcpu, var, seg); | ||
1315 | } | ||
1316 | |||
1317 | static int kvm_dev_ioctl_get_sregs(struct kvm *kvm, struct kvm_sregs *sregs) | ||
1318 | { | ||
1319 | struct kvm_vcpu *vcpu; | ||
1320 | struct descriptor_table dt; | ||
1321 | |||
1322 | if (sregs->vcpu < 0 || sregs->vcpu >= KVM_MAX_VCPUS) | ||
1323 | return -EINVAL; | ||
1324 | vcpu = vcpu_load(kvm, sregs->vcpu); | ||
1325 | if (!vcpu) | ||
1326 | return -ENOENT; | ||
1327 | |||
1328 | get_segment(vcpu, &sregs->cs, VCPU_SREG_CS); | ||
1329 | get_segment(vcpu, &sregs->ds, VCPU_SREG_DS); | ||
1330 | get_segment(vcpu, &sregs->es, VCPU_SREG_ES); | ||
1331 | get_segment(vcpu, &sregs->fs, VCPU_SREG_FS); | ||
1332 | get_segment(vcpu, &sregs->gs, VCPU_SREG_GS); | ||
1333 | get_segment(vcpu, &sregs->ss, VCPU_SREG_SS); | ||
1334 | |||
1335 | get_segment(vcpu, &sregs->tr, VCPU_SREG_TR); | ||
1336 | get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); | ||
1337 | |||
1338 | kvm_arch_ops->get_idt(vcpu, &dt); | ||
1339 | sregs->idt.limit = dt.limit; | ||
1340 | sregs->idt.base = dt.base; | ||
1341 | kvm_arch_ops->get_gdt(vcpu, &dt); | ||
1342 | sregs->gdt.limit = dt.limit; | ||
1343 | sregs->gdt.base = dt.base; | ||
1344 | |||
1345 | sregs->cr0 = vcpu->cr0; | ||
1346 | sregs->cr2 = vcpu->cr2; | ||
1347 | sregs->cr3 = vcpu->cr3; | ||
1348 | sregs->cr4 = vcpu->cr4; | ||
1349 | sregs->cr8 = vcpu->cr8; | ||
1350 | sregs->efer = vcpu->shadow_efer; | ||
1351 | sregs->apic_base = vcpu->apic_base; | ||
1352 | |||
1353 | memcpy(sregs->interrupt_bitmap, vcpu->irq_pending, | ||
1354 | sizeof sregs->interrupt_bitmap); | ||
1355 | |||
1356 | vcpu_put(vcpu); | ||
1357 | |||
1358 | return 0; | ||
1359 | } | ||
1360 | |||
1361 | static void set_segment(struct kvm_vcpu *vcpu, | ||
1362 | struct kvm_segment *var, int seg) | ||
1363 | { | ||
1364 | return kvm_arch_ops->set_segment(vcpu, var, seg); | ||
1365 | } | ||
1366 | |||
1367 | static int kvm_dev_ioctl_set_sregs(struct kvm *kvm, struct kvm_sregs *sregs) | ||
1368 | { | ||
1369 | struct kvm_vcpu *vcpu; | ||
1370 | int mmu_reset_needed = 0; | ||
1371 | int i; | ||
1372 | struct descriptor_table dt; | ||
1373 | |||
1374 | if (sregs->vcpu < 0 || sregs->vcpu >= KVM_MAX_VCPUS) | ||
1375 | return -EINVAL; | ||
1376 | vcpu = vcpu_load(kvm, sregs->vcpu); | ||
1377 | if (!vcpu) | ||
1378 | return -ENOENT; | ||
1379 | |||
1380 | set_segment(vcpu, &sregs->cs, VCPU_SREG_CS); | ||
1381 | set_segment(vcpu, &sregs->ds, VCPU_SREG_DS); | ||
1382 | set_segment(vcpu, &sregs->es, VCPU_SREG_ES); | ||
1383 | set_segment(vcpu, &sregs->fs, VCPU_SREG_FS); | ||
1384 | set_segment(vcpu, &sregs->gs, VCPU_SREG_GS); | ||
1385 | set_segment(vcpu, &sregs->ss, VCPU_SREG_SS); | ||
1386 | |||
1387 | set_segment(vcpu, &sregs->tr, VCPU_SREG_TR); | ||
1388 | set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); | ||
1389 | |||
1390 | dt.limit = sregs->idt.limit; | ||
1391 | dt.base = sregs->idt.base; | ||
1392 | kvm_arch_ops->set_idt(vcpu, &dt); | ||
1393 | dt.limit = sregs->gdt.limit; | ||
1394 | dt.base = sregs->gdt.base; | ||
1395 | kvm_arch_ops->set_gdt(vcpu, &dt); | ||
1396 | |||
1397 | vcpu->cr2 = sregs->cr2; | ||
1398 | mmu_reset_needed |= vcpu->cr3 != sregs->cr3; | ||
1399 | vcpu->cr3 = sregs->cr3; | ||
1400 | |||
1401 | vcpu->cr8 = sregs->cr8; | ||
1402 | |||
1403 | mmu_reset_needed |= vcpu->shadow_efer != sregs->efer; | ||
1404 | #ifdef __x86_64__ | ||
1405 | kvm_arch_ops->set_efer(vcpu, sregs->efer); | ||
1406 | #endif | ||
1407 | vcpu->apic_base = sregs->apic_base; | ||
1408 | |||
1409 | mmu_reset_needed |= vcpu->cr0 != sregs->cr0; | ||
1410 | kvm_arch_ops->set_cr0_no_modeswitch(vcpu, sregs->cr0); | ||
1411 | |||
1412 | mmu_reset_needed |= vcpu->cr4 != sregs->cr4; | ||
1413 | kvm_arch_ops->set_cr4(vcpu, sregs->cr4); | ||
1414 | |||
1415 | if (mmu_reset_needed) | ||
1416 | kvm_mmu_reset_context(vcpu); | ||
1417 | |||
1418 | memcpy(vcpu->irq_pending, sregs->interrupt_bitmap, | ||
1419 | sizeof vcpu->irq_pending); | ||
1420 | vcpu->irq_summary = 0; | ||
1421 | for (i = 0; i < NR_IRQ_WORDS; ++i) | ||
1422 | if (vcpu->irq_pending[i]) | ||
1423 | __set_bit(i, &vcpu->irq_summary); | ||
1424 | |||
1425 | vcpu_put(vcpu); | ||
1426 | |||
1427 | return 0; | ||
1428 | } | ||
1429 | |||
1430 | /* | ||
1431 | * List of msr numbers which we expose to userspace through KVM_GET_MSRS, | ||
1432 | * KVM_SET_MSRS and KVM_GET_MSR_INDEX_LIST. | ||
1433 | */ | ||
1434 | static u32 msrs_to_save[] = { | ||
1435 | MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, | ||
1436 | MSR_K6_STAR, | ||
1437 | #ifdef __x86_64__ | ||
1438 | MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, | ||
1439 | #endif | ||
1440 | MSR_IA32_TIME_STAMP_COUNTER, | ||
1441 | }; | ||
1442 | |||
1443 | |||
1444 | /* | ||
1445 | * Adapt set_msr() to msr_io()'s calling convention | ||
1446 | */ | ||
1447 | static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) | ||
1448 | { | ||
1449 | return set_msr(vcpu, index, *data); | ||
1450 | } | ||
1451 | |||
1452 | /* | ||
1453 | * Read or write a bunch of msrs. All parameters are kernel addresses. | ||
1454 | * | ||
1455 | * @return number of msrs set successfully. | ||
1456 | */ | ||
1457 | static int __msr_io(struct kvm *kvm, struct kvm_msrs *msrs, | ||
1458 | struct kvm_msr_entry *entries, | ||
1459 | int (*do_msr)(struct kvm_vcpu *vcpu, | ||
1460 | unsigned index, u64 *data)) | ||
1461 | { | ||
1462 | struct kvm_vcpu *vcpu; | ||
1463 | int i; | ||
1464 | |||
1465 | if (msrs->vcpu < 0 || msrs->vcpu >= KVM_MAX_VCPUS) | ||
1466 | return -EINVAL; | ||
1467 | |||
1468 | vcpu = vcpu_load(kvm, msrs->vcpu); | ||
1469 | if (!vcpu) | ||
1470 | return -ENOENT; | ||
1471 | |||
1472 | for (i = 0; i < msrs->nmsrs; ++i) | ||
1473 | if (do_msr(vcpu, entries[i].index, &entries[i].data)) | ||
1474 | break; | ||
1475 | |||
1476 | vcpu_put(vcpu); | ||
1477 | |||
1478 | return i; | ||
1479 | } | ||
1480 | |||
1481 | /* | ||
1482 | * Read or write a bunch of msrs. Parameters are user addresses. | ||
1483 | * | ||
1484 | * @return number of msrs set successfully. | ||
1485 | */ | ||
1486 | static int msr_io(struct kvm *kvm, struct kvm_msrs __user *user_msrs, | ||
1487 | int (*do_msr)(struct kvm_vcpu *vcpu, | ||
1488 | unsigned index, u64 *data), | ||
1489 | int writeback) | ||
1490 | { | ||
1491 | struct kvm_msrs msrs; | ||
1492 | struct kvm_msr_entry *entries; | ||
1493 | int r, n; | ||
1494 | unsigned size; | ||
1495 | |||
1496 | r = -EFAULT; | ||
1497 | if (copy_from_user(&msrs, user_msrs, sizeof msrs)) | ||
1498 | goto out; | ||
1499 | |||
1500 | r = -E2BIG; | ||
1501 | if (msrs.nmsrs >= MAX_IO_MSRS) | ||
1502 | goto out; | ||
1503 | |||
1504 | r = -ENOMEM; | ||
1505 | size = sizeof(struct kvm_msr_entry) * msrs.nmsrs; | ||
1506 | entries = vmalloc(size); | ||
1507 | if (!entries) | ||
1508 | goto out; | ||
1509 | |||
1510 | r = -EFAULT; | ||
1511 | if (copy_from_user(entries, user_msrs->entries, size)) | ||
1512 | goto out_free; | ||
1513 | |||
1514 | r = n = __msr_io(kvm, &msrs, entries, do_msr); | ||
1515 | if (r < 0) | ||
1516 | goto out_free; | ||
1517 | |||
1518 | r = -EFAULT; | ||
1519 | if (writeback && copy_to_user(user_msrs->entries, entries, size)) | ||
1520 | goto out_free; | ||
1521 | |||
1522 | r = n; | ||
1523 | |||
1524 | out_free: | ||
1525 | vfree(entries); | ||
1526 | out: | ||
1527 | return r; | ||
1528 | } | ||
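
Seen from userspace, KVM_GET_MSRS and KVM_SET_MSRS take a kvm_msrs header followed directly by nmsrs kvm_msr_entry records, and the ioctl returns how many entries were processed before the first failure. A rough usage sketch; the exact struct layout is an assumption based on the field names used above, and error handling is minimal:

#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Illustrative only: read the guest TSC of vcpu 0 through /dev/kvm. */
int read_guest_tsc(int kvm_fd, unsigned long long *tsc)
{
	struct kvm_msrs *msrs;
	int n;

	msrs = calloc(1, sizeof(*msrs) + sizeof(struct kvm_msr_entry));
	if (!msrs)
		return -1;
	msrs->vcpu = 0;
	msrs->nmsrs = 1;
	msrs->entries[0].index = 0x10;	/* IA32_TIME_STAMP_COUNTER */

	n = ioctl(kvm_fd, KVM_GET_MSRS, msrs);	/* returns entries processed */
	if (n == 1)
		*tsc = msrs->entries[0].data;
	free(msrs);
	return n == 1 ? 0 : -1;
}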
1529 | |||
1530 | /* | ||
1531 | * Translate a guest virtual address to a guest physical address. | ||
1532 | */ | ||
1533 | static int kvm_dev_ioctl_translate(struct kvm *kvm, struct kvm_translation *tr) | ||
1534 | { | ||
1535 | unsigned long vaddr = tr->linear_address; | ||
1536 | struct kvm_vcpu *vcpu; | ||
1537 | gpa_t gpa; | ||
1538 | |||
1539 | vcpu = vcpu_load(kvm, tr->vcpu); | ||
1540 | if (!vcpu) | ||
1541 | return -ENOENT; | ||
1542 | spin_lock(&kvm->lock); | ||
1543 | gpa = vcpu->mmu.gva_to_gpa(vcpu, vaddr); | ||
1544 | tr->physical_address = gpa; | ||
1545 | tr->valid = gpa != UNMAPPED_GVA; | ||
1546 | tr->writeable = 1; | ||
1547 | tr->usermode = 0; | ||
1548 | spin_unlock(&kvm->lock); | ||
1549 | vcpu_put(vcpu); | ||
1550 | |||
1551 | return 0; | ||
1552 | } | ||
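
The same vcpu-indexed pattern applies to KVM_TRANSLATE: userspace fills in the vcpu and linear_address fields and reads back physical_address and valid. A small sketch, hedged in the same way as the MSR example above:

#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Illustrative only: ask KVM to translate a guest-virtual address. */
int guest_gva_to_gpa(int kvm_fd, int vcpu, unsigned long gva,
		     unsigned long long *gpa)
{
	struct kvm_translation tr;

	memset(&tr, 0, sizeof(tr));
	tr.vcpu = vcpu;
	tr.linear_address = gva;
	if (ioctl(kvm_fd, KVM_TRANSLATE, &tr) < 0 || !tr.valid)
		return -1;
	*gpa = tr.physical_address;
	return 0;
}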
1553 | |||
1554 | static int kvm_dev_ioctl_interrupt(struct kvm *kvm, struct kvm_interrupt *irq) | ||
1555 | { | ||
1556 | struct kvm_vcpu *vcpu; | ||
1557 | |||
1558 | if (irq->vcpu < 0 || irq->vcpu >= KVM_MAX_VCPUS) | ||
1559 | return -EINVAL; | ||
1560 | if (irq->irq < 0 || irq->irq >= 256) | ||
1561 | return -EINVAL; | ||
1562 | vcpu = vcpu_load(kvm, irq->vcpu); | ||
1563 | if (!vcpu) | ||
1564 | return -ENOENT; | ||
1565 | |||
1566 | set_bit(irq->irq, vcpu->irq_pending); | ||
1567 | set_bit(irq->irq / BITS_PER_LONG, &vcpu->irq_summary); | ||
1568 | |||
1569 | vcpu_put(vcpu); | ||
1570 | |||
1571 | return 0; | ||
1572 | } | ||
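
The pending-interrupt state is a two-level bitmap: irq_pending holds one bit per vector and irq_summary holds one bit per irq_pending word, the same invariant the set_sregs path above rebuilds. A sketch of how an injector could pick the highest pending vector using only these two fields (the real selection logic lives in the vendor modules, not here):

/*
 * Illustrative sketch, not in the original source: find the highest
 * pending vector.  A clear bit in irq_summary guarantees the
 * corresponding irq_pending word is empty, so whole words can be skipped.
 */
static int highest_pending_irq(struct kvm_vcpu *vcpu)
{
	int word, bit;

	for (word = NR_IRQ_WORDS - 1; word >= 0; --word) {
		if (!test_bit(word, &vcpu->irq_summary))
			continue;
		for (bit = BITS_PER_LONG - 1; bit >= 0; --bit)
			if (test_bit(bit, &vcpu->irq_pending[word]))
				return word * BITS_PER_LONG + bit;
	}
	return -1;
}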
1573 | |||
1574 | static int kvm_dev_ioctl_debug_guest(struct kvm *kvm, | ||
1575 | struct kvm_debug_guest *dbg) | ||
1576 | { | ||
1577 | struct kvm_vcpu *vcpu; | ||
1578 | int r; | ||
1579 | |||
1580 | if (dbg->vcpu < 0 || dbg->vcpu >= KVM_MAX_VCPUS) | ||
1581 | return -EINVAL; | ||
1582 | vcpu = vcpu_load(kvm, dbg->vcpu); | ||
1583 | if (!vcpu) | ||
1584 | return -ENOENT; | ||
1585 | |||
1586 | r = kvm_arch_ops->set_guest_debug(vcpu, dbg); | ||
1587 | |||
1588 | vcpu_put(vcpu); | ||
1589 | |||
1590 | return r; | ||
1591 | } | ||
1592 | |||
1593 | static long kvm_dev_ioctl(struct file *filp, | ||
1594 | unsigned int ioctl, unsigned long arg) | ||
1595 | { | ||
1596 | struct kvm *kvm = filp->private_data; | ||
1597 | int r = -EINVAL; | ||
1598 | |||
1599 | switch (ioctl) { | ||
1600 | case KVM_CREATE_VCPU: { | ||
1601 | r = kvm_dev_ioctl_create_vcpu(kvm, arg); | ||
1602 | if (r) | ||
1603 | goto out; | ||
1604 | break; | ||
1605 | } | ||
1606 | case KVM_RUN: { | ||
1607 | struct kvm_run kvm_run; | ||
1608 | |||
1609 | r = -EFAULT; | ||
1610 | if (copy_from_user(&kvm_run, (void *)arg, sizeof kvm_run)) | ||
1611 | goto out; | ||
1612 | r = kvm_dev_ioctl_run(kvm, &kvm_run); | ||
1613 | if (r < 0) | ||
1614 | goto out; | ||
1615 | r = -EFAULT; | ||
1616 | if (copy_to_user((void *)arg, &kvm_run, sizeof kvm_run)) | ||
1617 | goto out; | ||
1618 | r = 0; | ||
1619 | break; | ||
1620 | } | ||
1621 | case KVM_GET_REGS: { | ||
1622 | struct kvm_regs kvm_regs; | ||
1623 | |||
1624 | r = -EFAULT; | ||
1625 | if (copy_from_user(&kvm_regs, (void *)arg, sizeof kvm_regs)) | ||
1626 | goto out; | ||
1627 | r = kvm_dev_ioctl_get_regs(kvm, &kvm_regs); | ||
1628 | if (r) | ||
1629 | goto out; | ||
1630 | r = -EFAULT; | ||
1631 | if (copy_to_user((void *)arg, &kvm_regs, sizeof kvm_regs)) | ||
1632 | goto out; | ||
1633 | r = 0; | ||
1634 | break; | ||
1635 | } | ||
1636 | case KVM_SET_REGS: { | ||
1637 | struct kvm_regs kvm_regs; | ||
1638 | |||
1639 | r = -EFAULT; | ||
1640 | if (copy_from_user(&kvm_regs, (void *)arg, sizeof kvm_regs)) | ||
1641 | goto out; | ||
1642 | r = kvm_dev_ioctl_set_regs(kvm, &kvm_regs); | ||
1643 | if (r) | ||
1644 | goto out; | ||
1645 | r = 0; | ||
1646 | break; | ||
1647 | } | ||
1648 | case KVM_GET_SREGS: { | ||
1649 | struct kvm_sregs kvm_sregs; | ||
1650 | |||
1651 | r = -EFAULT; | ||
1652 | if (copy_from_user(&kvm_sregs, (void *)arg, sizeof kvm_sregs)) | ||
1653 | goto out; | ||
1654 | r = kvm_dev_ioctl_get_sregs(kvm, &kvm_sregs); | ||
1655 | if (r) | ||
1656 | goto out; | ||
1657 | r = -EFAULT; | ||
1658 | if (copy_to_user((void *)arg, &kvm_sregs, sizeof kvm_sregs)) | ||
1659 | goto out; | ||
1660 | r = 0; | ||
1661 | break; | ||
1662 | } | ||
1663 | case KVM_SET_SREGS: { | ||
1664 | struct kvm_sregs kvm_sregs; | ||
1665 | |||
1666 | r = -EFAULT; | ||
1667 | if (copy_from_user(&kvm_sregs, (void *)arg, sizeof kvm_sregs)) | ||
1668 | goto out; | ||
1669 | r = kvm_dev_ioctl_set_sregs(kvm, &kvm_sregs); | ||
1670 | if (r) | ||
1671 | goto out; | ||
1672 | r = 0; | ||
1673 | break; | ||
1674 | } | ||
1675 | case KVM_TRANSLATE: { | ||
1676 | struct kvm_translation tr; | ||
1677 | |||
1678 | r = -EFAULT; | ||
1679 | if (copy_from_user(&tr, (void *)arg, sizeof tr)) | ||
1680 | goto out; | ||
1681 | r = kvm_dev_ioctl_translate(kvm, &tr); | ||
1682 | if (r) | ||
1683 | goto out; | ||
1684 | r = -EFAULT; | ||
1685 | if (copy_to_user((void *)arg, &tr, sizeof tr)) | ||
1686 | goto out; | ||
1687 | r = 0; | ||
1688 | break; | ||
1689 | } | ||
1690 | case KVM_INTERRUPT: { | ||
1691 | struct kvm_interrupt irq; | ||
1692 | |||
1693 | r = -EFAULT; | ||
1694 | if (copy_from_user(&irq, (void *)arg, sizeof irq)) | ||
1695 | goto out; | ||
1696 | r = kvm_dev_ioctl_interrupt(kvm, &irq); | ||
1697 | if (r) | ||
1698 | goto out; | ||
1699 | r = 0; | ||
1700 | break; | ||
1701 | } | ||
1702 | case KVM_DEBUG_GUEST: { | ||
1703 | struct kvm_debug_guest dbg; | ||
1704 | |||
1705 | r = -EFAULT; | ||
1706 | if (copy_from_user(&dbg, (void *)arg, sizeof dbg)) | ||
1707 | goto out; | ||
1708 | r = kvm_dev_ioctl_debug_guest(kvm, &dbg); | ||
1709 | if (r) | ||
1710 | goto out; | ||
1711 | r = 0; | ||
1712 | break; | ||
1713 | } | ||
1714 | case KVM_SET_MEMORY_REGION: { | ||
1715 | struct kvm_memory_region kvm_mem; | ||
1716 | |||
1717 | r = -EFAULT; | ||
1718 | if (copy_from_user(&kvm_mem, (void *)arg, sizeof kvm_mem)) | ||
1719 | goto out; | ||
1720 | r = kvm_dev_ioctl_set_memory_region(kvm, &kvm_mem); | ||
1721 | if (r) | ||
1722 | goto out; | ||
1723 | break; | ||
1724 | } | ||
1725 | case KVM_GET_DIRTY_LOG: { | ||
1726 | struct kvm_dirty_log log; | ||
1727 | |||
1728 | r = -EFAULT; | ||
1729 | if (copy_from_user(&log, (void *)arg, sizeof log)) | ||
1730 | goto out; | ||
1731 | r = kvm_dev_ioctl_get_dirty_log(kvm, &log); | ||
1732 | if (r) | ||
1733 | goto out; | ||
1734 | break; | ||
1735 | } | ||
1736 | case KVM_GET_MSRS: | ||
1737 | r = msr_io(kvm, (void __user *)arg, get_msr, 1); | ||
1738 | break; | ||
1739 | case KVM_SET_MSRS: | ||
1740 | r = msr_io(kvm, (void __user *)arg, do_set_msr, 0); | ||
1741 | break; | ||
1742 | case KVM_GET_MSR_INDEX_LIST: { | ||
1743 | struct kvm_msr_list __user *user_msr_list = (void __user *)arg; | ||
1744 | struct kvm_msr_list msr_list; | ||
1745 | unsigned n; | ||
1746 | |||
1747 | r = -EFAULT; | ||
1748 | if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list)) | ||
1749 | goto out; | ||
1750 | n = msr_list.nmsrs; | ||
1751 | msr_list.nmsrs = ARRAY_SIZE(msrs_to_save); | ||
1752 | if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list)) | ||
1753 | goto out; | ||
1754 | r = -E2BIG; | ||
1755 | if (n < ARRAY_SIZE(msrs_to_save)) | ||
1756 | goto out; | ||
1757 | r = -EFAULT; | ||
1758 | if (copy_to_user(user_msr_list->indices, &msrs_to_save, | ||
1759 | sizeof msrs_to_save)) | ||
1760 | goto out; | ||
1761 | r = 0; | ||
1762 | } | ||
1763 | default: | ||
1764 | ; | ||
1765 | } | ||
1766 | out: | ||
1767 | return r; | ||
1768 | } | ||
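
All of these commands are issued on the one /dev/kvm file descriptor; this early interface has no separate per-VM or per-vcpu fds. A minimal userspace run loop, assuming guest memory, registers and segment state have already been configured through the ioctls above (illustrative only; a real monitor must decode each exit before re-entering):

#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Illustrative only: create vcpu 0 and re-enter it until an error occurs. */
int run_vcpu0(int kvm_fd)
{
	struct kvm_run run;

	if (ioctl(kvm_fd, KVM_CREATE_VCPU, 0) < 0)
		return -1;

	memset(&run, 0, sizeof(run));
	run.vcpu = 0;
	for (;;) {
		if (ioctl(kvm_fd, KVM_RUN, &run) < 0)
			return -1;
		/*
		 * On an MMIO read exit, place the device's data in
		 * run.mmio.data and set run.mmio_completed before the
		 * next KVM_RUN, as kvm_dev_ioctl_run() expects.
		 */
	}
}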
1769 | |||
1770 | static struct page *kvm_dev_nopage(struct vm_area_struct *vma, | ||
1771 | unsigned long address, | ||
1772 | int *type) | ||
1773 | { | ||
1774 | struct kvm *kvm = vma->vm_file->private_data; | ||
1775 | unsigned long pgoff; | ||
1776 | struct kvm_memory_slot *slot; | ||
1777 | struct page *page; | ||
1778 | |||
1779 | *type = VM_FAULT_MINOR; | ||
1780 | pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; | ||
1781 | slot = gfn_to_memslot(kvm, pgoff); | ||
1782 | if (!slot) | ||
1783 | return NOPAGE_SIGBUS; | ||
1784 | page = gfn_to_page(slot, pgoff); | ||
1785 | if (!page) | ||
1786 | return NOPAGE_SIGBUS; | ||
1787 | get_page(page); | ||
1788 | return page; | ||
1789 | } | ||
1790 | |||
1791 | static struct vm_operations_struct kvm_dev_vm_ops = { | ||
1792 | .nopage = kvm_dev_nopage, | ||
1793 | }; | ||
1794 | |||
1795 | static int kvm_dev_mmap(struct file *file, struct vm_area_struct *vma) | ||
1796 | { | ||
1797 | vma->vm_ops = &kvm_dev_vm_ops; | ||
1798 | return 0; | ||
1799 | } | ||
1800 | |||
1801 | static struct file_operations kvm_chardev_ops = { | ||
1802 | .open = kvm_dev_open, | ||
1803 | .release = kvm_dev_release, | ||
1804 | .unlocked_ioctl = kvm_dev_ioctl, | ||
1805 | .compat_ioctl = kvm_dev_ioctl, | ||
1806 | .mmap = kvm_dev_mmap, | ||
1807 | }; | ||
1808 | |||
1809 | static struct miscdevice kvm_dev = { | ||
1810 | MISC_DYNAMIC_MINOR, | ||
1811 | "kvm", | ||
1812 | &kvm_chardev_ops, | ||
1813 | }; | ||
1814 | |||
1815 | static int kvm_reboot(struct notifier_block *notifier, unsigned long val, | ||
1816 | void *v) | ||
1817 | { | ||
1818 | if (val == SYS_RESTART) { | ||
1819 | /* | ||
1820 | * Some (well, at least mine) BIOSes hang on reboot if the CPU is | ||
1821 | * left in vmx root mode. | ||
1822 | */ | ||
1823 | printk(KERN_INFO "kvm: exiting hardware virtualization\n"); | ||
1824 | on_each_cpu(kvm_arch_ops->hardware_disable, 0, 0, 1); | ||
1825 | } | ||
1826 | return NOTIFY_OK; | ||
1827 | } | ||
1828 | |||
1829 | static struct notifier_block kvm_reboot_notifier = { | ||
1830 | .notifier_call = kvm_reboot, | ||
1831 | .priority = 0, | ||
1832 | }; | ||
1833 | |||
1834 | static __init void kvm_init_debug(void) | ||
1835 | { | ||
1836 | struct kvm_stats_debugfs_item *p; | ||
1837 | |||
1838 | debugfs_dir = debugfs_create_dir("kvm", 0); | ||
1839 | for (p = debugfs_entries; p->name; ++p) | ||
1840 | p->dentry = debugfs_create_u32(p->name, 0444, debugfs_dir, | ||
1841 | p->data); | ||
1842 | } | ||
1843 | |||
1844 | static void kvm_exit_debug(void) | ||
1845 | { | ||
1846 | struct kvm_stats_debugfs_item *p; | ||
1847 | |||
1848 | for (p = debugfs_entries; p->name; ++p) | ||
1849 | debugfs_remove(p->dentry); | ||
1850 | debugfs_remove(debugfs_dir); | ||
1851 | } | ||
1852 | |||
1853 | hpa_t bad_page_address; | ||
1854 | |||
1855 | int kvm_init_arch(struct kvm_arch_ops *ops, struct module *module) | ||
1856 | { | ||
1857 | int r; | ||
1858 | |||
1859 | kvm_arch_ops = ops; | ||
1860 | |||
1861 | if (!kvm_arch_ops->cpu_has_kvm_support()) { | ||
1862 | printk(KERN_ERR "kvm: no hardware support\n"); | ||
1863 | return -EOPNOTSUPP; | ||
1864 | } | ||
1865 | if (kvm_arch_ops->disabled_by_bios()) { | ||
1866 | printk(KERN_ERR "kvm: disabled by bios\n"); | ||
1867 | return -EOPNOTSUPP; | ||
1868 | } | ||
1869 | |||
1870 | r = kvm_arch_ops->hardware_setup(); | ||
1871 | if (r < 0) | ||
1872 | return r; | ||
1873 | |||
1874 | on_each_cpu(kvm_arch_ops->hardware_enable, 0, 0, 1); | ||
1875 | register_reboot_notifier(&kvm_reboot_notifier); | ||
1876 | |||
1877 | kvm_chardev_ops.owner = module; | ||
1878 | |||
1879 | r = misc_register(&kvm_dev); | ||
1880 | if (r) { | ||
1881 | printk(KERN_ERR "kvm: misc device register failed\n"); | ||
1882 | goto out_free; | ||
1883 | } | ||
1884 | |||
1885 | return r; | ||
1886 | |||
1887 | out_free: | ||
1888 | unregister_reboot_notifier(&kvm_reboot_notifier); | ||
1889 | on_each_cpu(kvm_arch_ops->hardware_disable, 0, 0, 1); | ||
1890 | kvm_arch_ops->hardware_unsetup(); | ||
1891 | return r; | ||
1892 | } | ||
1893 | |||
1894 | void kvm_exit_arch(void) | ||
1895 | { | ||
1896 | misc_deregister(&kvm_dev); | ||
1897 | |||
1898 | unregister_reboot_notifier(&kvm_reboot_notifier); | ||
1899 | on_each_cpu(kvm_arch_ops->hardware_disable, 0, 0, 1); | ||
1900 | kvm_arch_ops->hardware_unsetup(); | ||
1901 | } | ||
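
kvm_init_arch() and kvm_exit_arch() are the entry points the vendor modules are expected to call from their own module init/exit, passing in their kvm_arch_ops table. A sketch of such a module, with a hypothetical ops table standing in for the real VMX/SVM ones:

static struct kvm_arch_ops example_arch_ops = {
	/* .cpu_has_kvm_support, .hardware_setup, .run, ... */
};

static int __init example_kvm_init(void)
{
	return kvm_init_arch(&example_arch_ops, THIS_MODULE);
}

static void __exit example_kvm_exit(void)
{
	kvm_exit_arch();
}

module_init(example_kvm_init);
module_exit(example_kvm_exit);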
1902 | |||
1903 | static __init int kvm_init(void) | ||
1904 | { | ||
1905 | static struct page *bad_page; | ||
1906 | int r = 0; | ||
1907 | |||
1908 | kvm_init_debug(); | ||
1909 | |||
1910 | if ((bad_page = alloc_page(GFP_KERNEL)) == NULL) { | ||
1911 | r = -ENOMEM; | ||
1912 | goto out; | ||
1913 | } | ||
1914 | |||
1915 | bad_page_address = page_to_pfn(bad_page) << PAGE_SHIFT; | ||
1916 | memset(__va(bad_page_address), 0, PAGE_SIZE); | ||
1917 | |||
1918 | return r; | ||
1919 | |||
1920 | out: | ||
1921 | kvm_exit_debug(); | ||
1922 | return r; | ||
1923 | } | ||
1924 | |||
1925 | static __exit void kvm_exit(void) | ||
1926 | { | ||
1927 | kvm_exit_debug(); | ||
1928 | __free_page(pfn_to_page(bad_page_address >> PAGE_SHIFT)); | ||
1929 | } | ||
1930 | |||
1931 | module_init(kvm_init) | ||
1932 | module_exit(kvm_exit) | ||
1933 | |||
1934 | EXPORT_SYMBOL_GPL(kvm_init_arch); | ||
1935 | EXPORT_SYMBOL_GPL(kvm_exit_arch); | ||
diff --git a/drivers/kvm/kvm_svm.h b/drivers/kvm/kvm_svm.h new file mode 100644 index 000000000000..7d7f2aa10960 --- /dev/null +++ b/drivers/kvm/kvm_svm.h | |||
@@ -0,0 +1,44 @@ | |||
1 | #ifndef __KVM_SVM_H | ||
2 | #define __KVM_SVM_H | ||
3 | |||
4 | #include <linux/types.h> | ||
5 | #include <linux/list.h> | ||
6 | #include <asm/msr.h> | ||
7 | |||
8 | #include "svm.h" | ||
9 | #include "kvm.h" | ||
10 | |||
11 | static const u32 host_save_msrs[] = { | ||
12 | #ifdef __x86_64__ | ||
13 | MSR_STAR, MSR_LSTAR, MSR_CSTAR, MSR_SYSCALL_MASK, MSR_KERNEL_GS_BASE, | ||
14 | MSR_FS_BASE, MSR_GS_BASE, | ||
15 | #endif | ||
16 | MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, | ||
17 | MSR_IA32_DEBUGCTLMSR, /* MSR_IA32_LASTBRANCHFROMIP, | ||
18 | MSR_IA32_LASTBRANCHTOIP, MSR_IA32_LASTINTFROMIP, MSR_IA32_LASTINTTOIP, */ | ||
19 | }; | ||
20 | |||
21 | #define NR_HOST_SAVE_MSRS (sizeof(host_save_msrs) / sizeof(*host_save_msrs)) | ||
22 | #define NUM_DB_REGS 4 | ||
23 | |||
24 | struct vcpu_svm { | ||
25 | struct vmcb *vmcb; | ||
26 | unsigned long vmcb_pa; | ||
27 | struct svm_cpu_data *svm_data; | ||
28 | uint64_t asid_generation; | ||
29 | |||
30 | unsigned long cr0; | ||
31 | unsigned long cr4; | ||
32 | unsigned long db_regs[NUM_DB_REGS]; | ||
33 | |||
34 | u64 next_rip; | ||
35 | |||
36 | u64 host_msrs[NR_HOST_SAVE_MSRS]; | ||
37 | unsigned long host_cr2; | ||
38 | unsigned long host_db_regs[NUM_DB_REGS]; | ||
39 | unsigned long host_dr6; | ||
40 | unsigned long host_dr7; | ||
41 | }; | ||
42 | |||
43 | #endif | ||
44 | |||
diff --git a/drivers/kvm/kvm_vmx.h b/drivers/kvm/kvm_vmx.h new file mode 100644 index 000000000000..87e12d2bfa16 --- /dev/null +++ b/drivers/kvm/kvm_vmx.h | |||
@@ -0,0 +1,14 @@ | |||
1 | #ifndef __KVM_VMX_H | ||
2 | #define __KVM_VMX_H | ||
3 | |||
4 | #ifdef __x86_64__ | ||
5 | /* | ||
6 | * Avoid saving/loading MSR_SYSCALL_MASK and MSR_LSTAR via the standard | ||
7 | * VT save/restore mechanism (works around CPU bug AA24). | ||
8 | */ | ||
9 | #define NR_BAD_MSRS 2 | ||
10 | #else | ||
11 | #define NR_BAD_MSRS 0 | ||
12 | #endif | ||
13 | |||
14 | #endif | ||
diff --git a/drivers/kvm/mmu.c b/drivers/kvm/mmu.c new file mode 100644 index 000000000000..4e29d9b7211c --- /dev/null +++ b/drivers/kvm/mmu.c | |||
@@ -0,0 +1,699 @@ | |||
1 | /* | ||
2 | * Kernel-based Virtual Machine driver for Linux | ||
3 | * | ||
4 | * This module enables machines with Intel VT-x extensions to run virtual | ||
5 | * machines without emulation or binary translation. | ||
6 | * | ||
7 | * MMU support | ||
8 | * | ||
9 | * Copyright (C) 2006 Qumranet, Inc. | ||
10 | * | ||
11 | * Authors: | ||
12 | * Yaniv Kamay <yaniv@qumranet.com> | ||
13 | * Avi Kivity <avi@qumranet.com> | ||
14 | * | ||
15 | * This work is licensed under the terms of the GNU GPL, version 2. See | ||
16 | * the COPYING file in the top-level directory. | ||
17 | * | ||
18 | */ | ||
19 | #include <linux/types.h> | ||
20 | #include <linux/string.h> | ||
21 | #include <asm/page.h> | ||
22 | #include <linux/mm.h> | ||
23 | #include <linux/highmem.h> | ||
24 | #include <linux/module.h> | ||
25 | |||
26 | #include "vmx.h" | ||
27 | #include "kvm.h" | ||
28 | |||
29 | #define pgprintk(x...) do { } while (0) | ||
30 | |||
31 | #define ASSERT(x) \ | ||
32 | if (!(x)) { \ | ||
33 | printk(KERN_WARNING "assertion failed %s:%d: %s\n", \ | ||
34 | __FILE__, __LINE__, #x); \ | ||
35 | } | ||
36 | |||
37 | #define PT64_ENT_PER_PAGE 512 | ||
38 | #define PT32_ENT_PER_PAGE 1024 | ||
39 | |||
40 | #define PT_WRITABLE_SHIFT 1 | ||
41 | |||
42 | #define PT_PRESENT_MASK (1ULL << 0) | ||
43 | #define PT_WRITABLE_MASK (1ULL << PT_WRITABLE_SHIFT) | ||
44 | #define PT_USER_MASK (1ULL << 2) | ||
45 | #define PT_PWT_MASK (1ULL << 3) | ||
46 | #define PT_PCD_MASK (1ULL << 4) | ||
47 | #define PT_ACCESSED_MASK (1ULL << 5) | ||
48 | #define PT_DIRTY_MASK (1ULL << 6) | ||
49 | #define PT_PAGE_SIZE_MASK (1ULL << 7) | ||
50 | #define PT_PAT_MASK (1ULL << 7) | ||
51 | #define PT_GLOBAL_MASK (1ULL << 8) | ||
52 | #define PT64_NX_MASK (1ULL << 63) | ||
53 | |||
54 | #define PT_PAT_SHIFT 7 | ||
55 | #define PT_DIR_PAT_SHIFT 12 | ||
56 | #define PT_DIR_PAT_MASK (1ULL << PT_DIR_PAT_SHIFT) | ||
57 | |||
58 | #define PT32_DIR_PSE36_SIZE 4 | ||
59 | #define PT32_DIR_PSE36_SHIFT 13 | ||
60 | #define PT32_DIR_PSE36_MASK (((1ULL << PT32_DIR_PSE36_SIZE) - 1) << PT32_DIR_PSE36_SHIFT) | ||
61 | |||
62 | |||
63 | #define PT32_PTE_COPY_MASK \ | ||
64 | (PT_PRESENT_MASK | PT_PWT_MASK | PT_PCD_MASK | \ | ||
65 | PT_ACCESSED_MASK | PT_DIRTY_MASK | PT_PAT_MASK | \ | ||
66 | PT_GLOBAL_MASK) | ||
67 | |||
68 | #define PT32_NON_PTE_COPY_MASK \ | ||
69 | (PT_PRESENT_MASK | PT_PWT_MASK | PT_PCD_MASK | \ | ||
70 | PT_ACCESSED_MASK | PT_DIRTY_MASK) | ||
71 | |||
72 | |||
73 | #define PT64_PTE_COPY_MASK \ | ||
74 | (PT64_NX_MASK | PT32_PTE_COPY_MASK) | ||
75 | |||
76 | #define PT64_NON_PTE_COPY_MASK \ | ||
77 | (PT64_NX_MASK | PT32_NON_PTE_COPY_MASK) | ||
78 | |||
79 | |||
80 | |||
81 | #define PT_FIRST_AVAIL_BITS_SHIFT 9 | ||
82 | #define PT64_SECOND_AVAIL_BITS_SHIFT 52 | ||
83 | |||
84 | #define PT_SHADOW_PS_MARK (1ULL << PT_FIRST_AVAIL_BITS_SHIFT) | ||
85 | #define PT_SHADOW_IO_MARK (1ULL << PT_FIRST_AVAIL_BITS_SHIFT) | ||
86 | |||
87 | #define PT_SHADOW_WRITABLE_SHIFT (PT_FIRST_AVAIL_BITS_SHIFT + 1) | ||
88 | #define PT_SHADOW_WRITABLE_MASK (1ULL << PT_SHADOW_WRITABLE_SHIFT) | ||
89 | |||
90 | #define PT_SHADOW_USER_SHIFT (PT_SHADOW_WRITABLE_SHIFT + 1) | ||
91 | #define PT_SHADOW_USER_MASK (1ULL << (PT_SHADOW_USER_SHIFT)) | ||
92 | |||
93 | #define PT_SHADOW_BITS_OFFSET (PT_SHADOW_WRITABLE_SHIFT - PT_WRITABLE_SHIFT) | ||
94 | |||
95 | #define VALID_PAGE(x) ((x) != INVALID_PAGE) | ||
96 | |||
97 | #define PT64_LEVEL_BITS 9 | ||
98 | |||
99 | #define PT64_LEVEL_SHIFT(level) \ | ||
100 | ( PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS ) | ||
101 | |||
102 | #define PT64_LEVEL_MASK(level) \ | ||
103 | (((1ULL << PT64_LEVEL_BITS) - 1) << PT64_LEVEL_SHIFT(level)) | ||
104 | |||
105 | #define PT64_INDEX(address, level)\ | ||
106 | (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1)) | ||
107 | |||
108 | |||
109 | #define PT32_LEVEL_BITS 10 | ||
110 | |||
111 | #define PT32_LEVEL_SHIFT(level) \ | ||
112 | ( PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS ) | ||
113 | |||
114 | #define PT32_LEVEL_MASK(level) \ | ||
115 | (((1ULL << PT32_LEVEL_BITS) - 1) << PT32_LEVEL_SHIFT(level)) | ||
116 | |||
117 | #define PT32_INDEX(address, level)\ | ||
118 | (((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1)) | ||
119 | |||
120 | |||
121 | #define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & PAGE_MASK) | ||
122 | #define PT64_DIR_BASE_ADDR_MASK \ | ||
123 | (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + PT64_LEVEL_BITS)) - 1)) | ||
124 | |||
125 | #define PT32_BASE_ADDR_MASK PAGE_MASK | ||
126 | #define PT32_DIR_BASE_ADDR_MASK \ | ||
127 | (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1)) | ||
128 | |||
129 | |||
130 | #define PFERR_PRESENT_MASK (1U << 0) | ||
131 | #define PFERR_WRITE_MASK (1U << 1) | ||
132 | #define PFERR_USER_MASK (1U << 2) | ||
133 | |||
134 | #define PT64_ROOT_LEVEL 4 | ||
135 | #define PT32_ROOT_LEVEL 2 | ||
136 | #define PT32E_ROOT_LEVEL 3 | ||
137 | |||
138 | #define PT_DIRECTORY_LEVEL 2 | ||
139 | #define PT_PAGE_TABLE_LEVEL 1 | ||
140 | |||
141 | static int is_write_protection(struct kvm_vcpu *vcpu) | ||
142 | { | ||
143 | return vcpu->cr0 & CR0_WP_MASK; | ||
144 | } | ||
145 | |||
146 | static int is_cpuid_PSE36(void) | ||
147 | { | ||
148 | return 1; | ||
149 | } | ||
150 | |||
151 | static int is_present_pte(unsigned long pte) | ||
152 | { | ||
153 | return pte & PT_PRESENT_MASK; | ||
154 | } | ||
155 | |||
156 | static int is_writeble_pte(unsigned long pte) | ||
157 | { | ||
158 | return pte & PT_WRITABLE_MASK; | ||
159 | } | ||
160 | |||
161 | static int is_io_pte(unsigned long pte) | ||
162 | { | ||
163 | return pte & PT_SHADOW_IO_MARK; | ||
164 | } | ||
165 | |||
166 | static void kvm_mmu_free_page(struct kvm_vcpu *vcpu, hpa_t page_hpa) | ||
167 | { | ||
168 | struct kvm_mmu_page *page_head = page_header(page_hpa); | ||
169 | |||
170 | list_del(&page_head->link); | ||
171 | page_head->page_hpa = page_hpa; | ||
172 | list_add(&page_head->link, &vcpu->free_pages); | ||
173 | } | ||
174 | |||
175 | static int is_empty_shadow_page(hpa_t page_hpa) | ||
176 | { | ||
177 | u32 *pos; | ||
178 | u32 *end; | ||
179 | for (pos = __va(page_hpa), end = pos + PAGE_SIZE / sizeof(u32); | ||
180 | pos != end; pos++) | ||
181 | if (*pos != 0) | ||
182 | return 0; | ||
183 | return 1; | ||
184 | } | ||
185 | |||
186 | static hpa_t kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, u64 *parent_pte) | ||
187 | { | ||
188 | struct kvm_mmu_page *page; | ||
189 | |||
190 | if (list_empty(&vcpu->free_pages)) | ||
191 | return INVALID_PAGE; | ||
192 | |||
193 | page = list_entry(vcpu->free_pages.next, struct kvm_mmu_page, link); | ||
194 | list_del(&page->link); | ||
195 | list_add(&page->link, &vcpu->kvm->active_mmu_pages); | ||
196 | ASSERT(is_empty_shadow_page(page->page_hpa)); | ||
197 | page->slot_bitmap = 0; | ||
198 | page->global = 1; | ||
199 | page->parent_pte = parent_pte; | ||
200 | return page->page_hpa; | ||
201 | } | ||
202 | |||
203 | static void page_header_update_slot(struct kvm *kvm, void *pte, gpa_t gpa) | ||
204 | { | ||
205 | int slot = memslot_id(kvm, gfn_to_memslot(kvm, gpa >> PAGE_SHIFT)); | ||
206 | struct kvm_mmu_page *page_head = page_header(__pa(pte)); | ||
207 | |||
208 | __set_bit(slot, &page_head->slot_bitmap); | ||
209 | } | ||
210 | |||
211 | hpa_t safe_gpa_to_hpa(struct kvm_vcpu *vcpu, gpa_t gpa) | ||
212 | { | ||
213 | hpa_t hpa = gpa_to_hpa(vcpu, gpa); | ||
214 | |||
215 | return is_error_hpa(hpa) ? bad_page_address | (gpa & ~PAGE_MASK) : hpa; | ||
216 | } | ||
217 | |||
218 | hpa_t gpa_to_hpa(struct kvm_vcpu *vcpu, gpa_t gpa) | ||
219 | { | ||
220 | struct kvm_memory_slot *slot; | ||
221 | struct page *page; | ||
222 | |||
223 | ASSERT((gpa & HPA_ERR_MASK) == 0); | ||
224 | slot = gfn_to_memslot(vcpu->kvm, gpa >> PAGE_SHIFT); | ||
225 | if (!slot) | ||
226 | return gpa | HPA_ERR_MASK; | ||
227 | page = gfn_to_page(slot, gpa >> PAGE_SHIFT); | ||
228 | return ((hpa_t)page_to_pfn(page) << PAGE_SHIFT) | ||
229 | | (gpa & (PAGE_SIZE-1)); | ||
230 | } | ||
231 | |||
232 | hpa_t gva_to_hpa(struct kvm_vcpu *vcpu, gva_t gva) | ||
233 | { | ||
234 | gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, gva); | ||
235 | |||
236 | if (gpa == UNMAPPED_GVA) | ||
237 | return UNMAPPED_GVA; | ||
238 | return gpa_to_hpa(vcpu, gpa); | ||
239 | } | ||
240 | |||
241 | |||
242 | static void release_pt_page_64(struct kvm_vcpu *vcpu, hpa_t page_hpa, | ||
243 | int level) | ||
244 | { | ||
245 | ASSERT(vcpu); | ||
246 | ASSERT(VALID_PAGE(page_hpa)); | ||
247 | ASSERT(level <= PT64_ROOT_LEVEL && level > 0); | ||
248 | |||
249 | if (level == 1) | ||
250 | memset(__va(page_hpa), 0, PAGE_SIZE); | ||
251 | else { | ||
252 | u64 *pos; | ||
253 | u64 *end; | ||
254 | |||
255 | for (pos = __va(page_hpa), end = pos + PT64_ENT_PER_PAGE; | ||
256 | pos != end; pos++) { | ||
257 | u64 current_ent = *pos; | ||
258 | |||
259 | *pos = 0; | ||
260 | if (is_present_pte(current_ent)) | ||
261 | release_pt_page_64(vcpu, | ||
262 | current_ent & | ||
263 | PT64_BASE_ADDR_MASK, | ||
264 | level - 1); | ||
265 | } | ||
266 | } | ||
267 | kvm_mmu_free_page(vcpu, page_hpa); | ||
268 | } | ||
269 | |||
270 | static void nonpaging_new_cr3(struct kvm_vcpu *vcpu) | ||
271 | { | ||
272 | } | ||
273 | |||
274 | static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, hpa_t p) | ||
275 | { | ||
276 | int level = PT32E_ROOT_LEVEL; | ||
277 | hpa_t table_addr = vcpu->mmu.root_hpa; | ||
278 | |||
279 | for (; ; level--) { | ||
280 | u32 index = PT64_INDEX(v, level); | ||
281 | u64 *table; | ||
282 | |||
283 | ASSERT(VALID_PAGE(table_addr)); | ||
284 | table = __va(table_addr); | ||
285 | |||
286 | if (level == 1) { | ||
287 | mark_page_dirty(vcpu->kvm, v >> PAGE_SHIFT); | ||
288 | page_header_update_slot(vcpu->kvm, table, v); | ||
289 | table[index] = p | PT_PRESENT_MASK | PT_WRITABLE_MASK | | ||
290 | PT_USER_MASK; | ||
291 | return 0; | ||
292 | } | ||
293 | |||
294 | if (table[index] == 0) { | ||
295 | hpa_t new_table = kvm_mmu_alloc_page(vcpu, | ||
296 | &table[index]); | ||
297 | |||
298 | if (!VALID_PAGE(new_table)) { | ||
299 | pgprintk("nonpaging_map: ENOMEM\n"); | ||
300 | return -ENOMEM; | ||
301 | } | ||
302 | |||
303 | if (level == PT32E_ROOT_LEVEL) | ||
304 | table[index] = new_table | PT_PRESENT_MASK; | ||
305 | else | ||
306 | table[index] = new_table | PT_PRESENT_MASK | | ||
307 | PT_WRITABLE_MASK | PT_USER_MASK; | ||
308 | } | ||
309 | table_addr = table[index] & PT64_BASE_ADDR_MASK; | ||
310 | } | ||
311 | } | ||
312 | |||
313 | static void nonpaging_flush(struct kvm_vcpu *vcpu) | ||
314 | { | ||
315 | hpa_t root = vcpu->mmu.root_hpa; | ||
316 | |||
317 | ++kvm_stat.tlb_flush; | ||
318 | pgprintk("nonpaging_flush\n"); | ||
319 | ASSERT(VALID_PAGE(root)); | ||
320 | release_pt_page_64(vcpu, root, vcpu->mmu.shadow_root_level); | ||
321 | root = kvm_mmu_alloc_page(vcpu, NULL); | ||
322 | ASSERT(VALID_PAGE(root)); | ||
323 | vcpu->mmu.root_hpa = root; | ||
324 | if (is_paging(vcpu)) | ||
325 | root |= (vcpu->cr3 & (CR3_PCD_MASK | CR3_WPT_MASK)); | ||
326 | kvm_arch_ops->set_cr3(vcpu, root); | ||
327 | kvm_arch_ops->tlb_flush(vcpu); | ||
328 | } | ||
329 | |||
330 | static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr) | ||
331 | { | ||
332 | return vaddr; | ||
333 | } | ||
334 | |||
335 | static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, | ||
336 | u32 error_code) | ||
337 | { | ||
338 | int ret; | ||
339 | gpa_t addr = gva; | ||
340 | |||
341 | ASSERT(vcpu); | ||
342 | ASSERT(VALID_PAGE(vcpu->mmu.root_hpa)); | ||
343 | |||
344 | for (;;) { | ||
345 | hpa_t paddr; | ||
346 | |||
347 | paddr = gpa_to_hpa(vcpu , addr & PT64_BASE_ADDR_MASK); | ||
348 | |||
349 | if (is_error_hpa(paddr)) | ||
350 | return 1; | ||
351 | |||
352 | ret = nonpaging_map(vcpu, addr & PAGE_MASK, paddr); | ||
353 | if (ret) { | ||
354 | nonpaging_flush(vcpu); | ||
355 | continue; | ||
356 | } | ||
357 | break; | ||
358 | } | ||
359 | return ret; | ||
360 | } | ||
361 | |||
362 | static void nonpaging_inval_page(struct kvm_vcpu *vcpu, gva_t addr) | ||
363 | { | ||
364 | } | ||
365 | |||
366 | static void nonpaging_free(struct kvm_vcpu *vcpu) | ||
367 | { | ||
368 | hpa_t root; | ||
369 | |||
370 | ASSERT(vcpu); | ||
371 | root = vcpu->mmu.root_hpa; | ||
372 | if (VALID_PAGE(root)) | ||
373 | release_pt_page_64(vcpu, root, vcpu->mmu.shadow_root_level); | ||
374 | vcpu->mmu.root_hpa = INVALID_PAGE; | ||
375 | } | ||
376 | |||
377 | static int nonpaging_init_context(struct kvm_vcpu *vcpu) | ||
378 | { | ||
379 | struct kvm_mmu *context = &vcpu->mmu; | ||
380 | |||
381 | context->new_cr3 = nonpaging_new_cr3; | ||
382 | context->page_fault = nonpaging_page_fault; | ||
383 | context->inval_page = nonpaging_inval_page; | ||
384 | context->gva_to_gpa = nonpaging_gva_to_gpa; | ||
385 | context->free = nonpaging_free; | ||
386 | context->root_level = PT32E_ROOT_LEVEL; | ||
387 | context->shadow_root_level = PT32E_ROOT_LEVEL; | ||
388 | context->root_hpa = kvm_mmu_alloc_page(vcpu, NULL); | ||
389 | ASSERT(VALID_PAGE(context->root_hpa)); | ||
390 | kvm_arch_ops->set_cr3(vcpu, context->root_hpa); | ||
391 | return 0; | ||
392 | } | ||
393 | |||
394 | |||
395 | static void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu) | ||
396 | { | ||
397 | struct kvm_mmu_page *page, *npage; | ||
398 | |||
399 | list_for_each_entry_safe(page, npage, &vcpu->kvm->active_mmu_pages, | ||
400 | link) { | ||
401 | if (page->global) | ||
402 | continue; | ||
403 | |||
404 | if (!page->parent_pte) | ||
405 | continue; | ||
406 | |||
407 | *page->parent_pte = 0; | ||
408 | release_pt_page_64(vcpu, page->page_hpa, 1); | ||
409 | } | ||
410 | ++kvm_stat.tlb_flush; | ||
411 | kvm_arch_ops->tlb_flush(vcpu); | ||
412 | } | ||
413 | |||
414 | static void paging_new_cr3(struct kvm_vcpu *vcpu) | ||
415 | { | ||
416 | kvm_mmu_flush_tlb(vcpu); | ||
417 | } | ||
418 | |||
419 | static void mark_pagetable_nonglobal(void *shadow_pte) | ||
420 | { | ||
421 | page_header(__pa(shadow_pte))->global = 0; | ||
422 | } | ||
423 | |||
424 | static inline void set_pte_common(struct kvm_vcpu *vcpu, | ||
425 | u64 *shadow_pte, | ||
426 | gpa_t gaddr, | ||
427 | int dirty, | ||
428 | u64 access_bits) | ||
429 | { | ||
430 | hpa_t paddr; | ||
431 | |||
432 | *shadow_pte |= access_bits << PT_SHADOW_BITS_OFFSET; | ||
433 | if (!dirty) | ||
434 | access_bits &= ~PT_WRITABLE_MASK; | ||
435 | |||
436 | if (access_bits & PT_WRITABLE_MASK) | ||
437 | mark_page_dirty(vcpu->kvm, gaddr >> PAGE_SHIFT); | ||
438 | |||
439 | *shadow_pte |= access_bits; | ||
440 | |||
441 | paddr = gpa_to_hpa(vcpu, gaddr & PT64_BASE_ADDR_MASK); | ||
442 | |||
443 | if (!(*shadow_pte & PT_GLOBAL_MASK)) | ||
444 | mark_pagetable_nonglobal(shadow_pte); | ||
445 | |||
446 | if (is_error_hpa(paddr)) { | ||
447 | *shadow_pte |= gaddr; | ||
448 | *shadow_pte |= PT_SHADOW_IO_MARK; | ||
449 | *shadow_pte &= ~PT_PRESENT_MASK; | ||
450 | } else { | ||
451 | *shadow_pte |= paddr; | ||
452 | page_header_update_slot(vcpu->kvm, shadow_pte, gaddr); | ||
453 | } | ||
454 | } | ||
455 | |||
456 | static void inject_page_fault(struct kvm_vcpu *vcpu, | ||
457 | u64 addr, | ||
458 | u32 err_code) | ||
459 | { | ||
460 | kvm_arch_ops->inject_page_fault(vcpu, addr, err_code); | ||
461 | } | ||
462 | |||
463 | static inline int fix_read_pf(u64 *shadow_ent) | ||
464 | { | ||
465 | if ((*shadow_ent & PT_SHADOW_USER_MASK) && | ||
466 | !(*shadow_ent & PT_USER_MASK)) { | ||
467 | /* | ||
468 | * If supervisor write protect is disabled, we shadow kernel | ||
469 | * pages as user pages so we can trap the write access. | ||
470 | */ | ||
471 | *shadow_ent |= PT_USER_MASK; | ||
472 | *shadow_ent &= ~PT_WRITABLE_MASK; | ||
473 | |||
474 | return 1; | ||
475 | |||
476 | } | ||
477 | return 0; | ||
478 | } | ||
479 | |||
480 | static int may_access(u64 pte, int write, int user) | ||
481 | { | ||
482 | |||
483 | if (user && !(pte & PT_USER_MASK)) | ||
484 | return 0; | ||
485 | if (write && !(pte & PT_WRITABLE_MASK)) | ||
486 | return 0; | ||
487 | return 1; | ||
488 | } | ||
489 | |||
490 | /* | ||
491 | * Remove a shadow pte. | ||
492 | */ | ||
493 | static void paging_inval_page(struct kvm_vcpu *vcpu, gva_t addr) | ||
494 | { | ||
495 | hpa_t page_addr = vcpu->mmu.root_hpa; | ||
496 | int level = vcpu->mmu.shadow_root_level; | ||
497 | |||
498 | ++kvm_stat.invlpg; | ||
499 | |||
500 | for (; ; level--) { | ||
501 | u32 index = PT64_INDEX(addr, level); | ||
502 | u64 *table = __va(page_addr); | ||
503 | |||
504 | if (level == PT_PAGE_TABLE_LEVEL) { | ||
505 | table[index] = 0; | ||
506 | return; | ||
507 | } | ||
508 | |||
509 | if (!is_present_pte(table[index])) | ||
510 | return; | ||
511 | |||
512 | page_addr = table[index] & PT64_BASE_ADDR_MASK; | ||
513 | |||
514 | if (level == PT_DIRECTORY_LEVEL && | ||
515 | (table[index] & PT_SHADOW_PS_MARK)) { | ||
516 | table[index] = 0; | ||
517 | release_pt_page_64(vcpu, page_addr, PT_PAGE_TABLE_LEVEL); | ||
518 | |||
519 | kvm_arch_ops->tlb_flush(vcpu); | ||
520 | return; | ||
521 | } | ||
522 | } | ||
523 | } | ||
524 | |||
525 | static void paging_free(struct kvm_vcpu *vcpu) | ||
526 | { | ||
527 | nonpaging_free(vcpu); | ||
528 | } | ||
529 | |||
530 | #define PTTYPE 64 | ||
531 | #include "paging_tmpl.h" | ||
532 | #undef PTTYPE | ||
533 | |||
534 | #define PTTYPE 32 | ||
535 | #include "paging_tmpl.h" | ||
536 | #undef PTTYPE | ||
537 | |||
538 | static int paging64_init_context(struct kvm_vcpu *vcpu) | ||
539 | { | ||
540 | struct kvm_mmu *context = &vcpu->mmu; | ||
541 | |||
542 | ASSERT(is_pae(vcpu)); | ||
543 | context->new_cr3 = paging_new_cr3; | ||
544 | context->page_fault = paging64_page_fault; | ||
545 | context->inval_page = paging_inval_page; | ||
546 | context->gva_to_gpa = paging64_gva_to_gpa; | ||
547 | context->free = paging_free; | ||
548 | context->root_level = PT64_ROOT_LEVEL; | ||
549 | context->shadow_root_level = PT64_ROOT_LEVEL; | ||
550 | context->root_hpa = kvm_mmu_alloc_page(vcpu, NULL); | ||
551 | ASSERT(VALID_PAGE(context->root_hpa)); | ||
552 | kvm_arch_ops->set_cr3(vcpu, context->root_hpa | | ||
553 | (vcpu->cr3 & (CR3_PCD_MASK | CR3_WPT_MASK))); | ||
554 | return 0; | ||
555 | } | ||
556 | |||
557 | static int paging32_init_context(struct kvm_vcpu *vcpu) | ||
558 | { | ||
559 | struct kvm_mmu *context = &vcpu->mmu; | ||
560 | |||
561 | context->new_cr3 = paging_new_cr3; | ||
562 | context->page_fault = paging32_page_fault; | ||
563 | context->inval_page = paging_inval_page; | ||
564 | context->gva_to_gpa = paging32_gva_to_gpa; | ||
565 | context->free = paging_free; | ||
566 | context->root_level = PT32_ROOT_LEVEL; | ||
567 | context->shadow_root_level = PT32E_ROOT_LEVEL; | ||
568 | context->root_hpa = kvm_mmu_alloc_page(vcpu, NULL); | ||
569 | ASSERT(VALID_PAGE(context->root_hpa)); | ||
570 | kvm_arch_ops->set_cr3(vcpu, context->root_hpa | | ||
571 | (vcpu->cr3 & (CR3_PCD_MASK | CR3_WPT_MASK))); | ||
572 | return 0; | ||
573 | } | ||
574 | |||
575 | static int paging32E_init_context(struct kvm_vcpu *vcpu) | ||
576 | { | ||
577 | int ret; | ||
578 | |||
579 | if ((ret = paging64_init_context(vcpu))) | ||
580 | return ret; | ||
581 | |||
582 | vcpu->mmu.root_level = PT32E_ROOT_LEVEL; | ||
583 | vcpu->mmu.shadow_root_level = PT32E_ROOT_LEVEL; | ||
584 | return 0; | ||
585 | } | ||
586 | |||
587 | static int init_kvm_mmu(struct kvm_vcpu *vcpu) | ||
588 | { | ||
589 | ASSERT(vcpu); | ||
590 | ASSERT(!VALID_PAGE(vcpu->mmu.root_hpa)); | ||
591 | |||
592 | if (!is_paging(vcpu)) | ||
593 | return nonpaging_init_context(vcpu); | ||
594 | else if (kvm_arch_ops->is_long_mode(vcpu)) | ||
595 | return paging64_init_context(vcpu); | ||
596 | else if (is_pae(vcpu)) | ||
597 | return paging32E_init_context(vcpu); | ||
598 | else | ||
599 | return paging32_init_context(vcpu); | ||
600 | } | ||
601 | |||
602 | static void destroy_kvm_mmu(struct kvm_vcpu *vcpu) | ||
603 | { | ||
604 | ASSERT(vcpu); | ||
605 | if (VALID_PAGE(vcpu->mmu.root_hpa)) { | ||
606 | vcpu->mmu.free(vcpu); | ||
607 | vcpu->mmu.root_hpa = INVALID_PAGE; | ||
608 | } | ||
609 | } | ||
610 | |||
611 | int kvm_mmu_reset_context(struct kvm_vcpu *vcpu) | ||
612 | { | ||
613 | destroy_kvm_mmu(vcpu); | ||
614 | return init_kvm_mmu(vcpu); | ||
615 | } | ||
616 | |||
617 | static void free_mmu_pages(struct kvm_vcpu *vcpu) | ||
618 | { | ||
619 | while (!list_empty(&vcpu->free_pages)) { | ||
620 | struct kvm_mmu_page *page; | ||
621 | |||
622 | page = list_entry(vcpu->free_pages.next, | ||
623 | struct kvm_mmu_page, link); | ||
624 | list_del(&page->link); | ||
625 | __free_page(pfn_to_page(page->page_hpa >> PAGE_SHIFT)); | ||
626 | page->page_hpa = INVALID_PAGE; | ||
627 | } | ||
628 | } | ||
629 | |||
630 | static int alloc_mmu_pages(struct kvm_vcpu *vcpu) | ||
631 | { | ||
632 | int i; | ||
633 | |||
634 | ASSERT(vcpu); | ||
635 | |||
636 | for (i = 0; i < KVM_NUM_MMU_PAGES; i++) { | ||
637 | struct page *page; | ||
638 | struct kvm_mmu_page *page_header = &vcpu->page_header_buf[i]; | ||
639 | |||
640 | INIT_LIST_HEAD(&page_header->link); | ||
641 | if ((page = alloc_page(GFP_KVM_MMU)) == NULL) | ||
642 | goto error_1; | ||
643 | page->private = (unsigned long)page_header; | ||
644 | page_header->page_hpa = (hpa_t)page_to_pfn(page) << PAGE_SHIFT; | ||
645 | memset(__va(page_header->page_hpa), 0, PAGE_SIZE); | ||
646 | list_add(&page_header->link, &vcpu->free_pages); | ||
647 | } | ||
648 | return 0; | ||
649 | |||
650 | error_1: | ||
651 | free_mmu_pages(vcpu); | ||
652 | return -ENOMEM; | ||
653 | } | ||
654 | |||
655 | int kvm_mmu_init(struct kvm_vcpu *vcpu) | ||
656 | { | ||
657 | int r; | ||
658 | |||
659 | ASSERT(vcpu); | ||
660 | ASSERT(!VALID_PAGE(vcpu->mmu.root_hpa)); | ||
661 | ASSERT(list_empty(&vcpu->free_pages)); | ||
662 | |||
663 | if ((r = alloc_mmu_pages(vcpu))) | ||
664 | return r; | ||
665 | |||
666 | if ((r = init_kvm_mmu(vcpu))) { | ||
667 | free_mmu_pages(vcpu); | ||
668 | return r; | ||
669 | } | ||
670 | return 0; | ||
671 | } | ||
672 | |||
673 | void kvm_mmu_destroy(struct kvm_vcpu *vcpu) | ||
674 | { | ||
675 | ASSERT(vcpu); | ||
676 | |||
677 | destroy_kvm_mmu(vcpu); | ||
678 | free_mmu_pages(vcpu); | ||
679 | } | ||
680 | |||
681 | void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) | ||
682 | { | ||
683 | struct kvm_mmu_page *page; | ||
684 | |||
685 | list_for_each_entry(page, &kvm->active_mmu_pages, link) { | ||
686 | int i; | ||
687 | u64 *pt; | ||
688 | |||
689 | if (!test_bit(slot, &page->slot_bitmap)) | ||
690 | continue; | ||
691 | |||
692 | pt = __va(page->page_hpa); | ||
693 | for (i = 0; i < PT64_ENT_PER_PAGE; ++i) | ||
694 | /* avoid RMW */ | ||
695 | if (pt[i] & PT_WRITABLE_MASK) | ||
696 | pt[i] &= ~PT_WRITABLE_MASK; | ||
697 | |||
698 | } | ||
699 | } | ||
diff --git a/drivers/kvm/paging_tmpl.h b/drivers/kvm/paging_tmpl.h new file mode 100644 index 000000000000..765c2e1a048e --- /dev/null +++ b/drivers/kvm/paging_tmpl.h | |||
@@ -0,0 +1,397 @@ | |||
1 | /* | ||
2 | * Kernel-based Virtual Machine driver for Linux | ||
3 | * | ||
4 | * This module enables machines with Intel VT-x extensions to run virtual | ||
5 | * machines without emulation or binary translation. | ||
6 | * | ||
7 | * MMU support | ||
8 | * | ||
9 | * Copyright (C) 2006 Qumranet, Inc. | ||
10 | * | ||
11 | * Authors: | ||
12 | * Yaniv Kamay <yaniv@qumranet.com> | ||
13 | * Avi Kivity <avi@qumranet.com> | ||
14 | * | ||
15 | * This work is licensed under the terms of the GNU GPL, version 2. See | ||
16 | * the COPYING file in the top-level directory. | ||
17 | * | ||
18 | */ | ||
19 | |||
20 | /* | ||
21 | * We need the mmu code to access both 32-bit and 64-bit guest ptes, | ||
22 | * so the code in this file is compiled twice, once per pte size. | ||
23 | */ | ||
24 | |||
25 | #if PTTYPE == 64 | ||
26 | #define pt_element_t u64 | ||
27 | #define guest_walker guest_walker64 | ||
28 | #define FNAME(name) paging##64_##name | ||
29 | #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK | ||
30 | #define PT_DIR_BASE_ADDR_MASK PT64_DIR_BASE_ADDR_MASK | ||
31 | #define PT_INDEX(addr, level) PT64_INDEX(addr, level) | ||
32 | #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) | ||
33 | #define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level) | ||
34 | #define PT_PTE_COPY_MASK PT64_PTE_COPY_MASK | ||
35 | #define PT_NON_PTE_COPY_MASK PT64_NON_PTE_COPY_MASK | ||
36 | #elif PTTYPE == 32 | ||
37 | #define pt_element_t u32 | ||
38 | #define guest_walker guest_walker32 | ||
39 | #define FNAME(name) paging##32_##name | ||
40 | #define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK | ||
41 | #define PT_DIR_BASE_ADDR_MASK PT32_DIR_BASE_ADDR_MASK | ||
42 | #define PT_INDEX(addr, level) PT32_INDEX(addr, level) | ||
43 | #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) | ||
44 | #define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level) | ||
45 | #define PT_PTE_COPY_MASK PT32_PTE_COPY_MASK | ||
46 | #define PT_NON_PTE_COPY_MASK PT32_NON_PTE_COPY_MASK | ||
47 | #else | ||
48 | #error Invalid PTTYPE value | ||
49 | #endif | ||
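
The block above is the whole trick: the including file defines PTTYPE, includes this header, and every pt_element_t/FNAME() reference expands for that pte size; mmu.c then repeats the include with the other PTTYPE. The same idiom in miniature, with hypothetical names (toy_tmpl.h is not a real file in this driver):

/* toy_tmpl.h -- compiled once per ITEM_BITS, like this file is per PTTYPE */
#if ITEM_BITS == 64
#define item_t u64
#define FN(name) toy64_##name
#elif ITEM_BITS == 32
#define item_t u32
#define FN(name) toy32_##name
#else
#error Invalid ITEM_BITS value
#endif

static item_t FN(sum)(const item_t *v, int n)
{
	item_t s = 0;
	int i;

	for (i = 0; i < n; ++i)
		s += v[i];
	return s;
}

#undef item_t
#undef FN

/* in the including .c file, mirroring mmu.c's use of paging_tmpl.h: */
#define ITEM_BITS 64
#include "toy_tmpl.h"
#undef ITEM_BITS

#define ITEM_BITS 32
#include "toy_tmpl.h"
#undef ITEM_BITS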
50 | |||
51 | /* | ||
52 | * The guest_walker structure emulates the behavior of the hardware page | ||
53 | * table walker. | ||
54 | */ | ||
55 | struct guest_walker { | ||
56 | int level; | ||
57 | pt_element_t *table; | ||
58 | pt_element_t inherited_ar; | ||
59 | }; | ||
60 | |||
61 | static void FNAME(init_walker)(struct guest_walker *walker, | ||
62 | struct kvm_vcpu *vcpu) | ||
63 | { | ||
64 | hpa_t hpa; | ||
65 | struct kvm_memory_slot *slot; | ||
66 | |||
67 | walker->level = vcpu->mmu.root_level; | ||
68 | slot = gfn_to_memslot(vcpu->kvm, | ||
69 | (vcpu->cr3 & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT); | ||
70 | hpa = safe_gpa_to_hpa(vcpu, vcpu->cr3 & PT64_BASE_ADDR_MASK); | ||
71 | walker->table = kmap_atomic(pfn_to_page(hpa >> PAGE_SHIFT), KM_USER0); | ||
72 | |||
73 | ASSERT((!kvm_arch_ops->is_long_mode(vcpu) && is_pae(vcpu)) || | ||
74 | (vcpu->cr3 & ~(PAGE_MASK | CR3_FLAGS_MASK)) == 0); | ||
75 | |||
76 | walker->table = (pt_element_t *)( (unsigned long)walker->table | | ||
77 | (unsigned long)(vcpu->cr3 & ~(PAGE_MASK | CR3_FLAGS_MASK)) ); | ||
78 | walker->inherited_ar = PT_USER_MASK | PT_WRITABLE_MASK; | ||
79 | } | ||
80 | |||
81 | static void FNAME(release_walker)(struct guest_walker *walker) | ||
82 | { | ||
83 | kunmap_atomic(walker->table, KM_USER0); | ||
84 | } | ||
85 | |||
86 | static void FNAME(set_pte)(struct kvm_vcpu *vcpu, u64 guest_pte, | ||
87 | u64 *shadow_pte, u64 access_bits) | ||
88 | { | ||
89 | ASSERT(*shadow_pte == 0); | ||
90 | access_bits &= guest_pte; | ||
91 | *shadow_pte = (guest_pte & PT_PTE_COPY_MASK); | ||
92 | set_pte_common(vcpu, shadow_pte, guest_pte & PT_BASE_ADDR_MASK, | ||
93 | guest_pte & PT_DIRTY_MASK, access_bits); | ||
94 | } | ||
95 | |||
96 | static void FNAME(set_pde)(struct kvm_vcpu *vcpu, u64 guest_pde, | ||
97 | u64 *shadow_pte, u64 access_bits, | ||
98 | int index) | ||
99 | { | ||
100 | gpa_t gaddr; | ||
101 | |||
102 | ASSERT(*shadow_pte == 0); | ||
103 | access_bits &= guest_pde; | ||
104 | gaddr = (guest_pde & PT_DIR_BASE_ADDR_MASK) + PAGE_SIZE * index; | ||
105 | if (PTTYPE == 32 && is_cpuid_PSE36()) | ||
106 | gaddr |= (guest_pde & PT32_DIR_PSE36_MASK) << | ||
107 | (32 - PT32_DIR_PSE36_SHIFT); | ||
108 | *shadow_pte = (guest_pde & (PT_NON_PTE_COPY_MASK | PT_GLOBAL_MASK)) | | ||
109 | ((guest_pde & PT_DIR_PAT_MASK) >> | ||
110 | (PT_DIR_PAT_SHIFT - PT_PAT_SHIFT)); | ||
111 | set_pte_common(vcpu, shadow_pte, gaddr, | ||
112 | guest_pde & PT_DIRTY_MASK, access_bits); | ||
113 | } | ||
114 | |||
115 | /* | ||
116 | * Fetch a guest pte from a specific level in the paging hierarchy. | ||
117 | */ | ||
118 | static pt_element_t *FNAME(fetch_guest)(struct kvm_vcpu *vcpu, | ||
119 | struct guest_walker *walker, | ||
120 | int level, | ||
121 | gva_t addr) | ||
122 | { | ||
123 | |||
124 | ASSERT(level > 0 && level <= walker->level); | ||
125 | |||
126 | for (;;) { | ||
127 | int index = PT_INDEX(addr, walker->level); | ||
128 | hpa_t paddr; | ||
129 | |||
130 | ASSERT(((unsigned long)walker->table & PAGE_MASK) == | ||
131 | ((unsigned long)&walker->table[index] & PAGE_MASK)); | ||
132 | if (level == walker->level || | ||
133 | !is_present_pte(walker->table[index]) || | ||
134 | (walker->level == PT_DIRECTORY_LEVEL && | ||
135 | (walker->table[index] & PT_PAGE_SIZE_MASK) && | ||
136 | (PTTYPE == 64 || is_pse(vcpu)))) | ||
137 | return &walker->table[index]; | ||
138 | if (walker->level != 3 || kvm_arch_ops->is_long_mode(vcpu)) | ||
139 | walker->inherited_ar &= walker->table[index]; | ||
140 | paddr = safe_gpa_to_hpa(vcpu, walker->table[index] & PT_BASE_ADDR_MASK); | ||
141 | kunmap_atomic(walker->table, KM_USER0); | ||
142 | walker->table = kmap_atomic(pfn_to_page(paddr >> PAGE_SHIFT), | ||
143 | KM_USER0); | ||
144 | --walker->level; | ||
145 | } | ||
146 | } | ||
147 | |||
148 | /* | ||
149 | * Fetch a shadow pte for a specific level in the paging hierarchy. | ||
150 | */ | ||
151 | static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, | ||
152 | struct guest_walker *walker) | ||
153 | { | ||
154 | hpa_t shadow_addr; | ||
155 | int level; | ||
156 | u64 *prev_shadow_ent = NULL; | ||
157 | |||
158 | shadow_addr = vcpu->mmu.root_hpa; | ||
159 | level = vcpu->mmu.shadow_root_level; | ||
160 | |||
161 | for (; ; level--) { | ||
162 | u32 index = SHADOW_PT_INDEX(addr, level); | ||
163 | u64 *shadow_ent = ((u64 *)__va(shadow_addr)) + index; | ||
164 | pt_element_t *guest_ent; | ||
165 | |||
166 | if (is_present_pte(*shadow_ent) || is_io_pte(*shadow_ent)) { | ||
167 | if (level == PT_PAGE_TABLE_LEVEL) | ||
168 | return shadow_ent; | ||
169 | shadow_addr = *shadow_ent & PT64_BASE_ADDR_MASK; | ||
170 | prev_shadow_ent = shadow_ent; | ||
171 | continue; | ||
172 | } | ||
173 | |||
174 | if (PTTYPE == 32 && level > PT32_ROOT_LEVEL) { | ||
175 | ASSERT(level == PT32E_ROOT_LEVEL); | ||
176 | guest_ent = FNAME(fetch_guest)(vcpu, walker, | ||
177 | PT32_ROOT_LEVEL, addr); | ||
178 | } else | ||
179 | guest_ent = FNAME(fetch_guest)(vcpu, walker, | ||
180 | level, addr); | ||
181 | |||
182 | if (!is_present_pte(*guest_ent)) | ||
183 | return NULL; | ||
184 | |||
185 | /* Don't set accessed bit on PAE PDPTRs */ | ||
186 | if (vcpu->mmu.root_level != 3 || walker->level != 3) | ||
187 | *guest_ent |= PT_ACCESSED_MASK; | ||
188 | |||
189 | if (level == PT_PAGE_TABLE_LEVEL) { | ||
190 | |||
191 | if (walker->level == PT_DIRECTORY_LEVEL) { | ||
192 | if (prev_shadow_ent) | ||
193 | *prev_shadow_ent |= PT_SHADOW_PS_MARK; | ||
194 | FNAME(set_pde)(vcpu, *guest_ent, shadow_ent, | ||
195 | walker->inherited_ar, | ||
196 | PT_INDEX(addr, PT_PAGE_TABLE_LEVEL)); | ||
197 | } else { | ||
198 | ASSERT(walker->level == PT_PAGE_TABLE_LEVEL); | ||
199 | FNAME(set_pte)(vcpu, *guest_ent, shadow_ent, walker->inherited_ar); | ||
200 | } | ||
201 | return shadow_ent; | ||
202 | } | ||
203 | |||
204 | shadow_addr = kvm_mmu_alloc_page(vcpu, shadow_ent); | ||
205 | if (!VALID_PAGE(shadow_addr)) | ||
206 | return ERR_PTR(-ENOMEM); | ||
207 | if (!kvm_arch_ops->is_long_mode(vcpu) && level == 3) | ||
208 | *shadow_ent = shadow_addr | | ||
209 | (*guest_ent & (PT_PRESENT_MASK | PT_PWT_MASK | PT_PCD_MASK)); | ||
210 | else { | ||
211 | *shadow_ent = shadow_addr | | ||
212 | (*guest_ent & PT_NON_PTE_COPY_MASK); | ||
213 | *shadow_ent |= (PT_WRITABLE_MASK | PT_USER_MASK); | ||
214 | } | ||
215 | prev_shadow_ent = shadow_ent; | ||
216 | } | ||
217 | } | ||
218 | |||
219 | /* | ||
220 | * The guest faulted for write. We need to | ||
221 | * | ||
222 | * - check write permissions | ||
223 | * - update the guest pte dirty bit | ||
224 | * - update our own dirty page tracking structures | ||
225 | */ | ||
226 | static int FNAME(fix_write_pf)(struct kvm_vcpu *vcpu, | ||
227 | u64 *shadow_ent, | ||
228 | struct guest_walker *walker, | ||
229 | gva_t addr, | ||
230 | int user) | ||
231 | { | ||
232 | pt_element_t *guest_ent; | ||
233 | int writable_shadow; | ||
234 | gfn_t gfn; | ||
235 | |||
236 | if (is_writeble_pte(*shadow_ent)) | ||
237 | return 0; | ||
238 | |||
239 | writable_shadow = *shadow_ent & PT_SHADOW_WRITABLE_MASK; | ||
240 | if (user) { | ||
241 | /* | ||
242 | * User mode access. Fail if it's a kernel page or a read-only | ||
243 | * page. | ||
244 | */ | ||
245 | if (!(*shadow_ent & PT_SHADOW_USER_MASK) || !writable_shadow) | ||
246 | return 0; | ||
247 | ASSERT(*shadow_ent & PT_USER_MASK); | ||
248 | } else | ||
249 | /* | ||
250 | * Kernel mode access. Fail if it's a read-only page and | ||
251 | * supervisor write protection is enabled. | ||
252 | */ | ||
253 | if (!writable_shadow) { | ||
254 | if (is_write_protection(vcpu)) | ||
255 | return 0; | ||
256 | *shadow_ent &= ~PT_USER_MASK; | ||
257 | } | ||
258 | |||
259 | guest_ent = FNAME(fetch_guest)(vcpu, walker, PT_PAGE_TABLE_LEVEL, addr); | ||
260 | |||
261 | if (!is_present_pte(*guest_ent)) { | ||
262 | *shadow_ent = 0; | ||
263 | return 0; | ||
264 | } | ||
265 | |||
266 | gfn = (*guest_ent & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; | ||
267 | mark_page_dirty(vcpu->kvm, gfn); | ||
268 | *shadow_ent |= PT_WRITABLE_MASK; | ||
269 | *guest_ent |= PT_DIRTY_MASK; | ||
270 | |||
271 | return 1; | ||
272 | } | ||
273 | |||
274 | /* | ||
275 | * Page fault handler. There are several causes for a page fault: | ||
276 | * - there is no shadow pte for the guest pte | ||
277 | * - write access through a shadow pte marked read only so that we can set | ||
278 | * the dirty bit | ||
279 | * - write access to a shadow pte marked read only so we can update the page | ||
280 | * dirty bitmap, when userspace requests it | ||
281 | * - mmio access; in this case we will never install a present shadow pte | ||
282 | * - normal guest page fault due to the guest pte marked not present, not | ||
283 | * writable, or not executable | ||
284 | * | ||
285 | * Returns: 1 if we need to emulate the instruction, 0 otherwise | ||
286 | */ | ||
287 | static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, | ||
288 | u32 error_code) | ||
289 | { | ||
290 | int write_fault = error_code & PFERR_WRITE_MASK; | ||
291 | int pte_present = error_code & PFERR_PRESENT_MASK; | ||
292 | int user_fault = error_code & PFERR_USER_MASK; | ||
293 | struct guest_walker walker; | ||
294 | u64 *shadow_pte; | ||
295 | int fixed; | ||
296 | |||
297 | /* | ||
298 | * Look up the shadow pte for the faulting address. | ||
299 | */ | ||
300 | for (;;) { | ||
301 | FNAME(init_walker)(&walker, vcpu); | ||
302 | shadow_pte = FNAME(fetch)(vcpu, addr, &walker); | ||
303 | if (IS_ERR(shadow_pte)) { /* must be -ENOMEM */ | ||
304 | nonpaging_flush(vcpu); | ||
305 | FNAME(release_walker)(&walker); | ||
306 | continue; | ||
307 | } | ||
308 | break; | ||
309 | } | ||
310 | |||
311 | /* | ||
312 | * The page is not mapped by the guest. Let the guest handle it. | ||
313 | */ | ||
314 | if (!shadow_pte) { | ||
315 | inject_page_fault(vcpu, addr, error_code); | ||
316 | FNAME(release_walker)(&walker); | ||
317 | return 0; | ||
318 | } | ||
319 | |||
320 | /* | ||
321 | * Update the shadow pte. | ||
322 | */ | ||
323 | if (write_fault) | ||
324 | fixed = FNAME(fix_write_pf)(vcpu, shadow_pte, &walker, addr, | ||
325 | user_fault); | ||
326 | else | ||
327 | fixed = fix_read_pf(shadow_pte); | ||
328 | |||
329 | FNAME(release_walker)(&walker); | ||
330 | |||
331 | /* | ||
332 | * mmio: emulate if accessible, otherwise it's a guest fault. | ||
333 | */ | ||
334 | if (is_io_pte(*shadow_pte)) { | ||
335 | if (may_access(*shadow_pte, write_fault, user_fault)) | ||
336 | return 1; | ||
337 | pgprintk("%s: io work, no access\n", __FUNCTION__); | ||
338 | inject_page_fault(vcpu, addr, | ||
339 | error_code | PFERR_PRESENT_MASK); | ||
340 | return 0; | ||
341 | } | ||
342 | |||
343 | /* | ||
344 | * pte not present, guest page fault. | ||
345 | */ | ||
346 | if (pte_present && !fixed) { | ||
347 | inject_page_fault(vcpu, addr, error_code); | ||
348 | return 0; | ||
349 | } | ||
350 | |||
351 | ++kvm_stat.pf_fixed; | ||
352 | |||
353 | return 0; | ||
354 | } | ||
355 | |||
356 | static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr) | ||
357 | { | ||
358 | struct guest_walker walker; | ||
359 | pt_element_t guest_pte; | ||
360 | gpa_t gpa; | ||
361 | |||
362 | FNAME(init_walker)(&walker, vcpu); | ||
363 | guest_pte = *FNAME(fetch_guest)(vcpu, &walker, PT_PAGE_TABLE_LEVEL, | ||
364 | vaddr); | ||
365 | FNAME(release_walker)(&walker); | ||
366 | |||
367 | if (!is_present_pte(guest_pte)) | ||
368 | return UNMAPPED_GVA; | ||
369 | |||
370 | if (walker.level == PT_DIRECTORY_LEVEL) { | ||
371 | ASSERT((guest_pte & PT_PAGE_SIZE_MASK)); | ||
372 | ASSERT(PTTYPE == 64 || is_pse(vcpu)); | ||
373 | |||
374 | gpa = (guest_pte & PT_DIR_BASE_ADDR_MASK) | (vaddr & | ||
375 | (PT_LEVEL_MASK(PT_PAGE_TABLE_LEVEL) | ~PAGE_MASK)); | ||
376 | |||
377 | if (PTTYPE == 32 && is_cpuid_PSE36()) | ||
378 | gpa |= (guest_pte & PT32_DIR_PSE36_MASK) << | ||
379 | (32 - PT32_DIR_PSE36_SHIFT); | ||
380 | } else { | ||
381 | gpa = (guest_pte & PT_BASE_ADDR_MASK); | ||
382 | gpa |= (vaddr & ~PAGE_MASK); | ||
383 | } | ||
384 | |||
385 | return gpa; | ||
386 | } | ||
387 | |||
388 | #undef pt_element_t | ||
389 | #undef guest_walker | ||
390 | #undef FNAME | ||
391 | #undef PT_BASE_ADDR_MASK | ||
392 | #undef PT_INDEX | ||
393 | #undef SHADOW_PT_INDEX | ||
394 | #undef PT_LEVEL_MASK | ||
395 | #undef PT_PTE_COPY_MASK | ||
396 | #undef PT_NON_PTE_COPY_MASK | ||
397 | #undef PT_DIR_BASE_ADDR_MASK | ||
diff --git a/drivers/kvm/segment_descriptor.h b/drivers/kvm/segment_descriptor.h new file mode 100644 index 000000000000..71fdf458619a --- /dev/null +++ b/drivers/kvm/segment_descriptor.h | |||
@@ -0,0 +1,17 @@ | |||
1 | struct segment_descriptor { | ||
2 | u16 limit_low; | ||
3 | u16 base_low; | ||
4 | u8 base_mid; | ||
5 | u8 type : 4; | ||
6 | u8 system : 1; | ||
7 | u8 dpl : 2; | ||
8 | u8 present : 1; | ||
9 | u8 limit_high : 4; | ||
10 | u8 avl : 1; | ||
11 | u8 long_mode : 1; | ||
12 | u8 default_op : 1; | ||
13 | u8 granularity : 1; | ||
14 | u8 base_high; | ||
15 | } __attribute__((packed)); | ||
16 | |||
17 | |||
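The three base fields above hold a 32-bit segment base split the way the hardware descriptor format requires; a minimal sketch of reassembling it (the helper name below is only illustrative):

	static inline unsigned long segment_descriptor_base(struct segment_descriptor *d)
	{
		/* base_low holds bits 0-15, base_mid bits 16-23, base_high bits 24-31 */
		return d->base_low | ((unsigned long)d->base_mid << 16) |
		       ((unsigned long)d->base_high << 24);
	}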
diff --git a/drivers/kvm/svm.c b/drivers/kvm/svm.c new file mode 100644 index 000000000000..a33a89c68138 --- /dev/null +++ b/drivers/kvm/svm.c | |||
@@ -0,0 +1,1677 @@ | |||
1 | /* | ||
2 | * Kernel-based Virtual Machine driver for Linux | ||
3 | * | ||
4 | * AMD SVM support | ||
5 | * | ||
6 | * Copyright (C) 2006 Qumranet, Inc. | ||
7 | * | ||
8 | * Authors: | ||
9 | * Yaniv Kamay <yaniv@qumranet.com> | ||
10 | * Avi Kivity <avi@qumranet.com> | ||
11 | * | ||
12 | * This work is licensed under the terms of the GNU GPL, version 2. See | ||
13 | * the COPYING file in the top-level directory. | ||
14 | * | ||
15 | */ | ||
16 | |||
17 | #include <linux/module.h> | ||
18 | #include <linux/vmalloc.h> | ||
19 | #include <linux/highmem.h> | ||
20 | #include <asm/desc.h> | ||
21 | |||
22 | #include "kvm_svm.h" | ||
23 | #include "x86_emulate.h" | ||
24 | |||
25 | MODULE_AUTHOR("Qumranet"); | ||
26 | MODULE_LICENSE("GPL"); | ||
27 | |||
28 | #define IOPM_ALLOC_ORDER 2 | ||
29 | #define MSRPM_ALLOC_ORDER 1 | ||
30 | |||
31 | #define DB_VECTOR 1 | ||
32 | #define UD_VECTOR 6 | ||
33 | #define GP_VECTOR 13 | ||
34 | |||
35 | #define DR7_GD_MASK (1 << 13) | ||
36 | #define DR6_BD_MASK (1 << 13) | ||
37 | #define CR4_DE_MASK (1UL << 3) | ||
38 | |||
39 | #define SEG_TYPE_LDT 2 | ||
40 | #define SEG_TYPE_BUSY_TSS16 3 | ||
41 | |||
42 | #define KVM_EFER_LMA (1 << 10) | ||
43 | #define KVM_EFER_LME (1 << 8) | ||
44 | |||
45 | unsigned long iopm_base; | ||
46 | unsigned long msrpm_base; | ||
47 | |||
48 | struct kvm_ldttss_desc { | ||
49 | u16 limit0; | ||
50 | u16 base0; | ||
51 | unsigned base1 : 8, type : 5, dpl : 2, p : 1; | ||
52 | unsigned limit1 : 4, zero0 : 3, g : 1, base2 : 8; | ||
53 | u32 base3; | ||
54 | u32 zero1; | ||
55 | } __attribute__((packed)); | ||
56 | |||
57 | struct svm_cpu_data { | ||
58 | int cpu; | ||
59 | |||
60 | uint64_t asid_generation; | ||
61 | uint32_t max_asid; | ||
62 | uint32_t next_asid; | ||
63 | struct kvm_ldttss_desc *tss_desc; | ||
64 | |||
65 | struct page *save_area; | ||
66 | }; | ||
67 | |||
68 | static DEFINE_PER_CPU(struct svm_cpu_data *, svm_data); | ||
69 | |||
70 | struct svm_init_data { | ||
71 | int cpu; | ||
72 | int r; | ||
73 | }; | ||
74 | |||
75 | static u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000}; | ||
76 | |||
77 | #define NUM_MSR_MAPS (sizeof(msrpm_ranges) / sizeof(*msrpm_ranges)) | ||
78 | #define MSRS_RANGE_SIZE 2048 | ||
79 | #define MSRS_IN_RANGE (MSRS_RANGE_SIZE * 8 / 2) | ||
80 | |||
81 | #define MAX_INST_SIZE 15 | ||
82 | |||
83 | static unsigned get_addr_size(struct kvm_vcpu *vcpu) | ||
84 | { | ||
85 | struct vmcb_save_area *sa = &vcpu->svm->vmcb->save; | ||
86 | u16 cs_attrib; | ||
87 | |||
88 | if (!(sa->cr0 & CR0_PE_MASK) || (sa->rflags & X86_EFLAGS_VM)) | ||
89 | return 2; | ||
90 | |||
91 | cs_attrib = sa->cs.attrib; | ||
92 | |||
93 | return (cs_attrib & SVM_SELECTOR_L_MASK) ? 8 : | ||
94 | (cs_attrib & SVM_SELECTOR_DB_MASK) ? 4 : 2; | ||
95 | } | ||
96 | |||
97 | static inline u8 pop_irq(struct kvm_vcpu *vcpu) | ||
98 | { | ||
99 | int word_index = __ffs(vcpu->irq_summary); | ||
100 | int bit_index = __ffs(vcpu->irq_pending[word_index]); | ||
101 | int irq = word_index * BITS_PER_LONG + bit_index; | ||
102 | |||
103 | clear_bit(bit_index, &vcpu->irq_pending[word_index]); | ||
104 | if (!vcpu->irq_pending[word_index]) | ||
105 | clear_bit(word_index, &vcpu->irq_summary); | ||
106 | return irq; | ||
107 | } | ||
108 | |||
109 | static inline void push_irq(struct kvm_vcpu *vcpu, u8 irq) | ||
110 | { | ||
111 | set_bit(irq, vcpu->irq_pending); | ||
112 | set_bit(irq / BITS_PER_LONG, &vcpu->irq_summary); | ||
113 | } | ||
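A worked example of the two-level pending-interrupt bitmap maintained by the two helpers above (assuming BITS_PER_LONG == 64):

	/* push_irq(vcpu, 33) sets bit 33 of irq_pending[0] and bit 0 of irq_summary;
	 * pop_irq(vcpu) then finds word 0 via __ffs(irq_summary) and bit 33 via
	 * __ffs(irq_pending[0]), clears bit 33 (and the summary bit, since word 0
	 * is now empty), and returns 0 * 64 + 33 == 33.
	 */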
114 | |||
115 | static inline void clgi(void) | ||
116 | { | ||
117 | asm volatile (SVM_CLGI); | ||
118 | } | ||
119 | |||
120 | static inline void stgi(void) | ||
121 | { | ||
122 | asm volatile (SVM_STGI); | ||
123 | } | ||
124 | |||
125 | static inline void invlpga(unsigned long addr, u32 asid) | ||
126 | { | ||
127 | asm volatile (SVM_INVLPGA :: "a"(addr), "c"(asid)); | ||
128 | } | ||
129 | |||
130 | static inline unsigned long kvm_read_cr2(void) | ||
131 | { | ||
132 | unsigned long cr2; | ||
133 | |||
134 | asm volatile ("mov %%cr2, %0" : "=r" (cr2)); | ||
135 | return cr2; | ||
136 | } | ||
137 | |||
138 | static inline void kvm_write_cr2(unsigned long val) | ||
139 | { | ||
140 | asm volatile ("mov %0, %%cr2" :: "r" (val)); | ||
141 | } | ||
142 | |||
143 | static inline unsigned long read_dr6(void) | ||
144 | { | ||
145 | unsigned long dr6; | ||
146 | |||
147 | asm volatile ("mov %%dr6, %0" : "=r" (dr6)); | ||
148 | return dr6; | ||
149 | } | ||
150 | |||
151 | static inline void write_dr6(unsigned long val) | ||
152 | { | ||
153 | asm volatile ("mov %0, %%dr6" :: "r" (val)); | ||
154 | } | ||
155 | |||
156 | static inline unsigned long read_dr7(void) | ||
157 | { | ||
158 | unsigned long dr7; | ||
159 | |||
160 | asm volatile ("mov %%dr7, %0" : "=r" (dr7)); | ||
161 | return dr7; | ||
162 | } | ||
163 | |||
164 | static inline void write_dr7(unsigned long val) | ||
165 | { | ||
166 | asm volatile ("mov %0, %%dr7" :: "r" (val)); | ||
167 | } | ||
168 | |||
169 | static inline int svm_is_long_mode(struct kvm_vcpu *vcpu) | ||
170 | { | ||
171 | return vcpu->svm->vmcb->save.efer & KVM_EFER_LMA; | ||
172 | } | ||
173 | |||
174 | static inline void force_new_asid(struct kvm_vcpu *vcpu) | ||
175 | { | ||
176 | vcpu->svm->asid_generation--; | ||
177 | } | ||
178 | |||
179 | static inline void flush_guest_tlb(struct kvm_vcpu *vcpu) | ||
180 | { | ||
181 | force_new_asid(vcpu); | ||
182 | } | ||
183 | |||
184 | static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer) | ||
185 | { | ||
186 | if (!(efer & KVM_EFER_LMA)) | ||
187 | efer &= ~KVM_EFER_LME; | ||
188 | |||
189 | vcpu->svm->vmcb->save.efer = efer | MSR_EFER_SVME_MASK; | ||
190 | vcpu->shadow_efer = efer; | ||
191 | } | ||
192 | |||
193 | static void svm_inject_gp(struct kvm_vcpu *vcpu, unsigned error_code) | ||
194 | { | ||
195 | vcpu->svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | | ||
196 | SVM_EVTINJ_VALID_ERR | | ||
197 | SVM_EVTINJ_TYPE_EXEPT | | ||
198 | GP_VECTOR; | ||
199 | vcpu->svm->vmcb->control.event_inj_err = error_code; | ||
200 | } | ||
201 | |||
202 | static void inject_ud(struct kvm_vcpu *vcpu) | ||
203 | { | ||
204 | vcpu->svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | | ||
205 | SVM_EVTINJ_TYPE_EXEPT | | ||
206 | UD_VECTOR; | ||
207 | } | ||
208 | |||
209 | static void inject_db(struct kvm_vcpu *vcpu) | ||
210 | { | ||
211 | vcpu->svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | | ||
212 | SVM_EVTINJ_TYPE_EXEPT | | ||
213 | DB_VECTOR; | ||
214 | } | ||
215 | |||
216 | static int is_page_fault(uint32_t info) | ||
217 | { | ||
218 | info &= SVM_EVTINJ_VEC_MASK | SVM_EVTINJ_TYPE_MASK | SVM_EVTINJ_VALID; | ||
219 | return info == (PF_VECTOR | SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_EXEPT); | ||
220 | } | ||
221 | |||
222 | static int is_external_interrupt(u32 info) | ||
223 | { | ||
224 | info &= SVM_EVTINJ_TYPE_MASK | SVM_EVTINJ_VALID; | ||
225 | return info == (SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR); | ||
226 | } | ||
227 | |||
228 | static void skip_emulated_instruction(struct kvm_vcpu *vcpu) | ||
229 | { | ||
230 | if (!vcpu->svm->next_rip) { | ||
231 | printk(KERN_DEBUG "%s: NOP\n", __FUNCTION__); | ||
232 | return; | ||
233 | } | ||
234 | if (vcpu->svm->next_rip - vcpu->svm->vmcb->save.rip > 15) { | ||
235 | printk(KERN_ERR "%s: ip 0x%llx next 0x%llx\n", | ||
236 | __FUNCTION__, | ||
237 | vcpu->svm->vmcb->save.rip, | ||
238 | vcpu->svm->next_rip); | ||
239 | } | ||
240 | |||
241 | vcpu->rip = vcpu->svm->vmcb->save.rip = vcpu->svm->next_rip; | ||
242 | vcpu->svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK; | ||
243 | } | ||
244 | |||
245 | static int has_svm(void) | ||
246 | { | ||
247 | uint32_t eax, ebx, ecx, edx; | ||
248 | |||
249 | if (current_cpu_data.x86_vendor != X86_VENDOR_AMD) { | ||
250 | printk(KERN_INFO "has_svm: not amd\n"); | ||
251 | return 0; | ||
252 | } | ||
253 | |||
254 | cpuid(0x80000000, &eax, &ebx, &ecx, &edx); | ||
255 | if (eax < SVM_CPUID_FUNC) { | ||
256 | printk(KERN_INFO "has_svm: can't execute cpuid_8000000a\n"); | ||
257 | return 0; | ||
258 | } | ||
259 | |||
260 | cpuid(0x80000001, &eax, &ebx, &ecx, &edx); | ||
261 | if (!(ecx & (1 << SVM_CPUID_FEATURE_SHIFT))) { | ||
262 | printk(KERN_DEBUG "has_svm: svm not available\n"); | ||
263 | return 0; | ||
264 | } | ||
265 | return 1; | ||
266 | } | ||
267 | |||
268 | static void svm_hardware_disable(void *garbage) | ||
269 | { | ||
270 | struct svm_cpu_data *svm_data | ||
271 | = per_cpu(svm_data, raw_smp_processor_id()); | ||
272 | |||
273 | if (svm_data) { | ||
274 | uint64_t efer; | ||
275 | |||
276 | wrmsrl(MSR_VM_HSAVE_PA, 0); | ||
277 | rdmsrl(MSR_EFER, efer); | ||
278 | wrmsrl(MSR_EFER, efer & ~MSR_EFER_SVME_MASK); | ||
279 | per_cpu(svm_data, raw_smp_processor_id()) = 0; | ||
280 | __free_page(svm_data->save_area); | ||
281 | kfree(svm_data); | ||
282 | } | ||
283 | } | ||
284 | |||
285 | static void svm_hardware_enable(void *garbage) | ||
286 | { | ||
287 | |||
288 | struct svm_cpu_data *svm_data; | ||
289 | uint64_t efer; | ||
290 | #ifdef __x86_64__ | ||
291 | struct desc_ptr gdt_descr; | ||
292 | #else | ||
293 | struct Xgt_desc_struct gdt_descr; | ||
294 | #endif | ||
295 | struct desc_struct *gdt; | ||
296 | int me = raw_smp_processor_id(); | ||
297 | |||
298 | if (!has_svm()) { | ||
299 | printk(KERN_ERR "svm_cpu_init: err EOPNOTSUPP on %d\n", me); | ||
300 | return; | ||
301 | } | ||
302 | svm_data = per_cpu(svm_data, me); | ||
303 | |||
304 | if (!svm_data) { | ||
305 | printk(KERN_ERR "svm_cpu_init: svm_data is NULL on %d\n", | ||
306 | me); | ||
307 | return; | ||
308 | } | ||
309 | |||
310 | svm_data->asid_generation = 1; | ||
311 | svm_data->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1; | ||
312 | svm_data->next_asid = svm_data->max_asid + 1; | ||
313 | |||
314 | asm volatile ( "sgdt %0" : "=m"(gdt_descr) ); | ||
315 | gdt = (struct desc_struct *)gdt_descr.address; | ||
316 | svm_data->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS); | ||
317 | |||
318 | rdmsrl(MSR_EFER, efer); | ||
319 | wrmsrl(MSR_EFER, efer | MSR_EFER_SVME_MASK); | ||
320 | |||
321 | wrmsrl(MSR_VM_HSAVE_PA, | ||
322 | page_to_pfn(svm_data->save_area) << PAGE_SHIFT); | ||
323 | } | ||
324 | |||
325 | static int svm_cpu_init(int cpu) | ||
326 | { | ||
327 | struct svm_cpu_data *svm_data; | ||
328 | int r; | ||
329 | |||
330 | svm_data = kzalloc(sizeof(struct svm_cpu_data), GFP_KERNEL); | ||
331 | if (!svm_data) | ||
332 | return -ENOMEM; | ||
333 | svm_data->cpu = cpu; | ||
334 | svm_data->save_area = alloc_page(GFP_KERNEL); | ||
335 | r = -ENOMEM; | ||
336 | if (!svm_data->save_area) | ||
337 | goto err_1; | ||
338 | |||
339 | per_cpu(svm_data, cpu) = svm_data; | ||
340 | |||
341 | return 0; | ||
342 | |||
343 | err_1: | ||
344 | kfree(svm_data); | ||
345 | return r; | ||
346 | |||
347 | } | ||
348 | |||
349 | static int set_msr_interception(u32 *msrpm, unsigned msr, | ||
350 | int read, int write) | ||
351 | { | ||
352 | int i; | ||
353 | |||
354 | for (i = 0; i < NUM_MSR_MAPS; i++) { | ||
355 | if (msr >= msrpm_ranges[i] && | ||
356 | msr < msrpm_ranges[i] + MSRS_IN_RANGE) { | ||
357 | u32 msr_offset = (i * MSRS_IN_RANGE + msr - | ||
358 | msrpm_ranges[i]) * 2; | ||
359 | |||
360 | u32 *base = msrpm + (msr_offset / 32); | ||
361 | u32 msr_shift = msr_offset % 32; | ||
362 | u32 mask = ((write) ? 0 : 2) | ((read) ? 0 : 1); | ||
363 | *base = (*base & ~(0x3 << msr_shift)) | | ||
364 | (mask << msr_shift); | ||
365 | return 1; | ||
366 | } | ||
367 | } | ||
368 | printk(KERN_DEBUG "%s: not found 0x%x\n", __FUNCTION__, msr); | ||
369 | return 0; | ||
370 | } | ||
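A worked example of the offset arithmetic above, using the constants defined in this file: MSR_STAR (0xc0000081) falls in range 1, so msr_offset = (1 * 8192 + 0x81) * 2 = 16642, i.e. u32 word 520 at bit shift 2; with read = write = 1 the mask is 0, clearing both intercept bits:

	/* illustrative call: let the guest access STAR without an MSR intercept */
	set_msr_interception(msrpm_va, MSR_STAR, 1, 1);
	/* -> *(msrpm_va + 520) &= ~(0x3 << 2); neither reads nor writes of STAR trap */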
371 | |||
372 | static __init int svm_hardware_setup(void) | ||
373 | { | ||
374 | int cpu; | ||
375 | struct page *iopm_pages; | ||
376 | struct page *msrpm_pages; | ||
377 | void *msrpm_va; | ||
378 | int r; | ||
379 | |||
380 | |||
381 | iopm_pages = alloc_pages(GFP_KERNEL, IOPM_ALLOC_ORDER); | ||
382 | |||
383 | if (!iopm_pages) | ||
384 | return -ENOMEM; | ||
385 | memset(page_address(iopm_pages), 0xff, | ||
386 | PAGE_SIZE * (1 << IOPM_ALLOC_ORDER)); | ||
387 | iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT; | ||
388 | |||
389 | |||
390 | msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER); | ||
391 | |||
392 | r = -ENOMEM; | ||
393 | if (!msrpm_pages) | ||
394 | goto err_1; | ||
395 | |||
396 | msrpm_va = page_address(msrpm_pages); | ||
397 | memset(msrpm_va, 0xff, PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER)); | ||
398 | msrpm_base = page_to_pfn(msrpm_pages) << PAGE_SHIFT; | ||
399 | |||
400 | #ifdef __x86_64__ | ||
401 | set_msr_interception(msrpm_va, MSR_GS_BASE, 1, 1); | ||
402 | set_msr_interception(msrpm_va, MSR_FS_BASE, 1, 1); | ||
403 | set_msr_interception(msrpm_va, MSR_KERNEL_GS_BASE, 1, 1); | ||
404 | set_msr_interception(msrpm_va, MSR_STAR, 1, 1); | ||
405 | set_msr_interception(msrpm_va, MSR_LSTAR, 1, 1); | ||
406 | set_msr_interception(msrpm_va, MSR_CSTAR, 1, 1); | ||
407 | set_msr_interception(msrpm_va, MSR_SYSCALL_MASK, 1, 1); | ||
408 | #endif | ||
409 | set_msr_interception(msrpm_va, MSR_IA32_SYSENTER_CS, 1, 1); | ||
410 | set_msr_interception(msrpm_va, MSR_IA32_SYSENTER_ESP, 1, 1); | ||
411 | set_msr_interception(msrpm_va, MSR_IA32_SYSENTER_EIP, 1, 1); | ||
412 | |||
413 | for_each_online_cpu(cpu) { | ||
414 | r = svm_cpu_init(cpu); | ||
415 | if (r) | ||
416 | goto err_2; | ||
417 | } | ||
418 | return 0; | ||
419 | |||
420 | err_2: | ||
421 | __free_pages(msrpm_pages, MSRPM_ALLOC_ORDER); | ||
422 | msrpm_base = 0; | ||
423 | err_1: | ||
424 | __free_pages(iopm_pages, IOPM_ALLOC_ORDER); | ||
425 | iopm_base = 0; | ||
426 | return r; | ||
427 | } | ||
428 | |||
429 | static __exit void svm_hardware_unsetup(void) | ||
430 | { | ||
431 | __free_pages(pfn_to_page(msrpm_base >> PAGE_SHIFT), MSRPM_ALLOC_ORDER); | ||
432 | __free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT), IOPM_ALLOC_ORDER); | ||
433 | iopm_base = msrpm_base = 0; | ||
434 | } | ||
435 | |||
436 | static void init_seg(struct vmcb_seg *seg) | ||
437 | { | ||
438 | seg->selector = 0; | ||
439 | seg->attrib = SVM_SELECTOR_P_MASK | SVM_SELECTOR_S_MASK | | ||
440 | SVM_SELECTOR_WRITE_MASK; /* Read/Write Data Segment */ | ||
441 | seg->limit = 0xffff; | ||
442 | seg->base = 0; | ||
443 | } | ||
444 | |||
445 | static void init_sys_seg(struct vmcb_seg *seg, uint32_t type) | ||
446 | { | ||
447 | seg->selector = 0; | ||
448 | seg->attrib = SVM_SELECTOR_P_MASK | type; | ||
449 | seg->limit = 0xffff; | ||
450 | seg->base = 0; | ||
451 | } | ||
452 | |||
453 | static int svm_vcpu_setup(struct kvm_vcpu *vcpu) | ||
454 | { | ||
455 | return 0; | ||
456 | } | ||
457 | |||
458 | static void init_vmcb(struct vmcb *vmcb) | ||
459 | { | ||
460 | struct vmcb_control_area *control = &vmcb->control; | ||
461 | struct vmcb_save_area *save = &vmcb->save; | ||
462 | u64 tsc; | ||
463 | |||
464 | control->intercept_cr_read = INTERCEPT_CR0_MASK | | ||
465 | INTERCEPT_CR3_MASK | | ||
466 | INTERCEPT_CR4_MASK; | ||
467 | |||
468 | control->intercept_cr_write = INTERCEPT_CR0_MASK | | ||
469 | INTERCEPT_CR3_MASK | | ||
470 | INTERCEPT_CR4_MASK; | ||
471 | |||
472 | control->intercept_dr_read = INTERCEPT_DR0_MASK | | ||
473 | INTERCEPT_DR1_MASK | | ||
474 | INTERCEPT_DR2_MASK | | ||
475 | INTERCEPT_DR3_MASK; | ||
476 | |||
477 | control->intercept_dr_write = INTERCEPT_DR0_MASK | | ||
478 | INTERCEPT_DR1_MASK | | ||
479 | INTERCEPT_DR2_MASK | | ||
480 | INTERCEPT_DR3_MASK | | ||
481 | INTERCEPT_DR5_MASK | | ||
482 | INTERCEPT_DR7_MASK; | ||
483 | |||
484 | control->intercept_exceptions = 1 << PF_VECTOR; | ||
485 | |||
486 | |||
487 | control->intercept = (1ULL << INTERCEPT_INTR) | | ||
488 | (1ULL << INTERCEPT_NMI) | | ||
489 | /* | ||
490 | * selective cr0 intercept bug? | ||
491 | * 0: 0f 22 d8 mov %eax,%cr3 | ||
492 | * 3: 0f 20 c0 mov %cr0,%eax | ||
493 | * 6: 0d 00 00 00 80 or $0x80000000,%eax | ||
494 | * b: 0f 22 c0 mov %eax,%cr0 | ||
495 | * set cr3 ->interception | ||
496 | * get cr0 ->interception | ||
497 | * set cr0 -> no interception | ||
498 | */ | ||
499 | /* (1ULL << INTERCEPT_SELECTIVE_CR0) | */ | ||
500 | (1ULL << INTERCEPT_CPUID) | | ||
501 | (1ULL << INTERCEPT_HLT) | | ||
502 | (1ULL << INTERCEPT_INVLPG) | | ||
503 | (1ULL << INTERCEPT_INVLPGA) | | ||
504 | (1ULL << INTERCEPT_IOIO_PROT) | | ||
505 | (1ULL << INTERCEPT_MSR_PROT) | | ||
506 | (1ULL << INTERCEPT_TASK_SWITCH) | | ||
507 | (1ULL << INTERCEPT_VMRUN) | | ||
508 | (1ULL << INTERCEPT_VMMCALL) | | ||
509 | (1ULL << INTERCEPT_VMLOAD) | | ||
510 | (1ULL << INTERCEPT_VMSAVE) | | ||
511 | (1ULL << INTERCEPT_STGI) | | ||
512 | (1ULL << INTERCEPT_CLGI) | | ||
513 | (1ULL << INTERCEPT_SKINIT); | ||
514 | |||
515 | control->iopm_base_pa = iopm_base; | ||
516 | control->msrpm_base_pa = msrpm_base; | ||
517 | rdtscll(tsc); | ||
518 | control->tsc_offset = -tsc; | ||
519 | control->int_ctl = V_INTR_MASKING_MASK; | ||
520 | |||
521 | init_seg(&save->es); | ||
522 | init_seg(&save->ss); | ||
523 | init_seg(&save->ds); | ||
524 | init_seg(&save->fs); | ||
525 | init_seg(&save->gs); | ||
526 | |||
527 | save->cs.selector = 0xf000; | ||
528 | /* Executable/Readable Code Segment */ | ||
529 | save->cs.attrib = SVM_SELECTOR_READ_MASK | SVM_SELECTOR_P_MASK | | ||
530 | SVM_SELECTOR_S_MASK | SVM_SELECTOR_CODE_MASK; | ||
531 | save->cs.limit = 0xffff; | ||
532 | save->cs.base = 0xffff0000; | ||
533 | |||
534 | save->gdtr.limit = 0xffff; | ||
535 | save->idtr.limit = 0xffff; | ||
536 | |||
537 | init_sys_seg(&save->ldtr, SEG_TYPE_LDT); | ||
538 | init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16); | ||
539 | |||
540 | save->efer = MSR_EFER_SVME_MASK; | ||
541 | |||
542 | save->dr6 = 0xffff0ff0; | ||
543 | save->dr7 = 0x400; | ||
544 | save->rflags = 2; | ||
545 | save->rip = 0x0000fff0; | ||
546 | |||
547 | /* | ||
548 | * The cr0 value on cpu init should be 0x60000010; we enable the cpu | ||
549 | * cache by default. The proper way is to enable the cache in the bios. | ||
550 | */ | ||
551 | save->cr0 = 0x00000010 | CR0_PG_MASK; | ||
552 | save->cr4 = CR4_PAE_MASK; | ||
553 | /* rdx = ?? */ | ||
554 | } | ||
555 | |||
556 | static int svm_create_vcpu(struct kvm_vcpu *vcpu) | ||
557 | { | ||
558 | struct page *page; | ||
559 | int r; | ||
560 | |||
561 | r = -ENOMEM; | ||
562 | vcpu->svm = kzalloc(sizeof *vcpu->svm, GFP_KERNEL); | ||
563 | if (!vcpu->svm) | ||
564 | goto out1; | ||
565 | page = alloc_page(GFP_KERNEL); | ||
566 | if (!page) | ||
567 | goto out2; | ||
568 | |||
569 | vcpu->svm->vmcb = page_address(page); | ||
570 | memset(vcpu->svm->vmcb, 0, PAGE_SIZE); | ||
571 | vcpu->svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT; | ||
572 | vcpu->svm->cr0 = 0x00000010; | ||
573 | vcpu->svm->asid_generation = 0; | ||
574 | memset(vcpu->svm->db_regs, 0, sizeof(vcpu->svm->db_regs)); | ||
575 | init_vmcb(vcpu->svm->vmcb); | ||
576 | |||
577 | return 0; | ||
578 | |||
579 | out2: | ||
580 | kfree(vcpu->svm); | ||
581 | out1: | ||
582 | return r; | ||
583 | } | ||
584 | |||
585 | static void svm_free_vcpu(struct kvm_vcpu *vcpu) | ||
586 | { | ||
587 | if (!vcpu->svm) | ||
588 | return; | ||
589 | if (vcpu->svm->vmcb) | ||
590 | __free_page(pfn_to_page(vcpu->svm->vmcb_pa >> PAGE_SHIFT)); | ||
591 | kfree(vcpu->svm); | ||
592 | } | ||
593 | |||
594 | static struct kvm_vcpu *svm_vcpu_load(struct kvm_vcpu *vcpu) | ||
595 | { | ||
596 | get_cpu(); | ||
597 | return vcpu; | ||
598 | } | ||
599 | |||
600 | static void svm_vcpu_put(struct kvm_vcpu *vcpu) | ||
601 | { | ||
602 | put_cpu(); | ||
603 | } | ||
604 | |||
605 | static void svm_cache_regs(struct kvm_vcpu *vcpu) | ||
606 | { | ||
607 | vcpu->regs[VCPU_REGS_RAX] = vcpu->svm->vmcb->save.rax; | ||
608 | vcpu->regs[VCPU_REGS_RSP] = vcpu->svm->vmcb->save.rsp; | ||
609 | vcpu->rip = vcpu->svm->vmcb->save.rip; | ||
610 | } | ||
611 | |||
612 | static void svm_decache_regs(struct kvm_vcpu *vcpu) | ||
613 | { | ||
614 | vcpu->svm->vmcb->save.rax = vcpu->regs[VCPU_REGS_RAX]; | ||
615 | vcpu->svm->vmcb->save.rsp = vcpu->regs[VCPU_REGS_RSP]; | ||
616 | vcpu->svm->vmcb->save.rip = vcpu->rip; | ||
617 | } | ||
618 | |||
619 | static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu) | ||
620 | { | ||
621 | return vcpu->svm->vmcb->save.rflags; | ||
622 | } | ||
623 | |||
624 | static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) | ||
625 | { | ||
626 | vcpu->svm->vmcb->save.rflags = rflags; | ||
627 | } | ||
628 | |||
629 | static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg) | ||
630 | { | ||
631 | struct vmcb_save_area *save = &vcpu->svm->vmcb->save; | ||
632 | |||
633 | switch (seg) { | ||
634 | case VCPU_SREG_CS: return &save->cs; | ||
635 | case VCPU_SREG_DS: return &save->ds; | ||
636 | case VCPU_SREG_ES: return &save->es; | ||
637 | case VCPU_SREG_FS: return &save->fs; | ||
638 | case VCPU_SREG_GS: return &save->gs; | ||
639 | case VCPU_SREG_SS: return &save->ss; | ||
640 | case VCPU_SREG_TR: return &save->tr; | ||
641 | case VCPU_SREG_LDTR: return &save->ldtr; | ||
642 | } | ||
643 | BUG(); | ||
644 | return NULL; | ||
645 | } | ||
646 | |||
647 | static u64 svm_get_segment_base(struct kvm_vcpu *vcpu, int seg) | ||
648 | { | ||
649 | struct vmcb_seg *s = svm_seg(vcpu, seg); | ||
650 | |||
651 | return s->base; | ||
652 | } | ||
653 | |||
654 | static void svm_get_segment(struct kvm_vcpu *vcpu, | ||
655 | struct kvm_segment *var, int seg) | ||
656 | { | ||
657 | struct vmcb_seg *s = svm_seg(vcpu, seg); | ||
658 | |||
659 | var->base = s->base; | ||
660 | var->limit = s->limit; | ||
661 | var->selector = s->selector; | ||
662 | var->type = s->attrib & SVM_SELECTOR_TYPE_MASK; | ||
663 | var->s = (s->attrib >> SVM_SELECTOR_S_SHIFT) & 1; | ||
664 | var->dpl = (s->attrib >> SVM_SELECTOR_DPL_SHIFT) & 3; | ||
665 | var->present = (s->attrib >> SVM_SELECTOR_P_SHIFT) & 1; | ||
666 | var->avl = (s->attrib >> SVM_SELECTOR_AVL_SHIFT) & 1; | ||
667 | var->l = (s->attrib >> SVM_SELECTOR_L_SHIFT) & 1; | ||
668 | var->db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1; | ||
669 | var->g = (s->attrib >> SVM_SELECTOR_G_SHIFT) & 1; | ||
670 | var->unusable = !var->present; | ||
671 | } | ||
672 | |||
673 | static void svm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) | ||
674 | { | ||
675 | struct vmcb_seg *s = svm_seg(vcpu, VCPU_SREG_CS); | ||
676 | |||
677 | *db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1; | ||
678 | *l = (s->attrib >> SVM_SELECTOR_L_SHIFT) & 1; | ||
679 | } | ||
680 | |||
681 | static void svm_get_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) | ||
682 | { | ||
683 | dt->limit = vcpu->svm->vmcb->save.idtr.limit; | ||
684 | dt->base = vcpu->svm->vmcb->save.idtr.base; | ||
685 | } | ||
686 | |||
687 | static void svm_set_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) | ||
688 | { | ||
689 | vcpu->svm->vmcb->save.idtr.limit = dt->limit; | ||
690 | vcpu->svm->vmcb->save.idtr.base = dt->base; | ||
691 | } | ||
692 | |||
693 | static void svm_get_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) | ||
694 | { | ||
695 | dt->limit = vcpu->svm->vmcb->save.gdtr.limit; | ||
696 | dt->base = vcpu->svm->vmcb->save.gdtr.base; | ||
697 | } | ||
698 | |||
699 | static void svm_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) | ||
700 | { | ||
701 | vcpu->svm->vmcb->save.gdtr.limit = dt->limit; | ||
702 | vcpu->svm->vmcb->save.gdtr.base = dt->base; | ||
703 | } | ||
704 | |||
705 | static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) | ||
706 | { | ||
707 | #ifdef __x86_64__ | ||
708 | if (vcpu->shadow_efer & KVM_EFER_LME) { | ||
709 | if (!is_paging(vcpu) && (cr0 & CR0_PG_MASK)) { | ||
710 | vcpu->shadow_efer |= KVM_EFER_LMA; | ||
711 | vcpu->svm->vmcb->save.efer |= KVM_EFER_LMA | KVM_EFER_LME; | ||
712 | } | ||
713 | |||
714 | if (is_paging(vcpu) && !(cr0 & CR0_PG_MASK) ) { | ||
715 | vcpu->shadow_efer &= ~KVM_EFER_LMA; | ||
716 | vcpu->svm->vmcb->save.efer &= ~(KVM_EFER_LMA | KVM_EFER_LME); | ||
717 | } | ||
718 | } | ||
719 | #endif | ||
720 | vcpu->svm->cr0 = cr0; | ||
721 | vcpu->svm->vmcb->save.cr0 = cr0 | CR0_PG_MASK; | ||
722 | vcpu->cr0 = cr0; | ||
723 | } | ||
724 | |||
725 | static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) | ||
726 | { | ||
727 | vcpu->cr4 = cr4; | ||
728 | vcpu->svm->vmcb->save.cr4 = cr4 | CR4_PAE_MASK; | ||
729 | } | ||
730 | |||
731 | static void svm_set_segment(struct kvm_vcpu *vcpu, | ||
732 | struct kvm_segment *var, int seg) | ||
733 | { | ||
734 | struct vmcb_seg *s = svm_seg(vcpu, seg); | ||
735 | |||
736 | s->base = var->base; | ||
737 | s->limit = var->limit; | ||
738 | s->selector = var->selector; | ||
739 | if (var->unusable) | ||
740 | s->attrib = 0; | ||
741 | else { | ||
742 | s->attrib = (var->type & SVM_SELECTOR_TYPE_MASK); | ||
743 | s->attrib |= (var->s & 1) << SVM_SELECTOR_S_SHIFT; | ||
744 | s->attrib |= (var->dpl & 3) << SVM_SELECTOR_DPL_SHIFT; | ||
745 | s->attrib |= (var->present & 1) << SVM_SELECTOR_P_SHIFT; | ||
746 | s->attrib |= (var->avl & 1) << SVM_SELECTOR_AVL_SHIFT; | ||
747 | s->attrib |= (var->l & 1) << SVM_SELECTOR_L_SHIFT; | ||
748 | s->attrib |= (var->db & 1) << SVM_SELECTOR_DB_SHIFT; | ||
749 | s->attrib |= (var->g & 1) << SVM_SELECTOR_G_SHIFT; | ||
750 | } | ||
751 | if (seg == VCPU_SREG_CS) | ||
752 | vcpu->svm->vmcb->save.cpl | ||
753 | = (vcpu->svm->vmcb->save.cs.attrib | ||
754 | >> SVM_SELECTOR_DPL_SHIFT) & 3; | ||
755 | |||
756 | } | ||
757 | |||
758 | /* FIXME: | ||
759 | |||
760 | vcpu->svm->vmcb->control.int_ctl &= ~V_TPR_MASK; | ||
761 | vcpu->svm->vmcb->control.int_ctl |= (sregs->cr8 & V_TPR_MASK); | ||
762 | |||
763 | */ | ||
764 | |||
765 | static int svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg) | ||
766 | { | ||
767 | return -EOPNOTSUPP; | ||
768 | } | ||
769 | |||
770 | static void load_host_msrs(struct kvm_vcpu *vcpu) | ||
771 | { | ||
772 | int i; | ||
773 | |||
774 | for ( i = 0; i < NR_HOST_SAVE_MSRS; i++) | ||
775 | wrmsrl(host_save_msrs[i], vcpu->svm->host_msrs[i]); | ||
776 | } | ||
777 | |||
778 | static void save_host_msrs(struct kvm_vcpu *vcpu) | ||
779 | { | ||
780 | int i; | ||
781 | |||
782 | for ( i = 0; i < NR_HOST_SAVE_MSRS; i++) | ||
783 | rdmsrl(host_save_msrs[i], vcpu->svm->host_msrs[i]); | ||
784 | } | ||
785 | |||
786 | static void new_asid(struct kvm_vcpu *vcpu, struct svm_cpu_data *svm_data) | ||
787 | { | ||
788 | if (svm_data->next_asid > svm_data->max_asid) { | ||
789 | ++svm_data->asid_generation; | ||
790 | svm_data->next_asid = 1; | ||
791 | vcpu->svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ALL_ASID; | ||
792 | } | ||
793 | |||
794 | vcpu->cpu = svm_data->cpu; | ||
795 | vcpu->svm->asid_generation = svm_data->asid_generation; | ||
796 | vcpu->svm->vmcb->control.asid = svm_data->next_asid++; | ||
797 | } | ||
798 | |||
799 | static void svm_invlpg(struct kvm_vcpu *vcpu, gva_t address) | ||
800 | { | ||
801 | invlpga(address, vcpu->svm->vmcb->control.asid); /* is this needed? */ | ||
802 | } | ||
803 | |||
804 | static unsigned long svm_get_dr(struct kvm_vcpu *vcpu, int dr) | ||
805 | { | ||
806 | return vcpu->svm->db_regs[dr]; | ||
807 | } | ||
808 | |||
809 | static void svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value, | ||
810 | int *exception) | ||
811 | { | ||
812 | *exception = 0; | ||
813 | |||
814 | if (vcpu->svm->vmcb->save.dr7 & DR7_GD_MASK) { | ||
815 | vcpu->svm->vmcb->save.dr7 &= ~DR7_GD_MASK; | ||
816 | vcpu->svm->vmcb->save.dr6 |= DR6_BD_MASK; | ||
817 | *exception = DB_VECTOR; | ||
818 | return; | ||
819 | } | ||
820 | |||
821 | switch (dr) { | ||
822 | case 0 ... 3: | ||
823 | vcpu->svm->db_regs[dr] = value; | ||
824 | return; | ||
825 | case 4 ... 5: | ||
826 | if (vcpu->cr4 & CR4_DE_MASK) { | ||
827 | *exception = UD_VECTOR; | ||
828 | return; | ||
829 | } | ||
830 | case 7: { | ||
831 | if (value & ~((1ULL << 32) - 1)) { | ||
832 | *exception = GP_VECTOR; | ||
833 | return; | ||
834 | } | ||
835 | vcpu->svm->vmcb->save.dr7 = value; | ||
836 | return; | ||
837 | } | ||
838 | default: | ||
839 | printk(KERN_DEBUG "%s: unexpected dr %u\n", | ||
840 | __FUNCTION__, dr); | ||
841 | *exception = UD_VECTOR; | ||
842 | return; | ||
843 | } | ||
844 | } | ||
845 | |||
846 | static int pf_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | ||
847 | { | ||
848 | u32 exit_int_info = vcpu->svm->vmcb->control.exit_int_info; | ||
849 | u64 fault_address; | ||
850 | u32 error_code; | ||
851 | enum emulation_result er; | ||
852 | |||
853 | if (is_external_interrupt(exit_int_info)) | ||
854 | push_irq(vcpu, exit_int_info & SVM_EVTINJ_VEC_MASK); | ||
855 | |||
856 | spin_lock(&vcpu->kvm->lock); | ||
857 | |||
858 | fault_address = vcpu->svm->vmcb->control.exit_info_2; | ||
859 | error_code = vcpu->svm->vmcb->control.exit_info_1; | ||
860 | if (!vcpu->mmu.page_fault(vcpu, fault_address, error_code)) { | ||
861 | spin_unlock(&vcpu->kvm->lock); | ||
862 | return 1; | ||
863 | } | ||
864 | er = emulate_instruction(vcpu, kvm_run, fault_address, error_code); | ||
865 | spin_unlock(&vcpu->kvm->lock); | ||
866 | |||
867 | switch (er) { | ||
868 | case EMULATE_DONE: | ||
869 | return 1; | ||
870 | case EMULATE_DO_MMIO: | ||
871 | ++kvm_stat.mmio_exits; | ||
872 | kvm_run->exit_reason = KVM_EXIT_MMIO; | ||
873 | return 0; | ||
874 | case EMULATE_FAIL: | ||
875 | vcpu_printf(vcpu, "%s: emulate fail\n", __FUNCTION__); | ||
876 | break; | ||
877 | default: | ||
878 | BUG(); | ||
879 | } | ||
880 | |||
881 | kvm_run->exit_reason = KVM_EXIT_UNKNOWN; | ||
882 | return 0; | ||
883 | } | ||
884 | |||
885 | static int io_get_override(struct kvm_vcpu *vcpu, | ||
886 | struct vmcb_seg **seg, | ||
887 | int *addr_override) | ||
888 | { | ||
889 | u8 inst[MAX_INST_SIZE]; | ||
890 | unsigned ins_length; | ||
891 | gva_t rip; | ||
892 | int i; | ||
893 | |||
894 | rip = vcpu->svm->vmcb->save.rip; | ||
895 | ins_length = vcpu->svm->next_rip - rip; | ||
896 | rip += vcpu->svm->vmcb->save.cs.base; | ||
897 | |||
898 | if (ins_length > MAX_INST_SIZE) | ||
899 | printk(KERN_DEBUG | ||
900 | "%s: inst length err, cs base 0x%llx rip 0x%llx " | ||
901 | "next rip 0x%llx ins_length %u\n", | ||
902 | __FUNCTION__, | ||
903 | vcpu->svm->vmcb->save.cs.base, | ||
904 | vcpu->svm->vmcb->save.rip, | ||
905 | vcpu->svm->vmcb->control.exit_info_2, | ||
906 | ins_length); | ||
907 | |||
908 | if (kvm_read_guest(vcpu, rip, ins_length, inst) != ins_length) | ||
909 | /* #PF */ | ||
910 | return 0; | ||
911 | |||
912 | *addr_override = 0; | ||
913 | *seg = NULL; | ||
914 | for (i = 0; i < ins_length; i++) | ||
915 | switch (inst[i]) { | ||
916 | case 0xf0: | ||
917 | case 0xf2: | ||
918 | case 0xf3: | ||
919 | case 0x66: | ||
920 | continue; | ||
921 | case 0x67: | ||
922 | *addr_override = 1; | ||
923 | continue; | ||
924 | case 0x2e: | ||
925 | *seg = &vcpu->svm->vmcb->save.cs; | ||
926 | continue; | ||
927 | case 0x36: | ||
928 | *seg = &vcpu->svm->vmcb->save.ss; | ||
929 | continue; | ||
930 | case 0x3e: | ||
931 | *seg = &vcpu->svm->vmcb->save.ds; | ||
932 | continue; | ||
933 | case 0x26: | ||
934 | *seg = &vcpu->svm->vmcb->save.es; | ||
935 | continue; | ||
936 | case 0x64: | ||
937 | *seg = &vcpu->svm->vmcb->save.fs; | ||
938 | continue; | ||
939 | case 0x65: | ||
940 | *seg = &vcpu->svm->vmcb->save.gs; | ||
941 | continue; | ||
942 | default: | ||
943 | return 1; | ||
944 | } | ||
945 | printk(KERN_DEBUG "%s: unexpected\n", __FUNCTION__); | ||
946 | return 0; | ||
947 | } | ||
948 | |||
949 | static unsigned long io_address(struct kvm_vcpu *vcpu, int ins, u64 *address) | ||
950 | { | ||
951 | unsigned long addr_mask; | ||
952 | unsigned long *reg; | ||
953 | struct vmcb_seg *seg; | ||
954 | int addr_override; | ||
955 | struct vmcb_save_area *save_area = &vcpu->svm->vmcb->save; | ||
956 | u16 cs_attrib = save_area->cs.attrib; | ||
957 | unsigned addr_size = get_addr_size(vcpu); | ||
958 | |||
959 | if (!io_get_override(vcpu, &seg, &addr_override)) | ||
960 | return 0; | ||
961 | |||
962 | if (addr_override) | ||
963 | addr_size = (addr_size == 2) ? 4: (addr_size >> 1); | ||
964 | |||
965 | if (ins) { | ||
966 | reg = &vcpu->regs[VCPU_REGS_RDI]; | ||
967 | seg = &vcpu->svm->vmcb->save.es; | ||
968 | } else { | ||
969 | reg = &vcpu->regs[VCPU_REGS_RSI]; | ||
970 | seg = (seg) ? seg : &vcpu->svm->vmcb->save.ds; | ||
971 | } | ||
972 | |||
973 | addr_mask = ~0ULL >> (64 - (addr_size * 8)); | ||
974 | |||
975 | if ((cs_attrib & SVM_SELECTOR_L_MASK) && | ||
976 | !(vcpu->svm->vmcb->save.rflags & X86_EFLAGS_VM)) { | ||
977 | *address = (*reg & addr_mask); | ||
978 | return addr_mask; | ||
979 | } | ||
980 | |||
981 | if (!(seg->attrib & SVM_SELECTOR_P_MASK)) { | ||
982 | svm_inject_gp(vcpu, 0); | ||
983 | return 0; | ||
984 | } | ||
985 | |||
986 | *address = (*reg & addr_mask) + seg->base; | ||
987 | return addr_mask; | ||
988 | } | ||
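A quick check of the mask arithmetic above: ~0ULL >> (64 - addr_size * 8) yields the mask that truncates the index register to the guest's effective address size before the segment base is added, e.g.:

	/* addr_size == 2: ~0ULL >> 48 == 0x000000000000ffff
	 * addr_size == 4: ~0ULL >> 32 == 0x00000000ffffffff
	 * addr_size == 8: ~0ULL >>  0 == 0xffffffffffffffff
	 */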
989 | |||
990 | static int io_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | ||
991 | { | ||
992 | u32 io_info = vcpu->svm->vmcb->control.exit_info_1; /* address size bug? */ | ||
993 | int _in = io_info & SVM_IOIO_TYPE_MASK; | ||
994 | |||
995 | ++kvm_stat.io_exits; | ||
996 | |||
997 | vcpu->svm->next_rip = vcpu->svm->vmcb->control.exit_info_2; | ||
998 | |||
999 | kvm_run->exit_reason = KVM_EXIT_IO; | ||
1000 | kvm_run->io.port = io_info >> 16; | ||
1001 | kvm_run->io.direction = (_in) ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; | ||
1002 | kvm_run->io.size = ((io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT); | ||
1003 | kvm_run->io.string = (io_info & SVM_IOIO_STR_MASK) != 0; | ||
1004 | kvm_run->io.rep = (io_info & SVM_IOIO_REP_MASK) != 0; | ||
1005 | |||
1006 | if (kvm_run->io.string) { | ||
1007 | unsigned addr_mask; | ||
1008 | |||
1009 | addr_mask = io_address(vcpu, _in, &kvm_run->io.address); | ||
1010 | if (!addr_mask) { | ||
1011 | printk(KERN_DEBUG "%s: get io address failed\n", __FUNCTION__); | ||
1012 | return 1; | ||
1013 | } | ||
1014 | |||
1015 | if (kvm_run->io.rep) { | ||
1016 | kvm_run->io.count = vcpu->regs[VCPU_REGS_RCX] & addr_mask; | ||
1017 | kvm_run->io.string_down = (vcpu->svm->vmcb->save.rflags | ||
1018 | & X86_EFLAGS_DF) != 0; | ||
1019 | } | ||
1020 | } else { | ||
1021 | kvm_run->io.value = vcpu->svm->vmcb->save.rax; | ||
1022 | } | ||
1023 | return 0; | ||
1024 | } | ||
1025 | |||
1026 | |||
1027 | static int nop_on_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | ||
1028 | { | ||
1029 | return 1; | ||
1030 | } | ||
1031 | |||
1032 | static int halt_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | ||
1033 | { | ||
1034 | vcpu->svm->next_rip = vcpu->svm->vmcb->save.rip + 1; | ||
1035 | skip_emulated_instruction(vcpu); | ||
1036 | if (vcpu->irq_summary && (vcpu->svm->vmcb->save.rflags & X86_EFLAGS_IF)) | ||
1037 | return 1; | ||
1038 | |||
1039 | kvm_run->exit_reason = KVM_EXIT_HLT; | ||
1040 | return 0; | ||
1041 | } | ||
1042 | |||
1043 | static int invalid_op_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | ||
1044 | { | ||
1045 | inject_ud(vcpu); | ||
1046 | return 1; | ||
1047 | } | ||
1048 | |||
1049 | static int task_switch_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | ||
1050 | { | ||
1051 | printk(KERN_DEBUG "%s: task switch is unsupported\n", __FUNCTION__); | ||
1052 | kvm_run->exit_reason = KVM_EXIT_UNKNOWN; | ||
1053 | return 0; | ||
1054 | } | ||
1055 | |||
1056 | static int cpuid_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | ||
1057 | { | ||
1058 | vcpu->svm->next_rip = vcpu->svm->vmcb->save.rip + 2; | ||
1059 | kvm_run->exit_reason = KVM_EXIT_CPUID; | ||
1060 | return 0; | ||
1061 | } | ||
1062 | |||
1063 | static int emulate_on_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | ||
1064 | { | ||
1065 | if (emulate_instruction(vcpu, 0, 0, 0) != EMULATE_DONE) | ||
1066 | printk(KERN_ERR "%s: failed\n", __FUNCTION__); | ||
1067 | return 1; | ||
1068 | } | ||
1069 | |||
1070 | static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data) | ||
1071 | { | ||
1072 | switch (ecx) { | ||
1073 | case MSR_IA32_MC0_CTL: | ||
1074 | case MSR_IA32_MCG_STATUS: | ||
1075 | case MSR_IA32_MCG_CAP: | ||
1076 | case MSR_IA32_MC0_MISC: | ||
1077 | case MSR_IA32_MC0_MISC+4: | ||
1078 | case MSR_IA32_MC0_MISC+8: | ||
1079 | case MSR_IA32_MC0_MISC+12: | ||
1080 | case MSR_IA32_MC0_MISC+16: | ||
1081 | case MSR_IA32_UCODE_REV: | ||
1082 | /* MTRR registers */ | ||
1083 | case 0xfe: | ||
1084 | case 0x200 ... 0x2ff: | ||
1085 | *data = 0; | ||
1086 | break; | ||
1087 | case MSR_IA32_TIME_STAMP_COUNTER: { | ||
1088 | u64 tsc; | ||
1089 | |||
1090 | rdtscll(tsc); | ||
1091 | *data = vcpu->svm->vmcb->control.tsc_offset + tsc; | ||
1092 | break; | ||
1093 | } | ||
1094 | case MSR_EFER: | ||
1095 | *data = vcpu->shadow_efer; | ||
1096 | break; | ||
1097 | case MSR_IA32_APICBASE: | ||
1098 | *data = vcpu->apic_base; | ||
1099 | break; | ||
1100 | #ifdef __x86_64__ | ||
1101 | case MSR_STAR: | ||
1102 | *data = vcpu->svm->vmcb->save.star; | ||
1103 | break; | ||
1104 | case MSR_LSTAR: | ||
1105 | *data = vcpu->svm->vmcb->save.lstar; | ||
1106 | break; | ||
1107 | case MSR_CSTAR: | ||
1108 | *data = vcpu->svm->vmcb->save.cstar; | ||
1109 | break; | ||
1110 | case MSR_KERNEL_GS_BASE: | ||
1111 | *data = vcpu->svm->vmcb->save.kernel_gs_base; | ||
1112 | break; | ||
1113 | case MSR_SYSCALL_MASK: | ||
1114 | *data = vcpu->svm->vmcb->save.sfmask; | ||
1115 | break; | ||
1116 | #endif | ||
1117 | case MSR_IA32_SYSENTER_CS: | ||
1118 | *data = vcpu->svm->vmcb->save.sysenter_cs; | ||
1119 | break; | ||
1120 | case MSR_IA32_SYSENTER_EIP: | ||
1121 | *data = vcpu->svm->vmcb->save.sysenter_eip; | ||
1122 | break; | ||
1123 | case MSR_IA32_SYSENTER_ESP: | ||
1124 | *data = vcpu->svm->vmcb->save.sysenter_esp; | ||
1125 | break; | ||
1126 | default: | ||
1127 | printk(KERN_ERR "kvm: unhandled rdmsr: 0x%x\n", ecx); | ||
1128 | return 1; | ||
1129 | } | ||
1130 | return 0; | ||
1131 | } | ||
1132 | |||
1133 | static int rdmsr_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | ||
1134 | { | ||
1135 | u32 ecx = vcpu->regs[VCPU_REGS_RCX]; | ||
1136 | u64 data; | ||
1137 | |||
1138 | if (svm_get_msr(vcpu, ecx, &data)) | ||
1139 | svm_inject_gp(vcpu, 0); | ||
1140 | else { | ||
1141 | vcpu->svm->vmcb->save.rax = data & 0xffffffff; | ||
1142 | vcpu->regs[VCPU_REGS_RDX] = data >> 32; | ||
1143 | vcpu->svm->next_rip = vcpu->svm->vmcb->save.rip + 2; | ||
1144 | skip_emulated_instruction(vcpu); | ||
1145 | } | ||
1146 | return 1; | ||
1147 | } | ||
1148 | |||
1149 | static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data) | ||
1150 | { | ||
1151 | switch (ecx) { | ||
1152 | #ifdef __x86_64__ | ||
1153 | case MSR_EFER: | ||
1154 | set_efer(vcpu, data); | ||
1155 | break; | ||
1156 | #endif | ||
1157 | case MSR_IA32_MC0_STATUS: | ||
1158 | printk(KERN_WARNING "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n" | ||
1159 | , __FUNCTION__, data); | ||
1160 | break; | ||
1161 | case MSR_IA32_TIME_STAMP_COUNTER: { | ||
1162 | u64 tsc; | ||
1163 | |||
1164 | rdtscll(tsc); | ||
1165 | vcpu->svm->vmcb->control.tsc_offset = data - tsc; | ||
1166 | break; | ||
1167 | } | ||
1168 | case MSR_IA32_UCODE_REV: | ||
1169 | case MSR_IA32_UCODE_WRITE: | ||
1170 | case 0x200 ... 0x2ff: /* MTRRs */ | ||
1171 | break; | ||
1172 | case MSR_IA32_APICBASE: | ||
1173 | vcpu->apic_base = data; | ||
1174 | break; | ||
1175 | #ifdef __x86_64__ | ||
1176 | case MSR_STAR: | ||
1177 | vcpu->svm->vmcb->save.star = data; | ||
1178 | break; | ||
1179 | case MSR_LSTAR: | ||
1180 | vcpu->svm->vmcb->save.lstar = data; | ||
1181 | break; | ||
1182 | case MSR_CSTAR: | ||
1183 | vcpu->svm->vmcb->save.cstar = data; | ||
1184 | break; | ||
1185 | case MSR_KERNEL_GS_BASE: | ||
1186 | vcpu->svm->vmcb->save.kernel_gs_base = data; | ||
1187 | break; | ||
1188 | case MSR_SYSCALL_MASK: | ||
1189 | vcpu->svm->vmcb->save.sfmask = data; | ||
1190 | break; | ||
1191 | #endif | ||
1192 | case MSR_IA32_SYSENTER_CS: | ||
1193 | vcpu->svm->vmcb->save.sysenter_cs = data; | ||
1194 | break; | ||
1195 | case MSR_IA32_SYSENTER_EIP: | ||
1196 | vcpu->svm->vmcb->save.sysenter_eip = data; | ||
1197 | break; | ||
1198 | case MSR_IA32_SYSENTER_ESP: | ||
1199 | vcpu->svm->vmcb->save.sysenter_esp = data; | ||
1200 | break; | ||
1201 | default: | ||
1202 | printk(KERN_ERR "kvm: unhandled wrmsr: %x\n", ecx); | ||
1203 | return 1; | ||
1204 | } | ||
1205 | return 0; | ||
1206 | } | ||
1207 | |||
1208 | static int wrmsr_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | ||
1209 | { | ||
1210 | u32 ecx = vcpu->regs[VCPU_REGS_RCX]; | ||
1211 | u64 data = (vcpu->svm->vmcb->save.rax & -1u) | ||
1212 | | ((u64)(vcpu->regs[VCPU_REGS_RDX] & -1u) << 32); | ||
1213 | vcpu->svm->next_rip = vcpu->svm->vmcb->save.rip + 2; | ||
1214 | if (svm_set_msr(vcpu, ecx, data)) | ||
1215 | svm_inject_gp(vcpu, 0); | ||
1216 | else | ||
1217 | skip_emulated_instruction(vcpu); | ||
1218 | return 1; | ||
1219 | } | ||
1220 | |||
1221 | static int msr_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | ||
1222 | { | ||
1223 | if (vcpu->svm->vmcb->control.exit_info_1) | ||
1224 | return wrmsr_interception(vcpu, kvm_run); | ||
1225 | else | ||
1226 | return rdmsr_interception(vcpu, kvm_run); | ||
1227 | } | ||
1228 | |||
1229 | static int (*svm_exit_handlers[])(struct kvm_vcpu *vcpu, | ||
1230 | struct kvm_run *kvm_run) = { | ||
1231 | [SVM_EXIT_READ_CR0] = emulate_on_interception, | ||
1232 | [SVM_EXIT_READ_CR3] = emulate_on_interception, | ||
1233 | [SVM_EXIT_READ_CR4] = emulate_on_interception, | ||
1234 | /* for now: */ | ||
1235 | [SVM_EXIT_WRITE_CR0] = emulate_on_interception, | ||
1236 | [SVM_EXIT_WRITE_CR3] = emulate_on_interception, | ||
1237 | [SVM_EXIT_WRITE_CR4] = emulate_on_interception, | ||
1238 | [SVM_EXIT_READ_DR0] = emulate_on_interception, | ||
1239 | [SVM_EXIT_READ_DR1] = emulate_on_interception, | ||
1240 | [SVM_EXIT_READ_DR2] = emulate_on_interception, | ||
1241 | [SVM_EXIT_READ_DR3] = emulate_on_interception, | ||
1242 | [SVM_EXIT_WRITE_DR0] = emulate_on_interception, | ||
1243 | [SVM_EXIT_WRITE_DR1] = emulate_on_interception, | ||
1244 | [SVM_EXIT_WRITE_DR2] = emulate_on_interception, | ||
1245 | [SVM_EXIT_WRITE_DR3] = emulate_on_interception, | ||
1246 | [SVM_EXIT_WRITE_DR5] = emulate_on_interception, | ||
1247 | [SVM_EXIT_WRITE_DR7] = emulate_on_interception, | ||
1248 | [SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception, | ||
1249 | [SVM_EXIT_INTR] = nop_on_interception, | ||
1250 | [SVM_EXIT_NMI] = nop_on_interception, | ||
1251 | [SVM_EXIT_SMI] = nop_on_interception, | ||
1252 | [SVM_EXIT_INIT] = nop_on_interception, | ||
1253 | /* [SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception, */ | ||
1254 | [SVM_EXIT_CPUID] = cpuid_interception, | ||
1255 | [SVM_EXIT_HLT] = halt_interception, | ||
1256 | [SVM_EXIT_INVLPG] = emulate_on_interception, | ||
1257 | [SVM_EXIT_INVLPGA] = invalid_op_interception, | ||
1258 | [SVM_EXIT_IOIO] = io_interception, | ||
1259 | [SVM_EXIT_MSR] = msr_interception, | ||
1260 | [SVM_EXIT_TASK_SWITCH] = task_switch_interception, | ||
1261 | [SVM_EXIT_VMRUN] = invalid_op_interception, | ||
1262 | [SVM_EXIT_VMMCALL] = invalid_op_interception, | ||
1263 | [SVM_EXIT_VMLOAD] = invalid_op_interception, | ||
1264 | [SVM_EXIT_VMSAVE] = invalid_op_interception, | ||
1265 | [SVM_EXIT_STGI] = invalid_op_interception, | ||
1266 | [SVM_EXIT_CLGI] = invalid_op_interception, | ||
1267 | [SVM_EXIT_SKINIT] = invalid_op_interception, | ||
1268 | }; | ||
1269 | |||
1270 | |||
1271 | static int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | ||
1272 | { | ||
1273 | u32 exit_code = vcpu->svm->vmcb->control.exit_code; | ||
1274 | |||
1275 | kvm_run->exit_type = KVM_EXIT_TYPE_VM_EXIT; | ||
1276 | |||
1277 | if (is_external_interrupt(vcpu->svm->vmcb->control.exit_int_info) && | ||
1278 | exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR) | ||
1279 | printk(KERN_ERR "%s: unexpected exit_ini_info 0x%x " | ||
1280 | "exit_code 0x%x\n", | ||
1281 | __FUNCTION__, vcpu->svm->vmcb->control.exit_int_info, | ||
1282 | exit_code); | ||
1283 | |||
1284 | if (exit_code >= sizeof(svm_exit_handlers) / sizeof(*svm_exit_handlers) | ||
1285 | || svm_exit_handlers[exit_code] == 0) { | ||
1286 | kvm_run->exit_reason = KVM_EXIT_UNKNOWN; | ||
1287 | printk(KERN_ERR "%s: 0x%x @ 0x%llx cr0 0x%lx rflags 0x%llx\n", | ||
1288 | __FUNCTION__, | ||
1289 | exit_code, | ||
1290 | vcpu->svm->vmcb->save.rip, | ||
1291 | vcpu->cr0, | ||
1292 | vcpu->svm->vmcb->save.rflags); | ||
1293 | return 0; | ||
1294 | } | ||
1295 | |||
1296 | return svm_exit_handlers[exit_code](vcpu, kvm_run); | ||
1297 | } | ||
1298 | |||
1299 | static void reload_tss(struct kvm_vcpu *vcpu) | ||
1300 | { | ||
1301 | int cpu = raw_smp_processor_id(); | ||
1302 | |||
1303 | struct svm_cpu_data *svm_data = per_cpu(svm_data, cpu); | ||
1304 | svm_data->tss_desc->type = 9; /* available 32/64-bit TSS */ | ||
1305 | load_TR_desc(); | ||
1306 | } | ||
1307 | |||
1308 | static void pre_svm_run(struct kvm_vcpu *vcpu) | ||
1309 | { | ||
1310 | int cpu = raw_smp_processor_id(); | ||
1311 | |||
1312 | struct svm_cpu_data *svm_data = per_cpu(svm_data, cpu); | ||
1313 | |||
1314 | vcpu->svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING; | ||
1315 | if (vcpu->cpu != cpu || | ||
1316 | vcpu->svm->asid_generation != svm_data->asid_generation) | ||
1317 | new_asid(vcpu, svm_data); | ||
1318 | } | ||
1319 | |||
1320 | |||
1321 | static inline void kvm_try_inject_irq(struct kvm_vcpu *vcpu) | ||
1322 | { | ||
1323 | struct vmcb_control_area *control; | ||
1324 | |||
1325 | if (!vcpu->irq_summary) | ||
1326 | return; | ||
1327 | |||
1328 | control = &vcpu->svm->vmcb->control; | ||
1329 | |||
1330 | control->int_vector = pop_irq(vcpu); | ||
1331 | control->int_ctl &= ~V_INTR_PRIO_MASK; | ||
1332 | control->int_ctl |= V_IRQ_MASK | | ||
1333 | ((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT); | ||
1334 | } | ||
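kvm_try_inject_irq() packs the virtual-interrupt request into the single int_ctl field: V_IRQ_MASK (bit 8) asserts a pending virtual interrupt and the V_INTR_PRIO bits (19:16, per the masks in svm.h below) carry its priority, hard-wired to 0xf here. A standalone sketch of the bit arithmetic, with a made-up starting value:

#include <stdio.h>
#include <stdint.h>

#define V_IRQ_MASK        (1u << 8)
#define V_INTR_PRIO_SHIFT 16
#define V_INTR_PRIO_MASK  (0x0fu << V_INTR_PRIO_SHIFT)

int main(void)
{
	uint32_t int_ctl = 0x00000003;		/* pretend some V_TPR bits are set */

	int_ctl &= ~V_INTR_PRIO_MASK;		/* clear any old priority */
	int_ctl |= V_IRQ_MASK | (0xfu << V_INTR_PRIO_SHIFT);

	printf("int_ctl = %#x\n", int_ctl);	/* prints 0xf0103 */
	return 0;
}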
1335 | |||
1336 | static void kvm_reput_irq(struct kvm_vcpu *vcpu) | ||
1337 | { | ||
1338 | struct vmcb_control_area *control = &vcpu->svm->vmcb->control; | ||
1339 | |||
1340 | if (control->int_ctl & V_IRQ_MASK) { | ||
1341 | control->int_ctl &= ~V_IRQ_MASK; | ||
1342 | push_irq(vcpu, control->int_vector); | ||
1343 | } | ||
1344 | } | ||
1345 | |||
1346 | static void save_db_regs(unsigned long *db_regs) | ||
1347 | { | ||
1348 | #ifdef __x86_64__ | ||
1349 | asm ("mov %%dr0, %%rax \n\t" | ||
1350 | "mov %%rax, %[dr0] \n\t" | ||
1351 | "mov %%dr1, %%rax \n\t" | ||
1352 | "mov %%rax, %[dr1] \n\t" | ||
1353 | "mov %%dr2, %%rax \n\t" | ||
1354 | "mov %%rax, %[dr2] \n\t" | ||
1355 | "mov %%dr3, %%rax \n\t" | ||
1356 | "mov %%rax, %[dr3] \n\t" | ||
1357 | : [dr0] "=m"(db_regs[0]), | ||
1358 | [dr1] "=m"(db_regs[1]), | ||
1359 | [dr2] "=m"(db_regs[2]), | ||
1360 | [dr3] "=m"(db_regs[3]) | ||
1361 | : : "rax"); | ||
1362 | #else | ||
1363 | asm ("mov %%dr0, %%eax \n\t" | ||
1364 | "mov %%eax, %[dr0] \n\t" | ||
1365 | "mov %%dr1, %%eax \n\t" | ||
1366 | "mov %%eax, %[dr1] \n\t" | ||
1367 | "mov %%dr2, %%eax \n\t" | ||
1368 | "mov %%eax, %[dr2] \n\t" | ||
1369 | "mov %%dr3, %%eax \n\t" | ||
1370 | "mov %%eax, %[dr3] \n\t" | ||
1371 | : [dr0] "=m"(db_regs[0]), | ||
1372 | [dr1] "=m"(db_regs[1]), | ||
1373 | [dr2] "=m"(db_regs[2]), | ||
1374 | [dr3] "=m"(db_regs[3]) | ||
1375 | : : "eax"); | ||
1376 | #endif | ||
1377 | } | ||
1378 | |||
1379 | static void load_db_regs(unsigned long *db_regs) | ||
1380 | { | ||
1381 | asm volatile ("mov %[dr0], %%dr0 \n\t" | ||
1382 | "mov %[dr1], %%dr1 \n\t" | ||
1383 | "mov %[dr2], %%dr2 \n\t" | ||
1384 | "mov %[dr3], %%dr3 \n\t" | ||
1385 | : | ||
1386 | : [dr0] "r"(db_regs[0]), | ||
1387 | [dr1] "r"(db_regs[1]), | ||
1388 | [dr2] "r"(db_regs[2]), | ||
1389 | [dr3] "r"(db_regs[3]) | ||
1390 | #ifdef __x86_64__ | ||
1391 | : "rax"); | ||
1392 | #else | ||
1393 | : "eax"); | ||
1394 | #endif | ||
1395 | } | ||
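svm_vcpu_run() below only swaps the debug registers when (vmcb->save.dr7 & 0xff) is non-zero: the low byte of DR7 holds the L0/G0 .. L3/G3 enable bits, so a non-zero low byte means the guest has armed at least one hardware breakpoint and the host's DR0-DR3 must be saved and restored around VMRUN. A tiny standalone version of that test:

#include <stdio.h>

/* DR7 bits 0-7 are the local/global enable bits for breakpoints 0-3 */
static int guest_uses_db_regs(unsigned long dr7)
{
	return (dr7 & 0xff) != 0;
}

int main(void)
{
	printf("%d %d\n", guest_uses_db_regs(0x400),		/* 0: nothing armed */
	       guest_uses_db_regs(0x400 | 0x2));		/* 1: G0 set */
	return 0;
}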
1396 | |||
1397 | static int svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | ||
1398 | { | ||
1399 | u16 fs_selector; | ||
1400 | u16 gs_selector; | ||
1401 | u16 ldt_selector; | ||
1402 | |||
1403 | again: | ||
1404 | kvm_try_inject_irq(vcpu); | ||
1405 | |||
1406 | clgi(); | ||
1407 | |||
1408 | pre_svm_run(vcpu); | ||
1409 | |||
1410 | save_host_msrs(vcpu); | ||
1411 | fs_selector = read_fs(); | ||
1412 | gs_selector = read_gs(); | ||
1413 | ldt_selector = read_ldt(); | ||
1414 | vcpu->svm->host_cr2 = kvm_read_cr2(); | ||
1415 | vcpu->svm->host_dr6 = read_dr6(); | ||
1416 | vcpu->svm->host_dr7 = read_dr7(); | ||
1417 | vcpu->svm->vmcb->save.cr2 = vcpu->cr2; | ||
1418 | |||
1419 | if (vcpu->svm->vmcb->save.dr7 & 0xff) { | ||
1420 | write_dr7(0); | ||
1421 | save_db_regs(vcpu->svm->host_db_regs); | ||
1422 | load_db_regs(vcpu->svm->db_regs); | ||
1423 | } | ||
1424 | asm volatile ( | ||
1425 | #ifdef __x86_64__ | ||
1426 | "push %%rbx; push %%rcx; push %%rdx;" | ||
1427 | "push %%rsi; push %%rdi; push %%rbp;" | ||
1428 | "push %%r8; push %%r9; push %%r10; push %%r11;" | ||
1429 | "push %%r12; push %%r13; push %%r14; push %%r15;" | ||
1430 | #else | ||
1431 | "push %%ebx; push %%ecx; push %%edx;" | ||
1432 | "push %%esi; push %%edi; push %%ebp;" | ||
1433 | #endif | ||
1434 | |||
1435 | #ifdef __x86_64__ | ||
1436 | "mov %c[rbx](%[vcpu]), %%rbx \n\t" | ||
1437 | "mov %c[rcx](%[vcpu]), %%rcx \n\t" | ||
1438 | "mov %c[rdx](%[vcpu]), %%rdx \n\t" | ||
1439 | "mov %c[rsi](%[vcpu]), %%rsi \n\t" | ||
1440 | "mov %c[rdi](%[vcpu]), %%rdi \n\t" | ||
1441 | "mov %c[rbp](%[vcpu]), %%rbp \n\t" | ||
1442 | "mov %c[r8](%[vcpu]), %%r8 \n\t" | ||
1443 | "mov %c[r9](%[vcpu]), %%r9 \n\t" | ||
1444 | "mov %c[r10](%[vcpu]), %%r10 \n\t" | ||
1445 | "mov %c[r11](%[vcpu]), %%r11 \n\t" | ||
1446 | "mov %c[r12](%[vcpu]), %%r12 \n\t" | ||
1447 | "mov %c[r13](%[vcpu]), %%r13 \n\t" | ||
1448 | "mov %c[r14](%[vcpu]), %%r14 \n\t" | ||
1449 | "mov %c[r15](%[vcpu]), %%r15 \n\t" | ||
1450 | #else | ||
1451 | "mov %c[rbx](%[vcpu]), %%ebx \n\t" | ||
1452 | "mov %c[rcx](%[vcpu]), %%ecx \n\t" | ||
1453 | "mov %c[rdx](%[vcpu]), %%edx \n\t" | ||
1454 | "mov %c[rsi](%[vcpu]), %%esi \n\t" | ||
1455 | "mov %c[rdi](%[vcpu]), %%edi \n\t" | ||
1456 | "mov %c[rbp](%[vcpu]), %%ebp \n\t" | ||
1457 | #endif | ||
1458 | |||
1459 | #ifdef __x86_64__ | ||
1460 | /* Enter guest mode */ | ||
1461 | "push %%rax \n\t" | ||
1462 | "mov %c[svm](%[vcpu]), %%rax \n\t" | ||
1463 | "mov %c[vmcb](%%rax), %%rax \n\t" | ||
1464 | SVM_VMLOAD "\n\t" | ||
1465 | SVM_VMRUN "\n\t" | ||
1466 | SVM_VMSAVE "\n\t" | ||
1467 | "pop %%rax \n\t" | ||
1468 | #else | ||
1469 | /* Enter guest mode */ | ||
1470 | "push %%eax \n\t" | ||
1471 | "mov %c[svm](%[vcpu]), %%eax \n\t" | ||
1472 | "mov %c[vmcb](%%eax), %%eax \n\t" | ||
1473 | SVM_VMLOAD "\n\t" | ||
1474 | SVM_VMRUN "\n\t" | ||
1475 | SVM_VMSAVE "\n\t" | ||
1476 | "pop %%eax \n\t" | ||
1477 | #endif | ||
1478 | |||
1479 | /* Save guest registers, load host registers */ | ||
1480 | #ifdef __x86_64__ | ||
1481 | "mov %%rbx, %c[rbx](%[vcpu]) \n\t" | ||
1482 | "mov %%rcx, %c[rcx](%[vcpu]) \n\t" | ||
1483 | "mov %%rdx, %c[rdx](%[vcpu]) \n\t" | ||
1484 | "mov %%rsi, %c[rsi](%[vcpu]) \n\t" | ||
1485 | "mov %%rdi, %c[rdi](%[vcpu]) \n\t" | ||
1486 | "mov %%rbp, %c[rbp](%[vcpu]) \n\t" | ||
1487 | "mov %%r8, %c[r8](%[vcpu]) \n\t" | ||
1488 | "mov %%r9, %c[r9](%[vcpu]) \n\t" | ||
1489 | "mov %%r10, %c[r10](%[vcpu]) \n\t" | ||
1490 | "mov %%r11, %c[r11](%[vcpu]) \n\t" | ||
1491 | "mov %%r12, %c[r12](%[vcpu]) \n\t" | ||
1492 | "mov %%r13, %c[r13](%[vcpu]) \n\t" | ||
1493 | "mov %%r14, %c[r14](%[vcpu]) \n\t" | ||
1494 | "mov %%r15, %c[r15](%[vcpu]) \n\t" | ||
1495 | |||
1496 | "pop %%r15; pop %%r14; pop %%r13; pop %%r12;" | ||
1497 | "pop %%r11; pop %%r10; pop %%r9; pop %%r8;" | ||
1498 | "pop %%rbp; pop %%rdi; pop %%rsi;" | ||
1499 | "pop %%rdx; pop %%rcx; pop %%rbx; \n\t" | ||
1500 | #else | ||
1501 | "mov %%ebx, %c[rbx](%[vcpu]) \n\t" | ||
1502 | "mov %%ecx, %c[rcx](%[vcpu]) \n\t" | ||
1503 | "mov %%edx, %c[rdx](%[vcpu]) \n\t" | ||
1504 | "mov %%esi, %c[rsi](%[vcpu]) \n\t" | ||
1505 | "mov %%edi, %c[rdi](%[vcpu]) \n\t" | ||
1506 | "mov %%ebp, %c[rbp](%[vcpu]) \n\t" | ||
1507 | |||
1508 | "pop %%ebp; pop %%edi; pop %%esi;" | ||
1509 | "pop %%edx; pop %%ecx; pop %%ebx; \n\t" | ||
1510 | #endif | ||
1511 | : | ||
1512 | : [vcpu]"a"(vcpu), | ||
1513 | [svm]"i"(offsetof(struct kvm_vcpu, svm)), | ||
1514 | [vmcb]"i"(offsetof(struct vcpu_svm, vmcb_pa)), | ||
1515 | [rbx]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RBX])), | ||
1516 | [rcx]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RCX])), | ||
1517 | [rdx]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RDX])), | ||
1518 | [rsi]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RSI])), | ||
1519 | [rdi]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RDI])), | ||
1520 | [rbp]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RBP])) | ||
1521 | #ifdef __x86_64__ | ||
1522 | ,[r8 ]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R8 ])), | ||
1523 | [r9 ]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R9 ])), | ||
1524 | [r10]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R10])), | ||
1525 | [r11]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R11])), | ||
1526 | [r12]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R12])), | ||
1527 | [r13]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R13])), | ||
1528 | [r14]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R14])), | ||
1529 | [r15]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R15])) | ||
1530 | #endif | ||
1531 | : "cc", "memory" ); | ||
1532 | |||
1533 | if ((vcpu->svm->vmcb->save.dr7 & 0xff)) | ||
1534 | load_db_regs(vcpu->svm->host_db_regs); | ||
1535 | |||
1536 | vcpu->cr2 = vcpu->svm->vmcb->save.cr2; | ||
1537 | |||
1538 | write_dr6(vcpu->svm->host_dr6); | ||
1539 | write_dr7(vcpu->svm->host_dr7); | ||
1540 | kvm_write_cr2(vcpu->svm->host_cr2); | ||
1541 | |||
1542 | load_fs(fs_selector); | ||
1543 | load_gs(gs_selector); | ||
1544 | load_ldt(ldt_selector); | ||
1545 | load_host_msrs(vcpu); | ||
1546 | |||
1547 | reload_tss(vcpu); | ||
1548 | |||
1549 | stgi(); | ||
1550 | |||
1551 | kvm_reput_irq(vcpu); | ||
1552 | |||
1553 | vcpu->svm->next_rip = 0; | ||
1554 | |||
1555 | if (vcpu->svm->vmcb->control.exit_code == SVM_EXIT_ERR) { | ||
1556 | kvm_run->exit_type = KVM_EXIT_TYPE_FAIL_ENTRY; | ||
1557 | kvm_run->exit_reason = vcpu->svm->vmcb->control.exit_code; | ||
1558 | return 0; | ||
1559 | } | ||
1560 | |||
1561 | if (handle_exit(vcpu, kvm_run)) { | ||
1562 | if (signal_pending(current)) { | ||
1563 | ++kvm_stat.signal_exits; | ||
1564 | return -EINTR; | ||
1565 | } | ||
1566 | kvm_resched(vcpu); | ||
1567 | goto again; | ||
1568 | } | ||
1569 | return 0; | ||
1570 | } | ||
1571 | |||
1572 | static void svm_flush_tlb(struct kvm_vcpu *vcpu) | ||
1573 | { | ||
1574 | force_new_asid(vcpu); | ||
1575 | } | ||
1576 | |||
1577 | static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root) | ||
1578 | { | ||
1579 | vcpu->svm->vmcb->save.cr3 = root; | ||
1580 | force_new_asid(vcpu); | ||
1581 | } | ||
1582 | |||
1583 | static void svm_inject_page_fault(struct kvm_vcpu *vcpu, | ||
1584 | unsigned long addr, | ||
1585 | uint32_t err_code) | ||
1586 | { | ||
1587 | uint32_t exit_int_info = vcpu->svm->vmcb->control.exit_int_info; | ||
1588 | |||
1589 | ++kvm_stat.pf_guest; | ||
1590 | |||
1591 | if (is_page_fault(exit_int_info)) { | ||
1592 | |||
1593 | vcpu->svm->vmcb->control.event_inj_err = 0; | ||
1594 | vcpu->svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | | ||
1595 | SVM_EVTINJ_VALID_ERR | | ||
1596 | SVM_EVTINJ_TYPE_EXEPT | | ||
1597 | DF_VECTOR; | ||
1598 | return; | ||
1599 | } | ||
1600 | vcpu->cr2 = addr; | ||
1601 | vcpu->svm->vmcb->save.cr2 = addr; | ||
1602 | vcpu->svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | | ||
1603 | SVM_EVTINJ_VALID_ERR | | ||
1604 | SVM_EVTINJ_TYPE_EXEPT | | ||
1605 | PF_VECTOR; | ||
1606 | vcpu->svm->vmcb->control.event_inj_err = err_code; | ||
1607 | } | ||
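The event_inj value composed here follows the EVTINJ layout defined in svm.h: vector in bits 7:0, type in bits 10:8, the error-code-valid flag at bit 11 and the valid flag at bit 31. For an injected guest page fault (vector 14) with an error code the field works out as in this standalone sketch:

#include <stdio.h>
#include <stdint.h>

#define SVM_EVTINJ_TYPE_SHIFT	8
#define SVM_EVTINJ_TYPE_EXEPT	(3u << SVM_EVTINJ_TYPE_SHIFT)
#define SVM_EVTINJ_VALID	(1u << 31)
#define SVM_EVTINJ_VALID_ERR	(1u << 11)
#define PF_VECTOR		14	/* architectural #PF vector */

int main(void)
{
	uint32_t event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_VALID_ERR |
			     SVM_EVTINJ_TYPE_EXEPT | PF_VECTOR;

	printf("event_inj = %#x\n", event_inj);	/* prints 0x80000b0e */
	return 0;
}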
1608 | |||
1609 | |||
1610 | static int is_disabled(void) | ||
1611 | { | ||
1612 | return 0; | ||
1613 | } | ||
1614 | |||
1615 | static struct kvm_arch_ops svm_arch_ops = { | ||
1616 | .cpu_has_kvm_support = has_svm, | ||
1617 | .disabled_by_bios = is_disabled, | ||
1618 | .hardware_setup = svm_hardware_setup, | ||
1619 | .hardware_unsetup = svm_hardware_unsetup, | ||
1620 | .hardware_enable = svm_hardware_enable, | ||
1621 | .hardware_disable = svm_hardware_disable, | ||
1622 | |||
1623 | .vcpu_create = svm_create_vcpu, | ||
1624 | .vcpu_free = svm_free_vcpu, | ||
1625 | |||
1626 | .vcpu_load = svm_vcpu_load, | ||
1627 | .vcpu_put = svm_vcpu_put, | ||
1628 | |||
1629 | .set_guest_debug = svm_guest_debug, | ||
1630 | .get_msr = svm_get_msr, | ||
1631 | .set_msr = svm_set_msr, | ||
1632 | .get_segment_base = svm_get_segment_base, | ||
1633 | .get_segment = svm_get_segment, | ||
1634 | .set_segment = svm_set_segment, | ||
1635 | .is_long_mode = svm_is_long_mode, | ||
1636 | .get_cs_db_l_bits = svm_get_cs_db_l_bits, | ||
1637 | .set_cr0 = svm_set_cr0, | ||
1638 | .set_cr0_no_modeswitch = svm_set_cr0, | ||
1639 | .set_cr3 = svm_set_cr3, | ||
1640 | .set_cr4 = svm_set_cr4, | ||
1641 | .set_efer = svm_set_efer, | ||
1642 | .get_idt = svm_get_idt, | ||
1643 | .set_idt = svm_set_idt, | ||
1644 | .get_gdt = svm_get_gdt, | ||
1645 | .set_gdt = svm_set_gdt, | ||
1646 | .get_dr = svm_get_dr, | ||
1647 | .set_dr = svm_set_dr, | ||
1648 | .cache_regs = svm_cache_regs, | ||
1649 | .decache_regs = svm_decache_regs, | ||
1650 | .get_rflags = svm_get_rflags, | ||
1651 | .set_rflags = svm_set_rflags, | ||
1652 | |||
1653 | .invlpg = svm_invlpg, | ||
1654 | .tlb_flush = svm_flush_tlb, | ||
1655 | .inject_page_fault = svm_inject_page_fault, | ||
1656 | |||
1657 | .inject_gp = svm_inject_gp, | ||
1658 | |||
1659 | .run = svm_vcpu_run, | ||
1660 | .skip_emulated_instruction = skip_emulated_instruction, | ||
1661 | .vcpu_setup = svm_vcpu_setup, | ||
1662 | }; | ||
1663 | |||
1664 | static int __init svm_init(void) | ||
1665 | { | ||
1666 | kvm_emulator_want_group7_invlpg(); | ||
1667 | kvm_init_arch(&svm_arch_ops, THIS_MODULE); | ||
1668 | return 0; | ||
1669 | } | ||
1670 | |||
1671 | static void __exit svm_exit(void) | ||
1672 | { | ||
1673 | kvm_exit_arch(); | ||
1674 | } | ||
1675 | |||
1676 | module_init(svm_init) | ||
1677 | module_exit(svm_exit) | ||
diff --git a/drivers/kvm/svm.h b/drivers/kvm/svm.h new file mode 100644 index 000000000000..df731c3fb588 --- /dev/null +++ b/drivers/kvm/svm.h | |||
@@ -0,0 +1,315 @@ | |||
1 | #ifndef __SVM_H | ||
2 | #define __SVM_H | ||
3 | |||
4 | enum { | ||
5 | INTERCEPT_INTR, | ||
6 | INTERCEPT_NMI, | ||
7 | INTERCEPT_SMI, | ||
8 | INTERCEPT_INIT, | ||
9 | INTERCEPT_VINTR, | ||
10 | INTERCEPT_SELECTIVE_CR0, | ||
11 | INTERCEPT_STORE_IDTR, | ||
12 | INTERCEPT_STORE_GDTR, | ||
13 | INTERCEPT_STORE_LDTR, | ||
14 | INTERCEPT_STORE_TR, | ||
15 | INTERCEPT_LOAD_IDTR, | ||
16 | INTERCEPT_LOAD_GDTR, | ||
17 | INTERCEPT_LOAD_LDTR, | ||
18 | INTERCEPT_LOAD_TR, | ||
19 | INTERCEPT_RDTSC, | ||
20 | INTERCEPT_RDPMC, | ||
21 | INTERCEPT_PUSHF, | ||
22 | INTERCEPT_POPF, | ||
23 | INTERCEPT_CPUID, | ||
24 | INTERCEPT_RSM, | ||
25 | INTERCEPT_IRET, | ||
26 | INTERCEPT_INTn, | ||
27 | INTERCEPT_INVD, | ||
28 | INTERCEPT_PAUSE, | ||
29 | INTERCEPT_HLT, | ||
30 | INTERCEPT_INVLPG, | ||
31 | INTERCEPT_INVLPGA, | ||
32 | INTERCEPT_IOIO_PROT, | ||
33 | INTERCEPT_MSR_PROT, | ||
34 | INTERCEPT_TASK_SWITCH, | ||
35 | INTERCEPT_FERR_FREEZE, | ||
36 | INTERCEPT_SHUTDOWN, | ||
37 | INTERCEPT_VMRUN, | ||
38 | INTERCEPT_VMMCALL, | ||
39 | INTERCEPT_VMLOAD, | ||
40 | INTERCEPT_VMSAVE, | ||
41 | INTERCEPT_STGI, | ||
42 | INTERCEPT_CLGI, | ||
43 | INTERCEPT_SKINIT, | ||
44 | INTERCEPT_RDTSCP, | ||
45 | INTERCEPT_ICEBP, | ||
46 | INTERCEPT_WBINVD, | ||
47 | }; | ||
48 | |||
49 | |||
50 | struct __attribute__ ((__packed__)) vmcb_control_area { | ||
51 | u16 intercept_cr_read; | ||
52 | u16 intercept_cr_write; | ||
53 | u16 intercept_dr_read; | ||
54 | u16 intercept_dr_write; | ||
55 | u32 intercept_exceptions; | ||
56 | u64 intercept; | ||
57 | u8 reserved_1[44]; | ||
58 | u64 iopm_base_pa; | ||
59 | u64 msrpm_base_pa; | ||
60 | u64 tsc_offset; | ||
61 | u32 asid; | ||
62 | u8 tlb_ctl; | ||
63 | u8 reserved_2[3]; | ||
64 | u32 int_ctl; | ||
65 | u32 int_vector; | ||
66 | u32 int_state; | ||
67 | u8 reserved_3[4]; | ||
68 | u32 exit_code; | ||
69 | u32 exit_code_hi; | ||
70 | u64 exit_info_1; | ||
71 | u64 exit_info_2; | ||
72 | u32 exit_int_info; | ||
73 | u32 exit_int_info_err; | ||
74 | u64 nested_ctl; | ||
75 | u8 reserved_4[16]; | ||
76 | u32 event_inj; | ||
77 | u32 event_inj_err; | ||
78 | u64 nested_cr3; | ||
79 | u64 lbr_ctl; | ||
80 | u8 reserved_5[832]; | ||
81 | }; | ||
82 | |||
83 | |||
84 | #define TLB_CONTROL_DO_NOTHING 0 | ||
85 | #define TLB_CONTROL_FLUSH_ALL_ASID 1 | ||
86 | |||
87 | #define V_TPR_MASK 0x0f | ||
88 | |||
89 | #define V_IRQ_SHIFT 8 | ||
90 | #define V_IRQ_MASK (1 << V_IRQ_SHIFT) | ||
91 | |||
92 | #define V_INTR_PRIO_SHIFT 16 | ||
93 | #define V_INTR_PRIO_MASK (0x0f << V_INTR_PRIO_SHIFT) | ||
94 | |||
95 | #define V_IGN_TPR_SHIFT 20 | ||
96 | #define V_IGN_TPR_MASK (1 << V_IGN_TPR_SHIFT) | ||
97 | |||
98 | #define V_INTR_MASKING_SHIFT 24 | ||
99 | #define V_INTR_MASKING_MASK (1 << V_INTR_MASKING_SHIFT) | ||
100 | |||
101 | #define SVM_INTERRUPT_SHADOW_MASK 1 | ||
102 | |||
103 | #define SVM_IOIO_STR_SHIFT 2 | ||
104 | #define SVM_IOIO_REP_SHIFT 3 | ||
105 | #define SVM_IOIO_SIZE_SHIFT 4 | ||
106 | #define SVM_IOIO_ASIZE_SHIFT 7 | ||
107 | |||
108 | #define SVM_IOIO_TYPE_MASK 1 | ||
109 | #define SVM_IOIO_STR_MASK (1 << SVM_IOIO_STR_SHIFT) | ||
110 | #define SVM_IOIO_REP_MASK (1 << SVM_IOIO_REP_SHIFT) | ||
111 | #define SVM_IOIO_SIZE_MASK (7 << SVM_IOIO_SIZE_SHIFT) | ||
112 | #define SVM_IOIO_ASIZE_MASK (7 << SVM_IOIO_ASIZE_SHIFT) | ||
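These masks describe the IOIO intercept information the CPU leaves in EXITINFO1: bit 0 is the direction (1 = IN, 0 = OUT), bit 2 flags string instructions, bit 3 a REP prefix and bits 6:4 the operand-size flags; the port number sits in the upper 16 bits of the same field. A standalone decoding sketch with a made-up sample value:

#include <stdio.h>
#include <stdint.h>

#define SVM_IOIO_TYPE_MASK	1u		/* 1 = IN, 0 = OUT */
#define SVM_IOIO_STR_MASK	(1u << 2)	/* INS/OUTS */
#define SVM_IOIO_REP_MASK	(1u << 3)	/* REP prefix */
#define SVM_IOIO_SIZE_MASK	(7u << 4)	/* operand-size flags */

int main(void)
{
	uint32_t io_info = 0x03f80011;	/* made up: IN, byte, port 0x3f8 */

	printf("dir=%s str=%u rep=%u size_bits=%#x port=%#x\n",
	       (io_info & SVM_IOIO_TYPE_MASK) ? "in" : "out",
	       !!(io_info & SVM_IOIO_STR_MASK),
	       !!(io_info & SVM_IOIO_REP_MASK),
	       (io_info & SVM_IOIO_SIZE_MASK) >> 4,
	       io_info >> 16);
	return 0;
}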
113 | |||
114 | struct __attribute__ ((__packed__)) vmcb_seg { | ||
115 | u16 selector; | ||
116 | u16 attrib; | ||
117 | u32 limit; | ||
118 | u64 base; | ||
119 | }; | ||
120 | |||
121 | struct __attribute__ ((__packed__)) vmcb_save_area { | ||
122 | struct vmcb_seg es; | ||
123 | struct vmcb_seg cs; | ||
124 | struct vmcb_seg ss; | ||
125 | struct vmcb_seg ds; | ||
126 | struct vmcb_seg fs; | ||
127 | struct vmcb_seg gs; | ||
128 | struct vmcb_seg gdtr; | ||
129 | struct vmcb_seg ldtr; | ||
130 | struct vmcb_seg idtr; | ||
131 | struct vmcb_seg tr; | ||
132 | u8 reserved_1[43]; | ||
133 | u8 cpl; | ||
134 | u8 reserved_2[4]; | ||
135 | u64 efer; | ||
136 | u8 reserved_3[112]; | ||
137 | u64 cr4; | ||
138 | u64 cr3; | ||
139 | u64 cr0; | ||
140 | u64 dr7; | ||
141 | u64 dr6; | ||
142 | u64 rflags; | ||
143 | u64 rip; | ||
144 | u8 reserved_4[88]; | ||
145 | u64 rsp; | ||
146 | u8 reserved_5[24]; | ||
147 | u64 rax; | ||
148 | u64 star; | ||
149 | u64 lstar; | ||
150 | u64 cstar; | ||
151 | u64 sfmask; | ||
152 | u64 kernel_gs_base; | ||
153 | u64 sysenter_cs; | ||
154 | u64 sysenter_esp; | ||
155 | u64 sysenter_eip; | ||
156 | u64 cr2; | ||
157 | u8 reserved_6[32]; | ||
158 | u64 g_pat; | ||
159 | u64 dbgctl; | ||
160 | u64 br_from; | ||
161 | u64 br_to; | ||
162 | u64 last_excp_from; | ||
163 | u64 last_excp_to; | ||
164 | }; | ||
165 | |||
166 | struct __attribute__ ((__packed__)) vmcb { | ||
167 | struct vmcb_control_area control; | ||
168 | struct vmcb_save_area save; | ||
169 | }; | ||
170 | |||
171 | #define SVM_CPUID_FEATURE_SHIFT 2 | ||
172 | #define SVM_CPUID_FUNC 0x8000000a | ||
173 | |||
174 | #define MSR_EFER_SVME_MASK (1ULL << 12) | ||
175 | #define MSR_VM_HSAVE_PA 0xc0010117ULL | ||
176 | |||
177 | #define SVM_SELECTOR_S_SHIFT 4 | ||
178 | #define SVM_SELECTOR_DPL_SHIFT 5 | ||
179 | #define SVM_SELECTOR_P_SHIFT 7 | ||
180 | #define SVM_SELECTOR_AVL_SHIFT 8 | ||
181 | #define SVM_SELECTOR_L_SHIFT 9 | ||
182 | #define SVM_SELECTOR_DB_SHIFT 10 | ||
183 | #define SVM_SELECTOR_G_SHIFT 11 | ||
184 | |||
185 | #define SVM_SELECTOR_TYPE_MASK (0xf) | ||
186 | #define SVM_SELECTOR_S_MASK (1 << SVM_SELECTOR_S_SHIFT) | ||
187 | #define SVM_SELECTOR_DPL_MASK (3 << SVM_SELECTOR_DPL_SHIFT) | ||
188 | #define SVM_SELECTOR_P_MASK (1 << SVM_SELECTOR_P_SHIFT) | ||
189 | #define SVM_SELECTOR_AVL_MASK (1 << SVM_SELECTOR_AVL_SHIFT) | ||
190 | #define SVM_SELECTOR_L_MASK (1 << SVM_SELECTOR_L_SHIFT) | ||
191 | #define SVM_SELECTOR_DB_MASK (1 << SVM_SELECTOR_DB_SHIFT) | ||
192 | #define SVM_SELECTOR_G_MASK (1 << SVM_SELECTOR_G_SHIFT) | ||
193 | |||
194 | #define SVM_SELECTOR_WRITE_MASK (1 << 1) | ||
195 | #define SVM_SELECTOR_READ_MASK SVM_SELECTOR_WRITE_MASK | ||
196 | #define SVM_SELECTOR_CODE_MASK (1 << 3) | ||
197 | |||
198 | #define INTERCEPT_CR0_MASK 1 | ||
199 | #define INTERCEPT_CR3_MASK (1 << 3) | ||
200 | #define INTERCEPT_CR4_MASK (1 << 4) | ||
201 | |||
202 | #define INTERCEPT_DR0_MASK 1 | ||
203 | #define INTERCEPT_DR1_MASK (1 << 1) | ||
204 | #define INTERCEPT_DR2_MASK (1 << 2) | ||
205 | #define INTERCEPT_DR3_MASK (1 << 3) | ||
206 | #define INTERCEPT_DR4_MASK (1 << 4) | ||
207 | #define INTERCEPT_DR5_MASK (1 << 5) | ||
208 | #define INTERCEPT_DR6_MASK (1 << 6) | ||
209 | #define INTERCEPT_DR7_MASK (1 << 7) | ||
210 | |||
211 | #define SVM_EVTINJ_VEC_MASK 0xff | ||
212 | |||
213 | #define SVM_EVTINJ_TYPE_SHIFT 8 | ||
214 | #define SVM_EVTINJ_TYPE_MASK (7 << SVM_EVTINJ_TYPE_SHIFT) | ||
215 | |||
216 | #define SVM_EVTINJ_TYPE_INTR (0 << SVM_EVTINJ_TYPE_SHIFT) | ||
217 | #define SVM_EVTINJ_TYPE_NMI (2 << SVM_EVTINJ_TYPE_SHIFT) | ||
218 | #define SVM_EVTINJ_TYPE_EXEPT (3 << SVM_EVTINJ_TYPE_SHIFT) | ||
219 | #define SVM_EVTINJ_TYPE_SOFT (4 << SVM_EVTINJ_TYPE_SHIFT) | ||
220 | |||
221 | #define SVM_EVTINJ_VALID (1 << 31) | ||
222 | #define SVM_EVTINJ_VALID_ERR (1 << 11) | ||
223 | |||
224 | #define SVM_EXITINTINFO_VEC_MASK SVM_EVTINJ_VEC_MASK | ||
225 | |||
226 | #define SVM_EXITINTINFO_TYPE_INTR SVM_EVTINJ_TYPE_INTR | ||
227 | #define SVM_EXITINTINFO_TYPE_NMI SVM_EVTINJ_TYPE_NMI | ||
228 | #define SVM_EXITINTINFO_TYPE_EXEPT SVM_EVTINJ_TYPE_EXEPT | ||
229 | #define SVM_EXITINTINFO_TYPE_SOFT SVM_EVTINJ_TYPE_SOFT | ||
230 | |||
231 | #define SVM_EXITINTINFO_VALID SVM_EVTINJ_VALID | ||
232 | #define SVM_EXITINTINFO_VALID_ERR SVM_EVTINJ_VALID_ERR | ||
233 | |||
234 | #define SVM_EXIT_READ_CR0 0x000 | ||
235 | #define SVM_EXIT_READ_CR3 0x003 | ||
236 | #define SVM_EXIT_READ_CR4 0x004 | ||
237 | #define SVM_EXIT_READ_CR8 0x008 | ||
238 | #define SVM_EXIT_WRITE_CR0 0x010 | ||
239 | #define SVM_EXIT_WRITE_CR3 0x013 | ||
240 | #define SVM_EXIT_WRITE_CR4 0x014 | ||
241 | #define SVM_EXIT_WRITE_CR8 0x018 | ||
242 | #define SVM_EXIT_READ_DR0 0x020 | ||
243 | #define SVM_EXIT_READ_DR1 0x021 | ||
244 | #define SVM_EXIT_READ_DR2 0x022 | ||
245 | #define SVM_EXIT_READ_DR3 0x023 | ||
246 | #define SVM_EXIT_READ_DR4 0x024 | ||
247 | #define SVM_EXIT_READ_DR5 0x025 | ||
248 | #define SVM_EXIT_READ_DR6 0x026 | ||
249 | #define SVM_EXIT_READ_DR7 0x027 | ||
250 | #define SVM_EXIT_WRITE_DR0 0x030 | ||
251 | #define SVM_EXIT_WRITE_DR1 0x031 | ||
252 | #define SVM_EXIT_WRITE_DR2 0x032 | ||
253 | #define SVM_EXIT_WRITE_DR3 0x033 | ||
254 | #define SVM_EXIT_WRITE_DR4 0x034 | ||
255 | #define SVM_EXIT_WRITE_DR5 0x035 | ||
256 | #define SVM_EXIT_WRITE_DR6 0x036 | ||
257 | #define SVM_EXIT_WRITE_DR7 0x037 | ||
258 | #define SVM_EXIT_EXCP_BASE 0x040 | ||
259 | #define SVM_EXIT_INTR 0x060 | ||
260 | #define SVM_EXIT_NMI 0x061 | ||
261 | #define SVM_EXIT_SMI 0x062 | ||
262 | #define SVM_EXIT_INIT 0x063 | ||
263 | #define SVM_EXIT_VINTR 0x064 | ||
264 | #define SVM_EXIT_CR0_SEL_WRITE 0x065 | ||
265 | #define SVM_EXIT_IDTR_READ 0x066 | ||
266 | #define SVM_EXIT_GDTR_READ 0x067 | ||
267 | #define SVM_EXIT_LDTR_READ 0x068 | ||
268 | #define SVM_EXIT_TR_READ 0x069 | ||
269 | #define SVM_EXIT_IDTR_WRITE 0x06a | ||
270 | #define SVM_EXIT_GDTR_WRITE 0x06b | ||
271 | #define SVM_EXIT_LDTR_WRITE 0x06c | ||
272 | #define SVM_EXIT_TR_WRITE 0x06d | ||
273 | #define SVM_EXIT_RDTSC 0x06e | ||
274 | #define SVM_EXIT_RDPMC 0x06f | ||
275 | #define SVM_EXIT_PUSHF 0x070 | ||
276 | #define SVM_EXIT_POPF 0x071 | ||
277 | #define SVM_EXIT_CPUID 0x072 | ||
278 | #define SVM_EXIT_RSM 0x073 | ||
279 | #define SVM_EXIT_IRET 0x074 | ||
280 | #define SVM_EXIT_SWINT 0x075 | ||
281 | #define SVM_EXIT_INVD 0x076 | ||
282 | #define SVM_EXIT_PAUSE 0x077 | ||
283 | #define SVM_EXIT_HLT 0x078 | ||
284 | #define SVM_EXIT_INVLPG 0x079 | ||
285 | #define SVM_EXIT_INVLPGA 0x07a | ||
286 | #define SVM_EXIT_IOIO 0x07b | ||
287 | #define SVM_EXIT_MSR 0x07c | ||
288 | #define SVM_EXIT_TASK_SWITCH 0x07d | ||
289 | #define SVM_EXIT_FERR_FREEZE 0x07e | ||
290 | #define SVM_EXIT_SHUTDOWN 0x07f | ||
291 | #define SVM_EXIT_VMRUN 0x080 | ||
292 | #define SVM_EXIT_VMMCALL 0x081 | ||
293 | #define SVM_EXIT_VMLOAD 0x082 | ||
294 | #define SVM_EXIT_VMSAVE 0x083 | ||
295 | #define SVM_EXIT_STGI 0x084 | ||
296 | #define SVM_EXIT_CLGI 0x085 | ||
297 | #define SVM_EXIT_SKINIT 0x086 | ||
298 | #define SVM_EXIT_RDTSCP 0x087 | ||
299 | #define SVM_EXIT_ICEBP 0x088 | ||
300 | #define SVM_EXIT_WBINVD 0x089 | ||
301 | #define SVM_EXIT_NPF 0x400 | ||
302 | |||
303 | #define SVM_EXIT_ERR -1 | ||
304 | |||
305 | #define SVM_CR0_SELECTIVE_MASK (1 << 3 | 1) /* TS and MP */ | ||
306 | |||
307 | #define SVM_VMLOAD ".byte 0x0f, 0x01, 0xda" | ||
308 | #define SVM_VMRUN ".byte 0x0f, 0x01, 0xd8" | ||
309 | #define SVM_VMSAVE ".byte 0x0f, 0x01, 0xdb" | ||
310 | #define SVM_CLGI ".byte 0x0f, 0x01, 0xdd" | ||
311 | #define SVM_STGI ".byte 0x0f, 0x01, 0xdc" | ||
312 | #define SVM_INVLPGA ".byte 0x0f, 0x01, 0xdf" | ||
313 | |||
314 | #endif | ||
315 | |||
diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c new file mode 100644 index 000000000000..bda7a7ae2167 --- /dev/null +++ b/drivers/kvm/vmx.c | |||
@@ -0,0 +1,2002 @@ | |||
1 | /* | ||
2 | * Kernel-based Virtual Machine driver for Linux | ||
3 | * | ||
4 | * This module enables machines with Intel VT-x extensions to run virtual | ||
5 | * machines without emulation or binary translation. | ||
6 | * | ||
7 | * Copyright (C) 2006 Qumranet, Inc. | ||
8 | * | ||
9 | * Authors: | ||
10 | * Avi Kivity <avi@qumranet.com> | ||
11 | * Yaniv Kamay <yaniv@qumranet.com> | ||
12 | * | ||
13 | * This work is licensed under the terms of the GNU GPL, version 2. See | ||
14 | * the COPYING file in the top-level directory. | ||
15 | * | ||
16 | */ | ||
17 | |||
18 | #include "kvm.h" | ||
19 | #include "vmx.h" | ||
20 | #include "kvm_vmx.h" | ||
21 | #include <linux/module.h> | ||
22 | #include <linux/mm.h> | ||
23 | #include <linux/highmem.h> | ||
24 | #include <asm/io.h> | ||
25 | |||
26 | #include "segment_descriptor.h" | ||
27 | |||
28 | #define MSR_IA32_FEATURE_CONTROL 0x03a | ||
29 | |||
30 | MODULE_AUTHOR("Qumranet"); | ||
31 | MODULE_LICENSE("GPL"); | ||
32 | |||
33 | static DEFINE_PER_CPU(struct vmcs *, vmxarea); | ||
34 | static DEFINE_PER_CPU(struct vmcs *, current_vmcs); | ||
35 | |||
36 | #ifdef __x86_64__ | ||
37 | #define HOST_IS_64 1 | ||
38 | #else | ||
39 | #define HOST_IS_64 0 | ||
40 | #endif | ||
41 | |||
42 | static struct vmcs_descriptor { | ||
43 | int size; | ||
44 | int order; | ||
45 | u32 revision_id; | ||
46 | } vmcs_descriptor; | ||
47 | |||
48 | #define VMX_SEGMENT_FIELD(seg) \ | ||
49 | [VCPU_SREG_##seg] = { \ | ||
50 | .selector = GUEST_##seg##_SELECTOR, \ | ||
51 | .base = GUEST_##seg##_BASE, \ | ||
52 | .limit = GUEST_##seg##_LIMIT, \ | ||
53 | .ar_bytes = GUEST_##seg##_AR_BYTES, \ | ||
54 | } | ||
55 | |||
56 | static struct kvm_vmx_segment_field { | ||
57 | unsigned selector; | ||
58 | unsigned base; | ||
59 | unsigned limit; | ||
60 | unsigned ar_bytes; | ||
61 | } kvm_vmx_segment_fields[] = { | ||
62 | VMX_SEGMENT_FIELD(CS), | ||
63 | VMX_SEGMENT_FIELD(DS), | ||
64 | VMX_SEGMENT_FIELD(ES), | ||
65 | VMX_SEGMENT_FIELD(FS), | ||
66 | VMX_SEGMENT_FIELD(GS), | ||
67 | VMX_SEGMENT_FIELD(SS), | ||
68 | VMX_SEGMENT_FIELD(TR), | ||
69 | VMX_SEGMENT_FIELD(LDTR), | ||
70 | }; | ||
71 | |||
72 | static const u32 vmx_msr_index[] = { | ||
73 | #ifdef __x86_64__ | ||
74 | MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, MSR_KERNEL_GS_BASE, | ||
75 | #endif | ||
76 | MSR_EFER, MSR_K6_STAR, | ||
77 | }; | ||
78 | #define NR_VMX_MSR (sizeof(vmx_msr_index) / sizeof(*vmx_msr_index)) | ||
79 | |||
80 | struct vmx_msr_entry *find_msr_entry(struct kvm_vcpu *vcpu, u32 msr); | ||
81 | |||
82 | static inline int is_page_fault(u32 intr_info) | ||
83 | { | ||
84 | return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | | ||
85 | INTR_INFO_VALID_MASK)) == | ||
86 | (INTR_TYPE_EXCEPTION | PF_VECTOR | INTR_INFO_VALID_MASK); | ||
87 | } | ||
88 | |||
89 | static inline int is_external_interrupt(u32 intr_info) | ||
90 | { | ||
91 | return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK)) | ||
92 | == (INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK); | ||
93 | } | ||
94 | |||
95 | static void vmcs_clear(struct vmcs *vmcs) | ||
96 | { | ||
97 | u64 phys_addr = __pa(vmcs); | ||
98 | u8 error; | ||
99 | |||
100 | asm volatile (ASM_VMX_VMCLEAR_RAX "; setna %0" | ||
101 | : "=g"(error) : "a"(&phys_addr), "m"(phys_addr) | ||
102 | : "cc", "memory"); | ||
103 | if (error) | ||
104 | printk(KERN_ERR "kvm: vmclear fail: %p/%llx\n", | ||
105 | vmcs, phys_addr); | ||
106 | } | ||
107 | |||
108 | static void __vcpu_clear(void *arg) | ||
109 | { | ||
110 | struct kvm_vcpu *vcpu = arg; | ||
111 | int cpu = smp_processor_id(); | ||
112 | |||
113 | if (vcpu->cpu == cpu) | ||
114 | vmcs_clear(vcpu->vmcs); | ||
115 | if (per_cpu(current_vmcs, cpu) == vcpu->vmcs) | ||
116 | per_cpu(current_vmcs, cpu) = NULL; | ||
117 | } | ||
118 | |||
119 | static unsigned long vmcs_readl(unsigned long field) | ||
120 | { | ||
121 | unsigned long value; | ||
122 | |||
123 | asm volatile (ASM_VMX_VMREAD_RDX_RAX | ||
124 | : "=a"(value) : "d"(field) : "cc"); | ||
125 | return value; | ||
126 | } | ||
127 | |||
128 | static u16 vmcs_read16(unsigned long field) | ||
129 | { | ||
130 | return vmcs_readl(field); | ||
131 | } | ||
132 | |||
133 | static u32 vmcs_read32(unsigned long field) | ||
134 | { | ||
135 | return vmcs_readl(field); | ||
136 | } | ||
137 | |||
138 | static u64 vmcs_read64(unsigned long field) | ||
139 | { | ||
140 | #ifdef __x86_64__ | ||
141 | return vmcs_readl(field); | ||
142 | #else | ||
143 | return vmcs_readl(field) | ((u64)vmcs_readl(field+1) << 32); | ||
144 | #endif | ||
145 | } | ||
146 | |||
147 | static void vmcs_writel(unsigned long field, unsigned long value) | ||
148 | { | ||
149 | u8 error; | ||
150 | |||
151 | asm volatile (ASM_VMX_VMWRITE_RAX_RDX "; setna %0" | ||
152 | : "=q"(error) : "a"(value), "d"(field) : "cc" ); | ||
153 | if (error) | ||
154 | printk(KERN_ERR "vmwrite error: reg %lx value %lx (err %d)\n", | ||
155 | field, value, vmcs_read32(VM_INSTRUCTION_ERROR)); | ||
156 | } | ||
157 | |||
158 | static void vmcs_write16(unsigned long field, u16 value) | ||
159 | { | ||
160 | vmcs_writel(field, value); | ||
161 | } | ||
162 | |||
163 | static void vmcs_write32(unsigned long field, u32 value) | ||
164 | { | ||
165 | vmcs_writel(field, value); | ||
166 | } | ||
167 | |||
168 | static void vmcs_write64(unsigned long field, u64 value) | ||
169 | { | ||
170 | #ifdef __x86_64__ | ||
171 | vmcs_writel(field, value); | ||
172 | #else | ||
173 | vmcs_writel(field, value); | ||
174 | asm volatile (""); | ||
175 | vmcs_writel(field+1, value >> 32); | ||
176 | #endif | ||
177 | } | ||
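These accessors wrap VMREAD/VMWRITE so the rest of the file can treat VMCS fields like ordinary variables; the usual idiom is a read-modify-write of a control field, as enter_lmode() and vmx_set_efer() do further down with VM_ENTRY_CONTROLS. A minimal sketch of that idiom in terms of the accessors above (the helper names are hypothetical, not part of this file):

static void vmcs_set_bits32(unsigned long field, u32 set)
{
	vmcs_write32(field, vmcs_read32(field) | set);
}

static void vmcs_clear_bits32(unsigned long field, u32 clear)
{
	vmcs_write32(field, vmcs_read32(field) & ~clear);
}

/* e.g. vmcs_set_bits32(VM_ENTRY_CONTROLS, VM_ENTRY_CONTROLS_IA32E_MASK); */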
178 | |||
179 | /* | ||
180 | * Switches to the specified vcpu, until a matching vcpu_put(); assumes | ||
181 | * the vcpu mutex is already taken. | ||
182 | */ | ||
183 | static struct kvm_vcpu *vmx_vcpu_load(struct kvm_vcpu *vcpu) | ||
184 | { | ||
185 | u64 phys_addr = __pa(vcpu->vmcs); | ||
186 | int cpu; | ||
187 | |||
188 | cpu = get_cpu(); | ||
189 | |||
190 | if (vcpu->cpu != cpu) { | ||
191 | smp_call_function(__vcpu_clear, vcpu, 0, 1); | ||
192 | vcpu->launched = 0; | ||
193 | } | ||
194 | |||
195 | if (per_cpu(current_vmcs, cpu) != vcpu->vmcs) { | ||
196 | u8 error; | ||
197 | |||
198 | per_cpu(current_vmcs, cpu) = vcpu->vmcs; | ||
199 | asm volatile (ASM_VMX_VMPTRLD_RAX "; setna %0" | ||
200 | : "=g"(error) : "a"(&phys_addr), "m"(phys_addr) | ||
201 | : "cc"); | ||
202 | if (error) | ||
203 | printk(KERN_ERR "kvm: vmptrld %p/%llx fail\n", | ||
204 | vcpu->vmcs, phys_addr); | ||
205 | } | ||
206 | |||
207 | if (vcpu->cpu != cpu) { | ||
208 | struct descriptor_table dt; | ||
209 | unsigned long sysenter_esp; | ||
210 | |||
211 | vcpu->cpu = cpu; | ||
212 | /* | ||
213 | * Linux uses per-cpu TSS and GDT, so set these when switching | ||
214 | * processors. | ||
215 | */ | ||
216 | vmcs_writel(HOST_TR_BASE, read_tr_base()); /* 22.2.4 */ | ||
217 | get_gdt(&dt); | ||
218 | vmcs_writel(HOST_GDTR_BASE, dt.base); /* 22.2.4 */ | ||
219 | |||
220 | rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp); | ||
221 | vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */ | ||
222 | } | ||
223 | return vcpu; | ||
224 | } | ||
225 | |||
226 | static void vmx_vcpu_put(struct kvm_vcpu *vcpu) | ||
227 | { | ||
228 | put_cpu(); | ||
229 | } | ||
230 | |||
231 | static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) | ||
232 | { | ||
233 | return vmcs_readl(GUEST_RFLAGS); | ||
234 | } | ||
235 | |||
236 | static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) | ||
237 | { | ||
238 | vmcs_writel(GUEST_RFLAGS, rflags); | ||
239 | } | ||
240 | |||
241 | static void skip_emulated_instruction(struct kvm_vcpu *vcpu) | ||
242 | { | ||
243 | unsigned long rip; | ||
244 | u32 interruptibility; | ||
245 | |||
246 | rip = vmcs_readl(GUEST_RIP); | ||
247 | rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN); | ||
248 | vmcs_writel(GUEST_RIP, rip); | ||
249 | |||
250 | /* | ||
251 | * We emulated an instruction, so temporary interrupt blocking | ||
252 | * should be removed, if set. | ||
253 | */ | ||
254 | interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); | ||
255 | if (interruptibility & 3) | ||
256 | vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, | ||
257 | interruptibility & ~3); | ||
258 | } | ||
259 | |||
260 | static void vmx_inject_gp(struct kvm_vcpu *vcpu, unsigned error_code) | ||
261 | { | ||
262 | printk(KERN_DEBUG "inject_general_protection: rip 0x%lx\n", | ||
263 | vmcs_readl(GUEST_RIP)); | ||
264 | vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code); | ||
265 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, | ||
266 | GP_VECTOR | | ||
267 | INTR_TYPE_EXCEPTION | | ||
268 | INTR_INFO_DELIEVER_CODE_MASK | | ||
269 | INTR_INFO_VALID_MASK); | ||
270 | } | ||
271 | |||
272 | /* | ||
273 | * reads and returns guest's timestamp counter "register" | ||
274 | * guest_tsc = host_tsc + tsc_offset -- 21.3 | ||
275 | */ | ||
276 | static u64 guest_read_tsc(void) | ||
277 | { | ||
278 | u64 host_tsc, tsc_offset; | ||
279 | |||
280 | rdtscll(host_tsc); | ||
281 | tsc_offset = vmcs_read64(TSC_OFFSET); | ||
282 | return host_tsc + tsc_offset; | ||
283 | } | ||
284 | |||
285 | /* | ||
286 | * writes 'guest_tsc' into guest's timestamp counter "register" | ||
287 | * guest_tsc = host_tsc + tsc_offset ==> tsc_offset = guest_tsc - host_tsc | ||
288 | */ | ||
289 | static void guest_write_tsc(u64 guest_tsc) | ||
290 | { | ||
291 | u64 host_tsc; | ||
292 | |||
293 | rdtscll(host_tsc); | ||
294 | vmcs_write64(TSC_OFFSET, guest_tsc - host_tsc); | ||
295 | } | ||
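The pair above implements the relation stated in the comments: hardware reports guest_tsc = host_tsc + TSC_OFFSET, so making the guest observe a particular value just means programming TSC_OFFSET = guest_tsc - host_tsc; the subtraction wraps modulo 2^64, so it also works when the guest value is behind the host's. A standalone sketch of the arithmetic:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t host_tsc   = 5000000000ull;	/* pretend RDTSC result */
	uint64_t guest_tsc  = 1000ull;		/* value the guest should see */
	uint64_t tsc_offset = guest_tsc - host_tsc;	/* wraps mod 2^64 */

	/* what the guest's RDTSC now returns with this offset programmed */
	printf("%llu\n", (unsigned long long)(host_tsc + tsc_offset));
	return 0;
}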
296 | |||
297 | static void reload_tss(void) | ||
298 | { | ||
299 | #ifndef __x86_64__ | ||
300 | |||
301 | /* | ||
302 | * VT restores TR but not its size. Useless. | ||
303 | */ | ||
304 | struct descriptor_table gdt; | ||
305 | struct segment_descriptor *descs; | ||
306 | |||
307 | get_gdt(&gdt); | ||
308 | descs = (void *)gdt.base; | ||
309 | descs[GDT_ENTRY_TSS].type = 9; /* available TSS */ | ||
310 | load_TR_desc(); | ||
311 | #endif | ||
312 | } | ||
313 | |||
314 | /* | ||
315 | * Reads an msr value (of 'msr_index') into 'pdata'. | ||
316 | * Returns 0 on success, non-0 otherwise. | ||
317 | * Assumes vcpu_load() was already called. | ||
318 | */ | ||
319 | static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) | ||
320 | { | ||
321 | u64 data; | ||
322 | struct vmx_msr_entry *msr; | ||
323 | |||
324 | if (!pdata) { | ||
325 | printk(KERN_ERR "BUG: get_msr called with NULL pdata\n"); | ||
326 | return -EINVAL; | ||
327 | } | ||
328 | |||
329 | switch (msr_index) { | ||
330 | #ifdef __x86_64__ | ||
331 | case MSR_FS_BASE: | ||
332 | data = vmcs_readl(GUEST_FS_BASE); | ||
333 | break; | ||
334 | case MSR_GS_BASE: | ||
335 | data = vmcs_readl(GUEST_GS_BASE); | ||
336 | break; | ||
337 | case MSR_EFER: | ||
338 | data = vcpu->shadow_efer; | ||
339 | break; | ||
340 | #endif | ||
341 | case MSR_IA32_TIME_STAMP_COUNTER: | ||
342 | data = guest_read_tsc(); | ||
343 | break; | ||
344 | case MSR_IA32_SYSENTER_CS: | ||
345 | data = vmcs_read32(GUEST_SYSENTER_CS); | ||
346 | break; | ||
347 | case MSR_IA32_SYSENTER_EIP: | ||
348 | data = vmcs_read32(GUEST_SYSENTER_EIP); | ||
349 | break; | ||
350 | case MSR_IA32_SYSENTER_ESP: | ||
351 | data = vmcs_read32(GUEST_SYSENTER_ESP); | ||
352 | break; | ||
353 | case MSR_IA32_MC0_CTL: | ||
354 | case MSR_IA32_MCG_STATUS: | ||
355 | case MSR_IA32_MCG_CAP: | ||
356 | case MSR_IA32_MC0_MISC: | ||
357 | case MSR_IA32_MC0_MISC+4: | ||
358 | case MSR_IA32_MC0_MISC+8: | ||
359 | case MSR_IA32_MC0_MISC+12: | ||
360 | case MSR_IA32_MC0_MISC+16: | ||
361 | case MSR_IA32_UCODE_REV: | ||
362 | /* MTRR registers */ | ||
363 | case 0xfe: | ||
364 | case 0x200 ... 0x2ff: | ||
365 | data = 0; | ||
366 | break; | ||
367 | case MSR_IA32_APICBASE: | ||
368 | data = vcpu->apic_base; | ||
369 | break; | ||
370 | default: | ||
371 | msr = find_msr_entry(vcpu, msr_index); | ||
372 | if (!msr) { | ||
373 | printk(KERN_ERR "kvm: unhandled rdmsr: %x\n", msr_index); | ||
374 | return 1; | ||
375 | } | ||
376 | data = msr->data; | ||
377 | break; | ||
378 | } | ||
379 | |||
380 | *pdata = data; | ||
381 | return 0; | ||
382 | } | ||
383 | |||
384 | /* | ||
385 | * Writes msr value into the appropriate "register". | ||
386 | * Returns 0 on success, non-0 otherwise. | ||
387 | * Assumes vcpu_load() was already called. | ||
388 | */ | ||
389 | static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) | ||
390 | { | ||
391 | struct vmx_msr_entry *msr; | ||
392 | switch (msr_index) { | ||
393 | #ifdef __x86_64__ | ||
394 | case MSR_FS_BASE: | ||
395 | vmcs_writel(GUEST_FS_BASE, data); | ||
396 | break; | ||
397 | case MSR_GS_BASE: | ||
398 | vmcs_writel(GUEST_GS_BASE, data); | ||
399 | break; | ||
400 | #endif | ||
401 | case MSR_IA32_SYSENTER_CS: | ||
402 | vmcs_write32(GUEST_SYSENTER_CS, data); | ||
403 | break; | ||
404 | case MSR_IA32_SYSENTER_EIP: | ||
405 | vmcs_write32(GUEST_SYSENTER_EIP, data); | ||
406 | break; | ||
407 | case MSR_IA32_SYSENTER_ESP: | ||
408 | vmcs_write32(GUEST_SYSENTER_ESP, data); | ||
409 | break; | ||
410 | #ifdef __x86_64__ | ||
411 | case MSR_EFER: | ||
412 | set_efer(vcpu, data); | ||
413 | break; | ||
414 | case MSR_IA32_MC0_STATUS: | ||
415 | printk(KERN_WARNING "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n" | ||
416 | , __FUNCTION__, data); | ||
417 | break; | ||
418 | #endif | ||
419 | case MSR_IA32_TIME_STAMP_COUNTER: { | ||
420 | guest_write_tsc(data); | ||
421 | break; | ||
422 | } | ||
423 | case MSR_IA32_UCODE_REV: | ||
424 | case MSR_IA32_UCODE_WRITE: | ||
425 | case 0x200 ... 0x2ff: /* MTRRs */ | ||
426 | break; | ||
427 | case MSR_IA32_APICBASE: | ||
428 | vcpu->apic_base = data; | ||
429 | break; | ||
430 | default: | ||
431 | msr = find_msr_entry(vcpu, msr_index); | ||
432 | if (!msr) { | ||
433 | printk(KERN_ERR "kvm: unhandled wrmsr: 0x%x\n", msr_index); | ||
434 | return 1; | ||
435 | } | ||
436 | msr->data = data; | ||
437 | break; | ||
438 | } | ||
439 | |||
440 | return 0; | ||
441 | } | ||
442 | |||
443 | /* | ||
444 | * Sync the rsp and rip registers into the vcpu structure. This allows | ||
445 | * registers to be accessed by indexing vcpu->regs. | ||
446 | */ | ||
447 | static void vcpu_load_rsp_rip(struct kvm_vcpu *vcpu) | ||
448 | { | ||
449 | vcpu->regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP); | ||
450 | vcpu->rip = vmcs_readl(GUEST_RIP); | ||
451 | } | ||
452 | |||
453 | /* | ||
454 | * Syncs rsp and rip back into the vmcs. Should be called after possible | ||
455 | * modification. | ||
456 | */ | ||
457 | static void vcpu_put_rsp_rip(struct kvm_vcpu *vcpu) | ||
458 | { | ||
459 | vmcs_writel(GUEST_RSP, vcpu->regs[VCPU_REGS_RSP]); | ||
460 | vmcs_writel(GUEST_RIP, vcpu->rip); | ||
461 | } | ||
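Exit handlers that need to touch RSP or RIP bracket their work with these two helpers: pull the registers out of the VMCS, modify vcpu->regs[VCPU_REGS_RSP] or vcpu->rip, then write the result back. A sketch of the pattern (the function name and insn_len parameter are hypothetical):

static void demo_skip_instruction(struct kvm_vcpu *vcpu, int insn_len)
{
	vcpu_load_rsp_rip(vcpu);	/* GUEST_RSP/GUEST_RIP -> vcpu */
	vcpu->rip += insn_len;		/* step past the emulated instruction */
	vcpu_put_rsp_rip(vcpu);		/* vcpu -> GUEST_RSP/GUEST_RIP */
}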
462 | |||
463 | static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg) | ||
464 | { | ||
465 | unsigned long dr7 = 0x400; | ||
466 | u32 exception_bitmap; | ||
467 | int old_singlestep; | ||
468 | |||
469 | exception_bitmap = vmcs_read32(EXCEPTION_BITMAP); | ||
470 | old_singlestep = vcpu->guest_debug.singlestep; | ||
471 | |||
472 | vcpu->guest_debug.enabled = dbg->enabled; | ||
473 | if (vcpu->guest_debug.enabled) { | ||
474 | int i; | ||
475 | |||
476 | dr7 |= 0x200; /* exact */ | ||
477 | for (i = 0; i < 4; ++i) { | ||
478 | if (!dbg->breakpoints[i].enabled) | ||
479 | continue; | ||
480 | vcpu->guest_debug.bp[i] = dbg->breakpoints[i].address; | ||
481 | dr7 |= 2 << (i*2); /* global enable */ | ||
482 | dr7 |= 0 << (i*4+16); /* execution breakpoint */ | ||
483 | } | ||
484 | |||
485 | exception_bitmap |= (1u << 1); /* Trap debug exceptions */ | ||
486 | |||
487 | vcpu->guest_debug.singlestep = dbg->singlestep; | ||
488 | } else { | ||
489 | exception_bitmap &= ~(1u << 1); /* Ignore debug exceptions */ | ||
490 | vcpu->guest_debug.singlestep = 0; | ||
491 | } | ||
492 | |||
493 | if (old_singlestep && !vcpu->guest_debug.singlestep) { | ||
494 | unsigned long flags; | ||
495 | |||
496 | flags = vmcs_readl(GUEST_RFLAGS); | ||
497 | flags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF); | ||
498 | vmcs_writel(GUEST_RFLAGS, flags); | ||
499 | } | ||
500 | |||
501 | vmcs_write32(EXCEPTION_BITMAP, exception_bitmap); | ||
502 | vmcs_writel(GUEST_DR7, dr7); | ||
503 | |||
504 | return 0; | ||
505 | } | ||
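The DR7 value assembled above starts from the architectural base 0x400, adds 0x200 for exact breakpoint matching and then, for each enabled breakpoint i, sets the global-enable bit with 2 << (i*2) while leaving the condition field at bit i*4+16 zero, i.e. an execution breakpoint. A standalone computation for breakpoints 0 and 2:

#include <stdio.h>

int main(void)
{
	unsigned long dr7 = 0x400;		/* architectural base value */
	int enabled[4] = { 1, 0, 1, 0 };	/* breakpoints 0 and 2 armed */
	int i;

	dr7 |= 0x200;				/* exact matching */
	for (i = 0; i < 4; ++i) {
		if (!enabled[i])
			continue;
		dr7 |= 2 << (i * 2);		/* global enable Gi */
		dr7 |= 0 << (i * 4 + 16);	/* condition 0: execution */
	}
	printf("dr7 = %#lx\n", dr7);		/* prints 0x622 */
	return 0;
}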
506 | |||
507 | static __init int cpu_has_kvm_support(void) | ||
508 | { | ||
509 | unsigned long ecx = cpuid_ecx(1); | ||
510 | return test_bit(5, &ecx); /* CPUID.1:ECX.VMX[bit 5] -> VT */ | ||
511 | } | ||
512 | |||
513 | static __init int vmx_disabled_by_bios(void) | ||
514 | { | ||
515 | u64 msr; | ||
516 | |||
517 | rdmsrl(MSR_IA32_FEATURE_CONTROL, msr); | ||
518 | return (msr & 5) == 1; /* locked but not enabled */ | ||
519 | } | ||
520 | |||
521 | static __init void hardware_enable(void *garbage) | ||
522 | { | ||
523 | int cpu = raw_smp_processor_id(); | ||
524 | u64 phys_addr = __pa(per_cpu(vmxarea, cpu)); | ||
525 | u64 old; | ||
526 | |||
527 | rdmsrl(MSR_IA32_FEATURE_CONTROL, old); | ||
528 | if ((old & 5) == 0) | ||
529 | /* enable and lock */ | ||
530 | wrmsrl(MSR_IA32_FEATURE_CONTROL, old | 5); | ||
531 | write_cr4(read_cr4() | CR4_VMXE); /* FIXME: not cpu hotplug safe */ | ||
532 | asm volatile (ASM_VMX_VMXON_RAX : : "a"(&phys_addr), "m"(phys_addr) | ||
533 | : "memory", "cc"); | ||
534 | } | ||
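Both routines key off the low bits of MSR_IA32_FEATURE_CONTROL: bit 0 is the BIOS lock and bit 2 the VMXON enable, so (msr & 5) == 1 means the BIOS locked the MSR with VMX disabled (VMXON would fault), while (msr & 5) == 0 means it is still open and the driver may set and lock both bits itself. A standalone sketch of the possible states:

#include <stdio.h>
#include <stdint.h>

static const char *feature_control_state(uint64_t msr)
{
	switch (msr & 5) {	/* bit 0: lock, bit 2: enable VMXON */
	case 0:  return "unlocked - driver may enable and lock";
	case 1:  return "locked with VMX disabled by BIOS";
	case 5:  return "locked with VMX enabled";
	default: return "enabled but not yet locked";
	}
}

int main(void)
{
	printf("%s\n", feature_control_state(0));
	printf("%s\n", feature_control_state(1));
	printf("%s\n", feature_control_state(5));
	return 0;
}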
535 | |||
536 | static void hardware_disable(void *garbage) | ||
537 | { | ||
538 | asm volatile (ASM_VMX_VMXOFF : : : "cc"); | ||
539 | } | ||
540 | |||
541 | static __init void setup_vmcs_descriptor(void) | ||
542 | { | ||
543 | u32 vmx_msr_low, vmx_msr_high; | ||
544 | |||
545 | rdmsr(MSR_IA32_VMX_BASIC_MSR, vmx_msr_low, vmx_msr_high); | ||
546 | vmcs_descriptor.size = vmx_msr_high & 0x1fff; | ||
547 | vmcs_descriptor.order = get_order(vmcs_descriptor.size); | ||
548 | vmcs_descriptor.revision_id = vmx_msr_low; | ||
549 | } | ||
550 | |||
551 | static struct vmcs *alloc_vmcs_cpu(int cpu) | ||
552 | { | ||
553 | int node = cpu_to_node(cpu); | ||
554 | struct page *pages; | ||
555 | struct vmcs *vmcs; | ||
556 | |||
557 | pages = alloc_pages_node(node, GFP_KERNEL, vmcs_descriptor.order); | ||
558 | if (!pages) | ||
559 | return NULL; | ||
560 | vmcs = page_address(pages); | ||
561 | memset(vmcs, 0, vmcs_descriptor.size); | ||
562 | vmcs->revision_id = vmcs_descriptor.revision_id; /* vmcs revision id */ | ||
563 | return vmcs; | ||
564 | } | ||
565 | |||
566 | static struct vmcs *alloc_vmcs(void) | ||
567 | { | ||
568 | return alloc_vmcs_cpu(smp_processor_id()); | ||
569 | } | ||
570 | |||
571 | static void free_vmcs(struct vmcs *vmcs) | ||
572 | { | ||
573 | free_pages((unsigned long)vmcs, vmcs_descriptor.order); | ||
574 | } | ||
575 | |||
576 | static __exit void free_kvm_area(void) | ||
577 | { | ||
578 | int cpu; | ||
579 | |||
580 | for_each_online_cpu(cpu) | ||
581 | free_vmcs(per_cpu(vmxarea, cpu)); | ||
582 | } | ||
583 | |||
585 | |||
586 | static __init int alloc_kvm_area(void) | ||
587 | { | ||
588 | int cpu; | ||
589 | |||
590 | for_each_online_cpu(cpu) { | ||
591 | struct vmcs *vmcs; | ||
592 | |||
593 | vmcs = alloc_vmcs_cpu(cpu); | ||
594 | if (!vmcs) { | ||
595 | free_kvm_area(); | ||
596 | return -ENOMEM; | ||
597 | } | ||
598 | |||
599 | per_cpu(vmxarea, cpu) = vmcs; | ||
600 | } | ||
601 | return 0; | ||
602 | } | ||
603 | |||
604 | static __init int hardware_setup(void) | ||
605 | { | ||
606 | setup_vmcs_descriptor(); | ||
607 | return alloc_kvm_area(); | ||
608 | } | ||
609 | |||
610 | static __exit void hardware_unsetup(void) | ||
611 | { | ||
612 | free_kvm_area(); | ||
613 | } | ||
614 | |||
615 | static void update_exception_bitmap(struct kvm_vcpu *vcpu) | ||
616 | { | ||
617 | if (vcpu->rmode.active) | ||
618 | vmcs_write32(EXCEPTION_BITMAP, ~0); | ||
619 | else | ||
620 | vmcs_write32(EXCEPTION_BITMAP, 1 << PF_VECTOR); | ||
621 | } | ||
622 | |||
623 | static void fix_pmode_dataseg(int seg, struct kvm_save_segment *save) | ||
624 | { | ||
625 | struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; | ||
626 | |||
627 | if (vmcs_readl(sf->base) == save->base) { | ||
628 | vmcs_write16(sf->selector, save->selector); | ||
629 | vmcs_writel(sf->base, save->base); | ||
630 | vmcs_write32(sf->limit, save->limit); | ||
631 | vmcs_write32(sf->ar_bytes, save->ar); | ||
632 | } else { | ||
633 | u32 dpl = (vmcs_read16(sf->selector) & SELECTOR_RPL_MASK) | ||
634 | << AR_DPL_SHIFT; | ||
635 | vmcs_write32(sf->ar_bytes, 0x93 | dpl); | ||
636 | } | ||
637 | } | ||
638 | |||
639 | static void enter_pmode(struct kvm_vcpu *vcpu) | ||
640 | { | ||
641 | unsigned long flags; | ||
642 | |||
643 | vcpu->rmode.active = 0; | ||
644 | |||
645 | vmcs_writel(GUEST_TR_BASE, vcpu->rmode.tr.base); | ||
646 | vmcs_write32(GUEST_TR_LIMIT, vcpu->rmode.tr.limit); | ||
647 | vmcs_write32(GUEST_TR_AR_BYTES, vcpu->rmode.tr.ar); | ||
648 | |||
649 | flags = vmcs_readl(GUEST_RFLAGS); | ||
650 | flags &= ~(IOPL_MASK | X86_EFLAGS_VM); | ||
651 | flags |= (vcpu->rmode.save_iopl << IOPL_SHIFT); | ||
652 | vmcs_writel(GUEST_RFLAGS, flags); | ||
653 | |||
654 | vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~CR4_VME_MASK) | | ||
655 | (vmcs_readl(CR4_READ_SHADOW) & CR4_VME_MASK)); | ||
656 | |||
657 | update_exception_bitmap(vcpu); | ||
658 | |||
659 | fix_pmode_dataseg(VCPU_SREG_ES, &vcpu->rmode.es); | ||
660 | fix_pmode_dataseg(VCPU_SREG_DS, &vcpu->rmode.ds); | ||
661 | fix_pmode_dataseg(VCPU_SREG_GS, &vcpu->rmode.gs); | ||
662 | fix_pmode_dataseg(VCPU_SREG_FS, &vcpu->rmode.fs); | ||
663 | |||
664 | vmcs_write16(GUEST_SS_SELECTOR, 0); | ||
665 | vmcs_write32(GUEST_SS_AR_BYTES, 0x93); | ||
666 | |||
667 | vmcs_write16(GUEST_CS_SELECTOR, | ||
668 | vmcs_read16(GUEST_CS_SELECTOR) & ~SELECTOR_RPL_MASK); | ||
669 | vmcs_write32(GUEST_CS_AR_BYTES, 0x9b); | ||
670 | } | ||
671 | |||
672 | static int rmode_tss_base(struct kvm* kvm) | ||
673 | { | ||
674 | gfn_t base_gfn = kvm->memslots[0].base_gfn + kvm->memslots[0].npages - 3; | ||
675 | return base_gfn << PAGE_SHIFT; | ||
676 | } | ||
677 | |||
678 | static void fix_rmode_seg(int seg, struct kvm_save_segment *save) | ||
679 | { | ||
680 | struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; | ||
681 | |||
682 | save->selector = vmcs_read16(sf->selector); | ||
683 | save->base = vmcs_readl(sf->base); | ||
684 | save->limit = vmcs_read32(sf->limit); | ||
685 | save->ar = vmcs_read32(sf->ar_bytes); | ||
686 | vmcs_write16(sf->selector, vmcs_readl(sf->base) >> 4); | ||
687 | vmcs_write32(sf->limit, 0xffff); | ||
688 | vmcs_write32(sf->ar_bytes, 0xf3); | ||
689 | } | ||
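fix_rmode_seg() stashes the protected-mode segment state and reprograms the segment for vm86-style real mode, where the selector is simply the base shifted right by four (base = selector * 16), the limit is 64 KiB and the attribute byte 0xf3 describes a present, writable data segment. A standalone example of the selector/base arithmetic:

#include <stdio.h>

int main(void)
{
	unsigned long base = 0xb8000;		/* e.g. VGA text memory */
	unsigned short selector = base >> 4;	/* 0xb800 */

	/* real mode: linear address = selector * 16 + offset */
	printf("selector=%#x, back to base=%#lx\n",
	       selector, (unsigned long)selector << 4);
	return 0;
}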
690 | |||
691 | static void enter_rmode(struct kvm_vcpu *vcpu) | ||
692 | { | ||
693 | unsigned long flags; | ||
694 | |||
695 | vcpu->rmode.active = 1; | ||
696 | |||
697 | vcpu->rmode.tr.base = vmcs_readl(GUEST_TR_BASE); | ||
698 | vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm)); | ||
699 | |||
700 | vcpu->rmode.tr.limit = vmcs_read32(GUEST_TR_LIMIT); | ||
701 | vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1); | ||
702 | |||
703 | vcpu->rmode.tr.ar = vmcs_read32(GUEST_TR_AR_BYTES); | ||
704 | vmcs_write32(GUEST_TR_AR_BYTES, 0x008b); | ||
705 | |||
706 | flags = vmcs_readl(GUEST_RFLAGS); | ||
707 | vcpu->rmode.save_iopl = (flags & IOPL_MASK) >> IOPL_SHIFT; | ||
708 | |||
709 | flags |= IOPL_MASK | X86_EFLAGS_VM; | ||
710 | |||
711 | vmcs_writel(GUEST_RFLAGS, flags); | ||
712 | vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | CR4_VME_MASK); | ||
713 | update_exception_bitmap(vcpu); | ||
714 | |||
715 | vmcs_write16(GUEST_SS_SELECTOR, vmcs_readl(GUEST_SS_BASE) >> 4); | ||
716 | vmcs_write32(GUEST_SS_LIMIT, 0xffff); | ||
717 | vmcs_write32(GUEST_SS_AR_BYTES, 0xf3); | ||
718 | |||
719 | vmcs_write32(GUEST_CS_AR_BYTES, 0xf3); | ||
720 | vmcs_write16(GUEST_CS_SELECTOR, vmcs_readl(GUEST_CS_BASE) >> 4); | ||
721 | |||
722 | fix_rmode_seg(VCPU_SREG_ES, &vcpu->rmode.es); | ||
723 | fix_rmode_seg(VCPU_SREG_DS, &vcpu->rmode.ds); | ||
724 | fix_rmode_seg(VCPU_SREG_GS, &vcpu->rmode.gs); | ||
725 | fix_rmode_seg(VCPU_SREG_FS, &vcpu->rmode.fs); | ||
726 | } | ||
727 | |||
728 | #ifdef __x86_64__ | ||
729 | |||
730 | static void enter_lmode(struct kvm_vcpu *vcpu) | ||
731 | { | ||
732 | u32 guest_tr_ar; | ||
733 | |||
734 | guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES); | ||
735 | if ((guest_tr_ar & AR_TYPE_MASK) != AR_TYPE_BUSY_64_TSS) { | ||
736 | printk(KERN_DEBUG "%s: tss fixup for long mode.\n", | ||
737 | __FUNCTION__); | ||
738 | vmcs_write32(GUEST_TR_AR_BYTES, | ||
739 | (guest_tr_ar & ~AR_TYPE_MASK) | ||
740 | | AR_TYPE_BUSY_64_TSS); | ||
741 | } | ||
742 | |||
743 | vcpu->shadow_efer |= EFER_LMA; | ||
744 | |||
745 | find_msr_entry(vcpu, MSR_EFER)->data |= EFER_LMA | EFER_LME; | ||
746 | vmcs_write32(VM_ENTRY_CONTROLS, | ||
747 | vmcs_read32(VM_ENTRY_CONTROLS) | ||
748 | | VM_ENTRY_CONTROLS_IA32E_MASK); | ||
749 | } | ||
750 | |||
751 | static void exit_lmode(struct kvm_vcpu *vcpu) | ||
752 | { | ||
753 | vcpu->shadow_efer &= ~EFER_LMA; | ||
754 | |||
755 | vmcs_write32(VM_ENTRY_CONTROLS, | ||
756 | vmcs_read32(VM_ENTRY_CONTROLS) | ||
757 | & ~VM_ENTRY_CONTROLS_IA32E_MASK); | ||
758 | } | ||
759 | |||
760 | #endif | ||
761 | |||
762 | static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) | ||
763 | { | ||
764 | if (vcpu->rmode.active && (cr0 & CR0_PE_MASK)) | ||
765 | enter_pmode(vcpu); | ||
766 | |||
767 | if (!vcpu->rmode.active && !(cr0 & CR0_PE_MASK)) | ||
768 | enter_rmode(vcpu); | ||
769 | |||
770 | #ifdef __x86_64__ | ||
771 | if (vcpu->shadow_efer & EFER_LME) { | ||
772 | if (!is_paging(vcpu) && (cr0 & CR0_PG_MASK)) | ||
773 | enter_lmode(vcpu); | ||
774 | if (is_paging(vcpu) && !(cr0 & CR0_PG_MASK)) | ||
775 | exit_lmode(vcpu); | ||
776 | } | ||
777 | #endif | ||
778 | |||
779 | vmcs_writel(CR0_READ_SHADOW, cr0); | ||
780 | vmcs_writel(GUEST_CR0, | ||
781 | (cr0 & ~KVM_GUEST_CR0_MASK) | KVM_VM_CR0_ALWAYS_ON); | ||
782 | vcpu->cr0 = cr0; | ||
783 | } | ||
784 | |||
785 | /* | ||
786 | * Used when restoring the VM to avoid corrupting segment registers | ||
787 | */ | ||
788 | static void vmx_set_cr0_no_modeswitch(struct kvm_vcpu *vcpu, unsigned long cr0) | ||
789 | { | ||
790 | vcpu->rmode.active = ((cr0 & CR0_PE_MASK) == 0); | ||
791 | update_exception_bitmap(vcpu); | ||
792 | vmcs_writel(CR0_READ_SHADOW, cr0); | ||
793 | vmcs_writel(GUEST_CR0, | ||
794 | (cr0 & ~KVM_GUEST_CR0_MASK) | KVM_VM_CR0_ALWAYS_ON); | ||
795 | vcpu->cr0 = cr0; | ||
796 | } | ||
797 | |||
798 | static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) | ||
799 | { | ||
800 | vmcs_writel(GUEST_CR3, cr3); | ||
801 | } | ||
802 | |||
803 | static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) | ||
804 | { | ||
805 | vmcs_writel(CR4_READ_SHADOW, cr4); | ||
806 | vmcs_writel(GUEST_CR4, cr4 | (vcpu->rmode.active ? | ||
807 | KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON)); | ||
808 | vcpu->cr4 = cr4; | ||
809 | } | ||
810 | |||
811 | #ifdef __x86_64__ | ||
812 | |||
813 | static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) | ||
814 | { | ||
815 | struct vmx_msr_entry *msr = find_msr_entry(vcpu, MSR_EFER); | ||
816 | |||
817 | vcpu->shadow_efer = efer; | ||
818 | if (efer & EFER_LMA) { | ||
819 | vmcs_write32(VM_ENTRY_CONTROLS, | ||
820 | vmcs_read32(VM_ENTRY_CONTROLS) | | ||
821 | VM_ENTRY_CONTROLS_IA32E_MASK); | ||
822 | msr->data = efer; | ||
823 | |||
824 | } else { | ||
825 | vmcs_write32(VM_ENTRY_CONTROLS, | ||
826 | vmcs_read32(VM_ENTRY_CONTROLS) & | ||
827 | ~VM_ENTRY_CONTROLS_IA32E_MASK); | ||
828 | |||
829 | msr->data = efer & ~EFER_LME; | ||
830 | } | ||
831 | } | ||
832 | |||
833 | #endif | ||
834 | |||
835 | static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg) | ||
836 | { | ||
837 | struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; | ||
838 | |||
839 | return vmcs_readl(sf->base); | ||
840 | } | ||
841 | |||
842 | static void vmx_get_segment(struct kvm_vcpu *vcpu, | ||
843 | struct kvm_segment *var, int seg) | ||
844 | { | ||
845 | struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; | ||
846 | u32 ar; | ||
847 | |||
848 | var->base = vmcs_readl(sf->base); | ||
849 | var->limit = vmcs_read32(sf->limit); | ||
850 | var->selector = vmcs_read16(sf->selector); | ||
851 | ar = vmcs_read32(sf->ar_bytes); | ||
852 | if (ar & AR_UNUSABLE_MASK) | ||
853 | ar = 0; | ||
854 | var->type = ar & 15; | ||
855 | var->s = (ar >> 4) & 1; | ||
856 | var->dpl = (ar >> 5) & 3; | ||
857 | var->present = (ar >> 7) & 1; | ||
858 | var->avl = (ar >> 12) & 1; | ||
859 | var->l = (ar >> 13) & 1; | ||
860 | var->db = (ar >> 14) & 1; | ||
861 | var->g = (ar >> 15) & 1; | ||
862 | var->unusable = (ar >> 16) & 1; | ||
863 | } | ||
864 | |||
865 | static void vmx_set_segment(struct kvm_vcpu *vcpu, | ||
866 | struct kvm_segment *var, int seg) | ||
867 | { | ||
868 | struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; | ||
869 | u32 ar; | ||
870 | |||
871 | vmcs_writel(sf->base, var->base); | ||
872 | vmcs_write32(sf->limit, var->limit); | ||
873 | vmcs_write16(sf->selector, var->selector); | ||
874 | if (var->unusable) | ||
875 | ar = 1 << 16; | ||
876 | else { | ||
877 | ar = var->type & 15; | ||
878 | ar |= (var->s & 1) << 4; | ||
879 | ar |= (var->dpl & 3) << 5; | ||
880 | ar |= (var->present & 1) << 7; | ||
881 | ar |= (var->avl & 1) << 12; | ||
882 | ar |= (var->l & 1) << 13; | ||
883 | ar |= (var->db & 1) << 14; | ||
884 | ar |= (var->g & 1) << 15; | ||
885 | } | ||
886 | vmcs_write32(sf->ar_bytes, ar); | ||
887 | } | ||
888 | |||
889 | static int vmx_is_long_mode(struct kvm_vcpu *vcpu) | ||
890 | { | ||
891 | return vmcs_read32(VM_ENTRY_CONTROLS) & VM_ENTRY_CONTROLS_IA32E_MASK; | ||
892 | } | ||
893 | |||
894 | static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) | ||
895 | { | ||
896 | u32 ar = vmcs_read32(GUEST_CS_AR_BYTES); | ||
897 | |||
898 | *db = (ar >> 14) & 1; | ||
899 | *l = (ar >> 13) & 1; | ||
900 | } | ||
901 | |||
902 | static void vmx_get_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) | ||
903 | { | ||
904 | dt->limit = vmcs_read32(GUEST_IDTR_LIMIT); | ||
905 | dt->base = vmcs_readl(GUEST_IDTR_BASE); | ||
906 | } | ||
907 | |||
908 | static void vmx_set_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) | ||
909 | { | ||
910 | vmcs_write32(GUEST_IDTR_LIMIT, dt->limit); | ||
911 | vmcs_writel(GUEST_IDTR_BASE, dt->base); | ||
912 | } | ||
913 | |||
914 | static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) | ||
915 | { | ||
916 | dt->limit = vmcs_read32(GUEST_GDTR_LIMIT); | ||
917 | dt->base = vmcs_readl(GUEST_GDTR_BASE); | ||
918 | } | ||
919 | |||
920 | static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) | ||
921 | { | ||
922 | vmcs_write32(GUEST_GDTR_LIMIT, dt->limit); | ||
923 | vmcs_writel(GUEST_GDTR_BASE, dt->base); | ||
924 | } | ||
925 | |||
926 | static int init_rmode_tss(struct kvm* kvm) | ||
927 | { | ||
928 | struct page *p1, *p2, *p3; | ||
929 | gfn_t fn = rmode_tss_base(kvm) >> PAGE_SHIFT; | ||
930 | char *page; | ||
931 | |||
932 | p1 = _gfn_to_page(kvm, fn++); | ||
933 | p2 = _gfn_to_page(kvm, fn++); | ||
934 | p3 = _gfn_to_page(kvm, fn); | ||
935 | |||
936 | if (!p1 || !p2 || !p3) { | ||
937 | kvm_printf(kvm, "%s: gfn_to_page failed\n", __FUNCTION__); | ||
938 | return 0; | ||
939 | } | ||
940 | |||
941 | page = kmap_atomic(p1, KM_USER0); | ||
942 | memset(page, 0, PAGE_SIZE); | ||
943 | *(u16*)(page + 0x66) = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE; | ||
944 | kunmap_atomic(page, KM_USER0); | ||
945 | |||
946 | page = kmap_atomic(p2, KM_USER0); | ||
947 | memset(page, 0, PAGE_SIZE); | ||
948 | kunmap_atomic(page, KM_USER0); | ||
949 | |||
950 | page = kmap_atomic(p3, KM_USER0); | ||
951 | memset(page, 0, PAGE_SIZE); | ||
952 | *(page + RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1) = ~0; | ||
953 | kunmap_atomic(page, KM_USER0); | ||
954 | |||
955 | return 1; | ||
956 | } | ||
957 | |||
958 | static void vmcs_write32_fixedbits(u32 msr, u32 vmcs_field, u32 val) | ||
959 | { | ||
960 | u32 msr_high, msr_low; | ||
961 | |||
962 | rdmsr(msr, msr_low, msr_high); | ||
963 | |||
964 | val &= msr_high; | ||
965 | val |= msr_low; | ||
966 | vmcs_write32(vmcs_field, val); | ||
967 | } | ||
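vmcs_write32_fixedbits() applies the convention the VMX capability MSRs use: the low 32 bits give the control bits that must be 1 and the high 32 bits the bits that are allowed to be 1, so the requested value is ANDed with the high half and ORed with the low half before being written. A standalone sketch with made-up capability values:

#include <stdio.h>
#include <stdint.h>

static uint32_t adjust_fixed_bits(uint32_t val, uint32_t msr_low, uint32_t msr_high)
{
	val &= msr_high;	/* drop bits the CPU does not allow to be 1 */
	val |= msr_low;		/* force bits the CPU requires to be 1 */
	return val;
}

int main(void)
{
	/* hypothetical capability-MSR halves and requested control value */
	uint32_t msr_low = 0x00000016, msr_high = 0xfffffff6, want = 0x00000089;

	printf("%#x\n", adjust_fixed_bits(want, msr_low, msr_high));	/* 0x96 */
	return 0;
}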
968 | |||
969 | static void seg_setup(int seg) | ||
970 | { | ||
971 | struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; | ||
972 | |||
973 | vmcs_write16(sf->selector, 0); | ||
974 | vmcs_writel(sf->base, 0); | ||
975 | vmcs_write32(sf->limit, 0xffff); | ||
976 | vmcs_write32(sf->ar_bytes, 0x93); | ||
977 | } | ||
978 | |||
979 | /* | ||
980 | * Sets up the vmcs for emulated real mode. | ||
981 | */ | ||
982 | static int vmx_vcpu_setup(struct kvm_vcpu *vcpu) | ||
983 | { | ||
984 | u32 host_sysenter_cs; | ||
985 | u32 junk; | ||
986 | unsigned long a; | ||
987 | struct descriptor_table dt; | ||
988 | int i; | ||
989 | int ret = 0; | ||
990 | int nr_good_msrs; | ||
991 | extern asmlinkage void kvm_vmx_return(void); | ||
992 | |||
993 | if (!init_rmode_tss(vcpu->kvm)) { | ||
994 | ret = -ENOMEM; | ||
995 | goto out; | ||
996 | } | ||
997 | |||
998 | memset(vcpu->regs, 0, sizeof(vcpu->regs)); | ||
999 | vcpu->regs[VCPU_REGS_RDX] = get_rdx_init_val(); | ||
1000 | vcpu->cr8 = 0; | ||
1001 | vcpu->apic_base = 0xfee00000 | | ||
1002 | /*for vcpu 0*/ MSR_IA32_APICBASE_BSP | | ||
1003 | MSR_IA32_APICBASE_ENABLE; | ||
1004 | |||
1005 | fx_init(vcpu); | ||
1006 | |||
1007 | /* | ||
1008 | * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode | ||
1009 | * insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4. Sigh. | ||
1010 | */ | ||
1011 | vmcs_write16(GUEST_CS_SELECTOR, 0xf000); | ||
1012 | vmcs_writel(GUEST_CS_BASE, 0x000f0000); | ||
1013 | vmcs_write32(GUEST_CS_LIMIT, 0xffff); | ||
1014 | vmcs_write32(GUEST_CS_AR_BYTES, 0x9b); | ||
1015 | |||
1016 | seg_setup(VCPU_SREG_DS); | ||
1017 | seg_setup(VCPU_SREG_ES); | ||
1018 | seg_setup(VCPU_SREG_FS); | ||
1019 | seg_setup(VCPU_SREG_GS); | ||
1020 | seg_setup(VCPU_SREG_SS); | ||
1021 | |||
1022 | vmcs_write16(GUEST_TR_SELECTOR, 0); | ||
1023 | vmcs_writel(GUEST_TR_BASE, 0); | ||
1024 | vmcs_write32(GUEST_TR_LIMIT, 0xffff); | ||
1025 | vmcs_write32(GUEST_TR_AR_BYTES, 0x008b); | ||
1026 | |||
1027 | vmcs_write16(GUEST_LDTR_SELECTOR, 0); | ||
1028 | vmcs_writel(GUEST_LDTR_BASE, 0); | ||
1029 | vmcs_write32(GUEST_LDTR_LIMIT, 0xffff); | ||
1030 | vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082); | ||
1031 | |||
1032 | vmcs_write32(GUEST_SYSENTER_CS, 0); | ||
1033 | vmcs_writel(GUEST_SYSENTER_ESP, 0); | ||
1034 | vmcs_writel(GUEST_SYSENTER_EIP, 0); | ||
1035 | |||
1036 | vmcs_writel(GUEST_RFLAGS, 0x02); | ||
1037 | vmcs_writel(GUEST_RIP, 0xfff0); | ||
1038 | vmcs_writel(GUEST_RSP, 0); | ||
1039 | |||
1040 | vmcs_writel(GUEST_CR3, 0); | ||
1041 | |||
1042 | /* todo: dr0 = dr1 = dr2 = dr3 = 0; dr6 = 0xffff0ff0 */ | ||
1043 | vmcs_writel(GUEST_DR7, 0x400); | ||
1044 | |||
1045 | vmcs_writel(GUEST_GDTR_BASE, 0); | ||
1046 | vmcs_write32(GUEST_GDTR_LIMIT, 0xffff); | ||
1047 | |||
1048 | vmcs_writel(GUEST_IDTR_BASE, 0); | ||
1049 | vmcs_write32(GUEST_IDTR_LIMIT, 0xffff); | ||
1050 | |||
1051 | vmcs_write32(GUEST_ACTIVITY_STATE, 0); | ||
1052 | vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0); | ||
1053 | vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0); | ||
1054 | |||
1055 | /* I/O */ | ||
1056 | vmcs_write64(IO_BITMAP_A, 0); | ||
1057 | vmcs_write64(IO_BITMAP_B, 0); | ||
1058 | |||
1059 | guest_write_tsc(0); | ||
1060 | |||
1061 | vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */ | ||
1062 | |||
1063 | /* Special registers */ | ||
1064 | vmcs_write64(GUEST_IA32_DEBUGCTL, 0); | ||
1065 | |||
1066 | /* Control */ | ||
1067 | vmcs_write32_fixedbits(MSR_IA32_VMX_PINBASED_CTLS_MSR, | ||
1068 | PIN_BASED_VM_EXEC_CONTROL, | ||
1069 | PIN_BASED_EXT_INTR_MASK /* 20.6.1 */ | ||
1070 | | PIN_BASED_NMI_EXITING /* 20.6.1 */ | ||
1071 | ); | ||
1072 | vmcs_write32_fixedbits(MSR_IA32_VMX_PROCBASED_CTLS_MSR, | ||
1073 | CPU_BASED_VM_EXEC_CONTROL, | ||
1074 | CPU_BASED_HLT_EXITING /* 20.6.2 */ | ||
1075 | | CPU_BASED_CR8_LOAD_EXITING /* 20.6.2 */ | ||
1076 | | CPU_BASED_CR8_STORE_EXITING /* 20.6.2 */ | ||
1077 | | CPU_BASED_UNCOND_IO_EXITING /* 20.6.2 */ | ||
1078 | | CPU_BASED_INVDPG_EXITING | ||
1079 | | CPU_BASED_MOV_DR_EXITING | ||
1080 | | CPU_BASED_USE_TSC_OFFSETING /* 21.3 */ | ||
1081 | ); | ||
1082 | |||
1083 | vmcs_write32(EXCEPTION_BITMAP, 1 << PF_VECTOR); | ||
1084 | vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0); | ||
1085 | vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0); | ||
1086 | vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */ | ||
1087 | |||
1088 | vmcs_writel(HOST_CR0, read_cr0()); /* 22.2.3 */ | ||
1089 | vmcs_writel(HOST_CR4, read_cr4()); /* 22.2.3, 22.2.5 */ | ||
1090 | vmcs_writel(HOST_CR3, read_cr3()); /* 22.2.3 FIXME: shadow tables */ | ||
1091 | |||
1092 | vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */ | ||
1093 | vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ | ||
1094 | vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */ | ||
1095 | vmcs_write16(HOST_FS_SELECTOR, read_fs()); /* 22.2.4 */ | ||
1096 | vmcs_write16(HOST_GS_SELECTOR, read_gs()); /* 22.2.4 */ | ||
1097 | vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ | ||
1098 | #ifdef __x86_64__ | ||
1099 | rdmsrl(MSR_FS_BASE, a); | ||
1100 | vmcs_writel(HOST_FS_BASE, a); /* 22.2.4 */ | ||
1101 | rdmsrl(MSR_GS_BASE, a); | ||
1102 | vmcs_writel(HOST_GS_BASE, a); /* 22.2.4 */ | ||
1103 | #else | ||
1104 | vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */ | ||
1105 | vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */ | ||
1106 | #endif | ||
1107 | |||
1108 | vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */ | ||
1109 | |||
1110 | get_idt(&dt); | ||
1111 | vmcs_writel(HOST_IDTR_BASE, dt.base); /* 22.2.4 */ | ||
1112 | |||
1113 | |||
1114 | vmcs_writel(HOST_RIP, (unsigned long)kvm_vmx_return); /* 22.2.5 */ | ||
1115 | |||
1116 | rdmsr(MSR_IA32_SYSENTER_CS, host_sysenter_cs, junk); | ||
1117 | vmcs_write32(HOST_IA32_SYSENTER_CS, host_sysenter_cs); | ||
1118 | rdmsrl(MSR_IA32_SYSENTER_ESP, a); | ||
1119 | vmcs_writel(HOST_IA32_SYSENTER_ESP, a); /* 22.2.3 */ | ||
1120 | rdmsrl(MSR_IA32_SYSENTER_EIP, a); | ||
1121 | vmcs_writel(HOST_IA32_SYSENTER_EIP, a); /* 22.2.3 */ | ||
1122 | |||
1123 | ret = -ENOMEM; | ||
1124 | vcpu->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL); | ||
1125 | if (!vcpu->guest_msrs) | ||
1126 | goto out; | ||
1127 | vcpu->host_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL); | ||
1128 | if (!vcpu->host_msrs) | ||
1129 | goto out_free_guest_msrs; | ||
1130 | |||
1131 | for (i = 0; i < NR_VMX_MSR; ++i) { | ||
1132 | u32 index = vmx_msr_index[i]; | ||
1133 | u32 data_low, data_high; | ||
1134 | u64 data; | ||
1135 | int j = vcpu->nmsrs; | ||
1136 | |||
1137 | if (rdmsr_safe(index, &data_low, &data_high) < 0) | ||
1138 | continue; | ||
1139 | data = data_low | ((u64)data_high << 32); | ||
1140 | vcpu->host_msrs[j].index = index; | ||
1141 | vcpu->host_msrs[j].reserved = 0; | ||
1142 | vcpu->host_msrs[j].data = data; | ||
1143 | vcpu->guest_msrs[j] = vcpu->host_msrs[j]; | ||
1144 | ++vcpu->nmsrs; | ||
1145 | } | ||
1146 | printk(KERN_DEBUG "kvm: msrs: %d\n", vcpu->nmsrs); | ||
1147 | |||
1148 | nr_good_msrs = vcpu->nmsrs - NR_BAD_MSRS; | ||
1149 | vmcs_writel(VM_ENTRY_MSR_LOAD_ADDR, | ||
1150 | virt_to_phys(vcpu->guest_msrs + NR_BAD_MSRS)); | ||
1151 | vmcs_writel(VM_EXIT_MSR_STORE_ADDR, | ||
1152 | virt_to_phys(vcpu->guest_msrs + NR_BAD_MSRS)); | ||
1153 | vmcs_writel(VM_EXIT_MSR_LOAD_ADDR, | ||
1154 | virt_to_phys(vcpu->host_msrs + NR_BAD_MSRS)); | ||
1155 | vmcs_write32_fixedbits(MSR_IA32_VMX_EXIT_CTLS_MSR, VM_EXIT_CONTROLS, | ||
1156 | (HOST_IS_64 << 9)); /* 22.2,1, 20.7.1 */ | ||
1157 | vmcs_write32(VM_EXIT_MSR_STORE_COUNT, nr_good_msrs); /* 22.2.2 */ | ||
1158 | vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, nr_good_msrs); /* 22.2.2 */ | ||
1159 | vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, nr_good_msrs); /* 22.2.2 */ | ||
1160 | |||
1161 | |||
1162 | /* 22.2.1, 20.8.1 */ | ||
1163 | vmcs_write32_fixedbits(MSR_IA32_VMX_ENTRY_CTLS_MSR, | ||
1164 | VM_ENTRY_CONTROLS, 0); | ||
1165 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); /* 22.2.1 */ | ||
1166 | |||
1167 | vmcs_writel(VIRTUAL_APIC_PAGE_ADDR, 0); | ||
1168 | vmcs_writel(TPR_THRESHOLD, 0); | ||
1169 | |||
1170 | vmcs_writel(CR0_GUEST_HOST_MASK, KVM_GUEST_CR0_MASK); | ||
1171 | vmcs_writel(CR4_GUEST_HOST_MASK, KVM_GUEST_CR4_MASK); | ||
1172 | |||
1173 | vcpu->cr0 = 0x60000010; | ||
1174 | vmx_set_cr0(vcpu, vcpu->cr0); /* enter rmode */ | ||
1175 | vmx_set_cr4(vcpu, 0); | ||
1176 | #ifdef __x86_64__ | ||
1177 | vmx_set_efer(vcpu, 0); | ||
1178 | #endif | ||
1179 | |||
1180 | return 0; | ||
1181 | |||
1182 | out_free_guest_msrs: | ||
1183 | kfree(vcpu->guest_msrs); | ||
1184 | out: | ||
1185 | return ret; | ||
1186 | } | ||
1187 | |||
1188 | static void inject_rmode_irq(struct kvm_vcpu *vcpu, int irq) | ||
1189 | { | ||
1190 | u16 ent[2]; | ||
1191 | u16 cs; | ||
1192 | u16 ip; | ||
1193 | unsigned long flags; | ||
1194 | unsigned long ss_base = vmcs_readl(GUEST_SS_BASE); | ||
1195 | u16 sp = vmcs_readl(GUEST_RSP); | ||
1196 | u32 ss_limit = vmcs_read32(GUEST_SS_LIMIT); | ||
1197 | |||
1198 | if (sp > ss_limit || sp - 6 > sp) { | ||
1199 | vcpu_printf(vcpu, "%s: #SS, rsp 0x%lx ss 0x%lx limit 0x%x\n", | ||
1200 | __FUNCTION__, | ||
1201 | vmcs_readl(GUEST_RSP), | ||
1202 | vmcs_readl(GUEST_SS_BASE), | ||
1203 | vmcs_read32(GUEST_SS_LIMIT)); | ||
1204 | return; | ||
1205 | } | ||
1206 | |||
1207 | if (kvm_read_guest(vcpu, irq * sizeof(ent), sizeof(ent), &ent) != | ||
1208 | sizeof(ent)) { | ||
1209 | vcpu_printf(vcpu, "%s: read guest err\n", __FUNCTION__); | ||
1210 | return; | ||
1211 | } | ||
1212 | |||
1213 | flags = vmcs_readl(GUEST_RFLAGS); | ||
1214 | cs = vmcs_readl(GUEST_CS_BASE) >> 4; | ||
1215 | ip = vmcs_readl(GUEST_RIP); | ||
1216 | |||
1217 | |||
1218 | if (kvm_write_guest(vcpu, ss_base + sp - 2, 2, &flags) != 2 || | ||
1219 | kvm_write_guest(vcpu, ss_base + sp - 4, 2, &cs) != 2 || | ||
1220 | kvm_write_guest(vcpu, ss_base + sp - 6, 2, &ip) != 2) { | ||
1221 | vcpu_printf(vcpu, "%s: write guest err\n", __FUNCTION__); | ||
1222 | return; | ||
1223 | } | ||
1224 | |||
1225 | vmcs_writel(GUEST_RFLAGS, flags & | ||
1226 | ~( X86_EFLAGS_IF | X86_EFLAGS_AC | X86_EFLAGS_TF)); | ||
1227 | vmcs_write16(GUEST_CS_SELECTOR, ent[1]) ; | ||
1228 | vmcs_writel(GUEST_CS_BASE, ent[1] << 4); | ||
1229 | vmcs_writel(GUEST_RIP, ent[0]); | ||
1230 | vmcs_writel(GUEST_RSP, (vmcs_readl(GUEST_RSP) & ~0xffff) | (sp - 6)); | ||
1231 | } | ||
1232 | |||
1233 | static void kvm_do_inject_irq(struct kvm_vcpu *vcpu) | ||
1234 | { | ||
1235 | int word_index = __ffs(vcpu->irq_summary); | ||
1236 | int bit_index = __ffs(vcpu->irq_pending[word_index]); | ||
1237 | int irq = word_index * BITS_PER_LONG + bit_index; | ||
1238 | |||
1239 | clear_bit(bit_index, &vcpu->irq_pending[word_index]); | ||
1240 | if (!vcpu->irq_pending[word_index]) | ||
1241 | clear_bit(word_index, &vcpu->irq_summary); | ||
1242 | |||
1243 | if (vcpu->rmode.active) { | ||
1244 | inject_rmode_irq(vcpu, irq); | ||
1245 | return; | ||
1246 | } | ||
1247 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, | ||
1248 | irq | INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK); | ||
1249 | } | ||
1250 | |||
1251 | static void kvm_try_inject_irq(struct kvm_vcpu *vcpu) | ||
1252 | { | ||
1253 | if ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) | ||
1254 | && (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0) | ||
1255 | /* | ||
1256 | * Interrupts enabled, and not blocked by sti or mov ss. Good. | ||
1257 | */ | ||
1258 | kvm_do_inject_irq(vcpu); | ||
1259 | else | ||
1260 | /* | ||
1261 | * Interrupts blocked. Wait for unblock. | ||
1262 | */ | ||
1263 | vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, | ||
1264 | vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) | ||
1265 | | CPU_BASED_VIRTUAL_INTR_PENDING); | ||
1266 | } | ||
1267 | |||
1268 | static void kvm_guest_debug_pre(struct kvm_vcpu *vcpu) | ||
1269 | { | ||
1270 | struct kvm_guest_debug *dbg = &vcpu->guest_debug; | ||
1271 | |||
1272 | set_debugreg(dbg->bp[0], 0); | ||
1273 | set_debugreg(dbg->bp[1], 1); | ||
1274 | set_debugreg(dbg->bp[2], 2); | ||
1275 | set_debugreg(dbg->bp[3], 3); | ||
1276 | |||
1277 | if (dbg->singlestep) { | ||
1278 | unsigned long flags; | ||
1279 | |||
1280 | flags = vmcs_readl(GUEST_RFLAGS); | ||
1281 | flags |= X86_EFLAGS_TF | X86_EFLAGS_RF; | ||
1282 | vmcs_writel(GUEST_RFLAGS, flags); | ||
1283 | } | ||
1284 | } | ||
1285 | |||
1286 | static int handle_rmode_exception(struct kvm_vcpu *vcpu, | ||
1287 | int vec, u32 err_code) | ||
1288 | { | ||
1289 | if (!vcpu->rmode.active) | ||
1290 | return 0; | ||
1291 | |||
1292 | if (vec == GP_VECTOR && err_code == 0) | ||
1293 | if (emulate_instruction(vcpu, NULL, 0, 0) == EMULATE_DONE) | ||
1294 | return 1; | ||
1295 | return 0; | ||
1296 | } | ||
1297 | |||
1298 | static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | ||
1299 | { | ||
1300 | u32 intr_info, error_code; | ||
1301 | unsigned long cr2, rip; | ||
1302 | u32 vect_info; | ||
1303 | enum emulation_result er; | ||
1304 | |||
1305 | vect_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); | ||
1306 | intr_info = vmcs_read32(VM_EXIT_INTR_INFO); | ||
1307 | |||
1308 | if ((vect_info & VECTORING_INFO_VALID_MASK) && | ||
1309 | !is_page_fault(intr_info)) { | ||
1310 | printk(KERN_ERR "%s: unexpected, vectoring info 0x%x " | ||
1311 | "intr info 0x%x\n", __FUNCTION__, vect_info, intr_info); | ||
1312 | } | ||
1313 | |||
1314 | if (is_external_interrupt(vect_info)) { | ||
1315 | int irq = vect_info & VECTORING_INFO_VECTOR_MASK; | ||
1316 | set_bit(irq, vcpu->irq_pending); | ||
1317 | set_bit(irq / BITS_PER_LONG, &vcpu->irq_summary); | ||
1318 | } | ||
1319 | |||
1320 | if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200) { /* nmi */ | ||
1321 | asm ("int $2"); | ||
1322 | return 1; | ||
1323 | } | ||
1324 | error_code = 0; | ||
1325 | rip = vmcs_readl(GUEST_RIP); | ||
1326 | if (intr_info & INTR_INFO_DELIEVER_CODE_MASK) | ||
1327 | error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); | ||
1328 | if (is_page_fault(intr_info)) { | ||
1329 | cr2 = vmcs_readl(EXIT_QUALIFICATION); | ||
1330 | |||
1331 | spin_lock(&vcpu->kvm->lock); | ||
1332 | if (!vcpu->mmu.page_fault(vcpu, cr2, error_code)) { | ||
1333 | spin_unlock(&vcpu->kvm->lock); | ||
1334 | return 1; | ||
1335 | } | ||
1336 | |||
1337 | er = emulate_instruction(vcpu, kvm_run, cr2, error_code); | ||
1338 | spin_unlock(&vcpu->kvm->lock); | ||
1339 | |||
1340 | switch (er) { | ||
1341 | case EMULATE_DONE: | ||
1342 | return 1; | ||
1343 | case EMULATE_DO_MMIO: | ||
1344 | ++kvm_stat.mmio_exits; | ||
1345 | kvm_run->exit_reason = KVM_EXIT_MMIO; | ||
1346 | return 0; | ||
1347 | case EMULATE_FAIL: | ||
1348 | vcpu_printf(vcpu, "%s: emulate fail\n", __FUNCTION__); | ||
1349 | break; | ||
1350 | default: | ||
1351 | BUG(); | ||
1352 | } | ||
1353 | } | ||
1354 | |||
1355 | if (vcpu->rmode.active && | ||
1356 | handle_rmode_exception(vcpu, intr_info & INTR_INFO_VECTOR_MASK, | ||
1357 | error_code)) | ||
1358 | return 1; | ||
1359 | |||
1360 | if ((intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK)) == (INTR_TYPE_EXCEPTION | 1)) { | ||
1361 | kvm_run->exit_reason = KVM_EXIT_DEBUG; | ||
1362 | return 0; | ||
1363 | } | ||
1364 | kvm_run->exit_reason = KVM_EXIT_EXCEPTION; | ||
1365 | kvm_run->ex.exception = intr_info & INTR_INFO_VECTOR_MASK; | ||
1366 | kvm_run->ex.error_code = error_code; | ||
1367 | return 0; | ||
1368 | } | ||
1369 | |||
1370 | static int handle_external_interrupt(struct kvm_vcpu *vcpu, | ||
1371 | struct kvm_run *kvm_run) | ||
1372 | { | ||
1373 | ++kvm_stat.irq_exits; | ||
1374 | return 1; | ||
1375 | } | ||
1376 | |||
1377 | |||
1378 | static int get_io_count(struct kvm_vcpu *vcpu, u64 *count) | ||
1379 | { | ||
1380 | u64 inst; | ||
1381 | gva_t rip; | ||
1382 | int countr_size; | ||
1383 | int i, n; | ||
1384 | |||
1385 | if ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_VM)) { | ||
1386 | countr_size = 2; | ||
1387 | } else { | ||
1388 | u32 cs_ar = vmcs_read32(GUEST_CS_AR_BYTES); | ||
1389 | |||
1390 | countr_size = (cs_ar & AR_L_MASK) ? 8: | ||
1391 | (cs_ar & AR_DB_MASK) ? 4: 2; | ||
1392 | } | ||
1393 | |||
1394 | rip = vmcs_readl(GUEST_RIP); | ||
1395 | if (countr_size != 8) | ||
1396 | rip += vmcs_readl(GUEST_CS_BASE); | ||
1397 | |||
1398 | n = kvm_read_guest(vcpu, rip, sizeof(inst), &inst); | ||
1399 | |||
1400 | for (i = 0; i < n; i++) { | ||
1401 | switch (((u8*)&inst)[i]) { | ||
1402 | case 0xf0: | ||
1403 | case 0xf2: | ||
1404 | case 0xf3: | ||
1405 | case 0x2e: | ||
1406 | case 0x36: | ||
1407 | case 0x3e: | ||
1408 | case 0x26: | ||
1409 | case 0x64: | ||
1410 | case 0x65: | ||
1411 | case 0x66: | ||
1412 | break; | ||
1413 | case 0x67: | ||
1414 | countr_size = (countr_size == 2) ? 4: (countr_size >> 1); | ||
1415 | default: | ||
1416 | goto done; | ||
1417 | } | ||
1418 | } | ||
1419 | return 0; | ||
1420 | done: | ||
1421 | countr_size *= 8; | ||
1422 | *count = vcpu->regs[VCPU_REGS_RCX] & (~0ULL >> (64 - countr_size)); | ||
1423 | return 1; | ||
1424 | } | ||
1425 | |||
1426 | static int handle_io(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | ||
1427 | { | ||
1428 | u64 exit_qualification; | ||
1429 | |||
1430 | ++kvm_stat.io_exits; | ||
1431 | exit_qualification = vmcs_read64(EXIT_QUALIFICATION); | ||
1432 | kvm_run->exit_reason = KVM_EXIT_IO; | ||
1433 | if (exit_qualification & 8) | ||
1434 | kvm_run->io.direction = KVM_EXIT_IO_IN; | ||
1435 | else | ||
1436 | kvm_run->io.direction = KVM_EXIT_IO_OUT; | ||
1437 | kvm_run->io.size = (exit_qualification & 7) + 1; | ||
1438 | kvm_run->io.string = (exit_qualification & 16) != 0; | ||
1439 | kvm_run->io.string_down | ||
1440 | = (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_DF) != 0; | ||
1441 | kvm_run->io.rep = (exit_qualification & 32) != 0; | ||
1442 | kvm_run->io.port = exit_qualification >> 16; | ||
1443 | if (kvm_run->io.string) { | ||
1444 | if (!get_io_count(vcpu, &kvm_run->io.count)) | ||
1445 | return 1; | ||
1446 | kvm_run->io.address = vmcs_readl(GUEST_LINEAR_ADDRESS); | ||
1447 | } else | ||
1448 | kvm_run->io.value = vcpu->regs[VCPU_REGS_RAX]; /* rax */ | ||
1449 | return 0; | ||
1450 | } | ||
1451 | |||
1452 | static int handle_invlpg(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | ||
1453 | { | ||
1454 | u64 address = vmcs_read64(EXIT_QUALIFICATION); | ||
1455 | int instruction_length = vmcs_read32(VM_EXIT_INSTRUCTION_LEN); | ||
1456 | spin_lock(&vcpu->kvm->lock); | ||
1457 | vcpu->mmu.inval_page(vcpu, address); | ||
1458 | spin_unlock(&vcpu->kvm->lock); | ||
1459 | vmcs_writel(GUEST_RIP, vmcs_readl(GUEST_RIP) + instruction_length); | ||
1460 | return 1; | ||
1461 | } | ||
1462 | |||
1463 | static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | ||
1464 | { | ||
1465 | u64 exit_qualification; | ||
1466 | int cr; | ||
1467 | int reg; | ||
1468 | |||
1469 | exit_qualification = vmcs_read64(EXIT_QUALIFICATION); | ||
1470 | cr = exit_qualification & 15; | ||
1471 | reg = (exit_qualification >> 8) & 15; | ||
1472 | switch ((exit_qualification >> 4) & 3) { | ||
1473 | case 0: /* mov to cr */ | ||
1474 | switch (cr) { | ||
1475 | case 0: | ||
1476 | vcpu_load_rsp_rip(vcpu); | ||
1477 | set_cr0(vcpu, vcpu->regs[reg]); | ||
1478 | skip_emulated_instruction(vcpu); | ||
1479 | return 1; | ||
1480 | case 3: | ||
1481 | vcpu_load_rsp_rip(vcpu); | ||
1482 | set_cr3(vcpu, vcpu->regs[reg]); | ||
1483 | skip_emulated_instruction(vcpu); | ||
1484 | return 1; | ||
1485 | case 4: | ||
1486 | vcpu_load_rsp_rip(vcpu); | ||
1487 | set_cr4(vcpu, vcpu->regs[reg]); | ||
1488 | skip_emulated_instruction(vcpu); | ||
1489 | return 1; | ||
1490 | case 8: | ||
1491 | vcpu_load_rsp_rip(vcpu); | ||
1492 | set_cr8(vcpu, vcpu->regs[reg]); | ||
1493 | skip_emulated_instruction(vcpu); | ||
1494 | return 1; | ||
1495 | }; | ||
1496 | break; | ||
1497 | case 1: /*mov from cr*/ | ||
1498 | switch (cr) { | ||
1499 | case 3: | ||
1500 | vcpu_load_rsp_rip(vcpu); | ||
1501 | vcpu->regs[reg] = vcpu->cr3; | ||
1502 | vcpu_put_rsp_rip(vcpu); | ||
1503 | skip_emulated_instruction(vcpu); | ||
1504 | return 1; | ||
1505 | case 8: | ||
1506 | printk(KERN_DEBUG "handle_cr: read CR8 " | ||
1507 | "cpu erratum AA15\n"); | ||
1508 | vcpu_load_rsp_rip(vcpu); | ||
1509 | vcpu->regs[reg] = vcpu->cr8; | ||
1510 | vcpu_put_rsp_rip(vcpu); | ||
1511 | skip_emulated_instruction(vcpu); | ||
1512 | return 1; | ||
1513 | } | ||
1514 | break; | ||
1515 | case 3: /* lmsw */ | ||
1516 | lmsw(vcpu, (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f); | ||
1517 | |||
1518 | skip_emulated_instruction(vcpu); | ||
1519 | return 1; | ||
1520 | default: | ||
1521 | break; | ||
1522 | } | ||
1523 | kvm_run->exit_reason = 0; | ||
1524 | printk(KERN_ERR "kvm: unhandled control register: op %d cr %d\n", | ||
1525 | (int)(exit_qualification >> 4) & 3, cr); | ||
1526 | return 0; | ||
1527 | } | ||
1528 | |||
1529 | static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | ||
1530 | { | ||
1531 | u64 exit_qualification; | ||
1532 | unsigned long val; | ||
1533 | int dr, reg; | ||
1534 | |||
1535 | /* | ||
1536 | * FIXME: this code assumes the host is debugging the guest. | ||
1537 | * We also need to handle the guest debugging itself. | ||
1538 | */ | ||
1539 | exit_qualification = vmcs_read64(EXIT_QUALIFICATION); | ||
1540 | dr = exit_qualification & 7; | ||
1541 | reg = (exit_qualification >> 8) & 15; | ||
1542 | vcpu_load_rsp_rip(vcpu); | ||
1543 | if (exit_qualification & 16) { | ||
1544 | /* mov from dr */ | ||
1545 | switch (dr) { | ||
1546 | case 6: | ||
1547 | val = 0xffff0ff0; | ||
1548 | break; | ||
1549 | case 7: | ||
1550 | val = 0x400; | ||
1551 | break; | ||
1552 | default: | ||
1553 | val = 0; | ||
1554 | } | ||
1555 | vcpu->regs[reg] = val; | ||
1556 | } else { | ||
1557 | /* mov to dr */ | ||
1558 | } | ||
1559 | vcpu_put_rsp_rip(vcpu); | ||
1560 | skip_emulated_instruction(vcpu); | ||
1561 | return 1; | ||
1562 | } | ||
1563 | |||
1564 | static int handle_cpuid(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | ||
1565 | { | ||
1566 | kvm_run->exit_reason = KVM_EXIT_CPUID; | ||
1567 | return 0; | ||
1568 | } | ||
1569 | |||
1570 | static int handle_rdmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | ||
1571 | { | ||
1572 | u32 ecx = vcpu->regs[VCPU_REGS_RCX]; | ||
1573 | u64 data; | ||
1574 | |||
1575 | if (vmx_get_msr(vcpu, ecx, &data)) { | ||
1576 | vmx_inject_gp(vcpu, 0); | ||
1577 | return 1; | ||
1578 | } | ||
1579 | |||
1580 | /* FIXME: handling of bits 32:63 of rax, rdx */ | ||
1581 | vcpu->regs[VCPU_REGS_RAX] = data & -1u; | ||
1582 | vcpu->regs[VCPU_REGS_RDX] = (data >> 32) & -1u; | ||
1583 | skip_emulated_instruction(vcpu); | ||
1584 | return 1; | ||
1585 | } | ||
1586 | |||
1587 | static int handle_wrmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | ||
1588 | { | ||
1589 | u32 ecx = vcpu->regs[VCPU_REGS_RCX]; | ||
1590 | u64 data = (vcpu->regs[VCPU_REGS_RAX] & -1u) | ||
1591 | | ((u64)(vcpu->regs[VCPU_REGS_RDX] & -1u) << 32); | ||
1592 | |||
1593 | if (vmx_set_msr(vcpu, ecx, data) != 0) { | ||
1594 | vmx_inject_gp(vcpu, 0); | ||
1595 | return 1; | ||
1596 | } | ||
1597 | |||
1598 | skip_emulated_instruction(vcpu); | ||
1599 | return 1; | ||
1600 | } | ||
1601 | |||
1602 | static int handle_interrupt_window(struct kvm_vcpu *vcpu, | ||
1603 | struct kvm_run *kvm_run) | ||
1604 | { | ||
1605 | /* Turn off interrupt window reporting. */ | ||
1606 | vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, | ||
1607 | vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) | ||
1608 | & ~CPU_BASED_VIRTUAL_INTR_PENDING); | ||
1609 | return 1; | ||
1610 | } | ||
1611 | |||
1612 | static int handle_halt(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | ||
1613 | { | ||
1614 | skip_emulated_instruction(vcpu); | ||
1615 | if (vcpu->irq_summary && (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF)) | ||
1616 | return 1; | ||
1617 | |||
1618 | kvm_run->exit_reason = KVM_EXIT_HLT; | ||
1619 | return 0; | ||
1620 | } | ||
1621 | |||
1622 | /* | ||
1623 | * The exit handlers return 1 if the exit was handled fully and guest execution | ||
1624 | * may resume. Otherwise they set the kvm_run parameter to indicate what needs | ||
1625 | * to be done to userspace and return 0. | ||
1626 | */ | ||
1627 | static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu, | ||
1628 | struct kvm_run *kvm_run) = { | ||
1629 | [EXIT_REASON_EXCEPTION_NMI] = handle_exception, | ||
1630 | [EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt, | ||
1631 | [EXIT_REASON_IO_INSTRUCTION] = handle_io, | ||
1632 | [EXIT_REASON_INVLPG] = handle_invlpg, | ||
1633 | [EXIT_REASON_CR_ACCESS] = handle_cr, | ||
1634 | [EXIT_REASON_DR_ACCESS] = handle_dr, | ||
1635 | [EXIT_REASON_CPUID] = handle_cpuid, | ||
1636 | [EXIT_REASON_MSR_READ] = handle_rdmsr, | ||
1637 | [EXIT_REASON_MSR_WRITE] = handle_wrmsr, | ||
1638 | [EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window, | ||
1639 | [EXIT_REASON_HLT] = handle_halt, | ||
1640 | }; | ||
1641 | |||
1642 | static const int kvm_vmx_max_exit_handlers = | ||
1643 | sizeof(kvm_vmx_exit_handlers) / sizeof(*kvm_vmx_exit_handlers); | ||
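The comment above states the contract that every entry in kvm_vmx_exit_handlers must follow: return 1 when the exit was fully handled in the kernel so the guest can resume, or fill in kvm_run->exit_reason and return 0 to hand the exit to userspace. A minimal sketch of a handler obeying that contract (handle_example_exit and exit_handled_in_kernel are hypothetical, for illustration only — they are not part of this patch):

static int handle_example_exit(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
	/* exit_handled_in_kernel() is a made-up predicate for this sketch */
	if (exit_handled_in_kernel(vcpu))
		return 1;				/* resume guest execution */

	kvm_run->exit_reason = KVM_EXIT_UNKNOWN;	/* let userspace decide */
	return 0;
}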
1644 | |||
1645 | /* | ||
1646 | * The guest has exited. See if we can fix it or if we need userspace | ||
1647 | * assistance. | ||
1648 | */ | ||
1649 | static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) | ||
1650 | { | ||
1651 | u32 vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); | ||
1652 | u32 exit_reason = vmcs_read32(VM_EXIT_REASON); | ||
1653 | |||
1654 | if ( (vectoring_info & VECTORING_INFO_VALID_MASK) && | ||
1655 | exit_reason != EXIT_REASON_EXCEPTION_NMI ) | ||
1656 | printk(KERN_WARNING "%s: unexpected, valid vectoring info and " | ||
1657 | "exit reason is 0x%x\n", __FUNCTION__, exit_reason); | ||
1658 | kvm_run->instruction_length = vmcs_read32(VM_EXIT_INSTRUCTION_LEN); | ||
1659 | if (exit_reason < kvm_vmx_max_exit_handlers | ||
1660 | && kvm_vmx_exit_handlers[exit_reason]) | ||
1661 | return kvm_vmx_exit_handlers[exit_reason](vcpu, kvm_run); | ||
1662 | else { | ||
1663 | kvm_run->exit_reason = KVM_EXIT_UNKNOWN; | ||
1664 | kvm_run->hw.hardware_exit_reason = exit_reason; | ||
1665 | } | ||
1666 | return 0; | ||
1667 | } | ||
1668 | |||
1669 | static int vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | ||
1670 | { | ||
1671 | u8 fail; | ||
1672 | u16 fs_sel, gs_sel, ldt_sel; | ||
1673 | int fs_gs_ldt_reload_needed; | ||
1674 | |||
1675 | again: | ||
1676 | /* | ||
1677 | * Set host fs and gs selectors. Unfortunately, 22.2.3 does not | ||
1678 | * allow segment selectors with cpl > 0 or ti == 1. | ||
1679 | */ | ||
1680 | fs_sel = read_fs(); | ||
1681 | gs_sel = read_gs(); | ||
1682 | ldt_sel = read_ldt(); | ||
1683 | fs_gs_ldt_reload_needed = (fs_sel & 7) | (gs_sel & 7) | ldt_sel; | ||
1684 | if (!fs_gs_ldt_reload_needed) { | ||
1685 | vmcs_write16(HOST_FS_SELECTOR, fs_sel); | ||
1686 | vmcs_write16(HOST_GS_SELECTOR, gs_sel); | ||
1687 | } else { | ||
1688 | vmcs_write16(HOST_FS_SELECTOR, 0); | ||
1689 | vmcs_write16(HOST_GS_SELECTOR, 0); | ||
1690 | } | ||
1691 | |||
1692 | #ifdef __x86_64__ | ||
1693 | vmcs_writel(HOST_FS_BASE, read_msr(MSR_FS_BASE)); | ||
1694 | vmcs_writel(HOST_GS_BASE, read_msr(MSR_GS_BASE)); | ||
1695 | #else | ||
1696 | vmcs_writel(HOST_FS_BASE, segment_base(fs_sel)); | ||
1697 | vmcs_writel(HOST_GS_BASE, segment_base(gs_sel)); | ||
1698 | #endif | ||
1699 | |||
1700 | if (vcpu->irq_summary && | ||
1701 | !(vmcs_read32(VM_ENTRY_INTR_INFO_FIELD) & INTR_INFO_VALID_MASK)) | ||
1702 | kvm_try_inject_irq(vcpu); | ||
1703 | |||
1704 | if (vcpu->guest_debug.enabled) | ||
1705 | kvm_guest_debug_pre(vcpu); | ||
1706 | |||
1707 | fx_save(vcpu->host_fx_image); | ||
1708 | fx_restore(vcpu->guest_fx_image); | ||
1709 | |||
1710 | save_msrs(vcpu->host_msrs, vcpu->nmsrs); | ||
1711 | load_msrs(vcpu->guest_msrs, NR_BAD_MSRS); | ||
1712 | |||
1713 | asm ( | ||
1714 | /* Store host registers */ | ||
1715 | "pushf \n\t" | ||
1716 | #ifdef __x86_64__ | ||
1717 | "push %%rax; push %%rbx; push %%rdx;" | ||
1718 | "push %%rsi; push %%rdi; push %%rbp;" | ||
1719 | "push %%r8; push %%r9; push %%r10; push %%r11;" | ||
1720 | "push %%r12; push %%r13; push %%r14; push %%r15;" | ||
1721 | "push %%rcx \n\t" | ||
1722 | ASM_VMX_VMWRITE_RSP_RDX "\n\t" | ||
1723 | #else | ||
1724 | "pusha; push %%ecx \n\t" | ||
1725 | ASM_VMX_VMWRITE_RSP_RDX "\n\t" | ||
1726 | #endif | ||
1727 | /* Check if vmlaunch or vmresume is needed */ | ||
1728 | "cmp $0, %1 \n\t" | ||
1729 | /* Load guest registers. Don't clobber flags. */ | ||
1730 | #ifdef __x86_64__ | ||
1731 | "mov %c[cr2](%3), %%rax \n\t" | ||
1732 | "mov %%rax, %%cr2 \n\t" | ||
1733 | "mov %c[rax](%3), %%rax \n\t" | ||
1734 | "mov %c[rbx](%3), %%rbx \n\t" | ||
1735 | "mov %c[rdx](%3), %%rdx \n\t" | ||
1736 | "mov %c[rsi](%3), %%rsi \n\t" | ||
1737 | "mov %c[rdi](%3), %%rdi \n\t" | ||
1738 | "mov %c[rbp](%3), %%rbp \n\t" | ||
1739 | "mov %c[r8](%3), %%r8 \n\t" | ||
1740 | "mov %c[r9](%3), %%r9 \n\t" | ||
1741 | "mov %c[r10](%3), %%r10 \n\t" | ||
1742 | "mov %c[r11](%3), %%r11 \n\t" | ||
1743 | "mov %c[r12](%3), %%r12 \n\t" | ||
1744 | "mov %c[r13](%3), %%r13 \n\t" | ||
1745 | "mov %c[r14](%3), %%r14 \n\t" | ||
1746 | "mov %c[r15](%3), %%r15 \n\t" | ||
1747 | "mov %c[rcx](%3), %%rcx \n\t" /* kills %3 (rcx) */ | ||
1748 | #else | ||
1749 | "mov %c[cr2](%3), %%eax \n\t" | ||
1750 | "mov %%eax, %%cr2 \n\t" | ||
1751 | "mov %c[rax](%3), %%eax \n\t" | ||
1752 | "mov %c[rbx](%3), %%ebx \n\t" | ||
1753 | "mov %c[rdx](%3), %%edx \n\t" | ||
1754 | "mov %c[rsi](%3), %%esi \n\t" | ||
1755 | "mov %c[rdi](%3), %%edi \n\t" | ||
1756 | "mov %c[rbp](%3), %%ebp \n\t" | ||
1757 | "mov %c[rcx](%3), %%ecx \n\t" /* kills %3 (ecx) */ | ||
1758 | #endif | ||
1759 | /* Enter guest mode */ | ||
1760 | "jne launched \n\t" | ||
1761 | ASM_VMX_VMLAUNCH "\n\t" | ||
1762 | "jmp kvm_vmx_return \n\t" | ||
1763 | "launched: " ASM_VMX_VMRESUME "\n\t" | ||
1764 | ".globl kvm_vmx_return \n\t" | ||
1765 | "kvm_vmx_return: " | ||
1766 | /* Save guest registers, load host registers, keep flags */ | ||
1767 | #ifdef __x86_64__ | ||
1768 | "xchg %3, 0(%%rsp) \n\t" | ||
1769 | "mov %%rax, %c[rax](%3) \n\t" | ||
1770 | "mov %%rbx, %c[rbx](%3) \n\t" | ||
1771 | "pushq 0(%%rsp); popq %c[rcx](%3) \n\t" | ||
1772 | "mov %%rdx, %c[rdx](%3) \n\t" | ||
1773 | "mov %%rsi, %c[rsi](%3) \n\t" | ||
1774 | "mov %%rdi, %c[rdi](%3) \n\t" | ||
1775 | "mov %%rbp, %c[rbp](%3) \n\t" | ||
1776 | "mov %%r8, %c[r8](%3) \n\t" | ||
1777 | "mov %%r9, %c[r9](%3) \n\t" | ||
1778 | "mov %%r10, %c[r10](%3) \n\t" | ||
1779 | "mov %%r11, %c[r11](%3) \n\t" | ||
1780 | "mov %%r12, %c[r12](%3) \n\t" | ||
1781 | "mov %%r13, %c[r13](%3) \n\t" | ||
1782 | "mov %%r14, %c[r14](%3) \n\t" | ||
1783 | "mov %%r15, %c[r15](%3) \n\t" | ||
1784 | "mov %%cr2, %%rax \n\t" | ||
1785 | "mov %%rax, %c[cr2](%3) \n\t" | ||
1786 | "mov 0(%%rsp), %3 \n\t" | ||
1787 | |||
1788 | "pop %%rcx; pop %%r15; pop %%r14; pop %%r13; pop %%r12;" | ||
1789 | "pop %%r11; pop %%r10; pop %%r9; pop %%r8;" | ||
1790 | "pop %%rbp; pop %%rdi; pop %%rsi;" | ||
1791 | "pop %%rdx; pop %%rbx; pop %%rax \n\t" | ||
1792 | #else | ||
1793 | "xchg %3, 0(%%esp) \n\t" | ||
1794 | "mov %%eax, %c[rax](%3) \n\t" | ||
1795 | "mov %%ebx, %c[rbx](%3) \n\t" | ||
1796 | "pushl 0(%%esp); popl %c[rcx](%3) \n\t" | ||
1797 | "mov %%edx, %c[rdx](%3) \n\t" | ||
1798 | "mov %%esi, %c[rsi](%3) \n\t" | ||
1799 | "mov %%edi, %c[rdi](%3) \n\t" | ||
1800 | "mov %%ebp, %c[rbp](%3) \n\t" | ||
1801 | "mov %%cr2, %%eax \n\t" | ||
1802 | "mov %%eax, %c[cr2](%3) \n\t" | ||
1803 | "mov 0(%%esp), %3 \n\t" | ||
1804 | |||
1805 | "pop %%ecx; popa \n\t" | ||
1806 | #endif | ||
1807 | "setbe %0 \n\t" | ||
1808 | "popf \n\t" | ||
1809 | : "=g" (fail) | ||
1810 | : "r"(vcpu->launched), "d"((unsigned long)HOST_RSP), | ||
1811 | "c"(vcpu), | ||
1812 | [rax]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RAX])), | ||
1813 | [rbx]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RBX])), | ||
1814 | [rcx]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RCX])), | ||
1815 | [rdx]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RDX])), | ||
1816 | [rsi]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RSI])), | ||
1817 | [rdi]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RDI])), | ||
1818 | [rbp]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RBP])), | ||
1819 | #ifdef __x86_64__ | ||
1820 | [r8 ]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R8 ])), | ||
1821 | [r9 ]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R9 ])), | ||
1822 | [r10]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R10])), | ||
1823 | [r11]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R11])), | ||
1824 | [r12]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R12])), | ||
1825 | [r13]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R13])), | ||
1826 | [r14]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R14])), | ||
1827 | [r15]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R15])), | ||
1828 | #endif | ||
1829 | [cr2]"i"(offsetof(struct kvm_vcpu, cr2)) | ||
1830 | : "cc", "memory" ); | ||
1831 | |||
1832 | ++kvm_stat.exits; | ||
1833 | |||
1834 | save_msrs(vcpu->guest_msrs, NR_BAD_MSRS); | ||
1835 | load_msrs(vcpu->host_msrs, NR_BAD_MSRS); | ||
1836 | |||
1837 | fx_save(vcpu->guest_fx_image); | ||
1838 | fx_restore(vcpu->host_fx_image); | ||
1839 | |||
1840 | #ifndef __x86_64__ | ||
1841 | asm ("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS)); | ||
1842 | #endif | ||
1843 | |||
1844 | kvm_run->exit_type = 0; | ||
1845 | if (fail) { | ||
1846 | kvm_run->exit_type = KVM_EXIT_TYPE_FAIL_ENTRY; | ||
1847 | kvm_run->exit_reason = vmcs_read32(VM_INSTRUCTION_ERROR); | ||
1848 | } else { | ||
1849 | if (fs_gs_ldt_reload_needed) { | ||
1850 | load_ldt(ldt_sel); | ||
1851 | load_fs(fs_sel); | ||
1852 | /* | ||
1853 | * If we have to reload gs, we must take care to | ||
1854 | * preserve our gs base. | ||
1855 | */ | ||
1856 | local_irq_disable(); | ||
1857 | load_gs(gs_sel); | ||
1858 | #ifdef __x86_64__ | ||
1859 | wrmsrl(MSR_GS_BASE, vmcs_readl(HOST_GS_BASE)); | ||
1860 | #endif | ||
1861 | local_irq_enable(); | ||
1862 | |||
1863 | reload_tss(); | ||
1864 | } | ||
1865 | vcpu->launched = 1; | ||
1866 | kvm_run->exit_type = KVM_EXIT_TYPE_VM_EXIT; | ||
1867 | if (kvm_handle_exit(kvm_run, vcpu)) { | ||
1868 | /* Give the scheduler a chance to reschedule. */ | ||
1869 | if (signal_pending(current)) { | ||
1870 | ++kvm_stat.signal_exits; | ||
1871 | return -EINTR; | ||
1872 | } | ||
1873 | kvm_resched(vcpu); | ||
1874 | goto again; | ||
1875 | } | ||
1876 | } | ||
1877 | return 0; | ||
1878 | } | ||
1879 | |||
1880 | static void vmx_flush_tlb(struct kvm_vcpu *vcpu) | ||
1881 | { | ||
1882 | vmcs_writel(GUEST_CR3, vmcs_readl(GUEST_CR3)); | ||
1883 | } | ||
1884 | |||
1885 | static void vmx_inject_page_fault(struct kvm_vcpu *vcpu, | ||
1886 | unsigned long addr, | ||
1887 | u32 err_code) | ||
1888 | { | ||
1889 | u32 vect_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); | ||
1890 | |||
1891 | ++kvm_stat.pf_guest; | ||
1892 | |||
1893 | if (is_page_fault(vect_info)) { | ||
1894 | printk(KERN_DEBUG "inject_page_fault: " | ||
1895 | "double fault 0x%lx @ 0x%lx\n", | ||
1896 | addr, vmcs_readl(GUEST_RIP)); | ||
1897 | vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, 0); | ||
1898 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, | ||
1899 | DF_VECTOR | | ||
1900 | INTR_TYPE_EXCEPTION | | ||
1901 | INTR_INFO_DELIEVER_CODE_MASK | | ||
1902 | INTR_INFO_VALID_MASK); | ||
1903 | return; | ||
1904 | } | ||
1905 | vcpu->cr2 = addr; | ||
1906 | vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, err_code); | ||
1907 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, | ||
1908 | PF_VECTOR | | ||
1909 | INTR_TYPE_EXCEPTION | | ||
1910 | INTR_INFO_DELIEVER_CODE_MASK | | ||
1911 | INTR_INFO_VALID_MASK); | ||
1912 | |||
1913 | } | ||
1914 | |||
1915 | static void vmx_free_vmcs(struct kvm_vcpu *vcpu) | ||
1916 | { | ||
1917 | if (vcpu->vmcs) { | ||
1918 | on_each_cpu(__vcpu_clear, vcpu, 0, 1); | ||
1919 | free_vmcs(vcpu->vmcs); | ||
1920 | vcpu->vmcs = NULL; | ||
1921 | } | ||
1922 | } | ||
1923 | |||
1924 | static void vmx_free_vcpu(struct kvm_vcpu *vcpu) | ||
1925 | { | ||
1926 | vmx_free_vmcs(vcpu); | ||
1927 | } | ||
1928 | |||
1929 | static int vmx_create_vcpu(struct kvm_vcpu *vcpu) | ||
1930 | { | ||
1931 | struct vmcs *vmcs; | ||
1932 | |||
1933 | vmcs = alloc_vmcs(); | ||
1934 | if (!vmcs) | ||
1935 | return -ENOMEM; | ||
1936 | vmcs_clear(vmcs); | ||
1937 | vcpu->vmcs = vmcs; | ||
1938 | vcpu->launched = 0; | ||
1939 | return 0; | ||
1940 | } | ||
1941 | |||
1942 | static struct kvm_arch_ops vmx_arch_ops = { | ||
1943 | .cpu_has_kvm_support = cpu_has_kvm_support, | ||
1944 | .disabled_by_bios = vmx_disabled_by_bios, | ||
1945 | .hardware_setup = hardware_setup, | ||
1946 | .hardware_unsetup = hardware_unsetup, | ||
1947 | .hardware_enable = hardware_enable, | ||
1948 | .hardware_disable = hardware_disable, | ||
1949 | |||
1950 | .vcpu_create = vmx_create_vcpu, | ||
1951 | .vcpu_free = vmx_free_vcpu, | ||
1952 | |||
1953 | .vcpu_load = vmx_vcpu_load, | ||
1954 | .vcpu_put = vmx_vcpu_put, | ||
1955 | |||
1956 | .set_guest_debug = set_guest_debug, | ||
1957 | .get_msr = vmx_get_msr, | ||
1958 | .set_msr = vmx_set_msr, | ||
1959 | .get_segment_base = vmx_get_segment_base, | ||
1960 | .get_segment = vmx_get_segment, | ||
1961 | .set_segment = vmx_set_segment, | ||
1962 | .is_long_mode = vmx_is_long_mode, | ||
1963 | .get_cs_db_l_bits = vmx_get_cs_db_l_bits, | ||
1964 | .set_cr0 = vmx_set_cr0, | ||
1965 | .set_cr0_no_modeswitch = vmx_set_cr0_no_modeswitch, | ||
1966 | .set_cr3 = vmx_set_cr3, | ||
1967 | .set_cr4 = vmx_set_cr4, | ||
1968 | #ifdef __x86_64__ | ||
1969 | .set_efer = vmx_set_efer, | ||
1970 | #endif | ||
1971 | .get_idt = vmx_get_idt, | ||
1972 | .set_idt = vmx_set_idt, | ||
1973 | .get_gdt = vmx_get_gdt, | ||
1974 | .set_gdt = vmx_set_gdt, | ||
1975 | .cache_regs = vcpu_load_rsp_rip, | ||
1976 | .decache_regs = vcpu_put_rsp_rip, | ||
1977 | .get_rflags = vmx_get_rflags, | ||
1978 | .set_rflags = vmx_set_rflags, | ||
1979 | |||
1980 | .tlb_flush = vmx_flush_tlb, | ||
1981 | .inject_page_fault = vmx_inject_page_fault, | ||
1982 | |||
1983 | .inject_gp = vmx_inject_gp, | ||
1984 | |||
1985 | .run = vmx_vcpu_run, | ||
1986 | .skip_emulated_instruction = skip_emulated_instruction, | ||
1987 | .vcpu_setup = vmx_vcpu_setup, | ||
1988 | }; | ||
1989 | |||
1990 | static int __init vmx_init(void) | ||
1991 | { | ||
1992 | kvm_init_arch(&vmx_arch_ops, THIS_MODULE); | ||
1993 | return 0; | ||
1994 | } | ||
1995 | |||
1996 | static void __exit vmx_exit(void) | ||
1997 | { | ||
1998 | kvm_exit_arch(); | ||
1999 | } | ||
2000 | |||
2001 | module_init(vmx_init) | ||
2002 | module_exit(vmx_exit) | ||
diff --git a/drivers/kvm/vmx.h b/drivers/kvm/vmx.h new file mode 100644 index 000000000000..797278341581 --- /dev/null +++ b/drivers/kvm/vmx.h | |||
@@ -0,0 +1,296 @@ | |||
1 | #ifndef VMX_H | ||
2 | #define VMX_H | ||
3 | |||
4 | /* | ||
5 | * vmx.h: VMX Architecture related definitions | ||
6 | * Copyright (c) 2004, Intel Corporation. | ||
7 | * | ||
8 | * This program is free software; you can redistribute it and/or modify it | ||
9 | * under the terms and conditions of the GNU General Public License, | ||
10 | * version 2, as published by the Free Software Foundation. | ||
11 | * | ||
12 | * This program is distributed in the hope it will be useful, but WITHOUT | ||
13 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | ||
14 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for | ||
15 | * more details. | ||
16 | * | ||
17 | * You should have received a copy of the GNU General Public License along with | ||
18 | * this program; if not, write to the Free Software Foundation, Inc., 59 Temple | ||
19 | * Place - Suite 330, Boston, MA 02111-1307 USA. | ||
20 | * | ||
21 | * A few random additions are: | ||
22 | * Copyright (C) 2006 Qumranet | ||
23 | * Avi Kivity <avi@qumranet.com> | ||
24 | * Yaniv Kamay <yaniv@qumranet.com> | ||
25 | * | ||
26 | */ | ||
27 | |||
28 | #define CPU_BASED_VIRTUAL_INTR_PENDING 0x00000004 | ||
29 | #define CPU_BASED_USE_TSC_OFFSETING 0x00000008 | ||
30 | #define CPU_BASED_HLT_EXITING 0x00000080 | ||
31 | #define CPU_BASED_INVDPG_EXITING 0x00000200 | ||
32 | #define CPU_BASED_MWAIT_EXITING 0x00000400 | ||
33 | #define CPU_BASED_RDPMC_EXITING 0x00000800 | ||
34 | #define CPU_BASED_RDTSC_EXITING 0x00001000 | ||
35 | #define CPU_BASED_CR8_LOAD_EXITING 0x00080000 | ||
36 | #define CPU_BASED_CR8_STORE_EXITING 0x00100000 | ||
37 | #define CPU_BASED_TPR_SHADOW 0x00200000 | ||
38 | #define CPU_BASED_MOV_DR_EXITING 0x00800000 | ||
39 | #define CPU_BASED_UNCOND_IO_EXITING 0x01000000 | ||
40 | #define CPU_BASED_ACTIVATE_IO_BITMAP 0x02000000 | ||
41 | #define CPU_BASED_MSR_BITMAPS 0x10000000 | ||
42 | #define CPU_BASED_MONITOR_EXITING 0x20000000 | ||
43 | #define CPU_BASED_PAUSE_EXITING 0x40000000 | ||
44 | |||
45 | #define PIN_BASED_EXT_INTR_MASK 0x1 | ||
46 | #define PIN_BASED_NMI_EXITING 0x8 | ||
47 | |||
48 | #define VM_EXIT_ACK_INTR_ON_EXIT 0x00008000 | ||
49 | #define VM_EXIT_HOST_ADD_SPACE_SIZE 0x00000200 | ||
50 | |||
51 | |||
52 | /* VMCS Encodings */ | ||
53 | enum vmcs_field { | ||
54 | GUEST_ES_SELECTOR = 0x00000800, | ||
55 | GUEST_CS_SELECTOR = 0x00000802, | ||
56 | GUEST_SS_SELECTOR = 0x00000804, | ||
57 | GUEST_DS_SELECTOR = 0x00000806, | ||
58 | GUEST_FS_SELECTOR = 0x00000808, | ||
59 | GUEST_GS_SELECTOR = 0x0000080a, | ||
60 | GUEST_LDTR_SELECTOR = 0x0000080c, | ||
61 | GUEST_TR_SELECTOR = 0x0000080e, | ||
62 | HOST_ES_SELECTOR = 0x00000c00, | ||
63 | HOST_CS_SELECTOR = 0x00000c02, | ||
64 | HOST_SS_SELECTOR = 0x00000c04, | ||
65 | HOST_DS_SELECTOR = 0x00000c06, | ||
66 | HOST_FS_SELECTOR = 0x00000c08, | ||
67 | HOST_GS_SELECTOR = 0x00000c0a, | ||
68 | HOST_TR_SELECTOR = 0x00000c0c, | ||
69 | IO_BITMAP_A = 0x00002000, | ||
70 | IO_BITMAP_A_HIGH = 0x00002001, | ||
71 | IO_BITMAP_B = 0x00002002, | ||
72 | IO_BITMAP_B_HIGH = 0x00002003, | ||
73 | MSR_BITMAP = 0x00002004, | ||
74 | MSR_BITMAP_HIGH = 0x00002005, | ||
75 | VM_EXIT_MSR_STORE_ADDR = 0x00002006, | ||
76 | VM_EXIT_MSR_STORE_ADDR_HIGH = 0x00002007, | ||
77 | VM_EXIT_MSR_LOAD_ADDR = 0x00002008, | ||
78 | VM_EXIT_MSR_LOAD_ADDR_HIGH = 0x00002009, | ||
79 | VM_ENTRY_MSR_LOAD_ADDR = 0x0000200a, | ||
80 | VM_ENTRY_MSR_LOAD_ADDR_HIGH = 0x0000200b, | ||
81 | TSC_OFFSET = 0x00002010, | ||
82 | TSC_OFFSET_HIGH = 0x00002011, | ||
83 | VIRTUAL_APIC_PAGE_ADDR = 0x00002012, | ||
84 | VIRTUAL_APIC_PAGE_ADDR_HIGH = 0x00002013, | ||
85 | VMCS_LINK_POINTER = 0x00002800, | ||
86 | VMCS_LINK_POINTER_HIGH = 0x00002801, | ||
87 | GUEST_IA32_DEBUGCTL = 0x00002802, | ||
88 | GUEST_IA32_DEBUGCTL_HIGH = 0x00002803, | ||
89 | PIN_BASED_VM_EXEC_CONTROL = 0x00004000, | ||
90 | CPU_BASED_VM_EXEC_CONTROL = 0x00004002, | ||
91 | EXCEPTION_BITMAP = 0x00004004, | ||
92 | PAGE_FAULT_ERROR_CODE_MASK = 0x00004006, | ||
93 | PAGE_FAULT_ERROR_CODE_MATCH = 0x00004008, | ||
94 | CR3_TARGET_COUNT = 0x0000400a, | ||
95 | VM_EXIT_CONTROLS = 0x0000400c, | ||
96 | VM_EXIT_MSR_STORE_COUNT = 0x0000400e, | ||
97 | VM_EXIT_MSR_LOAD_COUNT = 0x00004010, | ||
98 | VM_ENTRY_CONTROLS = 0x00004012, | ||
99 | VM_ENTRY_MSR_LOAD_COUNT = 0x00004014, | ||
100 | VM_ENTRY_INTR_INFO_FIELD = 0x00004016, | ||
101 | VM_ENTRY_EXCEPTION_ERROR_CODE = 0x00004018, | ||
102 | VM_ENTRY_INSTRUCTION_LEN = 0x0000401a, | ||
103 | TPR_THRESHOLD = 0x0000401c, | ||
104 | SECONDARY_VM_EXEC_CONTROL = 0x0000401e, | ||
105 | VM_INSTRUCTION_ERROR = 0x00004400, | ||
106 | VM_EXIT_REASON = 0x00004402, | ||
107 | VM_EXIT_INTR_INFO = 0x00004404, | ||
108 | VM_EXIT_INTR_ERROR_CODE = 0x00004406, | ||
109 | IDT_VECTORING_INFO_FIELD = 0x00004408, | ||
110 | IDT_VECTORING_ERROR_CODE = 0x0000440a, | ||
111 | VM_EXIT_INSTRUCTION_LEN = 0x0000440c, | ||
112 | VMX_INSTRUCTION_INFO = 0x0000440e, | ||
113 | GUEST_ES_LIMIT = 0x00004800, | ||
114 | GUEST_CS_LIMIT = 0x00004802, | ||
115 | GUEST_SS_LIMIT = 0x00004804, | ||
116 | GUEST_DS_LIMIT = 0x00004806, | ||
117 | GUEST_FS_LIMIT = 0x00004808, | ||
118 | GUEST_GS_LIMIT = 0x0000480a, | ||
119 | GUEST_LDTR_LIMIT = 0x0000480c, | ||
120 | GUEST_TR_LIMIT = 0x0000480e, | ||
121 | GUEST_GDTR_LIMIT = 0x00004810, | ||
122 | GUEST_IDTR_LIMIT = 0x00004812, | ||
123 | GUEST_ES_AR_BYTES = 0x00004814, | ||
124 | GUEST_CS_AR_BYTES = 0x00004816, | ||
125 | GUEST_SS_AR_BYTES = 0x00004818, | ||
126 | GUEST_DS_AR_BYTES = 0x0000481a, | ||
127 | GUEST_FS_AR_BYTES = 0x0000481c, | ||
128 | GUEST_GS_AR_BYTES = 0x0000481e, | ||
129 | GUEST_LDTR_AR_BYTES = 0x00004820, | ||
130 | GUEST_TR_AR_BYTES = 0x00004822, | ||
131 | GUEST_INTERRUPTIBILITY_INFO = 0x00004824, | ||
132 | GUEST_ACTIVITY_STATE = 0X00004826, | ||
133 | GUEST_SYSENTER_CS = 0x0000482A, | ||
134 | HOST_IA32_SYSENTER_CS = 0x00004c00, | ||
135 | CR0_GUEST_HOST_MASK = 0x00006000, | ||
136 | CR4_GUEST_HOST_MASK = 0x00006002, | ||
137 | CR0_READ_SHADOW = 0x00006004, | ||
138 | CR4_READ_SHADOW = 0x00006006, | ||
139 | CR3_TARGET_VALUE0 = 0x00006008, | ||
140 | CR3_TARGET_VALUE1 = 0x0000600a, | ||
141 | CR3_TARGET_VALUE2 = 0x0000600c, | ||
142 | CR3_TARGET_VALUE3 = 0x0000600e, | ||
143 | EXIT_QUALIFICATION = 0x00006400, | ||
144 | GUEST_LINEAR_ADDRESS = 0x0000640a, | ||
145 | GUEST_CR0 = 0x00006800, | ||
146 | GUEST_CR3 = 0x00006802, | ||
147 | GUEST_CR4 = 0x00006804, | ||
148 | GUEST_ES_BASE = 0x00006806, | ||
149 | GUEST_CS_BASE = 0x00006808, | ||
150 | GUEST_SS_BASE = 0x0000680a, | ||
151 | GUEST_DS_BASE = 0x0000680c, | ||
152 | GUEST_FS_BASE = 0x0000680e, | ||
153 | GUEST_GS_BASE = 0x00006810, | ||
154 | GUEST_LDTR_BASE = 0x00006812, | ||
155 | GUEST_TR_BASE = 0x00006814, | ||
156 | GUEST_GDTR_BASE = 0x00006816, | ||
157 | GUEST_IDTR_BASE = 0x00006818, | ||
158 | GUEST_DR7 = 0x0000681a, | ||
159 | GUEST_RSP = 0x0000681c, | ||
160 | GUEST_RIP = 0x0000681e, | ||
161 | GUEST_RFLAGS = 0x00006820, | ||
162 | GUEST_PENDING_DBG_EXCEPTIONS = 0x00006822, | ||
163 | GUEST_SYSENTER_ESP = 0x00006824, | ||
164 | GUEST_SYSENTER_EIP = 0x00006826, | ||
165 | HOST_CR0 = 0x00006c00, | ||
166 | HOST_CR3 = 0x00006c02, | ||
167 | HOST_CR4 = 0x00006c04, | ||
168 | HOST_FS_BASE = 0x00006c06, | ||
169 | HOST_GS_BASE = 0x00006c08, | ||
170 | HOST_TR_BASE = 0x00006c0a, | ||
171 | HOST_GDTR_BASE = 0x00006c0c, | ||
172 | HOST_IDTR_BASE = 0x00006c0e, | ||
173 | HOST_IA32_SYSENTER_ESP = 0x00006c10, | ||
174 | HOST_IA32_SYSENTER_EIP = 0x00006c12, | ||
175 | HOST_RSP = 0x00006c14, | ||
176 | HOST_RIP = 0x00006c16, | ||
177 | }; | ||
178 | |||
179 | #define VMX_EXIT_REASONS_FAILED_VMENTRY 0x80000000 | ||
180 | |||
181 | #define EXIT_REASON_EXCEPTION_NMI 0 | ||
182 | #define EXIT_REASON_EXTERNAL_INTERRUPT 1 | ||
183 | |||
184 | #define EXIT_REASON_PENDING_INTERRUPT 7 | ||
185 | |||
186 | #define EXIT_REASON_TASK_SWITCH 9 | ||
187 | #define EXIT_REASON_CPUID 10 | ||
188 | #define EXIT_REASON_HLT 12 | ||
189 | #define EXIT_REASON_INVLPG 14 | ||
190 | #define EXIT_REASON_RDPMC 15 | ||
191 | #define EXIT_REASON_RDTSC 16 | ||
192 | #define EXIT_REASON_VMCALL 18 | ||
193 | #define EXIT_REASON_VMCLEAR 19 | ||
194 | #define EXIT_REASON_VMLAUNCH 20 | ||
195 | #define EXIT_REASON_VMPTRLD 21 | ||
196 | #define EXIT_REASON_VMPTRST 22 | ||
197 | #define EXIT_REASON_VMREAD 23 | ||
198 | #define EXIT_REASON_VMRESUME 24 | ||
199 | #define EXIT_REASON_VMWRITE 25 | ||
200 | #define EXIT_REASON_VMOFF 26 | ||
201 | #define EXIT_REASON_VMON 27 | ||
202 | #define EXIT_REASON_CR_ACCESS 28 | ||
203 | #define EXIT_REASON_DR_ACCESS 29 | ||
204 | #define EXIT_REASON_IO_INSTRUCTION 30 | ||
205 | #define EXIT_REASON_MSR_READ 31 | ||
206 | #define EXIT_REASON_MSR_WRITE 32 | ||
207 | #define EXIT_REASON_MWAIT_INSTRUCTION 36 | ||
208 | |||
209 | /* | ||
210 | * Interruption-information format | ||
211 | */ | ||
212 | #define INTR_INFO_VECTOR_MASK 0xff /* 7:0 */ | ||
213 | #define INTR_INFO_INTR_TYPE_MASK 0x700 /* 10:8 */ | ||
214 | #define INTR_INFO_DELIEVER_CODE_MASK 0x800 /* 11 */ | ||
215 | #define INTR_INFO_VALID_MASK 0x80000000 /* 31 */ | ||
216 | |||
217 | #define VECTORING_INFO_VECTOR_MASK INTR_INFO_VECTOR_MASK | ||
218 | #define VECTORING_INFO_TYPE_MASK INTR_INFO_INTR_TYPE_MASK | ||
219 | #define VECTORING_INFO_DELIEVER_CODE_MASK INTR_INFO_DELIEVER_CODE_MASK | ||
220 | #define VECTORING_INFO_VALID_MASK INTR_INFO_VALID_MASK | ||
221 | |||
222 | #define INTR_TYPE_EXT_INTR (0 << 8) /* external interrupt */ | ||
223 | #define INTR_TYPE_EXCEPTION (3 << 8) /* processor exception */ | ||
224 | |||
225 | /* | ||
226 | * Exit Qualifications for MOV for Control Register Access | ||
227 | */ | ||
228 | #define CONTROL_REG_ACCESS_NUM 0x7 /* 2:0, number of control register */ | ||
229 | #define CONTROL_REG_ACCESS_TYPE 0x30 /* 5:4, access type */ | ||
230 | #define CONTROL_REG_ACCESS_REG 0xf00 /* 10:8, general purpose register */ | ||
231 | #define LMSW_SOURCE_DATA_SHIFT 16 | ||
232 | #define LMSW_SOURCE_DATA (0xFFFF << LMSW_SOURCE_DATA_SHIFT) /* 16:31 lmsw source */ | ||
233 | #define REG_EAX (0 << 8) | ||
234 | #define REG_ECX (1 << 8) | ||
235 | #define REG_EDX (2 << 8) | ||
236 | #define REG_EBX (3 << 8) | ||
237 | #define REG_ESP (4 << 8) | ||
238 | #define REG_EBP (5 << 8) | ||
239 | #define REG_ESI (6 << 8) | ||
240 | #define REG_EDI (7 << 8) | ||
241 | #define REG_R8 (8 << 8) | ||
242 | #define REG_R9 (9 << 8) | ||
243 | #define REG_R10 (10 << 8) | ||
244 | #define REG_R11 (11 << 8) | ||
245 | #define REG_R12 (12 << 8) | ||
246 | #define REG_R13 (13 << 8) | ||
247 | #define REG_R14 (14 << 8) | ||
248 | #define REG_R15 (15 << 8) | ||
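The masks above describe the exit-qualification layout for control-register accesses; handle_cr() in vmx.c open-codes the same bit positions. A sketch of decoding one qualification value with these names (the local variables are illustrative only, not part of the patch):

unsigned long q = vmcs_read64(EXIT_QUALIFICATION);
int cr   = q & CONTROL_REG_ACCESS_NUM;		/* which control register */
int type = (q & CONTROL_REG_ACCESS_TYPE) >> 4;	/* 0: mov to cr, 1: mov from cr, 3: lmsw */
int reg  = (q & CONTROL_REG_ACCESS_REG) >> 8;	/* general purpose register involved */
u16 lmsw_source = (q & LMSW_SOURCE_DATA) >> LMSW_SOURCE_DATA_SHIFT;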
249 | |||
250 | /* | ||
251 | * Exit Qualifications for MOV for Debug Register Access | ||
252 | */ | ||
253 | #define DEBUG_REG_ACCESS_NUM 0x7 /* 2:0, number of debug register */ | ||
254 | #define DEBUG_REG_ACCESS_TYPE 0x10 /* 4, direction of access */ | ||
255 | #define TYPE_MOV_TO_DR (0 << 4) | ||
256 | #define TYPE_MOV_FROM_DR (1 << 4) | ||
257 | #define DEBUG_REG_ACCESS_REG 0xf00 /* 11:8, general purpose register */ | ||
258 | |||
259 | |||
260 | /* segment AR */ | ||
261 | #define SEGMENT_AR_L_MASK (1 << 13) | ||
262 | |||
263 | /* entry controls */ | ||
264 | #define VM_ENTRY_CONTROLS_IA32E_MASK (1 << 9) | ||
265 | |||
266 | #define AR_TYPE_ACCESSES_MASK 1 | ||
267 | #define AR_TYPE_READABLE_MASK (1 << 1) | ||
268 | #define AR_TYPE_WRITEABLE_MASK (1 << 2) | ||
269 | #define AR_TYPE_CODE_MASK (1 << 3) | ||
270 | #define AR_TYPE_MASK 0x0f | ||
271 | #define AR_TYPE_BUSY_64_TSS 11 | ||
272 | #define AR_TYPE_BUSY_32_TSS 11 | ||
273 | #define AR_TYPE_BUSY_16_TSS 3 | ||
274 | #define AR_TYPE_LDT 2 | ||
275 | |||
276 | #define AR_UNUSABLE_MASK (1 << 16) | ||
277 | #define AR_S_MASK (1 << 4) | ||
278 | #define AR_P_MASK (1 << 7) | ||
279 | #define AR_L_MASK (1 << 13) | ||
280 | #define AR_DB_MASK (1 << 14) | ||
281 | #define AR_G_MASK (1 << 15) | ||
282 | #define AR_DPL_SHIFT 5 | ||
283 | #define AR_DPL(ar) (((ar) >> AR_DPL_SHIFT) & 3) | ||
284 | |||
285 | #define AR_RESERVD_MASK 0xfffe0f00 | ||
286 | |||
287 | #define CR4_VMXE 0x2000 | ||
288 | |||
289 | #define MSR_IA32_VMX_BASIC_MSR 0x480 | ||
290 | #define MSR_IA32_FEATURE_CONTROL 0x03a | ||
291 | #define MSR_IA32_VMX_PINBASED_CTLS_MSR 0x481 | ||
292 | #define MSR_IA32_VMX_PROCBASED_CTLS_MSR 0x482 | ||
293 | #define MSR_IA32_VMX_EXIT_CTLS_MSR 0x483 | ||
294 | #define MSR_IA32_VMX_ENTRY_CTLS_MSR 0x484 | ||
295 | |||
296 | #endif | ||
diff --git a/drivers/kvm/x86_emulate.c b/drivers/kvm/x86_emulate.c new file mode 100644 index 000000000000..7e838bf0592d --- /dev/null +++ b/drivers/kvm/x86_emulate.c | |||
@@ -0,0 +1,1409 @@ | |||
1 | /****************************************************************************** | ||
2 | * x86_emulate.c | ||
3 | * | ||
4 | * Generic x86 (32-bit and 64-bit) instruction decoder and emulator. | ||
5 | * | ||
6 | * Copyright (c) 2005 Keir Fraser | ||
7 | * | ||
8 | * Linux coding style, mod r/m decoder, segment base fixes, real-mode | ||
9 | * privileged instructions: | ||
10 | * | ||
11 | * Copyright (C) 2006 Qumranet | ||
12 | * | ||
13 | * Avi Kivity <avi@qumranet.com> | ||
14 | * Yaniv Kamay <yaniv@qumranet.com> | ||
15 | * | ||
16 | * This work is licensed under the terms of the GNU GPL, version 2. See | ||
17 | * the COPYING file in the top-level directory. | ||
18 | * | ||
19 | * From: xen-unstable 10676:af9809f51f81a3c43f276f00c81a52ef558afda4 | ||
20 | */ | ||
21 | |||
22 | #ifndef __KERNEL__ | ||
23 | #include <stdio.h> | ||
24 | #include <stdint.h> | ||
25 | #include <public/xen.h> | ||
26 | #define DPRINTF(_f, _a ...) printf( _f , ## _a ) | ||
27 | #else | ||
28 | #include "kvm.h" | ||
29 | #define DPRINTF(x...) do {} while (0) | ||
30 | #endif | ||
31 | #include "x86_emulate.h" | ||
32 | #include <linux/module.h> | ||
33 | |||
34 | /* | ||
35 | * Opcode effective-address decode tables. | ||
36 | * Note that we only emulate instructions that have at least one memory | ||
37 | * operand (excluding implicit stack references). We assume that stack | ||
38 | * references and instruction fetches will never occur in special memory | ||
39 | * areas that require emulation. So, for example, 'mov <imm>,<reg>' need | ||
40 | * not be handled. | ||
41 | */ | ||
42 | |||
43 | /* Operand sizes: 8-bit operands or specified/overridden size. */ | ||
44 | #define ByteOp (1<<0) /* 8-bit operands. */ | ||
45 | /* Destination operand type. */ | ||
46 | #define ImplicitOps (1<<1) /* Implicit in opcode. No generic decode. */ | ||
47 | #define DstReg (2<<1) /* Register operand. */ | ||
48 | #define DstMem (3<<1) /* Memory operand. */ | ||
49 | #define DstMask (3<<1) | ||
50 | /* Source operand type. */ | ||
51 | #define SrcNone (0<<3) /* No source operand. */ | ||
52 | #define SrcImplicit (0<<3) /* Source operand is implicit in the opcode. */ | ||
53 | #define SrcReg (1<<3) /* Register operand. */ | ||
54 | #define SrcMem (2<<3) /* Memory operand. */ | ||
55 | #define SrcMem16 (3<<3) /* Memory operand (16-bit). */ | ||
56 | #define SrcMem32 (4<<3) /* Memory operand (32-bit). */ | ||
57 | #define SrcImm (5<<3) /* Immediate operand. */ | ||
58 | #define SrcImmByte (6<<3) /* 8-bit sign-extended immediate operand. */ | ||
59 | #define SrcMask (7<<3) | ||
60 | /* Generic ModRM decode. */ | ||
61 | #define ModRM (1<<6) | ||
62 | /* Destination is only written; never read. */ | ||
63 | #define Mov (1<<7) | ||
64 | |||
65 | static u8 opcode_table[256] = { | ||
66 | /* 0x00 - 0x07 */ | ||
67 | ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, | ||
68 | ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, | ||
69 | 0, 0, 0, 0, | ||
70 | /* 0x08 - 0x0F */ | ||
71 | ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, | ||
72 | ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, | ||
73 | 0, 0, 0, 0, | ||
74 | /* 0x10 - 0x17 */ | ||
75 | ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, | ||
76 | ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, | ||
77 | 0, 0, 0, 0, | ||
78 | /* 0x18 - 0x1F */ | ||
79 | ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, | ||
80 | ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, | ||
81 | 0, 0, 0, 0, | ||
82 | /* 0x20 - 0x27 */ | ||
83 | ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, | ||
84 | ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, | ||
85 | 0, 0, 0, 0, | ||
86 | /* 0x28 - 0x2F */ | ||
87 | ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, | ||
88 | ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, | ||
89 | 0, 0, 0, 0, | ||
90 | /* 0x30 - 0x37 */ | ||
91 | ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, | ||
92 | ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, | ||
93 | 0, 0, 0, 0, | ||
94 | /* 0x38 - 0x3F */ | ||
95 | ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, | ||
96 | ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, | ||
97 | 0, 0, 0, 0, | ||
98 | /* 0x40 - 0x4F */ | ||
99 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | ||
100 | /* 0x50 - 0x5F */ | ||
101 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | ||
102 | /* 0x60 - 0x6F */ | ||
103 | 0, 0, 0, DstReg | SrcMem32 | ModRM | Mov /* movsxd (x86/64) */ , | ||
104 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | ||
105 | /* 0x70 - 0x7F */ | ||
106 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | ||
107 | /* 0x80 - 0x87 */ | ||
108 | ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM, | ||
109 | ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM, | ||
110 | ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, | ||
111 | ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, | ||
112 | /* 0x88 - 0x8F */ | ||
113 | ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov, | ||
114 | ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, | ||
115 | 0, 0, 0, DstMem | SrcNone | ModRM | Mov, | ||
116 | /* 0x90 - 0x9F */ | ||
117 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | ||
118 | /* 0xA0 - 0xA7 */ | ||
119 | ByteOp | DstReg | SrcMem | Mov, DstReg | SrcMem | Mov, | ||
120 | ByteOp | DstMem | SrcReg | Mov, DstMem | SrcReg | Mov, | ||
121 | ByteOp | ImplicitOps | Mov, ImplicitOps | Mov, | ||
122 | ByteOp | ImplicitOps, ImplicitOps, | ||
123 | /* 0xA8 - 0xAF */ | ||
124 | 0, 0, ByteOp | ImplicitOps | Mov, ImplicitOps | Mov, | ||
125 | ByteOp | ImplicitOps | Mov, ImplicitOps | Mov, | ||
126 | ByteOp | ImplicitOps, ImplicitOps, | ||
127 | /* 0xB0 - 0xBF */ | ||
128 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | ||
129 | /* 0xC0 - 0xC7 */ | ||
130 | ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM, 0, 0, | ||
131 | 0, 0, ByteOp | DstMem | SrcImm | ModRM | Mov, | ||
132 | DstMem | SrcImm | ModRM | Mov, | ||
133 | /* 0xC8 - 0xCF */ | ||
134 | 0, 0, 0, 0, 0, 0, 0, 0, | ||
135 | /* 0xD0 - 0xD7 */ | ||
136 | ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM, | ||
137 | ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM, | ||
138 | 0, 0, 0, 0, | ||
139 | /* 0xD8 - 0xDF */ | ||
140 | 0, 0, 0, 0, 0, 0, 0, 0, | ||
141 | /* 0xE0 - 0xEF */ | ||
142 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | ||
143 | /* 0xF0 - 0xF7 */ | ||
144 | 0, 0, 0, 0, | ||
145 | 0, 0, ByteOp | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM, | ||
146 | /* 0xF8 - 0xFF */ | ||
147 | 0, 0, 0, 0, | ||
148 | 0, 0, ByteOp | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM | ||
149 | }; | ||
150 | |||
151 | static u8 twobyte_table[256] = { | ||
152 | /* 0x00 - 0x0F */ | ||
153 | 0, SrcMem | ModRM | DstReg, 0, 0, 0, 0, ImplicitOps, 0, | ||
154 | 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, | ||
155 | /* 0x10 - 0x1F */ | ||
156 | 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, 0, | ||
157 | /* 0x20 - 0x2F */ | ||
158 | ModRM | ImplicitOps, ModRM, ModRM | ImplicitOps, ModRM, 0, 0, 0, 0, | ||
159 | 0, 0, 0, 0, 0, 0, 0, 0, | ||
160 | /* 0x30 - 0x3F */ | ||
161 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | ||
162 | /* 0x40 - 0x47 */ | ||
163 | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, | ||
164 | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, | ||
165 | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, | ||
166 | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, | ||
167 | /* 0x48 - 0x4F */ | ||
168 | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, | ||
169 | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, | ||
170 | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, | ||
171 | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, | ||
172 | /* 0x50 - 0x5F */ | ||
173 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | ||
174 | /* 0x60 - 0x6F */ | ||
175 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | ||
176 | /* 0x70 - 0x7F */ | ||
177 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | ||
178 | /* 0x80 - 0x8F */ | ||
179 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | ||
180 | /* 0x90 - 0x9F */ | ||
181 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | ||
182 | /* 0xA0 - 0xA7 */ | ||
183 | 0, 0, 0, DstMem | SrcReg | ModRM, 0, 0, 0, 0, | ||
184 | /* 0xA8 - 0xAF */ | ||
185 | 0, 0, 0, DstMem | SrcReg | ModRM, 0, 0, 0, 0, | ||
186 | /* 0xB0 - 0xB7 */ | ||
187 | ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 0, | ||
188 | DstMem | SrcReg | ModRM, | ||
189 | 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov, | ||
190 | DstReg | SrcMem16 | ModRM | Mov, | ||
191 | /* 0xB8 - 0xBF */ | ||
192 | 0, 0, DstMem | SrcImmByte | ModRM, DstMem | SrcReg | ModRM, | ||
193 | 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov, | ||
194 | DstReg | SrcMem16 | ModRM | Mov, | ||
195 | /* 0xC0 - 0xCF */ | ||
196 | 0, 0, 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, 0, 0, | ||
197 | /* 0xD0 - 0xDF */ | ||
198 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | ||
199 | /* 0xE0 - 0xEF */ | ||
200 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | ||
201 | /* 0xF0 - 0xFF */ | ||
202 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 | ||
203 | }; | ||
204 | |||
205 | /* | ||
206 | * Tell the emulator that, of the Group 7 instructions (sgdt, lidt, etc.), we | ||
207 | * are interested only in invlpg and not in any of the rest. | ||
208 | * | ||
209 | * invlpg is a special instruction in that the data it references may not | ||
210 | * be mapped. | ||
211 | */ | ||
212 | void kvm_emulator_want_group7_invlpg(void) | ||
213 | { | ||
214 | twobyte_table[1] &= ~SrcMem; | ||
215 | } | ||
216 | EXPORT_SYMBOL_GPL(kvm_emulator_want_group7_invlpg); | ||
217 | |||
218 | /* Type, address-of, and value of an instruction's operand. */ | ||
219 | struct operand { | ||
220 | enum { OP_REG, OP_MEM, OP_IMM } type; | ||
221 | unsigned int bytes; | ||
222 | unsigned long val, orig_val, *ptr; | ||
223 | }; | ||
224 | |||
225 | /* EFLAGS bit definitions. */ | ||
226 | #define EFLG_OF (1<<11) | ||
227 | #define EFLG_DF (1<<10) | ||
228 | #define EFLG_SF (1<<7) | ||
229 | #define EFLG_ZF (1<<6) | ||
230 | #define EFLG_AF (1<<4) | ||
231 | #define EFLG_PF (1<<2) | ||
232 | #define EFLG_CF (1<<0) | ||
233 | |||
234 | /* | ||
235 | * Instruction emulation: | ||
236 | * Most instructions are emulated directly via a fragment of inline assembly | ||
237 | * code. This allows us to save/restore EFLAGS and thus very easily pick up | ||
238 | * any modified flags. | ||
239 | */ | ||
240 | |||
241 | #if defined(__x86_64__) | ||
242 | #define _LO32 "k" /* force 32-bit operand */ | ||
243 | #define _STK "%%rsp" /* stack pointer */ | ||
244 | #elif defined(__i386__) | ||
245 | #define _LO32 "" /* force 32-bit operand */ | ||
246 | #define _STK "%%esp" /* stack pointer */ | ||
247 | #endif | ||
248 | |||
249 | /* | ||
250 | * These EFLAGS bits are restored from saved value during emulation, and | ||
251 | * any changes are written back to the saved value after emulation. | ||
252 | */ | ||
253 | #define EFLAGS_MASK (EFLG_OF|EFLG_SF|EFLG_ZF|EFLG_AF|EFLG_PF|EFLG_CF) | ||
254 | |||
255 | /* Before executing instruction: restore necessary bits in EFLAGS. */ | ||
256 | #define _PRE_EFLAGS(_sav, _msk, _tmp) \ | ||
257 | /* EFLAGS = (_sav & _msk) | (EFLAGS & ~_msk); */ \ | ||
258 | "push %"_sav"; " \ | ||
259 | "movl %"_msk",%"_LO32 _tmp"; " \ | ||
260 | "andl %"_LO32 _tmp",("_STK"); " \ | ||
261 | "pushf; " \ | ||
262 | "notl %"_LO32 _tmp"; " \ | ||
263 | "andl %"_LO32 _tmp",("_STK"); " \ | ||
264 | "pop %"_tmp"; " \ | ||
265 | "orl %"_LO32 _tmp",("_STK"); " \ | ||
266 | "popf; " \ | ||
267 | /* _sav &= ~msk; */ \ | ||
268 | "movl %"_msk",%"_LO32 _tmp"; " \ | ||
269 | "notl %"_LO32 _tmp"; " \ | ||
270 | "andl %"_LO32 _tmp",%"_sav"; " | ||
271 | |||
272 | /* After executing instruction: write-back necessary bits in EFLAGS. */ | ||
273 | #define _POST_EFLAGS(_sav, _msk, _tmp) \ | ||
274 | /* _sav |= EFLAGS & _msk; */ \ | ||
275 | "pushf; " \ | ||
276 | "pop %"_tmp"; " \ | ||
277 | "andl %"_msk",%"_LO32 _tmp"; " \ | ||
278 | "orl %"_LO32 _tmp",%"_sav"; " | ||
279 | |||
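The two macros above implement the splice spelled out in their comments: before the emulated instruction runs, the guest's arithmetic flags are merged into the live EFLAGS, and afterwards the freshly computed flag bits are merged back into the saved guest value. The following is a minimal stand-alone sketch of that masking in plain C; DEMO_EFLAGS_MASK simply re-derives the same OF|SF|ZF|AF|PF|CF mask, and the two sample flag words are made-up values, not anything taken from a real guest.

#include <stdio.h>

/* Same bits as EFLAGS_MASK above: OF|SF|ZF|AF|PF|CF. */
#define DEMO_EFLAGS_MASK ((1u << 11) | (1u << 7) | (1u << 6) | \
			  (1u << 4) | (1u << 2) | (1u << 0))

int main(void)
{
	unsigned long guest = 0x00000046;	/* made-up saved guest flags (ZF|PF set) */
	unsigned long host  = 0x00000202;	/* made-up live host EFLAGS              */

	/* _PRE_EFLAGS:  EFLAGS = (_sav & _msk) | (EFLAGS & ~_msk) */
	unsigned long live = (guest & DEMO_EFLAGS_MASK) | (host & ~DEMO_EFLAGS_MASK);

	/* ... the emulated arithmetic instruction would execute here, updating 'live' ... */

	/* _POST_EFLAGS (combined with _PRE's "_sav &= ~_msk"): keep only the new arithmetic bits */
	guest = (guest & ~DEMO_EFLAGS_MASK) | (live & DEMO_EFLAGS_MASK);

	printf("guest flags after write-back: %#lx\n", guest);
	return 0;
}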
280 | /* Raw emulation: instruction has two explicit operands. */ | ||
281 | #define __emulate_2op_nobyte(_op,_src,_dst,_eflags,_wx,_wy,_lx,_ly,_qx,_qy) \ | ||
282 | do { \ | ||
283 | unsigned long _tmp; \ | ||
284 | \ | ||
285 | switch ((_dst).bytes) { \ | ||
286 | case 2: \ | ||
287 | __asm__ __volatile__ ( \ | ||
288 | _PRE_EFLAGS("0","4","2") \ | ||
289 | _op"w %"_wx"3,%1; " \ | ||
290 | _POST_EFLAGS("0","4","2") \ | ||
291 | : "=m" (_eflags), "=m" ((_dst).val), \ | ||
292 | "=&r" (_tmp) \ | ||
293 | : _wy ((_src).val), "i" (EFLAGS_MASK) ); \ | ||
294 | break; \ | ||
295 | case 4: \ | ||
296 | __asm__ __volatile__ ( \ | ||
297 | _PRE_EFLAGS("0","4","2") \ | ||
298 | _op"l %"_lx"3,%1; " \ | ||
299 | _POST_EFLAGS("0","4","2") \ | ||
300 | : "=m" (_eflags), "=m" ((_dst).val), \ | ||
301 | "=&r" (_tmp) \ | ||
302 | : _ly ((_src).val), "i" (EFLAGS_MASK) ); \ | ||
303 | break; \ | ||
304 | case 8: \ | ||
305 | __emulate_2op_8byte(_op, _src, _dst, \ | ||
306 | _eflags, _qx, _qy); \ | ||
307 | break; \ | ||
308 | } \ | ||
309 | } while (0) | ||
310 | |||
311 | #define __emulate_2op(_op,_src,_dst,_eflags,_bx,_by,_wx,_wy,_lx,_ly,_qx,_qy) \ | ||
312 | do { \ | ||
313 | unsigned long _tmp; \ | ||
314 | switch ( (_dst).bytes ) \ | ||
315 | { \ | ||
316 | case 1: \ | ||
317 | __asm__ __volatile__ ( \ | ||
318 | _PRE_EFLAGS("0","4","2") \ | ||
319 | _op"b %"_bx"3,%1; " \ | ||
320 | _POST_EFLAGS("0","4","2") \ | ||
321 | : "=m" (_eflags), "=m" ((_dst).val), \ | ||
322 | "=&r" (_tmp) \ | ||
323 | : _by ((_src).val), "i" (EFLAGS_MASK) ); \ | ||
324 | break; \ | ||
325 | default: \ | ||
326 | __emulate_2op_nobyte(_op, _src, _dst, _eflags, \ | ||
327 | _wx, _wy, _lx, _ly, _qx, _qy); \ | ||
328 | break; \ | ||
329 | } \ | ||
330 | } while (0) | ||
331 | |||
332 | /* Source operand is byte-sized and may be restricted to just %cl. */ | ||
333 | #define emulate_2op_SrcB(_op, _src, _dst, _eflags) \ | ||
334 | __emulate_2op(_op, _src, _dst, _eflags, \ | ||
335 | "b", "c", "b", "c", "b", "c", "b", "c") | ||
336 | |||
337 | /* Source operand is byte, word, long or quad sized. */ | ||
338 | #define emulate_2op_SrcV(_op, _src, _dst, _eflags) \ | ||
339 | __emulate_2op(_op, _src, _dst, _eflags, \ | ||
340 | "b", "q", "w", "r", _LO32, "r", "", "r") | ||
341 | |||
342 | /* Source operand is word, long or quad sized. */ | ||
343 | #define emulate_2op_SrcV_nobyte(_op, _src, _dst, _eflags) \ | ||
344 | __emulate_2op_nobyte(_op, _src, _dst, _eflags, \ | ||
345 | "w", "r", _LO32, "r", "", "r") | ||
346 | |||
347 | /* Instruction has only one explicit operand (no source operand). */ | ||
348 | #define emulate_1op(_op, _dst, _eflags) \ | ||
349 | do { \ | ||
350 | unsigned long _tmp; \ | ||
351 | \ | ||
352 | switch ( (_dst).bytes ) \ | ||
353 | { \ | ||
354 | case 1: \ | ||
355 | __asm__ __volatile__ ( \ | ||
356 | _PRE_EFLAGS("0","3","2") \ | ||
357 | _op"b %1; " \ | ||
358 | _POST_EFLAGS("0","3","2") \ | ||
359 | : "=m" (_eflags), "=m" ((_dst).val), \ | ||
360 | "=&r" (_tmp) \ | ||
361 | : "i" (EFLAGS_MASK) ); \ | ||
362 | break; \ | ||
363 | case 2: \ | ||
364 | __asm__ __volatile__ ( \ | ||
365 | _PRE_EFLAGS("0","3","2") \ | ||
366 | _op"w %1; " \ | ||
367 | _POST_EFLAGS("0","3","2") \ | ||
368 | : "=m" (_eflags), "=m" ((_dst).val), \ | ||
369 | "=&r" (_tmp) \ | ||
370 | : "i" (EFLAGS_MASK) ); \ | ||
371 | break; \ | ||
372 | case 4: \ | ||
373 | __asm__ __volatile__ ( \ | ||
374 | _PRE_EFLAGS("0","3","2") \ | ||
375 | _op"l %1; " \ | ||
376 | _POST_EFLAGS("0","3","2") \ | ||
377 | : "=m" (_eflags), "=m" ((_dst).val), \ | ||
378 | "=&r" (_tmp) \ | ||
379 | : "i" (EFLAGS_MASK) ); \ | ||
380 | break; \ | ||
381 | case 8: \ | ||
382 | __emulate_1op_8byte(_op, _dst, _eflags); \ | ||
383 | break; \ | ||
384 | } \ | ||
385 | } while (0) | ||
386 | |||
387 | /* Emulate an instruction with quadword operands (x86/64 only). */ | ||
388 | #if defined(__x86_64__) | ||
389 | #define __emulate_2op_8byte(_op, _src, _dst, _eflags, _qx, _qy) \ | ||
390 | do { \ | ||
391 | __asm__ __volatile__ ( \ | ||
392 | _PRE_EFLAGS("0","4","2") \ | ||
393 | _op"q %"_qx"3,%1; " \ | ||
394 | _POST_EFLAGS("0","4","2") \ | ||
395 | : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \ | ||
396 | : _qy ((_src).val), "i" (EFLAGS_MASK) ); \ | ||
397 | } while (0) | ||
398 | |||
399 | #define __emulate_1op_8byte(_op, _dst, _eflags) \ | ||
400 | do { \ | ||
401 | __asm__ __volatile__ ( \ | ||
402 | _PRE_EFLAGS("0","3","2") \ | ||
403 | _op"q %1; " \ | ||
404 | _POST_EFLAGS("0","3","2") \ | ||
405 | : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \ | ||
406 | : "i" (EFLAGS_MASK) ); \ | ||
407 | } while (0) | ||
408 | |||
409 | #elif defined(__i386__) | ||
410 | #define __emulate_2op_8byte(_op, _src, _dst, _eflags, _qx, _qy) | ||
411 | #define __emulate_1op_8byte(_op, _dst, _eflags) | ||
412 | #endif /* __i386__ */ | ||
413 | |||
414 | /* Fetch next part of the instruction being emulated. */ | ||
415 | #define insn_fetch(_type, _size, _eip) \ | ||
416 | ({ unsigned long _x; \ | ||
417 | rc = ops->read_std((unsigned long)(_eip) + ctxt->cs_base, &_x, \ | ||
418 | (_size), ctxt); \ | ||
419 | if ( rc != 0 ) \ | ||
420 | goto done; \ | ||
421 | (_eip) += (_size); \ | ||
422 | (_type)_x; \ | ||
423 | }) | ||
424 | |||
425 | /* Access/update address held in a register, based on addressing mode. */ | ||
426 | #define register_address(base, reg) \ | ||
427 | ((base) + ((ad_bytes == sizeof(unsigned long)) ? (reg) : \ | ||
428 | ((reg) & ((1UL << (ad_bytes << 3)) - 1)))) | ||
429 | |||
430 | #define register_address_increment(reg, inc) \ | ||
431 | do { \ | ||
432 | /* signed type ensures sign extension to long */ \ | ||
433 | int _inc = (inc); \ | ||
434 | if ( ad_bytes == sizeof(unsigned long) ) \ | ||
435 | (reg) += _inc; \ | ||
436 | else \ | ||
437 | (reg) = ((reg) & ~((1UL << (ad_bytes << 3)) - 1)) | \ | ||
438 | (((reg) + _inc) & ((1UL << (ad_bytes << 3)) - 1)); \ | ||
439 | } while (0) | ||
440 | |||
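register_address() and register_address_increment() keep all pointer arithmetic inside the guest's current address size: with ad_bytes == 2 only the low 16 bits of the register participate and wrap, while the upper bits are preserved. A small stand-alone sketch of the same masking follows; the helper name and sample values are illustrative only.

#include <assert.h>

/* Mirrors register_address_increment() for a given address size. */
static unsigned long demo_reg_inc(unsigned long reg, int inc, int ad_bytes)
{
	unsigned long mask;

	if (ad_bytes == sizeof(unsigned long))
		return reg + inc;
	mask = (1UL << (ad_bytes << 3)) - 1;
	return (reg & ~mask) | ((reg + inc) & mask);
}

int main(void)
{
	/* 16-bit addressing: SI = 0xFFFF steps to 0x0000, upper bits untouched. */
	assert(demo_reg_inc(0x1234FFFFUL, 1, 2) == 0x12340000UL);
	/* 32-bit addressing: wraps at 4 GiB. */
	assert(demo_reg_inc(0xFFFFFFFFUL, 1, 4) == 0x00000000UL);
	return 0;
}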
441 | void *decode_register(u8 modrm_reg, unsigned long *regs, | ||
442 | int highbyte_regs) | ||
443 | { | ||
444 | void *p; | ||
445 | |||
446 | p = ®s[modrm_reg]; | ||
447 | if (highbyte_regs && modrm_reg >= 4 && modrm_reg < 8) | ||
448 | p = (unsigned char *)®s[modrm_reg & 3] + 1; | ||
449 | return p; | ||
450 | } | ||
451 | |||
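decode_register() hands back a pointer straight into the saved register array; when highbyte_regs is set, encodings 4-7 select the legacy high-byte registers, i.e. byte 1 of RAX/RCX/RDX/RBX on a little-endian host. A stand-alone illustration of that aliasing (the register values here are made up):

#include <stdio.h>

int main(void)
{
	unsigned long regs[8] = { 0 };
	unsigned char *ah;

	regs[0] = 0x1234;	/* pretend slot 0 is the saved RAX */

	/* Same arithmetic as decode_register(4, regs, 1): AH aliases byte 1 of RAX. */
	ah = (unsigned char *)&regs[4 & 3] + 1;

	printf("AH  = %#x\n", *ah);		/* 0x12 on little-endian */
	*ah = 0x56;
	printf("RAX = %#lx\n", regs[0]);	/* now 0x5634            */
	return 0;
}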
452 | static int read_descriptor(struct x86_emulate_ctxt *ctxt, | ||
453 | struct x86_emulate_ops *ops, | ||
454 | void *ptr, | ||
455 | u16 *size, unsigned long *address, int op_bytes) | ||
456 | { | ||
457 | int rc; | ||
458 | |||
459 | if (op_bytes == 2) | ||
460 | op_bytes = 3; | ||
461 | *address = 0; | ||
462 | rc = ops->read_std((unsigned long)ptr, (unsigned long *)size, 2, ctxt); | ||
463 | if (rc) | ||
464 | return rc; | ||
465 | rc = ops->read_std((unsigned long)ptr + 2, address, op_bytes, ctxt); | ||
466 | return rc; | ||
467 | } | ||
468 | |||
469 | int | ||
470 | x86_emulate_memop(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) | ||
471 | { | ||
472 | u8 b, d, sib, twobyte = 0, rex_prefix = 0; | ||
473 | u8 modrm, modrm_mod = 0, modrm_reg = 0, modrm_rm = 0; | ||
474 | unsigned long *override_base = NULL; | ||
475 | unsigned int op_bytes, ad_bytes, lock_prefix = 0, rep_prefix = 0, i; | ||
476 | int rc = 0; | ||
477 | struct operand src, dst; | ||
478 | unsigned long cr2 = ctxt->cr2; | ||
479 | int mode = ctxt->mode; | ||
480 | unsigned long modrm_ea; | ||
481 | int use_modrm_ea, index_reg = 0, base_reg = 0, scale, rip_relative = 0; | ||
482 | |||
483 | /* Shadow copy of register state. Committed on successful emulation. */ | ||
484 | unsigned long _regs[NR_VCPU_REGS]; | ||
485 | unsigned long _eip = ctxt->vcpu->rip, _eflags = ctxt->eflags; | ||
486 | unsigned long modrm_val = 0; | ||
487 | |||
488 | memcpy(_regs, ctxt->vcpu->regs, sizeof _regs); | ||
489 | |||
490 | switch (mode) { | ||
491 | case X86EMUL_MODE_REAL: | ||
492 | case X86EMUL_MODE_PROT16: | ||
493 | op_bytes = ad_bytes = 2; | ||
494 | break; | ||
495 | case X86EMUL_MODE_PROT32: | ||
496 | op_bytes = ad_bytes = 4; | ||
497 | break; | ||
498 | #ifdef __x86_64__ | ||
499 | case X86EMUL_MODE_PROT64: | ||
500 | op_bytes = 4; | ||
501 | ad_bytes = 8; | ||
502 | break; | ||
503 | #endif | ||
504 | default: | ||
505 | return -1; | ||
506 | } | ||
507 | |||
508 | /* Legacy prefixes. */ | ||
509 | for (i = 0; i < 8; i++) { | ||
510 | switch (b = insn_fetch(u8, 1, _eip)) { | ||
511 | case 0x66: /* operand-size override */ | ||
512 | op_bytes ^= 6; /* switch between 2/4 bytes */ | ||
513 | break; | ||
514 | case 0x67: /* address-size override */ | ||
515 | if (mode == X86EMUL_MODE_PROT64) | ||
516 | ad_bytes ^= 12; /* switch between 4/8 bytes */ | ||
517 | else | ||
518 | ad_bytes ^= 6; /* switch between 2/4 bytes */ | ||
519 | break; | ||
520 | case 0x2e: /* CS override */ | ||
521 | override_base = &ctxt->cs_base; | ||
522 | break; | ||
523 | case 0x3e: /* DS override */ | ||
524 | override_base = &ctxt->ds_base; | ||
525 | break; | ||
526 | case 0x26: /* ES override */ | ||
527 | override_base = &ctxt->es_base; | ||
528 | break; | ||
529 | case 0x64: /* FS override */ | ||
530 | override_base = &ctxt->fs_base; | ||
531 | break; | ||
532 | case 0x65: /* GS override */ | ||
533 | override_base = &ctxt->gs_base; | ||
534 | break; | ||
535 | case 0x36: /* SS override */ | ||
536 | override_base = &ctxt->ss_base; | ||
537 | break; | ||
538 | case 0xf0: /* LOCK */ | ||
539 | lock_prefix = 1; | ||
540 | break; | ||
541 | case 0xf3: /* REP/REPE/REPZ */ | ||
542 | rep_prefix = 1; | ||
543 | break; | ||
544 | case 0xf2: /* REPNE/REPNZ */ | ||
545 | break; | ||
546 | default: | ||
547 | goto done_prefixes; | ||
548 | } | ||
549 | } | ||
550 | |||
551 | done_prefixes: | ||
552 | |||
553 | /* REX prefix. */ | ||
554 | if ((mode == X86EMUL_MODE_PROT64) && ((b & 0xf0) == 0x40)) { | ||
555 | rex_prefix = b; | ||
556 | if (b & 8) | ||
557 | op_bytes = 8; /* REX.W */ | ||
558 | modrm_reg = (b & 4) << 1; /* REX.R */ | ||
559 | index_reg = (b & 2) << 2; /* REX.X */ | ||
560 | modrm_rm = base_reg = (b & 1) << 3; /* REX.B */ | ||
561 | b = insn_fetch(u8, 1, _eip); | ||
562 | } | ||
563 | |||
564 | /* Opcode byte(s). */ | ||
565 | d = opcode_table[b]; | ||
566 | if (d == 0) { | ||
567 | /* Two-byte opcode? */ | ||
568 | if (b == 0x0f) { | ||
569 | twobyte = 1; | ||
570 | b = insn_fetch(u8, 1, _eip); | ||
571 | d = twobyte_table[b]; | ||
572 | } | ||
573 | |||
574 | /* Unrecognised? */ | ||
575 | if (d == 0) | ||
576 | goto cannot_emulate; | ||
577 | } | ||
578 | |||
579 | /* ModRM and SIB bytes. */ | ||
580 | if (d & ModRM) { | ||
581 | modrm = insn_fetch(u8, 1, _eip); | ||
582 | modrm_mod |= (modrm & 0xc0) >> 6; | ||
583 | modrm_reg |= (modrm & 0x38) >> 3; | ||
584 | modrm_rm |= (modrm & 0x07); | ||
585 | modrm_ea = 0; | ||
586 | use_modrm_ea = 1; | ||
587 | |||
588 | if (modrm_mod == 3) { | ||
589 | modrm_val = *(unsigned long *) | ||
590 | decode_register(modrm_rm, _regs, d & ByteOp); | ||
591 | goto modrm_done; | ||
592 | } | ||
593 | |||
594 | if (ad_bytes == 2) { | ||
595 | unsigned bx = _regs[VCPU_REGS_RBX]; | ||
596 | unsigned bp = _regs[VCPU_REGS_RBP]; | ||
597 | unsigned si = _regs[VCPU_REGS_RSI]; | ||
598 | unsigned di = _regs[VCPU_REGS_RDI]; | ||
599 | |||
600 | /* 16-bit ModR/M decode. */ | ||
601 | switch (modrm_mod) { | ||
602 | case 0: | ||
603 | if (modrm_rm == 6) | ||
604 | modrm_ea += insn_fetch(u16, 2, _eip); | ||
605 | break; | ||
606 | case 1: | ||
607 | modrm_ea += insn_fetch(s8, 1, _eip); | ||
608 | break; | ||
609 | case 2: | ||
610 | modrm_ea += insn_fetch(u16, 2, _eip); | ||
611 | break; | ||
612 | } | ||
613 | switch (modrm_rm) { | ||
614 | case 0: | ||
615 | modrm_ea += bx + si; | ||
616 | break; | ||
617 | case 1: | ||
618 | modrm_ea += bx + di; | ||
619 | break; | ||
620 | case 2: | ||
621 | modrm_ea += bp + si; | ||
622 | break; | ||
623 | case 3: | ||
624 | modrm_ea += bp + di; | ||
625 | break; | ||
626 | case 4: | ||
627 | modrm_ea += si; | ||
628 | break; | ||
629 | case 5: | ||
630 | modrm_ea += di; | ||
631 | break; | ||
632 | case 6: | ||
633 | if (modrm_mod != 0) | ||
634 | modrm_ea += bp; | ||
635 | break; | ||
636 | case 7: | ||
637 | modrm_ea += bx; | ||
638 | break; | ||
639 | } | ||
640 | if (modrm_rm == 2 || modrm_rm == 3 || | ||
641 | (modrm_rm == 6 && modrm_mod != 0)) | ||
642 | if (!override_base) | ||
643 | override_base = &ctxt->ss_base; | ||
644 | modrm_ea = (u16)modrm_ea; | ||
645 | } else { | ||
646 | /* 32/64-bit ModR/M decode. */ | ||
647 | switch (modrm_rm) { | ||
648 | case 4: | ||
649 | case 12: | ||
650 | sib = insn_fetch(u8, 1, _eip); | ||
651 | index_reg |= (sib >> 3) & 7; | ||
652 | base_reg |= sib & 7; | ||
653 | scale = sib >> 6; | ||
654 | |||
655 | switch (base_reg) { | ||
656 | case 5: | ||
657 | if (modrm_mod != 0) | ||
658 | modrm_ea += _regs[base_reg]; | ||
659 | else | ||
660 | modrm_ea += insn_fetch(s32, 4, _eip); | ||
661 | break; | ||
662 | default: | ||
663 | modrm_ea += _regs[base_reg]; | ||
664 | } | ||
665 | switch (index_reg) { | ||
666 | case 4: | ||
667 | break; | ||
668 | default: | ||
669 | modrm_ea += _regs[index_reg] << scale; | ||
670 | |||
671 | } | ||
672 | break; | ||
673 | case 5: | ||
674 | if (modrm_mod != 0) | ||
675 | modrm_ea += _regs[modrm_rm]; | ||
676 | else if (mode == X86EMUL_MODE_PROT64) | ||
677 | rip_relative = 1; | ||
678 | break; | ||
679 | default: | ||
680 | modrm_ea += _regs[modrm_rm]; | ||
681 | break; | ||
682 | } | ||
683 | switch (modrm_mod) { | ||
684 | case 0: | ||
685 | if (modrm_rm == 5) | ||
686 | modrm_ea += insn_fetch(s32, 4, _eip); | ||
687 | break; | ||
688 | case 1: | ||
689 | modrm_ea += insn_fetch(s8, 1, _eip); | ||
690 | break; | ||
691 | case 2: | ||
692 | modrm_ea += insn_fetch(s32, 4, _eip); | ||
693 | break; | ||
694 | } | ||
695 | } | ||
696 | if (!override_base) | ||
697 | override_base = &ctxt->ds_base; | ||
698 | if (mode == X86EMUL_MODE_PROT64 && | ||
699 | override_base != &ctxt->fs_base && | ||
700 | override_base != &ctxt->gs_base) | ||
701 | override_base = NULL; | ||
702 | |||
703 | if (override_base) | ||
704 | modrm_ea += *override_base; | ||
705 | |||
706 | if (rip_relative) { | ||
707 | modrm_ea += _eip; | ||
708 | switch (d & SrcMask) { | ||
709 | case SrcImmByte: | ||
710 | modrm_ea += 1; | ||
711 | break; | ||
712 | case SrcImm: | ||
713 | if (d & ByteOp) | ||
714 | modrm_ea += 1; | ||
715 | else | ||
716 | if (op_bytes == 8) | ||
717 | modrm_ea += 4; | ||
718 | else | ||
719 | modrm_ea += op_bytes; | ||
720 | } | ||
721 | } | ||
722 | if (ad_bytes != 8) | ||
723 | modrm_ea = (u32)modrm_ea; | ||
724 | cr2 = modrm_ea; | ||
725 | modrm_done: | ||
726 | ; | ||
727 | } | ||
728 | |||
729 | /* Decode and fetch the destination operand: register or memory. */ | ||
730 | switch (d & DstMask) { | ||
731 | case ImplicitOps: | ||
732 | /* Special instructions do their own operand decoding. */ | ||
733 | goto special_insn; | ||
734 | case DstReg: | ||
735 | dst.type = OP_REG; | ||
736 | if ((d & ByteOp) | ||
737 | && !(twobyte && (b == 0xb6 || b == 0xb7))) { | ||
738 | dst.ptr = decode_register(modrm_reg, _regs, | ||
739 | (rex_prefix == 0)); | ||
740 | dst.val = *(u8 *) dst.ptr; | ||
741 | dst.bytes = 1; | ||
742 | } else { | ||
743 | dst.ptr = decode_register(modrm_reg, _regs, 0); | ||
744 | switch ((dst.bytes = op_bytes)) { | ||
745 | case 2: | ||
746 | dst.val = *(u16 *)dst.ptr; | ||
747 | break; | ||
748 | case 4: | ||
749 | dst.val = *(u32 *)dst.ptr; | ||
750 | break; | ||
751 | case 8: | ||
752 | dst.val = *(u64 *)dst.ptr; | ||
753 | break; | ||
754 | } | ||
755 | } | ||
756 | break; | ||
757 | case DstMem: | ||
758 | dst.type = OP_MEM; | ||
759 | dst.ptr = (unsigned long *)cr2; | ||
760 | dst.bytes = (d & ByteOp) ? 1 : op_bytes; | ||
761 | if (!(d & Mov) && /* optimisation - avoid slow emulated read */ | ||
762 | ((rc = ops->read_emulated((unsigned long)dst.ptr, | ||
763 | &dst.val, dst.bytes, ctxt)) != 0)) | ||
764 | goto done; | ||
765 | break; | ||
766 | } | ||
767 | dst.orig_val = dst.val; | ||
768 | |||
769 | /* | ||
770 | * Decode and fetch the source operand: register, memory | ||
771 | * or immediate. | ||
772 | */ | ||
773 | switch (d & SrcMask) { | ||
774 | case SrcNone: | ||
775 | break; | ||
776 | case SrcReg: | ||
777 | src.type = OP_REG; | ||
778 | if (d & ByteOp) { | ||
779 | src.ptr = decode_register(modrm_reg, _regs, | ||
780 | (rex_prefix == 0)); | ||
781 | src.val = src.orig_val = *(u8 *) src.ptr; | ||
782 | src.bytes = 1; | ||
783 | } else { | ||
784 | src.ptr = decode_register(modrm_reg, _regs, 0); | ||
785 | switch ((src.bytes = op_bytes)) { | ||
786 | case 2: | ||
787 | src.val = src.orig_val = *(u16 *) src.ptr; | ||
788 | break; | ||
789 | case 4: | ||
790 | src.val = src.orig_val = *(u32 *) src.ptr; | ||
791 | break; | ||
792 | case 8: | ||
793 | src.val = src.orig_val = *(u64 *) src.ptr; | ||
794 | break; | ||
795 | } | ||
796 | } | ||
797 | break; | ||
798 | case SrcMem16: | ||
799 | src.bytes = 2; | ||
800 | goto srcmem_common; | ||
801 | case SrcMem32: | ||
802 | src.bytes = 4; | ||
803 | goto srcmem_common; | ||
804 | case SrcMem: | ||
805 | src.bytes = (d & ByteOp) ? 1 : op_bytes; | ||
806 | srcmem_common: | ||
807 | src.type = OP_MEM; | ||
808 | src.ptr = (unsigned long *)cr2; | ||
809 | if ((rc = ops->read_emulated((unsigned long)src.ptr, | ||
810 | &src.val, src.bytes, ctxt)) != 0) | ||
811 | goto done; | ||
812 | src.orig_val = src.val; | ||
813 | break; | ||
814 | case SrcImm: | ||
815 | src.type = OP_IMM; | ||
816 | src.ptr = (unsigned long *)_eip; | ||
817 | src.bytes = (d & ByteOp) ? 1 : op_bytes; | ||
818 | if (src.bytes == 8) | ||
819 | src.bytes = 4; | ||
820 | /* NB. Immediates are sign-extended as necessary. */ | ||
821 | switch (src.bytes) { | ||
822 | case 1: | ||
823 | src.val = insn_fetch(s8, 1, _eip); | ||
824 | break; | ||
825 | case 2: | ||
826 | src.val = insn_fetch(s16, 2, _eip); | ||
827 | break; | ||
828 | case 4: | ||
829 | src.val = insn_fetch(s32, 4, _eip); | ||
830 | break; | ||
831 | } | ||
832 | break; | ||
833 | case SrcImmByte: | ||
834 | src.type = OP_IMM; | ||
835 | src.ptr = (unsigned long *)_eip; | ||
836 | src.bytes = 1; | ||
837 | src.val = insn_fetch(s8, 1, _eip); | ||
838 | break; | ||
839 | } | ||
840 | |||
841 | if (twobyte) | ||
842 | goto twobyte_insn; | ||
843 | |||
844 | switch (b) { | ||
845 | case 0x00 ... 0x05: | ||
846 | add: /* add */ | ||
847 | emulate_2op_SrcV("add", src, dst, _eflags); | ||
848 | break; | ||
849 | case 0x08 ... 0x0d: | ||
850 | or: /* or */ | ||
851 | emulate_2op_SrcV("or", src, dst, _eflags); | ||
852 | break; | ||
853 | case 0x10 ... 0x15: | ||
854 | adc: /* adc */ | ||
855 | emulate_2op_SrcV("adc", src, dst, _eflags); | ||
856 | break; | ||
857 | case 0x18 ... 0x1d: | ||
858 | sbb: /* sbb */ | ||
859 | emulate_2op_SrcV("sbb", src, dst, _eflags); | ||
860 | break; | ||
861 | case 0x20 ... 0x25: | ||
862 | and: /* and */ | ||
863 | emulate_2op_SrcV("and", src, dst, _eflags); | ||
864 | break; | ||
865 | case 0x28 ... 0x2d: | ||
866 | sub: /* sub */ | ||
867 | emulate_2op_SrcV("sub", src, dst, _eflags); | ||
868 | break; | ||
869 | case 0x30 ... 0x35: | ||
870 | xor: /* xor */ | ||
871 | emulate_2op_SrcV("xor", src, dst, _eflags); | ||
872 | break; | ||
873 | case 0x38 ... 0x3d: | ||
874 | cmp: /* cmp */ | ||
875 | emulate_2op_SrcV("cmp", src, dst, _eflags); | ||
876 | break; | ||
877 | case 0x63: /* movsxd */ | ||
878 | if (mode != X86EMUL_MODE_PROT64) | ||
879 | goto cannot_emulate; | ||
880 | dst.val = (s32) src.val; | ||
881 | break; | ||
882 | case 0x80 ... 0x83: /* Grp1 */ | ||
883 | switch (modrm_reg) { | ||
884 | case 0: | ||
885 | goto add; | ||
886 | case 1: | ||
887 | goto or; | ||
888 | case 2: | ||
889 | goto adc; | ||
890 | case 3: | ||
891 | goto sbb; | ||
892 | case 4: | ||
893 | goto and; | ||
894 | case 5: | ||
895 | goto sub; | ||
896 | case 6: | ||
897 | goto xor; | ||
898 | case 7: | ||
899 | goto cmp; | ||
900 | } | ||
901 | break; | ||
902 | case 0x84 ... 0x85: | ||
903 | test: /* test */ | ||
904 | emulate_2op_SrcV("test", src, dst, _eflags); | ||
905 | break; | ||
906 | case 0x86 ... 0x87: /* xchg */ | ||
907 | /* Write back the register source. */ | ||
908 | switch (dst.bytes) { | ||
909 | case 1: | ||
910 | *(u8 *) src.ptr = (u8) dst.val; | ||
911 | break; | ||
912 | case 2: | ||
913 | *(u16 *) src.ptr = (u16) dst.val; | ||
914 | break; | ||
915 | case 4: | ||
916 | *src.ptr = (u32) dst.val; | ||
917 | break; /* 64b reg: zero-extend */ | ||
918 | case 8: | ||
919 | *src.ptr = dst.val; | ||
920 | break; | ||
921 | } | ||
922 | /* | ||
923 | * Write back the memory destination with implicit LOCK | ||
924 | * prefix. | ||
925 | */ | ||
926 | dst.val = src.val; | ||
927 | lock_prefix = 1; | ||
928 | break; | ||
929 | case 0xa0 ... 0xa1: /* mov */ | ||
930 | dst.ptr = (unsigned long *)&_regs[VCPU_REGS_RAX]; | ||
931 | dst.val = src.val; | ||
932 | _eip += ad_bytes; /* skip src displacement */ | ||
933 | break; | ||
934 | case 0xa2 ... 0xa3: /* mov */ | ||
935 | dst.val = (unsigned long)_regs[VCPU_REGS_RAX]; | ||
936 | _eip += ad_bytes; /* skip dst displacement */ | ||
937 | break; | ||
938 | case 0x88 ... 0x8b: /* mov */ | ||
939 | case 0xc6 ... 0xc7: /* mov (sole member of Grp11) */ | ||
940 | dst.val = src.val; | ||
941 | break; | ||
942 | case 0x8f: /* pop (sole member of Grp1a) */ | ||
943 | /* 64-bit mode: POP always pops a 64-bit operand. */ | ||
944 | if (mode == X86EMUL_MODE_PROT64) | ||
945 | dst.bytes = 8; | ||
946 | if ((rc = ops->read_std(register_address(ctxt->ss_base, | ||
947 | _regs[VCPU_REGS_RSP]), | ||
948 | &dst.val, dst.bytes, ctxt)) != 0) | ||
949 | goto done; | ||
950 | register_address_increment(_regs[VCPU_REGS_RSP], dst.bytes); | ||
951 | break; | ||
952 | case 0xc0 ... 0xc1: | ||
953 | grp2: /* Grp2 */ | ||
954 | switch (modrm_reg) { | ||
955 | case 0: /* rol */ | ||
956 | emulate_2op_SrcB("rol", src, dst, _eflags); | ||
957 | break; | ||
958 | case 1: /* ror */ | ||
959 | emulate_2op_SrcB("ror", src, dst, _eflags); | ||
960 | break; | ||
961 | case 2: /* rcl */ | ||
962 | emulate_2op_SrcB("rcl", src, dst, _eflags); | ||
963 | break; | ||
964 | case 3: /* rcr */ | ||
965 | emulate_2op_SrcB("rcr", src, dst, _eflags); | ||
966 | break; | ||
967 | case 4: /* sal/shl */ | ||
968 | case 6: /* sal/shl */ | ||
969 | emulate_2op_SrcB("sal", src, dst, _eflags); | ||
970 | break; | ||
971 | case 5: /* shr */ | ||
972 | emulate_2op_SrcB("shr", src, dst, _eflags); | ||
973 | break; | ||
974 | case 7: /* sar */ | ||
975 | emulate_2op_SrcB("sar", src, dst, _eflags); | ||
976 | break; | ||
977 | } | ||
978 | break; | ||
979 | case 0xd0 ... 0xd1: /* Grp2 */ | ||
980 | src.val = 1; | ||
981 | goto grp2; | ||
982 | case 0xd2 ... 0xd3: /* Grp2 */ | ||
983 | src.val = _regs[VCPU_REGS_RCX]; | ||
984 | goto grp2; | ||
985 | case 0xf6 ... 0xf7: /* Grp3 */ | ||
986 | switch (modrm_reg) { | ||
987 | case 0 ... 1: /* test */ | ||
988 | /* | ||
989 | * Special case in Grp3: test has an immediate | ||
990 | * source operand. | ||
991 | */ | ||
992 | src.type = OP_IMM; | ||
993 | src.ptr = (unsigned long *)_eip; | ||
994 | src.bytes = (d & ByteOp) ? 1 : op_bytes; | ||
995 | if (src.bytes == 8) | ||
996 | src.bytes = 4; | ||
997 | switch (src.bytes) { | ||
998 | case 1: | ||
999 | src.val = insn_fetch(s8, 1, _eip); | ||
1000 | break; | ||
1001 | case 2: | ||
1002 | src.val = insn_fetch(s16, 2, _eip); | ||
1003 | break; | ||
1004 | case 4: | ||
1005 | src.val = insn_fetch(s32, 4, _eip); | ||
1006 | break; | ||
1007 | } | ||
1008 | goto test; | ||
1009 | case 2: /* not */ | ||
1010 | dst.val = ~dst.val; | ||
1011 | break; | ||
1012 | case 3: /* neg */ | ||
1013 | emulate_1op("neg", dst, _eflags); | ||
1014 | break; | ||
1015 | default: | ||
1016 | goto cannot_emulate; | ||
1017 | } | ||
1018 | break; | ||
1019 | case 0xfe ... 0xff: /* Grp4/Grp5 */ | ||
1020 | switch (modrm_reg) { | ||
1021 | case 0: /* inc */ | ||
1022 | emulate_1op("inc", dst, _eflags); | ||
1023 | break; | ||
1024 | case 1: /* dec */ | ||
1025 | emulate_1op("dec", dst, _eflags); | ||
1026 | break; | ||
1027 | case 6: /* push */ | ||
1028 | /* 64-bit mode: PUSH always pushes a 64-bit operand. */ | ||
1029 | if (mode == X86EMUL_MODE_PROT64) { | ||
1030 | dst.bytes = 8; | ||
1031 | if ((rc = ops->read_std((unsigned long)dst.ptr, | ||
1032 | &dst.val, 8, | ||
1033 | ctxt)) != 0) | ||
1034 | goto done; | ||
1035 | } | ||
1036 | register_address_increment(_regs[VCPU_REGS_RSP], | ||
1037 | -dst.bytes); | ||
1038 | if ((rc = ops->write_std( | ||
1039 | register_address(ctxt->ss_base, | ||
1040 | _regs[VCPU_REGS_RSP]), | ||
1041 | dst.val, dst.bytes, ctxt)) != 0) | ||
1042 | goto done; | ||
1043 | dst.val = dst.orig_val; /* skanky: disable writeback */ | ||
1044 | break; | ||
1045 | default: | ||
1046 | goto cannot_emulate; | ||
1047 | } | ||
1048 | break; | ||
1049 | } | ||
1050 | |||
1051 | writeback: | ||
1052 | if ((d & Mov) || (dst.orig_val != dst.val)) { | ||
1053 | switch (dst.type) { | ||
1054 | case OP_REG: | ||
1055 | /* The 4-byte case *is* correct: in 64-bit mode we zero-extend. */ | ||
1056 | switch (dst.bytes) { | ||
1057 | case 1: | ||
1058 | *(u8 *)dst.ptr = (u8)dst.val; | ||
1059 | break; | ||
1060 | case 2: | ||
1061 | *(u16 *)dst.ptr = (u16)dst.val; | ||
1062 | break; | ||
1063 | case 4: | ||
1064 | *dst.ptr = (u32)dst.val; | ||
1065 | break; /* 64b: zero-ext */ | ||
1066 | case 8: | ||
1067 | *dst.ptr = dst.val; | ||
1068 | break; | ||
1069 | } | ||
1070 | break; | ||
1071 | case OP_MEM: | ||
1072 | if (lock_prefix) | ||
1073 | rc = ops->cmpxchg_emulated((unsigned long)dst. | ||
1074 | ptr, dst.orig_val, | ||
1075 | dst.val, dst.bytes, | ||
1076 | ctxt); | ||
1077 | else | ||
1078 | rc = ops->write_emulated((unsigned long)dst.ptr, | ||
1079 | dst.val, dst.bytes, | ||
1080 | ctxt); | ||
1081 | if (rc != 0) | ||
1082 | goto done; | ||
1083 | default: | ||
1084 | break; | ||
1085 | } | ||
1086 | } | ||
1087 | |||
1088 | /* Commit shadow register state. */ | ||
1089 | memcpy(ctxt->vcpu->regs, _regs, sizeof _regs); | ||
1090 | ctxt->eflags = _eflags; | ||
1091 | ctxt->vcpu->rip = _eip; | ||
1092 | |||
1093 | done: | ||
1094 | return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0; | ||
1095 | |||
1096 | special_insn: | ||
1097 | if (twobyte) | ||
1098 | goto twobyte_special_insn; | ||
1099 | if (rep_prefix) { | ||
1100 | if (_regs[VCPU_REGS_RCX] == 0) { | ||
1101 | ctxt->vcpu->rip = _eip; | ||
1102 | goto done; | ||
1103 | } | ||
1104 | _regs[VCPU_REGS_RCX]--; | ||
1105 | _eip = ctxt->vcpu->rip; | ||
1106 | } | ||
1107 | switch (b) { | ||
1108 | case 0xa4 ... 0xa5: /* movs */ | ||
1109 | dst.type = OP_MEM; | ||
1110 | dst.bytes = (d & ByteOp) ? 1 : op_bytes; | ||
1111 | dst.ptr = (unsigned long *)register_address(ctxt->es_base, | ||
1112 | _regs[VCPU_REGS_RDI]); | ||
1113 | if ((rc = ops->read_emulated(register_address( | ||
1114 | override_base ? *override_base : ctxt->ds_base, | ||
1115 | _regs[VCPU_REGS_RSI]), &dst.val, dst.bytes, ctxt)) != 0) | ||
1116 | goto done; | ||
1117 | register_address_increment(_regs[VCPU_REGS_RSI], | ||
1118 | (_eflags & EFLG_DF) ? -dst.bytes : dst.bytes); | ||
1119 | register_address_increment(_regs[VCPU_REGS_RDI], | ||
1120 | (_eflags & EFLG_DF) ? -dst.bytes : dst.bytes); | ||
1121 | break; | ||
1122 | case 0xa6 ... 0xa7: /* cmps */ | ||
1123 | DPRINTF("Urk! I don't handle CMPS.\n"); | ||
1124 | goto cannot_emulate; | ||
1125 | case 0xaa ... 0xab: /* stos */ | ||
1126 | dst.type = OP_MEM; | ||
1127 | dst.bytes = (d & ByteOp) ? 1 : op_bytes; | ||
1128 | dst.ptr = (unsigned long *)cr2; | ||
1129 | dst.val = _regs[VCPU_REGS_RAX]; | ||
1130 | register_address_increment(_regs[VCPU_REGS_RDI], | ||
1131 | (_eflags & EFLG_DF) ? -dst.bytes : dst.bytes); | ||
1132 | break; | ||
1133 | case 0xac ... 0xad: /* lods */ | ||
1134 | dst.type = OP_REG; | ||
1135 | dst.bytes = (d & ByteOp) ? 1 : op_bytes; | ||
1136 | dst.ptr = (unsigned long *)&_regs[VCPU_REGS_RAX]; | ||
1137 | if ((rc = ops->read_emulated(cr2, &dst.val, dst.bytes, ctxt)) != 0) | ||
1138 | goto done; | ||
1139 | register_address_increment(_regs[VCPU_REGS_RSI], | ||
1140 | (_eflags & EFLG_DF) ? -dst.bytes : dst.bytes); | ||
1141 | break; | ||
1142 | case 0xae ... 0xaf: /* scas */ | ||
1143 | DPRINTF("Urk! I don't handle SCAS.\n"); | ||
1144 | goto cannot_emulate; | ||
1145 | } | ||
1146 | goto writeback; | ||
1147 | |||
1148 | twobyte_insn: | ||
1149 | switch (b) { | ||
1150 | case 0x01: /* lgdt, lidt, lmsw */ | ||
1151 | switch (modrm_reg) { | ||
1152 | u16 size; | ||
1153 | unsigned long address; | ||
1154 | |||
1155 | case 2: /* lgdt */ | ||
1156 | rc = read_descriptor(ctxt, ops, src.ptr, | ||
1157 | &size, &address, op_bytes); | ||
1158 | if (rc) | ||
1159 | goto done; | ||
1160 | realmode_lgdt(ctxt->vcpu, size, address); | ||
1161 | break; | ||
1162 | case 3: /* lidt */ | ||
1163 | rc = read_descriptor(ctxt, ops, src.ptr, | ||
1164 | &size, &address, op_bytes); | ||
1165 | if (rc) | ||
1166 | goto done; | ||
1167 | realmode_lidt(ctxt->vcpu, size, address); | ||
1168 | break; | ||
1169 | case 4: /* smsw */ | ||
1170 | if (modrm_mod != 3) | ||
1171 | goto cannot_emulate; | ||
1172 | *(u16 *)&_regs[modrm_rm] | ||
1173 | = realmode_get_cr(ctxt->vcpu, 0); | ||
1174 | break; | ||
1175 | case 6: /* lmsw */ | ||
1176 | if (modrm_mod != 3) | ||
1177 | goto cannot_emulate; | ||
1178 | realmode_lmsw(ctxt->vcpu, (u16)modrm_val, &_eflags); | ||
1179 | break; | ||
1180 | case 7: /* invlpg*/ | ||
1181 | emulate_invlpg(ctxt->vcpu, cr2); | ||
1182 | break; | ||
1183 | default: | ||
1184 | goto cannot_emulate; | ||
1185 | } | ||
1186 | break; | ||
1187 | case 0x21: /* mov from dr to reg */ | ||
1188 | if (modrm_mod != 3) | ||
1189 | goto cannot_emulate; | ||
1190 | rc = emulator_get_dr(ctxt, modrm_reg, &_regs[modrm_rm]); | ||
1191 | break; | ||
1192 | case 0x23: /* mov from reg to dr */ | ||
1193 | if (modrm_mod != 3) | ||
1194 | goto cannot_emulate; | ||
1195 | rc = emulator_set_dr(ctxt, modrm_reg, _regs[modrm_rm]); | ||
1196 | break; | ||
1197 | case 0x40 ... 0x4f: /* cmov */ | ||
1198 | dst.val = dst.orig_val = src.val; | ||
1199 | d &= ~Mov; /* default to no move */ | ||
1200 | /* | ||
1201 | * First, assume we're decoding an even cmov opcode | ||
1202 | * (lsb == 0). | ||
1203 | */ | ||
1204 | switch ((b & 15) >> 1) { | ||
1205 | case 0: /* cmovo */ | ||
1206 | d |= (_eflags & EFLG_OF) ? Mov : 0; | ||
1207 | break; | ||
1208 | case 1: /* cmovb/cmovc/cmovnae */ | ||
1209 | d |= (_eflags & EFLG_CF) ? Mov : 0; | ||
1210 | break; | ||
1211 | case 2: /* cmovz/cmove */ | ||
1212 | d |= (_eflags & EFLG_ZF) ? Mov : 0; | ||
1213 | break; | ||
1214 | case 3: /* cmovbe/cmovna */ | ||
1215 | d |= (_eflags & (EFLG_CF | EFLG_ZF)) ? Mov : 0; | ||
1216 | break; | ||
1217 | case 4: /* cmovs */ | ||
1218 | d |= (_eflags & EFLG_SF) ? Mov : 0; | ||
1219 | break; | ||
1220 | case 5: /* cmovp/cmovpe */ | ||
1221 | d |= (_eflags & EFLG_PF) ? Mov : 0; | ||
1222 | break; | ||
1223 | case 7: /* cmovle/cmovng */ | ||
1224 | d |= (_eflags & EFLG_ZF) ? Mov : 0; | ||
1225 | /* fall through */ | ||
1226 | case 6: /* cmovl/cmovnge */ | ||
1227 | d |= (!(_eflags & EFLG_SF) != | ||
1228 | !(_eflags & EFLG_OF)) ? Mov : 0; | ||
1229 | break; | ||
1230 | } | ||
1231 | /* Odd cmov opcodes (lsb == 1) have inverted sense. */ | ||
1232 | d ^= (b & 1) ? Mov : 0; | ||
1233 | break; | ||
1234 | case 0xb0 ... 0xb1: /* cmpxchg */ | ||
1235 | /* | ||
1236 | * Save real source value, then compare EAX against | ||
1237 | * destination. | ||
1238 | */ | ||
1239 | src.orig_val = src.val; | ||
1240 | src.val = _regs[VCPU_REGS_RAX]; | ||
1241 | emulate_2op_SrcV("cmp", src, dst, _eflags); | ||
1242 | /* Always write back. The question is: where to? */ | ||
1243 | d |= Mov; | ||
1244 | if (_eflags & EFLG_ZF) { | ||
1245 | /* Success: write back to memory. */ | ||
1246 | dst.val = src.orig_val; | ||
1247 | } else { | ||
1248 | /* Failure: write the value we saw to EAX. */ | ||
1249 | dst.type = OP_REG; | ||
1250 | dst.ptr = (unsigned long *)&_regs[VCPU_REGS_RAX]; | ||
1251 | } | ||
1252 | break; | ||
1253 | case 0xa3: | ||
1254 | bt: /* bt */ | ||
1255 | src.val &= (dst.bytes << 3) - 1; /* only subword offset */ | ||
1256 | emulate_2op_SrcV_nobyte("bt", src, dst, _eflags); | ||
1257 | break; | ||
1258 | case 0xb3: | ||
1259 | btr: /* btr */ | ||
1260 | src.val &= (dst.bytes << 3) - 1; /* only subword offset */ | ||
1261 | emulate_2op_SrcV_nobyte("btr", src, dst, _eflags); | ||
1262 | break; | ||
1263 | case 0xab: | ||
1264 | bts: /* bts */ | ||
1265 | src.val &= (dst.bytes << 3) - 1; /* only subword offset */ | ||
1266 | emulate_2op_SrcV_nobyte("bts", src, dst, _eflags); | ||
1267 | break; | ||
1268 | case 0xb6 ... 0xb7: /* movzx */ | ||
1269 | dst.bytes = op_bytes; | ||
1270 | dst.val = (d & ByteOp) ? (u8) src.val : (u16) src.val; | ||
1271 | break; | ||
1272 | case 0xbb: | ||
1273 | btc: /* btc */ | ||
1274 | src.val &= (dst.bytes << 3) - 1; /* only subword offset */ | ||
1275 | emulate_2op_SrcV_nobyte("btc", src, dst, _eflags); | ||
1276 | break; | ||
1277 | case 0xba: /* Grp8 */ | ||
1278 | switch (modrm_reg & 3) { | ||
1279 | case 0: | ||
1280 | goto bt; | ||
1281 | case 1: | ||
1282 | goto bts; | ||
1283 | case 2: | ||
1284 | goto btr; | ||
1285 | case 3: | ||
1286 | goto btc; | ||
1287 | } | ||
1288 | break; | ||
1289 | case 0xbe ... 0xbf: /* movsx */ | ||
1290 | dst.bytes = op_bytes; | ||
1291 | dst.val = (d & ByteOp) ? (s8) src.val : (s16) src.val; | ||
1292 | break; | ||
1293 | } | ||
1294 | goto writeback; | ||
1295 | |||
1296 | twobyte_special_insn: | ||
1297 | /* Disable writeback. */ | ||
1298 | dst.orig_val = dst.val; | ||
1299 | switch (b) { | ||
1300 | case 0x0d: /* GrpP (prefetch) */ | ||
1301 | case 0x18: /* Grp16 (prefetch/nop) */ | ||
1302 | break; | ||
1303 | case 0x06: | ||
1304 | emulate_clts(ctxt->vcpu); | ||
1305 | break; | ||
1306 | case 0x20: /* mov cr, reg */ | ||
1307 | if (modrm_mod != 3) | ||
1308 | goto cannot_emulate; | ||
1309 | _regs[modrm_rm] = realmode_get_cr(ctxt->vcpu, modrm_reg); | ||
1310 | break; | ||
1311 | case 0x22: /* mov reg, cr */ | ||
1312 | if (modrm_mod != 3) | ||
1313 | goto cannot_emulate; | ||
1314 | realmode_set_cr(ctxt->vcpu, modrm_reg, modrm_val, &_eflags); | ||
1315 | break; | ||
1316 | case 0xc7: /* Grp9 (cmpxchg8b) */ | ||
1317 | #if defined(__i386__) | ||
1318 | { | ||
1319 | unsigned long old_lo, old_hi; | ||
1320 | if (((rc = ops->read_emulated(cr2 + 0, &old_lo, 4, | ||
1321 | ctxt)) != 0) | ||
1322 | || ((rc = ops->read_emulated(cr2 + 4, &old_hi, 4, | ||
1323 | ctxt)) != 0)) | ||
1324 | goto done; | ||
1325 | if ((old_lo != _regs[VCPU_REGS_RAX]) | ||
1326 | || (old_hi != _regs[VCPU_REGS_RDX])) { | ||
1327 | _regs[VCPU_REGS_RAX] = old_lo; | ||
1328 | _regs[VCPU_REGS_RDX] = old_hi; | ||
1329 | _eflags &= ~EFLG_ZF; | ||
1330 | } else if (ops->cmpxchg8b_emulated == NULL) { | ||
1331 | rc = X86EMUL_UNHANDLEABLE; | ||
1332 | goto done; | ||
1333 | } else { | ||
1334 | if ((rc = ops->cmpxchg8b_emulated(cr2, old_lo, | ||
1335 | old_hi, | ||
1336 | _regs[VCPU_REGS_RBX], | ||
1337 | _regs[VCPU_REGS_RCX], | ||
1338 | ctxt)) != 0) | ||
1339 | goto done; | ||
1340 | _eflags |= EFLG_ZF; | ||
1341 | } | ||
1342 | break; | ||
1343 | } | ||
1344 | #elif defined(__x86_64__) | ||
1345 | { | ||
1346 | unsigned long old, new; | ||
1347 | if ((rc = ops->read_emulated(cr2, &old, 8, ctxt)) != 0) | ||
1348 | goto done; | ||
1349 | if (((u32) (old >> 0) != (u32) _regs[VCPU_REGS_RAX]) || | ||
1350 | ((u32) (old >> 32) != (u32) _regs[VCPU_REGS_RDX])) { | ||
1351 | _regs[VCPU_REGS_RAX] = (u32) (old >> 0); | ||
1352 | _regs[VCPU_REGS_RDX] = (u32) (old >> 32); | ||
1353 | _eflags &= ~EFLG_ZF; | ||
1354 | } else { | ||
1355 | new = (_regs[VCPU_REGS_RCX] << 32) | (u32) _regs[VCPU_REGS_RBX]; | ||
1356 | if ((rc = ops->cmpxchg_emulated(cr2, old, | ||
1357 | new, 8, ctxt)) != 0) | ||
1358 | goto done; | ||
1359 | _eflags |= EFLG_ZF; | ||
1360 | } | ||
1361 | break; | ||
1362 | } | ||
1363 | #endif | ||
1364 | } | ||
1365 | goto writeback; | ||
1366 | |||
1367 | cannot_emulate: | ||
1368 | DPRINTF("Cannot emulate %02x\n", b); | ||
1369 | return -1; | ||
1370 | } | ||
1371 | |||
1372 | #ifdef __XEN__ | ||
1373 | |||
1374 | #include <asm/mm.h> | ||
1375 | #include <asm/uaccess.h> | ||
1376 | |||
1377 | int | ||
1378 | x86_emulate_read_std(unsigned long addr, | ||
1379 | unsigned long *val, | ||
1380 | unsigned int bytes, struct x86_emulate_ctxt *ctxt) | ||
1381 | { | ||
1382 | unsigned int rc; | ||
1383 | |||
1384 | *val = 0; | ||
1385 | |||
1386 | if ((rc = copy_from_user((void *)val, (void *)addr, bytes)) != 0) { | ||
1387 | propagate_page_fault(addr + bytes - rc, 0); /* read fault */ | ||
1388 | return X86EMUL_PROPAGATE_FAULT; | ||
1389 | } | ||
1390 | |||
1391 | return X86EMUL_CONTINUE; | ||
1392 | } | ||
1393 | |||
1394 | int | ||
1395 | x86_emulate_write_std(unsigned long addr, | ||
1396 | unsigned long val, | ||
1397 | unsigned int bytes, struct x86_emulate_ctxt *ctxt) | ||
1398 | { | ||
1399 | unsigned int rc; | ||
1400 | |||
1401 | if ((rc = copy_to_user((void *)addr, (void *)&val, bytes)) != 0) { | ||
1402 | propagate_page_fault(addr + bytes - rc, PGERR_write_access); | ||
1403 | return X86EMUL_PROPAGATE_FAULT; | ||
1404 | } | ||
1405 | |||
1406 | return X86EMUL_CONTINUE; | ||
1407 | } | ||
1408 | |||
1409 | #endif | ||
diff --git a/drivers/kvm/x86_emulate.h b/drivers/kvm/x86_emulate.h new file mode 100644 index 000000000000..658b58de30fc --- /dev/null +++ b/drivers/kvm/x86_emulate.h | |||
@@ -0,0 +1,185 @@ | |||
1 | /****************************************************************************** | ||
2 | * x86_emulate.h | ||
3 | * | ||
4 | * Generic x86 (32-bit and 64-bit) instruction decoder and emulator. | ||
5 | * | ||
6 | * Copyright (c) 2005 Keir Fraser | ||
7 | * | ||
8 | * From: xen-unstable 10676:af9809f51f81a3c43f276f00c81a52ef558afda4 | ||
9 | */ | ||
10 | |||
11 | #ifndef __X86_EMULATE_H__ | ||
12 | #define __X86_EMULATE_H__ | ||
13 | |||
14 | struct x86_emulate_ctxt; | ||
15 | |||
16 | /* | ||
17 | * x86_emulate_ops: | ||
18 | * | ||
19 | * These operations represent the instruction emulator's interface to memory. | ||
20 | * There are two categories of operation: those that act on ordinary memory | ||
21 | * regions (*_std), and those that act on memory regions known to require | ||
22 | * special treatment or emulation (*_emulated). | ||
23 | * | ||
24 | * The emulator assumes that an instruction accesses only one 'emulated memory' | ||
25 | * location, that this location is the given linear faulting address (cr2), and | ||
26 | * that this is one of the instruction's data operands. Instruction fetches and | ||
27 | * stack operations are assumed never to access emulated memory. The emulator | ||
28 | * automatically deduces which operand of a string-move operation is accessing | ||
29 | * emulated memory, and assumes that the other operand accesses normal memory. | ||
30 | * | ||
31 | * NOTES: | ||
32 | * 1. The emulator isn't very smart about emulated vs. standard memory. | ||
33 | * 'Emulated memory' access addresses should be checked for sanity. | ||
34 | * 'Normal memory' accesses may fault, and the caller must arrange to | ||
35 | * detect and handle reentrancy into the emulator via recursive faults. | ||
36 | * Accesses may be unaligned and may cross page boundaries. | ||
37 | * 2. If the access fails (cannot emulate, or a standard access faults) then | ||
38 | * it is up to the memop to propagate the fault to the guest VM via | ||
39 | * some out-of-band mechanism, unknown to the emulator. The memop signals | ||
40 | * failure by returning X86EMUL_PROPAGATE_FAULT to the emulator, which will | ||
41 | * then immediately bail. | ||
42 | * 3. Valid access sizes are 1, 2, 4 and 8 bytes. On x86/32 systems only | ||
43 | * cmpxchg8b_emulated need support 8-byte accesses. | ||
44 | * 4. The emulator cannot handle 64-bit mode emulation on an x86/32 system. | ||
45 | */ | ||
46 | /* Access completed successfully: continue emulation as normal. */ | ||
47 | #define X86EMUL_CONTINUE 0 | ||
48 | /* Access is unhandleable: bail from emulation and return error to caller. */ | ||
49 | #define X86EMUL_UNHANDLEABLE 1 | ||
50 | /* Terminate emulation but return success to the caller. */ | ||
51 | #define X86EMUL_PROPAGATE_FAULT 2 /* propagate a generated fault to guest */ | ||
52 | #define X86EMUL_RETRY_INSTR 2 /* retry the instruction for some reason */ | ||
53 | #define X86EMUL_CMPXCHG_FAILED 2 /* cmpxchg did not see expected value */ | ||
54 | struct x86_emulate_ops { | ||
55 | /* | ||
56 | * read_std: Read bytes of standard (non-emulated/special) memory. | ||
57 | * Used for instruction fetch, stack operations, and others. | ||
58 | * @addr: [IN ] Linear address from which to read. | ||
59 | * @val: [OUT] Value read from memory, zero-extended to 'u_long'. | ||
60 | * @bytes: [IN ] Number of bytes to read from memory. | ||
61 | */ | ||
62 | int (*read_std)(unsigned long addr, | ||
63 | unsigned long *val, | ||
64 | unsigned int bytes, struct x86_emulate_ctxt * ctxt); | ||
65 | |||
66 | /* | ||
67 | * write_std: Write bytes of standard (non-emulated/special) memory. | ||
68 | * Used for stack operations, and others. | ||
69 | * @addr: [IN ] Linear address to which to write. | ||
70 | * @val: [IN ] Value to write to memory (low-order bytes used as | ||
71 | * required). | ||
72 | * @bytes: [IN ] Number of bytes to write to memory. | ||
73 | */ | ||
74 | int (*write_std)(unsigned long addr, | ||
75 | unsigned long val, | ||
76 | unsigned int bytes, struct x86_emulate_ctxt * ctxt); | ||
77 | |||
78 | /* | ||
79 | * read_emulated: Read bytes from emulated/special memory area. | ||
80 | * @addr: [IN ] Linear address from which to read. | ||
81 | * @val: [OUT] Value read from memory, zero-extended to 'u_long'. | ||
82 | * @bytes: [IN ] Number of bytes to read from memory. | ||
83 | */ | ||
84 | int (*read_emulated) (unsigned long addr, | ||
85 | unsigned long *val, | ||
86 | unsigned int bytes, | ||
87 | struct x86_emulate_ctxt * ctxt); | ||
88 | |||
89 | /* | ||
90 | * write_emulated: Write bytes to emulated/special memory area. | ||
91 | * @addr: [IN ] Linear address to which to write. | ||
92 | * @val: [IN ] Value to write to memory (low-order bytes used as | ||
93 | * required). | ||
94 | * @bytes: [IN ] Number of bytes to write to memory. | ||
95 | */ | ||
96 | int (*write_emulated) (unsigned long addr, | ||
97 | unsigned long val, | ||
98 | unsigned int bytes, | ||
99 | struct x86_emulate_ctxt * ctxt); | ||
100 | |||
101 | /* | ||
102 | * cmpxchg_emulated: Emulate an atomic (LOCKed) CMPXCHG operation on an | ||
103 | * emulated/special memory area. | ||
104 | * @addr: [IN ] Linear address to access. | ||
105 | * @old: [IN ] Value expected to be current at @addr. | ||
106 | * @new: [IN ] Value to write to @addr. | ||
107 | * @bytes: [IN ] Number of bytes to access using CMPXCHG. | ||
108 | */ | ||
109 | int (*cmpxchg_emulated) (unsigned long addr, | ||
110 | unsigned long old, | ||
111 | unsigned long new, | ||
112 | unsigned int bytes, | ||
113 | struct x86_emulate_ctxt * ctxt); | ||
114 | |||
115 | /* | ||
116 | * cmpxchg8b_emulated: Emulate an atomic (LOCKed) CMPXCHG8B operation on an | ||
117 | * emulated/special memory area. | ||
118 | * @addr: [IN ] Linear address to access. | ||
119 | * @old: [IN ] Value expected to be current at @addr. | ||
120 | * @new: [IN ] Value to write to @addr. | ||
121 | * NOTES: | ||
122 | * 1. This function is only ever called when emulating a real CMPXCHG8B. | ||
123 | * 2. This function is *never* called on x86/64 systems. | ||
124 | * 3. Not defining this function (i.e., specifying NULL) is equivalent | ||
125 | * to defining a function that always returns X86EMUL_UNHANDLEABLE. | ||
126 | */ | ||
127 | int (*cmpxchg8b_emulated) (unsigned long addr, | ||
128 | unsigned long old_lo, | ||
129 | unsigned long old_hi, | ||
130 | unsigned long new_lo, | ||
131 | unsigned long new_hi, | ||
132 | struct x86_emulate_ctxt * ctxt); | ||
133 | }; | ||
134 | |||
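The ops table above is the emulator's whole view of the host: read_std/write_std move data to and from ordinary guest memory, while the *_emulated hooks are routed to whatever device-emulation logic backs the faulting address. The fragment below is only a rough sketch of what a host-side implementation could look like against a flat in-memory guest image, assuming the kernel's usual typedefs are in scope; guest_ram, GUEST_RAM_SIZE and the name demo_ops are invented for illustration, and the real KVM callbacks translate guest addresses through the MMU rather than indexing a flat buffer.

#include <string.h>
#include "x86_emulate.h"

#define GUEST_RAM_SIZE (16UL << 20)
static unsigned char guest_ram[GUEST_RAM_SIZE];	/* illustrative flat guest image */

static int demo_read_std(unsigned long addr, unsigned long *val,
			 unsigned int bytes, struct x86_emulate_ctxt *ctxt)
{
	if (addr + bytes > GUEST_RAM_SIZE)
		return X86EMUL_PROPAGATE_FAULT;	/* caller injects the fault */
	*val = 0;
	memcpy(val, guest_ram + addr, bytes);	/* zero-extended result */
	return X86EMUL_CONTINUE;
}

static int demo_write_std(unsigned long addr, unsigned long val,
			  unsigned int bytes, struct x86_emulate_ctxt *ctxt)
{
	if (addr + bytes > GUEST_RAM_SIZE)
		return X86EMUL_PROPAGATE_FAULT;
	memcpy(guest_ram + addr, &val, bytes);	/* low-order bytes only */
	return X86EMUL_CONTINUE;
}

static struct x86_emulate_ops demo_ops = {
	.read_std  = demo_read_std,
	.write_std = demo_write_std,
	/*
	 * read_emulated/write_emulated/cmpxchg_emulated would forward to the
	 * host's MMIO handlers; cmpxchg8b_emulated may stay NULL, which the
	 * emulator treats as X86EMUL_UNHANDLEABLE (see note 3 above).
	 */
};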
135 | struct cpu_user_regs; | ||
136 | |||
137 | struct x86_emulate_ctxt { | ||
138 | /* Register state before/after emulation. */ | ||
139 | struct kvm_vcpu *vcpu; | ||
140 | |||
141 | unsigned long eflags; | ||
142 | /* Linear faulting address (if emulating a page-faulting instruction). */ | ||
143 | unsigned long cr2; | ||
144 | |||
145 | /* Emulated execution mode, represented by an X86EMUL_MODE value. */ | ||
146 | int mode; | ||
147 | |||
148 | unsigned long cs_base; | ||
149 | unsigned long ds_base; | ||
150 | unsigned long es_base; | ||
151 | unsigned long ss_base; | ||
152 | unsigned long gs_base; | ||
153 | unsigned long fs_base; | ||
154 | }; | ||
155 | |||
156 | /* Execution mode, passed to the emulator. */ | ||
157 | #define X86EMUL_MODE_REAL 0 /* Real mode. */ | ||
158 | #define X86EMUL_MODE_PROT16 2 /* 16-bit protected mode. */ | ||
159 | #define X86EMUL_MODE_PROT32 4 /* 32-bit protected mode. */ | ||
160 | #define X86EMUL_MODE_PROT64 8 /* 64-bit (long) mode. */ | ||
161 | |||
162 | /* Host execution mode. */ | ||
163 | #if defined(__i386__) | ||
164 | #define X86EMUL_MODE_HOST X86EMUL_MODE_PROT32 | ||
165 | #elif defined(__x86_64__) | ||
166 | #define X86EMUL_MODE_HOST X86EMUL_MODE_PROT64 | ||
167 | #endif | ||
168 | |||
169 | /* | ||
170 | * x86_emulate_memop: Emulate an instruction that faulted attempting to | ||
171 | * read/write a 'special' memory area. | ||
172 | * Returns -1 on failure, 0 on success. | ||
173 | */ | ||
174 | int x86_emulate_memop(struct x86_emulate_ctxt *ctxt, | ||
175 | struct x86_emulate_ops *ops); | ||
176 | |||
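A caller is expected to reach x86_emulate_memop() from its page-fault handling path, after deciding that the faulting access belongs to emulated memory: it fills in the context (vcpu, cr2, current eflags, execution mode and segment bases) and supplies an ops table such as the demo_ops sketch above. The outline below is illustrative only; demo_get_eflags() is a placeholder accessor rather than a KVM function, and flat zero-based segments are assumed.

/* Rough call-site outline; demo_ops is the sketch shown after the ops struct. */
static int demo_handle_emulated_fault(struct kvm_vcpu *vcpu, unsigned long cr2)
{
	struct x86_emulate_ctxt ctxt;

	ctxt.vcpu    = vcpu;
	ctxt.cr2     = cr2;			/* linear faulting address       */
	ctxt.eflags  = demo_get_eflags(vcpu);	/* placeholder flags accessor    */
	ctxt.mode    = X86EMUL_MODE_PROT32;	/* or _REAL / _PROT16 / _PROT64  */
	ctxt.cs_base = ctxt.ds_base = ctxt.es_base = 0;
	ctxt.ss_base = ctxt.fs_base = ctxt.gs_base = 0;	/* flat segments assumed */

	return x86_emulate_memop(&ctxt, &demo_ops);	/* 0 on success, -1 otherwise */
}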
177 | /* | ||
178 | * Given the 'reg' portion of a ModRM byte, and a register block, return a | ||
179 | * pointer into the block that addresses the relevant register. | ||
180 | * @highbyte_regs specifies whether to decode AH,CH,DH,BH. | ||
181 | */ | ||
182 | void *decode_register(u8 modrm_reg, unsigned long *regs, | ||
183 | int highbyte_regs); | ||
184 | |||
185 | #endif /* __X86_EMULATE_H__ */ | ||