path: root/drivers/kvm/vmx.c
author     Avi Kivity <avi@qumranet.com>            2006-12-10 05:21:36 -0500
committer  Linus Torvalds <torvalds@woody.osdl.org> 2006-12-10 12:57:22 -0500
commit     6aa8b732ca01c3d7a54e93f4d701b8aabbe60fb7 (patch)
tree       23fcbe6f4918cacdae26d513a2bd13e91d8b4c38 /drivers/kvm/vmx.c
parent     f5f1a24a2caa299bb7d294aee92d7dd3410d9ed7 (diff)
[PATCH] kvm: userspace interface
web site: http://kvm.sourceforge.net
mailing list: kvm-devel@lists.sourceforge.net (http://lists.sourceforge.net/lists/listinfo/kvm-devel)

The following patchset adds a driver for Intel's hardware virtualization extensions to the x86 architecture. The driver adds a character device (/dev/kvm) that exposes the virtualization capabilities to userspace. Using this driver, a process can run a virtual machine (a "guest") in a fully virtualized PC containing its own virtual hard disks, network adapters, and display.

Using this driver, one can start multiple virtual machines on a host. Each virtual machine is a process on the host; a virtual cpu is a thread in that process. kill(1), nice(1), top(1) work as expected.

In effect, the driver adds a third execution mode to the existing two: we now have kernel mode, user mode, and guest mode. Guest mode has its own address space mapping guest physical memory (which is accessible to user mode by mmap()ing /dev/kvm). Guest mode has no access to any I/O devices; any such access is intercepted and directed to user mode for emulation.

The driver supports i386 and x86_64 hosts and guests. All combinations are allowed except x86_64 guest on i386 host. For i386 guests and hosts, both pae and non-pae paging modes are supported. SMP hosts and UP guests are supported. At the moment only Intel hardware is supported, but AMD virtualization support is being worked on.

Performance currently is non-stellar due to the naive implementation of the mmu virtualization, which throws away most of the shadow page table entries every context switch. We plan to address this in two ways:

- cache shadow page tables across tlb flushes
- wait until AMD and Intel release processors with nested page tables

Currently a virtual desktop is responsive but consumes a lot of CPU. Under Windows I tried playing pinball and watching a few flash movies; with a recent CPU one can hardly feel the virtualization. Linux/X is slower, probably due to X being in a separate process.

In addition to the driver, you need a slightly modified qemu to provide I/O device emulation and the BIOS.

Caveats (akpm: might no longer be true):

- The Windows install currently bluescreens due to a problem with the virtual APIC. We are working on a fix. A temporary workaround is to use an existing image or install through qemu.
- Windows 64-bit does not work. That's also true for qemu, so it's probably a problem with the device model.

[bero@arklinux.org: build fix]
[simon.kagstrom@bth.se: build fix, other fixes]
[uril@qumranet.com: KVM: Expose interrupt bitmap]
[akpm@osdl.org: i386 build fix]
[mingo@elte.hu: i386 fixes]
[rdreier@cisco.com: add log levels to all printks]
[randy.dunlap@oracle.com: Fix sparse NULL and C99 struct init warnings]
[anthony@codemonkey.ws: KVM: AMD SVM: 32-bit host support]

Signed-off-by: Yaniv Kamay <yaniv@qumranet.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
Cc: Simon Kagstrom <simon.kagstrom@bth.se>
Cc: Bernhard Rosenkraenzer <bero@arklinux.org>
Signed-off-by: Uri Lublin <uril@qumranet.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Roland Dreier <rolandd@cisco.com>
Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Signed-off-by: Anthony Liguori <anthony@codemonkey.ws>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
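For orientation, a minimal userspace sketch of the process/vcpu model described above might look like the following. It uses the present-day /dev/kvm ioctl interface (KVM_CREATE_VM, KVM_CREATE_VCPU, KVM_GET_VCPU_MMAP_SIZE, KVM_RUN from <linux/kvm.h>), which is not identical to the ioctl set this 2006 patch introduces, and it omits error handling and guest memory/register setup.

/*
 * Orientation sketch only: uses today's KVM ioctl interface, not the one
 * added by this patch.  No error handling, no guest memory or register setup.
 */
#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <linux/kvm.h>

int main(void)
{
	int kvm = open("/dev/kvm", O_RDWR);        /* character device exposed by the driver */
	int vm = ioctl(kvm, KVM_CREATE_VM, 0);     /* the virtual machine lives in this process */
	int vcpu = ioctl(vm, KVM_CREATE_VCPU, 0);  /* one vcpu; each vcpu would get its own thread */

	/* shared area userspace reads to see why the guest exited */
	int run_size = ioctl(kvm, KVM_GET_VCPU_MMAP_SIZE, 0);
	struct kvm_run *run = mmap(NULL, run_size, PROT_READ | PROT_WRITE,
				   MAP_SHARED, vcpu, 0);

	/* guest memory, registers and code would be set up here */

	ioctl(vcpu, KVM_RUN, 0);                   /* enter guest mode until the next exit */
	printf("exit reason: %u\n", run->exit_reason);
	return 0;
}

Each additional guest vcpu would be another thread running its own KVM_RUN loop, which is why kill(1), nice(1), and top(1) apply to guests just as they do to ordinary processes.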
Diffstat (limited to 'drivers/kvm/vmx.c')
-rw-r--r--  drivers/kvm/vmx.c  2002
1 file changed, 2002 insertions, 0 deletions
diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c
new file mode 100644
index 000000000000..bda7a7ae2167
--- /dev/null
+++ b/drivers/kvm/vmx.c
@@ -0,0 +1,2002 @@
1/*
2 * Kernel-based Virtual Machine driver for Linux
3 *
4 * This module enables machines with Intel VT-x extensions to run virtual
5 * machines without emulation or binary translation.
6 *
7 * Copyright (C) 2006 Qumranet, Inc.
8 *
9 * Authors:
10 * Avi Kivity <avi@qumranet.com>
11 * Yaniv Kamay <yaniv@qumranet.com>
12 *
13 * This work is licensed under the terms of the GNU GPL, version 2. See
14 * the COPYING file in the top-level directory.
15 *
16 */
17
18#include "kvm.h"
19#include "vmx.h"
20#include "kvm_vmx.h"
21#include <linux/module.h>
22#include <linux/mm.h>
23#include <linux/highmem.h>
24#include <asm/io.h>
25
26#include "segment_descriptor.h"
27
28#define MSR_IA32_FEATURE_CONTROL 0x03a
29
30MODULE_AUTHOR("Qumranet");
31MODULE_LICENSE("GPL");
32
33static DEFINE_PER_CPU(struct vmcs *, vmxarea);
34static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
35
36#ifdef __x86_64__
37#define HOST_IS_64 1
38#else
39#define HOST_IS_64 0
40#endif
41
42static struct vmcs_descriptor {
43 int size;
44 int order;
45 u32 revision_id;
46} vmcs_descriptor;
47
48#define VMX_SEGMENT_FIELD(seg) \
49 [VCPU_SREG_##seg] = { \
50 .selector = GUEST_##seg##_SELECTOR, \
51 .base = GUEST_##seg##_BASE, \
52 .limit = GUEST_##seg##_LIMIT, \
53 .ar_bytes = GUEST_##seg##_AR_BYTES, \
54 }
55
56static struct kvm_vmx_segment_field {
57 unsigned selector;
58 unsigned base;
59 unsigned limit;
60 unsigned ar_bytes;
61} kvm_vmx_segment_fields[] = {
62 VMX_SEGMENT_FIELD(CS),
63 VMX_SEGMENT_FIELD(DS),
64 VMX_SEGMENT_FIELD(ES),
65 VMX_SEGMENT_FIELD(FS),
66 VMX_SEGMENT_FIELD(GS),
67 VMX_SEGMENT_FIELD(SS),
68 VMX_SEGMENT_FIELD(TR),
69 VMX_SEGMENT_FIELD(LDTR),
70};
71
72static const u32 vmx_msr_index[] = {
73#ifdef __x86_64__
74 MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, MSR_KERNEL_GS_BASE,
75#endif
76 MSR_EFER, MSR_K6_STAR,
77};
78#define NR_VMX_MSR (sizeof(vmx_msr_index) / sizeof(*vmx_msr_index))
79
80struct vmx_msr_entry *find_msr_entry(struct kvm_vcpu *vcpu, u32 msr);
81
82static inline int is_page_fault(u32 intr_info)
83{
84 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
85 INTR_INFO_VALID_MASK)) ==
86 (INTR_TYPE_EXCEPTION | PF_VECTOR | INTR_INFO_VALID_MASK);
87}
88
89static inline int is_external_interrupt(u32 intr_info)
90{
91 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
92 == (INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK);
93}
94
95static void vmcs_clear(struct vmcs *vmcs)
96{
97 u64 phys_addr = __pa(vmcs);
98 u8 error;
99
100 asm volatile (ASM_VMX_VMCLEAR_RAX "; setna %0"
101 : "=g"(error) : "a"(&phys_addr), "m"(phys_addr)
102 : "cc", "memory");
103 if (error)
104 printk(KERN_ERR "kvm: vmclear fail: %p/%llx\n",
105 vmcs, phys_addr);
106}
107
108static void __vcpu_clear(void *arg)
109{
110 struct kvm_vcpu *vcpu = arg;
111 int cpu = smp_processor_id();
112
113 if (vcpu->cpu == cpu)
114 vmcs_clear(vcpu->vmcs);
115 if (per_cpu(current_vmcs, cpu) == vcpu->vmcs)
116 per_cpu(current_vmcs, cpu) = NULL;
117}
118
119static unsigned long vmcs_readl(unsigned long field)
120{
121 unsigned long value;
122
123 asm volatile (ASM_VMX_VMREAD_RDX_RAX
124 : "=a"(value) : "d"(field) : "cc");
125 return value;
126}
127
128static u16 vmcs_read16(unsigned long field)
129{
130 return vmcs_readl(field);
131}
132
133static u32 vmcs_read32(unsigned long field)
134{
135 return vmcs_readl(field);
136}
137
138static u64 vmcs_read64(unsigned long field)
139{
140#ifdef __x86_64__
141 return vmcs_readl(field);
142#else
143 return vmcs_readl(field) | ((u64)vmcs_readl(field+1) << 32);
144#endif
145}
146
147static void vmcs_writel(unsigned long field, unsigned long value)
148{
149 u8 error;
150
151 asm volatile (ASM_VMX_VMWRITE_RAX_RDX "; setna %0"
152 : "=q"(error) : "a"(value), "d"(field) : "cc" );
153 if (error)
154 printk(KERN_ERR "vmwrite error: reg %lx value %lx (err %d)\n",
155 field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
156}
157
158static void vmcs_write16(unsigned long field, u16 value)
159{
160 vmcs_writel(field, value);
161}
162
163static void vmcs_write32(unsigned long field, u32 value)
164{
165 vmcs_writel(field, value);
166}
167
168static void vmcs_write64(unsigned long field, u64 value)
169{
170#ifdef __x86_64__
171 vmcs_writel(field, value);
172#else
173 vmcs_writel(field, value);
174 asm volatile ("");
175 vmcs_writel(field+1, value >> 32);
176#endif
177}
178
179/*
180 * Switches to specified vcpu, until a matching vcpu_put(), but assumes
181 * vcpu mutex is already taken.
182 */
183static struct kvm_vcpu *vmx_vcpu_load(struct kvm_vcpu *vcpu)
184{
185 u64 phys_addr = __pa(vcpu->vmcs);
186 int cpu;
187
188 cpu = get_cpu();
189
190 if (vcpu->cpu != cpu) {
191 smp_call_function(__vcpu_clear, vcpu, 0, 1);
192 vcpu->launched = 0;
193 }
194
195 if (per_cpu(current_vmcs, cpu) != vcpu->vmcs) {
196 u8 error;
197
198 per_cpu(current_vmcs, cpu) = vcpu->vmcs;
199 asm volatile (ASM_VMX_VMPTRLD_RAX "; setna %0"
200 : "=g"(error) : "a"(&phys_addr), "m"(phys_addr)
201 : "cc");
202 if (error)
203 printk(KERN_ERR "kvm: vmptrld %p/%llx fail\n",
204 vcpu->vmcs, phys_addr);
205 }
206
207 if (vcpu->cpu != cpu) {
208 struct descriptor_table dt;
209 unsigned long sysenter_esp;
210
211 vcpu->cpu = cpu;
212 /*
213 * Linux uses per-cpu TSS and GDT, so set these when switching
214 * processors.
215 */
216 vmcs_writel(HOST_TR_BASE, read_tr_base()); /* 22.2.4 */
217 get_gdt(&dt);
218 vmcs_writel(HOST_GDTR_BASE, dt.base); /* 22.2.4 */
219
220 rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
221 vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
222 }
223 return vcpu;
224}
225
226static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
227{
228 put_cpu();
229}
230
231static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
232{
233 return vmcs_readl(GUEST_RFLAGS);
234}
235
236static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
237{
238 vmcs_writel(GUEST_RFLAGS, rflags);
239}
240
241static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
242{
243 unsigned long rip;
244 u32 interruptibility;
245
246 rip = vmcs_readl(GUEST_RIP);
247 rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
248 vmcs_writel(GUEST_RIP, rip);
249
250 /*
251 * We emulated an instruction, so temporary interrupt blocking
252 * should be removed, if set.
253 */
254 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
255 if (interruptibility & 3)
256 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
257 interruptibility & ~3);
258}
259
260static void vmx_inject_gp(struct kvm_vcpu *vcpu, unsigned error_code)
261{
262 printk(KERN_DEBUG "inject_general_protection: rip 0x%lx\n",
263 vmcs_readl(GUEST_RIP));
264 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
265 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
266 GP_VECTOR |
267 INTR_TYPE_EXCEPTION |
268 INTR_INFO_DELIEVER_CODE_MASK |
269 INTR_INFO_VALID_MASK);
270}
271
272/*
273 * reads and returns guest's timestamp counter "register"
274 * guest_tsc = host_tsc + tsc_offset -- 21.3
275 */
276static u64 guest_read_tsc(void)
277{
278 u64 host_tsc, tsc_offset;
279
280 rdtscll(host_tsc);
281 tsc_offset = vmcs_read64(TSC_OFFSET);
282 return host_tsc + tsc_offset;
283}
284
285/*
286 * writes 'guest_tsc' into guest's timestamp counter "register"
287 * guest_tsc = host_tsc + tsc_offset ==> tsc_offset = guest_tsc - host_tsc
288 */
289static void guest_write_tsc(u64 guest_tsc)
290{
291 u64 host_tsc;
292
293 rdtscll(host_tsc);
294 vmcs_write64(TSC_OFFSET, guest_tsc - host_tsc);
295}
296
297static void reload_tss(void)
298{
299#ifndef __x86_64__
300
301 /*
302 * VT restores TR but not its size. Useless.
303 */
304 struct descriptor_table gdt;
305 struct segment_descriptor *descs;
306
307 get_gdt(&gdt);
308 descs = (void *)gdt.base;
309 descs[GDT_ENTRY_TSS].type = 9; /* available TSS */
310 load_TR_desc();
311#endif
312}
313
314/*
315 * Reads an msr value (of 'msr_index') into 'pdata'.
316 * Returns 0 on success, non-0 otherwise.
317 * Assumes vcpu_load() was already called.
318 */
319static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
320{
321 u64 data;
322 struct vmx_msr_entry *msr;
323
324 if (!pdata) {
325 printk(KERN_ERR "BUG: get_msr called with NULL pdata\n");
326 return -EINVAL;
327 }
328
329 switch (msr_index) {
330#ifdef __x86_64__
331 case MSR_FS_BASE:
332 data = vmcs_readl(GUEST_FS_BASE);
333 break;
334 case MSR_GS_BASE:
335 data = vmcs_readl(GUEST_GS_BASE);
336 break;
337 case MSR_EFER:
338 data = vcpu->shadow_efer;
339 break;
340#endif
341 case MSR_IA32_TIME_STAMP_COUNTER:
342 data = guest_read_tsc();
343 break;
344 case MSR_IA32_SYSENTER_CS:
345 data = vmcs_read32(GUEST_SYSENTER_CS);
346 break;
347 case MSR_IA32_SYSENTER_EIP:
348 data = vmcs_read32(GUEST_SYSENTER_EIP);
349 break;
350 case MSR_IA32_SYSENTER_ESP:
351 data = vmcs_read32(GUEST_SYSENTER_ESP);
352 break;
353 case MSR_IA32_MC0_CTL:
354 case MSR_IA32_MCG_STATUS:
355 case MSR_IA32_MCG_CAP:
356 case MSR_IA32_MC0_MISC:
357 case MSR_IA32_MC0_MISC+4:
358 case MSR_IA32_MC0_MISC+8:
359 case MSR_IA32_MC0_MISC+12:
360 case MSR_IA32_MC0_MISC+16:
361 case MSR_IA32_UCODE_REV:
362 /* MTRR registers */
363 case 0xfe:
364 case 0x200 ... 0x2ff:
365 data = 0;
366 break;
367 case MSR_IA32_APICBASE:
368 data = vcpu->apic_base;
369 break;
370 default:
371 msr = find_msr_entry(vcpu, msr_index);
372 if (!msr) {
373 printk(KERN_ERR "kvm: unhandled rdmsr: %x\n", msr_index);
374 return 1;
375 }
376 data = msr->data;
377 break;
378 }
379
380 *pdata = data;
381 return 0;
382}
383
384/*
385 * Writes msr value into the appropriate "register".
386 * Returns 0 on success, non-0 otherwise.
387 * Assumes vcpu_load() was already called.
388 */
389static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
390{
391 struct vmx_msr_entry *msr;
392 switch (msr_index) {
393#ifdef __x86_64__
394 case MSR_FS_BASE:
395 vmcs_writel(GUEST_FS_BASE, data);
396 break;
397 case MSR_GS_BASE:
398 vmcs_writel(GUEST_GS_BASE, data);
399 break;
400#endif
401 case MSR_IA32_SYSENTER_CS:
402 vmcs_write32(GUEST_SYSENTER_CS, data);
403 break;
404 case MSR_IA32_SYSENTER_EIP:
405 vmcs_write32(GUEST_SYSENTER_EIP, data);
406 break;
407 case MSR_IA32_SYSENTER_ESP:
408 vmcs_write32(GUEST_SYSENTER_ESP, data);
409 break;
410#ifdef __x86_64__
411 case MSR_EFER:
412 set_efer(vcpu, data);
413 break;
414 case MSR_IA32_MC0_STATUS:
415 printk(KERN_WARNING "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n"
416 , __FUNCTION__, data);
417 break;
418#endif
419 case MSR_IA32_TIME_STAMP_COUNTER: {
420 guest_write_tsc(data);
421 break;
422 }
423 case MSR_IA32_UCODE_REV:
424 case MSR_IA32_UCODE_WRITE:
425 case 0x200 ... 0x2ff: /* MTRRs */
426 break;
427 case MSR_IA32_APICBASE:
428 vcpu->apic_base = data;
429 break;
430 default:
431 msr = find_msr_entry(vcpu, msr_index);
432 if (!msr) {
433 printk(KERN_ERR "kvm: unhandled wrmsr: 0x%x\n", msr_index);
434 return 1;
435 }
436 msr->data = data;
437 break;
438 }
439
440 return 0;
441}
442
443/*
444 * Sync the rsp and rip registers into the vcpu structure. This allows
445 * registers to be accessed by indexing vcpu->regs.
446 */
447static void vcpu_load_rsp_rip(struct kvm_vcpu *vcpu)
448{
449 vcpu->regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP);
450 vcpu->rip = vmcs_readl(GUEST_RIP);
451}
452
453/*
454 * Syncs rsp and rip back into the vmcs. Should be called after possible
455 * modification.
456 */
457static void vcpu_put_rsp_rip(struct kvm_vcpu *vcpu)
458{
459 vmcs_writel(GUEST_RSP, vcpu->regs[VCPU_REGS_RSP]);
460 vmcs_writel(GUEST_RIP, vcpu->rip);
461}
462
463static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg)
464{
465 unsigned long dr7 = 0x400;
466 u32 exception_bitmap;
467 int old_singlestep;
468
469 exception_bitmap = vmcs_read32(EXCEPTION_BITMAP);
470 old_singlestep = vcpu->guest_debug.singlestep;
471
472 vcpu->guest_debug.enabled = dbg->enabled;
473 if (vcpu->guest_debug.enabled) {
474 int i;
475
476 dr7 |= 0x200; /* exact */
477 for (i = 0; i < 4; ++i) {
478 if (!dbg->breakpoints[i].enabled)
479 continue;
480 vcpu->guest_debug.bp[i] = dbg->breakpoints[i].address;
481 dr7 |= 2 << (i*2); /* global enable */
482 dr7 |= 0 << (i*4+16); /* execution breakpoint */
483 }
484
485 exception_bitmap |= (1u << 1); /* Trap debug exceptions */
486
487 vcpu->guest_debug.singlestep = dbg->singlestep;
488 } else {
489 exception_bitmap &= ~(1u << 1); /* Ignore debug exceptions */
490 vcpu->guest_debug.singlestep = 0;
491 }
492
493 if (old_singlestep && !vcpu->guest_debug.singlestep) {
494 unsigned long flags;
495
496 flags = vmcs_readl(GUEST_RFLAGS);
497 flags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
498 vmcs_writel(GUEST_RFLAGS, flags);
499 }
500
501 vmcs_write32(EXCEPTION_BITMAP, exception_bitmap);
502 vmcs_writel(GUEST_DR7, dr7);
503
504 return 0;
505}
506
507static __init int cpu_has_kvm_support(void)
508{
509 unsigned long ecx = cpuid_ecx(1);
510 return test_bit(5, &ecx); /* CPUID.1:ECX.VMX[bit 5] -> VT */
511}
512
513static __init int vmx_disabled_by_bios(void)
514{
515 u64 msr;
516
517 rdmsrl(MSR_IA32_FEATURE_CONTROL, msr);
518 return (msr & 5) == 1; /* locked but not enabled */
519}
520
521static __init void hardware_enable(void *garbage)
522{
523 int cpu = raw_smp_processor_id();
524 u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
525 u64 old;
526
527 rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
528 if ((old & 5) == 0)
529 /* enable and lock */
530 wrmsrl(MSR_IA32_FEATURE_CONTROL, old | 5);
531 write_cr4(read_cr4() | CR4_VMXE); /* FIXME: not cpu hotplug safe */
532 asm volatile (ASM_VMX_VMXON_RAX : : "a"(&phys_addr), "m"(phys_addr)
533 : "memory", "cc");
534}
535
536static void hardware_disable(void *garbage)
537{
538 asm volatile (ASM_VMX_VMXOFF : : : "cc");
539}
540
541static __init void setup_vmcs_descriptor(void)
542{
543 u32 vmx_msr_low, vmx_msr_high;
544
545 rdmsr(MSR_IA32_VMX_BASIC_MSR, vmx_msr_low, vmx_msr_high);
546 vmcs_descriptor.size = vmx_msr_high & 0x1fff;
547 vmcs_descriptor.order = get_order(vmcs_descriptor.size);
548 vmcs_descriptor.revision_id = vmx_msr_low;
549}
550
551static struct vmcs *alloc_vmcs_cpu(int cpu)
552{
553 int node = cpu_to_node(cpu);
554 struct page *pages;
555 struct vmcs *vmcs;
556
557 pages = alloc_pages_node(node, GFP_KERNEL, vmcs_descriptor.order);
558 if (!pages)
559 return NULL;
560 vmcs = page_address(pages);
561 memset(vmcs, 0, vmcs_descriptor.size);
562 vmcs->revision_id = vmcs_descriptor.revision_id; /* vmcs revision id */
563 return vmcs;
564}
565
566static struct vmcs *alloc_vmcs(void)
567{
568 return alloc_vmcs_cpu(smp_processor_id());
569}
570
571static void free_vmcs(struct vmcs *vmcs)
572{
573 free_pages((unsigned long)vmcs, vmcs_descriptor.order);
574}
575
576static __exit void free_kvm_area(void)
577{
578 int cpu;
579
580 for_each_online_cpu(cpu)
581 free_vmcs(per_cpu(vmxarea, cpu));
582}
583
584extern struct vmcs *alloc_vmcs_cpu(int cpu);
585
586static __init int alloc_kvm_area(void)
587{
588 int cpu;
589
590 for_each_online_cpu(cpu) {
591 struct vmcs *vmcs;
592
593 vmcs = alloc_vmcs_cpu(cpu);
594 if (!vmcs) {
595 free_kvm_area();
596 return -ENOMEM;
597 }
598
599 per_cpu(vmxarea, cpu) = vmcs;
600 }
601 return 0;
602}
603
604static __init int hardware_setup(void)
605{
606 setup_vmcs_descriptor();
607 return alloc_kvm_area();
608}
609
610static __exit void hardware_unsetup(void)
611{
612 free_kvm_area();
613}
614
615static void update_exception_bitmap(struct kvm_vcpu *vcpu)
616{
617 if (vcpu->rmode.active)
618 vmcs_write32(EXCEPTION_BITMAP, ~0);
619 else
620 vmcs_write32(EXCEPTION_BITMAP, 1 << PF_VECTOR);
621}
622
623static void fix_pmode_dataseg(int seg, struct kvm_save_segment *save)
624{
625 struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
626
627 if (vmcs_readl(sf->base) == save->base) {
628 vmcs_write16(sf->selector, save->selector);
629 vmcs_writel(sf->base, save->base);
630 vmcs_write32(sf->limit, save->limit);
631 vmcs_write32(sf->ar_bytes, save->ar);
632 } else {
633 u32 dpl = (vmcs_read16(sf->selector) & SELECTOR_RPL_MASK)
634 << AR_DPL_SHIFT;
635 vmcs_write32(sf->ar_bytes, 0x93 | dpl);
636 }
637}
638
639static void enter_pmode(struct kvm_vcpu *vcpu)
640{
641 unsigned long flags;
642
643 vcpu->rmode.active = 0;
644
645 vmcs_writel(GUEST_TR_BASE, vcpu->rmode.tr.base);
646 vmcs_write32(GUEST_TR_LIMIT, vcpu->rmode.tr.limit);
647 vmcs_write32(GUEST_TR_AR_BYTES, vcpu->rmode.tr.ar);
648
649 flags = vmcs_readl(GUEST_RFLAGS);
650 flags &= ~(IOPL_MASK | X86_EFLAGS_VM);
651 flags |= (vcpu->rmode.save_iopl << IOPL_SHIFT);
652 vmcs_writel(GUEST_RFLAGS, flags);
653
654 vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~CR4_VME_MASK) |
655 (vmcs_readl(CR4_READ_SHADOW) & CR4_VME_MASK));
656
657 update_exception_bitmap(vcpu);
658
659 fix_pmode_dataseg(VCPU_SREG_ES, &vcpu->rmode.es);
660 fix_pmode_dataseg(VCPU_SREG_DS, &vcpu->rmode.ds);
661 fix_pmode_dataseg(VCPU_SREG_GS, &vcpu->rmode.gs);
662 fix_pmode_dataseg(VCPU_SREG_FS, &vcpu->rmode.fs);
663
664 vmcs_write16(GUEST_SS_SELECTOR, 0);
665 vmcs_write32(GUEST_SS_AR_BYTES, 0x93);
666
667 vmcs_write16(GUEST_CS_SELECTOR,
668 vmcs_read16(GUEST_CS_SELECTOR) & ~SELECTOR_RPL_MASK);
669 vmcs_write32(GUEST_CS_AR_BYTES, 0x9b);
670}
671
672static int rmode_tss_base(struct kvm* kvm)
673{
674 gfn_t base_gfn = kvm->memslots[0].base_gfn + kvm->memslots[0].npages - 3;
675 return base_gfn << PAGE_SHIFT;
676}
677
678static void fix_rmode_seg(int seg, struct kvm_save_segment *save)
679{
680 struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
681
682 save->selector = vmcs_read16(sf->selector);
683 save->base = vmcs_readl(sf->base);
684 save->limit = vmcs_read32(sf->limit);
685 save->ar = vmcs_read32(sf->ar_bytes);
686 vmcs_write16(sf->selector, vmcs_readl(sf->base) >> 4);
687 vmcs_write32(sf->limit, 0xffff);
688 vmcs_write32(sf->ar_bytes, 0xf3);
689}
690
691static void enter_rmode(struct kvm_vcpu *vcpu)
692{
693 unsigned long flags;
694
695 vcpu->rmode.active = 1;
696
697 vcpu->rmode.tr.base = vmcs_readl(GUEST_TR_BASE);
698 vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm));
699
700 vcpu->rmode.tr.limit = vmcs_read32(GUEST_TR_LIMIT);
701 vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1);
702
703 vcpu->rmode.tr.ar = vmcs_read32(GUEST_TR_AR_BYTES);
704 vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
705
706 flags = vmcs_readl(GUEST_RFLAGS);
707 vcpu->rmode.save_iopl = (flags & IOPL_MASK) >> IOPL_SHIFT;
708
709 flags |= IOPL_MASK | X86_EFLAGS_VM;
710
711 vmcs_writel(GUEST_RFLAGS, flags);
712 vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | CR4_VME_MASK);
713 update_exception_bitmap(vcpu);
714
715 vmcs_write16(GUEST_SS_SELECTOR, vmcs_readl(GUEST_SS_BASE) >> 4);
716 vmcs_write32(GUEST_SS_LIMIT, 0xffff);
717 vmcs_write32(GUEST_SS_AR_BYTES, 0xf3);
718
719 vmcs_write32(GUEST_CS_AR_BYTES, 0xf3);
720 vmcs_write16(GUEST_CS_SELECTOR, vmcs_readl(GUEST_CS_BASE) >> 4);
721
722 fix_rmode_seg(VCPU_SREG_ES, &vcpu->rmode.es);
723 fix_rmode_seg(VCPU_SREG_DS, &vcpu->rmode.ds);
724 fix_rmode_seg(VCPU_SREG_GS, &vcpu->rmode.gs);
725 fix_rmode_seg(VCPU_SREG_FS, &vcpu->rmode.fs);
726}
727
728#ifdef __x86_64__
729
730static void enter_lmode(struct kvm_vcpu *vcpu)
731{
732 u32 guest_tr_ar;
733
734 guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES);
735 if ((guest_tr_ar & AR_TYPE_MASK) != AR_TYPE_BUSY_64_TSS) {
736 printk(KERN_DEBUG "%s: tss fixup for long mode. \n",
737 __FUNCTION__);
738 vmcs_write32(GUEST_TR_AR_BYTES,
739 (guest_tr_ar & ~AR_TYPE_MASK)
740 | AR_TYPE_BUSY_64_TSS);
741 }
742
743 vcpu->shadow_efer |= EFER_LMA;
744
745 find_msr_entry(vcpu, MSR_EFER)->data |= EFER_LMA | EFER_LME;
746 vmcs_write32(VM_ENTRY_CONTROLS,
747 vmcs_read32(VM_ENTRY_CONTROLS)
748 | VM_ENTRY_CONTROLS_IA32E_MASK);
749}
750
751static void exit_lmode(struct kvm_vcpu *vcpu)
752{
753 vcpu->shadow_efer &= ~EFER_LMA;
754
755 vmcs_write32(VM_ENTRY_CONTROLS,
756 vmcs_read32(VM_ENTRY_CONTROLS)
757 & ~VM_ENTRY_CONTROLS_IA32E_MASK);
758}
759
760#endif
761
762static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
763{
764 if (vcpu->rmode.active && (cr0 & CR0_PE_MASK))
765 enter_pmode(vcpu);
766
767 if (!vcpu->rmode.active && !(cr0 & CR0_PE_MASK))
768 enter_rmode(vcpu);
769
770#ifdef __x86_64__
771 if (vcpu->shadow_efer & EFER_LME) {
772 if (!is_paging(vcpu) && (cr0 & CR0_PG_MASK))
773 enter_lmode(vcpu);
774 if (is_paging(vcpu) && !(cr0 & CR0_PG_MASK))
775 exit_lmode(vcpu);
776 }
777#endif
778
779 vmcs_writel(CR0_READ_SHADOW, cr0);
780 vmcs_writel(GUEST_CR0,
781 (cr0 & ~KVM_GUEST_CR0_MASK) | KVM_VM_CR0_ALWAYS_ON);
782 vcpu->cr0 = cr0;
783}
784
785/*
786 * Used when restoring the VM to avoid corrupting segment registers
787 */
788static void vmx_set_cr0_no_modeswitch(struct kvm_vcpu *vcpu, unsigned long cr0)
789{
790 vcpu->rmode.active = ((cr0 & CR0_PE_MASK) == 0);
791 update_exception_bitmap(vcpu);
792 vmcs_writel(CR0_READ_SHADOW, cr0);
793 vmcs_writel(GUEST_CR0,
794 (cr0 & ~KVM_GUEST_CR0_MASK) | KVM_VM_CR0_ALWAYS_ON);
795 vcpu->cr0 = cr0;
796}
797
798static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
799{
800 vmcs_writel(GUEST_CR3, cr3);
801}
802
803static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
804{
805 vmcs_writel(CR4_READ_SHADOW, cr4);
806 vmcs_writel(GUEST_CR4, cr4 | (vcpu->rmode.active ?
807 KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON));
808 vcpu->cr4 = cr4;
809}
810
811#ifdef __x86_64__
812
813static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
814{
815 struct vmx_msr_entry *msr = find_msr_entry(vcpu, MSR_EFER);
816
817 vcpu->shadow_efer = efer;
818 if (efer & EFER_LMA) {
819 vmcs_write32(VM_ENTRY_CONTROLS,
820 vmcs_read32(VM_ENTRY_CONTROLS) |
821 VM_ENTRY_CONTROLS_IA32E_MASK);
822 msr->data = efer;
823
824 } else {
825 vmcs_write32(VM_ENTRY_CONTROLS,
826 vmcs_read32(VM_ENTRY_CONTROLS) &
827 ~VM_ENTRY_CONTROLS_IA32E_MASK);
828
829 msr->data = efer & ~EFER_LME;
830 }
831}
832
833#endif
834
835static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg)
836{
837 struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
838
839 return vmcs_readl(sf->base);
840}
841
842static void vmx_get_segment(struct kvm_vcpu *vcpu,
843 struct kvm_segment *var, int seg)
844{
845 struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
846 u32 ar;
847
848 var->base = vmcs_readl(sf->base);
849 var->limit = vmcs_read32(sf->limit);
850 var->selector = vmcs_read16(sf->selector);
851 ar = vmcs_read32(sf->ar_bytes);
852 if (ar & AR_UNUSABLE_MASK)
853 ar = 0;
854 var->type = ar & 15;
855 var->s = (ar >> 4) & 1;
856 var->dpl = (ar >> 5) & 3;
857 var->present = (ar >> 7) & 1;
858 var->avl = (ar >> 12) & 1;
859 var->l = (ar >> 13) & 1;
860 var->db = (ar >> 14) & 1;
861 var->g = (ar >> 15) & 1;
862 var->unusable = (ar >> 16) & 1;
863}
864
865static void vmx_set_segment(struct kvm_vcpu *vcpu,
866 struct kvm_segment *var, int seg)
867{
868 struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
869 u32 ar;
870
871 vmcs_writel(sf->base, var->base);
872 vmcs_write32(sf->limit, var->limit);
873 vmcs_write16(sf->selector, var->selector);
874 if (var->unusable)
875 ar = 1 << 16;
876 else {
877 ar = var->type & 15;
878 ar |= (var->s & 1) << 4;
879 ar |= (var->dpl & 3) << 5;
880 ar |= (var->present & 1) << 7;
881 ar |= (var->avl & 1) << 12;
882 ar |= (var->l & 1) << 13;
883 ar |= (var->db & 1) << 14;
884 ar |= (var->g & 1) << 15;
885 }
886 vmcs_write32(sf->ar_bytes, ar);
887}
888
889static int vmx_is_long_mode(struct kvm_vcpu *vcpu)
890{
891 return vmcs_read32(VM_ENTRY_CONTROLS) & VM_ENTRY_CONTROLS_IA32E_MASK;
892}
893
894static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
895{
896 u32 ar = vmcs_read32(GUEST_CS_AR_BYTES);
897
898 *db = (ar >> 14) & 1;
899 *l = (ar >> 13) & 1;
900}
901
902static void vmx_get_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
903{
904 dt->limit = vmcs_read32(GUEST_IDTR_LIMIT);
905 dt->base = vmcs_readl(GUEST_IDTR_BASE);
906}
907
908static void vmx_set_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
909{
910 vmcs_write32(GUEST_IDTR_LIMIT, dt->limit);
911 vmcs_writel(GUEST_IDTR_BASE, dt->base);
912}
913
914static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
915{
916 dt->limit = vmcs_read32(GUEST_GDTR_LIMIT);
917 dt->base = vmcs_readl(GUEST_GDTR_BASE);
918}
919
920static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
921{
922 vmcs_write32(GUEST_GDTR_LIMIT, dt->limit);
923 vmcs_writel(GUEST_GDTR_BASE, dt->base);
924}
925
926static int init_rmode_tss(struct kvm* kvm)
927{
928 struct page *p1, *p2, *p3;
929 gfn_t fn = rmode_tss_base(kvm) >> PAGE_SHIFT;
930 char *page;
931
932 p1 = _gfn_to_page(kvm, fn++);
933 p2 = _gfn_to_page(kvm, fn++);
934 p3 = _gfn_to_page(kvm, fn);
935
936 if (!p1 || !p2 || !p3) {
937 kvm_printf(kvm,"%s: gfn_to_page failed\n", __FUNCTION__);
938 return 0;
939 }
940
941 page = kmap_atomic(p1, KM_USER0);
942 memset(page, 0, PAGE_SIZE);
943 *(u16*)(page + 0x66) = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE;
944 kunmap_atomic(page, KM_USER0);
945
946 page = kmap_atomic(p2, KM_USER0);
947 memset(page, 0, PAGE_SIZE);
948 kunmap_atomic(page, KM_USER0);
949
950 page = kmap_atomic(p3, KM_USER0);
951 memset(page, 0, PAGE_SIZE);
952 *(page + RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1) = ~0;
953 kunmap_atomic(page, KM_USER0);
954
955 return 1;
956}
957
958static void vmcs_write32_fixedbits(u32 msr, u32 vmcs_field, u32 val)
959{
960 u32 msr_high, msr_low;
961
962 rdmsr(msr, msr_low, msr_high);
963
964 val &= msr_high;
965 val |= msr_low;
966 vmcs_write32(vmcs_field, val);
967}
968
969static void seg_setup(int seg)
970{
971 struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
972
973 vmcs_write16(sf->selector, 0);
974 vmcs_writel(sf->base, 0);
975 vmcs_write32(sf->limit, 0xffff);
976 vmcs_write32(sf->ar_bytes, 0x93);
977}
978
979/*
980 * Sets up the vmcs for emulated real mode.
981 */
982static int vmx_vcpu_setup(struct kvm_vcpu *vcpu)
983{
984 u32 host_sysenter_cs;
985 u32 junk;
986 unsigned long a;
987 struct descriptor_table dt;
988 int i;
989 int ret = 0;
990 int nr_good_msrs;
991 extern asmlinkage void kvm_vmx_return(void);
992
993 if (!init_rmode_tss(vcpu->kvm)) {
994 ret = -ENOMEM;
995 goto out;
996 }
997
998 memset(vcpu->regs, 0, sizeof(vcpu->regs));
999 vcpu->regs[VCPU_REGS_RDX] = get_rdx_init_val();
1000 vcpu->cr8 = 0;
1001 vcpu->apic_base = 0xfee00000 |
1002 /*for vcpu 0*/ MSR_IA32_APICBASE_BSP |
1003 MSR_IA32_APICBASE_ENABLE;
1004
1005 fx_init(vcpu);
1006
1007 /*
1008 * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode
1009 * insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4. Sigh.
1010 */
1011 vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
1012 vmcs_writel(GUEST_CS_BASE, 0x000f0000);
1013 vmcs_write32(GUEST_CS_LIMIT, 0xffff);
1014 vmcs_write32(GUEST_CS_AR_BYTES, 0x9b);
1015
1016 seg_setup(VCPU_SREG_DS);
1017 seg_setup(VCPU_SREG_ES);
1018 seg_setup(VCPU_SREG_FS);
1019 seg_setup(VCPU_SREG_GS);
1020 seg_setup(VCPU_SREG_SS);
1021
1022 vmcs_write16(GUEST_TR_SELECTOR, 0);
1023 vmcs_writel(GUEST_TR_BASE, 0);
1024 vmcs_write32(GUEST_TR_LIMIT, 0xffff);
1025 vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
1026
1027 vmcs_write16(GUEST_LDTR_SELECTOR, 0);
1028 vmcs_writel(GUEST_LDTR_BASE, 0);
1029 vmcs_write32(GUEST_LDTR_LIMIT, 0xffff);
1030 vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082);
1031
1032 vmcs_write32(GUEST_SYSENTER_CS, 0);
1033 vmcs_writel(GUEST_SYSENTER_ESP, 0);
1034 vmcs_writel(GUEST_SYSENTER_EIP, 0);
1035
1036 vmcs_writel(GUEST_RFLAGS, 0x02);
1037 vmcs_writel(GUEST_RIP, 0xfff0);
1038 vmcs_writel(GUEST_RSP, 0);
1039
1040 vmcs_writel(GUEST_CR3, 0);
1041
1042 //todo: dr0 = dr1 = dr2 = dr3 = 0; dr6 = 0xffff0ff0
1043 vmcs_writel(GUEST_DR7, 0x400);
1044
1045 vmcs_writel(GUEST_GDTR_BASE, 0);
1046 vmcs_write32(GUEST_GDTR_LIMIT, 0xffff);
1047
1048 vmcs_writel(GUEST_IDTR_BASE, 0);
1049 vmcs_write32(GUEST_IDTR_LIMIT, 0xffff);
1050
1051 vmcs_write32(GUEST_ACTIVITY_STATE, 0);
1052 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
1053 vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0);
1054
1055 /* I/O */
1056 vmcs_write64(IO_BITMAP_A, 0);
1057 vmcs_write64(IO_BITMAP_B, 0);
1058
1059 guest_write_tsc(0);
1060
1061 vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */
1062
1063 /* Special registers */
1064 vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
1065
1066 /* Control */
1067 vmcs_write32_fixedbits(MSR_IA32_VMX_PINBASED_CTLS_MSR,
1068 PIN_BASED_VM_EXEC_CONTROL,
1069 PIN_BASED_EXT_INTR_MASK /* 20.6.1 */
1070 | PIN_BASED_NMI_EXITING /* 20.6.1 */
1071 );
1072 vmcs_write32_fixedbits(MSR_IA32_VMX_PROCBASED_CTLS_MSR,
1073 CPU_BASED_VM_EXEC_CONTROL,
1074 CPU_BASED_HLT_EXITING /* 20.6.2 */
1075 | CPU_BASED_CR8_LOAD_EXITING /* 20.6.2 */
1076 | CPU_BASED_CR8_STORE_EXITING /* 20.6.2 */
1077 | CPU_BASED_UNCOND_IO_EXITING /* 20.6.2 */
1078 | CPU_BASED_INVDPG_EXITING
1079 | CPU_BASED_MOV_DR_EXITING
1080 | CPU_BASED_USE_TSC_OFFSETING /* 21.3 */
1081 );
1082
1083 vmcs_write32(EXCEPTION_BITMAP, 1 << PF_VECTOR);
1084 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
1085 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
1086 vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */
1087
1088 vmcs_writel(HOST_CR0, read_cr0()); /* 22.2.3 */
1089 vmcs_writel(HOST_CR4, read_cr4()); /* 22.2.3, 22.2.5 */
1090 vmcs_writel(HOST_CR3, read_cr3()); /* 22.2.3 FIXME: shadow tables */
1091
1092 vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */
1093 vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */
1094 vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */
1095 vmcs_write16(HOST_FS_SELECTOR, read_fs()); /* 22.2.4 */
1096 vmcs_write16(HOST_GS_SELECTOR, read_gs()); /* 22.2.4 */
1097 vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */
1098#ifdef __x86_64__
1099 rdmsrl(MSR_FS_BASE, a);
1100 vmcs_writel(HOST_FS_BASE, a); /* 22.2.4 */
1101 rdmsrl(MSR_GS_BASE, a);
1102 vmcs_writel(HOST_GS_BASE, a); /* 22.2.4 */
1103#else
1104 vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */
1105 vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */
1106#endif
1107
1108 vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */
1109
1110 get_idt(&dt);
1111 vmcs_writel(HOST_IDTR_BASE, dt.base); /* 22.2.4 */
1112
1113
1114 vmcs_writel(HOST_RIP, (unsigned long)kvm_vmx_return); /* 22.2.5 */
1115
1116 rdmsr(MSR_IA32_SYSENTER_CS, host_sysenter_cs, junk);
1117 vmcs_write32(HOST_IA32_SYSENTER_CS, host_sysenter_cs);
1118 rdmsrl(MSR_IA32_SYSENTER_ESP, a);
1119 vmcs_writel(HOST_IA32_SYSENTER_ESP, a); /* 22.2.3 */
1120 rdmsrl(MSR_IA32_SYSENTER_EIP, a);
1121 vmcs_writel(HOST_IA32_SYSENTER_EIP, a); /* 22.2.3 */
1122
1123 ret = -ENOMEM;
1124 vcpu->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL);
1125 if (!vcpu->guest_msrs)
1126 goto out;
1127 vcpu->host_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL);
1128 if (!vcpu->host_msrs)
1129 goto out_free_guest_msrs;
1130
1131 for (i = 0; i < NR_VMX_MSR; ++i) {
1132 u32 index = vmx_msr_index[i];
1133 u32 data_low, data_high;
1134 u64 data;
1135 int j = vcpu->nmsrs;
1136
1137 if (rdmsr_safe(index, &data_low, &data_high) < 0)
1138 continue;
1139 data = data_low | ((u64)data_high << 32);
1140 vcpu->host_msrs[j].index = index;
1141 vcpu->host_msrs[j].reserved = 0;
1142 vcpu->host_msrs[j].data = data;
1143 vcpu->guest_msrs[j] = vcpu->host_msrs[j];
1144 ++vcpu->nmsrs;
1145 }
1146 printk(KERN_DEBUG "kvm: msrs: %d\n", vcpu->nmsrs);
1147
1148 nr_good_msrs = vcpu->nmsrs - NR_BAD_MSRS;
1149 vmcs_writel(VM_ENTRY_MSR_LOAD_ADDR,
1150 virt_to_phys(vcpu->guest_msrs + NR_BAD_MSRS));
1151 vmcs_writel(VM_EXIT_MSR_STORE_ADDR,
1152 virt_to_phys(vcpu->guest_msrs + NR_BAD_MSRS));
1153 vmcs_writel(VM_EXIT_MSR_LOAD_ADDR,
1154 virt_to_phys(vcpu->host_msrs + NR_BAD_MSRS));
1155 vmcs_write32_fixedbits(MSR_IA32_VMX_EXIT_CTLS_MSR, VM_EXIT_CONTROLS,
1156 (HOST_IS_64 << 9)); /* 22.2.1, 20.7.1 */
1157 vmcs_write32(VM_EXIT_MSR_STORE_COUNT, nr_good_msrs); /* 22.2.2 */
1158 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, nr_good_msrs); /* 22.2.2 */
1159 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, nr_good_msrs); /* 22.2.2 */
1160
1161
1162 /* 22.2.1, 20.8.1 */
1163 vmcs_write32_fixedbits(MSR_IA32_VMX_ENTRY_CTLS_MSR,
1164 VM_ENTRY_CONTROLS, 0);
1165 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); /* 22.2.1 */
1166
1167 vmcs_writel(VIRTUAL_APIC_PAGE_ADDR, 0);
1168 vmcs_writel(TPR_THRESHOLD, 0);
1169
1170 vmcs_writel(CR0_GUEST_HOST_MASK, KVM_GUEST_CR0_MASK);
1171 vmcs_writel(CR4_GUEST_HOST_MASK, KVM_GUEST_CR4_MASK);
1172
1173 vcpu->cr0 = 0x60000010;
1174 vmx_set_cr0(vcpu, vcpu->cr0); // enter rmode
1175 vmx_set_cr4(vcpu, 0);
1176#ifdef __x86_64__
1177 vmx_set_efer(vcpu, 0);
1178#endif
1179
1180 return 0;
1181
1182out_free_guest_msrs:
1183 kfree(vcpu->guest_msrs);
1184out:
1185 return ret;
1186}
1187
1188static void inject_rmode_irq(struct kvm_vcpu *vcpu, int irq)
1189{
1190 u16 ent[2];
1191 u16 cs;
1192 u16 ip;
1193 unsigned long flags;
1194 unsigned long ss_base = vmcs_readl(GUEST_SS_BASE);
1195 u16 sp = vmcs_readl(GUEST_RSP);
1196 u32 ss_limit = vmcs_read32(GUEST_SS_LIMIT);
1197
1198 if (sp > ss_limit || sp - 6 > sp) {
1199 vcpu_printf(vcpu, "%s: #SS, rsp 0x%lx ss 0x%lx limit 0x%x\n",
1200 __FUNCTION__,
1201 vmcs_readl(GUEST_RSP),
1202 vmcs_readl(GUEST_SS_BASE),
1203 vmcs_read32(GUEST_SS_LIMIT));
1204 return;
1205 }
1206
1207 if (kvm_read_guest(vcpu, irq * sizeof(ent), sizeof(ent), &ent) !=
1208 sizeof(ent)) {
1209 vcpu_printf(vcpu, "%s: read guest err\n", __FUNCTION__);
1210 return;
1211 }
1212
1213 flags = vmcs_readl(GUEST_RFLAGS);
1214 cs = vmcs_readl(GUEST_CS_BASE) >> 4;
1215 ip = vmcs_readl(GUEST_RIP);
1216
1217
1218 if (kvm_write_guest(vcpu, ss_base + sp - 2, 2, &flags) != 2 ||
1219 kvm_write_guest(vcpu, ss_base + sp - 4, 2, &cs) != 2 ||
1220 kvm_write_guest(vcpu, ss_base + sp - 6, 2, &ip) != 2) {
1221 vcpu_printf(vcpu, "%s: write guest err\n", __FUNCTION__);
1222 return;
1223 }
1224
1225 vmcs_writel(GUEST_RFLAGS, flags &
1226 ~( X86_EFLAGS_IF | X86_EFLAGS_AC | X86_EFLAGS_TF));
1227 vmcs_write16(GUEST_CS_SELECTOR, ent[1]) ;
1228 vmcs_writel(GUEST_CS_BASE, ent[1] << 4);
1229 vmcs_writel(GUEST_RIP, ent[0]);
1230 vmcs_writel(GUEST_RSP, (vmcs_readl(GUEST_RSP) & ~0xffff) | (sp - 6));
1231}
1232
1233static void kvm_do_inject_irq(struct kvm_vcpu *vcpu)
1234{
1235 int word_index = __ffs(vcpu->irq_summary);
1236 int bit_index = __ffs(vcpu->irq_pending[word_index]);
1237 int irq = word_index * BITS_PER_LONG + bit_index;
1238
1239 clear_bit(bit_index, &vcpu->irq_pending[word_index]);
1240 if (!vcpu->irq_pending[word_index])
1241 clear_bit(word_index, &vcpu->irq_summary);
1242
1243 if (vcpu->rmode.active) {
1244 inject_rmode_irq(vcpu, irq);
1245 return;
1246 }
1247 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
1248 irq | INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK);
1249}
1250
1251static void kvm_try_inject_irq(struct kvm_vcpu *vcpu)
1252{
1253 if ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF)
1254 && (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0)
1255 /*
1256 * Interrupts enabled, and not blocked by sti or mov ss. Good.
1257 */
1258 kvm_do_inject_irq(vcpu);
1259 else
1260 /*
1261 * Interrupts blocked. Wait for unblock.
1262 */
1263 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
1264 vmcs_read32(CPU_BASED_VM_EXEC_CONTROL)
1265 | CPU_BASED_VIRTUAL_INTR_PENDING);
1266}
1267
1268static void kvm_guest_debug_pre(struct kvm_vcpu *vcpu)
1269{
1270 struct kvm_guest_debug *dbg = &vcpu->guest_debug;
1271
1272 set_debugreg(dbg->bp[0], 0);
1273 set_debugreg(dbg->bp[1], 1);
1274 set_debugreg(dbg->bp[2], 2);
1275 set_debugreg(dbg->bp[3], 3);
1276
1277 if (dbg->singlestep) {
1278 unsigned long flags;
1279
1280 flags = vmcs_readl(GUEST_RFLAGS);
1281 flags |= X86_EFLAGS_TF | X86_EFLAGS_RF;
1282 vmcs_writel(GUEST_RFLAGS, flags);
1283 }
1284}
1285
1286static int handle_rmode_exception(struct kvm_vcpu *vcpu,
1287 int vec, u32 err_code)
1288{
1289 if (!vcpu->rmode.active)
1290 return 0;
1291
1292 if (vec == GP_VECTOR && err_code == 0)
1293 if (emulate_instruction(vcpu, NULL, 0, 0) == EMULATE_DONE)
1294 return 1;
1295 return 0;
1296}
1297
1298static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1299{
1300 u32 intr_info, error_code;
1301 unsigned long cr2, rip;
1302 u32 vect_info;
1303 enum emulation_result er;
1304
1305 vect_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
1306 intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
1307
1308 if ((vect_info & VECTORING_INFO_VALID_MASK) &&
1309 !is_page_fault(intr_info)) {
1310 printk(KERN_ERR "%s: unexpected, vectoring info 0x%x "
1311 "intr info 0x%x\n", __FUNCTION__, vect_info, intr_info);
1312 }
1313
1314 if (is_external_interrupt(vect_info)) {
1315 int irq = vect_info & VECTORING_INFO_VECTOR_MASK;
1316 set_bit(irq, vcpu->irq_pending);
1317 set_bit(irq / BITS_PER_LONG, &vcpu->irq_summary);
1318 }
1319
1320 if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200) { /* nmi */
1321 asm ("int $2");
1322 return 1;
1323 }
1324 error_code = 0;
1325 rip = vmcs_readl(GUEST_RIP);
1326 if (intr_info & INTR_INFO_DELIEVER_CODE_MASK)
1327 error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
1328 if (is_page_fault(intr_info)) {
1329 cr2 = vmcs_readl(EXIT_QUALIFICATION);
1330
1331 spin_lock(&vcpu->kvm->lock);
1332 if (!vcpu->mmu.page_fault(vcpu, cr2, error_code)) {
1333 spin_unlock(&vcpu->kvm->lock);
1334 return 1;
1335 }
1336
1337 er = emulate_instruction(vcpu, kvm_run, cr2, error_code);
1338 spin_unlock(&vcpu->kvm->lock);
1339
1340 switch (er) {
1341 case EMULATE_DONE:
1342 return 1;
1343 case EMULATE_DO_MMIO:
1344 ++kvm_stat.mmio_exits;
1345 kvm_run->exit_reason = KVM_EXIT_MMIO;
1346 return 0;
1347 case EMULATE_FAIL:
1348 vcpu_printf(vcpu, "%s: emulate fail\n", __FUNCTION__);
1349 break;
1350 default:
1351 BUG();
1352 }
1353 }
1354
1355 if (vcpu->rmode.active &&
1356 handle_rmode_exception(vcpu, intr_info & INTR_INFO_VECTOR_MASK,
1357 error_code))
1358 return 1;
1359
1360 if ((intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK)) == (INTR_TYPE_EXCEPTION | 1)) {
1361 kvm_run->exit_reason = KVM_EXIT_DEBUG;
1362 return 0;
1363 }
1364 kvm_run->exit_reason = KVM_EXIT_EXCEPTION;
1365 kvm_run->ex.exception = intr_info & INTR_INFO_VECTOR_MASK;
1366 kvm_run->ex.error_code = error_code;
1367 return 0;
1368}
1369
1370static int handle_external_interrupt(struct kvm_vcpu *vcpu,
1371 struct kvm_run *kvm_run)
1372{
1373 ++kvm_stat.irq_exits;
1374 return 1;
1375}
1376
1377
1378static int get_io_count(struct kvm_vcpu *vcpu, u64 *count)
1379{
1380 u64 inst;
1381 gva_t rip;
1382 int countr_size;
1383 int i, n;
1384
1385 if ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_VM)) {
1386 countr_size = 2;
1387 } else {
1388 u32 cs_ar = vmcs_read32(GUEST_CS_AR_BYTES);
1389
1390 countr_size = (cs_ar & AR_L_MASK) ? 8:
1391 (cs_ar & AR_DB_MASK) ? 4: 2;
1392 }
1393
1394 rip = vmcs_readl(GUEST_RIP);
1395 if (countr_size != 8)
1396 rip += vmcs_readl(GUEST_CS_BASE);
1397
1398 n = kvm_read_guest(vcpu, rip, sizeof(inst), &inst);
1399
1400 for (i = 0; i < n; i++) {
1401 switch (((u8*)&inst)[i]) {
1402 case 0xf0:
1403 case 0xf2:
1404 case 0xf3:
1405 case 0x2e:
1406 case 0x36:
1407 case 0x3e:
1408 case 0x26:
1409 case 0x64:
1410 case 0x65:
1411 case 0x66:
1412 break;
1413 case 0x67:
1414 countr_size = (countr_size == 2) ? 4: (countr_size >> 1);
1415 default:
1416 goto done;
1417 }
1418 }
1419 return 0;
1420done:
1421 countr_size *= 8;
1422 *count = vcpu->regs[VCPU_REGS_RCX] & (~0ULL >> (64 - countr_size));
1423 return 1;
1424}
1425
1426static int handle_io(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1427{
1428 u64 exit_qualification;
1429
1430 ++kvm_stat.io_exits;
1431 exit_qualification = vmcs_read64(EXIT_QUALIFICATION);
1432 kvm_run->exit_reason = KVM_EXIT_IO;
1433 if (exit_qualification & 8)
1434 kvm_run->io.direction = KVM_EXIT_IO_IN;
1435 else
1436 kvm_run->io.direction = KVM_EXIT_IO_OUT;
1437 kvm_run->io.size = (exit_qualification & 7) + 1;
1438 kvm_run->io.string = (exit_qualification & 16) != 0;
1439 kvm_run->io.string_down
1440 = (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_DF) != 0;
1441 kvm_run->io.rep = (exit_qualification & 32) != 0;
1442 kvm_run->io.port = exit_qualification >> 16;
1443 if (kvm_run->io.string) {
1444 if (!get_io_count(vcpu, &kvm_run->io.count))
1445 return 1;
1446 kvm_run->io.address = vmcs_readl(GUEST_LINEAR_ADDRESS);
1447 } else
1448 kvm_run->io.value = vcpu->regs[VCPU_REGS_RAX]; /* rax */
1449 return 0;
1450}
1451
1452static int handle_invlpg(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1453{
1454 u64 address = vmcs_read64(EXIT_QUALIFICATION);
1455 int instruction_length = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
1456 spin_lock(&vcpu->kvm->lock);
1457 vcpu->mmu.inval_page(vcpu, address);
1458 spin_unlock(&vcpu->kvm->lock);
1459 vmcs_writel(GUEST_RIP, vmcs_readl(GUEST_RIP) + instruction_length);
1460 return 1;
1461}
1462
1463static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1464{
1465 u64 exit_qualification;
1466 int cr;
1467 int reg;
1468
1469 exit_qualification = vmcs_read64(EXIT_QUALIFICATION);
1470 cr = exit_qualification & 15;
1471 reg = (exit_qualification >> 8) & 15;
1472 switch ((exit_qualification >> 4) & 3) {
1473 case 0: /* mov to cr */
1474 switch (cr) {
1475 case 0:
1476 vcpu_load_rsp_rip(vcpu);
1477 set_cr0(vcpu, vcpu->regs[reg]);
1478 skip_emulated_instruction(vcpu);
1479 return 1;
1480 case 3:
1481 vcpu_load_rsp_rip(vcpu);
1482 set_cr3(vcpu, vcpu->regs[reg]);
1483 skip_emulated_instruction(vcpu);
1484 return 1;
1485 case 4:
1486 vcpu_load_rsp_rip(vcpu);
1487 set_cr4(vcpu, vcpu->regs[reg]);
1488 skip_emulated_instruction(vcpu);
1489 return 1;
1490 case 8:
1491 vcpu_load_rsp_rip(vcpu);
1492 set_cr8(vcpu, vcpu->regs[reg]);
1493 skip_emulated_instruction(vcpu);
1494 return 1;
1495 }
1496 break;
1497 case 1: /*mov from cr*/
1498 switch (cr) {
1499 case 3:
1500 vcpu_load_rsp_rip(vcpu);
1501 vcpu->regs[reg] = vcpu->cr3;
1502 vcpu_put_rsp_rip(vcpu);
1503 skip_emulated_instruction(vcpu);
1504 return 1;
1505 case 8:
1506 printk(KERN_DEBUG "handle_cr: read CR8 "
1507 "cpu erratum AA15\n");
1508 vcpu_load_rsp_rip(vcpu);
1509 vcpu->regs[reg] = vcpu->cr8;
1510 vcpu_put_rsp_rip(vcpu);
1511 skip_emulated_instruction(vcpu);
1512 return 1;
1513 }
1514 break;
1515 case 3: /* lmsw */
1516 lmsw(vcpu, (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f);
1517
1518 skip_emulated_instruction(vcpu);
1519 return 1;
1520 default:
1521 break;
1522 }
1523 kvm_run->exit_reason = 0;
1524 printk(KERN_ERR "kvm: unhandled control register: op %d cr %d\n",
1525 (int)(exit_qualification >> 4) & 3, cr);
1526 return 0;
1527}
1528
1529static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1530{
1531 u64 exit_qualification;
1532 unsigned long val;
1533 int dr, reg;
1534
1535 /*
1536 * FIXME: this code assumes the host is debugging the guest.
1537 * need to deal with guest debugging itself too.
1538 */
1539 exit_qualification = vmcs_read64(EXIT_QUALIFICATION);
1540 dr = exit_qualification & 7;
1541 reg = (exit_qualification >> 8) & 15;
1542 vcpu_load_rsp_rip(vcpu);
1543 if (exit_qualification & 16) {
1544 /* mov from dr */
1545 switch (dr) {
1546 case 6:
1547 val = 0xffff0ff0;
1548 break;
1549 case 7:
1550 val = 0x400;
1551 break;
1552 default:
1553 val = 0;
1554 }
1555 vcpu->regs[reg] = val;
1556 } else {
1557 /* mov to dr */
1558 }
1559 vcpu_put_rsp_rip(vcpu);
1560 skip_emulated_instruction(vcpu);
1561 return 1;
1562}
1563
1564static int handle_cpuid(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1565{
1566 kvm_run->exit_reason = KVM_EXIT_CPUID;
1567 return 0;
1568}
1569
1570static int handle_rdmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1571{
1572 u32 ecx = vcpu->regs[VCPU_REGS_RCX];
1573 u64 data;
1574
1575 if (vmx_get_msr(vcpu, ecx, &data)) {
1576 vmx_inject_gp(vcpu, 0);
1577 return 1;
1578 }
1579
1580 /* FIXME: handling of bits 32:63 of rax, rdx */
1581 vcpu->regs[VCPU_REGS_RAX] = data & -1u;
1582 vcpu->regs[VCPU_REGS_RDX] = (data >> 32) & -1u;
1583 skip_emulated_instruction(vcpu);
1584 return 1;
1585}
1586
1587static int handle_wrmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1588{
1589 u32 ecx = vcpu->regs[VCPU_REGS_RCX];
1590 u64 data = (vcpu->regs[VCPU_REGS_RAX] & -1u)
1591 | ((u64)(vcpu->regs[VCPU_REGS_RDX] & -1u) << 32);
1592
1593 if (vmx_set_msr(vcpu, ecx, data) != 0) {
1594 vmx_inject_gp(vcpu, 0);
1595 return 1;
1596 }
1597
1598 skip_emulated_instruction(vcpu);
1599 return 1;
1600}
1601
1602static int handle_interrupt_window(struct kvm_vcpu *vcpu,
1603 struct kvm_run *kvm_run)
1604{
1605 /* Turn off interrupt window reporting. */
1606 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
1607 vmcs_read32(CPU_BASED_VM_EXEC_CONTROL)
1608 & ~CPU_BASED_VIRTUAL_INTR_PENDING);
1609 return 1;
1610}
1611
1612static int handle_halt(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1613{
1614 skip_emulated_instruction(vcpu);
1615 if (vcpu->irq_summary && (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF))
1616 return 1;
1617
1618 kvm_run->exit_reason = KVM_EXIT_HLT;
1619 return 0;
1620}
1621
1622/*
1623 * The exit handlers return 1 if the exit was handled fully and guest execution
1624 * may resume. Otherwise they set the kvm_run parameter to indicate what needs
1625 * to be done to userspace and return 0.
1626 */
1627static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu,
1628 struct kvm_run *kvm_run) = {
1629 [EXIT_REASON_EXCEPTION_NMI] = handle_exception,
1630 [EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt,
1631 [EXIT_REASON_IO_INSTRUCTION] = handle_io,
1632 [EXIT_REASON_INVLPG] = handle_invlpg,
1633 [EXIT_REASON_CR_ACCESS] = handle_cr,
1634 [EXIT_REASON_DR_ACCESS] = handle_dr,
1635 [EXIT_REASON_CPUID] = handle_cpuid,
1636 [EXIT_REASON_MSR_READ] = handle_rdmsr,
1637 [EXIT_REASON_MSR_WRITE] = handle_wrmsr,
1638 [EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window,
1639 [EXIT_REASON_HLT] = handle_halt,
1640};
1641
1642static const int kvm_vmx_max_exit_handlers =
1643 sizeof(kvm_vmx_exit_handlers) / sizeof(*kvm_vmx_exit_handlers);
1644
1645/*
1646 * The guest has exited. See if we can fix it or if we need userspace
1647 * assistance.
1648 */
1649static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
1650{
1651 u32 vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
1652 u32 exit_reason = vmcs_read32(VM_EXIT_REASON);
1653
1654 if ( (vectoring_info & VECTORING_INFO_VALID_MASK) &&
1655 exit_reason != EXIT_REASON_EXCEPTION_NMI )
1656 printk(KERN_WARNING "%s: unexpected, valid vectoring info and "
1657 "exit reason is 0x%x\n", __FUNCTION__, exit_reason);
1658 kvm_run->instruction_length = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
1659 if (exit_reason < kvm_vmx_max_exit_handlers
1660 && kvm_vmx_exit_handlers[exit_reason])
1661 return kvm_vmx_exit_handlers[exit_reason](vcpu, kvm_run);
1662 else {
1663 kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
1664 kvm_run->hw.hardware_exit_reason = exit_reason;
1665 }
1666 return 0;
1667}
1668
1669static int vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1670{
1671 u8 fail;
1672 u16 fs_sel, gs_sel, ldt_sel;
1673 int fs_gs_ldt_reload_needed;
1674
1675again:
1676 /*
1677 * Set host fs and gs selectors. Unfortunately, 22.2.3 does not
1678 * allow segment selectors with cpl > 0 or ti == 1.
1679 */
1680 fs_sel = read_fs();
1681 gs_sel = read_gs();
1682 ldt_sel = read_ldt();
1683 fs_gs_ldt_reload_needed = (fs_sel & 7) | (gs_sel & 7) | ldt_sel;
1684 if (!fs_gs_ldt_reload_needed) {
1685 vmcs_write16(HOST_FS_SELECTOR, fs_sel);
1686 vmcs_write16(HOST_GS_SELECTOR, gs_sel);
1687 } else {
1688 vmcs_write16(HOST_FS_SELECTOR, 0);
1689 vmcs_write16(HOST_GS_SELECTOR, 0);
1690 }
1691
1692#ifdef __x86_64__
1693 vmcs_writel(HOST_FS_BASE, read_msr(MSR_FS_BASE));
1694 vmcs_writel(HOST_GS_BASE, read_msr(MSR_GS_BASE));
1695#else
1696 vmcs_writel(HOST_FS_BASE, segment_base(fs_sel));
1697 vmcs_writel(HOST_GS_BASE, segment_base(gs_sel));
1698#endif
1699
1700 if (vcpu->irq_summary &&
1701 !(vmcs_read32(VM_ENTRY_INTR_INFO_FIELD) & INTR_INFO_VALID_MASK))
1702 kvm_try_inject_irq(vcpu);
1703
1704 if (vcpu->guest_debug.enabled)
1705 kvm_guest_debug_pre(vcpu);
1706
1707 fx_save(vcpu->host_fx_image);
1708 fx_restore(vcpu->guest_fx_image);
1709
1710 save_msrs(vcpu->host_msrs, vcpu->nmsrs);
1711 load_msrs(vcpu->guest_msrs, NR_BAD_MSRS);
1712
1713 asm (
1714 /* Store host registers */
1715 "pushf \n\t"
1716#ifdef __x86_64__
1717 "push %%rax; push %%rbx; push %%rdx;"
1718 "push %%rsi; push %%rdi; push %%rbp;"
1719 "push %%r8; push %%r9; push %%r10; push %%r11;"
1720 "push %%r12; push %%r13; push %%r14; push %%r15;"
1721 "push %%rcx \n\t"
1722 ASM_VMX_VMWRITE_RSP_RDX "\n\t"
1723#else
1724 "pusha; push %%ecx \n\t"
1725 ASM_VMX_VMWRITE_RSP_RDX "\n\t"
1726#endif
1727 /* Check if vmlaunch or vmresume is needed */
1728 "cmp $0, %1 \n\t"
1729 /* Load guest registers. Don't clobber flags. */
1730#ifdef __x86_64__
1731 "mov %c[cr2](%3), %%rax \n\t"
1732 "mov %%rax, %%cr2 \n\t"
1733 "mov %c[rax](%3), %%rax \n\t"
1734 "mov %c[rbx](%3), %%rbx \n\t"
1735 "mov %c[rdx](%3), %%rdx \n\t"
1736 "mov %c[rsi](%3), %%rsi \n\t"
1737 "mov %c[rdi](%3), %%rdi \n\t"
1738 "mov %c[rbp](%3), %%rbp \n\t"
1739 "mov %c[r8](%3), %%r8 \n\t"
1740 "mov %c[r9](%3), %%r9 \n\t"
1741 "mov %c[r10](%3), %%r10 \n\t"
1742 "mov %c[r11](%3), %%r11 \n\t"
1743 "mov %c[r12](%3), %%r12 \n\t"
1744 "mov %c[r13](%3), %%r13 \n\t"
1745 "mov %c[r14](%3), %%r14 \n\t"
1746 "mov %c[r15](%3), %%r15 \n\t"
1747 "mov %c[rcx](%3), %%rcx \n\t" /* kills %3 (rcx) */
1748#else
1749 "mov %c[cr2](%3), %%eax \n\t"
1750 "mov %%eax, %%cr2 \n\t"
1751 "mov %c[rax](%3), %%eax \n\t"
1752 "mov %c[rbx](%3), %%ebx \n\t"
1753 "mov %c[rdx](%3), %%edx \n\t"
1754 "mov %c[rsi](%3), %%esi \n\t"
1755 "mov %c[rdi](%3), %%edi \n\t"
1756 "mov %c[rbp](%3), %%ebp \n\t"
1757 "mov %c[rcx](%3), %%ecx \n\t" /* kills %3 (ecx) */
1758#endif
1759 /* Enter guest mode */
1760 "jne launched \n\t"
1761 ASM_VMX_VMLAUNCH "\n\t"
1762 "jmp kvm_vmx_return \n\t"
1763 "launched: " ASM_VMX_VMRESUME "\n\t"
1764 ".globl kvm_vmx_return \n\t"
1765 "kvm_vmx_return: "
1766 /* Save guest registers, load host registers, keep flags */
1767#ifdef __x86_64__
1768 "xchg %3, 0(%%rsp) \n\t"
1769 "mov %%rax, %c[rax](%3) \n\t"
1770 "mov %%rbx, %c[rbx](%3) \n\t"
1771 "pushq 0(%%rsp); popq %c[rcx](%3) \n\t"
1772 "mov %%rdx, %c[rdx](%3) \n\t"
1773 "mov %%rsi, %c[rsi](%3) \n\t"
1774 "mov %%rdi, %c[rdi](%3) \n\t"
1775 "mov %%rbp, %c[rbp](%3) \n\t"
1776 "mov %%r8, %c[r8](%3) \n\t"
1777 "mov %%r9, %c[r9](%3) \n\t"
1778 "mov %%r10, %c[r10](%3) \n\t"
1779 "mov %%r11, %c[r11](%3) \n\t"
1780 "mov %%r12, %c[r12](%3) \n\t"
1781 "mov %%r13, %c[r13](%3) \n\t"
1782 "mov %%r14, %c[r14](%3) \n\t"
1783 "mov %%r15, %c[r15](%3) \n\t"
1784 "mov %%cr2, %%rax \n\t"
1785 "mov %%rax, %c[cr2](%3) \n\t"
1786 "mov 0(%%rsp), %3 \n\t"
1787
1788 "pop %%rcx; pop %%r15; pop %%r14; pop %%r13; pop %%r12;"
1789 "pop %%r11; pop %%r10; pop %%r9; pop %%r8;"
1790 "pop %%rbp; pop %%rdi; pop %%rsi;"
1791 "pop %%rdx; pop %%rbx; pop %%rax \n\t"
1792#else
1793 "xchg %3, 0(%%esp) \n\t"
1794 "mov %%eax, %c[rax](%3) \n\t"
1795 "mov %%ebx, %c[rbx](%3) \n\t"
1796 "pushl 0(%%esp); popl %c[rcx](%3) \n\t"
1797 "mov %%edx, %c[rdx](%3) \n\t"
1798 "mov %%esi, %c[rsi](%3) \n\t"
1799 "mov %%edi, %c[rdi](%3) \n\t"
1800 "mov %%ebp, %c[rbp](%3) \n\t"
1801 "mov %%cr2, %%eax \n\t"
1802 "mov %%eax, %c[cr2](%3) \n\t"
1803 "mov 0(%%esp), %3 \n\t"
1804
1805 "pop %%ecx; popa \n\t"
1806#endif
1807 "setbe %0 \n\t"
1808 "popf \n\t"
1809 : "=g" (fail)
1810 : "r"(vcpu->launched), "d"((unsigned long)HOST_RSP),
1811 "c"(vcpu),
1812 [rax]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RAX])),
1813 [rbx]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RBX])),
1814 [rcx]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RCX])),
1815 [rdx]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RDX])),
1816 [rsi]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RSI])),
1817 [rdi]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RDI])),
1818 [rbp]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RBP])),
1819#ifdef __x86_64__
1820 [r8 ]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R8 ])),
1821 [r9 ]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R9 ])),
1822 [r10]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R10])),
1823 [r11]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R11])),
1824 [r12]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R12])),
1825 [r13]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R13])),
1826 [r14]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R14])),
1827 [r15]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R15])),
1828#endif
1829 [cr2]"i"(offsetof(struct kvm_vcpu, cr2))
1830 : "cc", "memory" );
1831
1832 ++kvm_stat.exits;
1833
1834 save_msrs(vcpu->guest_msrs, NR_BAD_MSRS);
1835 load_msrs(vcpu->host_msrs, NR_BAD_MSRS);
1836
1837 fx_save(vcpu->guest_fx_image);
1838 fx_restore(vcpu->host_fx_image);
1839
1840#ifndef __x86_64__
1841 asm ("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS));
1842#endif
1843
1844 kvm_run->exit_type = 0;
1845 if (fail) {
1846 kvm_run->exit_type = KVM_EXIT_TYPE_FAIL_ENTRY;
1847 kvm_run->exit_reason = vmcs_read32(VM_INSTRUCTION_ERROR);
1848 } else {
1849 if (fs_gs_ldt_reload_needed) {
1850 load_ldt(ldt_sel);
1851 load_fs(fs_sel);
1852 /*
1853 * If we have to reload gs, we must take care to
1854 * preserve our gs base.
1855 */
1856 local_irq_disable();
1857 load_gs(gs_sel);
1858#ifdef __x86_64__
1859 wrmsrl(MSR_GS_BASE, vmcs_readl(HOST_GS_BASE));
1860#endif
1861 local_irq_enable();
1862
1863 reload_tss();
1864 }
1865 vcpu->launched = 1;
1866 kvm_run->exit_type = KVM_EXIT_TYPE_VM_EXIT;
1867 if (kvm_handle_exit(kvm_run, vcpu)) {
1868 /* Give scheduler a chance to reschedule. */
1869 if (signal_pending(current)) {
1870 ++kvm_stat.signal_exits;
1871 return -EINTR;
1872 }
1873 kvm_resched(vcpu);
1874 goto again;
1875 }
1876 }
1877 return 0;
1878}
1879
1880static void vmx_flush_tlb(struct kvm_vcpu *vcpu)
1881{
1882 vmcs_writel(GUEST_CR3, vmcs_readl(GUEST_CR3));
1883}
1884
1885static void vmx_inject_page_fault(struct kvm_vcpu *vcpu,
1886 unsigned long addr,
1887 u32 err_code)
1888{
1889 u32 vect_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
1890
1891 ++kvm_stat.pf_guest;
1892
1893 if (is_page_fault(vect_info)) {
1894 printk(KERN_DEBUG "inject_page_fault: "
1895 "double fault 0x%lx @ 0x%lx\n",
1896 addr, vmcs_readl(GUEST_RIP));
1897 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, 0);
1898 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
1899 DF_VECTOR |
1900 INTR_TYPE_EXCEPTION |
1901 INTR_INFO_DELIEVER_CODE_MASK |
1902 INTR_INFO_VALID_MASK);
1903 return;
1904 }
1905 vcpu->cr2 = addr;
1906 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, err_code);
1907 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
1908 PF_VECTOR |
1909 INTR_TYPE_EXCEPTION |
1910 INTR_INFO_DELIEVER_CODE_MASK |
1911 INTR_INFO_VALID_MASK);
1912
1913}
1914
1915static void vmx_free_vmcs(struct kvm_vcpu *vcpu)
1916{
1917 if (vcpu->vmcs) {
1918 on_each_cpu(__vcpu_clear, vcpu, 0, 1);
1919 free_vmcs(vcpu->vmcs);
1920 vcpu->vmcs = NULL;
1921 }
1922}
1923
1924static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
1925{
1926 vmx_free_vmcs(vcpu);
1927}
1928
1929static int vmx_create_vcpu(struct kvm_vcpu *vcpu)
1930{
1931 struct vmcs *vmcs;
1932
1933 vmcs = alloc_vmcs();
1934 if (!vmcs)
1935 return -ENOMEM;
1936 vmcs_clear(vmcs);
1937 vcpu->vmcs = vmcs;
1938 vcpu->launched = 0;
1939 return 0;
1940}
1941
1942static struct kvm_arch_ops vmx_arch_ops = {
1943 .cpu_has_kvm_support = cpu_has_kvm_support,
1944 .disabled_by_bios = vmx_disabled_by_bios,
1945 .hardware_setup = hardware_setup,
1946 .hardware_unsetup = hardware_unsetup,
1947 .hardware_enable = hardware_enable,
1948 .hardware_disable = hardware_disable,
1949
1950 .vcpu_create = vmx_create_vcpu,
1951 .vcpu_free = vmx_free_vcpu,
1952
1953 .vcpu_load = vmx_vcpu_load,
1954 .vcpu_put = vmx_vcpu_put,
1955
1956 .set_guest_debug = set_guest_debug,
1957 .get_msr = vmx_get_msr,
1958 .set_msr = vmx_set_msr,
1959 .get_segment_base = vmx_get_segment_base,
1960 .get_segment = vmx_get_segment,
1961 .set_segment = vmx_set_segment,
1962 .is_long_mode = vmx_is_long_mode,
1963 .get_cs_db_l_bits = vmx_get_cs_db_l_bits,
1964 .set_cr0 = vmx_set_cr0,
1965 .set_cr0_no_modeswitch = vmx_set_cr0_no_modeswitch,
1966 .set_cr3 = vmx_set_cr3,
1967 .set_cr4 = vmx_set_cr4,
1968#ifdef __x86_64__
1969 .set_efer = vmx_set_efer,
1970#endif
1971 .get_idt = vmx_get_idt,
1972 .set_idt = vmx_set_idt,
1973 .get_gdt = vmx_get_gdt,
1974 .set_gdt = vmx_set_gdt,
1975 .cache_regs = vcpu_load_rsp_rip,
1976 .decache_regs = vcpu_put_rsp_rip,
1977 .get_rflags = vmx_get_rflags,
1978 .set_rflags = vmx_set_rflags,
1979
1980 .tlb_flush = vmx_flush_tlb,
1981 .inject_page_fault = vmx_inject_page_fault,
1982
1983 .inject_gp = vmx_inject_gp,
1984
1985 .run = vmx_vcpu_run,
1986 .skip_emulated_instruction = skip_emulated_instruction,
1987 .vcpu_setup = vmx_vcpu_setup,
1988};
1989
1990static int __init vmx_init(void)
1991{
1992 kvm_init_arch(&vmx_arch_ops, THIS_MODULE);
1993 return 0;
1994}
1995
1996static void __exit vmx_exit(void)
1997{
1998 kvm_exit_arch();
1999}
2000
2001module_init(vmx_init)
2002module_exit(vmx_exit)