author     Avi Kivity <avi@qumranet.com>              2006-12-10 05:21:36 -0500
committer  Linus Torvalds <torvalds@woody.osdl.org>   2006-12-10 12:57:22 -0500
commit     6aa8b732ca01c3d7a54e93f4d701b8aabbe60fb7 (patch)
tree       23fcbe6f4918cacdae26d513a2bd13e91d8b4c38
parent     f5f1a24a2caa299bb7d294aee92d7dd3410d9ed7 (diff)
[PATCH] kvm: userspace interface
web site: http://kvm.sourceforge.net
mailing list: kvm-devel@lists.sourceforge.net
  (http://lists.sourceforge.net/lists/listinfo/kvm-devel)

The following patchset adds a driver for Intel's hardware virtualization
extensions to the x86 architecture. The driver adds a character device
(/dev/kvm) that exposes the virtualization capabilities to userspace. Using
this driver, a process can run a virtual machine (a "guest") in a fully
virtualized PC containing its own virtual hard disks, network adapters, and
display.

Using this driver, one can start multiple virtual machines on a host. Each
virtual machine is a process on the host; a virtual cpu is a thread in that
process. kill(1), nice(1), top(1) work as expected.

In effect, the driver adds a third execution mode to the existing two: we now
have kernel mode, user mode, and guest mode. Guest mode has its own address
space mapping guest physical memory (which is accessible to user mode by
mmap()ing /dev/kvm). Guest mode has no access to any I/O devices; any such
access is intercepted and directed to user mode for emulation.

The driver supports i386 and x86_64 hosts and guests. All combinations are
allowed except x86_64 guest on i386 host. For i386 guests and hosts, both pae
and non-pae paging modes are supported. SMP hosts and UP guests are supported.
At the moment only Intel hardware is supported, but AMD virtualization support
is being worked on.

Performance currently is non-stellar due to the naive implementation of the
mmu virtualization, which throws away most of the shadow page table entries
every context switch. We plan to address this in two ways:

- cache shadow page tables across tlb flushes
- wait until AMD and Intel release processors with nested page tables

Currently a virtual desktop is responsive but consumes a lot of CPU. Under
Windows I tried playing pinball and watching a few flash movies; with a recent
CPU one can hardly feel the virtualization. Linux/X is slower, probably due to
X being in a separate process.

In addition to the driver, you need a slightly modified qemu to provide I/O
device emulation and the BIOS.

Caveats (akpm: might no longer be true):

- The Windows install currently bluescreens due to a problem with the virtual
  APIC. We are working on a fix. A temporary workaround is to use an existing
  image or install through qemu
- Windows 64-bit does not work. That's also true for qemu, so it's probably a
  problem with the device model.

[bero@arklinux.org: build fix]
[simon.kagstrom@bth.se: build fix, other fixes]
[uril@qumranet.com: KVM: Expose interrupt bitmap]
[akpm@osdl.org: i386 build fix]
[mingo@elte.hu: i386 fixes]
[rdreier@cisco.com: add log levels to all printks]
[randy.dunlap@oracle.com: Fix sparse NULL and C99 struct init warnings]
[anthony@codemonkey.ws: KVM: AMD SVM: 32-bit host support]
Signed-off-by: Yaniv Kamay <yaniv@qumranet.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
Cc: Simon Kagstrom <simon.kagstrom@bth.se>
Cc: Bernhard Rosenkraenzer <bero@arklinux.org>
Signed-off-by: Uri Lublin <uril@qumranet.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Roland Dreier <rolandd@cisco.com>
Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Signed-off-by: Anthony Liguori <anthony@codemonkey.ws>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
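As an orientation aid (not part of the patch itself), the flow a userspace launcher is expected to follow is sketched below in C: open /dev/kvm (one file descriptor per virtual machine), register guest physical memory, create a vcpu, then call the run ioctl in a loop and emulate whatever I/O exit comes back, with guest RAM visible to the launcher by mmap()ing the fd. The ioctl names (KVM_SET_MEMORY_REGION, KVM_CREATE_VCPU, KVM_RUN) and struct fields are assumptions about the new include/linux/kvm.h, which is not reproduced in this excerpt, so treat this as an illustration of the intended usage rather than the exact ABI.

/*
 * Hypothetical launcher sketch; every KVM_* constant and struct layout
 * below is assumed from include/linux/kvm.h rather than copied from it.
 */
#include <fcntl.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

int run_guest(void)
{
	int fd = open("/dev/kvm", O_RDWR);	/* one fd == one virtual machine */
	struct kvm_memory_region mem = {
		.slot = 0,
		.guest_phys_addr = 0,
		.memory_size = 16 << 20,	/* 16 MB of guest RAM */
	};
	struct kvm_run run = { .vcpu = 0 };

	if (fd < 0)
		return -1;
	if (ioctl(fd, KVM_SET_MEMORY_REGION, &mem) < 0 ||	/* assumed ioctl name */
	    ioctl(fd, KVM_CREATE_VCPU, 0) < 0)			/* assumed ioctl name */
		return -1;
	/* Guest RAM is now mmap()able from fd; load a BIOS/kernel image,
	 * set the initial register state, then enter the run loop. */
	while (ioctl(fd, KVM_RUN, &run) == 0) {			/* assumed ioctl name */
		/* On return, run describes the exit (for example run.mmio),
		 * so the launcher (a slightly modified qemu) emulates the
		 * access and re-enters guest mode on the next iteration. */
	}
	return 0;
}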
-rw-r--r--  drivers/Kconfig                   |    2
-rw-r--r--  drivers/Makefile                  |    1
-rw-r--r--  drivers/kvm/Kconfig               |   33
-rw-r--r--  drivers/kvm/Makefile              |   10
-rw-r--r--  drivers/kvm/kvm.h                 |  551
-rw-r--r--  drivers/kvm/kvm_main.c            | 1935
-rw-r--r--  drivers/kvm/kvm_svm.h             |   44
-rw-r--r--  drivers/kvm/kvm_vmx.h             |   14
-rw-r--r--  drivers/kvm/mmu.c                 |  699
-rw-r--r--  drivers/kvm/paging_tmpl.h         |  397
-rw-r--r--  drivers/kvm/segment_descriptor.h  |   17
-rw-r--r--  drivers/kvm/svm.c                 | 1677
-rw-r--r--  drivers/kvm/svm.h                 |  315
-rw-r--r--  drivers/kvm/vmx.c                 | 2002
-rw-r--r--  drivers/kvm/vmx.h                 |  296
-rw-r--r--  drivers/kvm/x86_emulate.c         | 1409
-rw-r--r--  drivers/kvm/x86_emulate.h         |  185
-rw-r--r--  include/linux/kvm.h               |  227
18 files changed, 9814 insertions, 0 deletions
diff --git a/drivers/Kconfig b/drivers/Kconfig
index 4929e923b5c6..e7da9fa724ec 100644
--- a/drivers/Kconfig
+++ b/drivers/Kconfig
@@ -80,4 +80,6 @@ source "drivers/rtc/Kconfig"
 
 source "drivers/dma/Kconfig"
 
+source "drivers/kvm/Kconfig"
+
 endmenu
diff --git a/drivers/Makefile b/drivers/Makefile
index 50f76da598c9..0dd96d1afd39 100644
--- a/drivers/Makefile
+++ b/drivers/Makefile
@@ -43,6 +43,7 @@ obj-$(CONFIG_SPI) += spi/
 obj-$(CONFIG_PCCARD)		+= pcmcia/
 obj-$(CONFIG_DIO)		+= dio/
 obj-$(CONFIG_SBUS)		+= sbus/
+obj-$(CONFIG_KVM)		+= kvm/
 obj-$(CONFIG_ZORRO)		+= zorro/
 obj-$(CONFIG_MAC)		+= macintosh/
 obj-$(CONFIG_ATA_OVER_ETH)	+= block/aoe/
diff --git a/drivers/kvm/Kconfig b/drivers/kvm/Kconfig
new file mode 100644
index 000000000000..36412e90f09b
--- /dev/null
+++ b/drivers/kvm/Kconfig
@@ -0,0 +1,33 @@
1#
2# KVM configuration
3#
4config KVM
5 tristate "Kernel-based Virtual Machine (KVM) support"
6 depends on X86 && EXPERIMENTAL
7 ---help---
8 Support hosting fully virtualized guest machines using hardware
9 virtualization extensions. You will need a fairly recent
10 processor equipped with virtualization extensions. You will also
11 need to select one or more of the processor modules below.
12
13 This module provides access to the hardware capabilities through
14 a character device node named /dev/kvm.
15
16 To compile this as a module, choose M here: the module
17 will be called kvm.
18
19 If unsure, say N.
20
21config KVM_INTEL
22 tristate "KVM for Intel processors support"
23 depends on KVM
24 ---help---
25 Provides support for KVM on Intel processors equipped with the VT
26 extensions.
27
28config KVM_AMD
29 tristate "KVM for AMD processors support"
30 depends on KVM
31 ---help---
32 Provides support for KVM on AMD processors equipped with the AMD-V
33 (SVM) extensions.
diff --git a/drivers/kvm/Makefile b/drivers/kvm/Makefile
new file mode 100644
index 000000000000..c0a789fa9d65
--- /dev/null
+++ b/drivers/kvm/Makefile
@@ -0,0 +1,10 @@
1#
2# Makefile for Kernel-based Virtual Machine module
3#
4
5kvm-objs := kvm_main.o mmu.o x86_emulate.o
6obj-$(CONFIG_KVM) += kvm.o
7kvm-intel-objs = vmx.o
8obj-$(CONFIG_KVM_INTEL) += kvm-intel.o
9kvm-amd-objs = svm.o
10obj-$(CONFIG_KVM_AMD) += kvm-amd.o
diff --git a/drivers/kvm/kvm.h b/drivers/kvm/kvm.h
new file mode 100644
index 000000000000..5785d0870ab6
--- /dev/null
+++ b/drivers/kvm/kvm.h
@@ -0,0 +1,551 @@
1#ifndef __KVM_H
2#define __KVM_H
3
4/*
5 * This work is licensed under the terms of the GNU GPL, version 2. See
6 * the COPYING file in the top-level directory.
7 */
8
9#include <linux/types.h>
10#include <linux/list.h>
11#include <linux/mutex.h>
12#include <linux/spinlock.h>
13#include <linux/mm.h>
14
15#include "vmx.h"
16#include <linux/kvm.h>
17
18#define CR0_PE_MASK (1ULL << 0)
19#define CR0_TS_MASK (1ULL << 3)
20#define CR0_NE_MASK (1ULL << 5)
21#define CR0_WP_MASK (1ULL << 16)
22#define CR0_NW_MASK (1ULL << 29)
23#define CR0_CD_MASK (1ULL << 30)
24#define CR0_PG_MASK (1ULL << 31)
25
26#define CR3_WPT_MASK (1ULL << 3)
27#define CR3_PCD_MASK (1ULL << 4)
28
29#define CR3_RESEVED_BITS 0x07ULL
30#define CR3_L_MODE_RESEVED_BITS (~((1ULL << 40) - 1) | 0x0fe7ULL)
31#define CR3_FLAGS_MASK ((1ULL << 5) - 1)
32
33#define CR4_VME_MASK (1ULL << 0)
34#define CR4_PSE_MASK (1ULL << 4)
35#define CR4_PAE_MASK (1ULL << 5)
36#define CR4_PGE_MASK (1ULL << 7)
37#define CR4_VMXE_MASK (1ULL << 13)
38
39#define KVM_GUEST_CR0_MASK \
40 (CR0_PG_MASK | CR0_PE_MASK | CR0_WP_MASK | CR0_NE_MASK \
41 | CR0_NW_MASK | CR0_CD_MASK)
42#define KVM_VM_CR0_ALWAYS_ON \
43 (CR0_PG_MASK | CR0_PE_MASK | CR0_WP_MASK | CR0_NE_MASK)
44#define KVM_GUEST_CR4_MASK \
45 (CR4_PSE_MASK | CR4_PAE_MASK | CR4_PGE_MASK | CR4_VMXE_MASK | CR4_VME_MASK)
46#define KVM_PMODE_VM_CR4_ALWAYS_ON (CR4_VMXE_MASK | CR4_PAE_MASK)
47#define KVM_RMODE_VM_CR4_ALWAYS_ON (CR4_VMXE_MASK | CR4_PAE_MASK | CR4_VME_MASK)
48
49#define INVALID_PAGE (~(hpa_t)0)
50#define UNMAPPED_GVA (~(gpa_t)0)
51
52#define KVM_MAX_VCPUS 1
53#define KVM_MEMORY_SLOTS 4
54#define KVM_NUM_MMU_PAGES 256
55
56#define FX_IMAGE_SIZE 512
57#define FX_IMAGE_ALIGN 16
58#define FX_BUF_SIZE (2 * FX_IMAGE_SIZE + FX_IMAGE_ALIGN)
59
60#define DE_VECTOR 0
61#define DF_VECTOR 8
62#define TS_VECTOR 10
63#define NP_VECTOR 11
64#define SS_VECTOR 12
65#define GP_VECTOR 13
66#define PF_VECTOR 14
67
68#define SELECTOR_TI_MASK (1 << 2)
69#define SELECTOR_RPL_MASK 0x03
70
71#define IOPL_SHIFT 12
72
73/*
74 * Address types:
75 *
76 * gva - guest virtual address
77 * gpa - guest physical address
78 * gfn - guest frame number
79 * hva - host virtual address
80 * hpa - host physical address
81 * hfn - host frame number
82 */
83
84typedef unsigned long gva_t;
85typedef u64 gpa_t;
86typedef unsigned long gfn_t;
87
88typedef unsigned long hva_t;
89typedef u64 hpa_t;
90typedef unsigned long hfn_t;
91
92struct kvm_mmu_page {
93 struct list_head link;
94 hpa_t page_hpa;
95 unsigned long slot_bitmap; /* One bit set per slot which has memory
96 * in this shadow page.
97 */
98 int global; /* Set if all ptes in this page are global */
99 u64 *parent_pte;
100};
101
102struct vmcs {
103 u32 revision_id;
104 u32 abort;
105 char data[0];
106};
107
108#define vmx_msr_entry kvm_msr_entry
109
110struct kvm_vcpu;
111
112/*
113 * x86 supports 3 paging modes (4-level 64-bit, 3-level 64-bit, and 2-level
114 * 32-bit). The kvm_mmu structure abstracts the details of the current mmu
115 * mode.
116 */
117struct kvm_mmu {
118 void (*new_cr3)(struct kvm_vcpu *vcpu);
119 int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err);
120 void (*inval_page)(struct kvm_vcpu *vcpu, gva_t gva);
121 void (*free)(struct kvm_vcpu *vcpu);
122 gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva);
123 hpa_t root_hpa;
124 int root_level;
125 int shadow_root_level;
126};
127
128struct kvm_guest_debug {
129 int enabled;
130 unsigned long bp[4];
131 int singlestep;
132};
133
134enum {
135 VCPU_REGS_RAX = 0,
136 VCPU_REGS_RCX = 1,
137 VCPU_REGS_RDX = 2,
138 VCPU_REGS_RBX = 3,
139 VCPU_REGS_RSP = 4,
140 VCPU_REGS_RBP = 5,
141 VCPU_REGS_RSI = 6,
142 VCPU_REGS_RDI = 7,
143#ifdef __x86_64__
144 VCPU_REGS_R8 = 8,
145 VCPU_REGS_R9 = 9,
146 VCPU_REGS_R10 = 10,
147 VCPU_REGS_R11 = 11,
148 VCPU_REGS_R12 = 12,
149 VCPU_REGS_R13 = 13,
150 VCPU_REGS_R14 = 14,
151 VCPU_REGS_R15 = 15,
152#endif
153 NR_VCPU_REGS
154};
155
156enum {
157 VCPU_SREG_CS,
158 VCPU_SREG_DS,
159 VCPU_SREG_ES,
160 VCPU_SREG_FS,
161 VCPU_SREG_GS,
162 VCPU_SREG_SS,
163 VCPU_SREG_TR,
164 VCPU_SREG_LDTR,
165};
166
167struct kvm_vcpu {
168 struct kvm *kvm;
169 union {
170 struct vmcs *vmcs;
171 struct vcpu_svm *svm;
172 };
173 struct mutex mutex;
174 int cpu;
175 int launched;
176 unsigned long irq_summary; /* bit vector: 1 per word in irq_pending */
177#define NR_IRQ_WORDS KVM_IRQ_BITMAP_SIZE(unsigned long)
178 unsigned long irq_pending[NR_IRQ_WORDS];
179 unsigned long regs[NR_VCPU_REGS]; /* for rsp: vcpu_load_rsp_rip() */
180 unsigned long rip; /* needs vcpu_load_rsp_rip() */
181
182 unsigned long cr0;
183 unsigned long cr2;
184 unsigned long cr3;
185 unsigned long cr4;
186 unsigned long cr8;
187 u64 shadow_efer;
188 u64 apic_base;
189 int nmsrs;
190 struct vmx_msr_entry *guest_msrs;
191 struct vmx_msr_entry *host_msrs;
192
193 struct list_head free_pages;
194 struct kvm_mmu_page page_header_buf[KVM_NUM_MMU_PAGES];
195 struct kvm_mmu mmu;
196
197 struct kvm_guest_debug guest_debug;
198
199 char fx_buf[FX_BUF_SIZE];
200 char *host_fx_image;
201 char *guest_fx_image;
202
203 int mmio_needed;
204 int mmio_read_completed;
205 int mmio_is_write;
206 int mmio_size;
207 unsigned char mmio_data[8];
208 gpa_t mmio_phys_addr;
209
210 struct {
211 int active;
212 u8 save_iopl;
213 struct kvm_save_segment {
214 u16 selector;
215 unsigned long base;
216 u32 limit;
217 u32 ar;
218 } tr, es, ds, fs, gs;
219 } rmode;
220};
221
222struct kvm_memory_slot {
223 gfn_t base_gfn;
224 unsigned long npages;
225 unsigned long flags;
226 struct page **phys_mem;
227 unsigned long *dirty_bitmap;
228};
229
230struct kvm {
231 spinlock_t lock; /* protects everything except vcpus */
232 int nmemslots;
233 struct kvm_memory_slot memslots[KVM_MEMORY_SLOTS];
234 struct list_head active_mmu_pages;
235 struct kvm_vcpu vcpus[KVM_MAX_VCPUS];
236 int memory_config_version;
237 int busy;
238};
239
240struct kvm_stat {
241 u32 pf_fixed;
242 u32 pf_guest;
243 u32 tlb_flush;
244 u32 invlpg;
245
246 u32 exits;
247 u32 io_exits;
248 u32 mmio_exits;
249 u32 signal_exits;
250 u32 irq_exits;
251};
252
253struct descriptor_table {
254 u16 limit;
255 unsigned long base;
256} __attribute__((packed));
257
258struct kvm_arch_ops {
259 int (*cpu_has_kvm_support)(void); /* __init */
260 int (*disabled_by_bios)(void); /* __init */
261 void (*hardware_enable)(void *dummy); /* __init */
262 void (*hardware_disable)(void *dummy);
263 int (*hardware_setup)(void); /* __init */
264 void (*hardware_unsetup)(void); /* __exit */
265
266 int (*vcpu_create)(struct kvm_vcpu *vcpu);
267 void (*vcpu_free)(struct kvm_vcpu *vcpu);
268
269 struct kvm_vcpu *(*vcpu_load)(struct kvm_vcpu *vcpu);
270 void (*vcpu_put)(struct kvm_vcpu *vcpu);
271
272 int (*set_guest_debug)(struct kvm_vcpu *vcpu,
273 struct kvm_debug_guest *dbg);
274 int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata);
275 int (*set_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
276 u64 (*get_segment_base)(struct kvm_vcpu *vcpu, int seg);
277 void (*get_segment)(struct kvm_vcpu *vcpu,
278 struct kvm_segment *var, int seg);
279 void (*set_segment)(struct kvm_vcpu *vcpu,
280 struct kvm_segment *var, int seg);
281 int (*is_long_mode)(struct kvm_vcpu *vcpu);
282 void (*get_cs_db_l_bits)(struct kvm_vcpu *vcpu, int *db, int *l);
283 void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0);
284 void (*set_cr0_no_modeswitch)(struct kvm_vcpu *vcpu,
285 unsigned long cr0);
286 void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3);
287 void (*set_cr4)(struct kvm_vcpu *vcpu, unsigned long cr4);
288 void (*set_efer)(struct kvm_vcpu *vcpu, u64 efer);
289 void (*get_idt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt);
290 void (*set_idt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt);
291 void (*get_gdt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt);
292 void (*set_gdt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt);
293 unsigned long (*get_dr)(struct kvm_vcpu *vcpu, int dr);
294 void (*set_dr)(struct kvm_vcpu *vcpu, int dr, unsigned long value,
295 int *exception);
296 void (*cache_regs)(struct kvm_vcpu *vcpu);
297 void (*decache_regs)(struct kvm_vcpu *vcpu);
298 unsigned long (*get_rflags)(struct kvm_vcpu *vcpu);
299 void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags);
300
301 void (*invlpg)(struct kvm_vcpu *vcpu, gva_t addr);
302 void (*tlb_flush)(struct kvm_vcpu *vcpu);
303 void (*inject_page_fault)(struct kvm_vcpu *vcpu,
304 unsigned long addr, u32 err_code);
305
306 void (*inject_gp)(struct kvm_vcpu *vcpu, unsigned err_code);
307
308 int (*run)(struct kvm_vcpu *vcpu, struct kvm_run *run);
309 int (*vcpu_setup)(struct kvm_vcpu *vcpu);
310 void (*skip_emulated_instruction)(struct kvm_vcpu *vcpu);
311};
312
313extern struct kvm_stat kvm_stat;
314extern struct kvm_arch_ops *kvm_arch_ops;
315
316#define kvm_printf(kvm, fmt ...) printk(KERN_DEBUG fmt)
317#define vcpu_printf(vcpu, fmt...) kvm_printf(vcpu->kvm, fmt)
318
319int kvm_init_arch(struct kvm_arch_ops *ops, struct module *module);
320void kvm_exit_arch(void);
321
322void kvm_mmu_destroy(struct kvm_vcpu *vcpu);
323int kvm_mmu_init(struct kvm_vcpu *vcpu);
324
325int kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
326void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot);
327
328hpa_t gpa_to_hpa(struct kvm_vcpu *vcpu, gpa_t gpa);
329#define HPA_MSB ((sizeof(hpa_t) * 8) - 1)
330#define HPA_ERR_MASK ((hpa_t)1 << HPA_MSB)
331static inline int is_error_hpa(hpa_t hpa) { return hpa >> HPA_MSB; }
332hpa_t gva_to_hpa(struct kvm_vcpu *vcpu, gva_t gva);
333
334void kvm_emulator_want_group7_invlpg(void);
335
336extern hpa_t bad_page_address;
337
338static inline struct page *gfn_to_page(struct kvm_memory_slot *slot, gfn_t gfn)
339{
340 return slot->phys_mem[gfn - slot->base_gfn];
341}
342
343struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn);
344void mark_page_dirty(struct kvm *kvm, gfn_t gfn);
345
346enum emulation_result {
347 EMULATE_DONE, /* no further processing */
348 EMULATE_DO_MMIO, /* kvm_run filled with mmio request */
349 EMULATE_FAIL, /* can't emulate this instruction */
350};
351
352int emulate_instruction(struct kvm_vcpu *vcpu, struct kvm_run *run,
353 unsigned long cr2, u16 error_code);
354void realmode_lgdt(struct kvm_vcpu *vcpu, u16 size, unsigned long address);
355void realmode_lidt(struct kvm_vcpu *vcpu, u16 size, unsigned long address);
356void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
357 unsigned long *rflags);
358
359unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr);
360void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long value,
361 unsigned long *rflags);
362
363struct x86_emulate_ctxt;
364
365int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address);
366int emulate_clts(struct kvm_vcpu *vcpu);
367int emulator_get_dr(struct x86_emulate_ctxt* ctxt, int dr,
368 unsigned long *dest);
369int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr,
370 unsigned long value);
371
372void set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
373void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr0);
374void set_cr4(struct kvm_vcpu *vcpu, unsigned long cr0);
375void set_cr8(struct kvm_vcpu *vcpu, unsigned long cr0);
376void lmsw(struct kvm_vcpu *vcpu, unsigned long msw);
377
378#ifdef __x86_64__
379void set_efer(struct kvm_vcpu *vcpu, u64 efer);
380#endif
381
382void fx_init(struct kvm_vcpu *vcpu);
383
384void load_msrs(struct vmx_msr_entry *e, int n);
385void save_msrs(struct vmx_msr_entry *e, int n);
386void kvm_resched(struct kvm_vcpu *vcpu);
387
388int kvm_read_guest(struct kvm_vcpu *vcpu,
389 gva_t addr,
390 unsigned long size,
391 void *dest);
392
393int kvm_write_guest(struct kvm_vcpu *vcpu,
394 gva_t addr,
395 unsigned long size,
396 void *data);
397
398unsigned long segment_base(u16 selector);
399
400static inline struct page *_gfn_to_page(struct kvm *kvm, gfn_t gfn)
401{
402 struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
403 return (slot) ? slot->phys_mem[gfn - slot->base_gfn] : NULL;
404}
405
406static inline int is_pae(struct kvm_vcpu *vcpu)
407{
408 return vcpu->cr4 & CR4_PAE_MASK;
409}
410
411static inline int is_pse(struct kvm_vcpu *vcpu)
412{
413 return vcpu->cr4 & CR4_PSE_MASK;
414}
415
416static inline int is_paging(struct kvm_vcpu *vcpu)
417{
418 return vcpu->cr0 & CR0_PG_MASK;
419}
420
421static inline int memslot_id(struct kvm *kvm, struct kvm_memory_slot *slot)
422{
423 return slot - kvm->memslots;
424}
425
426static inline struct kvm_mmu_page *page_header(hpa_t shadow_page)
427{
428 struct page *page = pfn_to_page(shadow_page >> PAGE_SHIFT);
429
430 return (struct kvm_mmu_page *)page->private;
431}
432
433static inline u16 read_fs(void)
434{
435 u16 seg;
436 asm ("mov %%fs, %0" : "=g"(seg));
437 return seg;
438}
439
440static inline u16 read_gs(void)
441{
442 u16 seg;
443 asm ("mov %%gs, %0" : "=g"(seg));
444 return seg;
445}
446
447static inline u16 read_ldt(void)
448{
449 u16 ldt;
450 asm ("sldt %0" : "=g"(ldt));
451 return ldt;
452}
453
454static inline void load_fs(u16 sel)
455{
456 asm ("mov %0, %%fs" : : "rm"(sel));
457}
458
459static inline void load_gs(u16 sel)
460{
461 asm ("mov %0, %%gs" : : "rm"(sel));
462}
463
464#ifndef load_ldt
465static inline void load_ldt(u16 sel)
466{
467 asm ("lldt %0" : : "g"(sel));
468}
469#endif
470
471static inline void get_idt(struct descriptor_table *table)
472{
473 asm ("sidt %0" : "=m"(*table));
474}
475
476static inline void get_gdt(struct descriptor_table *table)
477{
478 asm ("sgdt %0" : "=m"(*table));
479}
480
481static inline unsigned long read_tr_base(void)
482{
483 u16 tr;
484 asm ("str %0" : "=g"(tr));
485 return segment_base(tr);
486}
487
488#ifdef __x86_64__
489static inline unsigned long read_msr(unsigned long msr)
490{
491 u64 value;
492
493 rdmsrl(msr, value);
494 return value;
495}
496#endif
497
498static inline void fx_save(void *image)
499{
500 asm ("fxsave (%0)":: "r" (image));
501}
502
503static inline void fx_restore(void *image)
504{
505 asm ("fxrstor (%0)":: "r" (image));
506}
507
508static inline void fpu_init(void)
509{
510 asm ("finit");
511}
512
513static inline u32 get_rdx_init_val(void)
514{
515 return 0x600; /* P6 family */
516}
517
518#define ASM_VMX_VMCLEAR_RAX ".byte 0x66, 0x0f, 0xc7, 0x30"
519#define ASM_VMX_VMLAUNCH ".byte 0x0f, 0x01, 0xc2"
520#define ASM_VMX_VMRESUME ".byte 0x0f, 0x01, 0xc3"
521#define ASM_VMX_VMPTRLD_RAX ".byte 0x0f, 0xc7, 0x30"
522#define ASM_VMX_VMREAD_RDX_RAX ".byte 0x0f, 0x78, 0xd0"
523#define ASM_VMX_VMWRITE_RAX_RDX ".byte 0x0f, 0x79, 0xd0"
524#define ASM_VMX_VMWRITE_RSP_RDX ".byte 0x0f, 0x79, 0xd4"
525#define ASM_VMX_VMXOFF ".byte 0x0f, 0x01, 0xc4"
526#define ASM_VMX_VMXON_RAX ".byte 0xf3, 0x0f, 0xc7, 0x30"
527
528#define MSR_IA32_TIME_STAMP_COUNTER 0x010
529
530#define TSS_IOPB_BASE_OFFSET 0x66
531#define TSS_BASE_SIZE 0x68
532#define TSS_IOPB_SIZE (65536 / 8)
533#define TSS_REDIRECTION_SIZE (256 / 8)
534#define RMODE_TSS_SIZE (TSS_BASE_SIZE + TSS_REDIRECTION_SIZE + TSS_IOPB_SIZE + 1)
535
536#ifdef __x86_64__
537
538/*
539 * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64. Therefore
540 * we need to allocate shadow page tables in the first 4GB of memory, which
541 * happens to fit the DMA32 zone.
542 */
543#define GFP_KVM_MMU (GFP_KERNEL | __GFP_DMA32)
544
545#else
546
547#define GFP_KVM_MMU GFP_KERNEL
548
549#endif
550
551#endif
diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c
new file mode 100644
index 000000000000..b6b8a41b5ec8
--- /dev/null
+++ b/drivers/kvm/kvm_main.c
@@ -0,0 +1,1935 @@
1/*
2 * Kernel-based Virtual Machine driver for Linux
3 *
4 * This module enables machines with Intel VT-x extensions to run virtual
5 * machines without emulation or binary translation.
6 *
7 * Copyright (C) 2006 Qumranet, Inc.
8 *
9 * Authors:
10 * Avi Kivity <avi@qumranet.com>
11 * Yaniv Kamay <yaniv@qumranet.com>
12 *
13 * This work is licensed under the terms of the GNU GPL, version 2. See
14 * the COPYING file in the top-level directory.
15 *
16 */
17
18#include "kvm.h"
19
20#include <linux/kvm.h>
21#include <linux/module.h>
22#include <linux/errno.h>
23#include <asm/processor.h>
24#include <linux/percpu.h>
25#include <linux/gfp.h>
26#include <asm/msr.h>
27#include <linux/mm.h>
28#include <linux/miscdevice.h>
29#include <linux/vmalloc.h>
30#include <asm/uaccess.h>
31#include <linux/reboot.h>
32#include <asm/io.h>
33#include <linux/debugfs.h>
34#include <linux/highmem.h>
35#include <linux/file.h>
36#include <asm/desc.h>
37
38#include "x86_emulate.h"
39#include "segment_descriptor.h"
40
41MODULE_AUTHOR("Qumranet");
42MODULE_LICENSE("GPL");
43
44struct kvm_arch_ops *kvm_arch_ops;
45struct kvm_stat kvm_stat;
46EXPORT_SYMBOL_GPL(kvm_stat);
47
48static struct kvm_stats_debugfs_item {
49 const char *name;
50 u32 *data;
51 struct dentry *dentry;
52} debugfs_entries[] = {
53 { "pf_fixed", &kvm_stat.pf_fixed },
54 { "pf_guest", &kvm_stat.pf_guest },
55 { "tlb_flush", &kvm_stat.tlb_flush },
56 { "invlpg", &kvm_stat.invlpg },
57 { "exits", &kvm_stat.exits },
58 { "io_exits", &kvm_stat.io_exits },
59 { "mmio_exits", &kvm_stat.mmio_exits },
60 { "signal_exits", &kvm_stat.signal_exits },
61 { "irq_exits", &kvm_stat.irq_exits },
62 { 0, 0 }
63};
64
65static struct dentry *debugfs_dir;
66
67#define MAX_IO_MSRS 256
68
69#define CR0_RESEVED_BITS 0xffffffff1ffaffc0ULL
70#define LMSW_GUEST_MASK 0x0eULL
71#define CR4_RESEVED_BITS (~((1ULL << 11) - 1))
72#define CR8_RESEVED_BITS (~0x0fULL)
73#define EFER_RESERVED_BITS 0xfffffffffffff2fe
74
75struct vmx_msr_entry *find_msr_entry(struct kvm_vcpu *vcpu, u32 msr)
76{
77 int i;
78
79 for (i = 0; i < vcpu->nmsrs; ++i)
80 if (vcpu->guest_msrs[i].index == msr)
81 return &vcpu->guest_msrs[i];
82 return 0;
83}
84EXPORT_SYMBOL_GPL(find_msr_entry);
85
86#ifdef __x86_64__
87// LDT or TSS descriptor in the GDT. 16 bytes.
88struct segment_descriptor_64 {
89 struct segment_descriptor s;
90 u32 base_higher;
91 u32 pad_zero;
92};
93
94#endif
95
96unsigned long segment_base(u16 selector)
97{
98 struct descriptor_table gdt;
99 struct segment_descriptor *d;
100 unsigned long table_base;
101 typedef unsigned long ul;
102 unsigned long v;
103
104 if (selector == 0)
105 return 0;
106
107 asm ("sgdt %0" : "=m"(gdt));
108 table_base = gdt.base;
109
110 if (selector & 4) { /* from ldt */
111 u16 ldt_selector;
112
113 asm ("sldt %0" : "=g"(ldt_selector));
114 table_base = segment_base(ldt_selector);
115 }
116 d = (struct segment_descriptor *)(table_base + (selector & ~7));
117 v = d->base_low | ((ul)d->base_mid << 16) | ((ul)d->base_high << 24);
118#ifdef __x86_64__
119 if (d->system == 0
120 && (d->type == 2 || d->type == 9 || d->type == 11))
121 v |= ((ul)((struct segment_descriptor_64 *)d)->base_higher) << 32;
122#endif
123 return v;
124}
125EXPORT_SYMBOL_GPL(segment_base);
126
127int kvm_read_guest(struct kvm_vcpu *vcpu,
128 gva_t addr,
129 unsigned long size,
130 void *dest)
131{
132 unsigned char *host_buf = dest;
133 unsigned long req_size = size;
134
135 while (size) {
136 hpa_t paddr;
137 unsigned now;
138 unsigned offset;
139 hva_t guest_buf;
140
141 paddr = gva_to_hpa(vcpu, addr);
142
143 if (is_error_hpa(paddr))
144 break;
145
146 guest_buf = (hva_t)kmap_atomic(
147 pfn_to_page(paddr >> PAGE_SHIFT),
148 KM_USER0);
149 offset = addr & ~PAGE_MASK;
150 guest_buf |= offset;
151 now = min(size, PAGE_SIZE - offset);
152 memcpy(host_buf, (void*)guest_buf, now);
153 host_buf += now;
154 addr += now;
155 size -= now;
156 kunmap_atomic((void *)(guest_buf & PAGE_MASK), KM_USER0);
157 }
158 return req_size - size;
159}
160EXPORT_SYMBOL_GPL(kvm_read_guest);
161
162int kvm_write_guest(struct kvm_vcpu *vcpu,
163 gva_t addr,
164 unsigned long size,
165 void *data)
166{
167 unsigned char *host_buf = data;
168 unsigned long req_size = size;
169
170 while (size) {
171 hpa_t paddr;
172 unsigned now;
173 unsigned offset;
174 hva_t guest_buf;
175
176 paddr = gva_to_hpa(vcpu, addr);
177
178 if (is_error_hpa(paddr))
179 break;
180
181 guest_buf = (hva_t)kmap_atomic(
182 pfn_to_page(paddr >> PAGE_SHIFT), KM_USER0);
183 offset = addr & ~PAGE_MASK;
184 guest_buf |= offset;
185 now = min(size, PAGE_SIZE - offset);
186 memcpy((void*)guest_buf, host_buf, now);
187 host_buf += now;
188 addr += now;
189 size -= now;
190 kunmap_atomic((void *)(guest_buf & PAGE_MASK), KM_USER0);
191 }
192 return req_size - size;
193}
194EXPORT_SYMBOL_GPL(kvm_write_guest);
195
196static int vcpu_slot(struct kvm_vcpu *vcpu)
197{
198 return vcpu - vcpu->kvm->vcpus;
199}
200
201/*
202 * Switches to specified vcpu, until a matching vcpu_put()
203 */
204static struct kvm_vcpu *vcpu_load(struct kvm *kvm, int vcpu_slot)
205{
206 struct kvm_vcpu *vcpu = &kvm->vcpus[vcpu_slot];
207
208 mutex_lock(&vcpu->mutex);
209 if (unlikely(!vcpu->vmcs)) {
210 mutex_unlock(&vcpu->mutex);
211 return 0;
212 }
213 return kvm_arch_ops->vcpu_load(vcpu);
214}
215
216static void vcpu_put(struct kvm_vcpu *vcpu)
217{
218 kvm_arch_ops->vcpu_put(vcpu);
219 put_cpu();
220 mutex_unlock(&vcpu->mutex);
221}
222
223static int kvm_dev_open(struct inode *inode, struct file *filp)
224{
225 struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
226 int i;
227
228 if (!kvm)
229 return -ENOMEM;
230
231 spin_lock_init(&kvm->lock);
232 INIT_LIST_HEAD(&kvm->active_mmu_pages);
233 for (i = 0; i < KVM_MAX_VCPUS; ++i) {
234 struct kvm_vcpu *vcpu = &kvm->vcpus[i];
235
236 mutex_init(&vcpu->mutex);
237 vcpu->mmu.root_hpa = INVALID_PAGE;
238 INIT_LIST_HEAD(&vcpu->free_pages);
239 }
240 filp->private_data = kvm;
241 return 0;
242}
243
244/*
245 * Free any memory in @free but not in @dont.
246 */
247static void kvm_free_physmem_slot(struct kvm_memory_slot *free,
248 struct kvm_memory_slot *dont)
249{
250 int i;
251
252 if (!dont || free->phys_mem != dont->phys_mem)
253 if (free->phys_mem) {
254 for (i = 0; i < free->npages; ++i)
255 __free_page(free->phys_mem[i]);
256 vfree(free->phys_mem);
257 }
258
259 if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
260 vfree(free->dirty_bitmap);
261
262 free->phys_mem = 0;
263 free->npages = 0;
264 free->dirty_bitmap = 0;
265}
266
267static void kvm_free_physmem(struct kvm *kvm)
268{
269 int i;
270
271 for (i = 0; i < kvm->nmemslots; ++i)
272 kvm_free_physmem_slot(&kvm->memslots[i], 0);
273}
274
275static void kvm_free_vcpu(struct kvm_vcpu *vcpu)
276{
277 kvm_arch_ops->vcpu_free(vcpu);
278 kvm_mmu_destroy(vcpu);
279}
280
281static void kvm_free_vcpus(struct kvm *kvm)
282{
283 unsigned int i;
284
285 for (i = 0; i < KVM_MAX_VCPUS; ++i)
286 kvm_free_vcpu(&kvm->vcpus[i]);
287}
288
289static int kvm_dev_release(struct inode *inode, struct file *filp)
290{
291 struct kvm *kvm = filp->private_data;
292
293 kvm_free_vcpus(kvm);
294 kvm_free_physmem(kvm);
295 kfree(kvm);
296 return 0;
297}
298
299static void inject_gp(struct kvm_vcpu *vcpu)
300{
301 kvm_arch_ops->inject_gp(vcpu, 0);
302}
303
304static int pdptrs_have_reserved_bits_set(struct kvm_vcpu *vcpu,
305 unsigned long cr3)
306{
307 gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
308 unsigned offset = (cr3 & (PAGE_SIZE-1)) >> 5;
309 int i;
310 u64 pdpte;
311 u64 *pdpt;
312 struct kvm_memory_slot *memslot;
313
314 spin_lock(&vcpu->kvm->lock);
315 memslot = gfn_to_memslot(vcpu->kvm, pdpt_gfn);
316 /* FIXME: !memslot - emulate? 0xff? */
317 pdpt = kmap_atomic(gfn_to_page(memslot, pdpt_gfn), KM_USER0);
318
319 for (i = 0; i < 4; ++i) {
320 pdpte = pdpt[offset + i];
321 if ((pdpte & 1) && (pdpte & 0xfffffff0000001e6ull))
322 break;
323 }
324
325 kunmap_atomic(pdpt, KM_USER0);
326 spin_unlock(&vcpu->kvm->lock);
327
328 return i != 4;
329}
330
331void set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
332{
333 if (cr0 & CR0_RESEVED_BITS) {
334 printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n",
335 cr0, vcpu->cr0);
336 inject_gp(vcpu);
337 return;
338 }
339
340 if ((cr0 & CR0_NW_MASK) && !(cr0 & CR0_CD_MASK)) {
341 printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n");
342 inject_gp(vcpu);
343 return;
344 }
345
346 if ((cr0 & CR0_PG_MASK) && !(cr0 & CR0_PE_MASK)) {
347 printk(KERN_DEBUG "set_cr0: #GP, set PG flag "
348 "and a clear PE flag\n");
349 inject_gp(vcpu);
350 return;
351 }
352
353 if (!is_paging(vcpu) && (cr0 & CR0_PG_MASK)) {
354#ifdef __x86_64__
355 if ((vcpu->shadow_efer & EFER_LME)) {
356 int cs_db, cs_l;
357
358 if (!is_pae(vcpu)) {
359 printk(KERN_DEBUG "set_cr0: #GP, start paging "
360 "in long mode while PAE is disabled\n");
361 inject_gp(vcpu);
362 return;
363 }
364 kvm_arch_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
365 if (cs_l) {
366 printk(KERN_DEBUG "set_cr0: #GP, start paging "
367 "in long mode while CS.L == 1\n");
368 inject_gp(vcpu);
369 return;
370
371 }
372 } else
373#endif
374 if (is_pae(vcpu) &&
375 pdptrs_have_reserved_bits_set(vcpu, vcpu->cr3)) {
376 printk(KERN_DEBUG "set_cr0: #GP, pdptrs "
377 "reserved bits\n");
378 inject_gp(vcpu);
379 return;
380 }
381
382 }
383
384 kvm_arch_ops->set_cr0(vcpu, cr0);
385 vcpu->cr0 = cr0;
386
387 spin_lock(&vcpu->kvm->lock);
388 kvm_mmu_reset_context(vcpu);
389 spin_unlock(&vcpu->kvm->lock);
390 return;
391}
392EXPORT_SYMBOL_GPL(set_cr0);
393
394void lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
395{
396 set_cr0(vcpu, (vcpu->cr0 & ~0x0ful) | (msw & 0x0f));
397}
398EXPORT_SYMBOL_GPL(lmsw);
399
400void set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
401{
402 if (cr4 & CR4_RESEVED_BITS) {
403 printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n");
404 inject_gp(vcpu);
405 return;
406 }
407
408 if (kvm_arch_ops->is_long_mode(vcpu)) {
409 if (!(cr4 & CR4_PAE_MASK)) {
410 printk(KERN_DEBUG "set_cr4: #GP, clearing PAE while "
411 "in long mode\n");
412 inject_gp(vcpu);
413 return;
414 }
415 } else if (is_paging(vcpu) && !is_pae(vcpu) && (cr4 & CR4_PAE_MASK)
416 && pdptrs_have_reserved_bits_set(vcpu, vcpu->cr3)) {
417 printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n");
418 inject_gp(vcpu);
419 }
420
421 if (cr4 & CR4_VMXE_MASK) {
422 printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n");
423 inject_gp(vcpu);
424 return;
425 }
426 kvm_arch_ops->set_cr4(vcpu, cr4);
427 spin_lock(&vcpu->kvm->lock);
428 kvm_mmu_reset_context(vcpu);
429 spin_unlock(&vcpu->kvm->lock);
430}
431EXPORT_SYMBOL_GPL(set_cr4);
432
433void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
434{
435 if (kvm_arch_ops->is_long_mode(vcpu)) {
436 if ( cr3 & CR3_L_MODE_RESEVED_BITS) {
437 printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n");
438 inject_gp(vcpu);
439 return;
440 }
441 } else {
442 if (cr3 & CR3_RESEVED_BITS) {
443 printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n");
444 inject_gp(vcpu);
445 return;
446 }
447 if (is_paging(vcpu) && is_pae(vcpu) &&
448 pdptrs_have_reserved_bits_set(vcpu, cr3)) {
449 printk(KERN_DEBUG "set_cr3: #GP, pdptrs "
450 "reserved bits\n");
451 inject_gp(vcpu);
452 return;
453 }
454 }
455
456 vcpu->cr3 = cr3;
457 spin_lock(&vcpu->kvm->lock);
458 vcpu->mmu.new_cr3(vcpu);
459 spin_unlock(&vcpu->kvm->lock);
460}
461EXPORT_SYMBOL_GPL(set_cr3);
462
463void set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
464{
465 if ( cr8 & CR8_RESEVED_BITS) {
466 printk(KERN_DEBUG "set_cr8: #GP, reserved bits 0x%lx\n", cr8);
467 inject_gp(vcpu);
468 return;
469 }
470 vcpu->cr8 = cr8;
471}
472EXPORT_SYMBOL_GPL(set_cr8);
473
474void fx_init(struct kvm_vcpu *vcpu)
475{
476 struct __attribute__ ((__packed__)) fx_image_s {
477 u16 control; //fcw
478 u16 status; //fsw
479 u16 tag; // ftw
480 u16 opcode; //fop
481 u64 ip; // fpu ip
482 u64 operand;// fpu dp
483 u32 mxcsr;
484 u32 mxcsr_mask;
485
486 } *fx_image;
487
488 fx_save(vcpu->host_fx_image);
489 fpu_init();
490 fx_save(vcpu->guest_fx_image);
491 fx_restore(vcpu->host_fx_image);
492
493 fx_image = (struct fx_image_s *)vcpu->guest_fx_image;
494 fx_image->mxcsr = 0x1f80;
495 memset(vcpu->guest_fx_image + sizeof(struct fx_image_s),
496 0, FX_IMAGE_SIZE - sizeof(struct fx_image_s));
497}
498EXPORT_SYMBOL_GPL(fx_init);
499
500/*
501 * Creates some virtual cpus. Good luck creating more than one.
502 */
503static int kvm_dev_ioctl_create_vcpu(struct kvm *kvm, int n)
504{
505 int r;
506 struct kvm_vcpu *vcpu;
507
508 r = -EINVAL;
509 if (n < 0 || n >= KVM_MAX_VCPUS)
510 goto out;
511
512 vcpu = &kvm->vcpus[n];
513
514 mutex_lock(&vcpu->mutex);
515
516 if (vcpu->vmcs) {
517 mutex_unlock(&vcpu->mutex);
518 return -EEXIST;
519 }
520
521 vcpu->host_fx_image = (char*)ALIGN((hva_t)vcpu->fx_buf,
522 FX_IMAGE_ALIGN);
523 vcpu->guest_fx_image = vcpu->host_fx_image + FX_IMAGE_SIZE;
524
525 vcpu->cpu = -1; /* First load will set up TR */
526 vcpu->kvm = kvm;
527 r = kvm_arch_ops->vcpu_create(vcpu);
528 if (r < 0)
529 goto out_free_vcpus;
530
531 kvm_arch_ops->vcpu_load(vcpu);
532
533 r = kvm_arch_ops->vcpu_setup(vcpu);
534 if (r >= 0)
535 r = kvm_mmu_init(vcpu);
536
537 vcpu_put(vcpu);
538
539 if (r < 0)
540 goto out_free_vcpus;
541
542 return 0;
543
544out_free_vcpus:
545 kvm_free_vcpu(vcpu);
546 mutex_unlock(&vcpu->mutex);
547out:
548 return r;
549}
550
551/*
552 * Allocate some memory and give it an address in the guest physical address
553 * space.
554 *
555 * Discontiguous memory is allowed, mostly for framebuffers.
556 */
557static int kvm_dev_ioctl_set_memory_region(struct kvm *kvm,
558 struct kvm_memory_region *mem)
559{
560 int r;
561 gfn_t base_gfn;
562 unsigned long npages;
563 unsigned long i;
564 struct kvm_memory_slot *memslot;
565 struct kvm_memory_slot old, new;
566 int memory_config_version;
567
568 r = -EINVAL;
569 /* General sanity checks */
570 if (mem->memory_size & (PAGE_SIZE - 1))
571 goto out;
572 if (mem->guest_phys_addr & (PAGE_SIZE - 1))
573 goto out;
574 if (mem->slot >= KVM_MEMORY_SLOTS)
575 goto out;
576 if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
577 goto out;
578
579 memslot = &kvm->memslots[mem->slot];
580 base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;
581 npages = mem->memory_size >> PAGE_SHIFT;
582
583 if (!npages)
584 mem->flags &= ~KVM_MEM_LOG_DIRTY_PAGES;
585
586raced:
587 spin_lock(&kvm->lock);
588
589 memory_config_version = kvm->memory_config_version;
590 new = old = *memslot;
591
592 new.base_gfn = base_gfn;
593 new.npages = npages;
594 new.flags = mem->flags;
595
596 /* Disallow changing a memory slot's size. */
597 r = -EINVAL;
598 if (npages && old.npages && npages != old.npages)
599 goto out_unlock;
600
601 /* Check for overlaps */
602 r = -EEXIST;
603 for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
604 struct kvm_memory_slot *s = &kvm->memslots[i];
605
606 if (s == memslot)
607 continue;
608 if (!((base_gfn + npages <= s->base_gfn) ||
609 (base_gfn >= s->base_gfn + s->npages)))
610 goto out_unlock;
611 }
612 /*
613 * Do memory allocations outside lock. memory_config_version will
614 * detect any races.
615 */
616 spin_unlock(&kvm->lock);
617
618 /* Deallocate if slot is being removed */
619 if (!npages)
620 new.phys_mem = 0;
621
622 /* Free page dirty bitmap if unneeded */
623 if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES))
624 new.dirty_bitmap = 0;
625
626 r = -ENOMEM;
627
628 /* Allocate if a slot is being created */
629 if (npages && !new.phys_mem) {
630 new.phys_mem = vmalloc(npages * sizeof(struct page *));
631
632 if (!new.phys_mem)
633 goto out_free;
634
635 memset(new.phys_mem, 0, npages * sizeof(struct page *));
636 for (i = 0; i < npages; ++i) {
637 new.phys_mem[i] = alloc_page(GFP_HIGHUSER
638 | __GFP_ZERO);
639 if (!new.phys_mem[i])
640 goto out_free;
641 }
642 }
643
644 /* Allocate page dirty bitmap if needed */
645 if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) {
646 unsigned dirty_bytes = ALIGN(npages, BITS_PER_LONG) / 8;
647
648 new.dirty_bitmap = vmalloc(dirty_bytes);
649 if (!new.dirty_bitmap)
650 goto out_free;
651 memset(new.dirty_bitmap, 0, dirty_bytes);
652 }
653
654 spin_lock(&kvm->lock);
655
656 if (memory_config_version != kvm->memory_config_version) {
657 spin_unlock(&kvm->lock);
658 kvm_free_physmem_slot(&new, &old);
659 goto raced;
660 }
661
662 r = -EAGAIN;
663 if (kvm->busy)
664 goto out_unlock;
665
666 if (mem->slot >= kvm->nmemslots)
667 kvm->nmemslots = mem->slot + 1;
668
669 *memslot = new;
670 ++kvm->memory_config_version;
671
672 spin_unlock(&kvm->lock);
673
674 for (i = 0; i < KVM_MAX_VCPUS; ++i) {
675 struct kvm_vcpu *vcpu;
676
677 vcpu = vcpu_load(kvm, i);
678 if (!vcpu)
679 continue;
680 kvm_mmu_reset_context(vcpu);
681 vcpu_put(vcpu);
682 }
683
684 kvm_free_physmem_slot(&old, &new);
685 return 0;
686
687out_unlock:
688 spin_unlock(&kvm->lock);
689out_free:
690 kvm_free_physmem_slot(&new, &old);
691out:
692 return r;
693}
694
695/*
696 * Get (and clear) the dirty memory log for a memory slot.
697 */
698static int kvm_dev_ioctl_get_dirty_log(struct kvm *kvm,
699 struct kvm_dirty_log *log)
700{
701 struct kvm_memory_slot *memslot;
702 int r, i;
703 int n;
704 unsigned long any = 0;
705
706 spin_lock(&kvm->lock);
707
708 /*
709 * Prevent changes to guest memory configuration even while the lock
710 * is not taken.
711 */
712 ++kvm->busy;
713 spin_unlock(&kvm->lock);
714 r = -EINVAL;
715 if (log->slot >= KVM_MEMORY_SLOTS)
716 goto out;
717
718 memslot = &kvm->memslots[log->slot];
719 r = -ENOENT;
720 if (!memslot->dirty_bitmap)
721 goto out;
722
723 n = ALIGN(memslot->npages, 8) / 8;
724
725 for (i = 0; !any && i < n; ++i)
726 any = memslot->dirty_bitmap[i];
727
728 r = -EFAULT;
729 if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n))
730 goto out;
731
732
733 if (any) {
734 spin_lock(&kvm->lock);
735 kvm_mmu_slot_remove_write_access(kvm, log->slot);
736 spin_unlock(&kvm->lock);
737 memset(memslot->dirty_bitmap, 0, n);
738 for (i = 0; i < KVM_MAX_VCPUS; ++i) {
739 struct kvm_vcpu *vcpu = vcpu_load(kvm, i);
740
741 if (!vcpu)
742 continue;
743 kvm_arch_ops->tlb_flush(vcpu);
744 vcpu_put(vcpu);
745 }
746 }
747
748 r = 0;
749
750out:
751 spin_lock(&kvm->lock);
752 --kvm->busy;
753 spin_unlock(&kvm->lock);
754 return r;
755}
756
757struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
758{
759 int i;
760
761 for (i = 0; i < kvm->nmemslots; ++i) {
762 struct kvm_memory_slot *memslot = &kvm->memslots[i];
763
764 if (gfn >= memslot->base_gfn
765 && gfn < memslot->base_gfn + memslot->npages)
766 return memslot;
767 }
768 return 0;
769}
770EXPORT_SYMBOL_GPL(gfn_to_memslot);
771
772void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
773{
774 int i;
775 struct kvm_memory_slot *memslot = 0;
776 unsigned long rel_gfn;
777
778 for (i = 0; i < kvm->nmemslots; ++i) {
779 memslot = &kvm->memslots[i];
780
781 if (gfn >= memslot->base_gfn
782 && gfn < memslot->base_gfn + memslot->npages) {
783
784 if (!memslot || !memslot->dirty_bitmap)
785 return;
786
787 rel_gfn = gfn - memslot->base_gfn;
788
789 /* avoid RMW */
790 if (!test_bit(rel_gfn, memslot->dirty_bitmap))
791 set_bit(rel_gfn, memslot->dirty_bitmap);
792 return;
793 }
794 }
795}
796
797static int emulator_read_std(unsigned long addr,
798 unsigned long *val,
799 unsigned int bytes,
800 struct x86_emulate_ctxt *ctxt)
801{
802 struct kvm_vcpu *vcpu = ctxt->vcpu;
803 void *data = val;
804
805 while (bytes) {
806 gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
807 unsigned offset = addr & (PAGE_SIZE-1);
808 unsigned tocopy = min(bytes, (unsigned)PAGE_SIZE - offset);
809 unsigned long pfn;
810 struct kvm_memory_slot *memslot;
811 void *page;
812
813 if (gpa == UNMAPPED_GVA)
814 return X86EMUL_PROPAGATE_FAULT;
815 pfn = gpa >> PAGE_SHIFT;
816 memslot = gfn_to_memslot(vcpu->kvm, pfn);
817 if (!memslot)
818 return X86EMUL_UNHANDLEABLE;
819 page = kmap_atomic(gfn_to_page(memslot, pfn), KM_USER0);
820
821 memcpy(data, page + offset, tocopy);
822
823 kunmap_atomic(page, KM_USER0);
824
825 bytes -= tocopy;
826 data += tocopy;
827 addr += tocopy;
828 }
829
830 return X86EMUL_CONTINUE;
831}
832
833static int emulator_write_std(unsigned long addr,
834 unsigned long val,
835 unsigned int bytes,
836 struct x86_emulate_ctxt *ctxt)
837{
838 printk(KERN_ERR "emulator_write_std: addr %lx n %d\n",
839 addr, bytes);
840 return X86EMUL_UNHANDLEABLE;
841}
842
843static int emulator_read_emulated(unsigned long addr,
844 unsigned long *val,
845 unsigned int bytes,
846 struct x86_emulate_ctxt *ctxt)
847{
848 struct kvm_vcpu *vcpu = ctxt->vcpu;
849
850 if (vcpu->mmio_read_completed) {
851 memcpy(val, vcpu->mmio_data, bytes);
852 vcpu->mmio_read_completed = 0;
853 return X86EMUL_CONTINUE;
854 } else if (emulator_read_std(addr, val, bytes, ctxt)
855 == X86EMUL_CONTINUE)
856 return X86EMUL_CONTINUE;
857 else {
858 gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
859 if (gpa == UNMAPPED_GVA)
860 return vcpu_printf(vcpu, "not present\n"), X86EMUL_PROPAGATE_FAULT;
861 vcpu->mmio_needed = 1;
862 vcpu->mmio_phys_addr = gpa;
863 vcpu->mmio_size = bytes;
864 vcpu->mmio_is_write = 0;
865
866 return X86EMUL_UNHANDLEABLE;
867 }
868}
869
870static int emulator_write_emulated(unsigned long addr,
871 unsigned long val,
872 unsigned int bytes,
873 struct x86_emulate_ctxt *ctxt)
874{
875 struct kvm_vcpu *vcpu = ctxt->vcpu;
876 gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
877
878 if (gpa == UNMAPPED_GVA)
879 return X86EMUL_PROPAGATE_FAULT;
880
881 vcpu->mmio_needed = 1;
882 vcpu->mmio_phys_addr = gpa;
883 vcpu->mmio_size = bytes;
884 vcpu->mmio_is_write = 1;
885 memcpy(vcpu->mmio_data, &val, bytes);
886
887 return X86EMUL_CONTINUE;
888}
889
890static int emulator_cmpxchg_emulated(unsigned long addr,
891 unsigned long old,
892 unsigned long new,
893 unsigned int bytes,
894 struct x86_emulate_ctxt *ctxt)
895{
896 static int reported;
897
898 if (!reported) {
899 reported = 1;
900 printk(KERN_WARNING "kvm: emulating exchange as write\n");
901 }
902 return emulator_write_emulated(addr, new, bytes, ctxt);
903}
904
905static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
906{
907 return kvm_arch_ops->get_segment_base(vcpu, seg);
908}
909
910int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)
911{
912 spin_lock(&vcpu->kvm->lock);
913 vcpu->mmu.inval_page(vcpu, address);
914 spin_unlock(&vcpu->kvm->lock);
915 kvm_arch_ops->invlpg(vcpu, address);
916 return X86EMUL_CONTINUE;
917}
918
919int emulate_clts(struct kvm_vcpu *vcpu)
920{
921 unsigned long cr0 = vcpu->cr0;
922
923 cr0 &= ~CR0_TS_MASK;
924 kvm_arch_ops->set_cr0(vcpu, cr0);
925 return X86EMUL_CONTINUE;
926}
927
928int emulator_get_dr(struct x86_emulate_ctxt* ctxt, int dr, unsigned long *dest)
929{
930 struct kvm_vcpu *vcpu = ctxt->vcpu;
931
932 switch (dr) {
933 case 0 ... 3:
934 *dest = kvm_arch_ops->get_dr(vcpu, dr);
935 return X86EMUL_CONTINUE;
936 default:
937 printk(KERN_DEBUG "%s: unexpected dr %u\n",
938 __FUNCTION__, dr);
939 return X86EMUL_UNHANDLEABLE;
940 }
941}
942
943int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value)
944{
945 unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U;
946 int exception;
947
948 kvm_arch_ops->set_dr(ctxt->vcpu, dr, value & mask, &exception);
949 if (exception) {
950 /* FIXME: better handling */
951 return X86EMUL_UNHANDLEABLE;
952 }
953 return X86EMUL_CONTINUE;
954}
955
956static void report_emulation_failure(struct x86_emulate_ctxt *ctxt)
957{
958 static int reported;
959 u8 opcodes[4];
960 unsigned long rip = ctxt->vcpu->rip;
961 unsigned long rip_linear;
962
963 rip_linear = rip + get_segment_base(ctxt->vcpu, VCPU_SREG_CS);
964
965 if (reported)
966 return;
967
968 emulator_read_std(rip_linear, (void *)opcodes, 4, ctxt);
969
970 printk(KERN_ERR "emulation failed but !mmio_needed?"
971 " rip %lx %02x %02x %02x %02x\n",
972 rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]);
973 reported = 1;
974}
975
976struct x86_emulate_ops emulate_ops = {
977 .read_std = emulator_read_std,
978 .write_std = emulator_write_std,
979 .read_emulated = emulator_read_emulated,
980 .write_emulated = emulator_write_emulated,
981 .cmpxchg_emulated = emulator_cmpxchg_emulated,
982};
983
984int emulate_instruction(struct kvm_vcpu *vcpu,
985 struct kvm_run *run,
986 unsigned long cr2,
987 u16 error_code)
988{
989 struct x86_emulate_ctxt emulate_ctxt;
990 int r;
991 int cs_db, cs_l;
992
993 kvm_arch_ops->cache_regs(vcpu);
994
995 kvm_arch_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
996
997 emulate_ctxt.vcpu = vcpu;
998 emulate_ctxt.eflags = kvm_arch_ops->get_rflags(vcpu);
999 emulate_ctxt.cr2 = cr2;
1000 emulate_ctxt.mode = (emulate_ctxt.eflags & X86_EFLAGS_VM)
1001 ? X86EMUL_MODE_REAL : cs_l
1002 ? X86EMUL_MODE_PROT64 : cs_db
1003 ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
1004
1005 if (emulate_ctxt.mode == X86EMUL_MODE_PROT64) {
1006 emulate_ctxt.cs_base = 0;
1007 emulate_ctxt.ds_base = 0;
1008 emulate_ctxt.es_base = 0;
1009 emulate_ctxt.ss_base = 0;
1010 } else {
1011 emulate_ctxt.cs_base = get_segment_base(vcpu, VCPU_SREG_CS);
1012 emulate_ctxt.ds_base = get_segment_base(vcpu, VCPU_SREG_DS);
1013 emulate_ctxt.es_base = get_segment_base(vcpu, VCPU_SREG_ES);
1014 emulate_ctxt.ss_base = get_segment_base(vcpu, VCPU_SREG_SS);
1015 }
1016
1017 emulate_ctxt.gs_base = get_segment_base(vcpu, VCPU_SREG_GS);
1018 emulate_ctxt.fs_base = get_segment_base(vcpu, VCPU_SREG_FS);
1019
1020 vcpu->mmio_is_write = 0;
1021 r = x86_emulate_memop(&emulate_ctxt, &emulate_ops);
1022
1023 if ((r || vcpu->mmio_is_write) && run) {
1024 run->mmio.phys_addr = vcpu->mmio_phys_addr;
1025 memcpy(run->mmio.data, vcpu->mmio_data, 8);
1026 run->mmio.len = vcpu->mmio_size;
1027 run->mmio.is_write = vcpu->mmio_is_write;
1028 }
1029
1030 if (r) {
1031 if (!vcpu->mmio_needed) {
1032 report_emulation_failure(&emulate_ctxt);
1033 return EMULATE_FAIL;
1034 }
1035 return EMULATE_DO_MMIO;
1036 }
1037
1038 kvm_arch_ops->decache_regs(vcpu);
1039 kvm_arch_ops->set_rflags(vcpu, emulate_ctxt.eflags);
1040
1041 if (vcpu->mmio_is_write)
1042 return EMULATE_DO_MMIO;
1043
1044 return EMULATE_DONE;
1045}
1046EXPORT_SYMBOL_GPL(emulate_instruction);
1047
1048static u64 mk_cr_64(u64 curr_cr, u32 new_val)
1049{
1050 return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
1051}
1052
1053void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
1054{
1055 struct descriptor_table dt = { limit, base };
1056
1057 kvm_arch_ops->set_gdt(vcpu, &dt);
1058}
1059
1060void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
1061{
1062 struct descriptor_table dt = { limit, base };
1063
1064 kvm_arch_ops->set_idt(vcpu, &dt);
1065}
1066
1067void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
1068 unsigned long *rflags)
1069{
1070 lmsw(vcpu, msw);
1071 *rflags = kvm_arch_ops->get_rflags(vcpu);
1072}
1073
1074unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
1075{
1076 switch (cr) {
1077 case 0:
1078 return vcpu->cr0;
1079 case 2:
1080 return vcpu->cr2;
1081 case 3:
1082 return vcpu->cr3;
1083 case 4:
1084 return vcpu->cr4;
1085 default:
1086 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);
1087 return 0;
1088 }
1089}
1090
1091void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val,
1092 unsigned long *rflags)
1093{
1094 switch (cr) {
1095 case 0:
1096 set_cr0(vcpu, mk_cr_64(vcpu->cr0, val));
1097 *rflags = kvm_arch_ops->get_rflags(vcpu);
1098 break;
1099 case 2:
1100 vcpu->cr2 = val;
1101 break;
1102 case 3:
1103 set_cr3(vcpu, val);
1104 break;
1105 case 4:
1106 set_cr4(vcpu, mk_cr_64(vcpu->cr4, val));
1107 break;
1108 default:
1109 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);
1110 }
1111}
1112
1113/*
1114 * Reads an msr value (of 'msr_index') into 'pdata'.
1115 * Returns 0 on success, non-0 otherwise.
1116 * Assumes vcpu_load() was already called.
1117 */
1118static int get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
1119{
1120 return kvm_arch_ops->get_msr(vcpu, msr_index, pdata);
1121}
1122
1123#ifdef __x86_64__
1124
1125void set_efer(struct kvm_vcpu *vcpu, u64 efer)
1126{
1127 struct vmx_msr_entry *msr;
1128
1129 if (efer & EFER_RESERVED_BITS) {
1130 printk(KERN_DEBUG "set_efer: 0x%llx #GP, reserved bits\n",
1131 efer);
1132 inject_gp(vcpu);
1133 return;
1134 }
1135
1136 if (is_paging(vcpu)
1137 && (vcpu->shadow_efer & EFER_LME) != (efer & EFER_LME)) {
1138 printk(KERN_DEBUG "set_efer: #GP, change LME while paging\n");
1139 inject_gp(vcpu);
1140 return;
1141 }
1142
1143 efer &= ~EFER_LMA;
1144 efer |= vcpu->shadow_efer & EFER_LMA;
1145
1146 vcpu->shadow_efer = efer;
1147
1148 msr = find_msr_entry(vcpu, MSR_EFER);
1149
1150 if (!(efer & EFER_LMA))
1151 efer &= ~EFER_LME;
1152 msr->data = efer;
1153}
1154EXPORT_SYMBOL_GPL(set_efer);
1155
1156#endif
1157
1158/*
 1159 * Writes msr value into the appropriate "register".
1160 * Returns 0 on success, non-0 otherwise.
1161 * Assumes vcpu_load() was already called.
1162 */
1163static int set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
1164{
1165 return kvm_arch_ops->set_msr(vcpu, msr_index, data);
1166}
1167
1168void kvm_resched(struct kvm_vcpu *vcpu)
1169{
1170 vcpu_put(vcpu);
1171 cond_resched();
1172 /* Cannot fail - no vcpu unplug yet. */
1173 vcpu_load(vcpu->kvm, vcpu_slot(vcpu));
1174}
1175EXPORT_SYMBOL_GPL(kvm_resched);
1176
1177void load_msrs(struct vmx_msr_entry *e, int n)
1178{
1179 int i;
1180
1181 for (i = 0; i < n; ++i)
1182 wrmsrl(e[i].index, e[i].data);
1183}
1184EXPORT_SYMBOL_GPL(load_msrs);
1185
1186void save_msrs(struct vmx_msr_entry *e, int n)
1187{
1188 int i;
1189
1190 for (i = 0; i < n; ++i)
1191 rdmsrl(e[i].index, e[i].data);
1192}
1193EXPORT_SYMBOL_GPL(save_msrs);
1194
1195static int kvm_dev_ioctl_run(struct kvm *kvm, struct kvm_run *kvm_run)
1196{
1197 struct kvm_vcpu *vcpu;
1198 int r;
1199
1200 if (kvm_run->vcpu < 0 || kvm_run->vcpu >= KVM_MAX_VCPUS)
1201 return -EINVAL;
1202
1203 vcpu = vcpu_load(kvm, kvm_run->vcpu);
1204 if (!vcpu)
1205 return -ENOENT;
1206
1207 if (kvm_run->emulated) {
1208 kvm_arch_ops->skip_emulated_instruction(vcpu);
1209 kvm_run->emulated = 0;
1210 }
1211
1212 if (kvm_run->mmio_completed) {
1213 memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
1214 vcpu->mmio_read_completed = 1;
1215 }
1216
1217 vcpu->mmio_needed = 0;
1218
1219 r = kvm_arch_ops->run(vcpu, kvm_run);
1220
1221 vcpu_put(vcpu);
1222 return r;
1223}
1224
1225static int kvm_dev_ioctl_get_regs(struct kvm *kvm, struct kvm_regs *regs)
1226{
1227 struct kvm_vcpu *vcpu;
1228
1229 if (regs->vcpu < 0 || regs->vcpu >= KVM_MAX_VCPUS)
1230 return -EINVAL;
1231
1232 vcpu = vcpu_load(kvm, regs->vcpu);
1233 if (!vcpu)
1234 return -ENOENT;
1235
1236 kvm_arch_ops->cache_regs(vcpu);
1237
1238 regs->rax = vcpu->regs[VCPU_REGS_RAX];
1239 regs->rbx = vcpu->regs[VCPU_REGS_RBX];
1240 regs->rcx = vcpu->regs[VCPU_REGS_RCX];
1241 regs->rdx = vcpu->regs[VCPU_REGS_RDX];
1242 regs->rsi = vcpu->regs[VCPU_REGS_RSI];
1243 regs->rdi = vcpu->regs[VCPU_REGS_RDI];
1244 regs->rsp = vcpu->regs[VCPU_REGS_RSP];
1245 regs->rbp = vcpu->regs[VCPU_REGS_RBP];
1246#ifdef __x86_64__
1247 regs->r8 = vcpu->regs[VCPU_REGS_R8];
1248 regs->r9 = vcpu->regs[VCPU_REGS_R9];
1249 regs->r10 = vcpu->regs[VCPU_REGS_R10];
1250 regs->r11 = vcpu->regs[VCPU_REGS_R11];
1251 regs->r12 = vcpu->regs[VCPU_REGS_R12];
1252 regs->r13 = vcpu->regs[VCPU_REGS_R13];
1253 regs->r14 = vcpu->regs[VCPU_REGS_R14];
1254 regs->r15 = vcpu->regs[VCPU_REGS_R15];
1255#endif
1256
1257 regs->rip = vcpu->rip;
1258 regs->rflags = kvm_arch_ops->get_rflags(vcpu);
1259
1260 /*
1261 * Don't leak debug flags in case they were set for guest debugging
1262 */
1263 if (vcpu->guest_debug.enabled && vcpu->guest_debug.singlestep)
1264 regs->rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
1265
1266 vcpu_put(vcpu);
1267
1268 return 0;
1269}
1270
1271static int kvm_dev_ioctl_set_regs(struct kvm *kvm, struct kvm_regs *regs)
1272{
1273 struct kvm_vcpu *vcpu;
1274
1275 if (regs->vcpu < 0 || regs->vcpu >= KVM_MAX_VCPUS)
1276 return -EINVAL;
1277
1278 vcpu = vcpu_load(kvm, regs->vcpu);
1279 if (!vcpu)
1280 return -ENOENT;
1281
1282 vcpu->regs[VCPU_REGS_RAX] = regs->rax;
1283 vcpu->regs[VCPU_REGS_RBX] = regs->rbx;
1284 vcpu->regs[VCPU_REGS_RCX] = regs->rcx;
1285 vcpu->regs[VCPU_REGS_RDX] = regs->rdx;
1286 vcpu->regs[VCPU_REGS_RSI] = regs->rsi;
1287 vcpu->regs[VCPU_REGS_RDI] = regs->rdi;
1288 vcpu->regs[VCPU_REGS_RSP] = regs->rsp;
1289 vcpu->regs[VCPU_REGS_RBP] = regs->rbp;
1290#ifdef __x86_64__
1291 vcpu->regs[VCPU_REGS_R8] = regs->r8;
1292 vcpu->regs[VCPU_REGS_R9] = regs->r9;
1293 vcpu->regs[VCPU_REGS_R10] = regs->r10;
1294 vcpu->regs[VCPU_REGS_R11] = regs->r11;
1295 vcpu->regs[VCPU_REGS_R12] = regs->r12;
1296 vcpu->regs[VCPU_REGS_R13] = regs->r13;
1297 vcpu->regs[VCPU_REGS_R14] = regs->r14;
1298 vcpu->regs[VCPU_REGS_R15] = regs->r15;
1299#endif
1300
1301 vcpu->rip = regs->rip;
1302 kvm_arch_ops->set_rflags(vcpu, regs->rflags);
1303
1304 kvm_arch_ops->decache_regs(vcpu);
1305
1306 vcpu_put(vcpu);
1307
1308 return 0;
1309}
1310
1311static void get_segment(struct kvm_vcpu *vcpu,
1312 struct kvm_segment *var, int seg)
1313{
1314 return kvm_arch_ops->get_segment(vcpu, var, seg);
1315}
1316
1317static int kvm_dev_ioctl_get_sregs(struct kvm *kvm, struct kvm_sregs *sregs)
1318{
1319 struct kvm_vcpu *vcpu;
1320 struct descriptor_table dt;
1321
1322 if (sregs->vcpu < 0 || sregs->vcpu >= KVM_MAX_VCPUS)
1323 return -EINVAL;
1324 vcpu = vcpu_load(kvm, sregs->vcpu);
1325 if (!vcpu)
1326 return -ENOENT;
1327
1328 get_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
1329 get_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
1330 get_segment(vcpu, &sregs->es, VCPU_SREG_ES);
1331 get_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
1332 get_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
1333 get_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
1334
1335 get_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
1336 get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
1337
1338 kvm_arch_ops->get_idt(vcpu, &dt);
1339 sregs->idt.limit = dt.limit;
1340 sregs->idt.base = dt.base;
1341 kvm_arch_ops->get_gdt(vcpu, &dt);
1342 sregs->gdt.limit = dt.limit;
1343 sregs->gdt.base = dt.base;
1344
1345 sregs->cr0 = vcpu->cr0;
1346 sregs->cr2 = vcpu->cr2;
1347 sregs->cr3 = vcpu->cr3;
1348 sregs->cr4 = vcpu->cr4;
1349 sregs->cr8 = vcpu->cr8;
1350 sregs->efer = vcpu->shadow_efer;
1351 sregs->apic_base = vcpu->apic_base;
1352
1353 memcpy(sregs->interrupt_bitmap, vcpu->irq_pending,
1354 sizeof sregs->interrupt_bitmap);
1355
1356 vcpu_put(vcpu);
1357
1358 return 0;
1359}
1360
1361static void set_segment(struct kvm_vcpu *vcpu,
1362 struct kvm_segment *var, int seg)
1363{
1364 return kvm_arch_ops->set_segment(vcpu, var, seg);
1365}
1366
1367static int kvm_dev_ioctl_set_sregs(struct kvm *kvm, struct kvm_sregs *sregs)
1368{
1369 struct kvm_vcpu *vcpu;
1370 int mmu_reset_needed = 0;
1371 int i;
1372 struct descriptor_table dt;
1373
1374 if (sregs->vcpu < 0 || sregs->vcpu >= KVM_MAX_VCPUS)
1375 return -EINVAL;
1376 vcpu = vcpu_load(kvm, sregs->vcpu);
1377 if (!vcpu)
1378 return -ENOENT;
1379
1380 set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
1381 set_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
1382 set_segment(vcpu, &sregs->es, VCPU_SREG_ES);
1383 set_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
1384 set_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
1385 set_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
1386
1387 set_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
1388 set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
1389
1390 dt.limit = sregs->idt.limit;
1391 dt.base = sregs->idt.base;
1392 kvm_arch_ops->set_idt(vcpu, &dt);
1393 dt.limit = sregs->gdt.limit;
1394 dt.base = sregs->gdt.base;
1395 kvm_arch_ops->set_gdt(vcpu, &dt);
1396
1397 vcpu->cr2 = sregs->cr2;
1398 mmu_reset_needed |= vcpu->cr3 != sregs->cr3;
1399 vcpu->cr3 = sregs->cr3;
1400
1401 vcpu->cr8 = sregs->cr8;
1402
1403 mmu_reset_needed |= vcpu->shadow_efer != sregs->efer;
1404#ifdef __x86_64__
1405 kvm_arch_ops->set_efer(vcpu, sregs->efer);
1406#endif
1407 vcpu->apic_base = sregs->apic_base;
1408
1409 mmu_reset_needed |= vcpu->cr0 != sregs->cr0;
1410 kvm_arch_ops->set_cr0_no_modeswitch(vcpu, sregs->cr0);
1411
1412 mmu_reset_needed |= vcpu->cr4 != sregs->cr4;
1413 kvm_arch_ops->set_cr4(vcpu, sregs->cr4);
1414
1415 if (mmu_reset_needed)
1416 kvm_mmu_reset_context(vcpu);
1417
1418 memcpy(vcpu->irq_pending, sregs->interrupt_bitmap,
1419 sizeof vcpu->irq_pending);
1420 vcpu->irq_summary = 0;
1421 for (i = 0; i < NR_IRQ_WORDS; ++i)
1422 if (vcpu->irq_pending[i])
1423 __set_bit(i, &vcpu->irq_summary);
1424
1425 vcpu_put(vcpu);
1426
1427 return 0;
1428}
1429
1430/*
1431 * List of msr numbers which we expose to userspace through KVM_GET_MSRS,
1432 * KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
1433 */
1434static u32 msrs_to_save[] = {
1435 MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
1436 MSR_K6_STAR,
1437#ifdef __x86_64__
1438 MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
1439#endif
1440 MSR_IA32_TIME_STAMP_COUNTER,
1441};
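
As a hedged illustration of how userspace might retrieve this list through KVM_GET_MSR_INDEX_LIST (handled further down in kvm_dev_ioctl()): the caller passes its buffer capacity in nmsrs, the kernel writes the real count back, and the call fails with E2BIG when the buffer is too small. The struct kvm_msr_list layout (nmsrs followed by a trailing indices[] array) and the __u32 types are assumptions taken from this series' userspace header, which is not part of this hunk.

/* Illustrative userspace sketch, not part of the patch.
 * Needs <stdlib.h>, <sys/ioctl.h> and this series' <linux/kvm.h>. */
static struct kvm_msr_list *fetch_msr_index_list(int kvm_fd)
{
	struct kvm_msr_list probe = { .nmsrs = 0 };
	struct kvm_msr_list *list;

	/* Deliberately too small: fails with E2BIG, but nmsrs is written back. */
	ioctl(kvm_fd, KVM_GET_MSR_INDEX_LIST, &probe);

	list = malloc(sizeof(*list) + probe.nmsrs * sizeof(__u32));
	if (!list)
		return NULL;
	list->nmsrs = probe.nmsrs;
	if (ioctl(kvm_fd, KVM_GET_MSR_INDEX_LIST, list) < 0) {
		free(list);
		return NULL;
	}
	return list;	/* list->indices[0 .. list->nmsrs - 1] hold the saved MSRs */
}
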
1442
1443
1444/*
1445 * Adapt set_msr() to msr_io()'s calling convention
1446 */
1447static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
1448{
1449 return set_msr(vcpu, index, *data);
1450}
1451
1452/*
1453 * Read or write a bunch of msrs. All parameters are kernel addresses.
1454 *
1455 * @return number of msrs set successfully.
1456 */
1457static int __msr_io(struct kvm *kvm, struct kvm_msrs *msrs,
1458 struct kvm_msr_entry *entries,
1459 int (*do_msr)(struct kvm_vcpu *vcpu,
1460 unsigned index, u64 *data))
1461{
1462 struct kvm_vcpu *vcpu;
1463 int i;
1464
1465 if (msrs->vcpu < 0 || msrs->vcpu >= KVM_MAX_VCPUS)
1466 return -EINVAL;
1467
1468 vcpu = vcpu_load(kvm, msrs->vcpu);
1469 if (!vcpu)
1470 return -ENOENT;
1471
1472 for (i = 0; i < msrs->nmsrs; ++i)
1473 if (do_msr(vcpu, entries[i].index, &entries[i].data))
1474 break;
1475
1476 vcpu_put(vcpu);
1477
1478 return i;
1479}
1480
1481/*
1482 * Read or write a bunch of msrs. Parameters are user addresses.
1483 *
1484 * @return number of msrs set successfully.
1485 */
1486static int msr_io(struct kvm *kvm, struct kvm_msrs __user *user_msrs,
1487 int (*do_msr)(struct kvm_vcpu *vcpu,
1488 unsigned index, u64 *data),
1489 int writeback)
1490{
1491 struct kvm_msrs msrs;
1492 struct kvm_msr_entry *entries;
1493 int r, n;
1494 unsigned size;
1495
1496 r = -EFAULT;
1497 if (copy_from_user(&msrs, user_msrs, sizeof msrs))
1498 goto out;
1499
1500 r = -E2BIG;
1501 if (msrs.nmsrs >= MAX_IO_MSRS)
1502 goto out;
1503
1504 r = -ENOMEM;
1505 size = sizeof(struct kvm_msr_entry) * msrs.nmsrs;
1506 entries = vmalloc(size);
1507 if (!entries)
1508 goto out;
1509
1510 r = -EFAULT;
1511 if (copy_from_user(entries, user_msrs->entries, size))
1512 goto out_free;
1513
1514 r = n = __msr_io(kvm, &msrs, entries, do_msr);
1515 if (r < 0)
1516 goto out_free;
1517
1518 r = -EFAULT;
1519 if (writeback && copy_to_user(user_msrs->entries, entries, size))
1520 goto out_free;
1521
1522 r = n;
1523
1524out_free:
1525 vfree(entries);
1526out:
1527 return r;
1528}
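
To make the calling convention concrete, here is a minimal userspace sketch that reads one MSR through KVM_GET_MSRS. The wrapper-struct trick for the trailing entries[] array, and the exact layout of struct kvm_msrs and struct kvm_msr_entry, are assumptions based on this series' userspace header; only the field names vcpu, nmsrs, entries, index and data are visible in the code above.

/* Illustrative, not part of the patch.  Needs <sys/ioctl.h> and <linux/kvm.h>. */
static int read_one_msr(int kvm_fd, int vcpu, __u32 index, __u64 *value)
{
	struct {
		struct kvm_msrs header;		/* vcpu, nmsrs, entries[] */
		struct kvm_msr_entry entry;	/* occupies entries[0] */
	} req = {
		.header = { .vcpu = vcpu, .nmsrs = 1 },
		.entry  = { .index = index },
	};

	/* msr_io() returns the number of MSRs actually processed. */
	if (ioctl(kvm_fd, KVM_GET_MSRS, &req) != 1)
		return -1;

	*value = req.entry.data;
	return 0;
}
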
1529
1530/*
1531 * Translate a guest virtual address to a guest physical address.
1532 */
1533static int kvm_dev_ioctl_translate(struct kvm *kvm, struct kvm_translation *tr)
1534{
1535 unsigned long vaddr = tr->linear_address;
1536 struct kvm_vcpu *vcpu;
1537 gpa_t gpa;
1538
1539 vcpu = vcpu_load(kvm, tr->vcpu);
1540 if (!vcpu)
1541 return -ENOENT;
1542 spin_lock(&kvm->lock);
1543 gpa = vcpu->mmu.gva_to_gpa(vcpu, vaddr);
1544 tr->physical_address = gpa;
1545 tr->valid = gpa != UNMAPPED_GVA;
1546 tr->writeable = 1;
1547 tr->usermode = 0;
1548 spin_unlock(&kvm->lock);
1549 vcpu_put(vcpu);
1550
1551 return 0;
1552}
1553
1554static int kvm_dev_ioctl_interrupt(struct kvm *kvm, struct kvm_interrupt *irq)
1555{
1556 struct kvm_vcpu *vcpu;
1557
1558 if (irq->vcpu < 0 || irq->vcpu >= KVM_MAX_VCPUS)
1559 return -EINVAL;
1560 if (irq->irq < 0 || irq->irq >= 256)
1561 return -EINVAL;
1562 vcpu = vcpu_load(kvm, irq->vcpu);
1563 if (!vcpu)
1564 return -ENOENT;
1565
1566 set_bit(irq->irq, vcpu->irq_pending);
1567 set_bit(irq->irq / BITS_PER_LONG, &vcpu->irq_summary);
1568
1569 vcpu_put(vcpu);
1570
1571 return 0;
1572}
1573
1574static int kvm_dev_ioctl_debug_guest(struct kvm *kvm,
1575 struct kvm_debug_guest *dbg)
1576{
1577 struct kvm_vcpu *vcpu;
1578 int r;
1579
1580 if (dbg->vcpu < 0 || dbg->vcpu >= KVM_MAX_VCPUS)
1581 return -EINVAL;
1582 vcpu = vcpu_load(kvm, dbg->vcpu);
1583 if (!vcpu)
1584 return -ENOENT;
1585
1586 r = kvm_arch_ops->set_guest_debug(vcpu, dbg);
1587
1588 vcpu_put(vcpu);
1589
1590 return r;
1591}
1592
1593static long kvm_dev_ioctl(struct file *filp,
1594 unsigned int ioctl, unsigned long arg)
1595{
1596 struct kvm *kvm = filp->private_data;
1597 int r = -EINVAL;
1598
1599 switch (ioctl) {
1600 case KVM_CREATE_VCPU: {
1601 r = kvm_dev_ioctl_create_vcpu(kvm, arg);
1602 if (r)
1603 goto out;
1604 break;
1605 }
1606 case KVM_RUN: {
1607 struct kvm_run kvm_run;
1608
1609 r = -EFAULT;
1610 if (copy_from_user(&kvm_run, (void *)arg, sizeof kvm_run))
1611 goto out;
1612 r = kvm_dev_ioctl_run(kvm, &kvm_run);
1613 if (r < 0)
1614 goto out;
1615 r = -EFAULT;
1616 if (copy_to_user((void *)arg, &kvm_run, sizeof kvm_run))
1617 goto out;
1618 r = 0;
1619 break;
1620 }
1621 case KVM_GET_REGS: {
1622 struct kvm_regs kvm_regs;
1623
1624 r = -EFAULT;
1625 if (copy_from_user(&kvm_regs, (void *)arg, sizeof kvm_regs))
1626 goto out;
1627 r = kvm_dev_ioctl_get_regs(kvm, &kvm_regs);
1628 if (r)
1629 goto out;
1630 r = -EFAULT;
1631 if (copy_to_user((void *)arg, &kvm_regs, sizeof kvm_regs))
1632 goto out;
1633 r = 0;
1634 break;
1635 }
1636 case KVM_SET_REGS: {
1637 struct kvm_regs kvm_regs;
1638
1639 r = -EFAULT;
1640 if (copy_from_user(&kvm_regs, (void *)arg, sizeof kvm_regs))
1641 goto out;
1642 r = kvm_dev_ioctl_set_regs(kvm, &kvm_regs);
1643 if (r)
1644 goto out;
1645 r = 0;
1646 break;
1647 }
1648 case KVM_GET_SREGS: {
1649 struct kvm_sregs kvm_sregs;
1650
1651 r = -EFAULT;
1652 if (copy_from_user(&kvm_sregs, (void *)arg, sizeof kvm_sregs))
1653 goto out;
1654 r = kvm_dev_ioctl_get_sregs(kvm, &kvm_sregs);
1655 if (r)
1656 goto out;
1657 r = -EFAULT;
1658 if (copy_to_user((void *)arg, &kvm_sregs, sizeof kvm_sregs))
1659 goto out;
1660 r = 0;
1661 break;
1662 }
1663 case KVM_SET_SREGS: {
1664 struct kvm_sregs kvm_sregs;
1665
1666 r = -EFAULT;
1667 if (copy_from_user(&kvm_sregs, (void *)arg, sizeof kvm_sregs))
1668 goto out;
1669 r = kvm_dev_ioctl_set_sregs(kvm, &kvm_sregs);
1670 if (r)
1671 goto out;
1672 r = 0;
1673 break;
1674 }
1675 case KVM_TRANSLATE: {
1676 struct kvm_translation tr;
1677
1678 r = -EFAULT;
1679 if (copy_from_user(&tr, (void *)arg, sizeof tr))
1680 goto out;
1681 r = kvm_dev_ioctl_translate(kvm, &tr);
1682 if (r)
1683 goto out;
1684 r = -EFAULT;
1685 if (copy_to_user((void *)arg, &tr, sizeof tr))
1686 goto out;
1687 r = 0;
1688 break;
1689 }
1690 case KVM_INTERRUPT: {
1691 struct kvm_interrupt irq;
1692
1693 r = -EFAULT;
1694 if (copy_from_user(&irq, (void *)arg, sizeof irq))
1695 goto out;
1696 r = kvm_dev_ioctl_interrupt(kvm, &irq);
1697 if (r)
1698 goto out;
1699 r = 0;
1700 break;
1701 }
1702 case KVM_DEBUG_GUEST: {
1703 struct kvm_debug_guest dbg;
1704
1705 r = -EFAULT;
1706 if (copy_from_user(&dbg, (void *)arg, sizeof dbg))
1707 goto out;
1708 r = kvm_dev_ioctl_debug_guest(kvm, &dbg);
1709 if (r)
1710 goto out;
1711 r = 0;
1712 break;
1713 }
1714 case KVM_SET_MEMORY_REGION: {
1715 struct kvm_memory_region kvm_mem;
1716
1717 r = -EFAULT;
1718 if (copy_from_user(&kvm_mem, (void *)arg, sizeof kvm_mem))
1719 goto out;
1720 r = kvm_dev_ioctl_set_memory_region(kvm, &kvm_mem);
1721 if (r)
1722 goto out;
1723 break;
1724 }
1725 case KVM_GET_DIRTY_LOG: {
1726 struct kvm_dirty_log log;
1727
1728 r = -EFAULT;
1729 if (copy_from_user(&log, (void *)arg, sizeof log))
1730 goto out;
1731 r = kvm_dev_ioctl_get_dirty_log(kvm, &log);
1732 if (r)
1733 goto out;
1734 break;
1735 }
1736 case KVM_GET_MSRS:
1737 r = msr_io(kvm, (void __user *)arg, get_msr, 1);
1738 break;
1739 case KVM_SET_MSRS:
1740 r = msr_io(kvm, (void __user *)arg, do_set_msr, 0);
1741 break;
1742 case KVM_GET_MSR_INDEX_LIST: {
1743 struct kvm_msr_list __user *user_msr_list = (void __user *)arg;
1744 struct kvm_msr_list msr_list;
1745 unsigned n;
1746
1747 r = -EFAULT;
1748 if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list))
1749 goto out;
1750 n = msr_list.nmsrs;
1751 msr_list.nmsrs = ARRAY_SIZE(msrs_to_save);
1752 if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list))
1753 goto out;
1754 r = -E2BIG;
1755 if (n < ARRAY_SIZE(msrs_to_save))
1756 goto out;
1757 r = -EFAULT;
1758 if (copy_to_user(user_msr_list->indices, &msrs_to_save,
1759 sizeof msrs_to_save))
1760 goto out;
1761 r = 0;
1762 }
1763 default:
1764 ;
1765 }
1766out:
1767 return r;
1768}
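
Stepping back from the dispatcher: in this first version of the interface every operation goes through the single /dev/kvm file descriptor, and per-vcpu requests carry the vcpu index inside the argument structure (or, for KVM_CREATE_VCPU, as the raw ioctl argument). A minimal sketch of that flow, assuming the ioctl numbers and struct kvm_regs layout from this series' userspace header:

/* Illustrative only -- not part of the patch. */
#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

int main(void)
{
	struct kvm_regs regs = { .vcpu = 0 };
	int fd = open("/dev/kvm", O_RDWR);

	if (fd < 0)
		return 1;

	/* The vcpu number is the plain ioctl argument here. */
	if (ioctl(fd, KVM_CREATE_VCPU, 0) < 0)
		return 1;

	/* Register accessors name their vcpu inside the structure itself. */
	if (ioctl(fd, KVM_GET_REGS, &regs) < 0)
		return 1;

	printf("vcpu0: rip=0x%llx rflags=0x%llx\n",
	       (unsigned long long)regs.rip,
	       (unsigned long long)regs.rflags);
	return 0;
}
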
1769
1770static struct page *kvm_dev_nopage(struct vm_area_struct *vma,
1771 unsigned long address,
1772 int *type)
1773{
1774 struct kvm *kvm = vma->vm_file->private_data;
1775 unsigned long pgoff;
1776 struct kvm_memory_slot *slot;
1777 struct page *page;
1778
1779 *type = VM_FAULT_MINOR;
1780 pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
1781 slot = gfn_to_memslot(kvm, pgoff);
1782 if (!slot)
1783 return NOPAGE_SIGBUS;
1784 page = gfn_to_page(slot, pgoff);
1785 if (!page)
1786 return NOPAGE_SIGBUS;
1787 get_page(page);
1788 return page;
1789}
1790
1791static struct vm_operations_struct kvm_dev_vm_ops = {
1792 .nopage = kvm_dev_nopage,
1793};
1794
1795static int kvm_dev_mmap(struct file *file, struct vm_area_struct *vma)
1796{
1797 vma->vm_ops = &kvm_dev_vm_ops;
1798 return 0;
1799}
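
Because the nopage handler above maps a fault at file page offset pgoff to the guest page frame with the same number, an mmap() of /dev/kvm at byte offset X exposes guest physical address X to userspace, provided a memory slot covers it. A minimal sketch; it assumes KVM_SET_MEMORY_REGION has already populated the slot and that gpa is page aligned:

/* Illustrative, not part of the patch.  Needs <sys/mman.h>. */
static void *map_guest_phys(int kvm_fd, off_t gpa, size_t len)
{
	/* File offset == guest physical address, per kvm_dev_nopage(). */
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED,
		       kvm_fd, gpa);

	return p == MAP_FAILED ? NULL : p;
}
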
1800
1801static struct file_operations kvm_chardev_ops = {
1802 .open = kvm_dev_open,
1803 .release = kvm_dev_release,
1804 .unlocked_ioctl = kvm_dev_ioctl,
1805 .compat_ioctl = kvm_dev_ioctl,
1806 .mmap = kvm_dev_mmap,
1807};
1808
1809static struct miscdevice kvm_dev = {
1810 MISC_DYNAMIC_MINOR,
1811 "kvm",
1812 &kvm_chardev_ops,
1813};
1814
1815static int kvm_reboot(struct notifier_block *notifier, unsigned long val,
1816 void *v)
1817{
1818 if (val == SYS_RESTART) {
1819 /*
1820 * Some (well, at least mine) BIOSes hang on reboot if
1821 * in vmx root mode.
1822 */
1823 printk(KERN_INFO "kvm: exiting hardware virtualization\n");
1824 on_each_cpu(kvm_arch_ops->hardware_disable, 0, 0, 1);
1825 }
1826 return NOTIFY_OK;
1827}
1828
1829static struct notifier_block kvm_reboot_notifier = {
1830 .notifier_call = kvm_reboot,
1831 .priority = 0,
1832};
1833
1834static __init void kvm_init_debug(void)
1835{
1836 struct kvm_stats_debugfs_item *p;
1837
1838 debugfs_dir = debugfs_create_dir("kvm", 0);
1839 for (p = debugfs_entries; p->name; ++p)
1840 p->dentry = debugfs_create_u32(p->name, 0444, debugfs_dir,
1841 p->data);
1842}
1843
1844static void kvm_exit_debug(void)
1845{
1846 struct kvm_stats_debugfs_item *p;
1847
1848 for (p = debugfs_entries; p->name; ++p)
1849 debugfs_remove(p->dentry);
1850 debugfs_remove(debugfs_dir);
1851}
1852
1853hpa_t bad_page_address;
1854
1855int kvm_init_arch(struct kvm_arch_ops *ops, struct module *module)
1856{
1857 int r;
1858
1859 kvm_arch_ops = ops;
1860
1861 if (!kvm_arch_ops->cpu_has_kvm_support()) {
1862 printk(KERN_ERR "kvm: no hardware support\n");
1863 return -EOPNOTSUPP;
1864 }
1865 if (kvm_arch_ops->disabled_by_bios()) {
1866 printk(KERN_ERR "kvm: disabled by bios\n");
1867 return -EOPNOTSUPP;
1868 }
1869
1870 r = kvm_arch_ops->hardware_setup();
1871 if (r < 0)
1872 return r;
1873
1874 on_each_cpu(kvm_arch_ops->hardware_enable, 0, 0, 1);
1875 register_reboot_notifier(&kvm_reboot_notifier);
1876
1877 kvm_chardev_ops.owner = module;
1878
1879 r = misc_register(&kvm_dev);
1880 if (r) {
1881 printk (KERN_ERR "kvm: misc device register failed\n");
1882 goto out_free;
1883 }
1884
1885 return r;
1886
1887out_free:
1888 unregister_reboot_notifier(&kvm_reboot_notifier);
1889 on_each_cpu(kvm_arch_ops->hardware_disable, 0, 0, 1);
1890 kvm_arch_ops->hardware_unsetup();
1891 return r;
1892}
1893
1894void kvm_exit_arch(void)
1895{
1896 misc_deregister(&kvm_dev);
1897
1898 unregister_reboot_notifier(&kvm_reboot_notifier);
1899 on_each_cpu(kvm_arch_ops->hardware_disable, 0, 0, 1);
1900 kvm_arch_ops->hardware_unsetup();
1901}
1902
1903static __init int kvm_init(void)
1904{
1905 static struct page *bad_page;
1906 int r = 0;
1907
1908 kvm_init_debug();
1909
1910 if ((bad_page = alloc_page(GFP_KERNEL)) == NULL) {
1911 r = -ENOMEM;
1912 goto out;
1913 }
1914
1915 bad_page_address = page_to_pfn(bad_page) << PAGE_SHIFT;
1916 memset(__va(bad_page_address), 0, PAGE_SIZE);
1917
1918 return r;
1919
1920out:
1921 kvm_exit_debug();
1922 return r;
1923}
1924
1925static __exit void kvm_exit(void)
1926{
1927 kvm_exit_debug();
1928 __free_page(pfn_to_page(bad_page_address >> PAGE_SHIFT));
1929}
1930
1931module_init(kvm_init)
1932module_exit(kvm_exit)
1933
1934EXPORT_SYMBOL_GPL(kvm_init_arch);
1935EXPORT_SYMBOL_GPL(kvm_exit_arch);
diff --git a/drivers/kvm/kvm_svm.h b/drivers/kvm/kvm_svm.h
new file mode 100644
index 000000000000..7d7f2aa10960
--- /dev/null
+++ b/drivers/kvm/kvm_svm.h
@@ -0,0 +1,44 @@
1#ifndef __KVM_SVM_H
2#define __KVM_SVM_H
3
4#include <linux/types.h>
5#include <linux/list.h>
6#include <asm/msr.h>
7
8#include "svm.h"
9#include "kvm.h"
10
11static const u32 host_save_msrs[] = {
12#ifdef __x86_64__
13 MSR_STAR, MSR_LSTAR, MSR_CSTAR, MSR_SYSCALL_MASK, MSR_KERNEL_GS_BASE,
14 MSR_FS_BASE, MSR_GS_BASE,
15#endif
16 MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
17 MSR_IA32_DEBUGCTLMSR, /*MSR_IA32_LASTBRANCHFROMIP,
18 MSR_IA32_LASTBRANCHTOIP, MSR_IA32_LASTINTFROMIP,MSR_IA32_LASTINTTOIP,*/
19};
20
21#define NR_HOST_SAVE_MSRS (sizeof(host_save_msrs) / sizeof(*host_save_msrs))
22#define NUM_DB_REGS 4
23
24struct vcpu_svm {
25 struct vmcb *vmcb;
26 unsigned long vmcb_pa;
27 struct svm_cpu_data *svm_data;
28 uint64_t asid_generation;
29
30 unsigned long cr0;
31 unsigned long cr4;
32 unsigned long db_regs[NUM_DB_REGS];
33
34 u64 next_rip;
35
36 u64 host_msrs[NR_HOST_SAVE_MSRS];
37 unsigned long host_cr2;
38 unsigned long host_db_regs[NUM_DB_REGS];
39 unsigned long host_dr6;
40 unsigned long host_dr7;
41};
42
43#endif
44
diff --git a/drivers/kvm/kvm_vmx.h b/drivers/kvm/kvm_vmx.h
new file mode 100644
index 000000000000..87e12d2bfa16
--- /dev/null
+++ b/drivers/kvm/kvm_vmx.h
@@ -0,0 +1,14 @@
1#ifndef __KVM_VMX_H
2#define __KVM_VMX_H
3
4#ifdef __x86_64__
5/*
6 * Avoid saving/loading MSR_SYSCALL_MASK and MSR_LSTAR via the standard VT
7 * mechanism (cpu bug AA24).
8 */
9#define NR_BAD_MSRS 2
10#else
11#define NR_BAD_MSRS 0
12#endif
13
14#endif
diff --git a/drivers/kvm/mmu.c b/drivers/kvm/mmu.c
new file mode 100644
index 000000000000..4e29d9b7211c
--- /dev/null
+++ b/drivers/kvm/mmu.c
@@ -0,0 +1,699 @@
1/*
2 * Kernel-based Virtual Machine driver for Linux
3 *
4 * This module enables machines with Intel VT-x extensions to run virtual
5 * machines without emulation or binary translation.
6 *
7 * MMU support
8 *
9 * Copyright (C) 2006 Qumranet, Inc.
10 *
11 * Authors:
12 * Yaniv Kamay <yaniv@qumranet.com>
13 * Avi Kivity <avi@qumranet.com>
14 *
15 * This work is licensed under the terms of the GNU GPL, version 2. See
16 * the COPYING file in the top-level directory.
17 *
18 */
19#include <linux/types.h>
20#include <linux/string.h>
21#include <asm/page.h>
22#include <linux/mm.h>
23#include <linux/highmem.h>
24#include <linux/module.h>
25
26#include "vmx.h"
27#include "kvm.h"
28
29#define pgprintk(x...) do { } while (0)
30
31#define ASSERT(x) \
32 if (!(x)) { \
33 printk(KERN_WARNING "assertion failed %s:%d: %s\n", \
34 __FILE__, __LINE__, #x); \
35 }
36
37#define PT64_ENT_PER_PAGE 512
38#define PT32_ENT_PER_PAGE 1024
39
40#define PT_WRITABLE_SHIFT 1
41
42#define PT_PRESENT_MASK (1ULL << 0)
43#define PT_WRITABLE_MASK (1ULL << PT_WRITABLE_SHIFT)
44#define PT_USER_MASK (1ULL << 2)
45#define PT_PWT_MASK (1ULL << 3)
46#define PT_PCD_MASK (1ULL << 4)
47#define PT_ACCESSED_MASK (1ULL << 5)
48#define PT_DIRTY_MASK (1ULL << 6)
49#define PT_PAGE_SIZE_MASK (1ULL << 7)
50#define PT_PAT_MASK (1ULL << 7)
51#define PT_GLOBAL_MASK (1ULL << 8)
52#define PT64_NX_MASK (1ULL << 63)
53
54#define PT_PAT_SHIFT 7
55#define PT_DIR_PAT_SHIFT 12
56#define PT_DIR_PAT_MASK (1ULL << PT_DIR_PAT_SHIFT)
57
58#define PT32_DIR_PSE36_SIZE 4
59#define PT32_DIR_PSE36_SHIFT 13
60#define PT32_DIR_PSE36_MASK (((1ULL << PT32_DIR_PSE36_SIZE) - 1) << PT32_DIR_PSE36_SHIFT)
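
A brief aside on the three constants just defined (an editorial note; the values were checked by hand): under PSE-36, bits 13..16 of a 32-bit PDE that maps a 4MB page carry physical address bits 32..35, which is what the shift and mask below encode.

/*
 * Editor's illustration, not part of the patch:
 *   PT32_DIR_PSE36_MASK == ((1ULL << 4) - 1) << 13 == 0x0001e000
 *   (pde & PT32_DIR_PSE36_MASK) << (32 - PT32_DIR_PSE36_SHIFT)
 * shifts the field left by 19, landing it on gpa bits 32..35, which is
 * exactly how paging_tmpl.h reassembles the address of a PSE-36 page.
 */
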
61
62
63#define PT32_PTE_COPY_MASK \
64 (PT_PRESENT_MASK | PT_PWT_MASK | PT_PCD_MASK | \
65 PT_ACCESSED_MASK | PT_DIRTY_MASK | PT_PAT_MASK | \
66 PT_GLOBAL_MASK )
67
68#define PT32_NON_PTE_COPY_MASK \
69 (PT_PRESENT_MASK | PT_PWT_MASK | PT_PCD_MASK | \
70 PT_ACCESSED_MASK | PT_DIRTY_MASK)
71
72
73#define PT64_PTE_COPY_MASK \
74 (PT64_NX_MASK | PT32_PTE_COPY_MASK)
75
76#define PT64_NON_PTE_COPY_MASK \
77 (PT64_NX_MASK | PT32_NON_PTE_COPY_MASK)
78
79
80
81#define PT_FIRST_AVAIL_BITS_SHIFT 9
82#define PT64_SECOND_AVAIL_BITS_SHIFT 52
83
84#define PT_SHADOW_PS_MARK (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
85#define PT_SHADOW_IO_MARK (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
86
87#define PT_SHADOW_WRITABLE_SHIFT (PT_FIRST_AVAIL_BITS_SHIFT + 1)
88#define PT_SHADOW_WRITABLE_MASK (1ULL << PT_SHADOW_WRITABLE_SHIFT)
89
90#define PT_SHADOW_USER_SHIFT (PT_SHADOW_WRITABLE_SHIFT + 1)
91#define PT_SHADOW_USER_MASK (1ULL << (PT_SHADOW_USER_SHIFT))
92
93#define PT_SHADOW_BITS_OFFSET (PT_SHADOW_WRITABLE_SHIFT - PT_WRITABLE_SHIFT)
94
95#define VALID_PAGE(x) ((x) != INVALID_PAGE)
96
97#define PT64_LEVEL_BITS 9
98
99#define PT64_LEVEL_SHIFT(level) \
100 ( PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS )
101
102#define PT64_LEVEL_MASK(level) \
103 (((1ULL << PT64_LEVEL_BITS) - 1) << PT64_LEVEL_SHIFT(level))
104
105#define PT64_INDEX(address, level)\
106 (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1))
107
108
109#define PT32_LEVEL_BITS 10
110
111#define PT32_LEVEL_SHIFT(level) \
112 ( PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS )
113
114#define PT32_LEVEL_MASK(level) \
115 (((1ULL << PT32_LEVEL_BITS) - 1) << PT32_LEVEL_SHIFT(level))
116
117#define PT32_INDEX(address, level)\
118 (((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1))
119
120
121#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & PAGE_MASK)
122#define PT64_DIR_BASE_ADDR_MASK \
123 (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + PT64_LEVEL_BITS)) - 1))
124
125#define PT32_BASE_ADDR_MASK PAGE_MASK
126#define PT32_DIR_BASE_ADDR_MASK \
127 (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1))
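
A short worked example of the level/index macros above (an editorial illustration; the numbers were computed by hand for one arbitrary address):

/*
 * With PAGE_SHIFT == 12 and 9 bits per 64-bit level, PT64_LEVEL_SHIFT()
 * yields 12, 21, 30 and 39, so PT64_INDEX() slices an address into four
 * 9-bit table indices.  For addr == 0x00007f1234567000:
 *
 *   PT64_INDEX(addr, 4) == (addr >> 39) & 511 == 254
 *   PT64_INDEX(addr, 3) == (addr >> 30) & 511 ==  72
 *   PT64_INDEX(addr, 2) == (addr >> 21) & 511 == 418
 *   PT64_INDEX(addr, 1) == (addr >> 12) & 511 == 359
 *
 * The PT32 variants behave the same way with 10-bit indices and shifts
 * of 12 and 22.
 */
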
128
129
130#define PFERR_PRESENT_MASK (1U << 0)
131#define PFERR_WRITE_MASK (1U << 1)
132#define PFERR_USER_MASK (1U << 2)
133
134#define PT64_ROOT_LEVEL 4
135#define PT32_ROOT_LEVEL 2
136#define PT32E_ROOT_LEVEL 3
137
138#define PT_DIRECTORY_LEVEL 2
139#define PT_PAGE_TABLE_LEVEL 1
140
141static int is_write_protection(struct kvm_vcpu *vcpu)
142{
143 return vcpu->cr0 & CR0_WP_MASK;
144}
145
146static int is_cpuid_PSE36(void)
147{
148 return 1;
149}
150
151static int is_present_pte(unsigned long pte)
152{
153 return pte & PT_PRESENT_MASK;
154}
155
156static int is_writeble_pte(unsigned long pte)
157{
158 return pte & PT_WRITABLE_MASK;
159}
160
161static int is_io_pte(unsigned long pte)
162{
163 return pte & PT_SHADOW_IO_MARK;
164}
165
166static void kvm_mmu_free_page(struct kvm_vcpu *vcpu, hpa_t page_hpa)
167{
168 struct kvm_mmu_page *page_head = page_header(page_hpa);
169
170 list_del(&page_head->link);
171 page_head->page_hpa = page_hpa;
172 list_add(&page_head->link, &vcpu->free_pages);
173}
174
175static int is_empty_shadow_page(hpa_t page_hpa)
176{
177 u32 *pos;
178 u32 *end;
179 for (pos = __va(page_hpa), end = pos + PAGE_SIZE / sizeof(u32);
180 pos != end; pos++)
181 if (*pos != 0)
182 return 0;
183 return 1;
184}
185
186static hpa_t kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, u64 *parent_pte)
187{
188 struct kvm_mmu_page *page;
189
190 if (list_empty(&vcpu->free_pages))
191 return INVALID_PAGE;
192
193 page = list_entry(vcpu->free_pages.next, struct kvm_mmu_page, link);
194 list_del(&page->link);
195 list_add(&page->link, &vcpu->kvm->active_mmu_pages);
196 ASSERT(is_empty_shadow_page(page->page_hpa));
197 page->slot_bitmap = 0;
198 page->global = 1;
199 page->parent_pte = parent_pte;
200 return page->page_hpa;
201}
202
203static void page_header_update_slot(struct kvm *kvm, void *pte, gpa_t gpa)
204{
205 int slot = memslot_id(kvm, gfn_to_memslot(kvm, gpa >> PAGE_SHIFT));
206 struct kvm_mmu_page *page_head = page_header(__pa(pte));
207
208 __set_bit(slot, &page_head->slot_bitmap);
209}
210
211hpa_t safe_gpa_to_hpa(struct kvm_vcpu *vcpu, gpa_t gpa)
212{
213 hpa_t hpa = gpa_to_hpa(vcpu, gpa);
214
215 return is_error_hpa(hpa) ? bad_page_address | (gpa & ~PAGE_MASK): hpa;
216}
217
218hpa_t gpa_to_hpa(struct kvm_vcpu *vcpu, gpa_t gpa)
219{
220 struct kvm_memory_slot *slot;
221 struct page *page;
222
223 ASSERT((gpa & HPA_ERR_MASK) == 0);
224 slot = gfn_to_memslot(vcpu->kvm, gpa >> PAGE_SHIFT);
225 if (!slot)
226 return gpa | HPA_ERR_MASK;
227 page = gfn_to_page(slot, gpa >> PAGE_SHIFT);
228 return ((hpa_t)page_to_pfn(page) << PAGE_SHIFT)
229 | (gpa & (PAGE_SIZE-1));
230}
231
232hpa_t gva_to_hpa(struct kvm_vcpu *vcpu, gva_t gva)
233{
234 gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, gva);
235
236 if (gpa == UNMAPPED_GVA)
237 return UNMAPPED_GVA;
238 return gpa_to_hpa(vcpu, gpa);
239}
240
241
242static void release_pt_page_64(struct kvm_vcpu *vcpu, hpa_t page_hpa,
243 int level)
244{
245 ASSERT(vcpu);
246 ASSERT(VALID_PAGE(page_hpa));
247 ASSERT(level <= PT64_ROOT_LEVEL && level > 0);
248
249 if (level == 1)
250 memset(__va(page_hpa), 0, PAGE_SIZE);
251 else {
252 u64 *pos;
253 u64 *end;
254
255 for (pos = __va(page_hpa), end = pos + PT64_ENT_PER_PAGE;
256 pos != end; pos++) {
257 u64 current_ent = *pos;
258
259 *pos = 0;
260 if (is_present_pte(current_ent))
261 release_pt_page_64(vcpu,
262 current_ent &
263 PT64_BASE_ADDR_MASK,
264 level - 1);
265 }
266 }
267 kvm_mmu_free_page(vcpu, page_hpa);
268}
269
270static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
271{
272}
273
274static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, hpa_t p)
275{
276 int level = PT32E_ROOT_LEVEL;
277 hpa_t table_addr = vcpu->mmu.root_hpa;
278
279 for (; ; level--) {
280 u32 index = PT64_INDEX(v, level);
281 u64 *table;
282
283 ASSERT(VALID_PAGE(table_addr));
284 table = __va(table_addr);
285
286 if (level == 1) {
287 mark_page_dirty(vcpu->kvm, v >> PAGE_SHIFT);
288 page_header_update_slot(vcpu->kvm, table, v);
289 table[index] = p | PT_PRESENT_MASK | PT_WRITABLE_MASK |
290 PT_USER_MASK;
291 return 0;
292 }
293
294 if (table[index] == 0) {
295 hpa_t new_table = kvm_mmu_alloc_page(vcpu,
296 &table[index]);
297
298 if (!VALID_PAGE(new_table)) {
299 pgprintk("nonpaging_map: ENOMEM\n");
300 return -ENOMEM;
301 }
302
303 if (level == PT32E_ROOT_LEVEL)
304 table[index] = new_table | PT_PRESENT_MASK;
305 else
306 table[index] = new_table | PT_PRESENT_MASK |
307 PT_WRITABLE_MASK | PT_USER_MASK;
308 }
309 table_addr = table[index] & PT64_BASE_ADDR_MASK;
310 }
311}
312
313static void nonpaging_flush(struct kvm_vcpu *vcpu)
314{
315 hpa_t root = vcpu->mmu.root_hpa;
316
317 ++kvm_stat.tlb_flush;
318 pgprintk("nonpaging_flush\n");
319 ASSERT(VALID_PAGE(root));
320 release_pt_page_64(vcpu, root, vcpu->mmu.shadow_root_level);
321 root = kvm_mmu_alloc_page(vcpu, NULL);
322 ASSERT(VALID_PAGE(root));
323 vcpu->mmu.root_hpa = root;
324 if (is_paging(vcpu))
325 root |= (vcpu->cr3 & (CR3_PCD_MASK | CR3_WPT_MASK));
326 kvm_arch_ops->set_cr3(vcpu, root);
327 kvm_arch_ops->tlb_flush(vcpu);
328}
329
330static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr)
331{
332 return vaddr;
333}
334
335static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
336 u32 error_code)
337{
338 int ret;
339 gpa_t addr = gva;
340
341 ASSERT(vcpu);
342 ASSERT(VALID_PAGE(vcpu->mmu.root_hpa));
343
344 for (;;) {
345 hpa_t paddr;
346
347 paddr = gpa_to_hpa(vcpu , addr & PT64_BASE_ADDR_MASK);
348
349 if (is_error_hpa(paddr))
350 return 1;
351
352 ret = nonpaging_map(vcpu, addr & PAGE_MASK, paddr);
353 if (ret) {
354 nonpaging_flush(vcpu);
355 continue;
356 }
357 break;
358 }
359 return ret;
360}
361
362static void nonpaging_inval_page(struct kvm_vcpu *vcpu, gva_t addr)
363{
364}
365
366static void nonpaging_free(struct kvm_vcpu *vcpu)
367{
368 hpa_t root;
369
370 ASSERT(vcpu);
371 root = vcpu->mmu.root_hpa;
372 if (VALID_PAGE(root))
373 release_pt_page_64(vcpu, root, vcpu->mmu.shadow_root_level);
374 vcpu->mmu.root_hpa = INVALID_PAGE;
375}
376
377static int nonpaging_init_context(struct kvm_vcpu *vcpu)
378{
379 struct kvm_mmu *context = &vcpu->mmu;
380
381 context->new_cr3 = nonpaging_new_cr3;
382 context->page_fault = nonpaging_page_fault;
383 context->inval_page = nonpaging_inval_page;
384 context->gva_to_gpa = nonpaging_gva_to_gpa;
385 context->free = nonpaging_free;
386 context->root_level = PT32E_ROOT_LEVEL;
387 context->shadow_root_level = PT32E_ROOT_LEVEL;
388 context->root_hpa = kvm_mmu_alloc_page(vcpu, NULL);
389 ASSERT(VALID_PAGE(context->root_hpa));
390 kvm_arch_ops->set_cr3(vcpu, context->root_hpa);
391 return 0;
392}
393
394
395static void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
396{
397 struct kvm_mmu_page *page, *npage;
398
399 list_for_each_entry_safe(page, npage, &vcpu->kvm->active_mmu_pages,
400 link) {
401 if (page->global)
402 continue;
403
404 if (!page->parent_pte)
405 continue;
406
407 *page->parent_pte = 0;
408 release_pt_page_64(vcpu, page->page_hpa, 1);
409 }
410 ++kvm_stat.tlb_flush;
411 kvm_arch_ops->tlb_flush(vcpu);
412}
413
414static void paging_new_cr3(struct kvm_vcpu *vcpu)
415{
416 kvm_mmu_flush_tlb(vcpu);
417}
418
419static void mark_pagetable_nonglobal(void *shadow_pte)
420{
421 page_header(__pa(shadow_pte))->global = 0;
422}
423
424static inline void set_pte_common(struct kvm_vcpu *vcpu,
425 u64 *shadow_pte,
426 gpa_t gaddr,
427 int dirty,
428 u64 access_bits)
429{
430 hpa_t paddr;
431
432 *shadow_pte |= access_bits << PT_SHADOW_BITS_OFFSET;
433 if (!dirty)
434 access_bits &= ~PT_WRITABLE_MASK;
435
436 if (access_bits & PT_WRITABLE_MASK)
437 mark_page_dirty(vcpu->kvm, gaddr >> PAGE_SHIFT);
438
439 *shadow_pte |= access_bits;
440
441 paddr = gpa_to_hpa(vcpu, gaddr & PT64_BASE_ADDR_MASK);
442
443 if (!(*shadow_pte & PT_GLOBAL_MASK))
444 mark_pagetable_nonglobal(shadow_pte);
445
446 if (is_error_hpa(paddr)) {
447 *shadow_pte |= gaddr;
448 *shadow_pte |= PT_SHADOW_IO_MARK;
449 *shadow_pte &= ~PT_PRESENT_MASK;
450 } else {
451 *shadow_pte |= paddr;
452 page_header_update_slot(vcpu->kvm, shadow_pte, gaddr);
453 }
454}
455
456static void inject_page_fault(struct kvm_vcpu *vcpu,
457 u64 addr,
458 u32 err_code)
459{
460 kvm_arch_ops->inject_page_fault(vcpu, addr, err_code);
461}
462
463static inline int fix_read_pf(u64 *shadow_ent)
464{
465 if ((*shadow_ent & PT_SHADOW_USER_MASK) &&
466 !(*shadow_ent & PT_USER_MASK)) {
467 /*
468 * If supervisor write protect is disabled, we shadow kernel
469 * pages as user pages so we can trap the write access.
470 */
471 *shadow_ent |= PT_USER_MASK;
472 *shadow_ent &= ~PT_WRITABLE_MASK;
473
474 return 1;
475
476 }
477 return 0;
478}
479
480static int may_access(u64 pte, int write, int user)
481{
482
483 if (user && !(pte & PT_USER_MASK))
484 return 0;
485 if (write && !(pte & PT_WRITABLE_MASK))
486 return 0;
487 return 1;
488}
489
490/*
491 * Remove a shadow pte.
492 */
493static void paging_inval_page(struct kvm_vcpu *vcpu, gva_t addr)
494{
495 hpa_t page_addr = vcpu->mmu.root_hpa;
496 int level = vcpu->mmu.shadow_root_level;
497
498 ++kvm_stat.invlpg;
499
500 for (; ; level--) {
501 u32 index = PT64_INDEX(addr, level);
502 u64 *table = __va(page_addr);
503
504 if (level == PT_PAGE_TABLE_LEVEL ) {
505 table[index] = 0;
506 return;
507 }
508
509 if (!is_present_pte(table[index]))
510 return;
511
512 page_addr = table[index] & PT64_BASE_ADDR_MASK;
513
514 if (level == PT_DIRECTORY_LEVEL &&
515 (table[index] & PT_SHADOW_PS_MARK)) {
516 table[index] = 0;
517 release_pt_page_64(vcpu, page_addr, PT_PAGE_TABLE_LEVEL);
518
519 kvm_arch_ops->tlb_flush(vcpu);
520 return;
521 }
522 }
523}
524
525static void paging_free(struct kvm_vcpu *vcpu)
526{
527 nonpaging_free(vcpu);
528}
529
530#define PTTYPE 64
531#include "paging_tmpl.h"
532#undef PTTYPE
533
534#define PTTYPE 32
535#include "paging_tmpl.h"
536#undef PTTYPE
537
538static int paging64_init_context(struct kvm_vcpu *vcpu)
539{
540 struct kvm_mmu *context = &vcpu->mmu;
541
542 ASSERT(is_pae(vcpu));
543 context->new_cr3 = paging_new_cr3;
544 context->page_fault = paging64_page_fault;
545 context->inval_page = paging_inval_page;
546 context->gva_to_gpa = paging64_gva_to_gpa;
547 context->free = paging_free;
548 context->root_level = PT64_ROOT_LEVEL;
549 context->shadow_root_level = PT64_ROOT_LEVEL;
550 context->root_hpa = kvm_mmu_alloc_page(vcpu, NULL);
551 ASSERT(VALID_PAGE(context->root_hpa));
552 kvm_arch_ops->set_cr3(vcpu, context->root_hpa |
553 (vcpu->cr3 & (CR3_PCD_MASK | CR3_WPT_MASK)));
554 return 0;
555}
556
557static int paging32_init_context(struct kvm_vcpu *vcpu)
558{
559 struct kvm_mmu *context = &vcpu->mmu;
560
561 context->new_cr3 = paging_new_cr3;
562 context->page_fault = paging32_page_fault;
563 context->inval_page = paging_inval_page;
564 context->gva_to_gpa = paging32_gva_to_gpa;
565 context->free = paging_free;
566 context->root_level = PT32_ROOT_LEVEL;
567 context->shadow_root_level = PT32E_ROOT_LEVEL;
568 context->root_hpa = kvm_mmu_alloc_page(vcpu, NULL);
569 ASSERT(VALID_PAGE(context->root_hpa));
570 kvm_arch_ops->set_cr3(vcpu, context->root_hpa |
571 (vcpu->cr3 & (CR3_PCD_MASK | CR3_WPT_MASK)));
572 return 0;
573}
574
575static int paging32E_init_context(struct kvm_vcpu *vcpu)
576{
577 int ret;
578
579 if ((ret = paging64_init_context(vcpu)))
580 return ret;
581
582 vcpu->mmu.root_level = PT32E_ROOT_LEVEL;
583 vcpu->mmu.shadow_root_level = PT32E_ROOT_LEVEL;
584 return 0;
585}
586
587static int init_kvm_mmu(struct kvm_vcpu *vcpu)
588{
589 ASSERT(vcpu);
590 ASSERT(!VALID_PAGE(vcpu->mmu.root_hpa));
591
592 if (!is_paging(vcpu))
593 return nonpaging_init_context(vcpu);
594 else if (kvm_arch_ops->is_long_mode(vcpu))
595 return paging64_init_context(vcpu);
596 else if (is_pae(vcpu))
597 return paging32E_init_context(vcpu);
598 else
599 return paging32_init_context(vcpu);
600}
601
602static void destroy_kvm_mmu(struct kvm_vcpu *vcpu)
603{
604 ASSERT(vcpu);
605 if (VALID_PAGE(vcpu->mmu.root_hpa)) {
606 vcpu->mmu.free(vcpu);
607 vcpu->mmu.root_hpa = INVALID_PAGE;
608 }
609}
610
611int kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
612{
613 destroy_kvm_mmu(vcpu);
614 return init_kvm_mmu(vcpu);
615}
616
617static void free_mmu_pages(struct kvm_vcpu *vcpu)
618{
619 while (!list_empty(&vcpu->free_pages)) {
620 struct kvm_mmu_page *page;
621
622 page = list_entry(vcpu->free_pages.next,
623 struct kvm_mmu_page, link);
624 list_del(&page->link);
625 __free_page(pfn_to_page(page->page_hpa >> PAGE_SHIFT));
626 page->page_hpa = INVALID_PAGE;
627 }
628}
629
630static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
631{
632 int i;
633
634 ASSERT(vcpu);
635
636 for (i = 0; i < KVM_NUM_MMU_PAGES; i++) {
637 struct page *page;
638 struct kvm_mmu_page *page_header = &vcpu->page_header_buf[i];
639
640 INIT_LIST_HEAD(&page_header->link);
641 if ((page = alloc_page(GFP_KVM_MMU)) == NULL)
642 goto error_1;
643 page->private = (unsigned long)page_header;
644 page_header->page_hpa = (hpa_t)page_to_pfn(page) << PAGE_SHIFT;
645 memset(__va(page_header->page_hpa), 0, PAGE_SIZE);
646 list_add(&page_header->link, &vcpu->free_pages);
647 }
648 return 0;
649
650error_1:
651 free_mmu_pages(vcpu);
652 return -ENOMEM;
653}
654
655int kvm_mmu_init(struct kvm_vcpu *vcpu)
656{
657 int r;
658
659 ASSERT(vcpu);
660 ASSERT(!VALID_PAGE(vcpu->mmu.root_hpa));
661 ASSERT(list_empty(&vcpu->free_pages));
662
663 if ((r = alloc_mmu_pages(vcpu)))
664 return r;
665
666 if ((r = init_kvm_mmu(vcpu))) {
667 free_mmu_pages(vcpu);
668 return r;
669 }
670 return 0;
671}
672
673void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
674{
675 ASSERT(vcpu);
676
677 destroy_kvm_mmu(vcpu);
678 free_mmu_pages(vcpu);
679}
680
681void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
682{
683 struct kvm_mmu_page *page;
684
685 list_for_each_entry(page, &kvm->active_mmu_pages, link) {
686 int i;
687 u64 *pt;
688
689 if (!test_bit(slot, &page->slot_bitmap))
690 continue;
691
692 pt = __va(page->page_hpa);
693 for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
694 /* avoid RMW */
695 if (pt[i] & PT_WRITABLE_MASK)
696 pt[i] &= ~PT_WRITABLE_MASK;
697
698 }
699}
diff --git a/drivers/kvm/paging_tmpl.h b/drivers/kvm/paging_tmpl.h
new file mode 100644
index 000000000000..765c2e1a048e
--- /dev/null
+++ b/drivers/kvm/paging_tmpl.h
@@ -0,0 +1,397 @@
1/*
2 * Kernel-based Virtual Machine driver for Linux
3 *
4 * This module enables machines with Intel VT-x extensions to run virtual
5 * machines without emulation or binary translation.
6 *
7 * MMU support
8 *
9 * Copyright (C) 2006 Qumranet, Inc.
10 *
11 * Authors:
12 * Yaniv Kamay <yaniv@qumranet.com>
13 * Avi Kivity <avi@qumranet.com>
14 *
15 * This work is licensed under the terms of the GNU GPL, version 2. See
16 * the COPYING file in the top-level directory.
17 *
18 */
19
20/*
21 * We need the mmu code to access both 32-bit and 64-bit guest ptes,
22 * so the code in this file is compiled twice, once per pte size.
23 */
24
25#if PTTYPE == 64
26 #define pt_element_t u64
27 #define guest_walker guest_walker64
28 #define FNAME(name) paging##64_##name
29 #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK
30 #define PT_DIR_BASE_ADDR_MASK PT64_DIR_BASE_ADDR_MASK
31 #define PT_INDEX(addr, level) PT64_INDEX(addr, level)
32 #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
33 #define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level)
34 #define PT_PTE_COPY_MASK PT64_PTE_COPY_MASK
35 #define PT_NON_PTE_COPY_MASK PT64_NON_PTE_COPY_MASK
36#elif PTTYPE == 32
37 #define pt_element_t u32
38 #define guest_walker guest_walker32
39 #define FNAME(name) paging##32_##name
40 #define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK
41 #define PT_DIR_BASE_ADDR_MASK PT32_DIR_BASE_ADDR_MASK
42 #define PT_INDEX(addr, level) PT32_INDEX(addr, level)
43 #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
44 #define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level)
45 #define PT_PTE_COPY_MASK PT32_PTE_COPY_MASK
46 #define PT_NON_PTE_COPY_MASK PT32_NON_PTE_COPY_MASK
47#else
48 #error Invalid PTTYPE value
49#endif
50
51/*
52 * The guest_walker structure emulates the behavior of the hardware page
53 * table walker.
54 */
55struct guest_walker {
56 int level;
57 pt_element_t *table;
58 pt_element_t inherited_ar;
59};
60
61static void FNAME(init_walker)(struct guest_walker *walker,
62 struct kvm_vcpu *vcpu)
63{
64 hpa_t hpa;
65 struct kvm_memory_slot *slot;
66
67 walker->level = vcpu->mmu.root_level;
68 slot = gfn_to_memslot(vcpu->kvm,
69 (vcpu->cr3 & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT);
70 hpa = safe_gpa_to_hpa(vcpu, vcpu->cr3 & PT64_BASE_ADDR_MASK);
71 walker->table = kmap_atomic(pfn_to_page(hpa >> PAGE_SHIFT), KM_USER0);
72
73 ASSERT((!kvm_arch_ops->is_long_mode(vcpu) && is_pae(vcpu)) ||
74 (vcpu->cr3 & ~(PAGE_MASK | CR3_FLAGS_MASK)) == 0);
75
76 walker->table = (pt_element_t *)( (unsigned long)walker->table |
77 (unsigned long)(vcpu->cr3 & ~(PAGE_MASK | CR3_FLAGS_MASK)) );
78 walker->inherited_ar = PT_USER_MASK | PT_WRITABLE_MASK;
79}
80
81static void FNAME(release_walker)(struct guest_walker *walker)
82{
83 kunmap_atomic(walker->table, KM_USER0);
84}
85
86static void FNAME(set_pte)(struct kvm_vcpu *vcpu, u64 guest_pte,
87 u64 *shadow_pte, u64 access_bits)
88{
89 ASSERT(*shadow_pte == 0);
90 access_bits &= guest_pte;
91 *shadow_pte = (guest_pte & PT_PTE_COPY_MASK);
92 set_pte_common(vcpu, shadow_pte, guest_pte & PT_BASE_ADDR_MASK,
93 guest_pte & PT_DIRTY_MASK, access_bits);
94}
95
96static void FNAME(set_pde)(struct kvm_vcpu *vcpu, u64 guest_pde,
97 u64 *shadow_pte, u64 access_bits,
98 int index)
99{
100 gpa_t gaddr;
101
102 ASSERT(*shadow_pte == 0);
103 access_bits &= guest_pde;
104 gaddr = (guest_pde & PT_DIR_BASE_ADDR_MASK) + PAGE_SIZE * index;
105 if (PTTYPE == 32 && is_cpuid_PSE36())
106 gaddr |= (guest_pde & PT32_DIR_PSE36_MASK) <<
107 (32 - PT32_DIR_PSE36_SHIFT);
108 *shadow_pte = (guest_pde & (PT_NON_PTE_COPY_MASK | PT_GLOBAL_MASK)) |
109 ((guest_pde & PT_DIR_PAT_MASK) >>
110 (PT_DIR_PAT_SHIFT - PT_PAT_SHIFT));
111 set_pte_common(vcpu, shadow_pte, gaddr,
112 guest_pde & PT_DIRTY_MASK, access_bits);
113}
114
115/*
116 * Fetch a guest pte from a specific level in the paging hierarchy.
117 */
118static pt_element_t *FNAME(fetch_guest)(struct kvm_vcpu *vcpu,
119 struct guest_walker *walker,
120 int level,
121 gva_t addr)
122{
123
124 ASSERT(level > 0 && level <= walker->level);
125
126 for (;;) {
127 int index = PT_INDEX(addr, walker->level);
128 hpa_t paddr;
129
130 ASSERT(((unsigned long)walker->table & PAGE_MASK) ==
131 ((unsigned long)&walker->table[index] & PAGE_MASK));
132 if (level == walker->level ||
133 !is_present_pte(walker->table[index]) ||
134 (walker->level == PT_DIRECTORY_LEVEL &&
135 (walker->table[index] & PT_PAGE_SIZE_MASK) &&
136 (PTTYPE == 64 || is_pse(vcpu))))
137 return &walker->table[index];
138 if (walker->level != 3 || kvm_arch_ops->is_long_mode(vcpu))
139 walker->inherited_ar &= walker->table[index];
140 paddr = safe_gpa_to_hpa(vcpu, walker->table[index] & PT_BASE_ADDR_MASK);
141 kunmap_atomic(walker->table, KM_USER0);
142 walker->table = kmap_atomic(pfn_to_page(paddr >> PAGE_SHIFT),
143 KM_USER0);
144 --walker->level;
145 }
146}
147
148/*
149 * Fetch a shadow pte for a specific level in the paging hierarchy.
150 */
151static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
152 struct guest_walker *walker)
153{
154 hpa_t shadow_addr;
155 int level;
156 u64 *prev_shadow_ent = NULL;
157
158 shadow_addr = vcpu->mmu.root_hpa;
159 level = vcpu->mmu.shadow_root_level;
160
161 for (; ; level--) {
162 u32 index = SHADOW_PT_INDEX(addr, level);
163 u64 *shadow_ent = ((u64 *)__va(shadow_addr)) + index;
164 pt_element_t *guest_ent;
165
166 if (is_present_pte(*shadow_ent) || is_io_pte(*shadow_ent)) {
167 if (level == PT_PAGE_TABLE_LEVEL)
168 return shadow_ent;
169 shadow_addr = *shadow_ent & PT64_BASE_ADDR_MASK;
170 prev_shadow_ent = shadow_ent;
171 continue;
172 }
173
174 if (PTTYPE == 32 && level > PT32_ROOT_LEVEL) {
175 ASSERT(level == PT32E_ROOT_LEVEL);
176 guest_ent = FNAME(fetch_guest)(vcpu, walker,
177 PT32_ROOT_LEVEL, addr);
178 } else
179 guest_ent = FNAME(fetch_guest)(vcpu, walker,
180 level, addr);
181
182 if (!is_present_pte(*guest_ent))
183 return NULL;
184
185 /* Don't set accessed bit on PAE PDPTRs */
186 if (vcpu->mmu.root_level != 3 || walker->level != 3)
187 *guest_ent |= PT_ACCESSED_MASK;
188
189 if (level == PT_PAGE_TABLE_LEVEL) {
190
191 if (walker->level == PT_DIRECTORY_LEVEL) {
192 if (prev_shadow_ent)
193 *prev_shadow_ent |= PT_SHADOW_PS_MARK;
194 FNAME(set_pde)(vcpu, *guest_ent, shadow_ent,
195 walker->inherited_ar,
196 PT_INDEX(addr, PT_PAGE_TABLE_LEVEL));
197 } else {
198 ASSERT(walker->level == PT_PAGE_TABLE_LEVEL);
199 FNAME(set_pte)(vcpu, *guest_ent, shadow_ent, walker->inherited_ar);
200 }
201 return shadow_ent;
202 }
203
204 shadow_addr = kvm_mmu_alloc_page(vcpu, shadow_ent);
205 if (!VALID_PAGE(shadow_addr))
206 return ERR_PTR(-ENOMEM);
207 if (!kvm_arch_ops->is_long_mode(vcpu) && level == 3)
208 *shadow_ent = shadow_addr |
209 (*guest_ent & (PT_PRESENT_MASK | PT_PWT_MASK | PT_PCD_MASK));
210 else {
211 *shadow_ent = shadow_addr |
212 (*guest_ent & PT_NON_PTE_COPY_MASK);
213 *shadow_ent |= (PT_WRITABLE_MASK | PT_USER_MASK);
214 }
215 prev_shadow_ent = shadow_ent;
216 }
217}
218
219/*
220 * The guest faulted for write. We need to
221 *
222 * - check write permissions
223 * - update the guest pte dirty bit
224 * - update our own dirty page tracking structures
225 */
226static int FNAME(fix_write_pf)(struct kvm_vcpu *vcpu,
227 u64 *shadow_ent,
228 struct guest_walker *walker,
229 gva_t addr,
230 int user)
231{
232 pt_element_t *guest_ent;
233 int writable_shadow;
234 gfn_t gfn;
235
236 if (is_writeble_pte(*shadow_ent))
237 return 0;
238
239 writable_shadow = *shadow_ent & PT_SHADOW_WRITABLE_MASK;
240 if (user) {
241 /*
242 * User mode access. Fail if it's a kernel page or a read-only
243 * page.
244 */
245 if (!(*shadow_ent & PT_SHADOW_USER_MASK) || !writable_shadow)
246 return 0;
247 ASSERT(*shadow_ent & PT_USER_MASK);
248 } else
249 /*
250 * Kernel mode access. Fail if it's a read-only page and
251 * supervisor write protection is enabled.
252 */
253 if (!writable_shadow) {
254 if (is_write_protection(vcpu))
255 return 0;
256 *shadow_ent &= ~PT_USER_MASK;
257 }
258
259 guest_ent = FNAME(fetch_guest)(vcpu, walker, PT_PAGE_TABLE_LEVEL, addr);
260
261 if (!is_present_pte(*guest_ent)) {
262 *shadow_ent = 0;
263 return 0;
264 }
265
266 gfn = (*guest_ent & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
267 mark_page_dirty(vcpu->kvm, gfn);
268 *shadow_ent |= PT_WRITABLE_MASK;
269 *guest_ent |= PT_DIRTY_MASK;
270
271 return 1;
272}
273
274/*
275 * Page fault handler. There are several causes for a page fault:
276 * - there is no shadow pte for the guest pte
277 * - write access through a shadow pte marked read only so that we can set
278 * the dirty bit
279 * - write access to a shadow pte marked read only so we can update the page
280 * dirty bitmap, when userspace requests it
281 * - mmio access; in this case we will never install a present shadow pte
282 * - normal guest page fault due to the guest pte marked not present, not
283 * writable, or not executable
284 *
285 * Returns: 1 if we need to emulate the instruction, 0 otherwise
286 */
287static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
288 u32 error_code)
289{
290 int write_fault = error_code & PFERR_WRITE_MASK;
291 int pte_present = error_code & PFERR_PRESENT_MASK;
292 int user_fault = error_code & PFERR_USER_MASK;
293 struct guest_walker walker;
294 u64 *shadow_pte;
295 int fixed;
296
297 /*
298 * Look up the shadow pte for the faulting address.
299 */
300 for (;;) {
301 FNAME(init_walker)(&walker, vcpu);
302 shadow_pte = FNAME(fetch)(vcpu, addr, &walker);
303 if (IS_ERR(shadow_pte)) { /* must be -ENOMEM */
304 nonpaging_flush(vcpu);
305 FNAME(release_walker)(&walker);
306 continue;
307 }
308 break;
309 }
310
311 /*
312 * The page is not mapped by the guest. Let the guest handle it.
313 */
314 if (!shadow_pte) {
315 inject_page_fault(vcpu, addr, error_code);
316 FNAME(release_walker)(&walker);
317 return 0;
318 }
319
320 /*
321 * Update the shadow pte.
322 */
323 if (write_fault)
324 fixed = FNAME(fix_write_pf)(vcpu, shadow_pte, &walker, addr,
325 user_fault);
326 else
327 fixed = fix_read_pf(shadow_pte);
328
329 FNAME(release_walker)(&walker);
330
331 /*
332 * mmio: emulate if accessible, otherwise it's a guest fault.
333 */
334 if (is_io_pte(*shadow_pte)) {
335 if (may_access(*shadow_pte, write_fault, user_fault))
336 return 1;
337 pgprintk("%s: io work, no access\n", __FUNCTION__);
338 inject_page_fault(vcpu, addr,
339 error_code | PFERR_PRESENT_MASK);
340 return 0;
341 }
342
343 /*
344 * pte not present, guest page fault.
345 */
346 if (pte_present && !fixed) {
347 inject_page_fault(vcpu, addr, error_code);
348 return 0;
349 }
350
351 ++kvm_stat.pf_fixed;
352
353 return 0;
354}
355
356static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr)
357{
358 struct guest_walker walker;
359 pt_element_t guest_pte;
360 gpa_t gpa;
361
362 FNAME(init_walker)(&walker, vcpu);
363 guest_pte = *FNAME(fetch_guest)(vcpu, &walker, PT_PAGE_TABLE_LEVEL,
364 vaddr);
365 FNAME(release_walker)(&walker);
366
367 if (!is_present_pte(guest_pte))
368 return UNMAPPED_GVA;
369
370 if (walker.level == PT_DIRECTORY_LEVEL) {
371 ASSERT((guest_pte & PT_PAGE_SIZE_MASK));
372 ASSERT(PTTYPE == 64 || is_pse(vcpu));
373
374 gpa = (guest_pte & PT_DIR_BASE_ADDR_MASK) | (vaddr &
375 (PT_LEVEL_MASK(PT_PAGE_TABLE_LEVEL) | ~PAGE_MASK));
376
377 if (PTTYPE == 32 && is_cpuid_PSE36())
378 gpa |= (guest_pte & PT32_DIR_PSE36_MASK) <<
379 (32 - PT32_DIR_PSE36_SHIFT);
380 } else {
381 gpa = (guest_pte & PT_BASE_ADDR_MASK);
382 gpa |= (vaddr & ~PAGE_MASK);
383 }
384
385 return gpa;
386}
387
388#undef pt_element_t
389#undef guest_walker
390#undef FNAME
391#undef PT_BASE_ADDR_MASK
392#undef PT_INDEX
393#undef SHADOW_PT_INDEX
394#undef PT_LEVEL_MASK
395#undef PT_PTE_COPY_MASK
396#undef PT_NON_PTE_COPY_MASK
397#undef PT_DIR_BASE_ADDR_MASK
diff --git a/drivers/kvm/segment_descriptor.h b/drivers/kvm/segment_descriptor.h
new file mode 100644
index 000000000000..71fdf458619a
--- /dev/null
+++ b/drivers/kvm/segment_descriptor.h
@@ -0,0 +1,17 @@
1struct segment_descriptor {
2 u16 limit_low;
3 u16 base_low;
4 u8 base_mid;
5 u8 type : 4;
6 u8 system : 1;
7 u8 dpl : 2;
8 u8 present : 1;
9 u8 limit_high : 4;
10 u8 avl : 1;
11 u8 long_mode : 1;
12 u8 default_op : 1;
13 u8 granularity : 1;
14 u8 base_high;
15} __attribute__((packed));
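
The descriptor's base and limit are scattered across these bitfields; two small helpers (hypothetical, added purely to illustrate the layout) show how they reassemble:

/* Editor's illustration, not part of the patch. */
static inline unsigned long segment_base_of(const struct segment_descriptor *d)
{
	return d->base_low | ((unsigned long)d->base_mid << 16) |
	       ((unsigned long)d->base_high << 24);
}

static inline unsigned int segment_limit_of(const struct segment_descriptor *d)
{
	/* 20-bit limit; when d->granularity is set it is in 4KB units. */
	return d->limit_low | ((unsigned int)d->limit_high << 16);
}
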
16
17
diff --git a/drivers/kvm/svm.c b/drivers/kvm/svm.c
new file mode 100644
index 000000000000..a33a89c68138
--- /dev/null
+++ b/drivers/kvm/svm.c
@@ -0,0 +1,1677 @@
1/*
2 * Kernel-based Virtual Machine driver for Linux
3 *
4 * AMD SVM support
5 *
6 * Copyright (C) 2006 Qumranet, Inc.
7 *
8 * Authors:
9 * Yaniv Kamay <yaniv@qumranet.com>
10 * Avi Kivity <avi@qumranet.com>
11 *
12 * This work is licensed under the terms of the GNU GPL, version 2. See
13 * the COPYING file in the top-level directory.
14 *
15 */
16
17#include <linux/module.h>
18#include <linux/vmalloc.h>
19#include <linux/highmem.h>
20#include <asm/desc.h>
21
22#include "kvm_svm.h"
23#include "x86_emulate.h"
24
25MODULE_AUTHOR("Qumranet");
26MODULE_LICENSE("GPL");
27
28#define IOPM_ALLOC_ORDER 2
29#define MSRPM_ALLOC_ORDER 1
30
31#define DB_VECTOR 1
32#define UD_VECTOR 6
33#define GP_VECTOR 13
34
35#define DR7_GD_MASK (1 << 13)
36#define DR6_BD_MASK (1 << 13)
37#define CR4_DE_MASK (1UL << 3)
38
39#define SEG_TYPE_LDT 2
40#define SEG_TYPE_BUSY_TSS16 3
41
42#define KVM_EFER_LMA (1 << 10)
43#define KVM_EFER_LME (1 << 8)
44
45unsigned long iopm_base;
46unsigned long msrpm_base;
47
48struct kvm_ldttss_desc {
49 u16 limit0;
50 u16 base0;
51 unsigned base1 : 8, type : 5, dpl : 2, p : 1;
52 unsigned limit1 : 4, zero0 : 3, g : 1, base2 : 8;
53 u32 base3;
54 u32 zero1;
55} __attribute__((packed));
56
57struct svm_cpu_data {
58 int cpu;
59
60 uint64_t asid_generation;
61 uint32_t max_asid;
62 uint32_t next_asid;
63 struct kvm_ldttss_desc *tss_desc;
64
65 struct page *save_area;
66};
67
68static DEFINE_PER_CPU(struct svm_cpu_data *, svm_data);
69
70struct svm_init_data {
71 int cpu;
72 int r;
73};
74
75static u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000};
76
77#define NUM_MSR_MAPS (sizeof(msrpm_ranges) / sizeof(*msrpm_ranges))
78#define MSRS_RANGE_SIZE 2048
79#define MSRS_IN_RANGE (MSRS_RANGE_SIZE * 8 / 2)
80
81#define MAX_INST_SIZE 15
82
83static unsigned get_addr_size(struct kvm_vcpu *vcpu)
84{
85 struct vmcb_save_area *sa = &vcpu->svm->vmcb->save;
86 u16 cs_attrib;
87
88 if (!(sa->cr0 & CR0_PE_MASK) || (sa->rflags & X86_EFLAGS_VM))
89 return 2;
90
91 cs_attrib = sa->cs.attrib;
92
93 return (cs_attrib & SVM_SELECTOR_L_MASK) ? 8 :
94 (cs_attrib & SVM_SELECTOR_DB_MASK) ? 4 : 2;
95}
96
97static inline u8 pop_irq(struct kvm_vcpu *vcpu)
98{
99 int word_index = __ffs(vcpu->irq_summary);
100 int bit_index = __ffs(vcpu->irq_pending[word_index]);
101 int irq = word_index * BITS_PER_LONG + bit_index;
102
103 clear_bit(bit_index, &vcpu->irq_pending[word_index]);
104 if (!vcpu->irq_pending[word_index])
105 clear_bit(word_index, &vcpu->irq_summary);
106 return irq;
107}
108
109static inline void push_irq(struct kvm_vcpu *vcpu, u8 irq)
110{
111 set_bit(irq, vcpu->irq_pending);
112 set_bit(irq / BITS_PER_LONG, &vcpu->irq_summary);
113}
114
115static inline void clgi(void)
116{
117 asm volatile (SVM_CLGI);
118}
119
120static inline void stgi(void)
121{
122 asm volatile (SVM_STGI);
123}
124
125static inline void invlpga(unsigned long addr, u32 asid)
126{
127 asm volatile (SVM_INVLPGA :: "a"(addr), "c"(asid));
128}
129
130static inline unsigned long kvm_read_cr2(void)
131{
132 unsigned long cr2;
133
134 asm volatile ("mov %%cr2, %0" : "=r" (cr2));
135 return cr2;
136}
137
138static inline void kvm_write_cr2(unsigned long val)
139{
140 asm volatile ("mov %0, %%cr2" :: "r" (val));
141}
142
143static inline unsigned long read_dr6(void)
144{
145 unsigned long dr6;
146
147 asm volatile ("mov %%dr6, %0" : "=r" (dr6));
148 return dr6;
149}
150
151static inline void write_dr6(unsigned long val)
152{
153 asm volatile ("mov %0, %%dr6" :: "r" (val));
154}
155
156static inline unsigned long read_dr7(void)
157{
158 unsigned long dr7;
159
160 asm volatile ("mov %%dr7, %0" : "=r" (dr7));
161 return dr7;
162}
163
164static inline void write_dr7(unsigned long val)
165{
166 asm volatile ("mov %0, %%dr7" :: "r" (val));
167}
168
169static inline int svm_is_long_mode(struct kvm_vcpu *vcpu)
170{
171 return vcpu->svm->vmcb->save.efer & KVM_EFER_LMA;
172}
173
174static inline void force_new_asid(struct kvm_vcpu *vcpu)
175{
176 vcpu->svm->asid_generation--;
177}
178
179static inline void flush_guest_tlb(struct kvm_vcpu *vcpu)
180{
181 force_new_asid(vcpu);
182}
183
184static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
185{
186 if (!(efer & KVM_EFER_LMA))
187 efer &= ~KVM_EFER_LME;
188
189 vcpu->svm->vmcb->save.efer = efer | MSR_EFER_SVME_MASK;
190 vcpu->shadow_efer = efer;
191}
192
193static void svm_inject_gp(struct kvm_vcpu *vcpu, unsigned error_code)
194{
195 vcpu->svm->vmcb->control.event_inj = SVM_EVTINJ_VALID |
196 SVM_EVTINJ_VALID_ERR |
197 SVM_EVTINJ_TYPE_EXEPT |
198 GP_VECTOR;
199 vcpu->svm->vmcb->control.event_inj_err = error_code;
200}
201
202static void inject_ud(struct kvm_vcpu *vcpu)
203{
204 vcpu->svm->vmcb->control.event_inj = SVM_EVTINJ_VALID |
205 SVM_EVTINJ_TYPE_EXEPT |
206 UD_VECTOR;
207}
208
209static void inject_db(struct kvm_vcpu *vcpu)
210{
211 vcpu->svm->vmcb->control.event_inj = SVM_EVTINJ_VALID |
212 SVM_EVTINJ_TYPE_EXEPT |
213 DB_VECTOR;
214}
215
216static int is_page_fault(uint32_t info)
217{
218 info &= SVM_EVTINJ_VEC_MASK | SVM_EVTINJ_TYPE_MASK | SVM_EVTINJ_VALID;
219 return info == (PF_VECTOR | SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_EXEPT);
220}
221
222static int is_external_interrupt(u32 info)
223{
224 info &= SVM_EVTINJ_TYPE_MASK | SVM_EVTINJ_VALID;
225 return info == (SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR);
226}
227
228static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
229{
230 if (!vcpu->svm->next_rip) {
231 printk(KERN_DEBUG "%s: NOP\n", __FUNCTION__);
232 return;
233 }
234 if (vcpu->svm->next_rip - vcpu->svm->vmcb->save.rip > 15) {
235 printk(KERN_ERR "%s: ip 0x%llx next 0x%llx\n",
236 __FUNCTION__,
237 vcpu->svm->vmcb->save.rip,
238 vcpu->svm->next_rip);
239 }
240
241 vcpu->rip = vcpu->svm->vmcb->save.rip = vcpu->svm->next_rip;
242 vcpu->svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK;
243}
244
245static int has_svm(void)
246{
247 uint32_t eax, ebx, ecx, edx;
248
249 if (current_cpu_data.x86_vendor != X86_VENDOR_AMD) {
250 printk(KERN_INFO "has_svm: not amd\n");
251 return 0;
252 }
253
254 cpuid(0x80000000, &eax, &ebx, &ecx, &edx);
255 if (eax < SVM_CPUID_FUNC) {
256 printk(KERN_INFO "has_svm: can't execute cpuid_8000000a\n");
257 return 0;
258 }
259
260 cpuid(0x80000001, &eax, &ebx, &ecx, &edx);
261 if (!(ecx & (1 << SVM_CPUID_FEATURE_SHIFT))) {
262 printk(KERN_DEBUG "has_svm: svm not available\n");
263 return 0;
264 }
265 return 1;
266}
267
268static void svm_hardware_disable(void *garbage)
269{
270 struct svm_cpu_data *svm_data
271 = per_cpu(svm_data, raw_smp_processor_id());
272
273 if (svm_data) {
274 uint64_t efer;
275
276 wrmsrl(MSR_VM_HSAVE_PA, 0);
277 rdmsrl(MSR_EFER, efer);
278 wrmsrl(MSR_EFER, efer & ~MSR_EFER_SVME_MASK);
279 per_cpu(svm_data, raw_smp_processor_id()) = 0;
280 __free_page(svm_data->save_area);
281 kfree(svm_data);
282 }
283}
284
285static void svm_hardware_enable(void *garbage)
286{
287
288 struct svm_cpu_data *svm_data;
289 uint64_t efer;
290#ifdef __x86_64__
291 struct desc_ptr gdt_descr;
292#else
293 struct Xgt_desc_struct gdt_descr;
294#endif
295 struct desc_struct *gdt;
296 int me = raw_smp_processor_id();
297
298 if (!has_svm()) {
299 printk(KERN_ERR "svm_cpu_init: err EOPNOTSUPP on %d\n", me);
300 return;
301 }
302 svm_data = per_cpu(svm_data, me);
303
304 if (!svm_data) {
305 printk(KERN_ERR "svm_cpu_init: svm_data is NULL on %d\n",
306 me);
307 return;
308 }
309
310 svm_data->asid_generation = 1;
311 svm_data->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1;
312 svm_data->next_asid = svm_data->max_asid + 1;
313
314 asm volatile ( "sgdt %0" : "=m"(gdt_descr) );
315 gdt = (struct desc_struct *)gdt_descr.address;
316 svm_data->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS);
317
318 rdmsrl(MSR_EFER, efer);
319 wrmsrl(MSR_EFER, efer | MSR_EFER_SVME_MASK);
320
321 wrmsrl(MSR_VM_HSAVE_PA,
322 page_to_pfn(svm_data->save_area) << PAGE_SHIFT);
323}
324
325static int svm_cpu_init(int cpu)
326{
327 struct svm_cpu_data *svm_data;
328 int r;
329
330 svm_data = kzalloc(sizeof(struct svm_cpu_data), GFP_KERNEL);
331 if (!svm_data)
332 return -ENOMEM;
333 svm_data->cpu = cpu;
334 svm_data->save_area = alloc_page(GFP_KERNEL);
335 r = -ENOMEM;
336 if (!svm_data->save_area)
337 goto err_1;
338
339 per_cpu(svm_data, cpu) = svm_data;
340
341 return 0;
342
343err_1:
344 kfree(svm_data);
345 return r;
346
347}
348
349static int set_msr_interception(u32 *msrpm, unsigned msr,
350 int read, int write)
351{
352 int i;
353
354 for (i = 0; i < NUM_MSR_MAPS; i++) {
355 if (msr >= msrpm_ranges[i] &&
356 msr < msrpm_ranges[i] + MSRS_IN_RANGE) {
357 u32 msr_offset = (i * MSRS_IN_RANGE + msr -
358 msrpm_ranges[i]) * 2;
359
360 u32 *base = msrpm + (msr_offset / 32);
361 u32 msr_shift = msr_offset % 32;
362 u32 mask = ((write) ? 0 : 2) | ((read) ? 0 : 1);
363 *base = (*base & ~(0x3 << msr_shift)) |
364 (mask << msr_shift);
365 return 1;
366 }
367 }
368 printk(KERN_DEBUG "%s: not found 0x%x\n", __FUNCTION__, msr);
369 return 0;
370}
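
To ground the offset arithmetic: every MSR owns two adjacent bits in the permission map (the low bit of the pair intercepts reads, the high bit intercepts writes, matching the mask built from the read/write arguments), and each range covers MSRS_IN_RANGE = 2048 * 8 / 2 = 8192 MSRs. A worked example, assuming MSR_LSTAR's architectural value of 0xc0000082:

/*
 * Editor's worked example, not part of the patch:
 *   msr        = 0xc0000082 (MSR_LSTAR)  ->  i == 1 (range 0xc0000000)
 *   msr_offset = (1 * 8192 + 0x82) * 2   == 16644
 *   base       = msrpm + 16644 / 32      == msrpm + 520
 *   msr_shift  = 16644 % 32              == 4
 *
 * set_msr_interception(msrpm, MSR_LSTAR, 1, 1) therefore clears bits 4
 * and 5 of that word, so guest reads and writes of LSTAR no longer exit
 * (the whole map starts out as all ones in svm_hardware_setup()).
 */
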
371
372static __init int svm_hardware_setup(void)
373{
374 int cpu;
375 struct page *iopm_pages;
376 struct page *msrpm_pages;
377 void *msrpm_va;
378 int r;
379
380
381 iopm_pages = alloc_pages(GFP_KERNEL, IOPM_ALLOC_ORDER);
382
383 if (!iopm_pages)
384 return -ENOMEM;
385 memset(page_address(iopm_pages), 0xff,
386 PAGE_SIZE * (1 << IOPM_ALLOC_ORDER));
387 iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT;
388
389
390 msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER);
391
392 r = -ENOMEM;
393 if (!msrpm_pages)
394 goto err_1;
395
396 msrpm_va = page_address(msrpm_pages);
397 memset(msrpm_va, 0xff, PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER));
398 msrpm_base = page_to_pfn(msrpm_pages) << PAGE_SHIFT;
399
400#ifdef __x86_64__
401 set_msr_interception(msrpm_va, MSR_GS_BASE, 1, 1);
402 set_msr_interception(msrpm_va, MSR_FS_BASE, 1, 1);
403 set_msr_interception(msrpm_va, MSR_KERNEL_GS_BASE, 1, 1);
404 set_msr_interception(msrpm_va, MSR_STAR, 1, 1);
405 set_msr_interception(msrpm_va, MSR_LSTAR, 1, 1);
406 set_msr_interception(msrpm_va, MSR_CSTAR, 1, 1);
407 set_msr_interception(msrpm_va, MSR_SYSCALL_MASK, 1, 1);
408#endif
409 set_msr_interception(msrpm_va, MSR_IA32_SYSENTER_CS, 1, 1);
410 set_msr_interception(msrpm_va, MSR_IA32_SYSENTER_ESP, 1, 1);
411 set_msr_interception(msrpm_va, MSR_IA32_SYSENTER_EIP, 1, 1);
412
413 for_each_online_cpu(cpu) {
414 r = svm_cpu_init(cpu);
415 if (r)
416 goto err_2;
417 }
418 return 0;
419
420err_2:
421 __free_pages(msrpm_pages, MSRPM_ALLOC_ORDER);
422 msrpm_base = 0;
423err_1:
424 __free_pages(iopm_pages, IOPM_ALLOC_ORDER);
425 iopm_base = 0;
426 return r;
427}
428
429static __exit void svm_hardware_unsetup(void)
430{
431 __free_pages(pfn_to_page(msrpm_base >> PAGE_SHIFT), MSRPM_ALLOC_ORDER);
432 __free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT), IOPM_ALLOC_ORDER);
433 iopm_base = msrpm_base = 0;
434}
435
436static void init_seg(struct vmcb_seg *seg)
437{
438 seg->selector = 0;
439 seg->attrib = SVM_SELECTOR_P_MASK | SVM_SELECTOR_S_MASK |
440 SVM_SELECTOR_WRITE_MASK; /* Read/Write Data Segment */
441 seg->limit = 0xffff;
442 seg->base = 0;
443}
444
445static void init_sys_seg(struct vmcb_seg *seg, uint32_t type)
446{
447 seg->selector = 0;
448 seg->attrib = SVM_SELECTOR_P_MASK | type;
449 seg->limit = 0xffff;
450 seg->base = 0;
451}
452
453static int svm_vcpu_setup(struct kvm_vcpu *vcpu)
454{
455 return 0;
456}
457
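/*
 * Initialise a fresh VMCB: intercept CR/DR accesses, page faults and the
 * instructions listed below, point it at the I/O and MSR permission maps,
 * and give the guest a register state close to that of a real cpu after
 * reset (CS base 0xffff0000, RIP 0xfff0), with EFER.SVME set.
 */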
458static void init_vmcb(struct vmcb *vmcb)
459{
460 struct vmcb_control_area *control = &vmcb->control;
461 struct vmcb_save_area *save = &vmcb->save;
462 u64 tsc;
463
464 control->intercept_cr_read = INTERCEPT_CR0_MASK |
465 INTERCEPT_CR3_MASK |
466 INTERCEPT_CR4_MASK;
467
468 control->intercept_cr_write = INTERCEPT_CR0_MASK |
469 INTERCEPT_CR3_MASK |
470 INTERCEPT_CR4_MASK;
471
472 control->intercept_dr_read = INTERCEPT_DR0_MASK |
473 INTERCEPT_DR1_MASK |
474 INTERCEPT_DR2_MASK |
475 INTERCEPT_DR3_MASK;
476
477 control->intercept_dr_write = INTERCEPT_DR0_MASK |
478 INTERCEPT_DR1_MASK |
479 INTERCEPT_DR2_MASK |
480 INTERCEPT_DR3_MASK |
481 INTERCEPT_DR5_MASK |
482 INTERCEPT_DR7_MASK;
483
484 control->intercept_exceptions = 1 << PF_VECTOR;
485
486
487 control->intercept = (1ULL << INTERCEPT_INTR) |
488 (1ULL << INTERCEPT_NMI) |
489 /*
490 * selective cr0 intercept bug?
491 * 0: 0f 22 d8 mov %eax,%cr3
492 * 3: 0f 20 c0 mov %cr0,%eax
493 * 6: 0d 00 00 00 80 or $0x80000000,%eax
494 * b: 0f 22 c0 mov %eax,%cr0
495 * set cr3 ->interception
496 * get cr0 ->interception
497 * set cr0 -> no interception
498 */
499 /* (1ULL << INTERCEPT_SELECTIVE_CR0) | */
500 (1ULL << INTERCEPT_CPUID) |
501 (1ULL << INTERCEPT_HLT) |
502 (1ULL << INTERCEPT_INVLPG) |
503 (1ULL << INTERCEPT_INVLPGA) |
504 (1ULL << INTERCEPT_IOIO_PROT) |
505 (1ULL << INTERCEPT_MSR_PROT) |
506 (1ULL << INTERCEPT_TASK_SWITCH) |
507 (1ULL << INTERCEPT_VMRUN) |
508 (1ULL << INTERCEPT_VMMCALL) |
509 (1ULL << INTERCEPT_VMLOAD) |
510 (1ULL << INTERCEPT_VMSAVE) |
511 (1ULL << INTERCEPT_STGI) |
512 (1ULL << INTERCEPT_CLGI) |
513 (1ULL << INTERCEPT_SKINIT);
514
515 control->iopm_base_pa = iopm_base;
516 control->msrpm_base_pa = msrpm_base;
517 rdtscll(tsc);
518 control->tsc_offset = -tsc;
519 control->int_ctl = V_INTR_MASKING_MASK;
520
521 init_seg(&save->es);
522 init_seg(&save->ss);
523 init_seg(&save->ds);
524 init_seg(&save->fs);
525 init_seg(&save->gs);
526
527 save->cs.selector = 0xf000;
528 /* Executable/Readable Code Segment */
529 save->cs.attrib = SVM_SELECTOR_READ_MASK | SVM_SELECTOR_P_MASK |
530 SVM_SELECTOR_S_MASK | SVM_SELECTOR_CODE_MASK;
531 save->cs.limit = 0xffff;
532 save->cs.base = 0xffff0000;
533
534 save->gdtr.limit = 0xffff;
535 save->idtr.limit = 0xffff;
536
537 init_sys_seg(&save->ldtr, SEG_TYPE_LDT);
538 init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16);
539
540 save->efer = MSR_EFER_SVME_MASK;
541
542 save->dr6 = 0xffff0ff0;
543 save->dr7 = 0x400;
544 save->rflags = 2;
545 save->rip = 0x0000fff0;
546
547 /*
548	 * cr0 at cpu reset is 0x60000010 (cache disabled); we enable the cpu
549	 * cache by default. The proper way is to enable the cache in the BIOS.
550 */
551 save->cr0 = 0x00000010 | CR0_PG_MASK;
552 save->cr4 = CR4_PAE_MASK;
553 /* rdx = ?? */
554}
555
556static int svm_create_vcpu(struct kvm_vcpu *vcpu)
557{
558 struct page *page;
559 int r;
560
561 r = -ENOMEM;
562 vcpu->svm = kzalloc(sizeof *vcpu->svm, GFP_KERNEL);
563 if (!vcpu->svm)
564 goto out1;
565 page = alloc_page(GFP_KERNEL);
566 if (!page)
567 goto out2;
568
569 vcpu->svm->vmcb = page_address(page);
570 memset(vcpu->svm->vmcb, 0, PAGE_SIZE);
571 vcpu->svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT;
572 vcpu->svm->cr0 = 0x00000010;
573 vcpu->svm->asid_generation = 0;
574 memset(vcpu->svm->db_regs, 0, sizeof(vcpu->svm->db_regs));
575 init_vmcb(vcpu->svm->vmcb);
576
577 return 0;
578
579out2:
580 kfree(vcpu->svm);
581out1:
582 return r;
583}
584
585static void svm_free_vcpu(struct kvm_vcpu *vcpu)
586{
587 if (!vcpu->svm)
588 return;
589 if (vcpu->svm->vmcb)
590 __free_page(pfn_to_page(vcpu->svm->vmcb_pa >> PAGE_SHIFT));
591 kfree(vcpu->svm);
592}
593
594static struct kvm_vcpu *svm_vcpu_load(struct kvm_vcpu *vcpu)
595{
596 get_cpu();
597 return vcpu;
598}
599
600static void svm_vcpu_put(struct kvm_vcpu *vcpu)
601{
602 put_cpu();
603}
604
605static void svm_cache_regs(struct kvm_vcpu *vcpu)
606{
607 vcpu->regs[VCPU_REGS_RAX] = vcpu->svm->vmcb->save.rax;
608 vcpu->regs[VCPU_REGS_RSP] = vcpu->svm->vmcb->save.rsp;
609 vcpu->rip = vcpu->svm->vmcb->save.rip;
610}
611
612static void svm_decache_regs(struct kvm_vcpu *vcpu)
613{
614 vcpu->svm->vmcb->save.rax = vcpu->regs[VCPU_REGS_RAX];
615 vcpu->svm->vmcb->save.rsp = vcpu->regs[VCPU_REGS_RSP];
616 vcpu->svm->vmcb->save.rip = vcpu->rip;
617}
618
619static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
620{
621 return vcpu->svm->vmcb->save.rflags;
622}
623
624static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
625{
626 vcpu->svm->vmcb->save.rflags = rflags;
627}
628
629static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg)
630{
631 struct vmcb_save_area *save = &vcpu->svm->vmcb->save;
632
633 switch (seg) {
634 case VCPU_SREG_CS: return &save->cs;
635 case VCPU_SREG_DS: return &save->ds;
636 case VCPU_SREG_ES: return &save->es;
637 case VCPU_SREG_FS: return &save->fs;
638 case VCPU_SREG_GS: return &save->gs;
639 case VCPU_SREG_SS: return &save->ss;
640 case VCPU_SREG_TR: return &save->tr;
641 case VCPU_SREG_LDTR: return &save->ldtr;
642 }
643 BUG();
644	return NULL;
645}
646
647static u64 svm_get_segment_base(struct kvm_vcpu *vcpu, int seg)
648{
649 struct vmcb_seg *s = svm_seg(vcpu, seg);
650
651 return s->base;
652}
653
654static void svm_get_segment(struct kvm_vcpu *vcpu,
655 struct kvm_segment *var, int seg)
656{
657 struct vmcb_seg *s = svm_seg(vcpu, seg);
658
659 var->base = s->base;
660 var->limit = s->limit;
661 var->selector = s->selector;
662 var->type = s->attrib & SVM_SELECTOR_TYPE_MASK;
663 var->s = (s->attrib >> SVM_SELECTOR_S_SHIFT) & 1;
664 var->dpl = (s->attrib >> SVM_SELECTOR_DPL_SHIFT) & 3;
665 var->present = (s->attrib >> SVM_SELECTOR_P_SHIFT) & 1;
666 var->avl = (s->attrib >> SVM_SELECTOR_AVL_SHIFT) & 1;
667 var->l = (s->attrib >> SVM_SELECTOR_L_SHIFT) & 1;
668 var->db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1;
669 var->g = (s->attrib >> SVM_SELECTOR_G_SHIFT) & 1;
670 var->unusable = !var->present;
671}
672
673static void svm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
674{
675 struct vmcb_seg *s = svm_seg(vcpu, VCPU_SREG_CS);
676
677 *db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1;
678 *l = (s->attrib >> SVM_SELECTOR_L_SHIFT) & 1;
679}
680
681static void svm_get_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
682{
683	dt->limit = vcpu->svm->vmcb->save.idtr.limit;
684	dt->base = vcpu->svm->vmcb->save.idtr.base;
685}
686
687static void svm_set_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
688{
689	vcpu->svm->vmcb->save.idtr.limit = dt->limit;
690	vcpu->svm->vmcb->save.idtr.base = dt->base;
691}
692
693static void svm_get_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
694{
695 dt->limit = vcpu->svm->vmcb->save.gdtr.limit;
696 dt->base = vcpu->svm->vmcb->save.gdtr.base;
697}
698
699static void svm_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
700{
701 vcpu->svm->vmcb->save.gdtr.limit = dt->limit;
702	vcpu->svm->vmcb->save.gdtr.base = dt->base;
703}
704
705static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
706{
707#ifdef __x86_64__
708 if (vcpu->shadow_efer & KVM_EFER_LME) {
709 if (!is_paging(vcpu) && (cr0 & CR0_PG_MASK)) {
710 vcpu->shadow_efer |= KVM_EFER_LMA;
711 vcpu->svm->vmcb->save.efer |= KVM_EFER_LMA | KVM_EFER_LME;
712 }
713
714		if (is_paging(vcpu) && !(cr0 & CR0_PG_MASK)) {
715 vcpu->shadow_efer &= ~KVM_EFER_LMA;
716 vcpu->svm->vmcb->save.efer &= ~(KVM_EFER_LMA | KVM_EFER_LME);
717 }
718 }
719#endif
720 vcpu->svm->cr0 = cr0;
721 vcpu->svm->vmcb->save.cr0 = cr0 | CR0_PG_MASK;
722 vcpu->cr0 = cr0;
723}
724
725static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
726{
727 vcpu->cr4 = cr4;
728 vcpu->svm->vmcb->save.cr4 = cr4 | CR4_PAE_MASK;
729}
730
731static void svm_set_segment(struct kvm_vcpu *vcpu,
732 struct kvm_segment *var, int seg)
733{
734 struct vmcb_seg *s = svm_seg(vcpu, seg);
735
736 s->base = var->base;
737 s->limit = var->limit;
738 s->selector = var->selector;
739 if (var->unusable)
740 s->attrib = 0;
741 else {
742 s->attrib = (var->type & SVM_SELECTOR_TYPE_MASK);
743 s->attrib |= (var->s & 1) << SVM_SELECTOR_S_SHIFT;
744 s->attrib |= (var->dpl & 3) << SVM_SELECTOR_DPL_SHIFT;
745 s->attrib |= (var->present & 1) << SVM_SELECTOR_P_SHIFT;
746 s->attrib |= (var->avl & 1) << SVM_SELECTOR_AVL_SHIFT;
747 s->attrib |= (var->l & 1) << SVM_SELECTOR_L_SHIFT;
748 s->attrib |= (var->db & 1) << SVM_SELECTOR_DB_SHIFT;
749 s->attrib |= (var->g & 1) << SVM_SELECTOR_G_SHIFT;
750 }
751 if (seg == VCPU_SREG_CS)
752 vcpu->svm->vmcb->save.cpl
753 = (vcpu->svm->vmcb->save.cs.attrib
754 >> SVM_SELECTOR_DPL_SHIFT) & 3;
755
756}
757
758/* FIXME:
759
760 vcpu->svm->vmcb->control.int_ctl &= ~V_TPR_MASK;
761 vcpu->svm->vmcb->control.int_ctl |= (sregs->cr8 & V_TPR_MASK);
762
763*/
764
765static int svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg)
766{
767 return -EOPNOTSUPP;
768}
769
770static void load_host_msrs(struct kvm_vcpu *vcpu)
771{
772 int i;
773
774 for ( i = 0; i < NR_HOST_SAVE_MSRS; i++)
775 wrmsrl(host_save_msrs[i], vcpu->svm->host_msrs[i]);
776}
777
778static void save_host_msrs(struct kvm_vcpu *vcpu)
779{
780 int i;
781
782 for ( i = 0; i < NR_HOST_SAVE_MSRS; i++)
783 rdmsrl(host_save_msrs[i], vcpu->svm->host_msrs[i]);
784}
785
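/*
 * Hand out a fresh ASID from the per-cpu pool.  When the pool is exhausted
 * the generation counter is bumped and the TLB is flushed for all ASIDs, so
 * stale translations cannot leak between generations.
 */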
786static void new_asid(struct kvm_vcpu *vcpu, struct svm_cpu_data *svm_data)
787{
788 if (svm_data->next_asid > svm_data->max_asid) {
789 ++svm_data->asid_generation;
790 svm_data->next_asid = 1;
791 vcpu->svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ALL_ASID;
792 }
793
794 vcpu->cpu = svm_data->cpu;
795 vcpu->svm->asid_generation = svm_data->asid_generation;
796 vcpu->svm->vmcb->control.asid = svm_data->next_asid++;
797}
798
799static void svm_invlpg(struct kvm_vcpu *vcpu, gva_t address)
800{
801	invlpga(address, vcpu->svm->vmcb->control.asid); /* is this needed? */
802}
803
804static unsigned long svm_get_dr(struct kvm_vcpu *vcpu, int dr)
805{
806 return vcpu->svm->db_regs[dr];
807}
808
809static void svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value,
810 int *exception)
811{
812 *exception = 0;
813
814 if (vcpu->svm->vmcb->save.dr7 & DR7_GD_MASK) {
815 vcpu->svm->vmcb->save.dr7 &= ~DR7_GD_MASK;
816 vcpu->svm->vmcb->save.dr6 |= DR6_BD_MASK;
817 *exception = DB_VECTOR;
818 return;
819 }
820
821 switch (dr) {
822 case 0 ... 3:
823 vcpu->svm->db_regs[dr] = value;
824 return;
825 case 4 ... 5:
826 if (vcpu->cr4 & CR4_DE_MASK) {
827 *exception = UD_VECTOR;
828 return;
829 }
830 case 7: {
831 if (value & ~((1ULL << 32) - 1)) {
832 *exception = GP_VECTOR;
833 return;
834 }
835 vcpu->svm->vmcb->save.dr7 = value;
836 return;
837 }
838 default:
839 printk(KERN_DEBUG "%s: unexpected dr %u\n",
840 __FUNCTION__, dr);
841 *exception = UD_VECTOR;
842 return;
843 }
844}
845
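/*
 * Guest page fault: re-queue any interrupt that was being delivered when
 * the fault hit, then let the shadow MMU try to resolve the fault; if it
 * cannot, the faulting instruction is emulated, possibly turning into an
 * MMIO exit to userspace.
 */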
846static int pf_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
847{
848 u32 exit_int_info = vcpu->svm->vmcb->control.exit_int_info;
849 u64 fault_address;
850 u32 error_code;
851 enum emulation_result er;
852
853 if (is_external_interrupt(exit_int_info))
854 push_irq(vcpu, exit_int_info & SVM_EVTINJ_VEC_MASK);
855
856 spin_lock(&vcpu->kvm->lock);
857
858 fault_address = vcpu->svm->vmcb->control.exit_info_2;
859 error_code = vcpu->svm->vmcb->control.exit_info_1;
860 if (!vcpu->mmu.page_fault(vcpu, fault_address, error_code)) {
861 spin_unlock(&vcpu->kvm->lock);
862 return 1;
863 }
864 er = emulate_instruction(vcpu, kvm_run, fault_address, error_code);
865 spin_unlock(&vcpu->kvm->lock);
866
867 switch (er) {
868 case EMULATE_DONE:
869 return 1;
870 case EMULATE_DO_MMIO:
871 ++kvm_stat.mmio_exits;
872 kvm_run->exit_reason = KVM_EXIT_MMIO;
873 return 0;
874 case EMULATE_FAIL:
875 vcpu_printf(vcpu, "%s: emulate fail\n", __FUNCTION__);
876 break;
877 default:
878 BUG();
879 }
880
881 kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
882 return 0;
883}
884
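/*
 * Scan the legacy prefixes of the faulting IN/OUT instruction for a
 * segment override and an address-size override.
 */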
885static int io_get_override(struct kvm_vcpu *vcpu,
886 struct vmcb_seg **seg,
887 int *addr_override)
888{
889 u8 inst[MAX_INST_SIZE];
890 unsigned ins_length;
891 gva_t rip;
892 int i;
893
894 rip = vcpu->svm->vmcb->save.rip;
895 ins_length = vcpu->svm->next_rip - rip;
896 rip += vcpu->svm->vmcb->save.cs.base;
897
898 if (ins_length > MAX_INST_SIZE)
899 printk(KERN_DEBUG
900 "%s: inst length err, cs base 0x%llx rip 0x%llx "
901 "next rip 0x%llx ins_length %u\n",
902 __FUNCTION__,
903 vcpu->svm->vmcb->save.cs.base,
904 vcpu->svm->vmcb->save.rip,
905 vcpu->svm->vmcb->control.exit_info_2,
906 ins_length);
907
908 if (kvm_read_guest(vcpu, rip, ins_length, inst) != ins_length)
909 /* #PF */
910 return 0;
911
912 *addr_override = 0;
913	*seg = NULL;
914 for (i = 0; i < ins_length; i++)
915 switch (inst[i]) {
916 case 0xf0:
917 case 0xf2:
918 case 0xf3:
919 case 0x66:
920 continue;
921 case 0x67:
922 *addr_override = 1;
923 continue;
924 case 0x2e:
925 *seg = &vcpu->svm->vmcb->save.cs;
926 continue;
927 case 0x36:
928 *seg = &vcpu->svm->vmcb->save.ss;
929 continue;
930 case 0x3e:
931 *seg = &vcpu->svm->vmcb->save.ds;
932 continue;
933 case 0x26:
934 *seg = &vcpu->svm->vmcb->save.es;
935 continue;
936 case 0x64:
937 *seg = &vcpu->svm->vmcb->save.fs;
938 continue;
939 case 0x65:
940 *seg = &vcpu->svm->vmcb->save.gs;
941 continue;
942 default:
943 return 1;
944 }
945 printk(KERN_DEBUG "%s: unexpected\n", __FUNCTION__);
946 return 0;
947}
948
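/*
 * Compute the linear address for a string I/O instruction (ES:[e]DI for
 * INS, seg:[e]SI for OUTS) and return the mask matching the effective
 * address size, or 0 on failure.
 */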
949static unsigned long io_address(struct kvm_vcpu *vcpu, int ins, u64 *address)
950{
951 unsigned long addr_mask;
952 unsigned long *reg;
953 struct vmcb_seg *seg;
954 int addr_override;
955 struct vmcb_save_area *save_area = &vcpu->svm->vmcb->save;
956 u16 cs_attrib = save_area->cs.attrib;
957 unsigned addr_size = get_addr_size(vcpu);
958
959 if (!io_get_override(vcpu, &seg, &addr_override))
960 return 0;
961
962 if (addr_override)
963 addr_size = (addr_size == 2) ? 4: (addr_size >> 1);
964
965 if (ins) {
966 reg = &vcpu->regs[VCPU_REGS_RDI];
967 seg = &vcpu->svm->vmcb->save.es;
968 } else {
969 reg = &vcpu->regs[VCPU_REGS_RSI];
970 seg = (seg) ? seg : &vcpu->svm->vmcb->save.ds;
971 }
972
973 addr_mask = ~0ULL >> (64 - (addr_size * 8));
974
975 if ((cs_attrib & SVM_SELECTOR_L_MASK) &&
976 !(vcpu->svm->vmcb->save.rflags & X86_EFLAGS_VM)) {
977 *address = (*reg & addr_mask);
978 return addr_mask;
979 }
980
981	if (!(seg->attrib & SVM_SELECTOR_P_MASK)) {
982 svm_inject_gp(vcpu, 0);
983 return 0;
984 }
985
986 *address = (*reg & addr_mask) + seg->base;
987 return addr_mask;
988}
989
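/*
 * Decode an IOIO intercept into a KVM_EXIT_IO request so userspace can
 * emulate the port access; for string operations the linear address,
 * repeat count and direction are filled in as well.
 */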
990static int io_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
991{
992	u32 io_info = vcpu->svm->vmcb->control.exit_info_1; /* address size bug? */
993 int _in = io_info & SVM_IOIO_TYPE_MASK;
994
995 ++kvm_stat.io_exits;
996
997 vcpu->svm->next_rip = vcpu->svm->vmcb->control.exit_info_2;
998
999 kvm_run->exit_reason = KVM_EXIT_IO;
1000 kvm_run->io.port = io_info >> 16;
1001 kvm_run->io.direction = (_in) ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
1002 kvm_run->io.size = ((io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT);
1003 kvm_run->io.string = (io_info & SVM_IOIO_STR_MASK) != 0;
1004 kvm_run->io.rep = (io_info & SVM_IOIO_REP_MASK) != 0;
1005
1006 if (kvm_run->io.string) {
1007 unsigned addr_mask;
1008
1009		addr_mask = io_address(vcpu, _in, &kvm_run->io.address);
1010 if (!addr_mask) {
1011 printk(KERN_DEBUG "%s: get io address failed\n", __FUNCTION__);
1012 return 1;
1013 }
1014
1015 if (kvm_run->io.rep) {
1016 kvm_run->io.count = vcpu->regs[VCPU_REGS_RCX] & addr_mask;
1017 kvm_run->io.string_down = (vcpu->svm->vmcb->save.rflags
1018 & X86_EFLAGS_DF) != 0;
1019 }
1020 } else {
1021 kvm_run->io.value = vcpu->svm->vmcb->save.rax;
1022 }
1023 return 0;
1024}
1025
1026
1027static int nop_on_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1028{
1029 return 1;
1030}
1031
1032static int halt_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1033{
1034 vcpu->svm->next_rip = vcpu->svm->vmcb->save.rip + 1;
1035 skip_emulated_instruction(vcpu);
1036 if (vcpu->irq_summary && (vcpu->svm->vmcb->save.rflags & X86_EFLAGS_IF))
1037 return 1;
1038
1039 kvm_run->exit_reason = KVM_EXIT_HLT;
1040 return 0;
1041}
1042
1043static int invalid_op_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1044{
1045 inject_ud(vcpu);
1046 return 1;
1047}
1048
1049static int task_switch_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1050{
1051	printk(KERN_DEBUG "%s: task switch is unsupported\n", __FUNCTION__);
1052 kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
1053 return 0;
1054}
1055
1056static int cpuid_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1057{
1058 vcpu->svm->next_rip = vcpu->svm->vmcb->save.rip + 2;
1059 kvm_run->exit_reason = KVM_EXIT_CPUID;
1060 return 0;
1061}
1062
1063static int emulate_on_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1064{
1065 if (emulate_instruction(vcpu, 0, 0, 0) != EMULATE_DONE)
1066 printk(KERN_ERR "%s: failed\n", __FUNCTION__);
1067 return 1;
1068}
1069
1070static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
1071{
1072 switch (ecx) {
1073 case MSR_IA32_MC0_CTL:
1074 case MSR_IA32_MCG_STATUS:
1075 case MSR_IA32_MCG_CAP:
1076 case MSR_IA32_MC0_MISC:
1077 case MSR_IA32_MC0_MISC+4:
1078 case MSR_IA32_MC0_MISC+8:
1079 case MSR_IA32_MC0_MISC+12:
1080 case MSR_IA32_MC0_MISC+16:
1081 case MSR_IA32_UCODE_REV:
1082 /* MTRR registers */
1083 case 0xfe:
1084 case 0x200 ... 0x2ff:
1085 *data = 0;
1086 break;
1087 case MSR_IA32_TIME_STAMP_COUNTER: {
1088 u64 tsc;
1089
1090 rdtscll(tsc);
1091 *data = vcpu->svm->vmcb->control.tsc_offset + tsc;
1092 break;
1093 }
1094 case MSR_EFER:
1095 *data = vcpu->shadow_efer;
1096 break;
1097 case MSR_IA32_APICBASE:
1098 *data = vcpu->apic_base;
1099 break;
1100#ifdef __x86_64__
1101 case MSR_STAR:
1102 *data = vcpu->svm->vmcb->save.star;
1103 break;
1104 case MSR_LSTAR:
1105 *data = vcpu->svm->vmcb->save.lstar;
1106 break;
1107 case MSR_CSTAR:
1108 *data = vcpu->svm->vmcb->save.cstar;
1109 break;
1110 case MSR_KERNEL_GS_BASE:
1111 *data = vcpu->svm->vmcb->save.kernel_gs_base;
1112 break;
1113 case MSR_SYSCALL_MASK:
1114 *data = vcpu->svm->vmcb->save.sfmask;
1115 break;
1116#endif
1117 case MSR_IA32_SYSENTER_CS:
1118 *data = vcpu->svm->vmcb->save.sysenter_cs;
1119 break;
1120 case MSR_IA32_SYSENTER_EIP:
1121 *data = vcpu->svm->vmcb->save.sysenter_eip;
1122 break;
1123 case MSR_IA32_SYSENTER_ESP:
1124 *data = vcpu->svm->vmcb->save.sysenter_esp;
1125 break;
1126 default:
1127 printk(KERN_ERR "kvm: unhandled rdmsr: 0x%x\n", ecx);
1128 return 1;
1129 }
1130 return 0;
1131}
1132
1133static int rdmsr_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1134{
1135 u32 ecx = vcpu->regs[VCPU_REGS_RCX];
1136 u64 data;
1137
1138 if (svm_get_msr(vcpu, ecx, &data))
1139 svm_inject_gp(vcpu, 0);
1140 else {
1141 vcpu->svm->vmcb->save.rax = data & 0xffffffff;
1142 vcpu->regs[VCPU_REGS_RDX] = data >> 32;
1143 vcpu->svm->next_rip = vcpu->svm->vmcb->save.rip + 2;
1144 skip_emulated_instruction(vcpu);
1145 }
1146 return 1;
1147}
1148
1149static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
1150{
1151 switch (ecx) {
1152#ifdef __x86_64__
1153 case MSR_EFER:
1154 set_efer(vcpu, data);
1155 break;
1156#endif
1157 case MSR_IA32_MC0_STATUS:
1158 printk(KERN_WARNING "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n"
1159 , __FUNCTION__, data);
1160 break;
1161 case MSR_IA32_TIME_STAMP_COUNTER: {
1162 u64 tsc;
1163
1164 rdtscll(tsc);
1165 vcpu->svm->vmcb->control.tsc_offset = data - tsc;
1166 break;
1167 }
1168 case MSR_IA32_UCODE_REV:
1169 case MSR_IA32_UCODE_WRITE:
1170 case 0x200 ... 0x2ff: /* MTRRs */
1171 break;
1172 case MSR_IA32_APICBASE:
1173 vcpu->apic_base = data;
1174 break;
1175#ifdef __x86_64__
1176 case MSR_STAR:
1177 vcpu->svm->vmcb->save.star = data;
1178 break;
1179 case MSR_LSTAR:
1180 vcpu->svm->vmcb->save.lstar = data;
1181 break;
1182 case MSR_CSTAR:
1183 vcpu->svm->vmcb->save.cstar = data;
1184 break;
1185 case MSR_KERNEL_GS_BASE:
1186 vcpu->svm->vmcb->save.kernel_gs_base = data;
1187 break;
1188 case MSR_SYSCALL_MASK:
1189 vcpu->svm->vmcb->save.sfmask = data;
1190 break;
1191#endif
1192 case MSR_IA32_SYSENTER_CS:
1193 vcpu->svm->vmcb->save.sysenter_cs = data;
1194 break;
1195 case MSR_IA32_SYSENTER_EIP:
1196 vcpu->svm->vmcb->save.sysenter_eip = data;
1197 break;
1198 case MSR_IA32_SYSENTER_ESP:
1199 vcpu->svm->vmcb->save.sysenter_esp = data;
1200 break;
1201 default:
1202		printk(KERN_ERR "kvm: unhandled wrmsr: 0x%x\n", ecx);
1203 return 1;
1204 }
1205 return 0;
1206}
1207
1208static int wrmsr_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1209{
1210 u32 ecx = vcpu->regs[VCPU_REGS_RCX];
1211 u64 data = (vcpu->svm->vmcb->save.rax & -1u)
1212 | ((u64)(vcpu->regs[VCPU_REGS_RDX] & -1u) << 32);
1213 vcpu->svm->next_rip = vcpu->svm->vmcb->save.rip + 2;
1214 if (svm_set_msr(vcpu, ecx, data))
1215 svm_inject_gp(vcpu, 0);
1216 else
1217 skip_emulated_instruction(vcpu);
1218 return 1;
1219}
1220
1221static int msr_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1222{
1223 if (vcpu->svm->vmcb->control.exit_info_1)
1224 return wrmsr_interception(vcpu, kvm_run);
1225 else
1226 return rdmsr_interception(vcpu, kvm_run);
1227}
1228
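/*
 * Dispatch table indexed by the VMCB exit code; exits without a handler
 * are reported to userspace as KVM_EXIT_UNKNOWN by handle_exit().
 */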
1229static int (*svm_exit_handlers[])(struct kvm_vcpu *vcpu,
1230 struct kvm_run *kvm_run) = {
1231 [SVM_EXIT_READ_CR0] = emulate_on_interception,
1232 [SVM_EXIT_READ_CR3] = emulate_on_interception,
1233 [SVM_EXIT_READ_CR4] = emulate_on_interception,
1234 /* for now: */
1235 [SVM_EXIT_WRITE_CR0] = emulate_on_interception,
1236 [SVM_EXIT_WRITE_CR3] = emulate_on_interception,
1237 [SVM_EXIT_WRITE_CR4] = emulate_on_interception,
1238 [SVM_EXIT_READ_DR0] = emulate_on_interception,
1239 [SVM_EXIT_READ_DR1] = emulate_on_interception,
1240 [SVM_EXIT_READ_DR2] = emulate_on_interception,
1241 [SVM_EXIT_READ_DR3] = emulate_on_interception,
1242 [SVM_EXIT_WRITE_DR0] = emulate_on_interception,
1243 [SVM_EXIT_WRITE_DR1] = emulate_on_interception,
1244 [SVM_EXIT_WRITE_DR2] = emulate_on_interception,
1245 [SVM_EXIT_WRITE_DR3] = emulate_on_interception,
1246 [SVM_EXIT_WRITE_DR5] = emulate_on_interception,
1247 [SVM_EXIT_WRITE_DR7] = emulate_on_interception,
1248 [SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception,
1249 [SVM_EXIT_INTR] = nop_on_interception,
1250 [SVM_EXIT_NMI] = nop_on_interception,
1251 [SVM_EXIT_SMI] = nop_on_interception,
1252 [SVM_EXIT_INIT] = nop_on_interception,
1253 /* [SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception, */
1254 [SVM_EXIT_CPUID] = cpuid_interception,
1255 [SVM_EXIT_HLT] = halt_interception,
1256 [SVM_EXIT_INVLPG] = emulate_on_interception,
1257 [SVM_EXIT_INVLPGA] = invalid_op_interception,
1258 [SVM_EXIT_IOIO] = io_interception,
1259 [SVM_EXIT_MSR] = msr_interception,
1260 [SVM_EXIT_TASK_SWITCH] = task_switch_interception,
1261 [SVM_EXIT_VMRUN] = invalid_op_interception,
1262 [SVM_EXIT_VMMCALL] = invalid_op_interception,
1263 [SVM_EXIT_VMLOAD] = invalid_op_interception,
1264 [SVM_EXIT_VMSAVE] = invalid_op_interception,
1265 [SVM_EXIT_STGI] = invalid_op_interception,
1266 [SVM_EXIT_CLGI] = invalid_op_interception,
1267 [SVM_EXIT_SKINIT] = invalid_op_interception,
1268};
1269
1270
1271static int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1272{
1273 u32 exit_code = vcpu->svm->vmcb->control.exit_code;
1274
1275 kvm_run->exit_type = KVM_EXIT_TYPE_VM_EXIT;
1276
1277 if (is_external_interrupt(vcpu->svm->vmcb->control.exit_int_info) &&
1278 exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR)
1279		printk(KERN_ERR "%s: unexpected exit_int_info 0x%x "
1280 "exit_code 0x%x\n",
1281 __FUNCTION__, vcpu->svm->vmcb->control.exit_int_info,
1282 exit_code);
1283
1284	if (exit_code >= ARRAY_SIZE(svm_exit_handlers)
1285	    || !svm_exit_handlers[exit_code]) {
1286 kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
1287 printk(KERN_ERR "%s: 0x%x @ 0x%llx cr0 0x%lx rflags 0x%llx\n",
1288 __FUNCTION__,
1289 exit_code,
1290 vcpu->svm->vmcb->save.rip,
1291 vcpu->cr0,
1292 vcpu->svm->vmcb->save.rflags);
1293 return 0;
1294 }
1295
1296 return svm_exit_handlers[exit_code](vcpu, kvm_run);
1297}
1298
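/*
 * ltr requires an available (non-busy) TSS descriptor, so restore the
 * host TSS descriptor type to "available" before reloading TR.
 */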
1299static void reload_tss(struct kvm_vcpu *vcpu)
1300{
1301 int cpu = raw_smp_processor_id();
1302
1303 struct svm_cpu_data *svm_data = per_cpu(svm_data, cpu);
1304	svm_data->tss_desc->type = 9; /* available 32/64-bit TSS */
1305 load_TR_desc();
1306}
1307
1308static void pre_svm_run(struct kvm_vcpu *vcpu)
1309{
1310 int cpu = raw_smp_processor_id();
1311
1312 struct svm_cpu_data *svm_data = per_cpu(svm_data, cpu);
1313
1314 vcpu->svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;
1315 if (vcpu->cpu != cpu ||
1316 vcpu->svm->asid_generation != svm_data->asid_generation)
1317 new_asid(vcpu, svm_data);
1318}
1319
1320
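/*
 * Inject the next pending interrupt through the VMCB virtual interrupt
 * mechanism (V_IRQ); the priority field is simply set to the maximum
 * (0xf) rather than being derived from the vector.
 */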
1321static inline void kvm_try_inject_irq(struct kvm_vcpu *vcpu)
1322{
1323 struct vmcb_control_area *control;
1324
1325 if (!vcpu->irq_summary)
1326 return;
1327
1328 control = &vcpu->svm->vmcb->control;
1329
1330 control->int_vector = pop_irq(vcpu);
1331 control->int_ctl &= ~V_INTR_PRIO_MASK;
1332 control->int_ctl |= V_IRQ_MASK |
1333 ((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT);
1334}
1335
1336static void kvm_reput_irq(struct kvm_vcpu *vcpu)
1337{
1338 struct vmcb_control_area *control = &vcpu->svm->vmcb->control;
1339
1340 if (control->int_ctl & V_IRQ_MASK) {
1341 control->int_ctl &= ~V_IRQ_MASK;
1342 push_irq(vcpu, control->int_vector);
1343 }
1344}
1345
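/*
 * The guest's DR0-DR3 shadow values are switched by hand around VMRUN:
 * when the guest has breakpoints armed (dr7 low bits set), the host debug
 * registers are saved and the guest values loaded before entry, and the
 * host values restored afterwards.
 */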
1346static void save_db_regs(unsigned long *db_regs)
1347{
1348#ifdef __x86_64__
1349 asm ("mov %%dr0, %%rax \n\t"
1350 "mov %%rax, %[dr0] \n\t"
1351 "mov %%dr1, %%rax \n\t"
1352 "mov %%rax, %[dr1] \n\t"
1353 "mov %%dr2, %%rax \n\t"
1354 "mov %%rax, %[dr2] \n\t"
1355 "mov %%dr3, %%rax \n\t"
1356 "mov %%rax, %[dr3] \n\t"
1357 : [dr0] "=m"(db_regs[0]),
1358 [dr1] "=m"(db_regs[1]),
1359 [dr2] "=m"(db_regs[2]),
1360 [dr3] "=m"(db_regs[3])
1361 : : "rax");
1362#else
1363 asm ("mov %%dr0, %%eax \n\t"
1364 "mov %%eax, %[dr0] \n\t"
1365 "mov %%dr1, %%eax \n\t"
1366 "mov %%eax, %[dr1] \n\t"
1367 "mov %%dr2, %%eax \n\t"
1368 "mov %%eax, %[dr2] \n\t"
1369 "mov %%dr3, %%eax \n\t"
1370 "mov %%eax, %[dr3] \n\t"
1371 : [dr0] "=m"(db_regs[0]),
1372 [dr1] "=m"(db_regs[1]),
1373 [dr2] "=m"(db_regs[2]),
1374 [dr3] "=m"(db_regs[3])
1375 : : "eax");
1376#endif
1377}
1378
1379static void load_db_regs(unsigned long *db_regs)
1380{
1381 asm volatile ("mov %[dr0], %%dr0 \n\t"
1382 "mov %[dr1], %%dr1 \n\t"
1383 "mov %[dr2], %%dr2 \n\t"
1384 "mov %[dr3], %%dr3 \n\t"
1385 :
1386 : [dr0] "r"(db_regs[0]),
1387 [dr1] "r"(db_regs[1]),
1388 [dr2] "r"(db_regs[2]),
1389 [dr3] "r"(db_regs[3])
1390#ifdef __x86_64__
1391 : "rax");
1392#else
1393 : "eax");
1394#endif
1395}
1396
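/*
 * The main guest entry path: inject a pending interrupt, disable GIF,
 * save host state that VMRUN does not preserve (MSRs, cr2, debug
 * registers, segment selectors), execute VMLOAD/VMRUN/VMSAVE, restore the
 * host state and then handle the exit.  Loops until an exit has to be
 * completed in userspace or a signal is pending.
 */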
1397static int svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1398{
1399 u16 fs_selector;
1400 u16 gs_selector;
1401 u16 ldt_selector;
1402
1403again:
1404 kvm_try_inject_irq(vcpu);
1405
1406 clgi();
1407
1408 pre_svm_run(vcpu);
1409
1410 save_host_msrs(vcpu);
1411 fs_selector = read_fs();
1412 gs_selector = read_gs();
1413 ldt_selector = read_ldt();
1414 vcpu->svm->host_cr2 = kvm_read_cr2();
1415 vcpu->svm->host_dr6 = read_dr6();
1416 vcpu->svm->host_dr7 = read_dr7();
1417 vcpu->svm->vmcb->save.cr2 = vcpu->cr2;
1418
1419 if (vcpu->svm->vmcb->save.dr7 & 0xff) {
1420 write_dr7(0);
1421 save_db_regs(vcpu->svm->host_db_regs);
1422 load_db_regs(vcpu->svm->db_regs);
1423 }
1424 asm volatile (
1425#ifdef __x86_64__
1426 "push %%rbx; push %%rcx; push %%rdx;"
1427 "push %%rsi; push %%rdi; push %%rbp;"
1428 "push %%r8; push %%r9; push %%r10; push %%r11;"
1429 "push %%r12; push %%r13; push %%r14; push %%r15;"
1430#else
1431 "push %%ebx; push %%ecx; push %%edx;"
1432 "push %%esi; push %%edi; push %%ebp;"
1433#endif
1434
1435#ifdef __x86_64__
1436 "mov %c[rbx](%[vcpu]), %%rbx \n\t"
1437 "mov %c[rcx](%[vcpu]), %%rcx \n\t"
1438 "mov %c[rdx](%[vcpu]), %%rdx \n\t"
1439 "mov %c[rsi](%[vcpu]), %%rsi \n\t"
1440 "mov %c[rdi](%[vcpu]), %%rdi \n\t"
1441 "mov %c[rbp](%[vcpu]), %%rbp \n\t"
1442 "mov %c[r8](%[vcpu]), %%r8 \n\t"
1443 "mov %c[r9](%[vcpu]), %%r9 \n\t"
1444 "mov %c[r10](%[vcpu]), %%r10 \n\t"
1445 "mov %c[r11](%[vcpu]), %%r11 \n\t"
1446 "mov %c[r12](%[vcpu]), %%r12 \n\t"
1447 "mov %c[r13](%[vcpu]), %%r13 \n\t"
1448 "mov %c[r14](%[vcpu]), %%r14 \n\t"
1449 "mov %c[r15](%[vcpu]), %%r15 \n\t"
1450#else
1451 "mov %c[rbx](%[vcpu]), %%ebx \n\t"
1452 "mov %c[rcx](%[vcpu]), %%ecx \n\t"
1453 "mov %c[rdx](%[vcpu]), %%edx \n\t"
1454 "mov %c[rsi](%[vcpu]), %%esi \n\t"
1455 "mov %c[rdi](%[vcpu]), %%edi \n\t"
1456 "mov %c[rbp](%[vcpu]), %%ebp \n\t"
1457#endif
1458
1459#ifdef __x86_64__
1460 /* Enter guest mode */
1461 "push %%rax \n\t"
1462 "mov %c[svm](%[vcpu]), %%rax \n\t"
1463 "mov %c[vmcb](%%rax), %%rax \n\t"
1464 SVM_VMLOAD "\n\t"
1465 SVM_VMRUN "\n\t"
1466 SVM_VMSAVE "\n\t"
1467 "pop %%rax \n\t"
1468#else
1469 /* Enter guest mode */
1470 "push %%eax \n\t"
1471 "mov %c[svm](%[vcpu]), %%eax \n\t"
1472 "mov %c[vmcb](%%eax), %%eax \n\t"
1473 SVM_VMLOAD "\n\t"
1474 SVM_VMRUN "\n\t"
1475 SVM_VMSAVE "\n\t"
1476 "pop %%eax \n\t"
1477#endif
1478
1479 /* Save guest registers, load host registers */
1480#ifdef __x86_64__
1481 "mov %%rbx, %c[rbx](%[vcpu]) \n\t"
1482 "mov %%rcx, %c[rcx](%[vcpu]) \n\t"
1483 "mov %%rdx, %c[rdx](%[vcpu]) \n\t"
1484 "mov %%rsi, %c[rsi](%[vcpu]) \n\t"
1485 "mov %%rdi, %c[rdi](%[vcpu]) \n\t"
1486 "mov %%rbp, %c[rbp](%[vcpu]) \n\t"
1487 "mov %%r8, %c[r8](%[vcpu]) \n\t"
1488 "mov %%r9, %c[r9](%[vcpu]) \n\t"
1489 "mov %%r10, %c[r10](%[vcpu]) \n\t"
1490 "mov %%r11, %c[r11](%[vcpu]) \n\t"
1491 "mov %%r12, %c[r12](%[vcpu]) \n\t"
1492 "mov %%r13, %c[r13](%[vcpu]) \n\t"
1493 "mov %%r14, %c[r14](%[vcpu]) \n\t"
1494 "mov %%r15, %c[r15](%[vcpu]) \n\t"
1495
1496 "pop %%r15; pop %%r14; pop %%r13; pop %%r12;"
1497 "pop %%r11; pop %%r10; pop %%r9; pop %%r8;"
1498 "pop %%rbp; pop %%rdi; pop %%rsi;"
1499 "pop %%rdx; pop %%rcx; pop %%rbx; \n\t"
1500#else
1501 "mov %%ebx, %c[rbx](%[vcpu]) \n\t"
1502 "mov %%ecx, %c[rcx](%[vcpu]) \n\t"
1503 "mov %%edx, %c[rdx](%[vcpu]) \n\t"
1504 "mov %%esi, %c[rsi](%[vcpu]) \n\t"
1505 "mov %%edi, %c[rdi](%[vcpu]) \n\t"
1506 "mov %%ebp, %c[rbp](%[vcpu]) \n\t"
1507
1508 "pop %%ebp; pop %%edi; pop %%esi;"
1509 "pop %%edx; pop %%ecx; pop %%ebx; \n\t"
1510#endif
1511 :
1512 : [vcpu]"a"(vcpu),
1513 [svm]"i"(offsetof(struct kvm_vcpu, svm)),
1514 [vmcb]"i"(offsetof(struct vcpu_svm, vmcb_pa)),
1515 [rbx]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RBX])),
1516 [rcx]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RCX])),
1517 [rdx]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RDX])),
1518 [rsi]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RSI])),
1519 [rdi]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RDI])),
1520 [rbp]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RBP]))
1521#ifdef __x86_64__
1522 ,[r8 ]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R8 ])),
1523 [r9 ]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R9 ])),
1524 [r10]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R10])),
1525 [r11]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R11])),
1526 [r12]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R12])),
1527 [r13]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R13])),
1528 [r14]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R14])),
1529 [r15]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R15]))
1530#endif
1531 : "cc", "memory" );
1532
1533 if ((vcpu->svm->vmcb->save.dr7 & 0xff))
1534 load_db_regs(vcpu->svm->host_db_regs);
1535
1536 vcpu->cr2 = vcpu->svm->vmcb->save.cr2;
1537
1538 write_dr6(vcpu->svm->host_dr6);
1539 write_dr7(vcpu->svm->host_dr7);
1540 kvm_write_cr2(vcpu->svm->host_cr2);
1541
1542 load_fs(fs_selector);
1543 load_gs(gs_selector);
1544 load_ldt(ldt_selector);
1545 load_host_msrs(vcpu);
1546
1547 reload_tss(vcpu);
1548
1549 stgi();
1550
1551 kvm_reput_irq(vcpu);
1552
1553 vcpu->svm->next_rip = 0;
1554
1555 if (vcpu->svm->vmcb->control.exit_code == SVM_EXIT_ERR) {
1556 kvm_run->exit_type = KVM_EXIT_TYPE_FAIL_ENTRY;
1557 kvm_run->exit_reason = vcpu->svm->vmcb->control.exit_code;
1558 return 0;
1559 }
1560
1561 if (handle_exit(vcpu, kvm_run)) {
1562 if (signal_pending(current)) {
1563 ++kvm_stat.signal_exits;
1564 return -EINTR;
1565 }
1566 kvm_resched(vcpu);
1567 goto again;
1568 }
1569 return 0;
1570}
1571
1572static void svm_flush_tlb(struct kvm_vcpu *vcpu)
1573{
1574 force_new_asid(vcpu);
1575}
1576
1577static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root)
1578{
1579 vcpu->svm->vmcb->save.cr3 = root;
1580 force_new_asid(vcpu);
1581}
1582
1583static void svm_inject_page_fault(struct kvm_vcpu *vcpu,
1584 unsigned long addr,
1585 uint32_t err_code)
1586{
1587 uint32_t exit_int_info = vcpu->svm->vmcb->control.exit_int_info;
1588
1589 ++kvm_stat.pf_guest;
1590
1591 if (is_page_fault(exit_int_info)) {
1592
1593 vcpu->svm->vmcb->control.event_inj_err = 0;
1594 vcpu->svm->vmcb->control.event_inj = SVM_EVTINJ_VALID |
1595 SVM_EVTINJ_VALID_ERR |
1596 SVM_EVTINJ_TYPE_EXEPT |
1597 DF_VECTOR;
1598 return;
1599 }
1600 vcpu->cr2 = addr;
1601 vcpu->svm->vmcb->save.cr2 = addr;
1602 vcpu->svm->vmcb->control.event_inj = SVM_EVTINJ_VALID |
1603 SVM_EVTINJ_VALID_ERR |
1604 SVM_EVTINJ_TYPE_EXEPT |
1605 PF_VECTOR;
1606 vcpu->svm->vmcb->control.event_inj_err = err_code;
1607}
1608
1609
1610static int is_disabled(void)
1611{
1612 return 0;
1613}
1614
1615static struct kvm_arch_ops svm_arch_ops = {
1616 .cpu_has_kvm_support = has_svm,
1617 .disabled_by_bios = is_disabled,
1618 .hardware_setup = svm_hardware_setup,
1619 .hardware_unsetup = svm_hardware_unsetup,
1620 .hardware_enable = svm_hardware_enable,
1621 .hardware_disable = svm_hardware_disable,
1622
1623 .vcpu_create = svm_create_vcpu,
1624 .vcpu_free = svm_free_vcpu,
1625
1626 .vcpu_load = svm_vcpu_load,
1627 .vcpu_put = svm_vcpu_put,
1628
1629 .set_guest_debug = svm_guest_debug,
1630 .get_msr = svm_get_msr,
1631 .set_msr = svm_set_msr,
1632 .get_segment_base = svm_get_segment_base,
1633 .get_segment = svm_get_segment,
1634 .set_segment = svm_set_segment,
1635 .is_long_mode = svm_is_long_mode,
1636 .get_cs_db_l_bits = svm_get_cs_db_l_bits,
1637 .set_cr0 = svm_set_cr0,
1638 .set_cr0_no_modeswitch = svm_set_cr0,
1639 .set_cr3 = svm_set_cr3,
1640 .set_cr4 = svm_set_cr4,
1641 .set_efer = svm_set_efer,
1642 .get_idt = svm_get_idt,
1643 .set_idt = svm_set_idt,
1644 .get_gdt = svm_get_gdt,
1645 .set_gdt = svm_set_gdt,
1646 .get_dr = svm_get_dr,
1647 .set_dr = svm_set_dr,
1648 .cache_regs = svm_cache_regs,
1649 .decache_regs = svm_decache_regs,
1650 .get_rflags = svm_get_rflags,
1651 .set_rflags = svm_set_rflags,
1652
1653 .invlpg = svm_invlpg,
1654 .tlb_flush = svm_flush_tlb,
1655 .inject_page_fault = svm_inject_page_fault,
1656
1657 .inject_gp = svm_inject_gp,
1658
1659 .run = svm_vcpu_run,
1660 .skip_emulated_instruction = skip_emulated_instruction,
1661 .vcpu_setup = svm_vcpu_setup,
1662};
1663
1664static int __init svm_init(void)
1665{
1666 kvm_emulator_want_group7_invlpg();
1667 kvm_init_arch(&svm_arch_ops, THIS_MODULE);
1668 return 0;
1669}
1670
1671static void __exit svm_exit(void)
1672{
1673 kvm_exit_arch();
1674}
1675
1676module_init(svm_init)
1677module_exit(svm_exit)
diff --git a/drivers/kvm/svm.h b/drivers/kvm/svm.h
new file mode 100644
index 000000000000..df731c3fb588
--- /dev/null
+++ b/drivers/kvm/svm.h
@@ -0,0 +1,315 @@
1#ifndef __SVM_H
2#define __SVM_H
3
4enum {
5 INTERCEPT_INTR,
6 INTERCEPT_NMI,
7 INTERCEPT_SMI,
8 INTERCEPT_INIT,
9 INTERCEPT_VINTR,
10 INTERCEPT_SELECTIVE_CR0,
11 INTERCEPT_STORE_IDTR,
12 INTERCEPT_STORE_GDTR,
13 INTERCEPT_STORE_LDTR,
14 INTERCEPT_STORE_TR,
15 INTERCEPT_LOAD_IDTR,
16 INTERCEPT_LOAD_GDTR,
17 INTERCEPT_LOAD_LDTR,
18 INTERCEPT_LOAD_TR,
19 INTERCEPT_RDTSC,
20 INTERCEPT_RDPMC,
21 INTERCEPT_PUSHF,
22 INTERCEPT_POPF,
23 INTERCEPT_CPUID,
24 INTERCEPT_RSM,
25 INTERCEPT_IRET,
26 INTERCEPT_INTn,
27 INTERCEPT_INVD,
28 INTERCEPT_PAUSE,
29 INTERCEPT_HLT,
30 INTERCEPT_INVLPG,
31 INTERCEPT_INVLPGA,
32 INTERCEPT_IOIO_PROT,
33 INTERCEPT_MSR_PROT,
34 INTERCEPT_TASK_SWITCH,
35 INTERCEPT_FERR_FREEZE,
36 INTERCEPT_SHUTDOWN,
37 INTERCEPT_VMRUN,
38 INTERCEPT_VMMCALL,
39 INTERCEPT_VMLOAD,
40 INTERCEPT_VMSAVE,
41 INTERCEPT_STGI,
42 INTERCEPT_CLGI,
43 INTERCEPT_SKINIT,
44 INTERCEPT_RDTSCP,
45 INTERCEPT_ICEBP,
46 INTERCEPT_WBINVD,
47};
48
49
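/*
 * The VMCB (virtual machine control block) is a 4KB page made up of a
 * control area (intercept settings and exit information) followed by the
 * guest save area, laid out as described in AMD's SVM documentation.
 */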
50struct __attribute__ ((__packed__)) vmcb_control_area {
51 u16 intercept_cr_read;
52 u16 intercept_cr_write;
53 u16 intercept_dr_read;
54 u16 intercept_dr_write;
55 u32 intercept_exceptions;
56 u64 intercept;
57 u8 reserved_1[44];
58 u64 iopm_base_pa;
59 u64 msrpm_base_pa;
60 u64 tsc_offset;
61 u32 asid;
62 u8 tlb_ctl;
63 u8 reserved_2[3];
64 u32 int_ctl;
65 u32 int_vector;
66 u32 int_state;
67 u8 reserved_3[4];
68 u32 exit_code;
69 u32 exit_code_hi;
70 u64 exit_info_1;
71 u64 exit_info_2;
72 u32 exit_int_info;
73 u32 exit_int_info_err;
74 u64 nested_ctl;
75 u8 reserved_4[16];
76 u32 event_inj;
77 u32 event_inj_err;
78 u64 nested_cr3;
79 u64 lbr_ctl;
80 u8 reserved_5[832];
81};
82
83
84#define TLB_CONTROL_DO_NOTHING 0
85#define TLB_CONTROL_FLUSH_ALL_ASID 1
86
87#define V_TPR_MASK 0x0f
88
89#define V_IRQ_SHIFT 8
90#define V_IRQ_MASK (1 << V_IRQ_SHIFT)
91
92#define V_INTR_PRIO_SHIFT 16
93#define V_INTR_PRIO_MASK (0x0f << V_INTR_PRIO_SHIFT)
94
95#define V_IGN_TPR_SHIFT 20
96#define V_IGN_TPR_MASK (1 << V_IGN_TPR_SHIFT)
97
98#define V_INTR_MASKING_SHIFT 24
99#define V_INTR_MASKING_MASK (1 << V_INTR_MASKING_SHIFT)
100
101#define SVM_INTERRUPT_SHADOW_MASK 1
102
103#define SVM_IOIO_STR_SHIFT 2
104#define SVM_IOIO_REP_SHIFT 3
105#define SVM_IOIO_SIZE_SHIFT 4
106#define SVM_IOIO_ASIZE_SHIFT 7
107
108#define SVM_IOIO_TYPE_MASK 1
109#define SVM_IOIO_STR_MASK (1 << SVM_IOIO_STR_SHIFT)
110#define SVM_IOIO_REP_MASK (1 << SVM_IOIO_REP_SHIFT)
111#define SVM_IOIO_SIZE_MASK (7 << SVM_IOIO_SIZE_SHIFT)
112#define SVM_IOIO_ASIZE_MASK (7 << SVM_IOIO_ASIZE_SHIFT)
113
114struct __attribute__ ((__packed__)) vmcb_seg {
115 u16 selector;
116 u16 attrib;
117 u32 limit;
118 u64 base;
119};
120
121struct __attribute__ ((__packed__)) vmcb_save_area {
122 struct vmcb_seg es;
123 struct vmcb_seg cs;
124 struct vmcb_seg ss;
125 struct vmcb_seg ds;
126 struct vmcb_seg fs;
127 struct vmcb_seg gs;
128 struct vmcb_seg gdtr;
129 struct vmcb_seg ldtr;
130 struct vmcb_seg idtr;
131 struct vmcb_seg tr;
132 u8 reserved_1[43];
133 u8 cpl;
134 u8 reserved_2[4];
135 u64 efer;
136 u8 reserved_3[112];
137 u64 cr4;
138 u64 cr3;
139 u64 cr0;
140 u64 dr7;
141 u64 dr6;
142 u64 rflags;
143 u64 rip;
144 u8 reserved_4[88];
145 u64 rsp;
146 u8 reserved_5[24];
147 u64 rax;
148 u64 star;
149 u64 lstar;
150 u64 cstar;
151 u64 sfmask;
152 u64 kernel_gs_base;
153 u64 sysenter_cs;
154 u64 sysenter_esp;
155 u64 sysenter_eip;
156 u64 cr2;
157 u8 reserved_6[32];
158 u64 g_pat;
159 u64 dbgctl;
160 u64 br_from;
161 u64 br_to;
162 u64 last_excp_from;
163 u64 last_excp_to;
164};
165
166struct __attribute__ ((__packed__)) vmcb {
167 struct vmcb_control_area control;
168 struct vmcb_save_area save;
169};
170
171#define SVM_CPUID_FEATURE_SHIFT 2
172#define SVM_CPUID_FUNC 0x8000000a
173
174#define MSR_EFER_SVME_MASK (1ULL << 12)
175#define MSR_VM_HSAVE_PA 0xc0010117ULL
176
177#define SVM_SELECTOR_S_SHIFT 4
178#define SVM_SELECTOR_DPL_SHIFT 5
179#define SVM_SELECTOR_P_SHIFT 7
180#define SVM_SELECTOR_AVL_SHIFT 8
181#define SVM_SELECTOR_L_SHIFT 9
182#define SVM_SELECTOR_DB_SHIFT 10
183#define SVM_SELECTOR_G_SHIFT 11
184
185#define SVM_SELECTOR_TYPE_MASK (0xf)
186#define SVM_SELECTOR_S_MASK (1 << SVM_SELECTOR_S_SHIFT)
187#define SVM_SELECTOR_DPL_MASK (3 << SVM_SELECTOR_DPL_SHIFT)
188#define SVM_SELECTOR_P_MASK (1 << SVM_SELECTOR_P_SHIFT)
189#define SVM_SELECTOR_AVL_MASK (1 << SVM_SELECTOR_AVL_SHIFT)
190#define SVM_SELECTOR_L_MASK (1 << SVM_SELECTOR_L_SHIFT)
191#define SVM_SELECTOR_DB_MASK (1 << SVM_SELECTOR_DB_SHIFT)
192#define SVM_SELECTOR_G_MASK (1 << SVM_SELECTOR_G_SHIFT)
193
194#define SVM_SELECTOR_WRITE_MASK (1 << 1)
195#define SVM_SELECTOR_READ_MASK SVM_SELECTOR_WRITE_MASK
196#define SVM_SELECTOR_CODE_MASK (1 << 3)
197
198#define INTERCEPT_CR0_MASK 1
199#define INTERCEPT_CR3_MASK (1 << 3)
200#define INTERCEPT_CR4_MASK (1 << 4)
201
202#define INTERCEPT_DR0_MASK 1
203#define INTERCEPT_DR1_MASK (1 << 1)
204#define INTERCEPT_DR2_MASK (1 << 2)
205#define INTERCEPT_DR3_MASK (1 << 3)
206#define INTERCEPT_DR4_MASK (1 << 4)
207#define INTERCEPT_DR5_MASK (1 << 5)
208#define INTERCEPT_DR6_MASK (1 << 6)
209#define INTERCEPT_DR7_MASK (1 << 7)
210
211#define SVM_EVTINJ_VEC_MASK 0xff
212
213#define SVM_EVTINJ_TYPE_SHIFT 8
214#define SVM_EVTINJ_TYPE_MASK (7 << SVM_EVTINJ_TYPE_SHIFT)
215
216#define SVM_EVTINJ_TYPE_INTR (0 << SVM_EVTINJ_TYPE_SHIFT)
217#define SVM_EVTINJ_TYPE_NMI (2 << SVM_EVTINJ_TYPE_SHIFT)
218#define SVM_EVTINJ_TYPE_EXEPT (3 << SVM_EVTINJ_TYPE_SHIFT)
219#define SVM_EVTINJ_TYPE_SOFT (4 << SVM_EVTINJ_TYPE_SHIFT)
220
221#define SVM_EVTINJ_VALID (1 << 31)
222#define SVM_EVTINJ_VALID_ERR (1 << 11)
223
224#define SVM_EXITINTINFO_VEC_MASK SVM_EVTINJ_VEC_MASK
225
226#define SVM_EXITINTINFO_TYPE_INTR SVM_EVTINJ_TYPE_INTR
227#define SVM_EXITINTINFO_TYPE_NMI SVM_EVTINJ_TYPE_NMI
228#define SVM_EXITINTINFO_TYPE_EXEPT SVM_EVTINJ_TYPE_EXEPT
229#define SVM_EXITINTINFO_TYPE_SOFT SVM_EVTINJ_TYPE_SOFT
230
231#define SVM_EXITINTINFO_VALID SVM_EVTINJ_VALID
232#define SVM_EXITINTINFO_VALID_ERR SVM_EVTINJ_VALID_ERR
233
234#define SVM_EXIT_READ_CR0 0x000
235#define SVM_EXIT_READ_CR3 0x003
236#define SVM_EXIT_READ_CR4 0x004
237#define SVM_EXIT_READ_CR8 0x008
238#define SVM_EXIT_WRITE_CR0 0x010
239#define SVM_EXIT_WRITE_CR3 0x013
240#define SVM_EXIT_WRITE_CR4 0x014
241#define SVM_EXIT_WRITE_CR8 0x018
242#define SVM_EXIT_READ_DR0 0x020
243#define SVM_EXIT_READ_DR1 0x021
244#define SVM_EXIT_READ_DR2 0x022
245#define SVM_EXIT_READ_DR3 0x023
246#define SVM_EXIT_READ_DR4 0x024
247#define SVM_EXIT_READ_DR5 0x025
248#define SVM_EXIT_READ_DR6 0x026
249#define SVM_EXIT_READ_DR7 0x027
250#define SVM_EXIT_WRITE_DR0 0x030
251#define SVM_EXIT_WRITE_DR1 0x031
252#define SVM_EXIT_WRITE_DR2 0x032
253#define SVM_EXIT_WRITE_DR3 0x033
254#define SVM_EXIT_WRITE_DR4 0x034
255#define SVM_EXIT_WRITE_DR5 0x035
256#define SVM_EXIT_WRITE_DR6 0x036
257#define SVM_EXIT_WRITE_DR7 0x037
258#define SVM_EXIT_EXCP_BASE 0x040
259#define SVM_EXIT_INTR 0x060
260#define SVM_EXIT_NMI 0x061
261#define SVM_EXIT_SMI 0x062
262#define SVM_EXIT_INIT 0x063
263#define SVM_EXIT_VINTR 0x064
264#define SVM_EXIT_CR0_SEL_WRITE 0x065
265#define SVM_EXIT_IDTR_READ 0x066
266#define SVM_EXIT_GDTR_READ 0x067
267#define SVM_EXIT_LDTR_READ 0x068
268#define SVM_EXIT_TR_READ 0x069
269#define SVM_EXIT_IDTR_WRITE 0x06a
270#define SVM_EXIT_GDTR_WRITE 0x06b
271#define SVM_EXIT_LDTR_WRITE 0x06c
272#define SVM_EXIT_TR_WRITE 0x06d
273#define SVM_EXIT_RDTSC 0x06e
274#define SVM_EXIT_RDPMC 0x06f
275#define SVM_EXIT_PUSHF 0x070
276#define SVM_EXIT_POPF 0x071
277#define SVM_EXIT_CPUID 0x072
278#define SVM_EXIT_RSM 0x073
279#define SVM_EXIT_IRET 0x074
280#define SVM_EXIT_SWINT 0x075
281#define SVM_EXIT_INVD 0x076
282#define SVM_EXIT_PAUSE 0x077
283#define SVM_EXIT_HLT 0x078
284#define SVM_EXIT_INVLPG 0x079
285#define SVM_EXIT_INVLPGA 0x07a
286#define SVM_EXIT_IOIO 0x07b
287#define SVM_EXIT_MSR 0x07c
288#define SVM_EXIT_TASK_SWITCH 0x07d
289#define SVM_EXIT_FERR_FREEZE 0x07e
290#define SVM_EXIT_SHUTDOWN 0x07f
291#define SVM_EXIT_VMRUN 0x080
292#define SVM_EXIT_VMMCALL 0x081
293#define SVM_EXIT_VMLOAD 0x082
294#define SVM_EXIT_VMSAVE 0x083
295#define SVM_EXIT_STGI 0x084
296#define SVM_EXIT_CLGI 0x085
297#define SVM_EXIT_SKINIT 0x086
298#define SVM_EXIT_RDTSCP 0x087
299#define SVM_EXIT_ICEBP 0x088
300#define SVM_EXIT_WBINVD 0x089
301#define SVM_EXIT_NPF 0x400
302
303#define SVM_EXIT_ERR -1
304
305#define SVM_CR0_SELECTIVE_MASK (1 << 3 | 1) /* TS and MP */
306
307#define SVM_VMLOAD ".byte 0x0f, 0x01, 0xda"
308#define SVM_VMRUN ".byte 0x0f, 0x01, 0xd8"
309#define SVM_VMSAVE ".byte 0x0f, 0x01, 0xdb"
310#define SVM_CLGI ".byte 0x0f, 0x01, 0xdd"
311#define SVM_STGI ".byte 0x0f, 0x01, 0xdc"
312#define SVM_INVLPGA ".byte 0x0f, 0x01, 0xdf"
313
314#endif
315
diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c
new file mode 100644
index 000000000000..bda7a7ae2167
--- /dev/null
+++ b/drivers/kvm/vmx.c
@@ -0,0 +1,2002 @@
1/*
2 * Kernel-based Virtual Machine driver for Linux
3 *
4 * This module enables machines with Intel VT-x extensions to run virtual
5 * machines without emulation or binary translation.
6 *
7 * Copyright (C) 2006 Qumranet, Inc.
8 *
9 * Authors:
10 * Avi Kivity <avi@qumranet.com>
11 * Yaniv Kamay <yaniv@qumranet.com>
12 *
13 * This work is licensed under the terms of the GNU GPL, version 2. See
14 * the COPYING file in the top-level directory.
15 *
16 */
17
18#include "kvm.h"
19#include "vmx.h"
20#include "kvm_vmx.h"
21#include <linux/module.h>
22#include <linux/mm.h>
23#include <linux/highmem.h>
24#include <asm/io.h>
25
26#include "segment_descriptor.h"
27
28#define MSR_IA32_FEATURE_CONTROL 0x03a
29
30MODULE_AUTHOR("Qumranet");
31MODULE_LICENSE("GPL");
32
33static DEFINE_PER_CPU(struct vmcs *, vmxarea);
34static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
35
36#ifdef __x86_64__
37#define HOST_IS_64 1
38#else
39#define HOST_IS_64 0
40#endif
41
42static struct vmcs_descriptor {
43 int size;
44 int order;
45 u32 revision_id;
46} vmcs_descriptor;
47
48#define VMX_SEGMENT_FIELD(seg) \
49 [VCPU_SREG_##seg] = { \
50 .selector = GUEST_##seg##_SELECTOR, \
51 .base = GUEST_##seg##_BASE, \
52 .limit = GUEST_##seg##_LIMIT, \
53 .ar_bytes = GUEST_##seg##_AR_BYTES, \
54 }
55
56static struct kvm_vmx_segment_field {
57 unsigned selector;
58 unsigned base;
59 unsigned limit;
60 unsigned ar_bytes;
61} kvm_vmx_segment_fields[] = {
62 VMX_SEGMENT_FIELD(CS),
63 VMX_SEGMENT_FIELD(DS),
64 VMX_SEGMENT_FIELD(ES),
65 VMX_SEGMENT_FIELD(FS),
66 VMX_SEGMENT_FIELD(GS),
67 VMX_SEGMENT_FIELD(SS),
68 VMX_SEGMENT_FIELD(TR),
69 VMX_SEGMENT_FIELD(LDTR),
70};
71
72static const u32 vmx_msr_index[] = {
73#ifdef __x86_64__
74 MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, MSR_KERNEL_GS_BASE,
75#endif
76 MSR_EFER, MSR_K6_STAR,
77};
78#define NR_VMX_MSR (sizeof(vmx_msr_index) / sizeof(*vmx_msr_index))
79
80struct vmx_msr_entry *find_msr_entry(struct kvm_vcpu *vcpu, u32 msr);
81
82static inline int is_page_fault(u32 intr_info)
83{
84 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
85 INTR_INFO_VALID_MASK)) ==
86 (INTR_TYPE_EXCEPTION | PF_VECTOR | INTR_INFO_VALID_MASK);
87}
88
89static inline int is_external_interrupt(u32 intr_info)
90{
91 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
92 == (INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK);
93}
94
95static void vmcs_clear(struct vmcs *vmcs)
96{
97 u64 phys_addr = __pa(vmcs);
98 u8 error;
99
100 asm volatile (ASM_VMX_VMCLEAR_RAX "; setna %0"
101 : "=g"(error) : "a"(&phys_addr), "m"(phys_addr)
102 : "cc", "memory");
103 if (error)
104 printk(KERN_ERR "kvm: vmclear fail: %p/%llx\n",
105 vmcs, phys_addr);
106}
107
108static void __vcpu_clear(void *arg)
109{
110 struct kvm_vcpu *vcpu = arg;
111 int cpu = smp_processor_id();
112
113 if (vcpu->cpu == cpu)
114 vmcs_clear(vcpu->vmcs);
115 if (per_cpu(current_vmcs, cpu) == vcpu->vmcs)
116 per_cpu(current_vmcs, cpu) = NULL;
117}
118
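/*
 * VMCS field accessors built on the VMREAD/VMWRITE instructions; 64-bit
 * fields need two accesses on 32-bit hosts.
 */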
119static unsigned long vmcs_readl(unsigned long field)
120{
121 unsigned long value;
122
123 asm volatile (ASM_VMX_VMREAD_RDX_RAX
124 : "=a"(value) : "d"(field) : "cc");
125 return value;
126}
127
128static u16 vmcs_read16(unsigned long field)
129{
130 return vmcs_readl(field);
131}
132
133static u32 vmcs_read32(unsigned long field)
134{
135 return vmcs_readl(field);
136}
137
138static u64 vmcs_read64(unsigned long field)
139{
140#ifdef __x86_64__
141 return vmcs_readl(field);
142#else
143 return vmcs_readl(field) | ((u64)vmcs_readl(field+1) << 32);
144#endif
145}
146
147static void vmcs_writel(unsigned long field, unsigned long value)
148{
149 u8 error;
150
151 asm volatile (ASM_VMX_VMWRITE_RAX_RDX "; setna %0"
152 : "=q"(error) : "a"(value), "d"(field) : "cc" );
153 if (error)
154 printk(KERN_ERR "vmwrite error: reg %lx value %lx (err %d)\n",
155 field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
156}
157
158static void vmcs_write16(unsigned long field, u16 value)
159{
160 vmcs_writel(field, value);
161}
162
163static void vmcs_write32(unsigned long field, u32 value)
164{
165 vmcs_writel(field, value);
166}
167
168static void vmcs_write64(unsigned long field, u64 value)
169{
170#ifdef __x86_64__
171 vmcs_writel(field, value);
172#else
173 vmcs_writel(field, value);
174 asm volatile ("");
175 vmcs_writel(field+1, value >> 32);
176#endif
177}
178
179/*
180 * Switches to specified vcpu, until a matching vcpu_put(), but assumes
181 * vcpu mutex is already taken.
182 */
183static struct kvm_vcpu *vmx_vcpu_load(struct kvm_vcpu *vcpu)
184{
185 u64 phys_addr = __pa(vcpu->vmcs);
186 int cpu;
187
188 cpu = get_cpu();
189
190 if (vcpu->cpu != cpu) {
191 smp_call_function(__vcpu_clear, vcpu, 0, 1);
192 vcpu->launched = 0;
193 }
194
195 if (per_cpu(current_vmcs, cpu) != vcpu->vmcs) {
196 u8 error;
197
198 per_cpu(current_vmcs, cpu) = vcpu->vmcs;
199 asm volatile (ASM_VMX_VMPTRLD_RAX "; setna %0"
200 : "=g"(error) : "a"(&phys_addr), "m"(phys_addr)
201 : "cc");
202 if (error)
203 printk(KERN_ERR "kvm: vmptrld %p/%llx fail\n",
204 vcpu->vmcs, phys_addr);
205 }
206
207 if (vcpu->cpu != cpu) {
208 struct descriptor_table dt;
209 unsigned long sysenter_esp;
210
211 vcpu->cpu = cpu;
212 /*
213 * Linux uses per-cpu TSS and GDT, so set these when switching
214 * processors.
215 */
216 vmcs_writel(HOST_TR_BASE, read_tr_base()); /* 22.2.4 */
217 get_gdt(&dt);
218 vmcs_writel(HOST_GDTR_BASE, dt.base); /* 22.2.4 */
219
220 rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
221 vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
222 }
223 return vcpu;
224}
225
226static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
227{
228 put_cpu();
229}
230
231static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
232{
233 return vmcs_readl(GUEST_RFLAGS);
234}
235
236static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
237{
238 vmcs_writel(GUEST_RFLAGS, rflags);
239}
240
241static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
242{
243 unsigned long rip;
244 u32 interruptibility;
245
246 rip = vmcs_readl(GUEST_RIP);
247 rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
248 vmcs_writel(GUEST_RIP, rip);
249
250 /*
251 * We emulated an instruction, so temporary interrupt blocking
252 * should be removed, if set.
253 */
254 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
255 if (interruptibility & 3)
256 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
257 interruptibility & ~3);
258}
259
260static void vmx_inject_gp(struct kvm_vcpu *vcpu, unsigned error_code)
261{
262 printk(KERN_DEBUG "inject_general_protection: rip 0x%lx\n",
263 vmcs_readl(GUEST_RIP));
264 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
265 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
266 GP_VECTOR |
267 INTR_TYPE_EXCEPTION |
268 INTR_INFO_DELIEVER_CODE_MASK |
269 INTR_INFO_VALID_MASK);
270}
271
272/*
273 * reads and returns guest's timestamp counter "register"
274 * guest_tsc = host_tsc + tsc_offset -- 21.3
275 */
276static u64 guest_read_tsc(void)
277{
278 u64 host_tsc, tsc_offset;
279
280 rdtscll(host_tsc);
281 tsc_offset = vmcs_read64(TSC_OFFSET);
282 return host_tsc + tsc_offset;
283}
284
285/*
286 * writes 'guest_tsc' into guest's timestamp counter "register"
287 * guest_tsc = host_tsc + tsc_offset ==> tsc_offset = guest_tsc - host_tsc
288 */
289static void guest_write_tsc(u64 guest_tsc)
290{
291 u64 host_tsc;
292
293 rdtscll(host_tsc);
294 vmcs_write64(TSC_OFFSET, guest_tsc - host_tsc);
295}
296
297static void reload_tss(void)
298{
299#ifndef __x86_64__
300
301 /*
302 * VT restores TR but not its size. Useless.
303 */
304 struct descriptor_table gdt;
305 struct segment_descriptor *descs;
306
307 get_gdt(&gdt);
308 descs = (void *)gdt.base;
309 descs[GDT_ENTRY_TSS].type = 9; /* available TSS */
310 load_TR_desc();
311#endif
312}
313
314/*
315 * Reads an msr value (of 'msr_index') into 'pdata'.
316 * Returns 0 on success, non-0 otherwise.
317 * Assumes vcpu_load() was already called.
318 */
319static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
320{
321 u64 data;
322 struct vmx_msr_entry *msr;
323
324 if (!pdata) {
325 printk(KERN_ERR "BUG: get_msr called with NULL pdata\n");
326 return -EINVAL;
327 }
328
329 switch (msr_index) {
330#ifdef __x86_64__
331 case MSR_FS_BASE:
332 data = vmcs_readl(GUEST_FS_BASE);
333 break;
334 case MSR_GS_BASE:
335 data = vmcs_readl(GUEST_GS_BASE);
336 break;
337 case MSR_EFER:
338 data = vcpu->shadow_efer;
339 break;
340#endif
341 case MSR_IA32_TIME_STAMP_COUNTER:
342 data = guest_read_tsc();
343 break;
344 case MSR_IA32_SYSENTER_CS:
345 data = vmcs_read32(GUEST_SYSENTER_CS);
346 break;
347 case MSR_IA32_SYSENTER_EIP:
348 data = vmcs_read32(GUEST_SYSENTER_EIP);
349 break;
350 case MSR_IA32_SYSENTER_ESP:
351 data = vmcs_read32(GUEST_SYSENTER_ESP);
352 break;
353 case MSR_IA32_MC0_CTL:
354 case MSR_IA32_MCG_STATUS:
355 case MSR_IA32_MCG_CAP:
356 case MSR_IA32_MC0_MISC:
357 case MSR_IA32_MC0_MISC+4:
358 case MSR_IA32_MC0_MISC+8:
359 case MSR_IA32_MC0_MISC+12:
360 case MSR_IA32_MC0_MISC+16:
361 case MSR_IA32_UCODE_REV:
362 /* MTRR registers */
363 case 0xfe:
364 case 0x200 ... 0x2ff:
365 data = 0;
366 break;
367 case MSR_IA32_APICBASE:
368 data = vcpu->apic_base;
369 break;
370 default:
371 msr = find_msr_entry(vcpu, msr_index);
372 if (!msr) {
373 printk(KERN_ERR "kvm: unhandled rdmsr: %x\n", msr_index);
374 return 1;
375 }
376 data = msr->data;
377 break;
378 }
379
380 *pdata = data;
381 return 0;
382}
383
384/*
385 * Writes msr value into the appropriate "register".
386 * Returns 0 on success, non-0 otherwise.
387 * Assumes vcpu_load() was already called.
388 */
389static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
390{
391 struct vmx_msr_entry *msr;
392 switch (msr_index) {
393#ifdef __x86_64__
394 case MSR_FS_BASE:
395 vmcs_writel(GUEST_FS_BASE, data);
396 break;
397 case MSR_GS_BASE:
398 vmcs_writel(GUEST_GS_BASE, data);
399 break;
400#endif
401 case MSR_IA32_SYSENTER_CS:
402 vmcs_write32(GUEST_SYSENTER_CS, data);
403 break;
404 case MSR_IA32_SYSENTER_EIP:
405 vmcs_write32(GUEST_SYSENTER_EIP, data);
406 break;
407 case MSR_IA32_SYSENTER_ESP:
408 vmcs_write32(GUEST_SYSENTER_ESP, data);
409 break;
410#ifdef __x86_64__
411 case MSR_EFER:
412 set_efer(vcpu, data);
413 break;
414 case MSR_IA32_MC0_STATUS:
415 printk(KERN_WARNING "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n"
416 , __FUNCTION__, data);
417 break;
418#endif
419 case MSR_IA32_TIME_STAMP_COUNTER: {
420 guest_write_tsc(data);
421 break;
422 }
423 case MSR_IA32_UCODE_REV:
424 case MSR_IA32_UCODE_WRITE:
425 case 0x200 ... 0x2ff: /* MTRRs */
426 break;
427 case MSR_IA32_APICBASE:
428 vcpu->apic_base = data;
429 break;
430 default:
431 msr = find_msr_entry(vcpu, msr_index);
432 if (!msr) {
433 printk(KERN_ERR "kvm: unhandled wrmsr: 0x%x\n", msr_index);
434 return 1;
435 }
436 msr->data = data;
437 break;
438 }
439
440 return 0;
441}
442
443/*
444 * Sync the rsp and rip registers into the vcpu structure. This allows
445 * registers to be accessed by indexing vcpu->regs.
446 */
447static void vcpu_load_rsp_rip(struct kvm_vcpu *vcpu)
448{
449 vcpu->regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP);
450 vcpu->rip = vmcs_readl(GUEST_RIP);
451}
452
453/*
454 * Syncs rsp and rip back into the vmcs. Should be called after possible
455 * modification.
456 */
457static void vcpu_put_rsp_rip(struct kvm_vcpu *vcpu)
458{
459 vmcs_writel(GUEST_RSP, vcpu->regs[VCPU_REGS_RSP]);
460 vmcs_writel(GUEST_RIP, vcpu->rip);
461}
462
463static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg)
464{
465 unsigned long dr7 = 0x400;
466 u32 exception_bitmap;
467 int old_singlestep;
468
469 exception_bitmap = vmcs_read32(EXCEPTION_BITMAP);
470 old_singlestep = vcpu->guest_debug.singlestep;
471
472 vcpu->guest_debug.enabled = dbg->enabled;
473 if (vcpu->guest_debug.enabled) {
474 int i;
475
476 dr7 |= 0x200; /* exact */
477 for (i = 0; i < 4; ++i) {
478 if (!dbg->breakpoints[i].enabled)
479 continue;
480 vcpu->guest_debug.bp[i] = dbg->breakpoints[i].address;
481 dr7 |= 2 << (i*2); /* global enable */
482 dr7 |= 0 << (i*4+16); /* execution breakpoint */
483 }
484
485 exception_bitmap |= (1u << 1); /* Trap debug exceptions */
486
487 vcpu->guest_debug.singlestep = dbg->singlestep;
488 } else {
489 exception_bitmap &= ~(1u << 1); /* Ignore debug exceptions */
490 vcpu->guest_debug.singlestep = 0;
491 }
492
493 if (old_singlestep && !vcpu->guest_debug.singlestep) {
494 unsigned long flags;
495
496 flags = vmcs_readl(GUEST_RFLAGS);
497 flags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
498 vmcs_writel(GUEST_RFLAGS, flags);
499 }
500
501 vmcs_write32(EXCEPTION_BITMAP, exception_bitmap);
502 vmcs_writel(GUEST_DR7, dr7);
503
504 return 0;
505}
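/*
 * The dr7 value assembled above follows the architectural layout: bit 9
 * (0x200) selects exact breakpoint matching, 2 << (i*2) sets the global
 * enable bit for breakpoint i, and leaving the R/W and LEN fields at
 * bits i*4+16 zero makes it a one-byte execution breakpoint.  Trapping
 * vector 1 in the exception bitmap then routes the resulting #DB
 * exceptions to the host-side debugger.
 */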
506
507static __init int cpu_has_kvm_support(void)
508{
509 unsigned long ecx = cpuid_ecx(1);
510 return test_bit(5, &ecx); /* CPUID.1:ECX.VMX[bit 5] -> VT */
511}
512
513static __init int vmx_disabled_by_bios(void)
514{
515 u64 msr;
516
517 rdmsrl(MSR_IA32_FEATURE_CONTROL, msr);
518 return (msr & 5) == 1; /* locked but not enabled */
519}
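/*
 * In MSR_IA32_FEATURE_CONTROL, bit 0 is the lock bit and bit 2 enables
 * VMXON, so (msr & 5) == 1 above means the BIOS locked the MSR with VMX
 * left disabled, while hardware_enable() below sets both bits (old | 5)
 * when it finds the MSR still unlocked.
 */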
520
521static __init void hardware_enable(void *garbage)
522{
523 int cpu = raw_smp_processor_id();
524 u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
525 u64 old;
526
527 rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
528 if ((old & 5) == 0)
529 /* enable and lock */
530 wrmsrl(MSR_IA32_FEATURE_CONTROL, old | 5);
531 write_cr4(read_cr4() | CR4_VMXE); /* FIXME: not cpu hotplug safe */
532 asm volatile (ASM_VMX_VMXON_RAX : : "a"(&phys_addr), "m"(phys_addr)
533 : "memory", "cc");
534}
535
536static void hardware_disable(void *garbage)
537{
538 asm volatile (ASM_VMX_VMXOFF : : : "cc");
539}
540
541static __init void setup_vmcs_descriptor(void)
542{
543 u32 vmx_msr_low, vmx_msr_high;
544
545 rdmsr(MSR_IA32_VMX_BASIC_MSR, vmx_msr_low, vmx_msr_high);
546 vmcs_descriptor.size = vmx_msr_high & 0x1fff;
547 vmcs_descriptor.order = get_order(vmcs_descriptor.size);
548 vmcs_descriptor.revision_id = vmx_msr_low;
549}
550
551static struct vmcs *alloc_vmcs_cpu(int cpu)
552{
553 int node = cpu_to_node(cpu);
554 struct page *pages;
555 struct vmcs *vmcs;
556
557 pages = alloc_pages_node(node, GFP_KERNEL, vmcs_descriptor.order);
558 if (!pages)
559 return NULL;
560 vmcs = page_address(pages);
561 memset(vmcs, 0, vmcs_descriptor.size);
562 vmcs->revision_id = vmcs_descriptor.revision_id; /* vmcs revision id */
563 return vmcs;
564}
565
566static struct vmcs *alloc_vmcs(void)
567{
568 return alloc_vmcs_cpu(smp_processor_id());
569}
570
571static void free_vmcs(struct vmcs *vmcs)
572{
573 free_pages((unsigned long)vmcs, vmcs_descriptor.order);
574}
575
576static __exit void free_kvm_area(void)
577{
578 int cpu;
579
580 for_each_online_cpu(cpu)
581 free_vmcs(per_cpu(vmxarea, cpu));
582}
583
584extern struct vmcs *alloc_vmcs_cpu(int cpu);
585
586static __init int alloc_kvm_area(void)
587{
588 int cpu;
589
590 for_each_online_cpu(cpu) {
591 struct vmcs *vmcs;
592
593 vmcs = alloc_vmcs_cpu(cpu);
594 if (!vmcs) {
595 free_kvm_area();
596 return -ENOMEM;
597 }
598
599 per_cpu(vmxarea, cpu) = vmcs;
600 }
601 return 0;
602}
603
604static __init int hardware_setup(void)
605{
606 setup_vmcs_descriptor();
607 return alloc_kvm_area();
608}
609
610static __exit void hardware_unsetup(void)
611{
612 free_kvm_area();
613}
614
615static void update_exception_bitmap(struct kvm_vcpu *vcpu)
616{
617 if (vcpu->rmode.active)
618 vmcs_write32(EXCEPTION_BITMAP, ~0);
619 else
620 vmcs_write32(EXCEPTION_BITMAP, 1 << PF_VECTOR);
621}
622
623static void fix_pmode_dataseg(int seg, struct kvm_save_segment *save)
624{
625 struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
626
627 if (vmcs_readl(sf->base) == save->base) {
628 vmcs_write16(sf->selector, save->selector);
629 vmcs_writel(sf->base, save->base);
630 vmcs_write32(sf->limit, save->limit);
631 vmcs_write32(sf->ar_bytes, save->ar);
632 } else {
633 u32 dpl = (vmcs_read16(sf->selector) & SELECTOR_RPL_MASK)
634 << AR_DPL_SHIFT;
635 vmcs_write32(sf->ar_bytes, 0x93 | dpl);
636 }
637}
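/*
 * When returning to protected mode, the saved segment state is restored
 * only if the base still matches the snapshot taken on entry to real
 * mode (i.e. the guest presumably did not reload the register in
 * between); otherwise a plain present read/write data segment (0x93) is
 * synthesized, with its DPL taken from the current selector's RPL.
 */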
638
639static void enter_pmode(struct kvm_vcpu *vcpu)
640{
641 unsigned long flags;
642
643 vcpu->rmode.active = 0;
644
645 vmcs_writel(GUEST_TR_BASE, vcpu->rmode.tr.base);
646 vmcs_write32(GUEST_TR_LIMIT, vcpu->rmode.tr.limit);
647 vmcs_write32(GUEST_TR_AR_BYTES, vcpu->rmode.tr.ar);
648
649 flags = vmcs_readl(GUEST_RFLAGS);
650 flags &= ~(IOPL_MASK | X86_EFLAGS_VM);
651 flags |= (vcpu->rmode.save_iopl << IOPL_SHIFT);
652 vmcs_writel(GUEST_RFLAGS, flags);
653
654 vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~CR4_VME_MASK) |
655 (vmcs_readl(CR4_READ_SHADOW) & CR4_VME_MASK));
656
657 update_exception_bitmap(vcpu);
658
659 fix_pmode_dataseg(VCPU_SREG_ES, &vcpu->rmode.es);
660 fix_pmode_dataseg(VCPU_SREG_DS, &vcpu->rmode.ds);
661 fix_pmode_dataseg(VCPU_SREG_GS, &vcpu->rmode.gs);
662 fix_pmode_dataseg(VCPU_SREG_FS, &vcpu->rmode.fs);
663
664 vmcs_write16(GUEST_SS_SELECTOR, 0);
665 vmcs_write32(GUEST_SS_AR_BYTES, 0x93);
666
667 vmcs_write16(GUEST_CS_SELECTOR,
668 vmcs_read16(GUEST_CS_SELECTOR) & ~SELECTOR_RPL_MASK);
669 vmcs_write32(GUEST_CS_AR_BYTES, 0x9b);
670}
671
672static int rmode_tss_base(struct kvm* kvm)
673{
674 gfn_t base_gfn = kvm->memslots[0].base_gfn + kvm->memslots[0].npages - 3;
675 return base_gfn << PAGE_SHIFT;
676}
677
678static void fix_rmode_seg(int seg, struct kvm_save_segment *save)
679{
680 struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
681
682 save->selector = vmcs_read16(sf->selector);
683 save->base = vmcs_readl(sf->base);
684 save->limit = vmcs_read32(sf->limit);
685 save->ar = vmcs_read32(sf->ar_bytes);
686 vmcs_write16(sf->selector, vmcs_readl(sf->base) >> 4);
687 vmcs_write32(sf->limit, 0xffff);
688 vmcs_write32(sf->ar_bytes, 0xf3);
689}
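/*
 * The selector written above follows real-mode addressing rules, where
 * the segment base is always selector << 4: storing base >> 4 (e.g.
 * base 0x12340 -> selector 0x1234) keeps selector and base consistent
 * for vm86 execution, and 0xf3 marks a present, DPL 3, read/write data
 * segment.
 */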
690
691static void enter_rmode(struct kvm_vcpu *vcpu)
692{
693 unsigned long flags;
694
695 vcpu->rmode.active = 1;
696
697 vcpu->rmode.tr.base = vmcs_readl(GUEST_TR_BASE);
698 vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm));
699
700 vcpu->rmode.tr.limit = vmcs_read32(GUEST_TR_LIMIT);
701 vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1);
702
703 vcpu->rmode.tr.ar = vmcs_read32(GUEST_TR_AR_BYTES);
704 vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
705
706 flags = vmcs_readl(GUEST_RFLAGS);
707 vcpu->rmode.save_iopl = (flags & IOPL_MASK) >> IOPL_SHIFT;
708
709 flags |= IOPL_MASK | X86_EFLAGS_VM;
710
711 vmcs_writel(GUEST_RFLAGS, flags);
712 vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | CR4_VME_MASK);
713 update_exception_bitmap(vcpu);
714
715 vmcs_write16(GUEST_SS_SELECTOR, vmcs_readl(GUEST_SS_BASE) >> 4);
716 vmcs_write32(GUEST_SS_LIMIT, 0xffff);
717 vmcs_write32(GUEST_SS_AR_BYTES, 0xf3);
718
719 vmcs_write32(GUEST_CS_AR_BYTES, 0xf3);
720 vmcs_write16(GUEST_CS_SELECTOR, vmcs_readl(GUEST_CS_BASE) >> 4);
721
722 fix_rmode_seg(VCPU_SREG_ES, &vcpu->rmode.es);
723 fix_rmode_seg(VCPU_SREG_DS, &vcpu->rmode.ds);
724 fix_rmode_seg(VCPU_SREG_GS, &vcpu->rmode.gs);
725 fix_rmode_seg(VCPU_SREG_FS, &vcpu->rmode.fs);
726}
727
728#ifdef __x86_64__
729
730static void enter_lmode(struct kvm_vcpu *vcpu)
731{
732 u32 guest_tr_ar;
733
734 guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES);
735 if ((guest_tr_ar & AR_TYPE_MASK) != AR_TYPE_BUSY_64_TSS) {
736 printk(KERN_DEBUG "%s: tss fixup for long mode\n",
737 __FUNCTION__);
738 vmcs_write32(GUEST_TR_AR_BYTES,
739 (guest_tr_ar & ~AR_TYPE_MASK)
740 | AR_TYPE_BUSY_64_TSS);
741 }
742
743 vcpu->shadow_efer |= EFER_LMA;
744
745 find_msr_entry(vcpu, MSR_EFER)->data |= EFER_LMA | EFER_LME;
746 vmcs_write32(VM_ENTRY_CONTROLS,
747 vmcs_read32(VM_ENTRY_CONTROLS)
748 | VM_ENTRY_CONTROLS_IA32E_MASK);
749}
750
751static void exit_lmode(struct kvm_vcpu *vcpu)
752{
753 vcpu->shadow_efer &= ~EFER_LMA;
754
755 vmcs_write32(VM_ENTRY_CONTROLS,
756 vmcs_read32(VM_ENTRY_CONTROLS)
757 & ~VM_ENTRY_CONTROLS_IA32E_MASK);
758}
759
760#endif
761
762static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
763{
764 if (vcpu->rmode.active && (cr0 & CR0_PE_MASK))
765 enter_pmode(vcpu);
766
767 if (!vcpu->rmode.active && !(cr0 & CR0_PE_MASK))
768 enter_rmode(vcpu);
769
770#ifdef __x86_64__
771 if (vcpu->shadow_efer & EFER_LME) {
772 if (!is_paging(vcpu) && (cr0 & CR0_PG_MASK))
773 enter_lmode(vcpu);
774 if (is_paging(vcpu) && !(cr0 & CR0_PG_MASK))
775 exit_lmode(vcpu);
776 }
777#endif
778
779 vmcs_writel(CR0_READ_SHADOW, cr0);
780 vmcs_writel(GUEST_CR0,
781 (cr0 & ~KVM_GUEST_CR0_MASK) | KVM_VM_CR0_ALWAYS_ON);
782 vcpu->cr0 = cr0;
783}
784
785/*
786 * Used when restoring the VM to avoid corrupting segment registers
787 */
788static void vmx_set_cr0_no_modeswitch(struct kvm_vcpu *vcpu, unsigned long cr0)
789{
790 vcpu->rmode.active = ((cr0 & CR0_PE_MASK) == 0);
791 update_exception_bitmap(vcpu);
792 vmcs_writel(CR0_READ_SHADOW, cr0);
793 vmcs_writel(GUEST_CR0,
794 (cr0 & ~KVM_GUEST_CR0_MASK) | KVM_VM_CR0_ALWAYS_ON);
795 vcpu->cr0 = cr0;
796}
797
798static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
799{
800 vmcs_writel(GUEST_CR3, cr3);
801}
802
803static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
804{
805 vmcs_writel(CR4_READ_SHADOW, cr4);
806 vmcs_writel(GUEST_CR4, cr4 | (vcpu->rmode.active ?
807 KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON));
808 vcpu->cr4 = cr4;
809}
810
811#ifdef __x86_64__
812
813static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
814{
815 struct vmx_msr_entry *msr = find_msr_entry(vcpu, MSR_EFER);
816
817 vcpu->shadow_efer = efer;
818 if (efer & EFER_LMA) {
819 vmcs_write32(VM_ENTRY_CONTROLS,
820 vmcs_read32(VM_ENTRY_CONTROLS) |
821 VM_ENTRY_CONTROLS_IA32E_MASK);
822 msr->data = efer;
823
824 } else {
825 vmcs_write32(VM_ENTRY_CONTROLS,
826 vmcs_read32(VM_ENTRY_CONTROLS) &
827 ~VM_ENTRY_CONTROLS_IA32E_MASK);
828
829 msr->data = efer & ~EFER_LME;
830 }
831}
832
833#endif
834
835static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg)
836{
837 struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
838
839 return vmcs_readl(sf->base);
840}
841
842static void vmx_get_segment(struct kvm_vcpu *vcpu,
843 struct kvm_segment *var, int seg)
844{
845 struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
846 u32 ar;
847
848 var->base = vmcs_readl(sf->base);
849 var->limit = vmcs_read32(sf->limit);
850 var->selector = vmcs_read16(sf->selector);
851 ar = vmcs_read32(sf->ar_bytes);
852 if (ar & AR_UNUSABLE_MASK)
853 ar = 0;
854 var->type = ar & 15;
855 var->s = (ar >> 4) & 1;
856 var->dpl = (ar >> 5) & 3;
857 var->present = (ar >> 7) & 1;
858 var->avl = (ar >> 12) & 1;
859 var->l = (ar >> 13) & 1;
860 var->db = (ar >> 14) & 1;
861 var->g = (ar >> 15) & 1;
862 var->unusable = (ar >> 16) & 1;
863}
864
865static void vmx_set_segment(struct kvm_vcpu *vcpu,
866 struct kvm_segment *var, int seg)
867{
868 struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
869 u32 ar;
870
871 vmcs_writel(sf->base, var->base);
872 vmcs_write32(sf->limit, var->limit);
873 vmcs_write16(sf->selector, var->selector);
874 if (var->unusable)
875 ar = 1 << 16;
876 else {
877 ar = var->type & 15;
878 ar |= (var->s & 1) << 4;
879 ar |= (var->dpl & 3) << 5;
880 ar |= (var->present & 1) << 7;
881 ar |= (var->avl & 1) << 12;
882 ar |= (var->l & 1) << 13;
883 ar |= (var->db & 1) << 14;
884 ar |= (var->g & 1) << 15;
885 }
886 vmcs_write32(sf->ar_bytes, ar);
887}
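/*
 * The ar_bytes word packed above mirrors the VMX access-rights
 * encoding: bits 3:0 type, bit 4 S, bits 6:5 DPL, bit 7 P, bit 12 AVL,
 * bit 13 L, bit 14 D/B, bit 15 G, bit 16 unusable.  For example, the
 * value 0x9b used for CS elsewhere in this file decodes to an accessed
 * execute/read code segment (type 11) with S=1, DPL=0 and P=1.
 */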
888
889static int vmx_is_long_mode(struct kvm_vcpu *vcpu)
890{
891 return vmcs_read32(VM_ENTRY_CONTROLS) & VM_ENTRY_CONTROLS_IA32E_MASK;
892}
893
894static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
895{
896 u32 ar = vmcs_read32(GUEST_CS_AR_BYTES);
897
898 *db = (ar >> 14) & 1;
899 *l = (ar >> 13) & 1;
900}
901
902static void vmx_get_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
903{
904 dt->limit = vmcs_read32(GUEST_IDTR_LIMIT);
905 dt->base = vmcs_readl(GUEST_IDTR_BASE);
906}
907
908static void vmx_set_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
909{
910 vmcs_write32(GUEST_IDTR_LIMIT, dt->limit);
911 vmcs_writel(GUEST_IDTR_BASE, dt->base);
912}
913
914static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
915{
916 dt->limit = vmcs_read32(GUEST_GDTR_LIMIT);
917 dt->base = vmcs_readl(GUEST_GDTR_BASE);
918}
919
920static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
921{
922 vmcs_write32(GUEST_GDTR_LIMIT, dt->limit);
923 vmcs_writel(GUEST_GDTR_BASE, dt->base);
924}
925
926static int init_rmode_tss(struct kvm* kvm)
927{
928 struct page *p1, *p2, *p3;
929 gfn_t fn = rmode_tss_base(kvm) >> PAGE_SHIFT;
930 char *page;
931
932 p1 = _gfn_to_page(kvm, fn++);
933 p2 = _gfn_to_page(kvm, fn++);
934 p3 = _gfn_to_page(kvm, fn);
935
936 if (!p1 || !p2 || !p3) {
937 kvm_printf(kvm,"%s: gfn_to_page failed\n", __FUNCTION__);
938 return 0;
939 }
940
941 page = kmap_atomic(p1, KM_USER0);
942 memset(page, 0, PAGE_SIZE);
943 *(u16*)(page + 0x66) = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE;
944 kunmap_atomic(page, KM_USER0);
945
946 page = kmap_atomic(p2, KM_USER0);
947 memset(page, 0, PAGE_SIZE);
948 kunmap_atomic(page, KM_USER0);
949
950 page = kmap_atomic(p3, KM_USER0);
951 memset(page, 0, PAGE_SIZE);
952 *(page + RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1) = ~0;
953 kunmap_atomic(page, KM_USER0);
954
955 return 1;
956}
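/*
 * The real-mode TSS initialized here spans the three pages reserved by
 * rmode_tss_base(): offset 0x66 is the TSS I/O map base field, set to
 * TSS_BASE_SIZE + TSS_REDIRECTION_SIZE so the I/O bitmap starts right
 * after the interrupt redirection bitmap, and the final byte of the
 * structure is set to ~0, the customary terminator byte for the I/O
 * permission bitmap.
 */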
957
958static void vmcs_write32_fixedbits(u32 msr, u32 vmcs_field, u32 val)
959{
960 u32 msr_high, msr_low;
961
962 rdmsr(msr, msr_low, msr_high);
963
964 val &= msr_high;
965 val |= msr_low;
966 vmcs_write32(vmcs_field, val);
967}
968
969static void seg_setup(int seg)
970{
971 struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
972
973 vmcs_write16(sf->selector, 0);
974 vmcs_writel(sf->base, 0);
975 vmcs_write32(sf->limit, 0xffff);
976 vmcs_write32(sf->ar_bytes, 0x93);
977}
978
979/*
980 * Sets up the vmcs for emulated real mode.
981 */
982static int vmx_vcpu_setup(struct kvm_vcpu *vcpu)
983{
984 u32 host_sysenter_cs;
985 u32 junk;
986 unsigned long a;
987 struct descriptor_table dt;
988 int i;
989 int ret = 0;
990 int nr_good_msrs;
991 extern asmlinkage void kvm_vmx_return(void);
992
993 if (!init_rmode_tss(vcpu->kvm)) {
994 ret = -ENOMEM;
995 goto out;
996 }
997
998 memset(vcpu->regs, 0, sizeof(vcpu->regs));
999 vcpu->regs[VCPU_REGS_RDX] = get_rdx_init_val();
1000 vcpu->cr8 = 0;
1001 vcpu->apic_base = 0xfee00000 |
1002 /*for vcpu 0*/ MSR_IA32_APICBASE_BSP |
1003 MSR_IA32_APICBASE_ENABLE;
1004
1005 fx_init(vcpu);
1006
1007 /*
1008 * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode
1009 * insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4. Sigh.
1010 */
1011 vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
1012 vmcs_writel(GUEST_CS_BASE, 0x000f0000);
1013 vmcs_write32(GUEST_CS_LIMIT, 0xffff);
1014 vmcs_write32(GUEST_CS_AR_BYTES, 0x9b);
1015
1016 seg_setup(VCPU_SREG_DS);
1017 seg_setup(VCPU_SREG_ES);
1018 seg_setup(VCPU_SREG_FS);
1019 seg_setup(VCPU_SREG_GS);
1020 seg_setup(VCPU_SREG_SS);
1021
1022 vmcs_write16(GUEST_TR_SELECTOR, 0);
1023 vmcs_writel(GUEST_TR_BASE, 0);
1024 vmcs_write32(GUEST_TR_LIMIT, 0xffff);
1025 vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
1026
1027 vmcs_write16(GUEST_LDTR_SELECTOR, 0);
1028 vmcs_writel(GUEST_LDTR_BASE, 0);
1029 vmcs_write32(GUEST_LDTR_LIMIT, 0xffff);
1030 vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082);
1031
1032 vmcs_write32(GUEST_SYSENTER_CS, 0);
1033 vmcs_writel(GUEST_SYSENTER_ESP, 0);
1034 vmcs_writel(GUEST_SYSENTER_EIP, 0);
1035
1036 vmcs_writel(GUEST_RFLAGS, 0x02);
1037 vmcs_writel(GUEST_RIP, 0xfff0);
1038 vmcs_writel(GUEST_RSP, 0);
1039
1040 vmcs_writel(GUEST_CR3, 0);
1041
1042 /* TODO: dr0 = dr1 = dr2 = dr3 = 0; dr6 = 0xffff0ff0 */
1043 vmcs_writel(GUEST_DR7, 0x400);
1044
1045 vmcs_writel(GUEST_GDTR_BASE, 0);
1046 vmcs_write32(GUEST_GDTR_LIMIT, 0xffff);
1047
1048 vmcs_writel(GUEST_IDTR_BASE, 0);
1049 vmcs_write32(GUEST_IDTR_LIMIT, 0xffff);
1050
1051 vmcs_write32(GUEST_ACTIVITY_STATE, 0);
1052 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
1053 vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0);
1054
1055 /* I/O */
1056 vmcs_write64(IO_BITMAP_A, 0);
1057 vmcs_write64(IO_BITMAP_B, 0);
1058
1059 guest_write_tsc(0);
1060
1061 vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */
1062
1063 /* Special registers */
1064 vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
1065
1066 /* Control */
1067 vmcs_write32_fixedbits(MSR_IA32_VMX_PINBASED_CTLS_MSR,
1068 PIN_BASED_VM_EXEC_CONTROL,
1069 PIN_BASED_EXT_INTR_MASK /* 20.6.1 */
1070 | PIN_BASED_NMI_EXITING /* 20.6.1 */
1071 );
1072 vmcs_write32_fixedbits(MSR_IA32_VMX_PROCBASED_CTLS_MSR,
1073 CPU_BASED_VM_EXEC_CONTROL,
1074 CPU_BASED_HLT_EXITING /* 20.6.2 */
1075 | CPU_BASED_CR8_LOAD_EXITING /* 20.6.2 */
1076 | CPU_BASED_CR8_STORE_EXITING /* 20.6.2 */
1077 | CPU_BASED_UNCOND_IO_EXITING /* 20.6.2 */
1078 | CPU_BASED_INVDPG_EXITING
1079 | CPU_BASED_MOV_DR_EXITING
1080 | CPU_BASED_USE_TSC_OFFSETING /* 21.3 */
1081 );
1082
1083 vmcs_write32(EXCEPTION_BITMAP, 1 << PF_VECTOR);
1084 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
1085 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
1086 vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */
1087
1088 vmcs_writel(HOST_CR0, read_cr0()); /* 22.2.3 */
1089 vmcs_writel(HOST_CR4, read_cr4()); /* 22.2.3, 22.2.5 */
1090 vmcs_writel(HOST_CR3, read_cr3()); /* 22.2.3 FIXME: shadow tables */
1091
1092 vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */
1093 vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */
1094 vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */
1095 vmcs_write16(HOST_FS_SELECTOR, read_fs()); /* 22.2.4 */
1096 vmcs_write16(HOST_GS_SELECTOR, read_gs()); /* 22.2.4 */
1097 vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */
1098#ifdef __x86_64__
1099 rdmsrl(MSR_FS_BASE, a);
1100 vmcs_writel(HOST_FS_BASE, a); /* 22.2.4 */
1101 rdmsrl(MSR_GS_BASE, a);
1102 vmcs_writel(HOST_GS_BASE, a); /* 22.2.4 */
1103#else
1104 vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */
1105 vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */
1106#endif
1107
1108 vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */
1109
1110 get_idt(&dt);
1111 vmcs_writel(HOST_IDTR_BASE, dt.base); /* 22.2.4 */
1112
1113
1114 vmcs_writel(HOST_RIP, (unsigned long)kvm_vmx_return); /* 22.2.5 */
1115
1116 rdmsr(MSR_IA32_SYSENTER_CS, host_sysenter_cs, junk);
1117 vmcs_write32(HOST_IA32_SYSENTER_CS, host_sysenter_cs);
1118 rdmsrl(MSR_IA32_SYSENTER_ESP, a);
1119 vmcs_writel(HOST_IA32_SYSENTER_ESP, a); /* 22.2.3 */
1120 rdmsrl(MSR_IA32_SYSENTER_EIP, a);
1121 vmcs_writel(HOST_IA32_SYSENTER_EIP, a); /* 22.2.3 */
1122
1123 ret = -ENOMEM;
1124 vcpu->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL);
1125 if (!vcpu->guest_msrs)
1126 goto out;
1127 vcpu->host_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL);
1128 if (!vcpu->host_msrs)
1129 goto out_free_guest_msrs;
1130
1131 for (i = 0; i < NR_VMX_MSR; ++i) {
1132 u32 index = vmx_msr_index[i];
1133 u32 data_low, data_high;
1134 u64 data;
1135 int j = vcpu->nmsrs;
1136
1137 if (rdmsr_safe(index, &data_low, &data_high) < 0)
1138 continue;
1139 data = data_low | ((u64)data_high << 32);
1140 vcpu->host_msrs[j].index = index;
1141 vcpu->host_msrs[j].reserved = 0;
1142 vcpu->host_msrs[j].data = data;
1143 vcpu->guest_msrs[j] = vcpu->host_msrs[j];
1144 ++vcpu->nmsrs;
1145 }
1146 printk(KERN_DEBUG "kvm: msrs: %d\n", vcpu->nmsrs);
1147
1148 nr_good_msrs = vcpu->nmsrs - NR_BAD_MSRS;
1149 vmcs_writel(VM_ENTRY_MSR_LOAD_ADDR,
1150 virt_to_phys(vcpu->guest_msrs + NR_BAD_MSRS));
1151 vmcs_writel(VM_EXIT_MSR_STORE_ADDR,
1152 virt_to_phys(vcpu->guest_msrs + NR_BAD_MSRS));
1153 vmcs_writel(VM_EXIT_MSR_LOAD_ADDR,
1154 virt_to_phys(vcpu->host_msrs + NR_BAD_MSRS));
1155 vmcs_write32_fixedbits(MSR_IA32_VMX_EXIT_CTLS_MSR, VM_EXIT_CONTROLS,
1156 (HOST_IS_64 << 9)); /* 22.2.1, 20.7.1 */
1157 vmcs_write32(VM_EXIT_MSR_STORE_COUNT, nr_good_msrs); /* 22.2.2 */
1158 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, nr_good_msrs); /* 22.2.2 */
1159 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, nr_good_msrs); /* 22.2.2 */
1160
1161
1162 /* 22.2.1, 20.8.1 */
1163 vmcs_write32_fixedbits(MSR_IA32_VMX_ENTRY_CTLS_MSR,
1164 VM_ENTRY_CONTROLS, 0);
1165 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); /* 22.2.1 */
1166
1167 vmcs_writel(VIRTUAL_APIC_PAGE_ADDR, 0);
1168 vmcs_writel(TPR_THRESHOLD, 0);
1169
1170 vmcs_writel(CR0_GUEST_HOST_MASK, KVM_GUEST_CR0_MASK);
1171 vmcs_writel(CR4_GUEST_HOST_MASK, KVM_GUEST_CR4_MASK);
1172
1173 vcpu->cr0 = 0x60000010;
1174 vmx_set_cr0(vcpu, vcpu->cr0); // enter rmode
1175 vmx_set_cr4(vcpu, 0);
1176#ifdef __x86_64__
1177 vmx_set_efer(vcpu, 0);
1178#endif
1179
1180 return 0;
1181
1182out_free_guest_msrs:
1183 kfree(vcpu->guest_msrs);
1184out:
1185 return ret;
1186}
1187
1188static void inject_rmode_irq(struct kvm_vcpu *vcpu, int irq)
1189{
1190 u16 ent[2];
1191 u16 cs;
1192 u16 ip;
1193 unsigned long flags;
1194 unsigned long ss_base = vmcs_readl(GUEST_SS_BASE);
1195 u16 sp = vmcs_readl(GUEST_RSP);
1196 u32 ss_limit = vmcs_read32(GUEST_SS_LIMIT);
1197
1198 if (sp > ss_limit || sp - 6 > sp) {
1199 vcpu_printf(vcpu, "%s: #SS, rsp 0x%lx ss 0x%lx limit 0x%x\n",
1200 __FUNCTION__,
1201 vmcs_readl(GUEST_RSP),
1202 vmcs_readl(GUEST_SS_BASE),
1203 vmcs_read32(GUEST_SS_LIMIT));
1204 return;
1205 }
1206
1207 if (kvm_read_guest(vcpu, irq * sizeof(ent), sizeof(ent), &ent) !=
1208 sizeof(ent)) {
1209 vcpu_printf(vcpu, "%s: read guest err\n", __FUNCTION__);
1210 return;
1211 }
1212
1213 flags = vmcs_readl(GUEST_RFLAGS);
1214 cs = vmcs_readl(GUEST_CS_BASE) >> 4;
1215 ip = vmcs_readl(GUEST_RIP);
1216
1217
1218 if (kvm_write_guest(vcpu, ss_base + sp - 2, 2, &flags) != 2 ||
1219 kvm_write_guest(vcpu, ss_base + sp - 4, 2, &cs) != 2 ||
1220 kvm_write_guest(vcpu, ss_base + sp - 6, 2, &ip) != 2) {
1221 vcpu_printf(vcpu, "%s: write guest err\n", __FUNCTION__);
1222 return;
1223 }
1224
1225 vmcs_writel(GUEST_RFLAGS, flags &
1226 ~( X86_EFLAGS_IF | X86_EFLAGS_AC | X86_EFLAGS_TF));
1227 vmcs_write16(GUEST_CS_SELECTOR, ent[1]);
1228 vmcs_writel(GUEST_CS_BASE, ent[1] << 4);
1229 vmcs_writel(GUEST_RIP, ent[0]);
1230 vmcs_writel(GUEST_RSP, (vmcs_readl(GUEST_RSP) & ~0xffff) | (sp - 6));
1231}
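/*
 * The function above mimics what a real-mode CPU does on an external
 * interrupt: it fetches the 4-byte IVT entry at linear address irq * 4
 * (offset in ent[0], segment in ent[1]), pushes flags, cs and ip onto
 * the guest stack, clears IF, TF and AC, and redirects cs:ip to the
 * handler.  For irq 8, for instance, the vector comes from address 0x20.
 */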
1232
1233static void kvm_do_inject_irq(struct kvm_vcpu *vcpu)
1234{
1235 int word_index = __ffs(vcpu->irq_summary);
1236 int bit_index = __ffs(vcpu->irq_pending[word_index]);
1237 int irq = word_index * BITS_PER_LONG + bit_index;
1238
1239 clear_bit(bit_index, &vcpu->irq_pending[word_index]);
1240 if (!vcpu->irq_pending[word_index])
1241 clear_bit(word_index, &vcpu->irq_summary);
1242
1243 if (vcpu->rmode.active) {
1244 inject_rmode_irq(vcpu, irq);
1245 return;
1246 }
1247 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
1248 irq | INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK);
1249}
1250
1251static void kvm_try_inject_irq(struct kvm_vcpu *vcpu)
1252{
1253 if ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF)
1254 && (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0)
1255 /*
1256 * Interrupts enabled, and not blocked by sti or mov ss. Good.
1257 */
1258 kvm_do_inject_irq(vcpu);
1259 else
1260 /*
1261 * Interrupts blocked. Wait for unblock.
1262 */
1263 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
1264 vmcs_read32(CPU_BASED_VM_EXEC_CONTROL)
1265 | CPU_BASED_VIRTUAL_INTR_PENDING);
1266}
1267
1268static void kvm_guest_debug_pre(struct kvm_vcpu *vcpu)
1269{
1270 struct kvm_guest_debug *dbg = &vcpu->guest_debug;
1271
1272 set_debugreg(dbg->bp[0], 0);
1273 set_debugreg(dbg->bp[1], 1);
1274 set_debugreg(dbg->bp[2], 2);
1275 set_debugreg(dbg->bp[3], 3);
1276
1277 if (dbg->singlestep) {
1278 unsigned long flags;
1279
1280 flags = vmcs_readl(GUEST_RFLAGS);
1281 flags |= X86_EFLAGS_TF | X86_EFLAGS_RF;
1282 vmcs_writel(GUEST_RFLAGS, flags);
1283 }
1284}
1285
1286static int handle_rmode_exception(struct kvm_vcpu *vcpu,
1287 int vec, u32 err_code)
1288{
1289 if (!vcpu->rmode.active)
1290 return 0;
1291
1292 if (vec == GP_VECTOR && err_code == 0)
1293 if (emulate_instruction(vcpu, NULL, 0, 0) == EMULATE_DONE)
1294 return 1;
1295 return 0;
1296}
1297
1298static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1299{
1300 u32 intr_info, error_code;
1301 unsigned long cr2, rip;
1302 u32 vect_info;
1303 enum emulation_result er;
1304
1305 vect_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
1306 intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
1307
1308 if ((vect_info & VECTORING_INFO_VALID_MASK) &&
1309 !is_page_fault(intr_info)) {
1310 printk(KERN_ERR "%s: unexpected, vectoring info 0x%x "
1311 "intr info 0x%x\n", __FUNCTION__, vect_info, intr_info);
1312 }
1313
1314 if (is_external_interrupt(vect_info)) {
1315 int irq = vect_info & VECTORING_INFO_VECTOR_MASK;
1316 set_bit(irq, vcpu->irq_pending);
1317 set_bit(irq / BITS_PER_LONG, &vcpu->irq_summary);
1318 }
1319
1320 if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200) { /* nmi */
1321 asm ("int $2");
1322 return 1;
1323 }
1324 error_code = 0;
1325 rip = vmcs_readl(GUEST_RIP);
1326 if (intr_info & INTR_INFO_DELIEVER_CODE_MASK)
1327 error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
1328 if (is_page_fault(intr_info)) {
1329 cr2 = vmcs_readl(EXIT_QUALIFICATION);
1330
1331 spin_lock(&vcpu->kvm->lock);
1332 if (!vcpu->mmu.page_fault(vcpu, cr2, error_code)) {
1333 spin_unlock(&vcpu->kvm->lock);
1334 return 1;
1335 }
1336
1337 er = emulate_instruction(vcpu, kvm_run, cr2, error_code);
1338 spin_unlock(&vcpu->kvm->lock);
1339
1340 switch (er) {
1341 case EMULATE_DONE:
1342 return 1;
1343 case EMULATE_DO_MMIO:
1344 ++kvm_stat.mmio_exits;
1345 kvm_run->exit_reason = KVM_EXIT_MMIO;
1346 return 0;
1347 case EMULATE_FAIL:
1348 vcpu_printf(vcpu, "%s: emulate fail\n", __FUNCTION__);
1349 break;
1350 default:
1351 BUG();
1352 }
1353 }
1354
1355 if (vcpu->rmode.active &&
1356 handle_rmode_exception(vcpu, intr_info & INTR_INFO_VECTOR_MASK,
1357 error_code))
1358 return 1;
1359
1360 if ((intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK)) == (INTR_TYPE_EXCEPTION | 1)) {
1361 kvm_run->exit_reason = KVM_EXIT_DEBUG;
1362 return 0;
1363 }
1364 kvm_run->exit_reason = KVM_EXIT_EXCEPTION;
1365 kvm_run->ex.exception = intr_info & INTR_INFO_VECTOR_MASK;
1366 kvm_run->ex.error_code = error_code;
1367 return 0;
1368}
1369
1370static int handle_external_interrupt(struct kvm_vcpu *vcpu,
1371 struct kvm_run *kvm_run)
1372{
1373 ++kvm_stat.irq_exits;
1374 return 1;
1375}
1376
1377
1378static int get_io_count(struct kvm_vcpu *vcpu, u64 *count)
1379{
1380 u64 inst;
1381 gva_t rip;
1382 int countr_size;
1383 int i, n;
1384
1385 if ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_VM)) {
1386 countr_size = 2;
1387 } else {
1388 u32 cs_ar = vmcs_read32(GUEST_CS_AR_BYTES);
1389
1390 countr_size = (cs_ar & AR_L_MASK) ? 8:
1391 (cs_ar & AR_DB_MASK) ? 4: 2;
1392 }
1393
1394 rip = vmcs_readl(GUEST_RIP);
1395 if (countr_size != 8)
1396 rip += vmcs_readl(GUEST_CS_BASE);
1397
1398 n = kvm_read_guest(vcpu, rip, sizeof(inst), &inst);
1399
1400 for (i = 0; i < n; i++) {
1401 switch (((u8*)&inst)[i]) {
1402 case 0xf0:
1403 case 0xf2:
1404 case 0xf3:
1405 case 0x2e:
1406 case 0x36:
1407 case 0x3e:
1408 case 0x26:
1409 case 0x64:
1410 case 0x65:
1411 case 0x66:
1412 break;
1413 case 0x67:
1414 countr_size = (countr_size == 2) ? 4: (countr_size >> 1);
1415 default:
1416 goto done;
1417 }
1418 }
1419 return 0;
1420done:
1421 countr_size *= 8;
1422 *count = vcpu->regs[VCPU_REGS_RCX] & (~0ULL >> (64 - countr_size));
1423 return 1;
1424}
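/*
 * get_io_count() only needs the repeat count, so it scans the legacy
 * prefix bytes (lock, rep and segment/operand-size overrides), lets a
 * 0x67 address-size prefix flip the effective width of rCX, and then
 * masks rCX down to that width; with a 16-bit address size, for
 * example, only the low 16 bits of rCX contribute to the count.
 */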
1425
1426static int handle_io(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1427{
1428 u64 exit_qualification;
1429
1430 ++kvm_stat.io_exits;
1431 exit_qualification = vmcs_read64(EXIT_QUALIFICATION);
1432 kvm_run->exit_reason = KVM_EXIT_IO;
1433 if (exit_qualification & 8)
1434 kvm_run->io.direction = KVM_EXIT_IO_IN;
1435 else
1436 kvm_run->io.direction = KVM_EXIT_IO_OUT;
1437 kvm_run->io.size = (exit_qualification & 7) + 1;
1438 kvm_run->io.string = (exit_qualification & 16) != 0;
1439 kvm_run->io.string_down
1440 = (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_DF) != 0;
1441 kvm_run->io.rep = (exit_qualification & 32) != 0;
1442 kvm_run->io.port = exit_qualification >> 16;
1443 if (kvm_run->io.string) {
1444 if (!get_io_count(vcpu, &kvm_run->io.count))
1445 return 1;
1446 kvm_run->io.address = vmcs_readl(GUEST_LINEAR_ADDRESS);
1447 } else
1448 kvm_run->io.value = vcpu->regs[VCPU_REGS_RAX]; /* rax */
1449 return 0;
1450}
1451
1452static int handle_invlpg(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1453{
1454 u64 address = vmcs_read64(EXIT_QUALIFICATION);
1455 int instruction_length = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
1456 spin_lock(&vcpu->kvm->lock);
1457 vcpu->mmu.inval_page(vcpu, address);
1458 spin_unlock(&vcpu->kvm->lock);
1459 vmcs_writel(GUEST_RIP, vmcs_readl(GUEST_RIP) + instruction_length);
1460 return 1;
1461}
1462
1463static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1464{
1465 u64 exit_qualification;
1466 int cr;
1467 int reg;
1468
1469 exit_qualification = vmcs_read64(EXIT_QUALIFICATION);
1470 cr = exit_qualification & 15;
1471 reg = (exit_qualification >> 8) & 15;
1472 switch ((exit_qualification >> 4) & 3) {
1473 case 0: /* mov to cr */
1474 switch (cr) {
1475 case 0:
1476 vcpu_load_rsp_rip(vcpu);
1477 set_cr0(vcpu, vcpu->regs[reg]);
1478 skip_emulated_instruction(vcpu);
1479 return 1;
1480 case 3:
1481 vcpu_load_rsp_rip(vcpu);
1482 set_cr3(vcpu, vcpu->regs[reg]);
1483 skip_emulated_instruction(vcpu);
1484 return 1;
1485 case 4:
1486 vcpu_load_rsp_rip(vcpu);
1487 set_cr4(vcpu, vcpu->regs[reg]);
1488 skip_emulated_instruction(vcpu);
1489 return 1;
1490 case 8:
1491 vcpu_load_rsp_rip(vcpu);
1492 set_cr8(vcpu, vcpu->regs[reg]);
1493 skip_emulated_instruction(vcpu);
1494 return 1;
1495 }
1496 break;
1497 case 1: /*mov from cr*/
1498 switch (cr) {
1499 case 3:
1500 vcpu_load_rsp_rip(vcpu);
1501 vcpu->regs[reg] = vcpu->cr3;
1502 vcpu_put_rsp_rip(vcpu);
1503 skip_emulated_instruction(vcpu);
1504 return 1;
1505 case 8:
1506 printk(KERN_DEBUG "handle_cr: read CR8 "
1507 "cpu erratum AA15\n");
1508 vcpu_load_rsp_rip(vcpu);
1509 vcpu->regs[reg] = vcpu->cr8;
1510 vcpu_put_rsp_rip(vcpu);
1511 skip_emulated_instruction(vcpu);
1512 return 1;
1513 }
1514 break;
1515 case 3: /* lmsw */
1516 lmsw(vcpu, (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f);
1517
1518 skip_emulated_instruction(vcpu);
1519 return 1;
1520 default:
1521 break;
1522 }
1523 kvm_run->exit_reason = 0;
1524 printk(KERN_ERR "kvm: unhandled control register: op %d cr %d\n",
1525 (int)(exit_qualification >> 4) & 3, cr);
1526 return 0;
1527}
1528
1529static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1530{
1531 u64 exit_qualification;
1532 unsigned long val;
1533 int dr, reg;
1534
1535 /*
1536 * FIXME: this code assumes the host is debugging the guest.
1537 * need to deal with guest debugging itself too.
1538 */
1539 exit_qualification = vmcs_read64(EXIT_QUALIFICATION);
1540 dr = exit_qualification & 7;
1541 reg = (exit_qualification >> 8) & 15;
1542 vcpu_load_rsp_rip(vcpu);
1543 if (exit_qualification & 16) {
1544 /* mov from dr */
1545 switch (dr) {
1546 case 6:
1547 val = 0xffff0ff0;
1548 break;
1549 case 7:
1550 val = 0x400;
1551 break;
1552 default:
1553 val = 0;
1554 }
1555 vcpu->regs[reg] = val;
1556 } else {
1557 /* mov to dr */
1558 }
1559 vcpu_put_rsp_rip(vcpu);
1560 skip_emulated_instruction(vcpu);
1561 return 1;
1562}
1563
1564static int handle_cpuid(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1565{
1566 kvm_run->exit_reason = KVM_EXIT_CPUID;
1567 return 0;
1568}
1569
1570static int handle_rdmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1571{
1572 u32 ecx = vcpu->regs[VCPU_REGS_RCX];
1573 u64 data;
1574
1575 if (vmx_get_msr(vcpu, ecx, &data)) {
1576 vmx_inject_gp(vcpu, 0);
1577 return 1;
1578 }
1579
1580 /* FIXME: handling of bits 32:63 of rax, rdx */
1581 vcpu->regs[VCPU_REGS_RAX] = data & -1u;
1582 vcpu->regs[VCPU_REGS_RDX] = (data >> 32) & -1u;
1583 skip_emulated_instruction(vcpu);
1584 return 1;
1585}
1586
1587static int handle_wrmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1588{
1589 u32 ecx = vcpu->regs[VCPU_REGS_RCX];
1590 u64 data = (vcpu->regs[VCPU_REGS_RAX] & -1u)
1591 | ((u64)(vcpu->regs[VCPU_REGS_RDX] & -1u) << 32);
1592
1593 if (vmx_set_msr(vcpu, ecx, data) != 0) {
1594 vmx_inject_gp(vcpu, 0);
1595 return 1;
1596 }
1597
1598 skip_emulated_instruction(vcpu);
1599 return 1;
1600}
1601
1602static int handle_interrupt_window(struct kvm_vcpu *vcpu,
1603 struct kvm_run *kvm_run)
1604{
1605 /* Turn off interrupt window reporting. */
1606 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
1607 vmcs_read32(CPU_BASED_VM_EXEC_CONTROL)
1608 & ~CPU_BASED_VIRTUAL_INTR_PENDING);
1609 return 1;
1610}
1611
1612static int handle_halt(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1613{
1614 skip_emulated_instruction(vcpu);
1615 if (vcpu->irq_summary && (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF))
1616 return 1;
1617
1618 kvm_run->exit_reason = KVM_EXIT_HLT;
1619 return 0;
1620}
1621
1622/*
1623 * The exit handlers return 1 if the exit was handled fully and guest execution
1624 * may resume. Otherwise they set the kvm_run parameter to indicate what needs
1625 * to be done by userspace and return 0.
1626 */
1627static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu,
1628 struct kvm_run *kvm_run) = {
1629 [EXIT_REASON_EXCEPTION_NMI] = handle_exception,
1630 [EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt,
1631 [EXIT_REASON_IO_INSTRUCTION] = handle_io,
1632 [EXIT_REASON_INVLPG] = handle_invlpg,
1633 [EXIT_REASON_CR_ACCESS] = handle_cr,
1634 [EXIT_REASON_DR_ACCESS] = handle_dr,
1635 [EXIT_REASON_CPUID] = handle_cpuid,
1636 [EXIT_REASON_MSR_READ] = handle_rdmsr,
1637 [EXIT_REASON_MSR_WRITE] = handle_wrmsr,
1638 [EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window,
1639 [EXIT_REASON_HLT] = handle_halt,
1640};
1641
1642static const int kvm_vmx_max_exit_handlers =
1643 sizeof(kvm_vmx_exit_handlers) / sizeof(*kvm_vmx_exit_handlers);
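/*
 * Dispatch is thus a table lookup on the hardware exit reason: a
 * handler that needs help fills in kvm_run->exit_reason and returns 0,
 * as handle_io() does with KVM_EXIT_IO, so that userspace can emulate
 * the access and then re-enter the guest.
 */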
1644
1645/*
1646 * The guest has exited. See if we can fix it or if we need userspace
1647 * assistance.
1648 */
1649static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
1650{
1651 u32 vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
1652 u32 exit_reason = vmcs_read32(VM_EXIT_REASON);
1653
1654 if ( (vectoring_info & VECTORING_INFO_VALID_MASK) &&
1655 exit_reason != EXIT_REASON_EXCEPTION_NMI )
1656 printk(KERN_WARNING "%s: unexpected, valid vectoring info and "
1657 "exit reason is 0x%x\n", __FUNCTION__, exit_reason);
1658 kvm_run->instruction_length = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
1659 if (exit_reason < kvm_vmx_max_exit_handlers
1660 && kvm_vmx_exit_handlers[exit_reason])
1661 return kvm_vmx_exit_handlers[exit_reason](vcpu, kvm_run);
1662 else {
1663 kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
1664 kvm_run->hw.hardware_exit_reason = exit_reason;
1665 }
1666 return 0;
1667}
1668
1669static int vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1670{
1671 u8 fail;
1672 u16 fs_sel, gs_sel, ldt_sel;
1673 int fs_gs_ldt_reload_needed;
1674
1675again:
1676 /*
1677 * Set host fs and gs selectors. Unfortunately, 22.2.3 does not
1678 * allow segment selectors with cpl > 0 or ti == 1.
1679 */
1680 fs_sel = read_fs();
1681 gs_sel = read_gs();
1682 ldt_sel = read_ldt();
1683 fs_gs_ldt_reload_needed = (fs_sel & 7) | (gs_sel & 7) | ldt_sel;
1684 if (!fs_gs_ldt_reload_needed) {
1685 vmcs_write16(HOST_FS_SELECTOR, fs_sel);
1686 vmcs_write16(HOST_GS_SELECTOR, gs_sel);
1687 } else {
1688 vmcs_write16(HOST_FS_SELECTOR, 0);
1689 vmcs_write16(HOST_GS_SELECTOR, 0);
1690 }
1691
1692#ifdef __x86_64__
1693 vmcs_writel(HOST_FS_BASE, read_msr(MSR_FS_BASE));
1694 vmcs_writel(HOST_GS_BASE, read_msr(MSR_GS_BASE));
1695#else
1696 vmcs_writel(HOST_FS_BASE, segment_base(fs_sel));
1697 vmcs_writel(HOST_GS_BASE, segment_base(gs_sel));
1698#endif
1699
1700 if (vcpu->irq_summary &&
1701 !(vmcs_read32(VM_ENTRY_INTR_INFO_FIELD) & INTR_INFO_VALID_MASK))
1702 kvm_try_inject_irq(vcpu);
1703
1704 if (vcpu->guest_debug.enabled)
1705 kvm_guest_debug_pre(vcpu);
1706
1707 fx_save(vcpu->host_fx_image);
1708 fx_restore(vcpu->guest_fx_image);
1709
1710 save_msrs(vcpu->host_msrs, vcpu->nmsrs);
1711 load_msrs(vcpu->guest_msrs, NR_BAD_MSRS);
1712
1713 asm (
1714 /* Store host registers */
1715 "pushf \n\t"
1716#ifdef __x86_64__
1717 "push %%rax; push %%rbx; push %%rdx;"
1718 "push %%rsi; push %%rdi; push %%rbp;"
1719 "push %%r8; push %%r9; push %%r10; push %%r11;"
1720 "push %%r12; push %%r13; push %%r14; push %%r15;"
1721 "push %%rcx \n\t"
1722 ASM_VMX_VMWRITE_RSP_RDX "\n\t"
1723#else
1724 "pusha; push %%ecx \n\t"
1725 ASM_VMX_VMWRITE_RSP_RDX "\n\t"
1726#endif
1727 /* Check if vmlaunch or vmresume is needed */
1728 "cmp $0, %1 \n\t"
1729 /* Load guest registers. Don't clobber flags. */
1730#ifdef __x86_64__
1731 "mov %c[cr2](%3), %%rax \n\t"
1732 "mov %%rax, %%cr2 \n\t"
1733 "mov %c[rax](%3), %%rax \n\t"
1734 "mov %c[rbx](%3), %%rbx \n\t"
1735 "mov %c[rdx](%3), %%rdx \n\t"
1736 "mov %c[rsi](%3), %%rsi \n\t"
1737 "mov %c[rdi](%3), %%rdi \n\t"
1738 "mov %c[rbp](%3), %%rbp \n\t"
1739 "mov %c[r8](%3), %%r8 \n\t"
1740 "mov %c[r9](%3), %%r9 \n\t"
1741 "mov %c[r10](%3), %%r10 \n\t"
1742 "mov %c[r11](%3), %%r11 \n\t"
1743 "mov %c[r12](%3), %%r12 \n\t"
1744 "mov %c[r13](%3), %%r13 \n\t"
1745 "mov %c[r14](%3), %%r14 \n\t"
1746 "mov %c[r15](%3), %%r15 \n\t"
1747 "mov %c[rcx](%3), %%rcx \n\t" /* kills %3 (rcx) */
1748#else
1749 "mov %c[cr2](%3), %%eax \n\t"
1750 "mov %%eax, %%cr2 \n\t"
1751 "mov %c[rax](%3), %%eax \n\t"
1752 "mov %c[rbx](%3), %%ebx \n\t"
1753 "mov %c[rdx](%3), %%edx \n\t"
1754 "mov %c[rsi](%3), %%esi \n\t"
1755 "mov %c[rdi](%3), %%edi \n\t"
1756 "mov %c[rbp](%3), %%ebp \n\t"
1757 "mov %c[rcx](%3), %%ecx \n\t" /* kills %3 (ecx) */
1758#endif
1759 /* Enter guest mode */
1760 "jne launched \n\t"
1761 ASM_VMX_VMLAUNCH "\n\t"
1762 "jmp kvm_vmx_return \n\t"
1763 "launched: " ASM_VMX_VMRESUME "\n\t"
1764 ".globl kvm_vmx_return \n\t"
1765 "kvm_vmx_return: "
1766 /* Save guest registers, load host registers, keep flags */
1767#ifdef __x86_64__
1768 "xchg %3, 0(%%rsp) \n\t"
1769 "mov %%rax, %c[rax](%3) \n\t"
1770 "mov %%rbx, %c[rbx](%3) \n\t"
1771 "pushq 0(%%rsp); popq %c[rcx](%3) \n\t"
1772 "mov %%rdx, %c[rdx](%3) \n\t"
1773 "mov %%rsi, %c[rsi](%3) \n\t"
1774 "mov %%rdi, %c[rdi](%3) \n\t"
1775 "mov %%rbp, %c[rbp](%3) \n\t"
1776 "mov %%r8, %c[r8](%3) \n\t"
1777 "mov %%r9, %c[r9](%3) \n\t"
1778 "mov %%r10, %c[r10](%3) \n\t"
1779 "mov %%r11, %c[r11](%3) \n\t"
1780 "mov %%r12, %c[r12](%3) \n\t"
1781 "mov %%r13, %c[r13](%3) \n\t"
1782 "mov %%r14, %c[r14](%3) \n\t"
1783 "mov %%r15, %c[r15](%3) \n\t"
1784 "mov %%cr2, %%rax \n\t"
1785 "mov %%rax, %c[cr2](%3) \n\t"
1786 "mov 0(%%rsp), %3 \n\t"
1787
1788 "pop %%rcx; pop %%r15; pop %%r14; pop %%r13; pop %%r12;"
1789 "pop %%r11; pop %%r10; pop %%r9; pop %%r8;"
1790 "pop %%rbp; pop %%rdi; pop %%rsi;"
1791 "pop %%rdx; pop %%rbx; pop %%rax \n\t"
1792#else
1793 "xchg %3, 0(%%esp) \n\t"
1794 "mov %%eax, %c[rax](%3) \n\t"
1795 "mov %%ebx, %c[rbx](%3) \n\t"
1796 "pushl 0(%%esp); popl %c[rcx](%3) \n\t"
1797 "mov %%edx, %c[rdx](%3) \n\t"
1798 "mov %%esi, %c[rsi](%3) \n\t"
1799 "mov %%edi, %c[rdi](%3) \n\t"
1800 "mov %%ebp, %c[rbp](%3) \n\t"
1801 "mov %%cr2, %%eax \n\t"
1802 "mov %%eax, %c[cr2](%3) \n\t"
1803 "mov 0(%%esp), %3 \n\t"
1804
1805 "pop %%ecx; popa \n\t"
1806#endif
1807 "setbe %0 \n\t"
1808 "popf \n\t"
1809 : "=g" (fail)
1810 : "r"(vcpu->launched), "d"((unsigned long)HOST_RSP),
1811 "c"(vcpu),
1812 [rax]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RAX])),
1813 [rbx]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RBX])),
1814 [rcx]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RCX])),
1815 [rdx]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RDX])),
1816 [rsi]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RSI])),
1817 [rdi]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RDI])),
1818 [rbp]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RBP])),
1819#ifdef __x86_64__
1820 [r8 ]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R8 ])),
1821 [r9 ]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R9 ])),
1822 [r10]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R10])),
1823 [r11]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R11])),
1824 [r12]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R12])),
1825 [r13]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R13])),
1826 [r14]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R14])),
1827 [r15]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R15])),
1828#endif
1829 [cr2]"i"(offsetof(struct kvm_vcpu, cr2))
1830 : "cc", "memory" );
1831
1832 ++kvm_stat.exits;
1833
1834 save_msrs(vcpu->guest_msrs, NR_BAD_MSRS);
1835 load_msrs(vcpu->host_msrs, NR_BAD_MSRS);
1836
1837 fx_save(vcpu->guest_fx_image);
1838 fx_restore(vcpu->host_fx_image);
1839
1840#ifndef __x86_64__
1841 asm ("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS));
1842#endif
1843
1844 kvm_run->exit_type = 0;
1845 if (fail) {
1846 kvm_run->exit_type = KVM_EXIT_TYPE_FAIL_ENTRY;
1847 kvm_run->exit_reason = vmcs_read32(VM_INSTRUCTION_ERROR);
1848 } else {
1849 if (fs_gs_ldt_reload_needed) {
1850 load_ldt(ldt_sel);
1851 load_fs(fs_sel);
1852 /*
1853 * If we have to reload gs, we must take care to
1854 * preserve our gs base.
1855 */
1856 local_irq_disable();
1857 load_gs(gs_sel);
1858#ifdef __x86_64__
1859 wrmsrl(MSR_GS_BASE, vmcs_readl(HOST_GS_BASE));
1860#endif
1861 local_irq_enable();
1862
1863 reload_tss();
1864 }
1865 vcpu->launched = 1;
1866 kvm_run->exit_type = KVM_EXIT_TYPE_VM_EXIT;
1867 if (kvm_handle_exit(kvm_run, vcpu)) {
1868 /* Give the scheduler a chance to reschedule. */
1869 if (signal_pending(current)) {
1870 ++kvm_stat.signal_exits;
1871 return -EINTR;
1872 }
1873 kvm_resched(vcpu);
1874 goto again;
1875 }
1876 }
1877 return 0;
1878}
1879
1880static void vmx_flush_tlb(struct kvm_vcpu *vcpu)
1881{
1882 vmcs_writel(GUEST_CR3, vmcs_readl(GUEST_CR3));
1883}
1884
1885static void vmx_inject_page_fault(struct kvm_vcpu *vcpu,
1886 unsigned long addr,
1887 u32 err_code)
1888{
1889 u32 vect_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
1890
1891 ++kvm_stat.pf_guest;
1892
1893 if (is_page_fault(vect_info)) {
1894 printk(KERN_DEBUG "inject_page_fault: "
1895 "double fault 0x%lx @ 0x%lx\n",
1896 addr, vmcs_readl(GUEST_RIP));
1897 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, 0);
1898 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
1899 DF_VECTOR |
1900 INTR_TYPE_EXCEPTION |
1901 INTR_INFO_DELIEVER_CODE_MASK |
1902 INTR_INFO_VALID_MASK);
1903 return;
1904 }
1905 vcpu->cr2 = addr;
1906 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, err_code);
1907 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
1908 PF_VECTOR |
1909 INTR_TYPE_EXCEPTION |
1910 INTR_INFO_DELIEVER_CODE_MASK |
1911 INTR_INFO_VALID_MASK);
1912
1913}
1914
1915static void vmx_free_vmcs(struct kvm_vcpu *vcpu)
1916{
1917 if (vcpu->vmcs) {
1918 on_each_cpu(__vcpu_clear, vcpu, 0, 1);
1919 free_vmcs(vcpu->vmcs);
1920 vcpu->vmcs = NULL;
1921 }
1922}
1923
1924static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
1925{
1926 vmx_free_vmcs(vcpu);
1927}
1928
1929static int vmx_create_vcpu(struct kvm_vcpu *vcpu)
1930{
1931 struct vmcs *vmcs;
1932
1933 vmcs = alloc_vmcs();
1934 if (!vmcs)
1935 return -ENOMEM;
1936 vmcs_clear(vmcs);
1937 vcpu->vmcs = vmcs;
1938 vcpu->launched = 0;
1939 return 0;
1940}
1941
1942static struct kvm_arch_ops vmx_arch_ops = {
1943 .cpu_has_kvm_support = cpu_has_kvm_support,
1944 .disabled_by_bios = vmx_disabled_by_bios,
1945 .hardware_setup = hardware_setup,
1946 .hardware_unsetup = hardware_unsetup,
1947 .hardware_enable = hardware_enable,
1948 .hardware_disable = hardware_disable,
1949
1950 .vcpu_create = vmx_create_vcpu,
1951 .vcpu_free = vmx_free_vcpu,
1952
1953 .vcpu_load = vmx_vcpu_load,
1954 .vcpu_put = vmx_vcpu_put,
1955
1956 .set_guest_debug = set_guest_debug,
1957 .get_msr = vmx_get_msr,
1958 .set_msr = vmx_set_msr,
1959 .get_segment_base = vmx_get_segment_base,
1960 .get_segment = vmx_get_segment,
1961 .set_segment = vmx_set_segment,
1962 .is_long_mode = vmx_is_long_mode,
1963 .get_cs_db_l_bits = vmx_get_cs_db_l_bits,
1964 .set_cr0 = vmx_set_cr0,
1965 .set_cr0_no_modeswitch = vmx_set_cr0_no_modeswitch,
1966 .set_cr3 = vmx_set_cr3,
1967 .set_cr4 = vmx_set_cr4,
1968#ifdef __x86_64__
1969 .set_efer = vmx_set_efer,
1970#endif
1971 .get_idt = vmx_get_idt,
1972 .set_idt = vmx_set_idt,
1973 .get_gdt = vmx_get_gdt,
1974 .set_gdt = vmx_set_gdt,
1975 .cache_regs = vcpu_load_rsp_rip,
1976 .decache_regs = vcpu_put_rsp_rip,
1977 .get_rflags = vmx_get_rflags,
1978 .set_rflags = vmx_set_rflags,
1979
1980 .tlb_flush = vmx_flush_tlb,
1981 .inject_page_fault = vmx_inject_page_fault,
1982
1983 .inject_gp = vmx_inject_gp,
1984
1985 .run = vmx_vcpu_run,
1986 .skip_emulated_instruction = skip_emulated_instruction,
1987 .vcpu_setup = vmx_vcpu_setup,
1988};
1989
1990static int __init vmx_init(void)
1991{
1992 kvm_init_arch(&vmx_arch_ops, THIS_MODULE);
1993 return 0;
1994}
1995
1996static void __exit vmx_exit(void)
1997{
1998 kvm_exit_arch();
1999}
2000
2001module_init(vmx_init)
2002module_exit(vmx_exit)
diff --git a/drivers/kvm/vmx.h b/drivers/kvm/vmx.h
new file mode 100644
index 000000000000..797278341581
--- /dev/null
+++ b/drivers/kvm/vmx.h
@@ -0,0 +1,296 @@
1#ifndef VMX_H
2#define VMX_H
3
4/*
5 * vmx.h: VMX Architecture related definitions
6 * Copyright (c) 2004, Intel Corporation.
7 *
8 * This program is free software; you can redistribute it and/or modify it
9 * under the terms and conditions of the GNU General Public License,
10 * version 2, as published by the Free Software Foundation.
11 *
12 * This program is distributed in the hope it will be useful, but WITHOUT
13 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
15 * more details.
16 *
17 * You should have received a copy of the GNU General Public License along with
18 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
19 * Place - Suite 330, Boston, MA 02111-1307 USA.
20 *
21 * A few random additions are:
22 * Copyright (C) 2006 Qumranet
23 * Avi Kivity <avi@qumranet.com>
24 * Yaniv Kamay <yaniv@qumranet.com>
25 *
26 */
27
28#define CPU_BASED_VIRTUAL_INTR_PENDING 0x00000004
29#define CPU_BASED_USE_TSC_OFFSETING 0x00000008
30#define CPU_BASED_HLT_EXITING 0x00000080
31#define CPU_BASED_INVDPG_EXITING 0x00000200
32#define CPU_BASED_MWAIT_EXITING 0x00000400
33#define CPU_BASED_RDPMC_EXITING 0x00000800
34#define CPU_BASED_RDTSC_EXITING 0x00001000
35#define CPU_BASED_CR8_LOAD_EXITING 0x00080000
36#define CPU_BASED_CR8_STORE_EXITING 0x00100000
37#define CPU_BASED_TPR_SHADOW 0x00200000
38#define CPU_BASED_MOV_DR_EXITING 0x00800000
39#define CPU_BASED_UNCOND_IO_EXITING 0x01000000
40#define CPU_BASED_ACTIVATE_IO_BITMAP 0x02000000
41#define CPU_BASED_MSR_BITMAPS 0x10000000
42#define CPU_BASED_MONITOR_EXITING 0x20000000
43#define CPU_BASED_PAUSE_EXITING 0x40000000
44
45#define PIN_BASED_EXT_INTR_MASK 0x1
46#define PIN_BASED_NMI_EXITING 0x8
47
48#define VM_EXIT_ACK_INTR_ON_EXIT 0x00008000
49#define VM_EXIT_HOST_ADD_SPACE_SIZE 0x00000200
50
51
52/* VMCS Encodings */
53enum vmcs_field {
54 GUEST_ES_SELECTOR = 0x00000800,
55 GUEST_CS_SELECTOR = 0x00000802,
56 GUEST_SS_SELECTOR = 0x00000804,
57 GUEST_DS_SELECTOR = 0x00000806,
58 GUEST_FS_SELECTOR = 0x00000808,
59 GUEST_GS_SELECTOR = 0x0000080a,
60 GUEST_LDTR_SELECTOR = 0x0000080c,
61 GUEST_TR_SELECTOR = 0x0000080e,
62 HOST_ES_SELECTOR = 0x00000c00,
63 HOST_CS_SELECTOR = 0x00000c02,
64 HOST_SS_SELECTOR = 0x00000c04,
65 HOST_DS_SELECTOR = 0x00000c06,
66 HOST_FS_SELECTOR = 0x00000c08,
67 HOST_GS_SELECTOR = 0x00000c0a,
68 HOST_TR_SELECTOR = 0x00000c0c,
69 IO_BITMAP_A = 0x00002000,
70 IO_BITMAP_A_HIGH = 0x00002001,
71 IO_BITMAP_B = 0x00002002,
72 IO_BITMAP_B_HIGH = 0x00002003,
73 MSR_BITMAP = 0x00002004,
74 MSR_BITMAP_HIGH = 0x00002005,
75 VM_EXIT_MSR_STORE_ADDR = 0x00002006,
76 VM_EXIT_MSR_STORE_ADDR_HIGH = 0x00002007,
77 VM_EXIT_MSR_LOAD_ADDR = 0x00002008,
78 VM_EXIT_MSR_LOAD_ADDR_HIGH = 0x00002009,
79 VM_ENTRY_MSR_LOAD_ADDR = 0x0000200a,
80 VM_ENTRY_MSR_LOAD_ADDR_HIGH = 0x0000200b,
81 TSC_OFFSET = 0x00002010,
82 TSC_OFFSET_HIGH = 0x00002011,
83 VIRTUAL_APIC_PAGE_ADDR = 0x00002012,
84 VIRTUAL_APIC_PAGE_ADDR_HIGH = 0x00002013,
85 VMCS_LINK_POINTER = 0x00002800,
86 VMCS_LINK_POINTER_HIGH = 0x00002801,
87 GUEST_IA32_DEBUGCTL = 0x00002802,
88 GUEST_IA32_DEBUGCTL_HIGH = 0x00002803,
89 PIN_BASED_VM_EXEC_CONTROL = 0x00004000,
90 CPU_BASED_VM_EXEC_CONTROL = 0x00004002,
91 EXCEPTION_BITMAP = 0x00004004,
92 PAGE_FAULT_ERROR_CODE_MASK = 0x00004006,
93 PAGE_FAULT_ERROR_CODE_MATCH = 0x00004008,
94 CR3_TARGET_COUNT = 0x0000400a,
95 VM_EXIT_CONTROLS = 0x0000400c,
96 VM_EXIT_MSR_STORE_COUNT = 0x0000400e,
97 VM_EXIT_MSR_LOAD_COUNT = 0x00004010,
98 VM_ENTRY_CONTROLS = 0x00004012,
99 VM_ENTRY_MSR_LOAD_COUNT = 0x00004014,
100 VM_ENTRY_INTR_INFO_FIELD = 0x00004016,
101 VM_ENTRY_EXCEPTION_ERROR_CODE = 0x00004018,
102 VM_ENTRY_INSTRUCTION_LEN = 0x0000401a,
103 TPR_THRESHOLD = 0x0000401c,
104 SECONDARY_VM_EXEC_CONTROL = 0x0000401e,
105 VM_INSTRUCTION_ERROR = 0x00004400,
106 VM_EXIT_REASON = 0x00004402,
107 VM_EXIT_INTR_INFO = 0x00004404,
108 VM_EXIT_INTR_ERROR_CODE = 0x00004406,
109 IDT_VECTORING_INFO_FIELD = 0x00004408,
110 IDT_VECTORING_ERROR_CODE = 0x0000440a,
111 VM_EXIT_INSTRUCTION_LEN = 0x0000440c,
112 VMX_INSTRUCTION_INFO = 0x0000440e,
113 GUEST_ES_LIMIT = 0x00004800,
114 GUEST_CS_LIMIT = 0x00004802,
115 GUEST_SS_LIMIT = 0x00004804,
116 GUEST_DS_LIMIT = 0x00004806,
117 GUEST_FS_LIMIT = 0x00004808,
118 GUEST_GS_LIMIT = 0x0000480a,
119 GUEST_LDTR_LIMIT = 0x0000480c,
120 GUEST_TR_LIMIT = 0x0000480e,
121 GUEST_GDTR_LIMIT = 0x00004810,
122 GUEST_IDTR_LIMIT = 0x00004812,
123 GUEST_ES_AR_BYTES = 0x00004814,
124 GUEST_CS_AR_BYTES = 0x00004816,
125 GUEST_SS_AR_BYTES = 0x00004818,
126 GUEST_DS_AR_BYTES = 0x0000481a,
127 GUEST_FS_AR_BYTES = 0x0000481c,
128 GUEST_GS_AR_BYTES = 0x0000481e,
129 GUEST_LDTR_AR_BYTES = 0x00004820,
130 GUEST_TR_AR_BYTES = 0x00004822,
131 GUEST_INTERRUPTIBILITY_INFO = 0x00004824,
132 GUEST_ACTIVITY_STATE = 0X00004826,
133 GUEST_SYSENTER_CS = 0x0000482A,
134 HOST_IA32_SYSENTER_CS = 0x00004c00,
135 CR0_GUEST_HOST_MASK = 0x00006000,
136 CR4_GUEST_HOST_MASK = 0x00006002,
137 CR0_READ_SHADOW = 0x00006004,
138 CR4_READ_SHADOW = 0x00006006,
139 CR3_TARGET_VALUE0 = 0x00006008,
140 CR3_TARGET_VALUE1 = 0x0000600a,
141 CR3_TARGET_VALUE2 = 0x0000600c,
142 CR3_TARGET_VALUE3 = 0x0000600e,
143 EXIT_QUALIFICATION = 0x00006400,
144 GUEST_LINEAR_ADDRESS = 0x0000640a,
145 GUEST_CR0 = 0x00006800,
146 GUEST_CR3 = 0x00006802,
147 GUEST_CR4 = 0x00006804,
148 GUEST_ES_BASE = 0x00006806,
149 GUEST_CS_BASE = 0x00006808,
150 GUEST_SS_BASE = 0x0000680a,
151 GUEST_DS_BASE = 0x0000680c,
152 GUEST_FS_BASE = 0x0000680e,
153 GUEST_GS_BASE = 0x00006810,
154 GUEST_LDTR_BASE = 0x00006812,
155 GUEST_TR_BASE = 0x00006814,
156 GUEST_GDTR_BASE = 0x00006816,
157 GUEST_IDTR_BASE = 0x00006818,
158 GUEST_DR7 = 0x0000681a,
159 GUEST_RSP = 0x0000681c,
160 GUEST_RIP = 0x0000681e,
161 GUEST_RFLAGS = 0x00006820,
162 GUEST_PENDING_DBG_EXCEPTIONS = 0x00006822,
163 GUEST_SYSENTER_ESP = 0x00006824,
164 GUEST_SYSENTER_EIP = 0x00006826,
165 HOST_CR0 = 0x00006c00,
166 HOST_CR3 = 0x00006c02,
167 HOST_CR4 = 0x00006c04,
168 HOST_FS_BASE = 0x00006c06,
169 HOST_GS_BASE = 0x00006c08,
170 HOST_TR_BASE = 0x00006c0a,
171 HOST_GDTR_BASE = 0x00006c0c,
172 HOST_IDTR_BASE = 0x00006c0e,
173 HOST_IA32_SYSENTER_ESP = 0x00006c10,
174 HOST_IA32_SYSENTER_EIP = 0x00006c12,
175 HOST_RSP = 0x00006c14,
176 HOST_RIP = 0x00006c16,
177};
178
179#define VMX_EXIT_REASONS_FAILED_VMENTRY 0x80000000
180
181#define EXIT_REASON_EXCEPTION_NMI 0
182#define EXIT_REASON_EXTERNAL_INTERRUPT 1
183
184#define EXIT_REASON_PENDING_INTERRUPT 7
185
186#define EXIT_REASON_TASK_SWITCH 9
187#define EXIT_REASON_CPUID 10
188#define EXIT_REASON_HLT 12
189#define EXIT_REASON_INVLPG 14
190#define EXIT_REASON_RDPMC 15
191#define EXIT_REASON_RDTSC 16
192#define EXIT_REASON_VMCALL 18
193#define EXIT_REASON_VMCLEAR 19
194#define EXIT_REASON_VMLAUNCH 20
195#define EXIT_REASON_VMPTRLD 21
196#define EXIT_REASON_VMPTRST 22
197#define EXIT_REASON_VMREAD 23
198#define EXIT_REASON_VMRESUME 24
199#define EXIT_REASON_VMWRITE 25
200#define EXIT_REASON_VMOFF 26
201#define EXIT_REASON_VMON 27
202#define EXIT_REASON_CR_ACCESS 28
203#define EXIT_REASON_DR_ACCESS 29
204#define EXIT_REASON_IO_INSTRUCTION 30
205#define EXIT_REASON_MSR_READ 31
206#define EXIT_REASON_MSR_WRITE 32
207#define EXIT_REASON_MWAIT_INSTRUCTION 36
208
209/*
210 * Interruption-information format
211 */
212#define INTR_INFO_VECTOR_MASK 0xff /* 7:0 */
213#define INTR_INFO_INTR_TYPE_MASK 0x700 /* 10:8 */
214#define INTR_INFO_DELIEVER_CODE_MASK 0x800 /* 11 */
215#define INTR_INFO_VALID_MASK 0x80000000 /* 31 */
216
217#define VECTORING_INFO_VECTOR_MASK INTR_INFO_VECTOR_MASK
218#define VECTORING_INFO_TYPE_MASK INTR_INFO_INTR_TYPE_MASK
219#define VECTORING_INFO_DELIEVER_CODE_MASK INTR_INFO_DELIEVER_CODE_MASK
220#define VECTORING_INFO_VALID_MASK INTR_INFO_VALID_MASK
221
222#define INTR_TYPE_EXT_INTR (0 << 8) /* external interrupt */
223#define INTR_TYPE_EXCEPTION (3 << 8) /* processor exception */
224
225/*
226 * Exit Qualifications for MOV for Control Register Access
227 */
228#define CONTROL_REG_ACCESS_NUM 0x7 /* 2:0, number of control register */
229#define CONTROL_REG_ACCESS_TYPE 0x30 /* 5:4, access type */
230#define CONTROL_REG_ACCESS_REG 0xf00 /* 10:8, general purpose register */
231#define LMSW_SOURCE_DATA_SHIFT 16
232#define LMSW_SOURCE_DATA (0xFFFF << LMSW_SOURCE_DATA_SHIFT) /* 16:31 lmsw source */
233#define REG_EAX (0 << 8)
234#define REG_ECX (1 << 8)
235#define REG_EDX (2 << 8)
236#define REG_EBX (3 << 8)
237#define REG_ESP (4 << 8)
238#define REG_EBP (5 << 8)
239#define REG_ESI (6 << 8)
240#define REG_EDI (7 << 8)
241#define REG_R8 (8 << 8)
242#define REG_R9 (9 << 8)
243#define REG_R10 (10 << 8)
244#define REG_R11 (11 << 8)
245#define REG_R12 (12 << 8)
246#define REG_R13 (13 << 8)
247#define REG_R14 (14 << 8)
248#define REG_R15 (15 << 8)
249
250/*
251 * Exit Qualifications for MOV for Debug Register Access
252 */
253#define DEBUG_REG_ACCESS_NUM 0x7 /* 2:0, number of debug register */
254#define DEBUG_REG_ACCESS_TYPE 0x10 /* 4, direction of access */
255#define TYPE_MOV_TO_DR (0 << 4)
256#define TYPE_MOV_FROM_DR (1 << 4)
257#define DEBUG_REG_ACCESS_REG 0xf00 /* 11:8, general purpose register */
258
259
260/* segment AR */
261#define SEGMENT_AR_L_MASK (1 << 13)
262
263/* entry controls */
264#define VM_ENTRY_CONTROLS_IA32E_MASK (1 << 9)
265
266#define AR_TYPE_ACCESSES_MASK 1
267#define AR_TYPE_READABLE_MASK (1 << 1)
268#define AR_TYPE_WRITEABLE_MASK (1 << 2)
269#define AR_TYPE_CODE_MASK (1 << 3)
270#define AR_TYPE_MASK 0x0f
271#define AR_TYPE_BUSY_64_TSS 11
272#define AR_TYPE_BUSY_32_TSS 11
273#define AR_TYPE_BUSY_16_TSS 3
274#define AR_TYPE_LDT 2
275
276#define AR_UNUSABLE_MASK (1 << 16)
277#define AR_S_MASK (1 << 4)
278#define AR_P_MASK (1 << 7)
279#define AR_L_MASK (1 << 13)
280#define AR_DB_MASK (1 << 14)
281#define AR_G_MASK (1 << 15)
282#define AR_DPL_SHIFT 5
283#define AR_DPL(ar) (((ar) >> AR_DPL_SHIFT) & 3)
284
285#define AR_RESERVD_MASK 0xfffe0f00
286
287#define CR4_VMXE 0x2000
288
289#define MSR_IA32_VMX_BASIC_MSR 0x480
290#define MSR_IA32_FEATURE_CONTROL 0x03a
291#define MSR_IA32_VMX_PINBASED_CTLS_MSR 0x481
292#define MSR_IA32_VMX_PROCBASED_CTLS_MSR 0x482
293#define MSR_IA32_VMX_EXIT_CTLS_MSR 0x483
294#define MSR_IA32_VMX_ENTRY_CTLS_MSR 0x484
295
296#endif
diff --git a/drivers/kvm/x86_emulate.c b/drivers/kvm/x86_emulate.c
new file mode 100644
index 000000000000..7e838bf0592d
--- /dev/null
+++ b/drivers/kvm/x86_emulate.c
@@ -0,0 +1,1409 @@
1/******************************************************************************
2 * x86_emulate.c
3 *
4 * Generic x86 (32-bit and 64-bit) instruction decoder and emulator.
5 *
6 * Copyright (c) 2005 Keir Fraser
7 *
8 * Linux coding style, mod r/m decoder, segment base fixes, real-mode
9 * privileged instructions:
10 *
11 * Copyright (C) 2006 Qumranet
12 *
13 * Avi Kivity <avi@qumranet.com>
14 * Yaniv Kamay <yaniv@qumranet.com>
15 *
16 * This work is licensed under the terms of the GNU GPL, version 2. See
17 * the COPYING file in the top-level directory.
18 *
19 * From: xen-unstable 10676:af9809f51f81a3c43f276f00c81a52ef558afda4
20 */
21
22#ifndef __KERNEL__
23#include <stdio.h>
24#include <stdint.h>
25#include <public/xen.h>
26#define DPRINTF(_f, _a ...) printf( _f , ## _a )
27#else
28#include "kvm.h"
29#define DPRINTF(x...) do {} while (0)
30#endif
31#include "x86_emulate.h"
32#include <linux/module.h>
33
34/*
35 * Opcode effective-address decode tables.
36 * Note that we only emulate instructions that have at least one memory
37 * operand (excluding implicit stack references). We assume that stack
38 * references and instruction fetches will never occur in special memory
39 * areas that require emulation. So, for example, 'mov <imm>,<reg>' need
40 * not be handled.
41 */
42
43/* Operand sizes: 8-bit operands or specified/overridden size. */
44#define ByteOp (1<<0) /* 8-bit operands. */
45/* Destination operand type. */
46#define ImplicitOps (1<<1) /* Implicit in opcode. No generic decode. */
47#define DstReg (2<<1) /* Register operand. */
48#define DstMem (3<<1) /* Memory operand. */
49#define DstMask (3<<1)
50/* Source operand type. */
51#define SrcNone (0<<3) /* No source operand. */
52#define SrcImplicit (0<<3) /* Source operand is implicit in the opcode. */
53#define SrcReg (1<<3) /* Register operand. */
54#define SrcMem (2<<3) /* Memory operand. */
55#define SrcMem16 (3<<3) /* Memory operand (16-bit). */
56#define SrcMem32 (4<<3) /* Memory operand (32-bit). */
57#define SrcImm (5<<3) /* Immediate operand. */
58#define SrcImmByte (6<<3) /* 8-bit sign-extended immediate operand. */
59#define SrcMask (7<<3)
60/* Generic ModRM decode. */
61#define ModRM (1<<6)
62/* Destination is only written; never read. */
63#define Mov (1<<7)
64
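/*
 * For example, the table entry below for opcode 0x88 ("mov r/m8,r8") is
 * ByteOp | DstMem | SrcReg | ModRM | Mov: the decoder later recovers the
 * operand kinds with (d & DstMask) == DstMem and (d & SrcMask) == SrcReg,
 * the ByteOp bit forces 1-byte operands, and the Mov bit tells the
 * writeback path that the destination is written without being read first.
 */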
65static u8 opcode_table[256] = {
66 /* 0x00 - 0x07 */
67 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
68 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
69 0, 0, 0, 0,
70 /* 0x08 - 0x0F */
71 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
72 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
73 0, 0, 0, 0,
74 /* 0x10 - 0x17 */
75 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
76 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
77 0, 0, 0, 0,
78 /* 0x18 - 0x1F */
79 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
80 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
81 0, 0, 0, 0,
82 /* 0x20 - 0x27 */
83 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
84 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
85 0, 0, 0, 0,
86 /* 0x28 - 0x2F */
87 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
88 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
89 0, 0, 0, 0,
90 /* 0x30 - 0x37 */
91 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
92 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
93 0, 0, 0, 0,
94 /* 0x38 - 0x3F */
95 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
96 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
97 0, 0, 0, 0,
98 /* 0x40 - 0x4F */
99 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
100 /* 0x50 - 0x5F */
101 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
102 /* 0x60 - 0x6F */
103 0, 0, 0, DstReg | SrcMem32 | ModRM | Mov /* movsxd (x86/64) */ ,
104 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
105 /* 0x70 - 0x7F */
106 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
107 /* 0x80 - 0x87 */
108 ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM,
109 ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM,
110 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
111 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
112 /* 0x88 - 0x8F */
113 ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov,
114 ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
115 0, 0, 0, DstMem | SrcNone | ModRM | Mov,
116 /* 0x90 - 0x9F */
117 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
118 /* 0xA0 - 0xA7 */
119 ByteOp | DstReg | SrcMem | Mov, DstReg | SrcMem | Mov,
120 ByteOp | DstMem | SrcReg | Mov, DstMem | SrcReg | Mov,
121 ByteOp | ImplicitOps | Mov, ImplicitOps | Mov,
122 ByteOp | ImplicitOps, ImplicitOps,
123 /* 0xA8 - 0xAF */
124 0, 0, ByteOp | ImplicitOps | Mov, ImplicitOps | Mov,
125 ByteOp | ImplicitOps | Mov, ImplicitOps | Mov,
126 ByteOp | ImplicitOps, ImplicitOps,
127 /* 0xB0 - 0xBF */
128 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
129 /* 0xC0 - 0xC7 */
130 ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM, 0, 0,
131 0, 0, ByteOp | DstMem | SrcImm | ModRM | Mov,
132 DstMem | SrcImm | ModRM | Mov,
133 /* 0xC8 - 0xCF */
134 0, 0, 0, 0, 0, 0, 0, 0,
135 /* 0xD0 - 0xD7 */
136 ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM,
137 ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM,
138 0, 0, 0, 0,
139 /* 0xD8 - 0xDF */
140 0, 0, 0, 0, 0, 0, 0, 0,
141 /* 0xE0 - 0xEF */
142 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
143 /* 0xF0 - 0xF7 */
144 0, 0, 0, 0,
145 0, 0, ByteOp | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM,
146 /* 0xF8 - 0xFF */
147 0, 0, 0, 0,
148 0, 0, ByteOp | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM
149};
150
151static u8 twobyte_table[256] = {
152 /* 0x00 - 0x0F */
153 0, SrcMem | ModRM | DstReg, 0, 0, 0, 0, ImplicitOps, 0,
154 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0,
155 /* 0x10 - 0x1F */
156 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, 0,
157 /* 0x20 - 0x2F */
158 ModRM | ImplicitOps, ModRM, ModRM | ImplicitOps, ModRM, 0, 0, 0, 0,
159 0, 0, 0, 0, 0, 0, 0, 0,
160 /* 0x30 - 0x3F */
161 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
162 /* 0x40 - 0x47 */
163 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
164 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
165 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
166 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
167 /* 0x48 - 0x4F */
168 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
169 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
170 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
171 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
172 /* 0x50 - 0x5F */
173 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
174 /* 0x60 - 0x6F */
175 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
176 /* 0x70 - 0x7F */
177 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
178 /* 0x80 - 0x8F */
179 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
180 /* 0x90 - 0x9F */
181 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
182 /* 0xA0 - 0xA7 */
183 0, 0, 0, DstMem | SrcReg | ModRM, 0, 0, 0, 0,
184 /* 0xA8 - 0xAF */
185 0, 0, 0, DstMem | SrcReg | ModRM, 0, 0, 0, 0,
186 /* 0xB0 - 0xB7 */
187 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 0,
188 DstMem | SrcReg | ModRM,
189 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov,
190 DstReg | SrcMem16 | ModRM | Mov,
191 /* 0xB8 - 0xBF */
192 0, 0, DstMem | SrcImmByte | ModRM, DstMem | SrcReg | ModRM,
193 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov,
194 DstReg | SrcMem16 | ModRM | Mov,
195 /* 0xC0 - 0xCF */
196 0, 0, 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, 0, 0,
197 /* 0xD0 - 0xDF */
198 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
199 /* 0xE0 - 0xEF */
200 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
201 /* 0xF0 - 0xFF */
202 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
203};
204
205/*
206 * Tell the emulator that, of the Group 7 instructions (sgdt, lidt, etc.), we
207 * are interested only in invlpg and not in any of the rest.
208 *
209 * invlpg is a special instruction in that the data it references may not
210 * be mapped.
211 */
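/*
 * (twobyte_table[1] is the 0f 01 / Group 7 entry; clearing its SrcMem bit
 * keeps the generic decode path from fetching the operand it references.)
 */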
212void kvm_emulator_want_group7_invlpg(void)
213{
214 twobyte_table[1] &= ~SrcMem;
215}
216EXPORT_SYMBOL_GPL(kvm_emulator_want_group7_invlpg);
217
218/* Type, address-of, and value of an instruction's operand. */
219struct operand {
220 enum { OP_REG, OP_MEM, OP_IMM } type;
221 unsigned int bytes;
222 unsigned long val, orig_val, *ptr;
223};
224
225/* EFLAGS bit definitions. */
226#define EFLG_OF (1<<11)
227#define EFLG_DF (1<<10)
228#define EFLG_SF (1<<7)
229#define EFLG_ZF (1<<6)
230#define EFLG_AF (1<<4)
231#define EFLG_PF (1<<2)
232#define EFLG_CF (1<<0)
233
234/*
235 * Instruction emulation:
236 * Most instructions are emulated directly via a fragment of inline assembly
237 * code. This allows us to save/restore EFLAGS and thus very easily pick up
238 * any modified flags.
239 */
240
241#if defined(__x86_64__)
242#define _LO32 "k" /* force 32-bit operand */
243#define _STK "%%rsp" /* stack pointer */
244#elif defined(__i386__)
245#define _LO32 "" /* force 32-bit operand */
246#define _STK "%%esp" /* stack pointer */
247#endif
248
249/*
250 * These EFLAGS bits are restored from saved value during emulation, and
251 * any changes are written back to the saved value after emulation.
252 */
253#define EFLAGS_MASK (EFLG_OF|EFLG_SF|EFLG_ZF|EFLG_AF|EFLG_PF|EFLG_CF)
254
255/* Before executing instruction: restore necessary bits in EFLAGS. */
256#define _PRE_EFLAGS(_sav, _msk, _tmp) \
257 /* EFLAGS = (_sav & _msk) | (EFLAGS & ~_msk); */ \
258 "push %"_sav"; " \
259 "movl %"_msk",%"_LO32 _tmp"; " \
260 "andl %"_LO32 _tmp",("_STK"); " \
261 "pushf; " \
262 "notl %"_LO32 _tmp"; " \
263 "andl %"_LO32 _tmp",("_STK"); " \
264 "pop %"_tmp"; " \
265 "orl %"_LO32 _tmp",("_STK"); " \
266 "popf; " \
267 /* _sav &= ~msk; */ \
268 "movl %"_msk",%"_LO32 _tmp"; " \
269 "notl %"_LO32 _tmp"; " \
270 "andl %"_LO32 _tmp",%"_sav"; "
271
272/* After executing instruction: write-back necessary bits in EFLAGS. */
273#define _POST_EFLAGS(_sav, _msk, _tmp) \
274 /* _sav |= EFLAGS & _msk; */ \
275 "pushf; " \
276 "pop %"_tmp"; " \
277 "andl %"_msk",%"_LO32 _tmp"; " \
278 "orl %"_LO32 _tmp",%"_sav"; "
279
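/*
 * Net effect of bracketing an emulated ALU op with the two fragments above
 * (with _sav = the guest's saved flags and _msk = EFLAGS_MASK):
 *
 *     EFLAGS = (_sav & _msk) | (EFLAGS & ~_msk);   // load guest flags
 *     <op executes, updating EFLAGS>
 *     _sav   = (_sav & ~_msk) | (EFLAGS & _msk);   // copy changes back
 *
 * so the guest sees its own arithmetic flags while the host's remaining
 * EFLAGS bits are left untouched.
 */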
280/* Raw emulation: instruction has two explicit operands. */
281#define __emulate_2op_nobyte(_op,_src,_dst,_eflags,_wx,_wy,_lx,_ly,_qx,_qy) \
282 do { \
283 unsigned long _tmp; \
284 \
285 switch ((_dst).bytes) { \
286 case 2: \
287 __asm__ __volatile__ ( \
288 _PRE_EFLAGS("0","4","2") \
289 _op"w %"_wx"3,%1; " \
290 _POST_EFLAGS("0","4","2") \
291 : "=m" (_eflags), "=m" ((_dst).val), \
292 "=&r" (_tmp) \
293 : _wy ((_src).val), "i" (EFLAGS_MASK) ); \
294 break; \
295 case 4: \
296 __asm__ __volatile__ ( \
297 _PRE_EFLAGS("0","4","2") \
298 _op"l %"_lx"3,%1; " \
299 _POST_EFLAGS("0","4","2") \
300 : "=m" (_eflags), "=m" ((_dst).val), \
301 "=&r" (_tmp) \
302 : _ly ((_src).val), "i" (EFLAGS_MASK) ); \
303 break; \
304 case 8: \
305 __emulate_2op_8byte(_op, _src, _dst, \
306 _eflags, _qx, _qy); \
307 break; \
308 } \
309 } while (0)
310
311#define __emulate_2op(_op,_src,_dst,_eflags,_bx,_by,_wx,_wy,_lx,_ly,_qx,_qy) \
312 do { \
313 unsigned long _tmp; \
314 switch ( (_dst).bytes ) \
315 { \
316 case 1: \
317 __asm__ __volatile__ ( \
318 _PRE_EFLAGS("0","4","2") \
319 _op"b %"_bx"3,%1; " \
320 _POST_EFLAGS("0","4","2") \
321 : "=m" (_eflags), "=m" ((_dst).val), \
322 "=&r" (_tmp) \
323 : _by ((_src).val), "i" (EFLAGS_MASK) ); \
324 break; \
325 default: \
326 __emulate_2op_nobyte(_op, _src, _dst, _eflags, \
327 _wx, _wy, _lx, _ly, _qx, _qy); \
328 break; \
329 } \
330 } while (0)
331
332/* Source operand is byte-sized and may be restricted to just %cl. */
333#define emulate_2op_SrcB(_op, _src, _dst, _eflags) \
334 __emulate_2op(_op, _src, _dst, _eflags, \
335 "b", "c", "b", "c", "b", "c", "b", "c")
336
337/* Source operand is byte, word, long or quad sized. */
338#define emulate_2op_SrcV(_op, _src, _dst, _eflags) \
339 __emulate_2op(_op, _src, _dst, _eflags, \
340 "b", "q", "w", "r", _LO32, "r", "", "r")
341
342/* Source operand is word, long or quad sized. */
343#define emulate_2op_SrcV_nobyte(_op, _src, _dst, _eflags) \
344 __emulate_2op_nobyte(_op, _src, _dst, _eflags, \
345 "w", "r", _LO32, "r", "", "r")
346
347/* Instruction has only one explicit operand (no source operand). */
348#define emulate_1op(_op, _dst, _eflags) \
349 do { \
350 unsigned long _tmp; \
351 \
352 switch ( (_dst).bytes ) \
353 { \
354 case 1: \
355 __asm__ __volatile__ ( \
356 _PRE_EFLAGS("0","3","2") \
357 _op"b %1; " \
358 _POST_EFLAGS("0","3","2") \
359 : "=m" (_eflags), "=m" ((_dst).val), \
360 "=&r" (_tmp) \
361 : "i" (EFLAGS_MASK) ); \
362 break; \
363 case 2: \
364 __asm__ __volatile__ ( \
365 _PRE_EFLAGS("0","3","2") \
366 _op"w %1; " \
367 _POST_EFLAGS("0","3","2") \
368 : "=m" (_eflags), "=m" ((_dst).val), \
369 "=&r" (_tmp) \
370 : "i" (EFLAGS_MASK) ); \
371 break; \
372 case 4: \
373 __asm__ __volatile__ ( \
374 _PRE_EFLAGS("0","3","2") \
375 _op"l %1; " \
376 _POST_EFLAGS("0","3","2") \
377 : "=m" (_eflags), "=m" ((_dst).val), \
378 "=&r" (_tmp) \
379 : "i" (EFLAGS_MASK) ); \
380 break; \
381 case 8: \
382 __emulate_1op_8byte(_op, _dst, _eflags); \
383 break; \
384 } \
385 } while (0)
386
387/* Emulate an instruction with quadword operands (x86/64 only). */
388#if defined(__x86_64__)
389#define __emulate_2op_8byte(_op, _src, _dst, _eflags, _qx, _qy) \
390 do { \
391 __asm__ __volatile__ ( \
392 _PRE_EFLAGS("0","4","2") \
393 _op"q %"_qx"3,%1; " \
394 _POST_EFLAGS("0","4","2") \
395 : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \
396 : _qy ((_src).val), "i" (EFLAGS_MASK) ); \
397 } while (0)
398
399#define __emulate_1op_8byte(_op, _dst, _eflags) \
400 do { \
401 __asm__ __volatile__ ( \
402 _PRE_EFLAGS("0","3","2") \
403 _op"q %1; " \
404 _POST_EFLAGS("0","3","2") \
405 : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \
406 : "i" (EFLAGS_MASK) ); \
407 } while (0)
408
409#elif defined(__i386__)
410#define __emulate_2op_8byte(_op, _src, _dst, _eflags, _qx, _qy)
411#define __emulate_1op_8byte(_op, _dst, _eflags)
412#endif /* __i386__ */
413
414/* Fetch next part of the instruction being emulated. */
415#define insn_fetch(_type, _size, _eip) \
416({ unsigned long _x; \
417 rc = ops->read_std((unsigned long)(_eip) + ctxt->cs_base, &_x, \
418 (_size), ctxt); \
419 if ( rc != 0 ) \
420 goto done; \
421 (_eip) += (_size); \
422 (_type)_x; \
423})
424
425/* Access/update address held in a register, based on addressing mode. */
426#define register_address(base, reg) \
427 ((base) + ((ad_bytes == sizeof(unsigned long)) ? (reg) : \
428 ((reg) & ((1UL << (ad_bytes << 3)) - 1))))
429
430#define register_address_increment(reg, inc) \
431 do { \
432 /* signed type ensures sign extension to long */ \
433 int _inc = (inc); \
434 if ( ad_bytes == sizeof(unsigned long) ) \
435 (reg) += _inc; \
436 else \
437 (reg) = ((reg) & ~((1UL << (ad_bytes << 3)) - 1)) | \
438 (((reg) + _inc) & ((1UL << (ad_bytes << 3)) - 1)); \
439 } while (0)
440
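/*
 * Worked example, assuming a 64-bit host and 16-bit guest addressing
 * (ad_bytes == 2): if a register holds 0x1234ffff then
 * register_address(base, reg) masks it to base + 0xffff, and
 * register_address_increment(reg, 1) wraps only the low word, leaving
 * 0x12340000.
 */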
441void *decode_register(u8 modrm_reg, unsigned long *regs,
442 int highbyte_regs)
443{
444 void *p;
445
446 p = &regs[modrm_reg];
447 if (highbyte_regs && modrm_reg >= 4 && modrm_reg < 8)
448 p = (unsigned char *)&regs[modrm_reg & 3] + 1;
449 return p;
450}
451
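/*
 * Example (assuming the VCPU_REGS_* indices follow the hardware register
 * numbering, RAX = 0 ... RDI = 7): with highbyte_regs set (no REX prefix on
 * a byte op), decode_register(4, _regs, 1) points at the second byte of
 * _regs[VCPU_REGS_RAX], i.e. AH, matching the legacy encoding in which
 * registers 4-7 select AH/CH/DH/BH; with highbyte_regs clear it simply
 * returns &_regs[4] (RSP/ESP).
 */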
452static int read_descriptor(struct x86_emulate_ctxt *ctxt,
453 struct x86_emulate_ops *ops,
454 void *ptr,
455 u16 *size, unsigned long *address, int op_bytes)
456{
457 int rc;
458
459 if (op_bytes == 2)
460 op_bytes = 3;
461 *address = 0;
462 rc = ops->read_std((unsigned long)ptr, (unsigned long *)size, 2, ctxt);
463 if (rc)
464 return rc;
465 rc = ops->read_std((unsigned long)ptr + 2, address, op_bytes, ctxt);
466 return rc;
467}
468
469int
470x86_emulate_memop(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
471{
472 u8 b, d, sib, twobyte = 0, rex_prefix = 0;
473 u8 modrm, modrm_mod = 0, modrm_reg = 0, modrm_rm = 0;
474 unsigned long *override_base = NULL;
475 unsigned int op_bytes, ad_bytes, lock_prefix = 0, rep_prefix = 0, i;
476 int rc = 0;
477 struct operand src, dst;
478 unsigned long cr2 = ctxt->cr2;
479 int mode = ctxt->mode;
480 unsigned long modrm_ea;
481 int use_modrm_ea, index_reg = 0, base_reg = 0, scale, rip_relative = 0;
482
483 /* Shadow copy of register state. Committed on successful emulation. */
484 unsigned long _regs[NR_VCPU_REGS];
485 unsigned long _eip = ctxt->vcpu->rip, _eflags = ctxt->eflags;
486 unsigned long modrm_val = 0;
487
488 memcpy(_regs, ctxt->vcpu->regs, sizeof _regs);
489
490 switch (mode) {
491 case X86EMUL_MODE_REAL:
492 case X86EMUL_MODE_PROT16:
493 op_bytes = ad_bytes = 2;
494 break;
495 case X86EMUL_MODE_PROT32:
496 op_bytes = ad_bytes = 4;
497 break;
498#ifdef __x86_64__
499 case X86EMUL_MODE_PROT64:
500 op_bytes = 4;
501 ad_bytes = 8;
502 break;
503#endif
504 default:
505 return -1;
506 }
507
508 /* Legacy prefixes. */
509 for (i = 0; i < 8; i++) {
510 switch (b = insn_fetch(u8, 1, _eip)) {
511 case 0x66: /* operand-size override */
512 op_bytes ^= 6; /* switch between 2/4 bytes */
513 break;
514 case 0x67: /* address-size override */
515 if (mode == X86EMUL_MODE_PROT64)
516 ad_bytes ^= 12; /* switch between 4/8 bytes */
517 else
518 ad_bytes ^= 6; /* switch between 2/4 bytes */
519 break;
520 case 0x2e: /* CS override */
521 override_base = &ctxt->cs_base;
522 break;
523 case 0x3e: /* DS override */
524 override_base = &ctxt->ds_base;
525 break;
526 case 0x26: /* ES override */
527 override_base = &ctxt->es_base;
528 break;
529 case 0x64: /* FS override */
530 override_base = &ctxt->fs_base;
531 break;
532 case 0x65: /* GS override */
533 override_base = &ctxt->gs_base;
534 break;
535 case 0x36: /* SS override */
536 override_base = &ctxt->ss_base;
537 break;
538 case 0xf0: /* LOCK */
539 lock_prefix = 1;
540 break;
541 case 0xf3: /* REP/REPE/REPZ */
542 rep_prefix = 1;
543 break;
544 case 0xf2: /* REPNE/REPNZ */
545 break;
546 default:
547 goto done_prefixes;
548 }
549 }
550
551done_prefixes:
552
553 /* REX prefix. */
554 if ((mode == X86EMUL_MODE_PROT64) && ((b & 0xf0) == 0x40)) {
555 rex_prefix = b;
556 if (b & 8)
557 op_bytes = 8; /* REX.W */
558 modrm_reg = (b & 4) << 1; /* REX.R */
559 index_reg = (b & 2) << 2; /* REX.X */
560 modrm_rm = base_reg = (b & 1) << 3; /* REX.B */
561 b = insn_fetch(u8, 1, _eip);
562 }
563
564 /* Opcode byte(s). */
565 d = opcode_table[b];
566 if (d == 0) {
567 /* Two-byte opcode? */
568 if (b == 0x0f) {
569 twobyte = 1;
570 b = insn_fetch(u8, 1, _eip);
571 d = twobyte_table[b];
572 }
573
574 /* Unrecognised? */
575 if (d == 0)
576 goto cannot_emulate;
577 }
578
579 /* ModRM and SIB bytes. */
580 if (d & ModRM) {
581 modrm = insn_fetch(u8, 1, _eip);
582 modrm_mod |= (modrm & 0xc0) >> 6;
583 modrm_reg |= (modrm & 0x38) >> 3;
584 modrm_rm |= (modrm & 0x07);
585 modrm_ea = 0;
586 use_modrm_ea = 1;
587
588 if (modrm_mod == 3) {
589 modrm_val = *(unsigned long *)
590 decode_register(modrm_rm, _regs, d & ByteOp);
591 goto modrm_done;
592 }
593
594 if (ad_bytes == 2) {
595 unsigned bx = _regs[VCPU_REGS_RBX];
596 unsigned bp = _regs[VCPU_REGS_RBP];
597 unsigned si = _regs[VCPU_REGS_RSI];
598 unsigned di = _regs[VCPU_REGS_RDI];
599
600 /* 16-bit ModR/M decode. */
601 switch (modrm_mod) {
602 case 0:
603 if (modrm_rm == 6)
604 modrm_ea += insn_fetch(u16, 2, _eip);
605 break;
606 case 1:
607 modrm_ea += insn_fetch(s8, 1, _eip);
608 break;
609 case 2:
610 modrm_ea += insn_fetch(u16, 2, _eip);
611 break;
612 }
613 switch (modrm_rm) {
614 case 0:
615 modrm_ea += bx + si;
616 break;
617 case 1:
618 modrm_ea += bx + di;
619 break;
620 case 2:
621 modrm_ea += bp + si;
622 break;
623 case 3:
624 modrm_ea += bp + di;
625 break;
626 case 4:
627 modrm_ea += si;
628 break;
629 case 5:
630 modrm_ea += di;
631 break;
632 case 6:
633 if (modrm_mod != 0)
634 modrm_ea += bp;
635 break;
636 case 7:
637 modrm_ea += bx;
638 break;
639 }
640 if (modrm_rm == 2 || modrm_rm == 3 ||
641 (modrm_rm == 6 && modrm_mod != 0))
642 if (!override_base)
643 override_base = &ctxt->ss_base;
644 modrm_ea = (u16)modrm_ea;
645 } else {
646 /* 32/64-bit ModR/M decode. */
647 switch (modrm_rm) {
648 case 4:
649 case 12:
650 sib = insn_fetch(u8, 1, _eip);
651 index_reg |= (sib >> 3) & 7;
652 base_reg |= sib & 7;
653 scale = sib >> 6;
654
655 switch (base_reg) {
656 case 5:
657 if (modrm_mod != 0)
658 modrm_ea += _regs[base_reg];
659 else
660 modrm_ea += insn_fetch(s32, 4, _eip);
661 break;
662 default:
663 modrm_ea += _regs[base_reg];
664 }
665 switch (index_reg) {
666 case 4:
667 break;
668 default:
669 modrm_ea += _regs[index_reg] << scale;
670
671 }
672 break;
673 case 5:
674 if (modrm_mod != 0)
675 modrm_ea += _regs[modrm_rm];
676 else if (mode == X86EMUL_MODE_PROT64)
677 rip_relative = 1;
678 break;
679 default:
680 modrm_ea += _regs[modrm_rm];
681 break;
682 }
683 switch (modrm_mod) {
684 case 0:
685 if (modrm_rm == 5)
686 modrm_ea += insn_fetch(s32, 4, _eip);
687 break;
688 case 1:
689 modrm_ea += insn_fetch(s8, 1, _eip);
690 break;
691 case 2:
692 modrm_ea += insn_fetch(s32, 4, _eip);
693 break;
694 }
695 }
696 if (!override_base)
697 override_base = &ctxt->ds_base;
698 if (mode == X86EMUL_MODE_PROT64 &&
699 override_base != &ctxt->fs_base &&
700 override_base != &ctxt->gs_base)
701 override_base = NULL;
702
703 if (override_base)
704 modrm_ea += *override_base;
705
706 if (rip_relative) {
707 modrm_ea += _eip;
708 switch (d & SrcMask) {
709 case SrcImmByte:
710 modrm_ea += 1;
711 break;
712 case SrcImm:
713 if (d & ByteOp)
714 modrm_ea += 1;
715 else
716 if (op_bytes == 8)
717 modrm_ea += 4;
718 else
719 modrm_ea += op_bytes;
720 }
721 }
722 if (ad_bytes != 8)
723 modrm_ea = (u32)modrm_ea;
724 cr2 = modrm_ea;
725 modrm_done:
726 ;
727 }
728
729 /* Decode and fetch the destination operand: register or memory. */
730 switch (d & DstMask) {
731 case ImplicitOps:
732 /* Special instructions do their own operand decoding. */
733 goto special_insn;
734 case DstReg:
735 dst.type = OP_REG;
736 if ((d & ByteOp)
737 && !(twobyte && (b == 0xb6 || b == 0xb7))) {
738 dst.ptr = decode_register(modrm_reg, _regs,
739 (rex_prefix == 0));
740 dst.val = *(u8 *) dst.ptr;
741 dst.bytes = 1;
742 } else {
743 dst.ptr = decode_register(modrm_reg, _regs, 0);
744 switch ((dst.bytes = op_bytes)) {
745 case 2:
746 dst.val = *(u16 *)dst.ptr;
747 break;
748 case 4:
749 dst.val = *(u32 *)dst.ptr;
750 break;
751 case 8:
752 dst.val = *(u64 *)dst.ptr;
753 break;
754 }
755 }
756 break;
757 case DstMem:
758 dst.type = OP_MEM;
759 dst.ptr = (unsigned long *)cr2;
760 dst.bytes = (d & ByteOp) ? 1 : op_bytes;
761 if (!(d & Mov) && /* optimisation - avoid slow emulated read */
762 ((rc = ops->read_emulated((unsigned long)dst.ptr,
763 &dst.val, dst.bytes, ctxt)) != 0))
764 goto done;
765 break;
766 }
767 dst.orig_val = dst.val;
768
769 /*
770 * Decode and fetch the source operand: register, memory
771 * or immediate.
772 */
773 switch (d & SrcMask) {
774 case SrcNone:
775 break;
776 case SrcReg:
777 src.type = OP_REG;
778 if (d & ByteOp) {
779 src.ptr = decode_register(modrm_reg, _regs,
780 (rex_prefix == 0));
781 src.val = src.orig_val = *(u8 *) src.ptr;
782 src.bytes = 1;
783 } else {
784 src.ptr = decode_register(modrm_reg, _regs, 0);
785 switch ((src.bytes = op_bytes)) {
786 case 2:
787 src.val = src.orig_val = *(u16 *) src.ptr;
788 break;
789 case 4:
790 src.val = src.orig_val = *(u32 *) src.ptr;
791 break;
792 case 8:
793 src.val = src.orig_val = *(u64 *) src.ptr;
794 break;
795 }
796 }
797 break;
798 case SrcMem16:
799 src.bytes = 2;
800 goto srcmem_common;
801 case SrcMem32:
802 src.bytes = 4;
803 goto srcmem_common;
804 case SrcMem:
805 src.bytes = (d & ByteOp) ? 1 : op_bytes;
806 srcmem_common:
807 src.type = OP_MEM;
808 src.ptr = (unsigned long *)cr2;
809 if ((rc = ops->read_emulated((unsigned long)src.ptr,
810 &src.val, src.bytes, ctxt)) != 0)
811 goto done;
812 src.orig_val = src.val;
813 break;
814 case SrcImm:
815 src.type = OP_IMM;
816 src.ptr = (unsigned long *)_eip;
817 src.bytes = (d & ByteOp) ? 1 : op_bytes;
818 if (src.bytes == 8)
819 src.bytes = 4;
820 /* NB. Immediates are sign-extended as necessary. */
821 switch (src.bytes) {
822 case 1:
823 src.val = insn_fetch(s8, 1, _eip);
824 break;
825 case 2:
826 src.val = insn_fetch(s16, 2, _eip);
827 break;
828 case 4:
829 src.val = insn_fetch(s32, 4, _eip);
830 break;
831 }
832 break;
833 case SrcImmByte:
834 src.type = OP_IMM;
835 src.ptr = (unsigned long *)_eip;
836 src.bytes = 1;
837 src.val = insn_fetch(s8, 1, _eip);
838 break;
839 }
840
841 if (twobyte)
842 goto twobyte_insn;
843
844 switch (b) {
845 case 0x00 ... 0x05:
846 add: /* add */
847 emulate_2op_SrcV("add", src, dst, _eflags);
848 break;
849 case 0x08 ... 0x0d:
850 or: /* or */
851 emulate_2op_SrcV("or", src, dst, _eflags);
852 break;
853 case 0x10 ... 0x15:
854 adc: /* adc */
855 emulate_2op_SrcV("adc", src, dst, _eflags);
856 break;
857 case 0x18 ... 0x1d:
858 sbb: /* sbb */
859 emulate_2op_SrcV("sbb", src, dst, _eflags);
860 break;
861 case 0x20 ... 0x25:
862 and: /* and */
863 emulate_2op_SrcV("and", src, dst, _eflags);
864 break;
865 case 0x28 ... 0x2d:
866 sub: /* sub */
867 emulate_2op_SrcV("sub", src, dst, _eflags);
868 break;
869 case 0x30 ... 0x35:
870 xor: /* xor */
871 emulate_2op_SrcV("xor", src, dst, _eflags);
872 break;
873 case 0x38 ... 0x3d:
874 cmp: /* cmp */
875 emulate_2op_SrcV("cmp", src, dst, _eflags);
876 break;
877 case 0x63: /* movsxd */
878 if (mode != X86EMUL_MODE_PROT64)
879 goto cannot_emulate;
880 dst.val = (s32) src.val;
881 break;
882 case 0x80 ... 0x83: /* Grp1 */
883 switch (modrm_reg) {
884 case 0:
885 goto add;
886 case 1:
887 goto or;
888 case 2:
889 goto adc;
890 case 3:
891 goto sbb;
892 case 4:
893 goto and;
894 case 5:
895 goto sub;
896 case 6:
897 goto xor;
898 case 7:
899 goto cmp;
900 }
901 break;
902 case 0x84 ... 0x85:
903 test: /* test */
904 emulate_2op_SrcV("test", src, dst, _eflags);
905 break;
906 case 0x86 ... 0x87: /* xchg */
907 /* Write back the register source. */
908 switch (dst.bytes) {
909 case 1:
910 *(u8 *) src.ptr = (u8) dst.val;
911 break;
912 case 2:
913 *(u16 *) src.ptr = (u16) dst.val;
914 break;
915 case 4:
916 *src.ptr = (u32) dst.val;
917 break; /* 64b reg: zero-extend */
918 case 8:
919 *src.ptr = dst.val;
920 break;
921 }
922 /*
923 * Write back the memory destination with implicit LOCK
924 * prefix.
925 */
926 dst.val = src.val;
927 lock_prefix = 1;
928 break;
929 case 0xa0 ... 0xa1: /* mov */
930 dst.ptr = (unsigned long *)&_regs[VCPU_REGS_RAX];
931 dst.val = src.val;
932 _eip += ad_bytes; /* skip src displacement */
933 break;
934 case 0xa2 ... 0xa3: /* mov */
935 dst.val = (unsigned long)_regs[VCPU_REGS_RAX];
936 _eip += ad_bytes; /* skip dst displacement */
937 break;
938 case 0x88 ... 0x8b: /* mov */
939 case 0xc6 ... 0xc7: /* mov (sole member of Grp11) */
940 dst.val = src.val;
941 break;
942 case 0x8f: /* pop (sole member of Grp1a) */
943 /* 64-bit mode: POP always pops a 64-bit operand. */
944 if (mode == X86EMUL_MODE_PROT64)
945 dst.bytes = 8;
946 if ((rc = ops->read_std(register_address(ctxt->ss_base,
947 _regs[VCPU_REGS_RSP]),
948 &dst.val, dst.bytes, ctxt)) != 0)
949 goto done;
950 register_address_increment(_regs[VCPU_REGS_RSP], dst.bytes);
951 break;
952 case 0xc0 ... 0xc1:
953 grp2: /* Grp2 */
954 switch (modrm_reg) {
955 case 0: /* rol */
956 emulate_2op_SrcB("rol", src, dst, _eflags);
957 break;
958 case 1: /* ror */
959 emulate_2op_SrcB("ror", src, dst, _eflags);
960 break;
961 case 2: /* rcl */
962 emulate_2op_SrcB("rcl", src, dst, _eflags);
963 break;
964 case 3: /* rcr */
965 emulate_2op_SrcB("rcr", src, dst, _eflags);
966 break;
967 case 4: /* sal/shl */
968 case 6: /* sal/shl */
969 emulate_2op_SrcB("sal", src, dst, _eflags);
970 break;
971 case 5: /* shr */
972 emulate_2op_SrcB("shr", src, dst, _eflags);
973 break;
974 case 7: /* sar */
975 emulate_2op_SrcB("sar", src, dst, _eflags);
976 break;
977 }
978 break;
979 case 0xd0 ... 0xd1: /* Grp2 */
980 src.val = 1;
981 goto grp2;
982 case 0xd2 ... 0xd3: /* Grp2 */
983 src.val = _regs[VCPU_REGS_RCX];
984 goto grp2;
985 case 0xf6 ... 0xf7: /* Grp3 */
986 switch (modrm_reg) {
987 case 0 ... 1: /* test */
988 /*
989 * Special case in Grp3: test has an immediate
990 * source operand.
991 */
992 src.type = OP_IMM;
993 src.ptr = (unsigned long *)_eip;
994 src.bytes = (d & ByteOp) ? 1 : op_bytes;
995 if (src.bytes == 8)
996 src.bytes = 4;
997 switch (src.bytes) {
998 case 1:
999 src.val = insn_fetch(s8, 1, _eip);
1000 break;
1001 case 2:
1002 src.val = insn_fetch(s16, 2, _eip);
1003 break;
1004 case 4:
1005 src.val = insn_fetch(s32, 4, _eip);
1006 break;
1007 }
1008 goto test;
1009 case 2: /* not */
1010 dst.val = ~dst.val;
1011 break;
1012 case 3: /* neg */
1013 emulate_1op("neg", dst, _eflags);
1014 break;
1015 default:
1016 goto cannot_emulate;
1017 }
1018 break;
1019 case 0xfe ... 0xff: /* Grp4/Grp5 */
1020 switch (modrm_reg) {
1021 case 0: /* inc */
1022 emulate_1op("inc", dst, _eflags);
1023 break;
1024 case 1: /* dec */
1025 emulate_1op("dec", dst, _eflags);
1026 break;
1027 case 6: /* push */
1028 /* 64-bit mode: PUSH always pushes a 64-bit operand. */
1029 if (mode == X86EMUL_MODE_PROT64) {
1030 dst.bytes = 8;
1031 if ((rc = ops->read_std((unsigned long)dst.ptr,
1032 &dst.val, 8,
1033 ctxt)) != 0)
1034 goto done;
1035 }
1036 register_address_increment(_regs[VCPU_REGS_RSP],
1037 -dst.bytes);
1038 if ((rc = ops->write_std(
1039 register_address(ctxt->ss_base,
1040 _regs[VCPU_REGS_RSP]),
1041 dst.val, dst.bytes, ctxt)) != 0)
1042 goto done;
1043 dst.val = dst.orig_val; /* skanky: disable writeback */
1044 break;
1045 default:
1046 goto cannot_emulate;
1047 }
1048 break;
1049 }
1050
1051writeback:
1052 if ((d & Mov) || (dst.orig_val != dst.val)) {
1053 switch (dst.type) {
1054 case OP_REG:
1055 /* The 4-byte case *is* correct: in 64-bit mode we zero-extend. */
1056 switch (dst.bytes) {
1057 case 1:
1058 *(u8 *)dst.ptr = (u8)dst.val;
1059 break;
1060 case 2:
1061 *(u16 *)dst.ptr = (u16)dst.val;
1062 break;
1063 case 4:
1064 *dst.ptr = (u32)dst.val;
1065 break; /* 64b: zero-ext */
1066 case 8:
1067 *dst.ptr = dst.val;
1068 break;
1069 }
1070 break;
1071 case OP_MEM:
1072 if (lock_prefix)
1073 rc = ops->cmpxchg_emulated((unsigned long)dst.
1074 ptr, dst.orig_val,
1075 dst.val, dst.bytes,
1076 ctxt);
1077 else
1078 rc = ops->write_emulated((unsigned long)dst.ptr,
1079 dst.val, dst.bytes,
1080 ctxt);
1081 if (rc != 0)
1082 goto done;
1083 default:
1084 break;
1085 }
1086 }
1087
1088 /* Commit shadow register state. */
1089 memcpy(ctxt->vcpu->regs, _regs, sizeof _regs);
1090 ctxt->eflags = _eflags;
1091 ctxt->vcpu->rip = _eip;
1092
1093done:
1094 return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
1095
1096special_insn:
1097 if (twobyte)
1098 goto twobyte_special_insn;
1099 if (rep_prefix) {
1100 if (_regs[VCPU_REGS_RCX] == 0) {
1101 ctxt->vcpu->rip = _eip;
1102 goto done;
1103 }
1104 _regs[VCPU_REGS_RCX]--;
1105 _eip = ctxt->vcpu->rip;
1106 }
1107 switch (b) {
1108 case 0xa4 ... 0xa5: /* movs */
1109 dst.type = OP_MEM;
1110 dst.bytes = (d & ByteOp) ? 1 : op_bytes;
1111 dst.ptr = (unsigned long *)register_address(ctxt->es_base,
1112 _regs[VCPU_REGS_RDI]);
1113 if ((rc = ops->read_emulated(register_address(
1114 override_base ? *override_base : ctxt->ds_base,
1115 _regs[VCPU_REGS_RSI]), &dst.val, dst.bytes, ctxt)) != 0)
1116 goto done;
1117 register_address_increment(_regs[VCPU_REGS_RSI],
1118 (_eflags & EFLG_DF) ? -dst.bytes : dst.bytes);
1119 register_address_increment(_regs[VCPU_REGS_RDI],
1120 (_eflags & EFLG_DF) ? -dst.bytes : dst.bytes);
1121 break;
1122 case 0xa6 ... 0xa7: /* cmps */
1123 DPRINTF("Urk! I don't handle CMPS.\n");
1124 goto cannot_emulate;
1125 case 0xaa ... 0xab: /* stos */
1126 dst.type = OP_MEM;
1127 dst.bytes = (d & ByteOp) ? 1 : op_bytes;
1128 dst.ptr = (unsigned long *)cr2;
1129 dst.val = _regs[VCPU_REGS_RAX];
1130 register_address_increment(_regs[VCPU_REGS_RDI],
1131 (_eflags & EFLG_DF) ? -dst.bytes : dst.bytes);
1132 break;
1133 case 0xac ... 0xad: /* lods */
1134 dst.type = OP_REG;
1135 dst.bytes = (d & ByteOp) ? 1 : op_bytes;
1136 dst.ptr = (unsigned long *)&_regs[VCPU_REGS_RAX];
1137 if ((rc = ops->read_emulated(cr2, &dst.val, dst.bytes, ctxt)) != 0)
1138 goto done;
1139 register_address_increment(_regs[VCPU_REGS_RSI],
1140 (_eflags & EFLG_DF) ? -dst.bytes : dst.bytes);
1141 break;
1142 case 0xae ... 0xaf: /* scas */
1143 DPRINTF("Urk! I don't handle SCAS.\n");
1144 goto cannot_emulate;
1145 }
1146 goto writeback;
1147
1148twobyte_insn:
1149 switch (b) {
1150 case 0x01: /* lgdt, lidt, lmsw */
1151 switch (modrm_reg) {
1152 u16 size;
1153 unsigned long address;
1154
1155 case 2: /* lgdt */
1156 rc = read_descriptor(ctxt, ops, src.ptr,
1157 &size, &address, op_bytes);
1158 if (rc)
1159 goto done;
1160 realmode_lgdt(ctxt->vcpu, size, address);
1161 break;
1162 case 3: /* lidt */
1163 rc = read_descriptor(ctxt, ops, src.ptr,
1164 &size, &address, op_bytes);
1165 if (rc)
1166 goto done;
1167 realmode_lidt(ctxt->vcpu, size, address);
1168 break;
1169 case 4: /* smsw */
1170 if (modrm_mod != 3)
1171 goto cannot_emulate;
1172 *(u16 *)&_regs[modrm_rm]
1173 = realmode_get_cr(ctxt->vcpu, 0);
1174 break;
1175 case 6: /* lmsw */
1176 if (modrm_mod != 3)
1177 goto cannot_emulate;
1178 realmode_lmsw(ctxt->vcpu, (u16)modrm_val, &_eflags);
1179 break;
1180 case 7: /* invlpg*/
1181 emulate_invlpg(ctxt->vcpu, cr2);
1182 break;
1183 default:
1184 goto cannot_emulate;
1185 }
1186 break;
1187 case 0x21: /* mov from dr to reg */
1188 if (modrm_mod != 3)
1189 goto cannot_emulate;
1190 rc = emulator_get_dr(ctxt, modrm_reg, &_regs[modrm_rm]);
1191 break;
1192 case 0x23: /* mov from reg to dr */
1193 if (modrm_mod != 3)
1194 goto cannot_emulate;
1195 rc = emulator_set_dr(ctxt, modrm_reg, _regs[modrm_rm]);
1196 break;
1197 case 0x40 ... 0x4f: /* cmov */
1198 dst.val = dst.orig_val = src.val;
1199 d &= ~Mov; /* default to no move */
1200 /*
1201 * First, assume we're decoding an even cmov opcode
1202 * (lsb == 0).
1203 */
1204 switch ((b & 15) >> 1) {
1205 case 0: /* cmovo */
1206 d |= (_eflags & EFLG_OF) ? Mov : 0;
1207 break;
1208 case 1: /* cmovb/cmovc/cmovnae */
1209 d |= (_eflags & EFLG_CF) ? Mov : 0;
1210 break;
1211 case 2: /* cmovz/cmove */
1212 d |= (_eflags & EFLG_ZF) ? Mov : 0;
1213 break;
1214 case 3: /* cmovbe/cmovna */
1215 d |= (_eflags & (EFLG_CF | EFLG_ZF)) ? Mov : 0;
1216 break;
1217 case 4: /* cmovs */
1218 d |= (_eflags & EFLG_SF) ? Mov : 0;
1219 break;
1220 case 5: /* cmovp/cmovpe */
1221 d |= (_eflags & EFLG_PF) ? Mov : 0;
1222 break;
1223 case 7: /* cmovle/cmovng */
1224 d |= (_eflags & EFLG_ZF) ? Mov : 0;
1225 /* fall through */
1226 case 6: /* cmovl/cmovnge */
1227 d |= (!(_eflags & EFLG_SF) !=
1228 !(_eflags & EFLG_OF)) ? Mov : 0;
1229 break;
1230 }
1231 /* Odd cmov opcodes (lsb == 1) have inverted sense. */
1232 d ^= (b & 1) ? Mov : 0;
1233 break;
1234 case 0xb0 ... 0xb1: /* cmpxchg */
1235 /*
1236 * Save real source value, then compare EAX against
1237 * destination.
1238 */
1239 src.orig_val = src.val;
1240 src.val = _regs[VCPU_REGS_RAX];
1241 emulate_2op_SrcV("cmp", src, dst, _eflags);
1242 /* Always write back. The question is: where to? */
1243 d |= Mov;
1244 if (_eflags & EFLG_ZF) {
1245 /* Success: write back to memory. */
1246 dst.val = src.orig_val;
1247 } else {
1248 /* Failure: write the value we saw to EAX. */
1249 dst.type = OP_REG;
1250 dst.ptr = (unsigned long *)&_regs[VCPU_REGS_RAX];
1251 }
1252 break;
1253 case 0xa3:
1254 bt: /* bt */
1255 src.val &= (dst.bytes << 3) - 1; /* only subword offset */
1256 emulate_2op_SrcV_nobyte("bt", src, dst, _eflags);
1257 break;
1258 case 0xb3:
1259 btr: /* btr */
1260 src.val &= (dst.bytes << 3) - 1; /* only subword offset */
1261 emulate_2op_SrcV_nobyte("btr", src, dst, _eflags);
1262 break;
1263 case 0xab:
1264 bts: /* bts */
1265 src.val &= (dst.bytes << 3) - 1; /* only subword offset */
1266 emulate_2op_SrcV_nobyte("bts", src, dst, _eflags);
1267 break;
1268 case 0xb6 ... 0xb7: /* movzx */
1269 dst.bytes = op_bytes;
1270 dst.val = (d & ByteOp) ? (u8) src.val : (u16) src.val;
1271 break;
1272 case 0xbb:
1273 btc: /* btc */
1274 src.val &= (dst.bytes << 3) - 1; /* only subword offset */
1275 emulate_2op_SrcV_nobyte("btc", src, dst, _eflags);
1276 break;
1277 case 0xba: /* Grp8 */
1278 switch (modrm_reg & 3) {
1279 case 0:
1280 goto bt;
1281 case 1:
1282 goto bts;
1283 case 2:
1284 goto btr;
1285 case 3:
1286 goto btc;
1287 }
1288 break;
1289 case 0xbe ... 0xbf: /* movsx */
1290 dst.bytes = op_bytes;
1291 dst.val = (d & ByteOp) ? (s8) src.val : (s16) src.val;
1292 break;
1293 }
1294 goto writeback;
1295
1296twobyte_special_insn:
1297 /* Disable writeback. */
1298 dst.orig_val = dst.val;
1299 switch (b) {
1300 case 0x0d: /* GrpP (prefetch) */
1301 case 0x18: /* Grp16 (prefetch/nop) */
1302 break;
1303 case 0x06:
1304 emulate_clts(ctxt->vcpu);
1305 break;
1306 case 0x20: /* mov cr, reg */
1307 if (modrm_mod != 3)
1308 goto cannot_emulate;
1309 _regs[modrm_rm] = realmode_get_cr(ctxt->vcpu, modrm_reg);
1310 break;
1311 case 0x22: /* mov reg, cr */
1312 if (modrm_mod != 3)
1313 goto cannot_emulate;
1314 realmode_set_cr(ctxt->vcpu, modrm_reg, modrm_val, &_eflags);
1315 break;
1316 case 0xc7: /* Grp9 (cmpxchg8b) */
1317#if defined(__i386__)
1318 {
1319 unsigned long old_lo, old_hi;
1320 if (((rc = ops->read_emulated(cr2 + 0, &old_lo, 4,
1321 ctxt)) != 0)
1322 || ((rc = ops->read_emulated(cr2 + 4, &old_hi, 4,
1323 ctxt)) != 0))
1324 goto done;
1325 if ((old_lo != _regs[VCPU_REGS_RAX])
1326 || (old_hi != _regs[VCPU_REGS_RDX])) {
1327 _regs[VCPU_REGS_RAX] = old_lo;
1328 _regs[VCPU_REGS_RDX] = old_hi;
1329 _eflags &= ~EFLG_ZF;
1330 } else if (ops->cmpxchg8b_emulated == NULL) {
1331 rc = X86EMUL_UNHANDLEABLE;
1332 goto done;
1333 } else {
1334 if ((rc = ops->cmpxchg8b_emulated(cr2, old_lo,
1335 old_hi,
1336 _regs[VCPU_REGS_RBX],
1337 _regs[VCPU_REGS_RCX],
1338 ctxt)) != 0)
1339 goto done;
1340 _eflags |= EFLG_ZF;
1341 }
1342 break;
1343 }
1344#elif defined(__x86_64__)
1345 {
1346 unsigned long old, new;
1347 if ((rc = ops->read_emulated(cr2, &old, 8, ctxt)) != 0)
1348 goto done;
1349 if (((u32) (old >> 0) != (u32) _regs[VCPU_REGS_RAX]) ||
1350 ((u32) (old >> 32) != (u32) _regs[VCPU_REGS_RDX])) {
1351 _regs[VCPU_REGS_RAX] = (u32) (old >> 0);
1352 _regs[VCPU_REGS_RDX] = (u32) (old >> 32);
1353 _eflags &= ~EFLG_ZF;
1354 } else {
1355 new = (_regs[VCPU_REGS_RCX] << 32) | (u32) _regs[VCPU_REGS_RBX];
1356 if ((rc = ops->cmpxchg_emulated(cr2, old,
1357 new, 8, ctxt)) != 0)
1358 goto done;
1359 _eflags |= EFLG_ZF;
1360 }
1361 break;
1362 }
1363#endif
1364 }
1365 goto writeback;
1366
1367cannot_emulate:
1368 DPRINTF("Cannot emulate %02x\n", b);
1369 return -1;
1370}
1371
1372#ifdef __XEN__
1373
1374#include <asm/mm.h>
1375#include <asm/uaccess.h>
1376
1377int
1378x86_emulate_read_std(unsigned long addr,
1379 unsigned long *val,
1380 unsigned int bytes, struct x86_emulate_ctxt *ctxt)
1381{
1382 unsigned int rc;
1383
1384 *val = 0;
1385
1386 if ((rc = copy_from_user((void *)val, (void *)addr, bytes)) != 0) {
1387 propagate_page_fault(addr + bytes - rc, 0); /* read fault */
1388 return X86EMUL_PROPAGATE_FAULT;
1389 }
1390
1391 return X86EMUL_CONTINUE;
1392}
1393
1394int
1395x86_emulate_write_std(unsigned long addr,
1396 unsigned long val,
1397 unsigned int bytes, struct x86_emulate_ctxt *ctxt)
1398{
1399 unsigned int rc;
1400
1401 if ((rc = copy_to_user((void *)addr, (void *)&val, bytes)) != 0) {
1402 propagate_page_fault(addr + bytes - rc, PGERR_write_access);
1403 return X86EMUL_PROPAGATE_FAULT;
1404 }
1405
1406 return X86EMUL_CONTINUE;
1407}
1408
1409#endif
diff --git a/drivers/kvm/x86_emulate.h b/drivers/kvm/x86_emulate.h
new file mode 100644
index 000000000000..658b58de30fc
--- /dev/null
+++ b/drivers/kvm/x86_emulate.h
@@ -0,0 +1,185 @@
1/******************************************************************************
2 * x86_emulate.h
3 *
4 * Generic x86 (32-bit and 64-bit) instruction decoder and emulator.
5 *
6 * Copyright (c) 2005 Keir Fraser
7 *
8 * From: xen-unstable 10676:af9809f51f81a3c43f276f00c81a52ef558afda4
9 */
10
11#ifndef __X86_EMULATE_H__
12#define __X86_EMULATE_H__
13
14struct x86_emulate_ctxt;
15
16/*
17 * x86_emulate_ops:
18 *
19 * These operations represent the instruction emulator's interface to memory.
20 * There are two categories of operation: those that act on ordinary memory
21 * regions (*_std), and those that act on memory regions known to require
22 * special treatment or emulation (*_emulated).
23 *
24 * The emulator assumes that an instruction accesses only one 'emulated memory'
25 * location, that this location is the given linear faulting address (cr2), and
26 * that this is one of the instruction's data operands. Instruction fetches and
27 * stack operations are assumed never to access emulated memory. The emulator
28 * automatically deduces which operand of a string-move operation is accessing
29 * emulated memory, and assumes that the other operand accesses normal memory.
30 *
31 * NOTES:
32 * 1. The emulator isn't very smart about emulated vs. standard memory.
33 * 'Emulated memory' access addresses should be checked for sanity.
34 * 'Normal memory' accesses may fault, and the caller must arrange to
35 * detect and handle reentrancy into the emulator via recursive faults.
36 * Accesses may be unaligned and may cross page boundaries.
37 * 2. If the access fails (cannot emulate, or a standard access faults) then
38 * it is up to the memop to propagate the fault to the guest VM via
39 * some out-of-band mechanism, unknown to the emulator. The memop signals
40 * failure by returning X86EMUL_PROPAGATE_FAULT to the emulator, which will
41 * then immediately bail.
42 * 3. Valid access sizes are 1, 2, 4 and 8 bytes. On x86/32 systems only
43 * cmpxchg8b_emulated need support 8-byte accesses.
44 * 4. The emulator cannot handle 64-bit mode emulation on an x86/32 system.
45 */
46/* Access completed successfully: continue emulation as normal. */
47#define X86EMUL_CONTINUE 0
48/* Access is unhandleable: bail from emulation and return error to caller. */
49#define X86EMUL_UNHANDLEABLE 1
50/* Terminate emulation but return success to the caller. */
51#define X86EMUL_PROPAGATE_FAULT 2 /* propagate a generated fault to guest */
52#define X86EMUL_RETRY_INSTR 2 /* retry the instruction for some reason */
53#define X86EMUL_CMPXCHG_FAILED 2 /* cmpxchg did not see expected value */
54struct x86_emulate_ops {
55 /*
56 * read_std: Read bytes of standard (non-emulated/special) memory.
57 * Used for instruction fetch, stack operations, and others.
58 * @addr: [IN ] Linear address from which to read.
59 * @val: [OUT] Value read from memory, zero-extended to 'u_long'.
60 * @bytes: [IN ] Number of bytes to read from memory.
61 */
62 int (*read_std)(unsigned long addr,
63 unsigned long *val,
64 unsigned int bytes, struct x86_emulate_ctxt * ctxt);
65
66 /*
67 * write_std: Write bytes of standard (non-emulated/special) memory.
68 * Used for stack operations, and others.
69 * @addr: [IN ] Linear address to which to write.
70 * @val: [IN ] Value to write to memory (low-order bytes used as
71 * required).
72 * @bytes: [IN ] Number of bytes to write to memory.
73 */
74 int (*write_std)(unsigned long addr,
75 unsigned long val,
76 unsigned int bytes, struct x86_emulate_ctxt * ctxt);
77
78 /*
79 * read_emulated: Read bytes from emulated/special memory area.
80 * @addr: [IN ] Linear address from which to read.
81 * @val: [OUT] Value read from memory, zero-extended to 'u_long'.
82 * @bytes: [IN ] Number of bytes to read from memory.
83 */
84 int (*read_emulated) (unsigned long addr,
85 unsigned long *val,
86 unsigned int bytes,
87 struct x86_emulate_ctxt * ctxt);
88
89 /*
90 * write_emulated: Write bytes to emulated/special memory area.
91 * @addr: [IN ] Linear address to which to write.
92 * @val: [IN ] Value to write to memory (low-order bytes used as
93 * required).
94 * @bytes: [IN ] Number of bytes to write to memory.
95 */
96 int (*write_emulated) (unsigned long addr,
97 unsigned long val,
98 unsigned int bytes,
99 struct x86_emulate_ctxt * ctxt);
100
101 /*
102 * cmpxchg_emulated: Emulate an atomic (LOCKed) CMPXCHG operation on an
103 * emulated/special memory area.
104 * @addr: [IN ] Linear address to access.
105 * @old: [IN ] Value expected to be current at @addr.
106 * @new: [IN ] Value to write to @addr.
107 * @bytes: [IN ] Number of bytes to access using CMPXCHG.
108 */
109 int (*cmpxchg_emulated) (unsigned long addr,
110 unsigned long old,
111 unsigned long new,
112 unsigned int bytes,
113 struct x86_emulate_ctxt * ctxt);
114
115 /*
116 * cmpxchg8b_emulated: Emulate an atomic (LOCKed) CMPXCHG8B operation on an
117 * emulated/special memory area.
118 * @addr: [IN ] Linear address to access.
119 * @old: [IN ] Value expected to be current at @addr.
120 * @new: [IN ] Value to write to @addr.
121 * NOTES:
122 * 1. This function is only ever called when emulating a real CMPXCHG8B.
123 * 2. This function is *never* called on x86/64 systems.
124 * 3. Not defining this function (i.e., specifying NULL) is equivalent
125 * to defining a function that always returns X86EMUL_UNHANDLEABLE.
126 */
127 int (*cmpxchg8b_emulated) (unsigned long addr,
128 unsigned long old_lo,
129 unsigned long old_hi,
130 unsigned long new_lo,
131 unsigned long new_hi,
132 struct x86_emulate_ctxt * ctxt);
133};
134
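/*
 * Minimal sketch of an ops implementation for a flat, fully mapped test
 * harness in which every access is treated as ordinary host memory and
 * cmpxchg is emulated non-atomically.  The guard macro and the sample_*
 * names are hypothetical; a real caller (such as the kvm driver) supplies
 * ops that go through proper guest-memory accessors instead.
 */
#ifdef X86_EMULATE_SAMPLE_OPS
static void sample_copy(void *dst, const void *src, unsigned int bytes)
{
	unsigned char *d = dst;
	const unsigned char *s = src;

	while (bytes--)
		*d++ = *s++;
}

static int sample_read(unsigned long addr, unsigned long *val,
		       unsigned int bytes, struct x86_emulate_ctxt *ctxt)
{
	*val = 0;			/* zero-extend (x86 is little-endian) */
	sample_copy(val, (void *)addr, bytes);
	return X86EMUL_CONTINUE;
}

static int sample_write(unsigned long addr, unsigned long val,
			unsigned int bytes, struct x86_emulate_ctxt *ctxt)
{
	sample_copy((void *)addr, &val, bytes);	/* low-order bytes only */
	return X86EMUL_CONTINUE;
}

static int sample_cmpxchg(unsigned long addr, unsigned long old,
			  unsigned long new, unsigned int bytes,
			  struct x86_emulate_ctxt *ctxt)
{
	unsigned long cur = 0;

	sample_copy(&cur, (void *)addr, bytes);
	if (cur == old)
		sample_copy((void *)addr, &new, bytes);
	return X86EMUL_CONTINUE;
}

static struct x86_emulate_ops sample_ops = {
	.read_std = sample_read,
	.write_std = sample_write,
	.read_emulated = sample_read,
	.write_emulated = sample_write,
	.cmpxchg_emulated = sample_cmpxchg,
	.cmpxchg8b_emulated = NULL,	/* equivalent to always unhandleable */
};
#endif /* X86_EMULATE_SAMPLE_OPS */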
135struct cpu_user_regs;
136
137struct x86_emulate_ctxt {
138 /* Register state before/after emulation. */
139 struct kvm_vcpu *vcpu;
140
141 /* Guest EFLAGS, and the linear faulting address (if emulating a page-faulting instruction). */
142 unsigned long eflags;
143 unsigned long cr2;
144
145 /* Emulated execution mode, represented by an X86EMUL_MODE value. */
146 int mode;
147
148 unsigned long cs_base;
149 unsigned long ds_base;
150 unsigned long es_base;
151 unsigned long ss_base;
152 unsigned long gs_base;
153 unsigned long fs_base;
154};
155
156/* Execution mode, passed to the emulator. */
157#define X86EMUL_MODE_REAL 0 /* Real mode. */
158#define X86EMUL_MODE_PROT16 2 /* 16-bit protected mode. */
159#define X86EMUL_MODE_PROT32 4 /* 32-bit protected mode. */
160#define X86EMUL_MODE_PROT64 8 /* 64-bit (long) mode. */
161
162/* Host execution mode. */
163#if defined(__i386__)
164#define X86EMUL_MODE_HOST X86EMUL_MODE_PROT32
165#elif defined(__x86_64__)
166#define X86EMUL_MODE_HOST X86EMUL_MODE_PROT64
167#endif
168
169/*
170 * x86_emulate_memop: Emulate an instruction that faulted attempting to
171 * read/write a 'special' memory area.
172 * Returns -1 on failure, 0 on success.
173 */
174int x86_emulate_memop(struct x86_emulate_ctxt *ctxt,
175 struct x86_emulate_ops *ops);
176
177/*
178 * Given the 'reg' portion of a ModRM byte, and a register block, return a
179 * pointer into the block that addresses the relevant register.
180 * @highbyte_regs specifies whether to decode AH,CH,DH,BH.
181 */
182void *decode_register(u8 modrm_reg, unsigned long *regs,
183 int highbyte_regs);
184
185#endif /* __X86_EMULATE_H__ */
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
new file mode 100644
index 000000000000..5bb2c3c585c1
--- /dev/null
+++ b/include/linux/kvm.h
@@ -0,0 +1,227 @@
1#ifndef __LINUX_KVM_H
2#define __LINUX_KVM_H
3
4/*
5 * Userspace interface for /dev/kvm - kernel based virtual machine
6 *
7 * Note: this interface is considered experimental and may change without
8 * notice.
9 */
10
11#include <asm/types.h>
12#include <linux/ioctl.h>
13
14/*
15 * Architectural interrupt line count, and the size of the bitmap needed
16 * to hold them.
17 */
18#define KVM_NR_INTERRUPTS 256
19#define KVM_IRQ_BITMAP_SIZE_BYTES ((KVM_NR_INTERRUPTS + 7) / 8)
20#define KVM_IRQ_BITMAP_SIZE(type) (KVM_IRQ_BITMAP_SIZE_BYTES / sizeof(type))
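/*
 * With 256 interrupt lines this works out to a 32-byte bitmap, i.e.
 * KVM_IRQ_BITMAP_SIZE(__u64) == 4, which is the element count used for
 * the interrupt_bitmap array in struct kvm_sregs below.
 */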
21
22
23/* for KVM_CREATE_MEMORY_REGION */
24struct kvm_memory_region {
25 __u32 slot;
26 __u32 flags;
27 __u64 guest_phys_addr;
28 __u64 memory_size; /* bytes */
29};
30
31/* for kvm_memory_region::flags */
32#define KVM_MEM_LOG_DIRTY_PAGES 1UL
33
34
35#define KVM_EXIT_TYPE_FAIL_ENTRY 1
36#define KVM_EXIT_TYPE_VM_EXIT 2
37
38enum kvm_exit_reason {
39 KVM_EXIT_UNKNOWN = 0,
40 KVM_EXIT_EXCEPTION = 1,
41 KVM_EXIT_IO = 2,
42 KVM_EXIT_CPUID = 3,
43 KVM_EXIT_DEBUG = 4,
44 KVM_EXIT_HLT = 5,
45 KVM_EXIT_MMIO = 6,
46};
47
48/* for KVM_RUN */
49struct kvm_run {
50 /* in */
51 __u32 vcpu;
52 __u32 emulated; /* skip current instruction */
53 __u32 mmio_completed; /* mmio request completed */
54
55 /* out */
56 __u32 exit_type;
57 __u32 exit_reason;
58 __u32 instruction_length;
59 union {
60 /* KVM_EXIT_UNKNOWN */
61 struct {
62 __u32 hardware_exit_reason;
63 } hw;
64 /* KVM_EXIT_EXCEPTION */
65 struct {
66 __u32 exception;
67 __u32 error_code;
68 } ex;
69 /* KVM_EXIT_IO */
70 struct {
71#define KVM_EXIT_IO_IN 0
72#define KVM_EXIT_IO_OUT 1
73 __u8 direction;
74 __u8 size; /* bytes */
75 __u8 string;
76 __u8 string_down;
77 __u8 rep;
78 __u8 pad;
79 __u16 port;
80 __u64 count;
81 union {
82 __u64 address;
83 __u32 value;
84 };
85 } io;
86 struct {
87 } debug;
88 /* KVM_EXIT_MMIO */
89 struct {
90 __u64 phys_addr;
91 __u8 data[8];
92 __u32 len;
93 __u8 is_write;
94 } mmio;
95 };
96};
97
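/*
 * Sketch of the intended mmio round trip (as suggested by the fields above,
 * not a normative description): on a KVM_EXIT_MMIO read (is_write == 0)
 * userspace emulates the device, fills in 'data', and re-enters KVM_RUN with
 * mmio_completed set so the kernel can finish the interrupted instruction.
 */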
98/* for KVM_GET_REGS and KVM_SET_REGS */
99struct kvm_regs {
100 /* in */
101 __u32 vcpu;
102 __u32 padding;
103
104 /* out (KVM_GET_REGS) / in (KVM_SET_REGS) */
105 __u64 rax, rbx, rcx, rdx;
106 __u64 rsi, rdi, rsp, rbp;
107 __u64 r8, r9, r10, r11;
108 __u64 r12, r13, r14, r15;
109 __u64 rip, rflags;
110};
111
112struct kvm_segment {
113 __u64 base;
114 __u32 limit;
115 __u16 selector;
116 __u8 type;
117 __u8 present, dpl, db, s, l, g, avl;
118 __u8 unusable;
119 __u8 padding;
120};
121
122struct kvm_dtable {
123 __u64 base;
124 __u16 limit;
125 __u16 padding[3];
126};
127
128/* for KVM_GET_SREGS and KVM_SET_SREGS */
129struct kvm_sregs {
130 /* in */
131 __u32 vcpu;
132 __u32 padding;
133
134 /* out (KVM_GET_SREGS) / in (KVM_SET_SREGS) */
135 struct kvm_segment cs, ds, es, fs, gs, ss;
136 struct kvm_segment tr, ldt;
137 struct kvm_dtable gdt, idt;
138 __u64 cr0, cr2, cr3, cr4, cr8;
139 __u64 efer;
140 __u64 apic_base;
141 __u64 interrupt_bitmap[KVM_IRQ_BITMAP_SIZE(__u64)];
142};
143
144struct kvm_msr_entry {
145 __u32 index;
146 __u32 reserved;
147 __u64 data;
148};
149
150/* for KVM_GET_MSRS and KVM_SET_MSRS */
151struct kvm_msrs {
152 __u32 vcpu;
153 __u32 nmsrs; /* number of msrs in entries */
154
155 struct kvm_msr_entry entries[0];
156};
157
158/* for KVM_GET_MSR_INDEX_LIST */
159struct kvm_msr_list {
160 __u32 nmsrs; /* number of msrs in entries */
161 __u32 indices[0];
162};
163
164/* for KVM_TRANSLATE */
165struct kvm_translation {
166 /* in */
167 __u64 linear_address;
168 __u32 vcpu;
169 __u32 padding;
170
171 /* out */
172 __u64 physical_address;
173 __u8 valid;
174 __u8 writeable;
175 __u8 usermode;
176};
177
178/* for KVM_INTERRUPT */
179struct kvm_interrupt {
180 /* in */
181 __u32 vcpu;
182 __u32 irq;
183};
184
185struct kvm_breakpoint {
186 __u32 enabled;
187 __u32 padding;
188 __u64 address;
189};
190
191/* for KVM_DEBUG_GUEST */
192struct kvm_debug_guest {
193 /* in */
194 __u32 vcpu;
195 __u32 enabled;
196 struct kvm_breakpoint breakpoints[4];
197 __u32 singlestep;
198};
199
200/* for KVM_GET_DIRTY_LOG */
201struct kvm_dirty_log {
202 __u32 slot;
203 __u32 padding;
204 union {
205 void __user *dirty_bitmap; /* one bit per page */
206 __u64 padding;
207 };
208};
209
210#define KVMIO 0xAE
211
212#define KVM_RUN _IOWR(KVMIO, 2, struct kvm_run)
213#define KVM_GET_REGS _IOWR(KVMIO, 3, struct kvm_regs)
214#define KVM_SET_REGS _IOW(KVMIO, 4, struct kvm_regs)
215#define KVM_GET_SREGS _IOWR(KVMIO, 5, struct kvm_sregs)
216#define KVM_SET_SREGS _IOW(KVMIO, 6, struct kvm_sregs)
217#define KVM_TRANSLATE _IOWR(KVMIO, 7, struct kvm_translation)
218#define KVM_INTERRUPT _IOW(KVMIO, 8, struct kvm_interrupt)
219#define KVM_DEBUG_GUEST _IOW(KVMIO, 9, struct kvm_debug_guest)
220#define KVM_SET_MEMORY_REGION _IOW(KVMIO, 10, struct kvm_memory_region)
221#define KVM_CREATE_VCPU _IOW(KVMIO, 11, int /* vcpu_slot */)
222#define KVM_GET_DIRTY_LOG _IOW(KVMIO, 12, struct kvm_dirty_log)
223#define KVM_GET_MSRS _IOWR(KVMIO, 13, struct kvm_msrs)
224#define KVM_SET_MSRS _IOWR(KVMIO, 14, struct kvm_msrs)
225#define KVM_GET_MSR_INDEX_LIST _IOWR(KVMIO, 15, struct kvm_msr_list)
226
227#endif
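
/*
 * Illustrative, hypothetical userspace sketch (not part of the header): give
 * the guest a slab of memory, create vcpu slot 0, and run it until something
 * other than an I/O exit comes back.  Most error handling, loading guest code
 * into the region, and actual device emulation are omitted.  Note that in
 * this interface every ioctl is issued on the /dev/kvm fd itself and the vcpu
 * slot travels inside the argument structures.
 */
#include <fcntl.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

int run_guest_once(void)
{
	int fd = open("/dev/kvm", O_RDWR);
	struct kvm_memory_region mem = {
		.slot = 0,
		.guest_phys_addr = 0,
		.memory_size = 1 << 20,		/* 1 MiB of guest RAM */
	};
	struct kvm_run run = { .vcpu = 0 };

	if (fd < 0)
		return -1;
	ioctl(fd, KVM_SET_MEMORY_REGION, &mem);
	ioctl(fd, KVM_CREATE_VCPU, 0);		/* vcpu slot 0 */

	for (;;) {
		if (ioctl(fd, KVM_RUN, &run) < 0)
			return -1;
		if (run.exit_type == KVM_EXIT_TYPE_VM_EXIT &&
		    run.exit_reason == KVM_EXIT_IO)
			continue;		/* device emulation would go here */
		return 0;			/* any other exit: hand back to caller */
	}
}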