author	Linus Torvalds <torvalds@linux-foundation.org>	2008-01-30 17:30:10 -0500
committer	Linus Torvalds <torvalds@linux-foundation.org>	2008-01-30 17:30:10 -0500
commit	2c57ee6f924c95e4dce61ed4776fb3f62e1b9f92 (patch)
tree	b9d92e52e8c0ee68a0f5012b470c6146a9f0b65a /arch
parent	f389e9fcecdec4c4cb890ad28ea30a87a579ec3e (diff)
parent	2f52d58c92d971bf421f461ad06eb93fb4f34981 (diff)
Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/avi/kvm
* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/avi/kvm: (249 commits)
  KVM: Move apic timer migration away from critical section
  KVM: Put kvm_para.h include outside __KERNEL__
  KVM: Fix unbounded preemption latency
  KVM: Initialize the mmu caches only after verifying cpu support
  KVM: MMU: Fix dirty page setting for pages removed from rmap
  KVM: Portability: Move kvm_fpu to asm-x86/kvm.h
  KVM: x86 emulator: Only allow VMCALL/VMMCALL trapped by #UD
  KVM: MMU: Merge shadow level check in FNAME(fetch)
  KVM: MMU: Move kvm_free_some_pages() into critical section
  KVM: MMU: Switch to mmu spinlock
  KVM: MMU: Avoid calling gfn_to_page() in mmu_set_spte()
  KVM: Add kvm_read_guest_atomic()
  KVM: MMU: Concurrent guest walkers
  KVM: Disable vapic support on Intel machines with FlexPriority
  KVM: Accelerated apic support
  KVM: local APIC TPR access reporting facility
  KVM: Print data for unimplemented wrmsr
  KVM: MMU: Add cache miss statistic
  KVM: MMU: Coalesce remote tlb flushes
  KVM: Expose ioapic to ia64 save/restore APIs
  ...
Diffstat (limited to 'arch')
-rw-r--r--	arch/x86/Kconfig	3
-rw-r--r--	arch/x86/Makefile	2
-rw-r--r--	arch/x86/kvm/Kconfig	57
-rw-r--r--	arch/x86/kvm/Makefile	14
-rw-r--r--	arch/x86/kvm/i8259.c	450
-rw-r--r--	arch/x86/kvm/irq.c	78
-rw-r--r--	arch/x86/kvm/irq.h	88
-rw-r--r--	arch/x86/kvm/kvm_svm.h	45
-rw-r--r--	arch/x86/kvm/lapic.c	1154
-rw-r--r--	arch/x86/kvm/lapic.h	50
-rw-r--r--	arch/x86/kvm/mmu.c	1885
-rw-r--r--	arch/x86/kvm/mmu.h	44
-rw-r--r--	arch/x86/kvm/paging_tmpl.h	484
-rw-r--r--	arch/x86/kvm/segment_descriptor.h	29
-rw-r--r--	arch/x86/kvm/svm.c	1731
-rw-r--r--	arch/x86/kvm/svm.h	325
-rw-r--r--	arch/x86/kvm/vmx.c	2679
-rw-r--r--	arch/x86/kvm/vmx.h	324
-rw-r--r--	arch/x86/kvm/x86.c	3287
-rw-r--r--	arch/x86/kvm/x86_emulate.c	1912
20 files changed, 14641 insertions, 0 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index fb3eea3e38ee..65b449134cf7 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -107,6 +107,7 @@ config ARCH_SUPPORTS_OPROFILE
 	bool
 	default y
 
+select HAVE_KVM
 
 config ZONE_DMA32
 	bool
@@ -1598,4 +1599,6 @@ source "security/Kconfig"
 
 source "crypto/Kconfig"
 
+source "arch/x86/kvm/Kconfig"
+
 source "lib/Kconfig"
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index b08f18261df6..da8f4129780b 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -7,6 +7,8 @@ else
         KBUILD_DEFCONFIG := $(ARCH)_defconfig
 endif
 
+core-$(CONFIG_KVM) += arch/x86/kvm/
+
 # BITS is used as extension for files which are available in a 32 bit
 # and a 64 bit version to simplify shared Makefiles.
 # e.g.: obj-y += foo_$(BITS).o
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
new file mode 100644
index 000000000000..c83e1c9b5129
--- /dev/null
+++ b/arch/x86/kvm/Kconfig
@@ -0,0 +1,57 @@
1#
2# KVM configuration
3#
4config HAVE_KVM
5 bool
6
7menuconfig VIRTUALIZATION
8 bool "Virtualization"
9 depends on HAVE_KVM || X86
10 default y
11 ---help---
12 Say Y here to get to see options for using your Linux host to run other
13 operating systems inside virtual machines (guests).
14 This option alone does not add any kernel code.
15
16 If you say N, all options in this submenu will be skipped and disabled.
17
18if VIRTUALIZATION
19
20config KVM
21 tristate "Kernel-based Virtual Machine (KVM) support"
22 depends on HAVE_KVM && EXPERIMENTAL
23 select PREEMPT_NOTIFIERS
24 select ANON_INODES
25 ---help---
26 Support hosting fully virtualized guest machines using hardware
27 virtualization extensions. You will need a fairly recent
28 processor equipped with virtualization extensions. You will also
29 need to select one or more of the processor modules below.
30
31 This module provides access to the hardware capabilities through
32 a character device node named /dev/kvm.
33
34 To compile this as a module, choose M here: the module
35 will be called kvm.
36
37 If unsure, say N.
38
39config KVM_INTEL
40 tristate "KVM for Intel processors support"
41 depends on KVM
42 ---help---
43 Provides support for KVM on Intel processors equipped with the VT
44 extensions.
45
46config KVM_AMD
47 tristate "KVM for AMD processors support"
48 depends on KVM
49 ---help---
50 Provides support for KVM on AMD processors equipped with the AMD-V
51 (SVM) extensions.
52
53# OK, it's a little counter-intuitive to do this, but it puts it neatly under
54# the virtualization menu.
55source drivers/lguest/Kconfig
56
57endif # VIRTUALIZATION
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
new file mode 100644
index 000000000000..ffdd0b310784
--- /dev/null
+++ b/arch/x86/kvm/Makefile
@@ -0,0 +1,14 @@
1#
2# Makefile for Kernel-based Virtual Machine module
3#
4
5common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o)
6
7EXTRA_CFLAGS += -Ivirt/kvm -Iarch/x86/kvm
8
9kvm-objs := $(common-objs) x86.o mmu.o x86_emulate.o i8259.o irq.o lapic.o
10obj-$(CONFIG_KVM) += kvm.o
11kvm-intel-objs = vmx.o
12obj-$(CONFIG_KVM_INTEL) += kvm-intel.o
13kvm-amd-objs = svm.o
14obj-$(CONFIG_KVM_AMD) += kvm-amd.o
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
new file mode 100644
index 000000000000..ab29cf2def47
--- /dev/null
+++ b/arch/x86/kvm/i8259.c
@@ -0,0 +1,450 @@
1/*
2 * 8259 interrupt controller emulation
3 *
4 * Copyright (c) 2003-2004 Fabrice Bellard
5 * Copyright (c) 2007 Intel Corporation
6 *
7 * Permission is hereby granted, free of charge, to any person obtaining a copy
8 * of this software and associated documentation files (the "Software"), to deal
9 * in the Software without restriction, including without limitation the rights
10 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11 * copies of the Software, and to permit persons to whom the Software is
12 * furnished to do so, subject to the following conditions:
13 *
14 * The above copyright notice and this permission notice shall be included in
15 * all copies or substantial portions of the Software.
16 *
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
23 * THE SOFTWARE.
24 * Authors:
25 * Yaozu (Eddie) Dong <Eddie.dong@intel.com>
26 * Port from Qemu.
27 */
28#include <linux/mm.h>
29#include "irq.h"
30
31#include <linux/kvm_host.h>
32
33/*
34 * set irq level. If an edge is detected, then the IRR is set to 1
35 */
36static inline void pic_set_irq1(struct kvm_kpic_state *s, int irq, int level)
37{
38 int mask;
39 mask = 1 << irq;
40 if (s->elcr & mask) /* level triggered */
41 if (level) {
42 s->irr |= mask;
43 s->last_irr |= mask;
44 } else {
45 s->irr &= ~mask;
46 s->last_irr &= ~mask;
47 }
48 else /* edge triggered */
49 if (level) {
50 if ((s->last_irr & mask) == 0)
51 s->irr |= mask;
52 s->last_irr |= mask;
53 } else
54 s->last_irr &= ~mask;
55}
56
57/*
58 * return the highest priority found in mask (highest = smallest
59 * number). Return 8 if no irq
60 */
61static inline int get_priority(struct kvm_kpic_state *s, int mask)
62{
63 int priority;
64 if (mask == 0)
65 return 8;
66 priority = 0;
67 while ((mask & (1 << ((priority + s->priority_add) & 7))) == 0)
68 priority++;
69 return priority;
70}
71
72/*
73 * return the pic wanted interrupt. return -1 if none
74 */
75static int pic_get_irq(struct kvm_kpic_state *s)
76{
77 int mask, cur_priority, priority;
78
79 mask = s->irr & ~s->imr;
80 priority = get_priority(s, mask);
81 if (priority == 8)
82 return -1;
83 /*
84 * compute current priority. If special fully nested mode on the
85 * master, the IRQ coming from the slave is not taken into account
86 * for the priority computation.
87 */
88 mask = s->isr;
89 if (s->special_fully_nested_mode && s == &s->pics_state->pics[0])
90 mask &= ~(1 << 2);
91 cur_priority = get_priority(s, mask);
92 if (priority < cur_priority)
93 /*
94 * higher priority found: an irq should be generated
95 */
96 return (priority + s->priority_add) & 7;
97 else
98 return -1;
99}
100
101/*
102 * raise irq to CPU if necessary. must be called every time the active
103 * irq may change
104 */
105static void pic_update_irq(struct kvm_pic *s)
106{
107 int irq2, irq;
108
109 irq2 = pic_get_irq(&s->pics[1]);
110 if (irq2 >= 0) {
111 /*
112 * if irq request by slave pic, signal master PIC
113 */
114 pic_set_irq1(&s->pics[0], 2, 1);
115 pic_set_irq1(&s->pics[0], 2, 0);
116 }
117 irq = pic_get_irq(&s->pics[0]);
118 if (irq >= 0)
119 s->irq_request(s->irq_request_opaque, 1);
120 else
121 s->irq_request(s->irq_request_opaque, 0);
122}
123
124void kvm_pic_update_irq(struct kvm_pic *s)
125{
126 pic_update_irq(s);
127}
128
129void kvm_pic_set_irq(void *opaque, int irq, int level)
130{
131 struct kvm_pic *s = opaque;
132
133 pic_set_irq1(&s->pics[irq >> 3], irq & 7, level);
134 pic_update_irq(s);
135}
136
137/*
138 * acknowledge interrupt 'irq'
139 */
140static inline void pic_intack(struct kvm_kpic_state *s, int irq)
141{
142 if (s->auto_eoi) {
143 if (s->rotate_on_auto_eoi)
144 s->priority_add = (irq + 1) & 7;
145 } else
146 s->isr |= (1 << irq);
147 /*
148 * We don't clear a level sensitive interrupt here
149 */
150 if (!(s->elcr & (1 << irq)))
151 s->irr &= ~(1 << irq);
152}
153
154int kvm_pic_read_irq(struct kvm_pic *s)
155{
156 int irq, irq2, intno;
157
158 irq = pic_get_irq(&s->pics[0]);
159 if (irq >= 0) {
160 pic_intack(&s->pics[0], irq);
161 if (irq == 2) {
162 irq2 = pic_get_irq(&s->pics[1]);
163 if (irq2 >= 0)
164 pic_intack(&s->pics[1], irq2);
165 else
166 /*
167 * spurious IRQ on slave controller
168 */
169 irq2 = 7;
170 intno = s->pics[1].irq_base + irq2;
171 irq = irq2 + 8;
172 } else
173 intno = s->pics[0].irq_base + irq;
174 } else {
175 /*
176 * spurious IRQ on host controller
177 */
178 irq = 7;
179 intno = s->pics[0].irq_base + irq;
180 }
181 pic_update_irq(s);
182
183 return intno;
184}
185
186void kvm_pic_reset(struct kvm_kpic_state *s)
187{
188 s->last_irr = 0;
189 s->irr = 0;
190 s->imr = 0;
191 s->isr = 0;
192 s->priority_add = 0;
193 s->irq_base = 0;
194 s->read_reg_select = 0;
195 s->poll = 0;
196 s->special_mask = 0;
197 s->init_state = 0;
198 s->auto_eoi = 0;
199 s->rotate_on_auto_eoi = 0;
200 s->special_fully_nested_mode = 0;
201 s->init4 = 0;
202}
203
204static void pic_ioport_write(void *opaque, u32 addr, u32 val)
205{
206 struct kvm_kpic_state *s = opaque;
207 int priority, cmd, irq;
208
209 addr &= 1;
210 if (addr == 0) {
211 if (val & 0x10) {
212 kvm_pic_reset(s); /* init */
213 /*
214 * deassert a pending interrupt
215 */
216 s->pics_state->irq_request(s->pics_state->
217 irq_request_opaque, 0);
218 s->init_state = 1;
219 s->init4 = val & 1;
220 if (val & 0x02)
221 printk(KERN_ERR "single mode not supported");
222 if (val & 0x08)
223 printk(KERN_ERR
224 "level sensitive irq not supported");
225 } else if (val & 0x08) {
226 if (val & 0x04)
227 s->poll = 1;
228 if (val & 0x02)
229 s->read_reg_select = val & 1;
230 if (val & 0x40)
231 s->special_mask = (val >> 5) & 1;
232 } else {
233 cmd = val >> 5;
234 switch (cmd) {
235 case 0:
236 case 4:
237 s->rotate_on_auto_eoi = cmd >> 2;
238 break;
239 case 1: /* end of interrupt */
240 case 5:
241 priority = get_priority(s, s->isr);
242 if (priority != 8) {
243 irq = (priority + s->priority_add) & 7;
244 s->isr &= ~(1 << irq);
245 if (cmd == 5)
246 s->priority_add = (irq + 1) & 7;
247 pic_update_irq(s->pics_state);
248 }
249 break;
250 case 3:
251 irq = val & 7;
252 s->isr &= ~(1 << irq);
253 pic_update_irq(s->pics_state);
254 break;
255 case 6:
256 s->priority_add = (val + 1) & 7;
257 pic_update_irq(s->pics_state);
258 break;
259 case 7:
260 irq = val & 7;
261 s->isr &= ~(1 << irq);
262 s->priority_add = (irq + 1) & 7;
263 pic_update_irq(s->pics_state);
264 break;
265 default:
266 break; /* no operation */
267 }
268 }
269 } else
270 switch (s->init_state) {
271 case 0: /* normal mode */
272 s->imr = val;
273 pic_update_irq(s->pics_state);
274 break;
275 case 1:
276 s->irq_base = val & 0xf8;
277 s->init_state = 2;
278 break;
279 case 2:
280 if (s->init4)
281 s->init_state = 3;
282 else
283 s->init_state = 0;
284 break;
285 case 3:
286 s->special_fully_nested_mode = (val >> 4) & 1;
287 s->auto_eoi = (val >> 1) & 1;
288 s->init_state = 0;
289 break;
290 }
291}
292
293static u32 pic_poll_read(struct kvm_kpic_state *s, u32 addr1)
294{
295 int ret;
296
297 ret = pic_get_irq(s);
298 if (ret >= 0) {
299 if (addr1 >> 7) {
300 s->pics_state->pics[0].isr &= ~(1 << 2);
301 s->pics_state->pics[0].irr &= ~(1 << 2);
302 }
303 s->irr &= ~(1 << ret);
304 s->isr &= ~(1 << ret);
305 if (addr1 >> 7 || ret != 2)
306 pic_update_irq(s->pics_state);
307 } else {
308 ret = 0x07;
309 pic_update_irq(s->pics_state);
310 }
311
312 return ret;
313}
314
315static u32 pic_ioport_read(void *opaque, u32 addr1)
316{
317 struct kvm_kpic_state *s = opaque;
318 unsigned int addr;
319 int ret;
320
321 addr = addr1;
322 addr &= 1;
323 if (s->poll) {
324 ret = pic_poll_read(s, addr1);
325 s->poll = 0;
326 } else
327 if (addr == 0)
328 if (s->read_reg_select)
329 ret = s->isr;
330 else
331 ret = s->irr;
332 else
333 ret = s->imr;
334 return ret;
335}
336
337static void elcr_ioport_write(void *opaque, u32 addr, u32 val)
338{
339 struct kvm_kpic_state *s = opaque;
340 s->elcr = val & s->elcr_mask;
341}
342
343static u32 elcr_ioport_read(void *opaque, u32 addr1)
344{
345 struct kvm_kpic_state *s = opaque;
346 return s->elcr;
347}
348
349static int picdev_in_range(struct kvm_io_device *this, gpa_t addr)
350{
351 switch (addr) {
352 case 0x20:
353 case 0x21:
354 case 0xa0:
355 case 0xa1:
356 case 0x4d0:
357 case 0x4d1:
358 return 1;
359 default:
360 return 0;
361 }
362}
363
364static void picdev_write(struct kvm_io_device *this,
365 gpa_t addr, int len, const void *val)
366{
367 struct kvm_pic *s = this->private;
368 unsigned char data = *(unsigned char *)val;
369
370 if (len != 1) {
371 if (printk_ratelimit())
372 printk(KERN_ERR "PIC: non byte write\n");
373 return;
374 }
375 switch (addr) {
376 case 0x20:
377 case 0x21:
378 case 0xa0:
379 case 0xa1:
380 pic_ioport_write(&s->pics[addr >> 7], addr, data);
381 break;
382 case 0x4d0:
383 case 0x4d1:
384 elcr_ioport_write(&s->pics[addr & 1], addr, data);
385 break;
386 }
387}
388
389static void picdev_read(struct kvm_io_device *this,
390 gpa_t addr, int len, void *val)
391{
392 struct kvm_pic *s = this->private;
393 unsigned char data = 0;
394
395 if (len != 1) {
396 if (printk_ratelimit())
397 printk(KERN_ERR "PIC: non byte read\n");
398 return;
399 }
400 switch (addr) {
401 case 0x20:
402 case 0x21:
403 case 0xa0:
404 case 0xa1:
405 data = pic_ioport_read(&s->pics[addr >> 7], addr);
406 break;
407 case 0x4d0:
408 case 0x4d1:
409 data = elcr_ioport_read(&s->pics[addr & 1], addr);
410 break;
411 }
412 *(unsigned char *)val = data;
413}
414
415/*
416 * callback when PIC0 irq status changed
417 */
418static void pic_irq_request(void *opaque, int level)
419{
420 struct kvm *kvm = opaque;
421 struct kvm_vcpu *vcpu = kvm->vcpus[0];
422
423 pic_irqchip(kvm)->output = level;
424 if (vcpu)
425 kvm_vcpu_kick(vcpu);
426}
427
428struct kvm_pic *kvm_create_pic(struct kvm *kvm)
429{
430 struct kvm_pic *s;
431 s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL);
432 if (!s)
433 return NULL;
434 s->pics[0].elcr_mask = 0xf8;
435 s->pics[1].elcr_mask = 0xde;
436 s->irq_request = pic_irq_request;
437 s->irq_request_opaque = kvm;
438 s->pics[0].pics_state = s;
439 s->pics[1].pics_state = s;
440
441 /*
442 * Initialize PIO device
443 */
444 s->dev.read = picdev_read;
445 s->dev.write = picdev_write;
446 s->dev.in_range = picdev_in_range;
447 s->dev.private = s;
448 kvm_io_bus_register_dev(&kvm->pio_bus, &s->dev);
449 return s;
450}
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
new file mode 100644
index 000000000000..e5714759e97f
--- /dev/null
+++ b/arch/x86/kvm/irq.c
@@ -0,0 +1,78 @@
1/*
2 * irq.c: API for in kernel interrupt controller
3 * Copyright (c) 2007, Intel Corporation.
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms and conditions of the GNU General Public License,
7 * version 2, as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 * more details.
13 *
14 * You should have received a copy of the GNU General Public License along with
15 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
16 * Place - Suite 330, Boston, MA 02111-1307 USA.
17 * Authors:
18 * Yaozu (Eddie) Dong <Eddie.dong@intel.com>
19 *
20 */
21
22#include <linux/module.h>
23#include <linux/kvm_host.h>
24
25#include "irq.h"
26
27/*
28 * check if there is pending interrupt without
29 * intack.
30 */
31int kvm_cpu_has_interrupt(struct kvm_vcpu *v)
32{
33 struct kvm_pic *s;
34
35 if (kvm_apic_has_interrupt(v) == -1) { /* LAPIC */
36 if (kvm_apic_accept_pic_intr(v)) {
37 s = pic_irqchip(v->kvm); /* PIC */
38 return s->output;
39 } else
40 return 0;
41 }
42 return 1;
43}
44EXPORT_SYMBOL_GPL(kvm_cpu_has_interrupt);
45
46/*
47 * Read pending interrupt vector and intack.
48 */
49int kvm_cpu_get_interrupt(struct kvm_vcpu *v)
50{
51 struct kvm_pic *s;
52 int vector;
53
54 vector = kvm_get_apic_interrupt(v); /* APIC */
55 if (vector == -1) {
56 if (kvm_apic_accept_pic_intr(v)) {
57 s = pic_irqchip(v->kvm);
58 s->output = 0; /* PIC */
59 vector = kvm_pic_read_irq(s);
60 }
61 }
62 return vector;
63}
64EXPORT_SYMBOL_GPL(kvm_cpu_get_interrupt);
65
66void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu)
67{
68 kvm_inject_apic_timer_irqs(vcpu);
69 /* TODO: PIT, RTC etc. */
70}
71EXPORT_SYMBOL_GPL(kvm_inject_pending_timer_irqs);
72
73void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec)
74{
75 kvm_apic_timer_intr_post(vcpu, vec);
76 /* TODO: PIT, RTC etc. */
77}
78EXPORT_SYMBOL_GPL(kvm_timer_intr_post);
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
new file mode 100644
index 000000000000..fa5ed5d59b5d
--- /dev/null
+++ b/arch/x86/kvm/irq.h
@@ -0,0 +1,88 @@
1/*
2 * irq.h: in kernel interrupt controller related definitions
3 * Copyright (c) 2007, Intel Corporation.
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms and conditions of the GNU General Public License,
7 * version 2, as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 * more details.
13 *
14 * You should have received a copy of the GNU General Public License along with
15 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
16 * Place - Suite 330, Boston, MA 02111-1307 USA.
17 * Authors:
18 * Yaozu (Eddie) Dong <Eddie.dong@intel.com>
19 *
20 */
21
22#ifndef __IRQ_H
23#define __IRQ_H
24
25#include <linux/mm_types.h>
26#include <linux/hrtimer.h>
27#include <linux/kvm_host.h>
28
29#include "iodev.h"
30#include "ioapic.h"
31#include "lapic.h"
32
33struct kvm;
34struct kvm_vcpu;
35
36typedef void irq_request_func(void *opaque, int level);
37
38struct kvm_kpic_state {
39 u8 last_irr; /* edge detection */
40 u8 irr; /* interrupt request register */
41 u8 imr; /* interrupt mask register */
42 u8 isr; /* interrupt service register */
43 u8 priority_add; /* highest irq priority */
44 u8 irq_base;
45 u8 read_reg_select;
46 u8 poll;
47 u8 special_mask;
48 u8 init_state;
49 u8 auto_eoi;
50 u8 rotate_on_auto_eoi;
51 u8 special_fully_nested_mode;
52 u8 init4; /* true if 4 byte init */
53 u8 elcr; /* PIIX edge/trigger selection */
54 u8 elcr_mask;
55 struct kvm_pic *pics_state;
56};
57
58struct kvm_pic {
59 struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */
60 irq_request_func *irq_request;
61 void *irq_request_opaque;
62 int output; /* intr from master PIC */
63 struct kvm_io_device dev;
64};
65
66struct kvm_pic *kvm_create_pic(struct kvm *kvm);
67void kvm_pic_set_irq(void *opaque, int irq, int level);
68int kvm_pic_read_irq(struct kvm_pic *s);
69void kvm_pic_update_irq(struct kvm_pic *s);
70
71static inline struct kvm_pic *pic_irqchip(struct kvm *kvm)
72{
73 return kvm->arch.vpic;
74}
75
76static inline int irqchip_in_kernel(struct kvm *kvm)
77{
78 return pic_irqchip(kvm) != NULL;
79}
80
81void kvm_pic_reset(struct kvm_kpic_state *s);
82
83void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec);
84void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu);
85void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu);
86void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu);
87
88#endif
diff --git a/arch/x86/kvm/kvm_svm.h b/arch/x86/kvm/kvm_svm.h
new file mode 100644
index 000000000000..ecdfe97e4635
--- /dev/null
+++ b/arch/x86/kvm/kvm_svm.h
@@ -0,0 +1,45 @@
1#ifndef __KVM_SVM_H
2#define __KVM_SVM_H
3
4#include <linux/kernel.h>
5#include <linux/types.h>
6#include <linux/list.h>
7#include <linux/kvm_host.h>
8#include <asm/msr.h>
9
10#include "svm.h"
11
12static const u32 host_save_user_msrs[] = {
13#ifdef CONFIG_X86_64
14 MSR_STAR, MSR_LSTAR, MSR_CSTAR, MSR_SYSCALL_MASK, MSR_KERNEL_GS_BASE,
15 MSR_FS_BASE,
16#endif
17 MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
18};
19
20#define NR_HOST_SAVE_USER_MSRS ARRAY_SIZE(host_save_user_msrs)
21#define NUM_DB_REGS 4
22
23struct kvm_vcpu;
24
25struct vcpu_svm {
26 struct kvm_vcpu vcpu;
27 struct vmcb *vmcb;
28 unsigned long vmcb_pa;
29 struct svm_cpu_data *svm_data;
30 uint64_t asid_generation;
31
32 unsigned long db_regs[NUM_DB_REGS];
33
34 u64 next_rip;
35
36 u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS];
37 u64 host_gs_base;
38 unsigned long host_cr2;
39 unsigned long host_db_regs[NUM_DB_REGS];
40 unsigned long host_dr6;
41 unsigned long host_dr7;
42};
43
44#endif
45
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
new file mode 100644
index 000000000000..2cbee9479ce4
--- /dev/null
+++ b/arch/x86/kvm/lapic.c
@@ -0,0 +1,1154 @@
1
2/*
3 * Local APIC virtualization
4 *
5 * Copyright (C) 2006 Qumranet, Inc.
6 * Copyright (C) 2007 Novell
7 * Copyright (C) 2007 Intel
8 *
9 * Authors:
10 * Dor Laor <dor.laor@qumranet.com>
11 * Gregory Haskins <ghaskins@novell.com>
12 * Yaozu (Eddie) Dong <eddie.dong@intel.com>
13 *
14 * Based on Xen 3.1 code, Copyright (c) 2004, Intel Corporation.
15 *
16 * This work is licensed under the terms of the GNU GPL, version 2. See
17 * the COPYING file in the top-level directory.
18 */
19
20#include <linux/kvm_host.h>
21#include <linux/kvm.h>
22#include <linux/mm.h>
23#include <linux/highmem.h>
24#include <linux/smp.h>
25#include <linux/hrtimer.h>
26#include <linux/io.h>
27#include <linux/module.h>
28#include <asm/processor.h>
29#include <asm/msr.h>
30#include <asm/page.h>
31#include <asm/current.h>
32#include <asm/apicdef.h>
33#include <asm/atomic.h>
34#include <asm/div64.h>
35#include "irq.h"
36
37#define PRId64 "d"
38#define PRIx64 "llx"
39#define PRIu64 "u"
40#define PRIo64 "o"
41
42#define APIC_BUS_CYCLE_NS 1
43
44/* #define apic_debug(fmt,arg...) printk(KERN_WARNING fmt,##arg) */
45#define apic_debug(fmt, arg...)
46
47#define APIC_LVT_NUM 6
48/* 14 is the version for Xeon and Pentium 8.4.8*/
49#define APIC_VERSION (0x14UL | ((APIC_LVT_NUM - 1) << 16))
50#define LAPIC_MMIO_LENGTH (1 << 12)
51/* followed define is not in apicdef.h */
52#define APIC_SHORT_MASK 0xc0000
53#define APIC_DEST_NOSHORT 0x0
54#define APIC_DEST_MASK 0x800
55#define MAX_APIC_VECTOR 256
56
57#define VEC_POS(v) ((v) & (32 - 1))
58#define REG_POS(v) (((v) >> 5) << 4)
59
60static inline u32 apic_get_reg(struct kvm_lapic *apic, int reg_off)
61{
62 return *((u32 *) (apic->regs + reg_off));
63}
64
65static inline void apic_set_reg(struct kvm_lapic *apic, int reg_off, u32 val)
66{
67 *((u32 *) (apic->regs + reg_off)) = val;
68}
69
70static inline int apic_test_and_set_vector(int vec, void *bitmap)
71{
72 return test_and_set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
73}
74
75static inline int apic_test_and_clear_vector(int vec, void *bitmap)
76{
77 return test_and_clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
78}
79
80static inline void apic_set_vector(int vec, void *bitmap)
81{
82 set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
83}
84
85static inline void apic_clear_vector(int vec, void *bitmap)
86{
87 clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
88}
89
90static inline int apic_hw_enabled(struct kvm_lapic *apic)
91{
92 return (apic)->vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE;
93}
94
95static inline int apic_sw_enabled(struct kvm_lapic *apic)
96{
97 return apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_APIC_ENABLED;
98}
99
100static inline int apic_enabled(struct kvm_lapic *apic)
101{
102 return apic_sw_enabled(apic) && apic_hw_enabled(apic);
103}
104
105#define LVT_MASK \
106 (APIC_LVT_MASKED | APIC_SEND_PENDING | APIC_VECTOR_MASK)
107
108#define LINT_MASK \
109 (LVT_MASK | APIC_MODE_MASK | APIC_INPUT_POLARITY | \
110 APIC_LVT_REMOTE_IRR | APIC_LVT_LEVEL_TRIGGER)
111
112static inline int kvm_apic_id(struct kvm_lapic *apic)
113{
114 return (apic_get_reg(apic, APIC_ID) >> 24) & 0xff;
115}
116
117static inline int apic_lvt_enabled(struct kvm_lapic *apic, int lvt_type)
118{
119 return !(apic_get_reg(apic, lvt_type) & APIC_LVT_MASKED);
120}
121
122static inline int apic_lvt_vector(struct kvm_lapic *apic, int lvt_type)
123{
124 return apic_get_reg(apic, lvt_type) & APIC_VECTOR_MASK;
125}
126
127static inline int apic_lvtt_period(struct kvm_lapic *apic)
128{
129 return apic_get_reg(apic, APIC_LVTT) & APIC_LVT_TIMER_PERIODIC;
130}
131
132static unsigned int apic_lvt_mask[APIC_LVT_NUM] = {
133 LVT_MASK | APIC_LVT_TIMER_PERIODIC, /* LVTT */
134 LVT_MASK | APIC_MODE_MASK, /* LVTTHMR */
135 LVT_MASK | APIC_MODE_MASK, /* LVTPC */
136 LINT_MASK, LINT_MASK, /* LVT0-1 */
137 LVT_MASK /* LVTERR */
138};
139
140static int find_highest_vector(void *bitmap)
141{
142 u32 *word = bitmap;
143 int word_offset = MAX_APIC_VECTOR >> 5;
144
145 while ((word_offset != 0) && (word[(--word_offset) << 2] == 0))
146 continue;
147
148 if (likely(!word_offset && !word[0]))
149 return -1;
150 else
151 return fls(word[word_offset << 2]) - 1 + (word_offset << 5);
152}
153
154static inline int apic_test_and_set_irr(int vec, struct kvm_lapic *apic)
155{
156 return apic_test_and_set_vector(vec, apic->regs + APIC_IRR);
157}
158
159static inline void apic_clear_irr(int vec, struct kvm_lapic *apic)
160{
161 apic_clear_vector(vec, apic->regs + APIC_IRR);
162}
163
164static inline int apic_find_highest_irr(struct kvm_lapic *apic)
165{
166 int result;
167
168 result = find_highest_vector(apic->regs + APIC_IRR);
169 ASSERT(result == -1 || result >= 16);
170
171 return result;
172}
173
174int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu)
175{
176 struct kvm_lapic *apic = vcpu->arch.apic;
177 int highest_irr;
178
179 if (!apic)
180 return 0;
181 highest_irr = apic_find_highest_irr(apic);
182
183 return highest_irr;
184}
185EXPORT_SYMBOL_GPL(kvm_lapic_find_highest_irr);
186
187int kvm_apic_set_irq(struct kvm_vcpu *vcpu, u8 vec, u8 trig)
188{
189 struct kvm_lapic *apic = vcpu->arch.apic;
190
191 if (!apic_test_and_set_irr(vec, apic)) {
192 /* a new pending irq is set in IRR */
193 if (trig)
194 apic_set_vector(vec, apic->regs + APIC_TMR);
195 else
196 apic_clear_vector(vec, apic->regs + APIC_TMR);
197 kvm_vcpu_kick(apic->vcpu);
198 return 1;
199 }
200 return 0;
201}
202
203static inline int apic_find_highest_isr(struct kvm_lapic *apic)
204{
205 int result;
206
207 result = find_highest_vector(apic->regs + APIC_ISR);
208 ASSERT(result == -1 || result >= 16);
209
210 return result;
211}
212
213static void apic_update_ppr(struct kvm_lapic *apic)
214{
215 u32 tpr, isrv, ppr;
216 int isr;
217
218 tpr = apic_get_reg(apic, APIC_TASKPRI);
219 isr = apic_find_highest_isr(apic);
220 isrv = (isr != -1) ? isr : 0;
221
222 if ((tpr & 0xf0) >= (isrv & 0xf0))
223 ppr = tpr & 0xff;
224 else
225 ppr = isrv & 0xf0;
226
227 apic_debug("vlapic %p, ppr 0x%x, isr 0x%x, isrv 0x%x",
228 apic, ppr, isr, isrv);
229
230 apic_set_reg(apic, APIC_PROCPRI, ppr);
231}
232
233static void apic_set_tpr(struct kvm_lapic *apic, u32 tpr)
234{
235 apic_set_reg(apic, APIC_TASKPRI, tpr);
236 apic_update_ppr(apic);
237}
238
239int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest)
240{
241 return kvm_apic_id(apic) == dest;
242}
243
244int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda)
245{
246 int result = 0;
247 u8 logical_id;
248
249 logical_id = GET_APIC_LOGICAL_ID(apic_get_reg(apic, APIC_LDR));
250
251 switch (apic_get_reg(apic, APIC_DFR)) {
252 case APIC_DFR_FLAT:
253 if (logical_id & mda)
254 result = 1;
255 break;
256 case APIC_DFR_CLUSTER:
257 if (((logical_id >> 4) == (mda >> 0x4))
258 && (logical_id & mda & 0xf))
259 result = 1;
260 break;
261 default:
262 printk(KERN_WARNING "Bad DFR vcpu %d: %08x\n",
263 apic->vcpu->vcpu_id, apic_get_reg(apic, APIC_DFR));
264 break;
265 }
266
267 return result;
268}
269
270static int apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
271 int short_hand, int dest, int dest_mode)
272{
273 int result = 0;
274 struct kvm_lapic *target = vcpu->arch.apic;
275
276 apic_debug("target %p, source %p, dest 0x%x, "
277 "dest_mode 0x%x, short_hand 0x%x",
278 target, source, dest, dest_mode, short_hand);
279
280 ASSERT(!target);
281 switch (short_hand) {
282 case APIC_DEST_NOSHORT:
283 if (dest_mode == 0) {
284 /* Physical mode. */
285 if ((dest == 0xFF) || (dest == kvm_apic_id(target)))
286 result = 1;
287 } else
288 /* Logical mode. */
289 result = kvm_apic_match_logical_addr(target, dest);
290 break;
291 case APIC_DEST_SELF:
292 if (target == source)
293 result = 1;
294 break;
295 case APIC_DEST_ALLINC:
296 result = 1;
297 break;
298 case APIC_DEST_ALLBUT:
299 if (target != source)
300 result = 1;
301 break;
302 default:
303 printk(KERN_WARNING "Bad dest shorthand value %x\n",
304 short_hand);
305 break;
306 }
307
308 return result;
309}
310
311/*
312 * Add a pending IRQ into lapic.
313 * Return 1 if successfully added and 0 if discarded.
314 */
315static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
316 int vector, int level, int trig_mode)
317{
318 int orig_irr, result = 0;
319 struct kvm_vcpu *vcpu = apic->vcpu;
320
321 switch (delivery_mode) {
322 case APIC_DM_FIXED:
323 case APIC_DM_LOWEST:
324 /* FIXME add logic for vcpu on reset */
325 if (unlikely(!apic_enabled(apic)))
326 break;
327
328 orig_irr = apic_test_and_set_irr(vector, apic);
329 if (orig_irr && trig_mode) {
330 apic_debug("level trig mode repeatedly for vector %d",
331 vector);
332 break;
333 }
334
335 if (trig_mode) {
336 apic_debug("level trig mode for vector %d", vector);
337 apic_set_vector(vector, apic->regs + APIC_TMR);
338 } else
339 apic_clear_vector(vector, apic->regs + APIC_TMR);
340
341 if (vcpu->arch.mp_state == VCPU_MP_STATE_RUNNABLE)
342 kvm_vcpu_kick(vcpu);
343 else if (vcpu->arch.mp_state == VCPU_MP_STATE_HALTED) {
344 vcpu->arch.mp_state = VCPU_MP_STATE_RUNNABLE;
345 if (waitqueue_active(&vcpu->wq))
346 wake_up_interruptible(&vcpu->wq);
347 }
348
349 result = (orig_irr == 0);
350 break;
351
352 case APIC_DM_REMRD:
353 printk(KERN_DEBUG "Ignoring delivery mode 3\n");
354 break;
355
356 case APIC_DM_SMI:
357 printk(KERN_DEBUG "Ignoring guest SMI\n");
358 break;
359 case APIC_DM_NMI:
360 printk(KERN_DEBUG "Ignoring guest NMI\n");
361 break;
362
363 case APIC_DM_INIT:
364 if (level) {
365 if (vcpu->arch.mp_state == VCPU_MP_STATE_RUNNABLE)
366 printk(KERN_DEBUG
367 "INIT on a runnable vcpu %d\n",
368 vcpu->vcpu_id);
369 vcpu->arch.mp_state = VCPU_MP_STATE_INIT_RECEIVED;
370 kvm_vcpu_kick(vcpu);
371 } else {
372 printk(KERN_DEBUG
373 "Ignoring de-assert INIT to vcpu %d\n",
374 vcpu->vcpu_id);
375 }
376
377 break;
378
379 case APIC_DM_STARTUP:
380 printk(KERN_DEBUG "SIPI to vcpu %d vector 0x%02x\n",
381 vcpu->vcpu_id, vector);
382 if (vcpu->arch.mp_state == VCPU_MP_STATE_INIT_RECEIVED) {
383 vcpu->arch.sipi_vector = vector;
384 vcpu->arch.mp_state = VCPU_MP_STATE_SIPI_RECEIVED;
385 if (waitqueue_active(&vcpu->wq))
386 wake_up_interruptible(&vcpu->wq);
387 }
388 break;
389
390 default:
391 printk(KERN_ERR "TODO: unsupported delivery mode %x\n",
392 delivery_mode);
393 break;
394 }
395 return result;
396}
397
398static struct kvm_lapic *kvm_apic_round_robin(struct kvm *kvm, u8 vector,
399 unsigned long bitmap)
400{
401 int last;
402 int next;
403 struct kvm_lapic *apic = NULL;
404
405 last = kvm->arch.round_robin_prev_vcpu;
406 next = last;
407
408 do {
409 if (++next == KVM_MAX_VCPUS)
410 next = 0;
411 if (kvm->vcpus[next] == NULL || !test_bit(next, &bitmap))
412 continue;
413 apic = kvm->vcpus[next]->arch.apic;
414 if (apic && apic_enabled(apic))
415 break;
416 apic = NULL;
417 } while (next != last);
418 kvm->arch.round_robin_prev_vcpu = next;
419
420 if (!apic)
421 printk(KERN_DEBUG "vcpu not ready for apic_round_robin\n");
422
423 return apic;
424}
425
426struct kvm_vcpu *kvm_get_lowest_prio_vcpu(struct kvm *kvm, u8 vector,
427 unsigned long bitmap)
428{
429 struct kvm_lapic *apic;
430
431 apic = kvm_apic_round_robin(kvm, vector, bitmap);
432 if (apic)
433 return apic->vcpu;
434 return NULL;
435}
436
437static void apic_set_eoi(struct kvm_lapic *apic)
438{
439 int vector = apic_find_highest_isr(apic);
440
441 /*
442 * Not every write EOI will has corresponding ISR,
443 * one example is when Kernel check timer on setup_IO_APIC
444 */
445 if (vector == -1)
446 return;
447
448 apic_clear_vector(vector, apic->regs + APIC_ISR);
449 apic_update_ppr(apic);
450
451 if (apic_test_and_clear_vector(vector, apic->regs + APIC_TMR))
452 kvm_ioapic_update_eoi(apic->vcpu->kvm, vector);
453}
454
455static void apic_send_ipi(struct kvm_lapic *apic)
456{
457 u32 icr_low = apic_get_reg(apic, APIC_ICR);
458 u32 icr_high = apic_get_reg(apic, APIC_ICR2);
459
460 unsigned int dest = GET_APIC_DEST_FIELD(icr_high);
461 unsigned int short_hand = icr_low & APIC_SHORT_MASK;
462 unsigned int trig_mode = icr_low & APIC_INT_LEVELTRIG;
463 unsigned int level = icr_low & APIC_INT_ASSERT;
464 unsigned int dest_mode = icr_low & APIC_DEST_MASK;
465 unsigned int delivery_mode = icr_low & APIC_MODE_MASK;
466 unsigned int vector = icr_low & APIC_VECTOR_MASK;
467
468 struct kvm_vcpu *target;
469 struct kvm_vcpu *vcpu;
470 unsigned long lpr_map = 0;
471 int i;
472
473 apic_debug("icr_high 0x%x, icr_low 0x%x, "
474 "short_hand 0x%x, dest 0x%x, trig_mode 0x%x, level 0x%x, "
475 "dest_mode 0x%x, delivery_mode 0x%x, vector 0x%x\n",
476 icr_high, icr_low, short_hand, dest,
477 trig_mode, level, dest_mode, delivery_mode, vector);
478
479 for (i = 0; i < KVM_MAX_VCPUS; i++) {
480 vcpu = apic->vcpu->kvm->vcpus[i];
481 if (!vcpu)
482 continue;
483
484 if (vcpu->arch.apic &&
485 apic_match_dest(vcpu, apic, short_hand, dest, dest_mode)) {
486 if (delivery_mode == APIC_DM_LOWEST)
487 set_bit(vcpu->vcpu_id, &lpr_map);
488 else
489 __apic_accept_irq(vcpu->arch.apic, delivery_mode,
490 vector, level, trig_mode);
491 }
492 }
493
494 if (delivery_mode == APIC_DM_LOWEST) {
495 target = kvm_get_lowest_prio_vcpu(vcpu->kvm, vector, lpr_map);
496 if (target != NULL)
497 __apic_accept_irq(target->arch.apic, delivery_mode,
498 vector, level, trig_mode);
499 }
500}
501
502static u32 apic_get_tmcct(struct kvm_lapic *apic)
503{
504 u64 counter_passed;
505 ktime_t passed, now;
506 u32 tmcct;
507
508 ASSERT(apic != NULL);
509
510 now = apic->timer.dev.base->get_time();
511 tmcct = apic_get_reg(apic, APIC_TMICT);
512
513 /* if initial count is 0, current count should also be 0 */
514 if (tmcct == 0)
515 return 0;
516
517 if (unlikely(ktime_to_ns(now) <=
518 ktime_to_ns(apic->timer.last_update))) {
519 /* Wrap around */
520 passed = ktime_add(( {
521 (ktime_t) {
522 .tv64 = KTIME_MAX -
523 (apic->timer.last_update).tv64}; }
524 ), now);
525 apic_debug("time elapsed\n");
526 } else
527 passed = ktime_sub(now, apic->timer.last_update);
528
529 counter_passed = div64_64(ktime_to_ns(passed),
530 (APIC_BUS_CYCLE_NS * apic->timer.divide_count));
531
532 if (counter_passed > tmcct) {
533 if (unlikely(!apic_lvtt_period(apic))) {
534 /* one-shot timers stick at 0 until reset */
535 tmcct = 0;
536 } else {
537 /*
538 * periodic timers reset to APIC_TMICT when they
539 * hit 0. The while loop simulates this happening N
540 * times. (counter_passed %= tmcct) would also work,
541 * but might be slower or not work on 32-bit??
542 */
543 while (counter_passed > tmcct)
544 counter_passed -= tmcct;
545 tmcct -= counter_passed;
546 }
547 } else {
548 tmcct -= counter_passed;
549 }
550
551 return tmcct;
552}
553
554static void __report_tpr_access(struct kvm_lapic *apic, bool write)
555{
556 struct kvm_vcpu *vcpu = apic->vcpu;
557 struct kvm_run *run = vcpu->run;
558
559 set_bit(KVM_REQ_REPORT_TPR_ACCESS, &vcpu->requests);
560 kvm_x86_ops->cache_regs(vcpu);
561 run->tpr_access.rip = vcpu->arch.rip;
562 run->tpr_access.is_write = write;
563}
564
565static inline void report_tpr_access(struct kvm_lapic *apic, bool write)
566{
567 if (apic->vcpu->arch.tpr_access_reporting)
568 __report_tpr_access(apic, write);
569}
570
571static u32 __apic_read(struct kvm_lapic *apic, unsigned int offset)
572{
573 u32 val = 0;
574
575 if (offset >= LAPIC_MMIO_LENGTH)
576 return 0;
577
578 switch (offset) {
579 case APIC_ARBPRI:
580 printk(KERN_WARNING "Access APIC ARBPRI register "
581 "which is for P6\n");
582 break;
583
584 case APIC_TMCCT: /* Timer CCR */
585 val = apic_get_tmcct(apic);
586 break;
587
588 case APIC_TASKPRI:
589 report_tpr_access(apic, false);
590 /* fall thru */
591 default:
592 apic_update_ppr(apic);
593 val = apic_get_reg(apic, offset);
594 break;
595 }
596
597 return val;
598}
599
600static void apic_mmio_read(struct kvm_io_device *this,
601 gpa_t address, int len, void *data)
602{
603 struct kvm_lapic *apic = (struct kvm_lapic *)this->private;
604 unsigned int offset = address - apic->base_address;
605 unsigned char alignment = offset & 0xf;
606 u32 result;
607
608 if ((alignment + len) > 4) {
609 printk(KERN_ERR "KVM_APIC_READ: alignment error %lx %d",
610 (unsigned long)address, len);
611 return;
612 }
613 result = __apic_read(apic, offset & ~0xf);
614
615 switch (len) {
616 case 1:
617 case 2:
618 case 4:
619 memcpy(data, (char *)&result + alignment, len);
620 break;
621 default:
622 printk(KERN_ERR "Local APIC read with len = %x, "
623 "should be 1,2, or 4 instead\n", len);
624 break;
625 }
626}
627
628static void update_divide_count(struct kvm_lapic *apic)
629{
630 u32 tmp1, tmp2, tdcr;
631
632 tdcr = apic_get_reg(apic, APIC_TDCR);
633 tmp1 = tdcr & 0xf;
634 tmp2 = ((tmp1 & 0x3) | ((tmp1 & 0x8) >> 1)) + 1;
635 apic->timer.divide_count = 0x1 << (tmp2 & 0x7);
636
637 apic_debug("timer divide count is 0x%x\n",
638 apic->timer.divide_count);
639}
640
641static void start_apic_timer(struct kvm_lapic *apic)
642{
643 ktime_t now = apic->timer.dev.base->get_time();
644
645 apic->timer.last_update = now;
646
647 apic->timer.period = apic_get_reg(apic, APIC_TMICT) *
648 APIC_BUS_CYCLE_NS * apic->timer.divide_count;
649 atomic_set(&apic->timer.pending, 0);
650 hrtimer_start(&apic->timer.dev,
651 ktime_add_ns(now, apic->timer.period),
652 HRTIMER_MODE_ABS);
653
654 apic_debug("%s: bus cycle is %" PRId64 "ns, now 0x%016"
655 PRIx64 ", "
656 "timer initial count 0x%x, period %lldns, "
657 "expire @ 0x%016" PRIx64 ".\n", __FUNCTION__,
658 APIC_BUS_CYCLE_NS, ktime_to_ns(now),
659 apic_get_reg(apic, APIC_TMICT),
660 apic->timer.period,
661 ktime_to_ns(ktime_add_ns(now,
662 apic->timer.period)));
663}
664
665static void apic_mmio_write(struct kvm_io_device *this,
666 gpa_t address, int len, const void *data)
667{
668 struct kvm_lapic *apic = (struct kvm_lapic *)this->private;
669 unsigned int offset = address - apic->base_address;
670 unsigned char alignment = offset & 0xf;
671 u32 val;
672
673 /*
674 * APIC register must be aligned on 128-bits boundary.
675 * 32/64/128 bits registers must be accessed thru 32 bits.
676 * Refer SDM 8.4.1
677 */
678 if (len != 4 || alignment) {
679 if (printk_ratelimit())
680 printk(KERN_ERR "apic write: bad size=%d %lx\n",
681 len, (long)address);
682 return;
683 }
684
685 val = *(u32 *) data;
686
687 /* too common printing */
688 if (offset != APIC_EOI)
689 apic_debug("%s: offset 0x%x with length 0x%x, and value is "
690 "0x%x\n", __FUNCTION__, offset, len, val);
691
692 offset &= 0xff0;
693
694 switch (offset) {
695 case APIC_ID: /* Local APIC ID */
696 apic_set_reg(apic, APIC_ID, val);
697 break;
698
699 case APIC_TASKPRI:
700 report_tpr_access(apic, true);
701 apic_set_tpr(apic, val & 0xff);
702 break;
703
704 case APIC_EOI:
705 apic_set_eoi(apic);
706 break;
707
708 case APIC_LDR:
709 apic_set_reg(apic, APIC_LDR, val & APIC_LDR_MASK);
710 break;
711
712 case APIC_DFR:
713 apic_set_reg(apic, APIC_DFR, val | 0x0FFFFFFF);
714 break;
715
716 case APIC_SPIV:
717 apic_set_reg(apic, APIC_SPIV, val & 0x3ff);
718 if (!(val & APIC_SPIV_APIC_ENABLED)) {
719 int i;
720 u32 lvt_val;
721
722 for (i = 0; i < APIC_LVT_NUM; i++) {
723 lvt_val = apic_get_reg(apic,
724 APIC_LVTT + 0x10 * i);
725 apic_set_reg(apic, APIC_LVTT + 0x10 * i,
726 lvt_val | APIC_LVT_MASKED);
727 }
728 atomic_set(&apic->timer.pending, 0);
729
730 }
731 break;
732
733 case APIC_ICR:
734 /* No delay here, so we always clear the pending bit */
735 apic_set_reg(apic, APIC_ICR, val & ~(1 << 12));
736 apic_send_ipi(apic);
737 break;
738
739 case APIC_ICR2:
740 apic_set_reg(apic, APIC_ICR2, val & 0xff000000);
741 break;
742
743 case APIC_LVTT:
744 case APIC_LVTTHMR:
745 case APIC_LVTPC:
746 case APIC_LVT0:
747 case APIC_LVT1:
748 case APIC_LVTERR:
749 /* TODO: Check vector */
750 if (!apic_sw_enabled(apic))
751 val |= APIC_LVT_MASKED;
752
753 val &= apic_lvt_mask[(offset - APIC_LVTT) >> 4];
754 apic_set_reg(apic, offset, val);
755
756 break;
757
758 case APIC_TMICT:
759 hrtimer_cancel(&apic->timer.dev);
760 apic_set_reg(apic, APIC_TMICT, val);
761 start_apic_timer(apic);
762 return;
763
764 case APIC_TDCR:
765 if (val & 4)
766 printk(KERN_ERR "KVM_WRITE:TDCR %x\n", val);
767 apic_set_reg(apic, APIC_TDCR, val);
768 update_divide_count(apic);
769 break;
770
771 default:
772 apic_debug("Local APIC Write to read-only register %x\n",
773 offset);
774 break;
775 }
776
777}
778
779static int apic_mmio_range(struct kvm_io_device *this, gpa_t addr)
780{
781 struct kvm_lapic *apic = (struct kvm_lapic *)this->private;
782 int ret = 0;
783
784
785 if (apic_hw_enabled(apic) &&
786 (addr >= apic->base_address) &&
787 (addr < (apic->base_address + LAPIC_MMIO_LENGTH)))
788 ret = 1;
789
790 return ret;
791}
792
793void kvm_free_lapic(struct kvm_vcpu *vcpu)
794{
795 if (!vcpu->arch.apic)
796 return;
797
798 hrtimer_cancel(&vcpu->arch.apic->timer.dev);
799
800 if (vcpu->arch.apic->regs_page)
801 __free_page(vcpu->arch.apic->regs_page);
802
803 kfree(vcpu->arch.apic);
804}
805
806/*
807 *----------------------------------------------------------------------
808 * LAPIC interface
809 *----------------------------------------------------------------------
810 */
811
812void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8)
813{
814 struct kvm_lapic *apic = vcpu->arch.apic;
815
816 if (!apic)
817 return;
818 apic_set_tpr(apic, ((cr8 & 0x0f) << 4)
819 | (apic_get_reg(apic, APIC_TASKPRI) & 4));
820}
821
822u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu)
823{
824 struct kvm_lapic *apic = vcpu->arch.apic;
825 u64 tpr;
826
827 if (!apic)
828 return 0;
829 tpr = (u64) apic_get_reg(apic, APIC_TASKPRI);
830
831 return (tpr & 0xf0) >> 4;
832}
833EXPORT_SYMBOL_GPL(kvm_lapic_get_cr8);
834
835void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
836{
837 struct kvm_lapic *apic = vcpu->arch.apic;
838
839 if (!apic) {
840 value |= MSR_IA32_APICBASE_BSP;
841 vcpu->arch.apic_base = value;
842 return;
843 }
844 if (apic->vcpu->vcpu_id)
845 value &= ~MSR_IA32_APICBASE_BSP;
846
847 vcpu->arch.apic_base = value;
848 apic->base_address = apic->vcpu->arch.apic_base &
849 MSR_IA32_APICBASE_BASE;
850
851 /* with FSB delivery interrupt, we can restart APIC functionality */
852 apic_debug("apic base msr is 0x%016" PRIx64 ", and base address is "
853 "0x%lx.\n", apic->vcpu->arch.apic_base, apic->base_address);
854
855}
856
857u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu)
858{
859 return vcpu->arch.apic_base;
860}
861EXPORT_SYMBOL_GPL(kvm_lapic_get_base);
862
863void kvm_lapic_reset(struct kvm_vcpu *vcpu)
864{
865 struct kvm_lapic *apic;
866 int i;
867
868 apic_debug("%s\n", __FUNCTION__);
869
870 ASSERT(vcpu);
871 apic = vcpu->arch.apic;
872 ASSERT(apic != NULL);
873
874 /* Stop the timer in case it's a reset to an active apic */
875 hrtimer_cancel(&apic->timer.dev);
876
877 apic_set_reg(apic, APIC_ID, vcpu->vcpu_id << 24);
878 apic_set_reg(apic, APIC_LVR, APIC_VERSION);
879
880 for (i = 0; i < APIC_LVT_NUM; i++)
881 apic_set_reg(apic, APIC_LVTT + 0x10 * i, APIC_LVT_MASKED);
882 apic_set_reg(apic, APIC_LVT0,
883 SET_APIC_DELIVERY_MODE(0, APIC_MODE_EXTINT));
884
885 apic_set_reg(apic, APIC_DFR, 0xffffffffU);
886 apic_set_reg(apic, APIC_SPIV, 0xff);
887 apic_set_reg(apic, APIC_TASKPRI, 0);
888 apic_set_reg(apic, APIC_LDR, 0);
889 apic_set_reg(apic, APIC_ESR, 0);
890 apic_set_reg(apic, APIC_ICR, 0);
891 apic_set_reg(apic, APIC_ICR2, 0);
892 apic_set_reg(apic, APIC_TDCR, 0);
893 apic_set_reg(apic, APIC_TMICT, 0);
894 for (i = 0; i < 8; i++) {
895 apic_set_reg(apic, APIC_IRR + 0x10 * i, 0);
896 apic_set_reg(apic, APIC_ISR + 0x10 * i, 0);
897 apic_set_reg(apic, APIC_TMR + 0x10 * i, 0);
898 }
899 update_divide_count(apic);
900 atomic_set(&apic->timer.pending, 0);
901 if (vcpu->vcpu_id == 0)
902 vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP;
903 apic_update_ppr(apic);
904
905 apic_debug(KERN_INFO "%s: vcpu=%p, id=%d, base_msr="
906 "0x%016" PRIx64 ", base_address=0x%0lx.\n", __FUNCTION__,
907 vcpu, kvm_apic_id(apic),
908 vcpu->arch.apic_base, apic->base_address);
909}
910EXPORT_SYMBOL_GPL(kvm_lapic_reset);
911
912int kvm_lapic_enabled(struct kvm_vcpu *vcpu)
913{
914 struct kvm_lapic *apic = vcpu->arch.apic;
915 int ret = 0;
916
917 if (!apic)
918 return 0;
919 ret = apic_enabled(apic);
920
921 return ret;
922}
923EXPORT_SYMBOL_GPL(kvm_lapic_enabled);
924
925/*
926 *----------------------------------------------------------------------
927 * timer interface
928 *----------------------------------------------------------------------
929 */
930
931/* TODO: make sure __apic_timer_fn runs in current pCPU */
932static int __apic_timer_fn(struct kvm_lapic *apic)
933{
934 int result = 0;
935 wait_queue_head_t *q = &apic->vcpu->wq;
936
937 atomic_inc(&apic->timer.pending);
938 if (waitqueue_active(q)) {
939 apic->vcpu->arch.mp_state = VCPU_MP_STATE_RUNNABLE;
940 wake_up_interruptible(q);
941 }
942 if (apic_lvtt_period(apic)) {
943 result = 1;
944 apic->timer.dev.expires = ktime_add_ns(
945 apic->timer.dev.expires,
946 apic->timer.period);
947 }
948 return result;
949}
950
951static int __inject_apic_timer_irq(struct kvm_lapic *apic)
952{
953 int vector;
954
955 vector = apic_lvt_vector(apic, APIC_LVTT);
956 return __apic_accept_irq(apic, APIC_DM_FIXED, vector, 1, 0);
957}
958
959static enum hrtimer_restart apic_timer_fn(struct hrtimer *data)
960{
961 struct kvm_lapic *apic;
962 int restart_timer = 0;
963
964 apic = container_of(data, struct kvm_lapic, timer.dev);
965
966 restart_timer = __apic_timer_fn(apic);
967
968 if (restart_timer)
969 return HRTIMER_RESTART;
970 else
971 return HRTIMER_NORESTART;
972}
973
974int kvm_create_lapic(struct kvm_vcpu *vcpu)
975{
976 struct kvm_lapic *apic;
977
978 ASSERT(vcpu != NULL);
979 apic_debug("apic_init %d\n", vcpu->vcpu_id);
980
981 apic = kzalloc(sizeof(*apic), GFP_KERNEL);
982 if (!apic)
983 goto nomem;
984
985 vcpu->arch.apic = apic;
986
987 apic->regs_page = alloc_page(GFP_KERNEL);
988 if (apic->regs_page == NULL) {
989 printk(KERN_ERR "malloc apic regs error for vcpu %x\n",
990 vcpu->vcpu_id);
991 goto nomem_free_apic;
992 }
993 apic->regs = page_address(apic->regs_page);
994 memset(apic->regs, 0, PAGE_SIZE);
995 apic->vcpu = vcpu;
996
997 hrtimer_init(&apic->timer.dev, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
998 apic->timer.dev.function = apic_timer_fn;
999 apic->base_address = APIC_DEFAULT_PHYS_BASE;
1000 vcpu->arch.apic_base = APIC_DEFAULT_PHYS_BASE;
1001
1002 kvm_lapic_reset(vcpu);
1003 apic->dev.read = apic_mmio_read;
1004 apic->dev.write = apic_mmio_write;
1005 apic->dev.in_range = apic_mmio_range;
1006 apic->dev.private = apic;
1007
1008 return 0;
1009nomem_free_apic:
1010 kfree(apic);
1011nomem:
1012 return -ENOMEM;
1013}
1014EXPORT_SYMBOL_GPL(kvm_create_lapic);
1015
1016int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu)
1017{
1018 struct kvm_lapic *apic = vcpu->arch.apic;
1019 int highest_irr;
1020
1021 if (!apic || !apic_enabled(apic))
1022 return -1;
1023
1024 apic_update_ppr(apic);
1025 highest_irr = apic_find_highest_irr(apic);
1026 if ((highest_irr == -1) ||
1027 ((highest_irr & 0xF0) <= apic_get_reg(apic, APIC_PROCPRI)))
1028 return -1;
1029 return highest_irr;
1030}
1031
1032int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu)
1033{
1034 u32 lvt0 = apic_get_reg(vcpu->arch.apic, APIC_LVT0);
1035 int r = 0;
1036
1037 if (vcpu->vcpu_id == 0) {
1038 if (!apic_hw_enabled(vcpu->arch.apic))
1039 r = 1;
1040 if ((lvt0 & APIC_LVT_MASKED) == 0 &&
1041 GET_APIC_DELIVERY_MODE(lvt0) == APIC_MODE_EXTINT)
1042 r = 1;
1043 }
1044 return r;
1045}
1046
1047void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu)
1048{
1049 struct kvm_lapic *apic = vcpu->arch.apic;
1050
1051 if (apic && apic_lvt_enabled(apic, APIC_LVTT) &&
1052 atomic_read(&apic->timer.pending) > 0) {
1053 if (__inject_apic_timer_irq(apic))
1054 atomic_dec(&apic->timer.pending);
1055 }
1056}
1057
1058void kvm_apic_timer_intr_post(struct kvm_vcpu *vcpu, int vec)
1059{
1060 struct kvm_lapic *apic = vcpu->arch.apic;
1061
1062 if (apic && apic_lvt_vector(apic, APIC_LVTT) == vec)
1063 apic->timer.last_update = ktime_add_ns(
1064 apic->timer.last_update,
1065 apic->timer.period);
1066}
1067
1068int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu)
1069{
1070 int vector = kvm_apic_has_interrupt(vcpu);
1071 struct kvm_lapic *apic = vcpu->arch.apic;
1072
1073 if (vector == -1)
1074 return -1;
1075
1076 apic_set_vector(vector, apic->regs + APIC_ISR);
1077 apic_update_ppr(apic);
1078 apic_clear_irr(vector, apic);
1079 return vector;
1080}
1081
1082void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu)
1083{
1084 struct kvm_lapic *apic = vcpu->arch.apic;
1085
1086 apic->base_address = vcpu->arch.apic_base &
1087 MSR_IA32_APICBASE_BASE;
1088 apic_set_reg(apic, APIC_LVR, APIC_VERSION);
1089 apic_update_ppr(apic);
1090 hrtimer_cancel(&apic->timer.dev);
1091 update_divide_count(apic);
1092 start_apic_timer(apic);
1093}
1094
1095void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
1096{
1097 struct kvm_lapic *apic = vcpu->arch.apic;
1098 struct hrtimer *timer;
1099
1100 if (!apic)
1101 return;
1102
1103 timer = &apic->timer.dev;
1104 if (hrtimer_cancel(timer))
1105 hrtimer_start(timer, timer->expires, HRTIMER_MODE_ABS);
1106}
1107
1108void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu)
1109{
1110 u32 data;
1111 void *vapic;
1112
1113 if (!irqchip_in_kernel(vcpu->kvm) || !vcpu->arch.apic->vapic_addr)
1114 return;
1115
1116 vapic = kmap_atomic(vcpu->arch.apic->vapic_page, KM_USER0);
1117 data = *(u32 *)(vapic + offset_in_page(vcpu->arch.apic->vapic_addr));
1118 kunmap_atomic(vapic, KM_USER0);
1119
1120 apic_set_tpr(vcpu->arch.apic, data & 0xff);
1121}
1122
1123void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu)
1124{
1125 u32 data, tpr;
1126 int max_irr, max_isr;
1127 struct kvm_lapic *apic;
1128 void *vapic;
1129
1130 if (!irqchip_in_kernel(vcpu->kvm) || !vcpu->arch.apic->vapic_addr)
1131 return;
1132
1133 apic = vcpu->arch.apic;
1134 tpr = apic_get_reg(apic, APIC_TASKPRI) & 0xff;
1135 max_irr = apic_find_highest_irr(apic);
1136 if (max_irr < 0)
1137 max_irr = 0;
1138 max_isr = apic_find_highest_isr(apic);
1139 if (max_isr < 0)
1140 max_isr = 0;
1141 data = (tpr & 0xff) | ((max_isr & 0xf0) << 8) | (max_irr << 24);
1142
1143 vapic = kmap_atomic(vcpu->arch.apic->vapic_page, KM_USER0);
1144 *(u32 *)(vapic + offset_in_page(vcpu->arch.apic->vapic_addr)) = data;
1145 kunmap_atomic(vapic, KM_USER0);
1146}
1147
1148void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr)
1149{
1150 if (!irqchip_in_kernel(vcpu->kvm))
1151 return;
1152
1153 vcpu->arch.apic->vapic_addr = vapic_addr;
1154}
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
new file mode 100644
index 000000000000..676c396c9cee
--- /dev/null
+++ b/arch/x86/kvm/lapic.h
@@ -0,0 +1,50 @@
1#ifndef __KVM_X86_LAPIC_H
2#define __KVM_X86_LAPIC_H
3
4#include "iodev.h"
5
6#include <linux/kvm_host.h>
7
8struct kvm_lapic {
9 unsigned long base_address;
10 struct kvm_io_device dev;
11 struct {
12 atomic_t pending;
13 s64 period; /* unit: ns */
14 u32 divide_count;
15 ktime_t last_update;
16 struct hrtimer dev;
17 } timer;
18 struct kvm_vcpu *vcpu;
19 struct page *regs_page;
20 void *regs;
21 gpa_t vapic_addr;
22 struct page *vapic_page;
23};
24int kvm_create_lapic(struct kvm_vcpu *vcpu);
25void kvm_free_lapic(struct kvm_vcpu *vcpu);
26
27int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu);
28int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu);
29int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu);
30void kvm_lapic_reset(struct kvm_vcpu *vcpu);
31u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu);
32void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8);
33void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value);
34
35int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest);
36int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda);
37int kvm_apic_set_irq(struct kvm_vcpu *vcpu, u8 vec, u8 trig);
38
39u64 kvm_get_apic_base(struct kvm_vcpu *vcpu);
40void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data);
41void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu);
42int kvm_lapic_enabled(struct kvm_vcpu *vcpu);
43int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu);
44void kvm_apic_timer_intr_post(struct kvm_vcpu *vcpu, int vec);
45
46void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr);
47void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu);
48void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu);
49
50#endif
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
new file mode 100644
index 000000000000..8efdcdbebb03
--- /dev/null
+++ b/arch/x86/kvm/mmu.c
@@ -0,0 +1,1885 @@
1/*
2 * Kernel-based Virtual Machine driver for Linux
3 *
4 * This module enables machines with Intel VT-x extensions to run virtual
5 * machines without emulation or binary translation.
6 *
7 * MMU support
8 *
9 * Copyright (C) 2006 Qumranet, Inc.
10 *
11 * Authors:
12 * Yaniv Kamay <yaniv@qumranet.com>
13 * Avi Kivity <avi@qumranet.com>
14 *
15 * This work is licensed under the terms of the GNU GPL, version 2. See
16 * the COPYING file in the top-level directory.
17 *
18 */
19
20#include "vmx.h"
21#include "mmu.h"
22
23#include <linux/kvm_host.h>
24#include <linux/types.h>
25#include <linux/string.h>
26#include <linux/mm.h>
27#include <linux/highmem.h>
28#include <linux/module.h>
29#include <linux/swap.h>
30
31#include <asm/page.h>
32#include <asm/cmpxchg.h>
33#include <asm/io.h>
34
35#undef MMU_DEBUG
36
37#undef AUDIT
38
39#ifdef AUDIT
40static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg);
41#else
42static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) {}
43#endif
44
45#ifdef MMU_DEBUG
46
47#define pgprintk(x...) do { if (dbg) printk(x); } while (0)
48#define rmap_printk(x...) do { if (dbg) printk(x); } while (0)
49
50#else
51
52#define pgprintk(x...) do { } while (0)
53#define rmap_printk(x...) do { } while (0)
54
55#endif
56
57#if defined(MMU_DEBUG) || defined(AUDIT)
58static int dbg = 1;
59#endif
60
61#ifndef MMU_DEBUG
62#define ASSERT(x) do { } while (0)
63#else
64#define ASSERT(x) \
65 if (!(x)) { \
66 printk(KERN_WARNING "assertion failed %s:%d: %s\n", \
67 __FILE__, __LINE__, #x); \
68 }
69#endif
70
71#define PT64_PT_BITS 9
72#define PT64_ENT_PER_PAGE (1 << PT64_PT_BITS)
73#define PT32_PT_BITS 10
74#define PT32_ENT_PER_PAGE (1 << PT32_PT_BITS)
75
76#define PT_WRITABLE_SHIFT 1
77
78#define PT_PRESENT_MASK (1ULL << 0)
79#define PT_WRITABLE_MASK (1ULL << PT_WRITABLE_SHIFT)
80#define PT_USER_MASK (1ULL << 2)
81#define PT_PWT_MASK (1ULL << 3)
82#define PT_PCD_MASK (1ULL << 4)
83#define PT_ACCESSED_MASK (1ULL << 5)
84#define PT_DIRTY_MASK (1ULL << 6)
85#define PT_PAGE_SIZE_MASK (1ULL << 7)
86#define PT_PAT_MASK (1ULL << 7)
87#define PT_GLOBAL_MASK (1ULL << 8)
88#define PT64_NX_SHIFT 63
89#define PT64_NX_MASK (1ULL << PT64_NX_SHIFT)
90
91#define PT_PAT_SHIFT 7
92#define PT_DIR_PAT_SHIFT 12
93#define PT_DIR_PAT_MASK (1ULL << PT_DIR_PAT_SHIFT)
94
95#define PT32_DIR_PSE36_SIZE 4
96#define PT32_DIR_PSE36_SHIFT 13
97#define PT32_DIR_PSE36_MASK \
98 (((1ULL << PT32_DIR_PSE36_SIZE) - 1) << PT32_DIR_PSE36_SHIFT)
99
100
101#define PT_FIRST_AVAIL_BITS_SHIFT 9
102#define PT64_SECOND_AVAIL_BITS_SHIFT 52
103
104#define PT_SHADOW_IO_MARK (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
105
106#define VALID_PAGE(x) ((x) != INVALID_PAGE)
107
108#define PT64_LEVEL_BITS 9
109
110#define PT64_LEVEL_SHIFT(level) \
111 (PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS)
112
113#define PT64_LEVEL_MASK(level) \
114 (((1ULL << PT64_LEVEL_BITS) - 1) << PT64_LEVEL_SHIFT(level))
115
116#define PT64_INDEX(address, level)\
117 (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1))
118
119
120#define PT32_LEVEL_BITS 10
121
122#define PT32_LEVEL_SHIFT(level) \
123 (PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS)
124
125#define PT32_LEVEL_MASK(level) \
126 (((1ULL << PT32_LEVEL_BITS) - 1) << PT32_LEVEL_SHIFT(level))
127
128#define PT32_INDEX(address, level)\
129 (((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1))
130
131
132#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))
133#define PT64_DIR_BASE_ADDR_MASK \
134 (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + PT64_LEVEL_BITS)) - 1))
135
136#define PT32_BASE_ADDR_MASK PAGE_MASK
137#define PT32_DIR_BASE_ADDR_MASK \
138 (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1))
139
140#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \
141 | PT64_NX_MASK)
142
143#define PFERR_PRESENT_MASK (1U << 0)
144#define PFERR_WRITE_MASK (1U << 1)
145#define PFERR_USER_MASK (1U << 2)
146#define PFERR_FETCH_MASK (1U << 4)
147
148#define PT64_ROOT_LEVEL 4
149#define PT32_ROOT_LEVEL 2
150#define PT32E_ROOT_LEVEL 3
151
152#define PT_DIRECTORY_LEVEL 2
153#define PT_PAGE_TABLE_LEVEL 1
154
155#define RMAP_EXT 4
156
157#define ACC_EXEC_MASK 1
158#define ACC_WRITE_MASK PT_WRITABLE_MASK
159#define ACC_USER_MASK PT_USER_MASK
160#define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)
161
162struct kvm_rmap_desc {
163 u64 *shadow_ptes[RMAP_EXT];
164 struct kvm_rmap_desc *more;
165};
166
167static struct kmem_cache *pte_chain_cache;
168static struct kmem_cache *rmap_desc_cache;
169static struct kmem_cache *mmu_page_header_cache;
170
171static u64 __read_mostly shadow_trap_nonpresent_pte;
172static u64 __read_mostly shadow_notrap_nonpresent_pte;
173
174void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte)
175{
176 shadow_trap_nonpresent_pte = trap_pte;
177 shadow_notrap_nonpresent_pte = notrap_pte;
178}
179EXPORT_SYMBOL_GPL(kvm_mmu_set_nonpresent_ptes);
180
181static int is_write_protection(struct kvm_vcpu *vcpu)
182{
183 return vcpu->arch.cr0 & X86_CR0_WP;
184}
185
186static int is_cpuid_PSE36(void)
187{
188 return 1;
189}
190
191static int is_nx(struct kvm_vcpu *vcpu)
192{
193 return vcpu->arch.shadow_efer & EFER_NX;
194}
195
196static int is_present_pte(unsigned long pte)
197{
198 return pte & PT_PRESENT_MASK;
199}
200
201static int is_shadow_present_pte(u64 pte)
202{
203 pte &= ~PT_SHADOW_IO_MARK;
204 return pte != shadow_trap_nonpresent_pte
205 && pte != shadow_notrap_nonpresent_pte;
206}
207
208static int is_writeble_pte(unsigned long pte)
209{
210 return pte & PT_WRITABLE_MASK;
211}
212
213static int is_dirty_pte(unsigned long pte)
214{
215 return pte & PT_DIRTY_MASK;
216}
217
218static int is_io_pte(unsigned long pte)
219{
220 return pte & PT_SHADOW_IO_MARK;
221}
222
223static int is_rmap_pte(u64 pte)
224{
225 return pte != shadow_trap_nonpresent_pte
226 && pte != shadow_notrap_nonpresent_pte;
227}
228
229static gfn_t pse36_gfn_delta(u32 gpte)
230{
231 int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT;
232
233 return (gpte & PT32_DIR_PSE36_MASK) << shift;
234}
235
236static void set_shadow_pte(u64 *sptep, u64 spte)
237{
238#ifdef CONFIG_X86_64
239 set_64bit((unsigned long *)sptep, spte);
240#else
241 set_64bit((unsigned long long *)sptep, spte);
242#endif
243}
244
245static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
246 struct kmem_cache *base_cache, int min)
247{
248 void *obj;
249
250 if (cache->nobjs >= min)
251 return 0;
252 while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
253 obj = kmem_cache_zalloc(base_cache, GFP_KERNEL);
254 if (!obj)
255 return -ENOMEM;
256 cache->objects[cache->nobjs++] = obj;
257 }
258 return 0;
259}
260
261static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
262{
263 while (mc->nobjs)
264 kfree(mc->objects[--mc->nobjs]);
265}
266
267static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache,
268 int min)
269{
270 struct page *page;
271
272 if (cache->nobjs >= min)
273 return 0;
274 while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
275 page = alloc_page(GFP_KERNEL);
276 if (!page)
277 return -ENOMEM;
278 set_page_private(page, 0);
279 cache->objects[cache->nobjs++] = page_address(page);
280 }
281 return 0;
282}
283
284static void mmu_free_memory_cache_page(struct kvm_mmu_memory_cache *mc)
285{
286 while (mc->nobjs)
287 free_page((unsigned long)mc->objects[--mc->nobjs]);
288}
289
290static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu)
291{
292 int r;
293
294 r = mmu_topup_memory_cache(&vcpu->arch.mmu_pte_chain_cache,
295 pte_chain_cache, 4);
296 if (r)
297 goto out;
298 r = mmu_topup_memory_cache(&vcpu->arch.mmu_rmap_desc_cache,
299 rmap_desc_cache, 1);
300 if (r)
301 goto out;
302 r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8);
303 if (r)
304 goto out;
305 r = mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache,
306 mmu_page_header_cache, 4);
307out:
308 return r;
309}
310
311static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
312{
313 mmu_free_memory_cache(&vcpu->arch.mmu_pte_chain_cache);
314 mmu_free_memory_cache(&vcpu->arch.mmu_rmap_desc_cache);
315 mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache);
316 mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache);
317}
318
319static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc,
320 size_t size)
321{
322 void *p;
323
324 BUG_ON(!mc->nobjs);
325 p = mc->objects[--mc->nobjs];
326 memset(p, 0, size);
327 return p;
328}
329
330static struct kvm_pte_chain *mmu_alloc_pte_chain(struct kvm_vcpu *vcpu)
331{
332 return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_chain_cache,
333 sizeof(struct kvm_pte_chain));
334}
335
336static void mmu_free_pte_chain(struct kvm_pte_chain *pc)
337{
338 kfree(pc);
339}
340
341static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu)
342{
343 return mmu_memory_cache_alloc(&vcpu->arch.mmu_rmap_desc_cache,
344 sizeof(struct kvm_rmap_desc));
345}
346
347static void mmu_free_rmap_desc(struct kvm_rmap_desc *rd)
348{
349 kfree(rd);
350}
351
352/*
353 * Take gfn and return the reverse mapping to it.
354 * Note: gfn must be unaliased before this function gets called
355 */
356
357static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn)
358{
359 struct kvm_memory_slot *slot;
360
361 slot = gfn_to_memslot(kvm, gfn);
362 return &slot->rmap[gfn - slot->base_gfn];
363}
364
365/*
366 * Reverse mapping data structures:
367 *
368 * If rmapp bit zero is zero, then rmapp points to the shadow page table entry
369 * that points to page_address(page).
370 *
371 * If rmapp bit zero is one, then (rmapp & ~1) points to a struct kvm_rmap_desc
372 * containing more mappings.
373 */
374static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
375{
376 struct kvm_mmu_page *sp;
377 struct kvm_rmap_desc *desc;
378 unsigned long *rmapp;
379 int i;
380
381 if (!is_rmap_pte(*spte))
382 return;
383 gfn = unalias_gfn(vcpu->kvm, gfn);
384 sp = page_header(__pa(spte));
385 sp->gfns[spte - sp->spt] = gfn;
386 rmapp = gfn_to_rmap(vcpu->kvm, gfn);
387 if (!*rmapp) {
388 rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte);
389 *rmapp = (unsigned long)spte;
390 } else if (!(*rmapp & 1)) {
391 rmap_printk("rmap_add: %p %llx 1->many\n", spte, *spte);
392 desc = mmu_alloc_rmap_desc(vcpu);
393 desc->shadow_ptes[0] = (u64 *)*rmapp;
394 desc->shadow_ptes[1] = spte;
395 *rmapp = (unsigned long)desc | 1;
396 } else {
397 rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte);
398 desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
399 while (desc->shadow_ptes[RMAP_EXT-1] && desc->more)
400 desc = desc->more;
401 if (desc->shadow_ptes[RMAP_EXT-1]) {
402 desc->more = mmu_alloc_rmap_desc(vcpu);
403 desc = desc->more;
404 }
405 for (i = 0; desc->shadow_ptes[i]; ++i)
406 ;
407 desc->shadow_ptes[i] = spte;
408 }
409}
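
The tagged-pointer encoding described in the comment above (bit zero of *rmapp selects between a single spte pointer and a kvm_rmap_desc chain) is easy to lose track of in the surrounding code. The stand-alone sketch below models just that encoding in plain C; the rmap_desc_model and decode_rmapp names are illustrative, not part of KVM.

#include <stdint.h>
#include <stdio.h>

/* Editor's model of the rmapp encoding described above; not kernel code. */
struct rmap_desc_model {
	uint64_t *sptes[4];             /* mirrors shadow_ptes[RMAP_EXT] */
	struct rmap_desc_model *more;   /* mirrors desc->more            */
};

/* Bit zero clear: rmapp is a single spte pointer.  Bit zero set: desc chain. */
static void decode_rmapp(unsigned long rmapp)
{
	if (!rmapp) {
		printf("empty: no shadow ptes map this gfn\n");
	} else if (!(rmapp & 1)) {
		printf("single spte at %p\n", (void *)rmapp);
	} else {
		struct rmap_desc_model *d = (struct rmap_desc_model *)(rmapp & ~1ul);

		for (; d; d = d->more) {
			for (int i = 0; i < 4 && d->sptes[i]; ++i)
				printf("spte at %p\n", (void *)d->sptes[i]);
		}
	}
}

int main(void)
{
	uint64_t spte = 0;
	struct rmap_desc_model d = { .sptes = { &spte } };

	decode_rmapp(0);                      /* the 0-entry case    */
	decode_rmapp((unsigned long)&spte);   /* the 1-entry case    */
	decode_rmapp((unsigned long)&d | 1);  /* the many-entry case */
	return 0;
}
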
410
411static void rmap_desc_remove_entry(unsigned long *rmapp,
412 struct kvm_rmap_desc *desc,
413 int i,
414 struct kvm_rmap_desc *prev_desc)
415{
416 int j;
417
418 for (j = RMAP_EXT - 1; !desc->shadow_ptes[j] && j > i; --j)
419 ;
420 desc->shadow_ptes[i] = desc->shadow_ptes[j];
421 desc->shadow_ptes[j] = NULL;
422 if (j != 0)
423 return;
424 if (!prev_desc && !desc->more)
425 *rmapp = (unsigned long)desc->shadow_ptes[0];
426 else
427 if (prev_desc)
428 prev_desc->more = desc->more;
429 else
430 *rmapp = (unsigned long)desc->more | 1;
431 mmu_free_rmap_desc(desc);
432}
433
434static void rmap_remove(struct kvm *kvm, u64 *spte)
435{
436 struct kvm_rmap_desc *desc;
437 struct kvm_rmap_desc *prev_desc;
438 struct kvm_mmu_page *sp;
439 struct page *page;
440 unsigned long *rmapp;
441 int i;
442
443 if (!is_rmap_pte(*spte))
444 return;
445 sp = page_header(__pa(spte));
446 page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT);
447 mark_page_accessed(page);
448 if (is_writeble_pte(*spte))
449 kvm_release_page_dirty(page);
450 else
451 kvm_release_page_clean(page);
452 rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt]);
453 if (!*rmapp) {
454 printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte);
455 BUG();
456 } else if (!(*rmapp & 1)) {
457 rmap_printk("rmap_remove: %p %llx 1->0\n", spte, *spte);
458 if ((u64 *)*rmapp != spte) {
459 printk(KERN_ERR "rmap_remove: %p %llx 1->BUG\n",
460 spte, *spte);
461 BUG();
462 }
463 *rmapp = 0;
464 } else {
465 rmap_printk("rmap_remove: %p %llx many->many\n", spte, *spte);
466 desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
467 prev_desc = NULL;
468 while (desc) {
469 for (i = 0; i < RMAP_EXT && desc->shadow_ptes[i]; ++i)
470 if (desc->shadow_ptes[i] == spte) {
471 rmap_desc_remove_entry(rmapp,
472 desc, i,
473 prev_desc);
474 return;
475 }
476 prev_desc = desc;
477 desc = desc->more;
478 }
479 BUG();
480 }
481}
482
483static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte)
484{
485 struct kvm_rmap_desc *desc;
486 struct kvm_rmap_desc *prev_desc;
487 u64 *prev_spte;
488 int i;
489
490 if (!*rmapp)
491 return NULL;
492 else if (!(*rmapp & 1)) {
493 if (!spte)
494 return (u64 *)*rmapp;
495 return NULL;
496 }
497 desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
498 prev_desc = NULL;
499 prev_spte = NULL;
500 while (desc) {
501 for (i = 0; i < RMAP_EXT && desc->shadow_ptes[i]; ++i) {
502 if (prev_spte == spte)
503 return desc->shadow_ptes[i];
504 prev_spte = desc->shadow_ptes[i];
505 }
506 desc = desc->more;
507 }
508 return NULL;
509}
510
511static void rmap_write_protect(struct kvm *kvm, u64 gfn)
512{
513 unsigned long *rmapp;
514 u64 *spte;
515 int write_protected = 0;
516
517 gfn = unalias_gfn(kvm, gfn);
518 rmapp = gfn_to_rmap(kvm, gfn);
519
520 spte = rmap_next(kvm, rmapp, NULL);
521 while (spte) {
522 BUG_ON(!spte);
523 BUG_ON(!(*spte & PT_PRESENT_MASK));
524 rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
525 if (is_writeble_pte(*spte)) {
526 set_shadow_pte(spte, *spte & ~PT_WRITABLE_MASK);
527 write_protected = 1;
528 }
529 spte = rmap_next(kvm, rmapp, spte);
530 }
531 if (write_protected)
532 kvm_flush_remote_tlbs(kvm);
533}
534
535#ifdef MMU_DEBUG
536static int is_empty_shadow_page(u64 *spt)
537{
538 u64 *pos;
539 u64 *end;
540
541 for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++)
542 if ((*pos & ~PT_SHADOW_IO_MARK) != shadow_trap_nonpresent_pte) {
543 printk(KERN_ERR "%s: %p %llx\n", __FUNCTION__,
544 pos, *pos);
545 return 0;
546 }
547 return 1;
548}
549#endif
550
551static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp)
552{
553 ASSERT(is_empty_shadow_page(sp->spt));
554 list_del(&sp->link);
555 __free_page(virt_to_page(sp->spt));
556 __free_page(virt_to_page(sp->gfns));
557 kfree(sp);
558 ++kvm->arch.n_free_mmu_pages;
559}
560
561static unsigned kvm_page_table_hashfn(gfn_t gfn)
562{
563 return gfn;
564}
565
566static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
567 u64 *parent_pte)
568{
569 struct kvm_mmu_page *sp;
570
571 sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache, sizeof *sp);
572 sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
573 sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
574 set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
575 list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
576 ASSERT(is_empty_shadow_page(sp->spt));
577 sp->slot_bitmap = 0;
578 sp->multimapped = 0;
579 sp->parent_pte = parent_pte;
580 --vcpu->kvm->arch.n_free_mmu_pages;
581 return sp;
582}
583
584static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu,
585 struct kvm_mmu_page *sp, u64 *parent_pte)
586{
587 struct kvm_pte_chain *pte_chain;
588 struct hlist_node *node;
589 int i;
590
591 if (!parent_pte)
592 return;
593 if (!sp->multimapped) {
594 u64 *old = sp->parent_pte;
595
596 if (!old) {
597 sp->parent_pte = parent_pte;
598 return;
599 }
600 sp->multimapped = 1;
601 pte_chain = mmu_alloc_pte_chain(vcpu);
602 INIT_HLIST_HEAD(&sp->parent_ptes);
603 hlist_add_head(&pte_chain->link, &sp->parent_ptes);
604 pte_chain->parent_ptes[0] = old;
605 }
606 hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link) {
607 if (pte_chain->parent_ptes[NR_PTE_CHAIN_ENTRIES-1])
608 continue;
609 for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i)
610 if (!pte_chain->parent_ptes[i]) {
611 pte_chain->parent_ptes[i] = parent_pte;
612 return;
613 }
614 }
615 pte_chain = mmu_alloc_pte_chain(vcpu);
616 BUG_ON(!pte_chain);
617 hlist_add_head(&pte_chain->link, &sp->parent_ptes);
618 pte_chain->parent_ptes[0] = parent_pte;
619}
620
621static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
622 u64 *parent_pte)
623{
624 struct kvm_pte_chain *pte_chain;
625 struct hlist_node *node;
626 int i;
627
628 if (!sp->multimapped) {
629 BUG_ON(sp->parent_pte != parent_pte);
630 sp->parent_pte = NULL;
631 return;
632 }
633 hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
634 for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
635 if (!pte_chain->parent_ptes[i])
636 break;
637 if (pte_chain->parent_ptes[i] != parent_pte)
638 continue;
639 while (i + 1 < NR_PTE_CHAIN_ENTRIES
640 && pte_chain->parent_ptes[i + 1]) {
641 pte_chain->parent_ptes[i]
642 = pte_chain->parent_ptes[i + 1];
643 ++i;
644 }
645 pte_chain->parent_ptes[i] = NULL;
646 if (i == 0) {
647 hlist_del(&pte_chain->link);
648 mmu_free_pte_chain(pte_chain);
649 if (hlist_empty(&sp->parent_ptes)) {
650 sp->multimapped = 0;
651 sp->parent_pte = NULL;
652 }
653 }
654 return;
655 }
656 BUG();
657}
658
659static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn)
660{
661 unsigned index;
662 struct hlist_head *bucket;
663 struct kvm_mmu_page *sp;
664 struct hlist_node *node;
665
666 pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn);
667 index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
668 bucket = &kvm->arch.mmu_page_hash[index];
669 hlist_for_each_entry(sp, node, bucket, hash_link)
670 if (sp->gfn == gfn && !sp->role.metaphysical) {
671 pgprintk("%s: found role %x\n",
672 __FUNCTION__, sp->role.word);
673 return sp;
674 }
675 return NULL;
676}
677
678static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
679 gfn_t gfn,
680 gva_t gaddr,
681 unsigned level,
682 int metaphysical,
683 unsigned access,
684 u64 *parent_pte,
685 bool *new_page)
686{
687 union kvm_mmu_page_role role;
688 unsigned index;
689 unsigned quadrant;
690 struct hlist_head *bucket;
691 struct kvm_mmu_page *sp;
692 struct hlist_node *node;
693
694 role.word = 0;
695 role.glevels = vcpu->arch.mmu.root_level;
696 role.level = level;
697 role.metaphysical = metaphysical;
698 role.access = access;
699 if (vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) {
700 quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
701 quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
702 role.quadrant = quadrant;
703 }
704 pgprintk("%s: looking gfn %lx role %x\n", __FUNCTION__,
705 gfn, role.word);
706 index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
707 bucket = &vcpu->kvm->arch.mmu_page_hash[index];
708 hlist_for_each_entry(sp, node, bucket, hash_link)
709 if (sp->gfn == gfn && sp->role.word == role.word) {
710 mmu_page_add_parent_pte(vcpu, sp, parent_pte);
711 pgprintk("%s: found\n", __FUNCTION__);
712 return sp;
713 }
714 ++vcpu->kvm->stat.mmu_cache_miss;
715 sp = kvm_mmu_alloc_page(vcpu, parent_pte);
716 if (!sp)
717 return sp;
718 pgprintk("%s: adding gfn %lx role %x\n", __FUNCTION__, gfn, role.word);
719 sp->gfn = gfn;
720 sp->role = role;
721 hlist_add_head(&sp->hash_link, bucket);
722 vcpu->arch.mmu.prefetch_page(vcpu, sp);
723 if (!metaphysical)
724 rmap_write_protect(vcpu->kvm, gfn);
725 if (new_page)
726 *new_page = 1;
727 return sp;
728}
729
730static void kvm_mmu_page_unlink_children(struct kvm *kvm,
731 struct kvm_mmu_page *sp)
732{
733 unsigned i;
734 u64 *pt;
735 u64 ent;
736
737 pt = sp->spt;
738
739 if (sp->role.level == PT_PAGE_TABLE_LEVEL) {
740 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
741 if (is_shadow_present_pte(pt[i]))
742 rmap_remove(kvm, &pt[i]);
743 pt[i] = shadow_trap_nonpresent_pte;
744 }
745 kvm_flush_remote_tlbs(kvm);
746 return;
747 }
748
749 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
750 ent = pt[i];
751
752 pt[i] = shadow_trap_nonpresent_pte;
753 if (!is_shadow_present_pte(ent))
754 continue;
755 ent &= PT64_BASE_ADDR_MASK;
756 mmu_page_remove_parent_pte(page_header(ent), &pt[i]);
757 }
758 kvm_flush_remote_tlbs(kvm);
759}
760
761static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte)
762{
763 mmu_page_remove_parent_pte(sp, parent_pte);
764}
765
766static void kvm_mmu_reset_last_pte_updated(struct kvm *kvm)
767{
768 int i;
769
770 for (i = 0; i < KVM_MAX_VCPUS; ++i)
771 if (kvm->vcpus[i])
772 kvm->vcpus[i]->arch.last_pte_updated = NULL;
773}
774
775static void kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
776{
777 u64 *parent_pte;
778
779 ++kvm->stat.mmu_shadow_zapped;
780 while (sp->multimapped || sp->parent_pte) {
781 if (!sp->multimapped)
782 parent_pte = sp->parent_pte;
783 else {
784 struct kvm_pte_chain *chain;
785
786 chain = container_of(sp->parent_ptes.first,
787 struct kvm_pte_chain, link);
788 parent_pte = chain->parent_ptes[0];
789 }
790 BUG_ON(!parent_pte);
791 kvm_mmu_put_page(sp, parent_pte);
792 set_shadow_pte(parent_pte, shadow_trap_nonpresent_pte);
793 }
794 kvm_mmu_page_unlink_children(kvm, sp);
795 if (!sp->root_count) {
796 hlist_del(&sp->hash_link);
797 kvm_mmu_free_page(kvm, sp);
798 } else
799 list_move(&sp->link, &kvm->arch.active_mmu_pages);
800 kvm_mmu_reset_last_pte_updated(kvm);
801}
802
803/*
804 * Changing the number of mmu pages allocated to the vm
805 * Note: if kvm_nr_mmu_pages is too small, you will get a deadlock
806 */
807void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages)
808{
809 /*
810	 * If we set the number of mmu pages to be smaller than the
811	 * number of active pages, we must free some mmu pages before we
812	 * change the value
813 */
814
815 if ((kvm->arch.n_alloc_mmu_pages - kvm->arch.n_free_mmu_pages) >
816 kvm_nr_mmu_pages) {
817 int n_used_mmu_pages = kvm->arch.n_alloc_mmu_pages
818 - kvm->arch.n_free_mmu_pages;
819
820 while (n_used_mmu_pages > kvm_nr_mmu_pages) {
821 struct kvm_mmu_page *page;
822
823 page = container_of(kvm->arch.active_mmu_pages.prev,
824 struct kvm_mmu_page, link);
825 kvm_mmu_zap_page(kvm, page);
826 n_used_mmu_pages--;
827 }
828 kvm->arch.n_free_mmu_pages = 0;
829 }
830 else
831 kvm->arch.n_free_mmu_pages += kvm_nr_mmu_pages
832 - kvm->arch.n_alloc_mmu_pages;
833
834 kvm->arch.n_alloc_mmu_pages = kvm_nr_mmu_pages;
835}
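
The accounting above boils down to: bring the in-use count down to the new target if necessary (zeroing the free count), otherwise credit the difference to the free count, and record the new total. A minimal user-space model of that arithmetic, with made-up field names, is:

#include <stdio.h>

/* Editor's model of the n_alloc/n_free bookkeeping above; names are made up. */
struct mmu_budget {
	unsigned int alloc;   /* total shadow pages the vm may hold */
	unsigned int free;    /* of those, how many are not in use  */
};

static void change_mmu_pages(struct mmu_budget *b, unsigned int target)
{
	unsigned int used = b->alloc - b->free;

	if (used > target) {
		/* the kernel zaps (used - target) shadow pages here */
		b->free = 0;
	} else {
		b->free += target - b->alloc;
	}
	b->alloc = target;
}

int main(void)
{
	struct mmu_budget b = { .alloc = 256, .free = 56 };   /* 200 in use */

	change_mmu_pages(&b, 100);   /* shrink below the in-use count */
	printf("alloc=%u free=%u\n", b.alloc, b.free);   /* alloc=100 free=0   */
	change_mmu_pages(&b, 300);   /* grow the budget again */
	printf("alloc=%u free=%u\n", b.alloc, b.free);   /* alloc=300 free=200 */
	return 0;
}
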
836
837static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
838{
839 unsigned index;
840 struct hlist_head *bucket;
841 struct kvm_mmu_page *sp;
842 struct hlist_node *node, *n;
843 int r;
844
845 pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn);
846 r = 0;
847 index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
848 bucket = &kvm->arch.mmu_page_hash[index];
849 hlist_for_each_entry_safe(sp, node, n, bucket, hash_link)
850 if (sp->gfn == gfn && !sp->role.metaphysical) {
851 pgprintk("%s: gfn %lx role %x\n", __FUNCTION__, gfn,
852 sp->role.word);
853 kvm_mmu_zap_page(kvm, sp);
854 r = 1;
855 }
856 return r;
857}
858
859static void mmu_unshadow(struct kvm *kvm, gfn_t gfn)
860{
861 struct kvm_mmu_page *sp;
862
863 while ((sp = kvm_mmu_lookup_page(kvm, gfn)) != NULL) {
864 pgprintk("%s: zap %lx %x\n", __FUNCTION__, gfn, sp->role.word);
865 kvm_mmu_zap_page(kvm, sp);
866 }
867}
868
869static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn)
870{
871 int slot = memslot_id(kvm, gfn_to_memslot(kvm, gfn));
872 struct kvm_mmu_page *sp = page_header(__pa(pte));
873
874 __set_bit(slot, &sp->slot_bitmap);
875}
876
877struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva)
878{
879 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva);
880
881 if (gpa == UNMAPPED_GVA)
882 return NULL;
883 return gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
884}
885
886static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
887 unsigned pt_access, unsigned pte_access,
888 int user_fault, int write_fault, int dirty,
889 int *ptwrite, gfn_t gfn, struct page *page)
890{
891 u64 spte;
892 int was_rmapped = is_rmap_pte(*shadow_pte);
893 int was_writeble = is_writeble_pte(*shadow_pte);
894
895 pgprintk("%s: spte %llx access %x write_fault %d"
896 " user_fault %d gfn %lx\n",
897 __FUNCTION__, *shadow_pte, pt_access,
898 write_fault, user_fault, gfn);
899
900 /*
901 * We don't set the accessed bit, since we sometimes want to see
902 * whether the guest actually used the pte (in order to detect
903 * demand paging).
904 */
905 spte = PT_PRESENT_MASK | PT_DIRTY_MASK;
906 if (!dirty)
907 pte_access &= ~ACC_WRITE_MASK;
908 if (!(pte_access & ACC_EXEC_MASK))
909 spte |= PT64_NX_MASK;
910
911 spte |= PT_PRESENT_MASK;
912 if (pte_access & ACC_USER_MASK)
913 spte |= PT_USER_MASK;
914
915 if (is_error_page(page)) {
916 set_shadow_pte(shadow_pte,
917 shadow_trap_nonpresent_pte | PT_SHADOW_IO_MARK);
918 kvm_release_page_clean(page);
919 return;
920 }
921
922 spte |= page_to_phys(page);
923
924 if ((pte_access & ACC_WRITE_MASK)
925 || (write_fault && !is_write_protection(vcpu) && !user_fault)) {
926 struct kvm_mmu_page *shadow;
927
928 spte |= PT_WRITABLE_MASK;
929 if (user_fault) {
930 mmu_unshadow(vcpu->kvm, gfn);
931 goto unshadowed;
932 }
933
934 shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn);
935 if (shadow) {
936 pgprintk("%s: found shadow page for %lx, marking ro\n",
937 __FUNCTION__, gfn);
938 pte_access &= ~ACC_WRITE_MASK;
939 if (is_writeble_pte(spte)) {
940 spte &= ~PT_WRITABLE_MASK;
941 kvm_x86_ops->tlb_flush(vcpu);
942 }
943 if (write_fault)
944 *ptwrite = 1;
945 }
946 }
947
948unshadowed:
949
950 if (pte_access & ACC_WRITE_MASK)
951 mark_page_dirty(vcpu->kvm, gfn);
952
953 pgprintk("%s: setting spte %llx\n", __FUNCTION__, spte);
954 set_shadow_pte(shadow_pte, spte);
955 page_header_update_slot(vcpu->kvm, shadow_pte, gfn);
956 if (!was_rmapped) {
957 rmap_add(vcpu, shadow_pte, gfn);
958 if (!is_rmap_pte(*shadow_pte))
959 kvm_release_page_clean(page);
960 } else {
961 if (was_writeble)
962 kvm_release_page_dirty(page);
963 else
964 kvm_release_page_clean(page);
965 }
966 if (!ptwrite || !*ptwrite)
967 vcpu->arch.last_pte_updated = shadow_pte;
968}
969
970static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
971{
972}
973
974static int __nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write,
975 gfn_t gfn, struct page *page)
976{
977 int level = PT32E_ROOT_LEVEL;
978 hpa_t table_addr = vcpu->arch.mmu.root_hpa;
979 int pt_write = 0;
980
981 for (; ; level--) {
982 u32 index = PT64_INDEX(v, level);
983 u64 *table;
984
985 ASSERT(VALID_PAGE(table_addr));
986 table = __va(table_addr);
987
988 if (level == 1) {
989 mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL,
990 0, write, 1, &pt_write, gfn, page);
991 return pt_write || is_io_pte(table[index]);
992 }
993
994 if (table[index] == shadow_trap_nonpresent_pte) {
995 struct kvm_mmu_page *new_table;
996 gfn_t pseudo_gfn;
997
998 pseudo_gfn = (v & PT64_DIR_BASE_ADDR_MASK)
999 >> PAGE_SHIFT;
1000 new_table = kvm_mmu_get_page(vcpu, pseudo_gfn,
1001 v, level - 1,
1002 1, ACC_ALL, &table[index],
1003 NULL);
1004 if (!new_table) {
1005 pgprintk("nonpaging_map: ENOMEM\n");
1006 kvm_release_page_clean(page);
1007 return -ENOMEM;
1008 }
1009
1010 table[index] = __pa(new_table->spt) | PT_PRESENT_MASK
1011 | PT_WRITABLE_MASK | PT_USER_MASK;
1012 }
1013 table_addr = table[index] & PT64_BASE_ADDR_MASK;
1014 }
1015}
1016
1017static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
1018{
1019 int r;
1020
1021 struct page *page;
1022
1023 down_read(&current->mm->mmap_sem);
1024 page = gfn_to_page(vcpu->kvm, gfn);
1025
1026 spin_lock(&vcpu->kvm->mmu_lock);
1027 kvm_mmu_free_some_pages(vcpu);
1028 r = __nonpaging_map(vcpu, v, write, gfn, page);
1029 spin_unlock(&vcpu->kvm->mmu_lock);
1030
1031 up_read(&current->mm->mmap_sem);
1032
1033 return r;
1034}
1035
1036
1037static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
1038 struct kvm_mmu_page *sp)
1039{
1040 int i;
1041
1042 for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
1043 sp->spt[i] = shadow_trap_nonpresent_pte;
1044}
1045
1046static void mmu_free_roots(struct kvm_vcpu *vcpu)
1047{
1048 int i;
1049 struct kvm_mmu_page *sp;
1050
1051 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
1052 return;
1053 spin_lock(&vcpu->kvm->mmu_lock);
1054#ifdef CONFIG_X86_64
1055 if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
1056 hpa_t root = vcpu->arch.mmu.root_hpa;
1057
1058 sp = page_header(root);
1059 --sp->root_count;
1060 vcpu->arch.mmu.root_hpa = INVALID_PAGE;
1061 spin_unlock(&vcpu->kvm->mmu_lock);
1062 return;
1063 }
1064#endif
1065 for (i = 0; i < 4; ++i) {
1066 hpa_t root = vcpu->arch.mmu.pae_root[i];
1067
1068 if (root) {
1069 root &= PT64_BASE_ADDR_MASK;
1070 sp = page_header(root);
1071 --sp->root_count;
1072 }
1073 vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
1074 }
1075 spin_unlock(&vcpu->kvm->mmu_lock);
1076 vcpu->arch.mmu.root_hpa = INVALID_PAGE;
1077}
1078
1079static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
1080{
1081 int i;
1082 gfn_t root_gfn;
1083 struct kvm_mmu_page *sp;
1084
1085 root_gfn = vcpu->arch.cr3 >> PAGE_SHIFT;
1086
1087#ifdef CONFIG_X86_64
1088 if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
1089 hpa_t root = vcpu->arch.mmu.root_hpa;
1090
1091 ASSERT(!VALID_PAGE(root));
1092 sp = kvm_mmu_get_page(vcpu, root_gfn, 0,
1093 PT64_ROOT_LEVEL, 0, ACC_ALL, NULL, NULL);
1094 root = __pa(sp->spt);
1095 ++sp->root_count;
1096 vcpu->arch.mmu.root_hpa = root;
1097 return;
1098 }
1099#endif
1100 for (i = 0; i < 4; ++i) {
1101 hpa_t root = vcpu->arch.mmu.pae_root[i];
1102
1103 ASSERT(!VALID_PAGE(root));
1104 if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) {
1105 if (!is_present_pte(vcpu->arch.pdptrs[i])) {
1106 vcpu->arch.mmu.pae_root[i] = 0;
1107 continue;
1108 }
1109 root_gfn = vcpu->arch.pdptrs[i] >> PAGE_SHIFT;
1110 } else if (vcpu->arch.mmu.root_level == 0)
1111 root_gfn = 0;
1112 sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
1113 PT32_ROOT_LEVEL, !is_paging(vcpu),
1114 ACC_ALL, NULL, NULL);
1115 root = __pa(sp->spt);
1116 ++sp->root_count;
1117 vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK;
1118 }
1119 vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
1120}
1121
1122static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr)
1123{
1124 return vaddr;
1125}
1126
1127static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
1128 u32 error_code)
1129{
1130 gfn_t gfn;
1131 int r;
1132
1133 pgprintk("%s: gva %lx error %x\n", __FUNCTION__, gva, error_code);
1134 r = mmu_topup_memory_caches(vcpu);
1135 if (r)
1136 return r;
1137
1138 ASSERT(vcpu);
1139 ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
1140
1141 gfn = gva >> PAGE_SHIFT;
1142
1143 return nonpaging_map(vcpu, gva & PAGE_MASK,
1144 error_code & PFERR_WRITE_MASK, gfn);
1145}
1146
1147static void nonpaging_free(struct kvm_vcpu *vcpu)
1148{
1149 mmu_free_roots(vcpu);
1150}
1151
1152static int nonpaging_init_context(struct kvm_vcpu *vcpu)
1153{
1154 struct kvm_mmu *context = &vcpu->arch.mmu;
1155
1156 context->new_cr3 = nonpaging_new_cr3;
1157 context->page_fault = nonpaging_page_fault;
1158 context->gva_to_gpa = nonpaging_gva_to_gpa;
1159 context->free = nonpaging_free;
1160 context->prefetch_page = nonpaging_prefetch_page;
1161 context->root_level = 0;
1162 context->shadow_root_level = PT32E_ROOT_LEVEL;
1163 context->root_hpa = INVALID_PAGE;
1164 return 0;
1165}
1166
1167void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
1168{
1169 ++vcpu->stat.tlb_flush;
1170 kvm_x86_ops->tlb_flush(vcpu);
1171}
1172
1173static void paging_new_cr3(struct kvm_vcpu *vcpu)
1174{
1175 pgprintk("%s: cr3 %lx\n", __FUNCTION__, vcpu->cr3);
1176 mmu_free_roots(vcpu);
1177}
1178
1179static void inject_page_fault(struct kvm_vcpu *vcpu,
1180 u64 addr,
1181 u32 err_code)
1182{
1183 kvm_inject_page_fault(vcpu, addr, err_code);
1184}
1185
1186static void paging_free(struct kvm_vcpu *vcpu)
1187{
1188 nonpaging_free(vcpu);
1189}
1190
1191#define PTTYPE 64
1192#include "paging_tmpl.h"
1193#undef PTTYPE
1194
1195#define PTTYPE 32
1196#include "paging_tmpl.h"
1197#undef PTTYPE
1198
1199static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level)
1200{
1201 struct kvm_mmu *context = &vcpu->arch.mmu;
1202
1203 ASSERT(is_pae(vcpu));
1204 context->new_cr3 = paging_new_cr3;
1205 context->page_fault = paging64_page_fault;
1206 context->gva_to_gpa = paging64_gva_to_gpa;
1207 context->prefetch_page = paging64_prefetch_page;
1208 context->free = paging_free;
1209 context->root_level = level;
1210 context->shadow_root_level = level;
1211 context->root_hpa = INVALID_PAGE;
1212 return 0;
1213}
1214
1215static int paging64_init_context(struct kvm_vcpu *vcpu)
1216{
1217 return paging64_init_context_common(vcpu, PT64_ROOT_LEVEL);
1218}
1219
1220static int paging32_init_context(struct kvm_vcpu *vcpu)
1221{
1222 struct kvm_mmu *context = &vcpu->arch.mmu;
1223
1224 context->new_cr3 = paging_new_cr3;
1225 context->page_fault = paging32_page_fault;
1226 context->gva_to_gpa = paging32_gva_to_gpa;
1227 context->free = paging_free;
1228 context->prefetch_page = paging32_prefetch_page;
1229 context->root_level = PT32_ROOT_LEVEL;
1230 context->shadow_root_level = PT32E_ROOT_LEVEL;
1231 context->root_hpa = INVALID_PAGE;
1232 return 0;
1233}
1234
1235static int paging32E_init_context(struct kvm_vcpu *vcpu)
1236{
1237 return paging64_init_context_common(vcpu, PT32E_ROOT_LEVEL);
1238}
1239
1240static int init_kvm_mmu(struct kvm_vcpu *vcpu)
1241{
1242 ASSERT(vcpu);
1243 ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
1244
1245 if (!is_paging(vcpu))
1246 return nonpaging_init_context(vcpu);
1247 else if (is_long_mode(vcpu))
1248 return paging64_init_context(vcpu);
1249 else if (is_pae(vcpu))
1250 return paging32E_init_context(vcpu);
1251 else
1252 return paging32_init_context(vcpu);
1253}
1254
1255static void destroy_kvm_mmu(struct kvm_vcpu *vcpu)
1256{
1257 ASSERT(vcpu);
1258 if (VALID_PAGE(vcpu->arch.mmu.root_hpa)) {
1259 vcpu->arch.mmu.free(vcpu);
1260 vcpu->arch.mmu.root_hpa = INVALID_PAGE;
1261 }
1262}
1263
1264int kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
1265{
1266 destroy_kvm_mmu(vcpu);
1267 return init_kvm_mmu(vcpu);
1268}
1269EXPORT_SYMBOL_GPL(kvm_mmu_reset_context);
1270
1271int kvm_mmu_load(struct kvm_vcpu *vcpu)
1272{
1273 int r;
1274
1275 r = mmu_topup_memory_caches(vcpu);
1276 if (r)
1277 goto out;
1278 spin_lock(&vcpu->kvm->mmu_lock);
1279 kvm_mmu_free_some_pages(vcpu);
1280 mmu_alloc_roots(vcpu);
1281 spin_unlock(&vcpu->kvm->mmu_lock);
1282 kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa);
1283 kvm_mmu_flush_tlb(vcpu);
1284out:
1285 return r;
1286}
1287EXPORT_SYMBOL_GPL(kvm_mmu_load);
1288
1289void kvm_mmu_unload(struct kvm_vcpu *vcpu)
1290{
1291 mmu_free_roots(vcpu);
1292}
1293
1294static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
1295 struct kvm_mmu_page *sp,
1296 u64 *spte)
1297{
1298 u64 pte;
1299 struct kvm_mmu_page *child;
1300
1301 pte = *spte;
1302 if (is_shadow_present_pte(pte)) {
1303 if (sp->role.level == PT_PAGE_TABLE_LEVEL)
1304 rmap_remove(vcpu->kvm, spte);
1305 else {
1306 child = page_header(pte & PT64_BASE_ADDR_MASK);
1307 mmu_page_remove_parent_pte(child, spte);
1308 }
1309 }
1310 set_shadow_pte(spte, shadow_trap_nonpresent_pte);
1311}
1312
1313static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
1314 struct kvm_mmu_page *sp,
1315 u64 *spte,
1316 const void *new, int bytes,
1317 int offset_in_pte)
1318{
1319 if (sp->role.level != PT_PAGE_TABLE_LEVEL) {
1320 ++vcpu->kvm->stat.mmu_pde_zapped;
1321 return;
1322 }
1323
1324 ++vcpu->kvm->stat.mmu_pte_updated;
1325 if (sp->role.glevels == PT32_ROOT_LEVEL)
1326 paging32_update_pte(vcpu, sp, spte, new, bytes, offset_in_pte);
1327 else
1328 paging64_update_pte(vcpu, sp, spte, new, bytes, offset_in_pte);
1329}
1330
1331static bool need_remote_flush(u64 old, u64 new)
1332{
1333 if (!is_shadow_present_pte(old))
1334 return false;
1335 if (!is_shadow_present_pte(new))
1336 return true;
1337 if ((old ^ new) & PT64_BASE_ADDR_MASK)
1338 return true;
1339 old ^= PT64_NX_MASK;
1340 new ^= PT64_NX_MASK;
1341 return (old & ~new & PT64_PERM_MASK) != 0;
1342}
1343
1344static void mmu_pte_write_flush_tlb(struct kvm_vcpu *vcpu, u64 old, u64 new)
1345{
1346 if (need_remote_flush(old, new))
1347 kvm_flush_remote_tlbs(vcpu->kvm);
1348 else
1349 kvm_mmu_flush_tlb(vcpu);
1350}
1351
1352static bool last_updated_pte_accessed(struct kvm_vcpu *vcpu)
1353{
1354 u64 *spte = vcpu->arch.last_pte_updated;
1355
1356 return !!(spte && (*spte & PT_ACCESSED_MASK));
1357}
1358
1359static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
1360 const u8 *new, int bytes)
1361{
1362 gfn_t gfn;
1363 int r;
1364 u64 gpte = 0;
1365
1366 if (bytes != 4 && bytes != 8)
1367 return;
1368
1369 /*
1370	 * Assume that the pte write is on a page table of the same type
1371	 * as the current vcpu's paging mode. This is nearly always true
1372	 * (it might be false while changing modes). Note that it is verified later
1373 * by update_pte().
1374 */
1375 if (is_pae(vcpu)) {
1376 /* Handle a 32-bit guest writing two halves of a 64-bit gpte */
1377 if ((bytes == 4) && (gpa % 4 == 0)) {
1378 r = kvm_read_guest(vcpu->kvm, gpa & ~(u64)7, &gpte, 8);
1379 if (r)
1380 return;
1381 memcpy((void *)&gpte + (gpa % 8), new, 4);
1382 } else if ((bytes == 8) && (gpa % 8 == 0)) {
1383 memcpy((void *)&gpte, new, 8);
1384 }
1385 } else {
1386 if ((bytes == 4) && (gpa % 4 == 0))
1387 memcpy((void *)&gpte, new, 4);
1388 }
1389 if (!is_present_pte(gpte))
1390 return;
1391 gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
1392 vcpu->arch.update_pte.gfn = gfn;
1393 vcpu->arch.update_pte.page = gfn_to_page(vcpu->kvm, gfn);
1394}
1395
1396void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
1397 const u8 *new, int bytes)
1398{
1399 gfn_t gfn = gpa >> PAGE_SHIFT;
1400 struct kvm_mmu_page *sp;
1401 struct hlist_node *node, *n;
1402 struct hlist_head *bucket;
1403 unsigned index;
1404 u64 entry;
1405 u64 *spte;
1406 unsigned offset = offset_in_page(gpa);
1407 unsigned pte_size;
1408 unsigned page_offset;
1409 unsigned misaligned;
1410 unsigned quadrant;
1411 int level;
1412 int flooded = 0;
1413 int npte;
1414
1415 pgprintk("%s: gpa %llx bytes %d\n", __FUNCTION__, gpa, bytes);
1416 mmu_guess_page_from_pte_write(vcpu, gpa, new, bytes);
1417 spin_lock(&vcpu->kvm->mmu_lock);
1418 kvm_mmu_free_some_pages(vcpu);
1419 ++vcpu->kvm->stat.mmu_pte_write;
1420 kvm_mmu_audit(vcpu, "pre pte write");
1421 if (gfn == vcpu->arch.last_pt_write_gfn
1422 && !last_updated_pte_accessed(vcpu)) {
1423 ++vcpu->arch.last_pt_write_count;
1424 if (vcpu->arch.last_pt_write_count >= 3)
1425 flooded = 1;
1426 } else {
1427 vcpu->arch.last_pt_write_gfn = gfn;
1428 vcpu->arch.last_pt_write_count = 1;
1429 vcpu->arch.last_pte_updated = NULL;
1430 }
1431 index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
1432 bucket = &vcpu->kvm->arch.mmu_page_hash[index];
1433 hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) {
1434 if (sp->gfn != gfn || sp->role.metaphysical)
1435 continue;
1436 pte_size = sp->role.glevels == PT32_ROOT_LEVEL ? 4 : 8;
1437 misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
1438 misaligned |= bytes < 4;
1439 if (misaligned || flooded) {
1440 /*
1441 * Misaligned accesses are too much trouble to fix
1442 * up; also, they usually indicate a page is not used
1443 * as a page table.
1444 *
1445 * If we're seeing too many writes to a page,
1446 * it may no longer be a page table, or we may be
1447 * forking, in which case it is better to unmap the
1448 * page.
1449 */
1450 pgprintk("misaligned: gpa %llx bytes %d role %x\n",
1451 gpa, bytes, sp->role.word);
1452 kvm_mmu_zap_page(vcpu->kvm, sp);
1453 ++vcpu->kvm->stat.mmu_flooded;
1454 continue;
1455 }
1456 page_offset = offset;
1457 level = sp->role.level;
1458 npte = 1;
1459 if (sp->role.glevels == PT32_ROOT_LEVEL) {
1460 page_offset <<= 1; /* 32->64 */
1461 /*
1462 * A 32-bit pde maps 4MB while the shadow pdes map
1463 * only 2MB. So we need to double the offset again
1464 * and zap two pdes instead of one.
1465 */
1466 if (level == PT32_ROOT_LEVEL) {
1467 page_offset &= ~7; /* kill rounding error */
1468 page_offset <<= 1;
1469 npte = 2;
1470 }
1471 quadrant = page_offset >> PAGE_SHIFT;
1472 page_offset &= ~PAGE_MASK;
1473 if (quadrant != sp->role.quadrant)
1474 continue;
1475 }
1476 spte = &sp->spt[page_offset / sizeof(*spte)];
1477 while (npte--) {
1478 entry = *spte;
1479 mmu_pte_write_zap_pte(vcpu, sp, spte);
1480 mmu_pte_write_new_pte(vcpu, sp, spte, new, bytes,
1481 page_offset & (pte_size - 1));
1482 mmu_pte_write_flush_tlb(vcpu, entry, *spte);
1483 ++spte;
1484 }
1485 }
1486 kvm_mmu_audit(vcpu, "post pte write");
1487 spin_unlock(&vcpu->kvm->mmu_lock);
1488 if (vcpu->arch.update_pte.page) {
1489 kvm_release_page_clean(vcpu->arch.update_pte.page);
1490 vcpu->arch.update_pte.page = NULL;
1491 }
1492}
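
The misalignment test in kvm_mmu_pte_write(), (offset ^ (offset + bytes - 1)) & ~(pte_size - 1), is non-zero exactly when the first and last byte of the write fall into different pte-sized slots. A quick stand-alone check of that expression (nothing here is KVM-specific):

#include <stdio.h>

/* Non-zero when a write of 'bytes' at 'offset' straddles a pte boundary. */
static unsigned int misaligned(unsigned int offset, unsigned int bytes,
			       unsigned int pte_size)
{
	return (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
}

int main(void)
{
	printf("%u\n", misaligned(0, 8, 8));    /* 0: aligned 64-bit pte write     */
	printf("%u\n", misaligned(4, 8, 8));    /* non-zero: spans two 64-bit ptes */
	printf("%u\n", misaligned(12, 4, 4));   /* 0: aligned 32-bit pte write     */
	return 0;
}
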
1493
1494int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
1495{
1496 gpa_t gpa;
1497 int r;
1498
1499 down_read(&current->mm->mmap_sem);
1500 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva);
1501 up_read(&current->mm->mmap_sem);
1502
1503 spin_lock(&vcpu->kvm->mmu_lock);
1504 r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
1505 spin_unlock(&vcpu->kvm->mmu_lock);
1506 return r;
1507}
1508
1509void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
1510{
1511 while (vcpu->kvm->arch.n_free_mmu_pages < KVM_REFILL_PAGES) {
1512 struct kvm_mmu_page *sp;
1513
1514 sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev,
1515 struct kvm_mmu_page, link);
1516 kvm_mmu_zap_page(vcpu->kvm, sp);
1517 ++vcpu->kvm->stat.mmu_recycled;
1518 }
1519}
1520
1521int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
1522{
1523 int r;
1524 enum emulation_result er;
1525
1526 r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code);
1527 if (r < 0)
1528 goto out;
1529
1530 if (!r) {
1531 r = 1;
1532 goto out;
1533 }
1534
1535 r = mmu_topup_memory_caches(vcpu);
1536 if (r)
1537 goto out;
1538
1539 er = emulate_instruction(vcpu, vcpu->run, cr2, error_code, 0);
1540
1541 switch (er) {
1542 case EMULATE_DONE:
1543 return 1;
1544 case EMULATE_DO_MMIO:
1545 ++vcpu->stat.mmio_exits;
1546 return 0;
1547 case EMULATE_FAIL:
1548 kvm_report_emulation_failure(vcpu, "pagetable");
1549 return 1;
1550 default:
1551 BUG();
1552 }
1553out:
1554 return r;
1555}
1556EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
1557
1558static void free_mmu_pages(struct kvm_vcpu *vcpu)
1559{
1560 struct kvm_mmu_page *sp;
1561
1562 while (!list_empty(&vcpu->kvm->arch.active_mmu_pages)) {
1563 sp = container_of(vcpu->kvm->arch.active_mmu_pages.next,
1564 struct kvm_mmu_page, link);
1565 kvm_mmu_zap_page(vcpu->kvm, sp);
1566 }
1567 free_page((unsigned long)vcpu->arch.mmu.pae_root);
1568}
1569
1570static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
1571{
1572 struct page *page;
1573 int i;
1574
1575 ASSERT(vcpu);
1576
1577 if (vcpu->kvm->arch.n_requested_mmu_pages)
1578 vcpu->kvm->arch.n_free_mmu_pages =
1579 vcpu->kvm->arch.n_requested_mmu_pages;
1580 else
1581 vcpu->kvm->arch.n_free_mmu_pages =
1582 vcpu->kvm->arch.n_alloc_mmu_pages;
1583 /*
1584 * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64.
1585 * Therefore we need to allocate shadow page tables in the first
1586 * 4GB of memory, which happens to fit the DMA32 zone.
1587 */
1588 page = alloc_page(GFP_KERNEL | __GFP_DMA32);
1589 if (!page)
1590 goto error_1;
1591 vcpu->arch.mmu.pae_root = page_address(page);
1592 for (i = 0; i < 4; ++i)
1593 vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
1594
1595 return 0;
1596
1597error_1:
1598 free_mmu_pages(vcpu);
1599 return -ENOMEM;
1600}
1601
1602int kvm_mmu_create(struct kvm_vcpu *vcpu)
1603{
1604 ASSERT(vcpu);
1605 ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
1606
1607 return alloc_mmu_pages(vcpu);
1608}
1609
1610int kvm_mmu_setup(struct kvm_vcpu *vcpu)
1611{
1612 ASSERT(vcpu);
1613 ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
1614
1615 return init_kvm_mmu(vcpu);
1616}
1617
1618void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
1619{
1620 ASSERT(vcpu);
1621
1622 destroy_kvm_mmu(vcpu);
1623 free_mmu_pages(vcpu);
1624 mmu_free_memory_caches(vcpu);
1625}
1626
1627void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
1628{
1629 struct kvm_mmu_page *sp;
1630
1631 list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) {
1632 int i;
1633 u64 *pt;
1634
1635 if (!test_bit(slot, &sp->slot_bitmap))
1636 continue;
1637
1638 pt = sp->spt;
1639 for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
1640 /* avoid RMW */
1641 if (pt[i] & PT_WRITABLE_MASK)
1642 pt[i] &= ~PT_WRITABLE_MASK;
1643 }
1644}
1645
1646void kvm_mmu_zap_all(struct kvm *kvm)
1647{
1648 struct kvm_mmu_page *sp, *node;
1649
1650 spin_lock(&kvm->mmu_lock);
1651 list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link)
1652 kvm_mmu_zap_page(kvm, sp);
1653 spin_unlock(&kvm->mmu_lock);
1654
1655 kvm_flush_remote_tlbs(kvm);
1656}
1657
1658void kvm_mmu_module_exit(void)
1659{
1660 if (pte_chain_cache)
1661 kmem_cache_destroy(pte_chain_cache);
1662 if (rmap_desc_cache)
1663 kmem_cache_destroy(rmap_desc_cache);
1664 if (mmu_page_header_cache)
1665 kmem_cache_destroy(mmu_page_header_cache);
1666}
1667
1668int kvm_mmu_module_init(void)
1669{
1670 pte_chain_cache = kmem_cache_create("kvm_pte_chain",
1671 sizeof(struct kvm_pte_chain),
1672 0, 0, NULL);
1673 if (!pte_chain_cache)
1674 goto nomem;
1675 rmap_desc_cache = kmem_cache_create("kvm_rmap_desc",
1676 sizeof(struct kvm_rmap_desc),
1677 0, 0, NULL);
1678 if (!rmap_desc_cache)
1679 goto nomem;
1680
1681 mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header",
1682 sizeof(struct kvm_mmu_page),
1683 0, 0, NULL);
1684 if (!mmu_page_header_cache)
1685 goto nomem;
1686
1687 return 0;
1688
1689nomem:
1690 kvm_mmu_module_exit();
1691 return -ENOMEM;
1692}
1693
1694/*
1695 * Calculate mmu pages needed for kvm.
1696 */
1697unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm)
1698{
1699 int i;
1700 unsigned int nr_mmu_pages;
1701 unsigned int nr_pages = 0;
1702
1703 for (i = 0; i < kvm->nmemslots; i++)
1704 nr_pages += kvm->memslots[i].npages;
1705
1706 nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000;
1707 nr_mmu_pages = max(nr_mmu_pages,
1708 (unsigned int) KVM_MIN_ALLOC_MMU_PAGES);
1709
1710 return nr_mmu_pages;
1711}
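
kvm_mmu_calculate_mmu_pages() sizes the shadow page budget as a per-mille fraction of guest memory, clamped to a minimum. The real KVM_PERMILLE_MMU_PAGES and KVM_MIN_ALLOC_MMU_PAGES constants live in the KVM headers rather than in this file; with assumed values of 20 per mille and a 64-page floor, the arithmetic looks like:

#include <stdio.h>

/* Assumed values for illustration only; the real constants are in the headers. */
#define PERMILLE_MMU_PAGES   20u
#define MIN_ALLOC_MMU_PAGES  64u

static unsigned int calculate_mmu_pages(unsigned long long guest_pages)
{
	unsigned int nr = guest_pages * PERMILLE_MMU_PAGES / 1000;

	return nr > MIN_ALLOC_MMU_PAGES ? nr : MIN_ALLOC_MMU_PAGES;
}

int main(void)
{
	/* 1 GiB guest = 262144 4 KiB pages -> 2% of that, i.e. 5242 shadow pages */
	printf("%u\n", calculate_mmu_pages(262144));
	/* a tiny guest is clamped to the floor */
	printf("%u\n", calculate_mmu_pages(1024));
	return 0;
}
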
1712
1713#ifdef AUDIT
1714
1715static const char *audit_msg;
1716
1717static gva_t canonicalize(gva_t gva)
1718{
1719#ifdef CONFIG_X86_64
1720 gva = (long long)(gva << 16) >> 16;
1721#endif
1722 return gva;
1723}
1724
1725static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
1726 gva_t va, int level)
1727{
1728 u64 *pt = __va(page_pte & PT64_BASE_ADDR_MASK);
1729 int i;
1730 gva_t va_delta = 1ul << (PAGE_SHIFT + 9 * (level - 1));
1731
1732 for (i = 0; i < PT64_ENT_PER_PAGE; ++i, va += va_delta) {
1733 u64 ent = pt[i];
1734
1735 if (ent == shadow_trap_nonpresent_pte)
1736 continue;
1737
1738 va = canonicalize(va);
1739 if (level > 1) {
1740 if (ent == shadow_notrap_nonpresent_pte)
1741 printk(KERN_ERR "audit: (%s) nontrapping pte"
1742 " in nonleaf level: levels %d gva %lx"
1743 " level %d pte %llx\n", audit_msg,
1744 vcpu->arch.mmu.root_level, va, level, ent);
1745
1746 audit_mappings_page(vcpu, ent, va, level - 1);
1747 } else {
1748 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, va);
1749 struct page *page = gpa_to_page(vcpu, gpa);
1750 hpa_t hpa = page_to_phys(page);
1751
1752 if (is_shadow_present_pte(ent)
1753 && (ent & PT64_BASE_ADDR_MASK) != hpa)
1754 printk(KERN_ERR "xx audit error: (%s) levels %d"
1755 " gva %lx gpa %llx hpa %llx ent %llx %d\n",
1756 audit_msg, vcpu->arch.mmu.root_level,
1757 va, gpa, hpa, ent,
1758 is_shadow_present_pte(ent));
1759 else if (ent == shadow_notrap_nonpresent_pte
1760 && !is_error_hpa(hpa))
1761 printk(KERN_ERR "audit: (%s) notrap shadow,"
1762 " valid guest gva %lx\n", audit_msg, va);
1763 kvm_release_page_clean(page);
1764
1765 }
1766 }
1767}
1768
1769static void audit_mappings(struct kvm_vcpu *vcpu)
1770{
1771 unsigned i;
1772
1773 if (vcpu->arch.mmu.root_level == 4)
1774 audit_mappings_page(vcpu, vcpu->arch.mmu.root_hpa, 0, 4);
1775 else
1776 for (i = 0; i < 4; ++i)
1777 if (vcpu->arch.mmu.pae_root[i] & PT_PRESENT_MASK)
1778 audit_mappings_page(vcpu,
1779 vcpu->arch.mmu.pae_root[i],
1780 i << 30,
1781 2);
1782}
1783
1784static int count_rmaps(struct kvm_vcpu *vcpu)
1785{
1786 int nmaps = 0;
1787 int i, j, k;
1788
1789 for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
1790 struct kvm_memory_slot *m = &vcpu->kvm->memslots[i];
1791 struct kvm_rmap_desc *d;
1792
1793 for (j = 0; j < m->npages; ++j) {
1794 unsigned long *rmapp = &m->rmap[j];
1795
1796 if (!*rmapp)
1797 continue;
1798 if (!(*rmapp & 1)) {
1799 ++nmaps;
1800 continue;
1801 }
1802 d = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
1803 while (d) {
1804 for (k = 0; k < RMAP_EXT; ++k)
1805 if (d->shadow_ptes[k])
1806 ++nmaps;
1807 else
1808 break;
1809 d = d->more;
1810 }
1811 }
1812 }
1813 return nmaps;
1814}
1815
1816static int count_writable_mappings(struct kvm_vcpu *vcpu)
1817{
1818 int nmaps = 0;
1819 struct kvm_mmu_page *sp;
1820 int i;
1821
1822 list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) {
1823 u64 *pt = sp->spt;
1824
1825 if (sp->role.level != PT_PAGE_TABLE_LEVEL)
1826 continue;
1827
1828 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
1829 u64 ent = pt[i];
1830
1831 if (!(ent & PT_PRESENT_MASK))
1832 continue;
1833 if (!(ent & PT_WRITABLE_MASK))
1834 continue;
1835 ++nmaps;
1836 }
1837 }
1838 return nmaps;
1839}
1840
1841static void audit_rmap(struct kvm_vcpu *vcpu)
1842{
1843 int n_rmap = count_rmaps(vcpu);
1844 int n_actual = count_writable_mappings(vcpu);
1845
1846 if (n_rmap != n_actual)
1847 printk(KERN_ERR "%s: (%s) rmap %d actual %d\n",
1848 __FUNCTION__, audit_msg, n_rmap, n_actual);
1849}
1850
1851static void audit_write_protection(struct kvm_vcpu *vcpu)
1852{
1853 struct kvm_mmu_page *sp;
1854 struct kvm_memory_slot *slot;
1855 unsigned long *rmapp;
1856 gfn_t gfn;
1857
1858 list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) {
1859 if (sp->role.metaphysical)
1860 continue;
1861
1862 slot = gfn_to_memslot(vcpu->kvm, sp->gfn);
1863 gfn = unalias_gfn(vcpu->kvm, sp->gfn);
1864 rmapp = &slot->rmap[gfn - slot->base_gfn];
1865 if (*rmapp)
1866 printk(KERN_ERR "%s: (%s) shadow page has writable"
1867 " mappings: gfn %lx role %x\n",
1868 __FUNCTION__, audit_msg, sp->gfn,
1869 sp->role.word);
1870 }
1871}
1872
1873static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg)
1874{
1875 int olddbg = dbg;
1876
1877 dbg = 0;
1878 audit_msg = msg;
1879 audit_rmap(vcpu);
1880 audit_write_protection(vcpu);
1881 audit_mappings(vcpu);
1882 dbg = olddbg;
1883}
1884
1885#endif
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
new file mode 100644
index 000000000000..1fce19ec7a23
--- /dev/null
+++ b/arch/x86/kvm/mmu.h
@@ -0,0 +1,44 @@
1#ifndef __KVM_X86_MMU_H
2#define __KVM_X86_MMU_H
3
4#include <linux/kvm_host.h>
5
6static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
7{
8 if (unlikely(vcpu->kvm->arch.n_free_mmu_pages < KVM_MIN_FREE_MMU_PAGES))
9 __kvm_mmu_free_some_pages(vcpu);
10}
11
12static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu)
13{
14 if (likely(vcpu->arch.mmu.root_hpa != INVALID_PAGE))
15 return 0;
16
17 return kvm_mmu_load(vcpu);
18}
19
20static inline int is_long_mode(struct kvm_vcpu *vcpu)
21{
22#ifdef CONFIG_X86_64
23 return vcpu->arch.shadow_efer & EFER_LME;
24#else
25 return 0;
26#endif
27}
28
29static inline int is_pae(struct kvm_vcpu *vcpu)
30{
31 return vcpu->arch.cr4 & X86_CR4_PAE;
32}
33
34static inline int is_pse(struct kvm_vcpu *vcpu)
35{
36 return vcpu->arch.cr4 & X86_CR4_PSE;
37}
38
39static inline int is_paging(struct kvm_vcpu *vcpu)
40{
41 return vcpu->arch.cr0 & X86_CR0_PG;
42}
43
44#endif
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
new file mode 100644
index 000000000000..03ba8608fe0f
--- /dev/null
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -0,0 +1,484 @@
1/*
2 * Kernel-based Virtual Machine driver for Linux
3 *
4 * This module enables machines with Intel VT-x extensions to run virtual
5 * machines without emulation or binary translation.
6 *
7 * MMU support
8 *
9 * Copyright (C) 2006 Qumranet, Inc.
10 *
11 * Authors:
12 * Yaniv Kamay <yaniv@qumranet.com>
13 * Avi Kivity <avi@qumranet.com>
14 *
15 * This work is licensed under the terms of the GNU GPL, version 2. See
16 * the COPYING file in the top-level directory.
17 *
18 */
19
20/*
21 * We need the mmu code to access both 32-bit and 64-bit guest ptes,
22 * so the code in this file is compiled twice, once per pte size.
23 */
24
25#if PTTYPE == 64
26 #define pt_element_t u64
27 #define guest_walker guest_walker64
28 #define FNAME(name) paging##64_##name
29 #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK
30 #define PT_DIR_BASE_ADDR_MASK PT64_DIR_BASE_ADDR_MASK
31 #define PT_INDEX(addr, level) PT64_INDEX(addr, level)
32 #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
33 #define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level)
34 #define PT_LEVEL_BITS PT64_LEVEL_BITS
35 #ifdef CONFIG_X86_64
36 #define PT_MAX_FULL_LEVELS 4
37 #define CMPXCHG cmpxchg
38 #else
39 #define CMPXCHG cmpxchg64
40 #define PT_MAX_FULL_LEVELS 2
41 #endif
42#elif PTTYPE == 32
43 #define pt_element_t u32
44 #define guest_walker guest_walker32
45 #define FNAME(name) paging##32_##name
46 #define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK
47 #define PT_DIR_BASE_ADDR_MASK PT32_DIR_BASE_ADDR_MASK
48 #define PT_INDEX(addr, level) PT32_INDEX(addr, level)
49 #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
50 #define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level)
51 #define PT_LEVEL_BITS PT32_LEVEL_BITS
52 #define PT_MAX_FULL_LEVELS 2
53 #define CMPXCHG cmpxchg
54#else
55 #error Invalid PTTYPE value
56#endif
57
58#define gpte_to_gfn FNAME(gpte_to_gfn)
59#define gpte_to_gfn_pde FNAME(gpte_to_gfn_pde)
60
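
Because this header is compiled twice, once per PTTYPE, the FNAME() token-pasting macro is what keeps the two instantiations apart: every function defined below ends up with a paging64_ or paging32_ prefix. A stand-alone illustration of the same trick (editor's sketch, not KVM code):

#include <stdio.h>

/* The same token-pasting trick as FNAME(): one body, two prefixed symbols. */
#define FNAME(name) paging##64_##name
static const char *FNAME(whoami)(void) { return "paging64_whoami"; }
#undef FNAME

#define FNAME(name) paging##32_##name
static const char *FNAME(whoami)(void) { return "paging32_whoami"; }
#undef FNAME

int main(void)
{
	printf("%s\n%s\n", paging64_whoami(), paging32_whoami());
	return 0;
}
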
61/*
62 * The guest_walker structure emulates the behavior of the hardware page
63 * table walker.
64 */
65struct guest_walker {
66 int level;
67 gfn_t table_gfn[PT_MAX_FULL_LEVELS];
68 pt_element_t ptes[PT_MAX_FULL_LEVELS];
69 gpa_t pte_gpa[PT_MAX_FULL_LEVELS];
70 unsigned pt_access;
71 unsigned pte_access;
72 gfn_t gfn;
73 u32 error_code;
74};
75
76static gfn_t gpte_to_gfn(pt_element_t gpte)
77{
78 return (gpte & PT_BASE_ADDR_MASK) >> PAGE_SHIFT;
79}
80
81static gfn_t gpte_to_gfn_pde(pt_element_t gpte)
82{
83 return (gpte & PT_DIR_BASE_ADDR_MASK) >> PAGE_SHIFT;
84}
85
86static bool FNAME(cmpxchg_gpte)(struct kvm *kvm,
87 gfn_t table_gfn, unsigned index,
88 pt_element_t orig_pte, pt_element_t new_pte)
89{
90 pt_element_t ret;
91 pt_element_t *table;
92 struct page *page;
93
94 page = gfn_to_page(kvm, table_gfn);
95 table = kmap_atomic(page, KM_USER0);
96
97 ret = CMPXCHG(&table[index], orig_pte, new_pte);
98
99 kunmap_atomic(table, KM_USER0);
100
101 kvm_release_page_dirty(page);
102
103 return (ret != orig_pte);
104}
105
106static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte)
107{
108 unsigned access;
109
110 access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK;
111#if PTTYPE == 64
112 if (is_nx(vcpu))
113 access &= ~(gpte >> PT64_NX_SHIFT);
114#endif
115 return access;
116}
117
118/*
119 * Fetch a guest pte for a guest virtual address
120 */
121static int FNAME(walk_addr)(struct guest_walker *walker,
122 struct kvm_vcpu *vcpu, gva_t addr,
123 int write_fault, int user_fault, int fetch_fault)
124{
125 pt_element_t pte;
126 gfn_t table_gfn;
127 unsigned index, pt_access, pte_access;
128 gpa_t pte_gpa;
129
130 pgprintk("%s: addr %lx\n", __FUNCTION__, addr);
131walk:
132 walker->level = vcpu->arch.mmu.root_level;
133 pte = vcpu->arch.cr3;
134#if PTTYPE == 64
135 if (!is_long_mode(vcpu)) {
136 pte = vcpu->arch.pdptrs[(addr >> 30) & 3];
137 if (!is_present_pte(pte))
138 goto not_present;
139 --walker->level;
140 }
141#endif
142 ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) ||
143 (vcpu->cr3 & CR3_NONPAE_RESERVED_BITS) == 0);
144
145 pt_access = ACC_ALL;
146
147 for (;;) {
148 index = PT_INDEX(addr, walker->level);
149
150 table_gfn = gpte_to_gfn(pte);
151 pte_gpa = gfn_to_gpa(table_gfn);
152 pte_gpa += index * sizeof(pt_element_t);
153 walker->table_gfn[walker->level - 1] = table_gfn;
154 walker->pte_gpa[walker->level - 1] = pte_gpa;
155 pgprintk("%s: table_gfn[%d] %lx\n", __FUNCTION__,
156 walker->level - 1, table_gfn);
157
158 kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte));
159
160 if (!is_present_pte(pte))
161 goto not_present;
162
163 if (write_fault && !is_writeble_pte(pte))
164 if (user_fault || is_write_protection(vcpu))
165 goto access_error;
166
167 if (user_fault && !(pte & PT_USER_MASK))
168 goto access_error;
169
170#if PTTYPE == 64
171 if (fetch_fault && is_nx(vcpu) && (pte & PT64_NX_MASK))
172 goto access_error;
173#endif
174
175 if (!(pte & PT_ACCESSED_MASK)) {
176 mark_page_dirty(vcpu->kvm, table_gfn);
177 if (FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn,
178 index, pte, pte|PT_ACCESSED_MASK))
179 goto walk;
180 pte |= PT_ACCESSED_MASK;
181 }
182
183 pte_access = pt_access & FNAME(gpte_access)(vcpu, pte);
184
185 walker->ptes[walker->level - 1] = pte;
186
187 if (walker->level == PT_PAGE_TABLE_LEVEL) {
188 walker->gfn = gpte_to_gfn(pte);
189 break;
190 }
191
192 if (walker->level == PT_DIRECTORY_LEVEL
193 && (pte & PT_PAGE_SIZE_MASK)
194 && (PTTYPE == 64 || is_pse(vcpu))) {
195 walker->gfn = gpte_to_gfn_pde(pte);
196 walker->gfn += PT_INDEX(addr, PT_PAGE_TABLE_LEVEL);
197 if (PTTYPE == 32 && is_cpuid_PSE36())
198 walker->gfn += pse36_gfn_delta(pte);
199 break;
200 }
201
202 pt_access = pte_access;
203 --walker->level;
204 }
205
206 if (write_fault && !is_dirty_pte(pte)) {
207 bool ret;
208
209 mark_page_dirty(vcpu->kvm, table_gfn);
210 ret = FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, index, pte,
211 pte|PT_DIRTY_MASK);
212 if (ret)
213 goto walk;
214 pte |= PT_DIRTY_MASK;
215 kvm_mmu_pte_write(vcpu, pte_gpa, (u8 *)&pte, sizeof(pte));
216 walker->ptes[walker->level - 1] = pte;
217 }
218
219 walker->pt_access = pt_access;
220 walker->pte_access = pte_access;
221 pgprintk("%s: pte %llx pte_access %x pt_access %x\n",
222 __FUNCTION__, (u64)pte, pt_access, pte_access);
223 return 1;
224
225not_present:
226 walker->error_code = 0;
227 goto err;
228
229access_error:
230 walker->error_code = PFERR_PRESENT_MASK;
231
232err:
233 if (write_fault)
234 walker->error_code |= PFERR_WRITE_MASK;
235 if (user_fault)
236 walker->error_code |= PFERR_USER_MASK;
237 if (fetch_fault)
238 walker->error_code |= PFERR_FETCH_MASK;
239 return 0;
240}
241
242static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
243 u64 *spte, const void *pte, int bytes,
244 int offset_in_pte)
245{
246 pt_element_t gpte;
247 unsigned pte_access;
248 struct page *npage;
249
250 gpte = *(const pt_element_t *)pte;
251 if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) {
252 if (!offset_in_pte && !is_present_pte(gpte))
253 set_shadow_pte(spte, shadow_notrap_nonpresent_pte);
254 return;
255 }
256 if (bytes < sizeof(pt_element_t))
257 return;
258 pgprintk("%s: gpte %llx spte %p\n", __FUNCTION__, (u64)gpte, spte);
259 pte_access = page->role.access & FNAME(gpte_access)(vcpu, gpte);
260 if (gpte_to_gfn(gpte) != vcpu->arch.update_pte.gfn)
261 return;
262 npage = vcpu->arch.update_pte.page;
263 if (!npage)
264 return;
265 get_page(npage);
266 mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0,
267 gpte & PT_DIRTY_MASK, NULL, gpte_to_gfn(gpte), npage);
268}
269
270/*
271 * Fetch a shadow pte for a specific level in the paging hierarchy.
272 */
273static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
274 struct guest_walker *walker,
275 int user_fault, int write_fault, int *ptwrite,
276 struct page *page)
277{
278 hpa_t shadow_addr;
279 int level;
280 u64 *shadow_ent;
281 unsigned access = walker->pt_access;
282
283 if (!is_present_pte(walker->ptes[walker->level - 1]))
284 return NULL;
285
286 shadow_addr = vcpu->arch.mmu.root_hpa;
287 level = vcpu->arch.mmu.shadow_root_level;
288 if (level == PT32E_ROOT_LEVEL) {
289 shadow_addr = vcpu->arch.mmu.pae_root[(addr >> 30) & 3];
290 shadow_addr &= PT64_BASE_ADDR_MASK;
291 --level;
292 }
293
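	/*
	 * Descend the shadow page table, allocating intermediate shadow
	 * pages as needed, until the leaf entry for this address is reached.
	 */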
294 for (; ; level--) {
295 u32 index = SHADOW_PT_INDEX(addr, level);
296 struct kvm_mmu_page *shadow_page;
297 u64 shadow_pte;
298 int metaphysical;
299 gfn_t table_gfn;
300 bool new_page = 0;
301
302 shadow_ent = ((u64 *)__va(shadow_addr)) + index;
303 if (level == PT_PAGE_TABLE_LEVEL)
304 break;
305 if (is_shadow_present_pte(*shadow_ent)) {
306 shadow_addr = *shadow_ent & PT64_BASE_ADDR_MASK;
307 continue;
308 }
309
310 if (level - 1 == PT_PAGE_TABLE_LEVEL
311 && walker->level == PT_DIRECTORY_LEVEL) {
312 metaphysical = 1;
313 if (!is_dirty_pte(walker->ptes[level - 1]))
314 access &= ~ACC_WRITE_MASK;
315 table_gfn = gpte_to_gfn(walker->ptes[level - 1]);
316 } else {
317 metaphysical = 0;
318 table_gfn = walker->table_gfn[level - 2];
319 }
320 shadow_page = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1,
321 metaphysical, access,
322 shadow_ent, &new_page);
323 if (new_page && !metaphysical) {
324 int r;
325 pt_element_t curr_pte;
326 r = kvm_read_guest_atomic(vcpu->kvm,
327 walker->pte_gpa[level - 2],
328 &curr_pte, sizeof(curr_pte));
329 if (r || curr_pte != walker->ptes[level - 2]) {
330 kvm_release_page_clean(page);
331 return NULL;
332 }
333 }
334 shadow_addr = __pa(shadow_page->spt);
335 shadow_pte = shadow_addr | PT_PRESENT_MASK | PT_ACCESSED_MASK
336 | PT_WRITABLE_MASK | PT_USER_MASK;
337 *shadow_ent = shadow_pte;
338 }
339
340 mmu_set_spte(vcpu, shadow_ent, access, walker->pte_access & access,
341 user_fault, write_fault,
342 walker->ptes[walker->level-1] & PT_DIRTY_MASK,
343 ptwrite, walker->gfn, page);
344
345 return shadow_ent;
346}
347
348/*
349 * Page fault handler. There are several causes for a page fault:
350 * - there is no shadow pte for the guest pte
351 * - write access through a shadow pte marked read only so that we can set
352 * the dirty bit
353 * - write access to a shadow pte marked read only so we can update the page
354 * dirty bitmap, when userspace requests it
355 * - mmio access; in this case we will never install a present shadow pte
356 * - normal guest page fault due to the guest pte marked not present, not
357 * writable, or not executable
358 *
359 * Returns: 1 if we need to emulate the instruction, 0 otherwise, or
360 * a negative value on error.
361 */
362static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
363 u32 error_code)
364{
365 int write_fault = error_code & PFERR_WRITE_MASK;
366 int user_fault = error_code & PFERR_USER_MASK;
367 int fetch_fault = error_code & PFERR_FETCH_MASK;
368 struct guest_walker walker;
369 u64 *shadow_pte;
370 int write_pt = 0;
371 int r;
372 struct page *page;
373
374 pgprintk("%s: addr %lx err %x\n", __FUNCTION__, addr, error_code);
375 kvm_mmu_audit(vcpu, "pre page fault");
376
377 r = mmu_topup_memory_caches(vcpu);
378 if (r)
379 return r;
380
381 down_read(&current->mm->mmap_sem);
382 /*
383 * Look up the shadow pte for the faulting address.
384 */
385 r = FNAME(walk_addr)(&walker, vcpu, addr, write_fault, user_fault,
386 fetch_fault);
387
388 /*
389 * The page is not mapped by the guest. Let the guest handle it.
390 */
391 if (!r) {
392 pgprintk("%s: guest page fault\n", __FUNCTION__);
393 inject_page_fault(vcpu, addr, walker.error_code);
394 vcpu->arch.last_pt_write_count = 0; /* reset fork detector */
395 up_read(&current->mm->mmap_sem);
396 return 0;
397 }
398
399 page = gfn_to_page(vcpu->kvm, walker.gfn);
400
401 spin_lock(&vcpu->kvm->mmu_lock);
402 kvm_mmu_free_some_pages(vcpu);
403 shadow_pte = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
404 &write_pt, page);
405 pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __FUNCTION__,
406 shadow_pte, *shadow_pte, write_pt);
407
408 if (!write_pt)
409 vcpu->arch.last_pt_write_count = 0; /* reset fork detector */
410
411 /*
412	 * mmio: emulate if accessible, otherwise it's a guest fault.
413 */
414 if (shadow_pte && is_io_pte(*shadow_pte)) {
415 spin_unlock(&vcpu->kvm->mmu_lock);
416 up_read(&current->mm->mmap_sem);
417 return 1;
418 }
419
420 ++vcpu->stat.pf_fixed;
421 kvm_mmu_audit(vcpu, "post page fault (fixed)");
422 spin_unlock(&vcpu->kvm->mmu_lock);
423 up_read(&current->mm->mmap_sem);
424
425 return write_pt;
426}
427
428static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr)
429{
430 struct guest_walker walker;
431 gpa_t gpa = UNMAPPED_GVA;
432 int r;
433
434 r = FNAME(walk_addr)(&walker, vcpu, vaddr, 0, 0, 0);
435
436 if (r) {
437 gpa = gfn_to_gpa(walker.gfn);
438 gpa |= vaddr & ~PAGE_MASK;
439 }
440
441 return gpa;
442}
443
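/*
 * Pre-fill a shadow page: entries whose guest pte could be read and is not
 * present become shadow_notrap_nonpresent_pte; everything else stays
 * shadow_trap_nonpresent_pte so the first access is seen by the shadow mmu.
 */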
444static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu,
445 struct kvm_mmu_page *sp)
446{
447 int i, offset = 0, r = 0;
448 pt_element_t pt;
449
450 if (sp->role.metaphysical
451 || (PTTYPE == 32 && sp->role.level > PT_PAGE_TABLE_LEVEL)) {
452 nonpaging_prefetch_page(vcpu, sp);
453 return;
454 }
455
456 if (PTTYPE == 32)
457 offset = sp->role.quadrant << PT64_LEVEL_BITS;
458
459 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
460 gpa_t pte_gpa = gfn_to_gpa(sp->gfn);
461 pte_gpa += (i+offset) * sizeof(pt_element_t);
462
463 r = kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &pt,
464 sizeof(pt_element_t));
465 if (r || is_present_pte(pt))
466 sp->spt[i] = shadow_trap_nonpresent_pte;
467 else
468 sp->spt[i] = shadow_notrap_nonpresent_pte;
469 }
470}
471
472#undef pt_element_t
473#undef guest_walker
474#undef FNAME
475#undef PT_BASE_ADDR_MASK
476#undef PT_INDEX
477#undef SHADOW_PT_INDEX
478#undef PT_LEVEL_MASK
479#undef PT_DIR_BASE_ADDR_MASK
480#undef PT_LEVEL_BITS
481#undef PT_MAX_FULL_LEVELS
482#undef gpte_to_gfn
483#undef gpte_to_gfn_pde
484#undef CMPXCHG
diff --git a/arch/x86/kvm/segment_descriptor.h b/arch/x86/kvm/segment_descriptor.h
new file mode 100644
index 000000000000..56fc4c873389
--- /dev/null
+++ b/arch/x86/kvm/segment_descriptor.h
@@ -0,0 +1,29 @@
1#ifndef __SEGMENT_DESCRIPTOR_H
2#define __SEGMENT_DESCRIPTOR_H
3
4struct segment_descriptor {
5 u16 limit_low;
6 u16 base_low;
7 u8 base_mid;
8 u8 type : 4;
9 u8 system : 1;
10 u8 dpl : 2;
11 u8 present : 1;
12 u8 limit_high : 4;
13 u8 avl : 1;
14 u8 long_mode : 1;
15 u8 default_op : 1;
16 u8 granularity : 1;
17 u8 base_high;
18} __attribute__((packed));
19
20#ifdef CONFIG_X86_64
21/* LDT or TSS descriptor in the GDT. 16 bytes. */
22struct segment_descriptor_64 {
23 struct segment_descriptor s;
24 u32 base_higher;
25 u32 pad_zero;
26};
27
28#endif
29#endif
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
new file mode 100644
index 000000000000..de755cb1431d
--- /dev/null
+++ b/arch/x86/kvm/svm.c
@@ -0,0 +1,1731 @@
1/*
2 * Kernel-based Virtual Machine driver for Linux
3 *
4 * AMD SVM support
5 *
6 * Copyright (C) 2006 Qumranet, Inc.
7 *
8 * Authors:
9 * Yaniv Kamay <yaniv@qumranet.com>
10 * Avi Kivity <avi@qumranet.com>
11 *
12 * This work is licensed under the terms of the GNU GPL, version 2. See
13 * the COPYING file in the top-level directory.
14 *
15 */
16#include <linux/kvm_host.h>
17
18#include "kvm_svm.h"
19#include "irq.h"
20#include "mmu.h"
21
22#include <linux/module.h>
23#include <linux/kernel.h>
24#include <linux/vmalloc.h>
25#include <linux/highmem.h>
26#include <linux/sched.h>
27
28#include <asm/desc.h>
29
30MODULE_AUTHOR("Qumranet");
31MODULE_LICENSE("GPL");
32
33#define IOPM_ALLOC_ORDER 2
34#define MSRPM_ALLOC_ORDER 1
35
36#define DB_VECTOR 1
37#define UD_VECTOR 6
38#define GP_VECTOR 13
39
40#define DR7_GD_MASK (1 << 13)
41#define DR6_BD_MASK (1 << 13)
42
43#define SEG_TYPE_LDT 2
44#define SEG_TYPE_BUSY_TSS16 3
45
46#define SVM_FEATURE_NPT (1 << 0)
47#define SVM_FEATURE_LBRV (1 << 1)
48#define SVM_FEATURE_SVML (1 << 2)
49
50static void kvm_reput_irq(struct vcpu_svm *svm);
51
52static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu)
53{
54 return container_of(vcpu, struct vcpu_svm, vcpu);
55}
56
57unsigned long iopm_base;
58unsigned long msrpm_base;
59
60struct kvm_ldttss_desc {
61 u16 limit0;
62 u16 base0;
63 unsigned base1 : 8, type : 5, dpl : 2, p : 1;
64 unsigned limit1 : 4, zero0 : 3, g : 1, base2 : 8;
65 u32 base3;
66 u32 zero1;
67} __attribute__((packed));
68
69struct svm_cpu_data {
70 int cpu;
71
72 u64 asid_generation;
73 u32 max_asid;
74 u32 next_asid;
75 struct kvm_ldttss_desc *tss_desc;
76
77 struct page *save_area;
78};
79
80static DEFINE_PER_CPU(struct svm_cpu_data *, svm_data);
81static uint32_t svm_features;
82
83struct svm_init_data {
84 int cpu;
85 int r;
86};
87
88static u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000};
89
90#define NUM_MSR_MAPS ARRAY_SIZE(msrpm_ranges)
91#define MSRS_RANGE_SIZE 2048
92#define MSRS_IN_RANGE (MSRS_RANGE_SIZE * 8 / 2)
93
94#define MAX_INST_SIZE 15
95
96static inline u32 svm_has(u32 feat)
97{
98 return svm_features & feat;
99}
100
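/*
 * pop_irq()/push_irq() manage the pending-interrupt bitmap used when the
 * irqchip lives in user space: irq_pending holds one bit per vector and
 * irq_summary one bit per irq_pending word.
 */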
101static inline u8 pop_irq(struct kvm_vcpu *vcpu)
102{
103 int word_index = __ffs(vcpu->arch.irq_summary);
104 int bit_index = __ffs(vcpu->arch.irq_pending[word_index]);
105 int irq = word_index * BITS_PER_LONG + bit_index;
106
107 clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]);
108 if (!vcpu->arch.irq_pending[word_index])
109 clear_bit(word_index, &vcpu->arch.irq_summary);
110 return irq;
111}
112
113static inline void push_irq(struct kvm_vcpu *vcpu, u8 irq)
114{
115 set_bit(irq, vcpu->arch.irq_pending);
116 set_bit(irq / BITS_PER_LONG, &vcpu->arch.irq_summary);
117}
118
119static inline void clgi(void)
120{
121 asm volatile (SVM_CLGI);
122}
123
124static inline void stgi(void)
125{
126 asm volatile (SVM_STGI);
127}
128
129static inline void invlpga(unsigned long addr, u32 asid)
130{
131 asm volatile (SVM_INVLPGA :: "a"(addr), "c"(asid));
132}
133
134static inline unsigned long kvm_read_cr2(void)
135{
136 unsigned long cr2;
137
138 asm volatile ("mov %%cr2, %0" : "=r" (cr2));
139 return cr2;
140}
141
142static inline void kvm_write_cr2(unsigned long val)
143{
144 asm volatile ("mov %0, %%cr2" :: "r" (val));
145}
146
147static inline unsigned long read_dr6(void)
148{
149 unsigned long dr6;
150
151 asm volatile ("mov %%dr6, %0" : "=r" (dr6));
152 return dr6;
153}
154
155static inline void write_dr6(unsigned long val)
156{
157 asm volatile ("mov %0, %%dr6" :: "r" (val));
158}
159
160static inline unsigned long read_dr7(void)
161{
162 unsigned long dr7;
163
164 asm volatile ("mov %%dr7, %0" : "=r" (dr7));
165 return dr7;
166}
167
168static inline void write_dr7(unsigned long val)
169{
170 asm volatile ("mov %0, %%dr7" :: "r" (val));
171}
172
173static inline void force_new_asid(struct kvm_vcpu *vcpu)
174{
175 to_svm(vcpu)->asid_generation--;
176}
177
178static inline void flush_guest_tlb(struct kvm_vcpu *vcpu)
179{
180 force_new_asid(vcpu);
181}
182
183static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
184{
185 if (!(efer & EFER_LMA))
186 efer &= ~EFER_LME;
187
188 to_svm(vcpu)->vmcb->save.efer = efer | MSR_EFER_SVME_MASK;
189 vcpu->arch.shadow_efer = efer;
190}
191
192static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
193 bool has_error_code, u32 error_code)
194{
195 struct vcpu_svm *svm = to_svm(vcpu);
196
197 svm->vmcb->control.event_inj = nr
198 | SVM_EVTINJ_VALID
199 | (has_error_code ? SVM_EVTINJ_VALID_ERR : 0)
200 | SVM_EVTINJ_TYPE_EXEPT;
201 svm->vmcb->control.event_inj_err = error_code;
202}
203
204static bool svm_exception_injected(struct kvm_vcpu *vcpu)
205{
206 struct vcpu_svm *svm = to_svm(vcpu);
207
208 return !(svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_VALID);
209}
210
211static int is_external_interrupt(u32 info)
212{
213 info &= SVM_EVTINJ_TYPE_MASK | SVM_EVTINJ_VALID;
214 return info == (SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR);
215}
216
217static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
218{
219 struct vcpu_svm *svm = to_svm(vcpu);
220
221 if (!svm->next_rip) {
222 printk(KERN_DEBUG "%s: NOP\n", __FUNCTION__);
223 return;
224 }
225 if (svm->next_rip - svm->vmcb->save.rip > MAX_INST_SIZE)
226 printk(KERN_ERR "%s: ip 0x%llx next 0x%llx\n",
227 __FUNCTION__,
228 svm->vmcb->save.rip,
229 svm->next_rip);
230
231 vcpu->arch.rip = svm->vmcb->save.rip = svm->next_rip;
232 svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK;
233
234 vcpu->arch.interrupt_window_open = 1;
235}
236
237static int has_svm(void)
238{
239 uint32_t eax, ebx, ecx, edx;
240
241 if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) {
242 printk(KERN_INFO "has_svm: not amd\n");
243 return 0;
244 }
245
246 cpuid(0x80000000, &eax, &ebx, &ecx, &edx);
247 if (eax < SVM_CPUID_FUNC) {
248 printk(KERN_INFO "has_svm: can't execute cpuid_8000000a\n");
249 return 0;
250 }
251
252 cpuid(0x80000001, &eax, &ebx, &ecx, &edx);
253 if (!(ecx & (1 << SVM_CPUID_FEATURE_SHIFT))) {
254 printk(KERN_DEBUG "has_svm: svm not available\n");
255 return 0;
256 }
257 return 1;
258}
259
260static void svm_hardware_disable(void *garbage)
261{
262 struct svm_cpu_data *svm_data
263 = per_cpu(svm_data, raw_smp_processor_id());
264
265 if (svm_data) {
266 uint64_t efer;
267
268 wrmsrl(MSR_VM_HSAVE_PA, 0);
269 rdmsrl(MSR_EFER, efer);
270 wrmsrl(MSR_EFER, efer & ~MSR_EFER_SVME_MASK);
271 per_cpu(svm_data, raw_smp_processor_id()) = NULL;
272 __free_page(svm_data->save_area);
273 kfree(svm_data);
274 }
275}
276
277static void svm_hardware_enable(void *garbage)
278{
279
280 struct svm_cpu_data *svm_data;
281 uint64_t efer;
282	struct desc_ptr gdt_descr;
287 struct desc_struct *gdt;
288 int me = raw_smp_processor_id();
289
290 if (!has_svm()) {
291 printk(KERN_ERR "svm_cpu_init: err EOPNOTSUPP on %d\n", me);
292 return;
293 }
294 svm_data = per_cpu(svm_data, me);
295
296 if (!svm_data) {
297 printk(KERN_ERR "svm_cpu_init: svm_data is NULL on %d\n",
298 me);
299 return;
300 }
301
302 svm_data->asid_generation = 1;
303 svm_data->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1;
304 svm_data->next_asid = svm_data->max_asid + 1;
305 svm_features = cpuid_edx(SVM_CPUID_FUNC);
306
307 asm volatile ("sgdt %0" : "=m"(gdt_descr));
308 gdt = (struct desc_struct *)gdt_descr.address;
309 svm_data->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS);
310
311 rdmsrl(MSR_EFER, efer);
312 wrmsrl(MSR_EFER, efer | MSR_EFER_SVME_MASK);
313
314 wrmsrl(MSR_VM_HSAVE_PA,
315 page_to_pfn(svm_data->save_area) << PAGE_SHIFT);
316}
317
318static int svm_cpu_init(int cpu)
319{
320 struct svm_cpu_data *svm_data;
321 int r;
322
323 svm_data = kzalloc(sizeof(struct svm_cpu_data), GFP_KERNEL);
324 if (!svm_data)
325 return -ENOMEM;
326 svm_data->cpu = cpu;
327 svm_data->save_area = alloc_page(GFP_KERNEL);
328 r = -ENOMEM;
329 if (!svm_data->save_area)
330 goto err_1;
331
332 per_cpu(svm_data, cpu) = svm_data;
333
334 return 0;
335
336err_1:
337 kfree(svm_data);
338 return r;
339
340}
341
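/*
 * Each MSR in the permission map is described by two bits: bit 0 intercepts
 * reads, bit 1 intercepts writes.  Passing read/write as 1 clears the
 * corresponding bit, giving the guest direct access to that MSR.
 */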
342static void set_msr_interception(u32 *msrpm, unsigned msr,
343 int read, int write)
344{
345 int i;
346
347 for (i = 0; i < NUM_MSR_MAPS; i++) {
348 if (msr >= msrpm_ranges[i] &&
349 msr < msrpm_ranges[i] + MSRS_IN_RANGE) {
350 u32 msr_offset = (i * MSRS_IN_RANGE + msr -
351 msrpm_ranges[i]) * 2;
352
353 u32 *base = msrpm + (msr_offset / 32);
354 u32 msr_shift = msr_offset % 32;
355 u32 mask = ((write) ? 0 : 2) | ((read) ? 0 : 1);
356 *base = (*base & ~(0x3 << msr_shift)) |
357 (mask << msr_shift);
358 return;
359 }
360 }
361 BUG();
362}
363
364static __init int svm_hardware_setup(void)
365{
366 int cpu;
367 struct page *iopm_pages;
368 struct page *msrpm_pages;
369 void *iopm_va, *msrpm_va;
370 int r;
371
372 iopm_pages = alloc_pages(GFP_KERNEL, IOPM_ALLOC_ORDER);
373
374 if (!iopm_pages)
375 return -ENOMEM;
376
377 iopm_va = page_address(iopm_pages);
378 memset(iopm_va, 0xff, PAGE_SIZE * (1 << IOPM_ALLOC_ORDER));
379 clear_bit(0x80, iopm_va); /* allow direct access to PC debug port */
380 iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT;
381
382
383 msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER);
384
385 r = -ENOMEM;
386 if (!msrpm_pages)
387 goto err_1;
388
389 msrpm_va = page_address(msrpm_pages);
390 memset(msrpm_va, 0xff, PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER));
391 msrpm_base = page_to_pfn(msrpm_pages) << PAGE_SHIFT;
392
393#ifdef CONFIG_X86_64
394 set_msr_interception(msrpm_va, MSR_GS_BASE, 1, 1);
395 set_msr_interception(msrpm_va, MSR_FS_BASE, 1, 1);
396 set_msr_interception(msrpm_va, MSR_KERNEL_GS_BASE, 1, 1);
397 set_msr_interception(msrpm_va, MSR_LSTAR, 1, 1);
398 set_msr_interception(msrpm_va, MSR_CSTAR, 1, 1);
399 set_msr_interception(msrpm_va, MSR_SYSCALL_MASK, 1, 1);
400#endif
401 set_msr_interception(msrpm_va, MSR_K6_STAR, 1, 1);
402 set_msr_interception(msrpm_va, MSR_IA32_SYSENTER_CS, 1, 1);
403 set_msr_interception(msrpm_va, MSR_IA32_SYSENTER_ESP, 1, 1);
404 set_msr_interception(msrpm_va, MSR_IA32_SYSENTER_EIP, 1, 1);
405
406 for_each_online_cpu(cpu) {
407 r = svm_cpu_init(cpu);
408 if (r)
409 goto err_2;
410 }
411 return 0;
412
413err_2:
414 __free_pages(msrpm_pages, MSRPM_ALLOC_ORDER);
415 msrpm_base = 0;
416err_1:
417 __free_pages(iopm_pages, IOPM_ALLOC_ORDER);
418 iopm_base = 0;
419 return r;
420}
421
422static __exit void svm_hardware_unsetup(void)
423{
424 __free_pages(pfn_to_page(msrpm_base >> PAGE_SHIFT), MSRPM_ALLOC_ORDER);
425 __free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT), IOPM_ALLOC_ORDER);
426 iopm_base = msrpm_base = 0;
427}
428
429static void init_seg(struct vmcb_seg *seg)
430{
431 seg->selector = 0;
432 seg->attrib = SVM_SELECTOR_P_MASK | SVM_SELECTOR_S_MASK |
433 SVM_SELECTOR_WRITE_MASK; /* Read/Write Data Segment */
434 seg->limit = 0xffff;
435 seg->base = 0;
436}
437
438static void init_sys_seg(struct vmcb_seg *seg, uint32_t type)
439{
440 seg->selector = 0;
441 seg->attrib = SVM_SELECTOR_P_MASK | type;
442 seg->limit = 0xffff;
443 seg->base = 0;
444}
445
446static void init_vmcb(struct vmcb *vmcb)
447{
448 struct vmcb_control_area *control = &vmcb->control;
449 struct vmcb_save_area *save = &vmcb->save;
450
451 control->intercept_cr_read = INTERCEPT_CR0_MASK |
452 INTERCEPT_CR3_MASK |
453 INTERCEPT_CR4_MASK |
454 INTERCEPT_CR8_MASK;
455
456 control->intercept_cr_write = INTERCEPT_CR0_MASK |
457 INTERCEPT_CR3_MASK |
458 INTERCEPT_CR4_MASK |
459 INTERCEPT_CR8_MASK;
460
461 control->intercept_dr_read = INTERCEPT_DR0_MASK |
462 INTERCEPT_DR1_MASK |
463 INTERCEPT_DR2_MASK |
464 INTERCEPT_DR3_MASK;
465
466 control->intercept_dr_write = INTERCEPT_DR0_MASK |
467 INTERCEPT_DR1_MASK |
468 INTERCEPT_DR2_MASK |
469 INTERCEPT_DR3_MASK |
470 INTERCEPT_DR5_MASK |
471 INTERCEPT_DR7_MASK;
472
473 control->intercept_exceptions = (1 << PF_VECTOR) |
474 (1 << UD_VECTOR);
475
476
477 control->intercept = (1ULL << INTERCEPT_INTR) |
478 (1ULL << INTERCEPT_NMI) |
479 (1ULL << INTERCEPT_SMI) |
480 /*
481 * selective cr0 intercept bug?
482 * 0: 0f 22 d8 mov %eax,%cr3
483 * 3: 0f 20 c0 mov %cr0,%eax
484 * 6: 0d 00 00 00 80 or $0x80000000,%eax
485 * b: 0f 22 c0 mov %eax,%cr0
486	 * set cr3 -> interception
487	 * get cr0 -> interception
488	 * set cr0 -> no interception
489 */
490 /* (1ULL << INTERCEPT_SELECTIVE_CR0) | */
491 (1ULL << INTERCEPT_CPUID) |
492 (1ULL << INTERCEPT_INVD) |
493 (1ULL << INTERCEPT_HLT) |
494 (1ULL << INTERCEPT_INVLPGA) |
495 (1ULL << INTERCEPT_IOIO_PROT) |
496 (1ULL << INTERCEPT_MSR_PROT) |
497 (1ULL << INTERCEPT_TASK_SWITCH) |
498 (1ULL << INTERCEPT_SHUTDOWN) |
499 (1ULL << INTERCEPT_VMRUN) |
500 (1ULL << INTERCEPT_VMMCALL) |
501 (1ULL << INTERCEPT_VMLOAD) |
502 (1ULL << INTERCEPT_VMSAVE) |
503 (1ULL << INTERCEPT_STGI) |
504 (1ULL << INTERCEPT_CLGI) |
505 (1ULL << INTERCEPT_SKINIT) |
506 (1ULL << INTERCEPT_WBINVD) |
507 (1ULL << INTERCEPT_MONITOR) |
508 (1ULL << INTERCEPT_MWAIT);
509
510 control->iopm_base_pa = iopm_base;
511 control->msrpm_base_pa = msrpm_base;
512 control->tsc_offset = 0;
513 control->int_ctl = V_INTR_MASKING_MASK;
514
515 init_seg(&save->es);
516 init_seg(&save->ss);
517 init_seg(&save->ds);
518 init_seg(&save->fs);
519 init_seg(&save->gs);
520
521 save->cs.selector = 0xf000;
522 /* Executable/Readable Code Segment */
523 save->cs.attrib = SVM_SELECTOR_READ_MASK | SVM_SELECTOR_P_MASK |
524 SVM_SELECTOR_S_MASK | SVM_SELECTOR_CODE_MASK;
525 save->cs.limit = 0xffff;
526 /*
527 * cs.base should really be 0xffff0000, but vmx can't handle that, so
528 * be consistent with it.
529 *
530 * Replace when we have real mode working for vmx.
531 */
532 save->cs.base = 0xf0000;
533
534 save->gdtr.limit = 0xffff;
535 save->idtr.limit = 0xffff;
536
537 init_sys_seg(&save->ldtr, SEG_TYPE_LDT);
538 init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16);
539
540 save->efer = MSR_EFER_SVME_MASK;
541 save->dr6 = 0xffff0ff0;
542 save->dr7 = 0x400;
543 save->rflags = 2;
544 save->rip = 0x0000fff0;
545
546 /*
547	 * The cr0 value at cpu init should be 0x60000010; we enable the cpu
548	 * cache by default.  The proper place to enable the cache is the BIOS.
549 */
550 save->cr0 = 0x00000010 | X86_CR0_PG | X86_CR0_WP;
551 save->cr4 = X86_CR4_PAE;
552 /* rdx = ?? */
553}
554
555static int svm_vcpu_reset(struct kvm_vcpu *vcpu)
556{
557 struct vcpu_svm *svm = to_svm(vcpu);
558
559 init_vmcb(svm->vmcb);
560
561 if (vcpu->vcpu_id != 0) {
562 svm->vmcb->save.rip = 0;
563 svm->vmcb->save.cs.base = svm->vcpu.arch.sipi_vector << 12;
564 svm->vmcb->save.cs.selector = svm->vcpu.arch.sipi_vector << 8;
565 }
566
567 return 0;
568}
569
570static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
571{
572 struct vcpu_svm *svm;
573 struct page *page;
574 int err;
575
576 svm = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
577 if (!svm) {
578 err = -ENOMEM;
579 goto out;
580 }
581
582 err = kvm_vcpu_init(&svm->vcpu, kvm, id);
583 if (err)
584 goto free_svm;
585
586 page = alloc_page(GFP_KERNEL);
587 if (!page) {
588 err = -ENOMEM;
589 goto uninit;
590 }
591
592 svm->vmcb = page_address(page);
593 clear_page(svm->vmcb);
594 svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT;
595 svm->asid_generation = 0;
596 memset(svm->db_regs, 0, sizeof(svm->db_regs));
597 init_vmcb(svm->vmcb);
598
599 fx_init(&svm->vcpu);
600 svm->vcpu.fpu_active = 1;
601 svm->vcpu.arch.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
602 if (svm->vcpu.vcpu_id == 0)
603 svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP;
604
605 return &svm->vcpu;
606
607uninit:
608 kvm_vcpu_uninit(&svm->vcpu);
609free_svm:
610 kmem_cache_free(kvm_vcpu_cache, svm);
611out:
612 return ERR_PTR(err);
613}
614
615static void svm_free_vcpu(struct kvm_vcpu *vcpu)
616{
617 struct vcpu_svm *svm = to_svm(vcpu);
618
619 __free_page(pfn_to_page(svm->vmcb_pa >> PAGE_SHIFT));
620 kvm_vcpu_uninit(vcpu);
621 kmem_cache_free(kvm_vcpu_cache, svm);
622}
623
624static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
625{
626 struct vcpu_svm *svm = to_svm(vcpu);
627 int i;
628
629 if (unlikely(cpu != vcpu->cpu)) {
630 u64 tsc_this, delta;
631
632 /*
633 * Make sure that the guest sees a monotonically
634 * increasing TSC.
635 */
636 rdtscll(tsc_this);
637 delta = vcpu->arch.host_tsc - tsc_this;
638 svm->vmcb->control.tsc_offset += delta;
639 vcpu->cpu = cpu;
640 kvm_migrate_apic_timer(vcpu);
641 }
642
643 for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
644 rdmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
645}
646
647static void svm_vcpu_put(struct kvm_vcpu *vcpu)
648{
649 struct vcpu_svm *svm = to_svm(vcpu);
650 int i;
651
652 ++vcpu->stat.host_state_reload;
653 for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
654 wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
655
656 rdtscll(vcpu->arch.host_tsc);
657}
658
659static void svm_vcpu_decache(struct kvm_vcpu *vcpu)
660{
661}
662
663static void svm_cache_regs(struct kvm_vcpu *vcpu)
664{
665 struct vcpu_svm *svm = to_svm(vcpu);
666
667 vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
668 vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
669 vcpu->arch.rip = svm->vmcb->save.rip;
670}
671
672static void svm_decache_regs(struct kvm_vcpu *vcpu)
673{
674 struct vcpu_svm *svm = to_svm(vcpu);
675 svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
676 svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
677 svm->vmcb->save.rip = vcpu->arch.rip;
678}
679
680static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
681{
682 return to_svm(vcpu)->vmcb->save.rflags;
683}
684
685static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
686{
687 to_svm(vcpu)->vmcb->save.rflags = rflags;
688}
689
690static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg)
691{
692 struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;
693
694 switch (seg) {
695 case VCPU_SREG_CS: return &save->cs;
696 case VCPU_SREG_DS: return &save->ds;
697 case VCPU_SREG_ES: return &save->es;
698 case VCPU_SREG_FS: return &save->fs;
699 case VCPU_SREG_GS: return &save->gs;
700 case VCPU_SREG_SS: return &save->ss;
701 case VCPU_SREG_TR: return &save->tr;
702 case VCPU_SREG_LDTR: return &save->ldtr;
703 }
704 BUG();
705 return NULL;
706}
707
708static u64 svm_get_segment_base(struct kvm_vcpu *vcpu, int seg)
709{
710 struct vmcb_seg *s = svm_seg(vcpu, seg);
711
712 return s->base;
713}
714
715static void svm_get_segment(struct kvm_vcpu *vcpu,
716 struct kvm_segment *var, int seg)
717{
718 struct vmcb_seg *s = svm_seg(vcpu, seg);
719
720 var->base = s->base;
721 var->limit = s->limit;
722 var->selector = s->selector;
723 var->type = s->attrib & SVM_SELECTOR_TYPE_MASK;
724 var->s = (s->attrib >> SVM_SELECTOR_S_SHIFT) & 1;
725 var->dpl = (s->attrib >> SVM_SELECTOR_DPL_SHIFT) & 3;
726 var->present = (s->attrib >> SVM_SELECTOR_P_SHIFT) & 1;
727 var->avl = (s->attrib >> SVM_SELECTOR_AVL_SHIFT) & 1;
728 var->l = (s->attrib >> SVM_SELECTOR_L_SHIFT) & 1;
729 var->db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1;
730 var->g = (s->attrib >> SVM_SELECTOR_G_SHIFT) & 1;
731 var->unusable = !var->present;
732}
733
734static void svm_get_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
735{
736 struct vcpu_svm *svm = to_svm(vcpu);
737
738 dt->limit = svm->vmcb->save.idtr.limit;
739 dt->base = svm->vmcb->save.idtr.base;
740}
741
742static void svm_set_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
743{
744 struct vcpu_svm *svm = to_svm(vcpu);
745
746 svm->vmcb->save.idtr.limit = dt->limit;
747	svm->vmcb->save.idtr.base = dt->base;
748}
749
750static void svm_get_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
751{
752 struct vcpu_svm *svm = to_svm(vcpu);
753
754 dt->limit = svm->vmcb->save.gdtr.limit;
755 dt->base = svm->vmcb->save.gdtr.base;
756}
757
758static void svm_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
759{
760 struct vcpu_svm *svm = to_svm(vcpu);
761
762 svm->vmcb->save.gdtr.limit = dt->limit;
763	svm->vmcb->save.gdtr.base = dt->base;
764}
765
766static void svm_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
767{
768}
769
770static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
771{
772 struct vcpu_svm *svm = to_svm(vcpu);
773
774#ifdef CONFIG_X86_64
775 if (vcpu->arch.shadow_efer & EFER_LME) {
776 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
777 vcpu->arch.shadow_efer |= EFER_LMA;
778 svm->vmcb->save.efer |= EFER_LMA | EFER_LME;
779 }
780
781 if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) {
782 vcpu->arch.shadow_efer &= ~EFER_LMA;
783 svm->vmcb->save.efer &= ~(EFER_LMA | EFER_LME);
784 }
785 }
786#endif
787 if ((vcpu->arch.cr0 & X86_CR0_TS) && !(cr0 & X86_CR0_TS)) {
788 svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR);
789 vcpu->fpu_active = 1;
790 }
791
792 vcpu->arch.cr0 = cr0;
793 cr0 |= X86_CR0_PG | X86_CR0_WP;
794 cr0 &= ~(X86_CR0_CD | X86_CR0_NW);
795 svm->vmcb->save.cr0 = cr0;
796}
797
798static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
799{
800 vcpu->arch.cr4 = cr4;
801 to_svm(vcpu)->vmcb->save.cr4 = cr4 | X86_CR4_PAE;
802}
803
804static void svm_set_segment(struct kvm_vcpu *vcpu,
805 struct kvm_segment *var, int seg)
806{
807 struct vcpu_svm *svm = to_svm(vcpu);
808 struct vmcb_seg *s = svm_seg(vcpu, seg);
809
810 s->base = var->base;
811 s->limit = var->limit;
812 s->selector = var->selector;
813 if (var->unusable)
814 s->attrib = 0;
815 else {
816 s->attrib = (var->type & SVM_SELECTOR_TYPE_MASK);
817 s->attrib |= (var->s & 1) << SVM_SELECTOR_S_SHIFT;
818 s->attrib |= (var->dpl & 3) << SVM_SELECTOR_DPL_SHIFT;
819 s->attrib |= (var->present & 1) << SVM_SELECTOR_P_SHIFT;
820 s->attrib |= (var->avl & 1) << SVM_SELECTOR_AVL_SHIFT;
821 s->attrib |= (var->l & 1) << SVM_SELECTOR_L_SHIFT;
822 s->attrib |= (var->db & 1) << SVM_SELECTOR_DB_SHIFT;
823 s->attrib |= (var->g & 1) << SVM_SELECTOR_G_SHIFT;
824 }
825 if (seg == VCPU_SREG_CS)
826 svm->vmcb->save.cpl
827 = (svm->vmcb->save.cs.attrib
828 >> SVM_SELECTOR_DPL_SHIFT) & 3;
829
830}
831
832/* FIXME:
833
834 svm(vcpu)->vmcb->control.int_ctl &= ~V_TPR_MASK;
835 svm(vcpu)->vmcb->control.int_ctl |= (sregs->cr8 & V_TPR_MASK);
836
837*/
838
839static int svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg)
840{
841 return -EOPNOTSUPP;
842}
843
844static int svm_get_irq(struct kvm_vcpu *vcpu)
845{
846 struct vcpu_svm *svm = to_svm(vcpu);
847 u32 exit_int_info = svm->vmcb->control.exit_int_info;
848
849 if (is_external_interrupt(exit_int_info))
850 return exit_int_info & SVM_EVTINJ_VEC_MASK;
851 return -1;
852}
853
854static void load_host_msrs(struct kvm_vcpu *vcpu)
855{
856#ifdef CONFIG_X86_64
857 wrmsrl(MSR_GS_BASE, to_svm(vcpu)->host_gs_base);
858#endif
859}
860
861static void save_host_msrs(struct kvm_vcpu *vcpu)
862{
863#ifdef CONFIG_X86_64
864 rdmsrl(MSR_GS_BASE, to_svm(vcpu)->host_gs_base);
865#endif
866}
867
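/*
 * Hand out a fresh ASID for this guest on the current cpu.  When the ASID
 * space is exhausted, bump the generation and request a flush of all ASIDs
 * so entries tagged with a previous generation cannot be reused.
 */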
868static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *svm_data)
869{
870 if (svm_data->next_asid > svm_data->max_asid) {
871 ++svm_data->asid_generation;
872 svm_data->next_asid = 1;
873 svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ALL_ASID;
874 }
875
876 svm->vcpu.cpu = svm_data->cpu;
877 svm->asid_generation = svm_data->asid_generation;
878 svm->vmcb->control.asid = svm_data->next_asid++;
879}
880
881static unsigned long svm_get_dr(struct kvm_vcpu *vcpu, int dr)
882{
883 return to_svm(vcpu)->db_regs[dr];
884}
885
886static void svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value,
887 int *exception)
888{
889 struct vcpu_svm *svm = to_svm(vcpu);
890
891 *exception = 0;
892
893 if (svm->vmcb->save.dr7 & DR7_GD_MASK) {
894 svm->vmcb->save.dr7 &= ~DR7_GD_MASK;
895 svm->vmcb->save.dr6 |= DR6_BD_MASK;
896 *exception = DB_VECTOR;
897 return;
898 }
899
900 switch (dr) {
901 case 0 ... 3:
902 svm->db_regs[dr] = value;
903 return;
904 case 4 ... 5:
905 if (vcpu->arch.cr4 & X86_CR4_DE) {
906 *exception = UD_VECTOR;
907 return;
908 }
909 case 7: {
910 if (value & ~((1ULL << 32) - 1)) {
911 *exception = GP_VECTOR;
912 return;
913 }
914 svm->vmcb->save.dr7 = value;
915 return;
916 }
917 default:
918 printk(KERN_DEBUG "%s: unexpected dr %u\n",
919 __FUNCTION__, dr);
920 *exception = UD_VECTOR;
921 return;
922 }
923}
924
925static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
926{
927 u32 exit_int_info = svm->vmcb->control.exit_int_info;
928 struct kvm *kvm = svm->vcpu.kvm;
929 u64 fault_address;
930 u32 error_code;
931
932 if (!irqchip_in_kernel(kvm) &&
933 is_external_interrupt(exit_int_info))
934 push_irq(&svm->vcpu, exit_int_info & SVM_EVTINJ_VEC_MASK);
935
936 fault_address = svm->vmcb->control.exit_info_2;
937 error_code = svm->vmcb->control.exit_info_1;
938 return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code);
939}
940
941static int ud_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
942{
943 int er;
944
945 er = emulate_instruction(&svm->vcpu, kvm_run, 0, 0, EMULTYPE_TRAP_UD);
946 if (er != EMULATE_DONE)
947 kvm_queue_exception(&svm->vcpu, UD_VECTOR);
948 return 1;
949}
950
951static int nm_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
952{
953 svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR);
954 if (!(svm->vcpu.arch.cr0 & X86_CR0_TS))
955 svm->vmcb->save.cr0 &= ~X86_CR0_TS;
956 svm->vcpu.fpu_active = 1;
957
958 return 1;
959}
960
961static int shutdown_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
962{
963 /*
964 * VMCB is undefined after a SHUTDOWN intercept
965 * so reinitialize it.
966 */
967 clear_page(svm->vmcb);
968 init_vmcb(svm->vmcb);
969
970 kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
971 return 0;
972}
973
974static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
975{
976 u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */
977 int size, down, in, string, rep;
978 unsigned port;
979
980 ++svm->vcpu.stat.io_exits;
981
982 svm->next_rip = svm->vmcb->control.exit_info_2;
983
984 string = (io_info & SVM_IOIO_STR_MASK) != 0;
985
986 if (string) {
987 if (emulate_instruction(&svm->vcpu,
988 kvm_run, 0, 0, 0) == EMULATE_DO_MMIO)
989 return 0;
990 return 1;
991 }
992
993 in = (io_info & SVM_IOIO_TYPE_MASK) != 0;
994 port = io_info >> 16;
995 size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT;
996 rep = (io_info & SVM_IOIO_REP_MASK) != 0;
997 down = (svm->vmcb->save.rflags & X86_EFLAGS_DF) != 0;
998
999 return kvm_emulate_pio(&svm->vcpu, kvm_run, in, size, port);
1000}
1001
1002static int nop_on_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1003{
1004 return 1;
1005}
1006
1007static int halt_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1008{
1009 svm->next_rip = svm->vmcb->save.rip + 1;
1010 skip_emulated_instruction(&svm->vcpu);
1011 return kvm_emulate_halt(&svm->vcpu);
1012}
1013
1014static int vmmcall_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1015{
1016 svm->next_rip = svm->vmcb->save.rip + 3;
1017 skip_emulated_instruction(&svm->vcpu);
1018 kvm_emulate_hypercall(&svm->vcpu);
1019 return 1;
1020}
1021
1022static int invalid_op_interception(struct vcpu_svm *svm,
1023 struct kvm_run *kvm_run)
1024{
1025 kvm_queue_exception(&svm->vcpu, UD_VECTOR);
1026 return 1;
1027}
1028
1029static int task_switch_interception(struct vcpu_svm *svm,
1030 struct kvm_run *kvm_run)
1031{
1032 pr_unimpl(&svm->vcpu, "%s: task switch is unsupported\n", __FUNCTION__);
1033 kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
1034 return 0;
1035}
1036
1037static int cpuid_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1038{
1039 svm->next_rip = svm->vmcb->save.rip + 2;
1040 kvm_emulate_cpuid(&svm->vcpu);
1041 return 1;
1042}
1043
1044static int emulate_on_interception(struct vcpu_svm *svm,
1045 struct kvm_run *kvm_run)
1046{
1047 if (emulate_instruction(&svm->vcpu, NULL, 0, 0, 0) != EMULATE_DONE)
1048 pr_unimpl(&svm->vcpu, "%s: failed\n", __FUNCTION__);
1049 return 1;
1050}
1051
1052static int cr8_write_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1053{
1054 emulate_instruction(&svm->vcpu, NULL, 0, 0, 0);
1055 if (irqchip_in_kernel(svm->vcpu.kvm))
1056 return 1;
1057 kvm_run->exit_reason = KVM_EXIT_SET_TPR;
1058 return 0;
1059}
1060
1061static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
1062{
1063 struct vcpu_svm *svm = to_svm(vcpu);
1064
1065 switch (ecx) {
1066 case MSR_IA32_TIME_STAMP_COUNTER: {
1067 u64 tsc;
1068
1069 rdtscll(tsc);
1070 *data = svm->vmcb->control.tsc_offset + tsc;
1071 break;
1072 }
1073 case MSR_K6_STAR:
1074 *data = svm->vmcb->save.star;
1075 break;
1076#ifdef CONFIG_X86_64
1077 case MSR_LSTAR:
1078 *data = svm->vmcb->save.lstar;
1079 break;
1080 case MSR_CSTAR:
1081 *data = svm->vmcb->save.cstar;
1082 break;
1083 case MSR_KERNEL_GS_BASE:
1084 *data = svm->vmcb->save.kernel_gs_base;
1085 break;
1086 case MSR_SYSCALL_MASK:
1087 *data = svm->vmcb->save.sfmask;
1088 break;
1089#endif
1090 case MSR_IA32_SYSENTER_CS:
1091 *data = svm->vmcb->save.sysenter_cs;
1092 break;
1093 case MSR_IA32_SYSENTER_EIP:
1094 *data = svm->vmcb->save.sysenter_eip;
1095 break;
1096 case MSR_IA32_SYSENTER_ESP:
1097 *data = svm->vmcb->save.sysenter_esp;
1098 break;
1099 default:
1100 return kvm_get_msr_common(vcpu, ecx, data);
1101 }
1102 return 0;
1103}
1104
1105static int rdmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1106{
1107 u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX];
1108 u64 data;
1109
1110 if (svm_get_msr(&svm->vcpu, ecx, &data))
1111 kvm_inject_gp(&svm->vcpu, 0);
1112 else {
1113 svm->vmcb->save.rax = data & 0xffffffff;
1114 svm->vcpu.arch.regs[VCPU_REGS_RDX] = data >> 32;
1115 svm->next_rip = svm->vmcb->save.rip + 2;
1116 skip_emulated_instruction(&svm->vcpu);
1117 }
1118 return 1;
1119}
1120
1121static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
1122{
1123 struct vcpu_svm *svm = to_svm(vcpu);
1124
1125 switch (ecx) {
1126 case MSR_IA32_TIME_STAMP_COUNTER: {
1127 u64 tsc;
1128
1129 rdtscll(tsc);
1130 svm->vmcb->control.tsc_offset = data - tsc;
1131 break;
1132 }
1133 case MSR_K6_STAR:
1134 svm->vmcb->save.star = data;
1135 break;
1136#ifdef CONFIG_X86_64
1137 case MSR_LSTAR:
1138 svm->vmcb->save.lstar = data;
1139 break;
1140 case MSR_CSTAR:
1141 svm->vmcb->save.cstar = data;
1142 break;
1143 case MSR_KERNEL_GS_BASE:
1144 svm->vmcb->save.kernel_gs_base = data;
1145 break;
1146 case MSR_SYSCALL_MASK:
1147 svm->vmcb->save.sfmask = data;
1148 break;
1149#endif
1150 case MSR_IA32_SYSENTER_CS:
1151 svm->vmcb->save.sysenter_cs = data;
1152 break;
1153 case MSR_IA32_SYSENTER_EIP:
1154 svm->vmcb->save.sysenter_eip = data;
1155 break;
1156 case MSR_IA32_SYSENTER_ESP:
1157 svm->vmcb->save.sysenter_esp = data;
1158 break;
1159 case MSR_K7_EVNTSEL0:
1160 case MSR_K7_EVNTSEL1:
1161 case MSR_K7_EVNTSEL2:
1162 case MSR_K7_EVNTSEL3:
1163 /*
1164 * only support writing 0 to the performance counters for now
1165 * to make Windows happy. Should be replaced by a real
1166 * performance counter emulation later.
1167 */
1168 if (data != 0)
1169 goto unhandled;
1170 break;
1171 default:
1172 unhandled:
1173 return kvm_set_msr_common(vcpu, ecx, data);
1174 }
1175 return 0;
1176}
1177
1178static int wrmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1179{
1180 u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX];
1181 u64 data = (svm->vmcb->save.rax & -1u)
1182 | ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32);
1183 svm->next_rip = svm->vmcb->save.rip + 2;
1184 if (svm_set_msr(&svm->vcpu, ecx, data))
1185 kvm_inject_gp(&svm->vcpu, 0);
1186 else
1187 skip_emulated_instruction(&svm->vcpu);
1188 return 1;
1189}
1190
1191static int msr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1192{
1193 if (svm->vmcb->control.exit_info_1)
1194 return wrmsr_interception(svm, kvm_run);
1195 else
1196 return rdmsr_interception(svm, kvm_run);
1197}
1198
1199static int interrupt_window_interception(struct vcpu_svm *svm,
1200 struct kvm_run *kvm_run)
1201{
1202 svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_VINTR);
1203 svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
1204 /*
1205	 * If user space is waiting to inject interrupts, exit as soon
1206	 * as possible.
1207 */
1208 if (kvm_run->request_interrupt_window &&
1209 !svm->vcpu.arch.irq_summary) {
1210 ++svm->vcpu.stat.irq_window_exits;
1211 kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
1212 return 0;
1213 }
1214
1215 return 1;
1216}
1217
1218static int (*svm_exit_handlers[])(struct vcpu_svm *svm,
1219 struct kvm_run *kvm_run) = {
1220 [SVM_EXIT_READ_CR0] = emulate_on_interception,
1221 [SVM_EXIT_READ_CR3] = emulate_on_interception,
1222 [SVM_EXIT_READ_CR4] = emulate_on_interception,
1223 [SVM_EXIT_READ_CR8] = emulate_on_interception,
1224 /* for now: */
1225 [SVM_EXIT_WRITE_CR0] = emulate_on_interception,
1226 [SVM_EXIT_WRITE_CR3] = emulate_on_interception,
1227 [SVM_EXIT_WRITE_CR4] = emulate_on_interception,
1228 [SVM_EXIT_WRITE_CR8] = cr8_write_interception,
1229 [SVM_EXIT_READ_DR0] = emulate_on_interception,
1230 [SVM_EXIT_READ_DR1] = emulate_on_interception,
1231 [SVM_EXIT_READ_DR2] = emulate_on_interception,
1232 [SVM_EXIT_READ_DR3] = emulate_on_interception,
1233 [SVM_EXIT_WRITE_DR0] = emulate_on_interception,
1234 [SVM_EXIT_WRITE_DR1] = emulate_on_interception,
1235 [SVM_EXIT_WRITE_DR2] = emulate_on_interception,
1236 [SVM_EXIT_WRITE_DR3] = emulate_on_interception,
1237 [SVM_EXIT_WRITE_DR5] = emulate_on_interception,
1238 [SVM_EXIT_WRITE_DR7] = emulate_on_interception,
1239 [SVM_EXIT_EXCP_BASE + UD_VECTOR] = ud_interception,
1240 [SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception,
1241 [SVM_EXIT_EXCP_BASE + NM_VECTOR] = nm_interception,
1242 [SVM_EXIT_INTR] = nop_on_interception,
1243 [SVM_EXIT_NMI] = nop_on_interception,
1244 [SVM_EXIT_SMI] = nop_on_interception,
1245 [SVM_EXIT_INIT] = nop_on_interception,
1246 [SVM_EXIT_VINTR] = interrupt_window_interception,
1247 /* [SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception, */
1248 [SVM_EXIT_CPUID] = cpuid_interception,
1249 [SVM_EXIT_INVD] = emulate_on_interception,
1250 [SVM_EXIT_HLT] = halt_interception,
1251 [SVM_EXIT_INVLPG] = emulate_on_interception,
1252 [SVM_EXIT_INVLPGA] = invalid_op_interception,
1253 [SVM_EXIT_IOIO] = io_interception,
1254 [SVM_EXIT_MSR] = msr_interception,
1255 [SVM_EXIT_TASK_SWITCH] = task_switch_interception,
1256 [SVM_EXIT_SHUTDOWN] = shutdown_interception,
1257 [SVM_EXIT_VMRUN] = invalid_op_interception,
1258 [SVM_EXIT_VMMCALL] = vmmcall_interception,
1259 [SVM_EXIT_VMLOAD] = invalid_op_interception,
1260 [SVM_EXIT_VMSAVE] = invalid_op_interception,
1261 [SVM_EXIT_STGI] = invalid_op_interception,
1262 [SVM_EXIT_CLGI] = invalid_op_interception,
1263 [SVM_EXIT_SKINIT] = invalid_op_interception,
1264 [SVM_EXIT_WBINVD] = emulate_on_interception,
1265 [SVM_EXIT_MONITOR] = invalid_op_interception,
1266 [SVM_EXIT_MWAIT] = invalid_op_interception,
1267};
1268
1269
1270static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
1271{
1272 struct vcpu_svm *svm = to_svm(vcpu);
1273 u32 exit_code = svm->vmcb->control.exit_code;
1274
1275 kvm_reput_irq(svm);
1276
1277 if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) {
1278 kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
1279 kvm_run->fail_entry.hardware_entry_failure_reason
1280 = svm->vmcb->control.exit_code;
1281 return 0;
1282 }
1283
1284 if (is_external_interrupt(svm->vmcb->control.exit_int_info) &&
1285 exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR)
1286 printk(KERN_ERR "%s: unexpected exit_ini_info 0x%x "
1287 "exit_code 0x%x\n",
1288 __FUNCTION__, svm->vmcb->control.exit_int_info,
1289 exit_code);
1290
1291 if (exit_code >= ARRAY_SIZE(svm_exit_handlers)
1292 || !svm_exit_handlers[exit_code]) {
1293 kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
1294 kvm_run->hw.hardware_exit_reason = exit_code;
1295 return 0;
1296 }
1297
1298 return svm_exit_handlers[exit_code](svm, kvm_run);
1299}
1300
1301static void reload_tss(struct kvm_vcpu *vcpu)
1302{
1303 int cpu = raw_smp_processor_id();
1304
1305 struct svm_cpu_data *svm_data = per_cpu(svm_data, cpu);
1306 svm_data->tss_desc->type = 9; /* available 32/64-bit TSS */
1307 load_TR_desc();
1308}
1309
1310static void pre_svm_run(struct vcpu_svm *svm)
1311{
1312 int cpu = raw_smp_processor_id();
1313
1314 struct svm_cpu_data *svm_data = per_cpu(svm_data, cpu);
1315
1316 svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;
1317 if (svm->vcpu.cpu != cpu ||
1318 svm->asid_generation != svm_data->asid_generation)
1319 new_asid(svm, svm_data);
1320}
1321
1322
1323static inline void svm_inject_irq(struct vcpu_svm *svm, int irq)
1324{
1325 struct vmcb_control_area *control;
1326
1327 control = &svm->vmcb->control;
1328 control->int_vector = irq;
1329 control->int_ctl &= ~V_INTR_PRIO_MASK;
1330 control->int_ctl |= V_IRQ_MASK |
1331 ((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT);
1332}
1333
1334static void svm_set_irq(struct kvm_vcpu *vcpu, int irq)
1335{
1336 struct vcpu_svm *svm = to_svm(vcpu);
1337
1338 svm_inject_irq(svm, irq);
1339}
1340
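/*
 * Decide what to inject before entering the guest: an event that was pending
 * at the last #VMEXIT is re-injected first; otherwise a new vector is taken
 * from the interrupt controller, or a virtual interrupt intercept is armed
 * if the guest cannot accept interrupts right now.
 */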
1341static void svm_intr_assist(struct kvm_vcpu *vcpu)
1342{
1343 struct vcpu_svm *svm = to_svm(vcpu);
1344 struct vmcb *vmcb = svm->vmcb;
1345 int intr_vector = -1;
1346
1347 if ((vmcb->control.exit_int_info & SVM_EVTINJ_VALID) &&
1348 ((vmcb->control.exit_int_info & SVM_EVTINJ_TYPE_MASK) == 0)) {
1349 intr_vector = vmcb->control.exit_int_info &
1350 SVM_EVTINJ_VEC_MASK;
1351 vmcb->control.exit_int_info = 0;
1352 svm_inject_irq(svm, intr_vector);
1353 return;
1354 }
1355
1356 if (vmcb->control.int_ctl & V_IRQ_MASK)
1357 return;
1358
1359 if (!kvm_cpu_has_interrupt(vcpu))
1360 return;
1361
1362 if (!(vmcb->save.rflags & X86_EFLAGS_IF) ||
1363 (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) ||
1364 (vmcb->control.event_inj & SVM_EVTINJ_VALID)) {
1365 /* unable to deliver irq, set pending irq */
1366 vmcb->control.intercept |= (1ULL << INTERCEPT_VINTR);
1367 svm_inject_irq(svm, 0x0);
1368 return;
1369 }
1370 /* Okay, we can deliver the interrupt: grab it and update PIC state. */
1371 intr_vector = kvm_cpu_get_interrupt(vcpu);
1372 svm_inject_irq(svm, intr_vector);
1373 kvm_timer_intr_post(vcpu, intr_vector);
1374}
1375
1376static void kvm_reput_irq(struct vcpu_svm *svm)
1377{
1378 struct vmcb_control_area *control = &svm->vmcb->control;
1379
1380 if ((control->int_ctl & V_IRQ_MASK)
1381 && !irqchip_in_kernel(svm->vcpu.kvm)) {
1382 control->int_ctl &= ~V_IRQ_MASK;
1383 push_irq(&svm->vcpu, control->int_vector);
1384 }
1385
1386 svm->vcpu.arch.interrupt_window_open =
1387 !(control->int_state & SVM_INTERRUPT_SHADOW_MASK);
1388}
1389
1390static void svm_do_inject_vector(struct vcpu_svm *svm)
1391{
1392 struct kvm_vcpu *vcpu = &svm->vcpu;
1393 int word_index = __ffs(vcpu->arch.irq_summary);
1394 int bit_index = __ffs(vcpu->arch.irq_pending[word_index]);
1395 int irq = word_index * BITS_PER_LONG + bit_index;
1396
1397 clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]);
1398 if (!vcpu->arch.irq_pending[word_index])
1399 clear_bit(word_index, &vcpu->arch.irq_summary);
1400 svm_inject_irq(svm, irq);
1401}
1402
1403static void do_interrupt_requests(struct kvm_vcpu *vcpu,
1404 struct kvm_run *kvm_run)
1405{
1406 struct vcpu_svm *svm = to_svm(vcpu);
1407 struct vmcb_control_area *control = &svm->vmcb->control;
1408
1409 svm->vcpu.arch.interrupt_window_open =
1410 (!(control->int_state & SVM_INTERRUPT_SHADOW_MASK) &&
1411 (svm->vmcb->save.rflags & X86_EFLAGS_IF));
1412
1413 if (svm->vcpu.arch.interrupt_window_open && svm->vcpu.arch.irq_summary)
1414 /*
1415		 * Interrupts are enabled and not blocked by sti or mov ss,
		 * so the pending vector can be injected.
1416 */
1417 svm_do_inject_vector(svm);
1418
1419 /*
1420 * Interrupts blocked. Wait for unblock.
1421 */
1422 if (!svm->vcpu.arch.interrupt_window_open &&
1423 (svm->vcpu.arch.irq_summary || kvm_run->request_interrupt_window))
1424 control->intercept |= 1ULL << INTERCEPT_VINTR;
1425 else
1426 control->intercept &= ~(1ULL << INTERCEPT_VINTR);
1427}
1428
1429static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr)
1430{
1431 return 0;
1432}
1433
1434static void save_db_regs(unsigned long *db_regs)
1435{
1436 asm volatile ("mov %%dr0, %0" : "=r"(db_regs[0]));
1437 asm volatile ("mov %%dr1, %0" : "=r"(db_regs[1]));
1438 asm volatile ("mov %%dr2, %0" : "=r"(db_regs[2]));
1439 asm volatile ("mov %%dr3, %0" : "=r"(db_regs[3]));
1440}
1441
1442static void load_db_regs(unsigned long *db_regs)
1443{
1444 asm volatile ("mov %0, %%dr0" : : "r"(db_regs[0]));
1445 asm volatile ("mov %0, %%dr1" : : "r"(db_regs[1]));
1446 asm volatile ("mov %0, %%dr2" : : "r"(db_regs[2]));
1447 asm volatile ("mov %0, %%dr3" : : "r"(db_regs[3]));
1448}
1449
1450static void svm_flush_tlb(struct kvm_vcpu *vcpu)
1451{
1452 force_new_asid(vcpu);
1453}
1454
1455static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu)
1456{
1457}
1458
1459static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1460{
1461 struct vcpu_svm *svm = to_svm(vcpu);
1462 u16 fs_selector;
1463 u16 gs_selector;
1464 u16 ldt_selector;
1465
1466 pre_svm_run(svm);
1467
1468 save_host_msrs(vcpu);
1469 fs_selector = read_fs();
1470 gs_selector = read_gs();
1471 ldt_selector = read_ldt();
1472 svm->host_cr2 = kvm_read_cr2();
1473 svm->host_dr6 = read_dr6();
1474 svm->host_dr7 = read_dr7();
1475 svm->vmcb->save.cr2 = vcpu->arch.cr2;
1476
1477 if (svm->vmcb->save.dr7 & 0xff) {
1478 write_dr7(0);
1479 save_db_regs(svm->host_db_regs);
1480 load_db_regs(svm->db_regs);
1481 }
1482
1483 clgi();
1484
1485 local_irq_enable();
1486
1487 asm volatile (
1488#ifdef CONFIG_X86_64
1489 "push %%rbp; \n\t"
1490#else
1491 "push %%ebp; \n\t"
1492#endif
1493
1494#ifdef CONFIG_X86_64
1495 "mov %c[rbx](%[svm]), %%rbx \n\t"
1496 "mov %c[rcx](%[svm]), %%rcx \n\t"
1497 "mov %c[rdx](%[svm]), %%rdx \n\t"
1498 "mov %c[rsi](%[svm]), %%rsi \n\t"
1499 "mov %c[rdi](%[svm]), %%rdi \n\t"
1500 "mov %c[rbp](%[svm]), %%rbp \n\t"
1501 "mov %c[r8](%[svm]), %%r8 \n\t"
1502 "mov %c[r9](%[svm]), %%r9 \n\t"
1503 "mov %c[r10](%[svm]), %%r10 \n\t"
1504 "mov %c[r11](%[svm]), %%r11 \n\t"
1505 "mov %c[r12](%[svm]), %%r12 \n\t"
1506 "mov %c[r13](%[svm]), %%r13 \n\t"
1507 "mov %c[r14](%[svm]), %%r14 \n\t"
1508 "mov %c[r15](%[svm]), %%r15 \n\t"
1509#else
1510 "mov %c[rbx](%[svm]), %%ebx \n\t"
1511 "mov %c[rcx](%[svm]), %%ecx \n\t"
1512 "mov %c[rdx](%[svm]), %%edx \n\t"
1513 "mov %c[rsi](%[svm]), %%esi \n\t"
1514 "mov %c[rdi](%[svm]), %%edi \n\t"
1515 "mov %c[rbp](%[svm]), %%ebp \n\t"
1516#endif
1517
1518#ifdef CONFIG_X86_64
1519 /* Enter guest mode */
1520 "push %%rax \n\t"
1521 "mov %c[vmcb](%[svm]), %%rax \n\t"
1522 SVM_VMLOAD "\n\t"
1523 SVM_VMRUN "\n\t"
1524 SVM_VMSAVE "\n\t"
1525 "pop %%rax \n\t"
1526#else
1527 /* Enter guest mode */
1528 "push %%eax \n\t"
1529 "mov %c[vmcb](%[svm]), %%eax \n\t"
1530 SVM_VMLOAD "\n\t"
1531 SVM_VMRUN "\n\t"
1532 SVM_VMSAVE "\n\t"
1533 "pop %%eax \n\t"
1534#endif
1535
1536 /* Save guest registers, load host registers */
1537#ifdef CONFIG_X86_64
1538 "mov %%rbx, %c[rbx](%[svm]) \n\t"
1539 "mov %%rcx, %c[rcx](%[svm]) \n\t"
1540 "mov %%rdx, %c[rdx](%[svm]) \n\t"
1541 "mov %%rsi, %c[rsi](%[svm]) \n\t"
1542 "mov %%rdi, %c[rdi](%[svm]) \n\t"
1543 "mov %%rbp, %c[rbp](%[svm]) \n\t"
1544 "mov %%r8, %c[r8](%[svm]) \n\t"
1545 "mov %%r9, %c[r9](%[svm]) \n\t"
1546 "mov %%r10, %c[r10](%[svm]) \n\t"
1547 "mov %%r11, %c[r11](%[svm]) \n\t"
1548 "mov %%r12, %c[r12](%[svm]) \n\t"
1549 "mov %%r13, %c[r13](%[svm]) \n\t"
1550 "mov %%r14, %c[r14](%[svm]) \n\t"
1551 "mov %%r15, %c[r15](%[svm]) \n\t"
1552
1553 "pop %%rbp; \n\t"
1554#else
1555 "mov %%ebx, %c[rbx](%[svm]) \n\t"
1556 "mov %%ecx, %c[rcx](%[svm]) \n\t"
1557 "mov %%edx, %c[rdx](%[svm]) \n\t"
1558 "mov %%esi, %c[rsi](%[svm]) \n\t"
1559 "mov %%edi, %c[rdi](%[svm]) \n\t"
1560 "mov %%ebp, %c[rbp](%[svm]) \n\t"
1561
1562 "pop %%ebp; \n\t"
1563#endif
1564 :
1565 : [svm]"a"(svm),
1566 [vmcb]"i"(offsetof(struct vcpu_svm, vmcb_pa)),
1567 [rbx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RBX])),
1568 [rcx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RCX])),
1569 [rdx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RDX])),
1570 [rsi]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RSI])),
1571 [rdi]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RDI])),
1572 [rbp]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RBP]))
1573#ifdef CONFIG_X86_64
1574 , [r8]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R8])),
1575 [r9]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R9])),
1576 [r10]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R10])),
1577 [r11]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R11])),
1578 [r12]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R12])),
1579 [r13]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R13])),
1580 [r14]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R14])),
1581 [r15]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R15]))
1582#endif
1583 : "cc", "memory"
1584#ifdef CONFIG_X86_64
1585 , "rbx", "rcx", "rdx", "rsi", "rdi"
1586 , "r8", "r9", "r10", "r11" , "r12", "r13", "r14", "r15"
1587#else
1588 , "ebx", "ecx", "edx" , "esi", "edi"
1589#endif
1590 );
1591
1592 if ((svm->vmcb->save.dr7 & 0xff))
1593 load_db_regs(svm->host_db_regs);
1594
1595 vcpu->arch.cr2 = svm->vmcb->save.cr2;
1596
1597 write_dr6(svm->host_dr6);
1598 write_dr7(svm->host_dr7);
1599 kvm_write_cr2(svm->host_cr2);
1600
1601 load_fs(fs_selector);
1602 load_gs(gs_selector);
1603 load_ldt(ldt_selector);
1604 load_host_msrs(vcpu);
1605
1606 reload_tss(vcpu);
1607
1608 local_irq_disable();
1609
1610 stgi();
1611
1612 svm->next_rip = 0;
1613}
1614
1615static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root)
1616{
1617 struct vcpu_svm *svm = to_svm(vcpu);
1618
1619 svm->vmcb->save.cr3 = root;
1620 force_new_asid(vcpu);
1621
1622 if (vcpu->fpu_active) {
1623 svm->vmcb->control.intercept_exceptions |= (1 << NM_VECTOR);
1624 svm->vmcb->save.cr0 |= X86_CR0_TS;
1625 vcpu->fpu_active = 0;
1626 }
1627}
1628
1629static int is_disabled(void)
1630{
1631 u64 vm_cr;
1632
1633 rdmsrl(MSR_VM_CR, vm_cr);
1634 if (vm_cr & (1 << SVM_VM_CR_SVM_DISABLE))
1635 return 1;
1636
1637 return 0;
1638}
1639
1640static void
1641svm_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
1642{
1643 /*
1644 * Patch in the VMMCALL instruction:
1645 */
1646 hypercall[0] = 0x0f;
1647 hypercall[1] = 0x01;
1648 hypercall[2] = 0xd9;
1649}
1650
1651static void svm_check_processor_compat(void *rtn)
1652{
1653 *(int *)rtn = 0;
1654}
1655
1656static bool svm_cpu_has_accelerated_tpr(void)
1657{
1658 return false;
1659}
1660
1661static struct kvm_x86_ops svm_x86_ops = {
1662 .cpu_has_kvm_support = has_svm,
1663 .disabled_by_bios = is_disabled,
1664 .hardware_setup = svm_hardware_setup,
1665 .hardware_unsetup = svm_hardware_unsetup,
1666 .check_processor_compatibility = svm_check_processor_compat,
1667 .hardware_enable = svm_hardware_enable,
1668 .hardware_disable = svm_hardware_disable,
1669 .cpu_has_accelerated_tpr = svm_cpu_has_accelerated_tpr,
1670
1671 .vcpu_create = svm_create_vcpu,
1672 .vcpu_free = svm_free_vcpu,
1673 .vcpu_reset = svm_vcpu_reset,
1674
1675 .prepare_guest_switch = svm_prepare_guest_switch,
1676 .vcpu_load = svm_vcpu_load,
1677 .vcpu_put = svm_vcpu_put,
1678 .vcpu_decache = svm_vcpu_decache,
1679
1680 .set_guest_debug = svm_guest_debug,
1681 .get_msr = svm_get_msr,
1682 .set_msr = svm_set_msr,
1683 .get_segment_base = svm_get_segment_base,
1684 .get_segment = svm_get_segment,
1685 .set_segment = svm_set_segment,
1686 .get_cs_db_l_bits = kvm_get_cs_db_l_bits,
1687 .decache_cr4_guest_bits = svm_decache_cr4_guest_bits,
1688 .set_cr0 = svm_set_cr0,
1689 .set_cr3 = svm_set_cr3,
1690 .set_cr4 = svm_set_cr4,
1691 .set_efer = svm_set_efer,
1692 .get_idt = svm_get_idt,
1693 .set_idt = svm_set_idt,
1694 .get_gdt = svm_get_gdt,
1695 .set_gdt = svm_set_gdt,
1696 .get_dr = svm_get_dr,
1697 .set_dr = svm_set_dr,
1698 .cache_regs = svm_cache_regs,
1699 .decache_regs = svm_decache_regs,
1700 .get_rflags = svm_get_rflags,
1701 .set_rflags = svm_set_rflags,
1702
1703 .tlb_flush = svm_flush_tlb,
1704
1705 .run = svm_vcpu_run,
1706 .handle_exit = handle_exit,
1707 .skip_emulated_instruction = skip_emulated_instruction,
1708 .patch_hypercall = svm_patch_hypercall,
1709 .get_irq = svm_get_irq,
1710 .set_irq = svm_set_irq,
1711 .queue_exception = svm_queue_exception,
1712 .exception_injected = svm_exception_injected,
1713 .inject_pending_irq = svm_intr_assist,
1714 .inject_pending_vectors = do_interrupt_requests,
1715
1716 .set_tss_addr = svm_set_tss_addr,
1717};
1718
1719static int __init svm_init(void)
1720{
1721 return kvm_init(&svm_x86_ops, sizeof(struct vcpu_svm),
1722 THIS_MODULE);
1723}
1724
1725static void __exit svm_exit(void)
1726{
1727 kvm_exit();
1728}
1729
1730module_init(svm_init)
1731module_exit(svm_exit)
diff --git a/arch/x86/kvm/svm.h b/arch/x86/kvm/svm.h
new file mode 100644
index 000000000000..5fd50491b555
--- /dev/null
+++ b/arch/x86/kvm/svm.h
@@ -0,0 +1,325 @@
1#ifndef __SVM_H
2#define __SVM_H
3
4enum {
5 INTERCEPT_INTR,
6 INTERCEPT_NMI,
7 INTERCEPT_SMI,
8 INTERCEPT_INIT,
9 INTERCEPT_VINTR,
10 INTERCEPT_SELECTIVE_CR0,
11 INTERCEPT_STORE_IDTR,
12 INTERCEPT_STORE_GDTR,
13 INTERCEPT_STORE_LDTR,
14 INTERCEPT_STORE_TR,
15 INTERCEPT_LOAD_IDTR,
16 INTERCEPT_LOAD_GDTR,
17 INTERCEPT_LOAD_LDTR,
18 INTERCEPT_LOAD_TR,
19 INTERCEPT_RDTSC,
20 INTERCEPT_RDPMC,
21 INTERCEPT_PUSHF,
22 INTERCEPT_POPF,
23 INTERCEPT_CPUID,
24 INTERCEPT_RSM,
25 INTERCEPT_IRET,
26 INTERCEPT_INTn,
27 INTERCEPT_INVD,
28 INTERCEPT_PAUSE,
29 INTERCEPT_HLT,
30 INTERCEPT_INVLPG,
31 INTERCEPT_INVLPGA,
32 INTERCEPT_IOIO_PROT,
33 INTERCEPT_MSR_PROT,
34 INTERCEPT_TASK_SWITCH,
35 INTERCEPT_FERR_FREEZE,
36 INTERCEPT_SHUTDOWN,
37 INTERCEPT_VMRUN,
38 INTERCEPT_VMMCALL,
39 INTERCEPT_VMLOAD,
40 INTERCEPT_VMSAVE,
41 INTERCEPT_STGI,
42 INTERCEPT_CLGI,
43 INTERCEPT_SKINIT,
44 INTERCEPT_RDTSCP,
45 INTERCEPT_ICEBP,
46 INTERCEPT_WBINVD,
47 INTERCEPT_MONITOR,
48 INTERCEPT_MWAIT,
49 INTERCEPT_MWAIT_COND,
50};
51
52
53struct __attribute__ ((__packed__)) vmcb_control_area {
54 u16 intercept_cr_read;
55 u16 intercept_cr_write;
56 u16 intercept_dr_read;
57 u16 intercept_dr_write;
58 u32 intercept_exceptions;
59 u64 intercept;
60 u8 reserved_1[44];
61 u64 iopm_base_pa;
62 u64 msrpm_base_pa;
63 u64 tsc_offset;
64 u32 asid;
65 u8 tlb_ctl;
66 u8 reserved_2[3];
67 u32 int_ctl;
68 u32 int_vector;
69 u32 int_state;
70 u8 reserved_3[4];
71 u32 exit_code;
72 u32 exit_code_hi;
73 u64 exit_info_1;
74 u64 exit_info_2;
75 u32 exit_int_info;
76 u32 exit_int_info_err;
77 u64 nested_ctl;
78 u8 reserved_4[16];
79 u32 event_inj;
80 u32 event_inj_err;
81 u64 nested_cr3;
82 u64 lbr_ctl;
83 u8 reserved_5[832];
84};
85
86
87#define TLB_CONTROL_DO_NOTHING 0
88#define TLB_CONTROL_FLUSH_ALL_ASID 1
89
90#define V_TPR_MASK 0x0f
91
92#define V_IRQ_SHIFT 8
93#define V_IRQ_MASK (1 << V_IRQ_SHIFT)
94
95#define V_INTR_PRIO_SHIFT 16
96#define V_INTR_PRIO_MASK (0x0f << V_INTR_PRIO_SHIFT)
97
98#define V_IGN_TPR_SHIFT 20
99#define V_IGN_TPR_MASK (1 << V_IGN_TPR_SHIFT)
100
101#define V_INTR_MASKING_SHIFT 24
102#define V_INTR_MASKING_MASK (1 << V_INTR_MASKING_SHIFT)
103
104#define SVM_INTERRUPT_SHADOW_MASK 1
105
106#define SVM_IOIO_STR_SHIFT 2
107#define SVM_IOIO_REP_SHIFT 3
108#define SVM_IOIO_SIZE_SHIFT 4
109#define SVM_IOIO_ASIZE_SHIFT 7
110
111#define SVM_IOIO_TYPE_MASK 1
112#define SVM_IOIO_STR_MASK (1 << SVM_IOIO_STR_SHIFT)
113#define SVM_IOIO_REP_MASK (1 << SVM_IOIO_REP_SHIFT)
114#define SVM_IOIO_SIZE_MASK (7 << SVM_IOIO_SIZE_SHIFT)
115#define SVM_IOIO_ASIZE_MASK (7 << SVM_IOIO_ASIZE_SHIFT)
116
117struct __attribute__ ((__packed__)) vmcb_seg {
118 u16 selector;
119 u16 attrib;
120 u32 limit;
121 u64 base;
122};
123
124struct __attribute__ ((__packed__)) vmcb_save_area {
125 struct vmcb_seg es;
126 struct vmcb_seg cs;
127 struct vmcb_seg ss;
128 struct vmcb_seg ds;
129 struct vmcb_seg fs;
130 struct vmcb_seg gs;
131 struct vmcb_seg gdtr;
132 struct vmcb_seg ldtr;
133 struct vmcb_seg idtr;
134 struct vmcb_seg tr;
135 u8 reserved_1[43];
136 u8 cpl;
137 u8 reserved_2[4];
138 u64 efer;
139 u8 reserved_3[112];
140 u64 cr4;
141 u64 cr3;
142 u64 cr0;
143 u64 dr7;
144 u64 dr6;
145 u64 rflags;
146 u64 rip;
147 u8 reserved_4[88];
148 u64 rsp;
149 u8 reserved_5[24];
150 u64 rax;
151 u64 star;
152 u64 lstar;
153 u64 cstar;
154 u64 sfmask;
155 u64 kernel_gs_base;
156 u64 sysenter_cs;
157 u64 sysenter_esp;
158 u64 sysenter_eip;
159 u64 cr2;
160 u8 reserved_6[32];
161 u64 g_pat;
162 u64 dbgctl;
163 u64 br_from;
164 u64 br_to;
165 u64 last_excp_from;
166 u64 last_excp_to;
167};
168
169struct __attribute__ ((__packed__)) vmcb {
170 struct vmcb_control_area control;
171 struct vmcb_save_area save;
172};
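/*
 * Note on the layout above: the reserved padding sizes
 * struct vmcb_control_area to exactly 0x400 bytes, so the state save
 * area always begins at offset 0x400 of the 4KB VMCB page, matching the
 * layout the SVM hardware expects.
 */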
173
174#define SVM_CPUID_FEATURE_SHIFT 2
175#define SVM_CPUID_FUNC 0x8000000a
176
177#define MSR_EFER_SVME_MASK (1ULL << 12)
178#define MSR_VM_CR 0xc0010114
179#define MSR_VM_HSAVE_PA 0xc0010117ULL
180
181#define SVM_VM_CR_SVM_DISABLE 4
182
183#define SVM_SELECTOR_S_SHIFT 4
184#define SVM_SELECTOR_DPL_SHIFT 5
185#define SVM_SELECTOR_P_SHIFT 7
186#define SVM_SELECTOR_AVL_SHIFT 8
187#define SVM_SELECTOR_L_SHIFT 9
188#define SVM_SELECTOR_DB_SHIFT 10
189#define SVM_SELECTOR_G_SHIFT 11
190
191#define SVM_SELECTOR_TYPE_MASK (0xf)
192#define SVM_SELECTOR_S_MASK (1 << SVM_SELECTOR_S_SHIFT)
193#define SVM_SELECTOR_DPL_MASK (3 << SVM_SELECTOR_DPL_SHIFT)
194#define SVM_SELECTOR_P_MASK (1 << SVM_SELECTOR_P_SHIFT)
195#define SVM_SELECTOR_AVL_MASK (1 << SVM_SELECTOR_AVL_SHIFT)
196#define SVM_SELECTOR_L_MASK (1 << SVM_SELECTOR_L_SHIFT)
197#define SVM_SELECTOR_DB_MASK (1 << SVM_SELECTOR_DB_SHIFT)
198#define SVM_SELECTOR_G_MASK (1 << SVM_SELECTOR_G_SHIFT)
199
200#define SVM_SELECTOR_WRITE_MASK (1 << 1)
201#define SVM_SELECTOR_READ_MASK SVM_SELECTOR_WRITE_MASK
202#define SVM_SELECTOR_CODE_MASK (1 << 3)
203
204#define INTERCEPT_CR0_MASK 1
205#define INTERCEPT_CR3_MASK (1 << 3)
206#define INTERCEPT_CR4_MASK (1 << 4)
207#define INTERCEPT_CR8_MASK (1 << 8)
208
209#define INTERCEPT_DR0_MASK 1
210#define INTERCEPT_DR1_MASK (1 << 1)
211#define INTERCEPT_DR2_MASK (1 << 2)
212#define INTERCEPT_DR3_MASK (1 << 3)
213#define INTERCEPT_DR4_MASK (1 << 4)
214#define INTERCEPT_DR5_MASK (1 << 5)
215#define INTERCEPT_DR6_MASK (1 << 6)
216#define INTERCEPT_DR7_MASK (1 << 7)
217
218#define SVM_EVTINJ_VEC_MASK 0xff
219
220#define SVM_EVTINJ_TYPE_SHIFT 8
221#define SVM_EVTINJ_TYPE_MASK (7 << SVM_EVTINJ_TYPE_SHIFT)
222
223#define SVM_EVTINJ_TYPE_INTR (0 << SVM_EVTINJ_TYPE_SHIFT)
224#define SVM_EVTINJ_TYPE_NMI (2 << SVM_EVTINJ_TYPE_SHIFT)
225#define SVM_EVTINJ_TYPE_EXEPT (3 << SVM_EVTINJ_TYPE_SHIFT)
226#define SVM_EVTINJ_TYPE_SOFT (4 << SVM_EVTINJ_TYPE_SHIFT)
227
228#define SVM_EVTINJ_VALID (1 << 31)
229#define SVM_EVTINJ_VALID_ERR (1 << 11)
230
231#define SVM_EXITINTINFO_VEC_MASK SVM_EVTINJ_VEC_MASK
232
233#define SVM_EXITINTINFO_TYPE_INTR SVM_EVTINJ_TYPE_INTR
234#define SVM_EXITINTINFO_TYPE_NMI SVM_EVTINJ_TYPE_NMI
235#define SVM_EXITINTINFO_TYPE_EXEPT SVM_EVTINJ_TYPE_EXEPT
236#define SVM_EXITINTINFO_TYPE_SOFT SVM_EVTINJ_TYPE_SOFT
237
238#define SVM_EXITINTINFO_VALID SVM_EVTINJ_VALID
239#define SVM_EXITINTINFO_VALID_ERR SVM_EVTINJ_VALID_ERR
240
241#define SVM_EXIT_READ_CR0 0x000
242#define SVM_EXIT_READ_CR3 0x003
243#define SVM_EXIT_READ_CR4 0x004
244#define SVM_EXIT_READ_CR8 0x008
245#define SVM_EXIT_WRITE_CR0 0x010
246#define SVM_EXIT_WRITE_CR3 0x013
247#define SVM_EXIT_WRITE_CR4 0x014
248#define SVM_EXIT_WRITE_CR8 0x018
249#define SVM_EXIT_READ_DR0 0x020
250#define SVM_EXIT_READ_DR1 0x021
251#define SVM_EXIT_READ_DR2 0x022
252#define SVM_EXIT_READ_DR3 0x023
253#define SVM_EXIT_READ_DR4 0x024
254#define SVM_EXIT_READ_DR5 0x025
255#define SVM_EXIT_READ_DR6 0x026
256#define SVM_EXIT_READ_DR7 0x027
257#define SVM_EXIT_WRITE_DR0 0x030
258#define SVM_EXIT_WRITE_DR1 0x031
259#define SVM_EXIT_WRITE_DR2 0x032
260#define SVM_EXIT_WRITE_DR3 0x033
261#define SVM_EXIT_WRITE_DR4 0x034
262#define SVM_EXIT_WRITE_DR5 0x035
263#define SVM_EXIT_WRITE_DR6 0x036
264#define SVM_EXIT_WRITE_DR7 0x037
265#define SVM_EXIT_EXCP_BASE 0x040
266#define SVM_EXIT_INTR 0x060
267#define SVM_EXIT_NMI 0x061
268#define SVM_EXIT_SMI 0x062
269#define SVM_EXIT_INIT 0x063
270#define SVM_EXIT_VINTR 0x064
271#define SVM_EXIT_CR0_SEL_WRITE 0x065
272#define SVM_EXIT_IDTR_READ 0x066
273#define SVM_EXIT_GDTR_READ 0x067
274#define SVM_EXIT_LDTR_READ 0x068
275#define SVM_EXIT_TR_READ 0x069
276#define SVM_EXIT_IDTR_WRITE 0x06a
277#define SVM_EXIT_GDTR_WRITE 0x06b
278#define SVM_EXIT_LDTR_WRITE 0x06c
279#define SVM_EXIT_TR_WRITE 0x06d
280#define SVM_EXIT_RDTSC 0x06e
281#define SVM_EXIT_RDPMC 0x06f
282#define SVM_EXIT_PUSHF 0x070
283#define SVM_EXIT_POPF 0x071
284#define SVM_EXIT_CPUID 0x072
285#define SVM_EXIT_RSM 0x073
286#define SVM_EXIT_IRET 0x074
287#define SVM_EXIT_SWINT 0x075
288#define SVM_EXIT_INVD 0x076
289#define SVM_EXIT_PAUSE 0x077
290#define SVM_EXIT_HLT 0x078
291#define SVM_EXIT_INVLPG 0x079
292#define SVM_EXIT_INVLPGA 0x07a
293#define SVM_EXIT_IOIO 0x07b
294#define SVM_EXIT_MSR 0x07c
295#define SVM_EXIT_TASK_SWITCH 0x07d
296#define SVM_EXIT_FERR_FREEZE 0x07e
297#define SVM_EXIT_SHUTDOWN 0x07f
298#define SVM_EXIT_VMRUN 0x080
299#define SVM_EXIT_VMMCALL 0x081
300#define SVM_EXIT_VMLOAD 0x082
301#define SVM_EXIT_VMSAVE 0x083
302#define SVM_EXIT_STGI 0x084
303#define SVM_EXIT_CLGI 0x085
304#define SVM_EXIT_SKINIT 0x086
305#define SVM_EXIT_RDTSCP 0x087
306#define SVM_EXIT_ICEBP 0x088
307#define SVM_EXIT_WBINVD 0x089
308#define SVM_EXIT_MONITOR 0x08a
309#define SVM_EXIT_MWAIT 0x08b
310#define SVM_EXIT_MWAIT_COND 0x08c
311#define SVM_EXIT_NPF 0x400
312
313#define SVM_EXIT_ERR -1
314
315#define SVM_CR0_SELECTIVE_MASK (1 << 3 | 1) /* TS and MP */
316
317#define SVM_VMLOAD ".byte 0x0f, 0x01, 0xda"
318#define SVM_VMRUN ".byte 0x0f, 0x01, 0xd8"
319#define SVM_VMSAVE ".byte 0x0f, 0x01, 0xdb"
320#define SVM_CLGI ".byte 0x0f, 0x01, 0xdd"
321#define SVM_STGI ".byte 0x0f, 0x01, 0xdc"
322#define SVM_INVLPGA ".byte 0x0f, 0x01, 0xdf"
323
324#endif
325
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
new file mode 100644
index 000000000000..ad36447e696e
--- /dev/null
+++ b/arch/x86/kvm/vmx.c
@@ -0,0 +1,2679 @@
1/*
2 * Kernel-based Virtual Machine driver for Linux
3 *
4 * This module enables machines with Intel VT-x extensions to run virtual
5 * machines without emulation or binary translation.
6 *
7 * Copyright (C) 2006 Qumranet, Inc.
8 *
9 * Authors:
10 * Avi Kivity <avi@qumranet.com>
11 * Yaniv Kamay <yaniv@qumranet.com>
12 *
13 * This work is licensed under the terms of the GNU GPL, version 2. See
14 * the COPYING file in the top-level directory.
15 *
16 */
17
18#include "irq.h"
19#include "vmx.h"
20#include "segment_descriptor.h"
21#include "mmu.h"
22
23#include <linux/kvm_host.h>
24#include <linux/module.h>
25#include <linux/kernel.h>
26#include <linux/mm.h>
27#include <linux/highmem.h>
28#include <linux/sched.h>
29#include <linux/moduleparam.h>
30
31#include <asm/io.h>
32#include <asm/desc.h>
33
34MODULE_AUTHOR("Qumranet");
35MODULE_LICENSE("GPL");
36
37static int bypass_guest_pf = 1;
38module_param(bypass_guest_pf, bool, 0);
39
40struct vmcs {
41 u32 revision_id;
42 u32 abort;
43 char data[0];
44};
45
46struct vcpu_vmx {
47 struct kvm_vcpu vcpu;
48 int launched;
49 u8 fail;
50 u32 idt_vectoring_info;
51 struct kvm_msr_entry *guest_msrs;
52 struct kvm_msr_entry *host_msrs;
53 int nmsrs;
54 int save_nmsrs;
55 int msr_offset_efer;
56#ifdef CONFIG_X86_64
57 int msr_offset_kernel_gs_base;
58#endif
59 struct vmcs *vmcs;
60 struct {
61 int loaded;
62 u16 fs_sel, gs_sel, ldt_sel;
63 int gs_ldt_reload_needed;
64 int fs_reload_needed;
65 int guest_efer_loaded;
66 } host_state;
67 struct {
68 struct {
69 bool pending;
70 u8 vector;
71 unsigned rip;
72 } irq;
73 } rmode;
74};
75
76static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
77{
78 return container_of(vcpu, struct vcpu_vmx, vcpu);
79}
80
81static int init_rmode_tss(struct kvm *kvm);
82
83static DEFINE_PER_CPU(struct vmcs *, vmxarea);
84static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
85
86static struct page *vmx_io_bitmap_a;
87static struct page *vmx_io_bitmap_b;
88
89static struct vmcs_config {
90 int size;
91 int order;
92 u32 revision_id;
93 u32 pin_based_exec_ctrl;
94 u32 cpu_based_exec_ctrl;
95 u32 cpu_based_2nd_exec_ctrl;
96 u32 vmexit_ctrl;
97 u32 vmentry_ctrl;
98} vmcs_config;
99
100#define VMX_SEGMENT_FIELD(seg) \
101 [VCPU_SREG_##seg] = { \
102 .selector = GUEST_##seg##_SELECTOR, \
103 .base = GUEST_##seg##_BASE, \
104 .limit = GUEST_##seg##_LIMIT, \
105 .ar_bytes = GUEST_##seg##_AR_BYTES, \
106 }
107
108static struct kvm_vmx_segment_field {
109 unsigned selector;
110 unsigned base;
111 unsigned limit;
112 unsigned ar_bytes;
113} kvm_vmx_segment_fields[] = {
114 VMX_SEGMENT_FIELD(CS),
115 VMX_SEGMENT_FIELD(DS),
116 VMX_SEGMENT_FIELD(ES),
117 VMX_SEGMENT_FIELD(FS),
118 VMX_SEGMENT_FIELD(GS),
119 VMX_SEGMENT_FIELD(SS),
120 VMX_SEGMENT_FIELD(TR),
121 VMX_SEGMENT_FIELD(LDTR),
122};
123
124/*
125 * Keep MSR_K6_STAR at the end, as setup_msrs() will try to optimize it
126 * away by decrementing the array size.
127 */
128static const u32 vmx_msr_index[] = {
129#ifdef CONFIG_X86_64
130 MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, MSR_KERNEL_GS_BASE,
131#endif
132 MSR_EFER, MSR_K6_STAR,
133};
134#define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index)
135
136static void load_msrs(struct kvm_msr_entry *e, int n)
137{
138 int i;
139
140 for (i = 0; i < n; ++i)
141 wrmsrl(e[i].index, e[i].data);
142}
143
144static void save_msrs(struct kvm_msr_entry *e, int n)
145{
146 int i;
147
148 for (i = 0; i < n; ++i)
149 rdmsrl(e[i].index, e[i].data);
150}
151
152static inline int is_page_fault(u32 intr_info)
153{
154 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
155 INTR_INFO_VALID_MASK)) ==
156 (INTR_TYPE_EXCEPTION | PF_VECTOR | INTR_INFO_VALID_MASK);
157}
158
159static inline int is_no_device(u32 intr_info)
160{
161 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
162 INTR_INFO_VALID_MASK)) ==
163 (INTR_TYPE_EXCEPTION | NM_VECTOR | INTR_INFO_VALID_MASK);
164}
165
166static inline int is_invalid_opcode(u32 intr_info)
167{
168 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
169 INTR_INFO_VALID_MASK)) ==
170 (INTR_TYPE_EXCEPTION | UD_VECTOR | INTR_INFO_VALID_MASK);
171}
172
173static inline int is_external_interrupt(u32 intr_info)
174{
175 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
176 == (INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK);
177}
178
179static inline int cpu_has_vmx_tpr_shadow(void)
180{
181 return (vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW);
182}
183
184static inline int vm_need_tpr_shadow(struct kvm *kvm)
185{
186 return ((cpu_has_vmx_tpr_shadow()) && (irqchip_in_kernel(kvm)));
187}
188
189static inline int cpu_has_secondary_exec_ctrls(void)
190{
191 return (vmcs_config.cpu_based_exec_ctrl &
192 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS);
193}
194
195static inline bool cpu_has_vmx_virtualize_apic_accesses(void)
196{
197 return (vmcs_config.cpu_based_2nd_exec_ctrl &
198 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
199}
200
201static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm)
202{
203 return ((cpu_has_vmx_virtualize_apic_accesses()) &&
204 (irqchip_in_kernel(kvm)));
205}
206
207static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
208{
209 int i;
210
211 for (i = 0; i < vmx->nmsrs; ++i)
212 if (vmx->guest_msrs[i].index == msr)
213 return i;
214 return -1;
215}
216
217static struct kvm_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr)
218{
219 int i;
220
221 i = __find_msr_index(vmx, msr);
222 if (i >= 0)
223 return &vmx->guest_msrs[i];
224 return NULL;
225}
226
227static void vmcs_clear(struct vmcs *vmcs)
228{
229 u64 phys_addr = __pa(vmcs);
230 u8 error;
231
232 asm volatile (ASM_VMX_VMCLEAR_RAX "; setna %0"
233 : "=g"(error) : "a"(&phys_addr), "m"(phys_addr)
234 : "cc", "memory");
235 if (error)
236 printk(KERN_ERR "kvm: vmclear fail: %p/%llx\n",
237 vmcs, phys_addr);
238}
239
240static void __vcpu_clear(void *arg)
241{
242 struct vcpu_vmx *vmx = arg;
243 int cpu = raw_smp_processor_id();
244
245 if (vmx->vcpu.cpu == cpu)
246 vmcs_clear(vmx->vmcs);
247 if (per_cpu(current_vmcs, cpu) == vmx->vmcs)
248 per_cpu(current_vmcs, cpu) = NULL;
249 rdtscll(vmx->vcpu.arch.host_tsc);
250}
251
252static void vcpu_clear(struct vcpu_vmx *vmx)
253{
254 if (vmx->vcpu.cpu == -1)
255 return;
256 smp_call_function_single(vmx->vcpu.cpu, __vcpu_clear, vmx, 0, 1);
257 vmx->launched = 0;
258}
259
260static unsigned long vmcs_readl(unsigned long field)
261{
262 unsigned long value;
263
264 asm volatile (ASM_VMX_VMREAD_RDX_RAX
265 : "=a"(value) : "d"(field) : "cc");
266 return value;
267}
268
269static u16 vmcs_read16(unsigned long field)
270{
271 return vmcs_readl(field);
272}
273
274static u32 vmcs_read32(unsigned long field)
275{
276 return vmcs_readl(field);
277}
278
279static u64 vmcs_read64(unsigned long field)
280{
281#ifdef CONFIG_X86_64
282 return vmcs_readl(field);
283#else
284 return vmcs_readl(field) | ((u64)vmcs_readl(field+1) << 32);
285#endif
286}
287
288static noinline void vmwrite_error(unsigned long field, unsigned long value)
289{
290 printk(KERN_ERR "vmwrite error: reg %lx value %lx (err %d)\n",
291 field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
292 dump_stack();
293}
294
295static void vmcs_writel(unsigned long field, unsigned long value)
296{
297 u8 error;
298
299 asm volatile (ASM_VMX_VMWRITE_RAX_RDX "; setna %0"
300 : "=q"(error) : "a"(value), "d"(field) : "cc");
301 if (unlikely(error))
302 vmwrite_error(field, value);
303}
304
305static void vmcs_write16(unsigned long field, u16 value)
306{
307 vmcs_writel(field, value);
308}
309
310static void vmcs_write32(unsigned long field, u32 value)
311{
312 vmcs_writel(field, value);
313}
314
315static void vmcs_write64(unsigned long field, u64 value)
316{
317#ifdef CONFIG_X86_64
318 vmcs_writel(field, value);
319#else
320 vmcs_writel(field, value);
321 asm volatile ("");
322 vmcs_writel(field+1, value >> 32);
323#endif
324}
325
326static void vmcs_clear_bits(unsigned long field, u32 mask)
327{
328 vmcs_writel(field, vmcs_readl(field) & ~mask);
329}
330
331static void vmcs_set_bits(unsigned long field, u32 mask)
332{
333 vmcs_writel(field, vmcs_readl(field) | mask);
334}
335
336static void update_exception_bitmap(struct kvm_vcpu *vcpu)
337{
338 u32 eb;
339
340 eb = (1u << PF_VECTOR) | (1u << UD_VECTOR);
341 if (!vcpu->fpu_active)
342 eb |= 1u << NM_VECTOR;
343 if (vcpu->guest_debug.enabled)
344 eb |= 1u << 1;
345 if (vcpu->arch.rmode.active)
346 eb = ~0;
347 vmcs_write32(EXCEPTION_BITMAP, eb);
348}
349
350static void reload_tss(void)
351{
352#ifndef CONFIG_X86_64
353
354 /*
355 * VT restores TR but not its size. Useless.
356 */
357 struct descriptor_table gdt;
358 struct segment_descriptor *descs;
359
360 get_gdt(&gdt);
361 descs = (void *)gdt.base;
362 descs[GDT_ENTRY_TSS].type = 9; /* available TSS */
363 load_TR_desc();
364#endif
365}
366
367static void load_transition_efer(struct vcpu_vmx *vmx)
368{
369 int efer_offset = vmx->msr_offset_efer;
370 u64 host_efer = vmx->host_msrs[efer_offset].data;
371 u64 guest_efer = vmx->guest_msrs[efer_offset].data;
372 u64 ignore_bits;
373
374 if (efer_offset < 0)
375 return;
376 /*
377 * NX is emulated; LMA and LME handled by hardware; SCE meaningless
378 * outside long mode
379 */
380 ignore_bits = EFER_NX | EFER_SCE;
381#ifdef CONFIG_X86_64
382 ignore_bits |= EFER_LMA | EFER_LME;
383 /* SCE is meaningful only in long mode on Intel */
384 if (guest_efer & EFER_LMA)
385 ignore_bits &= ~(u64)EFER_SCE;
386#endif
387 if ((guest_efer & ~ignore_bits) == (host_efer & ~ignore_bits))
388 return;
389
390 vmx->host_state.guest_efer_loaded = 1;
391 guest_efer &= ~ignore_bits;
392 guest_efer |= host_efer & ignore_bits;
393 wrmsrl(MSR_EFER, guest_efer);
394 vmx->vcpu.stat.efer_reload++;
395}
396
397static void reload_host_efer(struct vcpu_vmx *vmx)
398{
399 if (vmx->host_state.guest_efer_loaded) {
400 vmx->host_state.guest_efer_loaded = 0;
401 load_msrs(vmx->host_msrs + vmx->msr_offset_efer, 1);
402 }
403}
404
405static void vmx_save_host_state(struct kvm_vcpu *vcpu)
406{
407 struct vcpu_vmx *vmx = to_vmx(vcpu);
408
409 if (vmx->host_state.loaded)
410 return;
411
412 vmx->host_state.loaded = 1;
413 /*
414 * Set host fs and gs selectors. Unfortunately, 22.2.3 does not
415 * allow segment selectors with cpl > 0 or ti == 1.
416 */
417 vmx->host_state.ldt_sel = read_ldt();
418 vmx->host_state.gs_ldt_reload_needed = vmx->host_state.ldt_sel;
419 vmx->host_state.fs_sel = read_fs();
420 if (!(vmx->host_state.fs_sel & 7)) {
421 vmcs_write16(HOST_FS_SELECTOR, vmx->host_state.fs_sel);
422 vmx->host_state.fs_reload_needed = 0;
423 } else {
424 vmcs_write16(HOST_FS_SELECTOR, 0);
425 vmx->host_state.fs_reload_needed = 1;
426 }
427 vmx->host_state.gs_sel = read_gs();
428 if (!(vmx->host_state.gs_sel & 7))
429 vmcs_write16(HOST_GS_SELECTOR, vmx->host_state.gs_sel);
430 else {
431 vmcs_write16(HOST_GS_SELECTOR, 0);
432 vmx->host_state.gs_ldt_reload_needed = 1;
433 }
434
435#ifdef CONFIG_X86_64
436 vmcs_writel(HOST_FS_BASE, read_msr(MSR_FS_BASE));
437 vmcs_writel(HOST_GS_BASE, read_msr(MSR_GS_BASE));
438#else
439 vmcs_writel(HOST_FS_BASE, segment_base(vmx->host_state.fs_sel));
440 vmcs_writel(HOST_GS_BASE, segment_base(vmx->host_state.gs_sel));
441#endif
442
443#ifdef CONFIG_X86_64
444 if (is_long_mode(&vmx->vcpu))
445 save_msrs(vmx->host_msrs +
446 vmx->msr_offset_kernel_gs_base, 1);
447
448#endif
449 load_msrs(vmx->guest_msrs, vmx->save_nmsrs);
450 load_transition_efer(vmx);
451}
452
453static void vmx_load_host_state(struct vcpu_vmx *vmx)
454{
455 unsigned long flags;
456
457 if (!vmx->host_state.loaded)
458 return;
459
460 ++vmx->vcpu.stat.host_state_reload;
461 vmx->host_state.loaded = 0;
462 if (vmx->host_state.fs_reload_needed)
463 load_fs(vmx->host_state.fs_sel);
464 if (vmx->host_state.gs_ldt_reload_needed) {
465 load_ldt(vmx->host_state.ldt_sel);
466 /*
467 * If we have to reload gs, we must take care to
468 * preserve our gs base.
469 */
470 local_irq_save(flags);
471 load_gs(vmx->host_state.gs_sel);
472#ifdef CONFIG_X86_64
473 wrmsrl(MSR_GS_BASE, vmcs_readl(HOST_GS_BASE));
474#endif
475 local_irq_restore(flags);
476 }
477 reload_tss();
478 save_msrs(vmx->guest_msrs, vmx->save_nmsrs);
479 load_msrs(vmx->host_msrs, vmx->save_nmsrs);
480 reload_host_efer(vmx);
481}
482
483/*
484 * Switches to specified vcpu, until a matching vcpu_put(), but assumes
485 * vcpu mutex is already taken.
486 */
487static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
488{
489 struct vcpu_vmx *vmx = to_vmx(vcpu);
490 u64 phys_addr = __pa(vmx->vmcs);
491 u64 tsc_this, delta;
492
493 if (vcpu->cpu != cpu) {
494 vcpu_clear(vmx);
495 kvm_migrate_apic_timer(vcpu);
496 }
497
498 if (per_cpu(current_vmcs, cpu) != vmx->vmcs) {
499 u8 error;
500
501 per_cpu(current_vmcs, cpu) = vmx->vmcs;
502 asm volatile (ASM_VMX_VMPTRLD_RAX "; setna %0"
503 : "=g"(error) : "a"(&phys_addr), "m"(phys_addr)
504 : "cc");
505 if (error)
506 printk(KERN_ERR "kvm: vmptrld %p/%llx fail\n",
507 vmx->vmcs, phys_addr);
508 }
509
510 if (vcpu->cpu != cpu) {
511 struct descriptor_table dt;
512 unsigned long sysenter_esp;
513
514 vcpu->cpu = cpu;
515 /*
516 * Linux uses per-cpu TSS and GDT, so set these when switching
517 * processors.
518 */
519 vmcs_writel(HOST_TR_BASE, read_tr_base()); /* 22.2.4 */
520 get_gdt(&dt);
521 vmcs_writel(HOST_GDTR_BASE, dt.base); /* 22.2.4 */
522
523 rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
524 vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
525
526 /*
527 * Make sure the time stamp counter is monotonic.
528 */
529 rdtscll(tsc_this);
530 delta = vcpu->arch.host_tsc - tsc_this;
531 vmcs_write64(TSC_OFFSET, vmcs_read64(TSC_OFFSET) + delta);
532 }
533}
534
535static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
536{
537 vmx_load_host_state(to_vmx(vcpu));
538}
539
540static void vmx_fpu_activate(struct kvm_vcpu *vcpu)
541{
542 if (vcpu->fpu_active)
543 return;
544 vcpu->fpu_active = 1;
545 vmcs_clear_bits(GUEST_CR0, X86_CR0_TS);
546 if (vcpu->arch.cr0 & X86_CR0_TS)
547 vmcs_set_bits(GUEST_CR0, X86_CR0_TS);
548 update_exception_bitmap(vcpu);
549}
550
551static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu)
552{
553 if (!vcpu->fpu_active)
554 return;
555 vcpu->fpu_active = 0;
556 vmcs_set_bits(GUEST_CR0, X86_CR0_TS);
557 update_exception_bitmap(vcpu);
558}
559
560static void vmx_vcpu_decache(struct kvm_vcpu *vcpu)
561{
562 vcpu_clear(to_vmx(vcpu));
563}
564
565static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
566{
567 return vmcs_readl(GUEST_RFLAGS);
568}
569
570static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
571{
572 if (vcpu->arch.rmode.active)
573 rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
574 vmcs_writel(GUEST_RFLAGS, rflags);
575}
576
577static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
578{
579 unsigned long rip;
580 u32 interruptibility;
581
582 rip = vmcs_readl(GUEST_RIP);
583 rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
584 vmcs_writel(GUEST_RIP, rip);
585
586 /*
587 * We emulated an instruction, so temporary interrupt blocking
588 * should be removed, if set.
589 */
590 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
591 if (interruptibility & 3)
592 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
593 interruptibility & ~3);
594 vcpu->arch.interrupt_window_open = 1;
595}
596
597static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
598 bool has_error_code, u32 error_code)
599{
600 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
601 nr | INTR_TYPE_EXCEPTION
602 | (has_error_code ? INTR_INFO_DELIEVER_CODE_MASK : 0)
603 | INTR_INFO_VALID_MASK);
604 if (has_error_code)
605 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
606}
607
608static bool vmx_exception_injected(struct kvm_vcpu *vcpu)
609{
610 struct vcpu_vmx *vmx = to_vmx(vcpu);
611
612 return !(vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK);
613}
614
615/*
616 * Swap MSR entry in host/guest MSR entry array.
617 */
618#ifdef CONFIG_X86_64
619static void move_msr_up(struct vcpu_vmx *vmx, int from, int to)
620{
621 struct kvm_msr_entry tmp;
622
623 tmp = vmx->guest_msrs[to];
624 vmx->guest_msrs[to] = vmx->guest_msrs[from];
625 vmx->guest_msrs[from] = tmp;
626 tmp = vmx->host_msrs[to];
627 vmx->host_msrs[to] = vmx->host_msrs[from];
628 vmx->host_msrs[from] = tmp;
629}
630#endif
631
632/*
633 * Set up the vmcs to automatically save and restore system
634 * msrs. Don't touch the 64-bit msrs if the guest is in legacy
635 * mode, as fiddling with msrs is very expensive.
636 */
637static void setup_msrs(struct vcpu_vmx *vmx)
638{
639 int save_nmsrs;
640
641 save_nmsrs = 0;
642#ifdef CONFIG_X86_64
643 if (is_long_mode(&vmx->vcpu)) {
644 int index;
645
646 index = __find_msr_index(vmx, MSR_SYSCALL_MASK);
647 if (index >= 0)
648 move_msr_up(vmx, index, save_nmsrs++);
649 index = __find_msr_index(vmx, MSR_LSTAR);
650 if (index >= 0)
651 move_msr_up(vmx, index, save_nmsrs++);
652 index = __find_msr_index(vmx, MSR_CSTAR);
653 if (index >= 0)
654 move_msr_up(vmx, index, save_nmsrs++);
655 index = __find_msr_index(vmx, MSR_KERNEL_GS_BASE);
656 if (index >= 0)
657 move_msr_up(vmx, index, save_nmsrs++);
658 /*
659 * MSR_K6_STAR is only needed on long mode guests, and only
660 * if efer.sce is enabled.
661 */
662 index = __find_msr_index(vmx, MSR_K6_STAR);
663 if ((index >= 0) && (vmx->vcpu.arch.shadow_efer & EFER_SCE))
664 move_msr_up(vmx, index, save_nmsrs++);
665 }
666#endif
667 vmx->save_nmsrs = save_nmsrs;
668
669#ifdef CONFIG_X86_64
670 vmx->msr_offset_kernel_gs_base =
671 __find_msr_index(vmx, MSR_KERNEL_GS_BASE);
672#endif
673 vmx->msr_offset_efer = __find_msr_index(vmx, MSR_EFER);
674}
675
676/*
677 * reads and returns guest's timestamp counter "register"
678 * guest_tsc = host_tsc + tsc_offset -- 21.3
679 */
680static u64 guest_read_tsc(void)
681{
682 u64 host_tsc, tsc_offset;
683
684 rdtscll(host_tsc);
685 tsc_offset = vmcs_read64(TSC_OFFSET);
686 return host_tsc + tsc_offset;
687}
688
689/*
690 * writes 'guest_tsc' into guest's timestamp counter "register"
691 * guest_tsc = host_tsc + tsc_offset ==> tsc_offset = guest_tsc - host_tsc
692 */
693static void guest_write_tsc(u64 guest_tsc)
694{
695 u64 host_tsc;
696
697 rdtscll(host_tsc);
698 vmcs_write64(TSC_OFFSET, guest_tsc - host_tsc);
699}
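/*
 * Worked example of the offsetting above (illustrative numbers only):
 * if the host TSC currently reads 1,000,000 and the guest should see a
 * TSC of 0, TSC_OFFSET is written as -1,000,000; a later host TSC of
 * 1,000,500 is then reported to the guest as 500.
 */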
700
701/*
702 * Reads an msr value (of 'msr_index') into 'pdata'.
703 * Returns 0 on success, non-0 otherwise.
704 * Assumes vcpu_load() was already called.
705 */
706static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
707{
708 u64 data;
709 struct kvm_msr_entry *msr;
710
711 if (!pdata) {
712 printk(KERN_ERR "BUG: get_msr called with NULL pdata\n");
713 return -EINVAL;
714 }
715
716 switch (msr_index) {
717#ifdef CONFIG_X86_64
718 case MSR_FS_BASE:
719 data = vmcs_readl(GUEST_FS_BASE);
720 break;
721 case MSR_GS_BASE:
722 data = vmcs_readl(GUEST_GS_BASE);
723 break;
724 case MSR_EFER:
725 return kvm_get_msr_common(vcpu, msr_index, pdata);
726#endif
727 case MSR_IA32_TIME_STAMP_COUNTER:
728 data = guest_read_tsc();
729 break;
730 case MSR_IA32_SYSENTER_CS:
731 data = vmcs_read32(GUEST_SYSENTER_CS);
732 break;
733 case MSR_IA32_SYSENTER_EIP:
734 data = vmcs_readl(GUEST_SYSENTER_EIP);
735 break;
736 case MSR_IA32_SYSENTER_ESP:
737 data = vmcs_readl(GUEST_SYSENTER_ESP);
738 break;
739 default:
740 msr = find_msr_entry(to_vmx(vcpu), msr_index);
741 if (msr) {
742 data = msr->data;
743 break;
744 }
745 return kvm_get_msr_common(vcpu, msr_index, pdata);
746 }
747
748 *pdata = data;
749 return 0;
750}
751
752/*
753 * Writes msr value into the appropriate "register".
754 * Returns 0 on success, non-0 otherwise.
755 * Assumes vcpu_load() was already called.
756 */
757static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
758{
759 struct vcpu_vmx *vmx = to_vmx(vcpu);
760 struct kvm_msr_entry *msr;
761 int ret = 0;
762
763 switch (msr_index) {
764#ifdef CONFIG_X86_64
765 case MSR_EFER:
766 ret = kvm_set_msr_common(vcpu, msr_index, data);
767 if (vmx->host_state.loaded) {
768 reload_host_efer(vmx);
769 load_transition_efer(vmx);
770 }
771 break;
772 case MSR_FS_BASE:
773 vmcs_writel(GUEST_FS_BASE, data);
774 break;
775 case MSR_GS_BASE:
776 vmcs_writel(GUEST_GS_BASE, data);
777 break;
778#endif
779 case MSR_IA32_SYSENTER_CS:
780 vmcs_write32(GUEST_SYSENTER_CS, data);
781 break;
782 case MSR_IA32_SYSENTER_EIP:
783 vmcs_writel(GUEST_SYSENTER_EIP, data);
784 break;
785 case MSR_IA32_SYSENTER_ESP:
786 vmcs_writel(GUEST_SYSENTER_ESP, data);
787 break;
788 case MSR_IA32_TIME_STAMP_COUNTER:
789 guest_write_tsc(data);
790 break;
791 default:
792 msr = find_msr_entry(vmx, msr_index);
793 if (msr) {
794 msr->data = data;
795 if (vmx->host_state.loaded)
796 load_msrs(vmx->guest_msrs, vmx->save_nmsrs);
797 break;
798 }
799 ret = kvm_set_msr_common(vcpu, msr_index, data);
800 }
801
802 return ret;
803}
804
805/*
806 * Sync the rsp and rip registers into the vcpu structure. This allows
807 * registers to be accessed by indexing vcpu->arch.regs.
808 */
809static void vcpu_load_rsp_rip(struct kvm_vcpu *vcpu)
810{
811 vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP);
812 vcpu->arch.rip = vmcs_readl(GUEST_RIP);
813}
814
815/*
816 * Syncs rsp and rip back into the vmcs. Should be called after possible
817 * modification.
818 */
819static void vcpu_put_rsp_rip(struct kvm_vcpu *vcpu)
820{
821 vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
822 vmcs_writel(GUEST_RIP, vcpu->arch.rip);
823}
824
825static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg)
826{
827 unsigned long dr7 = 0x400;
828 int old_singlestep;
829
830 old_singlestep = vcpu->guest_debug.singlestep;
831
832 vcpu->guest_debug.enabled = dbg->enabled;
833 if (vcpu->guest_debug.enabled) {
834 int i;
835
836 dr7 |= 0x200; /* exact */
837 for (i = 0; i < 4; ++i) {
838 if (!dbg->breakpoints[i].enabled)
839 continue;
840 vcpu->guest_debug.bp[i] = dbg->breakpoints[i].address;
841 dr7 |= 2 << (i*2); /* global enable */
842 dr7 |= 0 << (i*4+16); /* execution breakpoint */
843 }
844
845 vcpu->guest_debug.singlestep = dbg->singlestep;
846 } else
847 vcpu->guest_debug.singlestep = 0;
848
849 if (old_singlestep && !vcpu->guest_debug.singlestep) {
850 unsigned long flags;
851
852 flags = vmcs_readl(GUEST_RFLAGS);
853 flags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
854 vmcs_writel(GUEST_RFLAGS, flags);
855 }
856
857 update_exception_bitmap(vcpu);
858 vmcs_writel(GUEST_DR7, dr7);
859
860 return 0;
861}
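/*
 * DR7 encoding assumed above: bit 10 is reserved and must be set (hence
 * the 0x400 base value), bit 9 (0x200) is the global-exact bit, bit
 * 2*i+1 (2 << (i*2)) is the global enable for breakpoint i, and bits
 * 16+4*i..19+4*i hold the R/W and LEN fields, where 0 selects an
 * instruction-execution breakpoint.
 */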
862
863static int vmx_get_irq(struct kvm_vcpu *vcpu)
864{
865 struct vcpu_vmx *vmx = to_vmx(vcpu);
866 u32 idtv_info_field;
867
868 idtv_info_field = vmx->idt_vectoring_info;
869 if (idtv_info_field & INTR_INFO_VALID_MASK) {
870 if (is_external_interrupt(idtv_info_field))
871 return idtv_info_field & VECTORING_INFO_VECTOR_MASK;
872 else
873 printk(KERN_DEBUG "pending exception: not handled yet\n");
874 }
875 return -1;
876}
877
878static __init int cpu_has_kvm_support(void)
879{
880 unsigned long ecx = cpuid_ecx(1);
881 return test_bit(5, &ecx); /* CPUID.1:ECX.VMX[bit 5] -> VT */
882}
883
884static __init int vmx_disabled_by_bios(void)
885{
886 u64 msr;
887
888 rdmsrl(MSR_IA32_FEATURE_CONTROL, msr);
889 return (msr & (MSR_IA32_FEATURE_CONTROL_LOCKED |
890 MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED))
891 == MSR_IA32_FEATURE_CONTROL_LOCKED;
892 /* locked but not enabled */
893}
894
895static void hardware_enable(void *garbage)
896{
897 int cpu = raw_smp_processor_id();
898 u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
899 u64 old;
900
901 rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
902 if ((old & (MSR_IA32_FEATURE_CONTROL_LOCKED |
903 MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED))
904 != (MSR_IA32_FEATURE_CONTROL_LOCKED |
905 MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED))
906 /* enable and lock */
907 wrmsrl(MSR_IA32_FEATURE_CONTROL, old |
908 MSR_IA32_FEATURE_CONTROL_LOCKED |
909 MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED);
910 write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */
911 asm volatile (ASM_VMX_VMXON_RAX : : "a"(&phys_addr), "m"(phys_addr)
912 : "memory", "cc");
913}
914
915static void hardware_disable(void *garbage)
916{
917 asm volatile (ASM_VMX_VMXOFF : : : "cc");
918}
919
920static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
921 u32 msr, u32 *result)
922{
923 u32 vmx_msr_low, vmx_msr_high;
924 u32 ctl = ctl_min | ctl_opt;
925
926 rdmsr(msr, vmx_msr_low, vmx_msr_high);
927
928 ctl &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */
929 ctl |= vmx_msr_low; /* bit == 1 in low word ==> must be one */
930
931 /* Ensure minimum (required) set of control bits are supported. */
932 if (ctl_min & ~ctl)
933 return -EIO;
934
935 *result = ctl;
936 return 0;
937}
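/*
 * Worked example of the adjustment above, with made-up MSR values (the
 * capability MSR's high dword has a 0 for every bit that must be 0, the
 * low dword a 1 for every bit that must be 1):
 *
 *	ctl_min | ctl_opt = 0x0000001e	(desired controls)
 *	vmx_msr_high	  = 0x00000016	(bit 3 unsupported -> cleared)
 *	vmx_msr_low	  = 0x00000001	(bit 0 required -> set)
 *	result		  = 0x00000017
 *
 * If any ctl_min bit is missing from the result, the required
 * configuration cannot be run and adjust_vmx_controls() fails with -EIO.
 */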
938
939static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
940{
941 u32 vmx_msr_low, vmx_msr_high;
942 u32 min, opt;
943 u32 _pin_based_exec_control = 0;
944 u32 _cpu_based_exec_control = 0;
945 u32 _cpu_based_2nd_exec_control = 0;
946 u32 _vmexit_control = 0;
947 u32 _vmentry_control = 0;
948
949 min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING;
950 opt = 0;
951 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS,
952 &_pin_based_exec_control) < 0)
953 return -EIO;
954
955 min = CPU_BASED_HLT_EXITING |
956#ifdef CONFIG_X86_64
957 CPU_BASED_CR8_LOAD_EXITING |
958 CPU_BASED_CR8_STORE_EXITING |
959#endif
960 CPU_BASED_USE_IO_BITMAPS |
961 CPU_BASED_MOV_DR_EXITING |
962 CPU_BASED_USE_TSC_OFFSETING;
963 opt = CPU_BASED_TPR_SHADOW |
964 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
965 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS,
966 &_cpu_based_exec_control) < 0)
967 return -EIO;
968#ifdef CONFIG_X86_64
969 if ((_cpu_based_exec_control & CPU_BASED_TPR_SHADOW))
970 _cpu_based_exec_control &= ~CPU_BASED_CR8_LOAD_EXITING &
971 ~CPU_BASED_CR8_STORE_EXITING;
972#endif
973 if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) {
974 min = 0;
975 opt = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
976 SECONDARY_EXEC_WBINVD_EXITING;
977 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS2,
978 &_cpu_based_2nd_exec_control) < 0)
979 return -EIO;
980 }
981#ifndef CONFIG_X86_64
982 if (!(_cpu_based_2nd_exec_control &
983 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
984 _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW;
985#endif
986
987 min = 0;
988#ifdef CONFIG_X86_64
989 min |= VM_EXIT_HOST_ADDR_SPACE_SIZE;
990#endif
991 opt = 0;
992 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS,
993 &_vmexit_control) < 0)
994 return -EIO;
995
996 min = opt = 0;
997 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS,
998 &_vmentry_control) < 0)
999 return -EIO;
1000
1001 rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high);
1002
1003 /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */
1004 if ((vmx_msr_high & 0x1fff) > PAGE_SIZE)
1005 return -EIO;
1006
1007#ifdef CONFIG_X86_64
1008 /* IA-32 SDM Vol 3B: 64-bit CPUs always have VMX_BASIC_MSR[48]==0. */
1009 if (vmx_msr_high & (1u<<16))
1010 return -EIO;
1011#endif
1012
1013 /* Require Write-Back (WB) memory type for VMCS accesses. */
1014 if (((vmx_msr_high >> 18) & 15) != 6)
1015 return -EIO;
1016
1017 vmcs_conf->size = vmx_msr_high & 0x1fff;
1018 vmcs_conf->order = get_order(vmcs_config.size);
1019 vmcs_conf->revision_id = vmx_msr_low;
1020
1021 vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control;
1022 vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control;
1023 vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control;
1024 vmcs_conf->vmexit_ctrl = _vmexit_control;
1025 vmcs_conf->vmentry_ctrl = _vmentry_control;
1026
1027 return 0;
1028}
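/*
 * MSR_IA32_VMX_BASIC decode used above: the low 32 bits hold the VMCS
 * revision identifier, bits 44:32 (the & 0x1fff on the high dword) give
 * the VMCS region size, bit 48 (1u << 16 in the high dword) restricts
 * VMCS addresses to 32 bits and is always clear on 64-bit capable CPUs,
 * and bits 53:50 ((high >> 18) & 15) encode the required memory type,
 * with 6 meaning write-back.
 */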
1029
1030static struct vmcs *alloc_vmcs_cpu(int cpu)
1031{
1032 int node = cpu_to_node(cpu);
1033 struct page *pages;
1034 struct vmcs *vmcs;
1035
1036 pages = alloc_pages_node(node, GFP_KERNEL, vmcs_config.order);
1037 if (!pages)
1038 return NULL;
1039 vmcs = page_address(pages);
1040 memset(vmcs, 0, vmcs_config.size);
1041 vmcs->revision_id = vmcs_config.revision_id; /* vmcs revision id */
1042 return vmcs;
1043}
1044
1045static struct vmcs *alloc_vmcs(void)
1046{
1047 return alloc_vmcs_cpu(raw_smp_processor_id());
1048}
1049
1050static void free_vmcs(struct vmcs *vmcs)
1051{
1052 free_pages((unsigned long)vmcs, vmcs_config.order);
1053}
1054
1055static void free_kvm_area(void)
1056{
1057 int cpu;
1058
1059 for_each_online_cpu(cpu)
1060 free_vmcs(per_cpu(vmxarea, cpu));
1061}
1062
1063static __init int alloc_kvm_area(void)
1064{
1065 int cpu;
1066
1067 for_each_online_cpu(cpu) {
1068 struct vmcs *vmcs;
1069
1070 vmcs = alloc_vmcs_cpu(cpu);
1071 if (!vmcs) {
1072 free_kvm_area();
1073 return -ENOMEM;
1074 }
1075
1076 per_cpu(vmxarea, cpu) = vmcs;
1077 }
1078 return 0;
1079}
1080
1081static __init int hardware_setup(void)
1082{
1083 if (setup_vmcs_config(&vmcs_config) < 0)
1084 return -EIO;
1085 return alloc_kvm_area();
1086}
1087
1088static __exit void hardware_unsetup(void)
1089{
1090 free_kvm_area();
1091}
1092
1093static void fix_pmode_dataseg(int seg, struct kvm_save_segment *save)
1094{
1095 struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
1096
1097 if (vmcs_readl(sf->base) == save->base && (save->base & AR_S_MASK)) {
1098 vmcs_write16(sf->selector, save->selector);
1099 vmcs_writel(sf->base, save->base);
1100 vmcs_write32(sf->limit, save->limit);
1101 vmcs_write32(sf->ar_bytes, save->ar);
1102 } else {
1103 u32 dpl = (vmcs_read16(sf->selector) & SELECTOR_RPL_MASK)
1104 << AR_DPL_SHIFT;
1105 vmcs_write32(sf->ar_bytes, 0x93 | dpl);
1106 }
1107}
1108
1109static void enter_pmode(struct kvm_vcpu *vcpu)
1110{
1111 unsigned long flags;
1112
1113 vcpu->arch.rmode.active = 0;
1114
1115 vmcs_writel(GUEST_TR_BASE, vcpu->arch.rmode.tr.base);
1116 vmcs_write32(GUEST_TR_LIMIT, vcpu->arch.rmode.tr.limit);
1117 vmcs_write32(GUEST_TR_AR_BYTES, vcpu->arch.rmode.tr.ar);
1118
1119 flags = vmcs_readl(GUEST_RFLAGS);
1120 flags &= ~(X86_EFLAGS_IOPL | X86_EFLAGS_VM);
1121 flags |= (vcpu->arch.rmode.save_iopl << IOPL_SHIFT);
1122 vmcs_writel(GUEST_RFLAGS, flags);
1123
1124 vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) |
1125 (vmcs_readl(CR4_READ_SHADOW) & X86_CR4_VME));
1126
1127 update_exception_bitmap(vcpu);
1128
1129 fix_pmode_dataseg(VCPU_SREG_ES, &vcpu->arch.rmode.es);
1130 fix_pmode_dataseg(VCPU_SREG_DS, &vcpu->arch.rmode.ds);
1131 fix_pmode_dataseg(VCPU_SREG_GS, &vcpu->arch.rmode.gs);
1132 fix_pmode_dataseg(VCPU_SREG_FS, &vcpu->arch.rmode.fs);
1133
1134 vmcs_write16(GUEST_SS_SELECTOR, 0);
1135 vmcs_write32(GUEST_SS_AR_BYTES, 0x93);
1136
1137 vmcs_write16(GUEST_CS_SELECTOR,
1138 vmcs_read16(GUEST_CS_SELECTOR) & ~SELECTOR_RPL_MASK);
1139 vmcs_write32(GUEST_CS_AR_BYTES, 0x9b);
1140}
1141
1142static gva_t rmode_tss_base(struct kvm *kvm)
1143{
1144 if (!kvm->arch.tss_addr) {
1145 gfn_t base_gfn = kvm->memslots[0].base_gfn +
1146 kvm->memslots[0].npages - 3;
1147 return base_gfn << PAGE_SHIFT;
1148 }
1149 return kvm->arch.tss_addr;
1150}
1151
1152static void fix_rmode_seg(int seg, struct kvm_save_segment *save)
1153{
1154 struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
1155
1156 save->selector = vmcs_read16(sf->selector);
1157 save->base = vmcs_readl(sf->base);
1158 save->limit = vmcs_read32(sf->limit);
1159 save->ar = vmcs_read32(sf->ar_bytes);
1160 vmcs_write16(sf->selector, save->base >> 4);
1161 vmcs_write32(sf->base, save->base & 0xfffff);
1162 vmcs_write32(sf->limit, 0xffff);
1163 vmcs_write32(sf->ar_bytes, 0xf3);
1164}
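/*
 * Example of the vm86-style fixup above: a segment with base 0x000f0000
 * becomes selector 0xf000 (base >> 4), base 0xf0000 (base & 0xfffff),
 * limit 0xffff and access rights 0xf3 (present, DPL 3, writable data),
 * which satisfies the real-mode constraint base == selector << 4.
 */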
1165
1166static void enter_rmode(struct kvm_vcpu *vcpu)
1167{
1168 unsigned long flags;
1169
1170 vcpu->arch.rmode.active = 1;
1171
1172 vcpu->arch.rmode.tr.base = vmcs_readl(GUEST_TR_BASE);
1173 vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm));
1174
1175 vcpu->arch.rmode.tr.limit = vmcs_read32(GUEST_TR_LIMIT);
1176 vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1);
1177
1178 vcpu->arch.rmode.tr.ar = vmcs_read32(GUEST_TR_AR_BYTES);
1179 vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
1180
1181 flags = vmcs_readl(GUEST_RFLAGS);
1182 vcpu->arch.rmode.save_iopl
1183 = (flags & X86_EFLAGS_IOPL) >> IOPL_SHIFT;
1184
1185 flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
1186
1187 vmcs_writel(GUEST_RFLAGS, flags);
1188 vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME);
1189 update_exception_bitmap(vcpu);
1190
1191 vmcs_write16(GUEST_SS_SELECTOR, vmcs_readl(GUEST_SS_BASE) >> 4);
1192 vmcs_write32(GUEST_SS_LIMIT, 0xffff);
1193 vmcs_write32(GUEST_SS_AR_BYTES, 0xf3);
1194
1195 vmcs_write32(GUEST_CS_AR_BYTES, 0xf3);
1196 vmcs_write32(GUEST_CS_LIMIT, 0xffff);
1197 if (vmcs_readl(GUEST_CS_BASE) == 0xffff0000)
1198 vmcs_writel(GUEST_CS_BASE, 0xf0000);
1199 vmcs_write16(GUEST_CS_SELECTOR, vmcs_readl(GUEST_CS_BASE) >> 4);
1200
1201 fix_rmode_seg(VCPU_SREG_ES, &vcpu->arch.rmode.es);
1202 fix_rmode_seg(VCPU_SREG_DS, &vcpu->arch.rmode.ds);
1203 fix_rmode_seg(VCPU_SREG_GS, &vcpu->arch.rmode.gs);
1204 fix_rmode_seg(VCPU_SREG_FS, &vcpu->arch.rmode.fs);
1205
1206 kvm_mmu_reset_context(vcpu);
1207 init_rmode_tss(vcpu->kvm);
1208}
1209
1210#ifdef CONFIG_X86_64
1211
1212static void enter_lmode(struct kvm_vcpu *vcpu)
1213{
1214 u32 guest_tr_ar;
1215
1216 guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES);
1217 if ((guest_tr_ar & AR_TYPE_MASK) != AR_TYPE_BUSY_64_TSS) {
1218 printk(KERN_DEBUG "%s: tss fixup for long mode. \n",
1219 __FUNCTION__);
1220 vmcs_write32(GUEST_TR_AR_BYTES,
1221 (guest_tr_ar & ~AR_TYPE_MASK)
1222 | AR_TYPE_BUSY_64_TSS);
1223 }
1224
1225 vcpu->arch.shadow_efer |= EFER_LMA;
1226
1227 find_msr_entry(to_vmx(vcpu), MSR_EFER)->data |= EFER_LMA | EFER_LME;
1228 vmcs_write32(VM_ENTRY_CONTROLS,
1229 vmcs_read32(VM_ENTRY_CONTROLS)
1230 | VM_ENTRY_IA32E_MODE);
1231}
1232
1233static void exit_lmode(struct kvm_vcpu *vcpu)
1234{
1235 vcpu->arch.shadow_efer &= ~EFER_LMA;
1236
1237 vmcs_write32(VM_ENTRY_CONTROLS,
1238 vmcs_read32(VM_ENTRY_CONTROLS)
1239 & ~VM_ENTRY_IA32E_MODE);
1240}
1241
1242#endif
1243
1244static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
1245{
1246 vcpu->arch.cr4 &= KVM_GUEST_CR4_MASK;
1247 vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & ~KVM_GUEST_CR4_MASK;
1248}
1249
1250static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
1251{
1252 vmx_fpu_deactivate(vcpu);
1253
1254 if (vcpu->arch.rmode.active && (cr0 & X86_CR0_PE))
1255 enter_pmode(vcpu);
1256
1257 if (!vcpu->arch.rmode.active && !(cr0 & X86_CR0_PE))
1258 enter_rmode(vcpu);
1259
1260#ifdef CONFIG_X86_64
1261 if (vcpu->arch.shadow_efer & EFER_LME) {
1262 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG))
1263 enter_lmode(vcpu);
1264 if (is_paging(vcpu) && !(cr0 & X86_CR0_PG))
1265 exit_lmode(vcpu);
1266 }
1267#endif
1268
1269 vmcs_writel(CR0_READ_SHADOW, cr0);
1270 vmcs_writel(GUEST_CR0,
1271 (cr0 & ~KVM_GUEST_CR0_MASK) | KVM_VM_CR0_ALWAYS_ON);
1272 vcpu->arch.cr0 = cr0;
1273
1274 if (!(cr0 & X86_CR0_TS) || !(cr0 & X86_CR0_PE))
1275 vmx_fpu_activate(vcpu);
1276}
1277
1278static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
1279{
1280 vmcs_writel(GUEST_CR3, cr3);
1281 if (vcpu->arch.cr0 & X86_CR0_PE)
1282 vmx_fpu_deactivate(vcpu);
1283}
1284
1285static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
1286{
1287 vmcs_writel(CR4_READ_SHADOW, cr4);
1288 vmcs_writel(GUEST_CR4, cr4 | (vcpu->arch.rmode.active ?
1289 KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON));
1290 vcpu->arch.cr4 = cr4;
1291}
1292
1293#ifdef CONFIG_X86_64
1294
1295static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
1296{
1297 struct vcpu_vmx *vmx = to_vmx(vcpu);
1298 struct kvm_msr_entry *msr = find_msr_entry(vmx, MSR_EFER);
1299
1300 vcpu->arch.shadow_efer = efer;
1301 if (efer & EFER_LMA) {
1302 vmcs_write32(VM_ENTRY_CONTROLS,
1303 vmcs_read32(VM_ENTRY_CONTROLS) |
1304 VM_ENTRY_IA32E_MODE);
1305 msr->data = efer;
1306
1307 } else {
1308 vmcs_write32(VM_ENTRY_CONTROLS,
1309 vmcs_read32(VM_ENTRY_CONTROLS) &
1310 ~VM_ENTRY_IA32E_MODE);
1311
1312 msr->data = efer & ~EFER_LME;
1313 }
1314 setup_msrs(vmx);
1315}
1316
1317#endif
1318
1319static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg)
1320{
1321 struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
1322
1323 return vmcs_readl(sf->base);
1324}
1325
1326static void vmx_get_segment(struct kvm_vcpu *vcpu,
1327 struct kvm_segment *var, int seg)
1328{
1329 struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
1330 u32 ar;
1331
1332 var->base = vmcs_readl(sf->base);
1333 var->limit = vmcs_read32(sf->limit);
1334 var->selector = vmcs_read16(sf->selector);
1335 ar = vmcs_read32(sf->ar_bytes);
1336 if (ar & AR_UNUSABLE_MASK)
1337 ar = 0;
1338 var->type = ar & 15;
1339 var->s = (ar >> 4) & 1;
1340 var->dpl = (ar >> 5) & 3;
1341 var->present = (ar >> 7) & 1;
1342 var->avl = (ar >> 12) & 1;
1343 var->l = (ar >> 13) & 1;
1344 var->db = (ar >> 14) & 1;
1345 var->g = (ar >> 15) & 1;
1346 var->unusable = (ar >> 16) & 1;
1347}
1348
1349static u32 vmx_segment_access_rights(struct kvm_segment *var)
1350{
1351 u32 ar;
1352
1353 if (var->unusable)
1354 ar = 1 << 16;
1355 else {
1356 ar = var->type & 15;
1357 ar |= (var->s & 1) << 4;
1358 ar |= (var->dpl & 3) << 5;
1359 ar |= (var->present & 1) << 7;
1360 ar |= (var->avl & 1) << 12;
1361 ar |= (var->l & 1) << 13;
1362 ar |= (var->db & 1) << 14;
1363 ar |= (var->g & 1) << 15;
1364 }
1365 if (ar == 0) /* a 0 value means unusable */
1366 ar = AR_UNUSABLE_MASK;
1367
1368 return ar;
1369}
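/*
 * Example of the packing above: the 0x9b value used elsewhere for a flat
 * code segment decodes as type 0xb (execute/read, accessed), S=1, DPL=0,
 * P=1; 0x93 is the read/write data-segment equivalent.  An unusable
 * segment is represented purely by bit 16 (AR_UNUSABLE_MASK).
 */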
1370
1371static void vmx_set_segment(struct kvm_vcpu *vcpu,
1372 struct kvm_segment *var, int seg)
1373{
1374 struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
1375 u32 ar;
1376
1377 if (vcpu->arch.rmode.active && seg == VCPU_SREG_TR) {
1378 vcpu->arch.rmode.tr.selector = var->selector;
1379 vcpu->arch.rmode.tr.base = var->base;
1380 vcpu->arch.rmode.tr.limit = var->limit;
1381 vcpu->arch.rmode.tr.ar = vmx_segment_access_rights(var);
1382 return;
1383 }
1384 vmcs_writel(sf->base, var->base);
1385 vmcs_write32(sf->limit, var->limit);
1386 vmcs_write16(sf->selector, var->selector);
1387 if (vcpu->arch.rmode.active && var->s) {
1388 /*
1389 * Hack real-mode segments into vm86 compatibility.
1390 */
1391 if (var->base == 0xffff0000 && var->selector == 0xf000)
1392 vmcs_writel(sf->base, 0xf0000);
1393 ar = 0xf3;
1394 } else
1395 ar = vmx_segment_access_rights(var);
1396 vmcs_write32(sf->ar_bytes, ar);
1397}
1398
1399static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
1400{
1401 u32 ar = vmcs_read32(GUEST_CS_AR_BYTES);
1402
1403 *db = (ar >> 14) & 1;
1404 *l = (ar >> 13) & 1;
1405}
1406
1407static void vmx_get_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
1408{
1409 dt->limit = vmcs_read32(GUEST_IDTR_LIMIT);
1410 dt->base = vmcs_readl(GUEST_IDTR_BASE);
1411}
1412
1413static void vmx_set_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
1414{
1415 vmcs_write32(GUEST_IDTR_LIMIT, dt->limit);
1416 vmcs_writel(GUEST_IDTR_BASE, dt->base);
1417}
1418
1419static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
1420{
1421 dt->limit = vmcs_read32(GUEST_GDTR_LIMIT);
1422 dt->base = vmcs_readl(GUEST_GDTR_BASE);
1423}
1424
1425static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
1426{
1427 vmcs_write32(GUEST_GDTR_LIMIT, dt->limit);
1428 vmcs_writel(GUEST_GDTR_BASE, dt->base);
1429}
1430
1431static int init_rmode_tss(struct kvm *kvm)
1432{
1433 gfn_t fn = rmode_tss_base(kvm) >> PAGE_SHIFT;
1434 u16 data = 0;
1435 int ret = 0;
1436 int r;
1437
1438 down_read(&current->mm->mmap_sem);
1439 r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
1440 if (r < 0)
1441 goto out;
1442 data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE;
1443 r = kvm_write_guest_page(kvm, fn++, &data, 0x66, sizeof(u16));
1444 if (r < 0)
1445 goto out;
1446 r = kvm_clear_guest_page(kvm, fn++, 0, PAGE_SIZE);
1447 if (r < 0)
1448 goto out;
1449 r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
1450 if (r < 0)
1451 goto out;
1452 data = ~0;
1453 r = kvm_write_guest_page(kvm, fn, &data,
1454 RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1,
1455 sizeof(u8));
1456 if (r < 0)
1457 goto out;
1458
1459 ret = 1;
1460out:
1461 up_read(&current->mm->mmap_sem);
1462 return ret;
1463}
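/*
 * Layout written above: a 32-bit TSS keeps its I/O map base pointer at
 * offset 0x66, so storing TSS_BASE_SIZE + TSS_REDIRECTION_SIZE there
 * points the I/O bitmap just past the interrupt redirection map, and the
 * trailing 0xff byte terminates the I/O bitmap as the architecture
 * requires.
 */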
1464
1465static void seg_setup(int seg)
1466{
1467 struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
1468
1469 vmcs_write16(sf->selector, 0);
1470 vmcs_writel(sf->base, 0);
1471 vmcs_write32(sf->limit, 0xffff);
1472 vmcs_write32(sf->ar_bytes, 0x93);
1473}
1474
1475static int alloc_apic_access_page(struct kvm *kvm)
1476{
1477 struct kvm_userspace_memory_region kvm_userspace_mem;
1478 int r = 0;
1479
1480 down_write(&current->mm->mmap_sem);
1481 if (kvm->arch.apic_access_page)
1482 goto out;
1483 kvm_userspace_mem.slot = APIC_ACCESS_PAGE_PRIVATE_MEMSLOT;
1484 kvm_userspace_mem.flags = 0;
1485 kvm_userspace_mem.guest_phys_addr = 0xfee00000ULL;
1486 kvm_userspace_mem.memory_size = PAGE_SIZE;
1487 r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, 0);
1488 if (r)
1489 goto out;
1490 kvm->arch.apic_access_page = gfn_to_page(kvm, 0xfee00);
1491out:
1492 up_write(&current->mm->mmap_sem);
1493 return r;
1494}
1495
1496/*
1497 * Sets up the vmcs for emulated real mode.
1498 */
1499static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
1500{
1501 u32 host_sysenter_cs;
1502 u32 junk;
1503 unsigned long a;
1504 struct descriptor_table dt;
1505 int i;
1506 unsigned long kvm_vmx_return;
1507 u32 exec_control;
1508
1509 /* I/O */
1510 vmcs_write64(IO_BITMAP_A, page_to_phys(vmx_io_bitmap_a));
1511 vmcs_write64(IO_BITMAP_B, page_to_phys(vmx_io_bitmap_b));
1512
1513 vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */
1514
1515 /* Control */
1516 vmcs_write32(PIN_BASED_VM_EXEC_CONTROL,
1517 vmcs_config.pin_based_exec_ctrl);
1518
1519 exec_control = vmcs_config.cpu_based_exec_ctrl;
1520 if (!vm_need_tpr_shadow(vmx->vcpu.kvm)) {
1521 exec_control &= ~CPU_BASED_TPR_SHADOW;
1522#ifdef CONFIG_X86_64
1523 exec_control |= CPU_BASED_CR8_STORE_EXITING |
1524 CPU_BASED_CR8_LOAD_EXITING;
1525#endif
1526 }
1527 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control);
1528
1529 if (cpu_has_secondary_exec_ctrls()) {
1530 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl;
1531 if (!vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))
1532 exec_control &=
1533 ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
1534 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
1535 }
1536
1537 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, !!bypass_guest_pf);
1538 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, !!bypass_guest_pf);
1539 vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */
1540
1541 vmcs_writel(HOST_CR0, read_cr0()); /* 22.2.3 */
1542 vmcs_writel(HOST_CR4, read_cr4()); /* 22.2.3, 22.2.5 */
1543 vmcs_writel(HOST_CR3, read_cr3()); /* 22.2.3 FIXME: shadow tables */
1544
1545 vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */
1546 vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */
1547 vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */
1548 vmcs_write16(HOST_FS_SELECTOR, read_fs()); /* 22.2.4 */
1549 vmcs_write16(HOST_GS_SELECTOR, read_gs()); /* 22.2.4 */
1550 vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */
1551#ifdef CONFIG_X86_64
1552 rdmsrl(MSR_FS_BASE, a);
1553 vmcs_writel(HOST_FS_BASE, a); /* 22.2.4 */
1554 rdmsrl(MSR_GS_BASE, a);
1555 vmcs_writel(HOST_GS_BASE, a); /* 22.2.4 */
1556#else
1557 vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */
1558 vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */
1559#endif
1560
1561 vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */
1562
1563 get_idt(&dt);
1564 vmcs_writel(HOST_IDTR_BASE, dt.base); /* 22.2.4 */
1565
1566 asm("mov $.Lkvm_vmx_return, %0" : "=r"(kvm_vmx_return));
1567 vmcs_writel(HOST_RIP, kvm_vmx_return); /* 22.2.5 */
1568 vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
1569 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
1570 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
1571
1572 rdmsr(MSR_IA32_SYSENTER_CS, host_sysenter_cs, junk);
1573 vmcs_write32(HOST_IA32_SYSENTER_CS, host_sysenter_cs);
1574 rdmsrl(MSR_IA32_SYSENTER_ESP, a);
1575 vmcs_writel(HOST_IA32_SYSENTER_ESP, a); /* 22.2.3 */
1576 rdmsrl(MSR_IA32_SYSENTER_EIP, a);
1577 vmcs_writel(HOST_IA32_SYSENTER_EIP, a); /* 22.2.3 */
1578
1579 for (i = 0; i < NR_VMX_MSR; ++i) {
1580 u32 index = vmx_msr_index[i];
1581 u32 data_low, data_high;
1582 u64 data;
1583 int j = vmx->nmsrs;
1584
1585 if (rdmsr_safe(index, &data_low, &data_high) < 0)
1586 continue;
1587 if (wrmsr_safe(index, data_low, data_high) < 0)
1588 continue;
1589 data = data_low | ((u64)data_high << 32);
1590 vmx->host_msrs[j].index = index;
1591 vmx->host_msrs[j].reserved = 0;
1592 vmx->host_msrs[j].data = data;
1593 vmx->guest_msrs[j] = vmx->host_msrs[j];
1594 ++vmx->nmsrs;
1595 }
1596
1597 vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl);
1598
1599 /* 22.2.1, 20.8.1 */
1600 vmcs_write32(VM_ENTRY_CONTROLS, vmcs_config.vmentry_ctrl);
1601
1602 vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL);
1603 vmcs_writel(CR4_GUEST_HOST_MASK, KVM_GUEST_CR4_MASK);
1604
1605 if (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))
1606 if (alloc_apic_access_page(vmx->vcpu.kvm) != 0)
1607 return -ENOMEM;
1608
1609 return 0;
1610}
1611
1612static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
1613{
1614 struct vcpu_vmx *vmx = to_vmx(vcpu);
1615 u64 msr;
1616 int ret;
1617
1618 if (!init_rmode_tss(vmx->vcpu.kvm)) {
1619 ret = -ENOMEM;
1620 goto out;
1621 }
1622
1623 vmx->vcpu.arch.rmode.active = 0;
1624
1625 vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val();
1626 set_cr8(&vmx->vcpu, 0);
1627 msr = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
1628 if (vmx->vcpu.vcpu_id == 0)
1629 msr |= MSR_IA32_APICBASE_BSP;
1630 kvm_set_apic_base(&vmx->vcpu, msr);
1631
1632 fx_init(&vmx->vcpu);
1633
1634 /*
1635 * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode
1636 * insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4. Sigh.
1637 */
1638 if (vmx->vcpu.vcpu_id == 0) {
1639 vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
1640 vmcs_writel(GUEST_CS_BASE, 0x000f0000);
1641 } else {
1642 vmcs_write16(GUEST_CS_SELECTOR, vmx->vcpu.arch.sipi_vector << 8);
1643 vmcs_writel(GUEST_CS_BASE, vmx->vcpu.arch.sipi_vector << 12);
1644 }
1645 vmcs_write32(GUEST_CS_LIMIT, 0xffff);
1646 vmcs_write32(GUEST_CS_AR_BYTES, 0x9b);
1647
1648 seg_setup(VCPU_SREG_DS);
1649 seg_setup(VCPU_SREG_ES);
1650 seg_setup(VCPU_SREG_FS);
1651 seg_setup(VCPU_SREG_GS);
1652 seg_setup(VCPU_SREG_SS);
1653
1654 vmcs_write16(GUEST_TR_SELECTOR, 0);
1655 vmcs_writel(GUEST_TR_BASE, 0);
1656 vmcs_write32(GUEST_TR_LIMIT, 0xffff);
1657 vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
1658
1659 vmcs_write16(GUEST_LDTR_SELECTOR, 0);
1660 vmcs_writel(GUEST_LDTR_BASE, 0);
1661 vmcs_write32(GUEST_LDTR_LIMIT, 0xffff);
1662 vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082);
1663
1664 vmcs_write32(GUEST_SYSENTER_CS, 0);
1665 vmcs_writel(GUEST_SYSENTER_ESP, 0);
1666 vmcs_writel(GUEST_SYSENTER_EIP, 0);
1667
1668 vmcs_writel(GUEST_RFLAGS, 0x02);
1669 if (vmx->vcpu.vcpu_id == 0)
1670 vmcs_writel(GUEST_RIP, 0xfff0);
1671 else
1672 vmcs_writel(GUEST_RIP, 0);
1673 vmcs_writel(GUEST_RSP, 0);
1674
1675 /* todo: dr0 = dr1 = dr2 = dr3 = 0; dr6 = 0xffff0ff0 */
1676 vmcs_writel(GUEST_DR7, 0x400);
1677
1678 vmcs_writel(GUEST_GDTR_BASE, 0);
1679 vmcs_write32(GUEST_GDTR_LIMIT, 0xffff);
1680
1681 vmcs_writel(GUEST_IDTR_BASE, 0);
1682 vmcs_write32(GUEST_IDTR_LIMIT, 0xffff);
1683
1684 vmcs_write32(GUEST_ACTIVITY_STATE, 0);
1685 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
1686 vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0);
1687
1688 guest_write_tsc(0);
1689
1690 /* Special registers */
1691 vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
1692
1693 setup_msrs(vmx);
1694
1695 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); /* 22.2.1 */
1696
1697 if (cpu_has_vmx_tpr_shadow()) {
1698 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
1699 if (vm_need_tpr_shadow(vmx->vcpu.kvm))
1700 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
1701 page_to_phys(vmx->vcpu.arch.apic->regs_page));
1702 vmcs_write32(TPR_THRESHOLD, 0);
1703 }
1704
1705 if (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))
1706 vmcs_write64(APIC_ACCESS_ADDR,
1707 page_to_phys(vmx->vcpu.kvm->arch.apic_access_page));
1708
1709 vmx->vcpu.arch.cr0 = 0x60000010;
1710 vmx_set_cr0(&vmx->vcpu, vmx->vcpu.arch.cr0); /* enter rmode */
1711 vmx_set_cr4(&vmx->vcpu, 0);
1712#ifdef CONFIG_X86_64
1713 vmx_set_efer(&vmx->vcpu, 0);
1714#endif
1715 vmx_fpu_activate(&vmx->vcpu);
1716 update_exception_bitmap(&vmx->vcpu);
1717
1718 return 0;
1719
1720out:
1721 return ret;
1722}
1723
1724static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq)
1725{
1726 struct vcpu_vmx *vmx = to_vmx(vcpu);
1727
1728 if (vcpu->arch.rmode.active) {
1729 vmx->rmode.irq.pending = true;
1730 vmx->rmode.irq.vector = irq;
1731 vmx->rmode.irq.rip = vmcs_readl(GUEST_RIP);
1732 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
1733 irq | INTR_TYPE_SOFT_INTR | INTR_INFO_VALID_MASK);
1734 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1);
1735 vmcs_writel(GUEST_RIP, vmx->rmode.irq.rip - 1);
1736 return;
1737 }
1738 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
1739 irq | INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK);
1740}
1741
1742static void kvm_do_inject_irq(struct kvm_vcpu *vcpu)
1743{
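	/*
	 * irq_summary has one bit per irq_pending word.  Pick the lowest
	 * pending vector, clear it, and drop the summary bit once its word
	 * becomes empty.
	 */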
1744 int word_index = __ffs(vcpu->arch.irq_summary);
1745 int bit_index = __ffs(vcpu->arch.irq_pending[word_index]);
1746 int irq = word_index * BITS_PER_LONG + bit_index;
1747
1748 clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]);
1749 if (!vcpu->arch.irq_pending[word_index])
1750 clear_bit(word_index, &vcpu->arch.irq_summary);
1751 vmx_inject_irq(vcpu, irq);
1752}
1753
1754
1755static void do_interrupt_requests(struct kvm_vcpu *vcpu,
1756 struct kvm_run *kvm_run)
1757{
1758 u32 cpu_based_vm_exec_control;
1759
1760 vcpu->arch.interrupt_window_open =
1761 ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
1762 (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0);
1763
1764 if (vcpu->arch.interrupt_window_open &&
1765 vcpu->arch.irq_summary &&
1766 !(vmcs_read32(VM_ENTRY_INTR_INFO_FIELD) & INTR_INFO_VALID_MASK))
1767 /*
 1768		 * Interrupts are enabled and not blocked by sti or mov ss, so inject now.
1769 */
1770 kvm_do_inject_irq(vcpu);
1771
1772 cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
1773 if (!vcpu->arch.interrupt_window_open &&
1774 (vcpu->arch.irq_summary || kvm_run->request_interrupt_window))
1775 /*
1776 * Interrupts blocked. Wait for unblock.
1777 */
1778 cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING;
1779 else
1780 cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
1781 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
1782}
1783
1784static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
1785{
1786 int ret;
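	/*
	 * The real-mode TSS spans three pages and is placed in a private
	 * memslot (slot 8; the APIC access page uses slot 9, see vmx.h).
	 */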
1787 struct kvm_userspace_memory_region tss_mem = {
1788 .slot = 8,
1789 .guest_phys_addr = addr,
1790 .memory_size = PAGE_SIZE * 3,
1791 .flags = 0,
1792 };
1793
1794 ret = kvm_set_memory_region(kvm, &tss_mem, 0);
1795 if (ret)
1796 return ret;
1797 kvm->arch.tss_addr = addr;
1798 return 0;
1799}
1800
1801static void kvm_guest_debug_pre(struct kvm_vcpu *vcpu)
1802{
1803 struct kvm_guest_debug *dbg = &vcpu->guest_debug;
1804
1805 set_debugreg(dbg->bp[0], 0);
1806 set_debugreg(dbg->bp[1], 1);
1807 set_debugreg(dbg->bp[2], 2);
1808 set_debugreg(dbg->bp[3], 3);
1809
1810 if (dbg->singlestep) {
1811 unsigned long flags;
1812
1813 flags = vmcs_readl(GUEST_RFLAGS);
1814 flags |= X86_EFLAGS_TF | X86_EFLAGS_RF;
1815 vmcs_writel(GUEST_RFLAGS, flags);
1816 }
1817}
1818
1819static int handle_rmode_exception(struct kvm_vcpu *vcpu,
1820 int vec, u32 err_code)
1821{
1822 if (!vcpu->arch.rmode.active)
1823 return 0;
1824
1825 /*
 1826	 * An instruction with the address-size override prefix (opcode 0x67)
 1827	 * causes an #SS fault with error code 0 in VM86 mode.
1828 */
1829 if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0)
1830 if (emulate_instruction(vcpu, NULL, 0, 0, 0) == EMULATE_DONE)
1831 return 1;
1832 return 0;
1833}
1834
1835static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1836{
1837 struct vcpu_vmx *vmx = to_vmx(vcpu);
1838 u32 intr_info, error_code;
1839 unsigned long cr2, rip;
1840 u32 vect_info;
1841 enum emulation_result er;
1842
1843 vect_info = vmx->idt_vectoring_info;
1844 intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
1845
1846 if ((vect_info & VECTORING_INFO_VALID_MASK) &&
1847 !is_page_fault(intr_info))
1848 printk(KERN_ERR "%s: unexpected, vectoring info 0x%x "
1849 "intr info 0x%x\n", __FUNCTION__, vect_info, intr_info);
1850
1851 if (!irqchip_in_kernel(vcpu->kvm) && is_external_interrupt(vect_info)) {
1852 int irq = vect_info & VECTORING_INFO_VECTOR_MASK;
1853 set_bit(irq, vcpu->arch.irq_pending);
1854 set_bit(irq / BITS_PER_LONG, &vcpu->arch.irq_summary);
1855 }
1856
1857 if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200) /* nmi */
1858 return 1; /* already handled by vmx_vcpu_run() */
1859
1860 if (is_no_device(intr_info)) {
1861 vmx_fpu_activate(vcpu);
1862 return 1;
1863 }
1864
1865 if (is_invalid_opcode(intr_info)) {
1866 er = emulate_instruction(vcpu, kvm_run, 0, 0, EMULTYPE_TRAP_UD);
1867 if (er != EMULATE_DONE)
1868 kvm_queue_exception(vcpu, UD_VECTOR);
1869 return 1;
1870 }
1871
1872 error_code = 0;
1873 rip = vmcs_readl(GUEST_RIP);
1874 if (intr_info & INTR_INFO_DELIEVER_CODE_MASK)
1875 error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
1876 if (is_page_fault(intr_info)) {
1877 cr2 = vmcs_readl(EXIT_QUALIFICATION);
1878 return kvm_mmu_page_fault(vcpu, cr2, error_code);
1879 }
1880
1881 if (vcpu->arch.rmode.active &&
1882 handle_rmode_exception(vcpu, intr_info & INTR_INFO_VECTOR_MASK,
1883 error_code)) {
1884 if (vcpu->arch.halt_request) {
1885 vcpu->arch.halt_request = 0;
1886 return kvm_emulate_halt(vcpu);
1887 }
1888 return 1;
1889 }
1890
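	/* A debug exception (#DB, vector 1) is handed to userspace. */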
1891 if ((intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK)) ==
1892 (INTR_TYPE_EXCEPTION | 1)) {
1893 kvm_run->exit_reason = KVM_EXIT_DEBUG;
1894 return 0;
1895 }
1896 kvm_run->exit_reason = KVM_EXIT_EXCEPTION;
1897 kvm_run->ex.exception = intr_info & INTR_INFO_VECTOR_MASK;
1898 kvm_run->ex.error_code = error_code;
1899 return 0;
1900}
1901
1902static int handle_external_interrupt(struct kvm_vcpu *vcpu,
1903 struct kvm_run *kvm_run)
1904{
1905 ++vcpu->stat.irq_exits;
1906 return 1;
1907}
1908
1909static int handle_triple_fault(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1910{
1911 kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
1912 return 0;
1913}
1914
1915static int handle_io(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1916{
1917 unsigned long exit_qualification;
1918 int size, down, in, string, rep;
1919 unsigned port;
1920
1921 ++vcpu->stat.io_exits;
1922 exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
1923 string = (exit_qualification & 16) != 0;
1924
1925 if (string) {
1926 if (emulate_instruction(vcpu,
1927 kvm_run, 0, 0, 0) == EMULATE_DO_MMIO)
1928 return 0;
1929 return 1;
1930 }
1931
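	/*
	 * Decode the rest of the I/O exit qualification: bits 2:0 hold the
	 * access size minus one, bit 3 the direction (1 = in), bit 5 a rep
	 * prefix and bits 31:16 the port number.
	 */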
1932 size = (exit_qualification & 7) + 1;
1933 in = (exit_qualification & 8) != 0;
1934 down = (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_DF) != 0;
1935 rep = (exit_qualification & 32) != 0;
1936 port = exit_qualification >> 16;
1937
1938 return kvm_emulate_pio(vcpu, kvm_run, in, size, port);
1939}
1940
1941static void
1942vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
1943{
1944 /*
1945 * Patch in the VMCALL instruction:
1946 */
1947 hypercall[0] = 0x0f;
1948 hypercall[1] = 0x01;
1949 hypercall[2] = 0xc1;
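	/* 0f 01 c1 is the encoding of VMCALL */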
1950}
1951
1952static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1953{
1954 unsigned long exit_qualification;
1955 int cr;
1956 int reg;
1957
1958 exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
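	/*
	 * Decode the control-register exit qualification: bits 3:0 give the
	 * control register, bits 5:4 the access type (0 = mov to cr,
	 * 1 = mov from cr, 2 = clts, 3 = lmsw) and bits 11:8 the general
	 * purpose register (see the CONTROL_REG_ACCESS_* masks in vmx.h).
	 */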
1959 cr = exit_qualification & 15;
1960 reg = (exit_qualification >> 8) & 15;
1961 switch ((exit_qualification >> 4) & 3) {
1962 case 0: /* mov to cr */
1963 switch (cr) {
1964 case 0:
1965 vcpu_load_rsp_rip(vcpu);
1966 set_cr0(vcpu, vcpu->arch.regs[reg]);
1967 skip_emulated_instruction(vcpu);
1968 return 1;
1969 case 3:
1970 vcpu_load_rsp_rip(vcpu);
1971 set_cr3(vcpu, vcpu->arch.regs[reg]);
1972 skip_emulated_instruction(vcpu);
1973 return 1;
1974 case 4:
1975 vcpu_load_rsp_rip(vcpu);
1976 set_cr4(vcpu, vcpu->arch.regs[reg]);
1977 skip_emulated_instruction(vcpu);
1978 return 1;
1979 case 8:
1980 vcpu_load_rsp_rip(vcpu);
1981 set_cr8(vcpu, vcpu->arch.regs[reg]);
1982 skip_emulated_instruction(vcpu);
1983 if (irqchip_in_kernel(vcpu->kvm))
1984 return 1;
1985 kvm_run->exit_reason = KVM_EXIT_SET_TPR;
1986 return 0;
1987 };
1988 break;
1989 case 2: /* clts */
1990 vcpu_load_rsp_rip(vcpu);
1991 vmx_fpu_deactivate(vcpu);
1992 vcpu->arch.cr0 &= ~X86_CR0_TS;
1993 vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0);
1994 vmx_fpu_activate(vcpu);
1995 skip_emulated_instruction(vcpu);
1996 return 1;
1997 case 1: /*mov from cr*/
1998 switch (cr) {
1999 case 3:
2000 vcpu_load_rsp_rip(vcpu);
2001 vcpu->arch.regs[reg] = vcpu->arch.cr3;
2002 vcpu_put_rsp_rip(vcpu);
2003 skip_emulated_instruction(vcpu);
2004 return 1;
2005 case 8:
2006 vcpu_load_rsp_rip(vcpu);
2007 vcpu->arch.regs[reg] = get_cr8(vcpu);
2008 vcpu_put_rsp_rip(vcpu);
2009 skip_emulated_instruction(vcpu);
2010 return 1;
2011 }
2012 break;
2013 case 3: /* lmsw */
2014 lmsw(vcpu, (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f);
2015
2016 skip_emulated_instruction(vcpu);
2017 return 1;
2018 default:
2019 break;
2020 }
2021 kvm_run->exit_reason = 0;
2022 pr_unimpl(vcpu, "unhandled control register: op %d cr %d\n",
2023 (int)(exit_qualification >> 4) & 3, cr);
2024 return 0;
2025}
2026
2027static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2028{
2029 unsigned long exit_qualification;
2030 unsigned long val;
2031 int dr, reg;
2032
2033 /*
2034 * FIXME: this code assumes the host is debugging the guest.
 2035	 * We still need to deal with the guest debugging itself too.
2036 */
2037 exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
2038 dr = exit_qualification & 7;
2039 reg = (exit_qualification >> 8) & 15;
2040 vcpu_load_rsp_rip(vcpu);
2041 if (exit_qualification & 16) {
2042 /* mov from dr */
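		/* debug registers are not tracked; report their reset values */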
2043 switch (dr) {
2044 case 6:
2045 val = 0xffff0ff0;
2046 break;
2047 case 7:
2048 val = 0x400;
2049 break;
2050 default:
2051 val = 0;
2052 }
2053 vcpu->arch.regs[reg] = val;
2054 } else {
2055 /* mov to dr */
2056 }
2057 vcpu_put_rsp_rip(vcpu);
2058 skip_emulated_instruction(vcpu);
2059 return 1;
2060}
2061
2062static int handle_cpuid(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2063{
2064 kvm_emulate_cpuid(vcpu);
2065 return 1;
2066}
2067
2068static int handle_rdmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2069{
2070 u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX];
2071 u64 data;
2072
2073 if (vmx_get_msr(vcpu, ecx, &data)) {
2074 kvm_inject_gp(vcpu, 0);
2075 return 1;
2076 }
2077
2078 /* FIXME: handling of bits 32:63 of rax, rdx */
2079 vcpu->arch.regs[VCPU_REGS_RAX] = data & -1u;
2080 vcpu->arch.regs[VCPU_REGS_RDX] = (data >> 32) & -1u;
2081 skip_emulated_instruction(vcpu);
2082 return 1;
2083}
2084
2085static int handle_wrmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2086{
2087 u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX];
2088 u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u)
2089 | ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32);
2090
2091 if (vmx_set_msr(vcpu, ecx, data) != 0) {
2092 kvm_inject_gp(vcpu, 0);
2093 return 1;
2094 }
2095
2096 skip_emulated_instruction(vcpu);
2097 return 1;
2098}
2099
2100static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu,
2101 struct kvm_run *kvm_run)
2102{
2103 return 1;
2104}
2105
2106static int handle_interrupt_window(struct kvm_vcpu *vcpu,
2107 struct kvm_run *kvm_run)
2108{
2109 u32 cpu_based_vm_exec_control;
2110
2111 /* clear pending irq */
2112 cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
2113 cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
2114 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
2115 /*
 2116	 * If userspace is waiting to inject interrupts, exit as soon as
 2117	 * possible.
2118 */
2119 if (kvm_run->request_interrupt_window &&
2120 !vcpu->arch.irq_summary) {
2121 kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
2122 ++vcpu->stat.irq_window_exits;
2123 return 0;
2124 }
2125 return 1;
2126}
2127
2128static int handle_halt(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2129{
2130 skip_emulated_instruction(vcpu);
2131 return kvm_emulate_halt(vcpu);
2132}
2133
2134static int handle_vmcall(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2135{
2136 skip_emulated_instruction(vcpu);
2137 kvm_emulate_hypercall(vcpu);
2138 return 1;
2139}
2140
2141static int handle_wbinvd(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2142{
2143 skip_emulated_instruction(vcpu);
2144 /* TODO: Add support for VT-d/pass-through device */
2145 return 1;
2146}
2147
2148static int handle_apic_access(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2149{
2150 u64 exit_qualification;
2151 enum emulation_result er;
2152 unsigned long offset;
2153
2154 exit_qualification = vmcs_read64(EXIT_QUALIFICATION);
2155 offset = exit_qualification & 0xffful;
2156
2157 er = emulate_instruction(vcpu, kvm_run, 0, 0, 0);
2158
2159 if (er != EMULATE_DONE) {
2160 printk(KERN_ERR
 2161		       "Failed to handle apic access vmexit! Offset is 0x%lx\n",
2162 offset);
2163 return -ENOTSUPP;
2164 }
2165 return 1;
2166}
2167
2168/*
2169 * The exit handlers return 1 if the exit was handled fully and guest execution
2170 * may resume. Otherwise they set the kvm_run parameter to indicate what needs
 2171 * to be done by userspace and return 0.
2172 */
2173static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu,
2174 struct kvm_run *kvm_run) = {
2175 [EXIT_REASON_EXCEPTION_NMI] = handle_exception,
2176 [EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt,
2177 [EXIT_REASON_TRIPLE_FAULT] = handle_triple_fault,
2178 [EXIT_REASON_IO_INSTRUCTION] = handle_io,
2179 [EXIT_REASON_CR_ACCESS] = handle_cr,
2180 [EXIT_REASON_DR_ACCESS] = handle_dr,
2181 [EXIT_REASON_CPUID] = handle_cpuid,
2182 [EXIT_REASON_MSR_READ] = handle_rdmsr,
2183 [EXIT_REASON_MSR_WRITE] = handle_wrmsr,
2184 [EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window,
2185 [EXIT_REASON_HLT] = handle_halt,
2186 [EXIT_REASON_VMCALL] = handle_vmcall,
2187 [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold,
2188 [EXIT_REASON_APIC_ACCESS] = handle_apic_access,
2189 [EXIT_REASON_WBINVD] = handle_wbinvd,
2190};
2191
2192static const int kvm_vmx_max_exit_handlers =
2193 ARRAY_SIZE(kvm_vmx_exit_handlers);
2194
2195/*
2196 * The guest has exited. See if we can fix it or if we need userspace
2197 * assistance.
2198 */
2199static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
2200{
2201 u32 exit_reason = vmcs_read32(VM_EXIT_REASON);
2202 struct vcpu_vmx *vmx = to_vmx(vcpu);
2203 u32 vectoring_info = vmx->idt_vectoring_info;
2204
2205 if (unlikely(vmx->fail)) {
2206 kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
2207 kvm_run->fail_entry.hardware_entry_failure_reason
2208 = vmcs_read32(VM_INSTRUCTION_ERROR);
2209 return 0;
2210 }
2211
2212 if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
2213 exit_reason != EXIT_REASON_EXCEPTION_NMI)
2214 printk(KERN_WARNING "%s: unexpected, valid vectoring info and "
2215 "exit reason is 0x%x\n", __FUNCTION__, exit_reason);
2216 if (exit_reason < kvm_vmx_max_exit_handlers
2217 && kvm_vmx_exit_handlers[exit_reason])
2218 return kvm_vmx_exit_handlers[exit_reason](vcpu, kvm_run);
2219 else {
2220 kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
2221 kvm_run->hw.hardware_exit_reason = exit_reason;
2222 }
2223 return 0;
2224}
2225
2226static void vmx_flush_tlb(struct kvm_vcpu *vcpu)
2227{
2228}
2229
2230static void update_tpr_threshold(struct kvm_vcpu *vcpu)
2231{
2232 int max_irr, tpr;
2233
2234 if (!vm_need_tpr_shadow(vcpu->kvm))
2235 return;
2236
2237 if (!kvm_lapic_enabled(vcpu) ||
2238 ((max_irr = kvm_lapic_find_highest_irr(vcpu)) == -1)) {
2239 vmcs_write32(TPR_THRESHOLD, 0);
2240 return;
2241 }
2242
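	/*
	 * TPR_THRESHOLD takes a 4-bit priority class: program it with the
	 * lower of the current task priority and the highest pending vector,
	 * reduced to class granularity.
	 */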
2243 tpr = (kvm_lapic_get_cr8(vcpu) & 0x0f) << 4;
2244 vmcs_write32(TPR_THRESHOLD, (max_irr > tpr) ? tpr >> 4 : max_irr >> 4);
2245}
2246
2247static void enable_irq_window(struct kvm_vcpu *vcpu)
2248{
2249 u32 cpu_based_vm_exec_control;
2250
2251 cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
2252 cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING;
2253 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
2254}
2255
2256static void vmx_intr_assist(struct kvm_vcpu *vcpu)
2257{
2258 struct vcpu_vmx *vmx = to_vmx(vcpu);
2259 u32 idtv_info_field, intr_info_field;
2260 int has_ext_irq, interrupt_window_open;
2261 int vector;
2262
2263 update_tpr_threshold(vcpu);
2264
2265 has_ext_irq = kvm_cpu_has_interrupt(vcpu);
2266 intr_info_field = vmcs_read32(VM_ENTRY_INTR_INFO_FIELD);
2267 idtv_info_field = vmx->idt_vectoring_info;
2268 if (intr_info_field & INTR_INFO_VALID_MASK) {
2269 if (idtv_info_field & INTR_INFO_VALID_MASK) {
2270 /* TODO: fault when IDT_Vectoring */
2271 if (printk_ratelimit())
2272 printk(KERN_ERR "Fault when IDT_Vectoring\n");
2273 }
2274 if (has_ext_irq)
2275 enable_irq_window(vcpu);
2276 return;
2277 }
2278 if (unlikely(idtv_info_field & INTR_INFO_VALID_MASK)) {
2279 if ((idtv_info_field & VECTORING_INFO_TYPE_MASK)
2280 == INTR_TYPE_EXT_INTR
2281 && vcpu->arch.rmode.active) {
2282 u8 vect = idtv_info_field & VECTORING_INFO_VECTOR_MASK;
2283
2284 vmx_inject_irq(vcpu, vect);
2285 if (unlikely(has_ext_irq))
2286 enable_irq_window(vcpu);
2287 return;
2288 }
2289
2290 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, idtv_info_field);
2291 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
2292 vmcs_read32(VM_EXIT_INSTRUCTION_LEN));
2293
2294 if (unlikely(idtv_info_field & INTR_INFO_DELIEVER_CODE_MASK))
2295 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
2296 vmcs_read32(IDT_VECTORING_ERROR_CODE));
2297 if (unlikely(has_ext_irq))
2298 enable_irq_window(vcpu);
2299 return;
2300 }
2301 if (!has_ext_irq)
2302 return;
2303 interrupt_window_open =
2304 ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
2305 (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0);
2306 if (interrupt_window_open) {
2307 vector = kvm_cpu_get_interrupt(vcpu);
2308 vmx_inject_irq(vcpu, vector);
2309 kvm_timer_intr_post(vcpu, vector);
2310 } else
2311 enable_irq_window(vcpu);
2312}
2313
2314/*
2315 * Failure to inject an interrupt should give us the information
2316 * in IDT_VECTORING_INFO_FIELD. However, if the failure occurs
2317 * when fetching the interrupt redirection bitmap in the real-mode
2318 * tss, this doesn't happen. So we do it ourselves.
2319 */
2320static void fixup_rmode_irq(struct vcpu_vmx *vmx)
2321{
2322 vmx->rmode.irq.pending = 0;
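	/*
	 * vmx_inject_irq() rewound GUEST_RIP by one byte; if it has moved,
	 * the injection completed and there is nothing to fix up.
	 */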
2323 if (vmcs_readl(GUEST_RIP) + 1 != vmx->rmode.irq.rip)
2324 return;
2325 vmcs_writel(GUEST_RIP, vmx->rmode.irq.rip);
2326 if (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK) {
2327 vmx->idt_vectoring_info &= ~VECTORING_INFO_TYPE_MASK;
2328 vmx->idt_vectoring_info |= INTR_TYPE_EXT_INTR;
2329 return;
2330 }
2331 vmx->idt_vectoring_info =
2332 VECTORING_INFO_VALID_MASK
2333 | INTR_TYPE_EXT_INTR
2334 | vmx->rmode.irq.vector;
2335}
2336
2337static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2338{
2339 struct vcpu_vmx *vmx = to_vmx(vcpu);
2340 u32 intr_info;
2341
2342 /*
2343 * Loading guest fpu may have cleared host cr0.ts
2344 */
2345 vmcs_writel(HOST_CR0, read_cr0());
2346
2347 asm(
2348 /* Store host registers */
2349#ifdef CONFIG_X86_64
2350 "push %%rdx; push %%rbp;"
2351 "push %%rcx \n\t"
2352#else
2353 "push %%edx; push %%ebp;"
2354 "push %%ecx \n\t"
2355#endif
2356 ASM_VMX_VMWRITE_RSP_RDX "\n\t"
 2357		/* Check if vmlaunch or vmresume is needed */
2358 "cmpl $0, %c[launched](%0) \n\t"
2359 /* Load guest registers. Don't clobber flags. */
2360#ifdef CONFIG_X86_64
2361 "mov %c[cr2](%0), %%rax \n\t"
2362 "mov %%rax, %%cr2 \n\t"
2363 "mov %c[rax](%0), %%rax \n\t"
2364 "mov %c[rbx](%0), %%rbx \n\t"
2365 "mov %c[rdx](%0), %%rdx \n\t"
2366 "mov %c[rsi](%0), %%rsi \n\t"
2367 "mov %c[rdi](%0), %%rdi \n\t"
2368 "mov %c[rbp](%0), %%rbp \n\t"
2369 "mov %c[r8](%0), %%r8 \n\t"
2370 "mov %c[r9](%0), %%r9 \n\t"
2371 "mov %c[r10](%0), %%r10 \n\t"
2372 "mov %c[r11](%0), %%r11 \n\t"
2373 "mov %c[r12](%0), %%r12 \n\t"
2374 "mov %c[r13](%0), %%r13 \n\t"
2375 "mov %c[r14](%0), %%r14 \n\t"
2376 "mov %c[r15](%0), %%r15 \n\t"
2377 "mov %c[rcx](%0), %%rcx \n\t" /* kills %0 (rcx) */
2378#else
2379 "mov %c[cr2](%0), %%eax \n\t"
2380 "mov %%eax, %%cr2 \n\t"
2381 "mov %c[rax](%0), %%eax \n\t"
2382 "mov %c[rbx](%0), %%ebx \n\t"
2383 "mov %c[rdx](%0), %%edx \n\t"
2384 "mov %c[rsi](%0), %%esi \n\t"
2385 "mov %c[rdi](%0), %%edi \n\t"
2386 "mov %c[rbp](%0), %%ebp \n\t"
2387 "mov %c[rcx](%0), %%ecx \n\t" /* kills %0 (ecx) */
2388#endif
2389 /* Enter guest mode */
2390 "jne .Llaunched \n\t"
2391 ASM_VMX_VMLAUNCH "\n\t"
2392 "jmp .Lkvm_vmx_return \n\t"
2393 ".Llaunched: " ASM_VMX_VMRESUME "\n\t"
2394 ".Lkvm_vmx_return: "
2395 /* Save guest registers, load host registers, keep flags */
2396#ifdef CONFIG_X86_64
2397 "xchg %0, (%%rsp) \n\t"
2398 "mov %%rax, %c[rax](%0) \n\t"
2399 "mov %%rbx, %c[rbx](%0) \n\t"
2400 "pushq (%%rsp); popq %c[rcx](%0) \n\t"
2401 "mov %%rdx, %c[rdx](%0) \n\t"
2402 "mov %%rsi, %c[rsi](%0) \n\t"
2403 "mov %%rdi, %c[rdi](%0) \n\t"
2404 "mov %%rbp, %c[rbp](%0) \n\t"
2405 "mov %%r8, %c[r8](%0) \n\t"
2406 "mov %%r9, %c[r9](%0) \n\t"
2407 "mov %%r10, %c[r10](%0) \n\t"
2408 "mov %%r11, %c[r11](%0) \n\t"
2409 "mov %%r12, %c[r12](%0) \n\t"
2410 "mov %%r13, %c[r13](%0) \n\t"
2411 "mov %%r14, %c[r14](%0) \n\t"
2412 "mov %%r15, %c[r15](%0) \n\t"
2413 "mov %%cr2, %%rax \n\t"
2414 "mov %%rax, %c[cr2](%0) \n\t"
2415
2416 "pop %%rbp; pop %%rbp; pop %%rdx \n\t"
2417#else
2418 "xchg %0, (%%esp) \n\t"
2419 "mov %%eax, %c[rax](%0) \n\t"
2420 "mov %%ebx, %c[rbx](%0) \n\t"
2421 "pushl (%%esp); popl %c[rcx](%0) \n\t"
2422 "mov %%edx, %c[rdx](%0) \n\t"
2423 "mov %%esi, %c[rsi](%0) \n\t"
2424 "mov %%edi, %c[rdi](%0) \n\t"
2425 "mov %%ebp, %c[rbp](%0) \n\t"
2426 "mov %%cr2, %%eax \n\t"
2427 "mov %%eax, %c[cr2](%0) \n\t"
2428
2429 "pop %%ebp; pop %%ebp; pop %%edx \n\t"
2430#endif
2431 "setbe %c[fail](%0) \n\t"
2432 : : "c"(vmx), "d"((unsigned long)HOST_RSP),
2433 [launched]"i"(offsetof(struct vcpu_vmx, launched)),
2434 [fail]"i"(offsetof(struct vcpu_vmx, fail)),
2435 [rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])),
2436 [rbx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBX])),
2437 [rcx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RCX])),
2438 [rdx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDX])),
2439 [rsi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RSI])),
2440 [rdi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDI])),
2441 [rbp]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBP])),
2442#ifdef CONFIG_X86_64
2443 [r8]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R8])),
2444 [r9]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R9])),
2445 [r10]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R10])),
2446 [r11]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R11])),
2447 [r12]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R12])),
2448 [r13]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R13])),
2449 [r14]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R14])),
2450 [r15]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R15])),
2451#endif
2452 [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2))
2453 : "cc", "memory"
2454#ifdef CONFIG_X86_64
2455 , "rbx", "rdi", "rsi"
2456 , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"
2457#else
 2458		, "ebx", "edi", "esi"
2459#endif
2460 );
2461
2462 vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
2463 if (vmx->rmode.irq.pending)
2464 fixup_rmode_irq(vmx);
2465
2466 vcpu->arch.interrupt_window_open =
2467 (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0;
2468
2469 asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS));
2470 vmx->launched = 1;
2471
2472 intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
2473
2474 /* We need to handle NMIs before interrupts are enabled */
2475 if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200) /* nmi */
2476 asm("int $2");
2477}
2478
2479static void vmx_free_vmcs(struct kvm_vcpu *vcpu)
2480{
2481 struct vcpu_vmx *vmx = to_vmx(vcpu);
2482
2483 if (vmx->vmcs) {
2484 on_each_cpu(__vcpu_clear, vmx, 0, 1);
2485 free_vmcs(vmx->vmcs);
2486 vmx->vmcs = NULL;
2487 }
2488}
2489
2490static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
2491{
2492 struct vcpu_vmx *vmx = to_vmx(vcpu);
2493
2494 vmx_free_vmcs(vcpu);
2495 kfree(vmx->host_msrs);
2496 kfree(vmx->guest_msrs);
2497 kvm_vcpu_uninit(vcpu);
2498 kmem_cache_free(kvm_vcpu_cache, vmx);
2499}
2500
2501static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
2502{
2503 int err;
2504 struct vcpu_vmx *vmx = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
2505 int cpu;
2506
2507 if (!vmx)
2508 return ERR_PTR(-ENOMEM);
2509
2510 err = kvm_vcpu_init(&vmx->vcpu, kvm, id);
2511 if (err)
2512 goto free_vcpu;
2513
2514 vmx->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL);
2515 if (!vmx->guest_msrs) {
2516 err = -ENOMEM;
2517 goto uninit_vcpu;
2518 }
2519
2520 vmx->host_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL);
2521 if (!vmx->host_msrs)
2522 goto free_guest_msrs;
2523
2524 vmx->vmcs = alloc_vmcs();
2525 if (!vmx->vmcs)
2526 goto free_msrs;
2527
2528 vmcs_clear(vmx->vmcs);
2529
2530 cpu = get_cpu();
2531 vmx_vcpu_load(&vmx->vcpu, cpu);
2532 err = vmx_vcpu_setup(vmx);
2533 vmx_vcpu_put(&vmx->vcpu);
2534 put_cpu();
2535 if (err)
2536 goto free_vmcs;
2537
2538 return &vmx->vcpu;
2539
2540free_vmcs:
2541 free_vmcs(vmx->vmcs);
2542free_msrs:
2543 kfree(vmx->host_msrs);
2544free_guest_msrs:
2545 kfree(vmx->guest_msrs);
2546uninit_vcpu:
2547 kvm_vcpu_uninit(&vmx->vcpu);
2548free_vcpu:
2549 kmem_cache_free(kvm_vcpu_cache, vmx);
2550 return ERR_PTR(err);
2551}
2552
2553static void __init vmx_check_processor_compat(void *rtn)
2554{
2555 struct vmcs_config vmcs_conf;
2556
2557 *(int *)rtn = 0;
2558 if (setup_vmcs_config(&vmcs_conf) < 0)
2559 *(int *)rtn = -EIO;
2560 if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config)) != 0) {
2561 printk(KERN_ERR "kvm: CPU %d feature inconsistency!\n",
2562 smp_processor_id());
2563 *(int *)rtn = -EIO;
2564 }
2565}
2566
2567static struct kvm_x86_ops vmx_x86_ops = {
2568 .cpu_has_kvm_support = cpu_has_kvm_support,
2569 .disabled_by_bios = vmx_disabled_by_bios,
2570 .hardware_setup = hardware_setup,
2571 .hardware_unsetup = hardware_unsetup,
2572 .check_processor_compatibility = vmx_check_processor_compat,
2573 .hardware_enable = hardware_enable,
2574 .hardware_disable = hardware_disable,
2575 .cpu_has_accelerated_tpr = cpu_has_vmx_virtualize_apic_accesses,
2576
2577 .vcpu_create = vmx_create_vcpu,
2578 .vcpu_free = vmx_free_vcpu,
2579 .vcpu_reset = vmx_vcpu_reset,
2580
2581 .prepare_guest_switch = vmx_save_host_state,
2582 .vcpu_load = vmx_vcpu_load,
2583 .vcpu_put = vmx_vcpu_put,
2584 .vcpu_decache = vmx_vcpu_decache,
2585
2586 .set_guest_debug = set_guest_debug,
2587 .guest_debug_pre = kvm_guest_debug_pre,
2588 .get_msr = vmx_get_msr,
2589 .set_msr = vmx_set_msr,
2590 .get_segment_base = vmx_get_segment_base,
2591 .get_segment = vmx_get_segment,
2592 .set_segment = vmx_set_segment,
2593 .get_cs_db_l_bits = vmx_get_cs_db_l_bits,
2594 .decache_cr4_guest_bits = vmx_decache_cr4_guest_bits,
2595 .set_cr0 = vmx_set_cr0,
2596 .set_cr3 = vmx_set_cr3,
2597 .set_cr4 = vmx_set_cr4,
2598#ifdef CONFIG_X86_64
2599 .set_efer = vmx_set_efer,
2600#endif
2601 .get_idt = vmx_get_idt,
2602 .set_idt = vmx_set_idt,
2603 .get_gdt = vmx_get_gdt,
2604 .set_gdt = vmx_set_gdt,
2605 .cache_regs = vcpu_load_rsp_rip,
2606 .decache_regs = vcpu_put_rsp_rip,
2607 .get_rflags = vmx_get_rflags,
2608 .set_rflags = vmx_set_rflags,
2609
2610 .tlb_flush = vmx_flush_tlb,
2611
2612 .run = vmx_vcpu_run,
2613 .handle_exit = kvm_handle_exit,
2614 .skip_emulated_instruction = skip_emulated_instruction,
2615 .patch_hypercall = vmx_patch_hypercall,
2616 .get_irq = vmx_get_irq,
2617 .set_irq = vmx_inject_irq,
2618 .queue_exception = vmx_queue_exception,
2619 .exception_injected = vmx_exception_injected,
2620 .inject_pending_irq = vmx_intr_assist,
2621 .inject_pending_vectors = do_interrupt_requests,
2622
2623 .set_tss_addr = vmx_set_tss_addr,
2624};
2625
2626static int __init vmx_init(void)
2627{
2628 void *iova;
2629 int r;
2630
2631 vmx_io_bitmap_a = alloc_page(GFP_KERNEL | __GFP_HIGHMEM);
2632 if (!vmx_io_bitmap_a)
2633 return -ENOMEM;
2634
2635 vmx_io_bitmap_b = alloc_page(GFP_KERNEL | __GFP_HIGHMEM);
2636 if (!vmx_io_bitmap_b) {
2637 r = -ENOMEM;
2638 goto out;
2639 }
2640
2641 /*
2642 * Allow direct access to the PC debug port (it is often used for I/O
2643 * delays, but the vmexits simply slow things down).
2644 */
2645 iova = kmap(vmx_io_bitmap_a);
2646 memset(iova, 0xff, PAGE_SIZE);
2647 clear_bit(0x80, iova);
2648 kunmap(vmx_io_bitmap_a);
2649
2650 iova = kmap(vmx_io_bitmap_b);
2651 memset(iova, 0xff, PAGE_SIZE);
2652 kunmap(vmx_io_bitmap_b);
2653
2654 r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx), THIS_MODULE);
2655 if (r)
2656 goto out1;
2657
2658 if (bypass_guest_pf)
2659 kvm_mmu_set_nonpresent_ptes(~0xffeull, 0ull);
2660
2661 return 0;
2662
2663out1:
2664 __free_page(vmx_io_bitmap_b);
2665out:
2666 __free_page(vmx_io_bitmap_a);
2667 return r;
2668}
2669
2670static void __exit vmx_exit(void)
2671{
2672 __free_page(vmx_io_bitmap_b);
2673 __free_page(vmx_io_bitmap_a);
2674
2675 kvm_exit();
2676}
2677
2678module_init(vmx_init)
2679module_exit(vmx_exit)
diff --git a/arch/x86/kvm/vmx.h b/arch/x86/kvm/vmx.h
new file mode 100644
index 000000000000..d52ae8d7303d
--- /dev/null
+++ b/arch/x86/kvm/vmx.h
@@ -0,0 +1,324 @@
1#ifndef VMX_H
2#define VMX_H
3
4/*
5 * vmx.h: VMX Architecture related definitions
6 * Copyright (c) 2004, Intel Corporation.
7 *
8 * This program is free software; you can redistribute it and/or modify it
9 * under the terms and conditions of the GNU General Public License,
10 * version 2, as published by the Free Software Foundation.
11 *
12 * This program is distributed in the hope it will be useful, but WITHOUT
13 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
15 * more details.
16 *
17 * You should have received a copy of the GNU General Public License along with
18 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
19 * Place - Suite 330, Boston, MA 02111-1307 USA.
20 *
21 * A few random additions are:
22 * Copyright (C) 2006 Qumranet
23 * Avi Kivity <avi@qumranet.com>
24 * Yaniv Kamay <yaniv@qumranet.com>
25 *
26 */
27
28/*
29 * Definitions of Primary Processor-Based VM-Execution Controls.
30 */
31#define CPU_BASED_VIRTUAL_INTR_PENDING 0x00000004
32#define CPU_BASED_USE_TSC_OFFSETING 0x00000008
33#define CPU_BASED_HLT_EXITING 0x00000080
34#define CPU_BASED_INVLPG_EXITING 0x00000200
35#define CPU_BASED_MWAIT_EXITING 0x00000400
36#define CPU_BASED_RDPMC_EXITING 0x00000800
37#define CPU_BASED_RDTSC_EXITING 0x00001000
38#define CPU_BASED_CR8_LOAD_EXITING 0x00080000
39#define CPU_BASED_CR8_STORE_EXITING 0x00100000
40#define CPU_BASED_TPR_SHADOW 0x00200000
41#define CPU_BASED_MOV_DR_EXITING 0x00800000
42#define CPU_BASED_UNCOND_IO_EXITING 0x01000000
43#define CPU_BASED_USE_IO_BITMAPS 0x02000000
44#define CPU_BASED_USE_MSR_BITMAPS 0x10000000
45#define CPU_BASED_MONITOR_EXITING 0x20000000
46#define CPU_BASED_PAUSE_EXITING 0x40000000
47#define CPU_BASED_ACTIVATE_SECONDARY_CONTROLS 0x80000000
48/*
49 * Definitions of Secondary Processor-Based VM-Execution Controls.
50 */
51#define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001
52#define SECONDARY_EXEC_WBINVD_EXITING 0x00000040
53
54
55#define PIN_BASED_EXT_INTR_MASK 0x00000001
56#define PIN_BASED_NMI_EXITING 0x00000008
57#define PIN_BASED_VIRTUAL_NMIS 0x00000020
58
59#define VM_EXIT_HOST_ADDR_SPACE_SIZE 0x00000200
60#define VM_EXIT_ACK_INTR_ON_EXIT 0x00008000
61
62#define VM_ENTRY_IA32E_MODE 0x00000200
63#define VM_ENTRY_SMM 0x00000400
64#define VM_ENTRY_DEACT_DUAL_MONITOR 0x00000800
65
66/* VMCS Encodings */
67enum vmcs_field {
68 GUEST_ES_SELECTOR = 0x00000800,
69 GUEST_CS_SELECTOR = 0x00000802,
70 GUEST_SS_SELECTOR = 0x00000804,
71 GUEST_DS_SELECTOR = 0x00000806,
72 GUEST_FS_SELECTOR = 0x00000808,
73 GUEST_GS_SELECTOR = 0x0000080a,
74 GUEST_LDTR_SELECTOR = 0x0000080c,
75 GUEST_TR_SELECTOR = 0x0000080e,
76 HOST_ES_SELECTOR = 0x00000c00,
77 HOST_CS_SELECTOR = 0x00000c02,
78 HOST_SS_SELECTOR = 0x00000c04,
79 HOST_DS_SELECTOR = 0x00000c06,
80 HOST_FS_SELECTOR = 0x00000c08,
81 HOST_GS_SELECTOR = 0x00000c0a,
82 HOST_TR_SELECTOR = 0x00000c0c,
83 IO_BITMAP_A = 0x00002000,
84 IO_BITMAP_A_HIGH = 0x00002001,
85 IO_BITMAP_B = 0x00002002,
86 IO_BITMAP_B_HIGH = 0x00002003,
87 MSR_BITMAP = 0x00002004,
88 MSR_BITMAP_HIGH = 0x00002005,
89 VM_EXIT_MSR_STORE_ADDR = 0x00002006,
90 VM_EXIT_MSR_STORE_ADDR_HIGH = 0x00002007,
91 VM_EXIT_MSR_LOAD_ADDR = 0x00002008,
92 VM_EXIT_MSR_LOAD_ADDR_HIGH = 0x00002009,
93 VM_ENTRY_MSR_LOAD_ADDR = 0x0000200a,
94 VM_ENTRY_MSR_LOAD_ADDR_HIGH = 0x0000200b,
95 TSC_OFFSET = 0x00002010,
96 TSC_OFFSET_HIGH = 0x00002011,
97 VIRTUAL_APIC_PAGE_ADDR = 0x00002012,
98 VIRTUAL_APIC_PAGE_ADDR_HIGH = 0x00002013,
99 APIC_ACCESS_ADDR = 0x00002014,
100 APIC_ACCESS_ADDR_HIGH = 0x00002015,
101 VMCS_LINK_POINTER = 0x00002800,
102 VMCS_LINK_POINTER_HIGH = 0x00002801,
103 GUEST_IA32_DEBUGCTL = 0x00002802,
104 GUEST_IA32_DEBUGCTL_HIGH = 0x00002803,
105 PIN_BASED_VM_EXEC_CONTROL = 0x00004000,
106 CPU_BASED_VM_EXEC_CONTROL = 0x00004002,
107 EXCEPTION_BITMAP = 0x00004004,
108 PAGE_FAULT_ERROR_CODE_MASK = 0x00004006,
109 PAGE_FAULT_ERROR_CODE_MATCH = 0x00004008,
110 CR3_TARGET_COUNT = 0x0000400a,
111 VM_EXIT_CONTROLS = 0x0000400c,
112 VM_EXIT_MSR_STORE_COUNT = 0x0000400e,
113 VM_EXIT_MSR_LOAD_COUNT = 0x00004010,
114 VM_ENTRY_CONTROLS = 0x00004012,
115 VM_ENTRY_MSR_LOAD_COUNT = 0x00004014,
116 VM_ENTRY_INTR_INFO_FIELD = 0x00004016,
117 VM_ENTRY_EXCEPTION_ERROR_CODE = 0x00004018,
118 VM_ENTRY_INSTRUCTION_LEN = 0x0000401a,
119 TPR_THRESHOLD = 0x0000401c,
120 SECONDARY_VM_EXEC_CONTROL = 0x0000401e,
121 VM_INSTRUCTION_ERROR = 0x00004400,
122 VM_EXIT_REASON = 0x00004402,
123 VM_EXIT_INTR_INFO = 0x00004404,
124 VM_EXIT_INTR_ERROR_CODE = 0x00004406,
125 IDT_VECTORING_INFO_FIELD = 0x00004408,
126 IDT_VECTORING_ERROR_CODE = 0x0000440a,
127 VM_EXIT_INSTRUCTION_LEN = 0x0000440c,
128 VMX_INSTRUCTION_INFO = 0x0000440e,
129 GUEST_ES_LIMIT = 0x00004800,
130 GUEST_CS_LIMIT = 0x00004802,
131 GUEST_SS_LIMIT = 0x00004804,
132 GUEST_DS_LIMIT = 0x00004806,
133 GUEST_FS_LIMIT = 0x00004808,
134 GUEST_GS_LIMIT = 0x0000480a,
135 GUEST_LDTR_LIMIT = 0x0000480c,
136 GUEST_TR_LIMIT = 0x0000480e,
137 GUEST_GDTR_LIMIT = 0x00004810,
138 GUEST_IDTR_LIMIT = 0x00004812,
139 GUEST_ES_AR_BYTES = 0x00004814,
140 GUEST_CS_AR_BYTES = 0x00004816,
141 GUEST_SS_AR_BYTES = 0x00004818,
142 GUEST_DS_AR_BYTES = 0x0000481a,
143 GUEST_FS_AR_BYTES = 0x0000481c,
144 GUEST_GS_AR_BYTES = 0x0000481e,
145 GUEST_LDTR_AR_BYTES = 0x00004820,
146 GUEST_TR_AR_BYTES = 0x00004822,
147 GUEST_INTERRUPTIBILITY_INFO = 0x00004824,
148 GUEST_ACTIVITY_STATE = 0X00004826,
149 GUEST_SYSENTER_CS = 0x0000482A,
150 HOST_IA32_SYSENTER_CS = 0x00004c00,
151 CR0_GUEST_HOST_MASK = 0x00006000,
152 CR4_GUEST_HOST_MASK = 0x00006002,
153 CR0_READ_SHADOW = 0x00006004,
154 CR4_READ_SHADOW = 0x00006006,
155 CR3_TARGET_VALUE0 = 0x00006008,
156 CR3_TARGET_VALUE1 = 0x0000600a,
157 CR3_TARGET_VALUE2 = 0x0000600c,
158 CR3_TARGET_VALUE3 = 0x0000600e,
159 EXIT_QUALIFICATION = 0x00006400,
160 GUEST_LINEAR_ADDRESS = 0x0000640a,
161 GUEST_CR0 = 0x00006800,
162 GUEST_CR3 = 0x00006802,
163 GUEST_CR4 = 0x00006804,
164 GUEST_ES_BASE = 0x00006806,
165 GUEST_CS_BASE = 0x00006808,
166 GUEST_SS_BASE = 0x0000680a,
167 GUEST_DS_BASE = 0x0000680c,
168 GUEST_FS_BASE = 0x0000680e,
169 GUEST_GS_BASE = 0x00006810,
170 GUEST_LDTR_BASE = 0x00006812,
171 GUEST_TR_BASE = 0x00006814,
172 GUEST_GDTR_BASE = 0x00006816,
173 GUEST_IDTR_BASE = 0x00006818,
174 GUEST_DR7 = 0x0000681a,
175 GUEST_RSP = 0x0000681c,
176 GUEST_RIP = 0x0000681e,
177 GUEST_RFLAGS = 0x00006820,
178 GUEST_PENDING_DBG_EXCEPTIONS = 0x00006822,
179 GUEST_SYSENTER_ESP = 0x00006824,
180 GUEST_SYSENTER_EIP = 0x00006826,
181 HOST_CR0 = 0x00006c00,
182 HOST_CR3 = 0x00006c02,
183 HOST_CR4 = 0x00006c04,
184 HOST_FS_BASE = 0x00006c06,
185 HOST_GS_BASE = 0x00006c08,
186 HOST_TR_BASE = 0x00006c0a,
187 HOST_GDTR_BASE = 0x00006c0c,
188 HOST_IDTR_BASE = 0x00006c0e,
189 HOST_IA32_SYSENTER_ESP = 0x00006c10,
190 HOST_IA32_SYSENTER_EIP = 0x00006c12,
191 HOST_RSP = 0x00006c14,
192 HOST_RIP = 0x00006c16,
193};
194
195#define VMX_EXIT_REASONS_FAILED_VMENTRY 0x80000000
196
197#define EXIT_REASON_EXCEPTION_NMI 0
198#define EXIT_REASON_EXTERNAL_INTERRUPT 1
199#define EXIT_REASON_TRIPLE_FAULT 2
200
201#define EXIT_REASON_PENDING_INTERRUPT 7
202
203#define EXIT_REASON_TASK_SWITCH 9
204#define EXIT_REASON_CPUID 10
205#define EXIT_REASON_HLT 12
206#define EXIT_REASON_INVLPG 14
207#define EXIT_REASON_RDPMC 15
208#define EXIT_REASON_RDTSC 16
209#define EXIT_REASON_VMCALL 18
210#define EXIT_REASON_VMCLEAR 19
211#define EXIT_REASON_VMLAUNCH 20
212#define EXIT_REASON_VMPTRLD 21
213#define EXIT_REASON_VMPTRST 22
214#define EXIT_REASON_VMREAD 23
215#define EXIT_REASON_VMRESUME 24
216#define EXIT_REASON_VMWRITE 25
217#define EXIT_REASON_VMOFF 26
218#define EXIT_REASON_VMON 27
219#define EXIT_REASON_CR_ACCESS 28
220#define EXIT_REASON_DR_ACCESS 29
221#define EXIT_REASON_IO_INSTRUCTION 30
222#define EXIT_REASON_MSR_READ 31
223#define EXIT_REASON_MSR_WRITE 32
224#define EXIT_REASON_MWAIT_INSTRUCTION 36
225#define EXIT_REASON_TPR_BELOW_THRESHOLD 43
226#define EXIT_REASON_APIC_ACCESS 44
227#define EXIT_REASON_WBINVD 54
228
229/*
230 * Interruption-information format
231 */
232#define INTR_INFO_VECTOR_MASK 0xff /* 7:0 */
233#define INTR_INFO_INTR_TYPE_MASK 0x700 /* 10:8 */
234#define INTR_INFO_DELIEVER_CODE_MASK 0x800 /* 11 */
235#define INTR_INFO_VALID_MASK 0x80000000 /* 31 */
236
237#define VECTORING_INFO_VECTOR_MASK INTR_INFO_VECTOR_MASK
238#define VECTORING_INFO_TYPE_MASK INTR_INFO_INTR_TYPE_MASK
239#define VECTORING_INFO_DELIEVER_CODE_MASK INTR_INFO_DELIEVER_CODE_MASK
240#define VECTORING_INFO_VALID_MASK INTR_INFO_VALID_MASK
241
242#define INTR_TYPE_EXT_INTR (0 << 8) /* external interrupt */
243#define INTR_TYPE_EXCEPTION (3 << 8) /* processor exception */
244#define INTR_TYPE_SOFT_INTR (4 << 8) /* software interrupt */
245
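/*
 * An event is injected by writing vector | type | INTR_INFO_VALID_MASK
 * (plus INTR_INFO_DELIEVER_CODE_MASK when an error code is delivered)
 * into VM_ENTRY_INTR_INFO_FIELD; see vmx_inject_irq() in vmx.c.
 */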
246/*
247 * Exit Qualifications for MOV for Control Register Access
248 */
249#define CONTROL_REG_ACCESS_NUM 0x7 /* 2:0, number of control reg.*/
250#define CONTROL_REG_ACCESS_TYPE 0x30 /* 5:4, access type */
251#define CONTROL_REG_ACCESS_REG 0xf00 /* 10:8, general purpose reg. */
252#define LMSW_SOURCE_DATA_SHIFT 16
253#define LMSW_SOURCE_DATA (0xFFFF << LMSW_SOURCE_DATA_SHIFT) /* 16:31 lmsw source */
254#define REG_EAX (0 << 8)
255#define REG_ECX (1 << 8)
256#define REG_EDX (2 << 8)
257#define REG_EBX (3 << 8)
258#define REG_ESP (4 << 8)
259#define REG_EBP (5 << 8)
260#define REG_ESI (6 << 8)
261#define REG_EDI (7 << 8)
262#define REG_R8 (8 << 8)
263#define REG_R9 (9 << 8)
264#define REG_R10 (10 << 8)
265#define REG_R11 (11 << 8)
266#define REG_R12 (12 << 8)
267#define REG_R13 (13 << 8)
268#define REG_R14 (14 << 8)
269#define REG_R15 (15 << 8)
270
271/*
272 * Exit Qualifications for MOV for Debug Register Access
273 */
274#define DEBUG_REG_ACCESS_NUM 0x7 /* 2:0, number of debug reg. */
275#define DEBUG_REG_ACCESS_TYPE 0x10 /* 4, direction of access */
276#define TYPE_MOV_TO_DR (0 << 4)
277#define TYPE_MOV_FROM_DR (1 << 4)
278#define DEBUG_REG_ACCESS_REG 0xf00 /* 11:8, general purpose reg. */
279
280
281/* segment AR */
282#define SEGMENT_AR_L_MASK (1 << 13)
283
284#define AR_TYPE_ACCESSES_MASK 1
285#define AR_TYPE_READABLE_MASK (1 << 1)
286#define AR_TYPE_WRITEABLE_MASK (1 << 2)
287#define AR_TYPE_CODE_MASK (1 << 3)
288#define AR_TYPE_MASK 0x0f
289#define AR_TYPE_BUSY_64_TSS 11
290#define AR_TYPE_BUSY_32_TSS 11
291#define AR_TYPE_BUSY_16_TSS 3
292#define AR_TYPE_LDT 2
293
294#define AR_UNUSABLE_MASK (1 << 16)
295#define AR_S_MASK (1 << 4)
296#define AR_P_MASK (1 << 7)
297#define AR_L_MASK (1 << 13)
298#define AR_DB_MASK (1 << 14)
299#define AR_G_MASK (1 << 15)
300#define AR_DPL_SHIFT 5
301#define AR_DPL(ar) (((ar) >> AR_DPL_SHIFT) & 3)
302
303#define AR_RESERVD_MASK 0xfffe0f00
304
305#define MSR_IA32_VMX_BASIC 0x480
306#define MSR_IA32_VMX_PINBASED_CTLS 0x481
307#define MSR_IA32_VMX_PROCBASED_CTLS 0x482
308#define MSR_IA32_VMX_EXIT_CTLS 0x483
309#define MSR_IA32_VMX_ENTRY_CTLS 0x484
310#define MSR_IA32_VMX_MISC 0x485
311#define MSR_IA32_VMX_CR0_FIXED0 0x486
312#define MSR_IA32_VMX_CR0_FIXED1 0x487
313#define MSR_IA32_VMX_CR4_FIXED0 0x488
314#define MSR_IA32_VMX_CR4_FIXED1 0x489
315#define MSR_IA32_VMX_VMCS_ENUM 0x48a
316#define MSR_IA32_VMX_PROCBASED_CTLS2 0x48b
317
318#define MSR_IA32_FEATURE_CONTROL 0x3a
319#define MSR_IA32_FEATURE_CONTROL_LOCKED 0x1
320#define MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED 0x4
321
322#define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT 9
323
324#endif
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
new file mode 100644
index 000000000000..8f94a0b89dff
--- /dev/null
+++ b/arch/x86/kvm/x86.c
@@ -0,0 +1,3287 @@
1/*
2 * Kernel-based Virtual Machine driver for Linux
3 *
4 * derived from drivers/kvm/kvm_main.c
5 *
6 * Copyright (C) 2006 Qumranet, Inc.
7 *
8 * Authors:
9 * Avi Kivity <avi@qumranet.com>
10 * Yaniv Kamay <yaniv@qumranet.com>
11 *
12 * This work is licensed under the terms of the GNU GPL, version 2. See
13 * the COPYING file in the top-level directory.
14 *
15 */
16
17#include <linux/kvm_host.h>
18#include "segment_descriptor.h"
19#include "irq.h"
20#include "mmu.h"
21
22#include <linux/kvm.h>
23#include <linux/fs.h>
24#include <linux/vmalloc.h>
25#include <linux/module.h>
26#include <linux/mman.h>
27#include <linux/highmem.h>
28
29#include <asm/uaccess.h>
30#include <asm/msr.h>
31
32#define MAX_IO_MSRS 256
33#define CR0_RESERVED_BITS \
34 (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
35 | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \
36 | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG))
37#define CR4_RESERVED_BITS \
38 (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\
39 | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \
40 | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR \
41 | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE))
42
43#define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
44#define EFER_RESERVED_BITS 0xfffffffffffff2fe
45
46#define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM
47#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
48
49struct kvm_x86_ops *kvm_x86_ops;
50
51struct kvm_stats_debugfs_item debugfs_entries[] = {
52 { "pf_fixed", VCPU_STAT(pf_fixed) },
53 { "pf_guest", VCPU_STAT(pf_guest) },
54 { "tlb_flush", VCPU_STAT(tlb_flush) },
55 { "invlpg", VCPU_STAT(invlpg) },
56 { "exits", VCPU_STAT(exits) },
57 { "io_exits", VCPU_STAT(io_exits) },
58 { "mmio_exits", VCPU_STAT(mmio_exits) },
59 { "signal_exits", VCPU_STAT(signal_exits) },
60 { "irq_window", VCPU_STAT(irq_window_exits) },
61 { "halt_exits", VCPU_STAT(halt_exits) },
62 { "halt_wakeup", VCPU_STAT(halt_wakeup) },
63 { "request_irq", VCPU_STAT(request_irq_exits) },
64 { "irq_exits", VCPU_STAT(irq_exits) },
65 { "host_state_reload", VCPU_STAT(host_state_reload) },
66 { "efer_reload", VCPU_STAT(efer_reload) },
67 { "fpu_reload", VCPU_STAT(fpu_reload) },
68 { "insn_emulation", VCPU_STAT(insn_emulation) },
69 { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) },
70 { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
71 { "mmu_pte_write", VM_STAT(mmu_pte_write) },
72 { "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
73 { "mmu_pde_zapped", VM_STAT(mmu_pde_zapped) },
74 { "mmu_flooded", VM_STAT(mmu_flooded) },
75 { "mmu_recycled", VM_STAT(mmu_recycled) },
76 { "mmu_cache_miss", VM_STAT(mmu_cache_miss) },
77 { "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
78 { NULL }
79};
80
81
82unsigned long segment_base(u16 selector)
83{
84 struct descriptor_table gdt;
85 struct segment_descriptor *d;
86 unsigned long table_base;
87 unsigned long v;
88
89 if (selector == 0)
90 return 0;
91
92 asm("sgdt %0" : "=m"(gdt));
93 table_base = gdt.base;
94
95 if (selector & 4) { /* from ldt */
96 u16 ldt_selector;
97
98 asm("sldt %0" : "=g"(ldt_selector));
99 table_base = segment_base(ldt_selector);
100 }
101 d = (struct segment_descriptor *)(table_base + (selector & ~7));
102 v = d->base_low | ((unsigned long)d->base_mid << 16) |
103 ((unsigned long)d->base_high << 24);
104#ifdef CONFIG_X86_64
105 if (d->system == 0 && (d->type == 2 || d->type == 9 || d->type == 11))
106 v |= ((unsigned long) \
107 ((struct segment_descriptor_64 *)d)->base_higher) << 32;
108#endif
109 return v;
110}
111EXPORT_SYMBOL_GPL(segment_base);
112
113u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
114{
115 if (irqchip_in_kernel(vcpu->kvm))
116 return vcpu->arch.apic_base;
117 else
118 return vcpu->arch.apic_base;
119}
120EXPORT_SYMBOL_GPL(kvm_get_apic_base);
121
122void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data)
123{
124 /* TODO: reserve bits check */
125 if (irqchip_in_kernel(vcpu->kvm))
126 kvm_lapic_set_base(vcpu, data);
127 else
128 vcpu->arch.apic_base = data;
129}
130EXPORT_SYMBOL_GPL(kvm_set_apic_base);
131
132void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)
133{
134 WARN_ON(vcpu->arch.exception.pending);
135 vcpu->arch.exception.pending = true;
136 vcpu->arch.exception.has_error_code = false;
137 vcpu->arch.exception.nr = nr;
138}
139EXPORT_SYMBOL_GPL(kvm_queue_exception);
140
141void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr,
142 u32 error_code)
143{
144 ++vcpu->stat.pf_guest;
145 if (vcpu->arch.exception.pending && vcpu->arch.exception.nr == PF_VECTOR) {
146 printk(KERN_DEBUG "kvm: inject_page_fault:"
147 " double fault 0x%lx\n", addr);
148 vcpu->arch.exception.nr = DF_VECTOR;
149 vcpu->arch.exception.error_code = 0;
150 return;
151 }
152 vcpu->arch.cr2 = addr;
153 kvm_queue_exception_e(vcpu, PF_VECTOR, error_code);
154}
155
156void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
157{
158 WARN_ON(vcpu->arch.exception.pending);
159 vcpu->arch.exception.pending = true;
160 vcpu->arch.exception.has_error_code = true;
161 vcpu->arch.exception.nr = nr;
162 vcpu->arch.exception.error_code = error_code;
163}
164EXPORT_SYMBOL_GPL(kvm_queue_exception_e);
165
166static void __queue_exception(struct kvm_vcpu *vcpu)
167{
168 kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr,
169 vcpu->arch.exception.has_error_code,
170 vcpu->arch.exception.error_code);
171}
172
173/*
 174 * Load the pae pdptrs.  Return true if they are all valid.
175 */
176int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
177{
178 gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
179 unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2;
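	/*
	 * In PAE mode cr3 bits 11:5 locate the 32-byte-aligned PDPT within
	 * its page; the "<< 2" turns that 32-byte slot into a u64 index.
	 */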
180 int i;
181 int ret;
182 u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)];
183
184 down_read(&current->mm->mmap_sem);
185 ret = kvm_read_guest_page(vcpu->kvm, pdpt_gfn, pdpte,
186 offset * sizeof(u64), sizeof(pdpte));
187 if (ret < 0) {
188 ret = 0;
189 goto out;
190 }
191 for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
192 if ((pdpte[i] & 1) && (pdpte[i] & 0xfffffff0000001e6ull)) {
193 ret = 0;
194 goto out;
195 }
196 }
197 ret = 1;
198
199 memcpy(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs));
200out:
201 up_read(&current->mm->mmap_sem);
202
203 return ret;
204}
205
206static bool pdptrs_changed(struct kvm_vcpu *vcpu)
207{
208 u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)];
209 bool changed = true;
210 int r;
211
212 if (is_long_mode(vcpu) || !is_pae(vcpu))
213 return false;
214
215 down_read(&current->mm->mmap_sem);
216 r = kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u, pdpte, sizeof(pdpte));
217 if (r < 0)
218 goto out;
219 changed = memcmp(pdpte, vcpu->arch.pdptrs, sizeof(pdpte)) != 0;
220out:
221 up_read(&current->mm->mmap_sem);
222
223 return changed;
224}
225
226void set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
227{
228 if (cr0 & CR0_RESERVED_BITS) {
229 printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n",
230 cr0, vcpu->arch.cr0);
231 kvm_inject_gp(vcpu, 0);
232 return;
233 }
234
235 if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) {
236 printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n");
237 kvm_inject_gp(vcpu, 0);
238 return;
239 }
240
241 if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) {
242 printk(KERN_DEBUG "set_cr0: #GP, set PG flag "
243 "and a clear PE flag\n");
244 kvm_inject_gp(vcpu, 0);
245 return;
246 }
247
248 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
249#ifdef CONFIG_X86_64
250 if ((vcpu->arch.shadow_efer & EFER_LME)) {
251 int cs_db, cs_l;
252
253 if (!is_pae(vcpu)) {
254 printk(KERN_DEBUG "set_cr0: #GP, start paging "
255 "in long mode while PAE is disabled\n");
256 kvm_inject_gp(vcpu, 0);
257 return;
258 }
259 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
260 if (cs_l) {
261 printk(KERN_DEBUG "set_cr0: #GP, start paging "
262 "in long mode while CS.L == 1\n");
263 kvm_inject_gp(vcpu, 0);
264 return;
265
266 }
267 } else
268#endif
269 if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3)) {
270 printk(KERN_DEBUG "set_cr0: #GP, pdptrs "
271 "reserved bits\n");
272 kvm_inject_gp(vcpu, 0);
273 return;
274 }
275
276 }
277
278 kvm_x86_ops->set_cr0(vcpu, cr0);
279 vcpu->arch.cr0 = cr0;
280
281 kvm_mmu_reset_context(vcpu);
282 return;
283}
284EXPORT_SYMBOL_GPL(set_cr0);
285
286void lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
287{
288 set_cr0(vcpu, (vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f));
289}
290EXPORT_SYMBOL_GPL(lmsw);
291
292void set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
293{
294 if (cr4 & CR4_RESERVED_BITS) {
295 printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n");
296 kvm_inject_gp(vcpu, 0);
297 return;
298 }
299
300 if (is_long_mode(vcpu)) {
301 if (!(cr4 & X86_CR4_PAE)) {
302 printk(KERN_DEBUG "set_cr4: #GP, clearing PAE while "
303 "in long mode\n");
304 kvm_inject_gp(vcpu, 0);
305 return;
306 }
307 } else if (is_paging(vcpu) && !is_pae(vcpu) && (cr4 & X86_CR4_PAE)
308 && !load_pdptrs(vcpu, vcpu->arch.cr3)) {
309 printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n");
310 kvm_inject_gp(vcpu, 0);
311 return;
312 }
313
314 if (cr4 & X86_CR4_VMXE) {
315 printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n");
316 kvm_inject_gp(vcpu, 0);
317 return;
318 }
319 kvm_x86_ops->set_cr4(vcpu, cr4);
320 vcpu->arch.cr4 = cr4;
321 kvm_mmu_reset_context(vcpu);
322}
323EXPORT_SYMBOL_GPL(set_cr4);
324
325void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
326{
327 if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) {
328 kvm_mmu_flush_tlb(vcpu);
329 return;
330 }
331
332 if (is_long_mode(vcpu)) {
333 if (cr3 & CR3_L_MODE_RESERVED_BITS) {
334 printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n");
335 kvm_inject_gp(vcpu, 0);
336 return;
337 }
338 } else {
339 if (is_pae(vcpu)) {
340 if (cr3 & CR3_PAE_RESERVED_BITS) {
341 printk(KERN_DEBUG
342 "set_cr3: #GP, reserved bits\n");
343 kvm_inject_gp(vcpu, 0);
344 return;
345 }
346 if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) {
347 printk(KERN_DEBUG "set_cr3: #GP, pdptrs "
348 "reserved bits\n");
349 kvm_inject_gp(vcpu, 0);
350 return;
351 }
352 }
353 /*
354 * We don't check reserved bits in nonpae mode, because
355 * this isn't enforced, and VMware depends on this.
356 */
357 }
358
359 down_read(&current->mm->mmap_sem);
360 /*
361 * Does the new cr3 value map to physical memory? (Note, we
362 * catch an invalid cr3 even in real-mode, because it would
363 * cause trouble later on when we turn on paging anyway.)
364 *
365 * A real CPU would silently accept an invalid cr3 and would
366 * attempt to use it - with largely undefined (and often hard
367 * to debug) behavior on the guest side.
368 */
369 if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT)))
370 kvm_inject_gp(vcpu, 0);
371 else {
372 vcpu->arch.cr3 = cr3;
373 vcpu->arch.mmu.new_cr3(vcpu);
374 }
375 up_read(&current->mm->mmap_sem);
376}
377EXPORT_SYMBOL_GPL(set_cr3);
378
379void set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
380{
381 if (cr8 & CR8_RESERVED_BITS) {
382 printk(KERN_DEBUG "set_cr8: #GP, reserved bits 0x%lx\n", cr8);
383 kvm_inject_gp(vcpu, 0);
384 return;
385 }
386 if (irqchip_in_kernel(vcpu->kvm))
387 kvm_lapic_set_tpr(vcpu, cr8);
388 else
389 vcpu->arch.cr8 = cr8;
390}
391EXPORT_SYMBOL_GPL(set_cr8);
392
393unsigned long get_cr8(struct kvm_vcpu *vcpu)
394{
395 if (irqchip_in_kernel(vcpu->kvm))
396 return kvm_lapic_get_cr8(vcpu);
397 else
398 return vcpu->arch.cr8;
399}
400EXPORT_SYMBOL_GPL(get_cr8);
401
402/*
403 * List of msr numbers which we expose to userspace through KVM_GET_MSRS
404 * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
405 *
406 * This list is modified at module load time to reflect the
407 * capabilities of the host cpu.
408 */
409static u32 msrs_to_save[] = {
410 MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
411 MSR_K6_STAR,
412#ifdef CONFIG_X86_64
413 MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
414#endif
415 MSR_IA32_TIME_STAMP_COUNTER,
416};
417
418static unsigned num_msrs_to_save;
419
420static u32 emulated_msrs[] = {
421 MSR_IA32_MISC_ENABLE,
422};
423
424#ifdef CONFIG_X86_64
425
426static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
427{
428 if (efer & EFER_RESERVED_BITS) {
429 printk(KERN_DEBUG "set_efer: 0x%llx #GP, reserved bits\n",
430 efer);
431 kvm_inject_gp(vcpu, 0);
432 return;
433 }
434
435 if (is_paging(vcpu)
436 && (vcpu->arch.shadow_efer & EFER_LME) != (efer & EFER_LME)) {
437 printk(KERN_DEBUG "set_efer: #GP, change LME while paging\n");
438 kvm_inject_gp(vcpu, 0);
439 return;
440 }
441
442 kvm_x86_ops->set_efer(vcpu, efer);
443
444 efer &= ~EFER_LMA;
445 efer |= vcpu->arch.shadow_efer & EFER_LMA;
446
447 vcpu->arch.shadow_efer = efer;
448}
449
450#endif
451
452/*
 453 * Writes msr value into the appropriate "register".
454 * Returns 0 on success, non-0 otherwise.
455 * Assumes vcpu_load() was already called.
456 */
457int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
458{
459 return kvm_x86_ops->set_msr(vcpu, msr_index, data);
460}
461
462/*
463 * Adapt set_msr() to msr_io()'s calling convention
464 */
465static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
466{
467 return kvm_set_msr(vcpu, index, *data);
468}
469
470
471int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
472{
473 switch (msr) {
474#ifdef CONFIG_X86_64
475 case MSR_EFER:
476 set_efer(vcpu, data);
477 break;
478#endif
479 case MSR_IA32_MC0_STATUS:
480 pr_unimpl(vcpu, "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n",
481 __FUNCTION__, data);
482 break;
483 case MSR_IA32_MCG_STATUS:
484 pr_unimpl(vcpu, "%s: MSR_IA32_MCG_STATUS 0x%llx, nop\n",
485 __FUNCTION__, data);
486 break;
487 case MSR_IA32_UCODE_REV:
488 case MSR_IA32_UCODE_WRITE:
489 case 0x200 ... 0x2ff: /* MTRRs */
490 break;
491 case MSR_IA32_APICBASE:
492 kvm_set_apic_base(vcpu, data);
493 break;
494 case MSR_IA32_MISC_ENABLE:
495 vcpu->arch.ia32_misc_enable_msr = data;
496 break;
497 default:
498 pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", msr, data);
499 return 1;
500 }
501 return 0;
502}
503EXPORT_SYMBOL_GPL(kvm_set_msr_common);
504
505
506/*
507 * Reads an msr value (of 'msr_index') into 'pdata'.
508 * Returns 0 on success, non-0 otherwise.
509 * Assumes vcpu_load() was already called.
510 */
511int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
512{
513 return kvm_x86_ops->get_msr(vcpu, msr_index, pdata);
514}
515
516int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
517{
518 u64 data;
519
520 switch (msr) {
521 case 0xc0010010: /* SYSCFG */
522 case 0xc0010015: /* HWCR */
523 case MSR_IA32_PLATFORM_ID:
524 case MSR_IA32_P5_MC_ADDR:
525 case MSR_IA32_P5_MC_TYPE:
526 case MSR_IA32_MC0_CTL:
527 case MSR_IA32_MCG_STATUS:
528 case MSR_IA32_MCG_CAP:
529 case MSR_IA32_MC0_MISC:
530 case MSR_IA32_MC0_MISC+4:
531 case MSR_IA32_MC0_MISC+8:
532 case MSR_IA32_MC0_MISC+12:
533 case MSR_IA32_MC0_MISC+16:
534 case MSR_IA32_UCODE_REV:
535 case MSR_IA32_PERF_STATUS:
536 case MSR_IA32_EBL_CR_POWERON:
537 /* MTRR registers */
538 case 0xfe:
539 case 0x200 ... 0x2ff:
540 data = 0;
541 break;
542 case 0xcd: /* fsb frequency */
543 data = 3;
544 break;
545 case MSR_IA32_APICBASE:
546 data = kvm_get_apic_base(vcpu);
547 break;
548 case MSR_IA32_MISC_ENABLE:
549 data = vcpu->arch.ia32_misc_enable_msr;
550 break;
551#ifdef CONFIG_X86_64
552 case MSR_EFER:
553 data = vcpu->arch.shadow_efer;
554 break;
555#endif
556 default:
557 pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
558 return 1;
559 }
560 *pdata = data;
561 return 0;
562}
563EXPORT_SYMBOL_GPL(kvm_get_msr_common);
564
565/*
566 * Read or write a bunch of msrs. All parameters are kernel addresses.
567 *
568 * @return number of msrs set successfully.
569 */
570static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,
571 struct kvm_msr_entry *entries,
572 int (*do_msr)(struct kvm_vcpu *vcpu,
573 unsigned index, u64 *data))
574{
575 int i;
576
577 vcpu_load(vcpu);
578
579 for (i = 0; i < msrs->nmsrs; ++i)
580 if (do_msr(vcpu, entries[i].index, &entries[i].data))
581 break;
582
583 vcpu_put(vcpu);
584
585 return i;
586}
587
588/*
589 * Read or write a bunch of msrs. Parameters are user addresses.
590 *
591 * @return number of msrs processed successfully.
592 */
593static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs,
594 int (*do_msr)(struct kvm_vcpu *vcpu,
595 unsigned index, u64 *data),
596 int writeback)
597{
598 struct kvm_msrs msrs;
599 struct kvm_msr_entry *entries;
600 int r, n;
601 unsigned size;
602
603 r = -EFAULT;
604 if (copy_from_user(&msrs, user_msrs, sizeof msrs))
605 goto out;
606
607 r = -E2BIG;
608 if (msrs.nmsrs >= MAX_IO_MSRS)
609 goto out;
610
611 r = -ENOMEM;
612 size = sizeof(struct kvm_msr_entry) * msrs.nmsrs;
613 entries = vmalloc(size);
614 if (!entries)
615 goto out;
616
617 r = -EFAULT;
618 if (copy_from_user(entries, user_msrs->entries, size))
619 goto out_free;
620
621 r = n = __msr_io(vcpu, &msrs, entries, do_msr);
622 if (r < 0)
623 goto out_free;
624
625 r = -EFAULT;
626 if (writeback && copy_to_user(user_msrs->entries, entries, size))
627 goto out_free;
628
629 r = n;
630
631out_free:
632 vfree(entries);
633out:
634 return r;
635}
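/*
 * msr_io() backs the KVM_GET_MSRS and KVM_SET_MSRS vcpu ioctls.  A
 * minimal userspace sketch, assuming the struct kvm_msrs layout from
 * <linux/kvm.h> with the entries[] array trailing the header:
 *
 *	struct {
 *		struct kvm_msrs hdr;
 *		struct kvm_msr_entry e[1];
 *	} m = { .hdr.nmsrs = 1, .e[0].index = MSR_IA32_MISC_ENABLE };
 *	int n = ioctl(vcpu_fd, KVM_GET_MSRS, &m);
 *	// n is the number of MSRs actually read (1 on success here)
 */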
636
637/*
638 * Make sure that a cpu that is being hot-unplugged does not have any vcpus
639 * cached on it.
640 */
641void decache_vcpus_on_cpu(int cpu)
642{
643 struct kvm *vm;
644 struct kvm_vcpu *vcpu;
645 int i;
646
647 spin_lock(&kvm_lock);
648 list_for_each_entry(vm, &vm_list, vm_list)
649 for (i = 0; i < KVM_MAX_VCPUS; ++i) {
650 vcpu = vm->vcpus[i];
651 if (!vcpu)
652 continue;
653 /*
654 * If the vcpu is locked, then it is running on some
655 * other cpu and therefore it is not cached on the
656 * cpu in question.
657 *
658 * If it's not locked, check the last cpu it executed
659 * on.
660 */
661 if (mutex_trylock(&vcpu->mutex)) {
662 if (vcpu->cpu == cpu) {
663 kvm_x86_ops->vcpu_decache(vcpu);
664 vcpu->cpu = -1;
665 }
666 mutex_unlock(&vcpu->mutex);
667 }
668 }
669 spin_unlock(&kvm_lock);
670}
671
672int kvm_dev_ioctl_check_extension(long ext)
673{
674 int r;
675
676 switch (ext) {
677 case KVM_CAP_IRQCHIP:
678 case KVM_CAP_HLT:
679 case KVM_CAP_MMU_SHADOW_CACHE_CONTROL:
680 case KVM_CAP_USER_MEMORY:
681 case KVM_CAP_SET_TSS_ADDR:
682 case KVM_CAP_EXT_CPUID:
683 r = 1;
684 break;
685 case KVM_CAP_VAPIC:
686 r = !kvm_x86_ops->cpu_has_accelerated_tpr();
687 break;
688 default:
689 r = 0;
690 break;
691 }
692 return r;
693
694}
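/*
 * kvm_dev_ioctl_check_extension() answers KVM_CHECK_EXTENSION queries on
 * the /dev/kvm fd.  A minimal userspace sketch (error handling omitted):
 *
 *	int kvm_fd = open("/dev/kvm", O_RDWR);
 *	if (ioctl(kvm_fd, KVM_CHECK_EXTENSION, KVM_CAP_IRQCHIP) > 0)
 *		use_kernel_irqchip();	// KVM_CREATE_IRQCHIP is available
 */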
695
696long kvm_arch_dev_ioctl(struct file *filp,
697 unsigned int ioctl, unsigned long arg)
698{
699 void __user *argp = (void __user *)arg;
700 long r;
701
702 switch (ioctl) {
703 case KVM_GET_MSR_INDEX_LIST: {
704 struct kvm_msr_list __user *user_msr_list = argp;
705 struct kvm_msr_list msr_list;
706 unsigned n;
707
708 r = -EFAULT;
709 if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list))
710 goto out;
711 n = msr_list.nmsrs;
712 msr_list.nmsrs = num_msrs_to_save + ARRAY_SIZE(emulated_msrs);
713 if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list))
714 goto out;
715 r = -E2BIG;
716 if (n < num_msrs_to_save)
717 goto out;
718 r = -EFAULT;
719 if (copy_to_user(user_msr_list->indices, &msrs_to_save,
720 num_msrs_to_save * sizeof(u32)))
721 goto out;
722 if (copy_to_user(user_msr_list->indices
723				 + num_msrs_to_save,
724 &emulated_msrs,
725 ARRAY_SIZE(emulated_msrs) * sizeof(u32)))
726 goto out;
727 r = 0;
728 break;
729 }
730 default:
731 r = -EINVAL;
732 }
733out:
734 return r;
735}
736
737void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
738{
739 kvm_x86_ops->vcpu_load(vcpu, cpu);
740}
741
742void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
743{
744 kvm_x86_ops->vcpu_put(vcpu);
745 kvm_put_guest_fpu(vcpu);
746}
747
748static int is_efer_nx(void)
749{
750 u64 efer;
751
752 rdmsrl(MSR_EFER, efer);
753 return efer & EFER_NX;
754}
755
756static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu)
757{
758 int i;
759 struct kvm_cpuid_entry2 *e, *entry;
760
761 entry = NULL;
762 for (i = 0; i < vcpu->arch.cpuid_nent; ++i) {
763 e = &vcpu->arch.cpuid_entries[i];
764 if (e->function == 0x80000001) {
765 entry = e;
766 break;
767 }
768 }
769 if (entry && (entry->edx & (1 << 20)) && !is_efer_nx()) {
770 entry->edx &= ~(1 << 20);
771 printk(KERN_INFO "kvm: guest NX capability removed\n");
772 }
773}
774
775/* legacy interface: an old userspace process passes cpuid data to a new kernel module */
776static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
777 struct kvm_cpuid *cpuid,
778 struct kvm_cpuid_entry __user *entries)
779{
780 int r, i;
781 struct kvm_cpuid_entry *cpuid_entries;
782
783 r = -E2BIG;
784 if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
785 goto out;
786 r = -ENOMEM;
787 cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry) * cpuid->nent);
788 if (!cpuid_entries)
789 goto out;
790 r = -EFAULT;
791 if (copy_from_user(cpuid_entries, entries,
792 cpuid->nent * sizeof(struct kvm_cpuid_entry)))
793 goto out_free;
794 for (i = 0; i < cpuid->nent; i++) {
795 vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function;
796 vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax;
797 vcpu->arch.cpuid_entries[i].ebx = cpuid_entries[i].ebx;
798 vcpu->arch.cpuid_entries[i].ecx = cpuid_entries[i].ecx;
799 vcpu->arch.cpuid_entries[i].edx = cpuid_entries[i].edx;
800 vcpu->arch.cpuid_entries[i].index = 0;
801 vcpu->arch.cpuid_entries[i].flags = 0;
802 vcpu->arch.cpuid_entries[i].padding[0] = 0;
803 vcpu->arch.cpuid_entries[i].padding[1] = 0;
804 vcpu->arch.cpuid_entries[i].padding[2] = 0;
805 }
806 vcpu->arch.cpuid_nent = cpuid->nent;
807 cpuid_fix_nx_cap(vcpu);
808 r = 0;
809
810out_free:
811 vfree(cpuid_entries);
812out:
813 return r;
814}
815
816static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
817 struct kvm_cpuid2 *cpuid,
818 struct kvm_cpuid_entry2 __user *entries)
819{
820 int r;
821
822 r = -E2BIG;
823 if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
824 goto out;
825 r = -EFAULT;
826 if (copy_from_user(&vcpu->arch.cpuid_entries, entries,
827 cpuid->nent * sizeof(struct kvm_cpuid_entry2)))
828 goto out;
829 vcpu->arch.cpuid_nent = cpuid->nent;
830 return 0;
831
832out:
833 return r;
834}
835
836static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
837 struct kvm_cpuid2 *cpuid,
838 struct kvm_cpuid_entry2 __user *entries)
839{
840 int r;
841
842 r = -E2BIG;
843 if (cpuid->nent < vcpu->arch.cpuid_nent)
844 goto out;
845 r = -EFAULT;
846 if (copy_to_user(entries, &vcpu->arch.cpuid_entries,
847 vcpu->arch.cpuid_nent * sizeof(struct kvm_cpuid_entry2)))
848 goto out;
849 return 0;
850
851out:
852 cpuid->nent = vcpu->arch.cpuid_nent;
853 return r;
854}
855
856static inline u32 bit(int bitno)
857{
858 return 1 << (bitno & 31);
859}
860
861static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function,
862 u32 index)
863{
864 entry->function = function;
865 entry->index = index;
866 cpuid_count(entry->function, entry->index,
867 &entry->eax, &entry->ebx, &entry->ecx, &entry->edx);
868 entry->flags = 0;
869}
870
871static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
872 u32 index, int *nent, int maxnent)
873{
874 const u32 kvm_supported_word0_x86_features = bit(X86_FEATURE_FPU) |
875 bit(X86_FEATURE_VME) | bit(X86_FEATURE_DE) |
876 bit(X86_FEATURE_PSE) | bit(X86_FEATURE_TSC) |
877 bit(X86_FEATURE_MSR) | bit(X86_FEATURE_PAE) |
878 bit(X86_FEATURE_CX8) | bit(X86_FEATURE_APIC) |
879 bit(X86_FEATURE_SEP) | bit(X86_FEATURE_PGE) |
880 bit(X86_FEATURE_CMOV) | bit(X86_FEATURE_PSE36) |
881 bit(X86_FEATURE_CLFLSH) | bit(X86_FEATURE_MMX) |
882 bit(X86_FEATURE_FXSR) | bit(X86_FEATURE_XMM) |
883 bit(X86_FEATURE_XMM2) | bit(X86_FEATURE_SELFSNOOP);
884 const u32 kvm_supported_word1_x86_features = bit(X86_FEATURE_FPU) |
885 bit(X86_FEATURE_VME) | bit(X86_FEATURE_DE) |
886 bit(X86_FEATURE_PSE) | bit(X86_FEATURE_TSC) |
887 bit(X86_FEATURE_MSR) | bit(X86_FEATURE_PAE) |
888 bit(X86_FEATURE_CX8) | bit(X86_FEATURE_APIC) |
889 bit(X86_FEATURE_PGE) |
890 bit(X86_FEATURE_CMOV) | bit(X86_FEATURE_PSE36) |
891 bit(X86_FEATURE_MMX) | bit(X86_FEATURE_FXSR) |
892 bit(X86_FEATURE_SYSCALL) |
893 (bit(X86_FEATURE_NX) && is_efer_nx()) |
894#ifdef CONFIG_X86_64
895 bit(X86_FEATURE_LM) |
896#endif
897 bit(X86_FEATURE_MMXEXT) |
898 bit(X86_FEATURE_3DNOWEXT) |
899 bit(X86_FEATURE_3DNOW);
900 const u32 kvm_supported_word3_x86_features =
901 bit(X86_FEATURE_XMM3) | bit(X86_FEATURE_CX16);
902 const u32 kvm_supported_word6_x86_features =
903 bit(X86_FEATURE_LAHF_LM) | bit(X86_FEATURE_CMP_LEGACY);
904
905	/* all function 2 cpuid_count() calls should be made on the same cpu */
906 get_cpu();
907 do_cpuid_1_ent(entry, function, index);
908 ++*nent;
909
910 switch (function) {
911 case 0:
912 entry->eax = min(entry->eax, (u32)0xb);
913 break;
914 case 1:
915 entry->edx &= kvm_supported_word0_x86_features;
916 entry->ecx &= kvm_supported_word3_x86_features;
917 break;
918 /* function 2 entries are STATEFUL. That is, repeated cpuid commands
919 * may return different values. This forces us to get_cpu() before
920 * issuing the first command, and also to emulate this annoying behavior
921 * in kvm_emulate_cpuid() using KVM_CPUID_FLAG_STATE_READ_NEXT */
922 case 2: {
923 int t, times = entry->eax & 0xff;
924
925 entry->flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
926 for (t = 1; t < times && *nent < maxnent; ++t) {
927 do_cpuid_1_ent(&entry[t], function, 0);
928 entry[t].flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
929 ++*nent;
930 }
931 break;
932 }
933	/* functions 4 and 0xb have an additional index. */
934 case 4: {
935 int index, cache_type;
936
937 entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
938 /* read more entries until cache_type is zero */
939 for (index = 1; *nent < maxnent; ++index) {
940 cache_type = entry[index - 1].eax & 0x1f;
941 if (!cache_type)
942 break;
943 do_cpuid_1_ent(&entry[index], function, index);
944 entry[index].flags |=
945 KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
946 ++*nent;
947 }
948 break;
949 }
950 case 0xb: {
951 int index, level_type;
952
953 entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
954 /* read more entries until level_type is zero */
955 for (index = 1; *nent < maxnent; ++index) {
956 level_type = entry[index - 1].ecx & 0xff;
957 if (!level_type)
958 break;
959 do_cpuid_1_ent(&entry[index], function, index);
960 entry[index].flags |=
961 KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
962 ++*nent;
963 }
964 break;
965 }
966 case 0x80000000:
967 entry->eax = min(entry->eax, 0x8000001a);
968 break;
969 case 0x80000001:
970 entry->edx &= kvm_supported_word1_x86_features;
971 entry->ecx &= kvm_supported_word6_x86_features;
972 break;
973 }
974 put_cpu();
975}
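/*
 * Example of the indexed-leaf loop above: for function 4 (cache
 * parameters) do_cpuid_ent() keeps issuing CPUID.4 with index 1, 2, ...
 * and stops once the previously filled entry reports cache_type == 0
 * (EAX[4:0] == 0), so a CPU with three cache descriptors contributes
 * four entries -- indices 0 through 3, the last being the terminator.
 */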
976
977static int kvm_vm_ioctl_get_supported_cpuid(struct kvm *kvm,
978 struct kvm_cpuid2 *cpuid,
979 struct kvm_cpuid_entry2 __user *entries)
980{
981 struct kvm_cpuid_entry2 *cpuid_entries;
982 int limit, nent = 0, r = -E2BIG;
983 u32 func;
984
985 if (cpuid->nent < 1)
986 goto out;
987 r = -ENOMEM;
988 cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry2) * cpuid->nent);
989 if (!cpuid_entries)
990 goto out;
991
992 do_cpuid_ent(&cpuid_entries[0], 0, 0, &nent, cpuid->nent);
993 limit = cpuid_entries[0].eax;
994 for (func = 1; func <= limit && nent < cpuid->nent; ++func)
995 do_cpuid_ent(&cpuid_entries[nent], func, 0,
996 &nent, cpuid->nent);
997 r = -E2BIG;
998 if (nent >= cpuid->nent)
999 goto out_free;
1000
1001 do_cpuid_ent(&cpuid_entries[nent], 0x80000000, 0, &nent, cpuid->nent);
1002 limit = cpuid_entries[nent - 1].eax;
1003 for (func = 0x80000001; func <= limit && nent < cpuid->nent; ++func)
1004 do_cpuid_ent(&cpuid_entries[nent], func, 0,
1005 &nent, cpuid->nent);
1006 r = -EFAULT;
1007 if (copy_to_user(entries, cpuid_entries,
1008 nent * sizeof(struct kvm_cpuid_entry2)))
1009 goto out_free;
1010 cpuid->nent = nent;
1011 r = 0;
1012
1013out_free:
1014 vfree(cpuid_entries);
1015out:
1016 return r;
1017}
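/*
 * Userspace reaches this through the KVM_GET_SUPPORTED_CPUID vm ioctl
 * handled in kvm_arch_vm_ioctl() below.  A minimal sketch of the usual
 * grow-and-retry pattern (struct layout as in <linux/kvm.h>, error
 * handling trimmed):
 *
 *	int nent = 8;
 *	struct kvm_cpuid2 *c;
 *	for (;;) {
 *		c = calloc(1, sizeof(*c) + nent * sizeof(c->entries[0]));
 *		c->nent = nent;
 *		if (ioctl(vm_fd, KVM_GET_SUPPORTED_CPUID, c) == 0)
 *			break;		// c->nent now holds the real count
 *		free(c);		// too small (errno == E2BIG); retry
 *		nent *= 2;
 *	}
 */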
1018
1019static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
1020 struct kvm_lapic_state *s)
1021{
1022 vcpu_load(vcpu);
1023 memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s);
1024 vcpu_put(vcpu);
1025
1026 return 0;
1027}
1028
1029static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
1030 struct kvm_lapic_state *s)
1031{
1032 vcpu_load(vcpu);
1033 memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s);
1034 kvm_apic_post_state_restore(vcpu);
1035 vcpu_put(vcpu);
1036
1037 return 0;
1038}
1039
1040static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
1041 struct kvm_interrupt *irq)
1042{
1043 if (irq->irq < 0 || irq->irq >= 256)
1044 return -EINVAL;
1045 if (irqchip_in_kernel(vcpu->kvm))
1046 return -ENXIO;
1047 vcpu_load(vcpu);
1048
1049 set_bit(irq->irq, vcpu->arch.irq_pending);
1050 set_bit(irq->irq / BITS_PER_LONG, &vcpu->arch.irq_summary);
1051
1052 vcpu_put(vcpu);
1053
1054 return 0;
1055}
1056
1057static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu,
1058 struct kvm_tpr_access_ctl *tac)
1059{
1060 if (tac->flags)
1061 return -EINVAL;
1062 vcpu->arch.tpr_access_reporting = !!tac->enabled;
1063 return 0;
1064}
1065
1066long kvm_arch_vcpu_ioctl(struct file *filp,
1067 unsigned int ioctl, unsigned long arg)
1068{
1069 struct kvm_vcpu *vcpu = filp->private_data;
1070 void __user *argp = (void __user *)arg;
1071 int r;
1072
1073 switch (ioctl) {
1074 case KVM_GET_LAPIC: {
1075 struct kvm_lapic_state lapic;
1076
1077 memset(&lapic, 0, sizeof lapic);
1078 r = kvm_vcpu_ioctl_get_lapic(vcpu, &lapic);
1079 if (r)
1080 goto out;
1081 r = -EFAULT;
1082 if (copy_to_user(argp, &lapic, sizeof lapic))
1083 goto out;
1084 r = 0;
1085 break;
1086 }
1087 case KVM_SET_LAPIC: {
1088 struct kvm_lapic_state lapic;
1089
1090 r = -EFAULT;
1091 if (copy_from_user(&lapic, argp, sizeof lapic))
1092 goto out;
1093		r = kvm_vcpu_ioctl_set_lapic(vcpu, &lapic);
1094 if (r)
1095 goto out;
1096 r = 0;
1097 break;
1098 }
1099 case KVM_INTERRUPT: {
1100 struct kvm_interrupt irq;
1101
1102 r = -EFAULT;
1103 if (copy_from_user(&irq, argp, sizeof irq))
1104 goto out;
1105 r = kvm_vcpu_ioctl_interrupt(vcpu, &irq);
1106 if (r)
1107 goto out;
1108 r = 0;
1109 break;
1110 }
1111 case KVM_SET_CPUID: {
1112 struct kvm_cpuid __user *cpuid_arg = argp;
1113 struct kvm_cpuid cpuid;
1114
1115 r = -EFAULT;
1116 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
1117 goto out;
1118 r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries);
1119 if (r)
1120 goto out;
1121 break;
1122 }
1123 case KVM_SET_CPUID2: {
1124 struct kvm_cpuid2 __user *cpuid_arg = argp;
1125 struct kvm_cpuid2 cpuid;
1126
1127 r = -EFAULT;
1128 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
1129 goto out;
1130 r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid,
1131 cpuid_arg->entries);
1132 if (r)
1133 goto out;
1134 break;
1135 }
1136 case KVM_GET_CPUID2: {
1137 struct kvm_cpuid2 __user *cpuid_arg = argp;
1138 struct kvm_cpuid2 cpuid;
1139
1140 r = -EFAULT;
1141 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
1142 goto out;
1143 r = kvm_vcpu_ioctl_get_cpuid2(vcpu, &cpuid,
1144 cpuid_arg->entries);
1145 if (r)
1146 goto out;
1147 r = -EFAULT;
1148 if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid))
1149 goto out;
1150 r = 0;
1151 break;
1152 }
1153 case KVM_GET_MSRS:
1154 r = msr_io(vcpu, argp, kvm_get_msr, 1);
1155 break;
1156 case KVM_SET_MSRS:
1157 r = msr_io(vcpu, argp, do_set_msr, 0);
1158 break;
1159 case KVM_TPR_ACCESS_REPORTING: {
1160 struct kvm_tpr_access_ctl tac;
1161
1162 r = -EFAULT;
1163 if (copy_from_user(&tac, argp, sizeof tac))
1164 goto out;
1165 r = vcpu_ioctl_tpr_access_reporting(vcpu, &tac);
1166 if (r)
1167 goto out;
1168 r = -EFAULT;
1169 if (copy_to_user(argp, &tac, sizeof tac))
1170 goto out;
1171 r = 0;
1172 break;
1173	}
1174 case KVM_SET_VAPIC_ADDR: {
1175 struct kvm_vapic_addr va;
1176
1177 r = -EINVAL;
1178 if (!irqchip_in_kernel(vcpu->kvm))
1179 goto out;
1180 r = -EFAULT;
1181 if (copy_from_user(&va, argp, sizeof va))
1182 goto out;
1183 r = 0;
1184 kvm_lapic_set_vapic_addr(vcpu, va.vapic_addr);
1185 break;
1186 }
1187 default:
1188 r = -EINVAL;
1189 }
1190out:
1191 return r;
1192}
1193
1194static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr)
1195{
1196 int ret;
1197
1198 if (addr > (unsigned int)(-3 * PAGE_SIZE))
1199 return -1;
1200 ret = kvm_x86_ops->set_tss_addr(kvm, addr);
1201 return ret;
1202}
1203
1204static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
1205 u32 kvm_nr_mmu_pages)
1206{
1207 if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES)
1208 return -EINVAL;
1209
1210 down_write(&current->mm->mmap_sem);
1211
1212 kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages);
1213 kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages;
1214
1215 up_write(&current->mm->mmap_sem);
1216 return 0;
1217}
1218
1219static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm)
1220{
1221 return kvm->arch.n_alloc_mmu_pages;
1222}
1223
1224gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
1225{
1226 int i;
1227 struct kvm_mem_alias *alias;
1228
1229 for (i = 0; i < kvm->arch.naliases; ++i) {
1230 alias = &kvm->arch.aliases[i];
1231 if (gfn >= alias->base_gfn
1232 && gfn < alias->base_gfn + alias->npages)
1233 return alias->target_gfn + gfn - alias->base_gfn;
1234 }
1235 return gfn;
1236}
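/*
 * Worked example: with an alias slot of base_gfn = 0xa0, npages = 0x10
 * and target_gfn = 0x100 (guest frames 0xa0-0xaf are a window onto
 * 0x100-0x10f), unalias_gfn(kvm, 0xa5) returns 0x105; a gfn outside
 * every alias range is returned unchanged.
 */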
1237
1238/*
1239 * Set a new alias region. Aliases map a portion of physical memory into
1240 * another portion. This is useful for memory windows, for example the PC
1241 * VGA region.
1242 */
1243static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm,
1244 struct kvm_memory_alias *alias)
1245{
1246 int r, n;
1247 struct kvm_mem_alias *p;
1248
1249 r = -EINVAL;
1250 /* General sanity checks */
1251 if (alias->memory_size & (PAGE_SIZE - 1))
1252 goto out;
1253 if (alias->guest_phys_addr & (PAGE_SIZE - 1))
1254 goto out;
1255 if (alias->slot >= KVM_ALIAS_SLOTS)
1256 goto out;
1257 if (alias->guest_phys_addr + alias->memory_size
1258 < alias->guest_phys_addr)
1259 goto out;
1260 if (alias->target_phys_addr + alias->memory_size
1261 < alias->target_phys_addr)
1262 goto out;
1263
1264 down_write(&current->mm->mmap_sem);
1265
1266 p = &kvm->arch.aliases[alias->slot];
1267 p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT;
1268 p->npages = alias->memory_size >> PAGE_SHIFT;
1269 p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT;
1270
1271 for (n = KVM_ALIAS_SLOTS; n > 0; --n)
1272 if (kvm->arch.aliases[n - 1].npages)
1273 break;
1274 kvm->arch.naliases = n;
1275
1276 kvm_mmu_zap_all(kvm);
1277
1278 up_write(&current->mm->mmap_sem);
1279
1280 return 0;
1281
1282out:
1283 return r;
1284}
1285
1286static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
1287{
1288 int r;
1289
1290 r = 0;
1291 switch (chip->chip_id) {
1292 case KVM_IRQCHIP_PIC_MASTER:
1293 memcpy(&chip->chip.pic,
1294 &pic_irqchip(kvm)->pics[0],
1295 sizeof(struct kvm_pic_state));
1296 break;
1297 case KVM_IRQCHIP_PIC_SLAVE:
1298 memcpy(&chip->chip.pic,
1299 &pic_irqchip(kvm)->pics[1],
1300 sizeof(struct kvm_pic_state));
1301 break;
1302 case KVM_IRQCHIP_IOAPIC:
1303 memcpy(&chip->chip.ioapic,
1304 ioapic_irqchip(kvm),
1305 sizeof(struct kvm_ioapic_state));
1306 break;
1307 default:
1308 r = -EINVAL;
1309 break;
1310 }
1311 return r;
1312}
1313
1314static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
1315{
1316 int r;
1317
1318 r = 0;
1319 switch (chip->chip_id) {
1320 case KVM_IRQCHIP_PIC_MASTER:
1321 memcpy(&pic_irqchip(kvm)->pics[0],
1322 &chip->chip.pic,
1323 sizeof(struct kvm_pic_state));
1324 break;
1325 case KVM_IRQCHIP_PIC_SLAVE:
1326 memcpy(&pic_irqchip(kvm)->pics[1],
1327 &chip->chip.pic,
1328 sizeof(struct kvm_pic_state));
1329 break;
1330 case KVM_IRQCHIP_IOAPIC:
1331 memcpy(ioapic_irqchip(kvm),
1332 &chip->chip.ioapic,
1333 sizeof(struct kvm_ioapic_state));
1334 break;
1335 default:
1336 r = -EINVAL;
1337 break;
1338 }
1339 kvm_pic_update_irq(pic_irqchip(kvm));
1340 return r;
1341}
1342
1343/*
1344 * Get (and clear) the dirty memory log for a memory slot.
1345 */
1346int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
1347 struct kvm_dirty_log *log)
1348{
1349 int r;
1350 int n;
1351 struct kvm_memory_slot *memslot;
1352 int is_dirty = 0;
1353
1354 down_write(&current->mm->mmap_sem);
1355
1356 r = kvm_get_dirty_log(kvm, log, &is_dirty);
1357 if (r)
1358 goto out;
1359
1360 /* If nothing is dirty, don't bother messing with page tables. */
1361 if (is_dirty) {
1362 kvm_mmu_slot_remove_write_access(kvm, log->slot);
1363 kvm_flush_remote_tlbs(kvm);
1364 memslot = &kvm->memslots[log->slot];
1365 n = ALIGN(memslot->npages, BITS_PER_LONG) / 8;
1366 memset(memslot->dirty_bitmap, 0, n);
1367 }
1368 r = 0;
1369out:
1370 up_write(&current->mm->mmap_sem);
1371 return r;
1372}
1373
1374long kvm_arch_vm_ioctl(struct file *filp,
1375 unsigned int ioctl, unsigned long arg)
1376{
1377 struct kvm *kvm = filp->private_data;
1378 void __user *argp = (void __user *)arg;
1379 int r = -EINVAL;
1380
1381 switch (ioctl) {
1382 case KVM_SET_TSS_ADDR:
1383 r = kvm_vm_ioctl_set_tss_addr(kvm, arg);
1384 if (r < 0)
1385 goto out;
1386 break;
1387 case KVM_SET_MEMORY_REGION: {
1388 struct kvm_memory_region kvm_mem;
1389 struct kvm_userspace_memory_region kvm_userspace_mem;
1390
1391 r = -EFAULT;
1392 if (copy_from_user(&kvm_mem, argp, sizeof kvm_mem))
1393 goto out;
1394 kvm_userspace_mem.slot = kvm_mem.slot;
1395 kvm_userspace_mem.flags = kvm_mem.flags;
1396 kvm_userspace_mem.guest_phys_addr = kvm_mem.guest_phys_addr;
1397 kvm_userspace_mem.memory_size = kvm_mem.memory_size;
1398 r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 0);
1399 if (r)
1400 goto out;
1401 break;
1402 }
1403 case KVM_SET_NR_MMU_PAGES:
1404 r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg);
1405 if (r)
1406 goto out;
1407 break;
1408 case KVM_GET_NR_MMU_PAGES:
1409 r = kvm_vm_ioctl_get_nr_mmu_pages(kvm);
1410 break;
1411 case KVM_SET_MEMORY_ALIAS: {
1412 struct kvm_memory_alias alias;
1413
1414 r = -EFAULT;
1415 if (copy_from_user(&alias, argp, sizeof alias))
1416 goto out;
1417 r = kvm_vm_ioctl_set_memory_alias(kvm, &alias);
1418 if (r)
1419 goto out;
1420 break;
1421 }
1422 case KVM_CREATE_IRQCHIP:
1423 r = -ENOMEM;
1424 kvm->arch.vpic = kvm_create_pic(kvm);
1425 if (kvm->arch.vpic) {
1426 r = kvm_ioapic_init(kvm);
1427 if (r) {
1428 kfree(kvm->arch.vpic);
1429 kvm->arch.vpic = NULL;
1430 goto out;
1431 }
1432 } else
1433 goto out;
1434 break;
1435 case KVM_IRQ_LINE: {
1436 struct kvm_irq_level irq_event;
1437
1438 r = -EFAULT;
1439 if (copy_from_user(&irq_event, argp, sizeof irq_event))
1440 goto out;
1441 if (irqchip_in_kernel(kvm)) {
1442 mutex_lock(&kvm->lock);
1443 if (irq_event.irq < 16)
1444 kvm_pic_set_irq(pic_irqchip(kvm),
1445 irq_event.irq,
1446 irq_event.level);
1447 kvm_ioapic_set_irq(kvm->arch.vioapic,
1448 irq_event.irq,
1449 irq_event.level);
1450 mutex_unlock(&kvm->lock);
1451 r = 0;
1452 }
1453 break;
1454 }
1455 case KVM_GET_IRQCHIP: {
1456 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
1457 struct kvm_irqchip chip;
1458
1459 r = -EFAULT;
1460 if (copy_from_user(&chip, argp, sizeof chip))
1461 goto out;
1462 r = -ENXIO;
1463 if (!irqchip_in_kernel(kvm))
1464 goto out;
1465 r = kvm_vm_ioctl_get_irqchip(kvm, &chip);
1466 if (r)
1467 goto out;
1468 r = -EFAULT;
1469 if (copy_to_user(argp, &chip, sizeof chip))
1470 goto out;
1471 r = 0;
1472 break;
1473 }
1474 case KVM_SET_IRQCHIP: {
1475 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
1476 struct kvm_irqchip chip;
1477
1478 r = -EFAULT;
1479 if (copy_from_user(&chip, argp, sizeof chip))
1480 goto out;
1481 r = -ENXIO;
1482 if (!irqchip_in_kernel(kvm))
1483 goto out;
1484 r = kvm_vm_ioctl_set_irqchip(kvm, &chip);
1485 if (r)
1486 goto out;
1487 r = 0;
1488 break;
1489 }
1490 case KVM_GET_SUPPORTED_CPUID: {
1491 struct kvm_cpuid2 __user *cpuid_arg = argp;
1492 struct kvm_cpuid2 cpuid;
1493
1494 r = -EFAULT;
1495 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
1496 goto out;
1497 r = kvm_vm_ioctl_get_supported_cpuid(kvm, &cpuid,
1498 cpuid_arg->entries);
1499 if (r)
1500 goto out;
1501
1502 r = -EFAULT;
1503 if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid))
1504 goto out;
1505 r = 0;
1506 break;
1507 }
1508 default:
1509 ;
1510 }
1511out:
1512 return r;
1513}
1514
1515static void kvm_init_msr_list(void)
1516{
1517 u32 dummy[2];
1518 unsigned i, j;
1519
1520 for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) {
1521 if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0)
1522 continue;
1523 if (j < i)
1524 msrs_to_save[j] = msrs_to_save[i];
1525 j++;
1526 }
1527 num_msrs_to_save = j;
1528}
1529
1530/*
1531 * Only the apic needs an MMIO device hook, so shortcut now.
1532 */
1533static struct kvm_io_device *vcpu_find_pervcpu_dev(struct kvm_vcpu *vcpu,
1534 gpa_t addr)
1535{
1536 struct kvm_io_device *dev;
1537
1538 if (vcpu->arch.apic) {
1539 dev = &vcpu->arch.apic->dev;
1540 if (dev->in_range(dev, addr))
1541 return dev;
1542 }
1543 return NULL;
1544}
1545
1546
1547static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu,
1548 gpa_t addr)
1549{
1550 struct kvm_io_device *dev;
1551
1552 dev = vcpu_find_pervcpu_dev(vcpu, addr);
1553 if (dev == NULL)
1554 dev = kvm_io_bus_find_dev(&vcpu->kvm->mmio_bus, addr);
1555 return dev;
1556}
1557
1558int emulator_read_std(unsigned long addr,
1559 void *val,
1560 unsigned int bytes,
1561 struct kvm_vcpu *vcpu)
1562{
1563 void *data = val;
1564 int r = X86EMUL_CONTINUE;
1565
1566 down_read(&current->mm->mmap_sem);
1567 while (bytes) {
1568 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
1569 unsigned offset = addr & (PAGE_SIZE-1);
1570 unsigned tocopy = min(bytes, (unsigned)PAGE_SIZE - offset);
1571 int ret;
1572
1573 if (gpa == UNMAPPED_GVA) {
1574 r = X86EMUL_PROPAGATE_FAULT;
1575 goto out;
1576 }
1577 ret = kvm_read_guest(vcpu->kvm, gpa, data, tocopy);
1578 if (ret < 0) {
1579 r = X86EMUL_UNHANDLEABLE;
1580 goto out;
1581 }
1582
1583 bytes -= tocopy;
1584 data += tocopy;
1585 addr += tocopy;
1586 }
1587out:
1588 up_read(&current->mm->mmap_sem);
1589 return r;
1590}
1591EXPORT_SYMBOL_GPL(emulator_read_std);
1592
1593static int emulator_read_emulated(unsigned long addr,
1594 void *val,
1595 unsigned int bytes,
1596 struct kvm_vcpu *vcpu)
1597{
1598 struct kvm_io_device *mmio_dev;
1599 gpa_t gpa;
1600
1601 if (vcpu->mmio_read_completed) {
1602 memcpy(val, vcpu->mmio_data, bytes);
1603 vcpu->mmio_read_completed = 0;
1604 return X86EMUL_CONTINUE;
1605 }
1606
1607 down_read(&current->mm->mmap_sem);
1608 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
1609 up_read(&current->mm->mmap_sem);
1610
1611 /* For APIC access vmexit */
1612 if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
1613 goto mmio;
1614
1615 if (emulator_read_std(addr, val, bytes, vcpu)
1616 == X86EMUL_CONTINUE)
1617 return X86EMUL_CONTINUE;
1618 if (gpa == UNMAPPED_GVA)
1619 return X86EMUL_PROPAGATE_FAULT;
1620
1621mmio:
1622 /*
1623 * Is this MMIO handled locally?
1624 */
1625 mutex_lock(&vcpu->kvm->lock);
1626 mmio_dev = vcpu_find_mmio_dev(vcpu, gpa);
1627 if (mmio_dev) {
1628 kvm_iodevice_read(mmio_dev, gpa, bytes, val);
1629 mutex_unlock(&vcpu->kvm->lock);
1630 return X86EMUL_CONTINUE;
1631 }
1632 mutex_unlock(&vcpu->kvm->lock);
1633
1634 vcpu->mmio_needed = 1;
1635 vcpu->mmio_phys_addr = gpa;
1636 vcpu->mmio_size = bytes;
1637 vcpu->mmio_is_write = 0;
1638
1639 return X86EMUL_UNHANDLEABLE;
1640}
1641
1642static int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
1643 const void *val, int bytes)
1644{
1645 int ret;
1646
1647 down_read(&current->mm->mmap_sem);
1648 ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes);
1649 if (ret < 0) {
1650 up_read(&current->mm->mmap_sem);
1651 return 0;
1652 }
1653 kvm_mmu_pte_write(vcpu, gpa, val, bytes);
1654 up_read(&current->mm->mmap_sem);
1655 return 1;
1656}
1657
1658static int emulator_write_emulated_onepage(unsigned long addr,
1659 const void *val,
1660 unsigned int bytes,
1661 struct kvm_vcpu *vcpu)
1662{
1663 struct kvm_io_device *mmio_dev;
1664 gpa_t gpa;
1665
1666 down_read(&current->mm->mmap_sem);
1667 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
1668 up_read(&current->mm->mmap_sem);
1669
1670 if (gpa == UNMAPPED_GVA) {
1671 kvm_inject_page_fault(vcpu, addr, 2);
1672 return X86EMUL_PROPAGATE_FAULT;
1673 }
1674
1675 /* For APIC access vmexit */
1676 if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
1677 goto mmio;
1678
1679 if (emulator_write_phys(vcpu, gpa, val, bytes))
1680 return X86EMUL_CONTINUE;
1681
1682mmio:
1683 /*
1684 * Is this MMIO handled locally?
1685 */
1686 mutex_lock(&vcpu->kvm->lock);
1687 mmio_dev = vcpu_find_mmio_dev(vcpu, gpa);
1688 if (mmio_dev) {
1689 kvm_iodevice_write(mmio_dev, gpa, bytes, val);
1690 mutex_unlock(&vcpu->kvm->lock);
1691 return X86EMUL_CONTINUE;
1692 }
1693 mutex_unlock(&vcpu->kvm->lock);
1694
1695 vcpu->mmio_needed = 1;
1696 vcpu->mmio_phys_addr = gpa;
1697 vcpu->mmio_size = bytes;
1698 vcpu->mmio_is_write = 1;
1699 memcpy(vcpu->mmio_data, val, bytes);
1700
1701 return X86EMUL_CONTINUE;
1702}
1703
1704int emulator_write_emulated(unsigned long addr,
1705 const void *val,
1706 unsigned int bytes,
1707 struct kvm_vcpu *vcpu)
1708{
1709 /* Crossing a page boundary? */
1710 if (((addr + bytes - 1) ^ addr) & PAGE_MASK) {
1711 int rc, now;
1712
1713 now = -addr & ~PAGE_MASK;
1714 rc = emulator_write_emulated_onepage(addr, val, now, vcpu);
1715 if (rc != X86EMUL_CONTINUE)
1716 return rc;
1717 addr += now;
1718 val += now;
1719 bytes -= now;
1720 }
1721 return emulator_write_emulated_onepage(addr, val, bytes, vcpu);
1722}
1723EXPORT_SYMBOL_GPL(emulator_write_emulated);
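/*
 * Example of the page split above: a 4-byte write to linear address
 * 0x1ffe becomes two emulator_write_emulated_onepage() calls -- 2 bytes
 * at 0x1ffe (now = -0x1ffe & ~PAGE_MASK = 2) followed by 2 bytes at
 * 0x2000 -- so each chunk is translated and handled within one page.
 */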
1724
1725static int emulator_cmpxchg_emulated(unsigned long addr,
1726 const void *old,
1727 const void *new,
1728 unsigned int bytes,
1729 struct kvm_vcpu *vcpu)
1730{
1731 static int reported;
1732
1733 if (!reported) {
1734 reported = 1;
1735 printk(KERN_WARNING "kvm: emulating exchange as write\n");
1736 }
1737#ifndef CONFIG_X86_64
1738 /* guests cmpxchg8b have to be emulated atomically */
1739 if (bytes == 8) {
1740 gpa_t gpa;
1741 struct page *page;
1742		char *kaddr;
1743 u64 val;
1744
1745 down_read(&current->mm->mmap_sem);
1746 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
1747
1748 if (gpa == UNMAPPED_GVA ||
1749 (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
1750 goto emul_write;
1751
1752 if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK))
1753 goto emul_write;
1754
1755 val = *(u64 *)new;
1756 page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
1757		kaddr = kmap_atomic(page, KM_USER0);
1758		set_64bit((u64 *)(kaddr + offset_in_page(gpa)), val);
1759		kunmap_atomic(kaddr, KM_USER0);
1760 kvm_release_page_dirty(page);
1761 emul_write:
1762 up_read(&current->mm->mmap_sem);
1763 }
1764#endif
1765
1766 return emulator_write_emulated(addr, new, bytes, vcpu);
1767}
1768
1769static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
1770{
1771 return kvm_x86_ops->get_segment_base(vcpu, seg);
1772}
1773
1774int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)
1775{
1776 return X86EMUL_CONTINUE;
1777}
1778
1779int emulate_clts(struct kvm_vcpu *vcpu)
1780{
1781 kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 & ~X86_CR0_TS);
1782 return X86EMUL_CONTINUE;
1783}
1784
1785int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest)
1786{
1787 struct kvm_vcpu *vcpu = ctxt->vcpu;
1788
1789 switch (dr) {
1790 case 0 ... 3:
1791 *dest = kvm_x86_ops->get_dr(vcpu, dr);
1792 return X86EMUL_CONTINUE;
1793 default:
1794 pr_unimpl(vcpu, "%s: unexpected dr %u\n", __FUNCTION__, dr);
1795 return X86EMUL_UNHANDLEABLE;
1796 }
1797}
1798
1799int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value)
1800{
1801 unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U;
1802 int exception;
1803
1804 kvm_x86_ops->set_dr(ctxt->vcpu, dr, value & mask, &exception);
1805 if (exception) {
1806 /* FIXME: better handling */
1807 return X86EMUL_UNHANDLEABLE;
1808 }
1809 return X86EMUL_CONTINUE;
1810}
1811
1812void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
1813{
1814 static int reported;
1815 u8 opcodes[4];
1816 unsigned long rip = vcpu->arch.rip;
1817 unsigned long rip_linear;
1818
1819 rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS);
1820
1821 if (reported)
1822 return;
1823
1824 emulator_read_std(rip_linear, (void *)opcodes, 4, vcpu);
1825
1826 printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n",
1827 context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]);
1828 reported = 1;
1829}
1830EXPORT_SYMBOL_GPL(kvm_report_emulation_failure);
1831
1832struct x86_emulate_ops emulate_ops = {
1833 .read_std = emulator_read_std,
1834 .read_emulated = emulator_read_emulated,
1835 .write_emulated = emulator_write_emulated,
1836 .cmpxchg_emulated = emulator_cmpxchg_emulated,
1837};
1838
1839int emulate_instruction(struct kvm_vcpu *vcpu,
1840 struct kvm_run *run,
1841 unsigned long cr2,
1842 u16 error_code,
1843 int emulation_type)
1844{
1845 int r;
1846 struct decode_cache *c;
1847
1848 vcpu->arch.mmio_fault_cr2 = cr2;
1849 kvm_x86_ops->cache_regs(vcpu);
1850
1851 vcpu->mmio_is_write = 0;
1852 vcpu->arch.pio.string = 0;
1853
1854 if (!(emulation_type & EMULTYPE_NO_DECODE)) {
1855 int cs_db, cs_l;
1856 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
1857
1858 vcpu->arch.emulate_ctxt.vcpu = vcpu;
1859 vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu);
1860 vcpu->arch.emulate_ctxt.mode =
1861 (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM)
1862 ? X86EMUL_MODE_REAL : cs_l
1863 ? X86EMUL_MODE_PROT64 : cs_db
1864 ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
1865
1866 if (vcpu->arch.emulate_ctxt.mode == X86EMUL_MODE_PROT64) {
1867 vcpu->arch.emulate_ctxt.cs_base = 0;
1868 vcpu->arch.emulate_ctxt.ds_base = 0;
1869 vcpu->arch.emulate_ctxt.es_base = 0;
1870 vcpu->arch.emulate_ctxt.ss_base = 0;
1871 } else {
1872 vcpu->arch.emulate_ctxt.cs_base =
1873 get_segment_base(vcpu, VCPU_SREG_CS);
1874 vcpu->arch.emulate_ctxt.ds_base =
1875 get_segment_base(vcpu, VCPU_SREG_DS);
1876 vcpu->arch.emulate_ctxt.es_base =
1877 get_segment_base(vcpu, VCPU_SREG_ES);
1878 vcpu->arch.emulate_ctxt.ss_base =
1879 get_segment_base(vcpu, VCPU_SREG_SS);
1880 }
1881
1882 vcpu->arch.emulate_ctxt.gs_base =
1883 get_segment_base(vcpu, VCPU_SREG_GS);
1884 vcpu->arch.emulate_ctxt.fs_base =
1885 get_segment_base(vcpu, VCPU_SREG_FS);
1886
1887 r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
1888
1889		/* When emulating an invalid opcode (#UD), reject everything
1890		 * except VMCALL/VMMCALL (0f 01 with mod=3, reg=0 or 3, rm=1) */
1891 c = &vcpu->arch.emulate_ctxt.decode;
1892 if ((emulation_type & EMULTYPE_TRAP_UD) &&
1893 (!(c->twobyte && c->b == 0x01 &&
1894 (c->modrm_reg == 0 || c->modrm_reg == 3) &&
1895 c->modrm_mod == 3 && c->modrm_rm == 1)))
1896 return EMULATE_FAIL;
1897
1898 ++vcpu->stat.insn_emulation;
1899 if (r) {
1900 ++vcpu->stat.insn_emulation_fail;
1901 if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
1902 return EMULATE_DONE;
1903 return EMULATE_FAIL;
1904 }
1905 }
1906
1907 r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
1908
1909 if (vcpu->arch.pio.string)
1910 return EMULATE_DO_MMIO;
1911
1912 if ((r || vcpu->mmio_is_write) && run) {
1913 run->exit_reason = KVM_EXIT_MMIO;
1914 run->mmio.phys_addr = vcpu->mmio_phys_addr;
1915 memcpy(run->mmio.data, vcpu->mmio_data, 8);
1916 run->mmio.len = vcpu->mmio_size;
1917 run->mmio.is_write = vcpu->mmio_is_write;
1918 }
1919
1920 if (r) {
1921 if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
1922 return EMULATE_DONE;
1923 if (!vcpu->mmio_needed) {
1924 kvm_report_emulation_failure(vcpu, "mmio");
1925 return EMULATE_FAIL;
1926 }
1927 return EMULATE_DO_MMIO;
1928 }
1929
1930 kvm_x86_ops->decache_regs(vcpu);
1931 kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
1932
1933 if (vcpu->mmio_is_write) {
1934 vcpu->mmio_needed = 0;
1935 return EMULATE_DO_MMIO;
1936 }
1937
1938 return EMULATE_DONE;
1939}
1940EXPORT_SYMBOL_GPL(emulate_instruction);
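/*
 * Callers of emulate_instruction() interpret the result as follows:
 * EMULATE_DONE means the instruction was handled entirely in the kernel,
 * EMULATE_DO_MMIO means the vcpu must exit to userspace to finish an MMIO
 * or string PIO access (vcpu->run was filled in above), and EMULATE_FAIL
 * means the instruction could not be decoded or executed.
 */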
1941
1942static void free_pio_guest_pages(struct kvm_vcpu *vcpu)
1943{
1944 int i;
1945
1946 for (i = 0; i < ARRAY_SIZE(vcpu->arch.pio.guest_pages); ++i)
1947 if (vcpu->arch.pio.guest_pages[i]) {
1948 kvm_release_page_dirty(vcpu->arch.pio.guest_pages[i]);
1949 vcpu->arch.pio.guest_pages[i] = NULL;
1950 }
1951}
1952
1953static int pio_copy_data(struct kvm_vcpu *vcpu)
1954{
1955 void *p = vcpu->arch.pio_data;
1956 void *q;
1957 unsigned bytes;
1958 int nr_pages = vcpu->arch.pio.guest_pages[1] ? 2 : 1;
1959
1960 q = vmap(vcpu->arch.pio.guest_pages, nr_pages, VM_READ|VM_WRITE,
1961 PAGE_KERNEL);
1962 if (!q) {
1963 free_pio_guest_pages(vcpu);
1964 return -ENOMEM;
1965 }
1966 q += vcpu->arch.pio.guest_page_offset;
1967 bytes = vcpu->arch.pio.size * vcpu->arch.pio.cur_count;
1968 if (vcpu->arch.pio.in)
1969 memcpy(q, p, bytes);
1970 else
1971 memcpy(p, q, bytes);
1972 q -= vcpu->arch.pio.guest_page_offset;
1973 vunmap(q);
1974 free_pio_guest_pages(vcpu);
1975 return 0;
1976}
1977
1978int complete_pio(struct kvm_vcpu *vcpu)
1979{
1980 struct kvm_pio_request *io = &vcpu->arch.pio;
1981 long delta;
1982 int r;
1983
1984 kvm_x86_ops->cache_regs(vcpu);
1985
1986 if (!io->string) {
1987 if (io->in)
1988 memcpy(&vcpu->arch.regs[VCPU_REGS_RAX], vcpu->arch.pio_data,
1989 io->size);
1990 } else {
1991 if (io->in) {
1992 r = pio_copy_data(vcpu);
1993 if (r) {
1994 kvm_x86_ops->cache_regs(vcpu);
1995 return r;
1996 }
1997 }
1998
1999 delta = 1;
2000 if (io->rep) {
2001 delta *= io->cur_count;
2002 /*
2003 * The size of the register should really depend on
2004 * current address size.
2005 */
2006 vcpu->arch.regs[VCPU_REGS_RCX] -= delta;
2007 }
2008 if (io->down)
2009 delta = -delta;
2010 delta *= io->size;
2011 if (io->in)
2012 vcpu->arch.regs[VCPU_REGS_RDI] += delta;
2013 else
2014 vcpu->arch.regs[VCPU_REGS_RSI] += delta;
2015 }
2016
2017 kvm_x86_ops->decache_regs(vcpu);
2018
2019 io->count -= io->cur_count;
2020 io->cur_count = 0;
2021
2022 return 0;
2023}
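/*
 * Example of the accounting above for a string write ("rep outsw") with
 * io->size = 2, io->cur_count = 3, io->rep = 1, io->down = 0, io->in = 0:
 * delta becomes 3 * 2 = 6, so RSI is advanced by 6 and RCX is decremented
 * by 3 before the registers are written back to the vcpu.
 */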
2024
2025static void kernel_pio(struct kvm_io_device *pio_dev,
2026 struct kvm_vcpu *vcpu,
2027 void *pd)
2028{
2029 /* TODO: String I/O for in kernel device */
2030
2031 mutex_lock(&vcpu->kvm->lock);
2032 if (vcpu->arch.pio.in)
2033 kvm_iodevice_read(pio_dev, vcpu->arch.pio.port,
2034 vcpu->arch.pio.size,
2035 pd);
2036 else
2037 kvm_iodevice_write(pio_dev, vcpu->arch.pio.port,
2038 vcpu->arch.pio.size,
2039 pd);
2040 mutex_unlock(&vcpu->kvm->lock);
2041}
2042
2043static void pio_string_write(struct kvm_io_device *pio_dev,
2044 struct kvm_vcpu *vcpu)
2045{
2046 struct kvm_pio_request *io = &vcpu->arch.pio;
2047 void *pd = vcpu->arch.pio_data;
2048 int i;
2049
2050 mutex_lock(&vcpu->kvm->lock);
2051 for (i = 0; i < io->cur_count; i++) {
2052 kvm_iodevice_write(pio_dev, io->port,
2053 io->size,
2054 pd);
2055 pd += io->size;
2056 }
2057 mutex_unlock(&vcpu->kvm->lock);
2058}
2059
2060static struct kvm_io_device *vcpu_find_pio_dev(struct kvm_vcpu *vcpu,
2061 gpa_t addr)
2062{
2063 return kvm_io_bus_find_dev(&vcpu->kvm->pio_bus, addr);
2064}
2065
2066int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
2067 int size, unsigned port)
2068{
2069 struct kvm_io_device *pio_dev;
2070
2071 vcpu->run->exit_reason = KVM_EXIT_IO;
2072 vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
2073 vcpu->run->io.size = vcpu->arch.pio.size = size;
2074 vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
2075 vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = 1;
2076 vcpu->run->io.port = vcpu->arch.pio.port = port;
2077 vcpu->arch.pio.in = in;
2078 vcpu->arch.pio.string = 0;
2079 vcpu->arch.pio.down = 0;
2080 vcpu->arch.pio.guest_page_offset = 0;
2081 vcpu->arch.pio.rep = 0;
2082
2083 kvm_x86_ops->cache_regs(vcpu);
2084 memcpy(vcpu->arch.pio_data, &vcpu->arch.regs[VCPU_REGS_RAX], 4);
2085 kvm_x86_ops->decache_regs(vcpu);
2086
2087 kvm_x86_ops->skip_emulated_instruction(vcpu);
2088
2089 pio_dev = vcpu_find_pio_dev(vcpu, port);
2090 if (pio_dev) {
2091 kernel_pio(pio_dev, vcpu, vcpu->arch.pio_data);
2092 complete_pio(vcpu);
2093 return 1;
2094 }
2095 return 0;
2096}
2097EXPORT_SYMBOL_GPL(kvm_emulate_pio);
2098
2099int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
2100 int size, unsigned long count, int down,
2101 gva_t address, int rep, unsigned port)
2102{
2103 unsigned now, in_page;
2104 int i, ret = 0;
2105 int nr_pages = 1;
2106 struct page *page;
2107 struct kvm_io_device *pio_dev;
2108
2109 vcpu->run->exit_reason = KVM_EXIT_IO;
2110 vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
2111 vcpu->run->io.size = vcpu->arch.pio.size = size;
2112 vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
2113 vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = count;
2114 vcpu->run->io.port = vcpu->arch.pio.port = port;
2115 vcpu->arch.pio.in = in;
2116 vcpu->arch.pio.string = 1;
2117 vcpu->arch.pio.down = down;
2118 vcpu->arch.pio.guest_page_offset = offset_in_page(address);
2119 vcpu->arch.pio.rep = rep;
2120
2121 if (!count) {
2122 kvm_x86_ops->skip_emulated_instruction(vcpu);
2123 return 1;
2124 }
2125
2126 if (!down)
2127 in_page = PAGE_SIZE - offset_in_page(address);
2128 else
2129 in_page = offset_in_page(address) + size;
2130 now = min(count, (unsigned long)in_page / size);
2131 if (!now) {
2132 /*
2133 * String I/O straddles page boundary. Pin two guest pages
2134 * so that we satisfy atomicity constraints. Do just one
2135 * transaction to avoid complexity.
2136 */
2137 nr_pages = 2;
2138 now = 1;
2139 }
2140 if (down) {
2141 /*
2142 * String I/O in reverse. Yuck. Kill the guest, fix later.
2143 */
2144 pr_unimpl(vcpu, "guest string pio down\n");
2145 kvm_inject_gp(vcpu, 0);
2146 return 1;
2147 }
2148 vcpu->run->io.count = now;
2149 vcpu->arch.pio.cur_count = now;
2150
2151 if (vcpu->arch.pio.cur_count == vcpu->arch.pio.count)
2152 kvm_x86_ops->skip_emulated_instruction(vcpu);
2153
2154 for (i = 0; i < nr_pages; ++i) {
2155 down_read(&current->mm->mmap_sem);
2156 page = gva_to_page(vcpu, address + i * PAGE_SIZE);
2157 vcpu->arch.pio.guest_pages[i] = page;
2158 up_read(&current->mm->mmap_sem);
2159 if (!page) {
2160 kvm_inject_gp(vcpu, 0);
2161 free_pio_guest_pages(vcpu);
2162 return 1;
2163 }
2164 }
2165
2166 pio_dev = vcpu_find_pio_dev(vcpu, port);
2167 if (!vcpu->arch.pio.in) {
2168 /* string PIO write */
2169 ret = pio_copy_data(vcpu);
2170 if (ret >= 0 && pio_dev) {
2171 pio_string_write(pio_dev, vcpu);
2172 complete_pio(vcpu);
2173 if (vcpu->arch.pio.count == 0)
2174 ret = 1;
2175 }
2176 } else if (pio_dev)
2177 pr_unimpl(vcpu, "no string pio read support yet, "
2178 "port %x size %d count %ld\n",
2179 port, size, count);
2180
2181 return ret;
2182}
2183EXPORT_SYMBOL_GPL(kvm_emulate_pio_string);
2184
2185int kvm_arch_init(void *opaque)
2186{
2187 int r;
2188 struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque;
2189
2190 if (kvm_x86_ops) {
2191 printk(KERN_ERR "kvm: already loaded the other module\n");
2192 r = -EEXIST;
2193 goto out;
2194 }
2195
2196 if (!ops->cpu_has_kvm_support()) {
2197 printk(KERN_ERR "kvm: no hardware support\n");
2198 r = -EOPNOTSUPP;
2199 goto out;
2200 }
2201 if (ops->disabled_by_bios()) {
2202 printk(KERN_ERR "kvm: disabled by bios\n");
2203 r = -EOPNOTSUPP;
2204 goto out;
2205 }
2206
2207 r = kvm_mmu_module_init();
2208 if (r)
2209 goto out;
2210
2211 kvm_init_msr_list();
2212
2213 kvm_x86_ops = ops;
2214 kvm_mmu_set_nonpresent_ptes(0ull, 0ull);
2215 return 0;
2216
2217out:
2218 return r;
2219}
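/*
 * kvm_arch_init() runs when a vendor module registers; the opaque pointer
 * is its struct kvm_x86_ops.  A rough sketch of the caller side -- the
 * actual registration goes through kvm_init() in the generic code and may
 * differ in detail:
 *
 *	static int __init vmx_init(void)
 *	{
 *		return kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx),
 *				THIS_MODULE);
 *	}
 */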
2220
2221void kvm_arch_exit(void)
2222{
2223 kvm_x86_ops = NULL;
2224 kvm_mmu_module_exit();
2225}
2226
2227int kvm_emulate_halt(struct kvm_vcpu *vcpu)
2228{
2229 ++vcpu->stat.halt_exits;
2230 if (irqchip_in_kernel(vcpu->kvm)) {
2231 vcpu->arch.mp_state = VCPU_MP_STATE_HALTED;
2232 kvm_vcpu_block(vcpu);
2233 if (vcpu->arch.mp_state != VCPU_MP_STATE_RUNNABLE)
2234 return -EINTR;
2235 return 1;
2236 } else {
2237 vcpu->run->exit_reason = KVM_EXIT_HLT;
2238 return 0;
2239 }
2240}
2241EXPORT_SYMBOL_GPL(kvm_emulate_halt);
2242
2243int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
2244{
2245 unsigned long nr, a0, a1, a2, a3, ret;
2246
2247 kvm_x86_ops->cache_regs(vcpu);
2248
2249 nr = vcpu->arch.regs[VCPU_REGS_RAX];
2250 a0 = vcpu->arch.regs[VCPU_REGS_RBX];
2251 a1 = vcpu->arch.regs[VCPU_REGS_RCX];
2252 a2 = vcpu->arch.regs[VCPU_REGS_RDX];
2253 a3 = vcpu->arch.regs[VCPU_REGS_RSI];
2254
2255 if (!is_long_mode(vcpu)) {
2256 nr &= 0xFFFFFFFF;
2257 a0 &= 0xFFFFFFFF;
2258 a1 &= 0xFFFFFFFF;
2259 a2 &= 0xFFFFFFFF;
2260 a3 &= 0xFFFFFFFF;
2261 }
2262
2263 switch (nr) {
2264 case KVM_HC_VAPIC_POLL_IRQ:
2265 ret = 0;
2266 break;
2267 default:
2268 ret = -KVM_ENOSYS;
2269 break;
2270 }
2271 vcpu->arch.regs[VCPU_REGS_RAX] = ret;
2272 kvm_x86_ops->decache_regs(vcpu);
2273 return 0;
2274}
2275EXPORT_SYMBOL_GPL(kvm_emulate_hypercall);
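/*
 * The register convention decoded above is the one used by the guest-side
 * kvm_hypercall*() helpers in kvm_para.h: hypercall number in RAX, up to
 * four arguments in RBX, RCX, RDX and RSI, return value in RAX.  A rough
 * guest-side sketch (the real helpers pick vmcall vs. vmmcall per CPU):
 *
 *	long ret;
 *	asm volatile("vmcall"
 *		     : "=a"(ret)
 *		     : "a"(KVM_HC_VAPIC_POLL_IRQ), "b"(0), "c"(0), "d"(0), "S"(0)
 *		     : "memory");
 */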
2276
2277int kvm_fix_hypercall(struct kvm_vcpu *vcpu)
2278{
2279 char instruction[3];
2280 int ret = 0;
2281
2282
2283	/*
2284	 * Blow out the MMU so that no other VCPU has an active mapping,
2285	 * which ensures that the updated hypercall instruction appears
2286	 * atomically across all VCPUs.
2287	 */
2288 kvm_mmu_zap_all(vcpu->kvm);
2289
2290 kvm_x86_ops->cache_regs(vcpu);
2291 kvm_x86_ops->patch_hypercall(vcpu, instruction);
2292 if (emulator_write_emulated(vcpu->arch.rip, instruction, 3, vcpu)
2293 != X86EMUL_CONTINUE)
2294 ret = -EFAULT;
2295
2296 return ret;
2297}
2298
2299static u64 mk_cr_64(u64 curr_cr, u32 new_val)
2300{
2301 return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
2302}
2303
2304void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
2305{
2306 struct descriptor_table dt = { limit, base };
2307
2308 kvm_x86_ops->set_gdt(vcpu, &dt);
2309}
2310
2311void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
2312{
2313 struct descriptor_table dt = { limit, base };
2314
2315 kvm_x86_ops->set_idt(vcpu, &dt);
2316}
2317
2318void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
2319 unsigned long *rflags)
2320{
2321 lmsw(vcpu, msw);
2322 *rflags = kvm_x86_ops->get_rflags(vcpu);
2323}
2324
2325unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
2326{
2327 kvm_x86_ops->decache_cr4_guest_bits(vcpu);
2328 switch (cr) {
2329 case 0:
2330 return vcpu->arch.cr0;
2331 case 2:
2332 return vcpu->arch.cr2;
2333 case 3:
2334 return vcpu->arch.cr3;
2335 case 4:
2336 return vcpu->arch.cr4;
2337 case 8:
2338 return get_cr8(vcpu);
2339 default:
2340 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);
2341 return 0;
2342 }
2343}
2344
2345void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val,
2346 unsigned long *rflags)
2347{
2348 switch (cr) {
2349 case 0:
2350 set_cr0(vcpu, mk_cr_64(vcpu->arch.cr0, val));
2351 *rflags = kvm_x86_ops->get_rflags(vcpu);
2352 break;
2353 case 2:
2354 vcpu->arch.cr2 = val;
2355 break;
2356 case 3:
2357 set_cr3(vcpu, val);
2358 break;
2359 case 4:
2360 set_cr4(vcpu, mk_cr_64(vcpu->arch.cr4, val));
2361 break;
2362 case 8:
2363 set_cr8(vcpu, val & 0xfUL);
2364 break;
2365 default:
2366 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);
2367 }
2368}
2369
2370static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i)
2371{
2372 struct kvm_cpuid_entry2 *e = &vcpu->arch.cpuid_entries[i];
2373 int j, nent = vcpu->arch.cpuid_nent;
2374
2375 e->flags &= ~KVM_CPUID_FLAG_STATE_READ_NEXT;
2376 /* when no next entry is found, the current entry[i] is reselected */
2377	for (j = (i + 1) % nent; ; j = (j + 1) % nent) {
2378 struct kvm_cpuid_entry2 *ej = &vcpu->arch.cpuid_entries[j];
2379 if (ej->function == e->function) {
2380 ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT;
2381 return j;
2382 }
2383 }
2384 return 0; /* silence gcc, even though control never reaches here */
2385}
2386
2387/* find an entry with matching function, matching index (if needed), and that
2388 * should be read next (if it's stateful) */
2389static int is_matching_cpuid_entry(struct kvm_cpuid_entry2 *e,
2390 u32 function, u32 index)
2391{
2392 if (e->function != function)
2393 return 0;
2394 if ((e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) && e->index != index)
2395 return 0;
2396 if ((e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) &&
2397 !(e->flags & KVM_CPUID_FLAG_STATE_READ_NEXT))
2398 return 0;
2399 return 1;
2400}
2401
2402void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
2403{
2404 int i;
2405 u32 function, index;
2406 struct kvm_cpuid_entry2 *e, *best;
2407
2408 kvm_x86_ops->cache_regs(vcpu);
2409 function = vcpu->arch.regs[VCPU_REGS_RAX];
2410 index = vcpu->arch.regs[VCPU_REGS_RCX];
2411 vcpu->arch.regs[VCPU_REGS_RAX] = 0;
2412 vcpu->arch.regs[VCPU_REGS_RBX] = 0;
2413 vcpu->arch.regs[VCPU_REGS_RCX] = 0;
2414 vcpu->arch.regs[VCPU_REGS_RDX] = 0;
2415 best = NULL;
2416 for (i = 0; i < vcpu->arch.cpuid_nent; ++i) {
2417 e = &vcpu->arch.cpuid_entries[i];
2418 if (is_matching_cpuid_entry(e, function, index)) {
2419 if (e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC)
2420 move_to_next_stateful_cpuid_entry(vcpu, i);
2421 best = e;
2422 break;
2423 }
2424 /*
2425 * Both basic or both extended?
2426 */
2427 if (((e->function ^ function) & 0x80000000) == 0)
2428 if (!best || e->function > best->function)
2429 best = e;
2430 }
2431 if (best) {
2432 vcpu->arch.regs[VCPU_REGS_RAX] = best->eax;
2433 vcpu->arch.regs[VCPU_REGS_RBX] = best->ebx;
2434 vcpu->arch.regs[VCPU_REGS_RCX] = best->ecx;
2435 vcpu->arch.regs[VCPU_REGS_RDX] = best->edx;
2436 }
2437 kvm_x86_ops->decache_regs(vcpu);
2438 kvm_x86_ops->skip_emulated_instruction(vcpu);
2439}
2440EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
2441
2442/*
2443 * Check if userspace requested an interrupt window, and that the
2444 * interrupt window is open.
2445 *
2446 * No need to exit to userspace if we already have an interrupt queued.
2447 */
2448static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu,
2449 struct kvm_run *kvm_run)
2450{
2451 return (!vcpu->arch.irq_summary &&
2452 kvm_run->request_interrupt_window &&
2453 vcpu->arch.interrupt_window_open &&
2454 (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF));
2455}
2456
2457static void post_kvm_run_save(struct kvm_vcpu *vcpu,
2458 struct kvm_run *kvm_run)
2459{
2460 kvm_run->if_flag = (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF) != 0;
2461 kvm_run->cr8 = get_cr8(vcpu);
2462 kvm_run->apic_base = kvm_get_apic_base(vcpu);
2463 if (irqchip_in_kernel(vcpu->kvm))
2464 kvm_run->ready_for_interrupt_injection = 1;
2465 else
2466 kvm_run->ready_for_interrupt_injection =
2467 (vcpu->arch.interrupt_window_open &&
2468 vcpu->arch.irq_summary == 0);
2469}
2470
2471static void vapic_enter(struct kvm_vcpu *vcpu)
2472{
2473 struct kvm_lapic *apic = vcpu->arch.apic;
2474 struct page *page;
2475
2476 if (!apic || !apic->vapic_addr)
2477 return;
2478
2479 down_read(&current->mm->mmap_sem);
2480 page = gfn_to_page(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT);
2481 vcpu->arch.apic->vapic_page = page;
2482 up_read(&current->mm->mmap_sem);
2483}
2484
2485static void vapic_exit(struct kvm_vcpu *vcpu)
2486{
2487 struct kvm_lapic *apic = vcpu->arch.apic;
2488
2489 if (!apic || !apic->vapic_addr)
2490 return;
2491
2492 kvm_release_page_dirty(apic->vapic_page);
2493 mark_page_dirty(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT);
2494}
2495
2496static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2497{
2498 int r;
2499
2500 if (unlikely(vcpu->arch.mp_state == VCPU_MP_STATE_SIPI_RECEIVED)) {
2501 pr_debug("vcpu %d received sipi with vector # %x\n",
2502 vcpu->vcpu_id, vcpu->arch.sipi_vector);
2503 kvm_lapic_reset(vcpu);
2504 r = kvm_x86_ops->vcpu_reset(vcpu);
2505 if (r)
2506 return r;
2507 vcpu->arch.mp_state = VCPU_MP_STATE_RUNNABLE;
2508 }
2509
2510 vapic_enter(vcpu);
2511
2512preempted:
2513 if (vcpu->guest_debug.enabled)
2514 kvm_x86_ops->guest_debug_pre(vcpu);
2515
2516again:
2517 r = kvm_mmu_reload(vcpu);
2518 if (unlikely(r))
2519 goto out;
2520
2521 if (vcpu->requests) {
2522 if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests))
2523 __kvm_migrate_apic_timer(vcpu);
2524 if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS,
2525 &vcpu->requests)) {
2526 kvm_run->exit_reason = KVM_EXIT_TPR_ACCESS;
2527 r = 0;
2528 goto out;
2529 }
2530 }
2531
2532 kvm_inject_pending_timer_irqs(vcpu);
2533
2534 preempt_disable();
2535
2536 kvm_x86_ops->prepare_guest_switch(vcpu);
2537 kvm_load_guest_fpu(vcpu);
2538
2539 local_irq_disable();
2540
2541 if (need_resched()) {
2542 local_irq_enable();
2543 preempt_enable();
2544 r = 1;
2545 goto out;
2546 }
2547
2548 if (signal_pending(current)) {
2549 local_irq_enable();
2550 preempt_enable();
2551 r = -EINTR;
2552 kvm_run->exit_reason = KVM_EXIT_INTR;
2553 ++vcpu->stat.signal_exits;
2554 goto out;
2555 }
2556
2557 if (vcpu->arch.exception.pending)
2558 __queue_exception(vcpu);
2559 else if (irqchip_in_kernel(vcpu->kvm))
2560 kvm_x86_ops->inject_pending_irq(vcpu);
2561 else
2562 kvm_x86_ops->inject_pending_vectors(vcpu, kvm_run);
2563
2564 kvm_lapic_sync_to_vapic(vcpu);
2565
2566 vcpu->guest_mode = 1;
2567 kvm_guest_enter();
2568
2569 if (vcpu->requests)
2570 if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests))
2571 kvm_x86_ops->tlb_flush(vcpu);
2572
2573 kvm_x86_ops->run(vcpu, kvm_run);
2574
2575 vcpu->guest_mode = 0;
2576 local_irq_enable();
2577
2578 ++vcpu->stat.exits;
2579
2580 /*
2581 * We must have an instruction between local_irq_enable() and
2582 * kvm_guest_exit(), so the timer interrupt isn't delayed by
2583 * the interrupt shadow. The stat.exits increment will do nicely.
2584 * But we need to prevent reordering, hence this barrier():
2585 */
2586 barrier();
2587
2588 kvm_guest_exit();
2589
2590 preempt_enable();
2591
2592 /*
2593 * Profile KVM exit RIPs:
2594 */
2595 if (unlikely(prof_on == KVM_PROFILING)) {
2596 kvm_x86_ops->cache_regs(vcpu);
2597 profile_hit(KVM_PROFILING, (void *)vcpu->arch.rip);
2598 }
2599
2600 if (vcpu->arch.exception.pending && kvm_x86_ops->exception_injected(vcpu))
2601 vcpu->arch.exception.pending = false;
2602
2603 kvm_lapic_sync_from_vapic(vcpu);
2604
2605 r = kvm_x86_ops->handle_exit(kvm_run, vcpu);
2606
2607 if (r > 0) {
2608 if (dm_request_for_irq_injection(vcpu, kvm_run)) {
2609 r = -EINTR;
2610 kvm_run->exit_reason = KVM_EXIT_INTR;
2611 ++vcpu->stat.request_irq_exits;
2612 goto out;
2613 }
2614 if (!need_resched())
2615 goto again;
2616 }
2617
2618out:
2619 if (r > 0) {
2620 kvm_resched(vcpu);
2621 goto preempted;
2622 }
2623
2624 post_kvm_run_save(vcpu, kvm_run);
2625
2626 vapic_exit(vcpu);
2627
2628 return r;
2629}
2630
2631int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2632{
2633 int r;
2634 sigset_t sigsaved;
2635
2636 vcpu_load(vcpu);
2637
2638 if (unlikely(vcpu->arch.mp_state == VCPU_MP_STATE_UNINITIALIZED)) {
2639 kvm_vcpu_block(vcpu);
2640 vcpu_put(vcpu);
2641 return -EAGAIN;
2642 }
2643
2644 if (vcpu->sigset_active)
2645 sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
2646
2647 /* re-sync apic's tpr */
2648 if (!irqchip_in_kernel(vcpu->kvm))
2649 set_cr8(vcpu, kvm_run->cr8);
2650
2651 if (vcpu->arch.pio.cur_count) {
2652 r = complete_pio(vcpu);
2653 if (r)
2654 goto out;
2655 }
2656#ifdef CONFIG_HAS_IOMEM
2657 if (vcpu->mmio_needed) {
2658 memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
2659 vcpu->mmio_read_completed = 1;
2660 vcpu->mmio_needed = 0;
2661 r = emulate_instruction(vcpu, kvm_run,
2662 vcpu->arch.mmio_fault_cr2, 0,
2663 EMULTYPE_NO_DECODE);
2664 if (r == EMULATE_DO_MMIO) {
2665 /*
2666 * Read-modify-write. Back to userspace.
2667 */
2668 r = 0;
2669 goto out;
2670 }
2671 }
2672#endif
2673 if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL) {
2674 kvm_x86_ops->cache_regs(vcpu);
2675 vcpu->arch.regs[VCPU_REGS_RAX] = kvm_run->hypercall.ret;
2676 kvm_x86_ops->decache_regs(vcpu);
2677 }
2678
2679 r = __vcpu_run(vcpu, kvm_run);
2680
2681out:
2682 if (vcpu->sigset_active)
2683 sigprocmask(SIG_SETMASK, &sigsaved, NULL);
2684
2685 vcpu_put(vcpu);
2686 return r;
2687}
2688
2689int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
2690{
2691 vcpu_load(vcpu);
2692
2693 kvm_x86_ops->cache_regs(vcpu);
2694
2695 regs->rax = vcpu->arch.regs[VCPU_REGS_RAX];
2696 regs->rbx = vcpu->arch.regs[VCPU_REGS_RBX];
2697 regs->rcx = vcpu->arch.regs[VCPU_REGS_RCX];
2698 regs->rdx = vcpu->arch.regs[VCPU_REGS_RDX];
2699 regs->rsi = vcpu->arch.regs[VCPU_REGS_RSI];
2700 regs->rdi = vcpu->arch.regs[VCPU_REGS_RDI];
2701 regs->rsp = vcpu->arch.regs[VCPU_REGS_RSP];
2702 regs->rbp = vcpu->arch.regs[VCPU_REGS_RBP];
2703#ifdef CONFIG_X86_64
2704 regs->r8 = vcpu->arch.regs[VCPU_REGS_R8];
2705 regs->r9 = vcpu->arch.regs[VCPU_REGS_R9];
2706 regs->r10 = vcpu->arch.regs[VCPU_REGS_R10];
2707 regs->r11 = vcpu->arch.regs[VCPU_REGS_R11];
2708 regs->r12 = vcpu->arch.regs[VCPU_REGS_R12];
2709 regs->r13 = vcpu->arch.regs[VCPU_REGS_R13];
2710 regs->r14 = vcpu->arch.regs[VCPU_REGS_R14];
2711 regs->r15 = vcpu->arch.regs[VCPU_REGS_R15];
2712#endif
2713
2714 regs->rip = vcpu->arch.rip;
2715 regs->rflags = kvm_x86_ops->get_rflags(vcpu);
2716
2717 /*
2718 * Don't leak debug flags in case they were set for guest debugging
2719 */
2720 if (vcpu->guest_debug.enabled && vcpu->guest_debug.singlestep)
2721 regs->rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
2722
2723 vcpu_put(vcpu);
2724
2725 return 0;
2726}
2727
2728int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
2729{
2730 vcpu_load(vcpu);
2731
2732 vcpu->arch.regs[VCPU_REGS_RAX] = regs->rax;
2733 vcpu->arch.regs[VCPU_REGS_RBX] = regs->rbx;
2734 vcpu->arch.regs[VCPU_REGS_RCX] = regs->rcx;
2735 vcpu->arch.regs[VCPU_REGS_RDX] = regs->rdx;
2736 vcpu->arch.regs[VCPU_REGS_RSI] = regs->rsi;
2737 vcpu->arch.regs[VCPU_REGS_RDI] = regs->rdi;
2738 vcpu->arch.regs[VCPU_REGS_RSP] = regs->rsp;
2739 vcpu->arch.regs[VCPU_REGS_RBP] = regs->rbp;
2740#ifdef CONFIG_X86_64
2741 vcpu->arch.regs[VCPU_REGS_R8] = regs->r8;
2742 vcpu->arch.regs[VCPU_REGS_R9] = regs->r9;
2743 vcpu->arch.regs[VCPU_REGS_R10] = regs->r10;
2744 vcpu->arch.regs[VCPU_REGS_R11] = regs->r11;
2745 vcpu->arch.regs[VCPU_REGS_R12] = regs->r12;
2746 vcpu->arch.regs[VCPU_REGS_R13] = regs->r13;
2747 vcpu->arch.regs[VCPU_REGS_R14] = regs->r14;
2748 vcpu->arch.regs[VCPU_REGS_R15] = regs->r15;
2749#endif
2750
2751 vcpu->arch.rip = regs->rip;
2752 kvm_x86_ops->set_rflags(vcpu, regs->rflags);
2753
2754 kvm_x86_ops->decache_regs(vcpu);
2755
2756 vcpu_put(vcpu);
2757
2758 return 0;
2759}
2760
2761static void get_segment(struct kvm_vcpu *vcpu,
2762 struct kvm_segment *var, int seg)
2763{
2764 return kvm_x86_ops->get_segment(vcpu, var, seg);
2765}
2766
2767void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
2768{
2769 struct kvm_segment cs;
2770
2771 get_segment(vcpu, &cs, VCPU_SREG_CS);
2772 *db = cs.db;
2773 *l = cs.l;
2774}
2775EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits);
2776
2777int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
2778 struct kvm_sregs *sregs)
2779{
2780 struct descriptor_table dt;
2781 int pending_vec;
2782
2783 vcpu_load(vcpu);
2784
2785 get_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
2786 get_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
2787 get_segment(vcpu, &sregs->es, VCPU_SREG_ES);
2788 get_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
2789 get_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
2790 get_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
2791
2792 get_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
2793 get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
2794
2795 kvm_x86_ops->get_idt(vcpu, &dt);
2796 sregs->idt.limit = dt.limit;
2797 sregs->idt.base = dt.base;
2798 kvm_x86_ops->get_gdt(vcpu, &dt);
2799 sregs->gdt.limit = dt.limit;
2800 sregs->gdt.base = dt.base;
2801
2802 kvm_x86_ops->decache_cr4_guest_bits(vcpu);
2803 sregs->cr0 = vcpu->arch.cr0;
2804 sregs->cr2 = vcpu->arch.cr2;
2805 sregs->cr3 = vcpu->arch.cr3;
2806 sregs->cr4 = vcpu->arch.cr4;
2807 sregs->cr8 = get_cr8(vcpu);
2808 sregs->efer = vcpu->arch.shadow_efer;
2809 sregs->apic_base = kvm_get_apic_base(vcpu);
2810
2811 if (irqchip_in_kernel(vcpu->kvm)) {
2812 memset(sregs->interrupt_bitmap, 0,
2813 sizeof sregs->interrupt_bitmap);
2814 pending_vec = kvm_x86_ops->get_irq(vcpu);
2815 if (pending_vec >= 0)
2816 set_bit(pending_vec,
2817 (unsigned long *)sregs->interrupt_bitmap);
2818 } else
2819 memcpy(sregs->interrupt_bitmap, vcpu->arch.irq_pending,
2820 sizeof sregs->interrupt_bitmap);
2821
2822 vcpu_put(vcpu);
2823
2824 return 0;
2825}
2826
2827static void set_segment(struct kvm_vcpu *vcpu,
2828 struct kvm_segment *var, int seg)
2829{
2830 return kvm_x86_ops->set_segment(vcpu, var, seg);
2831}
2832
2833int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
2834 struct kvm_sregs *sregs)
2835{
2836 int mmu_reset_needed = 0;
2837 int i, pending_vec, max_bits;
2838 struct descriptor_table dt;
2839
2840 vcpu_load(vcpu);
2841
2842 dt.limit = sregs->idt.limit;
2843 dt.base = sregs->idt.base;
2844 kvm_x86_ops->set_idt(vcpu, &dt);
2845 dt.limit = sregs->gdt.limit;
2846 dt.base = sregs->gdt.base;
2847 kvm_x86_ops->set_gdt(vcpu, &dt);
2848
2849 vcpu->arch.cr2 = sregs->cr2;
2850 mmu_reset_needed |= vcpu->arch.cr3 != sregs->cr3;
2851 vcpu->arch.cr3 = sregs->cr3;
2852
2853 set_cr8(vcpu, sregs->cr8);
2854
2855 mmu_reset_needed |= vcpu->arch.shadow_efer != sregs->efer;
2856#ifdef CONFIG_X86_64
2857 kvm_x86_ops->set_efer(vcpu, sregs->efer);
2858#endif
2859 kvm_set_apic_base(vcpu, sregs->apic_base);
2860
2861 kvm_x86_ops->decache_cr4_guest_bits(vcpu);
2862
2863 mmu_reset_needed |= vcpu->arch.cr0 != sregs->cr0;
2864 vcpu->arch.cr0 = sregs->cr0;
2865 kvm_x86_ops->set_cr0(vcpu, sregs->cr0);
2866
2867 mmu_reset_needed |= vcpu->arch.cr4 != sregs->cr4;
2868 kvm_x86_ops->set_cr4(vcpu, sregs->cr4);
2869 if (!is_long_mode(vcpu) && is_pae(vcpu))
2870 load_pdptrs(vcpu, vcpu->arch.cr3);
2871
2872 if (mmu_reset_needed)
2873 kvm_mmu_reset_context(vcpu);
2874
2875 if (!irqchip_in_kernel(vcpu->kvm)) {
2876 memcpy(vcpu->arch.irq_pending, sregs->interrupt_bitmap,
2877 sizeof vcpu->arch.irq_pending);
2878 vcpu->arch.irq_summary = 0;
2879 for (i = 0; i < ARRAY_SIZE(vcpu->arch.irq_pending); ++i)
2880 if (vcpu->arch.irq_pending[i])
2881 __set_bit(i, &vcpu->arch.irq_summary);
2882 } else {
2883 max_bits = (sizeof sregs->interrupt_bitmap) << 3;
2884 pending_vec = find_first_bit(
2885 (const unsigned long *)sregs->interrupt_bitmap,
2886 max_bits);
 2887 /* Only a pending external irq is handled here */
2888 if (pending_vec < max_bits) {
2889 kvm_x86_ops->set_irq(vcpu, pending_vec);
2890 pr_debug("Set back pending irq %d\n",
2891 pending_vec);
2892 }
2893 }
2894
2895 set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
2896 set_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
2897 set_segment(vcpu, &sregs->es, VCPU_SREG_ES);
2898 set_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
2899 set_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
2900 set_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
2901
2902 set_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
2903 set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
2904
2905 vcpu_put(vcpu);
2906
2907 return 0;
2908}
2909
2910int kvm_arch_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu,
2911 struct kvm_debug_guest *dbg)
2912{
2913 int r;
2914
2915 vcpu_load(vcpu);
2916
2917 r = kvm_x86_ops->set_guest_debug(vcpu, dbg);
2918
2919 vcpu_put(vcpu);
2920
2921 return r;
2922}
2923
2924/*
2925 * fxsave fpu state. Taken from x86_64/processor.h. To be killed when
2926 * we have asm/x86/processor.h
2927 */
2928struct fxsave {
2929 u16 cwd;
2930 u16 swd;
2931 u16 twd;
2932 u16 fop;
2933 u64 rip;
2934 u64 rdp;
2935 u32 mxcsr;
2936 u32 mxcsr_mask;
2937 u32 st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */
2938#ifdef CONFIG_X86_64
2939 u32 xmm_space[64]; /* 16*16 bytes for each XMM-reg = 256 bytes */
2940#else
2941 u32 xmm_space[32]; /* 8*16 bytes for each XMM-reg = 128 bytes */
2942#endif
2943};
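/*
 * st_space holds the 8 FP/MMX registers at 16 bytes each (8 * 16 = 128
 * bytes), and xmm_space holds the 16 XMM registers available on x86-64
 * versus the 8 visible to a 32-bit host.  The field order mirrors the
 * kernel's i387_fxsave_struct, which is evidently what guest_fx_image
 * is (fx_init() below applies i387_fxsave_struct offsets to it).
 */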
2944
2945/*
2946 * Translate a guest virtual address to a guest physical address.
2947 */
2948int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
2949 struct kvm_translation *tr)
2950{
2951 unsigned long vaddr = tr->linear_address;
2952 gpa_t gpa;
2953
2954 vcpu_load(vcpu);
2955 down_read(&current->mm->mmap_sem);
2956 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, vaddr);
2957 up_read(&current->mm->mmap_sem);
2958 tr->physical_address = gpa;
2959 tr->valid = gpa != UNMAPPED_GVA;
2960 tr->writeable = 1;
2961 tr->usermode = 0;
2962 vcpu_put(vcpu);
2963
2964 return 0;
2965}
2966
2967int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
2968{
2969 struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image;
2970
2971 vcpu_load(vcpu);
2972
2973 memcpy(fpu->fpr, fxsave->st_space, 128);
2974 fpu->fcw = fxsave->cwd;
2975 fpu->fsw = fxsave->swd;
2976 fpu->ftwx = fxsave->twd;
2977 fpu->last_opcode = fxsave->fop;
2978 fpu->last_ip = fxsave->rip;
2979 fpu->last_dp = fxsave->rdp;
2980 memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space);
2981
2982 vcpu_put(vcpu);
2983
2984 return 0;
2985}
2986
2987int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
2988{
2989 struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image;
2990
2991 vcpu_load(vcpu);
2992
2993 memcpy(fxsave->st_space, fpu->fpr, 128);
2994 fxsave->cwd = fpu->fcw;
2995 fxsave->swd = fpu->fsw;
2996 fxsave->twd = fpu->ftwx;
2997 fxsave->fop = fpu->last_opcode;
2998 fxsave->rip = fpu->last_ip;
2999 fxsave->rdp = fpu->last_dp;
3000 memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space);
3001
3002 vcpu_put(vcpu);
3003
3004 return 0;
3005}
3006
3007void fx_init(struct kvm_vcpu *vcpu)
3008{
3009 unsigned after_mxcsr_mask;
3010
3011 /* Initialize guest FPU by resetting ours and saving into guest's */
3012 preempt_disable();
3013 fx_save(&vcpu->arch.host_fx_image);
3014 fpu_init();
3015 fx_save(&vcpu->arch.guest_fx_image);
3016 fx_restore(&vcpu->arch.host_fx_image);
3017 preempt_enable();
3018
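	/*
	 * 0x1f80 is the architectural power-on value of MXCSR (all SIMD
	 * exceptions masked, round-to-nearest).  The memset below clears
	 * everything from st_space onwards, so the guest starts out with
	 * default control state and zeroed FP/SSE registers.
	 */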
3019 vcpu->arch.cr0 |= X86_CR0_ET;
3020 after_mxcsr_mask = offsetof(struct i387_fxsave_struct, st_space);
3021 vcpu->arch.guest_fx_image.mxcsr = 0x1f80;
3022 memset((void *)&vcpu->arch.guest_fx_image + after_mxcsr_mask,
3023 0, sizeof(struct i387_fxsave_struct) - after_mxcsr_mask);
3024}
3025EXPORT_SYMBOL_GPL(fx_init);
3026
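/*
 * Lazy FPU switching: kvm_load_guest_fpu() installs the guest FPU image
 * only when the guest is actually permitted to use the FPU, and
 * kvm_put_guest_fpu() restores the host image again, counting each
 * round trip in the fpu_reload statistic.
 */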
3027void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
3028{
3029 if (!vcpu->fpu_active || vcpu->guest_fpu_loaded)
3030 return;
3031
3032 vcpu->guest_fpu_loaded = 1;
3033 fx_save(&vcpu->arch.host_fx_image);
3034 fx_restore(&vcpu->arch.guest_fx_image);
3035}
3036EXPORT_SYMBOL_GPL(kvm_load_guest_fpu);
3037
3038void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
3039{
3040 if (!vcpu->guest_fpu_loaded)
3041 return;
3042
3043 vcpu->guest_fpu_loaded = 0;
3044 fx_save(&vcpu->arch.guest_fx_image);
3045 fx_restore(&vcpu->arch.host_fx_image);
3046 ++vcpu->stat.fpu_reload;
3047}
3048EXPORT_SYMBOL_GPL(kvm_put_guest_fpu);
3049
3050void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
3051{
3052 kvm_x86_ops->vcpu_free(vcpu);
3053}
3054
3055struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
3056 unsigned int id)
3057{
3058 return kvm_x86_ops->vcpu_create(kvm, id);
3059}
3060
3061int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
3062{
3063 int r;
3064
3065 /* We do fxsave: this must be aligned. */
3066 BUG_ON((unsigned long)&vcpu->arch.host_fx_image & 0xF);
3067
3068 vcpu_load(vcpu);
3069 r = kvm_arch_vcpu_reset(vcpu);
3070 if (r == 0)
3071 r = kvm_mmu_setup(vcpu);
3072 vcpu_put(vcpu);
3073 if (r < 0)
3074 goto free_vcpu;
3075
3076 return 0;
3077free_vcpu:
3078 kvm_x86_ops->vcpu_free(vcpu);
3079 return r;
3080}
3081
3082void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
3083{
3084 vcpu_load(vcpu);
3085 kvm_mmu_unload(vcpu);
3086 vcpu_put(vcpu);
3087
3088 kvm_x86_ops->vcpu_free(vcpu);
3089}
3090
3091int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
3092{
3093 return kvm_x86_ops->vcpu_reset(vcpu);
3094}
3095
3096void kvm_arch_hardware_enable(void *garbage)
3097{
3098 kvm_x86_ops->hardware_enable(garbage);
3099}
3100
3101void kvm_arch_hardware_disable(void *garbage)
3102{
3103 kvm_x86_ops->hardware_disable(garbage);
3104}
3105
3106int kvm_arch_hardware_setup(void)
3107{
3108 return kvm_x86_ops->hardware_setup();
3109}
3110
3111void kvm_arch_hardware_unsetup(void)
3112{
3113 kvm_x86_ops->hardware_unsetup();
3114}
3115
3116void kvm_arch_check_processor_compat(void *rtn)
3117{
3118 kvm_x86_ops->check_processor_compatibility(rtn);
3119}
3120
3121int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
3122{
3123 struct page *page;
3124 struct kvm *kvm;
3125 int r;
3126
3127 BUG_ON(vcpu->kvm == NULL);
3128 kvm = vcpu->kvm;
3129
3130 vcpu->arch.mmu.root_hpa = INVALID_PAGE;
3131 if (!irqchip_in_kernel(kvm) || vcpu->vcpu_id == 0)
3132 vcpu->arch.mp_state = VCPU_MP_STATE_RUNNABLE;
3133 else
3134 vcpu->arch.mp_state = VCPU_MP_STATE_UNINITIALIZED;
3135
3136 page = alloc_page(GFP_KERNEL | __GFP_ZERO);
3137 if (!page) {
3138 r = -ENOMEM;
3139 goto fail;
3140 }
3141 vcpu->arch.pio_data = page_address(page);
3142
3143 r = kvm_mmu_create(vcpu);
3144 if (r < 0)
3145 goto fail_free_pio_data;
3146
3147 if (irqchip_in_kernel(kvm)) {
3148 r = kvm_create_lapic(vcpu);
3149 if (r < 0)
3150 goto fail_mmu_destroy;
3151 }
3152
3153 return 0;
3154
3155fail_mmu_destroy:
3156 kvm_mmu_destroy(vcpu);
3157fail_free_pio_data:
3158 free_page((unsigned long)vcpu->arch.pio_data);
3159fail:
3160 return r;
3161}
3162
3163void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
3164{
3165 kvm_free_lapic(vcpu);
3166 kvm_mmu_destroy(vcpu);
3167 free_page((unsigned long)vcpu->arch.pio_data);
3168}
3169
3170struct kvm *kvm_arch_create_vm(void)
3171{
3172 struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
3173
3174 if (!kvm)
3175 return ERR_PTR(-ENOMEM);
3176
3177 INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
3178
3179 return kvm;
3180}
3181
3182static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
3183{
3184 vcpu_load(vcpu);
3185 kvm_mmu_unload(vcpu);
3186 vcpu_put(vcpu);
3187}
3188
3189static void kvm_free_vcpus(struct kvm *kvm)
3190{
3191 unsigned int i;
3192
3193 /*
3194 * Unpin any mmu pages first.
3195 */
3196 for (i = 0; i < KVM_MAX_VCPUS; ++i)
3197 if (kvm->vcpus[i])
3198 kvm_unload_vcpu_mmu(kvm->vcpus[i]);
3199 for (i = 0; i < KVM_MAX_VCPUS; ++i) {
3200 if (kvm->vcpus[i]) {
3201 kvm_arch_vcpu_free(kvm->vcpus[i]);
3202 kvm->vcpus[i] = NULL;
3203 }
3204 }
3205
3206}
3207
3208void kvm_arch_destroy_vm(struct kvm *kvm)
3209{
3210 kfree(kvm->arch.vpic);
3211 kfree(kvm->arch.vioapic);
3212 kvm_free_vcpus(kvm);
3213 kvm_free_physmem(kvm);
3214 kfree(kvm);
3215}
3216
3217int kvm_arch_set_memory_region(struct kvm *kvm,
3218 struct kvm_userspace_memory_region *mem,
3219 struct kvm_memory_slot old,
3220 int user_alloc)
3221{
3222 int npages = mem->memory_size >> PAGE_SHIFT;
3223 struct kvm_memory_slot *memslot = &kvm->memslots[mem->slot];
3224
 3225 /* To keep backward compatibility with older userspace,
 3226 * x86 needs to handle the !user_alloc case.
 3227 */
3228 if (!user_alloc) {
3229 if (npages && !old.rmap) {
3230 memslot->userspace_addr = do_mmap(NULL, 0,
3231 npages * PAGE_SIZE,
3232 PROT_READ | PROT_WRITE,
3233 MAP_SHARED | MAP_ANONYMOUS,
3234 0);
3235
3236 if (IS_ERR((void *)memslot->userspace_addr))
3237 return PTR_ERR((void *)memslot->userspace_addr);
3238 } else {
3239 if (!old.user_alloc && old.rmap) {
3240 int ret;
3241
3242 ret = do_munmap(current->mm, old.userspace_addr,
3243 old.npages * PAGE_SIZE);
3244 if (ret < 0)
3245 printk(KERN_WARNING
3246 "kvm_vm_ioctl_set_memory_region: "
3247 "failed to munmap memory\n");
3248 }
3249 }
3250 }
3251
3252 if (!kvm->arch.n_requested_mmu_pages) {
3253 unsigned int nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm);
3254 kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
3255 }
3256
3257 kvm_mmu_slot_remove_write_access(kvm, mem->slot);
3258 kvm_flush_remote_tlbs(kvm);
3259
3260 return 0;
3261}
3262
3263int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
3264{
3265 return vcpu->arch.mp_state == VCPU_MP_STATE_RUNNABLE
3266 || vcpu->arch.mp_state == VCPU_MP_STATE_SIPI_RECEIVED;
3267}
3268
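/*
 * vcpu_kick_intr() deliberately does nothing beyond the optional debug
 * printk: the IPI itself is the point.  Delivering any interrupt to the
 * physical CPU that is running the vcpu in guest mode forces a VM exit,
 * so pending work is noticed on the next guest entry.
 */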
3269static void vcpu_kick_intr(void *info)
3270{
3271#ifdef DEBUG
3272 struct kvm_vcpu *vcpu = (struct kvm_vcpu *)info;
 3273 printk(KERN_DEBUG "vcpu_kick_intr %p\n", vcpu);
3274#endif
3275}
3276
3277void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
3278{
3279 int ipi_pcpu = vcpu->cpu;
3280
3281 if (waitqueue_active(&vcpu->wq)) {
3282 wake_up_interruptible(&vcpu->wq);
3283 ++vcpu->stat.halt_wakeup;
3284 }
3285 if (vcpu->guest_mode)
3286 smp_call_function_single(ipi_pcpu, vcpu_kick_intr, vcpu, 0, 0);
3287}
diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c
new file mode 100644
index 000000000000..79586003397a
--- /dev/null
+++ b/arch/x86/kvm/x86_emulate.c
@@ -0,0 +1,1912 @@
1/******************************************************************************
2 * x86_emulate.c
3 *
4 * Generic x86 (32-bit and 64-bit) instruction decoder and emulator.
5 *
6 * Copyright (c) 2005 Keir Fraser
7 *
8 * Linux coding style, mod r/m decoder, segment base fixes, real-mode
9 * privileged instructions:
10 *
11 * Copyright (C) 2006 Qumranet
12 *
13 * Avi Kivity <avi@qumranet.com>
14 * Yaniv Kamay <yaniv@qumranet.com>
15 *
16 * This work is licensed under the terms of the GNU GPL, version 2. See
17 * the COPYING file in the top-level directory.
18 *
19 * From: xen-unstable 10676:af9809f51f81a3c43f276f00c81a52ef558afda4
20 */
21
22#ifndef __KERNEL__
23#include <stdio.h>
24#include <stdint.h>
25#include <public/xen.h>
26#define DPRINTF(_f, _a ...) printf(_f , ## _a)
27#else
28#include <linux/kvm_host.h>
29#define DPRINTF(x...) do {} while (0)
30#endif
31#include <linux/module.h>
32#include <asm/kvm_x86_emulate.h>
33
34/*
35 * Opcode effective-address decode tables.
36 * Note that we only emulate instructions that have at least one memory
37 * operand (excluding implicit stack references). We assume that stack
38 * references and instruction fetches will never occur in special memory
39 * areas that require emulation. So, for example, 'mov <imm>,<reg>' need
40 * not be handled.
41 */
42
43/* Operand sizes: 8-bit operands or specified/overridden size. */
44#define ByteOp (1<<0) /* 8-bit operands. */
45/* Destination operand type. */
46#define ImplicitOps (1<<1) /* Implicit in opcode. No generic decode. */
47#define DstReg (2<<1) /* Register operand. */
48#define DstMem (3<<1) /* Memory operand. */
49#define DstMask (3<<1)
50/* Source operand type. */
51#define SrcNone (0<<3) /* No source operand. */
52#define SrcImplicit (0<<3) /* Source operand is implicit in the opcode. */
53#define SrcReg (1<<3) /* Register operand. */
54#define SrcMem (2<<3) /* Memory operand. */
55#define SrcMem16 (3<<3) /* Memory operand (16-bit). */
56#define SrcMem32 (4<<3) /* Memory operand (32-bit). */
57#define SrcImm (5<<3) /* Immediate operand. */
58#define SrcImmByte (6<<3) /* 8-bit sign-extended immediate operand. */
59#define SrcMask (7<<3)
60/* Generic ModRM decode. */
61#define ModRM (1<<6)
62/* Destination is only written; never read. */
63#define Mov (1<<7)
64#define BitOp (1<<8)
65#define MemAbs (1<<9) /* Memory operand is absolute displacement */
66#define String (1<<10) /* String instruction (rep capable) */
67#define Stack (1<<11) /* Stack instruction (push/pop) */
68
69static u16 opcode_table[256] = {
70 /* 0x00 - 0x07 */
71 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
72 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
73 0, 0, 0, 0,
74 /* 0x08 - 0x0F */
75 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
76 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
77 0, 0, 0, 0,
78 /* 0x10 - 0x17 */
79 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
80 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
81 0, 0, 0, 0,
82 /* 0x18 - 0x1F */
83 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
84 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
85 0, 0, 0, 0,
86 /* 0x20 - 0x27 */
87 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
88 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
89 SrcImmByte, SrcImm, 0, 0,
90 /* 0x28 - 0x2F */
91 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
92 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
93 0, 0, 0, 0,
94 /* 0x30 - 0x37 */
95 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
96 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
97 0, 0, 0, 0,
98 /* 0x38 - 0x3F */
99 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
100 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
101 0, 0, 0, 0,
102 /* 0x40 - 0x47 */
103 DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg,
104 /* 0x48 - 0x4F */
105 DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg,
106 /* 0x50 - 0x57 */
107 SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, SrcReg | Stack,
108 SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, SrcReg | Stack,
109 /* 0x58 - 0x5F */
110 DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack,
111 DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack,
112 /* 0x60 - 0x67 */
113 0, 0, 0, DstReg | SrcMem32 | ModRM | Mov /* movsxd (x86/64) */ ,
114 0, 0, 0, 0,
115 /* 0x68 - 0x6F */
116 0, 0, ImplicitOps | Mov | Stack, 0,
117 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* insb, insw/insd */
118 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* outsb, outsw/outsd */
119 /* 0x70 - 0x77 */
120 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
121 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
122 /* 0x78 - 0x7F */
123 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
124 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
125 /* 0x80 - 0x87 */
126 ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM,
127 ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM,
128 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
129 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
130 /* 0x88 - 0x8F */
131 ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov,
132 ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
133 0, ModRM | DstReg, 0, DstMem | SrcNone | ModRM | Mov | Stack,
134 /* 0x90 - 0x9F */
135 0, 0, 0, 0, 0, 0, 0, 0,
136 0, 0, 0, 0, ImplicitOps | Stack, ImplicitOps | Stack, 0, 0,
137 /* 0xA0 - 0xA7 */
138 ByteOp | DstReg | SrcMem | Mov | MemAbs, DstReg | SrcMem | Mov | MemAbs,
139 ByteOp | DstMem | SrcReg | Mov | MemAbs, DstMem | SrcReg | Mov | MemAbs,
140 ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String,
141 ByteOp | ImplicitOps | String, ImplicitOps | String,
142 /* 0xA8 - 0xAF */
143 0, 0, ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String,
144 ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String,
145 ByteOp | ImplicitOps | String, ImplicitOps | String,
146 /* 0xB0 - 0xBF */
147 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
148 /* 0xC0 - 0xC7 */
149 ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM,
150 0, ImplicitOps | Stack, 0, 0,
151 ByteOp | DstMem | SrcImm | ModRM | Mov, DstMem | SrcImm | ModRM | Mov,
152 /* 0xC8 - 0xCF */
153 0, 0, 0, 0, 0, 0, 0, 0,
154 /* 0xD0 - 0xD7 */
155 ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM,
156 ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM,
157 0, 0, 0, 0,
158 /* 0xD8 - 0xDF */
159 0, 0, 0, 0, 0, 0, 0, 0,
160 /* 0xE0 - 0xE7 */
161 0, 0, 0, 0, 0, 0, 0, 0,
162 /* 0xE8 - 0xEF */
163 ImplicitOps | Stack, SrcImm|ImplicitOps, 0, SrcImmByte|ImplicitOps,
164 0, 0, 0, 0,
165 /* 0xF0 - 0xF7 */
166 0, 0, 0, 0,
167 ImplicitOps, ImplicitOps,
168 ByteOp | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM,
169 /* 0xF8 - 0xFF */
170 ImplicitOps, 0, ImplicitOps, ImplicitOps,
171 0, 0, ByteOp | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM
172};
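/*
 * Example of reading the table: opcode 0x88 (MOV r/m8, r8) decodes as
 * ByteOp | DstMem | SrcReg | ModRM | Mov, i.e. a byte-sized move with a
 * ModRM-encoded memory destination and a register source.  A zero entry
 * means the opcode is not handled by the emulator.
 */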
173
174static u16 twobyte_table[256] = {
175 /* 0x00 - 0x0F */
176 0, SrcMem | ModRM | DstReg, 0, 0, 0, 0, ImplicitOps, 0,
177 ImplicitOps, ImplicitOps, 0, 0, 0, ImplicitOps | ModRM, 0, 0,
178 /* 0x10 - 0x1F */
179 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, 0,
180 /* 0x20 - 0x2F */
181 ModRM | ImplicitOps, ModRM, ModRM | ImplicitOps, ModRM, 0, 0, 0, 0,
182 0, 0, 0, 0, 0, 0, 0, 0,
183 /* 0x30 - 0x3F */
184 ImplicitOps, 0, ImplicitOps, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
185 /* 0x40 - 0x47 */
186 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
187 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
188 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
189 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
190 /* 0x48 - 0x4F */
191 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
192 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
193 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
194 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
195 /* 0x50 - 0x5F */
196 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
197 /* 0x60 - 0x6F */
198 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
199 /* 0x70 - 0x7F */
200 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
201 /* 0x80 - 0x8F */
202 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
203 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
204 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
205 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
206 /* 0x90 - 0x9F */
207 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
208 /* 0xA0 - 0xA7 */
209 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 0, 0, 0, 0,
210 /* 0xA8 - 0xAF */
211 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 0, 0, 0, 0,
212 /* 0xB0 - 0xB7 */
213 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 0,
214 DstMem | SrcReg | ModRM | BitOp,
215 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov,
216 DstReg | SrcMem16 | ModRM | Mov,
217 /* 0xB8 - 0xBF */
218 0, 0, DstMem | SrcImmByte | ModRM, DstMem | SrcReg | ModRM | BitOp,
219 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov,
220 DstReg | SrcMem16 | ModRM | Mov,
221 /* 0xC0 - 0xCF */
222 0, 0, 0, DstMem | SrcReg | ModRM | Mov, 0, 0, 0, ImplicitOps | ModRM,
223 0, 0, 0, 0, 0, 0, 0, 0,
224 /* 0xD0 - 0xDF */
225 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
226 /* 0xE0 - 0xEF */
227 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
228 /* 0xF0 - 0xFF */
229 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
230};
231
232/* EFLAGS bit definitions. */
233#define EFLG_OF (1<<11)
234#define EFLG_DF (1<<10)
235#define EFLG_SF (1<<7)
236#define EFLG_ZF (1<<6)
237#define EFLG_AF (1<<4)
238#define EFLG_PF (1<<2)
239#define EFLG_CF (1<<0)
240
241/*
242 * Instruction emulation:
243 * Most instructions are emulated directly via a fragment of inline assembly
244 * code. This allows us to save/restore EFLAGS and thus very easily pick up
245 * any modified flags.
246 */
247
248#if defined(CONFIG_X86_64)
249#define _LO32 "k" /* force 32-bit operand */
250#define _STK "%%rsp" /* stack pointer */
251#elif defined(__i386__)
252#define _LO32 "" /* force 32-bit operand */
253#define _STK "%%esp" /* stack pointer */
254#endif
255
256/*
257 * These EFLAGS bits are restored from saved value during emulation, and
258 * any changes are written back to the saved value after emulation.
259 */
260#define EFLAGS_MASK (EFLG_OF|EFLG_SF|EFLG_ZF|EFLG_AF|EFLG_PF|EFLG_CF)
261
262/* Before executing instruction: restore necessary bits in EFLAGS. */
263#define _PRE_EFLAGS(_sav, _msk, _tmp) \
264 /* EFLAGS = (_sav & _msk) | (EFLAGS & ~_msk); _sav &= ~_msk; */ \
265 "movl %"_sav",%"_LO32 _tmp"; " \
266 "push %"_tmp"; " \
267 "push %"_tmp"; " \
268 "movl %"_msk",%"_LO32 _tmp"; " \
269 "andl %"_LO32 _tmp",("_STK"); " \
270 "pushf; " \
271 "notl %"_LO32 _tmp"; " \
272 "andl %"_LO32 _tmp",("_STK"); " \
273 "andl %"_LO32 _tmp","__stringify(BITS_PER_LONG/4)"("_STK"); " \
274 "pop %"_tmp"; " \
275 "orl %"_LO32 _tmp",("_STK"); " \
276 "popf; " \
277 "pop %"_sav"; "
278
279/* After executing instruction: write-back necessary bits in EFLAGS. */
280#define _POST_EFLAGS(_sav, _msk, _tmp) \
281 /* _sav |= EFLAGS & _msk; */ \
282 "pushf; " \
283 "pop %"_tmp"; " \
284 "andl %"_msk",%"_LO32 _tmp"; " \
285 "orl %"_LO32 _tmp",%"_sav"; "
286
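/*
 * Taken together, the two fragments above bracket each emulated ALU
 * instruction: _PRE_EFLAGS installs the guest's flags (the bits in _msk)
 * into the live EFLAGS without disturbing the host's other bits and
 * clears those bits in the saved value, and _POST_EFLAGS ORs the flags
 * the instruction actually produced back into the saved value.
 */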
287/* Raw emulation: instruction has two explicit operands. */
288#define __emulate_2op_nobyte(_op,_src,_dst,_eflags,_wx,_wy,_lx,_ly,_qx,_qy) \
289 do { \
290 unsigned long _tmp; \
291 \
292 switch ((_dst).bytes) { \
293 case 2: \
294 __asm__ __volatile__ ( \
295 _PRE_EFLAGS("0", "4", "2") \
296 _op"w %"_wx"3,%1; " \
297 _POST_EFLAGS("0", "4", "2") \
298 : "=m" (_eflags), "=m" ((_dst).val), \
299 "=&r" (_tmp) \
300 : _wy ((_src).val), "i" (EFLAGS_MASK)); \
301 break; \
302 case 4: \
303 __asm__ __volatile__ ( \
304 _PRE_EFLAGS("0", "4", "2") \
305 _op"l %"_lx"3,%1; " \
306 _POST_EFLAGS("0", "4", "2") \
307 : "=m" (_eflags), "=m" ((_dst).val), \
308 "=&r" (_tmp) \
309 : _ly ((_src).val), "i" (EFLAGS_MASK)); \
310 break; \
311 case 8: \
312 __emulate_2op_8byte(_op, _src, _dst, \
313 _eflags, _qx, _qy); \
314 break; \
315 } \
316 } while (0)
317
318#define __emulate_2op(_op,_src,_dst,_eflags,_bx,_by,_wx,_wy,_lx,_ly,_qx,_qy) \
319 do { \
320 unsigned long _tmp; \
321 switch ((_dst).bytes) { \
322 case 1: \
323 __asm__ __volatile__ ( \
324 _PRE_EFLAGS("0", "4", "2") \
325 _op"b %"_bx"3,%1; " \
326 _POST_EFLAGS("0", "4", "2") \
327 : "=m" (_eflags), "=m" ((_dst).val), \
328 "=&r" (_tmp) \
329 : _by ((_src).val), "i" (EFLAGS_MASK)); \
330 break; \
331 default: \
332 __emulate_2op_nobyte(_op, _src, _dst, _eflags, \
333 _wx, _wy, _lx, _ly, _qx, _qy); \
334 break; \
335 } \
336 } while (0)
337
338/* Source operand is byte-sized and may be restricted to just %cl. */
339#define emulate_2op_SrcB(_op, _src, _dst, _eflags) \
340 __emulate_2op(_op, _src, _dst, _eflags, \
341 "b", "c", "b", "c", "b", "c", "b", "c")
342
343/* Source operand is byte, word, long or quad sized. */
344#define emulate_2op_SrcV(_op, _src, _dst, _eflags) \
345 __emulate_2op(_op, _src, _dst, _eflags, \
346 "b", "q", "w", "r", _LO32, "r", "", "r")
347
348/* Source operand is word, long or quad sized. */
349#define emulate_2op_SrcV_nobyte(_op, _src, _dst, _eflags) \
350 __emulate_2op_nobyte(_op, _src, _dst, _eflags, \
351 "w", "r", _LO32, "r", "", "r")
352
353/* Instruction has only one explicit operand (no source operand). */
354#define emulate_1op(_op, _dst, _eflags) \
355 do { \
356 unsigned long _tmp; \
357 \
358 switch ((_dst).bytes) { \
359 case 1: \
360 __asm__ __volatile__ ( \
361 _PRE_EFLAGS("0", "3", "2") \
362 _op"b %1; " \
363 _POST_EFLAGS("0", "3", "2") \
364 : "=m" (_eflags), "=m" ((_dst).val), \
365 "=&r" (_tmp) \
366 : "i" (EFLAGS_MASK)); \
367 break; \
368 case 2: \
369 __asm__ __volatile__ ( \
370 _PRE_EFLAGS("0", "3", "2") \
371 _op"w %1; " \
372 _POST_EFLAGS("0", "3", "2") \
373 : "=m" (_eflags), "=m" ((_dst).val), \
374 "=&r" (_tmp) \
375 : "i" (EFLAGS_MASK)); \
376 break; \
377 case 4: \
378 __asm__ __volatile__ ( \
379 _PRE_EFLAGS("0", "3", "2") \
380 _op"l %1; " \
381 _POST_EFLAGS("0", "3", "2") \
382 : "=m" (_eflags), "=m" ((_dst).val), \
383 "=&r" (_tmp) \
384 : "i" (EFLAGS_MASK)); \
385 break; \
386 case 8: \
387 __emulate_1op_8byte(_op, _dst, _eflags); \
388 break; \
389 } \
390 } while (0)
391
392/* Emulate an instruction with quadword operands (x86/64 only). */
393#if defined(CONFIG_X86_64)
394#define __emulate_2op_8byte(_op, _src, _dst, _eflags, _qx, _qy) \
395 do { \
396 __asm__ __volatile__ ( \
397 _PRE_EFLAGS("0", "4", "2") \
398 _op"q %"_qx"3,%1; " \
399 _POST_EFLAGS("0", "4", "2") \
400 : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \
401 : _qy ((_src).val), "i" (EFLAGS_MASK)); \
402 } while (0)
403
404#define __emulate_1op_8byte(_op, _dst, _eflags) \
405 do { \
406 __asm__ __volatile__ ( \
407 _PRE_EFLAGS("0", "3", "2") \
408 _op"q %1; " \
409 _POST_EFLAGS("0", "3", "2") \
410 : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \
411 : "i" (EFLAGS_MASK)); \
412 } while (0)
413
414#elif defined(__i386__)
415#define __emulate_2op_8byte(_op, _src, _dst, _eflags, _qx, _qy)
416#define __emulate_1op_8byte(_op, _dst, _eflags)
417#endif /* __i386__ */
418
419/* Fetch next part of the instruction being emulated. */
420#define insn_fetch(_type, _size, _eip) \
421({ unsigned long _x; \
422 rc = do_insn_fetch(ctxt, ops, (_eip), &_x, (_size)); \
423 if (rc != 0) \
424 goto done; \
425 (_eip) += (_size); \
426 (_type)_x; \
427})
428
429/* Access/update address held in a register, based on addressing mode. */
430#define address_mask(reg) \
431 ((c->ad_bytes == sizeof(unsigned long)) ? \
432 (reg) : ((reg) & ((1UL << (c->ad_bytes << 3)) - 1)))
433#define register_address(base, reg) \
434 ((base) + address_mask(reg))
435#define register_address_increment(reg, inc) \
436 do { \
437 /* signed type ensures sign extension to long */ \
438 int _inc = (inc); \
439 if (c->ad_bytes == sizeof(unsigned long)) \
440 (reg) += _inc; \
441 else \
442 (reg) = ((reg) & \
443 ~((1UL << (c->ad_bytes << 3)) - 1)) | \
444 (((reg) + _inc) & \
445 ((1UL << (c->ad_bytes << 3)) - 1)); \
446 } while (0)
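/*
 * Example: with a 16-bit address size (ad_bytes == 2) only the low 16
 * bits of the register are updated, so incrementing SI == 0xffff by one
 * yields 0x0000 while the upper bits are left untouched, matching the
 * wrap-around behaviour of real 16-bit addressing.
 */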
447
448#define JMP_REL(rel) \
449 do { \
450 register_address_increment(c->eip, rel); \
451 } while (0)
452
453static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt,
454 struct x86_emulate_ops *ops,
455 unsigned long linear, u8 *dest)
456{
457 struct fetch_cache *fc = &ctxt->decode.fetch;
458 int rc;
459 int size;
460
461 if (linear < fc->start || linear >= fc->end) {
462 size = min(15UL, PAGE_SIZE - offset_in_page(linear));
463 rc = ops->read_std(linear, fc->data, size, ctxt->vcpu);
464 if (rc)
465 return rc;
466 fc->start = linear;
467 fc->end = linear + size;
468 }
469 *dest = fc->data[linear - fc->start];
470 return 0;
471}
472
473static int do_insn_fetch(struct x86_emulate_ctxt *ctxt,
474 struct x86_emulate_ops *ops,
475 unsigned long eip, void *dest, unsigned size)
476{
477 int rc = 0;
478
479 eip += ctxt->cs_base;
480 while (size--) {
481 rc = do_fetch_insn_byte(ctxt, ops, eip++, dest++);
482 if (rc)
483 return rc;
484 }
485 return 0;
486}
487
488/*
489 * Given the 'reg' portion of a ModRM byte, and a register block, return a
490 * pointer into the block that addresses the relevant register.
491 * @highbyte_regs specifies whether to decode AH,CH,DH,BH.
492 */
493static void *decode_register(u8 modrm_reg, unsigned long *regs,
494 int highbyte_regs)
495{
496 void *p;
497
498 p = &regs[modrm_reg];
499 if (highbyte_regs && modrm_reg >= 4 && modrm_reg < 8)
500 p = (unsigned char *)&regs[modrm_reg & 3] + 1;
501 return p;
502}
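/*
 * Example: for a byte operation without a REX prefix, reg encodings 4-7
 * select the legacy high-byte registers AH, CH, DH and BH, which live at
 * byte offset 1 of RAX, RCX, RDX and RBX; that is what the
 * "&regs[modrm_reg & 3] + 1" pointer arithmetic above computes (x86 is
 * little-endian, so byte 1 holds bits 8-15).
 */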
503
504static int read_descriptor(struct x86_emulate_ctxt *ctxt,
505 struct x86_emulate_ops *ops,
506 void *ptr,
507 u16 *size, unsigned long *address, int op_bytes)
508{
509 int rc;
510
511 if (op_bytes == 2)
512 op_bytes = 3;
513 *address = 0;
514 rc = ops->read_std((unsigned long)ptr, (unsigned long *)size, 2,
515 ctxt->vcpu);
516 if (rc)
517 return rc;
518 rc = ops->read_std((unsigned long)ptr + 2, address, op_bytes,
519 ctxt->vcpu);
520 return rc;
521}
522
523static int test_cc(unsigned int condition, unsigned int flags)
524{
525 int rc = 0;
526
527 switch ((condition & 15) >> 1) {
528 case 0: /* o */
529 rc |= (flags & EFLG_OF);
530 break;
531 case 1: /* b/c/nae */
532 rc |= (flags & EFLG_CF);
533 break;
534 case 2: /* z/e */
535 rc |= (flags & EFLG_ZF);
536 break;
537 case 3: /* be/na */
538 rc |= (flags & (EFLG_CF|EFLG_ZF));
539 break;
540 case 4: /* s */
541 rc |= (flags & EFLG_SF);
542 break;
543 case 5: /* p/pe */
544 rc |= (flags & EFLG_PF);
545 break;
546 case 7: /* le/ng */
547 rc |= (flags & EFLG_ZF);
548 /* fall through */
549 case 6: /* l/nge */
550 rc |= (!(flags & EFLG_SF) != !(flags & EFLG_OF));
551 break;
552 }
553
554 /* Odd condition identifiers (lsb == 1) have inverted sense. */
555 return (!!rc ^ (condition & 1));
556}
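/*
 * Example: for JZ (opcode 0x74) the condition nibble is 4, so
 * (condition & 15) >> 1 == 2 and test_cc() checks ZF; JNZ (0x75) has the
 * low bit set, so the same test is taken with inverted sense.
 */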
557
558static void decode_register_operand(struct operand *op,
559 struct decode_cache *c,
560 int inhibit_bytereg)
561{
562 unsigned reg = c->modrm_reg;
563 int highbyte_regs = c->rex_prefix == 0;
564
565 if (!(c->d & ModRM))
566 reg = (c->b & 7) | ((c->rex_prefix & 1) << 3);
567 op->type = OP_REG;
568 if ((c->d & ByteOp) && !inhibit_bytereg) {
569 op->ptr = decode_register(reg, c->regs, highbyte_regs);
570 op->val = *(u8 *)op->ptr;
571 op->bytes = 1;
572 } else {
573 op->ptr = decode_register(reg, c->regs, 0);
574 op->bytes = c->op_bytes;
575 switch (op->bytes) {
576 case 2:
577 op->val = *(u16 *)op->ptr;
578 break;
579 case 4:
580 op->val = *(u32 *)op->ptr;
581 break;
582 case 8:
583 op->val = *(u64 *) op->ptr;
584 break;
585 }
586 }
587 op->orig_val = op->val;
588}
589
590static int decode_modrm(struct x86_emulate_ctxt *ctxt,
591 struct x86_emulate_ops *ops)
592{
593 struct decode_cache *c = &ctxt->decode;
594 u8 sib;
595 int index_reg = 0, base_reg = 0, scale, rip_relative = 0;
596 int rc = 0;
597
598 if (c->rex_prefix) {
599 c->modrm_reg = (c->rex_prefix & 4) << 1; /* REX.R */
600 index_reg = (c->rex_prefix & 2) << 2; /* REX.X */
 601 c->modrm_rm = base_reg = (c->rex_prefix & 1) << 3; /* REX.B */
602 }
603
604 c->modrm = insn_fetch(u8, 1, c->eip);
605 c->modrm_mod |= (c->modrm & 0xc0) >> 6;
606 c->modrm_reg |= (c->modrm & 0x38) >> 3;
607 c->modrm_rm |= (c->modrm & 0x07);
608 c->modrm_ea = 0;
609 c->use_modrm_ea = 1;
610
611 if (c->modrm_mod == 3) {
612 c->modrm_val = *(unsigned long *)
613 decode_register(c->modrm_rm, c->regs, c->d & ByteOp);
614 return rc;
615 }
616
617 if (c->ad_bytes == 2) {
618 unsigned bx = c->regs[VCPU_REGS_RBX];
619 unsigned bp = c->regs[VCPU_REGS_RBP];
620 unsigned si = c->regs[VCPU_REGS_RSI];
621 unsigned di = c->regs[VCPU_REGS_RDI];
622
623 /* 16-bit ModR/M decode. */
624 switch (c->modrm_mod) {
625 case 0:
626 if (c->modrm_rm == 6)
627 c->modrm_ea += insn_fetch(u16, 2, c->eip);
628 break;
629 case 1:
630 c->modrm_ea += insn_fetch(s8, 1, c->eip);
631 break;
632 case 2:
633 c->modrm_ea += insn_fetch(u16, 2, c->eip);
634 break;
635 }
636 switch (c->modrm_rm) {
637 case 0:
638 c->modrm_ea += bx + si;
639 break;
640 case 1:
641 c->modrm_ea += bx + di;
642 break;
643 case 2:
644 c->modrm_ea += bp + si;
645 break;
646 case 3:
647 c->modrm_ea += bp + di;
648 break;
649 case 4:
650 c->modrm_ea += si;
651 break;
652 case 5:
653 c->modrm_ea += di;
654 break;
655 case 6:
656 if (c->modrm_mod != 0)
657 c->modrm_ea += bp;
658 break;
659 case 7:
660 c->modrm_ea += bx;
661 break;
662 }
663 if (c->modrm_rm == 2 || c->modrm_rm == 3 ||
664 (c->modrm_rm == 6 && c->modrm_mod != 0))
665 if (!c->override_base)
666 c->override_base = &ctxt->ss_base;
667 c->modrm_ea = (u16)c->modrm_ea;
668 } else {
669 /* 32/64-bit ModR/M decode. */
670 switch (c->modrm_rm) {
671 case 4:
672 case 12:
673 sib = insn_fetch(u8, 1, c->eip);
674 index_reg |= (sib >> 3) & 7;
675 base_reg |= sib & 7;
676 scale = sib >> 6;
677
678 switch (base_reg) {
679 case 5:
680 if (c->modrm_mod != 0)
681 c->modrm_ea += c->regs[base_reg];
682 else
683 c->modrm_ea +=
684 insn_fetch(s32, 4, c->eip);
685 break;
686 default:
687 c->modrm_ea += c->regs[base_reg];
688 }
689 switch (index_reg) {
690 case 4:
691 break;
692 default:
693 c->modrm_ea += c->regs[index_reg] << scale;
694 }
695 break;
696 case 5:
697 if (c->modrm_mod != 0)
698 c->modrm_ea += c->regs[c->modrm_rm];
699 else if (ctxt->mode == X86EMUL_MODE_PROT64)
700 rip_relative = 1;
701 break;
702 default:
703 c->modrm_ea += c->regs[c->modrm_rm];
704 break;
705 }
706 switch (c->modrm_mod) {
707 case 0:
708 if (c->modrm_rm == 5)
709 c->modrm_ea += insn_fetch(s32, 4, c->eip);
710 break;
711 case 1:
712 c->modrm_ea += insn_fetch(s8, 1, c->eip);
713 break;
714 case 2:
715 c->modrm_ea += insn_fetch(s32, 4, c->eip);
716 break;
717 }
718 }
719 if (rip_relative) {
720 c->modrm_ea += c->eip;
721 switch (c->d & SrcMask) {
722 case SrcImmByte:
723 c->modrm_ea += 1;
724 break;
725 case SrcImm:
726 if (c->d & ByteOp)
727 c->modrm_ea += 1;
728 else
729 if (c->op_bytes == 8)
730 c->modrm_ea += 4;
731 else
732 c->modrm_ea += c->op_bytes;
733 }
734 }
735done:
736 return rc;
737}
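/*
 * 16-bit example: mod=01, rm=010 decodes to [BP+SI+disp8].  BP-based
 * forms (rm 2, 3, and rm 6 with mod != 0) default to the SS segment,
 * which is why decode_modrm() points override_base at ss_base for
 * exactly those cases.
 */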
738
739static int decode_abs(struct x86_emulate_ctxt *ctxt,
740 struct x86_emulate_ops *ops)
741{
742 struct decode_cache *c = &ctxt->decode;
743 int rc = 0;
744
745 switch (c->ad_bytes) {
746 case 2:
747 c->modrm_ea = insn_fetch(u16, 2, c->eip);
748 break;
749 case 4:
750 c->modrm_ea = insn_fetch(u32, 4, c->eip);
751 break;
752 case 8:
753 c->modrm_ea = insn_fetch(u64, 8, c->eip);
754 break;
755 }
756done:
757 return rc;
758}
759
760int
761x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
762{
763 struct decode_cache *c = &ctxt->decode;
764 int rc = 0;
765 int mode = ctxt->mode;
766 int def_op_bytes, def_ad_bytes;
767
768 /* Shadow copy of register state. Committed on successful emulation. */
769
770 memset(c, 0, sizeof(struct decode_cache));
771 c->eip = ctxt->vcpu->arch.rip;
772 memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
773
774 switch (mode) {
775 case X86EMUL_MODE_REAL:
776 case X86EMUL_MODE_PROT16:
777 def_op_bytes = def_ad_bytes = 2;
778 break;
779 case X86EMUL_MODE_PROT32:
780 def_op_bytes = def_ad_bytes = 4;
781 break;
782#ifdef CONFIG_X86_64
783 case X86EMUL_MODE_PROT64:
784 def_op_bytes = 4;
785 def_ad_bytes = 8;
786 break;
787#endif
788 default:
789 return -1;
790 }
791
792 c->op_bytes = def_op_bytes;
793 c->ad_bytes = def_ad_bytes;
794
795 /* Legacy prefixes. */
796 for (;;) {
797 switch (c->b = insn_fetch(u8, 1, c->eip)) {
798 case 0x66: /* operand-size override */
799 /* switch between 2/4 bytes */
800 c->op_bytes = def_op_bytes ^ 6;
801 break;
802 case 0x67: /* address-size override */
803 if (mode == X86EMUL_MODE_PROT64)
804 /* switch between 4/8 bytes */
805 c->ad_bytes = def_ad_bytes ^ 12;
806 else
807 /* switch between 2/4 bytes */
808 c->ad_bytes = def_ad_bytes ^ 6;
809 break;
810 case 0x2e: /* CS override */
811 c->override_base = &ctxt->cs_base;
812 break;
813 case 0x3e: /* DS override */
814 c->override_base = &ctxt->ds_base;
815 break;
816 case 0x26: /* ES override */
817 c->override_base = &ctxt->es_base;
818 break;
819 case 0x64: /* FS override */
820 c->override_base = &ctxt->fs_base;
821 break;
822 case 0x65: /* GS override */
823 c->override_base = &ctxt->gs_base;
824 break;
825 case 0x36: /* SS override */
826 c->override_base = &ctxt->ss_base;
827 break;
828 case 0x40 ... 0x4f: /* REX */
829 if (mode != X86EMUL_MODE_PROT64)
830 goto done_prefixes;
831 c->rex_prefix = c->b;
832 continue;
833 case 0xf0: /* LOCK */
834 c->lock_prefix = 1;
835 break;
836 case 0xf2: /* REPNE/REPNZ */
837 c->rep_prefix = REPNE_PREFIX;
838 break;
839 case 0xf3: /* REP/REPE/REPZ */
840 c->rep_prefix = REPE_PREFIX;
841 break;
842 default:
843 goto done_prefixes;
844 }
845
846 /* Any legacy prefix after a REX prefix nullifies its effect. */
847
848 c->rex_prefix = 0;
849 }
850
851done_prefixes:
852
853 /* REX prefix. */
854 if (c->rex_prefix)
855 if (c->rex_prefix & 8)
856 c->op_bytes = 8; /* REX.W */
857
858 /* Opcode byte(s). */
859 c->d = opcode_table[c->b];
860 if (c->d == 0) {
861 /* Two-byte opcode? */
862 if (c->b == 0x0f) {
863 c->twobyte = 1;
864 c->b = insn_fetch(u8, 1, c->eip);
865 c->d = twobyte_table[c->b];
866 }
867
868 /* Unrecognised? */
869 if (c->d == 0) {
870 DPRINTF("Cannot emulate %02x\n", c->b);
871 return -1;
872 }
873 }
874
875 if (mode == X86EMUL_MODE_PROT64 && (c->d & Stack))
876 c->op_bytes = 8;
877
878 /* ModRM and SIB bytes. */
879 if (c->d & ModRM)
880 rc = decode_modrm(ctxt, ops);
881 else if (c->d & MemAbs)
882 rc = decode_abs(ctxt, ops);
883 if (rc)
884 goto done;
885
886 if (!c->override_base)
887 c->override_base = &ctxt->ds_base;
888 if (mode == X86EMUL_MODE_PROT64 &&
889 c->override_base != &ctxt->fs_base &&
890 c->override_base != &ctxt->gs_base)
891 c->override_base = NULL;
892
893 if (c->override_base)
894 c->modrm_ea += *c->override_base;
895
896 if (c->ad_bytes != 8)
897 c->modrm_ea = (u32)c->modrm_ea;
898 /*
899 * Decode and fetch the source operand: register, memory
900 * or immediate.
901 */
902 switch (c->d & SrcMask) {
903 case SrcNone:
904 break;
905 case SrcReg:
906 decode_register_operand(&c->src, c, 0);
907 break;
908 case SrcMem16:
909 c->src.bytes = 2;
910 goto srcmem_common;
911 case SrcMem32:
912 c->src.bytes = 4;
913 goto srcmem_common;
914 case SrcMem:
915 c->src.bytes = (c->d & ByteOp) ? 1 :
916 c->op_bytes;
917 /* Don't fetch the address for invlpg: it could be unmapped. */
918 if (c->twobyte && c->b == 0x01 && c->modrm_reg == 7)
919 break;
920 srcmem_common:
921 /*
922 * For instructions with a ModR/M byte, switch to register
923 * access if Mod = 3.
924 */
925 if ((c->d & ModRM) && c->modrm_mod == 3) {
926 c->src.type = OP_REG;
927 break;
928 }
929 c->src.type = OP_MEM;
930 break;
931 case SrcImm:
932 c->src.type = OP_IMM;
933 c->src.ptr = (unsigned long *)c->eip;
934 c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
935 if (c->src.bytes == 8)
936 c->src.bytes = 4;
937 /* NB. Immediates are sign-extended as necessary. */
938 switch (c->src.bytes) {
939 case 1:
940 c->src.val = insn_fetch(s8, 1, c->eip);
941 break;
942 case 2:
943 c->src.val = insn_fetch(s16, 2, c->eip);
944 break;
945 case 4:
946 c->src.val = insn_fetch(s32, 4, c->eip);
947 break;
948 }
949 break;
950 case SrcImmByte:
951 c->src.type = OP_IMM;
952 c->src.ptr = (unsigned long *)c->eip;
953 c->src.bytes = 1;
954 c->src.val = insn_fetch(s8, 1, c->eip);
955 break;
956 }
957
958 /* Decode and fetch the destination operand: register or memory. */
959 switch (c->d & DstMask) {
960 case ImplicitOps:
961 /* Special instructions do their own operand decoding. */
962 return 0;
963 case DstReg:
964 decode_register_operand(&c->dst, c,
965 c->twobyte && (c->b == 0xb6 || c->b == 0xb7));
966 break;
967 case DstMem:
968 if ((c->d & ModRM) && c->modrm_mod == 3) {
969 c->dst.type = OP_REG;
970 break;
971 }
972 c->dst.type = OP_MEM;
973 break;
974 }
975
976done:
977 return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
978}
979
980static inline void emulate_push(struct x86_emulate_ctxt *ctxt)
981{
982 struct decode_cache *c = &ctxt->decode;
983
984 c->dst.type = OP_MEM;
985 c->dst.bytes = c->op_bytes;
986 c->dst.val = c->src.val;
987 register_address_increment(c->regs[VCPU_REGS_RSP], -c->op_bytes);
988 c->dst.ptr = (void *) register_address(ctxt->ss_base,
989 c->regs[VCPU_REGS_RSP]);
990}
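/*
 * Note that emulate_push() only stages the push: it pre-decrements RSP
 * and points c->dst at the new top-of-stack slot; the actual memory
 * write is performed later by the common writeback() path.
 */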
991
992static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt,
993 struct x86_emulate_ops *ops)
994{
995 struct decode_cache *c = &ctxt->decode;
996 int rc;
997
998 rc = ops->read_std(register_address(ctxt->ss_base,
999 c->regs[VCPU_REGS_RSP]),
1000 &c->dst.val, c->dst.bytes, ctxt->vcpu);
1001 if (rc != 0)
1002 return rc;
1003
1004 register_address_increment(c->regs[VCPU_REGS_RSP], c->dst.bytes);
1005
1006 return 0;
1007}
1008
1009static inline void emulate_grp2(struct x86_emulate_ctxt *ctxt)
1010{
1011 struct decode_cache *c = &ctxt->decode;
1012 switch (c->modrm_reg) {
1013 case 0: /* rol */
1014 emulate_2op_SrcB("rol", c->src, c->dst, ctxt->eflags);
1015 break;
1016 case 1: /* ror */
1017 emulate_2op_SrcB("ror", c->src, c->dst, ctxt->eflags);
1018 break;
1019 case 2: /* rcl */
1020 emulate_2op_SrcB("rcl", c->src, c->dst, ctxt->eflags);
1021 break;
1022 case 3: /* rcr */
1023 emulate_2op_SrcB("rcr", c->src, c->dst, ctxt->eflags);
1024 break;
1025 case 4: /* sal/shl */
1026 case 6: /* sal/shl */
1027 emulate_2op_SrcB("sal", c->src, c->dst, ctxt->eflags);
1028 break;
1029 case 5: /* shr */
1030 emulate_2op_SrcB("shr", c->src, c->dst, ctxt->eflags);
1031 break;
1032 case 7: /* sar */
1033 emulate_2op_SrcB("sar", c->src, c->dst, ctxt->eflags);
1034 break;
1035 }
1036}
1037
1038static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt,
1039 struct x86_emulate_ops *ops)
1040{
1041 struct decode_cache *c = &ctxt->decode;
1042 int rc = 0;
1043
1044 switch (c->modrm_reg) {
1045 case 0 ... 1: /* test */
1046 /*
1047 * Special case in Grp3: test has an immediate
1048 * source operand.
1049 */
1050 c->src.type = OP_IMM;
1051 c->src.ptr = (unsigned long *)c->eip;
1052 c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1053 if (c->src.bytes == 8)
1054 c->src.bytes = 4;
1055 switch (c->src.bytes) {
1056 case 1:
1057 c->src.val = insn_fetch(s8, 1, c->eip);
1058 break;
1059 case 2:
1060 c->src.val = insn_fetch(s16, 2, c->eip);
1061 break;
1062 case 4:
1063 c->src.val = insn_fetch(s32, 4, c->eip);
1064 break;
1065 }
1066 emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags);
1067 break;
1068 case 2: /* not */
1069 c->dst.val = ~c->dst.val;
1070 break;
1071 case 3: /* neg */
1072 emulate_1op("neg", c->dst, ctxt->eflags);
1073 break;
1074 default:
1075 DPRINTF("Cannot emulate %02x\n", c->b);
1076 rc = X86EMUL_UNHANDLEABLE;
1077 break;
1078 }
1079done:
1080 return rc;
1081}
1082
1083static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt,
1084 struct x86_emulate_ops *ops)
1085{
1086 struct decode_cache *c = &ctxt->decode;
1087 int rc;
1088
1089 switch (c->modrm_reg) {
1090 case 0: /* inc */
1091 emulate_1op("inc", c->dst, ctxt->eflags);
1092 break;
1093 case 1: /* dec */
1094 emulate_1op("dec", c->dst, ctxt->eflags);
1095 break;
1096 case 4: /* jmp abs */
1097 if (c->b == 0xff)
1098 c->eip = c->dst.val;
1099 else {
1100 DPRINTF("Cannot emulate %02x\n", c->b);
1101 return X86EMUL_UNHANDLEABLE;
1102 }
1103 break;
1104 case 6: /* push */
1105
1106 /* 64-bit mode: PUSH always pushes a 64-bit operand. */
1107
1108 if (ctxt->mode == X86EMUL_MODE_PROT64) {
1109 c->dst.bytes = 8;
1110 rc = ops->read_std((unsigned long)c->dst.ptr,
1111 &c->dst.val, 8, ctxt->vcpu);
1112 if (rc != 0)
1113 return rc;
1114 }
1115 register_address_increment(c->regs[VCPU_REGS_RSP],
1116 -c->dst.bytes);
1117 rc = ops->write_emulated(register_address(ctxt->ss_base,
1118 c->regs[VCPU_REGS_RSP]), &c->dst.val,
1119 c->dst.bytes, ctxt->vcpu);
1120 if (rc != 0)
1121 return rc;
1122 c->dst.type = OP_NONE;
1123 break;
1124 default:
1125 DPRINTF("Cannot emulate %02x\n", c->b);
1126 return X86EMUL_UNHANDLEABLE;
1127 }
1128 return 0;
1129}
1130
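/*
 * Grp9 here is CMPXCHG8B: compare the 64-bit memory operand with
 * EDX:EAX; on a mismatch load it into EDX:EAX and clear ZF, otherwise
 * store ECX:EBX (via cmpxchg_emulated) and set ZF.
 */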
1131static inline int emulate_grp9(struct x86_emulate_ctxt *ctxt,
1132 struct x86_emulate_ops *ops,
1133 unsigned long memop)
1134{
1135 struct decode_cache *c = &ctxt->decode;
1136 u64 old, new;
1137 int rc;
1138
1139 rc = ops->read_emulated(memop, &old, 8, ctxt->vcpu);
1140 if (rc != 0)
1141 return rc;
1142
1143 if (((u32) (old >> 0) != (u32) c->regs[VCPU_REGS_RAX]) ||
1144 ((u32) (old >> 32) != (u32) c->regs[VCPU_REGS_RDX])) {
1145
1146 c->regs[VCPU_REGS_RAX] = (u32) (old >> 0);
1147 c->regs[VCPU_REGS_RDX] = (u32) (old >> 32);
1148 ctxt->eflags &= ~EFLG_ZF;
1149
1150 } else {
1151 new = ((u64)c->regs[VCPU_REGS_RCX] << 32) |
1152 (u32) c->regs[VCPU_REGS_RBX];
1153
1154 rc = ops->cmpxchg_emulated(memop, &old, &new, 8, ctxt->vcpu);
1155 if (rc != 0)
1156 return rc;
1157 ctxt->eflags |= EFLG_ZF;
1158 }
1159 return 0;
1160}
1161
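/*
 * writeback() commits the destination operand once emulation of the
 * instruction body is done.  If a LOCK prefix was present, the memory
 * update goes through cmpxchg_emulated() against the originally read
 * value rather than a plain store, approximating the atomicity of the
 * locked read-modify-write.
 */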
1162static inline int writeback(struct x86_emulate_ctxt *ctxt,
1163 struct x86_emulate_ops *ops)
1164{
1165 int rc;
1166 struct decode_cache *c = &ctxt->decode;
1167
1168 switch (c->dst.type) {
1169 case OP_REG:
1170 /* The 4-byte case *is* correct:
1171 * in 64-bit mode we zero-extend.
1172 */
1173 switch (c->dst.bytes) {
1174 case 1:
1175 *(u8 *)c->dst.ptr = (u8)c->dst.val;
1176 break;
1177 case 2:
1178 *(u16 *)c->dst.ptr = (u16)c->dst.val;
1179 break;
1180 case 4:
1181 *c->dst.ptr = (u32)c->dst.val;
1182 break; /* 64b: zero-ext */
1183 case 8:
1184 *c->dst.ptr = c->dst.val;
1185 break;
1186 }
1187 break;
1188 case OP_MEM:
1189 if (c->lock_prefix)
1190 rc = ops->cmpxchg_emulated(
1191 (unsigned long)c->dst.ptr,
1192 &c->dst.orig_val,
1193 &c->dst.val,
1194 c->dst.bytes,
1195 ctxt->vcpu);
1196 else
1197 rc = ops->write_emulated(
1198 (unsigned long)c->dst.ptr,
1199 &c->dst.val,
1200 c->dst.bytes,
1201 ctxt->vcpu);
1202 if (rc != 0)
1203 return rc;
1204 break;
1205 case OP_NONE:
1206 /* no writeback */
1207 break;
1208 default:
1209 break;
1210 }
1211 return 0;
1212}
1213
1214int
1215x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1216{
1217 unsigned long memop = 0;
1218 u64 msr_data;
1219 unsigned long saved_eip = 0;
1220 struct decode_cache *c = &ctxt->decode;
1221 int rc = 0;
1222
1223 /* Shadow copy of register state. Committed on successful emulation.
1224 * NOTE: we can copy them from vcpu as x86_decode_insn() doesn't
1225 * modify them.
1226 */
1227
1228 memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
1229 saved_eip = c->eip;
1230
1231 if (((c->d & ModRM) && (c->modrm_mod != 3)) || (c->d & MemAbs))
1232 memop = c->modrm_ea;
1233
1234 if (c->rep_prefix && (c->d & String)) {
1235 /* All REP prefixes have the same first termination condition */
1236 if (c->regs[VCPU_REGS_RCX] == 0) {
1237 ctxt->vcpu->arch.rip = c->eip;
1238 goto done;
1239 }
1240 /* The second termination condition only applies for REPE
1241 * and REPNE. Test if the repeat string operation prefix is
1242 * REPE/REPZ or REPNE/REPNZ and if it's the case it tests the
1243 * corresponding termination condition according to:
1244 * - if REPE/REPZ and ZF = 0 then done
1245 * - if REPNE/REPNZ and ZF = 1 then done
1246 */
1247 if ((c->b == 0xa6) || (c->b == 0xa7) ||
1248 (c->b == 0xae) || (c->b == 0xaf)) {
1249 if ((c->rep_prefix == REPE_PREFIX) &&
1250 ((ctxt->eflags & EFLG_ZF) == 0)) {
1251 ctxt->vcpu->arch.rip = c->eip;
1252 goto done;
1253 }
1254 if ((c->rep_prefix == REPNE_PREFIX) &&
1255 ((ctxt->eflags & EFLG_ZF) == EFLG_ZF)) {
1256 ctxt->vcpu->arch.rip = c->eip;
1257 goto done;
1258 }
1259 }
1260 c->regs[VCPU_REGS_RCX]--;
1261 c->eip = ctxt->vcpu->arch.rip;
1262 }
1263
1264 if (c->src.type == OP_MEM) {
1265 c->src.ptr = (unsigned long *)memop;
1266 c->src.val = 0;
1267 rc = ops->read_emulated((unsigned long)c->src.ptr,
1268 &c->src.val,
1269 c->src.bytes,
1270 ctxt->vcpu);
1271 if (rc != 0)
1272 goto done;
1273 c->src.orig_val = c->src.val;
1274 }
1275
1276 if ((c->d & DstMask) == ImplicitOps)
1277 goto special_insn;
1278
1279
1280 if (c->dst.type == OP_MEM) {
1281 c->dst.ptr = (unsigned long *)memop;
1282 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1283 c->dst.val = 0;
1284 if (c->d & BitOp) {
1285 unsigned long mask = ~(c->dst.bytes * 8 - 1);
1286
1287 c->dst.ptr = (void *)c->dst.ptr +
1288 (c->src.val & mask) / 8;
1289 }
1290 if (!(c->d & Mov) &&
1291 /* optimisation - avoid slow emulated read */
1292 ((rc = ops->read_emulated((unsigned long)c->dst.ptr,
1293 &c->dst.val,
1294 c->dst.bytes, ctxt->vcpu)) != 0))
1295 goto done;
1296 }
1297 c->dst.orig_val = c->dst.val;
1298
1299special_insn:
1300
1301 if (c->twobyte)
1302 goto twobyte_insn;
1303
1304 switch (c->b) {
1305 case 0x00 ... 0x05:
1306 add: /* add */
1307 emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags);
1308 break;
1309 case 0x08 ... 0x0d:
1310 or: /* or */
1311 emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags);
1312 break;
1313 case 0x10 ... 0x15:
1314 adc: /* adc */
1315 emulate_2op_SrcV("adc", c->src, c->dst, ctxt->eflags);
1316 break;
1317 case 0x18 ... 0x1d:
1318 sbb: /* sbb */
1319 emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags);
1320 break;
1321 case 0x20 ... 0x23:
1322 and: /* and */
1323 emulate_2op_SrcV("and", c->src, c->dst, ctxt->eflags);
1324 break;
1325 case 0x24: /* and al imm8 */
1326 c->dst.type = OP_REG;
1327 c->dst.ptr = &c->regs[VCPU_REGS_RAX];
1328 c->dst.val = *(u8 *)c->dst.ptr;
1329 c->dst.bytes = 1;
1330 c->dst.orig_val = c->dst.val;
1331 goto and;
1332 case 0x25: /* and ax imm16, or eax imm32 */
1333 c->dst.type = OP_REG;
1334 c->dst.bytes = c->op_bytes;
1335 c->dst.ptr = &c->regs[VCPU_REGS_RAX];
1336 if (c->op_bytes == 2)
1337 c->dst.val = *(u16 *)c->dst.ptr;
1338 else
1339 c->dst.val = *(u32 *)c->dst.ptr;
1340 c->dst.orig_val = c->dst.val;
1341 goto and;
1342 case 0x28 ... 0x2d:
1343 sub: /* sub */
1344 emulate_2op_SrcV("sub", c->src, c->dst, ctxt->eflags);
1345 break;
1346 case 0x30 ... 0x35:
1347 xor: /* xor */
1348 emulate_2op_SrcV("xor", c->src, c->dst, ctxt->eflags);
1349 break;
1350 case 0x38 ... 0x3d:
1351 cmp: /* cmp */
1352 emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags);
1353 break;
1354 case 0x40 ... 0x47: /* inc r16/r32 */
1355 emulate_1op("inc", c->dst, ctxt->eflags);
1356 break;
1357 case 0x48 ... 0x4f: /* dec r16/r32 */
1358 emulate_1op("dec", c->dst, ctxt->eflags);
1359 break;
1360 case 0x50 ... 0x57: /* push reg */
1361 c->dst.type = OP_MEM;
1362 c->dst.bytes = c->op_bytes;
1363 c->dst.val = c->src.val;
1364 register_address_increment(c->regs[VCPU_REGS_RSP],
1365 -c->op_bytes);
1366 c->dst.ptr = (void *) register_address(
1367 ctxt->ss_base, c->regs[VCPU_REGS_RSP]);
1368 break;
1369 case 0x58 ... 0x5f: /* pop reg */
1370 pop_instruction:
1371 if ((rc = ops->read_std(register_address(ctxt->ss_base,
1372 c->regs[VCPU_REGS_RSP]), c->dst.ptr,
1373 c->op_bytes, ctxt->vcpu)) != 0)
1374 goto done;
1375
1376 register_address_increment(c->regs[VCPU_REGS_RSP],
1377 c->op_bytes);
1378 c->dst.type = OP_NONE; /* Disable writeback. */
1379 break;
1380 case 0x63: /* movsxd */
1381 if (ctxt->mode != X86EMUL_MODE_PROT64)
1382 goto cannot_emulate;
1383 c->dst.val = (s32) c->src.val;
1384 break;
1385 case 0x6a: /* push imm8 */
1386 c->src.val = 0L;
1387 c->src.val = insn_fetch(s8, 1, c->eip);
1388 emulate_push(ctxt);
1389 break;
1390 case 0x6c: /* insb */
1391 case 0x6d: /* insw/insd */
1392 if (kvm_emulate_pio_string(ctxt->vcpu, NULL,
1393 1,
1394 (c->d & ByteOp) ? 1 : c->op_bytes,
1395 c->rep_prefix ?
1396 address_mask(c->regs[VCPU_REGS_RCX]) : 1,
1397 (ctxt->eflags & EFLG_DF),
1398 register_address(ctxt->es_base,
1399 c->regs[VCPU_REGS_RDI]),
1400 c->rep_prefix,
1401 c->regs[VCPU_REGS_RDX]) == 0) {
1402 c->eip = saved_eip;
1403 return -1;
1404 }
1405 return 0;
1406 case 0x6e: /* outsb */
1407 case 0x6f: /* outsw/outsd */
1408 if (kvm_emulate_pio_string(ctxt->vcpu, NULL,
1409 0,
1410 (c->d & ByteOp) ? 1 : c->op_bytes,
1411 c->rep_prefix ?
1412 address_mask(c->regs[VCPU_REGS_RCX]) : 1,
1413 (ctxt->eflags & EFLG_DF),
1414 register_address(c->override_base ?
1415 *c->override_base :
1416 ctxt->ds_base,
1417 c->regs[VCPU_REGS_RSI]),
1418 c->rep_prefix,
1419 c->regs[VCPU_REGS_RDX]) == 0) {
1420 c->eip = saved_eip;
1421 return -1;
1422 }
1423 return 0;
1424 case 0x70 ... 0x7f: /* jcc (short) */ {
1425 int rel = insn_fetch(s8, 1, c->eip);
1426
1427 if (test_cc(c->b, ctxt->eflags))
1428 JMP_REL(rel);
1429 break;
1430 }
1431 case 0x80 ... 0x83: /* Grp1 */
1432 switch (c->modrm_reg) {
1433 case 0:
1434 goto add;
1435 case 1:
1436 goto or;
1437 case 2:
1438 goto adc;
1439 case 3:
1440 goto sbb;
1441 case 4:
1442 goto and;
1443 case 5:
1444 goto sub;
1445 case 6:
1446 goto xor;
1447 case 7:
1448 goto cmp;
1449 }
1450 break;
1451 case 0x84 ... 0x85:
1452 emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags);
1453 break;
1454 case 0x86 ... 0x87: /* xchg */
1455 /* Write back the register source. */
1456 switch (c->dst.bytes) {
1457 case 1:
1458 *(u8 *) c->src.ptr = (u8) c->dst.val;
1459 break;
1460 case 2:
1461 *(u16 *) c->src.ptr = (u16) c->dst.val;
1462 break;
1463 case 4:
1464 *c->src.ptr = (u32) c->dst.val;
1465 break; /* a 32-bit write zero-extends the 64-bit reg */
1466 case 8:
1467 *c->src.ptr = c->dst.val;
1468 break;
1469 }
1470 /*
1471 * Write back the memory destination with implicit LOCK
1472 * prefix.
1473 */
1474 c->dst.val = c->src.val;
1475 c->lock_prefix = 1;
1476 break;
1477 case 0x88 ... 0x8b: /* mov */
1478 goto mov;
1479 case 0x8d: /* lea r16/r32, m */
1480 c->dst.val = c->modrm_val;
1481 break;
1482 case 0x8f: /* pop (sole member of Grp1a) */
1483 rc = emulate_grp1a(ctxt, ops);
1484 if (rc != 0)
1485 goto done;
1486 break;
1487 case 0x9c: /* pushf */
1488 c->src.val = (unsigned long) ctxt->eflags;
1489 emulate_push(ctxt);
1490 break;
1491 case 0x9d: /* popf */
1492 c->dst.ptr = (unsigned long *) &ctxt->eflags;
1493 goto pop_instruction;
1494 case 0xa0 ... 0xa1: /* mov */
1495 c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX];
1496 c->dst.val = c->src.val;
1497 break;
1498 case 0xa2 ... 0xa3: /* mov */
1499 c->dst.val = (unsigned long)c->regs[VCPU_REGS_RAX];
1500 break;
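/*
 * String moves: RSI/RDI are advanced by the element size after each
 * access, or moved backwards when EFLAGS.DF is set.
 */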
1501 case 0xa4 ... 0xa5: /* movs */
1502 c->dst.type = OP_MEM;
1503 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1504 c->dst.ptr = (unsigned long *)register_address(
1505 ctxt->es_base,
1506 c->regs[VCPU_REGS_RDI]);
1507 if ((rc = ops->read_emulated(register_address(
1508 c->override_base ? *c->override_base :
1509 ctxt->ds_base,
1510 c->regs[VCPU_REGS_RSI]),
1511 &c->dst.val,
1512 c->dst.bytes, ctxt->vcpu)) != 0)
1513 goto done;
1514 register_address_increment(c->regs[VCPU_REGS_RSI],
1515 (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
1516 : c->dst.bytes);
1517 register_address_increment(c->regs[VCPU_REGS_RDI],
1518 (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
1519 : c->dst.bytes);
1520 break;
1521 case 0xa6 ... 0xa7: /* cmps */
1522 c->src.type = OP_NONE; /* Disable writeback. */
1523 c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1524 c->src.ptr = (unsigned long *)register_address(
1525 c->override_base ? *c->override_base :
1526 ctxt->ds_base,
1527 c->regs[VCPU_REGS_RSI]);
1528 if ((rc = ops->read_emulated((unsigned long)c->src.ptr,
1529 &c->src.val,
1530 c->src.bytes,
1531 ctxt->vcpu)) != 0)
1532 goto done;
1533
1534 c->dst.type = OP_NONE; /* Disable writeback. */
1535 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1536 c->dst.ptr = (unsigned long *)register_address(
1537 ctxt->es_base,
1538 c->regs[VCPU_REGS_RDI]);
1539 if ((rc = ops->read_emulated((unsigned long)c->dst.ptr,
1540 &c->dst.val,
1541 c->dst.bytes,
1542 ctxt->vcpu)) != 0)
1543 goto done;
1544
1545 DPRINTF("cmps: mem1=0x%p mem2=0x%p\n", c->src.ptr, c->dst.ptr);
1546
1547 emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags);
1548
1549 register_address_increment(c->regs[VCPU_REGS_RSI],
1550 (ctxt->eflags & EFLG_DF) ? -c->src.bytes
1551 : c->src.bytes);
1552 register_address_increment(c->regs[VCPU_REGS_RDI],
1553 (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
1554 : c->dst.bytes);
1555
1556 break;
1557 case 0xaa ... 0xab: /* stos */
1558 c->dst.type = OP_MEM;
1559 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1560 c->dst.ptr = (unsigned long *)register_address(
1561 ctxt->es_base,
1562 c->regs[VCPU_REGS_RDI]);
1563 c->dst.val = c->regs[VCPU_REGS_RAX];
1564 register_address_increment(c->regs[VCPU_REGS_RDI],
1565 (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
1566 : c->dst.bytes);
1567 break;
1568 case 0xac ... 0xad: /* lods */
1569 c->dst.type = OP_REG;
1570 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1571 c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX];
1572 if ((rc = ops->read_emulated(register_address(
1573 c->override_base ? *c->override_base :
1574 ctxt->ds_base,
1575 c->regs[VCPU_REGS_RSI]),
1576 &c->dst.val,
1577 c->dst.bytes,
1578 ctxt->vcpu)) != 0)
1579 goto done;
1580 register_address_increment(c->regs[VCPU_REGS_RSI],
1581 (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
1582 : c->dst.bytes);
1583 break;
1584 case 0xae ... 0xaf: /* scas */
1585 DPRINTF("Urk! I don't handle SCAS.\n");
1586 goto cannot_emulate;
1587 case 0xc0 ... 0xc1:
1588 emulate_grp2(ctxt);
1589 break;
1590 case 0xc3: /* ret */
1591 c->dst.ptr = &c->eip;
1592 goto pop_instruction;
1593 case 0xc6 ... 0xc7: /* mov (sole member of Grp11) */
1594 mov:
1595 c->dst.val = c->src.val;
1596 break;
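/*
 * Grp2 (rotate/shift family): 0xc0/0xc1 above take an immediate count,
 * 0xd0/0xd1 use a count of 1, and 0xd2/0xd3 take the count from
 * RCX (CL); emulate_grp2() selects the operation from the ModRM reg
 * field.
 */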
1597 case 0xd0 ... 0xd1: /* Grp2 */
1598 c->src.val = 1;
1599 emulate_grp2(ctxt);
1600 break;
1601 case 0xd2 ... 0xd3: /* Grp2 */
1602 c->src.val = c->regs[VCPU_REGS_RCX];
1603 emulate_grp2(ctxt);
1604 break;
1605 case 0xe8: /* call (near) */ {
1606 long int rel;
1607 switch (c->op_bytes) {
1608 case 2:
1609 rel = insn_fetch(s16, 2, c->eip);
1610 break;
1611 case 4:
1612 rel = insn_fetch(s32, 4, c->eip);
1613 break;
1614 default:
1615 DPRINTF("Call: Invalid op_bytes\n");
1616 goto cannot_emulate;
1617 }
1618 c->src.val = (unsigned long) c->eip;
1619 JMP_REL(rel);
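/*
 * The next line switches op_bytes to ad_bytes before the push;
 * presumably this makes emulate_push(), which sizes its stack write by
 * op_bytes, store the return address at the effective address width.
 */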
1620 c->op_bytes = c->ad_bytes;
1621 emulate_push(ctxt);
1622 break;
1623 }
1624 case 0xe9: /* jmp rel */
1625 case 0xeb: /* jmp rel short */
1626 JMP_REL(c->src.val);
1627 c->dst.type = OP_NONE; /* Disable writeback. */
1628 break;
1629 case 0xf4: /* hlt */
1630 ctxt->vcpu->arch.halt_request = 1;
1631 goto done;
1632 case 0xf5: /* cmc */
1633 /* complement carry flag from eflags reg */
1634 ctxt->eflags ^= EFLG_CF;
1635 c->dst.type = OP_NONE; /* Disable writeback. */
1636 break;
1637 case 0xf6 ... 0xf7: /* Grp3 */
1638 rc = emulate_grp3(ctxt, ops);
1639 if (rc != 0)
1640 goto done;
1641 break;
1642 case 0xf8: /* clc */
1643 ctxt->eflags &= ~EFLG_CF;
1644 c->dst.type = OP_NONE; /* Disable writeback. */
1645 break;
1646 case 0xfa: /* cli */
1647 ctxt->eflags &= ~X86_EFLAGS_IF;
1648 c->dst.type = OP_NONE; /* Disable writeback. */
1649 break;
1650 case 0xfb: /* sti */
1651 ctxt->eflags |= X86_EFLAGS_IF;
1652 c->dst.type = OP_NONE; /* Disable writeback. */
1653 break;
1654 case 0xfe ... 0xff: /* Grp4/Grp5 */
1655 rc = emulate_grp45(ctxt, ops);
1656 if (rc != 0)
1657 goto done;
1658 break;
1659 }
1660
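/*
 * Common exit: writeback() commits c->dst according to its type
 * (register, memory, or none), after which the shadowed guest
 * registers and rip are copied back into the vcpu.
 */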
1661writeback:
1662 rc = writeback(ctxt, ops);
1663 if (rc != 0)
1664 goto done;
1665
1666 /* Commit shadow register state. */
1667 memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs);
1668 ctxt->vcpu->arch.rip = c->eip;
1669
1670done:
1671 if (rc == X86EMUL_UNHANDLEABLE) {
1672 c->eip = saved_eip;
1673 return -1;
1674 }
1675 return 0;
1676
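/*
 * Two-byte (0x0f-prefixed) opcodes are handled below; c->b holds the
 * second opcode byte.
 */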
1677twobyte_insn:
1678 switch (c->b) {
1679 case 0x01: /* lgdt, lidt, lmsw */
1680 switch (c->modrm_reg) {
1681 u16 size;
1682 unsigned long address;
1683
1684 case 0: /* vmcall */
1685 if (c->modrm_mod != 3 || c->modrm_rm != 1)
1686 goto cannot_emulate;
1687
1688 rc = kvm_fix_hypercall(ctxt->vcpu);
1689 if (rc)
1690 goto done;
1691
1692 kvm_emulate_hypercall(ctxt->vcpu);
1693 break;
1694 case 2: /* lgdt */
1695 rc = read_descriptor(ctxt, ops, c->src.ptr,
1696 &size, &address, c->op_bytes);
1697 if (rc)
1698 goto done;
1699 realmode_lgdt(ctxt->vcpu, size, address);
1700 break;
1701 case 3: /* lidt/vmmcall */
1702 if (c->modrm_mod == 3 && c->modrm_rm == 1) {
1703 rc = kvm_fix_hypercall(ctxt->vcpu);
1704 if (rc)
1705 goto done;
1706 kvm_emulate_hypercall(ctxt->vcpu);
1707 } else {
1708 rc = read_descriptor(ctxt, ops, c->src.ptr,
1709 &size, &address,
1710 c->op_bytes);
1711 if (rc)
1712 goto done;
1713 realmode_lidt(ctxt->vcpu, size, address);
1714 }
1715 break;
1716 case 4: /* smsw */
1717 if (c->modrm_mod != 3)
1718 goto cannot_emulate;
1719 *(u16 *)&c->regs[c->modrm_rm]
1720 = realmode_get_cr(ctxt->vcpu, 0);
1721 break;
1722 case 6: /* lmsw */
1723 if (c->modrm_mod != 3)
1724 goto cannot_emulate;
1725 realmode_lmsw(ctxt->vcpu, (u16)c->modrm_val,
1726 &ctxt->eflags);
1727 break;
1728 case 7: /* invlpg */
1729 emulate_invlpg(ctxt->vcpu, memop);
1730 break;
1731 default:
1732 goto cannot_emulate;
1733 }
1734 /* Disable writeback. */
1735 c->dst.type = OP_NONE;
1736 break;
1737 case 0x06: /* clts */
1738 emulate_clts(ctxt->vcpu);
1739 c->dst.type = OP_NONE;
1740 break;
1741 case 0x08: /* invd */
1742 case 0x09: /* wbinvd */
1743 case 0x0d: /* GrpP (prefetch) */
1744 case 0x18: /* Grp16 (prefetch/nop) */
1745 c->dst.type = OP_NONE;
1746 break;
1747 case 0x20: /* mov cr, reg */
1748 if (c->modrm_mod != 3)
1749 goto cannot_emulate;
1750 c->regs[c->modrm_rm] =
1751 realmode_get_cr(ctxt->vcpu, c->modrm_reg);
1752 c->dst.type = OP_NONE; /* no writeback */
1753 break;
1754 case 0x21: /* mov from dr to reg */
1755 if (c->modrm_mod != 3)
1756 goto cannot_emulate;
1757 rc = emulator_get_dr(ctxt, c->modrm_reg, &c->regs[c->modrm_rm]);
1758 if (rc)
1759 goto cannot_emulate;
1760 c->dst.type = OP_NONE; /* no writeback */
1761 break;
1762 case 0x22: /* mov reg, cr */
1763 if (c->modrm_mod != 3)
1764 goto cannot_emulate;
1765 realmode_set_cr(ctxt->vcpu,
1766 c->modrm_reg, c->modrm_val, &ctxt->eflags);
1767 c->dst.type = OP_NONE;
1768 break;
1769 case 0x23: /* mov from reg to dr */
1770 if (c->modrm_mod != 3)
1771 goto cannot_emulate;
1772 rc = emulator_set_dr(ctxt, c->modrm_reg,
1773 c->regs[c->modrm_rm]);
1774 if (rc)
1775 goto cannot_emulate;
1776 c->dst.type = OP_NONE; /* no writeback */
1777 break;
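/*
 * wrmsr/rdmsr: the MSR index is taken from ECX and the 64-bit value is
 * split across EDX:EAX, matching the hardware register convention.
 */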
1778 case 0x30:
1779 /* wrmsr */
1780 msr_data = (u32)c->regs[VCPU_REGS_RAX]
1781 | ((u64)c->regs[VCPU_REGS_RDX] << 32);
1782 rc = kvm_set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data);
1783 if (rc) {
1784 kvm_inject_gp(ctxt->vcpu, 0);
1785 c->eip = ctxt->vcpu->arch.rip;
1786 }
1787 rc = X86EMUL_CONTINUE;
1788 c->dst.type = OP_NONE;
1789 break;
1790 case 0x32:
1791 /* rdmsr */
1792 rc = kvm_get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data);
1793 if (rc) {
1794 kvm_inject_gp(ctxt->vcpu, 0);
1795 c->eip = ctxt->vcpu->arch.rip;
1796 } else {
1797 c->regs[VCPU_REGS_RAX] = (u32)msr_data;
1798 c->regs[VCPU_REGS_RDX] = msr_data >> 32;
1799 }
1800 rc = X86EMUL_CONTINUE;
1801 c->dst.type = OP_NONE;
1802 break;
1803 case 0x40 ... 0x4f: /* cmov */
1804 c->dst.val = c->dst.orig_val = c->src.val;
1805 if (!test_cc(c->b, ctxt->eflags))
1806 c->dst.type = OP_NONE; /* no writeback */
1807 break;
1808 case 0x80 ... 0x8f: /* jcc (near): jnz rel, etc. */ {
1809 long int rel;
1810
1811 switch (c->op_bytes) {
1812 case 2:
1813 rel = insn_fetch(s16, 2, c->eip);
1814 break;
1815 case 4:
1816 rel = insn_fetch(s32, 4, c->eip);
1817 break;
1818 case 8:
1819 rel = insn_fetch(s64, 8, c->eip);
1820 break;
1821 default:
1822 DPRINTF("jnz: Invalid op_bytes\n");
1823 goto cannot_emulate;
1824 }
1825 if (test_cc(c->b, ctxt->eflags))
1826 JMP_REL(rel);
1827 c->dst.type = OP_NONE;
1828 break;
1829 }
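/*
 * Bit-test family (bt/bts/btr/btc): only "subword" bit offsets are
 * handled, so the source offset is masked to the destination width in
 * bits before each operation.
 */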
1830 case 0xa3:
1831 bt: /* bt */
1832 c->dst.type = OP_NONE;
1833 /* only subword offset */
1834 c->src.val &= (c->dst.bytes << 3) - 1;
1835 emulate_2op_SrcV_nobyte("bt", c->src, c->dst, ctxt->eflags);
1836 break;
1837 case 0xab:
1838 bts: /* bts */
1839 /* only subword offset */
1840 c->src.val &= (c->dst.bytes << 3) - 1;
1841 emulate_2op_SrcV_nobyte("bts", c->src, c->dst, ctxt->eflags);
1842 break;
1843 case 0xb0 ... 0xb1: /* cmpxchg */
1844 /*
1845 * Save real source value, then compare EAX against
1846 * destination.
1847 */
1848 c->src.orig_val = c->src.val;
1849 c->src.val = c->regs[VCPU_REGS_RAX];
1850 emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags);
1851 if (ctxt->eflags & EFLG_ZF) {
1852 /* Success: write back to memory. */
1853 c->dst.val = c->src.orig_val;
1854 } else {
1855 /* Failure: write the value we saw to EAX. */
1856 c->dst.type = OP_REG;
1857 c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX];
1858 }
1859 break;
1860 case 0xb3:
1861 btr: /* btr */
1862 /* only subword offset */
1863 c->src.val &= (c->dst.bytes << 3) - 1;
1864 emulate_2op_SrcV_nobyte("btr", c->src, c->dst, ctxt->eflags);
1865 break;
1866 case 0xb6 ... 0xb7: /* movzx */
1867 c->dst.bytes = c->op_bytes;
1868 c->dst.val = (c->d & ByteOp) ? (u8) c->src.val
1869 : (u16) c->src.val;
1870 break;
1871 case 0xba: /* Grp8 */
1872 switch (c->modrm_reg & 3) {
1873 case 0:
1874 goto bt;
1875 case 1:
1876 goto bts;
1877 case 2:
1878 goto btr;
1879 case 3:
1880 goto btc;
1881 }
1882 break;
1883 case 0xbb:
1884 btc: /* btc */
1885 /* only subword offset */
1886 c->src.val &= (c->dst.bytes << 3) - 1;
1887 emulate_2op_SrcV_nobyte("btc", c->src, c->dst, ctxt->eflags);
1888 break;
1889 case 0xbe ... 0xbf: /* movsx */
1890 c->dst.bytes = c->op_bytes;
1891 c->dst.val = (c->d & ByteOp) ? (s8) c->src.val :
1892 (s16) c->src.val;
1893 break;
1894 case 0xc3: /* movnti */
1895 c->dst.bytes = c->op_bytes;
1896 c->dst.val = (c->op_bytes == 4) ? (u32) c->src.val :
1897 (u64) c->src.val;
1898 break;
1899 case 0xc7: /* Grp9 (cmpxchg8b) */
1900 rc = emulate_grp9(ctxt, ops, memop);
1901 if (rc != 0)
1902 goto done;
1903 c->dst.type = OP_NONE;
1904 break;
1905 }
1906 goto writeback;
1907
1908cannot_emulate:
1909 DPRINTF("Cannot emulate %02x\n", c->b);
1910 c->eip = saved_eip;
1911 return -1;
1912}