aboutsummaryrefslogtreecommitdiffstats
path: root/arch
diff options
context:
space:
mode:
authorAvi Kivity <avi@qumranet.com>2007-12-16 04:02:48 -0500
committerAvi Kivity <avi@qumranet.com>2008-01-30 11:01:18 -0500
commitedf884172e9828c6234b254208af04655855038d (patch)
treef5e5d1eecaed9737eced6ba60d09fe93149751c1 /arch
parent9584bf2c93f56656dba0de8f6c75b54ca7995143 (diff)
KVM: Move arch dependent files to new directory arch/x86/kvm/
This paves the way for multiple architecture support. Note that while ioapic.c could potentially be shared with ia64, it is also moved. Signed-off-by: Avi Kivity <avi@qumranet.com>
Diffstat (limited to 'arch')
-rw-r--r--arch/x86/Kconfig2
-rw-r--r--arch/x86/Makefile2
-rw-r--r--arch/x86/kvm/Kconfig57
-rw-r--r--arch/x86/kvm/Makefile15
-rw-r--r--arch/x86/kvm/i8259.c450
-rw-r--r--arch/x86/kvm/ioapic.c400
-rw-r--r--arch/x86/kvm/irq.c98
-rw-r--r--arch/x86/kvm/irq.h195
-rw-r--r--arch/x86/kvm/kvm_svm.h45
-rw-r--r--arch/x86/kvm/lapic.c1085
-rw-r--r--arch/x86/kvm/mmu.c1805
-rw-r--r--arch/x86/kvm/mmu.h44
-rw-r--r--arch/x86/kvm/paging_tmpl.h461
-rw-r--r--arch/x86/kvm/segment_descriptor.h29
-rw-r--r--arch/x86/kvm/svm.c1725
-rw-r--r--arch/x86/kvm/svm.h325
-rw-r--r--arch/x86/kvm/vmx.c2671
-rw-r--r--arch/x86/kvm/vmx.h324
-rw-r--r--arch/x86/kvm/x86.c3146
-rw-r--r--arch/x86/kvm/x86_emulate.c1912
20 files changed, 14791 insertions, 0 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index d289cfcf92c4..65b449134cf7 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1599,4 +1599,6 @@ source "security/Kconfig"
1599 1599
1600source "crypto/Kconfig" 1600source "crypto/Kconfig"
1601 1601
1602source "arch/x86/kvm/Kconfig"
1603
1602source "lib/Kconfig" 1604source "lib/Kconfig"
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index b08f18261df6..da8f4129780b 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -7,6 +7,8 @@ else
7 KBUILD_DEFCONFIG := $(ARCH)_defconfig 7 KBUILD_DEFCONFIG := $(ARCH)_defconfig
8endif 8endif
9 9
10core-$(CONFIG_KVM) += arch/x86/kvm/
11
10# BITS is used as extension for files which are available in a 32 bit 12# BITS is used as extension for files which are available in a 32 bit
11# and a 64 bit version to simplify shared Makefiles. 13# and a 64 bit version to simplify shared Makefiles.
12# e.g.: obj-y += foo_$(BITS).o 14# e.g.: obj-y += foo_$(BITS).o
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
new file mode 100644
index 000000000000..c83e1c9b5129
--- /dev/null
+++ b/arch/x86/kvm/Kconfig
@@ -0,0 +1,57 @@
1#
2# KVM configuration
3#
4config HAVE_KVM
5 bool
6
7menuconfig VIRTUALIZATION
8 bool "Virtualization"
9 depends on HAVE_KVM || X86
10 default y
11 ---help---
12 Say Y here to get to see options for using your Linux host to run other
13 operating systems inside virtual machines (guests).
14 This option alone does not add any kernel code.
15
16 If you say N, all options in this submenu will be skipped and disabled.
17
18if VIRTUALIZATION
19
20config KVM
21 tristate "Kernel-based Virtual Machine (KVM) support"
22 depends on HAVE_KVM && EXPERIMENTAL
23 select PREEMPT_NOTIFIERS
24 select ANON_INODES
25 ---help---
26 Support hosting fully virtualized guest machines using hardware
27 virtualization extensions. You will need a fairly recent
28 processor equipped with virtualization extensions. You will also
29 need to select one or more of the processor modules below.
30
31 This module provides access to the hardware capabilities through
32 a character device node named /dev/kvm.
33
34 To compile this as a module, choose M here: the module
35 will be called kvm.
36
37 If unsure, say N.
38
39config KVM_INTEL
40 tristate "KVM for Intel processors support"
41 depends on KVM
42 ---help---
43 Provides support for KVM on Intel processors equipped with the VT
44 extensions.
45
46config KVM_AMD
47 tristate "KVM for AMD processors support"
48 depends on KVM
49 ---help---
50 Provides support for KVM on AMD processors equipped with the AMD-V
51 (SVM) extensions.
52
53# OK, it's a little counter-intuitive to do this, but it puts it neatly under
54# the virtualization menu.
55source drivers/lguest/Kconfig
56
57endif # VIRTUALIZATION
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
new file mode 100644
index 000000000000..880ffe403b35
--- /dev/null
+++ b/arch/x86/kvm/Makefile
@@ -0,0 +1,15 @@
1#
2# Makefile for Kernel-based Virtual Machine module
3#
4
5common-objs = $(addprefix ../../../drivers/kvm/, kvm_main.o)
6
7EXTRA_CFLAGS += -I drivers/kvm
8
9kvm-objs := $(common-objs) x86.o mmu.o x86_emulate.o i8259.o irq.o lapic.o \
10 ioapic.o
11obj-$(CONFIG_KVM) += kvm.o
12kvm-intel-objs = vmx.o
13obj-$(CONFIG_KVM_INTEL) += kvm-intel.o
14kvm-amd-objs = svm.o
15obj-$(CONFIG_KVM_AMD) += kvm-amd.o
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
new file mode 100644
index 000000000000..ab29cf2def47
--- /dev/null
+++ b/arch/x86/kvm/i8259.c
@@ -0,0 +1,450 @@
1/*
2 * 8259 interrupt controller emulation
3 *
4 * Copyright (c) 2003-2004 Fabrice Bellard
5 * Copyright (c) 2007 Intel Corporation
6 *
7 * Permission is hereby granted, free of charge, to any person obtaining a copy
8 * of this software and associated documentation files (the "Software"), to deal
9 * in the Software without restriction, including without limitation the rights
10 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11 * copies of the Software, and to permit persons to whom the Software is
12 * furnished to do so, subject to the following conditions:
13 *
14 * The above copyright notice and this permission notice shall be included in
15 * all copies or substantial portions of the Software.
16 *
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
23 * THE SOFTWARE.
24 * Authors:
25 * Yaozu (Eddie) Dong <Eddie.dong@intel.com>
26 * Port from Qemu.
27 */
28#include <linux/mm.h>
29#include "irq.h"
30
31#include <linux/kvm_host.h>
32
33/*
34 * set irq level. If an edge is detected, then the IRR is set to 1
35 */
36static inline void pic_set_irq1(struct kvm_kpic_state *s, int irq, int level)
37{
38 int mask;
39 mask = 1 << irq;
40 if (s->elcr & mask) /* level triggered */
41 if (level) {
42 s->irr |= mask;
43 s->last_irr |= mask;
44 } else {
45 s->irr &= ~mask;
46 s->last_irr &= ~mask;
47 }
48 else /* edge triggered */
49 if (level) {
50 if ((s->last_irr & mask) == 0)
51 s->irr |= mask;
52 s->last_irr |= mask;
53 } else
54 s->last_irr &= ~mask;
55}
56
57/*
58 * return the highest priority found in mask (highest = smallest
59 * number). Return 8 if no irq
60 */
61static inline int get_priority(struct kvm_kpic_state *s, int mask)
62{
63 int priority;
64 if (mask == 0)
65 return 8;
66 priority = 0;
67 while ((mask & (1 << ((priority + s->priority_add) & 7))) == 0)
68 priority++;
69 return priority;
70}
71
72/*
73 * return the pic wanted interrupt. return -1 if none
74 */
75static int pic_get_irq(struct kvm_kpic_state *s)
76{
77 int mask, cur_priority, priority;
78
79 mask = s->irr & ~s->imr;
80 priority = get_priority(s, mask);
81 if (priority == 8)
82 return -1;
83 /*
84 * compute current priority. If special fully nested mode on the
85 * master, the IRQ coming from the slave is not taken into account
86 * for the priority computation.
87 */
88 mask = s->isr;
89 if (s->special_fully_nested_mode && s == &s->pics_state->pics[0])
90 mask &= ~(1 << 2);
91 cur_priority = get_priority(s, mask);
92 if (priority < cur_priority)
93 /*
94 * higher priority found: an irq should be generated
95 */
96 return (priority + s->priority_add) & 7;
97 else
98 return -1;
99}
100
101/*
102 * raise irq to CPU if necessary. must be called every time the active
103 * irq may change
104 */
105static void pic_update_irq(struct kvm_pic *s)
106{
107 int irq2, irq;
108
109 irq2 = pic_get_irq(&s->pics[1]);
110 if (irq2 >= 0) {
111 /*
112 * if irq request by slave pic, signal master PIC
113 */
114 pic_set_irq1(&s->pics[0], 2, 1);
115 pic_set_irq1(&s->pics[0], 2, 0);
116 }
117 irq = pic_get_irq(&s->pics[0]);
118 if (irq >= 0)
119 s->irq_request(s->irq_request_opaque, 1);
120 else
121 s->irq_request(s->irq_request_opaque, 0);
122}
123
/* Externally visible wrapper around pic_update_irq(). */
void kvm_pic_update_irq(struct kvm_pic *s)
{
	pic_update_irq(s);
}
128
129void kvm_pic_set_irq(void *opaque, int irq, int level)
130{
131 struct kvm_pic *s = opaque;
132
133 pic_set_irq1(&s->pics[irq >> 3], irq & 7, level);
134 pic_update_irq(s);
135}
136
137/*
138 * acknowledge interrupt 'irq'
139 */
140static inline void pic_intack(struct kvm_kpic_state *s, int irq)
141{
142 if (s->auto_eoi) {
143 if (s->rotate_on_auto_eoi)
144 s->priority_add = (irq + 1) & 7;
145 } else
146 s->isr |= (1 << irq);
147 /*
148 * We don't clear a level sensitive interrupt here
149 */
150 if (!(s->elcr & (1 << irq)))
151 s->irr &= ~(1 << irq);
152}
153
154int kvm_pic_read_irq(struct kvm_pic *s)
155{
156 int irq, irq2, intno;
157
158 irq = pic_get_irq(&s->pics[0]);
159 if (irq >= 0) {
160 pic_intack(&s->pics[0], irq);
161 if (irq == 2) {
162 irq2 = pic_get_irq(&s->pics[1]);
163 if (irq2 >= 0)
164 pic_intack(&s->pics[1], irq2);
165 else
166 /*
167 * spurious IRQ on slave controller
168 */
169 irq2 = 7;
170 intno = s->pics[1].irq_base + irq2;
171 irq = irq2 + 8;
172 } else
173 intno = s->pics[0].irq_base + irq;
174 } else {
175 /*
176 * spurious IRQ on host controller
177 */
178 irq = 7;
179 intno = s->pics[0].irq_base + irq;
180 }
181 pic_update_irq(s);
182
183 return intno;
184}
185
186void kvm_pic_reset(struct kvm_kpic_state *s)
187{
188 s->last_irr = 0;
189 s->irr = 0;
190 s->imr = 0;
191 s->isr = 0;
192 s->priority_add = 0;
193 s->irq_base = 0;
194 s->read_reg_select = 0;
195 s->poll = 0;
196 s->special_mask = 0;
197 s->init_state = 0;
198 s->auto_eoi = 0;
199 s->rotate_on_auto_eoi = 0;
200 s->special_fully_nested_mode = 0;
201 s->init4 = 0;
202}
203
204static void pic_ioport_write(void *opaque, u32 addr, u32 val)
205{
206 struct kvm_kpic_state *s = opaque;
207 int priority, cmd, irq;
208
209 addr &= 1;
210 if (addr == 0) {
211 if (val & 0x10) {
212 kvm_pic_reset(s); /* init */
213 /*
214 * deassert a pending interrupt
215 */
216 s->pics_state->irq_request(s->pics_state->
217 irq_request_opaque, 0);
218 s->init_state = 1;
219 s->init4 = val & 1;
220 if (val & 0x02)
221 printk(KERN_ERR "single mode not supported");
222 if (val & 0x08)
223 printk(KERN_ERR
224 "level sensitive irq not supported");
225 } else if (val & 0x08) {
226 if (val & 0x04)
227 s->poll = 1;
228 if (val & 0x02)
229 s->read_reg_select = val & 1;
230 if (val & 0x40)
231 s->special_mask = (val >> 5) & 1;
232 } else {
233 cmd = val >> 5;
234 switch (cmd) {
235 case 0:
236 case 4:
237 s->rotate_on_auto_eoi = cmd >> 2;
238 break;
239 case 1: /* end of interrupt */
240 case 5:
241 priority = get_priority(s, s->isr);
242 if (priority != 8) {
243 irq = (priority + s->priority_add) & 7;
244 s->isr &= ~(1 << irq);
245 if (cmd == 5)
246 s->priority_add = (irq + 1) & 7;
247 pic_update_irq(s->pics_state);
248 }
249 break;
250 case 3:
251 irq = val & 7;
252 s->isr &= ~(1 << irq);
253 pic_update_irq(s->pics_state);
254 break;
255 case 6:
256 s->priority_add = (val + 1) & 7;
257 pic_update_irq(s->pics_state);
258 break;
259 case 7:
260 irq = val & 7;
261 s->isr &= ~(1 << irq);
262 s->priority_add = (irq + 1) & 7;
263 pic_update_irq(s->pics_state);
264 break;
265 default:
266 break; /* no operation */
267 }
268 }
269 } else
270 switch (s->init_state) {
271 case 0: /* normal mode */
272 s->imr = val;
273 pic_update_irq(s->pics_state);
274 break;
275 case 1:
276 s->irq_base = val & 0xf8;
277 s->init_state = 2;
278 break;
279 case 2:
280 if (s->init4)
281 s->init_state = 3;
282 else
283 s->init_state = 0;
284 break;
285 case 3:
286 s->special_fully_nested_mode = (val >> 4) & 1;
287 s->auto_eoi = (val >> 1) & 1;
288 s->init_state = 0;
289 break;
290 }
291}
292
293static u32 pic_poll_read(struct kvm_kpic_state *s, u32 addr1)
294{
295 int ret;
296
297 ret = pic_get_irq(s);
298 if (ret >= 0) {
299 if (addr1 >> 7) {
300 s->pics_state->pics[0].isr &= ~(1 << 2);
301 s->pics_state->pics[0].irr &= ~(1 << 2);
302 }
303 s->irr &= ~(1 << ret);
304 s->isr &= ~(1 << ret);
305 if (addr1 >> 7 || ret != 2)
306 pic_update_irq(s->pics_state);
307 } else {
308 ret = 0x07;
309 pic_update_irq(s->pics_state);
310 }
311
312 return ret;
313}
314
315static u32 pic_ioport_read(void *opaque, u32 addr1)
316{
317 struct kvm_kpic_state *s = opaque;
318 unsigned int addr;
319 int ret;
320
321 addr = addr1;
322 addr &= 1;
323 if (s->poll) {
324 ret = pic_poll_read(s, addr1);
325 s->poll = 0;
326 } else
327 if (addr == 0)
328 if (s->read_reg_select)
329 ret = s->isr;
330 else
331 ret = s->irr;
332 else
333 ret = s->imr;
334 return ret;
335}
336
337static void elcr_ioport_write(void *opaque, u32 addr, u32 val)
338{
339 struct kvm_kpic_state *s = opaque;
340 s->elcr = val & s->elcr_mask;
341}
342
343static u32 elcr_ioport_read(void *opaque, u32 addr1)
344{
345 struct kvm_kpic_state *s = opaque;
346 return s->elcr;
347}
348
349static int picdev_in_range(struct kvm_io_device *this, gpa_t addr)
350{
351 switch (addr) {
352 case 0x20:
353 case 0x21:
354 case 0xa0:
355 case 0xa1:
356 case 0x4d0:
357 case 0x4d1:
358 return 1;
359 default:
360 return 0;
361 }
362}
363
364static void picdev_write(struct kvm_io_device *this,
365 gpa_t addr, int len, const void *val)
366{
367 struct kvm_pic *s = this->private;
368 unsigned char data = *(unsigned char *)val;
369
370 if (len != 1) {
371 if (printk_ratelimit())
372 printk(KERN_ERR "PIC: non byte write\n");
373 return;
374 }
375 switch (addr) {
376 case 0x20:
377 case 0x21:
378 case 0xa0:
379 case 0xa1:
380 pic_ioport_write(&s->pics[addr >> 7], addr, data);
381 break;
382 case 0x4d0:
383 case 0x4d1:
384 elcr_ioport_write(&s->pics[addr & 1], addr, data);
385 break;
386 }
387}
388
389static void picdev_read(struct kvm_io_device *this,
390 gpa_t addr, int len, void *val)
391{
392 struct kvm_pic *s = this->private;
393 unsigned char data = 0;
394
395 if (len != 1) {
396 if (printk_ratelimit())
397 printk(KERN_ERR "PIC: non byte read\n");
398 return;
399 }
400 switch (addr) {
401 case 0x20:
402 case 0x21:
403 case 0xa0:
404 case 0xa1:
405 data = pic_ioport_read(&s->pics[addr >> 7], addr);
406 break;
407 case 0x4d0:
408 case 0x4d1:
409 data = elcr_ioport_read(&s->pics[addr & 1], addr);
410 break;
411 }
412 *(unsigned char *)val = data;
413}
414
415/*
416 * callback when PIC0 irq status changed
417 */
418static void pic_irq_request(void *opaque, int level)
419{
420 struct kvm *kvm = opaque;
421 struct kvm_vcpu *vcpu = kvm->vcpus[0];
422
423 pic_irqchip(kvm)->output = level;
424 if (vcpu)
425 kvm_vcpu_kick(vcpu);
426}
427
428struct kvm_pic *kvm_create_pic(struct kvm *kvm)
429{
430 struct kvm_pic *s;
431 s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL);
432 if (!s)
433 return NULL;
434 s->pics[0].elcr_mask = 0xf8;
435 s->pics[1].elcr_mask = 0xde;
436 s->irq_request = pic_irq_request;
437 s->irq_request_opaque = kvm;
438 s->pics[0].pics_state = s;
439 s->pics[1].pics_state = s;
440
441 /*
442 * Initialize PIO device
443 */
444 s->dev.read = picdev_read;
445 s->dev.write = picdev_write;
446 s->dev.in_range = picdev_in_range;
447 s->dev.private = s;
448 kvm_io_bus_register_dev(&kvm->pio_bus, &s->dev);
449 return s;
450}
diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c
new file mode 100644
index 000000000000..72f12f75495d
--- /dev/null
+++ b/arch/x86/kvm/ioapic.c
@@ -0,0 +1,400 @@
1/*
2 * Copyright (C) 2001 MandrakeSoft S.A.
3 *
4 * MandrakeSoft S.A.
5 * 43, rue d'Aboukir
6 * 75002 Paris - France
7 * http://www.linux-mandrake.com/
8 * http://www.mandrakesoft.com/
9 *
10 * This library is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This library is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
19 *
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with this library; if not, write to the Free Software
22 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
23 *
24 * Yunhong Jiang <yunhong.jiang@intel.com>
25 * Yaozu (Eddie) Dong <eddie.dong@intel.com>
26 * Based on Xen 3.1 code.
27 */
28
29#include <linux/kvm_host.h>
30#include <linux/kvm.h>
31#include <linux/mm.h>
32#include <linux/highmem.h>
33#include <linux/smp.h>
34#include <linux/hrtimer.h>
35#include <linux/io.h>
36#include <asm/processor.h>
37#include <asm/page.h>
38#include <asm/current.h>
39#include "irq.h"
40#if 0
41#define ioapic_debug(fmt,arg...) printk(KERN_WARNING fmt,##arg)
42#else
43#define ioapic_debug(fmt, arg...)
44#endif
45static void ioapic_deliver(struct kvm_ioapic *vioapic, int irq);
46
47static unsigned long ioapic_read_indirect(struct kvm_ioapic *ioapic,
48 unsigned long addr,
49 unsigned long length)
50{
51 unsigned long result = 0;
52
53 switch (ioapic->ioregsel) {
54 case IOAPIC_REG_VERSION:
55 result = ((((IOAPIC_NUM_PINS - 1) & 0xff) << 16)
56 | (IOAPIC_VERSION_ID & 0xff));
57 break;
58
59 case IOAPIC_REG_APIC_ID:
60 case IOAPIC_REG_ARB_ID:
61 result = ((ioapic->id & 0xf) << 24);
62 break;
63
64 default:
65 {
66 u32 redir_index = (ioapic->ioregsel - 0x10) >> 1;
67 u64 redir_content;
68
69 ASSERT(redir_index < IOAPIC_NUM_PINS);
70
71 redir_content = ioapic->redirtbl[redir_index].bits;
72 result = (ioapic->ioregsel & 0x1) ?
73 (redir_content >> 32) & 0xffffffff :
74 redir_content & 0xffffffff;
75 break;
76 }
77 }
78
79 return result;
80}
81
82static void ioapic_service(struct kvm_ioapic *ioapic, unsigned int idx)
83{
84 union ioapic_redir_entry *pent;
85
86 pent = &ioapic->redirtbl[idx];
87
88 if (!pent->fields.mask) {
89 ioapic_deliver(ioapic, idx);
90 if (pent->fields.trig_mode == IOAPIC_LEVEL_TRIG)
91 pent->fields.remote_irr = 1;
92 }
93 if (!pent->fields.trig_mode)
94 ioapic->irr &= ~(1 << idx);
95}
96
97static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val)
98{
99 unsigned index;
100
101 switch (ioapic->ioregsel) {
102 case IOAPIC_REG_VERSION:
103 /* Writes are ignored. */
104 break;
105
106 case IOAPIC_REG_APIC_ID:
107 ioapic->id = (val >> 24) & 0xf;
108 break;
109
110 case IOAPIC_REG_ARB_ID:
111 break;
112
113 default:
114 index = (ioapic->ioregsel - 0x10) >> 1;
115
116 ioapic_debug("change redir index %x val %x\n", index, val);
117 if (index >= IOAPIC_NUM_PINS)
118 return;
119 if (ioapic->ioregsel & 1) {
120 ioapic->redirtbl[index].bits &= 0xffffffff;
121 ioapic->redirtbl[index].bits |= (u64) val << 32;
122 } else {
123 ioapic->redirtbl[index].bits &= ~0xffffffffULL;
124 ioapic->redirtbl[index].bits |= (u32) val;
125 ioapic->redirtbl[index].fields.remote_irr = 0;
126 }
127 if (ioapic->irr & (1 << index))
128 ioapic_service(ioapic, index);
129 break;
130 }
131}
132
133static void ioapic_inj_irq(struct kvm_ioapic *ioapic,
134 struct kvm_vcpu *vcpu,
135 u8 vector, u8 trig_mode, u8 delivery_mode)
136{
137 ioapic_debug("irq %d trig %d deliv %d\n", vector, trig_mode,
138 delivery_mode);
139
140 ASSERT((delivery_mode == IOAPIC_FIXED) ||
141 (delivery_mode == IOAPIC_LOWEST_PRIORITY));
142
143 kvm_apic_set_irq(vcpu, vector, trig_mode);
144}
145
146static u32 ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest,
147 u8 dest_mode)
148{
149 u32 mask = 0;
150 int i;
151 struct kvm *kvm = ioapic->kvm;
152 struct kvm_vcpu *vcpu;
153
154 ioapic_debug("dest %d dest_mode %d\n", dest, dest_mode);
155
156 if (dest_mode == 0) { /* Physical mode. */
157 if (dest == 0xFF) { /* Broadcast. */
158 for (i = 0; i < KVM_MAX_VCPUS; ++i)
159 if (kvm->vcpus[i] && kvm->vcpus[i]->arch.apic)
160 mask |= 1 << i;
161 return mask;
162 }
163 for (i = 0; i < KVM_MAX_VCPUS; ++i) {
164 vcpu = kvm->vcpus[i];
165 if (!vcpu)
166 continue;
167 if (kvm_apic_match_physical_addr(vcpu->arch.apic, dest)) {
168 if (vcpu->arch.apic)
169 mask = 1 << i;
170 break;
171 }
172 }
173 } else if (dest != 0) /* Logical mode, MDA non-zero. */
174 for (i = 0; i < KVM_MAX_VCPUS; ++i) {
175 vcpu = kvm->vcpus[i];
176 if (!vcpu)
177 continue;
178 if (vcpu->arch.apic &&
179 kvm_apic_match_logical_addr(vcpu->arch.apic, dest))
180 mask |= 1 << vcpu->vcpu_id;
181 }
182 ioapic_debug("mask %x\n", mask);
183 return mask;
184}
185
186static void ioapic_deliver(struct kvm_ioapic *ioapic, int irq)
187{
188 u8 dest = ioapic->redirtbl[irq].fields.dest_id;
189 u8 dest_mode = ioapic->redirtbl[irq].fields.dest_mode;
190 u8 delivery_mode = ioapic->redirtbl[irq].fields.delivery_mode;
191 u8 vector = ioapic->redirtbl[irq].fields.vector;
192 u8 trig_mode = ioapic->redirtbl[irq].fields.trig_mode;
193 u32 deliver_bitmask;
194 struct kvm_vcpu *vcpu;
195 int vcpu_id;
196
197 ioapic_debug("dest=%x dest_mode=%x delivery_mode=%x "
198 "vector=%x trig_mode=%x\n",
199 dest, dest_mode, delivery_mode, vector, trig_mode);
200
201 deliver_bitmask = ioapic_get_delivery_bitmask(ioapic, dest, dest_mode);
202 if (!deliver_bitmask) {
203 ioapic_debug("no target on destination\n");
204 return;
205 }
206
207 switch (delivery_mode) {
208 case IOAPIC_LOWEST_PRIORITY:
209 vcpu = kvm_get_lowest_prio_vcpu(ioapic->kvm, vector,
210 deliver_bitmask);
211 if (vcpu != NULL)
212 ioapic_inj_irq(ioapic, vcpu, vector,
213 trig_mode, delivery_mode);
214 else
215 ioapic_debug("null lowest prio vcpu: "
216 "mask=%x vector=%x delivery_mode=%x\n",
217 deliver_bitmask, vector, IOAPIC_LOWEST_PRIORITY);
218 break;
219 case IOAPIC_FIXED:
220 for (vcpu_id = 0; deliver_bitmask != 0; vcpu_id++) {
221 if (!(deliver_bitmask & (1 << vcpu_id)))
222 continue;
223 deliver_bitmask &= ~(1 << vcpu_id);
224 vcpu = ioapic->kvm->vcpus[vcpu_id];
225 if (vcpu) {
226 ioapic_inj_irq(ioapic, vcpu, vector,
227 trig_mode, delivery_mode);
228 }
229 }
230 break;
231
232 /* TODO: NMI */
233 default:
234 printk(KERN_WARNING "Unsupported delivery mode %d\n",
235 delivery_mode);
236 break;
237 }
238}
239
240void kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level)
241{
242 u32 old_irr = ioapic->irr;
243 u32 mask = 1 << irq;
244 union ioapic_redir_entry entry;
245
246 if (irq >= 0 && irq < IOAPIC_NUM_PINS) {
247 entry = ioapic->redirtbl[irq];
248 level ^= entry.fields.polarity;
249 if (!level)
250 ioapic->irr &= ~mask;
251 else {
252 ioapic->irr |= mask;
253 if ((!entry.fields.trig_mode && old_irr != ioapic->irr)
254 || !entry.fields.remote_irr)
255 ioapic_service(ioapic, irq);
256 }
257 }
258}
259
260static int get_eoi_gsi(struct kvm_ioapic *ioapic, int vector)
261{
262 int i;
263
264 for (i = 0; i < IOAPIC_NUM_PINS; i++)
265 if (ioapic->redirtbl[i].fields.vector == vector)
266 return i;
267 return -1;
268}
269
270void kvm_ioapic_update_eoi(struct kvm *kvm, int vector)
271{
272 struct kvm_ioapic *ioapic = kvm->arch.vioapic;
273 union ioapic_redir_entry *ent;
274 int gsi;
275
276 gsi = get_eoi_gsi(ioapic, vector);
277 if (gsi == -1) {
278 printk(KERN_WARNING "Can't find redir item for %d EOI\n",
279 vector);
280 return;
281 }
282
283 ent = &ioapic->redirtbl[gsi];
284 ASSERT(ent->fields.trig_mode == IOAPIC_LEVEL_TRIG);
285
286 ent->fields.remote_irr = 0;
287 if (!ent->fields.mask && (ioapic->irr & (1 << gsi)))
288 ioapic_deliver(ioapic, gsi);
289}
290
291static int ioapic_in_range(struct kvm_io_device *this, gpa_t addr)
292{
293 struct kvm_ioapic *ioapic = (struct kvm_ioapic *)this->private;
294
295 return ((addr >= ioapic->base_address &&
296 (addr < ioapic->base_address + IOAPIC_MEM_LENGTH)));
297}
298
299static void ioapic_mmio_read(struct kvm_io_device *this, gpa_t addr, int len,
300 void *val)
301{
302 struct kvm_ioapic *ioapic = (struct kvm_ioapic *)this->private;
303 u32 result;
304
305 ioapic_debug("addr %lx\n", (unsigned long)addr);
306 ASSERT(!(addr & 0xf)); /* check alignment */
307
308 addr &= 0xff;
309 switch (addr) {
310 case IOAPIC_REG_SELECT:
311 result = ioapic->ioregsel;
312 break;
313
314 case IOAPIC_REG_WINDOW:
315 result = ioapic_read_indirect(ioapic, addr, len);
316 break;
317
318 default:
319 result = 0;
320 break;
321 }
322 switch (len) {
323 case 8:
324 *(u64 *) val = result;
325 break;
326 case 1:
327 case 2:
328 case 4:
329 memcpy(val, (char *)&result, len);
330 break;
331 default:
332 printk(KERN_WARNING "ioapic: wrong length %d\n", len);
333 }
334}
335
336static void ioapic_mmio_write(struct kvm_io_device *this, gpa_t addr, int len,
337 const void *val)
338{
339 struct kvm_ioapic *ioapic = (struct kvm_ioapic *)this->private;
340 u32 data;
341
342 ioapic_debug("ioapic_mmio_write addr=%p len=%d val=%p\n",
343 (void*)addr, len, val);
344 ASSERT(!(addr & 0xf)); /* check alignment */
345 if (len == 4 || len == 8)
346 data = *(u32 *) val;
347 else {
348 printk(KERN_WARNING "ioapic: Unsupported size %d\n", len);
349 return;
350 }
351
352 addr &= 0xff;
353 switch (addr) {
354 case IOAPIC_REG_SELECT:
355 ioapic->ioregsel = data;
356 break;
357
358 case IOAPIC_REG_WINDOW:
359 ioapic_write_indirect(ioapic, data);
360 break;
361#ifdef CONFIG_IA64
362 case IOAPIC_REG_EOI:
363 kvm_ioapic_update_eoi(ioapic, data);
364 break;
365#endif
366
367 default:
368 break;
369 }
370}
371
372void kvm_ioapic_reset(struct kvm_ioapic *ioapic)
373{
374 int i;
375
376 for (i = 0; i < IOAPIC_NUM_PINS; i++)
377 ioapic->redirtbl[i].fields.mask = 1;
378 ioapic->base_address = IOAPIC_DEFAULT_BASE_ADDRESS;
379 ioapic->ioregsel = 0;
380 ioapic->irr = 0;
381 ioapic->id = 0;
382}
383
384int kvm_ioapic_init(struct kvm *kvm)
385{
386 struct kvm_ioapic *ioapic;
387
388 ioapic = kzalloc(sizeof(struct kvm_ioapic), GFP_KERNEL);
389 if (!ioapic)
390 return -ENOMEM;
391 kvm->arch.vioapic = ioapic;
392 kvm_ioapic_reset(ioapic);
393 ioapic->dev.read = ioapic_mmio_read;
394 ioapic->dev.write = ioapic_mmio_write;
395 ioapic->dev.in_range = ioapic_in_range;
396 ioapic->dev.private = ioapic;
397 ioapic->kvm = kvm;
398 kvm_io_bus_register_dev(&kvm->mmio_bus, &ioapic->dev);
399 return 0;
400}
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
new file mode 100644
index 000000000000..07a09aad4fd6
--- /dev/null
+++ b/arch/x86/kvm/irq.c
@@ -0,0 +1,98 @@
1/*
2 * irq.c: API for in kernel interrupt controller
3 * Copyright (c) 2007, Intel Corporation.
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms and conditions of the GNU General Public License,
7 * version 2, as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 * more details.
13 *
14 * You should have received a copy of the GNU General Public License along with
15 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
16 * Place - Suite 330, Boston, MA 02111-1307 USA.
17 * Authors:
18 * Yaozu (Eddie) Dong <Eddie.dong@intel.com>
19 *
20 */
21
22#include <linux/module.h>
23#include <linux/kvm_host.h>
24
25#include "irq.h"
26
27/*
28 * check if there is pending interrupt without
29 * intack.
30 */
31int kvm_cpu_has_interrupt(struct kvm_vcpu *v)
32{
33 struct kvm_pic *s;
34
35 if (kvm_apic_has_interrupt(v) == -1) { /* LAPIC */
36 if (kvm_apic_accept_pic_intr(v)) {
37 s = pic_irqchip(v->kvm); /* PIC */
38 return s->output;
39 } else
40 return 0;
41 }
42 return 1;
43}
44EXPORT_SYMBOL_GPL(kvm_cpu_has_interrupt);
45
46/*
47 * Read pending interrupt vector and intack.
48 */
49int kvm_cpu_get_interrupt(struct kvm_vcpu *v)
50{
51 struct kvm_pic *s;
52 int vector;
53
54 vector = kvm_get_apic_interrupt(v); /* APIC */
55 if (vector == -1) {
56 if (kvm_apic_accept_pic_intr(v)) {
57 s = pic_irqchip(v->kvm);
58 s->output = 0; /* PIC */
59 vector = kvm_pic_read_irq(s);
60 }
61 }
62 return vector;
63}
64EXPORT_SYMBOL_GPL(kvm_cpu_get_interrupt);
65
/*
 * Cross-CPU callback used by kvm_vcpu_kick().  It does nothing beyond an
 * optional debug message.
 */
static void vcpu_kick_intr(void *info)
{
#ifdef DEBUG
	struct kvm_vcpu *vcpu = (struct kvm_vcpu *)info;
	printk(KERN_DEBUG "vcpu_kick_intr %p \n", vcpu);
#endif
}
73
74void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
75{
76 int ipi_pcpu = vcpu->cpu;
77
78 if (waitqueue_active(&vcpu->wq)) {
79 wake_up_interruptible(&vcpu->wq);
80 ++vcpu->stat.halt_wakeup;
81 }
82 if (vcpu->guest_mode)
83 smp_call_function_single(ipi_pcpu, vcpu_kick_intr, vcpu, 0, 0);
84}
85
/* Inject pending timer interrupts; only the APIC timer is handled so far. */
void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu)
{
	kvm_inject_apic_timer_irqs(vcpu);
	/* TODO: PIT, RTC etc. */
}
91EXPORT_SYMBOL_GPL(kvm_inject_pending_timer_irqs);
92
/* Post-injection hook for vector 'vec'; only the APIC timer is handled so far. */
void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec)
{
	kvm_apic_timer_intr_post(vcpu, vec);
	/* TODO: PIT, RTC etc. */
}
98EXPORT_SYMBOL_GPL(kvm_timer_intr_post);
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
new file mode 100644
index 000000000000..6316638eec9f
--- /dev/null
+++ b/arch/x86/kvm/irq.h
@@ -0,0 +1,195 @@
1/*
2 * irq.h: in kernel interrupt controller related definitions
3 * Copyright (c) 2007, Intel Corporation.
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms and conditions of the GNU General Public License,
7 * version 2, as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 * more details.
13 *
14 * You should have received a copy of the GNU General Public License along with
15 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
16 * Place - Suite 330, Boston, MA 02111-1307 USA.
17 * Authors:
18 * Yaozu (Eddie) Dong <Eddie.dong@intel.com>
19 *
20 */
21
22#ifndef __IRQ_H
23#define __IRQ_H
24
25#include <linux/mm_types.h>
26#include <linux/hrtimer.h>
27#include <linux/kvm_host.h>
28#include "iodev.h"
29
30struct kvm;
31struct kvm_vcpu;
32
33typedef void irq_request_func(void *opaque, int level);
34
/*
 * Register state of one emulated PIC chip.  struct kvm_pic holds two of
 * these, one for the master and one for the slave.
 */
35struct kvm_kpic_state {
36 u8 last_irr; /* edge detection */
37 u8 irr; /* interrupt request register */
38 u8 imr; /* interrupt mask register */
39 u8 isr; /* interrupt service register */
40 u8 priority_add; /* highest irq priority */
41 u8 irq_base;
42 u8 read_reg_select;
43 u8 poll;
44 u8 special_mask;
45 u8 init_state;
46 u8 auto_eoi;
47 u8 rotate_on_auto_eoi;
48 u8 special_fully_nested_mode;
49 u8 init4; /* true if 4 byte init */
50 u8 elcr; /* PIIX edge/trigger selection */
51 u8 elcr_mask;
/* presumably points back at the kvm_pic that contains this chip -- TODO confirm */
52 struct kvm_pic *pics_state;
53};
54
/*
 * The emulated PIC pair plus its output line and I/O device hook.
 * irq_request/irq_request_opaque form a callback of type irq_request_func
 * (void *opaque, int level).
 */
55struct kvm_pic {
56 struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */
57 irq_request_func *irq_request;
58 void *irq_request_opaque;
59 int output; /* intr from master PIC */
60 struct kvm_io_device dev;
61};
62
63struct kvm_pic *kvm_create_pic(struct kvm *kvm);
64void kvm_pic_set_irq(void *opaque, int irq, int level);
65int kvm_pic_read_irq(struct kvm_pic *s);
66void kvm_pic_update_irq(struct kvm_pic *s);
67
68#define IOAPIC_NUM_PINS KVM_IOAPIC_NUM_PINS
69#define IOAPIC_VERSION_ID 0x11 /* IOAPIC version */
70#define IOAPIC_EDGE_TRIG 0
71#define IOAPIC_LEVEL_TRIG 1
72
73#define IOAPIC_DEFAULT_BASE_ADDRESS 0xfec00000
74#define IOAPIC_MEM_LENGTH 0x100
75
76/* Direct registers. */
77#define IOAPIC_REG_SELECT 0x00
78#define IOAPIC_REG_WINDOW 0x10
79#define IOAPIC_REG_EOI 0x40 /* IA64 IOSAPIC only */
80
81/* Indirect registers. */
82#define IOAPIC_REG_APIC_ID 0x00 /* x86 IOAPIC only */
83#define IOAPIC_REG_VERSION 0x01
84#define IOAPIC_REG_ARB_ID 0x02 /* x86 IOAPIC only */
85
86/*ioapic delivery mode*/
87#define IOAPIC_FIXED 0x0
88#define IOAPIC_LOWEST_PRIORITY 0x1
89#define IOAPIC_PMI 0x2
90#define IOAPIC_NMI 0x4
91#define IOAPIC_INIT 0x5
92#define IOAPIC_EXTINT 0x7
93
/*
 * State of the emulated IOAPIC.
 *
 * redirtbl holds one redirection entry per pin (IOAPIC_NUM_PINS entries).
 * Each entry can be accessed either as a raw 64-bit value (bits) or through
 * the overlaid bit-field view (fields).
 */
94struct kvm_ioapic {
95 u64 base_address;
96 u32 ioregsel;
97 u32 id;
98 u32 irr;
99 u32 pad;
100 union ioapic_redir_entry {
101 u64 bits;
102 struct {
103 u8 vector;
104 u8 delivery_mode:3;
105 u8 dest_mode:1;
106 u8 delivery_status:1;
107 u8 polarity:1;
108 u8 remote_irr:1;
109 u8 trig_mode:1;
110 u8 mask:1;
111 u8 reserve:7;
112 u8 reserved[4];
113 u8 dest_id;
114 } fields;
115 } redirtbl[IOAPIC_NUM_PINS];
116 struct kvm_io_device dev;
117 struct kvm *kvm;
118};
119
/*
 * Per-vcpu local APIC state.
 *
 * base_address is the MMIO base of the register window.  regs is the
 * kernel address of regs_page, the page that backs the APIC registers.
 * timer.dev is the hrtimer that emulates the APIC timer; timer.pending
 * counts expirations that have not been injected yet.
 */
120struct kvm_lapic {
121 unsigned long base_address;
122 struct kvm_io_device dev;
123 struct {
124 atomic_t pending;
125 s64 period; /* unit: ns */
126 u32 divide_count;
127 ktime_t last_update;
128 struct hrtimer dev;
129 } timer;
130 struct kvm_vcpu *vcpu;
131 struct page *regs_page;
132 void *regs;
133};
134
135#ifdef DEBUG
136#define ASSERT(x) \
137do { \
138 if (!(x)) { \
139 printk(KERN_EMERG "assertion failed %s: %d: %s\n", \
140 __FILE__, __LINE__, #x); \
141 BUG(); \
142 } \
143} while (0)
144#else
145#define ASSERT(x) do { } while (0)
146#endif
147
148static inline struct kvm_pic *pic_irqchip(struct kvm *kvm)
149{
150 return kvm->arch.vpic;
151}
152
153static inline struct kvm_ioapic *ioapic_irqchip(struct kvm *kvm)
154{
155 return kvm->arch.vioapic;
156}
157
158static inline int irqchip_in_kernel(struct kvm *kvm)
159{
160 return pic_irqchip(kvm) != NULL;
161}
162
163void kvm_vcpu_kick(struct kvm_vcpu *vcpu);
164int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu);
165int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu);
166int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu);
167int kvm_create_lapic(struct kvm_vcpu *vcpu);
168void kvm_lapic_reset(struct kvm_vcpu *vcpu);
169void kvm_pic_reset(struct kvm_kpic_state *s);
170void kvm_ioapic_reset(struct kvm_ioapic *ioapic);
171void kvm_free_lapic(struct kvm_vcpu *vcpu);
172u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu);
173void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8);
174void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value);
175
176struct kvm_vcpu *kvm_get_lowest_prio_vcpu(struct kvm *kvm, u8 vector,
177 unsigned long bitmap);
178u64 kvm_get_apic_base(struct kvm_vcpu *vcpu);
179void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data);
180int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest);
181void kvm_ioapic_update_eoi(struct kvm *kvm, int vector);
182int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda);
183int kvm_apic_set_irq(struct kvm_vcpu *vcpu, u8 vec, u8 trig);
184void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu);
185int kvm_ioapic_init(struct kvm *kvm);
186void kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level);
187int kvm_lapic_enabled(struct kvm_vcpu *vcpu);
188int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu);
189void kvm_apic_timer_intr_post(struct kvm_vcpu *vcpu, int vec);
190void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec);
191void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu);
192void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu);
193void kvm_migrate_apic_timer(struct kvm_vcpu *vcpu);
194
195#endif
diff --git a/arch/x86/kvm/kvm_svm.h b/arch/x86/kvm/kvm_svm.h
new file mode 100644
index 000000000000..ecdfe97e4635
--- /dev/null
+++ b/arch/x86/kvm/kvm_svm.h
@@ -0,0 +1,45 @@
1#ifndef __KVM_SVM_H
2#define __KVM_SVM_H
3
4#include <linux/kernel.h>
5#include <linux/types.h>
6#include <linux/list.h>
7#include <linux/kvm_host.h>
8#include <asm/msr.h>
9
10#include "svm.h"
11
/*
 * List of MSR indices; its length (NR_HOST_SAVE_USER_MSRS) sizes
 * vcpu_svm.host_user_msrs.  The first group is only present on 64-bit
 * builds.
 * NOTE(review): presumably these are the host MSRs saved/restored around
 * guest execution -- confirm against svm.c.
 */
12static const u32 host_save_user_msrs[] = {
13#ifdef CONFIG_X86_64
14 MSR_STAR, MSR_LSTAR, MSR_CSTAR, MSR_SYSCALL_MASK, MSR_KERNEL_GS_BASE,
15 MSR_FS_BASE,
16#endif
17 MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
18};
19
20#define NR_HOST_SAVE_USER_MSRS ARRAY_SIZE(host_save_user_msrs)
21#define NUM_DB_REGS 4
22
23struct kvm_vcpu;
24
/*
 * SVM-specific vcpu state.  The generic struct kvm_vcpu is embedded as the
 * first member.
 * NOTE(review): the host_* members presumably hold host register values
 * saved while the guest runs -- confirm against svm.c.
 */
25struct vcpu_svm {
26 struct kvm_vcpu vcpu;
27 struct vmcb *vmcb;
28 unsigned long vmcb_pa;
29 struct svm_cpu_data *svm_data;
30 uint64_t asid_generation;
31
32 unsigned long db_regs[NUM_DB_REGS];
33
34 u64 next_rip;
35
/* one slot per entry of host_save_user_msrs[] */
36 u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS];
37 u64 host_gs_base;
38 unsigned long host_cr2;
39 unsigned long host_db_regs[NUM_DB_REGS];
40 unsigned long host_dr6;
41 unsigned long host_dr7;
42};
43
44#endif
45
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
new file mode 100644
index 000000000000..4076331b01ee
--- /dev/null
+++ b/arch/x86/kvm/lapic.c
@@ -0,0 +1,1085 @@
1
2/*
3 * Local APIC virtualization
4 *
5 * Copyright (C) 2006 Qumranet, Inc.
6 * Copyright (C) 2007 Novell
7 * Copyright (C) 2007 Intel
8 *
9 * Authors:
10 * Dor Laor <dor.laor@qumranet.com>
11 * Gregory Haskins <ghaskins@novell.com>
12 * Yaozu (Eddie) Dong <eddie.dong@intel.com>
13 *
14 * Based on Xen 3.1 code, Copyright (c) 2004, Intel Corporation.
15 *
16 * This work is licensed under the terms of the GNU GPL, version 2. See
17 * the COPYING file in the top-level directory.
18 */
19
20#include <linux/kvm_host.h>
21#include <linux/kvm.h>
22#include <linux/mm.h>
23#include <linux/highmem.h>
24#include <linux/smp.h>
25#include <linux/hrtimer.h>
26#include <linux/io.h>
27#include <linux/module.h>
28#include <asm/processor.h>
29#include <asm/msr.h>
30#include <asm/page.h>
31#include <asm/current.h>
32#include <asm/apicdef.h>
33#include <asm/atomic.h>
34#include <asm/div64.h>
35#include "irq.h"
36
37#define PRId64 "d"
38#define PRIx64 "llx"
39#define PRIu64 "u"
40#define PRIo64 "o"
41
42#define APIC_BUS_CYCLE_NS 1
43
44/* #define apic_debug(fmt,arg...) printk(KERN_WARNING fmt,##arg) */
45#define apic_debug(fmt, arg...)
46
47#define APIC_LVT_NUM 6
48/* 14 is the version for Xeon and Pentium 8.4.8*/
49#define APIC_VERSION (0x14UL | ((APIC_LVT_NUM - 1) << 16))
50#define LAPIC_MMIO_LENGTH (1 << 12)
51/* followed define is not in apicdef.h */
52#define APIC_SHORT_MASK 0xc0000
53#define APIC_DEST_NOSHORT 0x0
54#define APIC_DEST_MASK 0x800
55#define MAX_APIC_VECTOR 256
56
57#define VEC_POS(v) ((v) & (32 - 1))
58#define REG_POS(v) (((v) >> 5) << 4)
59
60static inline u32 apic_get_reg(struct kvm_lapic *apic, int reg_off)
61{
62 return *((u32 *) (apic->regs + reg_off));
63}
64
65static inline void apic_set_reg(struct kvm_lapic *apic, int reg_off, u32 val)
66{
67 *((u32 *) (apic->regs + reg_off)) = val;
68}
69
/*
 * Bit helpers for the 256-bit vector registers (IRR/ISR/TMR).
 *
 * @bitmap is the address of the first 32-bit word of the register.
 * REG_POS(vec) is the byte offset of the word that holds @vec (words are
 * spaced 16 bytes apart) and VEC_POS(vec) is the bit inside that word.
 */
static inline int apic_test_and_set_vector(int vec, void *bitmap)
{
	void *word = bitmap + REG_POS(vec);

	return test_and_set_bit(VEC_POS(vec), word);
}

static inline int apic_test_and_clear_vector(int vec, void *bitmap)
{
	void *word = bitmap + REG_POS(vec);

	return test_and_clear_bit(VEC_POS(vec), word);
}

static inline void apic_set_vector(int vec, void *bitmap)
{
	void *word = bitmap + REG_POS(vec);

	set_bit(VEC_POS(vec), word);
}

static inline void apic_clear_vector(int vec, void *bitmap)
{
	void *word = bitmap + REG_POS(vec);

	clear_bit(VEC_POS(vec), word);
}
89
90static inline int apic_hw_enabled(struct kvm_lapic *apic)
91{
92 return (apic)->vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE;
93}
94
95static inline int apic_sw_enabled(struct kvm_lapic *apic)
96{
97 return apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_APIC_ENABLED;
98}
99
100static inline int apic_enabled(struct kvm_lapic *apic)
101{
102 return apic_sw_enabled(apic) && apic_hw_enabled(apic);
103}
104
105#define LVT_MASK \
106 (APIC_LVT_MASKED | APIC_SEND_PENDING | APIC_VECTOR_MASK)
107
108#define LINT_MASK \
109 (LVT_MASK | APIC_MODE_MASK | APIC_INPUT_POLARITY | \
110 APIC_LVT_REMOTE_IRR | APIC_LVT_LEVEL_TRIGGER)
111
112static inline int kvm_apic_id(struct kvm_lapic *apic)
113{
114 return (apic_get_reg(apic, APIC_ID) >> 24) & 0xff;
115}
116
117static inline int apic_lvt_enabled(struct kvm_lapic *apic, int lvt_type)
118{
119 return !(apic_get_reg(apic, lvt_type) & APIC_LVT_MASKED);
120}
121
122static inline int apic_lvt_vector(struct kvm_lapic *apic, int lvt_type)
123{
124 return apic_get_reg(apic, lvt_type) & APIC_VECTOR_MASK;
125}
126
127static inline int apic_lvtt_period(struct kvm_lapic *apic)
128{
129 return apic_get_reg(apic, APIC_LVTT) & APIC_LVT_TIMER_PERIODIC;
130}
131
132static unsigned int apic_lvt_mask[APIC_LVT_NUM] = {
133 LVT_MASK | APIC_LVT_TIMER_PERIODIC, /* LVTT */
134 LVT_MASK | APIC_MODE_MASK, /* LVTTHMR */
135 LVT_MASK | APIC_MODE_MASK, /* LVTPC */
136 LINT_MASK, LINT_MASK, /* LVT0-1 */
137 LVT_MASK /* LVTERR */
138};
139
140static int find_highest_vector(void *bitmap)
141{
142 u32 *word = bitmap;
143 int word_offset = MAX_APIC_VECTOR >> 5;
144
145 while ((word_offset != 0) && (word[(--word_offset) << 2] == 0))
146 continue;
147
148 if (likely(!word_offset && !word[0]))
149 return -1;
150 else
151 return fls(word[word_offset << 2]) - 1 + (word_offset << 5);
152}
153
154static inline int apic_test_and_set_irr(int vec, struct kvm_lapic *apic)
155{
156 return apic_test_and_set_vector(vec, apic->regs + APIC_IRR);
157}
158
159static inline void apic_clear_irr(int vec, struct kvm_lapic *apic)
160{
161 apic_clear_vector(vec, apic->regs + APIC_IRR);
162}
163
164static inline int apic_find_highest_irr(struct kvm_lapic *apic)
165{
166 int result;
167
168 result = find_highest_vector(apic->regs + APIC_IRR);
169 ASSERT(result == -1 || result >= 16);
170
171 return result;
172}
173
174int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu)
175{
176 struct kvm_lapic *apic = vcpu->arch.apic;
177 int highest_irr;
178
179 if (!apic)
180 return 0;
181 highest_irr = apic_find_highest_irr(apic);
182
183 return highest_irr;
184}
185EXPORT_SYMBOL_GPL(kvm_lapic_find_highest_irr);
186
187int kvm_apic_set_irq(struct kvm_vcpu *vcpu, u8 vec, u8 trig)
188{
189 struct kvm_lapic *apic = vcpu->arch.apic;
190
191 if (!apic_test_and_set_irr(vec, apic)) {
192 /* a new pending irq is set in IRR */
193 if (trig)
194 apic_set_vector(vec, apic->regs + APIC_TMR);
195 else
196 apic_clear_vector(vec, apic->regs + APIC_TMR);
197 kvm_vcpu_kick(apic->vcpu);
198 return 1;
199 }
200 return 0;
201}
202
203static inline int apic_find_highest_isr(struct kvm_lapic *apic)
204{
205 int result;
206
207 result = find_highest_vector(apic->regs + APIC_ISR);
208 ASSERT(result == -1 || result >= 16);
209
210 return result;
211}
212
213static void apic_update_ppr(struct kvm_lapic *apic)
214{
215 u32 tpr, isrv, ppr;
216 int isr;
217
218 tpr = apic_get_reg(apic, APIC_TASKPRI);
219 isr = apic_find_highest_isr(apic);
220 isrv = (isr != -1) ? isr : 0;
221
222 if ((tpr & 0xf0) >= (isrv & 0xf0))
223 ppr = tpr & 0xff;
224 else
225 ppr = isrv & 0xf0;
226
227 apic_debug("vlapic %p, ppr 0x%x, isr 0x%x, isrv 0x%x",
228 apic, ppr, isr, isrv);
229
230 apic_set_reg(apic, APIC_PROCPRI, ppr);
231}
232
233static void apic_set_tpr(struct kvm_lapic *apic, u32 tpr)
234{
235 apic_set_reg(apic, APIC_TASKPRI, tpr);
236 apic_update_ppr(apic);
237}
238
239int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest)
240{
241 return kvm_apic_id(apic) == dest;
242}
243
244int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda)
245{
246 int result = 0;
247 u8 logical_id;
248
249 logical_id = GET_APIC_LOGICAL_ID(apic_get_reg(apic, APIC_LDR));
250
251 switch (apic_get_reg(apic, APIC_DFR)) {
252 case APIC_DFR_FLAT:
253 if (logical_id & mda)
254 result = 1;
255 break;
256 case APIC_DFR_CLUSTER:
257 if (((logical_id >> 4) == (mda >> 0x4))
258 && (logical_id & mda & 0xf))
259 result = 1;
260 break;
261 default:
262 printk(KERN_WARNING "Bad DFR vcpu %d: %08x\n",
263 apic->vcpu->vcpu_id, apic_get_reg(apic, APIC_DFR));
264 break;
265 }
266
267 return result;
268}
269
270static int apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
271 int short_hand, int dest, int dest_mode)
272{
273 int result = 0;
274 struct kvm_lapic *target = vcpu->arch.apic;
275
276 apic_debug("target %p, source %p, dest 0x%x, "
277 "dest_mode 0x%x, short_hand 0x%x",
278 target, source, dest, dest_mode, short_hand);
279
280 ASSERT(!target);
281 switch (short_hand) {
282 case APIC_DEST_NOSHORT:
283 if (dest_mode == 0) {
284 /* Physical mode. */
285 if ((dest == 0xFF) || (dest == kvm_apic_id(target)))
286 result = 1;
287 } else
288 /* Logical mode. */
289 result = kvm_apic_match_logical_addr(target, dest);
290 break;
291 case APIC_DEST_SELF:
292 if (target == source)
293 result = 1;
294 break;
295 case APIC_DEST_ALLINC:
296 result = 1;
297 break;
298 case APIC_DEST_ALLBUT:
299 if (target != source)
300 result = 1;
301 break;
302 default:
303 printk(KERN_WARNING "Bad dest shorthand value %x\n",
304 short_hand);
305 break;
306 }
307
308 return result;
309}
310
311/*
312 * Add a pending IRQ into lapic.
313 * Return 1 if successfully added and 0 if discarded.
314 */
315static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
316 int vector, int level, int trig_mode)
317{
318 int orig_irr, result = 0;
319 struct kvm_vcpu *vcpu = apic->vcpu;
320
321 switch (delivery_mode) {
322 case APIC_DM_FIXED:
323 case APIC_DM_LOWEST:
324 /* FIXME add logic for vcpu on reset */
325 if (unlikely(!apic_enabled(apic)))
326 break;
327
328 orig_irr = apic_test_and_set_irr(vector, apic);
329 if (orig_irr && trig_mode) {
330 apic_debug("level trig mode repeatedly for vector %d",
331 vector);
332 break;
333 }
334
335 if (trig_mode) {
336 apic_debug("level trig mode for vector %d", vector);
337 apic_set_vector(vector, apic->regs + APIC_TMR);
338 } else
339 apic_clear_vector(vector, apic->regs + APIC_TMR);
340
341 if (vcpu->arch.mp_state == VCPU_MP_STATE_RUNNABLE)
342 kvm_vcpu_kick(vcpu);
343 else if (vcpu->arch.mp_state == VCPU_MP_STATE_HALTED) {
344 vcpu->arch.mp_state = VCPU_MP_STATE_RUNNABLE;
345 if (waitqueue_active(&vcpu->wq))
346 wake_up_interruptible(&vcpu->wq);
347 }
348
349 result = (orig_irr == 0);
350 break;
351
352 case APIC_DM_REMRD:
353 printk(KERN_DEBUG "Ignoring delivery mode 3\n");
354 break;
355
356 case APIC_DM_SMI:
357 printk(KERN_DEBUG "Ignoring guest SMI\n");
358 break;
359 case APIC_DM_NMI:
360 printk(KERN_DEBUG "Ignoring guest NMI\n");
361 break;
362
363 case APIC_DM_INIT:
364 if (level) {
365 if (vcpu->arch.mp_state == VCPU_MP_STATE_RUNNABLE)
366 printk(KERN_DEBUG
367 "INIT on a runnable vcpu %d\n",
368 vcpu->vcpu_id);
369 vcpu->arch.mp_state = VCPU_MP_STATE_INIT_RECEIVED;
370 kvm_vcpu_kick(vcpu);
371 } else {
372 printk(KERN_DEBUG
373 "Ignoring de-assert INIT to vcpu %d\n",
374 vcpu->vcpu_id);
375 }
376
377 break;
378
379 case APIC_DM_STARTUP:
380 printk(KERN_DEBUG "SIPI to vcpu %d vector 0x%02x\n",
381 vcpu->vcpu_id, vector);
382 if (vcpu->arch.mp_state == VCPU_MP_STATE_INIT_RECEIVED) {
383 vcpu->arch.sipi_vector = vector;
384 vcpu->arch.mp_state = VCPU_MP_STATE_SIPI_RECEIVED;
385 if (waitqueue_active(&vcpu->wq))
386 wake_up_interruptible(&vcpu->wq);
387 }
388 break;
389
390 default:
391 printk(KERN_ERR "TODO: unsupported delivery mode %x\n",
392 delivery_mode);
393 break;
394 }
395 return result;
396}
397
398static struct kvm_lapic *kvm_apic_round_robin(struct kvm *kvm, u8 vector,
399 unsigned long bitmap)
400{
401 int last;
402 int next;
403 struct kvm_lapic *apic = NULL;
404
405 last = kvm->arch.round_robin_prev_vcpu;
406 next = last;
407
408 do {
409 if (++next == KVM_MAX_VCPUS)
410 next = 0;
411 if (kvm->vcpus[next] == NULL || !test_bit(next, &bitmap))
412 continue;
413 apic = kvm->vcpus[next]->arch.apic;
414 if (apic && apic_enabled(apic))
415 break;
416 apic = NULL;
417 } while (next != last);
418 kvm->arch.round_robin_prev_vcpu = next;
419
420 if (!apic)
421 printk(KERN_DEBUG "vcpu not ready for apic_round_robin\n");
422
423 return apic;
424}
425
426struct kvm_vcpu *kvm_get_lowest_prio_vcpu(struct kvm *kvm, u8 vector,
427 unsigned long bitmap)
428{
429 struct kvm_lapic *apic;
430
431 apic = kvm_apic_round_robin(kvm, vector, bitmap);
432 if (apic)
433 return apic->vcpu;
434 return NULL;
435}
436
437static void apic_set_eoi(struct kvm_lapic *apic)
438{
439 int vector = apic_find_highest_isr(apic);
440
441 /*
442 * Not every write EOI will has corresponding ISR,
443 * one example is when Kernel check timer on setup_IO_APIC
444 */
445 if (vector == -1)
446 return;
447
448 apic_clear_vector(vector, apic->regs + APIC_ISR);
449 apic_update_ppr(apic);
450
451 if (apic_test_and_clear_vector(vector, apic->regs + APIC_TMR))
452 kvm_ioapic_update_eoi(apic->vcpu->kvm, vector);
453}
454
455static void apic_send_ipi(struct kvm_lapic *apic)
456{
457 u32 icr_low = apic_get_reg(apic, APIC_ICR);
458 u32 icr_high = apic_get_reg(apic, APIC_ICR2);
459
460 unsigned int dest = GET_APIC_DEST_FIELD(icr_high);
461 unsigned int short_hand = icr_low & APIC_SHORT_MASK;
462 unsigned int trig_mode = icr_low & APIC_INT_LEVELTRIG;
463 unsigned int level = icr_low & APIC_INT_ASSERT;
464 unsigned int dest_mode = icr_low & APIC_DEST_MASK;
465 unsigned int delivery_mode = icr_low & APIC_MODE_MASK;
466 unsigned int vector = icr_low & APIC_VECTOR_MASK;
467
468 struct kvm_vcpu *target;
469 struct kvm_vcpu *vcpu;
470 unsigned long lpr_map = 0;
471 int i;
472
473 apic_debug("icr_high 0x%x, icr_low 0x%x, "
474 "short_hand 0x%x, dest 0x%x, trig_mode 0x%x, level 0x%x, "
475 "dest_mode 0x%x, delivery_mode 0x%x, vector 0x%x\n",
476 icr_high, icr_low, short_hand, dest,
477 trig_mode, level, dest_mode, delivery_mode, vector);
478
479 for (i = 0; i < KVM_MAX_VCPUS; i++) {
480 vcpu = apic->vcpu->kvm->vcpus[i];
481 if (!vcpu)
482 continue;
483
484 if (vcpu->arch.apic &&
485 apic_match_dest(vcpu, apic, short_hand, dest, dest_mode)) {
486 if (delivery_mode == APIC_DM_LOWEST)
487 set_bit(vcpu->vcpu_id, &lpr_map);
488 else
489 __apic_accept_irq(vcpu->arch.apic, delivery_mode,
490 vector, level, trig_mode);
491 }
492 }
493
494 if (delivery_mode == APIC_DM_LOWEST) {
495 target = kvm_get_lowest_prio_vcpu(vcpu->kvm, vector, lpr_map);
496 if (target != NULL)
497 __apic_accept_irq(target->arch.apic, delivery_mode,
498 vector, level, trig_mode);
499 }
500}
501
502static u32 apic_get_tmcct(struct kvm_lapic *apic)
503{
504 u64 counter_passed;
505 ktime_t passed, now;
506 u32 tmcct;
507
508 ASSERT(apic != NULL);
509
510 now = apic->timer.dev.base->get_time();
511 tmcct = apic_get_reg(apic, APIC_TMICT);
512
513 /* if initial count is 0, current count should also be 0 */
514 if (tmcct == 0)
515 return 0;
516
517 if (unlikely(ktime_to_ns(now) <=
518 ktime_to_ns(apic->timer.last_update))) {
519 /* Wrap around */
520 passed = ktime_add(( {
521 (ktime_t) {
522 .tv64 = KTIME_MAX -
523 (apic->timer.last_update).tv64}; }
524 ), now);
525 apic_debug("time elapsed\n");
526 } else
527 passed = ktime_sub(now, apic->timer.last_update);
528
529 counter_passed = div64_64(ktime_to_ns(passed),
530 (APIC_BUS_CYCLE_NS * apic->timer.divide_count));
531
532 if (counter_passed > tmcct) {
533 if (unlikely(!apic_lvtt_period(apic))) {
534 /* one-shot timers stick at 0 until reset */
535 tmcct = 0;
536 } else {
537 /*
538 * periodic timers reset to APIC_TMICT when they
539 * hit 0. The while loop simulates this happening N
540 * times. (counter_passed %= tmcct) would also work,
541 * but might be slower or not work on 32-bit??
542 */
543 while (counter_passed > tmcct)
544 counter_passed -= tmcct;
545 tmcct -= counter_passed;
546 }
547 } else {
548 tmcct -= counter_passed;
549 }
550
551 return tmcct;
552}
553
554static u32 __apic_read(struct kvm_lapic *apic, unsigned int offset)
555{
556 u32 val = 0;
557
558 if (offset >= LAPIC_MMIO_LENGTH)
559 return 0;
560
561 switch (offset) {
562 case APIC_ARBPRI:
563 printk(KERN_WARNING "Access APIC ARBPRI register "
564 "which is for P6\n");
565 break;
566
567 case APIC_TMCCT: /* Timer CCR */
568 val = apic_get_tmcct(apic);
569 break;
570
571 default:
572 apic_update_ppr(apic);
573 val = apic_get_reg(apic, offset);
574 break;
575 }
576
577 return val;
578}
579
580static void apic_mmio_read(struct kvm_io_device *this,
581 gpa_t address, int len, void *data)
582{
583 struct kvm_lapic *apic = (struct kvm_lapic *)this->private;
584 unsigned int offset = address - apic->base_address;
585 unsigned char alignment = offset & 0xf;
586 u32 result;
587
588 if ((alignment + len) > 4) {
589 printk(KERN_ERR "KVM_APIC_READ: alignment error %lx %d",
590 (unsigned long)address, len);
591 return;
592 }
593 result = __apic_read(apic, offset & ~0xf);
594
595 switch (len) {
596 case 1:
597 case 2:
598 case 4:
599 memcpy(data, (char *)&result + alignment, len);
600 break;
601 default:
602 printk(KERN_ERR "Local APIC read with len = %x, "
603 "should be 1,2, or 4 instead\n", len);
604 break;
605 }
606}
607
608static void update_divide_count(struct kvm_lapic *apic)
609{
610 u32 tmp1, tmp2, tdcr;
611
612 tdcr = apic_get_reg(apic, APIC_TDCR);
613 tmp1 = tdcr & 0xf;
614 tmp2 = ((tmp1 & 0x3) | ((tmp1 & 0x8) >> 1)) + 1;
615 apic->timer.divide_count = 0x1 << (tmp2 & 0x7);
616
617 apic_debug("timer divide count is 0x%x\n",
618 apic->timer.divide_count);
619}
620
621static void start_apic_timer(struct kvm_lapic *apic)
622{
623 ktime_t now = apic->timer.dev.base->get_time();
624
625 apic->timer.last_update = now;
626
627 apic->timer.period = apic_get_reg(apic, APIC_TMICT) *
628 APIC_BUS_CYCLE_NS * apic->timer.divide_count;
629 atomic_set(&apic->timer.pending, 0);
630 hrtimer_start(&apic->timer.dev,
631 ktime_add_ns(now, apic->timer.period),
632 HRTIMER_MODE_ABS);
633
634 apic_debug("%s: bus cycle is %" PRId64 "ns, now 0x%016"
635 PRIx64 ", "
636 "timer initial count 0x%x, period %lldns, "
637 "expire @ 0x%016" PRIx64 ".\n", __FUNCTION__,
638 APIC_BUS_CYCLE_NS, ktime_to_ns(now),
639 apic_get_reg(apic, APIC_TMICT),
640 apic->timer.period,
641 ktime_to_ns(ktime_add_ns(now,
642 apic->timer.period)));
643}
644
645static void apic_mmio_write(struct kvm_io_device *this,
646 gpa_t address, int len, const void *data)
647{
648 struct kvm_lapic *apic = (struct kvm_lapic *)this->private;
649 unsigned int offset = address - apic->base_address;
650 unsigned char alignment = offset & 0xf;
651 u32 val;
652
653 /*
654 * APIC register must be aligned on 128-bits boundary.
655 * 32/64/128 bits registers must be accessed thru 32 bits.
656 * Refer SDM 8.4.1
657 */
658 if (len != 4 || alignment) {
659 if (printk_ratelimit())
660 printk(KERN_ERR "apic write: bad size=%d %lx\n",
661 len, (long)address);
662 return;
663 }
664
665 val = *(u32 *) data;
666
667 /* too common printing */
668 if (offset != APIC_EOI)
669 apic_debug("%s: offset 0x%x with length 0x%x, and value is "
670 "0x%x\n", __FUNCTION__, offset, len, val);
671
672 offset &= 0xff0;
673
674 switch (offset) {
675 case APIC_ID: /* Local APIC ID */
676 apic_set_reg(apic, APIC_ID, val);
677 break;
678
679 case APIC_TASKPRI:
680 apic_set_tpr(apic, val & 0xff);
681 break;
682
683 case APIC_EOI:
684 apic_set_eoi(apic);
685 break;
686
687 case APIC_LDR:
688 apic_set_reg(apic, APIC_LDR, val & APIC_LDR_MASK);
689 break;
690
691 case APIC_DFR:
692 apic_set_reg(apic, APIC_DFR, val | 0x0FFFFFFF);
693 break;
694
695 case APIC_SPIV:
696 apic_set_reg(apic, APIC_SPIV, val & 0x3ff);
697 if (!(val & APIC_SPIV_APIC_ENABLED)) {
698 int i;
699 u32 lvt_val;
700
701 for (i = 0; i < APIC_LVT_NUM; i++) {
702 lvt_val = apic_get_reg(apic,
703 APIC_LVTT + 0x10 * i);
704 apic_set_reg(apic, APIC_LVTT + 0x10 * i,
705 lvt_val | APIC_LVT_MASKED);
706 }
707 atomic_set(&apic->timer.pending, 0);
708
709 }
710 break;
711
712 case APIC_ICR:
713 /* No delay here, so we always clear the pending bit */
714 apic_set_reg(apic, APIC_ICR, val & ~(1 << 12));
715 apic_send_ipi(apic);
716 break;
717
718 case APIC_ICR2:
719 apic_set_reg(apic, APIC_ICR2, val & 0xff000000);
720 break;
721
722 case APIC_LVTT:
723 case APIC_LVTTHMR:
724 case APIC_LVTPC:
725 case APIC_LVT0:
726 case APIC_LVT1:
727 case APIC_LVTERR:
728 /* TODO: Check vector */
729 if (!apic_sw_enabled(apic))
730 val |= APIC_LVT_MASKED;
731
732 val &= apic_lvt_mask[(offset - APIC_LVTT) >> 4];
733 apic_set_reg(apic, offset, val);
734
735 break;
736
737 case APIC_TMICT:
738 hrtimer_cancel(&apic->timer.dev);
739 apic_set_reg(apic, APIC_TMICT, val);
740 start_apic_timer(apic);
741 return;
742
743 case APIC_TDCR:
744 if (val & 4)
745 printk(KERN_ERR "KVM_WRITE:TDCR %x\n", val);
746 apic_set_reg(apic, APIC_TDCR, val);
747 update_divide_count(apic);
748 break;
749
750 default:
751 apic_debug("Local APIC Write to read-only register %x\n",
752 offset);
753 break;
754 }
755
756}
757
758static int apic_mmio_range(struct kvm_io_device *this, gpa_t addr)
759{
760 struct kvm_lapic *apic = (struct kvm_lapic *)this->private;
761 int ret = 0;
762
763
764 if (apic_hw_enabled(apic) &&
765 (addr >= apic->base_address) &&
766 (addr < (apic->base_address + LAPIC_MMIO_LENGTH)))
767 ret = 1;
768
769 return ret;
770}
771
772void kvm_free_lapic(struct kvm_vcpu *vcpu)
773{
774 if (!vcpu->arch.apic)
775 return;
776
777 hrtimer_cancel(&vcpu->arch.apic->timer.dev);
778
779 if (vcpu->arch.apic->regs_page)
780 __free_page(vcpu->arch.apic->regs_page);
781
782 kfree(vcpu->arch.apic);
783}
784
785/*
786 *----------------------------------------------------------------------
787 * LAPIC interface
788 *----------------------------------------------------------------------
789 */
790
791void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8)
792{
793 struct kvm_lapic *apic = vcpu->arch.apic;
794
795 if (!apic)
796 return;
797 apic_set_tpr(apic, ((cr8 & 0x0f) << 4));
798}
799
800u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu)
801{
802 struct kvm_lapic *apic = vcpu->arch.apic;
803 u64 tpr;
804
805 if (!apic)
806 return 0;
807 tpr = (u64) apic_get_reg(apic, APIC_TASKPRI);
808
809 return (tpr & 0xf0) >> 4;
810}
811EXPORT_SYMBOL_GPL(kvm_lapic_get_cr8);
812
813void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
814{
815 struct kvm_lapic *apic = vcpu->arch.apic;
816
817 if (!apic) {
818 value |= MSR_IA32_APICBASE_BSP;
819 vcpu->arch.apic_base = value;
820 return;
821 }
822 if (apic->vcpu->vcpu_id)
823 value &= ~MSR_IA32_APICBASE_BSP;
824
825 vcpu->arch.apic_base = value;
826 apic->base_address = apic->vcpu->arch.apic_base &
827 MSR_IA32_APICBASE_BASE;
828
829 /* with FSB delivery interrupt, we can restart APIC functionality */
830 apic_debug("apic base msr is 0x%016" PRIx64 ", and base address is "
831 "0x%lx.\n", apic->vcpu->arch.apic_base, apic->base_address);
832
833}
834
835u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu)
836{
837 return vcpu->arch.apic_base;
838}
839EXPORT_SYMBOL_GPL(kvm_lapic_get_base);
840
841void kvm_lapic_reset(struct kvm_vcpu *vcpu)
842{
843 struct kvm_lapic *apic;
844 int i;
845
846 apic_debug("%s\n", __FUNCTION__);
847
848 ASSERT(vcpu);
849 apic = vcpu->arch.apic;
850 ASSERT(apic != NULL);
851
852 /* Stop the timer in case it's a reset to an active apic */
853 hrtimer_cancel(&apic->timer.dev);
854
855 apic_set_reg(apic, APIC_ID, vcpu->vcpu_id << 24);
856 apic_set_reg(apic, APIC_LVR, APIC_VERSION);
857
858 for (i = 0; i < APIC_LVT_NUM; i++)
859 apic_set_reg(apic, APIC_LVTT + 0x10 * i, APIC_LVT_MASKED);
860 apic_set_reg(apic, APIC_LVT0,
861 SET_APIC_DELIVERY_MODE(0, APIC_MODE_EXTINT));
862
863 apic_set_reg(apic, APIC_DFR, 0xffffffffU);
864 apic_set_reg(apic, APIC_SPIV, 0xff);
865 apic_set_reg(apic, APIC_TASKPRI, 0);
866 apic_set_reg(apic, APIC_LDR, 0);
867 apic_set_reg(apic, APIC_ESR, 0);
868 apic_set_reg(apic, APIC_ICR, 0);
869 apic_set_reg(apic, APIC_ICR2, 0);
870 apic_set_reg(apic, APIC_TDCR, 0);
871 apic_set_reg(apic, APIC_TMICT, 0);
872 for (i = 0; i < 8; i++) {
873 apic_set_reg(apic, APIC_IRR + 0x10 * i, 0);
874 apic_set_reg(apic, APIC_ISR + 0x10 * i, 0);
875 apic_set_reg(apic, APIC_TMR + 0x10 * i, 0);
876 }
877 update_divide_count(apic);
878 atomic_set(&apic->timer.pending, 0);
879 if (vcpu->vcpu_id == 0)
880 vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP;
881 apic_update_ppr(apic);
882
883 apic_debug(KERN_INFO "%s: vcpu=%p, id=%d, base_msr="
884 "0x%016" PRIx64 ", base_address=0x%0lx.\n", __FUNCTION__,
885 vcpu, kvm_apic_id(apic),
886 vcpu->arch.apic_base, apic->base_address);
887}
888EXPORT_SYMBOL_GPL(kvm_lapic_reset);
889
890int kvm_lapic_enabled(struct kvm_vcpu *vcpu)
891{
892 struct kvm_lapic *apic = vcpu->arch.apic;
893 int ret = 0;
894
895 if (!apic)
896 return 0;
897 ret = apic_enabled(apic);
898
899 return ret;
900}
901EXPORT_SYMBOL_GPL(kvm_lapic_enabled);
902
903/*
904 *----------------------------------------------------------------------
905 * timer interface
906 *----------------------------------------------------------------------
907 */
908
909/* TODO: make sure __apic_timer_fn runs in current pCPU */
910static int __apic_timer_fn(struct kvm_lapic *apic)
911{
912 int result = 0;
913 wait_queue_head_t *q = &apic->vcpu->wq;
914
915 atomic_inc(&apic->timer.pending);
916 if (waitqueue_active(q)) {
917 apic->vcpu->arch.mp_state = VCPU_MP_STATE_RUNNABLE;
918 wake_up_interruptible(q);
919 }
920 if (apic_lvtt_period(apic)) {
921 result = 1;
922 apic->timer.dev.expires = ktime_add_ns(
923 apic->timer.dev.expires,
924 apic->timer.period);
925 }
926 return result;
927}
928
929static int __inject_apic_timer_irq(struct kvm_lapic *apic)
930{
931 int vector;
932
933 vector = apic_lvt_vector(apic, APIC_LVTT);
934 return __apic_accept_irq(apic, APIC_DM_FIXED, vector, 1, 0);
935}
936
937static enum hrtimer_restart apic_timer_fn(struct hrtimer *data)
938{
939 struct kvm_lapic *apic;
940 int restart_timer = 0;
941
942 apic = container_of(data, struct kvm_lapic, timer.dev);
943
944 restart_timer = __apic_timer_fn(apic);
945
946 if (restart_timer)
947 return HRTIMER_RESTART;
948 else
949 return HRTIMER_NORESTART;
950}
951
952int kvm_create_lapic(struct kvm_vcpu *vcpu)
953{
954 struct kvm_lapic *apic;
955
956 ASSERT(vcpu != NULL);
957 apic_debug("apic_init %d\n", vcpu->vcpu_id);
958
959 apic = kzalloc(sizeof(*apic), GFP_KERNEL);
960 if (!apic)
961 goto nomem;
962
963 vcpu->arch.apic = apic;
964
965 apic->regs_page = alloc_page(GFP_KERNEL);
966 if (apic->regs_page == NULL) {
967 printk(KERN_ERR "malloc apic regs error for vcpu %x\n",
968 vcpu->vcpu_id);
969 goto nomem_free_apic;
970 }
971 apic->regs = page_address(apic->regs_page);
972 memset(apic->regs, 0, PAGE_SIZE);
973 apic->vcpu = vcpu;
974
975 hrtimer_init(&apic->timer.dev, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
976 apic->timer.dev.function = apic_timer_fn;
977 apic->base_address = APIC_DEFAULT_PHYS_BASE;
978 vcpu->arch.apic_base = APIC_DEFAULT_PHYS_BASE;
979
980 kvm_lapic_reset(vcpu);
981 apic->dev.read = apic_mmio_read;
982 apic->dev.write = apic_mmio_write;
983 apic->dev.in_range = apic_mmio_range;
984 apic->dev.private = apic;
985
986 return 0;
987nomem_free_apic:
988 kfree(apic);
989nomem:
990 return -ENOMEM;
991}
992EXPORT_SYMBOL_GPL(kvm_create_lapic);
993
994int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu)
995{
996 struct kvm_lapic *apic = vcpu->arch.apic;
997 int highest_irr;
998
999 if (!apic || !apic_enabled(apic))
1000 return -1;
1001
1002 apic_update_ppr(apic);
1003 highest_irr = apic_find_highest_irr(apic);
1004 if ((highest_irr == -1) ||
1005 ((highest_irr & 0xF0) <= apic_get_reg(apic, APIC_PROCPRI)))
1006 return -1;
1007 return highest_irr;
1008}
1009
1010int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu)
1011{
1012 u32 lvt0 = apic_get_reg(vcpu->arch.apic, APIC_LVT0);
1013 int r = 0;
1014
1015 if (vcpu->vcpu_id == 0) {
1016 if (!apic_hw_enabled(vcpu->arch.apic))
1017 r = 1;
1018 if ((lvt0 & APIC_LVT_MASKED) == 0 &&
1019 GET_APIC_DELIVERY_MODE(lvt0) == APIC_MODE_EXTINT)
1020 r = 1;
1021 }
1022 return r;
1023}
1024
1025void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu)
1026{
1027 struct kvm_lapic *apic = vcpu->arch.apic;
1028
1029 if (apic && apic_lvt_enabled(apic, APIC_LVTT) &&
1030 atomic_read(&apic->timer.pending) > 0) {
1031 if (__inject_apic_timer_irq(apic))
1032 atomic_dec(&apic->timer.pending);
1033 }
1034}
1035
1036void kvm_apic_timer_intr_post(struct kvm_vcpu *vcpu, int vec)
1037{
1038 struct kvm_lapic *apic = vcpu->arch.apic;
1039
1040 if (apic && apic_lvt_vector(apic, APIC_LVTT) == vec)
1041 apic->timer.last_update = ktime_add_ns(
1042 apic->timer.last_update,
1043 apic->timer.period);
1044}
1045
1046int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu)
1047{
1048 int vector = kvm_apic_has_interrupt(vcpu);
1049 struct kvm_lapic *apic = vcpu->arch.apic;
1050
1051 if (vector == -1)
1052 return -1;
1053
1054 apic_set_vector(vector, apic->regs + APIC_ISR);
1055 apic_update_ppr(apic);
1056 apic_clear_irr(vector, apic);
1057 return vector;
1058}
1059
1060void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu)
1061{
1062 struct kvm_lapic *apic = vcpu->arch.apic;
1063
1064 apic->base_address = vcpu->arch.apic_base &
1065 MSR_IA32_APICBASE_BASE;
1066 apic_set_reg(apic, APIC_LVR, APIC_VERSION);
1067 apic_update_ppr(apic);
1068 hrtimer_cancel(&apic->timer.dev);
1069 update_divide_count(apic);
1070 start_apic_timer(apic);
1071}
1072
1073void kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
1074{
1075 struct kvm_lapic *apic = vcpu->arch.apic;
1076 struct hrtimer *timer;
1077
1078 if (!apic)
1079 return;
1080
1081 timer = &apic->timer.dev;
1082 if (hrtimer_cancel(timer))
1083 hrtimer_start(timer, timer->expires, HRTIMER_MODE_ABS);
1084}
1085EXPORT_SYMBOL_GPL(kvm_migrate_apic_timer);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
new file mode 100644
index 000000000000..401eb7ce3207
--- /dev/null
+++ b/arch/x86/kvm/mmu.c
@@ -0,0 +1,1805 @@
1/*
2 * Kernel-based Virtual Machine driver for Linux
3 *
4 * This module enables machines with Intel VT-x extensions to run virtual
5 * machines without emulation or binary translation.
6 *
7 * MMU support
8 *
9 * Copyright (C) 2006 Qumranet, Inc.
10 *
11 * Authors:
12 * Yaniv Kamay <yaniv@qumranet.com>
13 * Avi Kivity <avi@qumranet.com>
14 *
15 * This work is licensed under the terms of the GNU GPL, version 2. See
16 * the COPYING file in the top-level directory.
17 *
18 */
19
20#include "vmx.h"
21#include "mmu.h"
22
23#include <linux/kvm_host.h>
24#include <linux/types.h>
25#include <linux/string.h>
26#include <linux/mm.h>
27#include <linux/highmem.h>
28#include <linux/module.h>
29#include <linux/swap.h>
30
31#include <asm/page.h>
32#include <asm/cmpxchg.h>
33#include <asm/io.h>
34
/* Compile-time debugging switches; define instead of undef to enable. */
#undef MMU_DEBUG

#undef AUDIT

#ifdef AUDIT
static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg);
#else
static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) {}
#endif

#ifdef MMU_DEBUG

#define pgprintk(x...) do { if (dbg) printk(x); } while (0)
#define rmap_printk(x...) do { if (dbg) printk(x); } while (0)

#else

#define pgprintk(x...) do { } while (0)
#define rmap_printk(x...) do { } while (0)

#endif

#if defined(MMU_DEBUG) || defined(AUDIT)
static int dbg = 1;
#endif

/*
 * ASSERT(x): with MMU_DEBUG, log a warning when x is false; otherwise a
 * no-op.  Both variants are wrapped in do { } while (0) so that the macro
 * behaves as a single statement (safe in an unbraced if/else).
 */
#ifndef MMU_DEBUG
#define ASSERT(x) do { } while (0)
#else
#define ASSERT(x)							\
	do {								\
		if (!(x)) {						\
			printk(KERN_WARNING "assertion failed %s:%d: %s\n", \
			       __FILE__, __LINE__, #x);			\
		}							\
	} while (0)
#endif
70
/* Index width and resulting entry count per table page, 64-bit and 32-bit formats. */
#define PT64_PT_BITS 9
#define PT64_ENT_PER_PAGE (1 << PT64_PT_BITS)
#define PT32_PT_BITS 10
#define PT32_ENT_PER_PAGE (1 << PT32_PT_BITS)

#define PT_WRITABLE_SHIFT 1

/* Single-bit masks within a page table entry (bit number is the shift count). */
#define PT_PRESENT_MASK (1ULL << 0)
#define PT_WRITABLE_MASK (1ULL << PT_WRITABLE_SHIFT)
#define PT_USER_MASK (1ULL << 2)
#define PT_PWT_MASK (1ULL << 3)
#define PT_PCD_MASK (1ULL << 4)
#define PT_ACCESSED_MASK (1ULL << 5)
#define PT_DIRTY_MASK (1ULL << 6)
/* NOTE: PT_PAGE_SIZE_MASK and PT_PAT_MASK deliberately share bit 7. */
#define PT_PAGE_SIZE_MASK (1ULL << 7)
#define PT_PAT_MASK (1ULL << 7)
#define PT_GLOBAL_MASK (1ULL << 8)
#define PT64_NX_SHIFT 63
#define PT64_NX_MASK (1ULL << PT64_NX_SHIFT)

#define PT_PAT_SHIFT 7
#define PT_DIR_PAT_SHIFT 12
#define PT_DIR_PAT_MASK (1ULL << PT_DIR_PAT_SHIFT)

/* PT32_DIR_PSE36_SIZE bits starting at bit PT32_DIR_PSE36_SHIFT. */
#define PT32_DIR_PSE36_SIZE 4
#define PT32_DIR_PSE36_SHIFT 13
#define PT32_DIR_PSE36_MASK \
	(((1ULL << PT32_DIR_PSE36_SIZE) - 1) << PT32_DIR_PSE36_SHIFT)


#define PT_FIRST_AVAIL_BITS_SHIFT 9
#define PT64_SECOND_AVAIL_BITS_SHIFT 52

/* Marker bit that this file sets on, and strips from, shadow ptes. */
#define PT_SHADOW_IO_MARK (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)

/* True when x is not the INVALID_PAGE sentinel. */
#define VALID_PAGE(x) ((x) != INVALID_PAGE)
107
/*
 * Per-level shift / mask / index helpers for the 64-bit (9 index bits per
 * level) and 32-bit (10 index bits per level) table formats.  Level 1 starts
 * at PAGE_SHIFT.  Every macro argument is parenthesized so that arbitrary
 * expressions can be passed as 'level' or 'address'.
 */
#define PT64_LEVEL_BITS 9

#define PT64_LEVEL_SHIFT(level) \
		(PAGE_SHIFT + ((level) - 1) * PT64_LEVEL_BITS)

#define PT64_LEVEL_MASK(level) \
		(((1ULL << PT64_LEVEL_BITS) - 1) << PT64_LEVEL_SHIFT(level))

#define PT64_INDEX(address, level)\
	(((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1))


#define PT32_LEVEL_BITS 10

#define PT32_LEVEL_SHIFT(level) \
		(PAGE_SHIFT + ((level) - 1) * PT32_LEVEL_BITS)

#define PT32_LEVEL_MASK(level) \
		(((1ULL << PT32_LEVEL_BITS) - 1) << PT32_LEVEL_SHIFT(level))

#define PT32_INDEX(address, level)\
	(((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1))
130
131
/* Bits [PAGE_SHIFT, 52) of a 64-bit entry: the page-aligned address field. */
#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))
/* Same, with the low (PAGE_SHIFT + PT64_LEVEL_BITS) bits cleared as well. */
#define PT64_DIR_BASE_ADDR_MASK \
	(PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + PT64_LEVEL_BITS)) - 1))

#define PT32_BASE_ADDR_MASK PAGE_MASK
#define PT32_DIR_BASE_ADDR_MASK \
	(PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1))

#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \
		| PT64_NX_MASK)

/* Bits tested in the page fault error code. */
#define PFERR_PRESENT_MASK (1U << 0)
#define PFERR_WRITE_MASK (1U << 1)
#define PFERR_USER_MASK (1U << 2)
#define PFERR_FETCH_MASK (1U << 4)

/* Root level numbers for the paging modes handled in this file. */
#define PT64_ROOT_LEVEL 4
#define PT32_ROOT_LEVEL 2
#define PT32E_ROOT_LEVEL 3

#define PT_DIRECTORY_LEVEL 2
#define PT_PAGE_TABLE_LEVEL 1

/* Number of shadow pte pointers stored in one struct kvm_rmap_desc. */
#define RMAP_EXT 4

/* Access bits; write and user reuse the pte bit positions, exec is bit 0. */
#define ACC_EXEC_MASK 1
#define ACC_WRITE_MASK PT_WRITABLE_MASK
#define ACC_USER_MASK PT_USER_MASK
#define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)
161
/*
 * Node of a reverse-map descriptor chain: holds up to RMAP_EXT shadow pte
 * pointers (unused slots are NULL) and links to a further node via 'more'.
 */
struct kvm_rmap_desc {
	u64 *shadow_ptes[RMAP_EXT];
	struct kvm_rmap_desc *more;
};
166
167static struct kmem_cache *pte_chain_cache;
168static struct kmem_cache *rmap_desc_cache;
169static struct kmem_cache *mmu_page_header_cache;
170
171static u64 __read_mostly shadow_trap_nonpresent_pte;
172static u64 __read_mostly shadow_notrap_nonpresent_pte;
173
174void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte)
175{
176 shadow_trap_nonpresent_pte = trap_pte;
177 shadow_notrap_nonpresent_pte = notrap_pte;
178}
179EXPORT_SYMBOL_GPL(kvm_mmu_set_nonpresent_ptes);
180
181static int is_write_protection(struct kvm_vcpu *vcpu)
182{
183 return vcpu->arch.cr0 & X86_CR0_WP;
184}
185
/* PSE36 is unconditionally reported as available. */
static int is_cpuid_PSE36(void)
{
	const int available = 1;

	return available;
}
190
191static int is_nx(struct kvm_vcpu *vcpu)
192{
193 return vcpu->arch.shadow_efer & EFER_NX;
194}
195
196static int is_present_pte(unsigned long pte)
197{
198 return pte & PT_PRESENT_MASK;
199}
200
201static int is_shadow_present_pte(u64 pte)
202{
203 pte &= ~PT_SHADOW_IO_MARK;
204 return pte != shadow_trap_nonpresent_pte
205 && pte != shadow_notrap_nonpresent_pte;
206}
207
208static int is_writeble_pte(unsigned long pte)
209{
210 return pte & PT_WRITABLE_MASK;
211}
212
213static int is_dirty_pte(unsigned long pte)
214{
215 return pte & PT_DIRTY_MASK;
216}
217
218static int is_io_pte(unsigned long pte)
219{
220 return pte & PT_SHADOW_IO_MARK;
221}
222
223static int is_rmap_pte(u64 pte)
224{
225 return pte != shadow_trap_nonpresent_pte
226 && pte != shadow_notrap_nonpresent_pte;
227}
228
229static gfn_t pse36_gfn_delta(u32 gpte)
230{
231 int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT;
232
233 return (gpte & PT32_DIR_PSE36_MASK) << shift;
234}
235
236static void set_shadow_pte(u64 *sptep, u64 spte)
237{
238#ifdef CONFIG_X86_64
239 set_64bit((unsigned long *)sptep, spte);
240#else
241 set_64bit((unsigned long long *)sptep, spte);
242#endif
243}
244
245static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
246 struct kmem_cache *base_cache, int min)
247{
248 void *obj;
249
250 if (cache->nobjs >= min)
251 return 0;
252 while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
253 obj = kmem_cache_zalloc(base_cache, GFP_KERNEL);
254 if (!obj)
255 return -ENOMEM;
256 cache->objects[cache->nobjs++] = obj;
257 }
258 return 0;
259}
260
261static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
262{
263 while (mc->nobjs)
264 kfree(mc->objects[--mc->nobjs]);
265}
266
267static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache,
268 int min)
269{
270 struct page *page;
271
272 if (cache->nobjs >= min)
273 return 0;
274 while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
275 page = alloc_page(GFP_KERNEL);
276 if (!page)
277 return -ENOMEM;
278 set_page_private(page, 0);
279 cache->objects[cache->nobjs++] = page_address(page);
280 }
281 return 0;
282}
283
284static void mmu_free_memory_cache_page(struct kvm_mmu_memory_cache *mc)
285{
286 while (mc->nobjs)
287 free_page((unsigned long)mc->objects[--mc->nobjs]);
288}
289
290static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu)
291{
292 int r;
293
294 kvm_mmu_free_some_pages(vcpu);
295 r = mmu_topup_memory_cache(&vcpu->arch.mmu_pte_chain_cache,
296 pte_chain_cache, 4);
297 if (r)
298 goto out;
299 r = mmu_topup_memory_cache(&vcpu->arch.mmu_rmap_desc_cache,
300 rmap_desc_cache, 1);
301 if (r)
302 goto out;
303 r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8);
304 if (r)
305 goto out;
306 r = mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache,
307 mmu_page_header_cache, 4);
308out:
309 return r;
310}
311
312static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
313{
314 mmu_free_memory_cache(&vcpu->arch.mmu_pte_chain_cache);
315 mmu_free_memory_cache(&vcpu->arch.mmu_rmap_desc_cache);
316 mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache);
317 mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache);
318}
319
320static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc,
321 size_t size)
322{
323 void *p;
324
325 BUG_ON(!mc->nobjs);
326 p = mc->objects[--mc->nobjs];
327 memset(p, 0, size);
328 return p;
329}
330
331static struct kvm_pte_chain *mmu_alloc_pte_chain(struct kvm_vcpu *vcpu)
332{
333 return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_chain_cache,
334 sizeof(struct kvm_pte_chain));
335}
336
/* Release a pte chain node with kfree(). */
static void mmu_free_pte_chain(struct kvm_pte_chain *pc)
{
	kfree(pc);
}
341
342static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu)
343{
344 return mmu_memory_cache_alloc(&vcpu->arch.mmu_rmap_desc_cache,
345 sizeof(struct kvm_rmap_desc));
346}
347
/* Release an rmap descriptor with kfree(). */
static void mmu_free_rmap_desc(struct kvm_rmap_desc *rd)
{
	kfree(rd);
}
352
/*
 * Take a gfn and return the reverse mapping to it.
 * Note: the gfn must be unaliased before this function gets called.
 */
357
358static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn)
359{
360 struct kvm_memory_slot *slot;
361
362 slot = gfn_to_memslot(kvm, gfn);
363 return &slot->rmap[gfn - slot->base_gfn];
364}
365
/*
 * Reverse mapping data structures:
 *
 * If bit zero of *rmapp is zero, then *rmapp points to the shadow page table
 * entry that points to page_address(page).
 *
 * If bit zero of *rmapp is one, then (*rmapp & ~1) points to a struct
 * kvm_rmap_desc containing more mappings.
 */
375static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
376{
377 struct kvm_mmu_page *sp;
378 struct kvm_rmap_desc *desc;
379 unsigned long *rmapp;
380 int i;
381
382 if (!is_rmap_pte(*spte))
383 return;
384 gfn = unalias_gfn(vcpu->kvm, gfn);
385 sp = page_header(__pa(spte));
386 sp->gfns[spte - sp->spt] = gfn;
387 rmapp = gfn_to_rmap(vcpu->kvm, gfn);
388 if (!*rmapp) {
389 rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte);
390 *rmapp = (unsigned long)spte;
391 } else if (!(*rmapp & 1)) {
392 rmap_printk("rmap_add: %p %llx 1->many\n", spte, *spte);
393 desc = mmu_alloc_rmap_desc(vcpu);
394 desc->shadow_ptes[0] = (u64 *)*rmapp;
395 desc->shadow_ptes[1] = spte;
396 *rmapp = (unsigned long)desc | 1;
397 } else {
398 rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte);
399 desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
400 while (desc->shadow_ptes[RMAP_EXT-1] && desc->more)
401 desc = desc->more;
402 if (desc->shadow_ptes[RMAP_EXT-1]) {
403 desc->more = mmu_alloc_rmap_desc(vcpu);
404 desc = desc->more;
405 }
406 for (i = 0; desc->shadow_ptes[i]; ++i)
407 ;
408 desc->shadow_ptes[i] = spte;
409 }
410}
411
412static void rmap_desc_remove_entry(unsigned long *rmapp,
413 struct kvm_rmap_desc *desc,
414 int i,
415 struct kvm_rmap_desc *prev_desc)
416{
417 int j;
418
419 for (j = RMAP_EXT - 1; !desc->shadow_ptes[j] && j > i; --j)
420 ;
421 desc->shadow_ptes[i] = desc->shadow_ptes[j];
422 desc->shadow_ptes[j] = NULL;
423 if (j != 0)
424 return;
425 if (!prev_desc && !desc->more)
426 *rmapp = (unsigned long)desc->shadow_ptes[0];
427 else
428 if (prev_desc)
429 prev_desc->more = desc->more;
430 else
431 *rmapp = (unsigned long)desc->more | 1;
432 mmu_free_rmap_desc(desc);
433}
434
435static void rmap_remove(struct kvm *kvm, u64 *spte)
436{
437 struct kvm_rmap_desc *desc;
438 struct kvm_rmap_desc *prev_desc;
439 struct kvm_mmu_page *sp;
440 struct page *page;
441 unsigned long *rmapp;
442 int i;
443
444 if (!is_rmap_pte(*spte))
445 return;
446 sp = page_header(__pa(spte));
447 page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT);
448 mark_page_accessed(page);
449 if (is_writeble_pte(*spte))
450 kvm_release_page_dirty(page);
451 else
452 kvm_release_page_clean(page);
453 rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt]);
454 if (!*rmapp) {
455 printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte);
456 BUG();
457 } else if (!(*rmapp & 1)) {
458 rmap_printk("rmap_remove: %p %llx 1->0\n", spte, *spte);
459 if ((u64 *)*rmapp != spte) {
460 printk(KERN_ERR "rmap_remove: %p %llx 1->BUG\n",
461 spte, *spte);
462 BUG();
463 }
464 *rmapp = 0;
465 } else {
466 rmap_printk("rmap_remove: %p %llx many->many\n", spte, *spte);
467 desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
468 prev_desc = NULL;
469 while (desc) {
470 for (i = 0; i < RMAP_EXT && desc->shadow_ptes[i]; ++i)
471 if (desc->shadow_ptes[i] == spte) {
472 rmap_desc_remove_entry(rmapp,
473 desc, i,
474 prev_desc);
475 return;
476 }
477 prev_desc = desc;
478 desc = desc->more;
479 }
480 BUG();
481 }
482}
483
484static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte)
485{
486 struct kvm_rmap_desc *desc;
487 struct kvm_rmap_desc *prev_desc;
488 u64 *prev_spte;
489 int i;
490
491 if (!*rmapp)
492 return NULL;
493 else if (!(*rmapp & 1)) {
494 if (!spte)
495 return (u64 *)*rmapp;
496 return NULL;
497 }
498 desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
499 prev_desc = NULL;
500 prev_spte = NULL;
501 while (desc) {
502 for (i = 0; i < RMAP_EXT && desc->shadow_ptes[i]; ++i) {
503 if (prev_spte == spte)
504 return desc->shadow_ptes[i];
505 prev_spte = desc->shadow_ptes[i];
506 }
507 desc = desc->more;
508 }
509 return NULL;
510}
511
512static void rmap_write_protect(struct kvm *kvm, u64 gfn)
513{
514 unsigned long *rmapp;
515 u64 *spte;
516
517 gfn = unalias_gfn(kvm, gfn);
518 rmapp = gfn_to_rmap(kvm, gfn);
519
520 spte = rmap_next(kvm, rmapp, NULL);
521 while (spte) {
522 BUG_ON(!spte);
523 BUG_ON(!(*spte & PT_PRESENT_MASK));
524 rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
525 if (is_writeble_pte(*spte))
526 set_shadow_pte(spte, *spte & ~PT_WRITABLE_MASK);
527 kvm_flush_remote_tlbs(kvm);
528 spte = rmap_next(kvm, rmapp, spte);
529 }
530}
531
#ifdef MMU_DEBUG
/*
 * Debug helper: returns 1 when every entry of the shadow page @spt equals
 * shadow_trap_nonpresent_pte (ignoring PT_SHADOW_IO_MARK); otherwise logs
 * the first offending entry and returns 0.
 */
static int is_empty_shadow_page(u64 *spt)
{
	unsigned long idx;

	for (idx = 0; idx != PAGE_SIZE / sizeof(u64); idx++) {
		u64 *pos = &spt[idx];

		if ((*pos & ~PT_SHADOW_IO_MARK) == shadow_trap_nonpresent_pte)
			continue;
		printk(KERN_ERR "%s: %p %llx\n", __FUNCTION__,
		       pos, *pos);
		return 0;
	}
	return 1;
}
#endif
547
548static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp)
549{
550 ASSERT(is_empty_shadow_page(sp->spt));
551 list_del(&sp->link);
552 __free_page(virt_to_page(sp->spt));
553 __free_page(virt_to_page(sp->gfns));
554 kfree(sp);
555 ++kvm->arch.n_free_mmu_pages;
556}
557
558static unsigned kvm_page_table_hashfn(gfn_t gfn)
559{
560 return gfn;
561}
562
563static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
564 u64 *parent_pte)
565{
566 struct kvm_mmu_page *sp;
567
568 if (!vcpu->kvm->arch.n_free_mmu_pages)
569 return NULL;
570
571 sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache, sizeof *sp);
572 sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
573 sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
574 set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
575 list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
576 ASSERT(is_empty_shadow_page(sp->spt));
577 sp->slot_bitmap = 0;
578 sp->multimapped = 0;
579 sp->parent_pte = parent_pte;
580 --vcpu->kvm->arch.n_free_mmu_pages;
581 return sp;
582}
583
584static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu,
585 struct kvm_mmu_page *sp, u64 *parent_pte)
586{
587 struct kvm_pte_chain *pte_chain;
588 struct hlist_node *node;
589 int i;
590
591 if (!parent_pte)
592 return;
593 if (!sp->multimapped) {
594 u64 *old = sp->parent_pte;
595
596 if (!old) {
597 sp->parent_pte = parent_pte;
598 return;
599 }
600 sp->multimapped = 1;
601 pte_chain = mmu_alloc_pte_chain(vcpu);
602 INIT_HLIST_HEAD(&sp->parent_ptes);
603 hlist_add_head(&pte_chain->link, &sp->parent_ptes);
604 pte_chain->parent_ptes[0] = old;
605 }
606 hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link) {
607 if (pte_chain->parent_ptes[NR_PTE_CHAIN_ENTRIES-1])
608 continue;
609 for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i)
610 if (!pte_chain->parent_ptes[i]) {
611 pte_chain->parent_ptes[i] = parent_pte;
612 return;
613 }
614 }
615 pte_chain = mmu_alloc_pte_chain(vcpu);
616 BUG_ON(!pte_chain);
617 hlist_add_head(&pte_chain->link, &sp->parent_ptes);
618 pte_chain->parent_ptes[0] = parent_pte;
619}
620
621static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
622 u64 *parent_pte)
623{
624 struct kvm_pte_chain *pte_chain;
625 struct hlist_node *node;
626 int i;
627
628 if (!sp->multimapped) {
629 BUG_ON(sp->parent_pte != parent_pte);
630 sp->parent_pte = NULL;
631 return;
632 }
633 hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
634 for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
635 if (!pte_chain->parent_ptes[i])
636 break;
637 if (pte_chain->parent_ptes[i] != parent_pte)
638 continue;
639 while (i + 1 < NR_PTE_CHAIN_ENTRIES
640 && pte_chain->parent_ptes[i + 1]) {
641 pte_chain->parent_ptes[i]
642 = pte_chain->parent_ptes[i + 1];
643 ++i;
644 }
645 pte_chain->parent_ptes[i] = NULL;
646 if (i == 0) {
647 hlist_del(&pte_chain->link);
648 mmu_free_pte_chain(pte_chain);
649 if (hlist_empty(&sp->parent_ptes)) {
650 sp->multimapped = 0;
651 sp->parent_pte = NULL;
652 }
653 }
654 return;
655 }
656 BUG();
657}
658
659static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn)
660{
661 unsigned index;
662 struct hlist_head *bucket;
663 struct kvm_mmu_page *sp;
664 struct hlist_node *node;
665
666 pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn);
667 index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
668 bucket = &kvm->arch.mmu_page_hash[index];
669 hlist_for_each_entry(sp, node, bucket, hash_link)
670 if (sp->gfn == gfn && !sp->role.metaphysical) {
671 pgprintk("%s: found role %x\n",
672 __FUNCTION__, sp->role.word);
673 return sp;
674 }
675 return NULL;
676}
677
678static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
679 gfn_t gfn,
680 gva_t gaddr,
681 unsigned level,
682 int metaphysical,
683 unsigned access,
684 u64 *parent_pte,
685 bool *new_page)
686{
687 union kvm_mmu_page_role role;
688 unsigned index;
689 unsigned quadrant;
690 struct hlist_head *bucket;
691 struct kvm_mmu_page *sp;
692 struct hlist_node *node;
693
694 role.word = 0;
695 role.glevels = vcpu->arch.mmu.root_level;
696 role.level = level;
697 role.metaphysical = metaphysical;
698 role.access = access;
699 if (vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) {
700 quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
701 quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
702 role.quadrant = quadrant;
703 }
704 pgprintk("%s: looking gfn %lx role %x\n", __FUNCTION__,
705 gfn, role.word);
706 index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
707 bucket = &vcpu->kvm->arch.mmu_page_hash[index];
708 hlist_for_each_entry(sp, node, bucket, hash_link)
709 if (sp->gfn == gfn && sp->role.word == role.word) {
710 mmu_page_add_parent_pte(vcpu, sp, parent_pte);
711 pgprintk("%s: found\n", __FUNCTION__);
712 return sp;
713 }
714 sp = kvm_mmu_alloc_page(vcpu, parent_pte);
715 if (!sp)
716 return sp;
717 pgprintk("%s: adding gfn %lx role %x\n", __FUNCTION__, gfn, role.word);
718 sp->gfn = gfn;
719 sp->role = role;
720 hlist_add_head(&sp->hash_link, bucket);
721 vcpu->arch.mmu.prefetch_page(vcpu, sp);
722 if (!metaphysical)
723 rmap_write_protect(vcpu->kvm, gfn);
724 if (new_page)
725 *new_page = 1;
726 return sp;
727}
728
729static void kvm_mmu_page_unlink_children(struct kvm *kvm,
730 struct kvm_mmu_page *sp)
731{
732 unsigned i;
733 u64 *pt;
734 u64 ent;
735
736 pt = sp->spt;
737
738 if (sp->role.level == PT_PAGE_TABLE_LEVEL) {
739 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
740 if (is_shadow_present_pte(pt[i]))
741 rmap_remove(kvm, &pt[i]);
742 pt[i] = shadow_trap_nonpresent_pte;
743 }
744 kvm_flush_remote_tlbs(kvm);
745 return;
746 }
747
748 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
749 ent = pt[i];
750
751 pt[i] = shadow_trap_nonpresent_pte;
752 if (!is_shadow_present_pte(ent))
753 continue;
754 ent &= PT64_BASE_ADDR_MASK;
755 mmu_page_remove_parent_pte(page_header(ent), &pt[i]);
756 }
757 kvm_flush_remote_tlbs(kvm);
758}
759
760static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte)
761{
762 mmu_page_remove_parent_pte(sp, parent_pte);
763}
764
765static void kvm_mmu_reset_last_pte_updated(struct kvm *kvm)
766{
767 int i;
768
769 for (i = 0; i < KVM_MAX_VCPUS; ++i)
770 if (kvm->vcpus[i])
771 kvm->vcpus[i]->arch.last_pte_updated = NULL;
772}
773
774static void kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
775{
776 u64 *parent_pte;
777
778 ++kvm->stat.mmu_shadow_zapped;
779 while (sp->multimapped || sp->parent_pte) {
780 if (!sp->multimapped)
781 parent_pte = sp->parent_pte;
782 else {
783 struct kvm_pte_chain *chain;
784
785 chain = container_of(sp->parent_ptes.first,
786 struct kvm_pte_chain, link);
787 parent_pte = chain->parent_ptes[0];
788 }
789 BUG_ON(!parent_pte);
790 kvm_mmu_put_page(sp, parent_pte);
791 set_shadow_pte(parent_pte, shadow_trap_nonpresent_pte);
792 }
793 kvm_mmu_page_unlink_children(kvm, sp);
794 if (!sp->root_count) {
795 hlist_del(&sp->hash_link);
796 kvm_mmu_free_page(kvm, sp);
797 } else
798 list_move(&sp->link, &kvm->arch.active_mmu_pages);
799 kvm_mmu_reset_last_pte_updated(kvm);
800}
801
/*
 * Change the number of mmu pages allocated to the vm.
 * Note: if kvm_nr_mmu_pages is too small, you will get a deadlock.
 */
806void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages)
807{
808 /*
809 * If we set the number of mmu pages to be smaller be than the
810 * number of actived pages , we must to free some mmu pages before we
811 * change the value
812 */
813
814 if ((kvm->arch.n_alloc_mmu_pages - kvm->arch.n_free_mmu_pages) >
815 kvm_nr_mmu_pages) {
816 int n_used_mmu_pages = kvm->arch.n_alloc_mmu_pages
817 - kvm->arch.n_free_mmu_pages;
818
819 while (n_used_mmu_pages > kvm_nr_mmu_pages) {
820 struct kvm_mmu_page *page;
821
822 page = container_of(kvm->arch.active_mmu_pages.prev,
823 struct kvm_mmu_page, link);
824 kvm_mmu_zap_page(kvm, page);
825 n_used_mmu_pages--;
826 }
827 kvm->arch.n_free_mmu_pages = 0;
828 }
829 else
830 kvm->arch.n_free_mmu_pages += kvm_nr_mmu_pages
831 - kvm->arch.n_alloc_mmu_pages;
832
833 kvm->arch.n_alloc_mmu_pages = kvm_nr_mmu_pages;
834}
835
836static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
837{
838 unsigned index;
839 struct hlist_head *bucket;
840 struct kvm_mmu_page *sp;
841 struct hlist_node *node, *n;
842 int r;
843
844 pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn);
845 r = 0;
846 index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
847 bucket = &kvm->arch.mmu_page_hash[index];
848 hlist_for_each_entry_safe(sp, node, n, bucket, hash_link)
849 if (sp->gfn == gfn && !sp->role.metaphysical) {
850 pgprintk("%s: gfn %lx role %x\n", __FUNCTION__, gfn,
851 sp->role.word);
852 kvm_mmu_zap_page(kvm, sp);
853 r = 1;
854 }
855 return r;
856}
857
858static void mmu_unshadow(struct kvm *kvm, gfn_t gfn)
859{
860 struct kvm_mmu_page *sp;
861
862 while ((sp = kvm_mmu_lookup_page(kvm, gfn)) != NULL) {
863 pgprintk("%s: zap %lx %x\n", __FUNCTION__, gfn, sp->role.word);
864 kvm_mmu_zap_page(kvm, sp);
865 }
866}
867
868static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn)
869{
870 int slot = memslot_id(kvm, gfn_to_memslot(kvm, gfn));
871 struct kvm_mmu_page *sp = page_header(__pa(pte));
872
873 __set_bit(slot, &sp->slot_bitmap);
874}
875
876struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva)
877{
878 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva);
879
880 if (gpa == UNMAPPED_GVA)
881 return NULL;
882 return gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
883}
884
885static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
886 unsigned pt_access, unsigned pte_access,
887 int user_fault, int write_fault, int dirty,
888 int *ptwrite, gfn_t gfn)
889{
890 u64 spte;
891 int was_rmapped = is_rmap_pte(*shadow_pte);
892 struct page *page;
893
894 pgprintk("%s: spte %llx access %x write_fault %d"
895 " user_fault %d gfn %lx\n",
896 __FUNCTION__, *shadow_pte, pt_access,
897 write_fault, user_fault, gfn);
898
899 /*
900 * We don't set the accessed bit, since we sometimes want to see
901 * whether the guest actually used the pte (in order to detect
902 * demand paging).
903 */
904 spte = PT_PRESENT_MASK | PT_DIRTY_MASK;
905 if (!dirty)
906 pte_access &= ~ACC_WRITE_MASK;
907 if (!(pte_access & ACC_EXEC_MASK))
908 spte |= PT64_NX_MASK;
909
910 page = gfn_to_page(vcpu->kvm, gfn);
911
912 spte |= PT_PRESENT_MASK;
913 if (pte_access & ACC_USER_MASK)
914 spte |= PT_USER_MASK;
915
916 if (is_error_page(page)) {
917 set_shadow_pte(shadow_pte,
918 shadow_trap_nonpresent_pte | PT_SHADOW_IO_MARK);
919 kvm_release_page_clean(page);
920 return;
921 }
922
923 spte |= page_to_phys(page);
924
925 if ((pte_access & ACC_WRITE_MASK)
926 || (write_fault && !is_write_protection(vcpu) && !user_fault)) {
927 struct kvm_mmu_page *shadow;
928
929 spte |= PT_WRITABLE_MASK;
930 if (user_fault) {
931 mmu_unshadow(vcpu->kvm, gfn);
932 goto unshadowed;
933 }
934
935 shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn);
936 if (shadow) {
937 pgprintk("%s: found shadow page for %lx, marking ro\n",
938 __FUNCTION__, gfn);
939 pte_access &= ~ACC_WRITE_MASK;
940 if (is_writeble_pte(spte)) {
941 spte &= ~PT_WRITABLE_MASK;
942 kvm_x86_ops->tlb_flush(vcpu);
943 }
944 if (write_fault)
945 *ptwrite = 1;
946 }
947 }
948
949unshadowed:
950
951 if (pte_access & ACC_WRITE_MASK)
952 mark_page_dirty(vcpu->kvm, gfn);
953
954 pgprintk("%s: setting spte %llx\n", __FUNCTION__, spte);
955 set_shadow_pte(shadow_pte, spte);
956 page_header_update_slot(vcpu->kvm, shadow_pte, gfn);
957 if (!was_rmapped) {
958 rmap_add(vcpu, shadow_pte, gfn);
959 if (!is_rmap_pte(*shadow_pte))
960 kvm_release_page_clean(page);
961 }
962 else
963 kvm_release_page_clean(page);
964 if (!ptwrite || !*ptwrite)
965 vcpu->arch.last_pte_updated = shadow_pte;
966}
967
/* new_cr3 callback for the non-paging context: intentionally a no-op. */
static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
{
}
971
972static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
973{
974 int level = PT32E_ROOT_LEVEL;
975 hpa_t table_addr = vcpu->arch.mmu.root_hpa;
976 int pt_write = 0;
977
978 for (; ; level--) {
979 u32 index = PT64_INDEX(v, level);
980 u64 *table;
981
982 ASSERT(VALID_PAGE(table_addr));
983 table = __va(table_addr);
984
985 if (level == 1) {
986 mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL,
987 0, write, 1, &pt_write, gfn);
988 return pt_write || is_io_pte(table[index]);
989 }
990
991 if (table[index] == shadow_trap_nonpresent_pte) {
992 struct kvm_mmu_page *new_table;
993 gfn_t pseudo_gfn;
994
995 pseudo_gfn = (v & PT64_DIR_BASE_ADDR_MASK)
996 >> PAGE_SHIFT;
997 new_table = kvm_mmu_get_page(vcpu, pseudo_gfn,
998 v, level - 1,
999 1, ACC_ALL, &table[index],
1000 NULL);
1001 if (!new_table) {
1002 pgprintk("nonpaging_map: ENOMEM\n");
1003 return -ENOMEM;
1004 }
1005
1006 table[index] = __pa(new_table->spt) | PT_PRESENT_MASK
1007 | PT_WRITABLE_MASK | PT_USER_MASK;
1008 }
1009 table_addr = table[index] & PT64_BASE_ADDR_MASK;
1010 }
1011}
1012
1013static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
1014 struct kvm_mmu_page *sp)
1015{
1016 int i;
1017
1018 for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
1019 sp->spt[i] = shadow_trap_nonpresent_pte;
1020}
1021
1022static void mmu_free_roots(struct kvm_vcpu *vcpu)
1023{
1024 int i;
1025 struct kvm_mmu_page *sp;
1026
1027 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
1028 return;
1029#ifdef CONFIG_X86_64
1030 if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
1031 hpa_t root = vcpu->arch.mmu.root_hpa;
1032
1033 sp = page_header(root);
1034 --sp->root_count;
1035 vcpu->arch.mmu.root_hpa = INVALID_PAGE;
1036 return;
1037 }
1038#endif
1039 for (i = 0; i < 4; ++i) {
1040 hpa_t root = vcpu->arch.mmu.pae_root[i];
1041
1042 if (root) {
1043 root &= PT64_BASE_ADDR_MASK;
1044 sp = page_header(root);
1045 --sp->root_count;
1046 }
1047 vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
1048 }
1049 vcpu->arch.mmu.root_hpa = INVALID_PAGE;
1050}
1051
1052static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
1053{
1054 int i;
1055 gfn_t root_gfn;
1056 struct kvm_mmu_page *sp;
1057
1058 root_gfn = vcpu->arch.cr3 >> PAGE_SHIFT;
1059
1060#ifdef CONFIG_X86_64
1061 if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
1062 hpa_t root = vcpu->arch.mmu.root_hpa;
1063
1064 ASSERT(!VALID_PAGE(root));
1065 sp = kvm_mmu_get_page(vcpu, root_gfn, 0,
1066 PT64_ROOT_LEVEL, 0, ACC_ALL, NULL, NULL);
1067 root = __pa(sp->spt);
1068 ++sp->root_count;
1069 vcpu->arch.mmu.root_hpa = root;
1070 return;
1071 }
1072#endif
1073 for (i = 0; i < 4; ++i) {
1074 hpa_t root = vcpu->arch.mmu.pae_root[i];
1075
1076 ASSERT(!VALID_PAGE(root));
1077 if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) {
1078 if (!is_present_pte(vcpu->arch.pdptrs[i])) {
1079 vcpu->arch.mmu.pae_root[i] = 0;
1080 continue;
1081 }
1082 root_gfn = vcpu->arch.pdptrs[i] >> PAGE_SHIFT;
1083 } else if (vcpu->arch.mmu.root_level == 0)
1084 root_gfn = 0;
1085 sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
1086 PT32_ROOT_LEVEL, !is_paging(vcpu),
1087 ACC_ALL, NULL, NULL);
1088 root = __pa(sp->spt);
1089 ++sp->root_count;
1090 vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK;
1091 }
1092 vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
1093}
1094
1095static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr)
1096{
1097 return vaddr;
1098}
1099
1100static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
1101 u32 error_code)
1102{
1103 gfn_t gfn;
1104 int r;
1105
1106 pgprintk("%s: gva %lx error %x\n", __FUNCTION__, gva, error_code);
1107 r = mmu_topup_memory_caches(vcpu);
1108 if (r)
1109 return r;
1110
1111 ASSERT(vcpu);
1112 ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
1113
1114 gfn = gva >> PAGE_SHIFT;
1115
1116 return nonpaging_map(vcpu, gva & PAGE_MASK,
1117 error_code & PFERR_WRITE_MASK, gfn);
1118}
1119
/* free callback for the non-paging context: release the shadow roots. */
static void nonpaging_free(struct kvm_vcpu *vcpu)
{
	mmu_free_roots(vcpu);
}
1124
1125static int nonpaging_init_context(struct kvm_vcpu *vcpu)
1126{
1127 struct kvm_mmu *context = &vcpu->arch.mmu;
1128
1129 context->new_cr3 = nonpaging_new_cr3;
1130 context->page_fault = nonpaging_page_fault;
1131 context->gva_to_gpa = nonpaging_gva_to_gpa;
1132 context->free = nonpaging_free;
1133 context->prefetch_page = nonpaging_prefetch_page;
1134 context->root_level = 0;
1135 context->shadow_root_level = PT32E_ROOT_LEVEL;
1136 context->root_hpa = INVALID_PAGE;
1137 return 0;
1138}
1139
1140void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
1141{
1142 ++vcpu->stat.tlb_flush;
1143 kvm_x86_ops->tlb_flush(vcpu);
1144}
1145
1146static void paging_new_cr3(struct kvm_vcpu *vcpu)
1147{
1148 pgprintk("%s: cr3 %lx\n", __FUNCTION__, vcpu->cr3);
1149 mmu_free_roots(vcpu);
1150}
1151
1152static void inject_page_fault(struct kvm_vcpu *vcpu,
1153 u64 addr,
1154 u32 err_code)
1155{
1156 kvm_inject_page_fault(vcpu, addr, err_code);
1157}
1158
/* free callback for the paging contexts: same as the non-paging one. */
static void paging_free(struct kvm_vcpu *vcpu)
{
	nonpaging_free(vcpu);
}
1163
1164#define PTTYPE 64
1165#include "paging_tmpl.h"
1166#undef PTTYPE
1167
1168#define PTTYPE 32
1169#include "paging_tmpl.h"
1170#undef PTTYPE
1171
1172static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level)
1173{
1174 struct kvm_mmu *context = &vcpu->arch.mmu;
1175
1176 ASSERT(is_pae(vcpu));
1177 context->new_cr3 = paging_new_cr3;
1178 context->page_fault = paging64_page_fault;
1179 context->gva_to_gpa = paging64_gva_to_gpa;
1180 context->prefetch_page = paging64_prefetch_page;
1181 context->free = paging_free;
1182 context->root_level = level;
1183 context->shadow_root_level = level;
1184 context->root_hpa = INVALID_PAGE;
1185 return 0;
1186}
1187
1188static int paging64_init_context(struct kvm_vcpu *vcpu)
1189{
1190 return paging64_init_context_common(vcpu, PT64_ROOT_LEVEL);
1191}
1192
1193static int paging32_init_context(struct kvm_vcpu *vcpu)
1194{
1195 struct kvm_mmu *context = &vcpu->arch.mmu;
1196
1197 context->new_cr3 = paging_new_cr3;
1198 context->page_fault = paging32_page_fault;
1199 context->gva_to_gpa = paging32_gva_to_gpa;
1200 context->free = paging_free;
1201 context->prefetch_page = paging32_prefetch_page;
1202 context->root_level = PT32_ROOT_LEVEL;
1203 context->shadow_root_level = PT32E_ROOT_LEVEL;
1204 context->root_hpa = INVALID_PAGE;
1205 return 0;
1206}
1207
1208static int paging32E_init_context(struct kvm_vcpu *vcpu)
1209{
1210 return paging64_init_context_common(vcpu, PT32E_ROOT_LEVEL);
1211}
1212
1213static int init_kvm_mmu(struct kvm_vcpu *vcpu)
1214{
1215 ASSERT(vcpu);
1216 ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
1217
1218 if (!is_paging(vcpu))
1219 return nonpaging_init_context(vcpu);
1220 else if (is_long_mode(vcpu))
1221 return paging64_init_context(vcpu);
1222 else if (is_pae(vcpu))
1223 return paging32E_init_context(vcpu);
1224 else
1225 return paging32_init_context(vcpu);
1226}
1227
1228static void destroy_kvm_mmu(struct kvm_vcpu *vcpu)
1229{
1230 ASSERT(vcpu);
1231 if (VALID_PAGE(vcpu->arch.mmu.root_hpa)) {
1232 vcpu->arch.mmu.free(vcpu);
1233 vcpu->arch.mmu.root_hpa = INVALID_PAGE;
1234 }
1235}
1236
/*
 * Drop the current MMU context and build a fresh one for the vcpu's
 * present paging mode.  Returns the result of init_kvm_mmu().
 */
int kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
{
	int r;

	destroy_kvm_mmu(vcpu);
	r = init_kvm_mmu(vcpu);

	return r;
}
EXPORT_SYMBOL_GPL(kvm_mmu_reset_context);
1243
1244int kvm_mmu_load(struct kvm_vcpu *vcpu)
1245{
1246 int r;
1247
1248 mutex_lock(&vcpu->kvm->lock);
1249 r = mmu_topup_memory_caches(vcpu);
1250 if (r)
1251 goto out;
1252 mmu_alloc_roots(vcpu);
1253 kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa);
1254 kvm_mmu_flush_tlb(vcpu);
1255out:
1256 mutex_unlock(&vcpu->kvm->lock);
1257 return r;
1258}
1259EXPORT_SYMBOL_GPL(kvm_mmu_load);
1260
/* Counterpart of kvm_mmu_load(): hands the vcpu to mmu_free_roots(). */
void kvm_mmu_unload(struct kvm_vcpu *vcpu)
{
	mmu_free_roots(vcpu);
}
1265
1266static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
1267 struct kvm_mmu_page *sp,
1268 u64 *spte)
1269{
1270 u64 pte;
1271 struct kvm_mmu_page *child;
1272
1273 pte = *spte;
1274 if (is_shadow_present_pte(pte)) {
1275 if (sp->role.level == PT_PAGE_TABLE_LEVEL)
1276 rmap_remove(vcpu->kvm, spte);
1277 else {
1278 child = page_header(pte & PT64_BASE_ADDR_MASK);
1279 mmu_page_remove_parent_pte(child, spte);
1280 }
1281 }
1282 set_shadow_pte(spte, shadow_trap_nonpresent_pte);
1283}
1284
1285static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
1286 struct kvm_mmu_page *sp,
1287 u64 *spte,
1288 const void *new, int bytes,
1289 int offset_in_pte)
1290{
1291 if (sp->role.level != PT_PAGE_TABLE_LEVEL) {
1292 ++vcpu->kvm->stat.mmu_pde_zapped;
1293 return;
1294 }
1295
1296 ++vcpu->kvm->stat.mmu_pte_updated;
1297 if (sp->role.glevels == PT32_ROOT_LEVEL)
1298 paging32_update_pte(vcpu, sp, spte, new, bytes, offset_in_pte);
1299 else
1300 paging64_update_pte(vcpu, sp, spte, new, bytes, offset_in_pte);
1301}
1302
1303static bool need_remote_flush(u64 old, u64 new)
1304{
1305 if (!is_shadow_present_pte(old))
1306 return false;
1307 if (!is_shadow_present_pte(new))
1308 return true;
1309 if ((old ^ new) & PT64_BASE_ADDR_MASK)
1310 return true;
1311 old ^= PT64_NX_MASK;
1312 new ^= PT64_NX_MASK;
1313 return (old & ~new & PT64_PERM_MASK) != 0;
1314}
1315
1316static void mmu_pte_write_flush_tlb(struct kvm_vcpu *vcpu, u64 old, u64 new)
1317{
1318 if (need_remote_flush(old, new))
1319 kvm_flush_remote_tlbs(vcpu->kvm);
1320 else
1321 kvm_mmu_flush_tlb(vcpu);
1322}
1323
1324static bool last_updated_pte_accessed(struct kvm_vcpu *vcpu)
1325{
1326 u64 *spte = vcpu->arch.last_pte_updated;
1327
1328 return !!(spte && (*spte & PT_ACCESSED_MASK));
1329}
1330
1331void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
1332 const u8 *new, int bytes)
1333{
1334 gfn_t gfn = gpa >> PAGE_SHIFT;
1335 struct kvm_mmu_page *sp;
1336 struct hlist_node *node, *n;
1337 struct hlist_head *bucket;
1338 unsigned index;
1339 u64 entry;
1340 u64 *spte;
1341 unsigned offset = offset_in_page(gpa);
1342 unsigned pte_size;
1343 unsigned page_offset;
1344 unsigned misaligned;
1345 unsigned quadrant;
1346 int level;
1347 int flooded = 0;
1348 int npte;
1349
1350 pgprintk("%s: gpa %llx bytes %d\n", __FUNCTION__, gpa, bytes);
1351 ++vcpu->kvm->stat.mmu_pte_write;
1352 kvm_mmu_audit(vcpu, "pre pte write");
1353 if (gfn == vcpu->arch.last_pt_write_gfn
1354 && !last_updated_pte_accessed(vcpu)) {
1355 ++vcpu->arch.last_pt_write_count;
1356 if (vcpu->arch.last_pt_write_count >= 3)
1357 flooded = 1;
1358 } else {
1359 vcpu->arch.last_pt_write_gfn = gfn;
1360 vcpu->arch.last_pt_write_count = 1;
1361 vcpu->arch.last_pte_updated = NULL;
1362 }
1363 index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
1364 bucket = &vcpu->kvm->arch.mmu_page_hash[index];
1365 hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) {
1366 if (sp->gfn != gfn || sp->role.metaphysical)
1367 continue;
1368 pte_size = sp->role.glevels == PT32_ROOT_LEVEL ? 4 : 8;
1369 misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
1370 misaligned |= bytes < 4;
1371 if (misaligned || flooded) {
1372 /*
1373 * Misaligned accesses are too much trouble to fix
1374 * up; also, they usually indicate a page is not used
1375 * as a page table.
1376 *
1377 * If we're seeing too many writes to a page,
1378 * it may no longer be a page table, or we may be
1379 * forking, in which case it is better to unmap the
1380 * page.
1381 */
1382 pgprintk("misaligned: gpa %llx bytes %d role %x\n",
1383 gpa, bytes, sp->role.word);
1384 kvm_mmu_zap_page(vcpu->kvm, sp);
1385 ++vcpu->kvm->stat.mmu_flooded;
1386 continue;
1387 }
1388 page_offset = offset;
1389 level = sp->role.level;
1390 npte = 1;
1391 if (sp->role.glevels == PT32_ROOT_LEVEL) {
1392 page_offset <<= 1; /* 32->64 */
1393 /*
1394 * A 32-bit pde maps 4MB while the shadow pdes map
1395 * only 2MB. So we need to double the offset again
1396 * and zap two pdes instead of one.
1397 */
1398 if (level == PT32_ROOT_LEVEL) {
1399 page_offset &= ~7; /* kill rounding error */
1400 page_offset <<= 1;
1401 npte = 2;
1402 }
1403 quadrant = page_offset >> PAGE_SHIFT;
1404 page_offset &= ~PAGE_MASK;
1405 if (quadrant != sp->role.quadrant)
1406 continue;
1407 }
1408 spte = &sp->spt[page_offset / sizeof(*spte)];
1409 while (npte--) {
1410 entry = *spte;
1411 mmu_pte_write_zap_pte(vcpu, sp, spte);
1412 mmu_pte_write_new_pte(vcpu, sp, spte, new, bytes,
1413 page_offset & (pte_size - 1));
1414 mmu_pte_write_flush_tlb(vcpu, entry, *spte);
1415 ++spte;
1416 }
1417 }
1418 kvm_mmu_audit(vcpu, "post pte write");
1419}
1420
1421int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
1422{
1423 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva);
1424
1425 return kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
1426}
1427
1428void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
1429{
1430 while (vcpu->kvm->arch.n_free_mmu_pages < KVM_REFILL_PAGES) {
1431 struct kvm_mmu_page *sp;
1432
1433 sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev,
1434 struct kvm_mmu_page, link);
1435 kvm_mmu_zap_page(vcpu->kvm, sp);
1436 ++vcpu->kvm->stat.mmu_recycled;
1437 }
1438}
1439
1440int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
1441{
1442 int r;
1443 enum emulation_result er;
1444
1445 mutex_lock(&vcpu->kvm->lock);
1446 r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code);
1447 if (r < 0)
1448 goto out;
1449
1450 if (!r) {
1451 r = 1;
1452 goto out;
1453 }
1454
1455 r = mmu_topup_memory_caches(vcpu);
1456 if (r)
1457 goto out;
1458
1459 er = emulate_instruction(vcpu, vcpu->run, cr2, error_code, 0);
1460 mutex_unlock(&vcpu->kvm->lock);
1461
1462 switch (er) {
1463 case EMULATE_DONE:
1464 return 1;
1465 case EMULATE_DO_MMIO:
1466 ++vcpu->stat.mmio_exits;
1467 return 0;
1468 case EMULATE_FAIL:
1469 kvm_report_emulation_failure(vcpu, "pagetable");
1470 return 1;
1471 default:
1472 BUG();
1473 }
1474out:
1475 mutex_unlock(&vcpu->kvm->lock);
1476 return r;
1477}
1478EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
1479
1480static void free_mmu_pages(struct kvm_vcpu *vcpu)
1481{
1482 struct kvm_mmu_page *sp;
1483
1484 while (!list_empty(&vcpu->kvm->arch.active_mmu_pages)) {
1485 sp = container_of(vcpu->kvm->arch.active_mmu_pages.next,
1486 struct kvm_mmu_page, link);
1487 kvm_mmu_zap_page(vcpu->kvm, sp);
1488 }
1489 free_page((unsigned long)vcpu->arch.mmu.pae_root);
1490}
1491
1492static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
1493{
1494 struct page *page;
1495 int i;
1496
1497 ASSERT(vcpu);
1498
1499 if (vcpu->kvm->arch.n_requested_mmu_pages)
1500 vcpu->kvm->arch.n_free_mmu_pages =
1501 vcpu->kvm->arch.n_requested_mmu_pages;
1502 else
1503 vcpu->kvm->arch.n_free_mmu_pages =
1504 vcpu->kvm->arch.n_alloc_mmu_pages;
1505 /*
1506 * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64.
1507 * Therefore we need to allocate shadow page tables in the first
1508 * 4GB of memory, which happens to fit the DMA32 zone.
1509 */
1510 page = alloc_page(GFP_KERNEL | __GFP_DMA32);
1511 if (!page)
1512 goto error_1;
1513 vcpu->arch.mmu.pae_root = page_address(page);
1514 for (i = 0; i < 4; ++i)
1515 vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
1516
1517 return 0;
1518
1519error_1:
1520 free_mmu_pages(vcpu);
1521 return -ENOMEM;
1522}
1523
1524int kvm_mmu_create(struct kvm_vcpu *vcpu)
1525{
1526 ASSERT(vcpu);
1527 ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
1528
1529 return alloc_mmu_pages(vcpu);
1530}
1531
1532int kvm_mmu_setup(struct kvm_vcpu *vcpu)
1533{
1534 ASSERT(vcpu);
1535 ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
1536
1537 return init_kvm_mmu(vcpu);
1538}
1539
/*
 * Release everything the MMU holds for this vcpu, in order: the MMU
 * context, the shadow pages together with pae_root, and finally the
 * memory caches.
 */
void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
{
	ASSERT(vcpu);

	destroy_kvm_mmu(vcpu);

	free_mmu_pages(vcpu);

	mmu_free_memory_caches(vcpu);
}
1548
1549void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
1550{
1551 struct kvm_mmu_page *sp;
1552
1553 list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) {
1554 int i;
1555 u64 *pt;
1556
1557 if (!test_bit(slot, &sp->slot_bitmap))
1558 continue;
1559
1560 pt = sp->spt;
1561 for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
1562 /* avoid RMW */
1563 if (pt[i] & PT_WRITABLE_MASK)
1564 pt[i] &= ~PT_WRITABLE_MASK;
1565 }
1566}
1567
1568void kvm_mmu_zap_all(struct kvm *kvm)
1569{
1570 struct kvm_mmu_page *sp, *node;
1571
1572 list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link)
1573 kvm_mmu_zap_page(kvm, sp);
1574
1575 kvm_flush_remote_tlbs(kvm);
1576}
1577
1578void kvm_mmu_module_exit(void)
1579{
1580 if (pte_chain_cache)
1581 kmem_cache_destroy(pte_chain_cache);
1582 if (rmap_desc_cache)
1583 kmem_cache_destroy(rmap_desc_cache);
1584 if (mmu_page_header_cache)
1585 kmem_cache_destroy(mmu_page_header_cache);
1586}
1587
1588int kvm_mmu_module_init(void)
1589{
1590 pte_chain_cache = kmem_cache_create("kvm_pte_chain",
1591 sizeof(struct kvm_pte_chain),
1592 0, 0, NULL);
1593 if (!pte_chain_cache)
1594 goto nomem;
1595 rmap_desc_cache = kmem_cache_create("kvm_rmap_desc",
1596 sizeof(struct kvm_rmap_desc),
1597 0, 0, NULL);
1598 if (!rmap_desc_cache)
1599 goto nomem;
1600
1601 mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header",
1602 sizeof(struct kvm_mmu_page),
1603 0, 0, NULL);
1604 if (!mmu_page_header_cache)
1605 goto nomem;
1606
1607 return 0;
1608
1609nomem:
1610 kvm_mmu_module_exit();
1611 return -ENOMEM;
1612}
1613
1614/*
1615 * Caculate mmu pages needed for kvm.
1616 */
1617unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm)
1618{
1619 int i;
1620 unsigned int nr_mmu_pages;
1621 unsigned int nr_pages = 0;
1622
1623 for (i = 0; i < kvm->nmemslots; i++)
1624 nr_pages += kvm->memslots[i].npages;
1625
1626 nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000;
1627 nr_mmu_pages = max(nr_mmu_pages,
1628 (unsigned int) KVM_MIN_ALLOC_MMU_PAGES);
1629
1630 return nr_mmu_pages;
1631}
1632
1633#ifdef AUDIT
1634
/* Label of the audit pass in progress; set by kvm_mmu_audit() and printed by the audit helpers. */
1635static const char *audit_msg;
1636
1637static gva_t canonicalize(gva_t gva)
1638{
1639#ifdef CONFIG_X86_64
1640 gva = (long long)(gva << 16) >> 16;
1641#endif
1642 return gva;
1643}
1644
1645static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
1646 gva_t va, int level)
1647{
1648 u64 *pt = __va(page_pte & PT64_BASE_ADDR_MASK);
1649 int i;
1650 gva_t va_delta = 1ul << (PAGE_SHIFT + 9 * (level - 1));
1651
1652 for (i = 0; i < PT64_ENT_PER_PAGE; ++i, va += va_delta) {
1653 u64 ent = pt[i];
1654
1655 if (ent == shadow_trap_nonpresent_pte)
1656 continue;
1657
1658 va = canonicalize(va);
1659 if (level > 1) {
1660 if (ent == shadow_notrap_nonpresent_pte)
1661 printk(KERN_ERR "audit: (%s) nontrapping pte"
1662 " in nonleaf level: levels %d gva %lx"
1663 " level %d pte %llx\n", audit_msg,
1664 vcpu->arch.mmu.root_level, va, level, ent);
1665
1666 audit_mappings_page(vcpu, ent, va, level - 1);
1667 } else {
1668 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, va);
1669 struct page *page = gpa_to_page(vcpu, gpa);
1670 hpa_t hpa = page_to_phys(page);
1671
1672 if (is_shadow_present_pte(ent)
1673 && (ent & PT64_BASE_ADDR_MASK) != hpa)
1674 printk(KERN_ERR "xx audit error: (%s) levels %d"
1675 " gva %lx gpa %llx hpa %llx ent %llx %d\n",
1676 audit_msg, vcpu->arch.mmu.root_level,
1677 va, gpa, hpa, ent,
1678 is_shadow_present_pte(ent));
1679 else if (ent == shadow_notrap_nonpresent_pte
1680 && !is_error_hpa(hpa))
1681 printk(KERN_ERR "audit: (%s) notrap shadow,"
1682 " valid guest gva %lx\n", audit_msg, va);
1683 kvm_release_page_clean(page);
1684
1685 }
1686 }
1687}
1688
1689static void audit_mappings(struct kvm_vcpu *vcpu)
1690{
1691 unsigned i;
1692
1693 if (vcpu->arch.mmu.root_level == 4)
1694 audit_mappings_page(vcpu, vcpu->arch.mmu.root_hpa, 0, 4);
1695 else
1696 for (i = 0; i < 4; ++i)
1697 if (vcpu->arch.mmu.pae_root[i] & PT_PRESENT_MASK)
1698 audit_mappings_page(vcpu,
1699 vcpu->arch.mmu.pae_root[i],
1700 i << 30,
1701 2);
1702}
1703
1704static int count_rmaps(struct kvm_vcpu *vcpu)
1705{
1706 int nmaps = 0;
1707 int i, j, k;
1708
1709 for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
1710 struct kvm_memory_slot *m = &vcpu->kvm->memslots[i];
1711 struct kvm_rmap_desc *d;
1712
1713 for (j = 0; j < m->npages; ++j) {
1714 unsigned long *rmapp = &m->rmap[j];
1715
1716 if (!*rmapp)
1717 continue;
1718 if (!(*rmapp & 1)) {
1719 ++nmaps;
1720 continue;
1721 }
1722 d = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
1723 while (d) {
1724 for (k = 0; k < RMAP_EXT; ++k)
1725 if (d->shadow_ptes[k])
1726 ++nmaps;
1727 else
1728 break;
1729 d = d->more;
1730 }
1731 }
1732 }
1733 return nmaps;
1734}
1735
1736static int count_writable_mappings(struct kvm_vcpu *vcpu)
1737{
1738 int nmaps = 0;
1739 struct kvm_mmu_page *sp;
1740 int i;
1741
1742 list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) {
1743 u64 *pt = sp->spt;
1744
1745 if (sp->role.level != PT_PAGE_TABLE_LEVEL)
1746 continue;
1747
1748 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
1749 u64 ent = pt[i];
1750
1751 if (!(ent & PT_PRESENT_MASK))
1752 continue;
1753 if (!(ent & PT_WRITABLE_MASK))
1754 continue;
1755 ++nmaps;
1756 }
1757 }
1758 return nmaps;
1759}
1760
1761static void audit_rmap(struct kvm_vcpu *vcpu)
1762{
1763 int n_rmap = count_rmaps(vcpu);
1764 int n_actual = count_writable_mappings(vcpu);
1765
1766 if (n_rmap != n_actual)
1767 printk(KERN_ERR "%s: (%s) rmap %d actual %d\n",
1768 __FUNCTION__, audit_msg, n_rmap, n_actual);
1769}
1770
1771static void audit_write_protection(struct kvm_vcpu *vcpu)
1772{
1773 struct kvm_mmu_page *sp;
1774 struct kvm_memory_slot *slot;
1775 unsigned long *rmapp;
1776 gfn_t gfn;
1777
1778 list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) {
1779 if (sp->role.metaphysical)
1780 continue;
1781
1782 slot = gfn_to_memslot(vcpu->kvm, sp->gfn);
1783 gfn = unalias_gfn(vcpu->kvm, sp->gfn);
1784 rmapp = &slot->rmap[gfn - slot->base_gfn];
1785 if (*rmapp)
1786 printk(KERN_ERR "%s: (%s) shadow page has writable"
1787 " mappings: gfn %lx role %x\n",
1788 __FUNCTION__, audit_msg, sp->gfn,
1789 sp->role.word);
1790 }
1791}
1792
1793static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg)
1794{
1795 int olddbg = dbg;
1796
1797 dbg = 0;
1798 audit_msg = msg;
1799 audit_rmap(vcpu);
1800 audit_write_protection(vcpu);
1801 audit_mappings(vcpu);
1802 dbg = olddbg;
1803}
1804
1805#endif
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
new file mode 100644
index 000000000000..1fce19ec7a23
--- /dev/null
+++ b/arch/x86/kvm/mmu.h
@@ -0,0 +1,44 @@
1#ifndef __KVM_X86_MMU_H
2#define __KVM_X86_MMU_H
3
4#include <linux/kvm_host.h>
5
6static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
7{
8 if (unlikely(vcpu->kvm->arch.n_free_mmu_pages < KVM_MIN_FREE_MMU_PAGES))
9 __kvm_mmu_free_some_pages(vcpu);
10}
11
12static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu)
13{
14 if (likely(vcpu->arch.mmu.root_hpa != INVALID_PAGE))
15 return 0;
16
17 return kvm_mmu_load(vcpu);
18}
19
/*
 * Non-zero when EFER_LME is set in the vcpu's shadow EFER.  Always 0 on
 * builds without CONFIG_X86_64.
 */
static inline int is_long_mode(struct kvm_vcpu *vcpu)
{
#ifndef CONFIG_X86_64
	return 0;
#else
	return vcpu->arch.shadow_efer & EFER_LME;
#endif
}
28
29static inline int is_pae(struct kvm_vcpu *vcpu)
30{
31 return vcpu->arch.cr4 & X86_CR4_PAE;
32}
33
34static inline int is_pse(struct kvm_vcpu *vcpu)
35{
36 return vcpu->arch.cr4 & X86_CR4_PSE;
37}
38
39static inline int is_paging(struct kvm_vcpu *vcpu)
40{
41 return vcpu->arch.cr0 & X86_CR0_PG;
42}
43
44#endif
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
new file mode 100644
index 000000000000..56b88f7e83ef
--- /dev/null
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -0,0 +1,461 @@
1/*
2 * Kernel-based Virtual Machine driver for Linux
3 *
4 * This module enables machines with Intel VT-x extensions to run virtual
5 * machines without emulation or binary translation.
6 *
7 * MMU support
8 *
9 * Copyright (C) 2006 Qumranet, Inc.
10 *
11 * Authors:
12 * Yaniv Kamay <yaniv@qumranet.com>
13 * Avi Kivity <avi@qumranet.com>
14 *
15 * This work is licensed under the terms of the GNU GPL, version 2. See
16 * the COPYING file in the top-level directory.
17 *
18 */
19
20/*
21 * We need the mmu code to access both 32-bit and 64-bit guest ptes,
22 * so the code in this file is compiled twice, once per pte size.
23 */
24
25#if PTTYPE == 64
26 #define pt_element_t u64
27 #define guest_walker guest_walker64
28 #define FNAME(name) paging##64_##name
29 #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK
30 #define PT_DIR_BASE_ADDR_MASK PT64_DIR_BASE_ADDR_MASK
31 #define PT_INDEX(addr, level) PT64_INDEX(addr, level)
32 #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
33 #define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level)
34 #define PT_LEVEL_BITS PT64_LEVEL_BITS
35 #ifdef CONFIG_X86_64
36 #define PT_MAX_FULL_LEVELS 4
37 #define CMPXCHG cmpxchg
38 #else
39 #define CMPXCHG cmpxchg64
40 #define PT_MAX_FULL_LEVELS 2
41 #endif
42#elif PTTYPE == 32
43 #define pt_element_t u32
44 #define guest_walker guest_walker32
45 #define FNAME(name) paging##32_##name
46 #define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK
47 #define PT_DIR_BASE_ADDR_MASK PT32_DIR_BASE_ADDR_MASK
48 #define PT_INDEX(addr, level) PT32_INDEX(addr, level)
49 #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
50 #define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level)
51 #define PT_LEVEL_BITS PT32_LEVEL_BITS
52 #define PT_MAX_FULL_LEVELS 2
53 #define CMPXCHG cmpxchg
54#else
55 #error Invalid PTTYPE value
56#endif
57
58#define gpte_to_gfn FNAME(gpte_to_gfn)
59#define gpte_to_gfn_pde FNAME(gpte_to_gfn_pde)
60
61/*
62 * The guest_walker structure emulates the behavior of the hardware page
63 * table walker.
64 */
/* Result of one walk_addr() pass; the per-level arrays are indexed by level - 1. */
65struct guest_walker {
66 int level; /* level at which the walk stopped */
67 gfn_t table_gfn[PT_MAX_FULL_LEVELS]; /* frame of the guest table consulted at each level */
68 pt_element_t ptes[PT_MAX_FULL_LEVELS]; /* guest pte read at each level */
69 gpa_t pte_gpa[PT_MAX_FULL_LEVELS]; /* guest physical address of that pte */
70 unsigned pt_access; /* access bits accumulated from the levels above the final pte */
71 unsigned pte_access; /* pt_access further restricted by the final pte */
72 gfn_t gfn; /* guest frame the address translates to */
73 u32 error_code; /* PFERR_* bits, filled in when the walk fails */
74};
75
76static gfn_t gpte_to_gfn(pt_element_t gpte)
77{
78 return (gpte & PT_BASE_ADDR_MASK) >> PAGE_SHIFT;
79}
80
81static gfn_t gpte_to_gfn_pde(pt_element_t gpte)
82{
83 return (gpte & PT_DIR_BASE_ADDR_MASK) >> PAGE_SHIFT;
84}
85
86static bool FNAME(cmpxchg_gpte)(struct kvm *kvm,
87 gfn_t table_gfn, unsigned index,
88 pt_element_t orig_pte, pt_element_t new_pte)
89{
90 pt_element_t ret;
91 pt_element_t *table;
92 struct page *page;
93
94 page = gfn_to_page(kvm, table_gfn);
95 table = kmap_atomic(page, KM_USER0);
96
97 ret = CMPXCHG(&table[index], orig_pte, new_pte);
98
99 kunmap_atomic(table, KM_USER0);
100
101 kvm_release_page_dirty(page);
102
103 return (ret != orig_pte);
104}
105
106static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte)
107{
108 unsigned access;
109
110 access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK;
111#if PTTYPE == 64
112 if (is_nx(vcpu))
113 access &= ~(gpte >> PT64_NX_SHIFT);
114#endif
115 return access;
116}
117
118/*
119 * Fetch a guest pte for a guest virtual address
120 */
121static int FNAME(walk_addr)(struct guest_walker *walker,
122 struct kvm_vcpu *vcpu, gva_t addr,
123 int write_fault, int user_fault, int fetch_fault)
124{
125 pt_element_t pte;
126 gfn_t table_gfn;
127 unsigned index, pt_access, pte_access;
128 gpa_t pte_gpa;
129
130 pgprintk("%s: addr %lx\n", __FUNCTION__, addr);
131walk:
132 walker->level = vcpu->arch.mmu.root_level;
133 pte = vcpu->arch.cr3;
134#if PTTYPE == 64
135 if (!is_long_mode(vcpu)) {
136 pte = vcpu->arch.pdptrs[(addr >> 30) & 3];
137 if (!is_present_pte(pte))
138 goto not_present;
139 --walker->level;
140 }
141#endif
142 ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) ||
143 (vcpu->cr3 & CR3_NONPAE_RESERVED_BITS) == 0);
144
145 pt_access = ACC_ALL;
146
147 for (;;) {
148 index = PT_INDEX(addr, walker->level);
149
150 table_gfn = gpte_to_gfn(pte);
151 pte_gpa = gfn_to_gpa(table_gfn);
152 pte_gpa += index * sizeof(pt_element_t);
153 walker->table_gfn[walker->level - 1] = table_gfn;
154 walker->pte_gpa[walker->level - 1] = pte_gpa;
155 pgprintk("%s: table_gfn[%d] %lx\n", __FUNCTION__,
156 walker->level - 1, table_gfn);
157
158 kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte));
159
160 if (!is_present_pte(pte))
161 goto not_present;
162
163 if (write_fault && !is_writeble_pte(pte))
164 if (user_fault || is_write_protection(vcpu))
165 goto access_error;
166
167 if (user_fault && !(pte & PT_USER_MASK))
168 goto access_error;
169
170#if PTTYPE == 64
171 if (fetch_fault && is_nx(vcpu) && (pte & PT64_NX_MASK))
172 goto access_error;
173#endif
174
175 if (!(pte & PT_ACCESSED_MASK)) {
176 mark_page_dirty(vcpu->kvm, table_gfn);
177 if (FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn,
178 index, pte, pte|PT_ACCESSED_MASK))
179 goto walk;
180 pte |= PT_ACCESSED_MASK;
181 }
182
183 pte_access = pt_access & FNAME(gpte_access)(vcpu, pte);
184
185 walker->ptes[walker->level - 1] = pte;
186
187 if (walker->level == PT_PAGE_TABLE_LEVEL) {
188 walker->gfn = gpte_to_gfn(pte);
189 break;
190 }
191
192 if (walker->level == PT_DIRECTORY_LEVEL
193 && (pte & PT_PAGE_SIZE_MASK)
194 && (PTTYPE == 64 || is_pse(vcpu))) {
195 walker->gfn = gpte_to_gfn_pde(pte);
196 walker->gfn += PT_INDEX(addr, PT_PAGE_TABLE_LEVEL);
197 if (PTTYPE == 32 && is_cpuid_PSE36())
198 walker->gfn += pse36_gfn_delta(pte);
199 break;
200 }
201
202 pt_access = pte_access;
203 --walker->level;
204 }
205
206 if (write_fault && !is_dirty_pte(pte)) {
207 bool ret;
208
209 mark_page_dirty(vcpu->kvm, table_gfn);
210 ret = FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, index, pte,
211 pte|PT_DIRTY_MASK);
212 if (ret)
213 goto walk;
214 pte |= PT_DIRTY_MASK;
215 kvm_mmu_pte_write(vcpu, pte_gpa, (u8 *)&pte, sizeof(pte));
216 walker->ptes[walker->level - 1] = pte;
217 }
218
219 walker->pt_access = pt_access;
220 walker->pte_access = pte_access;
221 pgprintk("%s: pte %llx pte_access %x pt_access %x\n",
222 __FUNCTION__, (u64)pte, pt_access, pte_access);
223 return 1;
224
225not_present:
226 walker->error_code = 0;
227 goto err;
228
229access_error:
230 walker->error_code = PFERR_PRESENT_MASK;
231
232err:
233 if (write_fault)
234 walker->error_code |= PFERR_WRITE_MASK;
235 if (user_fault)
236 walker->error_code |= PFERR_USER_MASK;
237 if (fetch_fault)
238 walker->error_code |= PFERR_FETCH_MASK;
239 return 0;
240}
241
242static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
243 u64 *spte, const void *pte, int bytes,
244 int offset_in_pte)
245{
246 pt_element_t gpte;
247 unsigned pte_access;
248
249 gpte = *(const pt_element_t *)pte;
250 if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) {
251 if (!offset_in_pte && !is_present_pte(gpte))
252 set_shadow_pte(spte, shadow_notrap_nonpresent_pte);
253 return;
254 }
255 if (bytes < sizeof(pt_element_t))
256 return;
257 pgprintk("%s: gpte %llx spte %p\n", __FUNCTION__, (u64)gpte, spte);
258 pte_access = page->role.access & FNAME(gpte_access)(vcpu, gpte);
259 mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0,
260 gpte & PT_DIRTY_MASK, NULL, gpte_to_gfn(gpte));
261}
262
263/*
264 * Fetch a shadow pte for a specific level in the paging hierarchy.
265 */
266static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
267 struct guest_walker *walker,
268 int user_fault, int write_fault, int *ptwrite)
269{
270 hpa_t shadow_addr;
271 int level;
272 u64 *shadow_ent;
273 unsigned access = walker->pt_access;
274
275 if (!is_present_pte(walker->ptes[walker->level - 1]))
276 return NULL;
277
278 shadow_addr = vcpu->arch.mmu.root_hpa;
279 level = vcpu->arch.mmu.shadow_root_level;
280 if (level == PT32E_ROOT_LEVEL) {
281 shadow_addr = vcpu->arch.mmu.pae_root[(addr >> 30) & 3];
282 shadow_addr &= PT64_BASE_ADDR_MASK;
283 --level;
284 }
285
286 for (; ; level--) {
287 u32 index = SHADOW_PT_INDEX(addr, level);
288 struct kvm_mmu_page *shadow_page;
289 u64 shadow_pte;
290 int metaphysical;
291 gfn_t table_gfn;
292 bool new_page = 0;
293
294 shadow_ent = ((u64 *)__va(shadow_addr)) + index;
295 if (is_shadow_present_pte(*shadow_ent)) {
296 if (level == PT_PAGE_TABLE_LEVEL)
297 break;
298 shadow_addr = *shadow_ent & PT64_BASE_ADDR_MASK;
299 continue;
300 }
301
302 if (level == PT_PAGE_TABLE_LEVEL)
303 break;
304
305 if (level - 1 == PT_PAGE_TABLE_LEVEL
306 && walker->level == PT_DIRECTORY_LEVEL) {
307 metaphysical = 1;
308 if (!is_dirty_pte(walker->ptes[level - 1]))
309 access &= ~ACC_WRITE_MASK;
310 table_gfn = gpte_to_gfn(walker->ptes[level - 1]);
311 } else {
312 metaphysical = 0;
313 table_gfn = walker->table_gfn[level - 2];
314 }
315 shadow_page = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1,
316 metaphysical, access,
317 shadow_ent, &new_page);
318 if (new_page && !metaphysical) {
319 pt_element_t curr_pte;
320 kvm_read_guest(vcpu->kvm, walker->pte_gpa[level - 2],
321 &curr_pte, sizeof(curr_pte));
322 if (curr_pte != walker->ptes[level - 2])
323 return NULL;
324 }
325 shadow_addr = __pa(shadow_page->spt);
326 shadow_pte = shadow_addr | PT_PRESENT_MASK | PT_ACCESSED_MASK
327 | PT_WRITABLE_MASK | PT_USER_MASK;
328 *shadow_ent = shadow_pte;
329 }
330
331 mmu_set_spte(vcpu, shadow_ent, access, walker->pte_access & access,
332 user_fault, write_fault,
333 walker->ptes[walker->level-1] & PT_DIRTY_MASK,
334 ptwrite, walker->gfn);
335
336 return shadow_ent;
337}
338
339/*
340 * Page fault handler. There are several causes for a page fault:
341 * - there is no shadow pte for the guest pte
342 * - write access through a shadow pte marked read only so that we can set
343 * the dirty bit
344 * - write access to a shadow pte marked read only so we can update the page
345 * dirty bitmap, when userspace requests it
346 * - mmio access; in this case we will never install a present shadow pte
347 * - normal guest page fault due to the guest pte marked not present, not
348 * writable, or not executable
349 *
350 * Returns: 1 if we need to emulate the instruction, 0 otherwise, or
351 * a negative value on error.
352 */
353static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
354 u32 error_code)
355{
356 int write_fault = error_code & PFERR_WRITE_MASK;
357 int user_fault = error_code & PFERR_USER_MASK;
358 int fetch_fault = error_code & PFERR_FETCH_MASK;
359 struct guest_walker walker;
360 u64 *shadow_pte;
361 int write_pt = 0;
362 int r;
363
364 pgprintk("%s: addr %lx err %x\n", __FUNCTION__, addr, error_code);
365 kvm_mmu_audit(vcpu, "pre page fault");
366
367 r = mmu_topup_memory_caches(vcpu);
368 if (r)
369 return r;
370
371 /*
372 * Look up the shadow pte for the faulting address.
373 */
374 r = FNAME(walk_addr)(&walker, vcpu, addr, write_fault, user_fault,
375 fetch_fault);
376
377 /*
378 * The page is not mapped by the guest. Let the guest handle it.
379 */
380 if (!r) {
381 pgprintk("%s: guest page fault\n", __FUNCTION__);
382 inject_page_fault(vcpu, addr, walker.error_code);
383 vcpu->arch.last_pt_write_count = 0; /* reset fork detector */
384 return 0;
385 }
386
387 shadow_pte = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
388 &write_pt);
389 pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __FUNCTION__,
390 shadow_pte, *shadow_pte, write_pt);
391
392 if (!write_pt)
393 vcpu->arch.last_pt_write_count = 0; /* reset fork detector */
394
395 /*
396 * mmio: emulate if accessible, otherwise its a guest fault.
397 */
398 if (shadow_pte && is_io_pte(*shadow_pte))
399 return 1;
400
401 ++vcpu->stat.pf_fixed;
402 kvm_mmu_audit(vcpu, "post page fault (fixed)");
403
404 return write_pt;
405}
406
407static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr)
408{
409 struct guest_walker walker;
410 gpa_t gpa = UNMAPPED_GVA;
411 int r;
412
413 r = FNAME(walk_addr)(&walker, vcpu, vaddr, 0, 0, 0);
414
415 if (r) {
416 gpa = gfn_to_gpa(walker.gfn);
417 gpa |= vaddr & ~PAGE_MASK;
418 }
419
420 return gpa;
421}
422
423static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu,
424 struct kvm_mmu_page *sp)
425{
426 int i, offset = 0;
427 pt_element_t *gpt;
428 struct page *page;
429
430 if (sp->role.metaphysical
431 || (PTTYPE == 32 && sp->role.level > PT_PAGE_TABLE_LEVEL)) {
432 nonpaging_prefetch_page(vcpu, sp);
433 return;
434 }
435
436 if (PTTYPE == 32)
437 offset = sp->role.quadrant << PT64_LEVEL_BITS;
438 page = gfn_to_page(vcpu->kvm, sp->gfn);
439 gpt = kmap_atomic(page, KM_USER0);
440 for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
441 if (is_present_pte(gpt[offset + i]))
442 sp->spt[i] = shadow_trap_nonpresent_pte;
443 else
444 sp->spt[i] = shadow_notrap_nonpresent_pte;
445 kunmap_atomic(gpt, KM_USER0);
446 kvm_release_page_clean(page);
447}
448
449#undef pt_element_t
450#undef guest_walker
451#undef FNAME
452#undef PT_BASE_ADDR_MASK
453#undef PT_INDEX
454#undef SHADOW_PT_INDEX
455#undef PT_LEVEL_MASK
456#undef PT_DIR_BASE_ADDR_MASK
457#undef PT_LEVEL_BITS
458#undef PT_MAX_FULL_LEVELS
459#undef gpte_to_gfn
460#undef gpte_to_gfn_pde
461#undef CMPXCHG
diff --git a/arch/x86/kvm/segment_descriptor.h b/arch/x86/kvm/segment_descriptor.h
new file mode 100644
index 000000000000..56fc4c873389
--- /dev/null
+++ b/arch/x86/kvm/segment_descriptor.h
@@ -0,0 +1,29 @@
1#ifndef __SEGMENT_DESCRIPTOR_H
2#define __SEGMENT_DESCRIPTOR_H
3
/*
 * Packed descriptor layout: two 16-bit fields followed by four bytes, two of
 * which are split into bit-fields.
 * NOTE(review): field names suggest the x86 segment descriptor format; confirm
 * the bit meanings against the architecture manual.
 */
4struct segment_descriptor {
5 u16 limit_low;
6 u16 base_low;
7 u8 base_mid;
8 u8 type : 4;
9 u8 system : 1;
10 u8 dpl : 2;
11 u8 present : 1;
12 u8 limit_high : 4;
13 u8 avl : 1;
14 u8 long_mode : 1;
15 u8 default_op : 1;
16 u8 granularity : 1;
17 u8 base_high;
18} __attribute__((packed));
19
20#ifdef CONFIG_X86_64
21/* LDT or TSS descriptor in the GDT. 16 bytes. */
/* A struct segment_descriptor followed by two 32-bit words. */
22struct segment_descriptor_64 {
23 struct segment_descriptor s;
24 u32 base_higher; /* presumably the upper 32 bits of the base address; verify */
25 u32 pad_zero;
26};
27
28#endif
29#endif
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
new file mode 100644
index 000000000000..3d4b71a94440
--- /dev/null
+++ b/arch/x86/kvm/svm.c
@@ -0,0 +1,1725 @@
1/*
2 * Kernel-based Virtual Machine driver for Linux
3 *
4 * AMD SVM support
5 *
6 * Copyright (C) 2006 Qumranet, Inc.
7 *
8 * Authors:
9 * Yaniv Kamay <yaniv@qumranet.com>
10 * Avi Kivity <avi@qumranet.com>
11 *
12 * This work is licensed under the terms of the GNU GPL, version 2. See
13 * the COPYING file in the top-level directory.
14 *
15 */
16#include <linux/kvm_host.h>
17
18#include "kvm_svm.h"
19#include "irq.h"
20#include "mmu.h"
21
22#include <linux/module.h>
23#include <linux/kernel.h>
24#include <linux/vmalloc.h>
25#include <linux/highmem.h>
26#include <linux/sched.h>
27
28#include <asm/desc.h>
29
30MODULE_AUTHOR("Qumranet");
31MODULE_LICENSE("GPL");
32
/* Page-allocation orders for the I/O and MSR permission maps. */
#define IOPM_ALLOC_ORDER 2
#define MSRPM_ALLOC_ORDER 1

/* Exception vector numbers used by this file. */
#define DB_VECTOR 1
#define UD_VECTOR 6
#define GP_VECTOR 13

/* Bit 13 of DR7 (GD) and bit 13 of DR6 (BD). */
#define DR7_GD_MASK (1 << 13)
#define DR6_BD_MASK (1 << 13)

/* System-segment type codes passed to init_sys_seg(). */
#define SEG_TYPE_LDT 2
#define SEG_TYPE_BUSY_TSS16 3
45
/*
 * Feature bits tested against svm_features (the EDX value returned by
 * cpuid_edx(SVM_CPUID_FUNC)).
 */
#define SVM_FEATURE_NPT  (1 << 0)
#define SVM_FEATURE_LBRV (1 << 1)
#define SVM_FEATURE_SVML (1 << 2)
/* Historical misspelling, kept so any existing user still builds. */
#define SVM_DEATURE_SVML SVM_FEATURE_SVML
49
50static void kvm_reput_irq(struct vcpu_svm *svm);
51
52static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu)
53{
54 return container_of(vcpu, struct vcpu_svm, vcpu);
55}
56
/*
 * Physical addresses of the I/O and MSR permission maps; set by
 * svm_hardware_setup() and zeroed again when the maps are freed.
 */
unsigned long iopm_base;
unsigned long msrpm_base;
59
60struct kvm_ldttss_desc {
61 u16 limit0;
62 u16 base0;
63 unsigned base1 : 8, type : 5, dpl : 2, p : 1;
64 unsigned limit1 : 4, zero0 : 3, g : 1, base2 : 8;
65 u32 base3;
66 u32 zero1;
67} __attribute__((packed));
68
69struct svm_cpu_data {
70 int cpu;
71
72 u64 asid_generation;
73 u32 max_asid;
74 u32 next_asid;
75 struct kvm_ldttss_desc *tss_desc;
76
77 struct page *save_area;
78};
79
80static DEFINE_PER_CPU(struct svm_cpu_data *, svm_data);
81static uint32_t svm_features;
82
/* A CPU number paired with a result code. */
struct svm_init_data {
	int cpu;	/* CPU number */
	int r;		/* result code */
};
87
88static u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000};
89
90#define NUM_MSR_MAPS ARRAY_SIZE(msrpm_ranges)
91#define MSRS_RANGE_SIZE 2048
92#define MSRS_IN_RANGE (MSRS_RANGE_SIZE * 8 / 2)
93
94#define MAX_INST_SIZE 15
95
96static inline u32 svm_has(u32 feat)
97{
98 return svm_features & feat;
99}
100
101static inline u8 pop_irq(struct kvm_vcpu *vcpu)
102{
103 int word_index = __ffs(vcpu->arch.irq_summary);
104 int bit_index = __ffs(vcpu->arch.irq_pending[word_index]);
105 int irq = word_index * BITS_PER_LONG + bit_index;
106
107 clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]);
108 if (!vcpu->arch.irq_pending[word_index])
109 clear_bit(word_index, &vcpu->arch.irq_summary);
110 return irq;
111}
112
113static inline void push_irq(struct kvm_vcpu *vcpu, u8 irq)
114{
115 set_bit(irq, vcpu->arch.irq_pending);
116 set_bit(irq / BITS_PER_LONG, &vcpu->arch.irq_summary);
117}
118
119static inline void clgi(void)
120{
121 asm volatile (SVM_CLGI);
122}
123
124static inline void stgi(void)
125{
126 asm volatile (SVM_STGI);
127}
128
129static inline void invlpga(unsigned long addr, u32 asid)
130{
131 asm volatile (SVM_INVLPGA :: "a"(addr), "c"(asid));
132}
133
134static inline unsigned long kvm_read_cr2(void)
135{
136 unsigned long cr2;
137
138 asm volatile ("mov %%cr2, %0" : "=r" (cr2));
139 return cr2;
140}
141
142static inline void kvm_write_cr2(unsigned long val)
143{
144 asm volatile ("mov %0, %%cr2" :: "r" (val));
145}
146
147static inline unsigned long read_dr6(void)
148{
149 unsigned long dr6;
150
151 asm volatile ("mov %%dr6, %0" : "=r" (dr6));
152 return dr6;
153}
154
155static inline void write_dr6(unsigned long val)
156{
157 asm volatile ("mov %0, %%dr6" :: "r" (val));
158}
159
160static inline unsigned long read_dr7(void)
161{
162 unsigned long dr7;
163
164 asm volatile ("mov %%dr7, %0" : "=r" (dr7));
165 return dr7;
166}
167
168static inline void write_dr7(unsigned long val)
169{
170 asm volatile ("mov %0, %%dr7" :: "r" (val));
171}
172
173static inline void force_new_asid(struct kvm_vcpu *vcpu)
174{
175 to_svm(vcpu)->asid_generation--;
176}
177
/* Flush the guest TLB; implemented by forcing a new ASID for @vcpu. */
static inline void flush_guest_tlb(struct kvm_vcpu *vcpu)
{
	force_new_asid(vcpu);
}
182
183static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
184{
185 if (!(efer & EFER_LMA))
186 efer &= ~EFER_LME;
187
188 to_svm(vcpu)->vmcb->save.efer = efer | MSR_EFER_SVME_MASK;
189 vcpu->arch.shadow_efer = efer;
190}
191
192static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
193 bool has_error_code, u32 error_code)
194{
195 struct vcpu_svm *svm = to_svm(vcpu);
196
197 svm->vmcb->control.event_inj = nr
198 | SVM_EVTINJ_VALID
199 | (has_error_code ? SVM_EVTINJ_VALID_ERR : 0)
200 | SVM_EVTINJ_TYPE_EXEPT;
201 svm->vmcb->control.event_inj_err = error_code;
202}
203
204static bool svm_exception_injected(struct kvm_vcpu *vcpu)
205{
206 struct vcpu_svm *svm = to_svm(vcpu);
207
208 return !(svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_VALID);
209}
210
211static int is_external_interrupt(u32 info)
212{
213 info &= SVM_EVTINJ_TYPE_MASK | SVM_EVTINJ_VALID;
214 return info == (SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR);
215}
216
217static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
218{
219 struct vcpu_svm *svm = to_svm(vcpu);
220
221 if (!svm->next_rip) {
222 printk(KERN_DEBUG "%s: NOP\n", __FUNCTION__);
223 return;
224 }
225 if (svm->next_rip - svm->vmcb->save.rip > MAX_INST_SIZE)
226 printk(KERN_ERR "%s: ip 0x%llx next 0x%llx\n",
227 __FUNCTION__,
228 svm->vmcb->save.rip,
229 svm->next_rip);
230
231 vcpu->arch.rip = svm->vmcb->save.rip = svm->next_rip;
232 svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK;
233
234 vcpu->arch.interrupt_window_open = 1;
235}
236
237static int has_svm(void)
238{
239 uint32_t eax, ebx, ecx, edx;
240
241 if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) {
242 printk(KERN_INFO "has_svm: not amd\n");
243 return 0;
244 }
245
246 cpuid(0x80000000, &eax, &ebx, &ecx, &edx);
247 if (eax < SVM_CPUID_FUNC) {
248 printk(KERN_INFO "has_svm: can't execute cpuid_8000000a\n");
249 return 0;
250 }
251
252 cpuid(0x80000001, &eax, &ebx, &ecx, &edx);
253 if (!(ecx & (1 << SVM_CPUID_FEATURE_SHIFT))) {
254 printk(KERN_DEBUG "has_svm: svm not available\n");
255 return 0;
256 }
257 return 1;
258}
259
260static void svm_hardware_disable(void *garbage)
261{
262 struct svm_cpu_data *svm_data
263 = per_cpu(svm_data, raw_smp_processor_id());
264
265 if (svm_data) {
266 uint64_t efer;
267
268 wrmsrl(MSR_VM_HSAVE_PA, 0);
269 rdmsrl(MSR_EFER, efer);
270 wrmsrl(MSR_EFER, efer & ~MSR_EFER_SVME_MASK);
271 per_cpu(svm_data, raw_smp_processor_id()) = NULL;
272 __free_page(svm_data->save_area);
273 kfree(svm_data);
274 }
275}
276
277static void svm_hardware_enable(void *garbage)
278{
279
280 struct svm_cpu_data *svm_data;
281 uint64_t efer;
282#ifdef CONFIG_X86_64
283 struct desc_ptr gdt_descr;
284#else
285 struct desc_ptr gdt_descr;
286#endif
287 struct desc_struct *gdt;
288 int me = raw_smp_processor_id();
289
290 if (!has_svm()) {
291 printk(KERN_ERR "svm_cpu_init: err EOPNOTSUPP on %d\n", me);
292 return;
293 }
294 svm_data = per_cpu(svm_data, me);
295
296 if (!svm_data) {
297 printk(KERN_ERR "svm_cpu_init: svm_data is NULL on %d\n",
298 me);
299 return;
300 }
301
302 svm_data->asid_generation = 1;
303 svm_data->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1;
304 svm_data->next_asid = svm_data->max_asid + 1;
305 svm_features = cpuid_edx(SVM_CPUID_FUNC);
306
307 asm volatile ("sgdt %0" : "=m"(gdt_descr));
308 gdt = (struct desc_struct *)gdt_descr.address;
309 svm_data->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS);
310
311 rdmsrl(MSR_EFER, efer);
312 wrmsrl(MSR_EFER, efer | MSR_EFER_SVME_MASK);
313
314 wrmsrl(MSR_VM_HSAVE_PA,
315 page_to_pfn(svm_data->save_area) << PAGE_SHIFT);
316}
317
318static int svm_cpu_init(int cpu)
319{
320 struct svm_cpu_data *svm_data;
321 int r;
322
323 svm_data = kzalloc(sizeof(struct svm_cpu_data), GFP_KERNEL);
324 if (!svm_data)
325 return -ENOMEM;
326 svm_data->cpu = cpu;
327 svm_data->save_area = alloc_page(GFP_KERNEL);
328 r = -ENOMEM;
329 if (!svm_data->save_area)
330 goto err_1;
331
332 per_cpu(svm_data, cpu) = svm_data;
333
334 return 0;
335
336err_1:
337 kfree(svm_data);
338 return r;
339
340}
341
342static void set_msr_interception(u32 *msrpm, unsigned msr,
343 int read, int write)
344{
345 int i;
346
347 for (i = 0; i < NUM_MSR_MAPS; i++) {
348 if (msr >= msrpm_ranges[i] &&
349 msr < msrpm_ranges[i] + MSRS_IN_RANGE) {
350 u32 msr_offset = (i * MSRS_IN_RANGE + msr -
351 msrpm_ranges[i]) * 2;
352
353 u32 *base = msrpm + (msr_offset / 32);
354 u32 msr_shift = msr_offset % 32;
355 u32 mask = ((write) ? 0 : 2) | ((read) ? 0 : 1);
356 *base = (*base & ~(0x3 << msr_shift)) |
357 (mask << msr_shift);
358 return;
359 }
360 }
361 BUG();
362}
363
364static __init int svm_hardware_setup(void)
365{
366 int cpu;
367 struct page *iopm_pages;
368 struct page *msrpm_pages;
369 void *iopm_va, *msrpm_va;
370 int r;
371
372 iopm_pages = alloc_pages(GFP_KERNEL, IOPM_ALLOC_ORDER);
373
374 if (!iopm_pages)
375 return -ENOMEM;
376
377 iopm_va = page_address(iopm_pages);
378 memset(iopm_va, 0xff, PAGE_SIZE * (1 << IOPM_ALLOC_ORDER));
379 clear_bit(0x80, iopm_va); /* allow direct access to PC debug port */
380 iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT;
381
382
383 msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER);
384
385 r = -ENOMEM;
386 if (!msrpm_pages)
387 goto err_1;
388
389 msrpm_va = page_address(msrpm_pages);
390 memset(msrpm_va, 0xff, PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER));
391 msrpm_base = page_to_pfn(msrpm_pages) << PAGE_SHIFT;
392
393#ifdef CONFIG_X86_64
394 set_msr_interception(msrpm_va, MSR_GS_BASE, 1, 1);
395 set_msr_interception(msrpm_va, MSR_FS_BASE, 1, 1);
396 set_msr_interception(msrpm_va, MSR_KERNEL_GS_BASE, 1, 1);
397 set_msr_interception(msrpm_va, MSR_LSTAR, 1, 1);
398 set_msr_interception(msrpm_va, MSR_CSTAR, 1, 1);
399 set_msr_interception(msrpm_va, MSR_SYSCALL_MASK, 1, 1);
400#endif
401 set_msr_interception(msrpm_va, MSR_K6_STAR, 1, 1);
402 set_msr_interception(msrpm_va, MSR_IA32_SYSENTER_CS, 1, 1);
403 set_msr_interception(msrpm_va, MSR_IA32_SYSENTER_ESP, 1, 1);
404 set_msr_interception(msrpm_va, MSR_IA32_SYSENTER_EIP, 1, 1);
405
406 for_each_online_cpu(cpu) {
407 r = svm_cpu_init(cpu);
408 if (r)
409 goto err_2;
410 }
411 return 0;
412
413err_2:
414 __free_pages(msrpm_pages, MSRPM_ALLOC_ORDER);
415 msrpm_base = 0;
416err_1:
417 __free_pages(iopm_pages, IOPM_ALLOC_ORDER);
418 iopm_base = 0;
419 return r;
420}
421
422static __exit void svm_hardware_unsetup(void)
423{
424 __free_pages(pfn_to_page(msrpm_base >> PAGE_SHIFT), MSRPM_ALLOC_ORDER);
425 __free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT), IOPM_ALLOC_ORDER);
426 iopm_base = msrpm_base = 0;
427}
428
429static void init_seg(struct vmcb_seg *seg)
430{
431 seg->selector = 0;
432 seg->attrib = SVM_SELECTOR_P_MASK | SVM_SELECTOR_S_MASK |
433 SVM_SELECTOR_WRITE_MASK; /* Read/Write Data Segment */
434 seg->limit = 0xffff;
435 seg->base = 0;
436}
437
438static void init_sys_seg(struct vmcb_seg *seg, uint32_t type)
439{
440 seg->selector = 0;
441 seg->attrib = SVM_SELECTOR_P_MASK | type;
442 seg->limit = 0xffff;
443 seg->base = 0;
444}
445
446static void init_vmcb(struct vmcb *vmcb)
447{
448 struct vmcb_control_area *control = &vmcb->control;
449 struct vmcb_save_area *save = &vmcb->save;
450
451 control->intercept_cr_read = INTERCEPT_CR0_MASK |
452 INTERCEPT_CR3_MASK |
453 INTERCEPT_CR4_MASK |
454 INTERCEPT_CR8_MASK;
455
456 control->intercept_cr_write = INTERCEPT_CR0_MASK |
457 INTERCEPT_CR3_MASK |
458 INTERCEPT_CR4_MASK |
459 INTERCEPT_CR8_MASK;
460
461 control->intercept_dr_read = INTERCEPT_DR0_MASK |
462 INTERCEPT_DR1_MASK |
463 INTERCEPT_DR2_MASK |
464 INTERCEPT_DR3_MASK;
465
466 control->intercept_dr_write = INTERCEPT_DR0_MASK |
467 INTERCEPT_DR1_MASK |
468 INTERCEPT_DR2_MASK |
469 INTERCEPT_DR3_MASK |
470 INTERCEPT_DR5_MASK |
471 INTERCEPT_DR7_MASK;
472
473 control->intercept_exceptions = (1 << PF_VECTOR) |
474 (1 << UD_VECTOR);
475
476
477 control->intercept = (1ULL << INTERCEPT_INTR) |
478 (1ULL << INTERCEPT_NMI) |
479 (1ULL << INTERCEPT_SMI) |
480 /*
481 * selective cr0 intercept bug?
482 * 0: 0f 22 d8 mov %eax,%cr3
483 * 3: 0f 20 c0 mov %cr0,%eax
484 * 6: 0d 00 00 00 80 or $0x80000000,%eax
485 * b: 0f 22 c0 mov %eax,%cr0
486 * set cr3 ->interception
487 * get cr0 ->interception
488 * set cr0 -> no interception
489 */
490 /* (1ULL << INTERCEPT_SELECTIVE_CR0) | */
491 (1ULL << INTERCEPT_CPUID) |
492 (1ULL << INTERCEPT_INVD) |
493 (1ULL << INTERCEPT_HLT) |
494 (1ULL << INTERCEPT_INVLPGA) |
495 (1ULL << INTERCEPT_IOIO_PROT) |
496 (1ULL << INTERCEPT_MSR_PROT) |
497 (1ULL << INTERCEPT_TASK_SWITCH) |
498 (1ULL << INTERCEPT_SHUTDOWN) |
499 (1ULL << INTERCEPT_VMRUN) |
500 (1ULL << INTERCEPT_VMMCALL) |
501 (1ULL << INTERCEPT_VMLOAD) |
502 (1ULL << INTERCEPT_VMSAVE) |
503 (1ULL << INTERCEPT_STGI) |
504 (1ULL << INTERCEPT_CLGI) |
505 (1ULL << INTERCEPT_SKINIT) |
506 (1ULL << INTERCEPT_WBINVD) |
507 (1ULL << INTERCEPT_MONITOR) |
508 (1ULL << INTERCEPT_MWAIT);
509
510 control->iopm_base_pa = iopm_base;
511 control->msrpm_base_pa = msrpm_base;
512 control->tsc_offset = 0;
513 control->int_ctl = V_INTR_MASKING_MASK;
514
515 init_seg(&save->es);
516 init_seg(&save->ss);
517 init_seg(&save->ds);
518 init_seg(&save->fs);
519 init_seg(&save->gs);
520
521 save->cs.selector = 0xf000;
522 /* Executable/Readable Code Segment */
523 save->cs.attrib = SVM_SELECTOR_READ_MASK | SVM_SELECTOR_P_MASK |
524 SVM_SELECTOR_S_MASK | SVM_SELECTOR_CODE_MASK;
525 save->cs.limit = 0xffff;
526 /*
527 * cs.base should really be 0xffff0000, but vmx can't handle that, so
528 * be consistent with it.
529 *
530 * Replace when we have real mode working for vmx.
531 */
532 save->cs.base = 0xf0000;
533
534 save->gdtr.limit = 0xffff;
535 save->idtr.limit = 0xffff;
536
537 init_sys_seg(&save->ldtr, SEG_TYPE_LDT);
538 init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16);
539
540 save->efer = MSR_EFER_SVME_MASK;
541 save->dr6 = 0xffff0ff0;
542 save->dr7 = 0x400;
543 save->rflags = 2;
544 save->rip = 0x0000fff0;
545
546 /*
547 * cr0 val on cpu init should be 0x60000010, we enable cpu
548 * cache by default. the orderly way is to enable cache in bios.
549 */
550 save->cr0 = 0x00000010 | X86_CR0_PG | X86_CR0_WP;
551 save->cr4 = X86_CR4_PAE;
552 /* rdx = ?? */
553}
554
555static int svm_vcpu_reset(struct kvm_vcpu *vcpu)
556{
557 struct vcpu_svm *svm = to_svm(vcpu);
558
559 init_vmcb(svm->vmcb);
560
561 if (vcpu->vcpu_id != 0) {
562 svm->vmcb->save.rip = 0;
563 svm->vmcb->save.cs.base = svm->vcpu.arch.sipi_vector << 12;
564 svm->vmcb->save.cs.selector = svm->vcpu.arch.sipi_vector << 8;
565 }
566
567 return 0;
568}
569
570static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
571{
572 struct vcpu_svm *svm;
573 struct page *page;
574 int err;
575
576 svm = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
577 if (!svm) {
578 err = -ENOMEM;
579 goto out;
580 }
581
582 err = kvm_vcpu_init(&svm->vcpu, kvm, id);
583 if (err)
584 goto free_svm;
585
586 page = alloc_page(GFP_KERNEL);
587 if (!page) {
588 err = -ENOMEM;
589 goto uninit;
590 }
591
592 svm->vmcb = page_address(page);
593 clear_page(svm->vmcb);
594 svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT;
595 svm->asid_generation = 0;
596 memset(svm->db_regs, 0, sizeof(svm->db_regs));
597 init_vmcb(svm->vmcb);
598
599 fx_init(&svm->vcpu);
600 svm->vcpu.fpu_active = 1;
601 svm->vcpu.arch.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
602 if (svm->vcpu.vcpu_id == 0)
603 svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP;
604
605 return &svm->vcpu;
606
607uninit:
608 kvm_vcpu_uninit(&svm->vcpu);
609free_svm:
610 kmem_cache_free(kvm_vcpu_cache, svm);
611out:
612 return ERR_PTR(err);
613}
614
615static void svm_free_vcpu(struct kvm_vcpu *vcpu)
616{
617 struct vcpu_svm *svm = to_svm(vcpu);
618
619 __free_page(pfn_to_page(svm->vmcb_pa >> PAGE_SHIFT));
620 kvm_vcpu_uninit(vcpu);
621 kmem_cache_free(kvm_vcpu_cache, svm);
622}
623
624static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
625{
626 struct vcpu_svm *svm = to_svm(vcpu);
627 int i;
628
629 if (unlikely(cpu != vcpu->cpu)) {
630 u64 tsc_this, delta;
631
632 /*
633 * Make sure that the guest sees a monotonically
634 * increasing TSC.
635 */
636 rdtscll(tsc_this);
637 delta = vcpu->arch.host_tsc - tsc_this;
638 svm->vmcb->control.tsc_offset += delta;
639 vcpu->cpu = cpu;
640 kvm_migrate_apic_timer(vcpu);
641 }
642
643 for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
644 rdmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
645}
646
647static void svm_vcpu_put(struct kvm_vcpu *vcpu)
648{
649 struct vcpu_svm *svm = to_svm(vcpu);
650 int i;
651
652 ++vcpu->stat.host_state_reload;
653 for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
654 wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
655
656 rdtscll(vcpu->arch.host_tsc);
657}
658
/* Nothing to do for SVM; intentionally empty. */
static void svm_vcpu_decache(struct kvm_vcpu *vcpu)
{
}
662
663static void svm_cache_regs(struct kvm_vcpu *vcpu)
664{
665 struct vcpu_svm *svm = to_svm(vcpu);
666
667 vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
668 vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
669 vcpu->arch.rip = svm->vmcb->save.rip;
670}
671
672static void svm_decache_regs(struct kvm_vcpu *vcpu)
673{
674 struct vcpu_svm *svm = to_svm(vcpu);
675 svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
676 svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
677 svm->vmcb->save.rip = vcpu->arch.rip;
678}
679
680static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
681{
682 return to_svm(vcpu)->vmcb->save.rflags;
683}
684
685static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
686{
687 to_svm(vcpu)->vmcb->save.rflags = rflags;
688}
689
690static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg)
691{
692 struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;
693
694 switch (seg) {
695 case VCPU_SREG_CS: return &save->cs;
696 case VCPU_SREG_DS: return &save->ds;
697 case VCPU_SREG_ES: return &save->es;
698 case VCPU_SREG_FS: return &save->fs;
699 case VCPU_SREG_GS: return &save->gs;
700 case VCPU_SREG_SS: return &save->ss;
701 case VCPU_SREG_TR: return &save->tr;
702 case VCPU_SREG_LDTR: return &save->ldtr;
703 }
704 BUG();
705 return NULL;
706}
707
708static u64 svm_get_segment_base(struct kvm_vcpu *vcpu, int seg)
709{
710 struct vmcb_seg *s = svm_seg(vcpu, seg);
711
712 return s->base;
713}
714
715static void svm_get_segment(struct kvm_vcpu *vcpu,
716 struct kvm_segment *var, int seg)
717{
718 struct vmcb_seg *s = svm_seg(vcpu, seg);
719
720 var->base = s->base;
721 var->limit = s->limit;
722 var->selector = s->selector;
723 var->type = s->attrib & SVM_SELECTOR_TYPE_MASK;
724 var->s = (s->attrib >> SVM_SELECTOR_S_SHIFT) & 1;
725 var->dpl = (s->attrib >> SVM_SELECTOR_DPL_SHIFT) & 3;
726 var->present = (s->attrib >> SVM_SELECTOR_P_SHIFT) & 1;
727 var->avl = (s->attrib >> SVM_SELECTOR_AVL_SHIFT) & 1;
728 var->l = (s->attrib >> SVM_SELECTOR_L_SHIFT) & 1;
729 var->db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1;
730 var->g = (s->attrib >> SVM_SELECTOR_G_SHIFT) & 1;
731 var->unusable = !var->present;
732}
733
734static void svm_get_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
735{
736 struct vcpu_svm *svm = to_svm(vcpu);
737
738 dt->limit = svm->vmcb->save.idtr.limit;
739 dt->base = svm->vmcb->save.idtr.base;
740}
741
742static void svm_set_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
743{
744 struct vcpu_svm *svm = to_svm(vcpu);
745
746 svm->vmcb->save.idtr.limit = dt->limit;
747 svm->vmcb->save.idtr.base = dt->base ;
748}
749
750static void svm_get_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
751{
752 struct vcpu_svm *svm = to_svm(vcpu);
753
754 dt->limit = svm->vmcb->save.gdtr.limit;
755 dt->base = svm->vmcb->save.gdtr.base;
756}
757
758static void svm_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
759{
760 struct vcpu_svm *svm = to_svm(vcpu);
761
762 svm->vmcb->save.gdtr.limit = dt->limit;
763 svm->vmcb->save.gdtr.base = dt->base ;
764}
765
/* Nothing to do for SVM; intentionally empty. */
static void svm_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
{
}
769
770static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
771{
772 struct vcpu_svm *svm = to_svm(vcpu);
773
774#ifdef CONFIG_X86_64
775 if (vcpu->arch.shadow_efer & EFER_LME) {
776 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
777 vcpu->arch.shadow_efer |= EFER_LMA;
778 svm->vmcb->save.efer |= EFER_LMA | EFER_LME;
779 }
780
781 if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) {
782 vcpu->arch.shadow_efer &= ~EFER_LMA;
783 svm->vmcb->save.efer &= ~(EFER_LMA | EFER_LME);
784 }
785 }
786#endif
787 if ((vcpu->arch.cr0 & X86_CR0_TS) && !(cr0 & X86_CR0_TS)) {
788 svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR);
789 vcpu->fpu_active = 1;
790 }
791
792 vcpu->arch.cr0 = cr0;
793 cr0 |= X86_CR0_PG | X86_CR0_WP;
794 cr0 &= ~(X86_CR0_CD | X86_CR0_NW);
795 svm->vmcb->save.cr0 = cr0;
796}
797
798static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
799{
800 vcpu->arch.cr4 = cr4;
801 to_svm(vcpu)->vmcb->save.cr4 = cr4 | X86_CR4_PAE;
802}
803
804static void svm_set_segment(struct kvm_vcpu *vcpu,
805 struct kvm_segment *var, int seg)
806{
807 struct vcpu_svm *svm = to_svm(vcpu);
808 struct vmcb_seg *s = svm_seg(vcpu, seg);
809
810 s->base = var->base;
811 s->limit = var->limit;
812 s->selector = var->selector;
813 if (var->unusable)
814 s->attrib = 0;
815 else {
816 s->attrib = (var->type & SVM_SELECTOR_TYPE_MASK);
817 s->attrib |= (var->s & 1) << SVM_SELECTOR_S_SHIFT;
818 s->attrib |= (var->dpl & 3) << SVM_SELECTOR_DPL_SHIFT;
819 s->attrib |= (var->present & 1) << SVM_SELECTOR_P_SHIFT;
820 s->attrib |= (var->avl & 1) << SVM_SELECTOR_AVL_SHIFT;
821 s->attrib |= (var->l & 1) << SVM_SELECTOR_L_SHIFT;
822 s->attrib |= (var->db & 1) << SVM_SELECTOR_DB_SHIFT;
823 s->attrib |= (var->g & 1) << SVM_SELECTOR_G_SHIFT;
824 }
825 if (seg == VCPU_SREG_CS)
826 svm->vmcb->save.cpl
827 = (svm->vmcb->save.cs.attrib
828 >> SVM_SELECTOR_DPL_SHIFT) & 3;
829
830}
831
832/* FIXME:
833
834 svm(vcpu)->vmcb->control.int_ctl &= ~V_TPR_MASK;
835 svm(vcpu)->vmcb->control.int_ctl |= (sregs->cr8 & V_TPR_MASK);
836
837*/
838
839static int svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg)
840{
841 return -EOPNOTSUPP;
842}
843
844static int svm_get_irq(struct kvm_vcpu *vcpu)
845{
846 struct vcpu_svm *svm = to_svm(vcpu);
847 u32 exit_int_info = svm->vmcb->control.exit_int_info;
848
849 if (is_external_interrupt(exit_int_info))
850 return exit_int_info & SVM_EVTINJ_VEC_MASK;
851 return -1;
852}
853
/* Restore the host MSR_GS_BASE saved by save_host_msrs() (64-bit only). */
static void load_host_msrs(struct kvm_vcpu *vcpu)
{
#ifdef CONFIG_X86_64
	struct vcpu_svm *svm = to_svm(vcpu);

	wrmsrl(MSR_GS_BASE, svm->host_gs_base);
#endif
}
860
/* Save the host MSR_GS_BASE into the vcpu (64-bit only). */
static void save_host_msrs(struct kvm_vcpu *vcpu)
{
#ifdef CONFIG_X86_64
	struct vcpu_svm *svm = to_svm(vcpu);

	rdmsrl(MSR_GS_BASE, svm->host_gs_base);
#endif
}
867
868static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *svm_data)
869{
870 if (svm_data->next_asid > svm_data->max_asid) {
871 ++svm_data->asid_generation;
872 svm_data->next_asid = 1;
873 svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ALL_ASID;
874 }
875
876 svm->vcpu.cpu = svm_data->cpu;
877 svm->asid_generation = svm_data->asid_generation;
878 svm->vmcb->control.asid = svm_data->next_asid++;
879}
880
881static unsigned long svm_get_dr(struct kvm_vcpu *vcpu, int dr)
882{
883 return to_svm(vcpu)->db_regs[dr];
884}
885
886static void svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value,
887 int *exception)
888{
889 struct vcpu_svm *svm = to_svm(vcpu);
890
891 *exception = 0;
892
893 if (svm->vmcb->save.dr7 & DR7_GD_MASK) {
894 svm->vmcb->save.dr7 &= ~DR7_GD_MASK;
895 svm->vmcb->save.dr6 |= DR6_BD_MASK;
896 *exception = DB_VECTOR;
897 return;
898 }
899
900 switch (dr) {
901 case 0 ... 3:
902 svm->db_regs[dr] = value;
903 return;
904 case 4 ... 5:
905 if (vcpu->arch.cr4 & X86_CR4_DE) {
906 *exception = UD_VECTOR;
907 return;
908 }
909 case 7: {
910 if (value & ~((1ULL << 32) - 1)) {
911 *exception = GP_VECTOR;
912 return;
913 }
914 svm->vmcb->save.dr7 = value;
915 return;
916 }
917 default:
918 printk(KERN_DEBUG "%s: unexpected dr %u\n",
919 __FUNCTION__, dr);
920 *exception = UD_VECTOR;
921 return;
922 }
923}
924
925static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
926{
927 u32 exit_int_info = svm->vmcb->control.exit_int_info;
928 struct kvm *kvm = svm->vcpu.kvm;
929 u64 fault_address;
930 u32 error_code;
931
932 if (!irqchip_in_kernel(kvm) &&
933 is_external_interrupt(exit_int_info))
934 push_irq(&svm->vcpu, exit_int_info & SVM_EVTINJ_VEC_MASK);
935
936 fault_address = svm->vmcb->control.exit_info_2;
937 error_code = svm->vmcb->control.exit_info_1;
938 return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code);
939}
940
941static int ud_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
942{
943 int er;
944
945 er = emulate_instruction(&svm->vcpu, kvm_run, 0, 0, 0);
946 if (er != EMULATE_DONE)
947 kvm_queue_exception(&svm->vcpu, UD_VECTOR);
948 return 1;
949}
950
951static int nm_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
952{
953 svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR);
954 if (!(svm->vcpu.arch.cr0 & X86_CR0_TS))
955 svm->vmcb->save.cr0 &= ~X86_CR0_TS;
956 svm->vcpu.fpu_active = 1;
957
958 return 1;
959}
960
961static int shutdown_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
962{
963 /*
964 * VMCB is undefined after a SHUTDOWN intercept
965 * so reinitialize it.
966 */
967 clear_page(svm->vmcb);
968 init_vmcb(svm->vmcb);
969
970 kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
971 return 0;
972}
973
974static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
975{
976 u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */
977 int size, down, in, string, rep;
978 unsigned port;
979
980 ++svm->vcpu.stat.io_exits;
981
982 svm->next_rip = svm->vmcb->control.exit_info_2;
983
984 string = (io_info & SVM_IOIO_STR_MASK) != 0;
985
986 if (string) {
987 if (emulate_instruction(&svm->vcpu,
988 kvm_run, 0, 0, 0) == EMULATE_DO_MMIO)
989 return 0;
990 return 1;
991 }
992
993 in = (io_info & SVM_IOIO_TYPE_MASK) != 0;
994 port = io_info >> 16;
995 size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT;
996 rep = (io_info & SVM_IOIO_REP_MASK) != 0;
997 down = (svm->vmcb->save.rflags & X86_EFLAGS_DF) != 0;
998
999 return kvm_emulate_pio(&svm->vcpu, kvm_run, in, size, port);
1000}
1001
/* Exit handler that does nothing and always returns 1. */
static int nop_on_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
{
	return 1;
}
1006
1007static int halt_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1008{
1009 svm->next_rip = svm->vmcb->save.rip + 1;
1010 skip_emulated_instruction(&svm->vcpu);
1011 return kvm_emulate_halt(&svm->vcpu);
1012}
1013
1014static int vmmcall_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1015{
1016 svm->next_rip = svm->vmcb->save.rip + 3;
1017 skip_emulated_instruction(&svm->vcpu);
1018 kvm_emulate_hypercall(&svm->vcpu);
1019 return 1;
1020}
1021
1022static int invalid_op_interception(struct vcpu_svm *svm,
1023 struct kvm_run *kvm_run)
1024{
1025 kvm_queue_exception(&svm->vcpu, UD_VECTOR);
1026 return 1;
1027}
1028
1029static int task_switch_interception(struct vcpu_svm *svm,
1030 struct kvm_run *kvm_run)
1031{
1032 pr_unimpl(&svm->vcpu, "%s: task switch is unsupported\n", __FUNCTION__);
1033 kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
1034 return 0;
1035}
1036
1037static int cpuid_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1038{
1039 svm->next_rip = svm->vmcb->save.rip + 2;
1040 kvm_emulate_cpuid(&svm->vcpu);
1041 return 1;
1042}
1043
1044static int emulate_on_interception(struct vcpu_svm *svm,
1045 struct kvm_run *kvm_run)
1046{
1047 if (emulate_instruction(&svm->vcpu, NULL, 0, 0, 0) != EMULATE_DONE)
1048 pr_unimpl(&svm->vcpu, "%s: failed\n", __FUNCTION__);
1049 return 1;
1050}
1051
1052static int cr8_write_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1053{
1054 emulate_instruction(&svm->vcpu, NULL, 0, 0, 0);
1055 if (irqchip_in_kernel(svm->vcpu.kvm))
1056 return 1;
1057 kvm_run->exit_reason = KVM_EXIT_SET_TPR;
1058 return 0;
1059}
1060
1061static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
1062{
1063 struct vcpu_svm *svm = to_svm(vcpu);
1064
1065 switch (ecx) {
1066 case MSR_IA32_TIME_STAMP_COUNTER: {
1067 u64 tsc;
1068
1069 rdtscll(tsc);
1070 *data = svm->vmcb->control.tsc_offset + tsc;
1071 break;
1072 }
1073 case MSR_K6_STAR:
1074 *data = svm->vmcb->save.star;
1075 break;
1076#ifdef CONFIG_X86_64
1077 case MSR_LSTAR:
1078 *data = svm->vmcb->save.lstar;
1079 break;
1080 case MSR_CSTAR:
1081 *data = svm->vmcb->save.cstar;
1082 break;
1083 case MSR_KERNEL_GS_BASE:
1084 *data = svm->vmcb->save.kernel_gs_base;
1085 break;
1086 case MSR_SYSCALL_MASK:
1087 *data = svm->vmcb->save.sfmask;
1088 break;
1089#endif
1090 case MSR_IA32_SYSENTER_CS:
1091 *data = svm->vmcb->save.sysenter_cs;
1092 break;
1093 case MSR_IA32_SYSENTER_EIP:
1094 *data = svm->vmcb->save.sysenter_eip;
1095 break;
1096 case MSR_IA32_SYSENTER_ESP:
1097 *data = svm->vmcb->save.sysenter_esp;
1098 break;
1099 default:
1100 return kvm_get_msr_common(vcpu, ecx, data);
1101 }
1102 return 0;
1103}
1104
1105static int rdmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1106{
1107 u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX];
1108 u64 data;
1109
1110 if (svm_get_msr(&svm->vcpu, ecx, &data))
1111 kvm_inject_gp(&svm->vcpu, 0);
1112 else {
1113 svm->vmcb->save.rax = data & 0xffffffff;
1114 svm->vcpu.arch.regs[VCPU_REGS_RDX] = data >> 32;
1115 svm->next_rip = svm->vmcb->save.rip + 2;
1116 skip_emulated_instruction(&svm->vcpu);
1117 }
1118 return 1;
1119}
1120
1121static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
1122{
1123 struct vcpu_svm *svm = to_svm(vcpu);
1124
1125 switch (ecx) {
1126 case MSR_IA32_TIME_STAMP_COUNTER: {
1127 u64 tsc;
1128
1129 rdtscll(tsc);
1130 svm->vmcb->control.tsc_offset = data - tsc;
1131 break;
1132 }
1133 case MSR_K6_STAR:
1134 svm->vmcb->save.star = data;
1135 break;
1136#ifdef CONFIG_X86_64
1137 case MSR_LSTAR:
1138 svm->vmcb->save.lstar = data;
1139 break;
1140 case MSR_CSTAR:
1141 svm->vmcb->save.cstar = data;
1142 break;
1143 case MSR_KERNEL_GS_BASE:
1144 svm->vmcb->save.kernel_gs_base = data;
1145 break;
1146 case MSR_SYSCALL_MASK:
1147 svm->vmcb->save.sfmask = data;
1148 break;
1149#endif
1150 case MSR_IA32_SYSENTER_CS:
1151 svm->vmcb->save.sysenter_cs = data;
1152 break;
1153 case MSR_IA32_SYSENTER_EIP:
1154 svm->vmcb->save.sysenter_eip = data;
1155 break;
1156 case MSR_IA32_SYSENTER_ESP:
1157 svm->vmcb->save.sysenter_esp = data;
1158 break;
1159 case MSR_K7_EVNTSEL0:
1160 case MSR_K7_EVNTSEL1:
1161 case MSR_K7_EVNTSEL2:
1162 case MSR_K7_EVNTSEL3:
1163 /*
1164 * only support writing 0 to the performance counters for now
1165 * to make Windows happy. Should be replaced by a real
1166 * performance counter emulation later.
1167 */
1168 if (data != 0)
1169 goto unhandled;
1170 break;
1171 default:
1172 unhandled:
1173 return kvm_set_msr_common(vcpu, ecx, data);
1174 }
1175 return 0;
1176}
1177
1178static int wrmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1179{
1180 u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX];
1181 u64 data = (svm->vmcb->save.rax & -1u)
1182 | ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32);
1183 svm->next_rip = svm->vmcb->save.rip + 2;
1184 if (svm_set_msr(&svm->vcpu, ecx, data))
1185 kvm_inject_gp(&svm->vcpu, 0);
1186 else
1187 skip_emulated_instruction(&svm->vcpu);
1188 return 1;
1189}
1190
1191static int msr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1192{
1193 if (svm->vmcb->control.exit_info_1)
1194 return wrmsr_interception(svm, kvm_run);
1195 else
1196 return rdmsr_interception(svm, kvm_run);
1197}
1198
1199static int interrupt_window_interception(struct vcpu_svm *svm,
1200 struct kvm_run *kvm_run)
1201{
1202 svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_VINTR);
1203 svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
1204 /*
1205 * If the user space waits to inject interrupts, exit as soon as
1206 * possible
1207 */
1208 if (kvm_run->request_interrupt_window &&
1209 !svm->vcpu.arch.irq_summary) {
1210 ++svm->vcpu.stat.irq_window_exits;
1211 kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
1212 return 0;
1213 }
1214
1215 return 1;
1216}
1217
/*
 * Dispatch table indexed by SVM exit code.  handle_exit() looks the exit
 * code up here; slots without an initializer are NULL and are reported to
 * userspace as KVM_EXIT_UNKNOWN.
 */
1218static int (*svm_exit_handlers[])(struct vcpu_svm *svm,
1219 struct kvm_run *kvm_run) = {
1220 [SVM_EXIT_READ_CR0] = emulate_on_interception,
1221 [SVM_EXIT_READ_CR3] = emulate_on_interception,
1222 [SVM_EXIT_READ_CR4] = emulate_on_interception,
1223 [SVM_EXIT_READ_CR8] = emulate_on_interception,
1224 /* for now: */
1225 [SVM_EXIT_WRITE_CR0] = emulate_on_interception,
1226 [SVM_EXIT_WRITE_CR3] = emulate_on_interception,
1227 [SVM_EXIT_WRITE_CR4] = emulate_on_interception,
1228 [SVM_EXIT_WRITE_CR8] = cr8_write_interception,
1229 [SVM_EXIT_READ_DR0] = emulate_on_interception,
1230 [SVM_EXIT_READ_DR1] = emulate_on_interception,
1231 [SVM_EXIT_READ_DR2] = emulate_on_interception,
1232 [SVM_EXIT_READ_DR3] = emulate_on_interception,
1233 [SVM_EXIT_WRITE_DR0] = emulate_on_interception,
1234 [SVM_EXIT_WRITE_DR1] = emulate_on_interception,
1235 [SVM_EXIT_WRITE_DR2] = emulate_on_interception,
1236 [SVM_EXIT_WRITE_DR3] = emulate_on_interception,
1237 [SVM_EXIT_WRITE_DR5] = emulate_on_interception,
1238 [SVM_EXIT_WRITE_DR7] = emulate_on_interception,
/* intercepted exceptions use exit code SVM_EXIT_EXCP_BASE + vector */
1239 [SVM_EXIT_EXCP_BASE + UD_VECTOR] = ud_interception,
1240 [SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception,
1241 [SVM_EXIT_EXCP_BASE + NM_VECTOR] = nm_interception,
1242 [SVM_EXIT_INTR] = nop_on_interception,
1243 [SVM_EXIT_NMI] = nop_on_interception,
1244 [SVM_EXIT_SMI] = nop_on_interception,
1245 [SVM_EXIT_INIT] = nop_on_interception,
1246 [SVM_EXIT_VINTR] = interrupt_window_interception,
1247 /* [SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception, */
1248 [SVM_EXIT_CPUID] = cpuid_interception,
1249 [SVM_EXIT_INVD] = emulate_on_interception,
1250 [SVM_EXIT_HLT] = halt_interception,
1251 [SVM_EXIT_INVLPG] = emulate_on_interception,
1252 [SVM_EXIT_INVLPGA] = invalid_op_interception,
1253 [SVM_EXIT_IOIO] = io_interception,
1254 [SVM_EXIT_MSR] = msr_interception,
1255 [SVM_EXIT_TASK_SWITCH] = task_switch_interception,
1256 [SVM_EXIT_SHUTDOWN] = shutdown_interception,
1257 [SVM_EXIT_VMRUN] = invalid_op_interception,
1258 [SVM_EXIT_VMMCALL] = vmmcall_interception,
1259 [SVM_EXIT_VMLOAD] = invalid_op_interception,
1260 [SVM_EXIT_VMSAVE] = invalid_op_interception,
1261 [SVM_EXIT_STGI] = invalid_op_interception,
1262 [SVM_EXIT_CLGI] = invalid_op_interception,
1263 [SVM_EXIT_SKINIT] = invalid_op_interception,
1264 [SVM_EXIT_WBINVD] = emulate_on_interception,
1265 [SVM_EXIT_MONITOR] = invalid_op_interception,
1266 [SVM_EXIT_MWAIT] = invalid_op_interception,
1267};
1268
1269
1270static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
1271{
1272 struct vcpu_svm *svm = to_svm(vcpu);
1273 u32 exit_code = svm->vmcb->control.exit_code;
1274
1275 kvm_reput_irq(svm);
1276
1277 if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) {
1278 kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
1279 kvm_run->fail_entry.hardware_entry_failure_reason
1280 = svm->vmcb->control.exit_code;
1281 return 0;
1282 }
1283
1284 if (is_external_interrupt(svm->vmcb->control.exit_int_info) &&
1285 exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR)
1286 printk(KERN_ERR "%s: unexpected exit_ini_info 0x%x "
1287 "exit_code 0x%x\n",
1288 __FUNCTION__, svm->vmcb->control.exit_int_info,
1289 exit_code);
1290
1291 if (exit_code >= ARRAY_SIZE(svm_exit_handlers)
1292 || !svm_exit_handlers[exit_code]) {
1293 kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
1294 kvm_run->hw.hardware_exit_reason = exit_code;
1295 return 0;
1296 }
1297
1298 return svm_exit_handlers[exit_code](svm, kvm_run);
1299}
1300
1301static void reload_tss(struct kvm_vcpu *vcpu)
1302{
1303 int cpu = raw_smp_processor_id();
1304
1305 struct svm_cpu_data *svm_data = per_cpu(svm_data, cpu);
1306 svm_data->tss_desc->type = 9; /* available 32/64-bit TSS */
1307 load_TR_desc();
1308}
1309
1310static void pre_svm_run(struct vcpu_svm *svm)
1311{
1312 int cpu = raw_smp_processor_id();
1313
1314 struct svm_cpu_data *svm_data = per_cpu(svm_data, cpu);
1315
1316 svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;
1317 if (svm->vcpu.cpu != cpu ||
1318 svm->asid_generation != svm_data->asid_generation)
1319 new_asid(svm, svm_data);
1320}
1321
1322
1323static inline void svm_inject_irq(struct vcpu_svm *svm, int irq)
1324{
1325 struct vmcb_control_area *control;
1326
1327 control = &svm->vmcb->control;
1328 control->int_vector = irq;
1329 control->int_ctl &= ~V_INTR_PRIO_MASK;
1330 control->int_ctl |= V_IRQ_MASK |
1331 ((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT);
1332}
1333
/* set_irq callback: forward to svm_inject_irq() for this vcpu. */
static void svm_set_irq(struct kvm_vcpu *vcpu, int irq)
{
	svm_inject_irq(to_svm(vcpu), irq);
}
1340
1341static void svm_intr_assist(struct kvm_vcpu *vcpu)
1342{
1343 struct vcpu_svm *svm = to_svm(vcpu);
1344 struct vmcb *vmcb = svm->vmcb;
1345 int intr_vector = -1;
1346
1347 if ((vmcb->control.exit_int_info & SVM_EVTINJ_VALID) &&
1348 ((vmcb->control.exit_int_info & SVM_EVTINJ_TYPE_MASK) == 0)) {
1349 intr_vector = vmcb->control.exit_int_info &
1350 SVM_EVTINJ_VEC_MASK;
1351 vmcb->control.exit_int_info = 0;
1352 svm_inject_irq(svm, intr_vector);
1353 return;
1354 }
1355
1356 if (vmcb->control.int_ctl & V_IRQ_MASK)
1357 return;
1358
1359 if (!kvm_cpu_has_interrupt(vcpu))
1360 return;
1361
1362 if (!(vmcb->save.rflags & X86_EFLAGS_IF) ||
1363 (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) ||
1364 (vmcb->control.event_inj & SVM_EVTINJ_VALID)) {
1365 /* unable to deliver irq, set pending irq */
1366 vmcb->control.intercept |= (1ULL << INTERCEPT_VINTR);
1367 svm_inject_irq(svm, 0x0);
1368 return;
1369 }
1370 /* Okay, we can deliver the interrupt: grab it and update PIC state. */
1371 intr_vector = kvm_cpu_get_interrupt(vcpu);
1372 svm_inject_irq(svm, intr_vector);
1373 kvm_timer_intr_post(vcpu, intr_vector);
1374}
1375
1376static void kvm_reput_irq(struct vcpu_svm *svm)
1377{
1378 struct vmcb_control_area *control = &svm->vmcb->control;
1379
1380 if ((control->int_ctl & V_IRQ_MASK)
1381 && !irqchip_in_kernel(svm->vcpu.kvm)) {
1382 control->int_ctl &= ~V_IRQ_MASK;
1383 push_irq(&svm->vcpu, control->int_vector);
1384 }
1385
1386 svm->vcpu.arch.interrupt_window_open =
1387 !(control->int_state & SVM_INTERRUPT_SHADOW_MASK);
1388}
1389
1390static void svm_do_inject_vector(struct vcpu_svm *svm)
1391{
1392 struct kvm_vcpu *vcpu = &svm->vcpu;
1393 int word_index = __ffs(vcpu->arch.irq_summary);
1394 int bit_index = __ffs(vcpu->arch.irq_pending[word_index]);
1395 int irq = word_index * BITS_PER_LONG + bit_index;
1396
1397 clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]);
1398 if (!vcpu->arch.irq_pending[word_index])
1399 clear_bit(word_index, &vcpu->arch.irq_summary);
1400 svm_inject_irq(svm, irq);
1401}
1402
1403static void do_interrupt_requests(struct kvm_vcpu *vcpu,
1404 struct kvm_run *kvm_run)
1405{
1406 struct vcpu_svm *svm = to_svm(vcpu);
1407 struct vmcb_control_area *control = &svm->vmcb->control;
1408
1409 svm->vcpu.arch.interrupt_window_open =
1410 (!(control->int_state & SVM_INTERRUPT_SHADOW_MASK) &&
1411 (svm->vmcb->save.rflags & X86_EFLAGS_IF));
1412
1413 if (svm->vcpu.arch.interrupt_window_open && svm->vcpu.arch.irq_summary)
1414 /*
1415 * If interrupts enabled, and not blocked by sti or mov ss. Good.
1416 */
1417 svm_do_inject_vector(svm);
1418
1419 /*
1420 * Interrupts blocked. Wait for unblock.
1421 */
1422 if (!svm->vcpu.arch.interrupt_window_open &&
1423 (svm->vcpu.arch.irq_summary || kvm_run->request_interrupt_window))
1424 control->intercept |= 1ULL << INTERCEPT_VINTR;
1425 else
1426 control->intercept &= ~(1ULL << INTERCEPT_VINTR);
1427}
1428
/* set_tss_addr callback: nothing to do on SVM; ignores both arguments and returns 0. */
1429static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr)
1430{
1431 return 0;
1432}
1433
1434static void save_db_regs(unsigned long *db_regs)
1435{
1436 asm volatile ("mov %%dr0, %0" : "=r"(db_regs[0]));
1437 asm volatile ("mov %%dr1, %0" : "=r"(db_regs[1]));
1438 asm volatile ("mov %%dr2, %0" : "=r"(db_regs[2]));
1439 asm volatile ("mov %%dr3, %0" : "=r"(db_regs[3]));
1440}
1441
1442static void load_db_regs(unsigned long *db_regs)
1443{
1444 asm volatile ("mov %0, %%dr0" : : "r"(db_regs[0]));
1445 asm volatile ("mov %0, %%dr1" : : "r"(db_regs[1]));
1446 asm volatile ("mov %0, %%dr2" : : "r"(db_regs[2]));
1447 asm volatile ("mov %0, %%dr3" : : "r"(db_regs[3]));
1448}
1449
/* tlb_flush callback: implemented by requesting a new ASID for the vcpu. */
1450static void svm_flush_tlb(struct kvm_vcpu *vcpu)
1451{
1452 force_new_asid(vcpu);
1453}
1454
/* prepare_guest_switch callback: intentionally empty on SVM. */
1455static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu)
1456{
1457}
1458
/*
 * Enter the guest once and return after the next exit.
 *
 * Host state saved before entry and restored afterwards: the MSRs handled
 * by save_host_msrs()/load_host_msrs(), the FS/GS/LDT selectors, CR2, DR6
 * and DR7, and (only when the low byte of the guest's DR7 is non-zero)
 * DR0-DR3.
 *
 * The asm block loads the guest general-purpose registers from
 * vcpu.arch.regs[], executes VMLOAD/VMRUN/VMSAVE with rax/eax holding
 * vmcb_pa, then stores the guest registers back.  RAX and RSP are not
 * loaded here.
 * NOTE(review): presumably VMRUN takes RAX/RSP from the VMCB save area
 * (which has rax/rsp fields) -- confirm against the AMD manual.
 */
1459static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1460{
1461 struct vcpu_svm *svm = to_svm(vcpu);
1462 u16 fs_selector;
1463 u16 gs_selector;
1464 u16 ldt_selector;
1465
1466 pre_svm_run(svm);
1467
1468 save_host_msrs(vcpu);
1469 fs_selector = read_fs();
1470 gs_selector = read_gs();
1471 ldt_selector = read_ldt();
1472 svm->host_cr2 = kvm_read_cr2();
1473 svm->host_dr6 = read_dr6();
1474 svm->host_dr7 = read_dr7();
1475 svm->vmcb->save.cr2 = vcpu->arch.cr2;
1476
/* guest uses breakpoints: clear host DR7, then swap DR0-DR3 */
1477 if (svm->vmcb->save.dr7 & 0xff) {
1478 write_dr7(0);
1479 save_db_regs(svm->host_db_regs);
1480 load_db_regs(svm->db_regs);
1481 }
1482
1483 clgi();
1484
1485 local_irq_enable();
1486
/*
 * %[svm] lives in rax/eax (constraint "a").  It is pushed before rax is
 * reused for vmcb_pa and popped afterwards so the register stores below
 * can still address the vcpu.  rbp/ebp is saved and restored by hand
 * rather than listed as a clobber.
 */
1487 asm volatile (
1488#ifdef CONFIG_X86_64
1489 "push %%rbp; \n\t"
1490#else
1491 "push %%ebp; \n\t"
1492#endif
1493
1494#ifdef CONFIG_X86_64
1495 "mov %c[rbx](%[svm]), %%rbx \n\t"
1496 "mov %c[rcx](%[svm]), %%rcx \n\t"
1497 "mov %c[rdx](%[svm]), %%rdx \n\t"
1498 "mov %c[rsi](%[svm]), %%rsi \n\t"
1499 "mov %c[rdi](%[svm]), %%rdi \n\t"
1500 "mov %c[rbp](%[svm]), %%rbp \n\t"
1501 "mov %c[r8](%[svm]), %%r8 \n\t"
1502 "mov %c[r9](%[svm]), %%r9 \n\t"
1503 "mov %c[r10](%[svm]), %%r10 \n\t"
1504 "mov %c[r11](%[svm]), %%r11 \n\t"
1505 "mov %c[r12](%[svm]), %%r12 \n\t"
1506 "mov %c[r13](%[svm]), %%r13 \n\t"
1507 "mov %c[r14](%[svm]), %%r14 \n\t"
1508 "mov %c[r15](%[svm]), %%r15 \n\t"
1509#else
1510 "mov %c[rbx](%[svm]), %%ebx \n\t"
1511 "mov %c[rcx](%[svm]), %%ecx \n\t"
1512 "mov %c[rdx](%[svm]), %%edx \n\t"
1513 "mov %c[rsi](%[svm]), %%esi \n\t"
1514 "mov %c[rdi](%[svm]), %%edi \n\t"
1515 "mov %c[rbp](%[svm]), %%ebp \n\t"
1516#endif
1517
1518#ifdef CONFIG_X86_64
1519 /* Enter guest mode */
1520 "push %%rax \n\t"
1521 "mov %c[vmcb](%[svm]), %%rax \n\t"
1522 SVM_VMLOAD "\n\t"
1523 SVM_VMRUN "\n\t"
1524 SVM_VMSAVE "\n\t"
1525 "pop %%rax \n\t"
1526#else
1527 /* Enter guest mode */
1528 "push %%eax \n\t"
1529 "mov %c[vmcb](%[svm]), %%eax \n\t"
1530 SVM_VMLOAD "\n\t"
1531 SVM_VMRUN "\n\t"
1532 SVM_VMSAVE "\n\t"
1533 "pop %%eax \n\t"
1534#endif
1535
1536 /* Save guest registers, load host registers */
1537#ifdef CONFIG_X86_64
1538 "mov %%rbx, %c[rbx](%[svm]) \n\t"
1539 "mov %%rcx, %c[rcx](%[svm]) \n\t"
1540 "mov %%rdx, %c[rdx](%[svm]) \n\t"
1541 "mov %%rsi, %c[rsi](%[svm]) \n\t"
1542 "mov %%rdi, %c[rdi](%[svm]) \n\t"
1543 "mov %%rbp, %c[rbp](%[svm]) \n\t"
1544 "mov %%r8, %c[r8](%[svm]) \n\t"
1545 "mov %%r9, %c[r9](%[svm]) \n\t"
1546 "mov %%r10, %c[r10](%[svm]) \n\t"
1547 "mov %%r11, %c[r11](%[svm]) \n\t"
1548 "mov %%r12, %c[r12](%[svm]) \n\t"
1549 "mov %%r13, %c[r13](%[svm]) \n\t"
1550 "mov %%r14, %c[r14](%[svm]) \n\t"
1551 "mov %%r15, %c[r15](%[svm]) \n\t"
1552
1553 "pop %%rbp; \n\t"
1554#else
1555 "mov %%ebx, %c[rbx](%[svm]) \n\t"
1556 "mov %%ecx, %c[rcx](%[svm]) \n\t"
1557 "mov %%edx, %c[rdx](%[svm]) \n\t"
1558 "mov %%esi, %c[rsi](%[svm]) \n\t"
1559 "mov %%edi, %c[rdi](%[svm]) \n\t"
1560 "mov %%ebp, %c[rbp](%[svm]) \n\t"
1561
1562 "pop %%ebp; \n\t"
1563#endif
1564 :
1565 : [svm]"a"(svm),
1566 [vmcb]"i"(offsetof(struct vcpu_svm, vmcb_pa)),
1567 [rbx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RBX])),
1568 [rcx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RCX])),
1569 [rdx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RDX])),
1570 [rsi]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RSI])),
1571 [rdi]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RDI])),
1572 [rbp]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RBP]))
1573#ifdef CONFIG_X86_64
1574 , [r8]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R8])),
1575 [r9]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R9])),
1576 [r10]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R10])),
1577 [r11]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R11])),
1578 [r12]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R12])),
1579 [r13]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R13])),
1580 [r14]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R14])),
1581 [r15]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R15]))
1582#endif
1583 : "cc", "memory"
1584#ifdef CONFIG_X86_64
1585 , "rbx", "rcx", "rdx", "rsi", "rdi"
1586 , "r8", "r9", "r10", "r11" , "r12", "r13", "r14", "r15"
1587#else
1588 , "ebx", "ecx", "edx" , "esi", "edi"
1589#endif
1590 );
1591
/* undo the DR0-DR3 swap done before entry */
1592 if ((svm->vmcb->save.dr7 & 0xff))
1593 load_db_regs(svm->host_db_regs);
1594
1595 vcpu->arch.cr2 = svm->vmcb->save.cr2;
1596
1597 write_dr6(svm->host_dr6);
1598 write_dr7(svm->host_dr7);
1599 kvm_write_cr2(svm->host_cr2);
1600
1601 load_fs(fs_selector);
1602 load_gs(gs_selector);
1603 load_ldt(ldt_selector);
1604 load_host_msrs(vcpu);
1605
1606 reload_tss(vcpu);
1607
1608 local_irq_disable();
1609
1610 stgi();
1611
/* next_rip is only valid for the exit being handled; reset it */
1612 svm->next_rip = 0;
1613}
1614
1615static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root)
1616{
1617 struct vcpu_svm *svm = to_svm(vcpu);
1618
1619 svm->vmcb->save.cr3 = root;
1620 force_new_asid(vcpu);
1621
1622 if (vcpu->fpu_active) {
1623 svm->vmcb->control.intercept_exceptions |= (1 << NM_VECTOR);
1624 svm->vmcb->save.cr0 |= X86_CR0_TS;
1625 vcpu->fpu_active = 0;
1626 }
1627}
1628
1629static int is_disabled(void)
1630{
1631 u64 vm_cr;
1632
1633 rdmsrl(MSR_VM_CR, vm_cr);
1634 if (vm_cr & (1 << SVM_VM_CR_SVM_DISABLE))
1635 return 1;
1636
1637 return 0;
1638}
1639
/*
 * Write the three-byte VMMCALL instruction into the buffer at hypercall.
 * The vcpu argument is not used.
 */
static void
svm_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
{
	/* VMMCALL opcode bytes */
	static const unsigned char vmmcall[] = { 0x0f, 0x01, 0xd9 };
	unsigned int i;

	for (i = 0; i < sizeof(vmmcall); ++i)
		hypercall[i] = vmmcall[i];
}
1650
/*
 * check_processor_compatibility callback: always reports success by
 * storing 0 through rtn, which must point to an int.
 */
static void svm_check_processor_compat(void *rtn)
{
	int *result = rtn;

	*result = 0;
}
1655
/*
 * SVM implementation of the kvm_x86_ops callback table; registered with
 * the generic code by svm_init() through kvm_init().
 */
1656static struct kvm_x86_ops svm_x86_ops = {
1657 .cpu_has_kvm_support = has_svm,
1658 .disabled_by_bios = is_disabled,
1659 .hardware_setup = svm_hardware_setup,
1660 .hardware_unsetup = svm_hardware_unsetup,
1661 .check_processor_compatibility = svm_check_processor_compat,
1662 .hardware_enable = svm_hardware_enable,
1663 .hardware_disable = svm_hardware_disable,
1664
1665 .vcpu_create = svm_create_vcpu,
1666 .vcpu_free = svm_free_vcpu,
1667 .vcpu_reset = svm_vcpu_reset,
1668
1669 .prepare_guest_switch = svm_prepare_guest_switch,
1670 .vcpu_load = svm_vcpu_load,
1671 .vcpu_put = svm_vcpu_put,
1672 .vcpu_decache = svm_vcpu_decache,
1673
1674 .set_guest_debug = svm_guest_debug,
1675 .get_msr = svm_get_msr,
1676 .set_msr = svm_set_msr,
1677 .get_segment_base = svm_get_segment_base,
1678 .get_segment = svm_get_segment,
1679 .set_segment = svm_set_segment,
1680 .get_cs_db_l_bits = kvm_get_cs_db_l_bits,
1681 .decache_cr4_guest_bits = svm_decache_cr4_guest_bits,
1682 .set_cr0 = svm_set_cr0,
1683 .set_cr3 = svm_set_cr3,
1684 .set_cr4 = svm_set_cr4,
1685 .set_efer = svm_set_efer,
1686 .get_idt = svm_get_idt,
1687 .set_idt = svm_set_idt,
1688 .get_gdt = svm_get_gdt,
1689 .set_gdt = svm_set_gdt,
1690 .get_dr = svm_get_dr,
1691 .set_dr = svm_set_dr,
1692 .cache_regs = svm_cache_regs,
1693 .decache_regs = svm_decache_regs,
1694 .get_rflags = svm_get_rflags,
1695 .set_rflags = svm_set_rflags,
1696
1697 .tlb_flush = svm_flush_tlb,
1698
1699 .run = svm_vcpu_run,
1700 .handle_exit = handle_exit,
1701 .skip_emulated_instruction = skip_emulated_instruction,
1702 .patch_hypercall = svm_patch_hypercall,
1703 .get_irq = svm_get_irq,
1704 .set_irq = svm_set_irq,
1705 .queue_exception = svm_queue_exception,
1706 .exception_injected = svm_exception_injected,
/* svm_intr_assist and do_interrupt_requests are the two irq injection paths */
1707 .inject_pending_irq = svm_intr_assist,
1708 .inject_pending_vectors = do_interrupt_requests,
1709
1710 .set_tss_addr = svm_set_tss_addr,
1711};
1712
1713static int __init svm_init(void)
1714{
1715 return kvm_init(&svm_x86_ops, sizeof(struct vcpu_svm),
1716 THIS_MODULE);
1717}
1718
/* Module exit point: undo svm_init() by calling kvm_exit(). */
1719static void __exit svm_exit(void)
1720{
1721 kvm_exit();
1722}
1723
/* Register svm_init()/svm_exit() as the module's load and unload hooks. */
1724module_init(svm_init)
1725module_exit(svm_exit)
diff --git a/arch/x86/kvm/svm.h b/arch/x86/kvm/svm.h
new file mode 100644
index 000000000000..5fd50491b555
--- /dev/null
+++ b/arch/x86/kvm/svm.h
@@ -0,0 +1,325 @@
1#ifndef __SVM_H
2#define __SVM_H
3
/*
 * Bit numbers within the 64-bit vmcb_control_area.intercept field; used as
 * (1ULL << INTERCEPT_xxx).  Values are implicit, starting at 0, so the
 * order of the enumerators is significant.
 */
4enum {
5 INTERCEPT_INTR,
6 INTERCEPT_NMI,
7 INTERCEPT_SMI,
8 INTERCEPT_INIT,
9 INTERCEPT_VINTR,
10 INTERCEPT_SELECTIVE_CR0,
11 INTERCEPT_STORE_IDTR,
12 INTERCEPT_STORE_GDTR,
13 INTERCEPT_STORE_LDTR,
14 INTERCEPT_STORE_TR,
15 INTERCEPT_LOAD_IDTR,
16 INTERCEPT_LOAD_GDTR,
17 INTERCEPT_LOAD_LDTR,
18 INTERCEPT_LOAD_TR,
19 INTERCEPT_RDTSC,
20 INTERCEPT_RDPMC,
21 INTERCEPT_PUSHF,
22 INTERCEPT_POPF,
23 INTERCEPT_CPUID,
24 INTERCEPT_RSM,
25 INTERCEPT_IRET,
26 INTERCEPT_INTn,
27 INTERCEPT_INVD,
28 INTERCEPT_PAUSE,
29 INTERCEPT_HLT,
30 INTERCEPT_INVLPG,
31 INTERCEPT_INVLPGA,
32 INTERCEPT_IOIO_PROT,
33 INTERCEPT_MSR_PROT,
34 INTERCEPT_TASK_SWITCH,
35 INTERCEPT_FERR_FREEZE,
36 INTERCEPT_SHUTDOWN,
37 INTERCEPT_VMRUN,
38 INTERCEPT_VMMCALL,
39 INTERCEPT_VMLOAD,
40 INTERCEPT_VMSAVE,
41 INTERCEPT_STGI,
42 INTERCEPT_CLGI,
43 INTERCEPT_SKINIT,
44 INTERCEPT_RDTSCP,
45 INTERCEPT_ICEBP,
46 INTERCEPT_WBINVD,
47 INTERCEPT_MONITOR,
48 INTERCEPT_MWAIT,
49 INTERCEPT_MWAIT_COND,
50};
51
52
/*
 * Control area of the VMCB.  The struct is packed, so field offsets are
 * fixed purely by declaration order; the reserved_N arrays are padding.
 * The fields add up to 1024 bytes.
 * NOTE(review): presumably mirrors the hardware-defined layout -- confirm
 * against the AMD manual before reordering or resizing anything.
 */
53struct __attribute__ ((__packed__)) vmcb_control_area {
54 u16 intercept_cr_read;
55 u16 intercept_cr_write;
56 u16 intercept_dr_read;
57 u16 intercept_dr_write;
58 u32 intercept_exceptions;
/* bit numbers for this field are the INTERCEPT_xxx enumerators */
59 u64 intercept;
60 u8 reserved_1[44];
61 u64 iopm_base_pa;
62 u64 msrpm_base_pa;
63 u64 tsc_offset;
64 u32 asid;
65 u8 tlb_ctl;
66 u8 reserved_2[3];
67 u32 int_ctl;
68 u32 int_vector;
69 u32 int_state;
70 u8 reserved_3[4];
71 u32 exit_code;
72 u32 exit_code_hi;
73 u64 exit_info_1;
74 u64 exit_info_2;
75 u32 exit_int_info;
76 u32 exit_int_info_err;
77 u64 nested_ctl;
78 u8 reserved_4[16];
79 u32 event_inj;
80 u32 event_inj_err;
81 u64 nested_cr3;
82 u64 lbr_ctl;
83 u8 reserved_5[832];
84};
85
86
/* Values for vmcb_control_area.tlb_ctl */
87#define TLB_CONTROL_DO_NOTHING 0
88#define TLB_CONTROL_FLUSH_ALL_ASID 1
89
/* Bit fields of vmcb_control_area.int_ctl */
90#define V_TPR_MASK 0x0f
91
92#define V_IRQ_SHIFT 8
93#define V_IRQ_MASK (1 << V_IRQ_SHIFT)
94
95#define V_INTR_PRIO_SHIFT 16
96#define V_INTR_PRIO_MASK (0x0f << V_INTR_PRIO_SHIFT)
97
98#define V_IGN_TPR_SHIFT 20
99#define V_IGN_TPR_MASK (1 << V_IGN_TPR_SHIFT)
100
101#define V_INTR_MASKING_SHIFT 24
102#define V_INTR_MASKING_MASK (1 << V_INTR_MASKING_SHIFT)
103
/* Bit 0 of vmcb_control_area.int_state */
104#define SVM_INTERRUPT_SHADOW_MASK 1
105
/* NOTE(review): presumably decode exit_info_1 on SVM_EXIT_IOIO -- confirm */
106#define SVM_IOIO_STR_SHIFT 2
107#define SVM_IOIO_REP_SHIFT 3
108#define SVM_IOIO_SIZE_SHIFT 4
109#define SVM_IOIO_ASIZE_SHIFT 7
110
111#define SVM_IOIO_TYPE_MASK 1
112#define SVM_IOIO_STR_MASK (1 << SVM_IOIO_STR_SHIFT)
113#define SVM_IOIO_REP_MASK (1 << SVM_IOIO_REP_SHIFT)
114#define SVM_IOIO_SIZE_MASK (7 << SVM_IOIO_SIZE_SHIFT)
115#define SVM_IOIO_ASIZE_MASK (7 << SVM_IOIO_ASIZE_SHIFT)
116
/* One segment register as stored in the VMCB save area (packed, 16 bytes). */
117struct __attribute__ ((__packed__)) vmcb_seg {
118 u16 selector;
119 u16 attrib;
120 u32 limit;
121 u64 base;
122};
123
/*
 * State-save area of the VMCB: segment registers followed by control,
 * debug and general state.  Packed; offsets are fixed by declaration order
 * and the reserved_N arrays are padding.
 * NOTE(review): presumably mirrors the hardware-defined layout -- confirm
 * against the AMD manual before changing.
 */
124struct __attribute__ ((__packed__)) vmcb_save_area {
125 struct vmcb_seg es;
126 struct vmcb_seg cs;
127 struct vmcb_seg ss;
128 struct vmcb_seg ds;
129 struct vmcb_seg fs;
130 struct vmcb_seg gs;
131 struct vmcb_seg gdtr;
132 struct vmcb_seg ldtr;
133 struct vmcb_seg idtr;
134 struct vmcb_seg tr;
135 u8 reserved_1[43];
136 u8 cpl;
137 u8 reserved_2[4];
138 u64 efer;
139 u8 reserved_3[112];
140 u64 cr4;
141 u64 cr3;
142 u64 cr0;
143 u64 dr7;
144 u64 dr6;
145 u64 rflags;
146 u64 rip;
147 u8 reserved_4[88];
148 u64 rsp;
149 u8 reserved_5[24];
150 u64 rax;
151 u64 star;
152 u64 lstar;
153 u64 cstar;
154 u64 sfmask;
155 u64 kernel_gs_base;
156 u64 sysenter_cs;
157 u64 sysenter_esp;
158 u64 sysenter_eip;
159 u64 cr2;
160 u8 reserved_6[32];
161 u64 g_pat;
162 u64 dbgctl;
163 u64 br_from;
164 u64 br_to;
165 u64 last_excp_from;
166 u64 last_excp_to;
167};
168
/* Complete VMCB: the control area immediately followed by the save area. */
169struct __attribute__ ((__packed__)) vmcb {
170 struct vmcb_control_area control;
171 struct vmcb_save_area save;
172};
173
/* CPUID function and feature-bit position used to detect SVM */
174#define SVM_CPUID_FEATURE_SHIFT 2
175#define SVM_CPUID_FUNC 0x8000000a
176
/* SVM-related MSR numbers and the SVME bit of EFER */
177#define MSR_EFER_SVME_MASK (1ULL << 12)
178#define MSR_VM_CR 0xc0010114
179#define MSR_VM_HSAVE_PA 0xc0010117ULL
180
/* Bit number in MSR_VM_CR; tested as (1 << SVM_VM_CR_SVM_DISABLE) */
181#define SVM_VM_CR_SVM_DISABLE 4
182
/* NOTE(review): presumably bit positions within vmcb_seg.attrib -- confirm */
183#define SVM_SELECTOR_S_SHIFT 4
184#define SVM_SELECTOR_DPL_SHIFT 5
185#define SVM_SELECTOR_P_SHIFT 7
186#define SVM_SELECTOR_AVL_SHIFT 8
187#define SVM_SELECTOR_L_SHIFT 9
188#define SVM_SELECTOR_DB_SHIFT 10
189#define SVM_SELECTOR_G_SHIFT 11
190
191#define SVM_SELECTOR_TYPE_MASK (0xf)
192#define SVM_SELECTOR_S_MASK (1 << SVM_SELECTOR_S_SHIFT)
193#define SVM_SELECTOR_DPL_MASK (3 << SVM_SELECTOR_DPL_SHIFT)
194#define SVM_SELECTOR_P_MASK (1 << SVM_SELECTOR_P_SHIFT)
195#define SVM_SELECTOR_AVL_MASK (1 << SVM_SELECTOR_AVL_SHIFT)
196#define SVM_SELECTOR_L_MASK (1 << SVM_SELECTOR_L_SHIFT)
197#define SVM_SELECTOR_DB_MASK (1 << SVM_SELECTOR_DB_SHIFT)
198#define SVM_SELECTOR_G_MASK (1 << SVM_SELECTOR_G_SHIFT)
199
200#define SVM_SELECTOR_WRITE_MASK (1 << 1)
201#define SVM_SELECTOR_READ_MASK SVM_SELECTOR_WRITE_MASK
202#define SVM_SELECTOR_CODE_MASK (1 << 3)
203
/*
 * One bit per control register number (bit n for CRn).
 * NOTE(review): presumably for intercept_cr_read/intercept_cr_write -- confirm
 */
204#define INTERCEPT_CR0_MASK 1
205#define INTERCEPT_CR3_MASK (1 << 3)
206#define INTERCEPT_CR4_MASK (1 << 4)
207#define INTERCEPT_CR8_MASK (1 << 8)
208
/*
 * One bit per debug register number (bit n for DRn).
 * NOTE(review): presumably for intercept_dr_read/intercept_dr_write -- confirm
 */
209#define INTERCEPT_DR0_MASK 1
210#define INTERCEPT_DR1_MASK (1 << 1)
211#define INTERCEPT_DR2_MASK (1 << 2)
212#define INTERCEPT_DR3_MASK (1 << 3)
213#define INTERCEPT_DR4_MASK (1 << 4)
214#define INTERCEPT_DR5_MASK (1 << 5)
215#define INTERCEPT_DR6_MASK (1 << 6)
216#define INTERCEPT_DR7_MASK (1 << 7)
217
/*
 * Encoding of the event_inj and exit_int_info VMCB fields: bits 7:0 hold
 * the vector, bits 10:8 the event type, bit 11 flags a valid error code
 * and bit 31 marks the whole field valid.
 */
#define SVM_EVTINJ_VEC_MASK 0xff

#define SVM_EVTINJ_TYPE_SHIFT 8
#define SVM_EVTINJ_TYPE_MASK (7 << SVM_EVTINJ_TYPE_SHIFT)

#define SVM_EVTINJ_TYPE_INTR (0 << SVM_EVTINJ_TYPE_SHIFT)
#define SVM_EVTINJ_TYPE_NMI (2 << SVM_EVTINJ_TYPE_SHIFT)
#define SVM_EVTINJ_TYPE_EXEPT (3 << SVM_EVTINJ_TYPE_SHIFT)
#define SVM_EVTINJ_TYPE_SOFT (4 << SVM_EVTINJ_TYPE_SHIFT)

/*
 * Unsigned on purpose: (1 << 31) overflows a signed int and would
 * sign-extend when combined with a 64-bit operand.
 */
#define SVM_EVTINJ_VALID (1U << 31)
#define SVM_EVTINJ_VALID_ERR (1 << 11)

/* exit_int_info uses the same encoding as event_inj */
#define SVM_EXITINTINFO_VEC_MASK SVM_EVTINJ_VEC_MASK

#define SVM_EXITINTINFO_TYPE_INTR SVM_EVTINJ_TYPE_INTR
#define SVM_EXITINTINFO_TYPE_NMI SVM_EVTINJ_TYPE_NMI
#define SVM_EXITINTINFO_TYPE_EXEPT SVM_EVTINJ_TYPE_EXEPT
#define SVM_EXITINTINFO_TYPE_SOFT SVM_EVTINJ_TYPE_SOFT

#define SVM_EXITINTINFO_VALID SVM_EVTINJ_VALID
#define SVM_EXITINTINFO_VALID_ERR SVM_EVTINJ_VALID_ERR
240
/*
 * Exit codes as reported in vmcb_control_area.exit_code; they are also the
 * indices of svm_exit_handlers[].  An intercepted exception is reported as
 * SVM_EXIT_EXCP_BASE + vector.
 */
241#define SVM_EXIT_READ_CR0 0x000
242#define SVM_EXIT_READ_CR3 0x003
243#define SVM_EXIT_READ_CR4 0x004
244#define SVM_EXIT_READ_CR8 0x008
245#define SVM_EXIT_WRITE_CR0 0x010
246#define SVM_EXIT_WRITE_CR3 0x013
247#define SVM_EXIT_WRITE_CR4 0x014
248#define SVM_EXIT_WRITE_CR8 0x018
249#define SVM_EXIT_READ_DR0 0x020
250#define SVM_EXIT_READ_DR1 0x021
251#define SVM_EXIT_READ_DR2 0x022
252#define SVM_EXIT_READ_DR3 0x023
253#define SVM_EXIT_READ_DR4 0x024
254#define SVM_EXIT_READ_DR5 0x025
255#define SVM_EXIT_READ_DR6 0x026
256#define SVM_EXIT_READ_DR7 0x027
257#define SVM_EXIT_WRITE_DR0 0x030
258#define SVM_EXIT_WRITE_DR1 0x031
259#define SVM_EXIT_WRITE_DR2 0x032
260#define SVM_EXIT_WRITE_DR3 0x033
261#define SVM_EXIT_WRITE_DR4 0x034
262#define SVM_EXIT_WRITE_DR5 0x035
263#define SVM_EXIT_WRITE_DR6 0x036
264#define SVM_EXIT_WRITE_DR7 0x037
265#define SVM_EXIT_EXCP_BASE 0x040
266#define SVM_EXIT_INTR 0x060
267#define SVM_EXIT_NMI 0x061
268#define SVM_EXIT_SMI 0x062
269#define SVM_EXIT_INIT 0x063
270#define SVM_EXIT_VINTR 0x064
271#define SVM_EXIT_CR0_SEL_WRITE 0x065
272#define SVM_EXIT_IDTR_READ 0x066
273#define SVM_EXIT_GDTR_READ 0x067
274#define SVM_EXIT_LDTR_READ 0x068
275#define SVM_EXIT_TR_READ 0x069
276#define SVM_EXIT_IDTR_WRITE 0x06a
277#define SVM_EXIT_GDTR_WRITE 0x06b
278#define SVM_EXIT_LDTR_WRITE 0x06c
279#define SVM_EXIT_TR_WRITE 0x06d
280#define SVM_EXIT_RDTSC 0x06e
281#define SVM_EXIT_RDPMC 0x06f
282#define SVM_EXIT_PUSHF 0x070
283#define SVM_EXIT_POPF 0x071
284#define SVM_EXIT_CPUID 0x072
285#define SVM_EXIT_RSM 0x073
286#define SVM_EXIT_IRET 0x074
287#define SVM_EXIT_SWINT 0x075
288#define SVM_EXIT_INVD 0x076
289#define SVM_EXIT_PAUSE 0x077
290#define SVM_EXIT_HLT 0x078
291#define SVM_EXIT_INVLPG 0x079
292#define SVM_EXIT_INVLPGA 0x07a
293#define SVM_EXIT_IOIO 0x07b
294#define SVM_EXIT_MSR 0x07c
295#define SVM_EXIT_TASK_SWITCH 0x07d
296#define SVM_EXIT_FERR_FREEZE 0x07e
297#define SVM_EXIT_SHUTDOWN 0x07f
298#define SVM_EXIT_VMRUN 0x080
299#define SVM_EXIT_VMMCALL 0x081
300#define SVM_EXIT_VMLOAD 0x082
301#define SVM_EXIT_VMSAVE 0x083
302#define SVM_EXIT_STGI 0x084
303#define SVM_EXIT_CLGI 0x085
304#define SVM_EXIT_SKINIT 0x086
305#define SVM_EXIT_RDTSCP 0x087
306#define SVM_EXIT_ICEBP 0x088
307#define SVM_EXIT_WBINVD 0x089
308#define SVM_EXIT_MONITOR 0x08a
309#define SVM_EXIT_MWAIT 0x08b
310#define SVM_EXIT_MWAIT_COND 0x08c
311#define SVM_EXIT_NPF 0x400
312
/* compared against the u32 exit_code, i.e. matches 0xffffffff */
313#define SVM_EXIT_ERR -1
314
315#define SVM_CR0_SELECTIVE_MASK (1 << 3 | 1) /* TS and MP */
316
/*
 * SVM instructions spelled as raw opcode bytes so they can be pasted into
 * inline asm strings.
 * NOTE(review): presumably because the assembler lacked the mnemonics -- confirm
 */
317#define SVM_VMLOAD ".byte 0x0f, 0x01, 0xda"
318#define SVM_VMRUN ".byte 0x0f, 0x01, 0xd8"
319#define SVM_VMSAVE ".byte 0x0f, 0x01, 0xdb"
320#define SVM_CLGI ".byte 0x0f, 0x01, 0xdd"
321#define SVM_STGI ".byte 0x0f, 0x01, 0xdc"
322#define SVM_INVLPGA ".byte 0x0f, 0x01, 0xdf"
323
324#endif
325
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
new file mode 100644
index 000000000000..fc494aff5d8b
--- /dev/null
+++ b/arch/x86/kvm/vmx.c
@@ -0,0 +1,2671 @@
1/*
2 * Kernel-based Virtual Machine driver for Linux
3 *
4 * This module enables machines with Intel VT-x extensions to run virtual
5 * machines without emulation or binary translation.
6 *
7 * Copyright (C) 2006 Qumranet, Inc.
8 *
9 * Authors:
10 * Avi Kivity <avi@qumranet.com>
11 * Yaniv Kamay <yaniv@qumranet.com>
12 *
13 * This work is licensed under the terms of the GNU GPL, version 2. See
14 * the COPYING file in the top-level directory.
15 *
16 */
17
18#include "irq.h"
19#include "vmx.h"
20#include "segment_descriptor.h"
21#include "mmu.h"
22
23#include <linux/kvm_host.h>
24#include <linux/module.h>
25#include <linux/kernel.h>
26#include <linux/mm.h>
27#include <linux/highmem.h>
28#include <linux/sched.h>
29#include <linux/moduleparam.h>
30
31#include <asm/io.h>
32#include <asm/desc.h>
33
34MODULE_AUTHOR("Qumranet");
35MODULE_LICENSE("GPL");
36
/*
 * Module parameter, on by default; permission 0 means it is not exposed in
 * sysfs.
 * NOTE(review): presumably lets guest page faults bypass the host -- confirm
 * against its users further down the file.
 */
37static int bypass_guest_pf = 1;
38module_param(bypass_guest_pf, bool, 0);
39
/*
 * Header of a VMCS region: revision id and abort indicator, followed by a
 * variable amount of data (zero-length trailing array).
 */
40struct vmcs {
41 u32 revision_id;
42 u32 abort;
43 char data[0];
44};
45
/*
 * Per-vcpu VMX state.  The generic struct kvm_vcpu is embedded (not
 * pointed to) so to_vmx() can recover this structure with container_of().
 */
46struct vcpu_vmx {
47 struct kvm_vcpu vcpu;
/* cleared by vcpu_clear() after the VMCS has been VMCLEARed */
48 int launched;
49 u8 fail;
50 u32 idt_vectoring_info;
/* guest_msrs[] holds nmsrs entries; searched by __find_msr_index() */
51 struct kvm_msr_entry *guest_msrs;
52 struct kvm_msr_entry *host_msrs;
53 int nmsrs;
54 int save_nmsrs;
55 int msr_offset_efer;
56#ifdef CONFIG_X86_64
57 int msr_offset_kernel_gs_base;
58#endif
59 struct vmcs *vmcs;
/* NOTE(review): presumably host state saved while guest state is loaded -- confirm */
60 struct {
61 int loaded;
62 u16 fs_sel, gs_sel, ldt_sel;
63 int gs_ldt_reload_needed;
64 int fs_reload_needed;
65 int guest_efer_loaded;
66 } host_state;
/* NOTE(review): presumably real-mode interrupt injection bookkeeping -- confirm */
67 struct {
68 struct {
69 bool pending;
70 u8 vector;
71 unsigned rip;
72 } irq;
73 } rmode;
74};
75
/* Convert a generic vcpu pointer to the struct vcpu_vmx that embeds it. */
76static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
77{
78 return container_of(vcpu, struct vcpu_vmx, vcpu);
79}
80
/* forward declaration; defined later in this file */
81static int init_rmode_tss(struct kvm *kvm);
82
/* per-cpu VMCS pointers; current_vmcs is reset to NULL by __vcpu_clear() */
83static DEFINE_PER_CPU(struct vmcs *, vmxarea);
84static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
85
/* NOTE(review): presumably the pages backing the VMX I/O bitmaps A and B -- confirm */
86static struct page *vmx_io_bitmap_a;
87static struct page *vmx_io_bitmap_b;
88
/*
 * Global VMCS configuration.  The *_ctrl fields are tested by the
 * cpu_has_*() helpers below to decide which features are in use.
 * NOTE(review): presumably filled in once during hardware setup -- confirm
 */
89static struct vmcs_config {
90 int size;
91 int order;
92 u32 revision_id;
93 u32 pin_based_exec_ctrl;
94 u32 cpu_based_exec_ctrl;
95 u32 cpu_based_2nd_exec_ctrl;
96 u32 vmexit_ctrl;
97 u32 vmentry_ctrl;
98} vmcs_config;
99
/*
 * Build one kvm_vmx_segment_fields[] entry, indexed by VCPU_SREG_<seg>,
 * from the GUEST_<seg>_{SELECTOR,BASE,LIMIT,AR_BYTES} field names.
 */
100#define VMX_SEGMENT_FIELD(seg) \
101 [VCPU_SREG_##seg] = { \
102 .selector = GUEST_##seg##_SELECTOR, \
103 .base = GUEST_##seg##_BASE, \
104 .limit = GUEST_##seg##_LIMIT, \
105 .ar_bytes = GUEST_##seg##_AR_BYTES, \
106 }
107
/* For each segment register, the four VMCS field encodings that describe it. */
108static struct kvm_vmx_segment_field {
109 unsigned selector;
110 unsigned base;
111 unsigned limit;
112 unsigned ar_bytes;
113} kvm_vmx_segment_fields[] = {
114 VMX_SEGMENT_FIELD(CS),
115 VMX_SEGMENT_FIELD(DS),
116 VMX_SEGMENT_FIELD(ES),
117 VMX_SEGMENT_FIELD(FS),
118 VMX_SEGMENT_FIELD(GS),
119 VMX_SEGMENT_FIELD(SS),
120 VMX_SEGMENT_FIELD(TR),
121 VMX_SEGMENT_FIELD(LDTR),
122};
123
124/*
125 * Keep MSR_K6_STAR at the end, as setup_msrs() will try to optimize it
126 * away by decrementing the array size.
127 */
/* The 64-bit-only MSRs are compiled in only under CONFIG_X86_64. */
128static const u32 vmx_msr_index[] = {
129#ifdef CONFIG_X86_64
130 MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, MSR_KERNEL_GS_BASE,
131#endif
132 MSR_EFER, MSR_K6_STAR,
133};
/* number of entries in vmx_msr_index[] */
134#define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index)
135
136static void load_msrs(struct kvm_msr_entry *e, int n)
137{
138 int i;
139
140 for (i = 0; i < n; ++i)
141 wrmsrl(e[i].index, e[i].data);
142}
143
144static void save_msrs(struct kvm_msr_entry *e, int n)
145{
146 int i;
147
148 for (i = 0; i < n; ++i)
149 rdmsrl(e[i].index, e[i].data);
150}
151
152static inline int is_page_fault(u32 intr_info)
153{
154 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
155 INTR_INFO_VALID_MASK)) ==
156 (INTR_TYPE_EXCEPTION | PF_VECTOR | INTR_INFO_VALID_MASK);
157}
158
159static inline int is_no_device(u32 intr_info)
160{
161 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
162 INTR_INFO_VALID_MASK)) ==
163 (INTR_TYPE_EXCEPTION | NM_VECTOR | INTR_INFO_VALID_MASK);
164}
165
166static inline int is_invalid_opcode(u32 intr_info)
167{
168 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
169 INTR_INFO_VALID_MASK)) ==
170 (INTR_TYPE_EXCEPTION | UD_VECTOR | INTR_INFO_VALID_MASK);
171}
172
173static inline int is_external_interrupt(u32 intr_info)
174{
175 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
176 == (INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK);
177}
178
179static inline int cpu_has_vmx_tpr_shadow(void)
180{
181 return (vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW);
182}
183
/* 1 if the TPR shadow is available and this VM has an in-kernel irqchip, else 0. */
static inline int vm_need_tpr_shadow(struct kvm *kvm)
{
	if (!cpu_has_vmx_tpr_shadow())
		return 0;

	return irqchip_in_kernel(kvm) ? 1 : 0;
}
188
189static inline int cpu_has_secondary_exec_ctrls(void)
190{
191 return (vmcs_config.cpu_based_exec_ctrl &
192 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS);
193}
194
195static inline int cpu_has_vmx_virtualize_apic_accesses(void)
196{
197 return (vmcs_config.cpu_based_2nd_exec_ctrl &
198 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
199}
200
/*
 * 1 if APIC-access virtualization is available and this VM has an
 * in-kernel irqchip, else 0.
 */
static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm)
{
	if (!cpu_has_vmx_virtualize_apic_accesses())
		return 0;

	return irqchip_in_kernel(kvm) ? 1 : 0;
}
206
207static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
208{
209 int i;
210
211 for (i = 0; i < vmx->nmsrs; ++i)
212 if (vmx->guest_msrs[i].index == msr)
213 return i;
214 return -1;
215}
216
217static struct kvm_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr)
218{
219 int i;
220
221 i = __find_msr_index(vmx, msr);
222 if (i >= 0)
223 return &vmx->guest_msrs[i];
224 return NULL;
225}
226
227static void vmcs_clear(struct vmcs *vmcs)
228{
229 u64 phys_addr = __pa(vmcs);
230 u8 error;
231
232 asm volatile (ASM_VMX_VMCLEAR_RAX "; setna %0"
233 : "=g"(error) : "a"(&phys_addr), "m"(phys_addr)
234 : "cc", "memory");
235 if (error)
236 printk(KERN_ERR "kvm: vmclear fail: %p/%llx\n",
237 vmcs, phys_addr);
238}
239
240static void __vcpu_clear(void *arg)
241{
242 struct vcpu_vmx *vmx = arg;
243 int cpu = raw_smp_processor_id();
244
245 if (vmx->vcpu.cpu == cpu)
246 vmcs_clear(vmx->vmcs);
247 if (per_cpu(current_vmcs, cpu) == vmx->vmcs)
248 per_cpu(current_vmcs, cpu) = NULL;
249 rdtscll(vmx->vcpu.arch.host_tsc);
250}
251
252static void vcpu_clear(struct vcpu_vmx *vmx)
253{
254 if (vmx->vcpu.cpu == -1)
255 return;
256 smp_call_function_single(vmx->vcpu.cpu, __vcpu_clear, vmx, 0, 1);
257 vmx->launched = 0;
258}
259
260static unsigned long vmcs_readl(unsigned long field)
261{
262 unsigned long value;
263
264 asm volatile (ASM_VMX_VMREAD_RDX_RAX
265 : "=a"(value) : "d"(field) : "cc");
266 return value;
267}
268
269static u16 vmcs_read16(unsigned long field)
270{
271 return vmcs_readl(field);
272}
273
274static u32 vmcs_read32(unsigned long field)
275{
276 return vmcs_readl(field);
277}
278
279static u64 vmcs_read64(unsigned long field)
280{
281#ifdef CONFIG_X86_64
282 return vmcs_readl(field);
283#else
284 return vmcs_readl(field) | ((u64)vmcs_readl(field+1) << 32);
285#endif
286}
287
288static noinline void vmwrite_error(unsigned long field, unsigned long value)
289{
290 printk(KERN_ERR "vmwrite error: reg %lx value %lx (err %d)\n",
291 field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
292 dump_stack();
293}
294
295static void vmcs_writel(unsigned long field, unsigned long value)
296{
297 u8 error;
298
299 asm volatile (ASM_VMX_VMWRITE_RAX_RDX "; setna %0"
300 : "=q"(error) : "a"(value), "d"(field) : "cc");
301 if (unlikely(error))
302 vmwrite_error(field, value);
303}
304
305static void vmcs_write16(unsigned long field, u16 value)
306{
307 vmcs_writel(field, value);
308}
309
310static void vmcs_write32(unsigned long field, u32 value)
311{
312 vmcs_writel(field, value);
313}
314
315static void vmcs_write64(unsigned long field, u64 value)
316{
317#ifdef CONFIG_X86_64
318 vmcs_writel(field, value);
319#else
320 vmcs_writel(field, value);
321 asm volatile ("");
322 vmcs_writel(field+1, value >> 32);
323#endif
324}
325
326static void vmcs_clear_bits(unsigned long field, u32 mask)
327{
328 vmcs_writel(field, vmcs_readl(field) & ~mask);
329}
330
331static void vmcs_set_bits(unsigned long field, u32 mask)
332{
333 vmcs_writel(field, vmcs_readl(field) | mask);
334}
335
336static void update_exception_bitmap(struct kvm_vcpu *vcpu)
337{
338 u32 eb;
339
340 eb = (1u << PF_VECTOR) | (1u << UD_VECTOR);
341 if (!vcpu->fpu_active)
342 eb |= 1u << NM_VECTOR;
343 if (vcpu->guest_debug.enabled)
344 eb |= 1u << 1;
345 if (vcpu->arch.rmode.active)
346 eb = ~0;
347 vmcs_write32(EXCEPTION_BITMAP, eb);
348}
349
350static void reload_tss(void)
351{
352#ifndef CONFIG_X86_64
353
354 /*
355 * VT restores TR but not its size. Useless.
356 */
357 struct descriptor_table gdt;
358 struct segment_descriptor *descs;
359
360 get_gdt(&gdt);
361 descs = (void *)gdt.base;
362 descs[GDT_ENTRY_TSS].type = 9; /* available TSS */
363 load_TR_desc();
364#endif
365}
366
367static void load_transition_efer(struct vcpu_vmx *vmx)
368{
369 int efer_offset = vmx->msr_offset_efer;
370 u64 host_efer = vmx->host_msrs[efer_offset].data;
371 u64 guest_efer = vmx->guest_msrs[efer_offset].data;
372 u64 ignore_bits;
373
374 if (efer_offset < 0)
375 return;
376 /*
377 * NX is emulated; LMA and LME handled by hardware; SCE meaninless
378 * outside long mode
379 */
380 ignore_bits = EFER_NX | EFER_SCE;
381#ifdef CONFIG_X86_64
382 ignore_bits |= EFER_LMA | EFER_LME;
383 /* SCE is meaningful only in long mode on Intel */
384 if (guest_efer & EFER_LMA)
385 ignore_bits &= ~(u64)EFER_SCE;
386#endif
387 if ((guest_efer & ~ignore_bits) == (host_efer & ~ignore_bits))
388 return;
389
390 vmx->host_state.guest_efer_loaded = 1;
391 guest_efer &= ~ignore_bits;
392 guest_efer |= host_efer & ignore_bits;
393 wrmsrl(MSR_EFER, guest_efer);
394 vmx->vcpu.stat.efer_reload++;
395}
396
397static void reload_host_efer(struct vcpu_vmx *vmx)
398{
399 if (vmx->host_state.guest_efer_loaded) {
400 vmx->host_state.guest_efer_loaded = 0;
401 load_msrs(vmx->host_msrs + vmx->msr_offset_efer, 1);
402 }
403}
404
405static void vmx_save_host_state(struct kvm_vcpu *vcpu)
406{
407 struct vcpu_vmx *vmx = to_vmx(vcpu);
408
409 if (vmx->host_state.loaded)
410 return;
411
412 vmx->host_state.loaded = 1;
413 /*
414 * Set host fs and gs selectors. Unfortunately, 22.2.3 does not
415 * allow segment selectors with cpl > 0 or ti == 1.
416 */
417 vmx->host_state.ldt_sel = read_ldt();
418 vmx->host_state.gs_ldt_reload_needed = vmx->host_state.ldt_sel;
419 vmx->host_state.fs_sel = read_fs();
420 if (!(vmx->host_state.fs_sel & 7)) {
421 vmcs_write16(HOST_FS_SELECTOR, vmx->host_state.fs_sel);
422 vmx->host_state.fs_reload_needed = 0;
423 } else {
424 vmcs_write16(HOST_FS_SELECTOR, 0);
425 vmx->host_state.fs_reload_needed = 1;
426 }
427 vmx->host_state.gs_sel = read_gs();
428 if (!(vmx->host_state.gs_sel & 7))
429 vmcs_write16(HOST_GS_SELECTOR, vmx->host_state.gs_sel);
430 else {
431 vmcs_write16(HOST_GS_SELECTOR, 0);
432 vmx->host_state.gs_ldt_reload_needed = 1;
433 }
434
435#ifdef CONFIG_X86_64
436 vmcs_writel(HOST_FS_BASE, read_msr(MSR_FS_BASE));
437 vmcs_writel(HOST_GS_BASE, read_msr(MSR_GS_BASE));
438#else
439 vmcs_writel(HOST_FS_BASE, segment_base(vmx->host_state.fs_sel));
440 vmcs_writel(HOST_GS_BASE, segment_base(vmx->host_state.gs_sel));
441#endif
442
443#ifdef CONFIG_X86_64
444 if (is_long_mode(&vmx->vcpu))
445 save_msrs(vmx->host_msrs +
446 vmx->msr_offset_kernel_gs_base, 1);
447
448#endif
449 load_msrs(vmx->guest_msrs, vmx->save_nmsrs);
450 load_transition_efer(vmx);
451}
452
453static void vmx_load_host_state(struct vcpu_vmx *vmx)
454{
455 unsigned long flags;
456
457 if (!vmx->host_state.loaded)
458 return;
459
460 ++vmx->vcpu.stat.host_state_reload;
461 vmx->host_state.loaded = 0;
462 if (vmx->host_state.fs_reload_needed)
463 load_fs(vmx->host_state.fs_sel);
464 if (vmx->host_state.gs_ldt_reload_needed) {
465 load_ldt(vmx->host_state.ldt_sel);
466 /*
467 * If we have to reload gs, we must take care to
468 * preserve our gs base.
469 */
470 local_irq_save(flags);
471 load_gs(vmx->host_state.gs_sel);
472#ifdef CONFIG_X86_64
473 wrmsrl(MSR_GS_BASE, vmcs_readl(HOST_GS_BASE));
474#endif
475 local_irq_restore(flags);
476 }
477 reload_tss();
478 save_msrs(vmx->guest_msrs, vmx->save_nmsrs);
479 load_msrs(vmx->host_msrs, vmx->save_nmsrs);
480 reload_host_efer(vmx);
481}
482
483/*
484 * Switches to specified vcpu, until a matching vcpu_put(), but assumes
485 * vcpu mutex is already taken.
486 */
487static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
488{
489 struct vcpu_vmx *vmx = to_vmx(vcpu);
490 u64 phys_addr = __pa(vmx->vmcs);
491 u64 tsc_this, delta;
492
493 if (vcpu->cpu != cpu) {
494 vcpu_clear(vmx);
495 kvm_migrate_apic_timer(vcpu);
496 }
497
498 if (per_cpu(current_vmcs, cpu) != vmx->vmcs) {
499 u8 error;
500
501 per_cpu(current_vmcs, cpu) = vmx->vmcs;
502 asm volatile (ASM_VMX_VMPTRLD_RAX "; setna %0"
503 : "=g"(error) : "a"(&phys_addr), "m"(phys_addr)
504 : "cc");
505 if (error)
506 printk(KERN_ERR "kvm: vmptrld %p/%llx fail\n",
507 vmx->vmcs, phys_addr);
508 }
509
510 if (vcpu->cpu != cpu) {
511 struct descriptor_table dt;
512 unsigned long sysenter_esp;
513
514 vcpu->cpu = cpu;
515 /*
516 * Linux uses per-cpu TSS and GDT, so set these when switching
517 * processors.
518 */
519 vmcs_writel(HOST_TR_BASE, read_tr_base()); /* 22.2.4 */
520 get_gdt(&dt);
521 vmcs_writel(HOST_GDTR_BASE, dt.base); /* 22.2.4 */
522
523 rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
524 vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
525
526 /*
527 * Make sure the time stamp counter is monotonous.
528 */
529 rdtscll(tsc_this);
530 delta = vcpu->arch.host_tsc - tsc_this;
531 vmcs_write64(TSC_OFFSET, vmcs_read64(TSC_OFFSET) + delta);
532 }
533}
534
/* Release the vcpu from the current cpu by restoring the host state. */
static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	vmx_load_host_state(vmx);
}
539
540static void vmx_fpu_activate(struct kvm_vcpu *vcpu)
541{
542 if (vcpu->fpu_active)
543 return;
544 vcpu->fpu_active = 1;
545 vmcs_clear_bits(GUEST_CR0, X86_CR0_TS);
546 if (vcpu->arch.cr0 & X86_CR0_TS)
547 vmcs_set_bits(GUEST_CR0, X86_CR0_TS);
548 update_exception_bitmap(vcpu);
549}
550
551static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu)
552{
553 if (!vcpu->fpu_active)
554 return;
555 vcpu->fpu_active = 0;
556 vmcs_set_bits(GUEST_CR0, X86_CR0_TS);
557 update_exception_bitmap(vcpu);
558}
559
/* Clear the vcpu's VMCS on whichever cpu it was last bound to. */
static void vmx_vcpu_decache(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	vcpu_clear(vmx);
}
564
565static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
566{
567 return vmcs_readl(GUEST_RFLAGS);
568}
569
570static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
571{
572 if (vcpu->arch.rmode.active)
573 rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
574 vmcs_writel(GUEST_RFLAGS, rflags);
575}
576
577static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
578{
579 unsigned long rip;
580 u32 interruptibility;
581
582 rip = vmcs_readl(GUEST_RIP);
583 rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
584 vmcs_writel(GUEST_RIP, rip);
585
586 /*
587 * We emulated an instruction, so temporary interrupt blocking
588 * should be removed, if set.
589 */
590 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
591 if (interruptibility & 3)
592 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
593 interruptibility & ~3);
594 vcpu->arch.interrupt_window_open = 1;
595}
596
597static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
598 bool has_error_code, u32 error_code)
599{
600 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
601 nr | INTR_TYPE_EXCEPTION
602 | (has_error_code ? INTR_INFO_DELIEVER_CODE_MASK : 0)
603 | INTR_INFO_VALID_MASK);
604 if (has_error_code)
605 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
606}
607
608static bool vmx_exception_injected(struct kvm_vcpu *vcpu)
609{
610 struct vcpu_vmx *vmx = to_vmx(vcpu);
611
612 return !(vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK);
613}
614
/*
 * Exchange slots @from and @to in both the guest and the host MSR entry
 * arrays, so the two arrays stay index-aligned.
 */
#ifdef CONFIG_X86_64
static void move_msr_up(struct vcpu_vmx *vmx, int from, int to)
{
	struct kvm_msr_entry guest_tmp = vmx->guest_msrs[to];
	struct kvm_msr_entry host_tmp;

	vmx->guest_msrs[to] = vmx->guest_msrs[from];
	vmx->guest_msrs[from] = guest_tmp;

	host_tmp = vmx->host_msrs[to];
	vmx->host_msrs[to] = vmx->host_msrs[from];
	vmx->host_msrs[from] = host_tmp;
}
#endif
631
632/*
633 * Set up the vmcs to automatically save and restore system
634 * msrs. Don't touch the 64-bit msrs if the guest is in legacy
635 * mode, as fiddling with msrs is very expensive.
636 */
637static void setup_msrs(struct vcpu_vmx *vmx)
638{
639 int save_nmsrs;
640
641 save_nmsrs = 0;
642#ifdef CONFIG_X86_64
643 if (is_long_mode(&vmx->vcpu)) {
644 int index;
645
646 index = __find_msr_index(vmx, MSR_SYSCALL_MASK);
647 if (index >= 0)
648 move_msr_up(vmx, index, save_nmsrs++);
649 index = __find_msr_index(vmx, MSR_LSTAR);
650 if (index >= 0)
651 move_msr_up(vmx, index, save_nmsrs++);
652 index = __find_msr_index(vmx, MSR_CSTAR);
653 if (index >= 0)
654 move_msr_up(vmx, index, save_nmsrs++);
655 index = __find_msr_index(vmx, MSR_KERNEL_GS_BASE);
656 if (index >= 0)
657 move_msr_up(vmx, index, save_nmsrs++);
658 /*
659 * MSR_K6_STAR is only needed on long mode guests, and only
660 * if efer.sce is enabled.
661 */
662 index = __find_msr_index(vmx, MSR_K6_STAR);
663 if ((index >= 0) && (vmx->vcpu.arch.shadow_efer & EFER_SCE))
664 move_msr_up(vmx, index, save_nmsrs++);
665 }
666#endif
667 vmx->save_nmsrs = save_nmsrs;
668
669#ifdef CONFIG_X86_64
670 vmx->msr_offset_kernel_gs_base =
671 __find_msr_index(vmx, MSR_KERNEL_GS_BASE);
672#endif
673 vmx->msr_offset_efer = __find_msr_index(vmx, MSR_EFER);
674}
675
676/*
677 * reads and returns guest's timestamp counter "register"
678 * guest_tsc = host_tsc + tsc_offset -- 21.3
679 */
680static u64 guest_read_tsc(void)
681{
682 u64 host_tsc, tsc_offset;
683
684 rdtscll(host_tsc);
685 tsc_offset = vmcs_read64(TSC_OFFSET);
686 return host_tsc + tsc_offset;
687}
688
689/*
690 * writes 'guest_tsc' into guest's timestamp counter "register"
691 * guest_tsc = host_tsc + tsc_offset ==> tsc_offset = guest_tsc - host_tsc
692 */
693static void guest_write_tsc(u64 guest_tsc)
694{
695 u64 host_tsc;
696
697 rdtscll(host_tsc);
698 vmcs_write64(TSC_OFFSET, guest_tsc - host_tsc);
699}
700
701/*
702 * Reads an msr value (of 'msr_index') into 'pdata'.
703 * Returns 0 on success, non-0 otherwise.
704 * Assumes vcpu_load() was already called.
705 */
706static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
707{
708 u64 data;
709 struct kvm_msr_entry *msr;
710
711 if (!pdata) {
712 printk(KERN_ERR "BUG: get_msr called with NULL pdata\n");
713 return -EINVAL;
714 }
715
716 switch (msr_index) {
717#ifdef CONFIG_X86_64
718 case MSR_FS_BASE:
719 data = vmcs_readl(GUEST_FS_BASE);
720 break;
721 case MSR_GS_BASE:
722 data = vmcs_readl(GUEST_GS_BASE);
723 break;
724 case MSR_EFER:
725 return kvm_get_msr_common(vcpu, msr_index, pdata);
726#endif
727 case MSR_IA32_TIME_STAMP_COUNTER:
728 data = guest_read_tsc();
729 break;
730 case MSR_IA32_SYSENTER_CS:
731 data = vmcs_read32(GUEST_SYSENTER_CS);
732 break;
733 case MSR_IA32_SYSENTER_EIP:
734 data = vmcs_readl(GUEST_SYSENTER_EIP);
735 break;
736 case MSR_IA32_SYSENTER_ESP:
737 data = vmcs_readl(GUEST_SYSENTER_ESP);
738 break;
739 default:
740 msr = find_msr_entry(to_vmx(vcpu), msr_index);
741 if (msr) {
742 data = msr->data;
743 break;
744 }
745 return kvm_get_msr_common(vcpu, msr_index, pdata);
746 }
747
748 *pdata = data;
749 return 0;
750}
751
752/*
753 * Writes msr value into into the appropriate "register".
754 * Returns 0 on success, non-0 otherwise.
755 * Assumes vcpu_load() was already called.
756 */
757static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
758{
759 struct vcpu_vmx *vmx = to_vmx(vcpu);
760 struct kvm_msr_entry *msr;
761 int ret = 0;
762
763 switch (msr_index) {
764#ifdef CONFIG_X86_64
765 case MSR_EFER:
766 ret = kvm_set_msr_common(vcpu, msr_index, data);
767 if (vmx->host_state.loaded) {
768 reload_host_efer(vmx);
769 load_transition_efer(vmx);
770 }
771 break;
772 case MSR_FS_BASE:
773 vmcs_writel(GUEST_FS_BASE, data);
774 break;
775 case MSR_GS_BASE:
776 vmcs_writel(GUEST_GS_BASE, data);
777 break;
778#endif
779 case MSR_IA32_SYSENTER_CS:
780 vmcs_write32(GUEST_SYSENTER_CS, data);
781 break;
782 case MSR_IA32_SYSENTER_EIP:
783 vmcs_writel(GUEST_SYSENTER_EIP, data);
784 break;
785 case MSR_IA32_SYSENTER_ESP:
786 vmcs_writel(GUEST_SYSENTER_ESP, data);
787 break;
788 case MSR_IA32_TIME_STAMP_COUNTER:
789 guest_write_tsc(data);
790 break;
791 default:
792 msr = find_msr_entry(vmx, msr_index);
793 if (msr) {
794 msr->data = data;
795 if (vmx->host_state.loaded)
796 load_msrs(vmx->guest_msrs, vmx->save_nmsrs);
797 break;
798 }
799 ret = kvm_set_msr_common(vcpu, msr_index, data);
800 }
801
802 return ret;
803}
804
805/*
806 * Sync the rsp and rip registers into the vcpu structure. This allows
807 * registers to be accessed by indexing vcpu->arch.regs.
808 */
809static void vcpu_load_rsp_rip(struct kvm_vcpu *vcpu)
810{
811 vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP);
812 vcpu->arch.rip = vmcs_readl(GUEST_RIP);
813}
814
815/*
816 * Syncs rsp and rip back into the vmcs. Should be called after possible
817 * modification.
818 */
819static void vcpu_put_rsp_rip(struct kvm_vcpu *vcpu)
820{
821 vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
822 vmcs_writel(GUEST_RIP, vcpu->arch.rip);
823}
824
825static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg)
826{
827 unsigned long dr7 = 0x400;
828 int old_singlestep;
829
830 old_singlestep = vcpu->guest_debug.singlestep;
831
832 vcpu->guest_debug.enabled = dbg->enabled;
833 if (vcpu->guest_debug.enabled) {
834 int i;
835
836 dr7 |= 0x200; /* exact */
837 for (i = 0; i < 4; ++i) {
838 if (!dbg->breakpoints[i].enabled)
839 continue;
840 vcpu->guest_debug.bp[i] = dbg->breakpoints[i].address;
841 dr7 |= 2 << (i*2); /* global enable */
842 dr7 |= 0 << (i*4+16); /* execution breakpoint */
843 }
844
845 vcpu->guest_debug.singlestep = dbg->singlestep;
846 } else
847 vcpu->guest_debug.singlestep = 0;
848
849 if (old_singlestep && !vcpu->guest_debug.singlestep) {
850 unsigned long flags;
851
852 flags = vmcs_readl(GUEST_RFLAGS);
853 flags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
854 vmcs_writel(GUEST_RFLAGS, flags);
855 }
856
857 update_exception_bitmap(vcpu);
858 vmcs_writel(GUEST_DR7, dr7);
859
860 return 0;
861}
862
863static int vmx_get_irq(struct kvm_vcpu *vcpu)
864{
865 struct vcpu_vmx *vmx = to_vmx(vcpu);
866 u32 idtv_info_field;
867
868 idtv_info_field = vmx->idt_vectoring_info;
869 if (idtv_info_field & INTR_INFO_VALID_MASK) {
870 if (is_external_interrupt(idtv_info_field))
871 return idtv_info_field & VECTORING_INFO_VECTOR_MASK;
872 else
873 printk(KERN_DEBUG "pending exception: not handled yet\n");
874 }
875 return -1;
876}
877
878static __init int cpu_has_kvm_support(void)
879{
880 unsigned long ecx = cpuid_ecx(1);
881 return test_bit(5, &ecx); /* CPUID.1:ECX.VMX[bit 5] -> VT */
882}
883
884static __init int vmx_disabled_by_bios(void)
885{
886 u64 msr;
887
888 rdmsrl(MSR_IA32_FEATURE_CONTROL, msr);
889 return (msr & (MSR_IA32_FEATURE_CONTROL_LOCKED |
890 MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED))
891 == MSR_IA32_FEATURE_CONTROL_LOCKED;
892 /* locked but not enabled */
893}
894
895static void hardware_enable(void *garbage)
896{
897 int cpu = raw_smp_processor_id();
898 u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
899 u64 old;
900
901 rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
902 if ((old & (MSR_IA32_FEATURE_CONTROL_LOCKED |
903 MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED))
904 != (MSR_IA32_FEATURE_CONTROL_LOCKED |
905 MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED))
906 /* enable and lock */
907 wrmsrl(MSR_IA32_FEATURE_CONTROL, old |
908 MSR_IA32_FEATURE_CONTROL_LOCKED |
909 MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED);
910 write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */
911 asm volatile (ASM_VMX_VMXON_RAX : : "a"(&phys_addr), "m"(phys_addr)
912 : "memory", "cc");
913}
914
915static void hardware_disable(void *garbage)
916{
917 asm volatile (ASM_VMX_VMXOFF : : : "cc");
918}
919
920static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
921 u32 msr, u32 *result)
922{
923 u32 vmx_msr_low, vmx_msr_high;
924 u32 ctl = ctl_min | ctl_opt;
925
926 rdmsr(msr, vmx_msr_low, vmx_msr_high);
927
928 ctl &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */
929 ctl |= vmx_msr_low; /* bit == 1 in low word ==> must be one */
930
931 /* Ensure minimum (required) set of control bits are supported. */
932 if (ctl_min & ~ctl)
933 return -EIO;
934
935 *result = ctl;
936 return 0;
937}
938
939static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
940{
941 u32 vmx_msr_low, vmx_msr_high;
942 u32 min, opt;
943 u32 _pin_based_exec_control = 0;
944 u32 _cpu_based_exec_control = 0;
945 u32 _cpu_based_2nd_exec_control = 0;
946 u32 _vmexit_control = 0;
947 u32 _vmentry_control = 0;
948
949 min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING;
950 opt = 0;
951 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS,
952 &_pin_based_exec_control) < 0)
953 return -EIO;
954
955 min = CPU_BASED_HLT_EXITING |
956#ifdef CONFIG_X86_64
957 CPU_BASED_CR8_LOAD_EXITING |
958 CPU_BASED_CR8_STORE_EXITING |
959#endif
960 CPU_BASED_USE_IO_BITMAPS |
961 CPU_BASED_MOV_DR_EXITING |
962 CPU_BASED_USE_TSC_OFFSETING;
963 opt = CPU_BASED_TPR_SHADOW |
964 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
965 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS,
966 &_cpu_based_exec_control) < 0)
967 return -EIO;
968#ifdef CONFIG_X86_64
969 if ((_cpu_based_exec_control & CPU_BASED_TPR_SHADOW))
970 _cpu_based_exec_control &= ~CPU_BASED_CR8_LOAD_EXITING &
971 ~CPU_BASED_CR8_STORE_EXITING;
972#endif
973 if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) {
974 min = 0;
975 opt = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
976 SECONDARY_EXEC_WBINVD_EXITING;
977 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS2,
978 &_cpu_based_2nd_exec_control) < 0)
979 return -EIO;
980 }
981#ifndef CONFIG_X86_64
982 if (!(_cpu_based_2nd_exec_control &
983 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
984 _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW;
985#endif
986
987 min = 0;
988#ifdef CONFIG_X86_64
989 min |= VM_EXIT_HOST_ADDR_SPACE_SIZE;
990#endif
991 opt = 0;
992 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS,
993 &_vmexit_control) < 0)
994 return -EIO;
995
996 min = opt = 0;
997 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS,
998 &_vmentry_control) < 0)
999 return -EIO;
1000
1001 rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high);
1002
1003 /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */
1004 if ((vmx_msr_high & 0x1fff) > PAGE_SIZE)
1005 return -EIO;
1006
1007#ifdef CONFIG_X86_64
1008 /* IA-32 SDM Vol 3B: 64-bit CPUs always have VMX_BASIC_MSR[48]==0. */
1009 if (vmx_msr_high & (1u<<16))
1010 return -EIO;
1011#endif
1012
1013 /* Require Write-Back (WB) memory type for VMCS accesses. */
1014 if (((vmx_msr_high >> 18) & 15) != 6)
1015 return -EIO;
1016
1017 vmcs_conf->size = vmx_msr_high & 0x1fff;
1018 vmcs_conf->order = get_order(vmcs_config.size);
1019 vmcs_conf->revision_id = vmx_msr_low;
1020
1021 vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control;
1022 vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control;
1023 vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control;
1024 vmcs_conf->vmexit_ctrl = _vmexit_control;
1025 vmcs_conf->vmentry_ctrl = _vmentry_control;
1026
1027 return 0;
1028}
1029
1030static struct vmcs *alloc_vmcs_cpu(int cpu)
1031{
1032 int node = cpu_to_node(cpu);
1033 struct page *pages;
1034 struct vmcs *vmcs;
1035
1036 pages = alloc_pages_node(node, GFP_KERNEL, vmcs_config.order);
1037 if (!pages)
1038 return NULL;
1039 vmcs = page_address(pages);
1040 memset(vmcs, 0, vmcs_config.size);
1041 vmcs->revision_id = vmcs_config.revision_id; /* vmcs revision id */
1042 return vmcs;
1043}
1044
/* Allocate a VMCS region on the node of the cpu we are running on. */
static struct vmcs *alloc_vmcs(void)
{
	int cpu = raw_smp_processor_id();

	return alloc_vmcs_cpu(cpu);
}
1049
1050static void free_vmcs(struct vmcs *vmcs)
1051{
1052 free_pages((unsigned long)vmcs, vmcs_config.order);
1053}
1054
1055static void free_kvm_area(void)
1056{
1057 int cpu;
1058
1059 for_each_online_cpu(cpu)
1060 free_vmcs(per_cpu(vmxarea, cpu));
1061}
1062
1063static __init int alloc_kvm_area(void)
1064{
1065 int cpu;
1066
1067 for_each_online_cpu(cpu) {
1068 struct vmcs *vmcs;
1069
1070 vmcs = alloc_vmcs_cpu(cpu);
1071 if (!vmcs) {
1072 free_kvm_area();
1073 return -ENOMEM;
1074 }
1075
1076 per_cpu(vmxarea, cpu) = vmcs;
1077 }
1078 return 0;
1079}
1080
1081static __init int hardware_setup(void)
1082{
1083 if (setup_vmcs_config(&vmcs_config) < 0)
1084 return -EIO;
1085 return alloc_kvm_area();
1086}
1087
1088static __exit void hardware_unsetup(void)
1089{
1090 free_kvm_area();
1091}
1092
1093static void fix_pmode_dataseg(int seg, struct kvm_save_segment *save)
1094{
1095 struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
1096
1097 if (vmcs_readl(sf->base) == save->base && (save->base & AR_S_MASK)) {
1098 vmcs_write16(sf->selector, save->selector);
1099 vmcs_writel(sf->base, save->base);
1100 vmcs_write32(sf->limit, save->limit);
1101 vmcs_write32(sf->ar_bytes, save->ar);
1102 } else {
1103 u32 dpl = (vmcs_read16(sf->selector) & SELECTOR_RPL_MASK)
1104 << AR_DPL_SHIFT;
1105 vmcs_write32(sf->ar_bytes, 0x93 | dpl);
1106 }
1107}
1108
1109static void enter_pmode(struct kvm_vcpu *vcpu)
1110{
1111 unsigned long flags;
1112
1113 vcpu->arch.rmode.active = 0;
1114
1115 vmcs_writel(GUEST_TR_BASE, vcpu->arch.rmode.tr.base);
1116 vmcs_write32(GUEST_TR_LIMIT, vcpu->arch.rmode.tr.limit);
1117 vmcs_write32(GUEST_TR_AR_BYTES, vcpu->arch.rmode.tr.ar);
1118
1119 flags = vmcs_readl(GUEST_RFLAGS);
1120 flags &= ~(X86_EFLAGS_IOPL | X86_EFLAGS_VM);
1121 flags |= (vcpu->arch.rmode.save_iopl << IOPL_SHIFT);
1122 vmcs_writel(GUEST_RFLAGS, flags);
1123
1124 vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) |
1125 (vmcs_readl(CR4_READ_SHADOW) & X86_CR4_VME));
1126
1127 update_exception_bitmap(vcpu);
1128
1129 fix_pmode_dataseg(VCPU_SREG_ES, &vcpu->arch.rmode.es);
1130 fix_pmode_dataseg(VCPU_SREG_DS, &vcpu->arch.rmode.ds);
1131 fix_pmode_dataseg(VCPU_SREG_GS, &vcpu->arch.rmode.gs);
1132 fix_pmode_dataseg(VCPU_SREG_FS, &vcpu->arch.rmode.fs);
1133
1134 vmcs_write16(GUEST_SS_SELECTOR, 0);
1135 vmcs_write32(GUEST_SS_AR_BYTES, 0x93);
1136
1137 vmcs_write16(GUEST_CS_SELECTOR,
1138 vmcs_read16(GUEST_CS_SELECTOR) & ~SELECTOR_RPL_MASK);
1139 vmcs_write32(GUEST_CS_AR_BYTES, 0x9b);
1140}
1141
1142static gva_t rmode_tss_base(struct kvm *kvm)
1143{
1144 if (!kvm->arch.tss_addr) {
1145 gfn_t base_gfn = kvm->memslots[0].base_gfn +
1146 kvm->memslots[0].npages - 3;
1147 return base_gfn << PAGE_SHIFT;
1148 }
1149 return kvm->arch.tss_addr;
1150}
1151
1152static void fix_rmode_seg(int seg, struct kvm_save_segment *save)
1153{
1154 struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
1155
1156 save->selector = vmcs_read16(sf->selector);
1157 save->base = vmcs_readl(sf->base);
1158 save->limit = vmcs_read32(sf->limit);
1159 save->ar = vmcs_read32(sf->ar_bytes);
1160 vmcs_write16(sf->selector, save->base >> 4);
1161 vmcs_write32(sf->base, save->base & 0xfffff);
1162 vmcs_write32(sf->limit, 0xffff);
1163 vmcs_write32(sf->ar_bytes, 0xf3);
1164}
1165
1166static void enter_rmode(struct kvm_vcpu *vcpu)
1167{
1168 unsigned long flags;
1169
1170 vcpu->arch.rmode.active = 1;
1171
1172 vcpu->arch.rmode.tr.base = vmcs_readl(GUEST_TR_BASE);
1173 vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm));
1174
1175 vcpu->arch.rmode.tr.limit = vmcs_read32(GUEST_TR_LIMIT);
1176 vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1);
1177
1178 vcpu->arch.rmode.tr.ar = vmcs_read32(GUEST_TR_AR_BYTES);
1179 vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
1180
1181 flags = vmcs_readl(GUEST_RFLAGS);
1182 vcpu->arch.rmode.save_iopl
1183 = (flags & X86_EFLAGS_IOPL) >> IOPL_SHIFT;
1184
1185 flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
1186
1187 vmcs_writel(GUEST_RFLAGS, flags);
1188 vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME);
1189 update_exception_bitmap(vcpu);
1190
1191 vmcs_write16(GUEST_SS_SELECTOR, vmcs_readl(GUEST_SS_BASE) >> 4);
1192 vmcs_write32(GUEST_SS_LIMIT, 0xffff);
1193 vmcs_write32(GUEST_SS_AR_BYTES, 0xf3);
1194
1195 vmcs_write32(GUEST_CS_AR_BYTES, 0xf3);
1196 vmcs_write32(GUEST_CS_LIMIT, 0xffff);
1197 if (vmcs_readl(GUEST_CS_BASE) == 0xffff0000)
1198 vmcs_writel(GUEST_CS_BASE, 0xf0000);
1199 vmcs_write16(GUEST_CS_SELECTOR, vmcs_readl(GUEST_CS_BASE) >> 4);
1200
1201 fix_rmode_seg(VCPU_SREG_ES, &vcpu->arch.rmode.es);
1202 fix_rmode_seg(VCPU_SREG_DS, &vcpu->arch.rmode.ds);
1203 fix_rmode_seg(VCPU_SREG_GS, &vcpu->arch.rmode.gs);
1204 fix_rmode_seg(VCPU_SREG_FS, &vcpu->arch.rmode.fs);
1205
1206 kvm_mmu_reset_context(vcpu);
1207 init_rmode_tss(vcpu->kvm);
1208}
1209
1210#ifdef CONFIG_X86_64
1211
1212static void enter_lmode(struct kvm_vcpu *vcpu)
1213{
1214 u32 guest_tr_ar;
1215
1216 guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES);
1217 if ((guest_tr_ar & AR_TYPE_MASK) != AR_TYPE_BUSY_64_TSS) {
1218 printk(KERN_DEBUG "%s: tss fixup for long mode. \n",
1219 __FUNCTION__);
1220 vmcs_write32(GUEST_TR_AR_BYTES,
1221 (guest_tr_ar & ~AR_TYPE_MASK)
1222 | AR_TYPE_BUSY_64_TSS);
1223 }
1224
1225 vcpu->arch.shadow_efer |= EFER_LMA;
1226
1227 find_msr_entry(to_vmx(vcpu), MSR_EFER)->data |= EFER_LMA | EFER_LME;
1228 vmcs_write32(VM_ENTRY_CONTROLS,
1229 vmcs_read32(VM_ENTRY_CONTROLS)
1230 | VM_ENTRY_IA32E_MODE);
1231}
1232
1233static void exit_lmode(struct kvm_vcpu *vcpu)
1234{
1235 vcpu->arch.shadow_efer &= ~EFER_LMA;
1236
1237 vmcs_write32(VM_ENTRY_CONTROLS,
1238 vmcs_read32(VM_ENTRY_CONTROLS)
1239 & ~VM_ENTRY_IA32E_MODE);
1240}
1241
1242#endif
1243
1244static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
1245{
1246 vcpu->arch.cr4 &= KVM_GUEST_CR4_MASK;
1247 vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & ~KVM_GUEST_CR4_MASK;
1248}
1249
1250static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
1251{
1252 vmx_fpu_deactivate(vcpu);
1253
1254 if (vcpu->arch.rmode.active && (cr0 & X86_CR0_PE))
1255 enter_pmode(vcpu);
1256
1257 if (!vcpu->arch.rmode.active && !(cr0 & X86_CR0_PE))
1258 enter_rmode(vcpu);
1259
1260#ifdef CONFIG_X86_64
1261 if (vcpu->arch.shadow_efer & EFER_LME) {
1262 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG))
1263 enter_lmode(vcpu);
1264 if (is_paging(vcpu) && !(cr0 & X86_CR0_PG))
1265 exit_lmode(vcpu);
1266 }
1267#endif
1268
1269 vmcs_writel(CR0_READ_SHADOW, cr0);
1270 vmcs_writel(GUEST_CR0,
1271 (cr0 & ~KVM_GUEST_CR0_MASK) | KVM_VM_CR0_ALWAYS_ON);
1272 vcpu->arch.cr0 = cr0;
1273
1274 if (!(cr0 & X86_CR0_TS) || !(cr0 & X86_CR0_PE))
1275 vmx_fpu_activate(vcpu);
1276}
1277
1278static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
1279{
1280 vmcs_writel(GUEST_CR3, cr3);
1281 if (vcpu->arch.cr0 & X86_CR0_PE)
1282 vmx_fpu_deactivate(vcpu);
1283}
1284
1285static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
1286{
1287 vmcs_writel(CR4_READ_SHADOW, cr4);
1288 vmcs_writel(GUEST_CR4, cr4 | (vcpu->arch.rmode.active ?
1289 KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON));
1290 vcpu->arch.cr4 = cr4;
1291}
1292
1293#ifdef CONFIG_X86_64
1294
1295static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
1296{
1297 struct vcpu_vmx *vmx = to_vmx(vcpu);
1298 struct kvm_msr_entry *msr = find_msr_entry(vmx, MSR_EFER);
1299
1300 vcpu->arch.shadow_efer = efer;
1301 if (efer & EFER_LMA) {
1302 vmcs_write32(VM_ENTRY_CONTROLS,
1303 vmcs_read32(VM_ENTRY_CONTROLS) |
1304 VM_ENTRY_IA32E_MODE);
1305 msr->data = efer;
1306
1307 } else {
1308 vmcs_write32(VM_ENTRY_CONTROLS,
1309 vmcs_read32(VM_ENTRY_CONTROLS) &
1310 ~VM_ENTRY_IA32E_MODE);
1311
1312 msr->data = efer & ~EFER_LME;
1313 }
1314 setup_msrs(vmx);
1315}
1316
1317#endif
1318
1319static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg)
1320{
1321 struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
1322
1323 return vmcs_readl(sf->base);
1324}
1325
1326static void vmx_get_segment(struct kvm_vcpu *vcpu,
1327 struct kvm_segment *var, int seg)
1328{
1329 struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
1330 u32 ar;
1331
1332 var->base = vmcs_readl(sf->base);
1333 var->limit = vmcs_read32(sf->limit);
1334 var->selector = vmcs_read16(sf->selector);
1335 ar = vmcs_read32(sf->ar_bytes);
1336 if (ar & AR_UNUSABLE_MASK)
1337 ar = 0;
1338 var->type = ar & 15;
1339 var->s = (ar >> 4) & 1;
1340 var->dpl = (ar >> 5) & 3;
1341 var->present = (ar >> 7) & 1;
1342 var->avl = (ar >> 12) & 1;
1343 var->l = (ar >> 13) & 1;
1344 var->db = (ar >> 14) & 1;
1345 var->g = (ar >> 15) & 1;
1346 var->unusable = (ar >> 16) & 1;
1347}
1348
1349static u32 vmx_segment_access_rights(struct kvm_segment *var)
1350{
1351 u32 ar;
1352
1353 if (var->unusable)
1354 ar = 1 << 16;
1355 else {
1356 ar = var->type & 15;
1357 ar |= (var->s & 1) << 4;
1358 ar |= (var->dpl & 3) << 5;
1359 ar |= (var->present & 1) << 7;
1360 ar |= (var->avl & 1) << 12;
1361 ar |= (var->l & 1) << 13;
1362 ar |= (var->db & 1) << 14;
1363 ar |= (var->g & 1) << 15;
1364 }
1365 if (ar == 0) /* a 0 value means unusable */
1366 ar = AR_UNUSABLE_MASK;
1367
1368 return ar;
1369}
1370
1371static void vmx_set_segment(struct kvm_vcpu *vcpu,
1372 struct kvm_segment *var, int seg)
1373{
1374 struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
1375 u32 ar;
1376
1377 if (vcpu->arch.rmode.active && seg == VCPU_SREG_TR) {
1378 vcpu->arch.rmode.tr.selector = var->selector;
1379 vcpu->arch.rmode.tr.base = var->base;
1380 vcpu->arch.rmode.tr.limit = var->limit;
1381 vcpu->arch.rmode.tr.ar = vmx_segment_access_rights(var);
1382 return;
1383 }
1384 vmcs_writel(sf->base, var->base);
1385 vmcs_write32(sf->limit, var->limit);
1386 vmcs_write16(sf->selector, var->selector);
1387 if (vcpu->arch.rmode.active && var->s) {
1388 /*
1389 * Hack real-mode segments into vm86 compatibility.
1390 */
1391 if (var->base == 0xffff0000 && var->selector == 0xf000)
1392 vmcs_writel(sf->base, 0xf0000);
1393 ar = 0xf3;
1394 } else
1395 ar = vmx_segment_access_rights(var);
1396 vmcs_write32(sf->ar_bytes, ar);
1397}
1398
1399static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
1400{
1401 u32 ar = vmcs_read32(GUEST_CS_AR_BYTES);
1402
1403 *db = (ar >> 14) & 1;
1404 *l = (ar >> 13) & 1;
1405}
1406
1407static void vmx_get_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
1408{
1409 dt->limit = vmcs_read32(GUEST_IDTR_LIMIT);
1410 dt->base = vmcs_readl(GUEST_IDTR_BASE);
1411}
1412
1413static void vmx_set_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
1414{
1415 vmcs_write32(GUEST_IDTR_LIMIT, dt->limit);
1416 vmcs_writel(GUEST_IDTR_BASE, dt->base);
1417}
1418
1419static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
1420{
1421 dt->limit = vmcs_read32(GUEST_GDTR_LIMIT);
1422 dt->base = vmcs_readl(GUEST_GDTR_BASE);
1423}
1424
1425static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
1426{
1427 vmcs_write32(GUEST_GDTR_LIMIT, dt->limit);
1428 vmcs_writel(GUEST_GDTR_BASE, dt->base);
1429}
1430
1431static int init_rmode_tss(struct kvm *kvm)
1432{
1433 gfn_t fn = rmode_tss_base(kvm) >> PAGE_SHIFT;
1434 u16 data = 0;
1435 int r;
1436
1437 r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
1438 if (r < 0)
1439 return 0;
1440 data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE;
1441 r = kvm_write_guest_page(kvm, fn++, &data, 0x66, sizeof(u16));
1442 if (r < 0)
1443 return 0;
1444 r = kvm_clear_guest_page(kvm, fn++, 0, PAGE_SIZE);
1445 if (r < 0)
1446 return 0;
1447 r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
1448 if (r < 0)
1449 return 0;
1450 data = ~0;
1451 r = kvm_write_guest_page(kvm, fn, &data, RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1,
1452 sizeof(u8));
1453 if (r < 0)
1454 return 0;
1455 return 1;
1456}
1457
1458static void seg_setup(int seg)
1459{
1460 struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
1461
1462 vmcs_write16(sf->selector, 0);
1463 vmcs_writel(sf->base, 0);
1464 vmcs_write32(sf->limit, 0xffff);
1465 vmcs_write32(sf->ar_bytes, 0x93);
1466}
1467
1468static int alloc_apic_access_page(struct kvm *kvm)
1469{
1470 struct kvm_userspace_memory_region kvm_userspace_mem;
1471 int r = 0;
1472
1473 mutex_lock(&kvm->lock);
1474 if (kvm->arch.apic_access_page)
1475 goto out;
1476 kvm_userspace_mem.slot = APIC_ACCESS_PAGE_PRIVATE_MEMSLOT;
1477 kvm_userspace_mem.flags = 0;
1478 kvm_userspace_mem.guest_phys_addr = 0xfee00000ULL;
1479 kvm_userspace_mem.memory_size = PAGE_SIZE;
1480 r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, 0);
1481 if (r)
1482 goto out;
1483 kvm->arch.apic_access_page = gfn_to_page(kvm, 0xfee00);
1484out:
1485 mutex_unlock(&kvm->lock);
1486 return r;
1487}
1488
1489/*
1490 * Sets up the vmcs for emulated real mode.
1491 */
1492static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
1493{
1494 u32 host_sysenter_cs;
1495 u32 junk;
1496 unsigned long a;
1497 struct descriptor_table dt;
1498 int i;
1499 unsigned long kvm_vmx_return;
1500 u32 exec_control;
1501
1502 /* I/O */
1503 vmcs_write64(IO_BITMAP_A, page_to_phys(vmx_io_bitmap_a));
1504 vmcs_write64(IO_BITMAP_B, page_to_phys(vmx_io_bitmap_b));
1505
1506 vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */
1507
1508 /* Control */
1509 vmcs_write32(PIN_BASED_VM_EXEC_CONTROL,
1510 vmcs_config.pin_based_exec_ctrl);
1511
1512 exec_control = vmcs_config.cpu_based_exec_ctrl;
1513 if (!vm_need_tpr_shadow(vmx->vcpu.kvm)) {
1514 exec_control &= ~CPU_BASED_TPR_SHADOW;
1515#ifdef CONFIG_X86_64
1516 exec_control |= CPU_BASED_CR8_STORE_EXITING |
1517 CPU_BASED_CR8_LOAD_EXITING;
1518#endif
1519 }
1520 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control);
1521
1522 if (cpu_has_secondary_exec_ctrls()) {
1523 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl;
1524 if (!vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))
1525 exec_control &=
1526 ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
1527 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
1528 }
1529
1530 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, !!bypass_guest_pf);
1531 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, !!bypass_guest_pf);
1532 vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */
1533
1534 vmcs_writel(HOST_CR0, read_cr0()); /* 22.2.3 */
1535 vmcs_writel(HOST_CR4, read_cr4()); /* 22.2.3, 22.2.5 */
1536 vmcs_writel(HOST_CR3, read_cr3()); /* 22.2.3 FIXME: shadow tables */
1537
1538 vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */
1539 vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */
1540 vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */
1541 vmcs_write16(HOST_FS_SELECTOR, read_fs()); /* 22.2.4 */
1542 vmcs_write16(HOST_GS_SELECTOR, read_gs()); /* 22.2.4 */
1543 vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */
1544#ifdef CONFIG_X86_64
1545 rdmsrl(MSR_FS_BASE, a);
1546 vmcs_writel(HOST_FS_BASE, a); /* 22.2.4 */
1547 rdmsrl(MSR_GS_BASE, a);
1548 vmcs_writel(HOST_GS_BASE, a); /* 22.2.4 */
1549#else
1550 vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */
1551 vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */
1552#endif
1553
1554 vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */
1555
1556 get_idt(&dt);
1557 vmcs_writel(HOST_IDTR_BASE, dt.base); /* 22.2.4 */
1558
1559 asm("mov $.Lkvm_vmx_return, %0" : "=r"(kvm_vmx_return));
1560 vmcs_writel(HOST_RIP, kvm_vmx_return); /* 22.2.5 */
1561 vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
1562 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
1563 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
1564
1565 rdmsr(MSR_IA32_SYSENTER_CS, host_sysenter_cs, junk);
1566 vmcs_write32(HOST_IA32_SYSENTER_CS, host_sysenter_cs);
1567 rdmsrl(MSR_IA32_SYSENTER_ESP, a);
1568 vmcs_writel(HOST_IA32_SYSENTER_ESP, a); /* 22.2.3 */
1569 rdmsrl(MSR_IA32_SYSENTER_EIP, a);
1570 vmcs_writel(HOST_IA32_SYSENTER_EIP, a); /* 22.2.3 */
1571
1572 for (i = 0; i < NR_VMX_MSR; ++i) {
1573 u32 index = vmx_msr_index[i];
1574 u32 data_low, data_high;
1575 u64 data;
1576 int j = vmx->nmsrs;
1577
1578 if (rdmsr_safe(index, &data_low, &data_high) < 0)
1579 continue;
1580 if (wrmsr_safe(index, data_low, data_high) < 0)
1581 continue;
1582 data = data_low | ((u64)data_high << 32);
1583 vmx->host_msrs[j].index = index;
1584 vmx->host_msrs[j].reserved = 0;
1585 vmx->host_msrs[j].data = data;
1586 vmx->guest_msrs[j] = vmx->host_msrs[j];
1587 ++vmx->nmsrs;
1588 }
1589
1590 vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl);
1591
1592 /* 22.2.1, 20.8.1 */
1593 vmcs_write32(VM_ENTRY_CONTROLS, vmcs_config.vmentry_ctrl);
1594
1595 vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL);
1596 vmcs_writel(CR4_GUEST_HOST_MASK, KVM_GUEST_CR4_MASK);
1597
1598 if (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))
1599 if (alloc_apic_access_page(vmx->vcpu.kvm) != 0)
1600 return -ENOMEM;
1601
1602 return 0;
1603}
1604
/*
 * Put a vcpu into its reset state.
 *
 * Re-initialises the real-mode TSS pages, the APIC base MSR, the FPU
 * state and the guest-state area of the VMCS (segments, descriptor
 * tables, RIP/RSP/RFLAGS, DR7, SYSENTER MSRs, activity and
 * interruptibility state), then sets CR0/CR4 (and EFER on 64-bit)
 * through the vmx_set_*() helpers.
 *
 * vcpu 0 starts at f000:fff0; every other vcpu starts at offset 0 of a
 * code segment derived from vcpu.arch.sipi_vector.
 *
 * Returns 0 on success, or -ENOMEM if init_rmode_tss() fails.
 */
1605 static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
1606 {
1607 struct vcpu_vmx *vmx = to_vmx(vcpu);
1608 u64 msr;
1609 int ret;
1610
1611 if (!init_rmode_tss(vmx->vcpu.kvm)) {
1612 ret = -ENOMEM;
1613 goto out;
1614 }
1615
1616 vmx->vcpu.arch.rmode.active = 0;
1617
1618 vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val();
1619 set_cr8(&vmx->vcpu, 0);
/* APIC base 0xfee00000, enabled; only vcpu 0 gets the BSP flag. */
1620 msr = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
1621 if (vmx->vcpu.vcpu_id == 0)
1622 msr |= MSR_IA32_APICBASE_BSP;
1623 kvm_set_apic_base(&vmx->vcpu, msr);
1624
1625 fx_init(&vmx->vcpu);
1626
1627 /*
1628 * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode
1629 * insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4. Sigh.
1630 */
1631 if (vmx->vcpu.vcpu_id == 0) {
1632 vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
1633 vmcs_writel(GUEST_CS_BASE, 0x000f0000);
1634 } else {
1635 vmcs_write16(GUEST_CS_SELECTOR, vmx->vcpu.arch.sipi_vector << 8);
1636 vmcs_writel(GUEST_CS_BASE, vmx->vcpu.arch.sipi_vector << 12);
1637 }
1638 vmcs_write32(GUEST_CS_LIMIT, 0xffff);
1639 vmcs_write32(GUEST_CS_AR_BYTES, 0x9b);
1640
1641 seg_setup(VCPU_SREG_DS);
1642 seg_setup(VCPU_SREG_ES);
1643 seg_setup(VCPU_SREG_FS);
1644 seg_setup(VCPU_SREG_GS);
1645 seg_setup(VCPU_SREG_SS);
1646
1647 vmcs_write16(GUEST_TR_SELECTOR, 0);
1648 vmcs_writel(GUEST_TR_BASE, 0);
1649 vmcs_write32(GUEST_TR_LIMIT, 0xffff);
1650 vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
1651
1652 vmcs_write16(GUEST_LDTR_SELECTOR, 0);
1653 vmcs_writel(GUEST_LDTR_BASE, 0);
1654 vmcs_write32(GUEST_LDTR_LIMIT, 0xffff);
1655 vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082);
1656
1657 vmcs_write32(GUEST_SYSENTER_CS, 0);
1658 vmcs_writel(GUEST_SYSENTER_ESP, 0);
1659 vmcs_writel(GUEST_SYSENTER_EIP, 0);
1660
1661 vmcs_writel(GUEST_RFLAGS, 0x02);
1662 if (vmx->vcpu.vcpu_id == 0)
1663 vmcs_writel(GUEST_RIP, 0xfff0);
1664 else
1665 vmcs_writel(GUEST_RIP, 0);
1666 vmcs_writel(GUEST_RSP, 0);
1667
1668 /* todo: dr0 = dr1 = dr2 = dr3 = 0; dr6 = 0xffff0ff0 */
1669 vmcs_writel(GUEST_DR7, 0x400);
1670
1671 vmcs_writel(GUEST_GDTR_BASE, 0);
1672 vmcs_write32(GUEST_GDTR_LIMIT, 0xffff);
1673
1674 vmcs_writel(GUEST_IDTR_BASE, 0);
1675 vmcs_write32(GUEST_IDTR_LIMIT, 0xffff);
1676
1677 vmcs_write32(GUEST_ACTIVITY_STATE, 0);
1678 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
1679 vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0);
1680
1681 guest_write_tsc(0);
1682
1683 /* Special registers */
1684 vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
1685
1686 setup_msrs(vmx);
1687
1688 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); /* 22.2.1 */
1689
/*
 * The virtual-APIC page address is zeroed first and only pointed at the
 * local APIC register page when this VM uses the TPR shadow.
 */
1690 if (cpu_has_vmx_tpr_shadow()) {
1691 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
1692 if (vm_need_tpr_shadow(vmx->vcpu.kvm))
1693 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
1694 page_to_phys(vmx->vcpu.arch.apic->regs_page));
1695 vmcs_write32(TPR_THRESHOLD, 0);
1696 }
1697
1698 if (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))
1699 vmcs_write64(APIC_ACCESS_ADDR,
1700 page_to_phys(vmx->vcpu.kvm->arch.apic_access_page));
1701
1702 vmx->vcpu.arch.cr0 = 0x60000010;
1703 vmx_set_cr0(&vmx->vcpu, vmx->vcpu.arch.cr0); /* enter rmode */
1704 vmx_set_cr4(&vmx->vcpu, 0);
1705#ifdef CONFIG_X86_64
1706 vmx_set_efer(&vmx->vcpu, 0);
1707#endif
1708 vmx_fpu_activate(&vmx->vcpu);
1709 update_exception_bitmap(&vmx->vcpu);
1710
1711 return 0;
1712
/* Error exit: only reached with ret already set to a negative errno. */
1713out:
1714 return ret;
1715}
1716
1717static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq)
1718{
1719 struct vcpu_vmx *vmx = to_vmx(vcpu);
1720
1721 if (vcpu->arch.rmode.active) {
1722 vmx->rmode.irq.pending = true;
1723 vmx->rmode.irq.vector = irq;
1724 vmx->rmode.irq.rip = vmcs_readl(GUEST_RIP);
1725 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
1726 irq | INTR_TYPE_SOFT_INTR | INTR_INFO_VALID_MASK);
1727 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1);
1728 vmcs_writel(GUEST_RIP, vmx->rmode.irq.rip - 1);
1729 return;
1730 }
1731 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
1732 irq | INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK);
1733}
1734
1735static void kvm_do_inject_irq(struct kvm_vcpu *vcpu)
1736{
1737 int word_index = __ffs(vcpu->arch.irq_summary);
1738 int bit_index = __ffs(vcpu->arch.irq_pending[word_index]);
1739 int irq = word_index * BITS_PER_LONG + bit_index;
1740
1741 clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]);
1742 if (!vcpu->arch.irq_pending[word_index])
1743 clear_bit(word_index, &vcpu->arch.irq_summary);
1744 vmx_inject_irq(vcpu, irq);
1745}
1746
1747
1748static void do_interrupt_requests(struct kvm_vcpu *vcpu,
1749 struct kvm_run *kvm_run)
1750{
1751 u32 cpu_based_vm_exec_control;
1752
1753 vcpu->arch.interrupt_window_open =
1754 ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
1755 (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0);
1756
1757 if (vcpu->arch.interrupt_window_open &&
1758 vcpu->arch.irq_summary &&
1759 !(vmcs_read32(VM_ENTRY_INTR_INFO_FIELD) & INTR_INFO_VALID_MASK))
1760 /*
1761 * If interrupts enabled, and not blocked by sti or mov ss. Good.
1762 */
1763 kvm_do_inject_irq(vcpu);
1764
1765 cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
1766 if (!vcpu->arch.interrupt_window_open &&
1767 (vcpu->arch.irq_summary || kvm_run->request_interrupt_window))
1768 /*
1769 * Interrupts blocked. Wait for unblock.
1770 */
1771 cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING;
1772 else
1773 cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
1774 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
1775}
1776
1777static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
1778{
1779 int ret;
1780 struct kvm_userspace_memory_region tss_mem = {
1781 .slot = 8,
1782 .guest_phys_addr = addr,
1783 .memory_size = PAGE_SIZE * 3,
1784 .flags = 0,
1785 };
1786
1787 ret = kvm_set_memory_region(kvm, &tss_mem, 0);
1788 if (ret)
1789 return ret;
1790 kvm->arch.tss_addr = addr;
1791 return 0;
1792}
1793
1794static void kvm_guest_debug_pre(struct kvm_vcpu *vcpu)
1795{
1796 struct kvm_guest_debug *dbg = &vcpu->guest_debug;
1797
1798 set_debugreg(dbg->bp[0], 0);
1799 set_debugreg(dbg->bp[1], 1);
1800 set_debugreg(dbg->bp[2], 2);
1801 set_debugreg(dbg->bp[3], 3);
1802
1803 if (dbg->singlestep) {
1804 unsigned long flags;
1805
1806 flags = vmcs_readl(GUEST_RFLAGS);
1807 flags |= X86_EFLAGS_TF | X86_EFLAGS_RF;
1808 vmcs_writel(GUEST_RFLAGS, flags);
1809 }
1810}
1811
1812static int handle_rmode_exception(struct kvm_vcpu *vcpu,
1813 int vec, u32 err_code)
1814{
1815 if (!vcpu->arch.rmode.active)
1816 return 0;
1817
1818 /*
1819 * Instruction with address size override prefix opcode 0x67
1820 * Cause the #SS fault with 0 error code in VM86 mode.
1821 */
1822 if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0)
1823 if (emulate_instruction(vcpu, NULL, 0, 0, 0) == EMULATE_DONE)
1824 return 1;
1825 return 0;
1826}
1827
1828static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1829{
1830 struct vcpu_vmx *vmx = to_vmx(vcpu);
1831 u32 intr_info, error_code;
1832 unsigned long cr2, rip;
1833 u32 vect_info;
1834 enum emulation_result er;
1835
1836 vect_info = vmx->idt_vectoring_info;
1837 intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
1838
1839 if ((vect_info & VECTORING_INFO_VALID_MASK) &&
1840 !is_page_fault(intr_info))
1841 printk(KERN_ERR "%s: unexpected, vectoring info 0x%x "
1842 "intr info 0x%x\n", __FUNCTION__, vect_info, intr_info);
1843
1844 if (!irqchip_in_kernel(vcpu->kvm) && is_external_interrupt(vect_info)) {
1845 int irq = vect_info & VECTORING_INFO_VECTOR_MASK;
1846 set_bit(irq, vcpu->arch.irq_pending);
1847 set_bit(irq / BITS_PER_LONG, &vcpu->arch.irq_summary);
1848 }
1849
1850 if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200) /* nmi */
1851 return 1; /* already handled by vmx_vcpu_run() */
1852
1853 if (is_no_device(intr_info)) {
1854 vmx_fpu_activate(vcpu);
1855 return 1;
1856 }
1857
1858 if (is_invalid_opcode(intr_info)) {
1859 er = emulate_instruction(vcpu, kvm_run, 0, 0, 0);
1860 if (er != EMULATE_DONE)
1861 kvm_queue_exception(vcpu, UD_VECTOR);
1862 return 1;
1863 }
1864
1865 error_code = 0;
1866 rip = vmcs_readl(GUEST_RIP);
1867 if (intr_info & INTR_INFO_DELIEVER_CODE_MASK)
1868 error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
1869 if (is_page_fault(intr_info)) {
1870 cr2 = vmcs_readl(EXIT_QUALIFICATION);
1871 return kvm_mmu_page_fault(vcpu, cr2, error_code);
1872 }
1873
1874 if (vcpu->arch.rmode.active &&
1875 handle_rmode_exception(vcpu, intr_info & INTR_INFO_VECTOR_MASK,
1876 error_code)) {
1877 if (vcpu->arch.halt_request) {
1878 vcpu->arch.halt_request = 0;
1879 return kvm_emulate_halt(vcpu);
1880 }
1881 return 1;
1882 }
1883
1884 if ((intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK)) ==
1885 (INTR_TYPE_EXCEPTION | 1)) {
1886 kvm_run->exit_reason = KVM_EXIT_DEBUG;
1887 return 0;
1888 }
1889 kvm_run->exit_reason = KVM_EXIT_EXCEPTION;
1890 kvm_run->ex.exception = intr_info & INTR_INFO_VECTOR_MASK;
1891 kvm_run->ex.error_code = error_code;
1892 return 0;
1893}
1894
1895static int handle_external_interrupt(struct kvm_vcpu *vcpu,
1896 struct kvm_run *kvm_run)
1897{
1898 ++vcpu->stat.irq_exits;
1899 return 1;
1900}
1901
1902static int handle_triple_fault(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1903{
1904 kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
1905 return 0;
1906}
1907
1908static int handle_io(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1909{
1910 unsigned long exit_qualification;
1911 int size, down, in, string, rep;
1912 unsigned port;
1913
1914 ++vcpu->stat.io_exits;
1915 exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
1916 string = (exit_qualification & 16) != 0;
1917
1918 if (string) {
1919 if (emulate_instruction(vcpu,
1920 kvm_run, 0, 0, 0) == EMULATE_DO_MMIO)
1921 return 0;
1922 return 1;
1923 }
1924
1925 size = (exit_qualification & 7) + 1;
1926 in = (exit_qualification & 8) != 0;
1927 down = (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_DF) != 0;
1928 rep = (exit_qualification & 32) != 0;
1929 port = exit_qualification >> 16;
1930
1931 return kvm_emulate_pio(vcpu, kvm_run, in, size, port);
1932}
1933
/*
 * Write the three-byte VMCALL opcode (0f 01 c1) to the start of
 * @hypercall.  Bytes beyond the first three are left untouched.
 */
static void
vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
{
	/*
	 * Patch in the VMCALL instruction:
	 */
	static const unsigned char vmcall_insn[] = { 0x0f, 0x01, 0xc1 };
	unsigned int i;

	for (i = 0; i < sizeof(vmcall_insn); i++)
		hypercall[i] = vmcall_insn[i];
}
1944
1945static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1946{
1947 unsigned long exit_qualification;
1948 int cr;
1949 int reg;
1950
1951 exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
1952 cr = exit_qualification & 15;
1953 reg = (exit_qualification >> 8) & 15;
1954 switch ((exit_qualification >> 4) & 3) {
1955 case 0: /* mov to cr */
1956 switch (cr) {
1957 case 0:
1958 vcpu_load_rsp_rip(vcpu);
1959 set_cr0(vcpu, vcpu->arch.regs[reg]);
1960 skip_emulated_instruction(vcpu);
1961 return 1;
1962 case 3:
1963 vcpu_load_rsp_rip(vcpu);
1964 set_cr3(vcpu, vcpu->arch.regs[reg]);
1965 skip_emulated_instruction(vcpu);
1966 return 1;
1967 case 4:
1968 vcpu_load_rsp_rip(vcpu);
1969 set_cr4(vcpu, vcpu->arch.regs[reg]);
1970 skip_emulated_instruction(vcpu);
1971 return 1;
1972 case 8:
1973 vcpu_load_rsp_rip(vcpu);
1974 set_cr8(vcpu, vcpu->arch.regs[reg]);
1975 skip_emulated_instruction(vcpu);
1976 if (irqchip_in_kernel(vcpu->kvm))
1977 return 1;
1978 kvm_run->exit_reason = KVM_EXIT_SET_TPR;
1979 return 0;
1980 };
1981 break;
1982 case 2: /* clts */
1983 vcpu_load_rsp_rip(vcpu);
1984 vmx_fpu_deactivate(vcpu);
1985 vcpu->arch.cr0 &= ~X86_CR0_TS;
1986 vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0);
1987 vmx_fpu_activate(vcpu);
1988 skip_emulated_instruction(vcpu);
1989 return 1;
1990 case 1: /*mov from cr*/
1991 switch (cr) {
1992 case 3:
1993 vcpu_load_rsp_rip(vcpu);
1994 vcpu->arch.regs[reg] = vcpu->arch.cr3;
1995 vcpu_put_rsp_rip(vcpu);
1996 skip_emulated_instruction(vcpu);
1997 return 1;
1998 case 8:
1999 vcpu_load_rsp_rip(vcpu);
2000 vcpu->arch.regs[reg] = get_cr8(vcpu);
2001 vcpu_put_rsp_rip(vcpu);
2002 skip_emulated_instruction(vcpu);
2003 return 1;
2004 }
2005 break;
2006 case 3: /* lmsw */
2007 lmsw(vcpu, (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f);
2008
2009 skip_emulated_instruction(vcpu);
2010 return 1;
2011 default:
2012 break;
2013 }
2014 kvm_run->exit_reason = 0;
2015 pr_unimpl(vcpu, "unhandled control register: op %d cr %d\n",
2016 (int)(exit_qualification >> 4) & 3, cr);
2017 return 0;
2018}
2019
2020static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2021{
2022 unsigned long exit_qualification;
2023 unsigned long val;
2024 int dr, reg;
2025
2026 /*
2027 * FIXME: this code assumes the host is debugging the guest.
2028 * need to deal with guest debugging itself too.
2029 */
2030 exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
2031 dr = exit_qualification & 7;
2032 reg = (exit_qualification >> 8) & 15;
2033 vcpu_load_rsp_rip(vcpu);
2034 if (exit_qualification & 16) {
2035 /* mov from dr */
2036 switch (dr) {
2037 case 6:
2038 val = 0xffff0ff0;
2039 break;
2040 case 7:
2041 val = 0x400;
2042 break;
2043 default:
2044 val = 0;
2045 }
2046 vcpu->arch.regs[reg] = val;
2047 } else {
2048 /* mov to dr */
2049 }
2050 vcpu_put_rsp_rip(vcpu);
2051 skip_emulated_instruction(vcpu);
2052 return 1;
2053}
2054
/* VM-exit handler for CPUID: delegate to kvm_emulate_cpuid() and resume. */
static int handle_cpuid(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
	kvm_emulate_cpuid(vcpu);

	return 1;
}
2060
2061static int handle_rdmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2062{
2063 u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX];
2064 u64 data;
2065
2066 if (vmx_get_msr(vcpu, ecx, &data)) {
2067 kvm_inject_gp(vcpu, 0);
2068 return 1;
2069 }
2070
2071 /* FIXME: handling of bits 32:63 of rax, rdx */
2072 vcpu->arch.regs[VCPU_REGS_RAX] = data & -1u;
2073 vcpu->arch.regs[VCPU_REGS_RDX] = (data >> 32) & -1u;
2074 skip_emulated_instruction(vcpu);
2075 return 1;
2076}
2077
2078static int handle_wrmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2079{
2080 u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX];
2081 u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u)
2082 | ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32);
2083
2084 if (vmx_set_msr(vcpu, ecx, data) != 0) {
2085 kvm_inject_gp(vcpu, 0);
2086 return 1;
2087 }
2088
2089 skip_emulated_instruction(vcpu);
2090 return 1;
2091}
2092
/*
 * VM-exit handler for "TPR below threshold": nothing is done here, the
 * guest is simply resumed.
 */
static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu,
				      struct kvm_run *kvm_run)
{
	return 1;
}
2098
2099static int handle_interrupt_window(struct kvm_vcpu *vcpu,
2100 struct kvm_run *kvm_run)
2101{
2102 u32 cpu_based_vm_exec_control;
2103
2104 /* clear pending irq */
2105 cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
2106 cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
2107 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
2108 /*
2109 * If the user space waits to inject interrupts, exit as soon as
2110 * possible
2111 */
2112 if (kvm_run->request_interrupt_window &&
2113 !vcpu->arch.irq_summary) {
2114 kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
2115 ++vcpu->stat.irq_window_exits;
2116 return 0;
2117 }
2118 return 1;
2119}
2120
/*
 * VM-exit handler for HLT: step past the instruction, then let
 * kvm_emulate_halt() decide the return value.
 */
static int handle_halt(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
	skip_emulated_instruction(vcpu);

	return kvm_emulate_halt(vcpu);
}
2126
/*
 * VM-exit handler for VMCALL: step past the instruction, run the
 * hypercall and resume the guest.
 */
static int handle_vmcall(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
	skip_emulated_instruction(vcpu);

	kvm_emulate_hypercall(vcpu);

	return 1;
}
2133
/*
 * VM-exit handler for WBINVD: the instruction is skipped without any
 * further action and the guest is resumed.
 */
static int handle_wbinvd(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
	/* TODO: Add support for VT-d/pass-through device */
	skip_emulated_instruction(vcpu);

	return 1;
}
2140
2141static int handle_apic_access(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2142{
2143 u64 exit_qualification;
2144 enum emulation_result er;
2145 unsigned long offset;
2146
2147 exit_qualification = vmcs_read64(EXIT_QUALIFICATION);
2148 offset = exit_qualification & 0xffful;
2149
2150 er = emulate_instruction(vcpu, kvm_run, 0, 0, 0);
2151
2152 if (er != EMULATE_DONE) {
2153 printk(KERN_ERR
2154 "Fail to handle apic access vmexit! Offset is 0x%lx\n",
2155 offset);
2156 return -ENOTSUPP;
2157 }
2158 return 1;
2159}
2160
2161/*
2162 * The exit handlers return 1 if the exit was handled fully and guest execution
2163 * may resume. Otherwise they set the kvm_run parameter to indicate what needs
2164 * to be done to userspace and return 0.
2165 */
2166static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu,
2167 struct kvm_run *kvm_run) = {
2168 [EXIT_REASON_EXCEPTION_NMI] = handle_exception,
2169 [EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt,
2170 [EXIT_REASON_TRIPLE_FAULT] = handle_triple_fault,
2171 [EXIT_REASON_IO_INSTRUCTION] = handle_io,
2172 [EXIT_REASON_CR_ACCESS] = handle_cr,
2173 [EXIT_REASON_DR_ACCESS] = handle_dr,
2174 [EXIT_REASON_CPUID] = handle_cpuid,
2175 [EXIT_REASON_MSR_READ] = handle_rdmsr,
2176 [EXIT_REASON_MSR_WRITE] = handle_wrmsr,
2177 [EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window,
2178 [EXIT_REASON_HLT] = handle_halt,
2179 [EXIT_REASON_VMCALL] = handle_vmcall,
2180 [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold,
2181 [EXIT_REASON_APIC_ACCESS] = handle_apic_access,
2182 [EXIT_REASON_WBINVD] = handle_wbinvd,
2183};
2184
2185static const int kvm_vmx_max_exit_handlers =
2186 ARRAY_SIZE(kvm_vmx_exit_handlers);
2187
2188/*
2189 * The guest has exited. See if we can fix it or if we need userspace
2190 * assistance.
2191 */
2192static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
2193{
2194 u32 exit_reason = vmcs_read32(VM_EXIT_REASON);
2195 struct vcpu_vmx *vmx = to_vmx(vcpu);
2196 u32 vectoring_info = vmx->idt_vectoring_info;
2197
2198 if (unlikely(vmx->fail)) {
2199 kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
2200 kvm_run->fail_entry.hardware_entry_failure_reason
2201 = vmcs_read32(VM_INSTRUCTION_ERROR);
2202 return 0;
2203 }
2204
2205 if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
2206 exit_reason != EXIT_REASON_EXCEPTION_NMI)
2207 printk(KERN_WARNING "%s: unexpected, valid vectoring info and "
2208 "exit reason is 0x%x\n", __FUNCTION__, exit_reason);
2209 if (exit_reason < kvm_vmx_max_exit_handlers
2210 && kvm_vmx_exit_handlers[exit_reason])
2211 return kvm_vmx_exit_handlers[exit_reason](vcpu, kvm_run);
2212 else {
2213 kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
2214 kvm_run->hw.hardware_exit_reason = exit_reason;
2215 }
2216 return 0;
2217}
2218
/* TLB flush hook; the body is intentionally empty in this version. */
static void vmx_flush_tlb(struct kvm_vcpu *vcpu)
{
	/* nothing to do */
}
2222
2223static void update_tpr_threshold(struct kvm_vcpu *vcpu)
2224{
2225 int max_irr, tpr;
2226
2227 if (!vm_need_tpr_shadow(vcpu->kvm))
2228 return;
2229
2230 if (!kvm_lapic_enabled(vcpu) ||
2231 ((max_irr = kvm_lapic_find_highest_irr(vcpu)) == -1)) {
2232 vmcs_write32(TPR_THRESHOLD, 0);
2233 return;
2234 }
2235
2236 tpr = (kvm_lapic_get_cr8(vcpu) & 0x0f) << 4;
2237 vmcs_write32(TPR_THRESHOLD, (max_irr > tpr) ? tpr >> 4 : max_irr >> 4);
2238}
2239
2240static void enable_irq_window(struct kvm_vcpu *vcpu)
2241{
2242 u32 cpu_based_vm_exec_control;
2243
2244 cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
2245 cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING;
2246 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
2247}
2248
/*
 * Decide what, if anything, to inject on the next VM entry.
 *
 * In order:
 *  - refresh the TPR threshold;
 *  - if an event is already queued in VM_ENTRY_INTR_INFO_FIELD, leave
 *    it alone and call enable_irq_window() when another interrupt is
 *    waiting;
 *  - if the previous exit interrupted event delivery
 *    (idt_vectoring_info valid), queue that event again: through
 *    vmx_inject_irq() for an external interrupt while rmode.active is
 *    set, otherwise by copying the vectoring info, instruction length
 *    and (if present) error code into the VM-entry fields;
 *  - otherwise, when an interrupt is waiting, inject it if RFLAGS.IF is
 *    set and the low two interruptibility bits are clear, else call
 *    enable_irq_window().
 */
2249 static void vmx_intr_assist(struct kvm_vcpu *vcpu)
2250 {
2251 struct vcpu_vmx *vmx = to_vmx(vcpu);
2252 u32 idtv_info_field, intr_info_field;
2253 int has_ext_irq, interrupt_window_open;
2254 int vector;
2255
2256 update_tpr_threshold(vcpu);
2257
2258 has_ext_irq = kvm_cpu_has_interrupt(vcpu);
2259 intr_info_field = vmcs_read32(VM_ENTRY_INTR_INFO_FIELD);
2260 idtv_info_field = vmx->idt_vectoring_info;
/* Case 1: something is already queued for this entry. */
2261 if (intr_info_field & INTR_INFO_VALID_MASK) {
2262 if (idtv_info_field & INTR_INFO_VALID_MASK) {
2263 /* TODO: fault when IDT_Vectoring */
2264 if (printk_ratelimit())
2265 printk(KERN_ERR "Fault when IDT_Vectoring\n");
2266 }
2267 if (has_ext_irq)
2268 enable_irq_window(vcpu);
2269 return;
2270 }
/* Case 2: the last exit cut an event delivery short; queue it again. */
2271 if (unlikely(idtv_info_field & INTR_INFO_VALID_MASK)) {
2272 if ((idtv_info_field & VECTORING_INFO_TYPE_MASK)
2273 == INTR_TYPE_EXT_INTR
2274 && vcpu->arch.rmode.active) {
2275 u8 vect = idtv_info_field & VECTORING_INFO_VECTOR_MASK;
2276
2277 vmx_inject_irq(vcpu, vect);
2278 if (unlikely(has_ext_irq))
2279 enable_irq_window(vcpu);
2280 return;
2281 }
2282
2283 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, idtv_info_field);
2284 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
2285 vmcs_read32(VM_EXIT_INSTRUCTION_LEN));
2286
2287 if (unlikely(idtv_info_field & INTR_INFO_DELIEVER_CODE_MASK))
2288 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
2289 vmcs_read32(IDT_VECTORING_ERROR_CODE));
2290 if (unlikely(has_ext_irq))
2291 enable_irq_window(vcpu);
2292 return;
2293 }
/* Case 3: nothing queued; deliver a fresh interrupt if there is one. */
2294 if (!has_ext_irq)
2295 return;
2296 interrupt_window_open =
2297 ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
2298 (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0);
2299 if (interrupt_window_open) {
2300 vector = kvm_cpu_get_interrupt(vcpu);
2301 vmx_inject_irq(vcpu, vector);
2302 kvm_timer_intr_post(vcpu, vector);
2303 } else
2304 enable_irq_window(vcpu);
2305}
2306
2307/*
2308 * Failure to inject an interrupt should give us the information
2309 * in IDT_VECTORING_INFO_FIELD. However, if the failure occurs
2310 * when fetching the interrupt redirection bitmap in the real-mode
2311 * tss, this doesn't happen. So we do it ourselves.
2312 */
2313static void fixup_rmode_irq(struct vcpu_vmx *vmx)
2314{
2315 vmx->rmode.irq.pending = 0;
2316 if (vmcs_readl(GUEST_RIP) + 1 != vmx->rmode.irq.rip)
2317 return;
2318 vmcs_writel(GUEST_RIP, vmx->rmode.irq.rip);
2319 if (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK) {
2320 vmx->idt_vectoring_info &= ~VECTORING_INFO_TYPE_MASK;
2321 vmx->idt_vectoring_info |= INTR_TYPE_EXT_INTR;
2322 return;
2323 }
2324 vmx->idt_vectoring_info =
2325 VECTORING_INFO_VALID_MASK
2326 | INTR_TYPE_EXT_INTR
2327 | vmx->rmode.irq.vector;
2328}
2329
/*
 * Enter the guest and come back after the next VM exit.
 *
 * Before the asm block HOST_CR0 is refreshed from the live CR0.  The
 * asm block then:
 *  - pushes the host rdx, rbp and rcx (edx/ebp/ecx on 32-bit);
 *  - issues ASM_VMX_VMWRITE_RSP_RDX with rdx holding the HOST_RSP field
 *    id (presumably storing the current stack pointer there -- the
 *    macro is defined elsewhere);
 *  - compares vmx->launched with 0 and keeps the flags alive while the
 *    guest cr2 and general-purpose registers are loaded from
 *    vcpu.arch.regs[], with rcx (the vmx pointer) loaded last;
 *  - executes VMLAUNCH when launched was 0, VMRESUME otherwise;
 *  - at .Lkvm_vmx_return swaps rcx with the top of the stack to get the
 *    vmx pointer back, stores the guest registers and cr2, pops the
 *    host registers, and stores the "below or equal" condition (CF or
 *    ZF set) into vmx->fail with setbe.
 *
 * Afterwards the IDT-vectoring info is cached, a pending real-mode
 * interrupt injection is fixed up, interrupt_window_open is refreshed,
 * ds/es are reloaded with __USER_DS, launched is set, and an NMI that
 * caused the exit is forwarded to the host with "int $2".
 *
 * NOTE(review): the 32-bit clobber list names "rsi" next to "ebx" and
 * "edi"; "esi" looks like what was meant -- confirm.
 */
2330 static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2331 {
2332 struct vcpu_vmx *vmx = to_vmx(vcpu);
2333 u32 intr_info;
2334
2335 /*
2336 * Loading guest fpu may have cleared host cr0.ts
2337 */
2338 vmcs_writel(HOST_CR0, read_cr0());
2339
2340 asm(
2341 /* Store host registers */
2342#ifdef CONFIG_X86_64
2343 "push %%rdx; push %%rbp;"
2344 "push %%rcx \n\t"
2345#else
2346 "push %%edx; push %%ebp;"
2347 "push %%ecx \n\t"
2348#endif
2349 ASM_VMX_VMWRITE_RSP_RDX "\n\t"
2350 /* Check if vmlaunch or vmresume is needed */
2351 "cmpl $0, %c[launched](%0) \n\t"
2352 /* Load guest registers. Don't clobber flags. */
2353#ifdef CONFIG_X86_64
2354 "mov %c[cr2](%0), %%rax \n\t"
2355 "mov %%rax, %%cr2 \n\t"
2356 "mov %c[rax](%0), %%rax \n\t"
2357 "mov %c[rbx](%0), %%rbx \n\t"
2358 "mov %c[rdx](%0), %%rdx \n\t"
2359 "mov %c[rsi](%0), %%rsi \n\t"
2360 "mov %c[rdi](%0), %%rdi \n\t"
2361 "mov %c[rbp](%0), %%rbp \n\t"
2362 "mov %c[r8](%0), %%r8 \n\t"
2363 "mov %c[r9](%0), %%r9 \n\t"
2364 "mov %c[r10](%0), %%r10 \n\t"
2365 "mov %c[r11](%0), %%r11 \n\t"
2366 "mov %c[r12](%0), %%r12 \n\t"
2367 "mov %c[r13](%0), %%r13 \n\t"
2368 "mov %c[r14](%0), %%r14 \n\t"
2369 "mov %c[r15](%0), %%r15 \n\t"
2370 "mov %c[rcx](%0), %%rcx \n\t" /* kills %0 (rcx) */
2371#else
2372 "mov %c[cr2](%0), %%eax \n\t"
2373 "mov %%eax, %%cr2 \n\t"
2374 "mov %c[rax](%0), %%eax \n\t"
2375 "mov %c[rbx](%0), %%ebx \n\t"
2376 "mov %c[rdx](%0), %%edx \n\t"
2377 "mov %c[rsi](%0), %%esi \n\t"
2378 "mov %c[rdi](%0), %%edi \n\t"
2379 "mov %c[rbp](%0), %%ebp \n\t"
2380 "mov %c[rcx](%0), %%ecx \n\t" /* kills %0 (ecx) */
2381#endif
2382 /* Enter guest mode */
2383 "jne .Llaunched \n\t"
2384 ASM_VMX_VMLAUNCH "\n\t"
2385 "jmp .Lkvm_vmx_return \n\t"
2386 ".Llaunched: " ASM_VMX_VMRESUME "\n\t"
2387 ".Lkvm_vmx_return: "
2388 /* Save guest registers, load host registers, keep flags */
/* The xchg swaps guest rcx with the vmx pointer pushed on entry. */
2389#ifdef CONFIG_X86_64
2390 "xchg %0, (%%rsp) \n\t"
2391 "mov %%rax, %c[rax](%0) \n\t"
2392 "mov %%rbx, %c[rbx](%0) \n\t"
2393 "pushq (%%rsp); popq %c[rcx](%0) \n\t"
2394 "mov %%rdx, %c[rdx](%0) \n\t"
2395 "mov %%rsi, %c[rsi](%0) \n\t"
2396 "mov %%rdi, %c[rdi](%0) \n\t"
2397 "mov %%rbp, %c[rbp](%0) \n\t"
2398 "mov %%r8, %c[r8](%0) \n\t"
2399 "mov %%r9, %c[r9](%0) \n\t"
2400 "mov %%r10, %c[r10](%0) \n\t"
2401 "mov %%r11, %c[r11](%0) \n\t"
2402 "mov %%r12, %c[r12](%0) \n\t"
2403 "mov %%r13, %c[r13](%0) \n\t"
2404 "mov %%r14, %c[r14](%0) \n\t"
2405 "mov %%r15, %c[r15](%0) \n\t"
2406 "mov %%cr2, %%rax \n\t"
2407 "mov %%rax, %c[cr2](%0) \n\t"
2408
/* First pop discards the slot that now holds guest rcx; then rbp, rdx. */
2409 "pop %%rbp; pop %%rbp; pop %%rdx \n\t"
2410#else
2411 "xchg %0, (%%esp) \n\t"
2412 "mov %%eax, %c[rax](%0) \n\t"
2413 "mov %%ebx, %c[rbx](%0) \n\t"
2414 "pushl (%%esp); popl %c[rcx](%0) \n\t"
2415 "mov %%edx, %c[rdx](%0) \n\t"
2416 "mov %%esi, %c[rsi](%0) \n\t"
2417 "mov %%edi, %c[rdi](%0) \n\t"
2418 "mov %%ebp, %c[rbp](%0) \n\t"
2419 "mov %%cr2, %%eax \n\t"
2420 "mov %%eax, %c[cr2](%0) \n\t"
2421
2422 "pop %%ebp; pop %%ebp; pop %%edx \n\t"
2423#endif
2424 "setbe %c[fail](%0) \n\t"
2425 : : "c"(vmx), "d"((unsigned long)HOST_RSP),
2426 [launched]"i"(offsetof(struct vcpu_vmx, launched)),
2427 [fail]"i"(offsetof(struct vcpu_vmx, fail)),
2428 [rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])),
2429 [rbx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBX])),
2430 [rcx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RCX])),
2431 [rdx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDX])),
2432 [rsi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RSI])),
2433 [rdi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDI])),
2434 [rbp]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBP])),
2435#ifdef CONFIG_X86_64
2436 [r8]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R8])),
2437 [r9]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R9])),
2438 [r10]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R10])),
2439 [r11]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R11])),
2440 [r12]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R12])),
2441 [r13]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R13])),
2442 [r14]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R14])),
2443 [r15]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R15])),
2444#endif
2445 [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2))
2446 : "cc", "memory"
2447#ifdef CONFIG_X86_64
2448 , "rbx", "rdi", "rsi"
2449 , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"
2450#else
2451 , "ebx", "edi", "rsi"
2452#endif
2453 );
2454
/* Cache the vectoring info; fixup_rmode_irq() may rewrite the cached copy. */
2455 vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
2456 if (vmx->rmode.irq.pending)
2457 fixup_rmode_irq(vmx);
2458
2459 vcpu->arch.interrupt_window_open =
2460 (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0;
2461
/* Reload ds and es with __USER_DS after the exit. */
2462 asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS));
2463 vmx->launched = 1;
2464
2465 intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
2466
2467 /* We need to handle NMIs before interrupts are enabled */
2468 if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200) /* nmi */
2469 asm("int $2");
2470}
2471
2472static void vmx_free_vmcs(struct kvm_vcpu *vcpu)
2473{
2474 struct vcpu_vmx *vmx = to_vmx(vcpu);
2475
2476 if (vmx->vmcs) {
2477 on_each_cpu(__vcpu_clear, vmx, 0, 1);
2478 free_vmcs(vmx->vmcs);
2479 vmx->vmcs = NULL;
2480 }
2481}
2482
2483static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
2484{
2485 struct vcpu_vmx *vmx = to_vmx(vcpu);
2486
2487 vmx_free_vmcs(vcpu);
2488 kfree(vmx->host_msrs);
2489 kfree(vmx->guest_msrs);
2490 kvm_vcpu_uninit(vcpu);
2491 kmem_cache_free(kvm_vcpu_cache, vmx);
2492}
2493
2494static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
2495{
2496 int err;
2497 struct vcpu_vmx *vmx = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
2498 int cpu;
2499
2500 if (!vmx)
2501 return ERR_PTR(-ENOMEM);
2502
2503 err = kvm_vcpu_init(&vmx->vcpu, kvm, id);
2504 if (err)
2505 goto free_vcpu;
2506
2507 vmx->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL);
2508 if (!vmx->guest_msrs) {
2509 err = -ENOMEM;
2510 goto uninit_vcpu;
2511 }
2512
2513 vmx->host_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL);
2514 if (!vmx->host_msrs)
2515 goto free_guest_msrs;
2516
2517 vmx->vmcs = alloc_vmcs();
2518 if (!vmx->vmcs)
2519 goto free_msrs;
2520
2521 vmcs_clear(vmx->vmcs);
2522
2523 cpu = get_cpu();
2524 vmx_vcpu_load(&vmx->vcpu, cpu);
2525 err = vmx_vcpu_setup(vmx);
2526 vmx_vcpu_put(&vmx->vcpu);
2527 put_cpu();
2528 if (err)
2529 goto free_vmcs;
2530
2531 return &vmx->vcpu;
2532
2533free_vmcs:
2534 free_vmcs(vmx->vmcs);
2535free_msrs:
2536 kfree(vmx->host_msrs);
2537free_guest_msrs:
2538 kfree(vmx->guest_msrs);
2539uninit_vcpu:
2540 kvm_vcpu_uninit(&vmx->vcpu);
2541free_vcpu:
2542 kmem_cache_free(kvm_vcpu_cache, vmx);
2543 return ERR_PTR(err);
2544}
2545
2546static void __init vmx_check_processor_compat(void *rtn)
2547{
2548 struct vmcs_config vmcs_conf;
2549
2550 *(int *)rtn = 0;
2551 if (setup_vmcs_config(&vmcs_conf) < 0)
2552 *(int *)rtn = -EIO;
2553 if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config)) != 0) {
2554 printk(KERN_ERR "kvm: CPU %d feature inconsistency!\n",
2555 smp_processor_id());
2556 *(int *)rtn = -EIO;
2557 }
2558}
2559
/*
 * VMX implementation of the kvm_x86_ops callback table.  vmx_init() hands
 * a pointer to this table to kvm_init().
 */
2560static struct kvm_x86_ops vmx_x86_ops = {
2561 .cpu_has_kvm_support = cpu_has_kvm_support,
2562 .disabled_by_bios = vmx_disabled_by_bios,
2563 .hardware_setup = hardware_setup,
2564 .hardware_unsetup = hardware_unsetup,
2565 .check_processor_compatibility = vmx_check_processor_compat,
2566 .hardware_enable = hardware_enable,
2567 .hardware_disable = hardware_disable,
2568
 /* vcpu lifetime */
2569 .vcpu_create = vmx_create_vcpu,
2570 .vcpu_free = vmx_free_vcpu,
2571 .vcpu_reset = vmx_vcpu_reset,
2572
2573 .prepare_guest_switch = vmx_save_host_state,
2574 .vcpu_load = vmx_vcpu_load,
2575 .vcpu_put = vmx_vcpu_put,
2576 .vcpu_decache = vmx_vcpu_decache,
2577
 /* guest register, MSR, segment and descriptor-table accessors */
2578 .set_guest_debug = set_guest_debug,
2579 .guest_debug_pre = kvm_guest_debug_pre,
2580 .get_msr = vmx_get_msr,
2581 .set_msr = vmx_set_msr,
2582 .get_segment_base = vmx_get_segment_base,
2583 .get_segment = vmx_get_segment,
2584 .set_segment = vmx_set_segment,
2585 .get_cs_db_l_bits = vmx_get_cs_db_l_bits,
2586 .decache_cr4_guest_bits = vmx_decache_cr4_guest_bits,
2587 .set_cr0 = vmx_set_cr0,
2588 .set_cr3 = vmx_set_cr3,
2589 .set_cr4 = vmx_set_cr4,
2590#ifdef CONFIG_X86_64
2591 .set_efer = vmx_set_efer,
2592#endif
2593 .get_idt = vmx_get_idt,
2594 .set_idt = vmx_set_idt,
2595 .get_gdt = vmx_get_gdt,
2596 .set_gdt = vmx_set_gdt,
2597 .cache_regs = vcpu_load_rsp_rip,
2598 .decache_regs = vcpu_put_rsp_rip,
2599 .get_rflags = vmx_get_rflags,
2600 .set_rflags = vmx_set_rflags,
2601
2602 .tlb_flush = vmx_flush_tlb,
2603
 /* guest entry, exit handling and event injection */
2604 .run = vmx_vcpu_run,
2605 .handle_exit = kvm_handle_exit,
2606 .skip_emulated_instruction = skip_emulated_instruction,
2607 .patch_hypercall = vmx_patch_hypercall,
2608 .get_irq = vmx_get_irq,
2609 .set_irq = vmx_inject_irq,
2610 .queue_exception = vmx_queue_exception,
2611 .exception_injected = vmx_exception_injected,
2612 .inject_pending_irq = vmx_intr_assist,
2613 .inject_pending_vectors = do_interrupt_requests,
2614
2615 .set_tss_addr = vmx_set_tss_addr,
2616};
2617
2618static int __init vmx_init(void)
2619{
2620 void *iova;
2621 int r;
2622
2623 vmx_io_bitmap_a = alloc_page(GFP_KERNEL | __GFP_HIGHMEM);
2624 if (!vmx_io_bitmap_a)
2625 return -ENOMEM;
2626
2627 vmx_io_bitmap_b = alloc_page(GFP_KERNEL | __GFP_HIGHMEM);
2628 if (!vmx_io_bitmap_b) {
2629 r = -ENOMEM;
2630 goto out;
2631 }
2632
2633 /*
2634 * Allow direct access to the PC debug port (it is often used for I/O
2635 * delays, but the vmexits simply slow things down).
2636 */
2637 iova = kmap(vmx_io_bitmap_a);
2638 memset(iova, 0xff, PAGE_SIZE);
2639 clear_bit(0x80, iova);
2640 kunmap(vmx_io_bitmap_a);
2641
2642 iova = kmap(vmx_io_bitmap_b);
2643 memset(iova, 0xff, PAGE_SIZE);
2644 kunmap(vmx_io_bitmap_b);
2645
2646 r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx), THIS_MODULE);
2647 if (r)
2648 goto out1;
2649
2650 if (bypass_guest_pf)
2651 kvm_mmu_set_nonpresent_ptes(~0xffeull, 0ull);
2652
2653 return 0;
2654
2655out1:
2656 __free_page(vmx_io_bitmap_b);
2657out:
2658 __free_page(vmx_io_bitmap_a);
2659 return r;
2660}
2661
2662static void __exit vmx_exit(void)
2663{
2664 __free_page(vmx_io_bitmap_b);
2665 __free_page(vmx_io_bitmap_a);
2666
2667 kvm_exit();
2668}
2669
2670module_init(vmx_init)
2671module_exit(vmx_exit)
diff --git a/arch/x86/kvm/vmx.h b/arch/x86/kvm/vmx.h
new file mode 100644
index 000000000000..d52ae8d7303d
--- /dev/null
+++ b/arch/x86/kvm/vmx.h
@@ -0,0 +1,324 @@
1#ifndef VMX_H
2#define VMX_H
3
4/*
5 * vmx.h: VMX Architecture related definitions
6 * Copyright (c) 2004, Intel Corporation.
7 *
8 * This program is free software; you can redistribute it and/or modify it
9 * under the terms and conditions of the GNU General Public License,
10 * version 2, as published by the Free Software Foundation.
11 *
12 * This program is distributed in the hope it will be useful, but WITHOUT
13 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
15 * more details.
16 *
17 * You should have received a copy of the GNU General Public License along with
18 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
19 * Place - Suite 330, Boston, MA 02111-1307 USA.
20 *
21 * A few random additions are:
22 * Copyright (C) 2006 Qumranet
23 * Avi Kivity <avi@qumranet.com>
24 * Yaniv Kamay <yaniv@qumranet.com>
25 *
26 */
27
28/*
29 * Definitions of Primary Processor-Based VM-Execution Controls.
30 */
31#define CPU_BASED_VIRTUAL_INTR_PENDING 0x00000004
32#define CPU_BASED_USE_TSC_OFFSETING 0x00000008
33#define CPU_BASED_HLT_EXITING 0x00000080
34#define CPU_BASED_INVLPG_EXITING 0x00000200
35#define CPU_BASED_MWAIT_EXITING 0x00000400
36#define CPU_BASED_RDPMC_EXITING 0x00000800
37#define CPU_BASED_RDTSC_EXITING 0x00001000
38#define CPU_BASED_CR8_LOAD_EXITING 0x00080000
39#define CPU_BASED_CR8_STORE_EXITING 0x00100000
40#define CPU_BASED_TPR_SHADOW 0x00200000
41#define CPU_BASED_MOV_DR_EXITING 0x00800000
42#define CPU_BASED_UNCOND_IO_EXITING 0x01000000
43#define CPU_BASED_USE_IO_BITMAPS 0x02000000
44#define CPU_BASED_USE_MSR_BITMAPS 0x10000000
45#define CPU_BASED_MONITOR_EXITING 0x20000000
46#define CPU_BASED_PAUSE_EXITING 0x40000000
47#define CPU_BASED_ACTIVATE_SECONDARY_CONTROLS 0x80000000
48/*
49 * Definitions of Secondary Processor-Based VM-Execution Controls.
50 */
51#define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001
52#define SECONDARY_EXEC_WBINVD_EXITING 0x00000040
53
54
55#define PIN_BASED_EXT_INTR_MASK 0x00000001
56#define PIN_BASED_NMI_EXITING 0x00000008
57#define PIN_BASED_VIRTUAL_NMIS 0x00000020
58
59#define VM_EXIT_HOST_ADDR_SPACE_SIZE 0x00000200
60#define VM_EXIT_ACK_INTR_ON_EXIT 0x00008000
61
62#define VM_ENTRY_IA32E_MODE 0x00000200
63#define VM_ENTRY_SMM 0x00000400
64#define VM_ENTRY_DEACT_DUAL_MONITOR 0x00000800
65
66/* VMCS Encodings */
/*
 * Field identifiers passed to the vmcs accessors, e.g.
 * vmcs_read32(VM_EXIT_INTR_INFO).  The fields listed in the 0x2000-0x2803
 * range come in pairs: NAME, followed by NAME_HIGH at the next (odd)
 * encoding.
 */
67enum vmcs_field {
68 GUEST_ES_SELECTOR = 0x00000800,
69 GUEST_CS_SELECTOR = 0x00000802,
70 GUEST_SS_SELECTOR = 0x00000804,
71 GUEST_DS_SELECTOR = 0x00000806,
72 GUEST_FS_SELECTOR = 0x00000808,
73 GUEST_GS_SELECTOR = 0x0000080a,
74 GUEST_LDTR_SELECTOR = 0x0000080c,
75 GUEST_TR_SELECTOR = 0x0000080e,
76 HOST_ES_SELECTOR = 0x00000c00,
77 HOST_CS_SELECTOR = 0x00000c02,
78 HOST_SS_SELECTOR = 0x00000c04,
79 HOST_DS_SELECTOR = 0x00000c06,
80 HOST_FS_SELECTOR = 0x00000c08,
81 HOST_GS_SELECTOR = 0x00000c0a,
82 HOST_TR_SELECTOR = 0x00000c0c,
83 IO_BITMAP_A = 0x00002000,
84 IO_BITMAP_A_HIGH = 0x00002001,
85 IO_BITMAP_B = 0x00002002,
86 IO_BITMAP_B_HIGH = 0x00002003,
87 MSR_BITMAP = 0x00002004,
88 MSR_BITMAP_HIGH = 0x00002005,
89 VM_EXIT_MSR_STORE_ADDR = 0x00002006,
90 VM_EXIT_MSR_STORE_ADDR_HIGH = 0x00002007,
91 VM_EXIT_MSR_LOAD_ADDR = 0x00002008,
92 VM_EXIT_MSR_LOAD_ADDR_HIGH = 0x00002009,
93 VM_ENTRY_MSR_LOAD_ADDR = 0x0000200a,
94 VM_ENTRY_MSR_LOAD_ADDR_HIGH = 0x0000200b,
95 TSC_OFFSET = 0x00002010,
96 TSC_OFFSET_HIGH = 0x00002011,
97 VIRTUAL_APIC_PAGE_ADDR = 0x00002012,
98 VIRTUAL_APIC_PAGE_ADDR_HIGH = 0x00002013,
99 APIC_ACCESS_ADDR = 0x00002014,
100 APIC_ACCESS_ADDR_HIGH = 0x00002015,
101 VMCS_LINK_POINTER = 0x00002800,
102 VMCS_LINK_POINTER_HIGH = 0x00002801,
103 GUEST_IA32_DEBUGCTL = 0x00002802,
104 GUEST_IA32_DEBUGCTL_HIGH = 0x00002803,
105 PIN_BASED_VM_EXEC_CONTROL = 0x00004000,
106 CPU_BASED_VM_EXEC_CONTROL = 0x00004002,
107 EXCEPTION_BITMAP = 0x00004004,
108 PAGE_FAULT_ERROR_CODE_MASK = 0x00004006,
109 PAGE_FAULT_ERROR_CODE_MATCH = 0x00004008,
110 CR3_TARGET_COUNT = 0x0000400a,
111 VM_EXIT_CONTROLS = 0x0000400c,
112 VM_EXIT_MSR_STORE_COUNT = 0x0000400e,
113 VM_EXIT_MSR_LOAD_COUNT = 0x00004010,
114 VM_ENTRY_CONTROLS = 0x00004012,
115 VM_ENTRY_MSR_LOAD_COUNT = 0x00004014,
116 VM_ENTRY_INTR_INFO_FIELD = 0x00004016,
117 VM_ENTRY_EXCEPTION_ERROR_CODE = 0x00004018,
118 VM_ENTRY_INSTRUCTION_LEN = 0x0000401a,
119 TPR_THRESHOLD = 0x0000401c,
120 SECONDARY_VM_EXEC_CONTROL = 0x0000401e,
121 VM_INSTRUCTION_ERROR = 0x00004400,
122 VM_EXIT_REASON = 0x00004402,
123 VM_EXIT_INTR_INFO = 0x00004404,
124 VM_EXIT_INTR_ERROR_CODE = 0x00004406,
125 IDT_VECTORING_INFO_FIELD = 0x00004408,
126 IDT_VECTORING_ERROR_CODE = 0x0000440a,
127 VM_EXIT_INSTRUCTION_LEN = 0x0000440c,
128 VMX_INSTRUCTION_INFO = 0x0000440e,
129 GUEST_ES_LIMIT = 0x00004800,
130 GUEST_CS_LIMIT = 0x00004802,
131 GUEST_SS_LIMIT = 0x00004804,
132 GUEST_DS_LIMIT = 0x00004806,
133 GUEST_FS_LIMIT = 0x00004808,
134 GUEST_GS_LIMIT = 0x0000480a,
135 GUEST_LDTR_LIMIT = 0x0000480c,
136 GUEST_TR_LIMIT = 0x0000480e,
137 GUEST_GDTR_LIMIT = 0x00004810,
138 GUEST_IDTR_LIMIT = 0x00004812,
139 GUEST_ES_AR_BYTES = 0x00004814,
140 GUEST_CS_AR_BYTES = 0x00004816,
141 GUEST_SS_AR_BYTES = 0x00004818,
142 GUEST_DS_AR_BYTES = 0x0000481a,
143 GUEST_FS_AR_BYTES = 0x0000481c,
144 GUEST_GS_AR_BYTES = 0x0000481e,
145 GUEST_LDTR_AR_BYTES = 0x00004820,
146 GUEST_TR_AR_BYTES = 0x00004822,
147 GUEST_INTERRUPTIBILITY_INFO = 0x00004824,
148 GUEST_ACTIVITY_STATE = 0X00004826,
149 GUEST_SYSENTER_CS = 0x0000482A,
150 HOST_IA32_SYSENTER_CS = 0x00004c00,
151 CR0_GUEST_HOST_MASK = 0x00006000,
152 CR4_GUEST_HOST_MASK = 0x00006002,
153 CR0_READ_SHADOW = 0x00006004,
154 CR4_READ_SHADOW = 0x00006006,
155 CR3_TARGET_VALUE0 = 0x00006008,
156 CR3_TARGET_VALUE1 = 0x0000600a,
157 CR3_TARGET_VALUE2 = 0x0000600c,
158 CR3_TARGET_VALUE3 = 0x0000600e,
159 EXIT_QUALIFICATION = 0x00006400,
160 GUEST_LINEAR_ADDRESS = 0x0000640a,
161 GUEST_CR0 = 0x00006800,
162 GUEST_CR3 = 0x00006802,
163 GUEST_CR4 = 0x00006804,
164 GUEST_ES_BASE = 0x00006806,
165 GUEST_CS_BASE = 0x00006808,
166 GUEST_SS_BASE = 0x0000680a,
167 GUEST_DS_BASE = 0x0000680c,
168 GUEST_FS_BASE = 0x0000680e,
169 GUEST_GS_BASE = 0x00006810,
170 GUEST_LDTR_BASE = 0x00006812,
171 GUEST_TR_BASE = 0x00006814,
172 GUEST_GDTR_BASE = 0x00006816,
173 GUEST_IDTR_BASE = 0x00006818,
174 GUEST_DR7 = 0x0000681a,
175 GUEST_RSP = 0x0000681c,
176 GUEST_RIP = 0x0000681e,
177 GUEST_RFLAGS = 0x00006820,
178 GUEST_PENDING_DBG_EXCEPTIONS = 0x00006822,
179 GUEST_SYSENTER_ESP = 0x00006824,
180 GUEST_SYSENTER_EIP = 0x00006826,
181 HOST_CR0 = 0x00006c00,
182 HOST_CR3 = 0x00006c02,
183 HOST_CR4 = 0x00006c04,
184 HOST_FS_BASE = 0x00006c06,
185 HOST_GS_BASE = 0x00006c08,
186 HOST_TR_BASE = 0x00006c0a,
187 HOST_GDTR_BASE = 0x00006c0c,
188 HOST_IDTR_BASE = 0x00006c0e,
189 HOST_IA32_SYSENTER_ESP = 0x00006c10,
190 HOST_IA32_SYSENTER_EIP = 0x00006c12,
191 HOST_RSP = 0x00006c14,
192 HOST_RIP = 0x00006c16,
193};
194
195#define VMX_EXIT_REASONS_FAILED_VMENTRY 0x80000000
196
197#define EXIT_REASON_EXCEPTION_NMI 0
198#define EXIT_REASON_EXTERNAL_INTERRUPT 1
199#define EXIT_REASON_TRIPLE_FAULT 2
200
201#define EXIT_REASON_PENDING_INTERRUPT 7
202
203#define EXIT_REASON_TASK_SWITCH 9
204#define EXIT_REASON_CPUID 10
205#define EXIT_REASON_HLT 12
206#define EXIT_REASON_INVLPG 14
207#define EXIT_REASON_RDPMC 15
208#define EXIT_REASON_RDTSC 16
209#define EXIT_REASON_VMCALL 18
210#define EXIT_REASON_VMCLEAR 19
211#define EXIT_REASON_VMLAUNCH 20
212#define EXIT_REASON_VMPTRLD 21
213#define EXIT_REASON_VMPTRST 22
214#define EXIT_REASON_VMREAD 23
215#define EXIT_REASON_VMRESUME 24
216#define EXIT_REASON_VMWRITE 25
217#define EXIT_REASON_VMOFF 26
218#define EXIT_REASON_VMON 27
219#define EXIT_REASON_CR_ACCESS 28
220#define EXIT_REASON_DR_ACCESS 29
221#define EXIT_REASON_IO_INSTRUCTION 30
222#define EXIT_REASON_MSR_READ 31
223#define EXIT_REASON_MSR_WRITE 32
224#define EXIT_REASON_MWAIT_INSTRUCTION 36
225#define EXIT_REASON_TPR_BELOW_THRESHOLD 43
226#define EXIT_REASON_APIC_ACCESS 44
227#define EXIT_REASON_WBINVD 54
228
229/*
230 * Interruption-information format
231 */
232#define INTR_INFO_VECTOR_MASK 0xff /* 7:0 */
233#define INTR_INFO_INTR_TYPE_MASK 0x700 /* 10:8 */
234#define INTR_INFO_DELIEVER_CODE_MASK 0x800 /* 11 */
235#define INTR_INFO_VALID_MASK 0x80000000 /* 31 */
236
237#define VECTORING_INFO_VECTOR_MASK INTR_INFO_VECTOR_MASK
238#define VECTORING_INFO_TYPE_MASK INTR_INFO_INTR_TYPE_MASK
239#define VECTORING_INFO_DELIEVER_CODE_MASK INTR_INFO_DELIEVER_CODE_MASK
240#define VECTORING_INFO_VALID_MASK INTR_INFO_VALID_MASK
241
242#define INTR_TYPE_EXT_INTR (0 << 8) /* external interrupt */
243#define INTR_TYPE_EXCEPTION (3 << 8) /* processor exception */
244#define INTR_TYPE_SOFT_INTR (4 << 8) /* software interrupt */
245
246/*
247 * Exit Qualifications for MOV for Control Register Access
248 */
249#define CONTROL_REG_ACCESS_NUM 0x7 /* 2:0, number of control reg.*/
250#define CONTROL_REG_ACCESS_TYPE 0x30 /* 5:4, access type */
251#define CONTROL_REG_ACCESS_REG 0xf00 /* 11:8, general purpose reg. */
252#define LMSW_SOURCE_DATA_SHIFT 16
253#define LMSW_SOURCE_DATA (0xFFFF << LMSW_SOURCE_DATA_SHIFT) /* 31:16, lmsw source */
254#define REG_EAX (0 << 8)
255#define REG_ECX (1 << 8)
256#define REG_EDX (2 << 8)
257#define REG_EBX (3 << 8)
258#define REG_ESP (4 << 8)
259#define REG_EBP (5 << 8)
260#define REG_ESI (6 << 8)
261#define REG_EDI (7 << 8)
262#define REG_R8 (8 << 8)
263#define REG_R9 (9 << 8)
264#define REG_R10 (10 << 8)
265#define REG_R11 (11 << 8)
266#define REG_R12 (12 << 8)
267#define REG_R13 (13 << 8)
268#define REG_R14 (14 << 8)
269#define REG_R15 (15 << 8)
270
271/*
272 * Exit Qualifications for MOV for Debug Register Access
273 */
274#define DEBUG_REG_ACCESS_NUM 0x7 /* 2:0, number of debug reg. */
275#define DEBUG_REG_ACCESS_TYPE 0x10 /* 4, direction of access */
276#define TYPE_MOV_TO_DR (0 << 4)
277#define TYPE_MOV_FROM_DR (1 << 4)
278#define DEBUG_REG_ACCESS_REG 0xf00 /* 11:8, general purpose reg. */
279
280
281/* segment AR */
282#define SEGMENT_AR_L_MASK (1 << 13)
283
284#define AR_TYPE_ACCESSES_MASK 1
285#define AR_TYPE_READABLE_MASK (1 << 1)
286#define AR_TYPE_WRITEABLE_MASK (1 << 2)
287#define AR_TYPE_CODE_MASK (1 << 3)
288#define AR_TYPE_MASK 0x0f
289#define AR_TYPE_BUSY_64_TSS 11
290#define AR_TYPE_BUSY_32_TSS 11
291#define AR_TYPE_BUSY_16_TSS 3
292#define AR_TYPE_LDT 2
293
294#define AR_UNUSABLE_MASK (1 << 16)
295#define AR_S_MASK (1 << 4)
296#define AR_P_MASK (1 << 7)
297#define AR_L_MASK (1 << 13)
298#define AR_DB_MASK (1 << 14)
299#define AR_G_MASK (1 << 15)
300#define AR_DPL_SHIFT 5
301#define AR_DPL(ar) (((ar) >> AR_DPL_SHIFT) & 3)
302
303#define AR_RESERVD_MASK 0xfffe0f00
304
305#define MSR_IA32_VMX_BASIC 0x480
306#define MSR_IA32_VMX_PINBASED_CTLS 0x481
307#define MSR_IA32_VMX_PROCBASED_CTLS 0x482
308#define MSR_IA32_VMX_EXIT_CTLS 0x483
309#define MSR_IA32_VMX_ENTRY_CTLS 0x484
310#define MSR_IA32_VMX_MISC 0x485
311#define MSR_IA32_VMX_CR0_FIXED0 0x486
312#define MSR_IA32_VMX_CR0_FIXED1 0x487
313#define MSR_IA32_VMX_CR4_FIXED0 0x488
314#define MSR_IA32_VMX_CR4_FIXED1 0x489
315#define MSR_IA32_VMX_VMCS_ENUM 0x48a
316#define MSR_IA32_VMX_PROCBASED_CTLS2 0x48b
317
318#define MSR_IA32_FEATURE_CONTROL 0x3a
319#define MSR_IA32_FEATURE_CONTROL_LOCKED 0x1
320#define MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED 0x4
321
322#define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT 9
323
324#endif
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
new file mode 100644
index 000000000000..5902c5cbc1bb
--- /dev/null
+++ b/arch/x86/kvm/x86.c
@@ -0,0 +1,3146 @@
1/*
2 * Kernel-based Virtual Machine driver for Linux
3 *
4 * derived from drivers/kvm/kvm_main.c
5 *
6 * Copyright (C) 2006 Qumranet, Inc.
7 *
8 * Authors:
9 * Avi Kivity <avi@qumranet.com>
10 * Yaniv Kamay <yaniv@qumranet.com>
11 *
12 * This work is licensed under the terms of the GNU GPL, version 2. See
13 * the COPYING file in the top-level directory.
14 *
15 */
16
17#include <linux/kvm_host.h>
18#include "segment_descriptor.h"
19#include "irq.h"
20#include "mmu.h"
21
22#include <linux/kvm.h>
23#include <linux/fs.h>
24#include <linux/vmalloc.h>
25#include <linux/module.h>
26#include <linux/mman.h>
27#include <linux/highmem.h>
28
29#include <asm/uaccess.h>
30#include <asm/msr.h>
31
32#define MAX_IO_MSRS 256
33#define CR0_RESERVED_BITS \
34 (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
35 | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \
36 | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG))
37#define CR4_RESERVED_BITS \
38 (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\
39 | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \
40 | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR \
41 | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE))
42
43#define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
44#define EFER_RESERVED_BITS 0xfffffffffffff2fe
45
46#define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM
47#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
48
49struct kvm_x86_ops *kvm_x86_ops;
50
/*
 * Table of named statistics counters.  Each entry pairs a name with the
 * offset of a counter inside struct kvm_vcpu (VCPU_STAT) or struct kvm
 * (VM_STAT) plus the matching KVM_STAT_VCPU / KVM_STAT_VM kind.  The table
 * ends with an entry whose name is NULL.
 */
51struct kvm_stats_debugfs_item debugfs_entries[] = {
52 { "pf_fixed", VCPU_STAT(pf_fixed) },
53 { "pf_guest", VCPU_STAT(pf_guest) },
54 { "tlb_flush", VCPU_STAT(tlb_flush) },
55 { "invlpg", VCPU_STAT(invlpg) },
56 { "exits", VCPU_STAT(exits) },
57 { "io_exits", VCPU_STAT(io_exits) },
58 { "mmio_exits", VCPU_STAT(mmio_exits) },
59 { "signal_exits", VCPU_STAT(signal_exits) },
60 { "irq_window", VCPU_STAT(irq_window_exits) },
61 { "halt_exits", VCPU_STAT(halt_exits) },
62 { "halt_wakeup", VCPU_STAT(halt_wakeup) },
63 { "request_irq", VCPU_STAT(request_irq_exits) },
64 { "irq_exits", VCPU_STAT(irq_exits) },
65 { "host_state_reload", VCPU_STAT(host_state_reload) },
66 { "efer_reload", VCPU_STAT(efer_reload) },
67 { "fpu_reload", VCPU_STAT(fpu_reload) },
68 { "insn_emulation", VCPU_STAT(insn_emulation) },
69 { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) },
70 { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
71 { "mmu_pte_write", VM_STAT(mmu_pte_write) },
72 { "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
73 { "mmu_pde_zapped", VM_STAT(mmu_pde_zapped) },
74 { "mmu_flooded", VM_STAT(mmu_flooded) },
75 { "mmu_recycled", VM_STAT(mmu_recycled) },
76 { "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
77 { NULL }
78};
79
80
81unsigned long segment_base(u16 selector)
82{
83 struct descriptor_table gdt;
84 struct segment_descriptor *d;
85 unsigned long table_base;
86 unsigned long v;
87
88 if (selector == 0)
89 return 0;
90
91 asm("sgdt %0" : "=m"(gdt));
92 table_base = gdt.base;
93
94 if (selector & 4) { /* from ldt */
95 u16 ldt_selector;
96
97 asm("sldt %0" : "=g"(ldt_selector));
98 table_base = segment_base(ldt_selector);
99 }
100 d = (struct segment_descriptor *)(table_base + (selector & ~7));
101 v = d->base_low | ((unsigned long)d->base_mid << 16) |
102 ((unsigned long)d->base_high << 24);
103#ifdef CONFIG_X86_64
104 if (d->system == 0 && (d->type == 2 || d->type == 9 || d->type == 11))
105 v |= ((unsigned long) \
106 ((struct segment_descriptor_64 *)d)->base_higher) << 32;
107#endif
108 return v;
109}
110EXPORT_SYMBOL_GPL(segment_base);
111
112u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
113{
114 if (irqchip_in_kernel(vcpu->kvm))
115 return vcpu->arch.apic_base;
116 else
117 return vcpu->arch.apic_base;
118}
119EXPORT_SYMBOL_GPL(kvm_get_apic_base);
120
121void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data)
122{
123 /* TODO: reserve bits check */
124 if (irqchip_in_kernel(vcpu->kvm))
125 kvm_lapic_set_base(vcpu, data);
126 else
127 vcpu->arch.apic_base = data;
128}
129EXPORT_SYMBOL_GPL(kvm_set_apic_base);
130
131void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)
132{
133 WARN_ON(vcpu->arch.exception.pending);
134 vcpu->arch.exception.pending = true;
135 vcpu->arch.exception.has_error_code = false;
136 vcpu->arch.exception.nr = nr;
137}
138EXPORT_SYMBOL_GPL(kvm_queue_exception);
139
140void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr,
141 u32 error_code)
142{
143 ++vcpu->stat.pf_guest;
144 if (vcpu->arch.exception.pending && vcpu->arch.exception.nr == PF_VECTOR) {
145 printk(KERN_DEBUG "kvm: inject_page_fault:"
146 " double fault 0x%lx\n", addr);
147 vcpu->arch.exception.nr = DF_VECTOR;
148 vcpu->arch.exception.error_code = 0;
149 return;
150 }
151 vcpu->arch.cr2 = addr;
152 kvm_queue_exception_e(vcpu, PF_VECTOR, error_code);
153}
154
155void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
156{
157 WARN_ON(vcpu->arch.exception.pending);
158 vcpu->arch.exception.pending = true;
159 vcpu->arch.exception.has_error_code = true;
160 vcpu->arch.exception.nr = nr;
161 vcpu->arch.exception.error_code = error_code;
162}
163EXPORT_SYMBOL_GPL(kvm_queue_exception_e);
164
165static void __queue_exception(struct kvm_vcpu *vcpu)
166{
167 kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr,
168 vcpu->arch.exception.has_error_code,
169 vcpu->arch.exception.error_code);
170}
171
172/*
173 * Load the pae pdptrs. Return true is they are all valid.
174 */
175int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
176{
177 gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
178 unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2;
179 int i;
180 int ret;
181 u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)];
182
183 mutex_lock(&vcpu->kvm->lock);
184 ret = kvm_read_guest_page(vcpu->kvm, pdpt_gfn, pdpte,
185 offset * sizeof(u64), sizeof(pdpte));
186 if (ret < 0) {
187 ret = 0;
188 goto out;
189 }
190 for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
191 if ((pdpte[i] & 1) && (pdpte[i] & 0xfffffff0000001e6ull)) {
192 ret = 0;
193 goto out;
194 }
195 }
196 ret = 1;
197
198 memcpy(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs));
199out:
200 mutex_unlock(&vcpu->kvm->lock);
201
202 return ret;
203}
204
205static bool pdptrs_changed(struct kvm_vcpu *vcpu)
206{
207 u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)];
208 bool changed = true;
209 int r;
210
211 if (is_long_mode(vcpu) || !is_pae(vcpu))
212 return false;
213
214 mutex_lock(&vcpu->kvm->lock);
215 r = kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u, pdpte, sizeof(pdpte));
216 if (r < 0)
217 goto out;
218 changed = memcmp(pdpte, vcpu->arch.pdptrs, sizeof(pdpte)) != 0;
219out:
220 mutex_unlock(&vcpu->kvm->lock);
221
222 return changed;
223}
224
225void set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
226{
227 if (cr0 & CR0_RESERVED_BITS) {
228 printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n",
229 cr0, vcpu->arch.cr0);
230 kvm_inject_gp(vcpu, 0);
231 return;
232 }
233
234 if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) {
235 printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n");
236 kvm_inject_gp(vcpu, 0);
237 return;
238 }
239
240 if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) {
241 printk(KERN_DEBUG "set_cr0: #GP, set PG flag "
242 "and a clear PE flag\n");
243 kvm_inject_gp(vcpu, 0);
244 return;
245 }
246
247 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
248#ifdef CONFIG_X86_64
249 if ((vcpu->arch.shadow_efer & EFER_LME)) {
250 int cs_db, cs_l;
251
252 if (!is_pae(vcpu)) {
253 printk(KERN_DEBUG "set_cr0: #GP, start paging "
254 "in long mode while PAE is disabled\n");
255 kvm_inject_gp(vcpu, 0);
256 return;
257 }
258 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
259 if (cs_l) {
260 printk(KERN_DEBUG "set_cr0: #GP, start paging "
261 "in long mode while CS.L == 1\n");
262 kvm_inject_gp(vcpu, 0);
263 return;
264
265 }
266 } else
267#endif
268 if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3)) {
269 printk(KERN_DEBUG "set_cr0: #GP, pdptrs "
270 "reserved bits\n");
271 kvm_inject_gp(vcpu, 0);
272 return;
273 }
274
275 }
276
277 kvm_x86_ops->set_cr0(vcpu, cr0);
278 vcpu->arch.cr0 = cr0;
279
280 mutex_lock(&vcpu->kvm->lock);
281 kvm_mmu_reset_context(vcpu);
282 mutex_unlock(&vcpu->kvm->lock);
283 return;
284}
285EXPORT_SYMBOL_GPL(set_cr0);
286
287void lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
288{
289 set_cr0(vcpu, (vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f));
290}
291EXPORT_SYMBOL_GPL(lmsw);
292
293void set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
294{
295 if (cr4 & CR4_RESERVED_BITS) {
296 printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n");
297 kvm_inject_gp(vcpu, 0);
298 return;
299 }
300
301 if (is_long_mode(vcpu)) {
302 if (!(cr4 & X86_CR4_PAE)) {
303 printk(KERN_DEBUG "set_cr4: #GP, clearing PAE while "
304 "in long mode\n");
305 kvm_inject_gp(vcpu, 0);
306 return;
307 }
308 } else if (is_paging(vcpu) && !is_pae(vcpu) && (cr4 & X86_CR4_PAE)
309 && !load_pdptrs(vcpu, vcpu->arch.cr3)) {
310 printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n");
311 kvm_inject_gp(vcpu, 0);
312 return;
313 }
314
315 if (cr4 & X86_CR4_VMXE) {
316 printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n");
317 kvm_inject_gp(vcpu, 0);
318 return;
319 }
320 kvm_x86_ops->set_cr4(vcpu, cr4);
321 vcpu->arch.cr4 = cr4;
322 mutex_lock(&vcpu->kvm->lock);
323 kvm_mmu_reset_context(vcpu);
324 mutex_unlock(&vcpu->kvm->lock);
325}
326EXPORT_SYMBOL_GPL(set_cr4);
327
328void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
329{
330 if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) {
331 kvm_mmu_flush_tlb(vcpu);
332 return;
333 }
334
335 if (is_long_mode(vcpu)) {
336 if (cr3 & CR3_L_MODE_RESERVED_BITS) {
337 printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n");
338 kvm_inject_gp(vcpu, 0);
339 return;
340 }
341 } else {
342 if (is_pae(vcpu)) {
343 if (cr3 & CR3_PAE_RESERVED_BITS) {
344 printk(KERN_DEBUG
345 "set_cr3: #GP, reserved bits\n");
346 kvm_inject_gp(vcpu, 0);
347 return;
348 }
349 if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) {
350 printk(KERN_DEBUG "set_cr3: #GP, pdptrs "
351 "reserved bits\n");
352 kvm_inject_gp(vcpu, 0);
353 return;
354 }
355 }
356 /*
357 * We don't check reserved bits in nonpae mode, because
358 * this isn't enforced, and VMware depends on this.
359 */
360 }
361
362 mutex_lock(&vcpu->kvm->lock);
363 /*
364 * Does the new cr3 value map to physical memory? (Note, we
365 * catch an invalid cr3 even in real-mode, because it would
366 * cause trouble later on when we turn on paging anyway.)
367 *
368 * A real CPU would silently accept an invalid cr3 and would
369 * attempt to use it - with largely undefined (and often hard
370 * to debug) behavior on the guest side.
371 */
372 if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT)))
373 kvm_inject_gp(vcpu, 0);
374 else {
375 vcpu->arch.cr3 = cr3;
376 vcpu->arch.mmu.new_cr3(vcpu);
377 }
378 mutex_unlock(&vcpu->kvm->lock);
379}
380EXPORT_SYMBOL_GPL(set_cr3);
381
382void set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
383{
384 if (cr8 & CR8_RESERVED_BITS) {
385 printk(KERN_DEBUG "set_cr8: #GP, reserved bits 0x%lx\n", cr8);
386 kvm_inject_gp(vcpu, 0);
387 return;
388 }
389 if (irqchip_in_kernel(vcpu->kvm))
390 kvm_lapic_set_tpr(vcpu, cr8);
391 else
392 vcpu->arch.cr8 = cr8;
393}
394EXPORT_SYMBOL_GPL(set_cr8);
395
396unsigned long get_cr8(struct kvm_vcpu *vcpu)
397{
398 if (irqchip_in_kernel(vcpu->kvm))
399 return kvm_lapic_get_cr8(vcpu);
400 else
401 return vcpu->arch.cr8;
402}
403EXPORT_SYMBOL_GPL(get_cr8);
404
405/*
406 * List of msr numbers which we expose to userspace through KVM_GET_MSRS
407 * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
408 *
409 * This list is modified at module load time to reflect the
410 * capabilities of the host cpu.
411 */
412static u32 msrs_to_save[] = {
413 MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
414 MSR_K6_STAR,
415#ifdef CONFIG_X86_64
416 MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
417#endif
418 MSR_IA32_TIME_STAMP_COUNTER,
419};
420
421static unsigned num_msrs_to_save;
422
423static u32 emulated_msrs[] = {
424 MSR_IA32_MISC_ENABLE,
425};
426
427#ifdef CONFIG_X86_64
428
429static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
430{
431 if (efer & EFER_RESERVED_BITS) {
432 printk(KERN_DEBUG "set_efer: 0x%llx #GP, reserved bits\n",
433 efer);
434 kvm_inject_gp(vcpu, 0);
435 return;
436 }
437
438 if (is_paging(vcpu)
439 && (vcpu->arch.shadow_efer & EFER_LME) != (efer & EFER_LME)) {
440 printk(KERN_DEBUG "set_efer: #GP, change LME while paging\n");
441 kvm_inject_gp(vcpu, 0);
442 return;
443 }
444
445 kvm_x86_ops->set_efer(vcpu, efer);
446
447 efer &= ~EFER_LMA;
448 efer |= vcpu->arch.shadow_efer & EFER_LMA;
449
450 vcpu->arch.shadow_efer = efer;
451}
452
453#endif
454
455/*
456 * Writes msr value into into the appropriate "register".
457 * Returns 0 on success, non-0 otherwise.
458 * Assumes vcpu_load() was already called.
459 */
460int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
461{
462 return kvm_x86_ops->set_msr(vcpu, msr_index, data);
463}
464
465/*
466 * Adapt set_msr() to msr_io()'s calling convention
467 */
468static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
469{
470 return kvm_set_msr(vcpu, index, *data);
471}
472
473
474int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
475{
476 switch (msr) {
477#ifdef CONFIG_X86_64
478 case MSR_EFER:
479 set_efer(vcpu, data);
480 break;
481#endif
482 case MSR_IA32_MC0_STATUS:
483 pr_unimpl(vcpu, "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n",
484 __FUNCTION__, data);
485 break;
486 case MSR_IA32_MCG_STATUS:
487 pr_unimpl(vcpu, "%s: MSR_IA32_MCG_STATUS 0x%llx, nop\n",
488 __FUNCTION__, data);
489 break;
490 case MSR_IA32_UCODE_REV:
491 case MSR_IA32_UCODE_WRITE:
492 case 0x200 ... 0x2ff: /* MTRRs */
493 break;
494 case MSR_IA32_APICBASE:
495 kvm_set_apic_base(vcpu, data);
496 break;
497 case MSR_IA32_MISC_ENABLE:
498 vcpu->arch.ia32_misc_enable_msr = data;
499 break;
500 default:
501 pr_unimpl(vcpu, "unhandled wrmsr: 0x%x\n", msr);
502 return 1;
503 }
504 return 0;
505}
506EXPORT_SYMBOL_GPL(kvm_set_msr_common);
507
508
509/*
510 * Reads an msr value (of 'msr_index') into 'pdata'.
511 * Returns 0 on success, non-0 otherwise.
512 * Assumes vcpu_load() was already called.
513 */
514int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
515{
516 return kvm_x86_ops->get_msr(vcpu, msr_index, pdata);
517}
518
519int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
520{
521 u64 data;
522
523 switch (msr) {
524 case 0xc0010010: /* SYSCFG */
525 case 0xc0010015: /* HWCR */
526 case MSR_IA32_PLATFORM_ID:
527 case MSR_IA32_P5_MC_ADDR:
528 case MSR_IA32_P5_MC_TYPE:
529 case MSR_IA32_MC0_CTL:
530 case MSR_IA32_MCG_STATUS:
531 case MSR_IA32_MCG_CAP:
532 case MSR_IA32_MC0_MISC:
533 case MSR_IA32_MC0_MISC+4:
534 case MSR_IA32_MC0_MISC+8:
535 case MSR_IA32_MC0_MISC+12:
536 case MSR_IA32_MC0_MISC+16:
537 case MSR_IA32_UCODE_REV:
538 case MSR_IA32_PERF_STATUS:
539 case MSR_IA32_EBL_CR_POWERON:
540 /* MTRR registers */
541 case 0xfe:
542 case 0x200 ... 0x2ff:
543 data = 0;
544 break;
545 case 0xcd: /* fsb frequency */
546 data = 3;
547 break;
548 case MSR_IA32_APICBASE:
549 data = kvm_get_apic_base(vcpu);
550 break;
551 case MSR_IA32_MISC_ENABLE:
552 data = vcpu->arch.ia32_misc_enable_msr;
553 break;
554#ifdef CONFIG_X86_64
555 case MSR_EFER:
556 data = vcpu->arch.shadow_efer;
557 break;
558#endif
559 default:
560 pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
561 return 1;
562 }
563 *pdata = data;
564 return 0;
565}
566EXPORT_SYMBOL_GPL(kvm_get_msr_common);
567
568/*
569 * Read or write a bunch of msrs. All parameters are kernel addresses.
570 *
571 * @return number of msrs set successfully.
572 */
573static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,
574 struct kvm_msr_entry *entries,
575 int (*do_msr)(struct kvm_vcpu *vcpu,
576 unsigned index, u64 *data))
577{
578 int i;
579
580 vcpu_load(vcpu);
581
582 for (i = 0; i < msrs->nmsrs; ++i)
583 if (do_msr(vcpu, entries[i].index, &entries[i].data))
584 break;
585
586 vcpu_put(vcpu);
587
588 return i;
589}
590
591/*
592 * Read or write a bunch of msrs. Parameters are user addresses.
593 *
594 * @return number of msrs set successfully.
595 */
596static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs,
597 int (*do_msr)(struct kvm_vcpu *vcpu,
598 unsigned index, u64 *data),
599 int writeback)
600{
601 struct kvm_msrs msrs;
602 struct kvm_msr_entry *entries;
603 int r, n;
604 unsigned size;
605
606 r = -EFAULT;
607 if (copy_from_user(&msrs, user_msrs, sizeof msrs))
608 goto out;
609
610 r = -E2BIG;
611 if (msrs.nmsrs >= MAX_IO_MSRS)
612 goto out;
613
614 r = -ENOMEM;
615 size = sizeof(struct kvm_msr_entry) * msrs.nmsrs;
616 entries = vmalloc(size);
617 if (!entries)
618 goto out;
619
620 r = -EFAULT;
621 if (copy_from_user(entries, user_msrs->entries, size))
622 goto out_free;
623
624 r = n = __msr_io(vcpu, &msrs, entries, do_msr);
625 if (r < 0)
626 goto out_free;
627
628 r = -EFAULT;
629 if (writeback && copy_to_user(user_msrs->entries, entries, size))
630 goto out_free;
631
632 r = n;
633
634out_free:
635 vfree(entries);
636out:
637 return r;
638}
639
640/*
641 * Make sure that a cpu that is being hot-unplugged does not have any vcpus
642 * cached on it.
643 */
644void decache_vcpus_on_cpu(int cpu)
645{
646 struct kvm *vm;
647 struct kvm_vcpu *vcpu;
648 int i;
649
650 spin_lock(&kvm_lock);
651 list_for_each_entry(vm, &vm_list, vm_list)
652 for (i = 0; i < KVM_MAX_VCPUS; ++i) {
653 vcpu = vm->vcpus[i];
654 if (!vcpu)
655 continue;
656 /*
657 * If the vcpu is locked, then it is running on some
658 * other cpu and therefore it is not cached on the
659 * cpu in question.
660 *
661 * If it's not locked, check the last cpu it executed
662 * on.
663 */
664 if (mutex_trylock(&vcpu->mutex)) {
665 if (vcpu->cpu == cpu) {
666 kvm_x86_ops->vcpu_decache(vcpu);
667 vcpu->cpu = -1;
668 }
669 mutex_unlock(&vcpu->mutex);
670 }
671 }
672 spin_unlock(&kvm_lock);
673}
674
675int kvm_dev_ioctl_check_extension(long ext)
676{
677 int r;
678
679 switch (ext) {
680 case KVM_CAP_IRQCHIP:
681 case KVM_CAP_HLT:
682 case KVM_CAP_MMU_SHADOW_CACHE_CONTROL:
683 case KVM_CAP_USER_MEMORY:
684 case KVM_CAP_SET_TSS_ADDR:
685 case KVM_CAP_EXT_CPUID:
686 r = 1;
687 break;
688 default:
689 r = 0;
690 break;
691 }
692 return r;
693
694}
695
696long kvm_arch_dev_ioctl(struct file *filp,
697 unsigned int ioctl, unsigned long arg)
698{
699 void __user *argp = (void __user *)arg;
700 long r;
701
702 switch (ioctl) {
703 case KVM_GET_MSR_INDEX_LIST: {
704 struct kvm_msr_list __user *user_msr_list = argp;
705 struct kvm_msr_list msr_list;
706 unsigned n;
707
708 r = -EFAULT;
709 if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list))
710 goto out;
711 n = msr_list.nmsrs;
712 msr_list.nmsrs = num_msrs_to_save + ARRAY_SIZE(emulated_msrs);
713 if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list))
714 goto out;
715 r = -E2BIG;
716 if (n < num_msrs_to_save)
717 goto out;
718 r = -EFAULT;
719 if (copy_to_user(user_msr_list->indices, &msrs_to_save,
720 num_msrs_to_save * sizeof(u32)))
721 goto out;
722 if (copy_to_user(user_msr_list->indices
723 + num_msrs_to_save * sizeof(u32),
724 &emulated_msrs,
725 ARRAY_SIZE(emulated_msrs) * sizeof(u32)))
726 goto out;
727 r = 0;
728 break;
729 }
730 default:
731 r = -EINVAL;
732 }
733out:
734 return r;
735}
736
737void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
738{
739 kvm_x86_ops->vcpu_load(vcpu, cpu);
740}
741
742void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
743{
744 kvm_x86_ops->vcpu_put(vcpu);
745 kvm_put_guest_fpu(vcpu);
746}
747
748static int is_efer_nx(void)
749{
750 u64 efer;
751
752 rdmsrl(MSR_EFER, efer);
753 return efer & EFER_NX;
754}
755
756static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu)
757{
758 int i;
759 struct kvm_cpuid_entry2 *e, *entry;
760
761 entry = NULL;
762 for (i = 0; i < vcpu->arch.cpuid_nent; ++i) {
763 e = &vcpu->arch.cpuid_entries[i];
764 if (e->function == 0x80000001) {
765 entry = e;
766 break;
767 }
768 }
769 if (entry && (entry->edx & (1 << 20)) && !is_efer_nx()) {
770 entry->edx &= ~(1 << 20);
771 printk(KERN_INFO "kvm: guest NX capability removed\n");
772 }
773}
774
775/* when an old userspace process fills a new kernel module */
776static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
777 struct kvm_cpuid *cpuid,
778 struct kvm_cpuid_entry __user *entries)
779{
780 int r, i;
781 struct kvm_cpuid_entry *cpuid_entries;
782
783 r = -E2BIG;
784 if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
785 goto out;
786 r = -ENOMEM;
787 cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry) * cpuid->nent);
788 if (!cpuid_entries)
789 goto out;
790 r = -EFAULT;
791 if (copy_from_user(cpuid_entries, entries,
792 cpuid->nent * sizeof(struct kvm_cpuid_entry)))
793 goto out_free;
794 for (i = 0; i < cpuid->nent; i++) {
795 vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function;
796 vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax;
797 vcpu->arch.cpuid_entries[i].ebx = cpuid_entries[i].ebx;
798 vcpu->arch.cpuid_entries[i].ecx = cpuid_entries[i].ecx;
799 vcpu->arch.cpuid_entries[i].edx = cpuid_entries[i].edx;
800 vcpu->arch.cpuid_entries[i].index = 0;
801 vcpu->arch.cpuid_entries[i].flags = 0;
802 vcpu->arch.cpuid_entries[i].padding[0] = 0;
803 vcpu->arch.cpuid_entries[i].padding[1] = 0;
804 vcpu->arch.cpuid_entries[i].padding[2] = 0;
805 }
806 vcpu->arch.cpuid_nent = cpuid->nent;
807 cpuid_fix_nx_cap(vcpu);
808 r = 0;
809
810out_free:
811 vfree(cpuid_entries);
812out:
813 return r;
814}
815
816static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
817 struct kvm_cpuid2 *cpuid,
818 struct kvm_cpuid_entry2 __user *entries)
819{
820 int r;
821
822 r = -E2BIG;
823 if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
824 goto out;
825 r = -EFAULT;
826 if (copy_from_user(&vcpu->arch.cpuid_entries, entries,
827 cpuid->nent * sizeof(struct kvm_cpuid_entry2)))
828 goto out;
829 vcpu->arch.cpuid_nent = cpuid->nent;
830 return 0;
831
832out:
833 return r;
834}
835
836static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
837 struct kvm_cpuid2 *cpuid,
838 struct kvm_cpuid_entry2 __user *entries)
839{
840 int r;
841
842 r = -E2BIG;
843 if (cpuid->nent < vcpu->arch.cpuid_nent)
844 goto out;
845 r = -EFAULT;
846 if (copy_to_user(entries, &vcpu->arch.cpuid_entries,
847 vcpu->arch.cpuid_nent * sizeof(struct kvm_cpuid_entry2)))
848 goto out;
849 return 0;
850
851out:
852 cpuid->nent = vcpu->arch.cpuid_nent;
853 return r;
854}
855
856static inline u32 bit(int bitno)
857{
858 return 1 << (bitno & 31);
859}
860
861static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function,
862 u32 index)
863{
864 entry->function = function;
865 entry->index = index;
866 cpuid_count(entry->function, entry->index,
867 &entry->eax, &entry->ebx, &entry->ecx, &entry->edx);
868 entry->flags = 0;
869}
870
871static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
872 u32 index, int *nent, int maxnent)
873{
874 const u32 kvm_supported_word0_x86_features = bit(X86_FEATURE_FPU) |
875 bit(X86_FEATURE_VME) | bit(X86_FEATURE_DE) |
876 bit(X86_FEATURE_PSE) | bit(X86_FEATURE_TSC) |
877 bit(X86_FEATURE_MSR) | bit(X86_FEATURE_PAE) |
878 bit(X86_FEATURE_CX8) | bit(X86_FEATURE_APIC) |
879 bit(X86_FEATURE_SEP) | bit(X86_FEATURE_PGE) |
880 bit(X86_FEATURE_CMOV) | bit(X86_FEATURE_PSE36) |
881 bit(X86_FEATURE_CLFLSH) | bit(X86_FEATURE_MMX) |
882 bit(X86_FEATURE_FXSR) | bit(X86_FEATURE_XMM) |
883 bit(X86_FEATURE_XMM2) | bit(X86_FEATURE_SELFSNOOP);
884 const u32 kvm_supported_word1_x86_features = bit(X86_FEATURE_FPU) |
885 bit(X86_FEATURE_VME) | bit(X86_FEATURE_DE) |
886 bit(X86_FEATURE_PSE) | bit(X86_FEATURE_TSC) |
887 bit(X86_FEATURE_MSR) | bit(X86_FEATURE_PAE) |
888 bit(X86_FEATURE_CX8) | bit(X86_FEATURE_APIC) |
889 bit(X86_FEATURE_PGE) |
890 bit(X86_FEATURE_CMOV) | bit(X86_FEATURE_PSE36) |
891 bit(X86_FEATURE_MMX) | bit(X86_FEATURE_FXSR) |
892 bit(X86_FEATURE_SYSCALL) |
893 (bit(X86_FEATURE_NX) && is_efer_nx()) |
894#ifdef CONFIG_X86_64
895 bit(X86_FEATURE_LM) |
896#endif
897 bit(X86_FEATURE_MMXEXT) |
898 bit(X86_FEATURE_3DNOWEXT) |
899 bit(X86_FEATURE_3DNOW);
900 const u32 kvm_supported_word3_x86_features =
901 bit(X86_FEATURE_XMM3) | bit(X86_FEATURE_CX16);
902 const u32 kvm_supported_word6_x86_features =
903 bit(X86_FEATURE_LAHF_LM) | bit(X86_FEATURE_CMP_LEGACY);
904
905 /* all func 2 cpuid_count() should be called on the same cpu */
906 get_cpu();
907 do_cpuid_1_ent(entry, function, index);
908 ++*nent;
909
910 switch (function) {
911 case 0:
912 entry->eax = min(entry->eax, (u32)0xb);
913 break;
914 case 1:
915 entry->edx &= kvm_supported_word0_x86_features;
916 entry->ecx &= kvm_supported_word3_x86_features;
917 break;
918 /* function 2 entries are STATEFUL. That is, repeated cpuid commands
919 * may return different values. This forces us to get_cpu() before
920 * issuing the first command, and also to emulate this annoying behavior
921 * in kvm_emulate_cpuid() using KVM_CPUID_FLAG_STATE_READ_NEXT */
922 case 2: {
923 int t, times = entry->eax & 0xff;
924
925 entry->flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
926 for (t = 1; t < times && *nent < maxnent; ++t) {
927 do_cpuid_1_ent(&entry[t], function, 0);
928 entry[t].flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
929 ++*nent;
930 }
931 break;
932 }
933 /* function 4 and 0xb have additional index. */
934 case 4: {
935 int index, cache_type;
936
937 entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
938 /* read more entries until cache_type is zero */
939 for (index = 1; *nent < maxnent; ++index) {
940 cache_type = entry[index - 1].eax & 0x1f;
941 if (!cache_type)
942 break;
943 do_cpuid_1_ent(&entry[index], function, index);
944 entry[index].flags |=
945 KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
946 ++*nent;
947 }
948 break;
949 }
950 case 0xb: {
951 int index, level_type;
952
953 entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
954 /* read more entries until level_type is zero */
955 for (index = 1; *nent < maxnent; ++index) {
956 level_type = entry[index - 1].ecx & 0xff;
957 if (!level_type)
958 break;
959 do_cpuid_1_ent(&entry[index], function, index);
960 entry[index].flags |=
961 KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
962 ++*nent;
963 }
964 break;
965 }
966 case 0x80000000:
967 entry->eax = min(entry->eax, 0x8000001a);
968 break;
969 case 0x80000001:
970 entry->edx &= kvm_supported_word1_x86_features;
971 entry->ecx &= kvm_supported_word6_x86_features;
972 break;
973 }
974 put_cpu();
975}
976
977static int kvm_vm_ioctl_get_supported_cpuid(struct kvm *kvm,
978 struct kvm_cpuid2 *cpuid,
979 struct kvm_cpuid_entry2 __user *entries)
980{
981 struct kvm_cpuid_entry2 *cpuid_entries;
982 int limit, nent = 0, r = -E2BIG;
983 u32 func;
984
985 if (cpuid->nent < 1)
986 goto out;
987 r = -ENOMEM;
988 cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry2) * cpuid->nent);
989 if (!cpuid_entries)
990 goto out;
991
992 do_cpuid_ent(&cpuid_entries[0], 0, 0, &nent, cpuid->nent);
993 limit = cpuid_entries[0].eax;
994 for (func = 1; func <= limit && nent < cpuid->nent; ++func)
995 do_cpuid_ent(&cpuid_entries[nent], func, 0,
996 &nent, cpuid->nent);
997 r = -E2BIG;
998 if (nent >= cpuid->nent)
999 goto out_free;
1000
1001 do_cpuid_ent(&cpuid_entries[nent], 0x80000000, 0, &nent, cpuid->nent);
1002 limit = cpuid_entries[nent - 1].eax;
1003 for (func = 0x80000001; func <= limit && nent < cpuid->nent; ++func)
1004 do_cpuid_ent(&cpuid_entries[nent], func, 0,
1005 &nent, cpuid->nent);
1006 r = -EFAULT;
1007 if (copy_to_user(entries, cpuid_entries,
1008 nent * sizeof(struct kvm_cpuid_entry2)))
1009 goto out_free;
1010 cpuid->nent = nent;
1011 r = 0;
1012
1013out_free:
1014 vfree(cpuid_entries);
1015out:
1016 return r;
1017}
1018
1019static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
1020 struct kvm_lapic_state *s)
1021{
1022 vcpu_load(vcpu);
1023 memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s);
1024 vcpu_put(vcpu);
1025
1026 return 0;
1027}
1028
1029static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
1030 struct kvm_lapic_state *s)
1031{
1032 vcpu_load(vcpu);
1033 memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s);
1034 kvm_apic_post_state_restore(vcpu);
1035 vcpu_put(vcpu);
1036
1037 return 0;
1038}
1039
1040static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
1041 struct kvm_interrupt *irq)
1042{
1043 if (irq->irq < 0 || irq->irq >= 256)
1044 return -EINVAL;
1045 if (irqchip_in_kernel(vcpu->kvm))
1046 return -ENXIO;
1047 vcpu_load(vcpu);
1048
1049 set_bit(irq->irq, vcpu->arch.irq_pending);
1050 set_bit(irq->irq / BITS_PER_LONG, &vcpu->arch.irq_summary);
1051
1052 vcpu_put(vcpu);
1053
1054 return 0;
1055}
1056
1057long kvm_arch_vcpu_ioctl(struct file *filp,
1058 unsigned int ioctl, unsigned long arg)
1059{
1060 struct kvm_vcpu *vcpu = filp->private_data;
1061 void __user *argp = (void __user *)arg;
1062 int r;
1063
1064 switch (ioctl) {
1065 case KVM_GET_LAPIC: {
1066 struct kvm_lapic_state lapic;
1067
1068 memset(&lapic, 0, sizeof lapic);
1069 r = kvm_vcpu_ioctl_get_lapic(vcpu, &lapic);
1070 if (r)
1071 goto out;
1072 r = -EFAULT;
1073 if (copy_to_user(argp, &lapic, sizeof lapic))
1074 goto out;
1075 r = 0;
1076 break;
1077 }
1078 case KVM_SET_LAPIC: {
1079 struct kvm_lapic_state lapic;
1080
1081 r = -EFAULT;
1082 if (copy_from_user(&lapic, argp, sizeof lapic))
1083 goto out;
1084 r = kvm_vcpu_ioctl_set_lapic(vcpu, &lapic);;
1085 if (r)
1086 goto out;
1087 r = 0;
1088 break;
1089 }
1090 case KVM_INTERRUPT: {
1091 struct kvm_interrupt irq;
1092
1093 r = -EFAULT;
1094 if (copy_from_user(&irq, argp, sizeof irq))
1095 goto out;
1096 r = kvm_vcpu_ioctl_interrupt(vcpu, &irq);
1097 if (r)
1098 goto out;
1099 r = 0;
1100 break;
1101 }
1102 case KVM_SET_CPUID: {
1103 struct kvm_cpuid __user *cpuid_arg = argp;
1104 struct kvm_cpuid cpuid;
1105
1106 r = -EFAULT;
1107 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
1108 goto out;
1109 r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries);
1110 if (r)
1111 goto out;
1112 break;
1113 }
1114 case KVM_SET_CPUID2: {
1115 struct kvm_cpuid2 __user *cpuid_arg = argp;
1116 struct kvm_cpuid2 cpuid;
1117
1118 r = -EFAULT;
1119 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
1120 goto out;
1121 r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid,
1122 cpuid_arg->entries);
1123 if (r)
1124 goto out;
1125 break;
1126 }
1127 case KVM_GET_CPUID2: {
1128 struct kvm_cpuid2 __user *cpuid_arg = argp;
1129 struct kvm_cpuid2 cpuid;
1130
1131 r = -EFAULT;
1132 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
1133 goto out;
1134 r = kvm_vcpu_ioctl_get_cpuid2(vcpu, &cpuid,
1135 cpuid_arg->entries);
1136 if (r)
1137 goto out;
1138 r = -EFAULT;
1139 if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid))
1140 goto out;
1141 r = 0;
1142 break;
1143 }
1144 case KVM_GET_MSRS:
1145 r = msr_io(vcpu, argp, kvm_get_msr, 1);
1146 break;
1147 case KVM_SET_MSRS:
1148 r = msr_io(vcpu, argp, do_set_msr, 0);
1149 break;
1150 default:
1151 r = -EINVAL;
1152 }
1153out:
1154 return r;
1155}
1156
1157static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr)
1158{
1159 int ret;
1160
1161 if (addr > (unsigned int)(-3 * PAGE_SIZE))
1162 return -1;
1163 ret = kvm_x86_ops->set_tss_addr(kvm, addr);
1164 return ret;
1165}
1166
1167static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
1168 u32 kvm_nr_mmu_pages)
1169{
1170 if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES)
1171 return -EINVAL;
1172
1173 mutex_lock(&kvm->lock);
1174
1175 kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages);
1176 kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages;
1177
1178 mutex_unlock(&kvm->lock);
1179 return 0;
1180}
1181
1182static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm)
1183{
1184 return kvm->arch.n_alloc_mmu_pages;
1185}
1186
1187gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
1188{
1189 int i;
1190 struct kvm_mem_alias *alias;
1191
1192 for (i = 0; i < kvm->arch.naliases; ++i) {
1193 alias = &kvm->arch.aliases[i];
1194 if (gfn >= alias->base_gfn
1195 && gfn < alias->base_gfn + alias->npages)
1196 return alias->target_gfn + gfn - alias->base_gfn;
1197 }
1198 return gfn;
1199}
1200
1201/*
1202 * Set a new alias region. Aliases map a portion of physical memory into
1203 * another portion. This is useful for memory windows, for example the PC
1204 * VGA region.
1205 */
1206static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm,
1207 struct kvm_memory_alias *alias)
1208{
1209 int r, n;
1210 struct kvm_mem_alias *p;
1211
1212 r = -EINVAL;
1213 /* General sanity checks */
1214 if (alias->memory_size & (PAGE_SIZE - 1))
1215 goto out;
1216 if (alias->guest_phys_addr & (PAGE_SIZE - 1))
1217 goto out;
1218 if (alias->slot >= KVM_ALIAS_SLOTS)
1219 goto out;
1220 if (alias->guest_phys_addr + alias->memory_size
1221 < alias->guest_phys_addr)
1222 goto out;
1223 if (alias->target_phys_addr + alias->memory_size
1224 < alias->target_phys_addr)
1225 goto out;
1226
1227 mutex_lock(&kvm->lock);
1228
1229 p = &kvm->arch.aliases[alias->slot];
1230 p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT;
1231 p->npages = alias->memory_size >> PAGE_SHIFT;
1232 p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT;
1233
1234 for (n = KVM_ALIAS_SLOTS; n > 0; --n)
1235 if (kvm->arch.aliases[n - 1].npages)
1236 break;
1237 kvm->arch.naliases = n;
1238
1239 kvm_mmu_zap_all(kvm);
1240
1241 mutex_unlock(&kvm->lock);
1242
1243 return 0;
1244
1245out:
1246 return r;
1247}
1248
1249static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
1250{
1251 int r;
1252
1253 r = 0;
1254 switch (chip->chip_id) {
1255 case KVM_IRQCHIP_PIC_MASTER:
1256 memcpy(&chip->chip.pic,
1257 &pic_irqchip(kvm)->pics[0],
1258 sizeof(struct kvm_pic_state));
1259 break;
1260 case KVM_IRQCHIP_PIC_SLAVE:
1261 memcpy(&chip->chip.pic,
1262 &pic_irqchip(kvm)->pics[1],
1263 sizeof(struct kvm_pic_state));
1264 break;
1265 case KVM_IRQCHIP_IOAPIC:
1266 memcpy(&chip->chip.ioapic,
1267 ioapic_irqchip(kvm),
1268 sizeof(struct kvm_ioapic_state));
1269 break;
1270 default:
1271 r = -EINVAL;
1272 break;
1273 }
1274 return r;
1275}
1276
1277static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
1278{
1279 int r;
1280
1281 r = 0;
1282 switch (chip->chip_id) {
1283 case KVM_IRQCHIP_PIC_MASTER:
1284 memcpy(&pic_irqchip(kvm)->pics[0],
1285 &chip->chip.pic,
1286 sizeof(struct kvm_pic_state));
1287 break;
1288 case KVM_IRQCHIP_PIC_SLAVE:
1289 memcpy(&pic_irqchip(kvm)->pics[1],
1290 &chip->chip.pic,
1291 sizeof(struct kvm_pic_state));
1292 break;
1293 case KVM_IRQCHIP_IOAPIC:
1294 memcpy(ioapic_irqchip(kvm),
1295 &chip->chip.ioapic,
1296 sizeof(struct kvm_ioapic_state));
1297 break;
1298 default:
1299 r = -EINVAL;
1300 break;
1301 }
1302 kvm_pic_update_irq(pic_irqchip(kvm));
1303 return r;
1304}
1305
1306/*
1307 * Get (and clear) the dirty memory log for a memory slot.
1308 */
1309int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
1310 struct kvm_dirty_log *log)
1311{
1312 int r;
1313 int n;
1314 struct kvm_memory_slot *memslot;
1315 int is_dirty = 0;
1316
1317 mutex_lock(&kvm->lock);
1318
1319 r = kvm_get_dirty_log(kvm, log, &is_dirty);
1320 if (r)
1321 goto out;
1322
1323 /* If nothing is dirty, don't bother messing with page tables. */
1324 if (is_dirty) {
1325 kvm_mmu_slot_remove_write_access(kvm, log->slot);
1326 kvm_flush_remote_tlbs(kvm);
1327 memslot = &kvm->memslots[log->slot];
1328 n = ALIGN(memslot->npages, BITS_PER_LONG) / 8;
1329 memset(memslot->dirty_bitmap, 0, n);
1330 }
1331 r = 0;
1332out:
1333 mutex_unlock(&kvm->lock);
1334 return r;
1335}
1336
1337long kvm_arch_vm_ioctl(struct file *filp,
1338 unsigned int ioctl, unsigned long arg)
1339{
1340 struct kvm *kvm = filp->private_data;
1341 void __user *argp = (void __user *)arg;
1342 int r = -EINVAL;
1343
1344 switch (ioctl) {
1345 case KVM_SET_TSS_ADDR:
1346 r = kvm_vm_ioctl_set_tss_addr(kvm, arg);
1347 if (r < 0)
1348 goto out;
1349 break;
1350 case KVM_SET_MEMORY_REGION: {
1351 struct kvm_memory_region kvm_mem;
1352 struct kvm_userspace_memory_region kvm_userspace_mem;
1353
1354 r = -EFAULT;
1355 if (copy_from_user(&kvm_mem, argp, sizeof kvm_mem))
1356 goto out;
1357 kvm_userspace_mem.slot = kvm_mem.slot;
1358 kvm_userspace_mem.flags = kvm_mem.flags;
1359 kvm_userspace_mem.guest_phys_addr = kvm_mem.guest_phys_addr;
1360 kvm_userspace_mem.memory_size = kvm_mem.memory_size;
1361 r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 0);
1362 if (r)
1363 goto out;
1364 break;
1365 }
1366 case KVM_SET_NR_MMU_PAGES:
1367 r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg);
1368 if (r)
1369 goto out;
1370 break;
1371 case KVM_GET_NR_MMU_PAGES:
1372 r = kvm_vm_ioctl_get_nr_mmu_pages(kvm);
1373 break;
1374 case KVM_SET_MEMORY_ALIAS: {
1375 struct kvm_memory_alias alias;
1376
1377 r = -EFAULT;
1378 if (copy_from_user(&alias, argp, sizeof alias))
1379 goto out;
1380 r = kvm_vm_ioctl_set_memory_alias(kvm, &alias);
1381 if (r)
1382 goto out;
1383 break;
1384 }
1385 case KVM_CREATE_IRQCHIP:
1386 r = -ENOMEM;
1387 kvm->arch.vpic = kvm_create_pic(kvm);
1388 if (kvm->arch.vpic) {
1389 r = kvm_ioapic_init(kvm);
1390 if (r) {
1391 kfree(kvm->arch.vpic);
1392 kvm->arch.vpic = NULL;
1393 goto out;
1394 }
1395 } else
1396 goto out;
1397 break;
1398 case KVM_IRQ_LINE: {
1399 struct kvm_irq_level irq_event;
1400
1401 r = -EFAULT;
1402 if (copy_from_user(&irq_event, argp, sizeof irq_event))
1403 goto out;
1404 if (irqchip_in_kernel(kvm)) {
1405 mutex_lock(&kvm->lock);
1406 if (irq_event.irq < 16)
1407 kvm_pic_set_irq(pic_irqchip(kvm),
1408 irq_event.irq,
1409 irq_event.level);
1410 kvm_ioapic_set_irq(kvm->arch.vioapic,
1411 irq_event.irq,
1412 irq_event.level);
1413 mutex_unlock(&kvm->lock);
1414 r = 0;
1415 }
1416 break;
1417 }
1418 case KVM_GET_IRQCHIP: {
1419 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
1420 struct kvm_irqchip chip;
1421
1422 r = -EFAULT;
1423 if (copy_from_user(&chip, argp, sizeof chip))
1424 goto out;
1425 r = -ENXIO;
1426 if (!irqchip_in_kernel(kvm))
1427 goto out;
1428 r = kvm_vm_ioctl_get_irqchip(kvm, &chip);
1429 if (r)
1430 goto out;
1431 r = -EFAULT;
1432 if (copy_to_user(argp, &chip, sizeof chip))
1433 goto out;
1434 r = 0;
1435 break;
1436 }
1437 case KVM_SET_IRQCHIP: {
1438 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
1439 struct kvm_irqchip chip;
1440
1441 r = -EFAULT;
1442 if (copy_from_user(&chip, argp, sizeof chip))
1443 goto out;
1444 r = -ENXIO;
1445 if (!irqchip_in_kernel(kvm))
1446 goto out;
1447 r = kvm_vm_ioctl_set_irqchip(kvm, &chip);
1448 if (r)
1449 goto out;
1450 r = 0;
1451 break;
1452 }
1453 case KVM_GET_SUPPORTED_CPUID: {
1454 struct kvm_cpuid2 __user *cpuid_arg = argp;
1455 struct kvm_cpuid2 cpuid;
1456
1457 r = -EFAULT;
1458 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
1459 goto out;
1460 r = kvm_vm_ioctl_get_supported_cpuid(kvm, &cpuid,
1461 cpuid_arg->entries);
1462 if (r)
1463 goto out;
1464
1465 r = -EFAULT;
1466 if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid))
1467 goto out;
1468 r = 0;
1469 break;
1470 }
1471 default:
1472 ;
1473 }
1474out:
1475 return r;
1476}
1477
1478static void kvm_init_msr_list(void)
1479{
1480 u32 dummy[2];
1481 unsigned i, j;
1482
1483 for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) {
1484 if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0)
1485 continue;
1486 if (j < i)
1487 msrs_to_save[j] = msrs_to_save[i];
1488 j++;
1489 }
1490 num_msrs_to_save = j;
1491}
1492
1493/*
1494 * Only apic need an MMIO device hook, so shortcut now..
1495 */
1496static struct kvm_io_device *vcpu_find_pervcpu_dev(struct kvm_vcpu *vcpu,
1497 gpa_t addr)
1498{
1499 struct kvm_io_device *dev;
1500
1501 if (vcpu->arch.apic) {
1502 dev = &vcpu->arch.apic->dev;
1503 if (dev->in_range(dev, addr))
1504 return dev;
1505 }
1506 return NULL;
1507}
1508
1509
1510static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu,
1511 gpa_t addr)
1512{
1513 struct kvm_io_device *dev;
1514
1515 dev = vcpu_find_pervcpu_dev(vcpu, addr);
1516 if (dev == NULL)
1517 dev = kvm_io_bus_find_dev(&vcpu->kvm->mmio_bus, addr);
1518 return dev;
1519}
1520
1521int emulator_read_std(unsigned long addr,
1522 void *val,
1523 unsigned int bytes,
1524 struct kvm_vcpu *vcpu)
1525{
1526 void *data = val;
1527
1528 while (bytes) {
1529 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
1530 unsigned offset = addr & (PAGE_SIZE-1);
1531 unsigned tocopy = min(bytes, (unsigned)PAGE_SIZE - offset);
1532 int ret;
1533
1534 if (gpa == UNMAPPED_GVA)
1535 return X86EMUL_PROPAGATE_FAULT;
1536 ret = kvm_read_guest(vcpu->kvm, gpa, data, tocopy);
1537 if (ret < 0)
1538 return X86EMUL_UNHANDLEABLE;
1539
1540 bytes -= tocopy;
1541 data += tocopy;
1542 addr += tocopy;
1543 }
1544
1545 return X86EMUL_CONTINUE;
1546}
1547EXPORT_SYMBOL_GPL(emulator_read_std);
1548
1549static int emulator_read_emulated(unsigned long addr,
1550 void *val,
1551 unsigned int bytes,
1552 struct kvm_vcpu *vcpu)
1553{
1554 struct kvm_io_device *mmio_dev;
1555 gpa_t gpa;
1556
1557 if (vcpu->mmio_read_completed) {
1558 memcpy(val, vcpu->mmio_data, bytes);
1559 vcpu->mmio_read_completed = 0;
1560 return X86EMUL_CONTINUE;
1561 }
1562
1563 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
1564
1565 /* For APIC access vmexit */
1566 if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
1567 goto mmio;
1568
1569 if (emulator_read_std(addr, val, bytes, vcpu)
1570 == X86EMUL_CONTINUE)
1571 return X86EMUL_CONTINUE;
1572 if (gpa == UNMAPPED_GVA)
1573 return X86EMUL_PROPAGATE_FAULT;
1574
1575mmio:
1576 /*
1577 * Is this MMIO handled locally?
1578 */
1579 mmio_dev = vcpu_find_mmio_dev(vcpu, gpa);
1580 if (mmio_dev) {
1581 kvm_iodevice_read(mmio_dev, gpa, bytes, val);
1582 return X86EMUL_CONTINUE;
1583 }
1584
1585 vcpu->mmio_needed = 1;
1586 vcpu->mmio_phys_addr = gpa;
1587 vcpu->mmio_size = bytes;
1588 vcpu->mmio_is_write = 0;
1589
1590 return X86EMUL_UNHANDLEABLE;
1591}
1592
1593static int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
1594 const void *val, int bytes)
1595{
1596 int ret;
1597
1598 ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes);
1599 if (ret < 0)
1600 return 0;
1601 kvm_mmu_pte_write(vcpu, gpa, val, bytes);
1602 return 1;
1603}
1604
1605static int emulator_write_emulated_onepage(unsigned long addr,
1606 const void *val,
1607 unsigned int bytes,
1608 struct kvm_vcpu *vcpu)
1609{
1610 struct kvm_io_device *mmio_dev;
1611 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
1612
1613 if (gpa == UNMAPPED_GVA) {
1614 kvm_inject_page_fault(vcpu, addr, 2);
1615 return X86EMUL_PROPAGATE_FAULT;
1616 }
1617
1618 /* For APIC access vmexit */
1619 if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
1620 goto mmio;
1621
1622 if (emulator_write_phys(vcpu, gpa, val, bytes))
1623 return X86EMUL_CONTINUE;
1624
1625mmio:
1626 /*
1627 * Is this MMIO handled locally?
1628 */
1629 mmio_dev = vcpu_find_mmio_dev(vcpu, gpa);
1630 if (mmio_dev) {
1631 kvm_iodevice_write(mmio_dev, gpa, bytes, val);
1632 return X86EMUL_CONTINUE;
1633 }
1634
1635 vcpu->mmio_needed = 1;
1636 vcpu->mmio_phys_addr = gpa;
1637 vcpu->mmio_size = bytes;
1638 vcpu->mmio_is_write = 1;
1639 memcpy(vcpu->mmio_data, val, bytes);
1640
1641 return X86EMUL_CONTINUE;
1642}
1643
1644int emulator_write_emulated(unsigned long addr,
1645 const void *val,
1646 unsigned int bytes,
1647 struct kvm_vcpu *vcpu)
1648{
1649 /* Crossing a page boundary? */
1650 if (((addr + bytes - 1) ^ addr) & PAGE_MASK) {
1651 int rc, now;
1652
1653 now = -addr & ~PAGE_MASK;
1654 rc = emulator_write_emulated_onepage(addr, val, now, vcpu);
1655 if (rc != X86EMUL_CONTINUE)
1656 return rc;
1657 addr += now;
1658 val += now;
1659 bytes -= now;
1660 }
1661 return emulator_write_emulated_onepage(addr, val, bytes, vcpu);
1662}
1663EXPORT_SYMBOL_GPL(emulator_write_emulated);
1664
1665static int emulator_cmpxchg_emulated(unsigned long addr,
1666 const void *old,
1667 const void *new,
1668 unsigned int bytes,
1669 struct kvm_vcpu *vcpu)
1670{
1671 static int reported;
1672
1673 if (!reported) {
1674 reported = 1;
1675 printk(KERN_WARNING "kvm: emulating exchange as write\n");
1676 }
1677#ifndef CONFIG_X86_64
1678 /* guests cmpxchg8b have to be emulated atomically */
1679 if (bytes == 8) {
1680 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
1681 struct page *page;
1682 char *addr;
1683 u64 val;
1684
1685 if (gpa == UNMAPPED_GVA ||
1686 (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
1687 goto emul_write;
1688
1689 if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK))
1690 goto emul_write;
1691
1692 val = *(u64 *)new;
1693 page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
1694 addr = kmap_atomic(page, KM_USER0);
1695 set_64bit((u64 *)(addr + offset_in_page(gpa)), val);
1696 kunmap_atomic(addr, KM_USER0);
1697 kvm_release_page_dirty(page);
1698 }
1699emul_write:
1700#endif
1701
1702 return emulator_write_emulated(addr, new, bytes, vcpu);
1703}
1704
1705static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
1706{
1707 return kvm_x86_ops->get_segment_base(vcpu, seg);
1708}
1709
1710int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)
1711{
1712 return X86EMUL_CONTINUE;
1713}
1714
1715int emulate_clts(struct kvm_vcpu *vcpu)
1716{
1717 kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 & ~X86_CR0_TS);
1718 return X86EMUL_CONTINUE;
1719}
1720
1721int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest)
1722{
1723 struct kvm_vcpu *vcpu = ctxt->vcpu;
1724
1725 switch (dr) {
1726 case 0 ... 3:
1727 *dest = kvm_x86_ops->get_dr(vcpu, dr);
1728 return X86EMUL_CONTINUE;
1729 default:
1730 pr_unimpl(vcpu, "%s: unexpected dr %u\n", __FUNCTION__, dr);
1731 return X86EMUL_UNHANDLEABLE;
1732 }
1733}
1734
1735int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value)
1736{
1737 unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U;
1738 int exception;
1739
1740 kvm_x86_ops->set_dr(ctxt->vcpu, dr, value & mask, &exception);
1741 if (exception) {
1742 /* FIXME: better handling */
1743 return X86EMUL_UNHANDLEABLE;
1744 }
1745 return X86EMUL_CONTINUE;
1746}
1747
1748void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
1749{
1750 static int reported;
1751 u8 opcodes[4];
1752 unsigned long rip = vcpu->arch.rip;
1753 unsigned long rip_linear;
1754
1755 rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS);
1756
1757 if (reported)
1758 return;
1759
1760 emulator_read_std(rip_linear, (void *)opcodes, 4, vcpu);
1761
1762 printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n",
1763 context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]);
1764 reported = 1;
1765}
1766EXPORT_SYMBOL_GPL(kvm_report_emulation_failure);
1767
1768struct x86_emulate_ops emulate_ops = {
1769 .read_std = emulator_read_std,
1770 .read_emulated = emulator_read_emulated,
1771 .write_emulated = emulator_write_emulated,
1772 .cmpxchg_emulated = emulator_cmpxchg_emulated,
1773};
1774
1775int emulate_instruction(struct kvm_vcpu *vcpu,
1776 struct kvm_run *run,
1777 unsigned long cr2,
1778 u16 error_code,
1779 int no_decode)
1780{
1781 int r;
1782
1783 vcpu->arch.mmio_fault_cr2 = cr2;
1784 kvm_x86_ops->cache_regs(vcpu);
1785
1786 vcpu->mmio_is_write = 0;
1787 vcpu->arch.pio.string = 0;
1788
1789 if (!no_decode) {
1790 int cs_db, cs_l;
1791 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
1792
1793 vcpu->arch.emulate_ctxt.vcpu = vcpu;
1794 vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu);
1795 vcpu->arch.emulate_ctxt.mode =
1796 (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM)
1797 ? X86EMUL_MODE_REAL : cs_l
1798 ? X86EMUL_MODE_PROT64 : cs_db
1799 ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
1800
1801 if (vcpu->arch.emulate_ctxt.mode == X86EMUL_MODE_PROT64) {
1802 vcpu->arch.emulate_ctxt.cs_base = 0;
1803 vcpu->arch.emulate_ctxt.ds_base = 0;
1804 vcpu->arch.emulate_ctxt.es_base = 0;
1805 vcpu->arch.emulate_ctxt.ss_base = 0;
1806 } else {
1807 vcpu->arch.emulate_ctxt.cs_base =
1808 get_segment_base(vcpu, VCPU_SREG_CS);
1809 vcpu->arch.emulate_ctxt.ds_base =
1810 get_segment_base(vcpu, VCPU_SREG_DS);
1811 vcpu->arch.emulate_ctxt.es_base =
1812 get_segment_base(vcpu, VCPU_SREG_ES);
1813 vcpu->arch.emulate_ctxt.ss_base =
1814 get_segment_base(vcpu, VCPU_SREG_SS);
1815 }
1816
1817 vcpu->arch.emulate_ctxt.gs_base =
1818 get_segment_base(vcpu, VCPU_SREG_GS);
1819 vcpu->arch.emulate_ctxt.fs_base =
1820 get_segment_base(vcpu, VCPU_SREG_FS);
1821
1822 r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
1823 ++vcpu->stat.insn_emulation;
1824 if (r) {
1825 ++vcpu->stat.insn_emulation_fail;
1826 if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
1827 return EMULATE_DONE;
1828 return EMULATE_FAIL;
1829 }
1830 }
1831
1832 r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
1833
1834 if (vcpu->arch.pio.string)
1835 return EMULATE_DO_MMIO;
1836
1837 if ((r || vcpu->mmio_is_write) && run) {
1838 run->exit_reason = KVM_EXIT_MMIO;
1839 run->mmio.phys_addr = vcpu->mmio_phys_addr;
1840 memcpy(run->mmio.data, vcpu->mmio_data, 8);
1841 run->mmio.len = vcpu->mmio_size;
1842 run->mmio.is_write = vcpu->mmio_is_write;
1843 }
1844
1845 if (r) {
1846 if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
1847 return EMULATE_DONE;
1848 if (!vcpu->mmio_needed) {
1849 kvm_report_emulation_failure(vcpu, "mmio");
1850 return EMULATE_FAIL;
1851 }
1852 return EMULATE_DO_MMIO;
1853 }
1854
1855 kvm_x86_ops->decache_regs(vcpu);
1856 kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
1857
1858 if (vcpu->mmio_is_write) {
1859 vcpu->mmio_needed = 0;
1860 return EMULATE_DO_MMIO;
1861 }
1862
1863 return EMULATE_DONE;
1864}
1865EXPORT_SYMBOL_GPL(emulate_instruction);
1866
1867static void free_pio_guest_pages(struct kvm_vcpu *vcpu)
1868{
1869 int i;
1870
1871 for (i = 0; i < ARRAY_SIZE(vcpu->arch.pio.guest_pages); ++i)
1872 if (vcpu->arch.pio.guest_pages[i]) {
1873 kvm_release_page_dirty(vcpu->arch.pio.guest_pages[i]);
1874 vcpu->arch.pio.guest_pages[i] = NULL;
1875 }
1876}
1877
1878static int pio_copy_data(struct kvm_vcpu *vcpu)
1879{
1880 void *p = vcpu->arch.pio_data;
1881 void *q;
1882 unsigned bytes;
1883 int nr_pages = vcpu->arch.pio.guest_pages[1] ? 2 : 1;
1884
1885 q = vmap(vcpu->arch.pio.guest_pages, nr_pages, VM_READ|VM_WRITE,
1886 PAGE_KERNEL);
1887 if (!q) {
1888 free_pio_guest_pages(vcpu);
1889 return -ENOMEM;
1890 }
1891 q += vcpu->arch.pio.guest_page_offset;
1892 bytes = vcpu->arch.pio.size * vcpu->arch.pio.cur_count;
1893 if (vcpu->arch.pio.in)
1894 memcpy(q, p, bytes);
1895 else
1896 memcpy(p, q, bytes);
1897 q -= vcpu->arch.pio.guest_page_offset;
1898 vunmap(q);
1899 free_pio_guest_pages(vcpu);
1900 return 0;
1901}
1902
1903int complete_pio(struct kvm_vcpu *vcpu)
1904{
1905 struct kvm_pio_request *io = &vcpu->arch.pio;
1906 long delta;
1907 int r;
1908
1909 kvm_x86_ops->cache_regs(vcpu);
1910
1911 if (!io->string) {
1912 if (io->in)
1913 memcpy(&vcpu->arch.regs[VCPU_REGS_RAX], vcpu->arch.pio_data,
1914 io->size);
1915 } else {
1916 if (io->in) {
1917 r = pio_copy_data(vcpu);
1918 if (r) {
1919 kvm_x86_ops->cache_regs(vcpu);
1920 return r;
1921 }
1922 }
1923
1924 delta = 1;
1925 if (io->rep) {
1926 delta *= io->cur_count;
1927 /*
1928 * The size of the register should really depend on
1929 * current address size.
1930 */
1931 vcpu->arch.regs[VCPU_REGS_RCX] -= delta;
1932 }
1933 if (io->down)
1934 delta = -delta;
1935 delta *= io->size;
1936 if (io->in)
1937 vcpu->arch.regs[VCPU_REGS_RDI] += delta;
1938 else
1939 vcpu->arch.regs[VCPU_REGS_RSI] += delta;
1940 }
1941
1942 kvm_x86_ops->decache_regs(vcpu);
1943
1944 io->count -= io->cur_count;
1945 io->cur_count = 0;
1946
1947 return 0;
1948}
1949
1950static void kernel_pio(struct kvm_io_device *pio_dev,
1951 struct kvm_vcpu *vcpu,
1952 void *pd)
1953{
1954 /* TODO: String I/O for in kernel device */
1955
1956 mutex_lock(&vcpu->kvm->lock);
1957 if (vcpu->arch.pio.in)
1958 kvm_iodevice_read(pio_dev, vcpu->arch.pio.port,
1959 vcpu->arch.pio.size,
1960 pd);
1961 else
1962 kvm_iodevice_write(pio_dev, vcpu->arch.pio.port,
1963 vcpu->arch.pio.size,
1964 pd);
1965 mutex_unlock(&vcpu->kvm->lock);
1966}
1967
1968static void pio_string_write(struct kvm_io_device *pio_dev,
1969 struct kvm_vcpu *vcpu)
1970{
1971 struct kvm_pio_request *io = &vcpu->arch.pio;
1972 void *pd = vcpu->arch.pio_data;
1973 int i;
1974
1975 mutex_lock(&vcpu->kvm->lock);
1976 for (i = 0; i < io->cur_count; i++) {
1977 kvm_iodevice_write(pio_dev, io->port,
1978 io->size,
1979 pd);
1980 pd += io->size;
1981 }
1982 mutex_unlock(&vcpu->kvm->lock);
1983}
1984
1985static struct kvm_io_device *vcpu_find_pio_dev(struct kvm_vcpu *vcpu,
1986 gpa_t addr)
1987{
1988 return kvm_io_bus_find_dev(&vcpu->kvm->pio_bus, addr);
1989}
1990
1991int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
1992 int size, unsigned port)
1993{
1994 struct kvm_io_device *pio_dev;
1995
1996 vcpu->run->exit_reason = KVM_EXIT_IO;
1997 vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
1998 vcpu->run->io.size = vcpu->arch.pio.size = size;
1999 vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
2000 vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = 1;
2001 vcpu->run->io.port = vcpu->arch.pio.port = port;
2002 vcpu->arch.pio.in = in;
2003 vcpu->arch.pio.string = 0;
2004 vcpu->arch.pio.down = 0;
2005 vcpu->arch.pio.guest_page_offset = 0;
2006 vcpu->arch.pio.rep = 0;
2007
2008 kvm_x86_ops->cache_regs(vcpu);
2009 memcpy(vcpu->arch.pio_data, &vcpu->arch.regs[VCPU_REGS_RAX], 4);
2010 kvm_x86_ops->decache_regs(vcpu);
2011
2012 kvm_x86_ops->skip_emulated_instruction(vcpu);
2013
2014 pio_dev = vcpu_find_pio_dev(vcpu, port);
2015 if (pio_dev) {
2016 kernel_pio(pio_dev, vcpu, vcpu->arch.pio_data);
2017 complete_pio(vcpu);
2018 return 1;
2019 }
2020 return 0;
2021}
2022EXPORT_SYMBOL_GPL(kvm_emulate_pio);
2023
2024int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
2025 int size, unsigned long count, int down,
2026 gva_t address, int rep, unsigned port)
2027{
2028 unsigned now, in_page;
2029 int i, ret = 0;
2030 int nr_pages = 1;
2031 struct page *page;
2032 struct kvm_io_device *pio_dev;
2033
2034 vcpu->run->exit_reason = KVM_EXIT_IO;
2035 vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
2036 vcpu->run->io.size = vcpu->arch.pio.size = size;
2037 vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
2038 vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = count;
2039 vcpu->run->io.port = vcpu->arch.pio.port = port;
2040 vcpu->arch.pio.in = in;
2041 vcpu->arch.pio.string = 1;
2042 vcpu->arch.pio.down = down;
2043 vcpu->arch.pio.guest_page_offset = offset_in_page(address);
2044 vcpu->arch.pio.rep = rep;
2045
2046 if (!count) {
2047 kvm_x86_ops->skip_emulated_instruction(vcpu);
2048 return 1;
2049 }
2050
2051 if (!down)
2052 in_page = PAGE_SIZE - offset_in_page(address);
2053 else
2054 in_page = offset_in_page(address) + size;
2055 now = min(count, (unsigned long)in_page / size);
2056 if (!now) {
2057 /*
2058 * String I/O straddles page boundary. Pin two guest pages
2059 * so that we satisfy atomicity constraints. Do just one
2060 * transaction to avoid complexity.
2061 */
2062 nr_pages = 2;
2063 now = 1;
2064 }
2065 if (down) {
2066 /*
2067 * String I/O in reverse. Yuck. Kill the guest, fix later.
2068 */
2069 pr_unimpl(vcpu, "guest string pio down\n");
2070 kvm_inject_gp(vcpu, 0);
2071 return 1;
2072 }
2073 vcpu->run->io.count = now;
2074 vcpu->arch.pio.cur_count = now;
2075
2076 if (vcpu->arch.pio.cur_count == vcpu->arch.pio.count)
2077 kvm_x86_ops->skip_emulated_instruction(vcpu);
2078
2079 for (i = 0; i < nr_pages; ++i) {
2080 mutex_lock(&vcpu->kvm->lock);
2081 page = gva_to_page(vcpu, address + i * PAGE_SIZE);
2082 vcpu->arch.pio.guest_pages[i] = page;
2083 mutex_unlock(&vcpu->kvm->lock);
2084 if (!page) {
2085 kvm_inject_gp(vcpu, 0);
2086 free_pio_guest_pages(vcpu);
2087 return 1;
2088 }
2089 }
2090
2091 pio_dev = vcpu_find_pio_dev(vcpu, port);
2092 if (!vcpu->arch.pio.in) {
2093 /* string PIO write */
2094 ret = pio_copy_data(vcpu);
2095 if (ret >= 0 && pio_dev) {
2096 pio_string_write(pio_dev, vcpu);
2097 complete_pio(vcpu);
2098 if (vcpu->arch.pio.count == 0)
2099 ret = 1;
2100 }
2101 } else if (pio_dev)
2102 pr_unimpl(vcpu, "no string pio read support yet, "
2103 "port %x size %d count %ld\n",
2104 port, size, count);
2105
2106 return ret;
2107}
2108EXPORT_SYMBOL_GPL(kvm_emulate_pio_string);
2109
2110int kvm_arch_init(void *opaque)
2111{
2112 int r;
2113 struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque;
2114
2115 r = kvm_mmu_module_init();
2116 if (r)
2117 goto out_fail;
2118
2119 kvm_init_msr_list();
2120
2121 if (kvm_x86_ops) {
2122 printk(KERN_ERR "kvm: already loaded the other module\n");
2123 r = -EEXIST;
2124 goto out;
2125 }
2126
2127 if (!ops->cpu_has_kvm_support()) {
2128 printk(KERN_ERR "kvm: no hardware support\n");
2129 r = -EOPNOTSUPP;
2130 goto out;
2131 }
2132 if (ops->disabled_by_bios()) {
2133 printk(KERN_ERR "kvm: disabled by bios\n");
2134 r = -EOPNOTSUPP;
2135 goto out;
2136 }
2137
2138 kvm_x86_ops = ops;
2139 kvm_mmu_set_nonpresent_ptes(0ull, 0ull);
2140 return 0;
2141
2142out:
2143 kvm_mmu_module_exit();
2144out_fail:
2145 return r;
2146}
2147
2148void kvm_arch_exit(void)
2149{
2150 kvm_x86_ops = NULL;
2151 kvm_mmu_module_exit();
2152}
2153
2154int kvm_emulate_halt(struct kvm_vcpu *vcpu)
2155{
2156 ++vcpu->stat.halt_exits;
2157 if (irqchip_in_kernel(vcpu->kvm)) {
2158 vcpu->arch.mp_state = VCPU_MP_STATE_HALTED;
2159 kvm_vcpu_block(vcpu);
2160 if (vcpu->arch.mp_state != VCPU_MP_STATE_RUNNABLE)
2161 return -EINTR;
2162 return 1;
2163 } else {
2164 vcpu->run->exit_reason = KVM_EXIT_HLT;
2165 return 0;
2166 }
2167}
2168EXPORT_SYMBOL_GPL(kvm_emulate_halt);
2169
2170int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
2171{
2172 unsigned long nr, a0, a1, a2, a3, ret;
2173
2174 kvm_x86_ops->cache_regs(vcpu);
2175
2176 nr = vcpu->arch.regs[VCPU_REGS_RAX];
2177 a0 = vcpu->arch.regs[VCPU_REGS_RBX];
2178 a1 = vcpu->arch.regs[VCPU_REGS_RCX];
2179 a2 = vcpu->arch.regs[VCPU_REGS_RDX];
2180 a3 = vcpu->arch.regs[VCPU_REGS_RSI];
2181
2182 if (!is_long_mode(vcpu)) {
2183 nr &= 0xFFFFFFFF;
2184 a0 &= 0xFFFFFFFF;
2185 a1 &= 0xFFFFFFFF;
2186 a2 &= 0xFFFFFFFF;
2187 a3 &= 0xFFFFFFFF;
2188 }
2189
2190 switch (nr) {
2191 default:
2192 ret = -KVM_ENOSYS;
2193 break;
2194 }
2195 vcpu->arch.regs[VCPU_REGS_RAX] = ret;
2196 kvm_x86_ops->decache_regs(vcpu);
2197 return 0;
2198}
2199EXPORT_SYMBOL_GPL(kvm_emulate_hypercall);
2200
2201int kvm_fix_hypercall(struct kvm_vcpu *vcpu)
2202{
2203 char instruction[3];
2204 int ret = 0;
2205
2206 mutex_lock(&vcpu->kvm->lock);
2207
2208 /*
2209 * Blow out the MMU to ensure that no other VCPU has an active mapping
2210 * to ensure that the updated hypercall appears atomically across all
2211 * VCPUs.
2212 */
2213 kvm_mmu_zap_all(vcpu->kvm);
2214
2215 kvm_x86_ops->cache_regs(vcpu);
2216 kvm_x86_ops->patch_hypercall(vcpu, instruction);
2217 if (emulator_write_emulated(vcpu->arch.rip, instruction, 3, vcpu)
2218 != X86EMUL_CONTINUE)
2219 ret = -EFAULT;
2220
2221 mutex_unlock(&vcpu->kvm->lock);
2222
2223 return ret;
2224}
2225
2226static u64 mk_cr_64(u64 curr_cr, u32 new_val)
2227{
2228 return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
2229}
2230
2231void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
2232{
2233 struct descriptor_table dt = { limit, base };
2234
2235 kvm_x86_ops->set_gdt(vcpu, &dt);
2236}
2237
2238void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
2239{
2240 struct descriptor_table dt = { limit, base };
2241
2242 kvm_x86_ops->set_idt(vcpu, &dt);
2243}
2244
2245void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
2246 unsigned long *rflags)
2247{
2248 lmsw(vcpu, msw);
2249 *rflags = kvm_x86_ops->get_rflags(vcpu);
2250}
2251
2252unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
2253{
2254 kvm_x86_ops->decache_cr4_guest_bits(vcpu);
2255 switch (cr) {
2256 case 0:
2257 return vcpu->arch.cr0;
2258 case 2:
2259 return vcpu->arch.cr2;
2260 case 3:
2261 return vcpu->arch.cr3;
2262 case 4:
2263 return vcpu->arch.cr4;
2264 case 8:
2265 return get_cr8(vcpu);
2266 default:
2267 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);
2268 return 0;
2269 }
2270}
2271
2272void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val,
2273 unsigned long *rflags)
2274{
2275 switch (cr) {
2276 case 0:
2277 set_cr0(vcpu, mk_cr_64(vcpu->arch.cr0, val));
2278 *rflags = kvm_x86_ops->get_rflags(vcpu);
2279 break;
2280 case 2:
2281 vcpu->arch.cr2 = val;
2282 break;
2283 case 3:
2284 set_cr3(vcpu, val);
2285 break;
2286 case 4:
2287 set_cr4(vcpu, mk_cr_64(vcpu->arch.cr4, val));
2288 break;
2289 case 8:
2290 set_cr8(vcpu, val & 0xfUL);
2291 break;
2292 default:
2293 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);
2294 }
2295}
2296
2297static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i)
2298{
2299 struct kvm_cpuid_entry2 *e = &vcpu->arch.cpuid_entries[i];
2300 int j, nent = vcpu->arch.cpuid_nent;
2301
2302 e->flags &= ~KVM_CPUID_FLAG_STATE_READ_NEXT;
2303 /* when no next entry is found, the current entry[i] is reselected */
2304 for (j = i + 1; j == i; j = (j + 1) % nent) {
2305 struct kvm_cpuid_entry2 *ej = &vcpu->arch.cpuid_entries[j];
2306 if (ej->function == e->function) {
2307 ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT;
2308 return j;
2309 }
2310 }
2311 return 0; /* silence gcc, even though control never reaches here */
2312}
2313
2314/* find an entry with matching function, matching index (if needed), and that
2315 * should be read next (if it's stateful) */
2316static int is_matching_cpuid_entry(struct kvm_cpuid_entry2 *e,
2317 u32 function, u32 index)
2318{
2319 if (e->function != function)
2320 return 0;
2321 if ((e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) && e->index != index)
2322 return 0;
2323 if ((e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) &&
2324 !(e->flags & KVM_CPUID_FLAG_STATE_READ_NEXT))
2325 return 0;
2326 return 1;
2327}
2328
2329void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
2330{
2331 int i;
2332 u32 function, index;
2333 struct kvm_cpuid_entry2 *e, *best;
2334
2335 kvm_x86_ops->cache_regs(vcpu);
2336 function = vcpu->arch.regs[VCPU_REGS_RAX];
2337 index = vcpu->arch.regs[VCPU_REGS_RCX];
2338 vcpu->arch.regs[VCPU_REGS_RAX] = 0;
2339 vcpu->arch.regs[VCPU_REGS_RBX] = 0;
2340 vcpu->arch.regs[VCPU_REGS_RCX] = 0;
2341 vcpu->arch.regs[VCPU_REGS_RDX] = 0;
2342 best = NULL;
2343 for (i = 0; i < vcpu->arch.cpuid_nent; ++i) {
2344 e = &vcpu->arch.cpuid_entries[i];
2345 if (is_matching_cpuid_entry(e, function, index)) {
2346 if (e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC)
2347 move_to_next_stateful_cpuid_entry(vcpu, i);
2348 best = e;
2349 break;
2350 }
2351 /*
2352 * Both basic or both extended?
2353 */
2354 if (((e->function ^ function) & 0x80000000) == 0)
2355 if (!best || e->function > best->function)
2356 best = e;
2357 }
2358 if (best) {
2359 vcpu->arch.regs[VCPU_REGS_RAX] = best->eax;
2360 vcpu->arch.regs[VCPU_REGS_RBX] = best->ebx;
2361 vcpu->arch.regs[VCPU_REGS_RCX] = best->ecx;
2362 vcpu->arch.regs[VCPU_REGS_RDX] = best->edx;
2363 }
2364 kvm_x86_ops->decache_regs(vcpu);
2365 kvm_x86_ops->skip_emulated_instruction(vcpu);
2366}
2367EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
2368
2369/*
2370 * Check if userspace requested an interrupt window, and that the
2371 * interrupt window is open.
2372 *
2373 * No need to exit to userspace if we already have an interrupt queued.
2374 */
2375static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu,
2376 struct kvm_run *kvm_run)
2377{
2378 return (!vcpu->arch.irq_summary &&
2379 kvm_run->request_interrupt_window &&
2380 vcpu->arch.interrupt_window_open &&
2381 (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF));
2382}
2383
2384static void post_kvm_run_save(struct kvm_vcpu *vcpu,
2385 struct kvm_run *kvm_run)
2386{
2387 kvm_run->if_flag = (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF) != 0;
2388 kvm_run->cr8 = get_cr8(vcpu);
2389 kvm_run->apic_base = kvm_get_apic_base(vcpu);
2390 if (irqchip_in_kernel(vcpu->kvm))
2391 kvm_run->ready_for_interrupt_injection = 1;
2392 else
2393 kvm_run->ready_for_interrupt_injection =
2394 (vcpu->arch.interrupt_window_open &&
2395 vcpu->arch.irq_summary == 0);
2396}
2397
2398static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2399{
2400 int r;
2401
2402 if (unlikely(vcpu->arch.mp_state == VCPU_MP_STATE_SIPI_RECEIVED)) {
2403 pr_debug("vcpu %d received sipi with vector # %x\n",
2404 vcpu->vcpu_id, vcpu->arch.sipi_vector);
2405 kvm_lapic_reset(vcpu);
2406 r = kvm_x86_ops->vcpu_reset(vcpu);
2407 if (r)
2408 return r;
2409 vcpu->arch.mp_state = VCPU_MP_STATE_RUNNABLE;
2410 }
2411
2412preempted:
2413 if (vcpu->guest_debug.enabled)
2414 kvm_x86_ops->guest_debug_pre(vcpu);
2415
2416again:
2417 r = kvm_mmu_reload(vcpu);
2418 if (unlikely(r))
2419 goto out;
2420
2421 kvm_inject_pending_timer_irqs(vcpu);
2422
2423 preempt_disable();
2424
2425 kvm_x86_ops->prepare_guest_switch(vcpu);
2426 kvm_load_guest_fpu(vcpu);
2427
2428 local_irq_disable();
2429
2430 if (signal_pending(current)) {
2431 local_irq_enable();
2432 preempt_enable();
2433 r = -EINTR;
2434 kvm_run->exit_reason = KVM_EXIT_INTR;
2435 ++vcpu->stat.signal_exits;
2436 goto out;
2437 }
2438
2439 if (vcpu->arch.exception.pending)
2440 __queue_exception(vcpu);
2441 else if (irqchip_in_kernel(vcpu->kvm))
2442 kvm_x86_ops->inject_pending_irq(vcpu);
2443 else
2444 kvm_x86_ops->inject_pending_vectors(vcpu, kvm_run);
2445
2446 vcpu->guest_mode = 1;
2447 kvm_guest_enter();
2448
2449 if (vcpu->requests)
2450 if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests))
2451 kvm_x86_ops->tlb_flush(vcpu);
2452
2453 kvm_x86_ops->run(vcpu, kvm_run);
2454
2455 vcpu->guest_mode = 0;
2456 local_irq_enable();
2457
2458 ++vcpu->stat.exits;
2459
2460 /*
2461 * We must have an instruction between local_irq_enable() and
2462 * kvm_guest_exit(), so the timer interrupt isn't delayed by
2463 * the interrupt shadow. The stat.exits increment will do nicely.
2464 * But we need to prevent reordering, hence this barrier():
2465 */
2466 barrier();
2467
2468 kvm_guest_exit();
2469
2470 preempt_enable();
2471
2472 /*
2473 * Profile KVM exit RIPs:
2474 */
2475 if (unlikely(prof_on == KVM_PROFILING)) {
2476 kvm_x86_ops->cache_regs(vcpu);
2477 profile_hit(KVM_PROFILING, (void *)vcpu->arch.rip);
2478 }
2479
2480 if (vcpu->arch.exception.pending && kvm_x86_ops->exception_injected(vcpu))
2481 vcpu->arch.exception.pending = false;
2482
2483 r = kvm_x86_ops->handle_exit(kvm_run, vcpu);
2484
2485 if (r > 0) {
2486 if (dm_request_for_irq_injection(vcpu, kvm_run)) {
2487 r = -EINTR;
2488 kvm_run->exit_reason = KVM_EXIT_INTR;
2489 ++vcpu->stat.request_irq_exits;
2490 goto out;
2491 }
2492 if (!need_resched())
2493 goto again;
2494 }
2495
2496out:
2497 if (r > 0) {
2498 kvm_resched(vcpu);
2499 goto preempted;
2500 }
2501
2502 post_kvm_run_save(vcpu, kvm_run);
2503
2504 return r;
2505}
2506
2507int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2508{
2509 int r;
2510 sigset_t sigsaved;
2511
2512 vcpu_load(vcpu);
2513
2514 if (unlikely(vcpu->arch.mp_state == VCPU_MP_STATE_UNINITIALIZED)) {
2515 kvm_vcpu_block(vcpu);
2516 vcpu_put(vcpu);
2517 return -EAGAIN;
2518 }
2519
2520 if (vcpu->sigset_active)
2521 sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
2522
2523 /* re-sync apic's tpr */
2524 if (!irqchip_in_kernel(vcpu->kvm))
2525 set_cr8(vcpu, kvm_run->cr8);
2526
2527 if (vcpu->arch.pio.cur_count) {
2528 r = complete_pio(vcpu);
2529 if (r)
2530 goto out;
2531 }
2532#if CONFIG_HAS_IOMEM
2533 if (vcpu->mmio_needed) {
2534 memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
2535 vcpu->mmio_read_completed = 1;
2536 vcpu->mmio_needed = 0;
2537 r = emulate_instruction(vcpu, kvm_run,
2538 vcpu->arch.mmio_fault_cr2, 0, 1);
2539 if (r == EMULATE_DO_MMIO) {
2540 /*
2541 * Read-modify-write. Back to userspace.
2542 */
2543 r = 0;
2544 goto out;
2545 }
2546 }
2547#endif
2548 if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL) {
2549 kvm_x86_ops->cache_regs(vcpu);
2550 vcpu->arch.regs[VCPU_REGS_RAX] = kvm_run->hypercall.ret;
2551 kvm_x86_ops->decache_regs(vcpu);
2552 }
2553
2554 r = __vcpu_run(vcpu, kvm_run);
2555
2556out:
2557 if (vcpu->sigset_active)
2558 sigprocmask(SIG_SETMASK, &sigsaved, NULL);
2559
2560 vcpu_put(vcpu);
2561 return r;
2562}
2563
2564int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
2565{
2566 vcpu_load(vcpu);
2567
2568 kvm_x86_ops->cache_regs(vcpu);
2569
2570 regs->rax = vcpu->arch.regs[VCPU_REGS_RAX];
2571 regs->rbx = vcpu->arch.regs[VCPU_REGS_RBX];
2572 regs->rcx = vcpu->arch.regs[VCPU_REGS_RCX];
2573 regs->rdx = vcpu->arch.regs[VCPU_REGS_RDX];
2574 regs->rsi = vcpu->arch.regs[VCPU_REGS_RSI];
2575 regs->rdi = vcpu->arch.regs[VCPU_REGS_RDI];
2576 regs->rsp = vcpu->arch.regs[VCPU_REGS_RSP];
2577 regs->rbp = vcpu->arch.regs[VCPU_REGS_RBP];
2578#ifdef CONFIG_X86_64
2579 regs->r8 = vcpu->arch.regs[VCPU_REGS_R8];
2580 regs->r9 = vcpu->arch.regs[VCPU_REGS_R9];
2581 regs->r10 = vcpu->arch.regs[VCPU_REGS_R10];
2582 regs->r11 = vcpu->arch.regs[VCPU_REGS_R11];
2583 regs->r12 = vcpu->arch.regs[VCPU_REGS_R12];
2584 regs->r13 = vcpu->arch.regs[VCPU_REGS_R13];
2585 regs->r14 = vcpu->arch.regs[VCPU_REGS_R14];
2586 regs->r15 = vcpu->arch.regs[VCPU_REGS_R15];
2587#endif
2588
2589 regs->rip = vcpu->arch.rip;
2590 regs->rflags = kvm_x86_ops->get_rflags(vcpu);
2591
2592 /*
2593 * Don't leak debug flags in case they were set for guest debugging
2594 */
2595 if (vcpu->guest_debug.enabled && vcpu->guest_debug.singlestep)
2596 regs->rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
2597
2598 vcpu_put(vcpu);
2599
2600 return 0;
2601}
2602
2603int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
2604{
2605 vcpu_load(vcpu);
2606
2607 vcpu->arch.regs[VCPU_REGS_RAX] = regs->rax;
2608 vcpu->arch.regs[VCPU_REGS_RBX] = regs->rbx;
2609 vcpu->arch.regs[VCPU_REGS_RCX] = regs->rcx;
2610 vcpu->arch.regs[VCPU_REGS_RDX] = regs->rdx;
2611 vcpu->arch.regs[VCPU_REGS_RSI] = regs->rsi;
2612 vcpu->arch.regs[VCPU_REGS_RDI] = regs->rdi;
2613 vcpu->arch.regs[VCPU_REGS_RSP] = regs->rsp;
2614 vcpu->arch.regs[VCPU_REGS_RBP] = regs->rbp;
2615#ifdef CONFIG_X86_64
2616 vcpu->arch.regs[VCPU_REGS_R8] = regs->r8;
2617 vcpu->arch.regs[VCPU_REGS_R9] = regs->r9;
2618 vcpu->arch.regs[VCPU_REGS_R10] = regs->r10;
2619 vcpu->arch.regs[VCPU_REGS_R11] = regs->r11;
2620 vcpu->arch.regs[VCPU_REGS_R12] = regs->r12;
2621 vcpu->arch.regs[VCPU_REGS_R13] = regs->r13;
2622 vcpu->arch.regs[VCPU_REGS_R14] = regs->r14;
2623 vcpu->arch.regs[VCPU_REGS_R15] = regs->r15;
2624#endif
2625
2626 vcpu->arch.rip = regs->rip;
2627 kvm_x86_ops->set_rflags(vcpu, regs->rflags);
2628
2629 kvm_x86_ops->decache_regs(vcpu);
2630
2631 vcpu_put(vcpu);
2632
2633 return 0;
2634}
2635
2636static void get_segment(struct kvm_vcpu *vcpu,
2637 struct kvm_segment *var, int seg)
2638{
2639 return kvm_x86_ops->get_segment(vcpu, var, seg);
2640}
2641
2642void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
2643{
2644 struct kvm_segment cs;
2645
2646 get_segment(vcpu, &cs, VCPU_SREG_CS);
2647 *db = cs.db;
2648 *l = cs.l;
2649}
2650EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits);
2651
2652int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
2653 struct kvm_sregs *sregs)
2654{
2655 struct descriptor_table dt;
2656 int pending_vec;
2657
2658 vcpu_load(vcpu);
2659
2660 get_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
2661 get_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
2662 get_segment(vcpu, &sregs->es, VCPU_SREG_ES);
2663 get_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
2664 get_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
2665 get_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
2666
2667 get_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
2668 get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
2669
2670 kvm_x86_ops->get_idt(vcpu, &dt);
2671 sregs->idt.limit = dt.limit;
2672 sregs->idt.base = dt.base;
2673 kvm_x86_ops->get_gdt(vcpu, &dt);
2674 sregs->gdt.limit = dt.limit;
2675 sregs->gdt.base = dt.base;
2676
2677 kvm_x86_ops->decache_cr4_guest_bits(vcpu);
2678 sregs->cr0 = vcpu->arch.cr0;
2679 sregs->cr2 = vcpu->arch.cr2;
2680 sregs->cr3 = vcpu->arch.cr3;
2681 sregs->cr4 = vcpu->arch.cr4;
2682 sregs->cr8 = get_cr8(vcpu);
2683 sregs->efer = vcpu->arch.shadow_efer;
2684 sregs->apic_base = kvm_get_apic_base(vcpu);
2685
2686 if (irqchip_in_kernel(vcpu->kvm)) {
2687 memset(sregs->interrupt_bitmap, 0,
2688 sizeof sregs->interrupt_bitmap);
2689 pending_vec = kvm_x86_ops->get_irq(vcpu);
2690 if (pending_vec >= 0)
2691 set_bit(pending_vec,
2692 (unsigned long *)sregs->interrupt_bitmap);
2693 } else
2694 memcpy(sregs->interrupt_bitmap, vcpu->arch.irq_pending,
2695 sizeof sregs->interrupt_bitmap);
2696
2697 vcpu_put(vcpu);
2698
2699 return 0;
2700}
2701
2702static void set_segment(struct kvm_vcpu *vcpu,
2703 struct kvm_segment *var, int seg)
2704{
2705 return kvm_x86_ops->set_segment(vcpu, var, seg);
2706}
2707
2708int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
2709 struct kvm_sregs *sregs)
2710{
2711 int mmu_reset_needed = 0;
2712 int i, pending_vec, max_bits;
2713 struct descriptor_table dt;
2714
2715 vcpu_load(vcpu);
2716
2717 dt.limit = sregs->idt.limit;
2718 dt.base = sregs->idt.base;
2719 kvm_x86_ops->set_idt(vcpu, &dt);
2720 dt.limit = sregs->gdt.limit;
2721 dt.base = sregs->gdt.base;
2722 kvm_x86_ops->set_gdt(vcpu, &dt);
2723
2724 vcpu->arch.cr2 = sregs->cr2;
2725 mmu_reset_needed |= vcpu->arch.cr3 != sregs->cr3;
2726 vcpu->arch.cr3 = sregs->cr3;
2727
2728 set_cr8(vcpu, sregs->cr8);
2729
2730 mmu_reset_needed |= vcpu->arch.shadow_efer != sregs->efer;
2731#ifdef CONFIG_X86_64
2732 kvm_x86_ops->set_efer(vcpu, sregs->efer);
2733#endif
2734 kvm_set_apic_base(vcpu, sregs->apic_base);
2735
2736 kvm_x86_ops->decache_cr4_guest_bits(vcpu);
2737
2738 mmu_reset_needed |= vcpu->arch.cr0 != sregs->cr0;
2739 vcpu->arch.cr0 = sregs->cr0;
2740 kvm_x86_ops->set_cr0(vcpu, sregs->cr0);
2741
2742 mmu_reset_needed |= vcpu->arch.cr4 != sregs->cr4;
2743 kvm_x86_ops->set_cr4(vcpu, sregs->cr4);
2744 if (!is_long_mode(vcpu) && is_pae(vcpu))
2745 load_pdptrs(vcpu, vcpu->arch.cr3);
2746
2747 if (mmu_reset_needed)
2748 kvm_mmu_reset_context(vcpu);
2749
2750 if (!irqchip_in_kernel(vcpu->kvm)) {
2751 memcpy(vcpu->arch.irq_pending, sregs->interrupt_bitmap,
2752 sizeof vcpu->arch.irq_pending);
2753 vcpu->arch.irq_summary = 0;
2754 for (i = 0; i < ARRAY_SIZE(vcpu->arch.irq_pending); ++i)
2755 if (vcpu->arch.irq_pending[i])
2756 __set_bit(i, &vcpu->arch.irq_summary);
2757 } else {
2758 max_bits = (sizeof sregs->interrupt_bitmap) << 3;
2759 pending_vec = find_first_bit(
2760 (const unsigned long *)sregs->interrupt_bitmap,
2761 max_bits);
2762 /* Only pending external irq is handled here */
2763 if (pending_vec < max_bits) {
2764 kvm_x86_ops->set_irq(vcpu, pending_vec);
2765 pr_debug("Set back pending irq %d\n",
2766 pending_vec);
2767 }
2768 }
2769
2770 set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
2771 set_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
2772 set_segment(vcpu, &sregs->es, VCPU_SREG_ES);
2773 set_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
2774 set_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
2775 set_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
2776
2777 set_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
2778 set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
2779
2780 vcpu_put(vcpu);
2781
2782 return 0;
2783}
2784
2785int kvm_arch_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu,
2786 struct kvm_debug_guest *dbg)
2787{
2788 int r;
2789
2790 vcpu_load(vcpu);
2791
2792 r = kvm_x86_ops->set_guest_debug(vcpu, dbg);
2793
2794 vcpu_put(vcpu);
2795
2796 return r;
2797}
2798
2799/*
2800 * fxsave fpu state. Taken from x86_64/processor.h. To be killed when
2801 * we have asm/x86/processor.h
2802 */
2803struct fxsave {
2804 u16 cwd;
2805 u16 swd;
2806 u16 twd;
2807 u16 fop;
2808 u64 rip;
2809 u64 rdp;
2810 u32 mxcsr;
2811 u32 mxcsr_mask;
2812 u32 st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */
2813#ifdef CONFIG_X86_64
2814 u32 xmm_space[64]; /* 16*16 bytes for each XMM-reg = 256 bytes */
2815#else
2816 u32 xmm_space[32]; /* 8*16 bytes for each XMM-reg = 128 bytes */
2817#endif
2818};
2819
2820/*
2821 * Translate a guest virtual address to a guest physical address.
2822 */
2823int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
2824 struct kvm_translation *tr)
2825{
2826 unsigned long vaddr = tr->linear_address;
2827 gpa_t gpa;
2828
2829 vcpu_load(vcpu);
2830 mutex_lock(&vcpu->kvm->lock);
2831 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, vaddr);
2832 tr->physical_address = gpa;
2833 tr->valid = gpa != UNMAPPED_GVA;
2834 tr->writeable = 1;
2835 tr->usermode = 0;
2836 mutex_unlock(&vcpu->kvm->lock);
2837 vcpu_put(vcpu);
2838
2839 return 0;
2840}
2841
2842int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
2843{
2844 struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image;
2845
2846 vcpu_load(vcpu);
2847
2848 memcpy(fpu->fpr, fxsave->st_space, 128);
2849 fpu->fcw = fxsave->cwd;
2850 fpu->fsw = fxsave->swd;
2851 fpu->ftwx = fxsave->twd;
2852 fpu->last_opcode = fxsave->fop;
2853 fpu->last_ip = fxsave->rip;
2854 fpu->last_dp = fxsave->rdp;
2855 memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space);
2856
2857 vcpu_put(vcpu);
2858
2859 return 0;
2860}
2861
2862int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
2863{
2864 struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image;
2865
2866 vcpu_load(vcpu);
2867
2868 memcpy(fxsave->st_space, fpu->fpr, 128);
2869 fxsave->cwd = fpu->fcw;
2870 fxsave->swd = fpu->fsw;
2871 fxsave->twd = fpu->ftwx;
2872 fxsave->fop = fpu->last_opcode;
2873 fxsave->rip = fpu->last_ip;
2874 fxsave->rdp = fpu->last_dp;
2875 memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space);
2876
2877 vcpu_put(vcpu);
2878
2879 return 0;
2880}
2881
2882void fx_init(struct kvm_vcpu *vcpu)
2883{
2884 unsigned after_mxcsr_mask;
2885
2886 /* Initialize guest FPU by resetting ours and saving into guest's */
2887 preempt_disable();
2888 fx_save(&vcpu->arch.host_fx_image);
2889 fpu_init();
2890 fx_save(&vcpu->arch.guest_fx_image);
2891 fx_restore(&vcpu->arch.host_fx_image);
2892 preempt_enable();
2893
2894 vcpu->arch.cr0 |= X86_CR0_ET;
2895 after_mxcsr_mask = offsetof(struct i387_fxsave_struct, st_space);
2896 vcpu->arch.guest_fx_image.mxcsr = 0x1f80;
2897 memset((void *)&vcpu->arch.guest_fx_image + after_mxcsr_mask,
2898 0, sizeof(struct i387_fxsave_struct) - after_mxcsr_mask);
2899}
2900EXPORT_SYMBOL_GPL(fx_init);
2901
2902void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
2903{
2904 if (!vcpu->fpu_active || vcpu->guest_fpu_loaded)
2905 return;
2906
2907 vcpu->guest_fpu_loaded = 1;
2908 fx_save(&vcpu->arch.host_fx_image);
2909 fx_restore(&vcpu->arch.guest_fx_image);
2910}
2911EXPORT_SYMBOL_GPL(kvm_load_guest_fpu);
2912
2913void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
2914{
2915 if (!vcpu->guest_fpu_loaded)
2916 return;
2917
2918 vcpu->guest_fpu_loaded = 0;
2919 fx_save(&vcpu->arch.guest_fx_image);
2920 fx_restore(&vcpu->arch.host_fx_image);
2921 ++vcpu->stat.fpu_reload;
2922}
2923EXPORT_SYMBOL_GPL(kvm_put_guest_fpu);
2924
2925void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
2926{
2927 kvm_x86_ops->vcpu_free(vcpu);
2928}
2929
2930struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
2931 unsigned int id)
2932{
2933 return kvm_x86_ops->vcpu_create(kvm, id);
2934}
2935
2936int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
2937{
2938 int r;
2939
2940 /* We do fxsave: this must be aligned. */
2941 BUG_ON((unsigned long)&vcpu->arch.host_fx_image & 0xF);
2942
2943 vcpu_load(vcpu);
2944 r = kvm_arch_vcpu_reset(vcpu);
2945 if (r == 0)
2946 r = kvm_mmu_setup(vcpu);
2947 vcpu_put(vcpu);
2948 if (r < 0)
2949 goto free_vcpu;
2950
2951 return 0;
2952free_vcpu:
2953 kvm_x86_ops->vcpu_free(vcpu);
2954 return r;
2955}
2956
2957void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
2958{
2959 vcpu_load(vcpu);
2960 kvm_mmu_unload(vcpu);
2961 vcpu_put(vcpu);
2962
2963 kvm_x86_ops->vcpu_free(vcpu);
2964}
2965
2966int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
2967{
2968 return kvm_x86_ops->vcpu_reset(vcpu);
2969}
2970
2971void kvm_arch_hardware_enable(void *garbage)
2972{
2973 kvm_x86_ops->hardware_enable(garbage);
2974}
2975
2976void kvm_arch_hardware_disable(void *garbage)
2977{
2978 kvm_x86_ops->hardware_disable(garbage);
2979}
2980
2981int kvm_arch_hardware_setup(void)
2982{
2983 return kvm_x86_ops->hardware_setup();
2984}
2985
2986void kvm_arch_hardware_unsetup(void)
2987{
2988 kvm_x86_ops->hardware_unsetup();
2989}
2990
2991void kvm_arch_check_processor_compat(void *rtn)
2992{
2993 kvm_x86_ops->check_processor_compatibility(rtn);
2994}
2995
2996int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
2997{
2998 struct page *page;
2999 struct kvm *kvm;
3000 int r;
3001
3002 BUG_ON(vcpu->kvm == NULL);
3003 kvm = vcpu->kvm;
3004
3005 vcpu->arch.mmu.root_hpa = INVALID_PAGE;
3006 if (!irqchip_in_kernel(kvm) || vcpu->vcpu_id == 0)
3007 vcpu->arch.mp_state = VCPU_MP_STATE_RUNNABLE;
3008 else
3009 vcpu->arch.mp_state = VCPU_MP_STATE_UNINITIALIZED;
3010
3011 page = alloc_page(GFP_KERNEL | __GFP_ZERO);
3012 if (!page) {
3013 r = -ENOMEM;
3014 goto fail;
3015 }
3016 vcpu->arch.pio_data = page_address(page);
3017
3018 r = kvm_mmu_create(vcpu);
3019 if (r < 0)
3020 goto fail_free_pio_data;
3021
3022 if (irqchip_in_kernel(kvm)) {
3023 r = kvm_create_lapic(vcpu);
3024 if (r < 0)
3025 goto fail_mmu_destroy;
3026 }
3027
3028 return 0;
3029
3030fail_mmu_destroy:
3031 kvm_mmu_destroy(vcpu);
3032fail_free_pio_data:
3033 free_page((unsigned long)vcpu->arch.pio_data);
3034fail:
3035 return r;
3036}
3037
3038void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
3039{
3040 kvm_free_lapic(vcpu);
3041 kvm_mmu_destroy(vcpu);
3042 free_page((unsigned long)vcpu->arch.pio_data);
3043}
3044
3045struct kvm *kvm_arch_create_vm(void)
3046{
3047 struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
3048
3049 if (!kvm)
3050 return ERR_PTR(-ENOMEM);
3051
3052 INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
3053
3054 return kvm;
3055}
3056
/* Load the vcpu just long enough to unload its MMU state. */
static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
{
	vcpu_load(vcpu);
	kvm_mmu_unload(vcpu);
	vcpu_put(vcpu);
}
3063
3064static void kvm_free_vcpus(struct kvm *kvm)
3065{
3066 unsigned int i;
3067
3068 /*
3069 * Unpin any mmu pages first.
3070 */
3071 for (i = 0; i < KVM_MAX_VCPUS; ++i)
3072 if (kvm->vcpus[i])
3073 kvm_unload_vcpu_mmu(kvm->vcpus[i]);
3074 for (i = 0; i < KVM_MAX_VCPUS; ++i) {
3075 if (kvm->vcpus[i]) {
3076 kvm_arch_vcpu_free(kvm->vcpus[i]);
3077 kvm->vcpus[i] = NULL;
3078 }
3079 }
3080
3081}
3082
3083void kvm_arch_destroy_vm(struct kvm *kvm)
3084{
3085 kfree(kvm->arch.vpic);
3086 kfree(kvm->arch.vioapic);
3087 kvm_free_vcpus(kvm);
3088 kvm_free_physmem(kvm);
3089 kfree(kvm);
3090}
3091
3092int kvm_arch_set_memory_region(struct kvm *kvm,
3093 struct kvm_userspace_memory_region *mem,
3094 struct kvm_memory_slot old,
3095 int user_alloc)
3096{
3097 int npages = mem->memory_size >> PAGE_SHIFT;
3098 struct kvm_memory_slot *memslot = &kvm->memslots[mem->slot];
3099
3100 /*To keep backward compatibility with older userspace,
3101 *x86 needs to hanlde !user_alloc case.
3102 */
3103 if (!user_alloc) {
3104 if (npages && !old.rmap) {
3105 down_write(&current->mm->mmap_sem);
3106 memslot->userspace_addr = do_mmap(NULL, 0,
3107 npages * PAGE_SIZE,
3108 PROT_READ | PROT_WRITE,
3109 MAP_SHARED | MAP_ANONYMOUS,
3110 0);
3111 up_write(&current->mm->mmap_sem);
3112
3113 if (IS_ERR((void *)memslot->userspace_addr))
3114 return PTR_ERR((void *)memslot->userspace_addr);
3115 } else {
3116 if (!old.user_alloc && old.rmap) {
3117 int ret;
3118
3119 down_write(&current->mm->mmap_sem);
3120 ret = do_munmap(current->mm, old.userspace_addr,
3121 old.npages * PAGE_SIZE);
3122 up_write(&current->mm->mmap_sem);
3123 if (ret < 0)
3124 printk(KERN_WARNING
3125 "kvm_vm_ioctl_set_memory_region: "
3126 "failed to munmap memory\n");
3127 }
3128 }
3129 }
3130
3131 if (!kvm->arch.n_requested_mmu_pages) {
3132 unsigned int nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm);
3133 kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
3134 }
3135
3136 kvm_mmu_slot_remove_write_access(kvm, mem->slot);
3137 kvm_flush_remote_tlbs(kvm);
3138
3139 return 0;
3140}
3141
3142int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
3143{
3144 return vcpu->arch.mp_state == VCPU_MP_STATE_RUNNABLE
3145 || vcpu->arch.mp_state == VCPU_MP_STATE_SIPI_RECEIVED;
3146}
diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c
new file mode 100644
index 000000000000..79586003397a
--- /dev/null
+++ b/arch/x86/kvm/x86_emulate.c
@@ -0,0 +1,1912 @@
1/******************************************************************************
2 * x86_emulate.c
3 *
4 * Generic x86 (32-bit and 64-bit) instruction decoder and emulator.
5 *
6 * Copyright (c) 2005 Keir Fraser
7 *
8 * Linux coding style, mod r/m decoder, segment base fixes, real-mode
9 * privileged instructions:
10 *
11 * Copyright (C) 2006 Qumranet
12 *
13 * Avi Kivity <avi@qumranet.com>
14 * Yaniv Kamay <yaniv@qumranet.com>
15 *
16 * This work is licensed under the terms of the GNU GPL, version 2. See
17 * the COPYING file in the top-level directory.
18 *
19 * From: xen-unstable 10676:af9809f51f81a3c43f276f00c81a52ef558afda4
20 */
21
22#ifndef __KERNEL__
23#include <stdio.h>
24#include <stdint.h>
25#include <public/xen.h>
26#define DPRINTF(_f, _a ...) printf(_f , ## _a)
27#else
28#include <linux/kvm_host.h>
29#define DPRINTF(x...) do {} while (0)
30#endif
31#include <linux/module.h>
32#include <asm/kvm_x86_emulate.h>
33
34/*
35 * Opcode effective-address decode tables.
36 * Note that we only emulate instructions that have at least one memory
37 * operand (excluding implicit stack references). We assume that stack
38 * references and instruction fetches will never occur in special memory
39 * areas that require emulation. So, for example, 'mov <imm>,<reg>' need
40 * not be handled.
41 */
42
43/* Operand sizes: 8-bit operands or specified/overridden size. */
44#define ByteOp (1<<0) /* 8-bit operands. */
45/* Destination operand type. */
46#define ImplicitOps (1<<1) /* Implicit in opcode. No generic decode. */
47#define DstReg (2<<1) /* Register operand. */
48#define DstMem (3<<1) /* Memory operand. */
49#define DstMask (3<<1)
50/* Source operand type. */
51#define SrcNone (0<<3) /* No source operand. */
52#define SrcImplicit (0<<3) /* Source operand is implicit in the opcode. */
53#define SrcReg (1<<3) /* Register operand. */
54#define SrcMem (2<<3) /* Memory operand. */
55#define SrcMem16 (3<<3) /* Memory operand (16-bit). */
56#define SrcMem32 (4<<3) /* Memory operand (32-bit). */
57#define SrcImm (5<<3) /* Immediate operand. */
58#define SrcImmByte (6<<3) /* 8-bit sign-extended immediate operand. */
59#define SrcMask (7<<3)
60/* Generic ModRM decode. */
61#define ModRM (1<<6)
62/* Destination is only written; never read. */
63#define Mov (1<<7)
64#define BitOp (1<<8)
65#define MemAbs (1<<9) /* Memory operand is absolute displacement */
66#define String (1<<10) /* String instruction (rep capable) */
67#define Stack (1<<11) /* Stack instruction (push/pop) */
68
/*
 * Decode flags for one-byte opcodes, indexed by the opcode byte.  Each
 * entry is an OR of the ByteOp/Dst.../Src.../ModRM/Mov/... flags defined
 * above.  An entry of 0 means the opcode is not handled as a one-byte
 * opcode: x86_decode_insn() then consults twobyte_table if the byte is
 * 0x0f and otherwise gives up.
 */
static u16 opcode_table[256] = {
	/* 0x00 - 0x07 */
	ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
	ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
	0, 0, 0, 0,
	/* 0x08 - 0x0F */
	ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
	ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
	0, 0, 0, 0,
	/* 0x10 - 0x17 */
	ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
	ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
	0, 0, 0, 0,
	/* 0x18 - 0x1F */
	ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
	ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
	0, 0, 0, 0,
	/* 0x20 - 0x27 */
	ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
	ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
	SrcImmByte, SrcImm, 0, 0,
	/* 0x28 - 0x2F */
	ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
	ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
	0, 0, 0, 0,
	/* 0x30 - 0x37 */
	ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
	ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
	0, 0, 0, 0,
	/* 0x38 - 0x3F */
	ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
	ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
	0, 0, 0, 0,
	/* 0x40 - 0x47 */
	DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg,
	/* 0x48 - 0x4F */
	DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg,
	/* 0x50 - 0x57 */
	SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, SrcReg | Stack,
	SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, SrcReg | Stack,
	/* 0x58 - 0x5F */
	DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack,
	DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack,
	/* 0x60 - 0x67 */
	0, 0, 0, DstReg | SrcMem32 | ModRM | Mov /* movsxd (x86/64) */ ,
	0, 0, 0, 0,
	/* 0x68 - 0x6F */
	0, 0, ImplicitOps | Mov | Stack, 0,
	SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* insb, insw/insd */
	SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* outsb, outsw/outsd */
	/* 0x70 - 0x77 */
	ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
	ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
	/* 0x78 - 0x7F */
	ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
	ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
	/* 0x80 - 0x87 */
	ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM,
	ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM,
	ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
	ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
	/* 0x88 - 0x8F */
	ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov,
	ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
	0, ModRM | DstReg, 0, DstMem | SrcNone | ModRM | Mov | Stack,
	/* 0x90 - 0x9F */
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, ImplicitOps | Stack, ImplicitOps | Stack, 0, 0,
	/* 0xA0 - 0xA7 */
	ByteOp | DstReg | SrcMem | Mov | MemAbs, DstReg | SrcMem | Mov | MemAbs,
	ByteOp | DstMem | SrcReg | Mov | MemAbs, DstMem | SrcReg | Mov | MemAbs,
	ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String,
	ByteOp | ImplicitOps | String, ImplicitOps | String,
	/* 0xA8 - 0xAF */
	0, 0, ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String,
	ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String,
	ByteOp | ImplicitOps | String, ImplicitOps | String,
	/* 0xB0 - 0xBF */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0xC0 - 0xC7 */
	ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM,
	0, ImplicitOps | Stack, 0, 0,
	ByteOp | DstMem | SrcImm | ModRM | Mov, DstMem | SrcImm | ModRM | Mov,
	/* 0xC8 - 0xCF */
	0, 0, 0, 0, 0, 0, 0, 0,
	/* 0xD0 - 0xD7 */
	ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM,
	ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM,
	0, 0, 0, 0,
	/* 0xD8 - 0xDF */
	0, 0, 0, 0, 0, 0, 0, 0,
	/* 0xE0 - 0xE7 */
	0, 0, 0, 0, 0, 0, 0, 0,
	/* 0xE8 - 0xEF */
	ImplicitOps | Stack, SrcImm|ImplicitOps, 0, SrcImmByte|ImplicitOps,
	0, 0, 0, 0,
	/* 0xF0 - 0xF7 */
	0, 0, 0, 0,
	ImplicitOps, ImplicitOps,
	ByteOp | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM,
	/* 0xF8 - 0xFF */
	ImplicitOps, 0, ImplicitOps, ImplicitOps,
	0, 0, ByteOp | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM
};
173
/*
 * Decode flags for two-byte opcodes, indexed by the byte that follows
 * the 0x0f escape.  Uses the same flag encoding as opcode_table.  An
 * entry of 0 makes x86_decode_insn() fail with "Cannot emulate".
 */
static u16 twobyte_table[256] = {
	/* 0x00 - 0x0F */
	0, SrcMem | ModRM | DstReg, 0, 0, 0, 0, ImplicitOps, 0,
	ImplicitOps, ImplicitOps, 0, 0, 0, ImplicitOps | ModRM, 0, 0,
	/* 0x10 - 0x1F */
	0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, 0,
	/* 0x20 - 0x2F */
	ModRM | ImplicitOps, ModRM, ModRM | ImplicitOps, ModRM, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x30 - 0x3F */
	ImplicitOps, 0, ImplicitOps, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x40 - 0x47 */
	DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
	DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
	DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
	DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
	/* 0x48 - 0x4F */
	DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
	DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
	DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
	DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
	/* 0x50 - 0x5F */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x60 - 0x6F */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x70 - 0x7F */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x80 - 0x8F */
	ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
	ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
	ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
	ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
	/* 0x90 - 0x9F */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0xA0 - 0xA7 */
	0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 0, 0, 0, 0,
	/* 0xA8 - 0xAF */
	0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 0, 0, 0, 0,
	/* 0xB0 - 0xB7 */
	ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 0,
	DstMem | SrcReg | ModRM | BitOp,
	0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov,
	DstReg | SrcMem16 | ModRM | Mov,
	/* 0xB8 - 0xBF */
	0, 0, DstMem | SrcImmByte | ModRM, DstMem | SrcReg | ModRM | BitOp,
	0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov,
	DstReg | SrcMem16 | ModRM | Mov,
	/* 0xC0 - 0xCF */
	0, 0, 0, DstMem | SrcReg | ModRM | Mov, 0, 0, 0, ImplicitOps | ModRM,
	0, 0, 0, 0, 0, 0, 0, 0,
	/* 0xD0 - 0xDF */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0xE0 - 0xEF */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0xF0 - 0xFF */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
231
232/* EFLAGS bit definitions. */
233#define EFLG_OF (1<<11)
234#define EFLG_DF (1<<10)
235#define EFLG_SF (1<<7)
236#define EFLG_ZF (1<<6)
237#define EFLG_AF (1<<4)
238#define EFLG_PF (1<<2)
239#define EFLG_CF (1<<0)
240
241/*
242 * Instruction emulation:
243 * Most instructions are emulated directly via a fragment of inline assembly
244 * code. This allows us to save/restore EFLAGS and thus very easily pick up
245 * any modified flags.
246 */
247
248#if defined(CONFIG_X86_64)
249#define _LO32 "k" /* force 32-bit operand */
250#define _STK "%%rsp" /* stack pointer */
251#elif defined(__i386__)
252#define _LO32 "" /* force 32-bit operand */
253#define _STK "%%esp" /* stack pointer */
254#endif
255
256/*
257 * These EFLAGS bits are restored from saved value during emulation, and
258 * any changes are written back to the saved value after emulation.
259 */
260#define EFLAGS_MASK (EFLG_OF|EFLG_SF|EFLG_ZF|EFLG_AF|EFLG_PF|EFLG_CF)
261
/*
 * Before executing instruction: restore necessary bits in EFLAGS.
 *
 * _sav, _msk and _tmp are asm operand numbers given as strings: the
 * saved-flags variable, the flag mask, and a scratch register.  The
 * sequence works on the stack so that afterwards the real EFLAGS holds
 * the masked bits from _sav (other bits untouched) and _sav has those
 * bits cleared, ready for _POST_EFLAGS to OR the results back in.
 */
#define _PRE_EFLAGS(_sav, _msk, _tmp) \
	/* EFLAGS = (_sav & _msk) | (EFLAGS & ~_msk); _sav &= ~_msk; */ \
	"movl %"_sav",%"_LO32 _tmp"; "                                  \
	"push %"_tmp"; "                                                \
	"push %"_tmp"; "                                                \
	"movl %"_msk",%"_LO32 _tmp"; "                                  \
	"andl %"_LO32 _tmp",("_STK"); "                                 \
	"pushf; "                                                       \
	"notl %"_LO32 _tmp"; "                                          \
	"andl %"_LO32 _tmp",("_STK"); "                                 \
	"andl %"_LO32 _tmp","__stringify(BITS_PER_LONG/4)"("_STK"); "	\
	"pop  %"_tmp"; "                                                \
	"orl  %"_LO32 _tmp",("_STK"); "                                 \
	"popf; "                                                        \
	"pop  %"_sav"; "

/*
 * After executing instruction: write-back necessary bits in EFLAGS.
 *
 * Reads the real EFLAGS, keeps only the bits in _msk and ORs them into
 * _sav.  This relies on _PRE_EFLAGS having cleared those bits in _sav.
 */
#define _POST_EFLAGS(_sav, _msk, _tmp) \
	/* _sav |= EFLAGS & _msk; */		\
	"pushf; "				\
	"pop  %"_tmp"; "			\
	"andl %"_msk",%"_LO32 _tmp"; "		\
	"orl  %"_LO32 _tmp",%"_sav"; "
286
/*
 * Raw emulation: instruction has two explicit operands.
 *
 * Executes "_op" natively with (_dst).val as destination and (_src).val
 * as source, selecting the w/l/q suffix from (_dst).bytes (2, 4 or 8;
 * other sizes do nothing).  _eflags is loaded into the CPU flags before
 * and updated after the instruction.  Each (_?x, _?y) pair is the operand
 * modifier and the constraint used for the source at that width.
 *
 * Asm operands: %0 _eflags, %1 (_dst).val, %2 scratch, %3 (_src).val,
 * %4 EFLAGS_MASK.
 */
#define __emulate_2op_nobyte(_op,_src,_dst,_eflags,_wx,_wy,_lx,_ly,_qx,_qy) \
	do {								\
		unsigned long _tmp;					\
									\
		switch ((_dst).bytes) {					\
		case 2:							\
			__asm__ __volatile__ (				\
				_PRE_EFLAGS("0", "4", "2")		\
				_op"w %"_wx"3,%1; "			\
				_POST_EFLAGS("0", "4", "2")		\
				: "=m" (_eflags), "=m" ((_dst).val),	\
				  "=&r" (_tmp)				\
				: _wy ((_src).val), "i" (EFLAGS_MASK));	\
			break;						\
		case 4:							\
			__asm__ __volatile__ (				\
				_PRE_EFLAGS("0", "4", "2")		\
				_op"l %"_lx"3,%1; "			\
				_POST_EFLAGS("0", "4", "2")		\
				: "=m" (_eflags), "=m" ((_dst).val),	\
				  "=&r" (_tmp)				\
				: _ly ((_src).val), "i" (EFLAGS_MASK));	\
			break;						\
		case 8:							\
			__emulate_2op_8byte(_op, _src, _dst,		\
					    _eflags, _qx, _qy);		\
			break;						\
		}							\
	} while (0)

/*
 * As __emulate_2op_nobyte, but also handles 1-byte destinations (using
 * the _bx/_by modifier/constraint pair); every other size is delegated
 * to __emulate_2op_nobyte.
 */
#define __emulate_2op(_op,_src,_dst,_eflags,_bx,_by,_wx,_wy,_lx,_ly,_qx,_qy) \
	do {								\
		unsigned long _tmp;					\
		switch ((_dst).bytes) {					\
		case 1:							\
			__asm__ __volatile__ (				\
				_PRE_EFLAGS("0", "4", "2")		\
				_op"b %"_bx"3,%1; "			\
				_POST_EFLAGS("0", "4", "2")		\
				: "=m" (_eflags), "=m" ((_dst).val),	\
				  "=&r" (_tmp)				\
				: _by ((_src).val), "i" (EFLAGS_MASK));	\
			break;						\
		default:						\
			__emulate_2op_nobyte(_op, _src, _dst, _eflags,	\
					     _wx, _wy, _lx, _ly, _qx, _qy); \
			break;						\
		}							\
	} while (0)
337
/*
 * Source operand is byte-sized and may be restricted to just %cl.
 * The same ("b" modifier, "c" constraint) pair is used for every
 * destination width.
 */
#define emulate_2op_SrcB(_op, _src, _dst, _eflags)                      \
	__emulate_2op(_op, _src, _dst, _eflags,				\
		      "b", "c", "b", "c", "b", "c", "b", "c")

/*
 * Source operand is byte, word, long or quad sized.
 * The source is printed at the destination's width; the byte form uses
 * the "q" constraint, the wider forms use "r".
 */
#define emulate_2op_SrcV(_op, _src, _dst, _eflags)                      \
	__emulate_2op(_op, _src, _dst, _eflags,				\
		      "b", "q", "w", "r", _LO32, "r", "", "r")

/* Source operand is word, long or quad sized (no byte form). */
#define emulate_2op_SrcV_nobyte(_op, _src, _dst, _eflags)               \
	__emulate_2op_nobyte(_op, _src, _dst, _eflags,			\
			     "w", "r", _LO32, "r", "", "r")
352
/*
 * Instruction has only one explicit operand (no source operand).
 *
 * Executes "_op" natively on (_dst).val with the b/w/l/q suffix chosen
 * from (_dst).bytes (1, 2, 4 or 8; other sizes do nothing), loading
 * _eflags into the CPU flags before and updating it afterwards.
 *
 * Asm operands: %0 _eflags, %1 (_dst).val, %2 scratch, %3 EFLAGS_MASK.
 */
#define emulate_1op(_op, _dst, _eflags)                                    \
	do {								\
		unsigned long _tmp;					\
									\
		switch ((_dst).bytes) {					\
		case 1:							\
			__asm__ __volatile__ (				\
				_PRE_EFLAGS("0", "3", "2")		\
				_op"b %1; "				\
				_POST_EFLAGS("0", "3", "2")		\
				: "=m" (_eflags), "=m" ((_dst).val),	\
				  "=&r" (_tmp)				\
				: "i" (EFLAGS_MASK));			\
			break;						\
		case 2:							\
			__asm__ __volatile__ (				\
				_PRE_EFLAGS("0", "3", "2")		\
				_op"w %1; "				\
				_POST_EFLAGS("0", "3", "2")		\
				: "=m" (_eflags), "=m" ((_dst).val),	\
				  "=&r" (_tmp)				\
				: "i" (EFLAGS_MASK));			\
			break;						\
		case 4:							\
			__asm__ __volatile__ (				\
				_PRE_EFLAGS("0", "3", "2")		\
				_op"l %1; "				\
				_POST_EFLAGS("0", "3", "2")		\
				: "=m" (_eflags), "=m" ((_dst).val),	\
				  "=&r" (_tmp)				\
				: "i" (EFLAGS_MASK));			\
			break;						\
		case 8:							\
			__emulate_1op_8byte(_op, _dst, _eflags);	\
			break;						\
		}							\
	} while (0)
391
/*
 * Emulate an instruction with quadword operands (x86/64 only).
 *
 * Both helpers use a scratch variable named _tmp that they do not
 * declare themselves: it must be provided by the macro that expands
 * them (emulate_1op and __emulate_2op_nobyte do so).  On i386 builds
 * the helpers expand to nothing, so the 8-byte case does nothing.
 */
#if defined(CONFIG_X86_64)
#define __emulate_2op_8byte(_op, _src, _dst, _eflags, _qx, _qy)           \
	do {								  \
		__asm__ __volatile__ (					  \
			_PRE_EFLAGS("0", "4", "2")			  \
			_op"q %"_qx"3,%1; "				  \
			_POST_EFLAGS("0", "4", "2")			  \
			: "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \
			: _qy ((_src).val), "i" (EFLAGS_MASK));		  \
	} while (0)

#define __emulate_1op_8byte(_op, _dst, _eflags)                           \
	do {								  \
		__asm__ __volatile__ (					  \
			_PRE_EFLAGS("0", "3", "2")			  \
			_op"q %1; "					  \
			_POST_EFLAGS("0", "3", "2")			  \
			: "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \
			: "i" (EFLAGS_MASK));				  \
	} while (0)

#elif defined(__i386__)
#define __emulate_2op_8byte(_op, _src, _dst, _eflags, _qx, _qy)
#define __emulate_1op_8byte(_op, _dst, _eflags)
#endif /* __i386__ */
418
/*
 * Fetch next part of the instruction being emulated.
 *
 * Reads _size bytes at _eip through do_insn_fetch(), advances _eip by
 * _size and evaluates to the fetched value cast to _type.
 *
 * Hidden requirements on the calling function: it must have `ctxt` and
 * `ops` in scope, an assignable `rc`, and a `done:` label.  On a fetch
 * failure the error is left in `rc` and control jumps to `done` without
 * advancing _eip.
 */
#define insn_fetch(_type, _size, _eip)                                  \
({	unsigned long _x;						\
	rc = do_insn_fetch(ctxt, ops, (_eip), &_x, (_size));		\
	if (rc != 0)							\
		goto done;						\
	(_eip) += (_size);						\
	(_type)_x;							\
})
428
/*
 * Access/update address held in a register, based on addressing mode.
 *
 * All of these macros read the address size from `c->ad_bytes`, so a
 * decode cache pointer named `c` must be in scope.
 *
 * address_mask(reg): reg truncated to ad_bytes bytes.  When ad_bytes
 * equals sizeof(unsigned long), reg is returned as is, so the shift
 * never reaches the full width of the type.
 */
#define address_mask(reg)						\
	((c->ad_bytes == sizeof(unsigned long)) ?			\
		(reg) :	((reg) & ((1UL << (c->ad_bytes << 3)) - 1)))
/* register_address(base, reg): base plus the size-masked register. */
#define register_address(base, reg)                                     \
	((base) + address_mask(reg))
/*
 * register_address_increment(reg, inc): add inc to reg, wrapping within
 * the low ad_bytes bytes and leaving the upper bits of reg unchanged.
 */
#define register_address_increment(reg, inc)				\
	do {								\
		/* signed type ensures sign extension to long */	\
		int _inc = (inc);					\
		if (c->ad_bytes == sizeof(unsigned long))		\
			(reg) += _inc;					\
		else							\
			(reg) = ((reg) & 				\
				 ~((1UL << (c->ad_bytes << 3)) - 1)) |	\
				(((reg) + _inc) &			\
				 ((1UL << (c->ad_bytes << 3)) - 1));	\
	} while (0)

/* Relative jump: apply rel to c->eip with address-size wraparound. */
#define JMP_REL(rel) 							\
	do {								\
		register_address_increment(c->eip, rel);		\
	} while (0)
452
453static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt,
454 struct x86_emulate_ops *ops,
455 unsigned long linear, u8 *dest)
456{
457 struct fetch_cache *fc = &ctxt->decode.fetch;
458 int rc;
459 int size;
460
461 if (linear < fc->start || linear >= fc->end) {
462 size = min(15UL, PAGE_SIZE - offset_in_page(linear));
463 rc = ops->read_std(linear, fc->data, size, ctxt->vcpu);
464 if (rc)
465 return rc;
466 fc->start = linear;
467 fc->end = linear + size;
468 }
469 *dest = fc->data[linear - fc->start];
470 return 0;
471}
472
473static int do_insn_fetch(struct x86_emulate_ctxt *ctxt,
474 struct x86_emulate_ops *ops,
475 unsigned long eip, void *dest, unsigned size)
476{
477 int rc = 0;
478
479 eip += ctxt->cs_base;
480 while (size--) {
481 rc = do_fetch_insn_byte(ctxt, ops, eip++, dest++);
482 if (rc)
483 return rc;
484 }
485 return 0;
486}
487
488/*
489 * Given the 'reg' portion of a ModRM byte, and a register block, return a
490 * pointer into the block that addresses the relevant register.
491 * @highbyte_regs specifies whether to decode AH,CH,DH,BH.
492 */
493static void *decode_register(u8 modrm_reg, unsigned long *regs,
494 int highbyte_regs)
495{
496 void *p;
497
498 p = &regs[modrm_reg];
499 if (highbyte_regs && modrm_reg >= 4 && modrm_reg < 8)
500 p = (unsigned char *)&regs[modrm_reg & 3] + 1;
501 return p;
502}
503
504static int read_descriptor(struct x86_emulate_ctxt *ctxt,
505 struct x86_emulate_ops *ops,
506 void *ptr,
507 u16 *size, unsigned long *address, int op_bytes)
508{
509 int rc;
510
511 if (op_bytes == 2)
512 op_bytes = 3;
513 *address = 0;
514 rc = ops->read_std((unsigned long)ptr, (unsigned long *)size, 2,
515 ctxt->vcpu);
516 if (rc)
517 return rc;
518 rc = ops->read_std((unsigned long)ptr + 2, address, op_bytes,
519 ctxt->vcpu);
520 return rc;
521}
522
523static int test_cc(unsigned int condition, unsigned int flags)
524{
525 int rc = 0;
526
527 switch ((condition & 15) >> 1) {
528 case 0: /* o */
529 rc |= (flags & EFLG_OF);
530 break;
531 case 1: /* b/c/nae */
532 rc |= (flags & EFLG_CF);
533 break;
534 case 2: /* z/e */
535 rc |= (flags & EFLG_ZF);
536 break;
537 case 3: /* be/na */
538 rc |= (flags & (EFLG_CF|EFLG_ZF));
539 break;
540 case 4: /* s */
541 rc |= (flags & EFLG_SF);
542 break;
543 case 5: /* p/pe */
544 rc |= (flags & EFLG_PF);
545 break;
546 case 7: /* le/ng */
547 rc |= (flags & EFLG_ZF);
548 /* fall through */
549 case 6: /* l/nge */
550 rc |= (!(flags & EFLG_SF) != !(flags & EFLG_OF));
551 break;
552 }
553
554 /* Odd condition identifiers (lsb == 1) have inverted sense. */
555 return (!!rc ^ (condition & 1));
556}
557
558static void decode_register_operand(struct operand *op,
559 struct decode_cache *c,
560 int inhibit_bytereg)
561{
562 unsigned reg = c->modrm_reg;
563 int highbyte_regs = c->rex_prefix == 0;
564
565 if (!(c->d & ModRM))
566 reg = (c->b & 7) | ((c->rex_prefix & 1) << 3);
567 op->type = OP_REG;
568 if ((c->d & ByteOp) && !inhibit_bytereg) {
569 op->ptr = decode_register(reg, c->regs, highbyte_regs);
570 op->val = *(u8 *)op->ptr;
571 op->bytes = 1;
572 } else {
573 op->ptr = decode_register(reg, c->regs, 0);
574 op->bytes = c->op_bytes;
575 switch (op->bytes) {
576 case 2:
577 op->val = *(u16 *)op->ptr;
578 break;
579 case 4:
580 op->val = *(u32 *)op->ptr;
581 break;
582 case 8:
583 op->val = *(u64 *) op->ptr;
584 break;
585 }
586 }
587 op->orig_val = op->val;
588}
589
/*
 * Decode the ModRM byte of the current instruction, plus any SIB byte
 * and displacement that follow it.
 *
 * Fills c->modrm, c->modrm_mod/reg/rm (with the REX extension bits
 * folded in) and sets c->use_modrm_ea.  For the register form (mod == 3)
 * the register contents are stored in c->modrm_val; otherwise the
 * effective address, without any segment base, is accumulated in
 * c->modrm_ea.
 *
 * Returns 0 on success, or the instruction-fetch error (insn_fetch()
 * jumps to the "done" label with rc set).
 */
static int decode_modrm(struct x86_emulate_ctxt *ctxt,
			struct x86_emulate_ops *ops)
{
	struct decode_cache *c = &ctxt->decode;
	u8 sib;
	int index_reg = 0, base_reg = 0, scale, rip_relative = 0;
	int rc = 0;

	/* Pre-load bit 3 of each register number from the REX prefix. */
	if (c->rex_prefix) {
		c->modrm_reg = (c->rex_prefix & 4) << 1;	/* REX.R */
		index_reg = (c->rex_prefix & 2) << 2; /* REX.X */
		c->modrm_rm = base_reg = (c->rex_prefix & 1) << 3; /* REX.B */
	}

	c->modrm = insn_fetch(u8, 1, c->eip);
	c->modrm_mod |= (c->modrm & 0xc0) >> 6;
	c->modrm_reg |= (c->modrm & 0x38) >> 3;
	c->modrm_rm |= (c->modrm & 0x07);
	c->modrm_ea = 0;
	c->use_modrm_ea = 1;

	if (c->modrm_mod == 3) {
		/*
		 * Register-direct operand: no address to compute.
		 * NOTE(review): high-byte decoding here depends only on
		 * ByteOp, whereas decode_register_operand() also requires
		 * that no REX prefix is present - confirm this is intended.
		 */
		c->modrm_val = *(unsigned long *)
			decode_register(c->modrm_rm, c->regs, c->d & ByteOp);
		return rc;
	}

	if (c->ad_bytes == 2) {
		unsigned bx = c->regs[VCPU_REGS_RBX];
		unsigned bp = c->regs[VCPU_REGS_RBP];
		unsigned si = c->regs[VCPU_REGS_RSI];
		unsigned di = c->regs[VCPU_REGS_RDI];

		/* 16-bit ModR/M decode. */
		switch (c->modrm_mod) {
		case 0:
			/* mod 0, rm 6: bare 16-bit displacement, no base. */
			if (c->modrm_rm == 6)
				c->modrm_ea += insn_fetch(u16, 2, c->eip);
			break;
		case 1:
			c->modrm_ea += insn_fetch(s8, 1, c->eip);
			break;
		case 2:
			c->modrm_ea += insn_fetch(u16, 2, c->eip);
			break;
		}
		switch (c->modrm_rm) {
		case 0:
			c->modrm_ea += bx + si;
			break;
		case 1:
			c->modrm_ea += bx + di;
			break;
		case 2:
			c->modrm_ea += bp + si;
			break;
		case 3:
			c->modrm_ea += bp + di;
			break;
		case 4:
			c->modrm_ea += si;
			break;
		case 5:
			c->modrm_ea += di;
			break;
		case 6:
			if (c->modrm_mod != 0)
				c->modrm_ea += bp;
			break;
		case 7:
			c->modrm_ea += bx;
			break;
		}
		/* BP-based forms default to SS unless a prefix chose a segment. */
		if (c->modrm_rm == 2 || c->modrm_rm == 3 ||
		    (c->modrm_rm == 6 && c->modrm_mod != 0))
			if (!c->override_base)
				c->override_base = &ctxt->ss_base;
		/* 16-bit addresses wrap at 64K. */
		c->modrm_ea = (u16)c->modrm_ea;
	} else {
		/* 32/64-bit ModR/M decode. */
		switch (c->modrm_rm) {
		case 4:
		case 12:
			/* rm 4 (with or without REX.B): a SIB byte follows. */
			sib = insn_fetch(u8, 1, c->eip);
			index_reg |= (sib >> 3) & 7;
			base_reg |= sib & 7;
			scale = sib >> 6;

			switch (base_reg) {
			case 5:
				/* base 5 with mod 0: disp32 instead of a base. */
				if (c->modrm_mod != 0)
					c->modrm_ea += c->regs[base_reg];
				else
					c->modrm_ea +=
						insn_fetch(s32, 4, c->eip);
				break;
			default:
				c->modrm_ea += c->regs[base_reg];
			}
			switch (index_reg) {
			case 4:
				/* index 4: no index register. */
				break;
			default:
				c->modrm_ea += c->regs[index_reg] << scale;
			}
			break;
		case 5:
			/*
			 * mod 0, rm 5: disp32 only (fetched below); in 64-bit
			 * mode it is RIP-relative.
			 * NOTE(review): rm == 13 (REX.B set) takes the default
			 * branch instead - confirm against the SDM.
			 */
			if (c->modrm_mod != 0)
				c->modrm_ea += c->regs[c->modrm_rm];
			else if (ctxt->mode == X86EMUL_MODE_PROT64)
				rip_relative = 1;
			break;
		default:
			c->modrm_ea += c->regs[c->modrm_rm];
			break;
		}
		switch (c->modrm_mod) {
		case 0:
			if (c->modrm_rm == 5)
				c->modrm_ea += insn_fetch(s32, 4, c->eip);
			break;
		case 1:
			c->modrm_ea += insn_fetch(s8, 1, c->eip);
			break;
		case 2:
			c->modrm_ea += insn_fetch(s32, 4, c->eip);
			break;
		}
	}
	if (rip_relative) {
		/*
		 * c->eip now points just past the displacement; also add the
		 * size of any immediate that is still to be fetched.
		 */
		c->modrm_ea += c->eip;
		switch (c->d & SrcMask) {
		case SrcImmByte:
			c->modrm_ea += 1;
			break;
		case SrcImm:
			if (c->d & ByteOp)
				c->modrm_ea += 1;
			else
				if (c->op_bytes == 8)
					c->modrm_ea += 4;
				else
					c->modrm_ea += c->op_bytes;
		}
	}
done:
	return rc;
}
738
739static int decode_abs(struct x86_emulate_ctxt *ctxt,
740 struct x86_emulate_ops *ops)
741{
742 struct decode_cache *c = &ctxt->decode;
743 int rc = 0;
744
745 switch (c->ad_bytes) {
746 case 2:
747 c->modrm_ea = insn_fetch(u16, 2, c->eip);
748 break;
749 case 4:
750 c->modrm_ea = insn_fetch(u32, 4, c->eip);
751 break;
752 case 8:
753 c->modrm_ea = insn_fetch(u64, 8, c->eip);
754 break;
755 }
756done:
757 return rc;
758}
759
760int
761x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
762{
763 struct decode_cache *c = &ctxt->decode;
764 int rc = 0;
765 int mode = ctxt->mode;
766 int def_op_bytes, def_ad_bytes;
767
768 /* Shadow copy of register state. Committed on successful emulation. */
769
770 memset(c, 0, sizeof(struct decode_cache));
771 c->eip = ctxt->vcpu->arch.rip;
772 memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
773
774 switch (mode) {
775 case X86EMUL_MODE_REAL:
776 case X86EMUL_MODE_PROT16:
777 def_op_bytes = def_ad_bytes = 2;
778 break;
779 case X86EMUL_MODE_PROT32:
780 def_op_bytes = def_ad_bytes = 4;
781 break;
782#ifdef CONFIG_X86_64
783 case X86EMUL_MODE_PROT64:
784 def_op_bytes = 4;
785 def_ad_bytes = 8;
786 break;
787#endif
788 default:
789 return -1;
790 }
791
792 c->op_bytes = def_op_bytes;
793 c->ad_bytes = def_ad_bytes;
794
795 /* Legacy prefixes. */
796 for (;;) {
797 switch (c->b = insn_fetch(u8, 1, c->eip)) {
798 case 0x66: /* operand-size override */
799 /* switch between 2/4 bytes */
800 c->op_bytes = def_op_bytes ^ 6;
801 break;
802 case 0x67: /* address-size override */
803 if (mode == X86EMUL_MODE_PROT64)
804 /* switch between 4/8 bytes */
805 c->ad_bytes = def_ad_bytes ^ 12;
806 else
807 /* switch between 2/4 bytes */
808 c->ad_bytes = def_ad_bytes ^ 6;
809 break;
810 case 0x2e: /* CS override */
811 c->override_base = &ctxt->cs_base;
812 break;
813 case 0x3e: /* DS override */
814 c->override_base = &ctxt->ds_base;
815 break;
816 case 0x26: /* ES override */
817 c->override_base = &ctxt->es_base;
818 break;
819 case 0x64: /* FS override */
820 c->override_base = &ctxt->fs_base;
821 break;
822 case 0x65: /* GS override */
823 c->override_base = &ctxt->gs_base;
824 break;
825 case 0x36: /* SS override */
826 c->override_base = &ctxt->ss_base;
827 break;
828 case 0x40 ... 0x4f: /* REX */
829 if (mode != X86EMUL_MODE_PROT64)
830 goto done_prefixes;
831 c->rex_prefix = c->b;
832 continue;
833 case 0xf0: /* LOCK */
834 c->lock_prefix = 1;
835 break;
836 case 0xf2: /* REPNE/REPNZ */
837 c->rep_prefix = REPNE_PREFIX;
838 break;
839 case 0xf3: /* REP/REPE/REPZ */
840 c->rep_prefix = REPE_PREFIX;
841 break;
842 default:
843 goto done_prefixes;
844 }
845
846 /* Any legacy prefix after a REX prefix nullifies its effect. */
847
848 c->rex_prefix = 0;
849 }
850
851done_prefixes:
852
853 /* REX prefix. */
854 if (c->rex_prefix)
855 if (c->rex_prefix & 8)
856 c->op_bytes = 8; /* REX.W */
857
858 /* Opcode byte(s). */
859 c->d = opcode_table[c->b];
860 if (c->d == 0) {
861 /* Two-byte opcode? */
862 if (c->b == 0x0f) {
863 c->twobyte = 1;
864 c->b = insn_fetch(u8, 1, c->eip);
865 c->d = twobyte_table[c->b];
866 }
867
868 /* Unrecognised? */
869 if (c->d == 0) {
870 DPRINTF("Cannot emulate %02x\n", c->b);
871 return -1;
872 }
873 }
874
875 if (mode == X86EMUL_MODE_PROT64 && (c->d & Stack))
876 c->op_bytes = 8;
877
878 /* ModRM and SIB bytes. */
879 if (c->d & ModRM)
880 rc = decode_modrm(ctxt, ops);
881 else if (c->d & MemAbs)
882 rc = decode_abs(ctxt, ops);
883 if (rc)
884 goto done;
885
886 if (!c->override_base)
887 c->override_base = &ctxt->ds_base;
888 if (mode == X86EMUL_MODE_PROT64 &&
889 c->override_base != &ctxt->fs_base &&
890 c->override_base != &ctxt->gs_base)
891 c->override_base = NULL;
892
893 if (c->override_base)
894 c->modrm_ea += *c->override_base;
895
896 if (c->ad_bytes != 8)
897 c->modrm_ea = (u32)c->modrm_ea;
898 /*
899 * Decode and fetch the source operand: register, memory
900 * or immediate.
901 */
902 switch (c->d & SrcMask) {
903 case SrcNone:
904 break;
905 case SrcReg:
906 decode_register_operand(&c->src, c, 0);
907 break;
908 case SrcMem16:
909 c->src.bytes = 2;
910 goto srcmem_common;
911 case SrcMem32:
912 c->src.bytes = 4;
913 goto srcmem_common;
914 case SrcMem:
915 c->src.bytes = (c->d & ByteOp) ? 1 :
916 c->op_bytes;
917 /* Don't fetch the address for invlpg: it could be unmapped. */
918 if (c->twobyte && c->b == 0x01 && c->modrm_reg == 7)
919 break;
920 srcmem_common:
921 /*
922 * For instructions with a ModR/M byte, switch to register
923 * access if Mod = 3.
924 */
925 if ((c->d & ModRM) && c->modrm_mod == 3) {
926 c->src.type = OP_REG;
927 break;
928 }
929 c->src.type = OP_MEM;
930 break;
931 case SrcImm:
932 c->src.type = OP_IMM;
933 c->src.ptr = (unsigned long *)c->eip;
934 c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
935 if (c->src.bytes == 8)
936 c->src.bytes = 4;
937 /* NB. Immediates are sign-extended as necessary. */
938 switch (c->src.bytes) {
939 case 1:
940 c->src.val = insn_fetch(s8, 1, c->eip);
941 break;
942 case 2:
943 c->src.val = insn_fetch(s16, 2, c->eip);
944 break;
945 case 4:
946 c->src.val = insn_fetch(s32, 4, c->eip);
947 break;
948 }
949 break;
950 case SrcImmByte:
951 c->src.type = OP_IMM;
952 c->src.ptr = (unsigned long *)c->eip;
953 c->src.bytes = 1;
954 c->src.val = insn_fetch(s8, 1, c->eip);
955 break;
956 }
957
958 /* Decode and fetch the destination operand: register or memory. */
959 switch (c->d & DstMask) {
960 case ImplicitOps:
961 /* Special instructions do their own operand decoding. */
962 return 0;
963 case DstReg:
964 decode_register_operand(&c->dst, c,
965 c->twobyte && (c->b == 0xb6 || c->b == 0xb7));
966 break;
967 case DstMem:
968 if ((c->d & ModRM) && c->modrm_mod == 3) {
969 c->dst.type = OP_REG;
970 break;
971 }
972 c->dst.type = OP_MEM;
973 break;
974 }
975
976done:
977 return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
978}
979
/*
 * emulate_push - stage a push of c->src.val onto the guest stack.
 *
 * No memory is written here.  The destination operand is set up as an
 * OP_MEM of c->op_bytes bytes carrying c->src.val, the shadow RSP is
 * adjusted by -c->op_bytes, and dst.ptr is pointed at the address formed
 * from ctxt->ss_base and the updated RSP.  The store itself is left to
 * writeback(), which writes OP_MEM destinations.
 *
 * NOTE(review): register_address_increment()/register_address() are macros
 * defined outside this block and appear to rely on a local named `c`;
 * confirm before renaming it.
 */
980static inline void emulate_push(struct x86_emulate_ctxt *ctxt)
981{
982 struct decode_cache *c = &ctxt->decode;
983
984 c->dst.type = OP_MEM;
985 c->dst.bytes = c->op_bytes;
986 c->dst.val = c->src.val;
 /* Move the stack pointer first so dst.ptr names the new top of stack. */
987 register_address_increment(c->regs[VCPU_REGS_RSP], -c->op_bytes);
988 c->dst.ptr = (void *) register_address(ctxt->ss_base,
989 c->regs[VCPU_REGS_RSP]);
990}
991
/*
 * emulate_grp1a - pop a value from the guest stack into c->dst.val.
 *
 * Reads c->dst.bytes bytes with ops->read_std() from the address formed
 * from ctxt->ss_base and the shadow RSP, then advances the shadow RSP by
 * c->dst.bytes.  The value is only placed in c->dst.val; storing it to the
 * destination operand is left to the caller's writeback step.
 *
 * Returns the non-zero code from ops->read_std() on failure (RSP is left
 * untouched in that case), 0 on success.
 */
992static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt,
993 struct x86_emulate_ops *ops)
994{
995 struct decode_cache *c = &ctxt->decode;
996 int rc;
997
998 rc = ops->read_std(register_address(ctxt->ss_base,
999 c->regs[VCPU_REGS_RSP]),
1000 &c->dst.val, c->dst.bytes, ctxt->vcpu);
1001 if (rc != 0)
1002 return rc;
1003
1004 register_address_increment(c->regs[VCPU_REGS_RSP], c->dst.bytes);
1005
1006 return 0;
1007}
1008
/*
 * emulate_grp2 - rotate/shift group, selected by c->modrm_reg.
 *
 * Applies the chosen operation to c->dst with c->src as the count operand
 * through emulate_2op_SrcB(), which is also handed ctxt->eflags.  Encodings
 * 4 and 6 both map to "sal".  There is no default case: a modrm_reg value
 * outside 0-7 leaves c->dst and the flags unchanged.
 *
 * The caller is responsible for loading c->src.val with the count (the
 * visible callers use the decoded source, the constant 1, or the shadow
 * RCX).
 */
1009static inline void emulate_grp2(struct x86_emulate_ctxt *ctxt)
1010{
1011 struct decode_cache *c = &ctxt->decode;
1012 switch (c->modrm_reg) {
1013 case 0: /* rol */
1014 emulate_2op_SrcB("rol", c->src, c->dst, ctxt->eflags);
1015 break;
1016 case 1: /* ror */
1017 emulate_2op_SrcB("ror", c->src, c->dst, ctxt->eflags);
1018 break;
1019 case 2: /* rcl */
1020 emulate_2op_SrcB("rcl", c->src, c->dst, ctxt->eflags);
1021 break;
1022 case 3: /* rcr */
1023 emulate_2op_SrcB("rcr", c->src, c->dst, ctxt->eflags);
1024 break;
1025 case 4: /* sal/shl */
1026 case 6: /* sal/shl */
1027 emulate_2op_SrcB("sal", c->src, c->dst, ctxt->eflags);
1028 break;
1029 case 5: /* shr */
1030 emulate_2op_SrcB("shr", c->src, c->dst, ctxt->eflags);
1031 break;
1032 case 7: /* sar */
1033 emulate_2op_SrcB("sar", c->src, c->dst, ctxt->eflags);
1034 break;
1035 }
1036}
1037
/*
 * emulate_grp3 - Grp3 unary group, selected by c->modrm_reg.
 *
 *   0, 1  test: fetches an immediate from the instruction stream at c->eip
 *         (1 byte for ByteOp, otherwise c->op_bytes with 8 clamped to 4)
 *         and runs "test" against c->dst.
 *   2     not:  bitwise complement of c->dst.val.
 *   3     neg:  via emulate_1op(), which is also handed ctxt->eflags.
 *   other       not handled.
 *
 * Returns 0 on success and X86EMUL_UNHANDLEABLE for the unhandled
 * encodings.
 *
 * NOTE(review): the `done` label has no visible goto and `ops` has no
 * visible use; both are presumably consumed by the insn_fetch() macro
 * (setting rc and jumping to done when the fetch fails) - confirm against
 * the macro definition before removing either.
 */
1038static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt,
1039 struct x86_emulate_ops *ops)
1040{
1041 struct decode_cache *c = &ctxt->decode;
1042 int rc = 0;
1043
1044 switch (c->modrm_reg) {
1045 case 0 ... 1: /* test */
1046 /*
1047 * Special case in Grp3: test has an immediate
1048 * source operand.
1049 */
1050 c->src.type = OP_IMM;
1051 c->src.ptr = (unsigned long *)c->eip;
1052 c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
 /* An 8-byte operand still takes a 4-byte immediate, fetched as s32. */
1053 if (c->src.bytes == 8)
1054 c->src.bytes = 4;
1055 switch (c->src.bytes) {
1056 case 1:
1057 c->src.val = insn_fetch(s8, 1, c->eip);
1058 break;
1059 case 2:
1060 c->src.val = insn_fetch(s16, 2, c->eip);
1061 break;
1062 case 4:
1063 c->src.val = insn_fetch(s32, 4, c->eip);
1064 break;
1065 }
1066 emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags);
1067 break;
1068 case 2: /* not */
1069 c->dst.val = ~c->dst.val;
1070 break;
1071 case 3: /* neg */
1072 emulate_1op("neg", c->dst, ctxt->eflags);
1073 break;
1074 default:
1075 DPRINTF("Cannot emulate %02x\n", c->b);
1076 rc = X86EMUL_UNHANDLEABLE;
1077 break;
1078 }
1079done:
1080 return rc;
1081}
1082
/*
 * emulate_grp45 - Grp4/Grp5 (opcodes 0xfe/0xff), selected by c->modrm_reg.
 *
 *   0  inc c->dst
 *   1  dec c->dst
 *   4  jmp abs: only for opcode 0xff; loads c->eip from c->dst.val
 *   6  push:    writes c->dst.val to the guest stack directly with
 *               ops->write_emulated() and then disables writeback
 *   other       not handled
 *
 * Returns 0 on success, X86EMUL_UNHANDLEABLE for unhandled encodings, or
 * the non-zero code from a failing ops->read_std()/ops->write_emulated().
 */
1083static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt,
1084 struct x86_emulate_ops *ops)
1085{
1086 struct decode_cache *c = &ctxt->decode;
1087 int rc;
1088
1089 switch (c->modrm_reg) {
1090 case 0: /* inc */
1091 emulate_1op("inc", c->dst, ctxt->eflags);
1092 break;
1093 case 1: /* dec */
1094 emulate_1op("dec", c->dst, ctxt->eflags);
1095 break;
1096 case 4: /* jmp abs */
1097 if (c->b == 0xff)
1098 c->eip = c->dst.val;
1099 else {
1100 DPRINTF("Cannot emulate %02x\n", c->b);
1101 return X86EMUL_UNHANDLEABLE;
1102 }
1103 break;
1104 case 6: /* push */
1105
1106 /* 64-bit mode: PUSH always pushes a 64-bit operand. */
1107
 /*
  * The operand is re-read at the full 8-byte width because the
  * value already in c->dst.val was sized by the caller.
  * NOTE(review): c->dst.ptr is used as a guest address here -
  * confirm this path is only reached with a memory operand.
  */
1108 if (ctxt->mode == X86EMUL_MODE_PROT64) {
1109 c->dst.bytes = 8;
1110 rc = ops->read_std((unsigned long)c->dst.ptr,
1111 &c->dst.val, 8, ctxt->vcpu);
1112 if (rc != 0)
1113 return rc;
1114 }
1115 register_address_increment(c->regs[VCPU_REGS_RSP],
1116 -c->dst.bytes);
1117 rc = ops->write_emulated(register_address(ctxt->ss_base,
1118 c->regs[VCPU_REGS_RSP]), &c->dst.val,
1119 c->dst.bytes, ctxt->vcpu);
1120 if (rc != 0)
1121 return rc;
 /* The stack store is done; keep writeback() from storing dst again. */
1122 c->dst.type = OP_NONE;
1123 break;
1124 default:
1125 DPRINTF("Cannot emulate %02x\n", c->b);
1126 return X86EMUL_UNHANDLEABLE;
1127 }
1128 return 0;
1129}
1130
/*
 * emulate_grp9 - Grp9 (cmpxchg8b) on the 8-byte operand at @memop.
 *
 * Reads the current 64-bit value with ops->read_emulated() and compares its
 * low and high halves with the low 32 bits of the shadow RAX and RDX.
 *
 *  - Mismatch: the halves of the value read are stored (zero-extended from
 *    32 bits) into the shadow RAX and RDX, and ZF is cleared.
 *  - Match: a new value is built from the shadow RCX (high half) and RBX
 *    (low half) and handed to ops->cmpxchg_emulated() together with the old
 *    value; ZF is set once that call succeeds.
 *
 * Returns 0 on success, or the non-zero code from the failing callback.
 */
1131static inline int emulate_grp9(struct x86_emulate_ctxt *ctxt,
1132 struct x86_emulate_ops *ops,
1133 unsigned long memop)
1134{
1135 struct decode_cache *c = &ctxt->decode;
1136 u64 old, new;
1137 int rc;
1138
1139 rc = ops->read_emulated(memop, &old, 8, ctxt->vcpu);
1140 if (rc != 0)
1141 return rc;
1142
1143 if (((u32) (old >> 0) != (u32) c->regs[VCPU_REGS_RAX]) ||
1144 ((u32) (old >> 32) != (u32) c->regs[VCPU_REGS_RDX])) {
1145
1146 c->regs[VCPU_REGS_RAX] = (u32) (old >> 0);
1147 c->regs[VCPU_REGS_RDX] = (u32) (old >> 32);
1148 ctxt->eflags &= ~EFLG_ZF;
1149
1150 } else {
1151 new = ((u64)c->regs[VCPU_REGS_RCX] << 32) |
1152 (u32) c->regs[VCPU_REGS_RBX];
1153
1154 rc = ops->cmpxchg_emulated(memop, &old, &new, 8, ctxt->vcpu);
1155 if (rc != 0)
1156 return rc;
1157 ctxt->eflags |= EFLG_ZF;
1158 }
1159 return 0;
1160}
1161
/*
 * writeback - commit the destination operand c->dst.
 *
 *   OP_REG   stores c->dst.val through c->dst.ptr at the operand width
 *            (1, 2, 4 or 8 bytes); other widths store nothing.
 *   OP_MEM   stores c->dst.bytes bytes at the address held in c->dst.ptr,
 *            using ops->cmpxchg_emulated() against c->dst.orig_val when
 *            c->lock_prefix is set and ops->write_emulated() otherwise.
 *   OP_NONE and any other type: nothing is written.
 *
 * Returns 0 on success, or the non-zero code from the memory callback.
 */
1162static inline int writeback(struct x86_emulate_ctxt *ctxt,
1163 struct x86_emulate_ops *ops)
1164{
1165 int rc;
1166 struct decode_cache *c = &ctxt->decode;
1167
1168 switch (c->dst.type) {
1169 case OP_REG:
1170 /* The 4-byte case *is* correct:
1171 * in 64-bit mode we zero-extend.
1172 */
1173 switch (c->dst.bytes) {
1174 case 1:
1175 *(u8 *)c->dst.ptr = (u8)c->dst.val;
1176 break;
1177 case 2:
1178 *(u16 *)c->dst.ptr = (u16)c->dst.val;
1179 break;
1180 case 4:
1181 *c->dst.ptr = (u32)c->dst.val;
1182 break; /* 64b: zero-ext */
1183 case 8:
1184 *c->dst.ptr = c->dst.val;
1185 break;
1186 }
1187 break;
1188 case OP_MEM:
 /* rc is assigned on both branches before it is tested below. */
1189 if (c->lock_prefix)
1190 rc = ops->cmpxchg_emulated(
1191 (unsigned long)c->dst.ptr,
1192 &c->dst.orig_val,
1193 &c->dst.val,
1194 c->dst.bytes,
1195 ctxt->vcpu);
1196 else
1197 rc = ops->write_emulated(
1198 (unsigned long)c->dst.ptr,
1199 &c->dst.val,
1200 c->dst.bytes,
1201 ctxt->vcpu);
1202 if (rc != 0)
1203 return rc;
1204 break;
1205 case OP_NONE:
1206 /* no writeback */
1207 break;
1208 default:
1209 break;
1210 }
1211 return 0;
1212}
1213
/*
 * x86_emulate_insn - execute the instruction held in ctxt->decode.
 *
 * Works on a shadow copy of the guest registers (c->regs).  The shadow
 * registers and c->eip are copied back to ctxt->vcpu->arch only after
 * writeback() has succeeded.
 *
 * Flow, as visible in this function:
 *  1. memop is taken from c->modrm_ea for ModRM (mod != 3) and MemAbs forms.
 *  2. REP-prefixed String instructions check their termination conditions,
 *     then decrement the shadow RCX and reset c->eip to arch.rip.
 *  3. A memory source is read with ops->read_emulated(); a memory
 *     destination is read too unless the instruction is flagged Mov.
 *     ImplicitOps instructions skip the destination setup.
 *  4. The opcode is dispatched through the one-byte switch, or through the
 *     two-byte switch at the twobyte_insn label, which jumps back to the
 *     writeback label.
 *
 * Returns -1, with c->eip restored to its value on entry, when rc is
 * X86EMUL_UNHANDLEABLE or the opcode reaches cannot_emulate.  Returns 0
 * otherwise, including for any other non-zero rc that jumped to done (the
 * register commit is skipped in that case).  The string PIO cases return
 * directly: -1 if kvm_emulate_pio_string() returns 0, otherwise 0.
 *
 * NOTE(review): insn_fetch() and JMP_REL() are macros defined outside this
 * block; insn_fetch() presumably sets rc and jumps to done when the fetch
 * fails - confirm before restructuring any of the control flow here.
 */
1214int
1215x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1216{
1217 unsigned long memop = 0;
1218 u64 msr_data;
1219 unsigned long saved_eip = 0;
1220 struct decode_cache *c = &ctxt->decode;
1221 int rc = 0;
1222
1223 /* Shadow copy of register state. Committed on successful emulation.
1224 * NOTE: we can copy them from vcpu as x86_decode_insn() doesn't
1225 * modify them.
1226 */
1227
1228 memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
1229 saved_eip = c->eip;
1230
1231 if (((c->d & ModRM) && (c->modrm_mod != 3)) || (c->d & MemAbs))
1232 memop = c->modrm_ea;
1233
1234 if (c->rep_prefix && (c->d & String)) {
1235 /* All REP prefixes have the same first termination condition */
1236 if (c->regs[VCPU_REGS_RCX] == 0) {
1237 ctxt->vcpu->arch.rip = c->eip;
1238 goto done;
1239 }
1240 /* The second termination condition only applies for REPE
1241 * and REPNE. Check whether the repeat prefix is REPE/REPZ
1242 * or REPNE/REPNZ and, if so, test the corresponding
1243 * termination condition:
1244 * - if REPE/REPZ and ZF = 0 then done
1245 * - if REPNE/REPNZ and ZF = 1 then done
1246 */
1247 if ((c->b == 0xa6) || (c->b == 0xa7) ||
1248 (c->b == 0xae) || (c->b == 0xaf)) {
1249 if ((c->rep_prefix == REPE_PREFIX) &&
1250 ((ctxt->eflags & EFLG_ZF) == 0)) {
1251 ctxt->vcpu->arch.rip = c->eip;
1252 goto done;
1253 }
1254 if ((c->rep_prefix == REPNE_PREFIX) &&
1255 ((ctxt->eflags & EFLG_ZF) == EFLG_ZF)) {
1256 ctxt->vcpu->arch.rip = c->eip;
1257 goto done;
1258 }
1259 }
 /*
  * One iteration is about to run: count it, and reset c->eip to
  * arch.rip so the rip committed after writeback is unchanged.
  */
1260 c->regs[VCPU_REGS_RCX]--;
1261 c->eip = ctxt->vcpu->arch.rip;
1262 }
1263
1264 if (c->src.type == OP_MEM) {
1265 c->src.ptr = (unsigned long *)memop;
1266 c->src.val = 0;
1267 rc = ops->read_emulated((unsigned long)c->src.ptr,
1268 &c->src.val,
1269 c->src.bytes,
1270 ctxt->vcpu);
1271 if (rc != 0)
1272 goto done;
1273 c->src.orig_val = c->src.val;
1274 }
1275
1276 if ((c->d & DstMask) == ImplicitOps)
1277 goto special_insn;
1278
1279
1280 if (c->dst.type == OP_MEM) {
1281 c->dst.ptr = (unsigned long *)memop;
1282 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1283 c->dst.val = 0;
 /*
  * Bit operations: advance dst.ptr by the byte offset of the
  * operand-sized word that contains the bit numbered src.val.
  */
1284 if (c->d & BitOp) {
1285 unsigned long mask = ~(c->dst.bytes * 8 - 1);
1286
1287 c->dst.ptr = (void *)c->dst.ptr +
1288 (c->src.val & mask) / 8;
1289 }
1290 if (!(c->d & Mov) &&
1291 /* optimisation - avoid slow emulated read */
1292 ((rc = ops->read_emulated((unsigned long)c->dst.ptr,
1293 &c->dst.val,
1294 c->dst.bytes, ctxt->vcpu)) != 0))
1295 goto done;
1296 }
1297 c->dst.orig_val = c->dst.val;
1298
1299special_insn:
1300
1301 if (c->twobyte)
1302 goto twobyte_insn;
1303
1304 switch (c->b) {
1305 case 0x00 ... 0x05:
1306 add: /* add */
1307 emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags);
1308 break;
1309 case 0x08 ... 0x0d:
1310 or: /* or */
1311 emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags);
1312 break;
1313 case 0x10 ... 0x15:
1314 adc: /* adc */
1315 emulate_2op_SrcV("adc", c->src, c->dst, ctxt->eflags);
1316 break;
1317 case 0x18 ... 0x1d:
1318 sbb: /* sbb */
1319 emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags);
1320 break;
1321 case 0x20 ... 0x23:
1322 and: /* and */
1323 emulate_2op_SrcV("and", c->src, c->dst, ctxt->eflags);
1324 break;
1325 case 0x24: /* and al imm8 */
1326 c->dst.type = OP_REG;
1327 c->dst.ptr = &c->regs[VCPU_REGS_RAX];
1328 c->dst.val = *(u8 *)c->dst.ptr;
1329 c->dst.bytes = 1;
1330 c->dst.orig_val = c->dst.val;
1331 goto and;
1332 case 0x25: /* and ax imm16, or eax imm32 */
1333 c->dst.type = OP_REG;
1334 c->dst.bytes = c->op_bytes;
1335 c->dst.ptr = &c->regs[VCPU_REGS_RAX];
 /*
  * NOTE(review): every width other than 2 loads only the low 32
  * bits of RAX, including an 8-byte operand size - confirm that
  * is intended.
  */
1336 if (c->op_bytes == 2)
1337 c->dst.val = *(u16 *)c->dst.ptr;
1338 else
1339 c->dst.val = *(u32 *)c->dst.ptr;
1340 c->dst.orig_val = c->dst.val;
1341 goto and;
1342 case 0x28 ... 0x2d:
1343 sub: /* sub */
1344 emulate_2op_SrcV("sub", c->src, c->dst, ctxt->eflags);
1345 break;
1346 case 0x30 ... 0x35:
1347 xor: /* xor */
1348 emulate_2op_SrcV("xor", c->src, c->dst, ctxt->eflags);
1349 break;
1350 case 0x38 ... 0x3d:
1351 cmp: /* cmp */
1352 emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags);
1353 break;
1354 case 0x40 ... 0x47: /* inc r16/r32 */
1355 emulate_1op("inc", c->dst, ctxt->eflags);
1356 break;
1357 case 0x48 ... 0x4f: /* dec r16/r32 */
1358 emulate_1op("dec", c->dst, ctxt->eflags);
1359 break;
1360 case 0x50 ... 0x57: /* push reg */
1361 c->dst.type = OP_MEM;
1362 c->dst.bytes = c->op_bytes;
1363 c->dst.val = c->src.val;
1364 register_address_increment(c->regs[VCPU_REGS_RSP],
1365 -c->op_bytes);
1366 c->dst.ptr = (void *) register_address(
1367 ctxt->ss_base, c->regs[VCPU_REGS_RSP]);
1368 break;
1369 case 0x58 ... 0x5f: /* pop reg */
 /*
  * Shared by pop reg, popf and ret: the stack value is read
  * straight into whatever c->dst.ptr points at.
  */
1370 pop_instruction:
1371 if ((rc = ops->read_std(register_address(ctxt->ss_base,
1372 c->regs[VCPU_REGS_RSP]), c->dst.ptr,
1373 c->op_bytes, ctxt->vcpu)) != 0)
1374 goto done;
1375
1376 register_address_increment(c->regs[VCPU_REGS_RSP],
1377 c->op_bytes);
1378 c->dst.type = OP_NONE; /* Disable writeback. */
1379 break;
1380 case 0x63: /* movsxd */
1381 if (ctxt->mode != X86EMUL_MODE_PROT64)
1382 goto cannot_emulate;
1383 c->dst.val = (s32) c->src.val;
1384 break;
1385 case 0x6a: /* push imm8 */
1386 c->src.val = 0L;
1387 c->src.val = insn_fetch(s8, 1, c->eip);
1388 emulate_push(ctxt);
1389 break;
1390 case 0x6c: /* insb */
1391 case 0x6d: /* insw/insd */
 /*
  * String PIO is handed to kvm_emulate_pio_string() and returns
  * from here directly, without the writeback/commit path below.
  */
1392 if (kvm_emulate_pio_string(ctxt->vcpu, NULL,
1393 1,
1394 (c->d & ByteOp) ? 1 : c->op_bytes,
1395 c->rep_prefix ?
1396 address_mask(c->regs[VCPU_REGS_RCX]) : 1,
1397 (ctxt->eflags & EFLG_DF),
1398 register_address(ctxt->es_base,
1399 c->regs[VCPU_REGS_RDI]),
1400 c->rep_prefix,
1401 c->regs[VCPU_REGS_RDX]) == 0) {
1402 c->eip = saved_eip;
1403 return -1;
1404 }
1405 return 0;
1406 case 0x6e: /* outsb */
1407 case 0x6f: /* outsw/outsd */
1408 if (kvm_emulate_pio_string(ctxt->vcpu, NULL,
1409 0,
1410 (c->d & ByteOp) ? 1 : c->op_bytes,
1411 c->rep_prefix ?
1412 address_mask(c->regs[VCPU_REGS_RCX]) : 1,
1413 (ctxt->eflags & EFLG_DF),
1414 register_address(c->override_base ?
1415 *c->override_base :
1416 ctxt->ds_base,
1417 c->regs[VCPU_REGS_RSI]),
1418 c->rep_prefix,
1419 c->regs[VCPU_REGS_RDX]) == 0) {
1420 c->eip = saved_eip;
1421 return -1;
1422 }
1423 return 0;
1424 case 0x70 ... 0x7f: /* jcc (short) */ {
1425 int rel = insn_fetch(s8, 1, c->eip);
1426
1427 if (test_cc(c->b, ctxt->eflags))
1428 JMP_REL(rel);
1429 break;
1430 }
1431 case 0x80 ... 0x83: /* Grp1 */
1432 switch (c->modrm_reg) {
1433 case 0:
1434 goto add;
1435 case 1:
1436 goto or;
1437 case 2:
1438 goto adc;
1439 case 3:
1440 goto sbb;
1441 case 4:
1442 goto and;
1443 case 5:
1444 goto sub;
1445 case 6:
1446 goto xor;
1447 case 7:
1448 goto cmp;
1449 }
1450 break;
1451 case 0x84 ... 0x85:
1452 emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags);
1453 break;
1454 case 0x86 ... 0x87: /* xchg */
1455 /* Write back the register source. */
1456 switch (c->dst.bytes) {
1457 case 1:
1458 *(u8 *) c->src.ptr = (u8) c->dst.val;
1459 break;
1460 case 2:
1461 *(u16 *) c->src.ptr = (u16) c->dst.val;
1462 break;
1463 case 4:
1464 *c->src.ptr = (u32) c->dst.val;
1465 break; /* 64b reg: zero-extend */
1466 case 8:
1467 *c->src.ptr = c->dst.val;
1468 break;
1469 }
1470 /*
1471 * Write back the memory destination with implicit LOCK
1472 * prefix.
1473 */
1474 c->dst.val = c->src.val;
1475 c->lock_prefix = 1;
1476 break;
1477 case 0x88 ... 0x8b: /* mov */
1478 goto mov;
1479 case 0x8d: /* lea r16/r32, m */
1480 c->dst.val = c->modrm_val;
1481 break;
1482 case 0x8f: /* pop (sole member of Grp1a) */
1483 rc = emulate_grp1a(ctxt, ops);
1484 if (rc != 0)
1485 goto done;
1486 break;
1487 case 0x9c: /* pushf */
1488 c->src.val = (unsigned long) ctxt->eflags;
1489 emulate_push(ctxt);
1490 break;
1491 case 0x9d: /* popf */
1492 c->dst.ptr = (unsigned long *) &ctxt->eflags;
1493 goto pop_instruction;
1494 case 0xa0 ... 0xa1: /* mov */
1495 c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX];
1496 c->dst.val = c->src.val;
1497 break;
1498 case 0xa2 ... 0xa3: /* mov */
1499 c->dst.val = (unsigned long)c->regs[VCPU_REGS_RAX];
1500 break;
1501 case 0xa4 ... 0xa5: /* movs */
 /*
  * The source is read here; the store to the ES:RDI destination
  * is left to writeback() through the OP_MEM dst set up below.
  */
1502 c->dst.type = OP_MEM;
1503 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1504 c->dst.ptr = (unsigned long *)register_address(
1505 ctxt->es_base,
1506 c->regs[VCPU_REGS_RDI]);
1507 if ((rc = ops->read_emulated(register_address(
1508 c->override_base ? *c->override_base :
1509 ctxt->ds_base,
1510 c->regs[VCPU_REGS_RSI]),
1511 &c->dst.val,
1512 c->dst.bytes, ctxt->vcpu)) != 0)
1513 goto done;
1514 register_address_increment(c->regs[VCPU_REGS_RSI],
1515 (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
1516 : c->dst.bytes);
1517 register_address_increment(c->regs[VCPU_REGS_RDI],
1518 (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
1519 : c->dst.bytes);
1520 break;
1521 case 0xa6 ... 0xa7: /* cmps */
1522 c->src.type = OP_NONE; /* Disable writeback. */
1523 c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1524 c->src.ptr = (unsigned long *)register_address(
1525 c->override_base ? *c->override_base :
1526 ctxt->ds_base,
1527 c->regs[VCPU_REGS_RSI]);
1528 if ((rc = ops->read_emulated((unsigned long)c->src.ptr,
1529 &c->src.val,
1530 c->src.bytes,
1531 ctxt->vcpu)) != 0)
1532 goto done;
1533
1534 c->dst.type = OP_NONE; /* Disable writeback. */
1535 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1536 c->dst.ptr = (unsigned long *)register_address(
1537 ctxt->es_base,
1538 c->regs[VCPU_REGS_RDI]);
1539 if ((rc = ops->read_emulated((unsigned long)c->dst.ptr,
1540 &c->dst.val,
1541 c->dst.bytes,
1542 ctxt->vcpu)) != 0)
1543 goto done;
1544
1545 DPRINTF("cmps: mem1=0x%p mem2=0x%p\n", c->src.ptr, c->dst.ptr);
1546
1547 emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags);
1548
1549 register_address_increment(c->regs[VCPU_REGS_RSI],
1550 (ctxt->eflags & EFLG_DF) ? -c->src.bytes
1551 : c->src.bytes);
1552 register_address_increment(c->regs[VCPU_REGS_RDI],
1553 (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
1554 : c->dst.bytes);
1555
1556 break;
1557 case 0xaa ... 0xab: /* stos */
1558 c->dst.type = OP_MEM;
1559 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1560 c->dst.ptr = (unsigned long *)register_address(
1561 ctxt->es_base,
1562 c->regs[VCPU_REGS_RDI]);
1563 c->dst.val = c->regs[VCPU_REGS_RAX];
1564 register_address_increment(c->regs[VCPU_REGS_RDI],
1565 (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
1566 : c->dst.bytes);
1567 break;
1568 case 0xac ... 0xad: /* lods */
1569 c->dst.type = OP_REG;
1570 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1571 c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX];
1572 if ((rc = ops->read_emulated(register_address(
1573 c->override_base ? *c->override_base :
1574 ctxt->ds_base,
1575 c->regs[VCPU_REGS_RSI]),
1576 &c->dst.val,
1577 c->dst.bytes,
1578 ctxt->vcpu)) != 0)
1579 goto done;
1580 register_address_increment(c->regs[VCPU_REGS_RSI],
1581 (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
1582 : c->dst.bytes);
1583 break;
1584 case 0xae ... 0xaf: /* scas */
1585 DPRINTF("Urk! I don't handle SCAS.\n");
1586 goto cannot_emulate;
1587 case 0xc0 ... 0xc1:
1588 emulate_grp2(ctxt);
1589 break;
1590 case 0xc3: /* ret */
1591 c->dst.ptr = &c->eip;
1592 goto pop_instruction;
1593 case 0xc6 ... 0xc7: /* mov (sole member of Grp11) */
1594 mov:
1595 c->dst.val = c->src.val;
1596 break;
1597 case 0xd0 ... 0xd1: /* Grp2 */
1598 c->src.val = 1;
1599 emulate_grp2(ctxt);
1600 break;
1601 case 0xd2 ... 0xd3: /* Grp2 */
1602 c->src.val = c->regs[VCPU_REGS_RCX];
1603 emulate_grp2(ctxt);
1604 break;
1605 case 0xe8: /* call (near) */ {
1606 long int rel;
1607 switch (c->op_bytes) {
1608 case 2:
1609 rel = insn_fetch(s16, 2, c->eip);
1610 break;
1611 case 4:
1612 rel = insn_fetch(s32, 4, c->eip);
1613 break;
1614 default:
1615 DPRINTF("Call: Invalid op_bytes\n");
1616 goto cannot_emulate;
1617 }
 /*
  * The return address is the eip after the displacement; it is
  * captured before the jump and pushed with the address width.
  */
1618 c->src.val = (unsigned long) c->eip;
1619 JMP_REL(rel);
1620 c->op_bytes = c->ad_bytes;
1621 emulate_push(ctxt);
1622 break;
1623 }
1624 case 0xe9: /* jmp rel */
1625 case 0xeb: /* jmp rel short */
1626 JMP_REL(c->src.val);
1627 c->dst.type = OP_NONE; /* Disable writeback. */
1628 break;
1629 case 0xf4: /* hlt */
 /*
  * NOTE(review): jumps straight to done, so neither writeback
  * nor the register/rip commit runs for hlt - confirm the
  * caller advances rip.
  */
1630 ctxt->vcpu->arch.halt_request = 1;
1631 goto done;
1632 case 0xf5: /* cmc */
1633 /* complement carry flag from eflags reg */
1634 ctxt->eflags ^= EFLG_CF;
1635 c->dst.type = OP_NONE; /* Disable writeback. */
1636 break;
1637 case 0xf6 ... 0xf7: /* Grp3 */
1638 rc = emulate_grp3(ctxt, ops);
1639 if (rc != 0)
1640 goto done;
1641 break;
1642 case 0xf8: /* clc */
1643 ctxt->eflags &= ~EFLG_CF;
1644 c->dst.type = OP_NONE; /* Disable writeback. */
1645 break;
1646 case 0xfa: /* cli */
1647 ctxt->eflags &= ~X86_EFLAGS_IF;
1648 c->dst.type = OP_NONE; /* Disable writeback. */
1649 break;
1650 case 0xfb: /* sti */
1651 ctxt->eflags |= X86_EFLAGS_IF;
1652 c->dst.type = OP_NONE; /* Disable writeback. */
1653 break;
1654 case 0xfe ... 0xff: /* Grp4/Grp5 */
1655 rc = emulate_grp45(ctxt, ops);
1656 if (rc != 0)
1657 goto done;
1658 break;
1659 }
1660
1661writeback:
1662 rc = writeback(ctxt, ops);
1663 if (rc != 0)
1664 goto done;
1665
1666 /* Commit shadow register state. */
1667 memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs);
1668 ctxt->vcpu->arch.rip = c->eip;
1669
1670done:
 /*
  * Only X86EMUL_UNHANDLEABLE is reported as a failure; any other rc,
  * zero or not, returns 0 from here.
  */
1671 if (rc == X86EMUL_UNHANDLEABLE) {
1672 c->eip = saved_eip;
1673 return -1;
1674 }
1675 return 0;
1676
 /*
  * Two-byte (0x0f-prefixed) opcodes.  This code sits after the return
  * above and is reached only through the goto at special_insn; it ends
  * by jumping back to the writeback label.
  */
1677twobyte_insn:
1678 switch (c->b) {
1679 case 0x01: /* lgdt, lidt, lmsw */
1680 switch (c->modrm_reg) {
1681 u16 size;
1682 unsigned long address;
1683
1684 case 0: /* vmcall */
1685 if (c->modrm_mod != 3 || c->modrm_rm != 1)
1686 goto cannot_emulate;
1687
1688 rc = kvm_fix_hypercall(ctxt->vcpu);
1689 if (rc)
1690 goto done;
1691
1692 kvm_emulate_hypercall(ctxt->vcpu);
1693 break;
1694 case 2: /* lgdt */
1695 rc = read_descriptor(ctxt, ops, c->src.ptr,
1696 &size, &address, c->op_bytes);
1697 if (rc)
1698 goto done;
1699 realmode_lgdt(ctxt->vcpu, size, address);
1700 break;
1701 case 3: /* lidt/vmmcall */
1702 if (c->modrm_mod == 3 && c->modrm_rm == 1) {
1703 rc = kvm_fix_hypercall(ctxt->vcpu);
1704 if (rc)
1705 goto done;
1706 kvm_emulate_hypercall(ctxt->vcpu);
1707 } else {
1708 rc = read_descriptor(ctxt, ops, c->src.ptr,
1709 &size, &address,
1710 c->op_bytes);
1711 if (rc)
1712 goto done;
1713 realmode_lidt(ctxt->vcpu, size, address);
1714 }
1715 break;
1716 case 4: /* smsw */
1717 if (c->modrm_mod != 3)
1718 goto cannot_emulate;
1719 *(u16 *)&c->regs[c->modrm_rm]
1720 = realmode_get_cr(ctxt->vcpu, 0);
1721 break;
1722 case 6: /* lmsw */
1723 if (c->modrm_mod != 3)
1724 goto cannot_emulate;
1725 realmode_lmsw(ctxt->vcpu, (u16)c->modrm_val,
1726 &ctxt->eflags);
1727 break;
1728 case 7: /* invlpg*/
1729 emulate_invlpg(ctxt->vcpu, memop);
1730 break;
1731 default:
1732 goto cannot_emulate;
1733 }
1734 /* Disable writeback. */
1735 c->dst.type = OP_NONE;
1736 break;
1737 case 0x06:
1738 emulate_clts(ctxt->vcpu);
1739 c->dst.type = OP_NONE;
1740 break;
1741 case 0x08: /* invd */
1742 case 0x09: /* wbinvd */
1743 case 0x0d: /* GrpP (prefetch) */
1744 case 0x18: /* Grp16 (prefetch/nop) */
1745 c->dst.type = OP_NONE;
1746 break;
1747 case 0x20: /* mov cr, reg */
1748 if (c->modrm_mod != 3)
1749 goto cannot_emulate;
1750 c->regs[c->modrm_rm] =
1751 realmode_get_cr(ctxt->vcpu, c->modrm_reg);
1752 c->dst.type = OP_NONE; /* no writeback */
1753 break;
1754 case 0x21: /* mov from dr to reg */
1755 if (c->modrm_mod != 3)
1756 goto cannot_emulate;
1757 rc = emulator_get_dr(ctxt, c->modrm_reg, &c->regs[c->modrm_rm]);
1758 if (rc)
1759 goto cannot_emulate;
1760 c->dst.type = OP_NONE; /* no writeback */
1761 break;
1762 case 0x22: /* mov reg, cr */
1763 if (c->modrm_mod != 3)
1764 goto cannot_emulate;
1765 realmode_set_cr(ctxt->vcpu,
1766 c->modrm_reg, c->modrm_val, &ctxt->eflags);
1767 c->dst.type = OP_NONE;
1768 break;
1769 case 0x23: /* mov from reg to dr */
1770 if (c->modrm_mod != 3)
1771 goto cannot_emulate;
1772 rc = emulator_set_dr(ctxt, c->modrm_reg,
1773 c->regs[c->modrm_rm]);
1774 if (rc)
1775 goto cannot_emulate;
1776 c->dst.type = OP_NONE; /* no writeback */
1777 break;
1778 case 0x30:
1779 /* wrmsr */
1780 msr_data = (u32)c->regs[VCPU_REGS_RAX]
1781 | ((u64)c->regs[VCPU_REGS_RDX] << 32);
1782 rc = kvm_set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data);
 /*
  * A failed MSR access injects #GP and resets c->eip to arch.rip;
  * rc is then forced to X86EMUL_CONTINUE either way.
  */
1783 if (rc) {
1784 kvm_inject_gp(ctxt->vcpu, 0);
1785 c->eip = ctxt->vcpu->arch.rip;
1786 }
1787 rc = X86EMUL_CONTINUE;
1788 c->dst.type = OP_NONE;
1789 break;
1790 case 0x32:
1791 /* rdmsr */
1792 rc = kvm_get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data);
1793 if (rc) {
1794 kvm_inject_gp(ctxt->vcpu, 0);
1795 c->eip = ctxt->vcpu->arch.rip;
1796 } else {
1797 c->regs[VCPU_REGS_RAX] = (u32)msr_data;
1798 c->regs[VCPU_REGS_RDX] = msr_data >> 32;
1799 }
1800 rc = X86EMUL_CONTINUE;
1801 c->dst.type = OP_NONE;
1802 break;
1803 case 0x40 ... 0x4f: /* cmov */
1804 c->dst.val = c->dst.orig_val = c->src.val;
1805 if (!test_cc(c->b, ctxt->eflags))
1806 c->dst.type = OP_NONE; /* no writeback */
1807 break;
1808 case 0x80 ... 0x8f: /* jnz rel, etc*/ {
1809 long int rel;
1810
 /*
  * NOTE(review): the 8-byte case fetches a 64-bit displacement;
  * near Jcc in 64-bit mode presumably still encodes a 32-bit
  * one - confirm against the instruction set reference.
  */
1811 switch (c->op_bytes) {
1812 case 2:
1813 rel = insn_fetch(s16, 2, c->eip);
1814 break;
1815 case 4:
1816 rel = insn_fetch(s32, 4, c->eip);
1817 break;
1818 case 8:
1819 rel = insn_fetch(s64, 8, c->eip);
1820 break;
1821 default:
1822 DPRINTF("jnz: Invalid op_bytes\n");
1823 goto cannot_emulate;
1824 }
1825 if (test_cc(c->b, ctxt->eflags))
1826 JMP_REL(rel);
1827 c->dst.type = OP_NONE;
1828 break;
1829 }
1830 case 0xa3:
1831 bt: /* bt */
1832 c->dst.type = OP_NONE;
1833 /* only subword offset */
1834 c->src.val &= (c->dst.bytes << 3) - 1;
1835 emulate_2op_SrcV_nobyte("bt", c->src, c->dst, ctxt->eflags);
1836 break;
1837 case 0xab:
1838 bts: /* bts */
1839 /* only subword offset */
1840 c->src.val &= (c->dst.bytes << 3) - 1;
1841 emulate_2op_SrcV_nobyte("bts", c->src, c->dst, ctxt->eflags);
1842 break;
1843 case 0xb0 ... 0xb1: /* cmpxchg */
1844 /*
1845 * Save real source value, then compare EAX against
1846 * destination.
1847 */
1848 c->src.orig_val = c->src.val;
1849 c->src.val = c->regs[VCPU_REGS_RAX];
1850 emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags);
1851 if (ctxt->eflags & EFLG_ZF) {
1852 /* Success: write back to memory. */
1853 c->dst.val = c->src.orig_val;
1854 } else {
1855 /* Failure: write the value we saw to EAX. */
1856 c->dst.type = OP_REG;
1857 c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX];
1858 }
1859 break;
1860 case 0xb3:
1861 btr: /* btr */
1862 /* only subword offset */
1863 c->src.val &= (c->dst.bytes << 3) - 1;
1864 emulate_2op_SrcV_nobyte("btr", c->src, c->dst, ctxt->eflags);
1865 break;
1866 case 0xb6 ... 0xb7: /* movzx */
1867 c->dst.bytes = c->op_bytes;
1868 c->dst.val = (c->d & ByteOp) ? (u8) c->src.val
1869 : (u16) c->src.val;
1870 break;
1871 case 0xba: /* Grp8 */
 /* Only the low two bits of modrm_reg pick the bit operation. */
1872 switch (c->modrm_reg & 3) {
1873 case 0:
1874 goto bt;
1875 case 1:
1876 goto bts;
1877 case 2:
1878 goto btr;
1879 case 3:
1880 goto btc;
1881 }
1882 break;
1883 case 0xbb:
1884 btc: /* btc */
1885 /* only subword offset */
1886 c->src.val &= (c->dst.bytes << 3) - 1;
1887 emulate_2op_SrcV_nobyte("btc", c->src, c->dst, ctxt->eflags);
1888 break;
1889 case 0xbe ... 0xbf: /* movsx */
1890 c->dst.bytes = c->op_bytes;
1891 c->dst.val = (c->d & ByteOp) ? (s8) c->src.val :
1892 (s16) c->src.val;
1893 break;
1894 case 0xc3: /* movnti */
1895 c->dst.bytes = c->op_bytes;
1896 c->dst.val = (c->op_bytes == 4) ? (u32) c->src.val :
1897 (u64) c->src.val;
1898 break;
1899 case 0xc7: /* Grp9 (cmpxchg8b) */
1900 rc = emulate_grp9(ctxt, ops, memop);
1901 if (rc != 0)
1902 goto done;
1903 c->dst.type = OP_NONE;
1904 break;
1905 }
1906 goto writeback;
1907
1908cannot_emulate:
1909 DPRINTF("Cannot emulate %02x\n", c->b);
1910 c->eip = saved_eip;
1911 return -1;
1912}