author     Avi Kivity <avi@qumranet.com>   2007-12-16 04:02:48 -0500
committer  Avi Kivity <avi@qumranet.com>   2008-01-30 11:01:18 -0500
commit     edf884172e9828c6234b254208af04655855038d (patch)
tree       f5e5d1eecaed9737eced6ba60d09fe93149751c1 /drivers
parent     9584bf2c93f56656dba0de8f6c75b54ca7995143 (diff)
KVM: Move arch dependent files to new directory arch/x86/kvm/

This paves the way for multiple architecture support. Note that while
ioapic.c could potentially be shared with ia64, it is also moved.

Signed-off-by: Avi Kivity <avi@qumranet.com>
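Because the diffstat below is limited to 'drivers', only the removal side of the move is visible on this page; the arch-side hookup lands in companion changes outside this view. As a rough, illustrative sketch only (not the literal lines of those companion changes), a Kbuild subdirectory such as the new arch/x86/kvm/ is typically wired up like this:

    # Parent Kconfig (illustrative counterpart of the deleted
    # source "drivers/kvm/Kconfig" hunk below)
    source "arch/x86/kvm/Kconfig"

    # Parent Makefile (illustrative counterpart of the deleted
    # drivers/Makefile line)
    obj-$(CONFIG_KVM) += kvm/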
Diffstat (limited to 'drivers')
-rw-r--r--  drivers/Kconfig                   |    2
-rw-r--r--  drivers/Makefile                  |    1
-rw-r--r--  drivers/kvm/Kconfig               |   57
-rw-r--r--  drivers/kvm/Makefile              |   10
-rw-r--r--  drivers/kvm/i8259.c               |  449
-rw-r--r--  drivers/kvm/ioapic.c              |  402
-rw-r--r--  drivers/kvm/iodev.h               |    2
-rw-r--r--  drivers/kvm/irq.c                 |   99
-rw-r--r--  drivers/kvm/irq.h                 |  196
-rw-r--r--  drivers/kvm/kvm.h                 |  289
-rw-r--r--  drivers/kvm/kvm_main.c            |    2
-rw-r--r--  drivers/kvm/kvm_svm.h             |   45
-rw-r--r--  drivers/kvm/lapic.c               | 1087
-rw-r--r--  drivers/kvm/mmu.c                 | 1806
-rw-r--r--  drivers/kvm/mmu.h                 |   44
-rw-r--r--  drivers/kvm/paging_tmpl.h         |  461
-rw-r--r--  drivers/kvm/segment_descriptor.h  |   29
-rw-r--r--  drivers/kvm/svm.c                 | 1725
-rw-r--r--  drivers/kvm/svm.h                 |  325
-rw-r--r--  drivers/kvm/types.h               |   54
-rw-r--r--  drivers/kvm/vmx.c                 | 2673
-rw-r--r--  drivers/kvm/vmx.h                 |  324
-rw-r--r--  drivers/kvm/x86.c                 | 3148
-rw-r--r--  drivers/kvm/x86.h                 |  602
-rw-r--r--  drivers/kvm/x86_emulate.c         | 1913
-rw-r--r--  drivers/kvm/x86_emulate.h         |  186
26 files changed, 2 insertions(+), 15929 deletions(-)
diff --git a/drivers/Kconfig b/drivers/Kconfig
index f4076d9e9902..08d4ae201597 100644
--- a/drivers/Kconfig
+++ b/drivers/Kconfig
@@ -90,8 +90,6 @@ source "drivers/dca/Kconfig"
 
 source "drivers/auxdisplay/Kconfig"
 
-source "drivers/kvm/Kconfig"
-
 source "drivers/uio/Kconfig"
 
 source "drivers/virtio/Kconfig"
diff --git a/drivers/Makefile b/drivers/Makefile
index d92d4d82d001..9e1f808e43cf 100644
--- a/drivers/Makefile
+++ b/drivers/Makefile
@@ -47,7 +47,6 @@ obj-$(CONFIG_SPI) += spi/
 obj-$(CONFIG_PCCARD)		+= pcmcia/
 obj-$(CONFIG_DIO)		+= dio/
 obj-$(CONFIG_SBUS)		+= sbus/
-obj-$(CONFIG_KVM)		+= kvm/
 obj-$(CONFIG_ZORRO)		+= zorro/
 obj-$(CONFIG_MAC)		+= macintosh/
 obj-$(CONFIG_ATA_OVER_ETH)	+= block/aoe/
diff --git a/drivers/kvm/Kconfig b/drivers/kvm/Kconfig
deleted file mode 100644
index c83e1c9b5129..000000000000
--- a/drivers/kvm/Kconfig
+++ /dev/null
@@ -1,57 +0,0 @@
1#
2# KVM configuration
3#
4config HAVE_KVM
5 bool
6
7menuconfig VIRTUALIZATION
8 bool "Virtualization"
9 depends on HAVE_KVM || X86
10 default y
11 ---help---
12 Say Y here to get to see options for using your Linux host to run other
13 operating systems inside virtual machines (guests).
14 This option alone does not add any kernel code.
15
16 If you say N, all options in this submenu will be skipped and disabled.
17
18if VIRTUALIZATION
19
20config KVM
21 tristate "Kernel-based Virtual Machine (KVM) support"
22 depends on HAVE_KVM && EXPERIMENTAL
23 select PREEMPT_NOTIFIERS
24 select ANON_INODES
25 ---help---
26 Support hosting fully virtualized guest machines using hardware
27 virtualization extensions. You will need a fairly recent
28 processor equipped with virtualization extensions. You will also
29 need to select one or more of the processor modules below.
30
31 This module provides access to the hardware capabilities through
32 a character device node named /dev/kvm.
33
34 To compile this as a module, choose M here: the module
35 will be called kvm.
36
37 If unsure, say N.
38
39config KVM_INTEL
40 tristate "KVM for Intel processors support"
41 depends on KVM
42 ---help---
43 Provides support for KVM on Intel processors equipped with the VT
44 extensions.
45
46config KVM_AMD
47 tristate "KVM for AMD processors support"
48 depends on KVM
49 ---help---
50 Provides support for KVM on AMD processors equipped with the AMD-V
51 (SVM) extensions.
52
53# OK, it's a little counter-intuitive to do this, but it puts it neatly under
54# the virtualization menu.
55source drivers/lguest/Kconfig
56
57endif # VIRTUALIZATION
diff --git a/drivers/kvm/Makefile b/drivers/kvm/Makefile
deleted file mode 100644
index cf18ad46e987..000000000000
--- a/drivers/kvm/Makefile
+++ /dev/null
@@ -1,10 +0,0 @@
1#
2# Makefile for Kernel-based Virtual Machine module
3#
4
5kvm-objs := kvm_main.o x86.o mmu.o x86_emulate.o i8259.o irq.o lapic.o ioapic.o
6obj-$(CONFIG_KVM) += kvm.o
7kvm-intel-objs = vmx.o
8obj-$(CONFIG_KVM_INTEL) += kvm-intel.o
9kvm-amd-objs = svm.o
10obj-$(CONFIG_KVM_AMD) += kvm-amd.o
diff --git a/drivers/kvm/i8259.c b/drivers/kvm/i8259.c
deleted file mode 100644
index b3cad632f3d5..000000000000
--- a/drivers/kvm/i8259.c
+++ /dev/null
@@ -1,449 +0,0 @@
1/*
2 * 8259 interrupt controller emulation
3 *
4 * Copyright (c) 2003-2004 Fabrice Bellard
5 * Copyright (c) 2007 Intel Corporation
6 *
7 * Permission is hereby granted, free of charge, to any person obtaining a copy
8 * of this software and associated documentation files (the "Software"), to deal
9 * in the Software without restriction, including without limitation the rights
10 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11 * copies of the Software, and to permit persons to whom the Software is
12 * furnished to do so, subject to the following conditions:
13 *
14 * The above copyright notice and this permission notice shall be included in
15 * all copies or substantial portions of the Software.
16 *
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
23 * THE SOFTWARE.
24 * Authors:
25 * Yaozu (Eddie) Dong <Eddie.dong@intel.com>
26 * Port from Qemu.
27 */
28#include <linux/mm.h>
29#include "irq.h"
30#include "kvm.h"
31
32/*
33 * set irq level. If an edge is detected, then the IRR is set to 1
34 */
35static inline void pic_set_irq1(struct kvm_kpic_state *s, int irq, int level)
36{
37 int mask;
38 mask = 1 << irq;
39 if (s->elcr & mask) /* level triggered */
40 if (level) {
41 s->irr |= mask;
42 s->last_irr |= mask;
43 } else {
44 s->irr &= ~mask;
45 s->last_irr &= ~mask;
46 }
47 else /* edge triggered */
48 if (level) {
49 if ((s->last_irr & mask) == 0)
50 s->irr |= mask;
51 s->last_irr |= mask;
52 } else
53 s->last_irr &= ~mask;
54}
55
56/*
57 * return the highest priority found in mask (highest = smallest
58 * number). Return 8 if no irq
59 */
60static inline int get_priority(struct kvm_kpic_state *s, int mask)
61{
62 int priority;
63 if (mask == 0)
64 return 8;
65 priority = 0;
66 while ((mask & (1 << ((priority + s->priority_add) & 7))) == 0)
67 priority++;
68 return priority;
69}
70
71/*
72 * return the pic wanted interrupt. return -1 if none
73 */
74static int pic_get_irq(struct kvm_kpic_state *s)
75{
76 int mask, cur_priority, priority;
77
78 mask = s->irr & ~s->imr;
79 priority = get_priority(s, mask);
80 if (priority == 8)
81 return -1;
82 /*
83 * compute current priority. If special fully nested mode on the
84 * master, the IRQ coming from the slave is not taken into account
85 * for the priority computation.
86 */
87 mask = s->isr;
88 if (s->special_fully_nested_mode && s == &s->pics_state->pics[0])
89 mask &= ~(1 << 2);
90 cur_priority = get_priority(s, mask);
91 if (priority < cur_priority)
92 /*
93 * higher priority found: an irq should be generated
94 */
95 return (priority + s->priority_add) & 7;
96 else
97 return -1;
98}
99
100/*
101 * raise irq to CPU if necessary. must be called every time the active
102 * irq may change
103 */
104static void pic_update_irq(struct kvm_pic *s)
105{
106 int irq2, irq;
107
108 irq2 = pic_get_irq(&s->pics[1]);
109 if (irq2 >= 0) {
110 /*
111 * if irq request by slave pic, signal master PIC
112 */
113 pic_set_irq1(&s->pics[0], 2, 1);
114 pic_set_irq1(&s->pics[0], 2, 0);
115 }
116 irq = pic_get_irq(&s->pics[0]);
117 if (irq >= 0)
118 s->irq_request(s->irq_request_opaque, 1);
119 else
120 s->irq_request(s->irq_request_opaque, 0);
121}
122
123void kvm_pic_update_irq(struct kvm_pic *s)
124{
125 pic_update_irq(s);
126}
127
128void kvm_pic_set_irq(void *opaque, int irq, int level)
129{
130 struct kvm_pic *s = opaque;
131
132 pic_set_irq1(&s->pics[irq >> 3], irq & 7, level);
133 pic_update_irq(s);
134}
135
136/*
137 * acknowledge interrupt 'irq'
138 */
139static inline void pic_intack(struct kvm_kpic_state *s, int irq)
140{
141 if (s->auto_eoi) {
142 if (s->rotate_on_auto_eoi)
143 s->priority_add = (irq + 1) & 7;
144 } else
145 s->isr |= (1 << irq);
146 /*
147 * We don't clear a level sensitive interrupt here
148 */
149 if (!(s->elcr & (1 << irq)))
150 s->irr &= ~(1 << irq);
151}
152
153int kvm_pic_read_irq(struct kvm_pic *s)
154{
155 int irq, irq2, intno;
156
157 irq = pic_get_irq(&s->pics[0]);
158 if (irq >= 0) {
159 pic_intack(&s->pics[0], irq);
160 if (irq == 2) {
161 irq2 = pic_get_irq(&s->pics[1]);
162 if (irq2 >= 0)
163 pic_intack(&s->pics[1], irq2);
164 else
165 /*
166 * spurious IRQ on slave controller
167 */
168 irq2 = 7;
169 intno = s->pics[1].irq_base + irq2;
170 irq = irq2 + 8;
171 } else
172 intno = s->pics[0].irq_base + irq;
173 } else {
174 /*
175 * spurious IRQ on host controller
176 */
177 irq = 7;
178 intno = s->pics[0].irq_base + irq;
179 }
180 pic_update_irq(s);
181
182 return intno;
183}
184
185void kvm_pic_reset(struct kvm_kpic_state *s)
186{
187 s->last_irr = 0;
188 s->irr = 0;
189 s->imr = 0;
190 s->isr = 0;
191 s->priority_add = 0;
192 s->irq_base = 0;
193 s->read_reg_select = 0;
194 s->poll = 0;
195 s->special_mask = 0;
196 s->init_state = 0;
197 s->auto_eoi = 0;
198 s->rotate_on_auto_eoi = 0;
199 s->special_fully_nested_mode = 0;
200 s->init4 = 0;
201}
202
203static void pic_ioport_write(void *opaque, u32 addr, u32 val)
204{
205 struct kvm_kpic_state *s = opaque;
206 int priority, cmd, irq;
207
208 addr &= 1;
209 if (addr == 0) {
210 if (val & 0x10) {
211 kvm_pic_reset(s); /* init */
212 /*
213 * deassert a pending interrupt
214 */
215 s->pics_state->irq_request(s->pics_state->
216 irq_request_opaque, 0);
217 s->init_state = 1;
218 s->init4 = val & 1;
219 if (val & 0x02)
220 printk(KERN_ERR "single mode not supported");
221 if (val & 0x08)
222 printk(KERN_ERR
223 "level sensitive irq not supported");
224 } else if (val & 0x08) {
225 if (val & 0x04)
226 s->poll = 1;
227 if (val & 0x02)
228 s->read_reg_select = val & 1;
229 if (val & 0x40)
230 s->special_mask = (val >> 5) & 1;
231 } else {
232 cmd = val >> 5;
233 switch (cmd) {
234 case 0:
235 case 4:
236 s->rotate_on_auto_eoi = cmd >> 2;
237 break;
238 case 1: /* end of interrupt */
239 case 5:
240 priority = get_priority(s, s->isr);
241 if (priority != 8) {
242 irq = (priority + s->priority_add) & 7;
243 s->isr &= ~(1 << irq);
244 if (cmd == 5)
245 s->priority_add = (irq + 1) & 7;
246 pic_update_irq(s->pics_state);
247 }
248 break;
249 case 3:
250 irq = val & 7;
251 s->isr &= ~(1 << irq);
252 pic_update_irq(s->pics_state);
253 break;
254 case 6:
255 s->priority_add = (val + 1) & 7;
256 pic_update_irq(s->pics_state);
257 break;
258 case 7:
259 irq = val & 7;
260 s->isr &= ~(1 << irq);
261 s->priority_add = (irq + 1) & 7;
262 pic_update_irq(s->pics_state);
263 break;
264 default:
265 break; /* no operation */
266 }
267 }
268 } else
269 switch (s->init_state) {
270 case 0: /* normal mode */
271 s->imr = val;
272 pic_update_irq(s->pics_state);
273 break;
274 case 1:
275 s->irq_base = val & 0xf8;
276 s->init_state = 2;
277 break;
278 case 2:
279 if (s->init4)
280 s->init_state = 3;
281 else
282 s->init_state = 0;
283 break;
284 case 3:
285 s->special_fully_nested_mode = (val >> 4) & 1;
286 s->auto_eoi = (val >> 1) & 1;
287 s->init_state = 0;
288 break;
289 }
290}
291
292static u32 pic_poll_read(struct kvm_kpic_state *s, u32 addr1)
293{
294 int ret;
295
296 ret = pic_get_irq(s);
297 if (ret >= 0) {
298 if (addr1 >> 7) {
299 s->pics_state->pics[0].isr &= ~(1 << 2);
300 s->pics_state->pics[0].irr &= ~(1 << 2);
301 }
302 s->irr &= ~(1 << ret);
303 s->isr &= ~(1 << ret);
304 if (addr1 >> 7 || ret != 2)
305 pic_update_irq(s->pics_state);
306 } else {
307 ret = 0x07;
308 pic_update_irq(s->pics_state);
309 }
310
311 return ret;
312}
313
314static u32 pic_ioport_read(void *opaque, u32 addr1)
315{
316 struct kvm_kpic_state *s = opaque;
317 unsigned int addr;
318 int ret;
319
320 addr = addr1;
321 addr &= 1;
322 if (s->poll) {
323 ret = pic_poll_read(s, addr1);
324 s->poll = 0;
325 } else
326 if (addr == 0)
327 if (s->read_reg_select)
328 ret = s->isr;
329 else
330 ret = s->irr;
331 else
332 ret = s->imr;
333 return ret;
334}
335
336static void elcr_ioport_write(void *opaque, u32 addr, u32 val)
337{
338 struct kvm_kpic_state *s = opaque;
339 s->elcr = val & s->elcr_mask;
340}
341
342static u32 elcr_ioport_read(void *opaque, u32 addr1)
343{
344 struct kvm_kpic_state *s = opaque;
345 return s->elcr;
346}
347
348static int picdev_in_range(struct kvm_io_device *this, gpa_t addr)
349{
350 switch (addr) {
351 case 0x20:
352 case 0x21:
353 case 0xa0:
354 case 0xa1:
355 case 0x4d0:
356 case 0x4d1:
357 return 1;
358 default:
359 return 0;
360 }
361}
362
363static void picdev_write(struct kvm_io_device *this,
364 gpa_t addr, int len, const void *val)
365{
366 struct kvm_pic *s = this->private;
367 unsigned char data = *(unsigned char *)val;
368
369 if (len != 1) {
370 if (printk_ratelimit())
371 printk(KERN_ERR "PIC: non byte write\n");
372 return;
373 }
374 switch (addr) {
375 case 0x20:
376 case 0x21:
377 case 0xa0:
378 case 0xa1:
379 pic_ioport_write(&s->pics[addr >> 7], addr, data);
380 break;
381 case 0x4d0:
382 case 0x4d1:
383 elcr_ioport_write(&s->pics[addr & 1], addr, data);
384 break;
385 }
386}
387
388static void picdev_read(struct kvm_io_device *this,
389 gpa_t addr, int len, void *val)
390{
391 struct kvm_pic *s = this->private;
392 unsigned char data = 0;
393
394 if (len != 1) {
395 if (printk_ratelimit())
396 printk(KERN_ERR "PIC: non byte read\n");
397 return;
398 }
399 switch (addr) {
400 case 0x20:
401 case 0x21:
402 case 0xa0:
403 case 0xa1:
404 data = pic_ioport_read(&s->pics[addr >> 7], addr);
405 break;
406 case 0x4d0:
407 case 0x4d1:
408 data = elcr_ioport_read(&s->pics[addr & 1], addr);
409 break;
410 }
411 *(unsigned char *)val = data;
412}
413
414/*
415 * callback when PIC0 irq status changed
416 */
417static void pic_irq_request(void *opaque, int level)
418{
419 struct kvm *kvm = opaque;
420 struct kvm_vcpu *vcpu = kvm->vcpus[0];
421
422 pic_irqchip(kvm)->output = level;
423 if (vcpu)
424 kvm_vcpu_kick(vcpu);
425}
426
427struct kvm_pic *kvm_create_pic(struct kvm *kvm)
428{
429 struct kvm_pic *s;
430 s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL);
431 if (!s)
432 return NULL;
433 s->pics[0].elcr_mask = 0xf8;
434 s->pics[1].elcr_mask = 0xde;
435 s->irq_request = pic_irq_request;
436 s->irq_request_opaque = kvm;
437 s->pics[0].pics_state = s;
438 s->pics[1].pics_state = s;
439
440 /*
441 * Initialize PIO device
442 */
443 s->dev.read = picdev_read;
444 s->dev.write = picdev_write;
445 s->dev.in_range = picdev_in_range;
446 s->dev.private = s;
447 kvm_io_bus_register_dev(&kvm->pio_bus, &s->dev);
448 return s;
449}
diff --git a/drivers/kvm/ioapic.c b/drivers/kvm/ioapic.c
deleted file mode 100644
index f8236774c1b4..000000000000
--- a/drivers/kvm/ioapic.c
+++ /dev/null
@@ -1,402 +0,0 @@
1/*
2 * Copyright (C) 2001 MandrakeSoft S.A.
3 *
4 * MandrakeSoft S.A.
5 * 43, rue d'Aboukir
6 * 75002 Paris - France
7 * http://www.linux-mandrake.com/
8 * http://www.mandrakesoft.com/
9 *
10 * This library is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This library is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
19 *
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with this library; if not, write to the Free Software
22 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
23 *
24 * Yunhong Jiang <yunhong.jiang@intel.com>
25 * Yaozu (Eddie) Dong <eddie.dong@intel.com>
26 * Based on Xen 3.1 code.
27 */
28
29#include "kvm.h"
30#include "x86.h"
31
32#include <linux/kvm.h>
33#include <linux/mm.h>
34#include <linux/highmem.h>
35#include <linux/smp.h>
36#include <linux/hrtimer.h>
37#include <linux/io.h>
38#include <asm/processor.h>
39#include <asm/page.h>
40#include <asm/current.h>
41#include "irq.h"
42#if 0
43#define ioapic_debug(fmt,arg...) printk(KERN_WARNING fmt,##arg)
44#else
45#define ioapic_debug(fmt, arg...)
46#endif
47static void ioapic_deliver(struct kvm_ioapic *vioapic, int irq);
48
49static unsigned long ioapic_read_indirect(struct kvm_ioapic *ioapic,
50 unsigned long addr,
51 unsigned long length)
52{
53 unsigned long result = 0;
54
55 switch (ioapic->ioregsel) {
56 case IOAPIC_REG_VERSION:
57 result = ((((IOAPIC_NUM_PINS - 1) & 0xff) << 16)
58 | (IOAPIC_VERSION_ID & 0xff));
59 break;
60
61 case IOAPIC_REG_APIC_ID:
62 case IOAPIC_REG_ARB_ID:
63 result = ((ioapic->id & 0xf) << 24);
64 break;
65
66 default:
67 {
68 u32 redir_index = (ioapic->ioregsel - 0x10) >> 1;
69 u64 redir_content;
70
71 ASSERT(redir_index < IOAPIC_NUM_PINS);
72
73 redir_content = ioapic->redirtbl[redir_index].bits;
74 result = (ioapic->ioregsel & 0x1) ?
75 (redir_content >> 32) & 0xffffffff :
76 redir_content & 0xffffffff;
77 break;
78 }
79 }
80
81 return result;
82}
83
84static void ioapic_service(struct kvm_ioapic *ioapic, unsigned int idx)
85{
86 union ioapic_redir_entry *pent;
87
88 pent = &ioapic->redirtbl[idx];
89
90 if (!pent->fields.mask) {
91 ioapic_deliver(ioapic, idx);
92 if (pent->fields.trig_mode == IOAPIC_LEVEL_TRIG)
93 pent->fields.remote_irr = 1;
94 }
95 if (!pent->fields.trig_mode)
96 ioapic->irr &= ~(1 << idx);
97}
98
99static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val)
100{
101 unsigned index;
102
103 switch (ioapic->ioregsel) {
104 case IOAPIC_REG_VERSION:
105 /* Writes are ignored. */
106 break;
107
108 case IOAPIC_REG_APIC_ID:
109 ioapic->id = (val >> 24) & 0xf;
110 break;
111
112 case IOAPIC_REG_ARB_ID:
113 break;
114
115 default:
116 index = (ioapic->ioregsel - 0x10) >> 1;
117
118 ioapic_debug("change redir index %x val %x\n", index, val);
119 if (index >= IOAPIC_NUM_PINS)
120 return;
121 if (ioapic->ioregsel & 1) {
122 ioapic->redirtbl[index].bits &= 0xffffffff;
123 ioapic->redirtbl[index].bits |= (u64) val << 32;
124 } else {
125 ioapic->redirtbl[index].bits &= ~0xffffffffULL;
126 ioapic->redirtbl[index].bits |= (u32) val;
127 ioapic->redirtbl[index].fields.remote_irr = 0;
128 }
129 if (ioapic->irr & (1 << index))
130 ioapic_service(ioapic, index);
131 break;
132 }
133}
134
135static void ioapic_inj_irq(struct kvm_ioapic *ioapic,
136 struct kvm_vcpu *vcpu,
137 u8 vector, u8 trig_mode, u8 delivery_mode)
138{
139 ioapic_debug("irq %d trig %d deliv %d\n", vector, trig_mode,
140 delivery_mode);
141
142 ASSERT((delivery_mode == IOAPIC_FIXED) ||
143 (delivery_mode == IOAPIC_LOWEST_PRIORITY));
144
145 kvm_apic_set_irq(vcpu, vector, trig_mode);
146}
147
148static u32 ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest,
149 u8 dest_mode)
150{
151 u32 mask = 0;
152 int i;
153 struct kvm *kvm = ioapic->kvm;
154 struct kvm_vcpu *vcpu;
155
156 ioapic_debug("dest %d dest_mode %d\n", dest, dest_mode);
157
158 if (dest_mode == 0) { /* Physical mode. */
159 if (dest == 0xFF) { /* Broadcast. */
160 for (i = 0; i < KVM_MAX_VCPUS; ++i)
161 if (kvm->vcpus[i] && kvm->vcpus[i]->arch.apic)
162 mask |= 1 << i;
163 return mask;
164 }
165 for (i = 0; i < KVM_MAX_VCPUS; ++i) {
166 vcpu = kvm->vcpus[i];
167 if (!vcpu)
168 continue;
169 if (kvm_apic_match_physical_addr(vcpu->arch.apic, dest)) {
170 if (vcpu->arch.apic)
171 mask = 1 << i;
172 break;
173 }
174 }
175 } else if (dest != 0) /* Logical mode, MDA non-zero. */
176 for (i = 0; i < KVM_MAX_VCPUS; ++i) {
177 vcpu = kvm->vcpus[i];
178 if (!vcpu)
179 continue;
180 if (vcpu->arch.apic &&
181 kvm_apic_match_logical_addr(vcpu->arch.apic, dest))
182 mask |= 1 << vcpu->vcpu_id;
183 }
184 ioapic_debug("mask %x\n", mask);
185 return mask;
186}
187
188static void ioapic_deliver(struct kvm_ioapic *ioapic, int irq)
189{
190 u8 dest = ioapic->redirtbl[irq].fields.dest_id;
191 u8 dest_mode = ioapic->redirtbl[irq].fields.dest_mode;
192 u8 delivery_mode = ioapic->redirtbl[irq].fields.delivery_mode;
193 u8 vector = ioapic->redirtbl[irq].fields.vector;
194 u8 trig_mode = ioapic->redirtbl[irq].fields.trig_mode;
195 u32 deliver_bitmask;
196 struct kvm_vcpu *vcpu;
197 int vcpu_id;
198
199 ioapic_debug("dest=%x dest_mode=%x delivery_mode=%x "
200 "vector=%x trig_mode=%x\n",
201 dest, dest_mode, delivery_mode, vector, trig_mode);
202
203 deliver_bitmask = ioapic_get_delivery_bitmask(ioapic, dest, dest_mode);
204 if (!deliver_bitmask) {
205 ioapic_debug("no target on destination\n");
206 return;
207 }
208
209 switch (delivery_mode) {
210 case IOAPIC_LOWEST_PRIORITY:
211 vcpu = kvm_get_lowest_prio_vcpu(ioapic->kvm, vector,
212 deliver_bitmask);
213 if (vcpu != NULL)
214 ioapic_inj_irq(ioapic, vcpu, vector,
215 trig_mode, delivery_mode);
216 else
217 ioapic_debug("null lowest prio vcpu: "
218 "mask=%x vector=%x delivery_mode=%x\n",
219 deliver_bitmask, vector, IOAPIC_LOWEST_PRIORITY);
220 break;
221 case IOAPIC_FIXED:
222 for (vcpu_id = 0; deliver_bitmask != 0; vcpu_id++) {
223 if (!(deliver_bitmask & (1 << vcpu_id)))
224 continue;
225 deliver_bitmask &= ~(1 << vcpu_id);
226 vcpu = ioapic->kvm->vcpus[vcpu_id];
227 if (vcpu) {
228 ioapic_inj_irq(ioapic, vcpu, vector,
229 trig_mode, delivery_mode);
230 }
231 }
232 break;
233
234 /* TODO: NMI */
235 default:
236 printk(KERN_WARNING "Unsupported delivery mode %d\n",
237 delivery_mode);
238 break;
239 }
240}
241
242void kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level)
243{
244 u32 old_irr = ioapic->irr;
245 u32 mask = 1 << irq;
246 union ioapic_redir_entry entry;
247
248 if (irq >= 0 && irq < IOAPIC_NUM_PINS) {
249 entry = ioapic->redirtbl[irq];
250 level ^= entry.fields.polarity;
251 if (!level)
252 ioapic->irr &= ~mask;
253 else {
254 ioapic->irr |= mask;
255 if ((!entry.fields.trig_mode && old_irr != ioapic->irr)
256 || !entry.fields.remote_irr)
257 ioapic_service(ioapic, irq);
258 }
259 }
260}
261
262static int get_eoi_gsi(struct kvm_ioapic *ioapic, int vector)
263{
264 int i;
265
266 for (i = 0; i < IOAPIC_NUM_PINS; i++)
267 if (ioapic->redirtbl[i].fields.vector == vector)
268 return i;
269 return -1;
270}
271
272void kvm_ioapic_update_eoi(struct kvm *kvm, int vector)
273{
274 struct kvm_ioapic *ioapic = kvm->arch.vioapic;
275 union ioapic_redir_entry *ent;
276 int gsi;
277
278 gsi = get_eoi_gsi(ioapic, vector);
279 if (gsi == -1) {
280 printk(KERN_WARNING "Can't find redir item for %d EOI\n",
281 vector);
282 return;
283 }
284
285 ent = &ioapic->redirtbl[gsi];
286 ASSERT(ent->fields.trig_mode == IOAPIC_LEVEL_TRIG);
287
288 ent->fields.remote_irr = 0;
289 if (!ent->fields.mask && (ioapic->irr & (1 << gsi)))
290 ioapic_deliver(ioapic, gsi);
291}
292
293static int ioapic_in_range(struct kvm_io_device *this, gpa_t addr)
294{
295 struct kvm_ioapic *ioapic = (struct kvm_ioapic *)this->private;
296
297 return ((addr >= ioapic->base_address &&
298 (addr < ioapic->base_address + IOAPIC_MEM_LENGTH)));
299}
300
301static void ioapic_mmio_read(struct kvm_io_device *this, gpa_t addr, int len,
302 void *val)
303{
304 struct kvm_ioapic *ioapic = (struct kvm_ioapic *)this->private;
305 u32 result;
306
307 ioapic_debug("addr %lx\n", (unsigned long)addr);
308 ASSERT(!(addr & 0xf)); /* check alignment */
309
310 addr &= 0xff;
311 switch (addr) {
312 case IOAPIC_REG_SELECT:
313 result = ioapic->ioregsel;
314 break;
315
316 case IOAPIC_REG_WINDOW:
317 result = ioapic_read_indirect(ioapic, addr, len);
318 break;
319
320 default:
321 result = 0;
322 break;
323 }
324 switch (len) {
325 case 8:
326 *(u64 *) val = result;
327 break;
328 case 1:
329 case 2:
330 case 4:
331 memcpy(val, (char *)&result, len);
332 break;
333 default:
334 printk(KERN_WARNING "ioapic: wrong length %d\n", len);
335 }
336}
337
338static void ioapic_mmio_write(struct kvm_io_device *this, gpa_t addr, int len,
339 const void *val)
340{
341 struct kvm_ioapic *ioapic = (struct kvm_ioapic *)this->private;
342 u32 data;
343
344 ioapic_debug("ioapic_mmio_write addr=%p len=%d val=%p\n",
345 (void*)addr, len, val);
346 ASSERT(!(addr & 0xf)); /* check alignment */
347 if (len == 4 || len == 8)
348 data = *(u32 *) val;
349 else {
350 printk(KERN_WARNING "ioapic: Unsupported size %d\n", len);
351 return;
352 }
353
354 addr &= 0xff;
355 switch (addr) {
356 case IOAPIC_REG_SELECT:
357 ioapic->ioregsel = data;
358 break;
359
360 case IOAPIC_REG_WINDOW:
361 ioapic_write_indirect(ioapic, data);
362 break;
363#ifdef CONFIG_IA64
364 case IOAPIC_REG_EOI:
365 kvm_ioapic_update_eoi(ioapic, data);
366 break;
367#endif
368
369 default:
370 break;
371 }
372}
373
374void kvm_ioapic_reset(struct kvm_ioapic *ioapic)
375{
376 int i;
377
378 for (i = 0; i < IOAPIC_NUM_PINS; i++)
379 ioapic->redirtbl[i].fields.mask = 1;
380 ioapic->base_address = IOAPIC_DEFAULT_BASE_ADDRESS;
381 ioapic->ioregsel = 0;
382 ioapic->irr = 0;
383 ioapic->id = 0;
384}
385
386int kvm_ioapic_init(struct kvm *kvm)
387{
388 struct kvm_ioapic *ioapic;
389
390 ioapic = kzalloc(sizeof(struct kvm_ioapic), GFP_KERNEL);
391 if (!ioapic)
392 return -ENOMEM;
393 kvm->arch.vioapic = ioapic;
394 kvm_ioapic_reset(ioapic);
395 ioapic->dev.read = ioapic_mmio_read;
396 ioapic->dev.write = ioapic_mmio_write;
397 ioapic->dev.in_range = ioapic_in_range;
398 ioapic->dev.private = ioapic;
399 ioapic->kvm = kvm;
400 kvm_io_bus_register_dev(&kvm->mmio_bus, &ioapic->dev);
401 return 0;
402}
diff --git a/drivers/kvm/iodev.h b/drivers/kvm/iodev.h
index eb9e8a71843a..c14e642027b2 100644
--- a/drivers/kvm/iodev.h
+++ b/drivers/kvm/iodev.h
@@ -16,7 +16,7 @@
 #ifndef __KVM_IODEV_H__
 #define __KVM_IODEV_H__
 
-#include "types.h"
+#include <linux/kvm_types.h>
 
 struct kvm_io_device {
 	void (*read)(struct kvm_io_device *this,
diff --git a/drivers/kvm/irq.c b/drivers/kvm/irq.c
deleted file mode 100644
index 59b47c55fc76..000000000000
--- a/drivers/kvm/irq.c
+++ /dev/null
@@ -1,99 +0,0 @@
1/*
2 * irq.c: API for in kernel interrupt controller
3 * Copyright (c) 2007, Intel Corporation.
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms and conditions of the GNU General Public License,
7 * version 2, as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 * more details.
13 *
14 * You should have received a copy of the GNU General Public License along with
15 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
16 * Place - Suite 330, Boston, MA 02111-1307 USA.
17 * Authors:
18 * Yaozu (Eddie) Dong <Eddie.dong@intel.com>
19 *
20 */
21
22#include <linux/module.h>
23
24#include "kvm.h"
25#include "x86.h"
26#include "irq.h"
27
28/*
29 * check if there is pending interrupt without
30 * intack.
31 */
32int kvm_cpu_has_interrupt(struct kvm_vcpu *v)
33{
34 struct kvm_pic *s;
35
36 if (kvm_apic_has_interrupt(v) == -1) { /* LAPIC */
37 if (kvm_apic_accept_pic_intr(v)) {
38 s = pic_irqchip(v->kvm); /* PIC */
39 return s->output;
40 } else
41 return 0;
42 }
43 return 1;
44}
45EXPORT_SYMBOL_GPL(kvm_cpu_has_interrupt);
46
47/*
48 * Read pending interrupt vector and intack.
49 */
50int kvm_cpu_get_interrupt(struct kvm_vcpu *v)
51{
52 struct kvm_pic *s;
53 int vector;
54
55 vector = kvm_get_apic_interrupt(v); /* APIC */
56 if (vector == -1) {
57 if (kvm_apic_accept_pic_intr(v)) {
58 s = pic_irqchip(v->kvm);
59 s->output = 0; /* PIC */
60 vector = kvm_pic_read_irq(s);
61 }
62 }
63 return vector;
64}
65EXPORT_SYMBOL_GPL(kvm_cpu_get_interrupt);
66
67static void vcpu_kick_intr(void *info)
68{
69#ifdef DEBUG
70 struct kvm_vcpu *vcpu = (struct kvm_vcpu *)info;
71 printk(KERN_DEBUG "vcpu_kick_intr %p \n", vcpu);
72#endif
73}
74
75void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
76{
77 int ipi_pcpu = vcpu->cpu;
78
79 if (waitqueue_active(&vcpu->wq)) {
80 wake_up_interruptible(&vcpu->wq);
81 ++vcpu->stat.halt_wakeup;
82 }
83 if (vcpu->guest_mode)
84 smp_call_function_single(ipi_pcpu, vcpu_kick_intr, vcpu, 0, 0);
85}
86
87void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu)
88{
89 kvm_inject_apic_timer_irqs(vcpu);
90 /* TODO: PIT, RTC etc. */
91}
92EXPORT_SYMBOL_GPL(kvm_inject_pending_timer_irqs);
93
94void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec)
95{
96 kvm_apic_timer_intr_post(vcpu, vec);
97 /* TODO: PIT, RTC etc. */
98}
99EXPORT_SYMBOL_GPL(kvm_timer_intr_post);
diff --git a/drivers/kvm/irq.h b/drivers/kvm/irq.h
deleted file mode 100644
index 6e023dc3f848..000000000000
--- a/drivers/kvm/irq.h
+++ /dev/null
@@ -1,196 +0,0 @@
1/*
2 * irq.h: in kernel interrupt controller related definitions
3 * Copyright (c) 2007, Intel Corporation.
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms and conditions of the GNU General Public License,
7 * version 2, as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 * more details.
13 *
14 * You should have received a copy of the GNU General Public License along with
15 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
16 * Place - Suite 330, Boston, MA 02111-1307 USA.
17 * Authors:
18 * Yaozu (Eddie) Dong <Eddie.dong@intel.com>
19 *
20 */
21
22#ifndef __IRQ_H
23#define __IRQ_H
24
25#include <linux/mm_types.h>
26#include <linux/hrtimer.h>
27#include <asm/kvm.h>
28#include "iodev.h"
29#include "kvm.h"
30
31struct kvm;
32struct kvm_vcpu;
33
34typedef void irq_request_func(void *opaque, int level);
35
36struct kvm_kpic_state {
37 u8 last_irr; /* edge detection */
38 u8 irr; /* interrupt request register */
39 u8 imr; /* interrupt mask register */
40 u8 isr; /* interrupt service register */
41 u8 priority_add; /* highest irq priority */
42 u8 irq_base;
43 u8 read_reg_select;
44 u8 poll;
45 u8 special_mask;
46 u8 init_state;
47 u8 auto_eoi;
48 u8 rotate_on_auto_eoi;
49 u8 special_fully_nested_mode;
50 u8 init4; /* true if 4 byte init */
51 u8 elcr; /* PIIX edge/trigger selection */
52 u8 elcr_mask;
53 struct kvm_pic *pics_state;
54};
55
56struct kvm_pic {
57 struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */
58 irq_request_func *irq_request;
59 void *irq_request_opaque;
60 int output; /* intr from master PIC */
61 struct kvm_io_device dev;
62};
63
64struct kvm_pic *kvm_create_pic(struct kvm *kvm);
65void kvm_pic_set_irq(void *opaque, int irq, int level);
66int kvm_pic_read_irq(struct kvm_pic *s);
67void kvm_pic_update_irq(struct kvm_pic *s);
68
69#define IOAPIC_NUM_PINS KVM_IOAPIC_NUM_PINS
70#define IOAPIC_VERSION_ID 0x11 /* IOAPIC version */
71#define IOAPIC_EDGE_TRIG 0
72#define IOAPIC_LEVEL_TRIG 1
73
74#define IOAPIC_DEFAULT_BASE_ADDRESS 0xfec00000
75#define IOAPIC_MEM_LENGTH 0x100
76
77/* Direct registers. */
78#define IOAPIC_REG_SELECT 0x00
79#define IOAPIC_REG_WINDOW 0x10
80#define IOAPIC_REG_EOI 0x40 /* IA64 IOSAPIC only */
81
82/* Indirect registers. */
83#define IOAPIC_REG_APIC_ID 0x00 /* x86 IOAPIC only */
84#define IOAPIC_REG_VERSION 0x01
85#define IOAPIC_REG_ARB_ID 0x02 /* x86 IOAPIC only */
86
87/*ioapic delivery mode*/
88#define IOAPIC_FIXED 0x0
89#define IOAPIC_LOWEST_PRIORITY 0x1
90#define IOAPIC_PMI 0x2
91#define IOAPIC_NMI 0x4
92#define IOAPIC_INIT 0x5
93#define IOAPIC_EXTINT 0x7
94
95struct kvm_ioapic {
96 u64 base_address;
97 u32 ioregsel;
98 u32 id;
99 u32 irr;
100 u32 pad;
101 union ioapic_redir_entry {
102 u64 bits;
103 struct {
104 u8 vector;
105 u8 delivery_mode:3;
106 u8 dest_mode:1;
107 u8 delivery_status:1;
108 u8 polarity:1;
109 u8 remote_irr:1;
110 u8 trig_mode:1;
111 u8 mask:1;
112 u8 reserve:7;
113 u8 reserved[4];
114 u8 dest_id;
115 } fields;
116 } redirtbl[IOAPIC_NUM_PINS];
117 struct kvm_io_device dev;
118 struct kvm *kvm;
119};
120
121struct kvm_lapic {
122 unsigned long base_address;
123 struct kvm_io_device dev;
124 struct {
125 atomic_t pending;
126 s64 period; /* unit: ns */
127 u32 divide_count;
128 ktime_t last_update;
129 struct hrtimer dev;
130 } timer;
131 struct kvm_vcpu *vcpu;
132 struct page *regs_page;
133 void *regs;
134};
135
136#ifdef DEBUG
137#define ASSERT(x) \
138do { \
139 if (!(x)) { \
140 printk(KERN_EMERG "assertion failed %s: %d: %s\n", \
141 __FILE__, __LINE__, #x); \
142 BUG(); \
143 } \
144} while (0)
145#else
146#define ASSERT(x) do { } while (0)
147#endif
148
149static inline struct kvm_pic *pic_irqchip(struct kvm *kvm)
150{
151 return kvm->arch.vpic;
152}
153
154static inline struct kvm_ioapic *ioapic_irqchip(struct kvm *kvm)
155{
156 return kvm->arch.vioapic;
157}
158
159static inline int irqchip_in_kernel(struct kvm *kvm)
160{
161 return pic_irqchip(kvm) != NULL;
162}
163
164void kvm_vcpu_kick(struct kvm_vcpu *vcpu);
165int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu);
166int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu);
167int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu);
168int kvm_create_lapic(struct kvm_vcpu *vcpu);
169void kvm_lapic_reset(struct kvm_vcpu *vcpu);
170void kvm_pic_reset(struct kvm_kpic_state *s);
171void kvm_ioapic_reset(struct kvm_ioapic *ioapic);
172void kvm_free_lapic(struct kvm_vcpu *vcpu);
173u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu);
174void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8);
175void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value);
176
177struct kvm_vcpu *kvm_get_lowest_prio_vcpu(struct kvm *kvm, u8 vector,
178 unsigned long bitmap);
179u64 kvm_get_apic_base(struct kvm_vcpu *vcpu);
180void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data);
181int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest);
182void kvm_ioapic_update_eoi(struct kvm *kvm, int vector);
183int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda);
184int kvm_apic_set_irq(struct kvm_vcpu *vcpu, u8 vec, u8 trig);
185void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu);
186int kvm_ioapic_init(struct kvm *kvm);
187void kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level);
188int kvm_lapic_enabled(struct kvm_vcpu *vcpu);
189int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu);
190void kvm_apic_timer_intr_post(struct kvm_vcpu *vcpu, int vec);
191void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec);
192void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu);
193void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu);
194void kvm_migrate_apic_timer(struct kvm_vcpu *vcpu);
195
196#endif
diff --git a/drivers/kvm/kvm.h b/drivers/kvm/kvm.h
deleted file mode 100644
index bf6a3b330a3d..000000000000
--- a/drivers/kvm/kvm.h
+++ /dev/null
@@ -1,289 +0,0 @@
1#ifndef __KVM_H
2#define __KVM_H
3
4/*
5 * This work is licensed under the terms of the GNU GPL, version 2. See
6 * the COPYING file in the top-level directory.
7 */
8
9#include <linux/types.h>
10#include <linux/hardirq.h>
11#include <linux/list.h>
12#include <linux/mutex.h>
13#include <linux/spinlock.h>
14#include <linux/signal.h>
15#include <linux/sched.h>
16#include <linux/mm.h>
17#include <linux/preempt.h>
18#include <asm/signal.h>
19
20#include <linux/kvm.h>
21#include <linux/kvm_para.h>
22
23#include "types.h"
24
25#include "x86.h"
26
27#define KVM_MAX_VCPUS 4
28#define KVM_MEMORY_SLOTS 8
29/* memory slots that does not exposed to userspace */
30#define KVM_PRIVATE_MEM_SLOTS 4
31
32#define KVM_PIO_PAGE_OFFSET 1
33
34/*
35 * vcpu->requests bit members
36 */
37#define KVM_REQ_TLB_FLUSH 0
38
39
40struct kvm_vcpu;
41extern struct kmem_cache *kvm_vcpu_cache;
42
43struct kvm_guest_debug {
44 int enabled;
45 unsigned long bp[4];
46 int singlestep;
47};
48
49/*
50 * It would be nice to use something smarter than a linear search, TBD...
51 * Thankfully we dont expect many devices to register (famous last words :),
52 * so until then it will suffice. At least its abstracted so we can change
53 * in one place.
54 */
55struct kvm_io_bus {
56 int dev_count;
57#define NR_IOBUS_DEVS 6
58 struct kvm_io_device *devs[NR_IOBUS_DEVS];
59};
60
61void kvm_io_bus_init(struct kvm_io_bus *bus);
62void kvm_io_bus_destroy(struct kvm_io_bus *bus);
63struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus, gpa_t addr);
64void kvm_io_bus_register_dev(struct kvm_io_bus *bus,
65 struct kvm_io_device *dev);
66
67struct kvm_vcpu {
68 struct kvm *kvm;
69 struct preempt_notifier preempt_notifier;
70 int vcpu_id;
71 struct mutex mutex;
72 int cpu;
73 struct kvm_run *run;
74 int guest_mode;
75 unsigned long requests;
76 struct kvm_guest_debug guest_debug;
77 int fpu_active;
78 int guest_fpu_loaded;
79 wait_queue_head_t wq;
80 int sigset_active;
81 sigset_t sigset;
82 struct kvm_vcpu_stat stat;
83
84#ifdef CONFIG_HAS_IOMEM
85 int mmio_needed;
86 int mmio_read_completed;
87 int mmio_is_write;
88 int mmio_size;
89 unsigned char mmio_data[8];
90 gpa_t mmio_phys_addr;
91#endif
92
93 struct kvm_vcpu_arch arch;
94};
95
96struct kvm_memory_slot {
97 gfn_t base_gfn;
98 unsigned long npages;
99 unsigned long flags;
100 unsigned long *rmap;
101 unsigned long *dirty_bitmap;
102 unsigned long userspace_addr;
103 int user_alloc;
104};
105
106struct kvm {
107 struct mutex lock; /* protects everything except vcpus */
108 struct mm_struct *mm; /* userspace tied to this vm */
109 int nmemslots;
110 struct kvm_memory_slot memslots[KVM_MEMORY_SLOTS +
111 KVM_PRIVATE_MEM_SLOTS];
112 struct kvm_vcpu *vcpus[KVM_MAX_VCPUS];
113 struct list_head vm_list;
114 struct file *filp;
115 struct kvm_io_bus mmio_bus;
116 struct kvm_io_bus pio_bus;
117 struct kvm_vm_stat stat;
118 struct kvm_arch arch;
119};
120
121/* The guest did something we don't support. */
122#define pr_unimpl(vcpu, fmt, ...) \
123 do { \
124 if (printk_ratelimit()) \
125 printk(KERN_ERR "kvm: %i: cpu%i " fmt, \
126 current->tgid, (vcpu)->vcpu_id , ## __VA_ARGS__); \
127 } while (0)
128
129#define kvm_printf(kvm, fmt ...) printk(KERN_DEBUG fmt)
130#define vcpu_printf(vcpu, fmt...) kvm_printf(vcpu->kvm, fmt)
131
132int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id);
133void kvm_vcpu_uninit(struct kvm_vcpu *vcpu);
134
135void vcpu_load(struct kvm_vcpu *vcpu);
136void vcpu_put(struct kvm_vcpu *vcpu);
137
138void decache_vcpus_on_cpu(int cpu);
139
140
141int kvm_init(void *opaque, unsigned int vcpu_size,
142 struct module *module);
143void kvm_exit(void);
144
145#define HPA_MSB ((sizeof(hpa_t) * 8) - 1)
146#define HPA_ERR_MASK ((hpa_t)1 << HPA_MSB)
147static inline int is_error_hpa(hpa_t hpa) { return hpa >> HPA_MSB; }
148struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva);
149
150extern struct page *bad_page;
151
152int is_error_page(struct page *page);
153int kvm_is_error_hva(unsigned long addr);
154int kvm_set_memory_region(struct kvm *kvm,
155 struct kvm_userspace_memory_region *mem,
156 int user_alloc);
157int __kvm_set_memory_region(struct kvm *kvm,
158 struct kvm_userspace_memory_region *mem,
159 int user_alloc);
160int kvm_arch_set_memory_region(struct kvm *kvm,
161 struct kvm_userspace_memory_region *mem,
162 struct kvm_memory_slot old,
163 int user_alloc);
164gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn);
165struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn);
166void kvm_release_page_clean(struct page *page);
167void kvm_release_page_dirty(struct page *page);
168int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
169 int len);
170int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len);
171int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data,
172 int offset, int len);
173int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data,
174 unsigned long len);
175int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len);
176int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len);
177struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn);
178int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn);
179void mark_page_dirty(struct kvm *kvm, gfn_t gfn);
180
181void kvm_vcpu_block(struct kvm_vcpu *vcpu);
182void kvm_resched(struct kvm_vcpu *vcpu);
183void kvm_load_guest_fpu(struct kvm_vcpu *vcpu);
184void kvm_put_guest_fpu(struct kvm_vcpu *vcpu);
185void kvm_flush_remote_tlbs(struct kvm *kvm);
186
187long kvm_arch_dev_ioctl(struct file *filp,
188 unsigned int ioctl, unsigned long arg);
189long kvm_arch_vcpu_ioctl(struct file *filp,
190 unsigned int ioctl, unsigned long arg);
191void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
192void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu);
193
194int kvm_dev_ioctl_check_extension(long ext);
195
196int kvm_get_dirty_log(struct kvm *kvm,
197 struct kvm_dirty_log *log, int *is_dirty);
198int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
199 struct kvm_dirty_log *log);
200
201int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
202 struct
203 kvm_userspace_memory_region *mem,
204 int user_alloc);
205long kvm_arch_vm_ioctl(struct file *filp,
206 unsigned int ioctl, unsigned long arg);
207void kvm_arch_destroy_vm(struct kvm *kvm);
208
209int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu);
210int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu);
211
212int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
213 struct kvm_translation *tr);
214
215int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs);
216int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs);
217int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
218 struct kvm_sregs *sregs);
219int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
220 struct kvm_sregs *sregs);
221int kvm_arch_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu,
222 struct kvm_debug_guest *dbg);
223int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run);
224
225int kvm_arch_init(void *opaque);
226void kvm_arch_exit(void);
227
228int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu);
229void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu);
230
231void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu);
232void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
233void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu);
234struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id);
235int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu);
236void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu);
237
238int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu);
239void kvm_arch_hardware_enable(void *garbage);
240void kvm_arch_hardware_disable(void *garbage);
241int kvm_arch_hardware_setup(void);
242void kvm_arch_hardware_unsetup(void);
243void kvm_arch_check_processor_compat(void *rtn);
244int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu);
245
246void kvm_free_physmem(struct kvm *kvm);
247
248struct kvm *kvm_arch_create_vm(void);
249void kvm_arch_destroy_vm(struct kvm *kvm);
250
251int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
252int kvm_cpu_has_interrupt(struct kvm_vcpu *v);
253
254static inline void kvm_guest_enter(void)
255{
256 account_system_vtime(current);
257 current->flags |= PF_VCPU;
258}
259
260static inline void kvm_guest_exit(void)
261{
262 account_system_vtime(current);
263 current->flags &= ~PF_VCPU;
264}
265
266static inline int memslot_id(struct kvm *kvm, struct kvm_memory_slot *slot)
267{
268 return slot - kvm->memslots;
269}
270
271static inline gpa_t gfn_to_gpa(gfn_t gfn)
272{
273 return (gpa_t)gfn << PAGE_SHIFT;
274}
275
276enum kvm_stat_kind {
277 KVM_STAT_VM,
278 KVM_STAT_VCPU,
279};
280
281struct kvm_stats_debugfs_item {
282 const char *name;
283 int offset;
284 enum kvm_stat_kind kind;
285 struct dentry *dentry;
286};
287extern struct kvm_stats_debugfs_item debugfs_entries[];
288
289#endif
diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c
index ae2a1bf640bc..4026d7d64296 100644
--- a/drivers/kvm/kvm_main.c
+++ b/drivers/kvm/kvm_main.c
@@ -15,9 +15,9 @@
  *
  */
 
-#include "kvm.h"
 #include "iodev.h"
 
+#include <linux/kvm_host.h>
 #include <linux/kvm.h>
 #include <linux/module.h>
 #include <linux/errno.h>
diff --git a/drivers/kvm/kvm_svm.h b/drivers/kvm/kvm_svm.h
deleted file mode 100644
index a0e415daef5b..000000000000
--- a/drivers/kvm/kvm_svm.h
+++ /dev/null
@@ -1,45 +0,0 @@
1#ifndef __KVM_SVM_H
2#define __KVM_SVM_H
3
4#include <linux/kernel.h>
5#include <linux/types.h>
6#include <linux/list.h>
7#include <asm/msr.h>
8
9#include "svm.h"
10#include "kvm.h"
11
12static const u32 host_save_user_msrs[] = {
13#ifdef CONFIG_X86_64
14 MSR_STAR, MSR_LSTAR, MSR_CSTAR, MSR_SYSCALL_MASK, MSR_KERNEL_GS_BASE,
15 MSR_FS_BASE,
16#endif
17 MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
18};
19
20#define NR_HOST_SAVE_USER_MSRS ARRAY_SIZE(host_save_user_msrs)
21#define NUM_DB_REGS 4
22
23struct kvm_vcpu;
24
25struct vcpu_svm {
26 struct kvm_vcpu vcpu;
27 struct vmcb *vmcb;
28 unsigned long vmcb_pa;
29 struct svm_cpu_data *svm_data;
30 uint64_t asid_generation;
31
32 unsigned long db_regs[NUM_DB_REGS];
33
34 u64 next_rip;
35
36 u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS];
37 u64 host_gs_base;
38 unsigned long host_cr2;
39 unsigned long host_db_regs[NUM_DB_REGS];
40 unsigned long host_dr6;
41 unsigned long host_dr7;
42};
43
44#endif
45
diff --git a/drivers/kvm/lapic.c b/drivers/kvm/lapic.c
deleted file mode 100644
index 8c74bf184a07..000000000000
--- a/drivers/kvm/lapic.c
+++ /dev/null
@@ -1,1087 +0,0 @@
1
2/*
3 * Local APIC virtualization
4 *
5 * Copyright (C) 2006 Qumranet, Inc.
6 * Copyright (C) 2007 Novell
7 * Copyright (C) 2007 Intel
8 *
9 * Authors:
10 * Dor Laor <dor.laor@qumranet.com>
11 * Gregory Haskins <ghaskins@novell.com>
12 * Yaozu (Eddie) Dong <eddie.dong@intel.com>
13 *
14 * Based on Xen 3.1 code, Copyright (c) 2004, Intel Corporation.
15 *
16 * This work is licensed under the terms of the GNU GPL, version 2. See
17 * the COPYING file in the top-level directory.
18 */
19
20#include "kvm.h"
21#include "x86.h"
22
23#include <linux/kvm.h>
24#include <linux/mm.h>
25#include <linux/highmem.h>
26#include <linux/smp.h>
27#include <linux/hrtimer.h>
28#include <linux/io.h>
29#include <linux/module.h>
30#include <asm/processor.h>
31#include <asm/msr.h>
32#include <asm/page.h>
33#include <asm/current.h>
34#include <asm/apicdef.h>
35#include <asm/atomic.h>
36#include <asm/div64.h>
37#include "irq.h"
38
39#define PRId64 "d"
40#define PRIx64 "llx"
41#define PRIu64 "u"
42#define PRIo64 "o"
43
44#define APIC_BUS_CYCLE_NS 1
45
46/* #define apic_debug(fmt,arg...) printk(KERN_WARNING fmt,##arg) */
47#define apic_debug(fmt, arg...)
48
49#define APIC_LVT_NUM 6
50/* 14 is the version for Xeon and Pentium 8.4.8*/
51#define APIC_VERSION (0x14UL | ((APIC_LVT_NUM - 1) << 16))
52#define LAPIC_MMIO_LENGTH (1 << 12)
53/* followed define is not in apicdef.h */
54#define APIC_SHORT_MASK 0xc0000
55#define APIC_DEST_NOSHORT 0x0
56#define APIC_DEST_MASK 0x800
57#define MAX_APIC_VECTOR 256
58
59#define VEC_POS(v) ((v) & (32 - 1))
60#define REG_POS(v) (((v) >> 5) << 4)
61
62static inline u32 apic_get_reg(struct kvm_lapic *apic, int reg_off)
63{
64 return *((u32 *) (apic->regs + reg_off));
65}
66
67static inline void apic_set_reg(struct kvm_lapic *apic, int reg_off, u32 val)
68{
69 *((u32 *) (apic->regs + reg_off)) = val;
70}
71
72static inline int apic_test_and_set_vector(int vec, void *bitmap)
73{
74 return test_and_set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
75}
76
77static inline int apic_test_and_clear_vector(int vec, void *bitmap)
78{
79 return test_and_clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
80}
81
82static inline void apic_set_vector(int vec, void *bitmap)
83{
84 set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
85}
86
87static inline void apic_clear_vector(int vec, void *bitmap)
88{
89 clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
90}
91
92static inline int apic_hw_enabled(struct kvm_lapic *apic)
93{
94 return (apic)->vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE;
95}
96
97static inline int apic_sw_enabled(struct kvm_lapic *apic)
98{
99 return apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_APIC_ENABLED;
100}
101
102static inline int apic_enabled(struct kvm_lapic *apic)
103{
104 return apic_sw_enabled(apic) && apic_hw_enabled(apic);
105}
106
107#define LVT_MASK \
108 (APIC_LVT_MASKED | APIC_SEND_PENDING | APIC_VECTOR_MASK)
109
110#define LINT_MASK \
111 (LVT_MASK | APIC_MODE_MASK | APIC_INPUT_POLARITY | \
112 APIC_LVT_REMOTE_IRR | APIC_LVT_LEVEL_TRIGGER)
113
114static inline int kvm_apic_id(struct kvm_lapic *apic)
115{
116 return (apic_get_reg(apic, APIC_ID) >> 24) & 0xff;
117}
118
119static inline int apic_lvt_enabled(struct kvm_lapic *apic, int lvt_type)
120{
121 return !(apic_get_reg(apic, lvt_type) & APIC_LVT_MASKED);
122}
123
124static inline int apic_lvt_vector(struct kvm_lapic *apic, int lvt_type)
125{
126 return apic_get_reg(apic, lvt_type) & APIC_VECTOR_MASK;
127}
128
129static inline int apic_lvtt_period(struct kvm_lapic *apic)
130{
131 return apic_get_reg(apic, APIC_LVTT) & APIC_LVT_TIMER_PERIODIC;
132}
133
134static unsigned int apic_lvt_mask[APIC_LVT_NUM] = {
135 LVT_MASK | APIC_LVT_TIMER_PERIODIC, /* LVTT */
136 LVT_MASK | APIC_MODE_MASK, /* LVTTHMR */
137 LVT_MASK | APIC_MODE_MASK, /* LVTPC */
138 LINT_MASK, LINT_MASK, /* LVT0-1 */
139 LVT_MASK /* LVTERR */
140};
141
142static int find_highest_vector(void *bitmap)
143{
144 u32 *word = bitmap;
145 int word_offset = MAX_APIC_VECTOR >> 5;
146
147 while ((word_offset != 0) && (word[(--word_offset) << 2] == 0))
148 continue;
149
150 if (likely(!word_offset && !word[0]))
151 return -1;
152 else
153 return fls(word[word_offset << 2]) - 1 + (word_offset << 5);
154}
155
156static inline int apic_test_and_set_irr(int vec, struct kvm_lapic *apic)
157{
158 return apic_test_and_set_vector(vec, apic->regs + APIC_IRR);
159}
160
161static inline void apic_clear_irr(int vec, struct kvm_lapic *apic)
162{
163 apic_clear_vector(vec, apic->regs + APIC_IRR);
164}
165
166static inline int apic_find_highest_irr(struct kvm_lapic *apic)
167{
168 int result;
169
170 result = find_highest_vector(apic->regs + APIC_IRR);
171 ASSERT(result == -1 || result >= 16);
172
173 return result;
174}
175
176int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu)
177{
178 struct kvm_lapic *apic = vcpu->arch.apic;
179 int highest_irr;
180
181 if (!apic)
182 return 0;
183 highest_irr = apic_find_highest_irr(apic);
184
185 return highest_irr;
186}
187EXPORT_SYMBOL_GPL(kvm_lapic_find_highest_irr);
188
189int kvm_apic_set_irq(struct kvm_vcpu *vcpu, u8 vec, u8 trig)
190{
191 struct kvm_lapic *apic = vcpu->arch.apic;
192
193 if (!apic_test_and_set_irr(vec, apic)) {
194 /* a new pending irq is set in IRR */
195 if (trig)
196 apic_set_vector(vec, apic->regs + APIC_TMR);
197 else
198 apic_clear_vector(vec, apic->regs + APIC_TMR);
199 kvm_vcpu_kick(apic->vcpu);
200 return 1;
201 }
202 return 0;
203}
204
205static inline int apic_find_highest_isr(struct kvm_lapic *apic)
206{
207 int result;
208
209 result = find_highest_vector(apic->regs + APIC_ISR);
210 ASSERT(result == -1 || result >= 16);
211
212 return result;
213}
214
215static void apic_update_ppr(struct kvm_lapic *apic)
216{
217 u32 tpr, isrv, ppr;
218 int isr;
219
220 tpr = apic_get_reg(apic, APIC_TASKPRI);
221 isr = apic_find_highest_isr(apic);
222 isrv = (isr != -1) ? isr : 0;
223
224 if ((tpr & 0xf0) >= (isrv & 0xf0))
225 ppr = tpr & 0xff;
226 else
227 ppr = isrv & 0xf0;
228
229 apic_debug("vlapic %p, ppr 0x%x, isr 0x%x, isrv 0x%x",
230 apic, ppr, isr, isrv);
231
232 apic_set_reg(apic, APIC_PROCPRI, ppr);
233}
234
235static void apic_set_tpr(struct kvm_lapic *apic, u32 tpr)
236{
237 apic_set_reg(apic, APIC_TASKPRI, tpr);
238 apic_update_ppr(apic);
239}
240
241int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest)
242{
243 return kvm_apic_id(apic) == dest;
244}
245
246int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda)
247{
248 int result = 0;
249 u8 logical_id;
250
251 logical_id = GET_APIC_LOGICAL_ID(apic_get_reg(apic, APIC_LDR));
252
253 switch (apic_get_reg(apic, APIC_DFR)) {
254 case APIC_DFR_FLAT:
255 if (logical_id & mda)
256 result = 1;
257 break;
258 case APIC_DFR_CLUSTER:
259 if (((logical_id >> 4) == (mda >> 0x4))
260 && (logical_id & mda & 0xf))
261 result = 1;
262 break;
263 default:
264 printk(KERN_WARNING "Bad DFR vcpu %d: %08x\n",
265 apic->vcpu->vcpu_id, apic_get_reg(apic, APIC_DFR));
266 break;
267 }
268
269 return result;
270}
271
272static int apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
273 int short_hand, int dest, int dest_mode)
274{
275 int result = 0;
276 struct kvm_lapic *target = vcpu->arch.apic;
277
278 apic_debug("target %p, source %p, dest 0x%x, "
279 "dest_mode 0x%x, short_hand 0x%x",
280 target, source, dest, dest_mode, short_hand);
281
282 ASSERT(!target);
283 switch (short_hand) {
284 case APIC_DEST_NOSHORT:
285 if (dest_mode == 0) {
286 /* Physical mode. */
287 if ((dest == 0xFF) || (dest == kvm_apic_id(target)))
288 result = 1;
289 } else
290 /* Logical mode. */
291 result = kvm_apic_match_logical_addr(target, dest);
292 break;
293 case APIC_DEST_SELF:
294 if (target == source)
295 result = 1;
296 break;
297 case APIC_DEST_ALLINC:
298 result = 1;
299 break;
300 case APIC_DEST_ALLBUT:
301 if (target != source)
302 result = 1;
303 break;
304 default:
305 printk(KERN_WARNING "Bad dest shorthand value %x\n",
306 short_hand);
307 break;
308 }
309
310 return result;
311}
312
313/*
314 * Add a pending IRQ into lapic.
315 * Return 1 if successfully added and 0 if discarded.
316 */
317static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
318 int vector, int level, int trig_mode)
319{
320 int orig_irr, result = 0;
321 struct kvm_vcpu *vcpu = apic->vcpu;
322
323 switch (delivery_mode) {
324 case APIC_DM_FIXED:
325 case APIC_DM_LOWEST:
326 /* FIXME add logic for vcpu on reset */
327 if (unlikely(!apic_enabled(apic)))
328 break;
329
330 orig_irr = apic_test_and_set_irr(vector, apic);
331 if (orig_irr && trig_mode) {
332 apic_debug("level trig mode repeatedly for vector %d",
333 vector);
334 break;
335 }
336
337 if (trig_mode) {
338 apic_debug("level trig mode for vector %d", vector);
339 apic_set_vector(vector, apic->regs + APIC_TMR);
340 } else
341 apic_clear_vector(vector, apic->regs + APIC_TMR);
342
343 if (vcpu->arch.mp_state == VCPU_MP_STATE_RUNNABLE)
344 kvm_vcpu_kick(vcpu);
345 else if (vcpu->arch.mp_state == VCPU_MP_STATE_HALTED) {
346 vcpu->arch.mp_state = VCPU_MP_STATE_RUNNABLE;
347 if (waitqueue_active(&vcpu->wq))
348 wake_up_interruptible(&vcpu->wq);
349 }
350
351 result = (orig_irr == 0);
352 break;
353
354 case APIC_DM_REMRD:
355 printk(KERN_DEBUG "Ignoring delivery mode 3\n");
356 break;
357
358 case APIC_DM_SMI:
359 printk(KERN_DEBUG "Ignoring guest SMI\n");
360 break;
361 case APIC_DM_NMI:
362 printk(KERN_DEBUG "Ignoring guest NMI\n");
363 break;
364
365 case APIC_DM_INIT:
366 if (level) {
367 if (vcpu->arch.mp_state == VCPU_MP_STATE_RUNNABLE)
368 printk(KERN_DEBUG
369 "INIT on a runnable vcpu %d\n",
370 vcpu->vcpu_id);
371 vcpu->arch.mp_state = VCPU_MP_STATE_INIT_RECEIVED;
372 kvm_vcpu_kick(vcpu);
373 } else {
374 printk(KERN_DEBUG
375 "Ignoring de-assert INIT to vcpu %d\n",
376 vcpu->vcpu_id);
377 }
378
379 break;
380
381 case APIC_DM_STARTUP:
382 printk(KERN_DEBUG "SIPI to vcpu %d vector 0x%02x\n",
383 vcpu->vcpu_id, vector);
384 if (vcpu->arch.mp_state == VCPU_MP_STATE_INIT_RECEIVED) {
385 vcpu->arch.sipi_vector = vector;
386 vcpu->arch.mp_state = VCPU_MP_STATE_SIPI_RECEIVED;
387 if (waitqueue_active(&vcpu->wq))
388 wake_up_interruptible(&vcpu->wq);
389 }
390 break;
391
392 default:
393 printk(KERN_ERR "TODO: unsupported delivery mode %x\n",
394 delivery_mode);
395 break;
396 }
397 return result;
398}
399
400static struct kvm_lapic *kvm_apic_round_robin(struct kvm *kvm, u8 vector,
401 unsigned long bitmap)
402{
403 int last;
404 int next;
405 struct kvm_lapic *apic = NULL;
406
407 last = kvm->arch.round_robin_prev_vcpu;
408 next = last;
409
410 do {
411 if (++next == KVM_MAX_VCPUS)
412 next = 0;
413 if (kvm->vcpus[next] == NULL || !test_bit(next, &bitmap))
414 continue;
415 apic = kvm->vcpus[next]->arch.apic;
416 if (apic && apic_enabled(apic))
417 break;
418 apic = NULL;
419 } while (next != last);
420 kvm->arch.round_robin_prev_vcpu = next;
421
422 if (!apic)
423 printk(KERN_DEBUG "vcpu not ready for apic_round_robin\n");
424
425 return apic;
426}
427
428struct kvm_vcpu *kvm_get_lowest_prio_vcpu(struct kvm *kvm, u8 vector,
429 unsigned long bitmap)
430{
431 struct kvm_lapic *apic;
432
433 apic = kvm_apic_round_robin(kvm, vector, bitmap);
434 if (apic)
435 return apic->vcpu;
436 return NULL;
437}
438
439static void apic_set_eoi(struct kvm_lapic *apic)
440{
441 int vector = apic_find_highest_isr(apic);
442
443 /*
444 * Not every EOI write has a corresponding ISR vector set;
445 * one example is when the kernel checks the timer in setup_IO_APIC.
446 */
447 if (vector == -1)
448 return;
449
450 apic_clear_vector(vector, apic->regs + APIC_ISR);
451 apic_update_ppr(apic);
452
453 if (apic_test_and_clear_vector(vector, apic->regs + APIC_TMR))
454 kvm_ioapic_update_eoi(apic->vcpu->kvm, vector);
455}
456
457static void apic_send_ipi(struct kvm_lapic *apic)
458{
459 u32 icr_low = apic_get_reg(apic, APIC_ICR);
460 u32 icr_high = apic_get_reg(apic, APIC_ICR2);
461
462 unsigned int dest = GET_APIC_DEST_FIELD(icr_high);
463 unsigned int short_hand = icr_low & APIC_SHORT_MASK;
464 unsigned int trig_mode = icr_low & APIC_INT_LEVELTRIG;
465 unsigned int level = icr_low & APIC_INT_ASSERT;
466 unsigned int dest_mode = icr_low & APIC_DEST_MASK;
467 unsigned int delivery_mode = icr_low & APIC_MODE_MASK;
468 unsigned int vector = icr_low & APIC_VECTOR_MASK;
469
470 struct kvm_vcpu *target;
471 struct kvm_vcpu *vcpu;
472 unsigned long lpr_map = 0;
473 int i;
474
475 apic_debug("icr_high 0x%x, icr_low 0x%x, "
476 "short_hand 0x%x, dest 0x%x, trig_mode 0x%x, level 0x%x, "
477 "dest_mode 0x%x, delivery_mode 0x%x, vector 0x%x\n",
478 icr_high, icr_low, short_hand, dest,
479 trig_mode, level, dest_mode, delivery_mode, vector);
480
481 for (i = 0; i < KVM_MAX_VCPUS; i++) {
482 vcpu = apic->vcpu->kvm->vcpus[i];
483 if (!vcpu)
484 continue;
485
486 if (vcpu->arch.apic &&
487 apic_match_dest(vcpu, apic, short_hand, dest, dest_mode)) {
488 if (delivery_mode == APIC_DM_LOWEST)
489 set_bit(vcpu->vcpu_id, &lpr_map);
490 else
491 __apic_accept_irq(vcpu->arch.apic, delivery_mode,
492 vector, level, trig_mode);
493 }
494 }
495
496 if (delivery_mode == APIC_DM_LOWEST) {
497 target = kvm_get_lowest_prio_vcpu(vcpu->kvm, vector, lpr_map);
498 if (target != NULL)
499 __apic_accept_irq(target->arch.apic, delivery_mode,
500 vector, level, trig_mode);
501 }
502}
503
504static u32 apic_get_tmcct(struct kvm_lapic *apic)
505{
506 u64 counter_passed;
507 ktime_t passed, now;
508 u32 tmcct;
509
510 ASSERT(apic != NULL);
511
512 now = apic->timer.dev.base->get_time();
513 tmcct = apic_get_reg(apic, APIC_TMICT);
514
515 /* if initial count is 0, current count should also be 0 */
516 if (tmcct == 0)
517 return 0;
518
519 if (unlikely(ktime_to_ns(now) <=
520 ktime_to_ns(apic->timer.last_update))) {
521 /* Wrap around */
522 passed = ktime_add(( {
523 (ktime_t) {
524 .tv64 = KTIME_MAX -
525 (apic->timer.last_update).tv64}; }
526 ), now);
527 apic_debug("time elapsed\n");
528 } else
529 passed = ktime_sub(now, apic->timer.last_update);
530
531 counter_passed = div64_64(ktime_to_ns(passed),
532 (APIC_BUS_CYCLE_NS * apic->timer.divide_count));
533
534 if (counter_passed > tmcct) {
535 if (unlikely(!apic_lvtt_period(apic))) {
536 /* one-shot timers stick at 0 until reset */
537 tmcct = 0;
538 } else {
539 /*
540 * periodic timers reload APIC_TMICT when they hit 0.
541 * The while loop simulates this happening N times.
542 * (counter_passed %= tmcct) would also work, but a 64-bit
543 * modulo may be slower, or unavailable, on 32-bit hosts.
544 */
545 while (counter_passed > tmcct)
546 counter_passed -= tmcct;
547 tmcct -= counter_passed;
548 }
549 } else {
550 tmcct -= counter_passed;
551 }
552
553 return tmcct;
554}
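/*
 * Illustrative example of the calculation above (hypothetical values):
 * with APIC_TMICT = 1000 and counter_passed = 1500, a one-shot timer
 * sticks at TMCCT = 0, while a periodic timer first reduces
 * counter_passed to 500 (1500 - 1000) and reports TMCCT = 500.
 */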
555
556static u32 __apic_read(struct kvm_lapic *apic, unsigned int offset)
557{
558 u32 val = 0;
559
560 if (offset >= LAPIC_MMIO_LENGTH)
561 return 0;
562
563 switch (offset) {
564 case APIC_ARBPRI:
565 printk(KERN_WARNING "Access APIC ARBPRI register "
566 "which is for P6\n");
567 break;
568
569 case APIC_TMCCT: /* Timer CCR */
570 val = apic_get_tmcct(apic);
571 break;
572
573 default:
574 apic_update_ppr(apic);
575 val = apic_get_reg(apic, offset);
576 break;
577 }
578
579 return val;
580}
581
582static void apic_mmio_read(struct kvm_io_device *this,
583 gpa_t address, int len, void *data)
584{
585 struct kvm_lapic *apic = (struct kvm_lapic *)this->private;
586 unsigned int offset = address - apic->base_address;
587 unsigned char alignment = offset & 0xf;
588 u32 result;
589
590 if ((alignment + len) > 4) {
591 printk(KERN_ERR "KVM_APIC_READ: alignment error %lx %d",
592 (unsigned long)address, len);
593 return;
594 }
595 result = __apic_read(apic, offset & ~0xf);
596
597 switch (len) {
598 case 1:
599 case 2:
600 case 4:
601 memcpy(data, (char *)&result + alignment, len);
602 break;
603 default:
604 printk(KERN_ERR "Local APIC read with len = %x, "
605 "should be 1,2, or 4 instead\n", len);
606 break;
607 }
608}
609
610static void update_divide_count(struct kvm_lapic *apic)
611{
612 u32 tmp1, tmp2, tdcr;
613
614 tdcr = apic_get_reg(apic, APIC_TDCR);
615 tmp1 = tdcr & 0xf;
616 tmp2 = ((tmp1 & 0x3) | ((tmp1 & 0x8) >> 1)) + 1;
617 apic->timer.divide_count = 0x1 << (tmp2 & 0x7);
618
619 apic_debug("timer divide count is 0x%x\n",
620 apic->timer.divide_count);
621}
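/*
 * Illustrative examples of the decoding above (values chosen for
 * illustration): TDCR = 0x0 gives tmp2 = 1 and divide_count = 2;
 * TDCR = 0x3 gives tmp2 = 4 and divide_count = 16; TDCR = 0xb gives
 * tmp2 = 8, so (tmp2 & 0x7) = 0 and divide_count = 1.
 */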
622
623static void start_apic_timer(struct kvm_lapic *apic)
624{
625 ktime_t now = apic->timer.dev.base->get_time();
626
627 apic->timer.last_update = now;
628
629 apic->timer.period = apic_get_reg(apic, APIC_TMICT) *
630 APIC_BUS_CYCLE_NS * apic->timer.divide_count;
631 atomic_set(&apic->timer.pending, 0);
632 hrtimer_start(&apic->timer.dev,
633 ktime_add_ns(now, apic->timer.period),
634 HRTIMER_MODE_ABS);
635
636 apic_debug("%s: bus cycle is %" PRId64 "ns, now 0x%016"
637 PRIx64 ", "
638 "timer initial count 0x%x, period %lldns, "
639 "expire @ 0x%016" PRIx64 ".\n", __FUNCTION__,
640 APIC_BUS_CYCLE_NS, ktime_to_ns(now),
641 apic_get_reg(apic, APIC_TMICT),
642 apic->timer.period,
643 ktime_to_ns(ktime_add_ns(now,
644 apic->timer.period)));
645}
646
647static void apic_mmio_write(struct kvm_io_device *this,
648 gpa_t address, int len, const void *data)
649{
650 struct kvm_lapic *apic = (struct kvm_lapic *)this->private;
651 unsigned int offset = address - apic->base_address;
652 unsigned char alignment = offset & 0xf;
653 u32 val;
654
655 /*
656 * APIC registers must be aligned on a 128-bit boundary.
657 * 32/64/128-bit registers must be accessed with 32-bit loads and stores.
658 * Refer to SDM 8.4.1.
659 */
660 if (len != 4 || alignment) {
661 if (printk_ratelimit())
662 printk(KERN_ERR "apic write: bad size=%d %lx\n",
663 len, (long)address);
664 return;
665 }
666
667 val = *(u32 *) data;
668
669 /* skip the debug print for EOI writes; they are too common */
670 if (offset != APIC_EOI)
671 apic_debug("%s: offset 0x%x with length 0x%x, and value is "
672 "0x%x\n", __FUNCTION__, offset, len, val);
673
674 offset &= 0xff0;
675
676 switch (offset) {
677 case APIC_ID: /* Local APIC ID */
678 apic_set_reg(apic, APIC_ID, val);
679 break;
680
681 case APIC_TASKPRI:
682 apic_set_tpr(apic, val & 0xff);
683 break;
684
685 case APIC_EOI:
686 apic_set_eoi(apic);
687 break;
688
689 case APIC_LDR:
690 apic_set_reg(apic, APIC_LDR, val & APIC_LDR_MASK);
691 break;
692
693 case APIC_DFR:
694 apic_set_reg(apic, APIC_DFR, val | 0x0FFFFFFF);
695 break;
696
697 case APIC_SPIV:
698 apic_set_reg(apic, APIC_SPIV, val & 0x3ff);
699 if (!(val & APIC_SPIV_APIC_ENABLED)) {
700 int i;
701 u32 lvt_val;
702
703 for (i = 0; i < APIC_LVT_NUM; i++) {
704 lvt_val = apic_get_reg(apic,
705 APIC_LVTT + 0x10 * i);
706 apic_set_reg(apic, APIC_LVTT + 0x10 * i,
707 lvt_val | APIC_LVT_MASKED);
708 }
709 atomic_set(&apic->timer.pending, 0);
710
711 }
712 break;
713
714 case APIC_ICR:
715 /* No delay here, so we always clear the pending bit */
716 apic_set_reg(apic, APIC_ICR, val & ~(1 << 12));
717 apic_send_ipi(apic);
718 break;
719
720 case APIC_ICR2:
721 apic_set_reg(apic, APIC_ICR2, val & 0xff000000);
722 break;
723
724 case APIC_LVTT:
725 case APIC_LVTTHMR:
726 case APIC_LVTPC:
727 case APIC_LVT0:
728 case APIC_LVT1:
729 case APIC_LVTERR:
730 /* TODO: Check vector */
731 if (!apic_sw_enabled(apic))
732 val |= APIC_LVT_MASKED;
733
734 val &= apic_lvt_mask[(offset - APIC_LVTT) >> 4];
735 apic_set_reg(apic, offset, val);
736
737 break;
738
739 case APIC_TMICT:
740 hrtimer_cancel(&apic->timer.dev);
741 apic_set_reg(apic, APIC_TMICT, val);
742 start_apic_timer(apic);
743 return;
744
745 case APIC_TDCR:
746 if (val & 4)
747 printk(KERN_ERR "KVM_WRITE:TDCR %x\n", val);
748 apic_set_reg(apic, APIC_TDCR, val);
749 update_divide_count(apic);
750 break;
751
752 default:
753 apic_debug("Local APIC Write to read-only register %x\n",
754 offset);
755 break;
756 }
757
758}
759
760static int apic_mmio_range(struct kvm_io_device *this, gpa_t addr)
761{
762 struct kvm_lapic *apic = (struct kvm_lapic *)this->private;
763 int ret = 0;
764
765
766 if (apic_hw_enabled(apic) &&
767 (addr >= apic->base_address) &&
768 (addr < (apic->base_address + LAPIC_MMIO_LENGTH)))
769 ret = 1;
770
771 return ret;
772}
773
774void kvm_free_lapic(struct kvm_vcpu *vcpu)
775{
776 if (!vcpu->arch.apic)
777 return;
778
779 hrtimer_cancel(&vcpu->arch.apic->timer.dev);
780
781 if (vcpu->arch.apic->regs_page)
782 __free_page(vcpu->arch.apic->regs_page);
783
784 kfree(vcpu->arch.apic);
785}
786
787/*
788 *----------------------------------------------------------------------
789 * LAPIC interface
790 *----------------------------------------------------------------------
791 */
792
793void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8)
794{
795 struct kvm_lapic *apic = vcpu->arch.apic;
796
797 if (!apic)
798 return;
799 apic_set_tpr(apic, ((cr8 & 0x0f) << 4));
800}
801
802u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu)
803{
804 struct kvm_lapic *apic = vcpu->arch.apic;
805 u64 tpr;
806
807 if (!apic)
808 return 0;
809 tpr = (u64) apic_get_reg(apic, APIC_TASKPRI);
810
811 return (tpr & 0xf0) >> 4;
812}
813EXPORT_SYMBOL_GPL(kvm_lapic_get_cr8);
814
815void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
816{
817 struct kvm_lapic *apic = vcpu->arch.apic;
818
819 if (!apic) {
820 value |= MSR_IA32_APICBASE_BSP;
821 vcpu->arch.apic_base = value;
822 return;
823 }
824 if (apic->vcpu->vcpu_id)
825 value &= ~MSR_IA32_APICBASE_BSP;
826
827 vcpu->arch.apic_base = value;
828 apic->base_address = apic->vcpu->arch.apic_base &
829 MSR_IA32_APICBASE_BASE;
830
831 /* with FSB-delivered interrupts, APIC functionality can be restarted */
832 apic_debug("apic base msr is 0x%016" PRIx64 ", and base address is "
833 "0x%lx.\n", apic->vcpu->arch.apic_base, apic->base_address);
834
835}
836
837u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu)
838{
839 return vcpu->arch.apic_base;
840}
841EXPORT_SYMBOL_GPL(kvm_lapic_get_base);
842
843void kvm_lapic_reset(struct kvm_vcpu *vcpu)
844{
845 struct kvm_lapic *apic;
846 int i;
847
848 apic_debug("%s\n", __FUNCTION__);
849
850 ASSERT(vcpu);
851 apic = vcpu->arch.apic;
852 ASSERT(apic != NULL);
853
854 /* Stop the timer in case it's a reset to an active apic */
855 hrtimer_cancel(&apic->timer.dev);
856
857 apic_set_reg(apic, APIC_ID, vcpu->vcpu_id << 24);
858 apic_set_reg(apic, APIC_LVR, APIC_VERSION);
859
860 for (i = 0; i < APIC_LVT_NUM; i++)
861 apic_set_reg(apic, APIC_LVTT + 0x10 * i, APIC_LVT_MASKED);
862 apic_set_reg(apic, APIC_LVT0,
863 SET_APIC_DELIVERY_MODE(0, APIC_MODE_EXTINT));
864
865 apic_set_reg(apic, APIC_DFR, 0xffffffffU);
866 apic_set_reg(apic, APIC_SPIV, 0xff);
867 apic_set_reg(apic, APIC_TASKPRI, 0);
868 apic_set_reg(apic, APIC_LDR, 0);
869 apic_set_reg(apic, APIC_ESR, 0);
870 apic_set_reg(apic, APIC_ICR, 0);
871 apic_set_reg(apic, APIC_ICR2, 0);
872 apic_set_reg(apic, APIC_TDCR, 0);
873 apic_set_reg(apic, APIC_TMICT, 0);
874 for (i = 0; i < 8; i++) {
875 apic_set_reg(apic, APIC_IRR + 0x10 * i, 0);
876 apic_set_reg(apic, APIC_ISR + 0x10 * i, 0);
877 apic_set_reg(apic, APIC_TMR + 0x10 * i, 0);
878 }
879 update_divide_count(apic);
880 atomic_set(&apic->timer.pending, 0);
881 if (vcpu->vcpu_id == 0)
882 vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP;
883 apic_update_ppr(apic);
884
885 apic_debug(KERN_INFO "%s: vcpu=%p, id=%d, base_msr="
886 "0x%016" PRIx64 ", base_address=0x%0lx.\n", __FUNCTION__,
887 vcpu, kvm_apic_id(apic),
888 vcpu->arch.apic_base, apic->base_address);
889}
890EXPORT_SYMBOL_GPL(kvm_lapic_reset);
891
892int kvm_lapic_enabled(struct kvm_vcpu *vcpu)
893{
894 struct kvm_lapic *apic = vcpu->arch.apic;
895 int ret = 0;
896
897 if (!apic)
898 return 0;
899 ret = apic_enabled(apic);
900
901 return ret;
902}
903EXPORT_SYMBOL_GPL(kvm_lapic_enabled);
904
905/*
906 *----------------------------------------------------------------------
907 * timer interface
908 *----------------------------------------------------------------------
909 */
910
911/* TODO: make sure __apic_timer_fn runs on the current pCPU */
912static int __apic_timer_fn(struct kvm_lapic *apic)
913{
914 int result = 0;
915 wait_queue_head_t *q = &apic->vcpu->wq;
916
917 atomic_inc(&apic->timer.pending);
918 if (waitqueue_active(q)) {
919 apic->vcpu->arch.mp_state = VCPU_MP_STATE_RUNNABLE;
920 wake_up_interruptible(q);
921 }
922 if (apic_lvtt_period(apic)) {
923 result = 1;
924 apic->timer.dev.expires = ktime_add_ns(
925 apic->timer.dev.expires,
926 apic->timer.period);
927 }
928 return result;
929}
930
931static int __inject_apic_timer_irq(struct kvm_lapic *apic)
932{
933 int vector;
934
935 vector = apic_lvt_vector(apic, APIC_LVTT);
936 return __apic_accept_irq(apic, APIC_DM_FIXED, vector, 1, 0);
937}
938
939static enum hrtimer_restart apic_timer_fn(struct hrtimer *data)
940{
941 struct kvm_lapic *apic;
942 int restart_timer = 0;
943
944 apic = container_of(data, struct kvm_lapic, timer.dev);
945
946 restart_timer = __apic_timer_fn(apic);
947
948 if (restart_timer)
949 return HRTIMER_RESTART;
950 else
951 return HRTIMER_NORESTART;
952}
953
954int kvm_create_lapic(struct kvm_vcpu *vcpu)
955{
956 struct kvm_lapic *apic;
957
958 ASSERT(vcpu != NULL);
959 apic_debug("apic_init %d\n", vcpu->vcpu_id);
960
961 apic = kzalloc(sizeof(*apic), GFP_KERNEL);
962 if (!apic)
963 goto nomem;
964
965 vcpu->arch.apic = apic;
966
967 apic->regs_page = alloc_page(GFP_KERNEL);
968 if (apic->regs_page == NULL) {
969 printk(KERN_ERR "malloc apic regs error for vcpu %x\n",
970 vcpu->vcpu_id);
971 goto nomem_free_apic;
972 }
973 apic->regs = page_address(apic->regs_page);
974 memset(apic->regs, 0, PAGE_SIZE);
975 apic->vcpu = vcpu;
976
977 hrtimer_init(&apic->timer.dev, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
978 apic->timer.dev.function = apic_timer_fn;
979 apic->base_address = APIC_DEFAULT_PHYS_BASE;
980 vcpu->arch.apic_base = APIC_DEFAULT_PHYS_BASE;
981
982 kvm_lapic_reset(vcpu);
983 apic->dev.read = apic_mmio_read;
984 apic->dev.write = apic_mmio_write;
985 apic->dev.in_range = apic_mmio_range;
986 apic->dev.private = apic;
987
988 return 0;
989nomem_free_apic:
990 kfree(apic);
991nomem:
992 return -ENOMEM;
993}
994EXPORT_SYMBOL_GPL(kvm_create_lapic);
995
996int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu)
997{
998 struct kvm_lapic *apic = vcpu->arch.apic;
999 int highest_irr;
1000
1001 if (!apic || !apic_enabled(apic))
1002 return -1;
1003
1004 apic_update_ppr(apic);
1005 highest_irr = apic_find_highest_irr(apic);
1006 if ((highest_irr == -1) ||
1007 ((highest_irr & 0xF0) <= apic_get_reg(apic, APIC_PROCPRI)))
1008 return -1;
1009 return highest_irr;
1010}
1011
1012int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu)
1013{
1014 u32 lvt0 = apic_get_reg(vcpu->arch.apic, APIC_LVT0);
1015 int r = 0;
1016
1017 if (vcpu->vcpu_id == 0) {
1018 if (!apic_hw_enabled(vcpu->arch.apic))
1019 r = 1;
1020 if ((lvt0 & APIC_LVT_MASKED) == 0 &&
1021 GET_APIC_DELIVERY_MODE(lvt0) == APIC_MODE_EXTINT)
1022 r = 1;
1023 }
1024 return r;
1025}
1026
1027void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu)
1028{
1029 struct kvm_lapic *apic = vcpu->arch.apic;
1030
1031 if (apic && apic_lvt_enabled(apic, APIC_LVTT) &&
1032 atomic_read(&apic->timer.pending) > 0) {
1033 if (__inject_apic_timer_irq(apic))
1034 atomic_dec(&apic->timer.pending);
1035 }
1036}
1037
1038void kvm_apic_timer_intr_post(struct kvm_vcpu *vcpu, int vec)
1039{
1040 struct kvm_lapic *apic = vcpu->arch.apic;
1041
1042 if (apic && apic_lvt_vector(apic, APIC_LVTT) == vec)
1043 apic->timer.last_update = ktime_add_ns(
1044 apic->timer.last_update,
1045 apic->timer.period);
1046}
1047
1048int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu)
1049{
1050 int vector = kvm_apic_has_interrupt(vcpu);
1051 struct kvm_lapic *apic = vcpu->arch.apic;
1052
1053 if (vector == -1)
1054 return -1;
1055
1056 apic_set_vector(vector, apic->regs + APIC_ISR);
1057 apic_update_ppr(apic);
1058 apic_clear_irr(vector, apic);
1059 return vector;
1060}
1061
1062void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu)
1063{
1064 struct kvm_lapic *apic = vcpu->arch.apic;
1065
1066 apic->base_address = vcpu->arch.apic_base &
1067 MSR_IA32_APICBASE_BASE;
1068 apic_set_reg(apic, APIC_LVR, APIC_VERSION);
1069 apic_update_ppr(apic);
1070 hrtimer_cancel(&apic->timer.dev);
1071 update_divide_count(apic);
1072 start_apic_timer(apic);
1073}
1074
1075void kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
1076{
1077 struct kvm_lapic *apic = vcpu->arch.apic;
1078 struct hrtimer *timer;
1079
1080 if (!apic)
1081 return;
1082
1083 timer = &apic->timer.dev;
1084 if (hrtimer_cancel(timer))
1085 hrtimer_start(timer, timer->expires, HRTIMER_MODE_ABS);
1086}
1087EXPORT_SYMBOL_GPL(kvm_migrate_apic_timer);
diff --git a/drivers/kvm/mmu.c b/drivers/kvm/mmu.c
deleted file mode 100644
index c26d83f86a3a..000000000000
--- a/drivers/kvm/mmu.c
+++ /dev/null
@@ -1,1806 +0,0 @@
1/*
2 * Kernel-based Virtual Machine driver for Linux
3 *
4 * This module enables machines with Intel VT-x extensions to run virtual
5 * machines without emulation or binary translation.
6 *
7 * MMU support
8 *
9 * Copyright (C) 2006 Qumranet, Inc.
10 *
11 * Authors:
12 * Yaniv Kamay <yaniv@qumranet.com>
13 * Avi Kivity <avi@qumranet.com>
14 *
15 * This work is licensed under the terms of the GNU GPL, version 2. See
16 * the COPYING file in the top-level directory.
17 *
18 */
19
20#include "vmx.h"
21#include "kvm.h"
22#include "x86.h"
23#include "mmu.h"
24
25#include <linux/types.h>
26#include <linux/string.h>
27#include <linux/mm.h>
28#include <linux/highmem.h>
29#include <linux/module.h>
30#include <linux/swap.h>
31
32#include <asm/page.h>
33#include <asm/cmpxchg.h>
34#include <asm/io.h>
35
36#undef MMU_DEBUG
37
38#undef AUDIT
39
40#ifdef AUDIT
41static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg);
42#else
43static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) {}
44#endif
45
46#ifdef MMU_DEBUG
47
48#define pgprintk(x...) do { if (dbg) printk(x); } while (0)
49#define rmap_printk(x...) do { if (dbg) printk(x); } while (0)
50
51#else
52
53#define pgprintk(x...) do { } while (0)
54#define rmap_printk(x...) do { } while (0)
55
56#endif
57
58#if defined(MMU_DEBUG) || defined(AUDIT)
59static int dbg = 1;
60#endif
61
62#ifndef MMU_DEBUG
63#define ASSERT(x) do { } while (0)
64#else
65#define ASSERT(x) \
66 if (!(x)) { \
67 printk(KERN_WARNING "assertion failed %s:%d: %s\n", \
68 __FILE__, __LINE__, #x); \
69 }
70#endif
71
72#define PT64_PT_BITS 9
73#define PT64_ENT_PER_PAGE (1 << PT64_PT_BITS)
74#define PT32_PT_BITS 10
75#define PT32_ENT_PER_PAGE (1 << PT32_PT_BITS)
76
77#define PT_WRITABLE_SHIFT 1
78
79#define PT_PRESENT_MASK (1ULL << 0)
80#define PT_WRITABLE_MASK (1ULL << PT_WRITABLE_SHIFT)
81#define PT_USER_MASK (1ULL << 2)
82#define PT_PWT_MASK (1ULL << 3)
83#define PT_PCD_MASK (1ULL << 4)
84#define PT_ACCESSED_MASK (1ULL << 5)
85#define PT_DIRTY_MASK (1ULL << 6)
86#define PT_PAGE_SIZE_MASK (1ULL << 7)
87#define PT_PAT_MASK (1ULL << 7)
88#define PT_GLOBAL_MASK (1ULL << 8)
89#define PT64_NX_SHIFT 63
90#define PT64_NX_MASK (1ULL << PT64_NX_SHIFT)
91
92#define PT_PAT_SHIFT 7
93#define PT_DIR_PAT_SHIFT 12
94#define PT_DIR_PAT_MASK (1ULL << PT_DIR_PAT_SHIFT)
95
96#define PT32_DIR_PSE36_SIZE 4
97#define PT32_DIR_PSE36_SHIFT 13
98#define PT32_DIR_PSE36_MASK \
99 (((1ULL << PT32_DIR_PSE36_SIZE) - 1) << PT32_DIR_PSE36_SHIFT)
100
101
102#define PT_FIRST_AVAIL_BITS_SHIFT 9
103#define PT64_SECOND_AVAIL_BITS_SHIFT 52
104
105#define PT_SHADOW_IO_MARK (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
106
107#define VALID_PAGE(x) ((x) != INVALID_PAGE)
108
109#define PT64_LEVEL_BITS 9
110
111#define PT64_LEVEL_SHIFT(level) \
112 (PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS)
113
114#define PT64_LEVEL_MASK(level) \
115 (((1ULL << PT64_LEVEL_BITS) - 1) << PT64_LEVEL_SHIFT(level))
116
117#define PT64_INDEX(address, level)\
118 (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1))
119
120
121#define PT32_LEVEL_BITS 10
122
123#define PT32_LEVEL_SHIFT(level) \
124 (PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS)
125
126#define PT32_LEVEL_MASK(level) \
127 (((1ULL << PT32_LEVEL_BITS) - 1) << PT32_LEVEL_SHIFT(level))
128
129#define PT32_INDEX(address, level)\
130 (((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1))
131
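/*
 * Illustrative example of the index macros above (hypothetical address):
 * PT64_LEVEL_SHIFT(4) = 12 + 3 * 9 = 39, so PT64_INDEX(addr, 4) is
 * (addr >> 39) & 0x1ff, and PT64_INDEX(addr, 1) is (addr >> 12) & 0x1ff.
 */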
132
133#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))
134#define PT64_DIR_BASE_ADDR_MASK \
135 (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + PT64_LEVEL_BITS)) - 1))
136
137#define PT32_BASE_ADDR_MASK PAGE_MASK
138#define PT32_DIR_BASE_ADDR_MASK \
139 (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1))
140
141#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \
142 | PT64_NX_MASK)
143
144#define PFERR_PRESENT_MASK (1U << 0)
145#define PFERR_WRITE_MASK (1U << 1)
146#define PFERR_USER_MASK (1U << 2)
147#define PFERR_FETCH_MASK (1U << 4)
148
149#define PT64_ROOT_LEVEL 4
150#define PT32_ROOT_LEVEL 2
151#define PT32E_ROOT_LEVEL 3
152
153#define PT_DIRECTORY_LEVEL 2
154#define PT_PAGE_TABLE_LEVEL 1
155
156#define RMAP_EXT 4
157
158#define ACC_EXEC_MASK 1
159#define ACC_WRITE_MASK PT_WRITABLE_MASK
160#define ACC_USER_MASK PT_USER_MASK
161#define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)
162
163struct kvm_rmap_desc {
164 u64 *shadow_ptes[RMAP_EXT];
165 struct kvm_rmap_desc *more;
166};
167
168static struct kmem_cache *pte_chain_cache;
169static struct kmem_cache *rmap_desc_cache;
170static struct kmem_cache *mmu_page_header_cache;
171
172static u64 __read_mostly shadow_trap_nonpresent_pte;
173static u64 __read_mostly shadow_notrap_nonpresent_pte;
174
175void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte)
176{
177 shadow_trap_nonpresent_pte = trap_pte;
178 shadow_notrap_nonpresent_pte = notrap_pte;
179}
180EXPORT_SYMBOL_GPL(kvm_mmu_set_nonpresent_ptes);
181
182static int is_write_protection(struct kvm_vcpu *vcpu)
183{
184 return vcpu->arch.cr0 & X86_CR0_WP;
185}
186
187static int is_cpuid_PSE36(void)
188{
189 return 1;
190}
191
192static int is_nx(struct kvm_vcpu *vcpu)
193{
194 return vcpu->arch.shadow_efer & EFER_NX;
195}
196
197static int is_present_pte(unsigned long pte)
198{
199 return pte & PT_PRESENT_MASK;
200}
201
202static int is_shadow_present_pte(u64 pte)
203{
204 pte &= ~PT_SHADOW_IO_MARK;
205 return pte != shadow_trap_nonpresent_pte
206 && pte != shadow_notrap_nonpresent_pte;
207}
208
209static int is_writeble_pte(unsigned long pte)
210{
211 return pte & PT_WRITABLE_MASK;
212}
213
214static int is_dirty_pte(unsigned long pte)
215{
216 return pte & PT_DIRTY_MASK;
217}
218
219static int is_io_pte(unsigned long pte)
220{
221 return pte & PT_SHADOW_IO_MARK;
222}
223
224static int is_rmap_pte(u64 pte)
225{
226 return pte != shadow_trap_nonpresent_pte
227 && pte != shadow_notrap_nonpresent_pte;
228}
229
230static gfn_t pse36_gfn_delta(u32 gpte)
231{
232 int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT;
233
234 return (gpte & PT32_DIR_PSE36_MASK) << shift;
235}
236
237static void set_shadow_pte(u64 *sptep, u64 spte)
238{
239#ifdef CONFIG_X86_64
240 set_64bit((unsigned long *)sptep, spte);
241#else
242 set_64bit((unsigned long long *)sptep, spte);
243#endif
244}
245
246static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
247 struct kmem_cache *base_cache, int min)
248{
249 void *obj;
250
251 if (cache->nobjs >= min)
252 return 0;
253 while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
254 obj = kmem_cache_zalloc(base_cache, GFP_KERNEL);
255 if (!obj)
256 return -ENOMEM;
257 cache->objects[cache->nobjs++] = obj;
258 }
259 return 0;
260}
261
262static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
263{
264 while (mc->nobjs)
265 kfree(mc->objects[--mc->nobjs]);
266}
267
268static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache,
269 int min)
270{
271 struct page *page;
272
273 if (cache->nobjs >= min)
274 return 0;
275 while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
276 page = alloc_page(GFP_KERNEL);
277 if (!page)
278 return -ENOMEM;
279 set_page_private(page, 0);
280 cache->objects[cache->nobjs++] = page_address(page);
281 }
282 return 0;
283}
284
285static void mmu_free_memory_cache_page(struct kvm_mmu_memory_cache *mc)
286{
287 while (mc->nobjs)
288 free_page((unsigned long)mc->objects[--mc->nobjs]);
289}
290
291static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu)
292{
293 int r;
294
295 kvm_mmu_free_some_pages(vcpu);
296 r = mmu_topup_memory_cache(&vcpu->arch.mmu_pte_chain_cache,
297 pte_chain_cache, 4);
298 if (r)
299 goto out;
300 r = mmu_topup_memory_cache(&vcpu->arch.mmu_rmap_desc_cache,
301 rmap_desc_cache, 1);
302 if (r)
303 goto out;
304 r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8);
305 if (r)
306 goto out;
307 r = mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache,
308 mmu_page_header_cache, 4);
309out:
310 return r;
311}
312
313static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
314{
315 mmu_free_memory_cache(&vcpu->arch.mmu_pte_chain_cache);
316 mmu_free_memory_cache(&vcpu->arch.mmu_rmap_desc_cache);
317 mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache);
318 mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache);
319}
320
321static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc,
322 size_t size)
323{
324 void *p;
325
326 BUG_ON(!mc->nobjs);
327 p = mc->objects[--mc->nobjs];
328 memset(p, 0, size);
329 return p;
330}
331
332static struct kvm_pte_chain *mmu_alloc_pte_chain(struct kvm_vcpu *vcpu)
333{
334 return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_chain_cache,
335 sizeof(struct kvm_pte_chain));
336}
337
338static void mmu_free_pte_chain(struct kvm_pte_chain *pc)
339{
340 kfree(pc);
341}
342
343static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu)
344{
345 return mmu_memory_cache_alloc(&vcpu->arch.mmu_rmap_desc_cache,
346 sizeof(struct kvm_rmap_desc));
347}
348
349static void mmu_free_rmap_desc(struct kvm_rmap_desc *rd)
350{
351 kfree(rd);
352}
353
354/*
355 * Take gfn and return the reverse mapping to it.
356 * Note: gfn must be unaliased before this function gets called.
357 */
358
359static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn)
360{
361 struct kvm_memory_slot *slot;
362
363 slot = gfn_to_memslot(kvm, gfn);
364 return &slot->rmap[gfn - slot->base_gfn];
365}
366
367/*
368 * Reverse mapping data structures:
369 *
370 * If bit zero of *rmapp is zero, then *rmapp points to the shadow page
371 * table entry that points to page_address(page).
372 *
373 * If bit zero of *rmapp is one, then (*rmapp & ~1) points to a struct
374 * kvm_rmap_desc containing more mappings.
375 */
376static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
377{
378 struct kvm_mmu_page *sp;
379 struct kvm_rmap_desc *desc;
380 unsigned long *rmapp;
381 int i;
382
383 if (!is_rmap_pte(*spte))
384 return;
385 gfn = unalias_gfn(vcpu->kvm, gfn);
386 sp = page_header(__pa(spte));
387 sp->gfns[spte - sp->spt] = gfn;
388 rmapp = gfn_to_rmap(vcpu->kvm, gfn);
389 if (!*rmapp) {
390 rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte);
391 *rmapp = (unsigned long)spte;
392 } else if (!(*rmapp & 1)) {
393 rmap_printk("rmap_add: %p %llx 1->many\n", spte, *spte);
394 desc = mmu_alloc_rmap_desc(vcpu);
395 desc->shadow_ptes[0] = (u64 *)*rmapp;
396 desc->shadow_ptes[1] = spte;
397 *rmapp = (unsigned long)desc | 1;
398 } else {
399 rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte);
400 desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
401 while (desc->shadow_ptes[RMAP_EXT-1] && desc->more)
402 desc = desc->more;
403 if (desc->shadow_ptes[RMAP_EXT-1]) {
404 desc->more = mmu_alloc_rmap_desc(vcpu);
405 desc = desc->more;
406 }
407 for (i = 0; desc->shadow_ptes[i]; ++i)
408 ;
409 desc->shadow_ptes[i] = spte;
410 }
411}
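/*
 * Illustrative walk-through of the encoding above (hypothetical state):
 * an unmapped gfn has *rmapp == 0; after the first spte is added, *rmapp
 * holds that spte pointer directly (bit zero clear); after the second,
 * *rmapp == (unsigned long)desc | 1, where desc->shadow_ptes[] holds up
 * to RMAP_EXT sptes and desc->more chains further descriptors.
 */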
412
413static void rmap_desc_remove_entry(unsigned long *rmapp,
414 struct kvm_rmap_desc *desc,
415 int i,
416 struct kvm_rmap_desc *prev_desc)
417{
418 int j;
419
420 for (j = RMAP_EXT - 1; !desc->shadow_ptes[j] && j > i; --j)
421 ;
422 desc->shadow_ptes[i] = desc->shadow_ptes[j];
423 desc->shadow_ptes[j] = NULL;
424 if (j != 0)
425 return;
426 if (!prev_desc && !desc->more)
427 *rmapp = (unsigned long)desc->shadow_ptes[0];
428 else
429 if (prev_desc)
430 prev_desc->more = desc->more;
431 else
432 *rmapp = (unsigned long)desc->more | 1;
433 mmu_free_rmap_desc(desc);
434}
435
436static void rmap_remove(struct kvm *kvm, u64 *spte)
437{
438 struct kvm_rmap_desc *desc;
439 struct kvm_rmap_desc *prev_desc;
440 struct kvm_mmu_page *sp;
441 struct page *page;
442 unsigned long *rmapp;
443 int i;
444
445 if (!is_rmap_pte(*spte))
446 return;
447 sp = page_header(__pa(spte));
448 page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT);
449 mark_page_accessed(page);
450 if (is_writeble_pte(*spte))
451 kvm_release_page_dirty(page);
452 else
453 kvm_release_page_clean(page);
454 rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt]);
455 if (!*rmapp) {
456 printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte);
457 BUG();
458 } else if (!(*rmapp & 1)) {
459 rmap_printk("rmap_remove: %p %llx 1->0\n", spte, *spte);
460 if ((u64 *)*rmapp != spte) {
461 printk(KERN_ERR "rmap_remove: %p %llx 1->BUG\n",
462 spte, *spte);
463 BUG();
464 }
465 *rmapp = 0;
466 } else {
467 rmap_printk("rmap_remove: %p %llx many->many\n", spte, *spte);
468 desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
469 prev_desc = NULL;
470 while (desc) {
471 for (i = 0; i < RMAP_EXT && desc->shadow_ptes[i]; ++i)
472 if (desc->shadow_ptes[i] == spte) {
473 rmap_desc_remove_entry(rmapp,
474 desc, i,
475 prev_desc);
476 return;
477 }
478 prev_desc = desc;
479 desc = desc->more;
480 }
481 BUG();
482 }
483}
484
485static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte)
486{
487 struct kvm_rmap_desc *desc;
488 struct kvm_rmap_desc *prev_desc;
489 u64 *prev_spte;
490 int i;
491
492 if (!*rmapp)
493 return NULL;
494 else if (!(*rmapp & 1)) {
495 if (!spte)
496 return (u64 *)*rmapp;
497 return NULL;
498 }
499 desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
500 prev_desc = NULL;
501 prev_spte = NULL;
502 while (desc) {
503 for (i = 0; i < RMAP_EXT && desc->shadow_ptes[i]; ++i) {
504 if (prev_spte == spte)
505 return desc->shadow_ptes[i];
506 prev_spte = desc->shadow_ptes[i];
507 }
508 desc = desc->more;
509 }
510 return NULL;
511}
512
513static void rmap_write_protect(struct kvm *kvm, u64 gfn)
514{
515 unsigned long *rmapp;
516 u64 *spte;
517
518 gfn = unalias_gfn(kvm, gfn);
519 rmapp = gfn_to_rmap(kvm, gfn);
520
521 spte = rmap_next(kvm, rmapp, NULL);
522 while (spte) {
523 BUG_ON(!spte);
524 BUG_ON(!(*spte & PT_PRESENT_MASK));
525 rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
526 if (is_writeble_pte(*spte))
527 set_shadow_pte(spte, *spte & ~PT_WRITABLE_MASK);
528 kvm_flush_remote_tlbs(kvm);
529 spte = rmap_next(kvm, rmapp, spte);
530 }
531}
532
533#ifdef MMU_DEBUG
534static int is_empty_shadow_page(u64 *spt)
535{
536 u64 *pos;
537 u64 *end;
538
539 for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++)
540 if ((*pos & ~PT_SHADOW_IO_MARK) != shadow_trap_nonpresent_pte) {
541 printk(KERN_ERR "%s: %p %llx\n", __FUNCTION__,
542 pos, *pos);
543 return 0;
544 }
545 return 1;
546}
547#endif
548
549static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp)
550{
551 ASSERT(is_empty_shadow_page(sp->spt));
552 list_del(&sp->link);
553 __free_page(virt_to_page(sp->spt));
554 __free_page(virt_to_page(sp->gfns));
555 kfree(sp);
556 ++kvm->arch.n_free_mmu_pages;
557}
558
559static unsigned kvm_page_table_hashfn(gfn_t gfn)
560{
561 return gfn;
562}
563
564static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
565 u64 *parent_pte)
566{
567 struct kvm_mmu_page *sp;
568
569 if (!vcpu->kvm->arch.n_free_mmu_pages)
570 return NULL;
571
572 sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache, sizeof *sp);
573 sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
574 sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
575 set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
576 list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
577 ASSERT(is_empty_shadow_page(sp->spt));
578 sp->slot_bitmap = 0;
579 sp->multimapped = 0;
580 sp->parent_pte = parent_pte;
581 --vcpu->kvm->arch.n_free_mmu_pages;
582 return sp;
583}
584
585static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu,
586 struct kvm_mmu_page *sp, u64 *parent_pte)
587{
588 struct kvm_pte_chain *pte_chain;
589 struct hlist_node *node;
590 int i;
591
592 if (!parent_pte)
593 return;
594 if (!sp->multimapped) {
595 u64 *old = sp->parent_pte;
596
597 if (!old) {
598 sp->parent_pte = parent_pte;
599 return;
600 }
601 sp->multimapped = 1;
602 pte_chain = mmu_alloc_pte_chain(vcpu);
603 INIT_HLIST_HEAD(&sp->parent_ptes);
604 hlist_add_head(&pte_chain->link, &sp->parent_ptes);
605 pte_chain->parent_ptes[0] = old;
606 }
607 hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link) {
608 if (pte_chain->parent_ptes[NR_PTE_CHAIN_ENTRIES-1])
609 continue;
610 for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i)
611 if (!pte_chain->parent_ptes[i]) {
612 pte_chain->parent_ptes[i] = parent_pte;
613 return;
614 }
615 }
616 pte_chain = mmu_alloc_pte_chain(vcpu);
617 BUG_ON(!pte_chain);
618 hlist_add_head(&pte_chain->link, &sp->parent_ptes);
619 pte_chain->parent_ptes[0] = parent_pte;
620}
621
622static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
623 u64 *parent_pte)
624{
625 struct kvm_pte_chain *pte_chain;
626 struct hlist_node *node;
627 int i;
628
629 if (!sp->multimapped) {
630 BUG_ON(sp->parent_pte != parent_pte);
631 sp->parent_pte = NULL;
632 return;
633 }
634 hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
635 for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
636 if (!pte_chain->parent_ptes[i])
637 break;
638 if (pte_chain->parent_ptes[i] != parent_pte)
639 continue;
640 while (i + 1 < NR_PTE_CHAIN_ENTRIES
641 && pte_chain->parent_ptes[i + 1]) {
642 pte_chain->parent_ptes[i]
643 = pte_chain->parent_ptes[i + 1];
644 ++i;
645 }
646 pte_chain->parent_ptes[i] = NULL;
647 if (i == 0) {
648 hlist_del(&pte_chain->link);
649 mmu_free_pte_chain(pte_chain);
650 if (hlist_empty(&sp->parent_ptes)) {
651 sp->multimapped = 0;
652 sp->parent_pte = NULL;
653 }
654 }
655 return;
656 }
657 BUG();
658}
659
660static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn)
661{
662 unsigned index;
663 struct hlist_head *bucket;
664 struct kvm_mmu_page *sp;
665 struct hlist_node *node;
666
667 pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn);
668 index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
669 bucket = &kvm->arch.mmu_page_hash[index];
670 hlist_for_each_entry(sp, node, bucket, hash_link)
671 if (sp->gfn == gfn && !sp->role.metaphysical) {
672 pgprintk("%s: found role %x\n",
673 __FUNCTION__, sp->role.word);
674 return sp;
675 }
676 return NULL;
677}
678
679static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
680 gfn_t gfn,
681 gva_t gaddr,
682 unsigned level,
683 int metaphysical,
684 unsigned access,
685 u64 *parent_pte,
686 bool *new_page)
687{
688 union kvm_mmu_page_role role;
689 unsigned index;
690 unsigned quadrant;
691 struct hlist_head *bucket;
692 struct kvm_mmu_page *sp;
693 struct hlist_node *node;
694
695 role.word = 0;
696 role.glevels = vcpu->arch.mmu.root_level;
697 role.level = level;
698 role.metaphysical = metaphysical;
699 role.access = access;
700 if (vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) {
701 quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
702 quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
703 role.quadrant = quadrant;
704 }
705 pgprintk("%s: looking gfn %lx role %x\n", __FUNCTION__,
706 gfn, role.word);
707 index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
708 bucket = &vcpu->kvm->arch.mmu_page_hash[index];
709 hlist_for_each_entry(sp, node, bucket, hash_link)
710 if (sp->gfn == gfn && sp->role.word == role.word) {
711 mmu_page_add_parent_pte(vcpu, sp, parent_pte);
712 pgprintk("%s: found\n", __FUNCTION__);
713 return sp;
714 }
715 sp = kvm_mmu_alloc_page(vcpu, parent_pte);
716 if (!sp)
717 return sp;
718 pgprintk("%s: adding gfn %lx role %x\n", __FUNCTION__, gfn, role.word);
719 sp->gfn = gfn;
720 sp->role = role;
721 hlist_add_head(&sp->hash_link, bucket);
722 vcpu->arch.mmu.prefetch_page(vcpu, sp);
723 if (!metaphysical)
724 rmap_write_protect(vcpu->kvm, gfn);
725 if (new_page)
726 *new_page = 1;
727 return sp;
728}
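/*
 * Illustrative note on the quadrant computed above: a 32-bit guest page
 * table has 1024 entries while a shadow page holds only 512, so each
 * guest table is shadowed by 1 << level quadrants (2 at level 1, 4 at
 * level 2), selected from the gaddr bits just above the range that a
 * single shadow page covers.
 */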
729
730static void kvm_mmu_page_unlink_children(struct kvm *kvm,
731 struct kvm_mmu_page *sp)
732{
733 unsigned i;
734 u64 *pt;
735 u64 ent;
736
737 pt = sp->spt;
738
739 if (sp->role.level == PT_PAGE_TABLE_LEVEL) {
740 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
741 if (is_shadow_present_pte(pt[i]))
742 rmap_remove(kvm, &pt[i]);
743 pt[i] = shadow_trap_nonpresent_pte;
744 }
745 kvm_flush_remote_tlbs(kvm);
746 return;
747 }
748
749 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
750 ent = pt[i];
751
752 pt[i] = shadow_trap_nonpresent_pte;
753 if (!is_shadow_present_pte(ent))
754 continue;
755 ent &= PT64_BASE_ADDR_MASK;
756 mmu_page_remove_parent_pte(page_header(ent), &pt[i]);
757 }
758 kvm_flush_remote_tlbs(kvm);
759}
760
761static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte)
762{
763 mmu_page_remove_parent_pte(sp, parent_pte);
764}
765
766static void kvm_mmu_reset_last_pte_updated(struct kvm *kvm)
767{
768 int i;
769
770 for (i = 0; i < KVM_MAX_VCPUS; ++i)
771 if (kvm->vcpus[i])
772 kvm->vcpus[i]->arch.last_pte_updated = NULL;
773}
774
775static void kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
776{
777 u64 *parent_pte;
778
779 ++kvm->stat.mmu_shadow_zapped;
780 while (sp->multimapped || sp->parent_pte) {
781 if (!sp->multimapped)
782 parent_pte = sp->parent_pte;
783 else {
784 struct kvm_pte_chain *chain;
785
786 chain = container_of(sp->parent_ptes.first,
787 struct kvm_pte_chain, link);
788 parent_pte = chain->parent_ptes[0];
789 }
790 BUG_ON(!parent_pte);
791 kvm_mmu_put_page(sp, parent_pte);
792 set_shadow_pte(parent_pte, shadow_trap_nonpresent_pte);
793 }
794 kvm_mmu_page_unlink_children(kvm, sp);
795 if (!sp->root_count) {
796 hlist_del(&sp->hash_link);
797 kvm_mmu_free_page(kvm, sp);
798 } else
799 list_move(&sp->link, &kvm->arch.active_mmu_pages);
800 kvm_mmu_reset_last_pte_updated(kvm);
801}
802
803/*
804 * Change the number of mmu pages allocated to the vm.
805 * Note: if kvm_nr_mmu_pages is too small, you will get a deadlock.
806 */
807void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages)
808{
809 /*
810 * If the new number of mmu pages is smaller than the number of
811 * active pages, we must free some mmu pages before we can
812 * change the value.
813 */
814
815 if ((kvm->arch.n_alloc_mmu_pages - kvm->arch.n_free_mmu_pages) >
816 kvm_nr_mmu_pages) {
817 int n_used_mmu_pages = kvm->arch.n_alloc_mmu_pages
818 - kvm->arch.n_free_mmu_pages;
819
820 while (n_used_mmu_pages > kvm_nr_mmu_pages) {
821 struct kvm_mmu_page *page;
822
823 page = container_of(kvm->arch.active_mmu_pages.prev,
824 struct kvm_mmu_page, link);
825 kvm_mmu_zap_page(kvm, page);
826 n_used_mmu_pages--;
827 }
828 kvm->arch.n_free_mmu_pages = 0;
829 }
830 else
831 kvm->arch.n_free_mmu_pages += kvm_nr_mmu_pages
832 - kvm->arch.n_alloc_mmu_pages;
833
834 kvm->arch.n_alloc_mmu_pages = kvm_nr_mmu_pages;
835}
836
837static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
838{
839 unsigned index;
840 struct hlist_head *bucket;
841 struct kvm_mmu_page *sp;
842 struct hlist_node *node, *n;
843 int r;
844
845 pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn);
846 r = 0;
847 index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
848 bucket = &kvm->arch.mmu_page_hash[index];
849 hlist_for_each_entry_safe(sp, node, n, bucket, hash_link)
850 if (sp->gfn == gfn && !sp->role.metaphysical) {
851 pgprintk("%s: gfn %lx role %x\n", __FUNCTION__, gfn,
852 sp->role.word);
853 kvm_mmu_zap_page(kvm, sp);
854 r = 1;
855 }
856 return r;
857}
858
859static void mmu_unshadow(struct kvm *kvm, gfn_t gfn)
860{
861 struct kvm_mmu_page *sp;
862
863 while ((sp = kvm_mmu_lookup_page(kvm, gfn)) != NULL) {
864 pgprintk("%s: zap %lx %x\n", __FUNCTION__, gfn, sp->role.word);
865 kvm_mmu_zap_page(kvm, sp);
866 }
867}
868
869static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn)
870{
871 int slot = memslot_id(kvm, gfn_to_memslot(kvm, gfn));
872 struct kvm_mmu_page *sp = page_header(__pa(pte));
873
874 __set_bit(slot, &sp->slot_bitmap);
875}
876
877struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva)
878{
879 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva);
880
881 if (gpa == UNMAPPED_GVA)
882 return NULL;
883 return gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
884}
885
886static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
887 unsigned pt_access, unsigned pte_access,
888 int user_fault, int write_fault, int dirty,
889 int *ptwrite, gfn_t gfn)
890{
891 u64 spte;
892 int was_rmapped = is_rmap_pte(*shadow_pte);
893 struct page *page;
894
895 pgprintk("%s: spte %llx access %x write_fault %d"
896 " user_fault %d gfn %lx\n",
897 __FUNCTION__, *shadow_pte, pt_access,
898 write_fault, user_fault, gfn);
899
900 /*
901 * We don't set the accessed bit, since we sometimes want to see
902 * whether the guest actually used the pte (in order to detect
903 * demand paging).
904 */
905 spte = PT_PRESENT_MASK | PT_DIRTY_MASK;
906 if (!dirty)
907 pte_access &= ~ACC_WRITE_MASK;
908 if (!(pte_access & ACC_EXEC_MASK))
909 spte |= PT64_NX_MASK;
910
911 page = gfn_to_page(vcpu->kvm, gfn);
912
913 spte |= PT_PRESENT_MASK;
914 if (pte_access & ACC_USER_MASK)
915 spte |= PT_USER_MASK;
916
917 if (is_error_page(page)) {
918 set_shadow_pte(shadow_pte,
919 shadow_trap_nonpresent_pte | PT_SHADOW_IO_MARK);
920 kvm_release_page_clean(page);
921 return;
922 }
923
924 spte |= page_to_phys(page);
925
926 if ((pte_access & ACC_WRITE_MASK)
927 || (write_fault && !is_write_protection(vcpu) && !user_fault)) {
928 struct kvm_mmu_page *shadow;
929
930 spte |= PT_WRITABLE_MASK;
931 if (user_fault) {
932 mmu_unshadow(vcpu->kvm, gfn);
933 goto unshadowed;
934 }
935
936 shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn);
937 if (shadow) {
938 pgprintk("%s: found shadow page for %lx, marking ro\n",
939 __FUNCTION__, gfn);
940 pte_access &= ~ACC_WRITE_MASK;
941 if (is_writeble_pte(spte)) {
942 spte &= ~PT_WRITABLE_MASK;
943 kvm_x86_ops->tlb_flush(vcpu);
944 }
945 if (write_fault)
946 *ptwrite = 1;
947 }
948 }
949
950unshadowed:
951
952 if (pte_access & ACC_WRITE_MASK)
953 mark_page_dirty(vcpu->kvm, gfn);
954
955 pgprintk("%s: setting spte %llx\n", __FUNCTION__, spte);
956 set_shadow_pte(shadow_pte, spte);
957 page_header_update_slot(vcpu->kvm, shadow_pte, gfn);
958 if (!was_rmapped) {
959 rmap_add(vcpu, shadow_pte, gfn);
960 if (!is_rmap_pte(*shadow_pte))
961 kvm_release_page_clean(page);
962 }
963 else
964 kvm_release_page_clean(page);
965 if (!ptwrite || !*ptwrite)
966 vcpu->arch.last_pte_updated = shadow_pte;
967}
968
969static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
970{
971}
972
973static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
974{
975 int level = PT32E_ROOT_LEVEL;
976 hpa_t table_addr = vcpu->arch.mmu.root_hpa;
977 int pt_write = 0;
978
979 for (; ; level--) {
980 u32 index = PT64_INDEX(v, level);
981 u64 *table;
982
983 ASSERT(VALID_PAGE(table_addr));
984 table = __va(table_addr);
985
986 if (level == 1) {
987 mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL,
988 0, write, 1, &pt_write, gfn);
989 return pt_write || is_io_pte(table[index]);
990 }
991
992 if (table[index] == shadow_trap_nonpresent_pte) {
993 struct kvm_mmu_page *new_table;
994 gfn_t pseudo_gfn;
995
996 pseudo_gfn = (v & PT64_DIR_BASE_ADDR_MASK)
997 >> PAGE_SHIFT;
998 new_table = kvm_mmu_get_page(vcpu, pseudo_gfn,
999 v, level - 1,
1000 1, ACC_ALL, &table[index],
1001 NULL);
1002 if (!new_table) {
1003 pgprintk("nonpaging_map: ENOMEM\n");
1004 return -ENOMEM;
1005 }
1006
1007 table[index] = __pa(new_table->spt) | PT_PRESENT_MASK
1008 | PT_WRITABLE_MASK | PT_USER_MASK;
1009 }
1010 table_addr = table[index] & PT64_BASE_ADDR_MASK;
1011 }
1012}
1013
1014static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
1015 struct kvm_mmu_page *sp)
1016{
1017 int i;
1018
1019 for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
1020 sp->spt[i] = shadow_trap_nonpresent_pte;
1021}
1022
1023static void mmu_free_roots(struct kvm_vcpu *vcpu)
1024{
1025 int i;
1026 struct kvm_mmu_page *sp;
1027
1028 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
1029 return;
1030#ifdef CONFIG_X86_64
1031 if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
1032 hpa_t root = vcpu->arch.mmu.root_hpa;
1033
1034 sp = page_header(root);
1035 --sp->root_count;
1036 vcpu->arch.mmu.root_hpa = INVALID_PAGE;
1037 return;
1038 }
1039#endif
1040 for (i = 0; i < 4; ++i) {
1041 hpa_t root = vcpu->arch.mmu.pae_root[i];
1042
1043 if (root) {
1044 root &= PT64_BASE_ADDR_MASK;
1045 sp = page_header(root);
1046 --sp->root_count;
1047 }
1048 vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
1049 }
1050 vcpu->arch.mmu.root_hpa = INVALID_PAGE;
1051}
1052
1053static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
1054{
1055 int i;
1056 gfn_t root_gfn;
1057 struct kvm_mmu_page *sp;
1058
1059 root_gfn = vcpu->arch.cr3 >> PAGE_SHIFT;
1060
1061#ifdef CONFIG_X86_64
1062 if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
1063 hpa_t root = vcpu->arch.mmu.root_hpa;
1064
1065 ASSERT(!VALID_PAGE(root));
1066 sp = kvm_mmu_get_page(vcpu, root_gfn, 0,
1067 PT64_ROOT_LEVEL, 0, ACC_ALL, NULL, NULL);
1068 root = __pa(sp->spt);
1069 ++sp->root_count;
1070 vcpu->arch.mmu.root_hpa = root;
1071 return;
1072 }
1073#endif
1074 for (i = 0; i < 4; ++i) {
1075 hpa_t root = vcpu->arch.mmu.pae_root[i];
1076
1077 ASSERT(!VALID_PAGE(root));
1078 if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) {
1079 if (!is_present_pte(vcpu->arch.pdptrs[i])) {
1080 vcpu->arch.mmu.pae_root[i] = 0;
1081 continue;
1082 }
1083 root_gfn = vcpu->arch.pdptrs[i] >> PAGE_SHIFT;
1084 } else if (vcpu->arch.mmu.root_level == 0)
1085 root_gfn = 0;
1086 sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
1087 PT32_ROOT_LEVEL, !is_paging(vcpu),
1088 ACC_ALL, NULL, NULL);
1089 root = __pa(sp->spt);
1090 ++sp->root_count;
1091 vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK;
1092 }
1093 vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
1094}
1095
1096static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr)
1097{
1098 return vaddr;
1099}
1100
1101static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
1102 u32 error_code)
1103{
1104 gfn_t gfn;
1105 int r;
1106
1107 pgprintk("%s: gva %lx error %x\n", __FUNCTION__, gva, error_code);
1108 r = mmu_topup_memory_caches(vcpu);
1109 if (r)
1110 return r;
1111
1112 ASSERT(vcpu);
1113 ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
1114
1115 gfn = gva >> PAGE_SHIFT;
1116
1117 return nonpaging_map(vcpu, gva & PAGE_MASK,
1118 error_code & PFERR_WRITE_MASK, gfn);
1119}
1120
1121static void nonpaging_free(struct kvm_vcpu *vcpu)
1122{
1123 mmu_free_roots(vcpu);
1124}
1125
1126static int nonpaging_init_context(struct kvm_vcpu *vcpu)
1127{
1128 struct kvm_mmu *context = &vcpu->arch.mmu;
1129
1130 context->new_cr3 = nonpaging_new_cr3;
1131 context->page_fault = nonpaging_page_fault;
1132 context->gva_to_gpa = nonpaging_gva_to_gpa;
1133 context->free = nonpaging_free;
1134 context->prefetch_page = nonpaging_prefetch_page;
1135 context->root_level = 0;
1136 context->shadow_root_level = PT32E_ROOT_LEVEL;
1137 context->root_hpa = INVALID_PAGE;
1138 return 0;
1139}
1140
1141void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
1142{
1143 ++vcpu->stat.tlb_flush;
1144 kvm_x86_ops->tlb_flush(vcpu);
1145}
1146
1147static void paging_new_cr3(struct kvm_vcpu *vcpu)
1148{
1149 pgprintk("%s: cr3 %lx\n", __FUNCTION__, vcpu->cr3);
1150 mmu_free_roots(vcpu);
1151}
1152
1153static void inject_page_fault(struct kvm_vcpu *vcpu,
1154 u64 addr,
1155 u32 err_code)
1156{
1157 kvm_inject_page_fault(vcpu, addr, err_code);
1158}
1159
1160static void paging_free(struct kvm_vcpu *vcpu)
1161{
1162 nonpaging_free(vcpu);
1163}
1164
1165#define PTTYPE 64
1166#include "paging_tmpl.h"
1167#undef PTTYPE
1168
1169#define PTTYPE 32
1170#include "paging_tmpl.h"
1171#undef PTTYPE
1172
1173static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level)
1174{
1175 struct kvm_mmu *context = &vcpu->arch.mmu;
1176
1177 ASSERT(is_pae(vcpu));
1178 context->new_cr3 = paging_new_cr3;
1179 context->page_fault = paging64_page_fault;
1180 context->gva_to_gpa = paging64_gva_to_gpa;
1181 context->prefetch_page = paging64_prefetch_page;
1182 context->free = paging_free;
1183 context->root_level = level;
1184 context->shadow_root_level = level;
1185 context->root_hpa = INVALID_PAGE;
1186 return 0;
1187}
1188
1189static int paging64_init_context(struct kvm_vcpu *vcpu)
1190{
1191 return paging64_init_context_common(vcpu, PT64_ROOT_LEVEL);
1192}
1193
1194static int paging32_init_context(struct kvm_vcpu *vcpu)
1195{
1196 struct kvm_mmu *context = &vcpu->arch.mmu;
1197
1198 context->new_cr3 = paging_new_cr3;
1199 context->page_fault = paging32_page_fault;
1200 context->gva_to_gpa = paging32_gva_to_gpa;
1201 context->free = paging_free;
1202 context->prefetch_page = paging32_prefetch_page;
1203 context->root_level = PT32_ROOT_LEVEL;
1204 context->shadow_root_level = PT32E_ROOT_LEVEL;
1205 context->root_hpa = INVALID_PAGE;
1206 return 0;
1207}
1208
1209static int paging32E_init_context(struct kvm_vcpu *vcpu)
1210{
1211 return paging64_init_context_common(vcpu, PT32E_ROOT_LEVEL);
1212}
1213
1214static int init_kvm_mmu(struct kvm_vcpu *vcpu)
1215{
1216 ASSERT(vcpu);
1217 ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
1218
1219 if (!is_paging(vcpu))
1220 return nonpaging_init_context(vcpu);
1221 else if (is_long_mode(vcpu))
1222 return paging64_init_context(vcpu);
1223 else if (is_pae(vcpu))
1224 return paging32E_init_context(vcpu);
1225 else
1226 return paging32_init_context(vcpu);
1227}
1228
1229static void destroy_kvm_mmu(struct kvm_vcpu *vcpu)
1230{
1231 ASSERT(vcpu);
1232 if (VALID_PAGE(vcpu->arch.mmu.root_hpa)) {
1233 vcpu->arch.mmu.free(vcpu);
1234 vcpu->arch.mmu.root_hpa = INVALID_PAGE;
1235 }
1236}
1237
1238int kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
1239{
1240 destroy_kvm_mmu(vcpu);
1241 return init_kvm_mmu(vcpu);
1242}
1243EXPORT_SYMBOL_GPL(kvm_mmu_reset_context);
1244
1245int kvm_mmu_load(struct kvm_vcpu *vcpu)
1246{
1247 int r;
1248
1249 mutex_lock(&vcpu->kvm->lock);
1250 r = mmu_topup_memory_caches(vcpu);
1251 if (r)
1252 goto out;
1253 mmu_alloc_roots(vcpu);
1254 kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa);
1255 kvm_mmu_flush_tlb(vcpu);
1256out:
1257 mutex_unlock(&vcpu->kvm->lock);
1258 return r;
1259}
1260EXPORT_SYMBOL_GPL(kvm_mmu_load);
1261
1262void kvm_mmu_unload(struct kvm_vcpu *vcpu)
1263{
1264 mmu_free_roots(vcpu);
1265}
1266
1267static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
1268 struct kvm_mmu_page *sp,
1269 u64 *spte)
1270{
1271 u64 pte;
1272 struct kvm_mmu_page *child;
1273
1274 pte = *spte;
1275 if (is_shadow_present_pte(pte)) {
1276 if (sp->role.level == PT_PAGE_TABLE_LEVEL)
1277 rmap_remove(vcpu->kvm, spte);
1278 else {
1279 child = page_header(pte & PT64_BASE_ADDR_MASK);
1280 mmu_page_remove_parent_pte(child, spte);
1281 }
1282 }
1283 set_shadow_pte(spte, shadow_trap_nonpresent_pte);
1284}
1285
1286static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
1287 struct kvm_mmu_page *sp,
1288 u64 *spte,
1289 const void *new, int bytes,
1290 int offset_in_pte)
1291{
1292 if (sp->role.level != PT_PAGE_TABLE_LEVEL) {
1293 ++vcpu->kvm->stat.mmu_pde_zapped;
1294 return;
1295 }
1296
1297 ++vcpu->kvm->stat.mmu_pte_updated;
1298 if (sp->role.glevels == PT32_ROOT_LEVEL)
1299 paging32_update_pte(vcpu, sp, spte, new, bytes, offset_in_pte);
1300 else
1301 paging64_update_pte(vcpu, sp, spte, new, bytes, offset_in_pte);
1302}
1303
1304static bool need_remote_flush(u64 old, u64 new)
1305{
1306 if (!is_shadow_present_pte(old))
1307 return false;
1308 if (!is_shadow_present_pte(new))
1309 return true;
1310 if ((old ^ new) & PT64_BASE_ADDR_MASK)
1311 return true;
1312 old ^= PT64_NX_MASK;
1313 new ^= PT64_NX_MASK;
1314 return (old & ~new & PT64_PERM_MASK) != 0;
1315}
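/*
 * Illustrative example of the check above (hypothetical sptes): a change
 * that only adds permissions (old = present, new = present | writable)
 * returns false, so the caller below does a local flush only; dropping
 * the present bit, changing the mapped frame, or removing a permission
 * bit returns true and forces kvm_flush_remote_tlbs().
 */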
1316
1317static void mmu_pte_write_flush_tlb(struct kvm_vcpu *vcpu, u64 old, u64 new)
1318{
1319 if (need_remote_flush(old, new))
1320 kvm_flush_remote_tlbs(vcpu->kvm);
1321 else
1322 kvm_mmu_flush_tlb(vcpu);
1323}
1324
1325static bool last_updated_pte_accessed(struct kvm_vcpu *vcpu)
1326{
1327 u64 *spte = vcpu->arch.last_pte_updated;
1328
1329 return !!(spte && (*spte & PT_ACCESSED_MASK));
1330}
1331
1332void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
1333 const u8 *new, int bytes)
1334{
1335 gfn_t gfn = gpa >> PAGE_SHIFT;
1336 struct kvm_mmu_page *sp;
1337 struct hlist_node *node, *n;
1338 struct hlist_head *bucket;
1339 unsigned index;
1340 u64 entry;
1341 u64 *spte;
1342 unsigned offset = offset_in_page(gpa);
1343 unsigned pte_size;
1344 unsigned page_offset;
1345 unsigned misaligned;
1346 unsigned quadrant;
1347 int level;
1348 int flooded = 0;
1349 int npte;
1350
1351 pgprintk("%s: gpa %llx bytes %d\n", __FUNCTION__, gpa, bytes);
1352 ++vcpu->kvm->stat.mmu_pte_write;
1353 kvm_mmu_audit(vcpu, "pre pte write");
1354 if (gfn == vcpu->arch.last_pt_write_gfn
1355 && !last_updated_pte_accessed(vcpu)) {
1356 ++vcpu->arch.last_pt_write_count;
1357 if (vcpu->arch.last_pt_write_count >= 3)
1358 flooded = 1;
1359 } else {
1360 vcpu->arch.last_pt_write_gfn = gfn;
1361 vcpu->arch.last_pt_write_count = 1;
1362 vcpu->arch.last_pte_updated = NULL;
1363 }
1364 index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
1365 bucket = &vcpu->kvm->arch.mmu_page_hash[index];
1366 hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) {
1367 if (sp->gfn != gfn || sp->role.metaphysical)
1368 continue;
1369 pte_size = sp->role.glevels == PT32_ROOT_LEVEL ? 4 : 8;
1370 misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
1371 misaligned |= bytes < 4;
1372 if (misaligned || flooded) {
1373 /*
1374 * Misaligned accesses are too much trouble to fix
1375 * up; also, they usually indicate a page is not used
1376 * as a page table.
1377 *
1378 * If we're seeing too many writes to a page,
1379 * it may no longer be a page table, or we may be
1380 * forking, in which case it is better to unmap the
1381 * page.
1382 */
1383 pgprintk("misaligned: gpa %llx bytes %d role %x\n",
1384 gpa, bytes, sp->role.word);
1385 kvm_mmu_zap_page(vcpu->kvm, sp);
1386 ++vcpu->kvm->stat.mmu_flooded;
1387 continue;
1388 }
1389 page_offset = offset;
1390 level = sp->role.level;
1391 npte = 1;
1392 if (sp->role.glevels == PT32_ROOT_LEVEL) {
1393 page_offset <<= 1; /* 32->64 */
1394 /*
1395 * A 32-bit pde maps 4MB while the shadow pdes map
1396 * only 2MB. So we need to double the offset again
1397 * and zap two pdes instead of one.
1398 */
1399 if (level == PT32_ROOT_LEVEL) {
1400 page_offset &= ~7; /* kill rounding error */
1401 page_offset <<= 1;
1402 npte = 2;
1403 }
1404 quadrant = page_offset >> PAGE_SHIFT;
1405 page_offset &= ~PAGE_MASK;
1406 if (quadrant != sp->role.quadrant)
1407 continue;
1408 }
1409 spte = &sp->spt[page_offset / sizeof(*spte)];
1410 while (npte--) {
1411 entry = *spte;
1412 mmu_pte_write_zap_pte(vcpu, sp, spte);
1413 mmu_pte_write_new_pte(vcpu, sp, spte, new, bytes,
1414 page_offset & (pte_size - 1));
1415 mmu_pte_write_flush_tlb(vcpu, entry, *spte);
1416 ++spte;
1417 }
1418 }
1419 kvm_mmu_audit(vcpu, "post pte write");
1420}
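
As a worked example of the quadrant arithmetic above (hypothetical numbers; PAGE_SHIFT assumed to be 12): a 4-byte write at offset 0xc04 of a 32-bit guest page directory hits guest pde 769; since the 1024 guest pdes are shadowed by 2048 pdes spread over four 512-entry shadow pages, the write lands in quadrant 3 at shadow index 2, and two shadow pdes are zapped.

/* Illustrative only: redoes the offset math above for one hypothetical write. */
#include <assert.h>

int main(void)
{
	unsigned offset = 0xc04;       /* byte offset of the written 32-bit guest pde  */
	unsigned page_offset = offset;
	unsigned quadrant, npte;

	page_offset <<= 1;             /* 4-byte guest ptes become 8-byte shadow ptes  */
	page_offset &= ~7u;            /* kill rounding error                          */
	page_offset <<= 1;             /* one 4MB guest pde covers two 2MB shadow pdes */
	npte = 2;
	quadrant = page_offset >> 12;  /* which of the four shadow pages               */
	page_offset &= 4096 - 1;       /* offset within that shadow page               */

	assert(offset / 4 == 769);     /* guest pde index                  */
	assert(quadrant == 3);         /* fourth shadow page (quadrant 3)  */
	assert(page_offset / 8 == 2);  /* first shadow pde index to zap    */
	assert(npte == 2);             /* two shadow pdes are zapped       */
	return 0;
}
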
1421
1422int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
1423{
1424 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva);
1425
1426 return kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
1427}
1428
1429void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
1430{
1431 while (vcpu->kvm->arch.n_free_mmu_pages < KVM_REFILL_PAGES) {
1432 struct kvm_mmu_page *sp;
1433
1434 sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev,
1435 struct kvm_mmu_page, link);
1436 kvm_mmu_zap_page(vcpu->kvm, sp);
1437 ++vcpu->kvm->stat.mmu_recycled;
1438 }
1439}
1440
1441int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
1442{
1443 int r;
1444 enum emulation_result er;
1445
1446 mutex_lock(&vcpu->kvm->lock);
1447 r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code);
1448 if (r < 0)
1449 goto out;
1450
1451 if (!r) {
1452 r = 1;
1453 goto out;
1454 }
1455
1456 r = mmu_topup_memory_caches(vcpu);
1457 if (r)
1458 goto out;
1459
1460 er = emulate_instruction(vcpu, vcpu->run, cr2, error_code, 0);
1461 mutex_unlock(&vcpu->kvm->lock);
1462
1463 switch (er) {
1464 case EMULATE_DONE:
1465 return 1;
1466 case EMULATE_DO_MMIO:
1467 ++vcpu->stat.mmio_exits;
1468 return 0;
1469 case EMULATE_FAIL:
1470 kvm_report_emulation_failure(vcpu, "pagetable");
1471 return 1;
1472 default:
1473 BUG();
1474 }
1475out:
1476 mutex_unlock(&vcpu->kvm->lock);
1477 return r;
1478}
1479EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
1480
1481static void free_mmu_pages(struct kvm_vcpu *vcpu)
1482{
1483 struct kvm_mmu_page *sp;
1484
1485 while (!list_empty(&vcpu->kvm->arch.active_mmu_pages)) {
1486 sp = container_of(vcpu->kvm->arch.active_mmu_pages.next,
1487 struct kvm_mmu_page, link);
1488 kvm_mmu_zap_page(vcpu->kvm, sp);
1489 }
1490 free_page((unsigned long)vcpu->arch.mmu.pae_root);
1491}
1492
1493static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
1494{
1495 struct page *page;
1496 int i;
1497
1498 ASSERT(vcpu);
1499
1500 if (vcpu->kvm->arch.n_requested_mmu_pages)
1501 vcpu->kvm->arch.n_free_mmu_pages =
1502 vcpu->kvm->arch.n_requested_mmu_pages;
1503 else
1504 vcpu->kvm->arch.n_free_mmu_pages =
1505 vcpu->kvm->arch.n_alloc_mmu_pages;
1506 /*
1507 * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64.
1508 * Therefore we need to allocate shadow page tables in the first
1509 * 4GB of memory, which happens to fit the DMA32 zone.
1510 */
1511 page = alloc_page(GFP_KERNEL | __GFP_DMA32);
1512 if (!page)
1513 goto error_1;
1514 vcpu->arch.mmu.pae_root = page_address(page);
1515 for (i = 0; i < 4; ++i)
1516 vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
1517
1518 return 0;
1519
1520error_1:
1521 free_mmu_pages(vcpu);
1522 return -ENOMEM;
1523}
1524
1525int kvm_mmu_create(struct kvm_vcpu *vcpu)
1526{
1527 ASSERT(vcpu);
1528 ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
1529
1530 return alloc_mmu_pages(vcpu);
1531}
1532
1533int kvm_mmu_setup(struct kvm_vcpu *vcpu)
1534{
1535 ASSERT(vcpu);
1536 ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
1537
1538 return init_kvm_mmu(vcpu);
1539}
1540
1541void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
1542{
1543 ASSERT(vcpu);
1544
1545 destroy_kvm_mmu(vcpu);
1546 free_mmu_pages(vcpu);
1547 mmu_free_memory_caches(vcpu);
1548}
1549
1550void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
1551{
1552 struct kvm_mmu_page *sp;
1553
1554 list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) {
1555 int i;
1556 u64 *pt;
1557
1558 if (!test_bit(slot, &sp->slot_bitmap))
1559 continue;
1560
1561 pt = sp->spt;
1562 for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
1563 /* avoid RMW */
1564 if (pt[i] & PT_WRITABLE_MASK)
1565 pt[i] &= ~PT_WRITABLE_MASK;
1566 }
1567}
1568
1569void kvm_mmu_zap_all(struct kvm *kvm)
1570{
1571 struct kvm_mmu_page *sp, *node;
1572
1573 list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link)
1574 kvm_mmu_zap_page(kvm, sp);
1575
1576 kvm_flush_remote_tlbs(kvm);
1577}
1578
1579void kvm_mmu_module_exit(void)
1580{
1581 if (pte_chain_cache)
1582 kmem_cache_destroy(pte_chain_cache);
1583 if (rmap_desc_cache)
1584 kmem_cache_destroy(rmap_desc_cache);
1585 if (mmu_page_header_cache)
1586 kmem_cache_destroy(mmu_page_header_cache);
1587}
1588
1589int kvm_mmu_module_init(void)
1590{
1591 pte_chain_cache = kmem_cache_create("kvm_pte_chain",
1592 sizeof(struct kvm_pte_chain),
1593 0, 0, NULL);
1594 if (!pte_chain_cache)
1595 goto nomem;
1596 rmap_desc_cache = kmem_cache_create("kvm_rmap_desc",
1597 sizeof(struct kvm_rmap_desc),
1598 0, 0, NULL);
1599 if (!rmap_desc_cache)
1600 goto nomem;
1601
1602 mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header",
1603 sizeof(struct kvm_mmu_page),
1604 0, 0, NULL);
1605 if (!mmu_page_header_cache)
1606 goto nomem;
1607
1608 return 0;
1609
1610nomem:
1611 kvm_mmu_module_exit();
1612 return -ENOMEM;
1613}
1614
1615/*
1616 * Calculate mmu pages needed for kvm.
1617 */
1618unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm)
1619{
1620 int i;
1621 unsigned int nr_mmu_pages;
1622 unsigned int nr_pages = 0;
1623
1624 for (i = 0; i < kvm->nmemslots; i++)
1625 nr_pages += kvm->memslots[i].npages;
1626
1627 nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000;
1628 nr_mmu_pages = max(nr_mmu_pages,
1629 (unsigned int) KVM_MIN_ALLOC_MMU_PAGES);
1630
1631 return nr_mmu_pages;
1632}
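
For example, assuming the defaults of the time, KVM_PERMILLE_MMU_PAGES = 20 and KVM_MIN_ALLOC_MMU_PAGES = 64 (values not shown in this diff and assumed here): a guest with 1 GB of memory (262144 4K pages) is sized at max(262144 * 20 / 1000, 64) = 5242 shadow pages, i.e. roughly 2% of guest memory, while a very small guest still gets the 64-page floor.
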
1633
1634#ifdef AUDIT
1635
1636static const char *audit_msg;
1637
1638static gva_t canonicalize(gva_t gva)
1639{
1640#ifdef CONFIG_X86_64
1641 gva = (long long)(gva << 16) >> 16;
1642#endif
1643 return gva;
1644}
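
For example, on x86_64 this drops bits 48-63 and sign-extends bit 47: canonicalize(0x0000800000000000UL) returns 0xffff800000000000UL, while a canonical user address such as 0x00007fffffffff00UL is returned unchanged.
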
1645
1646static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
1647 gva_t va, int level)
1648{
1649 u64 *pt = __va(page_pte & PT64_BASE_ADDR_MASK);
1650 int i;
1651 gva_t va_delta = 1ul << (PAGE_SHIFT + 9 * (level - 1));
1652
1653 for (i = 0; i < PT64_ENT_PER_PAGE; ++i, va += va_delta) {
1654 u64 ent = pt[i];
1655
1656 if (ent == shadow_trap_nonpresent_pte)
1657 continue;
1658
1659 va = canonicalize(va);
1660 if (level > 1) {
1661 if (ent == shadow_notrap_nonpresent_pte)
1662 printk(KERN_ERR "audit: (%s) nontrapping pte"
1663 " in nonleaf level: levels %d gva %lx"
1664 " level %d pte %llx\n", audit_msg,
1665 vcpu->arch.mmu.root_level, va, level, ent);
1666
1667 audit_mappings_page(vcpu, ent, va, level - 1);
1668 } else {
1669 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, va);
1670 struct page *page = gpa_to_page(vcpu, gpa);
1671 hpa_t hpa = page_to_phys(page);
1672
1673 if (is_shadow_present_pte(ent)
1674 && (ent & PT64_BASE_ADDR_MASK) != hpa)
1675 printk(KERN_ERR "xx audit error: (%s) levels %d"
1676 " gva %lx gpa %llx hpa %llx ent %llx %d\n",
1677 audit_msg, vcpu->arch.mmu.root_level,
1678 va, gpa, hpa, ent,
1679 is_shadow_present_pte(ent));
1680 else if (ent == shadow_notrap_nonpresent_pte
1681 && !is_error_hpa(hpa))
1682 printk(KERN_ERR "audit: (%s) notrap shadow,"
1683 " valid guest gva %lx\n", audit_msg, va);
1684 kvm_release_page_clean(page);
1685
1686 }
1687 }
1688}
1689
1690static void audit_mappings(struct kvm_vcpu *vcpu)
1691{
1692 unsigned i;
1693
1694 if (vcpu->arch.mmu.root_level == 4)
1695 audit_mappings_page(vcpu, vcpu->arch.mmu.root_hpa, 0, 4);
1696 else
1697 for (i = 0; i < 4; ++i)
1698 if (vcpu->arch.mmu.pae_root[i] & PT_PRESENT_MASK)
1699 audit_mappings_page(vcpu,
1700 vcpu->arch.mmu.pae_root[i],
1701 i << 30,
1702 2);
1703}
1704
1705static int count_rmaps(struct kvm_vcpu *vcpu)
1706{
1707 int nmaps = 0;
1708 int i, j, k;
1709
1710 for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
1711 struct kvm_memory_slot *m = &vcpu->kvm->memslots[i];
1712 struct kvm_rmap_desc *d;
1713
1714 for (j = 0; j < m->npages; ++j) {
1715 unsigned long *rmapp = &m->rmap[j];
1716
1717 if (!*rmapp)
1718 continue;
1719 if (!(*rmapp & 1)) {
1720 ++nmaps;
1721 continue;
1722 }
1723 d = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
1724 while (d) {
1725 for (k = 0; k < RMAP_EXT; ++k)
1726 if (d->shadow_ptes[k])
1727 ++nmaps;
1728 else
1729 break;
1730 d = d->more;
1731 }
1732 }
1733 }
1734 return nmaps;
1735}
1736
1737static int count_writable_mappings(struct kvm_vcpu *vcpu)
1738{
1739 int nmaps = 0;
1740 struct kvm_mmu_page *sp;
1741 int i;
1742
1743 list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) {
1744 u64 *pt = sp->spt;
1745
1746 if (sp->role.level != PT_PAGE_TABLE_LEVEL)
1747 continue;
1748
1749 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
1750 u64 ent = pt[i];
1751
1752 if (!(ent & PT_PRESENT_MASK))
1753 continue;
1754 if (!(ent & PT_WRITABLE_MASK))
1755 continue;
1756 ++nmaps;
1757 }
1758 }
1759 return nmaps;
1760}
1761
1762static void audit_rmap(struct kvm_vcpu *vcpu)
1763{
1764 int n_rmap = count_rmaps(vcpu);
1765 int n_actual = count_writable_mappings(vcpu);
1766
1767 if (n_rmap != n_actual)
1768 printk(KERN_ERR "%s: (%s) rmap %d actual %d\n",
1769 __FUNCTION__, audit_msg, n_rmap, n_actual);
1770}
1771
1772static void audit_write_protection(struct kvm_vcpu *vcpu)
1773{
1774 struct kvm_mmu_page *sp;
1775 struct kvm_memory_slot *slot;
1776 unsigned long *rmapp;
1777 gfn_t gfn;
1778
1779 list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) {
1780 if (sp->role.metaphysical)
1781 continue;
1782
1783 slot = gfn_to_memslot(vcpu->kvm, sp->gfn);
1784 gfn = unalias_gfn(vcpu->kvm, sp->gfn);
1785 rmapp = &slot->rmap[gfn - slot->base_gfn];
1786 if (*rmapp)
1787 printk(KERN_ERR "%s: (%s) shadow page has writable"
1788 " mappings: gfn %lx role %x\n",
1789 __FUNCTION__, audit_msg, sp->gfn,
1790 sp->role.word);
1791 }
1792}
1793
1794static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg)
1795{
1796 int olddbg = dbg;
1797
1798 dbg = 0;
1799 audit_msg = msg;
1800 audit_rmap(vcpu);
1801 audit_write_protection(vcpu);
1802 audit_mappings(vcpu);
1803 dbg = olddbg;
1804}
1805
1806#endif
diff --git a/drivers/kvm/mmu.h b/drivers/kvm/mmu.h
deleted file mode 100644
index cbfc272262df..000000000000
--- a/drivers/kvm/mmu.h
+++ /dev/null
@@ -1,44 +0,0 @@
1#ifndef __KVM_X86_MMU_H
2#define __KVM_X86_MMU_H
3
4#include "kvm.h"
5
6static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
7{
8 if (unlikely(vcpu->kvm->arch.n_free_mmu_pages < KVM_MIN_FREE_MMU_PAGES))
9 __kvm_mmu_free_some_pages(vcpu);
10}
11
12static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu)
13{
14 if (likely(vcpu->arch.mmu.root_hpa != INVALID_PAGE))
15 return 0;
16
17 return kvm_mmu_load(vcpu);
18}
19
20static inline int is_long_mode(struct kvm_vcpu *vcpu)
21{
22#ifdef CONFIG_X86_64
23 return vcpu->arch.shadow_efer & EFER_LME;
24#else
25 return 0;
26#endif
27}
28
29static inline int is_pae(struct kvm_vcpu *vcpu)
30{
31 return vcpu->arch.cr4 & X86_CR4_PAE;
32}
33
34static inline int is_pse(struct kvm_vcpu *vcpu)
35{
36 return vcpu->arch.cr4 & X86_CR4_PSE;
37}
38
39static inline int is_paging(struct kvm_vcpu *vcpu)
40{
41 return vcpu->arch.cr0 & X86_CR0_PG;
42}
43
44#endif
diff --git a/drivers/kvm/paging_tmpl.h b/drivers/kvm/paging_tmpl.h
deleted file mode 100644
index 56b88f7e83ef..000000000000
--- a/drivers/kvm/paging_tmpl.h
+++ /dev/null
@@ -1,461 +0,0 @@
1/*
2 * Kernel-based Virtual Machine driver for Linux
3 *
4 * This module enables machines with Intel VT-x extensions to run virtual
5 * machines without emulation or binary translation.
6 *
7 * MMU support
8 *
9 * Copyright (C) 2006 Qumranet, Inc.
10 *
11 * Authors:
12 * Yaniv Kamay <yaniv@qumranet.com>
13 * Avi Kivity <avi@qumranet.com>
14 *
15 * This work is licensed under the terms of the GNU GPL, version 2. See
16 * the COPYING file in the top-level directory.
17 *
18 */
19
20/*
21 * We need the mmu code to access both 32-bit and 64-bit guest ptes,
22 * so the code in this file is compiled twice, once per pte size.
23 */
24
25#if PTTYPE == 64
26 #define pt_element_t u64
27 #define guest_walker guest_walker64
28 #define FNAME(name) paging##64_##name
29 #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK
30 #define PT_DIR_BASE_ADDR_MASK PT64_DIR_BASE_ADDR_MASK
31 #define PT_INDEX(addr, level) PT64_INDEX(addr, level)
32 #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
33 #define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level)
34 #define PT_LEVEL_BITS PT64_LEVEL_BITS
35 #ifdef CONFIG_X86_64
36 #define PT_MAX_FULL_LEVELS 4
37 #define CMPXCHG cmpxchg
38 #else
39 #define CMPXCHG cmpxchg64
40 #define PT_MAX_FULL_LEVELS 2
41 #endif
42#elif PTTYPE == 32
43 #define pt_element_t u32
44 #define guest_walker guest_walker32
45 #define FNAME(name) paging##32_##name
46 #define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK
47 #define PT_DIR_BASE_ADDR_MASK PT32_DIR_BASE_ADDR_MASK
48 #define PT_INDEX(addr, level) PT32_INDEX(addr, level)
49 #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
50 #define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level)
51 #define PT_LEVEL_BITS PT32_LEVEL_BITS
52 #define PT_MAX_FULL_LEVELS 2
53 #define CMPXCHG cmpxchg
54#else
55 #error Invalid PTTYPE value
56#endif
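
For reference, this header is not compiled on its own; mmu.c includes it twice, once per guest pte width, roughly like the sketch below (the include pattern is shown for illustration and is not part of this diff).

/* Sketch of how mmu.c instantiates this template. */
#define PTTYPE 64
#include "paging_tmpl.h"  /* emits paging64_walk_addr(), paging64_page_fault(), ... */
#undef PTTYPE

#define PTTYPE 32
#include "paging_tmpl.h"  /* emits paging32_walk_addr(), paging32_page_fault(), ... */
#undef PTTYPE
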
57
58#define gpte_to_gfn FNAME(gpte_to_gfn)
59#define gpte_to_gfn_pde FNAME(gpte_to_gfn_pde)
60
61/*
62 * The guest_walker structure emulates the behavior of the hardware page
63 * table walker.
64 */
65struct guest_walker {
66 int level;
67 gfn_t table_gfn[PT_MAX_FULL_LEVELS];
68 pt_element_t ptes[PT_MAX_FULL_LEVELS];
69 gpa_t pte_gpa[PT_MAX_FULL_LEVELS];
70 unsigned pt_access;
71 unsigned pte_access;
72 gfn_t gfn;
73 u32 error_code;
74};
75
76static gfn_t gpte_to_gfn(pt_element_t gpte)
77{
78 return (gpte & PT_BASE_ADDR_MASK) >> PAGE_SHIFT;
79}
80
81static gfn_t gpte_to_gfn_pde(pt_element_t gpte)
82{
83 return (gpte & PT_DIR_BASE_ADDR_MASK) >> PAGE_SHIFT;
84}
85
86static bool FNAME(cmpxchg_gpte)(struct kvm *kvm,
87 gfn_t table_gfn, unsigned index,
88 pt_element_t orig_pte, pt_element_t new_pte)
89{
90 pt_element_t ret;
91 pt_element_t *table;
92 struct page *page;
93
94 page = gfn_to_page(kvm, table_gfn);
95 table = kmap_atomic(page, KM_USER0);
96
97 ret = CMPXCHG(&table[index], orig_pte, new_pte);
98
99 kunmap_atomic(table, KM_USER0);
100
101 kvm_release_page_dirty(page);
102
103 return (ret != orig_pte);
104}
105
106static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte)
107{
108 unsigned access;
109
110 access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK;
111#if PTTYPE == 64
112 if (is_nx(vcpu))
113 access &= ~(gpte >> PT64_NX_SHIFT);
114#endif
115 return access;
116}
117
118/*
119 * Fetch a guest pte for a guest virtual address
120 */
121static int FNAME(walk_addr)(struct guest_walker *walker,
122 struct kvm_vcpu *vcpu, gva_t addr,
123 int write_fault, int user_fault, int fetch_fault)
124{
125 pt_element_t pte;
126 gfn_t table_gfn;
127 unsigned index, pt_access, pte_access;
128 gpa_t pte_gpa;
129
130 pgprintk("%s: addr %lx\n", __FUNCTION__, addr);
131walk:
132 walker->level = vcpu->arch.mmu.root_level;
133 pte = vcpu->arch.cr3;
134#if PTTYPE == 64
135 if (!is_long_mode(vcpu)) {
136 pte = vcpu->arch.pdptrs[(addr >> 30) & 3];
137 if (!is_present_pte(pte))
138 goto not_present;
139 --walker->level;
140 }
141#endif
142 ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) ||
143 (vcpu->cr3 & CR3_NONPAE_RESERVED_BITS) == 0);
144
145 pt_access = ACC_ALL;
146
147 for (;;) {
148 index = PT_INDEX(addr, walker->level);
149
150 table_gfn = gpte_to_gfn(pte);
151 pte_gpa = gfn_to_gpa(table_gfn);
152 pte_gpa += index * sizeof(pt_element_t);
153 walker->table_gfn[walker->level - 1] = table_gfn;
154 walker->pte_gpa[walker->level - 1] = pte_gpa;
155 pgprintk("%s: table_gfn[%d] %lx\n", __FUNCTION__,
156 walker->level - 1, table_gfn);
157
158 kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte));
159
160 if (!is_present_pte(pte))
161 goto not_present;
162
163 if (write_fault && !is_writeble_pte(pte))
164 if (user_fault || is_write_protection(vcpu))
165 goto access_error;
166
167 if (user_fault && !(pte & PT_USER_MASK))
168 goto access_error;
169
170#if PTTYPE == 64
171 if (fetch_fault && is_nx(vcpu) && (pte & PT64_NX_MASK))
172 goto access_error;
173#endif
174
175 if (!(pte & PT_ACCESSED_MASK)) {
176 mark_page_dirty(vcpu->kvm, table_gfn);
177 if (FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn,
178 index, pte, pte|PT_ACCESSED_MASK))
179 goto walk;
180 pte |= PT_ACCESSED_MASK;
181 }
182
183 pte_access = pt_access & FNAME(gpte_access)(vcpu, pte);
184
185 walker->ptes[walker->level - 1] = pte;
186
187 if (walker->level == PT_PAGE_TABLE_LEVEL) {
188 walker->gfn = gpte_to_gfn(pte);
189 break;
190 }
191
192 if (walker->level == PT_DIRECTORY_LEVEL
193 && (pte & PT_PAGE_SIZE_MASK)
194 && (PTTYPE == 64 || is_pse(vcpu))) {
195 walker->gfn = gpte_to_gfn_pde(pte);
196 walker->gfn += PT_INDEX(addr, PT_PAGE_TABLE_LEVEL);
197 if (PTTYPE == 32 && is_cpuid_PSE36())
198 walker->gfn += pse36_gfn_delta(pte);
199 break;
200 }
201
202 pt_access = pte_access;
203 --walker->level;
204 }
205
206 if (write_fault && !is_dirty_pte(pte)) {
207 bool ret;
208
209 mark_page_dirty(vcpu->kvm, table_gfn);
210 ret = FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, index, pte,
211 pte|PT_DIRTY_MASK);
212 if (ret)
213 goto walk;
214 pte |= PT_DIRTY_MASK;
215 kvm_mmu_pte_write(vcpu, pte_gpa, (u8 *)&pte, sizeof(pte));
216 walker->ptes[walker->level - 1] = pte;
217 }
218
219 walker->pt_access = pt_access;
220 walker->pte_access = pte_access;
221 pgprintk("%s: pte %llx pte_access %x pt_access %x\n",
222 __FUNCTION__, (u64)pte, pt_access, pte_access);
223 return 1;
224
225not_present:
226 walker->error_code = 0;
227 goto err;
228
229access_error:
230 walker->error_code = PFERR_PRESENT_MASK;
231
232err:
233 if (write_fault)
234 walker->error_code |= PFERR_WRITE_MASK;
235 if (user_fault)
236 walker->error_code |= PFERR_USER_MASK;
237 if (fetch_fault)
238 walker->error_code |= PFERR_FETCH_MASK;
239 return 0;
240}
241
242static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
243 u64 *spte, const void *pte, int bytes,
244 int offset_in_pte)
245{
246 pt_element_t gpte;
247 unsigned pte_access;
248
249 gpte = *(const pt_element_t *)pte;
250 if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) {
251 if (!offset_in_pte && !is_present_pte(gpte))
252 set_shadow_pte(spte, shadow_notrap_nonpresent_pte);
253 return;
254 }
255 if (bytes < sizeof(pt_element_t))
256 return;
257 pgprintk("%s: gpte %llx spte %p\n", __FUNCTION__, (u64)gpte, spte);
258 pte_access = page->role.access & FNAME(gpte_access)(vcpu, gpte);
259 mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0,
260 gpte & PT_DIRTY_MASK, NULL, gpte_to_gfn(gpte));
261}
262
263/*
264 * Fetch a shadow pte for a specific level in the paging hierarchy.
265 */
266static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
267 struct guest_walker *walker,
268 int user_fault, int write_fault, int *ptwrite)
269{
270 hpa_t shadow_addr;
271 int level;
272 u64 *shadow_ent;
273 unsigned access = walker->pt_access;
274
275 if (!is_present_pte(walker->ptes[walker->level - 1]))
276 return NULL;
277
278 shadow_addr = vcpu->arch.mmu.root_hpa;
279 level = vcpu->arch.mmu.shadow_root_level;
280 if (level == PT32E_ROOT_LEVEL) {
281 shadow_addr = vcpu->arch.mmu.pae_root[(addr >> 30) & 3];
282 shadow_addr &= PT64_BASE_ADDR_MASK;
283 --level;
284 }
285
286 for (; ; level--) {
287 u32 index = SHADOW_PT_INDEX(addr, level);
288 struct kvm_mmu_page *shadow_page;
289 u64 shadow_pte;
290 int metaphysical;
291 gfn_t table_gfn;
292 bool new_page = 0;
293
294 shadow_ent = ((u64 *)__va(shadow_addr)) + index;
295 if (is_shadow_present_pte(*shadow_ent)) {
296 if (level == PT_PAGE_TABLE_LEVEL)
297 break;
298 shadow_addr = *shadow_ent & PT64_BASE_ADDR_MASK;
299 continue;
300 }
301
302 if (level == PT_PAGE_TABLE_LEVEL)
303 break;
304
305 if (level - 1 == PT_PAGE_TABLE_LEVEL
306 && walker->level == PT_DIRECTORY_LEVEL) {
307 metaphysical = 1;
308 if (!is_dirty_pte(walker->ptes[level - 1]))
309 access &= ~ACC_WRITE_MASK;
310 table_gfn = gpte_to_gfn(walker->ptes[level - 1]);
311 } else {
312 metaphysical = 0;
313 table_gfn = walker->table_gfn[level - 2];
314 }
315 shadow_page = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1,
316 metaphysical, access,
317 shadow_ent, &new_page);
318 if (new_page && !metaphysical) {
319 pt_element_t curr_pte;
320 kvm_read_guest(vcpu->kvm, walker->pte_gpa[level - 2],
321 &curr_pte, sizeof(curr_pte));
322 if (curr_pte != walker->ptes[level - 2])
323 return NULL;
324 }
325 shadow_addr = __pa(shadow_page->spt);
326 shadow_pte = shadow_addr | PT_PRESENT_MASK | PT_ACCESSED_MASK
327 | PT_WRITABLE_MASK | PT_USER_MASK;
328 *shadow_ent = shadow_pte;
329 }
330
331 mmu_set_spte(vcpu, shadow_ent, access, walker->pte_access & access,
332 user_fault, write_fault,
333 walker->ptes[walker->level-1] & PT_DIRTY_MASK,
334 ptwrite, walker->gfn);
335
336 return shadow_ent;
337}
338
339/*
340 * Page fault handler. There are several causes for a page fault:
341 * - there is no shadow pte for the guest pte
342 * - write access through a shadow pte marked read only so that we can set
343 * the dirty bit
344 * - write access to a shadow pte marked read only so we can update the page
345 * dirty bitmap, when userspace requests it
346 * - mmio access; in this case we will never install a present shadow pte
347 * - normal guest page fault due to the guest pte marked not present, not
348 * writable, or not executable
349 *
350 * Returns: 1 if we need to emulate the instruction, 0 otherwise, or
351 * a negative value on error.
352 */
353static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
354 u32 error_code)
355{
356 int write_fault = error_code & PFERR_WRITE_MASK;
357 int user_fault = error_code & PFERR_USER_MASK;
358 int fetch_fault = error_code & PFERR_FETCH_MASK;
359 struct guest_walker walker;
360 u64 *shadow_pte;
361 int write_pt = 0;
362 int r;
363
364 pgprintk("%s: addr %lx err %x\n", __FUNCTION__, addr, error_code);
365 kvm_mmu_audit(vcpu, "pre page fault");
366
367 r = mmu_topup_memory_caches(vcpu);
368 if (r)
369 return r;
370
371 /*
372 * Look up the shadow pte for the faulting address.
373 */
374 r = FNAME(walk_addr)(&walker, vcpu, addr, write_fault, user_fault,
375 fetch_fault);
376
377 /*
378 * The page is not mapped by the guest. Let the guest handle it.
379 */
380 if (!r) {
381 pgprintk("%s: guest page fault\n", __FUNCTION__);
382 inject_page_fault(vcpu, addr, walker.error_code);
383 vcpu->arch.last_pt_write_count = 0; /* reset fork detector */
384 return 0;
385 }
386
387 shadow_pte = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
388 &write_pt);
389 pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __FUNCTION__,
390 shadow_pte, *shadow_pte, write_pt);
391
392 if (!write_pt)
393 vcpu->arch.last_pt_write_count = 0; /* reset fork detector */
394
395 /*
396 * mmio: emulate if accessible, otherwise it's a guest fault.
397 */
398 if (shadow_pte && is_io_pte(*shadow_pte))
399 return 1;
400
401 ++vcpu->stat.pf_fixed;
402 kvm_mmu_audit(vcpu, "post page fault (fixed)");
403
404 return write_pt;
405}
406
407static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr)
408{
409 struct guest_walker walker;
410 gpa_t gpa = UNMAPPED_GVA;
411 int r;
412
413 r = FNAME(walk_addr)(&walker, vcpu, vaddr, 0, 0, 0);
414
415 if (r) {
416 gpa = gfn_to_gpa(walker.gfn);
417 gpa |= vaddr & ~PAGE_MASK;
418 }
419
420 return gpa;
421}
422
423static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu,
424 struct kvm_mmu_page *sp)
425{
426 int i, offset = 0;
427 pt_element_t *gpt;
428 struct page *page;
429
430 if (sp->role.metaphysical
431 || (PTTYPE == 32 && sp->role.level > PT_PAGE_TABLE_LEVEL)) {
432 nonpaging_prefetch_page(vcpu, sp);
433 return;
434 }
435
436 if (PTTYPE == 32)
437 offset = sp->role.quadrant << PT64_LEVEL_BITS;
438 page = gfn_to_page(vcpu->kvm, sp->gfn);
439 gpt = kmap_atomic(page, KM_USER0);
440 for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
441 if (is_present_pte(gpt[offset + i]))
442 sp->spt[i] = shadow_trap_nonpresent_pte;
443 else
444 sp->spt[i] = shadow_notrap_nonpresent_pte;
445 kunmap_atomic(gpt, KM_USER0);
446 kvm_release_page_clean(page);
447}
448
449#undef pt_element_t
450#undef guest_walker
451#undef FNAME
452#undef PT_BASE_ADDR_MASK
453#undef PT_INDEX
454#undef SHADOW_PT_INDEX
455#undef PT_LEVEL_MASK
456#undef PT_DIR_BASE_ADDR_MASK
457#undef PT_LEVEL_BITS
458#undef PT_MAX_FULL_LEVELS
459#undef gpte_to_gfn
460#undef gpte_to_gfn_pde
461#undef CMPXCHG
diff --git a/drivers/kvm/segment_descriptor.h b/drivers/kvm/segment_descriptor.h
deleted file mode 100644
index 56fc4c873389..000000000000
--- a/drivers/kvm/segment_descriptor.h
+++ /dev/null
@@ -1,29 +0,0 @@
1#ifndef __SEGMENT_DESCRIPTOR_H
2#define __SEGMENT_DESCRIPTOR_H
3
4struct segment_descriptor {
5 u16 limit_low;
6 u16 base_low;
7 u8 base_mid;
8 u8 type : 4;
9 u8 system : 1;
10 u8 dpl : 2;
11 u8 present : 1;
12 u8 limit_high : 4;
13 u8 avl : 1;
14 u8 long_mode : 1;
15 u8 default_op : 1;
16 u8 granularity : 1;
17 u8 base_high;
18} __attribute__((packed));
19
20#ifdef CONFIG_X86_64
21/* LDT or TSS descriptor in the GDT. 16 bytes. */
22struct segment_descriptor_64 {
23 struct segment_descriptor s;
24 u32 base_higher;
25 u32 pad_zero;
26};
27
28#endif
29#endif
diff --git a/drivers/kvm/svm.c b/drivers/kvm/svm.c
deleted file mode 100644
index e606f6d18669..000000000000
--- a/drivers/kvm/svm.c
+++ /dev/null
@@ -1,1725 +0,0 @@
1/*
2 * Kernel-based Virtual Machine driver for Linux
3 *
4 * AMD SVM support
5 *
6 * Copyright (C) 2006 Qumranet, Inc.
7 *
8 * Authors:
9 * Yaniv Kamay <yaniv@qumranet.com>
10 * Avi Kivity <avi@qumranet.com>
11 *
12 * This work is licensed under the terms of the GNU GPL, version 2. See
13 * the COPYING file in the top-level directory.
14 *
15 */
16#include "x86.h"
17#include "kvm_svm.h"
18#include "x86_emulate.h"
19#include "irq.h"
20#include "mmu.h"
21
22#include <linux/module.h>
23#include <linux/kernel.h>
24#include <linux/vmalloc.h>
25#include <linux/highmem.h>
26#include <linux/sched.h>
27
28#include <asm/desc.h>
29
30MODULE_AUTHOR("Qumranet");
31MODULE_LICENSE("GPL");
32
33#define IOPM_ALLOC_ORDER 2
34#define MSRPM_ALLOC_ORDER 1
35
36#define DB_VECTOR 1
37#define UD_VECTOR 6
38#define GP_VECTOR 13
39
40#define DR7_GD_MASK (1 << 13)
41#define DR6_BD_MASK (1 << 13)
42
43#define SEG_TYPE_LDT 2
44#define SEG_TYPE_BUSY_TSS16 3
45
46#define SVM_FEATURE_NPT (1 << 0)
47#define SVM_FEATURE_LBRV (1 << 1)
48#define SVM_FEATURE_SVML (1 << 2)
49
50static void kvm_reput_irq(struct vcpu_svm *svm);
51
52static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu)
53{
54 return container_of(vcpu, struct vcpu_svm, vcpu);
55}
56
57unsigned long iopm_base;
58unsigned long msrpm_base;
59
60struct kvm_ldttss_desc {
61 u16 limit0;
62 u16 base0;
63 unsigned base1 : 8, type : 5, dpl : 2, p : 1;
64 unsigned limit1 : 4, zero0 : 3, g : 1, base2 : 8;
65 u32 base3;
66 u32 zero1;
67} __attribute__((packed));
68
69struct svm_cpu_data {
70 int cpu;
71
72 u64 asid_generation;
73 u32 max_asid;
74 u32 next_asid;
75 struct kvm_ldttss_desc *tss_desc;
76
77 struct page *save_area;
78};
79
80static DEFINE_PER_CPU(struct svm_cpu_data *, svm_data);
81static uint32_t svm_features;
82
83struct svm_init_data {
84 int cpu;
85 int r;
86};
87
88static u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000};
89
90#define NUM_MSR_MAPS ARRAY_SIZE(msrpm_ranges)
91#define MSRS_RANGE_SIZE 2048
92#define MSRS_IN_RANGE (MSRS_RANGE_SIZE * 8 / 2)
93
94#define MAX_INST_SIZE 15
95
96static inline u32 svm_has(u32 feat)
97{
98 return svm_features & feat;
99}
100
101static inline u8 pop_irq(struct kvm_vcpu *vcpu)
102{
103 int word_index = __ffs(vcpu->arch.irq_summary);
104 int bit_index = __ffs(vcpu->arch.irq_pending[word_index]);
105 int irq = word_index * BITS_PER_LONG + bit_index;
106
107 clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]);
108 if (!vcpu->arch.irq_pending[word_index])
109 clear_bit(word_index, &vcpu->arch.irq_summary);
110 return irq;
111}
112
113static inline void push_irq(struct kvm_vcpu *vcpu, u8 irq)
114{
115 set_bit(irq, vcpu->arch.irq_pending);
116 set_bit(irq / BITS_PER_LONG, &vcpu->arch.irq_summary);
117}
118
119static inline void clgi(void)
120{
121 asm volatile (SVM_CLGI);
122}
123
124static inline void stgi(void)
125{
126 asm volatile (SVM_STGI);
127}
128
129static inline void invlpga(unsigned long addr, u32 asid)
130{
131 asm volatile (SVM_INVLPGA :: "a"(addr), "c"(asid));
132}
133
134static inline unsigned long kvm_read_cr2(void)
135{
136 unsigned long cr2;
137
138 asm volatile ("mov %%cr2, %0" : "=r" (cr2));
139 return cr2;
140}
141
142static inline void kvm_write_cr2(unsigned long val)
143{
144 asm volatile ("mov %0, %%cr2" :: "r" (val));
145}
146
147static inline unsigned long read_dr6(void)
148{
149 unsigned long dr6;
150
151 asm volatile ("mov %%dr6, %0" : "=r" (dr6));
152 return dr6;
153}
154
155static inline void write_dr6(unsigned long val)
156{
157 asm volatile ("mov %0, %%dr6" :: "r" (val));
158}
159
160static inline unsigned long read_dr7(void)
161{
162 unsigned long dr7;
163
164 asm volatile ("mov %%dr7, %0" : "=r" (dr7));
165 return dr7;
166}
167
168static inline void write_dr7(unsigned long val)
169{
170 asm volatile ("mov %0, %%dr7" :: "r" (val));
171}
172
173static inline void force_new_asid(struct kvm_vcpu *vcpu)
174{
175 to_svm(vcpu)->asid_generation--;
176}
177
178static inline void flush_guest_tlb(struct kvm_vcpu *vcpu)
179{
180 force_new_asid(vcpu);
181}
182
183static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
184{
185 if (!(efer & EFER_LMA))
186 efer &= ~EFER_LME;
187
188 to_svm(vcpu)->vmcb->save.efer = efer | MSR_EFER_SVME_MASK;
189 vcpu->arch.shadow_efer = efer;
190}
191
192static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
193 bool has_error_code, u32 error_code)
194{
195 struct vcpu_svm *svm = to_svm(vcpu);
196
197 svm->vmcb->control.event_inj = nr
198 | SVM_EVTINJ_VALID
199 | (has_error_code ? SVM_EVTINJ_VALID_ERR : 0)
200 | SVM_EVTINJ_TYPE_EXEPT;
201 svm->vmcb->control.event_inj_err = error_code;
202}
203
204static bool svm_exception_injected(struct kvm_vcpu *vcpu)
205{
206 struct vcpu_svm *svm = to_svm(vcpu);
207
208 return !(svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_VALID);
209}
210
211static int is_external_interrupt(u32 info)
212{
213 info &= SVM_EVTINJ_TYPE_MASK | SVM_EVTINJ_VALID;
214 return info == (SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR);
215}
216
217static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
218{
219 struct vcpu_svm *svm = to_svm(vcpu);
220
221 if (!svm->next_rip) {
222 printk(KERN_DEBUG "%s: NOP\n", __FUNCTION__);
223 return;
224 }
225 if (svm->next_rip - svm->vmcb->save.rip > MAX_INST_SIZE)
226 printk(KERN_ERR "%s: ip 0x%llx next 0x%llx\n",
227 __FUNCTION__,
228 svm->vmcb->save.rip,
229 svm->next_rip);
230
231 vcpu->arch.rip = svm->vmcb->save.rip = svm->next_rip;
232 svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK;
233
234 vcpu->arch.interrupt_window_open = 1;
235}
236
237static int has_svm(void)
238{
239 uint32_t eax, ebx, ecx, edx;
240
241 if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) {
242 printk(KERN_INFO "has_svm: not amd\n");
243 return 0;
244 }
245
246 cpuid(0x80000000, &eax, &ebx, &ecx, &edx);
247 if (eax < SVM_CPUID_FUNC) {
248 printk(KERN_INFO "has_svm: can't execute cpuid_8000000a\n");
249 return 0;
250 }
251
252 cpuid(0x80000001, &eax, &ebx, &ecx, &edx);
253 if (!(ecx & (1 << SVM_CPUID_FEATURE_SHIFT))) {
254 printk(KERN_DEBUG "has_svm: svm not available\n");
255 return 0;
256 }
257 return 1;
258}
259
260static void svm_hardware_disable(void *garbage)
261{
262 struct svm_cpu_data *svm_data
263 = per_cpu(svm_data, raw_smp_processor_id());
264
265 if (svm_data) {
266 uint64_t efer;
267
268 wrmsrl(MSR_VM_HSAVE_PA, 0);
269 rdmsrl(MSR_EFER, efer);
270 wrmsrl(MSR_EFER, efer & ~MSR_EFER_SVME_MASK);
271 per_cpu(svm_data, raw_smp_processor_id()) = NULL;
272 __free_page(svm_data->save_area);
273 kfree(svm_data);
274 }
275}
276
277static void svm_hardware_enable(void *garbage)
278{
279
280 struct svm_cpu_data *svm_data;
281 uint64_t efer;
282#ifdef CONFIG_X86_64
283 struct desc_ptr gdt_descr;
284#else
285 struct desc_ptr gdt_descr;
286#endif
287 struct desc_struct *gdt;
288 int me = raw_smp_processor_id();
289
290 if (!has_svm()) {
291 printk(KERN_ERR "svm_cpu_init: err EOPNOTSUPP on %d\n", me);
292 return;
293 }
294 svm_data = per_cpu(svm_data, me);
295
296 if (!svm_data) {
297 printk(KERN_ERR "svm_cpu_init: svm_data is NULL on %d\n",
298 me);
299 return;
300 }
301
302 svm_data->asid_generation = 1;
303 svm_data->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1;
304 svm_data->next_asid = svm_data->max_asid + 1;
305 svm_features = cpuid_edx(SVM_CPUID_FUNC);
306
307 asm volatile ("sgdt %0" : "=m"(gdt_descr));
308 gdt = (struct desc_struct *)gdt_descr.address;
309 svm_data->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS);
310
311 rdmsrl(MSR_EFER, efer);
312 wrmsrl(MSR_EFER, efer | MSR_EFER_SVME_MASK);
313
314 wrmsrl(MSR_VM_HSAVE_PA,
315 page_to_pfn(svm_data->save_area) << PAGE_SHIFT);
316}
317
318static int svm_cpu_init(int cpu)
319{
320 struct svm_cpu_data *svm_data;
321 int r;
322
323 svm_data = kzalloc(sizeof(struct svm_cpu_data), GFP_KERNEL);
324 if (!svm_data)
325 return -ENOMEM;
326 svm_data->cpu = cpu;
327 svm_data->save_area = alloc_page(GFP_KERNEL);
328 r = -ENOMEM;
329 if (!svm_data->save_area)
330 goto err_1;
331
332 per_cpu(svm_data, cpu) = svm_data;
333
334 return 0;
335
336err_1:
337 kfree(svm_data);
338 return r;
339
340}
341
342static void set_msr_interception(u32 *msrpm, unsigned msr,
343 int read, int write)
344{
345 int i;
346
347 for (i = 0; i < NUM_MSR_MAPS; i++) {
348 if (msr >= msrpm_ranges[i] &&
349 msr < msrpm_ranges[i] + MSRS_IN_RANGE) {
350 u32 msr_offset = (i * MSRS_IN_RANGE + msr -
351 msrpm_ranges[i]) * 2;
352
353 u32 *base = msrpm + (msr_offset / 32);
354 u32 msr_shift = msr_offset % 32;
355 u32 mask = ((write) ? 0 : 2) | ((read) ? 0 : 1);
356 *base = (*base & ~(0x3 << msr_shift)) |
357 (mask << msr_shift);
358 return;
359 }
360 }
361 BUG();
362}
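
As a worked example of the bitmap layout used above (each of the three ranges covers 2048 bytes, i.e. 2048 * 8 / 2 = 8192 MSRs at two permission bits apiece; the walk-through below is illustrative): MSR_LSTAR (0xc0000082) falls in the second range, so its read/write intercept pair sits at bits 4 and 5 of u32 index 520 in the bitmap.

/* Illustrative only: recomputes the msrpm slot that set_msr_interception()
 * above would touch for MSR_LSTAR. */
#include <assert.h>
#include <stdint.h>

int main(void)
{
	const uint32_t ranges[] = { 0, 0xc0000000, 0xc0010000 };
	const unsigned msrs_in_range = 2048 * 8 / 2;   /* 2 bits per MSR */
	uint32_t msr = 0xc0000082;                     /* MSR_LSTAR      */
	unsigned i = 1;                                /* second range   */

	assert(msr >= ranges[i] && msr < ranges[i] + msrs_in_range);
	uint32_t msr_offset = (i * msrs_in_range + msr - ranges[i]) * 2;

	assert(msr_offset / 32 == 520);  /* u32 index into the bitmap                   */
	assert(msr_offset % 32 == 4);    /* read-intercept bit 4, write-intercept bit 5 */
	return 0;
}
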
363
364static __init int svm_hardware_setup(void)
365{
366 int cpu;
367 struct page *iopm_pages;
368 struct page *msrpm_pages;
369 void *iopm_va, *msrpm_va;
370 int r;
371
372 iopm_pages = alloc_pages(GFP_KERNEL, IOPM_ALLOC_ORDER);
373
374 if (!iopm_pages)
375 return -ENOMEM;
376
377 iopm_va = page_address(iopm_pages);
378 memset(iopm_va, 0xff, PAGE_SIZE * (1 << IOPM_ALLOC_ORDER));
379 clear_bit(0x80, iopm_va); /* allow direct access to PC debug port */
380 iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT;
381
382
383 msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER);
384
385 r = -ENOMEM;
386 if (!msrpm_pages)
387 goto err_1;
388
389 msrpm_va = page_address(msrpm_pages);
390 memset(msrpm_va, 0xff, PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER));
391 msrpm_base = page_to_pfn(msrpm_pages) << PAGE_SHIFT;
392
393#ifdef CONFIG_X86_64
394 set_msr_interception(msrpm_va, MSR_GS_BASE, 1, 1);
395 set_msr_interception(msrpm_va, MSR_FS_BASE, 1, 1);
396 set_msr_interception(msrpm_va, MSR_KERNEL_GS_BASE, 1, 1);
397 set_msr_interception(msrpm_va, MSR_LSTAR, 1, 1);
398 set_msr_interception(msrpm_va, MSR_CSTAR, 1, 1);
399 set_msr_interception(msrpm_va, MSR_SYSCALL_MASK, 1, 1);
400#endif
401 set_msr_interception(msrpm_va, MSR_K6_STAR, 1, 1);
402 set_msr_interception(msrpm_va, MSR_IA32_SYSENTER_CS, 1, 1);
403 set_msr_interception(msrpm_va, MSR_IA32_SYSENTER_ESP, 1, 1);
404 set_msr_interception(msrpm_va, MSR_IA32_SYSENTER_EIP, 1, 1);
405
406 for_each_online_cpu(cpu) {
407 r = svm_cpu_init(cpu);
408 if (r)
409 goto err_2;
410 }
411 return 0;
412
413err_2:
414 __free_pages(msrpm_pages, MSRPM_ALLOC_ORDER);
415 msrpm_base = 0;
416err_1:
417 __free_pages(iopm_pages, IOPM_ALLOC_ORDER);
418 iopm_base = 0;
419 return r;
420}
421
422static __exit void svm_hardware_unsetup(void)
423{
424 __free_pages(pfn_to_page(msrpm_base >> PAGE_SHIFT), MSRPM_ALLOC_ORDER);
425 __free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT), IOPM_ALLOC_ORDER);
426 iopm_base = msrpm_base = 0;
427}
428
429static void init_seg(struct vmcb_seg *seg)
430{
431 seg->selector = 0;
432 seg->attrib = SVM_SELECTOR_P_MASK | SVM_SELECTOR_S_MASK |
433 SVM_SELECTOR_WRITE_MASK; /* Read/Write Data Segment */
434 seg->limit = 0xffff;
435 seg->base = 0;
436}
437
438static void init_sys_seg(struct vmcb_seg *seg, uint32_t type)
439{
440 seg->selector = 0;
441 seg->attrib = SVM_SELECTOR_P_MASK | type;
442 seg->limit = 0xffff;
443 seg->base = 0;
444}
445
446static void init_vmcb(struct vmcb *vmcb)
447{
448 struct vmcb_control_area *control = &vmcb->control;
449 struct vmcb_save_area *save = &vmcb->save;
450
451 control->intercept_cr_read = INTERCEPT_CR0_MASK |
452 INTERCEPT_CR3_MASK |
453 INTERCEPT_CR4_MASK |
454 INTERCEPT_CR8_MASK;
455
456 control->intercept_cr_write = INTERCEPT_CR0_MASK |
457 INTERCEPT_CR3_MASK |
458 INTERCEPT_CR4_MASK |
459 INTERCEPT_CR8_MASK;
460
461 control->intercept_dr_read = INTERCEPT_DR0_MASK |
462 INTERCEPT_DR1_MASK |
463 INTERCEPT_DR2_MASK |
464 INTERCEPT_DR3_MASK;
465
466 control->intercept_dr_write = INTERCEPT_DR0_MASK |
467 INTERCEPT_DR1_MASK |
468 INTERCEPT_DR2_MASK |
469 INTERCEPT_DR3_MASK |
470 INTERCEPT_DR5_MASK |
471 INTERCEPT_DR7_MASK;
472
473 control->intercept_exceptions = (1 << PF_VECTOR) |
474 (1 << UD_VECTOR);
475
476
477 control->intercept = (1ULL << INTERCEPT_INTR) |
478 (1ULL << INTERCEPT_NMI) |
479 (1ULL << INTERCEPT_SMI) |
480 /*
481 * selective cr0 intercept bug?
482 * 0: 0f 22 d8 mov %eax,%cr3
483 * 3: 0f 20 c0 mov %cr0,%eax
484 * 6: 0d 00 00 00 80 or $0x80000000,%eax
485 * b: 0f 22 c0 mov %eax,%cr0
486 * set cr3 -> interception
487 * get cr0 -> interception
488 * set cr0 -> no interception
489 */
490 /* (1ULL << INTERCEPT_SELECTIVE_CR0) | */
491 (1ULL << INTERCEPT_CPUID) |
492 (1ULL << INTERCEPT_INVD) |
493 (1ULL << INTERCEPT_HLT) |
494 (1ULL << INTERCEPT_INVLPGA) |
495 (1ULL << INTERCEPT_IOIO_PROT) |
496 (1ULL << INTERCEPT_MSR_PROT) |
497 (1ULL << INTERCEPT_TASK_SWITCH) |
498 (1ULL << INTERCEPT_SHUTDOWN) |
499 (1ULL << INTERCEPT_VMRUN) |
500 (1ULL << INTERCEPT_VMMCALL) |
501 (1ULL << INTERCEPT_VMLOAD) |
502 (1ULL << INTERCEPT_VMSAVE) |
503 (1ULL << INTERCEPT_STGI) |
504 (1ULL << INTERCEPT_CLGI) |
505 (1ULL << INTERCEPT_SKINIT) |
506 (1ULL << INTERCEPT_WBINVD) |
507 (1ULL << INTERCEPT_MONITOR) |
508 (1ULL << INTERCEPT_MWAIT);
509
510 control->iopm_base_pa = iopm_base;
511 control->msrpm_base_pa = msrpm_base;
512 control->tsc_offset = 0;
513 control->int_ctl = V_INTR_MASKING_MASK;
514
515 init_seg(&save->es);
516 init_seg(&save->ss);
517 init_seg(&save->ds);
518 init_seg(&save->fs);
519 init_seg(&save->gs);
520
521 save->cs.selector = 0xf000;
522 /* Executable/Readable Code Segment */
523 save->cs.attrib = SVM_SELECTOR_READ_MASK | SVM_SELECTOR_P_MASK |
524 SVM_SELECTOR_S_MASK | SVM_SELECTOR_CODE_MASK;
525 save->cs.limit = 0xffff;
526 /*
527 * cs.base should really be 0xffff0000, but vmx can't handle that, so
528 * be consistent with it.
529 *
530 * Replace when we have real mode working for vmx.
531 */
532 save->cs.base = 0xf0000;
533
534 save->gdtr.limit = 0xffff;
535 save->idtr.limit = 0xffff;
536
537 init_sys_seg(&save->ldtr, SEG_TYPE_LDT);
538 init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16);
539
540 save->efer = MSR_EFER_SVME_MASK;
541 save->dr6 = 0xffff0ff0;
542 save->dr7 = 0x400;
543 save->rflags = 2;
544 save->rip = 0x0000fff0;
545
546 /*
547 * cr0 val on cpu init should be 0x60000010; we enable the cpu
548 * cache by default. The proper way is to enable the cache in the bios.
549 */
550 save->cr0 = 0x00000010 | X86_CR0_PG | X86_CR0_WP;
551 save->cr4 = X86_CR4_PAE;
552 /* rdx = ?? */
553}
554
555static int svm_vcpu_reset(struct kvm_vcpu *vcpu)
556{
557 struct vcpu_svm *svm = to_svm(vcpu);
558
559 init_vmcb(svm->vmcb);
560
561 if (vcpu->vcpu_id != 0) {
562 svm->vmcb->save.rip = 0;
563 svm->vmcb->save.cs.base = svm->vcpu.arch.sipi_vector << 12;
564 svm->vmcb->save.cs.selector = svm->vcpu.arch.sipi_vector << 8;
565 }
566
567 return 0;
568}
569
570static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
571{
572 struct vcpu_svm *svm;
573 struct page *page;
574 int err;
575
576 svm = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
577 if (!svm) {
578 err = -ENOMEM;
579 goto out;
580 }
581
582 err = kvm_vcpu_init(&svm->vcpu, kvm, id);
583 if (err)
584 goto free_svm;
585
586 page = alloc_page(GFP_KERNEL);
587 if (!page) {
588 err = -ENOMEM;
589 goto uninit;
590 }
591
592 svm->vmcb = page_address(page);
593 clear_page(svm->vmcb);
594 svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT;
595 svm->asid_generation = 0;
596 memset(svm->db_regs, 0, sizeof(svm->db_regs));
597 init_vmcb(svm->vmcb);
598
599 fx_init(&svm->vcpu);
600 svm->vcpu.fpu_active = 1;
601 svm->vcpu.arch.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
602 if (svm->vcpu.vcpu_id == 0)
603 svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP;
604
605 return &svm->vcpu;
606
607uninit:
608 kvm_vcpu_uninit(&svm->vcpu);
609free_svm:
610 kmem_cache_free(kvm_vcpu_cache, svm);
611out:
612 return ERR_PTR(err);
613}
614
615static void svm_free_vcpu(struct kvm_vcpu *vcpu)
616{
617 struct vcpu_svm *svm = to_svm(vcpu);
618
619 __free_page(pfn_to_page(svm->vmcb_pa >> PAGE_SHIFT));
620 kvm_vcpu_uninit(vcpu);
621 kmem_cache_free(kvm_vcpu_cache, svm);
622}
623
624static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
625{
626 struct vcpu_svm *svm = to_svm(vcpu);
627 int i;
628
629 if (unlikely(cpu != vcpu->cpu)) {
630 u64 tsc_this, delta;
631
632 /*
633 * Make sure that the guest sees a monotonically
634 * increasing TSC.
635 */
636 rdtscll(tsc_this);
637 delta = vcpu->arch.host_tsc - tsc_this;
638 svm->vmcb->control.tsc_offset += delta;
639 vcpu->cpu = cpu;
640 kvm_migrate_apic_timer(vcpu);
641 }
642
643 for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
644 rdmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
645}
646
647static void svm_vcpu_put(struct kvm_vcpu *vcpu)
648{
649 struct vcpu_svm *svm = to_svm(vcpu);
650 int i;
651
652 ++vcpu->stat.host_state_reload;
653 for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
654 wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
655
656 rdtscll(vcpu->arch.host_tsc);
657}
658
659static void svm_vcpu_decache(struct kvm_vcpu *vcpu)
660{
661}
662
663static void svm_cache_regs(struct kvm_vcpu *vcpu)
664{
665 struct vcpu_svm *svm = to_svm(vcpu);
666
667 vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
668 vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
669 vcpu->arch.rip = svm->vmcb->save.rip;
670}
671
672static void svm_decache_regs(struct kvm_vcpu *vcpu)
673{
674 struct vcpu_svm *svm = to_svm(vcpu);
675 svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
676 svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
677 svm->vmcb->save.rip = vcpu->arch.rip;
678}
679
680static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
681{
682 return to_svm(vcpu)->vmcb->save.rflags;
683}
684
685static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
686{
687 to_svm(vcpu)->vmcb->save.rflags = rflags;
688}
689
690static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg)
691{
692 struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;
693
694 switch (seg) {
695 case VCPU_SREG_CS: return &save->cs;
696 case VCPU_SREG_DS: return &save->ds;
697 case VCPU_SREG_ES: return &save->es;
698 case VCPU_SREG_FS: return &save->fs;
699 case VCPU_SREG_GS: return &save->gs;
700 case VCPU_SREG_SS: return &save->ss;
701 case VCPU_SREG_TR: return &save->tr;
702 case VCPU_SREG_LDTR: return &save->ldtr;
703 }
704 BUG();
705 return NULL;
706}
707
708static u64 svm_get_segment_base(struct kvm_vcpu *vcpu, int seg)
709{
710 struct vmcb_seg *s = svm_seg(vcpu, seg);
711
712 return s->base;
713}
714
715static void svm_get_segment(struct kvm_vcpu *vcpu,
716 struct kvm_segment *var, int seg)
717{
718 struct vmcb_seg *s = svm_seg(vcpu, seg);
719
720 var->base = s->base;
721 var->limit = s->limit;
722 var->selector = s->selector;
723 var->type = s->attrib & SVM_SELECTOR_TYPE_MASK;
724 var->s = (s->attrib >> SVM_SELECTOR_S_SHIFT) & 1;
725 var->dpl = (s->attrib >> SVM_SELECTOR_DPL_SHIFT) & 3;
726 var->present = (s->attrib >> SVM_SELECTOR_P_SHIFT) & 1;
727 var->avl = (s->attrib >> SVM_SELECTOR_AVL_SHIFT) & 1;
728 var->l = (s->attrib >> SVM_SELECTOR_L_SHIFT) & 1;
729 var->db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1;
730 var->g = (s->attrib >> SVM_SELECTOR_G_SHIFT) & 1;
731 var->unusable = !var->present;
732}
733
734static void svm_get_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
735{
736 struct vcpu_svm *svm = to_svm(vcpu);
737
738 dt->limit = svm->vmcb->save.idtr.limit;
739 dt->base = svm->vmcb->save.idtr.base;
740}
741
742static void svm_set_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
743{
744 struct vcpu_svm *svm = to_svm(vcpu);
745
746 svm->vmcb->save.idtr.limit = dt->limit;
747 svm->vmcb->save.idtr.base = dt->base;
748}
749
750static void svm_get_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
751{
752 struct vcpu_svm *svm = to_svm(vcpu);
753
754 dt->limit = svm->vmcb->save.gdtr.limit;
755 dt->base = svm->vmcb->save.gdtr.base;
756}
757
758static void svm_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
759{
760 struct vcpu_svm *svm = to_svm(vcpu);
761
762 svm->vmcb->save.gdtr.limit = dt->limit;
763 svm->vmcb->save.gdtr.base = dt->base;
764}
765
766static void svm_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
767{
768}
769
770static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
771{
772 struct vcpu_svm *svm = to_svm(vcpu);
773
774#ifdef CONFIG_X86_64
775 if (vcpu->arch.shadow_efer & EFER_LME) {
776 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
777 vcpu->arch.shadow_efer |= EFER_LMA;
778 svm->vmcb->save.efer |= EFER_LMA | EFER_LME;
779 }
780
781 if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) {
782 vcpu->arch.shadow_efer &= ~EFER_LMA;
783 svm->vmcb->save.efer &= ~(EFER_LMA | EFER_LME);
784 }
785 }
786#endif
787 if ((vcpu->arch.cr0 & X86_CR0_TS) && !(cr0 & X86_CR0_TS)) {
788 svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR);
789 vcpu->fpu_active = 1;
790 }
791
792 vcpu->arch.cr0 = cr0;
793 cr0 |= X86_CR0_PG | X86_CR0_WP;
794 cr0 &= ~(X86_CR0_CD | X86_CR0_NW);
795 svm->vmcb->save.cr0 = cr0;
796}
797
798static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
799{
800 vcpu->arch.cr4 = cr4;
801 to_svm(vcpu)->vmcb->save.cr4 = cr4 | X86_CR4_PAE;
802}
803
804static void svm_set_segment(struct kvm_vcpu *vcpu,
805 struct kvm_segment *var, int seg)
806{
807 struct vcpu_svm *svm = to_svm(vcpu);
808 struct vmcb_seg *s = svm_seg(vcpu, seg);
809
810 s->base = var->base;
811 s->limit = var->limit;
812 s->selector = var->selector;
813 if (var->unusable)
814 s->attrib = 0;
815 else {
816 s->attrib = (var->type & SVM_SELECTOR_TYPE_MASK);
817 s->attrib |= (var->s & 1) << SVM_SELECTOR_S_SHIFT;
818 s->attrib |= (var->dpl & 3) << SVM_SELECTOR_DPL_SHIFT;
819 s->attrib |= (var->present & 1) << SVM_SELECTOR_P_SHIFT;
820 s->attrib |= (var->avl & 1) << SVM_SELECTOR_AVL_SHIFT;
821 s->attrib |= (var->l & 1) << SVM_SELECTOR_L_SHIFT;
822 s->attrib |= (var->db & 1) << SVM_SELECTOR_DB_SHIFT;
823 s->attrib |= (var->g & 1) << SVM_SELECTOR_G_SHIFT;
824 }
825 if (seg == VCPU_SREG_CS)
826 svm->vmcb->save.cpl
827 = (svm->vmcb->save.cs.attrib
828 >> SVM_SELECTOR_DPL_SHIFT) & 3;
829
830}
831
832/* FIXME:
833
834 svm(vcpu)->vmcb->control.int_ctl &= ~V_TPR_MASK;
835 svm(vcpu)->vmcb->control.int_ctl |= (sregs->cr8 & V_TPR_MASK);
836
837*/
838
839static int svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg)
840{
841 return -EOPNOTSUPP;
842}
843
844static int svm_get_irq(struct kvm_vcpu *vcpu)
845{
846 struct vcpu_svm *svm = to_svm(vcpu);
847 u32 exit_int_info = svm->vmcb->control.exit_int_info;
848
849 if (is_external_interrupt(exit_int_info))
850 return exit_int_info & SVM_EVTINJ_VEC_MASK;
851 return -1;
852}
853
854static void load_host_msrs(struct kvm_vcpu *vcpu)
855{
856#ifdef CONFIG_X86_64
857 wrmsrl(MSR_GS_BASE, to_svm(vcpu)->host_gs_base);
858#endif
859}
860
861static void save_host_msrs(struct kvm_vcpu *vcpu)
862{
863#ifdef CONFIG_X86_64
864 rdmsrl(MSR_GS_BASE, to_svm(vcpu)->host_gs_base);
865#endif
866}
867
868static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *svm_data)
869{
870 if (svm_data->next_asid > svm_data->max_asid) {
871 ++svm_data->asid_generation;
872 svm_data->next_asid = 1;
873 svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ALL_ASID;
874 }
875
876 svm->vcpu.cpu = svm_data->cpu;
877 svm->asid_generation = svm_data->asid_generation;
878 svm->vmcb->control.asid = svm_data->next_asid++;
879}
880
881static unsigned long svm_get_dr(struct kvm_vcpu *vcpu, int dr)
882{
883 return to_svm(vcpu)->db_regs[dr];
884}
885
886static void svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value,
887 int *exception)
888{
889 struct vcpu_svm *svm = to_svm(vcpu);
890
891 *exception = 0;
892
893 if (svm->vmcb->save.dr7 & DR7_GD_MASK) {
894 svm->vmcb->save.dr7 &= ~DR7_GD_MASK;
895 svm->vmcb->save.dr6 |= DR6_BD_MASK;
896 *exception = DB_VECTOR;
897 return;
898 }
899
900 switch (dr) {
901 case 0 ... 3:
902 svm->db_regs[dr] = value;
903 return;
904 case 4 ... 5:
905 if (vcpu->arch.cr4 & X86_CR4_DE) {
906 *exception = UD_VECTOR;
907 return;
908 }
909 case 7: {
910 if (value & ~((1ULL << 32) - 1)) {
911 *exception = GP_VECTOR;
912 return;
913 }
914 svm->vmcb->save.dr7 = value;
915 return;
916 }
917 default:
918 printk(KERN_DEBUG "%s: unexpected dr %u\n",
919 __FUNCTION__, dr);
920 *exception = UD_VECTOR;
921 return;
922 }
923}
924
925static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
926{
927 u32 exit_int_info = svm->vmcb->control.exit_int_info;
928 struct kvm *kvm = svm->vcpu.kvm;
929 u64 fault_address;
930 u32 error_code;
931
932 if (!irqchip_in_kernel(kvm) &&
933 is_external_interrupt(exit_int_info))
934 push_irq(&svm->vcpu, exit_int_info & SVM_EVTINJ_VEC_MASK);
935
936 fault_address = svm->vmcb->control.exit_info_2;
937 error_code = svm->vmcb->control.exit_info_1;
938 return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code);
939}
940
941static int ud_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
942{
943 int er;
944
945 er = emulate_instruction(&svm->vcpu, kvm_run, 0, 0, 0);
946 if (er != EMULATE_DONE)
947 kvm_queue_exception(&svm->vcpu, UD_VECTOR);
948 return 1;
949}
950
951static int nm_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
952{
953 svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR);
954 if (!(svm->vcpu.arch.cr0 & X86_CR0_TS))
955 svm->vmcb->save.cr0 &= ~X86_CR0_TS;
956 svm->vcpu.fpu_active = 1;
957
958 return 1;
959}
960
961static int shutdown_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
962{
963 /*
964 * VMCB is undefined after a SHUTDOWN intercept
965 * so reinitialize it.
966 */
967 clear_page(svm->vmcb);
968 init_vmcb(svm->vmcb);
969
970 kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
971 return 0;
972}
973
974static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
975{
976 u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */
977 int size, down, in, string, rep;
978 unsigned port;
979
980 ++svm->vcpu.stat.io_exits;
981
982 svm->next_rip = svm->vmcb->control.exit_info_2;
983
984 string = (io_info & SVM_IOIO_STR_MASK) != 0;
985
986 if (string) {
987 if (emulate_instruction(&svm->vcpu,
988 kvm_run, 0, 0, 0) == EMULATE_DO_MMIO)
989 return 0;
990 return 1;
991 }
992
993 in = (io_info & SVM_IOIO_TYPE_MASK) != 0;
994 port = io_info >> 16;
995 size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT;
996 rep = (io_info & SVM_IOIO_REP_MASK) != 0;
997 down = (svm->vmcb->save.rflags & X86_EFLAGS_DF) != 0;
998
999 return kvm_emulate_pio(&svm->vcpu, kvm_run, in, size, port);
1000}
1001
1002static int nop_on_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1003{
1004 return 1;
1005}
1006
1007static int halt_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1008{
1009 svm->next_rip = svm->vmcb->save.rip + 1;
1010 skip_emulated_instruction(&svm->vcpu);
1011 return kvm_emulate_halt(&svm->vcpu);
1012}
1013
1014static int vmmcall_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1015{
1016 svm->next_rip = svm->vmcb->save.rip + 3;
1017 skip_emulated_instruction(&svm->vcpu);
1018 kvm_emulate_hypercall(&svm->vcpu);
1019 return 1;
1020}
1021
1022static int invalid_op_interception(struct vcpu_svm *svm,
1023 struct kvm_run *kvm_run)
1024{
1025 kvm_queue_exception(&svm->vcpu, UD_VECTOR);
1026 return 1;
1027}
1028
1029static int task_switch_interception(struct vcpu_svm *svm,
1030 struct kvm_run *kvm_run)
1031{
1032 pr_unimpl(&svm->vcpu, "%s: task switch is unsupported\n", __FUNCTION__);
1033 kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
1034 return 0;
1035}
1036
1037static int cpuid_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1038{
1039 svm->next_rip = svm->vmcb->save.rip + 2;
1040 kvm_emulate_cpuid(&svm->vcpu);
1041 return 1;
1042}
1043
1044static int emulate_on_interception(struct vcpu_svm *svm,
1045 struct kvm_run *kvm_run)
1046{
1047 if (emulate_instruction(&svm->vcpu, NULL, 0, 0, 0) != EMULATE_DONE)
1048 pr_unimpl(&svm->vcpu, "%s: failed\n", __FUNCTION__);
1049 return 1;
1050}
1051
1052static int cr8_write_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1053{
1054 emulate_instruction(&svm->vcpu, NULL, 0, 0, 0);
1055 if (irqchip_in_kernel(svm->vcpu.kvm))
1056 return 1;
1057 kvm_run->exit_reason = KVM_EXIT_SET_TPR;
1058 return 0;
1059}
1060
1061static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
1062{
1063 struct vcpu_svm *svm = to_svm(vcpu);
1064
1065 switch (ecx) {
1066 case MSR_IA32_TIME_STAMP_COUNTER: {
1067 u64 tsc;
1068
1069 rdtscll(tsc);
1070 *data = svm->vmcb->control.tsc_offset + tsc;
1071 break;
1072 }
1073 case MSR_K6_STAR:
1074 *data = svm->vmcb->save.star;
1075 break;
1076#ifdef CONFIG_X86_64
1077 case MSR_LSTAR:
1078 *data = svm->vmcb->save.lstar;
1079 break;
1080 case MSR_CSTAR:
1081 *data = svm->vmcb->save.cstar;
1082 break;
1083 case MSR_KERNEL_GS_BASE:
1084 *data = svm->vmcb->save.kernel_gs_base;
1085 break;
1086 case MSR_SYSCALL_MASK:
1087 *data = svm->vmcb->save.sfmask;
1088 break;
1089#endif
1090 case MSR_IA32_SYSENTER_CS:
1091 *data = svm->vmcb->save.sysenter_cs;
1092 break;
1093 case MSR_IA32_SYSENTER_EIP:
1094 *data = svm->vmcb->save.sysenter_eip;
1095 break;
1096 case MSR_IA32_SYSENTER_ESP:
1097 *data = svm->vmcb->save.sysenter_esp;
1098 break;
1099 default:
1100 return kvm_get_msr_common(vcpu, ecx, data);
1101 }
1102 return 0;
1103}
1104
1105static int rdmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1106{
1107 u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX];
1108 u64 data;
1109
1110 if (svm_get_msr(&svm->vcpu, ecx, &data))
1111 kvm_inject_gp(&svm->vcpu, 0);
1112 else {
1113 svm->vmcb->save.rax = data & 0xffffffff;
1114 svm->vcpu.arch.regs[VCPU_REGS_RDX] = data >> 32;
1115 svm->next_rip = svm->vmcb->save.rip + 2;
1116 skip_emulated_instruction(&svm->vcpu);
1117 }
1118 return 1;
1119}
1120
1121static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
1122{
1123 struct vcpu_svm *svm = to_svm(vcpu);
1124
1125 switch (ecx) {
1126 case MSR_IA32_TIME_STAMP_COUNTER: {
1127 u64 tsc;
1128
1129 rdtscll(tsc);
1130 svm->vmcb->control.tsc_offset = data - tsc;
1131 break;
1132 }
1133 case MSR_K6_STAR:
1134 svm->vmcb->save.star = data;
1135 break;
1136#ifdef CONFIG_X86_64
1137 case MSR_LSTAR:
1138 svm->vmcb->save.lstar = data;
1139 break;
1140 case MSR_CSTAR:
1141 svm->vmcb->save.cstar = data;
1142 break;
1143 case MSR_KERNEL_GS_BASE:
1144 svm->vmcb->save.kernel_gs_base = data;
1145 break;
1146 case MSR_SYSCALL_MASK:
1147 svm->vmcb->save.sfmask = data;
1148 break;
1149#endif
1150 case MSR_IA32_SYSENTER_CS:
1151 svm->vmcb->save.sysenter_cs = data;
1152 break;
1153 case MSR_IA32_SYSENTER_EIP:
1154 svm->vmcb->save.sysenter_eip = data;
1155 break;
1156 case MSR_IA32_SYSENTER_ESP:
1157 svm->vmcb->save.sysenter_esp = data;
1158 break;
1159 case MSR_K7_EVNTSEL0:
1160 case MSR_K7_EVNTSEL1:
1161 case MSR_K7_EVNTSEL2:
1162 case MSR_K7_EVNTSEL3:
1163 /*
1164 * Only support writing 0 to the performance counters for now,
1165 * to make Windows happy. This should be replaced by real
1166 * performance counter emulation later.
1167 */
1168 if (data != 0)
1169 goto unhandled;
1170 break;
1171 default:
1172 unhandled:
1173 return kvm_set_msr_common(vcpu, ecx, data);
1174 }
1175 return 0;
1176}
1177
1178static int wrmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1179{
1180 u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX];
1181 u64 data = (svm->vmcb->save.rax & -1u)
1182 | ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32);
1183 svm->next_rip = svm->vmcb->save.rip + 2;
1184 if (svm_set_msr(&svm->vcpu, ecx, data))
1185 kvm_inject_gp(&svm->vcpu, 0);
1186 else
1187 skip_emulated_instruction(&svm->vcpu);
1188 return 1;
1189}
1190
1191static int msr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1192{
1193 if (svm->vmcb->control.exit_info_1)
1194 return wrmsr_interception(svm, kvm_run);
1195 else
1196 return rdmsr_interception(svm, kvm_run);
1197}
1198
1199static int interrupt_window_interception(struct vcpu_svm *svm,
1200 struct kvm_run *kvm_run)
1201{
1202 svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_VINTR);
1203 svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
1204 /*
1205	 * If user space is waiting to inject interrupts, exit as soon
1206	 * as possible.
1207 */
1208 if (kvm_run->request_interrupt_window &&
1209 !svm->vcpu.arch.irq_summary) {
1210 ++svm->vcpu.stat.irq_window_exits;
1211 kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
1212 return 0;
1213 }
1214
1215 return 1;
1216}
1217
1218static int (*svm_exit_handlers[])(struct vcpu_svm *svm,
1219 struct kvm_run *kvm_run) = {
1220 [SVM_EXIT_READ_CR0] = emulate_on_interception,
1221 [SVM_EXIT_READ_CR3] = emulate_on_interception,
1222 [SVM_EXIT_READ_CR4] = emulate_on_interception,
1223 [SVM_EXIT_READ_CR8] = emulate_on_interception,
1224 /* for now: */
1225 [SVM_EXIT_WRITE_CR0] = emulate_on_interception,
1226 [SVM_EXIT_WRITE_CR3] = emulate_on_interception,
1227 [SVM_EXIT_WRITE_CR4] = emulate_on_interception,
1228 [SVM_EXIT_WRITE_CR8] = cr8_write_interception,
1229 [SVM_EXIT_READ_DR0] = emulate_on_interception,
1230 [SVM_EXIT_READ_DR1] = emulate_on_interception,
1231 [SVM_EXIT_READ_DR2] = emulate_on_interception,
1232 [SVM_EXIT_READ_DR3] = emulate_on_interception,
1233 [SVM_EXIT_WRITE_DR0] = emulate_on_interception,
1234 [SVM_EXIT_WRITE_DR1] = emulate_on_interception,
1235 [SVM_EXIT_WRITE_DR2] = emulate_on_interception,
1236 [SVM_EXIT_WRITE_DR3] = emulate_on_interception,
1237 [SVM_EXIT_WRITE_DR5] = emulate_on_interception,
1238 [SVM_EXIT_WRITE_DR7] = emulate_on_interception,
1239 [SVM_EXIT_EXCP_BASE + UD_VECTOR] = ud_interception,
1240 [SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception,
1241 [SVM_EXIT_EXCP_BASE + NM_VECTOR] = nm_interception,
1242 [SVM_EXIT_INTR] = nop_on_interception,
1243 [SVM_EXIT_NMI] = nop_on_interception,
1244 [SVM_EXIT_SMI] = nop_on_interception,
1245 [SVM_EXIT_INIT] = nop_on_interception,
1246 [SVM_EXIT_VINTR] = interrupt_window_interception,
1247 /* [SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception, */
1248 [SVM_EXIT_CPUID] = cpuid_interception,
1249 [SVM_EXIT_INVD] = emulate_on_interception,
1250 [SVM_EXIT_HLT] = halt_interception,
1251 [SVM_EXIT_INVLPG] = emulate_on_interception,
1252 [SVM_EXIT_INVLPGA] = invalid_op_interception,
1253 [SVM_EXIT_IOIO] = io_interception,
1254 [SVM_EXIT_MSR] = msr_interception,
1255 [SVM_EXIT_TASK_SWITCH] = task_switch_interception,
1256 [SVM_EXIT_SHUTDOWN] = shutdown_interception,
1257 [SVM_EXIT_VMRUN] = invalid_op_interception,
1258 [SVM_EXIT_VMMCALL] = vmmcall_interception,
1259 [SVM_EXIT_VMLOAD] = invalid_op_interception,
1260 [SVM_EXIT_VMSAVE] = invalid_op_interception,
1261 [SVM_EXIT_STGI] = invalid_op_interception,
1262 [SVM_EXIT_CLGI] = invalid_op_interception,
1263 [SVM_EXIT_SKINIT] = invalid_op_interception,
1264 [SVM_EXIT_WBINVD] = emulate_on_interception,
1265 [SVM_EXIT_MONITOR] = invalid_op_interception,
1266 [SVM_EXIT_MWAIT] = invalid_op_interception,
1267};
1268
1269
1270static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
1271{
1272 struct vcpu_svm *svm = to_svm(vcpu);
1273 u32 exit_code = svm->vmcb->control.exit_code;
1274
1275 kvm_reput_irq(svm);
1276
1277 if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) {
1278 kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
1279 kvm_run->fail_entry.hardware_entry_failure_reason
1280 = svm->vmcb->control.exit_code;
1281 return 0;
1282 }
1283
1284 if (is_external_interrupt(svm->vmcb->control.exit_int_info) &&
1285 exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR)
1286		printk(KERN_ERR "%s: unexpected exit_int_info 0x%x "
1287 "exit_code 0x%x\n",
1288 __FUNCTION__, svm->vmcb->control.exit_int_info,
1289 exit_code);
1290
1291 if (exit_code >= ARRAY_SIZE(svm_exit_handlers)
1292 || !svm_exit_handlers[exit_code]) {
1293 kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
1294 kvm_run->hw.hardware_exit_reason = exit_code;
1295 return 0;
1296 }
1297
1298 return svm_exit_handlers[exit_code](svm, kvm_run);
1299}
1300
1301static void reload_tss(struct kvm_vcpu *vcpu)
1302{
1303 int cpu = raw_smp_processor_id();
1304
1305 struct svm_cpu_data *svm_data = per_cpu(svm_data, cpu);
1306 svm_data->tss_desc->type = 9; /* available 32/64-bit TSS */
1307 load_TR_desc();
1308}
1309
1310static void pre_svm_run(struct vcpu_svm *svm)
1311{
1312 int cpu = raw_smp_processor_id();
1313
1314 struct svm_cpu_data *svm_data = per_cpu(svm_data, cpu);
1315
1316 svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;
1317 if (svm->vcpu.cpu != cpu ||
1318 svm->asid_generation != svm_data->asid_generation)
1319 new_asid(svm, svm_data);
1320}
1321
1322
1323static inline void svm_inject_irq(struct vcpu_svm *svm, int irq)
1324{
1325 struct vmcb_control_area *control;
1326
1327 control = &svm->vmcb->control;
1328 control->int_vector = irq;
1329 control->int_ctl &= ~V_INTR_PRIO_MASK;
1330 control->int_ctl |= V_IRQ_MASK |
1331 ((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT);
1332}
1333
1334static void svm_set_irq(struct kvm_vcpu *vcpu, int irq)
1335{
1336 struct vcpu_svm *svm = to_svm(vcpu);
1337
1338 svm_inject_irq(svm, irq);
1339}
1340
1341static void svm_intr_assist(struct kvm_vcpu *vcpu)
1342{
1343 struct vcpu_svm *svm = to_svm(vcpu);
1344 struct vmcb *vmcb = svm->vmcb;
1345 int intr_vector = -1;
1346
1347 if ((vmcb->control.exit_int_info & SVM_EVTINJ_VALID) &&
1348 ((vmcb->control.exit_int_info & SVM_EVTINJ_TYPE_MASK) == 0)) {
1349 intr_vector = vmcb->control.exit_int_info &
1350 SVM_EVTINJ_VEC_MASK;
1351 vmcb->control.exit_int_info = 0;
1352 svm_inject_irq(svm, intr_vector);
1353 return;
1354 }
1355
1356 if (vmcb->control.int_ctl & V_IRQ_MASK)
1357 return;
1358
1359 if (!kvm_cpu_has_interrupt(vcpu))
1360 return;
1361
1362 if (!(vmcb->save.rflags & X86_EFLAGS_IF) ||
1363 (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) ||
1364 (vmcb->control.event_inj & SVM_EVTINJ_VALID)) {
1365 /* unable to deliver irq, set pending irq */
1366 vmcb->control.intercept |= (1ULL << INTERCEPT_VINTR);
1367 svm_inject_irq(svm, 0x0);
1368 return;
1369 }
1370 /* Okay, we can deliver the interrupt: grab it and update PIC state. */
1371 intr_vector = kvm_cpu_get_interrupt(vcpu);
1372 svm_inject_irq(svm, intr_vector);
1373 kvm_timer_intr_post(vcpu, intr_vector);
1374}
1375
1376static void kvm_reput_irq(struct vcpu_svm *svm)
1377{
1378 struct vmcb_control_area *control = &svm->vmcb->control;
1379
1380 if ((control->int_ctl & V_IRQ_MASK)
1381 && !irqchip_in_kernel(svm->vcpu.kvm)) {
1382 control->int_ctl &= ~V_IRQ_MASK;
1383 push_irq(&svm->vcpu, control->int_vector);
1384 }
1385
1386 svm->vcpu.arch.interrupt_window_open =
1387 !(control->int_state & SVM_INTERRUPT_SHADOW_MASK);
1388}
1389
1390static void svm_do_inject_vector(struct vcpu_svm *svm)
1391{
1392 struct kvm_vcpu *vcpu = &svm->vcpu;
1393 int word_index = __ffs(vcpu->arch.irq_summary);
1394 int bit_index = __ffs(vcpu->arch.irq_pending[word_index]);
1395 int irq = word_index * BITS_PER_LONG + bit_index;
1396
1397 clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]);
1398 if (!vcpu->arch.irq_pending[word_index])
1399 clear_bit(word_index, &vcpu->arch.irq_summary);
1400 svm_inject_irq(svm, irq);
1401}
1402
1403static void do_interrupt_requests(struct kvm_vcpu *vcpu,
1404 struct kvm_run *kvm_run)
1405{
1406 struct vcpu_svm *svm = to_svm(vcpu);
1407 struct vmcb_control_area *control = &svm->vmcb->control;
1408
1409 svm->vcpu.arch.interrupt_window_open =
1410 (!(control->int_state & SVM_INTERRUPT_SHADOW_MASK) &&
1411 (svm->vmcb->save.rflags & X86_EFLAGS_IF));
1412
1413 if (svm->vcpu.arch.interrupt_window_open && svm->vcpu.arch.irq_summary)
1414 /*
1415		 * If interrupts are enabled and not blocked by sti or mov ss, inject now.
1416 */
1417 svm_do_inject_vector(svm);
1418
1419 /*
1420 * Interrupts blocked. Wait for unblock.
1421 */
1422 if (!svm->vcpu.arch.interrupt_window_open &&
1423 (svm->vcpu.arch.irq_summary || kvm_run->request_interrupt_window))
1424 control->intercept |= 1ULL << INTERCEPT_VINTR;
1425 else
1426 control->intercept &= ~(1ULL << INTERCEPT_VINTR);
1427}
1428
1429static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr)
1430{
1431 return 0;
1432}
1433
1434static void save_db_regs(unsigned long *db_regs)
1435{
1436 asm volatile ("mov %%dr0, %0" : "=r"(db_regs[0]));
1437 asm volatile ("mov %%dr1, %0" : "=r"(db_regs[1]));
1438 asm volatile ("mov %%dr2, %0" : "=r"(db_regs[2]));
1439 asm volatile ("mov %%dr3, %0" : "=r"(db_regs[3]));
1440}
1441
1442static void load_db_regs(unsigned long *db_regs)
1443{
1444 asm volatile ("mov %0, %%dr0" : : "r"(db_regs[0]));
1445 asm volatile ("mov %0, %%dr1" : : "r"(db_regs[1]));
1446 asm volatile ("mov %0, %%dr2" : : "r"(db_regs[2]));
1447 asm volatile ("mov %0, %%dr3" : : "r"(db_regs[3]));
1448}
1449
1450static void svm_flush_tlb(struct kvm_vcpu *vcpu)
1451{
1452 force_new_asid(vcpu);
1453}
1454
1455static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu)
1456{
1457}
1458
1459static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1460{
1461 struct vcpu_svm *svm = to_svm(vcpu);
1462 u16 fs_selector;
1463 u16 gs_selector;
1464 u16 ldt_selector;
1465
1466 pre_svm_run(svm);
1467
1468 save_host_msrs(vcpu);
1469 fs_selector = read_fs();
1470 gs_selector = read_gs();
1471 ldt_selector = read_ldt();
1472 svm->host_cr2 = kvm_read_cr2();
1473 svm->host_dr6 = read_dr6();
1474 svm->host_dr7 = read_dr7();
1475 svm->vmcb->save.cr2 = vcpu->arch.cr2;
1476
1477 if (svm->vmcb->save.dr7 & 0xff) {
1478 write_dr7(0);
1479 save_db_regs(svm->host_db_regs);
1480 load_db_regs(svm->db_regs);
1481 }
1482
1483 clgi();
1484
1485 local_irq_enable();
1486
1487 asm volatile (
1488#ifdef CONFIG_X86_64
1489 "push %%rbp; \n\t"
1490#else
1491 "push %%ebp; \n\t"
1492#endif
1493
1494#ifdef CONFIG_X86_64
1495 "mov %c[rbx](%[svm]), %%rbx \n\t"
1496 "mov %c[rcx](%[svm]), %%rcx \n\t"
1497 "mov %c[rdx](%[svm]), %%rdx \n\t"
1498 "mov %c[rsi](%[svm]), %%rsi \n\t"
1499 "mov %c[rdi](%[svm]), %%rdi \n\t"
1500 "mov %c[rbp](%[svm]), %%rbp \n\t"
1501 "mov %c[r8](%[svm]), %%r8 \n\t"
1502 "mov %c[r9](%[svm]), %%r9 \n\t"
1503 "mov %c[r10](%[svm]), %%r10 \n\t"
1504 "mov %c[r11](%[svm]), %%r11 \n\t"
1505 "mov %c[r12](%[svm]), %%r12 \n\t"
1506 "mov %c[r13](%[svm]), %%r13 \n\t"
1507 "mov %c[r14](%[svm]), %%r14 \n\t"
1508 "mov %c[r15](%[svm]), %%r15 \n\t"
1509#else
1510 "mov %c[rbx](%[svm]), %%ebx \n\t"
1511 "mov %c[rcx](%[svm]), %%ecx \n\t"
1512 "mov %c[rdx](%[svm]), %%edx \n\t"
1513 "mov %c[rsi](%[svm]), %%esi \n\t"
1514 "mov %c[rdi](%[svm]), %%edi \n\t"
1515 "mov %c[rbp](%[svm]), %%ebp \n\t"
1516#endif
1517
1518#ifdef CONFIG_X86_64
1519 /* Enter guest mode */
1520 "push %%rax \n\t"
1521 "mov %c[vmcb](%[svm]), %%rax \n\t"
1522 SVM_VMLOAD "\n\t"
1523 SVM_VMRUN "\n\t"
1524 SVM_VMSAVE "\n\t"
1525 "pop %%rax \n\t"
1526#else
1527 /* Enter guest mode */
1528 "push %%eax \n\t"
1529 "mov %c[vmcb](%[svm]), %%eax \n\t"
1530 SVM_VMLOAD "\n\t"
1531 SVM_VMRUN "\n\t"
1532 SVM_VMSAVE "\n\t"
1533 "pop %%eax \n\t"
1534#endif
1535
1536 /* Save guest registers, load host registers */
1537#ifdef CONFIG_X86_64
1538 "mov %%rbx, %c[rbx](%[svm]) \n\t"
1539 "mov %%rcx, %c[rcx](%[svm]) \n\t"
1540 "mov %%rdx, %c[rdx](%[svm]) \n\t"
1541 "mov %%rsi, %c[rsi](%[svm]) \n\t"
1542 "mov %%rdi, %c[rdi](%[svm]) \n\t"
1543 "mov %%rbp, %c[rbp](%[svm]) \n\t"
1544 "mov %%r8, %c[r8](%[svm]) \n\t"
1545 "mov %%r9, %c[r9](%[svm]) \n\t"
1546 "mov %%r10, %c[r10](%[svm]) \n\t"
1547 "mov %%r11, %c[r11](%[svm]) \n\t"
1548 "mov %%r12, %c[r12](%[svm]) \n\t"
1549 "mov %%r13, %c[r13](%[svm]) \n\t"
1550 "mov %%r14, %c[r14](%[svm]) \n\t"
1551 "mov %%r15, %c[r15](%[svm]) \n\t"
1552
1553 "pop %%rbp; \n\t"
1554#else
1555 "mov %%ebx, %c[rbx](%[svm]) \n\t"
1556 "mov %%ecx, %c[rcx](%[svm]) \n\t"
1557 "mov %%edx, %c[rdx](%[svm]) \n\t"
1558 "mov %%esi, %c[rsi](%[svm]) \n\t"
1559 "mov %%edi, %c[rdi](%[svm]) \n\t"
1560 "mov %%ebp, %c[rbp](%[svm]) \n\t"
1561
1562 "pop %%ebp; \n\t"
1563#endif
1564 :
1565 : [svm]"a"(svm),
1566 [vmcb]"i"(offsetof(struct vcpu_svm, vmcb_pa)),
1567 [rbx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RBX])),
1568 [rcx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RCX])),
1569 [rdx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RDX])),
1570 [rsi]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RSI])),
1571 [rdi]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RDI])),
1572 [rbp]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RBP]))
1573#ifdef CONFIG_X86_64
1574 , [r8]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R8])),
1575 [r9]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R9])),
1576 [r10]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R10])),
1577 [r11]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R11])),
1578 [r12]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R12])),
1579 [r13]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R13])),
1580 [r14]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R14])),
1581 [r15]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R15]))
1582#endif
1583 : "cc", "memory"
1584#ifdef CONFIG_X86_64
1585 , "rbx", "rcx", "rdx", "rsi", "rdi"
1586 , "r8", "r9", "r10", "r11" , "r12", "r13", "r14", "r15"
1587#else
1588 , "ebx", "ecx", "edx" , "esi", "edi"
1589#endif
1590 );
1591
1592 if ((svm->vmcb->save.dr7 & 0xff))
1593 load_db_regs(svm->host_db_regs);
1594
1595 vcpu->arch.cr2 = svm->vmcb->save.cr2;
1596
1597 write_dr6(svm->host_dr6);
1598 write_dr7(svm->host_dr7);
1599 kvm_write_cr2(svm->host_cr2);
1600
1601 load_fs(fs_selector);
1602 load_gs(gs_selector);
1603 load_ldt(ldt_selector);
1604 load_host_msrs(vcpu);
1605
1606 reload_tss(vcpu);
1607
1608 local_irq_disable();
1609
1610 stgi();
1611
1612 svm->next_rip = 0;
1613}
1614
1615static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root)
1616{
1617 struct vcpu_svm *svm = to_svm(vcpu);
1618
1619 svm->vmcb->save.cr3 = root;
1620 force_new_asid(vcpu);
1621
1622 if (vcpu->fpu_active) {
1623 svm->vmcb->control.intercept_exceptions |= (1 << NM_VECTOR);
1624 svm->vmcb->save.cr0 |= X86_CR0_TS;
1625 vcpu->fpu_active = 0;
1626 }
1627}
1628
1629static int is_disabled(void)
1630{
1631 u64 vm_cr;
1632
1633 rdmsrl(MSR_VM_CR, vm_cr);
1634 if (vm_cr & (1 << SVM_VM_CR_SVM_DISABLE))
1635 return 1;
1636
1637 return 0;
1638}
1639
1640static void
1641svm_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
1642{
1643 /*
1644 * Patch in the VMMCALL instruction:
1645 */
1646 hypercall[0] = 0x0f;
1647 hypercall[1] = 0x01;
1648 hypercall[2] = 0xd9;
1649}
1650
1651static void svm_check_processor_compat(void *rtn)
1652{
1653 *(int *)rtn = 0;
1654}
1655
1656static struct kvm_x86_ops svm_x86_ops = {
1657 .cpu_has_kvm_support = has_svm,
1658 .disabled_by_bios = is_disabled,
1659 .hardware_setup = svm_hardware_setup,
1660 .hardware_unsetup = svm_hardware_unsetup,
1661 .check_processor_compatibility = svm_check_processor_compat,
1662 .hardware_enable = svm_hardware_enable,
1663 .hardware_disable = svm_hardware_disable,
1664
1665 .vcpu_create = svm_create_vcpu,
1666 .vcpu_free = svm_free_vcpu,
1667 .vcpu_reset = svm_vcpu_reset,
1668
1669 .prepare_guest_switch = svm_prepare_guest_switch,
1670 .vcpu_load = svm_vcpu_load,
1671 .vcpu_put = svm_vcpu_put,
1672 .vcpu_decache = svm_vcpu_decache,
1673
1674 .set_guest_debug = svm_guest_debug,
1675 .get_msr = svm_get_msr,
1676 .set_msr = svm_set_msr,
1677 .get_segment_base = svm_get_segment_base,
1678 .get_segment = svm_get_segment,
1679 .set_segment = svm_set_segment,
1680 .get_cs_db_l_bits = kvm_get_cs_db_l_bits,
1681 .decache_cr4_guest_bits = svm_decache_cr4_guest_bits,
1682 .set_cr0 = svm_set_cr0,
1683 .set_cr3 = svm_set_cr3,
1684 .set_cr4 = svm_set_cr4,
1685 .set_efer = svm_set_efer,
1686 .get_idt = svm_get_idt,
1687 .set_idt = svm_set_idt,
1688 .get_gdt = svm_get_gdt,
1689 .set_gdt = svm_set_gdt,
1690 .get_dr = svm_get_dr,
1691 .set_dr = svm_set_dr,
1692 .cache_regs = svm_cache_regs,
1693 .decache_regs = svm_decache_regs,
1694 .get_rflags = svm_get_rflags,
1695 .set_rflags = svm_set_rflags,
1696
1697 .tlb_flush = svm_flush_tlb,
1698
1699 .run = svm_vcpu_run,
1700 .handle_exit = handle_exit,
1701 .skip_emulated_instruction = skip_emulated_instruction,
1702 .patch_hypercall = svm_patch_hypercall,
1703 .get_irq = svm_get_irq,
1704 .set_irq = svm_set_irq,
1705 .queue_exception = svm_queue_exception,
1706 .exception_injected = svm_exception_injected,
1707 .inject_pending_irq = svm_intr_assist,
1708 .inject_pending_vectors = do_interrupt_requests,
1709
1710 .set_tss_addr = svm_set_tss_addr,
1711};
1712
1713static int __init svm_init(void)
1714{
1715 return kvm_init(&svm_x86_ops, sizeof(struct vcpu_svm),
1716 THIS_MODULE);
1717}
1718
1719static void __exit svm_exit(void)
1720{
1721 kvm_exit();
1722}
1723
1724module_init(svm_init)
1725module_exit(svm_exit)
diff --git a/drivers/kvm/svm.h b/drivers/kvm/svm.h
deleted file mode 100644
index 5fd50491b555..000000000000
--- a/drivers/kvm/svm.h
+++ /dev/null
@@ -1,325 +0,0 @@
1#ifndef __SVM_H
2#define __SVM_H
3
4enum {
5 INTERCEPT_INTR,
6 INTERCEPT_NMI,
7 INTERCEPT_SMI,
8 INTERCEPT_INIT,
9 INTERCEPT_VINTR,
10 INTERCEPT_SELECTIVE_CR0,
11 INTERCEPT_STORE_IDTR,
12 INTERCEPT_STORE_GDTR,
13 INTERCEPT_STORE_LDTR,
14 INTERCEPT_STORE_TR,
15 INTERCEPT_LOAD_IDTR,
16 INTERCEPT_LOAD_GDTR,
17 INTERCEPT_LOAD_LDTR,
18 INTERCEPT_LOAD_TR,
19 INTERCEPT_RDTSC,
20 INTERCEPT_RDPMC,
21 INTERCEPT_PUSHF,
22 INTERCEPT_POPF,
23 INTERCEPT_CPUID,
24 INTERCEPT_RSM,
25 INTERCEPT_IRET,
26 INTERCEPT_INTn,
27 INTERCEPT_INVD,
28 INTERCEPT_PAUSE,
29 INTERCEPT_HLT,
30 INTERCEPT_INVLPG,
31 INTERCEPT_INVLPGA,
32 INTERCEPT_IOIO_PROT,
33 INTERCEPT_MSR_PROT,
34 INTERCEPT_TASK_SWITCH,
35 INTERCEPT_FERR_FREEZE,
36 INTERCEPT_SHUTDOWN,
37 INTERCEPT_VMRUN,
38 INTERCEPT_VMMCALL,
39 INTERCEPT_VMLOAD,
40 INTERCEPT_VMSAVE,
41 INTERCEPT_STGI,
42 INTERCEPT_CLGI,
43 INTERCEPT_SKINIT,
44 INTERCEPT_RDTSCP,
45 INTERCEPT_ICEBP,
46 INTERCEPT_WBINVD,
47 INTERCEPT_MONITOR,
48 INTERCEPT_MWAIT,
49 INTERCEPT_MWAIT_COND,
50};
51
52
53struct __attribute__ ((__packed__)) vmcb_control_area {
54 u16 intercept_cr_read;
55 u16 intercept_cr_write;
56 u16 intercept_dr_read;
57 u16 intercept_dr_write;
58 u32 intercept_exceptions;
59 u64 intercept;
60 u8 reserved_1[44];
61 u64 iopm_base_pa;
62 u64 msrpm_base_pa;
63 u64 tsc_offset;
64 u32 asid;
65 u8 tlb_ctl;
66 u8 reserved_2[3];
67 u32 int_ctl;
68 u32 int_vector;
69 u32 int_state;
70 u8 reserved_3[4];
71 u32 exit_code;
72 u32 exit_code_hi;
73 u64 exit_info_1;
74 u64 exit_info_2;
75 u32 exit_int_info;
76 u32 exit_int_info_err;
77 u64 nested_ctl;
78 u8 reserved_4[16];
79 u32 event_inj;
80 u32 event_inj_err;
81 u64 nested_cr3;
82 u64 lbr_ctl;
83 u8 reserved_5[832];
84};
85
86
87#define TLB_CONTROL_DO_NOTHING 0
88#define TLB_CONTROL_FLUSH_ALL_ASID 1
89
90#define V_TPR_MASK 0x0f
91
92#define V_IRQ_SHIFT 8
93#define V_IRQ_MASK (1 << V_IRQ_SHIFT)
94
95#define V_INTR_PRIO_SHIFT 16
96#define V_INTR_PRIO_MASK (0x0f << V_INTR_PRIO_SHIFT)
97
98#define V_IGN_TPR_SHIFT 20
99#define V_IGN_TPR_MASK (1 << V_IGN_TPR_SHIFT)
100
101#define V_INTR_MASKING_SHIFT 24
102#define V_INTR_MASKING_MASK (1 << V_INTR_MASKING_SHIFT)
103
104#define SVM_INTERRUPT_SHADOW_MASK 1
105
106#define SVM_IOIO_STR_SHIFT 2
107#define SVM_IOIO_REP_SHIFT 3
108#define SVM_IOIO_SIZE_SHIFT 4
109#define SVM_IOIO_ASIZE_SHIFT 7
110
111#define SVM_IOIO_TYPE_MASK 1
112#define SVM_IOIO_STR_MASK (1 << SVM_IOIO_STR_SHIFT)
113#define SVM_IOIO_REP_MASK (1 << SVM_IOIO_REP_SHIFT)
114#define SVM_IOIO_SIZE_MASK (7 << SVM_IOIO_SIZE_SHIFT)
115#define SVM_IOIO_ASIZE_MASK (7 << SVM_IOIO_ASIZE_SHIFT)
116
117struct __attribute__ ((__packed__)) vmcb_seg {
118 u16 selector;
119 u16 attrib;
120 u32 limit;
121 u64 base;
122};
123
124struct __attribute__ ((__packed__)) vmcb_save_area {
125 struct vmcb_seg es;
126 struct vmcb_seg cs;
127 struct vmcb_seg ss;
128 struct vmcb_seg ds;
129 struct vmcb_seg fs;
130 struct vmcb_seg gs;
131 struct vmcb_seg gdtr;
132 struct vmcb_seg ldtr;
133 struct vmcb_seg idtr;
134 struct vmcb_seg tr;
135 u8 reserved_1[43];
136 u8 cpl;
137 u8 reserved_2[4];
138 u64 efer;
139 u8 reserved_3[112];
140 u64 cr4;
141 u64 cr3;
142 u64 cr0;
143 u64 dr7;
144 u64 dr6;
145 u64 rflags;
146 u64 rip;
147 u8 reserved_4[88];
148 u64 rsp;
149 u8 reserved_5[24];
150 u64 rax;
151 u64 star;
152 u64 lstar;
153 u64 cstar;
154 u64 sfmask;
155 u64 kernel_gs_base;
156 u64 sysenter_cs;
157 u64 sysenter_esp;
158 u64 sysenter_eip;
159 u64 cr2;
160 u8 reserved_6[32];
161 u64 g_pat;
162 u64 dbgctl;
163 u64 br_from;
164 u64 br_to;
165 u64 last_excp_from;
166 u64 last_excp_to;
167};
168
169struct __attribute__ ((__packed__)) vmcb {
170 struct vmcb_control_area control;
171 struct vmcb_save_area save;
172};
173
174#define SVM_CPUID_FEATURE_SHIFT 2
175#define SVM_CPUID_FUNC 0x8000000a
176
177#define MSR_EFER_SVME_MASK (1ULL << 12)
178#define MSR_VM_CR 0xc0010114
179#define MSR_VM_HSAVE_PA 0xc0010117ULL
180
181#define SVM_VM_CR_SVM_DISABLE 4
182
183#define SVM_SELECTOR_S_SHIFT 4
184#define SVM_SELECTOR_DPL_SHIFT 5
185#define SVM_SELECTOR_P_SHIFT 7
186#define SVM_SELECTOR_AVL_SHIFT 8
187#define SVM_SELECTOR_L_SHIFT 9
188#define SVM_SELECTOR_DB_SHIFT 10
189#define SVM_SELECTOR_G_SHIFT 11
190
191#define SVM_SELECTOR_TYPE_MASK (0xf)
192#define SVM_SELECTOR_S_MASK (1 << SVM_SELECTOR_S_SHIFT)
193#define SVM_SELECTOR_DPL_MASK (3 << SVM_SELECTOR_DPL_SHIFT)
194#define SVM_SELECTOR_P_MASK (1 << SVM_SELECTOR_P_SHIFT)
195#define SVM_SELECTOR_AVL_MASK (1 << SVM_SELECTOR_AVL_SHIFT)
196#define SVM_SELECTOR_L_MASK (1 << SVM_SELECTOR_L_SHIFT)
197#define SVM_SELECTOR_DB_MASK (1 << SVM_SELECTOR_DB_SHIFT)
198#define SVM_SELECTOR_G_MASK (1 << SVM_SELECTOR_G_SHIFT)
199
200#define SVM_SELECTOR_WRITE_MASK (1 << 1)
201#define SVM_SELECTOR_READ_MASK SVM_SELECTOR_WRITE_MASK
202#define SVM_SELECTOR_CODE_MASK (1 << 3)
203
204#define INTERCEPT_CR0_MASK 1
205#define INTERCEPT_CR3_MASK (1 << 3)
206#define INTERCEPT_CR4_MASK (1 << 4)
207#define INTERCEPT_CR8_MASK (1 << 8)
208
209#define INTERCEPT_DR0_MASK 1
210#define INTERCEPT_DR1_MASK (1 << 1)
211#define INTERCEPT_DR2_MASK (1 << 2)
212#define INTERCEPT_DR3_MASK (1 << 3)
213#define INTERCEPT_DR4_MASK (1 << 4)
214#define INTERCEPT_DR5_MASK (1 << 5)
215#define INTERCEPT_DR6_MASK (1 << 6)
216#define INTERCEPT_DR7_MASK (1 << 7)
217
218#define SVM_EVTINJ_VEC_MASK 0xff
219
220#define SVM_EVTINJ_TYPE_SHIFT 8
221#define SVM_EVTINJ_TYPE_MASK (7 << SVM_EVTINJ_TYPE_SHIFT)
222
223#define SVM_EVTINJ_TYPE_INTR (0 << SVM_EVTINJ_TYPE_SHIFT)
224#define SVM_EVTINJ_TYPE_NMI (2 << SVM_EVTINJ_TYPE_SHIFT)
225#define SVM_EVTINJ_TYPE_EXEPT (3 << SVM_EVTINJ_TYPE_SHIFT)
226#define SVM_EVTINJ_TYPE_SOFT (4 << SVM_EVTINJ_TYPE_SHIFT)
227
228#define SVM_EVTINJ_VALID (1 << 31)
229#define SVM_EVTINJ_VALID_ERR (1 << 11)
230
231#define SVM_EXITINTINFO_VEC_MASK SVM_EVTINJ_VEC_MASK
232
233#define SVM_EXITINTINFO_TYPE_INTR SVM_EVTINJ_TYPE_INTR
234#define SVM_EXITINTINFO_TYPE_NMI SVM_EVTINJ_TYPE_NMI
235#define SVM_EXITINTINFO_TYPE_EXEPT SVM_EVTINJ_TYPE_EXEPT
236#define SVM_EXITINTINFO_TYPE_SOFT SVM_EVTINJ_TYPE_SOFT
237
238#define SVM_EXITINTINFO_VALID SVM_EVTINJ_VALID
239#define SVM_EXITINTINFO_VALID_ERR SVM_EVTINJ_VALID_ERR
240
241#define SVM_EXIT_READ_CR0 0x000
242#define SVM_EXIT_READ_CR3 0x003
243#define SVM_EXIT_READ_CR4 0x004
244#define SVM_EXIT_READ_CR8 0x008
245#define SVM_EXIT_WRITE_CR0 0x010
246#define SVM_EXIT_WRITE_CR3 0x013
247#define SVM_EXIT_WRITE_CR4 0x014
248#define SVM_EXIT_WRITE_CR8 0x018
249#define SVM_EXIT_READ_DR0 0x020
250#define SVM_EXIT_READ_DR1 0x021
251#define SVM_EXIT_READ_DR2 0x022
252#define SVM_EXIT_READ_DR3 0x023
253#define SVM_EXIT_READ_DR4 0x024
254#define SVM_EXIT_READ_DR5 0x025
255#define SVM_EXIT_READ_DR6 0x026
256#define SVM_EXIT_READ_DR7 0x027
257#define SVM_EXIT_WRITE_DR0 0x030
258#define SVM_EXIT_WRITE_DR1 0x031
259#define SVM_EXIT_WRITE_DR2 0x032
260#define SVM_EXIT_WRITE_DR3 0x033
261#define SVM_EXIT_WRITE_DR4 0x034
262#define SVM_EXIT_WRITE_DR5 0x035
263#define SVM_EXIT_WRITE_DR6 0x036
264#define SVM_EXIT_WRITE_DR7 0x037
265#define SVM_EXIT_EXCP_BASE 0x040
266#define SVM_EXIT_INTR 0x060
267#define SVM_EXIT_NMI 0x061
268#define SVM_EXIT_SMI 0x062
269#define SVM_EXIT_INIT 0x063
270#define SVM_EXIT_VINTR 0x064
271#define SVM_EXIT_CR0_SEL_WRITE 0x065
272#define SVM_EXIT_IDTR_READ 0x066
273#define SVM_EXIT_GDTR_READ 0x067
274#define SVM_EXIT_LDTR_READ 0x068
275#define SVM_EXIT_TR_READ 0x069
276#define SVM_EXIT_IDTR_WRITE 0x06a
277#define SVM_EXIT_GDTR_WRITE 0x06b
278#define SVM_EXIT_LDTR_WRITE 0x06c
279#define SVM_EXIT_TR_WRITE 0x06d
280#define SVM_EXIT_RDTSC 0x06e
281#define SVM_EXIT_RDPMC 0x06f
282#define SVM_EXIT_PUSHF 0x070
283#define SVM_EXIT_POPF 0x071
284#define SVM_EXIT_CPUID 0x072
285#define SVM_EXIT_RSM 0x073
286#define SVM_EXIT_IRET 0x074
287#define SVM_EXIT_SWINT 0x075
288#define SVM_EXIT_INVD 0x076
289#define SVM_EXIT_PAUSE 0x077
290#define SVM_EXIT_HLT 0x078
291#define SVM_EXIT_INVLPG 0x079
292#define SVM_EXIT_INVLPGA 0x07a
293#define SVM_EXIT_IOIO 0x07b
294#define SVM_EXIT_MSR 0x07c
295#define SVM_EXIT_TASK_SWITCH 0x07d
296#define SVM_EXIT_FERR_FREEZE 0x07e
297#define SVM_EXIT_SHUTDOWN 0x07f
298#define SVM_EXIT_VMRUN 0x080
299#define SVM_EXIT_VMMCALL 0x081
300#define SVM_EXIT_VMLOAD 0x082
301#define SVM_EXIT_VMSAVE 0x083
302#define SVM_EXIT_STGI 0x084
303#define SVM_EXIT_CLGI 0x085
304#define SVM_EXIT_SKINIT 0x086
305#define SVM_EXIT_RDTSCP 0x087
306#define SVM_EXIT_ICEBP 0x088
307#define SVM_EXIT_WBINVD 0x089
308#define SVM_EXIT_MONITOR 0x08a
309#define SVM_EXIT_MWAIT 0x08b
310#define SVM_EXIT_MWAIT_COND 0x08c
311#define SVM_EXIT_NPF 0x400
312
313#define SVM_EXIT_ERR -1
314
315#define SVM_CR0_SELECTIVE_MASK (1 << 3 | 1) /* TS and MP */
316
317#define SVM_VMLOAD ".byte 0x0f, 0x01, 0xda"
318#define SVM_VMRUN ".byte 0x0f, 0x01, 0xd8"
319#define SVM_VMSAVE ".byte 0x0f, 0x01, 0xdb"
320#define SVM_CLGI ".byte 0x0f, 0x01, 0xdd"
321#define SVM_STGI ".byte 0x0f, 0x01, 0xdc"
322#define SVM_INVLPGA ".byte 0x0f, 0x01, 0xdf"
323
324#endif
325
diff --git a/drivers/kvm/types.h b/drivers/kvm/types.h
deleted file mode 100644
index 1c4e46decb22..000000000000
--- a/drivers/kvm/types.h
+++ /dev/null
@@ -1,54 +0,0 @@
1/*
2 * This program is free software; you can redistribute it and/or modify
3 * it under the terms of the GNU General Public License as published by
4 * the Free Software Foundation; either version 2 of the License.
5 *
6 * This program is distributed in the hope that it will be useful,
7 * but WITHOUT ANY WARRANTY; without even the implied warranty of
8 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
9 * GNU General Public License for more details.
10 *
11 * You should have received a copy of the GNU General Public License
12 * along with this program; if not, write to the Free Software
13 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
14 *
15 */
16
17#ifndef __KVM_TYPES_H__
18#define __KVM_TYPES_H__
19
20#include <asm/types.h>
21
22/*
23 * Address types:
24 *
25 * gva - guest virtual address
26 * gpa - guest physical address
27 * gfn - guest frame number
28 * hva - host virtual address
29 * hpa - host physical address
30 * hfn - host frame number
31 */
32
33typedef unsigned long gva_t;
34typedef u64 gpa_t;
35typedef unsigned long gfn_t;
36
37typedef unsigned long hva_t;
38typedef u64 hpa_t;
39typedef unsigned long hfn_t;
40
41struct kvm_pio_request {
42 unsigned long count;
43 int cur_count;
44 struct page *guest_pages[2];
45 unsigned guest_page_offset;
46 int in;
47 int port;
48 int size;
49 int string;
50 int down;
51 int rep;
52};
53
54#endif /* __KVM_TYPES_H__ */
diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c
deleted file mode 100644
index 11ca2340d38f..000000000000
--- a/drivers/kvm/vmx.c
+++ /dev/null
@@ -1,2673 +0,0 @@
1/*
2 * Kernel-based Virtual Machine driver for Linux
3 *
4 * This module enables machines with Intel VT-x extensions to run virtual
5 * machines without emulation or binary translation.
6 *
7 * Copyright (C) 2006 Qumranet, Inc.
8 *
9 * Authors:
10 * Avi Kivity <avi@qumranet.com>
11 * Yaniv Kamay <yaniv@qumranet.com>
12 *
13 * This work is licensed under the terms of the GNU GPL, version 2. See
14 * the COPYING file in the top-level directory.
15 *
16 */
17
18#include "kvm.h"
19#include "x86.h"
20#include "x86_emulate.h"
21#include "irq.h"
22#include "vmx.h"
23#include "segment_descriptor.h"
24#include "mmu.h"
25
26#include <linux/module.h>
27#include <linux/kernel.h>
28#include <linux/mm.h>
29#include <linux/highmem.h>
30#include <linux/sched.h>
31#include <linux/moduleparam.h>
32
33#include <asm/io.h>
34#include <asm/desc.h>
35
36MODULE_AUTHOR("Qumranet");
37MODULE_LICENSE("GPL");
38
39static int bypass_guest_pf = 1;
40module_param(bypass_guest_pf, bool, 0);
41
42struct vmcs {
43 u32 revision_id;
44 u32 abort;
45 char data[0];
46};
47
48struct vcpu_vmx {
49 struct kvm_vcpu vcpu;
50 int launched;
51 u8 fail;
52 u32 idt_vectoring_info;
53 struct kvm_msr_entry *guest_msrs;
54 struct kvm_msr_entry *host_msrs;
55 int nmsrs;
56 int save_nmsrs;
57 int msr_offset_efer;
58#ifdef CONFIG_X86_64
59 int msr_offset_kernel_gs_base;
60#endif
61 struct vmcs *vmcs;
62 struct {
63 int loaded;
64 u16 fs_sel, gs_sel, ldt_sel;
65 int gs_ldt_reload_needed;
66 int fs_reload_needed;
67 int guest_efer_loaded;
68 } host_state;
69 struct {
70 struct {
71 bool pending;
72 u8 vector;
73 unsigned rip;
74 } irq;
75 } rmode;
76};
77
78static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
79{
80 return container_of(vcpu, struct vcpu_vmx, vcpu);
81}
82
83static int init_rmode_tss(struct kvm *kvm);
84
85static DEFINE_PER_CPU(struct vmcs *, vmxarea);
86static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
87
88static struct page *vmx_io_bitmap_a;
89static struct page *vmx_io_bitmap_b;
90
91static struct vmcs_config {
92 int size;
93 int order;
94 u32 revision_id;
95 u32 pin_based_exec_ctrl;
96 u32 cpu_based_exec_ctrl;
97 u32 cpu_based_2nd_exec_ctrl;
98 u32 vmexit_ctrl;
99 u32 vmentry_ctrl;
100} vmcs_config;
101
102#define VMX_SEGMENT_FIELD(seg) \
103 [VCPU_SREG_##seg] = { \
104 .selector = GUEST_##seg##_SELECTOR, \
105 .base = GUEST_##seg##_BASE, \
106 .limit = GUEST_##seg##_LIMIT, \
107 .ar_bytes = GUEST_##seg##_AR_BYTES, \
108 }
109
110static struct kvm_vmx_segment_field {
111 unsigned selector;
112 unsigned base;
113 unsigned limit;
114 unsigned ar_bytes;
115} kvm_vmx_segment_fields[] = {
116 VMX_SEGMENT_FIELD(CS),
117 VMX_SEGMENT_FIELD(DS),
118 VMX_SEGMENT_FIELD(ES),
119 VMX_SEGMENT_FIELD(FS),
120 VMX_SEGMENT_FIELD(GS),
121 VMX_SEGMENT_FIELD(SS),
122 VMX_SEGMENT_FIELD(TR),
123 VMX_SEGMENT_FIELD(LDTR),
124};
125
126/*
127 * Keep MSR_K6_STAR at the end, as setup_msrs() will try to optimize it
128 * away by decrementing the array size.
129 */
130static const u32 vmx_msr_index[] = {
131#ifdef CONFIG_X86_64
132 MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, MSR_KERNEL_GS_BASE,
133#endif
134 MSR_EFER, MSR_K6_STAR,
135};
136#define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index)
137
138static void load_msrs(struct kvm_msr_entry *e, int n)
139{
140 int i;
141
142 for (i = 0; i < n; ++i)
143 wrmsrl(e[i].index, e[i].data);
144}
145
146static void save_msrs(struct kvm_msr_entry *e, int n)
147{
148 int i;
149
150 for (i = 0; i < n; ++i)
151 rdmsrl(e[i].index, e[i].data);
152}
153
154static inline int is_page_fault(u32 intr_info)
155{
156 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
157 INTR_INFO_VALID_MASK)) ==
158 (INTR_TYPE_EXCEPTION | PF_VECTOR | INTR_INFO_VALID_MASK);
159}
160
161static inline int is_no_device(u32 intr_info)
162{
163 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
164 INTR_INFO_VALID_MASK)) ==
165 (INTR_TYPE_EXCEPTION | NM_VECTOR | INTR_INFO_VALID_MASK);
166}
167
168static inline int is_invalid_opcode(u32 intr_info)
169{
170 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
171 INTR_INFO_VALID_MASK)) ==
172 (INTR_TYPE_EXCEPTION | UD_VECTOR | INTR_INFO_VALID_MASK);
173}
174
175static inline int is_external_interrupt(u32 intr_info)
176{
177 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
178 == (INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK);
179}
180
181static inline int cpu_has_vmx_tpr_shadow(void)
182{
183 return (vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW);
184}
185
186static inline int vm_need_tpr_shadow(struct kvm *kvm)
187{
188 return ((cpu_has_vmx_tpr_shadow()) && (irqchip_in_kernel(kvm)));
189}
190
191static inline int cpu_has_secondary_exec_ctrls(void)
192{
193 return (vmcs_config.cpu_based_exec_ctrl &
194 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS);
195}
196
197static inline int cpu_has_vmx_virtualize_apic_accesses(void)
198{
199 return (vmcs_config.cpu_based_2nd_exec_ctrl &
200 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
201}
202
203static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm)
204{
205 return ((cpu_has_vmx_virtualize_apic_accesses()) &&
206 (irqchip_in_kernel(kvm)));
207}
208
209static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
210{
211 int i;
212
213 for (i = 0; i < vmx->nmsrs; ++i)
214 if (vmx->guest_msrs[i].index == msr)
215 return i;
216 return -1;
217}
218
219static struct kvm_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr)
220{
221 int i;
222
223 i = __find_msr_index(vmx, msr);
224 if (i >= 0)
225 return &vmx->guest_msrs[i];
226 return NULL;
227}
228
229static void vmcs_clear(struct vmcs *vmcs)
230{
231 u64 phys_addr = __pa(vmcs);
232 u8 error;
233
234 asm volatile (ASM_VMX_VMCLEAR_RAX "; setna %0"
235 : "=g"(error) : "a"(&phys_addr), "m"(phys_addr)
236 : "cc", "memory");
237 if (error)
238 printk(KERN_ERR "kvm: vmclear fail: %p/%llx\n",
239 vmcs, phys_addr);
240}
241
242static void __vcpu_clear(void *arg)
243{
244 struct vcpu_vmx *vmx = arg;
245 int cpu = raw_smp_processor_id();
246
247 if (vmx->vcpu.cpu == cpu)
248 vmcs_clear(vmx->vmcs);
249 if (per_cpu(current_vmcs, cpu) == vmx->vmcs)
250 per_cpu(current_vmcs, cpu) = NULL;
251 rdtscll(vmx->vcpu.arch.host_tsc);
252}
253
254static void vcpu_clear(struct vcpu_vmx *vmx)
255{
256 if (vmx->vcpu.cpu == -1)
257 return;
258 smp_call_function_single(vmx->vcpu.cpu, __vcpu_clear, vmx, 0, 1);
259 vmx->launched = 0;
260}
261
262static unsigned long vmcs_readl(unsigned long field)
263{
264 unsigned long value;
265
266 asm volatile (ASM_VMX_VMREAD_RDX_RAX
267 : "=a"(value) : "d"(field) : "cc");
268 return value;
269}
270
271static u16 vmcs_read16(unsigned long field)
272{
273 return vmcs_readl(field);
274}
275
276static u32 vmcs_read32(unsigned long field)
277{
278 return vmcs_readl(field);
279}
280
281static u64 vmcs_read64(unsigned long field)
282{
283#ifdef CONFIG_X86_64
284 return vmcs_readl(field);
285#else
286 return vmcs_readl(field) | ((u64)vmcs_readl(field+1) << 32);
287#endif
288}
289
290static noinline void vmwrite_error(unsigned long field, unsigned long value)
291{
292 printk(KERN_ERR "vmwrite error: reg %lx value %lx (err %d)\n",
293 field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
294 dump_stack();
295}
296
297static void vmcs_writel(unsigned long field, unsigned long value)
298{
299 u8 error;
300
301 asm volatile (ASM_VMX_VMWRITE_RAX_RDX "; setna %0"
302 : "=q"(error) : "a"(value), "d"(field) : "cc");
303 if (unlikely(error))
304 vmwrite_error(field, value);
305}
306
307static void vmcs_write16(unsigned long field, u16 value)
308{
309 vmcs_writel(field, value);
310}
311
312static void vmcs_write32(unsigned long field, u32 value)
313{
314 vmcs_writel(field, value);
315}
316
317static void vmcs_write64(unsigned long field, u64 value)
318{
319#ifdef CONFIG_X86_64
320 vmcs_writel(field, value);
321#else
322 vmcs_writel(field, value);
323 asm volatile ("");
324 vmcs_writel(field+1, value >> 32);
325#endif
326}
327
328static void vmcs_clear_bits(unsigned long field, u32 mask)
329{
330 vmcs_writel(field, vmcs_readl(field) & ~mask);
331}
332
333static void vmcs_set_bits(unsigned long field, u32 mask)
334{
335 vmcs_writel(field, vmcs_readl(field) | mask);
336}
337
338static void update_exception_bitmap(struct kvm_vcpu *vcpu)
339{
340 u32 eb;
341
342 eb = (1u << PF_VECTOR) | (1u << UD_VECTOR);
343 if (!vcpu->fpu_active)
344 eb |= 1u << NM_VECTOR;
345 if (vcpu->guest_debug.enabled)
346 eb |= 1u << 1;
347 if (vcpu->arch.rmode.active)
348 eb = ~0;
349 vmcs_write32(EXCEPTION_BITMAP, eb);
350}
351
352static void reload_tss(void)
353{
354#ifndef CONFIG_X86_64
355
356 /*
357 * VT restores TR but not its size. Useless.
358 */
359 struct descriptor_table gdt;
360 struct segment_descriptor *descs;
361
362 get_gdt(&gdt);
363 descs = (void *)gdt.base;
364 descs[GDT_ENTRY_TSS].type = 9; /* available TSS */
365 load_TR_desc();
366#endif
367}
368
369static void load_transition_efer(struct vcpu_vmx *vmx)
370{
371 int efer_offset = vmx->msr_offset_efer;
372 u64 host_efer = vmx->host_msrs[efer_offset].data;
373 u64 guest_efer = vmx->guest_msrs[efer_offset].data;
374 u64 ignore_bits;
375
376 if (efer_offset < 0)
377 return;
378 /*
379	 * NX is emulated; LMA and LME are handled by hardware; SCE is
380	 * meaningless outside long mode.
381 */
382 ignore_bits = EFER_NX | EFER_SCE;
383#ifdef CONFIG_X86_64
384 ignore_bits |= EFER_LMA | EFER_LME;
385 /* SCE is meaningful only in long mode on Intel */
386 if (guest_efer & EFER_LMA)
387 ignore_bits &= ~(u64)EFER_SCE;
388#endif
389 if ((guest_efer & ~ignore_bits) == (host_efer & ~ignore_bits))
390 return;
391
392 vmx->host_state.guest_efer_loaded = 1;
393 guest_efer &= ~ignore_bits;
394 guest_efer |= host_efer & ignore_bits;
395 wrmsrl(MSR_EFER, guest_efer);
396 vmx->vcpu.stat.efer_reload++;
397}
398
399static void reload_host_efer(struct vcpu_vmx *vmx)
400{
401 if (vmx->host_state.guest_efer_loaded) {
402 vmx->host_state.guest_efer_loaded = 0;
403 load_msrs(vmx->host_msrs + vmx->msr_offset_efer, 1);
404 }
405}
406
407static void vmx_save_host_state(struct kvm_vcpu *vcpu)
408{
409 struct vcpu_vmx *vmx = to_vmx(vcpu);
410
411 if (vmx->host_state.loaded)
412 return;
413
414 vmx->host_state.loaded = 1;
415 /*
416 * Set host fs and gs selectors. Unfortunately, 22.2.3 does not
417 * allow segment selectors with cpl > 0 or ti == 1.
418 */
419 vmx->host_state.ldt_sel = read_ldt();
420 vmx->host_state.gs_ldt_reload_needed = vmx->host_state.ldt_sel;
421 vmx->host_state.fs_sel = read_fs();
422 if (!(vmx->host_state.fs_sel & 7)) {
423 vmcs_write16(HOST_FS_SELECTOR, vmx->host_state.fs_sel);
424 vmx->host_state.fs_reload_needed = 0;
425 } else {
426 vmcs_write16(HOST_FS_SELECTOR, 0);
427 vmx->host_state.fs_reload_needed = 1;
428 }
429 vmx->host_state.gs_sel = read_gs();
430 if (!(vmx->host_state.gs_sel & 7))
431 vmcs_write16(HOST_GS_SELECTOR, vmx->host_state.gs_sel);
432 else {
433 vmcs_write16(HOST_GS_SELECTOR, 0);
434 vmx->host_state.gs_ldt_reload_needed = 1;
435 }
436
437#ifdef CONFIG_X86_64
438 vmcs_writel(HOST_FS_BASE, read_msr(MSR_FS_BASE));
439 vmcs_writel(HOST_GS_BASE, read_msr(MSR_GS_BASE));
440#else
441 vmcs_writel(HOST_FS_BASE, segment_base(vmx->host_state.fs_sel));
442 vmcs_writel(HOST_GS_BASE, segment_base(vmx->host_state.gs_sel));
443#endif
444
445#ifdef CONFIG_X86_64
446 if (is_long_mode(&vmx->vcpu))
447 save_msrs(vmx->host_msrs +
448 vmx->msr_offset_kernel_gs_base, 1);
449
450#endif
451 load_msrs(vmx->guest_msrs, vmx->save_nmsrs);
452 load_transition_efer(vmx);
453}
454
455static void vmx_load_host_state(struct vcpu_vmx *vmx)
456{
457 unsigned long flags;
458
459 if (!vmx->host_state.loaded)
460 return;
461
462 ++vmx->vcpu.stat.host_state_reload;
463 vmx->host_state.loaded = 0;
464 if (vmx->host_state.fs_reload_needed)
465 load_fs(vmx->host_state.fs_sel);
466 if (vmx->host_state.gs_ldt_reload_needed) {
467 load_ldt(vmx->host_state.ldt_sel);
468 /*
469 * If we have to reload gs, we must take care to
470 * preserve our gs base.
471 */
472 local_irq_save(flags);
473 load_gs(vmx->host_state.gs_sel);
474#ifdef CONFIG_X86_64
475 wrmsrl(MSR_GS_BASE, vmcs_readl(HOST_GS_BASE));
476#endif
477 local_irq_restore(flags);
478 }
479 reload_tss();
480 save_msrs(vmx->guest_msrs, vmx->save_nmsrs);
481 load_msrs(vmx->host_msrs, vmx->save_nmsrs);
482 reload_host_efer(vmx);
483}
484
485/*
486 * Switches to specified vcpu, until a matching vcpu_put(), but assumes
487 * vcpu mutex is already taken.
488 */
489static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
490{
491 struct vcpu_vmx *vmx = to_vmx(vcpu);
492 u64 phys_addr = __pa(vmx->vmcs);
493 u64 tsc_this, delta;
494
495 if (vcpu->cpu != cpu) {
496 vcpu_clear(vmx);
497 kvm_migrate_apic_timer(vcpu);
498 }
499
500 if (per_cpu(current_vmcs, cpu) != vmx->vmcs) {
501 u8 error;
502
503 per_cpu(current_vmcs, cpu) = vmx->vmcs;
504 asm volatile (ASM_VMX_VMPTRLD_RAX "; setna %0"
505 : "=g"(error) : "a"(&phys_addr), "m"(phys_addr)
506 : "cc");
507 if (error)
508 printk(KERN_ERR "kvm: vmptrld %p/%llx fail\n",
509 vmx->vmcs, phys_addr);
510 }
511
512 if (vcpu->cpu != cpu) {
513 struct descriptor_table dt;
514 unsigned long sysenter_esp;
515
516 vcpu->cpu = cpu;
517 /*
518 * Linux uses per-cpu TSS and GDT, so set these when switching
519 * processors.
520 */
521 vmcs_writel(HOST_TR_BASE, read_tr_base()); /* 22.2.4 */
522 get_gdt(&dt);
523 vmcs_writel(HOST_GDTR_BASE, dt.base); /* 22.2.4 */
524
525 rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
526 vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
527
528 /*
529		 * Make sure the time stamp counter is monotonic.
530 */
531 rdtscll(tsc_this);
532 delta = vcpu->arch.host_tsc - tsc_this;
533 vmcs_write64(TSC_OFFSET, vmcs_read64(TSC_OFFSET) + delta);
534 }
535}
536
537static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
538{
539 vmx_load_host_state(to_vmx(vcpu));
540}
541
542static void vmx_fpu_activate(struct kvm_vcpu *vcpu)
543{
544 if (vcpu->fpu_active)
545 return;
546 vcpu->fpu_active = 1;
547 vmcs_clear_bits(GUEST_CR0, X86_CR0_TS);
548 if (vcpu->arch.cr0 & X86_CR0_TS)
549 vmcs_set_bits(GUEST_CR0, X86_CR0_TS);
550 update_exception_bitmap(vcpu);
551}
552
553static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu)
554{
555 if (!vcpu->fpu_active)
556 return;
557 vcpu->fpu_active = 0;
558 vmcs_set_bits(GUEST_CR0, X86_CR0_TS);
559 update_exception_bitmap(vcpu);
560}
561
562static void vmx_vcpu_decache(struct kvm_vcpu *vcpu)
563{
564 vcpu_clear(to_vmx(vcpu));
565}
566
567static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
568{
569 return vmcs_readl(GUEST_RFLAGS);
570}
571
572static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
573{
574 if (vcpu->arch.rmode.active)
575 rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
576 vmcs_writel(GUEST_RFLAGS, rflags);
577}
578
579static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
580{
581 unsigned long rip;
582 u32 interruptibility;
583
584 rip = vmcs_readl(GUEST_RIP);
585 rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
586 vmcs_writel(GUEST_RIP, rip);
587
588 /*
589 * We emulated an instruction, so temporary interrupt blocking
590 * should be removed, if set.
591 */
592 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
593 if (interruptibility & 3)
594 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
595 interruptibility & ~3);
596 vcpu->arch.interrupt_window_open = 1;
597}
598
599static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
600 bool has_error_code, u32 error_code)
601{
602 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
603 nr | INTR_TYPE_EXCEPTION
604 | (has_error_code ? INTR_INFO_DELIEVER_CODE_MASK : 0)
605 | INTR_INFO_VALID_MASK);
606 if (has_error_code)
607 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
608}
609
610static bool vmx_exception_injected(struct kvm_vcpu *vcpu)
611{
612 struct vcpu_vmx *vmx = to_vmx(vcpu);
613
614 return !(vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK);
615}
616
617/*
618 * Swap MSR entry in host/guest MSR entry array.
619 */
620#ifdef CONFIG_X86_64
621static void move_msr_up(struct vcpu_vmx *vmx, int from, int to)
622{
623 struct kvm_msr_entry tmp;
624
625 tmp = vmx->guest_msrs[to];
626 vmx->guest_msrs[to] = vmx->guest_msrs[from];
627 vmx->guest_msrs[from] = tmp;
628 tmp = vmx->host_msrs[to];
629 vmx->host_msrs[to] = vmx->host_msrs[from];
630 vmx->host_msrs[from] = tmp;
631}
632#endif
633
634/*
635 * Set up the vmcs to automatically save and restore system
636 * msrs. Don't touch the 64-bit msrs if the guest is in legacy
637 * mode, as fiddling with msrs is very expensive.
638 */
639static void setup_msrs(struct vcpu_vmx *vmx)
640{
641 int save_nmsrs;
642
643 save_nmsrs = 0;
644#ifdef CONFIG_X86_64
645 if (is_long_mode(&vmx->vcpu)) {
646 int index;
647
648 index = __find_msr_index(vmx, MSR_SYSCALL_MASK);
649 if (index >= 0)
650 move_msr_up(vmx, index, save_nmsrs++);
651 index = __find_msr_index(vmx, MSR_LSTAR);
652 if (index >= 0)
653 move_msr_up(vmx, index, save_nmsrs++);
654 index = __find_msr_index(vmx, MSR_CSTAR);
655 if (index >= 0)
656 move_msr_up(vmx, index, save_nmsrs++);
657 index = __find_msr_index(vmx, MSR_KERNEL_GS_BASE);
658 if (index >= 0)
659 move_msr_up(vmx, index, save_nmsrs++);
660 /*
661 * MSR_K6_STAR is only needed on long mode guests, and only
662 * if efer.sce is enabled.
663 */
664 index = __find_msr_index(vmx, MSR_K6_STAR);
665 if ((index >= 0) && (vmx->vcpu.arch.shadow_efer & EFER_SCE))
666 move_msr_up(vmx, index, save_nmsrs++);
667 }
668#endif
669 vmx->save_nmsrs = save_nmsrs;
670
671#ifdef CONFIG_X86_64
672 vmx->msr_offset_kernel_gs_base =
673 __find_msr_index(vmx, MSR_KERNEL_GS_BASE);
674#endif
675 vmx->msr_offset_efer = __find_msr_index(vmx, MSR_EFER);
676}
677
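setup_msrs() above keeps the MSRs that actually need saving at the front of the array, so only the first save_nmsrs entries are touched on each host/guest transition. The compaction idea, reduced to a standalone sketch (illustrative C with hypothetical types, not kernel code):

#include <stdint.h>

struct msr_slot {
	uint32_t index;
	uint64_t data;
};

/* Swap an entry toward the front of the array; once all active entries
 * have been moved up, only slots [0, nactive) need to be saved and
 * restored on every switch, and the rest can be ignored. */
static void move_up(struct msr_slot *slots, int from, int to)
{
	struct msr_slot tmp = slots[to];

	slots[to] = slots[from];
	slots[from] = tmp;
}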
678/*
679 * reads and returns guest's timestamp counter "register"
680 * guest_tsc = host_tsc + tsc_offset -- 21.3
681 */
682static u64 guest_read_tsc(void)
683{
684 u64 host_tsc, tsc_offset;
685
686 rdtscll(host_tsc);
687 tsc_offset = vmcs_read64(TSC_OFFSET);
688 return host_tsc + tsc_offset;
689}
690
691/*
692 * writes 'guest_tsc' into guest's timestamp counter "register"
693 * guest_tsc = host_tsc + tsc_offset ==> tsc_offset = guest_tsc - host_tsc
694 */
695static void guest_write_tsc(u64 guest_tsc)
696{
697 u64 host_tsc;
698
699 rdtscll(host_tsc);
700 vmcs_write64(TSC_OFFSET, guest_tsc - host_tsc);
701}
702
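The pair of helpers above implements the relation stated in their comments, guest_tsc = host_tsc + tsc_offset. The arithmetic as a standalone sketch (illustrative C, not kernel code):

#include <stdint.h>

/* Reading: the guest sees the host TSC shifted by the stored offset. */
static uint64_t guest_tsc(uint64_t host_tsc, uint64_t tsc_offset)
{
	return host_tsc + tsc_offset;
}

/* Writing: to make the guest see 'wanted', store the difference. */
static uint64_t tsc_offset_for(uint64_t wanted, uint64_t host_tsc)
{
	return wanted - host_tsc;
}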
703/*
704 * Reads an msr value (of 'msr_index') into 'pdata'.
705 * Returns 0 on success, non-0 otherwise.
706 * Assumes vcpu_load() was already called.
707 */
708static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
709{
710 u64 data;
711 struct kvm_msr_entry *msr;
712
713 if (!pdata) {
714 printk(KERN_ERR "BUG: get_msr called with NULL pdata\n");
715 return -EINVAL;
716 }
717
718 switch (msr_index) {
719#ifdef CONFIG_X86_64
720 case MSR_FS_BASE:
721 data = vmcs_readl(GUEST_FS_BASE);
722 break;
723 case MSR_GS_BASE:
724 data = vmcs_readl(GUEST_GS_BASE);
725 break;
726 case MSR_EFER:
727 return kvm_get_msr_common(vcpu, msr_index, pdata);
728#endif
729 case MSR_IA32_TIME_STAMP_COUNTER:
730 data = guest_read_tsc();
731 break;
732 case MSR_IA32_SYSENTER_CS:
733 data = vmcs_read32(GUEST_SYSENTER_CS);
734 break;
735 case MSR_IA32_SYSENTER_EIP:
736 data = vmcs_readl(GUEST_SYSENTER_EIP);
737 break;
738 case MSR_IA32_SYSENTER_ESP:
739 data = vmcs_readl(GUEST_SYSENTER_ESP);
740 break;
741 default:
742 msr = find_msr_entry(to_vmx(vcpu), msr_index);
743 if (msr) {
744 data = msr->data;
745 break;
746 }
747 return kvm_get_msr_common(vcpu, msr_index, pdata);
748 }
749
750 *pdata = data;
751 return 0;
752}
753
754/*
755	 * Writes msr value into the appropriate "register".
756 * Returns 0 on success, non-0 otherwise.
757 * Assumes vcpu_load() was already called.
758 */
759static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
760{
761 struct vcpu_vmx *vmx = to_vmx(vcpu);
762 struct kvm_msr_entry *msr;
763 int ret = 0;
764
765 switch (msr_index) {
766#ifdef CONFIG_X86_64
767 case MSR_EFER:
768 ret = kvm_set_msr_common(vcpu, msr_index, data);
769 if (vmx->host_state.loaded) {
770 reload_host_efer(vmx);
771 load_transition_efer(vmx);
772 }
773 break;
774 case MSR_FS_BASE:
775 vmcs_writel(GUEST_FS_BASE, data);
776 break;
777 case MSR_GS_BASE:
778 vmcs_writel(GUEST_GS_BASE, data);
779 break;
780#endif
781 case MSR_IA32_SYSENTER_CS:
782 vmcs_write32(GUEST_SYSENTER_CS, data);
783 break;
784 case MSR_IA32_SYSENTER_EIP:
785 vmcs_writel(GUEST_SYSENTER_EIP, data);
786 break;
787 case MSR_IA32_SYSENTER_ESP:
788 vmcs_writel(GUEST_SYSENTER_ESP, data);
789 break;
790 case MSR_IA32_TIME_STAMP_COUNTER:
791 guest_write_tsc(data);
792 break;
793 default:
794 msr = find_msr_entry(vmx, msr_index);
795 if (msr) {
796 msr->data = data;
797 if (vmx->host_state.loaded)
798 load_msrs(vmx->guest_msrs, vmx->save_nmsrs);
799 break;
800 }
801 ret = kvm_set_msr_common(vcpu, msr_index, data);
802 }
803
804 return ret;
805}
806
807/*
808 * Sync the rsp and rip registers into the vcpu structure. This allows
809 * registers to be accessed by indexing vcpu->arch.regs.
810 */
811static void vcpu_load_rsp_rip(struct kvm_vcpu *vcpu)
812{
813 vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP);
814 vcpu->arch.rip = vmcs_readl(GUEST_RIP);
815}
816
817/*
818 * Syncs rsp and rip back into the vmcs. Should be called after possible
819 * modification.
820 */
821static void vcpu_put_rsp_rip(struct kvm_vcpu *vcpu)
822{
823 vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
824 vmcs_writel(GUEST_RIP, vcpu->arch.rip);
825}
826
827static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg)
828{
829 unsigned long dr7 = 0x400;
830 int old_singlestep;
831
832 old_singlestep = vcpu->guest_debug.singlestep;
833
834 vcpu->guest_debug.enabled = dbg->enabled;
835 if (vcpu->guest_debug.enabled) {
836 int i;
837
838 dr7 |= 0x200; /* exact */
839 for (i = 0; i < 4; ++i) {
840 if (!dbg->breakpoints[i].enabled)
841 continue;
842 vcpu->guest_debug.bp[i] = dbg->breakpoints[i].address;
843 dr7 |= 2 << (i*2); /* global enable */
844 dr7 |= 0 << (i*4+16); /* execution breakpoint */
845 }
846
847 vcpu->guest_debug.singlestep = dbg->singlestep;
848 } else
849 vcpu->guest_debug.singlestep = 0;
850
851 if (old_singlestep && !vcpu->guest_debug.singlestep) {
852 unsigned long flags;
853
854 flags = vmcs_readl(GUEST_RFLAGS);
855 flags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
856 vmcs_writel(GUEST_RFLAGS, flags);
857 }
858
859 update_exception_bitmap(vcpu);
860 vmcs_writel(GUEST_DR7, dr7);
861
862 return 0;
863}
864
865static int vmx_get_irq(struct kvm_vcpu *vcpu)
866{
867 struct vcpu_vmx *vmx = to_vmx(vcpu);
868 u32 idtv_info_field;
869
870 idtv_info_field = vmx->idt_vectoring_info;
871 if (idtv_info_field & INTR_INFO_VALID_MASK) {
872 if (is_external_interrupt(idtv_info_field))
873 return idtv_info_field & VECTORING_INFO_VECTOR_MASK;
874 else
875 printk(KERN_DEBUG "pending exception: not handled yet\n");
876 }
877 return -1;
878}
879
880static __init int cpu_has_kvm_support(void)
881{
882 unsigned long ecx = cpuid_ecx(1);
883 return test_bit(5, &ecx); /* CPUID.1:ECX.VMX[bit 5] -> VT */
884}
885
886static __init int vmx_disabled_by_bios(void)
887{
888 u64 msr;
889
890 rdmsrl(MSR_IA32_FEATURE_CONTROL, msr);
891 return (msr & (MSR_IA32_FEATURE_CONTROL_LOCKED |
892 MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED))
893 == MSR_IA32_FEATURE_CONTROL_LOCKED;
894 /* locked but not enabled */
895}
896
897static void hardware_enable(void *garbage)
898{
899 int cpu = raw_smp_processor_id();
900 u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
901 u64 old;
902
903 rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
904 if ((old & (MSR_IA32_FEATURE_CONTROL_LOCKED |
905 MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED))
906 != (MSR_IA32_FEATURE_CONTROL_LOCKED |
907 MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED))
908 /* enable and lock */
909 wrmsrl(MSR_IA32_FEATURE_CONTROL, old |
910 MSR_IA32_FEATURE_CONTROL_LOCKED |
911 MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED);
912 write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */
913 asm volatile (ASM_VMX_VMXON_RAX : : "a"(&phys_addr), "m"(phys_addr)
914 : "memory", "cc");
915}
916
917static void hardware_disable(void *garbage)
918{
919 asm volatile (ASM_VMX_VMXOFF : : : "cc");
920}
921
922static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
923 u32 msr, u32 *result)
924{
925 u32 vmx_msr_low, vmx_msr_high;
926 u32 ctl = ctl_min | ctl_opt;
927
928 rdmsr(msr, vmx_msr_low, vmx_msr_high);
929
930 ctl &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */
931 ctl |= vmx_msr_low; /* bit == 1 in low word ==> must be one */
932
933 /* Ensure minimum (required) set of control bits are supported. */
934 if (ctl_min & ~ctl)
935 return -EIO;
936
937 *result = ctl;
938 return 0;
939}
940
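adjust_vmx_controls() above applies the allowed-0/allowed-1 encoding of the VMX capability MSRs: a bit cleared in the high word must end up 0, a bit set in the low word must end up 1, and the request fails if a required bit cannot be set. The masking as a standalone sketch (illustrative C, not kernel code):

#include <stdint.h>

static int adjust_controls(uint32_t min, uint32_t opt,
			   uint32_t msr_low, uint32_t msr_high,
			   uint32_t *result)
{
	uint32_t ctl = min | opt;

	ctl &= msr_high;	/* bit clear in high word => must be zero */
	ctl |= msr_low;		/* bit set in low word    => must be one  */

	if (min & ~ctl)		/* a required bit could not be set */
		return -1;

	*result = ctl;
	return 0;
}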
941static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
942{
943 u32 vmx_msr_low, vmx_msr_high;
944 u32 min, opt;
945 u32 _pin_based_exec_control = 0;
946 u32 _cpu_based_exec_control = 0;
947 u32 _cpu_based_2nd_exec_control = 0;
948 u32 _vmexit_control = 0;
949 u32 _vmentry_control = 0;
950
951 min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING;
952 opt = 0;
953 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS,
954 &_pin_based_exec_control) < 0)
955 return -EIO;
956
957 min = CPU_BASED_HLT_EXITING |
958#ifdef CONFIG_X86_64
959 CPU_BASED_CR8_LOAD_EXITING |
960 CPU_BASED_CR8_STORE_EXITING |
961#endif
962 CPU_BASED_USE_IO_BITMAPS |
963 CPU_BASED_MOV_DR_EXITING |
964 CPU_BASED_USE_TSC_OFFSETING;
965 opt = CPU_BASED_TPR_SHADOW |
966 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
967 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS,
968 &_cpu_based_exec_control) < 0)
969 return -EIO;
970#ifdef CONFIG_X86_64
971 if ((_cpu_based_exec_control & CPU_BASED_TPR_SHADOW))
972 _cpu_based_exec_control &= ~CPU_BASED_CR8_LOAD_EXITING &
973 ~CPU_BASED_CR8_STORE_EXITING;
974#endif
975 if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) {
976 min = 0;
977 opt = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
978 SECONDARY_EXEC_WBINVD_EXITING;
979 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS2,
980 &_cpu_based_2nd_exec_control) < 0)
981 return -EIO;
982 }
983#ifndef CONFIG_X86_64
984 if (!(_cpu_based_2nd_exec_control &
985 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
986 _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW;
987#endif
988
989 min = 0;
990#ifdef CONFIG_X86_64
991 min |= VM_EXIT_HOST_ADDR_SPACE_SIZE;
992#endif
993 opt = 0;
994 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS,
995 &_vmexit_control) < 0)
996 return -EIO;
997
998 min = opt = 0;
999 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS,
1000 &_vmentry_control) < 0)
1001 return -EIO;
1002
1003 rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high);
1004
1005 /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */
1006 if ((vmx_msr_high & 0x1fff) > PAGE_SIZE)
1007 return -EIO;
1008
1009#ifdef CONFIG_X86_64
1010 /* IA-32 SDM Vol 3B: 64-bit CPUs always have VMX_BASIC_MSR[48]==0. */
1011 if (vmx_msr_high & (1u<<16))
1012 return -EIO;
1013#endif
1014
1015 /* Require Write-Back (WB) memory type for VMCS accesses. */
1016 if (((vmx_msr_high >> 18) & 15) != 6)
1017 return -EIO;
1018
1019 vmcs_conf->size = vmx_msr_high & 0x1fff;
1020 vmcs_conf->order = get_order(vmcs_config.size);
1021 vmcs_conf->revision_id = vmx_msr_low;
1022
1023 vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control;
1024 vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control;
1025 vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control;
1026 vmcs_conf->vmexit_ctrl = _vmexit_control;
1027 vmcs_conf->vmentry_ctrl = _vmentry_control;
1028
1029 return 0;
1030}
1031
1032static struct vmcs *alloc_vmcs_cpu(int cpu)
1033{
1034 int node = cpu_to_node(cpu);
1035 struct page *pages;
1036 struct vmcs *vmcs;
1037
1038 pages = alloc_pages_node(node, GFP_KERNEL, vmcs_config.order);
1039 if (!pages)
1040 return NULL;
1041 vmcs = page_address(pages);
1042 memset(vmcs, 0, vmcs_config.size);
1043 vmcs->revision_id = vmcs_config.revision_id; /* vmcs revision id */
1044 return vmcs;
1045}
1046
1047static struct vmcs *alloc_vmcs(void)
1048{
1049 return alloc_vmcs_cpu(raw_smp_processor_id());
1050}
1051
1052static void free_vmcs(struct vmcs *vmcs)
1053{
1054 free_pages((unsigned long)vmcs, vmcs_config.order);
1055}
1056
1057static void free_kvm_area(void)
1058{
1059 int cpu;
1060
1061 for_each_online_cpu(cpu)
1062 free_vmcs(per_cpu(vmxarea, cpu));
1063}
1064
1065static __init int alloc_kvm_area(void)
1066{
1067 int cpu;
1068
1069 for_each_online_cpu(cpu) {
1070 struct vmcs *vmcs;
1071
1072 vmcs = alloc_vmcs_cpu(cpu);
1073 if (!vmcs) {
1074 free_kvm_area();
1075 return -ENOMEM;
1076 }
1077
1078 per_cpu(vmxarea, cpu) = vmcs;
1079 }
1080 return 0;
1081}
1082
1083static __init int hardware_setup(void)
1084{
1085 if (setup_vmcs_config(&vmcs_config) < 0)
1086 return -EIO;
1087 return alloc_kvm_area();
1088}
1089
1090static __exit void hardware_unsetup(void)
1091{
1092 free_kvm_area();
1093}
1094
1095static void fix_pmode_dataseg(int seg, struct kvm_save_segment *save)
1096{
1097 struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
1098
1099 if (vmcs_readl(sf->base) == save->base && (save->base & AR_S_MASK)) {
1100 vmcs_write16(sf->selector, save->selector);
1101 vmcs_writel(sf->base, save->base);
1102 vmcs_write32(sf->limit, save->limit);
1103 vmcs_write32(sf->ar_bytes, save->ar);
1104 } else {
1105 u32 dpl = (vmcs_read16(sf->selector) & SELECTOR_RPL_MASK)
1106 << AR_DPL_SHIFT;
1107 vmcs_write32(sf->ar_bytes, 0x93 | dpl);
1108 }
1109}
1110
1111static void enter_pmode(struct kvm_vcpu *vcpu)
1112{
1113 unsigned long flags;
1114
1115 vcpu->arch.rmode.active = 0;
1116
1117 vmcs_writel(GUEST_TR_BASE, vcpu->arch.rmode.tr.base);
1118 vmcs_write32(GUEST_TR_LIMIT, vcpu->arch.rmode.tr.limit);
1119 vmcs_write32(GUEST_TR_AR_BYTES, vcpu->arch.rmode.tr.ar);
1120
1121 flags = vmcs_readl(GUEST_RFLAGS);
1122 flags &= ~(X86_EFLAGS_IOPL | X86_EFLAGS_VM);
1123 flags |= (vcpu->arch.rmode.save_iopl << IOPL_SHIFT);
1124 vmcs_writel(GUEST_RFLAGS, flags);
1125
1126 vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) |
1127 (vmcs_readl(CR4_READ_SHADOW) & X86_CR4_VME));
1128
1129 update_exception_bitmap(vcpu);
1130
1131 fix_pmode_dataseg(VCPU_SREG_ES, &vcpu->arch.rmode.es);
1132 fix_pmode_dataseg(VCPU_SREG_DS, &vcpu->arch.rmode.ds);
1133 fix_pmode_dataseg(VCPU_SREG_GS, &vcpu->arch.rmode.gs);
1134 fix_pmode_dataseg(VCPU_SREG_FS, &vcpu->arch.rmode.fs);
1135
1136 vmcs_write16(GUEST_SS_SELECTOR, 0);
1137 vmcs_write32(GUEST_SS_AR_BYTES, 0x93);
1138
1139 vmcs_write16(GUEST_CS_SELECTOR,
1140 vmcs_read16(GUEST_CS_SELECTOR) & ~SELECTOR_RPL_MASK);
1141 vmcs_write32(GUEST_CS_AR_BYTES, 0x9b);
1142}
1143
1144static gva_t rmode_tss_base(struct kvm *kvm)
1145{
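	/*
	 * If userspace has not set a TSS address, default to the last three
	 * pages of the first memory slot.
	 */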
1146 if (!kvm->arch.tss_addr) {
1147 gfn_t base_gfn = kvm->memslots[0].base_gfn +
1148 kvm->memslots[0].npages - 3;
1149 return base_gfn << PAGE_SHIFT;
1150 }
1151 return kvm->arch.tss_addr;
1152}
1153
1154static void fix_rmode_seg(int seg, struct kvm_save_segment *save)
1155{
1156 struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
1157
1158 save->selector = vmcs_read16(sf->selector);
1159 save->base = vmcs_readl(sf->base);
1160 save->limit = vmcs_read32(sf->limit);
1161 save->ar = vmcs_read32(sf->ar_bytes);
1162 vmcs_write16(sf->selector, save->base >> 4);
1163 vmcs_write32(sf->base, save->base & 0xfffff);
1164 vmcs_write32(sf->limit, 0xffff);
1165 vmcs_write32(sf->ar_bytes, 0xf3);
1166}
1167
1168static void enter_rmode(struct kvm_vcpu *vcpu)
1169{
1170 unsigned long flags;
1171
1172 vcpu->arch.rmode.active = 1;
1173
1174 vcpu->arch.rmode.tr.base = vmcs_readl(GUEST_TR_BASE);
1175 vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm));
1176
1177 vcpu->arch.rmode.tr.limit = vmcs_read32(GUEST_TR_LIMIT);
1178 vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1);
1179
1180 vcpu->arch.rmode.tr.ar = vmcs_read32(GUEST_TR_AR_BYTES);
1181 vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
1182
1183 flags = vmcs_readl(GUEST_RFLAGS);
1184 vcpu->arch.rmode.save_iopl
1185 = (flags & X86_EFLAGS_IOPL) >> IOPL_SHIFT;
1186
1187 flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
1188
1189 vmcs_writel(GUEST_RFLAGS, flags);
1190 vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME);
1191 update_exception_bitmap(vcpu);
1192
1193 vmcs_write16(GUEST_SS_SELECTOR, vmcs_readl(GUEST_SS_BASE) >> 4);
1194 vmcs_write32(GUEST_SS_LIMIT, 0xffff);
1195 vmcs_write32(GUEST_SS_AR_BYTES, 0xf3);
1196
1197 vmcs_write32(GUEST_CS_AR_BYTES, 0xf3);
1198 vmcs_write32(GUEST_CS_LIMIT, 0xffff);
1199 if (vmcs_readl(GUEST_CS_BASE) == 0xffff0000)
1200 vmcs_writel(GUEST_CS_BASE, 0xf0000);
1201 vmcs_write16(GUEST_CS_SELECTOR, vmcs_readl(GUEST_CS_BASE) >> 4);
1202
1203 fix_rmode_seg(VCPU_SREG_ES, &vcpu->arch.rmode.es);
1204 fix_rmode_seg(VCPU_SREG_DS, &vcpu->arch.rmode.ds);
1205 fix_rmode_seg(VCPU_SREG_GS, &vcpu->arch.rmode.gs);
1206 fix_rmode_seg(VCPU_SREG_FS, &vcpu->arch.rmode.fs);
1207
1208 kvm_mmu_reset_context(vcpu);
1209 init_rmode_tss(vcpu->kvm);
1210}
1211
1212#ifdef CONFIG_X86_64
1213
1214static void enter_lmode(struct kvm_vcpu *vcpu)
1215{
1216 u32 guest_tr_ar;
1217
1218 guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES);
1219 if ((guest_tr_ar & AR_TYPE_MASK) != AR_TYPE_BUSY_64_TSS) {
1220 printk(KERN_DEBUG "%s: tss fixup for long mode. \n",
1221 __FUNCTION__);
1222 vmcs_write32(GUEST_TR_AR_BYTES,
1223 (guest_tr_ar & ~AR_TYPE_MASK)
1224 | AR_TYPE_BUSY_64_TSS);
1225 }
1226
1227 vcpu->arch.shadow_efer |= EFER_LMA;
1228
1229 find_msr_entry(to_vmx(vcpu), MSR_EFER)->data |= EFER_LMA | EFER_LME;
1230 vmcs_write32(VM_ENTRY_CONTROLS,
1231 vmcs_read32(VM_ENTRY_CONTROLS)
1232 | VM_ENTRY_IA32E_MODE);
1233}
1234
1235static void exit_lmode(struct kvm_vcpu *vcpu)
1236{
1237 vcpu->arch.shadow_efer &= ~EFER_LMA;
1238
1239 vmcs_write32(VM_ENTRY_CONTROLS,
1240 vmcs_read32(VM_ENTRY_CONTROLS)
1241 & ~VM_ENTRY_IA32E_MODE);
1242}
1243
1244#endif
1245
1246static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
1247{
1248 vcpu->arch.cr4 &= KVM_GUEST_CR4_MASK;
1249 vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & ~KVM_GUEST_CR4_MASK;
1250}
1251
1252static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
1253{
1254 vmx_fpu_deactivate(vcpu);
1255
1256 if (vcpu->arch.rmode.active && (cr0 & X86_CR0_PE))
1257 enter_pmode(vcpu);
1258
1259 if (!vcpu->arch.rmode.active && !(cr0 & X86_CR0_PE))
1260 enter_rmode(vcpu);
1261
1262#ifdef CONFIG_X86_64
1263 if (vcpu->arch.shadow_efer & EFER_LME) {
1264 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG))
1265 enter_lmode(vcpu);
1266 if (is_paging(vcpu) && !(cr0 & X86_CR0_PG))
1267 exit_lmode(vcpu);
1268 }
1269#endif
1270
1271 vmcs_writel(CR0_READ_SHADOW, cr0);
1272 vmcs_writel(GUEST_CR0,
1273 (cr0 & ~KVM_GUEST_CR0_MASK) | KVM_VM_CR0_ALWAYS_ON);
1274 vcpu->arch.cr0 = cr0;
1275
1276 if (!(cr0 & X86_CR0_TS) || !(cr0 & X86_CR0_PE))
1277 vmx_fpu_activate(vcpu);
1278}
1279
1280static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
1281{
1282 vmcs_writel(GUEST_CR3, cr3);
1283 if (vcpu->arch.cr0 & X86_CR0_PE)
1284 vmx_fpu_deactivate(vcpu);
1285}
1286
1287static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
1288{
1289 vmcs_writel(CR4_READ_SHADOW, cr4);
1290 vmcs_writel(GUEST_CR4, cr4 | (vcpu->arch.rmode.active ?
1291 KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON));
1292 vcpu->arch.cr4 = cr4;
1293}
1294
1295#ifdef CONFIG_X86_64
1296
1297static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
1298{
1299 struct vcpu_vmx *vmx = to_vmx(vcpu);
1300 struct kvm_msr_entry *msr = find_msr_entry(vmx, MSR_EFER);
1301
1302 vcpu->arch.shadow_efer = efer;
1303 if (efer & EFER_LMA) {
1304 vmcs_write32(VM_ENTRY_CONTROLS,
1305 vmcs_read32(VM_ENTRY_CONTROLS) |
1306 VM_ENTRY_IA32E_MODE);
1307 msr->data = efer;
1308
1309 } else {
1310 vmcs_write32(VM_ENTRY_CONTROLS,
1311 vmcs_read32(VM_ENTRY_CONTROLS) &
1312 ~VM_ENTRY_IA32E_MODE);
1313
1314 msr->data = efer & ~EFER_LME;
1315 }
1316 setup_msrs(vmx);
1317}
1318
1319#endif
1320
1321static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg)
1322{
1323 struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
1324
1325 return vmcs_readl(sf->base);
1326}
1327
1328static void vmx_get_segment(struct kvm_vcpu *vcpu,
1329 struct kvm_segment *var, int seg)
1330{
1331 struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
1332 u32 ar;
1333
1334 var->base = vmcs_readl(sf->base);
1335 var->limit = vmcs_read32(sf->limit);
1336 var->selector = vmcs_read16(sf->selector);
1337 ar = vmcs_read32(sf->ar_bytes);
1338 if (ar & AR_UNUSABLE_MASK)
1339 ar = 0;
1340 var->type = ar & 15;
1341 var->s = (ar >> 4) & 1;
1342 var->dpl = (ar >> 5) & 3;
1343 var->present = (ar >> 7) & 1;
1344 var->avl = (ar >> 12) & 1;
1345 var->l = (ar >> 13) & 1;
1346 var->db = (ar >> 14) & 1;
1347 var->g = (ar >> 15) & 1;
1348 var->unusable = (ar >> 16) & 1;
1349}
1350
1351static u32 vmx_segment_access_rights(struct kvm_segment *var)
1352{
1353 u32 ar;
1354
1355 if (var->unusable)
1356 ar = 1 << 16;
1357 else {
1358 ar = var->type & 15;
1359 ar |= (var->s & 1) << 4;
1360 ar |= (var->dpl & 3) << 5;
1361 ar |= (var->present & 1) << 7;
1362 ar |= (var->avl & 1) << 12;
1363 ar |= (var->l & 1) << 13;
1364 ar |= (var->db & 1) << 14;
1365 ar |= (var->g & 1) << 15;
1366 }
1367 if (ar == 0) /* a 0 value means unusable */
1368 ar = AR_UNUSABLE_MASK;
1369
1370 return ar;
1371}
1372
1373static void vmx_set_segment(struct kvm_vcpu *vcpu,
1374 struct kvm_segment *var, int seg)
1375{
1376 struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
1377 u32 ar;
1378
1379 if (vcpu->arch.rmode.active && seg == VCPU_SREG_TR) {
1380 vcpu->arch.rmode.tr.selector = var->selector;
1381 vcpu->arch.rmode.tr.base = var->base;
1382 vcpu->arch.rmode.tr.limit = var->limit;
1383 vcpu->arch.rmode.tr.ar = vmx_segment_access_rights(var);
1384 return;
1385 }
1386 vmcs_writel(sf->base, var->base);
1387 vmcs_write32(sf->limit, var->limit);
1388 vmcs_write16(sf->selector, var->selector);
1389 if (vcpu->arch.rmode.active && var->s) {
1390 /*
1391 * Hack real-mode segments into vm86 compatibility.
1392 */
1393 if (var->base == 0xffff0000 && var->selector == 0xf000)
1394 vmcs_writel(sf->base, 0xf0000);
1395 ar = 0xf3;
1396 } else
1397 ar = vmx_segment_access_rights(var);
1398 vmcs_write32(sf->ar_bytes, ar);
1399}
1400
1401static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
1402{
1403 u32 ar = vmcs_read32(GUEST_CS_AR_BYTES);
1404
1405 *db = (ar >> 14) & 1;
1406 *l = (ar >> 13) & 1;
1407}
1408
1409static void vmx_get_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
1410{
1411 dt->limit = vmcs_read32(GUEST_IDTR_LIMIT);
1412 dt->base = vmcs_readl(GUEST_IDTR_BASE);
1413}
1414
1415static void vmx_set_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
1416{
1417 vmcs_write32(GUEST_IDTR_LIMIT, dt->limit);
1418 vmcs_writel(GUEST_IDTR_BASE, dt->base);
1419}
1420
1421static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
1422{
1423 dt->limit = vmcs_read32(GUEST_GDTR_LIMIT);
1424 dt->base = vmcs_readl(GUEST_GDTR_BASE);
1425}
1426
1427static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
1428{
1429 vmcs_write32(GUEST_GDTR_LIMIT, dt->limit);
1430 vmcs_writel(GUEST_GDTR_BASE, dt->base);
1431}
1432
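/*
 * Lay out a minimal three-page real-mode TSS: clear the pages, point the
 * I/O map base (TSS offset 0x66) at the I/O bitmap that follows the
 * interrupt redirection bitmap, and write the trailing 0xff byte that
 * terminates the I/O bitmap.
 */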
1433static int init_rmode_tss(struct kvm *kvm)
1434{
1435 gfn_t fn = rmode_tss_base(kvm) >> PAGE_SHIFT;
1436 u16 data = 0;
1437 int r;
1438
1439 r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
1440 if (r < 0)
1441 return 0;
1442 data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE;
1443 r = kvm_write_guest_page(kvm, fn++, &data, 0x66, sizeof(u16));
1444 if (r < 0)
1445 return 0;
1446 r = kvm_clear_guest_page(kvm, fn++, 0, PAGE_SIZE);
1447 if (r < 0)
1448 return 0;
1449 r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
1450 if (r < 0)
1451 return 0;
1452 data = ~0;
1453 r = kvm_write_guest_page(kvm, fn, &data, RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1,
1454 sizeof(u8));
1455 if (r < 0)
1456 return 0;
1457 return 1;
1458}
1459
1460static void seg_setup(int seg)
1461{
1462 struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
1463
1464 vmcs_write16(sf->selector, 0);
1465 vmcs_writel(sf->base, 0);
1466 vmcs_write32(sf->limit, 0xffff);
1467 vmcs_write32(sf->ar_bytes, 0x93);
1468}
1469
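/*
 * Back the default local APIC address (0xfee00000) with a page in a
 * private memslot so guest APIC accesses can be trapped through the
 * "virtualize APIC accesses" secondary execution control.
 */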
1470static int alloc_apic_access_page(struct kvm *kvm)
1471{
1472 struct kvm_userspace_memory_region kvm_userspace_mem;
1473 int r = 0;
1474
1475 mutex_lock(&kvm->lock);
1476 if (kvm->arch.apic_access_page)
1477 goto out;
1478 kvm_userspace_mem.slot = APIC_ACCESS_PAGE_PRIVATE_MEMSLOT;
1479 kvm_userspace_mem.flags = 0;
1480 kvm_userspace_mem.guest_phys_addr = 0xfee00000ULL;
1481 kvm_userspace_mem.memory_size = PAGE_SIZE;
1482 r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, 0);
1483 if (r)
1484 goto out;
1485 kvm->arch.apic_access_page = gfn_to_page(kvm, 0xfee00);
1486out:
1487 mutex_unlock(&kvm->lock);
1488 return r;
1489}
1490
1491/*
1492 * Sets up the vmcs for emulated real mode.
1493 */
1494static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
1495{
1496 u32 host_sysenter_cs;
1497 u32 junk;
1498 unsigned long a;
1499 struct descriptor_table dt;
1500 int i;
1501 unsigned long kvm_vmx_return;
1502 u32 exec_control;
1503
1504 /* I/O */
1505 vmcs_write64(IO_BITMAP_A, page_to_phys(vmx_io_bitmap_a));
1506 vmcs_write64(IO_BITMAP_B, page_to_phys(vmx_io_bitmap_b));
1507
1508 vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */
1509
1510 /* Control */
1511 vmcs_write32(PIN_BASED_VM_EXEC_CONTROL,
1512 vmcs_config.pin_based_exec_ctrl);
1513
1514 exec_control = vmcs_config.cpu_based_exec_ctrl;
1515 if (!vm_need_tpr_shadow(vmx->vcpu.kvm)) {
1516 exec_control &= ~CPU_BASED_TPR_SHADOW;
1517#ifdef CONFIG_X86_64
1518 exec_control |= CPU_BASED_CR8_STORE_EXITING |
1519 CPU_BASED_CR8_LOAD_EXITING;
1520#endif
1521 }
1522 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control);
1523
1524 if (cpu_has_secondary_exec_ctrls()) {
1525 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl;
1526 if (!vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))
1527 exec_control &=
1528 ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
1529 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
1530 }
1531
1532 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, !!bypass_guest_pf);
1533 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, !!bypass_guest_pf);
1534 vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */
1535
1536 vmcs_writel(HOST_CR0, read_cr0()); /* 22.2.3 */
1537 vmcs_writel(HOST_CR4, read_cr4()); /* 22.2.3, 22.2.5 */
1538 vmcs_writel(HOST_CR3, read_cr3()); /* 22.2.3 FIXME: shadow tables */
1539
1540 vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */
1541 vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */
1542 vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */
1543 vmcs_write16(HOST_FS_SELECTOR, read_fs()); /* 22.2.4 */
1544 vmcs_write16(HOST_GS_SELECTOR, read_gs()); /* 22.2.4 */
1545 vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */
1546#ifdef CONFIG_X86_64
1547 rdmsrl(MSR_FS_BASE, a);
1548 vmcs_writel(HOST_FS_BASE, a); /* 22.2.4 */
1549 rdmsrl(MSR_GS_BASE, a);
1550 vmcs_writel(HOST_GS_BASE, a); /* 22.2.4 */
1551#else
1552 vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */
1553 vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */
1554#endif
1555
1556 vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */
1557
1558 get_idt(&dt);
1559 vmcs_writel(HOST_IDTR_BASE, dt.base); /* 22.2.4 */
1560
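	/*
	 * HOST_RIP points at the .Lkvm_vmx_return label inside the
	 * vmx_vcpu_run() asm block, so every vmexit resumes there.
	 */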
1561 asm("mov $.Lkvm_vmx_return, %0" : "=r"(kvm_vmx_return));
1562 vmcs_writel(HOST_RIP, kvm_vmx_return); /* 22.2.5 */
1563 vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
1564 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
1565 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
1566
1567 rdmsr(MSR_IA32_SYSENTER_CS, host_sysenter_cs, junk);
1568 vmcs_write32(HOST_IA32_SYSENTER_CS, host_sysenter_cs);
1569 rdmsrl(MSR_IA32_SYSENTER_ESP, a);
1570 vmcs_writel(HOST_IA32_SYSENTER_ESP, a); /* 22.2.3 */
1571 rdmsrl(MSR_IA32_SYSENTER_EIP, a);
1572 vmcs_writel(HOST_IA32_SYSENTER_EIP, a); /* 22.2.3 */
1573
1574 for (i = 0; i < NR_VMX_MSR; ++i) {
1575 u32 index = vmx_msr_index[i];
1576 u32 data_low, data_high;
1577 u64 data;
1578 int j = vmx->nmsrs;
1579
1580 if (rdmsr_safe(index, &data_low, &data_high) < 0)
1581 continue;
1582 if (wrmsr_safe(index, data_low, data_high) < 0)
1583 continue;
1584 data = data_low | ((u64)data_high << 32);
1585 vmx->host_msrs[j].index = index;
1586 vmx->host_msrs[j].reserved = 0;
1587 vmx->host_msrs[j].data = data;
1588 vmx->guest_msrs[j] = vmx->host_msrs[j];
1589 ++vmx->nmsrs;
1590 }
1591
1592 vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl);
1593
1594 /* 22.2.1, 20.8.1 */
1595 vmcs_write32(VM_ENTRY_CONTROLS, vmcs_config.vmentry_ctrl);
1596
1597 vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL);
1598 vmcs_writel(CR4_GUEST_HOST_MASK, KVM_GUEST_CR4_MASK);
1599
1600 if (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))
1601 if (alloc_apic_access_page(vmx->vcpu.kvm) != 0)
1602 return -ENOMEM;
1603
1604 return 0;
1605}
1606
1607static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
1608{
1609 struct vcpu_vmx *vmx = to_vmx(vcpu);
1610 u64 msr;
1611 int ret;
1612
1613 if (!init_rmode_tss(vmx->vcpu.kvm)) {
1614 ret = -ENOMEM;
1615 goto out;
1616 }
1617
1618 vmx->vcpu.arch.rmode.active = 0;
1619
1620 vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val();
1621 set_cr8(&vmx->vcpu, 0);
1622 msr = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
1623 if (vmx->vcpu.vcpu_id == 0)
1624 msr |= MSR_IA32_APICBASE_BSP;
1625 kvm_set_apic_base(&vmx->vcpu, msr);
1626
1627 fx_init(&vmx->vcpu);
1628
1629 /*
1630 * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode
1631 * insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4. Sigh.
1632 */
1633 if (vmx->vcpu.vcpu_id == 0) {
1634 vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
1635 vmcs_writel(GUEST_CS_BASE, 0x000f0000);
1636 } else {
1637 vmcs_write16(GUEST_CS_SELECTOR, vmx->vcpu.arch.sipi_vector << 8);
1638 vmcs_writel(GUEST_CS_BASE, vmx->vcpu.arch.sipi_vector << 12);
1639 }
1640 vmcs_write32(GUEST_CS_LIMIT, 0xffff);
1641 vmcs_write32(GUEST_CS_AR_BYTES, 0x9b);
1642
1643 seg_setup(VCPU_SREG_DS);
1644 seg_setup(VCPU_SREG_ES);
1645 seg_setup(VCPU_SREG_FS);
1646 seg_setup(VCPU_SREG_GS);
1647 seg_setup(VCPU_SREG_SS);
1648
1649 vmcs_write16(GUEST_TR_SELECTOR, 0);
1650 vmcs_writel(GUEST_TR_BASE, 0);
1651 vmcs_write32(GUEST_TR_LIMIT, 0xffff);
1652 vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
1653
1654 vmcs_write16(GUEST_LDTR_SELECTOR, 0);
1655 vmcs_writel(GUEST_LDTR_BASE, 0);
1656 vmcs_write32(GUEST_LDTR_LIMIT, 0xffff);
1657 vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082);
1658
1659 vmcs_write32(GUEST_SYSENTER_CS, 0);
1660 vmcs_writel(GUEST_SYSENTER_ESP, 0);
1661 vmcs_writel(GUEST_SYSENTER_EIP, 0);
1662
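	/*
	 * RFLAGS bit 1 is reserved and always set. The BSP starts at the
	 * real-mode reset vector f000:fff0; APs start at their SIPI vector.
	 */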
1663 vmcs_writel(GUEST_RFLAGS, 0x02);
1664 if (vmx->vcpu.vcpu_id == 0)
1665 vmcs_writel(GUEST_RIP, 0xfff0);
1666 else
1667 vmcs_writel(GUEST_RIP, 0);
1668 vmcs_writel(GUEST_RSP, 0);
1669
1670 /* todo: dr0 = dr1 = dr2 = dr3 = 0; dr6 = 0xffff0ff0 */
1671 vmcs_writel(GUEST_DR7, 0x400);
1672
1673 vmcs_writel(GUEST_GDTR_BASE, 0);
1674 vmcs_write32(GUEST_GDTR_LIMIT, 0xffff);
1675
1676 vmcs_writel(GUEST_IDTR_BASE, 0);
1677 vmcs_write32(GUEST_IDTR_LIMIT, 0xffff);
1678
1679 vmcs_write32(GUEST_ACTIVITY_STATE, 0);
1680 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
1681 vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0);
1682
1683 guest_write_tsc(0);
1684
1685 /* Special registers */
1686 vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
1687
1688 setup_msrs(vmx);
1689
1690 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); /* 22.2.1 */
1691
1692 if (cpu_has_vmx_tpr_shadow()) {
1693 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
1694 if (vm_need_tpr_shadow(vmx->vcpu.kvm))
1695 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
1696 page_to_phys(vmx->vcpu.arch.apic->regs_page));
1697 vmcs_write32(TPR_THRESHOLD, 0);
1698 }
1699
1700 if (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))
1701 vmcs_write64(APIC_ACCESS_ADDR,
1702 page_to_phys(vmx->vcpu.kvm->arch.apic_access_page));
1703
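	/* 0x60000010 is the architectural CR0 reset value (CD | NW | ET). */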
1704 vmx->vcpu.arch.cr0 = 0x60000010;
1705 vmx_set_cr0(&vmx->vcpu, vmx->vcpu.arch.cr0); /* enter rmode */
1706 vmx_set_cr4(&vmx->vcpu, 0);
1707#ifdef CONFIG_X86_64
1708 vmx_set_efer(&vmx->vcpu, 0);
1709#endif
1710 vmx_fpu_activate(&vmx->vcpu);
1711 update_exception_bitmap(&vmx->vcpu);
1712
1713 return 0;
1714
1715out:
1716 return ret;
1717}
1718
1719static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq)
1720{
1721 struct vcpu_vmx *vmx = to_vmx(vcpu);
1722
1723 if (vcpu->arch.rmode.active) {
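		/*
		 * In real (vm86) mode, inject as a software interrupt with
		 * instruction length 1 and back RIP up by one byte, so the
		 * guest resumes at the original RIP after the injection.
		 */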
1724 vmx->rmode.irq.pending = true;
1725 vmx->rmode.irq.vector = irq;
1726 vmx->rmode.irq.rip = vmcs_readl(GUEST_RIP);
1727 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
1728 irq | INTR_TYPE_SOFT_INTR | INTR_INFO_VALID_MASK);
1729 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1);
1730 vmcs_writel(GUEST_RIP, vmx->rmode.irq.rip - 1);
1731 return;
1732 }
1733 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
1734 irq | INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK);
1735}
1736
1737static void kvm_do_inject_irq(struct kvm_vcpu *vcpu)
1738{
1739 int word_index = __ffs(vcpu->arch.irq_summary);
1740 int bit_index = __ffs(vcpu->arch.irq_pending[word_index]);
1741 int irq = word_index * BITS_PER_LONG + bit_index;
1742
1743 clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]);
1744 if (!vcpu->arch.irq_pending[word_index])
1745 clear_bit(word_index, &vcpu->arch.irq_summary);
1746 vmx_inject_irq(vcpu, irq);
1747}
1748
1749
1750static void do_interrupt_requests(struct kvm_vcpu *vcpu,
1751 struct kvm_run *kvm_run)
1752{
1753 u32 cpu_based_vm_exec_control;
1754
1755 vcpu->arch.interrupt_window_open =
1756 ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
1757 (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0);
1758
1759 if (vcpu->arch.interrupt_window_open &&
1760 vcpu->arch.irq_summary &&
1761 !(vmcs_read32(VM_ENTRY_INTR_INFO_FIELD) & INTR_INFO_VALID_MASK))
1762 /*
1763		 * Interrupts are enabled and not blocked by sti or mov ss; inject now.
1764 */
1765 kvm_do_inject_irq(vcpu);
1766
1767 cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
1768 if (!vcpu->arch.interrupt_window_open &&
1769 (vcpu->arch.irq_summary || kvm_run->request_interrupt_window))
1770 /*
1771 * Interrupts blocked. Wait for unblock.
1772 */
1773 cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING;
1774 else
1775 cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
1776 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
1777}
1778
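/*
 * Back the three real-mode TSS pages at the given address with a dedicated
 * memslot above the slots visible to userspace.
 */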
1779static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
1780{
1781 int ret;
1782 struct kvm_userspace_memory_region tss_mem = {
1783 .slot = 8,
1784 .guest_phys_addr = addr,
1785 .memory_size = PAGE_SIZE * 3,
1786 .flags = 0,
1787 };
1788
1789 ret = kvm_set_memory_region(kvm, &tss_mem, 0);
1790 if (ret)
1791 return ret;
1792 kvm->arch.tss_addr = addr;
1793 return 0;
1794}
1795
1796static void kvm_guest_debug_pre(struct kvm_vcpu *vcpu)
1797{
1798 struct kvm_guest_debug *dbg = &vcpu->guest_debug;
1799
1800 set_debugreg(dbg->bp[0], 0);
1801 set_debugreg(dbg->bp[1], 1);
1802 set_debugreg(dbg->bp[2], 2);
1803 set_debugreg(dbg->bp[3], 3);
1804
1805 if (dbg->singlestep) {
1806 unsigned long flags;
1807
1808 flags = vmcs_readl(GUEST_RFLAGS);
1809 flags |= X86_EFLAGS_TF | X86_EFLAGS_RF;
1810 vmcs_writel(GUEST_RFLAGS, flags);
1811 }
1812}
1813
1814static int handle_rmode_exception(struct kvm_vcpu *vcpu,
1815 int vec, u32 err_code)
1816{
1817 if (!vcpu->arch.rmode.active)
1818 return 0;
1819
1820 /*
1821	 * Instructions with the address-size override prefix (opcode 0x67)
1822	 * cause a #SS fault with error code 0 in VM86 mode.
1823 */
1824 if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0)
1825 if (emulate_instruction(vcpu, NULL, 0, 0, 0) == EMULATE_DONE)
1826 return 1;
1827 return 0;
1828}
1829
1830static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1831{
1832 struct vcpu_vmx *vmx = to_vmx(vcpu);
1833 u32 intr_info, error_code;
1834 unsigned long cr2, rip;
1835 u32 vect_info;
1836 enum emulation_result er;
1837
1838 vect_info = vmx->idt_vectoring_info;
1839 intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
1840
1841 if ((vect_info & VECTORING_INFO_VALID_MASK) &&
1842 !is_page_fault(intr_info))
1843 printk(KERN_ERR "%s: unexpected, vectoring info 0x%x "
1844 "intr info 0x%x\n", __FUNCTION__, vect_info, intr_info);
1845
1846 if (!irqchip_in_kernel(vcpu->kvm) && is_external_interrupt(vect_info)) {
1847 int irq = vect_info & VECTORING_INFO_VECTOR_MASK;
1848 set_bit(irq, vcpu->arch.irq_pending);
1849 set_bit(irq / BITS_PER_LONG, &vcpu->arch.irq_summary);
1850 }
1851
1852 if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200) /* nmi */
1853 return 1; /* already handled by vmx_vcpu_run() */
1854
1855 if (is_no_device(intr_info)) {
1856 vmx_fpu_activate(vcpu);
1857 return 1;
1858 }
1859
1860 if (is_invalid_opcode(intr_info)) {
1861 er = emulate_instruction(vcpu, kvm_run, 0, 0, 0);
1862 if (er != EMULATE_DONE)
1863 kvm_queue_exception(vcpu, UD_VECTOR);
1864 return 1;
1865 }
1866
1867 error_code = 0;
1868 rip = vmcs_readl(GUEST_RIP);
1869 if (intr_info & INTR_INFO_DELIEVER_CODE_MASK)
1870 error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
1871 if (is_page_fault(intr_info)) {
1872 cr2 = vmcs_readl(EXIT_QUALIFICATION);
1873 return kvm_mmu_page_fault(vcpu, cr2, error_code);
1874 }
1875
1876 if (vcpu->arch.rmode.active &&
1877 handle_rmode_exception(vcpu, intr_info & INTR_INFO_VECTOR_MASK,
1878 error_code)) {
1879 if (vcpu->arch.halt_request) {
1880 vcpu->arch.halt_request = 0;
1881 return kvm_emulate_halt(vcpu);
1882 }
1883 return 1;
1884 }
1885
1886 if ((intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK)) ==
1887 (INTR_TYPE_EXCEPTION | 1)) {
1888 kvm_run->exit_reason = KVM_EXIT_DEBUG;
1889 return 0;
1890 }
1891 kvm_run->exit_reason = KVM_EXIT_EXCEPTION;
1892 kvm_run->ex.exception = intr_info & INTR_INFO_VECTOR_MASK;
1893 kvm_run->ex.error_code = error_code;
1894 return 0;
1895}
1896
1897static int handle_external_interrupt(struct kvm_vcpu *vcpu,
1898 struct kvm_run *kvm_run)
1899{
1900 ++vcpu->stat.irq_exits;
1901 return 1;
1902}
1903
1904static int handle_triple_fault(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1905{
1906 kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
1907 return 0;
1908}
1909
1910static int handle_io(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1911{
1912 unsigned long exit_qualification;
1913 int size, down, in, string, rep;
1914 unsigned port;
1915
1916 ++vcpu->stat.io_exits;
1917 exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
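	/*
	 * Exit qualification: bits 2:0 = size - 1, bit 3 = in, bit 4 = string,
	 * bit 5 = rep, bits 31:16 = port number.
	 */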
1918 string = (exit_qualification & 16) != 0;
1919
1920 if (string) {
1921 if (emulate_instruction(vcpu,
1922 kvm_run, 0, 0, 0) == EMULATE_DO_MMIO)
1923 return 0;
1924 return 1;
1925 }
1926
1927 size = (exit_qualification & 7) + 1;
1928 in = (exit_qualification & 8) != 0;
1929 down = (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_DF) != 0;
1930 rep = (exit_qualification & 32) != 0;
1931 port = exit_qualification >> 16;
1932
1933 return kvm_emulate_pio(vcpu, kvm_run, in, size, port);
1934}
1935
1936static void
1937vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
1938{
1939 /*
1940 * Patch in the VMCALL instruction:
1941 */
1942 hypercall[0] = 0x0f;
1943 hypercall[1] = 0x01;
1944 hypercall[2] = 0xc1;
1945}
1946
1947static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1948{
1949 unsigned long exit_qualification;
1950 int cr;
1951 int reg;
1952
1953 exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
1954 cr = exit_qualification & 15;
1955 reg = (exit_qualification >> 8) & 15;
1956 switch ((exit_qualification >> 4) & 3) {
1957 case 0: /* mov to cr */
1958 switch (cr) {
1959 case 0:
1960 vcpu_load_rsp_rip(vcpu);
1961 set_cr0(vcpu, vcpu->arch.regs[reg]);
1962 skip_emulated_instruction(vcpu);
1963 return 1;
1964 case 3:
1965 vcpu_load_rsp_rip(vcpu);
1966 set_cr3(vcpu, vcpu->arch.regs[reg]);
1967 skip_emulated_instruction(vcpu);
1968 return 1;
1969 case 4:
1970 vcpu_load_rsp_rip(vcpu);
1971 set_cr4(vcpu, vcpu->arch.regs[reg]);
1972 skip_emulated_instruction(vcpu);
1973 return 1;
1974 case 8:
1975 vcpu_load_rsp_rip(vcpu);
1976 set_cr8(vcpu, vcpu->arch.regs[reg]);
1977 skip_emulated_instruction(vcpu);
1978 if (irqchip_in_kernel(vcpu->kvm))
1979 return 1;
1980 kvm_run->exit_reason = KVM_EXIT_SET_TPR;
1981 return 0;
1982 };
1983 break;
1984 case 2: /* clts */
1985 vcpu_load_rsp_rip(vcpu);
1986 vmx_fpu_deactivate(vcpu);
1987 vcpu->arch.cr0 &= ~X86_CR0_TS;
1988 vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0);
1989 vmx_fpu_activate(vcpu);
1990 skip_emulated_instruction(vcpu);
1991 return 1;
1992 case 1: /*mov from cr*/
1993 switch (cr) {
1994 case 3:
1995 vcpu_load_rsp_rip(vcpu);
1996 vcpu->arch.regs[reg] = vcpu->arch.cr3;
1997 vcpu_put_rsp_rip(vcpu);
1998 skip_emulated_instruction(vcpu);
1999 return 1;
2000 case 8:
2001 vcpu_load_rsp_rip(vcpu);
2002 vcpu->arch.regs[reg] = get_cr8(vcpu);
2003 vcpu_put_rsp_rip(vcpu);
2004 skip_emulated_instruction(vcpu);
2005 return 1;
2006 }
2007 break;
2008 case 3: /* lmsw */
2009 lmsw(vcpu, (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f);
2010
2011 skip_emulated_instruction(vcpu);
2012 return 1;
2013 default:
2014 break;
2015 }
2016 kvm_run->exit_reason = 0;
2017 pr_unimpl(vcpu, "unhandled control register: op %d cr %d\n",
2018 (int)(exit_qualification >> 4) & 3, cr);
2019 return 0;
2020}
2021
2022static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2023{
2024 unsigned long exit_qualification;
2025 unsigned long val;
2026 int dr, reg;
2027
2028 /*
2029 * FIXME: this code assumes the host is debugging the guest.
2030	 * We also need to handle the guest debugging itself.
2031 */
2032 exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
2033 dr = exit_qualification & 7;
2034 reg = (exit_qualification >> 8) & 15;
2035 vcpu_load_rsp_rip(vcpu);
2036 if (exit_qualification & 16) {
2037 /* mov from dr */
2038 switch (dr) {
2039 case 6:
2040 val = 0xffff0ff0;
2041 break;
2042 case 7:
2043 val = 0x400;
2044 break;
2045 default:
2046 val = 0;
2047 }
2048 vcpu->arch.regs[reg] = val;
2049 } else {
2050 /* mov to dr */
2051 }
2052 vcpu_put_rsp_rip(vcpu);
2053 skip_emulated_instruction(vcpu);
2054 return 1;
2055}
2056
2057static int handle_cpuid(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2058{
2059 kvm_emulate_cpuid(vcpu);
2060 return 1;
2061}
2062
2063static int handle_rdmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2064{
2065 u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX];
2066 u64 data;
2067
2068 if (vmx_get_msr(vcpu, ecx, &data)) {
2069 kvm_inject_gp(vcpu, 0);
2070 return 1;
2071 }
2072
2073 /* FIXME: handling of bits 32:63 of rax, rdx */
2074 vcpu->arch.regs[VCPU_REGS_RAX] = data & -1u;
2075 vcpu->arch.regs[VCPU_REGS_RDX] = (data >> 32) & -1u;
2076 skip_emulated_instruction(vcpu);
2077 return 1;
2078}
2079
2080static int handle_wrmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2081{
2082 u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX];
2083 u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u)
2084 | ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32);
2085
2086 if (vmx_set_msr(vcpu, ecx, data) != 0) {
2087 kvm_inject_gp(vcpu, 0);
2088 return 1;
2089 }
2090
2091 skip_emulated_instruction(vcpu);
2092 return 1;
2093}
2094
2095static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu,
2096 struct kvm_run *kvm_run)
2097{
2098 return 1;
2099}
2100
2101static int handle_interrupt_window(struct kvm_vcpu *vcpu,
2102 struct kvm_run *kvm_run)
2103{
2104 u32 cpu_based_vm_exec_control;
2105
2106 /* clear pending irq */
2107 cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
2108 cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
2109 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
2110 /*
2111	 * If user space is waiting to inject interrupts, exit as soon
2112	 * as possible.
2113 */
2114 if (kvm_run->request_interrupt_window &&
2115 !vcpu->arch.irq_summary) {
2116 kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
2117 ++vcpu->stat.irq_window_exits;
2118 return 0;
2119 }
2120 return 1;
2121}
2122
2123static int handle_halt(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2124{
2125 skip_emulated_instruction(vcpu);
2126 return kvm_emulate_halt(vcpu);
2127}
2128
2129static int handle_vmcall(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2130{
2131 skip_emulated_instruction(vcpu);
2132 kvm_emulate_hypercall(vcpu);
2133 return 1;
2134}
2135
2136static int handle_wbinvd(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2137{
2138 skip_emulated_instruction(vcpu);
2139 /* TODO: Add support for VT-d/pass-through device */
2140 return 1;
2141}
2142
2143static int handle_apic_access(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2144{
2145 u64 exit_qualification;
2146 enum emulation_result er;
2147 unsigned long offset;
2148
2149 exit_qualification = vmcs_read64(EXIT_QUALIFICATION);
2150 offset = exit_qualification & 0xffful;
2151
2152 er = emulate_instruction(vcpu, kvm_run, 0, 0, 0);
2153
2154 if (er != EMULATE_DONE) {
2155 printk(KERN_ERR
2156 "Fail to handle apic access vmexit! Offset is 0x%lx\n",
2157 offset);
2158 return -ENOTSUPP;
2159 }
2160 return 1;
2161}
2162
2163/*
2164 * The exit handlers return 1 if the exit was handled fully and guest execution
2165 * may resume. Otherwise they set the kvm_run parameter to indicate what needs
2166 * to be done by userspace and return 0.
2167 */
2168static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu,
2169 struct kvm_run *kvm_run) = {
2170 [EXIT_REASON_EXCEPTION_NMI] = handle_exception,
2171 [EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt,
2172 [EXIT_REASON_TRIPLE_FAULT] = handle_triple_fault,
2173 [EXIT_REASON_IO_INSTRUCTION] = handle_io,
2174 [EXIT_REASON_CR_ACCESS] = handle_cr,
2175 [EXIT_REASON_DR_ACCESS] = handle_dr,
2176 [EXIT_REASON_CPUID] = handle_cpuid,
2177 [EXIT_REASON_MSR_READ] = handle_rdmsr,
2178 [EXIT_REASON_MSR_WRITE] = handle_wrmsr,
2179 [EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window,
2180 [EXIT_REASON_HLT] = handle_halt,
2181 [EXIT_REASON_VMCALL] = handle_vmcall,
2182 [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold,
2183 [EXIT_REASON_APIC_ACCESS] = handle_apic_access,
2184 [EXIT_REASON_WBINVD] = handle_wbinvd,
2185};
2186
2187static const int kvm_vmx_max_exit_handlers =
2188 ARRAY_SIZE(kvm_vmx_exit_handlers);
2189
2190/*
2191 * The guest has exited. See if we can fix it or if we need userspace
2192 * assistance.
2193 */
2194static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
2195{
2196 u32 exit_reason = vmcs_read32(VM_EXIT_REASON);
2197 struct vcpu_vmx *vmx = to_vmx(vcpu);
2198 u32 vectoring_info = vmx->idt_vectoring_info;
2199
2200 if (unlikely(vmx->fail)) {
2201 kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
2202 kvm_run->fail_entry.hardware_entry_failure_reason
2203 = vmcs_read32(VM_INSTRUCTION_ERROR);
2204 return 0;
2205 }
2206
2207 if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
2208 exit_reason != EXIT_REASON_EXCEPTION_NMI)
2209 printk(KERN_WARNING "%s: unexpected, valid vectoring info and "
2210 "exit reason is 0x%x\n", __FUNCTION__, exit_reason);
2211 if (exit_reason < kvm_vmx_max_exit_handlers
2212 && kvm_vmx_exit_handlers[exit_reason])
2213 return kvm_vmx_exit_handlers[exit_reason](vcpu, kvm_run);
2214 else {
2215 kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
2216 kvm_run->hw.hardware_exit_reason = exit_reason;
2217 }
2218 return 0;
2219}
2220
2221static void vmx_flush_tlb(struct kvm_vcpu *vcpu)
2222{
2223}
2224
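/*
 * Program TPR_THRESHOLD (compared against bits 7:4 of the guest TPR) so a
 * vmexit is taken when the guest lowers its TPR below the priority of the
 * highest pending interrupt, letting that interrupt be injected.
 */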
2225static void update_tpr_threshold(struct kvm_vcpu *vcpu)
2226{
2227 int max_irr, tpr;
2228
2229 if (!vm_need_tpr_shadow(vcpu->kvm))
2230 return;
2231
2232 if (!kvm_lapic_enabled(vcpu) ||
2233 ((max_irr = kvm_lapic_find_highest_irr(vcpu)) == -1)) {
2234 vmcs_write32(TPR_THRESHOLD, 0);
2235 return;
2236 }
2237
2238 tpr = (kvm_lapic_get_cr8(vcpu) & 0x0f) << 4;
2239 vmcs_write32(TPR_THRESHOLD, (max_irr > tpr) ? tpr >> 4 : max_irr >> 4);
2240}
2241
2242static void enable_irq_window(struct kvm_vcpu *vcpu)
2243{
2244 u32 cpu_based_vm_exec_control;
2245
2246 cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
2247 cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING;
2248 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
2249}
2250
2251static void vmx_intr_assist(struct kvm_vcpu *vcpu)
2252{
2253 struct vcpu_vmx *vmx = to_vmx(vcpu);
2254 u32 idtv_info_field, intr_info_field;
2255 int has_ext_irq, interrupt_window_open;
2256 int vector;
2257
2258 update_tpr_threshold(vcpu);
2259
2260 has_ext_irq = kvm_cpu_has_interrupt(vcpu);
2261 intr_info_field = vmcs_read32(VM_ENTRY_INTR_INFO_FIELD);
2262 idtv_info_field = vmx->idt_vectoring_info;
2263 if (intr_info_field & INTR_INFO_VALID_MASK) {
2264 if (idtv_info_field & INTR_INFO_VALID_MASK) {
2265 /* TODO: fault when IDT_Vectoring */
2266 if (printk_ratelimit())
2267 printk(KERN_ERR "Fault when IDT_Vectoring\n");
2268 }
2269 if (has_ext_irq)
2270 enable_irq_window(vcpu);
2271 return;
2272 }
2273 if (unlikely(idtv_info_field & INTR_INFO_VALID_MASK)) {
2274 if ((idtv_info_field & VECTORING_INFO_TYPE_MASK)
2275 == INTR_TYPE_EXT_INTR
2276 && vcpu->arch.rmode.active) {
2277 u8 vect = idtv_info_field & VECTORING_INFO_VECTOR_MASK;
2278
2279 vmx_inject_irq(vcpu, vect);
2280 if (unlikely(has_ext_irq))
2281 enable_irq_window(vcpu);
2282 return;
2283 }
2284
2285 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, idtv_info_field);
2286 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
2287 vmcs_read32(VM_EXIT_INSTRUCTION_LEN));
2288
2289 if (unlikely(idtv_info_field & INTR_INFO_DELIEVER_CODE_MASK))
2290 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
2291 vmcs_read32(IDT_VECTORING_ERROR_CODE));
2292 if (unlikely(has_ext_irq))
2293 enable_irq_window(vcpu);
2294 return;
2295 }
2296 if (!has_ext_irq)
2297 return;
2298 interrupt_window_open =
2299 ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
2300 (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0);
2301 if (interrupt_window_open) {
2302 vector = kvm_cpu_get_interrupt(vcpu);
2303 vmx_inject_irq(vcpu, vector);
2304 kvm_timer_intr_post(vcpu, vector);
2305 } else
2306 enable_irq_window(vcpu);
2307}
2308
2309/*
2310 * Failure to inject an interrupt should give us the information
2311 * in IDT_VECTORING_INFO_FIELD. However, if the failure occurs
2312 * when fetching the interrupt redirection bitmap in the real-mode
2313 * tss, this doesn't happen. So we do it ourselves.
2314 */
2315static void fixup_rmode_irq(struct vcpu_vmx *vmx)
2316{
2317 vmx->rmode.irq.pending = 0;
2318 if (vmcs_readl(GUEST_RIP) + 1 != vmx->rmode.irq.rip)
2319 return;
2320 vmcs_writel(GUEST_RIP, vmx->rmode.irq.rip);
2321 if (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK) {
2322 vmx->idt_vectoring_info &= ~VECTORING_INFO_TYPE_MASK;
2323 vmx->idt_vectoring_info |= INTR_TYPE_EXT_INTR;
2324 return;
2325 }
2326 vmx->idt_vectoring_info =
2327 VECTORING_INFO_VALID_MASK
2328 | INTR_TYPE_EXT_INTR
2329 | vmx->rmode.irq.vector;
2330}
2331
2332static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2333{
2334 struct vcpu_vmx *vmx = to_vmx(vcpu);
2335 u32 intr_info;
2336
2337 /*
2338 * Loading guest fpu may have cleared host cr0.ts
2339 */
2340 vmcs_writel(HOST_CR0, read_cr0());
2341
2342 asm(
2343 /* Store host registers */
2344#ifdef CONFIG_X86_64
2345 "push %%rdx; push %%rbp;"
2346 "push %%rcx \n\t"
2347#else
2348 "push %%edx; push %%ebp;"
2349 "push %%ecx \n\t"
2350#endif
2351 ASM_VMX_VMWRITE_RSP_RDX "\n\t"
2352		/* Check if vmlaunch or vmresume is needed */
2353 "cmpl $0, %c[launched](%0) \n\t"
2354 /* Load guest registers. Don't clobber flags. */
2355#ifdef CONFIG_X86_64
2356 "mov %c[cr2](%0), %%rax \n\t"
2357 "mov %%rax, %%cr2 \n\t"
2358 "mov %c[rax](%0), %%rax \n\t"
2359 "mov %c[rbx](%0), %%rbx \n\t"
2360 "mov %c[rdx](%0), %%rdx \n\t"
2361 "mov %c[rsi](%0), %%rsi \n\t"
2362 "mov %c[rdi](%0), %%rdi \n\t"
2363 "mov %c[rbp](%0), %%rbp \n\t"
2364 "mov %c[r8](%0), %%r8 \n\t"
2365 "mov %c[r9](%0), %%r9 \n\t"
2366 "mov %c[r10](%0), %%r10 \n\t"
2367 "mov %c[r11](%0), %%r11 \n\t"
2368 "mov %c[r12](%0), %%r12 \n\t"
2369 "mov %c[r13](%0), %%r13 \n\t"
2370 "mov %c[r14](%0), %%r14 \n\t"
2371 "mov %c[r15](%0), %%r15 \n\t"
2372 "mov %c[rcx](%0), %%rcx \n\t" /* kills %0 (rcx) */
2373#else
2374 "mov %c[cr2](%0), %%eax \n\t"
2375 "mov %%eax, %%cr2 \n\t"
2376 "mov %c[rax](%0), %%eax \n\t"
2377 "mov %c[rbx](%0), %%ebx \n\t"
2378 "mov %c[rdx](%0), %%edx \n\t"
2379 "mov %c[rsi](%0), %%esi \n\t"
2380 "mov %c[rdi](%0), %%edi \n\t"
2381 "mov %c[rbp](%0), %%ebp \n\t"
2382 "mov %c[rcx](%0), %%ecx \n\t" /* kills %0 (ecx) */
2383#endif
2384 /* Enter guest mode */
2385 "jne .Llaunched \n\t"
2386 ASM_VMX_VMLAUNCH "\n\t"
2387 "jmp .Lkvm_vmx_return \n\t"
2388 ".Llaunched: " ASM_VMX_VMRESUME "\n\t"
2389 ".Lkvm_vmx_return: "
2390 /* Save guest registers, load host registers, keep flags */
2391#ifdef CONFIG_X86_64
2392 "xchg %0, (%%rsp) \n\t"
2393 "mov %%rax, %c[rax](%0) \n\t"
2394 "mov %%rbx, %c[rbx](%0) \n\t"
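		/* The xchg above left the guest %rcx on the stack; copy it out. */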
2395 "pushq (%%rsp); popq %c[rcx](%0) \n\t"
2396 "mov %%rdx, %c[rdx](%0) \n\t"
2397 "mov %%rsi, %c[rsi](%0) \n\t"
2398 "mov %%rdi, %c[rdi](%0) \n\t"
2399 "mov %%rbp, %c[rbp](%0) \n\t"
2400 "mov %%r8, %c[r8](%0) \n\t"
2401 "mov %%r9, %c[r9](%0) \n\t"
2402 "mov %%r10, %c[r10](%0) \n\t"
2403 "mov %%r11, %c[r11](%0) \n\t"
2404 "mov %%r12, %c[r12](%0) \n\t"
2405 "mov %%r13, %c[r13](%0) \n\t"
2406 "mov %%r14, %c[r14](%0) \n\t"
2407 "mov %%r15, %c[r15](%0) \n\t"
2408 "mov %%cr2, %%rax \n\t"
2409 "mov %%rax, %c[cr2](%0) \n\t"
2410
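		/* First pop discards the guest %rcx slot; then restore host rbp, rdx. */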
2411 "pop %%rbp; pop %%rbp; pop %%rdx \n\t"
2412#else
2413 "xchg %0, (%%esp) \n\t"
2414 "mov %%eax, %c[rax](%0) \n\t"
2415 "mov %%ebx, %c[rbx](%0) \n\t"
2416 "pushl (%%esp); popl %c[rcx](%0) \n\t"
2417 "mov %%edx, %c[rdx](%0) \n\t"
2418 "mov %%esi, %c[rsi](%0) \n\t"
2419 "mov %%edi, %c[rdi](%0) \n\t"
2420 "mov %%ebp, %c[rbp](%0) \n\t"
2421 "mov %%cr2, %%eax \n\t"
2422 "mov %%eax, %c[cr2](%0) \n\t"
2423
2424 "pop %%ebp; pop %%ebp; pop %%edx \n\t"
2425#endif
2426 "setbe %c[fail](%0) \n\t"
2427 : : "c"(vmx), "d"((unsigned long)HOST_RSP),
2428 [launched]"i"(offsetof(struct vcpu_vmx, launched)),
2429 [fail]"i"(offsetof(struct vcpu_vmx, fail)),
2430 [rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])),
2431 [rbx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBX])),
2432 [rcx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RCX])),
2433 [rdx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDX])),
2434 [rsi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RSI])),
2435 [rdi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDI])),
2436 [rbp]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBP])),
2437#ifdef CONFIG_X86_64
2438 [r8]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R8])),
2439 [r9]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R9])),
2440 [r10]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R10])),
2441 [r11]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R11])),
2442 [r12]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R12])),
2443 [r13]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R13])),
2444 [r14]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R14])),
2445 [r15]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R15])),
2446#endif
2447 [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2))
2448 : "cc", "memory"
2449#ifdef CONFIG_X86_64
2450 , "rbx", "rdi", "rsi"
2451 , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"
2452#else
2453 , "ebx", "edi", "rsi"
2454#endif
2455 );
2456
2457 vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
2458 if (vmx->rmode.irq.pending)
2459 fixup_rmode_irq(vmx);
2460
2461 vcpu->arch.interrupt_window_open =
2462 (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0;
2463
2464 asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS));
2465 vmx->launched = 1;
2466
2467 intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
2468
2469 /* We need to handle NMIs before interrupts are enabled */
2470 if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200) /* nmi */
2471 asm("int $2");
2472}
2473
2474static void vmx_free_vmcs(struct kvm_vcpu *vcpu)
2475{
2476 struct vcpu_vmx *vmx = to_vmx(vcpu);
2477
2478 if (vmx->vmcs) {
2479 on_each_cpu(__vcpu_clear, vmx, 0, 1);
2480 free_vmcs(vmx->vmcs);
2481 vmx->vmcs = NULL;
2482 }
2483}
2484
2485static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
2486{
2487 struct vcpu_vmx *vmx = to_vmx(vcpu);
2488
2489 vmx_free_vmcs(vcpu);
2490 kfree(vmx->host_msrs);
2491 kfree(vmx->guest_msrs);
2492 kvm_vcpu_uninit(vcpu);
2493 kmem_cache_free(kvm_vcpu_cache, vmx);
2494}
2495
2496static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
2497{
2498 int err;
2499 struct vcpu_vmx *vmx = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
2500 int cpu;
2501
2502 if (!vmx)
2503 return ERR_PTR(-ENOMEM);
2504
2505 err = kvm_vcpu_init(&vmx->vcpu, kvm, id);
2506 if (err)
2507 goto free_vcpu;
2508
2509 vmx->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL);
2510 if (!vmx->guest_msrs) {
2511 err = -ENOMEM;
2512 goto uninit_vcpu;
2513 }
2514
2515 vmx->host_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL);
2516 if (!vmx->host_msrs)
2517 goto free_guest_msrs;
2518
2519 vmx->vmcs = alloc_vmcs();
2520 if (!vmx->vmcs)
2521 goto free_msrs;
2522
2523 vmcs_clear(vmx->vmcs);
2524
2525 cpu = get_cpu();
2526 vmx_vcpu_load(&vmx->vcpu, cpu);
2527 err = vmx_vcpu_setup(vmx);
2528 vmx_vcpu_put(&vmx->vcpu);
2529 put_cpu();
2530 if (err)
2531 goto free_vmcs;
2532
2533 return &vmx->vcpu;
2534
2535free_vmcs:
2536 free_vmcs(vmx->vmcs);
2537free_msrs:
2538 kfree(vmx->host_msrs);
2539free_guest_msrs:
2540 kfree(vmx->guest_msrs);
2541uninit_vcpu:
2542 kvm_vcpu_uninit(&vmx->vcpu);
2543free_vcpu:
2544 kmem_cache_free(kvm_vcpu_cache, vmx);
2545 return ERR_PTR(err);
2546}
2547
2548static void __init vmx_check_processor_compat(void *rtn)
2549{
2550 struct vmcs_config vmcs_conf;
2551
2552 *(int *)rtn = 0;
2553 if (setup_vmcs_config(&vmcs_conf) < 0)
2554 *(int *)rtn = -EIO;
2555 if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config)) != 0) {
2556 printk(KERN_ERR "kvm: CPU %d feature inconsistency!\n",
2557 smp_processor_id());
2558 *(int *)rtn = -EIO;
2559 }
2560}
2561
2562static struct kvm_x86_ops vmx_x86_ops = {
2563 .cpu_has_kvm_support = cpu_has_kvm_support,
2564 .disabled_by_bios = vmx_disabled_by_bios,
2565 .hardware_setup = hardware_setup,
2566 .hardware_unsetup = hardware_unsetup,
2567 .check_processor_compatibility = vmx_check_processor_compat,
2568 .hardware_enable = hardware_enable,
2569 .hardware_disable = hardware_disable,
2570
2571 .vcpu_create = vmx_create_vcpu,
2572 .vcpu_free = vmx_free_vcpu,
2573 .vcpu_reset = vmx_vcpu_reset,
2574
2575 .prepare_guest_switch = vmx_save_host_state,
2576 .vcpu_load = vmx_vcpu_load,
2577 .vcpu_put = vmx_vcpu_put,
2578 .vcpu_decache = vmx_vcpu_decache,
2579
2580 .set_guest_debug = set_guest_debug,
2581 .guest_debug_pre = kvm_guest_debug_pre,
2582 .get_msr = vmx_get_msr,
2583 .set_msr = vmx_set_msr,
2584 .get_segment_base = vmx_get_segment_base,
2585 .get_segment = vmx_get_segment,
2586 .set_segment = vmx_set_segment,
2587 .get_cs_db_l_bits = vmx_get_cs_db_l_bits,
2588 .decache_cr4_guest_bits = vmx_decache_cr4_guest_bits,
2589 .set_cr0 = vmx_set_cr0,
2590 .set_cr3 = vmx_set_cr3,
2591 .set_cr4 = vmx_set_cr4,
2592#ifdef CONFIG_X86_64
2593 .set_efer = vmx_set_efer,
2594#endif
2595 .get_idt = vmx_get_idt,
2596 .set_idt = vmx_set_idt,
2597 .get_gdt = vmx_get_gdt,
2598 .set_gdt = vmx_set_gdt,
2599 .cache_regs = vcpu_load_rsp_rip,
2600 .decache_regs = vcpu_put_rsp_rip,
2601 .get_rflags = vmx_get_rflags,
2602 .set_rflags = vmx_set_rflags,
2603
2604 .tlb_flush = vmx_flush_tlb,
2605
2606 .run = vmx_vcpu_run,
2607 .handle_exit = kvm_handle_exit,
2608 .skip_emulated_instruction = skip_emulated_instruction,
2609 .patch_hypercall = vmx_patch_hypercall,
2610 .get_irq = vmx_get_irq,
2611 .set_irq = vmx_inject_irq,
2612 .queue_exception = vmx_queue_exception,
2613 .exception_injected = vmx_exception_injected,
2614 .inject_pending_irq = vmx_intr_assist,
2615 .inject_pending_vectors = do_interrupt_requests,
2616
2617 .set_tss_addr = vmx_set_tss_addr,
2618};
2619
2620static int __init vmx_init(void)
2621{
2622 void *iova;
2623 int r;
2624
2625 vmx_io_bitmap_a = alloc_page(GFP_KERNEL | __GFP_HIGHMEM);
2626 if (!vmx_io_bitmap_a)
2627 return -ENOMEM;
2628
2629 vmx_io_bitmap_b = alloc_page(GFP_KERNEL | __GFP_HIGHMEM);
2630 if (!vmx_io_bitmap_b) {
2631 r = -ENOMEM;
2632 goto out;
2633 }
2634
2635 /*
2636 * Allow direct access to the PC debug port (it is often used for I/O
2637 * delays, but the vmexits simply slow things down).
2638 */
2639 iova = kmap(vmx_io_bitmap_a);
2640 memset(iova, 0xff, PAGE_SIZE);
2641 clear_bit(0x80, iova);
2642 kunmap(vmx_io_bitmap_a);
2643
2644 iova = kmap(vmx_io_bitmap_b);
2645 memset(iova, 0xff, PAGE_SIZE);
2646 kunmap(vmx_io_bitmap_b);
2647
2648 r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx), THIS_MODULE);
2649 if (r)
2650 goto out1;
2651
2652 if (bypass_guest_pf)
2653 kvm_mmu_set_nonpresent_ptes(~0xffeull, 0ull);
2654
2655 return 0;
2656
2657out1:
2658 __free_page(vmx_io_bitmap_b);
2659out:
2660 __free_page(vmx_io_bitmap_a);
2661 return r;
2662}
2663
2664static void __exit vmx_exit(void)
2665{
2666 __free_page(vmx_io_bitmap_b);
2667 __free_page(vmx_io_bitmap_a);
2668
2669 kvm_exit();
2670}
2671
2672module_init(vmx_init)
2673module_exit(vmx_exit)
diff --git a/drivers/kvm/vmx.h b/drivers/kvm/vmx.h
deleted file mode 100644
index d52ae8d7303d..000000000000
--- a/drivers/kvm/vmx.h
+++ /dev/null
@@ -1,324 +0,0 @@
1#ifndef VMX_H
2#define VMX_H
3
4/*
5 * vmx.h: VMX Architecture related definitions
6 * Copyright (c) 2004, Intel Corporation.
7 *
8 * This program is free software; you can redistribute it and/or modify it
9 * under the terms and conditions of the GNU General Public License,
10 * version 2, as published by the Free Software Foundation.
11 *
12 * This program is distributed in the hope it will be useful, but WITHOUT
13 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
15 * more details.
16 *
17 * You should have received a copy of the GNU General Public License along with
18 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
19 * Place - Suite 330, Boston, MA 02111-1307 USA.
20 *
21 * A few random additions are:
22 * Copyright (C) 2006 Qumranet
23 * Avi Kivity <avi@qumranet.com>
24 * Yaniv Kamay <yaniv@qumranet.com>
25 *
26 */
27
28/*
29 * Definitions of Primary Processor-Based VM-Execution Controls.
30 */
31#define CPU_BASED_VIRTUAL_INTR_PENDING 0x00000004
32#define CPU_BASED_USE_TSC_OFFSETING 0x00000008
33#define CPU_BASED_HLT_EXITING 0x00000080
34#define CPU_BASED_INVLPG_EXITING 0x00000200
35#define CPU_BASED_MWAIT_EXITING 0x00000400
36#define CPU_BASED_RDPMC_EXITING 0x00000800
37#define CPU_BASED_RDTSC_EXITING 0x00001000
38#define CPU_BASED_CR8_LOAD_EXITING 0x00080000
39#define CPU_BASED_CR8_STORE_EXITING 0x00100000
40#define CPU_BASED_TPR_SHADOW 0x00200000
41#define CPU_BASED_MOV_DR_EXITING 0x00800000
42#define CPU_BASED_UNCOND_IO_EXITING 0x01000000
43#define CPU_BASED_USE_IO_BITMAPS 0x02000000
44#define CPU_BASED_USE_MSR_BITMAPS 0x10000000
45#define CPU_BASED_MONITOR_EXITING 0x20000000
46#define CPU_BASED_PAUSE_EXITING 0x40000000
47#define CPU_BASED_ACTIVATE_SECONDARY_CONTROLS 0x80000000
48/*
49 * Definitions of Secondary Processor-Based VM-Execution Controls.
50 */
51#define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001
52#define SECONDARY_EXEC_WBINVD_EXITING 0x00000040
53
54
55#define PIN_BASED_EXT_INTR_MASK 0x00000001
56#define PIN_BASED_NMI_EXITING 0x00000008
57#define PIN_BASED_VIRTUAL_NMIS 0x00000020
58
59#define VM_EXIT_HOST_ADDR_SPACE_SIZE 0x00000200
60#define VM_EXIT_ACK_INTR_ON_EXIT 0x00008000
61
62#define VM_ENTRY_IA32E_MODE 0x00000200
63#define VM_ENTRY_SMM 0x00000400
64#define VM_ENTRY_DEACT_DUAL_MONITOR 0x00000800
65
66/* VMCS Encodings */
67enum vmcs_field {
68 GUEST_ES_SELECTOR = 0x00000800,
69 GUEST_CS_SELECTOR = 0x00000802,
70 GUEST_SS_SELECTOR = 0x00000804,
71 GUEST_DS_SELECTOR = 0x00000806,
72 GUEST_FS_SELECTOR = 0x00000808,
73 GUEST_GS_SELECTOR = 0x0000080a,
74 GUEST_LDTR_SELECTOR = 0x0000080c,
75 GUEST_TR_SELECTOR = 0x0000080e,
76 HOST_ES_SELECTOR = 0x00000c00,
77 HOST_CS_SELECTOR = 0x00000c02,
78 HOST_SS_SELECTOR = 0x00000c04,
79 HOST_DS_SELECTOR = 0x00000c06,
80 HOST_FS_SELECTOR = 0x00000c08,
81 HOST_GS_SELECTOR = 0x00000c0a,
82 HOST_TR_SELECTOR = 0x00000c0c,
83 IO_BITMAP_A = 0x00002000,
84 IO_BITMAP_A_HIGH = 0x00002001,
85 IO_BITMAP_B = 0x00002002,
86 IO_BITMAP_B_HIGH = 0x00002003,
87 MSR_BITMAP = 0x00002004,
88 MSR_BITMAP_HIGH = 0x00002005,
89 VM_EXIT_MSR_STORE_ADDR = 0x00002006,
90 VM_EXIT_MSR_STORE_ADDR_HIGH = 0x00002007,
91 VM_EXIT_MSR_LOAD_ADDR = 0x00002008,
92 VM_EXIT_MSR_LOAD_ADDR_HIGH = 0x00002009,
93 VM_ENTRY_MSR_LOAD_ADDR = 0x0000200a,
94 VM_ENTRY_MSR_LOAD_ADDR_HIGH = 0x0000200b,
95 TSC_OFFSET = 0x00002010,
96 TSC_OFFSET_HIGH = 0x00002011,
97 VIRTUAL_APIC_PAGE_ADDR = 0x00002012,
98 VIRTUAL_APIC_PAGE_ADDR_HIGH = 0x00002013,
99 APIC_ACCESS_ADDR = 0x00002014,
100 APIC_ACCESS_ADDR_HIGH = 0x00002015,
101 VMCS_LINK_POINTER = 0x00002800,
102 VMCS_LINK_POINTER_HIGH = 0x00002801,
103 GUEST_IA32_DEBUGCTL = 0x00002802,
104 GUEST_IA32_DEBUGCTL_HIGH = 0x00002803,
105 PIN_BASED_VM_EXEC_CONTROL = 0x00004000,
106 CPU_BASED_VM_EXEC_CONTROL = 0x00004002,
107 EXCEPTION_BITMAP = 0x00004004,
108 PAGE_FAULT_ERROR_CODE_MASK = 0x00004006,
109 PAGE_FAULT_ERROR_CODE_MATCH = 0x00004008,
110 CR3_TARGET_COUNT = 0x0000400a,
111 VM_EXIT_CONTROLS = 0x0000400c,
112 VM_EXIT_MSR_STORE_COUNT = 0x0000400e,
113 VM_EXIT_MSR_LOAD_COUNT = 0x00004010,
114 VM_ENTRY_CONTROLS = 0x00004012,
115 VM_ENTRY_MSR_LOAD_COUNT = 0x00004014,
116 VM_ENTRY_INTR_INFO_FIELD = 0x00004016,
117 VM_ENTRY_EXCEPTION_ERROR_CODE = 0x00004018,
118 VM_ENTRY_INSTRUCTION_LEN = 0x0000401a,
119 TPR_THRESHOLD = 0x0000401c,
120 SECONDARY_VM_EXEC_CONTROL = 0x0000401e,
121 VM_INSTRUCTION_ERROR = 0x00004400,
122 VM_EXIT_REASON = 0x00004402,
123 VM_EXIT_INTR_INFO = 0x00004404,
124 VM_EXIT_INTR_ERROR_CODE = 0x00004406,
125 IDT_VECTORING_INFO_FIELD = 0x00004408,
126 IDT_VECTORING_ERROR_CODE = 0x0000440a,
127 VM_EXIT_INSTRUCTION_LEN = 0x0000440c,
128 VMX_INSTRUCTION_INFO = 0x0000440e,
129 GUEST_ES_LIMIT = 0x00004800,
130 GUEST_CS_LIMIT = 0x00004802,
131 GUEST_SS_LIMIT = 0x00004804,
132 GUEST_DS_LIMIT = 0x00004806,
133 GUEST_FS_LIMIT = 0x00004808,
134 GUEST_GS_LIMIT = 0x0000480a,
135 GUEST_LDTR_LIMIT = 0x0000480c,
136 GUEST_TR_LIMIT = 0x0000480e,
137 GUEST_GDTR_LIMIT = 0x00004810,
138 GUEST_IDTR_LIMIT = 0x00004812,
139 GUEST_ES_AR_BYTES = 0x00004814,
140 GUEST_CS_AR_BYTES = 0x00004816,
141 GUEST_SS_AR_BYTES = 0x00004818,
142 GUEST_DS_AR_BYTES = 0x0000481a,
143 GUEST_FS_AR_BYTES = 0x0000481c,
144 GUEST_GS_AR_BYTES = 0x0000481e,
145 GUEST_LDTR_AR_BYTES = 0x00004820,
146 GUEST_TR_AR_BYTES = 0x00004822,
147 GUEST_INTERRUPTIBILITY_INFO = 0x00004824,
148 GUEST_ACTIVITY_STATE = 0X00004826,
149 GUEST_SYSENTER_CS = 0x0000482A,
150 HOST_IA32_SYSENTER_CS = 0x00004c00,
151 CR0_GUEST_HOST_MASK = 0x00006000,
152 CR4_GUEST_HOST_MASK = 0x00006002,
153 CR0_READ_SHADOW = 0x00006004,
154 CR4_READ_SHADOW = 0x00006006,
155 CR3_TARGET_VALUE0 = 0x00006008,
156 CR3_TARGET_VALUE1 = 0x0000600a,
157 CR3_TARGET_VALUE2 = 0x0000600c,
158 CR3_TARGET_VALUE3 = 0x0000600e,
159 EXIT_QUALIFICATION = 0x00006400,
160 GUEST_LINEAR_ADDRESS = 0x0000640a,
161 GUEST_CR0 = 0x00006800,
162 GUEST_CR3 = 0x00006802,
163 GUEST_CR4 = 0x00006804,
164 GUEST_ES_BASE = 0x00006806,
165 GUEST_CS_BASE = 0x00006808,
166 GUEST_SS_BASE = 0x0000680a,
167 GUEST_DS_BASE = 0x0000680c,
168 GUEST_FS_BASE = 0x0000680e,
169 GUEST_GS_BASE = 0x00006810,
170 GUEST_LDTR_BASE = 0x00006812,
171 GUEST_TR_BASE = 0x00006814,
172 GUEST_GDTR_BASE = 0x00006816,
173 GUEST_IDTR_BASE = 0x00006818,
174 GUEST_DR7 = 0x0000681a,
175 GUEST_RSP = 0x0000681c,
176 GUEST_RIP = 0x0000681e,
177 GUEST_RFLAGS = 0x00006820,
178 GUEST_PENDING_DBG_EXCEPTIONS = 0x00006822,
179 GUEST_SYSENTER_ESP = 0x00006824,
180 GUEST_SYSENTER_EIP = 0x00006826,
181 HOST_CR0 = 0x00006c00,
182 HOST_CR3 = 0x00006c02,
183 HOST_CR4 = 0x00006c04,
184 HOST_FS_BASE = 0x00006c06,
185 HOST_GS_BASE = 0x00006c08,
186 HOST_TR_BASE = 0x00006c0a,
187 HOST_GDTR_BASE = 0x00006c0c,
188 HOST_IDTR_BASE = 0x00006c0e,
189 HOST_IA32_SYSENTER_ESP = 0x00006c10,
190 HOST_IA32_SYSENTER_EIP = 0x00006c12,
191 HOST_RSP = 0x00006c14,
192 HOST_RIP = 0x00006c16,
193};
194
195#define VMX_EXIT_REASONS_FAILED_VMENTRY 0x80000000
196
197#define EXIT_REASON_EXCEPTION_NMI 0
198#define EXIT_REASON_EXTERNAL_INTERRUPT 1
199#define EXIT_REASON_TRIPLE_FAULT 2
200
201#define EXIT_REASON_PENDING_INTERRUPT 7
202
203#define EXIT_REASON_TASK_SWITCH 9
204#define EXIT_REASON_CPUID 10
205#define EXIT_REASON_HLT 12
206#define EXIT_REASON_INVLPG 14
207#define EXIT_REASON_RDPMC 15
208#define EXIT_REASON_RDTSC 16
209#define EXIT_REASON_VMCALL 18
210#define EXIT_REASON_VMCLEAR 19
211#define EXIT_REASON_VMLAUNCH 20
212#define EXIT_REASON_VMPTRLD 21
213#define EXIT_REASON_VMPTRST 22
214#define EXIT_REASON_VMREAD 23
215#define EXIT_REASON_VMRESUME 24
216#define EXIT_REASON_VMWRITE 25
217#define EXIT_REASON_VMOFF 26
218#define EXIT_REASON_VMON 27
219#define EXIT_REASON_CR_ACCESS 28
220#define EXIT_REASON_DR_ACCESS 29
221#define EXIT_REASON_IO_INSTRUCTION 30
222#define EXIT_REASON_MSR_READ 31
223#define EXIT_REASON_MSR_WRITE 32
224#define EXIT_REASON_MWAIT_INSTRUCTION 36
225#define EXIT_REASON_TPR_BELOW_THRESHOLD 43
226#define EXIT_REASON_APIC_ACCESS 44
227#define EXIT_REASON_WBINVD 54
228
229/*
230 * Interruption-information format
231 */
232#define INTR_INFO_VECTOR_MASK 0xff /* 7:0 */
233#define INTR_INFO_INTR_TYPE_MASK 0x700 /* 10:8 */
234#define INTR_INFO_DELIEVER_CODE_MASK 0x800 /* 11 */
235#define INTR_INFO_VALID_MASK 0x80000000 /* 31 */
236
237#define VECTORING_INFO_VECTOR_MASK INTR_INFO_VECTOR_MASK
238#define VECTORING_INFO_TYPE_MASK INTR_INFO_INTR_TYPE_MASK
239#define VECTORING_INFO_DELIEVER_CODE_MASK INTR_INFO_DELIEVER_CODE_MASK
240#define VECTORING_INFO_VALID_MASK INTR_INFO_VALID_MASK
241
242#define INTR_TYPE_EXT_INTR (0 << 8) /* external interrupt */
243#define INTR_TYPE_EXCEPTION (3 << 8) /* processor exception */
244#define INTR_TYPE_SOFT_INTR (4 << 8) /* software interrupt */
245
246/*
247 * Exit Qualifications for MOV for Control Register Access
248 */
249#define CONTROL_REG_ACCESS_NUM 0x7 /* 2:0, number of control reg.*/
250#define CONTROL_REG_ACCESS_TYPE 0x30 /* 5:4, access type */
251#define CONTROL_REG_ACCESS_REG 0xf00 /* 10:8, general purpose reg. */
252#define LMSW_SOURCE_DATA_SHIFT 16
253#define LMSW_SOURCE_DATA (0xFFFF << LMSW_SOURCE_DATA_SHIFT) /* 16:31 lmsw source */
254#define REG_EAX (0 << 8)
255#define REG_ECX (1 << 8)
256#define REG_EDX (2 << 8)
257#define REG_EBX (3 << 8)
258#define REG_ESP (4 << 8)
259#define REG_EBP (5 << 8)
260#define REG_ESI (6 << 8)
261#define REG_EDI (7 << 8)
262#define REG_R8 (8 << 8)
263#define REG_R9 (9 << 8)
264#define REG_R10 (10 << 8)
265#define REG_R11 (11 << 8)
266#define REG_R12 (12 << 8)
267#define REG_R13 (13 << 8)
268#define REG_R14 (14 << 8)
269#define REG_R15 (15 << 8)
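
/*
 * Illustrative sketch, not part of the original header: decoding a
 * MOV-to/from-CR exit qualification with the masks above.  For a guest
 * "mov %rbx, %cr3" the qualification carries CR number 3 in bits 2:0,
 * access type 0 (MOV to CR) in bits 5:4 and the source register REG_EBX
 * in bits 11:8.
 */
static inline int cr_access_source_reg_example(unsigned long qualification)
{
	/* returns 3 (== REG_EBX >> 8) for the example above */
	return (qualification & CONTROL_REG_ACCESS_REG) >> 8;
}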
270
271/*
272 * Exit Qualifications for MOV for Debug Register Access
273 */
274#define DEBUG_REG_ACCESS_NUM 0x7 /* 2:0, number of debug reg. */
275#define DEBUG_REG_ACCESS_TYPE 0x10 /* 4, direction of access */
276#define TYPE_MOV_TO_DR (0 << 4)
277#define TYPE_MOV_FROM_DR (1 << 4)
278#define DEBUG_REG_ACCESS_REG 0xf00 /* 11:8, general purpose reg. */
279
280
281/* segment AR */
282#define SEGMENT_AR_L_MASK (1 << 13)
283
284#define AR_TYPE_ACCESSES_MASK 1
285#define AR_TYPE_READABLE_MASK (1 << 1)
286#define AR_TYPE_WRITEABLE_MASK (1 << 2)
287#define AR_TYPE_CODE_MASK (1 << 3)
288#define AR_TYPE_MASK 0x0f
289#define AR_TYPE_BUSY_64_TSS 11
290#define AR_TYPE_BUSY_32_TSS 11
291#define AR_TYPE_BUSY_16_TSS 3
292#define AR_TYPE_LDT 2
293
294#define AR_UNUSABLE_MASK (1 << 16)
295#define AR_S_MASK (1 << 4)
296#define AR_P_MASK (1 << 7)
297#define AR_L_MASK (1 << 13)
298#define AR_DB_MASK (1 << 14)
299#define AR_G_MASK (1 << 15)
300#define AR_DPL_SHIFT 5
301#define AR_DPL(ar) (((ar) >> AR_DPL_SHIFT) & 3)
302
303#define AR_RESERVD_MASK 0xfffe0f00
304
305#define MSR_IA32_VMX_BASIC 0x480
306#define MSR_IA32_VMX_PINBASED_CTLS 0x481
307#define MSR_IA32_VMX_PROCBASED_CTLS 0x482
308#define MSR_IA32_VMX_EXIT_CTLS 0x483
309#define MSR_IA32_VMX_ENTRY_CTLS 0x484
310#define MSR_IA32_VMX_MISC 0x485
311#define MSR_IA32_VMX_CR0_FIXED0 0x486
312#define MSR_IA32_VMX_CR0_FIXED1 0x487
313#define MSR_IA32_VMX_CR4_FIXED0 0x488
314#define MSR_IA32_VMX_CR4_FIXED1 0x489
315#define MSR_IA32_VMX_VMCS_ENUM 0x48a
316#define MSR_IA32_VMX_PROCBASED_CTLS2 0x48b
317
318#define MSR_IA32_FEATURE_CONTROL 0x3a
319#define MSR_IA32_FEATURE_CONTROL_LOCKED 0x1
320#define MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED 0x4
321
322#define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT 9
323
324#endif
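
The VMCS field numbers in the enum above are not arbitrary: they follow the architectural encoding in which bit 0 selects full or high access, bits 9:1 carry the field index, bits 11:10 the field type (control, read-only exit information, guest state, host state) and bits 14:13 the field width. A minimal userspace sketch, not part of this patch, that decodes a few of the values listed above:

#include <stdio.h>

/* Decode a VMCS field encoding into its architectural components. */
static void decode_vmcs_field(unsigned long field)
{
	unsigned int access = field & 1;          /* 0 = full, 1 = high */
	unsigned int index  = (field >> 1) & 0x1ff;
	unsigned int type   = (field >> 10) & 3;  /* 1 = read-only, 2 = guest, 3 = host */
	unsigned int width  = (field >> 13) & 3;  /* 3 = natural width */

	printf("0x%05lx: access=%u index=%u type=%u width=%u\n",
	       field, access, index, type, width);
}

int main(void)
{
	decode_vmcs_field(0x6802);	/* GUEST_CR3 */
	decode_vmcs_field(0x6c00);	/* HOST_CR0 */
	decode_vmcs_field(0x6400);	/* EXIT_QUALIFICATION */
	return 0;
}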
diff --git a/drivers/kvm/x86.c b/drivers/kvm/x86.c
deleted file mode 100644
index b37c0093d728..000000000000
--- a/drivers/kvm/x86.c
+++ /dev/null
@@ -1,3148 +0,0 @@
1/*
2 * Kernel-based Virtual Machine driver for Linux
3 *
4 * derived from drivers/kvm/kvm_main.c
5 *
6 * Copyright (C) 2006 Qumranet, Inc.
7 *
8 * Authors:
9 * Avi Kivity <avi@qumranet.com>
10 * Yaniv Kamay <yaniv@qumranet.com>
11 *
12 * This work is licensed under the terms of the GNU GPL, version 2. See
13 * the COPYING file in the top-level directory.
14 *
15 */
16
17#include "kvm.h"
18#include "x86.h"
19#include "x86_emulate.h"
20#include "segment_descriptor.h"
21#include "irq.h"
22#include "mmu.h"
23
24#include <linux/kvm.h>
25#include <linux/fs.h>
26#include <linux/vmalloc.h>
27#include <linux/module.h>
28#include <linux/mman.h>
29#include <linux/highmem.h>
30
31#include <asm/uaccess.h>
32#include <asm/msr.h>
33
34#define MAX_IO_MSRS 256
35#define CR0_RESERVED_BITS \
36 (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
37 | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \
38 | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG))
39#define CR4_RESERVED_BITS \
40 (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\
41 | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \
42 | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR \
43 | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE))
44
45#define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
46#define EFER_RESERVED_BITS 0xfffffffffffff2fe
47
48#define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM
49#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
50
51struct kvm_x86_ops *kvm_x86_ops;
52
53struct kvm_stats_debugfs_item debugfs_entries[] = {
54 { "pf_fixed", VCPU_STAT(pf_fixed) },
55 { "pf_guest", VCPU_STAT(pf_guest) },
56 { "tlb_flush", VCPU_STAT(tlb_flush) },
57 { "invlpg", VCPU_STAT(invlpg) },
58 { "exits", VCPU_STAT(exits) },
59 { "io_exits", VCPU_STAT(io_exits) },
60 { "mmio_exits", VCPU_STAT(mmio_exits) },
61 { "signal_exits", VCPU_STAT(signal_exits) },
62 { "irq_window", VCPU_STAT(irq_window_exits) },
63 { "halt_exits", VCPU_STAT(halt_exits) },
64 { "halt_wakeup", VCPU_STAT(halt_wakeup) },
65 { "request_irq", VCPU_STAT(request_irq_exits) },
66 { "irq_exits", VCPU_STAT(irq_exits) },
67 { "host_state_reload", VCPU_STAT(host_state_reload) },
68 { "efer_reload", VCPU_STAT(efer_reload) },
69 { "fpu_reload", VCPU_STAT(fpu_reload) },
70 { "insn_emulation", VCPU_STAT(insn_emulation) },
71 { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) },
72 { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
73 { "mmu_pte_write", VM_STAT(mmu_pte_write) },
74 { "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
75 { "mmu_pde_zapped", VM_STAT(mmu_pde_zapped) },
76 { "mmu_flooded", VM_STAT(mmu_flooded) },
77 { "mmu_recycled", VM_STAT(mmu_recycled) },
78 { "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
79 { NULL }
80};
81
82
83unsigned long segment_base(u16 selector)
84{
85 struct descriptor_table gdt;
86 struct segment_descriptor *d;
87 unsigned long table_base;
88 unsigned long v;
89
90 if (selector == 0)
91 return 0;
92
93 asm("sgdt %0" : "=m"(gdt));
94 table_base = gdt.base;
95
96 if (selector & 4) { /* from ldt */
97 u16 ldt_selector;
98
99 asm("sldt %0" : "=g"(ldt_selector));
100 table_base = segment_base(ldt_selector);
101 }
102 d = (struct segment_descriptor *)(table_base + (selector & ~7));
103 v = d->base_low | ((unsigned long)d->base_mid << 16) |
104 ((unsigned long)d->base_high << 24);
105#ifdef CONFIG_X86_64
106 if (d->system == 0 && (d->type == 2 || d->type == 9 || d->type == 11))
107 v |= ((unsigned long) \
108 ((struct segment_descriptor_64 *)d)->base_higher) << 32;
109#endif
110 return v;
111}
112EXPORT_SYMBOL_GPL(segment_base);
113
114u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
115{
116 if (irqchip_in_kernel(vcpu->kvm))
117 return vcpu->arch.apic_base;
118 else
119 return vcpu->arch.apic_base;
120}
121EXPORT_SYMBOL_GPL(kvm_get_apic_base);
122
123void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data)
124{
125 /* TODO: reserved bits check */
126 if (irqchip_in_kernel(vcpu->kvm))
127 kvm_lapic_set_base(vcpu, data);
128 else
129 vcpu->arch.apic_base = data;
130}
131EXPORT_SYMBOL_GPL(kvm_set_apic_base);
132
133void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)
134{
135 WARN_ON(vcpu->arch.exception.pending);
136 vcpu->arch.exception.pending = true;
137 vcpu->arch.exception.has_error_code = false;
138 vcpu->arch.exception.nr = nr;
139}
140EXPORT_SYMBOL_GPL(kvm_queue_exception);
141
142void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr,
143 u32 error_code)
144{
145 ++vcpu->stat.pf_guest;
146 if (vcpu->arch.exception.pending && vcpu->arch.exception.nr == PF_VECTOR) {
147 printk(KERN_DEBUG "kvm: inject_page_fault:"
148 " double fault 0x%lx\n", addr);
149 vcpu->arch.exception.nr = DF_VECTOR;
150 vcpu->arch.exception.error_code = 0;
151 return;
152 }
153 vcpu->arch.cr2 = addr;
154 kvm_queue_exception_e(vcpu, PF_VECTOR, error_code);
155}
156
157void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
158{
159 WARN_ON(vcpu->arch.exception.pending);
160 vcpu->arch.exception.pending = true;
161 vcpu->arch.exception.has_error_code = true;
162 vcpu->arch.exception.nr = nr;
163 vcpu->arch.exception.error_code = error_code;
164}
165EXPORT_SYMBOL_GPL(kvm_queue_exception_e);
166
167static void __queue_exception(struct kvm_vcpu *vcpu)
168{
169 kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr,
170 vcpu->arch.exception.has_error_code,
171 vcpu->arch.exception.error_code);
172}
173
174/*
175 * Load the pae pdptrs. Return true if they are all valid.
176 */
177int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
178{
179 gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
180 unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2;
181 int i;
182 int ret;
183 u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)];
184
185 mutex_lock(&vcpu->kvm->lock);
186 ret = kvm_read_guest_page(vcpu->kvm, pdpt_gfn, pdpte,
187 offset * sizeof(u64), sizeof(pdpte));
188 if (ret < 0) {
189 ret = 0;
190 goto out;
191 }
192 for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
193 if ((pdpte[i] & 1) && (pdpte[i] & 0xfffffff0000001e6ull)) {
194 ret = 0;
195 goto out;
196 }
197 }
198 ret = 1;
199
200 memcpy(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs));
201out:
202 mutex_unlock(&vcpu->kvm->lock);
203
204 return ret;
205}
206
207static bool pdptrs_changed(struct kvm_vcpu *vcpu)
208{
209 u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)];
210 bool changed = true;
211 int r;
212
213 if (is_long_mode(vcpu) || !is_pae(vcpu))
214 return false;
215
216 mutex_lock(&vcpu->kvm->lock);
217 r = kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u, pdpte, sizeof(pdpte));
218 if (r < 0)
219 goto out;
220 changed = memcmp(pdpte, vcpu->arch.pdptrs, sizeof(pdpte)) != 0;
221out:
222 mutex_unlock(&vcpu->kvm->lock);
223
224 return changed;
225}
226
227void set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
228{
229 if (cr0 & CR0_RESERVED_BITS) {
230 printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n",
231 cr0, vcpu->arch.cr0);
232 kvm_inject_gp(vcpu, 0);
233 return;
234 }
235
236 if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) {
237 printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n");
238 kvm_inject_gp(vcpu, 0);
239 return;
240 }
241
242 if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) {
243 printk(KERN_DEBUG "set_cr0: #GP, set PG flag "
244 "and a clear PE flag\n");
245 kvm_inject_gp(vcpu, 0);
246 return;
247 }
248
249 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
250#ifdef CONFIG_X86_64
251 if ((vcpu->arch.shadow_efer & EFER_LME)) {
252 int cs_db, cs_l;
253
254 if (!is_pae(vcpu)) {
255 printk(KERN_DEBUG "set_cr0: #GP, start paging "
256 "in long mode while PAE is disabled\n");
257 kvm_inject_gp(vcpu, 0);
258 return;
259 }
260 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
261 if (cs_l) {
262 printk(KERN_DEBUG "set_cr0: #GP, start paging "
263 "in long mode while CS.L == 1\n");
264 kvm_inject_gp(vcpu, 0);
265 return;
266
267 }
268 } else
269#endif
270 if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3)) {
271 printk(KERN_DEBUG "set_cr0: #GP, pdptrs "
272 "reserved bits\n");
273 kvm_inject_gp(vcpu, 0);
274 return;
275 }
276
277 }
278
279 kvm_x86_ops->set_cr0(vcpu, cr0);
280 vcpu->arch.cr0 = cr0;
281
282 mutex_lock(&vcpu->kvm->lock);
283 kvm_mmu_reset_context(vcpu);
284 mutex_unlock(&vcpu->kvm->lock);
285 return;
286}
287EXPORT_SYMBOL_GPL(set_cr0);
288
289void lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
290{
291 set_cr0(vcpu, (vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f));
292}
293EXPORT_SYMBOL_GPL(lmsw);
294
295void set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
296{
297 if (cr4 & CR4_RESERVED_BITS) {
298 printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n");
299 kvm_inject_gp(vcpu, 0);
300 return;
301 }
302
303 if (is_long_mode(vcpu)) {
304 if (!(cr4 & X86_CR4_PAE)) {
305 printk(KERN_DEBUG "set_cr4: #GP, clearing PAE while "
306 "in long mode\n");
307 kvm_inject_gp(vcpu, 0);
308 return;
309 }
310 } else if (is_paging(vcpu) && !is_pae(vcpu) && (cr4 & X86_CR4_PAE)
311 && !load_pdptrs(vcpu, vcpu->arch.cr3)) {
312 printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n");
313 kvm_inject_gp(vcpu, 0);
314 return;
315 }
316
317 if (cr4 & X86_CR4_VMXE) {
318 printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n");
319 kvm_inject_gp(vcpu, 0);
320 return;
321 }
322 kvm_x86_ops->set_cr4(vcpu, cr4);
323 vcpu->arch.cr4 = cr4;
324 mutex_lock(&vcpu->kvm->lock);
325 kvm_mmu_reset_context(vcpu);
326 mutex_unlock(&vcpu->kvm->lock);
327}
328EXPORT_SYMBOL_GPL(set_cr4);
329
330void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
331{
332 if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) {
333 kvm_mmu_flush_tlb(vcpu);
334 return;
335 }
336
337 if (is_long_mode(vcpu)) {
338 if (cr3 & CR3_L_MODE_RESERVED_BITS) {
339 printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n");
340 kvm_inject_gp(vcpu, 0);
341 return;
342 }
343 } else {
344 if (is_pae(vcpu)) {
345 if (cr3 & CR3_PAE_RESERVED_BITS) {
346 printk(KERN_DEBUG
347 "set_cr3: #GP, reserved bits\n");
348 kvm_inject_gp(vcpu, 0);
349 return;
350 }
351 if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) {
352 printk(KERN_DEBUG "set_cr3: #GP, pdptrs "
353 "reserved bits\n");
354 kvm_inject_gp(vcpu, 0);
355 return;
356 }
357 }
358 /*
359 * We don't check reserved bits in nonpae mode, because
360 * this isn't enforced, and VMware depends on this.
361 */
362 }
363
364 mutex_lock(&vcpu->kvm->lock);
365 /*
366 * Does the new cr3 value map to physical memory? (Note, we
367 * catch an invalid cr3 even in real-mode, because it would
368 * cause trouble later on when we turn on paging anyway.)
369 *
370 * A real CPU would silently accept an invalid cr3 and would
371 * attempt to use it - with largely undefined (and often hard
372 * to debug) behavior on the guest side.
373 */
374 if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT)))
375 kvm_inject_gp(vcpu, 0);
376 else {
377 vcpu->arch.cr3 = cr3;
378 vcpu->arch.mmu.new_cr3(vcpu);
379 }
380 mutex_unlock(&vcpu->kvm->lock);
381}
382EXPORT_SYMBOL_GPL(set_cr3);
383
384void set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
385{
386 if (cr8 & CR8_RESERVED_BITS) {
387 printk(KERN_DEBUG "set_cr8: #GP, reserved bits 0x%lx\n", cr8);
388 kvm_inject_gp(vcpu, 0);
389 return;
390 }
391 if (irqchip_in_kernel(vcpu->kvm))
392 kvm_lapic_set_tpr(vcpu, cr8);
393 else
394 vcpu->arch.cr8 = cr8;
395}
396EXPORT_SYMBOL_GPL(set_cr8);
397
398unsigned long get_cr8(struct kvm_vcpu *vcpu)
399{
400 if (irqchip_in_kernel(vcpu->kvm))
401 return kvm_lapic_get_cr8(vcpu);
402 else
403 return vcpu->arch.cr8;
404}
405EXPORT_SYMBOL_GPL(get_cr8);
406
407/*
408 * List of msr numbers which we expose to userspace through KVM_GET_MSRS,
409 * KVM_SET_MSRS and KVM_GET_MSR_INDEX_LIST.
410 *
411 * This list is modified at module load time to reflect the
412 * capabilities of the host cpu.
413 */
414static u32 msrs_to_save[] = {
415 MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
416 MSR_K6_STAR,
417#ifdef CONFIG_X86_64
418 MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
419#endif
420 MSR_IA32_TIME_STAMP_COUNTER,
421};
422
423static unsigned num_msrs_to_save;
424
425static u32 emulated_msrs[] = {
426 MSR_IA32_MISC_ENABLE,
427};
428
429#ifdef CONFIG_X86_64
430
431static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
432{
433 if (efer & EFER_RESERVED_BITS) {
434 printk(KERN_DEBUG "set_efer: 0x%llx #GP, reserved bits\n",
435 efer);
436 kvm_inject_gp(vcpu, 0);
437 return;
438 }
439
440 if (is_paging(vcpu)
441 && (vcpu->arch.shadow_efer & EFER_LME) != (efer & EFER_LME)) {
442 printk(KERN_DEBUG "set_efer: #GP, change LME while paging\n");
443 kvm_inject_gp(vcpu, 0);
444 return;
445 }
446
447 kvm_x86_ops->set_efer(vcpu, efer);
448
449 efer &= ~EFER_LMA;
450 efer |= vcpu->arch.shadow_efer & EFER_LMA;
451
452 vcpu->arch.shadow_efer = efer;
453}
454
455#endif
456
457/*
458 * Writes msr value into the appropriate "register".
459 * Returns 0 on success, non-0 otherwise.
460 * Assumes vcpu_load() was already called.
461 */
462int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
463{
464 return kvm_x86_ops->set_msr(vcpu, msr_index, data);
465}
466
467/*
468 * Adapt set_msr() to msr_io()'s calling convention
469 */
470static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
471{
472 return kvm_set_msr(vcpu, index, *data);
473}
474
475
476int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
477{
478 switch (msr) {
479#ifdef CONFIG_X86_64
480 case MSR_EFER:
481 set_efer(vcpu, data);
482 break;
483#endif
484 case MSR_IA32_MC0_STATUS:
485 pr_unimpl(vcpu, "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n",
486 __FUNCTION__, data);
487 break;
488 case MSR_IA32_MCG_STATUS:
489 pr_unimpl(vcpu, "%s: MSR_IA32_MCG_STATUS 0x%llx, nop\n",
490 __FUNCTION__, data);
491 break;
492 case MSR_IA32_UCODE_REV:
493 case MSR_IA32_UCODE_WRITE:
494 case 0x200 ... 0x2ff: /* MTRRs */
495 break;
496 case MSR_IA32_APICBASE:
497 kvm_set_apic_base(vcpu, data);
498 break;
499 case MSR_IA32_MISC_ENABLE:
500 vcpu->arch.ia32_misc_enable_msr = data;
501 break;
502 default:
503 pr_unimpl(vcpu, "unhandled wrmsr: 0x%x\n", msr);
504 return 1;
505 }
506 return 0;
507}
508EXPORT_SYMBOL_GPL(kvm_set_msr_common);
509
510
511/*
512 * Reads an msr value (of 'msr_index') into 'pdata'.
513 * Returns 0 on success, non-0 otherwise.
514 * Assumes vcpu_load() was already called.
515 */
516int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
517{
518 return kvm_x86_ops->get_msr(vcpu, msr_index, pdata);
519}
520
521int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
522{
523 u64 data;
524
525 switch (msr) {
526 case 0xc0010010: /* SYSCFG */
527 case 0xc0010015: /* HWCR */
528 case MSR_IA32_PLATFORM_ID:
529 case MSR_IA32_P5_MC_ADDR:
530 case MSR_IA32_P5_MC_TYPE:
531 case MSR_IA32_MC0_CTL:
532 case MSR_IA32_MCG_STATUS:
533 case MSR_IA32_MCG_CAP:
534 case MSR_IA32_MC0_MISC:
535 case MSR_IA32_MC0_MISC+4:
536 case MSR_IA32_MC0_MISC+8:
537 case MSR_IA32_MC0_MISC+12:
538 case MSR_IA32_MC0_MISC+16:
539 case MSR_IA32_UCODE_REV:
540 case MSR_IA32_PERF_STATUS:
541 case MSR_IA32_EBL_CR_POWERON:
542 /* MTRR registers */
543 case 0xfe:
544 case 0x200 ... 0x2ff:
545 data = 0;
546 break;
547 case 0xcd: /* fsb frequency */
548 data = 3;
549 break;
550 case MSR_IA32_APICBASE:
551 data = kvm_get_apic_base(vcpu);
552 break;
553 case MSR_IA32_MISC_ENABLE:
554 data = vcpu->arch.ia32_misc_enable_msr;
555 break;
556#ifdef CONFIG_X86_64
557 case MSR_EFER:
558 data = vcpu->arch.shadow_efer;
559 break;
560#endif
561 default:
562 pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
563 return 1;
564 }
565 *pdata = data;
566 return 0;
567}
568EXPORT_SYMBOL_GPL(kvm_get_msr_common);
569
570/*
571 * Read or write a bunch of msrs. All parameters are kernel addresses.
572 *
573 * @return number of msrs set successfully.
574 */
575static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,
576 struct kvm_msr_entry *entries,
577 int (*do_msr)(struct kvm_vcpu *vcpu,
578 unsigned index, u64 *data))
579{
580 int i;
581
582 vcpu_load(vcpu);
583
584 for (i = 0; i < msrs->nmsrs; ++i)
585 if (do_msr(vcpu, entries[i].index, &entries[i].data))
586 break;
587
588 vcpu_put(vcpu);
589
590 return i;
591}
592
593/*
594 * Read or write a bunch of msrs. Parameters are user addresses.
595 *
596 * @return number of msrs set successfully.
597 */
598static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs,
599 int (*do_msr)(struct kvm_vcpu *vcpu,
600 unsigned index, u64 *data),
601 int writeback)
602{
603 struct kvm_msrs msrs;
604 struct kvm_msr_entry *entries;
605 int r, n;
606 unsigned size;
607
608 r = -EFAULT;
609 if (copy_from_user(&msrs, user_msrs, sizeof msrs))
610 goto out;
611
612 r = -E2BIG;
613 if (msrs.nmsrs >= MAX_IO_MSRS)
614 goto out;
615
616 r = -ENOMEM;
617 size = sizeof(struct kvm_msr_entry) * msrs.nmsrs;
618 entries = vmalloc(size);
619 if (!entries)
620 goto out;
621
622 r = -EFAULT;
623 if (copy_from_user(entries, user_msrs->entries, size))
624 goto out_free;
625
626 r = n = __msr_io(vcpu, &msrs, entries, do_msr);
627 if (r < 0)
628 goto out_free;
629
630 r = -EFAULT;
631 if (writeback && copy_to_user(user_msrs->entries, entries, size))
632 goto out_free;
633
634 r = n;
635
636out_free:
637 vfree(entries);
638out:
639 return r;
640}
641
642/*
643 * Make sure that a cpu that is being hot-unplugged does not have any vcpus
644 * cached on it.
645 */
646void decache_vcpus_on_cpu(int cpu)
647{
648 struct kvm *vm;
649 struct kvm_vcpu *vcpu;
650 int i;
651
652 spin_lock(&kvm_lock);
653 list_for_each_entry(vm, &vm_list, vm_list)
654 for (i = 0; i < KVM_MAX_VCPUS; ++i) {
655 vcpu = vm->vcpus[i];
656 if (!vcpu)
657 continue;
658 /*
659 * If the vcpu is locked, then it is running on some
660 * other cpu and therefore it is not cached on the
661 * cpu in question.
662 *
663 * If it's not locked, check the last cpu it executed
664 * on.
665 */
666 if (mutex_trylock(&vcpu->mutex)) {
667 if (vcpu->cpu == cpu) {
668 kvm_x86_ops->vcpu_decache(vcpu);
669 vcpu->cpu = -1;
670 }
671 mutex_unlock(&vcpu->mutex);
672 }
673 }
674 spin_unlock(&kvm_lock);
675}
676
677int kvm_dev_ioctl_check_extension(long ext)
678{
679 int r;
680
681 switch (ext) {
682 case KVM_CAP_IRQCHIP:
683 case KVM_CAP_HLT:
684 case KVM_CAP_MMU_SHADOW_CACHE_CONTROL:
685 case KVM_CAP_USER_MEMORY:
686 case KVM_CAP_SET_TSS_ADDR:
687 case KVM_CAP_EXT_CPUID:
688 r = 1;
689 break;
690 default:
691 r = 0;
692 break;
693 }
694 return r;
695
696}
697
698long kvm_arch_dev_ioctl(struct file *filp,
699 unsigned int ioctl, unsigned long arg)
700{
701 void __user *argp = (void __user *)arg;
702 long r;
703
704 switch (ioctl) {
705 case KVM_GET_MSR_INDEX_LIST: {
706 struct kvm_msr_list __user *user_msr_list = argp;
707 struct kvm_msr_list msr_list;
708 unsigned n;
709
710 r = -EFAULT;
711 if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list))
712 goto out;
713 n = msr_list.nmsrs;
714 msr_list.nmsrs = num_msrs_to_save + ARRAY_SIZE(emulated_msrs);
715 if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list))
716 goto out;
717 r = -E2BIG;
718 if (n < num_msrs_to_save)
719 goto out;
720 r = -EFAULT;
721 if (copy_to_user(user_msr_list->indices, &msrs_to_save,
722 num_msrs_to_save * sizeof(u32)))
723 goto out;
724 if (copy_to_user(user_msr_list->indices
725 + num_msrs_to_save * sizeof(u32),
726 &emulated_msrs,
727 ARRAY_SIZE(emulated_msrs) * sizeof(u32)))
728 goto out;
729 r = 0;
730 break;
731 }
732 default:
733 r = -EINVAL;
734 }
735out:
736 return r;
737}
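
/*
 * Illustrative userspace sketch, not part of the original file: the
 * KVM_GET_MSR_INDEX_LIST handler above copies struct kvm_msr_list back
 * with the real count before checking the caller's buffer, so a VMM can
 * probe with nmsrs == 0, allocate, and retry.  kvm_fd is the /dev/kvm
 * file descriptor.
 */
#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static struct kvm_msr_list *get_msr_index_list(int kvm_fd)
{
	struct kvm_msr_list probe = { .nmsrs = 0 };
	struct kvm_msr_list *list;

	/* First call fails with E2BIG but reports the required nmsrs. */
	ioctl(kvm_fd, KVM_GET_MSR_INDEX_LIST, &probe);

	list = malloc(sizeof(*list) + probe.nmsrs * sizeof(__u32));
	if (!list)
		return NULL;
	list->nmsrs = probe.nmsrs;
	if (ioctl(kvm_fd, KVM_GET_MSR_INDEX_LIST, list) < 0) {
		free(list);
		return NULL;
	}
	return list;
}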
738
739void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
740{
741 kvm_x86_ops->vcpu_load(vcpu, cpu);
742}
743
744void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
745{
746 kvm_x86_ops->vcpu_put(vcpu);
747 kvm_put_guest_fpu(vcpu);
748}
749
750static int is_efer_nx(void)
751{
752 u64 efer;
753
754 rdmsrl(MSR_EFER, efer);
755 return efer & EFER_NX;
756}
757
758static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu)
759{
760 int i;
761 struct kvm_cpuid_entry2 *e, *entry;
762
763 entry = NULL;
764 for (i = 0; i < vcpu->arch.cpuid_nent; ++i) {
765 e = &vcpu->arch.cpuid_entries[i];
766 if (e->function == 0x80000001) {
767 entry = e;
768 break;
769 }
770 }
771 if (entry && (entry->edx & (1 << 20)) && !is_efer_nx()) {
772 entry->edx &= ~(1 << 20);
773 printk(KERN_INFO "kvm: guest NX capability removed\n");
774 }
775}
776
777/* for when an old userspace process supplies the legacy cpuid format to a new kernel module */
778static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
779 struct kvm_cpuid *cpuid,
780 struct kvm_cpuid_entry __user *entries)
781{
782 int r, i;
783 struct kvm_cpuid_entry *cpuid_entries;
784
785 r = -E2BIG;
786 if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
787 goto out;
788 r = -ENOMEM;
789 cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry) * cpuid->nent);
790 if (!cpuid_entries)
791 goto out;
792 r = -EFAULT;
793 if (copy_from_user(cpuid_entries, entries,
794 cpuid->nent * sizeof(struct kvm_cpuid_entry)))
795 goto out_free;
796 for (i = 0; i < cpuid->nent; i++) {
797 vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function;
798 vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax;
799 vcpu->arch.cpuid_entries[i].ebx = cpuid_entries[i].ebx;
800 vcpu->arch.cpuid_entries[i].ecx = cpuid_entries[i].ecx;
801 vcpu->arch.cpuid_entries[i].edx = cpuid_entries[i].edx;
802 vcpu->arch.cpuid_entries[i].index = 0;
803 vcpu->arch.cpuid_entries[i].flags = 0;
804 vcpu->arch.cpuid_entries[i].padding[0] = 0;
805 vcpu->arch.cpuid_entries[i].padding[1] = 0;
806 vcpu->arch.cpuid_entries[i].padding[2] = 0;
807 }
808 vcpu->arch.cpuid_nent = cpuid->nent;
809 cpuid_fix_nx_cap(vcpu);
810 r = 0;
811
812out_free:
813 vfree(cpuid_entries);
814out:
815 return r;
816}
817
818static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
819 struct kvm_cpuid2 *cpuid,
820 struct kvm_cpuid_entry2 __user *entries)
821{
822 int r;
823
824 r = -E2BIG;
825 if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
826 goto out;
827 r = -EFAULT;
828 if (copy_from_user(&vcpu->arch.cpuid_entries, entries,
829 cpuid->nent * sizeof(struct kvm_cpuid_entry2)))
830 goto out;
831 vcpu->arch.cpuid_nent = cpuid->nent;
832 return 0;
833
834out:
835 return r;
836}
837
838static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
839 struct kvm_cpuid2 *cpuid,
840 struct kvm_cpuid_entry2 __user *entries)
841{
842 int r;
843
844 r = -E2BIG;
845 if (cpuid->nent < vcpu->arch.cpuid_nent)
846 goto out;
847 r = -EFAULT;
848 if (copy_to_user(entries, &vcpu->arch.cpuid_entries,
849 vcpu->arch.cpuid_nent * sizeof(struct kvm_cpuid_entry2)))
850 goto out;
851 return 0;
852
853out:
854 cpuid->nent = vcpu->arch.cpuid_nent;
855 return r;
856}
857
858static inline u32 bit(int bitno)
859{
860 return 1 << (bitno & 31);
861}
862
863static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function,
864 u32 index)
865{
866 entry->function = function;
867 entry->index = index;
868 cpuid_count(entry->function, entry->index,
869 &entry->eax, &entry->ebx, &entry->ecx, &entry->edx);
870 entry->flags = 0;
871}
872
873static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
874 u32 index, int *nent, int maxnent)
875{
876 const u32 kvm_supported_word0_x86_features = bit(X86_FEATURE_FPU) |
877 bit(X86_FEATURE_VME) | bit(X86_FEATURE_DE) |
878 bit(X86_FEATURE_PSE) | bit(X86_FEATURE_TSC) |
879 bit(X86_FEATURE_MSR) | bit(X86_FEATURE_PAE) |
880 bit(X86_FEATURE_CX8) | bit(X86_FEATURE_APIC) |
881 bit(X86_FEATURE_SEP) | bit(X86_FEATURE_PGE) |
882 bit(X86_FEATURE_CMOV) | bit(X86_FEATURE_PSE36) |
883 bit(X86_FEATURE_CLFLSH) | bit(X86_FEATURE_MMX) |
884 bit(X86_FEATURE_FXSR) | bit(X86_FEATURE_XMM) |
885 bit(X86_FEATURE_XMM2) | bit(X86_FEATURE_SELFSNOOP);
886 const u32 kvm_supported_word1_x86_features = bit(X86_FEATURE_FPU) |
887 bit(X86_FEATURE_VME) | bit(X86_FEATURE_DE) |
888 bit(X86_FEATURE_PSE) | bit(X86_FEATURE_TSC) |
889 bit(X86_FEATURE_MSR) | bit(X86_FEATURE_PAE) |
890 bit(X86_FEATURE_CX8) | bit(X86_FEATURE_APIC) |
891 bit(X86_FEATURE_PGE) |
892 bit(X86_FEATURE_CMOV) | bit(X86_FEATURE_PSE36) |
893 bit(X86_FEATURE_MMX) | bit(X86_FEATURE_FXSR) |
894 bit(X86_FEATURE_SYSCALL) |
895 (bit(X86_FEATURE_NX) && is_efer_nx()) |
896#ifdef CONFIG_X86_64
897 bit(X86_FEATURE_LM) |
898#endif
899 bit(X86_FEATURE_MMXEXT) |
900 bit(X86_FEATURE_3DNOWEXT) |
901 bit(X86_FEATURE_3DNOW);
902 const u32 kvm_supported_word3_x86_features =
903 bit(X86_FEATURE_XMM3) | bit(X86_FEATURE_CX16);
904 const u32 kvm_supported_word6_x86_features =
905 bit(X86_FEATURE_LAHF_LM) | bit(X86_FEATURE_CMP_LEGACY);
906
907 /* all function 2 cpuid_count() calls should be made on the same cpu */
908 get_cpu();
909 do_cpuid_1_ent(entry, function, index);
910 ++*nent;
911
912 switch (function) {
913 case 0:
914 entry->eax = min(entry->eax, (u32)0xb);
915 break;
916 case 1:
917 entry->edx &= kvm_supported_word0_x86_features;
918 entry->ecx &= kvm_supported_word3_x86_features;
919 break;
920 /* function 2 entries are STATEFUL. That is, repeated cpuid commands
921 * may return different values. This forces us to get_cpu() before
922 * issuing the first command, and also to emulate this annoying behavior
923 * in kvm_emulate_cpuid() using KVM_CPUID_FLAG_STATE_READ_NEXT */
924 case 2: {
925 int t, times = entry->eax & 0xff;
926
927 entry->flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
928 for (t = 1; t < times && *nent < maxnent; ++t) {
929 do_cpuid_1_ent(&entry[t], function, 0);
930 entry[t].flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
931 ++*nent;
932 }
933 break;
934 }
935 /* function 4 and 0xb have additional index. */
936 case 4: {
937 int index, cache_type;
938
939 entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
940 /* read more entries until cache_type is zero */
941 for (index = 1; *nent < maxnent; ++index) {
942 cache_type = entry[index - 1].eax & 0x1f;
943 if (!cache_type)
944 break;
945 do_cpuid_1_ent(&entry[index], function, index);
946 entry[index].flags |=
947 KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
948 ++*nent;
949 }
950 break;
951 }
952 case 0xb: {
953 int index, level_type;
954
955 entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
956 /* read more entries until level_type is zero */
957 for (index = 1; *nent < maxnent; ++index) {
958 level_type = entry[index - 1].ecx & 0xff;
959 if (!level_type)
960 break;
961 do_cpuid_1_ent(&entry[index], function, index);
962 entry[index].flags |=
963 KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
964 ++*nent;
965 }
966 break;
967 }
968 case 0x80000000:
969 entry->eax = min(entry->eax, 0x8000001a);
970 break;
971 case 0x80000001:
972 entry->edx &= kvm_supported_word1_x86_features;
973 entry->ecx &= kvm_supported_word6_x86_features;
974 break;
975 }
976 put_cpu();
977}
978
979static int kvm_vm_ioctl_get_supported_cpuid(struct kvm *kvm,
980 struct kvm_cpuid2 *cpuid,
981 struct kvm_cpuid_entry2 __user *entries)
982{
983 struct kvm_cpuid_entry2 *cpuid_entries;
984 int limit, nent = 0, r = -E2BIG;
985 u32 func;
986
987 if (cpuid->nent < 1)
988 goto out;
989 r = -ENOMEM;
990 cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry2) * cpuid->nent);
991 if (!cpuid_entries)
992 goto out;
993
994 do_cpuid_ent(&cpuid_entries[0], 0, 0, &nent, cpuid->nent);
995 limit = cpuid_entries[0].eax;
996 for (func = 1; func <= limit && nent < cpuid->nent; ++func)
997 do_cpuid_ent(&cpuid_entries[nent], func, 0,
998 &nent, cpuid->nent);
999 r = -E2BIG;
1000 if (nent >= cpuid->nent)
1001 goto out_free;
1002
1003 do_cpuid_ent(&cpuid_entries[nent], 0x80000000, 0, &nent, cpuid->nent);
1004 limit = cpuid_entries[nent - 1].eax;
1005 for (func = 0x80000001; func <= limit && nent < cpuid->nent; ++func)
1006 do_cpuid_ent(&cpuid_entries[nent], func, 0,
1007 &nent, cpuid->nent);
1008 r = -EFAULT;
1009 if (copy_to_user(entries, cpuid_entries,
1010 nent * sizeof(struct kvm_cpuid_entry2)))
1011 goto out_free;
1012 cpuid->nent = nent;
1013 r = 0;
1014
1015out_free:
1016 vfree(cpuid_entries);
1017out:
1018 return r;
1019}
1020
1021static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
1022 struct kvm_lapic_state *s)
1023{
1024 vcpu_load(vcpu);
1025 memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s);
1026 vcpu_put(vcpu);
1027
1028 return 0;
1029}
1030
1031static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
1032 struct kvm_lapic_state *s)
1033{
1034 vcpu_load(vcpu);
1035 memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s);
1036 kvm_apic_post_state_restore(vcpu);
1037 vcpu_put(vcpu);
1038
1039 return 0;
1040}
1041
1042static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
1043 struct kvm_interrupt *irq)
1044{
1045 if (irq->irq < 0 || irq->irq >= 256)
1046 return -EINVAL;
1047 if (irqchip_in_kernel(vcpu->kvm))
1048 return -ENXIO;
1049 vcpu_load(vcpu);
1050
1051 set_bit(irq->irq, vcpu->arch.irq_pending);
1052 set_bit(irq->irq / BITS_PER_LONG, &vcpu->arch.irq_summary);
1053
1054 vcpu_put(vcpu);
1055
1056 return 0;
1057}
1058
1059long kvm_arch_vcpu_ioctl(struct file *filp,
1060 unsigned int ioctl, unsigned long arg)
1061{
1062 struct kvm_vcpu *vcpu = filp->private_data;
1063 void __user *argp = (void __user *)arg;
1064 int r;
1065
1066 switch (ioctl) {
1067 case KVM_GET_LAPIC: {
1068 struct kvm_lapic_state lapic;
1069
1070 memset(&lapic, 0, sizeof lapic);
1071 r = kvm_vcpu_ioctl_get_lapic(vcpu, &lapic);
1072 if (r)
1073 goto out;
1074 r = -EFAULT;
1075 if (copy_to_user(argp, &lapic, sizeof lapic))
1076 goto out;
1077 r = 0;
1078 break;
1079 }
1080 case KVM_SET_LAPIC: {
1081 struct kvm_lapic_state lapic;
1082
1083 r = -EFAULT;
1084 if (copy_from_user(&lapic, argp, sizeof lapic))
1085 goto out;
1086 r = kvm_vcpu_ioctl_set_lapic(vcpu, &lapic);
1087 if (r)
1088 goto out;
1089 r = 0;
1090 break;
1091 }
1092 case KVM_INTERRUPT: {
1093 struct kvm_interrupt irq;
1094
1095 r = -EFAULT;
1096 if (copy_from_user(&irq, argp, sizeof irq))
1097 goto out;
1098 r = kvm_vcpu_ioctl_interrupt(vcpu, &irq);
1099 if (r)
1100 goto out;
1101 r = 0;
1102 break;
1103 }
1104 case KVM_SET_CPUID: {
1105 struct kvm_cpuid __user *cpuid_arg = argp;
1106 struct kvm_cpuid cpuid;
1107
1108 r = -EFAULT;
1109 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
1110 goto out;
1111 r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries);
1112 if (r)
1113 goto out;
1114 break;
1115 }
1116 case KVM_SET_CPUID2: {
1117 struct kvm_cpuid2 __user *cpuid_arg = argp;
1118 struct kvm_cpuid2 cpuid;
1119
1120 r = -EFAULT;
1121 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
1122 goto out;
1123 r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid,
1124 cpuid_arg->entries);
1125 if (r)
1126 goto out;
1127 break;
1128 }
1129 case KVM_GET_CPUID2: {
1130 struct kvm_cpuid2 __user *cpuid_arg = argp;
1131 struct kvm_cpuid2 cpuid;
1132
1133 r = -EFAULT;
1134 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
1135 goto out;
1136 r = kvm_vcpu_ioctl_get_cpuid2(vcpu, &cpuid,
1137 cpuid_arg->entries);
1138 if (r)
1139 goto out;
1140 r = -EFAULT;
1141 if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid))
1142 goto out;
1143 r = 0;
1144 break;
1145 }
1146 case KVM_GET_MSRS:
1147 r = msr_io(vcpu, argp, kvm_get_msr, 1);
1148 break;
1149 case KVM_SET_MSRS:
1150 r = msr_io(vcpu, argp, do_set_msr, 0);
1151 break;
1152 default:
1153 r = -EINVAL;
1154 }
1155out:
1156 return r;
1157}
1158
1159static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr)
1160{
1161 int ret;
1162
1163 if (addr > (unsigned int)(-3 * PAGE_SIZE))
1164 return -1;
1165 ret = kvm_x86_ops->set_tss_addr(kvm, addr);
1166 return ret;
1167}
1168
1169static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
1170 u32 kvm_nr_mmu_pages)
1171{
1172 if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES)
1173 return -EINVAL;
1174
1175 mutex_lock(&kvm->lock);
1176
1177 kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages);
1178 kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages;
1179
1180 mutex_unlock(&kvm->lock);
1181 return 0;
1182}
1183
1184static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm)
1185{
1186 return kvm->arch.n_alloc_mmu_pages;
1187}
1188
1189gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
1190{
1191 int i;
1192 struct kvm_mem_alias *alias;
1193
1194 for (i = 0; i < kvm->arch.naliases; ++i) {
1195 alias = &kvm->arch.aliases[i];
1196 if (gfn >= alias->base_gfn
1197 && gfn < alias->base_gfn + alias->npages)
1198 return alias->target_gfn + gfn - alias->base_gfn;
1199 }
1200 return gfn;
1201}
1202
1203/*
1204 * Set a new alias region. Aliases map a portion of physical memory into
1205 * another portion. This is useful for memory windows, for example the PC
1206 * VGA region.
1207 */
1208static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm,
1209 struct kvm_memory_alias *alias)
1210{
1211 int r, n;
1212 struct kvm_mem_alias *p;
1213
1214 r = -EINVAL;
1215 /* General sanity checks */
1216 if (alias->memory_size & (PAGE_SIZE - 1))
1217 goto out;
1218 if (alias->guest_phys_addr & (PAGE_SIZE - 1))
1219 goto out;
1220 if (alias->slot >= KVM_ALIAS_SLOTS)
1221 goto out;
1222 if (alias->guest_phys_addr + alias->memory_size
1223 < alias->guest_phys_addr)
1224 goto out;
1225 if (alias->target_phys_addr + alias->memory_size
1226 < alias->target_phys_addr)
1227 goto out;
1228
1229 mutex_lock(&kvm->lock);
1230
1231 p = &kvm->arch.aliases[alias->slot];
1232 p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT;
1233 p->npages = alias->memory_size >> PAGE_SHIFT;
1234 p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT;
1235
1236 for (n = KVM_ALIAS_SLOTS; n > 0; --n)
1237 if (kvm->arch.aliases[n - 1].npages)
1238 break;
1239 kvm->arch.naliases = n;
1240
1241 kvm_mmu_zap_all(kvm);
1242
1243 mutex_unlock(&kvm->lock);
1244
1245 return 0;
1246
1247out:
1248 return r;
1249}
1250
1251static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
1252{
1253 int r;
1254
1255 r = 0;
1256 switch (chip->chip_id) {
1257 case KVM_IRQCHIP_PIC_MASTER:
1258 memcpy(&chip->chip.pic,
1259 &pic_irqchip(kvm)->pics[0],
1260 sizeof(struct kvm_pic_state));
1261 break;
1262 case KVM_IRQCHIP_PIC_SLAVE:
1263 memcpy(&chip->chip.pic,
1264 &pic_irqchip(kvm)->pics[1],
1265 sizeof(struct kvm_pic_state));
1266 break;
1267 case KVM_IRQCHIP_IOAPIC:
1268 memcpy(&chip->chip.ioapic,
1269 ioapic_irqchip(kvm),
1270 sizeof(struct kvm_ioapic_state));
1271 break;
1272 default:
1273 r = -EINVAL;
1274 break;
1275 }
1276 return r;
1277}
1278
1279static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
1280{
1281 int r;
1282
1283 r = 0;
1284 switch (chip->chip_id) {
1285 case KVM_IRQCHIP_PIC_MASTER:
1286 memcpy(&pic_irqchip(kvm)->pics[0],
1287 &chip->chip.pic,
1288 sizeof(struct kvm_pic_state));
1289 break;
1290 case KVM_IRQCHIP_PIC_SLAVE:
1291 memcpy(&pic_irqchip(kvm)->pics[1],
1292 &chip->chip.pic,
1293 sizeof(struct kvm_pic_state));
1294 break;
1295 case KVM_IRQCHIP_IOAPIC:
1296 memcpy(ioapic_irqchip(kvm),
1297 &chip->chip.ioapic,
1298 sizeof(struct kvm_ioapic_state));
1299 break;
1300 default:
1301 r = -EINVAL;
1302 break;
1303 }
1304 kvm_pic_update_irq(pic_irqchip(kvm));
1305 return r;
1306}
1307
1308/*
1309 * Get (and clear) the dirty memory log for a memory slot.
1310 */
1311int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
1312 struct kvm_dirty_log *log)
1313{
1314 int r;
1315 int n;
1316 struct kvm_memory_slot *memslot;
1317 int is_dirty = 0;
1318
1319 mutex_lock(&kvm->lock);
1320
1321 r = kvm_get_dirty_log(kvm, log, &is_dirty);
1322 if (r)
1323 goto out;
1324
1325 /* If nothing is dirty, don't bother messing with page tables. */
1326 if (is_dirty) {
1327 kvm_mmu_slot_remove_write_access(kvm, log->slot);
1328 kvm_flush_remote_tlbs(kvm);
1329 memslot = &kvm->memslots[log->slot];
1330 n = ALIGN(memslot->npages, BITS_PER_LONG) / 8;
1331 memset(memslot->dirty_bitmap, 0, n);
1332 }
1333 r = 0;
1334out:
1335 mutex_unlock(&kvm->lock);
1336 return r;
1337}
1338
1339long kvm_arch_vm_ioctl(struct file *filp,
1340 unsigned int ioctl, unsigned long arg)
1341{
1342 struct kvm *kvm = filp->private_data;
1343 void __user *argp = (void __user *)arg;
1344 int r = -EINVAL;
1345
1346 switch (ioctl) {
1347 case KVM_SET_TSS_ADDR:
1348 r = kvm_vm_ioctl_set_tss_addr(kvm, arg);
1349 if (r < 0)
1350 goto out;
1351 break;
1352 case KVM_SET_MEMORY_REGION: {
1353 struct kvm_memory_region kvm_mem;
1354 struct kvm_userspace_memory_region kvm_userspace_mem;
1355
1356 r = -EFAULT;
1357 if (copy_from_user(&kvm_mem, argp, sizeof kvm_mem))
1358 goto out;
1359 kvm_userspace_mem.slot = kvm_mem.slot;
1360 kvm_userspace_mem.flags = kvm_mem.flags;
1361 kvm_userspace_mem.guest_phys_addr = kvm_mem.guest_phys_addr;
1362 kvm_userspace_mem.memory_size = kvm_mem.memory_size;
1363 r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 0);
1364 if (r)
1365 goto out;
1366 break;
1367 }
1368 case KVM_SET_NR_MMU_PAGES:
1369 r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg);
1370 if (r)
1371 goto out;
1372 break;
1373 case KVM_GET_NR_MMU_PAGES:
1374 r = kvm_vm_ioctl_get_nr_mmu_pages(kvm);
1375 break;
1376 case KVM_SET_MEMORY_ALIAS: {
1377 struct kvm_memory_alias alias;
1378
1379 r = -EFAULT;
1380 if (copy_from_user(&alias, argp, sizeof alias))
1381 goto out;
1382 r = kvm_vm_ioctl_set_memory_alias(kvm, &alias);
1383 if (r)
1384 goto out;
1385 break;
1386 }
1387 case KVM_CREATE_IRQCHIP:
1388 r = -ENOMEM;
1389 kvm->arch.vpic = kvm_create_pic(kvm);
1390 if (kvm->arch.vpic) {
1391 r = kvm_ioapic_init(kvm);
1392 if (r) {
1393 kfree(kvm->arch.vpic);
1394 kvm->arch.vpic = NULL;
1395 goto out;
1396 }
1397 } else
1398 goto out;
1399 break;
1400 case KVM_IRQ_LINE: {
1401 struct kvm_irq_level irq_event;
1402
1403 r = -EFAULT;
1404 if (copy_from_user(&irq_event, argp, sizeof irq_event))
1405 goto out;
1406 if (irqchip_in_kernel(kvm)) {
1407 mutex_lock(&kvm->lock);
1408 if (irq_event.irq < 16)
1409 kvm_pic_set_irq(pic_irqchip(kvm),
1410 irq_event.irq,
1411 irq_event.level);
1412 kvm_ioapic_set_irq(kvm->arch.vioapic,
1413 irq_event.irq,
1414 irq_event.level);
1415 mutex_unlock(&kvm->lock);
1416 r = 0;
1417 }
1418 break;
1419 }
1420 case KVM_GET_IRQCHIP: {
1421 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
1422 struct kvm_irqchip chip;
1423
1424 r = -EFAULT;
1425 if (copy_from_user(&chip, argp, sizeof chip))
1426 goto out;
1427 r = -ENXIO;
1428 if (!irqchip_in_kernel(kvm))
1429 goto out;
1430 r = kvm_vm_ioctl_get_irqchip(kvm, &chip);
1431 if (r)
1432 goto out;
1433 r = -EFAULT;
1434 if (copy_to_user(argp, &chip, sizeof chip))
1435 goto out;
1436 r = 0;
1437 break;
1438 }
1439 case KVM_SET_IRQCHIP: {
1440 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
1441 struct kvm_irqchip chip;
1442
1443 r = -EFAULT;
1444 if (copy_from_user(&chip, argp, sizeof chip))
1445 goto out;
1446 r = -ENXIO;
1447 if (!irqchip_in_kernel(kvm))
1448 goto out;
1449 r = kvm_vm_ioctl_set_irqchip(kvm, &chip);
1450 if (r)
1451 goto out;
1452 r = 0;
1453 break;
1454 }
1455 case KVM_GET_SUPPORTED_CPUID: {
1456 struct kvm_cpuid2 __user *cpuid_arg = argp;
1457 struct kvm_cpuid2 cpuid;
1458
1459 r = -EFAULT;
1460 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
1461 goto out;
1462 r = kvm_vm_ioctl_get_supported_cpuid(kvm, &cpuid,
1463 cpuid_arg->entries);
1464 if (r)
1465 goto out;
1466
1467 r = -EFAULT;
1468 if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid))
1469 goto out;
1470 r = 0;
1471 break;
1472 }
1473 default:
1474 ;
1475 }
1476out:
1477 return r;
1478}
1479
1480static void kvm_init_msr_list(void)
1481{
1482 u32 dummy[2];
1483 unsigned i, j;
1484
1485 for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) {
1486 if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0)
1487 continue;
1488 if (j < i)
1489 msrs_to_save[j] = msrs_to_save[i];
1490 j++;
1491 }
1492 num_msrs_to_save = j;
1493}
1494
1495/*
1496 * Only the apic needs an MMIO device hook, so shortcut now.
1497 */
1498static struct kvm_io_device *vcpu_find_pervcpu_dev(struct kvm_vcpu *vcpu,
1499 gpa_t addr)
1500{
1501 struct kvm_io_device *dev;
1502
1503 if (vcpu->arch.apic) {
1504 dev = &vcpu->arch.apic->dev;
1505 if (dev->in_range(dev, addr))
1506 return dev;
1507 }
1508 return NULL;
1509}
1510
1511
1512static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu,
1513 gpa_t addr)
1514{
1515 struct kvm_io_device *dev;
1516
1517 dev = vcpu_find_pervcpu_dev(vcpu, addr);
1518 if (dev == NULL)
1519 dev = kvm_io_bus_find_dev(&vcpu->kvm->mmio_bus, addr);
1520 return dev;
1521}
1522
1523int emulator_read_std(unsigned long addr,
1524 void *val,
1525 unsigned int bytes,
1526 struct kvm_vcpu *vcpu)
1527{
1528 void *data = val;
1529
1530 while (bytes) {
1531 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
1532 unsigned offset = addr & (PAGE_SIZE-1);
1533 unsigned tocopy = min(bytes, (unsigned)PAGE_SIZE - offset);
1534 int ret;
1535
1536 if (gpa == UNMAPPED_GVA)
1537 return X86EMUL_PROPAGATE_FAULT;
1538 ret = kvm_read_guest(vcpu->kvm, gpa, data, tocopy);
1539 if (ret < 0)
1540 return X86EMUL_UNHANDLEABLE;
1541
1542 bytes -= tocopy;
1543 data += tocopy;
1544 addr += tocopy;
1545 }
1546
1547 return X86EMUL_CONTINUE;
1548}
1549EXPORT_SYMBOL_GPL(emulator_read_std);
1550
1551static int emulator_read_emulated(unsigned long addr,
1552 void *val,
1553 unsigned int bytes,
1554 struct kvm_vcpu *vcpu)
1555{
1556 struct kvm_io_device *mmio_dev;
1557 gpa_t gpa;
1558
1559 if (vcpu->mmio_read_completed) {
1560 memcpy(val, vcpu->mmio_data, bytes);
1561 vcpu->mmio_read_completed = 0;
1562 return X86EMUL_CONTINUE;
1563 }
1564
1565 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
1566
1567 /* For APIC access vmexit */
1568 if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
1569 goto mmio;
1570
1571 if (emulator_read_std(addr, val, bytes, vcpu)
1572 == X86EMUL_CONTINUE)
1573 return X86EMUL_CONTINUE;
1574 if (gpa == UNMAPPED_GVA)
1575 return X86EMUL_PROPAGATE_FAULT;
1576
1577mmio:
1578 /*
1579 * Is this MMIO handled locally?
1580 */
1581 mmio_dev = vcpu_find_mmio_dev(vcpu, gpa);
1582 if (mmio_dev) {
1583 kvm_iodevice_read(mmio_dev, gpa, bytes, val);
1584 return X86EMUL_CONTINUE;
1585 }
1586
1587 vcpu->mmio_needed = 1;
1588 vcpu->mmio_phys_addr = gpa;
1589 vcpu->mmio_size = bytes;
1590 vcpu->mmio_is_write = 0;
1591
1592 return X86EMUL_UNHANDLEABLE;
1593}
1594
1595static int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
1596 const void *val, int bytes)
1597{
1598 int ret;
1599
1600 ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes);
1601 if (ret < 0)
1602 return 0;
1603 kvm_mmu_pte_write(vcpu, gpa, val, bytes);
1604 return 1;
1605}
1606
1607static int emulator_write_emulated_onepage(unsigned long addr,
1608 const void *val,
1609 unsigned int bytes,
1610 struct kvm_vcpu *vcpu)
1611{
1612 struct kvm_io_device *mmio_dev;
1613 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
1614
1615 if (gpa == UNMAPPED_GVA) {
1616 kvm_inject_page_fault(vcpu, addr, 2);
1617 return X86EMUL_PROPAGATE_FAULT;
1618 }
1619
1620 /* For APIC access vmexit */
1621 if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
1622 goto mmio;
1623
1624 if (emulator_write_phys(vcpu, gpa, val, bytes))
1625 return X86EMUL_CONTINUE;
1626
1627mmio:
1628 /*
1629 * Is this MMIO handled locally?
1630 */
1631 mmio_dev = vcpu_find_mmio_dev(vcpu, gpa);
1632 if (mmio_dev) {
1633 kvm_iodevice_write(mmio_dev, gpa, bytes, val);
1634 return X86EMUL_CONTINUE;
1635 }
1636
1637 vcpu->mmio_needed = 1;
1638 vcpu->mmio_phys_addr = gpa;
1639 vcpu->mmio_size = bytes;
1640 vcpu->mmio_is_write = 1;
1641 memcpy(vcpu->mmio_data, val, bytes);
1642
1643 return X86EMUL_CONTINUE;
1644}
1645
1646int emulator_write_emulated(unsigned long addr,
1647 const void *val,
1648 unsigned int bytes,
1649 struct kvm_vcpu *vcpu)
1650{
1651 /* Crossing a page boundary? */
1652 if (((addr + bytes - 1) ^ addr) & PAGE_MASK) {
1653 int rc, now;
1654
1655 now = -addr & ~PAGE_MASK;
1656 rc = emulator_write_emulated_onepage(addr, val, now, vcpu);
1657 if (rc != X86EMUL_CONTINUE)
1658 return rc;
1659 addr += now;
1660 val += now;
1661 bytes -= now;
1662 }
1663 return emulator_write_emulated_onepage(addr, val, bytes, vcpu);
1664}
1665EXPORT_SYMBOL_GPL(emulator_write_emulated);
1666
1667static int emulator_cmpxchg_emulated(unsigned long addr,
1668 const void *old,
1669 const void *new,
1670 unsigned int bytes,
1671 struct kvm_vcpu *vcpu)
1672{
1673 static int reported;
1674
1675 if (!reported) {
1676 reported = 1;
1677 printk(KERN_WARNING "kvm: emulating exchange as write\n");
1678 }
1679#ifndef CONFIG_X86_64
1680 /* guest cmpxchg8b has to be emulated atomically */
1681 if (bytes == 8) {
1682 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
1683 struct page *page;
1684 char *addr;
1685 u64 val;
1686
1687 if (gpa == UNMAPPED_GVA ||
1688 (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
1689 goto emul_write;
1690
1691 if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK))
1692 goto emul_write;
1693
1694 val = *(u64 *)new;
1695 page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
1696 addr = kmap_atomic(page, KM_USER0);
1697 set_64bit((u64 *)(addr + offset_in_page(gpa)), val);
1698 kunmap_atomic(addr, KM_USER0);
1699 kvm_release_page_dirty(page);
1700 }
1701emul_write:
1702#endif
1703
1704 return emulator_write_emulated(addr, new, bytes, vcpu);
1705}
1706
1707static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
1708{
1709 return kvm_x86_ops->get_segment_base(vcpu, seg);
1710}
1711
1712int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)
1713{
1714 return X86EMUL_CONTINUE;
1715}
1716
1717int emulate_clts(struct kvm_vcpu *vcpu)
1718{
1719 kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 & ~X86_CR0_TS);
1720 return X86EMUL_CONTINUE;
1721}
1722
1723int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest)
1724{
1725 struct kvm_vcpu *vcpu = ctxt->vcpu;
1726
1727 switch (dr) {
1728 case 0 ... 3:
1729 *dest = kvm_x86_ops->get_dr(vcpu, dr);
1730 return X86EMUL_CONTINUE;
1731 default:
1732 pr_unimpl(vcpu, "%s: unexpected dr %u\n", __FUNCTION__, dr);
1733 return X86EMUL_UNHANDLEABLE;
1734 }
1735}
1736
1737int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value)
1738{
1739 unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U;
1740 int exception;
1741
1742 kvm_x86_ops->set_dr(ctxt->vcpu, dr, value & mask, &exception);
1743 if (exception) {
1744 /* FIXME: better handling */
1745 return X86EMUL_UNHANDLEABLE;
1746 }
1747 return X86EMUL_CONTINUE;
1748}
1749
1750void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
1751{
1752 static int reported;
1753 u8 opcodes[4];
1754 unsigned long rip = vcpu->arch.rip;
1755 unsigned long rip_linear;
1756
1757 rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS);
1758
1759 if (reported)
1760 return;
1761
1762 emulator_read_std(rip_linear, (void *)opcodes, 4, vcpu);
1763
1764 printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n",
1765 context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]);
1766 reported = 1;
1767}
1768EXPORT_SYMBOL_GPL(kvm_report_emulation_failure);
1769
1770struct x86_emulate_ops emulate_ops = {
1771 .read_std = emulator_read_std,
1772 .read_emulated = emulator_read_emulated,
1773 .write_emulated = emulator_write_emulated,
1774 .cmpxchg_emulated = emulator_cmpxchg_emulated,
1775};
1776
1777int emulate_instruction(struct kvm_vcpu *vcpu,
1778 struct kvm_run *run,
1779 unsigned long cr2,
1780 u16 error_code,
1781 int no_decode)
1782{
1783 int r;
1784
1785 vcpu->arch.mmio_fault_cr2 = cr2;
1786 kvm_x86_ops->cache_regs(vcpu);
1787
1788 vcpu->mmio_is_write = 0;
1789 vcpu->arch.pio.string = 0;
1790
1791 if (!no_decode) {
1792 int cs_db, cs_l;
1793 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
1794
1795 vcpu->arch.emulate_ctxt.vcpu = vcpu;
1796 vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu);
1797 vcpu->arch.emulate_ctxt.mode =
1798 (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM)
1799 ? X86EMUL_MODE_REAL : cs_l
1800 ? X86EMUL_MODE_PROT64 : cs_db
1801 ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
1802
1803 if (vcpu->arch.emulate_ctxt.mode == X86EMUL_MODE_PROT64) {
1804 vcpu->arch.emulate_ctxt.cs_base = 0;
1805 vcpu->arch.emulate_ctxt.ds_base = 0;
1806 vcpu->arch.emulate_ctxt.es_base = 0;
1807 vcpu->arch.emulate_ctxt.ss_base = 0;
1808 } else {
1809 vcpu->arch.emulate_ctxt.cs_base =
1810 get_segment_base(vcpu, VCPU_SREG_CS);
1811 vcpu->arch.emulate_ctxt.ds_base =
1812 get_segment_base(vcpu, VCPU_SREG_DS);
1813 vcpu->arch.emulate_ctxt.es_base =
1814 get_segment_base(vcpu, VCPU_SREG_ES);
1815 vcpu->arch.emulate_ctxt.ss_base =
1816 get_segment_base(vcpu, VCPU_SREG_SS);
1817 }
1818
1819 vcpu->arch.emulate_ctxt.gs_base =
1820 get_segment_base(vcpu, VCPU_SREG_GS);
1821 vcpu->arch.emulate_ctxt.fs_base =
1822 get_segment_base(vcpu, VCPU_SREG_FS);
1823
1824 r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
1825 ++vcpu->stat.insn_emulation;
1826 if (r) {
1827 ++vcpu->stat.insn_emulation_fail;
1828 if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
1829 return EMULATE_DONE;
1830 return EMULATE_FAIL;
1831 }
1832 }
1833
1834 r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
1835
1836 if (vcpu->arch.pio.string)
1837 return EMULATE_DO_MMIO;
1838
1839 if ((r || vcpu->mmio_is_write) && run) {
1840 run->exit_reason = KVM_EXIT_MMIO;
1841 run->mmio.phys_addr = vcpu->mmio_phys_addr;
1842 memcpy(run->mmio.data, vcpu->mmio_data, 8);
1843 run->mmio.len = vcpu->mmio_size;
1844 run->mmio.is_write = vcpu->mmio_is_write;
1845 }
1846
1847 if (r) {
1848 if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
1849 return EMULATE_DONE;
1850 if (!vcpu->mmio_needed) {
1851 kvm_report_emulation_failure(vcpu, "mmio");
1852 return EMULATE_FAIL;
1853 }
1854 return EMULATE_DO_MMIO;
1855 }
1856
1857 kvm_x86_ops->decache_regs(vcpu);
1858 kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
1859
1860 if (vcpu->mmio_is_write) {
1861 vcpu->mmio_needed = 0;
1862 return EMULATE_DO_MMIO;
1863 }
1864
1865 return EMULATE_DONE;
1866}
1867EXPORT_SYMBOL_GPL(emulate_instruction);
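
/*
 * Illustrative userspace sketch, not part of the original file: when
 * emulate_instruction() returns EMULATE_DO_MMIO, the run structure filled
 * in above reaches the VMM as a KVM_EXIT_MMIO exit.  device_read() and
 * device_write() are hypothetical stand-ins for the VMM's device models;
 * for reads the VMM writes the result into run->mmio.data before the next
 * KVM_RUN so the kernel can complete the emulation.
 */
#include <linux/kvm.h>

void device_read(unsigned long long addr, void *data, unsigned int len);
void device_write(unsigned long long addr, const void *data, unsigned int len);

static void handle_mmio_exit_example(struct kvm_run *run)
{
	if (run->mmio.is_write)
		device_write(run->mmio.phys_addr, run->mmio.data, run->mmio.len);
	else
		device_read(run->mmio.phys_addr, run->mmio.data, run->mmio.len);
}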
1868
1869static void free_pio_guest_pages(struct kvm_vcpu *vcpu)
1870{
1871 int i;
1872
1873 for (i = 0; i < ARRAY_SIZE(vcpu->arch.pio.guest_pages); ++i)
1874 if (vcpu->arch.pio.guest_pages[i]) {
1875 kvm_release_page_dirty(vcpu->arch.pio.guest_pages[i]);
1876 vcpu->arch.pio.guest_pages[i] = NULL;
1877 }
1878}
1879
1880static int pio_copy_data(struct kvm_vcpu *vcpu)
1881{
1882 void *p = vcpu->arch.pio_data;
1883 void *q;
1884 unsigned bytes;
1885 int nr_pages = vcpu->arch.pio.guest_pages[1] ? 2 : 1;
1886
1887 q = vmap(vcpu->arch.pio.guest_pages, nr_pages, VM_READ|VM_WRITE,
1888 PAGE_KERNEL);
1889 if (!q) {
1890 free_pio_guest_pages(vcpu);
1891 return -ENOMEM;
1892 }
1893 q += vcpu->arch.pio.guest_page_offset;
1894 bytes = vcpu->arch.pio.size * vcpu->arch.pio.cur_count;
1895 if (vcpu->arch.pio.in)
1896 memcpy(q, p, bytes);
1897 else
1898 memcpy(p, q, bytes);
1899 q -= vcpu->arch.pio.guest_page_offset;
1900 vunmap(q);
1901 free_pio_guest_pages(vcpu);
1902 return 0;
1903}
1904
1905int complete_pio(struct kvm_vcpu *vcpu)
1906{
1907 struct kvm_pio_request *io = &vcpu->arch.pio;
1908 long delta;
1909 int r;
1910
1911 kvm_x86_ops->cache_regs(vcpu);
1912
1913 if (!io->string) {
1914 if (io->in)
1915 memcpy(&vcpu->arch.regs[VCPU_REGS_RAX], vcpu->arch.pio_data,
1916 io->size);
1917 } else {
1918 if (io->in) {
1919 r = pio_copy_data(vcpu);
1920 if (r) {
1921 kvm_x86_ops->cache_regs(vcpu);
1922 return r;
1923 }
1924 }
1925
1926 delta = 1;
1927 if (io->rep) {
1928 delta *= io->cur_count;
1929 /*
1930 * The size of the register should really depend on
1931 * current address size.
1932 */
1933 vcpu->arch.regs[VCPU_REGS_RCX] -= delta;
1934 }
1935 if (io->down)
1936 delta = -delta;
1937 delta *= io->size;
1938 if (io->in)
1939 vcpu->arch.regs[VCPU_REGS_RDI] += delta;
1940 else
1941 vcpu->arch.regs[VCPU_REGS_RSI] += delta;
1942 }
1943
1944 kvm_x86_ops->decache_regs(vcpu);
1945
1946 io->count -= io->cur_count;
1947 io->cur_count = 0;
1948
1949 return 0;
1950}
1951
1952static void kernel_pio(struct kvm_io_device *pio_dev,
1953 struct kvm_vcpu *vcpu,
1954 void *pd)
1955{
1956 /* TODO: String I/O for in-kernel devices */
1957
1958 mutex_lock(&vcpu->kvm->lock);
1959 if (vcpu->arch.pio.in)
1960 kvm_iodevice_read(pio_dev, vcpu->arch.pio.port,
1961 vcpu->arch.pio.size,
1962 pd);
1963 else
1964 kvm_iodevice_write(pio_dev, vcpu->arch.pio.port,
1965 vcpu->arch.pio.size,
1966 pd);
1967 mutex_unlock(&vcpu->kvm->lock);
1968}
1969
1970static void pio_string_write(struct kvm_io_device *pio_dev,
1971 struct kvm_vcpu *vcpu)
1972{
1973 struct kvm_pio_request *io = &vcpu->arch.pio;
1974 void *pd = vcpu->arch.pio_data;
1975 int i;
1976
1977 mutex_lock(&vcpu->kvm->lock);
1978 for (i = 0; i < io->cur_count; i++) {
1979 kvm_iodevice_write(pio_dev, io->port,
1980 io->size,
1981 pd);
1982 pd += io->size;
1983 }
1984 mutex_unlock(&vcpu->kvm->lock);
1985}
1986
1987static struct kvm_io_device *vcpu_find_pio_dev(struct kvm_vcpu *vcpu,
1988 gpa_t addr)
1989{
1990 return kvm_io_bus_find_dev(&vcpu->kvm->pio_bus, addr);
1991}
1992
1993int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
1994 int size, unsigned port)
1995{
1996 struct kvm_io_device *pio_dev;
1997
1998 vcpu->run->exit_reason = KVM_EXIT_IO;
1999 vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
2000 vcpu->run->io.size = vcpu->arch.pio.size = size;
2001 vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
2002 vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = 1;
2003 vcpu->run->io.port = vcpu->arch.pio.port = port;
2004 vcpu->arch.pio.in = in;
2005 vcpu->arch.pio.string = 0;
2006 vcpu->arch.pio.down = 0;
2007 vcpu->arch.pio.guest_page_offset = 0;
2008 vcpu->arch.pio.rep = 0;
2009
2010 kvm_x86_ops->cache_regs(vcpu);
2011 memcpy(vcpu->arch.pio_data, &vcpu->arch.regs[VCPU_REGS_RAX], 4);
2012 kvm_x86_ops->decache_regs(vcpu);
2013
2014 kvm_x86_ops->skip_emulated_instruction(vcpu);
2015
2016 pio_dev = vcpu_find_pio_dev(vcpu, port);
2017 if (pio_dev) {
2018 kernel_pio(pio_dev, vcpu, vcpu->arch.pio_data);
2019 complete_pio(vcpu);
2020 return 1;
2021 }
2022 return 0;
2023}
2024EXPORT_SYMBOL_GPL(kvm_emulate_pio);
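
/*
 * Illustrative userspace sketch, not part of the original file:
 * kvm_emulate_pio() above points run->io.data_offset at the PIO data page
 * inside the mmap()ed vcpu run area, so the VMM completes the exit by
 * copying to or from that offset and calling KVM_RUN again.  The
 * port_out()/port_in() helpers are hypothetical device-model hooks.
 */
#include <linux/kvm.h>

void port_out(unsigned short port, const void *data, unsigned int size);
void port_in(unsigned short port, void *data, unsigned int size);

static void handle_io_exit_example(struct kvm_run *run)
{
	unsigned char *data = (unsigned char *)run + run->io.data_offset;
	unsigned int i;

	for (i = 0; i < run->io.count; ++i, data += run->io.size) {
		if (run->io.direction == KVM_EXIT_IO_OUT)
			port_out(run->io.port, data, run->io.size);
		else
			port_in(run->io.port, data, run->io.size);
	}
}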
2025
2026int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
2027 int size, unsigned long count, int down,
2028 gva_t address, int rep, unsigned port)
2029{
2030 unsigned now, in_page;
2031 int i, ret = 0;
2032 int nr_pages = 1;
2033 struct page *page;
2034 struct kvm_io_device *pio_dev;
2035
2036 vcpu->run->exit_reason = KVM_EXIT_IO;
2037 vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
2038 vcpu->run->io.size = vcpu->arch.pio.size = size;
2039 vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
2040 vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = count;
2041 vcpu->run->io.port = vcpu->arch.pio.port = port;
2042 vcpu->arch.pio.in = in;
2043 vcpu->arch.pio.string = 1;
2044 vcpu->arch.pio.down = down;
2045 vcpu->arch.pio.guest_page_offset = offset_in_page(address);
2046 vcpu->arch.pio.rep = rep;
2047
2048 if (!count) {
2049 kvm_x86_ops->skip_emulated_instruction(vcpu);
2050 return 1;
2051 }
2052
2053 if (!down)
2054 in_page = PAGE_SIZE - offset_in_page(address);
2055 else
2056 in_page = offset_in_page(address) + size;
2057 now = min(count, (unsigned long)in_page / size);
2058 if (!now) {
2059 /*
2060 * String I/O straddles page boundary. Pin two guest pages
2061 * so that we satisfy atomicity constraints. Do just one
2062 * transaction to avoid complexity.
2063 */
2064 nr_pages = 2;
2065 now = 1;
2066 }
2067 if (down) {
2068 /*
2069 * String I/O in reverse. Yuck. Kill the guest, fix later.
2070 */
2071 pr_unimpl(vcpu, "guest string pio down\n");
2072 kvm_inject_gp(vcpu, 0);
2073 return 1;
2074 }
2075 vcpu->run->io.count = now;
2076 vcpu->arch.pio.cur_count = now;
2077
2078 if (vcpu->arch.pio.cur_count == vcpu->arch.pio.count)
2079 kvm_x86_ops->skip_emulated_instruction(vcpu);
2080
2081 for (i = 0; i < nr_pages; ++i) {
2082 mutex_lock(&vcpu->kvm->lock);
2083 page = gva_to_page(vcpu, address + i * PAGE_SIZE);
2084 vcpu->arch.pio.guest_pages[i] = page;
2085 mutex_unlock(&vcpu->kvm->lock);
2086 if (!page) {
2087 kvm_inject_gp(vcpu, 0);
2088 free_pio_guest_pages(vcpu);
2089 return 1;
2090 }
2091 }
2092
2093 pio_dev = vcpu_find_pio_dev(vcpu, port);
2094 if (!vcpu->arch.pio.in) {
2095 /* string PIO write */
2096 ret = pio_copy_data(vcpu);
2097 if (ret >= 0 && pio_dev) {
2098 pio_string_write(pio_dev, vcpu);
2099 complete_pio(vcpu);
2100 if (vcpu->arch.pio.count == 0)
2101 ret = 1;
2102 }
2103 } else if (pio_dev)
2104 pr_unimpl(vcpu, "no string pio read support yet, "
2105 "port %x size %d count %ld\n",
2106 port, size, count);
2107
2108 return ret;
2109}
2110EXPORT_SYMBOL_GPL(kvm_emulate_pio_string);
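[Editor's note] The in_page/now calculation in kvm_emulate_pio_string() above bounds a single pass to the elements that fit before the guest buffer crosses a page boundary, falling back to pinning two pages (now = 1) when even one element straddles it. The userspace sketch below only mirrors that arithmetic; elements_this_page(), the local PAGE_SIZE/offset_in_page() definitions and the sample addresses are hypothetical and not taken from the patch.

#include <stdio.h>

#define PAGE_SIZE	4096UL
#define offset_in_page(p)	((unsigned long)(p) & (PAGE_SIZE - 1))

/*
 * How many string-I/O elements of 'size' bytes can be handled in one pass
 * starting at 'address', moving up (down == 0) or down (down != 0).
 * A result of 0 means even a single element straddles the page boundary.
 */
static unsigned long elements_this_page(unsigned long count, unsigned size,
					unsigned long address, int down)
{
	unsigned long in_page;

	if (!down)
		in_page = PAGE_SIZE - offset_in_page(address);
	else
		in_page = offset_in_page(address) + size;

	return count < in_page / size ? count : in_page / size;
}

int main(void)
{
	/* purely illustrative addresses */
	printf("%lu\n", elements_this_page(100, 4, 0xff8, 0));	/* 2 */
	printf("%lu\n", elements_this_page(100, 4, 0xffe, 0));	/* 0: straddles */
	return 0;
}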
2111
2112int kvm_arch_init(void *opaque)
2113{
2114 int r;
2115 struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque;
2116
2117 r = kvm_mmu_module_init();
2118 if (r)
2119 goto out_fail;
2120
2121 kvm_init_msr_list();
2122
2123 if (kvm_x86_ops) {
2124 printk(KERN_ERR "kvm: already loaded the other module\n");
2125 r = -EEXIST;
2126 goto out;
2127 }
2128
2129 if (!ops->cpu_has_kvm_support()) {
2130 printk(KERN_ERR "kvm: no hardware support\n");
2131 r = -EOPNOTSUPP;
2132 goto out;
2133 }
2134 if (ops->disabled_by_bios()) {
2135 printk(KERN_ERR "kvm: disabled by bios\n");
2136 r = -EOPNOTSUPP;
2137 goto out;
2138 }
2139
2140 kvm_x86_ops = ops;
2141 kvm_mmu_set_nonpresent_ptes(0ull, 0ull);
2142 return 0;
2143
2144out:
2145 kvm_mmu_module_exit();
2146out_fail:
2147 return r;
2148}
2149
2150void kvm_arch_exit(void)
2151{
2152 kvm_x86_ops = NULL;
2153 kvm_mmu_module_exit();
2154}
2155
2156int kvm_emulate_halt(struct kvm_vcpu *vcpu)
2157{
2158 ++vcpu->stat.halt_exits;
2159 if (irqchip_in_kernel(vcpu->kvm)) {
2160 vcpu->arch.mp_state = VCPU_MP_STATE_HALTED;
2161 kvm_vcpu_block(vcpu);
2162 if (vcpu->arch.mp_state != VCPU_MP_STATE_RUNNABLE)
2163 return -EINTR;
2164 return 1;
2165 } else {
2166 vcpu->run->exit_reason = KVM_EXIT_HLT;
2167 return 0;
2168 }
2169}
2170EXPORT_SYMBOL_GPL(kvm_emulate_halt);
2171
2172int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
2173{
2174 unsigned long nr, a0, a1, a2, a3, ret;
2175
2176 kvm_x86_ops->cache_regs(vcpu);
2177
2178 nr = vcpu->arch.regs[VCPU_REGS_RAX];
2179 a0 = vcpu->arch.regs[VCPU_REGS_RBX];
2180 a1 = vcpu->arch.regs[VCPU_REGS_RCX];
2181 a2 = vcpu->arch.regs[VCPU_REGS_RDX];
2182 a3 = vcpu->arch.regs[VCPU_REGS_RSI];
2183
2184 if (!is_long_mode(vcpu)) {
2185 nr &= 0xFFFFFFFF;
2186 a0 &= 0xFFFFFFFF;
2187 a1 &= 0xFFFFFFFF;
2188 a2 &= 0xFFFFFFFF;
2189 a3 &= 0xFFFFFFFF;
2190 }
2191
2192 switch (nr) {
2193 default:
2194 ret = -KVM_ENOSYS;
2195 break;
2196 }
2197 vcpu->arch.regs[VCPU_REGS_RAX] = ret;
2198 kvm_x86_ops->decache_regs(vcpu);
2199 return 0;
2200}
2201EXPORT_SYMBOL_GPL(kvm_emulate_hypercall);
2202
2203int kvm_fix_hypercall(struct kvm_vcpu *vcpu)
2204{
2205 char instruction[3];
2206 int ret = 0;
2207
2208 mutex_lock(&vcpu->kvm->lock);
2209
2210 /*
2211	 * Blow out the MMU so that no other VCPU has an active mapping;
2212	 * this ensures that the updated hypercall appears atomically across
2213	 * all VCPUs.
2214 */
2215 kvm_mmu_zap_all(vcpu->kvm);
2216
2217 kvm_x86_ops->cache_regs(vcpu);
2218 kvm_x86_ops->patch_hypercall(vcpu, instruction);
2219 if (emulator_write_emulated(vcpu->arch.rip, instruction, 3, vcpu)
2220 != X86EMUL_CONTINUE)
2221 ret = -EFAULT;
2222
2223 mutex_unlock(&vcpu->kvm->lock);
2224
2225 return ret;
2226}
2227
2228static u64 mk_cr_64(u64 curr_cr, u32 new_val)
2229{
2230 return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
2231}
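[Editor's note] mk_cr_64() above merges a 32-bit value supplied by the emulator into the low half of the current 64-bit control-register value while preserving the upper bits. A tiny worked example of that masking; mk_cr_64_demo() and the input values are made up for illustration.

#include <stdint.h>
#include <stdio.h>

/* Same masking as mk_cr_64(): keep bits 63:32, replace bits 31:0. */
static uint64_t mk_cr_64_demo(uint64_t curr_cr, uint32_t new_val)
{
	return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
}

int main(void)
{
	/* hypothetical register contents */
	printf("%#llx\n",
	       (unsigned long long)mk_cr_64_demo(0x123456780000ULL, 0x80000011u));
	/* prints 0x123480000011: upper half kept, lower half replaced */
	return 0;
}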
2232
2233void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
2234{
2235 struct descriptor_table dt = { limit, base };
2236
2237 kvm_x86_ops->set_gdt(vcpu, &dt);
2238}
2239
2240void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
2241{
2242 struct descriptor_table dt = { limit, base };
2243
2244 kvm_x86_ops->set_idt(vcpu, &dt);
2245}
2246
2247void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
2248 unsigned long *rflags)
2249{
2250 lmsw(vcpu, msw);
2251 *rflags = kvm_x86_ops->get_rflags(vcpu);
2252}
2253
2254unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
2255{
2256 kvm_x86_ops->decache_cr4_guest_bits(vcpu);
2257 switch (cr) {
2258 case 0:
2259 return vcpu->arch.cr0;
2260 case 2:
2261 return vcpu->arch.cr2;
2262 case 3:
2263 return vcpu->arch.cr3;
2264 case 4:
2265 return vcpu->arch.cr4;
2266 case 8:
2267 return get_cr8(vcpu);
2268 default:
2269 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);
2270 return 0;
2271 }
2272}
2273
2274void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val,
2275 unsigned long *rflags)
2276{
2277 switch (cr) {
2278 case 0:
2279 set_cr0(vcpu, mk_cr_64(vcpu->arch.cr0, val));
2280 *rflags = kvm_x86_ops->get_rflags(vcpu);
2281 break;
2282 case 2:
2283 vcpu->arch.cr2 = val;
2284 break;
2285 case 3:
2286 set_cr3(vcpu, val);
2287 break;
2288 case 4:
2289 set_cr4(vcpu, mk_cr_64(vcpu->arch.cr4, val));
2290 break;
2291 case 8:
2292 set_cr8(vcpu, val & 0xfUL);
2293 break;
2294 default:
2295 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);
2296 }
2297}
2298
2299static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i)
2300{
2301 struct kvm_cpuid_entry2 *e = &vcpu->arch.cpuid_entries[i];
2302 int j, nent = vcpu->arch.cpuid_nent;
2303
2304 e->flags &= ~KVM_CPUID_FLAG_STATE_READ_NEXT;
2305 /* when no next entry is found, the current entry[i] is reselected */
2306	for (j = i + 1; ; j = (j + 1) % nent) {
2307 struct kvm_cpuid_entry2 *ej = &vcpu->arch.cpuid_entries[j];
2308 if (ej->function == e->function) {
2309 ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT;
2310 return j;
2311 }
2312 }
2313 return 0; /* silence gcc, even though control never reaches here */
2314}
2315
2316/* find an entry with matching function, matching index (if needed), and that
2317 * should be read next (if it's stateful) */
2318static int is_matching_cpuid_entry(struct kvm_cpuid_entry2 *e,
2319 u32 function, u32 index)
2320{
2321 if (e->function != function)
2322 return 0;
2323 if ((e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) && e->index != index)
2324 return 0;
2325 if ((e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) &&
2326 !(e->flags & KVM_CPUID_FLAG_STATE_READ_NEXT))
2327 return 0;
2328 return 1;
2329}
2330
2331void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
2332{
2333 int i;
2334 u32 function, index;
2335 struct kvm_cpuid_entry2 *e, *best;
2336
2337 kvm_x86_ops->cache_regs(vcpu);
2338 function = vcpu->arch.regs[VCPU_REGS_RAX];
2339 index = vcpu->arch.regs[VCPU_REGS_RCX];
2340 vcpu->arch.regs[VCPU_REGS_RAX] = 0;
2341 vcpu->arch.regs[VCPU_REGS_RBX] = 0;
2342 vcpu->arch.regs[VCPU_REGS_RCX] = 0;
2343 vcpu->arch.regs[VCPU_REGS_RDX] = 0;
2344 best = NULL;
2345 for (i = 0; i < vcpu->arch.cpuid_nent; ++i) {
2346 e = &vcpu->arch.cpuid_entries[i];
2347 if (is_matching_cpuid_entry(e, function, index)) {
2348 if (e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC)
2349 move_to_next_stateful_cpuid_entry(vcpu, i);
2350 best = e;
2351 break;
2352 }
2353 /*
2354 * Both basic or both extended?
2355 */
2356 if (((e->function ^ function) & 0x80000000) == 0)
2357 if (!best || e->function > best->function)
2358 best = e;
2359 }
2360 if (best) {
2361 vcpu->arch.regs[VCPU_REGS_RAX] = best->eax;
2362 vcpu->arch.regs[VCPU_REGS_RBX] = best->ebx;
2363 vcpu->arch.regs[VCPU_REGS_RCX] = best->ecx;
2364 vcpu->arch.regs[VCPU_REGS_RDX] = best->edx;
2365 }
2366 kvm_x86_ops->decache_regs(vcpu);
2367 kvm_x86_ops->skip_emulated_instruction(vcpu);
2368}
2369EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
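[Editor's note] When no exact CPUID entry matches, the fallback loop above only considers candidates from the same range as the requested leaf, both basic (below 0x80000000) or both extended, which is what the (e->function ^ function) & 0x80000000 test encodes. A small standalone illustration of that check; same_cpuid_range() and the leaf values are hypothetical.

#include <stdint.h>
#include <stdio.h>

/* True when both leaves are basic (< 0x80000000) or both are extended. */
static int same_cpuid_range(uint32_t a, uint32_t b)
{
	return ((a ^ b) & 0x80000000u) == 0;
}

int main(void)
{
	printf("%d\n", same_cpuid_range(0x00000001, 0x00000004)); /* 1: both basic */
	printf("%d\n", same_cpuid_range(0x80000001, 0x00000001)); /* 0: mixed */
	return 0;
}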
2370
2371/*
2372 * Check if userspace requested an interrupt window, and that the
2373 * interrupt window is open.
2374 *
2375 * No need to exit to userspace if we already have an interrupt queued.
2376 */
2377static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu,
2378 struct kvm_run *kvm_run)
2379{
2380 return (!vcpu->arch.irq_summary &&
2381 kvm_run->request_interrupt_window &&
2382 vcpu->arch.interrupt_window_open &&
2383 (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF));
2384}
2385
2386static void post_kvm_run_save(struct kvm_vcpu *vcpu,
2387 struct kvm_run *kvm_run)
2388{
2389 kvm_run->if_flag = (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF) != 0;
2390 kvm_run->cr8 = get_cr8(vcpu);
2391 kvm_run->apic_base = kvm_get_apic_base(vcpu);
2392 if (irqchip_in_kernel(vcpu->kvm))
2393 kvm_run->ready_for_interrupt_injection = 1;
2394 else
2395 kvm_run->ready_for_interrupt_injection =
2396 (vcpu->arch.interrupt_window_open &&
2397 vcpu->arch.irq_summary == 0);
2398}
2399
2400static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2401{
2402 int r;
2403
2404 if (unlikely(vcpu->arch.mp_state == VCPU_MP_STATE_SIPI_RECEIVED)) {
2405 pr_debug("vcpu %d received sipi with vector # %x\n",
2406 vcpu->vcpu_id, vcpu->arch.sipi_vector);
2407 kvm_lapic_reset(vcpu);
2408 r = kvm_x86_ops->vcpu_reset(vcpu);
2409 if (r)
2410 return r;
2411 vcpu->arch.mp_state = VCPU_MP_STATE_RUNNABLE;
2412 }
2413
2414preempted:
2415 if (vcpu->guest_debug.enabled)
2416 kvm_x86_ops->guest_debug_pre(vcpu);
2417
2418again:
2419 r = kvm_mmu_reload(vcpu);
2420 if (unlikely(r))
2421 goto out;
2422
2423 kvm_inject_pending_timer_irqs(vcpu);
2424
2425 preempt_disable();
2426
2427 kvm_x86_ops->prepare_guest_switch(vcpu);
2428 kvm_load_guest_fpu(vcpu);
2429
2430 local_irq_disable();
2431
2432 if (signal_pending(current)) {
2433 local_irq_enable();
2434 preempt_enable();
2435 r = -EINTR;
2436 kvm_run->exit_reason = KVM_EXIT_INTR;
2437 ++vcpu->stat.signal_exits;
2438 goto out;
2439 }
2440
2441 if (vcpu->arch.exception.pending)
2442 __queue_exception(vcpu);
2443 else if (irqchip_in_kernel(vcpu->kvm))
2444 kvm_x86_ops->inject_pending_irq(vcpu);
2445 else
2446 kvm_x86_ops->inject_pending_vectors(vcpu, kvm_run);
2447
2448 vcpu->guest_mode = 1;
2449 kvm_guest_enter();
2450
2451 if (vcpu->requests)
2452 if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests))
2453 kvm_x86_ops->tlb_flush(vcpu);
2454
2455 kvm_x86_ops->run(vcpu, kvm_run);
2456
2457 vcpu->guest_mode = 0;
2458 local_irq_enable();
2459
2460 ++vcpu->stat.exits;
2461
2462 /*
2463 * We must have an instruction between local_irq_enable() and
2464 * kvm_guest_exit(), so the timer interrupt isn't delayed by
2465 * the interrupt shadow. The stat.exits increment will do nicely.
2466 * But we need to prevent reordering, hence this barrier():
2467 */
2468 barrier();
2469
2470 kvm_guest_exit();
2471
2472 preempt_enable();
2473
2474 /*
2475 * Profile KVM exit RIPs:
2476 */
2477 if (unlikely(prof_on == KVM_PROFILING)) {
2478 kvm_x86_ops->cache_regs(vcpu);
2479 profile_hit(KVM_PROFILING, (void *)vcpu->arch.rip);
2480 }
2481
2482 if (vcpu->arch.exception.pending && kvm_x86_ops->exception_injected(vcpu))
2483 vcpu->arch.exception.pending = false;
2484
2485 r = kvm_x86_ops->handle_exit(kvm_run, vcpu);
2486
2487 if (r > 0) {
2488 if (dm_request_for_irq_injection(vcpu, kvm_run)) {
2489 r = -EINTR;
2490 kvm_run->exit_reason = KVM_EXIT_INTR;
2491 ++vcpu->stat.request_irq_exits;
2492 goto out;
2493 }
2494 if (!need_resched())
2495 goto again;
2496 }
2497
2498out:
2499 if (r > 0) {
2500 kvm_resched(vcpu);
2501 goto preempted;
2502 }
2503
2504 post_kvm_run_save(vcpu, kvm_run);
2505
2506 return r;
2507}
2508
2509int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2510{
2511 int r;
2512 sigset_t sigsaved;
2513
2514 vcpu_load(vcpu);
2515
2516 if (unlikely(vcpu->arch.mp_state == VCPU_MP_STATE_UNINITIALIZED)) {
2517 kvm_vcpu_block(vcpu);
2518 vcpu_put(vcpu);
2519 return -EAGAIN;
2520 }
2521
2522 if (vcpu->sigset_active)
2523 sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
2524
2525 /* re-sync apic's tpr */
2526 if (!irqchip_in_kernel(vcpu->kvm))
2527 set_cr8(vcpu, kvm_run->cr8);
2528
2529 if (vcpu->arch.pio.cur_count) {
2530 r = complete_pio(vcpu);
2531 if (r)
2532 goto out;
2533 }
2534#if CONFIG_HAS_IOMEM
2535 if (vcpu->mmio_needed) {
2536 memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
2537 vcpu->mmio_read_completed = 1;
2538 vcpu->mmio_needed = 0;
2539 r = emulate_instruction(vcpu, kvm_run,
2540 vcpu->arch.mmio_fault_cr2, 0, 1);
2541 if (r == EMULATE_DO_MMIO) {
2542 /*
2543 * Read-modify-write. Back to userspace.
2544 */
2545 r = 0;
2546 goto out;
2547 }
2548 }
2549#endif
2550 if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL) {
2551 kvm_x86_ops->cache_regs(vcpu);
2552 vcpu->arch.regs[VCPU_REGS_RAX] = kvm_run->hypercall.ret;
2553 kvm_x86_ops->decache_regs(vcpu);
2554 }
2555
2556 r = __vcpu_run(vcpu, kvm_run);
2557
2558out:
2559 if (vcpu->sigset_active)
2560 sigprocmask(SIG_SETMASK, &sigsaved, NULL);
2561
2562 vcpu_put(vcpu);
2563 return r;
2564}
2565
2566int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
2567{
2568 vcpu_load(vcpu);
2569
2570 kvm_x86_ops->cache_regs(vcpu);
2571
2572 regs->rax = vcpu->arch.regs[VCPU_REGS_RAX];
2573 regs->rbx = vcpu->arch.regs[VCPU_REGS_RBX];
2574 regs->rcx = vcpu->arch.regs[VCPU_REGS_RCX];
2575 regs->rdx = vcpu->arch.regs[VCPU_REGS_RDX];
2576 regs->rsi = vcpu->arch.regs[VCPU_REGS_RSI];
2577 regs->rdi = vcpu->arch.regs[VCPU_REGS_RDI];
2578 regs->rsp = vcpu->arch.regs[VCPU_REGS_RSP];
2579 regs->rbp = vcpu->arch.regs[VCPU_REGS_RBP];
2580#ifdef CONFIG_X86_64
2581 regs->r8 = vcpu->arch.regs[VCPU_REGS_R8];
2582 regs->r9 = vcpu->arch.regs[VCPU_REGS_R9];
2583 regs->r10 = vcpu->arch.regs[VCPU_REGS_R10];
2584 regs->r11 = vcpu->arch.regs[VCPU_REGS_R11];
2585 regs->r12 = vcpu->arch.regs[VCPU_REGS_R12];
2586 regs->r13 = vcpu->arch.regs[VCPU_REGS_R13];
2587 regs->r14 = vcpu->arch.regs[VCPU_REGS_R14];
2588 regs->r15 = vcpu->arch.regs[VCPU_REGS_R15];
2589#endif
2590
2591 regs->rip = vcpu->arch.rip;
2592 regs->rflags = kvm_x86_ops->get_rflags(vcpu);
2593
2594 /*
2595 * Don't leak debug flags in case they were set for guest debugging
2596 */
2597 if (vcpu->guest_debug.enabled && vcpu->guest_debug.singlestep)
2598 regs->rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
2599
2600 vcpu_put(vcpu);
2601
2602 return 0;
2603}
2604
2605int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
2606{
2607 vcpu_load(vcpu);
2608
2609 vcpu->arch.regs[VCPU_REGS_RAX] = regs->rax;
2610 vcpu->arch.regs[VCPU_REGS_RBX] = regs->rbx;
2611 vcpu->arch.regs[VCPU_REGS_RCX] = regs->rcx;
2612 vcpu->arch.regs[VCPU_REGS_RDX] = regs->rdx;
2613 vcpu->arch.regs[VCPU_REGS_RSI] = regs->rsi;
2614 vcpu->arch.regs[VCPU_REGS_RDI] = regs->rdi;
2615 vcpu->arch.regs[VCPU_REGS_RSP] = regs->rsp;
2616 vcpu->arch.regs[VCPU_REGS_RBP] = regs->rbp;
2617#ifdef CONFIG_X86_64
2618 vcpu->arch.regs[VCPU_REGS_R8] = regs->r8;
2619 vcpu->arch.regs[VCPU_REGS_R9] = regs->r9;
2620 vcpu->arch.regs[VCPU_REGS_R10] = regs->r10;
2621 vcpu->arch.regs[VCPU_REGS_R11] = regs->r11;
2622 vcpu->arch.regs[VCPU_REGS_R12] = regs->r12;
2623 vcpu->arch.regs[VCPU_REGS_R13] = regs->r13;
2624 vcpu->arch.regs[VCPU_REGS_R14] = regs->r14;
2625 vcpu->arch.regs[VCPU_REGS_R15] = regs->r15;
2626#endif
2627
2628 vcpu->arch.rip = regs->rip;
2629 kvm_x86_ops->set_rflags(vcpu, regs->rflags);
2630
2631 kvm_x86_ops->decache_regs(vcpu);
2632
2633 vcpu_put(vcpu);
2634
2635 return 0;
2636}
2637
2638static void get_segment(struct kvm_vcpu *vcpu,
2639 struct kvm_segment *var, int seg)
2640{
2641 return kvm_x86_ops->get_segment(vcpu, var, seg);
2642}
2643
2644void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
2645{
2646 struct kvm_segment cs;
2647
2648 get_segment(vcpu, &cs, VCPU_SREG_CS);
2649 *db = cs.db;
2650 *l = cs.l;
2651}
2652EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits);
2653
2654int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
2655 struct kvm_sregs *sregs)
2656{
2657 struct descriptor_table dt;
2658 int pending_vec;
2659
2660 vcpu_load(vcpu);
2661
2662 get_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
2663 get_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
2664 get_segment(vcpu, &sregs->es, VCPU_SREG_ES);
2665 get_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
2666 get_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
2667 get_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
2668
2669 get_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
2670 get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
2671
2672 kvm_x86_ops->get_idt(vcpu, &dt);
2673 sregs->idt.limit = dt.limit;
2674 sregs->idt.base = dt.base;
2675 kvm_x86_ops->get_gdt(vcpu, &dt);
2676 sregs->gdt.limit = dt.limit;
2677 sregs->gdt.base = dt.base;
2678
2679 kvm_x86_ops->decache_cr4_guest_bits(vcpu);
2680 sregs->cr0 = vcpu->arch.cr0;
2681 sregs->cr2 = vcpu->arch.cr2;
2682 sregs->cr3 = vcpu->arch.cr3;
2683 sregs->cr4 = vcpu->arch.cr4;
2684 sregs->cr8 = get_cr8(vcpu);
2685 sregs->efer = vcpu->arch.shadow_efer;
2686 sregs->apic_base = kvm_get_apic_base(vcpu);
2687
2688 if (irqchip_in_kernel(vcpu->kvm)) {
2689 memset(sregs->interrupt_bitmap, 0,
2690 sizeof sregs->interrupt_bitmap);
2691 pending_vec = kvm_x86_ops->get_irq(vcpu);
2692 if (pending_vec >= 0)
2693 set_bit(pending_vec,
2694 (unsigned long *)sregs->interrupt_bitmap);
2695 } else
2696 memcpy(sregs->interrupt_bitmap, vcpu->arch.irq_pending,
2697 sizeof sregs->interrupt_bitmap);
2698
2699 vcpu_put(vcpu);
2700
2701 return 0;
2702}
2703
2704static void set_segment(struct kvm_vcpu *vcpu,
2705 struct kvm_segment *var, int seg)
2706{
2707 return kvm_x86_ops->set_segment(vcpu, var, seg);
2708}
2709
2710int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
2711 struct kvm_sregs *sregs)
2712{
2713 int mmu_reset_needed = 0;
2714 int i, pending_vec, max_bits;
2715 struct descriptor_table dt;
2716
2717 vcpu_load(vcpu);
2718
2719 dt.limit = sregs->idt.limit;
2720 dt.base = sregs->idt.base;
2721 kvm_x86_ops->set_idt(vcpu, &dt);
2722 dt.limit = sregs->gdt.limit;
2723 dt.base = sregs->gdt.base;
2724 kvm_x86_ops->set_gdt(vcpu, &dt);
2725
2726 vcpu->arch.cr2 = sregs->cr2;
2727 mmu_reset_needed |= vcpu->arch.cr3 != sregs->cr3;
2728 vcpu->arch.cr3 = sregs->cr3;
2729
2730 set_cr8(vcpu, sregs->cr8);
2731
2732 mmu_reset_needed |= vcpu->arch.shadow_efer != sregs->efer;
2733#ifdef CONFIG_X86_64
2734 kvm_x86_ops->set_efer(vcpu, sregs->efer);
2735#endif
2736 kvm_set_apic_base(vcpu, sregs->apic_base);
2737
2738 kvm_x86_ops->decache_cr4_guest_bits(vcpu);
2739
2740 mmu_reset_needed |= vcpu->arch.cr0 != sregs->cr0;
2741 vcpu->arch.cr0 = sregs->cr0;
2742 kvm_x86_ops->set_cr0(vcpu, sregs->cr0);
2743
2744 mmu_reset_needed |= vcpu->arch.cr4 != sregs->cr4;
2745 kvm_x86_ops->set_cr4(vcpu, sregs->cr4);
2746 if (!is_long_mode(vcpu) && is_pae(vcpu))
2747 load_pdptrs(vcpu, vcpu->arch.cr3);
2748
2749 if (mmu_reset_needed)
2750 kvm_mmu_reset_context(vcpu);
2751
2752 if (!irqchip_in_kernel(vcpu->kvm)) {
2753 memcpy(vcpu->arch.irq_pending, sregs->interrupt_bitmap,
2754 sizeof vcpu->arch.irq_pending);
2755 vcpu->arch.irq_summary = 0;
2756 for (i = 0; i < ARRAY_SIZE(vcpu->arch.irq_pending); ++i)
2757 if (vcpu->arch.irq_pending[i])
2758 __set_bit(i, &vcpu->arch.irq_summary);
2759 } else {
2760 max_bits = (sizeof sregs->interrupt_bitmap) << 3;
2761 pending_vec = find_first_bit(
2762 (const unsigned long *)sregs->interrupt_bitmap,
2763 max_bits);
2764 /* Only pending external irq is handled here */
2765 if (pending_vec < max_bits) {
2766 kvm_x86_ops->set_irq(vcpu, pending_vec);
2767 pr_debug("Set back pending irq %d\n",
2768 pending_vec);
2769 }
2770 }
2771
2772 set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
2773 set_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
2774 set_segment(vcpu, &sregs->es, VCPU_SREG_ES);
2775 set_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
2776 set_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
2777 set_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
2778
2779 set_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
2780 set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
2781
2782 vcpu_put(vcpu);
2783
2784 return 0;
2785}
2786
2787int kvm_arch_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu,
2788 struct kvm_debug_guest *dbg)
2789{
2790 int r;
2791
2792 vcpu_load(vcpu);
2793
2794 r = kvm_x86_ops->set_guest_debug(vcpu, dbg);
2795
2796 vcpu_put(vcpu);
2797
2798 return r;
2799}
2800
2801/*
2802 * fxsave fpu state. Taken from x86_64/processor.h. To be killed when
2803 * we have asm/x86/processor.h
2804 */
2805struct fxsave {
2806 u16 cwd;
2807 u16 swd;
2808 u16 twd;
2809 u16 fop;
2810 u64 rip;
2811 u64 rdp;
2812 u32 mxcsr;
2813 u32 mxcsr_mask;
2814 u32 st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */
2815#ifdef CONFIG_X86_64
2816 u32 xmm_space[64]; /* 16*16 bytes for each XMM-reg = 256 bytes */
2817#else
2818 u32 xmm_space[32]; /* 8*16 bytes for each XMM-reg = 128 bytes */
2819#endif
2820};
2821
2822/*
2823 * Translate a guest virtual address to a guest physical address.
2824 */
2825int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
2826 struct kvm_translation *tr)
2827{
2828 unsigned long vaddr = tr->linear_address;
2829 gpa_t gpa;
2830
2831 vcpu_load(vcpu);
2832 mutex_lock(&vcpu->kvm->lock);
2833 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, vaddr);
2834 tr->physical_address = gpa;
2835 tr->valid = gpa != UNMAPPED_GVA;
2836 tr->writeable = 1;
2837 tr->usermode = 0;
2838 mutex_unlock(&vcpu->kvm->lock);
2839 vcpu_put(vcpu);
2840
2841 return 0;
2842}
2843
2844int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
2845{
2846 struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image;
2847
2848 vcpu_load(vcpu);
2849
2850 memcpy(fpu->fpr, fxsave->st_space, 128);
2851 fpu->fcw = fxsave->cwd;
2852 fpu->fsw = fxsave->swd;
2853 fpu->ftwx = fxsave->twd;
2854 fpu->last_opcode = fxsave->fop;
2855 fpu->last_ip = fxsave->rip;
2856 fpu->last_dp = fxsave->rdp;
2857 memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space);
2858
2859 vcpu_put(vcpu);
2860
2861 return 0;
2862}
2863
2864int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
2865{
2866 struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image;
2867
2868 vcpu_load(vcpu);
2869
2870 memcpy(fxsave->st_space, fpu->fpr, 128);
2871 fxsave->cwd = fpu->fcw;
2872 fxsave->swd = fpu->fsw;
2873 fxsave->twd = fpu->ftwx;
2874 fxsave->fop = fpu->last_opcode;
2875 fxsave->rip = fpu->last_ip;
2876 fxsave->rdp = fpu->last_dp;
2877 memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space);
2878
2879 vcpu_put(vcpu);
2880
2881 return 0;
2882}
2883
2884void fx_init(struct kvm_vcpu *vcpu)
2885{
2886 unsigned after_mxcsr_mask;
2887
2888 /* Initialize guest FPU by resetting ours and saving into guest's */
2889 preempt_disable();
2890 fx_save(&vcpu->arch.host_fx_image);
2891 fpu_init();
2892 fx_save(&vcpu->arch.guest_fx_image);
2893 fx_restore(&vcpu->arch.host_fx_image);
2894 preempt_enable();
2895
2896 vcpu->arch.cr0 |= X86_CR0_ET;
2897 after_mxcsr_mask = offsetof(struct i387_fxsave_struct, st_space);
2898 vcpu->arch.guest_fx_image.mxcsr = 0x1f80;
2899 memset((void *)&vcpu->arch.guest_fx_image + after_mxcsr_mask,
2900 0, sizeof(struct i387_fxsave_struct) - after_mxcsr_mask);
2901}
2902EXPORT_SYMBOL_GPL(fx_init);
2903
2904void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
2905{
2906 if (!vcpu->fpu_active || vcpu->guest_fpu_loaded)
2907 return;
2908
2909 vcpu->guest_fpu_loaded = 1;
2910 fx_save(&vcpu->arch.host_fx_image);
2911 fx_restore(&vcpu->arch.guest_fx_image);
2912}
2913EXPORT_SYMBOL_GPL(kvm_load_guest_fpu);
2914
2915void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
2916{
2917 if (!vcpu->guest_fpu_loaded)
2918 return;
2919
2920 vcpu->guest_fpu_loaded = 0;
2921 fx_save(&vcpu->arch.guest_fx_image);
2922 fx_restore(&vcpu->arch.host_fx_image);
2923 ++vcpu->stat.fpu_reload;
2924}
2925EXPORT_SYMBOL_GPL(kvm_put_guest_fpu);
2926
2927void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
2928{
2929 kvm_x86_ops->vcpu_free(vcpu);
2930}
2931
2932struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
2933 unsigned int id)
2934{
2935 return kvm_x86_ops->vcpu_create(kvm, id);
2936}
2937
2938int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
2939{
2940 int r;
2941
2942 /* We do fxsave: this must be aligned. */
2943 BUG_ON((unsigned long)&vcpu->arch.host_fx_image & 0xF);
2944
2945 vcpu_load(vcpu);
2946 r = kvm_arch_vcpu_reset(vcpu);
2947 if (r == 0)
2948 r = kvm_mmu_setup(vcpu);
2949 vcpu_put(vcpu);
2950 if (r < 0)
2951 goto free_vcpu;
2952
2953 return 0;
2954free_vcpu:
2955 kvm_x86_ops->vcpu_free(vcpu);
2956 return r;
2957}
2958
2959void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
2960{
2961 vcpu_load(vcpu);
2962 kvm_mmu_unload(vcpu);
2963 vcpu_put(vcpu);
2964
2965 kvm_x86_ops->vcpu_free(vcpu);
2966}
2967
2968int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
2969{
2970 return kvm_x86_ops->vcpu_reset(vcpu);
2971}
2972
2973void kvm_arch_hardware_enable(void *garbage)
2974{
2975 kvm_x86_ops->hardware_enable(garbage);
2976}
2977
2978void kvm_arch_hardware_disable(void *garbage)
2979{
2980 kvm_x86_ops->hardware_disable(garbage);
2981}
2982
2983int kvm_arch_hardware_setup(void)
2984{
2985 return kvm_x86_ops->hardware_setup();
2986}
2987
2988void kvm_arch_hardware_unsetup(void)
2989{
2990 kvm_x86_ops->hardware_unsetup();
2991}
2992
2993void kvm_arch_check_processor_compat(void *rtn)
2994{
2995 kvm_x86_ops->check_processor_compatibility(rtn);
2996}
2997
2998int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
2999{
3000 struct page *page;
3001 struct kvm *kvm;
3002 int r;
3003
3004 BUG_ON(vcpu->kvm == NULL);
3005 kvm = vcpu->kvm;
3006
3007 vcpu->arch.mmu.root_hpa = INVALID_PAGE;
3008 if (!irqchip_in_kernel(kvm) || vcpu->vcpu_id == 0)
3009 vcpu->arch.mp_state = VCPU_MP_STATE_RUNNABLE;
3010 else
3011 vcpu->arch.mp_state = VCPU_MP_STATE_UNINITIALIZED;
3012
3013 page = alloc_page(GFP_KERNEL | __GFP_ZERO);
3014 if (!page) {
3015 r = -ENOMEM;
3016 goto fail;
3017 }
3018 vcpu->arch.pio_data = page_address(page);
3019
3020 r = kvm_mmu_create(vcpu);
3021 if (r < 0)
3022 goto fail_free_pio_data;
3023
3024 if (irqchip_in_kernel(kvm)) {
3025 r = kvm_create_lapic(vcpu);
3026 if (r < 0)
3027 goto fail_mmu_destroy;
3028 }
3029
3030 return 0;
3031
3032fail_mmu_destroy:
3033 kvm_mmu_destroy(vcpu);
3034fail_free_pio_data:
3035 free_page((unsigned long)vcpu->arch.pio_data);
3036fail:
3037 return r;
3038}
3039
3040void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
3041{
3042 kvm_free_lapic(vcpu);
3043 kvm_mmu_destroy(vcpu);
3044 free_page((unsigned long)vcpu->arch.pio_data);
3045}
3046
3047struct kvm *kvm_arch_create_vm(void)
3048{
3049 struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
3050
3051 if (!kvm)
3052 return ERR_PTR(-ENOMEM);
3053
3054 INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
3055
3056 return kvm;
3057}
3058
3059static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
3060{
3061 vcpu_load(vcpu);
3062 kvm_mmu_unload(vcpu);
3063 vcpu_put(vcpu);
3064}
3065
3066static void kvm_free_vcpus(struct kvm *kvm)
3067{
3068 unsigned int i;
3069
3070 /*
3071 * Unpin any mmu pages first.
3072 */
3073 for (i = 0; i < KVM_MAX_VCPUS; ++i)
3074 if (kvm->vcpus[i])
3075 kvm_unload_vcpu_mmu(kvm->vcpus[i]);
3076 for (i = 0; i < KVM_MAX_VCPUS; ++i) {
3077 if (kvm->vcpus[i]) {
3078 kvm_arch_vcpu_free(kvm->vcpus[i]);
3079 kvm->vcpus[i] = NULL;
3080 }
3081 }
3082
3083}
3084
3085void kvm_arch_destroy_vm(struct kvm *kvm)
3086{
3087 kfree(kvm->arch.vpic);
3088 kfree(kvm->arch.vioapic);
3089 kvm_free_vcpus(kvm);
3090 kvm_free_physmem(kvm);
3091 kfree(kvm);
3092}
3093
3094int kvm_arch_set_memory_region(struct kvm *kvm,
3095 struct kvm_userspace_memory_region *mem,
3096 struct kvm_memory_slot old,
3097 int user_alloc)
3098{
3099 int npages = mem->memory_size >> PAGE_SHIFT;
3100 struct kvm_memory_slot *memslot = &kvm->memslots[mem->slot];
3101
3102	/* To keep backward compatibility with older userspace,
3103	 * x86 needs to handle the !user_alloc case.
3104 */
3105 if (!user_alloc) {
3106 if (npages && !old.rmap) {
3107 down_write(&current->mm->mmap_sem);
3108 memslot->userspace_addr = do_mmap(NULL, 0,
3109 npages * PAGE_SIZE,
3110 PROT_READ | PROT_WRITE,
3111 MAP_SHARED | MAP_ANONYMOUS,
3112 0);
3113 up_write(&current->mm->mmap_sem);
3114
3115 if (IS_ERR((void *)memslot->userspace_addr))
3116 return PTR_ERR((void *)memslot->userspace_addr);
3117 } else {
3118 if (!old.user_alloc && old.rmap) {
3119 int ret;
3120
3121 down_write(&current->mm->mmap_sem);
3122 ret = do_munmap(current->mm, old.userspace_addr,
3123 old.npages * PAGE_SIZE);
3124 up_write(&current->mm->mmap_sem);
3125 if (ret < 0)
3126 printk(KERN_WARNING
3127 "kvm_vm_ioctl_set_memory_region: "
3128 "failed to munmap memory\n");
3129 }
3130 }
3131 }
3132
3133 if (!kvm->arch.n_requested_mmu_pages) {
3134 unsigned int nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm);
3135 kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
3136 }
3137
3138 kvm_mmu_slot_remove_write_access(kvm, mem->slot);
3139 kvm_flush_remote_tlbs(kvm);
3140
3141 return 0;
3142}
3143
3144int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
3145{
3146 return vcpu->arch.mp_state == VCPU_MP_STATE_RUNNABLE
3147 || vcpu->arch.mp_state == VCPU_MP_STATE_SIPI_RECEIVED;
3148}
diff --git a/drivers/kvm/x86.h b/drivers/kvm/x86.h
deleted file mode 100644
index dfb8091971a9..000000000000
--- a/drivers/kvm/x86.h
+++ /dev/null
@@ -1,602 +0,0 @@
1/*
2 * Kernel-based Virtual Machine driver for Linux
3 *
4 * This header defines architecture specific interfaces, x86 version
5 *
6 * This work is licensed under the terms of the GNU GPL, version 2. See
7 * the COPYING file in the top-level directory.
8 *
9 */
10
11#ifndef KVM_X86_H
12#define KVM_X86_H
13
14#include <linux/types.h>
15#include <linux/mm.h>
16
17#include <linux/kvm.h>
18#include <linux/kvm_para.h>
19
20#include <asm/desc.h>
21
22#include "types.h"
23
24#define CR3_PAE_RESERVED_BITS ((X86_CR3_PWT | X86_CR3_PCD) - 1)
25#define CR3_NONPAE_RESERVED_BITS ((PAGE_SIZE-1) & ~(X86_CR3_PWT | X86_CR3_PCD))
26#define CR3_L_MODE_RESERVED_BITS (CR3_NONPAE_RESERVED_BITS|0xFFFFFF0000000000ULL)
27
28#define KVM_GUEST_CR0_MASK \
29 (X86_CR0_PG | X86_CR0_PE | X86_CR0_WP | X86_CR0_NE \
30 | X86_CR0_NW | X86_CR0_CD)
31#define KVM_VM_CR0_ALWAYS_ON \
32 (X86_CR0_PG | X86_CR0_PE | X86_CR0_WP | X86_CR0_NE | X86_CR0_TS \
33 | X86_CR0_MP)
34#define KVM_GUEST_CR4_MASK \
35 (X86_CR4_VME | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_PGE | X86_CR4_VMXE)
36#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE)
37#define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE)
38
39#define INVALID_PAGE (~(hpa_t)0)
40#define UNMAPPED_GVA (~(gpa_t)0)
41
42#define DE_VECTOR 0
43#define UD_VECTOR 6
44#define NM_VECTOR 7
45#define DF_VECTOR 8
46#define TS_VECTOR 10
47#define NP_VECTOR 11
48#define SS_VECTOR 12
49#define GP_VECTOR 13
50#define PF_VECTOR 14
51
52#define SELECTOR_TI_MASK (1 << 2)
53#define SELECTOR_RPL_MASK 0x03
54
55#define IOPL_SHIFT 12
56
57#define KVM_ALIAS_SLOTS 4
58
59#define KVM_PERMILLE_MMU_PAGES 20
60#define KVM_MIN_ALLOC_MMU_PAGES 64
61#define KVM_NUM_MMU_PAGES 1024
62#define KVM_MIN_FREE_MMU_PAGES 5
63#define KVM_REFILL_PAGES 25
64#define KVM_MAX_CPUID_ENTRIES 40
65
66extern spinlock_t kvm_lock;
67extern struct list_head vm_list;
68
69struct kvm_vcpu;
70struct kvm;
71
72enum {
73 VCPU_REGS_RAX = 0,
74 VCPU_REGS_RCX = 1,
75 VCPU_REGS_RDX = 2,
76 VCPU_REGS_RBX = 3,
77 VCPU_REGS_RSP = 4,
78 VCPU_REGS_RBP = 5,
79 VCPU_REGS_RSI = 6,
80 VCPU_REGS_RDI = 7,
81#ifdef CONFIG_X86_64
82 VCPU_REGS_R8 = 8,
83 VCPU_REGS_R9 = 9,
84 VCPU_REGS_R10 = 10,
85 VCPU_REGS_R11 = 11,
86 VCPU_REGS_R12 = 12,
87 VCPU_REGS_R13 = 13,
88 VCPU_REGS_R14 = 14,
89 VCPU_REGS_R15 = 15,
90#endif
91 NR_VCPU_REGS
92};
93
94enum {
95 VCPU_SREG_CS,
96 VCPU_SREG_DS,
97 VCPU_SREG_ES,
98 VCPU_SREG_FS,
99 VCPU_SREG_GS,
100 VCPU_SREG_SS,
101 VCPU_SREG_TR,
102 VCPU_SREG_LDTR,
103};
104
105#include "x86_emulate.h"
106
107#define KVM_NR_MEM_OBJS 40
108
109/*
110 * We don't want allocation failures within the mmu code, so we preallocate
111 * enough memory for a single page fault in a cache.
112 */
113struct kvm_mmu_memory_cache {
114 int nobjs;
115 void *objects[KVM_NR_MEM_OBJS];
116};
117
118#define NR_PTE_CHAIN_ENTRIES 5
119
120struct kvm_pte_chain {
121 u64 *parent_ptes[NR_PTE_CHAIN_ENTRIES];
122 struct hlist_node link;
123};
124
125/*
126 * kvm_mmu_page_role, below, is defined as:
127 *
128 * bits 0:3 - total guest paging levels (2-4, or zero for real mode)
129 * bits 4:7 - page table level for this shadow (1-4)
130 * bits 8:9 - page table quadrant for 2-level guests
131 * bit 16 - "metaphysical" - gfn is not a real page (huge page/real mode)
132 * bits 17:19 - common access permissions for all ptes in this shadow page
133 */
134union kvm_mmu_page_role {
135 unsigned word;
136 struct {
137 unsigned glevels : 4;
138 unsigned level : 4;
139 unsigned quadrant : 2;
140 unsigned pad_for_nice_hex_output : 6;
141 unsigned metaphysical : 1;
142 unsigned access : 3;
143 };
144};
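[Editor's note] The comment above spells out how kvm_mmu_page_role packs into a single word. The sketch below reproduces that documented layout with explicit shifts instead of relying on the compiler's bitfield ordering; pack_role() and the chosen field values are illustrative only.

#include <stdio.h>

/*
 * Pack the documented fields by hand:
 * bits 0:3 glevels, 4:7 level, 8:9 quadrant, bit 16 metaphysical,
 * bits 17:19 access (bits 10:15 are padding).
 */
static unsigned pack_role(unsigned glevels, unsigned level, unsigned quadrant,
			  unsigned metaphysical, unsigned access)
{
	return (glevels & 0xf) | ((level & 0xf) << 4) | ((quadrant & 0x3) << 8) |
	       ((metaphysical & 0x1) << 16) | ((access & 0x7) << 17);
}

int main(void)
{
	/* 4-level guest, shadow level 4, quadrant 0, not metaphysical, access 7 */
	printf("%#x\n", pack_role(4, 4, 0, 0, 7));	/* prints 0xe0044 */
	return 0;
}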
145
146struct kvm_mmu_page {
147 struct list_head link;
148 struct hlist_node hash_link;
149
150 /*
151 * The following two entries are used to key the shadow page in the
152 * hash table.
153 */
154 gfn_t gfn;
155 union kvm_mmu_page_role role;
156
157 u64 *spt;
158 /* hold the gfn of each spte inside spt */
159 gfn_t *gfns;
160 unsigned long slot_bitmap; /* One bit set per slot which has memory
161 * in this shadow page.
162 */
163 int multimapped; /* More than one parent_pte? */
164 int root_count; /* Currently serving as active root */
165 union {
166 u64 *parent_pte; /* !multimapped */
167 struct hlist_head parent_ptes; /* multimapped, kvm_pte_chain */
168 };
169};
170
171/*
172 * x86 supports 3 paging modes (4-level 64-bit, 3-level PAE, and 2-level
173 * 32-bit). The kvm_mmu structure abstracts the details of the current mmu
174 * mode.
175 */
176struct kvm_mmu {
177 void (*new_cr3)(struct kvm_vcpu *vcpu);
178 int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err);
179 void (*free)(struct kvm_vcpu *vcpu);
180 gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva);
181 void (*prefetch_page)(struct kvm_vcpu *vcpu,
182 struct kvm_mmu_page *page);
183 hpa_t root_hpa;
184 int root_level;
185 int shadow_root_level;
186
187 u64 *pae_root;
188};
189
190struct kvm_vcpu_arch {
191 u64 host_tsc;
192 int interrupt_window_open;
193 unsigned long irq_summary; /* bit vector: 1 per word in irq_pending */
194 DECLARE_BITMAP(irq_pending, KVM_NR_INTERRUPTS);
195 unsigned long regs[NR_VCPU_REGS]; /* for rsp: vcpu_load_rsp_rip() */
196 unsigned long rip; /* needs vcpu_load_rsp_rip() */
197
198 unsigned long cr0;
199 unsigned long cr2;
200 unsigned long cr3;
201 unsigned long cr4;
202 unsigned long cr8;
203 u64 pdptrs[4]; /* pae */
204 u64 shadow_efer;
205 u64 apic_base;
206 struct kvm_lapic *apic; /* kernel irqchip context */
207#define VCPU_MP_STATE_RUNNABLE 0
208#define VCPU_MP_STATE_UNINITIALIZED 1
209#define VCPU_MP_STATE_INIT_RECEIVED 2
210#define VCPU_MP_STATE_SIPI_RECEIVED 3
211#define VCPU_MP_STATE_HALTED 4
212 int mp_state;
213 int sipi_vector;
214 u64 ia32_misc_enable_msr;
215
216 struct kvm_mmu mmu;
217
218 struct kvm_mmu_memory_cache mmu_pte_chain_cache;
219 struct kvm_mmu_memory_cache mmu_rmap_desc_cache;
220 struct kvm_mmu_memory_cache mmu_page_cache;
221 struct kvm_mmu_memory_cache mmu_page_header_cache;
222
223 gfn_t last_pt_write_gfn;
224 int last_pt_write_count;
225 u64 *last_pte_updated;
226
227 struct i387_fxsave_struct host_fx_image;
228 struct i387_fxsave_struct guest_fx_image;
229
230 gva_t mmio_fault_cr2;
231 struct kvm_pio_request pio;
232 void *pio_data;
233
234 struct kvm_queued_exception {
235 bool pending;
236 bool has_error_code;
237 u8 nr;
238 u32 error_code;
239 } exception;
240
241 struct {
242 int active;
243 u8 save_iopl;
244 struct kvm_save_segment {
245 u16 selector;
246 unsigned long base;
247 u32 limit;
248 u32 ar;
249 } tr, es, ds, fs, gs;
250 } rmode;
251 int halt_request; /* real mode on Intel only */
252
253 int cpuid_nent;
254 struct kvm_cpuid_entry2 cpuid_entries[KVM_MAX_CPUID_ENTRIES];
255 /* emulate context */
256
257 struct x86_emulate_ctxt emulate_ctxt;
258};
259
260struct kvm_mem_alias {
261 gfn_t base_gfn;
262 unsigned long npages;
263 gfn_t target_gfn;
264};
265
266struct kvm_arch {
267 int naliases;
268 struct kvm_mem_alias aliases[KVM_ALIAS_SLOTS];
269
270 unsigned int n_free_mmu_pages;
271 unsigned int n_requested_mmu_pages;
272 unsigned int n_alloc_mmu_pages;
273 struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
274 /*
275 * Hash table of struct kvm_mmu_page.
276 */
277 struct list_head active_mmu_pages;
278 struct kvm_pic *vpic;
279 struct kvm_ioapic *vioapic;
280
281 int round_robin_prev_vcpu;
282 unsigned int tss_addr;
283 struct page *apic_access_page;
284};
285
286struct kvm_vm_stat {
287 u32 mmu_shadow_zapped;
288 u32 mmu_pte_write;
289 u32 mmu_pte_updated;
290 u32 mmu_pde_zapped;
291 u32 mmu_flooded;
292 u32 mmu_recycled;
293 u32 remote_tlb_flush;
294};
295
296struct kvm_vcpu_stat {
297 u32 pf_fixed;
298 u32 pf_guest;
299 u32 tlb_flush;
300 u32 invlpg;
301
302 u32 exits;
303 u32 io_exits;
304 u32 mmio_exits;
305 u32 signal_exits;
306 u32 irq_window_exits;
307 u32 halt_exits;
308 u32 halt_wakeup;
309 u32 request_irq_exits;
310 u32 irq_exits;
311 u32 host_state_reload;
312 u32 efer_reload;
313 u32 fpu_reload;
314 u32 insn_emulation;
315 u32 insn_emulation_fail;
316};
317
318struct descriptor_table {
319 u16 limit;
320 unsigned long base;
321} __attribute__((packed));
322
323struct kvm_x86_ops {
324 int (*cpu_has_kvm_support)(void); /* __init */
325 int (*disabled_by_bios)(void); /* __init */
326 void (*hardware_enable)(void *dummy); /* __init */
327 void (*hardware_disable)(void *dummy);
328 void (*check_processor_compatibility)(void *rtn);
329 int (*hardware_setup)(void); /* __init */
330 void (*hardware_unsetup)(void); /* __exit */
331
332 /* Create, but do not attach this VCPU */
333 struct kvm_vcpu *(*vcpu_create)(struct kvm *kvm, unsigned id);
334 void (*vcpu_free)(struct kvm_vcpu *vcpu);
335 int (*vcpu_reset)(struct kvm_vcpu *vcpu);
336
337 void (*prepare_guest_switch)(struct kvm_vcpu *vcpu);
338 void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu);
339 void (*vcpu_put)(struct kvm_vcpu *vcpu);
340 void (*vcpu_decache)(struct kvm_vcpu *vcpu);
341
342 int (*set_guest_debug)(struct kvm_vcpu *vcpu,
343 struct kvm_debug_guest *dbg);
344 void (*guest_debug_pre)(struct kvm_vcpu *vcpu);
345 int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata);
346 int (*set_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
347 u64 (*get_segment_base)(struct kvm_vcpu *vcpu, int seg);
348 void (*get_segment)(struct kvm_vcpu *vcpu,
349 struct kvm_segment *var, int seg);
350 void (*set_segment)(struct kvm_vcpu *vcpu,
351 struct kvm_segment *var, int seg);
352 void (*get_cs_db_l_bits)(struct kvm_vcpu *vcpu, int *db, int *l);
353 void (*decache_cr4_guest_bits)(struct kvm_vcpu *vcpu);
354 void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0);
355 void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3);
356 void (*set_cr4)(struct kvm_vcpu *vcpu, unsigned long cr4);
357 void (*set_efer)(struct kvm_vcpu *vcpu, u64 efer);
358 void (*get_idt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt);
359 void (*set_idt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt);
360 void (*get_gdt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt);
361 void (*set_gdt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt);
362 unsigned long (*get_dr)(struct kvm_vcpu *vcpu, int dr);
363 void (*set_dr)(struct kvm_vcpu *vcpu, int dr, unsigned long value,
364 int *exception);
365 void (*cache_regs)(struct kvm_vcpu *vcpu);
366 void (*decache_regs)(struct kvm_vcpu *vcpu);
367 unsigned long (*get_rflags)(struct kvm_vcpu *vcpu);
368 void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags);
369
370 void (*tlb_flush)(struct kvm_vcpu *vcpu);
371
372 void (*run)(struct kvm_vcpu *vcpu, struct kvm_run *run);
373 int (*handle_exit)(struct kvm_run *run, struct kvm_vcpu *vcpu);
374 void (*skip_emulated_instruction)(struct kvm_vcpu *vcpu);
375 void (*patch_hypercall)(struct kvm_vcpu *vcpu,
376 unsigned char *hypercall_addr);
377 int (*get_irq)(struct kvm_vcpu *vcpu);
378 void (*set_irq)(struct kvm_vcpu *vcpu, int vec);
379 void (*queue_exception)(struct kvm_vcpu *vcpu, unsigned nr,
380 bool has_error_code, u32 error_code);
381 bool (*exception_injected)(struct kvm_vcpu *vcpu);
382 void (*inject_pending_irq)(struct kvm_vcpu *vcpu);
383 void (*inject_pending_vectors)(struct kvm_vcpu *vcpu,
384 struct kvm_run *run);
385
386 int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
387};
388
389extern struct kvm_x86_ops *kvm_x86_ops;
390
391int kvm_mmu_module_init(void);
392void kvm_mmu_module_exit(void);
393
394void kvm_mmu_destroy(struct kvm_vcpu *vcpu);
395int kvm_mmu_create(struct kvm_vcpu *vcpu);
396int kvm_mmu_setup(struct kvm_vcpu *vcpu);
397void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte);
398
399int kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
400void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot);
401void kvm_mmu_zap_all(struct kvm *kvm);
402unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm);
403void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages);
404
405enum emulation_result {
406 EMULATE_DONE, /* no further processing */
407 EMULATE_DO_MMIO, /* kvm_run filled with mmio request */
408 EMULATE_FAIL, /* can't emulate this instruction */
409};
410
411int emulate_instruction(struct kvm_vcpu *vcpu, struct kvm_run *run,
412 unsigned long cr2, u16 error_code, int no_decode);
413void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context);
414void realmode_lgdt(struct kvm_vcpu *vcpu, u16 size, unsigned long address);
415void realmode_lidt(struct kvm_vcpu *vcpu, u16 size, unsigned long address);
416void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
417 unsigned long *rflags);
418
419unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr);
420void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long value,
421 unsigned long *rflags);
422int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *data);
423int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
424
425struct x86_emulate_ctxt;
426
427int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
428 int size, unsigned port);
429int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
430 int size, unsigned long count, int down,
431 gva_t address, int rep, unsigned port);
432void kvm_emulate_cpuid(struct kvm_vcpu *vcpu);
433int kvm_emulate_halt(struct kvm_vcpu *vcpu);
434int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address);
435int emulate_clts(struct kvm_vcpu *vcpu);
436int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr,
437 unsigned long *dest);
438int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr,
439 unsigned long value);
440
441void set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
442void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3);
443void set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
444void set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8);
445unsigned long get_cr8(struct kvm_vcpu *vcpu);
446void lmsw(struct kvm_vcpu *vcpu, unsigned long msw);
447void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l);
448
449int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata);
450int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data);
451
452void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr);
453void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code);
454void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long cr2,
455 u32 error_code);
456
457void fx_init(struct kvm_vcpu *vcpu);
458
459int emulator_read_std(unsigned long addr,
460 void *val,
461 unsigned int bytes,
462 struct kvm_vcpu *vcpu);
463int emulator_write_emulated(unsigned long addr,
464 const void *val,
465 unsigned int bytes,
466 struct kvm_vcpu *vcpu);
467
468unsigned long segment_base(u16 selector);
469
470void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu);
471void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
472 const u8 *new, int bytes);
473int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva);
474void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu);
475int kvm_mmu_load(struct kvm_vcpu *vcpu);
476void kvm_mmu_unload(struct kvm_vcpu *vcpu);
477
478int kvm_emulate_hypercall(struct kvm_vcpu *vcpu);
479
480int kvm_fix_hypercall(struct kvm_vcpu *vcpu);
481
482int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code);
483
484int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3);
485int complete_pio(struct kvm_vcpu *vcpu);
486
487static inline struct kvm_mmu_page *page_header(hpa_t shadow_page)
488{
489 struct page *page = pfn_to_page(shadow_page >> PAGE_SHIFT);
490
491 return (struct kvm_mmu_page *)page_private(page);
492}
493
494static inline u16 read_fs(void)
495{
496 u16 seg;
497 asm("mov %%fs, %0" : "=g"(seg));
498 return seg;
499}
500
501static inline u16 read_gs(void)
502{
503 u16 seg;
504 asm("mov %%gs, %0" : "=g"(seg));
505 return seg;
506}
507
508static inline u16 read_ldt(void)
509{
510 u16 ldt;
511 asm("sldt %0" : "=g"(ldt));
512 return ldt;
513}
514
515static inline void load_fs(u16 sel)
516{
517 asm("mov %0, %%fs" : : "rm"(sel));
518}
519
520static inline void load_gs(u16 sel)
521{
522 asm("mov %0, %%gs" : : "rm"(sel));
523}
524
525#ifndef load_ldt
526static inline void load_ldt(u16 sel)
527{
528 asm("lldt %0" : : "rm"(sel));
529}
530#endif
531
532static inline void get_idt(struct descriptor_table *table)
533{
534 asm("sidt %0" : "=m"(*table));
535}
536
537static inline void get_gdt(struct descriptor_table *table)
538{
539 asm("sgdt %0" : "=m"(*table));
540}
541
542static inline unsigned long read_tr_base(void)
543{
544 u16 tr;
545 asm("str %0" : "=g"(tr));
546 return segment_base(tr);
547}
548
549#ifdef CONFIG_X86_64
550static inline unsigned long read_msr(unsigned long msr)
551{
552 u64 value;
553
554 rdmsrl(msr, value);
555 return value;
556}
557#endif
558
559static inline void fx_save(struct i387_fxsave_struct *image)
560{
561 asm("fxsave (%0)":: "r" (image));
562}
563
564static inline void fx_restore(struct i387_fxsave_struct *image)
565{
566 asm("fxrstor (%0)":: "r" (image));
567}
568
569static inline void fpu_init(void)
570{
571 asm("finit");
572}
573
574static inline u32 get_rdx_init_val(void)
575{
576 return 0x600; /* P6 family */
577}
578
579static inline void kvm_inject_gp(struct kvm_vcpu *vcpu, u32 error_code)
580{
581 kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
582}
583
584#define ASM_VMX_VMCLEAR_RAX ".byte 0x66, 0x0f, 0xc7, 0x30"
585#define ASM_VMX_VMLAUNCH ".byte 0x0f, 0x01, 0xc2"
586#define ASM_VMX_VMRESUME ".byte 0x0f, 0x01, 0xc3"
587#define ASM_VMX_VMPTRLD_RAX ".byte 0x0f, 0xc7, 0x30"
588#define ASM_VMX_VMREAD_RDX_RAX ".byte 0x0f, 0x78, 0xd0"
589#define ASM_VMX_VMWRITE_RAX_RDX ".byte 0x0f, 0x79, 0xd0"
590#define ASM_VMX_VMWRITE_RSP_RDX ".byte 0x0f, 0x79, 0xd4"
591#define ASM_VMX_VMXOFF ".byte 0x0f, 0x01, 0xc4"
592#define ASM_VMX_VMXON_RAX ".byte 0xf3, 0x0f, 0xc7, 0x30"
593
594#define MSR_IA32_TIME_STAMP_COUNTER 0x010
595
596#define TSS_IOPB_BASE_OFFSET 0x66
597#define TSS_BASE_SIZE 0x68
598#define TSS_IOPB_SIZE (65536 / 8)
599#define TSS_REDIRECTION_SIZE (256 / 8)
600#define RMODE_TSS_SIZE (TSS_BASE_SIZE + TSS_REDIRECTION_SIZE + TSS_IOPB_SIZE + 1)
601
602#endif
diff --git a/drivers/kvm/x86_emulate.c b/drivers/kvm/x86_emulate.c
deleted file mode 100644
index 50b133f68743..000000000000
--- a/drivers/kvm/x86_emulate.c
+++ /dev/null
@@ -1,1913 +0,0 @@
1/******************************************************************************
2 * x86_emulate.c
3 *
4 * Generic x86 (32-bit and 64-bit) instruction decoder and emulator.
5 *
6 * Copyright (c) 2005 Keir Fraser
7 *
8 * Linux coding style, mod r/m decoder, segment base fixes, real-mode
9 * privileged instructions:
10 *
11 * Copyright (C) 2006 Qumranet
12 *
13 * Avi Kivity <avi@qumranet.com>
14 * Yaniv Kamay <yaniv@qumranet.com>
15 *
16 * This work is licensed under the terms of the GNU GPL, version 2. See
17 * the COPYING file in the top-level directory.
18 *
19 * From: xen-unstable 10676:af9809f51f81a3c43f276f00c81a52ef558afda4
20 */
21
22#ifndef __KERNEL__
23#include <stdio.h>
24#include <stdint.h>
25#include <public/xen.h>
26#define DPRINTF(_f, _a ...) printf(_f , ## _a)
27#else
28#include "kvm.h"
29#include "x86.h"
30#define DPRINTF(x...) do {} while (0)
31#endif
32#include "x86_emulate.h"
33#include <linux/module.h>
34
35/*
36 * Opcode effective-address decode tables.
37 * Note that we only emulate instructions that have at least one memory
38 * operand (excluding implicit stack references). We assume that stack
39 * references and instruction fetches will never occur in special memory
40 * areas that require emulation. So, for example, 'mov <imm>,<reg>' need
41 * not be handled.
42 */
43
44/* Operand sizes: 8-bit operands or specified/overridden size. */
45#define ByteOp (1<<0) /* 8-bit operands. */
46/* Destination operand type. */
47#define ImplicitOps (1<<1) /* Implicit in opcode. No generic decode. */
48#define DstReg (2<<1) /* Register operand. */
49#define DstMem (3<<1) /* Memory operand. */
50#define DstMask (3<<1)
51/* Source operand type. */
52#define SrcNone (0<<3) /* No source operand. */
53#define SrcImplicit (0<<3) /* Source operand is implicit in the opcode. */
54#define SrcReg (1<<3) /* Register operand. */
55#define SrcMem (2<<3) /* Memory operand. */
56#define SrcMem16 (3<<3) /* Memory operand (16-bit). */
57#define SrcMem32 (4<<3) /* Memory operand (32-bit). */
58#define SrcImm (5<<3) /* Immediate operand. */
59#define SrcImmByte (6<<3) /* 8-bit sign-extended immediate operand. */
60#define SrcMask (7<<3)
61/* Generic ModRM decode. */
62#define ModRM (1<<6)
63/* Destination is only written; never read. */
64#define Mov (1<<7)
65#define BitOp (1<<8)
66#define MemAbs (1<<9) /* Memory operand is absolute displacement */
67#define String (1<<10) /* String instruction (rep capable) */
68#define Stack (1<<11) /* Stack instruction (push/pop) */
69
70static u16 opcode_table[256] = {
71 /* 0x00 - 0x07 */
72 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
73 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
74 0, 0, 0, 0,
75 /* 0x08 - 0x0F */
76 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
77 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
78 0, 0, 0, 0,
79 /* 0x10 - 0x17 */
80 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
81 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
82 0, 0, 0, 0,
83 /* 0x18 - 0x1F */
84 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
85 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
86 0, 0, 0, 0,
87 /* 0x20 - 0x27 */
88 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
89 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
90 SrcImmByte, SrcImm, 0, 0,
91 /* 0x28 - 0x2F */
92 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
93 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
94 0, 0, 0, 0,
95 /* 0x30 - 0x37 */
96 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
97 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
98 0, 0, 0, 0,
99 /* 0x38 - 0x3F */
100 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
101 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
102 0, 0, 0, 0,
103 /* 0x40 - 0x47 */
104 DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg,
105 /* 0x48 - 0x4F */
106 DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg,
107 /* 0x50 - 0x57 */
108 SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, SrcReg | Stack,
109 SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, SrcReg | Stack,
110 /* 0x58 - 0x5F */
111 DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack,
112 DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack,
113 /* 0x60 - 0x67 */
114 0, 0, 0, DstReg | SrcMem32 | ModRM | Mov /* movsxd (x86/64) */ ,
115 0, 0, 0, 0,
116 /* 0x68 - 0x6F */
117 0, 0, ImplicitOps | Mov | Stack, 0,
118 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* insb, insw/insd */
119 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* outsb, outsw/outsd */
120 /* 0x70 - 0x77 */
121 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
122 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
123 /* 0x78 - 0x7F */
124 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
125 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
126 /* 0x80 - 0x87 */
127 ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM,
128 ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM,
129 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
130 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
131 /* 0x88 - 0x8F */
132 ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov,
133 ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
134 0, ModRM | DstReg, 0, DstMem | SrcNone | ModRM | Mov | Stack,
135 /* 0x90 - 0x9F */
136 0, 0, 0, 0, 0, 0, 0, 0,
137 0, 0, 0, 0, ImplicitOps | Stack, ImplicitOps | Stack, 0, 0,
138 /* 0xA0 - 0xA7 */
139 ByteOp | DstReg | SrcMem | Mov | MemAbs, DstReg | SrcMem | Mov | MemAbs,
140 ByteOp | DstMem | SrcReg | Mov | MemAbs, DstMem | SrcReg | Mov | MemAbs,
141 ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String,
142 ByteOp | ImplicitOps | String, ImplicitOps | String,
143 /* 0xA8 - 0xAF */
144 0, 0, ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String,
145 ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String,
146 ByteOp | ImplicitOps | String, ImplicitOps | String,
147 /* 0xB0 - 0xBF */
148 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
149 /* 0xC0 - 0xC7 */
150 ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM,
151 0, ImplicitOps | Stack, 0, 0,
152 ByteOp | DstMem | SrcImm | ModRM | Mov, DstMem | SrcImm | ModRM | Mov,
153 /* 0xC8 - 0xCF */
154 0, 0, 0, 0, 0, 0, 0, 0,
155 /* 0xD0 - 0xD7 */
156 ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM,
157 ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM,
158 0, 0, 0, 0,
159 /* 0xD8 - 0xDF */
160 0, 0, 0, 0, 0, 0, 0, 0,
161 /* 0xE0 - 0xE7 */
162 0, 0, 0, 0, 0, 0, 0, 0,
163 /* 0xE8 - 0xEF */
164 ImplicitOps | Stack, SrcImm|ImplicitOps, 0, SrcImmByte|ImplicitOps,
165 0, 0, 0, 0,
166 /* 0xF0 - 0xF7 */
167 0, 0, 0, 0,
168 ImplicitOps, ImplicitOps,
169 ByteOp | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM,
170 /* 0xF8 - 0xFF */
171 ImplicitOps, 0, ImplicitOps, ImplicitOps,
172 0, 0, ByteOp | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM
173};
174
175static u16 twobyte_table[256] = {
176 /* 0x00 - 0x0F */
177 0, SrcMem | ModRM | DstReg, 0, 0, 0, 0, ImplicitOps, 0,
178 ImplicitOps, ImplicitOps, 0, 0, 0, ImplicitOps | ModRM, 0, 0,
179 /* 0x10 - 0x1F */
180 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, 0,
181 /* 0x20 - 0x2F */
182 ModRM | ImplicitOps, ModRM, ModRM | ImplicitOps, ModRM, 0, 0, 0, 0,
183 0, 0, 0, 0, 0, 0, 0, 0,
184 /* 0x30 - 0x3F */
185 ImplicitOps, 0, ImplicitOps, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
186 /* 0x40 - 0x47 */
187 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
188 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
189 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
190 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
191 /* 0x48 - 0x4F */
192 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
193 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
194 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
195 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
196 /* 0x50 - 0x5F */
197 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
198 /* 0x60 - 0x6F */
199 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
200 /* 0x70 - 0x7F */
201 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
202 /* 0x80 - 0x8F */
203 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
204 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
205 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
206 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
207 /* 0x90 - 0x9F */
208 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
209 /* 0xA0 - 0xA7 */
210 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 0, 0, 0, 0,
211 /* 0xA8 - 0xAF */
212 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 0, 0, 0, 0,
213 /* 0xB0 - 0xB7 */
214 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 0,
215 DstMem | SrcReg | ModRM | BitOp,
216 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov,
217 DstReg | SrcMem16 | ModRM | Mov,
218 /* 0xB8 - 0xBF */
219 0, 0, DstMem | SrcImmByte | ModRM, DstMem | SrcReg | ModRM | BitOp,
220 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov,
221 DstReg | SrcMem16 | ModRM | Mov,
222 /* 0xC0 - 0xCF */
223 0, 0, 0, DstMem | SrcReg | ModRM | Mov, 0, 0, 0, ImplicitOps | ModRM,
224 0, 0, 0, 0, 0, 0, 0, 0,
225 /* 0xD0 - 0xDF */
226 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
227 /* 0xE0 - 0xEF */
228 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
229 /* 0xF0 - 0xFF */
230 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
231};
232
233/* EFLAGS bit definitions. */
234#define EFLG_OF (1<<11)
235#define EFLG_DF (1<<10)
236#define EFLG_SF (1<<7)
237#define EFLG_ZF (1<<6)
238#define EFLG_AF (1<<4)
239#define EFLG_PF (1<<2)
240#define EFLG_CF (1<<0)
241
242/*
243 * Instruction emulation:
244 * Most instructions are emulated directly via a fragment of inline assembly
245 * code. This allows us to save/restore EFLAGS and thus very easily pick up
246 * any modified flags.
247 */
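As a rough user-space illustration of the capture-EFLAGS idea (a stand-alone sketch, not the kernel macros that follow; it assumes an x86-64 host and GCC-style inline assembly, and it omits the seeding of EFLAGS from the saved guest value that the real _PRE_EFLAGS step performs):

	#include <stdio.h>

	static unsigned long add_and_capture_flags(unsigned long a, unsigned long b,
						   unsigned long *flags)
	{
		unsigned long out = a, f;

		__asm__ __volatile__ (
			"addq %2, %0\n\t"	/* the emulated operation */
			"pushfq\n\t"		/* capture the resulting EFLAGS */
			"popq %1"
			: "+r" (out), "=r" (f)
			: "r" (b)
			: "cc");
		*flags = f;
		return out;
	}

	int main(void)
	{
		unsigned long flags;
		unsigned long sum = add_and_capture_flags(~0UL, 1, &flags);

		printf("sum=%lu CF=%lu\n", sum, flags & 1);	/* expect sum=0, CF=1 */
		return 0;
	}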
248
249#if defined(CONFIG_X86_64)
250#define _LO32 "k" /* force 32-bit operand */
251#define _STK "%%rsp" /* stack pointer */
252#elif defined(__i386__)
253#define _LO32 "" /* force 32-bit operand */
254#define _STK "%%esp" /* stack pointer */
255#endif
256
257/*
258 * These EFLAGS bits are restored from saved value during emulation, and
259 * any changes are written back to the saved value after emulation.
260 */
261#define EFLAGS_MASK (EFLG_OF|EFLG_SF|EFLG_ZF|EFLG_AF|EFLG_PF|EFLG_CF)
262
263/* Before executing instruction: restore necessary bits in EFLAGS. */
264#define _PRE_EFLAGS(_sav, _msk, _tmp) \
265 /* EFLAGS = (_sav & _msk) | (EFLAGS & ~_msk); _sav &= ~_msk; */ \
266 "movl %"_sav",%"_LO32 _tmp"; " \
267 "push %"_tmp"; " \
268 "push %"_tmp"; " \
269 "movl %"_msk",%"_LO32 _tmp"; " \
270 "andl %"_LO32 _tmp",("_STK"); " \
271 "pushf; " \
272 "notl %"_LO32 _tmp"; " \
273 "andl %"_LO32 _tmp",("_STK"); " \
274 "andl %"_LO32 _tmp","__stringify(BITS_PER_LONG/4)"("_STK"); " \
275 "pop %"_tmp"; " \
276 "orl %"_LO32 _tmp",("_STK"); " \
277 "popf; " \
278 "pop %"_sav"; "
279
280/* After executing instruction: write-back necessary bits in EFLAGS. */
281#define _POST_EFLAGS(_sav, _msk, _tmp) \
282 /* _sav |= EFLAGS & _msk; */ \
283 "pushf; " \
284 "pop %"_tmp"; " \
285 "andl %"_msk",%"_LO32 _tmp"; " \
286 "orl %"_LO32 _tmp",%"_sav"; "
287
288/* Raw emulation: instruction has two explicit operands. */
289#define __emulate_2op_nobyte(_op,_src,_dst,_eflags,_wx,_wy,_lx,_ly,_qx,_qy) \
290 do { \
291 unsigned long _tmp; \
292 \
293 switch ((_dst).bytes) { \
294 case 2: \
295 __asm__ __volatile__ ( \
296 _PRE_EFLAGS("0", "4", "2") \
297 _op"w %"_wx"3,%1; " \
298 _POST_EFLAGS("0", "4", "2") \
299 : "=m" (_eflags), "=m" ((_dst).val), \
300 "=&r" (_tmp) \
301 : _wy ((_src).val), "i" (EFLAGS_MASK)); \
302 break; \
303 case 4: \
304 __asm__ __volatile__ ( \
305 _PRE_EFLAGS("0", "4", "2") \
306 _op"l %"_lx"3,%1; " \
307 _POST_EFLAGS("0", "4", "2") \
308 : "=m" (_eflags), "=m" ((_dst).val), \
309 "=&r" (_tmp) \
310 : _ly ((_src).val), "i" (EFLAGS_MASK)); \
311 break; \
312 case 8: \
313 __emulate_2op_8byte(_op, _src, _dst, \
314 _eflags, _qx, _qy); \
315 break; \
316 } \
317 } while (0)
318
319#define __emulate_2op(_op,_src,_dst,_eflags,_bx,_by,_wx,_wy,_lx,_ly,_qx,_qy) \
320 do { \
321 unsigned long _tmp; \
322 switch ((_dst).bytes) { \
323 case 1: \
324 __asm__ __volatile__ ( \
325 _PRE_EFLAGS("0", "4", "2") \
326 _op"b %"_bx"3,%1; " \
327 _POST_EFLAGS("0", "4", "2") \
328 : "=m" (_eflags), "=m" ((_dst).val), \
329 "=&r" (_tmp) \
330 : _by ((_src).val), "i" (EFLAGS_MASK)); \
331 break; \
332 default: \
333 __emulate_2op_nobyte(_op, _src, _dst, _eflags, \
334 _wx, _wy, _lx, _ly, _qx, _qy); \
335 break; \
336 } \
337 } while (0)
338
339/* Source operand is byte-sized and may be restricted to just %cl. */
340#define emulate_2op_SrcB(_op, _src, _dst, _eflags) \
341 __emulate_2op(_op, _src, _dst, _eflags, \
342 "b", "c", "b", "c", "b", "c", "b", "c")
343
344/* Source operand is byte, word, long or quad sized. */
345#define emulate_2op_SrcV(_op, _src, _dst, _eflags) \
346 __emulate_2op(_op, _src, _dst, _eflags, \
347 "b", "q", "w", "r", _LO32, "r", "", "r")
348
349/* Source operand is word, long or quad sized. */
350#define emulate_2op_SrcV_nobyte(_op, _src, _dst, _eflags) \
351 __emulate_2op_nobyte(_op, _src, _dst, _eflags, \
352 "w", "r", _LO32, "r", "", "r")
353
354/* Instruction has only one explicit operand (no source operand). */
355#define emulate_1op(_op, _dst, _eflags) \
356 do { \
357 unsigned long _tmp; \
358 \
359 switch ((_dst).bytes) { \
360 case 1: \
361 __asm__ __volatile__ ( \
362 _PRE_EFLAGS("0", "3", "2") \
363 _op"b %1; " \
364 _POST_EFLAGS("0", "3", "2") \
365 : "=m" (_eflags), "=m" ((_dst).val), \
366 "=&r" (_tmp) \
367 : "i" (EFLAGS_MASK)); \
368 break; \
369 case 2: \
370 __asm__ __volatile__ ( \
371 _PRE_EFLAGS("0", "3", "2") \
372 _op"w %1; " \
373 _POST_EFLAGS("0", "3", "2") \
374 : "=m" (_eflags), "=m" ((_dst).val), \
375 "=&r" (_tmp) \
376 : "i" (EFLAGS_MASK)); \
377 break; \
378 case 4: \
379 __asm__ __volatile__ ( \
380 _PRE_EFLAGS("0", "3", "2") \
381 _op"l %1; " \
382 _POST_EFLAGS("0", "3", "2") \
383 : "=m" (_eflags), "=m" ((_dst).val), \
384 "=&r" (_tmp) \
385 : "i" (EFLAGS_MASK)); \
386 break; \
387 case 8: \
388 __emulate_1op_8byte(_op, _dst, _eflags); \
389 break; \
390 } \
391 } while (0)
392
393/* Emulate an instruction with quadword operands (x86/64 only). */
394#if defined(CONFIG_X86_64)
395#define __emulate_2op_8byte(_op, _src, _dst, _eflags, _qx, _qy) \
396 do { \
397 __asm__ __volatile__ ( \
398 _PRE_EFLAGS("0", "4", "2") \
399 _op"q %"_qx"3,%1; " \
400 _POST_EFLAGS("0", "4", "2") \
401 : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \
402 : _qy ((_src).val), "i" (EFLAGS_MASK)); \
403 } while (0)
404
405#define __emulate_1op_8byte(_op, _dst, _eflags) \
406 do { \
407 __asm__ __volatile__ ( \
408 _PRE_EFLAGS("0", "3", "2") \
409 _op"q %1; " \
410 _POST_EFLAGS("0", "3", "2") \
411 : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \
412 : "i" (EFLAGS_MASK)); \
413 } while (0)
414
415#elif defined(__i386__)
416#define __emulate_2op_8byte(_op, _src, _dst, _eflags, _qx, _qy)
417#define __emulate_1op_8byte(_op, _dst, _eflags)
418#endif /* __i386__ */
419
420/* Fetch next part of the instruction being emulated. */
421#define insn_fetch(_type, _size, _eip) \
422({ unsigned long _x; \
423 rc = do_insn_fetch(ctxt, ops, (_eip), &_x, (_size)); \
424 if (rc != 0) \
425 goto done; \
426 (_eip) += (_size); \
427 (_type)_x; \
428})
429
430/* Access/update address held in a register, based on addressing mode. */
431#define address_mask(reg) \
432 ((c->ad_bytes == sizeof(unsigned long)) ? \
433 (reg) : ((reg) & ((1UL << (c->ad_bytes << 3)) - 1)))
434#define register_address(base, reg) \
435 ((base) + address_mask(reg))
436#define register_address_increment(reg, inc) \
437 do { \
438 /* signed type ensures sign extension to long */ \
439 int _inc = (inc); \
440 if (c->ad_bytes == sizeof(unsigned long)) \
441 (reg) += _inc; \
442 else \
443 (reg) = ((reg) & \
444 ~((1UL << (c->ad_bytes << 3)) - 1)) | \
445 (((reg) + _inc) & \
446 ((1UL << (c->ad_bytes << 3)) - 1)); \
447 } while (0)
448
449#define JMP_REL(rel) \
450 do { \
451 register_address_increment(c->eip, rel); \
452 } while (0)
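A quick worked example of the masking these helpers perform, assuming 16-bit addressing (ad_bytes == 2): only the low 16 bits of the register participate, so an increment wraps within them while the upper bits are preserved. A stand-alone sketch:

	#include <stdio.h>

	int main(void)
	{
		unsigned long reg = 0x1234ffffUL;	/* upper bits must survive */
		unsigned ad_bytes = 2;			/* 16-bit address size */
		unsigned long mask = (1UL << (ad_bytes << 3)) - 1;

		reg = (reg & ~mask) | ((reg + 1) & mask);
		printf("%#lx\n", reg);			/* prints 0x12340000 */
		return 0;
	}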
453
454static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt,
455 struct x86_emulate_ops *ops,
456 unsigned long linear, u8 *dest)
457{
458 struct fetch_cache *fc = &ctxt->decode.fetch;
459 int rc;
460 int size;
461
462 if (linear < fc->start || linear >= fc->end) {
463 size = min(15UL, PAGE_SIZE - offset_in_page(linear));
464 rc = ops->read_std(linear, fc->data, size, ctxt->vcpu);
465 if (rc)
466 return rc;
467 fc->start = linear;
468 fc->end = linear + size;
469 }
470 *dest = fc->data[linear - fc->start];
471 return 0;
472}
473
474static int do_insn_fetch(struct x86_emulate_ctxt *ctxt,
475 struct x86_emulate_ops *ops,
476 unsigned long eip, void *dest, unsigned size)
477{
478 int rc = 0;
479
480 eip += ctxt->cs_base;
481 while (size--) {
482 rc = do_fetch_insn_byte(ctxt, ops, eip++, dest++);
483 if (rc)
484 return rc;
485 }
486 return 0;
487}
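The pattern above — refill a small window only when the requested byte falls outside [start, end) — can be sketched stand-alone as follows (the names and the flat backing buffer are hypothetical and stand in for the guest-memory read; the real code also limits the window so it never crosses a page boundary):

	#include <stdio.h>
	#include <string.h>

	static unsigned char backing[4096];		/* stand-in for guest memory */

	struct byte_cache {
		unsigned char data[15];
		unsigned long start, end;		/* cached window: [start, end) */
	};

	static int cached_fetch(struct byte_cache *c, unsigned long addr,
				unsigned char *out)
	{
		if (addr < c->start || addr >= c->end) {
			unsigned long n = sizeof(c->data);

			if (addr + n > sizeof(backing))
				n = sizeof(backing) - addr;
			memcpy(c->data, backing + addr, n);	/* one "slow" read */
			c->start = addr;
			c->end = addr + n;
		}
		*out = c->data[addr - c->start];
		return 0;
	}

	int main(void)
	{
		struct byte_cache c = { .start = 0, .end = 0 };
		unsigned char b;

		backing[100] = 0x0f;
		backing[101] = 0xa2;
		cached_fetch(&c, 100, &b);	/* refills the window once */
		printf("%#x ", b);		/* 0xf */
		cached_fetch(&c, 101, &b);	/* served from the cached window */
		printf("%#x\n", b);		/* 0xa2 */
		return 0;
	}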
488
489/*
490 * Given the 'reg' portion of a ModRM byte, and a register block, return a
491 * pointer into the block that addresses the relevant register.
492 * @highbyte_regs specifies whether to decode AH,CH,DH,BH.
493 */
494static void *decode_register(u8 modrm_reg, unsigned long *regs,
495 int highbyte_regs)
496{
497 void *p;
498
499 p = &regs[modrm_reg];
500 if (highbyte_regs && modrm_reg >= 4 && modrm_reg < 8)
501 p = (unsigned char *)&regs[modrm_reg & 3] + 1;
502 return p;
503}
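For instance, with no REX prefix, register numbers 4-7 select AH, CH, DH and BH, i.e. byte 1 of the corresponding register slot. A little-endian user-space illustration (regs[0] standing in for RAX):

	#include <stdio.h>

	int main(void)
	{
		unsigned long regs[8] = { 0x1234 };	/* regs[0] plays the role of RAX */
		/* decode_register(4, regs, 1) would return byte 1 of regs[0], i.e. AH */
		unsigned char *ah = (unsigned char *)&regs[0] + 1;

		printf("AH = %#x\n", *ah);		/* 0x12 on a little-endian host */
		return 0;
	}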
504
505static int read_descriptor(struct x86_emulate_ctxt *ctxt,
506 struct x86_emulate_ops *ops,
507 void *ptr,
508 u16 *size, unsigned long *address, int op_bytes)
509{
510 int rc;
511
512 if (op_bytes == 2)
513 op_bytes = 3;
514 *address = 0;
515 rc = ops->read_std((unsigned long)ptr, (unsigned long *)size, 2,
516 ctxt->vcpu);
517 if (rc)
518 return rc;
519 rc = ops->read_std((unsigned long)ptr + 2, address, op_bytes,
520 ctxt->vcpu);
521 return rc;
522}
523
524static int test_cc(unsigned int condition, unsigned int flags)
525{
526 int rc = 0;
527
528 switch ((condition & 15) >> 1) {
529 case 0: /* o */
530 rc |= (flags & EFLG_OF);
531 break;
532 case 1: /* b/c/nae */
533 rc |= (flags & EFLG_CF);
534 break;
535 case 2: /* z/e */
536 rc |= (flags & EFLG_ZF);
537 break;
538 case 3: /* be/na */
539 rc |= (flags & (EFLG_CF|EFLG_ZF));
540 break;
541 case 4: /* s */
542 rc |= (flags & EFLG_SF);
543 break;
544 case 5: /* p/pe */
545 rc |= (flags & EFLG_PF);
546 break;
547 case 7: /* le/ng */
548 rc |= (flags & EFLG_ZF);
549 /* fall through */
550 case 6: /* l/nge */
551 rc |= (!(flags & EFLG_SF) != !(flags & EFLG_OF));
552 break;
553 }
554
555 /* Odd condition identifiers (lsb == 1) have inverted sense. */
556 return (!!rc ^ (condition & 1));
557}
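As a worked example, JNZ is opcode 0x75, so condition == 0x75 here: (0x75 & 15) >> 1 selects case 2 (the ZF test) and the set low bit inverts the sense, so test_cc(0x75, flags) evaluates to !(flags & EFLG_ZF) — true exactly when ZF is clear, which is when JNZ is taken.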
558
559static void decode_register_operand(struct operand *op,
560 struct decode_cache *c,
561 int inhibit_bytereg)
562{
563 unsigned reg = c->modrm_reg;
564 int highbyte_regs = c->rex_prefix == 0;
565
566 if (!(c->d & ModRM))
567 reg = (c->b & 7) | ((c->rex_prefix & 1) << 3);
568 op->type = OP_REG;
569 if ((c->d & ByteOp) && !inhibit_bytereg) {
570 op->ptr = decode_register(reg, c->regs, highbyte_regs);
571 op->val = *(u8 *)op->ptr;
572 op->bytes = 1;
573 } else {
574 op->ptr = decode_register(reg, c->regs, 0);
575 op->bytes = c->op_bytes;
576 switch (op->bytes) {
577 case 2:
578 op->val = *(u16 *)op->ptr;
579 break;
580 case 4:
581 op->val = *(u32 *)op->ptr;
582 break;
583 case 8:
584 op->val = *(u64 *) op->ptr;
585 break;
586 }
587 }
588 op->orig_val = op->val;
589}
590
591static int decode_modrm(struct x86_emulate_ctxt *ctxt,
592 struct x86_emulate_ops *ops)
593{
594 struct decode_cache *c = &ctxt->decode;
595 u8 sib;
596 int index_reg = 0, base_reg = 0, scale, rip_relative = 0;
597 int rc = 0;
598
599 if (c->rex_prefix) {
600 c->modrm_reg = (c->rex_prefix & 4) << 1; /* REX.R */
601 index_reg = (c->rex_prefix & 2) << 2; /* REX.X */
602		c->modrm_rm = base_reg = (c->rex_prefix & 1) << 3; /* REX.B */
603 }
604
605 c->modrm = insn_fetch(u8, 1, c->eip);
606 c->modrm_mod |= (c->modrm & 0xc0) >> 6;
607 c->modrm_reg |= (c->modrm & 0x38) >> 3;
608 c->modrm_rm |= (c->modrm & 0x07);
609 c->modrm_ea = 0;
610 c->use_modrm_ea = 1;
611
612 if (c->modrm_mod == 3) {
613 c->modrm_val = *(unsigned long *)
614 decode_register(c->modrm_rm, c->regs, c->d & ByteOp);
615 return rc;
616 }
617
618 if (c->ad_bytes == 2) {
619 unsigned bx = c->regs[VCPU_REGS_RBX];
620 unsigned bp = c->regs[VCPU_REGS_RBP];
621 unsigned si = c->regs[VCPU_REGS_RSI];
622 unsigned di = c->regs[VCPU_REGS_RDI];
623
624 /* 16-bit ModR/M decode. */
625 switch (c->modrm_mod) {
626 case 0:
627 if (c->modrm_rm == 6)
628 c->modrm_ea += insn_fetch(u16, 2, c->eip);
629 break;
630 case 1:
631 c->modrm_ea += insn_fetch(s8, 1, c->eip);
632 break;
633 case 2:
634 c->modrm_ea += insn_fetch(u16, 2, c->eip);
635 break;
636 }
637 switch (c->modrm_rm) {
638 case 0:
639 c->modrm_ea += bx + si;
640 break;
641 case 1:
642 c->modrm_ea += bx + di;
643 break;
644 case 2:
645 c->modrm_ea += bp + si;
646 break;
647 case 3:
648 c->modrm_ea += bp + di;
649 break;
650 case 4:
651 c->modrm_ea += si;
652 break;
653 case 5:
654 c->modrm_ea += di;
655 break;
656 case 6:
657 if (c->modrm_mod != 0)
658 c->modrm_ea += bp;
659 break;
660 case 7:
661 c->modrm_ea += bx;
662 break;
663 }
664 if (c->modrm_rm == 2 || c->modrm_rm == 3 ||
665 (c->modrm_rm == 6 && c->modrm_mod != 0))
666 if (!c->override_base)
667 c->override_base = &ctxt->ss_base;
668 c->modrm_ea = (u16)c->modrm_ea;
669 } else {
670 /* 32/64-bit ModR/M decode. */
671 switch (c->modrm_rm) {
672 case 4:
673 case 12:
674 sib = insn_fetch(u8, 1, c->eip);
675 index_reg |= (sib >> 3) & 7;
676 base_reg |= sib & 7;
677 scale = sib >> 6;
678
679 switch (base_reg) {
680 case 5:
681 if (c->modrm_mod != 0)
682 c->modrm_ea += c->regs[base_reg];
683 else
684 c->modrm_ea +=
685 insn_fetch(s32, 4, c->eip);
686 break;
687 default:
688 c->modrm_ea += c->regs[base_reg];
689 }
690 switch (index_reg) {
691 case 4:
692 break;
693 default:
694 c->modrm_ea += c->regs[index_reg] << scale;
695 }
696 break;
697 case 5:
698 if (c->modrm_mod != 0)
699 c->modrm_ea += c->regs[c->modrm_rm];
700 else if (ctxt->mode == X86EMUL_MODE_PROT64)
701 rip_relative = 1;
702 break;
703 default:
704 c->modrm_ea += c->regs[c->modrm_rm];
705 break;
706 }
707 switch (c->modrm_mod) {
708 case 0:
709 if (c->modrm_rm == 5)
710 c->modrm_ea += insn_fetch(s32, 4, c->eip);
711 break;
712 case 1:
713 c->modrm_ea += insn_fetch(s8, 1, c->eip);
714 break;
715 case 2:
716 c->modrm_ea += insn_fetch(s32, 4, c->eip);
717 break;
718 }
719 }
720 if (rip_relative) {
721 c->modrm_ea += c->eip;
722 switch (c->d & SrcMask) {
723 case SrcImmByte:
724 c->modrm_ea += 1;
725 break;
726 case SrcImm:
727 if (c->d & ByteOp)
728 c->modrm_ea += 1;
729 else
730 if (c->op_bytes == 8)
731 c->modrm_ea += 4;
732 else
733 c->modrm_ea += c->op_bytes;
734 }
735 }
736done:
737 return rc;
738}
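As a concrete 16-bit example: a ModRM byte of 0x42 (mod=01, reg=000, rm=010) followed by a displacement byte of 0x10 decodes to an effective address of BP + SI + 0x10, and because the rm field selects a BP-based form, the stack segment becomes the default override.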
739
740static int decode_abs(struct x86_emulate_ctxt *ctxt,
741 struct x86_emulate_ops *ops)
742{
743 struct decode_cache *c = &ctxt->decode;
744 int rc = 0;
745
746 switch (c->ad_bytes) {
747 case 2:
748 c->modrm_ea = insn_fetch(u16, 2, c->eip);
749 break;
750 case 4:
751 c->modrm_ea = insn_fetch(u32, 4, c->eip);
752 break;
753 case 8:
754 c->modrm_ea = insn_fetch(u64, 8, c->eip);
755 break;
756 }
757done:
758 return rc;
759}
760
761int
762x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
763{
764 struct decode_cache *c = &ctxt->decode;
765 int rc = 0;
766 int mode = ctxt->mode;
767 int def_op_bytes, def_ad_bytes;
768
769 /* Shadow copy of register state. Committed on successful emulation. */
770
771 memset(c, 0, sizeof(struct decode_cache));
772 c->eip = ctxt->vcpu->arch.rip;
773 memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
774
775 switch (mode) {
776 case X86EMUL_MODE_REAL:
777 case X86EMUL_MODE_PROT16:
778 def_op_bytes = def_ad_bytes = 2;
779 break;
780 case X86EMUL_MODE_PROT32:
781 def_op_bytes = def_ad_bytes = 4;
782 break;
783#ifdef CONFIG_X86_64
784 case X86EMUL_MODE_PROT64:
785 def_op_bytes = 4;
786 def_ad_bytes = 8;
787 break;
788#endif
789 default:
790 return -1;
791 }
792
793 c->op_bytes = def_op_bytes;
794 c->ad_bytes = def_ad_bytes;
795
796 /* Legacy prefixes. */
797 for (;;) {
798 switch (c->b = insn_fetch(u8, 1, c->eip)) {
799 case 0x66: /* operand-size override */
800 /* switch between 2/4 bytes */
801 c->op_bytes = def_op_bytes ^ 6;
802 break;
803 case 0x67: /* address-size override */
804 if (mode == X86EMUL_MODE_PROT64)
805 /* switch between 4/8 bytes */
806 c->ad_bytes = def_ad_bytes ^ 12;
807 else
808 /* switch between 2/4 bytes */
809 c->ad_bytes = def_ad_bytes ^ 6;
810 break;
811 case 0x2e: /* CS override */
812 c->override_base = &ctxt->cs_base;
813 break;
814 case 0x3e: /* DS override */
815 c->override_base = &ctxt->ds_base;
816 break;
817 case 0x26: /* ES override */
818 c->override_base = &ctxt->es_base;
819 break;
820 case 0x64: /* FS override */
821 c->override_base = &ctxt->fs_base;
822 break;
823 case 0x65: /* GS override */
824 c->override_base = &ctxt->gs_base;
825 break;
826 case 0x36: /* SS override */
827 c->override_base = &ctxt->ss_base;
828 break;
829 case 0x40 ... 0x4f: /* REX */
830 if (mode != X86EMUL_MODE_PROT64)
831 goto done_prefixes;
832 c->rex_prefix = c->b;
833 continue;
834 case 0xf0: /* LOCK */
835 c->lock_prefix = 1;
836 break;
837 case 0xf2: /* REPNE/REPNZ */
838 c->rep_prefix = REPNE_PREFIX;
839 break;
840 case 0xf3: /* REP/REPE/REPZ */
841 c->rep_prefix = REPE_PREFIX;
842 break;
843 default:
844 goto done_prefixes;
845 }
846
847 /* Any legacy prefix after a REX prefix nullifies its effect. */
848
849 c->rex_prefix = 0;
850 }
851
852done_prefixes:
853
854 /* REX prefix. */
855 if (c->rex_prefix)
856 if (c->rex_prefix & 8)
857 c->op_bytes = 8; /* REX.W */
858
859 /* Opcode byte(s). */
860 c->d = opcode_table[c->b];
861 if (c->d == 0) {
862 /* Two-byte opcode? */
863 if (c->b == 0x0f) {
864 c->twobyte = 1;
865 c->b = insn_fetch(u8, 1, c->eip);
866 c->d = twobyte_table[c->b];
867 }
868
869 /* Unrecognised? */
870 if (c->d == 0) {
871 DPRINTF("Cannot emulate %02x\n", c->b);
872 return -1;
873 }
874 }
875
876 if (mode == X86EMUL_MODE_PROT64 && (c->d & Stack))
877 c->op_bytes = 8;
878
879 /* ModRM and SIB bytes. */
880 if (c->d & ModRM)
881 rc = decode_modrm(ctxt, ops);
882 else if (c->d & MemAbs)
883 rc = decode_abs(ctxt, ops);
884 if (rc)
885 goto done;
886
887 if (!c->override_base)
888 c->override_base = &ctxt->ds_base;
889 if (mode == X86EMUL_MODE_PROT64 &&
890 c->override_base != &ctxt->fs_base &&
891 c->override_base != &ctxt->gs_base)
892 c->override_base = NULL;
893
894 if (c->override_base)
895 c->modrm_ea += *c->override_base;
896
897 if (c->ad_bytes != 8)
898 c->modrm_ea = (u32)c->modrm_ea;
899 /*
900 * Decode and fetch the source operand: register, memory
901 * or immediate.
902 */
903 switch (c->d & SrcMask) {
904 case SrcNone:
905 break;
906 case SrcReg:
907 decode_register_operand(&c->src, c, 0);
908 break;
909 case SrcMem16:
910 c->src.bytes = 2;
911 goto srcmem_common;
912 case SrcMem32:
913 c->src.bytes = 4;
914 goto srcmem_common;
915 case SrcMem:
916 c->src.bytes = (c->d & ByteOp) ? 1 :
917 c->op_bytes;
918 /* Don't fetch the address for invlpg: it could be unmapped. */
919 if (c->twobyte && c->b == 0x01 && c->modrm_reg == 7)
920 break;
921 srcmem_common:
922 /*
923 * For instructions with a ModR/M byte, switch to register
924 * access if Mod = 3.
925 */
926 if ((c->d & ModRM) && c->modrm_mod == 3) {
927 c->src.type = OP_REG;
928 break;
929 }
930 c->src.type = OP_MEM;
931 break;
932 case SrcImm:
933 c->src.type = OP_IMM;
934 c->src.ptr = (unsigned long *)c->eip;
935 c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
936 if (c->src.bytes == 8)
937 c->src.bytes = 4;
938 /* NB. Immediates are sign-extended as necessary. */
939 switch (c->src.bytes) {
940 case 1:
941 c->src.val = insn_fetch(s8, 1, c->eip);
942 break;
943 case 2:
944 c->src.val = insn_fetch(s16, 2, c->eip);
945 break;
946 case 4:
947 c->src.val = insn_fetch(s32, 4, c->eip);
948 break;
949 }
950 break;
951 case SrcImmByte:
952 c->src.type = OP_IMM;
953 c->src.ptr = (unsigned long *)c->eip;
954 c->src.bytes = 1;
955 c->src.val = insn_fetch(s8, 1, c->eip);
956 break;
957 }
958
959 /* Decode and fetch the destination operand: register or memory. */
960 switch (c->d & DstMask) {
961 case ImplicitOps:
962 /* Special instructions do their own operand decoding. */
963 return 0;
964 case DstReg:
965 decode_register_operand(&c->dst, c,
966 c->twobyte && (c->b == 0xb6 || c->b == 0xb7));
967 break;
968 case DstMem:
969 if ((c->d & ModRM) && c->modrm_mod == 3) {
970 c->dst.type = OP_REG;
971 break;
972 }
973 c->dst.type = OP_MEM;
974 break;
975 }
976
977done:
978 return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
979}
980
981static inline void emulate_push(struct x86_emulate_ctxt *ctxt)
982{
983 struct decode_cache *c = &ctxt->decode;
984
985 c->dst.type = OP_MEM;
986 c->dst.bytes = c->op_bytes;
987 c->dst.val = c->src.val;
988 register_address_increment(c->regs[VCPU_REGS_RSP], -c->op_bytes);
989 c->dst.ptr = (void *) register_address(ctxt->ss_base,
990 c->regs[VCPU_REGS_RSP]);
991}
992
993static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt,
994 struct x86_emulate_ops *ops)
995{
996 struct decode_cache *c = &ctxt->decode;
997 int rc;
998
999 rc = ops->read_std(register_address(ctxt->ss_base,
1000 c->regs[VCPU_REGS_RSP]),
1001 &c->dst.val, c->dst.bytes, ctxt->vcpu);
1002 if (rc != 0)
1003 return rc;
1004
1005 register_address_increment(c->regs[VCPU_REGS_RSP], c->dst.bytes);
1006
1007 return 0;
1008}
1009
1010static inline void emulate_grp2(struct x86_emulate_ctxt *ctxt)
1011{
1012 struct decode_cache *c = &ctxt->decode;
1013 switch (c->modrm_reg) {
1014 case 0: /* rol */
1015 emulate_2op_SrcB("rol", c->src, c->dst, ctxt->eflags);
1016 break;
1017 case 1: /* ror */
1018 emulate_2op_SrcB("ror", c->src, c->dst, ctxt->eflags);
1019 break;
1020 case 2: /* rcl */
1021 emulate_2op_SrcB("rcl", c->src, c->dst, ctxt->eflags);
1022 break;
1023 case 3: /* rcr */
1024 emulate_2op_SrcB("rcr", c->src, c->dst, ctxt->eflags);
1025 break;
1026 case 4: /* sal/shl */
1027 case 6: /* sal/shl */
1028 emulate_2op_SrcB("sal", c->src, c->dst, ctxt->eflags);
1029 break;
1030 case 5: /* shr */
1031 emulate_2op_SrcB("shr", c->src, c->dst, ctxt->eflags);
1032 break;
1033 case 7: /* sar */
1034 emulate_2op_SrcB("sar", c->src, c->dst, ctxt->eflags);
1035 break;
1036 }
1037}
1038
1039static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt,
1040 struct x86_emulate_ops *ops)
1041{
1042 struct decode_cache *c = &ctxt->decode;
1043 int rc = 0;
1044
1045 switch (c->modrm_reg) {
1046 case 0 ... 1: /* test */
1047 /*
1048 * Special case in Grp3: test has an immediate
1049 * source operand.
1050 */
1051 c->src.type = OP_IMM;
1052 c->src.ptr = (unsigned long *)c->eip;
1053 c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1054 if (c->src.bytes == 8)
1055 c->src.bytes = 4;
1056 switch (c->src.bytes) {
1057 case 1:
1058 c->src.val = insn_fetch(s8, 1, c->eip);
1059 break;
1060 case 2:
1061 c->src.val = insn_fetch(s16, 2, c->eip);
1062 break;
1063 case 4:
1064 c->src.val = insn_fetch(s32, 4, c->eip);
1065 break;
1066 }
1067 emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags);
1068 break;
1069 case 2: /* not */
1070 c->dst.val = ~c->dst.val;
1071 break;
1072 case 3: /* neg */
1073 emulate_1op("neg", c->dst, ctxt->eflags);
1074 break;
1075 default:
1076 DPRINTF("Cannot emulate %02x\n", c->b);
1077 rc = X86EMUL_UNHANDLEABLE;
1078 break;
1079 }
1080done:
1081 return rc;
1082}
1083
1084static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt,
1085 struct x86_emulate_ops *ops)
1086{
1087 struct decode_cache *c = &ctxt->decode;
1088 int rc;
1089
1090 switch (c->modrm_reg) {
1091 case 0: /* inc */
1092 emulate_1op("inc", c->dst, ctxt->eflags);
1093 break;
1094 case 1: /* dec */
1095 emulate_1op("dec", c->dst, ctxt->eflags);
1096 break;
1097 case 4: /* jmp abs */
1098 if (c->b == 0xff)
1099 c->eip = c->dst.val;
1100 else {
1101 DPRINTF("Cannot emulate %02x\n", c->b);
1102 return X86EMUL_UNHANDLEABLE;
1103 }
1104 break;
1105 case 6: /* push */
1106
1107 /* 64-bit mode: PUSH always pushes a 64-bit operand. */
1108
1109 if (ctxt->mode == X86EMUL_MODE_PROT64) {
1110 c->dst.bytes = 8;
1111 rc = ops->read_std((unsigned long)c->dst.ptr,
1112 &c->dst.val, 8, ctxt->vcpu);
1113 if (rc != 0)
1114 return rc;
1115 }
1116 register_address_increment(c->regs[VCPU_REGS_RSP],
1117 -c->dst.bytes);
1118 rc = ops->write_emulated(register_address(ctxt->ss_base,
1119 c->regs[VCPU_REGS_RSP]), &c->dst.val,
1120 c->dst.bytes, ctxt->vcpu);
1121 if (rc != 0)
1122 return rc;
1123 c->dst.type = OP_NONE;
1124 break;
1125 default:
1126 DPRINTF("Cannot emulate %02x\n", c->b);
1127 return X86EMUL_UNHANDLEABLE;
1128 }
1129 return 0;
1130}
1131
1132static inline int emulate_grp9(struct x86_emulate_ctxt *ctxt,
1133 struct x86_emulate_ops *ops,
1134 unsigned long memop)
1135{
1136 struct decode_cache *c = &ctxt->decode;
1137 u64 old, new;
1138 int rc;
1139
1140 rc = ops->read_emulated(memop, &old, 8, ctxt->vcpu);
1141 if (rc != 0)
1142 return rc;
1143
1144 if (((u32) (old >> 0) != (u32) c->regs[VCPU_REGS_RAX]) ||
1145 ((u32) (old >> 32) != (u32) c->regs[VCPU_REGS_RDX])) {
1146
1147 c->regs[VCPU_REGS_RAX] = (u32) (old >> 0);
1148 c->regs[VCPU_REGS_RDX] = (u32) (old >> 32);
1149 ctxt->eflags &= ~EFLG_ZF;
1150
1151 } else {
1152 new = ((u64)c->regs[VCPU_REGS_RCX] << 32) |
1153 (u32) c->regs[VCPU_REGS_RBX];
1154
1155 rc = ops->cmpxchg_emulated(memop, &old, &new, 8, ctxt->vcpu);
1156 if (rc != 0)
1157 return rc;
1158 ctxt->eflags |= EFLG_ZF;
1159 }
1160 return 0;
1161}
1162
1163static inline int writeback(struct x86_emulate_ctxt *ctxt,
1164 struct x86_emulate_ops *ops)
1165{
1166 int rc;
1167 struct decode_cache *c = &ctxt->decode;
1168
1169 switch (c->dst.type) {
1170 case OP_REG:
1171 /* The 4-byte case *is* correct:
1172 * in 64-bit mode we zero-extend.
1173 */
1174 switch (c->dst.bytes) {
1175 case 1:
1176 *(u8 *)c->dst.ptr = (u8)c->dst.val;
1177 break;
1178 case 2:
1179 *(u16 *)c->dst.ptr = (u16)c->dst.val;
1180 break;
1181 case 4:
1182 *c->dst.ptr = (u32)c->dst.val;
1183 break; /* 64b: zero-ext */
1184 case 8:
1185 *c->dst.ptr = c->dst.val;
1186 break;
1187 }
1188 break;
1189 case OP_MEM:
1190 if (c->lock_prefix)
1191 rc = ops->cmpxchg_emulated(
1192 (unsigned long)c->dst.ptr,
1193 &c->dst.orig_val,
1194 &c->dst.val,
1195 c->dst.bytes,
1196 ctxt->vcpu);
1197 else
1198 rc = ops->write_emulated(
1199 (unsigned long)c->dst.ptr,
1200 &c->dst.val,
1201 c->dst.bytes,
1202 ctxt->vcpu);
1203 if (rc != 0)
1204 return rc;
1205 break;
1206 case OP_NONE:
1207 /* no writeback */
1208 break;
1209 default:
1210 break;
1211 }
1212 return 0;
1213}
1214
1215int
1216x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1217{
1218 unsigned long memop = 0;
1219 u64 msr_data;
1220 unsigned long saved_eip = 0;
1221 struct decode_cache *c = &ctxt->decode;
1222 int rc = 0;
1223
1224 /* Shadow copy of register state. Committed on successful emulation.
1225 * NOTE: we can copy them from vcpu as x86_decode_insn() doesn't
1226 * modify them.
1227 */
1228
1229 memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
1230 saved_eip = c->eip;
1231
1232 if (((c->d & ModRM) && (c->modrm_mod != 3)) || (c->d & MemAbs))
1233 memop = c->modrm_ea;
1234
1235 if (c->rep_prefix && (c->d & String)) {
1236 /* All REP prefixes have the same first termination condition */
1237 if (c->regs[VCPU_REGS_RCX] == 0) {
1238 ctxt->vcpu->arch.rip = c->eip;
1239 goto done;
1240 }
1241 /* The second termination condition only applies for REPE
1242 * and REPNE. Test if the repeat string operation prefix is
1243 * REPE/REPZ or REPNE/REPNZ and if it's the case it tests the
1244 * corresponding termination condition according to:
1245 * - if REPE/REPZ and ZF = 0 then done
1246 * - if REPNE/REPNZ and ZF = 1 then done
1247 */
1248 if ((c->b == 0xa6) || (c->b == 0xa7) ||
1249 (c->b == 0xae) || (c->b == 0xaf)) {
1250 if ((c->rep_prefix == REPE_PREFIX) &&
1251 ((ctxt->eflags & EFLG_ZF) == 0)) {
1252 ctxt->vcpu->arch.rip = c->eip;
1253 goto done;
1254 }
1255 if ((c->rep_prefix == REPNE_PREFIX) &&
1256 ((ctxt->eflags & EFLG_ZF) == EFLG_ZF)) {
1257 ctxt->vcpu->arch.rip = c->eip;
1258 goto done;
1259 }
1260 }
1261 c->regs[VCPU_REGS_RCX]--;
1262 c->eip = ctxt->vcpu->arch.rip;
1263 }
1264
1265 if (c->src.type == OP_MEM) {
1266 c->src.ptr = (unsigned long *)memop;
1267 c->src.val = 0;
1268 rc = ops->read_emulated((unsigned long)c->src.ptr,
1269 &c->src.val,
1270 c->src.bytes,
1271 ctxt->vcpu);
1272 if (rc != 0)
1273 goto done;
1274 c->src.orig_val = c->src.val;
1275 }
1276
1277 if ((c->d & DstMask) == ImplicitOps)
1278 goto special_insn;
1279
1280
1281 if (c->dst.type == OP_MEM) {
1282 c->dst.ptr = (unsigned long *)memop;
1283 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1284 c->dst.val = 0;
1285 if (c->d & BitOp) {
1286 unsigned long mask = ~(c->dst.bytes * 8 - 1);
1287
1288 c->dst.ptr = (void *)c->dst.ptr +
1289 (c->src.val & mask) / 8;
1290 }
1291 if (!(c->d & Mov) &&
1292 /* optimisation - avoid slow emulated read */
1293 ((rc = ops->read_emulated((unsigned long)c->dst.ptr,
1294 &c->dst.val,
1295 c->dst.bytes, ctxt->vcpu)) != 0))
1296 goto done;
1297 }
1298 c->dst.orig_val = c->dst.val;
1299
1300special_insn:
1301
1302 if (c->twobyte)
1303 goto twobyte_insn;
1304
1305 switch (c->b) {
1306 case 0x00 ... 0x05:
1307 add: /* add */
1308 emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags);
1309 break;
1310 case 0x08 ... 0x0d:
1311 or: /* or */
1312 emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags);
1313 break;
1314 case 0x10 ... 0x15:
1315 adc: /* adc */
1316 emulate_2op_SrcV("adc", c->src, c->dst, ctxt->eflags);
1317 break;
1318 case 0x18 ... 0x1d:
1319 sbb: /* sbb */
1320 emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags);
1321 break;
1322 case 0x20 ... 0x23:
1323 and: /* and */
1324 emulate_2op_SrcV("and", c->src, c->dst, ctxt->eflags);
1325 break;
1326 case 0x24: /* and al imm8 */
1327 c->dst.type = OP_REG;
1328 c->dst.ptr = &c->regs[VCPU_REGS_RAX];
1329 c->dst.val = *(u8 *)c->dst.ptr;
1330 c->dst.bytes = 1;
1331 c->dst.orig_val = c->dst.val;
1332 goto and;
1333 case 0x25: /* and ax imm16, or eax imm32 */
1334 c->dst.type = OP_REG;
1335 c->dst.bytes = c->op_bytes;
1336 c->dst.ptr = &c->regs[VCPU_REGS_RAX];
1337 if (c->op_bytes == 2)
1338 c->dst.val = *(u16 *)c->dst.ptr;
1339 else
1340 c->dst.val = *(u32 *)c->dst.ptr;
1341 c->dst.orig_val = c->dst.val;
1342 goto and;
1343 case 0x28 ... 0x2d:
1344 sub: /* sub */
1345 emulate_2op_SrcV("sub", c->src, c->dst, ctxt->eflags);
1346 break;
1347 case 0x30 ... 0x35:
1348 xor: /* xor */
1349 emulate_2op_SrcV("xor", c->src, c->dst, ctxt->eflags);
1350 break;
1351 case 0x38 ... 0x3d:
1352 cmp: /* cmp */
1353 emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags);
1354 break;
1355 case 0x40 ... 0x47: /* inc r16/r32 */
1356 emulate_1op("inc", c->dst, ctxt->eflags);
1357 break;
1358 case 0x48 ... 0x4f: /* dec r16/r32 */
1359 emulate_1op("dec", c->dst, ctxt->eflags);
1360 break;
1361 case 0x50 ... 0x57: /* push reg */
1362 c->dst.type = OP_MEM;
1363 c->dst.bytes = c->op_bytes;
1364 c->dst.val = c->src.val;
1365 register_address_increment(c->regs[VCPU_REGS_RSP],
1366 -c->op_bytes);
1367 c->dst.ptr = (void *) register_address(
1368 ctxt->ss_base, c->regs[VCPU_REGS_RSP]);
1369 break;
1370 case 0x58 ... 0x5f: /* pop reg */
1371 pop_instruction:
1372 if ((rc = ops->read_std(register_address(ctxt->ss_base,
1373 c->regs[VCPU_REGS_RSP]), c->dst.ptr,
1374 c->op_bytes, ctxt->vcpu)) != 0)
1375 goto done;
1376
1377 register_address_increment(c->regs[VCPU_REGS_RSP],
1378 c->op_bytes);
1379 c->dst.type = OP_NONE; /* Disable writeback. */
1380 break;
1381 case 0x63: /* movsxd */
1382 if (ctxt->mode != X86EMUL_MODE_PROT64)
1383 goto cannot_emulate;
1384 c->dst.val = (s32) c->src.val;
1385 break;
1386 case 0x6a: /* push imm8 */
1387 c->src.val = 0L;
1388 c->src.val = insn_fetch(s8, 1, c->eip);
1389 emulate_push(ctxt);
1390 break;
1391 case 0x6c: /* insb */
1392 case 0x6d: /* insw/insd */
1393 if (kvm_emulate_pio_string(ctxt->vcpu, NULL,
1394 1,
1395 (c->d & ByteOp) ? 1 : c->op_bytes,
1396 c->rep_prefix ?
1397 address_mask(c->regs[VCPU_REGS_RCX]) : 1,
1398 (ctxt->eflags & EFLG_DF),
1399 register_address(ctxt->es_base,
1400 c->regs[VCPU_REGS_RDI]),
1401 c->rep_prefix,
1402 c->regs[VCPU_REGS_RDX]) == 0) {
1403 c->eip = saved_eip;
1404 return -1;
1405 }
1406 return 0;
1407 case 0x6e: /* outsb */
1408 case 0x6f: /* outsw/outsd */
1409 if (kvm_emulate_pio_string(ctxt->vcpu, NULL,
1410 0,
1411 (c->d & ByteOp) ? 1 : c->op_bytes,
1412 c->rep_prefix ?
1413 address_mask(c->regs[VCPU_REGS_RCX]) : 1,
1414 (ctxt->eflags & EFLG_DF),
1415 register_address(c->override_base ?
1416 *c->override_base :
1417 ctxt->ds_base,
1418 c->regs[VCPU_REGS_RSI]),
1419 c->rep_prefix,
1420 c->regs[VCPU_REGS_RDX]) == 0) {
1421 c->eip = saved_eip;
1422 return -1;
1423 }
1424 return 0;
1425 case 0x70 ... 0x7f: /* jcc (short) */ {
1426 int rel = insn_fetch(s8, 1, c->eip);
1427
1428 if (test_cc(c->b, ctxt->eflags))
1429 JMP_REL(rel);
1430 break;
1431 }
1432 case 0x80 ... 0x83: /* Grp1 */
1433 switch (c->modrm_reg) {
1434 case 0:
1435 goto add;
1436 case 1:
1437 goto or;
1438 case 2:
1439 goto adc;
1440 case 3:
1441 goto sbb;
1442 case 4:
1443 goto and;
1444 case 5:
1445 goto sub;
1446 case 6:
1447 goto xor;
1448 case 7:
1449 goto cmp;
1450 }
1451 break;
1452 case 0x84 ... 0x85:
1453 emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags);
1454 break;
1455 case 0x86 ... 0x87: /* xchg */
1456 /* Write back the register source. */
1457 switch (c->dst.bytes) {
1458 case 1:
1459 *(u8 *) c->src.ptr = (u8) c->dst.val;
1460 break;
1461 case 2:
1462 *(u16 *) c->src.ptr = (u16) c->dst.val;
1463 break;
1464 case 4:
1465 *c->src.ptr = (u32) c->dst.val;
1466 break; /* 64b reg: zero-extend */
1467 case 8:
1468 *c->src.ptr = c->dst.val;
1469 break;
1470 }
1471 /*
1472 * Write back the memory destination with implicit LOCK
1473 * prefix.
1474 */
1475 c->dst.val = c->src.val;
1476 c->lock_prefix = 1;
1477 break;
1478 case 0x88 ... 0x8b: /* mov */
1479 goto mov;
1480 case 0x8d: /* lea r16/r32, m */
1481 c->dst.val = c->modrm_val;
1482 break;
1483 case 0x8f: /* pop (sole member of Grp1a) */
1484 rc = emulate_grp1a(ctxt, ops);
1485 if (rc != 0)
1486 goto done;
1487 break;
1488 case 0x9c: /* pushf */
1489 c->src.val = (unsigned long) ctxt->eflags;
1490 emulate_push(ctxt);
1491 break;
1492 case 0x9d: /* popf */
1493 c->dst.ptr = (unsigned long *) &ctxt->eflags;
1494 goto pop_instruction;
1495 case 0xa0 ... 0xa1: /* mov */
1496 c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX];
1497 c->dst.val = c->src.val;
1498 break;
1499 case 0xa2 ... 0xa3: /* mov */
1500 c->dst.val = (unsigned long)c->regs[VCPU_REGS_RAX];
1501 break;
1502 case 0xa4 ... 0xa5: /* movs */
1503 c->dst.type = OP_MEM;
1504 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1505 c->dst.ptr = (unsigned long *)register_address(
1506 ctxt->es_base,
1507 c->regs[VCPU_REGS_RDI]);
1508 if ((rc = ops->read_emulated(register_address(
1509 c->override_base ? *c->override_base :
1510 ctxt->ds_base,
1511 c->regs[VCPU_REGS_RSI]),
1512 &c->dst.val,
1513 c->dst.bytes, ctxt->vcpu)) != 0)
1514 goto done;
1515 register_address_increment(c->regs[VCPU_REGS_RSI],
1516 (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
1517 : c->dst.bytes);
1518 register_address_increment(c->regs[VCPU_REGS_RDI],
1519 (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
1520 : c->dst.bytes);
1521 break;
1522 case 0xa6 ... 0xa7: /* cmps */
1523 c->src.type = OP_NONE; /* Disable writeback. */
1524 c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1525 c->src.ptr = (unsigned long *)register_address(
1526 c->override_base ? *c->override_base :
1527 ctxt->ds_base,
1528 c->regs[VCPU_REGS_RSI]);
1529 if ((rc = ops->read_emulated((unsigned long)c->src.ptr,
1530 &c->src.val,
1531 c->src.bytes,
1532 ctxt->vcpu)) != 0)
1533 goto done;
1534
1535 c->dst.type = OP_NONE; /* Disable writeback. */
1536 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1537 c->dst.ptr = (unsigned long *)register_address(
1538 ctxt->es_base,
1539 c->regs[VCPU_REGS_RDI]);
1540 if ((rc = ops->read_emulated((unsigned long)c->dst.ptr,
1541 &c->dst.val,
1542 c->dst.bytes,
1543 ctxt->vcpu)) != 0)
1544 goto done;
1545
1546 DPRINTF("cmps: mem1=0x%p mem2=0x%p\n", c->src.ptr, c->dst.ptr);
1547
1548 emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags);
1549
1550 register_address_increment(c->regs[VCPU_REGS_RSI],
1551 (ctxt->eflags & EFLG_DF) ? -c->src.bytes
1552 : c->src.bytes);
1553 register_address_increment(c->regs[VCPU_REGS_RDI],
1554 (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
1555 : c->dst.bytes);
1556
1557 break;
1558 case 0xaa ... 0xab: /* stos */
1559 c->dst.type = OP_MEM;
1560 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1561 c->dst.ptr = (unsigned long *)register_address(
1562 ctxt->es_base,
1563 c->regs[VCPU_REGS_RDI]);
1564 c->dst.val = c->regs[VCPU_REGS_RAX];
1565 register_address_increment(c->regs[VCPU_REGS_RDI],
1566 (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
1567 : c->dst.bytes);
1568 break;
1569 case 0xac ... 0xad: /* lods */
1570 c->dst.type = OP_REG;
1571 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1572 c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX];
1573 if ((rc = ops->read_emulated(register_address(
1574 c->override_base ? *c->override_base :
1575 ctxt->ds_base,
1576 c->regs[VCPU_REGS_RSI]),
1577 &c->dst.val,
1578 c->dst.bytes,
1579 ctxt->vcpu)) != 0)
1580 goto done;
1581 register_address_increment(c->regs[VCPU_REGS_RSI],
1582 (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
1583 : c->dst.bytes);
1584 break;
1585 case 0xae ... 0xaf: /* scas */
1586 DPRINTF("Urk! I don't handle SCAS.\n");
1587 goto cannot_emulate;
1588 case 0xc0 ... 0xc1:
1589 emulate_grp2(ctxt);
1590 break;
1591 case 0xc3: /* ret */
1592 c->dst.ptr = &c->eip;
1593 goto pop_instruction;
1594 case 0xc6 ... 0xc7: /* mov (sole member of Grp11) */
1595 mov:
1596 c->dst.val = c->src.val;
1597 break;
1598 case 0xd0 ... 0xd1: /* Grp2 */
1599 c->src.val = 1;
1600 emulate_grp2(ctxt);
1601 break;
1602 case 0xd2 ... 0xd3: /* Grp2 */
1603 c->src.val = c->regs[VCPU_REGS_RCX];
1604 emulate_grp2(ctxt);
1605 break;
1606 case 0xe8: /* call (near) */ {
1607 long int rel;
1608 switch (c->op_bytes) {
1609 case 2:
1610 rel = insn_fetch(s16, 2, c->eip);
1611 break;
1612 case 4:
1613 rel = insn_fetch(s32, 4, c->eip);
1614 break;
1615 default:
1616 DPRINTF("Call: Invalid op_bytes\n");
1617 goto cannot_emulate;
1618 }
1619 c->src.val = (unsigned long) c->eip;
1620 JMP_REL(rel);
1621 c->op_bytes = c->ad_bytes;
1622 emulate_push(ctxt);
1623 break;
1624 }
1625 case 0xe9: /* jmp rel */
1626 case 0xeb: /* jmp rel short */
1627 JMP_REL(c->src.val);
1628 c->dst.type = OP_NONE; /* Disable writeback. */
1629 break;
1630 case 0xf4: /* hlt */
1631 ctxt->vcpu->arch.halt_request = 1;
1632 goto done;
1633 case 0xf5: /* cmc */
1634 /* complement carry flag from eflags reg */
1635 ctxt->eflags ^= EFLG_CF;
1636 c->dst.type = OP_NONE; /* Disable writeback. */
1637 break;
1638 case 0xf6 ... 0xf7: /* Grp3 */
1639 rc = emulate_grp3(ctxt, ops);
1640 if (rc != 0)
1641 goto done;
1642 break;
1643 case 0xf8: /* clc */
1644 ctxt->eflags &= ~EFLG_CF;
1645 c->dst.type = OP_NONE; /* Disable writeback. */
1646 break;
1647 case 0xfa: /* cli */
1648 ctxt->eflags &= ~X86_EFLAGS_IF;
1649 c->dst.type = OP_NONE; /* Disable writeback. */
1650 break;
1651 case 0xfb: /* sti */
1652 ctxt->eflags |= X86_EFLAGS_IF;
1653 c->dst.type = OP_NONE; /* Disable writeback. */
1654 break;
1655 case 0xfe ... 0xff: /* Grp4/Grp5 */
1656 rc = emulate_grp45(ctxt, ops);
1657 if (rc != 0)
1658 goto done;
1659 break;
1660 }
1661
1662writeback:
1663 rc = writeback(ctxt, ops);
1664 if (rc != 0)
1665 goto done;
1666
1667 /* Commit shadow register state. */
1668 memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs);
1669 ctxt->vcpu->arch.rip = c->eip;
1670
1671done:
1672 if (rc == X86EMUL_UNHANDLEABLE) {
1673 c->eip = saved_eip;
1674 return -1;
1675 }
1676 return 0;
1677
1678twobyte_insn:
1679 switch (c->b) {
1680 case 0x01: /* lgdt, lidt, lmsw */
1681 switch (c->modrm_reg) {
1682 u16 size;
1683 unsigned long address;
1684
1685 case 0: /* vmcall */
1686 if (c->modrm_mod != 3 || c->modrm_rm != 1)
1687 goto cannot_emulate;
1688
1689 rc = kvm_fix_hypercall(ctxt->vcpu);
1690 if (rc)
1691 goto done;
1692
1693 kvm_emulate_hypercall(ctxt->vcpu);
1694 break;
1695 case 2: /* lgdt */
1696 rc = read_descriptor(ctxt, ops, c->src.ptr,
1697 &size, &address, c->op_bytes);
1698 if (rc)
1699 goto done;
1700 realmode_lgdt(ctxt->vcpu, size, address);
1701 break;
1702 case 3: /* lidt/vmmcall */
1703 if (c->modrm_mod == 3 && c->modrm_rm == 1) {
1704 rc = kvm_fix_hypercall(ctxt->vcpu);
1705 if (rc)
1706 goto done;
1707 kvm_emulate_hypercall(ctxt->vcpu);
1708 } else {
1709 rc = read_descriptor(ctxt, ops, c->src.ptr,
1710 &size, &address,
1711 c->op_bytes);
1712 if (rc)
1713 goto done;
1714 realmode_lidt(ctxt->vcpu, size, address);
1715 }
1716 break;
1717 case 4: /* smsw */
1718 if (c->modrm_mod != 3)
1719 goto cannot_emulate;
1720 *(u16 *)&c->regs[c->modrm_rm]
1721 = realmode_get_cr(ctxt->vcpu, 0);
1722 break;
1723 case 6: /* lmsw */
1724 if (c->modrm_mod != 3)
1725 goto cannot_emulate;
1726 realmode_lmsw(ctxt->vcpu, (u16)c->modrm_val,
1727 &ctxt->eflags);
1728 break;
1729		case 7: /* invlpg */
1730 emulate_invlpg(ctxt->vcpu, memop);
1731 break;
1732 default:
1733 goto cannot_emulate;
1734 }
1735 /* Disable writeback. */
1736 c->dst.type = OP_NONE;
1737 break;
1738 case 0x06:
1739 emulate_clts(ctxt->vcpu);
1740 c->dst.type = OP_NONE;
1741 break;
1742 case 0x08: /* invd */
1743 case 0x09: /* wbinvd */
1744 case 0x0d: /* GrpP (prefetch) */
1745 case 0x18: /* Grp16 (prefetch/nop) */
1746 c->dst.type = OP_NONE;
1747 break;
1748 case 0x20: /* mov cr, reg */
1749 if (c->modrm_mod != 3)
1750 goto cannot_emulate;
1751 c->regs[c->modrm_rm] =
1752 realmode_get_cr(ctxt->vcpu, c->modrm_reg);
1753 c->dst.type = OP_NONE; /* no writeback */
1754 break;
1755 case 0x21: /* mov from dr to reg */
1756 if (c->modrm_mod != 3)
1757 goto cannot_emulate;
1758 rc = emulator_get_dr(ctxt, c->modrm_reg, &c->regs[c->modrm_rm]);
1759 if (rc)
1760 goto cannot_emulate;
1761 c->dst.type = OP_NONE; /* no writeback */
1762 break;
1763 case 0x22: /* mov reg, cr */
1764 if (c->modrm_mod != 3)
1765 goto cannot_emulate;
1766 realmode_set_cr(ctxt->vcpu,
1767 c->modrm_reg, c->modrm_val, &ctxt->eflags);
1768 c->dst.type = OP_NONE;
1769 break;
1770 case 0x23: /* mov from reg to dr */
1771 if (c->modrm_mod != 3)
1772 goto cannot_emulate;
1773 rc = emulator_set_dr(ctxt, c->modrm_reg,
1774 c->regs[c->modrm_rm]);
1775 if (rc)
1776 goto cannot_emulate;
1777 c->dst.type = OP_NONE; /* no writeback */
1778 break;
1779 case 0x30:
1780 /* wrmsr */
1781 msr_data = (u32)c->regs[VCPU_REGS_RAX]
1782 | ((u64)c->regs[VCPU_REGS_RDX] << 32);
1783 rc = kvm_set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data);
1784 if (rc) {
1785 kvm_inject_gp(ctxt->vcpu, 0);
1786 c->eip = ctxt->vcpu->arch.rip;
1787 }
1788 rc = X86EMUL_CONTINUE;
1789 c->dst.type = OP_NONE;
1790 break;
1791 case 0x32:
1792 /* rdmsr */
1793 rc = kvm_get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data);
1794 if (rc) {
1795 kvm_inject_gp(ctxt->vcpu, 0);
1796 c->eip = ctxt->vcpu->arch.rip;
1797 } else {
1798 c->regs[VCPU_REGS_RAX] = (u32)msr_data;
1799 c->regs[VCPU_REGS_RDX] = msr_data >> 32;
1800 }
1801 rc = X86EMUL_CONTINUE;
1802 c->dst.type = OP_NONE;
1803 break;
1804 case 0x40 ... 0x4f: /* cmov */
1805 c->dst.val = c->dst.orig_val = c->src.val;
1806 if (!test_cc(c->b, ctxt->eflags))
1807 c->dst.type = OP_NONE; /* no writeback */
1808 break;
1809	case 0x80 ... 0x8f: /* jnz rel, etc. */ {
1810 long int rel;
1811
1812 switch (c->op_bytes) {
1813 case 2:
1814 rel = insn_fetch(s16, 2, c->eip);
1815 break;
1816 case 4:
1817 rel = insn_fetch(s32, 4, c->eip);
1818 break;
1819 case 8:
1820 rel = insn_fetch(s64, 8, c->eip);
1821 break;
1822 default:
1823 DPRINTF("jnz: Invalid op_bytes\n");
1824 goto cannot_emulate;
1825 }
1826 if (test_cc(c->b, ctxt->eflags))
1827 JMP_REL(rel);
1828 c->dst.type = OP_NONE;
1829 break;
1830 }
1831 case 0xa3:
1832 bt: /* bt */
1833 c->dst.type = OP_NONE;
1834 /* only subword offset */
1835 c->src.val &= (c->dst.bytes << 3) - 1;
1836 emulate_2op_SrcV_nobyte("bt", c->src, c->dst, ctxt->eflags);
1837 break;
1838 case 0xab:
1839 bts: /* bts */
1840 /* only subword offset */
1841 c->src.val &= (c->dst.bytes << 3) - 1;
1842 emulate_2op_SrcV_nobyte("bts", c->src, c->dst, ctxt->eflags);
1843 break;
1844 case 0xb0 ... 0xb1: /* cmpxchg */
1845 /*
1846 * Save real source value, then compare EAX against
1847 * destination.
1848 */
1849 c->src.orig_val = c->src.val;
1850 c->src.val = c->regs[VCPU_REGS_RAX];
1851 emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags);
1852 if (ctxt->eflags & EFLG_ZF) {
1853 /* Success: write back to memory. */
1854 c->dst.val = c->src.orig_val;
1855 } else {
1856 /* Failure: write the value we saw to EAX. */
1857 c->dst.type = OP_REG;
1858 c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX];
1859 }
1860 break;
1861 case 0xb3:
1862 btr: /* btr */
1863 /* only subword offset */
1864 c->src.val &= (c->dst.bytes << 3) - 1;
1865 emulate_2op_SrcV_nobyte("btr", c->src, c->dst, ctxt->eflags);
1866 break;
1867 case 0xb6 ... 0xb7: /* movzx */
1868 c->dst.bytes = c->op_bytes;
1869 c->dst.val = (c->d & ByteOp) ? (u8) c->src.val
1870 : (u16) c->src.val;
1871 break;
1872 case 0xba: /* Grp8 */
1873 switch (c->modrm_reg & 3) {
1874 case 0:
1875 goto bt;
1876 case 1:
1877 goto bts;
1878 case 2:
1879 goto btr;
1880 case 3:
1881 goto btc;
1882 }
1883 break;
1884 case 0xbb:
1885 btc: /* btc */
1886 /* only subword offset */
1887 c->src.val &= (c->dst.bytes << 3) - 1;
1888 emulate_2op_SrcV_nobyte("btc", c->src, c->dst, ctxt->eflags);
1889 break;
1890 case 0xbe ... 0xbf: /* movsx */
1891 c->dst.bytes = c->op_bytes;
1892 c->dst.val = (c->d & ByteOp) ? (s8) c->src.val :
1893 (s16) c->src.val;
1894 break;
1895 case 0xc3: /* movnti */
1896 c->dst.bytes = c->op_bytes;
1897 c->dst.val = (c->op_bytes == 4) ? (u32) c->src.val :
1898 (u64) c->src.val;
1899 break;
1900 case 0xc7: /* Grp9 (cmpxchg8b) */
1901 rc = emulate_grp9(ctxt, ops, memop);
1902 if (rc != 0)
1903 goto done;
1904 c->dst.type = OP_NONE;
1905 break;
1906 }
1907 goto writeback;
1908
1909cannot_emulate:
1910 DPRINTF("Cannot emulate %02x\n", c->b);
1911 c->eip = saved_eip;
1912 return -1;
1913}
diff --git a/drivers/kvm/x86_emulate.h b/drivers/kvm/x86_emulate.h
deleted file mode 100644
index 7db91b9bdcd4..000000000000
--- a/drivers/kvm/x86_emulate.h
+++ /dev/null
@@ -1,186 +0,0 @@
1/******************************************************************************
2 * x86_emulate.h
3 *
4 * Generic x86 (32-bit and 64-bit) instruction decoder and emulator.
5 *
6 * Copyright (c) 2005 Keir Fraser
7 *
8 * From: xen-unstable 10676:af9809f51f81a3c43f276f00c81a52ef558afda4
9 */
10
11#ifndef __X86_EMULATE_H__
12#define __X86_EMULATE_H__
13
14struct x86_emulate_ctxt;
15
16/*
17 * x86_emulate_ops:
18 *
19 * These operations represent the instruction emulator's interface to memory.
20 * There are two categories of operation: those that act on ordinary memory
21 * regions (*_std), and those that act on memory regions known to require
22 * special treatment or emulation (*_emulated).
23 *
24 * The emulator assumes that an instruction accesses only one 'emulated memory'
25 * location, that this location is the given linear faulting address (cr2), and
26 * that this is one of the instruction's data operands. Instruction fetches and
27 * stack operations are assumed never to access emulated memory. The emulator
28 * automatically deduces which operand of a string-move operation is accessing
29 * emulated memory, and assumes that the other operand accesses normal memory.
30 *
31 * NOTES:
32 * 1. The emulator isn't very smart about emulated vs. standard memory.
33 * 'Emulated memory' access addresses should be checked for sanity.
34 * 'Normal memory' accesses may fault, and the caller must arrange to
35 * detect and handle reentrancy into the emulator via recursive faults.
36 * Accesses may be unaligned and may cross page boundaries.
37 * 2. If the access fails (cannot emulate, or a standard access faults) then
38 * it is up to the memop to propagate the fault to the guest VM via
39 * some out-of-band mechanism, unknown to the emulator. The memop signals
40 * failure by returning X86EMUL_PROPAGATE_FAULT to the emulator, which will
41 * then immediately bail.
42 * 3. Valid access sizes are 1, 2, 4 and 8 bytes. On x86/32 systems only
43 * cmpxchg8b_emulated need support 8-byte accesses.
44 * 4. The emulator cannot handle 64-bit mode emulation on an x86/32 system.
45 */
46/* Access completed successfully: continue emulation as normal. */
47#define X86EMUL_CONTINUE 0
48/* Access is unhandleable: bail from emulation and return error to caller. */
49#define X86EMUL_UNHANDLEABLE 1
50/* Terminate emulation but return success to the caller. */
51#define X86EMUL_PROPAGATE_FAULT 2 /* propagate a generated fault to guest */
52#define X86EMUL_RETRY_INSTR 2 /* retry the instruction for some reason */
53#define X86EMUL_CMPXCHG_FAILED 2 /* cmpxchg did not see expected value */
54struct x86_emulate_ops {
55 /*
56 * read_std: Read bytes of standard (non-emulated/special) memory.
57 * Used for instruction fetch, stack operations, and others.
58 * @addr: [IN ] Linear address from which to read.
59 * @val: [OUT] Value read from memory, zero-extended to 'u_long'.
60 * @bytes: [IN ] Number of bytes to read from memory.
61 */
62 int (*read_std)(unsigned long addr, void *val,
63 unsigned int bytes, struct kvm_vcpu *vcpu);
64
65 /*
66 * read_emulated: Read bytes from emulated/special memory area.
67 * @addr: [IN ] Linear address from which to read.
68 * @val: [OUT] Value read from memory, zero-extended to 'u_long'.
69 * @bytes: [IN ] Number of bytes to read from memory.
70 */
71 int (*read_emulated) (unsigned long addr,
72 void *val,
73 unsigned int bytes,
74 struct kvm_vcpu *vcpu);
75
76 /*
77 * write_emulated: Write bytes to emulated/special memory area.
78 * @addr: [IN ] Linear address to which to write.
79 * @val: [IN ] Value to write to memory (low-order bytes used as
80 * required).
81 * @bytes: [IN ] Number of bytes to write to memory.
82 */
83 int (*write_emulated) (unsigned long addr,
84 const void *val,
85 unsigned int bytes,
86 struct kvm_vcpu *vcpu);
87
88 /*
89 * cmpxchg_emulated: Emulate an atomic (LOCKed) CMPXCHG operation on an
90 * emulated/special memory area.
91 * @addr: [IN ] Linear address to access.
92 * @old: [IN ] Value expected to be current at @addr.
93 * @new: [IN ] Value to write to @addr.
94 * @bytes: [IN ] Number of bytes to access using CMPXCHG.
95 */
96 int (*cmpxchg_emulated) (unsigned long addr,
97 const void *old,
98 const void *new,
99 unsigned int bytes,
100 struct kvm_vcpu *vcpu);
101
102};
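A sketch of how a caller might populate this table — purely illustrative, assuming kernel context (memcpy from <linux/string.h>) and a hypothetical flat guest_ram array standing in for real guest-memory handling; bounds and error checks are elided and the cmpxchg hook is a non-atomic stand-in:

	static u8 guest_ram[1 << 20];		/* hypothetical flat backing store */

	static int flat_read(unsigned long addr, void *val,
			     unsigned int bytes, struct kvm_vcpu *vcpu)
	{
		memcpy(val, guest_ram + addr, bytes);
		return X86EMUL_CONTINUE;
	}

	static int flat_write(unsigned long addr, const void *val,
			      unsigned int bytes, struct kvm_vcpu *vcpu)
	{
		memcpy(guest_ram + addr, val, bytes);
		return X86EMUL_CONTINUE;
	}

	static int flat_cmpxchg(unsigned long addr, const void *old, const void *new,
				unsigned int bytes, struct kvm_vcpu *vcpu)
	{
		return flat_write(addr, new, bytes);	/* non-atomic stand-in */
	}

	static struct x86_emulate_ops flat_ops = {
		.read_std         = flat_read,
		.read_emulated    = flat_read,
		.write_emulated   = flat_write,
		.cmpxchg_emulated = flat_cmpxchg,
	};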
103
104/* Type, address-of, and value of an instruction's operand. */
105struct operand {
106 enum { OP_REG, OP_MEM, OP_IMM, OP_NONE } type;
107 unsigned int bytes;
108 unsigned long val, orig_val, *ptr;
109};
110
111struct fetch_cache {
112 u8 data[15];
113 unsigned long start;
114 unsigned long end;
115};
116
117struct decode_cache {
118 u8 twobyte;
119 u8 b;
120 u8 lock_prefix;
121 u8 rep_prefix;
122 u8 op_bytes;
123 u8 ad_bytes;
124 u8 rex_prefix;
125 struct operand src;
126 struct operand dst;
127 unsigned long *override_base;
128 unsigned int d;
129 unsigned long regs[NR_VCPU_REGS];
130 unsigned long eip;
131 /* modrm */
132 u8 modrm;
133 u8 modrm_mod;
134 u8 modrm_reg;
135 u8 modrm_rm;
136 u8 use_modrm_ea;
137 unsigned long modrm_ea;
138 unsigned long modrm_val;
139 struct fetch_cache fetch;
140};
141
142struct x86_emulate_ctxt {
143 /* Register state before/after emulation. */
144 struct kvm_vcpu *vcpu;
145
146 /* Emulated EFLAGS value, passed in by the caller and updated during emulation. */
147 unsigned long eflags;
148
149 /* Emulated execution mode, represented by an X86EMUL_MODE value. */
150 int mode;
151
152 unsigned long cs_base;
153 unsigned long ds_base;
154 unsigned long es_base;
155 unsigned long ss_base;
156 unsigned long gs_base;
157 unsigned long fs_base;
158
159 /* decode cache */
160
161 struct decode_cache decode;
162};
163
164/* Repeat String Operation Prefix */
165#define REPE_PREFIX 1
166#define REPNE_PREFIX 2
167
168/* Execution mode, passed to the emulator. */
169#define X86EMUL_MODE_REAL 0 /* Real mode. */
170#define X86EMUL_MODE_PROT16 2 /* 16-bit protected mode. */
171#define X86EMUL_MODE_PROT32 4 /* 32-bit protected mode. */
172#define X86EMUL_MODE_PROT64 8 /* 64-bit (long) mode. */
173
174/* Host execution mode. */
175#if defined(__i386__)
176#define X86EMUL_MODE_HOST X86EMUL_MODE_PROT32
177#elif defined(CONFIG_X86_64)
178#define X86EMUL_MODE_HOST X86EMUL_MODE_PROT64
179#endif
180
181int x86_decode_insn(struct x86_emulate_ctxt *ctxt,
182 struct x86_emulate_ops *ops);
183int x86_emulate_insn(struct x86_emulate_ctxt *ctxt,
184 struct x86_emulate_ops *ops);
185
186#endif /* __X86_EMULATE_H__ */