author     Linus Torvalds <torvalds@linux-foundation.org>  2008-01-30 17:30:10 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>  2008-01-30 17:30:10 -0500
commit     2c57ee6f924c95e4dce61ed4776fb3f62e1b9f92 (patch)
tree       b9d92e52e8c0ee68a0f5012b470c6146a9f0b65a /drivers
parent     f389e9fcecdec4c4cb890ad28ea30a87a579ec3e (diff)
parent     2f52d58c92d971bf421f461ad06eb93fb4f34981 (diff)
Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/avi/kvm
* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/avi/kvm: (249 commits)
  KVM: Move apic timer migration away from critical section
  KVM: Put kvm_para.h include outside __KERNEL__
  KVM: Fix unbounded preemption latency
  KVM: Initialize the mmu caches only after verifying cpu support
  KVM: MMU: Fix dirty page setting for pages removed from rmap
  KVM: Portability: Move kvm_fpu to asm-x86/kvm.h
  KVM: x86 emulator: Only allow VMCALL/VMMCALL trapped by #UD
  KVM: MMU: Merge shadow level check in FNAME(fetch)
  KVM: MMU: Move kvm_free_some_pages() into critical section
  KVM: MMU: Switch to mmu spinlock
  KVM: MMU: Avoid calling gfn_to_page() in mmu_set_spte()
  KVM: Add kvm_read_guest_atomic()
  KVM: MMU: Concurrent guest walkers
  KVM: Disable vapic support on Intel machines with FlexPriority
  KVM: Accelerated apic support
  KVM: local APIC TPR access reporting facility
  KVM: Print data for unimplemented wrmsr
  KVM: MMU: Add cache miss statistic
  KVM: MMU: Coalesce remote tlb flushes
  KVM: Expose ioapic to ia64 save/restore APIs
  ...
Diffstat (limited to 'drivers')
-rw-r--r--  drivers/Kconfig                       2
-rw-r--r--  drivers/Makefile                      1
-rw-r--r--  drivers/kvm/Kconfig                  54
-rw-r--r--  drivers/kvm/Makefile                 10
-rw-r--r--  drivers/kvm/i8259.c                 450
-rw-r--r--  drivers/kvm/ioapic.c                388
-rw-r--r--  drivers/kvm/irq.c                    98
-rw-r--r--  drivers/kvm/irq.h                   165
-rw-r--r--  drivers/kvm/kvm.h                   796
-rw-r--r--  drivers/kvm/kvm_main.c             3628
-rw-r--r--  drivers/kvm/kvm_svm.h                45
-rw-r--r--  drivers/kvm/lapic.c                1080
-rw-r--r--  drivers/kvm/mmu.c                  1498
-rw-r--r--  drivers/kvm/paging_tmpl.h           511
-rw-r--r--  drivers/kvm/segment_descriptor.h     17
-rw-r--r--  drivers/kvm/svm.c                  1754
-rw-r--r--  drivers/kvm/svm.h                   324
-rw-r--r--  drivers/kvm/vmx.c                  2566
-rw-r--r--  drivers/kvm/vmx.h                   310
-rw-r--r--  drivers/kvm/x86_emulate.c          1662
-rw-r--r--  drivers/kvm/x86_emulate.h           155
21 files changed, 0 insertions, 15514 deletions
diff --git a/drivers/Kconfig b/drivers/Kconfig
index f4076d9e9902..08d4ae201597 100644
--- a/drivers/Kconfig
+++ b/drivers/Kconfig
@@ -90,8 +90,6 @@ source "drivers/dca/Kconfig"
 
 source "drivers/auxdisplay/Kconfig"
 
-source "drivers/kvm/Kconfig"
-
 source "drivers/uio/Kconfig"
 
 source "drivers/virtio/Kconfig"
diff --git a/drivers/Makefile b/drivers/Makefile
index d92d4d82d001..9e1f808e43cf 100644
--- a/drivers/Makefile
+++ b/drivers/Makefile
@@ -47,7 +47,6 @@ obj-$(CONFIG_SPI) += spi/
 obj-$(CONFIG_PCCARD) += pcmcia/
 obj-$(CONFIG_DIO) += dio/
 obj-$(CONFIG_SBUS) += sbus/
-obj-$(CONFIG_KVM) += kvm/
 obj-$(CONFIG_ZORRO) += zorro/
 obj-$(CONFIG_MAC) += macintosh/
 obj-$(CONFIG_ATA_OVER_ETH) += block/aoe/
diff --git a/drivers/kvm/Kconfig b/drivers/kvm/Kconfig
deleted file mode 100644
index 656920636cb2..000000000000
--- a/drivers/kvm/Kconfig
+++ /dev/null
@@ -1,54 +0,0 @@
1#
2# KVM configuration
3#
4menuconfig VIRTUALIZATION
5 bool "Virtualization"
6 depends on X86
7 default y
8 ---help---
9 Say Y here to get to see options for using your Linux host to run other
10 operating systems inside virtual machines (guests).
11 This option alone does not add any kernel code.
12
13 If you say N, all options in this submenu will be skipped and disabled.
14
15if VIRTUALIZATION
16
17config KVM
18 tristate "Kernel-based Virtual Machine (KVM) support"
19 depends on X86 && EXPERIMENTAL
20 select PREEMPT_NOTIFIERS
21 select ANON_INODES
22 ---help---
23 Support hosting fully virtualized guest machines using hardware
24 virtualization extensions. You will need a fairly recent
25 processor equipped with virtualization extensions. You will also
26 need to select one or more of the processor modules below.
27
28 This module provides access to the hardware capabilities through
29 a character device node named /dev/kvm.
30
31 To compile this as a module, choose M here: the module
32 will be called kvm.
33
34 If unsure, say N.
35
36config KVM_INTEL
37 tristate "KVM for Intel processors support"
38 depends on KVM
39 ---help---
40 Provides support for KVM on Intel processors equipped with the VT
41 extensions.
42
43config KVM_AMD
44 tristate "KVM for AMD processors support"
45 depends on KVM
46 ---help---
47 Provides support for KVM on AMD processors equipped with the AMD-V
48 (SVM) extensions.
49
50# OK, it's a little counter-intuitive to do this, but it puts it neatly under
51# the virtualization menu.
52source drivers/lguest/Kconfig
53
54endif # VIRTUALIZATION
diff --git a/drivers/kvm/Makefile b/drivers/kvm/Makefile
deleted file mode 100644
index e5a8f4d3e973..000000000000
--- a/drivers/kvm/Makefile
+++ /dev/null
@@ -1,10 +0,0 @@
1#
2# Makefile for Kernel-based Virtual Machine module
3#
4
5kvm-objs := kvm_main.o mmu.o x86_emulate.o i8259.o irq.o lapic.o ioapic.o
6obj-$(CONFIG_KVM) += kvm.o
7kvm-intel-objs = vmx.o
8obj-$(CONFIG_KVM_INTEL) += kvm-intel.o
9kvm-amd-objs = svm.o
10obj-$(CONFIG_KVM_AMD) += kvm-amd.o
diff --git a/drivers/kvm/i8259.c b/drivers/kvm/i8259.c
deleted file mode 100644
index a679157bc599..000000000000
--- a/drivers/kvm/i8259.c
+++ /dev/null
@@ -1,450 +0,0 @@
1/*
2 * 8259 interrupt controller emulation
3 *
4 * Copyright (c) 2003-2004 Fabrice Bellard
5 * Copyright (c) 2007 Intel Corporation
6 *
7 * Permission is hereby granted, free of charge, to any person obtaining a copy
8 * of this software and associated documentation files (the "Software"), to deal
9 * in the Software without restriction, including without limitation the rights
10 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11 * copies of the Software, and to permit persons to whom the Software is
12 * furnished to do so, subject to the following conditions:
13 *
14 * The above copyright notice and this permission notice shall be included in
15 * all copies or substantial portions of the Software.
16 *
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
23 * THE SOFTWARE.
24 * Authors:
25 * Yaozu (Eddie) Dong <Eddie.dong@intel.com>
26 * Port from Qemu.
27 */
28#include <linux/mm.h>
29#include "irq.h"
30
31/*
32 * set irq level. If an edge is detected, then the IRR is set to 1
33 */
34static inline void pic_set_irq1(struct kvm_kpic_state *s, int irq, int level)
35{
36 int mask;
37 mask = 1 << irq;
38 if (s->elcr & mask) /* level triggered */
39 if (level) {
40 s->irr |= mask;
41 s->last_irr |= mask;
42 } else {
43 s->irr &= ~mask;
44 s->last_irr &= ~mask;
45 }
46 else /* edge triggered */
47 if (level) {
48 if ((s->last_irr & mask) == 0)
49 s->irr |= mask;
50 s->last_irr |= mask;
51 } else
52 s->last_irr &= ~mask;
53}
54
55/*
56 * return the highest priority found in mask (highest = smallest
57 * number). Return 8 if no irq
58 */
59static inline int get_priority(struct kvm_kpic_state *s, int mask)
60{
61 int priority;
62 if (mask == 0)
63 return 8;
64 priority = 0;
65 while ((mask & (1 << ((priority + s->priority_add) & 7))) == 0)
66 priority++;
67 return priority;
68}
69
70/*
71 * return the pic wanted interrupt. return -1 if none
72 */
73static int pic_get_irq(struct kvm_kpic_state *s)
74{
75 int mask, cur_priority, priority;
76
77 mask = s->irr & ~s->imr;
78 priority = get_priority(s, mask);
79 if (priority == 8)
80 return -1;
81 /*
82 * compute current priority. If special fully nested mode on the
83 * master, the IRQ coming from the slave is not taken into account
84 * for the priority computation.
85 */
86 mask = s->isr;
87 if (s->special_fully_nested_mode && s == &s->pics_state->pics[0])
88 mask &= ~(1 << 2);
89 cur_priority = get_priority(s, mask);
90 if (priority < cur_priority)
91 /*
92 * higher priority found: an irq should be generated
93 */
94 return (priority + s->priority_add) & 7;
95 else
96 return -1;
97}
98
99/*
100 * raise irq to CPU if necessary. must be called every time the active
101 * irq may change
102 */
103static void pic_update_irq(struct kvm_pic *s)
104{
105 int irq2, irq;
106
107 irq2 = pic_get_irq(&s->pics[1]);
108 if (irq2 >= 0) {
109 /*
110 * if irq request by slave pic, signal master PIC
111 */
112 pic_set_irq1(&s->pics[0], 2, 1);
113 pic_set_irq1(&s->pics[0], 2, 0);
114 }
115 irq = pic_get_irq(&s->pics[0]);
116 if (irq >= 0)
117 s->irq_request(s->irq_request_opaque, 1);
118 else
119 s->irq_request(s->irq_request_opaque, 0);
120}
121
122void kvm_pic_update_irq(struct kvm_pic *s)
123{
124 pic_update_irq(s);
125}
126
127void kvm_pic_set_irq(void *opaque, int irq, int level)
128{
129 struct kvm_pic *s = opaque;
130
131 pic_set_irq1(&s->pics[irq >> 3], irq & 7, level);
132 pic_update_irq(s);
133}
134
135/*
136 * acknowledge interrupt 'irq'
137 */
138static inline void pic_intack(struct kvm_kpic_state *s, int irq)
139{
140 if (s->auto_eoi) {
141 if (s->rotate_on_auto_eoi)
142 s->priority_add = (irq + 1) & 7;
143 } else
144 s->isr |= (1 << irq);
145 /*
146 * We don't clear a level sensitive interrupt here
147 */
148 if (!(s->elcr & (1 << irq)))
149 s->irr &= ~(1 << irq);
150}
151
152int kvm_pic_read_irq(struct kvm_pic *s)
153{
154 int irq, irq2, intno;
155
156 irq = pic_get_irq(&s->pics[0]);
157 if (irq >= 0) {
158 pic_intack(&s->pics[0], irq);
159 if (irq == 2) {
160 irq2 = pic_get_irq(&s->pics[1]);
161 if (irq2 >= 0)
162 pic_intack(&s->pics[1], irq2);
163 else
164 /*
165 * spurious IRQ on slave controller
166 */
167 irq2 = 7;
168 intno = s->pics[1].irq_base + irq2;
169 irq = irq2 + 8;
170 } else
171 intno = s->pics[0].irq_base + irq;
172 } else {
173 /*
174 * spurious IRQ on host controller
175 */
176 irq = 7;
177 intno = s->pics[0].irq_base + irq;
178 }
179 pic_update_irq(s);
180
181 return intno;
182}
183
184static void pic_reset(void *opaque)
185{
186 struct kvm_kpic_state *s = opaque;
187
188 s->last_irr = 0;
189 s->irr = 0;
190 s->imr = 0;
191 s->isr = 0;
192 s->priority_add = 0;
193 s->irq_base = 0;
194 s->read_reg_select = 0;
195 s->poll = 0;
196 s->special_mask = 0;
197 s->init_state = 0;
198 s->auto_eoi = 0;
199 s->rotate_on_auto_eoi = 0;
200 s->special_fully_nested_mode = 0;
201 s->init4 = 0;
202}
203
204static void pic_ioport_write(void *opaque, u32 addr, u32 val)
205{
206 struct kvm_kpic_state *s = opaque;
207 int priority, cmd, irq;
208
209 addr &= 1;
210 if (addr == 0) {
211 if (val & 0x10) {
212 pic_reset(s); /* init */
213 /*
214 * deassert a pending interrupt
215 */
216 s->pics_state->irq_request(s->pics_state->
217 irq_request_opaque, 0);
218 s->init_state = 1;
219 s->init4 = val & 1;
220 if (val & 0x02)
221 printk(KERN_ERR "single mode not supported");
222 if (val & 0x08)
223 printk(KERN_ERR
224 "level sensitive irq not supported");
225 } else if (val & 0x08) {
226 if (val & 0x04)
227 s->poll = 1;
228 if (val & 0x02)
229 s->read_reg_select = val & 1;
230 if (val & 0x40)
231 s->special_mask = (val >> 5) & 1;
232 } else {
233 cmd = val >> 5;
234 switch (cmd) {
235 case 0:
236 case 4:
237 s->rotate_on_auto_eoi = cmd >> 2;
238 break;
239 case 1: /* end of interrupt */
240 case 5:
241 priority = get_priority(s, s->isr);
242 if (priority != 8) {
243 irq = (priority + s->priority_add) & 7;
244 s->isr &= ~(1 << irq);
245 if (cmd == 5)
246 s->priority_add = (irq + 1) & 7;
247 pic_update_irq(s->pics_state);
248 }
249 break;
250 case 3:
251 irq = val & 7;
252 s->isr &= ~(1 << irq);
253 pic_update_irq(s->pics_state);
254 break;
255 case 6:
256 s->priority_add = (val + 1) & 7;
257 pic_update_irq(s->pics_state);
258 break;
259 case 7:
260 irq = val & 7;
261 s->isr &= ~(1 << irq);
262 s->priority_add = (irq + 1) & 7;
263 pic_update_irq(s->pics_state);
264 break;
265 default:
266 break; /* no operation */
267 }
268 }
269 } else
270 switch (s->init_state) {
271 case 0: /* normal mode */
272 s->imr = val;
273 pic_update_irq(s->pics_state);
274 break;
275 case 1:
276 s->irq_base = val & 0xf8;
277 s->init_state = 2;
278 break;
279 case 2:
280 if (s->init4)
281 s->init_state = 3;
282 else
283 s->init_state = 0;
284 break;
285 case 3:
286 s->special_fully_nested_mode = (val >> 4) & 1;
287 s->auto_eoi = (val >> 1) & 1;
288 s->init_state = 0;
289 break;
290 }
291}
292
293static u32 pic_poll_read(struct kvm_kpic_state *s, u32 addr1)
294{
295 int ret;
296
297 ret = pic_get_irq(s);
298 if (ret >= 0) {
299 if (addr1 >> 7) {
300 s->pics_state->pics[0].isr &= ~(1 << 2);
301 s->pics_state->pics[0].irr &= ~(1 << 2);
302 }
303 s->irr &= ~(1 << ret);
304 s->isr &= ~(1 << ret);
305 if (addr1 >> 7 || ret != 2)
306 pic_update_irq(s->pics_state);
307 } else {
308 ret = 0x07;
309 pic_update_irq(s->pics_state);
310 }
311
312 return ret;
313}
314
315static u32 pic_ioport_read(void *opaque, u32 addr1)
316{
317 struct kvm_kpic_state *s = opaque;
318 unsigned int addr;
319 int ret;
320
321 addr = addr1;
322 addr &= 1;
323 if (s->poll) {
324 ret = pic_poll_read(s, addr1);
325 s->poll = 0;
326 } else
327 if (addr == 0)
328 if (s->read_reg_select)
329 ret = s->isr;
330 else
331 ret = s->irr;
332 else
333 ret = s->imr;
334 return ret;
335}
336
337static void elcr_ioport_write(void *opaque, u32 addr, u32 val)
338{
339 struct kvm_kpic_state *s = opaque;
340 s->elcr = val & s->elcr_mask;
341}
342
343static u32 elcr_ioport_read(void *opaque, u32 addr1)
344{
345 struct kvm_kpic_state *s = opaque;
346 return s->elcr;
347}
348
349static int picdev_in_range(struct kvm_io_device *this, gpa_t addr)
350{
351 switch (addr) {
352 case 0x20:
353 case 0x21:
354 case 0xa0:
355 case 0xa1:
356 case 0x4d0:
357 case 0x4d1:
358 return 1;
359 default:
360 return 0;
361 }
362}
363
364static void picdev_write(struct kvm_io_device *this,
365 gpa_t addr, int len, const void *val)
366{
367 struct kvm_pic *s = this->private;
368 unsigned char data = *(unsigned char *)val;
369
370 if (len != 1) {
371 if (printk_ratelimit())
372 printk(KERN_ERR "PIC: non byte write\n");
373 return;
374 }
375 switch (addr) {
376 case 0x20:
377 case 0x21:
378 case 0xa0:
379 case 0xa1:
380 pic_ioport_write(&s->pics[addr >> 7], addr, data);
381 break;
382 case 0x4d0:
383 case 0x4d1:
384 elcr_ioport_write(&s->pics[addr & 1], addr, data);
385 break;
386 }
387}
388
389static void picdev_read(struct kvm_io_device *this,
390 gpa_t addr, int len, void *val)
391{
392 struct kvm_pic *s = this->private;
393 unsigned char data = 0;
394
395 if (len != 1) {
396 if (printk_ratelimit())
397 printk(KERN_ERR "PIC: non byte read\n");
398 return;
399 }
400 switch (addr) {
401 case 0x20:
402 case 0x21:
403 case 0xa0:
404 case 0xa1:
405 data = pic_ioport_read(&s->pics[addr >> 7], addr);
406 break;
407 case 0x4d0:
408 case 0x4d1:
409 data = elcr_ioport_read(&s->pics[addr & 1], addr);
410 break;
411 }
412 *(unsigned char *)val = data;
413}
414
415/*
416 * callback when PIC0 irq status changed
417 */
418static void pic_irq_request(void *opaque, int level)
419{
420 struct kvm *kvm = opaque;
421 struct kvm_vcpu *vcpu = kvm->vcpus[0];
422
423 pic_irqchip(kvm)->output = level;
424 if (vcpu)
425 kvm_vcpu_kick(vcpu);
426}
427
428struct kvm_pic *kvm_create_pic(struct kvm *kvm)
429{
430 struct kvm_pic *s;
431 s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL);
432 if (!s)
433 return NULL;
434 s->pics[0].elcr_mask = 0xf8;
435 s->pics[1].elcr_mask = 0xde;
436 s->irq_request = pic_irq_request;
437 s->irq_request_opaque = kvm;
438 s->pics[0].pics_state = s;
439 s->pics[1].pics_state = s;
440
441 /*
442 * Initialize PIO device
443 */
444 s->dev.read = picdev_read;
445 s->dev.write = picdev_write;
446 s->dev.in_range = picdev_in_range;
447 s->dev.private = s;
448 kvm_io_bus_register_dev(&kvm->pio_bus, &s->dev);
449 return s;
450}
diff --git a/drivers/kvm/ioapic.c b/drivers/kvm/ioapic.c
deleted file mode 100644
index c7992e667fdb..000000000000
--- a/drivers/kvm/ioapic.c
+++ /dev/null
@@ -1,388 +0,0 @@
1/*
2 * Copyright (C) 2001 MandrakeSoft S.A.
3 *
4 * MandrakeSoft S.A.
5 * 43, rue d'Aboukir
6 * 75002 Paris - France
7 * http://www.linux-mandrake.com/
8 * http://www.mandrakesoft.com/
9 *
10 * This library is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This library is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
19 *
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with this library; if not, write to the Free Software
22 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
23 *
24 * Yunhong Jiang <yunhong.jiang@intel.com>
25 * Yaozu (Eddie) Dong <eddie.dong@intel.com>
26 * Based on Xen 3.1 code.
27 */
28
29#include "kvm.h"
30#include <linux/kvm.h>
31#include <linux/mm.h>
32#include <linux/highmem.h>
33#include <linux/smp.h>
34#include <linux/hrtimer.h>
35#include <linux/io.h>
36#include <asm/processor.h>
37#include <asm/msr.h>
38#include <asm/page.h>
39#include <asm/current.h>
40#include <asm/apicdef.h>
41#include <asm/io_apic.h>
42#include "irq.h"
43/* #define ioapic_debug(fmt,arg...) printk(KERN_WARNING fmt,##arg) */
44#define ioapic_debug(fmt, arg...)
45static void ioapic_deliver(struct kvm_ioapic *vioapic, int irq);
46
47static unsigned long ioapic_read_indirect(struct kvm_ioapic *ioapic,
48 unsigned long addr,
49 unsigned long length)
50{
51 unsigned long result = 0;
52
53 switch (ioapic->ioregsel) {
54 case IOAPIC_REG_VERSION:
55 result = ((((IOAPIC_NUM_PINS - 1) & 0xff) << 16)
56 | (IOAPIC_VERSION_ID & 0xff));
57 break;
58
59 case IOAPIC_REG_APIC_ID:
60 case IOAPIC_REG_ARB_ID:
61 result = ((ioapic->id & 0xf) << 24);
62 break;
63
64 default:
65 {
66 u32 redir_index = (ioapic->ioregsel - 0x10) >> 1;
67 u64 redir_content;
68
69 ASSERT(redir_index < IOAPIC_NUM_PINS);
70
71 redir_content = ioapic->redirtbl[redir_index].bits;
72 result = (ioapic->ioregsel & 0x1) ?
73 (redir_content >> 32) & 0xffffffff :
74 redir_content & 0xffffffff;
75 break;
76 }
77 }
78
79 return result;
80}
81
82static void ioapic_service(struct kvm_ioapic *ioapic, unsigned int idx)
83{
84 union ioapic_redir_entry *pent;
85
86 pent = &ioapic->redirtbl[idx];
87
88 if (!pent->fields.mask) {
89 ioapic_deliver(ioapic, idx);
90 if (pent->fields.trig_mode == IOAPIC_LEVEL_TRIG)
91 pent->fields.remote_irr = 1;
92 }
93 if (!pent->fields.trig_mode)
94 ioapic->irr &= ~(1 << idx);
95}
96
97static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val)
98{
99 unsigned index;
100
101 switch (ioapic->ioregsel) {
102 case IOAPIC_REG_VERSION:
103 /* Writes are ignored. */
104 break;
105
106 case IOAPIC_REG_APIC_ID:
107 ioapic->id = (val >> 24) & 0xf;
108 break;
109
110 case IOAPIC_REG_ARB_ID:
111 break;
112
113 default:
114 index = (ioapic->ioregsel - 0x10) >> 1;
115
116 ioapic_debug("change redir index %x val %x", index, val);
117 if (index >= IOAPIC_NUM_PINS)
118 return;
119 if (ioapic->ioregsel & 1) {
120 ioapic->redirtbl[index].bits &= 0xffffffff;
121 ioapic->redirtbl[index].bits |= (u64) val << 32;
122 } else {
123 ioapic->redirtbl[index].bits &= ~0xffffffffULL;
124 ioapic->redirtbl[index].bits |= (u32) val;
125 ioapic->redirtbl[index].fields.remote_irr = 0;
126 }
127 if (ioapic->irr & (1 << index))
128 ioapic_service(ioapic, index);
129 break;
130 }
131}
132
133static void ioapic_inj_irq(struct kvm_ioapic *ioapic,
134 struct kvm_lapic *target,
135 u8 vector, u8 trig_mode, u8 delivery_mode)
136{
137 ioapic_debug("irq %d trig %d deliv %d", vector, trig_mode,
138 delivery_mode);
139
140 ASSERT((delivery_mode == dest_Fixed) ||
141 (delivery_mode == dest_LowestPrio));
142
143 kvm_apic_set_irq(target, vector, trig_mode);
144}
145
146static u32 ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest,
147 u8 dest_mode)
148{
149 u32 mask = 0;
150 int i;
151 struct kvm *kvm = ioapic->kvm;
152 struct kvm_vcpu *vcpu;
153
154 ioapic_debug("dest %d dest_mode %d", dest, dest_mode);
155
156 if (dest_mode == 0) { /* Physical mode. */
157 if (dest == 0xFF) { /* Broadcast. */
158 for (i = 0; i < KVM_MAX_VCPUS; ++i)
159 if (kvm->vcpus[i] && kvm->vcpus[i]->apic)
160 mask |= 1 << i;
161 return mask;
162 }
163 for (i = 0; i < KVM_MAX_VCPUS; ++i) {
164 vcpu = kvm->vcpus[i];
165 if (!vcpu)
166 continue;
167 if (kvm_apic_match_physical_addr(vcpu->apic, dest)) {
168 if (vcpu->apic)
169 mask = 1 << i;
170 break;
171 }
172 }
173 } else if (dest != 0) /* Logical mode, MDA non-zero. */
174 for (i = 0; i < KVM_MAX_VCPUS; ++i) {
175 vcpu = kvm->vcpus[i];
176 if (!vcpu)
177 continue;
178 if (vcpu->apic &&
179 kvm_apic_match_logical_addr(vcpu->apic, dest))
180 mask |= 1 << vcpu->vcpu_id;
181 }
182 ioapic_debug("mask %x", mask);
183 return mask;
184}
185
186static void ioapic_deliver(struct kvm_ioapic *ioapic, int irq)
187{
188 u8 dest = ioapic->redirtbl[irq].fields.dest_id;
189 u8 dest_mode = ioapic->redirtbl[irq].fields.dest_mode;
190 u8 delivery_mode = ioapic->redirtbl[irq].fields.delivery_mode;
191 u8 vector = ioapic->redirtbl[irq].fields.vector;
192 u8 trig_mode = ioapic->redirtbl[irq].fields.trig_mode;
193 u32 deliver_bitmask;
194 struct kvm_lapic *target;
195 struct kvm_vcpu *vcpu;
196 int vcpu_id;
197
198 ioapic_debug("dest=%x dest_mode=%x delivery_mode=%x "
199 "vector=%x trig_mode=%x",
200 dest, dest_mode, delivery_mode, vector, trig_mode);
201
202 deliver_bitmask = ioapic_get_delivery_bitmask(ioapic, dest, dest_mode);
203 if (!deliver_bitmask) {
204 ioapic_debug("no target on destination");
205 return;
206 }
207
208 switch (delivery_mode) {
209 case dest_LowestPrio:
210 target =
211 kvm_apic_round_robin(ioapic->kvm, vector, deliver_bitmask);
212 if (target != NULL)
213 ioapic_inj_irq(ioapic, target, vector,
214 trig_mode, delivery_mode);
215 else
216 ioapic_debug("null round robin: "
217 "mask=%x vector=%x delivery_mode=%x",
218 deliver_bitmask, vector, dest_LowestPrio);
219 break;
220 case dest_Fixed:
221 for (vcpu_id = 0; deliver_bitmask != 0; vcpu_id++) {
222 if (!(deliver_bitmask & (1 << vcpu_id)))
223 continue;
224 deliver_bitmask &= ~(1 << vcpu_id);
225 vcpu = ioapic->kvm->vcpus[vcpu_id];
226 if (vcpu) {
227 target = vcpu->apic;
228 ioapic_inj_irq(ioapic, target, vector,
229 trig_mode, delivery_mode);
230 }
231 }
232 break;
233
234 /* TODO: NMI */
235 default:
236 printk(KERN_WARNING "Unsupported delivery mode %d\n",
237 delivery_mode);
238 break;
239 }
240}
241
242void kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level)
243{
244 u32 old_irr = ioapic->irr;
245 u32 mask = 1 << irq;
246 union ioapic_redir_entry entry;
247
248 if (irq >= 0 && irq < IOAPIC_NUM_PINS) {
249 entry = ioapic->redirtbl[irq];
250 level ^= entry.fields.polarity;
251 if (!level)
252 ioapic->irr &= ~mask;
253 else {
254 ioapic->irr |= mask;
255 if ((!entry.fields.trig_mode && old_irr != ioapic->irr)
256 || !entry.fields.remote_irr)
257 ioapic_service(ioapic, irq);
258 }
259 }
260}
261
262static int get_eoi_gsi(struct kvm_ioapic *ioapic, int vector)
263{
264 int i;
265
266 for (i = 0; i < IOAPIC_NUM_PINS; i++)
267 if (ioapic->redirtbl[i].fields.vector == vector)
268 return i;
269 return -1;
270}
271
272void kvm_ioapic_update_eoi(struct kvm *kvm, int vector)
273{
274 struct kvm_ioapic *ioapic = kvm->vioapic;
275 union ioapic_redir_entry *ent;
276 int gsi;
277
278 gsi = get_eoi_gsi(ioapic, vector);
279 if (gsi == -1) {
280 printk(KERN_WARNING "Can't find redir item for %d EOI\n",
281 vector);
282 return;
283 }
284
285 ent = &ioapic->redirtbl[gsi];
286 ASSERT(ent->fields.trig_mode == IOAPIC_LEVEL_TRIG);
287
288 ent->fields.remote_irr = 0;
289 if (!ent->fields.mask && (ioapic->irr & (1 << gsi)))
290 ioapic_deliver(ioapic, gsi);
291}
292
293static int ioapic_in_range(struct kvm_io_device *this, gpa_t addr)
294{
295 struct kvm_ioapic *ioapic = (struct kvm_ioapic *)this->private;
296
297 return ((addr >= ioapic->base_address &&
298 (addr < ioapic->base_address + IOAPIC_MEM_LENGTH)));
299}
300
301static void ioapic_mmio_read(struct kvm_io_device *this, gpa_t addr, int len,
302 void *val)
303{
304 struct kvm_ioapic *ioapic = (struct kvm_ioapic *)this->private;
305 u32 result;
306
307 ioapic_debug("addr %lx", (unsigned long)addr);
308 ASSERT(!(addr & 0xf)); /* check alignment */
309
310 addr &= 0xff;
311 switch (addr) {
312 case IOAPIC_REG_SELECT:
313 result = ioapic->ioregsel;
314 break;
315
316 case IOAPIC_REG_WINDOW:
317 result = ioapic_read_indirect(ioapic, addr, len);
318 break;
319
320 default:
321 result = 0;
322 break;
323 }
324 switch (len) {
325 case 8:
326 *(u64 *) val = result;
327 break;
328 case 1:
329 case 2:
330 case 4:
331 memcpy(val, (char *)&result, len);
332 break;
333 default:
334 printk(KERN_WARNING "ioapic: wrong length %d\n", len);
335 }
336}
337
338static void ioapic_mmio_write(struct kvm_io_device *this, gpa_t addr, int len,
339 const void *val)
340{
341 struct kvm_ioapic *ioapic = (struct kvm_ioapic *)this->private;
342 u32 data;
343
344 ioapic_debug("ioapic_mmio_write addr=%lx len=%d val=%p\n",
345 addr, len, val);
346 ASSERT(!(addr & 0xf)); /* check alignment */
347 if (len == 4 || len == 8)
348 data = *(u32 *) val;
349 else {
350 printk(KERN_WARNING "ioapic: Unsupported size %d\n", len);
351 return;
352 }
353
354 addr &= 0xff;
355 switch (addr) {
356 case IOAPIC_REG_SELECT:
357 ioapic->ioregsel = data;
358 break;
359
360 case IOAPIC_REG_WINDOW:
361 ioapic_write_indirect(ioapic, data);
362 break;
363
364 default:
365 break;
366 }
367}
368
369int kvm_ioapic_init(struct kvm *kvm)
370{
371 struct kvm_ioapic *ioapic;
372 int i;
373
374 ioapic = kzalloc(sizeof(struct kvm_ioapic), GFP_KERNEL);
375 if (!ioapic)
376 return -ENOMEM;
377 kvm->vioapic = ioapic;
378 for (i = 0; i < IOAPIC_NUM_PINS; i++)
379 ioapic->redirtbl[i].fields.mask = 1;
380 ioapic->base_address = IOAPIC_DEFAULT_BASE_ADDRESS;
381 ioapic->dev.read = ioapic_mmio_read;
382 ioapic->dev.write = ioapic_mmio_write;
383 ioapic->dev.in_range = ioapic_in_range;
384 ioapic->dev.private = ioapic;
385 ioapic->kvm = kvm;
386 kvm_io_bus_register_dev(&kvm->mmio_bus, &ioapic->dev);
387 return 0;
388}
diff --git a/drivers/kvm/irq.c b/drivers/kvm/irq.c
deleted file mode 100644
index 7628c7ff628f..000000000000
--- a/drivers/kvm/irq.c
+++ /dev/null
@@ -1,98 +0,0 @@
1/*
2 * irq.c: API for in kernel interrupt controller
3 * Copyright (c) 2007, Intel Corporation.
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms and conditions of the GNU General Public License,
7 * version 2, as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 * more details.
13 *
14 * You should have received a copy of the GNU General Public License along with
15 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
16 * Place - Suite 330, Boston, MA 02111-1307 USA.
17 * Authors:
18 * Yaozu (Eddie) Dong <Eddie.dong@intel.com>
19 *
20 */
21
22#include <linux/module.h>
23
24#include "kvm.h"
25#include "irq.h"
26
27/*
28 * check if there is pending interrupt without
29 * intack.
30 */
31int kvm_cpu_has_interrupt(struct kvm_vcpu *v)
32{
33 struct kvm_pic *s;
34
35 if (kvm_apic_has_interrupt(v) == -1) { /* LAPIC */
36 if (kvm_apic_accept_pic_intr(v)) {
37 s = pic_irqchip(v->kvm); /* PIC */
38 return s->output;
39 } else
40 return 0;
41 }
42 return 1;
43}
44EXPORT_SYMBOL_GPL(kvm_cpu_has_interrupt);
45
46/*
47 * Read pending interrupt vector and intack.
48 */
49int kvm_cpu_get_interrupt(struct kvm_vcpu *v)
50{
51 struct kvm_pic *s;
52 int vector;
53
54 vector = kvm_get_apic_interrupt(v); /* APIC */
55 if (vector == -1) {
56 if (kvm_apic_accept_pic_intr(v)) {
57 s = pic_irqchip(v->kvm);
58 s->output = 0; /* PIC */
59 vector = kvm_pic_read_irq(s);
60 }
61 }
62 return vector;
63}
64EXPORT_SYMBOL_GPL(kvm_cpu_get_interrupt);
65
66static void vcpu_kick_intr(void *info)
67{
68#ifdef DEBUG
69 struct kvm_vcpu *vcpu = (struct kvm_vcpu *)info;
70 printk(KERN_DEBUG "vcpu_kick_intr %p \n", vcpu);
71#endif
72}
73
74void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
75{
76 int ipi_pcpu = vcpu->cpu;
77
78 if (waitqueue_active(&vcpu->wq)) {
79 wake_up_interruptible(&vcpu->wq);
80 ++vcpu->stat.halt_wakeup;
81 }
82 if (vcpu->guest_mode)
83 smp_call_function_single(ipi_pcpu, vcpu_kick_intr, vcpu, 0, 0);
84}
85
86void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu)
87{
88 kvm_inject_apic_timer_irqs(vcpu);
89 /* TODO: PIT, RTC etc. */
90}
91EXPORT_SYMBOL_GPL(kvm_inject_pending_timer_irqs);
92
93void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec)
94{
95 kvm_apic_timer_intr_post(vcpu, vec);
96 /* TODO: PIT, RTC etc. */
97}
98EXPORT_SYMBOL_GPL(kvm_timer_intr_post);
diff --git a/drivers/kvm/irq.h b/drivers/kvm/irq.h
deleted file mode 100644
index 11fc014e2b30..000000000000
--- a/drivers/kvm/irq.h
+++ /dev/null
@@ -1,165 +0,0 @@
1/*
2 * irq.h: in kernel interrupt controller related definitions
3 * Copyright (c) 2007, Intel Corporation.
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms and conditions of the GNU General Public License,
7 * version 2, as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 * more details.
13 *
14 * You should have received a copy of the GNU General Public License along with
15 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
16 * Place - Suite 330, Boston, MA 02111-1307 USA.
17 * Authors:
18 * Yaozu (Eddie) Dong <Eddie.dong@intel.com>
19 *
20 */
21
22#ifndef __IRQ_H
23#define __IRQ_H
24
25#include "kvm.h"
26
27typedef void irq_request_func(void *opaque, int level);
28
29struct kvm_kpic_state {
30 u8 last_irr; /* edge detection */
31 u8 irr; /* interrupt request register */
32 u8 imr; /* interrupt mask register */
33 u8 isr; /* interrupt service register */
34 u8 priority_add; /* highest irq priority */
35 u8 irq_base;
36 u8 read_reg_select;
37 u8 poll;
38 u8 special_mask;
39 u8 init_state;
40 u8 auto_eoi;
41 u8 rotate_on_auto_eoi;
42 u8 special_fully_nested_mode;
43 u8 init4; /* true if 4 byte init */
44 u8 elcr; /* PIIX edge/trigger selection */
45 u8 elcr_mask;
46 struct kvm_pic *pics_state;
47};
48
49struct kvm_pic {
50 struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */
51 irq_request_func *irq_request;
52 void *irq_request_opaque;
53 int output; /* intr from master PIC */
54 struct kvm_io_device dev;
55};
56
57struct kvm_pic *kvm_create_pic(struct kvm *kvm);
58void kvm_pic_set_irq(void *opaque, int irq, int level);
59int kvm_pic_read_irq(struct kvm_pic *s);
60int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
61int kvm_cpu_has_interrupt(struct kvm_vcpu *v);
62void kvm_pic_update_irq(struct kvm_pic *s);
63
64#define IOAPIC_NUM_PINS KVM_IOAPIC_NUM_PINS
65#define IOAPIC_VERSION_ID 0x11 /* IOAPIC version */
66#define IOAPIC_EDGE_TRIG 0
67#define IOAPIC_LEVEL_TRIG 1
68
69#define IOAPIC_DEFAULT_BASE_ADDRESS 0xfec00000
70#define IOAPIC_MEM_LENGTH 0x100
71
72/* Direct registers. */
73#define IOAPIC_REG_SELECT 0x00
74#define IOAPIC_REG_WINDOW 0x10
75#define IOAPIC_REG_EOI 0x40 /* IA64 IOSAPIC only */
76
77/* Indirect registers. */
78#define IOAPIC_REG_APIC_ID 0x00 /* x86 IOAPIC only */
79#define IOAPIC_REG_VERSION 0x01
80#define IOAPIC_REG_ARB_ID 0x02 /* x86 IOAPIC only */
81
82struct kvm_ioapic {
83 u64 base_address;
84 u32 ioregsel;
85 u32 id;
86 u32 irr;
87 u32 pad;
88 union ioapic_redir_entry {
89 u64 bits;
90 struct {
91 u8 vector;
92 u8 delivery_mode:3;
93 u8 dest_mode:1;
94 u8 delivery_status:1;
95 u8 polarity:1;
96 u8 remote_irr:1;
97 u8 trig_mode:1;
98 u8 mask:1;
99 u8 reserve:7;
100 u8 reserved[4];
101 u8 dest_id;
102 } fields;
103 } redirtbl[IOAPIC_NUM_PINS];
104 struct kvm_io_device dev;
105 struct kvm *kvm;
106};
107
108struct kvm_lapic {
109 unsigned long base_address;
110 struct kvm_io_device dev;
111 struct {
112 atomic_t pending;
113 s64 period; /* unit: ns */
114 u32 divide_count;
115 ktime_t last_update;
116 struct hrtimer dev;
117 } timer;
118 struct kvm_vcpu *vcpu;
119 struct page *regs_page;
120 void *regs;
121};
122
123#ifdef DEBUG
124#define ASSERT(x) \
125do { \
126 if (!(x)) { \
127 printk(KERN_EMERG "assertion failed %s: %d: %s\n", \
128 __FILE__, __LINE__, #x); \
129 BUG(); \
130 } \
131} while (0)
132#else
133#define ASSERT(x) do { } while (0)
134#endif
135
136void kvm_vcpu_kick(struct kvm_vcpu *vcpu);
137int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu);
138int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu);
139int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu);
140int kvm_create_lapic(struct kvm_vcpu *vcpu);
141void kvm_lapic_reset(struct kvm_vcpu *vcpu);
142void kvm_free_apic(struct kvm_lapic *apic);
143u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu);
144void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8);
145void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value);
146struct kvm_lapic *kvm_apic_round_robin(struct kvm *kvm, u8 vector,
147 unsigned long bitmap);
148u64 kvm_get_apic_base(struct kvm_vcpu *vcpu);
149void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data);
150int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest);
151void kvm_ioapic_update_eoi(struct kvm *kvm, int vector);
152int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda);
153int kvm_apic_set_irq(struct kvm_lapic *apic, u8 vec, u8 trig);
154void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu);
155int kvm_ioapic_init(struct kvm *kvm);
156void kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level);
157int kvm_lapic_enabled(struct kvm_vcpu *vcpu);
158int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu);
159void kvm_apic_timer_intr_post(struct kvm_vcpu *vcpu, int vec);
160void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec);
161void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu);
162void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu);
163void kvm_migrate_apic_timer(struct kvm_vcpu *vcpu);
164
165#endif
diff --git a/drivers/kvm/kvm.h b/drivers/kvm/kvm.h
deleted file mode 100644
index 3b0bc4bda5f2..000000000000
--- a/drivers/kvm/kvm.h
+++ /dev/null
@@ -1,796 +0,0 @@
1#ifndef __KVM_H
2#define __KVM_H
3
4/*
5 * This work is licensed under the terms of the GNU GPL, version 2. See
6 * the COPYING file in the top-level directory.
7 */
8
9#include <linux/types.h>
10#include <linux/list.h>
11#include <linux/mutex.h>
12#include <linux/spinlock.h>
13#include <linux/signal.h>
14#include <linux/sched.h>
15#include <linux/mm.h>
16#include <linux/preempt.h>
17#include <asm/signal.h>
18
19#include <linux/kvm.h>
20#include <linux/kvm_para.h>
21
22#define CR3_PAE_RESERVED_BITS ((X86_CR3_PWT | X86_CR3_PCD) - 1)
23#define CR3_NONPAE_RESERVED_BITS ((PAGE_SIZE-1) & ~(X86_CR3_PWT | X86_CR3_PCD))
24#define CR3_L_MODE_RESERVED_BITS (CR3_NONPAE_RESERVED_BITS|0xFFFFFF0000000000ULL)
25
26#define KVM_GUEST_CR0_MASK \
27 (X86_CR0_PG | X86_CR0_PE | X86_CR0_WP | X86_CR0_NE \
28 | X86_CR0_NW | X86_CR0_CD)
29#define KVM_VM_CR0_ALWAYS_ON \
30 (X86_CR0_PG | X86_CR0_PE | X86_CR0_WP | X86_CR0_NE | X86_CR0_TS \
31 | X86_CR0_MP)
32#define KVM_GUEST_CR4_MASK \
33 (X86_CR4_VME | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_PGE | X86_CR4_VMXE)
34#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE)
35#define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE)
36
37#define INVALID_PAGE (~(hpa_t)0)
38#define UNMAPPED_GVA (~(gpa_t)0)
39
40#define KVM_MAX_VCPUS 4
41#define KVM_ALIAS_SLOTS 4
42#define KVM_MEMORY_SLOTS 8
43#define KVM_NUM_MMU_PAGES 1024
44#define KVM_MIN_FREE_MMU_PAGES 5
45#define KVM_REFILL_PAGES 25
46#define KVM_MAX_CPUID_ENTRIES 40
47
48#define DE_VECTOR 0
49#define NM_VECTOR 7
50#define DF_VECTOR 8
51#define TS_VECTOR 10
52#define NP_VECTOR 11
53#define SS_VECTOR 12
54#define GP_VECTOR 13
55#define PF_VECTOR 14
56
57#define SELECTOR_TI_MASK (1 << 2)
58#define SELECTOR_RPL_MASK 0x03
59
60#define IOPL_SHIFT 12
61
62#define KVM_PIO_PAGE_OFFSET 1
63
64/*
65 * vcpu->requests bit members
66 */
67#define KVM_TLB_FLUSH 0
68
69/*
70 * Address types:
71 *
72 * gva - guest virtual address
73 * gpa - guest physical address
74 * gfn - guest frame number
75 * hva - host virtual address
76 * hpa - host physical address
77 * hfn - host frame number
78 */
79
80typedef unsigned long gva_t;
81typedef u64 gpa_t;
82typedef unsigned long gfn_t;
83
84typedef unsigned long hva_t;
85typedef u64 hpa_t;
86typedef unsigned long hfn_t;
87
88#define NR_PTE_CHAIN_ENTRIES 5
89
90struct kvm_pte_chain {
91 u64 *parent_ptes[NR_PTE_CHAIN_ENTRIES];
92 struct hlist_node link;
93};
94
95/*
96 * kvm_mmu_page_role, below, is defined as:
97 *
98 * bits 0:3 - total guest paging levels (2-4, or zero for real mode)
99 * bits 4:7 - page table level for this shadow (1-4)
100 * bits 8:9 - page table quadrant for 2-level guests
101 * bit 16 - "metaphysical" - gfn is not a real page (huge page/real mode)
102 * bits 17:19 - "access" - the user, writable, and nx bits of a huge page pde
103 */
104union kvm_mmu_page_role {
105 unsigned word;
106 struct {
107 unsigned glevels : 4;
108 unsigned level : 4;
109 unsigned quadrant : 2;
110 unsigned pad_for_nice_hex_output : 6;
111 unsigned metaphysical : 1;
112 unsigned hugepage_access : 3;
113 };
114};
115
116struct kvm_mmu_page {
117 struct list_head link;
118 struct hlist_node hash_link;
119
120 /*
121 * The following two entries are used to key the shadow page in the
122 * hash table.
123 */
124 gfn_t gfn;
125 union kvm_mmu_page_role role;
126
127 u64 *spt;
128 unsigned long slot_bitmap; /* One bit set per slot which has memory
129 * in this shadow page.
130 */
131 int multimapped; /* More than one parent_pte? */
132 int root_count; /* Currently serving as active root */
133 union {
134 u64 *parent_pte; /* !multimapped */
135 struct hlist_head parent_ptes; /* multimapped, kvm_pte_chain */
136 };
137};
138
139struct kvm_vcpu;
140extern struct kmem_cache *kvm_vcpu_cache;
141
142/*
143 * x86 supports 3 paging modes (4-level 64-bit, 3-level 64-bit, and 2-level
144 * 32-bit). The kvm_mmu structure abstracts the details of the current mmu
145 * mode.
146 */
147struct kvm_mmu {
148 void (*new_cr3)(struct kvm_vcpu *vcpu);
149 int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err);
150 void (*free)(struct kvm_vcpu *vcpu);
151 gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva);
152 hpa_t root_hpa;
153 int root_level;
154 int shadow_root_level;
155
156 u64 *pae_root;
157};
158
159#define KVM_NR_MEM_OBJS 20
160
161struct kvm_mmu_memory_cache {
162 int nobjs;
163 void *objects[KVM_NR_MEM_OBJS];
164};
165
166/*
167 * We don't want allocation failures within the mmu code, so we preallocate
168 * enough memory for a single page fault in a cache.
169 */
170struct kvm_guest_debug {
171 int enabled;
172 unsigned long bp[4];
173 int singlestep;
174};
175
176enum {
177 VCPU_REGS_RAX = 0,
178 VCPU_REGS_RCX = 1,
179 VCPU_REGS_RDX = 2,
180 VCPU_REGS_RBX = 3,
181 VCPU_REGS_RSP = 4,
182 VCPU_REGS_RBP = 5,
183 VCPU_REGS_RSI = 6,
184 VCPU_REGS_RDI = 7,
185#ifdef CONFIG_X86_64
186 VCPU_REGS_R8 = 8,
187 VCPU_REGS_R9 = 9,
188 VCPU_REGS_R10 = 10,
189 VCPU_REGS_R11 = 11,
190 VCPU_REGS_R12 = 12,
191 VCPU_REGS_R13 = 13,
192 VCPU_REGS_R14 = 14,
193 VCPU_REGS_R15 = 15,
194#endif
195 NR_VCPU_REGS
196};
197
198enum {
199 VCPU_SREG_CS,
200 VCPU_SREG_DS,
201 VCPU_SREG_ES,
202 VCPU_SREG_FS,
203 VCPU_SREG_GS,
204 VCPU_SREG_SS,
205 VCPU_SREG_TR,
206 VCPU_SREG_LDTR,
207};
208
209struct kvm_pio_request {
210 unsigned long count;
211 int cur_count;
212 struct page *guest_pages[2];
213 unsigned guest_page_offset;
214 int in;
215 int port;
216 int size;
217 int string;
218 int down;
219 int rep;
220};
221
222struct kvm_stat {
223 u32 pf_fixed;
224 u32 pf_guest;
225 u32 tlb_flush;
226 u32 invlpg;
227
228 u32 exits;
229 u32 io_exits;
230 u32 mmio_exits;
231 u32 signal_exits;
232 u32 irq_window_exits;
233 u32 halt_exits;
234 u32 halt_wakeup;
235 u32 request_irq_exits;
236 u32 irq_exits;
237 u32 light_exits;
238 u32 efer_reload;
239};
240
241struct kvm_io_device {
242 void (*read)(struct kvm_io_device *this,
243 gpa_t addr,
244 int len,
245 void *val);
246 void (*write)(struct kvm_io_device *this,
247 gpa_t addr,
248 int len,
249 const void *val);
250 int (*in_range)(struct kvm_io_device *this, gpa_t addr);
251 void (*destructor)(struct kvm_io_device *this);
252
253 void *private;
254};
255
256static inline void kvm_iodevice_read(struct kvm_io_device *dev,
257 gpa_t addr,
258 int len,
259 void *val)
260{
261 dev->read(dev, addr, len, val);
262}
263
264static inline void kvm_iodevice_write(struct kvm_io_device *dev,
265 gpa_t addr,
266 int len,
267 const void *val)
268{
269 dev->write(dev, addr, len, val);
270}
271
272static inline int kvm_iodevice_inrange(struct kvm_io_device *dev, gpa_t addr)
273{
274 return dev->in_range(dev, addr);
275}
276
277static inline void kvm_iodevice_destructor(struct kvm_io_device *dev)
278{
279 if (dev->destructor)
280 dev->destructor(dev);
281}
282
283/*
284 * It would be nice to use something smarter than a linear search, TBD...
285 * Thankfully we dont expect many devices to register (famous last words :),
286 * so until then it will suffice. At least its abstracted so we can change
287 * in one place.
288 */
289struct kvm_io_bus {
290 int dev_count;
291#define NR_IOBUS_DEVS 6
292 struct kvm_io_device *devs[NR_IOBUS_DEVS];
293};
294
295void kvm_io_bus_init(struct kvm_io_bus *bus);
296void kvm_io_bus_destroy(struct kvm_io_bus *bus);
297struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus, gpa_t addr);
298void kvm_io_bus_register_dev(struct kvm_io_bus *bus,
299 struct kvm_io_device *dev);
300
301struct kvm_vcpu {
302 struct kvm *kvm;
303 struct preempt_notifier preempt_notifier;
304 int vcpu_id;
305 struct mutex mutex;
306 int cpu;
307 u64 host_tsc;
308 struct kvm_run *run;
309 int interrupt_window_open;
310 int guest_mode;
311 unsigned long requests;
312 unsigned long irq_summary; /* bit vector: 1 per word in irq_pending */
313 DECLARE_BITMAP(irq_pending, KVM_NR_INTERRUPTS);
314 unsigned long regs[NR_VCPU_REGS]; /* for rsp: vcpu_load_rsp_rip() */
315 unsigned long rip; /* needs vcpu_load_rsp_rip() */
316
317 unsigned long cr0;
318 unsigned long cr2;
319 unsigned long cr3;
320 gpa_t para_state_gpa;
321 struct page *para_state_page;
322 gpa_t hypercall_gpa;
323 unsigned long cr4;
324 unsigned long cr8;
325 u64 pdptrs[4]; /* pae */
326 u64 shadow_efer;
327 u64 apic_base;
328 struct kvm_lapic *apic; /* kernel irqchip context */
329#define VCPU_MP_STATE_RUNNABLE 0
330#define VCPU_MP_STATE_UNINITIALIZED 1
331#define VCPU_MP_STATE_INIT_RECEIVED 2
332#define VCPU_MP_STATE_SIPI_RECEIVED 3
333#define VCPU_MP_STATE_HALTED 4
334 int mp_state;
335 int sipi_vector;
336 u64 ia32_misc_enable_msr;
337
338 struct kvm_mmu mmu;
339
340 struct kvm_mmu_memory_cache mmu_pte_chain_cache;
341 struct kvm_mmu_memory_cache mmu_rmap_desc_cache;
342 struct kvm_mmu_memory_cache mmu_page_cache;
343 struct kvm_mmu_memory_cache mmu_page_header_cache;
344
345 gfn_t last_pt_write_gfn;
346 int last_pt_write_count;
347
348 struct kvm_guest_debug guest_debug;
349
350 struct i387_fxsave_struct host_fx_image;
351 struct i387_fxsave_struct guest_fx_image;
352 int fpu_active;
353 int guest_fpu_loaded;
354
355 int mmio_needed;
356 int mmio_read_completed;
357 int mmio_is_write;
358 int mmio_size;
359 unsigned char mmio_data[8];
360 gpa_t mmio_phys_addr;
361 gva_t mmio_fault_cr2;
362 struct kvm_pio_request pio;
363 void *pio_data;
364 wait_queue_head_t wq;
365
366 int sigset_active;
367 sigset_t sigset;
368
369 struct kvm_stat stat;
370
371 struct {
372 int active;
373 u8 save_iopl;
374 struct kvm_save_segment {
375 u16 selector;
376 unsigned long base;
377 u32 limit;
378 u32 ar;
379 } tr, es, ds, fs, gs;
380 } rmode;
381 int halt_request; /* real mode on Intel only */
382
383 int cpuid_nent;
384 struct kvm_cpuid_entry cpuid_entries[KVM_MAX_CPUID_ENTRIES];
385};
386
387struct kvm_mem_alias {
388 gfn_t base_gfn;
389 unsigned long npages;
390 gfn_t target_gfn;
391};
392
393struct kvm_memory_slot {
394 gfn_t base_gfn;
395 unsigned long npages;
396 unsigned long flags;
397 struct page **phys_mem;
398 unsigned long *dirty_bitmap;
399};
400
401struct kvm {
402 struct mutex lock; /* protects everything except vcpus */
403 int naliases;
404 struct kvm_mem_alias aliases[KVM_ALIAS_SLOTS];
405 int nmemslots;
406 struct kvm_memory_slot memslots[KVM_MEMORY_SLOTS];
407 /*
408 * Hash table of struct kvm_mmu_page.
409 */
410 struct list_head active_mmu_pages;
411 int n_free_mmu_pages;
412 struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
413 struct kvm_vcpu *vcpus[KVM_MAX_VCPUS];
414 unsigned long rmap_overflow;
415 struct list_head vm_list;
416 struct file *filp;
417 struct kvm_io_bus mmio_bus;
418 struct kvm_io_bus pio_bus;
419 struct kvm_pic *vpic;
420 struct kvm_ioapic *vioapic;
421 int round_robin_prev_vcpu;
422};
423
424static inline struct kvm_pic *pic_irqchip(struct kvm *kvm)
425{
426 return kvm->vpic;
427}
428
429static inline struct kvm_ioapic *ioapic_irqchip(struct kvm *kvm)
430{
431 return kvm->vioapic;
432}
433
434static inline int irqchip_in_kernel(struct kvm *kvm)
435{
436 return pic_irqchip(kvm) != 0;
437}
438
439struct descriptor_table {
440 u16 limit;
441 unsigned long base;
442} __attribute__((packed));
443
444struct kvm_x86_ops {
445 int (*cpu_has_kvm_support)(void); /* __init */
446 int (*disabled_by_bios)(void); /* __init */
447 void (*hardware_enable)(void *dummy); /* __init */
448 void (*hardware_disable)(void *dummy);
449 void (*check_processor_compatibility)(void *rtn);
450 int (*hardware_setup)(void); /* __init */
451 void (*hardware_unsetup)(void); /* __exit */
452
453 /* Create, but do not attach this VCPU */
454 struct kvm_vcpu *(*vcpu_create)(struct kvm *kvm, unsigned id);
455 void (*vcpu_free)(struct kvm_vcpu *vcpu);
456 void (*vcpu_reset)(struct kvm_vcpu *vcpu);
457
458 void (*prepare_guest_switch)(struct kvm_vcpu *vcpu);
459 void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu);
460 void (*vcpu_put)(struct kvm_vcpu *vcpu);
461 void (*vcpu_decache)(struct kvm_vcpu *vcpu);
462
463 int (*set_guest_debug)(struct kvm_vcpu *vcpu,
464 struct kvm_debug_guest *dbg);
465 void (*guest_debug_pre)(struct kvm_vcpu *vcpu);
466 int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata);
467 int (*set_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
468 u64 (*get_segment_base)(struct kvm_vcpu *vcpu, int seg);
469 void (*get_segment)(struct kvm_vcpu *vcpu,
470 struct kvm_segment *var, int seg);
471 void (*set_segment)(struct kvm_vcpu *vcpu,
472 struct kvm_segment *var, int seg);
473 void (*get_cs_db_l_bits)(struct kvm_vcpu *vcpu, int *db, int *l);
474 void (*decache_cr4_guest_bits)(struct kvm_vcpu *vcpu);
475 void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0);
476 void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3);
477 void (*set_cr4)(struct kvm_vcpu *vcpu, unsigned long cr4);
478 void (*set_efer)(struct kvm_vcpu *vcpu, u64 efer);
479 void (*get_idt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt);
480 void (*set_idt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt);
481 void (*get_gdt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt);
482 void (*set_gdt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt);
483 unsigned long (*get_dr)(struct kvm_vcpu *vcpu, int dr);
484 void (*set_dr)(struct kvm_vcpu *vcpu, int dr, unsigned long value,
485 int *exception);
486 void (*cache_regs)(struct kvm_vcpu *vcpu);
487 void (*decache_regs)(struct kvm_vcpu *vcpu);
488 unsigned long (*get_rflags)(struct kvm_vcpu *vcpu);
489 void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags);
490
491 void (*tlb_flush)(struct kvm_vcpu *vcpu);
492 void (*inject_page_fault)(struct kvm_vcpu *vcpu,
493 unsigned long addr, u32 err_code);
494
495 void (*inject_gp)(struct kvm_vcpu *vcpu, unsigned err_code);
496
497 void (*run)(struct kvm_vcpu *vcpu, struct kvm_run *run);
498 int (*handle_exit)(struct kvm_run *run, struct kvm_vcpu *vcpu);
499 void (*skip_emulated_instruction)(struct kvm_vcpu *vcpu);
500 void (*patch_hypercall)(struct kvm_vcpu *vcpu,
501 unsigned char *hypercall_addr);
502 int (*get_irq)(struct kvm_vcpu *vcpu);
503 void (*set_irq)(struct kvm_vcpu *vcpu, int vec);
504 void (*inject_pending_irq)(struct kvm_vcpu *vcpu);
505 void (*inject_pending_vectors)(struct kvm_vcpu *vcpu,
506 struct kvm_run *run);
507};
508
509extern struct kvm_x86_ops *kvm_x86_ops;
510
511/* The guest did something we don't support. */
512#define pr_unimpl(vcpu, fmt, ...) \
513 do { \
514 if (printk_ratelimit()) \
515 printk(KERN_ERR "kvm: %i: cpu%i " fmt, \
516 current->tgid, (vcpu)->vcpu_id , ## __VA_ARGS__); \
517 } while(0)
518
519#define kvm_printf(kvm, fmt ...) printk(KERN_DEBUG fmt)
520#define vcpu_printf(vcpu, fmt...) kvm_printf(vcpu->kvm, fmt)
521
522int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id);
523void kvm_vcpu_uninit(struct kvm_vcpu *vcpu);
524
525int kvm_init_x86(struct kvm_x86_ops *ops, unsigned int vcpu_size,
526 struct module *module);
527void kvm_exit_x86(void);
528
529int kvm_mmu_module_init(void);
530void kvm_mmu_module_exit(void);
531
532void kvm_mmu_destroy(struct kvm_vcpu *vcpu);
533int kvm_mmu_create(struct kvm_vcpu *vcpu);
534int kvm_mmu_setup(struct kvm_vcpu *vcpu);
535
536int kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
537void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot);
538void kvm_mmu_zap_all(struct kvm *kvm);
539
540hpa_t gpa_to_hpa(struct kvm_vcpu *vcpu, gpa_t gpa);
541#define HPA_MSB ((sizeof(hpa_t) * 8) - 1)
542#define HPA_ERR_MASK ((hpa_t)1 << HPA_MSB)
543static inline int is_error_hpa(hpa_t hpa) { return hpa >> HPA_MSB; }
544hpa_t gva_to_hpa(struct kvm_vcpu *vcpu, gva_t gva);
545struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva);
546
547extern hpa_t bad_page_address;
548
549struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn);
550struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn);
551void mark_page_dirty(struct kvm *kvm, gfn_t gfn);
552
553enum emulation_result {
554 EMULATE_DONE, /* no further processing */
555 EMULATE_DO_MMIO, /* kvm_run filled with mmio request */
556 EMULATE_FAIL, /* can't emulate this instruction */
557};
558
559int emulate_instruction(struct kvm_vcpu *vcpu, struct kvm_run *run,
560 unsigned long cr2, u16 error_code);
561void kvm_report_emulation_failure(struct kvm_vcpu *cvpu, const char *context);
562void realmode_lgdt(struct kvm_vcpu *vcpu, u16 size, unsigned long address);
563void realmode_lidt(struct kvm_vcpu *vcpu, u16 size, unsigned long address);
564void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
565 unsigned long *rflags);
566
567unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr);
568void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long value,
569 unsigned long *rflags);
570int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *data);
571int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
572
573struct x86_emulate_ctxt;
574
575int kvm_emulate_pio (struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
576 int size, unsigned port);
577int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
578 int size, unsigned long count, int down,
579 gva_t address, int rep, unsigned port);
580void kvm_emulate_cpuid(struct kvm_vcpu *vcpu);
581int kvm_emulate_halt(struct kvm_vcpu *vcpu);
582int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address);
583int emulate_clts(struct kvm_vcpu *vcpu);
584int emulator_get_dr(struct x86_emulate_ctxt* ctxt, int dr,
585 unsigned long *dest);
586int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr,
587 unsigned long value);
588
589void set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
590void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr0);
591void set_cr4(struct kvm_vcpu *vcpu, unsigned long cr0);
592void set_cr8(struct kvm_vcpu *vcpu, unsigned long cr0);
593unsigned long get_cr8(struct kvm_vcpu *vcpu);
594void lmsw(struct kvm_vcpu *vcpu, unsigned long msw);
595void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l);
596
597int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata);
598int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data);
599
600void fx_init(struct kvm_vcpu *vcpu);
601
602void kvm_resched(struct kvm_vcpu *vcpu);
603void kvm_load_guest_fpu(struct kvm_vcpu *vcpu);
604void kvm_put_guest_fpu(struct kvm_vcpu *vcpu);
605void kvm_flush_remote_tlbs(struct kvm *kvm);
606
607int emulator_read_std(unsigned long addr,
608 void *val,
609 unsigned int bytes,
610 struct kvm_vcpu *vcpu);
611int emulator_write_emulated(unsigned long addr,
612 const void *val,
613 unsigned int bytes,
614 struct kvm_vcpu *vcpu);
615
616unsigned long segment_base(u16 selector);
617
618void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
619 const u8 *new, int bytes);
620int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva);
621void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu);
622int kvm_mmu_load(struct kvm_vcpu *vcpu);
623void kvm_mmu_unload(struct kvm_vcpu *vcpu);
624
625int kvm_hypercall(struct kvm_vcpu *vcpu, struct kvm_run *run);
626
627static inline void kvm_guest_enter(void)
628{
629 current->flags |= PF_VCPU;
630}
631
632static inline void kvm_guest_exit(void)
633{
634 current->flags &= ~PF_VCPU;
635}
636
637static inline int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
638 u32 error_code)
639{
640 return vcpu->mmu.page_fault(vcpu, gva, error_code);
641}
642
643static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
644{
645 if (unlikely(vcpu->kvm->n_free_mmu_pages < KVM_MIN_FREE_MMU_PAGES))
646 __kvm_mmu_free_some_pages(vcpu);
647}
648
649static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu)
650{
651 if (likely(vcpu->mmu.root_hpa != INVALID_PAGE))
652 return 0;
653
654 return kvm_mmu_load(vcpu);
655}
656
657static inline int is_long_mode(struct kvm_vcpu *vcpu)
658{
659#ifdef CONFIG_X86_64
660 return vcpu->shadow_efer & EFER_LME;
661#else
662 return 0;
663#endif
664}
665
666static inline int is_pae(struct kvm_vcpu *vcpu)
667{
668 return vcpu->cr4 & X86_CR4_PAE;
669}
670
671static inline int is_pse(struct kvm_vcpu *vcpu)
672{
673 return vcpu->cr4 & X86_CR4_PSE;
674}
675
676static inline int is_paging(struct kvm_vcpu *vcpu)
677{
678 return vcpu->cr0 & X86_CR0_PG;
679}
680
681static inline int memslot_id(struct kvm *kvm, struct kvm_memory_slot *slot)
682{
683 return slot - kvm->memslots;
684}
685
686static inline struct kvm_mmu_page *page_header(hpa_t shadow_page)
687{
688 struct page *page = pfn_to_page(shadow_page >> PAGE_SHIFT);
689
690 return (struct kvm_mmu_page *)page_private(page);
691}
692
693static inline u16 read_fs(void)
694{
695 u16 seg;
696 asm ("mov %%fs, %0" : "=g"(seg));
697 return seg;
698}
699
700static inline u16 read_gs(void)
701{
702 u16 seg;
703 asm ("mov %%gs, %0" : "=g"(seg));
704 return seg;
705}
706
707static inline u16 read_ldt(void)
708{
709 u16 ldt;
710 asm ("sldt %0" : "=g"(ldt));
711 return ldt;
712}
713
714static inline void load_fs(u16 sel)
715{
716 asm ("mov %0, %%fs" : : "rm"(sel));
717}
718
719static inline void load_gs(u16 sel)
720{
721 asm ("mov %0, %%gs" : : "rm"(sel));
722}
723
724#ifndef load_ldt
725static inline void load_ldt(u16 sel)
726{
727 asm ("lldt %0" : : "rm"(sel));
728}
729#endif
730
731static inline void get_idt(struct descriptor_table *table)
732{
733 asm ("sidt %0" : "=m"(*table));
734}
735
736static inline void get_gdt(struct descriptor_table *table)
737{
738 asm ("sgdt %0" : "=m"(*table));
739}
740
741static inline unsigned long read_tr_base(void)
742{
743 u16 tr;
744 asm ("str %0" : "=g"(tr));
745 return segment_base(tr);
746}
747
748#ifdef CONFIG_X86_64
749static inline unsigned long read_msr(unsigned long msr)
750{
751 u64 value;
752
753 rdmsrl(msr, value);
754 return value;
755}
756#endif
757
758static inline void fx_save(struct i387_fxsave_struct *image)
759{
760 asm ("fxsave (%0)":: "r" (image));
761}
762
763static inline void fx_restore(struct i387_fxsave_struct *image)
764{
765 asm ("fxrstor (%0)":: "r" (image));
766}
767
768static inline void fpu_init(void)
769{
770 asm ("finit");
771}
772
773static inline u32 get_rdx_init_val(void)
774{
775 return 0x600; /* P6 family */
776}
777
778#define ASM_VMX_VMCLEAR_RAX ".byte 0x66, 0x0f, 0xc7, 0x30"
779#define ASM_VMX_VMLAUNCH ".byte 0x0f, 0x01, 0xc2"
780#define ASM_VMX_VMRESUME ".byte 0x0f, 0x01, 0xc3"
781#define ASM_VMX_VMPTRLD_RAX ".byte 0x0f, 0xc7, 0x30"
782#define ASM_VMX_VMREAD_RDX_RAX ".byte 0x0f, 0x78, 0xd0"
783#define ASM_VMX_VMWRITE_RAX_RDX ".byte 0x0f, 0x79, 0xd0"
784#define ASM_VMX_VMWRITE_RSP_RDX ".byte 0x0f, 0x79, 0xd4"
785#define ASM_VMX_VMXOFF ".byte 0x0f, 0x01, 0xc4"
786#define ASM_VMX_VMXON_RAX ".byte 0xf3, 0x0f, 0xc7, 0x30"
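/*
 * Editor's note (assumption, not part of the original patch): the byte
 * strings above are hand-encoded VMX instructions -- 0x0f 0x01 0xc2 is
 * VMLAUNCH, 0xf3 0x0f 0xc7 /6 with a memory operand in RAX is VMXON,
 * and so on -- kept as raw .byte sequences because assemblers of the
 * time did not know the VMX mnemonics; vmx.c splices these macros into
 * its inline asm.
 */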
787
788#define MSR_IA32_TIME_STAMP_COUNTER 0x010
789
790#define TSS_IOPB_BASE_OFFSET 0x66
791#define TSS_BASE_SIZE 0x68
792#define TSS_IOPB_SIZE (65536 / 8)
793#define TSS_REDIRECTION_SIZE (256 / 8)
794#define RMODE_TSS_SIZE (TSS_BASE_SIZE + TSS_REDIRECTION_SIZE + TSS_IOPB_SIZE + 1)
795
796#endif
diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c
deleted file mode 100644
index c0f372f1d761..000000000000
--- a/drivers/kvm/kvm_main.c
+++ /dev/null
@@ -1,3628 +0,0 @@
1/*
2 * Kernel-based Virtual Machine driver for Linux
3 *
4 * This module enables machines with Intel VT-x extensions to run virtual
5 * machines without emulation or binary translation.
6 *
7 * Copyright (C) 2006 Qumranet, Inc.
8 *
9 * Authors:
10 * Avi Kivity <avi@qumranet.com>
11 * Yaniv Kamay <yaniv@qumranet.com>
12 *
13 * This work is licensed under the terms of the GNU GPL, version 2. See
14 * the COPYING file in the top-level directory.
15 *
16 */
17
18#include "kvm.h"
19#include "x86_emulate.h"
20#include "segment_descriptor.h"
21#include "irq.h"
22
23#include <linux/kvm.h>
24#include <linux/module.h>
25#include <linux/errno.h>
26#include <linux/percpu.h>
27#include <linux/gfp.h>
28#include <linux/mm.h>
29#include <linux/miscdevice.h>
30#include <linux/vmalloc.h>
31#include <linux/reboot.h>
32#include <linux/debugfs.h>
33#include <linux/highmem.h>
34#include <linux/file.h>
35#include <linux/sysdev.h>
36#include <linux/cpu.h>
37#include <linux/sched.h>
38#include <linux/cpumask.h>
39#include <linux/smp.h>
40#include <linux/anon_inodes.h>
41#include <linux/profile.h>
42
43#include <asm/processor.h>
44#include <asm/msr.h>
45#include <asm/io.h>
46#include <asm/uaccess.h>
47#include <asm/desc.h>
48
49MODULE_AUTHOR("Qumranet");
50MODULE_LICENSE("GPL");
51
52static DEFINE_SPINLOCK(kvm_lock);
53static LIST_HEAD(vm_list);
54
55static cpumask_t cpus_hardware_enabled;
56
57struct kvm_x86_ops *kvm_x86_ops;
58struct kmem_cache *kvm_vcpu_cache;
59EXPORT_SYMBOL_GPL(kvm_vcpu_cache);
60
61static __read_mostly struct preempt_ops kvm_preempt_ops;
62
63#define STAT_OFFSET(x) offsetof(struct kvm_vcpu, stat.x)
64
65static struct kvm_stats_debugfs_item {
66 const char *name;
67 int offset;
68 struct dentry *dentry;
69} debugfs_entries[] = {
70 { "pf_fixed", STAT_OFFSET(pf_fixed) },
71 { "pf_guest", STAT_OFFSET(pf_guest) },
72 { "tlb_flush", STAT_OFFSET(tlb_flush) },
73 { "invlpg", STAT_OFFSET(invlpg) },
74 { "exits", STAT_OFFSET(exits) },
75 { "io_exits", STAT_OFFSET(io_exits) },
76 { "mmio_exits", STAT_OFFSET(mmio_exits) },
77 { "signal_exits", STAT_OFFSET(signal_exits) },
78 { "irq_window", STAT_OFFSET(irq_window_exits) },
79 { "halt_exits", STAT_OFFSET(halt_exits) },
80 { "halt_wakeup", STAT_OFFSET(halt_wakeup) },
81 { "request_irq", STAT_OFFSET(request_irq_exits) },
82 { "irq_exits", STAT_OFFSET(irq_exits) },
83 { "light_exits", STAT_OFFSET(light_exits) },
84 { "efer_reload", STAT_OFFSET(efer_reload) },
85 { NULL }
86};
87
88static struct dentry *debugfs_dir;
89
90#define MAX_IO_MSRS 256
91
92#define CR0_RESERVED_BITS \
93 (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
94 | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \
95 | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG))
96#define CR4_RESERVED_BITS \
97 (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\
98 | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \
99 | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR \
100 | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE))
101
102#define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
103#define EFER_RESERVED_BITS 0xfffffffffffff2fe
104
105#ifdef CONFIG_X86_64
106/* LDT or TSS descriptor in the GDT. 16 bytes. */
107struct segment_descriptor_64 {
108 struct segment_descriptor s;
109 u32 base_higher;
110 u32 pad_zero;
111};
112
113#endif
114
115static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
116 unsigned long arg);
117
118unsigned long segment_base(u16 selector)
119{
120 struct descriptor_table gdt;
121 struct segment_descriptor *d;
122 unsigned long table_base;
123 typedef unsigned long ul;
124 unsigned long v;
125
126 if (selector == 0)
127 return 0;
128
129 asm ("sgdt %0" : "=m"(gdt));
130 table_base = gdt.base;
131
132 if (selector & 4) { /* from ldt */
133 u16 ldt_selector;
134
135 asm ("sldt %0" : "=g"(ldt_selector));
136 table_base = segment_base(ldt_selector);
137 }
138 d = (struct segment_descriptor *)(table_base + (selector & ~7));
139 v = d->base_low | ((ul)d->base_mid << 16) | ((ul)d->base_high << 24);
140#ifdef CONFIG_X86_64
141 if (d->system == 0
142 && (d->type == 2 || d->type == 9 || d->type == 11))
143 v |= ((ul)((struct segment_descriptor_64 *)d)->base_higher) << 32;
144#endif
145 return v;
146}
147EXPORT_SYMBOL_GPL(segment_base);
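/*
 * Worked example (editor's illustration, not in the original patch):
 * for selector 0x2b the low bits decode as RPL = 3 and TI = 0, so the
 * lookup stays in the GDT and "selector & ~7" = 0x28 is the byte
 * offset of descriptor index 5; had bit 2 (TI) been set, table_base
 * would first have been recomputed from the LDT selector.
 */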
148
149static inline int valid_vcpu(int n)
150{
151 return likely(n >= 0 && n < KVM_MAX_VCPUS);
152}
153
154void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
155{
156 if (!vcpu->fpu_active || vcpu->guest_fpu_loaded)
157 return;
158
159 vcpu->guest_fpu_loaded = 1;
160 fx_save(&vcpu->host_fx_image);
161 fx_restore(&vcpu->guest_fx_image);
162}
163EXPORT_SYMBOL_GPL(kvm_load_guest_fpu);
164
165void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
166{
167 if (!vcpu->guest_fpu_loaded)
168 return;
169
170 vcpu->guest_fpu_loaded = 0;
171 fx_save(&vcpu->guest_fx_image);
172 fx_restore(&vcpu->host_fx_image);
173}
174EXPORT_SYMBOL_GPL(kvm_put_guest_fpu);
175
176/*
177 * Switches to the specified vcpu, until a matching vcpu_put().
178 */
179static void vcpu_load(struct kvm_vcpu *vcpu)
180{
181 int cpu;
182
183 mutex_lock(&vcpu->mutex);
184 cpu = get_cpu();
185 preempt_notifier_register(&vcpu->preempt_notifier);
186 kvm_x86_ops->vcpu_load(vcpu, cpu);
187 put_cpu();
188}
189
190static void vcpu_put(struct kvm_vcpu *vcpu)
191{
192 preempt_disable();
193 kvm_x86_ops->vcpu_put(vcpu);
194 preempt_notifier_unregister(&vcpu->preempt_notifier);
195 preempt_enable();
196 mutex_unlock(&vcpu->mutex);
197}
198
199static void ack_flush(void *_completed)
200{
201}
202
203void kvm_flush_remote_tlbs(struct kvm *kvm)
204{
205 int i, cpu;
206 cpumask_t cpus;
207 struct kvm_vcpu *vcpu;
208
209 cpus_clear(cpus);
210 for (i = 0; i < KVM_MAX_VCPUS; ++i) {
211 vcpu = kvm->vcpus[i];
212 if (!vcpu)
213 continue;
214 if (test_and_set_bit(KVM_TLB_FLUSH, &vcpu->requests))
215 continue;
216 cpu = vcpu->cpu;
217 if (cpu != -1 && cpu != raw_smp_processor_id())
218 cpu_set(cpu, cpus);
219 }
220 smp_call_function_mask(cpus, ack_flush, NULL, 1);
221}
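/*
 * Editor's note (illustrative, not part of the original patch): this is
 * the deferred remote-flush pattern.  KVM_TLB_FLUSH is latched in each
 * vcpu's request word and consumed by the test_and_clear_bit() on the
 * guest-entry path, while the ack_flush() IPI -- an empty handler sent
 * with wait=1 -- merely kicks currently-running vcpus out of guest mode
 * so the request takes effect promptly.
 */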
222
223int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
224{
225 struct page *page;
226 int r;
227
228 mutex_init(&vcpu->mutex);
229 vcpu->cpu = -1;
230 vcpu->mmu.root_hpa = INVALID_PAGE;
231 vcpu->kvm = kvm;
232 vcpu->vcpu_id = id;
233 if (!irqchip_in_kernel(kvm) || id == 0)
234 vcpu->mp_state = VCPU_MP_STATE_RUNNABLE;
235 else
236 vcpu->mp_state = VCPU_MP_STATE_UNINITIALIZED;
237 init_waitqueue_head(&vcpu->wq);
238
239 page = alloc_page(GFP_KERNEL | __GFP_ZERO);
240 if (!page) {
241 r = -ENOMEM;
242 goto fail;
243 }
244 vcpu->run = page_address(page);
245
246 page = alloc_page(GFP_KERNEL | __GFP_ZERO);
247 if (!page) {
248 r = -ENOMEM;
249 goto fail_free_run;
250 }
251 vcpu->pio_data = page_address(page);
252
253 r = kvm_mmu_create(vcpu);
254 if (r < 0)
255 goto fail_free_pio_data;
256
257 return 0;
258
259fail_free_pio_data:
260 free_page((unsigned long)vcpu->pio_data);
261fail_free_run:
262 free_page((unsigned long)vcpu->run);
263fail:
264 return -ENOMEM;
265}
266EXPORT_SYMBOL_GPL(kvm_vcpu_init);
267
268void kvm_vcpu_uninit(struct kvm_vcpu *vcpu)
269{
270 kvm_mmu_destroy(vcpu);
271 if (vcpu->apic)
272 hrtimer_cancel(&vcpu->apic->timer.dev);
273 kvm_free_apic(vcpu->apic);
274 free_page((unsigned long)vcpu->pio_data);
275 free_page((unsigned long)vcpu->run);
276}
277EXPORT_SYMBOL_GPL(kvm_vcpu_uninit);
278
279static struct kvm *kvm_create_vm(void)
280{
281 struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
282
283 if (!kvm)
284 return ERR_PTR(-ENOMEM);
285
286 kvm_io_bus_init(&kvm->pio_bus);
287 mutex_init(&kvm->lock);
288 INIT_LIST_HEAD(&kvm->active_mmu_pages);
289 kvm_io_bus_init(&kvm->mmio_bus);
290 spin_lock(&kvm_lock);
291 list_add(&kvm->vm_list, &vm_list);
292 spin_unlock(&kvm_lock);
293 return kvm;
294}
295
296/*
297 * Free any memory in @free but not in @dont.
298 */
299static void kvm_free_physmem_slot(struct kvm_memory_slot *free,
300 struct kvm_memory_slot *dont)
301{
302 int i;
303
304 if (!dont || free->phys_mem != dont->phys_mem)
305 if (free->phys_mem) {
306 for (i = 0; i < free->npages; ++i)
307 if (free->phys_mem[i])
308 __free_page(free->phys_mem[i]);
309 vfree(free->phys_mem);
310 }
311
312 if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
313 vfree(free->dirty_bitmap);
314
315 free->phys_mem = NULL;
316 free->npages = 0;
317 free->dirty_bitmap = NULL;
318}
319
320static void kvm_free_physmem(struct kvm *kvm)
321{
322 int i;
323
324 for (i = 0; i < kvm->nmemslots; ++i)
325 kvm_free_physmem_slot(&kvm->memslots[i], NULL);
326}
327
328static void free_pio_guest_pages(struct kvm_vcpu *vcpu)
329{
330 int i;
331
332 for (i = 0; i < ARRAY_SIZE(vcpu->pio.guest_pages); ++i)
333 if (vcpu->pio.guest_pages[i]) {
334 __free_page(vcpu->pio.guest_pages[i]);
335 vcpu->pio.guest_pages[i] = NULL;
336 }
337}
338
339static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
340{
341 vcpu_load(vcpu);
342 kvm_mmu_unload(vcpu);
343 vcpu_put(vcpu);
344}
345
346static void kvm_free_vcpus(struct kvm *kvm)
347{
348 unsigned int i;
349
350 /*
351 * Unpin any mmu pages first.
352 */
353 for (i = 0; i < KVM_MAX_VCPUS; ++i)
354 if (kvm->vcpus[i])
355 kvm_unload_vcpu_mmu(kvm->vcpus[i]);
356 for (i = 0; i < KVM_MAX_VCPUS; ++i) {
357 if (kvm->vcpus[i]) {
358 kvm_x86_ops->vcpu_free(kvm->vcpus[i]);
359 kvm->vcpus[i] = NULL;
360 }
361 }
362
363}
364
365static void kvm_destroy_vm(struct kvm *kvm)
366{
367 spin_lock(&kvm_lock);
368 list_del(&kvm->vm_list);
369 spin_unlock(&kvm_lock);
370 kvm_io_bus_destroy(&kvm->pio_bus);
371 kvm_io_bus_destroy(&kvm->mmio_bus);
372 kfree(kvm->vpic);
373 kfree(kvm->vioapic);
374 kvm_free_vcpus(kvm);
375 kvm_free_physmem(kvm);
376 kfree(kvm);
377}
378
379static int kvm_vm_release(struct inode *inode, struct file *filp)
380{
381 struct kvm *kvm = filp->private_data;
382
383 kvm_destroy_vm(kvm);
384 return 0;
385}
386
387static void inject_gp(struct kvm_vcpu *vcpu)
388{
389 kvm_x86_ops->inject_gp(vcpu, 0);
390}
391
392/*
393 * Load the pae pdptrs. Return true if they are all valid.
394 */
395static int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
396{
397 gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
398 unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2;
399 int i;
400 u64 *pdpt;
401 int ret;
402 struct page *page;
403 u64 pdpte[ARRAY_SIZE(vcpu->pdptrs)];
404
405 mutex_lock(&vcpu->kvm->lock);
406 page = gfn_to_page(vcpu->kvm, pdpt_gfn);
407 if (!page) {
408 ret = 0;
409 goto out;
410 }
411
412 pdpt = kmap_atomic(page, KM_USER0);
413 memcpy(pdpte, pdpt+offset, sizeof(pdpte));
414 kunmap_atomic(pdpt, KM_USER0);
415
416 for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
417 if ((pdpte[i] & 1) && (pdpte[i] & 0xfffffff0000001e6ull)) {
418 ret = 0;
419 goto out;
420 }
421 }
422 ret = 1;
423
424 memcpy(vcpu->pdptrs, pdpte, sizeof(vcpu->pdptrs));
425out:
426 mutex_unlock(&vcpu->kvm->lock);
427
428 return ret;
429}
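/*
 * Worked example (editor's illustration, not in the original patch): in
 * PAE mode cr3 selects a 32-byte-aligned block of four 8-byte PDPTEs
 * within a page, and the offset expression above restates that
 * alignment as a u64 index: cr3 = 0x12345ea0 has page offset 0xea0,
 * >> 5 = 0x75, << 2 = 0x1d4, and 0x1d4 u64 entries * 8 bytes = 0xea0,
 * i.e. the memcpy() starts exactly at the block cr3 points into.
 */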
430
431void set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
432{
433 if (cr0 & CR0_RESERVED_BITS) {
434 printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n",
435 cr0, vcpu->cr0);
436 inject_gp(vcpu);
437 return;
438 }
439
440 if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) {
441 printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n");
442 inject_gp(vcpu);
443 return;
444 }
445
446 if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) {
447 printk(KERN_DEBUG "set_cr0: #GP, set PG flag "
448 "and a clear PE flag\n");
449 inject_gp(vcpu);
450 return;
451 }
452
453 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
454#ifdef CONFIG_X86_64
455 if ((vcpu->shadow_efer & EFER_LME)) {
456 int cs_db, cs_l;
457
458 if (!is_pae(vcpu)) {
459 printk(KERN_DEBUG "set_cr0: #GP, start paging "
460 "in long mode while PAE is disabled\n");
461 inject_gp(vcpu);
462 return;
463 }
464 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
465 if (cs_l) {
466 printk(KERN_DEBUG "set_cr0: #GP, start paging "
467 "in long mode while CS.L == 1\n");
468 inject_gp(vcpu);
469 return;
470
471 }
472 } else
473#endif
474 if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->cr3)) {
475 printk(KERN_DEBUG "set_cr0: #GP, pdptrs "
476 "reserved bits\n");
477 inject_gp(vcpu);
478 return;
479 }
480
481 }
482
483 kvm_x86_ops->set_cr0(vcpu, cr0);
484 vcpu->cr0 = cr0;
485
486 mutex_lock(&vcpu->kvm->lock);
487 kvm_mmu_reset_context(vcpu);
488 mutex_unlock(&vcpu->kvm->lock);
489 return;
490}
491EXPORT_SYMBOL_GPL(set_cr0);
492
493void lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
494{
495 set_cr0(vcpu, (vcpu->cr0 & ~0x0ful) | (msw & 0x0f));
496}
497EXPORT_SYMBOL_GPL(lmsw);
498
499void set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
500{
501 if (cr4 & CR4_RESERVED_BITS) {
502 printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n");
503 inject_gp(vcpu);
504 return;
505 }
506
507 if (is_long_mode(vcpu)) {
508 if (!(cr4 & X86_CR4_PAE)) {
509 printk(KERN_DEBUG "set_cr4: #GP, clearing PAE while "
510 "in long mode\n");
511 inject_gp(vcpu);
512 return;
513 }
514 } else if (is_paging(vcpu) && !is_pae(vcpu) && (cr4 & X86_CR4_PAE)
515 && !load_pdptrs(vcpu, vcpu->cr3)) {
516 printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n");
517 inject_gp(vcpu);
518 return;
519 }
520
521 if (cr4 & X86_CR4_VMXE) {
522 printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n");
523 inject_gp(vcpu);
524 return;
525 }
526 kvm_x86_ops->set_cr4(vcpu, cr4);
527 vcpu->cr4 = cr4;
528 mutex_lock(&vcpu->kvm->lock);
529 kvm_mmu_reset_context(vcpu);
530 mutex_unlock(&vcpu->kvm->lock);
531}
532EXPORT_SYMBOL_GPL(set_cr4);
533
534void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
535{
536 if (is_long_mode(vcpu)) {
537 if (cr3 & CR3_L_MODE_RESERVED_BITS) {
538 printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n");
539 inject_gp(vcpu);
540 return;
541 }
542 } else {
543 if (is_pae(vcpu)) {
544 if (cr3 & CR3_PAE_RESERVED_BITS) {
545 printk(KERN_DEBUG
546 "set_cr3: #GP, reserved bits\n");
547 inject_gp(vcpu);
548 return;
549 }
550 if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) {
551 printk(KERN_DEBUG "set_cr3: #GP, pdptrs "
552 "reserved bits\n");
553 inject_gp(vcpu);
554 return;
555 }
556 } else {
557 if (cr3 & CR3_NONPAE_RESERVED_BITS) {
558 printk(KERN_DEBUG
559 "set_cr3: #GP, reserved bits\n");
560 inject_gp(vcpu);
561 return;
562 }
563 }
564 }
565
566 mutex_lock(&vcpu->kvm->lock);
567 /*
568 * Does the new cr3 value map to physical memory? (Note, we
569 * catch an invalid cr3 even in real-mode, because it would
570 * cause trouble later on when we turn on paging anyway.)
571 *
572 * A real CPU would silently accept an invalid cr3 and would
573 * attempt to use it - with largely undefined (and often hard
574 * to debug) behavior on the guest side.
575 */
576 if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT)))
577 inject_gp(vcpu);
578 else {
579 vcpu->cr3 = cr3;
580 vcpu->mmu.new_cr3(vcpu);
581 }
582 mutex_unlock(&vcpu->kvm->lock);
583}
584EXPORT_SYMBOL_GPL(set_cr3);
585
586void set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
587{
588 if (cr8 & CR8_RESERVED_BITS) {
589 printk(KERN_DEBUG "set_cr8: #GP, reserved bits 0x%lx\n", cr8);
590 inject_gp(vcpu);
591 return;
592 }
593 if (irqchip_in_kernel(vcpu->kvm))
594 kvm_lapic_set_tpr(vcpu, cr8);
595 else
596 vcpu->cr8 = cr8;
597}
598EXPORT_SYMBOL_GPL(set_cr8);
599
600unsigned long get_cr8(struct kvm_vcpu *vcpu)
601{
602 if (irqchip_in_kernel(vcpu->kvm))
603 return kvm_lapic_get_cr8(vcpu);
604 else
605 return vcpu->cr8;
606}
607EXPORT_SYMBOL_GPL(get_cr8);
608
609u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
610{
611 if (irqchip_in_kernel(vcpu->kvm))
612 return vcpu->apic_base;
613 else
614 return vcpu->apic_base;
615}
616EXPORT_SYMBOL_GPL(kvm_get_apic_base);
617
618void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data)
619{
620	/* TODO: reserved-bits check */
621 if (irqchip_in_kernel(vcpu->kvm))
622 kvm_lapic_set_base(vcpu, data);
623 else
624 vcpu->apic_base = data;
625}
626EXPORT_SYMBOL_GPL(kvm_set_apic_base);
627
628void fx_init(struct kvm_vcpu *vcpu)
629{
630 unsigned after_mxcsr_mask;
631
632 /* Initialize guest FPU by resetting ours and saving into guest's */
633 preempt_disable();
634 fx_save(&vcpu->host_fx_image);
635 fpu_init();
636 fx_save(&vcpu->guest_fx_image);
637 fx_restore(&vcpu->host_fx_image);
638 preempt_enable();
639
640 vcpu->cr0 |= X86_CR0_ET;
641 after_mxcsr_mask = offsetof(struct i387_fxsave_struct, st_space);
642 vcpu->guest_fx_image.mxcsr = 0x1f80;
643 memset((void *)&vcpu->guest_fx_image + after_mxcsr_mask,
644 0, sizeof(struct i387_fxsave_struct) - after_mxcsr_mask);
645}
646EXPORT_SYMBOL_GPL(fx_init);
647
648/*
649 * Allocate some memory and give it an address in the guest physical address
650 * space.
651 *
652 * Discontiguous memory is allowed, mostly for framebuffers.
653 */
654static int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
655 struct kvm_memory_region *mem)
656{
657 int r;
658 gfn_t base_gfn;
659 unsigned long npages;
660 unsigned long i;
661 struct kvm_memory_slot *memslot;
662 struct kvm_memory_slot old, new;
663
664 r = -EINVAL;
665 /* General sanity checks */
666 if (mem->memory_size & (PAGE_SIZE - 1))
667 goto out;
668 if (mem->guest_phys_addr & (PAGE_SIZE - 1))
669 goto out;
670 if (mem->slot >= KVM_MEMORY_SLOTS)
671 goto out;
672 if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
673 goto out;
674
675 memslot = &kvm->memslots[mem->slot];
676 base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;
677 npages = mem->memory_size >> PAGE_SHIFT;
678
679 if (!npages)
680 mem->flags &= ~KVM_MEM_LOG_DIRTY_PAGES;
681
682 mutex_lock(&kvm->lock);
683
684 new = old = *memslot;
685
686 new.base_gfn = base_gfn;
687 new.npages = npages;
688 new.flags = mem->flags;
689
690 /* Disallow changing a memory slot's size. */
691 r = -EINVAL;
692 if (npages && old.npages && npages != old.npages)
693 goto out_unlock;
694
695 /* Check for overlaps */
696 r = -EEXIST;
697 for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
698 struct kvm_memory_slot *s = &kvm->memslots[i];
699
700 if (s == memslot)
701 continue;
702 if (!((base_gfn + npages <= s->base_gfn) ||
703 (base_gfn >= s->base_gfn + s->npages)))
704 goto out_unlock;
705 }
706
707 /* Deallocate if slot is being removed */
708 if (!npages)
709 new.phys_mem = NULL;
710
711 /* Free page dirty bitmap if unneeded */
712 if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES))
713 new.dirty_bitmap = NULL;
714
715 r = -ENOMEM;
716
717 /* Allocate if a slot is being created */
718 if (npages && !new.phys_mem) {
719 new.phys_mem = vmalloc(npages * sizeof(struct page *));
720
721 if (!new.phys_mem)
722 goto out_unlock;
723
724 memset(new.phys_mem, 0, npages * sizeof(struct page *));
725 for (i = 0; i < npages; ++i) {
726 new.phys_mem[i] = alloc_page(GFP_HIGHUSER
727 | __GFP_ZERO);
728 if (!new.phys_mem[i])
729 goto out_unlock;
730 set_page_private(new.phys_mem[i],0);
731 }
732 }
733
734 /* Allocate page dirty bitmap if needed */
735 if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) {
736 unsigned dirty_bytes = ALIGN(npages, BITS_PER_LONG) / 8;
737
738 new.dirty_bitmap = vmalloc(dirty_bytes);
739 if (!new.dirty_bitmap)
740 goto out_unlock;
741 memset(new.dirty_bitmap, 0, dirty_bytes);
742 }
743
744 if (mem->slot >= kvm->nmemslots)
745 kvm->nmemslots = mem->slot + 1;
746
747 *memslot = new;
748
749 kvm_mmu_slot_remove_write_access(kvm, mem->slot);
750 kvm_flush_remote_tlbs(kvm);
751
752 mutex_unlock(&kvm->lock);
753
754 kvm_free_physmem_slot(&old, &new);
755 return 0;
756
757out_unlock:
758 mutex_unlock(&kvm->lock);
759 kvm_free_physmem_slot(&new, &old);
760out:
761 return r;
762}
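/*
 * Editor's note (illustrative, not part of the original patch): the
 * overlap check above is the usual interval test -- two slots are
 * disjoint iff one ends at or before the other begins -- and the dirty
 * bitmap is sized in whole longs, e.g. npages = 100 with
 * BITS_PER_LONG = 64 gives ALIGN(100, 64) / 8 = 16 bytes.
 */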
763
764/*
765 * Get (and clear) the dirty memory log for a memory slot.
766 */
767static int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
768 struct kvm_dirty_log *log)
769{
770 struct kvm_memory_slot *memslot;
771 int r, i;
772 int n;
773 unsigned long any = 0;
774
775 mutex_lock(&kvm->lock);
776
777 r = -EINVAL;
778 if (log->slot >= KVM_MEMORY_SLOTS)
779 goto out;
780
781 memslot = &kvm->memslots[log->slot];
782 r = -ENOENT;
783 if (!memslot->dirty_bitmap)
784 goto out;
785
786 n = ALIGN(memslot->npages, BITS_PER_LONG) / 8;
787
788 for (i = 0; !any && i < n/sizeof(long); ++i)
789 any = memslot->dirty_bitmap[i];
790
791 r = -EFAULT;
792 if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n))
793 goto out;
794
795 /* If nothing is dirty, don't bother messing with page tables. */
796 if (any) {
797 kvm_mmu_slot_remove_write_access(kvm, log->slot);
798 kvm_flush_remote_tlbs(kvm);
799 memset(memslot->dirty_bitmap, 0, n);
800 }
801
802 r = 0;
803
804out:
805 mutex_unlock(&kvm->lock);
806 return r;
807}
808
809/*
810 * Set a new alias region. Aliases map a portion of physical memory into
811 * another portion. This is useful for memory windows, for example the PC
812 * VGA region.
813 */
814static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm,
815 struct kvm_memory_alias *alias)
816{
817 int r, n;
818 struct kvm_mem_alias *p;
819
820 r = -EINVAL;
821 /* General sanity checks */
822 if (alias->memory_size & (PAGE_SIZE - 1))
823 goto out;
824 if (alias->guest_phys_addr & (PAGE_SIZE - 1))
825 goto out;
826 if (alias->slot >= KVM_ALIAS_SLOTS)
827 goto out;
828 if (alias->guest_phys_addr + alias->memory_size
829 < alias->guest_phys_addr)
830 goto out;
831 if (alias->target_phys_addr + alias->memory_size
832 < alias->target_phys_addr)
833 goto out;
834
835 mutex_lock(&kvm->lock);
836
837 p = &kvm->aliases[alias->slot];
838 p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT;
839 p->npages = alias->memory_size >> PAGE_SHIFT;
840 p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT;
841
842 for (n = KVM_ALIAS_SLOTS; n > 0; --n)
843 if (kvm->aliases[n - 1].npages)
844 break;
845 kvm->naliases = n;
846
847 kvm_mmu_zap_all(kvm);
848
849 mutex_unlock(&kvm->lock);
850
851 return 0;
852
853out:
854 return r;
855}
856
857static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
858{
859 int r;
860
861 r = 0;
862 switch (chip->chip_id) {
863 case KVM_IRQCHIP_PIC_MASTER:
864 memcpy (&chip->chip.pic,
865 &pic_irqchip(kvm)->pics[0],
866 sizeof(struct kvm_pic_state));
867 break;
868 case KVM_IRQCHIP_PIC_SLAVE:
869 memcpy (&chip->chip.pic,
870 &pic_irqchip(kvm)->pics[1],
871 sizeof(struct kvm_pic_state));
872 break;
873 case KVM_IRQCHIP_IOAPIC:
874 memcpy (&chip->chip.ioapic,
875 ioapic_irqchip(kvm),
876 sizeof(struct kvm_ioapic_state));
877 break;
878 default:
879 r = -EINVAL;
880 break;
881 }
882 return r;
883}
884
885static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
886{
887 int r;
888
889 r = 0;
890 switch (chip->chip_id) {
891 case KVM_IRQCHIP_PIC_MASTER:
892 memcpy (&pic_irqchip(kvm)->pics[0],
893 &chip->chip.pic,
894 sizeof(struct kvm_pic_state));
895 break;
896 case KVM_IRQCHIP_PIC_SLAVE:
897 memcpy (&pic_irqchip(kvm)->pics[1],
898 &chip->chip.pic,
899 sizeof(struct kvm_pic_state));
900 break;
901 case KVM_IRQCHIP_IOAPIC:
902 memcpy (ioapic_irqchip(kvm),
903 &chip->chip.ioapic,
904 sizeof(struct kvm_ioapic_state));
905 break;
906 default:
907 r = -EINVAL;
908 break;
909 }
910 kvm_pic_update_irq(pic_irqchip(kvm));
911 return r;
912}
913
914static gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
915{
916 int i;
917 struct kvm_mem_alias *alias;
918
919 for (i = 0; i < kvm->naliases; ++i) {
920 alias = &kvm->aliases[i];
921 if (gfn >= alias->base_gfn
922 && gfn < alias->base_gfn + alias->npages)
923 return alias->target_gfn + gfn - alias->base_gfn;
924 }
925 return gfn;
926}
927
928static struct kvm_memory_slot *__gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
929{
930 int i;
931
932 for (i = 0; i < kvm->nmemslots; ++i) {
933 struct kvm_memory_slot *memslot = &kvm->memslots[i];
934
935 if (gfn >= memslot->base_gfn
936 && gfn < memslot->base_gfn + memslot->npages)
937 return memslot;
938 }
939 return NULL;
940}
941
942struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
943{
944 gfn = unalias_gfn(kvm, gfn);
945 return __gfn_to_memslot(kvm, gfn);
946}
947
948struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
949{
950 struct kvm_memory_slot *slot;
951
952 gfn = unalias_gfn(kvm, gfn);
953 slot = __gfn_to_memslot(kvm, gfn);
954 if (!slot)
955 return NULL;
956 return slot->phys_mem[gfn - slot->base_gfn];
957}
958EXPORT_SYMBOL_GPL(gfn_to_page);
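/*
 * Editor's note (illustrative, not part of the original patch): a gfn
 * is resolved in two steps -- unalias_gfn() first rewrites addresses
 * that fall inside an alias window (the VGA-style remapping set up by
 * kvm_vm_ioctl_set_memory_alias()), then __gfn_to_memslot() linearly
 * scans the slot array.  A gfn outside every slot yields NULL, which
 * callers must handle (emulator_read_std(), for example, turns it into
 * X86EMUL_UNHANDLEABLE).
 */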
959
960/* WARNING: Does not work on aliased pages. */
961void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
962{
963 struct kvm_memory_slot *memslot;
964
965 memslot = __gfn_to_memslot(kvm, gfn);
966 if (memslot && memslot->dirty_bitmap) {
967 unsigned long rel_gfn = gfn - memslot->base_gfn;
968
969 /* avoid RMW */
970 if (!test_bit(rel_gfn, memslot->dirty_bitmap))
971 set_bit(rel_gfn, memslot->dirty_bitmap);
972 }
973}
974
975int emulator_read_std(unsigned long addr,
976 void *val,
977 unsigned int bytes,
978 struct kvm_vcpu *vcpu)
979{
980 void *data = val;
981
982 while (bytes) {
983 gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
984 unsigned offset = addr & (PAGE_SIZE-1);
985 unsigned tocopy = min(bytes, (unsigned)PAGE_SIZE - offset);
986 unsigned long pfn;
987 struct page *page;
988 void *page_virt;
989
990 if (gpa == UNMAPPED_GVA)
991 return X86EMUL_PROPAGATE_FAULT;
992 pfn = gpa >> PAGE_SHIFT;
993 page = gfn_to_page(vcpu->kvm, pfn);
994 if (!page)
995 return X86EMUL_UNHANDLEABLE;
996 page_virt = kmap_atomic(page, KM_USER0);
997
998 memcpy(data, page_virt + offset, tocopy);
999
1000 kunmap_atomic(page_virt, KM_USER0);
1001
1002 bytes -= tocopy;
1003 data += tocopy;
1004 addr += tocopy;
1005 }
1006
1007 return X86EMUL_CONTINUE;
1008}
1009EXPORT_SYMBOL_GPL(emulator_read_std);
1010
1011static int emulator_write_std(unsigned long addr,
1012 const void *val,
1013 unsigned int bytes,
1014 struct kvm_vcpu *vcpu)
1015{
1016 pr_unimpl(vcpu, "emulator_write_std: addr %lx n %d\n", addr, bytes);
1017 return X86EMUL_UNHANDLEABLE;
1018}
1019
1020/*
1021 * Only the apic needs an MMIO device hook, so shortcut now.
1022 */
1023static struct kvm_io_device *vcpu_find_pervcpu_dev(struct kvm_vcpu *vcpu,
1024 gpa_t addr)
1025{
1026 struct kvm_io_device *dev;
1027
1028 if (vcpu->apic) {
1029 dev = &vcpu->apic->dev;
1030 if (dev->in_range(dev, addr))
1031 return dev;
1032 }
1033 return NULL;
1034}
1035
1036static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu,
1037 gpa_t addr)
1038{
1039 struct kvm_io_device *dev;
1040
1041 dev = vcpu_find_pervcpu_dev(vcpu, addr);
1042 if (dev == NULL)
1043 dev = kvm_io_bus_find_dev(&vcpu->kvm->mmio_bus, addr);
1044 return dev;
1045}
1046
1047static struct kvm_io_device *vcpu_find_pio_dev(struct kvm_vcpu *vcpu,
1048 gpa_t addr)
1049{
1050 return kvm_io_bus_find_dev(&vcpu->kvm->pio_bus, addr);
1051}
1052
1053static int emulator_read_emulated(unsigned long addr,
1054 void *val,
1055 unsigned int bytes,
1056 struct kvm_vcpu *vcpu)
1057{
1058 struct kvm_io_device *mmio_dev;
1059 gpa_t gpa;
1060
1061 if (vcpu->mmio_read_completed) {
1062 memcpy(val, vcpu->mmio_data, bytes);
1063 vcpu->mmio_read_completed = 0;
1064 return X86EMUL_CONTINUE;
1065 } else if (emulator_read_std(addr, val, bytes, vcpu)
1066 == X86EMUL_CONTINUE)
1067 return X86EMUL_CONTINUE;
1068
1069 gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
1070 if (gpa == UNMAPPED_GVA)
1071 return X86EMUL_PROPAGATE_FAULT;
1072
1073 /*
1074 * Is this MMIO handled locally?
1075 */
1076 mmio_dev = vcpu_find_mmio_dev(vcpu, gpa);
1077 if (mmio_dev) {
1078 kvm_iodevice_read(mmio_dev, gpa, bytes, val);
1079 return X86EMUL_CONTINUE;
1080 }
1081
1082 vcpu->mmio_needed = 1;
1083 vcpu->mmio_phys_addr = gpa;
1084 vcpu->mmio_size = bytes;
1085 vcpu->mmio_is_write = 0;
1086
1087 return X86EMUL_UNHANDLEABLE;
1088}
1089
1090static int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
1091 const void *val, int bytes)
1092{
1093 struct page *page;
1094 void *virt;
1095
1096 if (((gpa + bytes - 1) >> PAGE_SHIFT) != (gpa >> PAGE_SHIFT))
1097 return 0;
1098 page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
1099 if (!page)
1100 return 0;
1101 mark_page_dirty(vcpu->kvm, gpa >> PAGE_SHIFT);
1102 virt = kmap_atomic(page, KM_USER0);
1103 kvm_mmu_pte_write(vcpu, gpa, val, bytes);
1104 memcpy(virt + offset_in_page(gpa), val, bytes);
1105 kunmap_atomic(virt, KM_USER0);
1106 return 1;
1107}
1108
1109static int emulator_write_emulated_onepage(unsigned long addr,
1110 const void *val,
1111 unsigned int bytes,
1112 struct kvm_vcpu *vcpu)
1113{
1114 struct kvm_io_device *mmio_dev;
1115 gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
1116
1117 if (gpa == UNMAPPED_GVA) {
1118 kvm_x86_ops->inject_page_fault(vcpu, addr, 2);
1119 return X86EMUL_PROPAGATE_FAULT;
1120 }
1121
1122 if (emulator_write_phys(vcpu, gpa, val, bytes))
1123 return X86EMUL_CONTINUE;
1124
1125 /*
1126 * Is this MMIO handled locally?
1127 */
1128 mmio_dev = vcpu_find_mmio_dev(vcpu, gpa);
1129 if (mmio_dev) {
1130 kvm_iodevice_write(mmio_dev, gpa, bytes, val);
1131 return X86EMUL_CONTINUE;
1132 }
1133
1134 vcpu->mmio_needed = 1;
1135 vcpu->mmio_phys_addr = gpa;
1136 vcpu->mmio_size = bytes;
1137 vcpu->mmio_is_write = 1;
1138 memcpy(vcpu->mmio_data, val, bytes);
1139
1140 return X86EMUL_CONTINUE;
1141}
1142
1143int emulator_write_emulated(unsigned long addr,
1144 const void *val,
1145 unsigned int bytes,
1146 struct kvm_vcpu *vcpu)
1147{
1148 /* Crossing a page boundary? */
1149 if (((addr + bytes - 1) ^ addr) & PAGE_MASK) {
1150 int rc, now;
1151
1152 now = -addr & ~PAGE_MASK;
1153 rc = emulator_write_emulated_onepage(addr, val, now, vcpu);
1154 if (rc != X86EMUL_CONTINUE)
1155 return rc;
1156 addr += now;
1157 val += now;
1158 bytes -= now;
1159 }
1160 return emulator_write_emulated_onepage(addr, val, bytes, vcpu);
1161}
1162EXPORT_SYMBOL_GPL(emulator_write_emulated);
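/*
 * Worked example (editor's illustration, not in the original patch):
 * "-addr & ~PAGE_MASK" is the number of bytes left on addr's page.
 * With addr = 0x1ffe and bytes = 8, now = 2, so the first onepage call
 * writes 2 bytes at 0x1ffe and the second writes the remaining 6 bytes
 * starting at 0x2000.
 */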
1163
1164static int emulator_cmpxchg_emulated(unsigned long addr,
1165 const void *old,
1166 const void *new,
1167 unsigned int bytes,
1168 struct kvm_vcpu *vcpu)
1169{
1170 static int reported;
1171
1172 if (!reported) {
1173 reported = 1;
1174 printk(KERN_WARNING "kvm: emulating exchange as write\n");
1175 }
1176 return emulator_write_emulated(addr, new, bytes, vcpu);
1177}
1178
1179static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
1180{
1181 return kvm_x86_ops->get_segment_base(vcpu, seg);
1182}
1183
1184int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)
1185{
1186 return X86EMUL_CONTINUE;
1187}
1188
1189int emulate_clts(struct kvm_vcpu *vcpu)
1190{
1191 kvm_x86_ops->set_cr0(vcpu, vcpu->cr0 & ~X86_CR0_TS);
1192 return X86EMUL_CONTINUE;
1193}
1194
1195int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest)
1196{
1197 struct kvm_vcpu *vcpu = ctxt->vcpu;
1198
1199 switch (dr) {
1200 case 0 ... 3:
1201 *dest = kvm_x86_ops->get_dr(vcpu, dr);
1202 return X86EMUL_CONTINUE;
1203 default:
1204 pr_unimpl(vcpu, "%s: unexpected dr %u\n", __FUNCTION__, dr);
1205 return X86EMUL_UNHANDLEABLE;
1206 }
1207}
1208
1209int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value)
1210{
1211 unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U;
1212 int exception;
1213
1214 kvm_x86_ops->set_dr(ctxt->vcpu, dr, value & mask, &exception);
1215 if (exception) {
1216 /* FIXME: better handling */
1217 return X86EMUL_UNHANDLEABLE;
1218 }
1219 return X86EMUL_CONTINUE;
1220}
1221
1222void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
1223{
1224 static int reported;
1225 u8 opcodes[4];
1226 unsigned long rip = vcpu->rip;
1227 unsigned long rip_linear;
1228
1229 rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS);
1230
1231 if (reported)
1232 return;
1233
1234 emulator_read_std(rip_linear, (void *)opcodes, 4, vcpu);
1235
1236 printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n",
1237 context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]);
1238 reported = 1;
1239}
1240EXPORT_SYMBOL_GPL(kvm_report_emulation_failure);
1241
1242struct x86_emulate_ops emulate_ops = {
1243 .read_std = emulator_read_std,
1244 .write_std = emulator_write_std,
1245 .read_emulated = emulator_read_emulated,
1246 .write_emulated = emulator_write_emulated,
1247 .cmpxchg_emulated = emulator_cmpxchg_emulated,
1248};
1249
1250int emulate_instruction(struct kvm_vcpu *vcpu,
1251 struct kvm_run *run,
1252 unsigned long cr2,
1253 u16 error_code)
1254{
1255 struct x86_emulate_ctxt emulate_ctxt;
1256 int r;
1257 int cs_db, cs_l;
1258
1259 vcpu->mmio_fault_cr2 = cr2;
1260 kvm_x86_ops->cache_regs(vcpu);
1261
1262 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
1263
1264 emulate_ctxt.vcpu = vcpu;
1265 emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu);
1266 emulate_ctxt.cr2 = cr2;
1267 emulate_ctxt.mode = (emulate_ctxt.eflags & X86_EFLAGS_VM)
1268 ? X86EMUL_MODE_REAL : cs_l
1269 ? X86EMUL_MODE_PROT64 : cs_db
1270 ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
1271
1272 if (emulate_ctxt.mode == X86EMUL_MODE_PROT64) {
1273 emulate_ctxt.cs_base = 0;
1274 emulate_ctxt.ds_base = 0;
1275 emulate_ctxt.es_base = 0;
1276 emulate_ctxt.ss_base = 0;
1277 } else {
1278 emulate_ctxt.cs_base = get_segment_base(vcpu, VCPU_SREG_CS);
1279 emulate_ctxt.ds_base = get_segment_base(vcpu, VCPU_SREG_DS);
1280 emulate_ctxt.es_base = get_segment_base(vcpu, VCPU_SREG_ES);
1281 emulate_ctxt.ss_base = get_segment_base(vcpu, VCPU_SREG_SS);
1282 }
1283
1284 emulate_ctxt.gs_base = get_segment_base(vcpu, VCPU_SREG_GS);
1285 emulate_ctxt.fs_base = get_segment_base(vcpu, VCPU_SREG_FS);
1286
1287 vcpu->mmio_is_write = 0;
1288 vcpu->pio.string = 0;
1289 r = x86_emulate_memop(&emulate_ctxt, &emulate_ops);
1290 if (vcpu->pio.string)
1291 return EMULATE_DO_MMIO;
1292
1293 if ((r || vcpu->mmio_is_write) && run) {
1294 run->exit_reason = KVM_EXIT_MMIO;
1295 run->mmio.phys_addr = vcpu->mmio_phys_addr;
1296 memcpy(run->mmio.data, vcpu->mmio_data, 8);
1297 run->mmio.len = vcpu->mmio_size;
1298 run->mmio.is_write = vcpu->mmio_is_write;
1299 }
1300
1301 if (r) {
1302 if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
1303 return EMULATE_DONE;
1304 if (!vcpu->mmio_needed) {
1305 kvm_report_emulation_failure(vcpu, "mmio");
1306 return EMULATE_FAIL;
1307 }
1308 return EMULATE_DO_MMIO;
1309 }
1310
1311 kvm_x86_ops->decache_regs(vcpu);
1312 kvm_x86_ops->set_rflags(vcpu, emulate_ctxt.eflags);
1313
1314 if (vcpu->mmio_is_write) {
1315 vcpu->mmio_needed = 0;
1316 return EMULATE_DO_MMIO;
1317 }
1318
1319 return EMULATE_DONE;
1320}
1321EXPORT_SYMBOL_GPL(emulate_instruction);
1322
1323/*
1324 * The vCPU has executed a HLT instruction with in-kernel mode enabled.
1325 */
1326static void kvm_vcpu_block(struct kvm_vcpu *vcpu)
1327{
1328 DECLARE_WAITQUEUE(wait, current);
1329
1330 add_wait_queue(&vcpu->wq, &wait);
1331
1332 /*
1333 * We will block until either an interrupt or a signal wakes us up
1334 */
1335 while (!kvm_cpu_has_interrupt(vcpu)
1336 && !signal_pending(current)
1337 && vcpu->mp_state != VCPU_MP_STATE_RUNNABLE
1338 && vcpu->mp_state != VCPU_MP_STATE_SIPI_RECEIVED) {
1339 set_current_state(TASK_INTERRUPTIBLE);
1340 vcpu_put(vcpu);
1341 schedule();
1342 vcpu_load(vcpu);
1343 }
1344
1345 __set_current_state(TASK_RUNNING);
1346 remove_wait_queue(&vcpu->wq, &wait);
1347}
1348
1349int kvm_emulate_halt(struct kvm_vcpu *vcpu)
1350{
1351 ++vcpu->stat.halt_exits;
1352 if (irqchip_in_kernel(vcpu->kvm)) {
1353 vcpu->mp_state = VCPU_MP_STATE_HALTED;
1354 kvm_vcpu_block(vcpu);
1355 if (vcpu->mp_state != VCPU_MP_STATE_RUNNABLE)
1356 return -EINTR;
1357 return 1;
1358 } else {
1359 vcpu->run->exit_reason = KVM_EXIT_HLT;
1360 return 0;
1361 }
1362}
1363EXPORT_SYMBOL_GPL(kvm_emulate_halt);
1364
1365int kvm_hypercall(struct kvm_vcpu *vcpu, struct kvm_run *run)
1366{
1367 unsigned long nr, a0, a1, a2, a3, a4, a5, ret;
1368
1369 kvm_x86_ops->cache_regs(vcpu);
1370 ret = -KVM_EINVAL;
1371#ifdef CONFIG_X86_64
1372 if (is_long_mode(vcpu)) {
1373 nr = vcpu->regs[VCPU_REGS_RAX];
1374 a0 = vcpu->regs[VCPU_REGS_RDI];
1375 a1 = vcpu->regs[VCPU_REGS_RSI];
1376 a2 = vcpu->regs[VCPU_REGS_RDX];
1377 a3 = vcpu->regs[VCPU_REGS_RCX];
1378 a4 = vcpu->regs[VCPU_REGS_R8];
1379 a5 = vcpu->regs[VCPU_REGS_R9];
1380 } else
1381#endif
1382 {
1383 nr = vcpu->regs[VCPU_REGS_RBX] & -1u;
1384 a0 = vcpu->regs[VCPU_REGS_RAX] & -1u;
1385 a1 = vcpu->regs[VCPU_REGS_RCX] & -1u;
1386 a2 = vcpu->regs[VCPU_REGS_RDX] & -1u;
1387 a3 = vcpu->regs[VCPU_REGS_RSI] & -1u;
1388 a4 = vcpu->regs[VCPU_REGS_RDI] & -1u;
1389 a5 = vcpu->regs[VCPU_REGS_RBP] & -1u;
1390 }
1391 switch (nr) {
1392 default:
1393 run->hypercall.nr = nr;
1394 run->hypercall.args[0] = a0;
1395 run->hypercall.args[1] = a1;
1396 run->hypercall.args[2] = a2;
1397 run->hypercall.args[3] = a3;
1398 run->hypercall.args[4] = a4;
1399 run->hypercall.args[5] = a5;
1400 run->hypercall.ret = ret;
1401 run->hypercall.longmode = is_long_mode(vcpu);
1402 kvm_x86_ops->decache_regs(vcpu);
1403 return 0;
1404 }
1405 vcpu->regs[VCPU_REGS_RAX] = ret;
1406 kvm_x86_ops->decache_regs(vcpu);
1407 return 1;
1408}
1409EXPORT_SYMBOL_GPL(kvm_hypercall);
1410
1411static u64 mk_cr_64(u64 curr_cr, u32 new_val)
1412{
1413 return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
1414}
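/*
 * Editor's note (illustrative, not part of the original patch):
 * mk_cr_64() splices a 32-bit value into the low half of a control
 * register while preserving the upper half, e.g.
 * mk_cr_64(0xffff000000000011, 0x80000033) == 0xffff000080000033;
 * realmode_set_cr() uses it because the emulated move supplies only
 * 32 bits.
 */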
1415
1416void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
1417{
1418 struct descriptor_table dt = { limit, base };
1419
1420 kvm_x86_ops->set_gdt(vcpu, &dt);
1421}
1422
1423void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
1424{
1425 struct descriptor_table dt = { limit, base };
1426
1427 kvm_x86_ops->set_idt(vcpu, &dt);
1428}
1429
1430void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
1431 unsigned long *rflags)
1432{
1433 lmsw(vcpu, msw);
1434 *rflags = kvm_x86_ops->get_rflags(vcpu);
1435}
1436
1437unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
1438{
1439 kvm_x86_ops->decache_cr4_guest_bits(vcpu);
1440 switch (cr) {
1441 case 0:
1442 return vcpu->cr0;
1443 case 2:
1444 return vcpu->cr2;
1445 case 3:
1446 return vcpu->cr3;
1447 case 4:
1448 return vcpu->cr4;
1449 default:
1450 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);
1451 return 0;
1452 }
1453}
1454
1455void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val,
1456 unsigned long *rflags)
1457{
1458 switch (cr) {
1459 case 0:
1460 set_cr0(vcpu, mk_cr_64(vcpu->cr0, val));
1461 *rflags = kvm_x86_ops->get_rflags(vcpu);
1462 break;
1463 case 2:
1464 vcpu->cr2 = val;
1465 break;
1466 case 3:
1467 set_cr3(vcpu, val);
1468 break;
1469 case 4:
1470 set_cr4(vcpu, mk_cr_64(vcpu->cr4, val));
1471 break;
1472 default:
1473 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);
1474 }
1475}
1476
1477/*
1478 * Register the para guest with the host:
1479 */
1480static int vcpu_register_para(struct kvm_vcpu *vcpu, gpa_t para_state_gpa)
1481{
1482 struct kvm_vcpu_para_state *para_state;
1483 hpa_t para_state_hpa, hypercall_hpa;
1484 struct page *para_state_page;
1485 unsigned char *hypercall;
1486 gpa_t hypercall_gpa;
1487
1488 printk(KERN_DEBUG "kvm: guest trying to enter paravirtual mode\n");
1489 printk(KERN_DEBUG ".... para_state_gpa: %08Lx\n", para_state_gpa);
1490
1491 /*
1492 * Needs to be page aligned:
1493 */
1494 if (para_state_gpa != PAGE_ALIGN(para_state_gpa))
1495 goto err_gp;
1496
1497 para_state_hpa = gpa_to_hpa(vcpu, para_state_gpa);
1498 printk(KERN_DEBUG ".... para_state_hpa: %08Lx\n", para_state_hpa);
1499 if (is_error_hpa(para_state_hpa))
1500 goto err_gp;
1501
1502 mark_page_dirty(vcpu->kvm, para_state_gpa >> PAGE_SHIFT);
1503 para_state_page = pfn_to_page(para_state_hpa >> PAGE_SHIFT);
1504 para_state = kmap(para_state_page);
1505
1506 printk(KERN_DEBUG ".... guest version: %d\n", para_state->guest_version);
1507 printk(KERN_DEBUG ".... size: %d\n", para_state->size);
1508
1509 para_state->host_version = KVM_PARA_API_VERSION;
1510 /*
1511 * We cannot support guests that try to register themselves
1512 * with a newer API version than the host supports:
1513 */
1514 if (para_state->guest_version > KVM_PARA_API_VERSION) {
1515 para_state->ret = -KVM_EINVAL;
1516 goto err_kunmap_skip;
1517 }
1518
1519 hypercall_gpa = para_state->hypercall_gpa;
1520 hypercall_hpa = gpa_to_hpa(vcpu, hypercall_gpa);
1521 printk(KERN_DEBUG ".... hypercall_hpa: %08Lx\n", hypercall_hpa);
1522 if (is_error_hpa(hypercall_hpa)) {
1523 para_state->ret = -KVM_EINVAL;
1524 goto err_kunmap_skip;
1525 }
1526
1527 printk(KERN_DEBUG "kvm: para guest successfully registered.\n");
1528 vcpu->para_state_page = para_state_page;
1529 vcpu->para_state_gpa = para_state_gpa;
1530 vcpu->hypercall_gpa = hypercall_gpa;
1531
1532 mark_page_dirty(vcpu->kvm, hypercall_gpa >> PAGE_SHIFT);
1533 hypercall = kmap_atomic(pfn_to_page(hypercall_hpa >> PAGE_SHIFT),
1534 KM_USER1) + (hypercall_hpa & ~PAGE_MASK);
1535 kvm_x86_ops->patch_hypercall(vcpu, hypercall);
1536 kunmap_atomic(hypercall, KM_USER1);
1537
1538 para_state->ret = 0;
1539err_kunmap_skip:
1540 kunmap(para_state_page);
1541 return 0;
1542err_gp:
1543 return 1;
1544}
1545
1546int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1547{
1548 u64 data;
1549
1550 switch (msr) {
1551 case 0xc0010010: /* SYSCFG */
1552 case 0xc0010015: /* HWCR */
1553 case MSR_IA32_PLATFORM_ID:
1554 case MSR_IA32_P5_MC_ADDR:
1555 case MSR_IA32_P5_MC_TYPE:
1556 case MSR_IA32_MC0_CTL:
1557 case MSR_IA32_MCG_STATUS:
1558 case MSR_IA32_MCG_CAP:
1559 case MSR_IA32_MC0_MISC:
1560 case MSR_IA32_MC0_MISC+4:
1561 case MSR_IA32_MC0_MISC+8:
1562 case MSR_IA32_MC0_MISC+12:
1563 case MSR_IA32_MC0_MISC+16:
1564 case MSR_IA32_UCODE_REV:
1565 case MSR_IA32_PERF_STATUS:
1566 case MSR_IA32_EBL_CR_POWERON:
1567 /* MTRR registers */
1568 case 0xfe:
1569 case 0x200 ... 0x2ff:
1570 data = 0;
1571 break;
1572 case 0xcd: /* fsb frequency */
1573 data = 3;
1574 break;
1575 case MSR_IA32_APICBASE:
1576 data = kvm_get_apic_base(vcpu);
1577 break;
1578 case MSR_IA32_MISC_ENABLE:
1579 data = vcpu->ia32_misc_enable_msr;
1580 break;
1581#ifdef CONFIG_X86_64
1582 case MSR_EFER:
1583 data = vcpu->shadow_efer;
1584 break;
1585#endif
1586 default:
1587 pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
1588 return 1;
1589 }
1590 *pdata = data;
1591 return 0;
1592}
1593EXPORT_SYMBOL_GPL(kvm_get_msr_common);
1594
1595/*
1596 * Reads an msr value (of 'msr_index') into 'pdata'.
1597 * Returns 0 on success, non-0 otherwise.
1598 * Assumes vcpu_load() was already called.
1599 */
1600int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
1601{
1602 return kvm_x86_ops->get_msr(vcpu, msr_index, pdata);
1603}
1604
1605#ifdef CONFIG_X86_64
1606
1607static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
1608{
1609 if (efer & EFER_RESERVED_BITS) {
1610 printk(KERN_DEBUG "set_efer: 0x%llx #GP, reserved bits\n",
1611 efer);
1612 inject_gp(vcpu);
1613 return;
1614 }
1615
1616 if (is_paging(vcpu)
1617 && (vcpu->shadow_efer & EFER_LME) != (efer & EFER_LME)) {
1618 printk(KERN_DEBUG "set_efer: #GP, change LME while paging\n");
1619 inject_gp(vcpu);
1620 return;
1621 }
1622
1623 kvm_x86_ops->set_efer(vcpu, efer);
1624
1625 efer &= ~EFER_LMA;
1626 efer |= vcpu->shadow_efer & EFER_LMA;
1627
1628 vcpu->shadow_efer = efer;
1629}
1630
1631#endif
1632
1633int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1634{
1635 switch (msr) {
1636#ifdef CONFIG_X86_64
1637 case MSR_EFER:
1638 set_efer(vcpu, data);
1639 break;
1640#endif
1641 case MSR_IA32_MC0_STATUS:
1642 pr_unimpl(vcpu, "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n",
1643 __FUNCTION__, data);
1644 break;
1645 case MSR_IA32_MCG_STATUS:
1646 pr_unimpl(vcpu, "%s: MSR_IA32_MCG_STATUS 0x%llx, nop\n",
1647 __FUNCTION__, data);
1648 break;
1649 case MSR_IA32_UCODE_REV:
1650 case MSR_IA32_UCODE_WRITE:
1651 case 0x200 ... 0x2ff: /* MTRRs */
1652 break;
1653 case MSR_IA32_APICBASE:
1654 kvm_set_apic_base(vcpu, data);
1655 break;
1656 case MSR_IA32_MISC_ENABLE:
1657 vcpu->ia32_misc_enable_msr = data;
1658 break;
1659 /*
1660 * This is the 'probe whether the host is KVM' logic:
1661 */
1662 case MSR_KVM_API_MAGIC:
1663 return vcpu_register_para(vcpu, data);
1664
1665 default:
1666 pr_unimpl(vcpu, "unhandled wrmsr: 0x%x\n", msr);
1667 return 1;
1668 }
1669 return 0;
1670}
1671EXPORT_SYMBOL_GPL(kvm_set_msr_common);
1672
1673/*
1674 * Writes msr value into the appropriate "register".
1675 * Returns 0 on success, non-0 otherwise.
1676 * Assumes vcpu_load() was already called.
1677 */
1678int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
1679{
1680 return kvm_x86_ops->set_msr(vcpu, msr_index, data);
1681}
1682
1683void kvm_resched(struct kvm_vcpu *vcpu)
1684{
1685 if (!need_resched())
1686 return;
1687 cond_resched();
1688}
1689EXPORT_SYMBOL_GPL(kvm_resched);
1690
1691void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
1692{
1693 int i;
1694 u32 function;
1695 struct kvm_cpuid_entry *e, *best;
1696
1697 kvm_x86_ops->cache_regs(vcpu);
1698 function = vcpu->regs[VCPU_REGS_RAX];
1699 vcpu->regs[VCPU_REGS_RAX] = 0;
1700 vcpu->regs[VCPU_REGS_RBX] = 0;
1701 vcpu->regs[VCPU_REGS_RCX] = 0;
1702 vcpu->regs[VCPU_REGS_RDX] = 0;
1703 best = NULL;
1704 for (i = 0; i < vcpu->cpuid_nent; ++i) {
1705 e = &vcpu->cpuid_entries[i];
1706 if (e->function == function) {
1707 best = e;
1708 break;
1709 }
1710 /*
1711 * Both basic or both extended?
1712 */
1713 if (((e->function ^ function) & 0x80000000) == 0)
1714 if (!best || e->function > best->function)
1715 best = e;
1716 }
1717 if (best) {
1718 vcpu->regs[VCPU_REGS_RAX] = best->eax;
1719 vcpu->regs[VCPU_REGS_RBX] = best->ebx;
1720 vcpu->regs[VCPU_REGS_RCX] = best->ecx;
1721 vcpu->regs[VCPU_REGS_RDX] = best->edx;
1722 }
1723 kvm_x86_ops->decache_regs(vcpu);
1724 kvm_x86_ops->skip_emulated_instruction(vcpu);
1725}
1726EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
1727
1728static int pio_copy_data(struct kvm_vcpu *vcpu)
1729{
1730 void *p = vcpu->pio_data;
1731 void *q;
1732 unsigned bytes;
1733 int nr_pages = vcpu->pio.guest_pages[1] ? 2 : 1;
1734
1735 q = vmap(vcpu->pio.guest_pages, nr_pages, VM_READ|VM_WRITE,
1736 PAGE_KERNEL);
1737 if (!q) {
1738 free_pio_guest_pages(vcpu);
1739 return -ENOMEM;
1740 }
1741 q += vcpu->pio.guest_page_offset;
1742 bytes = vcpu->pio.size * vcpu->pio.cur_count;
1743 if (vcpu->pio.in)
1744 memcpy(q, p, bytes);
1745 else
1746 memcpy(p, q, bytes);
1747 q -= vcpu->pio.guest_page_offset;
1748 vunmap(q);
1749 free_pio_guest_pages(vcpu);
1750 return 0;
1751}
1752
1753static int complete_pio(struct kvm_vcpu *vcpu)
1754{
1755 struct kvm_pio_request *io = &vcpu->pio;
1756 long delta;
1757 int r;
1758
1759 kvm_x86_ops->cache_regs(vcpu);
1760
1761 if (!io->string) {
1762 if (io->in)
1763 memcpy(&vcpu->regs[VCPU_REGS_RAX], vcpu->pio_data,
1764 io->size);
1765 } else {
1766 if (io->in) {
1767 r = pio_copy_data(vcpu);
1768 if (r) {
1769 kvm_x86_ops->cache_regs(vcpu);
1770 return r;
1771 }
1772 }
1773
1774 delta = 1;
1775 if (io->rep) {
1776 delta *= io->cur_count;
1777 /*
1778 * The size of the register should really depend on
1779 * current address size.
1780 */
1781 vcpu->regs[VCPU_REGS_RCX] -= delta;
1782 }
1783 if (io->down)
1784 delta = -delta;
1785 delta *= io->size;
1786 if (io->in)
1787 vcpu->regs[VCPU_REGS_RDI] += delta;
1788 else
1789 vcpu->regs[VCPU_REGS_RSI] += delta;
1790 }
1791
1792 kvm_x86_ops->decache_regs(vcpu);
1793
1794 io->count -= io->cur_count;
1795 io->cur_count = 0;
1796
1797 return 0;
1798}
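/*
 * Worked example (editor's illustration, not in the original patch):
 * for a string "rep insb" with cur_count = 5 and size = 1 the code
 * above computes delta = 5, so RCX -= 5 and RDI += 5; an "outs" would
 * advance RSI instead, and io->down (direction flag set) negates delta
 * so the index register walks backwards.
 */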
1799
1800static void kernel_pio(struct kvm_io_device *pio_dev,
1801 struct kvm_vcpu *vcpu,
1802 void *pd)
1803{
1804 /* TODO: String I/O for in kernel device */
1805
1806 mutex_lock(&vcpu->kvm->lock);
1807 if (vcpu->pio.in)
1808 kvm_iodevice_read(pio_dev, vcpu->pio.port,
1809 vcpu->pio.size,
1810 pd);
1811 else
1812 kvm_iodevice_write(pio_dev, vcpu->pio.port,
1813 vcpu->pio.size,
1814 pd);
1815 mutex_unlock(&vcpu->kvm->lock);
1816}
1817
1818static void pio_string_write(struct kvm_io_device *pio_dev,
1819 struct kvm_vcpu *vcpu)
1820{
1821 struct kvm_pio_request *io = &vcpu->pio;
1822 void *pd = vcpu->pio_data;
1823 int i;
1824
1825 mutex_lock(&vcpu->kvm->lock);
1826 for (i = 0; i < io->cur_count; i++) {
1827 kvm_iodevice_write(pio_dev, io->port,
1828 io->size,
1829 pd);
1830 pd += io->size;
1831 }
1832 mutex_unlock(&vcpu->kvm->lock);
1833}
1834
1835int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
1836 int size, unsigned port)
1837{
1838 struct kvm_io_device *pio_dev;
1839
1840 vcpu->run->exit_reason = KVM_EXIT_IO;
1841 vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
1842 vcpu->run->io.size = vcpu->pio.size = size;
1843 vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
1844 vcpu->run->io.count = vcpu->pio.count = vcpu->pio.cur_count = 1;
1845 vcpu->run->io.port = vcpu->pio.port = port;
1846 vcpu->pio.in = in;
1847 vcpu->pio.string = 0;
1848 vcpu->pio.down = 0;
1849 vcpu->pio.guest_page_offset = 0;
1850 vcpu->pio.rep = 0;
1851
1852 kvm_x86_ops->cache_regs(vcpu);
1853 memcpy(vcpu->pio_data, &vcpu->regs[VCPU_REGS_RAX], 4);
1854 kvm_x86_ops->decache_regs(vcpu);
1855
1856 kvm_x86_ops->skip_emulated_instruction(vcpu);
1857
1858 pio_dev = vcpu_find_pio_dev(vcpu, port);
1859 if (pio_dev) {
1860 kernel_pio(pio_dev, vcpu, vcpu->pio_data);
1861 complete_pio(vcpu);
1862 return 1;
1863 }
1864 return 0;
1865}
1866EXPORT_SYMBOL_GPL(kvm_emulate_pio);
1867
1868int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
1869 int size, unsigned long count, int down,
1870 gva_t address, int rep, unsigned port)
1871{
1872 unsigned now, in_page;
1873 int i, ret = 0;
1874 int nr_pages = 1;
1875 struct page *page;
1876 struct kvm_io_device *pio_dev;
1877
1878 vcpu->run->exit_reason = KVM_EXIT_IO;
1879 vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
1880 vcpu->run->io.size = vcpu->pio.size = size;
1881 vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
1882 vcpu->run->io.count = vcpu->pio.count = vcpu->pio.cur_count = count;
1883 vcpu->run->io.port = vcpu->pio.port = port;
1884 vcpu->pio.in = in;
1885 vcpu->pio.string = 1;
1886 vcpu->pio.down = down;
1887 vcpu->pio.guest_page_offset = offset_in_page(address);
1888 vcpu->pio.rep = rep;
1889
1890 if (!count) {
1891 kvm_x86_ops->skip_emulated_instruction(vcpu);
1892 return 1;
1893 }
1894
1895 if (!down)
1896 in_page = PAGE_SIZE - offset_in_page(address);
1897 else
1898 in_page = offset_in_page(address) + size;
1899 now = min(count, (unsigned long)in_page / size);
1900 if (!now) {
1901 /*
1902 * String I/O straddles page boundary. Pin two guest pages
1903 * so that we satisfy atomicity constraints. Do just one
1904 * transaction to avoid complexity.
1905 */
1906 nr_pages = 2;
1907 now = 1;
1908 }
1909 if (down) {
1910 /*
1911 * String I/O in reverse. Yuck. Kill the guest, fix later.
1912 */
1913 pr_unimpl(vcpu, "guest string pio down\n");
1914 inject_gp(vcpu);
1915 return 1;
1916 }
1917 vcpu->run->io.count = now;
1918 vcpu->pio.cur_count = now;
1919
1920 if (vcpu->pio.cur_count == vcpu->pio.count)
1921 kvm_x86_ops->skip_emulated_instruction(vcpu);
1922
1923 for (i = 0; i < nr_pages; ++i) {
1924 mutex_lock(&vcpu->kvm->lock);
1925 page = gva_to_page(vcpu, address + i * PAGE_SIZE);
1926 if (page)
1927 get_page(page);
1928 vcpu->pio.guest_pages[i] = page;
1929 mutex_unlock(&vcpu->kvm->lock);
1930 if (!page) {
1931 inject_gp(vcpu);
1932 free_pio_guest_pages(vcpu);
1933 return 1;
1934 }
1935 }
1936
1937 pio_dev = vcpu_find_pio_dev(vcpu, port);
1938 if (!vcpu->pio.in) {
1939 /* string PIO write */
1940 ret = pio_copy_data(vcpu);
1941 if (ret >= 0 && pio_dev) {
1942 pio_string_write(pio_dev, vcpu);
1943 complete_pio(vcpu);
1944 if (vcpu->pio.count == 0)
1945 ret = 1;
1946 }
1947 } else if (pio_dev)
1948 pr_unimpl(vcpu, "no string pio read support yet, "
1949 "port %x size %d count %ld\n",
1950 port, size, count);
1951
1952 return ret;
1953}
1954EXPORT_SYMBOL_GPL(kvm_emulate_pio_string);
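/*
 * Worked example (editor's illustration, not in the original patch):
 * with address = 0xffe, size = 4 and count = 8, in_page = 2 and
 * 2 / 4 = 0, so the straddle path pins two guest pages and forces
 * now = 1: the single 4-byte transfer crossing the page boundary is
 * then handled as one transaction, as the comment above requires.
 */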
1955
1956/*
1957 * Check if userspace requested an interrupt window, and that the
1958 * interrupt window is open.
1959 *
1960 * No need to exit to userspace if we already have an interrupt queued.
1961 */
1962static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu,
1963 struct kvm_run *kvm_run)
1964{
1965 return (!vcpu->irq_summary &&
1966 kvm_run->request_interrupt_window &&
1967 vcpu->interrupt_window_open &&
1968 (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF));
1969}
1970
1971static void post_kvm_run_save(struct kvm_vcpu *vcpu,
1972 struct kvm_run *kvm_run)
1973{
1974 kvm_run->if_flag = (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF) != 0;
1975 kvm_run->cr8 = get_cr8(vcpu);
1976 kvm_run->apic_base = kvm_get_apic_base(vcpu);
1977 if (irqchip_in_kernel(vcpu->kvm))
1978 kvm_run->ready_for_interrupt_injection = 1;
1979 else
1980 kvm_run->ready_for_interrupt_injection =
1981 (vcpu->interrupt_window_open &&
1982 vcpu->irq_summary == 0);
1983}
1984
1985static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1986{
1987 int r;
1988
1989 if (unlikely(vcpu->mp_state == VCPU_MP_STATE_SIPI_RECEIVED)) {
1990 printk("vcpu %d received sipi with vector # %x\n",
1991 vcpu->vcpu_id, vcpu->sipi_vector);
1992 kvm_lapic_reset(vcpu);
1993 kvm_x86_ops->vcpu_reset(vcpu);
1994 vcpu->mp_state = VCPU_MP_STATE_RUNNABLE;
1995 }
1996
1997preempted:
1998 if (vcpu->guest_debug.enabled)
1999 kvm_x86_ops->guest_debug_pre(vcpu);
2000
2001again:
2002 r = kvm_mmu_reload(vcpu);
2003 if (unlikely(r))
2004 goto out;
2005
2006 preempt_disable();
2007
2008 kvm_x86_ops->prepare_guest_switch(vcpu);
2009 kvm_load_guest_fpu(vcpu);
2010
2011 local_irq_disable();
2012
2013 if (signal_pending(current)) {
2014 local_irq_enable();
2015 preempt_enable();
2016 r = -EINTR;
2017 kvm_run->exit_reason = KVM_EXIT_INTR;
2018 ++vcpu->stat.signal_exits;
2019 goto out;
2020 }
2021
2022 if (irqchip_in_kernel(vcpu->kvm))
2023 kvm_x86_ops->inject_pending_irq(vcpu);
2024 else if (!vcpu->mmio_read_completed)
2025 kvm_x86_ops->inject_pending_vectors(vcpu, kvm_run);
2026
2027 vcpu->guest_mode = 1;
2028 kvm_guest_enter();
2029
2030 if (vcpu->requests)
2031 if (test_and_clear_bit(KVM_TLB_FLUSH, &vcpu->requests))
2032 kvm_x86_ops->tlb_flush(vcpu);
2033
2034 kvm_x86_ops->run(vcpu, kvm_run);
2035
2036 vcpu->guest_mode = 0;
2037 local_irq_enable();
2038
2039 ++vcpu->stat.exits;
2040
2041 /*
2042 * We must have an instruction between local_irq_enable() and
2043 * kvm_guest_exit(), so the timer interrupt isn't delayed by
2044 * the interrupt shadow. The stat.exits increment will do nicely.
2045 * But we need to prevent reordering, hence this barrier():
2046 */
2047 barrier();
2048
2049 kvm_guest_exit();
2050
2051 preempt_enable();
2052
2053 /*
2054 * Profile KVM exit RIPs:
2055 */
2056 if (unlikely(prof_on == KVM_PROFILING)) {
2057 kvm_x86_ops->cache_regs(vcpu);
2058 profile_hit(KVM_PROFILING, (void *)vcpu->rip);
2059 }
2060
2061 r = kvm_x86_ops->handle_exit(kvm_run, vcpu);
2062
2063 if (r > 0) {
2064 if (dm_request_for_irq_injection(vcpu, kvm_run)) {
2065 r = -EINTR;
2066 kvm_run->exit_reason = KVM_EXIT_INTR;
2067 ++vcpu->stat.request_irq_exits;
2068 goto out;
2069 }
2070 if (!need_resched()) {
2071 ++vcpu->stat.light_exits;
2072 goto again;
2073 }
2074 }
2075
2076out:
2077 if (r > 0) {
2078 kvm_resched(vcpu);
2079 goto preempted;
2080 }
2081
2082 post_kvm_run_save(vcpu, kvm_run);
2083
2084 return r;
2085}
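/*
 * Editor's note (summary of the loop above, not part of the original
 * patch): each iteration reloads the mmu root if needed, disables
 * interrupts, bails out to userspace on a pending signal, injects any
 * queued interrupt, services a pending KVM_TLB_FLUSH request, runs the
 * guest, re-enables interrupts and only then handles the exit; exits
 * that need neither userspace nor a reschedule loop straight back to
 * the "again" label (counted as light_exits), everything else goes via
 * kvm_resched() and the "preempted" label.
 */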
2086
2087
2088static int kvm_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2089{
2090 int r;
2091 sigset_t sigsaved;
2092
2093 vcpu_load(vcpu);
2094
2095 if (unlikely(vcpu->mp_state == VCPU_MP_STATE_UNINITIALIZED)) {
2096 kvm_vcpu_block(vcpu);
2097 vcpu_put(vcpu);
2098 return -EAGAIN;
2099 }
2100
2101 if (vcpu->sigset_active)
2102 sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
2103
2104 /* re-sync apic's tpr */
2105 if (!irqchip_in_kernel(vcpu->kvm))
2106 set_cr8(vcpu, kvm_run->cr8);
2107
2108 if (vcpu->pio.cur_count) {
2109 r = complete_pio(vcpu);
2110 if (r)
2111 goto out;
2112 }
2113
2114 if (vcpu->mmio_needed) {
2115 memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
2116 vcpu->mmio_read_completed = 1;
2117 vcpu->mmio_needed = 0;
2118 r = emulate_instruction(vcpu, kvm_run,
2119 vcpu->mmio_fault_cr2, 0);
2120 if (r == EMULATE_DO_MMIO) {
2121 /*
2122 * Read-modify-write. Back to userspace.
2123 */
2124 r = 0;
2125 goto out;
2126 }
2127 }
2128
2129 if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL) {
2130 kvm_x86_ops->cache_regs(vcpu);
2131 vcpu->regs[VCPU_REGS_RAX] = kvm_run->hypercall.ret;
2132 kvm_x86_ops->decache_regs(vcpu);
2133 }
2134
2135 r = __vcpu_run(vcpu, kvm_run);
2136
2137out:
2138 if (vcpu->sigset_active)
2139 sigprocmask(SIG_SETMASK, &sigsaved, NULL);
2140
2141 vcpu_put(vcpu);
2142 return r;
2143}
2144
2145static int kvm_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu,
2146 struct kvm_regs *regs)
2147{
2148 vcpu_load(vcpu);
2149
2150 kvm_x86_ops->cache_regs(vcpu);
2151
2152 regs->rax = vcpu->regs[VCPU_REGS_RAX];
2153 regs->rbx = vcpu->regs[VCPU_REGS_RBX];
2154 regs->rcx = vcpu->regs[VCPU_REGS_RCX];
2155 regs->rdx = vcpu->regs[VCPU_REGS_RDX];
2156 regs->rsi = vcpu->regs[VCPU_REGS_RSI];
2157 regs->rdi = vcpu->regs[VCPU_REGS_RDI];
2158 regs->rsp = vcpu->regs[VCPU_REGS_RSP];
2159 regs->rbp = vcpu->regs[VCPU_REGS_RBP];
2160#ifdef CONFIG_X86_64
2161 regs->r8 = vcpu->regs[VCPU_REGS_R8];
2162 regs->r9 = vcpu->regs[VCPU_REGS_R9];
2163 regs->r10 = vcpu->regs[VCPU_REGS_R10];
2164 regs->r11 = vcpu->regs[VCPU_REGS_R11];
2165 regs->r12 = vcpu->regs[VCPU_REGS_R12];
2166 regs->r13 = vcpu->regs[VCPU_REGS_R13];
2167 regs->r14 = vcpu->regs[VCPU_REGS_R14];
2168 regs->r15 = vcpu->regs[VCPU_REGS_R15];
2169#endif
2170
2171 regs->rip = vcpu->rip;
2172 regs->rflags = kvm_x86_ops->get_rflags(vcpu);
2173
2174 /*
2175 * Don't leak debug flags in case they were set for guest debugging
2176 */
2177 if (vcpu->guest_debug.enabled && vcpu->guest_debug.singlestep)
2178 regs->rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
2179
2180 vcpu_put(vcpu);
2181
2182 return 0;
2183}
2184
2185static int kvm_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu,
2186 struct kvm_regs *regs)
2187{
2188 vcpu_load(vcpu);
2189
2190 vcpu->regs[VCPU_REGS_RAX] = regs->rax;
2191 vcpu->regs[VCPU_REGS_RBX] = regs->rbx;
2192 vcpu->regs[VCPU_REGS_RCX] = regs->rcx;
2193 vcpu->regs[VCPU_REGS_RDX] = regs->rdx;
2194 vcpu->regs[VCPU_REGS_RSI] = regs->rsi;
2195 vcpu->regs[VCPU_REGS_RDI] = regs->rdi;
2196 vcpu->regs[VCPU_REGS_RSP] = regs->rsp;
2197 vcpu->regs[VCPU_REGS_RBP] = regs->rbp;
2198#ifdef CONFIG_X86_64
2199 vcpu->regs[VCPU_REGS_R8] = regs->r8;
2200 vcpu->regs[VCPU_REGS_R9] = regs->r9;
2201 vcpu->regs[VCPU_REGS_R10] = regs->r10;
2202 vcpu->regs[VCPU_REGS_R11] = regs->r11;
2203 vcpu->regs[VCPU_REGS_R12] = regs->r12;
2204 vcpu->regs[VCPU_REGS_R13] = regs->r13;
2205 vcpu->regs[VCPU_REGS_R14] = regs->r14;
2206 vcpu->regs[VCPU_REGS_R15] = regs->r15;
2207#endif
2208
2209 vcpu->rip = regs->rip;
2210 kvm_x86_ops->set_rflags(vcpu, regs->rflags);
2211
2212 kvm_x86_ops->decache_regs(vcpu);
2213
2214 vcpu_put(vcpu);
2215
2216 return 0;
2217}
2218
2219static void get_segment(struct kvm_vcpu *vcpu,
2220 struct kvm_segment *var, int seg)
2221{
2222 return kvm_x86_ops->get_segment(vcpu, var, seg);
2223}
2224
2225static int kvm_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
2226 struct kvm_sregs *sregs)
2227{
2228 struct descriptor_table dt;
2229 int pending_vec;
2230
2231 vcpu_load(vcpu);
2232
2233 get_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
2234 get_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
2235 get_segment(vcpu, &sregs->es, VCPU_SREG_ES);
2236 get_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
2237 get_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
2238 get_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
2239
2240 get_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
2241 get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
2242
2243 kvm_x86_ops->get_idt(vcpu, &dt);
2244 sregs->idt.limit = dt.limit;
2245 sregs->idt.base = dt.base;
2246 kvm_x86_ops->get_gdt(vcpu, &dt);
2247 sregs->gdt.limit = dt.limit;
2248 sregs->gdt.base = dt.base;
2249
2250 kvm_x86_ops->decache_cr4_guest_bits(vcpu);
2251 sregs->cr0 = vcpu->cr0;
2252 sregs->cr2 = vcpu->cr2;
2253 sregs->cr3 = vcpu->cr3;
2254 sregs->cr4 = vcpu->cr4;
2255 sregs->cr8 = get_cr8(vcpu);
2256 sregs->efer = vcpu->shadow_efer;
2257 sregs->apic_base = kvm_get_apic_base(vcpu);
2258
2259 if (irqchip_in_kernel(vcpu->kvm)) {
2260 memset(sregs->interrupt_bitmap, 0,
2261 sizeof sregs->interrupt_bitmap);
2262 pending_vec = kvm_x86_ops->get_irq(vcpu);
2263 if (pending_vec >= 0)
2264 set_bit(pending_vec, (unsigned long *)sregs->interrupt_bitmap);
2265 } else
2266 memcpy(sregs->interrupt_bitmap, vcpu->irq_pending,
2267 sizeof sregs->interrupt_bitmap);
2268
2269 vcpu_put(vcpu);
2270
2271 return 0;
2272}
2273
2274static void set_segment(struct kvm_vcpu *vcpu,
2275 struct kvm_segment *var, int seg)
2276{
2277 return kvm_x86_ops->set_segment(vcpu, var, seg);
2278}
2279
2280static int kvm_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
2281 struct kvm_sregs *sregs)
2282{
2283 int mmu_reset_needed = 0;
2284 int i, pending_vec, max_bits;
2285 struct descriptor_table dt;
2286
2287 vcpu_load(vcpu);
2288
2289 dt.limit = sregs->idt.limit;
2290 dt.base = sregs->idt.base;
2291 kvm_x86_ops->set_idt(vcpu, &dt);
2292 dt.limit = sregs->gdt.limit;
2293 dt.base = sregs->gdt.base;
2294 kvm_x86_ops->set_gdt(vcpu, &dt);
2295
2296 vcpu->cr2 = sregs->cr2;
2297 mmu_reset_needed |= vcpu->cr3 != sregs->cr3;
2298 vcpu->cr3 = sregs->cr3;
2299
2300 set_cr8(vcpu, sregs->cr8);
2301
2302 mmu_reset_needed |= vcpu->shadow_efer != sregs->efer;
2303#ifdef CONFIG_X86_64
2304 kvm_x86_ops->set_efer(vcpu, sregs->efer);
2305#endif
2306 kvm_set_apic_base(vcpu, sregs->apic_base);
2307
2308 kvm_x86_ops->decache_cr4_guest_bits(vcpu);
2309
2310 mmu_reset_needed |= vcpu->cr0 != sregs->cr0;
2311 vcpu->cr0 = sregs->cr0;
2312 kvm_x86_ops->set_cr0(vcpu, sregs->cr0);
2313
2314 mmu_reset_needed |= vcpu->cr4 != sregs->cr4;
2315 kvm_x86_ops->set_cr4(vcpu, sregs->cr4);
2316 if (!is_long_mode(vcpu) && is_pae(vcpu))
2317 load_pdptrs(vcpu, vcpu->cr3);
2318
2319 if (mmu_reset_needed)
2320 kvm_mmu_reset_context(vcpu);
2321
2322 if (!irqchip_in_kernel(vcpu->kvm)) {
2323 memcpy(vcpu->irq_pending, sregs->interrupt_bitmap,
2324 sizeof vcpu->irq_pending);
2325 vcpu->irq_summary = 0;
2326 for (i = 0; i < ARRAY_SIZE(vcpu->irq_pending); ++i)
2327 if (vcpu->irq_pending[i])
2328 __set_bit(i, &vcpu->irq_summary);
2329 } else {
2330 max_bits = (sizeof sregs->interrupt_bitmap) << 3;
2331 pending_vec = find_first_bit(
2332 (const unsigned long *)sregs->interrupt_bitmap,
2333 max_bits);
2334 /* Only pending external irq is handled here */
2335 if (pending_vec < max_bits) {
2336 kvm_x86_ops->set_irq(vcpu, pending_vec);
2337			printk(KERN_DEBUG "Set back pending irq %d\n", pending_vec);
2338 }
2339 }
2340
2341 set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
2342 set_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
2343 set_segment(vcpu, &sregs->es, VCPU_SREG_ES);
2344 set_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
2345 set_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
2346 set_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
2347
2348 set_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
2349 set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
2350
2351 vcpu_put(vcpu);
2352
2353 return 0;
2354}
2355
2356void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
2357{
2358 struct kvm_segment cs;
2359
2360 get_segment(vcpu, &cs, VCPU_SREG_CS);
2361 *db = cs.db;
2362 *l = cs.l;
2363}
2364EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits);
2365
2366/*
2367 * List of msr numbers which we expose to userspace through KVM_GET_MSRS
2368 * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
2369 *
2370 * This list is modified at module load time to reflect the
2371 * capabilities of the host cpu.
2372 */
2373static u32 msrs_to_save[] = {
2374 MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
2375 MSR_K6_STAR,
2376#ifdef CONFIG_X86_64
2377 MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
2378#endif
2379 MSR_IA32_TIME_STAMP_COUNTER,
2380};
2381
2382static unsigned num_msrs_to_save;
2383
2384static u32 emulated_msrs[] = {
2385 MSR_IA32_MISC_ENABLE,
2386};
2387
2388static __init void kvm_init_msr_list(void)
2389{
2390 u32 dummy[2];
2391 unsigned i, j;
2392
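	/* Drop MSRs that the host cannot read, compacting the list in place. */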
2393 for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) {
2394 if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0)
2395 continue;
2396 if (j < i)
2397 msrs_to_save[j] = msrs_to_save[i];
2398 j++;
2399 }
2400 num_msrs_to_save = j;
2401}
2402
2403/*
2404 * Adapt set_msr() to msr_io()'s calling convention
2405 */
2406static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
2407{
2408 return kvm_set_msr(vcpu, index, *data);
2409}
2410
2411/*
2412 * Read or write a bunch of msrs. All parameters are kernel addresses.
2413 *
2414 * @return number of msrs set successfully.
2415 */
2416static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,
2417 struct kvm_msr_entry *entries,
2418 int (*do_msr)(struct kvm_vcpu *vcpu,
2419 unsigned index, u64 *data))
2420{
2421 int i;
2422
2423 vcpu_load(vcpu);
2424
2425 for (i = 0; i < msrs->nmsrs; ++i)
2426 if (do_msr(vcpu, entries[i].index, &entries[i].data))
2427 break;
2428
2429 vcpu_put(vcpu);
2430
2431 return i;
2432}
2433
2434/*
2435 * Read or write a bunch of msrs. Parameters are user addresses.
2436 *
2437 * @return number of msrs set successfully.
2438 */
2439static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs,
2440 int (*do_msr)(struct kvm_vcpu *vcpu,
2441 unsigned index, u64 *data),
2442 int writeback)
2443{
2444 struct kvm_msrs msrs;
2445 struct kvm_msr_entry *entries;
2446 int r, n;
2447 unsigned size;
2448
2449 r = -EFAULT;
2450 if (copy_from_user(&msrs, user_msrs, sizeof msrs))
2451 goto out;
2452
2453 r = -E2BIG;
2454 if (msrs.nmsrs >= MAX_IO_MSRS)
2455 goto out;
2456
2457 r = -ENOMEM;
2458 size = sizeof(struct kvm_msr_entry) * msrs.nmsrs;
2459 entries = vmalloc(size);
2460 if (!entries)
2461 goto out;
2462
2463 r = -EFAULT;
2464 if (copy_from_user(entries, user_msrs->entries, size))
2465 goto out_free;
2466
2467 r = n = __msr_io(vcpu, &msrs, entries, do_msr);
2468 if (r < 0)
2469 goto out_free;
2470
2471 r = -EFAULT;
2472 if (writeback && copy_to_user(user_msrs->entries, entries, size))
2473 goto out_free;
2474
2475 r = n;
2476
2477out_free:
2478 vfree(entries);
2479out:
2480 return r;
2481}
2482
2483/*
2484 * Translate a guest virtual address to a guest physical address.
2485 */
2486static int kvm_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
2487 struct kvm_translation *tr)
2488{
2489 unsigned long vaddr = tr->linear_address;
2490 gpa_t gpa;
2491
2492 vcpu_load(vcpu);
2493 mutex_lock(&vcpu->kvm->lock);
2494 gpa = vcpu->mmu.gva_to_gpa(vcpu, vaddr);
2495 tr->physical_address = gpa;
2496 tr->valid = gpa != UNMAPPED_GVA;
2497 tr->writeable = 1;
2498 tr->usermode = 0;
2499 mutex_unlock(&vcpu->kvm->lock);
2500 vcpu_put(vcpu);
2501
2502 return 0;
2503}
2504
2505static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
2506 struct kvm_interrupt *irq)
2507{
2508 if (irq->irq < 0 || irq->irq >= 256)
2509 return -EINVAL;
2510 if (irqchip_in_kernel(vcpu->kvm))
2511 return -ENXIO;
2512 vcpu_load(vcpu);
2513
2514 set_bit(irq->irq, vcpu->irq_pending);
2515 set_bit(irq->irq / BITS_PER_LONG, &vcpu->irq_summary);
2516
2517 vcpu_put(vcpu);
2518
2519 return 0;
2520}
2521
2522static int kvm_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu,
2523 struct kvm_debug_guest *dbg)
2524{
2525 int r;
2526
2527 vcpu_load(vcpu);
2528
2529 r = kvm_x86_ops->set_guest_debug(vcpu, dbg);
2530
2531 vcpu_put(vcpu);
2532
2533 return r;
2534}
2535
2536static struct page *kvm_vcpu_nopage(struct vm_area_struct *vma,
2537 unsigned long address,
2538 int *type)
2539{
2540 struct kvm_vcpu *vcpu = vma->vm_file->private_data;
2541 unsigned long pgoff;
2542 struct page *page;
2543
2544 pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
2545 if (pgoff == 0)
2546 page = virt_to_page(vcpu->run);
2547 else if (pgoff == KVM_PIO_PAGE_OFFSET)
2548 page = virt_to_page(vcpu->pio_data);
2549 else
2550 return NOPAGE_SIGBUS;
2551 get_page(page);
2552 if (type != NULL)
2553 *type = VM_FAULT_MINOR;
2554
2555 return page;
2556}
2557
2558static struct vm_operations_struct kvm_vcpu_vm_ops = {
2559 .nopage = kvm_vcpu_nopage,
2560};
2561
2562static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma)
2563{
2564 vma->vm_ops = &kvm_vcpu_vm_ops;
2565 return 0;
2566}
2567
2568static int kvm_vcpu_release(struct inode *inode, struct file *filp)
2569{
2570 struct kvm_vcpu *vcpu = filp->private_data;
2571
2572 fput(vcpu->kvm->filp);
2573 return 0;
2574}
2575
2576static struct file_operations kvm_vcpu_fops = {
2577 .release = kvm_vcpu_release,
2578 .unlocked_ioctl = kvm_vcpu_ioctl,
2579 .compat_ioctl = kvm_vcpu_ioctl,
2580 .mmap = kvm_vcpu_mmap,
2581};
2582
2583/*
2584 * Allocates an inode for the vcpu.
2585 */
2586static int create_vcpu_fd(struct kvm_vcpu *vcpu)
2587{
2588 int fd, r;
2589 struct inode *inode;
2590 struct file *file;
2591
2592 r = anon_inode_getfd(&fd, &inode, &file,
2593 "kvm-vcpu", &kvm_vcpu_fops, vcpu);
2594 if (r)
2595 return r;
2596 atomic_inc(&vcpu->kvm->filp->f_count);
2597 return fd;
2598}
2599
2600/*
2601 * Creates some virtual cpus. Good luck creating more than one.
2602 */
2603static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int n)
2604{
2605 int r;
2606 struct kvm_vcpu *vcpu;
2607
2608 if (!valid_vcpu(n))
2609 return -EINVAL;
2610
2611 vcpu = kvm_x86_ops->vcpu_create(kvm, n);
2612 if (IS_ERR(vcpu))
2613 return PTR_ERR(vcpu);
2614
2615 preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops);
2616
2617 /* We do fxsave: this must be aligned. */
2618 BUG_ON((unsigned long)&vcpu->host_fx_image & 0xF);
2619
2620 vcpu_load(vcpu);
2621 r = kvm_mmu_setup(vcpu);
2622 vcpu_put(vcpu);
2623 if (r < 0)
2624 goto free_vcpu;
2625
2626 mutex_lock(&kvm->lock);
2627 if (kvm->vcpus[n]) {
2628 r = -EEXIST;
2629 mutex_unlock(&kvm->lock);
2630 goto mmu_unload;
2631 }
2632 kvm->vcpus[n] = vcpu;
2633 mutex_unlock(&kvm->lock);
2634
2635 /* Now it's all set up, let userspace reach it */
2636 r = create_vcpu_fd(vcpu);
2637 if (r < 0)
2638 goto unlink;
2639 return r;
2640
2641unlink:
2642 mutex_lock(&kvm->lock);
2643 kvm->vcpus[n] = NULL;
2644 mutex_unlock(&kvm->lock);
2645
2646mmu_unload:
2647 vcpu_load(vcpu);
2648 kvm_mmu_unload(vcpu);
2649 vcpu_put(vcpu);
2650
2651free_vcpu:
2652 kvm_x86_ops->vcpu_free(vcpu);
2653 return r;
2654}
2655
2656static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu)
2657{
2658 u64 efer;
2659 int i;
2660 struct kvm_cpuid_entry *e, *entry;
2661
2662 rdmsrl(MSR_EFER, efer);
2663 entry = NULL;
2664 for (i = 0; i < vcpu->cpuid_nent; ++i) {
2665 e = &vcpu->cpuid_entries[i];
2666 if (e->function == 0x80000001) {
2667 entry = e;
2668 break;
2669 }
2670 }
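	/* Bit 20 of CPUID 0x80000001:EDX advertises the NX feature. */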
2671 if (entry && (entry->edx & (1 << 20)) && !(efer & EFER_NX)) {
2672 entry->edx &= ~(1 << 20);
2673 printk(KERN_INFO "kvm: guest NX capability removed\n");
2674 }
2675}
2676
2677static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
2678 struct kvm_cpuid *cpuid,
2679 struct kvm_cpuid_entry __user *entries)
2680{
2681 int r;
2682
2683 r = -E2BIG;
2684 if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
2685 goto out;
2686 r = -EFAULT;
2687 if (copy_from_user(&vcpu->cpuid_entries, entries,
2688 cpuid->nent * sizeof(struct kvm_cpuid_entry)))
2689 goto out;
2690 vcpu->cpuid_nent = cpuid->nent;
2691 cpuid_fix_nx_cap(vcpu);
2692 return 0;
2693
2694out:
2695 return r;
2696}
2697
2698static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset)
2699{
2700 if (sigset) {
2701 sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP));
2702 vcpu->sigset_active = 1;
2703 vcpu->sigset = *sigset;
2704 } else
2705 vcpu->sigset_active = 0;
2706 return 0;
2707}
2708
2709/*
2710 * fxsave fpu state. Taken from x86_64/processor.h. To be killed when
2711 * we have asm/x86/processor.h
2712 */
2713struct fxsave {
2714 u16 cwd;
2715 u16 swd;
2716 u16 twd;
2717 u16 fop;
2718 u64 rip;
2719 u64 rdp;
2720 u32 mxcsr;
2721 u32 mxcsr_mask;
2722 u32 st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */
2723#ifdef CONFIG_X86_64
2724 u32 xmm_space[64]; /* 16*16 bytes for each XMM-reg = 256 bytes */
2725#else
2726 u32 xmm_space[32]; /* 8*16 bytes for each XMM-reg = 128 bytes */
2727#endif
2728};
2729
2730static int kvm_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
2731{
2732 struct fxsave *fxsave = (struct fxsave *)&vcpu->guest_fx_image;
2733
2734 vcpu_load(vcpu);
2735
2736 memcpy(fpu->fpr, fxsave->st_space, 128);
2737 fpu->fcw = fxsave->cwd;
2738 fpu->fsw = fxsave->swd;
2739 fpu->ftwx = fxsave->twd;
2740 fpu->last_opcode = fxsave->fop;
2741 fpu->last_ip = fxsave->rip;
2742 fpu->last_dp = fxsave->rdp;
2743 memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space);
2744
2745 vcpu_put(vcpu);
2746
2747 return 0;
2748}
2749
2750static int kvm_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
2751{
2752 struct fxsave *fxsave = (struct fxsave *)&vcpu->guest_fx_image;
2753
2754 vcpu_load(vcpu);
2755
2756 memcpy(fxsave->st_space, fpu->fpr, 128);
2757 fxsave->cwd = fpu->fcw;
2758 fxsave->swd = fpu->fsw;
2759 fxsave->twd = fpu->ftwx;
2760 fxsave->fop = fpu->last_opcode;
2761 fxsave->rip = fpu->last_ip;
2762 fxsave->rdp = fpu->last_dp;
2763 memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space);
2764
2765 vcpu_put(vcpu);
2766
2767 return 0;
2768}
2769
2770static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
2771 struct kvm_lapic_state *s)
2772{
2773 vcpu_load(vcpu);
2774 memcpy(s->regs, vcpu->apic->regs, sizeof *s);
2775 vcpu_put(vcpu);
2776
2777 return 0;
2778}
2779
2780static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
2781 struct kvm_lapic_state *s)
2782{
2783 vcpu_load(vcpu);
2784 memcpy(vcpu->apic->regs, s->regs, sizeof *s);
2785 kvm_apic_post_state_restore(vcpu);
2786 vcpu_put(vcpu);
2787
2788 return 0;
2789}
2790
2791static long kvm_vcpu_ioctl(struct file *filp,
2792 unsigned int ioctl, unsigned long arg)
2793{
2794 struct kvm_vcpu *vcpu = filp->private_data;
2795 void __user *argp = (void __user *)arg;
2796 int r = -EINVAL;
2797
2798 switch (ioctl) {
2799 case KVM_RUN:
2800 r = -EINVAL;
2801 if (arg)
2802 goto out;
2803 r = kvm_vcpu_ioctl_run(vcpu, vcpu->run);
2804 break;
2805 case KVM_GET_REGS: {
2806 struct kvm_regs kvm_regs;
2807
2808 memset(&kvm_regs, 0, sizeof kvm_regs);
2809 r = kvm_vcpu_ioctl_get_regs(vcpu, &kvm_regs);
2810 if (r)
2811 goto out;
2812 r = -EFAULT;
2813 if (copy_to_user(argp, &kvm_regs, sizeof kvm_regs))
2814 goto out;
2815 r = 0;
2816 break;
2817 }
2818 case KVM_SET_REGS: {
2819 struct kvm_regs kvm_regs;
2820
2821 r = -EFAULT;
2822 if (copy_from_user(&kvm_regs, argp, sizeof kvm_regs))
2823 goto out;
2824 r = kvm_vcpu_ioctl_set_regs(vcpu, &kvm_regs);
2825 if (r)
2826 goto out;
2827 r = 0;
2828 break;
2829 }
2830 case KVM_GET_SREGS: {
2831 struct kvm_sregs kvm_sregs;
2832
2833 memset(&kvm_sregs, 0, sizeof kvm_sregs);
2834 r = kvm_vcpu_ioctl_get_sregs(vcpu, &kvm_sregs);
2835 if (r)
2836 goto out;
2837 r = -EFAULT;
2838 if (copy_to_user(argp, &kvm_sregs, sizeof kvm_sregs))
2839 goto out;
2840 r = 0;
2841 break;
2842 }
2843 case KVM_SET_SREGS: {
2844 struct kvm_sregs kvm_sregs;
2845
2846 r = -EFAULT;
2847 if (copy_from_user(&kvm_sregs, argp, sizeof kvm_sregs))
2848 goto out;
2849 r = kvm_vcpu_ioctl_set_sregs(vcpu, &kvm_sregs);
2850 if (r)
2851 goto out;
2852 r = 0;
2853 break;
2854 }
2855 case KVM_TRANSLATE: {
2856 struct kvm_translation tr;
2857
2858 r = -EFAULT;
2859 if (copy_from_user(&tr, argp, sizeof tr))
2860 goto out;
2861 r = kvm_vcpu_ioctl_translate(vcpu, &tr);
2862 if (r)
2863 goto out;
2864 r = -EFAULT;
2865 if (copy_to_user(argp, &tr, sizeof tr))
2866 goto out;
2867 r = 0;
2868 break;
2869 }
2870 case KVM_INTERRUPT: {
2871 struct kvm_interrupt irq;
2872
2873 r = -EFAULT;
2874 if (copy_from_user(&irq, argp, sizeof irq))
2875 goto out;
2876 r = kvm_vcpu_ioctl_interrupt(vcpu, &irq);
2877 if (r)
2878 goto out;
2879 r = 0;
2880 break;
2881 }
2882 case KVM_DEBUG_GUEST: {
2883 struct kvm_debug_guest dbg;
2884
2885 r = -EFAULT;
2886 if (copy_from_user(&dbg, argp, sizeof dbg))
2887 goto out;
2888 r = kvm_vcpu_ioctl_debug_guest(vcpu, &dbg);
2889 if (r)
2890 goto out;
2891 r = 0;
2892 break;
2893 }
2894 case KVM_GET_MSRS:
2895 r = msr_io(vcpu, argp, kvm_get_msr, 1);
2896 break;
2897 case KVM_SET_MSRS:
2898 r = msr_io(vcpu, argp, do_set_msr, 0);
2899 break;
2900 case KVM_SET_CPUID: {
2901 struct kvm_cpuid __user *cpuid_arg = argp;
2902 struct kvm_cpuid cpuid;
2903
2904 r = -EFAULT;
2905 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
2906 goto out;
2907 r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries);
2908 if (r)
2909 goto out;
2910 break;
2911 }
2912 case KVM_SET_SIGNAL_MASK: {
2913 struct kvm_signal_mask __user *sigmask_arg = argp;
2914 struct kvm_signal_mask kvm_sigmask;
2915 sigset_t sigset, *p;
2916
2917 p = NULL;
2918 if (argp) {
2919 r = -EFAULT;
2920 if (copy_from_user(&kvm_sigmask, argp,
2921 sizeof kvm_sigmask))
2922 goto out;
2923 r = -EINVAL;
2924 if (kvm_sigmask.len != sizeof sigset)
2925 goto out;
2926 r = -EFAULT;
2927 if (copy_from_user(&sigset, sigmask_arg->sigset,
2928 sizeof sigset))
2929 goto out;
2930 p = &sigset;
2931 }
2932		r = kvm_vcpu_ioctl_set_sigmask(vcpu, p);
2933 break;
2934 }
2935 case KVM_GET_FPU: {
2936 struct kvm_fpu fpu;
2937
2938 memset(&fpu, 0, sizeof fpu);
2939 r = kvm_vcpu_ioctl_get_fpu(vcpu, &fpu);
2940 if (r)
2941 goto out;
2942 r = -EFAULT;
2943 if (copy_to_user(argp, &fpu, sizeof fpu))
2944 goto out;
2945 r = 0;
2946 break;
2947 }
2948 case KVM_SET_FPU: {
2949 struct kvm_fpu fpu;
2950
2951 r = -EFAULT;
2952 if (copy_from_user(&fpu, argp, sizeof fpu))
2953 goto out;
2954 r = kvm_vcpu_ioctl_set_fpu(vcpu, &fpu);
2955 if (r)
2956 goto out;
2957 r = 0;
2958 break;
2959 }
2960 case KVM_GET_LAPIC: {
2961 struct kvm_lapic_state lapic;
2962
2963 memset(&lapic, 0, sizeof lapic);
2964 r = kvm_vcpu_ioctl_get_lapic(vcpu, &lapic);
2965 if (r)
2966 goto out;
2967 r = -EFAULT;
2968 if (copy_to_user(argp, &lapic, sizeof lapic))
2969 goto out;
2970 r = 0;
2971 break;
2972 }
2973 case KVM_SET_LAPIC: {
2974 struct kvm_lapic_state lapic;
2975
2976 r = -EFAULT;
2977 if (copy_from_user(&lapic, argp, sizeof lapic))
2978 goto out;
2979		r = kvm_vcpu_ioctl_set_lapic(vcpu, &lapic);
2980 if (r)
2981 goto out;
2982 r = 0;
2983 break;
2984 }
2985 default:
2986 ;
2987 }
2988out:
2989 return r;
2990}
2991
2992static long kvm_vm_ioctl(struct file *filp,
2993 unsigned int ioctl, unsigned long arg)
2994{
2995 struct kvm *kvm = filp->private_data;
2996 void __user *argp = (void __user *)arg;
2997 int r = -EINVAL;
2998
2999 switch (ioctl) {
3000 case KVM_CREATE_VCPU:
3001 r = kvm_vm_ioctl_create_vcpu(kvm, arg);
3002 if (r < 0)
3003 goto out;
3004 break;
3005 case KVM_SET_MEMORY_REGION: {
3006 struct kvm_memory_region kvm_mem;
3007
3008 r = -EFAULT;
3009 if (copy_from_user(&kvm_mem, argp, sizeof kvm_mem))
3010 goto out;
3011 r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_mem);
3012 if (r)
3013 goto out;
3014 break;
3015 }
3016 case KVM_GET_DIRTY_LOG: {
3017 struct kvm_dirty_log log;
3018
3019 r = -EFAULT;
3020 if (copy_from_user(&log, argp, sizeof log))
3021 goto out;
3022 r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
3023 if (r)
3024 goto out;
3025 break;
3026 }
3027 case KVM_SET_MEMORY_ALIAS: {
3028 struct kvm_memory_alias alias;
3029
3030 r = -EFAULT;
3031 if (copy_from_user(&alias, argp, sizeof alias))
3032 goto out;
3033 r = kvm_vm_ioctl_set_memory_alias(kvm, &alias);
3034 if (r)
3035 goto out;
3036 break;
3037 }
3038 case KVM_CREATE_IRQCHIP:
3039 r = -ENOMEM;
3040 kvm->vpic = kvm_create_pic(kvm);
3041 if (kvm->vpic) {
3042 r = kvm_ioapic_init(kvm);
3043 if (r) {
3044 kfree(kvm->vpic);
3045 kvm->vpic = NULL;
3046 goto out;
3047 }
3048 }
3049 else
3050 goto out;
3051 break;
3052 case KVM_IRQ_LINE: {
3053 struct kvm_irq_level irq_event;
3054
3055 r = -EFAULT;
3056 if (copy_from_user(&irq_event, argp, sizeof irq_event))
3057 goto out;
3058 if (irqchip_in_kernel(kvm)) {
3059 mutex_lock(&kvm->lock);
3060 if (irq_event.irq < 16)
3061 kvm_pic_set_irq(pic_irqchip(kvm),
3062 irq_event.irq,
3063 irq_event.level);
3064 kvm_ioapic_set_irq(kvm->vioapic,
3065 irq_event.irq,
3066 irq_event.level);
3067 mutex_unlock(&kvm->lock);
3068 r = 0;
3069 }
3070 break;
3071 }
3072 case KVM_GET_IRQCHIP: {
3073 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
3074 struct kvm_irqchip chip;
3075
3076 r = -EFAULT;
3077 if (copy_from_user(&chip, argp, sizeof chip))
3078 goto out;
3079 r = -ENXIO;
3080 if (!irqchip_in_kernel(kvm))
3081 goto out;
3082 r = kvm_vm_ioctl_get_irqchip(kvm, &chip);
3083 if (r)
3084 goto out;
3085 r = -EFAULT;
3086 if (copy_to_user(argp, &chip, sizeof chip))
3087 goto out;
3088 r = 0;
3089 break;
3090 }
3091 case KVM_SET_IRQCHIP: {
3092 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
3093 struct kvm_irqchip chip;
3094
3095 r = -EFAULT;
3096 if (copy_from_user(&chip, argp, sizeof chip))
3097 goto out;
3098 r = -ENXIO;
3099 if (!irqchip_in_kernel(kvm))
3100 goto out;
3101 r = kvm_vm_ioctl_set_irqchip(kvm, &chip);
3102 if (r)
3103 goto out;
3104 r = 0;
3105 break;
3106 }
3107 default:
3108 ;
3109 }
3110out:
3111 return r;
3112}
3113
3114static struct page *kvm_vm_nopage(struct vm_area_struct *vma,
3115 unsigned long address,
3116 int *type)
3117{
3118 struct kvm *kvm = vma->vm_file->private_data;
3119 unsigned long pgoff;
3120 struct page *page;
3121
3122 pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
3123 page = gfn_to_page(kvm, pgoff);
3124 if (!page)
3125 return NOPAGE_SIGBUS;
3126 get_page(page);
3127 if (type != NULL)
3128 *type = VM_FAULT_MINOR;
3129
3130 return page;
3131}
3132
3133static struct vm_operations_struct kvm_vm_vm_ops = {
3134 .nopage = kvm_vm_nopage,
3135};
3136
3137static int kvm_vm_mmap(struct file *file, struct vm_area_struct *vma)
3138{
3139 vma->vm_ops = &kvm_vm_vm_ops;
3140 return 0;
3141}
3142
3143static struct file_operations kvm_vm_fops = {
3144 .release = kvm_vm_release,
3145 .unlocked_ioctl = kvm_vm_ioctl,
3146 .compat_ioctl = kvm_vm_ioctl,
3147 .mmap = kvm_vm_mmap,
3148};
3149
3150static int kvm_dev_ioctl_create_vm(void)
3151{
3152 int fd, r;
3153 struct inode *inode;
3154 struct file *file;
3155 struct kvm *kvm;
3156
3157 kvm = kvm_create_vm();
3158 if (IS_ERR(kvm))
3159 return PTR_ERR(kvm);
3160 r = anon_inode_getfd(&fd, &inode, &file, "kvm-vm", &kvm_vm_fops, kvm);
3161 if (r) {
3162 kvm_destroy_vm(kvm);
3163 return r;
3164 }
3165
3166 kvm->filp = file;
3167
3168 return fd;
3169}
3170
3171static long kvm_dev_ioctl(struct file *filp,
3172 unsigned int ioctl, unsigned long arg)
3173{
3174 void __user *argp = (void __user *)arg;
3175 long r = -EINVAL;
3176
3177 switch (ioctl) {
3178 case KVM_GET_API_VERSION:
3179 r = -EINVAL;
3180 if (arg)
3181 goto out;
3182 r = KVM_API_VERSION;
3183 break;
3184 case KVM_CREATE_VM:
3185 r = -EINVAL;
3186 if (arg)
3187 goto out;
3188 r = kvm_dev_ioctl_create_vm();
3189 break;
3190 case KVM_GET_MSR_INDEX_LIST: {
3191 struct kvm_msr_list __user *user_msr_list = argp;
3192 struct kvm_msr_list msr_list;
3193 unsigned n;
3194
3195 r = -EFAULT;
3196 if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list))
3197 goto out;
3198 n = msr_list.nmsrs;
3199 msr_list.nmsrs = num_msrs_to_save + ARRAY_SIZE(emulated_msrs);
3200 if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list))
3201 goto out;
3202 r = -E2BIG;
3203 if (n < num_msrs_to_save)
3204 goto out;
3205 r = -EFAULT;
3206 if (copy_to_user(user_msr_list->indices, &msrs_to_save,
3207 num_msrs_to_save * sizeof(u32)))
3208 goto out;
3209 if (copy_to_user(user_msr_list->indices
3210 + num_msrs_to_save * sizeof(u32),
3211 &emulated_msrs,
3212 ARRAY_SIZE(emulated_msrs) * sizeof(u32)))
3213 goto out;
3214 r = 0;
3215 break;
3216 }
3217 case KVM_CHECK_EXTENSION: {
3218 int ext = (long)argp;
3219
3220 switch (ext) {
3221 case KVM_CAP_IRQCHIP:
3222 case KVM_CAP_HLT:
3223 r = 1;
3224 break;
3225 default:
3226 r = 0;
3227 break;
3228 }
3229 break;
3230 }
3231 case KVM_GET_VCPU_MMAP_SIZE:
3232 r = -EINVAL;
3233 if (arg)
3234 goto out;
3235 r = 2 * PAGE_SIZE;
3236 break;
3237 default:
3238 ;
3239 }
3240out:
3241 return r;
3242}
3243
3244static struct file_operations kvm_chardev_ops = {
3245 .unlocked_ioctl = kvm_dev_ioctl,
3246 .compat_ioctl = kvm_dev_ioctl,
3247};
3248
3249static struct miscdevice kvm_dev = {
3250 KVM_MINOR,
3251 "kvm",
3252 &kvm_chardev_ops,
3253};
3254
3255/*
3256 * Make sure that a cpu that is being hot-unplugged does not have any vcpus
3257 * cached on it.
3258 */
3259static void decache_vcpus_on_cpu(int cpu)
3260{
3261 struct kvm *vm;
3262 struct kvm_vcpu *vcpu;
3263 int i;
3264
3265 spin_lock(&kvm_lock);
3266 list_for_each_entry(vm, &vm_list, vm_list)
3267 for (i = 0; i < KVM_MAX_VCPUS; ++i) {
3268 vcpu = vm->vcpus[i];
3269 if (!vcpu)
3270 continue;
3271 /*
3272 * If the vcpu is locked, then it is running on some
3273 * other cpu and therefore it is not cached on the
3274 * cpu in question.
3275 *
3276 * If it's not locked, check the last cpu it executed
3277 * on.
3278 */
3279 if (mutex_trylock(&vcpu->mutex)) {
3280 if (vcpu->cpu == cpu) {
3281 kvm_x86_ops->vcpu_decache(vcpu);
3282 vcpu->cpu = -1;
3283 }
3284 mutex_unlock(&vcpu->mutex);
3285 }
3286 }
3287 spin_unlock(&kvm_lock);
3288}
3289
3290static void hardware_enable(void *junk)
3291{
3292 int cpu = raw_smp_processor_id();
3293
3294 if (cpu_isset(cpu, cpus_hardware_enabled))
3295 return;
3296 cpu_set(cpu, cpus_hardware_enabled);
3297 kvm_x86_ops->hardware_enable(NULL);
3298}
3299
3300static void hardware_disable(void *junk)
3301{
3302 int cpu = raw_smp_processor_id();
3303
3304 if (!cpu_isset(cpu, cpus_hardware_enabled))
3305 return;
3306 cpu_clear(cpu, cpus_hardware_enabled);
3307 decache_vcpus_on_cpu(cpu);
3308 kvm_x86_ops->hardware_disable(NULL);
3309}
3310
3311static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val,
3312 void *v)
3313{
3314 int cpu = (long)v;
3315
3316 switch (val) {
3317 case CPU_DYING:
3318 case CPU_DYING_FROZEN:
3319 printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n",
3320 cpu);
3321 hardware_disable(NULL);
3322 break;
3323 case CPU_UP_CANCELED:
3324 case CPU_UP_CANCELED_FROZEN:
3325 printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n",
3326 cpu);
3327 smp_call_function_single(cpu, hardware_disable, NULL, 0, 1);
3328 break;
3329 case CPU_ONLINE:
3330 case CPU_ONLINE_FROZEN:
3331 printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n",
3332 cpu);
3333 smp_call_function_single(cpu, hardware_enable, NULL, 0, 1);
3334 break;
3335 }
3336 return NOTIFY_OK;
3337}
3338
3339static int kvm_reboot(struct notifier_block *notifier, unsigned long val,
3340 void *v)
3341{
3342 if (val == SYS_RESTART) {
3343 /*
3344 * Some (well, at least mine) BIOSes hang on reboot if
3345 * in vmx root mode.
3346 */
3347 printk(KERN_INFO "kvm: exiting hardware virtualization\n");
3348 on_each_cpu(hardware_disable, NULL, 0, 1);
3349 }
3350 return NOTIFY_OK;
3351}
3352
3353static struct notifier_block kvm_reboot_notifier = {
3354 .notifier_call = kvm_reboot,
3355 .priority = 0,
3356};
3357
3358void kvm_io_bus_init(struct kvm_io_bus *bus)
3359{
3360 memset(bus, 0, sizeof(*bus));
3361}
3362
3363void kvm_io_bus_destroy(struct kvm_io_bus *bus)
3364{
3365 int i;
3366
3367 for (i = 0; i < bus->dev_count; i++) {
3368 struct kvm_io_device *pos = bus->devs[i];
3369
3370 kvm_iodevice_destructor(pos);
3371 }
3372}
3373
3374struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus, gpa_t addr)
3375{
3376 int i;
3377
3378 for (i = 0; i < bus->dev_count; i++) {
3379 struct kvm_io_device *pos = bus->devs[i];
3380
3381 if (pos->in_range(pos, addr))
3382 return pos;
3383 }
3384
3385 return NULL;
3386}
3387
3388void kvm_io_bus_register_dev(struct kvm_io_bus *bus, struct kvm_io_device *dev)
3389{
3390 BUG_ON(bus->dev_count > (NR_IOBUS_DEVS-1));
3391
3392 bus->devs[bus->dev_count++] = dev;
3393}
3394
3395static struct notifier_block kvm_cpu_notifier = {
3396 .notifier_call = kvm_cpu_hotplug,
3397 .priority = 20, /* must be > scheduler priority */
3398};
3399
3400static u64 stat_get(void *_offset)
3401{
3402 unsigned offset = (long)_offset;
3403 u64 total = 0;
3404 struct kvm *kvm;
3405 struct kvm_vcpu *vcpu;
3406 int i;
3407
3408 spin_lock(&kvm_lock);
3409 list_for_each_entry(kvm, &vm_list, vm_list)
3410 for (i = 0; i < KVM_MAX_VCPUS; ++i) {
3411 vcpu = kvm->vcpus[i];
3412 if (vcpu)
3413 total += *(u32 *)((void *)vcpu + offset);
3414 }
3415 spin_unlock(&kvm_lock);
3416 return total;
3417}
3418
3419DEFINE_SIMPLE_ATTRIBUTE(stat_fops, stat_get, NULL, "%llu\n");
3420
3421static __init void kvm_init_debug(void)
3422{
3423 struct kvm_stats_debugfs_item *p;
3424
3425 debugfs_dir = debugfs_create_dir("kvm", NULL);
3426 for (p = debugfs_entries; p->name; ++p)
3427 p->dentry = debugfs_create_file(p->name, 0444, debugfs_dir,
3428 (void *)(long)p->offset,
3429 &stat_fops);
3430}
3431
3432static void kvm_exit_debug(void)
3433{
3434 struct kvm_stats_debugfs_item *p;
3435
3436 for (p = debugfs_entries; p->name; ++p)
3437 debugfs_remove(p->dentry);
3438 debugfs_remove(debugfs_dir);
3439}
3440
3441static int kvm_suspend(struct sys_device *dev, pm_message_t state)
3442{
3443 hardware_disable(NULL);
3444 return 0;
3445}
3446
3447static int kvm_resume(struct sys_device *dev)
3448{
3449 hardware_enable(NULL);
3450 return 0;
3451}
3452
3453static struct sysdev_class kvm_sysdev_class = {
3454 .name = "kvm",
3455 .suspend = kvm_suspend,
3456 .resume = kvm_resume,
3457};
3458
3459static struct sys_device kvm_sysdev = {
3460 .id = 0,
3461 .cls = &kvm_sysdev_class,
3462};
3463
3464hpa_t bad_page_address;
3465
3466static inline
3467struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn)
3468{
3469 return container_of(pn, struct kvm_vcpu, preempt_notifier);
3470}
3471
3472static void kvm_sched_in(struct preempt_notifier *pn, int cpu)
3473{
3474 struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
3475
3476 kvm_x86_ops->vcpu_load(vcpu, cpu);
3477}
3478
3479static void kvm_sched_out(struct preempt_notifier *pn,
3480 struct task_struct *next)
3481{
3482 struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
3483
3484 kvm_x86_ops->vcpu_put(vcpu);
3485}
3486
3487int kvm_init_x86(struct kvm_x86_ops *ops, unsigned int vcpu_size,
3488 struct module *module)
3489{
3490 int r;
3491 int cpu;
3492
3493 if (kvm_x86_ops) {
3494 printk(KERN_ERR "kvm: already loaded the other module\n");
3495 return -EEXIST;
3496 }
3497
3498 if (!ops->cpu_has_kvm_support()) {
3499 printk(KERN_ERR "kvm: no hardware support\n");
3500 return -EOPNOTSUPP;
3501 }
3502 if (ops->disabled_by_bios()) {
3503 printk(KERN_ERR "kvm: disabled by bios\n");
3504 return -EOPNOTSUPP;
3505 }
3506
3507 kvm_x86_ops = ops;
3508
3509 r = kvm_x86_ops->hardware_setup();
3510 if (r < 0)
3511 goto out;
3512
3513 for_each_online_cpu(cpu) {
3514 smp_call_function_single(cpu,
3515 kvm_x86_ops->check_processor_compatibility,
3516 &r, 0, 1);
3517 if (r < 0)
3518 goto out_free_0;
3519 }
3520
3521 on_each_cpu(hardware_enable, NULL, 0, 1);
3522 r = register_cpu_notifier(&kvm_cpu_notifier);
3523 if (r)
3524 goto out_free_1;
3525 register_reboot_notifier(&kvm_reboot_notifier);
3526
3527 r = sysdev_class_register(&kvm_sysdev_class);
3528 if (r)
3529 goto out_free_2;
3530
3531 r = sysdev_register(&kvm_sysdev);
3532 if (r)
3533 goto out_free_3;
3534
3535 /* A kmem cache lets us meet the alignment requirements of fx_save. */
3536 kvm_vcpu_cache = kmem_cache_create("kvm_vcpu", vcpu_size,
3537 __alignof__(struct kvm_vcpu), 0, 0);
3538 if (!kvm_vcpu_cache) {
3539 r = -ENOMEM;
3540 goto out_free_4;
3541 }
3542
3543 kvm_chardev_ops.owner = module;
3544
3545 r = misc_register(&kvm_dev);
3546 if (r) {
3547		printk(KERN_ERR "kvm: misc device register failed\n");
3548 goto out_free;
3549 }
3550
3551 kvm_preempt_ops.sched_in = kvm_sched_in;
3552 kvm_preempt_ops.sched_out = kvm_sched_out;
3553
3554 return r;
3555
3556out_free:
3557 kmem_cache_destroy(kvm_vcpu_cache);
3558out_free_4:
3559 sysdev_unregister(&kvm_sysdev);
3560out_free_3:
3561 sysdev_class_unregister(&kvm_sysdev_class);
3562out_free_2:
3563 unregister_reboot_notifier(&kvm_reboot_notifier);
3564 unregister_cpu_notifier(&kvm_cpu_notifier);
3565out_free_1:
3566 on_each_cpu(hardware_disable, NULL, 0, 1);
3567out_free_0:
3568 kvm_x86_ops->hardware_unsetup();
3569out:
3570 kvm_x86_ops = NULL;
3571 return r;
3572}
3573
3574void kvm_exit_x86(void)
3575{
3576 misc_deregister(&kvm_dev);
3577 kmem_cache_destroy(kvm_vcpu_cache);
3578 sysdev_unregister(&kvm_sysdev);
3579 sysdev_class_unregister(&kvm_sysdev_class);
3580 unregister_reboot_notifier(&kvm_reboot_notifier);
3581 unregister_cpu_notifier(&kvm_cpu_notifier);
3582 on_each_cpu(hardware_disable, NULL, 0, 1);
3583 kvm_x86_ops->hardware_unsetup();
3584 kvm_x86_ops = NULL;
3585}
3586
3587static __init int kvm_init(void)
3588{
3589 static struct page *bad_page;
3590 int r;
3591
3592 r = kvm_mmu_module_init();
3593 if (r)
3594 goto out4;
3595
3596 kvm_init_debug();
3597
3598 kvm_init_msr_list();
3599
3600 if ((bad_page = alloc_page(GFP_KERNEL)) == NULL) {
3601 r = -ENOMEM;
3602 goto out;
3603 }
3604
3605 bad_page_address = page_to_pfn(bad_page) << PAGE_SHIFT;
3606 memset(__va(bad_page_address), 0, PAGE_SIZE);
3607
3608 return 0;
3609
3610out:
3611 kvm_exit_debug();
3612 kvm_mmu_module_exit();
3613out4:
3614 return r;
3615}
3616
3617static __exit void kvm_exit(void)
3618{
3619 kvm_exit_debug();
3620 __free_page(pfn_to_page(bad_page_address >> PAGE_SHIFT));
3621 kvm_mmu_module_exit();
3622}
3623
3624module_init(kvm_init)
3625module_exit(kvm_exit)
3626
3627EXPORT_SYMBOL_GPL(kvm_init_x86);
3628EXPORT_SYMBOL_GPL(kvm_exit_x86);
diff --git a/drivers/kvm/kvm_svm.h b/drivers/kvm/kvm_svm.h
deleted file mode 100644
index a0e415daef5b..000000000000
--- a/drivers/kvm/kvm_svm.h
+++ /dev/null
@@ -1,45 +0,0 @@
1#ifndef __KVM_SVM_H
2#define __KVM_SVM_H
3
4#include <linux/kernel.h>
5#include <linux/types.h>
6#include <linux/list.h>
7#include <asm/msr.h>
8
9#include "svm.h"
10#include "kvm.h"
11
12static const u32 host_save_user_msrs[] = {
13#ifdef CONFIG_X86_64
14 MSR_STAR, MSR_LSTAR, MSR_CSTAR, MSR_SYSCALL_MASK, MSR_KERNEL_GS_BASE,
15 MSR_FS_BASE,
16#endif
17 MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
18};
19
20#define NR_HOST_SAVE_USER_MSRS ARRAY_SIZE(host_save_user_msrs)
21#define NUM_DB_REGS 4
22
23struct kvm_vcpu;
24
25struct vcpu_svm {
26 struct kvm_vcpu vcpu;
27 struct vmcb *vmcb;
28 unsigned long vmcb_pa;
29 struct svm_cpu_data *svm_data;
30 uint64_t asid_generation;
31
32 unsigned long db_regs[NUM_DB_REGS];
33
34 u64 next_rip;
35
36 u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS];
37 u64 host_gs_base;
38 unsigned long host_cr2;
39 unsigned long host_db_regs[NUM_DB_REGS];
40 unsigned long host_dr6;
41 unsigned long host_dr7;
42};
43
44#endif
45
diff --git a/drivers/kvm/lapic.c b/drivers/kvm/lapic.c
deleted file mode 100644
index 238fcad3cece..000000000000
--- a/drivers/kvm/lapic.c
+++ /dev/null
@@ -1,1080 +0,0 @@
1
2/*
3 * Local APIC virtualization
4 *
5 * Copyright (C) 2006 Qumranet, Inc.
6 * Copyright (C) 2007 Novell
7 * Copyright (C) 2007 Intel
8 *
9 * Authors:
10 * Dor Laor <dor.laor@qumranet.com>
11 * Gregory Haskins <ghaskins@novell.com>
12 * Yaozu (Eddie) Dong <eddie.dong@intel.com>
13 *
14 * Based on Xen 3.1 code, Copyright (c) 2004, Intel Corporation.
15 *
16 * This work is licensed under the terms of the GNU GPL, version 2. See
17 * the COPYING file in the top-level directory.
18 */
19
20#include "kvm.h"
21#include <linux/kvm.h>
22#include <linux/mm.h>
23#include <linux/highmem.h>
24#include <linux/smp.h>
25#include <linux/hrtimer.h>
26#include <linux/io.h>
27#include <linux/module.h>
28#include <asm/processor.h>
29#include <asm/msr.h>
30#include <asm/page.h>
31#include <asm/current.h>
32#include <asm/apicdef.h>
33#include <asm/atomic.h>
34#include <asm/div64.h>
35#include "irq.h"
36
37#define PRId64 "d"
38#define PRIx64 "llx"
39#define PRIu64 "u"
40#define PRIo64 "o"
41
42#define APIC_BUS_CYCLE_NS 1
43
44/* #define apic_debug(fmt,arg...) printk(KERN_WARNING fmt,##arg) */
45#define apic_debug(fmt, arg...)
46
47#define APIC_LVT_NUM 6
48/* 0x14 is the local APIC version on Xeon and Pentium processors; see SDM section 8.4.8 */
49#define APIC_VERSION (0x14UL | ((APIC_LVT_NUM - 1) << 16))
50#define LAPIC_MMIO_LENGTH (1 << 12)
51/* the following defines are not in apicdef.h */
52#define APIC_SHORT_MASK 0xc0000
53#define APIC_DEST_NOSHORT 0x0
54#define APIC_DEST_MASK 0x800
55#define MAX_APIC_VECTOR 256
56
57#define VEC_POS(v) ((v) & (32 - 1))
58#define REG_POS(v) (((v) >> 5) << 4)
59static inline u32 apic_get_reg(struct kvm_lapic *apic, int reg_off)
60{
61 return *((u32 *) (apic->regs + reg_off));
62}
63
64static inline void apic_set_reg(struct kvm_lapic *apic, int reg_off, u32 val)
65{
66 *((u32 *) (apic->regs + reg_off)) = val;
67}
68
69static inline int apic_test_and_set_vector(int vec, void *bitmap)
70{
71 return test_and_set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
72}
73
74static inline int apic_test_and_clear_vector(int vec, void *bitmap)
75{
76 return test_and_clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
77}
78
79static inline void apic_set_vector(int vec, void *bitmap)
80{
81 set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
82}
83
84static inline void apic_clear_vector(int vec, void *bitmap)
85{
86 clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
87}
88
89static inline int apic_hw_enabled(struct kvm_lapic *apic)
90{
91 return (apic)->vcpu->apic_base & MSR_IA32_APICBASE_ENABLE;
92}
93
94static inline int apic_sw_enabled(struct kvm_lapic *apic)
95{
96 return apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_APIC_ENABLED;
97}
98
99static inline int apic_enabled(struct kvm_lapic *apic)
100{
101 return apic_sw_enabled(apic) && apic_hw_enabled(apic);
102}
103
104#define LVT_MASK \
105 (APIC_LVT_MASKED | APIC_SEND_PENDING | APIC_VECTOR_MASK)
106
107#define LINT_MASK \
108 (LVT_MASK | APIC_MODE_MASK | APIC_INPUT_POLARITY | \
109 APIC_LVT_REMOTE_IRR | APIC_LVT_LEVEL_TRIGGER)
110
111static inline int kvm_apic_id(struct kvm_lapic *apic)
112{
113 return (apic_get_reg(apic, APIC_ID) >> 24) & 0xff;
114}
115
116static inline int apic_lvt_enabled(struct kvm_lapic *apic, int lvt_type)
117{
118 return !(apic_get_reg(apic, lvt_type) & APIC_LVT_MASKED);
119}
120
121static inline int apic_lvt_vector(struct kvm_lapic *apic, int lvt_type)
122{
123 return apic_get_reg(apic, lvt_type) & APIC_VECTOR_MASK;
124}
125
126static inline int apic_lvtt_period(struct kvm_lapic *apic)
127{
128 return apic_get_reg(apic, APIC_LVTT) & APIC_LVT_TIMER_PERIODIC;
129}
130
131static unsigned int apic_lvt_mask[APIC_LVT_NUM] = {
132 LVT_MASK | APIC_LVT_TIMER_PERIODIC, /* LVTT */
133 LVT_MASK | APIC_MODE_MASK, /* LVTTHMR */
134 LVT_MASK | APIC_MODE_MASK, /* LVTPC */
135 LINT_MASK, LINT_MASK, /* LVT0-1 */
136 LVT_MASK /* LVTERR */
137};
138
139static int find_highest_vector(void *bitmap)
140{
141 u32 *word = bitmap;
142 int word_offset = MAX_APIC_VECTOR >> 5;
143
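	/*
	 * The vector bitmaps (IRR/ISR/TMR) keep one 32-bit word every 16
	 * bytes, mirroring the APIC register layout; hence the "<< 2"
	 * stride on a u32 pointer.
	 */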
144 while ((word_offset != 0) && (word[(--word_offset) << 2] == 0))
145 continue;
146
147 if (likely(!word_offset && !word[0]))
148 return -1;
149 else
150 return fls(word[word_offset << 2]) - 1 + (word_offset << 5);
151}
152
153static inline int apic_test_and_set_irr(int vec, struct kvm_lapic *apic)
154{
155 return apic_test_and_set_vector(vec, apic->regs + APIC_IRR);
156}
157
158static inline void apic_clear_irr(int vec, struct kvm_lapic *apic)
159{
160 apic_clear_vector(vec, apic->regs + APIC_IRR);
161}
162
163static inline int apic_find_highest_irr(struct kvm_lapic *apic)
164{
165 int result;
166
167 result = find_highest_vector(apic->regs + APIC_IRR);
168 ASSERT(result == -1 || result >= 16);
169
170 return result;
171}
172
173int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu)
174{
175 struct kvm_lapic *apic = (struct kvm_lapic *)vcpu->apic;
176 int highest_irr;
177
178 if (!apic)
179 return 0;
180 highest_irr = apic_find_highest_irr(apic);
181
182 return highest_irr;
183}
184EXPORT_SYMBOL_GPL(kvm_lapic_find_highest_irr);
185
186int kvm_apic_set_irq(struct kvm_lapic *apic, u8 vec, u8 trig)
187{
188 if (!apic_test_and_set_irr(vec, apic)) {
189 /* a new pending irq is set in IRR */
190 if (trig)
191 apic_set_vector(vec, apic->regs + APIC_TMR);
192 else
193 apic_clear_vector(vec, apic->regs + APIC_TMR);
194 kvm_vcpu_kick(apic->vcpu);
195 return 1;
196 }
197 return 0;
198}
199
200static inline int apic_find_highest_isr(struct kvm_lapic *apic)
201{
202 int result;
203
204 result = find_highest_vector(apic->regs + APIC_ISR);
205 ASSERT(result == -1 || result >= 16);
206
207 return result;
208}
209
210static void apic_update_ppr(struct kvm_lapic *apic)
211{
212 u32 tpr, isrv, ppr;
213 int isr;
214
215 tpr = apic_get_reg(apic, APIC_TASKPRI);
216 isr = apic_find_highest_isr(apic);
217 isrv = (isr != -1) ? isr : 0;
218
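	/*
	 * PPR is TPR when TPR's priority class is at least that of the
	 * highest in-service vector; otherwise it is that vector's class
	 * with the sub-class bits cleared.
	 */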
219 if ((tpr & 0xf0) >= (isrv & 0xf0))
220 ppr = tpr & 0xff;
221 else
222 ppr = isrv & 0xf0;
223
224 apic_debug("vlapic %p, ppr 0x%x, isr 0x%x, isrv 0x%x",
225 apic, ppr, isr, isrv);
226
227 apic_set_reg(apic, APIC_PROCPRI, ppr);
228}
229
230static void apic_set_tpr(struct kvm_lapic *apic, u32 tpr)
231{
232 apic_set_reg(apic, APIC_TASKPRI, tpr);
233 apic_update_ppr(apic);
234}
235
236int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest)
237{
238 return kvm_apic_id(apic) == dest;
239}
240
241int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda)
242{
243 int result = 0;
244 u8 logical_id;
245
246 logical_id = GET_APIC_LOGICAL_ID(apic_get_reg(apic, APIC_LDR));
247
248 switch (apic_get_reg(apic, APIC_DFR)) {
249 case APIC_DFR_FLAT:
250 if (logical_id & mda)
251 result = 1;
252 break;
253 case APIC_DFR_CLUSTER:
254 if (((logical_id >> 4) == (mda >> 0x4))
255 && (logical_id & mda & 0xf))
256 result = 1;
257 break;
258 default:
259 printk(KERN_WARNING "Bad DFR vcpu %d: %08x\n",
260 apic->vcpu->vcpu_id, apic_get_reg(apic, APIC_DFR));
261 break;
262 }
263
264 return result;
265}
266
267static int apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
268 int short_hand, int dest, int dest_mode)
269{
270 int result = 0;
271 struct kvm_lapic *target = vcpu->apic;
272
273 apic_debug("target %p, source %p, dest 0x%x, "
274 "dest_mode 0x%x, short_hand 0x%x",
275 target, source, dest, dest_mode, short_hand);
276
277 ASSERT(!target);
278 switch (short_hand) {
279 case APIC_DEST_NOSHORT:
280 if (dest_mode == 0) {
281 /* Physical mode. */
282 if ((dest == 0xFF) || (dest == kvm_apic_id(target)))
283 result = 1;
284 } else
285 /* Logical mode. */
286 result = kvm_apic_match_logical_addr(target, dest);
287 break;
288 case APIC_DEST_SELF:
289 if (target == source)
290 result = 1;
291 break;
292 case APIC_DEST_ALLINC:
293 result = 1;
294 break;
295 case APIC_DEST_ALLBUT:
296 if (target != source)
297 result = 1;
298 break;
299 default:
300 printk(KERN_WARNING "Bad dest shorthand value %x\n",
301 short_hand);
302 break;
303 }
304
305 return result;
306}
307
308/*
309 * Add a pending IRQ into lapic.
310 * Return 1 if successfully added and 0 if discarded.
311 */
312static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
313 int vector, int level, int trig_mode)
314{
315 int orig_irr, result = 0;
316 struct kvm_vcpu *vcpu = apic->vcpu;
317
318 switch (delivery_mode) {
319 case APIC_DM_FIXED:
320 case APIC_DM_LOWEST:
321 /* FIXME add logic for vcpu on reset */
322 if (unlikely(!apic_enabled(apic)))
323 break;
324
325 orig_irr = apic_test_and_set_irr(vector, apic);
326 if (orig_irr && trig_mode) {
327 apic_debug("level trig mode repeatedly for vector %d",
328 vector);
329 break;
330 }
331
332 if (trig_mode) {
333 apic_debug("level trig mode for vector %d", vector);
334 apic_set_vector(vector, apic->regs + APIC_TMR);
335 } else
336 apic_clear_vector(vector, apic->regs + APIC_TMR);
337
338 if (vcpu->mp_state == VCPU_MP_STATE_RUNNABLE)
339 kvm_vcpu_kick(vcpu);
340 else if (vcpu->mp_state == VCPU_MP_STATE_HALTED) {
341 vcpu->mp_state = VCPU_MP_STATE_RUNNABLE;
342 if (waitqueue_active(&vcpu->wq))
343 wake_up_interruptible(&vcpu->wq);
344 }
345
346 result = (orig_irr == 0);
347 break;
348
349 case APIC_DM_REMRD:
350 printk(KERN_DEBUG "Ignoring delivery mode 3\n");
351 break;
352
353 case APIC_DM_SMI:
354 printk(KERN_DEBUG "Ignoring guest SMI\n");
355 break;
356 case APIC_DM_NMI:
357 printk(KERN_DEBUG "Ignoring guest NMI\n");
358 break;
359
360 case APIC_DM_INIT:
361 if (level) {
362 if (vcpu->mp_state == VCPU_MP_STATE_RUNNABLE)
363 printk(KERN_DEBUG
364 "INIT on a runnable vcpu %d\n",
365 vcpu->vcpu_id);
366 vcpu->mp_state = VCPU_MP_STATE_INIT_RECEIVED;
367 kvm_vcpu_kick(vcpu);
368 } else {
369 printk(KERN_DEBUG
370 "Ignoring de-assert INIT to vcpu %d\n",
371 vcpu->vcpu_id);
372 }
373
374 break;
375
376 case APIC_DM_STARTUP:
377 printk(KERN_DEBUG "SIPI to vcpu %d vector 0x%02x\n",
378 vcpu->vcpu_id, vector);
379 if (vcpu->mp_state == VCPU_MP_STATE_INIT_RECEIVED) {
380 vcpu->sipi_vector = vector;
381 vcpu->mp_state = VCPU_MP_STATE_SIPI_RECEIVED;
382 if (waitqueue_active(&vcpu->wq))
383 wake_up_interruptible(&vcpu->wq);
384 }
385 break;
386
387 default:
388 printk(KERN_ERR "TODO: unsupported delivery mode %x\n",
389 delivery_mode);
390 break;
391 }
392 return result;
393}
394
395struct kvm_lapic *kvm_apic_round_robin(struct kvm *kvm, u8 vector,
396 unsigned long bitmap)
397{
398 int vcpu_id;
399 int last;
400 int next;
401 struct kvm_lapic *apic;
402
403 last = kvm->round_robin_prev_vcpu;
404 next = last;
405
406 do {
407 if (++next == KVM_MAX_VCPUS)
408 next = 0;
409 if (kvm->vcpus[next] == NULL || !test_bit(next, &bitmap))
410 continue;
411 apic = kvm->vcpus[next]->apic;
412 if (apic && apic_enabled(apic))
413 break;
414 apic = NULL;
415 } while (next != last);
416 kvm->round_robin_prev_vcpu = next;
417
418 if (!apic) {
419 vcpu_id = ffs(bitmap) - 1;
420 if (vcpu_id < 0) {
421 vcpu_id = 0;
422 printk(KERN_DEBUG "vcpu not ready for apic_round_robin\n");
423 }
424 apic = kvm->vcpus[vcpu_id]->apic;
425 }
426
427 return apic;
428}
429
430static void apic_set_eoi(struct kvm_lapic *apic)
431{
432 int vector = apic_find_highest_isr(apic);
433
434 /*
435	 * Not every EOI write has a corresponding bit set in the ISR;
436	 * one example is when the kernel checks the timer during setup_IO_APIC.
437 */
438 if (vector == -1)
439 return;
440
441 apic_clear_vector(vector, apic->regs + APIC_ISR);
442 apic_update_ppr(apic);
443
444 if (apic_test_and_clear_vector(vector, apic->regs + APIC_TMR))
445 kvm_ioapic_update_eoi(apic->vcpu->kvm, vector);
446}
447
448static void apic_send_ipi(struct kvm_lapic *apic)
449{
450 u32 icr_low = apic_get_reg(apic, APIC_ICR);
451 u32 icr_high = apic_get_reg(apic, APIC_ICR2);
452
453 unsigned int dest = GET_APIC_DEST_FIELD(icr_high);
454 unsigned int short_hand = icr_low & APIC_SHORT_MASK;
455 unsigned int trig_mode = icr_low & APIC_INT_LEVELTRIG;
456 unsigned int level = icr_low & APIC_INT_ASSERT;
457 unsigned int dest_mode = icr_low & APIC_DEST_MASK;
458 unsigned int delivery_mode = icr_low & APIC_MODE_MASK;
459 unsigned int vector = icr_low & APIC_VECTOR_MASK;
460
461 struct kvm_lapic *target;
462 struct kvm_vcpu *vcpu;
463 unsigned long lpr_map = 0;
464 int i;
465
466 apic_debug("icr_high 0x%x, icr_low 0x%x, "
467 "short_hand 0x%x, dest 0x%x, trig_mode 0x%x, level 0x%x, "
468 "dest_mode 0x%x, delivery_mode 0x%x, vector 0x%x\n",
469 icr_high, icr_low, short_hand, dest,
470 trig_mode, level, dest_mode, delivery_mode, vector);
471
472 for (i = 0; i < KVM_MAX_VCPUS; i++) {
473 vcpu = apic->vcpu->kvm->vcpus[i];
474 if (!vcpu)
475 continue;
476
477 if (vcpu->apic &&
478 apic_match_dest(vcpu, apic, short_hand, dest, dest_mode)) {
479 if (delivery_mode == APIC_DM_LOWEST)
480 set_bit(vcpu->vcpu_id, &lpr_map);
481 else
482 __apic_accept_irq(vcpu->apic, delivery_mode,
483 vector, level, trig_mode);
484 }
485 }
486
487 if (delivery_mode == APIC_DM_LOWEST) {
488		target = kvm_apic_round_robin(apic->vcpu->kvm, vector, lpr_map);
489 if (target != NULL)
490 __apic_accept_irq(target, delivery_mode,
491 vector, level, trig_mode);
492 }
493}
494
495static u32 apic_get_tmcct(struct kvm_lapic *apic)
496{
497 u64 counter_passed;
498 ktime_t passed, now;
499 u32 tmcct;
500
501 ASSERT(apic != NULL);
502
503 now = apic->timer.dev.base->get_time();
504 tmcct = apic_get_reg(apic, APIC_TMICT);
505
506 /* if initial count is 0, current count should also be 0 */
507 if (tmcct == 0)
508 return 0;
509
510 if (unlikely(ktime_to_ns(now) <=
511 ktime_to_ns(apic->timer.last_update))) {
512 /* Wrap around */
513 passed = ktime_add(( {
514 (ktime_t) {
515 .tv64 = KTIME_MAX -
516 (apic->timer.last_update).tv64}; }
517 ), now);
518 apic_debug("time elapsed\n");
519 } else
520 passed = ktime_sub(now, apic->timer.last_update);
521
522 counter_passed = div64_64(ktime_to_ns(passed),
523 (APIC_BUS_CYCLE_NS * apic->timer.divide_count));
524
525 if (counter_passed > tmcct) {
526 if (unlikely(!apic_lvtt_period(apic))) {
527 /* one-shot timers stick at 0 until reset */
528 tmcct = 0;
529 } else {
530 /*
531 * periodic timers reset to APIC_TMICT when they
532 * hit 0. The while loop simulates this happening N
533 * times. (counter_passed %= tmcct) would also work,
534			 * but a 64-bit modulo might be slower or unavailable on 32-bit hosts.
535 */
536 while (counter_passed > tmcct)
537 counter_passed -= tmcct;
538 tmcct -= counter_passed;
539 }
540 } else {
541 tmcct -= counter_passed;
542 }
543
544 return tmcct;
545}
546
547static u32 __apic_read(struct kvm_lapic *apic, unsigned int offset)
548{
549 u32 val = 0;
550
551 if (offset >= LAPIC_MMIO_LENGTH)
552 return 0;
553
554 switch (offset) {
555 case APIC_ARBPRI:
556 printk(KERN_WARNING "Access APIC ARBPRI register "
557 "which is for P6\n");
558 break;
559
560 case APIC_TMCCT: /* Timer CCR */
561 val = apic_get_tmcct(apic);
562 break;
563
564 default:
565 apic_update_ppr(apic);
566 val = apic_get_reg(apic, offset);
567 break;
568 }
569
570 return val;
571}
572
573static void apic_mmio_read(struct kvm_io_device *this,
574 gpa_t address, int len, void *data)
575{
576 struct kvm_lapic *apic = (struct kvm_lapic *)this->private;
577 unsigned int offset = address - apic->base_address;
578 unsigned char alignment = offset & 0xf;
579 u32 result;
580
581 if ((alignment + len) > 4) {
582		printk(KERN_ERR "KVM_APIC_READ: alignment error %lx %d\n",
583 (unsigned long)address, len);
584 return;
585 }
586 result = __apic_read(apic, offset & ~0xf);
587
588 switch (len) {
589 case 1:
590 case 2:
591 case 4:
592 memcpy(data, (char *)&result + alignment, len);
593 break;
594 default:
595 printk(KERN_ERR "Local APIC read with len = %x, "
596 "should be 1,2, or 4 instead\n", len);
597 break;
598 }
599}
600
601static void update_divide_count(struct kvm_lapic *apic)
602{
603 u32 tmp1, tmp2, tdcr;
604
605 tdcr = apic_get_reg(apic, APIC_TDCR);
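	/*
	 * TDCR bits 0-1 and 3 encode the divider: divide_count is
	 * 2^(value + 1), wrapping to divide-by-1 when all three bits are set.
	 */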
606 tmp1 = tdcr & 0xf;
607 tmp2 = ((tmp1 & 0x3) | ((tmp1 & 0x8) >> 1)) + 1;
608 apic->timer.divide_count = 0x1 << (tmp2 & 0x7);
609
610 apic_debug("timer divide count is 0x%x\n",
611 apic->timer.divide_count);
612}
613
614static void start_apic_timer(struct kvm_lapic *apic)
615{
616 ktime_t now = apic->timer.dev.base->get_time();
617
618 apic->timer.last_update = now;
619
620 apic->timer.period = apic_get_reg(apic, APIC_TMICT) *
621 APIC_BUS_CYCLE_NS * apic->timer.divide_count;
622 atomic_set(&apic->timer.pending, 0);
623 hrtimer_start(&apic->timer.dev,
624 ktime_add_ns(now, apic->timer.period),
625 HRTIMER_MODE_ABS);
626
627 apic_debug("%s: bus cycle is %" PRId64 "ns, now 0x%016"
628 PRIx64 ", "
629 "timer initial count 0x%x, period %lldns, "
630 "expire @ 0x%016" PRIx64 ".\n", __FUNCTION__,
631 APIC_BUS_CYCLE_NS, ktime_to_ns(now),
632 apic_get_reg(apic, APIC_TMICT),
633 apic->timer.period,
634 ktime_to_ns(ktime_add_ns(now,
635 apic->timer.period)));
636}
637
638static void apic_mmio_write(struct kvm_io_device *this,
639 gpa_t address, int len, const void *data)
640{
641 struct kvm_lapic *apic = (struct kvm_lapic *)this->private;
642 unsigned int offset = address - apic->base_address;
643 unsigned char alignment = offset & 0xf;
644 u32 val;
645
646 /*
647	 * APIC registers must be aligned on a 128-bit boundary.
648	 * 32/64/128-bit registers must be accessed through 32-bit accesses.
649	 * See SDM section 8.4.1.
650 */
651 if (len != 4 || alignment) {
652 if (printk_ratelimit())
653 printk(KERN_ERR "apic write: bad size=%d %lx\n",
654 len, (long)address);
655 return;
656 }
657
658 val = *(u32 *) data;
659
660	/* EOI writes are too common to be worth logging */
661 if (offset != APIC_EOI)
662 apic_debug("%s: offset 0x%x with length 0x%x, and value is "
663 "0x%x\n", __FUNCTION__, offset, len, val);
664
665 offset &= 0xff0;
666
667 switch (offset) {
668 case APIC_ID: /* Local APIC ID */
669 apic_set_reg(apic, APIC_ID, val);
670 break;
671
672 case APIC_TASKPRI:
673 apic_set_tpr(apic, val & 0xff);
674 break;
675
676 case APIC_EOI:
677 apic_set_eoi(apic);
678 break;
679
680 case APIC_LDR:
681 apic_set_reg(apic, APIC_LDR, val & APIC_LDR_MASK);
682 break;
683
684 case APIC_DFR:
685 apic_set_reg(apic, APIC_DFR, val | 0x0FFFFFFF);
686 break;
687
688 case APIC_SPIV:
689 apic_set_reg(apic, APIC_SPIV, val & 0x3ff);
690 if (!(val & APIC_SPIV_APIC_ENABLED)) {
691 int i;
692 u32 lvt_val;
693
694 for (i = 0; i < APIC_LVT_NUM; i++) {
695 lvt_val = apic_get_reg(apic,
696 APIC_LVTT + 0x10 * i);
697 apic_set_reg(apic, APIC_LVTT + 0x10 * i,
698 lvt_val | APIC_LVT_MASKED);
699 }
700 atomic_set(&apic->timer.pending, 0);
701
702 }
703 break;
704
705 case APIC_ICR:
706 /* No delay here, so we always clear the pending bit */
707 apic_set_reg(apic, APIC_ICR, val & ~(1 << 12));
708 apic_send_ipi(apic);
709 break;
710
711 case APIC_ICR2:
712 apic_set_reg(apic, APIC_ICR2, val & 0xff000000);
713 break;
714
715 case APIC_LVTT:
716 case APIC_LVTTHMR:
717 case APIC_LVTPC:
718 case APIC_LVT0:
719 case APIC_LVT1:
720 case APIC_LVTERR:
721 /* TODO: Check vector */
722 if (!apic_sw_enabled(apic))
723 val |= APIC_LVT_MASKED;
724
725 val &= apic_lvt_mask[(offset - APIC_LVTT) >> 4];
726 apic_set_reg(apic, offset, val);
727
728 break;
729
730 case APIC_TMICT:
731 hrtimer_cancel(&apic->timer.dev);
732 apic_set_reg(apic, APIC_TMICT, val);
733 start_apic_timer(apic);
734 return;
735
736 case APIC_TDCR:
737 if (val & 4)
738 printk(KERN_ERR "KVM_WRITE:TDCR %x\n", val);
739 apic_set_reg(apic, APIC_TDCR, val);
740 update_divide_count(apic);
741 break;
742
743 default:
744 apic_debug("Local APIC Write to read-only register %x\n",
745 offset);
746 break;
747 }
748
749}
750
751static int apic_mmio_range(struct kvm_io_device *this, gpa_t addr)
752{
753 struct kvm_lapic *apic = (struct kvm_lapic *)this->private;
754 int ret = 0;
755
756
757 if (apic_hw_enabled(apic) &&
758 (addr >= apic->base_address) &&
759 (addr < (apic->base_address + LAPIC_MMIO_LENGTH)))
760 ret = 1;
761
762 return ret;
763}
764
765void kvm_free_apic(struct kvm_lapic *apic)
766{
767 if (!apic)
768 return;
769
770 hrtimer_cancel(&apic->timer.dev);
771
772 if (apic->regs_page) {
773 __free_page(apic->regs_page);
774 apic->regs_page = 0;
775 }
776
777 kfree(apic);
778}
779
780/*
781 *----------------------------------------------------------------------
782 * LAPIC interface
783 *----------------------------------------------------------------------
784 */
785
786void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8)
787{
788 struct kvm_lapic *apic = (struct kvm_lapic *)vcpu->apic;
789
790 if (!apic)
791 return;
792 apic_set_tpr(apic, ((cr8 & 0x0f) << 4));
793}
794
795u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu)
796{
797 struct kvm_lapic *apic = (struct kvm_lapic *)vcpu->apic;
798 u64 tpr;
799
800 if (!apic)
801 return 0;
802 tpr = (u64) apic_get_reg(apic, APIC_TASKPRI);
803
804 return (tpr & 0xf0) >> 4;
805}
806EXPORT_SYMBOL_GPL(kvm_lapic_get_cr8);
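/*
 * Editor's note: CR8 carries only the upper nibble of the 8-bit task
 * priority, so the two helpers above are inverses on that nibble; e.g.
 * kvm_lapic_set_tpr(vcpu, 0x9) stores APIC_TASKPRI = 0x90, and
 * kvm_lapic_get_cr8() then returns 0x9.
 */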
807
808void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
809{
810 struct kvm_lapic *apic = (struct kvm_lapic *)vcpu->apic;
811
812 if (!apic) {
813 value |= MSR_IA32_APICBASE_BSP;
814 vcpu->apic_base = value;
815 return;
816 }
817 if (apic->vcpu->vcpu_id)
818 value &= ~MSR_IA32_APICBASE_BSP;
819
820 vcpu->apic_base = value;
821 apic->base_address = apic->vcpu->apic_base &
822 MSR_IA32_APICBASE_BASE;
823
824	/* With FSB interrupt delivery, APIC functionality can be restarted. */
825 apic_debug("apic base msr is 0x%016" PRIx64 ", and base address is "
826		   "0x%lx.\n", vcpu->apic_base, apic->base_address);
827
828}
829
830u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu)
831{
832 return vcpu->apic_base;
833}
834EXPORT_SYMBOL_GPL(kvm_lapic_get_base);
835
836void kvm_lapic_reset(struct kvm_vcpu *vcpu)
837{
838 struct kvm_lapic *apic;
839 int i;
840
841 apic_debug("%s\n", __FUNCTION__);
842
843 ASSERT(vcpu);
844 apic = vcpu->apic;
845 ASSERT(apic != NULL);
846
847 /* Stop the timer in case it's a reset to an active apic */
848 hrtimer_cancel(&apic->timer.dev);
849
850 apic_set_reg(apic, APIC_ID, vcpu->vcpu_id << 24);
851 apic_set_reg(apic, APIC_LVR, APIC_VERSION);
852
853 for (i = 0; i < APIC_LVT_NUM; i++)
854 apic_set_reg(apic, APIC_LVTT + 0x10 * i, APIC_LVT_MASKED);
855 apic_set_reg(apic, APIC_LVT0,
856 SET_APIC_DELIVERY_MODE(0, APIC_MODE_EXTINT));
857
858 apic_set_reg(apic, APIC_DFR, 0xffffffffU);
859 apic_set_reg(apic, APIC_SPIV, 0xff);
860 apic_set_reg(apic, APIC_TASKPRI, 0);
861 apic_set_reg(apic, APIC_LDR, 0);
862 apic_set_reg(apic, APIC_ESR, 0);
863 apic_set_reg(apic, APIC_ICR, 0);
864 apic_set_reg(apic, APIC_ICR2, 0);
865 apic_set_reg(apic, APIC_TDCR, 0);
866 apic_set_reg(apic, APIC_TMICT, 0);
867 for (i = 0; i < 8; i++) {
868 apic_set_reg(apic, APIC_IRR + 0x10 * i, 0);
869 apic_set_reg(apic, APIC_ISR + 0x10 * i, 0);
870 apic_set_reg(apic, APIC_TMR + 0x10 * i, 0);
871 }
872 update_divide_count(apic);
873 atomic_set(&apic->timer.pending, 0);
874 if (vcpu->vcpu_id == 0)
875 vcpu->apic_base |= MSR_IA32_APICBASE_BSP;
876 apic_update_ppr(apic);
877
878 apic_debug(KERN_INFO "%s: vcpu=%p, id=%d, base_msr="
879 "0x%016" PRIx64 ", base_address=0x%0lx.\n", __FUNCTION__,
880 vcpu, kvm_apic_id(apic),
881 vcpu->apic_base, apic->base_address);
882}
883EXPORT_SYMBOL_GPL(kvm_lapic_reset);
884
885int kvm_lapic_enabled(struct kvm_vcpu *vcpu)
886{
887 struct kvm_lapic *apic = (struct kvm_lapic *)vcpu->apic;
888 int ret = 0;
889
890 if (!apic)
891 return 0;
892 ret = apic_enabled(apic);
893
894 return ret;
895}
896EXPORT_SYMBOL_GPL(kvm_lapic_enabled);
897
898/*
899 *----------------------------------------------------------------------
900 * timer interface
901 *----------------------------------------------------------------------
902 */
903
904/* TODO: make sure __apic_timer_fn runs on the current pCPU */
905static int __apic_timer_fn(struct kvm_lapic *apic)
906{
907 int result = 0;
908 wait_queue_head_t *q = &apic->vcpu->wq;
909
910 atomic_inc(&apic->timer.pending);
911 if (waitqueue_active(q))
912 {
913 apic->vcpu->mp_state = VCPU_MP_STATE_RUNNABLE;
914 wake_up_interruptible(q);
915 }
916 if (apic_lvtt_period(apic)) {
917 result = 1;
918 apic->timer.dev.expires = ktime_add_ns(
919 apic->timer.dev.expires,
920 apic->timer.period);
921 }
922 return result;
923}
924
925static int __inject_apic_timer_irq(struct kvm_lapic *apic)
926{
927 int vector;
928
929 vector = apic_lvt_vector(apic, APIC_LVTT);
930 return __apic_accept_irq(apic, APIC_DM_FIXED, vector, 1, 0);
931}
932
933static enum hrtimer_restart apic_timer_fn(struct hrtimer *data)
934{
935 struct kvm_lapic *apic;
936 int restart_timer = 0;
937
938 apic = container_of(data, struct kvm_lapic, timer.dev);
939
940 restart_timer = __apic_timer_fn(apic);
941
942 if (restart_timer)
943 return HRTIMER_RESTART;
944 else
945 return HRTIMER_NORESTART;
946}
947
948int kvm_create_lapic(struct kvm_vcpu *vcpu)
949{
950 struct kvm_lapic *apic;
951
952 ASSERT(vcpu != NULL);
953 apic_debug("apic_init %d\n", vcpu->vcpu_id);
954
955 apic = kzalloc(sizeof(*apic), GFP_KERNEL);
956 if (!apic)
957 goto nomem;
958
959 vcpu->apic = apic;
960
961 apic->regs_page = alloc_page(GFP_KERNEL);
962 if (apic->regs_page == NULL) {
963		printk(KERN_ERR "failed to allocate APIC register page for vcpu %x\n",
964 vcpu->vcpu_id);
965 goto nomem;
966 }
967 apic->regs = page_address(apic->regs_page);
968 memset(apic->regs, 0, PAGE_SIZE);
969 apic->vcpu = vcpu;
970
971 hrtimer_init(&apic->timer.dev, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
972 apic->timer.dev.function = apic_timer_fn;
973 apic->base_address = APIC_DEFAULT_PHYS_BASE;
974 vcpu->apic_base = APIC_DEFAULT_PHYS_BASE;
975
976 kvm_lapic_reset(vcpu);
977 apic->dev.read = apic_mmio_read;
978 apic->dev.write = apic_mmio_write;
979 apic->dev.in_range = apic_mmio_range;
980 apic->dev.private = apic;
981
982 return 0;
983nomem:
984 kvm_free_apic(apic);
985 return -ENOMEM;
986}
987EXPORT_SYMBOL_GPL(kvm_create_lapic);
988
989int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu)
990{
991 struct kvm_lapic *apic = vcpu->apic;
992 int highest_irr;
993
994 if (!apic || !apic_enabled(apic))
995 return -1;
996
997 apic_update_ppr(apic);
998 highest_irr = apic_find_highest_irr(apic);
999 if ((highest_irr == -1) ||
1000 ((highest_irr & 0xF0) <= apic_get_reg(apic, APIC_PROCPRI)))
1001 return -1;
1002 return highest_irr;
1003}
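/*
 * Editor's note (illustrative example of the priority check above): an IRR
 * vector is only reported when its priority class (vector >> 4) is strictly
 * above PPR[7:4]; with APIC_PROCPRI = 0x30, pending vector 0x31 is held
 * back while pending vector 0x41 is returned.
 */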
1004
1005int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu)
1006{
1007 u32 lvt0 = apic_get_reg(vcpu->apic, APIC_LVT0);
1008 int r = 0;
1009
1010 if (vcpu->vcpu_id == 0) {
1011 if (!apic_hw_enabled(vcpu->apic))
1012 r = 1;
1013 if ((lvt0 & APIC_LVT_MASKED) == 0 &&
1014 GET_APIC_DELIVERY_MODE(lvt0) == APIC_MODE_EXTINT)
1015 r = 1;
1016 }
1017 return r;
1018}
1019
1020void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu)
1021{
1022 struct kvm_lapic *apic = vcpu->apic;
1023
1024 if (apic && apic_lvt_enabled(apic, APIC_LVTT) &&
1025 atomic_read(&apic->timer.pending) > 0) {
1026 if (__inject_apic_timer_irq(apic))
1027 atomic_dec(&apic->timer.pending);
1028 }
1029}
1030
1031void kvm_apic_timer_intr_post(struct kvm_vcpu *vcpu, int vec)
1032{
1033 struct kvm_lapic *apic = vcpu->apic;
1034
1035 if (apic && apic_lvt_vector(apic, APIC_LVTT) == vec)
1036 apic->timer.last_update = ktime_add_ns(
1037 apic->timer.last_update,
1038 apic->timer.period);
1039}
1040
1041int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu)
1042{
1043 int vector = kvm_apic_has_interrupt(vcpu);
1044 struct kvm_lapic *apic = vcpu->apic;
1045
1046 if (vector == -1)
1047 return -1;
1048
1049 apic_set_vector(vector, apic->regs + APIC_ISR);
1050 apic_update_ppr(apic);
1051 apic_clear_irr(vector, apic);
1052 return vector;
1053}
1054
1055void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu)
1056{
1057 struct kvm_lapic *apic = vcpu->apic;
1058
1059 apic->base_address = vcpu->apic_base &
1060 MSR_IA32_APICBASE_BASE;
1061 apic_set_reg(apic, APIC_LVR, APIC_VERSION);
1062 apic_update_ppr(apic);
1063 hrtimer_cancel(&apic->timer.dev);
1064 update_divide_count(apic);
1065 start_apic_timer(apic);
1066}
1067
1068void kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
1069{
1070 struct kvm_lapic *apic = vcpu->apic;
1071 struct hrtimer *timer;
1072
1073 if (!apic)
1074 return;
1075
1076 timer = &apic->timer.dev;
1077 if (hrtimer_cancel(timer))
1078 hrtimer_start(timer, timer->expires, HRTIMER_MODE_ABS);
1079}
1080EXPORT_SYMBOL_GPL(kvm_migrate_apic_timer);
diff --git a/drivers/kvm/mmu.c b/drivers/kvm/mmu.c
deleted file mode 100644
index feb5ac986c5d..000000000000
--- a/drivers/kvm/mmu.c
+++ /dev/null
@@ -1,1498 +0,0 @@
1/*
2 * Kernel-based Virtual Machine driver for Linux
3 *
4 * This module enables machines with Intel VT-x extensions to run virtual
5 * machines without emulation or binary translation.
6 *
7 * MMU support
8 *
9 * Copyright (C) 2006 Qumranet, Inc.
10 *
11 * Authors:
12 * Yaniv Kamay <yaniv@qumranet.com>
13 * Avi Kivity <avi@qumranet.com>
14 *
15 * This work is licensed under the terms of the GNU GPL, version 2. See
16 * the COPYING file in the top-level directory.
17 *
18 */
19
20#include "vmx.h"
21#include "kvm.h"
22
23#include <linux/types.h>
24#include <linux/string.h>
25#include <linux/mm.h>
26#include <linux/highmem.h>
27#include <linux/module.h>
28
29#include <asm/page.h>
30#include <asm/cmpxchg.h>
31
32#undef MMU_DEBUG
33
34#undef AUDIT
35
36#ifdef AUDIT
37static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg);
38#else
39static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) {}
40#endif
41
42#ifdef MMU_DEBUG
43
44#define pgprintk(x...) do { if (dbg) printk(x); } while (0)
45#define rmap_printk(x...) do { if (dbg) printk(x); } while (0)
46
47#else
48
49#define pgprintk(x...) do { } while (0)
50#define rmap_printk(x...) do { } while (0)
51
52#endif
53
54#if defined(MMU_DEBUG) || defined(AUDIT)
55static int dbg = 1;
56#endif
57
58#ifndef MMU_DEBUG
59#define ASSERT(x) do { } while (0)
60#else
61#define ASSERT(x) \
62 if (!(x)) { \
63 printk(KERN_WARNING "assertion failed %s:%d: %s\n", \
64 __FILE__, __LINE__, #x); \
65 }
66#endif
67
68#define PT64_PT_BITS 9
69#define PT64_ENT_PER_PAGE (1 << PT64_PT_BITS)
70#define PT32_PT_BITS 10
71#define PT32_ENT_PER_PAGE (1 << PT32_PT_BITS)
72
73#define PT_WRITABLE_SHIFT 1
74
75#define PT_PRESENT_MASK (1ULL << 0)
76#define PT_WRITABLE_MASK (1ULL << PT_WRITABLE_SHIFT)
77#define PT_USER_MASK (1ULL << 2)
78#define PT_PWT_MASK (1ULL << 3)
79#define PT_PCD_MASK (1ULL << 4)
80#define PT_ACCESSED_MASK (1ULL << 5)
81#define PT_DIRTY_MASK (1ULL << 6)
82#define PT_PAGE_SIZE_MASK (1ULL << 7)
83#define PT_PAT_MASK (1ULL << 7)
84#define PT_GLOBAL_MASK (1ULL << 8)
85#define PT64_NX_MASK (1ULL << 63)
86
87#define PT_PAT_SHIFT 7
88#define PT_DIR_PAT_SHIFT 12
89#define PT_DIR_PAT_MASK (1ULL << PT_DIR_PAT_SHIFT)
90
91#define PT32_DIR_PSE36_SIZE 4
92#define PT32_DIR_PSE36_SHIFT 13
93#define PT32_DIR_PSE36_MASK (((1ULL << PT32_DIR_PSE36_SIZE) - 1) << PT32_DIR_PSE36_SHIFT)
94
95
96#define PT_FIRST_AVAIL_BITS_SHIFT 9
97#define PT64_SECOND_AVAIL_BITS_SHIFT 52
98
99#define PT_SHADOW_IO_MARK (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
100
101#define VALID_PAGE(x) ((x) != INVALID_PAGE)
102
103#define PT64_LEVEL_BITS 9
104
105#define PT64_LEVEL_SHIFT(level) \
106 ( PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS )
107
108#define PT64_LEVEL_MASK(level) \
109 (((1ULL << PT64_LEVEL_BITS) - 1) << PT64_LEVEL_SHIFT(level))
110
111#define PT64_INDEX(address, level)\
112 (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1))
113
114
115#define PT32_LEVEL_BITS 10
116
117#define PT32_LEVEL_SHIFT(level) \
118 ( PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS )
119
120#define PT32_LEVEL_MASK(level) \
121 (((1ULL << PT32_LEVEL_BITS) - 1) << PT32_LEVEL_SHIFT(level))
122
123#define PT32_INDEX(address, level)\
124 (((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1))
125
126
127#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))
128#define PT64_DIR_BASE_ADDR_MASK \
129 (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + PT64_LEVEL_BITS)) - 1))
130
131#define PT32_BASE_ADDR_MASK PAGE_MASK
132#define PT32_DIR_BASE_ADDR_MASK \
133 (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1))
134
135
136#define PFERR_PRESENT_MASK (1U << 0)
137#define PFERR_WRITE_MASK (1U << 1)
138#define PFERR_USER_MASK (1U << 2)
139#define PFERR_FETCH_MASK (1U << 4)
140
141#define PT64_ROOT_LEVEL 4
142#define PT32_ROOT_LEVEL 2
143#define PT32E_ROOT_LEVEL 3
144
145#define PT_DIRECTORY_LEVEL 2
146#define PT_PAGE_TABLE_LEVEL 1
147
148#define RMAP_EXT 4
149
150struct kvm_rmap_desc {
151 u64 *shadow_ptes[RMAP_EXT];
152 struct kvm_rmap_desc *more;
153};
154
155static struct kmem_cache *pte_chain_cache;
156static struct kmem_cache *rmap_desc_cache;
157static struct kmem_cache *mmu_page_header_cache;
158
159static int is_write_protection(struct kvm_vcpu *vcpu)
160{
161 return vcpu->cr0 & X86_CR0_WP;
162}
163
164static int is_cpuid_PSE36(void)
165{
166 return 1;
167}
168
169static int is_nx(struct kvm_vcpu *vcpu)
170{
171 return vcpu->shadow_efer & EFER_NX;
172}
173
174static int is_present_pte(unsigned long pte)
175{
176 return pte & PT_PRESENT_MASK;
177}
178
179static int is_writeble_pte(unsigned long pte)
180{
181 return pte & PT_WRITABLE_MASK;
182}
183
184static int is_io_pte(unsigned long pte)
185{
186 return pte & PT_SHADOW_IO_MARK;
187}
188
189static int is_rmap_pte(u64 pte)
190{
191 return (pte & (PT_WRITABLE_MASK | PT_PRESENT_MASK))
192 == (PT_WRITABLE_MASK | PT_PRESENT_MASK);
193}
194
195static void set_shadow_pte(u64 *sptep, u64 spte)
196{
197#ifdef CONFIG_X86_64
198 set_64bit((unsigned long *)sptep, spte);
199#else
200 set_64bit((unsigned long long *)sptep, spte);
201#endif
202}
203
204static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
205 struct kmem_cache *base_cache, int min)
206{
207 void *obj;
208
209 if (cache->nobjs >= min)
210 return 0;
211 while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
212 obj = kmem_cache_zalloc(base_cache, GFP_KERNEL);
213 if (!obj)
214 return -ENOMEM;
215 cache->objects[cache->nobjs++] = obj;
216 }
217 return 0;
218}
219
220static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
221{
222 while (mc->nobjs)
223 kfree(mc->objects[--mc->nobjs]);
224}
225
226static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache,
227 int min)
228{
229 struct page *page;
230
231 if (cache->nobjs >= min)
232 return 0;
233 while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
234 page = alloc_page(GFP_KERNEL);
235 if (!page)
236 return -ENOMEM;
237 set_page_private(page, 0);
238 cache->objects[cache->nobjs++] = page_address(page);
239 }
240 return 0;
241}
242
243static void mmu_free_memory_cache_page(struct kvm_mmu_memory_cache *mc)
244{
245 while (mc->nobjs)
246 free_page((unsigned long)mc->objects[--mc->nobjs]);
247}
248
249static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu)
250{
251 int r;
252
253 kvm_mmu_free_some_pages(vcpu);
254 r = mmu_topup_memory_cache(&vcpu->mmu_pte_chain_cache,
255 pte_chain_cache, 4);
256 if (r)
257 goto out;
258 r = mmu_topup_memory_cache(&vcpu->mmu_rmap_desc_cache,
259 rmap_desc_cache, 1);
260 if (r)
261 goto out;
262 r = mmu_topup_memory_cache_page(&vcpu->mmu_page_cache, 4);
263 if (r)
264 goto out;
265 r = mmu_topup_memory_cache(&vcpu->mmu_page_header_cache,
266 mmu_page_header_cache, 4);
267out:
268 return r;
269}
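/*
 * Editor's note: the caches are filled here, where GFP_KERNEL allocation is
 * still allowed, so that mmu_memory_cache_alloc() below, which BUG()s on an
 * empty cache instead of allocating, can be used safely while the MMU is in
 * the middle of building or modifying shadow page tables.
 */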
270
271static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
272{
273 mmu_free_memory_cache(&vcpu->mmu_pte_chain_cache);
274 mmu_free_memory_cache(&vcpu->mmu_rmap_desc_cache);
275 mmu_free_memory_cache_page(&vcpu->mmu_page_cache);
276 mmu_free_memory_cache(&vcpu->mmu_page_header_cache);
277}
278
279static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc,
280 size_t size)
281{
282 void *p;
283
284 BUG_ON(!mc->nobjs);
285 p = mc->objects[--mc->nobjs];
286 memset(p, 0, size);
287 return p;
288}
289
290static struct kvm_pte_chain *mmu_alloc_pte_chain(struct kvm_vcpu *vcpu)
291{
292 return mmu_memory_cache_alloc(&vcpu->mmu_pte_chain_cache,
293 sizeof(struct kvm_pte_chain));
294}
295
296static void mmu_free_pte_chain(struct kvm_pte_chain *pc)
297{
298 kfree(pc);
299}
300
301static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu)
302{
303 return mmu_memory_cache_alloc(&vcpu->mmu_rmap_desc_cache,
304 sizeof(struct kvm_rmap_desc));
305}
306
307static void mmu_free_rmap_desc(struct kvm_rmap_desc *rd)
308{
309 kfree(rd);
310}
311
312/*
313 * Reverse mapping data structures:
314 *
315	 * If bit zero of page->private is zero, then page->private points to
316	 * the shadow page table entry that points to page_address(page).
317	 *
318	 * If bit zero of page->private is one, then (page->private & ~1) points
319	 * to a struct kvm_rmap_desc containing more mappings.
320 */
321static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte)
322{
323 struct page *page;
324 struct kvm_rmap_desc *desc;
325 int i;
326
327 if (!is_rmap_pte(*spte))
328 return;
329 page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT);
330 if (!page_private(page)) {
331 rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte);
332 set_page_private(page,(unsigned long)spte);
333 } else if (!(page_private(page) & 1)) {
334 rmap_printk("rmap_add: %p %llx 1->many\n", spte, *spte);
335 desc = mmu_alloc_rmap_desc(vcpu);
336 desc->shadow_ptes[0] = (u64 *)page_private(page);
337 desc->shadow_ptes[1] = spte;
338 set_page_private(page,(unsigned long)desc | 1);
339 } else {
340 rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte);
341 desc = (struct kvm_rmap_desc *)(page_private(page) & ~1ul);
342 while (desc->shadow_ptes[RMAP_EXT-1] && desc->more)
343 desc = desc->more;
344 if (desc->shadow_ptes[RMAP_EXT-1]) {
345 desc->more = mmu_alloc_rmap_desc(vcpu);
346 desc = desc->more;
347 }
348 for (i = 0; desc->shadow_ptes[i]; ++i)
349 ;
350 desc->shadow_ptes[i] = spte;
351 }
352}
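/*
 * Editor's sketch, not part of the original file: a minimal restatement of
 * the page->private encoding described in the comment above rmap_add().
 * The two hypothetical helpers below only name what rmap_add() and
 * rmap_remove() open-code; they are illustration, not the driver's API.
 */
static inline int rmap_is_desc_list(struct page *page)
{
	/* low bit set: private holds a kvm_rmap_desc list, not a single spte */
	return page_private(page) & 1;
}

static inline struct kvm_rmap_desc *rmap_desc_list(struct page *page)
{
	/* strip the tag bit to recover the descriptor pointer */
	return (struct kvm_rmap_desc *)(page_private(page) & ~1ul);
}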
353
354static void rmap_desc_remove_entry(struct page *page,
355 struct kvm_rmap_desc *desc,
356 int i,
357 struct kvm_rmap_desc *prev_desc)
358{
359 int j;
360
361 for (j = RMAP_EXT - 1; !desc->shadow_ptes[j] && j > i; --j)
362 ;
363 desc->shadow_ptes[i] = desc->shadow_ptes[j];
364 desc->shadow_ptes[j] = NULL;
365 if (j != 0)
366 return;
367 if (!prev_desc && !desc->more)
368 set_page_private(page,(unsigned long)desc->shadow_ptes[0]);
369 else
370 if (prev_desc)
371 prev_desc->more = desc->more;
372 else
373 set_page_private(page,(unsigned long)desc->more | 1);
374 mmu_free_rmap_desc(desc);
375}
376
377static void rmap_remove(u64 *spte)
378{
379 struct page *page;
380 struct kvm_rmap_desc *desc;
381 struct kvm_rmap_desc *prev_desc;
382 int i;
383
384 if (!is_rmap_pte(*spte))
385 return;
386 page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT);
387 if (!page_private(page)) {
388 printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte);
389 BUG();
390 } else if (!(page_private(page) & 1)) {
391 rmap_printk("rmap_remove: %p %llx 1->0\n", spte, *spte);
392 if ((u64 *)page_private(page) != spte) {
393 printk(KERN_ERR "rmap_remove: %p %llx 1->BUG\n",
394 spte, *spte);
395 BUG();
396 }
397 set_page_private(page,0);
398 } else {
399 rmap_printk("rmap_remove: %p %llx many->many\n", spte, *spte);
400 desc = (struct kvm_rmap_desc *)(page_private(page) & ~1ul);
401 prev_desc = NULL;
402 while (desc) {
403 for (i = 0; i < RMAP_EXT && desc->shadow_ptes[i]; ++i)
404 if (desc->shadow_ptes[i] == spte) {
405 rmap_desc_remove_entry(page,
406 desc, i,
407 prev_desc);
408 return;
409 }
410 prev_desc = desc;
411 desc = desc->more;
412 }
413 BUG();
414 }
415}
416
417static void rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn)
418{
419 struct kvm *kvm = vcpu->kvm;
420 struct page *page;
421 struct kvm_rmap_desc *desc;
422 u64 *spte;
423
424 page = gfn_to_page(kvm, gfn);
425 BUG_ON(!page);
426
427 while (page_private(page)) {
428 if (!(page_private(page) & 1))
429 spte = (u64 *)page_private(page);
430 else {
431 desc = (struct kvm_rmap_desc *)(page_private(page) & ~1ul);
432 spte = desc->shadow_ptes[0];
433 }
434 BUG_ON(!spte);
435 BUG_ON((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT
436 != page_to_pfn(page));
437 BUG_ON(!(*spte & PT_PRESENT_MASK));
438 BUG_ON(!(*spte & PT_WRITABLE_MASK));
439 rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
440 rmap_remove(spte);
441 set_shadow_pte(spte, *spte & ~PT_WRITABLE_MASK);
442 kvm_flush_remote_tlbs(vcpu->kvm);
443 }
444}
445
446#ifdef MMU_DEBUG
447static int is_empty_shadow_page(u64 *spt)
448{
449 u64 *pos;
450 u64 *end;
451
452 for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++)
453 if (*pos != 0) {
454 printk(KERN_ERR "%s: %p %llx\n", __FUNCTION__,
455 pos, *pos);
456 return 0;
457 }
458 return 1;
459}
460#endif
461
462static void kvm_mmu_free_page(struct kvm *kvm,
463 struct kvm_mmu_page *page_head)
464{
465 ASSERT(is_empty_shadow_page(page_head->spt));
466 list_del(&page_head->link);
467 __free_page(virt_to_page(page_head->spt));
468 kfree(page_head);
469 ++kvm->n_free_mmu_pages;
470}
471
472static unsigned kvm_page_table_hashfn(gfn_t gfn)
473{
474 return gfn;
475}
476
477static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
478 u64 *parent_pte)
479{
480 struct kvm_mmu_page *page;
481
482 if (!vcpu->kvm->n_free_mmu_pages)
483 return NULL;
484
485 page = mmu_memory_cache_alloc(&vcpu->mmu_page_header_cache,
486 sizeof *page);
487 page->spt = mmu_memory_cache_alloc(&vcpu->mmu_page_cache, PAGE_SIZE);
488 set_page_private(virt_to_page(page->spt), (unsigned long)page);
489 list_add(&page->link, &vcpu->kvm->active_mmu_pages);
490 ASSERT(is_empty_shadow_page(page->spt));
491 page->slot_bitmap = 0;
492 page->multimapped = 0;
493 page->parent_pte = parent_pte;
494 --vcpu->kvm->n_free_mmu_pages;
495 return page;
496}
497
498static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu,
499 struct kvm_mmu_page *page, u64 *parent_pte)
500{
501 struct kvm_pte_chain *pte_chain;
502 struct hlist_node *node;
503 int i;
504
505 if (!parent_pte)
506 return;
507 if (!page->multimapped) {
508 u64 *old = page->parent_pte;
509
510 if (!old) {
511 page->parent_pte = parent_pte;
512 return;
513 }
514 page->multimapped = 1;
515 pte_chain = mmu_alloc_pte_chain(vcpu);
516 INIT_HLIST_HEAD(&page->parent_ptes);
517 hlist_add_head(&pte_chain->link, &page->parent_ptes);
518 pte_chain->parent_ptes[0] = old;
519 }
520 hlist_for_each_entry(pte_chain, node, &page->parent_ptes, link) {
521 if (pte_chain->parent_ptes[NR_PTE_CHAIN_ENTRIES-1])
522 continue;
523 for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i)
524 if (!pte_chain->parent_ptes[i]) {
525 pte_chain->parent_ptes[i] = parent_pte;
526 return;
527 }
528 }
529 pte_chain = mmu_alloc_pte_chain(vcpu);
530 BUG_ON(!pte_chain);
531 hlist_add_head(&pte_chain->link, &page->parent_ptes);
532 pte_chain->parent_ptes[0] = parent_pte;
533}
534
535static void mmu_page_remove_parent_pte(struct kvm_mmu_page *page,
536 u64 *parent_pte)
537{
538 struct kvm_pte_chain *pte_chain;
539 struct hlist_node *node;
540 int i;
541
542 if (!page->multimapped) {
543 BUG_ON(page->parent_pte != parent_pte);
544 page->parent_pte = NULL;
545 return;
546 }
547 hlist_for_each_entry(pte_chain, node, &page->parent_ptes, link)
548 for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
549 if (!pte_chain->parent_ptes[i])
550 break;
551 if (pte_chain->parent_ptes[i] != parent_pte)
552 continue;
553 while (i + 1 < NR_PTE_CHAIN_ENTRIES
554 && pte_chain->parent_ptes[i + 1]) {
555 pte_chain->parent_ptes[i]
556 = pte_chain->parent_ptes[i + 1];
557 ++i;
558 }
559 pte_chain->parent_ptes[i] = NULL;
560 if (i == 0) {
561 hlist_del(&pte_chain->link);
562 mmu_free_pte_chain(pte_chain);
563 if (hlist_empty(&page->parent_ptes)) {
564 page->multimapped = 0;
565 page->parent_pte = NULL;
566 }
567 }
568 return;
569 }
570 BUG();
571}
572
573static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm_vcpu *vcpu,
574 gfn_t gfn)
575{
576 unsigned index;
577 struct hlist_head *bucket;
578 struct kvm_mmu_page *page;
579 struct hlist_node *node;
580
581 pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn);
582 index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
583 bucket = &vcpu->kvm->mmu_page_hash[index];
584 hlist_for_each_entry(page, node, bucket, hash_link)
585 if (page->gfn == gfn && !page->role.metaphysical) {
586 pgprintk("%s: found role %x\n",
587 __FUNCTION__, page->role.word);
588 return page;
589 }
590 return NULL;
591}
592
593static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
594 gfn_t gfn,
595 gva_t gaddr,
596 unsigned level,
597 int metaphysical,
598 unsigned hugepage_access,
599 u64 *parent_pte)
600{
601 union kvm_mmu_page_role role;
602 unsigned index;
603 unsigned quadrant;
604 struct hlist_head *bucket;
605 struct kvm_mmu_page *page;
606 struct hlist_node *node;
607
608 role.word = 0;
609 role.glevels = vcpu->mmu.root_level;
610 role.level = level;
611 role.metaphysical = metaphysical;
612 role.hugepage_access = hugepage_access;
613 if (vcpu->mmu.root_level <= PT32_ROOT_LEVEL) {
614 quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
615 quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
616 role.quadrant = quadrant;
617 }
618 pgprintk("%s: looking gfn %lx role %x\n", __FUNCTION__,
619 gfn, role.word);
620 index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
621 bucket = &vcpu->kvm->mmu_page_hash[index];
622 hlist_for_each_entry(page, node, bucket, hash_link)
623 if (page->gfn == gfn && page->role.word == role.word) {
624 mmu_page_add_parent_pte(vcpu, page, parent_pte);
625 pgprintk("%s: found\n", __FUNCTION__);
626 return page;
627 }
628 page = kvm_mmu_alloc_page(vcpu, parent_pte);
629 if (!page)
630 return page;
631 pgprintk("%s: adding gfn %lx role %x\n", __FUNCTION__, gfn, role.word);
632 page->gfn = gfn;
633 page->role = role;
634 hlist_add_head(&page->hash_link, bucket);
635 if (!metaphysical)
636 rmap_write_protect(vcpu, gfn);
637 return page;
638}
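/*
 * Editor's note (worked example of the quadrant computation above): for a
 * 32-bit non-PAE guest, a guest page table maps 4MB (1024 4-byte entries)
 * while a shadow page table maps only 2MB (512 8-byte entries), so each
 * guest table is shadowed by two pages told apart by
 * quadrant = (gaddr >> 21) & 1 at level 1; at level 2 the single guest page
 * directory is split across four shadow pages, quadrant = (gaddr >> 30) & 3.
 */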
639
640static void kvm_mmu_page_unlink_children(struct kvm *kvm,
641 struct kvm_mmu_page *page)
642{
643 unsigned i;
644 u64 *pt;
645 u64 ent;
646
647 pt = page->spt;
648
649 if (page->role.level == PT_PAGE_TABLE_LEVEL) {
650 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
651 if (pt[i] & PT_PRESENT_MASK)
652 rmap_remove(&pt[i]);
653 pt[i] = 0;
654 }
655 kvm_flush_remote_tlbs(kvm);
656 return;
657 }
658
659 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
660 ent = pt[i];
661
662 pt[i] = 0;
663 if (!(ent & PT_PRESENT_MASK))
664 continue;
665 ent &= PT64_BASE_ADDR_MASK;
666 mmu_page_remove_parent_pte(page_header(ent), &pt[i]);
667 }
668 kvm_flush_remote_tlbs(kvm);
669}
670
671static void kvm_mmu_put_page(struct kvm_mmu_page *page,
672 u64 *parent_pte)
673{
674 mmu_page_remove_parent_pte(page, parent_pte);
675}
676
677static void kvm_mmu_zap_page(struct kvm *kvm,
678 struct kvm_mmu_page *page)
679{
680 u64 *parent_pte;
681
682 while (page->multimapped || page->parent_pte) {
683 if (!page->multimapped)
684 parent_pte = page->parent_pte;
685 else {
686 struct kvm_pte_chain *chain;
687
688 chain = container_of(page->parent_ptes.first,
689 struct kvm_pte_chain, link);
690 parent_pte = chain->parent_ptes[0];
691 }
692 BUG_ON(!parent_pte);
693 kvm_mmu_put_page(page, parent_pte);
694 set_shadow_pte(parent_pte, 0);
695 }
696 kvm_mmu_page_unlink_children(kvm, page);
697 if (!page->root_count) {
698 hlist_del(&page->hash_link);
699 kvm_mmu_free_page(kvm, page);
700 } else
701 list_move(&page->link, &kvm->active_mmu_pages);
702}
703
704static int kvm_mmu_unprotect_page(struct kvm_vcpu *vcpu, gfn_t gfn)
705{
706 unsigned index;
707 struct hlist_head *bucket;
708 struct kvm_mmu_page *page;
709 struct hlist_node *node, *n;
710 int r;
711
712 pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn);
713 r = 0;
714 index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
715 bucket = &vcpu->kvm->mmu_page_hash[index];
716 hlist_for_each_entry_safe(page, node, n, bucket, hash_link)
717 if (page->gfn == gfn && !page->role.metaphysical) {
718 pgprintk("%s: gfn %lx role %x\n", __FUNCTION__, gfn,
719 page->role.word);
720 kvm_mmu_zap_page(vcpu->kvm, page);
721 r = 1;
722 }
723 return r;
724}
725
726static void mmu_unshadow(struct kvm_vcpu *vcpu, gfn_t gfn)
727{
728 struct kvm_mmu_page *page;
729
730 while ((page = kvm_mmu_lookup_page(vcpu, gfn)) != NULL) {
731 pgprintk("%s: zap %lx %x\n",
732 __FUNCTION__, gfn, page->role.word);
733 kvm_mmu_zap_page(vcpu->kvm, page);
734 }
735}
736
737static void page_header_update_slot(struct kvm *kvm, void *pte, gpa_t gpa)
738{
739 int slot = memslot_id(kvm, gfn_to_memslot(kvm, gpa >> PAGE_SHIFT));
740 struct kvm_mmu_page *page_head = page_header(__pa(pte));
741
742 __set_bit(slot, &page_head->slot_bitmap);
743}
744
745hpa_t safe_gpa_to_hpa(struct kvm_vcpu *vcpu, gpa_t gpa)
746{
747 hpa_t hpa = gpa_to_hpa(vcpu, gpa);
748
749 return is_error_hpa(hpa) ? bad_page_address | (gpa & ~PAGE_MASK): hpa;
750}
751
752hpa_t gpa_to_hpa(struct kvm_vcpu *vcpu, gpa_t gpa)
753{
754 struct page *page;
755
756 ASSERT((gpa & HPA_ERR_MASK) == 0);
757 page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
758 if (!page)
759 return gpa | HPA_ERR_MASK;
760 return ((hpa_t)page_to_pfn(page) << PAGE_SHIFT)
761 | (gpa & (PAGE_SIZE-1));
762}
763
764hpa_t gva_to_hpa(struct kvm_vcpu *vcpu, gva_t gva)
765{
766 gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, gva);
767
768 if (gpa == UNMAPPED_GVA)
769 return UNMAPPED_GVA;
770 return gpa_to_hpa(vcpu, gpa);
771}
772
773struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva)
774{
775 gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, gva);
776
777 if (gpa == UNMAPPED_GVA)
778 return NULL;
779 return pfn_to_page(gpa_to_hpa(vcpu, gpa) >> PAGE_SHIFT);
780}
781
782static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
783{
784}
785
786static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, hpa_t p)
787{
788 int level = PT32E_ROOT_LEVEL;
789 hpa_t table_addr = vcpu->mmu.root_hpa;
790
791 for (; ; level--) {
792 u32 index = PT64_INDEX(v, level);
793 u64 *table;
794 u64 pte;
795
796 ASSERT(VALID_PAGE(table_addr));
797 table = __va(table_addr);
798
799 if (level == 1) {
800 pte = table[index];
801 if (is_present_pte(pte) && is_writeble_pte(pte))
802 return 0;
803 mark_page_dirty(vcpu->kvm, v >> PAGE_SHIFT);
804 page_header_update_slot(vcpu->kvm, table, v);
805 table[index] = p | PT_PRESENT_MASK | PT_WRITABLE_MASK |
806 PT_USER_MASK;
807 rmap_add(vcpu, &table[index]);
808 return 0;
809 }
810
811 if (table[index] == 0) {
812 struct kvm_mmu_page *new_table;
813 gfn_t pseudo_gfn;
814
815 pseudo_gfn = (v & PT64_DIR_BASE_ADDR_MASK)
816 >> PAGE_SHIFT;
817 new_table = kvm_mmu_get_page(vcpu, pseudo_gfn,
818 v, level - 1,
819 1, 0, &table[index]);
820 if (!new_table) {
821 pgprintk("nonpaging_map: ENOMEM\n");
822 return -ENOMEM;
823 }
824
825 table[index] = __pa(new_table->spt) | PT_PRESENT_MASK
826 | PT_WRITABLE_MASK | PT_USER_MASK;
827 }
828 table_addr = table[index] & PT64_BASE_ADDR_MASK;
829 }
830}
831
832static void mmu_free_roots(struct kvm_vcpu *vcpu)
833{
834 int i;
835 struct kvm_mmu_page *page;
836
837 if (!VALID_PAGE(vcpu->mmu.root_hpa))
838 return;
839#ifdef CONFIG_X86_64
840 if (vcpu->mmu.shadow_root_level == PT64_ROOT_LEVEL) {
841 hpa_t root = vcpu->mmu.root_hpa;
842
843 page = page_header(root);
844 --page->root_count;
845 vcpu->mmu.root_hpa = INVALID_PAGE;
846 return;
847 }
848#endif
849 for (i = 0; i < 4; ++i) {
850 hpa_t root = vcpu->mmu.pae_root[i];
851
852 if (root) {
853 root &= PT64_BASE_ADDR_MASK;
854 page = page_header(root);
855 --page->root_count;
856 }
857 vcpu->mmu.pae_root[i] = INVALID_PAGE;
858 }
859 vcpu->mmu.root_hpa = INVALID_PAGE;
860}
861
862static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
863{
864 int i;
865 gfn_t root_gfn;
866 struct kvm_mmu_page *page;
867
868 root_gfn = vcpu->cr3 >> PAGE_SHIFT;
869
870#ifdef CONFIG_X86_64
871 if (vcpu->mmu.shadow_root_level == PT64_ROOT_LEVEL) {
872 hpa_t root = vcpu->mmu.root_hpa;
873
874 ASSERT(!VALID_PAGE(root));
875 page = kvm_mmu_get_page(vcpu, root_gfn, 0,
876 PT64_ROOT_LEVEL, 0, 0, NULL);
877 root = __pa(page->spt);
878 ++page->root_count;
879 vcpu->mmu.root_hpa = root;
880 return;
881 }
882#endif
883 for (i = 0; i < 4; ++i) {
884 hpa_t root = vcpu->mmu.pae_root[i];
885
886 ASSERT(!VALID_PAGE(root));
887 if (vcpu->mmu.root_level == PT32E_ROOT_LEVEL) {
888 if (!is_present_pte(vcpu->pdptrs[i])) {
889 vcpu->mmu.pae_root[i] = 0;
890 continue;
891 }
892 root_gfn = vcpu->pdptrs[i] >> PAGE_SHIFT;
893 } else if (vcpu->mmu.root_level == 0)
894 root_gfn = 0;
895 page = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
896 PT32_ROOT_LEVEL, !is_paging(vcpu),
897 0, NULL);
898 root = __pa(page->spt);
899 ++page->root_count;
900 vcpu->mmu.pae_root[i] = root | PT_PRESENT_MASK;
901 }
902 vcpu->mmu.root_hpa = __pa(vcpu->mmu.pae_root);
903}
904
905static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr)
906{
907 return vaddr;
908}
909
910static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
911 u32 error_code)
912{
913 gpa_t addr = gva;
914 hpa_t paddr;
915 int r;
916
917 r = mmu_topup_memory_caches(vcpu);
918 if (r)
919 return r;
920
921 ASSERT(vcpu);
922 ASSERT(VALID_PAGE(vcpu->mmu.root_hpa));
923
924
925	paddr = gpa_to_hpa(vcpu, addr & PT64_BASE_ADDR_MASK);
926
927 if (is_error_hpa(paddr))
928 return 1;
929
930 return nonpaging_map(vcpu, addr & PAGE_MASK, paddr);
931}
932
933static void nonpaging_free(struct kvm_vcpu *vcpu)
934{
935 mmu_free_roots(vcpu);
936}
937
938static int nonpaging_init_context(struct kvm_vcpu *vcpu)
939{
940 struct kvm_mmu *context = &vcpu->mmu;
941
942 context->new_cr3 = nonpaging_new_cr3;
943 context->page_fault = nonpaging_page_fault;
944 context->gva_to_gpa = nonpaging_gva_to_gpa;
945 context->free = nonpaging_free;
946 context->root_level = 0;
947 context->shadow_root_level = PT32E_ROOT_LEVEL;
948 context->root_hpa = INVALID_PAGE;
949 return 0;
950}
951
952static void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
953{
954 ++vcpu->stat.tlb_flush;
955 kvm_x86_ops->tlb_flush(vcpu);
956}
957
958static void paging_new_cr3(struct kvm_vcpu *vcpu)
959{
960 pgprintk("%s: cr3 %lx\n", __FUNCTION__, vcpu->cr3);
961 mmu_free_roots(vcpu);
962}
963
964static void inject_page_fault(struct kvm_vcpu *vcpu,
965 u64 addr,
966 u32 err_code)
967{
968 kvm_x86_ops->inject_page_fault(vcpu, addr, err_code);
969}
970
971static void paging_free(struct kvm_vcpu *vcpu)
972{
973 nonpaging_free(vcpu);
974}
975
976#define PTTYPE 64
977#include "paging_tmpl.h"
978#undef PTTYPE
979
980#define PTTYPE 32
981#include "paging_tmpl.h"
982#undef PTTYPE
983
984static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level)
985{
986 struct kvm_mmu *context = &vcpu->mmu;
987
988 ASSERT(is_pae(vcpu));
989 context->new_cr3 = paging_new_cr3;
990 context->page_fault = paging64_page_fault;
991 context->gva_to_gpa = paging64_gva_to_gpa;
992 context->free = paging_free;
993 context->root_level = level;
994 context->shadow_root_level = level;
995 context->root_hpa = INVALID_PAGE;
996 return 0;
997}
998
999static int paging64_init_context(struct kvm_vcpu *vcpu)
1000{
1001 return paging64_init_context_common(vcpu, PT64_ROOT_LEVEL);
1002}
1003
1004static int paging32_init_context(struct kvm_vcpu *vcpu)
1005{
1006 struct kvm_mmu *context = &vcpu->mmu;
1007
1008 context->new_cr3 = paging_new_cr3;
1009 context->page_fault = paging32_page_fault;
1010 context->gva_to_gpa = paging32_gva_to_gpa;
1011 context->free = paging_free;
1012 context->root_level = PT32_ROOT_LEVEL;
1013 context->shadow_root_level = PT32E_ROOT_LEVEL;
1014 context->root_hpa = INVALID_PAGE;
1015 return 0;
1016}
1017
1018static int paging32E_init_context(struct kvm_vcpu *vcpu)
1019{
1020 return paging64_init_context_common(vcpu, PT32E_ROOT_LEVEL);
1021}
1022
1023static int init_kvm_mmu(struct kvm_vcpu *vcpu)
1024{
1025 ASSERT(vcpu);
1026 ASSERT(!VALID_PAGE(vcpu->mmu.root_hpa));
1027
1028 if (!is_paging(vcpu))
1029 return nonpaging_init_context(vcpu);
1030 else if (is_long_mode(vcpu))
1031 return paging64_init_context(vcpu);
1032 else if (is_pae(vcpu))
1033 return paging32E_init_context(vcpu);
1034 else
1035 return paging32_init_context(vcpu);
1036}
1037
1038static void destroy_kvm_mmu(struct kvm_vcpu *vcpu)
1039{
1040 ASSERT(vcpu);
1041 if (VALID_PAGE(vcpu->mmu.root_hpa)) {
1042 vcpu->mmu.free(vcpu);
1043 vcpu->mmu.root_hpa = INVALID_PAGE;
1044 }
1045}
1046
1047int kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
1048{
1049 destroy_kvm_mmu(vcpu);
1050 return init_kvm_mmu(vcpu);
1051}
1052EXPORT_SYMBOL_GPL(kvm_mmu_reset_context);
1053
1054int kvm_mmu_load(struct kvm_vcpu *vcpu)
1055{
1056 int r;
1057
1058 mutex_lock(&vcpu->kvm->lock);
1059 r = mmu_topup_memory_caches(vcpu);
1060 if (r)
1061 goto out;
1062 mmu_alloc_roots(vcpu);
1063 kvm_x86_ops->set_cr3(vcpu, vcpu->mmu.root_hpa);
1064 kvm_mmu_flush_tlb(vcpu);
1065out:
1066 mutex_unlock(&vcpu->kvm->lock);
1067 return r;
1068}
1069EXPORT_SYMBOL_GPL(kvm_mmu_load);
1070
1071void kvm_mmu_unload(struct kvm_vcpu *vcpu)
1072{
1073 mmu_free_roots(vcpu);
1074}
1075
1076static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
1077 struct kvm_mmu_page *page,
1078 u64 *spte)
1079{
1080 u64 pte;
1081 struct kvm_mmu_page *child;
1082
1083 pte = *spte;
1084 if (is_present_pte(pte)) {
1085 if (page->role.level == PT_PAGE_TABLE_LEVEL)
1086 rmap_remove(spte);
1087 else {
1088 child = page_header(pte & PT64_BASE_ADDR_MASK);
1089 mmu_page_remove_parent_pte(child, spte);
1090 }
1091 }
1092 set_shadow_pte(spte, 0);
1093 kvm_flush_remote_tlbs(vcpu->kvm);
1094}
1095
1096static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
1097 struct kvm_mmu_page *page,
1098 u64 *spte,
1099 const void *new, int bytes)
1100{
1101 if (page->role.level != PT_PAGE_TABLE_LEVEL)
1102 return;
1103
1104 if (page->role.glevels == PT32_ROOT_LEVEL)
1105 paging32_update_pte(vcpu, page, spte, new, bytes);
1106 else
1107 paging64_update_pte(vcpu, page, spte, new, bytes);
1108}
1109
1110void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
1111 const u8 *new, int bytes)
1112{
1113 gfn_t gfn = gpa >> PAGE_SHIFT;
1114 struct kvm_mmu_page *page;
1115 struct hlist_node *node, *n;
1116 struct hlist_head *bucket;
1117 unsigned index;
1118 u64 *spte;
1119 unsigned offset = offset_in_page(gpa);
1120 unsigned pte_size;
1121 unsigned page_offset;
1122 unsigned misaligned;
1123 unsigned quadrant;
1124 int level;
1125 int flooded = 0;
1126 int npte;
1127
1128 pgprintk("%s: gpa %llx bytes %d\n", __FUNCTION__, gpa, bytes);
1129 if (gfn == vcpu->last_pt_write_gfn) {
1130 ++vcpu->last_pt_write_count;
1131 if (vcpu->last_pt_write_count >= 3)
1132 flooded = 1;
1133 } else {
1134 vcpu->last_pt_write_gfn = gfn;
1135 vcpu->last_pt_write_count = 1;
1136 }
1137 index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
1138 bucket = &vcpu->kvm->mmu_page_hash[index];
1139 hlist_for_each_entry_safe(page, node, n, bucket, hash_link) {
1140 if (page->gfn != gfn || page->role.metaphysical)
1141 continue;
1142 pte_size = page->role.glevels == PT32_ROOT_LEVEL ? 4 : 8;
1143 misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
1144 misaligned |= bytes < 4;
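		/*
		 * Editor's note (worked example of the check above, with
		 * pte_size = 8): offset = 4, bytes = 4 gives
		 * (4 ^ 7) & ~7 = 0, so the write stays inside one pte;
		 * offset = 6, bytes = 4 gives (6 ^ 9) & ~7 = 8, so the
		 * write straddles two ptes and counts as misaligned.
		 */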
1145 if (misaligned || flooded) {
1146 /*
1147 * Misaligned accesses are too much trouble to fix
1148 * up; also, they usually indicate a page is not used
1149 * as a page table.
1150 *
1151 * If we're seeing too many writes to a page,
1152 * it may no longer be a page table, or we may be
1153 * forking, in which case it is better to unmap the
1154 * page.
1155 */
1156 pgprintk("misaligned: gpa %llx bytes %d role %x\n",
1157 gpa, bytes, page->role.word);
1158 kvm_mmu_zap_page(vcpu->kvm, page);
1159 continue;
1160 }
1161 page_offset = offset;
1162 level = page->role.level;
1163 npte = 1;
1164 if (page->role.glevels == PT32_ROOT_LEVEL) {
1165 page_offset <<= 1; /* 32->64 */
1166 /*
1167 * A 32-bit pde maps 4MB while the shadow pdes map
1168 * only 2MB. So we need to double the offset again
1169 * and zap two pdes instead of one.
1170 */
1171 if (level == PT32_ROOT_LEVEL) {
1172 page_offset &= ~7; /* kill rounding error */
1173 page_offset <<= 1;
1174 npte = 2;
1175 }
1176 quadrant = page_offset >> PAGE_SHIFT;
1177 page_offset &= ~PAGE_MASK;
1178 if (quadrant != page->role.quadrant)
1179 continue;
1180 }
1181 spte = &page->spt[page_offset / sizeof(*spte)];
1182 while (npte--) {
1183 mmu_pte_write_zap_pte(vcpu, page, spte);
1184 mmu_pte_write_new_pte(vcpu, page, spte, new, bytes);
1185 ++spte;
1186 }
1187 }
1188}
1189
1190int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
1191{
1192 gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, gva);
1193
1194 return kvm_mmu_unprotect_page(vcpu, gpa >> PAGE_SHIFT);
1195}
1196
1197void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
1198{
1199 while (vcpu->kvm->n_free_mmu_pages < KVM_REFILL_PAGES) {
1200 struct kvm_mmu_page *page;
1201
1202 page = container_of(vcpu->kvm->active_mmu_pages.prev,
1203 struct kvm_mmu_page, link);
1204 kvm_mmu_zap_page(vcpu->kvm, page);
1205 }
1206}
1207
1208static void free_mmu_pages(struct kvm_vcpu *vcpu)
1209{
1210 struct kvm_mmu_page *page;
1211
1212 while (!list_empty(&vcpu->kvm->active_mmu_pages)) {
1213 page = container_of(vcpu->kvm->active_mmu_pages.next,
1214 struct kvm_mmu_page, link);
1215 kvm_mmu_zap_page(vcpu->kvm, page);
1216 }
1217 free_page((unsigned long)vcpu->mmu.pae_root);
1218}
1219
1220static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
1221{
1222 struct page *page;
1223 int i;
1224
1225 ASSERT(vcpu);
1226
1227 vcpu->kvm->n_free_mmu_pages = KVM_NUM_MMU_PAGES;
1228
1229 /*
1230 * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64.
1231 * Therefore we need to allocate shadow page tables in the first
1232 * 4GB of memory, which happens to fit the DMA32 zone.
1233 */
1234 page = alloc_page(GFP_KERNEL | __GFP_DMA32);
1235 if (!page)
1236 goto error_1;
1237 vcpu->mmu.pae_root = page_address(page);
1238 for (i = 0; i < 4; ++i)
1239 vcpu->mmu.pae_root[i] = INVALID_PAGE;
1240
1241 return 0;
1242
1243error_1:
1244 free_mmu_pages(vcpu);
1245 return -ENOMEM;
1246}
1247
1248int kvm_mmu_create(struct kvm_vcpu *vcpu)
1249{
1250 ASSERT(vcpu);
1251 ASSERT(!VALID_PAGE(vcpu->mmu.root_hpa));
1252
1253 return alloc_mmu_pages(vcpu);
1254}
1255
1256int kvm_mmu_setup(struct kvm_vcpu *vcpu)
1257{
1258 ASSERT(vcpu);
1259 ASSERT(!VALID_PAGE(vcpu->mmu.root_hpa));
1260
1261 return init_kvm_mmu(vcpu);
1262}
1263
1264void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
1265{
1266 ASSERT(vcpu);
1267
1268 destroy_kvm_mmu(vcpu);
1269 free_mmu_pages(vcpu);
1270 mmu_free_memory_caches(vcpu);
1271}
1272
1273void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
1274{
1275 struct kvm_mmu_page *page;
1276
1277 list_for_each_entry(page, &kvm->active_mmu_pages, link) {
1278 int i;
1279 u64 *pt;
1280
1281 if (!test_bit(slot, &page->slot_bitmap))
1282 continue;
1283
1284 pt = page->spt;
1285 for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
1286 /* avoid RMW */
1287 if (pt[i] & PT_WRITABLE_MASK) {
1288 rmap_remove(&pt[i]);
1289 pt[i] &= ~PT_WRITABLE_MASK;
1290 }
1291 }
1292}
1293
1294void kvm_mmu_zap_all(struct kvm *kvm)
1295{
1296 struct kvm_mmu_page *page, *node;
1297
1298 list_for_each_entry_safe(page, node, &kvm->active_mmu_pages, link)
1299 kvm_mmu_zap_page(kvm, page);
1300
1301 kvm_flush_remote_tlbs(kvm);
1302}
1303
1304void kvm_mmu_module_exit(void)
1305{
1306 if (pte_chain_cache)
1307 kmem_cache_destroy(pte_chain_cache);
1308 if (rmap_desc_cache)
1309 kmem_cache_destroy(rmap_desc_cache);
1310 if (mmu_page_header_cache)
1311 kmem_cache_destroy(mmu_page_header_cache);
1312}
1313
1314int kvm_mmu_module_init(void)
1315{
1316 pte_chain_cache = kmem_cache_create("kvm_pte_chain",
1317 sizeof(struct kvm_pte_chain),
1318 0, 0, NULL);
1319 if (!pte_chain_cache)
1320 goto nomem;
1321 rmap_desc_cache = kmem_cache_create("kvm_rmap_desc",
1322 sizeof(struct kvm_rmap_desc),
1323 0, 0, NULL);
1324 if (!rmap_desc_cache)
1325 goto nomem;
1326
1327 mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header",
1328 sizeof(struct kvm_mmu_page),
1329 0, 0, NULL);
1330 if (!mmu_page_header_cache)
1331 goto nomem;
1332
1333 return 0;
1334
1335nomem:
1336 kvm_mmu_module_exit();
1337 return -ENOMEM;
1338}
1339
1340#ifdef AUDIT
1341
1342static const char *audit_msg;
1343
1344static gva_t canonicalize(gva_t gva)
1345{
1346#ifdef CONFIG_X86_64
1347 gva = (long long)(gva << 16) >> 16;
1348#endif
1349 return gva;
1350}
1351
1352static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
1353 gva_t va, int level)
1354{
1355 u64 *pt = __va(page_pte & PT64_BASE_ADDR_MASK);
1356 int i;
1357 gva_t va_delta = 1ul << (PAGE_SHIFT + 9 * (level - 1));
1358
1359 for (i = 0; i < PT64_ENT_PER_PAGE; ++i, va += va_delta) {
1360 u64 ent = pt[i];
1361
1362 if (!(ent & PT_PRESENT_MASK))
1363 continue;
1364
1365 va = canonicalize(va);
1366 if (level > 1)
1367 audit_mappings_page(vcpu, ent, va, level - 1);
1368 else {
1369 gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, va);
1370 hpa_t hpa = gpa_to_hpa(vcpu, gpa);
1371
1372 if ((ent & PT_PRESENT_MASK)
1373 && (ent & PT64_BASE_ADDR_MASK) != hpa)
1374 printk(KERN_ERR "audit error: (%s) levels %d"
1375 " gva %lx gpa %llx hpa %llx ent %llx\n",
1376 audit_msg, vcpu->mmu.root_level,
1377 va, gpa, hpa, ent);
1378 }
1379 }
1380}
1381
1382static void audit_mappings(struct kvm_vcpu *vcpu)
1383{
1384 unsigned i;
1385
1386 if (vcpu->mmu.root_level == 4)
1387 audit_mappings_page(vcpu, vcpu->mmu.root_hpa, 0, 4);
1388 else
1389 for (i = 0; i < 4; ++i)
1390 if (vcpu->mmu.pae_root[i] & PT_PRESENT_MASK)
1391 audit_mappings_page(vcpu,
1392 vcpu->mmu.pae_root[i],
1393 i << 30,
1394 2);
1395}
1396
1397static int count_rmaps(struct kvm_vcpu *vcpu)
1398{
1399 int nmaps = 0;
1400 int i, j, k;
1401
1402 for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
1403 struct kvm_memory_slot *m = &vcpu->kvm->memslots[i];
1404 struct kvm_rmap_desc *d;
1405
1406 for (j = 0; j < m->npages; ++j) {
1407 struct page *page = m->phys_mem[j];
1408
1409 if (!page->private)
1410 continue;
1411 if (!(page->private & 1)) {
1412 ++nmaps;
1413 continue;
1414 }
1415 d = (struct kvm_rmap_desc *)(page->private & ~1ul);
1416 while (d) {
1417 for (k = 0; k < RMAP_EXT; ++k)
1418 if (d->shadow_ptes[k])
1419 ++nmaps;
1420 else
1421 break;
1422 d = d->more;
1423 }
1424 }
1425 }
1426 return nmaps;
1427}
1428
1429static int count_writable_mappings(struct kvm_vcpu *vcpu)
1430{
1431 int nmaps = 0;
1432 struct kvm_mmu_page *page;
1433 int i;
1434
1435 list_for_each_entry(page, &vcpu->kvm->active_mmu_pages, link) {
1436 u64 *pt = page->spt;
1437
1438 if (page->role.level != PT_PAGE_TABLE_LEVEL)
1439 continue;
1440
1441 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
1442 u64 ent = pt[i];
1443
1444 if (!(ent & PT_PRESENT_MASK))
1445 continue;
1446 if (!(ent & PT_WRITABLE_MASK))
1447 continue;
1448 ++nmaps;
1449 }
1450 }
1451 return nmaps;
1452}
1453
1454static void audit_rmap(struct kvm_vcpu *vcpu)
1455{
1456 int n_rmap = count_rmaps(vcpu);
1457 int n_actual = count_writable_mappings(vcpu);
1458
1459 if (n_rmap != n_actual)
1460 printk(KERN_ERR "%s: (%s) rmap %d actual %d\n",
1461 __FUNCTION__, audit_msg, n_rmap, n_actual);
1462}
1463
1464static void audit_write_protection(struct kvm_vcpu *vcpu)
1465{
1466 struct kvm_mmu_page *page;
1467
1468 list_for_each_entry(page, &vcpu->kvm->active_mmu_pages, link) {
1469 hfn_t hfn;
1470 struct page *pg;
1471
1472 if (page->role.metaphysical)
1473 continue;
1474
1475 hfn = gpa_to_hpa(vcpu, (gpa_t)page->gfn << PAGE_SHIFT)
1476 >> PAGE_SHIFT;
1477 pg = pfn_to_page(hfn);
1478 if (pg->private)
1479 printk(KERN_ERR "%s: (%s) shadow page has writable"
1480 " mappings: gfn %lx role %x\n",
1481 __FUNCTION__, audit_msg, page->gfn,
1482 page->role.word);
1483 }
1484}
1485
1486static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg)
1487{
1488 int olddbg = dbg;
1489
1490 dbg = 0;
1491 audit_msg = msg;
1492 audit_rmap(vcpu);
1493 audit_write_protection(vcpu);
1494 audit_mappings(vcpu);
1495 dbg = olddbg;
1496}
1497
1498#endif
diff --git a/drivers/kvm/paging_tmpl.h b/drivers/kvm/paging_tmpl.h
deleted file mode 100644
index 6b094b44f8fb..000000000000
--- a/drivers/kvm/paging_tmpl.h
+++ /dev/null
@@ -1,511 +0,0 @@
1/*
2 * Kernel-based Virtual Machine driver for Linux
3 *
4 * This module enables machines with Intel VT-x extensions to run virtual
5 * machines without emulation or binary translation.
6 *
7 * MMU support
8 *
9 * Copyright (C) 2006 Qumranet, Inc.
10 *
11 * Authors:
12 * Yaniv Kamay <yaniv@qumranet.com>
13 * Avi Kivity <avi@qumranet.com>
14 *
15 * This work is licensed under the terms of the GNU GPL, version 2. See
16 * the COPYING file in the top-level directory.
17 *
18 */
19
20/*
21 * We need the mmu code to access both 32-bit and 64-bit guest ptes,
22 * so the code in this file is compiled twice, once per pte size.
23 */
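/*
 * Editor's note: with PTTYPE == 64 the FNAME() macro below expands, for
 * example, FNAME(walk_addr) into paging64_walk_addr(), and the PTTYPE == 32
 * pass produces paging32_walk_addr(); mmu.c includes this header twice,
 * once per PTTYPE, to generate both variants.
 */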
24
25#if PTTYPE == 64
26 #define pt_element_t u64
27 #define guest_walker guest_walker64
28 #define FNAME(name) paging##64_##name
29 #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK
30 #define PT_DIR_BASE_ADDR_MASK PT64_DIR_BASE_ADDR_MASK
31 #define PT_INDEX(addr, level) PT64_INDEX(addr, level)
32 #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
33 #define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level)
34 #ifdef CONFIG_X86_64
35 #define PT_MAX_FULL_LEVELS 4
36 #else
37 #define PT_MAX_FULL_LEVELS 2
38 #endif
39#elif PTTYPE == 32
40 #define pt_element_t u32
41 #define guest_walker guest_walker32
42 #define FNAME(name) paging##32_##name
43 #define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK
44 #define PT_DIR_BASE_ADDR_MASK PT32_DIR_BASE_ADDR_MASK
45 #define PT_INDEX(addr, level) PT32_INDEX(addr, level)
46 #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
47 #define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level)
48 #define PT_MAX_FULL_LEVELS 2
49#else
50 #error Invalid PTTYPE value
51#endif
52
53/*
54 * The guest_walker structure emulates the behavior of the hardware page
55 * table walker.
56 */
57struct guest_walker {
58 int level;
59 gfn_t table_gfn[PT_MAX_FULL_LEVELS];
60 pt_element_t *table;
61 pt_element_t pte;
62 pt_element_t *ptep;
63 struct page *page;
64 int index;
65 pt_element_t inherited_ar;
66 gfn_t gfn;
67 u32 error_code;
68};
69
70/*
71 * Fetch a guest pte for a guest virtual address
72 */
73static int FNAME(walk_addr)(struct guest_walker *walker,
74 struct kvm_vcpu *vcpu, gva_t addr,
75 int write_fault, int user_fault, int fetch_fault)
76{
77 hpa_t hpa;
78 struct kvm_memory_slot *slot;
79 pt_element_t *ptep;
80 pt_element_t root;
81 gfn_t table_gfn;
82
83 pgprintk("%s: addr %lx\n", __FUNCTION__, addr);
84 walker->level = vcpu->mmu.root_level;
85 walker->table = NULL;
86 walker->page = NULL;
87 walker->ptep = NULL;
88 root = vcpu->cr3;
89#if PTTYPE == 64
90 if (!is_long_mode(vcpu)) {
91 walker->ptep = &vcpu->pdptrs[(addr >> 30) & 3];
92 root = *walker->ptep;
93 walker->pte = root;
94 if (!(root & PT_PRESENT_MASK))
95 goto not_present;
96 --walker->level;
97 }
98#endif
99 table_gfn = (root & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
100 walker->table_gfn[walker->level - 1] = table_gfn;
101 pgprintk("%s: table_gfn[%d] %lx\n", __FUNCTION__,
102 walker->level - 1, table_gfn);
103 slot = gfn_to_memslot(vcpu->kvm, table_gfn);
104 hpa = safe_gpa_to_hpa(vcpu, root & PT64_BASE_ADDR_MASK);
105 walker->page = pfn_to_page(hpa >> PAGE_SHIFT);
106 walker->table = kmap_atomic(walker->page, KM_USER0);
107
108 ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) ||
109 (vcpu->cr3 & CR3_NONPAE_RESERVED_BITS) == 0);
110
111 walker->inherited_ar = PT_USER_MASK | PT_WRITABLE_MASK;
112
113 for (;;) {
114 int index = PT_INDEX(addr, walker->level);
115 hpa_t paddr;
116
117 ptep = &walker->table[index];
118 walker->index = index;
119 ASSERT(((unsigned long)walker->table & PAGE_MASK) ==
120 ((unsigned long)ptep & PAGE_MASK));
121
122 if (!is_present_pte(*ptep))
123 goto not_present;
124
125 if (write_fault && !is_writeble_pte(*ptep))
126 if (user_fault || is_write_protection(vcpu))
127 goto access_error;
128
129 if (user_fault && !(*ptep & PT_USER_MASK))
130 goto access_error;
131
132#if PTTYPE == 64
133 if (fetch_fault && is_nx(vcpu) && (*ptep & PT64_NX_MASK))
134 goto access_error;
135#endif
136
137 if (!(*ptep & PT_ACCESSED_MASK)) {
138 mark_page_dirty(vcpu->kvm, table_gfn);
139 *ptep |= PT_ACCESSED_MASK;
140 }
141
142 if (walker->level == PT_PAGE_TABLE_LEVEL) {
143 walker->gfn = (*ptep & PT_BASE_ADDR_MASK)
144 >> PAGE_SHIFT;
145 break;
146 }
147
148 if (walker->level == PT_DIRECTORY_LEVEL
149 && (*ptep & PT_PAGE_SIZE_MASK)
150 && (PTTYPE == 64 || is_pse(vcpu))) {
151 walker->gfn = (*ptep & PT_DIR_BASE_ADDR_MASK)
152 >> PAGE_SHIFT;
153 walker->gfn += PT_INDEX(addr, PT_PAGE_TABLE_LEVEL);
154 break;
155 }
156
157 walker->inherited_ar &= walker->table[index];
158 table_gfn = (*ptep & PT_BASE_ADDR_MASK) >> PAGE_SHIFT;
159 kunmap_atomic(walker->table, KM_USER0);
160 paddr = safe_gpa_to_hpa(vcpu, table_gfn << PAGE_SHIFT);
161 walker->page = pfn_to_page(paddr >> PAGE_SHIFT);
162 walker->table = kmap_atomic(walker->page, KM_USER0);
163 --walker->level;
164		walker->table_gfn[walker->level - 1] = table_gfn;
165 pgprintk("%s: table_gfn[%d] %lx\n", __FUNCTION__,
166 walker->level - 1, table_gfn);
167 }
168 walker->pte = *ptep;
169 if (walker->page)
170 walker->ptep = NULL;
171 if (walker->table)
172 kunmap_atomic(walker->table, KM_USER0);
173 pgprintk("%s: pte %llx\n", __FUNCTION__, (u64)*ptep);
174 return 1;
175
176not_present:
177 walker->error_code = 0;
178 goto err;
179
180access_error:
181 walker->error_code = PFERR_PRESENT_MASK;
182
183err:
184 if (write_fault)
185 walker->error_code |= PFERR_WRITE_MASK;
186 if (user_fault)
187 walker->error_code |= PFERR_USER_MASK;
188 if (fetch_fault)
189 walker->error_code |= PFERR_FETCH_MASK;
190 if (walker->table)
191 kunmap_atomic(walker->table, KM_USER0);
192 return 0;
193}
194
195static void FNAME(mark_pagetable_dirty)(struct kvm *kvm,
196 struct guest_walker *walker)
197{
198 mark_page_dirty(kvm, walker->table_gfn[walker->level - 1]);
199}
200
201static void FNAME(set_pte_common)(struct kvm_vcpu *vcpu,
202 u64 *shadow_pte,
203 gpa_t gaddr,
204 pt_element_t gpte,
205 u64 access_bits,
206 int user_fault,
207 int write_fault,
208 int *ptwrite,
209 struct guest_walker *walker,
210 gfn_t gfn)
211{
212 hpa_t paddr;
213 int dirty = gpte & PT_DIRTY_MASK;
214 u64 spte = *shadow_pte;
215 int was_rmapped = is_rmap_pte(spte);
216
217 pgprintk("%s: spte %llx gpte %llx access %llx write_fault %d"
218 " user_fault %d gfn %lx\n",
219 __FUNCTION__, spte, (u64)gpte, access_bits,
220 write_fault, user_fault, gfn);
221
222 if (write_fault && !dirty) {
223 pt_element_t *guest_ent, *tmp = NULL;
224
225 if (walker->ptep)
226 guest_ent = walker->ptep;
227 else {
228 tmp = kmap_atomic(walker->page, KM_USER0);
229 guest_ent = &tmp[walker->index];
230 }
231
232 *guest_ent |= PT_DIRTY_MASK;
233 if (!walker->ptep)
234 kunmap_atomic(tmp, KM_USER0);
235 dirty = 1;
236 FNAME(mark_pagetable_dirty)(vcpu->kvm, walker);
237 }
238
239 spte |= PT_PRESENT_MASK | PT_ACCESSED_MASK | PT_DIRTY_MASK;
240 spte |= gpte & PT64_NX_MASK;
241 if (!dirty)
242 access_bits &= ~PT_WRITABLE_MASK;
243
244 paddr = gpa_to_hpa(vcpu, gaddr & PT64_BASE_ADDR_MASK);
245
246 spte |= PT_PRESENT_MASK;
247 if (access_bits & PT_USER_MASK)
248 spte |= PT_USER_MASK;
249
250 if (is_error_hpa(paddr)) {
251 spte |= gaddr;
252 spte |= PT_SHADOW_IO_MARK;
253 spte &= ~PT_PRESENT_MASK;
254 set_shadow_pte(shadow_pte, spte);
255 return;
256 }
257
258 spte |= paddr;
259
260 if ((access_bits & PT_WRITABLE_MASK)
261 || (write_fault && !is_write_protection(vcpu) && !user_fault)) {
262 struct kvm_mmu_page *shadow;
263
264 spte |= PT_WRITABLE_MASK;
265 if (user_fault) {
266 mmu_unshadow(vcpu, gfn);
267 goto unshadowed;
268 }
269
270 shadow = kvm_mmu_lookup_page(vcpu, gfn);
271 if (shadow) {
272 pgprintk("%s: found shadow page for %lx, marking ro\n",
273 __FUNCTION__, gfn);
274 access_bits &= ~PT_WRITABLE_MASK;
275 if (is_writeble_pte(spte)) {
276 spte &= ~PT_WRITABLE_MASK;
277 kvm_x86_ops->tlb_flush(vcpu);
278 }
279 if (write_fault)
280 *ptwrite = 1;
281 }
282 }
283
284unshadowed:
285
286 if (access_bits & PT_WRITABLE_MASK)
287 mark_page_dirty(vcpu->kvm, gaddr >> PAGE_SHIFT);
288
289 set_shadow_pte(shadow_pte, spte);
290 page_header_update_slot(vcpu->kvm, shadow_pte, gaddr);
291 if (!was_rmapped)
292 rmap_add(vcpu, shadow_pte);
293}
294
295static void FNAME(set_pte)(struct kvm_vcpu *vcpu, pt_element_t gpte,
296 u64 *shadow_pte, u64 access_bits,
297 int user_fault, int write_fault, int *ptwrite,
298 struct guest_walker *walker, gfn_t gfn)
299{
300 access_bits &= gpte;
301 FNAME(set_pte_common)(vcpu, shadow_pte, gpte & PT_BASE_ADDR_MASK,
302 gpte, access_bits, user_fault, write_fault,
303 ptwrite, walker, gfn);
304}
305
306static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
307 u64 *spte, const void *pte, int bytes)
308{
309 pt_element_t gpte;
310
311 if (bytes < sizeof(pt_element_t))
312 return;
313 gpte = *(const pt_element_t *)pte;
314 if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK))
315 return;
316 pgprintk("%s: gpte %llx spte %p\n", __FUNCTION__, (u64)gpte, spte);
317 FNAME(set_pte)(vcpu, gpte, spte, PT_USER_MASK | PT_WRITABLE_MASK, 0,
318 0, NULL, NULL,
319 (gpte & PT_BASE_ADDR_MASK) >> PAGE_SHIFT);
320}
321
322static void FNAME(set_pde)(struct kvm_vcpu *vcpu, pt_element_t gpde,
323 u64 *shadow_pte, u64 access_bits,
324 int user_fault, int write_fault, int *ptwrite,
325 struct guest_walker *walker, gfn_t gfn)
326{
327 gpa_t gaddr;
328
329 access_bits &= gpde;
330 gaddr = (gpa_t)gfn << PAGE_SHIFT;
331 if (PTTYPE == 32 && is_cpuid_PSE36())
332 gaddr |= (gpde & PT32_DIR_PSE36_MASK) <<
333 (32 - PT32_DIR_PSE36_SHIFT);
334 FNAME(set_pte_common)(vcpu, shadow_pte, gaddr,
335 gpde, access_bits, user_fault, write_fault,
336 ptwrite, walker, gfn);
337}
338
339/*
340 * Fetch a shadow pte for a specific level in the paging hierarchy.
341 */
342static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
343 struct guest_walker *walker,
344 int user_fault, int write_fault, int *ptwrite)
345{
346 hpa_t shadow_addr;
347 int level;
348 u64 *shadow_ent;
349 u64 *prev_shadow_ent = NULL;
350
351 if (!is_present_pte(walker->pte))
352 return NULL;
353
354 shadow_addr = vcpu->mmu.root_hpa;
355 level = vcpu->mmu.shadow_root_level;
356 if (level == PT32E_ROOT_LEVEL) {
357 shadow_addr = vcpu->mmu.pae_root[(addr >> 30) & 3];
358 shadow_addr &= PT64_BASE_ADDR_MASK;
359 --level;
360 }
361
362 for (; ; level--) {
363 u32 index = SHADOW_PT_INDEX(addr, level);
364 struct kvm_mmu_page *shadow_page;
365 u64 shadow_pte;
366 int metaphysical;
367 gfn_t table_gfn;
368 unsigned hugepage_access = 0;
369
370 shadow_ent = ((u64 *)__va(shadow_addr)) + index;
371 if (is_present_pte(*shadow_ent) || is_io_pte(*shadow_ent)) {
372 if (level == PT_PAGE_TABLE_LEVEL)
373 break;
374 shadow_addr = *shadow_ent & PT64_BASE_ADDR_MASK;
375 prev_shadow_ent = shadow_ent;
376 continue;
377 }
378
379 if (level == PT_PAGE_TABLE_LEVEL)
380 break;
381
382 if (level - 1 == PT_PAGE_TABLE_LEVEL
383 && walker->level == PT_DIRECTORY_LEVEL) {
384 metaphysical = 1;
385 hugepage_access = walker->pte;
386 hugepage_access &= PT_USER_MASK | PT_WRITABLE_MASK;
387 if (walker->pte & PT64_NX_MASK)
388 hugepage_access |= (1 << 2);
389 hugepage_access >>= PT_WRITABLE_SHIFT;
390 table_gfn = (walker->pte & PT_BASE_ADDR_MASK)
391 >> PAGE_SHIFT;
392 } else {
393 metaphysical = 0;
394 table_gfn = walker->table_gfn[level - 2];
395 }
396 shadow_page = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1,
397 metaphysical, hugepage_access,
398 shadow_ent);
399 shadow_addr = __pa(shadow_page->spt);
400 shadow_pte = shadow_addr | PT_PRESENT_MASK | PT_ACCESSED_MASK
401 | PT_WRITABLE_MASK | PT_USER_MASK;
402 *shadow_ent = shadow_pte;
403 prev_shadow_ent = shadow_ent;
404 }
405
406 if (walker->level == PT_DIRECTORY_LEVEL) {
407 FNAME(set_pde)(vcpu, walker->pte, shadow_ent,
408 walker->inherited_ar, user_fault, write_fault,
409 ptwrite, walker, walker->gfn);
410 } else {
411 ASSERT(walker->level == PT_PAGE_TABLE_LEVEL);
412 FNAME(set_pte)(vcpu, walker->pte, shadow_ent,
413 walker->inherited_ar, user_fault, write_fault,
414 ptwrite, walker, walker->gfn);
415 }
416 return shadow_ent;
417}
418
419/*
420 * Page fault handler. There are several causes for a page fault:
421 * - there is no shadow pte for the guest pte
422 * - write access through a shadow pte marked read only so that we can set
423 * the dirty bit
424 * - write access to a shadow pte marked read only so we can update the page
425 * dirty bitmap, when userspace requests it
426 * - mmio access; in this case we will never install a present shadow pte
427 * - normal guest page fault due to the guest pte marked not present, not
428 * writable, or not executable
429 *
430 * Returns: 1 if we need to emulate the instruction, 0 otherwise, or
431 * a negative value on error.
432 */
433static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
434 u32 error_code)
435{
436 int write_fault = error_code & PFERR_WRITE_MASK;
437 int user_fault = error_code & PFERR_USER_MASK;
438 int fetch_fault = error_code & PFERR_FETCH_MASK;
439 struct guest_walker walker;
440 u64 *shadow_pte;
441 int write_pt = 0;
442 int r;
443
444 pgprintk("%s: addr %lx err %x\n", __FUNCTION__, addr, error_code);
445 kvm_mmu_audit(vcpu, "pre page fault");
446
447 r = mmu_topup_memory_caches(vcpu);
448 if (r)
449 return r;
450
451 /*
452 * Look up the shadow pte for the faulting address.
453 */
454 r = FNAME(walk_addr)(&walker, vcpu, addr, write_fault, user_fault,
455 fetch_fault);
456
457 /*
458 * The page is not mapped by the guest. Let the guest handle it.
459 */
460 if (!r) {
461 pgprintk("%s: guest page fault\n", __FUNCTION__);
462 inject_page_fault(vcpu, addr, walker.error_code);
463 vcpu->last_pt_write_count = 0; /* reset fork detector */
464 return 0;
465 }
466
467 shadow_pte = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
468 &write_pt);
469 pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __FUNCTION__,
470 shadow_pte, *shadow_pte, write_pt);
471
472 if (!write_pt)
473 vcpu->last_pt_write_count = 0; /* reset fork detector */
474
475 /*
476	 * mmio: emulate if accessible, otherwise it's a guest fault.
477 */
478 if (is_io_pte(*shadow_pte))
479 return 1;
480
481 ++vcpu->stat.pf_fixed;
482 kvm_mmu_audit(vcpu, "post page fault (fixed)");
483
484 return write_pt;
485}
486
487static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr)
488{
489 struct guest_walker walker;
490 gpa_t gpa = UNMAPPED_GVA;
491 int r;
492
493 r = FNAME(walk_addr)(&walker, vcpu, vaddr, 0, 0, 0);
494
495 if (r) {
496 gpa = (gpa_t)walker.gfn << PAGE_SHIFT;
497 gpa |= vaddr & ~PAGE_MASK;
498 }
499
500 return gpa;
501}
502
503#undef pt_element_t
504#undef guest_walker
505#undef FNAME
506#undef PT_BASE_ADDR_MASK
507#undef PT_INDEX
508#undef SHADOW_PT_INDEX
509#undef PT_LEVEL_MASK
510#undef PT_DIR_BASE_ADDR_MASK
511#undef PT_MAX_FULL_LEVELS
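
The PT_INDEX()/SHADOW_PT_INDEX() macros used by walk_addr() and fetch() above slice a guest virtual address into one table index per paging level. A standalone sketch of that arithmetic for the 64-bit/PAE case (4 KB pages, 9 index bits per level); the 32-bit instantiation of this template uses 10-bit indices instead, and the names below are local to the example.

    #include <stdint.h>

    #define PAGE_SHIFT_4K   12      /* 4 KB pages */
    #define LEVEL_BITS_64    9      /* 512 entries per table */

    /* Index into the page table at 'level' (1 = lowest, the pte level). */
    static unsigned int pt64_index(uint64_t addr, int level)
    {
            unsigned int shift = PAGE_SHIFT_4K + (level - 1) * LEVEL_BITS_64;

            return (addr >> shift) & ((1U << LEVEL_BITS_64) - 1);
    }

For instance, pt64_index(addr, 2) picks the page-directory entry, which corresponds to the PT_DIRECTORY_LEVEL handling in walk_addr().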
diff --git a/drivers/kvm/segment_descriptor.h b/drivers/kvm/segment_descriptor.h
deleted file mode 100644
index 71fdf458619a..000000000000
--- a/drivers/kvm/segment_descriptor.h
+++ /dev/null
@@ -1,17 +0,0 @@
1struct segment_descriptor {
2 u16 limit_low;
3 u16 base_low;
4 u8 base_mid;
5 u8 type : 4;
6 u8 system : 1;
7 u8 dpl : 2;
8 u8 present : 1;
9 u8 limit_high : 4;
10 u8 avl : 1;
11 u8 long_mode : 1;
12 u8 default_op : 1;
13 u8 granularity : 1;
14 u8 base_high;
15} __attribute__((packed));
16
17
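The bit-fields above are a direct image of the 8-byte x86 segment descriptor, so the base and limit come back in pieces. A self-contained sketch of the reassembly (the struct is repeated locally with stdint types so the example compiles on its own):

    #include <stdint.h>

    struct segment_descriptor {
            uint16_t limit_low;
            uint16_t base_low;
            uint8_t  base_mid;
            uint8_t  type : 4, system : 1, dpl : 2, present : 1;
            uint8_t  limit_high : 4, avl : 1, long_mode : 1,
                     default_op : 1, granularity : 1;
            uint8_t  base_high;
    } __attribute__((packed));

    /* The 32-bit base is split across base_low/base_mid/base_high. */
    static uint32_t seg_base(const struct segment_descriptor *d)
    {
            return d->base_low | ((uint32_t)d->base_mid << 16) |
                   ((uint32_t)d->base_high << 24);
    }

    /* 20-bit limit; scaled by 4 KB when the granularity bit is set. */
    static uint32_t seg_limit(const struct segment_descriptor *d)
    {
            uint32_t limit = d->limit_low | ((uint32_t)d->limit_high << 16);

            return d->granularity ? (limit << 12) | 0xfff : limit;
    }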
diff --git a/drivers/kvm/svm.c b/drivers/kvm/svm.c
deleted file mode 100644
index ced4ac1955db..000000000000
--- a/drivers/kvm/svm.c
+++ /dev/null
@@ -1,1754 +0,0 @@
1/*
2 * Kernel-based Virtual Machine driver for Linux
3 *
4 * AMD SVM support
5 *
6 * Copyright (C) 2006 Qumranet, Inc.
7 *
8 * Authors:
9 * Yaniv Kamay <yaniv@qumranet.com>
10 * Avi Kivity <avi@qumranet.com>
11 *
12 * This work is licensed under the terms of the GNU GPL, version 2. See
13 * the COPYING file in the top-level directory.
14 *
15 */
16
17#include "kvm_svm.h"
18#include "x86_emulate.h"
19#include "irq.h"
20
21#include <linux/module.h>
22#include <linux/kernel.h>
23#include <linux/vmalloc.h>
24#include <linux/highmem.h>
25#include <linux/sched.h>
26
27#include <asm/desc.h>
28
29MODULE_AUTHOR("Qumranet");
30MODULE_LICENSE("GPL");
31
32#define IOPM_ALLOC_ORDER 2
33#define MSRPM_ALLOC_ORDER 1
34
35#define DB_VECTOR 1
36#define UD_VECTOR 6
37#define GP_VECTOR 13
38
39#define DR7_GD_MASK (1 << 13)
40#define DR6_BD_MASK (1 << 13)
41
42#define SEG_TYPE_LDT 2
43#define SEG_TYPE_BUSY_TSS16 3
44
45#define KVM_EFER_LMA (1 << 10)
46#define KVM_EFER_LME (1 << 8)
47
48#define SVM_FEATURE_NPT (1 << 0)
49#define SVM_FEATURE_LBRV (1 << 1)
50#define SVM_FEATURE_SVML (1 << 2)
51
52static void kvm_reput_irq(struct vcpu_svm *svm);
53
54static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu)
55{
56 return container_of(vcpu, struct vcpu_svm, vcpu);
57}
58
59unsigned long iopm_base;
60unsigned long msrpm_base;
61
62struct kvm_ldttss_desc {
63 u16 limit0;
64 u16 base0;
65 unsigned base1 : 8, type : 5, dpl : 2, p : 1;
66 unsigned limit1 : 4, zero0 : 3, g : 1, base2 : 8;
67 u32 base3;
68 u32 zero1;
69} __attribute__((packed));
70
71struct svm_cpu_data {
72 int cpu;
73
74 u64 asid_generation;
75 u32 max_asid;
76 u32 next_asid;
77 struct kvm_ldttss_desc *tss_desc;
78
79 struct page *save_area;
80};
81
82static DEFINE_PER_CPU(struct svm_cpu_data *, svm_data);
83static uint32_t svm_features;
84
85struct svm_init_data {
86 int cpu;
87 int r;
88};
89
90static u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000};
91
92#define NUM_MSR_MAPS ARRAY_SIZE(msrpm_ranges)
93#define MSRS_RANGE_SIZE 2048
94#define MSRS_IN_RANGE (MSRS_RANGE_SIZE * 8 / 2)
95
96#define MAX_INST_SIZE 15
97
98static inline u32 svm_has(u32 feat)
99{
100 return svm_features & feat;
101}
102
103static inline u8 pop_irq(struct kvm_vcpu *vcpu)
104{
105 int word_index = __ffs(vcpu->irq_summary);
106 int bit_index = __ffs(vcpu->irq_pending[word_index]);
107 int irq = word_index * BITS_PER_LONG + bit_index;
108
109 clear_bit(bit_index, &vcpu->irq_pending[word_index]);
110 if (!vcpu->irq_pending[word_index])
111 clear_bit(word_index, &vcpu->irq_summary);
112 return irq;
113}
114
115static inline void push_irq(struct kvm_vcpu *vcpu, u8 irq)
116{
117 set_bit(irq, vcpu->irq_pending);
118 set_bit(irq / BITS_PER_LONG, &vcpu->irq_summary);
119}
120
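pop_irq()/push_irq() above keep pending vectors in a two-level bitmap: one bit per vector in irq_pending[], plus a summary word with one bit per irq_pending word, so finding the lowest pending vector is two find-first-set operations. A standalone sketch of the same pop operation (sizes and names are illustrative):

    #define NR_VECTORS 256
    #define BITS_PER_LONG (8 * sizeof(unsigned long))
    #define NR_WORDS (NR_VECTORS / BITS_PER_LONG)

    struct pending_irqs {
            unsigned long summary;             /* bit i set => pending[i] != 0 */
            unsigned long pending[NR_WORDS];   /* one bit per vector */
    };

    /* Remove and return the lowest pending vector, or -1 if none. */
    static int pop_lowest(struct pending_irqs *p)
    {
            unsigned long word, bit;

            if (!p->summary)
                    return -1;
            word = __builtin_ctzl(p->summary);       /* lowest non-empty word */
            bit = __builtin_ctzl(p->pending[word]);  /* lowest vector in that word */
            p->pending[word] &= ~(1UL << bit);
            if (!p->pending[word])
                    p->summary &= ~(1UL << word);
            return word * BITS_PER_LONG + bit;
    }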
121static inline void clgi(void)
122{
123 asm volatile (SVM_CLGI);
124}
125
126static inline void stgi(void)
127{
128 asm volatile (SVM_STGI);
129}
130
131static inline void invlpga(unsigned long addr, u32 asid)
132{
133 asm volatile (SVM_INVLPGA :: "a"(addr), "c"(asid));
134}
135
136static inline unsigned long kvm_read_cr2(void)
137{
138 unsigned long cr2;
139
140 asm volatile ("mov %%cr2, %0" : "=r" (cr2));
141 return cr2;
142}
143
144static inline void kvm_write_cr2(unsigned long val)
145{
146 asm volatile ("mov %0, %%cr2" :: "r" (val));
147}
148
149static inline unsigned long read_dr6(void)
150{
151 unsigned long dr6;
152
153 asm volatile ("mov %%dr6, %0" : "=r" (dr6));
154 return dr6;
155}
156
157static inline void write_dr6(unsigned long val)
158{
159 asm volatile ("mov %0, %%dr6" :: "r" (val));
160}
161
162static inline unsigned long read_dr7(void)
163{
164 unsigned long dr7;
165
166 asm volatile ("mov %%dr7, %0" : "=r" (dr7));
167 return dr7;
168}
169
170static inline void write_dr7(unsigned long val)
171{
172 asm volatile ("mov %0, %%dr7" :: "r" (val));
173}
174
175static inline void force_new_asid(struct kvm_vcpu *vcpu)
176{
177 to_svm(vcpu)->asid_generation--;
178}
179
180static inline void flush_guest_tlb(struct kvm_vcpu *vcpu)
181{
182 force_new_asid(vcpu);
183}
184
185static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
186{
187 if (!(efer & KVM_EFER_LMA))
188 efer &= ~KVM_EFER_LME;
189
190 to_svm(vcpu)->vmcb->save.efer = efer | MSR_EFER_SVME_MASK;
191 vcpu->shadow_efer = efer;
192}
193
194static void svm_inject_gp(struct kvm_vcpu *vcpu, unsigned error_code)
195{
196 struct vcpu_svm *svm = to_svm(vcpu);
197
198 svm->vmcb->control.event_inj = SVM_EVTINJ_VALID |
199 SVM_EVTINJ_VALID_ERR |
200 SVM_EVTINJ_TYPE_EXEPT |
201 GP_VECTOR;
202 svm->vmcb->control.event_inj_err = error_code;
203}
204
205static void inject_ud(struct kvm_vcpu *vcpu)
206{
207 to_svm(vcpu)->vmcb->control.event_inj = SVM_EVTINJ_VALID |
208 SVM_EVTINJ_TYPE_EXEPT |
209 UD_VECTOR;
210}
211
212static int is_page_fault(uint32_t info)
213{
214 info &= SVM_EVTINJ_VEC_MASK | SVM_EVTINJ_TYPE_MASK | SVM_EVTINJ_VALID;
215 return info == (PF_VECTOR | SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_EXEPT);
216}
217
218static int is_external_interrupt(u32 info)
219{
220 info &= SVM_EVTINJ_TYPE_MASK | SVM_EVTINJ_VALID;
221 return info == (SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR);
222}
223
224static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
225{
226 struct vcpu_svm *svm = to_svm(vcpu);
227
228 if (!svm->next_rip) {
229 printk(KERN_DEBUG "%s: NOP\n", __FUNCTION__);
230 return;
231 }
232 if (svm->next_rip - svm->vmcb->save.rip > MAX_INST_SIZE) {
233 printk(KERN_ERR "%s: ip 0x%llx next 0x%llx\n",
234 __FUNCTION__,
235 svm->vmcb->save.rip,
236 svm->next_rip);
237 }
238
239 vcpu->rip = svm->vmcb->save.rip = svm->next_rip;
240 svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK;
241
242 vcpu->interrupt_window_open = 1;
243}
244
245static int has_svm(void)
246{
247 uint32_t eax, ebx, ecx, edx;
248
249 if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) {
250 printk(KERN_INFO "has_svm: not amd\n");
251 return 0;
252 }
253
254 cpuid(0x80000000, &eax, &ebx, &ecx, &edx);
255 if (eax < SVM_CPUID_FUNC) {
256 printk(KERN_INFO "has_svm: can't execute cpuid_8000000a\n");
257 return 0;
258 }
259
260 cpuid(0x80000001, &eax, &ebx, &ecx, &edx);
261 if (!(ecx & (1 << SVM_CPUID_FEATURE_SHIFT))) {
262 printk(KERN_DEBUG "has_svm: svm not available\n");
263 return 0;
264 }
265 return 1;
266}
267
268static void svm_hardware_disable(void *garbage)
269{
270 struct svm_cpu_data *svm_data
271 = per_cpu(svm_data, raw_smp_processor_id());
272
273 if (svm_data) {
274 uint64_t efer;
275
276 wrmsrl(MSR_VM_HSAVE_PA, 0);
277 rdmsrl(MSR_EFER, efer);
278 wrmsrl(MSR_EFER, efer & ~MSR_EFER_SVME_MASK);
279 per_cpu(svm_data, raw_smp_processor_id()) = NULL;
280 __free_page(svm_data->save_area);
281 kfree(svm_data);
282 }
283}
284
285static void svm_hardware_enable(void *garbage)
286{
287
288 struct svm_cpu_data *svm_data;
289 uint64_t efer;
290#ifdef CONFIG_X86_64
291 struct desc_ptr gdt_descr;
292#else
293 struct desc_ptr gdt_descr;
294#endif
295 struct desc_struct *gdt;
296 int me = raw_smp_processor_id();
297
298 if (!has_svm()) {
299 printk(KERN_ERR "svm_cpu_init: err EOPNOTSUPP on %d\n", me);
300 return;
301 }
302 svm_data = per_cpu(svm_data, me);
303
304 if (!svm_data) {
305 printk(KERN_ERR "svm_cpu_init: svm_data is NULL on %d\n",
306 me);
307 return;
308 }
309
310 svm_data->asid_generation = 1;
311 svm_data->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1;
312 svm_data->next_asid = svm_data->max_asid + 1;
313 svm_features = cpuid_edx(SVM_CPUID_FUNC);
314
315 asm volatile ( "sgdt %0" : "=m"(gdt_descr) );
316 gdt = (struct desc_struct *)gdt_descr.address;
317 svm_data->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS);
318
319 rdmsrl(MSR_EFER, efer);
320 wrmsrl(MSR_EFER, efer | MSR_EFER_SVME_MASK);
321
322 wrmsrl(MSR_VM_HSAVE_PA,
323 page_to_pfn(svm_data->save_area) << PAGE_SHIFT);
324}
325
326static int svm_cpu_init(int cpu)
327{
328 struct svm_cpu_data *svm_data;
329 int r;
330
331 svm_data = kzalloc(sizeof(struct svm_cpu_data), GFP_KERNEL);
332 if (!svm_data)
333 return -ENOMEM;
334 svm_data->cpu = cpu;
335 svm_data->save_area = alloc_page(GFP_KERNEL);
336 r = -ENOMEM;
337 if (!svm_data->save_area)
338 goto err_1;
339
340 per_cpu(svm_data, cpu) = svm_data;
341
342 return 0;
343
344err_1:
345 kfree(svm_data);
346 return r;
347
348}
349
350static void set_msr_interception(u32 *msrpm, unsigned msr,
351 int read, int write)
352{
353 int i;
354
355 for (i = 0; i < NUM_MSR_MAPS; i++) {
356 if (msr >= msrpm_ranges[i] &&
357 msr < msrpm_ranges[i] + MSRS_IN_RANGE) {
358 u32 msr_offset = (i * MSRS_IN_RANGE + msr -
359 msrpm_ranges[i]) * 2;
360
361 u32 *base = msrpm + (msr_offset / 32);
362 u32 msr_shift = msr_offset % 32;
363 u32 mask = ((write) ? 0 : 2) | ((read) ? 0 : 1);
364 *base = (*base & ~(0x3 << msr_shift)) |
365 (mask << msr_shift);
366 return;
367 }
368 }
369 BUG();
370}
371
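set_msr_interception() above gives every MSR in the three covered ranges two adjacent bits in the permission map, a read-intercept bit followed by a write-intercept bit, and clears them to pass the MSR straight through to the guest. A minimal sketch of the same offset arithmetic with the constants restated locally (2048 bytes per range, 8 bits per byte, 2 bits per MSR):

    #include <stdint.h>

    static const uint32_t ranges[] = { 0, 0xc0000000, 0xc0010000 };
    #define MSRS_PER_RANGE 8192     /* 2048 bytes * 8 bits / 2 bits per MSR */

    /* Return the bit offset of the read-intercept bit for 'msr',
     * or -1 if the MSR is outside the mapped ranges (always intercepted). */
    static int msrpm_bit_offset(uint32_t msr)
    {
            int i;

            for (i = 0; i < 3; i++)
                    if (msr >= ranges[i] && msr < ranges[i] + MSRS_PER_RANGE)
                            return (i * MSRS_PER_RANGE + (msr - ranges[i])) * 2;
            return -1;
    }

svm_hardware_setup() below uses offsets like these to open up the syscall/sysenter MSRs after initializing the whole map to "intercept everything".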
372static __init int svm_hardware_setup(void)
373{
374 int cpu;
375 struct page *iopm_pages;
376 struct page *msrpm_pages;
377 void *iopm_va, *msrpm_va;
378 int r;
379
380 iopm_pages = alloc_pages(GFP_KERNEL, IOPM_ALLOC_ORDER);
381
382 if (!iopm_pages)
383 return -ENOMEM;
384
385 iopm_va = page_address(iopm_pages);
386 memset(iopm_va, 0xff, PAGE_SIZE * (1 << IOPM_ALLOC_ORDER));
387 clear_bit(0x80, iopm_va); /* allow direct access to PC debug port */
388 iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT;
389
390
391 msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER);
392
393 r = -ENOMEM;
394 if (!msrpm_pages)
395 goto err_1;
396
397 msrpm_va = page_address(msrpm_pages);
398 memset(msrpm_va, 0xff, PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER));
399 msrpm_base = page_to_pfn(msrpm_pages) << PAGE_SHIFT;
400
401#ifdef CONFIG_X86_64
402 set_msr_interception(msrpm_va, MSR_GS_BASE, 1, 1);
403 set_msr_interception(msrpm_va, MSR_FS_BASE, 1, 1);
404 set_msr_interception(msrpm_va, MSR_KERNEL_GS_BASE, 1, 1);
405 set_msr_interception(msrpm_va, MSR_LSTAR, 1, 1);
406 set_msr_interception(msrpm_va, MSR_CSTAR, 1, 1);
407 set_msr_interception(msrpm_va, MSR_SYSCALL_MASK, 1, 1);
408#endif
409 set_msr_interception(msrpm_va, MSR_K6_STAR, 1, 1);
410 set_msr_interception(msrpm_va, MSR_IA32_SYSENTER_CS, 1, 1);
411 set_msr_interception(msrpm_va, MSR_IA32_SYSENTER_ESP, 1, 1);
412 set_msr_interception(msrpm_va, MSR_IA32_SYSENTER_EIP, 1, 1);
413
414 for_each_online_cpu(cpu) {
415 r = svm_cpu_init(cpu);
416 if (r)
417 goto err_2;
418 }
419 return 0;
420
421err_2:
422 __free_pages(msrpm_pages, MSRPM_ALLOC_ORDER);
423 msrpm_base = 0;
424err_1:
425 __free_pages(iopm_pages, IOPM_ALLOC_ORDER);
426 iopm_base = 0;
427 return r;
428}
429
430static __exit void svm_hardware_unsetup(void)
431{
432 __free_pages(pfn_to_page(msrpm_base >> PAGE_SHIFT), MSRPM_ALLOC_ORDER);
433 __free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT), IOPM_ALLOC_ORDER);
434 iopm_base = msrpm_base = 0;
435}
436
437static void init_seg(struct vmcb_seg *seg)
438{
439 seg->selector = 0;
440 seg->attrib = SVM_SELECTOR_P_MASK | SVM_SELECTOR_S_MASK |
441 SVM_SELECTOR_WRITE_MASK; /* Read/Write Data Segment */
442 seg->limit = 0xffff;
443 seg->base = 0;
444}
445
446static void init_sys_seg(struct vmcb_seg *seg, uint32_t type)
447{
448 seg->selector = 0;
449 seg->attrib = SVM_SELECTOR_P_MASK | type;
450 seg->limit = 0xffff;
451 seg->base = 0;
452}
453
454static void init_vmcb(struct vmcb *vmcb)
455{
456 struct vmcb_control_area *control = &vmcb->control;
457 struct vmcb_save_area *save = &vmcb->save;
458
459 control->intercept_cr_read = INTERCEPT_CR0_MASK |
460 INTERCEPT_CR3_MASK |
461 INTERCEPT_CR4_MASK;
462
463 control->intercept_cr_write = INTERCEPT_CR0_MASK |
464 INTERCEPT_CR3_MASK |
465 INTERCEPT_CR4_MASK;
466
467 control->intercept_dr_read = INTERCEPT_DR0_MASK |
468 INTERCEPT_DR1_MASK |
469 INTERCEPT_DR2_MASK |
470 INTERCEPT_DR3_MASK;
471
472 control->intercept_dr_write = INTERCEPT_DR0_MASK |
473 INTERCEPT_DR1_MASK |
474 INTERCEPT_DR2_MASK |
475 INTERCEPT_DR3_MASK |
476 INTERCEPT_DR5_MASK |
477 INTERCEPT_DR7_MASK;
478
479 control->intercept_exceptions = 1 << PF_VECTOR;
480
481
482 control->intercept = (1ULL << INTERCEPT_INTR) |
483 (1ULL << INTERCEPT_NMI) |
484 (1ULL << INTERCEPT_SMI) |
485 /*
486 * selective cr0 intercept bug?
487 * 0: 0f 22 d8 mov %eax,%cr3
488 * 3: 0f 20 c0 mov %cr0,%eax
489 * 6: 0d 00 00 00 80 or $0x80000000,%eax
490 * b: 0f 22 c0 mov %eax,%cr0
491 * set cr3 ->interception
492 * get cr0 ->interception
493 * set cr0 -> no interception
494 */
495 /* (1ULL << INTERCEPT_SELECTIVE_CR0) | */
496 (1ULL << INTERCEPT_CPUID) |
497 (1ULL << INTERCEPT_INVD) |
498 (1ULL << INTERCEPT_HLT) |
499 (1ULL << INTERCEPT_INVLPGA) |
500 (1ULL << INTERCEPT_IOIO_PROT) |
501 (1ULL << INTERCEPT_MSR_PROT) |
502 (1ULL << INTERCEPT_TASK_SWITCH) |
503 (1ULL << INTERCEPT_SHUTDOWN) |
504 (1ULL << INTERCEPT_VMRUN) |
505 (1ULL << INTERCEPT_VMMCALL) |
506 (1ULL << INTERCEPT_VMLOAD) |
507 (1ULL << INTERCEPT_VMSAVE) |
508 (1ULL << INTERCEPT_STGI) |
509 (1ULL << INTERCEPT_CLGI) |
510 (1ULL << INTERCEPT_SKINIT) |
511 (1ULL << INTERCEPT_WBINVD) |
512 (1ULL << INTERCEPT_MONITOR) |
513 (1ULL << INTERCEPT_MWAIT);
514
515 control->iopm_base_pa = iopm_base;
516 control->msrpm_base_pa = msrpm_base;
517 control->tsc_offset = 0;
518 control->int_ctl = V_INTR_MASKING_MASK;
519
520 init_seg(&save->es);
521 init_seg(&save->ss);
522 init_seg(&save->ds);
523 init_seg(&save->fs);
524 init_seg(&save->gs);
525
526 save->cs.selector = 0xf000;
527 /* Executable/Readable Code Segment */
528 save->cs.attrib = SVM_SELECTOR_READ_MASK | SVM_SELECTOR_P_MASK |
529 SVM_SELECTOR_S_MASK | SVM_SELECTOR_CODE_MASK;
530 save->cs.limit = 0xffff;
531 /*
532 * cs.base should really be 0xffff0000, but vmx can't handle that, so
533 * be consistent with it.
534 *
535 * Replace when we have real mode working for vmx.
536 */
537 save->cs.base = 0xf0000;
538
539 save->gdtr.limit = 0xffff;
540 save->idtr.limit = 0xffff;
541
542 init_sys_seg(&save->ldtr, SEG_TYPE_LDT);
543 init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16);
544
545 save->efer = MSR_EFER_SVME_MASK;
546
547 save->dr6 = 0xffff0ff0;
548 save->dr7 = 0x400;
549 save->rflags = 2;
550 save->rip = 0x0000fff0;
551
552 /*
553	 * cr0 val on cpu init should be 0x60000010; we enable the cpu
554	 * cache by default. The proper way is to enable the cache in the BIOS.
555 */
556 save->cr0 = 0x00000010 | X86_CR0_PG | X86_CR0_WP;
557 save->cr4 = X86_CR4_PAE;
558 /* rdx = ?? */
559}
560
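A quick sanity check of the reset state that init_vmcb() programs: with cs.base = 0xf0000 and rip = 0xfff0, the first real-mode fetch (segment base + offset) lands on linear address 0xffff0, the 1 MB alias of the reset vector; the comment in init_vmcb() explains why 0xffff0000 is not used for the base here. A throwaway check of that arithmetic:

    #include <assert.h>
    #include <stdio.h>

    int main(void)
    {
            unsigned long cs_base = 0xf0000, rip = 0xfff0;
            unsigned long fetch = cs_base + rip;    /* real mode: base + offset */

            assert(fetch == 0xffff0);
            printf("first guest fetch at 0x%lx\n", fetch);
            return 0;
    }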
561static void svm_vcpu_reset(struct kvm_vcpu *vcpu)
562{
563 struct vcpu_svm *svm = to_svm(vcpu);
564
565 init_vmcb(svm->vmcb);
566
567 if (vcpu->vcpu_id != 0) {
568 svm->vmcb->save.rip = 0;
569 svm->vmcb->save.cs.base = svm->vcpu.sipi_vector << 12;
570 svm->vmcb->save.cs.selector = svm->vcpu.sipi_vector << 8;
571 }
572}
573
574static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
575{
576 struct vcpu_svm *svm;
577 struct page *page;
578 int err;
579
580 svm = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
581 if (!svm) {
582 err = -ENOMEM;
583 goto out;
584 }
585
586 err = kvm_vcpu_init(&svm->vcpu, kvm, id);
587 if (err)
588 goto free_svm;
589
590 if (irqchip_in_kernel(kvm)) {
591 err = kvm_create_lapic(&svm->vcpu);
592 if (err < 0)
593 goto free_svm;
594 }
595
596 page = alloc_page(GFP_KERNEL);
597 if (!page) {
598 err = -ENOMEM;
599 goto uninit;
600 }
601
602 svm->vmcb = page_address(page);
603 clear_page(svm->vmcb);
604 svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT;
605 svm->asid_generation = 0;
606 memset(svm->db_regs, 0, sizeof(svm->db_regs));
607 init_vmcb(svm->vmcb);
608
609 fx_init(&svm->vcpu);
610 svm->vcpu.fpu_active = 1;
611 svm->vcpu.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
612 if (svm->vcpu.vcpu_id == 0)
613 svm->vcpu.apic_base |= MSR_IA32_APICBASE_BSP;
614
615 return &svm->vcpu;
616
617uninit:
618 kvm_vcpu_uninit(&svm->vcpu);
619free_svm:
620 kmem_cache_free(kvm_vcpu_cache, svm);
621out:
622 return ERR_PTR(err);
623}
624
625static void svm_free_vcpu(struct kvm_vcpu *vcpu)
626{
627 struct vcpu_svm *svm = to_svm(vcpu);
628
629 __free_page(pfn_to_page(svm->vmcb_pa >> PAGE_SHIFT));
630 kvm_vcpu_uninit(vcpu);
631 kmem_cache_free(kvm_vcpu_cache, svm);
632}
633
634static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
635{
636 struct vcpu_svm *svm = to_svm(vcpu);
637 int i;
638
639 if (unlikely(cpu != vcpu->cpu)) {
640 u64 tsc_this, delta;
641
642 /*
643 * Make sure that the guest sees a monotonically
644 * increasing TSC.
645 */
646 rdtscll(tsc_this);
647 delta = vcpu->host_tsc - tsc_this;
648 svm->vmcb->control.tsc_offset += delta;
649 vcpu->cpu = cpu;
650 kvm_migrate_apic_timer(vcpu);
651 }
652
653 for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
654 rdmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
655}
656
657static void svm_vcpu_put(struct kvm_vcpu *vcpu)
658{
659 struct vcpu_svm *svm = to_svm(vcpu);
660 int i;
661
662 for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
663 wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
664
665 rdtscll(vcpu->host_tsc);
666 kvm_put_guest_fpu(vcpu);
667}
668
669static void svm_vcpu_decache(struct kvm_vcpu *vcpu)
670{
671}
672
673static void svm_cache_regs(struct kvm_vcpu *vcpu)
674{
675 struct vcpu_svm *svm = to_svm(vcpu);
676
677 vcpu->regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
678 vcpu->regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
679 vcpu->rip = svm->vmcb->save.rip;
680}
681
682static void svm_decache_regs(struct kvm_vcpu *vcpu)
683{
684 struct vcpu_svm *svm = to_svm(vcpu);
685 svm->vmcb->save.rax = vcpu->regs[VCPU_REGS_RAX];
686 svm->vmcb->save.rsp = vcpu->regs[VCPU_REGS_RSP];
687 svm->vmcb->save.rip = vcpu->rip;
688}
689
690static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
691{
692 return to_svm(vcpu)->vmcb->save.rflags;
693}
694
695static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
696{
697 to_svm(vcpu)->vmcb->save.rflags = rflags;
698}
699
700static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg)
701{
702 struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;
703
704 switch (seg) {
705 case VCPU_SREG_CS: return &save->cs;
706 case VCPU_SREG_DS: return &save->ds;
707 case VCPU_SREG_ES: return &save->es;
708 case VCPU_SREG_FS: return &save->fs;
709 case VCPU_SREG_GS: return &save->gs;
710 case VCPU_SREG_SS: return &save->ss;
711 case VCPU_SREG_TR: return &save->tr;
712 case VCPU_SREG_LDTR: return &save->ldtr;
713 }
714 BUG();
715 return NULL;
716}
717
718static u64 svm_get_segment_base(struct kvm_vcpu *vcpu, int seg)
719{
720 struct vmcb_seg *s = svm_seg(vcpu, seg);
721
722 return s->base;
723}
724
725static void svm_get_segment(struct kvm_vcpu *vcpu,
726 struct kvm_segment *var, int seg)
727{
728 struct vmcb_seg *s = svm_seg(vcpu, seg);
729
730 var->base = s->base;
731 var->limit = s->limit;
732 var->selector = s->selector;
733 var->type = s->attrib & SVM_SELECTOR_TYPE_MASK;
734 var->s = (s->attrib >> SVM_SELECTOR_S_SHIFT) & 1;
735 var->dpl = (s->attrib >> SVM_SELECTOR_DPL_SHIFT) & 3;
736 var->present = (s->attrib >> SVM_SELECTOR_P_SHIFT) & 1;
737 var->avl = (s->attrib >> SVM_SELECTOR_AVL_SHIFT) & 1;
738 var->l = (s->attrib >> SVM_SELECTOR_L_SHIFT) & 1;
739 var->db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1;
740 var->g = (s->attrib >> SVM_SELECTOR_G_SHIFT) & 1;
741 var->unusable = !var->present;
742}
743
744static void svm_get_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
745{
746 struct vcpu_svm *svm = to_svm(vcpu);
747
748 dt->limit = svm->vmcb->save.idtr.limit;
749 dt->base = svm->vmcb->save.idtr.base;
750}
751
752static void svm_set_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
753{
754 struct vcpu_svm *svm = to_svm(vcpu);
755
756 svm->vmcb->save.idtr.limit = dt->limit;
757	svm->vmcb->save.idtr.limit = dt->limit;
758	svm->vmcb->save.idtr.base = dt->base;
758}
759
760static void svm_get_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
761{
762 struct vcpu_svm *svm = to_svm(vcpu);
763
764 dt->limit = svm->vmcb->save.gdtr.limit;
765 dt->base = svm->vmcb->save.gdtr.base;
766}
767
768static void svm_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
769{
770 struct vcpu_svm *svm = to_svm(vcpu);
771
772 svm->vmcb->save.gdtr.limit = dt->limit;
773	svm->vmcb->save.gdtr.base = dt->base;
774}
775
776static void svm_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
777{
778}
779
780static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
781{
782 struct vcpu_svm *svm = to_svm(vcpu);
783
784#ifdef CONFIG_X86_64
785 if (vcpu->shadow_efer & KVM_EFER_LME) {
786 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
787 vcpu->shadow_efer |= KVM_EFER_LMA;
788 svm->vmcb->save.efer |= KVM_EFER_LMA | KVM_EFER_LME;
789 }
790
791		if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) {
792 vcpu->shadow_efer &= ~KVM_EFER_LMA;
793 svm->vmcb->save.efer &= ~(KVM_EFER_LMA | KVM_EFER_LME);
794 }
795 }
796#endif
797 if ((vcpu->cr0 & X86_CR0_TS) && !(cr0 & X86_CR0_TS)) {
798 svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR);
799 vcpu->fpu_active = 1;
800 }
801
802 vcpu->cr0 = cr0;
803 cr0 |= X86_CR0_PG | X86_CR0_WP;
804 cr0 &= ~(X86_CR0_CD | X86_CR0_NW);
805 svm->vmcb->save.cr0 = cr0;
806}
807
808static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
809{
810 vcpu->cr4 = cr4;
811 to_svm(vcpu)->vmcb->save.cr4 = cr4 | X86_CR4_PAE;
812}
813
814static void svm_set_segment(struct kvm_vcpu *vcpu,
815 struct kvm_segment *var, int seg)
816{
817 struct vcpu_svm *svm = to_svm(vcpu);
818 struct vmcb_seg *s = svm_seg(vcpu, seg);
819
820 s->base = var->base;
821 s->limit = var->limit;
822 s->selector = var->selector;
823 if (var->unusable)
824 s->attrib = 0;
825 else {
826 s->attrib = (var->type & SVM_SELECTOR_TYPE_MASK);
827 s->attrib |= (var->s & 1) << SVM_SELECTOR_S_SHIFT;
828 s->attrib |= (var->dpl & 3) << SVM_SELECTOR_DPL_SHIFT;
829 s->attrib |= (var->present & 1) << SVM_SELECTOR_P_SHIFT;
830 s->attrib |= (var->avl & 1) << SVM_SELECTOR_AVL_SHIFT;
831 s->attrib |= (var->l & 1) << SVM_SELECTOR_L_SHIFT;
832 s->attrib |= (var->db & 1) << SVM_SELECTOR_DB_SHIFT;
833 s->attrib |= (var->g & 1) << SVM_SELECTOR_G_SHIFT;
834 }
835 if (seg == VCPU_SREG_CS)
836 svm->vmcb->save.cpl
837 = (svm->vmcb->save.cs.attrib
838 >> SVM_SELECTOR_DPL_SHIFT) & 3;
839
840}
841
842/* FIXME:
843
844 svm(vcpu)->vmcb->control.int_ctl &= ~V_TPR_MASK;
845 svm(vcpu)->vmcb->control.int_ctl |= (sregs->cr8 & V_TPR_MASK);
846
847*/
848
849static int svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg)
850{
851 return -EOPNOTSUPP;
852}
853
854static int svm_get_irq(struct kvm_vcpu *vcpu)
855{
856 struct vcpu_svm *svm = to_svm(vcpu);
857 u32 exit_int_info = svm->vmcb->control.exit_int_info;
858
859 if (is_external_interrupt(exit_int_info))
860 return exit_int_info & SVM_EVTINJ_VEC_MASK;
861 return -1;
862}
863
864static void load_host_msrs(struct kvm_vcpu *vcpu)
865{
866#ifdef CONFIG_X86_64
867 wrmsrl(MSR_GS_BASE, to_svm(vcpu)->host_gs_base);
868#endif
869}
870
871static void save_host_msrs(struct kvm_vcpu *vcpu)
872{
873#ifdef CONFIG_X86_64
874 rdmsrl(MSR_GS_BASE, to_svm(vcpu)->host_gs_base);
875#endif
876}
877
878static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *svm_data)
879{
880 if (svm_data->next_asid > svm_data->max_asid) {
881 ++svm_data->asid_generation;
882 svm_data->next_asid = 1;
883 svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ALL_ASID;
884 }
885
886 svm->vcpu.cpu = svm_data->cpu;
887 svm->asid_generation = svm_data->asid_generation;
888 svm->vmcb->control.asid = svm_data->next_asid++;
889}
890
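new_asid() above hands out ASIDs sequentially and, when the space is exhausted, bumps a generation counter and requests a full TLB flush so that stale ASIDs from the previous generation can never alias live guests; pre_svm_run() compares generations to decide when a vcpu needs a fresh ASID. A condensed standalone sketch of that allocator (types and names are local to the example):

    #include <stdint.h>

    struct asid_pool {
            uint64_t generation;    /* bumped every time the ASID space wraps */
            uint32_t next_asid;     /* next free ASID, 1..max_asid */
            uint32_t max_asid;
    };

    /* Hand out the next ASID; on wrap-around, start a new generation and
     * report that the TLB must be flushed for all ASIDs. */
    static uint32_t asid_alloc(struct asid_pool *p, uint64_t *out_generation,
                               int *need_full_flush)
    {
            *need_full_flush = 0;
            if (p->next_asid > p->max_asid) {
                    ++p->generation;
                    p->next_asid = 1;
                    *need_full_flush = 1;
            }
            *out_generation = p->generation;
            return p->next_asid++;
    }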
891static unsigned long svm_get_dr(struct kvm_vcpu *vcpu, int dr)
892{
893 return to_svm(vcpu)->db_regs[dr];
894}
895
896static void svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value,
897 int *exception)
898{
899 struct vcpu_svm *svm = to_svm(vcpu);
900
901 *exception = 0;
902
903 if (svm->vmcb->save.dr7 & DR7_GD_MASK) {
904 svm->vmcb->save.dr7 &= ~DR7_GD_MASK;
905 svm->vmcb->save.dr6 |= DR6_BD_MASK;
906 *exception = DB_VECTOR;
907 return;
908 }
909
910 switch (dr) {
911 case 0 ... 3:
912 svm->db_regs[dr] = value;
913 return;
914 case 4 ... 5:
915 if (vcpu->cr4 & X86_CR4_DE) {
916 *exception = UD_VECTOR;
917 return;
918 }
919 case 7: {
920 if (value & ~((1ULL << 32) - 1)) {
921 *exception = GP_VECTOR;
922 return;
923 }
924 svm->vmcb->save.dr7 = value;
925 return;
926 }
927 default:
928 printk(KERN_DEBUG "%s: unexpected dr %u\n",
929 __FUNCTION__, dr);
930 *exception = UD_VECTOR;
931 return;
932 }
933}
934
935static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
936{
937 u32 exit_int_info = svm->vmcb->control.exit_int_info;
938 struct kvm *kvm = svm->vcpu.kvm;
939 u64 fault_address;
940 u32 error_code;
941 enum emulation_result er;
942 int r;
943
944 if (!irqchip_in_kernel(kvm) &&
945 is_external_interrupt(exit_int_info))
946 push_irq(&svm->vcpu, exit_int_info & SVM_EVTINJ_VEC_MASK);
947
948 mutex_lock(&kvm->lock);
949
950 fault_address = svm->vmcb->control.exit_info_2;
951 error_code = svm->vmcb->control.exit_info_1;
952 r = kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code);
953 if (r < 0) {
954 mutex_unlock(&kvm->lock);
955 return r;
956 }
957 if (!r) {
958 mutex_unlock(&kvm->lock);
959 return 1;
960 }
961 er = emulate_instruction(&svm->vcpu, kvm_run, fault_address,
962 error_code);
963 mutex_unlock(&kvm->lock);
964
965 switch (er) {
966 case EMULATE_DONE:
967 return 1;
968 case EMULATE_DO_MMIO:
969 ++svm->vcpu.stat.mmio_exits;
970 return 0;
971 case EMULATE_FAIL:
972 kvm_report_emulation_failure(&svm->vcpu, "pagetable");
973 break;
974 default:
975 BUG();
976 }
977
978 kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
979 return 0;
980}
981
982static int nm_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
983{
984 svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR);
985 if (!(svm->vcpu.cr0 & X86_CR0_TS))
986 svm->vmcb->save.cr0 &= ~X86_CR0_TS;
987 svm->vcpu.fpu_active = 1;
988
989 return 1;
990}
991
992static int shutdown_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
993{
994 /*
995 * VMCB is undefined after a SHUTDOWN intercept
996 * so reinitialize it.
997 */
998 clear_page(svm->vmcb);
999 init_vmcb(svm->vmcb);
1000
1001 kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
1002 return 0;
1003}
1004
1005static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1006{
1007	u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */
1008 int size, down, in, string, rep;
1009 unsigned port;
1010
1011 ++svm->vcpu.stat.io_exits;
1012
1013 svm->next_rip = svm->vmcb->control.exit_info_2;
1014
1015 string = (io_info & SVM_IOIO_STR_MASK) != 0;
1016
1017 if (string) {
1018 if (emulate_instruction(&svm->vcpu, kvm_run, 0, 0) == EMULATE_DO_MMIO)
1019 return 0;
1020 return 1;
1021 }
1022
1023 in = (io_info & SVM_IOIO_TYPE_MASK) != 0;
1024 port = io_info >> 16;
1025 size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT;
1026 rep = (io_info & SVM_IOIO_REP_MASK) != 0;
1027 down = (svm->vmcb->save.rflags & X86_EFLAGS_DF) != 0;
1028
1029 return kvm_emulate_pio(&svm->vcpu, kvm_run, in, size, port);
1030}
1031
1032static int nop_on_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1033{
1034 return 1;
1035}
1036
1037static int halt_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1038{
1039 svm->next_rip = svm->vmcb->save.rip + 1;
1040 skip_emulated_instruction(&svm->vcpu);
1041 return kvm_emulate_halt(&svm->vcpu);
1042}
1043
1044static int vmmcall_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1045{
1046 svm->next_rip = svm->vmcb->save.rip + 3;
1047 skip_emulated_instruction(&svm->vcpu);
1048 return kvm_hypercall(&svm->vcpu, kvm_run);
1049}
1050
1051static int invalid_op_interception(struct vcpu_svm *svm,
1052 struct kvm_run *kvm_run)
1053{
1054 inject_ud(&svm->vcpu);
1055 return 1;
1056}
1057
1058static int task_switch_interception(struct vcpu_svm *svm,
1059 struct kvm_run *kvm_run)
1060{
1061 pr_unimpl(&svm->vcpu, "%s: task switch is unsupported\n", __FUNCTION__);
1062 kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
1063 return 0;
1064}
1065
1066static int cpuid_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1067{
1068 svm->next_rip = svm->vmcb->save.rip + 2;
1069 kvm_emulate_cpuid(&svm->vcpu);
1070 return 1;
1071}
1072
1073static int emulate_on_interception(struct vcpu_svm *svm,
1074 struct kvm_run *kvm_run)
1075{
1076 if (emulate_instruction(&svm->vcpu, NULL, 0, 0) != EMULATE_DONE)
1077 pr_unimpl(&svm->vcpu, "%s: failed\n", __FUNCTION__);
1078 return 1;
1079}
1080
1081static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
1082{
1083 struct vcpu_svm *svm = to_svm(vcpu);
1084
1085 switch (ecx) {
1086 case MSR_IA32_TIME_STAMP_COUNTER: {
1087 u64 tsc;
1088
1089 rdtscll(tsc);
1090 *data = svm->vmcb->control.tsc_offset + tsc;
1091 break;
1092 }
1093 case MSR_K6_STAR:
1094 *data = svm->vmcb->save.star;
1095 break;
1096#ifdef CONFIG_X86_64
1097 case MSR_LSTAR:
1098 *data = svm->vmcb->save.lstar;
1099 break;
1100 case MSR_CSTAR:
1101 *data = svm->vmcb->save.cstar;
1102 break;
1103 case MSR_KERNEL_GS_BASE:
1104 *data = svm->vmcb->save.kernel_gs_base;
1105 break;
1106 case MSR_SYSCALL_MASK:
1107 *data = svm->vmcb->save.sfmask;
1108 break;
1109#endif
1110 case MSR_IA32_SYSENTER_CS:
1111 *data = svm->vmcb->save.sysenter_cs;
1112 break;
1113 case MSR_IA32_SYSENTER_EIP:
1114 *data = svm->vmcb->save.sysenter_eip;
1115 break;
1116 case MSR_IA32_SYSENTER_ESP:
1117 *data = svm->vmcb->save.sysenter_esp;
1118 break;
1119 default:
1120 return kvm_get_msr_common(vcpu, ecx, data);
1121 }
1122 return 0;
1123}
1124
1125static int rdmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1126{
1127 u32 ecx = svm->vcpu.regs[VCPU_REGS_RCX];
1128 u64 data;
1129
1130 if (svm_get_msr(&svm->vcpu, ecx, &data))
1131 svm_inject_gp(&svm->vcpu, 0);
1132 else {
1133 svm->vmcb->save.rax = data & 0xffffffff;
1134 svm->vcpu.regs[VCPU_REGS_RDX] = data >> 32;
1135 svm->next_rip = svm->vmcb->save.rip + 2;
1136 skip_emulated_instruction(&svm->vcpu);
1137 }
1138 return 1;
1139}
1140
1141static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
1142{
1143 struct vcpu_svm *svm = to_svm(vcpu);
1144
1145 switch (ecx) {
1146 case MSR_IA32_TIME_STAMP_COUNTER: {
1147 u64 tsc;
1148
1149 rdtscll(tsc);
1150 svm->vmcb->control.tsc_offset = data - tsc;
1151 break;
1152 }
1153 case MSR_K6_STAR:
1154 svm->vmcb->save.star = data;
1155 break;
1156#ifdef CONFIG_X86_64
1157 case MSR_LSTAR:
1158 svm->vmcb->save.lstar = data;
1159 break;
1160 case MSR_CSTAR:
1161 svm->vmcb->save.cstar = data;
1162 break;
1163 case MSR_KERNEL_GS_BASE:
1164 svm->vmcb->save.kernel_gs_base = data;
1165 break;
1166 case MSR_SYSCALL_MASK:
1167 svm->vmcb->save.sfmask = data;
1168 break;
1169#endif
1170 case MSR_IA32_SYSENTER_CS:
1171 svm->vmcb->save.sysenter_cs = data;
1172 break;
1173 case MSR_IA32_SYSENTER_EIP:
1174 svm->vmcb->save.sysenter_eip = data;
1175 break;
1176 case MSR_IA32_SYSENTER_ESP:
1177 svm->vmcb->save.sysenter_esp = data;
1178 break;
1179 default:
1180 return kvm_set_msr_common(vcpu, ecx, data);
1181 }
1182 return 0;
1183}
1184
1185static int wrmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1186{
1187 u32 ecx = svm->vcpu.regs[VCPU_REGS_RCX];
1188 u64 data = (svm->vmcb->save.rax & -1u)
1189 | ((u64)(svm->vcpu.regs[VCPU_REGS_RDX] & -1u) << 32);
1190 svm->next_rip = svm->vmcb->save.rip + 2;
1191 if (svm_set_msr(&svm->vcpu, ecx, data))
1192 svm_inject_gp(&svm->vcpu, 0);
1193 else
1194 skip_emulated_instruction(&svm->vcpu);
1195 return 1;
1196}
1197
1198static int msr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1199{
1200 if (svm->vmcb->control.exit_info_1)
1201 return wrmsr_interception(svm, kvm_run);
1202 else
1203 return rdmsr_interception(svm, kvm_run);
1204}
1205
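rdmsr_interception() and wrmsr_interception() above follow the architectural RDMSR/WRMSR register convention: the 64-bit MSR value is carried split across EAX (low 32 bits) and EDX (high 32 bits). A tiny standalone sketch of the split and the recombination:

    #include <stdint.h>

    /* WRMSR side: EDX:EAX -> 64-bit value (as in wrmsr_interception). */
    static uint64_t edx_eax_to_u64(uint32_t eax, uint32_t edx)
    {
            return (uint64_t)eax | ((uint64_t)edx << 32);
    }

    /* RDMSR side: 64-bit value -> EDX:EAX (as in rdmsr_interception). */
    static void u64_to_edx_eax(uint64_t data, uint32_t *eax, uint32_t *edx)
    {
            *eax = (uint32_t)data;
            *edx = (uint32_t)(data >> 32);
    }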
1206static int interrupt_window_interception(struct vcpu_svm *svm,
1207 struct kvm_run *kvm_run)
1208{
1209 svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_VINTR);
1210 svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
1211 /*
1212	 * If user space is waiting to inject interrupts, exit as soon as
1213	 * possible.
1214 */
1215 if (kvm_run->request_interrupt_window &&
1216 !svm->vcpu.irq_summary) {
1217 ++svm->vcpu.stat.irq_window_exits;
1218 kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
1219 return 0;
1220 }
1221
1222 return 1;
1223}
1224
1225static int (*svm_exit_handlers[])(struct vcpu_svm *svm,
1226 struct kvm_run *kvm_run) = {
1227 [SVM_EXIT_READ_CR0] = emulate_on_interception,
1228 [SVM_EXIT_READ_CR3] = emulate_on_interception,
1229 [SVM_EXIT_READ_CR4] = emulate_on_interception,
1230 /* for now: */
1231 [SVM_EXIT_WRITE_CR0] = emulate_on_interception,
1232 [SVM_EXIT_WRITE_CR3] = emulate_on_interception,
1233 [SVM_EXIT_WRITE_CR4] = emulate_on_interception,
1234 [SVM_EXIT_READ_DR0] = emulate_on_interception,
1235 [SVM_EXIT_READ_DR1] = emulate_on_interception,
1236 [SVM_EXIT_READ_DR2] = emulate_on_interception,
1237 [SVM_EXIT_READ_DR3] = emulate_on_interception,
1238 [SVM_EXIT_WRITE_DR0] = emulate_on_interception,
1239 [SVM_EXIT_WRITE_DR1] = emulate_on_interception,
1240 [SVM_EXIT_WRITE_DR2] = emulate_on_interception,
1241 [SVM_EXIT_WRITE_DR3] = emulate_on_interception,
1242 [SVM_EXIT_WRITE_DR5] = emulate_on_interception,
1243 [SVM_EXIT_WRITE_DR7] = emulate_on_interception,
1244 [SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception,
1245 [SVM_EXIT_EXCP_BASE + NM_VECTOR] = nm_interception,
1246 [SVM_EXIT_INTR] = nop_on_interception,
1247 [SVM_EXIT_NMI] = nop_on_interception,
1248 [SVM_EXIT_SMI] = nop_on_interception,
1249 [SVM_EXIT_INIT] = nop_on_interception,
1250 [SVM_EXIT_VINTR] = interrupt_window_interception,
1251 /* [SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception, */
1252 [SVM_EXIT_CPUID] = cpuid_interception,
1253 [SVM_EXIT_INVD] = emulate_on_interception,
1254 [SVM_EXIT_HLT] = halt_interception,
1255 [SVM_EXIT_INVLPG] = emulate_on_interception,
1256 [SVM_EXIT_INVLPGA] = invalid_op_interception,
1257 [SVM_EXIT_IOIO] = io_interception,
1258 [SVM_EXIT_MSR] = msr_interception,
1259 [SVM_EXIT_TASK_SWITCH] = task_switch_interception,
1260 [SVM_EXIT_SHUTDOWN] = shutdown_interception,
1261 [SVM_EXIT_VMRUN] = invalid_op_interception,
1262 [SVM_EXIT_VMMCALL] = vmmcall_interception,
1263 [SVM_EXIT_VMLOAD] = invalid_op_interception,
1264 [SVM_EXIT_VMSAVE] = invalid_op_interception,
1265 [SVM_EXIT_STGI] = invalid_op_interception,
1266 [SVM_EXIT_CLGI] = invalid_op_interception,
1267 [SVM_EXIT_SKINIT] = invalid_op_interception,
1268 [SVM_EXIT_WBINVD] = emulate_on_interception,
1269 [SVM_EXIT_MONITOR] = invalid_op_interception,
1270 [SVM_EXIT_MWAIT] = invalid_op_interception,
1271};
1272
1273
1274static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
1275{
1276 struct vcpu_svm *svm = to_svm(vcpu);
1277 u32 exit_code = svm->vmcb->control.exit_code;
1278
1279 kvm_reput_irq(svm);
1280
1281 if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) {
1282 kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
1283 kvm_run->fail_entry.hardware_entry_failure_reason
1284 = svm->vmcb->control.exit_code;
1285 return 0;
1286 }
1287
1288 if (is_external_interrupt(svm->vmcb->control.exit_int_info) &&
1289 exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR)
1290		printk(KERN_ERR "%s: unexpected exit_int_info 0x%x "
1291 "exit_code 0x%x\n",
1292 __FUNCTION__, svm->vmcb->control.exit_int_info,
1293 exit_code);
1294
1295 if (exit_code >= ARRAY_SIZE(svm_exit_handlers)
1296 || svm_exit_handlers[exit_code] == 0) {
1297 kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
1298 kvm_run->hw.hardware_exit_reason = exit_code;
1299 return 0;
1300 }
1301
1302 return svm_exit_handlers[exit_code](svm, kvm_run);
1303}
1304
1305static void reload_tss(struct kvm_vcpu *vcpu)
1306{
1307 int cpu = raw_smp_processor_id();
1308
1309 struct svm_cpu_data *svm_data = per_cpu(svm_data, cpu);
1310	svm_data->tss_desc->type = 9; /* available 32/64-bit TSS */
1311 load_TR_desc();
1312}
1313
1314static void pre_svm_run(struct vcpu_svm *svm)
1315{
1316 int cpu = raw_smp_processor_id();
1317
1318 struct svm_cpu_data *svm_data = per_cpu(svm_data, cpu);
1319
1320 svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;
1321 if (svm->vcpu.cpu != cpu ||
1322 svm->asid_generation != svm_data->asid_generation)
1323 new_asid(svm, svm_data);
1324}
1325
1326
1327static inline void svm_inject_irq(struct vcpu_svm *svm, int irq)
1328{
1329 struct vmcb_control_area *control;
1330
1331 control = &svm->vmcb->control;
1332 control->int_vector = irq;
1333 control->int_ctl &= ~V_INTR_PRIO_MASK;
1334 control->int_ctl |= V_IRQ_MASK |
1335 ((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT);
1336}
1337
1338static void svm_set_irq(struct kvm_vcpu *vcpu, int irq)
1339{
1340 struct vcpu_svm *svm = to_svm(vcpu);
1341
1342 svm_inject_irq(svm, irq);
1343}
1344
1345static void svm_intr_assist(struct kvm_vcpu *vcpu)
1346{
1347 struct vcpu_svm *svm = to_svm(vcpu);
1348 struct vmcb *vmcb = svm->vmcb;
1349 int intr_vector = -1;
1350
1351 kvm_inject_pending_timer_irqs(vcpu);
1352 if ((vmcb->control.exit_int_info & SVM_EVTINJ_VALID) &&
1353 ((vmcb->control.exit_int_info & SVM_EVTINJ_TYPE_MASK) == 0)) {
1354 intr_vector = vmcb->control.exit_int_info &
1355 SVM_EVTINJ_VEC_MASK;
1356 vmcb->control.exit_int_info = 0;
1357 svm_inject_irq(svm, intr_vector);
1358 return;
1359 }
1360
1361 if (vmcb->control.int_ctl & V_IRQ_MASK)
1362 return;
1363
1364 if (!kvm_cpu_has_interrupt(vcpu))
1365 return;
1366
1367 if (!(vmcb->save.rflags & X86_EFLAGS_IF) ||
1368 (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) ||
1369 (vmcb->control.event_inj & SVM_EVTINJ_VALID)) {
1370 /* unable to deliver irq, set pending irq */
1371 vmcb->control.intercept |= (1ULL << INTERCEPT_VINTR);
1372 svm_inject_irq(svm, 0x0);
1373 return;
1374 }
1375 /* Okay, we can deliver the interrupt: grab it and update PIC state. */
1376 intr_vector = kvm_cpu_get_interrupt(vcpu);
1377 svm_inject_irq(svm, intr_vector);
1378 kvm_timer_intr_post(vcpu, intr_vector);
1379}
1380
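svm_intr_assist() above boils down to a small decision tree: replay an interrupt that the last exit preempted, do nothing if a virtual interrupt is already queued or none is pending, request a VINTR window exit when the guest cannot take the interrupt yet, and otherwise inject the pending vector. A condensed sketch of just that decision, with every input reduced to a flag the real code reads from the VMCB or the vcpu:

    /* Possible actions when deciding whether to inject an external interrupt. */
    enum intr_action {
            INTR_REINJECT,       /* replay the vector the last exit interrupted */
            INTR_NONE,           /* nothing pending, or one already queued */
            INTR_REQUEST_WINDOW, /* cannot inject now: ask for a VINTR exit */
            INTR_INJECT,         /* deliver the pending vector */
    };

    static enum intr_action intr_decide(int interrupted_vector_pending,
                                        int virq_already_set,
                                        int guest_has_pending_irq,
                                        int rflags_if, int in_intr_shadow,
                                        int event_inj_valid)
    {
            if (interrupted_vector_pending)
                    return INTR_REINJECT;
            if (virq_already_set || !guest_has_pending_irq)
                    return INTR_NONE;
            if (!rflags_if || in_intr_shadow || event_inj_valid)
                    return INTR_REQUEST_WINDOW;
            return INTR_INJECT;
    }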
1381static void kvm_reput_irq(struct vcpu_svm *svm)
1382{
1383 struct vmcb_control_area *control = &svm->vmcb->control;
1384
1385 if ((control->int_ctl & V_IRQ_MASK)
1386 && !irqchip_in_kernel(svm->vcpu.kvm)) {
1387 control->int_ctl &= ~V_IRQ_MASK;
1388 push_irq(&svm->vcpu, control->int_vector);
1389 }
1390
1391 svm->vcpu.interrupt_window_open =
1392 !(control->int_state & SVM_INTERRUPT_SHADOW_MASK);
1393}
1394
1395static void svm_do_inject_vector(struct vcpu_svm *svm)
1396{
1397 struct kvm_vcpu *vcpu = &svm->vcpu;
1398 int word_index = __ffs(vcpu->irq_summary);
1399 int bit_index = __ffs(vcpu->irq_pending[word_index]);
1400 int irq = word_index * BITS_PER_LONG + bit_index;
1401
1402 clear_bit(bit_index, &vcpu->irq_pending[word_index]);
1403 if (!vcpu->irq_pending[word_index])
1404 clear_bit(word_index, &vcpu->irq_summary);
1405 svm_inject_irq(svm, irq);
1406}
1407
1408static void do_interrupt_requests(struct kvm_vcpu *vcpu,
1409 struct kvm_run *kvm_run)
1410{
1411 struct vcpu_svm *svm = to_svm(vcpu);
1412 struct vmcb_control_area *control = &svm->vmcb->control;
1413
1414 svm->vcpu.interrupt_window_open =
1415 (!(control->int_state & SVM_INTERRUPT_SHADOW_MASK) &&
1416 (svm->vmcb->save.rflags & X86_EFLAGS_IF));
1417
1418 if (svm->vcpu.interrupt_window_open && svm->vcpu.irq_summary)
1419 /*
1420		 * Interrupts are enabled and not blocked by sti or mov ss; inject.
1421 */
1422 svm_do_inject_vector(svm);
1423
1424 /*
1425 * Interrupts blocked. Wait for unblock.
1426 */
1427 if (!svm->vcpu.interrupt_window_open &&
1428 (svm->vcpu.irq_summary || kvm_run->request_interrupt_window)) {
1429 control->intercept |= 1ULL << INTERCEPT_VINTR;
1430 } else
1431 control->intercept &= ~(1ULL << INTERCEPT_VINTR);
1432}
1433
1434static void save_db_regs(unsigned long *db_regs)
1435{
1436 asm volatile ("mov %%dr0, %0" : "=r"(db_regs[0]));
1437 asm volatile ("mov %%dr1, %0" : "=r"(db_regs[1]));
1438 asm volatile ("mov %%dr2, %0" : "=r"(db_regs[2]));
1439 asm volatile ("mov %%dr3, %0" : "=r"(db_regs[3]));
1440}
1441
1442static void load_db_regs(unsigned long *db_regs)
1443{
1444 asm volatile ("mov %0, %%dr0" : : "r"(db_regs[0]));
1445 asm volatile ("mov %0, %%dr1" : : "r"(db_regs[1]));
1446 asm volatile ("mov %0, %%dr2" : : "r"(db_regs[2]));
1447 asm volatile ("mov %0, %%dr3" : : "r"(db_regs[3]));
1448}
1449
1450static void svm_flush_tlb(struct kvm_vcpu *vcpu)
1451{
1452 force_new_asid(vcpu);
1453}
1454
1455static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu)
1456{
1457}
1458
1459static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1460{
1461 struct vcpu_svm *svm = to_svm(vcpu);
1462 u16 fs_selector;
1463 u16 gs_selector;
1464 u16 ldt_selector;
1465
1466 pre_svm_run(svm);
1467
1468 save_host_msrs(vcpu);
1469 fs_selector = read_fs();
1470 gs_selector = read_gs();
1471 ldt_selector = read_ldt();
1472 svm->host_cr2 = kvm_read_cr2();
1473 svm->host_dr6 = read_dr6();
1474 svm->host_dr7 = read_dr7();
1475 svm->vmcb->save.cr2 = vcpu->cr2;
1476
1477 if (svm->vmcb->save.dr7 & 0xff) {
1478 write_dr7(0);
1479 save_db_regs(svm->host_db_regs);
1480 load_db_regs(svm->db_regs);
1481 }
1482
1483 clgi();
1484
1485 local_irq_enable();
1486
1487 asm volatile (
1488#ifdef CONFIG_X86_64
1489 "push %%rbx; push %%rcx; push %%rdx;"
1490 "push %%rsi; push %%rdi; push %%rbp;"
1491 "push %%r8; push %%r9; push %%r10; push %%r11;"
1492 "push %%r12; push %%r13; push %%r14; push %%r15;"
1493#else
1494 "push %%ebx; push %%ecx; push %%edx;"
1495 "push %%esi; push %%edi; push %%ebp;"
1496#endif
1497
1498#ifdef CONFIG_X86_64
1499 "mov %c[rbx](%[svm]), %%rbx \n\t"
1500 "mov %c[rcx](%[svm]), %%rcx \n\t"
1501 "mov %c[rdx](%[svm]), %%rdx \n\t"
1502 "mov %c[rsi](%[svm]), %%rsi \n\t"
1503 "mov %c[rdi](%[svm]), %%rdi \n\t"
1504 "mov %c[rbp](%[svm]), %%rbp \n\t"
1505 "mov %c[r8](%[svm]), %%r8 \n\t"
1506 "mov %c[r9](%[svm]), %%r9 \n\t"
1507 "mov %c[r10](%[svm]), %%r10 \n\t"
1508 "mov %c[r11](%[svm]), %%r11 \n\t"
1509 "mov %c[r12](%[svm]), %%r12 \n\t"
1510 "mov %c[r13](%[svm]), %%r13 \n\t"
1511 "mov %c[r14](%[svm]), %%r14 \n\t"
1512 "mov %c[r15](%[svm]), %%r15 \n\t"
1513#else
1514 "mov %c[rbx](%[svm]), %%ebx \n\t"
1515 "mov %c[rcx](%[svm]), %%ecx \n\t"
1516 "mov %c[rdx](%[svm]), %%edx \n\t"
1517 "mov %c[rsi](%[svm]), %%esi \n\t"
1518 "mov %c[rdi](%[svm]), %%edi \n\t"
1519 "mov %c[rbp](%[svm]), %%ebp \n\t"
1520#endif
1521
1522#ifdef CONFIG_X86_64
1523 /* Enter guest mode */
1524 "push %%rax \n\t"
1525 "mov %c[vmcb](%[svm]), %%rax \n\t"
1526 SVM_VMLOAD "\n\t"
1527 SVM_VMRUN "\n\t"
1528 SVM_VMSAVE "\n\t"
1529 "pop %%rax \n\t"
1530#else
1531 /* Enter guest mode */
1532 "push %%eax \n\t"
1533 "mov %c[vmcb](%[svm]), %%eax \n\t"
1534 SVM_VMLOAD "\n\t"
1535 SVM_VMRUN "\n\t"
1536 SVM_VMSAVE "\n\t"
1537 "pop %%eax \n\t"
1538#endif
1539
1540 /* Save guest registers, load host registers */
1541#ifdef CONFIG_X86_64
1542 "mov %%rbx, %c[rbx](%[svm]) \n\t"
1543 "mov %%rcx, %c[rcx](%[svm]) \n\t"
1544 "mov %%rdx, %c[rdx](%[svm]) \n\t"
1545 "mov %%rsi, %c[rsi](%[svm]) \n\t"
1546 "mov %%rdi, %c[rdi](%[svm]) \n\t"
1547 "mov %%rbp, %c[rbp](%[svm]) \n\t"
1548 "mov %%r8, %c[r8](%[svm]) \n\t"
1549 "mov %%r9, %c[r9](%[svm]) \n\t"
1550 "mov %%r10, %c[r10](%[svm]) \n\t"
1551 "mov %%r11, %c[r11](%[svm]) \n\t"
1552 "mov %%r12, %c[r12](%[svm]) \n\t"
1553 "mov %%r13, %c[r13](%[svm]) \n\t"
1554 "mov %%r14, %c[r14](%[svm]) \n\t"
1555 "mov %%r15, %c[r15](%[svm]) \n\t"
1556
1557 "pop %%r15; pop %%r14; pop %%r13; pop %%r12;"
1558 "pop %%r11; pop %%r10; pop %%r9; pop %%r8;"
1559 "pop %%rbp; pop %%rdi; pop %%rsi;"
1560 "pop %%rdx; pop %%rcx; pop %%rbx; \n\t"
1561#else
1562 "mov %%ebx, %c[rbx](%[svm]) \n\t"
1563 "mov %%ecx, %c[rcx](%[svm]) \n\t"
1564 "mov %%edx, %c[rdx](%[svm]) \n\t"
1565 "mov %%esi, %c[rsi](%[svm]) \n\t"
1566 "mov %%edi, %c[rdi](%[svm]) \n\t"
1567 "mov %%ebp, %c[rbp](%[svm]) \n\t"
1568
1569 "pop %%ebp; pop %%edi; pop %%esi;"
1570 "pop %%edx; pop %%ecx; pop %%ebx; \n\t"
1571#endif
1572 :
1573 : [svm]"a"(svm),
1574 [vmcb]"i"(offsetof(struct vcpu_svm, vmcb_pa)),
1575 [rbx]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_RBX])),
1576 [rcx]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_RCX])),
1577 [rdx]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_RDX])),
1578 [rsi]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_RSI])),
1579 [rdi]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_RDI])),
1580 [rbp]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_RBP]))
1581#ifdef CONFIG_X86_64
1582 ,[r8 ]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R8])),
1583 [r9 ]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R9 ])),
1584 [r10]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R10])),
1585 [r11]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R11])),
1586 [r12]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R12])),
1587 [r13]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R13])),
1588 [r14]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R14])),
1589 [r15]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R15]))
1590#endif
1591 : "cc", "memory" );
1592
1593	if (svm->vmcb->save.dr7 & 0xff)
1594 load_db_regs(svm->host_db_regs);
1595
1596 vcpu->cr2 = svm->vmcb->save.cr2;
1597
1598 write_dr6(svm->host_dr6);
1599 write_dr7(svm->host_dr7);
1600 kvm_write_cr2(svm->host_cr2);
1601
1602 load_fs(fs_selector);
1603 load_gs(gs_selector);
1604 load_ldt(ldt_selector);
1605 load_host_msrs(vcpu);
1606
1607 reload_tss(vcpu);
1608
1609 local_irq_disable();
1610
1611 stgi();
1612
1613 svm->next_rip = 0;
1614}
1615
1616static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root)
1617{
1618 struct vcpu_svm *svm = to_svm(vcpu);
1619
1620 svm->vmcb->save.cr3 = root;
1621 force_new_asid(vcpu);
1622
1623 if (vcpu->fpu_active) {
1624 svm->vmcb->control.intercept_exceptions |= (1 << NM_VECTOR);
1625 svm->vmcb->save.cr0 |= X86_CR0_TS;
1626 vcpu->fpu_active = 0;
1627 }
1628}
1629
1630static void svm_inject_page_fault(struct kvm_vcpu *vcpu,
1631 unsigned long addr,
1632 uint32_t err_code)
1633{
1634 struct vcpu_svm *svm = to_svm(vcpu);
1635 uint32_t exit_int_info = svm->vmcb->control.exit_int_info;
1636
1637 ++vcpu->stat.pf_guest;
1638
1639 if (is_page_fault(exit_int_info)) {
1640
1641 svm->vmcb->control.event_inj_err = 0;
1642 svm->vmcb->control.event_inj = SVM_EVTINJ_VALID |
1643 SVM_EVTINJ_VALID_ERR |
1644 SVM_EVTINJ_TYPE_EXEPT |
1645 DF_VECTOR;
1646 return;
1647 }
1648 vcpu->cr2 = addr;
1649 svm->vmcb->save.cr2 = addr;
1650 svm->vmcb->control.event_inj = SVM_EVTINJ_VALID |
1651 SVM_EVTINJ_VALID_ERR |
1652 SVM_EVTINJ_TYPE_EXEPT |
1653 PF_VECTOR;
1654 svm->vmcb->control.event_inj_err = err_code;
1655}
1656
1657
1658static int is_disabled(void)
1659{
1660 u64 vm_cr;
1661
1662 rdmsrl(MSR_VM_CR, vm_cr);
1663 if (vm_cr & (1 << SVM_VM_CR_SVM_DISABLE))
1664 return 1;
1665
1666 return 0;
1667}
1668
1669static void
1670svm_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
1671{
1672 /*
1673 * Patch in the VMMCALL instruction:
1674 */
1675 hypercall[0] = 0x0f;
1676 hypercall[1] = 0x01;
1677 hypercall[2] = 0xd9;
1678 hypercall[3] = 0xc3;
1679}
1680
1681static void svm_check_processor_compat(void *rtn)
1682{
1683 *(int *)rtn = 0;
1684}
1685
1686static struct kvm_x86_ops svm_x86_ops = {
1687 .cpu_has_kvm_support = has_svm,
1688 .disabled_by_bios = is_disabled,
1689 .hardware_setup = svm_hardware_setup,
1690 .hardware_unsetup = svm_hardware_unsetup,
1691 .check_processor_compatibility = svm_check_processor_compat,
1692 .hardware_enable = svm_hardware_enable,
1693 .hardware_disable = svm_hardware_disable,
1694
1695 .vcpu_create = svm_create_vcpu,
1696 .vcpu_free = svm_free_vcpu,
1697 .vcpu_reset = svm_vcpu_reset,
1698
1699 .prepare_guest_switch = svm_prepare_guest_switch,
1700 .vcpu_load = svm_vcpu_load,
1701 .vcpu_put = svm_vcpu_put,
1702 .vcpu_decache = svm_vcpu_decache,
1703
1704 .set_guest_debug = svm_guest_debug,
1705 .get_msr = svm_get_msr,
1706 .set_msr = svm_set_msr,
1707 .get_segment_base = svm_get_segment_base,
1708 .get_segment = svm_get_segment,
1709 .set_segment = svm_set_segment,
1710 .get_cs_db_l_bits = kvm_get_cs_db_l_bits,
1711 .decache_cr4_guest_bits = svm_decache_cr4_guest_bits,
1712 .set_cr0 = svm_set_cr0,
1713 .set_cr3 = svm_set_cr3,
1714 .set_cr4 = svm_set_cr4,
1715 .set_efer = svm_set_efer,
1716 .get_idt = svm_get_idt,
1717 .set_idt = svm_set_idt,
1718 .get_gdt = svm_get_gdt,
1719 .set_gdt = svm_set_gdt,
1720 .get_dr = svm_get_dr,
1721 .set_dr = svm_set_dr,
1722 .cache_regs = svm_cache_regs,
1723 .decache_regs = svm_decache_regs,
1724 .get_rflags = svm_get_rflags,
1725 .set_rflags = svm_set_rflags,
1726
1727 .tlb_flush = svm_flush_tlb,
1728 .inject_page_fault = svm_inject_page_fault,
1729
1730 .inject_gp = svm_inject_gp,
1731
1732 .run = svm_vcpu_run,
1733 .handle_exit = handle_exit,
1734 .skip_emulated_instruction = skip_emulated_instruction,
1735 .patch_hypercall = svm_patch_hypercall,
1736 .get_irq = svm_get_irq,
1737 .set_irq = svm_set_irq,
1738 .inject_pending_irq = svm_intr_assist,
1739 .inject_pending_vectors = do_interrupt_requests,
1740};
1741
1742static int __init svm_init(void)
1743{
1744 return kvm_init_x86(&svm_x86_ops, sizeof(struct vcpu_svm),
1745 THIS_MODULE);
1746}
1747
1748static void __exit svm_exit(void)
1749{
1750 kvm_exit_x86();
1751}
1752
1753module_init(svm_init)
1754module_exit(svm_exit)
diff --git a/drivers/kvm/svm.h b/drivers/kvm/svm.h
deleted file mode 100644
index 3b1b0f35b6cb..000000000000
--- a/drivers/kvm/svm.h
+++ /dev/null
@@ -1,324 +0,0 @@
1#ifndef __SVM_H
2#define __SVM_H
3
4enum {
5 INTERCEPT_INTR,
6 INTERCEPT_NMI,
7 INTERCEPT_SMI,
8 INTERCEPT_INIT,
9 INTERCEPT_VINTR,
10 INTERCEPT_SELECTIVE_CR0,
11 INTERCEPT_STORE_IDTR,
12 INTERCEPT_STORE_GDTR,
13 INTERCEPT_STORE_LDTR,
14 INTERCEPT_STORE_TR,
15 INTERCEPT_LOAD_IDTR,
16 INTERCEPT_LOAD_GDTR,
17 INTERCEPT_LOAD_LDTR,
18 INTERCEPT_LOAD_TR,
19 INTERCEPT_RDTSC,
20 INTERCEPT_RDPMC,
21 INTERCEPT_PUSHF,
22 INTERCEPT_POPF,
23 INTERCEPT_CPUID,
24 INTERCEPT_RSM,
25 INTERCEPT_IRET,
26 INTERCEPT_INTn,
27 INTERCEPT_INVD,
28 INTERCEPT_PAUSE,
29 INTERCEPT_HLT,
30 INTERCEPT_INVLPG,
31 INTERCEPT_INVLPGA,
32 INTERCEPT_IOIO_PROT,
33 INTERCEPT_MSR_PROT,
34 INTERCEPT_TASK_SWITCH,
35 INTERCEPT_FERR_FREEZE,
36 INTERCEPT_SHUTDOWN,
37 INTERCEPT_VMRUN,
38 INTERCEPT_VMMCALL,
39 INTERCEPT_VMLOAD,
40 INTERCEPT_VMSAVE,
41 INTERCEPT_STGI,
42 INTERCEPT_CLGI,
43 INTERCEPT_SKINIT,
44 INTERCEPT_RDTSCP,
45 INTERCEPT_ICEBP,
46 INTERCEPT_WBINVD,
47 INTERCEPT_MONITOR,
48 INTERCEPT_MWAIT,
49 INTERCEPT_MWAIT_COND,
50};
51
52
53struct __attribute__ ((__packed__)) vmcb_control_area {
54 u16 intercept_cr_read;
55 u16 intercept_cr_write;
56 u16 intercept_dr_read;
57 u16 intercept_dr_write;
58 u32 intercept_exceptions;
59 u64 intercept;
60 u8 reserved_1[44];
61 u64 iopm_base_pa;
62 u64 msrpm_base_pa;
63 u64 tsc_offset;
64 u32 asid;
65 u8 tlb_ctl;
66 u8 reserved_2[3];
67 u32 int_ctl;
68 u32 int_vector;
69 u32 int_state;
70 u8 reserved_3[4];
71 u32 exit_code;
72 u32 exit_code_hi;
73 u64 exit_info_1;
74 u64 exit_info_2;
75 u32 exit_int_info;
76 u32 exit_int_info_err;
77 u64 nested_ctl;
78 u8 reserved_4[16];
79 u32 event_inj;
80 u32 event_inj_err;
81 u64 nested_cr3;
82 u64 lbr_ctl;
83 u8 reserved_5[832];
84};
85
86
87#define TLB_CONTROL_DO_NOTHING 0
88#define TLB_CONTROL_FLUSH_ALL_ASID 1
89
90#define V_TPR_MASK 0x0f
91
92#define V_IRQ_SHIFT 8
93#define V_IRQ_MASK (1 << V_IRQ_SHIFT)
94
95#define V_INTR_PRIO_SHIFT 16
96#define V_INTR_PRIO_MASK (0x0f << V_INTR_PRIO_SHIFT)
97
98#define V_IGN_TPR_SHIFT 20
99#define V_IGN_TPR_MASK (1 << V_IGN_TPR_SHIFT)
100
101#define V_INTR_MASKING_SHIFT 24
102#define V_INTR_MASKING_MASK (1 << V_INTR_MASKING_SHIFT)
103
104#define SVM_INTERRUPT_SHADOW_MASK 1
105
106#define SVM_IOIO_STR_SHIFT 2
107#define SVM_IOIO_REP_SHIFT 3
108#define SVM_IOIO_SIZE_SHIFT 4
109#define SVM_IOIO_ASIZE_SHIFT 7
110
111#define SVM_IOIO_TYPE_MASK 1
112#define SVM_IOIO_STR_MASK (1 << SVM_IOIO_STR_SHIFT)
113#define SVM_IOIO_REP_MASK (1 << SVM_IOIO_REP_SHIFT)
114#define SVM_IOIO_SIZE_MASK (7 << SVM_IOIO_SIZE_SHIFT)
115#define SVM_IOIO_ASIZE_MASK (7 << SVM_IOIO_ASIZE_SHIFT)
116
117struct __attribute__ ((__packed__)) vmcb_seg {
118 u16 selector;
119 u16 attrib;
120 u32 limit;
121 u64 base;
122};
123
124struct __attribute__ ((__packed__)) vmcb_save_area {
125 struct vmcb_seg es;
126 struct vmcb_seg cs;
127 struct vmcb_seg ss;
128 struct vmcb_seg ds;
129 struct vmcb_seg fs;
130 struct vmcb_seg gs;
131 struct vmcb_seg gdtr;
132 struct vmcb_seg ldtr;
133 struct vmcb_seg idtr;
134 struct vmcb_seg tr;
135 u8 reserved_1[43];
136 u8 cpl;
137 u8 reserved_2[4];
138 u64 efer;
139 u8 reserved_3[112];
140 u64 cr4;
141 u64 cr3;
142 u64 cr0;
143 u64 dr7;
144 u64 dr6;
145 u64 rflags;
146 u64 rip;
147 u8 reserved_4[88];
148 u64 rsp;
149 u8 reserved_5[24];
150 u64 rax;
151 u64 star;
152 u64 lstar;
153 u64 cstar;
154 u64 sfmask;
155 u64 kernel_gs_base;
156 u64 sysenter_cs;
157 u64 sysenter_esp;
158 u64 sysenter_eip;
159 u64 cr2;
160 u8 reserved_6[32];
161 u64 g_pat;
162 u64 dbgctl;
163 u64 br_from;
164 u64 br_to;
165 u64 last_excp_from;
166 u64 last_excp_to;
167};
168
169struct __attribute__ ((__packed__)) vmcb {
170 struct vmcb_control_area control;
171 struct vmcb_save_area save;
172};
173
174#define SVM_CPUID_FEATURE_SHIFT 2
175#define SVM_CPUID_FUNC 0x8000000a
176
177#define MSR_EFER_SVME_MASK (1ULL << 12)
178#define MSR_VM_CR 0xc0010114
179#define MSR_VM_HSAVE_PA 0xc0010117ULL
180
181#define SVM_VM_CR_SVM_DISABLE 4
182
183#define SVM_SELECTOR_S_SHIFT 4
184#define SVM_SELECTOR_DPL_SHIFT 5
185#define SVM_SELECTOR_P_SHIFT 7
186#define SVM_SELECTOR_AVL_SHIFT 8
187#define SVM_SELECTOR_L_SHIFT 9
188#define SVM_SELECTOR_DB_SHIFT 10
189#define SVM_SELECTOR_G_SHIFT 11
190
191#define SVM_SELECTOR_TYPE_MASK (0xf)
192#define SVM_SELECTOR_S_MASK (1 << SVM_SELECTOR_S_SHIFT)
193#define SVM_SELECTOR_DPL_MASK (3 << SVM_SELECTOR_DPL_SHIFT)
194#define SVM_SELECTOR_P_MASK (1 << SVM_SELECTOR_P_SHIFT)
195#define SVM_SELECTOR_AVL_MASK (1 << SVM_SELECTOR_AVL_SHIFT)
196#define SVM_SELECTOR_L_MASK (1 << SVM_SELECTOR_L_SHIFT)
197#define SVM_SELECTOR_DB_MASK (1 << SVM_SELECTOR_DB_SHIFT)
198#define SVM_SELECTOR_G_MASK (1 << SVM_SELECTOR_G_SHIFT)
199
200#define SVM_SELECTOR_WRITE_MASK (1 << 1)
201#define SVM_SELECTOR_READ_MASK SVM_SELECTOR_WRITE_MASK
202#define SVM_SELECTOR_CODE_MASK (1 << 3)
203
204#define INTERCEPT_CR0_MASK 1
205#define INTERCEPT_CR3_MASK (1 << 3)
206#define INTERCEPT_CR4_MASK (1 << 4)
207
208#define INTERCEPT_DR0_MASK 1
209#define INTERCEPT_DR1_MASK (1 << 1)
210#define INTERCEPT_DR2_MASK (1 << 2)
211#define INTERCEPT_DR3_MASK (1 << 3)
212#define INTERCEPT_DR4_MASK (1 << 4)
213#define INTERCEPT_DR5_MASK (1 << 5)
214#define INTERCEPT_DR6_MASK (1 << 6)
215#define INTERCEPT_DR7_MASK (1 << 7)
216
217#define SVM_EVTINJ_VEC_MASK 0xff
218
219#define SVM_EVTINJ_TYPE_SHIFT 8
220#define SVM_EVTINJ_TYPE_MASK (7 << SVM_EVTINJ_TYPE_SHIFT)
221
222#define SVM_EVTINJ_TYPE_INTR (0 << SVM_EVTINJ_TYPE_SHIFT)
223#define SVM_EVTINJ_TYPE_NMI (2 << SVM_EVTINJ_TYPE_SHIFT)
224#define SVM_EVTINJ_TYPE_EXEPT (3 << SVM_EVTINJ_TYPE_SHIFT)
225#define SVM_EVTINJ_TYPE_SOFT (4 << SVM_EVTINJ_TYPE_SHIFT)
226
227#define SVM_EVTINJ_VALID (1 << 31)
228#define SVM_EVTINJ_VALID_ERR (1 << 11)
229
230#define SVM_EXITINTINFO_VEC_MASK SVM_EVTINJ_VEC_MASK
231
232#define SVM_EXITINTINFO_TYPE_INTR SVM_EVTINJ_TYPE_INTR
233#define SVM_EXITINTINFO_TYPE_NMI SVM_EVTINJ_TYPE_NMI
234#define SVM_EXITINTINFO_TYPE_EXEPT SVM_EVTINJ_TYPE_EXEPT
235#define SVM_EXITINTINFO_TYPE_SOFT SVM_EVTINJ_TYPE_SOFT
236
237#define SVM_EXITINTINFO_VALID SVM_EVTINJ_VALID
238#define SVM_EXITINTINFO_VALID_ERR SVM_EVTINJ_VALID_ERR
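/*
 * Layout implied by the EVTINJ/EXITINTINFO defines above: bits 7:0
 * hold the vector, bits 10:8 the event type (INTR/NMI/exception/soft),
 * bit 11 indicates that an error code is delivered, and bit 31 marks
 * the field as valid.  EXITINTINFO reuses the same encoding for the
 * event that was in flight when a #VMEXIT occurred.
 */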
239
240#define SVM_EXIT_READ_CR0 0x000
241#define SVM_EXIT_READ_CR3 0x003
242#define SVM_EXIT_READ_CR4 0x004
243#define SVM_EXIT_READ_CR8 0x008
244#define SVM_EXIT_WRITE_CR0 0x010
245#define SVM_EXIT_WRITE_CR3 0x013
246#define SVM_EXIT_WRITE_CR4 0x014
247#define SVM_EXIT_WRITE_CR8 0x018
248#define SVM_EXIT_READ_DR0 0x020
249#define SVM_EXIT_READ_DR1 0x021
250#define SVM_EXIT_READ_DR2 0x022
251#define SVM_EXIT_READ_DR3 0x023
252#define SVM_EXIT_READ_DR4 0x024
253#define SVM_EXIT_READ_DR5 0x025
254#define SVM_EXIT_READ_DR6 0x026
255#define SVM_EXIT_READ_DR7 0x027
256#define SVM_EXIT_WRITE_DR0 0x030
257#define SVM_EXIT_WRITE_DR1 0x031
258#define SVM_EXIT_WRITE_DR2 0x032
259#define SVM_EXIT_WRITE_DR3 0x033
260#define SVM_EXIT_WRITE_DR4 0x034
261#define SVM_EXIT_WRITE_DR5 0x035
262#define SVM_EXIT_WRITE_DR6 0x036
263#define SVM_EXIT_WRITE_DR7 0x037
264#define SVM_EXIT_EXCP_BASE 0x040
265#define SVM_EXIT_INTR 0x060
266#define SVM_EXIT_NMI 0x061
267#define SVM_EXIT_SMI 0x062
268#define SVM_EXIT_INIT 0x063
269#define SVM_EXIT_VINTR 0x064
270#define SVM_EXIT_CR0_SEL_WRITE 0x065
271#define SVM_EXIT_IDTR_READ 0x066
272#define SVM_EXIT_GDTR_READ 0x067
273#define SVM_EXIT_LDTR_READ 0x068
274#define SVM_EXIT_TR_READ 0x069
275#define SVM_EXIT_IDTR_WRITE 0x06a
276#define SVM_EXIT_GDTR_WRITE 0x06b
277#define SVM_EXIT_LDTR_WRITE 0x06c
278#define SVM_EXIT_TR_WRITE 0x06d
279#define SVM_EXIT_RDTSC 0x06e
280#define SVM_EXIT_RDPMC 0x06f
281#define SVM_EXIT_PUSHF 0x070
282#define SVM_EXIT_POPF 0x071
283#define SVM_EXIT_CPUID 0x072
284#define SVM_EXIT_RSM 0x073
285#define SVM_EXIT_IRET 0x074
286#define SVM_EXIT_SWINT 0x075
287#define SVM_EXIT_INVD 0x076
288#define SVM_EXIT_PAUSE 0x077
289#define SVM_EXIT_HLT 0x078
290#define SVM_EXIT_INVLPG 0x079
291#define SVM_EXIT_INVLPGA 0x07a
292#define SVM_EXIT_IOIO 0x07b
293#define SVM_EXIT_MSR 0x07c
294#define SVM_EXIT_TASK_SWITCH 0x07d
295#define SVM_EXIT_FERR_FREEZE 0x07e
296#define SVM_EXIT_SHUTDOWN 0x07f
297#define SVM_EXIT_VMRUN 0x080
298#define SVM_EXIT_VMMCALL 0x081
299#define SVM_EXIT_VMLOAD 0x082
300#define SVM_EXIT_VMSAVE 0x083
301#define SVM_EXIT_STGI 0x084
302#define SVM_EXIT_CLGI 0x085
303#define SVM_EXIT_SKINIT 0x086
304#define SVM_EXIT_RDTSCP 0x087
305#define SVM_EXIT_ICEBP 0x088
306#define SVM_EXIT_WBINVD 0x089
307#define SVM_EXIT_MONITOR 0x08a
308#define SVM_EXIT_MWAIT 0x08b
309#define SVM_EXIT_MWAIT_COND 0x08c
310#define SVM_EXIT_NPF 0x400
311
312#define SVM_EXIT_ERR -1
313
314#define SVM_CR0_SELECTIVE_MASK (1 << 3 | 1) /* TS and MP */
315
316#define SVM_VMLOAD ".byte 0x0f, 0x01, 0xda"
317#define SVM_VMRUN ".byte 0x0f, 0x01, 0xd8"
318#define SVM_VMSAVE ".byte 0x0f, 0x01, 0xdb"
319#define SVM_CLGI ".byte 0x0f, 0x01, 0xdd"
320#define SVM_STGI ".byte 0x0f, 0x01, 0xdc"
321#define SVM_INVLPGA ".byte 0x0f, 0x01, 0xdf"
322
323#endif
324
diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c
deleted file mode 100644
index 5b397b6c9f93..000000000000
--- a/drivers/kvm/vmx.c
+++ /dev/null
@@ -1,2566 +0,0 @@
1/*
2 * Kernel-based Virtual Machine driver for Linux
3 *
4 * This module enables machines with Intel VT-x extensions to run virtual
5 * machines without emulation or binary translation.
6 *
7 * Copyright (C) 2006 Qumranet, Inc.
8 *
9 * Authors:
10 * Avi Kivity <avi@qumranet.com>
11 * Yaniv Kamay <yaniv@qumranet.com>
12 *
13 * This work is licensed under the terms of the GNU GPL, version 2. See
14 * the COPYING file in the top-level directory.
15 *
16 */
17
18#include "kvm.h"
19#include "x86_emulate.h"
20#include "irq.h"
21#include "vmx.h"
22#include "segment_descriptor.h"
23
24#include <linux/module.h>
25#include <linux/kernel.h>
26#include <linux/mm.h>
27#include <linux/highmem.h>
28#include <linux/sched.h>
29
30#include <asm/io.h>
31#include <asm/desc.h>
32
33MODULE_AUTHOR("Qumranet");
34MODULE_LICENSE("GPL");
35
36struct vmcs {
37 u32 revision_id;
38 u32 abort;
39 char data[0];
40};
41
42struct vcpu_vmx {
43 struct kvm_vcpu vcpu;
44 int launched;
45 u8 fail;
46 struct kvm_msr_entry *guest_msrs;
47 struct kvm_msr_entry *host_msrs;
48 int nmsrs;
49 int save_nmsrs;
50 int msr_offset_efer;
51#ifdef CONFIG_X86_64
52 int msr_offset_kernel_gs_base;
53#endif
54 struct vmcs *vmcs;
55 struct {
56 int loaded;
57 u16 fs_sel, gs_sel, ldt_sel;
58 int gs_ldt_reload_needed;
59 int fs_reload_needed;
60	} host_state;
61
62};
63
64static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
65{
66 return container_of(vcpu, struct vcpu_vmx, vcpu);
67}
68
69static int init_rmode_tss(struct kvm *kvm);
70
71static DEFINE_PER_CPU(struct vmcs *, vmxarea);
72static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
73
74static struct page *vmx_io_bitmap_a;
75static struct page *vmx_io_bitmap_b;
76
77#define EFER_SAVE_RESTORE_BITS ((u64)EFER_SCE)
78
79static struct vmcs_config {
80 int size;
81 int order;
82 u32 revision_id;
83 u32 pin_based_exec_ctrl;
84 u32 cpu_based_exec_ctrl;
85 u32 vmexit_ctrl;
86 u32 vmentry_ctrl;
87} vmcs_config;
88
89#define VMX_SEGMENT_FIELD(seg) \
90 [VCPU_SREG_##seg] = { \
91 .selector = GUEST_##seg##_SELECTOR, \
92 .base = GUEST_##seg##_BASE, \
93 .limit = GUEST_##seg##_LIMIT, \
94 .ar_bytes = GUEST_##seg##_AR_BYTES, \
95 }
96
97static struct kvm_vmx_segment_field {
98 unsigned selector;
99 unsigned base;
100 unsigned limit;
101 unsigned ar_bytes;
102} kvm_vmx_segment_fields[] = {
103 VMX_SEGMENT_FIELD(CS),
104 VMX_SEGMENT_FIELD(DS),
105 VMX_SEGMENT_FIELD(ES),
106 VMX_SEGMENT_FIELD(FS),
107 VMX_SEGMENT_FIELD(GS),
108 VMX_SEGMENT_FIELD(SS),
109 VMX_SEGMENT_FIELD(TR),
110 VMX_SEGMENT_FIELD(LDTR),
111};
112
113/*
114 * Keep MSR_K6_STAR at the end, as setup_msrs() will try to optimize it
115 * away by decrementing the array size.
116 */
117static const u32 vmx_msr_index[] = {
118#ifdef CONFIG_X86_64
119 MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, MSR_KERNEL_GS_BASE,
120#endif
121 MSR_EFER, MSR_K6_STAR,
122};
123#define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index)
124
125static void load_msrs(struct kvm_msr_entry *e, int n)
126{
127 int i;
128
129 for (i = 0; i < n; ++i)
130 wrmsrl(e[i].index, e[i].data);
131}
132
133static void save_msrs(struct kvm_msr_entry *e, int n)
134{
135 int i;
136
137 for (i = 0; i < n; ++i)
138 rdmsrl(e[i].index, e[i].data);
139}
140
141static inline u64 msr_efer_save_restore_bits(struct kvm_msr_entry msr)
142{
143 return (u64)msr.data & EFER_SAVE_RESTORE_BITS;
144}
145
146static inline int msr_efer_need_save_restore(struct vcpu_vmx *vmx)
147{
148 int efer_offset = vmx->msr_offset_efer;
149 return msr_efer_save_restore_bits(vmx->host_msrs[efer_offset]) !=
150 msr_efer_save_restore_bits(vmx->guest_msrs[efer_offset]);
151}
152
153static inline int is_page_fault(u32 intr_info)
154{
155 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
156 INTR_INFO_VALID_MASK)) ==
157 (INTR_TYPE_EXCEPTION | PF_VECTOR | INTR_INFO_VALID_MASK);
158}
159
160static inline int is_no_device(u32 intr_info)
161{
162 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
163 INTR_INFO_VALID_MASK)) ==
164 (INTR_TYPE_EXCEPTION | NM_VECTOR | INTR_INFO_VALID_MASK);
165}
166
167static inline int is_external_interrupt(u32 intr_info)
168{
169 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
170 == (INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK);
171}
172
173static inline int cpu_has_vmx_tpr_shadow(void)
174{
175 return (vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW);
176}
177
178static inline int vm_need_tpr_shadow(struct kvm *kvm)
179{
180 return ((cpu_has_vmx_tpr_shadow()) && (irqchip_in_kernel(kvm)));
181}
182
183static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
184{
185 int i;
186
187 for (i = 0; i < vmx->nmsrs; ++i)
188 if (vmx->guest_msrs[i].index == msr)
189 return i;
190 return -1;
191}
192
193static struct kvm_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr)
194{
195 int i;
196
197 i = __find_msr_index(vmx, msr);
198 if (i >= 0)
199 return &vmx->guest_msrs[i];
200 return NULL;
201}
202
203static void vmcs_clear(struct vmcs *vmcs)
204{
205 u64 phys_addr = __pa(vmcs);
206 u8 error;
207
208 asm volatile (ASM_VMX_VMCLEAR_RAX "; setna %0"
209 : "=g"(error) : "a"(&phys_addr), "m"(phys_addr)
210 : "cc", "memory");
211 if (error)
212 printk(KERN_ERR "kvm: vmclear fail: %p/%llx\n",
213 vmcs, phys_addr);
214}
215
216static void __vcpu_clear(void *arg)
217{
218 struct vcpu_vmx *vmx = arg;
219 int cpu = raw_smp_processor_id();
220
221 if (vmx->vcpu.cpu == cpu)
222 vmcs_clear(vmx->vmcs);
223 if (per_cpu(current_vmcs, cpu) == vmx->vmcs)
224 per_cpu(current_vmcs, cpu) = NULL;
225 rdtscll(vmx->vcpu.host_tsc);
226}
227
228static void vcpu_clear(struct vcpu_vmx *vmx)
229{
230 if (vmx->vcpu.cpu != raw_smp_processor_id() && vmx->vcpu.cpu != -1)
231 smp_call_function_single(vmx->vcpu.cpu, __vcpu_clear,
232 vmx, 0, 1);
233 else
234 __vcpu_clear(vmx);
235 vmx->launched = 0;
236}
237
238static unsigned long vmcs_readl(unsigned long field)
239{
240 unsigned long value;
241
242 asm volatile (ASM_VMX_VMREAD_RDX_RAX
243 : "=a"(value) : "d"(field) : "cc");
244 return value;
245}
246
247static u16 vmcs_read16(unsigned long field)
248{
249 return vmcs_readl(field);
250}
251
252static u32 vmcs_read32(unsigned long field)
253{
254 return vmcs_readl(field);
255}
256
257static u64 vmcs_read64(unsigned long field)
258{
259#ifdef CONFIG_X86_64
260 return vmcs_readl(field);
261#else
262 return vmcs_readl(field) | ((u64)vmcs_readl(field+1) << 32);
263#endif
264}
265
266static noinline void vmwrite_error(unsigned long field, unsigned long value)
267{
268 printk(KERN_ERR "vmwrite error: reg %lx value %lx (err %d)\n",
269 field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
270 dump_stack();
271}
272
273static void vmcs_writel(unsigned long field, unsigned long value)
274{
275 u8 error;
276
277 asm volatile (ASM_VMX_VMWRITE_RAX_RDX "; setna %0"
278 : "=q"(error) : "a"(value), "d"(field) : "cc" );
279 if (unlikely(error))
280 vmwrite_error(field, value);
281}
282
283static void vmcs_write16(unsigned long field, u16 value)
284{
285 vmcs_writel(field, value);
286}
287
288static void vmcs_write32(unsigned long field, u32 value)
289{
290 vmcs_writel(field, value);
291}
292
293static void vmcs_write64(unsigned long field, u64 value)
294{
295#ifdef CONFIG_X86_64
296 vmcs_writel(field, value);
297#else
298 vmcs_writel(field, value);
299 asm volatile ("");
300 vmcs_writel(field+1, value >> 32);
301#endif
302}
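/*
 * On 32-bit hosts a 64-bit VMCS field is written as two 32-bit
 * VMWRITEs: the low half at 'field' and the high half at 'field + 1'
 * (64-bit field encodings reserve bit 0 to select the high word).
 * The empty asm acts as a compiler barrier between the two writes.
 */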
303
304static void vmcs_clear_bits(unsigned long field, u32 mask)
305{
306 vmcs_writel(field, vmcs_readl(field) & ~mask);
307}
308
309static void vmcs_set_bits(unsigned long field, u32 mask)
310{
311 vmcs_writel(field, vmcs_readl(field) | mask);
312}
313
314static void update_exception_bitmap(struct kvm_vcpu *vcpu)
315{
316 u32 eb;
317
318 eb = 1u << PF_VECTOR;
319 if (!vcpu->fpu_active)
320 eb |= 1u << NM_VECTOR;
321 if (vcpu->guest_debug.enabled)
322 eb |= 1u << 1;
323 if (vcpu->rmode.active)
324 eb = ~0;
325 vmcs_write32(EXCEPTION_BITMAP, eb);
326}
327
328static void reload_tss(void)
329{
330#ifndef CONFIG_X86_64
331
332 /*
333	 * VT restores TR but not its size (limit); reload the TSS descriptor here.
334 */
335 struct descriptor_table gdt;
336 struct segment_descriptor *descs;
337
338 get_gdt(&gdt);
339 descs = (void *)gdt.base;
340 descs[GDT_ENTRY_TSS].type = 9; /* available TSS */
341 load_TR_desc();
342#endif
343}
344
345static void load_transition_efer(struct vcpu_vmx *vmx)
346{
347 u64 trans_efer;
348 int efer_offset = vmx->msr_offset_efer;
349
350 trans_efer = vmx->host_msrs[efer_offset].data;
351 trans_efer &= ~EFER_SAVE_RESTORE_BITS;
352 trans_efer |= msr_efer_save_restore_bits(vmx->guest_msrs[efer_offset]);
353 wrmsrl(MSR_EFER, trans_efer);
354 vmx->vcpu.stat.efer_reload++;
355}
356
357static void vmx_save_host_state(struct kvm_vcpu *vcpu)
358{
359 struct vcpu_vmx *vmx = to_vmx(vcpu);
360
361 if (vmx->host_state.loaded)
362 return;
363
364 vmx->host_state.loaded = 1;
365 /*
366 * Set host fs and gs selectors. Unfortunately, 22.2.3 does not
367 * allow segment selectors with cpl > 0 or ti == 1.
368 */
369 vmx->host_state.ldt_sel = read_ldt();
370 vmx->host_state.gs_ldt_reload_needed = vmx->host_state.ldt_sel;
371 vmx->host_state.fs_sel = read_fs();
372 if (!(vmx->host_state.fs_sel & 7)) {
373 vmcs_write16(HOST_FS_SELECTOR, vmx->host_state.fs_sel);
374 vmx->host_state.fs_reload_needed = 0;
375 } else {
376 vmcs_write16(HOST_FS_SELECTOR, 0);
377 vmx->host_state.fs_reload_needed = 1;
378 }
379 vmx->host_state.gs_sel = read_gs();
380 if (!(vmx->host_state.gs_sel & 7))
381 vmcs_write16(HOST_GS_SELECTOR, vmx->host_state.gs_sel);
382 else {
383 vmcs_write16(HOST_GS_SELECTOR, 0);
384 vmx->host_state.gs_ldt_reload_needed = 1;
385 }
386
387#ifdef CONFIG_X86_64
388 vmcs_writel(HOST_FS_BASE, read_msr(MSR_FS_BASE));
389 vmcs_writel(HOST_GS_BASE, read_msr(MSR_GS_BASE));
390#else
391 vmcs_writel(HOST_FS_BASE, segment_base(vmx->host_state.fs_sel));
392 vmcs_writel(HOST_GS_BASE, segment_base(vmx->host_state.gs_sel));
393#endif
394
395#ifdef CONFIG_X86_64
396 if (is_long_mode(&vmx->vcpu)) {
397 save_msrs(vmx->host_msrs +
398 vmx->msr_offset_kernel_gs_base, 1);
399 }
400#endif
401 load_msrs(vmx->guest_msrs, vmx->save_nmsrs);
402 if (msr_efer_need_save_restore(vmx))
403 load_transition_efer(vmx);
404}
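/*
 * The '& 7' tests above check the low three bits of a selector, i.e.
 * RPL (bits 1:0) and the TI/LDT bit (bit 2).  Per the 22.2.3 rule
 * cited in the comment, host FS/GS selectors with nonzero RPL or
 * TI == 1 cannot be kept in the VMCS, so 0 is written instead and the
 * real selector is reloaded later in vmx_load_host_state().
 */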
405
406static void vmx_load_host_state(struct vcpu_vmx *vmx)
407{
408 unsigned long flags;
409
410 if (!vmx->host_state.loaded)
411 return;
412
413 vmx->host_state.loaded = 0;
414 if (vmx->host_state.fs_reload_needed)
415 load_fs(vmx->host_state.fs_sel);
416 if (vmx->host_state.gs_ldt_reload_needed) {
417 load_ldt(vmx->host_state.ldt_sel);
418 /*
419 * If we have to reload gs, we must take care to
420 * preserve our gs base.
421 */
422 local_irq_save(flags);
423 load_gs(vmx->host_state.gs_sel);
424#ifdef CONFIG_X86_64
425 wrmsrl(MSR_GS_BASE, vmcs_readl(HOST_GS_BASE));
426#endif
427 local_irq_restore(flags);
428 }
429 reload_tss();
430 save_msrs(vmx->guest_msrs, vmx->save_nmsrs);
431 load_msrs(vmx->host_msrs, vmx->save_nmsrs);
432 if (msr_efer_need_save_restore(vmx))
433 load_msrs(vmx->host_msrs + vmx->msr_offset_efer, 1);
434}
435
436/*
437 * Switches to specified vcpu, until a matching vcpu_put(), but assumes
438 * vcpu mutex is already taken.
439 */
440static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
441{
442 struct vcpu_vmx *vmx = to_vmx(vcpu);
443 u64 phys_addr = __pa(vmx->vmcs);
444 u64 tsc_this, delta;
445
446 if (vcpu->cpu != cpu) {
447 vcpu_clear(vmx);
448 kvm_migrate_apic_timer(vcpu);
449 }
450
451 if (per_cpu(current_vmcs, cpu) != vmx->vmcs) {
452 u8 error;
453
454 per_cpu(current_vmcs, cpu) = vmx->vmcs;
455 asm volatile (ASM_VMX_VMPTRLD_RAX "; setna %0"
456 : "=g"(error) : "a"(&phys_addr), "m"(phys_addr)
457 : "cc");
458 if (error)
459 printk(KERN_ERR "kvm: vmptrld %p/%llx fail\n",
460 vmx->vmcs, phys_addr);
461 }
462
463 if (vcpu->cpu != cpu) {
464 struct descriptor_table dt;
465 unsigned long sysenter_esp;
466
467 vcpu->cpu = cpu;
468 /*
469 * Linux uses per-cpu TSS and GDT, so set these when switching
470 * processors.
471 */
472 vmcs_writel(HOST_TR_BASE, read_tr_base()); /* 22.2.4 */
473 get_gdt(&dt);
474 vmcs_writel(HOST_GDTR_BASE, dt.base); /* 22.2.4 */
475
476 rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
477 vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
478
479 /*
480		 * Make sure the time stamp counter is monotonic.
481 */
482 rdtscll(tsc_this);
483 delta = vcpu->host_tsc - tsc_this;
484 vmcs_write64(TSC_OFFSET, vmcs_read64(TSC_OFFSET) + delta);
485 }
486}
487
488static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
489{
490 vmx_load_host_state(to_vmx(vcpu));
491 kvm_put_guest_fpu(vcpu);
492}
493
494static void vmx_fpu_activate(struct kvm_vcpu *vcpu)
495{
496 if (vcpu->fpu_active)
497 return;
498 vcpu->fpu_active = 1;
499 vmcs_clear_bits(GUEST_CR0, X86_CR0_TS);
500 if (vcpu->cr0 & X86_CR0_TS)
501 vmcs_set_bits(GUEST_CR0, X86_CR0_TS);
502 update_exception_bitmap(vcpu);
503}
504
505static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu)
506{
507 if (!vcpu->fpu_active)
508 return;
509 vcpu->fpu_active = 0;
510 vmcs_set_bits(GUEST_CR0, X86_CR0_TS);
511 update_exception_bitmap(vcpu);
512}
513
514static void vmx_vcpu_decache(struct kvm_vcpu *vcpu)
515{
516 vcpu_clear(to_vmx(vcpu));
517}
518
519static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
520{
521 return vmcs_readl(GUEST_RFLAGS);
522}
523
524static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
525{
526 if (vcpu->rmode.active)
527 rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
528 vmcs_writel(GUEST_RFLAGS, rflags);
529}
530
531static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
532{
533 unsigned long rip;
534 u32 interruptibility;
535
536 rip = vmcs_readl(GUEST_RIP);
537 rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
538 vmcs_writel(GUEST_RIP, rip);
539
540 /*
541 * We emulated an instruction, so temporary interrupt blocking
542 * should be removed, if set.
543 */
544 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
545 if (interruptibility & 3)
546 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
547 interruptibility & ~3);
548 vcpu->interrupt_window_open = 1;
549}
550
551static void vmx_inject_gp(struct kvm_vcpu *vcpu, unsigned error_code)
552{
553 printk(KERN_DEBUG "inject_general_protection: rip 0x%lx\n",
554 vmcs_readl(GUEST_RIP));
555 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
556 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
557 GP_VECTOR |
558 INTR_TYPE_EXCEPTION |
559 INTR_INFO_DELIEVER_CODE_MASK |
560 INTR_INFO_VALID_MASK);
561}
562
563/*
564 * Swap MSR entry in host/guest MSR entry array.
565 */
566#ifdef CONFIG_X86_64
567static void move_msr_up(struct vcpu_vmx *vmx, int from, int to)
568{
569 struct kvm_msr_entry tmp;
570
571 tmp = vmx->guest_msrs[to];
572 vmx->guest_msrs[to] = vmx->guest_msrs[from];
573 vmx->guest_msrs[from] = tmp;
574 tmp = vmx->host_msrs[to];
575 vmx->host_msrs[to] = vmx->host_msrs[from];
576 vmx->host_msrs[from] = tmp;
577}
578#endif
579
580/*
581 * Set up the vmcs to automatically save and restore system
582 * msrs. Don't touch the 64-bit msrs if the guest is in legacy
583 * mode, as fiddling with msrs is very expensive.
584 */
585static void setup_msrs(struct vcpu_vmx *vmx)
586{
587 int save_nmsrs;
588
589 save_nmsrs = 0;
590#ifdef CONFIG_X86_64
591 if (is_long_mode(&vmx->vcpu)) {
592 int index;
593
594 index = __find_msr_index(vmx, MSR_SYSCALL_MASK);
595 if (index >= 0)
596 move_msr_up(vmx, index, save_nmsrs++);
597 index = __find_msr_index(vmx, MSR_LSTAR);
598 if (index >= 0)
599 move_msr_up(vmx, index, save_nmsrs++);
600 index = __find_msr_index(vmx, MSR_CSTAR);
601 if (index >= 0)
602 move_msr_up(vmx, index, save_nmsrs++);
603 index = __find_msr_index(vmx, MSR_KERNEL_GS_BASE);
604 if (index >= 0)
605 move_msr_up(vmx, index, save_nmsrs++);
606 /*
607 * MSR_K6_STAR is only needed on long mode guests, and only
608 * if efer.sce is enabled.
609 */
610 index = __find_msr_index(vmx, MSR_K6_STAR);
611 if ((index >= 0) && (vmx->vcpu.shadow_efer & EFER_SCE))
612 move_msr_up(vmx, index, save_nmsrs++);
613 }
614#endif
615 vmx->save_nmsrs = save_nmsrs;
616
617#ifdef CONFIG_X86_64
618 vmx->msr_offset_kernel_gs_base =
619 __find_msr_index(vmx, MSR_KERNEL_GS_BASE);
620#endif
621 vmx->msr_offset_efer = __find_msr_index(vmx, MSR_EFER);
622}
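/*
 * After the reordering above, only the first save_nmsrs entries of
 * guest_msrs/host_msrs are transferred by save_msrs()/load_msrs() on
 * each guest/host switch (see vmx_save_host_state() and
 * vmx_load_host_state()); MSRs beyond that index are not switched.
 */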
623
624/*
625 * reads and returns guest's timestamp counter "register"
626 * guest_tsc = host_tsc + tsc_offset -- 21.3
627 */
628static u64 guest_read_tsc(void)
629{
630 u64 host_tsc, tsc_offset;
631
632 rdtscll(host_tsc);
633 tsc_offset = vmcs_read64(TSC_OFFSET);
634 return host_tsc + tsc_offset;
635}
636
637/*
638 * writes 'guest_tsc' into guest's timestamp counter "register"
639 * guest_tsc = host_tsc + tsc_offset ==> tsc_offset = guest_tsc - host_tsc
640 */
641static void guest_write_tsc(u64 guest_tsc)
642{
643 u64 host_tsc;
644
645 rdtscll(host_tsc);
646 vmcs_write64(TSC_OFFSET, guest_tsc - host_tsc);
647}
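/*
 * Illustration of the formulas above: vmx_vcpu_setup() calls
 * guest_write_tsc(0), which stores -host_tsc as TSC_OFFSET, so a later
 * guest_read_tsc() returns roughly the number of host cycles elapsed
 * since that write, i.e. the guest TSC starts near zero at reset.
 */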
648
649/*
650 * Reads an msr value (of 'msr_index') into 'pdata'.
651 * Returns 0 on success, non-0 otherwise.
652 * Assumes vcpu_load() was already called.
653 */
654static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
655{
656 u64 data;
657 struct kvm_msr_entry *msr;
658
659 if (!pdata) {
660 printk(KERN_ERR "BUG: get_msr called with NULL pdata\n");
661 return -EINVAL;
662 }
663
664 switch (msr_index) {
665#ifdef CONFIG_X86_64
666 case MSR_FS_BASE:
667 data = vmcs_readl(GUEST_FS_BASE);
668 break;
669 case MSR_GS_BASE:
670 data = vmcs_readl(GUEST_GS_BASE);
671 break;
672 case MSR_EFER:
673 return kvm_get_msr_common(vcpu, msr_index, pdata);
674#endif
675 case MSR_IA32_TIME_STAMP_COUNTER:
676 data = guest_read_tsc();
677 break;
678 case MSR_IA32_SYSENTER_CS:
679 data = vmcs_read32(GUEST_SYSENTER_CS);
680 break;
681 case MSR_IA32_SYSENTER_EIP:
682 data = vmcs_readl(GUEST_SYSENTER_EIP);
683 break;
684 case MSR_IA32_SYSENTER_ESP:
685 data = vmcs_readl(GUEST_SYSENTER_ESP);
686 break;
687 default:
688 msr = find_msr_entry(to_vmx(vcpu), msr_index);
689 if (msr) {
690 data = msr->data;
691 break;
692 }
693 return kvm_get_msr_common(vcpu, msr_index, pdata);
694 }
695
696 *pdata = data;
697 return 0;
698}
699
700/*
701 * Writes msr value into the appropriate "register".
702 * Returns 0 on success, non-0 otherwise.
703 * Assumes vcpu_load() was already called.
704 */
705static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
706{
707 struct vcpu_vmx *vmx = to_vmx(vcpu);
708 struct kvm_msr_entry *msr;
709 int ret = 0;
710
711 switch (msr_index) {
712#ifdef CONFIG_X86_64
713 case MSR_EFER:
714 ret = kvm_set_msr_common(vcpu, msr_index, data);
715 if (vmx->host_state.loaded)
716 load_transition_efer(vmx);
717 break;
718 case MSR_FS_BASE:
719 vmcs_writel(GUEST_FS_BASE, data);
720 break;
721 case MSR_GS_BASE:
722 vmcs_writel(GUEST_GS_BASE, data);
723 break;
724#endif
725 case MSR_IA32_SYSENTER_CS:
726 vmcs_write32(GUEST_SYSENTER_CS, data);
727 break;
728 case MSR_IA32_SYSENTER_EIP:
729 vmcs_writel(GUEST_SYSENTER_EIP, data);
730 break;
731 case MSR_IA32_SYSENTER_ESP:
732 vmcs_writel(GUEST_SYSENTER_ESP, data);
733 break;
734 case MSR_IA32_TIME_STAMP_COUNTER:
735 guest_write_tsc(data);
736 break;
737 default:
738 msr = find_msr_entry(vmx, msr_index);
739 if (msr) {
740 msr->data = data;
741 if (vmx->host_state.loaded)
742 load_msrs(vmx->guest_msrs, vmx->save_nmsrs);
743 break;
744 }
745 ret = kvm_set_msr_common(vcpu, msr_index, data);
746 }
747
748 return ret;
749}
750
751/*
752 * Sync the rsp and rip registers into the vcpu structure. This allows
753 * registers to be accessed by indexing vcpu->regs.
754 */
755static void vcpu_load_rsp_rip(struct kvm_vcpu *vcpu)
756{
757 vcpu->regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP);
758 vcpu->rip = vmcs_readl(GUEST_RIP);
759}
760
761/*
762 * Syncs rsp and rip back into the vmcs. Should be called after possible
763 * modification.
764 */
765static void vcpu_put_rsp_rip(struct kvm_vcpu *vcpu)
766{
767 vmcs_writel(GUEST_RSP, vcpu->regs[VCPU_REGS_RSP]);
768 vmcs_writel(GUEST_RIP, vcpu->rip);
769}
770
771static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg)
772{
773 unsigned long dr7 = 0x400;
774 int old_singlestep;
775
776 old_singlestep = vcpu->guest_debug.singlestep;
777
778 vcpu->guest_debug.enabled = dbg->enabled;
779 if (vcpu->guest_debug.enabled) {
780 int i;
781
782 dr7 |= 0x200; /* exact */
783 for (i = 0; i < 4; ++i) {
784 if (!dbg->breakpoints[i].enabled)
785 continue;
786 vcpu->guest_debug.bp[i] = dbg->breakpoints[i].address;
787 dr7 |= 2 << (i*2); /* global enable */
788 dr7 |= 0 << (i*4+16); /* execution breakpoint */
789 }
790
791 vcpu->guest_debug.singlestep = dbg->singlestep;
792 } else
793 vcpu->guest_debug.singlestep = 0;
794
795 if (old_singlestep && !vcpu->guest_debug.singlestep) {
796 unsigned long flags;
797
798 flags = vmcs_readl(GUEST_RFLAGS);
799 flags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
800 vmcs_writel(GUEST_RFLAGS, flags);
801 }
802
803 update_exception_bitmap(vcpu);
804 vmcs_writel(GUEST_DR7, dr7);
805
806 return 0;
807}
808
809static int vmx_get_irq(struct kvm_vcpu *vcpu)
810{
811 u32 idtv_info_field;
812
813 idtv_info_field = vmcs_read32(IDT_VECTORING_INFO_FIELD);
814 if (idtv_info_field & INTR_INFO_VALID_MASK) {
815 if (is_external_interrupt(idtv_info_field))
816 return idtv_info_field & VECTORING_INFO_VECTOR_MASK;
817 else
818 printk("pending exception: not handled yet\n");
819 }
820 return -1;
821}
822
823static __init int cpu_has_kvm_support(void)
824{
825 unsigned long ecx = cpuid_ecx(1);
826 return test_bit(5, &ecx); /* CPUID.1:ECX.VMX[bit 5] -> VT */
827}
828
829static __init int vmx_disabled_by_bios(void)
830{
831 u64 msr;
832
833 rdmsrl(MSR_IA32_FEATURE_CONTROL, msr);
834 return (msr & (MSR_IA32_FEATURE_CONTROL_LOCKED |
835 MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED))
836 == MSR_IA32_FEATURE_CONTROL_LOCKED;
837 /* locked but not enabled */
838}
839
840static void hardware_enable(void *garbage)
841{
842 int cpu = raw_smp_processor_id();
843 u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
844 u64 old;
845
846 rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
847 if ((old & (MSR_IA32_FEATURE_CONTROL_LOCKED |
848 MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED))
849 != (MSR_IA32_FEATURE_CONTROL_LOCKED |
850 MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED))
851 /* enable and lock */
852 wrmsrl(MSR_IA32_FEATURE_CONTROL, old |
853 MSR_IA32_FEATURE_CONTROL_LOCKED |
854 MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED);
855 write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */
856 asm volatile (ASM_VMX_VMXON_RAX : : "a"(&phys_addr), "m"(phys_addr)
857 : "memory", "cc");
858}
859
860static void hardware_disable(void *garbage)
861{
862 asm volatile (ASM_VMX_VMXOFF : : : "cc");
863}
864
865static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
866 u32 msr, u32* result)
867{
868 u32 vmx_msr_low, vmx_msr_high;
869 u32 ctl = ctl_min | ctl_opt;
870
871 rdmsr(msr, vmx_msr_low, vmx_msr_high);
872
873 ctl &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */
874 ctl |= vmx_msr_low; /* bit == 1 in low word ==> must be one */
875
876 /* Ensure minimum (required) set of control bits are supported. */
877 if (ctl_min & ~ctl)
878 return -EIO;
879
880 *result = ctl;
881 return 0;
882}
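/*
 * Worked example with hypothetical register values: for ctl_min = 0x16
 * and ctl_opt = 0x08, ctl starts as 0x1e.  If the capability MSR reads
 * low = 0x06 and high = 0x3e, then ctl = (0x1e & 0x3e) | 0x06 = 0x1e;
 * ctl_min & ~ctl is 0, so 0x1e is returned in *result.  Had 'high'
 * cleared one of the ctl_min bits, the function would return -EIO.
 */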
883
884static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
885{
886 u32 vmx_msr_low, vmx_msr_high;
887 u32 min, opt;
888 u32 _pin_based_exec_control = 0;
889 u32 _cpu_based_exec_control = 0;
890 u32 _vmexit_control = 0;
891 u32 _vmentry_control = 0;
892
893 min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING;
894 opt = 0;
895 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS,
896 &_pin_based_exec_control) < 0)
897 return -EIO;
898
899 min = CPU_BASED_HLT_EXITING |
900#ifdef CONFIG_X86_64
901 CPU_BASED_CR8_LOAD_EXITING |
902 CPU_BASED_CR8_STORE_EXITING |
903#endif
904 CPU_BASED_USE_IO_BITMAPS |
905 CPU_BASED_MOV_DR_EXITING |
906 CPU_BASED_USE_TSC_OFFSETING;
907#ifdef CONFIG_X86_64
908 opt = CPU_BASED_TPR_SHADOW;
909#else
910 opt = 0;
911#endif
912 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS,
913 &_cpu_based_exec_control) < 0)
914 return -EIO;
915#ifdef CONFIG_X86_64
916 if ((_cpu_based_exec_control & CPU_BASED_TPR_SHADOW))
917 _cpu_based_exec_control &= ~CPU_BASED_CR8_LOAD_EXITING &
918 ~CPU_BASED_CR8_STORE_EXITING;
919#endif
920
921 min = 0;
922#ifdef CONFIG_X86_64
923 min |= VM_EXIT_HOST_ADDR_SPACE_SIZE;
924#endif
925 opt = 0;
926 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS,
927 &_vmexit_control) < 0)
928 return -EIO;
929
930 min = opt = 0;
931 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS,
932 &_vmentry_control) < 0)
933 return -EIO;
934
935 rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high);
936
937 /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */
938 if ((vmx_msr_high & 0x1fff) > PAGE_SIZE)
939 return -EIO;
940
941#ifdef CONFIG_X86_64
942 /* IA-32 SDM Vol 3B: 64-bit CPUs always have VMX_BASIC_MSR[48]==0. */
943 if (vmx_msr_high & (1u<<16))
944 return -EIO;
945#endif
946
947 /* Require Write-Back (WB) memory type for VMCS accesses. */
948 if (((vmx_msr_high >> 18) & 15) != 6)
949 return -EIO;
950
951 vmcs_conf->size = vmx_msr_high & 0x1fff;
952 vmcs_conf->order = get_order(vmcs_config.size);
953 vmcs_conf->revision_id = vmx_msr_low;
954
955 vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control;
956 vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control;
957 vmcs_conf->vmexit_ctrl = _vmexit_control;
958 vmcs_conf->vmentry_ctrl = _vmentry_control;
959
960 return 0;
961}
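/*
 * Decoding of IA32_VMX_BASIC as used above: the low dword supplies the
 * VMCS revision identifier, bits 44:32 (vmx_msr_high & 0x1fff) give
 * the VMCS region size, bit 48 (high dword bit 16) flags a 32-bit
 * physical-address limitation that 64-bit CPUs must not set, and bits
 * 53:50 encode the required memory type (6 == write-back).
 */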
962
963static struct vmcs *alloc_vmcs_cpu(int cpu)
964{
965 int node = cpu_to_node(cpu);
966 struct page *pages;
967 struct vmcs *vmcs;
968
969 pages = alloc_pages_node(node, GFP_KERNEL, vmcs_config.order);
970 if (!pages)
971 return NULL;
972 vmcs = page_address(pages);
973 memset(vmcs, 0, vmcs_config.size);
974 vmcs->revision_id = vmcs_config.revision_id; /* vmcs revision id */
975 return vmcs;
976}
977
978static struct vmcs *alloc_vmcs(void)
979{
980 return alloc_vmcs_cpu(raw_smp_processor_id());
981}
982
983static void free_vmcs(struct vmcs *vmcs)
984{
985 free_pages((unsigned long)vmcs, vmcs_config.order);
986}
987
988static void free_kvm_area(void)
989{
990 int cpu;
991
992 for_each_online_cpu(cpu)
993 free_vmcs(per_cpu(vmxarea, cpu));
994}
995
996static __init int alloc_kvm_area(void)
997{
998 int cpu;
999
1000 for_each_online_cpu(cpu) {
1001 struct vmcs *vmcs;
1002
1003 vmcs = alloc_vmcs_cpu(cpu);
1004 if (!vmcs) {
1005 free_kvm_area();
1006 return -ENOMEM;
1007 }
1008
1009 per_cpu(vmxarea, cpu) = vmcs;
1010 }
1011 return 0;
1012}
1013
1014static __init int hardware_setup(void)
1015{
1016 if (setup_vmcs_config(&vmcs_config) < 0)
1017 return -EIO;
1018 return alloc_kvm_area();
1019}
1020
1021static __exit void hardware_unsetup(void)
1022{
1023 free_kvm_area();
1024}
1025
1026static void fix_pmode_dataseg(int seg, struct kvm_save_segment *save)
1027{
1028 struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
1029
1030 if (vmcs_readl(sf->base) == save->base && (save->base & AR_S_MASK)) {
1031 vmcs_write16(sf->selector, save->selector);
1032 vmcs_writel(sf->base, save->base);
1033 vmcs_write32(sf->limit, save->limit);
1034 vmcs_write32(sf->ar_bytes, save->ar);
1035 } else {
1036 u32 dpl = (vmcs_read16(sf->selector) & SELECTOR_RPL_MASK)
1037 << AR_DPL_SHIFT;
1038 vmcs_write32(sf->ar_bytes, 0x93 | dpl);
1039 }
1040}
1041
1042static void enter_pmode(struct kvm_vcpu *vcpu)
1043{
1044 unsigned long flags;
1045
1046 vcpu->rmode.active = 0;
1047
1048 vmcs_writel(GUEST_TR_BASE, vcpu->rmode.tr.base);
1049 vmcs_write32(GUEST_TR_LIMIT, vcpu->rmode.tr.limit);
1050 vmcs_write32(GUEST_TR_AR_BYTES, vcpu->rmode.tr.ar);
1051
1052 flags = vmcs_readl(GUEST_RFLAGS);
1053 flags &= ~(X86_EFLAGS_IOPL | X86_EFLAGS_VM);
1054 flags |= (vcpu->rmode.save_iopl << IOPL_SHIFT);
1055 vmcs_writel(GUEST_RFLAGS, flags);
1056
1057 vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) |
1058 (vmcs_readl(CR4_READ_SHADOW) & X86_CR4_VME));
1059
1060 update_exception_bitmap(vcpu);
1061
1062 fix_pmode_dataseg(VCPU_SREG_ES, &vcpu->rmode.es);
1063 fix_pmode_dataseg(VCPU_SREG_DS, &vcpu->rmode.ds);
1064 fix_pmode_dataseg(VCPU_SREG_GS, &vcpu->rmode.gs);
1065 fix_pmode_dataseg(VCPU_SREG_FS, &vcpu->rmode.fs);
1066
1067 vmcs_write16(GUEST_SS_SELECTOR, 0);
1068 vmcs_write32(GUEST_SS_AR_BYTES, 0x93);
1069
1070 vmcs_write16(GUEST_CS_SELECTOR,
1071 vmcs_read16(GUEST_CS_SELECTOR) & ~SELECTOR_RPL_MASK);
1072 vmcs_write32(GUEST_CS_AR_BYTES, 0x9b);
1073}
1074
1075static gva_t rmode_tss_base(struct kvm* kvm)
1076{
1077 gfn_t base_gfn = kvm->memslots[0].base_gfn + kvm->memslots[0].npages - 3;
1078 return base_gfn << PAGE_SHIFT;
1079}
1080
1081static void fix_rmode_seg(int seg, struct kvm_save_segment *save)
1082{
1083 struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
1084
1085 save->selector = vmcs_read16(sf->selector);
1086 save->base = vmcs_readl(sf->base);
1087 save->limit = vmcs_read32(sf->limit);
1088 save->ar = vmcs_read32(sf->ar_bytes);
1089 vmcs_write16(sf->selector, vmcs_readl(sf->base) >> 4);
1090 vmcs_write32(sf->limit, 0xffff);
1091 vmcs_write32(sf->ar_bytes, 0xf3);
1092}
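/*
 * fix_rmode_seg() saves the protected-mode register state and then
 * forces vm86-compatible values: in real mode base == selector << 4,
 * hence the selector is derived as base >> 4, the limit becomes
 * 0xffff, and 0xf3 describes a present, DPL-3, writable data segment.
 */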
1093
1094static void enter_rmode(struct kvm_vcpu *vcpu)
1095{
1096 unsigned long flags;
1097
1098 vcpu->rmode.active = 1;
1099
1100 vcpu->rmode.tr.base = vmcs_readl(GUEST_TR_BASE);
1101 vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm));
1102
1103 vcpu->rmode.tr.limit = vmcs_read32(GUEST_TR_LIMIT);
1104 vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1);
1105
1106 vcpu->rmode.tr.ar = vmcs_read32(GUEST_TR_AR_BYTES);
1107 vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
1108
1109 flags = vmcs_readl(GUEST_RFLAGS);
1110 vcpu->rmode.save_iopl = (flags & X86_EFLAGS_IOPL) >> IOPL_SHIFT;
1111
1112 flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
1113
1114 vmcs_writel(GUEST_RFLAGS, flags);
1115 vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME);
1116 update_exception_bitmap(vcpu);
1117
1118 vmcs_write16(GUEST_SS_SELECTOR, vmcs_readl(GUEST_SS_BASE) >> 4);
1119 vmcs_write32(GUEST_SS_LIMIT, 0xffff);
1120 vmcs_write32(GUEST_SS_AR_BYTES, 0xf3);
1121
1122 vmcs_write32(GUEST_CS_AR_BYTES, 0xf3);
1123 vmcs_write32(GUEST_CS_LIMIT, 0xffff);
1124 if (vmcs_readl(GUEST_CS_BASE) == 0xffff0000)
1125 vmcs_writel(GUEST_CS_BASE, 0xf0000);
1126 vmcs_write16(GUEST_CS_SELECTOR, vmcs_readl(GUEST_CS_BASE) >> 4);
1127
1128 fix_rmode_seg(VCPU_SREG_ES, &vcpu->rmode.es);
1129 fix_rmode_seg(VCPU_SREG_DS, &vcpu->rmode.ds);
1130 fix_rmode_seg(VCPU_SREG_GS, &vcpu->rmode.gs);
1131 fix_rmode_seg(VCPU_SREG_FS, &vcpu->rmode.fs);
1132
1133 kvm_mmu_reset_context(vcpu);
1134 init_rmode_tss(vcpu->kvm);
1135}
1136
1137#ifdef CONFIG_X86_64
1138
1139static void enter_lmode(struct kvm_vcpu *vcpu)
1140{
1141 u32 guest_tr_ar;
1142
1143 guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES);
1144 if ((guest_tr_ar & AR_TYPE_MASK) != AR_TYPE_BUSY_64_TSS) {
1145 printk(KERN_DEBUG "%s: tss fixup for long mode. \n",
1146 __FUNCTION__);
1147 vmcs_write32(GUEST_TR_AR_BYTES,
1148 (guest_tr_ar & ~AR_TYPE_MASK)
1149 | AR_TYPE_BUSY_64_TSS);
1150 }
1151
1152 vcpu->shadow_efer |= EFER_LMA;
1153
1154 find_msr_entry(to_vmx(vcpu), MSR_EFER)->data |= EFER_LMA | EFER_LME;
1155 vmcs_write32(VM_ENTRY_CONTROLS,
1156 vmcs_read32(VM_ENTRY_CONTROLS)
1157 | VM_ENTRY_IA32E_MODE);
1158}
1159
1160static void exit_lmode(struct kvm_vcpu *vcpu)
1161{
1162 vcpu->shadow_efer &= ~EFER_LMA;
1163
1164 vmcs_write32(VM_ENTRY_CONTROLS,
1165 vmcs_read32(VM_ENTRY_CONTROLS)
1166 & ~VM_ENTRY_IA32E_MODE);
1167}
1168
1169#endif
1170
1171static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
1172{
1173 vcpu->cr4 &= KVM_GUEST_CR4_MASK;
1174 vcpu->cr4 |= vmcs_readl(GUEST_CR4) & ~KVM_GUEST_CR4_MASK;
1175}
1176
1177static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
1178{
1179 vmx_fpu_deactivate(vcpu);
1180
1181 if (vcpu->rmode.active && (cr0 & X86_CR0_PE))
1182 enter_pmode(vcpu);
1183
1184 if (!vcpu->rmode.active && !(cr0 & X86_CR0_PE))
1185 enter_rmode(vcpu);
1186
1187#ifdef CONFIG_X86_64
1188 if (vcpu->shadow_efer & EFER_LME) {
1189 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG))
1190 enter_lmode(vcpu);
1191 if (is_paging(vcpu) && !(cr0 & X86_CR0_PG))
1192 exit_lmode(vcpu);
1193 }
1194#endif
1195
1196 vmcs_writel(CR0_READ_SHADOW, cr0);
1197 vmcs_writel(GUEST_CR0,
1198 (cr0 & ~KVM_GUEST_CR0_MASK) | KVM_VM_CR0_ALWAYS_ON);
1199 vcpu->cr0 = cr0;
1200
1201 if (!(cr0 & X86_CR0_TS) || !(cr0 & X86_CR0_PE))
1202 vmx_fpu_activate(vcpu);
1203}
1204
1205static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
1206{
1207 vmcs_writel(GUEST_CR3, cr3);
1208 if (vcpu->cr0 & X86_CR0_PE)
1209 vmx_fpu_deactivate(vcpu);
1210}
1211
1212static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
1213{
1214 vmcs_writel(CR4_READ_SHADOW, cr4);
1215 vmcs_writel(GUEST_CR4, cr4 | (vcpu->rmode.active ?
1216 KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON));
1217 vcpu->cr4 = cr4;
1218}
1219
1220#ifdef CONFIG_X86_64
1221
1222static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
1223{
1224 struct vcpu_vmx *vmx = to_vmx(vcpu);
1225 struct kvm_msr_entry *msr = find_msr_entry(vmx, MSR_EFER);
1226
1227 vcpu->shadow_efer = efer;
1228 if (efer & EFER_LMA) {
1229 vmcs_write32(VM_ENTRY_CONTROLS,
1230 vmcs_read32(VM_ENTRY_CONTROLS) |
1231 VM_ENTRY_IA32E_MODE);
1232 msr->data = efer;
1233
1234 } else {
1235 vmcs_write32(VM_ENTRY_CONTROLS,
1236 vmcs_read32(VM_ENTRY_CONTROLS) &
1237 ~VM_ENTRY_IA32E_MODE);
1238
1239 msr->data = efer & ~EFER_LME;
1240 }
1241 setup_msrs(vmx);
1242}
1243
1244#endif
1245
1246static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg)
1247{
1248 struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
1249
1250 return vmcs_readl(sf->base);
1251}
1252
1253static void vmx_get_segment(struct kvm_vcpu *vcpu,
1254 struct kvm_segment *var, int seg)
1255{
1256 struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
1257 u32 ar;
1258
1259 var->base = vmcs_readl(sf->base);
1260 var->limit = vmcs_read32(sf->limit);
1261 var->selector = vmcs_read16(sf->selector);
1262 ar = vmcs_read32(sf->ar_bytes);
1263 if (ar & AR_UNUSABLE_MASK)
1264 ar = 0;
1265 var->type = ar & 15;
1266 var->s = (ar >> 4) & 1;
1267 var->dpl = (ar >> 5) & 3;
1268 var->present = (ar >> 7) & 1;
1269 var->avl = (ar >> 12) & 1;
1270 var->l = (ar >> 13) & 1;
1271 var->db = (ar >> 14) & 1;
1272 var->g = (ar >> 15) & 1;
1273 var->unusable = (ar >> 16) & 1;
1274}
1275
1276static u32 vmx_segment_access_rights(struct kvm_segment *var)
1277{
1278 u32 ar;
1279
1280 if (var->unusable)
1281 ar = 1 << 16;
1282 else {
1283 ar = var->type & 15;
1284 ar |= (var->s & 1) << 4;
1285 ar |= (var->dpl & 3) << 5;
1286 ar |= (var->present & 1) << 7;
1287 ar |= (var->avl & 1) << 12;
1288 ar |= (var->l & 1) << 13;
1289 ar |= (var->db & 1) << 14;
1290 ar |= (var->g & 1) << 15;
1291 }
1292 if (ar == 0) /* a 0 value means unusable */
1293 ar = AR_UNUSABLE_MASK;
1294
1295 return ar;
1296}
1297
1298static void vmx_set_segment(struct kvm_vcpu *vcpu,
1299 struct kvm_segment *var, int seg)
1300{
1301 struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
1302 u32 ar;
1303
1304 if (vcpu->rmode.active && seg == VCPU_SREG_TR) {
1305 vcpu->rmode.tr.selector = var->selector;
1306 vcpu->rmode.tr.base = var->base;
1307 vcpu->rmode.tr.limit = var->limit;
1308 vcpu->rmode.tr.ar = vmx_segment_access_rights(var);
1309 return;
1310 }
1311 vmcs_writel(sf->base, var->base);
1312 vmcs_write32(sf->limit, var->limit);
1313 vmcs_write16(sf->selector, var->selector);
1314 if (vcpu->rmode.active && var->s) {
1315 /*
1316 * Hack real-mode segments into vm86 compatibility.
1317 */
1318 if (var->base == 0xffff0000 && var->selector == 0xf000)
1319 vmcs_writel(sf->base, 0xf0000);
1320 ar = 0xf3;
1321 } else
1322 ar = vmx_segment_access_rights(var);
1323 vmcs_write32(sf->ar_bytes, ar);
1324}
1325
1326static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
1327{
1328 u32 ar = vmcs_read32(GUEST_CS_AR_BYTES);
1329
1330 *db = (ar >> 14) & 1;
1331 *l = (ar >> 13) & 1;
1332}
1333
1334static void vmx_get_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
1335{
1336 dt->limit = vmcs_read32(GUEST_IDTR_LIMIT);
1337 dt->base = vmcs_readl(GUEST_IDTR_BASE);
1338}
1339
1340static void vmx_set_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
1341{
1342 vmcs_write32(GUEST_IDTR_LIMIT, dt->limit);
1343 vmcs_writel(GUEST_IDTR_BASE, dt->base);
1344}
1345
1346static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
1347{
1348 dt->limit = vmcs_read32(GUEST_GDTR_LIMIT);
1349 dt->base = vmcs_readl(GUEST_GDTR_BASE);
1350}
1351
1352static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
1353{
1354 vmcs_write32(GUEST_GDTR_LIMIT, dt->limit);
1355 vmcs_writel(GUEST_GDTR_BASE, dt->base);
1356}
1357
1358static int init_rmode_tss(struct kvm* kvm)
1359{
1360 struct page *p1, *p2, *p3;
1361 gfn_t fn = rmode_tss_base(kvm) >> PAGE_SHIFT;
1362 char *page;
1363
1364 p1 = gfn_to_page(kvm, fn++);
1365 p2 = gfn_to_page(kvm, fn++);
1366 p3 = gfn_to_page(kvm, fn);
1367
1368 if (!p1 || !p2 || !p3) {
1369 kvm_printf(kvm,"%s: gfn_to_page failed\n", __FUNCTION__);
1370 return 0;
1371 }
1372
1373 page = kmap_atomic(p1, KM_USER0);
1374 clear_page(page);
1375 *(u16*)(page + 0x66) = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE;
1376 kunmap_atomic(page, KM_USER0);
1377
1378 page = kmap_atomic(p2, KM_USER0);
1379 clear_page(page);
1380 kunmap_atomic(page, KM_USER0);
1381
1382 page = kmap_atomic(p3, KM_USER0);
1383 clear_page(page);
1384 *(page + RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1) = ~0;
1385 kunmap_atomic(page, KM_USER0);
1386
1387 return 1;
1388}
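/*
 * The real-mode TSS occupies the three pages just below the top of the
 * first memory slot (rmode_tss_base()).  Offset 0x66 in the first page
 * is the TSS I/O map base field, and the last byte of the region is
 * set to all ones, the terminator byte the architecture expects after
 * the I/O permission bitmap.
 */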
1389
1390static void seg_setup(int seg)
1391{
1392 struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
1393
1394 vmcs_write16(sf->selector, 0);
1395 vmcs_writel(sf->base, 0);
1396 vmcs_write32(sf->limit, 0xffff);
1397 vmcs_write32(sf->ar_bytes, 0x93);
1398}
1399
1400/*
1401 * Sets up the vmcs for emulated real mode.
1402 */
1403static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
1404{
1405 u32 host_sysenter_cs;
1406 u32 junk;
1407 unsigned long a;
1408 struct descriptor_table dt;
1409 int i;
1410 int ret = 0;
1411 unsigned long kvm_vmx_return;
1412 u64 msr;
1413 u32 exec_control;
1414
1415 if (!init_rmode_tss(vmx->vcpu.kvm)) {
1416 ret = -ENOMEM;
1417 goto out;
1418 }
1419
1420 vmx->vcpu.rmode.active = 0;
1421
1422 vmx->vcpu.regs[VCPU_REGS_RDX] = get_rdx_init_val();
1423 set_cr8(&vmx->vcpu, 0);
1424 msr = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
1425 if (vmx->vcpu.vcpu_id == 0)
1426 msr |= MSR_IA32_APICBASE_BSP;
1427 kvm_set_apic_base(&vmx->vcpu, msr);
1428
1429 fx_init(&vmx->vcpu);
1430
1431 /*
1432 * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode
1433 * insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4. Sigh.
1434 */
1435 if (vmx->vcpu.vcpu_id == 0) {
1436 vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
1437 vmcs_writel(GUEST_CS_BASE, 0x000f0000);
1438 } else {
1439 vmcs_write16(GUEST_CS_SELECTOR, vmx->vcpu.sipi_vector << 8);
1440 vmcs_writel(GUEST_CS_BASE, vmx->vcpu.sipi_vector << 12);
1441 }
1442 vmcs_write32(GUEST_CS_LIMIT, 0xffff);
1443 vmcs_write32(GUEST_CS_AR_BYTES, 0x9b);
1444
1445 seg_setup(VCPU_SREG_DS);
1446 seg_setup(VCPU_SREG_ES);
1447 seg_setup(VCPU_SREG_FS);
1448 seg_setup(VCPU_SREG_GS);
1449 seg_setup(VCPU_SREG_SS);
1450
1451 vmcs_write16(GUEST_TR_SELECTOR, 0);
1452 vmcs_writel(GUEST_TR_BASE, 0);
1453 vmcs_write32(GUEST_TR_LIMIT, 0xffff);
1454 vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
1455
1456 vmcs_write16(GUEST_LDTR_SELECTOR, 0);
1457 vmcs_writel(GUEST_LDTR_BASE, 0);
1458 vmcs_write32(GUEST_LDTR_LIMIT, 0xffff);
1459 vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082);
1460
1461 vmcs_write32(GUEST_SYSENTER_CS, 0);
1462 vmcs_writel(GUEST_SYSENTER_ESP, 0);
1463 vmcs_writel(GUEST_SYSENTER_EIP, 0);
1464
1465 vmcs_writel(GUEST_RFLAGS, 0x02);
1466 if (vmx->vcpu.vcpu_id == 0)
1467 vmcs_writel(GUEST_RIP, 0xfff0);
1468 else
1469 vmcs_writel(GUEST_RIP, 0);
1470 vmcs_writel(GUEST_RSP, 0);
1471
1472	/* todo: dr0 = dr1 = dr2 = dr3 = 0; dr6 = 0xffff0ff0 */
1473 vmcs_writel(GUEST_DR7, 0x400);
1474
1475 vmcs_writel(GUEST_GDTR_BASE, 0);
1476 vmcs_write32(GUEST_GDTR_LIMIT, 0xffff);
1477
1478 vmcs_writel(GUEST_IDTR_BASE, 0);
1479 vmcs_write32(GUEST_IDTR_LIMIT, 0xffff);
1480
1481 vmcs_write32(GUEST_ACTIVITY_STATE, 0);
1482 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
1483 vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0);
1484
1485 /* I/O */
1486 vmcs_write64(IO_BITMAP_A, page_to_phys(vmx_io_bitmap_a));
1487 vmcs_write64(IO_BITMAP_B, page_to_phys(vmx_io_bitmap_b));
1488
1489 guest_write_tsc(0);
1490
1491 vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */
1492
1493 /* Special registers */
1494 vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
1495
1496 /* Control */
1497 vmcs_write32(PIN_BASED_VM_EXEC_CONTROL,
1498 vmcs_config.pin_based_exec_ctrl);
1499
1500 exec_control = vmcs_config.cpu_based_exec_ctrl;
1501 if (!vm_need_tpr_shadow(vmx->vcpu.kvm)) {
1502 exec_control &= ~CPU_BASED_TPR_SHADOW;
1503#ifdef CONFIG_X86_64
1504 exec_control |= CPU_BASED_CR8_STORE_EXITING |
1505 CPU_BASED_CR8_LOAD_EXITING;
1506#endif
1507 }
1508 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control);
1509
1510 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
1511 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
1512 vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */
1513
1514 vmcs_writel(HOST_CR0, read_cr0()); /* 22.2.3 */
1515 vmcs_writel(HOST_CR4, read_cr4()); /* 22.2.3, 22.2.5 */
1516 vmcs_writel(HOST_CR3, read_cr3()); /* 22.2.3 FIXME: shadow tables */
1517
1518 vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */
1519 vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */
1520 vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */
1521 vmcs_write16(HOST_FS_SELECTOR, read_fs()); /* 22.2.4 */
1522 vmcs_write16(HOST_GS_SELECTOR, read_gs()); /* 22.2.4 */
1523 vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */
1524#ifdef CONFIG_X86_64
1525 rdmsrl(MSR_FS_BASE, a);
1526 vmcs_writel(HOST_FS_BASE, a); /* 22.2.4 */
1527 rdmsrl(MSR_GS_BASE, a);
1528 vmcs_writel(HOST_GS_BASE, a); /* 22.2.4 */
1529#else
1530 vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */
1531 vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */
1532#endif
1533
1534 vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */
1535
1536 get_idt(&dt);
1537 vmcs_writel(HOST_IDTR_BASE, dt.base); /* 22.2.4 */
1538
1539 asm ("mov $.Lkvm_vmx_return, %0" : "=r"(kvm_vmx_return));
1540 vmcs_writel(HOST_RIP, kvm_vmx_return); /* 22.2.5 */
1541 vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
1542 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
1543 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
1544
1545 rdmsr(MSR_IA32_SYSENTER_CS, host_sysenter_cs, junk);
1546 vmcs_write32(HOST_IA32_SYSENTER_CS, host_sysenter_cs);
1547 rdmsrl(MSR_IA32_SYSENTER_ESP, a);
1548 vmcs_writel(HOST_IA32_SYSENTER_ESP, a); /* 22.2.3 */
1549 rdmsrl(MSR_IA32_SYSENTER_EIP, a);
1550 vmcs_writel(HOST_IA32_SYSENTER_EIP, a); /* 22.2.3 */
1551
1552 for (i = 0; i < NR_VMX_MSR; ++i) {
1553 u32 index = vmx_msr_index[i];
1554 u32 data_low, data_high;
1555 u64 data;
1556 int j = vmx->nmsrs;
1557
1558 if (rdmsr_safe(index, &data_low, &data_high) < 0)
1559 continue;
1560 if (wrmsr_safe(index, data_low, data_high) < 0)
1561 continue;
1562 data = data_low | ((u64)data_high << 32);
1563 vmx->host_msrs[j].index = index;
1564 vmx->host_msrs[j].reserved = 0;
1565 vmx->host_msrs[j].data = data;
1566 vmx->guest_msrs[j] = vmx->host_msrs[j];
1567 ++vmx->nmsrs;
1568 }
1569
1570 setup_msrs(vmx);
1571
1572 vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl);
1573
1574 /* 22.2.1, 20.8.1 */
1575 vmcs_write32(VM_ENTRY_CONTROLS, vmcs_config.vmentry_ctrl);
1576
1577 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); /* 22.2.1 */
1578
1579#ifdef CONFIG_X86_64
1580 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
1581 if (vm_need_tpr_shadow(vmx->vcpu.kvm))
1582 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
1583 page_to_phys(vmx->vcpu.apic->regs_page));
1584 vmcs_write32(TPR_THRESHOLD, 0);
1585#endif
1586
1587 vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL);
1588 vmcs_writel(CR4_GUEST_HOST_MASK, KVM_GUEST_CR4_MASK);
1589
1590 vmx->vcpu.cr0 = 0x60000010;
1591	vmx_set_cr0(&vmx->vcpu, vmx->vcpu.cr0); /* enter rmode */
1592 vmx_set_cr4(&vmx->vcpu, 0);
1593#ifdef CONFIG_X86_64
1594 vmx_set_efer(&vmx->vcpu, 0);
1595#endif
1596 vmx_fpu_activate(&vmx->vcpu);
1597 update_exception_bitmap(&vmx->vcpu);
1598
1599 return 0;
1600
1601out:
1602 return ret;
1603}
1604
1605static void vmx_vcpu_reset(struct kvm_vcpu *vcpu)
1606{
1607 struct vcpu_vmx *vmx = to_vmx(vcpu);
1608
1609 vmx_vcpu_setup(vmx);
1610}
1611
1612static void inject_rmode_irq(struct kvm_vcpu *vcpu, int irq)
1613{
1614 u16 ent[2];
1615 u16 cs;
1616 u16 ip;
1617 unsigned long flags;
1618 unsigned long ss_base = vmcs_readl(GUEST_SS_BASE);
1619 u16 sp = vmcs_readl(GUEST_RSP);
1620 u32 ss_limit = vmcs_read32(GUEST_SS_LIMIT);
1621
1622 if (sp > ss_limit || sp < 6 ) {
1623 vcpu_printf(vcpu, "%s: #SS, rsp 0x%lx ss 0x%lx limit 0x%x\n",
1624 __FUNCTION__,
1625 vmcs_readl(GUEST_RSP),
1626 vmcs_readl(GUEST_SS_BASE),
1627 vmcs_read32(GUEST_SS_LIMIT));
1628 return;
1629 }
1630
1631 if (emulator_read_std(irq * sizeof(ent), &ent, sizeof(ent), vcpu) !=
1632 X86EMUL_CONTINUE) {
1633 vcpu_printf(vcpu, "%s: read guest err\n", __FUNCTION__);
1634 return;
1635 }
1636
1637 flags = vmcs_readl(GUEST_RFLAGS);
1638 cs = vmcs_readl(GUEST_CS_BASE) >> 4;
1639 ip = vmcs_readl(GUEST_RIP);
1640
1641
1642 if (emulator_write_emulated(ss_base + sp - 2, &flags, 2, vcpu) != X86EMUL_CONTINUE ||
1643 emulator_write_emulated(ss_base + sp - 4, &cs, 2, vcpu) != X86EMUL_CONTINUE ||
1644 emulator_write_emulated(ss_base + sp - 6, &ip, 2, vcpu) != X86EMUL_CONTINUE) {
1645 vcpu_printf(vcpu, "%s: write guest err\n", __FUNCTION__);
1646 return;
1647 }
1648
1649 vmcs_writel(GUEST_RFLAGS, flags &
1650 ~( X86_EFLAGS_IF | X86_EFLAGS_AC | X86_EFLAGS_TF));
1651 vmcs_write16(GUEST_CS_SELECTOR, ent[1]) ;
1652 vmcs_writel(GUEST_CS_BASE, ent[1] << 4);
1653 vmcs_writel(GUEST_RIP, ent[0]);
1654 vmcs_writel(GUEST_RSP, (vmcs_readl(GUEST_RSP) & ~0xffff) | (sp - 6));
1655}
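/*
 * inject_rmode_irq() mimics what the CPU does for a real-mode
 * interrupt: it reads the 4-byte IVT entry at linear address irq * 4
 * (ent[0] = new IP, ent[1] = new CS), pushes FLAGS, CS and IP on the
 * guest stack, clears IF/AC/TF, and loads CS:IP from the vector.
 */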
1656
1657static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq)
1658{
1659 if (vcpu->rmode.active) {
1660 inject_rmode_irq(vcpu, irq);
1661 return;
1662 }
1663 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
1664 irq | INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK);
1665}
1666
1667static void kvm_do_inject_irq(struct kvm_vcpu *vcpu)
1668{
1669 int word_index = __ffs(vcpu->irq_summary);
1670 int bit_index = __ffs(vcpu->irq_pending[word_index]);
1671 int irq = word_index * BITS_PER_LONG + bit_index;
1672
1673 clear_bit(bit_index, &vcpu->irq_pending[word_index]);
1674 if (!vcpu->irq_pending[word_index])
1675 clear_bit(word_index, &vcpu->irq_summary);
1676 vmx_inject_irq(vcpu, irq);
1677}
1678
1679
1680static void do_interrupt_requests(struct kvm_vcpu *vcpu,
1681 struct kvm_run *kvm_run)
1682{
1683 u32 cpu_based_vm_exec_control;
1684
1685 vcpu->interrupt_window_open =
1686 ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
1687 (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0);
1688
1689 if (vcpu->interrupt_window_open &&
1690 vcpu->irq_summary &&
1691 !(vmcs_read32(VM_ENTRY_INTR_INFO_FIELD) & INTR_INFO_VALID_MASK))
1692 /*
1693		 * Interrupts are enabled and not blocked by sti or mov ss; inject now.
1694 */
1695 kvm_do_inject_irq(vcpu);
1696
1697 cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
1698 if (!vcpu->interrupt_window_open &&
1699 (vcpu->irq_summary || kvm_run->request_interrupt_window))
1700 /*
1701 * Interrupts blocked. Wait for unblock.
1702 */
1703 cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING;
1704 else
1705 cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
1706 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
1707}
1708
1709static void kvm_guest_debug_pre(struct kvm_vcpu *vcpu)
1710{
1711 struct kvm_guest_debug *dbg = &vcpu->guest_debug;
1712
1713 set_debugreg(dbg->bp[0], 0);
1714 set_debugreg(dbg->bp[1], 1);
1715 set_debugreg(dbg->bp[2], 2);
1716 set_debugreg(dbg->bp[3], 3);
1717
1718 if (dbg->singlestep) {
1719 unsigned long flags;
1720
1721 flags = vmcs_readl(GUEST_RFLAGS);
1722 flags |= X86_EFLAGS_TF | X86_EFLAGS_RF;
1723 vmcs_writel(GUEST_RFLAGS, flags);
1724 }
1725}
1726
1727static int handle_rmode_exception(struct kvm_vcpu *vcpu,
1728 int vec, u32 err_code)
1729{
1730 if (!vcpu->rmode.active)
1731 return 0;
1732
1733 /*
1734	 * An instruction with the address-size override prefix (opcode 0x67)
1735	 * causes a #SS fault with error code 0 in VM86 mode.
1736 */
1737 if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0)
1738 if (emulate_instruction(vcpu, NULL, 0, 0) == EMULATE_DONE)
1739 return 1;
1740 return 0;
1741}
1742
1743static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1744{
1745 u32 intr_info, error_code;
1746 unsigned long cr2, rip;
1747 u32 vect_info;
1748 enum emulation_result er;
1749 int r;
1750
1751 vect_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
1752 intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
1753
1754 if ((vect_info & VECTORING_INFO_VALID_MASK) &&
1755 !is_page_fault(intr_info)) {
1756 printk(KERN_ERR "%s: unexpected, vectoring info 0x%x "
1757 "intr info 0x%x\n", __FUNCTION__, vect_info, intr_info);
1758 }
1759
1760 if (!irqchip_in_kernel(vcpu->kvm) && is_external_interrupt(vect_info)) {
1761 int irq = vect_info & VECTORING_INFO_VECTOR_MASK;
1762 set_bit(irq, vcpu->irq_pending);
1763 set_bit(irq / BITS_PER_LONG, &vcpu->irq_summary);
1764 }
1765
1766 if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200) /* nmi */
1767 return 1; /* already handled by vmx_vcpu_run() */
1768
1769 if (is_no_device(intr_info)) {
1770 vmx_fpu_activate(vcpu);
1771 return 1;
1772 }
1773
1774 error_code = 0;
1775 rip = vmcs_readl(GUEST_RIP);
1776 if (intr_info & INTR_INFO_DELIEVER_CODE_MASK)
1777 error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
1778 if (is_page_fault(intr_info)) {
1779 cr2 = vmcs_readl(EXIT_QUALIFICATION);
1780
1781 mutex_lock(&vcpu->kvm->lock);
1782 r = kvm_mmu_page_fault(vcpu, cr2, error_code);
1783 if (r < 0) {
1784 mutex_unlock(&vcpu->kvm->lock);
1785 return r;
1786 }
1787 if (!r) {
1788 mutex_unlock(&vcpu->kvm->lock);
1789 return 1;
1790 }
1791
1792 er = emulate_instruction(vcpu, kvm_run, cr2, error_code);
1793 mutex_unlock(&vcpu->kvm->lock);
1794
1795 switch (er) {
1796 case EMULATE_DONE:
1797 return 1;
1798 case EMULATE_DO_MMIO:
1799 ++vcpu->stat.mmio_exits;
1800 return 0;
1801 case EMULATE_FAIL:
1802 kvm_report_emulation_failure(vcpu, "pagetable");
1803 break;
1804 default:
1805 BUG();
1806 }
1807 }
1808
1809 if (vcpu->rmode.active &&
1810 handle_rmode_exception(vcpu, intr_info & INTR_INFO_VECTOR_MASK,
1811 error_code)) {
1812 if (vcpu->halt_request) {
1813 vcpu->halt_request = 0;
1814 return kvm_emulate_halt(vcpu);
1815 }
1816 return 1;
1817 }
1818
1819 if ((intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK)) == (INTR_TYPE_EXCEPTION | 1)) {
1820 kvm_run->exit_reason = KVM_EXIT_DEBUG;
1821 return 0;
1822 }
1823 kvm_run->exit_reason = KVM_EXIT_EXCEPTION;
1824 kvm_run->ex.exception = intr_info & INTR_INFO_VECTOR_MASK;
1825 kvm_run->ex.error_code = error_code;
1826 return 0;
1827}
1828
1829static int handle_external_interrupt(struct kvm_vcpu *vcpu,
1830 struct kvm_run *kvm_run)
1831{
1832 ++vcpu->stat.irq_exits;
1833 return 1;
1834}
1835
1836static int handle_triple_fault(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1837{
1838 kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
1839 return 0;
1840}
1841
1842static int handle_io(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1843{
1844 unsigned long exit_qualification;
1845 int size, down, in, string, rep;
1846 unsigned port;
1847
1848 ++vcpu->stat.io_exits;
1849 exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
1850 string = (exit_qualification & 16) != 0;
1851
1852 if (string) {
1853 if (emulate_instruction(vcpu, kvm_run, 0, 0) == EMULATE_DO_MMIO)
1854 return 0;
1855 return 1;
1856 }
1857
1858 size = (exit_qualification & 7) + 1;
1859 in = (exit_qualification & 8) != 0;
1860 down = (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_DF) != 0;
1861 rep = (exit_qualification & 32) != 0;
1862 port = exit_qualification >> 16;
1863
1864 return kvm_emulate_pio(vcpu, kvm_run, in, size, port);
1865}
1866
1867static void
1868vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
1869{
1870 /*
1871 * Patch in the VMCALL instruction:
1872 */
1873 hypercall[0] = 0x0f;
1874 hypercall[1] = 0x01;
1875 hypercall[2] = 0xc1;
1876 hypercall[3] = 0xc3;
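	/* 0f 01 c1 encodes vmcall; the trailing c3 is a ret */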
1877}
1878
1879static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1880{
1881 unsigned long exit_qualification;
1882 int cr;
1883 int reg;
1884
1885 exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
1886 cr = exit_qualification & 15;
1887 reg = (exit_qualification >> 8) & 15;
1888 switch ((exit_qualification >> 4) & 3) {
1889 case 0: /* mov to cr */
1890 switch (cr) {
1891 case 0:
1892 vcpu_load_rsp_rip(vcpu);
1893 set_cr0(vcpu, vcpu->regs[reg]);
1894 skip_emulated_instruction(vcpu);
1895 return 1;
1896 case 3:
1897 vcpu_load_rsp_rip(vcpu);
1898 set_cr3(vcpu, vcpu->regs[reg]);
1899 skip_emulated_instruction(vcpu);
1900 return 1;
1901 case 4:
1902 vcpu_load_rsp_rip(vcpu);
1903 set_cr4(vcpu, vcpu->regs[reg]);
1904 skip_emulated_instruction(vcpu);
1905 return 1;
1906 case 8:
1907 vcpu_load_rsp_rip(vcpu);
1908 set_cr8(vcpu, vcpu->regs[reg]);
1909 skip_emulated_instruction(vcpu);
1910 kvm_run->exit_reason = KVM_EXIT_SET_TPR;
1911 return 0;
1912 };
1913 break;
1914 case 2: /* clts */
1915 vcpu_load_rsp_rip(vcpu);
1916 vmx_fpu_deactivate(vcpu);
1917 vcpu->cr0 &= ~X86_CR0_TS;
1918 vmcs_writel(CR0_READ_SHADOW, vcpu->cr0);
1919 vmx_fpu_activate(vcpu);
1920 skip_emulated_instruction(vcpu);
1921 return 1;
1922 case 1: /*mov from cr*/
1923 switch (cr) {
1924 case 3:
1925 vcpu_load_rsp_rip(vcpu);
1926 vcpu->regs[reg] = vcpu->cr3;
1927 vcpu_put_rsp_rip(vcpu);
1928 skip_emulated_instruction(vcpu);
1929 return 1;
1930 case 8:
1931 vcpu_load_rsp_rip(vcpu);
1932 vcpu->regs[reg] = get_cr8(vcpu);
1933 vcpu_put_rsp_rip(vcpu);
1934 skip_emulated_instruction(vcpu);
1935 return 1;
1936 }
1937 break;
1938 case 3: /* lmsw */
1939 lmsw(vcpu, (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f);
1940
1941 skip_emulated_instruction(vcpu);
1942 return 1;
1943 default:
1944 break;
1945 }
1946 kvm_run->exit_reason = 0;
1947 pr_unimpl(vcpu, "unhandled control register: op %d cr %d\n",
1948 (int)(exit_qualification >> 4) & 3, cr);
1949 return 0;
1950}
1951
1952static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1953{
1954 unsigned long exit_qualification;
1955 unsigned long val;
1956 int dr, reg;
1957
1958 /*
1959 * FIXME: this code assumes the host is debugging the guest.
1960	 * We still need to handle the guest debugging itself.
1961 */
1962 exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
1963 dr = exit_qualification & 7;
1964 reg = (exit_qualification >> 8) & 15;
1965 vcpu_load_rsp_rip(vcpu);
1966 if (exit_qualification & 16) {
1967 /* mov from dr */
1968 switch (dr) {
1969 case 6:
1970 val = 0xffff0ff0;
1971 break;
1972 case 7:
1973 val = 0x400;
1974 break;
1975 default:
1976 val = 0;
1977 }
1978 vcpu->regs[reg] = val;
1979 } else {
1980 /* mov to dr */
1981 }
1982 vcpu_put_rsp_rip(vcpu);
1983 skip_emulated_instruction(vcpu);
1984 return 1;
1985}
1986
1987static int handle_cpuid(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1988{
1989 kvm_emulate_cpuid(vcpu);
1990 return 1;
1991}
1992
1993static int handle_rdmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1994{
1995 u32 ecx = vcpu->regs[VCPU_REGS_RCX];
1996 u64 data;
1997
1998 if (vmx_get_msr(vcpu, ecx, &data)) {
1999 vmx_inject_gp(vcpu, 0);
2000 return 1;
2001 }
2002
2003 /* FIXME: handling of bits 32:63 of rax, rdx */
2004 vcpu->regs[VCPU_REGS_RAX] = data & -1u;
2005 vcpu->regs[VCPU_REGS_RDX] = (data >> 32) & -1u;
2006 skip_emulated_instruction(vcpu);
2007 return 1;
2008}
2009
2010static int handle_wrmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2011{
2012 u32 ecx = vcpu->regs[VCPU_REGS_RCX];
2013 u64 data = (vcpu->regs[VCPU_REGS_RAX] & -1u)
2014 | ((u64)(vcpu->regs[VCPU_REGS_RDX] & -1u) << 32);
2015
2016 if (vmx_set_msr(vcpu, ecx, data) != 0) {
2017 vmx_inject_gp(vcpu, 0);
2018 return 1;
2019 }
2020
2021 skip_emulated_instruction(vcpu);
2022 return 1;
2023}
2024
2025static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu,
2026 struct kvm_run *kvm_run)
2027{
2028 return 1;
2029}
2030
2031static int handle_interrupt_window(struct kvm_vcpu *vcpu,
2032 struct kvm_run *kvm_run)
2033{
2034 u32 cpu_based_vm_exec_control;
2035
2036 /* clear pending irq */
2037 cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
2038 cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
2039 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
2040 /*
2041	 * If userspace is waiting to inject interrupts, exit as soon as
2042	 * possible
2043 */
2044 if (kvm_run->request_interrupt_window &&
2045 !vcpu->irq_summary) {
2046 kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
2047 ++vcpu->stat.irq_window_exits;
2048 return 0;
2049 }
2050 return 1;
2051}
2052
2053static int handle_halt(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2054{
2055 skip_emulated_instruction(vcpu);
2056 return kvm_emulate_halt(vcpu);
2057}
2058
2059static int handle_vmcall(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2060{
2061 skip_emulated_instruction(vcpu);
2062 return kvm_hypercall(vcpu, kvm_run);
2063}
2064
2065/*
2066 * The exit handlers return 1 if the exit was handled fully and guest execution
2067 * may resume. Otherwise they set the kvm_run parameter to indicate what needs
2068 * to be done to userspace and return 0.
2069 */
2070static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu,
2071 struct kvm_run *kvm_run) = {
2072 [EXIT_REASON_EXCEPTION_NMI] = handle_exception,
2073 [EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt,
2074 [EXIT_REASON_TRIPLE_FAULT] = handle_triple_fault,
2075 [EXIT_REASON_IO_INSTRUCTION] = handle_io,
2076 [EXIT_REASON_CR_ACCESS] = handle_cr,
2077 [EXIT_REASON_DR_ACCESS] = handle_dr,
2078 [EXIT_REASON_CPUID] = handle_cpuid,
2079 [EXIT_REASON_MSR_READ] = handle_rdmsr,
2080 [EXIT_REASON_MSR_WRITE] = handle_wrmsr,
2081 [EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window,
2082 [EXIT_REASON_HLT] = handle_halt,
2083 [EXIT_REASON_VMCALL] = handle_vmcall,
2084 [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold
2085};
2086
2087static const int kvm_vmx_max_exit_handlers =
2088 ARRAY_SIZE(kvm_vmx_exit_handlers);
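
/*
 * A minimal sketch of the handler convention described above; the handler
 * name and the "fully handled" test are hypothetical, not part of the
 * original table:
 *
 *	static int handle_example(struct kvm_vcpu *vcpu,
 *				  struct kvm_run *kvm_run)
 *	{
 *		if (handled_entirely_in_kernel(vcpu))	/* hypothetical test */
 *			return 1;			/* resume the guest */
 *		kvm_run->exit_reason = KVM_EXIT_UNKNOWN;	/* defer to userspace */
 *		return 0;
 *	}
 */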
2089
2090/*
2091 * The guest has exited. See if we can fix it or if we need userspace
2092 * assistance.
2093 */
2094static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
2095{
2096 u32 vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
2097 u32 exit_reason = vmcs_read32(VM_EXIT_REASON);
2098 struct vcpu_vmx *vmx = to_vmx(vcpu);
2099
2100 if (unlikely(vmx->fail)) {
2101 kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
2102 kvm_run->fail_entry.hardware_entry_failure_reason
2103 = vmcs_read32(VM_INSTRUCTION_ERROR);
2104 return 0;
2105 }
2106
2107 if ( (vectoring_info & VECTORING_INFO_VALID_MASK) &&
2108 exit_reason != EXIT_REASON_EXCEPTION_NMI )
2109 printk(KERN_WARNING "%s: unexpected, valid vectoring info and "
2110 "exit reason is 0x%x\n", __FUNCTION__, exit_reason);
2111 if (exit_reason < kvm_vmx_max_exit_handlers
2112 && kvm_vmx_exit_handlers[exit_reason])
2113 return kvm_vmx_exit_handlers[exit_reason](vcpu, kvm_run);
2114 else {
2115 kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
2116 kvm_run->hw.hardware_exit_reason = exit_reason;
2117 }
2118 return 0;
2119}
2120
2121static void vmx_flush_tlb(struct kvm_vcpu *vcpu)
2122{
2123}
2124
2125static void update_tpr_threshold(struct kvm_vcpu *vcpu)
2126{
2127 int max_irr, tpr;
2128
2129 if (!vm_need_tpr_shadow(vcpu->kvm))
2130 return;
2131
2132 if (!kvm_lapic_enabled(vcpu) ||
2133 ((max_irr = kvm_lapic_find_highest_irr(vcpu)) == -1)) {
2134 vmcs_write32(TPR_THRESHOLD, 0);
2135 return;
2136 }
2137
2138 tpr = (kvm_lapic_get_cr8(vcpu) & 0x0f) << 4;
2139 vmcs_write32(TPR_THRESHOLD, (max_irr > tpr) ? tpr >> 4 : max_irr >> 4);
2140}
2141
2142static void enable_irq_window(struct kvm_vcpu *vcpu)
2143{
2144 u32 cpu_based_vm_exec_control;
2145
2146 cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
2147 cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING;
2148 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
2149}
2150
2151static void vmx_intr_assist(struct kvm_vcpu *vcpu)
2152{
2153 u32 idtv_info_field, intr_info_field;
2154 int has_ext_irq, interrupt_window_open;
2155 int vector;
2156
2157 kvm_inject_pending_timer_irqs(vcpu);
2158 update_tpr_threshold(vcpu);
2159
2160 has_ext_irq = kvm_cpu_has_interrupt(vcpu);
2161 intr_info_field = vmcs_read32(VM_ENTRY_INTR_INFO_FIELD);
2162 idtv_info_field = vmcs_read32(IDT_VECTORING_INFO_FIELD);
2163 if (intr_info_field & INTR_INFO_VALID_MASK) {
2164 if (idtv_info_field & INTR_INFO_VALID_MASK) {
2165 /* TODO: fault when IDT_Vectoring */
2166 printk(KERN_ERR "Fault when IDT_Vectoring\n");
2167 }
2168 if (has_ext_irq)
2169 enable_irq_window(vcpu);
2170 return;
2171 }
2172 if (unlikely(idtv_info_field & INTR_INFO_VALID_MASK)) {
2173 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, idtv_info_field);
2174 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
2175 vmcs_read32(VM_EXIT_INSTRUCTION_LEN));
2176
2177 if (unlikely(idtv_info_field & INTR_INFO_DELIEVER_CODE_MASK))
2178 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
2179 vmcs_read32(IDT_VECTORING_ERROR_CODE));
2180 if (unlikely(has_ext_irq))
2181 enable_irq_window(vcpu);
2182 return;
2183 }
2184 if (!has_ext_irq)
2185 return;
2186 interrupt_window_open =
2187 ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
2188 (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0);
2189 if (interrupt_window_open) {
2190 vector = kvm_cpu_get_interrupt(vcpu);
2191 vmx_inject_irq(vcpu, vector);
2192 kvm_timer_intr_post(vcpu, vector);
2193 } else
2194 enable_irq_window(vcpu);
2195}
2196
2197static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2198{
2199 struct vcpu_vmx *vmx = to_vmx(vcpu);
2200 u32 intr_info;
2201
2202 /*
2203 * Loading guest fpu may have cleared host cr0.ts
2204 */
2205 vmcs_writel(HOST_CR0, read_cr0());
2206
2207 asm (
2208 /* Store host registers */
2209#ifdef CONFIG_X86_64
2210 "push %%rax; push %%rbx; push %%rdx;"
2211 "push %%rsi; push %%rdi; push %%rbp;"
2212 "push %%r8; push %%r9; push %%r10; push %%r11;"
2213 "push %%r12; push %%r13; push %%r14; push %%r15;"
2214 "push %%rcx \n\t"
2215 ASM_VMX_VMWRITE_RSP_RDX "\n\t"
2216#else
2217 "pusha; push %%ecx \n\t"
2218 ASM_VMX_VMWRITE_RSP_RDX "\n\t"
2219#endif
2220		/* Check if vmlaunch or vmresume is needed */
2221 "cmp $0, %1 \n\t"
2222 /* Load guest registers. Don't clobber flags. */
2223#ifdef CONFIG_X86_64
2224 "mov %c[cr2](%3), %%rax \n\t"
2225 "mov %%rax, %%cr2 \n\t"
2226 "mov %c[rax](%3), %%rax \n\t"
2227 "mov %c[rbx](%3), %%rbx \n\t"
2228 "mov %c[rdx](%3), %%rdx \n\t"
2229 "mov %c[rsi](%3), %%rsi \n\t"
2230 "mov %c[rdi](%3), %%rdi \n\t"
2231 "mov %c[rbp](%3), %%rbp \n\t"
2232 "mov %c[r8](%3), %%r8 \n\t"
2233 "mov %c[r9](%3), %%r9 \n\t"
2234 "mov %c[r10](%3), %%r10 \n\t"
2235 "mov %c[r11](%3), %%r11 \n\t"
2236 "mov %c[r12](%3), %%r12 \n\t"
2237 "mov %c[r13](%3), %%r13 \n\t"
2238 "mov %c[r14](%3), %%r14 \n\t"
2239 "mov %c[r15](%3), %%r15 \n\t"
2240 "mov %c[rcx](%3), %%rcx \n\t" /* kills %3 (rcx) */
2241#else
2242 "mov %c[cr2](%3), %%eax \n\t"
2243 "mov %%eax, %%cr2 \n\t"
2244 "mov %c[rax](%3), %%eax \n\t"
2245 "mov %c[rbx](%3), %%ebx \n\t"
2246 "mov %c[rdx](%3), %%edx \n\t"
2247 "mov %c[rsi](%3), %%esi \n\t"
2248 "mov %c[rdi](%3), %%edi \n\t"
2249 "mov %c[rbp](%3), %%ebp \n\t"
2250 "mov %c[rcx](%3), %%ecx \n\t" /* kills %3 (ecx) */
2251#endif
2252 /* Enter guest mode */
2253 "jne .Llaunched \n\t"
2254 ASM_VMX_VMLAUNCH "\n\t"
2255 "jmp .Lkvm_vmx_return \n\t"
2256 ".Llaunched: " ASM_VMX_VMRESUME "\n\t"
2257 ".Lkvm_vmx_return: "
2258 /* Save guest registers, load host registers, keep flags */
2259#ifdef CONFIG_X86_64
2260 "xchg %3, (%%rsp) \n\t"
2261 "mov %%rax, %c[rax](%3) \n\t"
2262 "mov %%rbx, %c[rbx](%3) \n\t"
2263 "pushq (%%rsp); popq %c[rcx](%3) \n\t"
2264 "mov %%rdx, %c[rdx](%3) \n\t"
2265 "mov %%rsi, %c[rsi](%3) \n\t"
2266 "mov %%rdi, %c[rdi](%3) \n\t"
2267 "mov %%rbp, %c[rbp](%3) \n\t"
2268 "mov %%r8, %c[r8](%3) \n\t"
2269 "mov %%r9, %c[r9](%3) \n\t"
2270 "mov %%r10, %c[r10](%3) \n\t"
2271 "mov %%r11, %c[r11](%3) \n\t"
2272 "mov %%r12, %c[r12](%3) \n\t"
2273 "mov %%r13, %c[r13](%3) \n\t"
2274 "mov %%r14, %c[r14](%3) \n\t"
2275 "mov %%r15, %c[r15](%3) \n\t"
2276 "mov %%cr2, %%rax \n\t"
2277 "mov %%rax, %c[cr2](%3) \n\t"
2278 "mov (%%rsp), %3 \n\t"
2279
2280 "pop %%rcx; pop %%r15; pop %%r14; pop %%r13; pop %%r12;"
2281 "pop %%r11; pop %%r10; pop %%r9; pop %%r8;"
2282 "pop %%rbp; pop %%rdi; pop %%rsi;"
2283 "pop %%rdx; pop %%rbx; pop %%rax \n\t"
2284#else
2285 "xchg %3, (%%esp) \n\t"
2286 "mov %%eax, %c[rax](%3) \n\t"
2287 "mov %%ebx, %c[rbx](%3) \n\t"
2288 "pushl (%%esp); popl %c[rcx](%3) \n\t"
2289 "mov %%edx, %c[rdx](%3) \n\t"
2290 "mov %%esi, %c[rsi](%3) \n\t"
2291 "mov %%edi, %c[rdi](%3) \n\t"
2292 "mov %%ebp, %c[rbp](%3) \n\t"
2293 "mov %%cr2, %%eax \n\t"
2294 "mov %%eax, %c[cr2](%3) \n\t"
2295 "mov (%%esp), %3 \n\t"
2296
2297 "pop %%ecx; popa \n\t"
2298#endif
2299 "setbe %0 \n\t"
2300 : "=q" (vmx->fail)
2301 : "r"(vmx->launched), "d"((unsigned long)HOST_RSP),
2302 "c"(vcpu),
2303 [rax]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RAX])),
2304 [rbx]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RBX])),
2305 [rcx]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RCX])),
2306 [rdx]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RDX])),
2307 [rsi]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RSI])),
2308 [rdi]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RDI])),
2309 [rbp]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RBP])),
2310#ifdef CONFIG_X86_64
2311 [r8 ]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R8 ])),
2312 [r9 ]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R9 ])),
2313 [r10]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R10])),
2314 [r11]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R11])),
2315 [r12]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R12])),
2316 [r13]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R13])),
2317 [r14]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R14])),
2318 [r15]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R15])),
2319#endif
2320 [cr2]"i"(offsetof(struct kvm_vcpu, cr2))
2321 : "cc", "memory" );
2322
2323 vcpu->interrupt_window_open = (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0;
2324
2325 asm ("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS));
2326 vmx->launched = 1;
2327
2328 intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
2329
2330 /* We need to handle NMIs before interrupts are enabled */
2331 if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200) /* nmi */
2332 asm("int $2");
2333}
2334
2335static void vmx_inject_page_fault(struct kvm_vcpu *vcpu,
2336 unsigned long addr,
2337 u32 err_code)
2338{
2339 u32 vect_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
2340
2341 ++vcpu->stat.pf_guest;
2342
2343 if (is_page_fault(vect_info)) {
2344 printk(KERN_DEBUG "inject_page_fault: "
2345 "double fault 0x%lx @ 0x%lx\n",
2346 addr, vmcs_readl(GUEST_RIP));
2347 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, 0);
2348 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
2349 DF_VECTOR |
2350 INTR_TYPE_EXCEPTION |
2351 INTR_INFO_DELIEVER_CODE_MASK |
2352 INTR_INFO_VALID_MASK);
2353 return;
2354 }
2355 vcpu->cr2 = addr;
2356 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, err_code);
2357 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
2358 PF_VECTOR |
2359 INTR_TYPE_EXCEPTION |
2360 INTR_INFO_DELIEVER_CODE_MASK |
2361 INTR_INFO_VALID_MASK);
2362
2363}
2364
2365static void vmx_free_vmcs(struct kvm_vcpu *vcpu)
2366{
2367 struct vcpu_vmx *vmx = to_vmx(vcpu);
2368
2369 if (vmx->vmcs) {
2370 on_each_cpu(__vcpu_clear, vmx, 0, 1);
2371 free_vmcs(vmx->vmcs);
2372 vmx->vmcs = NULL;
2373 }
2374}
2375
2376static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
2377{
2378 struct vcpu_vmx *vmx = to_vmx(vcpu);
2379
2380 vmx_free_vmcs(vcpu);
2381 kfree(vmx->host_msrs);
2382 kfree(vmx->guest_msrs);
2383 kvm_vcpu_uninit(vcpu);
2384 kmem_cache_free(kvm_vcpu_cache, vmx);
2385}
2386
2387static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
2388{
2389 int err;
2390 struct vcpu_vmx *vmx = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
2391 int cpu;
2392
2393 if (!vmx)
2394 return ERR_PTR(-ENOMEM);
2395
2396 err = kvm_vcpu_init(&vmx->vcpu, kvm, id);
2397 if (err)
2398 goto free_vcpu;
2399
2400 if (irqchip_in_kernel(kvm)) {
2401 err = kvm_create_lapic(&vmx->vcpu);
2402 if (err < 0)
2403 goto free_vcpu;
2404 }
2405
2406 vmx->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL);
2407 if (!vmx->guest_msrs) {
2408 err = -ENOMEM;
2409 goto uninit_vcpu;
2410 }
2411
2412 vmx->host_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL);
2413 if (!vmx->host_msrs)
2414 goto free_guest_msrs;
2415
2416 vmx->vmcs = alloc_vmcs();
2417 if (!vmx->vmcs)
2418 goto free_msrs;
2419
2420 vmcs_clear(vmx->vmcs);
2421
2422 cpu = get_cpu();
2423 vmx_vcpu_load(&vmx->vcpu, cpu);
2424 err = vmx_vcpu_setup(vmx);
2425 vmx_vcpu_put(&vmx->vcpu);
2426 put_cpu();
2427 if (err)
2428 goto free_vmcs;
2429
2430 return &vmx->vcpu;
2431
2432free_vmcs:
2433 free_vmcs(vmx->vmcs);
2434free_msrs:
2435 kfree(vmx->host_msrs);
2436free_guest_msrs:
2437 kfree(vmx->guest_msrs);
2438uninit_vcpu:
2439 kvm_vcpu_uninit(&vmx->vcpu);
2440free_vcpu:
2441 kmem_cache_free(kvm_vcpu_cache, vmx);
2442 return ERR_PTR(err);
2443}
2444
2445static void __init vmx_check_processor_compat(void *rtn)
2446{
2447 struct vmcs_config vmcs_conf;
2448
2449 *(int *)rtn = 0;
2450 if (setup_vmcs_config(&vmcs_conf) < 0)
2451 *(int *)rtn = -EIO;
2452 if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config)) != 0) {
2453 printk(KERN_ERR "kvm: CPU %d feature inconsistency!\n",
2454 smp_processor_id());
2455 *(int *)rtn = -EIO;
2456 }
2457}
2458
2459static struct kvm_x86_ops vmx_x86_ops = {
2460 .cpu_has_kvm_support = cpu_has_kvm_support,
2461 .disabled_by_bios = vmx_disabled_by_bios,
2462 .hardware_setup = hardware_setup,
2463 .hardware_unsetup = hardware_unsetup,
2464 .check_processor_compatibility = vmx_check_processor_compat,
2465 .hardware_enable = hardware_enable,
2466 .hardware_disable = hardware_disable,
2467
2468 .vcpu_create = vmx_create_vcpu,
2469 .vcpu_free = vmx_free_vcpu,
2470 .vcpu_reset = vmx_vcpu_reset,
2471
2472 .prepare_guest_switch = vmx_save_host_state,
2473 .vcpu_load = vmx_vcpu_load,
2474 .vcpu_put = vmx_vcpu_put,
2475 .vcpu_decache = vmx_vcpu_decache,
2476
2477 .set_guest_debug = set_guest_debug,
2478 .guest_debug_pre = kvm_guest_debug_pre,
2479 .get_msr = vmx_get_msr,
2480 .set_msr = vmx_set_msr,
2481 .get_segment_base = vmx_get_segment_base,
2482 .get_segment = vmx_get_segment,
2483 .set_segment = vmx_set_segment,
2484 .get_cs_db_l_bits = vmx_get_cs_db_l_bits,
2485 .decache_cr4_guest_bits = vmx_decache_cr4_guest_bits,
2486 .set_cr0 = vmx_set_cr0,
2487 .set_cr3 = vmx_set_cr3,
2488 .set_cr4 = vmx_set_cr4,
2489#ifdef CONFIG_X86_64
2490 .set_efer = vmx_set_efer,
2491#endif
2492 .get_idt = vmx_get_idt,
2493 .set_idt = vmx_set_idt,
2494 .get_gdt = vmx_get_gdt,
2495 .set_gdt = vmx_set_gdt,
2496 .cache_regs = vcpu_load_rsp_rip,
2497 .decache_regs = vcpu_put_rsp_rip,
2498 .get_rflags = vmx_get_rflags,
2499 .set_rflags = vmx_set_rflags,
2500
2501 .tlb_flush = vmx_flush_tlb,
2502 .inject_page_fault = vmx_inject_page_fault,
2503
2504 .inject_gp = vmx_inject_gp,
2505
2506 .run = vmx_vcpu_run,
2507 .handle_exit = kvm_handle_exit,
2508 .skip_emulated_instruction = skip_emulated_instruction,
2509 .patch_hypercall = vmx_patch_hypercall,
2510 .get_irq = vmx_get_irq,
2511 .set_irq = vmx_inject_irq,
2512 .inject_pending_irq = vmx_intr_assist,
2513 .inject_pending_vectors = do_interrupt_requests,
2514};
2515
2516static int __init vmx_init(void)
2517{
2518 void *iova;
2519 int r;
2520
2521 vmx_io_bitmap_a = alloc_page(GFP_KERNEL | __GFP_HIGHMEM);
2522 if (!vmx_io_bitmap_a)
2523 return -ENOMEM;
2524
2525 vmx_io_bitmap_b = alloc_page(GFP_KERNEL | __GFP_HIGHMEM);
2526 if (!vmx_io_bitmap_b) {
2527 r = -ENOMEM;
2528 goto out;
2529 }
2530
2531 /*
2532 * Allow direct access to the PC debug port (it is often used for I/O
2533 * delays, but the vmexits simply slow things down).
2534 */
2535 iova = kmap(vmx_io_bitmap_a);
2536 memset(iova, 0xff, PAGE_SIZE);
2537 clear_bit(0x80, iova);
2538 kunmap(vmx_io_bitmap_a);
2539
2540 iova = kmap(vmx_io_bitmap_b);
2541 memset(iova, 0xff, PAGE_SIZE);
2542 kunmap(vmx_io_bitmap_b);
2543
2544 r = kvm_init_x86(&vmx_x86_ops, sizeof(struct vcpu_vmx), THIS_MODULE);
2545 if (r)
2546 goto out1;
2547
2548 return 0;
2549
2550out1:
2551 __free_page(vmx_io_bitmap_b);
2552out:
2553 __free_page(vmx_io_bitmap_a);
2554 return r;
2555}
2556
2557static void __exit vmx_exit(void)
2558{
2559 __free_page(vmx_io_bitmap_b);
2560 __free_page(vmx_io_bitmap_a);
2561
2562 kvm_exit_x86();
2563}
2564
2565module_init(vmx_init)
2566module_exit(vmx_exit)
diff --git a/drivers/kvm/vmx.h b/drivers/kvm/vmx.h
deleted file mode 100644
index fd4e14666088..000000000000
--- a/drivers/kvm/vmx.h
+++ /dev/null
@@ -1,310 +0,0 @@
1#ifndef VMX_H
2#define VMX_H
3
4/*
5 * vmx.h: VMX Architecture related definitions
6 * Copyright (c) 2004, Intel Corporation.
7 *
8 * This program is free software; you can redistribute it and/or modify it
9 * under the terms and conditions of the GNU General Public License,
10 * version 2, as published by the Free Software Foundation.
11 *
12 * This program is distributed in the hope it will be useful, but WITHOUT
13 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
15 * more details.
16 *
17 * You should have received a copy of the GNU General Public License along with
18 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
19 * Place - Suite 330, Boston, MA 02111-1307 USA.
20 *
21 * A few random additions are:
22 * Copyright (C) 2006 Qumranet
23 * Avi Kivity <avi@qumranet.com>
24 * Yaniv Kamay <yaniv@qumranet.com>
25 *
26 */
27
28#define CPU_BASED_VIRTUAL_INTR_PENDING 0x00000004
29#define CPU_BASED_USE_TSC_OFFSETING 0x00000008
30#define CPU_BASED_HLT_EXITING 0x00000080
31#define CPU_BASED_INVLPG_EXITING 0x00000200
32#define CPU_BASED_MWAIT_EXITING 0x00000400
33#define CPU_BASED_RDPMC_EXITING 0x00000800
34#define CPU_BASED_RDTSC_EXITING 0x00001000
35#define CPU_BASED_CR8_LOAD_EXITING 0x00080000
36#define CPU_BASED_CR8_STORE_EXITING 0x00100000
37#define CPU_BASED_TPR_SHADOW 0x00200000
38#define CPU_BASED_MOV_DR_EXITING 0x00800000
39#define CPU_BASED_UNCOND_IO_EXITING 0x01000000
40#define CPU_BASED_USE_IO_BITMAPS 0x02000000
41#define CPU_BASED_USE_MSR_BITMAPS 0x10000000
42#define CPU_BASED_MONITOR_EXITING 0x20000000
43#define CPU_BASED_PAUSE_EXITING 0x40000000
44#define CPU_BASED_ACTIVATE_SECONDARY_CONTROLS 0x80000000
45
46#define PIN_BASED_EXT_INTR_MASK 0x00000001
47#define PIN_BASED_NMI_EXITING 0x00000008
48#define PIN_BASED_VIRTUAL_NMIS 0x00000020
49
50#define VM_EXIT_HOST_ADDR_SPACE_SIZE 0x00000200
51#define VM_EXIT_ACK_INTR_ON_EXIT 0x00008000
52
53#define VM_ENTRY_IA32E_MODE 0x00000200
54#define VM_ENTRY_SMM 0x00000400
55#define VM_ENTRY_DEACT_DUAL_MONITOR 0x00000800
56
57#define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001
58
59/* VMCS Encodings */
60enum vmcs_field {
61 GUEST_ES_SELECTOR = 0x00000800,
62 GUEST_CS_SELECTOR = 0x00000802,
63 GUEST_SS_SELECTOR = 0x00000804,
64 GUEST_DS_SELECTOR = 0x00000806,
65 GUEST_FS_SELECTOR = 0x00000808,
66 GUEST_GS_SELECTOR = 0x0000080a,
67 GUEST_LDTR_SELECTOR = 0x0000080c,
68 GUEST_TR_SELECTOR = 0x0000080e,
69 HOST_ES_SELECTOR = 0x00000c00,
70 HOST_CS_SELECTOR = 0x00000c02,
71 HOST_SS_SELECTOR = 0x00000c04,
72 HOST_DS_SELECTOR = 0x00000c06,
73 HOST_FS_SELECTOR = 0x00000c08,
74 HOST_GS_SELECTOR = 0x00000c0a,
75 HOST_TR_SELECTOR = 0x00000c0c,
76 IO_BITMAP_A = 0x00002000,
77 IO_BITMAP_A_HIGH = 0x00002001,
78 IO_BITMAP_B = 0x00002002,
79 IO_BITMAP_B_HIGH = 0x00002003,
80 MSR_BITMAP = 0x00002004,
81 MSR_BITMAP_HIGH = 0x00002005,
82 VM_EXIT_MSR_STORE_ADDR = 0x00002006,
83 VM_EXIT_MSR_STORE_ADDR_HIGH = 0x00002007,
84 VM_EXIT_MSR_LOAD_ADDR = 0x00002008,
85 VM_EXIT_MSR_LOAD_ADDR_HIGH = 0x00002009,
86 VM_ENTRY_MSR_LOAD_ADDR = 0x0000200a,
87 VM_ENTRY_MSR_LOAD_ADDR_HIGH = 0x0000200b,
88 TSC_OFFSET = 0x00002010,
89 TSC_OFFSET_HIGH = 0x00002011,
90 VIRTUAL_APIC_PAGE_ADDR = 0x00002012,
91 VIRTUAL_APIC_PAGE_ADDR_HIGH = 0x00002013,
92 VMCS_LINK_POINTER = 0x00002800,
93 VMCS_LINK_POINTER_HIGH = 0x00002801,
94 GUEST_IA32_DEBUGCTL = 0x00002802,
95 GUEST_IA32_DEBUGCTL_HIGH = 0x00002803,
96 PIN_BASED_VM_EXEC_CONTROL = 0x00004000,
97 CPU_BASED_VM_EXEC_CONTROL = 0x00004002,
98 EXCEPTION_BITMAP = 0x00004004,
99 PAGE_FAULT_ERROR_CODE_MASK = 0x00004006,
100 PAGE_FAULT_ERROR_CODE_MATCH = 0x00004008,
101 CR3_TARGET_COUNT = 0x0000400a,
102 VM_EXIT_CONTROLS = 0x0000400c,
103 VM_EXIT_MSR_STORE_COUNT = 0x0000400e,
104 VM_EXIT_MSR_LOAD_COUNT = 0x00004010,
105 VM_ENTRY_CONTROLS = 0x00004012,
106 VM_ENTRY_MSR_LOAD_COUNT = 0x00004014,
107 VM_ENTRY_INTR_INFO_FIELD = 0x00004016,
108 VM_ENTRY_EXCEPTION_ERROR_CODE = 0x00004018,
109 VM_ENTRY_INSTRUCTION_LEN = 0x0000401a,
110 TPR_THRESHOLD = 0x0000401c,
111 SECONDARY_VM_EXEC_CONTROL = 0x0000401e,
112 VM_INSTRUCTION_ERROR = 0x00004400,
113 VM_EXIT_REASON = 0x00004402,
114 VM_EXIT_INTR_INFO = 0x00004404,
115 VM_EXIT_INTR_ERROR_CODE = 0x00004406,
116 IDT_VECTORING_INFO_FIELD = 0x00004408,
117 IDT_VECTORING_ERROR_CODE = 0x0000440a,
118 VM_EXIT_INSTRUCTION_LEN = 0x0000440c,
119 VMX_INSTRUCTION_INFO = 0x0000440e,
120 GUEST_ES_LIMIT = 0x00004800,
121 GUEST_CS_LIMIT = 0x00004802,
122 GUEST_SS_LIMIT = 0x00004804,
123 GUEST_DS_LIMIT = 0x00004806,
124 GUEST_FS_LIMIT = 0x00004808,
125 GUEST_GS_LIMIT = 0x0000480a,
126 GUEST_LDTR_LIMIT = 0x0000480c,
127 GUEST_TR_LIMIT = 0x0000480e,
128 GUEST_GDTR_LIMIT = 0x00004810,
129 GUEST_IDTR_LIMIT = 0x00004812,
130 GUEST_ES_AR_BYTES = 0x00004814,
131 GUEST_CS_AR_BYTES = 0x00004816,
132 GUEST_SS_AR_BYTES = 0x00004818,
133 GUEST_DS_AR_BYTES = 0x0000481a,
134 GUEST_FS_AR_BYTES = 0x0000481c,
135 GUEST_GS_AR_BYTES = 0x0000481e,
136 GUEST_LDTR_AR_BYTES = 0x00004820,
137 GUEST_TR_AR_BYTES = 0x00004822,
138 GUEST_INTERRUPTIBILITY_INFO = 0x00004824,
139	GUEST_ACTIVITY_STATE		= 0x00004826,
140 GUEST_SYSENTER_CS = 0x0000482A,
141 HOST_IA32_SYSENTER_CS = 0x00004c00,
142 CR0_GUEST_HOST_MASK = 0x00006000,
143 CR4_GUEST_HOST_MASK = 0x00006002,
144 CR0_READ_SHADOW = 0x00006004,
145 CR4_READ_SHADOW = 0x00006006,
146 CR3_TARGET_VALUE0 = 0x00006008,
147 CR3_TARGET_VALUE1 = 0x0000600a,
148 CR3_TARGET_VALUE2 = 0x0000600c,
149 CR3_TARGET_VALUE3 = 0x0000600e,
150 EXIT_QUALIFICATION = 0x00006400,
151 GUEST_LINEAR_ADDRESS = 0x0000640a,
152 GUEST_CR0 = 0x00006800,
153 GUEST_CR3 = 0x00006802,
154 GUEST_CR4 = 0x00006804,
155 GUEST_ES_BASE = 0x00006806,
156 GUEST_CS_BASE = 0x00006808,
157 GUEST_SS_BASE = 0x0000680a,
158 GUEST_DS_BASE = 0x0000680c,
159 GUEST_FS_BASE = 0x0000680e,
160 GUEST_GS_BASE = 0x00006810,
161 GUEST_LDTR_BASE = 0x00006812,
162 GUEST_TR_BASE = 0x00006814,
163 GUEST_GDTR_BASE = 0x00006816,
164 GUEST_IDTR_BASE = 0x00006818,
165 GUEST_DR7 = 0x0000681a,
166 GUEST_RSP = 0x0000681c,
167 GUEST_RIP = 0x0000681e,
168 GUEST_RFLAGS = 0x00006820,
169 GUEST_PENDING_DBG_EXCEPTIONS = 0x00006822,
170 GUEST_SYSENTER_ESP = 0x00006824,
171 GUEST_SYSENTER_EIP = 0x00006826,
172 HOST_CR0 = 0x00006c00,
173 HOST_CR3 = 0x00006c02,
174 HOST_CR4 = 0x00006c04,
175 HOST_FS_BASE = 0x00006c06,
176 HOST_GS_BASE = 0x00006c08,
177 HOST_TR_BASE = 0x00006c0a,
178 HOST_GDTR_BASE = 0x00006c0c,
179 HOST_IDTR_BASE = 0x00006c0e,
180 HOST_IA32_SYSENTER_ESP = 0x00006c10,
181 HOST_IA32_SYSENTER_EIP = 0x00006c12,
182 HOST_RSP = 0x00006c14,
183 HOST_RIP = 0x00006c16,
184};
185
186#define VMX_EXIT_REASONS_FAILED_VMENTRY 0x80000000
187
188#define EXIT_REASON_EXCEPTION_NMI 0
189#define EXIT_REASON_EXTERNAL_INTERRUPT 1
190#define EXIT_REASON_TRIPLE_FAULT 2
191
192#define EXIT_REASON_PENDING_INTERRUPT 7
193
194#define EXIT_REASON_TASK_SWITCH 9
195#define EXIT_REASON_CPUID 10
196#define EXIT_REASON_HLT 12
197#define EXIT_REASON_INVLPG 14
198#define EXIT_REASON_RDPMC 15
199#define EXIT_REASON_RDTSC 16
200#define EXIT_REASON_VMCALL 18
201#define EXIT_REASON_VMCLEAR 19
202#define EXIT_REASON_VMLAUNCH 20
203#define EXIT_REASON_VMPTRLD 21
204#define EXIT_REASON_VMPTRST 22
205#define EXIT_REASON_VMREAD 23
206#define EXIT_REASON_VMRESUME 24
207#define EXIT_REASON_VMWRITE 25
208#define EXIT_REASON_VMOFF 26
209#define EXIT_REASON_VMON 27
210#define EXIT_REASON_CR_ACCESS 28
211#define EXIT_REASON_DR_ACCESS 29
212#define EXIT_REASON_IO_INSTRUCTION 30
213#define EXIT_REASON_MSR_READ 31
214#define EXIT_REASON_MSR_WRITE 32
215#define EXIT_REASON_MWAIT_INSTRUCTION 36
216#define EXIT_REASON_TPR_BELOW_THRESHOLD 43
217
218/*
219 * Interruption-information format
220 */
221#define INTR_INFO_VECTOR_MASK 0xff /* 7:0 */
222#define INTR_INFO_INTR_TYPE_MASK 0x700 /* 10:8 */
223#define INTR_INFO_DELIEVER_CODE_MASK 0x800 /* 11 */
224#define INTR_INFO_VALID_MASK 0x80000000 /* 31 */
225
226#define VECTORING_INFO_VECTOR_MASK INTR_INFO_VECTOR_MASK
227#define VECTORING_INFO_TYPE_MASK INTR_INFO_INTR_TYPE_MASK
228#define VECTORING_INFO_DELIEVER_CODE_MASK INTR_INFO_DELIEVER_CODE_MASK
229#define VECTORING_INFO_VALID_MASK INTR_INFO_VALID_MASK
230
231#define INTR_TYPE_EXT_INTR (0 << 8) /* external interrupt */
232#define INTR_TYPE_EXCEPTION (3 << 8) /* processor exception */
233
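
/*
 * Worked example, matching vmx_inject_irq() in vmx.c: injecting external
 * interrupt vector 32 writes
 *
 *	32 | INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK
 *	  = 0x20 | (0 << 8) | 0x80000000 = 0x80000020
 *
 * to VM_ENTRY_INTR_INFO_FIELD: bits 7:0 carry the vector, bits 10:8 the
 * type and bit 31 marks the field as valid.
 */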
234/*
235 * Exit Qualifications for MOV for Control Register Access
236 */
237#define CONTROL_REG_ACCESS_NUM 0x7 /* 2:0, number of control register */
238#define CONTROL_REG_ACCESS_TYPE 0x30 /* 5:4, access type */
239#define CONTROL_REG_ACCESS_REG 0xf00 /* 10:8, general purpose register */
240#define LMSW_SOURCE_DATA_SHIFT 16
241#define LMSW_SOURCE_DATA (0xFFFF << LMSW_SOURCE_DATA_SHIFT) /* 16:31 lmsw source */
242#define REG_EAX (0 << 8)
243#define REG_ECX (1 << 8)
244#define REG_EDX (2 << 8)
245#define REG_EBX (3 << 8)
246#define REG_ESP (4 << 8)
247#define REG_EBP (5 << 8)
248#define REG_ESI (6 << 8)
249#define REG_EDI (7 << 8)
250#define REG_R8 (8 << 8)
251#define REG_R9 (9 << 8)
252#define REG_R10 (10 << 8)
253#define REG_R11 (11 << 8)
254#define REG_R12 (12 << 8)
255#define REG_R13 (13 << 8)
256#define REG_R14 (14 << 8)
257#define REG_R15 (15 << 8)
258
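/*
 * Worked example, matching handle_cr() in vmx.c: a guest "mov %ebx, %cr3"
 * exits with an exit qualification of 3 | (0 << 4) | REG_EBX = 0x303,
 * where CONTROL_REG_ACCESS_NUM selects cr3, access type 0 means
 * "mov to cr" and CONTROL_REG_ACCESS_REG selects %ebx as the source.
 */
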
259/*
260 * Exit Qualifications for MOV for Debug Register Access
261 */
262#define DEBUG_REG_ACCESS_NUM 0x7 /* 2:0, number of debug register */
263#define DEBUG_REG_ACCESS_TYPE 0x10 /* 4, direction of access */
264#define TYPE_MOV_TO_DR (0 << 4)
265#define TYPE_MOV_FROM_DR (1 << 4)
266#define DEBUG_REG_ACCESS_REG 0xf00 /* 11:8, general purpose register */
267
268
269/* segment AR */
270#define SEGMENT_AR_L_MASK (1 << 13)
271
272#define AR_TYPE_ACCESSES_MASK 1
273#define AR_TYPE_READABLE_MASK (1 << 1)
274#define AR_TYPE_WRITEABLE_MASK (1 << 2)
275#define AR_TYPE_CODE_MASK (1 << 3)
276#define AR_TYPE_MASK 0x0f
277#define AR_TYPE_BUSY_64_TSS 11
278#define AR_TYPE_BUSY_32_TSS 11
279#define AR_TYPE_BUSY_16_TSS 3
280#define AR_TYPE_LDT 2
281
282#define AR_UNUSABLE_MASK (1 << 16)
283#define AR_S_MASK (1 << 4)
284#define AR_P_MASK (1 << 7)
285#define AR_L_MASK (1 << 13)
286#define AR_DB_MASK (1 << 14)
287#define AR_G_MASK (1 << 15)
288#define AR_DPL_SHIFT 5
289#define AR_DPL(ar) (((ar) >> AR_DPL_SHIFT) & 3)
290
291#define AR_RESERVD_MASK 0xfffe0f00
292
293#define MSR_IA32_VMX_BASIC 0x480
294#define MSR_IA32_VMX_PINBASED_CTLS 0x481
295#define MSR_IA32_VMX_PROCBASED_CTLS 0x482
296#define MSR_IA32_VMX_EXIT_CTLS 0x483
297#define MSR_IA32_VMX_ENTRY_CTLS 0x484
298#define MSR_IA32_VMX_MISC 0x485
299#define MSR_IA32_VMX_CR0_FIXED0 0x486
300#define MSR_IA32_VMX_CR0_FIXED1 0x487
301#define MSR_IA32_VMX_CR4_FIXED0 0x488
302#define MSR_IA32_VMX_CR4_FIXED1 0x489
303#define MSR_IA32_VMX_VMCS_ENUM 0x48a
304#define MSR_IA32_VMX_PROCBASED_CTLS2 0x48b
305
306#define MSR_IA32_FEATURE_CONTROL 0x3a
307#define MSR_IA32_FEATURE_CONTROL_LOCKED 0x1
308#define MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED 0x4
309
310#endif
diff --git a/drivers/kvm/x86_emulate.c b/drivers/kvm/x86_emulate.c
deleted file mode 100644
index bd46de6bf891..000000000000
--- a/drivers/kvm/x86_emulate.c
+++ /dev/null
@@ -1,1662 +0,0 @@
1/******************************************************************************
2 * x86_emulate.c
3 *
4 * Generic x86 (32-bit and 64-bit) instruction decoder and emulator.
5 *
6 * Copyright (c) 2005 Keir Fraser
7 *
8 * Linux coding style, mod r/m decoder, segment base fixes, real-mode
9 * privileged instructions:
10 *
11 * Copyright (C) 2006 Qumranet
12 *
13 * Avi Kivity <avi@qumranet.com>
14 * Yaniv Kamay <yaniv@qumranet.com>
15 *
16 * This work is licensed under the terms of the GNU GPL, version 2. See
17 * the COPYING file in the top-level directory.
18 *
19 * From: xen-unstable 10676:af9809f51f81a3c43f276f00c81a52ef558afda4
20 */
21
22#ifndef __KERNEL__
23#include <stdio.h>
24#include <stdint.h>
25#include <public/xen.h>
26#define DPRINTF(_f, _a ...) printf( _f , ## _a )
27#else
28#include "kvm.h"
29#define DPRINTF(x...) do {} while (0)
30#endif
31#include "x86_emulate.h"
32#include <linux/module.h>
33
34/*
35 * Opcode effective-address decode tables.
36 * Note that we only emulate instructions that have at least one memory
37 * operand (excluding implicit stack references). We assume that stack
38 * references and instruction fetches will never occur in special memory
39 * areas that require emulation. So, for example, 'mov <imm>,<reg>' need
40 * not be handled.
41 */
42
43/* Operand sizes: 8-bit operands or specified/overridden size. */
44#define ByteOp (1<<0) /* 8-bit operands. */
45/* Destination operand type. */
46#define ImplicitOps (1<<1) /* Implicit in opcode. No generic decode. */
47#define DstReg (2<<1) /* Register operand. */
48#define DstMem (3<<1) /* Memory operand. */
49#define DstMask (3<<1)
50/* Source operand type. */
51#define SrcNone (0<<3) /* No source operand. */
52#define SrcImplicit (0<<3) /* Source operand is implicit in the opcode. */
53#define SrcReg (1<<3) /* Register operand. */
54#define SrcMem (2<<3) /* Memory operand. */
55#define SrcMem16 (3<<3) /* Memory operand (16-bit). */
56#define SrcMem32 (4<<3) /* Memory operand (32-bit). */
57#define SrcImm (5<<3) /* Immediate operand. */
58#define SrcImmByte (6<<3) /* 8-bit sign-extended immediate operand. */
59#define SrcMask (7<<3)
60/* Generic ModRM decode. */
61#define ModRM (1<<6)
62/* Destination is only written; never read. */
63#define Mov (1<<7)
64#define BitOp (1<<8)
65
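/*
 * Reading the tables below: the entry for opcode 0x88 (mov r/m8, reg8)
 * is ByteOp | DstMem | SrcReg | ModRM | Mov, i.e. an 8-bit instruction
 * whose destination is a memory operand decoded from the ModRM byte,
 * whose source is a register, and whose destination is write-only.
 */
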
66static u8 opcode_table[256] = {
67 /* 0x00 - 0x07 */
68 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
69 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
70 0, 0, 0, 0,
71 /* 0x08 - 0x0F */
72 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
73 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
74 0, 0, 0, 0,
75 /* 0x10 - 0x17 */
76 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
77 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
78 0, 0, 0, 0,
79 /* 0x18 - 0x1F */
80 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
81 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
82 0, 0, 0, 0,
83 /* 0x20 - 0x27 */
84 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
85 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
86 SrcImmByte, SrcImm, 0, 0,
87 /* 0x28 - 0x2F */
88 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
89 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
90 0, 0, 0, 0,
91 /* 0x30 - 0x37 */
92 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
93 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
94 0, 0, 0, 0,
95 /* 0x38 - 0x3F */
96 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
97 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
98 0, 0, 0, 0,
99 /* 0x40 - 0x4F */
100 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
101 /* 0x50 - 0x57 */
102 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
103 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
104 /* 0x58 - 0x5F */
105 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
106 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
107 /* 0x60 - 0x67 */
108 0, 0, 0, DstReg | SrcMem32 | ModRM | Mov /* movsxd (x86/64) */ ,
109 0, 0, 0, 0,
110 /* 0x68 - 0x6F */
111 0, 0, ImplicitOps|Mov, 0,
112 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* insb, insw/insd */
113 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* outsb, outsw/outsd */
114 /* 0x70 - 0x77 */
115 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
116 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
117 /* 0x78 - 0x7F */
118 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
119 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
120 /* 0x80 - 0x87 */
121 ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM,
122 ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM,
123 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
124 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
125 /* 0x88 - 0x8F */
126 ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov,
127 ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
128 0, ModRM | DstReg, 0, DstMem | SrcNone | ModRM | Mov,
129 /* 0x90 - 0x9F */
130 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps, ImplicitOps, 0, 0,
131 /* 0xA0 - 0xA7 */
132 ByteOp | DstReg | SrcMem | Mov, DstReg | SrcMem | Mov,
133 ByteOp | DstMem | SrcReg | Mov, DstMem | SrcReg | Mov,
134 ByteOp | ImplicitOps | Mov, ImplicitOps | Mov,
135 ByteOp | ImplicitOps, ImplicitOps,
136 /* 0xA8 - 0xAF */
137 0, 0, ByteOp | ImplicitOps | Mov, ImplicitOps | Mov,
138 ByteOp | ImplicitOps | Mov, ImplicitOps | Mov,
139 ByteOp | ImplicitOps, ImplicitOps,
140 /* 0xB0 - 0xBF */
141 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
142 /* 0xC0 - 0xC7 */
143 ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM,
144 0, ImplicitOps, 0, 0,
145 ByteOp | DstMem | SrcImm | ModRM | Mov, DstMem | SrcImm | ModRM | Mov,
146 /* 0xC8 - 0xCF */
147 0, 0, 0, 0, 0, 0, 0, 0,
148 /* 0xD0 - 0xD7 */
149 ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM,
150 ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM,
151 0, 0, 0, 0,
152 /* 0xD8 - 0xDF */
153 0, 0, 0, 0, 0, 0, 0, 0,
154 /* 0xE0 - 0xE7 */
155 0, 0, 0, 0, 0, 0, 0, 0,
156 /* 0xE8 - 0xEF */
157 ImplicitOps, SrcImm|ImplicitOps, 0, SrcImmByte|ImplicitOps, 0, 0, 0, 0,
158 /* 0xF0 - 0xF7 */
159 0, 0, 0, 0,
160 ImplicitOps, 0,
161 ByteOp | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM,
162 /* 0xF8 - 0xFF */
163 0, 0, 0, 0,
164 0, 0, ByteOp | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM
165};
166
167static u16 twobyte_table[256] = {
168 /* 0x00 - 0x0F */
169 0, SrcMem | ModRM | DstReg, 0, 0, 0, 0, ImplicitOps, 0,
170 ImplicitOps, ImplicitOps, 0, 0, 0, ImplicitOps | ModRM, 0, 0,
171 /* 0x10 - 0x1F */
172 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, 0,
173 /* 0x20 - 0x2F */
174 ModRM | ImplicitOps, ModRM, ModRM | ImplicitOps, ModRM, 0, 0, 0, 0,
175 0, 0, 0, 0, 0, 0, 0, 0,
176 /* 0x30 - 0x3F */
177 ImplicitOps, 0, ImplicitOps, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
178 /* 0x40 - 0x47 */
179 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
180 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
181 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
182 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
183 /* 0x48 - 0x4F */
184 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
185 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
186 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
187 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
188 /* 0x50 - 0x5F */
189 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
190 /* 0x60 - 0x6F */
191 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
192 /* 0x70 - 0x7F */
193 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
194 /* 0x80 - 0x8F */
195 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
196 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
197 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
198 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
199 /* 0x90 - 0x9F */
200 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
201 /* 0xA0 - 0xA7 */
202 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 0, 0, 0, 0,
203 /* 0xA8 - 0xAF */
204 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 0, 0, 0, 0,
205 /* 0xB0 - 0xB7 */
206 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 0,
207 DstMem | SrcReg | ModRM | BitOp,
208 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov,
209 DstReg | SrcMem16 | ModRM | Mov,
210 /* 0xB8 - 0xBF */
211 0, 0, DstMem | SrcImmByte | ModRM, DstMem | SrcReg | ModRM | BitOp,
212 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov,
213 DstReg | SrcMem16 | ModRM | Mov,
214 /* 0xC0 - 0xCF */
215 0, 0, 0, DstMem | SrcReg | ModRM | Mov, 0, 0, 0, ImplicitOps | ModRM,
216 0, 0, 0, 0, 0, 0, 0, 0,
217 /* 0xD0 - 0xDF */
218 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
219 /* 0xE0 - 0xEF */
220 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
221 /* 0xF0 - 0xFF */
222 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
223};
224
225/* Type, address-of, and value of an instruction's operand. */
226struct operand {
227 enum { OP_REG, OP_MEM, OP_IMM } type;
228 unsigned int bytes;
229 unsigned long val, orig_val, *ptr;
230};
231
232/* EFLAGS bit definitions. */
233#define EFLG_OF (1<<11)
234#define EFLG_DF (1<<10)
235#define EFLG_SF (1<<7)
236#define EFLG_ZF (1<<6)
237#define EFLG_AF (1<<4)
238#define EFLG_PF (1<<2)
239#define EFLG_CF (1<<0)
240
241/*
242 * Instruction emulation:
243 * Most instructions are emulated directly via a fragment of inline assembly
244 * code. This allows us to save/restore EFLAGS and thus very easily pick up
245 * any modified flags.
246 */
247
248#if defined(CONFIG_X86_64)
249#define _LO32 "k" /* force 32-bit operand */
250#define _STK "%%rsp" /* stack pointer */
251#elif defined(__i386__)
252#define _LO32 "" /* force 32-bit operand */
253#define _STK "%%esp" /* stack pointer */
254#endif
255
256/*
257 * These EFLAGS bits are restored from saved value during emulation, and
258 * any changes are written back to the saved value after emulation.
259 */
260#define EFLAGS_MASK (EFLG_OF|EFLG_SF|EFLG_ZF|EFLG_AF|EFLG_PF|EFLG_CF)
261
262/* Before executing instruction: restore necessary bits in EFLAGS. */
263#define _PRE_EFLAGS(_sav, _msk, _tmp) \
264 /* EFLAGS = (_sav & _msk) | (EFLAGS & ~_msk); */ \
265 "push %"_sav"; " \
266 "movl %"_msk",%"_LO32 _tmp"; " \
267 "andl %"_LO32 _tmp",("_STK"); " \
268 "pushf; " \
269 "notl %"_LO32 _tmp"; " \
270 "andl %"_LO32 _tmp",("_STK"); " \
271 "pop %"_tmp"; " \
272 "orl %"_LO32 _tmp",("_STK"); " \
273 "popf; " \
274 /* _sav &= ~msk; */ \
275 "movl %"_msk",%"_LO32 _tmp"; " \
276 "notl %"_LO32 _tmp"; " \
277 "andl %"_LO32 _tmp",%"_sav"; "
278
279/* After executing instruction: write-back necessary bits in EFLAGS. */
280#define _POST_EFLAGS(_sav, _msk, _tmp) \
281 /* _sav |= EFLAGS & _msk; */ \
282 "pushf; " \
283 "pop %"_tmp"; " \
284 "andl %"_msk",%"_LO32 _tmp"; " \
285 "orl %"_LO32 _tmp",%"_sav"; "
286
287/* Raw emulation: instruction has two explicit operands. */
288#define __emulate_2op_nobyte(_op,_src,_dst,_eflags,_wx,_wy,_lx,_ly,_qx,_qy) \
289 do { \
290 unsigned long _tmp; \
291 \
292 switch ((_dst).bytes) { \
293 case 2: \
294 __asm__ __volatile__ ( \
295 _PRE_EFLAGS("0","4","2") \
296 _op"w %"_wx"3,%1; " \
297 _POST_EFLAGS("0","4","2") \
298 : "=m" (_eflags), "=m" ((_dst).val), \
299 "=&r" (_tmp) \
300 : _wy ((_src).val), "i" (EFLAGS_MASK) ); \
301 break; \
302 case 4: \
303 __asm__ __volatile__ ( \
304 _PRE_EFLAGS("0","4","2") \
305 _op"l %"_lx"3,%1; " \
306 _POST_EFLAGS("0","4","2") \
307 : "=m" (_eflags), "=m" ((_dst).val), \
308 "=&r" (_tmp) \
309 : _ly ((_src).val), "i" (EFLAGS_MASK) ); \
310 break; \
311 case 8: \
312 __emulate_2op_8byte(_op, _src, _dst, \
313 _eflags, _qx, _qy); \
314 break; \
315 } \
316 } while (0)
317
318#define __emulate_2op(_op,_src,_dst,_eflags,_bx,_by,_wx,_wy,_lx,_ly,_qx,_qy) \
319 do { \
320 unsigned long _tmp; \
321 switch ( (_dst).bytes ) \
322 { \
323 case 1: \
324 __asm__ __volatile__ ( \
325 _PRE_EFLAGS("0","4","2") \
326 _op"b %"_bx"3,%1; " \
327 _POST_EFLAGS("0","4","2") \
328 : "=m" (_eflags), "=m" ((_dst).val), \
329 "=&r" (_tmp) \
330 : _by ((_src).val), "i" (EFLAGS_MASK) ); \
331 break; \
332 default: \
333 __emulate_2op_nobyte(_op, _src, _dst, _eflags, \
334 _wx, _wy, _lx, _ly, _qx, _qy); \
335 break; \
336 } \
337 } while (0)
338
339/* Source operand is byte-sized and may be restricted to just %cl. */
340#define emulate_2op_SrcB(_op, _src, _dst, _eflags) \
341 __emulate_2op(_op, _src, _dst, _eflags, \
342 "b", "c", "b", "c", "b", "c", "b", "c")
343
344/* Source operand is byte, word, long or quad sized. */
345#define emulate_2op_SrcV(_op, _src, _dst, _eflags) \
346 __emulate_2op(_op, _src, _dst, _eflags, \
347 "b", "q", "w", "r", _LO32, "r", "", "r")
348
349/* Source operand is word, long or quad sized. */
350#define emulate_2op_SrcV_nobyte(_op, _src, _dst, _eflags) \
351 __emulate_2op_nobyte(_op, _src, _dst, _eflags, \
352 "w", "r", _LO32, "r", "", "r")
353
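/*
 * Usage sketch (operand names as decoded later in this file): a
 * two-operand ALU opcode can be emulated by handing the mnemonic and the
 * decoded operands to one of these macros, e.g.
 *
 *	emulate_2op_SrcV("add", src, dst, _eflags);
 *
 * which runs a real "add" on the shadow operands while saving and
 * restoring the guest's arithmetic flags through EFLAGS_MASK.
 */
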
354/* Instruction has only one explicit operand (no source operand). */
355#define emulate_1op(_op, _dst, _eflags) \
356 do { \
357 unsigned long _tmp; \
358 \
359 switch ( (_dst).bytes ) \
360 { \
361 case 1: \
362 __asm__ __volatile__ ( \
363 _PRE_EFLAGS("0","3","2") \
364 _op"b %1; " \
365 _POST_EFLAGS("0","3","2") \
366 : "=m" (_eflags), "=m" ((_dst).val), \
367 "=&r" (_tmp) \
368 : "i" (EFLAGS_MASK) ); \
369 break; \
370 case 2: \
371 __asm__ __volatile__ ( \
372 _PRE_EFLAGS("0","3","2") \
373 _op"w %1; " \
374 _POST_EFLAGS("0","3","2") \
375 : "=m" (_eflags), "=m" ((_dst).val), \
376 "=&r" (_tmp) \
377 : "i" (EFLAGS_MASK) ); \
378 break; \
379 case 4: \
380 __asm__ __volatile__ ( \
381 _PRE_EFLAGS("0","3","2") \
382 _op"l %1; " \
383 _POST_EFLAGS("0","3","2") \
384 : "=m" (_eflags), "=m" ((_dst).val), \
385 "=&r" (_tmp) \
386 : "i" (EFLAGS_MASK) ); \
387 break; \
388 case 8: \
389 __emulate_1op_8byte(_op, _dst, _eflags); \
390 break; \
391 } \
392 } while (0)
393
394/* Emulate an instruction with quadword operands (x86/64 only). */
395#if defined(CONFIG_X86_64)
396#define __emulate_2op_8byte(_op, _src, _dst, _eflags, _qx, _qy) \
397 do { \
398 __asm__ __volatile__ ( \
399 _PRE_EFLAGS("0","4","2") \
400 _op"q %"_qx"3,%1; " \
401 _POST_EFLAGS("0","4","2") \
402 : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \
403 : _qy ((_src).val), "i" (EFLAGS_MASK) ); \
404 } while (0)
405
406#define __emulate_1op_8byte(_op, _dst, _eflags) \
407 do { \
408 __asm__ __volatile__ ( \
409 _PRE_EFLAGS("0","3","2") \
410 _op"q %1; " \
411 _POST_EFLAGS("0","3","2") \
412 : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \
413 : "i" (EFLAGS_MASK) ); \
414 } while (0)
415
416#elif defined(__i386__)
417#define __emulate_2op_8byte(_op, _src, _dst, _eflags, _qx, _qy)
418#define __emulate_1op_8byte(_op, _dst, _eflags)
419#endif /* __i386__ */
420
421/* Fetch next part of the instruction being emulated. */
422#define insn_fetch(_type, _size, _eip) \
423({ unsigned long _x; \
424 rc = ops->read_std((unsigned long)(_eip) + ctxt->cs_base, &_x, \
425 (_size), ctxt->vcpu); \
426 if ( rc != 0 ) \
427 goto done; \
428 (_eip) += (_size); \
429 (_type)_x; \
430})
431
432/* Access/update address held in a register, based on addressing mode. */
433#define address_mask(reg) \
434 ((ad_bytes == sizeof(unsigned long)) ? \
435 (reg) : ((reg) & ((1UL << (ad_bytes << 3)) - 1)))
436#define register_address(base, reg) \
437 ((base) + address_mask(reg))
438#define register_address_increment(reg, inc) \
439 do { \
440 /* signed type ensures sign extension to long */ \
441 int _inc = (inc); \
442 if ( ad_bytes == sizeof(unsigned long) ) \
443 (reg) += _inc; \
444 else \
445 (reg) = ((reg) & ~((1UL << (ad_bytes << 3)) - 1)) | \
446 (((reg) + _inc) & ((1UL << (ad_bytes << 3)) - 1)); \
447 } while (0)
448
449#define JMP_REL(rel) \
450 do { \
451 register_address_increment(_eip, rel); \
452 } while (0)
453
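/*
 * Example: with ad_bytes == 2 (16-bit addressing) and
 * _regs[VCPU_REGS_RSP] == 0x1fffe, address_mask() keeps only the low 16
 * bits, so register_address(ctxt->ss_base, _regs[VCPU_REGS_RSP])
 * evaluates to ss_base + 0xfffe and stack accesses wrap within the
 * 64K segment.
 */
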
454/*
455 * Given the 'reg' portion of a ModRM byte, and a register block, return a
456 * pointer into the block that addresses the relevant register.
457 * @highbyte_regs specifies whether to decode AH,CH,DH,BH.
458 */
459static void *decode_register(u8 modrm_reg, unsigned long *regs,
460 int highbyte_regs)
461{
462 void *p;
463
464 p = &regs[modrm_reg];
465 if (highbyte_regs && modrm_reg >= 4 && modrm_reg < 8)
466 p = (unsigned char *)&regs[modrm_reg & 3] + 1;
467 return p;
468}
469
470static int read_descriptor(struct x86_emulate_ctxt *ctxt,
471 struct x86_emulate_ops *ops,
472 void *ptr,
473 u16 *size, unsigned long *address, int op_bytes)
474{
475 int rc;
476
477 if (op_bytes == 2)
478 op_bytes = 3;
479 *address = 0;
480 rc = ops->read_std((unsigned long)ptr, (unsigned long *)size, 2,
481 ctxt->vcpu);
482 if (rc)
483 return rc;
484 rc = ops->read_std((unsigned long)ptr + 2, address, op_bytes,
485 ctxt->vcpu);
486 return rc;
487}
488
489static int test_cc(unsigned int condition, unsigned int flags)
490{
491 int rc = 0;
492
493 switch ((condition & 15) >> 1) {
494 case 0: /* o */
495 rc |= (flags & EFLG_OF);
496 break;
497 case 1: /* b/c/nae */
498 rc |= (flags & EFLG_CF);
499 break;
500 case 2: /* z/e */
501 rc |= (flags & EFLG_ZF);
502 break;
503 case 3: /* be/na */
504 rc |= (flags & (EFLG_CF|EFLG_ZF));
505 break;
506 case 4: /* s */
507 rc |= (flags & EFLG_SF);
508 break;
509 case 5: /* p/pe */
510 rc |= (flags & EFLG_PF);
511 break;
512 case 7: /* le/ng */
513 rc |= (flags & EFLG_ZF);
514 /* fall through */
515 case 6: /* l/nge */
516 rc |= (!(flags & EFLG_SF) != !(flags & EFLG_OF));
517 break;
518 }
519
520 /* Odd condition identifiers (lsb == 1) have inverted sense. */
521 return (!!rc ^ (condition & 1));
522}
523
524int
525x86_emulate_memop(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
526{
527 unsigned d;
528 u8 b, sib, twobyte = 0, rex_prefix = 0;
529 u8 modrm, modrm_mod = 0, modrm_reg = 0, modrm_rm = 0;
530 unsigned long *override_base = NULL;
531 unsigned int op_bytes, ad_bytes, lock_prefix = 0, rep_prefix = 0, i;
532 int rc = 0;
533 struct operand src, dst;
534 unsigned long cr2 = ctxt->cr2;
535 int mode = ctxt->mode;
536 unsigned long modrm_ea;
537 int use_modrm_ea, index_reg = 0, base_reg = 0, scale, rip_relative = 0;
538 int no_wb = 0;
539 u64 msr_data;
540
541 /* Shadow copy of register state. Committed on successful emulation. */
542 unsigned long _regs[NR_VCPU_REGS];
543 unsigned long _eip = ctxt->vcpu->rip, _eflags = ctxt->eflags;
544 unsigned long modrm_val = 0;
545
546 memcpy(_regs, ctxt->vcpu->regs, sizeof _regs);
547
548 switch (mode) {
549 case X86EMUL_MODE_REAL:
550 case X86EMUL_MODE_PROT16:
551 op_bytes = ad_bytes = 2;
552 break;
553 case X86EMUL_MODE_PROT32:
554 op_bytes = ad_bytes = 4;
555 break;
556#ifdef CONFIG_X86_64
557 case X86EMUL_MODE_PROT64:
558 op_bytes = 4;
559 ad_bytes = 8;
560 break;
561#endif
562 default:
563 return -1;
564 }
565
566 /* Legacy prefixes. */
567 for (i = 0; i < 8; i++) {
568 switch (b = insn_fetch(u8, 1, _eip)) {
569 case 0x66: /* operand-size override */
570 op_bytes ^= 6; /* switch between 2/4 bytes */
571 break;
572 case 0x67: /* address-size override */
573 if (mode == X86EMUL_MODE_PROT64)
574 ad_bytes ^= 12; /* switch between 4/8 bytes */
575 else
576 ad_bytes ^= 6; /* switch between 2/4 bytes */
577 break;
578 case 0x2e: /* CS override */
579 override_base = &ctxt->cs_base;
580 break;
581 case 0x3e: /* DS override */
582 override_base = &ctxt->ds_base;
583 break;
584 case 0x26: /* ES override */
585 override_base = &ctxt->es_base;
586 break;
587 case 0x64: /* FS override */
588 override_base = &ctxt->fs_base;
589 break;
590 case 0x65: /* GS override */
591 override_base = &ctxt->gs_base;
592 break;
593 case 0x36: /* SS override */
594 override_base = &ctxt->ss_base;
595 break;
596 case 0xf0: /* LOCK */
597 lock_prefix = 1;
598 break;
599 case 0xf2: /* REPNE/REPNZ */
600 case 0xf3: /* REP/REPE/REPZ */
601 rep_prefix = 1;
602 break;
603 default:
604 goto done_prefixes;
605 }
606 }
607
608done_prefixes:
609
610 /* REX prefix. */
611 if ((mode == X86EMUL_MODE_PROT64) && ((b & 0xf0) == 0x40)) {
612 rex_prefix = b;
613 if (b & 8)
614 op_bytes = 8; /* REX.W */
615 modrm_reg = (b & 4) << 1; /* REX.R */
616 index_reg = (b & 2) << 2; /* REX.X */
617		modrm_rm = base_reg = (b & 1) << 3; /* REX.B */
618 b = insn_fetch(u8, 1, _eip);
619 }
620
621 /* Opcode byte(s). */
622 d = opcode_table[b];
623 if (d == 0) {
624 /* Two-byte opcode? */
625 if (b == 0x0f) {
626 twobyte = 1;
627 b = insn_fetch(u8, 1, _eip);
628 d = twobyte_table[b];
629 }
630
631 /* Unrecognised? */
632 if (d == 0)
633 goto cannot_emulate;
634 }
635
636 /* ModRM and SIB bytes. */
637 if (d & ModRM) {
638 modrm = insn_fetch(u8, 1, _eip);
639 modrm_mod |= (modrm & 0xc0) >> 6;
640 modrm_reg |= (modrm & 0x38) >> 3;
641 modrm_rm |= (modrm & 0x07);
642 modrm_ea = 0;
643 use_modrm_ea = 1;
644
645 if (modrm_mod == 3) {
646 modrm_val = *(unsigned long *)
647 decode_register(modrm_rm, _regs, d & ByteOp);
648 goto modrm_done;
649 }
650
651 if (ad_bytes == 2) {
652 unsigned bx = _regs[VCPU_REGS_RBX];
653 unsigned bp = _regs[VCPU_REGS_RBP];
654 unsigned si = _regs[VCPU_REGS_RSI];
655 unsigned di = _regs[VCPU_REGS_RDI];
656
657 /* 16-bit ModR/M decode. */
658 switch (modrm_mod) {
659 case 0:
660 if (modrm_rm == 6)
661 modrm_ea += insn_fetch(u16, 2, _eip);
662 break;
663 case 1:
664 modrm_ea += insn_fetch(s8, 1, _eip);
665 break;
666 case 2:
667 modrm_ea += insn_fetch(u16, 2, _eip);
668 break;
669 }
670 switch (modrm_rm) {
671 case 0:
672 modrm_ea += bx + si;
673 break;
674 case 1:
675 modrm_ea += bx + di;
676 break;
677 case 2:
678 modrm_ea += bp + si;
679 break;
680 case 3:
681 modrm_ea += bp + di;
682 break;
683 case 4:
684 modrm_ea += si;
685 break;
686 case 5:
687 modrm_ea += di;
688 break;
689 case 6:
690 if (modrm_mod != 0)
691 modrm_ea += bp;
692 break;
693 case 7:
694 modrm_ea += bx;
695 break;
696 }
697 if (modrm_rm == 2 || modrm_rm == 3 ||
698 (modrm_rm == 6 && modrm_mod != 0))
699 if (!override_base)
700 override_base = &ctxt->ss_base;
701 modrm_ea = (u16)modrm_ea;
702 } else {
703 /* 32/64-bit ModR/M decode. */
704 switch (modrm_rm) {
705 case 4:
706 case 12:
707 sib = insn_fetch(u8, 1, _eip);
708 index_reg |= (sib >> 3) & 7;
709 base_reg |= sib & 7;
710 scale = sib >> 6;
711
712 switch (base_reg) {
713 case 5:
714 if (modrm_mod != 0)
715 modrm_ea += _regs[base_reg];
716 else
717 modrm_ea += insn_fetch(s32, 4, _eip);
718 break;
719 default:
720 modrm_ea += _regs[base_reg];
721 }
722 switch (index_reg) {
723 case 4:
724 break;
725 default:
726 modrm_ea += _regs[index_reg] << scale;
727
728 }
729 break;
730 case 5:
731 if (modrm_mod != 0)
732 modrm_ea += _regs[modrm_rm];
733 else if (mode == X86EMUL_MODE_PROT64)
734 rip_relative = 1;
735 break;
736 default:
737 modrm_ea += _regs[modrm_rm];
738 break;
739 }
740 switch (modrm_mod) {
741 case 0:
742 if (modrm_rm == 5)
743 modrm_ea += insn_fetch(s32, 4, _eip);
744 break;
745 case 1:
746 modrm_ea += insn_fetch(s8, 1, _eip);
747 break;
748 case 2:
749 modrm_ea += insn_fetch(s32, 4, _eip);
750 break;
751 }
752 }
753 if (!override_base)
754 override_base = &ctxt->ds_base;
755 if (mode == X86EMUL_MODE_PROT64 &&
756 override_base != &ctxt->fs_base &&
757 override_base != &ctxt->gs_base)
758 override_base = NULL;
759
760 if (override_base)
761 modrm_ea += *override_base;
762
763 if (rip_relative) {
764 modrm_ea += _eip;
765 switch (d & SrcMask) {
766 case SrcImmByte:
767 modrm_ea += 1;
768 break;
769 case SrcImm:
770 if (d & ByteOp)
771 modrm_ea += 1;
772 else
773 if (op_bytes == 8)
774 modrm_ea += 4;
775 else
776 modrm_ea += op_bytes;
777 }
778 }
779 if (ad_bytes != 8)
780 modrm_ea = (u32)modrm_ea;
781 cr2 = modrm_ea;
782 modrm_done:
783 ;
784 }
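	/*
	 * Editor's note -- worked example (not part of the original source):
	 * with ad_bytes == 2, a ModRM byte of 0x46 (mod=1, reg=0, rm=6)
	 * fetches an 8-bit displacement, gives modrm_ea = bp + disp8
	 * truncated to 16 bits, and defaults the segment to SS
	 * (override_base = &ctxt->ss_base) unless a segment prefix was seen.
	 */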
785
786 /*
787 * Decode and fetch the source operand: register, memory
788 * or immediate.
789 */
790 switch (d & SrcMask) {
791 case SrcNone:
792 break;
793 case SrcReg:
794 src.type = OP_REG;
795 if (d & ByteOp) {
796 src.ptr = decode_register(modrm_reg, _regs,
797 (rex_prefix == 0));
798 src.val = src.orig_val = *(u8 *) src.ptr;
799 src.bytes = 1;
800 } else {
801 src.ptr = decode_register(modrm_reg, _regs, 0);
802 switch ((src.bytes = op_bytes)) {
803 case 2:
804 src.val = src.orig_val = *(u16 *) src.ptr;
805 break;
806 case 4:
807 src.val = src.orig_val = *(u32 *) src.ptr;
808 break;
809 case 8:
810 src.val = src.orig_val = *(u64 *) src.ptr;
811 break;
812 }
813 }
814 break;
815 case SrcMem16:
816 src.bytes = 2;
817 goto srcmem_common;
818 case SrcMem32:
819 src.bytes = 4;
820 goto srcmem_common;
821 case SrcMem:
822 src.bytes = (d & ByteOp) ? 1 : op_bytes;
823 /* Don't fetch the address for invlpg: it could be unmapped. */
824 if (twobyte && b == 0x01 && modrm_reg == 7)
825 break;
826 srcmem_common:
827 /*
828 * For instructions with a ModR/M byte, switch to register
829 * access if Mod = 3.
830 */
831 if ((d & ModRM) && modrm_mod == 3) {
832 src.type = OP_REG;
833 break;
834 }
835 src.type = OP_MEM;
836 src.ptr = (unsigned long *)cr2;
837 src.val = 0;
838 if ((rc = ops->read_emulated((unsigned long)src.ptr,
839 &src.val, src.bytes, ctxt->vcpu)) != 0)
840 goto done;
841 src.orig_val = src.val;
842 break;
843 case SrcImm:
844 src.type = OP_IMM;
845 src.ptr = (unsigned long *)_eip;
846 src.bytes = (d & ByteOp) ? 1 : op_bytes;
847 if (src.bytes == 8)
848 src.bytes = 4;
849 /* NB. Immediates are sign-extended as necessary. */
850 switch (src.bytes) {
851 case 1:
852 src.val = insn_fetch(s8, 1, _eip);
853 break;
854 case 2:
855 src.val = insn_fetch(s16, 2, _eip);
856 break;
857 case 4:
858 src.val = insn_fetch(s32, 4, _eip);
859 break;
860 }
861 break;
862 case SrcImmByte:
863 src.type = OP_IMM;
864 src.ptr = (unsigned long *)_eip;
865 src.bytes = 1;
866 src.val = insn_fetch(s8, 1, _eip);
867 break;
868 }
869
870 /* Decode and fetch the destination operand: register or memory. */
871 switch (d & DstMask) {
872 case ImplicitOps:
873 /* Special instructions do their own operand decoding. */
874 goto special_insn;
875 case DstReg:
876 dst.type = OP_REG;
877 if ((d & ByteOp)
878 && !(twobyte && (b == 0xb6 || b == 0xb7))) {
879 dst.ptr = decode_register(modrm_reg, _regs,
880 (rex_prefix == 0));
881 dst.val = *(u8 *) dst.ptr;
882 dst.bytes = 1;
883 } else {
884 dst.ptr = decode_register(modrm_reg, _regs, 0);
885 switch ((dst.bytes = op_bytes)) {
886 case 2:
887 dst.val = *(u16 *)dst.ptr;
888 break;
889 case 4:
890 dst.val = *(u32 *)dst.ptr;
891 break;
892 case 8:
893 dst.val = *(u64 *)dst.ptr;
894 break;
895 }
896 }
897 break;
898 case DstMem:
899 dst.type = OP_MEM;
900 dst.ptr = (unsigned long *)cr2;
901 dst.bytes = (d & ByteOp) ? 1 : op_bytes;
902 dst.val = 0;
903 /*
904 * For instructions with a ModR/M byte, switch to register
905 * access if Mod = 3.
906 */
907 if ((d & ModRM) && modrm_mod == 3) {
908 dst.type = OP_REG;
909 break;
910 }
911 if (d & BitOp) {
912 unsigned long mask = ~(dst.bytes * 8 - 1);
913
914 dst.ptr = (void *)dst.ptr + (src.val & mask) / 8;
915 }
916 if (!(d & Mov) && /* optimisation - avoid slow emulated read */
917 ((rc = ops->read_emulated((unsigned long)dst.ptr,
918 &dst.val, dst.bytes, ctxt->vcpu)) != 0))
919 goto done;
920 break;
921 }
922 dst.orig_val = dst.val;
923
924 if (twobyte)
925 goto twobyte_insn;
926
927 switch (b) {
928 case 0x00 ... 0x05:
929 add: /* add */
930 emulate_2op_SrcV("add", src, dst, _eflags);
931 break;
932 case 0x08 ... 0x0d:
933 or: /* or */
934 emulate_2op_SrcV("or", src, dst, _eflags);
935 break;
936 case 0x10 ... 0x15:
937 adc: /* adc */
938 emulate_2op_SrcV("adc", src, dst, _eflags);
939 break;
940 case 0x18 ... 0x1d:
941 sbb: /* sbb */
942 emulate_2op_SrcV("sbb", src, dst, _eflags);
943 break;
944 case 0x20 ... 0x23:
945 and: /* and */
946 emulate_2op_SrcV("and", src, dst, _eflags);
947 break;
948 case 0x24: /* and al imm8 */
949 dst.type = OP_REG;
950 dst.ptr = &_regs[VCPU_REGS_RAX];
951 dst.val = *(u8 *)dst.ptr;
952 dst.bytes = 1;
953 dst.orig_val = dst.val;
954 goto and;
955	case 0x25: /* and ax,imm16 / and eax,imm32 */
956 dst.type = OP_REG;
957 dst.bytes = op_bytes;
958 dst.ptr = &_regs[VCPU_REGS_RAX];
959 if (op_bytes == 2)
960 dst.val = *(u16 *)dst.ptr;
961 else
962 dst.val = *(u32 *)dst.ptr;
963 dst.orig_val = dst.val;
964 goto and;
965 case 0x28 ... 0x2d:
966 sub: /* sub */
967 emulate_2op_SrcV("sub", src, dst, _eflags);
968 break;
969 case 0x30 ... 0x35:
970 xor: /* xor */
971 emulate_2op_SrcV("xor", src, dst, _eflags);
972 break;
973 case 0x38 ... 0x3d:
974 cmp: /* cmp */
975 emulate_2op_SrcV("cmp", src, dst, _eflags);
976 break;
977 case 0x63: /* movsxd */
978 if (mode != X86EMUL_MODE_PROT64)
979 goto cannot_emulate;
980 dst.val = (s32) src.val;
981 break;
982 case 0x80 ... 0x83: /* Grp1 */
983 switch (modrm_reg) {
984 case 0:
985 goto add;
986 case 1:
987 goto or;
988 case 2:
989 goto adc;
990 case 3:
991 goto sbb;
992 case 4:
993 goto and;
994 case 5:
995 goto sub;
996 case 6:
997 goto xor;
998 case 7:
999 goto cmp;
1000 }
1001 break;
1002 case 0x84 ... 0x85:
1003 test: /* test */
1004 emulate_2op_SrcV("test", src, dst, _eflags);
1005 break;
1006 case 0x86 ... 0x87: /* xchg */
1007 /* Write back the register source. */
1008 switch (dst.bytes) {
1009 case 1:
1010 *(u8 *) src.ptr = (u8) dst.val;
1011 break;
1012 case 2:
1013 *(u16 *) src.ptr = (u16) dst.val;
1014 break;
1015 case 4:
1016 *src.ptr = (u32) dst.val;
1017 break; /* 64b reg: zero-extend */
1018 case 8:
1019 *src.ptr = dst.val;
1020 break;
1021 }
1022 /*
1023 * Write back the memory destination with implicit LOCK
1024 * prefix.
1025 */
1026 dst.val = src.val;
1027 lock_prefix = 1;
1028 break;
1029 case 0x88 ... 0x8b: /* mov */
1030 goto mov;
1031 case 0x8d: /* lea r16/r32, m */
1032 dst.val = modrm_val;
1033 break;
1034 case 0x8f: /* pop (sole member of Grp1a) */
1035 /* 64-bit mode: POP always pops a 64-bit operand. */
1036 if (mode == X86EMUL_MODE_PROT64)
1037 dst.bytes = 8;
1038 if ((rc = ops->read_std(register_address(ctxt->ss_base,
1039 _regs[VCPU_REGS_RSP]),
1040 &dst.val, dst.bytes, ctxt->vcpu)) != 0)
1041 goto done;
1042 register_address_increment(_regs[VCPU_REGS_RSP], dst.bytes);
1043 break;
1044 case 0xa0 ... 0xa1: /* mov */
1045 dst.ptr = (unsigned long *)&_regs[VCPU_REGS_RAX];
1046 dst.val = src.val;
1047 _eip += ad_bytes; /* skip src displacement */
1048 break;
1049 case 0xa2 ... 0xa3: /* mov */
1050 dst.val = (unsigned long)_regs[VCPU_REGS_RAX];
1051 _eip += ad_bytes; /* skip dst displacement */
1052 break;
1053 case 0xc0 ... 0xc1:
1054 grp2: /* Grp2 */
1055 switch (modrm_reg) {
1056 case 0: /* rol */
1057 emulate_2op_SrcB("rol", src, dst, _eflags);
1058 break;
1059 case 1: /* ror */
1060 emulate_2op_SrcB("ror", src, dst, _eflags);
1061 break;
1062 case 2: /* rcl */
1063 emulate_2op_SrcB("rcl", src, dst, _eflags);
1064 break;
1065 case 3: /* rcr */
1066 emulate_2op_SrcB("rcr", src, dst, _eflags);
1067 break;
1068 case 4: /* sal/shl */
1069 case 6: /* sal/shl */
1070 emulate_2op_SrcB("sal", src, dst, _eflags);
1071 break;
1072 case 5: /* shr */
1073 emulate_2op_SrcB("shr", src, dst, _eflags);
1074 break;
1075 case 7: /* sar */
1076 emulate_2op_SrcB("sar", src, dst, _eflags);
1077 break;
1078 }
1079 break;
1080 case 0xc6 ... 0xc7: /* mov (sole member of Grp11) */
1081 mov:
1082 dst.val = src.val;
1083 break;
1084 case 0xd0 ... 0xd1: /* Grp2 */
1085 src.val = 1;
1086 goto grp2;
1087 case 0xd2 ... 0xd3: /* Grp2 */
1088 src.val = _regs[VCPU_REGS_RCX];
1089 goto grp2;
1090 case 0xf6 ... 0xf7: /* Grp3 */
1091 switch (modrm_reg) {
1092 case 0 ... 1: /* test */
1093 /*
1094 * Special case in Grp3: test has an immediate
1095 * source operand.
1096 */
1097 src.type = OP_IMM;
1098 src.ptr = (unsigned long *)_eip;
1099 src.bytes = (d & ByteOp) ? 1 : op_bytes;
1100 if (src.bytes == 8)
1101 src.bytes = 4;
1102 switch (src.bytes) {
1103 case 1:
1104 src.val = insn_fetch(s8, 1, _eip);
1105 break;
1106 case 2:
1107 src.val = insn_fetch(s16, 2, _eip);
1108 break;
1109 case 4:
1110 src.val = insn_fetch(s32, 4, _eip);
1111 break;
1112 }
1113 goto test;
1114 case 2: /* not */
1115 dst.val = ~dst.val;
1116 break;
1117 case 3: /* neg */
1118 emulate_1op("neg", dst, _eflags);
1119 break;
1120 default:
1121 goto cannot_emulate;
1122 }
1123 break;
1124 case 0xfe ... 0xff: /* Grp4/Grp5 */
1125 switch (modrm_reg) {
1126 case 0: /* inc */
1127 emulate_1op("inc", dst, _eflags);
1128 break;
1129 case 1: /* dec */
1130 emulate_1op("dec", dst, _eflags);
1131 break;
1132 case 4: /* jmp abs */
1133 if (b == 0xff)
1134 _eip = dst.val;
1135 else
1136 goto cannot_emulate;
1137 break;
1138 case 6: /* push */
1139 /* 64-bit mode: PUSH always pushes a 64-bit operand. */
1140 if (mode == X86EMUL_MODE_PROT64) {
1141 dst.bytes = 8;
1142 if ((rc = ops->read_std((unsigned long)dst.ptr,
1143 &dst.val, 8,
1144 ctxt->vcpu)) != 0)
1145 goto done;
1146 }
1147 register_address_increment(_regs[VCPU_REGS_RSP],
1148 -dst.bytes);
1149 if ((rc = ops->write_emulated(
1150 register_address(ctxt->ss_base,
1151 _regs[VCPU_REGS_RSP]),
1152 &dst.val, dst.bytes, ctxt->vcpu)) != 0)
1153 goto done;
1154 no_wb = 1;
1155 break;
1156 default:
1157 goto cannot_emulate;
1158 }
1159 break;
1160 }
1161
1162writeback:
1163 if (!no_wb) {
1164 switch (dst.type) {
1165 case OP_REG:
1166 /* The 4-byte case *is* correct: in 64-bit mode we zero-extend. */
1167 switch (dst.bytes) {
1168 case 1:
1169 *(u8 *)dst.ptr = (u8)dst.val;
1170 break;
1171 case 2:
1172 *(u16 *)dst.ptr = (u16)dst.val;
1173 break;
1174 case 4:
1175 *dst.ptr = (u32)dst.val;
1176 break; /* 64b: zero-ext */
1177 case 8:
1178 *dst.ptr = dst.val;
1179 break;
1180 }
1181 break;
1182 case OP_MEM:
1183 if (lock_prefix)
1184 rc = ops->cmpxchg_emulated((unsigned long)dst.
1185 ptr, &dst.orig_val,
1186 &dst.val, dst.bytes,
1187 ctxt->vcpu);
1188 else
1189 rc = ops->write_emulated((unsigned long)dst.ptr,
1190 &dst.val, dst.bytes,
1191 ctxt->vcpu);
1192 if (rc != 0)
1193 goto done;
1194 default:
1195 break;
1196 }
1197 }
1198
1199 /* Commit shadow register state. */
1200 memcpy(ctxt->vcpu->regs, _regs, sizeof _regs);
1201 ctxt->eflags = _eflags;
1202 ctxt->vcpu->rip = _eip;
1203
1204done:
1205 return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
1206
1207special_insn:
1208 if (twobyte)
1209 goto twobyte_special_insn;
1210	switch (b) {
1211 case 0x50 ... 0x57: /* push reg */
1212 if (op_bytes == 2)
1213 src.val = (u16) _regs[b & 0x7];
1214 else
1215 src.val = (u32) _regs[b & 0x7];
1216 dst.type = OP_MEM;
1217 dst.bytes = op_bytes;
1218 dst.val = src.val;
1219 register_address_increment(_regs[VCPU_REGS_RSP], -op_bytes);
1220 dst.ptr = (void *) register_address(
1221 ctxt->ss_base, _regs[VCPU_REGS_RSP]);
1222 break;
1223 case 0x58 ... 0x5f: /* pop reg */
1224 dst.ptr = (unsigned long *)&_regs[b & 0x7];
1225 pop_instruction:
1226 if ((rc = ops->read_std(register_address(ctxt->ss_base,
1227 _regs[VCPU_REGS_RSP]), dst.ptr, op_bytes, ctxt->vcpu))
1228 != 0)
1229 goto done;
1230
1231 register_address_increment(_regs[VCPU_REGS_RSP], op_bytes);
1232 no_wb = 1; /* Disable writeback. */
1233 break;
1234 case 0x6a: /* push imm8 */
1235 src.val = 0L;
1236 src.val = insn_fetch(s8, 1, _eip);
1237 push:
1238 dst.type = OP_MEM;
1239 dst.bytes = op_bytes;
1240 dst.val = src.val;
1241 register_address_increment(_regs[VCPU_REGS_RSP], -op_bytes);
1242 dst.ptr = (void *) register_address(ctxt->ss_base,
1243 _regs[VCPU_REGS_RSP]);
1244 break;
1245 case 0x6c: /* insb */
1246 case 0x6d: /* insw/insd */
1247 if (kvm_emulate_pio_string(ctxt->vcpu, NULL,
1248 1, /* in */
1249 (d & ByteOp) ? 1 : op_bytes, /* size */
1250 rep_prefix ?
1251 address_mask(_regs[VCPU_REGS_RCX]) : 1, /* count */
1252 (_eflags & EFLG_DF), /* down */
1253 register_address(ctxt->es_base,
1254 _regs[VCPU_REGS_RDI]), /* address */
1255 rep_prefix,
1256 _regs[VCPU_REGS_RDX] /* port */
1257 ) == 0)
1258 return -1;
1259 return 0;
1260 case 0x6e: /* outsb */
1261 case 0x6f: /* outsw/outsd */
1262 if (kvm_emulate_pio_string(ctxt->vcpu, NULL,
1263 0, /* in */
1264 (d & ByteOp) ? 1 : op_bytes, /* size */
1265 rep_prefix ?
1266 address_mask(_regs[VCPU_REGS_RCX]) : 1, /* count */
1267 (_eflags & EFLG_DF), /* down */
1268 register_address(override_base ?
1269 *override_base : ctxt->ds_base,
1270 _regs[VCPU_REGS_RSI]), /* address */
1271 rep_prefix,
1272 _regs[VCPU_REGS_RDX] /* port */
1273 ) == 0)
1274 return -1;
1275 return 0;
1276 case 0x70 ... 0x7f: /* jcc (short) */ {
1277 int rel = insn_fetch(s8, 1, _eip);
1278
1279 if (test_cc(b, _eflags))
1280 JMP_REL(rel);
1281 break;
1282 }
1283 case 0x9c: /* pushf */
1284 src.val = (unsigned long) _eflags;
1285 goto push;
1286 case 0x9d: /* popf */
1287 dst.ptr = (unsigned long *) &_eflags;
1288 goto pop_instruction;
1289 case 0xc3: /* ret */
1290 dst.ptr = &_eip;
1291 goto pop_instruction;
1292 case 0xf4: /* hlt */
1293 ctxt->vcpu->halt_request = 1;
1294 goto done;
1295 }
1296 if (rep_prefix) {
1297 if (_regs[VCPU_REGS_RCX] == 0) {
1298 ctxt->vcpu->rip = _eip;
1299 goto done;
1300 }
1301 _regs[VCPU_REGS_RCX]--;
1302 _eip = ctxt->vcpu->rip;
1303 }
1304 switch (b) {
1305 case 0xa4 ... 0xa5: /* movs */
1306 dst.type = OP_MEM;
1307 dst.bytes = (d & ByteOp) ? 1 : op_bytes;
1308 dst.ptr = (unsigned long *)register_address(ctxt->es_base,
1309 _regs[VCPU_REGS_RDI]);
1310 if ((rc = ops->read_emulated(register_address(
1311 override_base ? *override_base : ctxt->ds_base,
1312 _regs[VCPU_REGS_RSI]), &dst.val, dst.bytes, ctxt->vcpu)) != 0)
1313 goto done;
1314 register_address_increment(_regs[VCPU_REGS_RSI],
1315 (_eflags & EFLG_DF) ? -dst.bytes : dst.bytes);
1316 register_address_increment(_regs[VCPU_REGS_RDI],
1317 (_eflags & EFLG_DF) ? -dst.bytes : dst.bytes);
1318 break;
1319 case 0xa6 ... 0xa7: /* cmps */
1320 DPRINTF("Urk! I don't handle CMPS.\n");
1321 goto cannot_emulate;
1322 case 0xaa ... 0xab: /* stos */
1323 dst.type = OP_MEM;
1324 dst.bytes = (d & ByteOp) ? 1 : op_bytes;
1325 dst.ptr = (unsigned long *)cr2;
1326 dst.val = _regs[VCPU_REGS_RAX];
1327 register_address_increment(_regs[VCPU_REGS_RDI],
1328 (_eflags & EFLG_DF) ? -dst.bytes : dst.bytes);
1329 break;
1330 case 0xac ... 0xad: /* lods */
1331 dst.type = OP_REG;
1332 dst.bytes = (d & ByteOp) ? 1 : op_bytes;
1333 dst.ptr = (unsigned long *)&_regs[VCPU_REGS_RAX];
1334 if ((rc = ops->read_emulated(cr2, &dst.val, dst.bytes,
1335 ctxt->vcpu)) != 0)
1336 goto done;
1337 register_address_increment(_regs[VCPU_REGS_RSI],
1338 (_eflags & EFLG_DF) ? -dst.bytes : dst.bytes);
1339 break;
1340 case 0xae ... 0xaf: /* scas */
1341 DPRINTF("Urk! I don't handle SCAS.\n");
1342 goto cannot_emulate;
1343 case 0xe8: /* call (near) */ {
1344 long int rel;
1345 switch (op_bytes) {
1346 case 2:
1347 rel = insn_fetch(s16, 2, _eip);
1348 break;
1349 case 4:
1350 rel = insn_fetch(s32, 4, _eip);
1351 break;
1352 case 8:
1353 rel = insn_fetch(s64, 8, _eip);
1354 break;
1355 default:
1356 DPRINTF("Call: Invalid op_bytes\n");
1357 goto cannot_emulate;
1358 }
1359 src.val = (unsigned long) _eip;
1360 JMP_REL(rel);
1361 op_bytes = ad_bytes;
1362 goto push;
1363 }
1364 case 0xe9: /* jmp rel */
1365 case 0xeb: /* jmp rel short */
1366 JMP_REL(src.val);
1367 no_wb = 1; /* Disable writeback. */
1368 break;
1369
1370
1371 }
1372 goto writeback;
1373
1374twobyte_insn:
1375 switch (b) {
1376	case 0x01: /* lgdt, lidt, smsw, lmsw, invlpg */
1377 /* Disable writeback. */
1378 no_wb = 1;
1379 switch (modrm_reg) {
1380 u16 size;
1381 unsigned long address;
1382
1383 case 2: /* lgdt */
1384 rc = read_descriptor(ctxt, ops, src.ptr,
1385 &size, &address, op_bytes);
1386 if (rc)
1387 goto done;
1388 realmode_lgdt(ctxt->vcpu, size, address);
1389 break;
1390 case 3: /* lidt */
1391 rc = read_descriptor(ctxt, ops, src.ptr,
1392 &size, &address, op_bytes);
1393 if (rc)
1394 goto done;
1395 realmode_lidt(ctxt->vcpu, size, address);
1396 break;
1397 case 4: /* smsw */
1398 if (modrm_mod != 3)
1399 goto cannot_emulate;
1400 *(u16 *)&_regs[modrm_rm]
1401 = realmode_get_cr(ctxt->vcpu, 0);
1402 break;
1403 case 6: /* lmsw */
1404 if (modrm_mod != 3)
1405 goto cannot_emulate;
1406 realmode_lmsw(ctxt->vcpu, (u16)modrm_val, &_eflags);
1407 break;
1408		case 7: /* invlpg */
1409 emulate_invlpg(ctxt->vcpu, cr2);
1410 break;
1411 default:
1412 goto cannot_emulate;
1413 }
1414 break;
1415 case 0x21: /* mov from dr to reg */
1416 no_wb = 1;
1417 if (modrm_mod != 3)
1418 goto cannot_emulate;
1419 rc = emulator_get_dr(ctxt, modrm_reg, &_regs[modrm_rm]);
1420 break;
1421 case 0x23: /* mov from reg to dr */
1422 no_wb = 1;
1423 if (modrm_mod != 3)
1424 goto cannot_emulate;
1425 rc = emulator_set_dr(ctxt, modrm_reg, _regs[modrm_rm]);
1426 break;
1427 case 0x40 ... 0x4f: /* cmov */
1428 dst.val = dst.orig_val = src.val;
1429 no_wb = 1;
1430 /*
1431 * First, assume we're decoding an even cmov opcode
1432 * (lsb == 0).
1433 */
1434 switch ((b & 15) >> 1) {
1435 case 0: /* cmovo */
1436 no_wb = (_eflags & EFLG_OF) ? 0 : 1;
1437 break;
1438 case 1: /* cmovb/cmovc/cmovnae */
1439 no_wb = (_eflags & EFLG_CF) ? 0 : 1;
1440 break;
1441 case 2: /* cmovz/cmove */
1442 no_wb = (_eflags & EFLG_ZF) ? 0 : 1;
1443 break;
1444 case 3: /* cmovbe/cmovna */
1445 no_wb = (_eflags & (EFLG_CF | EFLG_ZF)) ? 0 : 1;
1446 break;
1447 case 4: /* cmovs */
1448 no_wb = (_eflags & EFLG_SF) ? 0 : 1;
1449 break;
1450 case 5: /* cmovp/cmovpe */
1451 no_wb = (_eflags & EFLG_PF) ? 0 : 1;
1452 break;
1453 case 7: /* cmovle/cmovng */
1454 no_wb = (_eflags & EFLG_ZF) ? 0 : 1;
1455 /* fall through */
1456 case 6: /* cmovl/cmovnge */
1457 no_wb &= (!(_eflags & EFLG_SF) !=
1458 !(_eflags & EFLG_OF)) ? 0 : 1;
1459 break;
1460 }
1461 /* Odd cmov opcodes (lsb == 1) have inverted sense. */
1462 no_wb ^= b & 1;
1463 break;
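		/*
		 * Editor's note -- worked example (not part of the original
		 * source): 0x44 (cmovz) writes back only when ZF is set,
		 * whereas 0x45 (cmovnz) has the lsb set, so no_wb is
		 * inverted and writeback happens only when ZF is clear.
		 */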
1464 case 0xa3:
1465 bt: /* bt */
1466 src.val &= (dst.bytes << 3) - 1; /* only subword offset */
1467 emulate_2op_SrcV_nobyte("bt", src, dst, _eflags);
1468 break;
1469 case 0xab:
1470 bts: /* bts */
1471 src.val &= (dst.bytes << 3) - 1; /* only subword offset */
1472 emulate_2op_SrcV_nobyte("bts", src, dst, _eflags);
1473 break;
1474 case 0xb0 ... 0xb1: /* cmpxchg */
1475 /*
1476 * Save real source value, then compare EAX against
1477 * destination.
1478 */
1479 src.orig_val = src.val;
1480 src.val = _regs[VCPU_REGS_RAX];
1481 emulate_2op_SrcV("cmp", src, dst, _eflags);
1482 if (_eflags & EFLG_ZF) {
1483 /* Success: write back to memory. */
1484 dst.val = src.orig_val;
1485 } else {
1486 /* Failure: write the value we saw to EAX. */
1487 dst.type = OP_REG;
1488 dst.ptr = (unsigned long *)&_regs[VCPU_REGS_RAX];
1489 }
1490 break;
1491 case 0xb3:
1492 btr: /* btr */
1493 src.val &= (dst.bytes << 3) - 1; /* only subword offset */
1494 emulate_2op_SrcV_nobyte("btr", src, dst, _eflags);
1495 break;
1496 case 0xb6 ... 0xb7: /* movzx */
1497 dst.bytes = op_bytes;
1498 dst.val = (d & ByteOp) ? (u8) src.val : (u16) src.val;
1499 break;
1500 case 0xba: /* Grp8 */
1501 switch (modrm_reg & 3) {
1502 case 0:
1503 goto bt;
1504 case 1:
1505 goto bts;
1506 case 2:
1507 goto btr;
1508 case 3:
1509 goto btc;
1510 }
1511 break;
1512 case 0xbb:
1513 btc: /* btc */
1514 src.val &= (dst.bytes << 3) - 1; /* only subword offset */
1515 emulate_2op_SrcV_nobyte("btc", src, dst, _eflags);
1516 break;
1517 case 0xbe ... 0xbf: /* movsx */
1518 dst.bytes = op_bytes;
1519 dst.val = (d & ByteOp) ? (s8) src.val : (s16) src.val;
1520 break;
1521 case 0xc3: /* movnti */
1522 dst.bytes = op_bytes;
1523 dst.val = (op_bytes == 4) ? (u32) src.val : (u64) src.val;
1524 break;
1525 }
1526 goto writeback;
1527
1528twobyte_special_insn:
1529 /* Disable writeback. */
1530 no_wb = 1;
1531 switch (b) {
1532 case 0x06:
1533 emulate_clts(ctxt->vcpu);
1534 break;
1535 case 0x08: /* invd */
1536 break;
1537 case 0x09: /* wbinvd */
1538 break;
1539 case 0x0d: /* GrpP (prefetch) */
1540 case 0x18: /* Grp16 (prefetch/nop) */
1541 break;
1542 case 0x20: /* mov cr, reg */
1543 if (modrm_mod != 3)
1544 goto cannot_emulate;
1545 _regs[modrm_rm] = realmode_get_cr(ctxt->vcpu, modrm_reg);
1546 break;
1547 case 0x22: /* mov reg, cr */
1548 if (modrm_mod != 3)
1549 goto cannot_emulate;
1550 realmode_set_cr(ctxt->vcpu, modrm_reg, modrm_val, &_eflags);
1551 break;
1552 case 0x30:
1553 /* wrmsr */
1554 msr_data = (u32)_regs[VCPU_REGS_RAX]
1555 | ((u64)_regs[VCPU_REGS_RDX] << 32);
1556 rc = kvm_set_msr(ctxt->vcpu, _regs[VCPU_REGS_RCX], msr_data);
1557 if (rc) {
1558 kvm_x86_ops->inject_gp(ctxt->vcpu, 0);
1559 _eip = ctxt->vcpu->rip;
1560 }
1561 rc = X86EMUL_CONTINUE;
1562 break;
1563 case 0x32:
1564 /* rdmsr */
1565 rc = kvm_get_msr(ctxt->vcpu, _regs[VCPU_REGS_RCX], &msr_data);
1566 if (rc) {
1567 kvm_x86_ops->inject_gp(ctxt->vcpu, 0);
1568 _eip = ctxt->vcpu->rip;
1569 } else {
1570 _regs[VCPU_REGS_RAX] = (u32)msr_data;
1571 _regs[VCPU_REGS_RDX] = msr_data >> 32;
1572 }
1573 rc = X86EMUL_CONTINUE;
1574 break;
1575	case 0x80 ... 0x8f: /* jcc (near) */ {
1576 long int rel;
1577
1578 switch (op_bytes) {
1579 case 2:
1580 rel = insn_fetch(s16, 2, _eip);
1581 break;
1582 case 4:
1583 rel = insn_fetch(s32, 4, _eip);
1584 break;
1585 case 8:
1586 rel = insn_fetch(s64, 8, _eip);
1587 break;
1588 default:
1589 DPRINTF("jnz: Invalid op_bytes\n");
1590 goto cannot_emulate;
1591 }
1592 if (test_cc(b, _eflags))
1593 JMP_REL(rel);
1594 break;
1595 }
1596 case 0xc7: /* Grp9 (cmpxchg8b) */
1597 {
1598 u64 old, new;
1599 if ((rc = ops->read_emulated(cr2, &old, 8, ctxt->vcpu))
1600 != 0)
1601 goto done;
1602 if (((u32) (old >> 0) != (u32) _regs[VCPU_REGS_RAX]) ||
1603 ((u32) (old >> 32) != (u32) _regs[VCPU_REGS_RDX])) {
1604 _regs[VCPU_REGS_RAX] = (u32) (old >> 0);
1605 _regs[VCPU_REGS_RDX] = (u32) (old >> 32);
1606 _eflags &= ~EFLG_ZF;
1607 } else {
1608 new = ((u64)_regs[VCPU_REGS_RCX] << 32)
1609 | (u32) _regs[VCPU_REGS_RBX];
1610 if ((rc = ops->cmpxchg_emulated(cr2, &old,
1611 &new, 8, ctxt->vcpu)) != 0)
1612 goto done;
1613 _eflags |= EFLG_ZF;
1614 }
1615 break;
1616 }
1617 }
1618 goto writeback;
1619
1620cannot_emulate:
1621 DPRINTF("Cannot emulate %02x\n", b);
1622 return -1;
1623}
1624
1625#ifdef __XEN__
1626
1627#include <asm/mm.h>
1628#include <asm/uaccess.h>
1629
1630int
1631x86_emulate_read_std(unsigned long addr,
1632 unsigned long *val,
1633 unsigned int bytes, struct x86_emulate_ctxt *ctxt)
1634{
1635 unsigned int rc;
1636
1637 *val = 0;
1638
1639 if ((rc = copy_from_user((void *)val, (void *)addr, bytes)) != 0) {
1640 propagate_page_fault(addr + bytes - rc, 0); /* read fault */
1641 return X86EMUL_PROPAGATE_FAULT;
1642 }
1643
1644 return X86EMUL_CONTINUE;
1645}
1646
1647int
1648x86_emulate_write_std(unsigned long addr,
1649 unsigned long val,
1650 unsigned int bytes, struct x86_emulate_ctxt *ctxt)
1651{
1652 unsigned int rc;
1653
1654 if ((rc = copy_to_user((void *)addr, (void *)&val, bytes)) != 0) {
1655 propagate_page_fault(addr + bytes - rc, PGERR_write_access);
1656 return X86EMUL_PROPAGATE_FAULT;
1657 }
1658
1659 return X86EMUL_CONTINUE;
1660}
1661
1662#endif
diff --git a/drivers/kvm/x86_emulate.h b/drivers/kvm/x86_emulate.h
deleted file mode 100644
index 92c73aa7f9ac..000000000000
--- a/drivers/kvm/x86_emulate.h
+++ /dev/null
@@ -1,155 +0,0 @@
1/******************************************************************************
2 * x86_emulate.h
3 *
4 * Generic x86 (32-bit and 64-bit) instruction decoder and emulator.
5 *
6 * Copyright (c) 2005 Keir Fraser
7 *
8 * From: xen-unstable 10676:af9809f51f81a3c43f276f00c81a52ef558afda4
9 */
10
11#ifndef __X86_EMULATE_H__
12#define __X86_EMULATE_H__
13
14struct x86_emulate_ctxt;
15
16/*
17 * x86_emulate_ops:
18 *
19 * These operations represent the instruction emulator's interface to memory.
20 * There are two categories of operation: those that act on ordinary memory
21 * regions (*_std), and those that act on memory regions known to require
22 * special treatment or emulation (*_emulated).
23 *
24 * The emulator assumes that an instruction accesses only one 'emulated memory'
25 * location, that this location is the given linear faulting address (cr2), and
26 * that this is one of the instruction's data operands. Instruction fetches and
27 * stack operations are assumed never to access emulated memory. The emulator
28 * automatically deduces which operand of a string-move operation is accessing
29 * emulated memory, and assumes that the other operand accesses normal memory.
30 *
31 * NOTES:
32 * 1. The emulator isn't very smart about emulated vs. standard memory.
33 * 'Emulated memory' access addresses should be checked for sanity.
34 * 'Normal memory' accesses may fault, and the caller must arrange to
35 * detect and handle reentrancy into the emulator via recursive faults.
36 * Accesses may be unaligned and may cross page boundaries.
37 * 2. If the access fails (cannot emulate, or a standard access faults) then
38 * it is up to the memop to propagate the fault to the guest VM via
39 * some out-of-band mechanism, unknown to the emulator. The memop signals
40 * failure by returning X86EMUL_PROPAGATE_FAULT to the emulator, which will
41 * then immediately bail.
42 * 3. Valid access sizes are 1, 2, 4 and 8 bytes. On x86/32 systems only
43 * cmpxchg8b_emulated need support 8-byte accesses.
44 * 4. The emulator cannot handle 64-bit mode emulation on an x86/32 system.
45 */
46/* Access completed successfully: continue emulation as normal. */
47#define X86EMUL_CONTINUE 0
48/* Access is unhandleable: bail from emulation and return error to caller. */
49#define X86EMUL_UNHANDLEABLE 1
50/* Terminate emulation but return success to the caller. */
51#define X86EMUL_PROPAGATE_FAULT 2 /* propagate a generated fault to guest */
52#define X86EMUL_RETRY_INSTR 2 /* retry the instruction for some reason */
53#define X86EMUL_CMPXCHG_FAILED 2 /* cmpxchg did not see expected value */
54struct x86_emulate_ops {
55 /*
56 * read_std: Read bytes of standard (non-emulated/special) memory.
57 * Used for instruction fetch, stack operations, and others.
58 * @addr: [IN ] Linear address from which to read.
59 * @val: [OUT] Value read from memory, zero-extended to 'u_long'.
60 * @bytes: [IN ] Number of bytes to read from memory.
61 */
62 int (*read_std)(unsigned long addr, void *val,
63 unsigned int bytes, struct kvm_vcpu *vcpu);
64
65 /*
66 * write_std: Write bytes of standard (non-emulated/special) memory.
67 * Used for stack operations, and others.
68 * @addr: [IN ] Linear address to which to write.
69 * @val: [IN ] Value to write to memory (low-order bytes used as
70 * required).
71 * @bytes: [IN ] Number of bytes to write to memory.
72 */
73 int (*write_std)(unsigned long addr, const void *val,
74 unsigned int bytes, struct kvm_vcpu *vcpu);
75
76 /*
77 * read_emulated: Read bytes from emulated/special memory area.
78 * @addr: [IN ] Linear address from which to read.
79 * @val: [OUT] Value read from memory, zero-extended to 'u_long'.
80 * @bytes: [IN ] Number of bytes to read from memory.
81 */
82 int (*read_emulated) (unsigned long addr,
83 void *val,
84 unsigned int bytes,
85 struct kvm_vcpu *vcpu);
86
87 /*
88	 * write_emulated: Write bytes to emulated/special memory area.
89 * @addr: [IN ] Linear address to which to write.
90 * @val: [IN ] Value to write to memory (low-order bytes used as
91 * required).
92 * @bytes: [IN ] Number of bytes to write to memory.
93 */
94 int (*write_emulated) (unsigned long addr,
95 const void *val,
96 unsigned int bytes,
97 struct kvm_vcpu *vcpu);
98
99 /*
100 * cmpxchg_emulated: Emulate an atomic (LOCKed) CMPXCHG operation on an
101 * emulated/special memory area.
102 * @addr: [IN ] Linear address to access.
103 * @old: [IN ] Value expected to be current at @addr.
104 * @new: [IN ] Value to write to @addr.
105 * @bytes: [IN ] Number of bytes to access using CMPXCHG.
106 */
107 int (*cmpxchg_emulated) (unsigned long addr,
108 const void *old,
109 const void *new,
110 unsigned int bytes,
111 struct kvm_vcpu *vcpu);
112
113};
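/*
 * Editor's sketch (not part of the original source): the general shape of
 * an ops table a caller might hand to x86_emulate_memop().  The names
 * my_read_std/my_emulate_ops are hypothetical placeholders, not KVM's
 * actual callbacks.
 */
#if 0
static int my_read_std(unsigned long addr, void *val,
		       unsigned int bytes, struct kvm_vcpu *vcpu)
{
	/* Copy 'bytes' bytes of ordinary guest memory at 'addr' into 'val',
	 * returning X86EMUL_PROPAGATE_FAULT if the access faults. */
	return X86EMUL_CONTINUE;
}

static struct x86_emulate_ops my_emulate_ops = {
	.read_std = my_read_std,
	/* .write_std, .read_emulated, .write_emulated and .cmpxchg_emulated
	 * are filled in the same way. */
};
#endif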
114
115struct x86_emulate_ctxt {
116 /* Register state before/after emulation. */
117 struct kvm_vcpu *vcpu;
118
119	/* Guest eflags, and the linear faulting address (cr2) of the instruction being emulated. */
120 unsigned long eflags;
121 unsigned long cr2;
122
123 /* Emulated execution mode, represented by an X86EMUL_MODE value. */
124 int mode;
125
126 unsigned long cs_base;
127 unsigned long ds_base;
128 unsigned long es_base;
129 unsigned long ss_base;
130 unsigned long gs_base;
131 unsigned long fs_base;
132};
133
134/* Execution mode, passed to the emulator. */
135#define X86EMUL_MODE_REAL 0 /* Real mode. */
136#define X86EMUL_MODE_PROT16 2 /* 16-bit protected mode. */
137#define X86EMUL_MODE_PROT32 4 /* 32-bit protected mode. */
138#define X86EMUL_MODE_PROT64 8 /* 64-bit (long) mode. */
139
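/*
 * Editor's sketch (an assumption, not part of the original source): the
 * caller picks one of the modes above from the guest's current state --
 * real mode when protection is off, PROT64 when the CS descriptor's L bit
 * is set, otherwise PROT32 or PROT16 from the CS default-size (D/B) bit.
 * guest_emul_mode() below is a hypothetical helper, shown only to make
 * that mapping concrete.
 */
#if 0
static int guest_emul_mode(int protected_mode, int cs_l, int cs_db)
{
	if (!protected_mode)
		return X86EMUL_MODE_REAL;
	if (cs_l)
		return X86EMUL_MODE_PROT64;
	return cs_db ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
}
#endif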
140/* Host execution mode. */
141#if defined(__i386__)
142#define X86EMUL_MODE_HOST X86EMUL_MODE_PROT32
143#elif defined(CONFIG_X86_64)
144#define X86EMUL_MODE_HOST X86EMUL_MODE_PROT64
145#endif
146
147/*
148 * x86_emulate_memop: Emulate an instruction that faulted attempting to
149 * read/write a 'special' memory area.
150 * Returns -1 on failure, 0 on success.
151 */
152int x86_emulate_memop(struct x86_emulate_ctxt *ctxt,
153 struct x86_emulate_ops *ops);
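/*
 * Editor's sketch (not part of the original source): a hypothetical
 * caller, showing how the context is filled in before invoking the
 * emulator.  Field values here are illustrative only; the segment base
 * fields would come from the guest's segment registers.
 */
#if 0
static int emulate_faulting_insn(struct kvm_vcpu *vcpu,
				 struct x86_emulate_ops *ops,
				 unsigned long cr2, unsigned long eflags)
{
	struct x86_emulate_ctxt ctxt = {
		.vcpu   = vcpu,
		.eflags = eflags,
		.cr2    = cr2,			/* linear address that faulted */
		.mode   = X86EMUL_MODE_PROT32,	/* e.g. a 32-bit protected-mode guest */
	};

	return x86_emulate_memop(&ctxt, ops);	/* 0 on success, -1 on failure */
}
#endif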
154
155#endif /* __X86_EMULATE_H__ */