author     Avi Kivity <avi@qumranet.com>    2007-12-16 04:02:48 -0500
committer  Avi Kivity <avi@qumranet.com>    2008-01-30 11:01:18 -0500
commit     edf884172e9828c6234b254208af04655855038d (patch)
tree       f5e5d1eecaed9737eced6ba60d09fe93149751c1 /drivers
parent     9584bf2c93f56656dba0de8f6c75b54ca7995143 (diff)
KVM: Move arch dependent files to new directory arch/x86/kvm/
This paves the way for multiple architecture support. Note that while
ioapic.c could potentially be shared with ia64, it is also moved.
Signed-off-by: Avi Kivity <avi@qumranet.com>
Diffstat (limited to 'drivers')
-rw-r--r--  drivers/Kconfig                   |    2
-rw-r--r--  drivers/Makefile                  |    1
-rw-r--r--  drivers/kvm/Kconfig               |   57
-rw-r--r--  drivers/kvm/Makefile              |   10
-rw-r--r--  drivers/kvm/i8259.c               |  449
-rw-r--r--  drivers/kvm/ioapic.c              |  402
-rw-r--r--  drivers/kvm/iodev.h               |    2
-rw-r--r--  drivers/kvm/irq.c                 |   99
-rw-r--r--  drivers/kvm/irq.h                 |  196
-rw-r--r--  drivers/kvm/kvm.h                 |  289
-rw-r--r--  drivers/kvm/kvm_main.c            |    2
-rw-r--r--  drivers/kvm/kvm_svm.h             |   45
-rw-r--r--  drivers/kvm/lapic.c               | 1087
-rw-r--r--  drivers/kvm/mmu.c                 | 1806
-rw-r--r--  drivers/kvm/mmu.h                 |   44
-rw-r--r--  drivers/kvm/paging_tmpl.h         |  461
-rw-r--r--  drivers/kvm/segment_descriptor.h  |   29
-rw-r--r--  drivers/kvm/svm.c                 | 1725
-rw-r--r--  drivers/kvm/svm.h                 |  325
-rw-r--r--  drivers/kvm/types.h               |   54
-rw-r--r--  drivers/kvm/vmx.c                 | 2673
-rw-r--r--  drivers/kvm/vmx.h                 |  324
-rw-r--r--  drivers/kvm/x86.c                 | 3148
-rw-r--r--  drivers/kvm/x86.h                 |  602
-rw-r--r--  drivers/kvm/x86_emulate.c         | 1913
-rw-r--r--  drivers/kvm/x86_emulate.h         |  186
26 files changed, 2 insertions, 15929 deletions
diff --git a/drivers/Kconfig b/drivers/Kconfig
index f4076d9e9902..08d4ae201597 100644
--- a/drivers/Kconfig
+++ b/drivers/Kconfig
@@ -90,8 +90,6 @@ source "drivers/dca/Kconfig"
 
 source "drivers/auxdisplay/Kconfig"
 
-source "drivers/kvm/Kconfig"
-
 source "drivers/uio/Kconfig"
 
 source "drivers/virtio/Kconfig"
diff --git a/drivers/Makefile b/drivers/Makefile
index d92d4d82d001..9e1f808e43cf 100644
--- a/drivers/Makefile
+++ b/drivers/Makefile
@@ -47,7 +47,6 @@ obj-$(CONFIG_SPI) += spi/
 obj-$(CONFIG_PCCARD) += pcmcia/
 obj-$(CONFIG_DIO) += dio/
 obj-$(CONFIG_SBUS) += sbus/
-obj-$(CONFIG_KVM) += kvm/
 obj-$(CONFIG_ZORRO) += zorro/
 obj-$(CONFIG_MAC) += macintosh/
 obj-$(CONFIG_ATA_OVER_ETH) += block/aoe/
diff --git a/drivers/kvm/Kconfig b/drivers/kvm/Kconfig
deleted file mode 100644
index c83e1c9b5129..000000000000
--- a/drivers/kvm/Kconfig
+++ /dev/null
@@ -1,57 +0,0 @@
1 | # | ||
2 | # KVM configuration | ||
3 | # | ||
4 | config HAVE_KVM | ||
5 | bool | ||
6 | |||
7 | menuconfig VIRTUALIZATION | ||
8 | bool "Virtualization" | ||
9 | depends on HAVE_KVM || X86 | ||
10 | default y | ||
11 | ---help--- | ||
12 | Say Y here to get to see options for using your Linux host to run other | ||
13 | operating systems inside virtual machines (guests). | ||
14 | This option alone does not add any kernel code. | ||
15 | |||
16 | If you say N, all options in this submenu will be skipped and disabled. | ||
17 | |||
18 | if VIRTUALIZATION | ||
19 | |||
20 | config KVM | ||
21 | tristate "Kernel-based Virtual Machine (KVM) support" | ||
22 | depends on HAVE_KVM && EXPERIMENTAL | ||
23 | select PREEMPT_NOTIFIERS | ||
24 | select ANON_INODES | ||
25 | ---help--- | ||
26 | Support hosting fully virtualized guest machines using hardware | ||
27 | virtualization extensions. You will need a fairly recent | ||
28 | processor equipped with virtualization extensions. You will also | ||
29 | need to select one or more of the processor modules below. | ||
30 | |||
31 | This module provides access to the hardware capabilities through | ||
32 | a character device node named /dev/kvm. | ||
33 | |||
34 | To compile this as a module, choose M here: the module | ||
35 | will be called kvm. | ||
36 | |||
37 | If unsure, say N. | ||
38 | |||
39 | config KVM_INTEL | ||
40 | tristate "KVM for Intel processors support" | ||
41 | depends on KVM | ||
42 | ---help--- | ||
43 | Provides support for KVM on Intel processors equipped with the VT | ||
44 | extensions. | ||
45 | |||
46 | config KVM_AMD | ||
47 | tristate "KVM for AMD processors support" | ||
48 | depends on KVM | ||
49 | ---help--- | ||
50 | Provides support for KVM on AMD processors equipped with the AMD-V | ||
51 | (SVM) extensions. | ||
52 | |||
53 | # OK, it's a little counter-intuitive to do this, but it puts it neatly under | ||
54 | # the virtualization menu. | ||
55 | source drivers/lguest/Kconfig | ||
56 | |||
57 | endif # VIRTUALIZATION | ||
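[Editorial note, not part of the commit: the help text deleted above describes KVM's userspace interface as a character device node, /dev/kvm. As a quick orientation, a minimal userspace program can open that node and query the interface version with the KVM_GET_API_VERSION ioctl from <linux/kvm.h>; this is a sketch, not code from this tree.]

/* Probe /dev/kvm and report the KVM API version (illustrative only). */
#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/kvm.h>

int main(void)
{
	int kvm = open("/dev/kvm", O_RDWR);
	if (kvm < 0) {
		perror("open /dev/kvm");   /* module not loaded or no hardware support */
		return 1;
	}
	int version = ioctl(kvm, KVM_GET_API_VERSION, 0);
	printf("KVM API version: %d\n", version);
	close(kvm);
	return 0;
}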
diff --git a/drivers/kvm/Makefile b/drivers/kvm/Makefile
deleted file mode 100644
index cf18ad46e987..000000000000
--- a/drivers/kvm/Makefile
+++ /dev/null
@@ -1,10 +0,0 @@
1 | # | ||
2 | # Makefile for Kernel-based Virtual Machine module | ||
3 | # | ||
4 | |||
5 | kvm-objs := kvm_main.o x86.o mmu.o x86_emulate.o i8259.o irq.o lapic.o ioapic.o | ||
6 | obj-$(CONFIG_KVM) += kvm.o | ||
7 | kvm-intel-objs = vmx.o | ||
8 | obj-$(CONFIG_KVM_INTEL) += kvm-intel.o | ||
9 | kvm-amd-objs = svm.o | ||
10 | obj-$(CONFIG_KVM_AMD) += kvm-amd.o | ||
diff --git a/drivers/kvm/i8259.c b/drivers/kvm/i8259.c
deleted file mode 100644
index b3cad632f3d5..000000000000
--- a/drivers/kvm/i8259.c
+++ /dev/null
@@ -1,449 +0,0 @@
1 | /* | ||
2 | * 8259 interrupt controller emulation | ||
3 | * | ||
4 | * Copyright (c) 2003-2004 Fabrice Bellard | ||
5 | * Copyright (c) 2007 Intel Corporation | ||
6 | * | ||
7 | * Permission is hereby granted, free of charge, to any person obtaining a copy | ||
8 | * of this software and associated documentation files (the "Software"), to deal | ||
9 | * in the Software without restriction, including without limitation the rights | ||
10 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | ||
11 | * copies of the Software, and to permit persons to whom the Software is | ||
12 | * furnished to do so, subject to the following conditions: | ||
13 | * | ||
14 | * The above copyright notice and this permission notice shall be included in | ||
15 | * all copies or substantial portions of the Software. | ||
16 | * | ||
17 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
18 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
19 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL | ||
20 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
21 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | ||
22 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN | ||
23 | * THE SOFTWARE. | ||
24 | * Authors: | ||
25 | * Yaozu (Eddie) Dong <Eddie.dong@intel.com> | ||
26 | * Port from Qemu. | ||
27 | */ | ||
28 | #include <linux/mm.h> | ||
29 | #include "irq.h" | ||
30 | #include "kvm.h" | ||
31 | |||
32 | /* | ||
33 | * set irq level. If an edge is detected, then the IRR is set to 1 | ||
34 | */ | ||
35 | static inline void pic_set_irq1(struct kvm_kpic_state *s, int irq, int level) | ||
36 | { | ||
37 | int mask; | ||
38 | mask = 1 << irq; | ||
39 | if (s->elcr & mask) /* level triggered */ | ||
40 | if (level) { | ||
41 | s->irr |= mask; | ||
42 | s->last_irr |= mask; | ||
43 | } else { | ||
44 | s->irr &= ~mask; | ||
45 | s->last_irr &= ~mask; | ||
46 | } | ||
47 | else /* edge triggered */ | ||
48 | if (level) { | ||
49 | if ((s->last_irr & mask) == 0) | ||
50 | s->irr |= mask; | ||
51 | s->last_irr |= mask; | ||
52 | } else | ||
53 | s->last_irr &= ~mask; | ||
54 | } | ||
55 | |||
56 | /* | ||
57 | * return the highest priority found in mask (highest = smallest | ||
58 | * number). Return 8 if no irq | ||
59 | */ | ||
60 | static inline int get_priority(struct kvm_kpic_state *s, int mask) | ||
61 | { | ||
62 | int priority; | ||
63 | if (mask == 0) | ||
64 | return 8; | ||
65 | priority = 0; | ||
66 | while ((mask & (1 << ((priority + s->priority_add) & 7))) == 0) | ||
67 | priority++; | ||
68 | return priority; | ||
69 | } | ||
70 | |||
71 | /* | ||
72 | * return the pic wanted interrupt. return -1 if none | ||
73 | */ | ||
74 | static int pic_get_irq(struct kvm_kpic_state *s) | ||
75 | { | ||
76 | int mask, cur_priority, priority; | ||
77 | |||
78 | mask = s->irr & ~s->imr; | ||
79 | priority = get_priority(s, mask); | ||
80 | if (priority == 8) | ||
81 | return -1; | ||
82 | /* | ||
83 | * compute current priority. If special fully nested mode on the | ||
84 | * master, the IRQ coming from the slave is not taken into account | ||
85 | * for the priority computation. | ||
86 | */ | ||
87 | mask = s->isr; | ||
88 | if (s->special_fully_nested_mode && s == &s->pics_state->pics[0]) | ||
89 | mask &= ~(1 << 2); | ||
90 | cur_priority = get_priority(s, mask); | ||
91 | if (priority < cur_priority) | ||
92 | /* | ||
93 | * higher priority found: an irq should be generated | ||
94 | */ | ||
95 | return (priority + s->priority_add) & 7; | ||
96 | else | ||
97 | return -1; | ||
98 | } | ||
99 | |||
100 | /* | ||
101 | * raise irq to CPU if necessary. must be called every time the active | ||
102 | * irq may change | ||
103 | */ | ||
104 | static void pic_update_irq(struct kvm_pic *s) | ||
105 | { | ||
106 | int irq2, irq; | ||
107 | |||
108 | irq2 = pic_get_irq(&s->pics[1]); | ||
109 | if (irq2 >= 0) { | ||
110 | /* | ||
111 | * if irq request by slave pic, signal master PIC | ||
112 | */ | ||
113 | pic_set_irq1(&s->pics[0], 2, 1); | ||
114 | pic_set_irq1(&s->pics[0], 2, 0); | ||
115 | } | ||
116 | irq = pic_get_irq(&s->pics[0]); | ||
117 | if (irq >= 0) | ||
118 | s->irq_request(s->irq_request_opaque, 1); | ||
119 | else | ||
120 | s->irq_request(s->irq_request_opaque, 0); | ||
121 | } | ||
122 | |||
123 | void kvm_pic_update_irq(struct kvm_pic *s) | ||
124 | { | ||
125 | pic_update_irq(s); | ||
126 | } | ||
127 | |||
128 | void kvm_pic_set_irq(void *opaque, int irq, int level) | ||
129 | { | ||
130 | struct kvm_pic *s = opaque; | ||
131 | |||
132 | pic_set_irq1(&s->pics[irq >> 3], irq & 7, level); | ||
133 | pic_update_irq(s); | ||
134 | } | ||
135 | |||
136 | /* | ||
137 | * acknowledge interrupt 'irq' | ||
138 | */ | ||
139 | static inline void pic_intack(struct kvm_kpic_state *s, int irq) | ||
140 | { | ||
141 | if (s->auto_eoi) { | ||
142 | if (s->rotate_on_auto_eoi) | ||
143 | s->priority_add = (irq + 1) & 7; | ||
144 | } else | ||
145 | s->isr |= (1 << irq); | ||
146 | /* | ||
147 | * We don't clear a level sensitive interrupt here | ||
148 | */ | ||
149 | if (!(s->elcr & (1 << irq))) | ||
150 | s->irr &= ~(1 << irq); | ||
151 | } | ||
152 | |||
153 | int kvm_pic_read_irq(struct kvm_pic *s) | ||
154 | { | ||
155 | int irq, irq2, intno; | ||
156 | |||
157 | irq = pic_get_irq(&s->pics[0]); | ||
158 | if (irq >= 0) { | ||
159 | pic_intack(&s->pics[0], irq); | ||
160 | if (irq == 2) { | ||
161 | irq2 = pic_get_irq(&s->pics[1]); | ||
162 | if (irq2 >= 0) | ||
163 | pic_intack(&s->pics[1], irq2); | ||
164 | else | ||
165 | /* | ||
166 | * spurious IRQ on slave controller | ||
167 | */ | ||
168 | irq2 = 7; | ||
169 | intno = s->pics[1].irq_base + irq2; | ||
170 | irq = irq2 + 8; | ||
171 | } else | ||
172 | intno = s->pics[0].irq_base + irq; | ||
173 | } else { | ||
174 | /* | ||
175 | * spurious IRQ on host controller | ||
176 | */ | ||
177 | irq = 7; | ||
178 | intno = s->pics[0].irq_base + irq; | ||
179 | } | ||
180 | pic_update_irq(s); | ||
181 | |||
182 | return intno; | ||
183 | } | ||
184 | |||
185 | void kvm_pic_reset(struct kvm_kpic_state *s) | ||
186 | { | ||
187 | s->last_irr = 0; | ||
188 | s->irr = 0; | ||
189 | s->imr = 0; | ||
190 | s->isr = 0; | ||
191 | s->priority_add = 0; | ||
192 | s->irq_base = 0; | ||
193 | s->read_reg_select = 0; | ||
194 | s->poll = 0; | ||
195 | s->special_mask = 0; | ||
196 | s->init_state = 0; | ||
197 | s->auto_eoi = 0; | ||
198 | s->rotate_on_auto_eoi = 0; | ||
199 | s->special_fully_nested_mode = 0; | ||
200 | s->init4 = 0; | ||
201 | } | ||
202 | |||
203 | static void pic_ioport_write(void *opaque, u32 addr, u32 val) | ||
204 | { | ||
205 | struct kvm_kpic_state *s = opaque; | ||
206 | int priority, cmd, irq; | ||
207 | |||
208 | addr &= 1; | ||
209 | if (addr == 0) { | ||
210 | if (val & 0x10) { | ||
211 | kvm_pic_reset(s); /* init */ | ||
212 | /* | ||
213 | * deassert a pending interrupt | ||
214 | */ | ||
215 | s->pics_state->irq_request(s->pics_state-> | ||
216 | irq_request_opaque, 0); | ||
217 | s->init_state = 1; | ||
218 | s->init4 = val & 1; | ||
219 | if (val & 0x02) | ||
220 | printk(KERN_ERR "single mode not supported"); | ||
221 | if (val & 0x08) | ||
222 | printk(KERN_ERR | ||
223 | "level sensitive irq not supported"); | ||
224 | } else if (val & 0x08) { | ||
225 | if (val & 0x04) | ||
226 | s->poll = 1; | ||
227 | if (val & 0x02) | ||
228 | s->read_reg_select = val & 1; | ||
229 | if (val & 0x40) | ||
230 | s->special_mask = (val >> 5) & 1; | ||
231 | } else { | ||
232 | cmd = val >> 5; | ||
233 | switch (cmd) { | ||
234 | case 0: | ||
235 | case 4: | ||
236 | s->rotate_on_auto_eoi = cmd >> 2; | ||
237 | break; | ||
238 | case 1: /* end of interrupt */ | ||
239 | case 5: | ||
240 | priority = get_priority(s, s->isr); | ||
241 | if (priority != 8) { | ||
242 | irq = (priority + s->priority_add) & 7; | ||
243 | s->isr &= ~(1 << irq); | ||
244 | if (cmd == 5) | ||
245 | s->priority_add = (irq + 1) & 7; | ||
246 | pic_update_irq(s->pics_state); | ||
247 | } | ||
248 | break; | ||
249 | case 3: | ||
250 | irq = val & 7; | ||
251 | s->isr &= ~(1 << irq); | ||
252 | pic_update_irq(s->pics_state); | ||
253 | break; | ||
254 | case 6: | ||
255 | s->priority_add = (val + 1) & 7; | ||
256 | pic_update_irq(s->pics_state); | ||
257 | break; | ||
258 | case 7: | ||
259 | irq = val & 7; | ||
260 | s->isr &= ~(1 << irq); | ||
261 | s->priority_add = (irq + 1) & 7; | ||
262 | pic_update_irq(s->pics_state); | ||
263 | break; | ||
264 | default: | ||
265 | break; /* no operation */ | ||
266 | } | ||
267 | } | ||
268 | } else | ||
269 | switch (s->init_state) { | ||
270 | case 0: /* normal mode */ | ||
271 | s->imr = val; | ||
272 | pic_update_irq(s->pics_state); | ||
273 | break; | ||
274 | case 1: | ||
275 | s->irq_base = val & 0xf8; | ||
276 | s->init_state = 2; | ||
277 | break; | ||
278 | case 2: | ||
279 | if (s->init4) | ||
280 | s->init_state = 3; | ||
281 | else | ||
282 | s->init_state = 0; | ||
283 | break; | ||
284 | case 3: | ||
285 | s->special_fully_nested_mode = (val >> 4) & 1; | ||
286 | s->auto_eoi = (val >> 1) & 1; | ||
287 | s->init_state = 0; | ||
288 | break; | ||
289 | } | ||
290 | } | ||
291 | |||
292 | static u32 pic_poll_read(struct kvm_kpic_state *s, u32 addr1) | ||
293 | { | ||
294 | int ret; | ||
295 | |||
296 | ret = pic_get_irq(s); | ||
297 | if (ret >= 0) { | ||
298 | if (addr1 >> 7) { | ||
299 | s->pics_state->pics[0].isr &= ~(1 << 2); | ||
300 | s->pics_state->pics[0].irr &= ~(1 << 2); | ||
301 | } | ||
302 | s->irr &= ~(1 << ret); | ||
303 | s->isr &= ~(1 << ret); | ||
304 | if (addr1 >> 7 || ret != 2) | ||
305 | pic_update_irq(s->pics_state); | ||
306 | } else { | ||
307 | ret = 0x07; | ||
308 | pic_update_irq(s->pics_state); | ||
309 | } | ||
310 | |||
311 | return ret; | ||
312 | } | ||
313 | |||
314 | static u32 pic_ioport_read(void *opaque, u32 addr1) | ||
315 | { | ||
316 | struct kvm_kpic_state *s = opaque; | ||
317 | unsigned int addr; | ||
318 | int ret; | ||
319 | |||
320 | addr = addr1; | ||
321 | addr &= 1; | ||
322 | if (s->poll) { | ||
323 | ret = pic_poll_read(s, addr1); | ||
324 | s->poll = 0; | ||
325 | } else | ||
326 | if (addr == 0) | ||
327 | if (s->read_reg_select) | ||
328 | ret = s->isr; | ||
329 | else | ||
330 | ret = s->irr; | ||
331 | else | ||
332 | ret = s->imr; | ||
333 | return ret; | ||
334 | } | ||
335 | |||
336 | static void elcr_ioport_write(void *opaque, u32 addr, u32 val) | ||
337 | { | ||
338 | struct kvm_kpic_state *s = opaque; | ||
339 | s->elcr = val & s->elcr_mask; | ||
340 | } | ||
341 | |||
342 | static u32 elcr_ioport_read(void *opaque, u32 addr1) | ||
343 | { | ||
344 | struct kvm_kpic_state *s = opaque; | ||
345 | return s->elcr; | ||
346 | } | ||
347 | |||
348 | static int picdev_in_range(struct kvm_io_device *this, gpa_t addr) | ||
349 | { | ||
350 | switch (addr) { | ||
351 | case 0x20: | ||
352 | case 0x21: | ||
353 | case 0xa0: | ||
354 | case 0xa1: | ||
355 | case 0x4d0: | ||
356 | case 0x4d1: | ||
357 | return 1; | ||
358 | default: | ||
359 | return 0; | ||
360 | } | ||
361 | } | ||
362 | |||
363 | static void picdev_write(struct kvm_io_device *this, | ||
364 | gpa_t addr, int len, const void *val) | ||
365 | { | ||
366 | struct kvm_pic *s = this->private; | ||
367 | unsigned char data = *(unsigned char *)val; | ||
368 | |||
369 | if (len != 1) { | ||
370 | if (printk_ratelimit()) | ||
371 | printk(KERN_ERR "PIC: non byte write\n"); | ||
372 | return; | ||
373 | } | ||
374 | switch (addr) { | ||
375 | case 0x20: | ||
376 | case 0x21: | ||
377 | case 0xa0: | ||
378 | case 0xa1: | ||
379 | pic_ioport_write(&s->pics[addr >> 7], addr, data); | ||
380 | break; | ||
381 | case 0x4d0: | ||
382 | case 0x4d1: | ||
383 | elcr_ioport_write(&s->pics[addr & 1], addr, data); | ||
384 | break; | ||
385 | } | ||
386 | } | ||
387 | |||
388 | static void picdev_read(struct kvm_io_device *this, | ||
389 | gpa_t addr, int len, void *val) | ||
390 | { | ||
391 | struct kvm_pic *s = this->private; | ||
392 | unsigned char data = 0; | ||
393 | |||
394 | if (len != 1) { | ||
395 | if (printk_ratelimit()) | ||
396 | printk(KERN_ERR "PIC: non byte read\n"); | ||
397 | return; | ||
398 | } | ||
399 | switch (addr) { | ||
400 | case 0x20: | ||
401 | case 0x21: | ||
402 | case 0xa0: | ||
403 | case 0xa1: | ||
404 | data = pic_ioport_read(&s->pics[addr >> 7], addr); | ||
405 | break; | ||
406 | case 0x4d0: | ||
407 | case 0x4d1: | ||
408 | data = elcr_ioport_read(&s->pics[addr & 1], addr); | ||
409 | break; | ||
410 | } | ||
411 | *(unsigned char *)val = data; | ||
412 | } | ||
413 | |||
414 | /* | ||
415 | * callback when PIC0 irq status changed | ||
416 | */ | ||
417 | static void pic_irq_request(void *opaque, int level) | ||
418 | { | ||
419 | struct kvm *kvm = opaque; | ||
420 | struct kvm_vcpu *vcpu = kvm->vcpus[0]; | ||
421 | |||
422 | pic_irqchip(kvm)->output = level; | ||
423 | if (vcpu) | ||
424 | kvm_vcpu_kick(vcpu); | ||
425 | } | ||
426 | |||
427 | struct kvm_pic *kvm_create_pic(struct kvm *kvm) | ||
428 | { | ||
429 | struct kvm_pic *s; | ||
430 | s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL); | ||
431 | if (!s) | ||
432 | return NULL; | ||
433 | s->pics[0].elcr_mask = 0xf8; | ||
434 | s->pics[1].elcr_mask = 0xde; | ||
435 | s->irq_request = pic_irq_request; | ||
436 | s->irq_request_opaque = kvm; | ||
437 | s->pics[0].pics_state = s; | ||
438 | s->pics[1].pics_state = s; | ||
439 | |||
440 | /* | ||
441 | * Initialize PIO device | ||
442 | */ | ||
443 | s->dev.read = picdev_read; | ||
444 | s->dev.write = picdev_write; | ||
445 | s->dev.in_range = picdev_in_range; | ||
446 | s->dev.private = s; | ||
447 | kvm_io_bus_register_dev(&kvm->pio_bus, &s->dev); | ||
448 | return s; | ||
449 | } | ||
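[Editorial note, not part of the commit: the PIC emulation deleted above resolves the next interrupt with get_priority()/pic_get_irq(), picking the lowest-numbered pending request after applying the priority_add rotation. A self-contained sketch of that lookup with made-up register values:]

/* Illustration of the 8259 rotating-priority lookup used above (values invented). */
#include <stdio.h>

static int get_priority(int priority_add, int mask)
{
	int priority = 0;

	if (mask == 0)
		return 8;			/* no request pending */
	while ((mask & (1 << ((priority + priority_add) & 7))) == 0)
		priority++;
	return priority;
}

int main(void)
{
	/* IRR with IRQ1 and IRQ4 pending, IMR masking IRQ1: only IRQ4 remains. */
	int irr = (1 << 1) | (1 << 4), imr = (1 << 1);
	int priority_add = 0;
	int prio = get_priority(priority_add, irr & ~imr);

	if (prio != 8)
		printf("next irq = %d\n", (prio + priority_add) & 7);	/* prints 4 */
	return 0;
}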
diff --git a/drivers/kvm/ioapic.c b/drivers/kvm/ioapic.c
deleted file mode 100644
index f8236774c1b4..000000000000
--- a/drivers/kvm/ioapic.c
+++ /dev/null
@@ -1,402 +0,0 @@
1 | /* | ||
2 | * Copyright (C) 2001 MandrakeSoft S.A. | ||
3 | * | ||
4 | * MandrakeSoft S.A. | ||
5 | * 43, rue d'Aboukir | ||
6 | * 75002 Paris - France | ||
7 | * http://www.linux-mandrake.com/ | ||
8 | * http://www.mandrakesoft.com/ | ||
9 | * | ||
10 | * This library is free software; you can redistribute it and/or | ||
11 | * modify it under the terms of the GNU Lesser General Public | ||
12 | * License as published by the Free Software Foundation; either | ||
13 | * version 2 of the License, or (at your option) any later version. | ||
14 | * | ||
15 | * This library is distributed in the hope that it will be useful, | ||
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
18 | * Lesser General Public License for more details. | ||
19 | * | ||
20 | * You should have received a copy of the GNU Lesser General Public | ||
21 | * License along with this library; if not, write to the Free Software | ||
22 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
23 | * | ||
24 | * Yunhong Jiang <yunhong.jiang@intel.com> | ||
25 | * Yaozu (Eddie) Dong <eddie.dong@intel.com> | ||
26 | * Based on Xen 3.1 code. | ||
27 | */ | ||
28 | |||
29 | #include "kvm.h" | ||
30 | #include "x86.h" | ||
31 | |||
32 | #include <linux/kvm.h> | ||
33 | #include <linux/mm.h> | ||
34 | #include <linux/highmem.h> | ||
35 | #include <linux/smp.h> | ||
36 | #include <linux/hrtimer.h> | ||
37 | #include <linux/io.h> | ||
38 | #include <asm/processor.h> | ||
39 | #include <asm/page.h> | ||
40 | #include <asm/current.h> | ||
41 | #include "irq.h" | ||
42 | #if 0 | ||
43 | #define ioapic_debug(fmt,arg...) printk(KERN_WARNING fmt,##arg) | ||
44 | #else | ||
45 | #define ioapic_debug(fmt, arg...) | ||
46 | #endif | ||
47 | static void ioapic_deliver(struct kvm_ioapic *vioapic, int irq); | ||
48 | |||
49 | static unsigned long ioapic_read_indirect(struct kvm_ioapic *ioapic, | ||
50 | unsigned long addr, | ||
51 | unsigned long length) | ||
52 | { | ||
53 | unsigned long result = 0; | ||
54 | |||
55 | switch (ioapic->ioregsel) { | ||
56 | case IOAPIC_REG_VERSION: | ||
57 | result = ((((IOAPIC_NUM_PINS - 1) & 0xff) << 16) | ||
58 | | (IOAPIC_VERSION_ID & 0xff)); | ||
59 | break; | ||
60 | |||
61 | case IOAPIC_REG_APIC_ID: | ||
62 | case IOAPIC_REG_ARB_ID: | ||
63 | result = ((ioapic->id & 0xf) << 24); | ||
64 | break; | ||
65 | |||
66 | default: | ||
67 | { | ||
68 | u32 redir_index = (ioapic->ioregsel - 0x10) >> 1; | ||
69 | u64 redir_content; | ||
70 | |||
71 | ASSERT(redir_index < IOAPIC_NUM_PINS); | ||
72 | |||
73 | redir_content = ioapic->redirtbl[redir_index].bits; | ||
74 | result = (ioapic->ioregsel & 0x1) ? | ||
75 | (redir_content >> 32) & 0xffffffff : | ||
76 | redir_content & 0xffffffff; | ||
77 | break; | ||
78 | } | ||
79 | } | ||
80 | |||
81 | return result; | ||
82 | } | ||
83 | |||
84 | static void ioapic_service(struct kvm_ioapic *ioapic, unsigned int idx) | ||
85 | { | ||
86 | union ioapic_redir_entry *pent; | ||
87 | |||
88 | pent = &ioapic->redirtbl[idx]; | ||
89 | |||
90 | if (!pent->fields.mask) { | ||
91 | ioapic_deliver(ioapic, idx); | ||
92 | if (pent->fields.trig_mode == IOAPIC_LEVEL_TRIG) | ||
93 | pent->fields.remote_irr = 1; | ||
94 | } | ||
95 | if (!pent->fields.trig_mode) | ||
96 | ioapic->irr &= ~(1 << idx); | ||
97 | } | ||
98 | |||
99 | static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val) | ||
100 | { | ||
101 | unsigned index; | ||
102 | |||
103 | switch (ioapic->ioregsel) { | ||
104 | case IOAPIC_REG_VERSION: | ||
105 | /* Writes are ignored. */ | ||
106 | break; | ||
107 | |||
108 | case IOAPIC_REG_APIC_ID: | ||
109 | ioapic->id = (val >> 24) & 0xf; | ||
110 | break; | ||
111 | |||
112 | case IOAPIC_REG_ARB_ID: | ||
113 | break; | ||
114 | |||
115 | default: | ||
116 | index = (ioapic->ioregsel - 0x10) >> 1; | ||
117 | |||
118 | ioapic_debug("change redir index %x val %x\n", index, val); | ||
119 | if (index >= IOAPIC_NUM_PINS) | ||
120 | return; | ||
121 | if (ioapic->ioregsel & 1) { | ||
122 | ioapic->redirtbl[index].bits &= 0xffffffff; | ||
123 | ioapic->redirtbl[index].bits |= (u64) val << 32; | ||
124 | } else { | ||
125 | ioapic->redirtbl[index].bits &= ~0xffffffffULL; | ||
126 | ioapic->redirtbl[index].bits |= (u32) val; | ||
127 | ioapic->redirtbl[index].fields.remote_irr = 0; | ||
128 | } | ||
129 | if (ioapic->irr & (1 << index)) | ||
130 | ioapic_service(ioapic, index); | ||
131 | break; | ||
132 | } | ||
133 | } | ||
134 | |||
135 | static void ioapic_inj_irq(struct kvm_ioapic *ioapic, | ||
136 | struct kvm_vcpu *vcpu, | ||
137 | u8 vector, u8 trig_mode, u8 delivery_mode) | ||
138 | { | ||
139 | ioapic_debug("irq %d trig %d deliv %d\n", vector, trig_mode, | ||
140 | delivery_mode); | ||
141 | |||
142 | ASSERT((delivery_mode == IOAPIC_FIXED) || | ||
143 | (delivery_mode == IOAPIC_LOWEST_PRIORITY)); | ||
144 | |||
145 | kvm_apic_set_irq(vcpu, vector, trig_mode); | ||
146 | } | ||
147 | |||
148 | static u32 ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest, | ||
149 | u8 dest_mode) | ||
150 | { | ||
151 | u32 mask = 0; | ||
152 | int i; | ||
153 | struct kvm *kvm = ioapic->kvm; | ||
154 | struct kvm_vcpu *vcpu; | ||
155 | |||
156 | ioapic_debug("dest %d dest_mode %d\n", dest, dest_mode); | ||
157 | |||
158 | if (dest_mode == 0) { /* Physical mode. */ | ||
159 | if (dest == 0xFF) { /* Broadcast. */ | ||
160 | for (i = 0; i < KVM_MAX_VCPUS; ++i) | ||
161 | if (kvm->vcpus[i] && kvm->vcpus[i]->arch.apic) | ||
162 | mask |= 1 << i; | ||
163 | return mask; | ||
164 | } | ||
165 | for (i = 0; i < KVM_MAX_VCPUS; ++i) { | ||
166 | vcpu = kvm->vcpus[i]; | ||
167 | if (!vcpu) | ||
168 | continue; | ||
169 | if (kvm_apic_match_physical_addr(vcpu->arch.apic, dest)) { | ||
170 | if (vcpu->arch.apic) | ||
171 | mask = 1 << i; | ||
172 | break; | ||
173 | } | ||
174 | } | ||
175 | } else if (dest != 0) /* Logical mode, MDA non-zero. */ | ||
176 | for (i = 0; i < KVM_MAX_VCPUS; ++i) { | ||
177 | vcpu = kvm->vcpus[i]; | ||
178 | if (!vcpu) | ||
179 | continue; | ||
180 | if (vcpu->arch.apic && | ||
181 | kvm_apic_match_logical_addr(vcpu->arch.apic, dest)) | ||
182 | mask |= 1 << vcpu->vcpu_id; | ||
183 | } | ||
184 | ioapic_debug("mask %x\n", mask); | ||
185 | return mask; | ||
186 | } | ||
187 | |||
188 | static void ioapic_deliver(struct kvm_ioapic *ioapic, int irq) | ||
189 | { | ||
190 | u8 dest = ioapic->redirtbl[irq].fields.dest_id; | ||
191 | u8 dest_mode = ioapic->redirtbl[irq].fields.dest_mode; | ||
192 | u8 delivery_mode = ioapic->redirtbl[irq].fields.delivery_mode; | ||
193 | u8 vector = ioapic->redirtbl[irq].fields.vector; | ||
194 | u8 trig_mode = ioapic->redirtbl[irq].fields.trig_mode; | ||
195 | u32 deliver_bitmask; | ||
196 | struct kvm_vcpu *vcpu; | ||
197 | int vcpu_id; | ||
198 | |||
199 | ioapic_debug("dest=%x dest_mode=%x delivery_mode=%x " | ||
200 | "vector=%x trig_mode=%x\n", | ||
201 | dest, dest_mode, delivery_mode, vector, trig_mode); | ||
202 | |||
203 | deliver_bitmask = ioapic_get_delivery_bitmask(ioapic, dest, dest_mode); | ||
204 | if (!deliver_bitmask) { | ||
205 | ioapic_debug("no target on destination\n"); | ||
206 | return; | ||
207 | } | ||
208 | |||
209 | switch (delivery_mode) { | ||
210 | case IOAPIC_LOWEST_PRIORITY: | ||
211 | vcpu = kvm_get_lowest_prio_vcpu(ioapic->kvm, vector, | ||
212 | deliver_bitmask); | ||
213 | if (vcpu != NULL) | ||
214 | ioapic_inj_irq(ioapic, vcpu, vector, | ||
215 | trig_mode, delivery_mode); | ||
216 | else | ||
217 | ioapic_debug("null lowest prio vcpu: " | ||
218 | "mask=%x vector=%x delivery_mode=%x\n", | ||
219 | deliver_bitmask, vector, IOAPIC_LOWEST_PRIORITY); | ||
220 | break; | ||
221 | case IOAPIC_FIXED: | ||
222 | for (vcpu_id = 0; deliver_bitmask != 0; vcpu_id++) { | ||
223 | if (!(deliver_bitmask & (1 << vcpu_id))) | ||
224 | continue; | ||
225 | deliver_bitmask &= ~(1 << vcpu_id); | ||
226 | vcpu = ioapic->kvm->vcpus[vcpu_id]; | ||
227 | if (vcpu) { | ||
228 | ioapic_inj_irq(ioapic, vcpu, vector, | ||
229 | trig_mode, delivery_mode); | ||
230 | } | ||
231 | } | ||
232 | break; | ||
233 | |||
234 | /* TODO: NMI */ | ||
235 | default: | ||
236 | printk(KERN_WARNING "Unsupported delivery mode %d\n", | ||
237 | delivery_mode); | ||
238 | break; | ||
239 | } | ||
240 | } | ||
241 | |||
242 | void kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level) | ||
243 | { | ||
244 | u32 old_irr = ioapic->irr; | ||
245 | u32 mask = 1 << irq; | ||
246 | union ioapic_redir_entry entry; | ||
247 | |||
248 | if (irq >= 0 && irq < IOAPIC_NUM_PINS) { | ||
249 | entry = ioapic->redirtbl[irq]; | ||
250 | level ^= entry.fields.polarity; | ||
251 | if (!level) | ||
252 | ioapic->irr &= ~mask; | ||
253 | else { | ||
254 | ioapic->irr |= mask; | ||
255 | if ((!entry.fields.trig_mode && old_irr != ioapic->irr) | ||
256 | || !entry.fields.remote_irr) | ||
257 | ioapic_service(ioapic, irq); | ||
258 | } | ||
259 | } | ||
260 | } | ||
261 | |||
262 | static int get_eoi_gsi(struct kvm_ioapic *ioapic, int vector) | ||
263 | { | ||
264 | int i; | ||
265 | |||
266 | for (i = 0; i < IOAPIC_NUM_PINS; i++) | ||
267 | if (ioapic->redirtbl[i].fields.vector == vector) | ||
268 | return i; | ||
269 | return -1; | ||
270 | } | ||
271 | |||
272 | void kvm_ioapic_update_eoi(struct kvm *kvm, int vector) | ||
273 | { | ||
274 | struct kvm_ioapic *ioapic = kvm->arch.vioapic; | ||
275 | union ioapic_redir_entry *ent; | ||
276 | int gsi; | ||
277 | |||
278 | gsi = get_eoi_gsi(ioapic, vector); | ||
279 | if (gsi == -1) { | ||
280 | printk(KERN_WARNING "Can't find redir item for %d EOI\n", | ||
281 | vector); | ||
282 | return; | ||
283 | } | ||
284 | |||
285 | ent = &ioapic->redirtbl[gsi]; | ||
286 | ASSERT(ent->fields.trig_mode == IOAPIC_LEVEL_TRIG); | ||
287 | |||
288 | ent->fields.remote_irr = 0; | ||
289 | if (!ent->fields.mask && (ioapic->irr & (1 << gsi))) | ||
290 | ioapic_deliver(ioapic, gsi); | ||
291 | } | ||
292 | |||
293 | static int ioapic_in_range(struct kvm_io_device *this, gpa_t addr) | ||
294 | { | ||
295 | struct kvm_ioapic *ioapic = (struct kvm_ioapic *)this->private; | ||
296 | |||
297 | return ((addr >= ioapic->base_address && | ||
298 | (addr < ioapic->base_address + IOAPIC_MEM_LENGTH))); | ||
299 | } | ||
300 | |||
301 | static void ioapic_mmio_read(struct kvm_io_device *this, gpa_t addr, int len, | ||
302 | void *val) | ||
303 | { | ||
304 | struct kvm_ioapic *ioapic = (struct kvm_ioapic *)this->private; | ||
305 | u32 result; | ||
306 | |||
307 | ioapic_debug("addr %lx\n", (unsigned long)addr); | ||
308 | ASSERT(!(addr & 0xf)); /* check alignment */ | ||
309 | |||
310 | addr &= 0xff; | ||
311 | switch (addr) { | ||
312 | case IOAPIC_REG_SELECT: | ||
313 | result = ioapic->ioregsel; | ||
314 | break; | ||
315 | |||
316 | case IOAPIC_REG_WINDOW: | ||
317 | result = ioapic_read_indirect(ioapic, addr, len); | ||
318 | break; | ||
319 | |||
320 | default: | ||
321 | result = 0; | ||
322 | break; | ||
323 | } | ||
324 | switch (len) { | ||
325 | case 8: | ||
326 | *(u64 *) val = result; | ||
327 | break; | ||
328 | case 1: | ||
329 | case 2: | ||
330 | case 4: | ||
331 | memcpy(val, (char *)&result, len); | ||
332 | break; | ||
333 | default: | ||
334 | printk(KERN_WARNING "ioapic: wrong length %d\n", len); | ||
335 | } | ||
336 | } | ||
337 | |||
338 | static void ioapic_mmio_write(struct kvm_io_device *this, gpa_t addr, int len, | ||
339 | const void *val) | ||
340 | { | ||
341 | struct kvm_ioapic *ioapic = (struct kvm_ioapic *)this->private; | ||
342 | u32 data; | ||
343 | |||
344 | ioapic_debug("ioapic_mmio_write addr=%p len=%d val=%p\n", | ||
345 | (void*)addr, len, val); | ||
346 | ASSERT(!(addr & 0xf)); /* check alignment */ | ||
347 | if (len == 4 || len == 8) | ||
348 | data = *(u32 *) val; | ||
349 | else { | ||
350 | printk(KERN_WARNING "ioapic: Unsupported size %d\n", len); | ||
351 | return; | ||
352 | } | ||
353 | |||
354 | addr &= 0xff; | ||
355 | switch (addr) { | ||
356 | case IOAPIC_REG_SELECT: | ||
357 | ioapic->ioregsel = data; | ||
358 | break; | ||
359 | |||
360 | case IOAPIC_REG_WINDOW: | ||
361 | ioapic_write_indirect(ioapic, data); | ||
362 | break; | ||
363 | #ifdef CONFIG_IA64 | ||
364 | case IOAPIC_REG_EOI: | ||
365 | kvm_ioapic_update_eoi(ioapic, data); | ||
366 | break; | ||
367 | #endif | ||
368 | |||
369 | default: | ||
370 | break; | ||
371 | } | ||
372 | } | ||
373 | |||
374 | void kvm_ioapic_reset(struct kvm_ioapic *ioapic) | ||
375 | { | ||
376 | int i; | ||
377 | |||
378 | for (i = 0; i < IOAPIC_NUM_PINS; i++) | ||
379 | ioapic->redirtbl[i].fields.mask = 1; | ||
380 | ioapic->base_address = IOAPIC_DEFAULT_BASE_ADDRESS; | ||
381 | ioapic->ioregsel = 0; | ||
382 | ioapic->irr = 0; | ||
383 | ioapic->id = 0; | ||
384 | } | ||
385 | |||
386 | int kvm_ioapic_init(struct kvm *kvm) | ||
387 | { | ||
388 | struct kvm_ioapic *ioapic; | ||
389 | |||
390 | ioapic = kzalloc(sizeof(struct kvm_ioapic), GFP_KERNEL); | ||
391 | if (!ioapic) | ||
392 | return -ENOMEM; | ||
393 | kvm->arch.vioapic = ioapic; | ||
394 | kvm_ioapic_reset(ioapic); | ||
395 | ioapic->dev.read = ioapic_mmio_read; | ||
396 | ioapic->dev.write = ioapic_mmio_write; | ||
397 | ioapic->dev.in_range = ioapic_in_range; | ||
398 | ioapic->dev.private = ioapic; | ||
399 | ioapic->kvm = kvm; | ||
400 | kvm_io_bus_register_dev(&kvm->mmio_bus, &ioapic->dev); | ||
401 | return 0; | ||
402 | } | ||
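[Editorial note, not part of the commit: the MMIO handlers deleted above emulate the IOAPIC's indirect register scheme: the guest writes a register index to IOAPIC_REG_SELECT and then accesses the value through IOAPIC_REG_WINDOW, with redirection-table entry N occupying indirect registers 0x10 + 2N (low dword) and 0x10 + 2N + 1 (high dword). A guest-side sketch of programming pin 0 this way follows; mmio_write32() is a hypothetical helper and the snippet is illustrative rather than code from this tree.]

/* Illustrative guest-side programming of IOAPIC redirection entry 0. */
#include <stdint.h>

#define IOAPIC_BASE        0xfec00000u
#define IOAPIC_REG_SELECT  0x00
#define IOAPIC_REG_WINDOW  0x10

/* Hypothetical helper: assumes the IOAPIC MMIO range is identity-mapped. */
static inline void mmio_write32(uint64_t addr, uint32_t val)
{
	*(volatile uint32_t *)(uintptr_t)addr = val;
}

static void ioapic_write_reg(uint8_t reg, uint32_t val)
{
	mmio_write32(IOAPIC_BASE + IOAPIC_REG_SELECT, reg);
	mmio_write32(IOAPIC_BASE + IOAPIC_REG_WINDOW, val);
}

void route_gsi0_to_vector(uint8_t vector)
{
	/* Indirect registers 0x10 + 2*pin hold the low/high dwords of the entry. */
	ioapic_write_reg(0x10 + 2 * 0, vector);		/* low dword: fixed, edge, unmasked */
	ioapic_write_reg(0x10 + 2 * 0 + 1, 0 << 24);	/* high dword: destination APIC ID 0 */
}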
diff --git a/drivers/kvm/iodev.h b/drivers/kvm/iodev.h
index eb9e8a71843a..c14e642027b2 100644
--- a/drivers/kvm/iodev.h
+++ b/drivers/kvm/iodev.h
@@ -16,7 +16,7 @@
 #ifndef __KVM_IODEV_H__
 #define __KVM_IODEV_H__
 
-#include "types.h"
+#include <linux/kvm_types.h>
 
 struct kvm_io_device {
 	void (*read)(struct kvm_io_device *this,
diff --git a/drivers/kvm/irq.c b/drivers/kvm/irq.c
deleted file mode 100644
index 59b47c55fc76..000000000000
--- a/drivers/kvm/irq.c
+++ /dev/null
@@ -1,99 +0,0 @@
1 | /* | ||
2 | * irq.c: API for in kernel interrupt controller | ||
3 | * Copyright (c) 2007, Intel Corporation. | ||
4 | * | ||
5 | * This program is free software; you can redistribute it and/or modify it | ||
6 | * under the terms and conditions of the GNU General Public License, | ||
7 | * version 2, as published by the Free Software Foundation. | ||
8 | * | ||
9 | * This program is distributed in the hope it will be useful, but WITHOUT | ||
10 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | ||
11 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for | ||
12 | * more details. | ||
13 | * | ||
14 | * You should have received a copy of the GNU General Public License along with | ||
15 | * this program; if not, write to the Free Software Foundation, Inc., 59 Temple | ||
16 | * Place - Suite 330, Boston, MA 02111-1307 USA. | ||
17 | * Authors: | ||
18 | * Yaozu (Eddie) Dong <Eddie.dong@intel.com> | ||
19 | * | ||
20 | */ | ||
21 | |||
22 | #include <linux/module.h> | ||
23 | |||
24 | #include "kvm.h" | ||
25 | #include "x86.h" | ||
26 | #include "irq.h" | ||
27 | |||
28 | /* | ||
29 | * check if there is pending interrupt without | ||
30 | * intack. | ||
31 | */ | ||
32 | int kvm_cpu_has_interrupt(struct kvm_vcpu *v) | ||
33 | { | ||
34 | struct kvm_pic *s; | ||
35 | |||
36 | if (kvm_apic_has_interrupt(v) == -1) { /* LAPIC */ | ||
37 | if (kvm_apic_accept_pic_intr(v)) { | ||
38 | s = pic_irqchip(v->kvm); /* PIC */ | ||
39 | return s->output; | ||
40 | } else | ||
41 | return 0; | ||
42 | } | ||
43 | return 1; | ||
44 | } | ||
45 | EXPORT_SYMBOL_GPL(kvm_cpu_has_interrupt); | ||
46 | |||
47 | /* | ||
48 | * Read pending interrupt vector and intack. | ||
49 | */ | ||
50 | int kvm_cpu_get_interrupt(struct kvm_vcpu *v) | ||
51 | { | ||
52 | struct kvm_pic *s; | ||
53 | int vector; | ||
54 | |||
55 | vector = kvm_get_apic_interrupt(v); /* APIC */ | ||
56 | if (vector == -1) { | ||
57 | if (kvm_apic_accept_pic_intr(v)) { | ||
58 | s = pic_irqchip(v->kvm); | ||
59 | s->output = 0; /* PIC */ | ||
60 | vector = kvm_pic_read_irq(s); | ||
61 | } | ||
62 | } | ||
63 | return vector; | ||
64 | } | ||
65 | EXPORT_SYMBOL_GPL(kvm_cpu_get_interrupt); | ||
66 | |||
67 | static void vcpu_kick_intr(void *info) | ||
68 | { | ||
69 | #ifdef DEBUG | ||
70 | struct kvm_vcpu *vcpu = (struct kvm_vcpu *)info; | ||
71 | printk(KERN_DEBUG "vcpu_kick_intr %p \n", vcpu); | ||
72 | #endif | ||
73 | } | ||
74 | |||
75 | void kvm_vcpu_kick(struct kvm_vcpu *vcpu) | ||
76 | { | ||
77 | int ipi_pcpu = vcpu->cpu; | ||
78 | |||
79 | if (waitqueue_active(&vcpu->wq)) { | ||
80 | wake_up_interruptible(&vcpu->wq); | ||
81 | ++vcpu->stat.halt_wakeup; | ||
82 | } | ||
83 | if (vcpu->guest_mode) | ||
84 | smp_call_function_single(ipi_pcpu, vcpu_kick_intr, vcpu, 0, 0); | ||
85 | } | ||
86 | |||
87 | void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu) | ||
88 | { | ||
89 | kvm_inject_apic_timer_irqs(vcpu); | ||
90 | /* TODO: PIT, RTC etc. */ | ||
91 | } | ||
92 | EXPORT_SYMBOL_GPL(kvm_inject_pending_timer_irqs); | ||
93 | |||
94 | void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec) | ||
95 | { | ||
96 | kvm_apic_timer_intr_post(vcpu, vec); | ||
97 | /* TODO: PIT, RTC etc. */ | ||
98 | } | ||
99 | EXPORT_SYMBOL_GPL(kvm_timer_intr_post); | ||
diff --git a/drivers/kvm/irq.h b/drivers/kvm/irq.h
deleted file mode 100644
index 6e023dc3f848..000000000000
--- a/drivers/kvm/irq.h
+++ /dev/null
@@ -1,196 +0,0 @@
1 | /* | ||
2 | * irq.h: in kernel interrupt controller related definitions | ||
3 | * Copyright (c) 2007, Intel Corporation. | ||
4 | * | ||
5 | * This program is free software; you can redistribute it and/or modify it | ||
6 | * under the terms and conditions of the GNU General Public License, | ||
7 | * version 2, as published by the Free Software Foundation. | ||
8 | * | ||
9 | * This program is distributed in the hope it will be useful, but WITHOUT | ||
10 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | ||
11 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for | ||
12 | * more details. | ||
13 | * | ||
14 | * You should have received a copy of the GNU General Public License along with | ||
15 | * this program; if not, write to the Free Software Foundation, Inc., 59 Temple | ||
16 | * Place - Suite 330, Boston, MA 02111-1307 USA. | ||
17 | * Authors: | ||
18 | * Yaozu (Eddie) Dong <Eddie.dong@intel.com> | ||
19 | * | ||
20 | */ | ||
21 | |||
22 | #ifndef __IRQ_H | ||
23 | #define __IRQ_H | ||
24 | |||
25 | #include <linux/mm_types.h> | ||
26 | #include <linux/hrtimer.h> | ||
27 | #include <asm/kvm.h> | ||
28 | #include "iodev.h" | ||
29 | #include "kvm.h" | ||
30 | |||
31 | struct kvm; | ||
32 | struct kvm_vcpu; | ||
33 | |||
34 | typedef void irq_request_func(void *opaque, int level); | ||
35 | |||
36 | struct kvm_kpic_state { | ||
37 | u8 last_irr; /* edge detection */ | ||
38 | u8 irr; /* interrupt request register */ | ||
39 | u8 imr; /* interrupt mask register */ | ||
40 | u8 isr; /* interrupt service register */ | ||
41 | u8 priority_add; /* highest irq priority */ | ||
42 | u8 irq_base; | ||
43 | u8 read_reg_select; | ||
44 | u8 poll; | ||
45 | u8 special_mask; | ||
46 | u8 init_state; | ||
47 | u8 auto_eoi; | ||
48 | u8 rotate_on_auto_eoi; | ||
49 | u8 special_fully_nested_mode; | ||
50 | u8 init4; /* true if 4 byte init */ | ||
51 | u8 elcr; /* PIIX edge/trigger selection */ | ||
52 | u8 elcr_mask; | ||
53 | struct kvm_pic *pics_state; | ||
54 | }; | ||
55 | |||
56 | struct kvm_pic { | ||
57 | struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */ | ||
58 | irq_request_func *irq_request; | ||
59 | void *irq_request_opaque; | ||
60 | int output; /* intr from master PIC */ | ||
61 | struct kvm_io_device dev; | ||
62 | }; | ||
63 | |||
64 | struct kvm_pic *kvm_create_pic(struct kvm *kvm); | ||
65 | void kvm_pic_set_irq(void *opaque, int irq, int level); | ||
66 | int kvm_pic_read_irq(struct kvm_pic *s); | ||
67 | void kvm_pic_update_irq(struct kvm_pic *s); | ||
68 | |||
69 | #define IOAPIC_NUM_PINS KVM_IOAPIC_NUM_PINS | ||
70 | #define IOAPIC_VERSION_ID 0x11 /* IOAPIC version */ | ||
71 | #define IOAPIC_EDGE_TRIG 0 | ||
72 | #define IOAPIC_LEVEL_TRIG 1 | ||
73 | |||
74 | #define IOAPIC_DEFAULT_BASE_ADDRESS 0xfec00000 | ||
75 | #define IOAPIC_MEM_LENGTH 0x100 | ||
76 | |||
77 | /* Direct registers. */ | ||
78 | #define IOAPIC_REG_SELECT 0x00 | ||
79 | #define IOAPIC_REG_WINDOW 0x10 | ||
80 | #define IOAPIC_REG_EOI 0x40 /* IA64 IOSAPIC only */ | ||
81 | |||
82 | /* Indirect registers. */ | ||
83 | #define IOAPIC_REG_APIC_ID 0x00 /* x86 IOAPIC only */ | ||
84 | #define IOAPIC_REG_VERSION 0x01 | ||
85 | #define IOAPIC_REG_ARB_ID 0x02 /* x86 IOAPIC only */ | ||
86 | |||
87 | /*ioapic delivery mode*/ | ||
88 | #define IOAPIC_FIXED 0x0 | ||
89 | #define IOAPIC_LOWEST_PRIORITY 0x1 | ||
90 | #define IOAPIC_PMI 0x2 | ||
91 | #define IOAPIC_NMI 0x4 | ||
92 | #define IOAPIC_INIT 0x5 | ||
93 | #define IOAPIC_EXTINT 0x7 | ||
94 | |||
95 | struct kvm_ioapic { | ||
96 | u64 base_address; | ||
97 | u32 ioregsel; | ||
98 | u32 id; | ||
99 | u32 irr; | ||
100 | u32 pad; | ||
101 | union ioapic_redir_entry { | ||
102 | u64 bits; | ||
103 | struct { | ||
104 | u8 vector; | ||
105 | u8 delivery_mode:3; | ||
106 | u8 dest_mode:1; | ||
107 | u8 delivery_status:1; | ||
108 | u8 polarity:1; | ||
109 | u8 remote_irr:1; | ||
110 | u8 trig_mode:1; | ||
111 | u8 mask:1; | ||
112 | u8 reserve:7; | ||
113 | u8 reserved[4]; | ||
114 | u8 dest_id; | ||
115 | } fields; | ||
116 | } redirtbl[IOAPIC_NUM_PINS]; | ||
117 | struct kvm_io_device dev; | ||
118 | struct kvm *kvm; | ||
119 | }; | ||
120 | |||
121 | struct kvm_lapic { | ||
122 | unsigned long base_address; | ||
123 | struct kvm_io_device dev; | ||
124 | struct { | ||
125 | atomic_t pending; | ||
126 | s64 period; /* unit: ns */ | ||
127 | u32 divide_count; | ||
128 | ktime_t last_update; | ||
129 | struct hrtimer dev; | ||
130 | } timer; | ||
131 | struct kvm_vcpu *vcpu; | ||
132 | struct page *regs_page; | ||
133 | void *regs; | ||
134 | }; | ||
135 | |||
136 | #ifdef DEBUG | ||
137 | #define ASSERT(x) \ | ||
138 | do { \ | ||
139 | if (!(x)) { \ | ||
140 | printk(KERN_EMERG "assertion failed %s: %d: %s\n", \ | ||
141 | __FILE__, __LINE__, #x); \ | ||
142 | BUG(); \ | ||
143 | } \ | ||
144 | } while (0) | ||
145 | #else | ||
146 | #define ASSERT(x) do { } while (0) | ||
147 | #endif | ||
148 | |||
149 | static inline struct kvm_pic *pic_irqchip(struct kvm *kvm) | ||
150 | { | ||
151 | return kvm->arch.vpic; | ||
152 | } | ||
153 | |||
154 | static inline struct kvm_ioapic *ioapic_irqchip(struct kvm *kvm) | ||
155 | { | ||
156 | return kvm->arch.vioapic; | ||
157 | } | ||
158 | |||
159 | static inline int irqchip_in_kernel(struct kvm *kvm) | ||
160 | { | ||
161 | return pic_irqchip(kvm) != NULL; | ||
162 | } | ||
163 | |||
164 | void kvm_vcpu_kick(struct kvm_vcpu *vcpu); | ||
165 | int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu); | ||
166 | int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu); | ||
167 | int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu); | ||
168 | int kvm_create_lapic(struct kvm_vcpu *vcpu); | ||
169 | void kvm_lapic_reset(struct kvm_vcpu *vcpu); | ||
170 | void kvm_pic_reset(struct kvm_kpic_state *s); | ||
171 | void kvm_ioapic_reset(struct kvm_ioapic *ioapic); | ||
172 | void kvm_free_lapic(struct kvm_vcpu *vcpu); | ||
173 | u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu); | ||
174 | void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8); | ||
175 | void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value); | ||
176 | |||
177 | struct kvm_vcpu *kvm_get_lowest_prio_vcpu(struct kvm *kvm, u8 vector, | ||
178 | unsigned long bitmap); | ||
179 | u64 kvm_get_apic_base(struct kvm_vcpu *vcpu); | ||
180 | void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data); | ||
181 | int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest); | ||
182 | void kvm_ioapic_update_eoi(struct kvm *kvm, int vector); | ||
183 | int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda); | ||
184 | int kvm_apic_set_irq(struct kvm_vcpu *vcpu, u8 vec, u8 trig); | ||
185 | void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu); | ||
186 | int kvm_ioapic_init(struct kvm *kvm); | ||
187 | void kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level); | ||
188 | int kvm_lapic_enabled(struct kvm_vcpu *vcpu); | ||
189 | int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu); | ||
190 | void kvm_apic_timer_intr_post(struct kvm_vcpu *vcpu, int vec); | ||
191 | void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec); | ||
192 | void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu); | ||
193 | void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu); | ||
194 | void kvm_migrate_apic_timer(struct kvm_vcpu *vcpu); | ||
195 | |||
196 | #endif | ||
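[Editorial note, not part of the commit: union ioapic_redir_entry above lays out the 64-bit redirection-table entry: vector in bits 0-7, delivery mode in bits 8-10, destination mode in bit 11, polarity in bit 13, trigger mode in bit 15, mask in bit 16, destination APIC ID in bits 56-63. A small worked example of packing and decoding such an entry:]

/* Illustrative pack/decode of an IOAPIC redirection entry per the layout above. */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t bits = ((uint64_t)0x01 << 56)	/* dest_id = APIC ID 1 */
		      | (1ull << 15)		/* trig_mode = level   */
		      | 0x31;			/* vector 0x31, fixed delivery, edge bits clear */

	printf("vector=0x%02x delivery=%u level=%u dest=%u\n",
	       (unsigned)(bits & 0xff),
	       (unsigned)((bits >> 8) & 0x7),
	       (unsigned)((bits >> 15) & 0x1),
	       (unsigned)(bits >> 56));
	return 0;
}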
diff --git a/drivers/kvm/kvm.h b/drivers/kvm/kvm.h
deleted file mode 100644
index bf6a3b330a3d..000000000000
--- a/drivers/kvm/kvm.h
+++ /dev/null
@@ -1,289 +0,0 @@
1 | #ifndef __KVM_H | ||
2 | #define __KVM_H | ||
3 | |||
4 | /* | ||
5 | * This work is licensed under the terms of the GNU GPL, version 2. See | ||
6 | * the COPYING file in the top-level directory. | ||
7 | */ | ||
8 | |||
9 | #include <linux/types.h> | ||
10 | #include <linux/hardirq.h> | ||
11 | #include <linux/list.h> | ||
12 | #include <linux/mutex.h> | ||
13 | #include <linux/spinlock.h> | ||
14 | #include <linux/signal.h> | ||
15 | #include <linux/sched.h> | ||
16 | #include <linux/mm.h> | ||
17 | #include <linux/preempt.h> | ||
18 | #include <asm/signal.h> | ||
19 | |||
20 | #include <linux/kvm.h> | ||
21 | #include <linux/kvm_para.h> | ||
22 | |||
23 | #include "types.h" | ||
24 | |||
25 | #include "x86.h" | ||
26 | |||
27 | #define KVM_MAX_VCPUS 4 | ||
28 | #define KVM_MEMORY_SLOTS 8 | ||
29 | /* memory slots that does not exposed to userspace */ | ||
30 | #define KVM_PRIVATE_MEM_SLOTS 4 | ||
31 | |||
32 | #define KVM_PIO_PAGE_OFFSET 1 | ||
33 | |||
34 | /* | ||
35 | * vcpu->requests bit members | ||
36 | */ | ||
37 | #define KVM_REQ_TLB_FLUSH 0 | ||
38 | |||
39 | |||
40 | struct kvm_vcpu; | ||
41 | extern struct kmem_cache *kvm_vcpu_cache; | ||
42 | |||
43 | struct kvm_guest_debug { | ||
44 | int enabled; | ||
45 | unsigned long bp[4]; | ||
46 | int singlestep; | ||
47 | }; | ||
48 | |||
49 | /* | ||
50 | * It would be nice to use something smarter than a linear search, TBD... | ||
51 | * Thankfully we dont expect many devices to register (famous last words :), | ||
52 | * so until then it will suffice. At least its abstracted so we can change | ||
53 | * in one place. | ||
54 | */ | ||
55 | struct kvm_io_bus { | ||
56 | int dev_count; | ||
57 | #define NR_IOBUS_DEVS 6 | ||
58 | struct kvm_io_device *devs[NR_IOBUS_DEVS]; | ||
59 | }; | ||
60 | |||
61 | void kvm_io_bus_init(struct kvm_io_bus *bus); | ||
62 | void kvm_io_bus_destroy(struct kvm_io_bus *bus); | ||
63 | struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus, gpa_t addr); | ||
64 | void kvm_io_bus_register_dev(struct kvm_io_bus *bus, | ||
65 | struct kvm_io_device *dev); | ||
66 | |||
67 | struct kvm_vcpu { | ||
68 | struct kvm *kvm; | ||
69 | struct preempt_notifier preempt_notifier; | ||
70 | int vcpu_id; | ||
71 | struct mutex mutex; | ||
72 | int cpu; | ||
73 | struct kvm_run *run; | ||
74 | int guest_mode; | ||
75 | unsigned long requests; | ||
76 | struct kvm_guest_debug guest_debug; | ||
77 | int fpu_active; | ||
78 | int guest_fpu_loaded; | ||
79 | wait_queue_head_t wq; | ||
80 | int sigset_active; | ||
81 | sigset_t sigset; | ||
82 | struct kvm_vcpu_stat stat; | ||
83 | |||
84 | #ifdef CONFIG_HAS_IOMEM | ||
85 | int mmio_needed; | ||
86 | int mmio_read_completed; | ||
87 | int mmio_is_write; | ||
88 | int mmio_size; | ||
89 | unsigned char mmio_data[8]; | ||
90 | gpa_t mmio_phys_addr; | ||
91 | #endif | ||
92 | |||
93 | struct kvm_vcpu_arch arch; | ||
94 | }; | ||
95 | |||
96 | struct kvm_memory_slot { | ||
97 | gfn_t base_gfn; | ||
98 | unsigned long npages; | ||
99 | unsigned long flags; | ||
100 | unsigned long *rmap; | ||
101 | unsigned long *dirty_bitmap; | ||
102 | unsigned long userspace_addr; | ||
103 | int user_alloc; | ||
104 | }; | ||
105 | |||
106 | struct kvm { | ||
107 | struct mutex lock; /* protects everything except vcpus */ | ||
108 | struct mm_struct *mm; /* userspace tied to this vm */ | ||
109 | int nmemslots; | ||
110 | struct kvm_memory_slot memslots[KVM_MEMORY_SLOTS + | ||
111 | KVM_PRIVATE_MEM_SLOTS]; | ||
112 | struct kvm_vcpu *vcpus[KVM_MAX_VCPUS]; | ||
113 | struct list_head vm_list; | ||
114 | struct file *filp; | ||
115 | struct kvm_io_bus mmio_bus; | ||
116 | struct kvm_io_bus pio_bus; | ||
117 | struct kvm_vm_stat stat; | ||
118 | struct kvm_arch arch; | ||
119 | }; | ||
120 | |||
121 | /* The guest did something we don't support. */ | ||
122 | #define pr_unimpl(vcpu, fmt, ...) \ | ||
123 | do { \ | ||
124 | if (printk_ratelimit()) \ | ||
125 | printk(KERN_ERR "kvm: %i: cpu%i " fmt, \ | ||
126 | current->tgid, (vcpu)->vcpu_id , ## __VA_ARGS__); \ | ||
127 | } while (0) | ||
128 | |||
129 | #define kvm_printf(kvm, fmt ...) printk(KERN_DEBUG fmt) | ||
130 | #define vcpu_printf(vcpu, fmt...) kvm_printf(vcpu->kvm, fmt) | ||
131 | |||
132 | int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id); | ||
133 | void kvm_vcpu_uninit(struct kvm_vcpu *vcpu); | ||
134 | |||
135 | void vcpu_load(struct kvm_vcpu *vcpu); | ||
136 | void vcpu_put(struct kvm_vcpu *vcpu); | ||
137 | |||
138 | void decache_vcpus_on_cpu(int cpu); | ||
139 | |||
140 | |||
141 | int kvm_init(void *opaque, unsigned int vcpu_size, | ||
142 | struct module *module); | ||
143 | void kvm_exit(void); | ||
144 | |||
145 | #define HPA_MSB ((sizeof(hpa_t) * 8) - 1) | ||
146 | #define HPA_ERR_MASK ((hpa_t)1 << HPA_MSB) | ||
147 | static inline int is_error_hpa(hpa_t hpa) { return hpa >> HPA_MSB; } | ||
148 | struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva); | ||
149 | |||
150 | extern struct page *bad_page; | ||
151 | |||
152 | int is_error_page(struct page *page); | ||
153 | int kvm_is_error_hva(unsigned long addr); | ||
154 | int kvm_set_memory_region(struct kvm *kvm, | ||
155 | struct kvm_userspace_memory_region *mem, | ||
156 | int user_alloc); | ||
157 | int __kvm_set_memory_region(struct kvm *kvm, | ||
158 | struct kvm_userspace_memory_region *mem, | ||
159 | int user_alloc); | ||
160 | int kvm_arch_set_memory_region(struct kvm *kvm, | ||
161 | struct kvm_userspace_memory_region *mem, | ||
162 | struct kvm_memory_slot old, | ||
163 | int user_alloc); | ||
164 | gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn); | ||
165 | struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn); | ||
166 | void kvm_release_page_clean(struct page *page); | ||
167 | void kvm_release_page_dirty(struct page *page); | ||
168 | int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset, | ||
169 | int len); | ||
170 | int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len); | ||
171 | int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data, | ||
172 | int offset, int len); | ||
173 | int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data, | ||
174 | unsigned long len); | ||
175 | int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len); | ||
176 | int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len); | ||
177 | struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn); | ||
178 | int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn); | ||
179 | void mark_page_dirty(struct kvm *kvm, gfn_t gfn); | ||
180 | |||
181 | void kvm_vcpu_block(struct kvm_vcpu *vcpu); | ||
182 | void kvm_resched(struct kvm_vcpu *vcpu); | ||
183 | void kvm_load_guest_fpu(struct kvm_vcpu *vcpu); | ||
184 | void kvm_put_guest_fpu(struct kvm_vcpu *vcpu); | ||
185 | void kvm_flush_remote_tlbs(struct kvm *kvm); | ||
186 | |||
187 | long kvm_arch_dev_ioctl(struct file *filp, | ||
188 | unsigned int ioctl, unsigned long arg); | ||
189 | long kvm_arch_vcpu_ioctl(struct file *filp, | ||
190 | unsigned int ioctl, unsigned long arg); | ||
191 | void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu); | ||
192 | void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu); | ||
193 | |||
194 | int kvm_dev_ioctl_check_extension(long ext); | ||
195 | |||
196 | int kvm_get_dirty_log(struct kvm *kvm, | ||
197 | struct kvm_dirty_log *log, int *is_dirty); | ||
198 | int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, | ||
199 | struct kvm_dirty_log *log); | ||
200 | |||
201 | int kvm_vm_ioctl_set_memory_region(struct kvm *kvm, | ||
202 | struct | ||
203 | kvm_userspace_memory_region *mem, | ||
204 | int user_alloc); | ||
205 | long kvm_arch_vm_ioctl(struct file *filp, | ||
206 | unsigned int ioctl, unsigned long arg); | ||
207 | void kvm_arch_destroy_vm(struct kvm *kvm); | ||
208 | |||
209 | int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu); | ||
210 | int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu); | ||
211 | |||
212 | int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, | ||
213 | struct kvm_translation *tr); | ||
214 | |||
215 | int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs); | ||
216 | int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs); | ||
217 | int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, | ||
218 | struct kvm_sregs *sregs); | ||
219 | int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, | ||
220 | struct kvm_sregs *sregs); | ||
221 | int kvm_arch_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu, | ||
222 | struct kvm_debug_guest *dbg); | ||
223 | int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run); | ||
224 | |||
225 | int kvm_arch_init(void *opaque); | ||
226 | void kvm_arch_exit(void); | ||
227 | |||
228 | int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu); | ||
229 | void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu); | ||
230 | |||
231 | void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu); | ||
232 | void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu); | ||
233 | void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu); | ||
234 | struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id); | ||
235 | int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu); | ||
236 | void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu); | ||
237 | |||
238 | int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu); | ||
239 | void kvm_arch_hardware_enable(void *garbage); | ||
240 | void kvm_arch_hardware_disable(void *garbage); | ||
241 | int kvm_arch_hardware_setup(void); | ||
242 | void kvm_arch_hardware_unsetup(void); | ||
243 | void kvm_arch_check_processor_compat(void *rtn); | ||
244 | int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu); | ||
245 | |||
246 | void kvm_free_physmem(struct kvm *kvm); | ||
247 | |||
248 | struct kvm *kvm_arch_create_vm(void); | ||
249 | void kvm_arch_destroy_vm(struct kvm *kvm); | ||
250 | |||
251 | int kvm_cpu_get_interrupt(struct kvm_vcpu *v); | ||
252 | int kvm_cpu_has_interrupt(struct kvm_vcpu *v); | ||
253 | |||
254 | static inline void kvm_guest_enter(void) | ||
255 | { | ||
256 | account_system_vtime(current); | ||
257 | current->flags |= PF_VCPU; | ||
258 | } | ||
259 | |||
260 | static inline void kvm_guest_exit(void) | ||
261 | { | ||
262 | account_system_vtime(current); | ||
263 | current->flags &= ~PF_VCPU; | ||
264 | } | ||
265 | |||
266 | static inline int memslot_id(struct kvm *kvm, struct kvm_memory_slot *slot) | ||
267 | { | ||
268 | return slot - kvm->memslots; | ||
269 | } | ||
270 | |||
271 | static inline gpa_t gfn_to_gpa(gfn_t gfn) | ||
272 | { | ||
273 | return (gpa_t)gfn << PAGE_SHIFT; | ||
274 | } | ||
275 | |||
276 | enum kvm_stat_kind { | ||
277 | KVM_STAT_VM, | ||
278 | KVM_STAT_VCPU, | ||
279 | }; | ||
280 | |||
281 | struct kvm_stats_debugfs_item { | ||
282 | const char *name; | ||
283 | int offset; | ||
284 | enum kvm_stat_kind kind; | ||
285 | struct dentry *dentry; | ||
286 | }; | ||
287 | extern struct kvm_stats_debugfs_item debugfs_entries[]; | ||
288 | |||
289 | #endif | ||
diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c
index ae2a1bf640bc..4026d7d64296 100644
--- a/drivers/kvm/kvm_main.c
+++ b/drivers/kvm/kvm_main.c
@@ -15,9 +15,9 @@
  *
  */
 
-#include "kvm.h"
 #include "iodev.h"
 
+#include <linux/kvm_host.h>
 #include <linux/kvm.h>
 #include <linux/module.h>
 #include <linux/errno.h>
diff --git a/drivers/kvm/kvm_svm.h b/drivers/kvm/kvm_svm.h deleted file mode 100644 index a0e415daef5b..000000000000 --- a/drivers/kvm/kvm_svm.h +++ /dev/null | |||
@@ -1,45 +0,0 @@ | |||
1 | #ifndef __KVM_SVM_H | ||
2 | #define __KVM_SVM_H | ||
3 | |||
4 | #include <linux/kernel.h> | ||
5 | #include <linux/types.h> | ||
6 | #include <linux/list.h> | ||
7 | #include <asm/msr.h> | ||
8 | |||
9 | #include "svm.h" | ||
10 | #include "kvm.h" | ||
11 | |||
12 | static const u32 host_save_user_msrs[] = { | ||
13 | #ifdef CONFIG_X86_64 | ||
14 | MSR_STAR, MSR_LSTAR, MSR_CSTAR, MSR_SYSCALL_MASK, MSR_KERNEL_GS_BASE, | ||
15 | MSR_FS_BASE, | ||
16 | #endif | ||
17 | MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, | ||
18 | }; | ||
19 | |||
20 | #define NR_HOST_SAVE_USER_MSRS ARRAY_SIZE(host_save_user_msrs) | ||
21 | #define NUM_DB_REGS 4 | ||
22 | |||
23 | struct kvm_vcpu; | ||
24 | |||
25 | struct vcpu_svm { | ||
26 | struct kvm_vcpu vcpu; | ||
27 | struct vmcb *vmcb; | ||
28 | unsigned long vmcb_pa; | ||
29 | struct svm_cpu_data *svm_data; | ||
30 | uint64_t asid_generation; | ||
31 | |||
32 | unsigned long db_regs[NUM_DB_REGS]; | ||
33 | |||
34 | u64 next_rip; | ||
35 | |||
36 | u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS]; | ||
37 | u64 host_gs_base; | ||
38 | unsigned long host_cr2; | ||
39 | unsigned long host_db_regs[NUM_DB_REGS]; | ||
40 | unsigned long host_dr6; | ||
41 | unsigned long host_dr7; | ||
42 | }; | ||
43 | |||
44 | #endif | ||
45 | |||
diff --git a/drivers/kvm/lapic.c b/drivers/kvm/lapic.c deleted file mode 100644 index 8c74bf184a07..000000000000 --- a/drivers/kvm/lapic.c +++ /dev/null | |||
@@ -1,1087 +0,0 @@ | |||
1 | |||
2 | /* | ||
3 | * Local APIC virtualization | ||
4 | * | ||
5 | * Copyright (C) 2006 Qumranet, Inc. | ||
6 | * Copyright (C) 2007 Novell | ||
7 | * Copyright (C) 2007 Intel | ||
8 | * | ||
9 | * Authors: | ||
10 | * Dor Laor <dor.laor@qumranet.com> | ||
11 | * Gregory Haskins <ghaskins@novell.com> | ||
12 | * Yaozu (Eddie) Dong <eddie.dong@intel.com> | ||
13 | * | ||
14 | * Based on Xen 3.1 code, Copyright (c) 2004, Intel Corporation. | ||
15 | * | ||
16 | * This work is licensed under the terms of the GNU GPL, version 2. See | ||
17 | * the COPYING file in the top-level directory. | ||
18 | */ | ||
19 | |||
20 | #include "kvm.h" | ||
21 | #include "x86.h" | ||
22 | |||
23 | #include <linux/kvm.h> | ||
24 | #include <linux/mm.h> | ||
25 | #include <linux/highmem.h> | ||
26 | #include <linux/smp.h> | ||
27 | #include <linux/hrtimer.h> | ||
28 | #include <linux/io.h> | ||
29 | #include <linux/module.h> | ||
30 | #include <asm/processor.h> | ||
31 | #include <asm/msr.h> | ||
32 | #include <asm/page.h> | ||
33 | #include <asm/current.h> | ||
34 | #include <asm/apicdef.h> | ||
35 | #include <asm/atomic.h> | ||
36 | #include <asm/div64.h> | ||
37 | #include "irq.h" | ||
38 | |||
39 | #define PRId64 "d" | ||
40 | #define PRIx64 "llx" | ||
41 | #define PRIu64 "u" | ||
42 | #define PRIo64 "o" | ||
43 | |||
44 | #define APIC_BUS_CYCLE_NS 1 | ||
45 | |||
46 | /* #define apic_debug(fmt,arg...) printk(KERN_WARNING fmt,##arg) */ | ||
47 | #define apic_debug(fmt, arg...) | ||
48 | |||
49 | #define APIC_LVT_NUM 6 | ||
50 | /* 14 is the version for Xeon and Pentium 8.4.8 */ | ||
51 | #define APIC_VERSION (0x14UL | ((APIC_LVT_NUM - 1) << 16)) | ||
52 | #define LAPIC_MMIO_LENGTH (1 << 12) | ||
53 | /* the following defines are not in apicdef.h */ | ||
54 | #define APIC_SHORT_MASK 0xc0000 | ||
55 | #define APIC_DEST_NOSHORT 0x0 | ||
56 | #define APIC_DEST_MASK 0x800 | ||
57 | #define MAX_APIC_VECTOR 256 | ||
58 | |||
59 | #define VEC_POS(v) ((v) & (32 - 1)) | ||
60 | #define REG_POS(v) (((v) >> 5) << 4) | ||
61 | |||
62 | static inline u32 apic_get_reg(struct kvm_lapic *apic, int reg_off) | ||
63 | { | ||
64 | return *((u32 *) (apic->regs + reg_off)); | ||
65 | } | ||
66 | |||
67 | static inline void apic_set_reg(struct kvm_lapic *apic, int reg_off, u32 val) | ||
68 | { | ||
69 | *((u32 *) (apic->regs + reg_off)) = val; | ||
70 | } | ||
71 | |||
72 | static inline int apic_test_and_set_vector(int vec, void *bitmap) | ||
73 | { | ||
74 | return test_and_set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); | ||
75 | } | ||
76 | |||
77 | static inline int apic_test_and_clear_vector(int vec, void *bitmap) | ||
78 | { | ||
79 | return test_and_clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); | ||
80 | } | ||
81 | |||
82 | static inline void apic_set_vector(int vec, void *bitmap) | ||
83 | { | ||
84 | set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); | ||
85 | } | ||
86 | |||
87 | static inline void apic_clear_vector(int vec, void *bitmap) | ||
88 | { | ||
89 | clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); | ||
90 | } | ||
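/*
 * A minimal illustrative sketch (hypothetical helper name) of the layout
 * that VEC_POS()/REG_POS() above encode: IRR, ISR and TMR are 256-bit
 * registers stored as eight 32-bit words spaced 0x10 bytes apart, so
 * vector v maps to bit (v % 32) of the word at byte offset (v / 32) * 0x10.
 */
static inline int apic_vector_layout_example(void)
{
	/* vector 0x31: word at byte offset 0x10, bit 17 within that word */
	return REG_POS(0x31) == 0x10 && VEC_POS(0x31) == 17;
}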
91 | |||
92 | static inline int apic_hw_enabled(struct kvm_lapic *apic) | ||
93 | { | ||
94 | return (apic)->vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE; | ||
95 | } | ||
96 | |||
97 | static inline int apic_sw_enabled(struct kvm_lapic *apic) | ||
98 | { | ||
99 | return apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_APIC_ENABLED; | ||
100 | } | ||
101 | |||
102 | static inline int apic_enabled(struct kvm_lapic *apic) | ||
103 | { | ||
104 | return apic_sw_enabled(apic) && apic_hw_enabled(apic); | ||
105 | } | ||
106 | |||
107 | #define LVT_MASK \ | ||
108 | (APIC_LVT_MASKED | APIC_SEND_PENDING | APIC_VECTOR_MASK) | ||
109 | |||
110 | #define LINT_MASK \ | ||
111 | (LVT_MASK | APIC_MODE_MASK | APIC_INPUT_POLARITY | \ | ||
112 | APIC_LVT_REMOTE_IRR | APIC_LVT_LEVEL_TRIGGER) | ||
113 | |||
114 | static inline int kvm_apic_id(struct kvm_lapic *apic) | ||
115 | { | ||
116 | return (apic_get_reg(apic, APIC_ID) >> 24) & 0xff; | ||
117 | } | ||
118 | |||
119 | static inline int apic_lvt_enabled(struct kvm_lapic *apic, int lvt_type) | ||
120 | { | ||
121 | return !(apic_get_reg(apic, lvt_type) & APIC_LVT_MASKED); | ||
122 | } | ||
123 | |||
124 | static inline int apic_lvt_vector(struct kvm_lapic *apic, int lvt_type) | ||
125 | { | ||
126 | return apic_get_reg(apic, lvt_type) & APIC_VECTOR_MASK; | ||
127 | } | ||
128 | |||
129 | static inline int apic_lvtt_period(struct kvm_lapic *apic) | ||
130 | { | ||
131 | return apic_get_reg(apic, APIC_LVTT) & APIC_LVT_TIMER_PERIODIC; | ||
132 | } | ||
133 | |||
134 | static unsigned int apic_lvt_mask[APIC_LVT_NUM] = { | ||
135 | LVT_MASK | APIC_LVT_TIMER_PERIODIC, /* LVTT */ | ||
136 | LVT_MASK | APIC_MODE_MASK, /* LVTTHMR */ | ||
137 | LVT_MASK | APIC_MODE_MASK, /* LVTPC */ | ||
138 | LINT_MASK, LINT_MASK, /* LVT0-1 */ | ||
139 | LVT_MASK /* LVTERR */ | ||
140 | }; | ||
141 | |||
142 | static int find_highest_vector(void *bitmap) | ||
143 | { | ||
144 | u32 *word = bitmap; | ||
145 | int word_offset = MAX_APIC_VECTOR >> 5; | ||
146 | |||
147 | while ((word_offset != 0) && (word[(--word_offset) << 2] == 0)) | ||
148 | continue; | ||
149 | |||
150 | if (likely(!word_offset && !word[0])) | ||
151 | return -1; | ||
152 | else | ||
153 | return fls(word[word_offset << 2]) - 1 + (word_offset << 5); | ||
154 | } | ||
155 | |||
156 | static inline int apic_test_and_set_irr(int vec, struct kvm_lapic *apic) | ||
157 | { | ||
158 | return apic_test_and_set_vector(vec, apic->regs + APIC_IRR); | ||
159 | } | ||
160 | |||
161 | static inline void apic_clear_irr(int vec, struct kvm_lapic *apic) | ||
162 | { | ||
163 | apic_clear_vector(vec, apic->regs + APIC_IRR); | ||
164 | } | ||
165 | |||
166 | static inline int apic_find_highest_irr(struct kvm_lapic *apic) | ||
167 | { | ||
168 | int result; | ||
169 | |||
170 | result = find_highest_vector(apic->regs + APIC_IRR); | ||
171 | ASSERT(result == -1 || result >= 16); | ||
172 | |||
173 | return result; | ||
174 | } | ||
175 | |||
176 | int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu) | ||
177 | { | ||
178 | struct kvm_lapic *apic = vcpu->arch.apic; | ||
179 | int highest_irr; | ||
180 | |||
181 | if (!apic) | ||
182 | return 0; | ||
183 | highest_irr = apic_find_highest_irr(apic); | ||
184 | |||
185 | return highest_irr; | ||
186 | } | ||
187 | EXPORT_SYMBOL_GPL(kvm_lapic_find_highest_irr); | ||
188 | |||
189 | int kvm_apic_set_irq(struct kvm_vcpu *vcpu, u8 vec, u8 trig) | ||
190 | { | ||
191 | struct kvm_lapic *apic = vcpu->arch.apic; | ||
192 | |||
193 | if (!apic_test_and_set_irr(vec, apic)) { | ||
194 | /* a new pending irq is set in IRR */ | ||
195 | if (trig) | ||
196 | apic_set_vector(vec, apic->regs + APIC_TMR); | ||
197 | else | ||
198 | apic_clear_vector(vec, apic->regs + APIC_TMR); | ||
199 | kvm_vcpu_kick(apic->vcpu); | ||
200 | return 1; | ||
201 | } | ||
202 | return 0; | ||
203 | } | ||
204 | |||
205 | static inline int apic_find_highest_isr(struct kvm_lapic *apic) | ||
206 | { | ||
207 | int result; | ||
208 | |||
209 | result = find_highest_vector(apic->regs + APIC_ISR); | ||
210 | ASSERT(result == -1 || result >= 16); | ||
211 | |||
212 | return result; | ||
213 | } | ||
214 | |||
215 | static void apic_update_ppr(struct kvm_lapic *apic) | ||
216 | { | ||
217 | u32 tpr, isrv, ppr; | ||
218 | int isr; | ||
219 | |||
220 | tpr = apic_get_reg(apic, APIC_TASKPRI); | ||
221 | isr = apic_find_highest_isr(apic); | ||
222 | isrv = (isr != -1) ? isr : 0; | ||
223 | |||
224 | if ((tpr & 0xf0) >= (isrv & 0xf0)) | ||
225 | ppr = tpr & 0xff; | ||
226 | else | ||
227 | ppr = isrv & 0xf0; | ||
228 | |||
229 | apic_debug("vlapic %p, ppr 0x%x, isr 0x%x, isrv 0x%x", | ||
230 | apic, ppr, isr, isrv); | ||
231 | |||
232 | apic_set_reg(apic, APIC_PROCPRI, ppr); | ||
233 | } | ||
234 | |||
235 | static void apic_set_tpr(struct kvm_lapic *apic, u32 tpr) | ||
236 | { | ||
237 | apic_set_reg(apic, APIC_TASKPRI, tpr); | ||
238 | apic_update_ppr(apic); | ||
239 | } | ||
240 | |||
241 | int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest) | ||
242 | { | ||
243 | return kvm_apic_id(apic) == dest; | ||
244 | } | ||
245 | |||
246 | int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda) | ||
247 | { | ||
248 | int result = 0; | ||
249 | u8 logical_id; | ||
250 | |||
251 | logical_id = GET_APIC_LOGICAL_ID(apic_get_reg(apic, APIC_LDR)); | ||
252 | |||
253 | switch (apic_get_reg(apic, APIC_DFR)) { | ||
254 | case APIC_DFR_FLAT: | ||
255 | if (logical_id & mda) | ||
256 | result = 1; | ||
257 | break; | ||
258 | case APIC_DFR_CLUSTER: | ||
259 | if (((logical_id >> 4) == (mda >> 0x4)) | ||
260 | && (logical_id & mda & 0xf)) | ||
261 | result = 1; | ||
262 | break; | ||
263 | default: | ||
264 | printk(KERN_WARNING "Bad DFR vcpu %d: %08x\n", | ||
265 | apic->vcpu->vcpu_id, apic_get_reg(apic, APIC_DFR)); | ||
266 | break; | ||
267 | } | ||
268 | |||
269 | return result; | ||
270 | } | ||
271 | |||
272 | static int apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, | ||
273 | int short_hand, int dest, int dest_mode) | ||
274 | { | ||
275 | int result = 0; | ||
276 | struct kvm_lapic *target = vcpu->arch.apic; | ||
277 | |||
278 | apic_debug("target %p, source %p, dest 0x%x, " | ||
279 | "dest_mode 0x%x, short_hand 0x%x", | ||
280 | target, source, dest, dest_mode, short_hand); | ||
281 | |||
282 | ASSERT(!target); | ||
283 | switch (short_hand) { | ||
284 | case APIC_DEST_NOSHORT: | ||
285 | if (dest_mode == 0) { | ||
286 | /* Physical mode. */ | ||
287 | if ((dest == 0xFF) || (dest == kvm_apic_id(target))) | ||
288 | result = 1; | ||
289 | } else | ||
290 | /* Logical mode. */ | ||
291 | result = kvm_apic_match_logical_addr(target, dest); | ||
292 | break; | ||
293 | case APIC_DEST_SELF: | ||
294 | if (target == source) | ||
295 | result = 1; | ||
296 | break; | ||
297 | case APIC_DEST_ALLINC: | ||
298 | result = 1; | ||
299 | break; | ||
300 | case APIC_DEST_ALLBUT: | ||
301 | if (target != source) | ||
302 | result = 1; | ||
303 | break; | ||
304 | default: | ||
305 | printk(KERN_WARNING "Bad dest shorthand value %x\n", | ||
306 | short_hand); | ||
307 | break; | ||
308 | } | ||
309 | |||
310 | return result; | ||
311 | } | ||
312 | |||
313 | /* | ||
314 | * Add a pending IRQ into lapic. | ||
315 | * Return 1 if successfully added and 0 if discarded. | ||
316 | */ | ||
317 | static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, | ||
318 | int vector, int level, int trig_mode) | ||
319 | { | ||
320 | int orig_irr, result = 0; | ||
321 | struct kvm_vcpu *vcpu = apic->vcpu; | ||
322 | |||
323 | switch (delivery_mode) { | ||
324 | case APIC_DM_FIXED: | ||
325 | case APIC_DM_LOWEST: | ||
326 | /* FIXME add logic for vcpu on reset */ | ||
327 | if (unlikely(!apic_enabled(apic))) | ||
328 | break; | ||
329 | |||
330 | orig_irr = apic_test_and_set_irr(vector, apic); | ||
331 | if (orig_irr && trig_mode) { | ||
332 | apic_debug("level trig mode repeatedly for vector %d", | ||
333 | vector); | ||
334 | break; | ||
335 | } | ||
336 | |||
337 | if (trig_mode) { | ||
338 | apic_debug("level trig mode for vector %d", vector); | ||
339 | apic_set_vector(vector, apic->regs + APIC_TMR); | ||
340 | } else | ||
341 | apic_clear_vector(vector, apic->regs + APIC_TMR); | ||
342 | |||
343 | if (vcpu->arch.mp_state == VCPU_MP_STATE_RUNNABLE) | ||
344 | kvm_vcpu_kick(vcpu); | ||
345 | else if (vcpu->arch.mp_state == VCPU_MP_STATE_HALTED) { | ||
346 | vcpu->arch.mp_state = VCPU_MP_STATE_RUNNABLE; | ||
347 | if (waitqueue_active(&vcpu->wq)) | ||
348 | wake_up_interruptible(&vcpu->wq); | ||
349 | } | ||
350 | |||
351 | result = (orig_irr == 0); | ||
352 | break; | ||
353 | |||
354 | case APIC_DM_REMRD: | ||
355 | printk(KERN_DEBUG "Ignoring delivery mode 3\n"); | ||
356 | break; | ||
357 | |||
358 | case APIC_DM_SMI: | ||
359 | printk(KERN_DEBUG "Ignoring guest SMI\n"); | ||
360 | break; | ||
361 | case APIC_DM_NMI: | ||
362 | printk(KERN_DEBUG "Ignoring guest NMI\n"); | ||
363 | break; | ||
364 | |||
365 | case APIC_DM_INIT: | ||
366 | if (level) { | ||
367 | if (vcpu->arch.mp_state == VCPU_MP_STATE_RUNNABLE) | ||
368 | printk(KERN_DEBUG | ||
369 | "INIT on a runnable vcpu %d\n", | ||
370 | vcpu->vcpu_id); | ||
371 | vcpu->arch.mp_state = VCPU_MP_STATE_INIT_RECEIVED; | ||
372 | kvm_vcpu_kick(vcpu); | ||
373 | } else { | ||
374 | printk(KERN_DEBUG | ||
375 | "Ignoring de-assert INIT to vcpu %d\n", | ||
376 | vcpu->vcpu_id); | ||
377 | } | ||
378 | |||
379 | break; | ||
380 | |||
381 | case APIC_DM_STARTUP: | ||
382 | printk(KERN_DEBUG "SIPI to vcpu %d vector 0x%02x\n", | ||
383 | vcpu->vcpu_id, vector); | ||
384 | if (vcpu->arch.mp_state == VCPU_MP_STATE_INIT_RECEIVED) { | ||
385 | vcpu->arch.sipi_vector = vector; | ||
386 | vcpu->arch.mp_state = VCPU_MP_STATE_SIPI_RECEIVED; | ||
387 | if (waitqueue_active(&vcpu->wq)) | ||
388 | wake_up_interruptible(&vcpu->wq); | ||
389 | } | ||
390 | break; | ||
391 | |||
392 | default: | ||
393 | printk(KERN_ERR "TODO: unsupported delivery mode %x\n", | ||
394 | delivery_mode); | ||
395 | break; | ||
396 | } | ||
397 | return result; | ||
398 | } | ||
399 | |||
400 | static struct kvm_lapic *kvm_apic_round_robin(struct kvm *kvm, u8 vector, | ||
401 | unsigned long bitmap) | ||
402 | { | ||
403 | int last; | ||
404 | int next; | ||
405 | struct kvm_lapic *apic = NULL; | ||
406 | |||
407 | last = kvm->arch.round_robin_prev_vcpu; | ||
408 | next = last; | ||
409 | |||
410 | do { | ||
411 | if (++next == KVM_MAX_VCPUS) | ||
412 | next = 0; | ||
413 | if (kvm->vcpus[next] == NULL || !test_bit(next, &bitmap)) | ||
414 | continue; | ||
415 | apic = kvm->vcpus[next]->arch.apic; | ||
416 | if (apic && apic_enabled(apic)) | ||
417 | break; | ||
418 | apic = NULL; | ||
419 | } while (next != last); | ||
420 | kvm->arch.round_robin_prev_vcpu = next; | ||
421 | |||
422 | if (!apic) | ||
423 | printk(KERN_DEBUG "vcpu not ready for apic_round_robin\n"); | ||
424 | |||
425 | return apic; | ||
426 | } | ||
427 | |||
428 | struct kvm_vcpu *kvm_get_lowest_prio_vcpu(struct kvm *kvm, u8 vector, | ||
429 | unsigned long bitmap) | ||
430 | { | ||
431 | struct kvm_lapic *apic; | ||
432 | |||
433 | apic = kvm_apic_round_robin(kvm, vector, bitmap); | ||
434 | if (apic) | ||
435 | return apic->vcpu; | ||
436 | return NULL; | ||
437 | } | ||
438 | |||
439 | static void apic_set_eoi(struct kvm_lapic *apic) | ||
440 | { | ||
441 | int vector = apic_find_highest_isr(apic); | ||
442 | |||
443 | /* | ||
444 | * Not every EOI write has a corresponding bit set in the ISR; | ||
445 | * one example is when the kernel checks the timer in setup_IO_APIC. | ||
446 | */ | ||
447 | if (vector == -1) | ||
448 | return; | ||
449 | |||
450 | apic_clear_vector(vector, apic->regs + APIC_ISR); | ||
451 | apic_update_ppr(apic); | ||
452 | |||
453 | if (apic_test_and_clear_vector(vector, apic->regs + APIC_TMR)) | ||
454 | kvm_ioapic_update_eoi(apic->vcpu->kvm, vector); | ||
455 | } | ||
456 | |||
457 | static void apic_send_ipi(struct kvm_lapic *apic) | ||
458 | { | ||
459 | u32 icr_low = apic_get_reg(apic, APIC_ICR); | ||
460 | u32 icr_high = apic_get_reg(apic, APIC_ICR2); | ||
461 | |||
462 | unsigned int dest = GET_APIC_DEST_FIELD(icr_high); | ||
463 | unsigned int short_hand = icr_low & APIC_SHORT_MASK; | ||
464 | unsigned int trig_mode = icr_low & APIC_INT_LEVELTRIG; | ||
465 | unsigned int level = icr_low & APIC_INT_ASSERT; | ||
466 | unsigned int dest_mode = icr_low & APIC_DEST_MASK; | ||
467 | unsigned int delivery_mode = icr_low & APIC_MODE_MASK; | ||
468 | unsigned int vector = icr_low & APIC_VECTOR_MASK; | ||
469 | |||
470 | struct kvm_vcpu *target; | ||
471 | struct kvm_vcpu *vcpu; | ||
472 | unsigned long lpr_map = 0; | ||
473 | int i; | ||
474 | |||
475 | apic_debug("icr_high 0x%x, icr_low 0x%x, " | ||
476 | "short_hand 0x%x, dest 0x%x, trig_mode 0x%x, level 0x%x, " | ||
477 | "dest_mode 0x%x, delivery_mode 0x%x, vector 0x%x\n", | ||
478 | icr_high, icr_low, short_hand, dest, | ||
479 | trig_mode, level, dest_mode, delivery_mode, vector); | ||
480 | |||
481 | for (i = 0; i < KVM_MAX_VCPUS; i++) { | ||
482 | vcpu = apic->vcpu->kvm->vcpus[i]; | ||
483 | if (!vcpu) | ||
484 | continue; | ||
485 | |||
486 | if (vcpu->arch.apic && | ||
487 | apic_match_dest(vcpu, apic, short_hand, dest, dest_mode)) { | ||
488 | if (delivery_mode == APIC_DM_LOWEST) | ||
489 | set_bit(vcpu->vcpu_id, &lpr_map); | ||
490 | else | ||
491 | __apic_accept_irq(vcpu->arch.apic, delivery_mode, | ||
492 | vector, level, trig_mode); | ||
493 | } | ||
494 | } | ||
495 | |||
496 | if (delivery_mode == APIC_DM_LOWEST) { | ||
497 | target = kvm_get_lowest_prio_vcpu(vcpu->kvm, vector, lpr_map); | ||
498 | if (target != NULL) | ||
499 | __apic_accept_irq(target->arch.apic, delivery_mode, | ||
500 | vector, level, trig_mode); | ||
501 | } | ||
502 | } | ||
503 | |||
504 | static u32 apic_get_tmcct(struct kvm_lapic *apic) | ||
505 | { | ||
506 | u64 counter_passed; | ||
507 | ktime_t passed, now; | ||
508 | u32 tmcct; | ||
509 | |||
510 | ASSERT(apic != NULL); | ||
511 | |||
512 | now = apic->timer.dev.base->get_time(); | ||
513 | tmcct = apic_get_reg(apic, APIC_TMICT); | ||
514 | |||
515 | /* if initial count is 0, current count should also be 0 */ | ||
516 | if (tmcct == 0) | ||
517 | return 0; | ||
518 | |||
519 | if (unlikely(ktime_to_ns(now) <= | ||
520 | ktime_to_ns(apic->timer.last_update))) { | ||
521 | /* Wrap around */ | ||
522 | passed = ktime_add(( { | ||
523 | (ktime_t) { | ||
524 | .tv64 = KTIME_MAX - | ||
525 | (apic->timer.last_update).tv64}; } | ||
526 | ), now); | ||
527 | apic_debug("time elapsed\n"); | ||
528 | } else | ||
529 | passed = ktime_sub(now, apic->timer.last_update); | ||
530 | |||
531 | counter_passed = div64_64(ktime_to_ns(passed), | ||
532 | (APIC_BUS_CYCLE_NS * apic->timer.divide_count)); | ||
533 | |||
534 | if (counter_passed > tmcct) { | ||
535 | if (unlikely(!apic_lvtt_period(apic))) { | ||
536 | /* one-shot timers stick at 0 until reset */ | ||
537 | tmcct = 0; | ||
538 | } else { | ||
539 | /* | ||
540 | * periodic timers reset to APIC_TMICT when they | ||
541 | * hit 0. The while loop simulates this happening N | ||
542 | * times. (counter_passed %= tmcct) would also work, | ||
543 | * but might be slower, and a u64 modulo needs do_div() on 32-bit hosts. | ||
544 | */ | ||
545 | while (counter_passed > tmcct) | ||
546 | counter_passed -= tmcct; | ||
547 | tmcct -= counter_passed; | ||
548 | } | ||
549 | } else { | ||
550 | tmcct -= counter_passed; | ||
551 | } | ||
552 | |||
553 | return tmcct; | ||
554 | } | ||
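/*
 * A worked sketch of the countdown arithmetic above (hypothetical helper,
 * periodic case only): with APIC_BUS_CYCLE_NS == 1, divide count D and
 * initial count T, the current count after E elapsed nanoseconds mirrors
 * the wrap-around loop in apic_get_tmcct().
 */
static inline u32 apic_tmcct_sketch(u64 elapsed_ns, u32 divide_count, u32 tmict)
{
	u64 ticks = elapsed_ns / divide_count;

	if (!tmict)
		return 0;		/* initial count 0 => current count 0 */
	while (ticks > tmict)		/* periodic wrap-around, as above */
		ticks -= tmict;
	return tmict - ticks;		/* e.g. T=100000, D=2, E=350000ns -> 25000 */
}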
555 | |||
556 | static u32 __apic_read(struct kvm_lapic *apic, unsigned int offset) | ||
557 | { | ||
558 | u32 val = 0; | ||
559 | |||
560 | if (offset >= LAPIC_MMIO_LENGTH) | ||
561 | return 0; | ||
562 | |||
563 | switch (offset) { | ||
564 | case APIC_ARBPRI: | ||
565 | printk(KERN_WARNING "Access APIC ARBPRI register " | ||
566 | "which is for P6\n"); | ||
567 | break; | ||
568 | |||
569 | case APIC_TMCCT: /* Timer CCR */ | ||
570 | val = apic_get_tmcct(apic); | ||
571 | break; | ||
572 | |||
573 | default: | ||
574 | apic_update_ppr(apic); | ||
575 | val = apic_get_reg(apic, offset); | ||
576 | break; | ||
577 | } | ||
578 | |||
579 | return val; | ||
580 | } | ||
581 | |||
582 | static void apic_mmio_read(struct kvm_io_device *this, | ||
583 | gpa_t address, int len, void *data) | ||
584 | { | ||
585 | struct kvm_lapic *apic = (struct kvm_lapic *)this->private; | ||
586 | unsigned int offset = address - apic->base_address; | ||
587 | unsigned char alignment = offset & 0xf; | ||
588 | u32 result; | ||
589 | |||
590 | if ((alignment + len) > 4) { | ||
591 | printk(KERN_ERR "KVM_APIC_READ: alignment error %lx %d", | ||
592 | (unsigned long)address, len); | ||
593 | return; | ||
594 | } | ||
595 | result = __apic_read(apic, offset & ~0xf); | ||
596 | |||
597 | switch (len) { | ||
598 | case 1: | ||
599 | case 2: | ||
600 | case 4: | ||
601 | memcpy(data, (char *)&result + alignment, len); | ||
602 | break; | ||
603 | default: | ||
604 | printk(KERN_ERR "Local APIC read with len = %x, " | ||
605 | "should be 1,2, or 4 instead\n", len); | ||
606 | break; | ||
607 | } | ||
608 | } | ||
609 | |||
610 | static void update_divide_count(struct kvm_lapic *apic) | ||
611 | { | ||
612 | u32 tmp1, tmp2, tdcr; | ||
613 | |||
614 | tdcr = apic_get_reg(apic, APIC_TDCR); | ||
615 | tmp1 = tdcr & 0xf; | ||
616 | tmp2 = ((tmp1 & 0x3) | ((tmp1 & 0x8) >> 1)) + 1; | ||
617 | apic->timer.divide_count = 0x1 << (tmp2 & 0x7); | ||
618 | |||
619 | apic_debug("timer divide count is 0x%x\n", | ||
620 | apic->timer.divide_count); | ||
621 | } | ||
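/*
 * A compact sketch (hypothetical helper name) of the APIC_TDCR decode
 * performed above: bits 0, 1 and 3 select the divider, bit 2 is reserved.
 * Examples: tdcr 0x0 -> divide by 2, 0x3 -> divide by 16, 0xb -> divide by 1.
 */
static inline u32 apic_tdcr_to_divide_count(u32 tdcr)
{
	u32 bits = ((tdcr & 0x3) | ((tdcr & 0x8) >> 1)) + 1;

	return 1 << (bits & 0x7);
}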
622 | |||
623 | static void start_apic_timer(struct kvm_lapic *apic) | ||
624 | { | ||
625 | ktime_t now = apic->timer.dev.base->get_time(); | ||
626 | |||
627 | apic->timer.last_update = now; | ||
628 | |||
629 | apic->timer.period = apic_get_reg(apic, APIC_TMICT) * | ||
630 | APIC_BUS_CYCLE_NS * apic->timer.divide_count; | ||
631 | atomic_set(&apic->timer.pending, 0); | ||
632 | hrtimer_start(&apic->timer.dev, | ||
633 | ktime_add_ns(now, apic->timer.period), | ||
634 | HRTIMER_MODE_ABS); | ||
635 | |||
636 | apic_debug("%s: bus cycle is %" PRId64 "ns, now 0x%016" | ||
637 | PRIx64 ", " | ||
638 | "timer initial count 0x%x, period %lldns, " | ||
639 | "expire @ 0x%016" PRIx64 ".\n", __FUNCTION__, | ||
640 | APIC_BUS_CYCLE_NS, ktime_to_ns(now), | ||
641 | apic_get_reg(apic, APIC_TMICT), | ||
642 | apic->timer.period, | ||
643 | ktime_to_ns(ktime_add_ns(now, | ||
644 | apic->timer.period))); | ||
645 | } | ||
646 | |||
647 | static void apic_mmio_write(struct kvm_io_device *this, | ||
648 | gpa_t address, int len, const void *data) | ||
649 | { | ||
650 | struct kvm_lapic *apic = (struct kvm_lapic *)this->private; | ||
651 | unsigned int offset = address - apic->base_address; | ||
652 | unsigned char alignment = offset & 0xf; | ||
653 | u32 val; | ||
654 | |||
655 | /* | ||
656 | * APIC registers must be aligned on a 128-bit boundary; | ||
657 | * 32/64/128-bit registers must be accessed as aligned 32-bit words. | ||
658 | * See Intel SDM section 8.4.1. | ||
659 | */ | ||
660 | if (len != 4 || alignment) { | ||
661 | if (printk_ratelimit()) | ||
662 | printk(KERN_ERR "apic write: bad size=%d %lx\n", | ||
663 | len, (long)address); | ||
664 | return; | ||
665 | } | ||
666 | |||
667 | val = *(u32 *) data; | ||
668 | |||
669 | /* EOI writes are too frequent to be worth logging */ | ||
670 | if (offset != APIC_EOI) | ||
671 | apic_debug("%s: offset 0x%x with length 0x%x, and value is " | ||
672 | "0x%x\n", __FUNCTION__, offset, len, val); | ||
673 | |||
674 | offset &= 0xff0; | ||
675 | |||
676 | switch (offset) { | ||
677 | case APIC_ID: /* Local APIC ID */ | ||
678 | apic_set_reg(apic, APIC_ID, val); | ||
679 | break; | ||
680 | |||
681 | case APIC_TASKPRI: | ||
682 | apic_set_tpr(apic, val & 0xff); | ||
683 | break; | ||
684 | |||
685 | case APIC_EOI: | ||
686 | apic_set_eoi(apic); | ||
687 | break; | ||
688 | |||
689 | case APIC_LDR: | ||
690 | apic_set_reg(apic, APIC_LDR, val & APIC_LDR_MASK); | ||
691 | break; | ||
692 | |||
693 | case APIC_DFR: | ||
694 | apic_set_reg(apic, APIC_DFR, val | 0x0FFFFFFF); | ||
695 | break; | ||
696 | |||
697 | case APIC_SPIV: | ||
698 | apic_set_reg(apic, APIC_SPIV, val & 0x3ff); | ||
699 | if (!(val & APIC_SPIV_APIC_ENABLED)) { | ||
700 | int i; | ||
701 | u32 lvt_val; | ||
702 | |||
703 | for (i = 0; i < APIC_LVT_NUM; i++) { | ||
704 | lvt_val = apic_get_reg(apic, | ||
705 | APIC_LVTT + 0x10 * i); | ||
706 | apic_set_reg(apic, APIC_LVTT + 0x10 * i, | ||
707 | lvt_val | APIC_LVT_MASKED); | ||
708 | } | ||
709 | atomic_set(&apic->timer.pending, 0); | ||
710 | |||
711 | } | ||
712 | break; | ||
713 | |||
714 | case APIC_ICR: | ||
715 | /* No delay here, so we always clear the pending bit */ | ||
716 | apic_set_reg(apic, APIC_ICR, val & ~(1 << 12)); | ||
717 | apic_send_ipi(apic); | ||
718 | break; | ||
719 | |||
720 | case APIC_ICR2: | ||
721 | apic_set_reg(apic, APIC_ICR2, val & 0xff000000); | ||
722 | break; | ||
723 | |||
724 | case APIC_LVTT: | ||
725 | case APIC_LVTTHMR: | ||
726 | case APIC_LVTPC: | ||
727 | case APIC_LVT0: | ||
728 | case APIC_LVT1: | ||
729 | case APIC_LVTERR: | ||
730 | /* TODO: Check vector */ | ||
731 | if (!apic_sw_enabled(apic)) | ||
732 | val |= APIC_LVT_MASKED; | ||
733 | |||
734 | val &= apic_lvt_mask[(offset - APIC_LVTT) >> 4]; | ||
735 | apic_set_reg(apic, offset, val); | ||
736 | |||
737 | break; | ||
738 | |||
739 | case APIC_TMICT: | ||
740 | hrtimer_cancel(&apic->timer.dev); | ||
741 | apic_set_reg(apic, APIC_TMICT, val); | ||
742 | start_apic_timer(apic); | ||
743 | return; | ||
744 | |||
745 | case APIC_TDCR: | ||
746 | if (val & 4) | ||
747 | printk(KERN_ERR "KVM_WRITE:TDCR %x\n", val); | ||
748 | apic_set_reg(apic, APIC_TDCR, val); | ||
749 | update_divide_count(apic); | ||
750 | break; | ||
751 | |||
752 | default: | ||
753 | apic_debug("Local APIC Write to read-only register %x\n", | ||
754 | offset); | ||
755 | break; | ||
756 | } | ||
757 | |||
758 | } | ||
759 | |||
760 | static int apic_mmio_range(struct kvm_io_device *this, gpa_t addr) | ||
761 | { | ||
762 | struct kvm_lapic *apic = (struct kvm_lapic *)this->private; | ||
763 | int ret = 0; | ||
764 | |||
765 | |||
766 | if (apic_hw_enabled(apic) && | ||
767 | (addr >= apic->base_address) && | ||
768 | (addr < (apic->base_address + LAPIC_MMIO_LENGTH))) | ||
769 | ret = 1; | ||
770 | |||
771 | return ret; | ||
772 | } | ||
773 | |||
774 | void kvm_free_lapic(struct kvm_vcpu *vcpu) | ||
775 | { | ||
776 | if (!vcpu->arch.apic) | ||
777 | return; | ||
778 | |||
779 | hrtimer_cancel(&vcpu->arch.apic->timer.dev); | ||
780 | |||
781 | if (vcpu->arch.apic->regs_page) | ||
782 | __free_page(vcpu->arch.apic->regs_page); | ||
783 | |||
784 | kfree(vcpu->arch.apic); | ||
785 | } | ||
786 | |||
787 | /* | ||
788 | *---------------------------------------------------------------------- | ||
789 | * LAPIC interface | ||
790 | *---------------------------------------------------------------------- | ||
791 | */ | ||
792 | |||
793 | void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8) | ||
794 | { | ||
795 | struct kvm_lapic *apic = vcpu->arch.apic; | ||
796 | |||
797 | if (!apic) | ||
798 | return; | ||
799 | apic_set_tpr(apic, ((cr8 & 0x0f) << 4)); | ||
800 | } | ||
801 | |||
802 | u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu) | ||
803 | { | ||
804 | struct kvm_lapic *apic = vcpu->arch.apic; | ||
805 | u64 tpr; | ||
806 | |||
807 | if (!apic) | ||
808 | return 0; | ||
809 | tpr = (u64) apic_get_reg(apic, APIC_TASKPRI); | ||
810 | |||
811 | return (tpr & 0xf0) >> 4; | ||
812 | } | ||
813 | EXPORT_SYMBOL_GPL(kvm_lapic_get_cr8); | ||
814 | |||
815 | void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) | ||
816 | { | ||
817 | struct kvm_lapic *apic = vcpu->arch.apic; | ||
818 | |||
819 | if (!apic) { | ||
820 | value |= MSR_IA32_APICBASE_BSP; | ||
821 | vcpu->arch.apic_base = value; | ||
822 | return; | ||
823 | } | ||
824 | if (apic->vcpu->vcpu_id) | ||
825 | value &= ~MSR_IA32_APICBASE_BSP; | ||
826 | |||
827 | vcpu->arch.apic_base = value; | ||
828 | apic->base_address = apic->vcpu->arch.apic_base & | ||
829 | MSR_IA32_APICBASE_BASE; | ||
830 | |||
831 | /* with FSB interrupt delivery, APIC functionality can be restarted */ | ||
832 | apic_debug("apic base msr is 0x%016" PRIx64 ", and base address is " | ||
833 | "0x%lx.\n", apic->vcpu->arch.apic_base, apic->base_address); | ||
834 | |||
835 | } | ||
836 | |||
837 | u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu) | ||
838 | { | ||
839 | return vcpu->arch.apic_base; | ||
840 | } | ||
841 | EXPORT_SYMBOL_GPL(kvm_lapic_get_base); | ||
842 | |||
843 | void kvm_lapic_reset(struct kvm_vcpu *vcpu) | ||
844 | { | ||
845 | struct kvm_lapic *apic; | ||
846 | int i; | ||
847 | |||
848 | apic_debug("%s\n", __FUNCTION__); | ||
849 | |||
850 | ASSERT(vcpu); | ||
851 | apic = vcpu->arch.apic; | ||
852 | ASSERT(apic != NULL); | ||
853 | |||
854 | /* Stop the timer in case it's a reset to an active apic */ | ||
855 | hrtimer_cancel(&apic->timer.dev); | ||
856 | |||
857 | apic_set_reg(apic, APIC_ID, vcpu->vcpu_id << 24); | ||
858 | apic_set_reg(apic, APIC_LVR, APIC_VERSION); | ||
859 | |||
860 | for (i = 0; i < APIC_LVT_NUM; i++) | ||
861 | apic_set_reg(apic, APIC_LVTT + 0x10 * i, APIC_LVT_MASKED); | ||
862 | apic_set_reg(apic, APIC_LVT0, | ||
863 | SET_APIC_DELIVERY_MODE(0, APIC_MODE_EXTINT)); | ||
864 | |||
865 | apic_set_reg(apic, APIC_DFR, 0xffffffffU); | ||
866 | apic_set_reg(apic, APIC_SPIV, 0xff); | ||
867 | apic_set_reg(apic, APIC_TASKPRI, 0); | ||
868 | apic_set_reg(apic, APIC_LDR, 0); | ||
869 | apic_set_reg(apic, APIC_ESR, 0); | ||
870 | apic_set_reg(apic, APIC_ICR, 0); | ||
871 | apic_set_reg(apic, APIC_ICR2, 0); | ||
872 | apic_set_reg(apic, APIC_TDCR, 0); | ||
873 | apic_set_reg(apic, APIC_TMICT, 0); | ||
874 | for (i = 0; i < 8; i++) { | ||
875 | apic_set_reg(apic, APIC_IRR + 0x10 * i, 0); | ||
876 | apic_set_reg(apic, APIC_ISR + 0x10 * i, 0); | ||
877 | apic_set_reg(apic, APIC_TMR + 0x10 * i, 0); | ||
878 | } | ||
879 | update_divide_count(apic); | ||
880 | atomic_set(&apic->timer.pending, 0); | ||
881 | if (vcpu->vcpu_id == 0) | ||
882 | vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP; | ||
883 | apic_update_ppr(apic); | ||
884 | |||
885 | apic_debug(KERN_INFO "%s: vcpu=%p, id=%d, base_msr=" | ||
886 | "0x%016" PRIx64 ", base_address=0x%0lx.\n", __FUNCTION__, | ||
887 | vcpu, kvm_apic_id(apic), | ||
888 | vcpu->arch.apic_base, apic->base_address); | ||
889 | } | ||
890 | EXPORT_SYMBOL_GPL(kvm_lapic_reset); | ||
891 | |||
892 | int kvm_lapic_enabled(struct kvm_vcpu *vcpu) | ||
893 | { | ||
894 | struct kvm_lapic *apic = vcpu->arch.apic; | ||
895 | int ret = 0; | ||
896 | |||
897 | if (!apic) | ||
898 | return 0; | ||
899 | ret = apic_enabled(apic); | ||
900 | |||
901 | return ret; | ||
902 | } | ||
903 | EXPORT_SYMBOL_GPL(kvm_lapic_enabled); | ||
904 | |||
905 | /* | ||
906 | *---------------------------------------------------------------------- | ||
907 | * timer interface | ||
908 | *---------------------------------------------------------------------- | ||
909 | */ | ||
910 | |||
911 | /* TODO: make sure __apic_timer_fn runs on the current pCPU */ | ||
912 | static int __apic_timer_fn(struct kvm_lapic *apic) | ||
913 | { | ||
914 | int result = 0; | ||
915 | wait_queue_head_t *q = &apic->vcpu->wq; | ||
916 | |||
917 | atomic_inc(&apic->timer.pending); | ||
918 | if (waitqueue_active(q)) { | ||
919 | apic->vcpu->arch.mp_state = VCPU_MP_STATE_RUNNABLE; | ||
920 | wake_up_interruptible(q); | ||
921 | } | ||
922 | if (apic_lvtt_period(apic)) { | ||
923 | result = 1; | ||
924 | apic->timer.dev.expires = ktime_add_ns( | ||
925 | apic->timer.dev.expires, | ||
926 | apic->timer.period); | ||
927 | } | ||
928 | return result; | ||
929 | } | ||
930 | |||
931 | static int __inject_apic_timer_irq(struct kvm_lapic *apic) | ||
932 | { | ||
933 | int vector; | ||
934 | |||
935 | vector = apic_lvt_vector(apic, APIC_LVTT); | ||
936 | return __apic_accept_irq(apic, APIC_DM_FIXED, vector, 1, 0); | ||
937 | } | ||
938 | |||
939 | static enum hrtimer_restart apic_timer_fn(struct hrtimer *data) | ||
940 | { | ||
941 | struct kvm_lapic *apic; | ||
942 | int restart_timer = 0; | ||
943 | |||
944 | apic = container_of(data, struct kvm_lapic, timer.dev); | ||
945 | |||
946 | restart_timer = __apic_timer_fn(apic); | ||
947 | |||
948 | if (restart_timer) | ||
949 | return HRTIMER_RESTART; | ||
950 | else | ||
951 | return HRTIMER_NORESTART; | ||
952 | } | ||
953 | |||
954 | int kvm_create_lapic(struct kvm_vcpu *vcpu) | ||
955 | { | ||
956 | struct kvm_lapic *apic; | ||
957 | |||
958 | ASSERT(vcpu != NULL); | ||
959 | apic_debug("apic_init %d\n", vcpu->vcpu_id); | ||
960 | |||
961 | apic = kzalloc(sizeof(*apic), GFP_KERNEL); | ||
962 | if (!apic) | ||
963 | goto nomem; | ||
964 | |||
965 | vcpu->arch.apic = apic; | ||
966 | |||
967 | apic->regs_page = alloc_page(GFP_KERNEL); | ||
968 | if (apic->regs_page == NULL) { | ||
969 | printk(KERN_ERR "malloc apic regs error for vcpu %x\n", | ||
970 | vcpu->vcpu_id); | ||
971 | goto nomem_free_apic; | ||
972 | } | ||
973 | apic->regs = page_address(apic->regs_page); | ||
974 | memset(apic->regs, 0, PAGE_SIZE); | ||
975 | apic->vcpu = vcpu; | ||
976 | |||
977 | hrtimer_init(&apic->timer.dev, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); | ||
978 | apic->timer.dev.function = apic_timer_fn; | ||
979 | apic->base_address = APIC_DEFAULT_PHYS_BASE; | ||
980 | vcpu->arch.apic_base = APIC_DEFAULT_PHYS_BASE; | ||
981 | |||
982 | kvm_lapic_reset(vcpu); | ||
983 | apic->dev.read = apic_mmio_read; | ||
984 | apic->dev.write = apic_mmio_write; | ||
985 | apic->dev.in_range = apic_mmio_range; | ||
986 | apic->dev.private = apic; | ||
987 | |||
988 | return 0; | ||
989 | nomem_free_apic: | ||
990 | kfree(apic); | ||
991 | nomem: | ||
992 | return -ENOMEM; | ||
993 | } | ||
994 | EXPORT_SYMBOL_GPL(kvm_create_lapic); | ||
995 | |||
996 | int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu) | ||
997 | { | ||
998 | struct kvm_lapic *apic = vcpu->arch.apic; | ||
999 | int highest_irr; | ||
1000 | |||
1001 | if (!apic || !apic_enabled(apic)) | ||
1002 | return -1; | ||
1003 | |||
1004 | apic_update_ppr(apic); | ||
1005 | highest_irr = apic_find_highest_irr(apic); | ||
1006 | if ((highest_irr == -1) || | ||
1007 | ((highest_irr & 0xF0) <= apic_get_reg(apic, APIC_PROCPRI))) | ||
1008 | return -1; | ||
1009 | return highest_irr; | ||
1010 | } | ||
1011 | |||
1012 | int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu) | ||
1013 | { | ||
1014 | u32 lvt0 = apic_get_reg(vcpu->arch.apic, APIC_LVT0); | ||
1015 | int r = 0; | ||
1016 | |||
1017 | if (vcpu->vcpu_id == 0) { | ||
1018 | if (!apic_hw_enabled(vcpu->arch.apic)) | ||
1019 | r = 1; | ||
1020 | if ((lvt0 & APIC_LVT_MASKED) == 0 && | ||
1021 | GET_APIC_DELIVERY_MODE(lvt0) == APIC_MODE_EXTINT) | ||
1022 | r = 1; | ||
1023 | } | ||
1024 | return r; | ||
1025 | } | ||
1026 | |||
1027 | void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu) | ||
1028 | { | ||
1029 | struct kvm_lapic *apic = vcpu->arch.apic; | ||
1030 | |||
1031 | if (apic && apic_lvt_enabled(apic, APIC_LVTT) && | ||
1032 | atomic_read(&apic->timer.pending) > 0) { | ||
1033 | if (__inject_apic_timer_irq(apic)) | ||
1034 | atomic_dec(&apic->timer.pending); | ||
1035 | } | ||
1036 | } | ||
1037 | |||
1038 | void kvm_apic_timer_intr_post(struct kvm_vcpu *vcpu, int vec) | ||
1039 | { | ||
1040 | struct kvm_lapic *apic = vcpu->arch.apic; | ||
1041 | |||
1042 | if (apic && apic_lvt_vector(apic, APIC_LVTT) == vec) | ||
1043 | apic->timer.last_update = ktime_add_ns( | ||
1044 | apic->timer.last_update, | ||
1045 | apic->timer.period); | ||
1046 | } | ||
1047 | |||
1048 | int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu) | ||
1049 | { | ||
1050 | int vector = kvm_apic_has_interrupt(vcpu); | ||
1051 | struct kvm_lapic *apic = vcpu->arch.apic; | ||
1052 | |||
1053 | if (vector == -1) | ||
1054 | return -1; | ||
1055 | |||
1056 | apic_set_vector(vector, apic->regs + APIC_ISR); | ||
1057 | apic_update_ppr(apic); | ||
1058 | apic_clear_irr(vector, apic); | ||
1059 | return vector; | ||
1060 | } | ||
1061 | |||
1062 | void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu) | ||
1063 | { | ||
1064 | struct kvm_lapic *apic = vcpu->arch.apic; | ||
1065 | |||
1066 | apic->base_address = vcpu->arch.apic_base & | ||
1067 | MSR_IA32_APICBASE_BASE; | ||
1068 | apic_set_reg(apic, APIC_LVR, APIC_VERSION); | ||
1069 | apic_update_ppr(apic); | ||
1070 | hrtimer_cancel(&apic->timer.dev); | ||
1071 | update_divide_count(apic); | ||
1072 | start_apic_timer(apic); | ||
1073 | } | ||
1074 | |||
1075 | void kvm_migrate_apic_timer(struct kvm_vcpu *vcpu) | ||
1076 | { | ||
1077 | struct kvm_lapic *apic = vcpu->arch.apic; | ||
1078 | struct hrtimer *timer; | ||
1079 | |||
1080 | if (!apic) | ||
1081 | return; | ||
1082 | |||
1083 | timer = &apic->timer.dev; | ||
1084 | if (hrtimer_cancel(timer)) | ||
1085 | hrtimer_start(timer, timer->expires, HRTIMER_MODE_ABS); | ||
1086 | } | ||
1087 | EXPORT_SYMBOL_GPL(kvm_migrate_apic_timer); | ||
diff --git a/drivers/kvm/mmu.c b/drivers/kvm/mmu.c deleted file mode 100644 index c26d83f86a3a..000000000000 --- a/drivers/kvm/mmu.c +++ /dev/null | |||
@@ -1,1806 +0,0 @@ | |||
1 | /* | ||
2 | * Kernel-based Virtual Machine driver for Linux | ||
3 | * | ||
4 | * This module enables machines with Intel VT-x extensions to run virtual | ||
5 | * machines without emulation or binary translation. | ||
6 | * | ||
7 | * MMU support | ||
8 | * | ||
9 | * Copyright (C) 2006 Qumranet, Inc. | ||
10 | * | ||
11 | * Authors: | ||
12 | * Yaniv Kamay <yaniv@qumranet.com> | ||
13 | * Avi Kivity <avi@qumranet.com> | ||
14 | * | ||
15 | * This work is licensed under the terms of the GNU GPL, version 2. See | ||
16 | * the COPYING file in the top-level directory. | ||
17 | * | ||
18 | */ | ||
19 | |||
20 | #include "vmx.h" | ||
21 | #include "kvm.h" | ||
22 | #include "x86.h" | ||
23 | #include "mmu.h" | ||
24 | |||
25 | #include <linux/types.h> | ||
26 | #include <linux/string.h> | ||
27 | #include <linux/mm.h> | ||
28 | #include <linux/highmem.h> | ||
29 | #include <linux/module.h> | ||
30 | #include <linux/swap.h> | ||
31 | |||
32 | #include <asm/page.h> | ||
33 | #include <asm/cmpxchg.h> | ||
34 | #include <asm/io.h> | ||
35 | |||
36 | #undef MMU_DEBUG | ||
37 | |||
38 | #undef AUDIT | ||
39 | |||
40 | #ifdef AUDIT | ||
41 | static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg); | ||
42 | #else | ||
43 | static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) {} | ||
44 | #endif | ||
45 | |||
46 | #ifdef MMU_DEBUG | ||
47 | |||
48 | #define pgprintk(x...) do { if (dbg) printk(x); } while (0) | ||
49 | #define rmap_printk(x...) do { if (dbg) printk(x); } while (0) | ||
50 | |||
51 | #else | ||
52 | |||
53 | #define pgprintk(x...) do { } while (0) | ||
54 | #define rmap_printk(x...) do { } while (0) | ||
55 | |||
56 | #endif | ||
57 | |||
58 | #if defined(MMU_DEBUG) || defined(AUDIT) | ||
59 | static int dbg = 1; | ||
60 | #endif | ||
61 | |||
62 | #ifndef MMU_DEBUG | ||
63 | #define ASSERT(x) do { } while (0) | ||
64 | #else | ||
65 | #define ASSERT(x) \ | ||
66 | if (!(x)) { \ | ||
67 | printk(KERN_WARNING "assertion failed %s:%d: %s\n", \ | ||
68 | __FILE__, __LINE__, #x); \ | ||
69 | } | ||
70 | #endif | ||
71 | |||
72 | #define PT64_PT_BITS 9 | ||
73 | #define PT64_ENT_PER_PAGE (1 << PT64_PT_BITS) | ||
74 | #define PT32_PT_BITS 10 | ||
75 | #define PT32_ENT_PER_PAGE (1 << PT32_PT_BITS) | ||
76 | |||
77 | #define PT_WRITABLE_SHIFT 1 | ||
78 | |||
79 | #define PT_PRESENT_MASK (1ULL << 0) | ||
80 | #define PT_WRITABLE_MASK (1ULL << PT_WRITABLE_SHIFT) | ||
81 | #define PT_USER_MASK (1ULL << 2) | ||
82 | #define PT_PWT_MASK (1ULL << 3) | ||
83 | #define PT_PCD_MASK (1ULL << 4) | ||
84 | #define PT_ACCESSED_MASK (1ULL << 5) | ||
85 | #define PT_DIRTY_MASK (1ULL << 6) | ||
86 | #define PT_PAGE_SIZE_MASK (1ULL << 7) | ||
87 | #define PT_PAT_MASK (1ULL << 7) | ||
88 | #define PT_GLOBAL_MASK (1ULL << 8) | ||
89 | #define PT64_NX_SHIFT 63 | ||
90 | #define PT64_NX_MASK (1ULL << PT64_NX_SHIFT) | ||
91 | |||
92 | #define PT_PAT_SHIFT 7 | ||
93 | #define PT_DIR_PAT_SHIFT 12 | ||
94 | #define PT_DIR_PAT_MASK (1ULL << PT_DIR_PAT_SHIFT) | ||
95 | |||
96 | #define PT32_DIR_PSE36_SIZE 4 | ||
97 | #define PT32_DIR_PSE36_SHIFT 13 | ||
98 | #define PT32_DIR_PSE36_MASK \ | ||
99 | (((1ULL << PT32_DIR_PSE36_SIZE) - 1) << PT32_DIR_PSE36_SHIFT) | ||
100 | |||
101 | |||
102 | #define PT_FIRST_AVAIL_BITS_SHIFT 9 | ||
103 | #define PT64_SECOND_AVAIL_BITS_SHIFT 52 | ||
104 | |||
105 | #define PT_SHADOW_IO_MARK (1ULL << PT_FIRST_AVAIL_BITS_SHIFT) | ||
106 | |||
107 | #define VALID_PAGE(x) ((x) != INVALID_PAGE) | ||
108 | |||
109 | #define PT64_LEVEL_BITS 9 | ||
110 | |||
111 | #define PT64_LEVEL_SHIFT(level) \ | ||
112 | (PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS) | ||
113 | |||
114 | #define PT64_LEVEL_MASK(level) \ | ||
115 | (((1ULL << PT64_LEVEL_BITS) - 1) << PT64_LEVEL_SHIFT(level)) | ||
116 | |||
117 | #define PT64_INDEX(address, level)\ | ||
118 | (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1)) | ||
119 | |||
120 | |||
121 | #define PT32_LEVEL_BITS 10 | ||
122 | |||
123 | #define PT32_LEVEL_SHIFT(level) \ | ||
124 | (PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS) | ||
125 | |||
126 | #define PT32_LEVEL_MASK(level) \ | ||
127 | (((1ULL << PT32_LEVEL_BITS) - 1) << PT32_LEVEL_SHIFT(level)) | ||
128 | |||
129 | #define PT32_INDEX(address, level)\ | ||
130 | (((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1)) | ||
131 | |||
132 | |||
133 | #define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1)) | ||
134 | #define PT64_DIR_BASE_ADDR_MASK \ | ||
135 | (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + PT64_LEVEL_BITS)) - 1)) | ||
136 | |||
137 | #define PT32_BASE_ADDR_MASK PAGE_MASK | ||
138 | #define PT32_DIR_BASE_ADDR_MASK \ | ||
139 | (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1)) | ||
140 | |||
141 | #define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \ | ||
142 | | PT64_NX_MASK) | ||
143 | |||
144 | #define PFERR_PRESENT_MASK (1U << 0) | ||
145 | #define PFERR_WRITE_MASK (1U << 1) | ||
146 | #define PFERR_USER_MASK (1U << 2) | ||
147 | #define PFERR_FETCH_MASK (1U << 4) | ||
148 | |||
149 | #define PT64_ROOT_LEVEL 4 | ||
150 | #define PT32_ROOT_LEVEL 2 | ||
151 | #define PT32E_ROOT_LEVEL 3 | ||
152 | |||
153 | #define PT_DIRECTORY_LEVEL 2 | ||
154 | #define PT_PAGE_TABLE_LEVEL 1 | ||
155 | |||
156 | #define RMAP_EXT 4 | ||
157 | |||
158 | #define ACC_EXEC_MASK 1 | ||
159 | #define ACC_WRITE_MASK PT_WRITABLE_MASK | ||
160 | #define ACC_USER_MASK PT_USER_MASK | ||
161 | #define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK) | ||
162 | |||
163 | struct kvm_rmap_desc { | ||
164 | u64 *shadow_ptes[RMAP_EXT]; | ||
165 | struct kvm_rmap_desc *more; | ||
166 | }; | ||
167 | |||
168 | static struct kmem_cache *pte_chain_cache; | ||
169 | static struct kmem_cache *rmap_desc_cache; | ||
170 | static struct kmem_cache *mmu_page_header_cache; | ||
171 | |||
172 | static u64 __read_mostly shadow_trap_nonpresent_pte; | ||
173 | static u64 __read_mostly shadow_notrap_nonpresent_pte; | ||
174 | |||
175 | void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte) | ||
176 | { | ||
177 | shadow_trap_nonpresent_pte = trap_pte; | ||
178 | shadow_notrap_nonpresent_pte = notrap_pte; | ||
179 | } | ||
180 | EXPORT_SYMBOL_GPL(kvm_mmu_set_nonpresent_ptes); | ||
181 | |||
182 | static int is_write_protection(struct kvm_vcpu *vcpu) | ||
183 | { | ||
184 | return vcpu->arch.cr0 & X86_CR0_WP; | ||
185 | } | ||
186 | |||
187 | static int is_cpuid_PSE36(void) | ||
188 | { | ||
189 | return 1; | ||
190 | } | ||
191 | |||
192 | static int is_nx(struct kvm_vcpu *vcpu) | ||
193 | { | ||
194 | return vcpu->arch.shadow_efer & EFER_NX; | ||
195 | } | ||
196 | |||
197 | static int is_present_pte(unsigned long pte) | ||
198 | { | ||
199 | return pte & PT_PRESENT_MASK; | ||
200 | } | ||
201 | |||
202 | static int is_shadow_present_pte(u64 pte) | ||
203 | { | ||
204 | pte &= ~PT_SHADOW_IO_MARK; | ||
205 | return pte != shadow_trap_nonpresent_pte | ||
206 | && pte != shadow_notrap_nonpresent_pte; | ||
207 | } | ||
208 | |||
209 | static int is_writeble_pte(unsigned long pte) | ||
210 | { | ||
211 | return pte & PT_WRITABLE_MASK; | ||
212 | } | ||
213 | |||
214 | static int is_dirty_pte(unsigned long pte) | ||
215 | { | ||
216 | return pte & PT_DIRTY_MASK; | ||
217 | } | ||
218 | |||
219 | static int is_io_pte(unsigned long pte) | ||
220 | { | ||
221 | return pte & PT_SHADOW_IO_MARK; | ||
222 | } | ||
223 | |||
224 | static int is_rmap_pte(u64 pte) | ||
225 | { | ||
226 | return pte != shadow_trap_nonpresent_pte | ||
227 | && pte != shadow_notrap_nonpresent_pte; | ||
228 | } | ||
229 | |||
230 | static gfn_t pse36_gfn_delta(u32 gpte) | ||
231 | { | ||
232 | int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT; | ||
233 | |||
234 | return (gpte & PT32_DIR_PSE36_MASK) << shift; | ||
235 | } | ||
236 | |||
237 | static void set_shadow_pte(u64 *sptep, u64 spte) | ||
238 | { | ||
239 | #ifdef CONFIG_X86_64 | ||
240 | set_64bit((unsigned long *)sptep, spte); | ||
241 | #else | ||
242 | set_64bit((unsigned long long *)sptep, spte); | ||
243 | #endif | ||
244 | } | ||
245 | |||
246 | static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, | ||
247 | struct kmem_cache *base_cache, int min) | ||
248 | { | ||
249 | void *obj; | ||
250 | |||
251 | if (cache->nobjs >= min) | ||
252 | return 0; | ||
253 | while (cache->nobjs < ARRAY_SIZE(cache->objects)) { | ||
254 | obj = kmem_cache_zalloc(base_cache, GFP_KERNEL); | ||
255 | if (!obj) | ||
256 | return -ENOMEM; | ||
257 | cache->objects[cache->nobjs++] = obj; | ||
258 | } | ||
259 | return 0; | ||
260 | } | ||
261 | |||
262 | static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc) | ||
263 | { | ||
264 | while (mc->nobjs) | ||
265 | kfree(mc->objects[--mc->nobjs]); | ||
266 | } | ||
267 | |||
268 | static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache, | ||
269 | int min) | ||
270 | { | ||
271 | struct page *page; | ||
272 | |||
273 | if (cache->nobjs >= min) | ||
274 | return 0; | ||
275 | while (cache->nobjs < ARRAY_SIZE(cache->objects)) { | ||
276 | page = alloc_page(GFP_KERNEL); | ||
277 | if (!page) | ||
278 | return -ENOMEM; | ||
279 | set_page_private(page, 0); | ||
280 | cache->objects[cache->nobjs++] = page_address(page); | ||
281 | } | ||
282 | return 0; | ||
283 | } | ||
284 | |||
285 | static void mmu_free_memory_cache_page(struct kvm_mmu_memory_cache *mc) | ||
286 | { | ||
287 | while (mc->nobjs) | ||
288 | free_page((unsigned long)mc->objects[--mc->nobjs]); | ||
289 | } | ||
290 | |||
291 | static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu) | ||
292 | { | ||
293 | int r; | ||
294 | |||
295 | kvm_mmu_free_some_pages(vcpu); | ||
296 | r = mmu_topup_memory_cache(&vcpu->arch.mmu_pte_chain_cache, | ||
297 | pte_chain_cache, 4); | ||
298 | if (r) | ||
299 | goto out; | ||
300 | r = mmu_topup_memory_cache(&vcpu->arch.mmu_rmap_desc_cache, | ||
301 | rmap_desc_cache, 1); | ||
302 | if (r) | ||
303 | goto out; | ||
304 | r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8); | ||
305 | if (r) | ||
306 | goto out; | ||
307 | r = mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache, | ||
308 | mmu_page_header_cache, 4); | ||
309 | out: | ||
310 | return r; | ||
311 | } | ||
312 | |||
313 | static void mmu_free_memory_caches(struct kvm_vcpu *vcpu) | ||
314 | { | ||
315 | mmu_free_memory_cache(&vcpu->arch.mmu_pte_chain_cache); | ||
316 | mmu_free_memory_cache(&vcpu->arch.mmu_rmap_desc_cache); | ||
317 | mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache); | ||
318 | mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache); | ||
319 | } | ||
320 | |||
321 | static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc, | ||
322 | size_t size) | ||
323 | { | ||
324 | void *p; | ||
325 | |||
326 | BUG_ON(!mc->nobjs); | ||
327 | p = mc->objects[--mc->nobjs]; | ||
328 | memset(p, 0, size); | ||
329 | return p; | ||
330 | } | ||
331 | |||
332 | static struct kvm_pte_chain *mmu_alloc_pte_chain(struct kvm_vcpu *vcpu) | ||
333 | { | ||
334 | return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_chain_cache, | ||
335 | sizeof(struct kvm_pte_chain)); | ||
336 | } | ||
337 | |||
338 | static void mmu_free_pte_chain(struct kvm_pte_chain *pc) | ||
339 | { | ||
340 | kfree(pc); | ||
341 | } | ||
342 | |||
343 | static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu) | ||
344 | { | ||
345 | return mmu_memory_cache_alloc(&vcpu->arch.mmu_rmap_desc_cache, | ||
346 | sizeof(struct kvm_rmap_desc)); | ||
347 | } | ||
348 | |||
349 | static void mmu_free_rmap_desc(struct kvm_rmap_desc *rd) | ||
350 | { | ||
351 | kfree(rd); | ||
352 | } | ||
353 | |||
354 | /* | ||
355 | * Take gfn and return the reverse mapping to it. | ||
356 |  * Note: gfn must be unaliased before this function is called | ||
357 | */ | ||
358 | |||
359 | static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn) | ||
360 | { | ||
361 | struct kvm_memory_slot *slot; | ||
362 | |||
363 | slot = gfn_to_memslot(kvm, gfn); | ||
364 | return &slot->rmap[gfn - slot->base_gfn]; | ||
365 | } | ||
366 | |||
367 | /* | ||
368 | * Reverse mapping data structures: | ||
369 | * | ||
370 |  * If rmapp bit zero is zero, then rmapp points to the shadow page table entry | ||
371 | * that points to page_address(page). | ||
372 | * | ||
373 |  * If rmapp bit zero is one, then (rmapp & ~1) points to a struct kvm_rmap_desc | ||
374 | * containing more mappings. | ||
375 | */ | ||
376 | static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) | ||
377 | { | ||
378 | struct kvm_mmu_page *sp; | ||
379 | struct kvm_rmap_desc *desc; | ||
380 | unsigned long *rmapp; | ||
381 | int i; | ||
382 | |||
383 | if (!is_rmap_pte(*spte)) | ||
384 | return; | ||
385 | gfn = unalias_gfn(vcpu->kvm, gfn); | ||
386 | sp = page_header(__pa(spte)); | ||
387 | sp->gfns[spte - sp->spt] = gfn; | ||
388 | rmapp = gfn_to_rmap(vcpu->kvm, gfn); | ||
389 | if (!*rmapp) { | ||
390 | rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte); | ||
391 | *rmapp = (unsigned long)spte; | ||
392 | } else if (!(*rmapp & 1)) { | ||
393 | rmap_printk("rmap_add: %p %llx 1->many\n", spte, *spte); | ||
394 | desc = mmu_alloc_rmap_desc(vcpu); | ||
395 | desc->shadow_ptes[0] = (u64 *)*rmapp; | ||
396 | desc->shadow_ptes[1] = spte; | ||
397 | *rmapp = (unsigned long)desc | 1; | ||
398 | } else { | ||
399 | rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte); | ||
400 | desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); | ||
401 | while (desc->shadow_ptes[RMAP_EXT-1] && desc->more) | ||
402 | desc = desc->more; | ||
403 | if (desc->shadow_ptes[RMAP_EXT-1]) { | ||
404 | desc->more = mmu_alloc_rmap_desc(vcpu); | ||
405 | desc = desc->more; | ||
406 | } | ||
407 | for (i = 0; desc->shadow_ptes[i]; ++i) | ||
408 | ; | ||
409 | desc->shadow_ptes[i] = spte; | ||
410 | } | ||
411 | } | ||
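/*
 * A minimal sketch (hypothetical helper name) of reading the rmap encoding
 * described above: bit zero of *rmapp distinguishes a single spte pointer
 * from a pointer to a kvm_rmap_desc chain, as rmap_next() below also does.
 */
static u64 *rmap_first_spte_sketch(unsigned long *rmapp)
{
	struct kvm_rmap_desc *desc;

	if (!*rmapp)
		return NULL;			/* no mappings for this gfn */
	if (!(*rmapp & 1))
		return (u64 *)*rmapp;		/* single spte, stored directly */
	desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
	return desc->shadow_ptes[0];		/* first entry of the chain */
}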
412 | |||
413 | static void rmap_desc_remove_entry(unsigned long *rmapp, | ||
414 | struct kvm_rmap_desc *desc, | ||
415 | int i, | ||
416 | struct kvm_rmap_desc *prev_desc) | ||
417 | { | ||
418 | int j; | ||
419 | |||
420 | for (j = RMAP_EXT - 1; !desc->shadow_ptes[j] && j > i; --j) | ||
421 | ; | ||
422 | desc->shadow_ptes[i] = desc->shadow_ptes[j]; | ||
423 | desc->shadow_ptes[j] = NULL; | ||
424 | if (j != 0) | ||
425 | return; | ||
426 | if (!prev_desc && !desc->more) | ||
427 | *rmapp = (unsigned long)desc->shadow_ptes[0]; | ||
428 | else | ||
429 | if (prev_desc) | ||
430 | prev_desc->more = desc->more; | ||
431 | else | ||
432 | *rmapp = (unsigned long)desc->more | 1; | ||
433 | mmu_free_rmap_desc(desc); | ||
434 | } | ||
435 | |||
436 | static void rmap_remove(struct kvm *kvm, u64 *spte) | ||
437 | { | ||
438 | struct kvm_rmap_desc *desc; | ||
439 | struct kvm_rmap_desc *prev_desc; | ||
440 | struct kvm_mmu_page *sp; | ||
441 | struct page *page; | ||
442 | unsigned long *rmapp; | ||
443 | int i; | ||
444 | |||
445 | if (!is_rmap_pte(*spte)) | ||
446 | return; | ||
447 | sp = page_header(__pa(spte)); | ||
448 | page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT); | ||
449 | mark_page_accessed(page); | ||
450 | if (is_writeble_pte(*spte)) | ||
451 | kvm_release_page_dirty(page); | ||
452 | else | ||
453 | kvm_release_page_clean(page); | ||
454 | rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt]); | ||
455 | if (!*rmapp) { | ||
456 | printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte); | ||
457 | BUG(); | ||
458 | } else if (!(*rmapp & 1)) { | ||
459 | rmap_printk("rmap_remove: %p %llx 1->0\n", spte, *spte); | ||
460 | if ((u64 *)*rmapp != spte) { | ||
461 | printk(KERN_ERR "rmap_remove: %p %llx 1->BUG\n", | ||
462 | spte, *spte); | ||
463 | BUG(); | ||
464 | } | ||
465 | *rmapp = 0; | ||
466 | } else { | ||
467 | rmap_printk("rmap_remove: %p %llx many->many\n", spte, *spte); | ||
468 | desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); | ||
469 | prev_desc = NULL; | ||
470 | while (desc) { | ||
471 | for (i = 0; i < RMAP_EXT && desc->shadow_ptes[i]; ++i) | ||
472 | if (desc->shadow_ptes[i] == spte) { | ||
473 | rmap_desc_remove_entry(rmapp, | ||
474 | desc, i, | ||
475 | prev_desc); | ||
476 | return; | ||
477 | } | ||
478 | prev_desc = desc; | ||
479 | desc = desc->more; | ||
480 | } | ||
481 | BUG(); | ||
482 | } | ||
483 | } | ||
484 | |||
485 | static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte) | ||
486 | { | ||
487 | struct kvm_rmap_desc *desc; | ||
488 | struct kvm_rmap_desc *prev_desc; | ||
489 | u64 *prev_spte; | ||
490 | int i; | ||
491 | |||
492 | if (!*rmapp) | ||
493 | return NULL; | ||
494 | else if (!(*rmapp & 1)) { | ||
495 | if (!spte) | ||
496 | return (u64 *)*rmapp; | ||
497 | return NULL; | ||
498 | } | ||
499 | desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); | ||
500 | prev_desc = NULL; | ||
501 | prev_spte = NULL; | ||
502 | while (desc) { | ||
503 | for (i = 0; i < RMAP_EXT && desc->shadow_ptes[i]; ++i) { | ||
504 | if (prev_spte == spte) | ||
505 | return desc->shadow_ptes[i]; | ||
506 | prev_spte = desc->shadow_ptes[i]; | ||
507 | } | ||
508 | desc = desc->more; | ||
509 | } | ||
510 | return NULL; | ||
511 | } | ||
512 | |||
513 | static void rmap_write_protect(struct kvm *kvm, u64 gfn) | ||
514 | { | ||
515 | unsigned long *rmapp; | ||
516 | u64 *spte; | ||
517 | |||
518 | gfn = unalias_gfn(kvm, gfn); | ||
519 | rmapp = gfn_to_rmap(kvm, gfn); | ||
520 | |||
521 | spte = rmap_next(kvm, rmapp, NULL); | ||
522 | while (spte) { | ||
523 | BUG_ON(!spte); | ||
524 | BUG_ON(!(*spte & PT_PRESENT_MASK)); | ||
525 | rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte); | ||
526 | if (is_writeble_pte(*spte)) | ||
527 | set_shadow_pte(spte, *spte & ~PT_WRITABLE_MASK); | ||
528 | kvm_flush_remote_tlbs(kvm); | ||
529 | spte = rmap_next(kvm, rmapp, spte); | ||
530 | } | ||
531 | } | ||
532 | |||
533 | #ifdef MMU_DEBUG | ||
534 | static int is_empty_shadow_page(u64 *spt) | ||
535 | { | ||
536 | u64 *pos; | ||
537 | u64 *end; | ||
538 | |||
539 | for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++) | ||
540 | if ((*pos & ~PT_SHADOW_IO_MARK) != shadow_trap_nonpresent_pte) { | ||
541 | printk(KERN_ERR "%s: %p %llx\n", __FUNCTION__, | ||
542 | pos, *pos); | ||
543 | return 0; | ||
544 | } | ||
545 | return 1; | ||
546 | } | ||
547 | #endif | ||
548 | |||
549 | static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp) | ||
550 | { | ||
551 | ASSERT(is_empty_shadow_page(sp->spt)); | ||
552 | list_del(&sp->link); | ||
553 | __free_page(virt_to_page(sp->spt)); | ||
554 | __free_page(virt_to_page(sp->gfns)); | ||
555 | kfree(sp); | ||
556 | ++kvm->arch.n_free_mmu_pages; | ||
557 | } | ||
558 | |||
559 | static unsigned kvm_page_table_hashfn(gfn_t gfn) | ||
560 | { | ||
561 | return gfn; | ||
562 | } | ||
563 | |||
564 | static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, | ||
565 | u64 *parent_pte) | ||
566 | { | ||
567 | struct kvm_mmu_page *sp; | ||
568 | |||
569 | if (!vcpu->kvm->arch.n_free_mmu_pages) | ||
570 | return NULL; | ||
571 | |||
572 | sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache, sizeof *sp); | ||
573 | sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE); | ||
574 | sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE); | ||
575 | set_page_private(virt_to_page(sp->spt), (unsigned long)sp); | ||
576 | list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages); | ||
577 | ASSERT(is_empty_shadow_page(sp->spt)); | ||
578 | sp->slot_bitmap = 0; | ||
579 | sp->multimapped = 0; | ||
580 | sp->parent_pte = parent_pte; | ||
581 | --vcpu->kvm->arch.n_free_mmu_pages; | ||
582 | return sp; | ||
583 | } | ||
584 | |||
585 | static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu, | ||
586 | struct kvm_mmu_page *sp, u64 *parent_pte) | ||
587 | { | ||
588 | struct kvm_pte_chain *pte_chain; | ||
589 | struct hlist_node *node; | ||
590 | int i; | ||
591 | |||
592 | if (!parent_pte) | ||
593 | return; | ||
594 | if (!sp->multimapped) { | ||
595 | u64 *old = sp->parent_pte; | ||
596 | |||
597 | if (!old) { | ||
598 | sp->parent_pte = parent_pte; | ||
599 | return; | ||
600 | } | ||
601 | sp->multimapped = 1; | ||
602 | pte_chain = mmu_alloc_pte_chain(vcpu); | ||
603 | INIT_HLIST_HEAD(&sp->parent_ptes); | ||
604 | hlist_add_head(&pte_chain->link, &sp->parent_ptes); | ||
605 | pte_chain->parent_ptes[0] = old; | ||
606 | } | ||
607 | hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link) { | ||
608 | if (pte_chain->parent_ptes[NR_PTE_CHAIN_ENTRIES-1]) | ||
609 | continue; | ||
610 | for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) | ||
611 | if (!pte_chain->parent_ptes[i]) { | ||
612 | pte_chain->parent_ptes[i] = parent_pte; | ||
613 | return; | ||
614 | } | ||
615 | } | ||
616 | pte_chain = mmu_alloc_pte_chain(vcpu); | ||
617 | BUG_ON(!pte_chain); | ||
618 | hlist_add_head(&pte_chain->link, &sp->parent_ptes); | ||
619 | pte_chain->parent_ptes[0] = parent_pte; | ||
620 | } | ||
621 | |||
622 | static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp, | ||
623 | u64 *parent_pte) | ||
624 | { | ||
625 | struct kvm_pte_chain *pte_chain; | ||
626 | struct hlist_node *node; | ||
627 | int i; | ||
628 | |||
629 | if (!sp->multimapped) { | ||
630 | BUG_ON(sp->parent_pte != parent_pte); | ||
631 | sp->parent_pte = NULL; | ||
632 | return; | ||
633 | } | ||
634 | hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link) | ||
635 | for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) { | ||
636 | if (!pte_chain->parent_ptes[i]) | ||
637 | break; | ||
638 | if (pte_chain->parent_ptes[i] != parent_pte) | ||
639 | continue; | ||
640 | while (i + 1 < NR_PTE_CHAIN_ENTRIES | ||
641 | && pte_chain->parent_ptes[i + 1]) { | ||
642 | pte_chain->parent_ptes[i] | ||
643 | = pte_chain->parent_ptes[i + 1]; | ||
644 | ++i; | ||
645 | } | ||
646 | pte_chain->parent_ptes[i] = NULL; | ||
647 | if (i == 0) { | ||
648 | hlist_del(&pte_chain->link); | ||
649 | mmu_free_pte_chain(pte_chain); | ||
650 | if (hlist_empty(&sp->parent_ptes)) { | ||
651 | sp->multimapped = 0; | ||
652 | sp->parent_pte = NULL; | ||
653 | } | ||
654 | } | ||
655 | return; | ||
656 | } | ||
657 | BUG(); | ||
658 | } | ||
659 | |||
660 | static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn) | ||
661 | { | ||
662 | unsigned index; | ||
663 | struct hlist_head *bucket; | ||
664 | struct kvm_mmu_page *sp; | ||
665 | struct hlist_node *node; | ||
666 | |||
667 | pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn); | ||
668 | index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES; | ||
669 | bucket = &kvm->arch.mmu_page_hash[index]; | ||
670 | hlist_for_each_entry(sp, node, bucket, hash_link) | ||
671 | if (sp->gfn == gfn && !sp->role.metaphysical) { | ||
672 | pgprintk("%s: found role %x\n", | ||
673 | __FUNCTION__, sp->role.word); | ||
674 | return sp; | ||
675 | } | ||
676 | return NULL; | ||
677 | } | ||
678 | |||
679 | static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, | ||
680 | gfn_t gfn, | ||
681 | gva_t gaddr, | ||
682 | unsigned level, | ||
683 | int metaphysical, | ||
684 | unsigned access, | ||
685 | u64 *parent_pte, | ||
686 | bool *new_page) | ||
687 | { | ||
688 | union kvm_mmu_page_role role; | ||
689 | unsigned index; | ||
690 | unsigned quadrant; | ||
691 | struct hlist_head *bucket; | ||
692 | struct kvm_mmu_page *sp; | ||
693 | struct hlist_node *node; | ||
694 | |||
695 | role.word = 0; | ||
696 | role.glevels = vcpu->arch.mmu.root_level; | ||
697 | role.level = level; | ||
698 | role.metaphysical = metaphysical; | ||
699 | role.access = access; | ||
700 | if (vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) { | ||
701 | quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level)); | ||
702 | quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1; | ||
703 | role.quadrant = quadrant; | ||
704 | } | ||
705 | pgprintk("%s: looking gfn %lx role %x\n", __FUNCTION__, | ||
706 | gfn, role.word); | ||
707 | index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES; | ||
708 | bucket = &vcpu->kvm->arch.mmu_page_hash[index]; | ||
709 | hlist_for_each_entry(sp, node, bucket, hash_link) | ||
710 | if (sp->gfn == gfn && sp->role.word == role.word) { | ||
711 | mmu_page_add_parent_pte(vcpu, sp, parent_pte); | ||
712 | pgprintk("%s: found\n", __FUNCTION__); | ||
713 | return sp; | ||
714 | } | ||
715 | sp = kvm_mmu_alloc_page(vcpu, parent_pte); | ||
716 | if (!sp) | ||
717 | return sp; | ||
718 | pgprintk("%s: adding gfn %lx role %x\n", __FUNCTION__, gfn, role.word); | ||
719 | sp->gfn = gfn; | ||
720 | sp->role = role; | ||
721 | hlist_add_head(&sp->hash_link, bucket); | ||
722 | vcpu->arch.mmu.prefetch_page(vcpu, sp); | ||
723 | if (!metaphysical) | ||
724 | rmap_write_protect(vcpu->kvm, gfn); | ||
725 | if (new_page) | ||
726 | *new_page = 1; | ||
727 | return sp; | ||
728 | } | ||
729 | |||
730 | static void kvm_mmu_page_unlink_children(struct kvm *kvm, | ||
731 | struct kvm_mmu_page *sp) | ||
732 | { | ||
733 | unsigned i; | ||
734 | u64 *pt; | ||
735 | u64 ent; | ||
736 | |||
737 | pt = sp->spt; | ||
738 | |||
739 | if (sp->role.level == PT_PAGE_TABLE_LEVEL) { | ||
740 | for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { | ||
741 | if (is_shadow_present_pte(pt[i])) | ||
742 | rmap_remove(kvm, &pt[i]); | ||
743 | pt[i] = shadow_trap_nonpresent_pte; | ||
744 | } | ||
745 | kvm_flush_remote_tlbs(kvm); | ||
746 | return; | ||
747 | } | ||
748 | |||
749 | for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { | ||
750 | ent = pt[i]; | ||
751 | |||
752 | pt[i] = shadow_trap_nonpresent_pte; | ||
753 | if (!is_shadow_present_pte(ent)) | ||
754 | continue; | ||
755 | ent &= PT64_BASE_ADDR_MASK; | ||
756 | mmu_page_remove_parent_pte(page_header(ent), &pt[i]); | ||
757 | } | ||
758 | kvm_flush_remote_tlbs(kvm); | ||
759 | } | ||
760 | |||
761 | static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte) | ||
762 | { | ||
763 | mmu_page_remove_parent_pte(sp, parent_pte); | ||
764 | } | ||
765 | |||
766 | static void kvm_mmu_reset_last_pte_updated(struct kvm *kvm) | ||
767 | { | ||
768 | int i; | ||
769 | |||
770 | for (i = 0; i < KVM_MAX_VCPUS; ++i) | ||
771 | if (kvm->vcpus[i]) | ||
772 | kvm->vcpus[i]->arch.last_pte_updated = NULL; | ||
773 | } | ||
774 | |||
775 | static void kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp) | ||
776 | { | ||
777 | u64 *parent_pte; | ||
778 | |||
779 | ++kvm->stat.mmu_shadow_zapped; | ||
780 | while (sp->multimapped || sp->parent_pte) { | ||
781 | if (!sp->multimapped) | ||
782 | parent_pte = sp->parent_pte; | ||
783 | else { | ||
784 | struct kvm_pte_chain *chain; | ||
785 | |||
786 | chain = container_of(sp->parent_ptes.first, | ||
787 | struct kvm_pte_chain, link); | ||
788 | parent_pte = chain->parent_ptes[0]; | ||
789 | } | ||
790 | BUG_ON(!parent_pte); | ||
791 | kvm_mmu_put_page(sp, parent_pte); | ||
792 | set_shadow_pte(parent_pte, shadow_trap_nonpresent_pte); | ||
793 | } | ||
794 | kvm_mmu_page_unlink_children(kvm, sp); | ||
795 | if (!sp->root_count) { | ||
796 | hlist_del(&sp->hash_link); | ||
797 | kvm_mmu_free_page(kvm, sp); | ||
798 | } else | ||
799 | list_move(&sp->link, &kvm->arch.active_mmu_pages); | ||
800 | kvm_mmu_reset_last_pte_updated(kvm); | ||
801 | } | ||
802 | |||
803 | /* | ||
804 | * Changing the number of mmu pages allocated to the vm | ||
805 | * Note: if kvm_nr_mmu_pages is too small, you will get a deadlock | ||
806 | */ | ||
807 | void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages) | ||
808 | { | ||
809 | /* | ||
810 | * If we set the number of mmu pages to be smaller than the | ||
811 | * number of active pages, we must free some mmu pages before we | ||
812 | * change the value | ||
813 | */ | ||
814 | |||
815 | if ((kvm->arch.n_alloc_mmu_pages - kvm->arch.n_free_mmu_pages) > | ||
816 | kvm_nr_mmu_pages) { | ||
817 | int n_used_mmu_pages = kvm->arch.n_alloc_mmu_pages | ||
818 | - kvm->arch.n_free_mmu_pages; | ||
819 | |||
820 | while (n_used_mmu_pages > kvm_nr_mmu_pages) { | ||
821 | struct kvm_mmu_page *page; | ||
822 | |||
823 | page = container_of(kvm->arch.active_mmu_pages.prev, | ||
824 | struct kvm_mmu_page, link); | ||
825 | kvm_mmu_zap_page(kvm, page); | ||
826 | n_used_mmu_pages--; | ||
827 | } | ||
828 | kvm->arch.n_free_mmu_pages = 0; | ||
829 | } | ||
830 | else | ||
831 | kvm->arch.n_free_mmu_pages += kvm_nr_mmu_pages | ||
832 | - kvm->arch.n_alloc_mmu_pages; | ||
833 | |||
834 | kvm->arch.n_alloc_mmu_pages = kvm_nr_mmu_pages; | ||
835 | } | ||
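A small user-space model of the bookkeeping kvm_mmu_change_mmu_pages() performs may help: used pages are n_alloc - n_free; shrinking zaps pages until the used count fits the new limit and zeroes the free count, while growing simply credits the difference to the free count. The zap step and variable names below are stand-ins, not the kernel API:

#include <stdio.h>

/* Model of the n_alloc/n_free bookkeeping in kvm_mmu_change_mmu_pages(). */
static unsigned int n_alloc = 256, n_free = 16;

static void change_mmu_pages(unsigned int new_limit)
{
	unsigned int used = n_alloc - n_free;

	if (used > new_limit) {
		while (used > new_limit)
			used--;               /* stands in for kvm_mmu_zap_page() */
		n_free = 0;
	} else {
		n_free += new_limit - n_alloc;
	}
	n_alloc = new_limit;
}

int main(void)
{
	change_mmu_pages(128);   /* shrink: 240 in use -> zap down to 128, free = 0 */
	printf("alloc=%u free=%u\n", n_alloc, n_free);
	change_mmu_pages(512);   /* grow: free += 512 - 128 */
	printf("alloc=%u free=%u\n", n_alloc, n_free);
	return 0;
}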
836 | |||
837 | static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) | ||
838 | { | ||
839 | unsigned index; | ||
840 | struct hlist_head *bucket; | ||
841 | struct kvm_mmu_page *sp; | ||
842 | struct hlist_node *node, *n; | ||
843 | int r; | ||
844 | |||
845 | pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn); | ||
846 | r = 0; | ||
847 | index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES; | ||
848 | bucket = &kvm->arch.mmu_page_hash[index]; | ||
849 | hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) | ||
850 | if (sp->gfn == gfn && !sp->role.metaphysical) { | ||
851 | pgprintk("%s: gfn %lx role %x\n", __FUNCTION__, gfn, | ||
852 | sp->role.word); | ||
853 | kvm_mmu_zap_page(kvm, sp); | ||
854 | r = 1; | ||
855 | } | ||
856 | return r; | ||
857 | } | ||
858 | |||
859 | static void mmu_unshadow(struct kvm *kvm, gfn_t gfn) | ||
860 | { | ||
861 | struct kvm_mmu_page *sp; | ||
862 | |||
863 | while ((sp = kvm_mmu_lookup_page(kvm, gfn)) != NULL) { | ||
864 | pgprintk("%s: zap %lx %x\n", __FUNCTION__, gfn, sp->role.word); | ||
865 | kvm_mmu_zap_page(kvm, sp); | ||
866 | } | ||
867 | } | ||
868 | |||
869 | static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn) | ||
870 | { | ||
871 | int slot = memslot_id(kvm, gfn_to_memslot(kvm, gfn)); | ||
872 | struct kvm_mmu_page *sp = page_header(__pa(pte)); | ||
873 | |||
874 | __set_bit(slot, &sp->slot_bitmap); | ||
875 | } | ||
876 | |||
877 | struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva) | ||
878 | { | ||
879 | gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva); | ||
880 | |||
881 | if (gpa == UNMAPPED_GVA) | ||
882 | return NULL; | ||
883 | return gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); | ||
884 | } | ||
885 | |||
886 | static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, | ||
887 | unsigned pt_access, unsigned pte_access, | ||
888 | int user_fault, int write_fault, int dirty, | ||
889 | int *ptwrite, gfn_t gfn) | ||
890 | { | ||
891 | u64 spte; | ||
892 | int was_rmapped = is_rmap_pte(*shadow_pte); | ||
893 | struct page *page; | ||
894 | |||
895 | pgprintk("%s: spte %llx access %x write_fault %d" | ||
896 | " user_fault %d gfn %lx\n", | ||
897 | __FUNCTION__, *shadow_pte, pt_access, | ||
898 | write_fault, user_fault, gfn); | ||
899 | |||
900 | /* | ||
901 | * We don't set the accessed bit, since we sometimes want to see | ||
902 | * whether the guest actually used the pte (in order to detect | ||
903 | * demand paging). | ||
904 | */ | ||
905 | spte = PT_PRESENT_MASK | PT_DIRTY_MASK; | ||
906 | if (!dirty) | ||
907 | pte_access &= ~ACC_WRITE_MASK; | ||
908 | if (!(pte_access & ACC_EXEC_MASK)) | ||
909 | spte |= PT64_NX_MASK; | ||
910 | |||
911 | page = gfn_to_page(vcpu->kvm, gfn); | ||
912 | |||
913 | spte |= PT_PRESENT_MASK; | ||
914 | if (pte_access & ACC_USER_MASK) | ||
915 | spte |= PT_USER_MASK; | ||
916 | |||
917 | if (is_error_page(page)) { | ||
918 | set_shadow_pte(shadow_pte, | ||
919 | shadow_trap_nonpresent_pte | PT_SHADOW_IO_MARK); | ||
920 | kvm_release_page_clean(page); | ||
921 | return; | ||
922 | } | ||
923 | |||
924 | spte |= page_to_phys(page); | ||
925 | |||
926 | if ((pte_access & ACC_WRITE_MASK) | ||
927 | || (write_fault && !is_write_protection(vcpu) && !user_fault)) { | ||
928 | struct kvm_mmu_page *shadow; | ||
929 | |||
930 | spte |= PT_WRITABLE_MASK; | ||
931 | if (user_fault) { | ||
932 | mmu_unshadow(vcpu->kvm, gfn); | ||
933 | goto unshadowed; | ||
934 | } | ||
935 | |||
936 | shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn); | ||
937 | if (shadow) { | ||
938 | pgprintk("%s: found shadow page for %lx, marking ro\n", | ||
939 | __FUNCTION__, gfn); | ||
940 | pte_access &= ~ACC_WRITE_MASK; | ||
941 | if (is_writeble_pte(spte)) { | ||
942 | spte &= ~PT_WRITABLE_MASK; | ||
943 | kvm_x86_ops->tlb_flush(vcpu); | ||
944 | } | ||
945 | if (write_fault) | ||
946 | *ptwrite = 1; | ||
947 | } | ||
948 | } | ||
949 | |||
950 | unshadowed: | ||
951 | |||
952 | if (pte_access & ACC_WRITE_MASK) | ||
953 | mark_page_dirty(vcpu->kvm, gfn); | ||
954 | |||
955 | pgprintk("%s: setting spte %llx\n", __FUNCTION__, spte); | ||
956 | set_shadow_pte(shadow_pte, spte); | ||
957 | page_header_update_slot(vcpu->kvm, shadow_pte, gfn); | ||
958 | if (!was_rmapped) { | ||
959 | rmap_add(vcpu, shadow_pte, gfn); | ||
960 | if (!is_rmap_pte(*shadow_pte)) | ||
961 | kvm_release_page_clean(page); | ||
962 | } | ||
963 | else | ||
964 | kvm_release_page_clean(page); | ||
965 | if (!ptwrite || !*ptwrite) | ||
966 | vcpu->arch.last_pte_updated = shadow_pte; | ||
967 | } | ||
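As the comment in mmu_set_spte() notes, the accessed bit is deliberately left clear so that the hardware setting it later tells us the guest really used the pte. A rough, illustrative sketch of the bit assembly, using the standard x86 pte bit positions (the kernel takes its masks from its own headers, so treat the values below as assumptions):

#include <stdint.h>
#include <stdio.h>

/* Illustrative x86 pte bit values; the kernel defines its own masks. */
#define PRESENT   (1ull << 0)
#define WRITABLE  (1ull << 1)
#define USER      (1ull << 2)
#define DIRTY     (1ull << 6)
#define NX        (1ull << 63)

/* Rough model of how mmu_set_spte() assembles a shadow pte. */
static uint64_t make_spte(uint64_t phys, int user, int writable, int exec)
{
	uint64_t spte = PRESENT | DIRTY;   /* accessed bit deliberately left clear */

	if (user)
		spte |= USER;
	if (writable)
		spte |= WRITABLE;
	if (!exec)
		spte |= NX;
	return spte | (phys & ~0xfffull);
}

int main(void)
{
	printf("spte = %#llx\n",
	       (unsigned long long)make_spte(0x12345000, 1, 1, 0));
	return 0;
}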
968 | |||
969 | static void nonpaging_new_cr3(struct kvm_vcpu *vcpu) | ||
970 | { | ||
971 | } | ||
972 | |||
973 | static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) | ||
974 | { | ||
975 | int level = PT32E_ROOT_LEVEL; | ||
976 | hpa_t table_addr = vcpu->arch.mmu.root_hpa; | ||
977 | int pt_write = 0; | ||
978 | |||
979 | for (; ; level--) { | ||
980 | u32 index = PT64_INDEX(v, level); | ||
981 | u64 *table; | ||
982 | |||
983 | ASSERT(VALID_PAGE(table_addr)); | ||
984 | table = __va(table_addr); | ||
985 | |||
986 | if (level == 1) { | ||
987 | mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL, | ||
988 | 0, write, 1, &pt_write, gfn); | ||
989 | return pt_write || is_io_pte(table[index]); | ||
990 | } | ||
991 | |||
992 | if (table[index] == shadow_trap_nonpresent_pte) { | ||
993 | struct kvm_mmu_page *new_table; | ||
994 | gfn_t pseudo_gfn; | ||
995 | |||
996 | pseudo_gfn = (v & PT64_DIR_BASE_ADDR_MASK) | ||
997 | >> PAGE_SHIFT; | ||
998 | new_table = kvm_mmu_get_page(vcpu, pseudo_gfn, | ||
999 | v, level - 1, | ||
1000 | 1, ACC_ALL, &table[index], | ||
1001 | NULL); | ||
1002 | if (!new_table) { | ||
1003 | pgprintk("nonpaging_map: ENOMEM\n"); | ||
1004 | return -ENOMEM; | ||
1005 | } | ||
1006 | |||
1007 | table[index] = __pa(new_table->spt) | PT_PRESENT_MASK | ||
1008 | | PT_WRITABLE_MASK | PT_USER_MASK; | ||
1009 | } | ||
1010 | table_addr = table[index] & PT64_BASE_ADDR_MASK; | ||
1011 | } | ||
1012 | } | ||
1013 | |||
1014 | static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu, | ||
1015 | struct kvm_mmu_page *sp) | ||
1016 | { | ||
1017 | int i; | ||
1018 | |||
1019 | for (i = 0; i < PT64_ENT_PER_PAGE; ++i) | ||
1020 | sp->spt[i] = shadow_trap_nonpresent_pte; | ||
1021 | } | ||
1022 | |||
1023 | static void mmu_free_roots(struct kvm_vcpu *vcpu) | ||
1024 | { | ||
1025 | int i; | ||
1026 | struct kvm_mmu_page *sp; | ||
1027 | |||
1028 | if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) | ||
1029 | return; | ||
1030 | #ifdef CONFIG_X86_64 | ||
1031 | if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { | ||
1032 | hpa_t root = vcpu->arch.mmu.root_hpa; | ||
1033 | |||
1034 | sp = page_header(root); | ||
1035 | --sp->root_count; | ||
1036 | vcpu->arch.mmu.root_hpa = INVALID_PAGE; | ||
1037 | return; | ||
1038 | } | ||
1039 | #endif | ||
1040 | for (i = 0; i < 4; ++i) { | ||
1041 | hpa_t root = vcpu->arch.mmu.pae_root[i]; | ||
1042 | |||
1043 | if (root) { | ||
1044 | root &= PT64_BASE_ADDR_MASK; | ||
1045 | sp = page_header(root); | ||
1046 | --sp->root_count; | ||
1047 | } | ||
1048 | vcpu->arch.mmu.pae_root[i] = INVALID_PAGE; | ||
1049 | } | ||
1050 | vcpu->arch.mmu.root_hpa = INVALID_PAGE; | ||
1051 | } | ||
1052 | |||
1053 | static void mmu_alloc_roots(struct kvm_vcpu *vcpu) | ||
1054 | { | ||
1055 | int i; | ||
1056 | gfn_t root_gfn; | ||
1057 | struct kvm_mmu_page *sp; | ||
1058 | |||
1059 | root_gfn = vcpu->arch.cr3 >> PAGE_SHIFT; | ||
1060 | |||
1061 | #ifdef CONFIG_X86_64 | ||
1062 | if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { | ||
1063 | hpa_t root = vcpu->arch.mmu.root_hpa; | ||
1064 | |||
1065 | ASSERT(!VALID_PAGE(root)); | ||
1066 | sp = kvm_mmu_get_page(vcpu, root_gfn, 0, | ||
1067 | PT64_ROOT_LEVEL, 0, ACC_ALL, NULL, NULL); | ||
1068 | root = __pa(sp->spt); | ||
1069 | ++sp->root_count; | ||
1070 | vcpu->arch.mmu.root_hpa = root; | ||
1071 | return; | ||
1072 | } | ||
1073 | #endif | ||
1074 | for (i = 0; i < 4; ++i) { | ||
1075 | hpa_t root = vcpu->arch.mmu.pae_root[i]; | ||
1076 | |||
1077 | ASSERT(!VALID_PAGE(root)); | ||
1078 | if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) { | ||
1079 | if (!is_present_pte(vcpu->arch.pdptrs[i])) { | ||
1080 | vcpu->arch.mmu.pae_root[i] = 0; | ||
1081 | continue; | ||
1082 | } | ||
1083 | root_gfn = vcpu->arch.pdptrs[i] >> PAGE_SHIFT; | ||
1084 | } else if (vcpu->arch.mmu.root_level == 0) | ||
1085 | root_gfn = 0; | ||
1086 | sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, | ||
1087 | PT32_ROOT_LEVEL, !is_paging(vcpu), | ||
1088 | ACC_ALL, NULL, NULL); | ||
1089 | root = __pa(sp->spt); | ||
1090 | ++sp->root_count; | ||
1091 | vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK; | ||
1092 | } | ||
1093 | vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root); | ||
1094 | } | ||
1095 | |||
1096 | static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr) | ||
1097 | { | ||
1098 | return vaddr; | ||
1099 | } | ||
1100 | |||
1101 | static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, | ||
1102 | u32 error_code) | ||
1103 | { | ||
1104 | gfn_t gfn; | ||
1105 | int r; | ||
1106 | |||
1107 | pgprintk("%s: gva %lx error %x\n", __FUNCTION__, gva, error_code); | ||
1108 | r = mmu_topup_memory_caches(vcpu); | ||
1109 | if (r) | ||
1110 | return r; | ||
1111 | |||
1112 | ASSERT(vcpu); | ||
1113 | ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa)); | ||
1114 | |||
1115 | gfn = gva >> PAGE_SHIFT; | ||
1116 | |||
1117 | return nonpaging_map(vcpu, gva & PAGE_MASK, | ||
1118 | error_code & PFERR_WRITE_MASK, gfn); | ||
1119 | } | ||
1120 | |||
1121 | static void nonpaging_free(struct kvm_vcpu *vcpu) | ||
1122 | { | ||
1123 | mmu_free_roots(vcpu); | ||
1124 | } | ||
1125 | |||
1126 | static int nonpaging_init_context(struct kvm_vcpu *vcpu) | ||
1127 | { | ||
1128 | struct kvm_mmu *context = &vcpu->arch.mmu; | ||
1129 | |||
1130 | context->new_cr3 = nonpaging_new_cr3; | ||
1131 | context->page_fault = nonpaging_page_fault; | ||
1132 | context->gva_to_gpa = nonpaging_gva_to_gpa; | ||
1133 | context->free = nonpaging_free; | ||
1134 | context->prefetch_page = nonpaging_prefetch_page; | ||
1135 | context->root_level = 0; | ||
1136 | context->shadow_root_level = PT32E_ROOT_LEVEL; | ||
1137 | context->root_hpa = INVALID_PAGE; | ||
1138 | return 0; | ||
1139 | } | ||
1140 | |||
1141 | void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu) | ||
1142 | { | ||
1143 | ++vcpu->stat.tlb_flush; | ||
1144 | kvm_x86_ops->tlb_flush(vcpu); | ||
1145 | } | ||
1146 | |||
1147 | static void paging_new_cr3(struct kvm_vcpu *vcpu) | ||
1148 | { | ||
1149 | pgprintk("%s: cr3 %lx\n", __FUNCTION__, vcpu->cr3); | ||
1150 | mmu_free_roots(vcpu); | ||
1151 | } | ||
1152 | |||
1153 | static void inject_page_fault(struct kvm_vcpu *vcpu, | ||
1154 | u64 addr, | ||
1155 | u32 err_code) | ||
1156 | { | ||
1157 | kvm_inject_page_fault(vcpu, addr, err_code); | ||
1158 | } | ||
1159 | |||
1160 | static void paging_free(struct kvm_vcpu *vcpu) | ||
1161 | { | ||
1162 | nonpaging_free(vcpu); | ||
1163 | } | ||
1164 | |||
1165 | #define PTTYPE 64 | ||
1166 | #include "paging_tmpl.h" | ||
1167 | #undef PTTYPE | ||
1168 | |||
1169 | #define PTTYPE 32 | ||
1170 | #include "paging_tmpl.h" | ||
1171 | #undef PTTYPE | ||
1172 | |||
1173 | static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level) | ||
1174 | { | ||
1175 | struct kvm_mmu *context = &vcpu->arch.mmu; | ||
1176 | |||
1177 | ASSERT(is_pae(vcpu)); | ||
1178 | context->new_cr3 = paging_new_cr3; | ||
1179 | context->page_fault = paging64_page_fault; | ||
1180 | context->gva_to_gpa = paging64_gva_to_gpa; | ||
1181 | context->prefetch_page = paging64_prefetch_page; | ||
1182 | context->free = paging_free; | ||
1183 | context->root_level = level; | ||
1184 | context->shadow_root_level = level; | ||
1185 | context->root_hpa = INVALID_PAGE; | ||
1186 | return 0; | ||
1187 | } | ||
1188 | |||
1189 | static int paging64_init_context(struct kvm_vcpu *vcpu) | ||
1190 | { | ||
1191 | return paging64_init_context_common(vcpu, PT64_ROOT_LEVEL); | ||
1192 | } | ||
1193 | |||
1194 | static int paging32_init_context(struct kvm_vcpu *vcpu) | ||
1195 | { | ||
1196 | struct kvm_mmu *context = &vcpu->arch.mmu; | ||
1197 | |||
1198 | context->new_cr3 = paging_new_cr3; | ||
1199 | context->page_fault = paging32_page_fault; | ||
1200 | context->gva_to_gpa = paging32_gva_to_gpa; | ||
1201 | context->free = paging_free; | ||
1202 | context->prefetch_page = paging32_prefetch_page; | ||
1203 | context->root_level = PT32_ROOT_LEVEL; | ||
1204 | context->shadow_root_level = PT32E_ROOT_LEVEL; | ||
1205 | context->root_hpa = INVALID_PAGE; | ||
1206 | return 0; | ||
1207 | } | ||
1208 | |||
1209 | static int paging32E_init_context(struct kvm_vcpu *vcpu) | ||
1210 | { | ||
1211 | return paging64_init_context_common(vcpu, PT32E_ROOT_LEVEL); | ||
1212 | } | ||
1213 | |||
1214 | static int init_kvm_mmu(struct kvm_vcpu *vcpu) | ||
1215 | { | ||
1216 | ASSERT(vcpu); | ||
1217 | ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); | ||
1218 | |||
1219 | if (!is_paging(vcpu)) | ||
1220 | return nonpaging_init_context(vcpu); | ||
1221 | else if (is_long_mode(vcpu)) | ||
1222 | return paging64_init_context(vcpu); | ||
1223 | else if (is_pae(vcpu)) | ||
1224 | return paging32E_init_context(vcpu); | ||
1225 | else | ||
1226 | return paging32_init_context(vcpu); | ||
1227 | } | ||
1228 | |||
1229 | static void destroy_kvm_mmu(struct kvm_vcpu *vcpu) | ||
1230 | { | ||
1231 | ASSERT(vcpu); | ||
1232 | if (VALID_PAGE(vcpu->arch.mmu.root_hpa)) { | ||
1233 | vcpu->arch.mmu.free(vcpu); | ||
1234 | vcpu->arch.mmu.root_hpa = INVALID_PAGE; | ||
1235 | } | ||
1236 | } | ||
1237 | |||
1238 | int kvm_mmu_reset_context(struct kvm_vcpu *vcpu) | ||
1239 | { | ||
1240 | destroy_kvm_mmu(vcpu); | ||
1241 | return init_kvm_mmu(vcpu); | ||
1242 | } | ||
1243 | EXPORT_SYMBOL_GPL(kvm_mmu_reset_context); | ||
1244 | |||
1245 | int kvm_mmu_load(struct kvm_vcpu *vcpu) | ||
1246 | { | ||
1247 | int r; | ||
1248 | |||
1249 | mutex_lock(&vcpu->kvm->lock); | ||
1250 | r = mmu_topup_memory_caches(vcpu); | ||
1251 | if (r) | ||
1252 | goto out; | ||
1253 | mmu_alloc_roots(vcpu); | ||
1254 | kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa); | ||
1255 | kvm_mmu_flush_tlb(vcpu); | ||
1256 | out: | ||
1257 | mutex_unlock(&vcpu->kvm->lock); | ||
1258 | return r; | ||
1259 | } | ||
1260 | EXPORT_SYMBOL_GPL(kvm_mmu_load); | ||
1261 | |||
1262 | void kvm_mmu_unload(struct kvm_vcpu *vcpu) | ||
1263 | { | ||
1264 | mmu_free_roots(vcpu); | ||
1265 | } | ||
1266 | |||
1267 | static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu, | ||
1268 | struct kvm_mmu_page *sp, | ||
1269 | u64 *spte) | ||
1270 | { | ||
1271 | u64 pte; | ||
1272 | struct kvm_mmu_page *child; | ||
1273 | |||
1274 | pte = *spte; | ||
1275 | if (is_shadow_present_pte(pte)) { | ||
1276 | if (sp->role.level == PT_PAGE_TABLE_LEVEL) | ||
1277 | rmap_remove(vcpu->kvm, spte); | ||
1278 | else { | ||
1279 | child = page_header(pte & PT64_BASE_ADDR_MASK); | ||
1280 | mmu_page_remove_parent_pte(child, spte); | ||
1281 | } | ||
1282 | } | ||
1283 | set_shadow_pte(spte, shadow_trap_nonpresent_pte); | ||
1284 | } | ||
1285 | |||
1286 | static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu, | ||
1287 | struct kvm_mmu_page *sp, | ||
1288 | u64 *spte, | ||
1289 | const void *new, int bytes, | ||
1290 | int offset_in_pte) | ||
1291 | { | ||
1292 | if (sp->role.level != PT_PAGE_TABLE_LEVEL) { | ||
1293 | ++vcpu->kvm->stat.mmu_pde_zapped; | ||
1294 | return; | ||
1295 | } | ||
1296 | |||
1297 | ++vcpu->kvm->stat.mmu_pte_updated; | ||
1298 | if (sp->role.glevels == PT32_ROOT_LEVEL) | ||
1299 | paging32_update_pte(vcpu, sp, spte, new, bytes, offset_in_pte); | ||
1300 | else | ||
1301 | paging64_update_pte(vcpu, sp, spte, new, bytes, offset_in_pte); | ||
1302 | } | ||
1303 | |||
1304 | static bool need_remote_flush(u64 old, u64 new) | ||
1305 | { | ||
1306 | if (!is_shadow_present_pte(old)) | ||
1307 | return false; | ||
1308 | if (!is_shadow_present_pte(new)) | ||
1309 | return true; | ||
1310 | if ((old ^ new) & PT64_BASE_ADDR_MASK) | ||
1311 | return true; | ||
1312 | old ^= PT64_NX_MASK; | ||
1313 | new ^= PT64_NX_MASK; | ||
1314 | return (old & ~new & PT64_PERM_MASK) != 0; | ||
1315 | } | ||
1316 | |||
1317 | static void mmu_pte_write_flush_tlb(struct kvm_vcpu *vcpu, u64 old, u64 new) | ||
1318 | { | ||
1319 | if (need_remote_flush(old, new)) | ||
1320 | kvm_flush_remote_tlbs(vcpu->kvm); | ||
1321 | else | ||
1322 | kvm_mmu_flush_tlb(vcpu); | ||
1323 | } | ||
1324 | |||
1325 | static bool last_updated_pte_accessed(struct kvm_vcpu *vcpu) | ||
1326 | { | ||
1327 | u64 *spte = vcpu->arch.last_pte_updated; | ||
1328 | |||
1329 | return !!(spte && (*spte & PT_ACCESSED_MASK)); | ||
1330 | } | ||
1331 | |||
1332 | void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, | ||
1333 | const u8 *new, int bytes) | ||
1334 | { | ||
1335 | gfn_t gfn = gpa >> PAGE_SHIFT; | ||
1336 | struct kvm_mmu_page *sp; | ||
1337 | struct hlist_node *node, *n; | ||
1338 | struct hlist_head *bucket; | ||
1339 | unsigned index; | ||
1340 | u64 entry; | ||
1341 | u64 *spte; | ||
1342 | unsigned offset = offset_in_page(gpa); | ||
1343 | unsigned pte_size; | ||
1344 | unsigned page_offset; | ||
1345 | unsigned misaligned; | ||
1346 | unsigned quadrant; | ||
1347 | int level; | ||
1348 | int flooded = 0; | ||
1349 | int npte; | ||
1350 | |||
1351 | pgprintk("%s: gpa %llx bytes %d\n", __FUNCTION__, gpa, bytes); | ||
1352 | ++vcpu->kvm->stat.mmu_pte_write; | ||
1353 | kvm_mmu_audit(vcpu, "pre pte write"); | ||
1354 | if (gfn == vcpu->arch.last_pt_write_gfn | ||
1355 | && !last_updated_pte_accessed(vcpu)) { | ||
1356 | ++vcpu->arch.last_pt_write_count; | ||
1357 | if (vcpu->arch.last_pt_write_count >= 3) | ||
1358 | flooded = 1; | ||
1359 | } else { | ||
1360 | vcpu->arch.last_pt_write_gfn = gfn; | ||
1361 | vcpu->arch.last_pt_write_count = 1; | ||
1362 | vcpu->arch.last_pte_updated = NULL; | ||
1363 | } | ||
1364 | index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES; | ||
1365 | bucket = &vcpu->kvm->arch.mmu_page_hash[index]; | ||
1366 | hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) { | ||
1367 | if (sp->gfn != gfn || sp->role.metaphysical) | ||
1368 | continue; | ||
1369 | pte_size = sp->role.glevels == PT32_ROOT_LEVEL ? 4 : 8; | ||
1370 | misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1); | ||
1371 | misaligned |= bytes < 4; | ||
1372 | if (misaligned || flooded) { | ||
1373 | /* | ||
1374 | * Misaligned accesses are too much trouble to fix | ||
1375 | * up; also, they usually indicate a page is not used | ||
1376 | * as a page table. | ||
1377 | * | ||
1378 | * If we're seeing too many writes to a page, | ||
1379 | * it may no longer be a page table, or we may be | ||
1380 | * forking, in which case it is better to unmap the | ||
1381 | * page. | ||
1382 | */ | ||
1383 | pgprintk("misaligned: gpa %llx bytes %d role %x\n", | ||
1384 | gpa, bytes, sp->role.word); | ||
1385 | kvm_mmu_zap_page(vcpu->kvm, sp); | ||
1386 | ++vcpu->kvm->stat.mmu_flooded; | ||
1387 | continue; | ||
1388 | } | ||
1389 | page_offset = offset; | ||
1390 | level = sp->role.level; | ||
1391 | npte = 1; | ||
1392 | if (sp->role.glevels == PT32_ROOT_LEVEL) { | ||
1393 | page_offset <<= 1; /* 32->64 */ | ||
1394 | /* | ||
1395 | * A 32-bit pde maps 4MB while the shadow pdes map | ||
1396 | * only 2MB. So we need to double the offset again | ||
1397 | * and zap two pdes instead of one. | ||
1398 | */ | ||
1399 | if (level == PT32_ROOT_LEVEL) { | ||
1400 | page_offset &= ~7; /* kill rounding error */ | ||
1401 | page_offset <<= 1; | ||
1402 | npte = 2; | ||
1403 | } | ||
1404 | quadrant = page_offset >> PAGE_SHIFT; | ||
1405 | page_offset &= ~PAGE_MASK; | ||
1406 | if (quadrant != sp->role.quadrant) | ||
1407 | continue; | ||
1408 | } | ||
1409 | spte = &sp->spt[page_offset / sizeof(*spte)]; | ||
1410 | while (npte--) { | ||
1411 | entry = *spte; | ||
1412 | mmu_pte_write_zap_pte(vcpu, sp, spte); | ||
1413 | mmu_pte_write_new_pte(vcpu, sp, spte, new, bytes, | ||
1414 | page_offset & (pte_size - 1)); | ||
1415 | mmu_pte_write_flush_tlb(vcpu, entry, *spte); | ||
1416 | ++spte; | ||
1417 | } | ||
1418 | } | ||
1419 | kvm_mmu_audit(vcpu, "post pte write"); | ||
1420 | } | ||
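The misalignment test and the 32-bit offset doubling in kvm_mmu_pte_write() are plain arithmetic and easy to check in isolation. The sketch below reproduces both computations with illustrative constants; it is not the kernel code:

#include <stdio.h>

/* (offset ^ (offset + bytes - 1)) & ~(pte_size - 1) is nonzero when the
 * write crosses a pte boundary; bytes < 4 is also treated as misaligned. */
static int misaligned(unsigned offset, unsigned bytes, unsigned pte_size)
{
	return (((offset ^ (offset + bytes - 1)) & ~(pte_size - 1)) != 0)
	       || bytes < 4;
}

int main(void)
{
	/* 8-byte pte: a 4-byte write at offset 4 stays inside one pte ... */
	printf("%d\n", misaligned(4, 4, 8));   /* 0 */
	/* ... but at offset 6 it straddles two ptes. */
	printf("%d\n", misaligned(6, 4, 8));   /* 1 */

	/* 32-bit guest pde at byte offset 16 of its page: the shadow offset is
	 * doubled twice (pte width, then the 4MB -> 2x2MB split), npte = 2. */
	unsigned page_offset = 16;
	page_offset <<= 1;                     /* 32-bit -> 64-bit ptes */
	page_offset &= ~7u;
	page_offset <<= 1;                     /* one 4MB pde -> two 2MB pdes */
	printf("shadow offset %u, zap 2 pdes\n", page_offset);   /* 64 */
	return 0;
}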
1421 | |||
1422 | int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva) | ||
1423 | { | ||
1424 | gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva); | ||
1425 | |||
1426 | return kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT); | ||
1427 | } | ||
1428 | |||
1429 | void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) | ||
1430 | { | ||
1431 | while (vcpu->kvm->arch.n_free_mmu_pages < KVM_REFILL_PAGES) { | ||
1432 | struct kvm_mmu_page *sp; | ||
1433 | |||
1434 | sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev, | ||
1435 | struct kvm_mmu_page, link); | ||
1436 | kvm_mmu_zap_page(vcpu->kvm, sp); | ||
1437 | ++vcpu->kvm->stat.mmu_recycled; | ||
1438 | } | ||
1439 | } | ||
1440 | |||
1441 | int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code) | ||
1442 | { | ||
1443 | int r; | ||
1444 | enum emulation_result er; | ||
1445 | |||
1446 | mutex_lock(&vcpu->kvm->lock); | ||
1447 | r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code); | ||
1448 | if (r < 0) | ||
1449 | goto out; | ||
1450 | |||
1451 | if (!r) { | ||
1452 | r = 1; | ||
1453 | goto out; | ||
1454 | } | ||
1455 | |||
1456 | r = mmu_topup_memory_caches(vcpu); | ||
1457 | if (r) | ||
1458 | goto out; | ||
1459 | |||
1460 | er = emulate_instruction(vcpu, vcpu->run, cr2, error_code, 0); | ||
1461 | mutex_unlock(&vcpu->kvm->lock); | ||
1462 | |||
1463 | switch (er) { | ||
1464 | case EMULATE_DONE: | ||
1465 | return 1; | ||
1466 | case EMULATE_DO_MMIO: | ||
1467 | ++vcpu->stat.mmio_exits; | ||
1468 | return 0; | ||
1469 | case EMULATE_FAIL: | ||
1470 | kvm_report_emulation_failure(vcpu, "pagetable"); | ||
1471 | return 1; | ||
1472 | default: | ||
1473 | BUG(); | ||
1474 | } | ||
1475 | out: | ||
1476 | mutex_unlock(&vcpu->kvm->lock); | ||
1477 | return r; | ||
1478 | } | ||
1479 | EXPORT_SYMBOL_GPL(kvm_mmu_page_fault); | ||
1480 | |||
1481 | static void free_mmu_pages(struct kvm_vcpu *vcpu) | ||
1482 | { | ||
1483 | struct kvm_mmu_page *sp; | ||
1484 | |||
1485 | while (!list_empty(&vcpu->kvm->arch.active_mmu_pages)) { | ||
1486 | sp = container_of(vcpu->kvm->arch.active_mmu_pages.next, | ||
1487 | struct kvm_mmu_page, link); | ||
1488 | kvm_mmu_zap_page(vcpu->kvm, sp); | ||
1489 | } | ||
1490 | free_page((unsigned long)vcpu->arch.mmu.pae_root); | ||
1491 | } | ||
1492 | |||
1493 | static int alloc_mmu_pages(struct kvm_vcpu *vcpu) | ||
1494 | { | ||
1495 | struct page *page; | ||
1496 | int i; | ||
1497 | |||
1498 | ASSERT(vcpu); | ||
1499 | |||
1500 | if (vcpu->kvm->arch.n_requested_mmu_pages) | ||
1501 | vcpu->kvm->arch.n_free_mmu_pages = | ||
1502 | vcpu->kvm->arch.n_requested_mmu_pages; | ||
1503 | else | ||
1504 | vcpu->kvm->arch.n_free_mmu_pages = | ||
1505 | vcpu->kvm->arch.n_alloc_mmu_pages; | ||
1506 | /* | ||
1507 | * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64. | ||
1508 | * Therefore we need to allocate shadow page tables in the first | ||
1509 | * 4GB of memory, which happens to fit the DMA32 zone. | ||
1510 | */ | ||
1511 | page = alloc_page(GFP_KERNEL | __GFP_DMA32); | ||
1512 | if (!page) | ||
1513 | goto error_1; | ||
1514 | vcpu->arch.mmu.pae_root = page_address(page); | ||
1515 | for (i = 0; i < 4; ++i) | ||
1516 | vcpu->arch.mmu.pae_root[i] = INVALID_PAGE; | ||
1517 | |||
1518 | return 0; | ||
1519 | |||
1520 | error_1: | ||
1521 | free_mmu_pages(vcpu); | ||
1522 | return -ENOMEM; | ||
1523 | } | ||
1524 | |||
1525 | int kvm_mmu_create(struct kvm_vcpu *vcpu) | ||
1526 | { | ||
1527 | ASSERT(vcpu); | ||
1528 | ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); | ||
1529 | |||
1530 | return alloc_mmu_pages(vcpu); | ||
1531 | } | ||
1532 | |||
1533 | int kvm_mmu_setup(struct kvm_vcpu *vcpu) | ||
1534 | { | ||
1535 | ASSERT(vcpu); | ||
1536 | ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); | ||
1537 | |||
1538 | return init_kvm_mmu(vcpu); | ||
1539 | } | ||
1540 | |||
1541 | void kvm_mmu_destroy(struct kvm_vcpu *vcpu) | ||
1542 | { | ||
1543 | ASSERT(vcpu); | ||
1544 | |||
1545 | destroy_kvm_mmu(vcpu); | ||
1546 | free_mmu_pages(vcpu); | ||
1547 | mmu_free_memory_caches(vcpu); | ||
1548 | } | ||
1549 | |||
1550 | void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) | ||
1551 | { | ||
1552 | struct kvm_mmu_page *sp; | ||
1553 | |||
1554 | list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) { | ||
1555 | int i; | ||
1556 | u64 *pt; | ||
1557 | |||
1558 | if (!test_bit(slot, &sp->slot_bitmap)) | ||
1559 | continue; | ||
1560 | |||
1561 | pt = sp->spt; | ||
1562 | for (i = 0; i < PT64_ENT_PER_PAGE; ++i) | ||
1563 | /* avoid RMW */ | ||
1564 | if (pt[i] & PT_WRITABLE_MASK) | ||
1565 | pt[i] &= ~PT_WRITABLE_MASK; | ||
1566 | } | ||
1567 | } | ||
1568 | |||
1569 | void kvm_mmu_zap_all(struct kvm *kvm) | ||
1570 | { | ||
1571 | struct kvm_mmu_page *sp, *node; | ||
1572 | |||
1573 | list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) | ||
1574 | kvm_mmu_zap_page(kvm, sp); | ||
1575 | |||
1576 | kvm_flush_remote_tlbs(kvm); | ||
1577 | } | ||
1578 | |||
1579 | void kvm_mmu_module_exit(void) | ||
1580 | { | ||
1581 | if (pte_chain_cache) | ||
1582 | kmem_cache_destroy(pte_chain_cache); | ||
1583 | if (rmap_desc_cache) | ||
1584 | kmem_cache_destroy(rmap_desc_cache); | ||
1585 | if (mmu_page_header_cache) | ||
1586 | kmem_cache_destroy(mmu_page_header_cache); | ||
1587 | } | ||
1588 | |||
1589 | int kvm_mmu_module_init(void) | ||
1590 | { | ||
1591 | pte_chain_cache = kmem_cache_create("kvm_pte_chain", | ||
1592 | sizeof(struct kvm_pte_chain), | ||
1593 | 0, 0, NULL); | ||
1594 | if (!pte_chain_cache) | ||
1595 | goto nomem; | ||
1596 | rmap_desc_cache = kmem_cache_create("kvm_rmap_desc", | ||
1597 | sizeof(struct kvm_rmap_desc), | ||
1598 | 0, 0, NULL); | ||
1599 | if (!rmap_desc_cache) | ||
1600 | goto nomem; | ||
1601 | |||
1602 | mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header", | ||
1603 | sizeof(struct kvm_mmu_page), | ||
1604 | 0, 0, NULL); | ||
1605 | if (!mmu_page_header_cache) | ||
1606 | goto nomem; | ||
1607 | |||
1608 | return 0; | ||
1609 | |||
1610 | nomem: | ||
1611 | kvm_mmu_module_exit(); | ||
1612 | return -ENOMEM; | ||
1613 | } | ||
1614 | |||
1615 | /* | ||
1616 | * Calculate the number of mmu pages needed for kvm. | ||
1617 | */ | ||
1618 | unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm) | ||
1619 | { | ||
1620 | int i; | ||
1621 | unsigned int nr_mmu_pages; | ||
1622 | unsigned int nr_pages = 0; | ||
1623 | |||
1624 | for (i = 0; i < kvm->nmemslots; i++) | ||
1625 | nr_pages += kvm->memslots[i].npages; | ||
1626 | |||
1627 | nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000; | ||
1628 | nr_mmu_pages = max(nr_mmu_pages, | ||
1629 | (unsigned int) KVM_MIN_ALLOC_MMU_PAGES); | ||
1630 | |||
1631 | return nr_mmu_pages; | ||
1632 | } | ||
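As a worked example of kvm_mmu_calculate_mmu_pages(), assuming the constants of this era were 20 per mille and a 64-page floor (both values taken as assumptions here), a guest with 1 GiB of memory (262144 4 KiB pages) would be granted 5242 mmu pages:

#include <stdio.h>

/* Assumed values; the kernel defines KVM_PERMILLE_MMU_PAGES and
 * KVM_MIN_ALLOC_MMU_PAGES in its own headers. */
#define PERMILLE_MMU_PAGES   20
#define MIN_ALLOC_MMU_PAGES  64

int main(void)
{
	unsigned int nr_pages = 262144;   /* e.g. a 1 GiB guest in 4 KiB pages */
	unsigned int nr_mmu_pages = nr_pages * PERMILLE_MMU_PAGES / 1000;

	if (nr_mmu_pages < MIN_ALLOC_MMU_PAGES)
		nr_mmu_pages = MIN_ALLOC_MMU_PAGES;
	printf("%u mmu pages\n", nr_mmu_pages);   /* 5242 */
	return 0;
}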
1633 | |||
1634 | #ifdef AUDIT | ||
1635 | |||
1636 | static const char *audit_msg; | ||
1637 | |||
1638 | static gva_t canonicalize(gva_t gva) | ||
1639 | { | ||
1640 | #ifdef CONFIG_X86_64 | ||
1641 | gva = (long long)(gva << 16) >> 16; | ||
1642 | #endif | ||
1643 | return gva; | ||
1644 | } | ||
1645 | |||
1646 | static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte, | ||
1647 | gva_t va, int level) | ||
1648 | { | ||
1649 | u64 *pt = __va(page_pte & PT64_BASE_ADDR_MASK); | ||
1650 | int i; | ||
1651 | gva_t va_delta = 1ul << (PAGE_SHIFT + 9 * (level - 1)); | ||
1652 | |||
1653 | for (i = 0; i < PT64_ENT_PER_PAGE; ++i, va += va_delta) { | ||
1654 | u64 ent = pt[i]; | ||
1655 | |||
1656 | if (ent == shadow_trap_nonpresent_pte) | ||
1657 | continue; | ||
1658 | |||
1659 | va = canonicalize(va); | ||
1660 | if (level > 1) { | ||
1661 | if (ent == shadow_notrap_nonpresent_pte) | ||
1662 | printk(KERN_ERR "audit: (%s) nontrapping pte" | ||
1663 | " in nonleaf level: levels %d gva %lx" | ||
1664 | " level %d pte %llx\n", audit_msg, | ||
1665 | vcpu->arch.mmu.root_level, va, level, ent); | ||
1666 | |||
1667 | audit_mappings_page(vcpu, ent, va, level - 1); | ||
1668 | } else { | ||
1669 | gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, va); | ||
1670 | struct page *page = gpa_to_page(vcpu, gpa); | ||
1671 | hpa_t hpa = page_to_phys(page); | ||
1672 | |||
1673 | if (is_shadow_present_pte(ent) | ||
1674 | && (ent & PT64_BASE_ADDR_MASK) != hpa) | ||
1675 | printk(KERN_ERR "xx audit error: (%s) levels %d" | ||
1676 | " gva %lx gpa %llx hpa %llx ent %llx %d\n", | ||
1677 | audit_msg, vcpu->arch.mmu.root_level, | ||
1678 | va, gpa, hpa, ent, | ||
1679 | is_shadow_present_pte(ent)); | ||
1680 | else if (ent == shadow_notrap_nonpresent_pte | ||
1681 | && !is_error_hpa(hpa)) | ||
1682 | printk(KERN_ERR "audit: (%s) notrap shadow," | ||
1683 | " valid guest gva %lx\n", audit_msg, va); | ||
1684 | kvm_release_page_clean(page); | ||
1685 | |||
1686 | } | ||
1687 | } | ||
1688 | } | ||
1689 | |||
1690 | static void audit_mappings(struct kvm_vcpu *vcpu) | ||
1691 | { | ||
1692 | unsigned i; | ||
1693 | |||
1694 | if (vcpu->arch.mmu.root_level == 4) | ||
1695 | audit_mappings_page(vcpu, vcpu->arch.mmu.root_hpa, 0, 4); | ||
1696 | else | ||
1697 | for (i = 0; i < 4; ++i) | ||
1698 | if (vcpu->arch.mmu.pae_root[i] & PT_PRESENT_MASK) | ||
1699 | audit_mappings_page(vcpu, | ||
1700 | vcpu->arch.mmu.pae_root[i], | ||
1701 | i << 30, | ||
1702 | 2); | ||
1703 | } | ||
1704 | |||
1705 | static int count_rmaps(struct kvm_vcpu *vcpu) | ||
1706 | { | ||
1707 | int nmaps = 0; | ||
1708 | int i, j, k; | ||
1709 | |||
1710 | for (i = 0; i < KVM_MEMORY_SLOTS; ++i) { | ||
1711 | struct kvm_memory_slot *m = &vcpu->kvm->memslots[i]; | ||
1712 | struct kvm_rmap_desc *d; | ||
1713 | |||
1714 | for (j = 0; j < m->npages; ++j) { | ||
1715 | unsigned long *rmapp = &m->rmap[j]; | ||
1716 | |||
1717 | if (!*rmapp) | ||
1718 | continue; | ||
1719 | if (!(*rmapp & 1)) { | ||
1720 | ++nmaps; | ||
1721 | continue; | ||
1722 | } | ||
1723 | d = (struct kvm_rmap_desc *)(*rmapp & ~1ul); | ||
1724 | while (d) { | ||
1725 | for (k = 0; k < RMAP_EXT; ++k) | ||
1726 | if (d->shadow_ptes[k]) | ||
1727 | ++nmaps; | ||
1728 | else | ||
1729 | break; | ||
1730 | d = d->more; | ||
1731 | } | ||
1732 | } | ||
1733 | } | ||
1734 | return nmaps; | ||
1735 | } | ||
1736 | |||
1737 | static int count_writable_mappings(struct kvm_vcpu *vcpu) | ||
1738 | { | ||
1739 | int nmaps = 0; | ||
1740 | struct kvm_mmu_page *sp; | ||
1741 | int i; | ||
1742 | |||
1743 | list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) { | ||
1744 | u64 *pt = sp->spt; | ||
1745 | |||
1746 | if (sp->role.level != PT_PAGE_TABLE_LEVEL) | ||
1747 | continue; | ||
1748 | |||
1749 | for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { | ||
1750 | u64 ent = pt[i]; | ||
1751 | |||
1752 | if (!(ent & PT_PRESENT_MASK)) | ||
1753 | continue; | ||
1754 | if (!(ent & PT_WRITABLE_MASK)) | ||
1755 | continue; | ||
1756 | ++nmaps; | ||
1757 | } | ||
1758 | } | ||
1759 | return nmaps; | ||
1760 | } | ||
1761 | |||
1762 | static void audit_rmap(struct kvm_vcpu *vcpu) | ||
1763 | { | ||
1764 | int n_rmap = count_rmaps(vcpu); | ||
1765 | int n_actual = count_writable_mappings(vcpu); | ||
1766 | |||
1767 | if (n_rmap != n_actual) | ||
1768 | printk(KERN_ERR "%s: (%s) rmap %d actual %d\n", | ||
1769 | __FUNCTION__, audit_msg, n_rmap, n_actual); | ||
1770 | } | ||
1771 | |||
1772 | static void audit_write_protection(struct kvm_vcpu *vcpu) | ||
1773 | { | ||
1774 | struct kvm_mmu_page *sp; | ||
1775 | struct kvm_memory_slot *slot; | ||
1776 | unsigned long *rmapp; | ||
1777 | gfn_t gfn; | ||
1778 | |||
1779 | list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) { | ||
1780 | if (sp->role.metaphysical) | ||
1781 | continue; | ||
1782 | |||
1783 | slot = gfn_to_memslot(vcpu->kvm, sp->gfn); | ||
1784 | gfn = unalias_gfn(vcpu->kvm, sp->gfn); | ||
1785 | rmapp = &slot->rmap[gfn - slot->base_gfn]; | ||
1786 | if (*rmapp) | ||
1787 | printk(KERN_ERR "%s: (%s) shadow page has writable" | ||
1788 | " mappings: gfn %lx role %x\n", | ||
1789 | __FUNCTION__, audit_msg, sp->gfn, | ||
1790 | sp->role.word); | ||
1791 | } | ||
1792 | } | ||
1793 | |||
1794 | static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) | ||
1795 | { | ||
1796 | int olddbg = dbg; | ||
1797 | |||
1798 | dbg = 0; | ||
1799 | audit_msg = msg; | ||
1800 | audit_rmap(vcpu); | ||
1801 | audit_write_protection(vcpu); | ||
1802 | audit_mappings(vcpu); | ||
1803 | dbg = olddbg; | ||
1804 | } | ||
1805 | |||
1806 | #endif | ||
diff --git a/drivers/kvm/mmu.h b/drivers/kvm/mmu.h deleted file mode 100644 index cbfc272262df..000000000000 --- a/drivers/kvm/mmu.h +++ /dev/null | |||
@@ -1,44 +0,0 @@ | |||
1 | #ifndef __KVM_X86_MMU_H | ||
2 | #define __KVM_X86_MMU_H | ||
3 | |||
4 | #include "kvm.h" | ||
5 | |||
6 | static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) | ||
7 | { | ||
8 | if (unlikely(vcpu->kvm->arch.n_free_mmu_pages < KVM_MIN_FREE_MMU_PAGES)) | ||
9 | __kvm_mmu_free_some_pages(vcpu); | ||
10 | } | ||
11 | |||
12 | static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu) | ||
13 | { | ||
14 | if (likely(vcpu->arch.mmu.root_hpa != INVALID_PAGE)) | ||
15 | return 0; | ||
16 | |||
17 | return kvm_mmu_load(vcpu); | ||
18 | } | ||
19 | |||
20 | static inline int is_long_mode(struct kvm_vcpu *vcpu) | ||
21 | { | ||
22 | #ifdef CONFIG_X86_64 | ||
23 | return vcpu->arch.shadow_efer & EFER_LME; | ||
24 | #else | ||
25 | return 0; | ||
26 | #endif | ||
27 | } | ||
28 | |||
29 | static inline int is_pae(struct kvm_vcpu *vcpu) | ||
30 | { | ||
31 | return vcpu->arch.cr4 & X86_CR4_PAE; | ||
32 | } | ||
33 | |||
34 | static inline int is_pse(struct kvm_vcpu *vcpu) | ||
35 | { | ||
36 | return vcpu->arch.cr4 & X86_CR4_PSE; | ||
37 | } | ||
38 | |||
39 | static inline int is_paging(struct kvm_vcpu *vcpu) | ||
40 | { | ||
41 | return vcpu->arch.cr0 & X86_CR0_PG; | ||
42 | } | ||
43 | |||
44 | #endif | ||
diff --git a/drivers/kvm/paging_tmpl.h b/drivers/kvm/paging_tmpl.h deleted file mode 100644 index 56b88f7e83ef..000000000000 --- a/drivers/kvm/paging_tmpl.h +++ /dev/null | |||
@@ -1,461 +0,0 @@ | |||
1 | /* | ||
2 | * Kernel-based Virtual Machine driver for Linux | ||
3 | * | ||
4 | * This module enables machines with Intel VT-x extensions to run virtual | ||
5 | * machines without emulation or binary translation. | ||
6 | * | ||
7 | * MMU support | ||
8 | * | ||
9 | * Copyright (C) 2006 Qumranet, Inc. | ||
10 | * | ||
11 | * Authors: | ||
12 | * Yaniv Kamay <yaniv@qumranet.com> | ||
13 | * Avi Kivity <avi@qumranet.com> | ||
14 | * | ||
15 | * This work is licensed under the terms of the GNU GPL, version 2. See | ||
16 | * the COPYING file in the top-level directory. | ||
17 | * | ||
18 | */ | ||
19 | |||
20 | /* | ||
21 | * We need the mmu code to access both 32-bit and 64-bit guest ptes, | ||
22 | * so the code in this file is compiled twice, once per pte size. | ||
23 | */ | ||
24 | |||
25 | #if PTTYPE == 64 | ||
26 | #define pt_element_t u64 | ||
27 | #define guest_walker guest_walker64 | ||
28 | #define FNAME(name) paging##64_##name | ||
29 | #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK | ||
30 | #define PT_DIR_BASE_ADDR_MASK PT64_DIR_BASE_ADDR_MASK | ||
31 | #define PT_INDEX(addr, level) PT64_INDEX(addr, level) | ||
32 | #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) | ||
33 | #define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level) | ||
34 | #define PT_LEVEL_BITS PT64_LEVEL_BITS | ||
35 | #ifdef CONFIG_X86_64 | ||
36 | #define PT_MAX_FULL_LEVELS 4 | ||
37 | #define CMPXCHG cmpxchg | ||
38 | #else | ||
39 | #define CMPXCHG cmpxchg64 | ||
40 | #define PT_MAX_FULL_LEVELS 2 | ||
41 | #endif | ||
42 | #elif PTTYPE == 32 | ||
43 | #define pt_element_t u32 | ||
44 | #define guest_walker guest_walker32 | ||
45 | #define FNAME(name) paging##32_##name | ||
46 | #define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK | ||
47 | #define PT_DIR_BASE_ADDR_MASK PT32_DIR_BASE_ADDR_MASK | ||
48 | #define PT_INDEX(addr, level) PT32_INDEX(addr, level) | ||
49 | #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) | ||
50 | #define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level) | ||
51 | #define PT_LEVEL_BITS PT32_LEVEL_BITS | ||
52 | #define PT_MAX_FULL_LEVELS 2 | ||
53 | #define CMPXCHG cmpxchg | ||
54 | #else | ||
55 | #error Invalid PTTYPE value | ||
56 | #endif | ||
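The PTTYPE block above is a preprocessor "template": paging_tmpl.h is compiled twice (see the two #include lines in mmu.c), once with 64-bit and once with 32-bit guest pte types, producing paging64_* and paging32_* flavours of every FNAME() function. A tiny self-contained imitation of the pattern, folded into one translation unit with a macro instead of a second file (names are hypothetical):

#include <stdint.h>
#include <stdio.h>

/* Stand-in for the paging_tmpl.h body: one definition, two instantiations. */
#define DEFINE_WALKER(PTTYPE, pt_element_t)                            \
	static pt_element_t walker##PTTYPE##_mask(pt_element_t pte)    \
	{                                                               \
		return pte & ~(pt_element_t)0xfff;                      \
	}

DEFINE_WALKER(64, uint64_t)   /* paging64_* flavour */
DEFINE_WALKER(32, uint32_t)   /* paging32_* flavour */

int main(void)
{
	printf("%llx %x\n",
	       (unsigned long long)walker64_mask(0x123456789abcULL),
	       walker32_mask(0x12345678u));
	return 0;
}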
57 | |||
58 | #define gpte_to_gfn FNAME(gpte_to_gfn) | ||
59 | #define gpte_to_gfn_pde FNAME(gpte_to_gfn_pde) | ||
60 | |||
61 | /* | ||
62 | * The guest_walker structure emulates the behavior of the hardware page | ||
63 | * table walker. | ||
64 | */ | ||
65 | struct guest_walker { | ||
66 | int level; | ||
67 | gfn_t table_gfn[PT_MAX_FULL_LEVELS]; | ||
68 | pt_element_t ptes[PT_MAX_FULL_LEVELS]; | ||
69 | gpa_t pte_gpa[PT_MAX_FULL_LEVELS]; | ||
70 | unsigned pt_access; | ||
71 | unsigned pte_access; | ||
72 | gfn_t gfn; | ||
73 | u32 error_code; | ||
74 | }; | ||
75 | |||
76 | static gfn_t gpte_to_gfn(pt_element_t gpte) | ||
77 | { | ||
78 | return (gpte & PT_BASE_ADDR_MASK) >> PAGE_SHIFT; | ||
79 | } | ||
80 | |||
81 | static gfn_t gpte_to_gfn_pde(pt_element_t gpte) | ||
82 | { | ||
83 | return (gpte & PT_DIR_BASE_ADDR_MASK) >> PAGE_SHIFT; | ||
84 | } | ||
85 | |||
86 | static bool FNAME(cmpxchg_gpte)(struct kvm *kvm, | ||
87 | gfn_t table_gfn, unsigned index, | ||
88 | pt_element_t orig_pte, pt_element_t new_pte) | ||
89 | { | ||
90 | pt_element_t ret; | ||
91 | pt_element_t *table; | ||
92 | struct page *page; | ||
93 | |||
94 | page = gfn_to_page(kvm, table_gfn); | ||
95 | table = kmap_atomic(page, KM_USER0); | ||
96 | |||
97 | ret = CMPXCHG(&table[index], orig_pte, new_pte); | ||
98 | |||
99 | kunmap_atomic(table, KM_USER0); | ||
100 | |||
101 | kvm_release_page_dirty(page); | ||
102 | |||
103 | return (ret != orig_pte); | ||
104 | } | ||
105 | |||
106 | static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte) | ||
107 | { | ||
108 | unsigned access; | ||
109 | |||
110 | access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK; | ||
111 | #if PTTYPE == 64 | ||
112 | if (is_nx(vcpu)) | ||
113 | access &= ~(gpte >> PT64_NX_SHIFT); | ||
114 | #endif | ||
115 | return access; | ||
116 | } | ||
117 | |||
118 | /* | ||
119 | * Fetch a guest pte for a guest virtual address | ||
120 | */ | ||
121 | static int FNAME(walk_addr)(struct guest_walker *walker, | ||
122 | struct kvm_vcpu *vcpu, gva_t addr, | ||
123 | int write_fault, int user_fault, int fetch_fault) | ||
124 | { | ||
125 | pt_element_t pte; | ||
126 | gfn_t table_gfn; | ||
127 | unsigned index, pt_access, pte_access; | ||
128 | gpa_t pte_gpa; | ||
129 | |||
130 | pgprintk("%s: addr %lx\n", __FUNCTION__, addr); | ||
131 | walk: | ||
132 | walker->level = vcpu->arch.mmu.root_level; | ||
133 | pte = vcpu->arch.cr3; | ||
134 | #if PTTYPE == 64 | ||
135 | if (!is_long_mode(vcpu)) { | ||
136 | pte = vcpu->arch.pdptrs[(addr >> 30) & 3]; | ||
137 | if (!is_present_pte(pte)) | ||
138 | goto not_present; | ||
139 | --walker->level; | ||
140 | } | ||
141 | #endif | ||
142 | ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) || | ||
143 | (vcpu->cr3 & CR3_NONPAE_RESERVED_BITS) == 0); | ||
144 | |||
145 | pt_access = ACC_ALL; | ||
146 | |||
147 | for (;;) { | ||
148 | index = PT_INDEX(addr, walker->level); | ||
149 | |||
150 | table_gfn = gpte_to_gfn(pte); | ||
151 | pte_gpa = gfn_to_gpa(table_gfn); | ||
152 | pte_gpa += index * sizeof(pt_element_t); | ||
153 | walker->table_gfn[walker->level - 1] = table_gfn; | ||
154 | walker->pte_gpa[walker->level - 1] = pte_gpa; | ||
155 | pgprintk("%s: table_gfn[%d] %lx\n", __FUNCTION__, | ||
156 | walker->level - 1, table_gfn); | ||
157 | |||
158 | kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte)); | ||
159 | |||
160 | if (!is_present_pte(pte)) | ||
161 | goto not_present; | ||
162 | |||
163 | if (write_fault && !is_writeble_pte(pte)) | ||
164 | if (user_fault || is_write_protection(vcpu)) | ||
165 | goto access_error; | ||
166 | |||
167 | if (user_fault && !(pte & PT_USER_MASK)) | ||
168 | goto access_error; | ||
169 | |||
170 | #if PTTYPE == 64 | ||
171 | if (fetch_fault && is_nx(vcpu) && (pte & PT64_NX_MASK)) | ||
172 | goto access_error; | ||
173 | #endif | ||
174 | |||
175 | if (!(pte & PT_ACCESSED_MASK)) { | ||
176 | mark_page_dirty(vcpu->kvm, table_gfn); | ||
177 | if (FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, | ||
178 | index, pte, pte|PT_ACCESSED_MASK)) | ||
179 | goto walk; | ||
180 | pte |= PT_ACCESSED_MASK; | ||
181 | } | ||
182 | |||
183 | pte_access = pt_access & FNAME(gpte_access)(vcpu, pte); | ||
184 | |||
185 | walker->ptes[walker->level - 1] = pte; | ||
186 | |||
187 | if (walker->level == PT_PAGE_TABLE_LEVEL) { | ||
188 | walker->gfn = gpte_to_gfn(pte); | ||
189 | break; | ||
190 | } | ||
191 | |||
192 | if (walker->level == PT_DIRECTORY_LEVEL | ||
193 | && (pte & PT_PAGE_SIZE_MASK) | ||
194 | && (PTTYPE == 64 || is_pse(vcpu))) { | ||
195 | walker->gfn = gpte_to_gfn_pde(pte); | ||
196 | walker->gfn += PT_INDEX(addr, PT_PAGE_TABLE_LEVEL); | ||
197 | if (PTTYPE == 32 && is_cpuid_PSE36()) | ||
198 | walker->gfn += pse36_gfn_delta(pte); | ||
199 | break; | ||
200 | } | ||
201 | |||
202 | pt_access = pte_access; | ||
203 | --walker->level; | ||
204 | } | ||
205 | |||
206 | if (write_fault && !is_dirty_pte(pte)) { | ||
207 | bool ret; | ||
208 | |||
209 | mark_page_dirty(vcpu->kvm, table_gfn); | ||
210 | ret = FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, index, pte, | ||
211 | pte|PT_DIRTY_MASK); | ||
212 | if (ret) | ||
213 | goto walk; | ||
214 | pte |= PT_DIRTY_MASK; | ||
215 | kvm_mmu_pte_write(vcpu, pte_gpa, (u8 *)&pte, sizeof(pte)); | ||
216 | walker->ptes[walker->level - 1] = pte; | ||
217 | } | ||
218 | |||
219 | walker->pt_access = pt_access; | ||
220 | walker->pte_access = pte_access; | ||
221 | pgprintk("%s: pte %llx pte_access %x pt_access %x\n", | ||
222 | __FUNCTION__, (u64)pte, pt_access, pte_access); | ||
223 | return 1; | ||
224 | |||
225 | not_present: | ||
226 | walker->error_code = 0; | ||
227 | goto err; | ||
228 | |||
229 | access_error: | ||
230 | walker->error_code = PFERR_PRESENT_MASK; | ||
231 | |||
232 | err: | ||
233 | if (write_fault) | ||
234 | walker->error_code |= PFERR_WRITE_MASK; | ||
235 | if (user_fault) | ||
236 | walker->error_code |= PFERR_USER_MASK; | ||
237 | if (fetch_fault) | ||
238 | walker->error_code |= PFERR_FETCH_MASK; | ||
239 | return 0; | ||
240 | } | ||
241 | |||
242 | static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, | ||
243 | u64 *spte, const void *pte, int bytes, | ||
244 | int offset_in_pte) | ||
245 | { | ||
246 | pt_element_t gpte; | ||
247 | unsigned pte_access; | ||
248 | |||
249 | gpte = *(const pt_element_t *)pte; | ||
250 | if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) { | ||
251 | if (!offset_in_pte && !is_present_pte(gpte)) | ||
252 | set_shadow_pte(spte, shadow_notrap_nonpresent_pte); | ||
253 | return; | ||
254 | } | ||
255 | if (bytes < sizeof(pt_element_t)) | ||
256 | return; | ||
257 | pgprintk("%s: gpte %llx spte %p\n", __FUNCTION__, (u64)gpte, spte); | ||
258 | pte_access = page->role.access & FNAME(gpte_access)(vcpu, gpte); | ||
259 | mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0, | ||
260 | gpte & PT_DIRTY_MASK, NULL, gpte_to_gfn(gpte)); | ||
261 | } | ||
262 | |||
263 | /* | ||
264 | * Fetch a shadow pte for a specific level in the paging hierarchy. | ||
265 | */ | ||
266 | static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, | ||
267 | struct guest_walker *walker, | ||
268 | int user_fault, int write_fault, int *ptwrite) | ||
269 | { | ||
270 | hpa_t shadow_addr; | ||
271 | int level; | ||
272 | u64 *shadow_ent; | ||
273 | unsigned access = walker->pt_access; | ||
274 | |||
275 | if (!is_present_pte(walker->ptes[walker->level - 1])) | ||
276 | return NULL; | ||
277 | |||
278 | shadow_addr = vcpu->arch.mmu.root_hpa; | ||
279 | level = vcpu->arch.mmu.shadow_root_level; | ||
280 | if (level == PT32E_ROOT_LEVEL) { | ||
281 | shadow_addr = vcpu->arch.mmu.pae_root[(addr >> 30) & 3]; | ||
282 | shadow_addr &= PT64_BASE_ADDR_MASK; | ||
283 | --level; | ||
284 | } | ||
285 | |||
286 | for (; ; level--) { | ||
287 | u32 index = SHADOW_PT_INDEX(addr, level); | ||
288 | struct kvm_mmu_page *shadow_page; | ||
289 | u64 shadow_pte; | ||
290 | int metaphysical; | ||
291 | gfn_t table_gfn; | ||
292 | bool new_page = 0; | ||
293 | |||
294 | shadow_ent = ((u64 *)__va(shadow_addr)) + index; | ||
295 | if (is_shadow_present_pte(*shadow_ent)) { | ||
296 | if (level == PT_PAGE_TABLE_LEVEL) | ||
297 | break; | ||
298 | shadow_addr = *shadow_ent & PT64_BASE_ADDR_MASK; | ||
299 | continue; | ||
300 | } | ||
301 | |||
302 | if (level == PT_PAGE_TABLE_LEVEL) | ||
303 | break; | ||
304 | |||
305 | if (level - 1 == PT_PAGE_TABLE_LEVEL | ||
306 | && walker->level == PT_DIRECTORY_LEVEL) { | ||
307 | metaphysical = 1; | ||
308 | if (!is_dirty_pte(walker->ptes[level - 1])) | ||
309 | access &= ~ACC_WRITE_MASK; | ||
310 | table_gfn = gpte_to_gfn(walker->ptes[level - 1]); | ||
311 | } else { | ||
312 | metaphysical = 0; | ||
313 | table_gfn = walker->table_gfn[level - 2]; | ||
314 | } | ||
315 | shadow_page = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1, | ||
316 | metaphysical, access, | ||
317 | shadow_ent, &new_page); | ||
318 | if (new_page && !metaphysical) { | ||
319 | pt_element_t curr_pte; | ||
320 | kvm_read_guest(vcpu->kvm, walker->pte_gpa[level - 2], | ||
321 | &curr_pte, sizeof(curr_pte)); | ||
322 | if (curr_pte != walker->ptes[level - 2]) | ||
323 | return NULL; | ||
324 | } | ||
325 | shadow_addr = __pa(shadow_page->spt); | ||
326 | shadow_pte = shadow_addr | PT_PRESENT_MASK | PT_ACCESSED_MASK | ||
327 | | PT_WRITABLE_MASK | PT_USER_MASK; | ||
328 | *shadow_ent = shadow_pte; | ||
329 | } | ||
330 | |||
331 | mmu_set_spte(vcpu, shadow_ent, access, walker->pte_access & access, | ||
332 | user_fault, write_fault, | ||
333 | walker->ptes[walker->level-1] & PT_DIRTY_MASK, | ||
334 | ptwrite, walker->gfn); | ||
335 | |||
336 | return shadow_ent; | ||
337 | } | ||
338 | |||
339 | /* | ||
340 | * Page fault handler. There are several causes for a page fault: | ||
341 | * - there is no shadow pte for the guest pte | ||
342 | * - write access through a shadow pte marked read only so that we can set | ||
343 | * the dirty bit | ||
344 | * - write access to a shadow pte marked read only so we can update the page | ||
345 | * dirty bitmap when userspace requests it | ||
346 | * - mmio access; in this case we will never install a present shadow pte | ||
347 | * - normal guest page fault due to the guest pte marked not present, not | ||
348 | * writable, or not executable | ||
349 | * | ||
350 | * Returns: 1 if we need to emulate the instruction, 0 otherwise, or | ||
351 | * a negative value on error. | ||
352 | */ | ||
353 | static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, | ||
354 | u32 error_code) | ||
355 | { | ||
356 | int write_fault = error_code & PFERR_WRITE_MASK; | ||
357 | int user_fault = error_code & PFERR_USER_MASK; | ||
358 | int fetch_fault = error_code & PFERR_FETCH_MASK; | ||
359 | struct guest_walker walker; | ||
360 | u64 *shadow_pte; | ||
361 | int write_pt = 0; | ||
362 | int r; | ||
363 | |||
364 | pgprintk("%s: addr %lx err %x\n", __FUNCTION__, addr, error_code); | ||
365 | kvm_mmu_audit(vcpu, "pre page fault"); | ||
366 | |||
367 | r = mmu_topup_memory_caches(vcpu); | ||
368 | if (r) | ||
369 | return r; | ||
370 | |||
371 | /* | ||
372 | * Look up the shadow pte for the faulting address. | ||
373 | */ | ||
374 | r = FNAME(walk_addr)(&walker, vcpu, addr, write_fault, user_fault, | ||
375 | fetch_fault); | ||
376 | |||
377 | /* | ||
378 | * The page is not mapped by the guest. Let the guest handle it. | ||
379 | */ | ||
380 | if (!r) { | ||
381 | pgprintk("%s: guest page fault\n", __FUNCTION__); | ||
382 | inject_page_fault(vcpu, addr, walker.error_code); | ||
383 | vcpu->arch.last_pt_write_count = 0; /* reset fork detector */ | ||
384 | return 0; | ||
385 | } | ||
386 | |||
387 | shadow_pte = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault, | ||
388 | &write_pt); | ||
389 | pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __FUNCTION__, | ||
390 | shadow_pte, *shadow_pte, write_pt); | ||
391 | |||
392 | if (!write_pt) | ||
393 | vcpu->arch.last_pt_write_count = 0; /* reset fork detector */ | ||
394 | |||
395 | /* | ||
396 | * mmio: emulate if accessible, otherwise it's a guest fault. | ||
397 | */ | ||
398 | if (shadow_pte && is_io_pte(*shadow_pte)) | ||
399 | return 1; | ||
400 | |||
401 | ++vcpu->stat.pf_fixed; | ||
402 | kvm_mmu_audit(vcpu, "post page fault (fixed)"); | ||
403 | |||
404 | return write_pt; | ||
405 | } | ||
406 | |||
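The fault handler removed above folds three outcomes into its return value: a negative value for an internal error, 1 when the access has to be emulated (an mmio mapping, or a write that hit a shadowed guest page table), and 0 when the fault was either fixed in the shadow tables or reflected back into the guest. A minimal, self-contained sketch of how a caller could branch on that convention (this is not the kernel's actual call site; handle_page_fault() is a hypothetical stand-in):

/* Hypothetical sketch of interpreting the three-way return value. */
#include <stdio.h>

enum fault_result { FAULT_ERROR = -1, FAULT_FIXED = 0, FAULT_EMULATE = 1 };

/* stand-in for the paging-mode page fault handler; always "fixed" here */
static int handle_page_fault(unsigned long addr, unsigned int error_code)
{
	(void)addr;
	(void)error_code;
	return FAULT_FIXED;
}

int main(void)
{
	int r = handle_page_fault(0xdeadb000UL, 0x2 /* write fault */);

	if (r < 0)
		printf("internal error, propagate to caller\n");
	else if (r == FAULT_EMULATE)
		printf("emulate the faulting instruction\n");
	else
		printf("shadow PTE fixed or fault injected; resume guest\n");
	return 0;
}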
407 | static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr) | ||
408 | { | ||
409 | struct guest_walker walker; | ||
410 | gpa_t gpa = UNMAPPED_GVA; | ||
411 | int r; | ||
412 | |||
413 | r = FNAME(walk_addr)(&walker, vcpu, vaddr, 0, 0, 0); | ||
414 | |||
415 | if (r) { | ||
416 | gpa = gfn_to_gpa(walker.gfn); | ||
417 | gpa |= vaddr & ~PAGE_MASK; | ||
418 | } | ||
419 | |||
420 | return gpa; | ||
421 | } | ||
422 | |||
423 | static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu, | ||
424 | struct kvm_mmu_page *sp) | ||
425 | { | ||
426 | int i, offset = 0; | ||
427 | pt_element_t *gpt; | ||
428 | struct page *page; | ||
429 | |||
430 | if (sp->role.metaphysical | ||
431 | || (PTTYPE == 32 && sp->role.level > PT_PAGE_TABLE_LEVEL)) { | ||
432 | nonpaging_prefetch_page(vcpu, sp); | ||
433 | return; | ||
434 | } | ||
435 | |||
436 | if (PTTYPE == 32) | ||
437 | offset = sp->role.quadrant << PT64_LEVEL_BITS; | ||
438 | page = gfn_to_page(vcpu->kvm, sp->gfn); | ||
439 | gpt = kmap_atomic(page, KM_USER0); | ||
440 | for (i = 0; i < PT64_ENT_PER_PAGE; ++i) | ||
441 | if (is_present_pte(gpt[offset + i])) | ||
442 | sp->spt[i] = shadow_trap_nonpresent_pte; | ||
443 | else | ||
444 | sp->spt[i] = shadow_notrap_nonpresent_pte; | ||
445 | kunmap_atomic(gpt, KM_USER0); | ||
446 | kvm_release_page_clean(page); | ||
447 | } | ||
448 | |||
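The prefetch routine above pre-fills a freshly shadowed page with one of two distinct not-present markers: slots whose guest PTE is present get a "trap" value (a later access must fall into the MMU so the shadow entry can be built), while slots the guest itself left unmapped get a "notrap" value (the fault can be reflected straight to the guest). A small sketch of that idea, assuming two arbitrary sentinel values rather than the real shadow_trap/shadow_notrap encodings:

#include <stdint.h>
#include <stdio.h>

/* Illustrative sentinels only; the real values are chosen so hardware
 * sees both as not-present while the MMU can still tell them apart. */
#define TRAP_NONPRESENT   0x0ULL /* guest PTE present, shadow built on fault */
#define NOTRAP_NONPRESENT 0x2ULL /* guest PTE absent, fault goes to the guest */
#define GUEST_PRESENT     0x1ULL

int main(void)
{
	uint64_t guest_pt[4] = { GUEST_PRESENT, 0, GUEST_PRESENT, 0 };
	uint64_t shadow_pt[4];
	int i;

	for (i = 0; i < 4; i++)
		shadow_pt[i] = (guest_pt[i] & GUEST_PRESENT) ?
				TRAP_NONPRESENT : NOTRAP_NONPRESENT;

	for (i = 0; i < 4; i++)
		printf("slot %d: %s\n", i,
		       shadow_pt[i] == TRAP_NONPRESENT ? "trap" : "notrap");
	return 0;
}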
449 | #undef pt_element_t | ||
450 | #undef guest_walker | ||
451 | #undef FNAME | ||
452 | #undef PT_BASE_ADDR_MASK | ||
453 | #undef PT_INDEX | ||
454 | #undef SHADOW_PT_INDEX | ||
455 | #undef PT_LEVEL_MASK | ||
456 | #undef PT_DIR_BASE_ADDR_MASK | ||
457 | #undef PT_LEVEL_BITS | ||
458 | #undef PT_MAX_FULL_LEVELS | ||
459 | #undef gpte_to_gfn | ||
460 | #undef gpte_to_gfn_pde | ||
461 | #undef CMPXCHG | ||
diff --git a/drivers/kvm/segment_descriptor.h b/drivers/kvm/segment_descriptor.h deleted file mode 100644 index 56fc4c873389..000000000000 --- a/drivers/kvm/segment_descriptor.h +++ /dev/null | |||
@@ -1,29 +0,0 @@ | |||
1 | #ifndef __SEGMENT_DESCRIPTOR_H | ||
2 | #define __SEGMENT_DESCRIPTOR_H | ||
3 | |||
4 | struct segment_descriptor { | ||
5 | u16 limit_low; | ||
6 | u16 base_low; | ||
7 | u8 base_mid; | ||
8 | u8 type : 4; | ||
9 | u8 system : 1; | ||
10 | u8 dpl : 2; | ||
11 | u8 present : 1; | ||
12 | u8 limit_high : 4; | ||
13 | u8 avl : 1; | ||
14 | u8 long_mode : 1; | ||
15 | u8 default_op : 1; | ||
16 | u8 granularity : 1; | ||
17 | u8 base_high; | ||
18 | } __attribute__((packed)); | ||
19 | |||
20 | #ifdef CONFIG_X86_64 | ||
21 | /* LDT or TSS descriptor in the GDT. 16 bytes. */ | ||
22 | struct segment_descriptor_64 { | ||
23 | struct segment_descriptor s; | ||
24 | u32 base_higher; | ||
25 | u32 pad_zero; | ||
26 | }; | ||
27 | |||
28 | #endif | ||
29 | #endif | ||
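The descriptor layout deleted above splits the 32-bit segment base across base_low, base_mid and base_high. A self-contained sketch (using stdint types in place of the kernel's u8/u16) showing how those pieces reassemble, and that the packed structure is the architectural 8 bytes:

#include <stdint.h>
#include <stdio.h>

/* Local copy of the layout, for illustration only. */
struct seg_desc {
	uint16_t limit_low;
	uint16_t base_low;
	uint8_t  base_mid;
	uint8_t  type : 4, system : 1, dpl : 2, present : 1;
	uint8_t  limit_high : 4, avl : 1, long_mode : 1, default_op : 1, granularity : 1;
	uint8_t  base_high;
} __attribute__((packed));

static uint32_t seg_base(const struct seg_desc *d)
{
	return d->base_low | ((uint32_t)d->base_mid << 16) |
	       ((uint32_t)d->base_high << 24);
}

int main(void)
{
	struct seg_desc d = { .base_low = 0x5678, .base_mid = 0x34, .base_high = 0x12 };

	printf("base = 0x%08x\n", seg_base(&d)); /* 0x12345678 */
	printf("size = %zu bytes\n", sizeof(d)); /* 8 */
	return 0;
}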
diff --git a/drivers/kvm/svm.c b/drivers/kvm/svm.c deleted file mode 100644 index e606f6d18669..000000000000 --- a/drivers/kvm/svm.c +++ /dev/null | |||
@@ -1,1725 +0,0 @@ | |||
1 | /* | ||
2 | * Kernel-based Virtual Machine driver for Linux | ||
3 | * | ||
4 | * AMD SVM support | ||
5 | * | ||
6 | * Copyright (C) 2006 Qumranet, Inc. | ||
7 | * | ||
8 | * Authors: | ||
9 | * Yaniv Kamay <yaniv@qumranet.com> | ||
10 | * Avi Kivity <avi@qumranet.com> | ||
11 | * | ||
12 | * This work is licensed under the terms of the GNU GPL, version 2. See | ||
13 | * the COPYING file in the top-level directory. | ||
14 | * | ||
15 | */ | ||
16 | #include "x86.h" | ||
17 | #include "kvm_svm.h" | ||
18 | #include "x86_emulate.h" | ||
19 | #include "irq.h" | ||
20 | #include "mmu.h" | ||
21 | |||
22 | #include <linux/module.h> | ||
23 | #include <linux/kernel.h> | ||
24 | #include <linux/vmalloc.h> | ||
25 | #include <linux/highmem.h> | ||
26 | #include <linux/sched.h> | ||
27 | |||
28 | #include <asm/desc.h> | ||
29 | |||
30 | MODULE_AUTHOR("Qumranet"); | ||
31 | MODULE_LICENSE("GPL"); | ||
32 | |||
33 | #define IOPM_ALLOC_ORDER 2 | ||
34 | #define MSRPM_ALLOC_ORDER 1 | ||
35 | |||
36 | #define DB_VECTOR 1 | ||
37 | #define UD_VECTOR 6 | ||
38 | #define GP_VECTOR 13 | ||
39 | |||
40 | #define DR7_GD_MASK (1 << 13) | ||
41 | #define DR6_BD_MASK (1 << 13) | ||
42 | |||
43 | #define SEG_TYPE_LDT 2 | ||
44 | #define SEG_TYPE_BUSY_TSS16 3 | ||
45 | |||
46 | #define SVM_FEATURE_NPT (1 << 0) | ||
47 | #define SVM_FEATURE_LBRV (1 << 1) | ||
48 | #define SVM_DEATURE_SVML (1 << 2) | ||
49 | |||
50 | static void kvm_reput_irq(struct vcpu_svm *svm); | ||
51 | |||
52 | static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu) | ||
53 | { | ||
54 | return container_of(vcpu, struct vcpu_svm, vcpu); | ||
55 | } | ||
56 | |||
57 | unsigned long iopm_base; | ||
58 | unsigned long msrpm_base; | ||
59 | |||
60 | struct kvm_ldttss_desc { | ||
61 | u16 limit0; | ||
62 | u16 base0; | ||
63 | unsigned base1 : 8, type : 5, dpl : 2, p : 1; | ||
64 | unsigned limit1 : 4, zero0 : 3, g : 1, base2 : 8; | ||
65 | u32 base3; | ||
66 | u32 zero1; | ||
67 | } __attribute__((packed)); | ||
68 | |||
69 | struct svm_cpu_data { | ||
70 | int cpu; | ||
71 | |||
72 | u64 asid_generation; | ||
73 | u32 max_asid; | ||
74 | u32 next_asid; | ||
75 | struct kvm_ldttss_desc *tss_desc; | ||
76 | |||
77 | struct page *save_area; | ||
78 | }; | ||
79 | |||
80 | static DEFINE_PER_CPU(struct svm_cpu_data *, svm_data); | ||
81 | static uint32_t svm_features; | ||
82 | |||
83 | struct svm_init_data { | ||
84 | int cpu; | ||
85 | int r; | ||
86 | }; | ||
87 | |||
88 | static u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000}; | ||
89 | |||
90 | #define NUM_MSR_MAPS ARRAY_SIZE(msrpm_ranges) | ||
91 | #define MSRS_RANGE_SIZE 2048 | ||
92 | #define MSRS_IN_RANGE (MSRS_RANGE_SIZE * 8 / 2) | ||
93 | |||
94 | #define MAX_INST_SIZE 15 | ||
95 | |||
96 | static inline u32 svm_has(u32 feat) | ||
97 | { | ||
98 | return svm_features & feat; | ||
99 | } | ||
100 | |||
101 | static inline u8 pop_irq(struct kvm_vcpu *vcpu) | ||
102 | { | ||
103 | int word_index = __ffs(vcpu->arch.irq_summary); | ||
104 | int bit_index = __ffs(vcpu->arch.irq_pending[word_index]); | ||
105 | int irq = word_index * BITS_PER_LONG + bit_index; | ||
106 | |||
107 | clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]); | ||
108 | if (!vcpu->arch.irq_pending[word_index]) | ||
109 | clear_bit(word_index, &vcpu->arch.irq_summary); | ||
110 | return irq; | ||
111 | } | ||
112 | |||
113 | static inline void push_irq(struct kvm_vcpu *vcpu, u8 irq) | ||
114 | { | ||
115 | set_bit(irq, vcpu->arch.irq_pending); | ||
116 | set_bit(irq / BITS_PER_LONG, &vcpu->arch.irq_summary); | ||
117 | } | ||
118 | |||
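pop_irq()/push_irq() above keep pending interrupts in a two-level bitmap: irq_pending[] holds one bit per vector and irq_summary holds one bit per word of irq_pending, so the lowest pending vector is found with two find-first-set operations instead of a scan of the whole array. A userspace sketch of the same pattern using GCC's __builtin_ctzl in place of __ffs (and, like the original, assuming the caller only pops when something is pending):

#include <stdio.h>

#define BITS_PER_LONG (8 * sizeof(unsigned long))
#define NR_WORDS 4

static unsigned long irq_pending[NR_WORDS];
static unsigned long irq_summary;

static void push_irq(unsigned irq)
{
	irq_pending[irq / BITS_PER_LONG] |= 1UL << (irq % BITS_PER_LONG);
	irq_summary |= 1UL << (irq / BITS_PER_LONG);
}

static int pop_irq(void)
{
	/* caller guarantees irq_summary != 0, as in the original */
	unsigned word = __builtin_ctzl(irq_summary);
	unsigned bit = __builtin_ctzl(irq_pending[word]);

	irq_pending[word] &= ~(1UL << bit);
	if (!irq_pending[word])
		irq_summary &= ~(1UL << word);
	return word * BITS_PER_LONG + bit;
}

int main(void)
{
	int a, b;

	push_irq(3);
	push_irq(70);
	a = pop_irq();
	b = pop_irq();
	printf("%d %d\n", a, b); /* 3 70 */
	return 0;
}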
119 | static inline void clgi(void) | ||
120 | { | ||
121 | asm volatile (SVM_CLGI); | ||
122 | } | ||
123 | |||
124 | static inline void stgi(void) | ||
125 | { | ||
126 | asm volatile (SVM_STGI); | ||
127 | } | ||
128 | |||
129 | static inline void invlpga(unsigned long addr, u32 asid) | ||
130 | { | ||
131 | asm volatile (SVM_INVLPGA :: "a"(addr), "c"(asid)); | ||
132 | } | ||
133 | |||
134 | static inline unsigned long kvm_read_cr2(void) | ||
135 | { | ||
136 | unsigned long cr2; | ||
137 | |||
138 | asm volatile ("mov %%cr2, %0" : "=r" (cr2)); | ||
139 | return cr2; | ||
140 | } | ||
141 | |||
142 | static inline void kvm_write_cr2(unsigned long val) | ||
143 | { | ||
144 | asm volatile ("mov %0, %%cr2" :: "r" (val)); | ||
145 | } | ||
146 | |||
147 | static inline unsigned long read_dr6(void) | ||
148 | { | ||
149 | unsigned long dr6; | ||
150 | |||
151 | asm volatile ("mov %%dr6, %0" : "=r" (dr6)); | ||
152 | return dr6; | ||
153 | } | ||
154 | |||
155 | static inline void write_dr6(unsigned long val) | ||
156 | { | ||
157 | asm volatile ("mov %0, %%dr6" :: "r" (val)); | ||
158 | } | ||
159 | |||
160 | static inline unsigned long read_dr7(void) | ||
161 | { | ||
162 | unsigned long dr7; | ||
163 | |||
164 | asm volatile ("mov %%dr7, %0" : "=r" (dr7)); | ||
165 | return dr7; | ||
166 | } | ||
167 | |||
168 | static inline void write_dr7(unsigned long val) | ||
169 | { | ||
170 | asm volatile ("mov %0, %%dr7" :: "r" (val)); | ||
171 | } | ||
172 | |||
173 | static inline void force_new_asid(struct kvm_vcpu *vcpu) | ||
174 | { | ||
175 | to_svm(vcpu)->asid_generation--; | ||
176 | } | ||
177 | |||
178 | static inline void flush_guest_tlb(struct kvm_vcpu *vcpu) | ||
179 | { | ||
180 | force_new_asid(vcpu); | ||
181 | } | ||
182 | |||
183 | static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer) | ||
184 | { | ||
185 | if (!(efer & EFER_LMA)) | ||
186 | efer &= ~EFER_LME; | ||
187 | |||
188 | to_svm(vcpu)->vmcb->save.efer = efer | MSR_EFER_SVME_MASK; | ||
189 | vcpu->arch.shadow_efer = efer; | ||
190 | } | ||
191 | |||
192 | static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, | ||
193 | bool has_error_code, u32 error_code) | ||
194 | { | ||
195 | struct vcpu_svm *svm = to_svm(vcpu); | ||
196 | |||
197 | svm->vmcb->control.event_inj = nr | ||
198 | | SVM_EVTINJ_VALID | ||
199 | | (has_error_code ? SVM_EVTINJ_VALID_ERR : 0) | ||
200 | | SVM_EVTINJ_TYPE_EXEPT; | ||
201 | svm->vmcb->control.event_inj_err = error_code; | ||
202 | } | ||
203 | |||
204 | static bool svm_exception_injected(struct kvm_vcpu *vcpu) | ||
205 | { | ||
206 | struct vcpu_svm *svm = to_svm(vcpu); | ||
207 | |||
208 | return !(svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_VALID); | ||
209 | } | ||
210 | |||
211 | static int is_external_interrupt(u32 info) | ||
212 | { | ||
213 | info &= SVM_EVTINJ_TYPE_MASK | SVM_EVTINJ_VALID; | ||
214 | return info == (SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR); | ||
215 | } | ||
216 | |||
217 | static void skip_emulated_instruction(struct kvm_vcpu *vcpu) | ||
218 | { | ||
219 | struct vcpu_svm *svm = to_svm(vcpu); | ||
220 | |||
221 | if (!svm->next_rip) { | ||
222 | printk(KERN_DEBUG "%s: NOP\n", __FUNCTION__); | ||
223 | return; | ||
224 | } | ||
225 | if (svm->next_rip - svm->vmcb->save.rip > MAX_INST_SIZE) | ||
226 | printk(KERN_ERR "%s: ip 0x%llx next 0x%llx\n", | ||
227 | __FUNCTION__, | ||
228 | svm->vmcb->save.rip, | ||
229 | svm->next_rip); | ||
230 | |||
231 | vcpu->arch.rip = svm->vmcb->save.rip = svm->next_rip; | ||
232 | svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK; | ||
233 | |||
234 | vcpu->arch.interrupt_window_open = 1; | ||
235 | } | ||
236 | |||
237 | static int has_svm(void) | ||
238 | { | ||
239 | uint32_t eax, ebx, ecx, edx; | ||
240 | |||
241 | if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) { | ||
242 | printk(KERN_INFO "has_svm: not amd\n"); | ||
243 | return 0; | ||
244 | } | ||
245 | |||
246 | cpuid(0x80000000, &eax, &ebx, &ecx, &edx); | ||
247 | if (eax < SVM_CPUID_FUNC) { | ||
248 | printk(KERN_INFO "has_svm: can't execute cpuid_8000000a\n"); | ||
249 | return 0; | ||
250 | } | ||
251 | |||
252 | cpuid(0x80000001, &eax, &ebx, &ecx, &edx); | ||
253 | if (!(ecx & (1 << SVM_CPUID_FEATURE_SHIFT))) { | ||
254 | printk(KERN_DEBUG "has_svm: svm not available\n"); | ||
255 | return 0; | ||
256 | } | ||
257 | return 1; | ||
258 | } | ||
259 | |||
260 | static void svm_hardware_disable(void *garbage) | ||
261 | { | ||
262 | struct svm_cpu_data *svm_data | ||
263 | = per_cpu(svm_data, raw_smp_processor_id()); | ||
264 | |||
265 | if (svm_data) { | ||
266 | uint64_t efer; | ||
267 | |||
268 | wrmsrl(MSR_VM_HSAVE_PA, 0); | ||
269 | rdmsrl(MSR_EFER, efer); | ||
270 | wrmsrl(MSR_EFER, efer & ~MSR_EFER_SVME_MASK); | ||
271 | per_cpu(svm_data, raw_smp_processor_id()) = NULL; | ||
272 | __free_page(svm_data->save_area); | ||
273 | kfree(svm_data); | ||
274 | } | ||
275 | } | ||
276 | |||
277 | static void svm_hardware_enable(void *garbage) | ||
278 | { | ||
279 | |||
280 | struct svm_cpu_data *svm_data; | ||
281 | uint64_t efer; | ||
282 | #ifdef CONFIG_X86_64 | ||
283 | struct desc_ptr gdt_descr; | ||
284 | #else | ||
285 | struct desc_ptr gdt_descr; | ||
286 | #endif | ||
287 | struct desc_struct *gdt; | ||
288 | int me = raw_smp_processor_id(); | ||
289 | |||
290 | if (!has_svm()) { | ||
291 | printk(KERN_ERR "svm_cpu_init: err EOPNOTSUPP on %d\n", me); | ||
292 | return; | ||
293 | } | ||
294 | svm_data = per_cpu(svm_data, me); | ||
295 | |||
296 | if (!svm_data) { | ||
297 | printk(KERN_ERR "svm_cpu_init: svm_data is NULL on %d\n", | ||
298 | me); | ||
299 | return; | ||
300 | } | ||
301 | |||
302 | svm_data->asid_generation = 1; | ||
303 | svm_data->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1; | ||
304 | svm_data->next_asid = svm_data->max_asid + 1; | ||
305 | svm_features = cpuid_edx(SVM_CPUID_FUNC); | ||
306 | |||
307 | asm volatile ("sgdt %0" : "=m"(gdt_descr)); | ||
308 | gdt = (struct desc_struct *)gdt_descr.address; | ||
309 | svm_data->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS); | ||
310 | |||
311 | rdmsrl(MSR_EFER, efer); | ||
312 | wrmsrl(MSR_EFER, efer | MSR_EFER_SVME_MASK); | ||
313 | |||
314 | wrmsrl(MSR_VM_HSAVE_PA, | ||
315 | page_to_pfn(svm_data->save_area) << PAGE_SHIFT); | ||
316 | } | ||
317 | |||
318 | static int svm_cpu_init(int cpu) | ||
319 | { | ||
320 | struct svm_cpu_data *svm_data; | ||
321 | int r; | ||
322 | |||
323 | svm_data = kzalloc(sizeof(struct svm_cpu_data), GFP_KERNEL); | ||
324 | if (!svm_data) | ||
325 | return -ENOMEM; | ||
326 | svm_data->cpu = cpu; | ||
327 | svm_data->save_area = alloc_page(GFP_KERNEL); | ||
328 | r = -ENOMEM; | ||
329 | if (!svm_data->save_area) | ||
330 | goto err_1; | ||
331 | |||
332 | per_cpu(svm_data, cpu) = svm_data; | ||
333 | |||
334 | return 0; | ||
335 | |||
336 | err_1: | ||
337 | kfree(svm_data); | ||
338 | return r; | ||
339 | |||
340 | } | ||
341 | |||
342 | static void set_msr_interception(u32 *msrpm, unsigned msr, | ||
343 | int read, int write) | ||
344 | { | ||
345 | int i; | ||
346 | |||
347 | for (i = 0; i < NUM_MSR_MAPS; i++) { | ||
348 | if (msr >= msrpm_ranges[i] && | ||
349 | msr < msrpm_ranges[i] + MSRS_IN_RANGE) { | ||
350 | u32 msr_offset = (i * MSRS_IN_RANGE + msr - | ||
351 | msrpm_ranges[i]) * 2; | ||
352 | |||
353 | u32 *base = msrpm + (msr_offset / 32); | ||
354 | u32 msr_shift = msr_offset % 32; | ||
355 | u32 mask = ((write) ? 0 : 2) | ((read) ? 0 : 1); | ||
356 | *base = (*base & ~(0x3 << msr_shift)) | | ||
357 | (mask << msr_shift); | ||
358 | return; | ||
359 | } | ||
360 | } | ||
361 | BUG(); | ||
362 | } | ||
363 | |||
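set_msr_interception() above locates an MSR's read/write intercept bits inside the MSR permission map: each of the three MSR ranges owns MSRS_IN_RANGE consecutive MSR numbers, and each MSR consumes two adjacent bits (read, then write). A sketch of just the offset arithmetic, reusing the same constants; the specific MSR numbers below are examples, not values the original code names:

#include <stdio.h>
#include <stdint.h>

#define MSRS_RANGE_SIZE 2048
#define MSRS_IN_RANGE  (MSRS_RANGE_SIZE * 8 / 2) /* 8192 MSRs per 2K block */

static const uint32_t ranges[] = { 0, 0xc0000000, 0xc0010000 };

/* Returns the bit offset of the read-intercept bit for @msr, or -1. */
static long msr_bit_offset(uint32_t msr)
{
	unsigned i;

	for (i = 0; i < sizeof(ranges) / sizeof(ranges[0]); i++)
		if (msr >= ranges[i] && msr < ranges[i] + MSRS_IN_RANGE)
			return ((long)i * MSRS_IN_RANGE + msr - ranges[i]) * 2;
	return -1;
}

int main(void)
{
	printf("0xc0000081: bit %ld\n", msr_bit_offset(0xc0000081));
	printf("0xc0000082: bit %ld\n", msr_bit_offset(0xc0000082));
	printf("0x12345678: bit %ld\n", msr_bit_offset(0x12345678)); /* outside all ranges */
	return 0;
}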
364 | static __init int svm_hardware_setup(void) | ||
365 | { | ||
366 | int cpu; | ||
367 | struct page *iopm_pages; | ||
368 | struct page *msrpm_pages; | ||
369 | void *iopm_va, *msrpm_va; | ||
370 | int r; | ||
371 | |||
372 | iopm_pages = alloc_pages(GFP_KERNEL, IOPM_ALLOC_ORDER); | ||
373 | |||
374 | if (!iopm_pages) | ||
375 | return -ENOMEM; | ||
376 | |||
377 | iopm_va = page_address(iopm_pages); | ||
378 | memset(iopm_va, 0xff, PAGE_SIZE * (1 << IOPM_ALLOC_ORDER)); | ||
379 | clear_bit(0x80, iopm_va); /* allow direct access to PC debug port */ | ||
380 | iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT; | ||
381 | |||
382 | |||
383 | msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER); | ||
384 | |||
385 | r = -ENOMEM; | ||
386 | if (!msrpm_pages) | ||
387 | goto err_1; | ||
388 | |||
389 | msrpm_va = page_address(msrpm_pages); | ||
390 | memset(msrpm_va, 0xff, PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER)); | ||
391 | msrpm_base = page_to_pfn(msrpm_pages) << PAGE_SHIFT; | ||
392 | |||
393 | #ifdef CONFIG_X86_64 | ||
394 | set_msr_interception(msrpm_va, MSR_GS_BASE, 1, 1); | ||
395 | set_msr_interception(msrpm_va, MSR_FS_BASE, 1, 1); | ||
396 | set_msr_interception(msrpm_va, MSR_KERNEL_GS_BASE, 1, 1); | ||
397 | set_msr_interception(msrpm_va, MSR_LSTAR, 1, 1); | ||
398 | set_msr_interception(msrpm_va, MSR_CSTAR, 1, 1); | ||
399 | set_msr_interception(msrpm_va, MSR_SYSCALL_MASK, 1, 1); | ||
400 | #endif | ||
401 | set_msr_interception(msrpm_va, MSR_K6_STAR, 1, 1); | ||
402 | set_msr_interception(msrpm_va, MSR_IA32_SYSENTER_CS, 1, 1); | ||
403 | set_msr_interception(msrpm_va, MSR_IA32_SYSENTER_ESP, 1, 1); | ||
404 | set_msr_interception(msrpm_va, MSR_IA32_SYSENTER_EIP, 1, 1); | ||
405 | |||
406 | for_each_online_cpu(cpu) { | ||
407 | r = svm_cpu_init(cpu); | ||
408 | if (r) | ||
409 | goto err_2; | ||
410 | } | ||
411 | return 0; | ||
412 | |||
413 | err_2: | ||
414 | __free_pages(msrpm_pages, MSRPM_ALLOC_ORDER); | ||
415 | msrpm_base = 0; | ||
416 | err_1: | ||
417 | __free_pages(iopm_pages, IOPM_ALLOC_ORDER); | ||
418 | iopm_base = 0; | ||
419 | return r; | ||
420 | } | ||
421 | |||
422 | static __exit void svm_hardware_unsetup(void) | ||
423 | { | ||
424 | __free_pages(pfn_to_page(msrpm_base >> PAGE_SHIFT), MSRPM_ALLOC_ORDER); | ||
425 | __free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT), IOPM_ALLOC_ORDER); | ||
426 | iopm_base = msrpm_base = 0; | ||
427 | } | ||
428 | |||
429 | static void init_seg(struct vmcb_seg *seg) | ||
430 | { | ||
431 | seg->selector = 0; | ||
432 | seg->attrib = SVM_SELECTOR_P_MASK | SVM_SELECTOR_S_MASK | | ||
433 | SVM_SELECTOR_WRITE_MASK; /* Read/Write Data Segment */ | ||
434 | seg->limit = 0xffff; | ||
435 | seg->base = 0; | ||
436 | } | ||
437 | |||
438 | static void init_sys_seg(struct vmcb_seg *seg, uint32_t type) | ||
439 | { | ||
440 | seg->selector = 0; | ||
441 | seg->attrib = SVM_SELECTOR_P_MASK | type; | ||
442 | seg->limit = 0xffff; | ||
443 | seg->base = 0; | ||
444 | } | ||
445 | |||
446 | static void init_vmcb(struct vmcb *vmcb) | ||
447 | { | ||
448 | struct vmcb_control_area *control = &vmcb->control; | ||
449 | struct vmcb_save_area *save = &vmcb->save; | ||
450 | |||
451 | control->intercept_cr_read = INTERCEPT_CR0_MASK | | ||
452 | INTERCEPT_CR3_MASK | | ||
453 | INTERCEPT_CR4_MASK | | ||
454 | INTERCEPT_CR8_MASK; | ||
455 | |||
456 | control->intercept_cr_write = INTERCEPT_CR0_MASK | | ||
457 | INTERCEPT_CR3_MASK | | ||
458 | INTERCEPT_CR4_MASK | | ||
459 | INTERCEPT_CR8_MASK; | ||
460 | |||
461 | control->intercept_dr_read = INTERCEPT_DR0_MASK | | ||
462 | INTERCEPT_DR1_MASK | | ||
463 | INTERCEPT_DR2_MASK | | ||
464 | INTERCEPT_DR3_MASK; | ||
465 | |||
466 | control->intercept_dr_write = INTERCEPT_DR0_MASK | | ||
467 | INTERCEPT_DR1_MASK | | ||
468 | INTERCEPT_DR2_MASK | | ||
469 | INTERCEPT_DR3_MASK | | ||
470 | INTERCEPT_DR5_MASK | | ||
471 | INTERCEPT_DR7_MASK; | ||
472 | |||
473 | control->intercept_exceptions = (1 << PF_VECTOR) | | ||
474 | (1 << UD_VECTOR); | ||
475 | |||
476 | |||
477 | control->intercept = (1ULL << INTERCEPT_INTR) | | ||
478 | (1ULL << INTERCEPT_NMI) | | ||
479 | (1ULL << INTERCEPT_SMI) | | ||
480 | /* | ||
481 | * selective cr0 intercept bug? | ||
482 | * 0: 0f 22 d8 mov %eax,%cr3 | ||
483 | * 3: 0f 20 c0 mov %cr0,%eax | ||
484 | * 6: 0d 00 00 00 80 or $0x80000000,%eax | ||
485 | * b: 0f 22 c0 mov %eax,%cr0 | ||
486 | * set cr3 ->interception | ||
487 | * get cr0 ->interception | ||
488 | * set cr0 -> no interception | ||
489 | */ | ||
490 | /* (1ULL << INTERCEPT_SELECTIVE_CR0) | */ | ||
491 | (1ULL << INTERCEPT_CPUID) | | ||
492 | (1ULL << INTERCEPT_INVD) | | ||
493 | (1ULL << INTERCEPT_HLT) | | ||
494 | (1ULL << INTERCEPT_INVLPGA) | | ||
495 | (1ULL << INTERCEPT_IOIO_PROT) | | ||
496 | (1ULL << INTERCEPT_MSR_PROT) | | ||
497 | (1ULL << INTERCEPT_TASK_SWITCH) | | ||
498 | (1ULL << INTERCEPT_SHUTDOWN) | | ||
499 | (1ULL << INTERCEPT_VMRUN) | | ||
500 | (1ULL << INTERCEPT_VMMCALL) | | ||
501 | (1ULL << INTERCEPT_VMLOAD) | | ||
502 | (1ULL << INTERCEPT_VMSAVE) | | ||
503 | (1ULL << INTERCEPT_STGI) | | ||
504 | (1ULL << INTERCEPT_CLGI) | | ||
505 | (1ULL << INTERCEPT_SKINIT) | | ||
506 | (1ULL << INTERCEPT_WBINVD) | | ||
507 | (1ULL << INTERCEPT_MONITOR) | | ||
508 | (1ULL << INTERCEPT_MWAIT); | ||
509 | |||
510 | control->iopm_base_pa = iopm_base; | ||
511 | control->msrpm_base_pa = msrpm_base; | ||
512 | control->tsc_offset = 0; | ||
513 | control->int_ctl = V_INTR_MASKING_MASK; | ||
514 | |||
515 | init_seg(&save->es); | ||
516 | init_seg(&save->ss); | ||
517 | init_seg(&save->ds); | ||
518 | init_seg(&save->fs); | ||
519 | init_seg(&save->gs); | ||
520 | |||
521 | save->cs.selector = 0xf000; | ||
522 | /* Executable/Readable Code Segment */ | ||
523 | save->cs.attrib = SVM_SELECTOR_READ_MASK | SVM_SELECTOR_P_MASK | | ||
524 | SVM_SELECTOR_S_MASK | SVM_SELECTOR_CODE_MASK; | ||
525 | save->cs.limit = 0xffff; | ||
526 | /* | ||
527 | * cs.base should really be 0xffff0000, but vmx can't handle that, so | ||
528 | * be consistent with it. | ||
529 | * | ||
530 | * Replace when we have real mode working for vmx. | ||
531 | */ | ||
532 | save->cs.base = 0xf0000; | ||
533 | |||
534 | save->gdtr.limit = 0xffff; | ||
535 | save->idtr.limit = 0xffff; | ||
536 | |||
537 | init_sys_seg(&save->ldtr, SEG_TYPE_LDT); | ||
538 | init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16); | ||
539 | |||
540 | save->efer = MSR_EFER_SVME_MASK; | ||
541 | save->dr6 = 0xffff0ff0; | ||
542 | save->dr7 = 0x400; | ||
543 | save->rflags = 2; | ||
544 | save->rip = 0x0000fff0; | ||
545 | |||
546 | /* | ||
547 | * cr0 val on cpu init should be 0x60000010, we enable cpu | ||
548 | * cache by default. the orderly way is to enable cache in bios. | ||
549 | */ | ||
550 | save->cr0 = 0x00000010 | X86_CR0_PG | X86_CR0_WP; | ||
551 | save->cr4 = X86_CR4_PAE; | ||
552 | /* rdx = ?? */ | ||
553 | } | ||
554 | |||
555 | static int svm_vcpu_reset(struct kvm_vcpu *vcpu) | ||
556 | { | ||
557 | struct vcpu_svm *svm = to_svm(vcpu); | ||
558 | |||
559 | init_vmcb(svm->vmcb); | ||
560 | |||
561 | if (vcpu->vcpu_id != 0) { | ||
562 | svm->vmcb->save.rip = 0; | ||
563 | svm->vmcb->save.cs.base = svm->vcpu.arch.sipi_vector << 12; | ||
564 | svm->vmcb->save.cs.selector = svm->vcpu.arch.sipi_vector << 8; | ||
565 | } | ||
566 | |||
567 | return 0; | ||
568 | } | ||
569 | |||
570 | static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) | ||
571 | { | ||
572 | struct vcpu_svm *svm; | ||
573 | struct page *page; | ||
574 | int err; | ||
575 | |||
576 | svm = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); | ||
577 | if (!svm) { | ||
578 | err = -ENOMEM; | ||
579 | goto out; | ||
580 | } | ||
581 | |||
582 | err = kvm_vcpu_init(&svm->vcpu, kvm, id); | ||
583 | if (err) | ||
584 | goto free_svm; | ||
585 | |||
586 | page = alloc_page(GFP_KERNEL); | ||
587 | if (!page) { | ||
588 | err = -ENOMEM; | ||
589 | goto uninit; | ||
590 | } | ||
591 | |||
592 | svm->vmcb = page_address(page); | ||
593 | clear_page(svm->vmcb); | ||
594 | svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT; | ||
595 | svm->asid_generation = 0; | ||
596 | memset(svm->db_regs, 0, sizeof(svm->db_regs)); | ||
597 | init_vmcb(svm->vmcb); | ||
598 | |||
599 | fx_init(&svm->vcpu); | ||
600 | svm->vcpu.fpu_active = 1; | ||
601 | svm->vcpu.arch.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; | ||
602 | if (svm->vcpu.vcpu_id == 0) | ||
603 | svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP; | ||
604 | |||
605 | return &svm->vcpu; | ||
606 | |||
607 | uninit: | ||
608 | kvm_vcpu_uninit(&svm->vcpu); | ||
609 | free_svm: | ||
610 | kmem_cache_free(kvm_vcpu_cache, svm); | ||
611 | out: | ||
612 | return ERR_PTR(err); | ||
613 | } | ||
614 | |||
615 | static void svm_free_vcpu(struct kvm_vcpu *vcpu) | ||
616 | { | ||
617 | struct vcpu_svm *svm = to_svm(vcpu); | ||
618 | |||
619 | __free_page(pfn_to_page(svm->vmcb_pa >> PAGE_SHIFT)); | ||
620 | kvm_vcpu_uninit(vcpu); | ||
621 | kmem_cache_free(kvm_vcpu_cache, svm); | ||
622 | } | ||
623 | |||
624 | static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) | ||
625 | { | ||
626 | struct vcpu_svm *svm = to_svm(vcpu); | ||
627 | int i; | ||
628 | |||
629 | if (unlikely(cpu != vcpu->cpu)) { | ||
630 | u64 tsc_this, delta; | ||
631 | |||
632 | /* | ||
633 | * Make sure that the guest sees a monotonically | ||
634 | * increasing TSC. | ||
635 | */ | ||
636 | rdtscll(tsc_this); | ||
637 | delta = vcpu->arch.host_tsc - tsc_this; | ||
638 | svm->vmcb->control.tsc_offset += delta; | ||
639 | vcpu->cpu = cpu; | ||
640 | kvm_migrate_apic_timer(vcpu); | ||
641 | } | ||
642 | |||
643 | for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) | ||
644 | rdmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]); | ||
645 | } | ||
646 | |||
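The migration path in svm_vcpu_load() above keeps the guest's TSC monotonic: the guest-visible TSC is the host TSC plus a per-VMCB offset, and when the vCPU lands on a CPU whose TSC reads lower than the one it left, the offset absorbs the difference. A small arithmetic sketch of that invariant (the concrete numbers are made up for illustration):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t tsc_offset = 0;
	uint64_t old_cpu_tsc = 1000000; /* host TSC captured at vcpu_put time */
	uint64_t new_cpu_tsc = 400000;  /* new CPU's TSC happens to be lower */

	uint64_t guest_before = old_cpu_tsc + tsc_offset;

	tsc_offset += old_cpu_tsc - new_cpu_tsc; /* the delta applied on load */

	uint64_t guest_after = new_cpu_tsc + tsc_offset;

	/* identical values: the guest never sees its TSC jump backwards */
	printf("guest TSC before %llu, after %llu\n",
	       (unsigned long long)guest_before,
	       (unsigned long long)guest_after);
	return 0;
}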
647 | static void svm_vcpu_put(struct kvm_vcpu *vcpu) | ||
648 | { | ||
649 | struct vcpu_svm *svm = to_svm(vcpu); | ||
650 | int i; | ||
651 | |||
652 | ++vcpu->stat.host_state_reload; | ||
653 | for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) | ||
654 | wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]); | ||
655 | |||
656 | rdtscll(vcpu->arch.host_tsc); | ||
657 | } | ||
658 | |||
659 | static void svm_vcpu_decache(struct kvm_vcpu *vcpu) | ||
660 | { | ||
661 | } | ||
662 | |||
663 | static void svm_cache_regs(struct kvm_vcpu *vcpu) | ||
664 | { | ||
665 | struct vcpu_svm *svm = to_svm(vcpu); | ||
666 | |||
667 | vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax; | ||
668 | vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp; | ||
669 | vcpu->arch.rip = svm->vmcb->save.rip; | ||
670 | } | ||
671 | |||
672 | static void svm_decache_regs(struct kvm_vcpu *vcpu) | ||
673 | { | ||
674 | struct vcpu_svm *svm = to_svm(vcpu); | ||
675 | svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX]; | ||
676 | svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP]; | ||
677 | svm->vmcb->save.rip = vcpu->arch.rip; | ||
678 | } | ||
679 | |||
680 | static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu) | ||
681 | { | ||
682 | return to_svm(vcpu)->vmcb->save.rflags; | ||
683 | } | ||
684 | |||
685 | static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) | ||
686 | { | ||
687 | to_svm(vcpu)->vmcb->save.rflags = rflags; | ||
688 | } | ||
689 | |||
690 | static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg) | ||
691 | { | ||
692 | struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save; | ||
693 | |||
694 | switch (seg) { | ||
695 | case VCPU_SREG_CS: return &save->cs; | ||
696 | case VCPU_SREG_DS: return &save->ds; | ||
697 | case VCPU_SREG_ES: return &save->es; | ||
698 | case VCPU_SREG_FS: return &save->fs; | ||
699 | case VCPU_SREG_GS: return &save->gs; | ||
700 | case VCPU_SREG_SS: return &save->ss; | ||
701 | case VCPU_SREG_TR: return &save->tr; | ||
702 | case VCPU_SREG_LDTR: return &save->ldtr; | ||
703 | } | ||
704 | BUG(); | ||
705 | return NULL; | ||
706 | } | ||
707 | |||
708 | static u64 svm_get_segment_base(struct kvm_vcpu *vcpu, int seg) | ||
709 | { | ||
710 | struct vmcb_seg *s = svm_seg(vcpu, seg); | ||
711 | |||
712 | return s->base; | ||
713 | } | ||
714 | |||
715 | static void svm_get_segment(struct kvm_vcpu *vcpu, | ||
716 | struct kvm_segment *var, int seg) | ||
717 | { | ||
718 | struct vmcb_seg *s = svm_seg(vcpu, seg); | ||
719 | |||
720 | var->base = s->base; | ||
721 | var->limit = s->limit; | ||
722 | var->selector = s->selector; | ||
723 | var->type = s->attrib & SVM_SELECTOR_TYPE_MASK; | ||
724 | var->s = (s->attrib >> SVM_SELECTOR_S_SHIFT) & 1; | ||
725 | var->dpl = (s->attrib >> SVM_SELECTOR_DPL_SHIFT) & 3; | ||
726 | var->present = (s->attrib >> SVM_SELECTOR_P_SHIFT) & 1; | ||
727 | var->avl = (s->attrib >> SVM_SELECTOR_AVL_SHIFT) & 1; | ||
728 | var->l = (s->attrib >> SVM_SELECTOR_L_SHIFT) & 1; | ||
729 | var->db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1; | ||
730 | var->g = (s->attrib >> SVM_SELECTOR_G_SHIFT) & 1; | ||
731 | var->unusable = !var->present; | ||
732 | } | ||
733 | |||
734 | static void svm_get_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) | ||
735 | { | ||
736 | struct vcpu_svm *svm = to_svm(vcpu); | ||
737 | |||
738 | dt->limit = svm->vmcb->save.idtr.limit; | ||
739 | dt->base = svm->vmcb->save.idtr.base; | ||
740 | } | ||
741 | |||
742 | static void svm_set_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) | ||
743 | { | ||
744 | struct vcpu_svm *svm = to_svm(vcpu); | ||
745 | |||
746 | svm->vmcb->save.idtr.limit = dt->limit; | ||
747 | svm->vmcb->save.idtr.base = dt->base ; | ||
748 | } | ||
749 | |||
750 | static void svm_get_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) | ||
751 | { | ||
752 | struct vcpu_svm *svm = to_svm(vcpu); | ||
753 | |||
754 | dt->limit = svm->vmcb->save.gdtr.limit; | ||
755 | dt->base = svm->vmcb->save.gdtr.base; | ||
756 | } | ||
757 | |||
758 | static void svm_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) | ||
759 | { | ||
760 | struct vcpu_svm *svm = to_svm(vcpu); | ||
761 | |||
762 | svm->vmcb->save.gdtr.limit = dt->limit; | ||
763 | svm->vmcb->save.gdtr.base = dt->base ; | ||
764 | } | ||
765 | |||
766 | static void svm_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) | ||
767 | { | ||
768 | } | ||
769 | |||
770 | static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) | ||
771 | { | ||
772 | struct vcpu_svm *svm = to_svm(vcpu); | ||
773 | |||
774 | #ifdef CONFIG_X86_64 | ||
775 | if (vcpu->arch.shadow_efer & EFER_LME) { | ||
776 | if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) { | ||
777 | vcpu->arch.shadow_efer |= EFER_LMA; | ||
778 | svm->vmcb->save.efer |= EFER_LMA | EFER_LME; | ||
779 | } | ||
780 | |||
781 | if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) { | ||
782 | vcpu->arch.shadow_efer &= ~EFER_LMA; | ||
783 | svm->vmcb->save.efer &= ~(EFER_LMA | EFER_LME); | ||
784 | } | ||
785 | } | ||
786 | #endif | ||
787 | if ((vcpu->arch.cr0 & X86_CR0_TS) && !(cr0 & X86_CR0_TS)) { | ||
788 | svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR); | ||
789 | vcpu->fpu_active = 1; | ||
790 | } | ||
791 | |||
792 | vcpu->arch.cr0 = cr0; | ||
793 | cr0 |= X86_CR0_PG | X86_CR0_WP; | ||
794 | cr0 &= ~(X86_CR0_CD | X86_CR0_NW); | ||
795 | svm->vmcb->save.cr0 = cr0; | ||
796 | } | ||
797 | |||
798 | static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) | ||
799 | { | ||
800 | vcpu->arch.cr4 = cr4; | ||
801 | to_svm(vcpu)->vmcb->save.cr4 = cr4 | X86_CR4_PAE; | ||
802 | } | ||
803 | |||
804 | static void svm_set_segment(struct kvm_vcpu *vcpu, | ||
805 | struct kvm_segment *var, int seg) | ||
806 | { | ||
807 | struct vcpu_svm *svm = to_svm(vcpu); | ||
808 | struct vmcb_seg *s = svm_seg(vcpu, seg); | ||
809 | |||
810 | s->base = var->base; | ||
811 | s->limit = var->limit; | ||
812 | s->selector = var->selector; | ||
813 | if (var->unusable) | ||
814 | s->attrib = 0; | ||
815 | else { | ||
816 | s->attrib = (var->type & SVM_SELECTOR_TYPE_MASK); | ||
817 | s->attrib |= (var->s & 1) << SVM_SELECTOR_S_SHIFT; | ||
818 | s->attrib |= (var->dpl & 3) << SVM_SELECTOR_DPL_SHIFT; | ||
819 | s->attrib |= (var->present & 1) << SVM_SELECTOR_P_SHIFT; | ||
820 | s->attrib |= (var->avl & 1) << SVM_SELECTOR_AVL_SHIFT; | ||
821 | s->attrib |= (var->l & 1) << SVM_SELECTOR_L_SHIFT; | ||
822 | s->attrib |= (var->db & 1) << SVM_SELECTOR_DB_SHIFT; | ||
823 | s->attrib |= (var->g & 1) << SVM_SELECTOR_G_SHIFT; | ||
824 | } | ||
825 | if (seg == VCPU_SREG_CS) | ||
826 | svm->vmcb->save.cpl | ||
827 | = (svm->vmcb->save.cs.attrib | ||
828 | >> SVM_SELECTOR_DPL_SHIFT) & 3; | ||
829 | |||
830 | } | ||
831 | |||
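svm_get_segment()/svm_set_segment() above translate between the kvm_segment fields and the single packed VMCB attribute word using the SVM_SELECTOR_*_SHIFT constants. A round-trip sketch of that packing; the shift values used here are what the code above implies but are written as local, illustrative constants rather than the kernel's definitions:

#include <stdio.h>
#include <stdint.h>

enum { S_SHIFT = 4, DPL_SHIFT = 5, P_SHIFT = 7, AVL_SHIFT = 8,
       L_SHIFT = 9, DB_SHIFT = 10, G_SHIFT = 11 };

struct seg { unsigned type, s, dpl, present, avl, l, db, g; };

static uint16_t pack(const struct seg *v)
{
	return (v->type & 0xf) | (v->s << S_SHIFT) | (v->dpl << DPL_SHIFT) |
	       (v->present << P_SHIFT) | (v->avl << AVL_SHIFT) |
	       (v->l << L_SHIFT) | (v->db << DB_SHIFT) | (v->g << G_SHIFT);
}

static void unpack(uint16_t a, struct seg *v)
{
	v->type = a & 0xf;             v->s = (a >> S_SHIFT) & 1;
	v->dpl = (a >> DPL_SHIFT) & 3; v->present = (a >> P_SHIFT) & 1;
	v->avl = (a >> AVL_SHIFT) & 1; v->l = (a >> L_SHIFT) & 1;
	v->db = (a >> DB_SHIFT) & 1;   v->g = (a >> G_SHIFT) & 1;
}

int main(void)
{
	struct seg in = { .type = 0xb, .s = 1, .dpl = 0, .present = 1, .db = 1, .g = 1 };
	struct seg out;

	unpack(pack(&in), &out);
	printf("round trip %s\n",
	       out.type == in.type && out.dpl == in.dpl && out.g == in.g ?
	       "ok" : "mismatch");
	return 0;
}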
832 | /* FIXME: | ||
833 | |||
834 | svm(vcpu)->vmcb->control.int_ctl &= ~V_TPR_MASK; | ||
835 | svm(vcpu)->vmcb->control.int_ctl |= (sregs->cr8 & V_TPR_MASK); | ||
836 | |||
837 | */ | ||
838 | |||
839 | static int svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg) | ||
840 | { | ||
841 | return -EOPNOTSUPP; | ||
842 | } | ||
843 | |||
844 | static int svm_get_irq(struct kvm_vcpu *vcpu) | ||
845 | { | ||
846 | struct vcpu_svm *svm = to_svm(vcpu); | ||
847 | u32 exit_int_info = svm->vmcb->control.exit_int_info; | ||
848 | |||
849 | if (is_external_interrupt(exit_int_info)) | ||
850 | return exit_int_info & SVM_EVTINJ_VEC_MASK; | ||
851 | return -1; | ||
852 | } | ||
853 | |||
854 | static void load_host_msrs(struct kvm_vcpu *vcpu) | ||
855 | { | ||
856 | #ifdef CONFIG_X86_64 | ||
857 | wrmsrl(MSR_GS_BASE, to_svm(vcpu)->host_gs_base); | ||
858 | #endif | ||
859 | } | ||
860 | |||
861 | static void save_host_msrs(struct kvm_vcpu *vcpu) | ||
862 | { | ||
863 | #ifdef CONFIG_X86_64 | ||
864 | rdmsrl(MSR_GS_BASE, to_svm(vcpu)->host_gs_base); | ||
865 | #endif | ||
866 | } | ||
867 | |||
868 | static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *svm_data) | ||
869 | { | ||
870 | if (svm_data->next_asid > svm_data->max_asid) { | ||
871 | ++svm_data->asid_generation; | ||
872 | svm_data->next_asid = 1; | ||
873 | svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ALL_ASID; | ||
874 | } | ||
875 | |||
876 | svm->vcpu.cpu = svm_data->cpu; | ||
877 | svm->asid_generation = svm_data->asid_generation; | ||
878 | svm->vmcb->control.asid = svm_data->next_asid++; | ||
879 | } | ||
880 | |||
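new_asid() above hands out address space IDs generationally: ASIDs are assigned sequentially per CPU, and once the pool is exhausted the generation counter is bumped, the counter restarts at 1, and a full TLB flush is requested so stale translations from the previous generation cannot leak. A compact sketch of that scheme with a deliberately tiny pool:

#include <stdio.h>
#include <stdint.h>

struct cpu_asid {
	uint64_t generation;
	uint32_t max_asid;
	uint32_t next_asid;
};

struct vcpu_asid {
	uint64_t generation;
	uint32_t asid;
};

/* Hand out the next ASID; on exhaustion start a new generation, which
 * stands in for TLB_CONTROL_FLUSH_ALL_ASID here. */
static void assign_asid(struct vcpu_asid *v, struct cpu_asid *c)
{
	if (c->next_asid > c->max_asid) {
		++c->generation;
		c->next_asid = 1;
		printf("flush all ASIDs\n");
	}
	v->generation = c->generation;
	v->asid = c->next_asid++;
}

int main(void)
{
	struct cpu_asid cpu = { .generation = 1, .max_asid = 2, .next_asid = 1 };
	struct vcpu_asid v;
	int i;

	for (i = 0; i < 4; i++) {
		assign_asid(&v, &cpu);
		printf("gen %llu asid %u\n",
		       (unsigned long long)v.generation, v.asid);
	}
	return 0;
}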
881 | static unsigned long svm_get_dr(struct kvm_vcpu *vcpu, int dr) | ||
882 | { | ||
883 | return to_svm(vcpu)->db_regs[dr]; | ||
884 | } | ||
885 | |||
886 | static void svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value, | ||
887 | int *exception) | ||
888 | { | ||
889 | struct vcpu_svm *svm = to_svm(vcpu); | ||
890 | |||
891 | *exception = 0; | ||
892 | |||
893 | if (svm->vmcb->save.dr7 & DR7_GD_MASK) { | ||
894 | svm->vmcb->save.dr7 &= ~DR7_GD_MASK; | ||
895 | svm->vmcb->save.dr6 |= DR6_BD_MASK; | ||
896 | *exception = DB_VECTOR; | ||
897 | return; | ||
898 | } | ||
899 | |||
900 | switch (dr) { | ||
901 | case 0 ... 3: | ||
902 | svm->db_regs[dr] = value; | ||
903 | return; | ||
904 | case 4 ... 5: | ||
905 | if (vcpu->arch.cr4 & X86_CR4_DE) { | ||
906 | *exception = UD_VECTOR; | ||
907 | return; | ||
908 | } | ||
909 | case 7: { | ||
910 | if (value & ~((1ULL << 32) - 1)) { | ||
911 | *exception = GP_VECTOR; | ||
912 | return; | ||
913 | } | ||
914 | svm->vmcb->save.dr7 = value; | ||
915 | return; | ||
916 | } | ||
917 | default: | ||
918 | printk(KERN_DEBUG "%s: unexpected dr %u\n", | ||
919 | __FUNCTION__, dr); | ||
920 | *exception = UD_VECTOR; | ||
921 | return; | ||
922 | } | ||
923 | } | ||
924 | |||
925 | static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) | ||
926 | { | ||
927 | u32 exit_int_info = svm->vmcb->control.exit_int_info; | ||
928 | struct kvm *kvm = svm->vcpu.kvm; | ||
929 | u64 fault_address; | ||
930 | u32 error_code; | ||
931 | |||
932 | if (!irqchip_in_kernel(kvm) && | ||
933 | is_external_interrupt(exit_int_info)) | ||
934 | push_irq(&svm->vcpu, exit_int_info & SVM_EVTINJ_VEC_MASK); | ||
935 | |||
936 | fault_address = svm->vmcb->control.exit_info_2; | ||
937 | error_code = svm->vmcb->control.exit_info_1; | ||
938 | return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code); | ||
939 | } | ||
940 | |||
941 | static int ud_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) | ||
942 | { | ||
943 | int er; | ||
944 | |||
945 | er = emulate_instruction(&svm->vcpu, kvm_run, 0, 0, 0); | ||
946 | if (er != EMULATE_DONE) | ||
947 | kvm_queue_exception(&svm->vcpu, UD_VECTOR); | ||
948 | return 1; | ||
949 | } | ||
950 | |||
951 | static int nm_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) | ||
952 | { | ||
953 | svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR); | ||
954 | if (!(svm->vcpu.arch.cr0 & X86_CR0_TS)) | ||
955 | svm->vmcb->save.cr0 &= ~X86_CR0_TS; | ||
956 | svm->vcpu.fpu_active = 1; | ||
957 | |||
958 | return 1; | ||
959 | } | ||
960 | |||
961 | static int shutdown_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) | ||
962 | { | ||
963 | /* | ||
964 | * VMCB is undefined after a SHUTDOWN intercept | ||
965 | * so reinitialize it. | ||
966 | */ | ||
967 | clear_page(svm->vmcb); | ||
968 | init_vmcb(svm->vmcb); | ||
969 | |||
970 | kvm_run->exit_reason = KVM_EXIT_SHUTDOWN; | ||
971 | return 0; | ||
972 | } | ||
973 | |||
974 | static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) | ||
975 | { | ||
976 | u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */ | ||
977 | int size, down, in, string, rep; | ||
978 | unsigned port; | ||
979 | |||
980 | ++svm->vcpu.stat.io_exits; | ||
981 | |||
982 | svm->next_rip = svm->vmcb->control.exit_info_2; | ||
983 | |||
984 | string = (io_info & SVM_IOIO_STR_MASK) != 0; | ||
985 | |||
986 | if (string) { | ||
987 | if (emulate_instruction(&svm->vcpu, | ||
988 | kvm_run, 0, 0, 0) == EMULATE_DO_MMIO) | ||
989 | return 0; | ||
990 | return 1; | ||
991 | } | ||
992 | |||
993 | in = (io_info & SVM_IOIO_TYPE_MASK) != 0; | ||
994 | port = io_info >> 16; | ||
995 | size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT; | ||
996 | rep = (io_info & SVM_IOIO_REP_MASK) != 0; | ||
997 | down = (svm->vmcb->save.rflags & X86_EFLAGS_DF) != 0; | ||
998 | |||
999 | return kvm_emulate_pio(&svm->vcpu, kvm_run, in, size, port); | ||
1000 | } | ||
1001 | |||
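io_interception() above unpacks the IOIO exit information word: direction, string/REP flags, operand size and port number are all encoded in exit_info_1. A sketch of that decode; the bit positions below are assumptions about the SVM_IOIO_* layout (direction in bit 0, string in bit 2, REP in bit 3, size in bits 4-6, port in bits 16-31), written as local constants rather than the header's macros:

#include <stdio.h>
#include <stdint.h>

#define IOIO_TYPE_IN  (1u << 0)
#define IOIO_STR      (1u << 2)
#define IOIO_REP      (1u << 3)
#define IOIO_SZ_MASK  (7u << 4)
#define IOIO_SZ_SHIFT 4

static void decode(uint32_t io_info)
{
	printf("port 0x%x, %s, size %u, %sstring, %srep\n",
	       io_info >> 16,
	       (io_info & IOIO_TYPE_IN) ? "in" : "out",
	       (io_info & IOIO_SZ_MASK) >> IOIO_SZ_SHIFT,
	       (io_info & IOIO_STR) ? "" : "non-",
	       (io_info & IOIO_REP) ? "" : "no ");
}

int main(void)
{
	decode((0x3f8u << 16) | (1u << IOIO_SZ_SHIFT));                /* byte out, COM1 */
	decode((0x60u << 16) | IOIO_TYPE_IN | (1u << IOIO_SZ_SHIFT));  /* byte in, KBC  */
	return 0;
}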
1002 | static int nop_on_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) | ||
1003 | { | ||
1004 | return 1; | ||
1005 | } | ||
1006 | |||
1007 | static int halt_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) | ||
1008 | { | ||
1009 | svm->next_rip = svm->vmcb->save.rip + 1; | ||
1010 | skip_emulated_instruction(&svm->vcpu); | ||
1011 | return kvm_emulate_halt(&svm->vcpu); | ||
1012 | } | ||
1013 | |||
1014 | static int vmmcall_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) | ||
1015 | { | ||
1016 | svm->next_rip = svm->vmcb->save.rip + 3; | ||
1017 | skip_emulated_instruction(&svm->vcpu); | ||
1018 | kvm_emulate_hypercall(&svm->vcpu); | ||
1019 | return 1; | ||
1020 | } | ||
1021 | |||
1022 | static int invalid_op_interception(struct vcpu_svm *svm, | ||
1023 | struct kvm_run *kvm_run) | ||
1024 | { | ||
1025 | kvm_queue_exception(&svm->vcpu, UD_VECTOR); | ||
1026 | return 1; | ||
1027 | } | ||
1028 | |||
1029 | static int task_switch_interception(struct vcpu_svm *svm, | ||
1030 | struct kvm_run *kvm_run) | ||
1031 | { | ||
1032 | pr_unimpl(&svm->vcpu, "%s: task switch is unsupported\n", __FUNCTION__); | ||
1033 | kvm_run->exit_reason = KVM_EXIT_UNKNOWN; | ||
1034 | return 0; | ||
1035 | } | ||
1036 | |||
1037 | static int cpuid_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) | ||
1038 | { | ||
1039 | svm->next_rip = svm->vmcb->save.rip + 2; | ||
1040 | kvm_emulate_cpuid(&svm->vcpu); | ||
1041 | return 1; | ||
1042 | } | ||
1043 | |||
1044 | static int emulate_on_interception(struct vcpu_svm *svm, | ||
1045 | struct kvm_run *kvm_run) | ||
1046 | { | ||
1047 | if (emulate_instruction(&svm->vcpu, NULL, 0, 0, 0) != EMULATE_DONE) | ||
1048 | pr_unimpl(&svm->vcpu, "%s: failed\n", __FUNCTION__); | ||
1049 | return 1; | ||
1050 | } | ||
1051 | |||
1052 | static int cr8_write_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) | ||
1053 | { | ||
1054 | emulate_instruction(&svm->vcpu, NULL, 0, 0, 0); | ||
1055 | if (irqchip_in_kernel(svm->vcpu.kvm)) | ||
1056 | return 1; | ||
1057 | kvm_run->exit_reason = KVM_EXIT_SET_TPR; | ||
1058 | return 0; | ||
1059 | } | ||
1060 | |||
1061 | static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data) | ||
1062 | { | ||
1063 | struct vcpu_svm *svm = to_svm(vcpu); | ||
1064 | |||
1065 | switch (ecx) { | ||
1066 | case MSR_IA32_TIME_STAMP_COUNTER: { | ||
1067 | u64 tsc; | ||
1068 | |||
1069 | rdtscll(tsc); | ||
1070 | *data = svm->vmcb->control.tsc_offset + tsc; | ||
1071 | break; | ||
1072 | } | ||
1073 | case MSR_K6_STAR: | ||
1074 | *data = svm->vmcb->save.star; | ||
1075 | break; | ||
1076 | #ifdef CONFIG_X86_64 | ||
1077 | case MSR_LSTAR: | ||
1078 | *data = svm->vmcb->save.lstar; | ||
1079 | break; | ||
1080 | case MSR_CSTAR: | ||
1081 | *data = svm->vmcb->save.cstar; | ||
1082 | break; | ||
1083 | case MSR_KERNEL_GS_BASE: | ||
1084 | *data = svm->vmcb->save.kernel_gs_base; | ||
1085 | break; | ||
1086 | case MSR_SYSCALL_MASK: | ||
1087 | *data = svm->vmcb->save.sfmask; | ||
1088 | break; | ||
1089 | #endif | ||
1090 | case MSR_IA32_SYSENTER_CS: | ||
1091 | *data = svm->vmcb->save.sysenter_cs; | ||
1092 | break; | ||
1093 | case MSR_IA32_SYSENTER_EIP: | ||
1094 | *data = svm->vmcb->save.sysenter_eip; | ||
1095 | break; | ||
1096 | case MSR_IA32_SYSENTER_ESP: | ||
1097 | *data = svm->vmcb->save.sysenter_esp; | ||
1098 | break; | ||
1099 | default: | ||
1100 | return kvm_get_msr_common(vcpu, ecx, data); | ||
1101 | } | ||
1102 | return 0; | ||
1103 | } | ||
1104 | |||
1105 | static int rdmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) | ||
1106 | { | ||
1107 | u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX]; | ||
1108 | u64 data; | ||
1109 | |||
1110 | if (svm_get_msr(&svm->vcpu, ecx, &data)) | ||
1111 | kvm_inject_gp(&svm->vcpu, 0); | ||
1112 | else { | ||
1113 | svm->vmcb->save.rax = data & 0xffffffff; | ||
1114 | svm->vcpu.arch.regs[VCPU_REGS_RDX] = data >> 32; | ||
1115 | svm->next_rip = svm->vmcb->save.rip + 2; | ||
1116 | skip_emulated_instruction(&svm->vcpu); | ||
1117 | } | ||
1118 | return 1; | ||
1119 | } | ||
1120 | |||
1121 | static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data) | ||
1122 | { | ||
1123 | struct vcpu_svm *svm = to_svm(vcpu); | ||
1124 | |||
1125 | switch (ecx) { | ||
1126 | case MSR_IA32_TIME_STAMP_COUNTER: { | ||
1127 | u64 tsc; | ||
1128 | |||
1129 | rdtscll(tsc); | ||
1130 | svm->vmcb->control.tsc_offset = data - tsc; | ||
1131 | break; | ||
1132 | } | ||
1133 | case MSR_K6_STAR: | ||
1134 | svm->vmcb->save.star = data; | ||
1135 | break; | ||
1136 | #ifdef CONFIG_X86_64 | ||
1137 | case MSR_LSTAR: | ||
1138 | svm->vmcb->save.lstar = data; | ||
1139 | break; | ||
1140 | case MSR_CSTAR: | ||
1141 | svm->vmcb->save.cstar = data; | ||
1142 | break; | ||
1143 | case MSR_KERNEL_GS_BASE: | ||
1144 | svm->vmcb->save.kernel_gs_base = data; | ||
1145 | break; | ||
1146 | case MSR_SYSCALL_MASK: | ||
1147 | svm->vmcb->save.sfmask = data; | ||
1148 | break; | ||
1149 | #endif | ||
1150 | case MSR_IA32_SYSENTER_CS: | ||
1151 | svm->vmcb->save.sysenter_cs = data; | ||
1152 | break; | ||
1153 | case MSR_IA32_SYSENTER_EIP: | ||
1154 | svm->vmcb->save.sysenter_eip = data; | ||
1155 | break; | ||
1156 | case MSR_IA32_SYSENTER_ESP: | ||
1157 | svm->vmcb->save.sysenter_esp = data; | ||
1158 | break; | ||
1159 | case MSR_K7_EVNTSEL0: | ||
1160 | case MSR_K7_EVNTSEL1: | ||
1161 | case MSR_K7_EVNTSEL2: | ||
1162 | case MSR_K7_EVNTSEL3: | ||
1163 | /* | ||
1164 | * only support writing 0 to the performance counters for now | ||
1165 | * to make Windows happy. Should be replaced by a real | ||
1166 | * performance counter emulation later. | ||
1167 | */ | ||
1168 | if (data != 0) | ||
1169 | goto unhandled; | ||
1170 | break; | ||
1171 | default: | ||
1172 | unhandled: | ||
1173 | return kvm_set_msr_common(vcpu, ecx, data); | ||
1174 | } | ||
1175 | return 0; | ||
1176 | } | ||
1177 | |||
1178 | static int wrmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) | ||
1179 | { | ||
1180 | u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX]; | ||
1181 | u64 data = (svm->vmcb->save.rax & -1u) | ||
1182 | | ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32); | ||
1183 | svm->next_rip = svm->vmcb->save.rip + 2; | ||
1184 | if (svm_set_msr(&svm->vcpu, ecx, data)) | ||
1185 | kvm_inject_gp(&svm->vcpu, 0); | ||
1186 | else | ||
1187 | skip_emulated_instruction(&svm->vcpu); | ||
1188 | return 1; | ||
1189 | } | ||
1190 | |||
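rdmsr_interception() and wrmsr_interception() above move 64-bit MSR values across the 32-bit EDX:EAX register pair: RDMSR returns the low half in RAX and the high half in RDX, and WRMSR reassembles the two halves. The split and join in isolation:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t msr_value = 0x1122334455667788ULL;

	/* RDMSR result: low half in EAX, high half in EDX */
	uint32_t eax = msr_value & 0xffffffffu;
	uint32_t edx = msr_value >> 32;

	/* WRMSR input: reassemble EDX:EAX into the 64-bit value */
	uint64_t joined = (uint64_t)eax | ((uint64_t)edx << 32);

	printf("eax=%08x edx=%08x joined=%016llx\n",
	       eax, edx, (unsigned long long)joined);
	return 0;
}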
1191 | static int msr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) | ||
1192 | { | ||
1193 | if (svm->vmcb->control.exit_info_1) | ||
1194 | return wrmsr_interception(svm, kvm_run); | ||
1195 | else | ||
1196 | return rdmsr_interception(svm, kvm_run); | ||
1197 | } | ||
1198 | |||
1199 | static int interrupt_window_interception(struct vcpu_svm *svm, | ||
1200 | struct kvm_run *kvm_run) | ||
1201 | { | ||
1202 | svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_VINTR); | ||
1203 | svm->vmcb->control.int_ctl &= ~V_IRQ_MASK; | ||
1204 | /* | ||
1205 | * If the user space waits to inject interrupts, exit as soon as | ||
1206 | * possible | ||
1207 | */ | ||
1208 | if (kvm_run->request_interrupt_window && | ||
1209 | !svm->vcpu.arch.irq_summary) { | ||
1210 | ++svm->vcpu.stat.irq_window_exits; | ||
1211 | kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; | ||
1212 | return 0; | ||
1213 | } | ||
1214 | |||
1215 | return 1; | ||
1216 | } | ||
1217 | |||
1218 | static int (*svm_exit_handlers[])(struct vcpu_svm *svm, | ||
1219 | struct kvm_run *kvm_run) = { | ||
1220 | [SVM_EXIT_READ_CR0] = emulate_on_interception, | ||
1221 | [SVM_EXIT_READ_CR3] = emulate_on_interception, | ||
1222 | [SVM_EXIT_READ_CR4] = emulate_on_interception, | ||
1223 | [SVM_EXIT_READ_CR8] = emulate_on_interception, | ||
1224 | /* for now: */ | ||
1225 | [SVM_EXIT_WRITE_CR0] = emulate_on_interception, | ||
1226 | [SVM_EXIT_WRITE_CR3] = emulate_on_interception, | ||
1227 | [SVM_EXIT_WRITE_CR4] = emulate_on_interception, | ||
1228 | [SVM_EXIT_WRITE_CR8] = cr8_write_interception, | ||
1229 | [SVM_EXIT_READ_DR0] = emulate_on_interception, | ||
1230 | [SVM_EXIT_READ_DR1] = emulate_on_interception, | ||
1231 | [SVM_EXIT_READ_DR2] = emulate_on_interception, | ||
1232 | [SVM_EXIT_READ_DR3] = emulate_on_interception, | ||
1233 | [SVM_EXIT_WRITE_DR0] = emulate_on_interception, | ||
1234 | [SVM_EXIT_WRITE_DR1] = emulate_on_interception, | ||
1235 | [SVM_EXIT_WRITE_DR2] = emulate_on_interception, | ||
1236 | [SVM_EXIT_WRITE_DR3] = emulate_on_interception, | ||
1237 | [SVM_EXIT_WRITE_DR5] = emulate_on_interception, | ||
1238 | [SVM_EXIT_WRITE_DR7] = emulate_on_interception, | ||
1239 | [SVM_EXIT_EXCP_BASE + UD_VECTOR] = ud_interception, | ||
1240 | [SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception, | ||
1241 | [SVM_EXIT_EXCP_BASE + NM_VECTOR] = nm_interception, | ||
1242 | [SVM_EXIT_INTR] = nop_on_interception, | ||
1243 | [SVM_EXIT_NMI] = nop_on_interception, | ||
1244 | [SVM_EXIT_SMI] = nop_on_interception, | ||
1245 | [SVM_EXIT_INIT] = nop_on_interception, | ||
1246 | [SVM_EXIT_VINTR] = interrupt_window_interception, | ||
1247 | /* [SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception, */ | ||
1248 | [SVM_EXIT_CPUID] = cpuid_interception, | ||
1249 | [SVM_EXIT_INVD] = emulate_on_interception, | ||
1250 | [SVM_EXIT_HLT] = halt_interception, | ||
1251 | [SVM_EXIT_INVLPG] = emulate_on_interception, | ||
1252 | [SVM_EXIT_INVLPGA] = invalid_op_interception, | ||
1253 | [SVM_EXIT_IOIO] = io_interception, | ||
1254 | [SVM_EXIT_MSR] = msr_interception, | ||
1255 | [SVM_EXIT_TASK_SWITCH] = task_switch_interception, | ||
1256 | [SVM_EXIT_SHUTDOWN] = shutdown_interception, | ||
1257 | [SVM_EXIT_VMRUN] = invalid_op_interception, | ||
1258 | [SVM_EXIT_VMMCALL] = vmmcall_interception, | ||
1259 | [SVM_EXIT_VMLOAD] = invalid_op_interception, | ||
1260 | [SVM_EXIT_VMSAVE] = invalid_op_interception, | ||
1261 | [SVM_EXIT_STGI] = invalid_op_interception, | ||
1262 | [SVM_EXIT_CLGI] = invalid_op_interception, | ||
1263 | [SVM_EXIT_SKINIT] = invalid_op_interception, | ||
1264 | [SVM_EXIT_WBINVD] = emulate_on_interception, | ||
1265 | [SVM_EXIT_MONITOR] = invalid_op_interception, | ||
1266 | [SVM_EXIT_MWAIT] = invalid_op_interception, | ||
1267 | }; | ||
1268 | |||
1269 | |||
1270 | static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) | ||
1271 | { | ||
1272 | struct vcpu_svm *svm = to_svm(vcpu); | ||
1273 | u32 exit_code = svm->vmcb->control.exit_code; | ||
1274 | |||
1275 | kvm_reput_irq(svm); | ||
1276 | |||
1277 | if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) { | ||
1278 | kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; | ||
1279 | kvm_run->fail_entry.hardware_entry_failure_reason | ||
1280 | = svm->vmcb->control.exit_code; | ||
1281 | return 0; | ||
1282 | } | ||
1283 | |||
1284 | if (is_external_interrupt(svm->vmcb->control.exit_int_info) && | ||
1285 | exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR) | ||
1286 | printk(KERN_ERR "%s: unexpected exit_int_info 0x%x " | ||
1287 | "exit_code 0x%x\n", | ||
1288 | __FUNCTION__, svm->vmcb->control.exit_int_info, | ||
1289 | exit_code); | ||
1290 | |||
1291 | if (exit_code >= ARRAY_SIZE(svm_exit_handlers) | ||
1292 | || !svm_exit_handlers[exit_code]) { | ||
1293 | kvm_run->exit_reason = KVM_EXIT_UNKNOWN; | ||
1294 | kvm_run->hw.hardware_exit_reason = exit_code; | ||
1295 | return 0; | ||
1296 | } | ||
1297 | |||
1298 | return svm_exit_handlers[exit_code](svm, kvm_run); | ||
1299 | } | ||
1300 | |||
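handle_exit() above dispatches through svm_exit_handlers[], an array of function pointers indexed by the hardware exit code, with a bounds check and a NULL check so unimplemented exits are reported rather than jumped through. A stripped-down sketch of that dispatch pattern (the exit codes and handlers are illustrative, not the VMCB's full set):

#include <stdio.h>

typedef int (*exit_handler_t)(void);

static int handle_hlt(void)   { printf("halt\n");  return 1; }
static int handle_cpuid(void) { printf("cpuid\n"); return 1; }

/* Designated initializers leave unhandled exit codes NULL, so the
 * dispatcher can flag them instead of calling garbage. */
static exit_handler_t handlers[] = {
	[0x72] = handle_cpuid, /* illustrative exit codes */
	[0x78] = handle_hlt,
};

static int dispatch(unsigned exit_code)
{
	if (exit_code >= sizeof(handlers) / sizeof(handlers[0]) ||
	    !handlers[exit_code]) {
		printf("unknown exit 0x%x\n", exit_code);
		return 0;
	}
	return handlers[exit_code]();
}

int main(void)
{
	dispatch(0x78);
	dispatch(0x05);
	return 0;
}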
1301 | static void reload_tss(struct kvm_vcpu *vcpu) | ||
1302 | { | ||
1303 | int cpu = raw_smp_processor_id(); | ||
1304 | |||
1305 | struct svm_cpu_data *svm_data = per_cpu(svm_data, cpu); | ||
1306 | svm_data->tss_desc->type = 9; /* available 32/64-bit TSS */ | ||
1307 | load_TR_desc(); | ||
1308 | } | ||
1309 | |||
1310 | static void pre_svm_run(struct vcpu_svm *svm) | ||
1311 | { | ||
1312 | int cpu = raw_smp_processor_id(); | ||
1313 | |||
1314 | struct svm_cpu_data *svm_data = per_cpu(svm_data, cpu); | ||
1315 | |||
1316 | svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING; | ||
1317 | if (svm->vcpu.cpu != cpu || | ||
1318 | svm->asid_generation != svm_data->asid_generation) | ||
1319 | new_asid(svm, svm_data); | ||
1320 | } | ||
1321 | |||
1322 | |||
1323 | static inline void svm_inject_irq(struct vcpu_svm *svm, int irq) | ||
1324 | { | ||
1325 | struct vmcb_control_area *control; | ||
1326 | |||
1327 | control = &svm->vmcb->control; | ||
1328 | control->int_vector = irq; | ||
1329 | control->int_ctl &= ~V_INTR_PRIO_MASK; | ||
1330 | control->int_ctl |= V_IRQ_MASK | | ||
1331 | ((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT); | ||
1332 | } | ||
1333 | |||
1334 | static void svm_set_irq(struct kvm_vcpu *vcpu, int irq) | ||
1335 | { | ||
1336 | struct vcpu_svm *svm = to_svm(vcpu); | ||
1337 | |||
1338 | svm_inject_irq(svm, irq); | ||
1339 | } | ||
1340 | |||
1341 | static void svm_intr_assist(struct kvm_vcpu *vcpu) | ||
1342 | { | ||
1343 | struct vcpu_svm *svm = to_svm(vcpu); | ||
1344 | struct vmcb *vmcb = svm->vmcb; | ||
1345 | int intr_vector = -1; | ||
1346 | |||
1347 | if ((vmcb->control.exit_int_info & SVM_EVTINJ_VALID) && | ||
1348 | ((vmcb->control.exit_int_info & SVM_EVTINJ_TYPE_MASK) == 0)) { | ||
1349 | intr_vector = vmcb->control.exit_int_info & | ||
1350 | SVM_EVTINJ_VEC_MASK; | ||
1351 | vmcb->control.exit_int_info = 0; | ||
1352 | svm_inject_irq(svm, intr_vector); | ||
1353 | return; | ||
1354 | } | ||
1355 | |||
1356 | if (vmcb->control.int_ctl & V_IRQ_MASK) | ||
1357 | return; | ||
1358 | |||
1359 | if (!kvm_cpu_has_interrupt(vcpu)) | ||
1360 | return; | ||
1361 | |||
1362 | if (!(vmcb->save.rflags & X86_EFLAGS_IF) || | ||
1363 | (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) || | ||
1364 | (vmcb->control.event_inj & SVM_EVTINJ_VALID)) { | ||
1365 | /* unable to deliver irq, set pending irq */ | ||
1366 | vmcb->control.intercept |= (1ULL << INTERCEPT_VINTR); | ||
1367 | svm_inject_irq(svm, 0x0); | ||
1368 | return; | ||
1369 | } | ||
1370 | /* Okay, we can deliver the interrupt: grab it and update PIC state. */ | ||
1371 | intr_vector = kvm_cpu_get_interrupt(vcpu); | ||
1372 | svm_inject_irq(svm, intr_vector); | ||
1373 | kvm_timer_intr_post(vcpu, intr_vector); | ||
1374 | } | ||
1375 | |||
1376 | static void kvm_reput_irq(struct vcpu_svm *svm) | ||
1377 | { | ||
1378 | struct vmcb_control_area *control = &svm->vmcb->control; | ||
1379 | |||
1380 | if ((control->int_ctl & V_IRQ_MASK) | ||
1381 | && !irqchip_in_kernel(svm->vcpu.kvm)) { | ||
1382 | control->int_ctl &= ~V_IRQ_MASK; | ||
1383 | push_irq(&svm->vcpu, control->int_vector); | ||
1384 | } | ||
1385 | |||
1386 | svm->vcpu.arch.interrupt_window_open = | ||
1387 | !(control->int_state & SVM_INTERRUPT_SHADOW_MASK); | ||
1388 | } | ||
1389 | |||
1390 | static void svm_do_inject_vector(struct vcpu_svm *svm) | ||
1391 | { | ||
1392 | struct kvm_vcpu *vcpu = &svm->vcpu; | ||
1393 | int word_index = __ffs(vcpu->arch.irq_summary); | ||
1394 | int bit_index = __ffs(vcpu->arch.irq_pending[word_index]); | ||
1395 | int irq = word_index * BITS_PER_LONG + bit_index; | ||
1396 | |||
1397 | clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]); | ||
1398 | if (!vcpu->arch.irq_pending[word_index]) | ||
1399 | clear_bit(word_index, &vcpu->arch.irq_summary); | ||
1400 | svm_inject_irq(svm, irq); | ||
1401 | } | ||
1402 | |||
1403 | static void do_interrupt_requests(struct kvm_vcpu *vcpu, | ||
1404 | struct kvm_run *kvm_run) | ||
1405 | { | ||
1406 | struct vcpu_svm *svm = to_svm(vcpu); | ||
1407 | struct vmcb_control_area *control = &svm->vmcb->control; | ||
1408 | |||
1409 | svm->vcpu.arch.interrupt_window_open = | ||
1410 | (!(control->int_state & SVM_INTERRUPT_SHADOW_MASK) && | ||
1411 | (svm->vmcb->save.rflags & X86_EFLAGS_IF)); | ||
1412 | |||
1413 | if (svm->vcpu.arch.interrupt_window_open && svm->vcpu.arch.irq_summary) | ||
1414 | /* | ||
1415 | * If interrupts are enabled and not blocked by sti or mov ss, inject now. | ||
1416 | */ | ||
1417 | svm_do_inject_vector(svm); | ||
1418 | |||
1419 | /* | ||
1420 | * Interrupts blocked. Wait for unblock. | ||
1421 | */ | ||
1422 | if (!svm->vcpu.arch.interrupt_window_open && | ||
1423 | (svm->vcpu.arch.irq_summary || kvm_run->request_interrupt_window)) | ||
1424 | control->intercept |= 1ULL << INTERCEPT_VINTR; | ||
1425 | else | ||
1426 | control->intercept &= ~(1ULL << INTERCEPT_VINTR); | ||
1427 | } | ||
1428 | |||
1429 | static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr) | ||
1430 | { | ||
1431 | return 0; | ||
1432 | } | ||
1433 | |||
1434 | static void save_db_regs(unsigned long *db_regs) | ||
1435 | { | ||
1436 | asm volatile ("mov %%dr0, %0" : "=r"(db_regs[0])); | ||
1437 | asm volatile ("mov %%dr1, %0" : "=r"(db_regs[1])); | ||
1438 | asm volatile ("mov %%dr2, %0" : "=r"(db_regs[2])); | ||
1439 | asm volatile ("mov %%dr3, %0" : "=r"(db_regs[3])); | ||
1440 | } | ||
1441 | |||
1442 | static void load_db_regs(unsigned long *db_regs) | ||
1443 | { | ||
1444 | asm volatile ("mov %0, %%dr0" : : "r"(db_regs[0])); | ||
1445 | asm volatile ("mov %0, %%dr1" : : "r"(db_regs[1])); | ||
1446 | asm volatile ("mov %0, %%dr2" : : "r"(db_regs[2])); | ||
1447 | asm volatile ("mov %0, %%dr3" : : "r"(db_regs[3])); | ||
1448 | } | ||
1449 | |||
1450 | static void svm_flush_tlb(struct kvm_vcpu *vcpu) | ||
1451 | { | ||
1452 | force_new_asid(vcpu); | ||
1453 | } | ||
1454 | |||
1455 | static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu) | ||
1456 | { | ||
1457 | } | ||
1458 | |||
1459 | static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | ||
1460 | { | ||
1461 | struct vcpu_svm *svm = to_svm(vcpu); | ||
1462 | u16 fs_selector; | ||
1463 | u16 gs_selector; | ||
1464 | u16 ldt_selector; | ||
1465 | |||
1466 | pre_svm_run(svm); | ||
1467 | |||
1468 | save_host_msrs(vcpu); | ||
1469 | fs_selector = read_fs(); | ||
1470 | gs_selector = read_gs(); | ||
1471 | ldt_selector = read_ldt(); | ||
1472 | svm->host_cr2 = kvm_read_cr2(); | ||
1473 | svm->host_dr6 = read_dr6(); | ||
1474 | svm->host_dr7 = read_dr7(); | ||
1475 | svm->vmcb->save.cr2 = vcpu->arch.cr2; | ||
1476 | |||
1477 | if (svm->vmcb->save.dr7 & 0xff) { | ||
1478 | write_dr7(0); | ||
1479 | save_db_regs(svm->host_db_regs); | ||
1480 | load_db_regs(svm->db_regs); | ||
1481 | } | ||
1482 | |||
1483 | clgi(); | ||
1484 | |||
1485 | local_irq_enable(); | ||
1486 | |||
1487 | asm volatile ( | ||
1488 | #ifdef CONFIG_X86_64 | ||
1489 | "push %%rbp; \n\t" | ||
1490 | #else | ||
1491 | "push %%ebp; \n\t" | ||
1492 | #endif | ||
1493 | |||
1494 | #ifdef CONFIG_X86_64 | ||
1495 | "mov %c[rbx](%[svm]), %%rbx \n\t" | ||
1496 | "mov %c[rcx](%[svm]), %%rcx \n\t" | ||
1497 | "mov %c[rdx](%[svm]), %%rdx \n\t" | ||
1498 | "mov %c[rsi](%[svm]), %%rsi \n\t" | ||
1499 | "mov %c[rdi](%[svm]), %%rdi \n\t" | ||
1500 | "mov %c[rbp](%[svm]), %%rbp \n\t" | ||
1501 | "mov %c[r8](%[svm]), %%r8 \n\t" | ||
1502 | "mov %c[r9](%[svm]), %%r9 \n\t" | ||
1503 | "mov %c[r10](%[svm]), %%r10 \n\t" | ||
1504 | "mov %c[r11](%[svm]), %%r11 \n\t" | ||
1505 | "mov %c[r12](%[svm]), %%r12 \n\t" | ||
1506 | "mov %c[r13](%[svm]), %%r13 \n\t" | ||
1507 | "mov %c[r14](%[svm]), %%r14 \n\t" | ||
1508 | "mov %c[r15](%[svm]), %%r15 \n\t" | ||
1509 | #else | ||
1510 | "mov %c[rbx](%[svm]), %%ebx \n\t" | ||
1511 | "mov %c[rcx](%[svm]), %%ecx \n\t" | ||
1512 | "mov %c[rdx](%[svm]), %%edx \n\t" | ||
1513 | "mov %c[rsi](%[svm]), %%esi \n\t" | ||
1514 | "mov %c[rdi](%[svm]), %%edi \n\t" | ||
1515 | "mov %c[rbp](%[svm]), %%ebp \n\t" | ||
1516 | #endif | ||
1517 | |||
1518 | #ifdef CONFIG_X86_64 | ||
1519 | /* Enter guest mode */ | ||
1520 | "push %%rax \n\t" | ||
1521 | "mov %c[vmcb](%[svm]), %%rax \n\t" | ||
1522 | SVM_VMLOAD "\n\t" | ||
1523 | SVM_VMRUN "\n\t" | ||
1524 | SVM_VMSAVE "\n\t" | ||
1525 | "pop %%rax \n\t" | ||
1526 | #else | ||
1527 | /* Enter guest mode */ | ||
1528 | "push %%eax \n\t" | ||
1529 | "mov %c[vmcb](%[svm]), %%eax \n\t" | ||
1530 | SVM_VMLOAD "\n\t" | ||
1531 | SVM_VMRUN "\n\t" | ||
1532 | SVM_VMSAVE "\n\t" | ||
1533 | "pop %%eax \n\t" | ||
1534 | #endif | ||
1535 | |||
1536 | /* Save guest registers, load host registers */ | ||
1537 | #ifdef CONFIG_X86_64 | ||
1538 | "mov %%rbx, %c[rbx](%[svm]) \n\t" | ||
1539 | "mov %%rcx, %c[rcx](%[svm]) \n\t" | ||
1540 | "mov %%rdx, %c[rdx](%[svm]) \n\t" | ||
1541 | "mov %%rsi, %c[rsi](%[svm]) \n\t" | ||
1542 | "mov %%rdi, %c[rdi](%[svm]) \n\t" | ||
1543 | "mov %%rbp, %c[rbp](%[svm]) \n\t" | ||
1544 | "mov %%r8, %c[r8](%[svm]) \n\t" | ||
1545 | "mov %%r9, %c[r9](%[svm]) \n\t" | ||
1546 | "mov %%r10, %c[r10](%[svm]) \n\t" | ||
1547 | "mov %%r11, %c[r11](%[svm]) \n\t" | ||
1548 | "mov %%r12, %c[r12](%[svm]) \n\t" | ||
1549 | "mov %%r13, %c[r13](%[svm]) \n\t" | ||
1550 | "mov %%r14, %c[r14](%[svm]) \n\t" | ||
1551 | "mov %%r15, %c[r15](%[svm]) \n\t" | ||
1552 | |||
1553 | "pop %%rbp; \n\t" | ||
1554 | #else | ||
1555 | "mov %%ebx, %c[rbx](%[svm]) \n\t" | ||
1556 | "mov %%ecx, %c[rcx](%[svm]) \n\t" | ||
1557 | "mov %%edx, %c[rdx](%[svm]) \n\t" | ||
1558 | "mov %%esi, %c[rsi](%[svm]) \n\t" | ||
1559 | "mov %%edi, %c[rdi](%[svm]) \n\t" | ||
1560 | "mov %%ebp, %c[rbp](%[svm]) \n\t" | ||
1561 | |||
1562 | "pop %%ebp; \n\t" | ||
1563 | #endif | ||
1564 | : | ||
1565 | : [svm]"a"(svm), | ||
1566 | [vmcb]"i"(offsetof(struct vcpu_svm, vmcb_pa)), | ||
1567 | [rbx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RBX])), | ||
1568 | [rcx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RCX])), | ||
1569 | [rdx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RDX])), | ||
1570 | [rsi]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RSI])), | ||
1571 | [rdi]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RDI])), | ||
1572 | [rbp]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RBP])) | ||
1573 | #ifdef CONFIG_X86_64 | ||
1574 | , [r8]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R8])), | ||
1575 | [r9]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R9])), | ||
1576 | [r10]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R10])), | ||
1577 | [r11]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R11])), | ||
1578 | [r12]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R12])), | ||
1579 | [r13]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R13])), | ||
1580 | [r14]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R14])), | ||
1581 | [r15]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R15])) | ||
1582 | #endif | ||
1583 | : "cc", "memory" | ||
1584 | #ifdef CONFIG_X86_64 | ||
1585 | , "rbx", "rcx", "rdx", "rsi", "rdi" | ||
1586 | , "r8", "r9", "r10", "r11" , "r12", "r13", "r14", "r15" | ||
1587 | #else | ||
1588 | , "ebx", "ecx", "edx" , "esi", "edi" | ||
1589 | #endif | ||
1590 | ); | ||
1591 | |||
1592 | if ((svm->vmcb->save.dr7 & 0xff)) | ||
1593 | load_db_regs(svm->host_db_regs); | ||
1594 | |||
1595 | vcpu->arch.cr2 = svm->vmcb->save.cr2; | ||
1596 | |||
1597 | write_dr6(svm->host_dr6); | ||
1598 | write_dr7(svm->host_dr7); | ||
1599 | kvm_write_cr2(svm->host_cr2); | ||
1600 | |||
1601 | load_fs(fs_selector); | ||
1602 | load_gs(gs_selector); | ||
1603 | load_ldt(ldt_selector); | ||
1604 | load_host_msrs(vcpu); | ||
1605 | |||
1606 | reload_tss(vcpu); | ||
1607 | |||
1608 | local_irq_disable(); | ||
1609 | |||
1610 | stgi(); | ||
1611 | |||
1612 | svm->next_rip = 0; | ||
1613 | } | ||
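The register save/restore in the asm block relies on offsetof() being a compile-time constant: the "i" constraint together with the %c output modifier lets the compiler fold each field offset directly into the mov instructions, so no extra registers are needed to address the guest register array. A standalone sketch of the same trick, using a made-up structure rather than the kernel's vcpu_svm:

    #include <stddef.h>
    #include <stdio.h>

    struct toy_regs { unsigned long rax, rbx, rcx; };
    struct toy_vcpu { int id; struct toy_regs regs; };

    int main(void)
    {
            /*
             * In the kernel asm, "i"(offsetof(...)) plus the %c modifier
             * emits "mov <offset>(%base), %reg" with the offset baked in
             * as an immediate at compile time.
             */
            printf("rbx lives at offset %zu in struct toy_vcpu\n",
                   offsetof(struct toy_vcpu, regs.rbx));
            return 0;
    }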
1614 | |||
1615 | static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root) | ||
1616 | { | ||
1617 | struct vcpu_svm *svm = to_svm(vcpu); | ||
1618 | |||
1619 | svm->vmcb->save.cr3 = root; | ||
1620 | force_new_asid(vcpu); | ||
1621 | |||
1622 | if (vcpu->fpu_active) { | ||
1623 | svm->vmcb->control.intercept_exceptions |= (1 << NM_VECTOR); | ||
1624 | svm->vmcb->save.cr0 |= X86_CR0_TS; | ||
1625 | vcpu->fpu_active = 0; | ||
1626 | } | ||
1627 | } | ||
1628 | |||
1629 | static int is_disabled(void) | ||
1630 | { | ||
1631 | u64 vm_cr; | ||
1632 | |||
1633 | rdmsrl(MSR_VM_CR, vm_cr); | ||
1634 | if (vm_cr & (1 << SVM_VM_CR_SVM_DISABLE)) | ||
1635 | return 1; | ||
1636 | |||
1637 | return 0; | ||
1638 | } | ||
1639 | |||
1640 | static void | ||
1641 | svm_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall) | ||
1642 | { | ||
1643 | /* | ||
1644 | * Patch in the VMMCALL instruction: | ||
1645 | */ | ||
1646 | hypercall[0] = 0x0f; | ||
1647 | hypercall[1] = 0x01; | ||
1648 | hypercall[2] = 0xd9; | ||
1649 | } | ||
1650 | |||
1651 | static void svm_check_processor_compat(void *rtn) | ||
1652 | { | ||
1653 | *(int *)rtn = 0; | ||
1654 | } | ||
1655 | |||
1656 | static struct kvm_x86_ops svm_x86_ops = { | ||
1657 | .cpu_has_kvm_support = has_svm, | ||
1658 | .disabled_by_bios = is_disabled, | ||
1659 | .hardware_setup = svm_hardware_setup, | ||
1660 | .hardware_unsetup = svm_hardware_unsetup, | ||
1661 | .check_processor_compatibility = svm_check_processor_compat, | ||
1662 | .hardware_enable = svm_hardware_enable, | ||
1663 | .hardware_disable = svm_hardware_disable, | ||
1664 | |||
1665 | .vcpu_create = svm_create_vcpu, | ||
1666 | .vcpu_free = svm_free_vcpu, | ||
1667 | .vcpu_reset = svm_vcpu_reset, | ||
1668 | |||
1669 | .prepare_guest_switch = svm_prepare_guest_switch, | ||
1670 | .vcpu_load = svm_vcpu_load, | ||
1671 | .vcpu_put = svm_vcpu_put, | ||
1672 | .vcpu_decache = svm_vcpu_decache, | ||
1673 | |||
1674 | .set_guest_debug = svm_guest_debug, | ||
1675 | .get_msr = svm_get_msr, | ||
1676 | .set_msr = svm_set_msr, | ||
1677 | .get_segment_base = svm_get_segment_base, | ||
1678 | .get_segment = svm_get_segment, | ||
1679 | .set_segment = svm_set_segment, | ||
1680 | .get_cs_db_l_bits = kvm_get_cs_db_l_bits, | ||
1681 | .decache_cr4_guest_bits = svm_decache_cr4_guest_bits, | ||
1682 | .set_cr0 = svm_set_cr0, | ||
1683 | .set_cr3 = svm_set_cr3, | ||
1684 | .set_cr4 = svm_set_cr4, | ||
1685 | .set_efer = svm_set_efer, | ||
1686 | .get_idt = svm_get_idt, | ||
1687 | .set_idt = svm_set_idt, | ||
1688 | .get_gdt = svm_get_gdt, | ||
1689 | .set_gdt = svm_set_gdt, | ||
1690 | .get_dr = svm_get_dr, | ||
1691 | .set_dr = svm_set_dr, | ||
1692 | .cache_regs = svm_cache_regs, | ||
1693 | .decache_regs = svm_decache_regs, | ||
1694 | .get_rflags = svm_get_rflags, | ||
1695 | .set_rflags = svm_set_rflags, | ||
1696 | |||
1697 | .tlb_flush = svm_flush_tlb, | ||
1698 | |||
1699 | .run = svm_vcpu_run, | ||
1700 | .handle_exit = handle_exit, | ||
1701 | .skip_emulated_instruction = skip_emulated_instruction, | ||
1702 | .patch_hypercall = svm_patch_hypercall, | ||
1703 | .get_irq = svm_get_irq, | ||
1704 | .set_irq = svm_set_irq, | ||
1705 | .queue_exception = svm_queue_exception, | ||
1706 | .exception_injected = svm_exception_injected, | ||
1707 | .inject_pending_irq = svm_intr_assist, | ||
1708 | .inject_pending_vectors = do_interrupt_requests, | ||
1709 | |||
1710 | .set_tss_addr = svm_set_tss_addr, | ||
1711 | }; | ||
1712 | |||
1713 | static int __init svm_init(void) | ||
1714 | { | ||
1715 | return kvm_init(&svm_x86_ops, sizeof(struct vcpu_svm), | ||
1716 | THIS_MODULE); | ||
1717 | } | ||
1718 | |||
1719 | static void __exit svm_exit(void) | ||
1720 | { | ||
1721 | kvm_exit(); | ||
1722 | } | ||
1723 | |||
1724 | module_init(svm_init) | ||
1725 | module_exit(svm_exit) | ||
diff --git a/drivers/kvm/svm.h b/drivers/kvm/svm.h deleted file mode 100644 index 5fd50491b555..000000000000 --- a/drivers/kvm/svm.h +++ /dev/null | |||
@@ -1,325 +0,0 @@ | |||
1 | #ifndef __SVM_H | ||
2 | #define __SVM_H | ||
3 | |||
4 | enum { | ||
5 | INTERCEPT_INTR, | ||
6 | INTERCEPT_NMI, | ||
7 | INTERCEPT_SMI, | ||
8 | INTERCEPT_INIT, | ||
9 | INTERCEPT_VINTR, | ||
10 | INTERCEPT_SELECTIVE_CR0, | ||
11 | INTERCEPT_STORE_IDTR, | ||
12 | INTERCEPT_STORE_GDTR, | ||
13 | INTERCEPT_STORE_LDTR, | ||
14 | INTERCEPT_STORE_TR, | ||
15 | INTERCEPT_LOAD_IDTR, | ||
16 | INTERCEPT_LOAD_GDTR, | ||
17 | INTERCEPT_LOAD_LDTR, | ||
18 | INTERCEPT_LOAD_TR, | ||
19 | INTERCEPT_RDTSC, | ||
20 | INTERCEPT_RDPMC, | ||
21 | INTERCEPT_PUSHF, | ||
22 | INTERCEPT_POPF, | ||
23 | INTERCEPT_CPUID, | ||
24 | INTERCEPT_RSM, | ||
25 | INTERCEPT_IRET, | ||
26 | INTERCEPT_INTn, | ||
27 | INTERCEPT_INVD, | ||
28 | INTERCEPT_PAUSE, | ||
29 | INTERCEPT_HLT, | ||
30 | INTERCEPT_INVLPG, | ||
31 | INTERCEPT_INVLPGA, | ||
32 | INTERCEPT_IOIO_PROT, | ||
33 | INTERCEPT_MSR_PROT, | ||
34 | INTERCEPT_TASK_SWITCH, | ||
35 | INTERCEPT_FERR_FREEZE, | ||
36 | INTERCEPT_SHUTDOWN, | ||
37 | INTERCEPT_VMRUN, | ||
38 | INTERCEPT_VMMCALL, | ||
39 | INTERCEPT_VMLOAD, | ||
40 | INTERCEPT_VMSAVE, | ||
41 | INTERCEPT_STGI, | ||
42 | INTERCEPT_CLGI, | ||
43 | INTERCEPT_SKINIT, | ||
44 | INTERCEPT_RDTSCP, | ||
45 | INTERCEPT_ICEBP, | ||
46 | INTERCEPT_WBINVD, | ||
47 | INTERCEPT_MONITOR, | ||
48 | INTERCEPT_MWAIT, | ||
49 | INTERCEPT_MWAIT_COND, | ||
50 | }; | ||
51 | |||
52 | |||
53 | struct __attribute__ ((__packed__)) vmcb_control_area { | ||
54 | u16 intercept_cr_read; | ||
55 | u16 intercept_cr_write; | ||
56 | u16 intercept_dr_read; | ||
57 | u16 intercept_dr_write; | ||
58 | u32 intercept_exceptions; | ||
59 | u64 intercept; | ||
60 | u8 reserved_1[44]; | ||
61 | u64 iopm_base_pa; | ||
62 | u64 msrpm_base_pa; | ||
63 | u64 tsc_offset; | ||
64 | u32 asid; | ||
65 | u8 tlb_ctl; | ||
66 | u8 reserved_2[3]; | ||
67 | u32 int_ctl; | ||
68 | u32 int_vector; | ||
69 | u32 int_state; | ||
70 | u8 reserved_3[4]; | ||
71 | u32 exit_code; | ||
72 | u32 exit_code_hi; | ||
73 | u64 exit_info_1; | ||
74 | u64 exit_info_2; | ||
75 | u32 exit_int_info; | ||
76 | u32 exit_int_info_err; | ||
77 | u64 nested_ctl; | ||
78 | u8 reserved_4[16]; | ||
79 | u32 event_inj; | ||
80 | u32 event_inj_err; | ||
81 | u64 nested_cr3; | ||
82 | u64 lbr_ctl; | ||
83 | u8 reserved_5[832]; | ||
84 | }; | ||
85 | |||
86 | |||
87 | #define TLB_CONTROL_DO_NOTHING 0 | ||
88 | #define TLB_CONTROL_FLUSH_ALL_ASID 1 | ||
89 | |||
90 | #define V_TPR_MASK 0x0f | ||
91 | |||
92 | #define V_IRQ_SHIFT 8 | ||
93 | #define V_IRQ_MASK (1 << V_IRQ_SHIFT) | ||
94 | |||
95 | #define V_INTR_PRIO_SHIFT 16 | ||
96 | #define V_INTR_PRIO_MASK (0x0f << V_INTR_PRIO_SHIFT) | ||
97 | |||
98 | #define V_IGN_TPR_SHIFT 20 | ||
99 | #define V_IGN_TPR_MASK (1 << V_IGN_TPR_SHIFT) | ||
100 | |||
101 | #define V_INTR_MASKING_SHIFT 24 | ||
102 | #define V_INTR_MASKING_MASK (1 << V_INTR_MASKING_SHIFT) | ||
103 | |||
104 | #define SVM_INTERRUPT_SHADOW_MASK 1 | ||
105 | |||
106 | #define SVM_IOIO_STR_SHIFT 2 | ||
107 | #define SVM_IOIO_REP_SHIFT 3 | ||
108 | #define SVM_IOIO_SIZE_SHIFT 4 | ||
109 | #define SVM_IOIO_ASIZE_SHIFT 7 | ||
110 | |||
111 | #define SVM_IOIO_TYPE_MASK 1 | ||
112 | #define SVM_IOIO_STR_MASK (1 << SVM_IOIO_STR_SHIFT) | ||
113 | #define SVM_IOIO_REP_MASK (1 << SVM_IOIO_REP_SHIFT) | ||
114 | #define SVM_IOIO_SIZE_MASK (7 << SVM_IOIO_SIZE_SHIFT) | ||
115 | #define SVM_IOIO_ASIZE_MASK (7 << SVM_IOIO_ASIZE_SHIFT) | ||
116 | |||
117 | struct __attribute__ ((__packed__)) vmcb_seg { | ||
118 | u16 selector; | ||
119 | u16 attrib; | ||
120 | u32 limit; | ||
121 | u64 base; | ||
122 | }; | ||
123 | |||
124 | struct __attribute__ ((__packed__)) vmcb_save_area { | ||
125 | struct vmcb_seg es; | ||
126 | struct vmcb_seg cs; | ||
127 | struct vmcb_seg ss; | ||
128 | struct vmcb_seg ds; | ||
129 | struct vmcb_seg fs; | ||
130 | struct vmcb_seg gs; | ||
131 | struct vmcb_seg gdtr; | ||
132 | struct vmcb_seg ldtr; | ||
133 | struct vmcb_seg idtr; | ||
134 | struct vmcb_seg tr; | ||
135 | u8 reserved_1[43]; | ||
136 | u8 cpl; | ||
137 | u8 reserved_2[4]; | ||
138 | u64 efer; | ||
139 | u8 reserved_3[112]; | ||
140 | u64 cr4; | ||
141 | u64 cr3; | ||
142 | u64 cr0; | ||
143 | u64 dr7; | ||
144 | u64 dr6; | ||
145 | u64 rflags; | ||
146 | u64 rip; | ||
147 | u8 reserved_4[88]; | ||
148 | u64 rsp; | ||
149 | u8 reserved_5[24]; | ||
150 | u64 rax; | ||
151 | u64 star; | ||
152 | u64 lstar; | ||
153 | u64 cstar; | ||
154 | u64 sfmask; | ||
155 | u64 kernel_gs_base; | ||
156 | u64 sysenter_cs; | ||
157 | u64 sysenter_esp; | ||
158 | u64 sysenter_eip; | ||
159 | u64 cr2; | ||
160 | u8 reserved_6[32]; | ||
161 | u64 g_pat; | ||
162 | u64 dbgctl; | ||
163 | u64 br_from; | ||
164 | u64 br_to; | ||
165 | u64 last_excp_from; | ||
166 | u64 last_excp_to; | ||
167 | }; | ||
168 | |||
169 | struct __attribute__ ((__packed__)) vmcb { | ||
170 | struct vmcb_control_area control; | ||
171 | struct vmcb_save_area save; | ||
172 | }; | ||
173 | |||
174 | #define SVM_CPUID_FEATURE_SHIFT 2 | ||
175 | #define SVM_CPUID_FUNC 0x8000000a | ||
176 | |||
177 | #define MSR_EFER_SVME_MASK (1ULL << 12) | ||
178 | #define MSR_VM_CR 0xc0010114 | ||
179 | #define MSR_VM_HSAVE_PA 0xc0010117ULL | ||
180 | |||
181 | #define SVM_VM_CR_SVM_DISABLE 4 | ||
182 | |||
183 | #define SVM_SELECTOR_S_SHIFT 4 | ||
184 | #define SVM_SELECTOR_DPL_SHIFT 5 | ||
185 | #define SVM_SELECTOR_P_SHIFT 7 | ||
186 | #define SVM_SELECTOR_AVL_SHIFT 8 | ||
187 | #define SVM_SELECTOR_L_SHIFT 9 | ||
188 | #define SVM_SELECTOR_DB_SHIFT 10 | ||
189 | #define SVM_SELECTOR_G_SHIFT 11 | ||
190 | |||
191 | #define SVM_SELECTOR_TYPE_MASK (0xf) | ||
192 | #define SVM_SELECTOR_S_MASK (1 << SVM_SELECTOR_S_SHIFT) | ||
193 | #define SVM_SELECTOR_DPL_MASK (3 << SVM_SELECTOR_DPL_SHIFT) | ||
194 | #define SVM_SELECTOR_P_MASK (1 << SVM_SELECTOR_P_SHIFT) | ||
195 | #define SVM_SELECTOR_AVL_MASK (1 << SVM_SELECTOR_AVL_SHIFT) | ||
196 | #define SVM_SELECTOR_L_MASK (1 << SVM_SELECTOR_L_SHIFT) | ||
197 | #define SVM_SELECTOR_DB_MASK (1 << SVM_SELECTOR_DB_SHIFT) | ||
198 | #define SVM_SELECTOR_G_MASK (1 << SVM_SELECTOR_G_SHIFT) | ||
199 | |||
200 | #define SVM_SELECTOR_WRITE_MASK (1 << 1) | ||
201 | #define SVM_SELECTOR_READ_MASK SVM_SELECTOR_WRITE_MASK | ||
202 | #define SVM_SELECTOR_CODE_MASK (1 << 3) | ||
203 | |||
204 | #define INTERCEPT_CR0_MASK 1 | ||
205 | #define INTERCEPT_CR3_MASK (1 << 3) | ||
206 | #define INTERCEPT_CR4_MASK (1 << 4) | ||
207 | #define INTERCEPT_CR8_MASK (1 << 8) | ||
208 | |||
209 | #define INTERCEPT_DR0_MASK 1 | ||
210 | #define INTERCEPT_DR1_MASK (1 << 1) | ||
211 | #define INTERCEPT_DR2_MASK (1 << 2) | ||
212 | #define INTERCEPT_DR3_MASK (1 << 3) | ||
213 | #define INTERCEPT_DR4_MASK (1 << 4) | ||
214 | #define INTERCEPT_DR5_MASK (1 << 5) | ||
215 | #define INTERCEPT_DR6_MASK (1 << 6) | ||
216 | #define INTERCEPT_DR7_MASK (1 << 7) | ||
217 | |||
218 | #define SVM_EVTINJ_VEC_MASK 0xff | ||
219 | |||
220 | #define SVM_EVTINJ_TYPE_SHIFT 8 | ||
221 | #define SVM_EVTINJ_TYPE_MASK (7 << SVM_EVTINJ_TYPE_SHIFT) | ||
222 | |||
223 | #define SVM_EVTINJ_TYPE_INTR (0 << SVM_EVTINJ_TYPE_SHIFT) | ||
224 | #define SVM_EVTINJ_TYPE_NMI (2 << SVM_EVTINJ_TYPE_SHIFT) | ||
225 | #define SVM_EVTINJ_TYPE_EXEPT (3 << SVM_EVTINJ_TYPE_SHIFT) | ||
226 | #define SVM_EVTINJ_TYPE_SOFT (4 << SVM_EVTINJ_TYPE_SHIFT) | ||
227 | |||
228 | #define SVM_EVTINJ_VALID (1 << 31) | ||
229 | #define SVM_EVTINJ_VALID_ERR (1 << 11) | ||
230 | |||
231 | #define SVM_EXITINTINFO_VEC_MASK SVM_EVTINJ_VEC_MASK | ||
232 | |||
233 | #define SVM_EXITINTINFO_TYPE_INTR SVM_EVTINJ_TYPE_INTR | ||
234 | #define SVM_EXITINTINFO_TYPE_NMI SVM_EVTINJ_TYPE_NMI | ||
235 | #define SVM_EXITINTINFO_TYPE_EXEPT SVM_EVTINJ_TYPE_EXEPT | ||
236 | #define SVM_EXITINTINFO_TYPE_SOFT SVM_EVTINJ_TYPE_SOFT | ||
237 | |||
238 | #define SVM_EXITINTINFO_VALID SVM_EVTINJ_VALID | ||
239 | #define SVM_EXITINTINFO_VALID_ERR SVM_EVTINJ_VALID_ERR | ||
240 | |||
241 | #define SVM_EXIT_READ_CR0 0x000 | ||
242 | #define SVM_EXIT_READ_CR3 0x003 | ||
243 | #define SVM_EXIT_READ_CR4 0x004 | ||
244 | #define SVM_EXIT_READ_CR8 0x008 | ||
245 | #define SVM_EXIT_WRITE_CR0 0x010 | ||
246 | #define SVM_EXIT_WRITE_CR3 0x013 | ||
247 | #define SVM_EXIT_WRITE_CR4 0x014 | ||
248 | #define SVM_EXIT_WRITE_CR8 0x018 | ||
249 | #define SVM_EXIT_READ_DR0 0x020 | ||
250 | #define SVM_EXIT_READ_DR1 0x021 | ||
251 | #define SVM_EXIT_READ_DR2 0x022 | ||
252 | #define SVM_EXIT_READ_DR3 0x023 | ||
253 | #define SVM_EXIT_READ_DR4 0x024 | ||
254 | #define SVM_EXIT_READ_DR5 0x025 | ||
255 | #define SVM_EXIT_READ_DR6 0x026 | ||
256 | #define SVM_EXIT_READ_DR7 0x027 | ||
257 | #define SVM_EXIT_WRITE_DR0 0x030 | ||
258 | #define SVM_EXIT_WRITE_DR1 0x031 | ||
259 | #define SVM_EXIT_WRITE_DR2 0x032 | ||
260 | #define SVM_EXIT_WRITE_DR3 0x033 | ||
261 | #define SVM_EXIT_WRITE_DR4 0x034 | ||
262 | #define SVM_EXIT_WRITE_DR5 0x035 | ||
263 | #define SVM_EXIT_WRITE_DR6 0x036 | ||
264 | #define SVM_EXIT_WRITE_DR7 0x037 | ||
265 | #define SVM_EXIT_EXCP_BASE 0x040 | ||
266 | #define SVM_EXIT_INTR 0x060 | ||
267 | #define SVM_EXIT_NMI 0x061 | ||
268 | #define SVM_EXIT_SMI 0x062 | ||
269 | #define SVM_EXIT_INIT 0x063 | ||
270 | #define SVM_EXIT_VINTR 0x064 | ||
271 | #define SVM_EXIT_CR0_SEL_WRITE 0x065 | ||
272 | #define SVM_EXIT_IDTR_READ 0x066 | ||
273 | #define SVM_EXIT_GDTR_READ 0x067 | ||
274 | #define SVM_EXIT_LDTR_READ 0x068 | ||
275 | #define SVM_EXIT_TR_READ 0x069 | ||
276 | #define SVM_EXIT_IDTR_WRITE 0x06a | ||
277 | #define SVM_EXIT_GDTR_WRITE 0x06b | ||
278 | #define SVM_EXIT_LDTR_WRITE 0x06c | ||
279 | #define SVM_EXIT_TR_WRITE 0x06d | ||
280 | #define SVM_EXIT_RDTSC 0x06e | ||
281 | #define SVM_EXIT_RDPMC 0x06f | ||
282 | #define SVM_EXIT_PUSHF 0x070 | ||
283 | #define SVM_EXIT_POPF 0x071 | ||
284 | #define SVM_EXIT_CPUID 0x072 | ||
285 | #define SVM_EXIT_RSM 0x073 | ||
286 | #define SVM_EXIT_IRET 0x074 | ||
287 | #define SVM_EXIT_SWINT 0x075 | ||
288 | #define SVM_EXIT_INVD 0x076 | ||
289 | #define SVM_EXIT_PAUSE 0x077 | ||
290 | #define SVM_EXIT_HLT 0x078 | ||
291 | #define SVM_EXIT_INVLPG 0x079 | ||
292 | #define SVM_EXIT_INVLPGA 0x07a | ||
293 | #define SVM_EXIT_IOIO 0x07b | ||
294 | #define SVM_EXIT_MSR 0x07c | ||
295 | #define SVM_EXIT_TASK_SWITCH 0x07d | ||
296 | #define SVM_EXIT_FERR_FREEZE 0x07e | ||
297 | #define SVM_EXIT_SHUTDOWN 0x07f | ||
298 | #define SVM_EXIT_VMRUN 0x080 | ||
299 | #define SVM_EXIT_VMMCALL 0x081 | ||
300 | #define SVM_EXIT_VMLOAD 0x082 | ||
301 | #define SVM_EXIT_VMSAVE 0x083 | ||
302 | #define SVM_EXIT_STGI 0x084 | ||
303 | #define SVM_EXIT_CLGI 0x085 | ||
304 | #define SVM_EXIT_SKINIT 0x086 | ||
305 | #define SVM_EXIT_RDTSCP 0x087 | ||
306 | #define SVM_EXIT_ICEBP 0x088 | ||
307 | #define SVM_EXIT_WBINVD 0x089 | ||
308 | #define SVM_EXIT_MONITOR 0x08a | ||
309 | #define SVM_EXIT_MWAIT 0x08b | ||
310 | #define SVM_EXIT_MWAIT_COND 0x08c | ||
311 | #define SVM_EXIT_NPF 0x400 | ||
312 | |||
313 | #define SVM_EXIT_ERR -1 | ||
314 | |||
315 | #define SVM_CR0_SELECTIVE_MASK (1 << 3 | 1) /* TS and MP */ | ||
316 | |||
317 | #define SVM_VMLOAD ".byte 0x0f, 0x01, 0xda" | ||
318 | #define SVM_VMRUN ".byte 0x0f, 0x01, 0xd8" | ||
319 | #define SVM_VMSAVE ".byte 0x0f, 0x01, 0xdb" | ||
320 | #define SVM_CLGI ".byte 0x0f, 0x01, 0xdd" | ||
321 | #define SVM_STGI ".byte 0x0f, 0x01, 0xdc" | ||
322 | #define SVM_INVLPGA ".byte 0x0f, 0x01, 0xdf" | ||
323 | |||
324 | #endif | ||
325 | |||
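The SVM_EVTINJ_* definitions above describe how an event to inject is packed into the 32-bit event_inj field: vector in bits 7:0, type in bits 10:8, an error-code-valid bit, and a valid bit. A small userspace sketch using copies of those constants (the #GP example is purely illustrative):

    #include <stdint.h>
    #include <stdio.h>

    #define SVM_EVTINJ_VEC_MASK   0xff
    #define SVM_EVTINJ_TYPE_SHIFT 8
    #define SVM_EVTINJ_TYPE_EXEPT (3 << SVM_EVTINJ_TYPE_SHIFT)
    #define SVM_EVTINJ_VALID      (1u << 31)
    #define SVM_EVTINJ_VALID_ERR  (1 << 11)

    int main(void)
    {
            /* Inject #GP (vector 13) with an error code attached. */
            uint32_t event_inj = 13 | SVM_EVTINJ_TYPE_EXEPT |
                                 SVM_EVTINJ_VALID | SVM_EVTINJ_VALID_ERR;

            printf("event_inj = %#x, vector = %u\n",
                   event_inj, event_inj & SVM_EVTINJ_VEC_MASK);
            return 0;
    }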
diff --git a/drivers/kvm/types.h b/drivers/kvm/types.h deleted file mode 100644 index 1c4e46decb22..000000000000 --- a/drivers/kvm/types.h +++ /dev/null | |||
@@ -1,54 +0,0 @@ | |||
1 | /* | ||
2 | * This program is free software; you can redistribute it and/or modify | ||
3 | * it under the terms of the GNU General Public License as published by | ||
4 | * the Free Software Foundation; either version 2 of the License. | ||
5 | * | ||
6 | * This program is distributed in the hope that it will be useful, | ||
7 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
8 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
9 | * GNU General Public License for more details. | ||
10 | * | ||
11 | * You should have received a copy of the GNU General Public License | ||
12 | * along with this program; if not, write to the Free Software | ||
13 | * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. | ||
14 | * | ||
15 | */ | ||
16 | |||
17 | #ifndef __KVM_TYPES_H__ | ||
18 | #define __KVM_TYPES_H__ | ||
19 | |||
20 | #include <asm/types.h> | ||
21 | |||
22 | /* | ||
23 | * Address types: | ||
24 | * | ||
25 | * gva - guest virtual address | ||
26 | * gpa - guest physical address | ||
27 | * gfn - guest frame number | ||
28 | * hva - host virtual address | ||
29 | * hpa - host physical address | ||
30 | * hfn - host frame number | ||
31 | */ | ||
32 | |||
33 | typedef unsigned long gva_t; | ||
34 | typedef u64 gpa_t; | ||
35 | typedef unsigned long gfn_t; | ||
36 | |||
37 | typedef unsigned long hva_t; | ||
38 | typedef u64 hpa_t; | ||
39 | typedef unsigned long hfn_t; | ||
40 | |||
41 | struct kvm_pio_request { | ||
42 | unsigned long count; | ||
43 | int cur_count; | ||
44 | struct page *guest_pages[2]; | ||
45 | unsigned guest_page_offset; | ||
46 | int in; | ||
47 | int port; | ||
48 | int size; | ||
49 | int string; | ||
50 | int down; | ||
51 | int rep; | ||
52 | }; | ||
53 | |||
54 | #endif /* __KVM_TYPES_H__ */ | ||
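The address typedefs above keep guest-virtual, guest-physical and frame-number values in distinct types so that mixing them up stands out in review. As an illustration of the gpa/gfn relationship these types imply (the 4 KiB page size assumed here is not part of this header):

    #include <stdint.h>
    #include <stdio.h>

    typedef uint64_t gpa_t;
    typedef unsigned long gfn_t;

    #define PAGE_SHIFT 12   /* assumption for illustration only */

    int main(void)
    {
            gpa_t gpa = 0x12345678;
            gfn_t gfn = gpa >> PAGE_SHIFT;   /* guest frame containing gpa */

            printf("gpa %#llx -> gfn %#lx\n", (unsigned long long)gpa, gfn);
            return 0;
    }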
diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c deleted file mode 100644 index 11ca2340d38f..000000000000 --- a/drivers/kvm/vmx.c +++ /dev/null | |||
@@ -1,2673 +0,0 @@ | |||
1 | /* | ||
2 | * Kernel-based Virtual Machine driver for Linux | ||
3 | * | ||
4 | * This module enables machines with Intel VT-x extensions to run virtual | ||
5 | * machines without emulation or binary translation. | ||
6 | * | ||
7 | * Copyright (C) 2006 Qumranet, Inc. | ||
8 | * | ||
9 | * Authors: | ||
10 | * Avi Kivity <avi@qumranet.com> | ||
11 | * Yaniv Kamay <yaniv@qumranet.com> | ||
12 | * | ||
13 | * This work is licensed under the terms of the GNU GPL, version 2. See | ||
14 | * the COPYING file in the top-level directory. | ||
15 | * | ||
16 | */ | ||
17 | |||
18 | #include "kvm.h" | ||
19 | #include "x86.h" | ||
20 | #include "x86_emulate.h" | ||
21 | #include "irq.h" | ||
22 | #include "vmx.h" | ||
23 | #include "segment_descriptor.h" | ||
24 | #include "mmu.h" | ||
25 | |||
26 | #include <linux/module.h> | ||
27 | #include <linux/kernel.h> | ||
28 | #include <linux/mm.h> | ||
29 | #include <linux/highmem.h> | ||
30 | #include <linux/sched.h> | ||
31 | #include <linux/moduleparam.h> | ||
32 | |||
33 | #include <asm/io.h> | ||
34 | #include <asm/desc.h> | ||
35 | |||
36 | MODULE_AUTHOR("Qumranet"); | ||
37 | MODULE_LICENSE("GPL"); | ||
38 | |||
39 | static int bypass_guest_pf = 1; | ||
40 | module_param(bypass_guest_pf, bool, 0); | ||
41 | |||
42 | struct vmcs { | ||
43 | u32 revision_id; | ||
44 | u32 abort; | ||
45 | char data[0]; | ||
46 | }; | ||
47 | |||
48 | struct vcpu_vmx { | ||
49 | struct kvm_vcpu vcpu; | ||
50 | int launched; | ||
51 | u8 fail; | ||
52 | u32 idt_vectoring_info; | ||
53 | struct kvm_msr_entry *guest_msrs; | ||
54 | struct kvm_msr_entry *host_msrs; | ||
55 | int nmsrs; | ||
56 | int save_nmsrs; | ||
57 | int msr_offset_efer; | ||
58 | #ifdef CONFIG_X86_64 | ||
59 | int msr_offset_kernel_gs_base; | ||
60 | #endif | ||
61 | struct vmcs *vmcs; | ||
62 | struct { | ||
63 | int loaded; | ||
64 | u16 fs_sel, gs_sel, ldt_sel; | ||
65 | int gs_ldt_reload_needed; | ||
66 | int fs_reload_needed; | ||
67 | int guest_efer_loaded; | ||
68 | } host_state; | ||
69 | struct { | ||
70 | struct { | ||
71 | bool pending; | ||
72 | u8 vector; | ||
73 | unsigned rip; | ||
74 | } irq; | ||
75 | } rmode; | ||
76 | }; | ||
77 | |||
78 | static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) | ||
79 | { | ||
80 | return container_of(vcpu, struct vcpu_vmx, vcpu); | ||
81 | } | ||
82 | |||
83 | static int init_rmode_tss(struct kvm *kvm); | ||
84 | |||
85 | static DEFINE_PER_CPU(struct vmcs *, vmxarea); | ||
86 | static DEFINE_PER_CPU(struct vmcs *, current_vmcs); | ||
87 | |||
88 | static struct page *vmx_io_bitmap_a; | ||
89 | static struct page *vmx_io_bitmap_b; | ||
90 | |||
91 | static struct vmcs_config { | ||
92 | int size; | ||
93 | int order; | ||
94 | u32 revision_id; | ||
95 | u32 pin_based_exec_ctrl; | ||
96 | u32 cpu_based_exec_ctrl; | ||
97 | u32 cpu_based_2nd_exec_ctrl; | ||
98 | u32 vmexit_ctrl; | ||
99 | u32 vmentry_ctrl; | ||
100 | } vmcs_config; | ||
101 | |||
102 | #define VMX_SEGMENT_FIELD(seg) \ | ||
103 | [VCPU_SREG_##seg] = { \ | ||
104 | .selector = GUEST_##seg##_SELECTOR, \ | ||
105 | .base = GUEST_##seg##_BASE, \ | ||
106 | .limit = GUEST_##seg##_LIMIT, \ | ||
107 | .ar_bytes = GUEST_##seg##_AR_BYTES, \ | ||
108 | } | ||
109 | |||
110 | static struct kvm_vmx_segment_field { | ||
111 | unsigned selector; | ||
112 | unsigned base; | ||
113 | unsigned limit; | ||
114 | unsigned ar_bytes; | ||
115 | } kvm_vmx_segment_fields[] = { | ||
116 | VMX_SEGMENT_FIELD(CS), | ||
117 | VMX_SEGMENT_FIELD(DS), | ||
118 | VMX_SEGMENT_FIELD(ES), | ||
119 | VMX_SEGMENT_FIELD(FS), | ||
120 | VMX_SEGMENT_FIELD(GS), | ||
121 | VMX_SEGMENT_FIELD(SS), | ||
122 | VMX_SEGMENT_FIELD(TR), | ||
123 | VMX_SEGMENT_FIELD(LDTR), | ||
124 | }; | ||
125 | |||
126 | /* | ||
127 | * Keep MSR_K6_STAR at the end, as setup_msrs() will try to optimize it | ||
128 | * away by decrementing the array size. | ||
129 | */ | ||
130 | static const u32 vmx_msr_index[] = { | ||
131 | #ifdef CONFIG_X86_64 | ||
132 | MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, MSR_KERNEL_GS_BASE, | ||
133 | #endif | ||
134 | MSR_EFER, MSR_K6_STAR, | ||
135 | }; | ||
136 | #define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index) | ||
137 | |||
138 | static void load_msrs(struct kvm_msr_entry *e, int n) | ||
139 | { | ||
140 | int i; | ||
141 | |||
142 | for (i = 0; i < n; ++i) | ||
143 | wrmsrl(e[i].index, e[i].data); | ||
144 | } | ||
145 | |||
146 | static void save_msrs(struct kvm_msr_entry *e, int n) | ||
147 | { | ||
148 | int i; | ||
149 | |||
150 | for (i = 0; i < n; ++i) | ||
151 | rdmsrl(e[i].index, e[i].data); | ||
152 | } | ||
153 | |||
154 | static inline int is_page_fault(u32 intr_info) | ||
155 | { | ||
156 | return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | | ||
157 | INTR_INFO_VALID_MASK)) == | ||
158 | (INTR_TYPE_EXCEPTION | PF_VECTOR | INTR_INFO_VALID_MASK); | ||
159 | } | ||
160 | |||
161 | static inline int is_no_device(u32 intr_info) | ||
162 | { | ||
163 | return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | | ||
164 | INTR_INFO_VALID_MASK)) == | ||
165 | (INTR_TYPE_EXCEPTION | NM_VECTOR | INTR_INFO_VALID_MASK); | ||
166 | } | ||
167 | |||
168 | static inline int is_invalid_opcode(u32 intr_info) | ||
169 | { | ||
170 | return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | | ||
171 | INTR_INFO_VALID_MASK)) == | ||
172 | (INTR_TYPE_EXCEPTION | UD_VECTOR | INTR_INFO_VALID_MASK); | ||
173 | } | ||
174 | |||
175 | static inline int is_external_interrupt(u32 intr_info) | ||
176 | { | ||
177 | return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK)) | ||
178 | == (INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK); | ||
179 | } | ||
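All of the is_*() helpers above test the VM-exit interruption-information word, which packs the vector in bits 7:0, the event type in bits 10:8 and a valid flag in bit 31. A self-contained sketch of the same test, with the masks redefined locally rather than taken from the kernel headers:

    #include <stdint.h>
    #include <stdio.h>

    #define INTR_INFO_VECTOR_MASK    0xffu
    #define INTR_INFO_INTR_TYPE_MASK 0x700u
    #define INTR_INFO_VALID_MASK     (1u << 31)
    #define INTR_TYPE_EXCEPTION      (3u << 8)
    #define PF_VECTOR                14

    static int is_page_fault(uint32_t intr_info)
    {
            return (intr_info & (INTR_INFO_INTR_TYPE_MASK |
                                 INTR_INFO_VECTOR_MASK |
                                 INTR_INFO_VALID_MASK)) ==
                   (INTR_TYPE_EXCEPTION | PF_VECTOR | INTR_INFO_VALID_MASK);
    }

    int main(void)
    {
            uint32_t info = INTR_INFO_VALID_MASK | INTR_TYPE_EXCEPTION | PF_VECTOR;
            printf("page fault? %d\n", is_page_fault(info));
            return 0;
    }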
180 | |||
181 | static inline int cpu_has_vmx_tpr_shadow(void) | ||
182 | { | ||
183 | return (vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW); | ||
184 | } | ||
185 | |||
186 | static inline int vm_need_tpr_shadow(struct kvm *kvm) | ||
187 | { | ||
188 | return ((cpu_has_vmx_tpr_shadow()) && (irqchip_in_kernel(kvm))); | ||
189 | } | ||
190 | |||
191 | static inline int cpu_has_secondary_exec_ctrls(void) | ||
192 | { | ||
193 | return (vmcs_config.cpu_based_exec_ctrl & | ||
194 | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS); | ||
195 | } | ||
196 | |||
197 | static inline int cpu_has_vmx_virtualize_apic_accesses(void) | ||
198 | { | ||
199 | return (vmcs_config.cpu_based_2nd_exec_ctrl & | ||
200 | SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES); | ||
201 | } | ||
202 | |||
203 | static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm) | ||
204 | { | ||
205 | return ((cpu_has_vmx_virtualize_apic_accesses()) && | ||
206 | (irqchip_in_kernel(kvm))); | ||
207 | } | ||
208 | |||
209 | static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr) | ||
210 | { | ||
211 | int i; | ||
212 | |||
213 | for (i = 0; i < vmx->nmsrs; ++i) | ||
214 | if (vmx->guest_msrs[i].index == msr) | ||
215 | return i; | ||
216 | return -1; | ||
217 | } | ||
218 | |||
219 | static struct kvm_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr) | ||
220 | { | ||
221 | int i; | ||
222 | |||
223 | i = __find_msr_index(vmx, msr); | ||
224 | if (i >= 0) | ||
225 | return &vmx->guest_msrs[i]; | ||
226 | return NULL; | ||
227 | } | ||
228 | |||
229 | static void vmcs_clear(struct vmcs *vmcs) | ||
230 | { | ||
231 | u64 phys_addr = __pa(vmcs); | ||
232 | u8 error; | ||
233 | |||
234 | asm volatile (ASM_VMX_VMCLEAR_RAX "; setna %0" | ||
235 | : "=g"(error) : "a"(&phys_addr), "m"(phys_addr) | ||
236 | : "cc", "memory"); | ||
237 | if (error) | ||
238 | printk(KERN_ERR "kvm: vmclear fail: %p/%llx\n", | ||
239 | vmcs, phys_addr); | ||
240 | } | ||
241 | |||
242 | static void __vcpu_clear(void *arg) | ||
243 | { | ||
244 | struct vcpu_vmx *vmx = arg; | ||
245 | int cpu = raw_smp_processor_id(); | ||
246 | |||
247 | if (vmx->vcpu.cpu == cpu) | ||
248 | vmcs_clear(vmx->vmcs); | ||
249 | if (per_cpu(current_vmcs, cpu) == vmx->vmcs) | ||
250 | per_cpu(current_vmcs, cpu) = NULL; | ||
251 | rdtscll(vmx->vcpu.arch.host_tsc); | ||
252 | } | ||
253 | |||
254 | static void vcpu_clear(struct vcpu_vmx *vmx) | ||
255 | { | ||
256 | if (vmx->vcpu.cpu == -1) | ||
257 | return; | ||
258 | smp_call_function_single(vmx->vcpu.cpu, __vcpu_clear, vmx, 0, 1); | ||
259 | vmx->launched = 0; | ||
260 | } | ||
261 | |||
262 | static unsigned long vmcs_readl(unsigned long field) | ||
263 | { | ||
264 | unsigned long value; | ||
265 | |||
266 | asm volatile (ASM_VMX_VMREAD_RDX_RAX | ||
267 | : "=a"(value) : "d"(field) : "cc"); | ||
268 | return value; | ||
269 | } | ||
270 | |||
271 | static u16 vmcs_read16(unsigned long field) | ||
272 | { | ||
273 | return vmcs_readl(field); | ||
274 | } | ||
275 | |||
276 | static u32 vmcs_read32(unsigned long field) | ||
277 | { | ||
278 | return vmcs_readl(field); | ||
279 | } | ||
280 | |||
281 | static u64 vmcs_read64(unsigned long field) | ||
282 | { | ||
283 | #ifdef CONFIG_X86_64 | ||
284 | return vmcs_readl(field); | ||
285 | #else | ||
286 | return vmcs_readl(field) | ((u64)vmcs_readl(field+1) << 32); | ||
287 | #endif | ||
288 | } | ||
289 | |||
290 | static noinline void vmwrite_error(unsigned long field, unsigned long value) | ||
291 | { | ||
292 | printk(KERN_ERR "vmwrite error: reg %lx value %lx (err %d)\n", | ||
293 | field, value, vmcs_read32(VM_INSTRUCTION_ERROR)); | ||
294 | dump_stack(); | ||
295 | } | ||
296 | |||
297 | static void vmcs_writel(unsigned long field, unsigned long value) | ||
298 | { | ||
299 | u8 error; | ||
300 | |||
301 | asm volatile (ASM_VMX_VMWRITE_RAX_RDX "; setna %0" | ||
302 | : "=q"(error) : "a"(value), "d"(field) : "cc"); | ||
303 | if (unlikely(error)) | ||
304 | vmwrite_error(field, value); | ||
305 | } | ||
306 | |||
307 | static void vmcs_write16(unsigned long field, u16 value) | ||
308 | { | ||
309 | vmcs_writel(field, value); | ||
310 | } | ||
311 | |||
312 | static void vmcs_write32(unsigned long field, u32 value) | ||
313 | { | ||
314 | vmcs_writel(field, value); | ||
315 | } | ||
316 | |||
317 | static void vmcs_write64(unsigned long field, u64 value) | ||
318 | { | ||
319 | #ifdef CONFIG_X86_64 | ||
320 | vmcs_writel(field, value); | ||
321 | #else | ||
322 | vmcs_writel(field, value); | ||
323 | asm volatile (""); | ||
324 | vmcs_writel(field+1, value >> 32); | ||
325 | #endif | ||
326 | } | ||
327 | |||
328 | static void vmcs_clear_bits(unsigned long field, u32 mask) | ||
329 | { | ||
330 | vmcs_writel(field, vmcs_readl(field) & ~mask); | ||
331 | } | ||
332 | |||
333 | static void vmcs_set_bits(unsigned long field, u32 mask) | ||
334 | { | ||
335 | vmcs_writel(field, vmcs_readl(field) | mask); | ||
336 | } | ||
337 | |||
338 | static void update_exception_bitmap(struct kvm_vcpu *vcpu) | ||
339 | { | ||
340 | u32 eb; | ||
341 | |||
342 | eb = (1u << PF_VECTOR) | (1u << UD_VECTOR); | ||
343 | if (!vcpu->fpu_active) | ||
344 | eb |= 1u << NM_VECTOR; | ||
345 | if (vcpu->guest_debug.enabled) | ||
346 | eb |= 1u << 1; | ||
347 | if (vcpu->arch.rmode.active) | ||
348 | eb = ~0; | ||
349 | vmcs_write32(EXCEPTION_BITMAP, eb); | ||
350 | } | ||
351 | |||
352 | static void reload_tss(void) | ||
353 | { | ||
354 | #ifndef CONFIG_X86_64 | ||
355 | |||
356 | /* | ||
357 | * VT restores TR but not its size. Useless. | ||
358 | */ | ||
359 | struct descriptor_table gdt; | ||
360 | struct segment_descriptor *descs; | ||
361 | |||
362 | get_gdt(&gdt); | ||
363 | descs = (void *)gdt.base; | ||
364 | descs[GDT_ENTRY_TSS].type = 9; /* available TSS */ | ||
365 | load_TR_desc(); | ||
366 | #endif | ||
367 | } | ||
368 | |||
369 | static void load_transition_efer(struct vcpu_vmx *vmx) | ||
370 | { | ||
371 | int efer_offset = vmx->msr_offset_efer; | ||
372 | u64 host_efer = vmx->host_msrs[efer_offset].data; | ||
373 | u64 guest_efer = vmx->guest_msrs[efer_offset].data; | ||
374 | u64 ignore_bits; | ||
375 | |||
376 | if (efer_offset < 0) | ||
377 | return; | ||
378 | /* | ||
379 | * NX is emulated; LMA and LME handled by hardware; SCE meaningless | ||
380 | * outside long mode | ||
381 | */ | ||
382 | ignore_bits = EFER_NX | EFER_SCE; | ||
383 | #ifdef CONFIG_X86_64 | ||
384 | ignore_bits |= EFER_LMA | EFER_LME; | ||
385 | /* SCE is meaningful only in long mode on Intel */ | ||
386 | if (guest_efer & EFER_LMA) | ||
387 | ignore_bits &= ~(u64)EFER_SCE; | ||
388 | #endif | ||
389 | if ((guest_efer & ~ignore_bits) == (host_efer & ~ignore_bits)) | ||
390 | return; | ||
391 | |||
392 | vmx->host_state.guest_efer_loaded = 1; | ||
393 | guest_efer &= ~ignore_bits; | ||
394 | guest_efer |= host_efer & ignore_bits; | ||
395 | wrmsrl(MSR_EFER, guest_efer); | ||
396 | vmx->vcpu.stat.efer_reload++; | ||
397 | } | ||
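The comparison at the heart of load_transition_efer() masks out the EFER bits whose guest/host difference does not matter for the switch, so the expensive wrmsr is skipped whenever the remaining bits agree. A userspace model of that check (the EFER bit positions used here are the architectural ones: SCE=0, LME=8, LMA=10, NX=11):

    #include <stdint.h>
    #include <stdio.h>

    #define EFER_SCE (1ULL << 0)
    #define EFER_LME (1ULL << 8)
    #define EFER_LMA (1ULL << 10)
    #define EFER_NX  (1ULL << 11)

    int main(void)
    {
            uint64_t host_efer   = EFER_SCE | EFER_LME | EFER_LMA | EFER_NX;
            uint64_t guest_efer  = EFER_LME | EFER_LMA | EFER_NX;
            uint64_t ignore_bits = EFER_NX | EFER_SCE;   /* legacy-mode guest */

            if ((guest_efer & ~ignore_bits) == (host_efer & ~ignore_bits))
                    printf("no EFER switch needed\n");
            else
                    printf("EFER must be reloaded for the guest\n");
            return 0;
    }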
398 | |||
399 | static void reload_host_efer(struct vcpu_vmx *vmx) | ||
400 | { | ||
401 | if (vmx->host_state.guest_efer_loaded) { | ||
402 | vmx->host_state.guest_efer_loaded = 0; | ||
403 | load_msrs(vmx->host_msrs + vmx->msr_offset_efer, 1); | ||
404 | } | ||
405 | } | ||
406 | |||
407 | static void vmx_save_host_state(struct kvm_vcpu *vcpu) | ||
408 | { | ||
409 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
410 | |||
411 | if (vmx->host_state.loaded) | ||
412 | return; | ||
413 | |||
414 | vmx->host_state.loaded = 1; | ||
415 | /* | ||
416 | * Set host fs and gs selectors. Unfortunately, 22.2.3 does not | ||
417 | * allow segment selectors with cpl > 0 or ti == 1. | ||
418 | */ | ||
419 | vmx->host_state.ldt_sel = read_ldt(); | ||
420 | vmx->host_state.gs_ldt_reload_needed = vmx->host_state.ldt_sel; | ||
421 | vmx->host_state.fs_sel = read_fs(); | ||
422 | if (!(vmx->host_state.fs_sel & 7)) { | ||
423 | vmcs_write16(HOST_FS_SELECTOR, vmx->host_state.fs_sel); | ||
424 | vmx->host_state.fs_reload_needed = 0; | ||
425 | } else { | ||
426 | vmcs_write16(HOST_FS_SELECTOR, 0); | ||
427 | vmx->host_state.fs_reload_needed = 1; | ||
428 | } | ||
429 | vmx->host_state.gs_sel = read_gs(); | ||
430 | if (!(vmx->host_state.gs_sel & 7)) | ||
431 | vmcs_write16(HOST_GS_SELECTOR, vmx->host_state.gs_sel); | ||
432 | else { | ||
433 | vmcs_write16(HOST_GS_SELECTOR, 0); | ||
434 | vmx->host_state.gs_ldt_reload_needed = 1; | ||
435 | } | ||
436 | |||
437 | #ifdef CONFIG_X86_64 | ||
438 | vmcs_writel(HOST_FS_BASE, read_msr(MSR_FS_BASE)); | ||
439 | vmcs_writel(HOST_GS_BASE, read_msr(MSR_GS_BASE)); | ||
440 | #else | ||
441 | vmcs_writel(HOST_FS_BASE, segment_base(vmx->host_state.fs_sel)); | ||
442 | vmcs_writel(HOST_GS_BASE, segment_base(vmx->host_state.gs_sel)); | ||
443 | #endif | ||
444 | |||
445 | #ifdef CONFIG_X86_64 | ||
446 | if (is_long_mode(&vmx->vcpu)) | ||
447 | save_msrs(vmx->host_msrs + | ||
448 | vmx->msr_offset_kernel_gs_base, 1); | ||
449 | |||
450 | #endif | ||
451 | load_msrs(vmx->guest_msrs, vmx->save_nmsrs); | ||
452 | load_transition_efer(vmx); | ||
453 | } | ||
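The "& 7" tests above follow from the selector layout: bits 1:0 are the RPL and bit 2 is the TI (LDT) flag, and the VMCS host-state selector fields only accept values with all three clear. A tiny sketch of the check:

    #include <stdio.h>

    int main(void)
    {
            unsigned short sel = 0x2b;   /* example: index 5, TI 0, RPL 3 */

            if (sel & 7)   /* RPL != 0 or TI set */
                    printf("%#x cannot be written to a host selector field as-is\n", sel);
            else
                    printf("%#x is acceptable host state\n", sel);
            return 0;
    }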
454 | |||
455 | static void vmx_load_host_state(struct vcpu_vmx *vmx) | ||
456 | { | ||
457 | unsigned long flags; | ||
458 | |||
459 | if (!vmx->host_state.loaded) | ||
460 | return; | ||
461 | |||
462 | ++vmx->vcpu.stat.host_state_reload; | ||
463 | vmx->host_state.loaded = 0; | ||
464 | if (vmx->host_state.fs_reload_needed) | ||
465 | load_fs(vmx->host_state.fs_sel); | ||
466 | if (vmx->host_state.gs_ldt_reload_needed) { | ||
467 | load_ldt(vmx->host_state.ldt_sel); | ||
468 | /* | ||
469 | * If we have to reload gs, we must take care to | ||
470 | * preserve our gs base. | ||
471 | */ | ||
472 | local_irq_save(flags); | ||
473 | load_gs(vmx->host_state.gs_sel); | ||
474 | #ifdef CONFIG_X86_64 | ||
475 | wrmsrl(MSR_GS_BASE, vmcs_readl(HOST_GS_BASE)); | ||
476 | #endif | ||
477 | local_irq_restore(flags); | ||
478 | } | ||
479 | reload_tss(); | ||
480 | save_msrs(vmx->guest_msrs, vmx->save_nmsrs); | ||
481 | load_msrs(vmx->host_msrs, vmx->save_nmsrs); | ||
482 | reload_host_efer(vmx); | ||
483 | } | ||
484 | |||
485 | /* | ||
486 | * Switches to specified vcpu, until a matching vcpu_put(), but assumes | ||
487 | * vcpu mutex is already taken. | ||
488 | */ | ||
489 | static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) | ||
490 | { | ||
491 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
492 | u64 phys_addr = __pa(vmx->vmcs); | ||
493 | u64 tsc_this, delta; | ||
494 | |||
495 | if (vcpu->cpu != cpu) { | ||
496 | vcpu_clear(vmx); | ||
497 | kvm_migrate_apic_timer(vcpu); | ||
498 | } | ||
499 | |||
500 | if (per_cpu(current_vmcs, cpu) != vmx->vmcs) { | ||
501 | u8 error; | ||
502 | |||
503 | per_cpu(current_vmcs, cpu) = vmx->vmcs; | ||
504 | asm volatile (ASM_VMX_VMPTRLD_RAX "; setna %0" | ||
505 | : "=g"(error) : "a"(&phys_addr), "m"(phys_addr) | ||
506 | : "cc"); | ||
507 | if (error) | ||
508 | printk(KERN_ERR "kvm: vmptrld %p/%llx fail\n", | ||
509 | vmx->vmcs, phys_addr); | ||
510 | } | ||
511 | |||
512 | if (vcpu->cpu != cpu) { | ||
513 | struct descriptor_table dt; | ||
514 | unsigned long sysenter_esp; | ||
515 | |||
516 | vcpu->cpu = cpu; | ||
517 | /* | ||
518 | * Linux uses per-cpu TSS and GDT, so set these when switching | ||
519 | * processors. | ||
520 | */ | ||
521 | vmcs_writel(HOST_TR_BASE, read_tr_base()); /* 22.2.4 */ | ||
522 | get_gdt(&dt); | ||
523 | vmcs_writel(HOST_GDTR_BASE, dt.base); /* 22.2.4 */ | ||
524 | |||
525 | rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp); | ||
526 | vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */ | ||
527 | |||
528 | /* | ||
529 | * Make sure the time stamp counter is monotonic. | ||
530 | */ | ||
531 | rdtscll(tsc_this); | ||
532 | delta = vcpu->arch.host_tsc - tsc_this; | ||
533 | vmcs_write64(TSC_OFFSET, vmcs_read64(TSC_OFFSET) + delta); | ||
534 | } | ||
535 | } | ||
536 | |||
537 | static void vmx_vcpu_put(struct kvm_vcpu *vcpu) | ||
538 | { | ||
539 | vmx_load_host_state(to_vmx(vcpu)); | ||
540 | } | ||
541 | |||
542 | static void vmx_fpu_activate(struct kvm_vcpu *vcpu) | ||
543 | { | ||
544 | if (vcpu->fpu_active) | ||
545 | return; | ||
546 | vcpu->fpu_active = 1; | ||
547 | vmcs_clear_bits(GUEST_CR0, X86_CR0_TS); | ||
548 | if (vcpu->arch.cr0 & X86_CR0_TS) | ||
549 | vmcs_set_bits(GUEST_CR0, X86_CR0_TS); | ||
550 | update_exception_bitmap(vcpu); | ||
551 | } | ||
552 | |||
553 | static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu) | ||
554 | { | ||
555 | if (!vcpu->fpu_active) | ||
556 | return; | ||
557 | vcpu->fpu_active = 0; | ||
558 | vmcs_set_bits(GUEST_CR0, X86_CR0_TS); | ||
559 | update_exception_bitmap(vcpu); | ||
560 | } | ||
561 | |||
562 | static void vmx_vcpu_decache(struct kvm_vcpu *vcpu) | ||
563 | { | ||
564 | vcpu_clear(to_vmx(vcpu)); | ||
565 | } | ||
566 | |||
567 | static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) | ||
568 | { | ||
569 | return vmcs_readl(GUEST_RFLAGS); | ||
570 | } | ||
571 | |||
572 | static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) | ||
573 | { | ||
574 | if (vcpu->arch.rmode.active) | ||
575 | rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; | ||
576 | vmcs_writel(GUEST_RFLAGS, rflags); | ||
577 | } | ||
578 | |||
579 | static void skip_emulated_instruction(struct kvm_vcpu *vcpu) | ||
580 | { | ||
581 | unsigned long rip; | ||
582 | u32 interruptibility; | ||
583 | |||
584 | rip = vmcs_readl(GUEST_RIP); | ||
585 | rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN); | ||
586 | vmcs_writel(GUEST_RIP, rip); | ||
587 | |||
588 | /* | ||
589 | * We emulated an instruction, so temporary interrupt blocking | ||
590 | * should be removed, if set. | ||
591 | */ | ||
592 | interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); | ||
593 | if (interruptibility & 3) | ||
594 | vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, | ||
595 | interruptibility & ~3); | ||
596 | vcpu->arch.interrupt_window_open = 1; | ||
597 | } | ||
598 | |||
599 | static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, | ||
600 | bool has_error_code, u32 error_code) | ||
601 | { | ||
602 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, | ||
603 | nr | INTR_TYPE_EXCEPTION | ||
604 | | (has_error_code ? INTR_INFO_DELIEVER_CODE_MASK : 0) | ||
605 | | INTR_INFO_VALID_MASK); | ||
606 | if (has_error_code) | ||
607 | vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code); | ||
608 | } | ||
609 | |||
610 | static bool vmx_exception_injected(struct kvm_vcpu *vcpu) | ||
611 | { | ||
612 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
613 | |||
614 | return !(vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK); | ||
615 | } | ||
616 | |||
617 | /* | ||
618 | * Swap MSR entry in host/guest MSR entry array. | ||
619 | */ | ||
620 | #ifdef CONFIG_X86_64 | ||
621 | static void move_msr_up(struct vcpu_vmx *vmx, int from, int to) | ||
622 | { | ||
623 | struct kvm_msr_entry tmp; | ||
624 | |||
625 | tmp = vmx->guest_msrs[to]; | ||
626 | vmx->guest_msrs[to] = vmx->guest_msrs[from]; | ||
627 | vmx->guest_msrs[from] = tmp; | ||
628 | tmp = vmx->host_msrs[to]; | ||
629 | vmx->host_msrs[to] = vmx->host_msrs[from]; | ||
630 | vmx->host_msrs[from] = tmp; | ||
631 | } | ||
632 | #endif | ||
633 | |||
634 | /* | ||
635 | * Set up the vmcs to automatically save and restore system | ||
636 | * msrs. Don't touch the 64-bit msrs if the guest is in legacy | ||
637 | * mode, as fiddling with msrs is very expensive. | ||
638 | */ | ||
639 | static void setup_msrs(struct vcpu_vmx *vmx) | ||
640 | { | ||
641 | int save_nmsrs; | ||
642 | |||
643 | save_nmsrs = 0; | ||
644 | #ifdef CONFIG_X86_64 | ||
645 | if (is_long_mode(&vmx->vcpu)) { | ||
646 | int index; | ||
647 | |||
648 | index = __find_msr_index(vmx, MSR_SYSCALL_MASK); | ||
649 | if (index >= 0) | ||
650 | move_msr_up(vmx, index, save_nmsrs++); | ||
651 | index = __find_msr_index(vmx, MSR_LSTAR); | ||
652 | if (index >= 0) | ||
653 | move_msr_up(vmx, index, save_nmsrs++); | ||
654 | index = __find_msr_index(vmx, MSR_CSTAR); | ||
655 | if (index >= 0) | ||
656 | move_msr_up(vmx, index, save_nmsrs++); | ||
657 | index = __find_msr_index(vmx, MSR_KERNEL_GS_BASE); | ||
658 | if (index >= 0) | ||
659 | move_msr_up(vmx, index, save_nmsrs++); | ||
660 | /* | ||
661 | * MSR_K6_STAR is only needed on long mode guests, and only | ||
662 | * if efer.sce is enabled. | ||
663 | */ | ||
664 | index = __find_msr_index(vmx, MSR_K6_STAR); | ||
665 | if ((index >= 0) && (vmx->vcpu.arch.shadow_efer & EFER_SCE)) | ||
666 | move_msr_up(vmx, index, save_nmsrs++); | ||
667 | } | ||
668 | #endif | ||
669 | vmx->save_nmsrs = save_nmsrs; | ||
670 | |||
671 | #ifdef CONFIG_X86_64 | ||
672 | vmx->msr_offset_kernel_gs_base = | ||
673 | __find_msr_index(vmx, MSR_KERNEL_GS_BASE); | ||
674 | #endif | ||
675 | vmx->msr_offset_efer = __find_msr_index(vmx, MSR_EFER); | ||
676 | } | ||
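setup_msrs() works by compacting the MSRs that actually need switching to the front of the array, so the save/load loops only ever walk the first save_nmsrs entries. A toy version of the swap-to-front idea (structure and index values here are made up):

    #include <stdio.h>

    struct msr_entry { unsigned index; unsigned long long data; };

    static void move_msr_up(struct msr_entry *msrs, int from, int to)
    {
            struct msr_entry tmp = msrs[to];
            msrs[to] = msrs[from];
            msrs[from] = tmp;
    }

    int main(void)
    {
            struct msr_entry msrs[] = { {0x10, 0}, {0x20, 0}, {0x30, 0} };
            int save_nmsrs = 0;

            move_msr_up(msrs, 2, save_nmsrs++);   /* keep 0x30 "hot" */
            printf("hot msrs: %d, first index %#x\n", save_nmsrs, msrs[0].index);
            return 0;
    }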
677 | |||
678 | /* | ||
679 | * reads and returns guest's timestamp counter "register" | ||
680 | * guest_tsc = host_tsc + tsc_offset -- 21.3 | ||
681 | */ | ||
682 | static u64 guest_read_tsc(void) | ||
683 | { | ||
684 | u64 host_tsc, tsc_offset; | ||
685 | |||
686 | rdtscll(host_tsc); | ||
687 | tsc_offset = vmcs_read64(TSC_OFFSET); | ||
688 | return host_tsc + tsc_offset; | ||
689 | } | ||
690 | |||
691 | /* | ||
692 | * writes 'guest_tsc' into guest's timestamp counter "register" | ||
693 | * guest_tsc = host_tsc + tsc_offset ==> tsc_offset = guest_tsc - host_tsc | ||
694 | */ | ||
695 | static void guest_write_tsc(u64 guest_tsc) | ||
696 | { | ||
697 | u64 host_tsc; | ||
698 | |||
699 | rdtscll(host_tsc); | ||
700 | vmcs_write64(TSC_OFFSET, guest_tsc - host_tsc); | ||
701 | } | ||
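The two comments above define the whole TSC virtualization scheme: a guest write records the difference as the offset, and later reads add the current host TSC back in, so the guest value keeps advancing with real time. A quick arithmetic check:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            uint64_t host_tsc = 1000, guest_tsc = 250;
            uint64_t tsc_offset = guest_tsc - host_tsc;   /* guest_write_tsc */

            host_tsc += 500;                              /* time passes */
            /* guest_read_tsc: prints 750 = 250 + 500 */
            printf("guest now reads %llu\n",
                   (unsigned long long)(host_tsc + tsc_offset));
            return 0;
    }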
702 | |||
703 | /* | ||
704 | * Reads an msr value (of 'msr_index') into 'pdata'. | ||
705 | * Returns 0 on success, non-0 otherwise. | ||
706 | * Assumes vcpu_load() was already called. | ||
707 | */ | ||
708 | static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) | ||
709 | { | ||
710 | u64 data; | ||
711 | struct kvm_msr_entry *msr; | ||
712 | |||
713 | if (!pdata) { | ||
714 | printk(KERN_ERR "BUG: get_msr called with NULL pdata\n"); | ||
715 | return -EINVAL; | ||
716 | } | ||
717 | |||
718 | switch (msr_index) { | ||
719 | #ifdef CONFIG_X86_64 | ||
720 | case MSR_FS_BASE: | ||
721 | data = vmcs_readl(GUEST_FS_BASE); | ||
722 | break; | ||
723 | case MSR_GS_BASE: | ||
724 | data = vmcs_readl(GUEST_GS_BASE); | ||
725 | break; | ||
726 | case MSR_EFER: | ||
727 | return kvm_get_msr_common(vcpu, msr_index, pdata); | ||
728 | #endif | ||
729 | case MSR_IA32_TIME_STAMP_COUNTER: | ||
730 | data = guest_read_tsc(); | ||
731 | break; | ||
732 | case MSR_IA32_SYSENTER_CS: | ||
733 | data = vmcs_read32(GUEST_SYSENTER_CS); | ||
734 | break; | ||
735 | case MSR_IA32_SYSENTER_EIP: | ||
736 | data = vmcs_readl(GUEST_SYSENTER_EIP); | ||
737 | break; | ||
738 | case MSR_IA32_SYSENTER_ESP: | ||
739 | data = vmcs_readl(GUEST_SYSENTER_ESP); | ||
740 | break; | ||
741 | default: | ||
742 | msr = find_msr_entry(to_vmx(vcpu), msr_index); | ||
743 | if (msr) { | ||
744 | data = msr->data; | ||
745 | break; | ||
746 | } | ||
747 | return kvm_get_msr_common(vcpu, msr_index, pdata); | ||
748 | } | ||
749 | |||
750 | *pdata = data; | ||
751 | return 0; | ||
752 | } | ||
753 | |||
754 | /* | ||
755 | * Writes msr value into the appropriate "register". | ||
756 | * Returns 0 on success, non-0 otherwise. | ||
757 | * Assumes vcpu_load() was already called. | ||
758 | */ | ||
759 | static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) | ||
760 | { | ||
761 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
762 | struct kvm_msr_entry *msr; | ||
763 | int ret = 0; | ||
764 | |||
765 | switch (msr_index) { | ||
766 | #ifdef CONFIG_X86_64 | ||
767 | case MSR_EFER: | ||
768 | ret = kvm_set_msr_common(vcpu, msr_index, data); | ||
769 | if (vmx->host_state.loaded) { | ||
770 | reload_host_efer(vmx); | ||
771 | load_transition_efer(vmx); | ||
772 | } | ||
773 | break; | ||
774 | case MSR_FS_BASE: | ||
775 | vmcs_writel(GUEST_FS_BASE, data); | ||
776 | break; | ||
777 | case MSR_GS_BASE: | ||
778 | vmcs_writel(GUEST_GS_BASE, data); | ||
779 | break; | ||
780 | #endif | ||
781 | case MSR_IA32_SYSENTER_CS: | ||
782 | vmcs_write32(GUEST_SYSENTER_CS, data); | ||
783 | break; | ||
784 | case MSR_IA32_SYSENTER_EIP: | ||
785 | vmcs_writel(GUEST_SYSENTER_EIP, data); | ||
786 | break; | ||
787 | case MSR_IA32_SYSENTER_ESP: | ||
788 | vmcs_writel(GUEST_SYSENTER_ESP, data); | ||
789 | break; | ||
790 | case MSR_IA32_TIME_STAMP_COUNTER: | ||
791 | guest_write_tsc(data); | ||
792 | break; | ||
793 | default: | ||
794 | msr = find_msr_entry(vmx, msr_index); | ||
795 | if (msr) { | ||
796 | msr->data = data; | ||
797 | if (vmx->host_state.loaded) | ||
798 | load_msrs(vmx->guest_msrs, vmx->save_nmsrs); | ||
799 | break; | ||
800 | } | ||
801 | ret = kvm_set_msr_common(vcpu, msr_index, data); | ||
802 | } | ||
803 | |||
804 | return ret; | ||
805 | } | ||
806 | |||
807 | /* | ||
808 | * Sync the rsp and rip registers into the vcpu structure. This allows | ||
809 | * registers to be accessed by indexing vcpu->arch.regs. | ||
810 | */ | ||
811 | static void vcpu_load_rsp_rip(struct kvm_vcpu *vcpu) | ||
812 | { | ||
813 | vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP); | ||
814 | vcpu->arch.rip = vmcs_readl(GUEST_RIP); | ||
815 | } | ||
816 | |||
817 | /* | ||
818 | * Syncs rsp and rip back into the vmcs. Should be called after possible | ||
819 | * modification. | ||
820 | */ | ||
821 | static void vcpu_put_rsp_rip(struct kvm_vcpu *vcpu) | ||
822 | { | ||
823 | vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]); | ||
824 | vmcs_writel(GUEST_RIP, vcpu->arch.rip); | ||
825 | } | ||
826 | |||
827 | static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg) | ||
828 | { | ||
829 | unsigned long dr7 = 0x400; | ||
830 | int old_singlestep; | ||
831 | |||
832 | old_singlestep = vcpu->guest_debug.singlestep; | ||
833 | |||
834 | vcpu->guest_debug.enabled = dbg->enabled; | ||
835 | if (vcpu->guest_debug.enabled) { | ||
836 | int i; | ||
837 | |||
838 | dr7 |= 0x200; /* exact */ | ||
839 | for (i = 0; i < 4; ++i) { | ||
840 | if (!dbg->breakpoints[i].enabled) | ||
841 | continue; | ||
842 | vcpu->guest_debug.bp[i] = dbg->breakpoints[i].address; | ||
843 | dr7 |= 2 << (i*2); /* global enable */ | ||
844 | dr7 |= 0 << (i*4+16); /* execution breakpoint */ | ||
845 | } | ||
846 | |||
847 | vcpu->guest_debug.singlestep = dbg->singlestep; | ||
848 | } else | ||
849 | vcpu->guest_debug.singlestep = 0; | ||
850 | |||
851 | if (old_singlestep && !vcpu->guest_debug.singlestep) { | ||
852 | unsigned long flags; | ||
853 | |||
854 | flags = vmcs_readl(GUEST_RFLAGS); | ||
855 | flags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF); | ||
856 | vmcs_writel(GUEST_RFLAGS, flags); | ||
857 | } | ||
858 | |||
859 | update_exception_bitmap(vcpu); | ||
860 | vmcs_writel(GUEST_DR7, dr7); | ||
861 | |||
862 | return 0; | ||
863 | } | ||
864 | |||
865 | static int vmx_get_irq(struct kvm_vcpu *vcpu) | ||
866 | { | ||
867 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
868 | u32 idtv_info_field; | ||
869 | |||
870 | idtv_info_field = vmx->idt_vectoring_info; | ||
871 | if (idtv_info_field & INTR_INFO_VALID_MASK) { | ||
872 | if (is_external_interrupt(idtv_info_field)) | ||
873 | return idtv_info_field & VECTORING_INFO_VECTOR_MASK; | ||
874 | else | ||
875 | printk(KERN_DEBUG "pending exception: not handled yet\n"); | ||
876 | } | ||
877 | return -1; | ||
878 | } | ||
879 | |||
880 | static __init int cpu_has_kvm_support(void) | ||
881 | { | ||
882 | unsigned long ecx = cpuid_ecx(1); | ||
883 | return test_bit(5, &ecx); /* CPUID.1:ECX.VMX[bit 5] -> VT */ | ||
884 | } | ||
885 | |||
886 | static __init int vmx_disabled_by_bios(void) | ||
887 | { | ||
888 | u64 msr; | ||
889 | |||
890 | rdmsrl(MSR_IA32_FEATURE_CONTROL, msr); | ||
891 | return (msr & (MSR_IA32_FEATURE_CONTROL_LOCKED | | ||
892 | MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED)) | ||
893 | == MSR_IA32_FEATURE_CONTROL_LOCKED; | ||
894 | /* locked but not enabled */ | ||
895 | } | ||
896 | |||
897 | static void hardware_enable(void *garbage) | ||
898 | { | ||
899 | int cpu = raw_smp_processor_id(); | ||
900 | u64 phys_addr = __pa(per_cpu(vmxarea, cpu)); | ||
901 | u64 old; | ||
902 | |||
903 | rdmsrl(MSR_IA32_FEATURE_CONTROL, old); | ||
904 | if ((old & (MSR_IA32_FEATURE_CONTROL_LOCKED | | ||
905 | MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED)) | ||
906 | != (MSR_IA32_FEATURE_CONTROL_LOCKED | | ||
907 | MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED)) | ||
908 | /* enable and lock */ | ||
909 | wrmsrl(MSR_IA32_FEATURE_CONTROL, old | | ||
910 | MSR_IA32_FEATURE_CONTROL_LOCKED | | ||
911 | MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED); | ||
912 | write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */ | ||
913 | asm volatile (ASM_VMX_VMXON_RAX : : "a"(&phys_addr), "m"(phys_addr) | ||
914 | : "memory", "cc"); | ||
915 | } | ||
916 | |||
917 | static void hardware_disable(void *garbage) | ||
918 | { | ||
919 | asm volatile (ASM_VMX_VMXOFF : : : "cc"); | ||
920 | } | ||
921 | |||
922 | static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, | ||
923 | u32 msr, u32 *result) | ||
924 | { | ||
925 | u32 vmx_msr_low, vmx_msr_high; | ||
926 | u32 ctl = ctl_min | ctl_opt; | ||
927 | |||
928 | rdmsr(msr, vmx_msr_low, vmx_msr_high); | ||
929 | |||
930 | ctl &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */ | ||
931 | ctl |= vmx_msr_low; /* bit == 1 in low word ==> must be one */ | ||
932 | |||
933 | /* Ensure minimum (required) set of control bits are supported. */ | ||
934 | if (ctl_min & ~ctl) | ||
935 | return -EIO; | ||
936 | |||
937 | *result = ctl; | ||
938 | return 0; | ||
939 | } | ||
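adjust_vmx_controls() implements the capability-MSR protocol spelled out in the comments: the high word says which control bits may be 1, the low word which bits must be 1, and any required bit that cannot be satisfied fails the setup. A standalone model with made-up MSR values:

    #include <stdint.h>
    #include <stdio.h>

    static int adjust(uint32_t min, uint32_t opt,
                      uint32_t msr_low, uint32_t msr_high, uint32_t *result)
    {
            uint32_t ctl = min | opt;

            ctl &= msr_high;   /* bit == 0 in high word ==> must be zero */
            ctl |= msr_low;    /* bit == 1 in low word  ==> must be one  */

            if (min & ~ctl)    /* a required bit is unsupported */
                    return -1;
            *result = ctl;
            return 0;
    }

    int main(void)
    {
            uint32_t ctl;

            if (adjust(0x0016, 0x0080, 0x0016, 0x009f, &ctl) == 0)
                    printf("controls = %#x\n", ctl);
            return 0;
    }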
940 | |||
941 | static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) | ||
942 | { | ||
943 | u32 vmx_msr_low, vmx_msr_high; | ||
944 | u32 min, opt; | ||
945 | u32 _pin_based_exec_control = 0; | ||
946 | u32 _cpu_based_exec_control = 0; | ||
947 | u32 _cpu_based_2nd_exec_control = 0; | ||
948 | u32 _vmexit_control = 0; | ||
949 | u32 _vmentry_control = 0; | ||
950 | |||
951 | min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING; | ||
952 | opt = 0; | ||
953 | if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS, | ||
954 | &_pin_based_exec_control) < 0) | ||
955 | return -EIO; | ||
956 | |||
957 | min = CPU_BASED_HLT_EXITING | | ||
958 | #ifdef CONFIG_X86_64 | ||
959 | CPU_BASED_CR8_LOAD_EXITING | | ||
960 | CPU_BASED_CR8_STORE_EXITING | | ||
961 | #endif | ||
962 | CPU_BASED_USE_IO_BITMAPS | | ||
963 | CPU_BASED_MOV_DR_EXITING | | ||
964 | CPU_BASED_USE_TSC_OFFSETING; | ||
965 | opt = CPU_BASED_TPR_SHADOW | | ||
966 | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; | ||
967 | if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS, | ||
968 | &_cpu_based_exec_control) < 0) | ||
969 | return -EIO; | ||
970 | #ifdef CONFIG_X86_64 | ||
971 | if ((_cpu_based_exec_control & CPU_BASED_TPR_SHADOW)) | ||
972 | _cpu_based_exec_control &= ~CPU_BASED_CR8_LOAD_EXITING & | ||
973 | ~CPU_BASED_CR8_STORE_EXITING; | ||
974 | #endif | ||
975 | if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) { | ||
976 | min = 0; | ||
977 | opt = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | | ||
978 | SECONDARY_EXEC_WBINVD_EXITING; | ||
979 | if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS2, | ||
980 | &_cpu_based_2nd_exec_control) < 0) | ||
981 | return -EIO; | ||
982 | } | ||
983 | #ifndef CONFIG_X86_64 | ||
984 | if (!(_cpu_based_2nd_exec_control & | ||
985 | SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) | ||
986 | _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW; | ||
987 | #endif | ||
988 | |||
989 | min = 0; | ||
990 | #ifdef CONFIG_X86_64 | ||
991 | min |= VM_EXIT_HOST_ADDR_SPACE_SIZE; | ||
992 | #endif | ||
993 | opt = 0; | ||
994 | if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS, | ||
995 | &_vmexit_control) < 0) | ||
996 | return -EIO; | ||
997 | |||
998 | min = opt = 0; | ||
999 | if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS, | ||
1000 | &_vmentry_control) < 0) | ||
1001 | return -EIO; | ||
1002 | |||
1003 | rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high); | ||
1004 | |||
1005 | /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */ | ||
1006 | if ((vmx_msr_high & 0x1fff) > PAGE_SIZE) | ||
1007 | return -EIO; | ||
1008 | |||
1009 | #ifdef CONFIG_X86_64 | ||
1010 | /* IA-32 SDM Vol 3B: 64-bit CPUs always have VMX_BASIC_MSR[48]==0. */ | ||
1011 | if (vmx_msr_high & (1u<<16)) | ||
1012 | return -EIO; | ||
1013 | #endif | ||
1014 | |||
1015 | /* Require Write-Back (WB) memory type for VMCS accesses. */ | ||
1016 | if (((vmx_msr_high >> 18) & 15) != 6) | ||
1017 | return -EIO; | ||
1018 | |||
1019 | vmcs_conf->size = vmx_msr_high & 0x1fff; | ||
1020 | vmcs_conf->order = get_order(vmcs_config.size); | ||
1021 | vmcs_conf->revision_id = vmx_msr_low; | ||
1022 | |||
1023 | vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control; | ||
1024 | vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control; | ||
1025 | vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control; | ||
1026 | vmcs_conf->vmexit_ctrl = _vmexit_control; | ||
1027 | vmcs_conf->vmentry_ctrl = _vmentry_control; | ||
1028 | |||
1029 | return 0; | ||
1030 | } | ||
1031 | |||
1032 | static struct vmcs *alloc_vmcs_cpu(int cpu) | ||
1033 | { | ||
1034 | int node = cpu_to_node(cpu); | ||
1035 | struct page *pages; | ||
1036 | struct vmcs *vmcs; | ||
1037 | |||
1038 | pages = alloc_pages_node(node, GFP_KERNEL, vmcs_config.order); | ||
1039 | if (!pages) | ||
1040 | return NULL; | ||
1041 | vmcs = page_address(pages); | ||
1042 | memset(vmcs, 0, vmcs_config.size); | ||
1043 | vmcs->revision_id = vmcs_config.revision_id; /* vmcs revision id */ | ||
1044 | return vmcs; | ||
1045 | } | ||
1046 | |||
1047 | static struct vmcs *alloc_vmcs(void) | ||
1048 | { | ||
1049 | return alloc_vmcs_cpu(raw_smp_processor_id()); | ||
1050 | } | ||
1051 | |||
1052 | static void free_vmcs(struct vmcs *vmcs) | ||
1053 | { | ||
1054 | free_pages((unsigned long)vmcs, vmcs_config.order); | ||
1055 | } | ||
1056 | |||
1057 | static void free_kvm_area(void) | ||
1058 | { | ||
1059 | int cpu; | ||
1060 | |||
1061 | for_each_online_cpu(cpu) | ||
1062 | free_vmcs(per_cpu(vmxarea, cpu)); | ||
1063 | } | ||
1064 | |||
1065 | static __init int alloc_kvm_area(void) | ||
1066 | { | ||
1067 | int cpu; | ||
1068 | |||
1069 | for_each_online_cpu(cpu) { | ||
1070 | struct vmcs *vmcs; | ||
1071 | |||
1072 | vmcs = alloc_vmcs_cpu(cpu); | ||
1073 | if (!vmcs) { | ||
1074 | free_kvm_area(); | ||
1075 | return -ENOMEM; | ||
1076 | } | ||
1077 | |||
1078 | per_cpu(vmxarea, cpu) = vmcs; | ||
1079 | } | ||
1080 | return 0; | ||
1081 | } | ||
1082 | |||
1083 | static __init int hardware_setup(void) | ||
1084 | { | ||
1085 | if (setup_vmcs_config(&vmcs_config) < 0) | ||
1086 | return -EIO; | ||
1087 | return alloc_kvm_area(); | ||
1088 | } | ||
1089 | |||
1090 | static __exit void hardware_unsetup(void) | ||
1091 | { | ||
1092 | free_kvm_area(); | ||
1093 | } | ||
1094 | |||
1095 | static void fix_pmode_dataseg(int seg, struct kvm_save_segment *save) | ||
1096 | { | ||
1097 | struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; | ||
1098 | |||
1099 | if (vmcs_readl(sf->base) == save->base && (save->base & AR_S_MASK)) { | ||
1100 | vmcs_write16(sf->selector, save->selector); | ||
1101 | vmcs_writel(sf->base, save->base); | ||
1102 | vmcs_write32(sf->limit, save->limit); | ||
1103 | vmcs_write32(sf->ar_bytes, save->ar); | ||
1104 | } else { | ||
1105 | u32 dpl = (vmcs_read16(sf->selector) & SELECTOR_RPL_MASK) | ||
1106 | << AR_DPL_SHIFT; | ||
1107 | vmcs_write32(sf->ar_bytes, 0x93 | dpl); | ||
1108 | } | ||
1109 | } | ||
1110 | |||
1111 | static void enter_pmode(struct kvm_vcpu *vcpu) | ||
1112 | { | ||
1113 | unsigned long flags; | ||
1114 | |||
1115 | vcpu->arch.rmode.active = 0; | ||
1116 | |||
1117 | vmcs_writel(GUEST_TR_BASE, vcpu->arch.rmode.tr.base); | ||
1118 | vmcs_write32(GUEST_TR_LIMIT, vcpu->arch.rmode.tr.limit); | ||
1119 | vmcs_write32(GUEST_TR_AR_BYTES, vcpu->arch.rmode.tr.ar); | ||
1120 | |||
1121 | flags = vmcs_readl(GUEST_RFLAGS); | ||
1122 | flags &= ~(X86_EFLAGS_IOPL | X86_EFLAGS_VM); | ||
1123 | flags |= (vcpu->arch.rmode.save_iopl << IOPL_SHIFT); | ||
1124 | vmcs_writel(GUEST_RFLAGS, flags); | ||
1125 | |||
1126 | vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) | | ||
1127 | (vmcs_readl(CR4_READ_SHADOW) & X86_CR4_VME)); | ||
1128 | |||
1129 | update_exception_bitmap(vcpu); | ||
1130 | |||
1131 | fix_pmode_dataseg(VCPU_SREG_ES, &vcpu->arch.rmode.es); | ||
1132 | fix_pmode_dataseg(VCPU_SREG_DS, &vcpu->arch.rmode.ds); | ||
1133 | fix_pmode_dataseg(VCPU_SREG_GS, &vcpu->arch.rmode.gs); | ||
1134 | fix_pmode_dataseg(VCPU_SREG_FS, &vcpu->arch.rmode.fs); | ||
1135 | |||
1136 | vmcs_write16(GUEST_SS_SELECTOR, 0); | ||
1137 | vmcs_write32(GUEST_SS_AR_BYTES, 0x93); | ||
1138 | |||
1139 | vmcs_write16(GUEST_CS_SELECTOR, | ||
1140 | vmcs_read16(GUEST_CS_SELECTOR) & ~SELECTOR_RPL_MASK); | ||
1141 | vmcs_write32(GUEST_CS_AR_BYTES, 0x9b); | ||
1142 | } | ||
1143 | |||
1144 | static gva_t rmode_tss_base(struct kvm *kvm) | ||
1145 | { | ||
1146 | if (!kvm->arch.tss_addr) { | ||
1147 | gfn_t base_gfn = kvm->memslots[0].base_gfn + | ||
1148 | kvm->memslots[0].npages - 3; | ||
1149 | return base_gfn << PAGE_SHIFT; | ||
1150 | } | ||
1151 | return kvm->arch.tss_addr; | ||
1152 | } | ||
1153 | |||
1154 | static void fix_rmode_seg(int seg, struct kvm_save_segment *save) | ||
1155 | { | ||
1156 | struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; | ||
1157 | |||
1158 | save->selector = vmcs_read16(sf->selector); | ||
1159 | save->base = vmcs_readl(sf->base); | ||
1160 | save->limit = vmcs_read32(sf->limit); | ||
1161 | save->ar = vmcs_read32(sf->ar_bytes); | ||
1162 | vmcs_write16(sf->selector, save->base >> 4); | ||
1163 | vmcs_write32(sf->base, save->base & 0xfffff); | ||
1164 | vmcs_write32(sf->limit, 0xffff); | ||
1165 | vmcs_write32(sf->ar_bytes, 0xf3); | ||
1166 | } | ||
1167 | |||
1168 | static void enter_rmode(struct kvm_vcpu *vcpu) | ||
1169 | { | ||
1170 | unsigned long flags; | ||
1171 | |||
1172 | vcpu->arch.rmode.active = 1; | ||
1173 | |||
1174 | vcpu->arch.rmode.tr.base = vmcs_readl(GUEST_TR_BASE); | ||
1175 | vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm)); | ||
1176 | |||
1177 | vcpu->arch.rmode.tr.limit = vmcs_read32(GUEST_TR_LIMIT); | ||
1178 | vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1); | ||
1179 | |||
1180 | vcpu->arch.rmode.tr.ar = vmcs_read32(GUEST_TR_AR_BYTES); | ||
1181 | vmcs_write32(GUEST_TR_AR_BYTES, 0x008b); | ||
1182 | |||
1183 | flags = vmcs_readl(GUEST_RFLAGS); | ||
1184 | vcpu->arch.rmode.save_iopl | ||
1185 | = (flags & X86_EFLAGS_IOPL) >> IOPL_SHIFT; | ||
1186 | |||
1187 | flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; | ||
1188 | |||
1189 | vmcs_writel(GUEST_RFLAGS, flags); | ||
1190 | vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME); | ||
1191 | update_exception_bitmap(vcpu); | ||
1192 | |||
1193 | vmcs_write16(GUEST_SS_SELECTOR, vmcs_readl(GUEST_SS_BASE) >> 4); | ||
1194 | vmcs_write32(GUEST_SS_LIMIT, 0xffff); | ||
1195 | vmcs_write32(GUEST_SS_AR_BYTES, 0xf3); | ||
1196 | |||
1197 | vmcs_write32(GUEST_CS_AR_BYTES, 0xf3); | ||
1198 | vmcs_write32(GUEST_CS_LIMIT, 0xffff); | ||
1199 | if (vmcs_readl(GUEST_CS_BASE) == 0xffff0000) | ||
1200 | vmcs_writel(GUEST_CS_BASE, 0xf0000); | ||
1201 | vmcs_write16(GUEST_CS_SELECTOR, vmcs_readl(GUEST_CS_BASE) >> 4); | ||
1202 | |||
1203 | fix_rmode_seg(VCPU_SREG_ES, &vcpu->arch.rmode.es); | ||
1204 | fix_rmode_seg(VCPU_SREG_DS, &vcpu->arch.rmode.ds); | ||
1205 | fix_rmode_seg(VCPU_SREG_GS, &vcpu->arch.rmode.gs); | ||
1206 | fix_rmode_seg(VCPU_SREG_FS, &vcpu->arch.rmode.fs); | ||
1207 | |||
1208 | kvm_mmu_reset_context(vcpu); | ||
1209 | init_rmode_tss(vcpu->kvm); | ||
1210 | } | ||
1211 | |||
1212 | #ifdef CONFIG_X86_64 | ||
1213 | |||
1214 | static void enter_lmode(struct kvm_vcpu *vcpu) | ||
1215 | { | ||
1216 | u32 guest_tr_ar; | ||
1217 | |||
1218 | guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES); | ||
1219 | if ((guest_tr_ar & AR_TYPE_MASK) != AR_TYPE_BUSY_64_TSS) { | ||
1220 | printk(KERN_DEBUG "%s: tss fixup for long mode.\n", | ||
1221 | __FUNCTION__); | ||
1222 | vmcs_write32(GUEST_TR_AR_BYTES, | ||
1223 | (guest_tr_ar & ~AR_TYPE_MASK) | ||
1224 | | AR_TYPE_BUSY_64_TSS); | ||
1225 | } | ||
1226 | |||
1227 | vcpu->arch.shadow_efer |= EFER_LMA; | ||
1228 | |||
1229 | find_msr_entry(to_vmx(vcpu), MSR_EFER)->data |= EFER_LMA | EFER_LME; | ||
1230 | vmcs_write32(VM_ENTRY_CONTROLS, | ||
1231 | vmcs_read32(VM_ENTRY_CONTROLS) | ||
1232 | | VM_ENTRY_IA32E_MODE); | ||
1233 | } | ||
1234 | |||
1235 | static void exit_lmode(struct kvm_vcpu *vcpu) | ||
1236 | { | ||
1237 | vcpu->arch.shadow_efer &= ~EFER_LMA; | ||
1238 | |||
1239 | vmcs_write32(VM_ENTRY_CONTROLS, | ||
1240 | vmcs_read32(VM_ENTRY_CONTROLS) | ||
1241 | & ~VM_ENTRY_IA32E_MODE); | ||
1242 | } | ||
1243 | |||
1244 | #endif | ||
1245 | |||
1246 | static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) | ||
1247 | { | ||
1248 | vcpu->arch.cr4 &= KVM_GUEST_CR4_MASK; | ||
1249 | vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & ~KVM_GUEST_CR4_MASK; | ||
1250 | } | ||
1251 | |||
1252 | static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) | ||
1253 | { | ||
1254 | vmx_fpu_deactivate(vcpu); | ||
1255 | |||
1256 | if (vcpu->arch.rmode.active && (cr0 & X86_CR0_PE)) | ||
1257 | enter_pmode(vcpu); | ||
1258 | |||
1259 | if (!vcpu->arch.rmode.active && !(cr0 & X86_CR0_PE)) | ||
1260 | enter_rmode(vcpu); | ||
1261 | |||
1262 | #ifdef CONFIG_X86_64 | ||
1263 | if (vcpu->arch.shadow_efer & EFER_LME) { | ||
1264 | if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) | ||
1265 | enter_lmode(vcpu); | ||
1266 | if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) | ||
1267 | exit_lmode(vcpu); | ||
1268 | } | ||
1269 | #endif | ||
1270 | |||
1271 | vmcs_writel(CR0_READ_SHADOW, cr0); | ||
1272 | vmcs_writel(GUEST_CR0, | ||
1273 | (cr0 & ~KVM_GUEST_CR0_MASK) | KVM_VM_CR0_ALWAYS_ON); | ||
1274 | vcpu->arch.cr0 = cr0; | ||
1275 | |||
1276 | if (!(cr0 & X86_CR0_TS) || !(cr0 & X86_CR0_PE)) | ||
1277 | vmx_fpu_activate(vcpu); | ||
1278 | } | ||
1279 | |||
1280 | static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) | ||
1281 | { | ||
1282 | vmcs_writel(GUEST_CR3, cr3); | ||
1283 | if (vcpu->arch.cr0 & X86_CR0_PE) | ||
1284 | vmx_fpu_deactivate(vcpu); | ||
1285 | } | ||
1286 | |||
1287 | static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) | ||
1288 | { | ||
1289 | vmcs_writel(CR4_READ_SHADOW, cr4); | ||
1290 | vmcs_writel(GUEST_CR4, cr4 | (vcpu->arch.rmode.active ? | ||
1291 | KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON)); | ||
1292 | vcpu->arch.cr4 = cr4; | ||
1293 | } | ||
1294 | |||
1295 | #ifdef CONFIG_X86_64 | ||
1296 | |||
1297 | static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) | ||
1298 | { | ||
1299 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
1300 | struct kvm_msr_entry *msr = find_msr_entry(vmx, MSR_EFER); | ||
1301 | |||
1302 | vcpu->arch.shadow_efer = efer; | ||
1303 | if (efer & EFER_LMA) { | ||
1304 | vmcs_write32(VM_ENTRY_CONTROLS, | ||
1305 | vmcs_read32(VM_ENTRY_CONTROLS) | | ||
1306 | VM_ENTRY_IA32E_MODE); | ||
1307 | msr->data = efer; | ||
1308 | |||
1309 | } else { | ||
1310 | vmcs_write32(VM_ENTRY_CONTROLS, | ||
1311 | vmcs_read32(VM_ENTRY_CONTROLS) & | ||
1312 | ~VM_ENTRY_IA32E_MODE); | ||
1313 | |||
1314 | msr->data = efer & ~EFER_LME; | ||
1315 | } | ||
1316 | setup_msrs(vmx); | ||
1317 | } | ||
1318 | |||
1319 | #endif | ||
1320 | |||
1321 | static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg) | ||
1322 | { | ||
1323 | struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; | ||
1324 | |||
1325 | return vmcs_readl(sf->base); | ||
1326 | } | ||
1327 | |||
1328 | static void vmx_get_segment(struct kvm_vcpu *vcpu, | ||
1329 | struct kvm_segment *var, int seg) | ||
1330 | { | ||
1331 | struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; | ||
1332 | u32 ar; | ||
1333 | |||
1334 | var->base = vmcs_readl(sf->base); | ||
1335 | var->limit = vmcs_read32(sf->limit); | ||
1336 | var->selector = vmcs_read16(sf->selector); | ||
1337 | ar = vmcs_read32(sf->ar_bytes); | ||
1338 | if (ar & AR_UNUSABLE_MASK) | ||
1339 | ar = 0; | ||
1340 | var->type = ar & 15; | ||
1341 | var->s = (ar >> 4) & 1; | ||
1342 | var->dpl = (ar >> 5) & 3; | ||
1343 | var->present = (ar >> 7) & 1; | ||
1344 | var->avl = (ar >> 12) & 1; | ||
1345 | var->l = (ar >> 13) & 1; | ||
1346 | var->db = (ar >> 14) & 1; | ||
1347 | var->g = (ar >> 15) & 1; | ||
1348 | var->unusable = (ar >> 16) & 1; | ||
1349 | } | ||
1350 | |||
1351 | static u32 vmx_segment_access_rights(struct kvm_segment *var) | ||
1352 | { | ||
1353 | u32 ar; | ||
1354 | |||
1355 | if (var->unusable) | ||
1356 | ar = 1 << 16; | ||
1357 | else { | ||
1358 | ar = var->type & 15; | ||
1359 | ar |= (var->s & 1) << 4; | ||
1360 | ar |= (var->dpl & 3) << 5; | ||
1361 | ar |= (var->present & 1) << 7; | ||
1362 | ar |= (var->avl & 1) << 12; | ||
1363 | ar |= (var->l & 1) << 13; | ||
1364 | ar |= (var->db & 1) << 14; | ||
1365 | ar |= (var->g & 1) << 15; | ||
1366 | } | ||
1367 | if (ar == 0) /* a 0 value means unusable */ | ||
1368 | ar = AR_UNUSABLE_MASK; | ||
1369 | |||
1370 | return ar; | ||
1371 | } | ||
1372 | |||
1373 | static void vmx_set_segment(struct kvm_vcpu *vcpu, | ||
1374 | struct kvm_segment *var, int seg) | ||
1375 | { | ||
1376 | struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; | ||
1377 | u32 ar; | ||
1378 | |||
1379 | if (vcpu->arch.rmode.active && seg == VCPU_SREG_TR) { | ||
1380 | vcpu->arch.rmode.tr.selector = var->selector; | ||
1381 | vcpu->arch.rmode.tr.base = var->base; | ||
1382 | vcpu->arch.rmode.tr.limit = var->limit; | ||
1383 | vcpu->arch.rmode.tr.ar = vmx_segment_access_rights(var); | ||
1384 | return; | ||
1385 | } | ||
1386 | vmcs_writel(sf->base, var->base); | ||
1387 | vmcs_write32(sf->limit, var->limit); | ||
1388 | vmcs_write16(sf->selector, var->selector); | ||
1389 | if (vcpu->arch.rmode.active && var->s) { | ||
1390 | /* | ||
1391 | * Hack real-mode segments into vm86 compatibility. | ||
1392 | */ | ||
1393 | if (var->base == 0xffff0000 && var->selector == 0xf000) | ||
1394 | vmcs_writel(sf->base, 0xf0000); | ||
1395 | ar = 0xf3; | ||
1396 | } else | ||
1397 | ar = vmx_segment_access_rights(var); | ||
1398 | vmcs_write32(sf->ar_bytes, ar); | ||
1399 | } | ||
1400 | |||
1401 | static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) | ||
1402 | { | ||
1403 | u32 ar = vmcs_read32(GUEST_CS_AR_BYTES); | ||
1404 | |||
1405 | *db = (ar >> 14) & 1; | ||
1406 | *l = (ar >> 13) & 1; | ||
1407 | } | ||
1408 | |||
1409 | static void vmx_get_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) | ||
1410 | { | ||
1411 | dt->limit = vmcs_read32(GUEST_IDTR_LIMIT); | ||
1412 | dt->base = vmcs_readl(GUEST_IDTR_BASE); | ||
1413 | } | ||
1414 | |||
1415 | static void vmx_set_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) | ||
1416 | { | ||
1417 | vmcs_write32(GUEST_IDTR_LIMIT, dt->limit); | ||
1418 | vmcs_writel(GUEST_IDTR_BASE, dt->base); | ||
1419 | } | ||
1420 | |||
1421 | static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) | ||
1422 | { | ||
1423 | dt->limit = vmcs_read32(GUEST_GDTR_LIMIT); | ||
1424 | dt->base = vmcs_readl(GUEST_GDTR_BASE); | ||
1425 | } | ||
1426 | |||
1427 | static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) | ||
1428 | { | ||
1429 | vmcs_write32(GUEST_GDTR_LIMIT, dt->limit); | ||
1430 | vmcs_writel(GUEST_GDTR_BASE, dt->base); | ||
1431 | } | ||
1432 | |||
1433 | static int init_rmode_tss(struct kvm *kvm) | ||
1434 | { | ||
1435 | gfn_t fn = rmode_tss_base(kvm) >> PAGE_SHIFT; | ||
1436 | u16 data = 0; | ||
1437 | int r; | ||
1438 | |||
1439 | r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE); | ||
1440 | if (r < 0) | ||
1441 | return 0; | ||
1442 | data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE; | ||
1443 | r = kvm_write_guest_page(kvm, fn++, &data, 0x66, sizeof(u16)); | ||
1444 | if (r < 0) | ||
1445 | return 0; | ||
1446 | r = kvm_clear_guest_page(kvm, fn++, 0, PAGE_SIZE); | ||
1447 | if (r < 0) | ||
1448 | return 0; | ||
1449 | r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE); | ||
1450 | if (r < 0) | ||
1451 | return 0; | ||
1452 | data = ~0; | ||
1453 | r = kvm_write_guest_page(kvm, fn, &data, RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1, | ||
1454 | sizeof(u8)); | ||
1455 | if (r < 0) | ||
1456 | return 0; | ||
1457 | return 1; | ||
1458 | } | ||
1459 | |||
1460 | static void seg_setup(int seg) | ||
1461 | { | ||
1462 | struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; | ||
1463 | |||
1464 | vmcs_write16(sf->selector, 0); | ||
1465 | vmcs_writel(sf->base, 0); | ||
1466 | vmcs_write32(sf->limit, 0xffff); | ||
1467 | vmcs_write32(sf->ar_bytes, 0x93); | ||
1468 | } | ||
1469 | |||
1470 | static int alloc_apic_access_page(struct kvm *kvm) | ||
1471 | { | ||
1472 | struct kvm_userspace_memory_region kvm_userspace_mem; | ||
1473 | int r = 0; | ||
1474 | |||
1475 | mutex_lock(&kvm->lock); | ||
1476 | if (kvm->arch.apic_access_page) | ||
1477 | goto out; | ||
1478 | kvm_userspace_mem.slot = APIC_ACCESS_PAGE_PRIVATE_MEMSLOT; | ||
1479 | kvm_userspace_mem.flags = 0; | ||
1480 | kvm_userspace_mem.guest_phys_addr = 0xfee00000ULL; | ||
1481 | kvm_userspace_mem.memory_size = PAGE_SIZE; | ||
1482 | r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, 0); | ||
1483 | if (r) | ||
1484 | goto out; | ||
1485 | kvm->arch.apic_access_page = gfn_to_page(kvm, 0xfee00); | ||
1486 | out: | ||
1487 | mutex_unlock(&kvm->lock); | ||
1488 | return r; | ||
1489 | } | ||
1490 | |||
1491 | /* | ||
1492 | * Sets up the vmcs for emulated real mode. | ||
1493 | */ | ||
1494 | static int vmx_vcpu_setup(struct vcpu_vmx *vmx) | ||
1495 | { | ||
1496 | u32 host_sysenter_cs; | ||
1497 | u32 junk; | ||
1498 | unsigned long a; | ||
1499 | struct descriptor_table dt; | ||
1500 | int i; | ||
1501 | unsigned long kvm_vmx_return; | ||
1502 | u32 exec_control; | ||
1503 | |||
1504 | /* I/O */ | ||
1505 | vmcs_write64(IO_BITMAP_A, page_to_phys(vmx_io_bitmap_a)); | ||
1506 | vmcs_write64(IO_BITMAP_B, page_to_phys(vmx_io_bitmap_b)); | ||
1507 | |||
1508 | vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */ | ||
1509 | |||
1510 | /* Control */ | ||
1511 | vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, | ||
1512 | vmcs_config.pin_based_exec_ctrl); | ||
1513 | |||
1514 | exec_control = vmcs_config.cpu_based_exec_ctrl; | ||
1515 | if (!vm_need_tpr_shadow(vmx->vcpu.kvm)) { | ||
1516 | exec_control &= ~CPU_BASED_TPR_SHADOW; | ||
1517 | #ifdef CONFIG_X86_64 | ||
1518 | exec_control |= CPU_BASED_CR8_STORE_EXITING | | ||
1519 | CPU_BASED_CR8_LOAD_EXITING; | ||
1520 | #endif | ||
1521 | } | ||
1522 | vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control); | ||
1523 | |||
1524 | if (cpu_has_secondary_exec_ctrls()) { | ||
1525 | exec_control = vmcs_config.cpu_based_2nd_exec_ctrl; | ||
1526 | if (!vm_need_virtualize_apic_accesses(vmx->vcpu.kvm)) | ||
1527 | exec_control &= | ||
1528 | ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; | ||
1529 | vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); | ||
1530 | } | ||
1531 | |||
1532 | vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, !!bypass_guest_pf); | ||
1533 | vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, !!bypass_guest_pf); | ||
1534 | vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */ | ||
1535 | |||
1536 | vmcs_writel(HOST_CR0, read_cr0()); /* 22.2.3 */ | ||
1537 | vmcs_writel(HOST_CR4, read_cr4()); /* 22.2.3, 22.2.5 */ | ||
1538 | vmcs_writel(HOST_CR3, read_cr3()); /* 22.2.3 FIXME: shadow tables */ | ||
1539 | |||
1540 | vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */ | ||
1541 | vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ | ||
1542 | vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */ | ||
1543 | vmcs_write16(HOST_FS_SELECTOR, read_fs()); /* 22.2.4 */ | ||
1544 | vmcs_write16(HOST_GS_SELECTOR, read_gs()); /* 22.2.4 */ | ||
1545 | vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ | ||
1546 | #ifdef CONFIG_X86_64 | ||
1547 | rdmsrl(MSR_FS_BASE, a); | ||
1548 | vmcs_writel(HOST_FS_BASE, a); /* 22.2.4 */ | ||
1549 | rdmsrl(MSR_GS_BASE, a); | ||
1550 | vmcs_writel(HOST_GS_BASE, a); /* 22.2.4 */ | ||
1551 | #else | ||
1552 | vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */ | ||
1553 | vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */ | ||
1554 | #endif | ||
1555 | |||
1556 | vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */ | ||
1557 | |||
1558 | get_idt(&dt); | ||
1559 | vmcs_writel(HOST_IDTR_BASE, dt.base); /* 22.2.4 */ | ||
1560 | |||
1561 | asm("mov $.Lkvm_vmx_return, %0" : "=r"(kvm_vmx_return)); | ||
1562 | vmcs_writel(HOST_RIP, kvm_vmx_return); /* 22.2.5 */ | ||
1563 | vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0); | ||
1564 | vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0); | ||
1565 | vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0); | ||
1566 | |||
1567 | rdmsr(MSR_IA32_SYSENTER_CS, host_sysenter_cs, junk); | ||
1568 | vmcs_write32(HOST_IA32_SYSENTER_CS, host_sysenter_cs); | ||
1569 | rdmsrl(MSR_IA32_SYSENTER_ESP, a); | ||
1570 | vmcs_writel(HOST_IA32_SYSENTER_ESP, a); /* 22.2.3 */ | ||
1571 | rdmsrl(MSR_IA32_SYSENTER_EIP, a); | ||
1572 | vmcs_writel(HOST_IA32_SYSENTER_EIP, a); /* 22.2.3 */ | ||
1573 | |||
1574 | for (i = 0; i < NR_VMX_MSR; ++i) { | ||
1575 | u32 index = vmx_msr_index[i]; | ||
1576 | u32 data_low, data_high; | ||
1577 | u64 data; | ||
1578 | int j = vmx->nmsrs; | ||
1579 | |||
1580 | if (rdmsr_safe(index, &data_low, &data_high) < 0) | ||
1581 | continue; | ||
1582 | if (wrmsr_safe(index, data_low, data_high) < 0) | ||
1583 | continue; | ||
1584 | data = data_low | ((u64)data_high << 32); | ||
1585 | vmx->host_msrs[j].index = index; | ||
1586 | vmx->host_msrs[j].reserved = 0; | ||
1587 | vmx->host_msrs[j].data = data; | ||
1588 | vmx->guest_msrs[j] = vmx->host_msrs[j]; | ||
1589 | ++vmx->nmsrs; | ||
1590 | } | ||
1591 | |||
1592 | vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl); | ||
1593 | |||
1594 | /* 22.2.1, 20.8.1 */ | ||
1595 | vmcs_write32(VM_ENTRY_CONTROLS, vmcs_config.vmentry_ctrl); | ||
1596 | |||
1597 | vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL); | ||
1598 | vmcs_writel(CR4_GUEST_HOST_MASK, KVM_GUEST_CR4_MASK); | ||
1599 | |||
1600 | if (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm)) | ||
1601 | if (alloc_apic_access_page(vmx->vcpu.kvm) != 0) | ||
1602 | return -ENOMEM; | ||
1603 | |||
1604 | return 0; | ||
1605 | } | ||
1606 | |||
1607 | static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) | ||
1608 | { | ||
1609 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
1610 | u64 msr; | ||
1611 | int ret; | ||
1612 | |||
1613 | if (!init_rmode_tss(vmx->vcpu.kvm)) { | ||
1614 | ret = -ENOMEM; | ||
1615 | goto out; | ||
1616 | } | ||
1617 | |||
1618 | vmx->vcpu.arch.rmode.active = 0; | ||
1619 | |||
1620 | vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val(); | ||
1621 | set_cr8(&vmx->vcpu, 0); | ||
1622 | msr = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; | ||
1623 | if (vmx->vcpu.vcpu_id == 0) | ||
1624 | msr |= MSR_IA32_APICBASE_BSP; | ||
1625 | kvm_set_apic_base(&vmx->vcpu, msr); | ||
1626 | |||
1627 | fx_init(&vmx->vcpu); | ||
1628 | |||
1629 | /* | ||
1630 | * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode | ||
1631 | * insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4. Sigh. | ||
1632 | */ | ||
1633 | if (vmx->vcpu.vcpu_id == 0) { | ||
1634 | vmcs_write16(GUEST_CS_SELECTOR, 0xf000); | ||
1635 | vmcs_writel(GUEST_CS_BASE, 0x000f0000); | ||
1636 | } else { | ||
1637 | vmcs_write16(GUEST_CS_SELECTOR, vmx->vcpu.arch.sipi_vector << 8); | ||
1638 | vmcs_writel(GUEST_CS_BASE, vmx->vcpu.arch.sipi_vector << 12); | ||
1639 | } | ||
1640 | vmcs_write32(GUEST_CS_LIMIT, 0xffff); | ||
1641 | vmcs_write32(GUEST_CS_AR_BYTES, 0x9b); | ||
1642 | |||
1643 | seg_setup(VCPU_SREG_DS); | ||
1644 | seg_setup(VCPU_SREG_ES); | ||
1645 | seg_setup(VCPU_SREG_FS); | ||
1646 | seg_setup(VCPU_SREG_GS); | ||
1647 | seg_setup(VCPU_SREG_SS); | ||
1648 | |||
1649 | vmcs_write16(GUEST_TR_SELECTOR, 0); | ||
1650 | vmcs_writel(GUEST_TR_BASE, 0); | ||
1651 | vmcs_write32(GUEST_TR_LIMIT, 0xffff); | ||
1652 | vmcs_write32(GUEST_TR_AR_BYTES, 0x008b); | ||
1653 | |||
1654 | vmcs_write16(GUEST_LDTR_SELECTOR, 0); | ||
1655 | vmcs_writel(GUEST_LDTR_BASE, 0); | ||
1656 | vmcs_write32(GUEST_LDTR_LIMIT, 0xffff); | ||
1657 | vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082); | ||
1658 | |||
1659 | vmcs_write32(GUEST_SYSENTER_CS, 0); | ||
1660 | vmcs_writel(GUEST_SYSENTER_ESP, 0); | ||
1661 | vmcs_writel(GUEST_SYSENTER_EIP, 0); | ||
1662 | |||
1663 | vmcs_writel(GUEST_RFLAGS, 0x02); | ||
1664 | if (vmx->vcpu.vcpu_id == 0) | ||
1665 | vmcs_writel(GUEST_RIP, 0xfff0); | ||
1666 | else | ||
1667 | vmcs_writel(GUEST_RIP, 0); | ||
1668 | vmcs_writel(GUEST_RSP, 0); | ||
1669 | |||
1670 | /* todo: dr0 = dr1 = dr2 = dr3 = 0; dr6 = 0xffff0ff0 */ | ||
1671 | vmcs_writel(GUEST_DR7, 0x400); | ||
1672 | |||
1673 | vmcs_writel(GUEST_GDTR_BASE, 0); | ||
1674 | vmcs_write32(GUEST_GDTR_LIMIT, 0xffff); | ||
1675 | |||
1676 | vmcs_writel(GUEST_IDTR_BASE, 0); | ||
1677 | vmcs_write32(GUEST_IDTR_LIMIT, 0xffff); | ||
1678 | |||
1679 | vmcs_write32(GUEST_ACTIVITY_STATE, 0); | ||
1680 | vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0); | ||
1681 | vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0); | ||
1682 | |||
1683 | guest_write_tsc(0); | ||
1684 | |||
1685 | /* Special registers */ | ||
1686 | vmcs_write64(GUEST_IA32_DEBUGCTL, 0); | ||
1687 | |||
1688 | setup_msrs(vmx); | ||
1689 | |||
1690 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); /* 22.2.1 */ | ||
1691 | |||
1692 | if (cpu_has_vmx_tpr_shadow()) { | ||
1693 | vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0); | ||
1694 | if (vm_need_tpr_shadow(vmx->vcpu.kvm)) | ||
1695 | vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, | ||
1696 | page_to_phys(vmx->vcpu.arch.apic->regs_page)); | ||
1697 | vmcs_write32(TPR_THRESHOLD, 0); | ||
1698 | } | ||
1699 | |||
1700 | if (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm)) | ||
1701 | vmcs_write64(APIC_ACCESS_ADDR, | ||
1702 | page_to_phys(vmx->vcpu.kvm->arch.apic_access_page)); | ||
1703 | |||
1704 | vmx->vcpu.arch.cr0 = 0x60000010; | ||
1705 | vmx_set_cr0(&vmx->vcpu, vmx->vcpu.arch.cr0); /* enter rmode */ | ||
1706 | vmx_set_cr4(&vmx->vcpu, 0); | ||
1707 | #ifdef CONFIG_X86_64 | ||
1708 | vmx_set_efer(&vmx->vcpu, 0); | ||
1709 | #endif | ||
1710 | vmx_fpu_activate(&vmx->vcpu); | ||
1711 | update_exception_bitmap(&vmx->vcpu); | ||
1712 | |||
1713 | return 0; | ||
1714 | |||
1715 | out: | ||
1716 | return ret; | ||
1717 | } | ||
1718 | |||
1719 | static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq) | ||
1720 | { | ||
1721 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
1722 | |||
1723 | if (vcpu->arch.rmode.active) { | ||
1724 | vmx->rmode.irq.pending = true; | ||
1725 | vmx->rmode.irq.vector = irq; | ||
1726 | vmx->rmode.irq.rip = vmcs_readl(GUEST_RIP); | ||
1727 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, | ||
1728 | irq | INTR_TYPE_SOFT_INTR | INTR_INFO_VALID_MASK); | ||
1729 | vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1); | ||
1730 | vmcs_writel(GUEST_RIP, vmx->rmode.irq.rip - 1); | ||
1731 | return; | ||
1732 | } | ||
1733 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, | ||
1734 | irq | INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK); | ||
1735 | } | ||
1736 | |||
1737 | static void kvm_do_inject_irq(struct kvm_vcpu *vcpu) | ||
1738 | { | ||
1739 | int word_index = __ffs(vcpu->arch.irq_summary); | ||
1740 | int bit_index = __ffs(vcpu->arch.irq_pending[word_index]); | ||
1741 | int irq = word_index * BITS_PER_LONG + bit_index; | ||
1742 | |||
1743 | clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]); | ||
1744 | if (!vcpu->arch.irq_pending[word_index]) | ||
1745 | clear_bit(word_index, &vcpu->arch.irq_summary); | ||
1746 | vmx_inject_irq(vcpu, irq); | ||
1747 | } | ||
1748 | |||
1749 | |||
1750 | static void do_interrupt_requests(struct kvm_vcpu *vcpu, | ||
1751 | struct kvm_run *kvm_run) | ||
1752 | { | ||
1753 | u32 cpu_based_vm_exec_control; | ||
1754 | |||
1755 | vcpu->arch.interrupt_window_open = | ||
1756 | ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) && | ||
1757 | (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0); | ||
1758 | |||
1759 | if (vcpu->arch.interrupt_window_open && | ||
1760 | vcpu->arch.irq_summary && | ||
1761 | !(vmcs_read32(VM_ENTRY_INTR_INFO_FIELD) & INTR_INFO_VALID_MASK)) | ||
1762 | /* | ||
1763 | * Interrupts are enabled and not blocked by sti or mov ss, so inject now. | ||
1764 | */ | ||
1765 | kvm_do_inject_irq(vcpu); | ||
1766 | |||
1767 | cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); | ||
1768 | if (!vcpu->arch.interrupt_window_open && | ||
1769 | (vcpu->arch.irq_summary || kvm_run->request_interrupt_window)) | ||
1770 | /* | ||
1771 | * Interrupts blocked. Wait for unblock. | ||
1772 | */ | ||
1773 | cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING; | ||
1774 | else | ||
1775 | cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING; | ||
1776 | vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); | ||
1777 | } | ||
1778 | |||
1779 | static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr) | ||
1780 | { | ||
1781 | int ret; | ||
1782 | struct kvm_userspace_memory_region tss_mem = { | ||
1783 | .slot = 8, | ||
1784 | .guest_phys_addr = addr, | ||
1785 | .memory_size = PAGE_SIZE * 3, | ||
1786 | .flags = 0, | ||
1787 | }; | ||
1788 | |||
1789 | ret = kvm_set_memory_region(kvm, &tss_mem, 0); | ||
1790 | if (ret) | ||
1791 | return ret; | ||
1792 | kvm->arch.tss_addr = addr; | ||
1793 | return 0; | ||
1794 | } | ||
1795 | |||
1796 | static void kvm_guest_debug_pre(struct kvm_vcpu *vcpu) | ||
1797 | { | ||
1798 | struct kvm_guest_debug *dbg = &vcpu->guest_debug; | ||
1799 | |||
1800 | set_debugreg(dbg->bp[0], 0); | ||
1801 | set_debugreg(dbg->bp[1], 1); | ||
1802 | set_debugreg(dbg->bp[2], 2); | ||
1803 | set_debugreg(dbg->bp[3], 3); | ||
1804 | |||
1805 | if (dbg->singlestep) { | ||
1806 | unsigned long flags; | ||
1807 | |||
1808 | flags = vmcs_readl(GUEST_RFLAGS); | ||
1809 | flags |= X86_EFLAGS_TF | X86_EFLAGS_RF; | ||
1810 | vmcs_writel(GUEST_RFLAGS, flags); | ||
1811 | } | ||
1812 | } | ||
1813 | |||
1814 | static int handle_rmode_exception(struct kvm_vcpu *vcpu, | ||
1815 | int vec, u32 err_code) | ||
1816 | { | ||
1817 | if (!vcpu->arch.rmode.active) | ||
1818 | return 0; | ||
1819 | |||
1820 | /* | ||
1821 | * An instruction with the address-size override prefix (opcode 0x67) | ||
1822 | * causes a #SS fault with error code 0 in VM86 mode. | ||
1823 | */ | ||
1824 | if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) | ||
1825 | if (emulate_instruction(vcpu, NULL, 0, 0, 0) == EMULATE_DONE) | ||
1826 | return 1; | ||
1827 | return 0; | ||
1828 | } | ||
1829 | |||
1830 | static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | ||
1831 | { | ||
1832 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
1833 | u32 intr_info, error_code; | ||
1834 | unsigned long cr2, rip; | ||
1835 | u32 vect_info; | ||
1836 | enum emulation_result er; | ||
1837 | |||
1838 | vect_info = vmx->idt_vectoring_info; | ||
1839 | intr_info = vmcs_read32(VM_EXIT_INTR_INFO); | ||
1840 | |||
1841 | if ((vect_info & VECTORING_INFO_VALID_MASK) && | ||
1842 | !is_page_fault(intr_info)) | ||
1843 | printk(KERN_ERR "%s: unexpected, vectoring info 0x%x " | ||
1844 | "intr info 0x%x\n", __FUNCTION__, vect_info, intr_info); | ||
1845 | |||
1846 | if (!irqchip_in_kernel(vcpu->kvm) && is_external_interrupt(vect_info)) { | ||
1847 | int irq = vect_info & VECTORING_INFO_VECTOR_MASK; | ||
1848 | set_bit(irq, vcpu->arch.irq_pending); | ||
1849 | set_bit(irq / BITS_PER_LONG, &vcpu->arch.irq_summary); | ||
1850 | } | ||
1851 | |||
1852 | if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200) /* nmi */ | ||
1853 | return 1; /* already handled by vmx_vcpu_run() */ | ||
1854 | |||
1855 | if (is_no_device(intr_info)) { | ||
1856 | vmx_fpu_activate(vcpu); | ||
1857 | return 1; | ||
1858 | } | ||
1859 | |||
1860 | if (is_invalid_opcode(intr_info)) { | ||
1861 | er = emulate_instruction(vcpu, kvm_run, 0, 0, 0); | ||
1862 | if (er != EMULATE_DONE) | ||
1863 | kvm_queue_exception(vcpu, UD_VECTOR); | ||
1864 | return 1; | ||
1865 | } | ||
1866 | |||
1867 | error_code = 0; | ||
1868 | rip = vmcs_readl(GUEST_RIP); | ||
1869 | if (intr_info & INTR_INFO_DELIEVER_CODE_MASK) | ||
1870 | error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); | ||
1871 | if (is_page_fault(intr_info)) { | ||
1872 | cr2 = vmcs_readl(EXIT_QUALIFICATION); | ||
1873 | return kvm_mmu_page_fault(vcpu, cr2, error_code); | ||
1874 | } | ||
1875 | |||
1876 | if (vcpu->arch.rmode.active && | ||
1877 | handle_rmode_exception(vcpu, intr_info & INTR_INFO_VECTOR_MASK, | ||
1878 | error_code)) { | ||
1879 | if (vcpu->arch.halt_request) { | ||
1880 | vcpu->arch.halt_request = 0; | ||
1881 | return kvm_emulate_halt(vcpu); | ||
1882 | } | ||
1883 | return 1; | ||
1884 | } | ||
1885 | |||
1886 | if ((intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK)) == | ||
1887 | (INTR_TYPE_EXCEPTION | 1)) { | ||
1888 | kvm_run->exit_reason = KVM_EXIT_DEBUG; | ||
1889 | return 0; | ||
1890 | } | ||
1891 | kvm_run->exit_reason = KVM_EXIT_EXCEPTION; | ||
1892 | kvm_run->ex.exception = intr_info & INTR_INFO_VECTOR_MASK; | ||
1893 | kvm_run->ex.error_code = error_code; | ||
1894 | return 0; | ||
1895 | } | ||
1896 | |||
1897 | static int handle_external_interrupt(struct kvm_vcpu *vcpu, | ||
1898 | struct kvm_run *kvm_run) | ||
1899 | { | ||
1900 | ++vcpu->stat.irq_exits; | ||
1901 | return 1; | ||
1902 | } | ||
1903 | |||
1904 | static int handle_triple_fault(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | ||
1905 | { | ||
1906 | kvm_run->exit_reason = KVM_EXIT_SHUTDOWN; | ||
1907 | return 0; | ||
1908 | } | ||
1909 | |||
1910 | static int handle_io(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | ||
1911 | { | ||
1912 | unsigned long exit_qualification; | ||
1913 | int size, down, in, string, rep; | ||
1914 | unsigned port; | ||
1915 | |||
1916 | ++vcpu->stat.io_exits; | ||
1917 | exit_qualification = vmcs_readl(EXIT_QUALIFICATION); | ||
1918 | string = (exit_qualification & 16) != 0; | ||
1919 | |||
1920 | if (string) { | ||
1921 | if (emulate_instruction(vcpu, | ||
1922 | kvm_run, 0, 0, 0) == EMULATE_DO_MMIO) | ||
1923 | return 0; | ||
1924 | return 1; | ||
1925 | } | ||
1926 | |||
1927 | size = (exit_qualification & 7) + 1; | ||
1928 | in = (exit_qualification & 8) != 0; | ||
1929 | down = (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_DF) != 0; | ||
1930 | rep = (exit_qualification & 32) != 0; | ||
1931 | port = exit_qualification >> 16; | ||
1932 | |||
1933 | return kvm_emulate_pio(vcpu, kvm_run, in, size, port); | ||
1934 | } | ||
1935 | |||
1936 | static void | ||
1937 | vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall) | ||
1938 | { | ||
1939 | /* | ||
1940 | * Patch in the VMCALL instruction: | ||
1941 | */ | ||
1942 | hypercall[0] = 0x0f; | ||
1943 | hypercall[1] = 0x01; | ||
1944 | hypercall[2] = 0xc1; | ||
1945 | } | ||
1946 | |||
1947 | static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | ||
1948 | { | ||
1949 | unsigned long exit_qualification; | ||
1950 | int cr; | ||
1951 | int reg; | ||
1952 | |||
1953 | exit_qualification = vmcs_readl(EXIT_QUALIFICATION); | ||
1954 | cr = exit_qualification & 15; | ||
1955 | reg = (exit_qualification >> 8) & 15; | ||
1956 | switch ((exit_qualification >> 4) & 3) { | ||
1957 | case 0: /* mov to cr */ | ||
1958 | switch (cr) { | ||
1959 | case 0: | ||
1960 | vcpu_load_rsp_rip(vcpu); | ||
1961 | set_cr0(vcpu, vcpu->arch.regs[reg]); | ||
1962 | skip_emulated_instruction(vcpu); | ||
1963 | return 1; | ||
1964 | case 3: | ||
1965 | vcpu_load_rsp_rip(vcpu); | ||
1966 | set_cr3(vcpu, vcpu->arch.regs[reg]); | ||
1967 | skip_emulated_instruction(vcpu); | ||
1968 | return 1; | ||
1969 | case 4: | ||
1970 | vcpu_load_rsp_rip(vcpu); | ||
1971 | set_cr4(vcpu, vcpu->arch.regs[reg]); | ||
1972 | skip_emulated_instruction(vcpu); | ||
1973 | return 1; | ||
1974 | case 8: | ||
1975 | vcpu_load_rsp_rip(vcpu); | ||
1976 | set_cr8(vcpu, vcpu->arch.regs[reg]); | ||
1977 | skip_emulated_instruction(vcpu); | ||
1978 | if (irqchip_in_kernel(vcpu->kvm)) | ||
1979 | return 1; | ||
1980 | kvm_run->exit_reason = KVM_EXIT_SET_TPR; | ||
1981 | return 0; | ||
1982 | } | ||
1983 | break; | ||
1984 | case 2: /* clts */ | ||
1985 | vcpu_load_rsp_rip(vcpu); | ||
1986 | vmx_fpu_deactivate(vcpu); | ||
1987 | vcpu->arch.cr0 &= ~X86_CR0_TS; | ||
1988 | vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0); | ||
1989 | vmx_fpu_activate(vcpu); | ||
1990 | skip_emulated_instruction(vcpu); | ||
1991 | return 1; | ||
1992 | case 1: /*mov from cr*/ | ||
1993 | switch (cr) { | ||
1994 | case 3: | ||
1995 | vcpu_load_rsp_rip(vcpu); | ||
1996 | vcpu->arch.regs[reg] = vcpu->arch.cr3; | ||
1997 | vcpu_put_rsp_rip(vcpu); | ||
1998 | skip_emulated_instruction(vcpu); | ||
1999 | return 1; | ||
2000 | case 8: | ||
2001 | vcpu_load_rsp_rip(vcpu); | ||
2002 | vcpu->arch.regs[reg] = get_cr8(vcpu); | ||
2003 | vcpu_put_rsp_rip(vcpu); | ||
2004 | skip_emulated_instruction(vcpu); | ||
2005 | return 1; | ||
2006 | } | ||
2007 | break; | ||
2008 | case 3: /* lmsw */ | ||
2009 | lmsw(vcpu, (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f); | ||
2010 | |||
2011 | skip_emulated_instruction(vcpu); | ||
2012 | return 1; | ||
2013 | default: | ||
2014 | break; | ||
2015 | } | ||
2016 | kvm_run->exit_reason = 0; | ||
2017 | pr_unimpl(vcpu, "unhandled control register: op %d cr %d\n", | ||
2018 | (int)(exit_qualification >> 4) & 3, cr); | ||
2019 | return 0; | ||
2020 | } | ||
2021 | |||
2022 | static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | ||
2023 | { | ||
2024 | unsigned long exit_qualification; | ||
2025 | unsigned long val; | ||
2026 | int dr, reg; | ||
2027 | |||
2028 | /* | ||
2029 | * FIXME: this code assumes the host is debugging the guest. | ||
2030 | * We still need to deal with the guest debugging itself too. | ||
2031 | */ | ||
2032 | exit_qualification = vmcs_readl(EXIT_QUALIFICATION); | ||
2033 | dr = exit_qualification & 7; | ||
2034 | reg = (exit_qualification >> 8) & 15; | ||
2035 | vcpu_load_rsp_rip(vcpu); | ||
2036 | if (exit_qualification & 16) { | ||
2037 | /* mov from dr */ | ||
2038 | switch (dr) { | ||
2039 | case 6: | ||
2040 | val = 0xffff0ff0; | ||
2041 | break; | ||
2042 | case 7: | ||
2043 | val = 0x400; | ||
2044 | break; | ||
2045 | default: | ||
2046 | val = 0; | ||
2047 | } | ||
2048 | vcpu->arch.regs[reg] = val; | ||
2049 | } else { | ||
2050 | /* mov to dr */ | ||
2051 | } | ||
2052 | vcpu_put_rsp_rip(vcpu); | ||
2053 | skip_emulated_instruction(vcpu); | ||
2054 | return 1; | ||
2055 | } | ||
2056 | |||
2057 | static int handle_cpuid(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | ||
2058 | { | ||
2059 | kvm_emulate_cpuid(vcpu); | ||
2060 | return 1; | ||
2061 | } | ||
2062 | |||
2063 | static int handle_rdmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | ||
2064 | { | ||
2065 | u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX]; | ||
2066 | u64 data; | ||
2067 | |||
2068 | if (vmx_get_msr(vcpu, ecx, &data)) { | ||
2069 | kvm_inject_gp(vcpu, 0); | ||
2070 | return 1; | ||
2071 | } | ||
2072 | |||
2073 | /* FIXME: handling of bits 32:63 of rax, rdx */ | ||
2074 | vcpu->arch.regs[VCPU_REGS_RAX] = data & -1u; | ||
2075 | vcpu->arch.regs[VCPU_REGS_RDX] = (data >> 32) & -1u; | ||
2076 | skip_emulated_instruction(vcpu); | ||
2077 | return 1; | ||
2078 | } | ||
2079 | |||
2080 | static int handle_wrmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | ||
2081 | { | ||
2082 | u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX]; | ||
2083 | u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u) | ||
2084 | | ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32); | ||
2085 | |||
2086 | if (vmx_set_msr(vcpu, ecx, data) != 0) { | ||
2087 | kvm_inject_gp(vcpu, 0); | ||
2088 | return 1; | ||
2089 | } | ||
2090 | |||
2091 | skip_emulated_instruction(vcpu); | ||
2092 | return 1; | ||
2093 | } | ||
2094 | |||
2095 | static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu, | ||
2096 | struct kvm_run *kvm_run) | ||
2097 | { | ||
2098 | return 1; | ||
2099 | } | ||
2100 | |||
2101 | static int handle_interrupt_window(struct kvm_vcpu *vcpu, | ||
2102 | struct kvm_run *kvm_run) | ||
2103 | { | ||
2104 | u32 cpu_based_vm_exec_control; | ||
2105 | |||
2106 | /* clear pending irq */ | ||
2107 | cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); | ||
2108 | cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING; | ||
2109 | vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); | ||
2110 | /* | ||
2111 | * If user space is waiting to inject interrupts, exit as soon as | ||
2112 | * possible. | ||
2113 | */ | ||
2114 | if (kvm_run->request_interrupt_window && | ||
2115 | !vcpu->arch.irq_summary) { | ||
2116 | kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; | ||
2117 | ++vcpu->stat.irq_window_exits; | ||
2118 | return 0; | ||
2119 | } | ||
2120 | return 1; | ||
2121 | } | ||
2122 | |||
2123 | static int handle_halt(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | ||
2124 | { | ||
2125 | skip_emulated_instruction(vcpu); | ||
2126 | return kvm_emulate_halt(vcpu); | ||
2127 | } | ||
2128 | |||
2129 | static int handle_vmcall(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | ||
2130 | { | ||
2131 | skip_emulated_instruction(vcpu); | ||
2132 | kvm_emulate_hypercall(vcpu); | ||
2133 | return 1; | ||
2134 | } | ||
2135 | |||
2136 | static int handle_wbinvd(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | ||
2137 | { | ||
2138 | skip_emulated_instruction(vcpu); | ||
2139 | /* TODO: Add support for VT-d/pass-through device */ | ||
2140 | return 1; | ||
2141 | } | ||
2142 | |||
2143 | static int handle_apic_access(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | ||
2144 | { | ||
2145 | u64 exit_qualification; | ||
2146 | enum emulation_result er; | ||
2147 | unsigned long offset; | ||
2148 | |||
2149 | exit_qualification = vmcs_read64(EXIT_QUALIFICATION); | ||
2150 | offset = exit_qualification & 0xffful; | ||
2151 | |||
2152 | er = emulate_instruction(vcpu, kvm_run, 0, 0, 0); | ||
2153 | |||
2154 | if (er != EMULATE_DONE) { | ||
2155 | printk(KERN_ERR | ||
2156 | "Fail to handle apic access vmexit! Offset is 0x%lx\n", | ||
2157 | offset); | ||
2158 | return -ENOTSUPP; | ||
2159 | } | ||
2160 | return 1; | ||
2161 | } | ||
2162 | |||
2163 | /* | ||
2164 | * The exit handlers return 1 if the exit was handled fully and guest execution | ||
2165 | * may resume. Otherwise they set the kvm_run parameter to indicate what needs | ||
2166 | * to be done to userspace and return 0. | ||
2167 | */ | ||
2168 | static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu, | ||
2169 | struct kvm_run *kvm_run) = { | ||
2170 | [EXIT_REASON_EXCEPTION_NMI] = handle_exception, | ||
2171 | [EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt, | ||
2172 | [EXIT_REASON_TRIPLE_FAULT] = handle_triple_fault, | ||
2173 | [EXIT_REASON_IO_INSTRUCTION] = handle_io, | ||
2174 | [EXIT_REASON_CR_ACCESS] = handle_cr, | ||
2175 | [EXIT_REASON_DR_ACCESS] = handle_dr, | ||
2176 | [EXIT_REASON_CPUID] = handle_cpuid, | ||
2177 | [EXIT_REASON_MSR_READ] = handle_rdmsr, | ||
2178 | [EXIT_REASON_MSR_WRITE] = handle_wrmsr, | ||
2179 | [EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window, | ||
2180 | [EXIT_REASON_HLT] = handle_halt, | ||
2181 | [EXIT_REASON_VMCALL] = handle_vmcall, | ||
2182 | [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold, | ||
2183 | [EXIT_REASON_APIC_ACCESS] = handle_apic_access, | ||
2184 | [EXIT_REASON_WBINVD] = handle_wbinvd, | ||
2185 | }; | ||
2186 | |||
2187 | static const int kvm_vmx_max_exit_handlers = | ||
2188 | ARRAY_SIZE(kvm_vmx_exit_handlers); | ||
2189 | |||
2190 | /* | ||
2191 | * The guest has exited. See if we can fix it or if we need userspace | ||
2192 | * assistance. | ||
2193 | */ | ||
2194 | static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) | ||
2195 | { | ||
2196 | u32 exit_reason = vmcs_read32(VM_EXIT_REASON); | ||
2197 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
2198 | u32 vectoring_info = vmx->idt_vectoring_info; | ||
2199 | |||
2200 | if (unlikely(vmx->fail)) { | ||
2201 | kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; | ||
2202 | kvm_run->fail_entry.hardware_entry_failure_reason | ||
2203 | = vmcs_read32(VM_INSTRUCTION_ERROR); | ||
2204 | return 0; | ||
2205 | } | ||
2206 | |||
2207 | if ((vectoring_info & VECTORING_INFO_VALID_MASK) && | ||
2208 | exit_reason != EXIT_REASON_EXCEPTION_NMI) | ||
2209 | printk(KERN_WARNING "%s: unexpected, valid vectoring info and " | ||
2210 | "exit reason is 0x%x\n", __FUNCTION__, exit_reason); | ||
2211 | if (exit_reason < kvm_vmx_max_exit_handlers | ||
2212 | && kvm_vmx_exit_handlers[exit_reason]) | ||
2213 | return kvm_vmx_exit_handlers[exit_reason](vcpu, kvm_run); | ||
2214 | else { | ||
2215 | kvm_run->exit_reason = KVM_EXIT_UNKNOWN; | ||
2216 | kvm_run->hw.hardware_exit_reason = exit_reason; | ||
2217 | } | ||
2218 | return 0; | ||
2219 | } | ||
2220 | |||
2221 | static void vmx_flush_tlb(struct kvm_vcpu *vcpu) | ||
2222 | { | ||
2223 | } | ||
2224 | |||
2225 | static void update_tpr_threshold(struct kvm_vcpu *vcpu) | ||
2226 | { | ||
2227 | int max_irr, tpr; | ||
2228 | |||
2229 | if (!vm_need_tpr_shadow(vcpu->kvm)) | ||
2230 | return; | ||
2231 | |||
2232 | if (!kvm_lapic_enabled(vcpu) || | ||
2233 | ((max_irr = kvm_lapic_find_highest_irr(vcpu)) == -1)) { | ||
2234 | vmcs_write32(TPR_THRESHOLD, 0); | ||
2235 | return; | ||
2236 | } | ||
2237 | |||
2238 | tpr = (kvm_lapic_get_cr8(vcpu) & 0x0f) << 4; | ||
2239 | vmcs_write32(TPR_THRESHOLD, (max_irr > tpr) ? tpr >> 4 : max_irr >> 4); | ||
2240 | } | ||
2241 | |||
2242 | static void enable_irq_window(struct kvm_vcpu *vcpu) | ||
2243 | { | ||
2244 | u32 cpu_based_vm_exec_control; | ||
2245 | |||
2246 | cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); | ||
2247 | cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING; | ||
2248 | vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); | ||
2249 | } | ||
2250 | |||
2251 | static void vmx_intr_assist(struct kvm_vcpu *vcpu) | ||
2252 | { | ||
2253 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
2254 | u32 idtv_info_field, intr_info_field; | ||
2255 | int has_ext_irq, interrupt_window_open; | ||
2256 | int vector; | ||
2257 | |||
2258 | update_tpr_threshold(vcpu); | ||
2259 | |||
2260 | has_ext_irq = kvm_cpu_has_interrupt(vcpu); | ||
2261 | intr_info_field = vmcs_read32(VM_ENTRY_INTR_INFO_FIELD); | ||
2262 | idtv_info_field = vmx->idt_vectoring_info; | ||
2263 | if (intr_info_field & INTR_INFO_VALID_MASK) { | ||
2264 | if (idtv_info_field & INTR_INFO_VALID_MASK) { | ||
2265 | /* TODO: fault when IDT_Vectoring */ | ||
2266 | if (printk_ratelimit()) | ||
2267 | printk(KERN_ERR "Fault when IDT_Vectoring\n"); | ||
2268 | } | ||
2269 | if (has_ext_irq) | ||
2270 | enable_irq_window(vcpu); | ||
2271 | return; | ||
2272 | } | ||
2273 | if (unlikely(idtv_info_field & INTR_INFO_VALID_MASK)) { | ||
2274 | if ((idtv_info_field & VECTORING_INFO_TYPE_MASK) | ||
2275 | == INTR_TYPE_EXT_INTR | ||
2276 | && vcpu->arch.rmode.active) { | ||
2277 | u8 vect = idtv_info_field & VECTORING_INFO_VECTOR_MASK; | ||
2278 | |||
2279 | vmx_inject_irq(vcpu, vect); | ||
2280 | if (unlikely(has_ext_irq)) | ||
2281 | enable_irq_window(vcpu); | ||
2282 | return; | ||
2283 | } | ||
2284 | |||
2285 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, idtv_info_field); | ||
2286 | vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, | ||
2287 | vmcs_read32(VM_EXIT_INSTRUCTION_LEN)); | ||
2288 | |||
2289 | if (unlikely(idtv_info_field & INTR_INFO_DELIEVER_CODE_MASK)) | ||
2290 | vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, | ||
2291 | vmcs_read32(IDT_VECTORING_ERROR_CODE)); | ||
2292 | if (unlikely(has_ext_irq)) | ||
2293 | enable_irq_window(vcpu); | ||
2294 | return; | ||
2295 | } | ||
2296 | if (!has_ext_irq) | ||
2297 | return; | ||
2298 | interrupt_window_open = | ||
2299 | ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) && | ||
2300 | (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0); | ||
2301 | if (interrupt_window_open) { | ||
2302 | vector = kvm_cpu_get_interrupt(vcpu); | ||
2303 | vmx_inject_irq(vcpu, vector); | ||
2304 | kvm_timer_intr_post(vcpu, vector); | ||
2305 | } else | ||
2306 | enable_irq_window(vcpu); | ||
2307 | } | ||
2308 | |||
2309 | /* | ||
2310 | * Failure to inject an interrupt should give us the information | ||
2311 | * in IDT_VECTORING_INFO_FIELD. However, if the failure occurs | ||
2312 | * when fetching the interrupt redirection bitmap in the real-mode | ||
2313 | * tss, this doesn't happen. So we do it ourselves. | ||
2314 | */ | ||
2315 | static void fixup_rmode_irq(struct vcpu_vmx *vmx) | ||
2316 | { | ||
2317 | vmx->rmode.irq.pending = 0; | ||
2318 | if (vmcs_readl(GUEST_RIP) + 1 != vmx->rmode.irq.rip) | ||
2319 | return; | ||
2320 | vmcs_writel(GUEST_RIP, vmx->rmode.irq.rip); | ||
2321 | if (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK) { | ||
2322 | vmx->idt_vectoring_info &= ~VECTORING_INFO_TYPE_MASK; | ||
2323 | vmx->idt_vectoring_info |= INTR_TYPE_EXT_INTR; | ||
2324 | return; | ||
2325 | } | ||
2326 | vmx->idt_vectoring_info = | ||
2327 | VECTORING_INFO_VALID_MASK | ||
2328 | | INTR_TYPE_EXT_INTR | ||
2329 | | vmx->rmode.irq.vector; | ||
2330 | } | ||
2331 | |||
2332 | static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | ||
2333 | { | ||
2334 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
2335 | u32 intr_info; | ||
2336 | |||
2337 | /* | ||
2338 | * Loading guest fpu may have cleared host cr0.ts | ||
2339 | */ | ||
2340 | vmcs_writel(HOST_CR0, read_cr0()); | ||
2341 | |||
2342 | asm( | ||
2343 | /* Store host registers */ | ||
2344 | #ifdef CONFIG_X86_64 | ||
2345 | "push %%rdx; push %%rbp;" | ||
2346 | "push %%rcx \n\t" | ||
2347 | #else | ||
2348 | "push %%edx; push %%ebp;" | ||
2349 | "push %%ecx \n\t" | ||
2350 | #endif | ||
2351 | ASM_VMX_VMWRITE_RSP_RDX "\n\t" | ||
2352 | /* Check if vmlaunch or vmresume is needed */ | ||
2353 | "cmpl $0, %c[launched](%0) \n\t" | ||
2354 | /* Load guest registers. Don't clobber flags. */ | ||
2355 | #ifdef CONFIG_X86_64 | ||
2356 | "mov %c[cr2](%0), %%rax \n\t" | ||
2357 | "mov %%rax, %%cr2 \n\t" | ||
2358 | "mov %c[rax](%0), %%rax \n\t" | ||
2359 | "mov %c[rbx](%0), %%rbx \n\t" | ||
2360 | "mov %c[rdx](%0), %%rdx \n\t" | ||
2361 | "mov %c[rsi](%0), %%rsi \n\t" | ||
2362 | "mov %c[rdi](%0), %%rdi \n\t" | ||
2363 | "mov %c[rbp](%0), %%rbp \n\t" | ||
2364 | "mov %c[r8](%0), %%r8 \n\t" | ||
2365 | "mov %c[r9](%0), %%r9 \n\t" | ||
2366 | "mov %c[r10](%0), %%r10 \n\t" | ||
2367 | "mov %c[r11](%0), %%r11 \n\t" | ||
2368 | "mov %c[r12](%0), %%r12 \n\t" | ||
2369 | "mov %c[r13](%0), %%r13 \n\t" | ||
2370 | "mov %c[r14](%0), %%r14 \n\t" | ||
2371 | "mov %c[r15](%0), %%r15 \n\t" | ||
2372 | "mov %c[rcx](%0), %%rcx \n\t" /* kills %0 (rcx) */ | ||
2373 | #else | ||
2374 | "mov %c[cr2](%0), %%eax \n\t" | ||
2375 | "mov %%eax, %%cr2 \n\t" | ||
2376 | "mov %c[rax](%0), %%eax \n\t" | ||
2377 | "mov %c[rbx](%0), %%ebx \n\t" | ||
2378 | "mov %c[rdx](%0), %%edx \n\t" | ||
2379 | "mov %c[rsi](%0), %%esi \n\t" | ||
2380 | "mov %c[rdi](%0), %%edi \n\t" | ||
2381 | "mov %c[rbp](%0), %%ebp \n\t" | ||
2382 | "mov %c[rcx](%0), %%ecx \n\t" /* kills %0 (ecx) */ | ||
2383 | #endif | ||
2384 | /* Enter guest mode */ | ||
2385 | "jne .Llaunched \n\t" | ||
2386 | ASM_VMX_VMLAUNCH "\n\t" | ||
2387 | "jmp .Lkvm_vmx_return \n\t" | ||
2388 | ".Llaunched: " ASM_VMX_VMRESUME "\n\t" | ||
2389 | ".Lkvm_vmx_return: " | ||
2390 | /* Save guest registers, load host registers, keep flags */ | ||
2391 | #ifdef CONFIG_X86_64 | ||
2392 | "xchg %0, (%%rsp) \n\t" | ||
2393 | "mov %%rax, %c[rax](%0) \n\t" | ||
2394 | "mov %%rbx, %c[rbx](%0) \n\t" | ||
2395 | "pushq (%%rsp); popq %c[rcx](%0) \n\t" | ||
2396 | "mov %%rdx, %c[rdx](%0) \n\t" | ||
2397 | "mov %%rsi, %c[rsi](%0) \n\t" | ||
2398 | "mov %%rdi, %c[rdi](%0) \n\t" | ||
2399 | "mov %%rbp, %c[rbp](%0) \n\t" | ||
2400 | "mov %%r8, %c[r8](%0) \n\t" | ||
2401 | "mov %%r9, %c[r9](%0) \n\t" | ||
2402 | "mov %%r10, %c[r10](%0) \n\t" | ||
2403 | "mov %%r11, %c[r11](%0) \n\t" | ||
2404 | "mov %%r12, %c[r12](%0) \n\t" | ||
2405 | "mov %%r13, %c[r13](%0) \n\t" | ||
2406 | "mov %%r14, %c[r14](%0) \n\t" | ||
2407 | "mov %%r15, %c[r15](%0) \n\t" | ||
2408 | "mov %%cr2, %%rax \n\t" | ||
2409 | "mov %%rax, %c[cr2](%0) \n\t" | ||
2410 | |||
2411 | "pop %%rbp; pop %%rbp; pop %%rdx \n\t" | ||
2412 | #else | ||
2413 | "xchg %0, (%%esp) \n\t" | ||
2414 | "mov %%eax, %c[rax](%0) \n\t" | ||
2415 | "mov %%ebx, %c[rbx](%0) \n\t" | ||
2416 | "pushl (%%esp); popl %c[rcx](%0) \n\t" | ||
2417 | "mov %%edx, %c[rdx](%0) \n\t" | ||
2418 | "mov %%esi, %c[rsi](%0) \n\t" | ||
2419 | "mov %%edi, %c[rdi](%0) \n\t" | ||
2420 | "mov %%ebp, %c[rbp](%0) \n\t" | ||
2421 | "mov %%cr2, %%eax \n\t" | ||
2422 | "mov %%eax, %c[cr2](%0) \n\t" | ||
2423 | |||
2424 | "pop %%ebp; pop %%ebp; pop %%edx \n\t" | ||
2425 | #endif | ||
2426 | "setbe %c[fail](%0) \n\t" | ||
2427 | : : "c"(vmx), "d"((unsigned long)HOST_RSP), | ||
2428 | [launched]"i"(offsetof(struct vcpu_vmx, launched)), | ||
2429 | [fail]"i"(offsetof(struct vcpu_vmx, fail)), | ||
2430 | [rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])), | ||
2431 | [rbx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBX])), | ||
2432 | [rcx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RCX])), | ||
2433 | [rdx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDX])), | ||
2434 | [rsi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RSI])), | ||
2435 | [rdi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDI])), | ||
2436 | [rbp]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBP])), | ||
2437 | #ifdef CONFIG_X86_64 | ||
2438 | [r8]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R8])), | ||
2439 | [r9]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R9])), | ||
2440 | [r10]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R10])), | ||
2441 | [r11]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R11])), | ||
2442 | [r12]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R12])), | ||
2443 | [r13]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R13])), | ||
2444 | [r14]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R14])), | ||
2445 | [r15]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R15])), | ||
2446 | #endif | ||
2447 | [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2)) | ||
2448 | : "cc", "memory" | ||
2449 | #ifdef CONFIG_X86_64 | ||
2450 | , "rbx", "rdi", "rsi" | ||
2451 | , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15" | ||
2452 | #else | ||
2453 | , "ebx", "edi", "rsi" | ||
2454 | #endif | ||
2455 | ); | ||
2456 | |||
2457 | vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); | ||
2458 | if (vmx->rmode.irq.pending) | ||
2459 | fixup_rmode_irq(vmx); | ||
2460 | |||
2461 | vcpu->arch.interrupt_window_open = | ||
2462 | (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0; | ||
2463 | |||
2464 | asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS)); | ||
2465 | vmx->launched = 1; | ||
2466 | |||
2467 | intr_info = vmcs_read32(VM_EXIT_INTR_INFO); | ||
2468 | |||
2469 | /* We need to handle NMIs before interrupts are enabled */ | ||
2470 | if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200) /* nmi */ | ||
2471 | asm("int $2"); | ||
2472 | } | ||
2473 | |||
2474 | static void vmx_free_vmcs(struct kvm_vcpu *vcpu) | ||
2475 | { | ||
2476 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
2477 | |||
2478 | if (vmx->vmcs) { | ||
2479 | on_each_cpu(__vcpu_clear, vmx, 0, 1); | ||
2480 | free_vmcs(vmx->vmcs); | ||
2481 | vmx->vmcs = NULL; | ||
2482 | } | ||
2483 | } | ||
2484 | |||
2485 | static void vmx_free_vcpu(struct kvm_vcpu *vcpu) | ||
2486 | { | ||
2487 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
2488 | |||
2489 | vmx_free_vmcs(vcpu); | ||
2490 | kfree(vmx->host_msrs); | ||
2491 | kfree(vmx->guest_msrs); | ||
2492 | kvm_vcpu_uninit(vcpu); | ||
2493 | kmem_cache_free(kvm_vcpu_cache, vmx); | ||
2494 | } | ||
2495 | |||
2496 | static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) | ||
2497 | { | ||
2498 | int err; | ||
2499 | struct vcpu_vmx *vmx = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); | ||
2500 | int cpu; | ||
2501 | |||
2502 | if (!vmx) | ||
2503 | return ERR_PTR(-ENOMEM); | ||
2504 | |||
2505 | err = kvm_vcpu_init(&vmx->vcpu, kvm, id); | ||
2506 | if (err) | ||
2507 | goto free_vcpu; | ||
2508 | |||
2509 | vmx->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL); | ||
2510 | if (!vmx->guest_msrs) { | ||
2511 | err = -ENOMEM; | ||
2512 | goto uninit_vcpu; | ||
2513 | } | ||
2514 | |||
2515 | vmx->host_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL); | ||
2516 | if (!vmx->host_msrs) | ||
2517 | goto free_guest_msrs; | ||
2518 | |||
2519 | vmx->vmcs = alloc_vmcs(); | ||
2520 | if (!vmx->vmcs) | ||
2521 | goto free_msrs; | ||
2522 | |||
2523 | vmcs_clear(vmx->vmcs); | ||
2524 | |||
2525 | cpu = get_cpu(); | ||
2526 | vmx_vcpu_load(&vmx->vcpu, cpu); | ||
2527 | err = vmx_vcpu_setup(vmx); | ||
2528 | vmx_vcpu_put(&vmx->vcpu); | ||
2529 | put_cpu(); | ||
2530 | if (err) | ||
2531 | goto free_vmcs; | ||
2532 | |||
2533 | return &vmx->vcpu; | ||
2534 | |||
2535 | free_vmcs: | ||
2536 | free_vmcs(vmx->vmcs); | ||
2537 | free_msrs: | ||
2538 | kfree(vmx->host_msrs); | ||
2539 | free_guest_msrs: | ||
2540 | kfree(vmx->guest_msrs); | ||
2541 | uninit_vcpu: | ||
2542 | kvm_vcpu_uninit(&vmx->vcpu); | ||
2543 | free_vcpu: | ||
2544 | kmem_cache_free(kvm_vcpu_cache, vmx); | ||
2545 | return ERR_PTR(err); | ||
2546 | } | ||
2547 | |||
2548 | static void __init vmx_check_processor_compat(void *rtn) | ||
2549 | { | ||
2550 | struct vmcs_config vmcs_conf; | ||
2551 | |||
2552 | *(int *)rtn = 0; | ||
2553 | if (setup_vmcs_config(&vmcs_conf) < 0) | ||
2554 | *(int *)rtn = -EIO; | ||
2555 | if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config)) != 0) { | ||
2556 | printk(KERN_ERR "kvm: CPU %d feature inconsistency!\n", | ||
2557 | smp_processor_id()); | ||
2558 | *(int *)rtn = -EIO; | ||
2559 | } | ||
2560 | } | ||
2561 | |||
2562 | static struct kvm_x86_ops vmx_x86_ops = { | ||
2563 | .cpu_has_kvm_support = cpu_has_kvm_support, | ||
2564 | .disabled_by_bios = vmx_disabled_by_bios, | ||
2565 | .hardware_setup = hardware_setup, | ||
2566 | .hardware_unsetup = hardware_unsetup, | ||
2567 | .check_processor_compatibility = vmx_check_processor_compat, | ||
2568 | .hardware_enable = hardware_enable, | ||
2569 | .hardware_disable = hardware_disable, | ||
2570 | |||
2571 | .vcpu_create = vmx_create_vcpu, | ||
2572 | .vcpu_free = vmx_free_vcpu, | ||
2573 | .vcpu_reset = vmx_vcpu_reset, | ||
2574 | |||
2575 | .prepare_guest_switch = vmx_save_host_state, | ||
2576 | .vcpu_load = vmx_vcpu_load, | ||
2577 | .vcpu_put = vmx_vcpu_put, | ||
2578 | .vcpu_decache = vmx_vcpu_decache, | ||
2579 | |||
2580 | .set_guest_debug = set_guest_debug, | ||
2581 | .guest_debug_pre = kvm_guest_debug_pre, | ||
2582 | .get_msr = vmx_get_msr, | ||
2583 | .set_msr = vmx_set_msr, | ||
2584 | .get_segment_base = vmx_get_segment_base, | ||
2585 | .get_segment = vmx_get_segment, | ||
2586 | .set_segment = vmx_set_segment, | ||
2587 | .get_cs_db_l_bits = vmx_get_cs_db_l_bits, | ||
2588 | .decache_cr4_guest_bits = vmx_decache_cr4_guest_bits, | ||
2589 | .set_cr0 = vmx_set_cr0, | ||
2590 | .set_cr3 = vmx_set_cr3, | ||
2591 | .set_cr4 = vmx_set_cr4, | ||
2592 | #ifdef CONFIG_X86_64 | ||
2593 | .set_efer = vmx_set_efer, | ||
2594 | #endif | ||
2595 | .get_idt = vmx_get_idt, | ||
2596 | .set_idt = vmx_set_idt, | ||
2597 | .get_gdt = vmx_get_gdt, | ||
2598 | .set_gdt = vmx_set_gdt, | ||
2599 | .cache_regs = vcpu_load_rsp_rip, | ||
2600 | .decache_regs = vcpu_put_rsp_rip, | ||
2601 | .get_rflags = vmx_get_rflags, | ||
2602 | .set_rflags = vmx_set_rflags, | ||
2603 | |||
2604 | .tlb_flush = vmx_flush_tlb, | ||
2605 | |||
2606 | .run = vmx_vcpu_run, | ||
2607 | .handle_exit = kvm_handle_exit, | ||
2608 | .skip_emulated_instruction = skip_emulated_instruction, | ||
2609 | .patch_hypercall = vmx_patch_hypercall, | ||
2610 | .get_irq = vmx_get_irq, | ||
2611 | .set_irq = vmx_inject_irq, | ||
2612 | .queue_exception = vmx_queue_exception, | ||
2613 | .exception_injected = vmx_exception_injected, | ||
2614 | .inject_pending_irq = vmx_intr_assist, | ||
2615 | .inject_pending_vectors = do_interrupt_requests, | ||
2616 | |||
2617 | .set_tss_addr = vmx_set_tss_addr, | ||
2618 | }; | ||
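The vmx_x86_ops table above is the only surface the arch-neutral KVM core sees; once kvm_init() has registered it, generic code reaches the VMX backend purely through these pointers. A minimal sketch of that dispatch (mirroring the kvm_arch_vcpu_load wrapper that appears later in x86.c; the function name here is illustrative only):

/* hedged sketch, not part of this patch */
void example_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
	kvm_x86_ops->vcpu_load(vcpu, cpu);	/* resolves to vmx_vcpu_load */
}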
2619 | |||
2620 | static int __init vmx_init(void) | ||
2621 | { | ||
2622 | void *iova; | ||
2623 | int r; | ||
2624 | |||
2625 | vmx_io_bitmap_a = alloc_page(GFP_KERNEL | __GFP_HIGHMEM); | ||
2626 | if (!vmx_io_bitmap_a) | ||
2627 | return -ENOMEM; | ||
2628 | |||
2629 | vmx_io_bitmap_b = alloc_page(GFP_KERNEL | __GFP_HIGHMEM); | ||
2630 | if (!vmx_io_bitmap_b) { | ||
2631 | r = -ENOMEM; | ||
2632 | goto out; | ||
2633 | } | ||
2634 | |||
2635 | /* | ||
2636 | * Allow direct access to the PC debug port (it is often used for I/O | ||
2637 | * delays, but the vmexits simply slow things down). | ||
2638 | */ | ||
2639 | iova = kmap(vmx_io_bitmap_a); | ||
2640 | memset(iova, 0xff, PAGE_SIZE); | ||
2641 | clear_bit(0x80, iova); | ||
2642 | kunmap(vmx_io_bitmap_a); | ||
2643 | |||
2644 | iova = kmap(vmx_io_bitmap_b); | ||
2645 | memset(iova, 0xff, PAGE_SIZE); | ||
2646 | kunmap(vmx_io_bitmap_b); | ||
2647 | |||
2648 | r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx), THIS_MODULE); | ||
2649 | if (r) | ||
2650 | goto out1; | ||
2651 | |||
2652 | if (bypass_guest_pf) | ||
2653 | kvm_mmu_set_nonpresent_ptes(~0xffeull, 0ull); | ||
2654 | |||
2655 | return 0; | ||
2656 | |||
2657 | out1: | ||
2658 | __free_page(vmx_io_bitmap_b); | ||
2659 | out: | ||
2660 | __free_page(vmx_io_bitmap_a); | ||
2661 | return r; | ||
2662 | } | ||
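The comment inside vmx_init() above explains why the I/O bitmaps are filled with ones and only bit 0x80 is cleared: a set bit in IO_BITMAP_A (which covers ports 0x0000-0x7fff) forces a VM exit, so leaving port 0x80 clear lets the guest's I/O-delay writes go straight through. A minimal sketch of that bit-to-port relationship, assuming the standard VMX bitmap rule:

/* illustrative only; assumed helper name */
static int io_port_would_exit(const unsigned long *bitmap_a, u16 port)
{
	return test_bit(port, bitmap_a);	/* 0 only for port 0x80 here */
}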
2663 | |||
2664 | static void __exit vmx_exit(void) | ||
2665 | { | ||
2666 | __free_page(vmx_io_bitmap_b); | ||
2667 | __free_page(vmx_io_bitmap_a); | ||
2668 | |||
2669 | kvm_exit(); | ||
2670 | } | ||
2671 | |||
2672 | module_init(vmx_init) | ||
2673 | module_exit(vmx_exit) | ||
diff --git a/drivers/kvm/vmx.h b/drivers/kvm/vmx.h deleted file mode 100644 index d52ae8d7303d..000000000000 --- a/drivers/kvm/vmx.h +++ /dev/null | |||
@@ -1,324 +0,0 @@ | |||
1 | #ifndef VMX_H | ||
2 | #define VMX_H | ||
3 | |||
4 | /* | ||
5 | * vmx.h: VMX Architecture related definitions | ||
6 | * Copyright (c) 2004, Intel Corporation. | ||
7 | * | ||
8 | * This program is free software; you can redistribute it and/or modify it | ||
9 | * under the terms and conditions of the GNU General Public License, | ||
10 | * version 2, as published by the Free Software Foundation. | ||
11 | * | ||
12 | * This program is distributed in the hope it will be useful, but WITHOUT | ||
13 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | ||
14 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for | ||
15 | * more details. | ||
16 | * | ||
17 | * You should have received a copy of the GNU General Public License along with | ||
18 | * this program; if not, write to the Free Software Foundation, Inc., 59 Temple | ||
19 | * Place - Suite 330, Boston, MA 02111-1307 USA. | ||
20 | * | ||
21 | * A few random additions are: | ||
22 | * Copyright (C) 2006 Qumranet | ||
23 | * Avi Kivity <avi@qumranet.com> | ||
24 | * Yaniv Kamay <yaniv@qumranet.com> | ||
25 | * | ||
26 | */ | ||
27 | |||
28 | /* | ||
29 | * Definitions of Primary Processor-Based VM-Execution Controls. | ||
30 | */ | ||
31 | #define CPU_BASED_VIRTUAL_INTR_PENDING 0x00000004 | ||
32 | #define CPU_BASED_USE_TSC_OFFSETING 0x00000008 | ||
33 | #define CPU_BASED_HLT_EXITING 0x00000080 | ||
34 | #define CPU_BASED_INVLPG_EXITING 0x00000200 | ||
35 | #define CPU_BASED_MWAIT_EXITING 0x00000400 | ||
36 | #define CPU_BASED_RDPMC_EXITING 0x00000800 | ||
37 | #define CPU_BASED_RDTSC_EXITING 0x00001000 | ||
38 | #define CPU_BASED_CR8_LOAD_EXITING 0x00080000 | ||
39 | #define CPU_BASED_CR8_STORE_EXITING 0x00100000 | ||
40 | #define CPU_BASED_TPR_SHADOW 0x00200000 | ||
41 | #define CPU_BASED_MOV_DR_EXITING 0x00800000 | ||
42 | #define CPU_BASED_UNCOND_IO_EXITING 0x01000000 | ||
43 | #define CPU_BASED_USE_IO_BITMAPS 0x02000000 | ||
44 | #define CPU_BASED_USE_MSR_BITMAPS 0x10000000 | ||
45 | #define CPU_BASED_MONITOR_EXITING 0x20000000 | ||
46 | #define CPU_BASED_PAUSE_EXITING 0x40000000 | ||
47 | #define CPU_BASED_ACTIVATE_SECONDARY_CONTROLS 0x80000000 | ||
48 | /* | ||
49 | * Definitions of Secondary Processor-Based VM-Execution Controls. | ||
50 | */ | ||
51 | #define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001 | ||
52 | #define SECONDARY_EXEC_WBINVD_EXITING 0x00000040 | ||
53 | |||
54 | |||
55 | #define PIN_BASED_EXT_INTR_MASK 0x00000001 | ||
56 | #define PIN_BASED_NMI_EXITING 0x00000008 | ||
57 | #define PIN_BASED_VIRTUAL_NMIS 0x00000020 | ||
58 | |||
59 | #define VM_EXIT_HOST_ADDR_SPACE_SIZE 0x00000200 | ||
60 | #define VM_EXIT_ACK_INTR_ON_EXIT 0x00008000 | ||
61 | |||
62 | #define VM_ENTRY_IA32E_MODE 0x00000200 | ||
63 | #define VM_ENTRY_SMM 0x00000400 | ||
64 | #define VM_ENTRY_DEACT_DUAL_MONITOR 0x00000800 | ||
65 | |||
66 | /* VMCS Encodings */ | ||
67 | enum vmcs_field { | ||
68 | GUEST_ES_SELECTOR = 0x00000800, | ||
69 | GUEST_CS_SELECTOR = 0x00000802, | ||
70 | GUEST_SS_SELECTOR = 0x00000804, | ||
71 | GUEST_DS_SELECTOR = 0x00000806, | ||
72 | GUEST_FS_SELECTOR = 0x00000808, | ||
73 | GUEST_GS_SELECTOR = 0x0000080a, | ||
74 | GUEST_LDTR_SELECTOR = 0x0000080c, | ||
75 | GUEST_TR_SELECTOR = 0x0000080e, | ||
76 | HOST_ES_SELECTOR = 0x00000c00, | ||
77 | HOST_CS_SELECTOR = 0x00000c02, | ||
78 | HOST_SS_SELECTOR = 0x00000c04, | ||
79 | HOST_DS_SELECTOR = 0x00000c06, | ||
80 | HOST_FS_SELECTOR = 0x00000c08, | ||
81 | HOST_GS_SELECTOR = 0x00000c0a, | ||
82 | HOST_TR_SELECTOR = 0x00000c0c, | ||
83 | IO_BITMAP_A = 0x00002000, | ||
84 | IO_BITMAP_A_HIGH = 0x00002001, | ||
85 | IO_BITMAP_B = 0x00002002, | ||
86 | IO_BITMAP_B_HIGH = 0x00002003, | ||
87 | MSR_BITMAP = 0x00002004, | ||
88 | MSR_BITMAP_HIGH = 0x00002005, | ||
89 | VM_EXIT_MSR_STORE_ADDR = 0x00002006, | ||
90 | VM_EXIT_MSR_STORE_ADDR_HIGH = 0x00002007, | ||
91 | VM_EXIT_MSR_LOAD_ADDR = 0x00002008, | ||
92 | VM_EXIT_MSR_LOAD_ADDR_HIGH = 0x00002009, | ||
93 | VM_ENTRY_MSR_LOAD_ADDR = 0x0000200a, | ||
94 | VM_ENTRY_MSR_LOAD_ADDR_HIGH = 0x0000200b, | ||
95 | TSC_OFFSET = 0x00002010, | ||
96 | TSC_OFFSET_HIGH = 0x00002011, | ||
97 | VIRTUAL_APIC_PAGE_ADDR = 0x00002012, | ||
98 | VIRTUAL_APIC_PAGE_ADDR_HIGH = 0x00002013, | ||
99 | APIC_ACCESS_ADDR = 0x00002014, | ||
100 | APIC_ACCESS_ADDR_HIGH = 0x00002015, | ||
101 | VMCS_LINK_POINTER = 0x00002800, | ||
102 | VMCS_LINK_POINTER_HIGH = 0x00002801, | ||
103 | GUEST_IA32_DEBUGCTL = 0x00002802, | ||
104 | GUEST_IA32_DEBUGCTL_HIGH = 0x00002803, | ||
105 | PIN_BASED_VM_EXEC_CONTROL = 0x00004000, | ||
106 | CPU_BASED_VM_EXEC_CONTROL = 0x00004002, | ||
107 | EXCEPTION_BITMAP = 0x00004004, | ||
108 | PAGE_FAULT_ERROR_CODE_MASK = 0x00004006, | ||
109 | PAGE_FAULT_ERROR_CODE_MATCH = 0x00004008, | ||
110 | CR3_TARGET_COUNT = 0x0000400a, | ||
111 | VM_EXIT_CONTROLS = 0x0000400c, | ||
112 | VM_EXIT_MSR_STORE_COUNT = 0x0000400e, | ||
113 | VM_EXIT_MSR_LOAD_COUNT = 0x00004010, | ||
114 | VM_ENTRY_CONTROLS = 0x00004012, | ||
115 | VM_ENTRY_MSR_LOAD_COUNT = 0x00004014, | ||
116 | VM_ENTRY_INTR_INFO_FIELD = 0x00004016, | ||
117 | VM_ENTRY_EXCEPTION_ERROR_CODE = 0x00004018, | ||
118 | VM_ENTRY_INSTRUCTION_LEN = 0x0000401a, | ||
119 | TPR_THRESHOLD = 0x0000401c, | ||
120 | SECONDARY_VM_EXEC_CONTROL = 0x0000401e, | ||
121 | VM_INSTRUCTION_ERROR = 0x00004400, | ||
122 | VM_EXIT_REASON = 0x00004402, | ||
123 | VM_EXIT_INTR_INFO = 0x00004404, | ||
124 | VM_EXIT_INTR_ERROR_CODE = 0x00004406, | ||
125 | IDT_VECTORING_INFO_FIELD = 0x00004408, | ||
126 | IDT_VECTORING_ERROR_CODE = 0x0000440a, | ||
127 | VM_EXIT_INSTRUCTION_LEN = 0x0000440c, | ||
128 | VMX_INSTRUCTION_INFO = 0x0000440e, | ||
129 | GUEST_ES_LIMIT = 0x00004800, | ||
130 | GUEST_CS_LIMIT = 0x00004802, | ||
131 | GUEST_SS_LIMIT = 0x00004804, | ||
132 | GUEST_DS_LIMIT = 0x00004806, | ||
133 | GUEST_FS_LIMIT = 0x00004808, | ||
134 | GUEST_GS_LIMIT = 0x0000480a, | ||
135 | GUEST_LDTR_LIMIT = 0x0000480c, | ||
136 | GUEST_TR_LIMIT = 0x0000480e, | ||
137 | GUEST_GDTR_LIMIT = 0x00004810, | ||
138 | GUEST_IDTR_LIMIT = 0x00004812, | ||
139 | GUEST_ES_AR_BYTES = 0x00004814, | ||
140 | GUEST_CS_AR_BYTES = 0x00004816, | ||
141 | GUEST_SS_AR_BYTES = 0x00004818, | ||
142 | GUEST_DS_AR_BYTES = 0x0000481a, | ||
143 | GUEST_FS_AR_BYTES = 0x0000481c, | ||
144 | GUEST_GS_AR_BYTES = 0x0000481e, | ||
145 | GUEST_LDTR_AR_BYTES = 0x00004820, | ||
146 | GUEST_TR_AR_BYTES = 0x00004822, | ||
147 | GUEST_INTERRUPTIBILITY_INFO = 0x00004824, | ||
148 | GUEST_ACTIVITY_STATE = 0x00004826, | ||
149 | GUEST_SYSENTER_CS = 0x0000482a, | ||

150 | HOST_IA32_SYSENTER_CS = 0x00004c00, | ||
151 | CR0_GUEST_HOST_MASK = 0x00006000, | ||
152 | CR4_GUEST_HOST_MASK = 0x00006002, | ||
153 | CR0_READ_SHADOW = 0x00006004, | ||
154 | CR4_READ_SHADOW = 0x00006006, | ||
155 | CR3_TARGET_VALUE0 = 0x00006008, | ||
156 | CR3_TARGET_VALUE1 = 0x0000600a, | ||
157 | CR3_TARGET_VALUE2 = 0x0000600c, | ||
158 | CR3_TARGET_VALUE3 = 0x0000600e, | ||
159 | EXIT_QUALIFICATION = 0x00006400, | ||
160 | GUEST_LINEAR_ADDRESS = 0x0000640a, | ||
161 | GUEST_CR0 = 0x00006800, | ||
162 | GUEST_CR3 = 0x00006802, | ||
163 | GUEST_CR4 = 0x00006804, | ||
164 | GUEST_ES_BASE = 0x00006806, | ||
165 | GUEST_CS_BASE = 0x00006808, | ||
166 | GUEST_SS_BASE = 0x0000680a, | ||
167 | GUEST_DS_BASE = 0x0000680c, | ||
168 | GUEST_FS_BASE = 0x0000680e, | ||
169 | GUEST_GS_BASE = 0x00006810, | ||
170 | GUEST_LDTR_BASE = 0x00006812, | ||
171 | GUEST_TR_BASE = 0x00006814, | ||
172 | GUEST_GDTR_BASE = 0x00006816, | ||
173 | GUEST_IDTR_BASE = 0x00006818, | ||
174 | GUEST_DR7 = 0x0000681a, | ||
175 | GUEST_RSP = 0x0000681c, | ||
176 | GUEST_RIP = 0x0000681e, | ||
177 | GUEST_RFLAGS = 0x00006820, | ||
178 | GUEST_PENDING_DBG_EXCEPTIONS = 0x00006822, | ||
179 | GUEST_SYSENTER_ESP = 0x00006824, | ||
180 | GUEST_SYSENTER_EIP = 0x00006826, | ||
181 | HOST_CR0 = 0x00006c00, | ||
182 | HOST_CR3 = 0x00006c02, | ||
183 | HOST_CR4 = 0x00006c04, | ||
184 | HOST_FS_BASE = 0x00006c06, | ||
185 | HOST_GS_BASE = 0x00006c08, | ||
186 | HOST_TR_BASE = 0x00006c0a, | ||
187 | HOST_GDTR_BASE = 0x00006c0c, | ||
188 | HOST_IDTR_BASE = 0x00006c0e, | ||
189 | HOST_IA32_SYSENTER_ESP = 0x00006c10, | ||
190 | HOST_IA32_SYSENTER_EIP = 0x00006c12, | ||
191 | HOST_RSP = 0x00006c14, | ||
192 | HOST_RIP = 0x00006c16, | ||
193 | }; | ||
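A hedged note on the encodings above (this follows the VMX architecture rather than anything stated in the header): bits 14:13 of each value give the field width (0 = 16-bit, 1 = 64-bit, 2 = 32-bit, 3 = natural width) and bits 11:10 its group (control, exit information, guest state, host state); for 64-bit fields the *_HIGH companion is the same encoding with bit 0 set, e.g. IO_BITMAP_A (0x2000) and IO_BITMAP_A_HIGH (0x2001).

/* assumed helper, for illustration only */
static inline unsigned long vmcs_field_high(enum vmcs_field field)
{
	return field | 1;	/* upper-32-bit companion of a 64-bit field */
}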
194 | |||
195 | #define VMX_EXIT_REASONS_FAILED_VMENTRY 0x80000000 | ||
196 | |||
197 | #define EXIT_REASON_EXCEPTION_NMI 0 | ||
198 | #define EXIT_REASON_EXTERNAL_INTERRUPT 1 | ||
199 | #define EXIT_REASON_TRIPLE_FAULT 2 | ||
200 | |||
201 | #define EXIT_REASON_PENDING_INTERRUPT 7 | ||
202 | |||
203 | #define EXIT_REASON_TASK_SWITCH 9 | ||
204 | #define EXIT_REASON_CPUID 10 | ||
205 | #define EXIT_REASON_HLT 12 | ||
206 | #define EXIT_REASON_INVLPG 14 | ||
207 | #define EXIT_REASON_RDPMC 15 | ||
208 | #define EXIT_REASON_RDTSC 16 | ||
209 | #define EXIT_REASON_VMCALL 18 | ||
210 | #define EXIT_REASON_VMCLEAR 19 | ||
211 | #define EXIT_REASON_VMLAUNCH 20 | ||
212 | #define EXIT_REASON_VMPTRLD 21 | ||
213 | #define EXIT_REASON_VMPTRST 22 | ||
214 | #define EXIT_REASON_VMREAD 23 | ||
215 | #define EXIT_REASON_VMRESUME 24 | ||
216 | #define EXIT_REASON_VMWRITE 25 | ||
217 | #define EXIT_REASON_VMOFF 26 | ||
218 | #define EXIT_REASON_VMON 27 | ||
219 | #define EXIT_REASON_CR_ACCESS 28 | ||
220 | #define EXIT_REASON_DR_ACCESS 29 | ||
221 | #define EXIT_REASON_IO_INSTRUCTION 30 | ||
222 | #define EXIT_REASON_MSR_READ 31 | ||
223 | #define EXIT_REASON_MSR_WRITE 32 | ||
224 | #define EXIT_REASON_MWAIT_INSTRUCTION 36 | ||
225 | #define EXIT_REASON_TPR_BELOW_THRESHOLD 43 | ||
226 | #define EXIT_REASON_APIC_ACCESS 44 | ||
227 | #define EXIT_REASON_WBINVD 54 | ||
228 | |||
229 | /* | ||
230 | * Interruption-information format | ||
231 | */ | ||
232 | #define INTR_INFO_VECTOR_MASK 0xff /* 7:0 */ | ||
233 | #define INTR_INFO_INTR_TYPE_MASK 0x700 /* 10:8 */ | ||
234 | #define INTR_INFO_DELIEVER_CODE_MASK 0x800 /* 11 */ | ||
235 | #define INTR_INFO_VALID_MASK 0x80000000 /* 31 */ | ||
236 | |||
237 | #define VECTORING_INFO_VECTOR_MASK INTR_INFO_VECTOR_MASK | ||
238 | #define VECTORING_INFO_TYPE_MASK INTR_INFO_INTR_TYPE_MASK | ||
239 | #define VECTORING_INFO_DELIEVER_CODE_MASK INTR_INFO_DELIEVER_CODE_MASK | ||
240 | #define VECTORING_INFO_VALID_MASK INTR_INFO_VALID_MASK | ||
241 | |||
242 | #define INTR_TYPE_EXT_INTR (0 << 8) /* external interrupt */ | ||
243 | #define INTR_TYPE_EXCEPTION (3 << 8) /* processor exception */ | ||
244 | #define INTR_TYPE_SOFT_INTR (4 << 8) /* software interrupt */ | ||
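A small decode sketch for a VM_EXIT_INTR_INFO value, using only the masks above; the NMI type (2 << 8) is the same constant that the exception handler in vmx.c compares against as the literal 0x200 (the helper names are assumptions for illustration):

static inline u32 intr_info_vector(u32 intr_info)
{
	return intr_info & INTR_INFO_VECTOR_MASK;
}

static inline int intr_info_is_valid_nmi(u32 intr_info)
{
	return (intr_info & INTR_INFO_VALID_MASK) &&
	       (intr_info & INTR_INFO_INTR_TYPE_MASK) == (2 << 8);	/* NMI */
}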
245 | |||
246 | /* | ||
247 | * Exit Qualifications for MOV for Control Register Access | ||
248 | */ | ||
249 | #define CONTROL_REG_ACCESS_NUM 0x7 /* 2:0, number of control reg.*/ | ||
250 | #define CONTROL_REG_ACCESS_TYPE 0x30 /* 5:4, access type */ | ||
251 | #define CONTROL_REG_ACCESS_REG 0xf00 /* 10:8, general purpose reg. */ | ||
252 | #define LMSW_SOURCE_DATA_SHIFT 16 | ||
253 | #define LMSW_SOURCE_DATA (0xFFFF << LMSW_SOURCE_DATA_SHIFT) /* 16:31 lmsw source */ | ||
254 | #define REG_EAX (0 << 8) | ||
255 | #define REG_ECX (1 << 8) | ||
256 | #define REG_EDX (2 << 8) | ||
257 | #define REG_EBX (3 << 8) | ||
258 | #define REG_ESP (4 << 8) | ||
259 | #define REG_EBP (5 << 8) | ||
260 | #define REG_ESI (6 << 8) | ||
261 | #define REG_EDI (7 << 8) | ||
262 | #define REG_R8 (8 << 8) | ||
263 | #define REG_R9 (9 << 8) | ||
264 | #define REG_R10 (10 << 8) | ||
265 | #define REG_R11 (11 << 8) | ||
266 | #define REG_R12 (12 << 8) | ||
267 | #define REG_R13 (13 << 8) | ||
268 | #define REG_R14 (14 << 8) | ||
269 | #define REG_R15 (15 << 8) | ||
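A hedged sketch of how an EXIT_QUALIFICATION value for a CR-access exit is picked apart with the masks above, with bit positions as given in the comments (helper names are illustrative):

static inline int cr_access_cr_number(unsigned long qual)
{
	return qual & CONTROL_REG_ACCESS_NUM;		/* bits 2:0 */
}

static inline int cr_access_gpr(unsigned long qual)
{
	return (qual & CONTROL_REG_ACCESS_REG) >> 8;	/* bits 10:8, matches REG_* */
}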
270 | |||
271 | /* | ||
272 | * Exit Qualifications for MOV for Debug Register Access | ||
273 | */ | ||
274 | #define DEBUG_REG_ACCESS_NUM 0x7 /* 2:0, number of debug reg. */ | ||
275 | #define DEBUG_REG_ACCESS_TYPE 0x10 /* 4, direction of access */ | ||
276 | #define TYPE_MOV_TO_DR (0 << 4) | ||
277 | #define TYPE_MOV_FROM_DR (1 << 4) | ||
278 | #define DEBUG_REG_ACCESS_REG 0xf00 /* 11:8, general purpose reg. */ | ||
279 | |||
280 | |||
281 | /* segment AR */ | ||
282 | #define SEGMENT_AR_L_MASK (1 << 13) | ||
283 | |||
284 | #define AR_TYPE_ACCESSES_MASK 1 | ||
285 | #define AR_TYPE_READABLE_MASK (1 << 1) | ||
286 | #define AR_TYPE_WRITEABLE_MASK (1 << 2) | ||
287 | #define AR_TYPE_CODE_MASK (1 << 3) | ||
288 | #define AR_TYPE_MASK 0x0f | ||
289 | #define AR_TYPE_BUSY_64_TSS 11 | ||
290 | #define AR_TYPE_BUSY_32_TSS 11 | ||
291 | #define AR_TYPE_BUSY_16_TSS 3 | ||
292 | #define AR_TYPE_LDT 2 | ||
293 | |||
294 | #define AR_UNUSABLE_MASK (1 << 16) | ||
295 | #define AR_S_MASK (1 << 4) | ||
296 | #define AR_P_MASK (1 << 7) | ||
297 | #define AR_L_MASK (1 << 13) | ||
298 | #define AR_DB_MASK (1 << 14) | ||
299 | #define AR_G_MASK (1 << 15) | ||
300 | #define AR_DPL_SHIFT 5 | ||
301 | #define AR_DPL(ar) (((ar) >> AR_DPL_SHIFT) & 3) | ||
302 | |||
303 | #define AR_RESERVD_MASK 0xfffe0f00 | ||
304 | |||
305 | #define MSR_IA32_VMX_BASIC 0x480 | ||
306 | #define MSR_IA32_VMX_PINBASED_CTLS 0x481 | ||
307 | #define MSR_IA32_VMX_PROCBASED_CTLS 0x482 | ||
308 | #define MSR_IA32_VMX_EXIT_CTLS 0x483 | ||
309 | #define MSR_IA32_VMX_ENTRY_CTLS 0x484 | ||
310 | #define MSR_IA32_VMX_MISC 0x485 | ||
311 | #define MSR_IA32_VMX_CR0_FIXED0 0x486 | ||
312 | #define MSR_IA32_VMX_CR0_FIXED1 0x487 | ||
313 | #define MSR_IA32_VMX_CR4_FIXED0 0x488 | ||
314 | #define MSR_IA32_VMX_CR4_FIXED1 0x489 | ||
315 | #define MSR_IA32_VMX_VMCS_ENUM 0x48a | ||
316 | #define MSR_IA32_VMX_PROCBASED_CTLS2 0x48b | ||
317 | |||
318 | #define MSR_IA32_FEATURE_CONTROL 0x3a | ||
319 | #define MSR_IA32_FEATURE_CONTROL_LOCKED 0x1 | ||
320 | #define MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED 0x4 | ||
321 | |||
322 | #define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT 9 | ||
323 | |||
324 | #endif | ||
diff --git a/drivers/kvm/x86.c b/drivers/kvm/x86.c deleted file mode 100644 index b37c0093d728..000000000000 --- a/drivers/kvm/x86.c +++ /dev/null | |||
@@ -1,3148 +0,0 @@ | |||
1 | /* | ||
2 | * Kernel-based Virtual Machine driver for Linux | ||
3 | * | ||
4 | * derived from drivers/kvm/kvm_main.c | ||
5 | * | ||
6 | * Copyright (C) 2006 Qumranet, Inc. | ||
7 | * | ||
8 | * Authors: | ||
9 | * Avi Kivity <avi@qumranet.com> | ||
10 | * Yaniv Kamay <yaniv@qumranet.com> | ||
11 | * | ||
12 | * This work is licensed under the terms of the GNU GPL, version 2. See | ||
13 | * the COPYING file in the top-level directory. | ||
14 | * | ||
15 | */ | ||
16 | |||
17 | #include "kvm.h" | ||
18 | #include "x86.h" | ||
19 | #include "x86_emulate.h" | ||
20 | #include "segment_descriptor.h" | ||
21 | #include "irq.h" | ||
22 | #include "mmu.h" | ||
23 | |||
24 | #include <linux/kvm.h> | ||
25 | #include <linux/fs.h> | ||
26 | #include <linux/vmalloc.h> | ||
27 | #include <linux/module.h> | ||
28 | #include <linux/mman.h> | ||
29 | #include <linux/highmem.h> | ||
30 | |||
31 | #include <asm/uaccess.h> | ||
32 | #include <asm/msr.h> | ||
33 | |||
34 | #define MAX_IO_MSRS 256 | ||
35 | #define CR0_RESERVED_BITS \ | ||
36 | (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \ | ||
37 | | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \ | ||
38 | | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG)) | ||
39 | #define CR4_RESERVED_BITS \ | ||
40 | (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\ | ||
41 | | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \ | ||
42 | | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR \ | ||
43 | | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE)) | ||
44 | |||
45 | #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR) | ||
46 | #define EFER_RESERVED_BITS 0xfffffffffffff2fe | ||
47 | |||
48 | #define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM | ||
49 | #define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU | ||
50 | |||
51 | struct kvm_x86_ops *kvm_x86_ops; | ||
52 | |||
53 | struct kvm_stats_debugfs_item debugfs_entries[] = { | ||
54 | { "pf_fixed", VCPU_STAT(pf_fixed) }, | ||
55 | { "pf_guest", VCPU_STAT(pf_guest) }, | ||
56 | { "tlb_flush", VCPU_STAT(tlb_flush) }, | ||
57 | { "invlpg", VCPU_STAT(invlpg) }, | ||
58 | { "exits", VCPU_STAT(exits) }, | ||
59 | { "io_exits", VCPU_STAT(io_exits) }, | ||
60 | { "mmio_exits", VCPU_STAT(mmio_exits) }, | ||
61 | { "signal_exits", VCPU_STAT(signal_exits) }, | ||
62 | { "irq_window", VCPU_STAT(irq_window_exits) }, | ||
63 | { "halt_exits", VCPU_STAT(halt_exits) }, | ||
64 | { "halt_wakeup", VCPU_STAT(halt_wakeup) }, | ||
65 | { "request_irq", VCPU_STAT(request_irq_exits) }, | ||
66 | { "irq_exits", VCPU_STAT(irq_exits) }, | ||
67 | { "host_state_reload", VCPU_STAT(host_state_reload) }, | ||
68 | { "efer_reload", VCPU_STAT(efer_reload) }, | ||
69 | { "fpu_reload", VCPU_STAT(fpu_reload) }, | ||
70 | { "insn_emulation", VCPU_STAT(insn_emulation) }, | ||
71 | { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) }, | ||
72 | { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) }, | ||
73 | { "mmu_pte_write", VM_STAT(mmu_pte_write) }, | ||
74 | { "mmu_pte_updated", VM_STAT(mmu_pte_updated) }, | ||
75 | { "mmu_pde_zapped", VM_STAT(mmu_pde_zapped) }, | ||
76 | { "mmu_flooded", VM_STAT(mmu_flooded) }, | ||
77 | { "mmu_recycled", VM_STAT(mmu_recycled) }, | ||
78 | { "remote_tlb_flush", VM_STAT(remote_tlb_flush) }, | ||
79 | { NULL } | ||
80 | }; | ||
81 | |||
82 | |||
83 | unsigned long segment_base(u16 selector) | ||
84 | { | ||
85 | struct descriptor_table gdt; | ||
86 | struct segment_descriptor *d; | ||
87 | unsigned long table_base; | ||
88 | unsigned long v; | ||
89 | |||
90 | if (selector == 0) | ||
91 | return 0; | ||
92 | |||
93 | asm("sgdt %0" : "=m"(gdt)); | ||
94 | table_base = gdt.base; | ||
95 | |||
96 | if (selector & 4) { /* from ldt */ | ||
97 | u16 ldt_selector; | ||
98 | |||
99 | asm("sldt %0" : "=g"(ldt_selector)); | ||
100 | table_base = segment_base(ldt_selector); | ||
101 | } | ||
102 | d = (struct segment_descriptor *)(table_base + (selector & ~7)); | ||
103 | v = d->base_low | ((unsigned long)d->base_mid << 16) | | ||
104 | ((unsigned long)d->base_high << 24); | ||
105 | #ifdef CONFIG_X86_64 | ||
106 | if (d->system == 0 && (d->type == 2 || d->type == 9 || d->type == 11)) | ||
107 | v |= ((unsigned long) \ | ||
108 | ((struct segment_descriptor_64 *)d)->base_higher) << 32; | ||
109 | #endif | ||
110 | return v; | ||
111 | } | ||
112 | EXPORT_SYMBOL_GPL(segment_base); | ||
113 | |||
114 | u64 kvm_get_apic_base(struct kvm_vcpu *vcpu) | ||
115 | { | ||
116 | if (irqchip_in_kernel(vcpu->kvm)) | ||
117 | return vcpu->arch.apic_base; | ||
118 | else | ||
119 | return vcpu->arch.apic_base; | ||
120 | } | ||
121 | EXPORT_SYMBOL_GPL(kvm_get_apic_base); | ||
122 | |||
123 | void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data) | ||
124 | { | ||
125 | /* TODO: reserve bits check */ | ||
126 | if (irqchip_in_kernel(vcpu->kvm)) | ||
127 | kvm_lapic_set_base(vcpu, data); | ||
128 | else | ||
129 | vcpu->arch.apic_base = data; | ||
130 | } | ||
131 | EXPORT_SYMBOL_GPL(kvm_set_apic_base); | ||
132 | |||
133 | void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr) | ||
134 | { | ||
135 | WARN_ON(vcpu->arch.exception.pending); | ||
136 | vcpu->arch.exception.pending = true; | ||
137 | vcpu->arch.exception.has_error_code = false; | ||
138 | vcpu->arch.exception.nr = nr; | ||
139 | } | ||
140 | EXPORT_SYMBOL_GPL(kvm_queue_exception); | ||
141 | |||
142 | void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr, | ||
143 | u32 error_code) | ||
144 | { | ||
145 | ++vcpu->stat.pf_guest; | ||
146 | if (vcpu->arch.exception.pending && vcpu->arch.exception.nr == PF_VECTOR) { | ||
147 | printk(KERN_DEBUG "kvm: inject_page_fault:" | ||
148 | " double fault 0x%lx\n", addr); | ||
149 | vcpu->arch.exception.nr = DF_VECTOR; | ||
150 | vcpu->arch.exception.error_code = 0; | ||
151 | return; | ||
152 | } | ||
153 | vcpu->arch.cr2 = addr; | ||
154 | kvm_queue_exception_e(vcpu, PF_VECTOR, error_code); | ||
155 | } | ||
156 | |||
157 | void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code) | ||
158 | { | ||
159 | WARN_ON(vcpu->arch.exception.pending); | ||
160 | vcpu->arch.exception.pending = true; | ||
161 | vcpu->arch.exception.has_error_code = true; | ||
162 | vcpu->arch.exception.nr = nr; | ||
163 | vcpu->arch.exception.error_code = error_code; | ||
164 | } | ||
165 | EXPORT_SYMBOL_GPL(kvm_queue_exception_e); | ||
166 | |||
167 | static void __queue_exception(struct kvm_vcpu *vcpu) | ||
168 | { | ||
169 | kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr, | ||
170 | vcpu->arch.exception.has_error_code, | ||
171 | vcpu->arch.exception.error_code); | ||
172 | } | ||
173 | |||
174 | /* | ||
175 | * Load the pae pdptrs. Return true if they are all valid. | ||
176 | */ | ||
177 | int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3) | ||
178 | { | ||
179 | gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT; | ||
180 | unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2; | ||
181 | int i; | ||
182 | int ret; | ||
183 | u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)]; | ||
184 | |||
185 | mutex_lock(&vcpu->kvm->lock); | ||
186 | ret = kvm_read_guest_page(vcpu->kvm, pdpt_gfn, pdpte, | ||
187 | offset * sizeof(u64), sizeof(pdpte)); | ||
188 | if (ret < 0) { | ||
189 | ret = 0; | ||
190 | goto out; | ||
191 | } | ||
192 | for (i = 0; i < ARRAY_SIZE(pdpte); ++i) { | ||
193 | if ((pdpte[i] & 1) && (pdpte[i] & 0xfffffff0000001e6ull)) { | ||
194 | ret = 0; | ||
195 | goto out; | ||
196 | } | ||
197 | } | ||
198 | ret = 1; | ||
199 | |||
200 | memcpy(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs)); | ||
201 | out: | ||
202 | mutex_unlock(&vcpu->kvm->lock); | ||
203 | |||
204 | return ret; | ||
205 | } | ||
206 | |||
207 | static bool pdptrs_changed(struct kvm_vcpu *vcpu) | ||
208 | { | ||
209 | u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)]; | ||
210 | bool changed = true; | ||
211 | int r; | ||
212 | |||
213 | if (is_long_mode(vcpu) || !is_pae(vcpu)) | ||
214 | return false; | ||
215 | |||
216 | mutex_lock(&vcpu->kvm->lock); | ||
217 | r = kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u, pdpte, sizeof(pdpte)); | ||
218 | if (r < 0) | ||
219 | goto out; | ||
220 | changed = memcmp(pdpte, vcpu->arch.pdptrs, sizeof(pdpte)) != 0; | ||
221 | out: | ||
222 | mutex_unlock(&vcpu->kvm->lock); | ||
223 | |||
224 | return changed; | ||
225 | } | ||
226 | |||
227 | void set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) | ||
228 | { | ||
229 | if (cr0 & CR0_RESERVED_BITS) { | ||
230 | printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n", | ||
231 | cr0, vcpu->arch.cr0); | ||
232 | kvm_inject_gp(vcpu, 0); | ||
233 | return; | ||
234 | } | ||
235 | |||
236 | if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) { | ||
237 | printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n"); | ||
238 | kvm_inject_gp(vcpu, 0); | ||
239 | return; | ||
240 | } | ||
241 | |||
242 | if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) { | ||
243 | printk(KERN_DEBUG "set_cr0: #GP, set PG flag " | ||
244 | "and a clear PE flag\n"); | ||
245 | kvm_inject_gp(vcpu, 0); | ||
246 | return; | ||
247 | } | ||
248 | |||
249 | if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) { | ||
250 | #ifdef CONFIG_X86_64 | ||
251 | if ((vcpu->arch.shadow_efer & EFER_LME)) { | ||
252 | int cs_db, cs_l; | ||
253 | |||
254 | if (!is_pae(vcpu)) { | ||
255 | printk(KERN_DEBUG "set_cr0: #GP, start paging " | ||
256 | "in long mode while PAE is disabled\n"); | ||
257 | kvm_inject_gp(vcpu, 0); | ||
258 | return; | ||
259 | } | ||
260 | kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); | ||
261 | if (cs_l) { | ||
262 | printk(KERN_DEBUG "set_cr0: #GP, start paging " | ||
263 | "in long mode while CS.L == 1\n"); | ||
264 | kvm_inject_gp(vcpu, 0); | ||
265 | return; | ||
266 | |||
267 | } | ||
268 | } else | ||
269 | #endif | ||
270 | if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3)) { | ||
271 | printk(KERN_DEBUG "set_cr0: #GP, pdptrs " | ||
272 | "reserved bits\n"); | ||
273 | kvm_inject_gp(vcpu, 0); | ||
274 | return; | ||
275 | } | ||
276 | |||
277 | } | ||
278 | |||
279 | kvm_x86_ops->set_cr0(vcpu, cr0); | ||
280 | vcpu->arch.cr0 = cr0; | ||
281 | |||
282 | mutex_lock(&vcpu->kvm->lock); | ||
283 | kvm_mmu_reset_context(vcpu); | ||
284 | mutex_unlock(&vcpu->kvm->lock); | ||
285 | return; | ||
286 | } | ||
287 | EXPORT_SYMBOL_GPL(set_cr0); | ||
288 | |||
289 | void lmsw(struct kvm_vcpu *vcpu, unsigned long msw) | ||
290 | { | ||
291 | set_cr0(vcpu, (vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f)); | ||
292 | } | ||
293 | EXPORT_SYMBOL_GPL(lmsw); | ||
294 | |||
295 | void set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) | ||
296 | { | ||
297 | if (cr4 & CR4_RESERVED_BITS) { | ||
298 | printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n"); | ||
299 | kvm_inject_gp(vcpu, 0); | ||
300 | return; | ||
301 | } | ||
302 | |||
303 | if (is_long_mode(vcpu)) { | ||
304 | if (!(cr4 & X86_CR4_PAE)) { | ||
305 | printk(KERN_DEBUG "set_cr4: #GP, clearing PAE while " | ||
306 | "in long mode\n"); | ||
307 | kvm_inject_gp(vcpu, 0); | ||
308 | return; | ||
309 | } | ||
310 | } else if (is_paging(vcpu) && !is_pae(vcpu) && (cr4 & X86_CR4_PAE) | ||
311 | && !load_pdptrs(vcpu, vcpu->arch.cr3)) { | ||
312 | printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n"); | ||
313 | kvm_inject_gp(vcpu, 0); | ||
314 | return; | ||
315 | } | ||
316 | |||
317 | if (cr4 & X86_CR4_VMXE) { | ||
318 | printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n"); | ||
319 | kvm_inject_gp(vcpu, 0); | ||
320 | return; | ||
321 | } | ||
322 | kvm_x86_ops->set_cr4(vcpu, cr4); | ||
323 | vcpu->arch.cr4 = cr4; | ||
324 | mutex_lock(&vcpu->kvm->lock); | ||
325 | kvm_mmu_reset_context(vcpu); | ||
326 | mutex_unlock(&vcpu->kvm->lock); | ||
327 | } | ||
328 | EXPORT_SYMBOL_GPL(set_cr4); | ||
329 | |||
330 | void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) | ||
331 | { | ||
332 | if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) { | ||
333 | kvm_mmu_flush_tlb(vcpu); | ||
334 | return; | ||
335 | } | ||
336 | |||
337 | if (is_long_mode(vcpu)) { | ||
338 | if (cr3 & CR3_L_MODE_RESERVED_BITS) { | ||
339 | printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n"); | ||
340 | kvm_inject_gp(vcpu, 0); | ||
341 | return; | ||
342 | } | ||
343 | } else { | ||
344 | if (is_pae(vcpu)) { | ||
345 | if (cr3 & CR3_PAE_RESERVED_BITS) { | ||
346 | printk(KERN_DEBUG | ||
347 | "set_cr3: #GP, reserved bits\n"); | ||
348 | kvm_inject_gp(vcpu, 0); | ||
349 | return; | ||
350 | } | ||
351 | if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) { | ||
352 | printk(KERN_DEBUG "set_cr3: #GP, pdptrs " | ||
353 | "reserved bits\n"); | ||
354 | kvm_inject_gp(vcpu, 0); | ||
355 | return; | ||
356 | } | ||
357 | } | ||
358 | /* | ||
359 | * We don't check reserved bits in nonpae mode, because | ||
360 | * this isn't enforced, and VMware depends on this. | ||
361 | */ | ||
362 | } | ||
363 | |||
364 | mutex_lock(&vcpu->kvm->lock); | ||
365 | /* | ||
366 | * Does the new cr3 value map to physical memory? (Note, we | ||
367 | * catch an invalid cr3 even in real-mode, because it would | ||
368 | * cause trouble later on when we turn on paging anyway.) | ||
369 | * | ||
370 | * A real CPU would silently accept an invalid cr3 and would | ||
371 | * attempt to use it - with largely undefined (and often hard | ||
372 | * to debug) behavior on the guest side. | ||
373 | */ | ||
374 | if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT))) | ||
375 | kvm_inject_gp(vcpu, 0); | ||
376 | else { | ||
377 | vcpu->arch.cr3 = cr3; | ||
378 | vcpu->arch.mmu.new_cr3(vcpu); | ||
379 | } | ||
380 | mutex_unlock(&vcpu->kvm->lock); | ||
381 | } | ||
382 | EXPORT_SYMBOL_GPL(set_cr3); | ||
383 | |||
384 | void set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) | ||
385 | { | ||
386 | if (cr8 & CR8_RESERVED_BITS) { | ||
387 | printk(KERN_DEBUG "set_cr8: #GP, reserved bits 0x%lx\n", cr8); | ||
388 | kvm_inject_gp(vcpu, 0); | ||
389 | return; | ||
390 | } | ||
391 | if (irqchip_in_kernel(vcpu->kvm)) | ||
392 | kvm_lapic_set_tpr(vcpu, cr8); | ||
393 | else | ||
394 | vcpu->arch.cr8 = cr8; | ||
395 | } | ||
396 | EXPORT_SYMBOL_GPL(set_cr8); | ||
397 | |||
398 | unsigned long get_cr8(struct kvm_vcpu *vcpu) | ||
399 | { | ||
400 | if (irqchip_in_kernel(vcpu->kvm)) | ||
401 | return kvm_lapic_get_cr8(vcpu); | ||
402 | else | ||
403 | return vcpu->arch.cr8; | ||
404 | } | ||
405 | EXPORT_SYMBOL_GPL(get_cr8); | ||
406 | |||
407 | /* | ||
408 | * List of msr numbers which we expose to userspace through KVM_GET_MSRS | ||
409 | * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST. | ||
410 | * | ||
411 | * This list is modified at module load time to reflect the | ||
412 | * capabilities of the host cpu. | ||
413 | */ | ||
414 | static u32 msrs_to_save[] = { | ||
415 | MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, | ||
416 | MSR_K6_STAR, | ||
417 | #ifdef CONFIG_X86_64 | ||
418 | MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, | ||
419 | #endif | ||
420 | MSR_IA32_TIME_STAMP_COUNTER, | ||
421 | }; | ||
422 | |||
423 | static unsigned num_msrs_to_save; | ||
424 | |||
425 | static u32 emulated_msrs[] = { | ||
426 | MSR_IA32_MISC_ENABLE, | ||
427 | }; | ||
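The comment above notes that msrs_to_save is trimmed at module load time to match the host CPU; a hedged sketch of how that trimming could look, assuming rdmsr_safe() semantics (returns a negative value when the read faults on the host) and an assumed helper name:

static void __init example_filter_msrs_to_save(void)
{
	unsigned i, j = 0;
	u32 dummy[2];

	for (i = 0; i < ARRAY_SIZE(msrs_to_save); i++) {
		if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0)
			continue;	/* host does not implement this MSR */
		if (j < i)
			msrs_to_save[j] = msrs_to_save[i];
		j++;
	}
	num_msrs_to_save = j;
}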
428 | |||
429 | #ifdef CONFIG_X86_64 | ||
430 | |||
431 | static void set_efer(struct kvm_vcpu *vcpu, u64 efer) | ||
432 | { | ||
433 | if (efer & EFER_RESERVED_BITS) { | ||
434 | printk(KERN_DEBUG "set_efer: 0x%llx #GP, reserved bits\n", | ||
435 | efer); | ||
436 | kvm_inject_gp(vcpu, 0); | ||
437 | return; | ||
438 | } | ||
439 | |||
440 | if (is_paging(vcpu) | ||
441 | && (vcpu->arch.shadow_efer & EFER_LME) != (efer & EFER_LME)) { | ||
442 | printk(KERN_DEBUG "set_efer: #GP, change LME while paging\n"); | ||
443 | kvm_inject_gp(vcpu, 0); | ||
444 | return; | ||
445 | } | ||
446 | |||
447 | kvm_x86_ops->set_efer(vcpu, efer); | ||
448 | |||
449 | efer &= ~EFER_LMA; | ||
450 | efer |= vcpu->arch.shadow_efer & EFER_LMA; | ||
451 | |||
452 | vcpu->arch.shadow_efer = efer; | ||
453 | } | ||
454 | |||
455 | #endif | ||
456 | |||
457 | /* | ||
458 | * Writes msr value into the appropriate "register". | ||
459 | * Returns 0 on success, non-0 otherwise. | ||
460 | * Assumes vcpu_load() was already called. | ||
461 | */ | ||
462 | int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) | ||
463 | { | ||
464 | return kvm_x86_ops->set_msr(vcpu, msr_index, data); | ||
465 | } | ||
466 | |||
467 | /* | ||
468 | * Adapt set_msr() to msr_io()'s calling convention | ||
469 | */ | ||
470 | static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) | ||
471 | { | ||
472 | return kvm_set_msr(vcpu, index, *data); | ||
473 | } | ||
474 | |||
475 | |||
476 | int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) | ||
477 | { | ||
478 | switch (msr) { | ||
479 | #ifdef CONFIG_X86_64 | ||
480 | case MSR_EFER: | ||
481 | set_efer(vcpu, data); | ||
482 | break; | ||
483 | #endif | ||
484 | case MSR_IA32_MC0_STATUS: | ||
485 | pr_unimpl(vcpu, "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n", | ||
486 | __FUNCTION__, data); | ||
487 | break; | ||
488 | case MSR_IA32_MCG_STATUS: | ||
489 | pr_unimpl(vcpu, "%s: MSR_IA32_MCG_STATUS 0x%llx, nop\n", | ||
490 | __FUNCTION__, data); | ||
491 | break; | ||
492 | case MSR_IA32_UCODE_REV: | ||
493 | case MSR_IA32_UCODE_WRITE: | ||
494 | case 0x200 ... 0x2ff: /* MTRRs */ | ||
495 | break; | ||
496 | case MSR_IA32_APICBASE: | ||
497 | kvm_set_apic_base(vcpu, data); | ||
498 | break; | ||
499 | case MSR_IA32_MISC_ENABLE: | ||
500 | vcpu->arch.ia32_misc_enable_msr = data; | ||
501 | break; | ||
502 | default: | ||
503 | pr_unimpl(vcpu, "unhandled wrmsr: 0x%x\n", msr); | ||
504 | return 1; | ||
505 | } | ||
506 | return 0; | ||
507 | } | ||
508 | EXPORT_SYMBOL_GPL(kvm_set_msr_common); | ||
509 | |||
510 | |||
511 | /* | ||
512 | * Reads an msr value (of 'msr_index') into 'pdata'. | ||
513 | * Returns 0 on success, non-0 otherwise. | ||
514 | * Assumes vcpu_load() was already called. | ||
515 | */ | ||
516 | int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) | ||
517 | { | ||
518 | return kvm_x86_ops->get_msr(vcpu, msr_index, pdata); | ||
519 | } | ||
520 | |||
521 | int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) | ||
522 | { | ||
523 | u64 data; | ||
524 | |||
525 | switch (msr) { | ||
526 | case 0xc0010010: /* SYSCFG */ | ||
527 | case 0xc0010015: /* HWCR */ | ||
528 | case MSR_IA32_PLATFORM_ID: | ||
529 | case MSR_IA32_P5_MC_ADDR: | ||
530 | case MSR_IA32_P5_MC_TYPE: | ||
531 | case MSR_IA32_MC0_CTL: | ||
532 | case MSR_IA32_MCG_STATUS: | ||
533 | case MSR_IA32_MCG_CAP: | ||
534 | case MSR_IA32_MC0_MISC: | ||
535 | case MSR_IA32_MC0_MISC+4: | ||
536 | case MSR_IA32_MC0_MISC+8: | ||
537 | case MSR_IA32_MC0_MISC+12: | ||
538 | case MSR_IA32_MC0_MISC+16: | ||
539 | case MSR_IA32_UCODE_REV: | ||
540 | case MSR_IA32_PERF_STATUS: | ||
541 | case MSR_IA32_EBL_CR_POWERON: | ||
542 | /* MTRR registers */ | ||
543 | case 0xfe: | ||
544 | case 0x200 ... 0x2ff: | ||
545 | data = 0; | ||
546 | break; | ||
547 | case 0xcd: /* fsb frequency */ | ||
548 | data = 3; | ||
549 | break; | ||
550 | case MSR_IA32_APICBASE: | ||
551 | data = kvm_get_apic_base(vcpu); | ||
552 | break; | ||
553 | case MSR_IA32_MISC_ENABLE: | ||
554 | data = vcpu->arch.ia32_misc_enable_msr; | ||
555 | break; | ||
556 | #ifdef CONFIG_X86_64 | ||
557 | case MSR_EFER: | ||
558 | data = vcpu->arch.shadow_efer; | ||
559 | break; | ||
560 | #endif | ||
561 | default: | ||
562 | pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr); | ||
563 | return 1; | ||
564 | } | ||
565 | *pdata = data; | ||
566 | return 0; | ||
567 | } | ||
568 | EXPORT_SYMBOL_GPL(kvm_get_msr_common); | ||
569 | |||
570 | /* | ||
571 | * Read or write a bunch of msrs. All parameters are kernel addresses. | ||
572 | * | ||
573 | * @return number of msrs set successfully. | ||
574 | */ | ||
575 | static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs, | ||
576 | struct kvm_msr_entry *entries, | ||
577 | int (*do_msr)(struct kvm_vcpu *vcpu, | ||
578 | unsigned index, u64 *data)) | ||
579 | { | ||
580 | int i; | ||
581 | |||
582 | vcpu_load(vcpu); | ||
583 | |||
584 | for (i = 0; i < msrs->nmsrs; ++i) | ||
585 | if (do_msr(vcpu, entries[i].index, &entries[i].data)) | ||
586 | break; | ||
587 | |||
588 | vcpu_put(vcpu); | ||
589 | |||
590 | return i; | ||
591 | } | ||
592 | |||
593 | /* | ||
594 | * Read or write a bunch of msrs. Parameters are user addresses. | ||
595 | * | ||
596 | * @return number of msrs set successfully. | ||
597 | */ | ||
598 | static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs, | ||
599 | int (*do_msr)(struct kvm_vcpu *vcpu, | ||
600 | unsigned index, u64 *data), | ||
601 | int writeback) | ||
602 | { | ||
603 | struct kvm_msrs msrs; | ||
604 | struct kvm_msr_entry *entries; | ||
605 | int r, n; | ||
606 | unsigned size; | ||
607 | |||
608 | r = -EFAULT; | ||
609 | if (copy_from_user(&msrs, user_msrs, sizeof msrs)) | ||
610 | goto out; | ||
611 | |||
612 | r = -E2BIG; | ||
613 | if (msrs.nmsrs >= MAX_IO_MSRS) | ||
614 | goto out; | ||
615 | |||
616 | r = -ENOMEM; | ||
617 | size = sizeof(struct kvm_msr_entry) * msrs.nmsrs; | ||
618 | entries = vmalloc(size); | ||
619 | if (!entries) | ||
620 | goto out; | ||
621 | |||
622 | r = -EFAULT; | ||
623 | if (copy_from_user(entries, user_msrs->entries, size)) | ||
624 | goto out_free; | ||
625 | |||
626 | r = n = __msr_io(vcpu, &msrs, entries, do_msr); | ||
627 | if (r < 0) | ||
628 | goto out_free; | ||
629 | |||
630 | r = -EFAULT; | ||
631 | if (writeback && copy_to_user(user_msrs->entries, entries, size)) | ||
632 | goto out_free; | ||
633 | |||
634 | r = n; | ||
635 | |||
636 | out_free: | ||
637 | vfree(entries); | ||
638 | out: | ||
639 | return r; | ||
640 | } | ||
641 | |||
642 | /* | ||
643 | * Make sure that a cpu that is being hot-unplugged does not have any vcpus | ||
644 | * cached on it. | ||
645 | */ | ||
646 | void decache_vcpus_on_cpu(int cpu) | ||
647 | { | ||
648 | struct kvm *vm; | ||
649 | struct kvm_vcpu *vcpu; | ||
650 | int i; | ||
651 | |||
652 | spin_lock(&kvm_lock); | ||
653 | list_for_each_entry(vm, &vm_list, vm_list) | ||
654 | for (i = 0; i < KVM_MAX_VCPUS; ++i) { | ||
655 | vcpu = vm->vcpus[i]; | ||
656 | if (!vcpu) | ||
657 | continue; | ||
658 | /* | ||
659 | * If the vcpu is locked, then it is running on some | ||
660 | * other cpu and therefore it is not cached on the | ||
661 | * cpu in question. | ||
662 | * | ||
663 | * If it's not locked, check the last cpu it executed | ||
664 | * on. | ||
665 | */ | ||
666 | if (mutex_trylock(&vcpu->mutex)) { | ||
667 | if (vcpu->cpu == cpu) { | ||
668 | kvm_x86_ops->vcpu_decache(vcpu); | ||
669 | vcpu->cpu = -1; | ||
670 | } | ||
671 | mutex_unlock(&vcpu->mutex); | ||
672 | } | ||
673 | } | ||
674 | spin_unlock(&kvm_lock); | ||
675 | } | ||
676 | |||
677 | int kvm_dev_ioctl_check_extension(long ext) | ||
678 | { | ||
679 | int r; | ||
680 | |||
681 | switch (ext) { | ||
682 | case KVM_CAP_IRQCHIP: | ||
683 | case KVM_CAP_HLT: | ||
684 | case KVM_CAP_MMU_SHADOW_CACHE_CONTROL: | ||
685 | case KVM_CAP_USER_MEMORY: | ||
686 | case KVM_CAP_SET_TSS_ADDR: | ||
687 | case KVM_CAP_EXT_CPUID: | ||
688 | r = 1; | ||
689 | break; | ||
690 | default: | ||
691 | r = 0; | ||
692 | break; | ||
693 | } | ||
694 | return r; | ||
695 | |||
696 | } | ||
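The extensions reported above are queried from userspace with KVM_CHECK_EXTENSION; a hedged userspace-side sketch (needs <sys/ioctl.h> and <linux/kvm.h>; kvm_fd is assumed to be an open /dev/kvm descriptor):

int vm_supports_in_kernel_irqchip(int kvm_fd)
{
	return ioctl(kvm_fd, KVM_CHECK_EXTENSION, KVM_CAP_IRQCHIP) > 0;
}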
697 | |||
698 | long kvm_arch_dev_ioctl(struct file *filp, | ||
699 | unsigned int ioctl, unsigned long arg) | ||
700 | { | ||
701 | void __user *argp = (void __user *)arg; | ||
702 | long r; | ||
703 | |||
704 | switch (ioctl) { | ||
705 | case KVM_GET_MSR_INDEX_LIST: { | ||
706 | struct kvm_msr_list __user *user_msr_list = argp; | ||
707 | struct kvm_msr_list msr_list; | ||
708 | unsigned n; | ||
709 | |||
710 | r = -EFAULT; | ||
711 | if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list)) | ||
712 | goto out; | ||
713 | n = msr_list.nmsrs; | ||
714 | msr_list.nmsrs = num_msrs_to_save + ARRAY_SIZE(emulated_msrs); | ||
715 | if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list)) | ||
716 | goto out; | ||
717 | r = -E2BIG; | ||
718 | if (n < num_msrs_to_save) | ||
719 | goto out; | ||
720 | r = -EFAULT; | ||
721 | if (copy_to_user(user_msr_list->indices, &msrs_to_save, | ||
722 | num_msrs_to_save * sizeof(u32))) | ||
723 | goto out; | ||
724 | if (copy_to_user(user_msr_list->indices | ||
725 | + num_msrs_to_save * sizeof(u32), | ||
726 | &emulated_msrs, | ||
727 | ARRAY_SIZE(emulated_msrs) * sizeof(u32))) | ||
728 | goto out; | ||
729 | r = 0; | ||
730 | break; | ||
731 | } | ||
732 | default: | ||
733 | r = -EINVAL; | ||
734 | } | ||
735 | out: | ||
736 | return r; | ||
737 | } | ||
738 | |||
739 | void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) | ||
740 | { | ||
741 | kvm_x86_ops->vcpu_load(vcpu, cpu); | ||
742 | } | ||
743 | |||
744 | void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) | ||
745 | { | ||
746 | kvm_x86_ops->vcpu_put(vcpu); | ||
747 | kvm_put_guest_fpu(vcpu); | ||
748 | } | ||
749 | |||
750 | static int is_efer_nx(void) | ||
751 | { | ||
752 | u64 efer; | ||
753 | |||
754 | rdmsrl(MSR_EFER, efer); | ||
755 | return efer & EFER_NX; | ||
756 | } | ||
757 | |||
758 | static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu) | ||
759 | { | ||
760 | int i; | ||
761 | struct kvm_cpuid_entry2 *e, *entry; | ||
762 | |||
763 | entry = NULL; | ||
764 | for (i = 0; i < vcpu->arch.cpuid_nent; ++i) { | ||
765 | e = &vcpu->arch.cpuid_entries[i]; | ||
766 | if (e->function == 0x80000001) { | ||
767 | entry = e; | ||
768 | break; | ||
769 | } | ||
770 | } | ||
771 | if (entry && (entry->edx & (1 << 20)) && !is_efer_nx()) { | ||
772 | entry->edx &= ~(1 << 20); | ||
773 | printk(KERN_INFO "kvm: guest NX capability removed\n"); | ||
774 | } | ||
775 | } | ||
776 | |||
777 | /* used when an old userspace process fills cpuid entries into a new kernel module */ | ||
778 | static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, | ||
779 | struct kvm_cpuid *cpuid, | ||
780 | struct kvm_cpuid_entry __user *entries) | ||
781 | { | ||
782 | int r, i; | ||
783 | struct kvm_cpuid_entry *cpuid_entries; | ||
784 | |||
785 | r = -E2BIG; | ||
786 | if (cpuid->nent > KVM_MAX_CPUID_ENTRIES) | ||
787 | goto out; | ||
788 | r = -ENOMEM; | ||
789 | cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry) * cpuid->nent); | ||
790 | if (!cpuid_entries) | ||
791 | goto out; | ||
792 | r = -EFAULT; | ||
793 | if (copy_from_user(cpuid_entries, entries, | ||
794 | cpuid->nent * sizeof(struct kvm_cpuid_entry))) | ||
795 | goto out_free; | ||
796 | for (i = 0; i < cpuid->nent; i++) { | ||
797 | vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function; | ||
798 | vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax; | ||
799 | vcpu->arch.cpuid_entries[i].ebx = cpuid_entries[i].ebx; | ||
800 | vcpu->arch.cpuid_entries[i].ecx = cpuid_entries[i].ecx; | ||
801 | vcpu->arch.cpuid_entries[i].edx = cpuid_entries[i].edx; | ||
802 | vcpu->arch.cpuid_entries[i].index = 0; | ||
803 | vcpu->arch.cpuid_entries[i].flags = 0; | ||
804 | vcpu->arch.cpuid_entries[i].padding[0] = 0; | ||
805 | vcpu->arch.cpuid_entries[i].padding[1] = 0; | ||
806 | vcpu->arch.cpuid_entries[i].padding[2] = 0; | ||
807 | } | ||
808 | vcpu->arch.cpuid_nent = cpuid->nent; | ||
809 | cpuid_fix_nx_cap(vcpu); | ||
810 | r = 0; | ||
811 | |||
812 | out_free: | ||
813 | vfree(cpuid_entries); | ||
814 | out: | ||
815 | return r; | ||
816 | } | ||
817 | |||
818 | static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu, | ||
819 | struct kvm_cpuid2 *cpuid, | ||
820 | struct kvm_cpuid_entry2 __user *entries) | ||
821 | { | ||
822 | int r; | ||
823 | |||
824 | r = -E2BIG; | ||
825 | if (cpuid->nent > KVM_MAX_CPUID_ENTRIES) | ||
826 | goto out; | ||
827 | r = -EFAULT; | ||
828 | if (copy_from_user(&vcpu->arch.cpuid_entries, entries, | ||
829 | cpuid->nent * sizeof(struct kvm_cpuid_entry2))) | ||
830 | goto out; | ||
831 | vcpu->arch.cpuid_nent = cpuid->nent; | ||
832 | return 0; | ||
833 | |||
834 | out: | ||
835 | return r; | ||
836 | } | ||
837 | |||
838 | static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, | ||
839 | struct kvm_cpuid2 *cpuid, | ||
840 | struct kvm_cpuid_entry2 __user *entries) | ||
841 | { | ||
842 | int r; | ||
843 | |||
844 | r = -E2BIG; | ||
845 | if (cpuid->nent < vcpu->arch.cpuid_nent) | ||
846 | goto out; | ||
847 | r = -EFAULT; | ||
848 | if (copy_to_user(entries, &vcpu->arch.cpuid_entries, | ||
849 | vcpu->arch.cpuid_nent * sizeof(struct kvm_cpuid_entry2))) | ||
850 | goto out; | ||
851 | return 0; | ||
852 | |||
853 | out: | ||
854 | cpuid->nent = vcpu->arch.cpuid_nent; | ||
855 | return r; | ||
856 | } | ||
857 | |||
858 | static inline u32 bit(int bitno) | ||
859 | { | ||
860 | return 1 << (bitno & 31); | ||
861 | } | ||
862 | |||
863 | static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function, | ||
864 | u32 index) | ||
865 | { | ||
866 | entry->function = function; | ||
867 | entry->index = index; | ||
868 | cpuid_count(entry->function, entry->index, | ||
869 | &entry->eax, &entry->ebx, &entry->ecx, &entry->edx); | ||
870 | entry->flags = 0; | ||
871 | } | ||
872 | |||
873 | static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, | ||
874 | u32 index, int *nent, int maxnent) | ||
875 | { | ||
876 | const u32 kvm_supported_word0_x86_features = bit(X86_FEATURE_FPU) | | ||
877 | bit(X86_FEATURE_VME) | bit(X86_FEATURE_DE) | | ||
878 | bit(X86_FEATURE_PSE) | bit(X86_FEATURE_TSC) | | ||
879 | bit(X86_FEATURE_MSR) | bit(X86_FEATURE_PAE) | | ||
880 | bit(X86_FEATURE_CX8) | bit(X86_FEATURE_APIC) | | ||
881 | bit(X86_FEATURE_SEP) | bit(X86_FEATURE_PGE) | | ||
882 | bit(X86_FEATURE_CMOV) | bit(X86_FEATURE_PSE36) | | ||
883 | bit(X86_FEATURE_CLFLSH) | bit(X86_FEATURE_MMX) | | ||
884 | bit(X86_FEATURE_FXSR) | bit(X86_FEATURE_XMM) | | ||
885 | bit(X86_FEATURE_XMM2) | bit(X86_FEATURE_SELFSNOOP); | ||
886 | const u32 kvm_supported_word1_x86_features = bit(X86_FEATURE_FPU) | | ||
887 | bit(X86_FEATURE_VME) | bit(X86_FEATURE_DE) | | ||
888 | bit(X86_FEATURE_PSE) | bit(X86_FEATURE_TSC) | | ||
889 | bit(X86_FEATURE_MSR) | bit(X86_FEATURE_PAE) | | ||
890 | bit(X86_FEATURE_CX8) | bit(X86_FEATURE_APIC) | | ||
891 | bit(X86_FEATURE_PGE) | | ||
892 | bit(X86_FEATURE_CMOV) | bit(X86_FEATURE_PSE36) | | ||
893 | bit(X86_FEATURE_MMX) | bit(X86_FEATURE_FXSR) | | ||
894 | bit(X86_FEATURE_SYSCALL) | | ||
895 | (bit(X86_FEATURE_NX) && is_efer_nx()) | | ||
896 | #ifdef CONFIG_X86_64 | ||
897 | bit(X86_FEATURE_LM) | | ||
898 | #endif | ||
899 | bit(X86_FEATURE_MMXEXT) | | ||
900 | bit(X86_FEATURE_3DNOWEXT) | | ||
901 | bit(X86_FEATURE_3DNOW); | ||
902 | const u32 kvm_supported_word3_x86_features = | ||
903 | bit(X86_FEATURE_XMM3) | bit(X86_FEATURE_CX16); | ||
904 | const u32 kvm_supported_word6_x86_features = | ||
905 | bit(X86_FEATURE_LAHF_LM) | bit(X86_FEATURE_CMP_LEGACY); | ||
906 | |||
907 | /* all cpuid_count() calls for function 2 should be made on the same cpu */ | ||
908 | get_cpu(); | ||
909 | do_cpuid_1_ent(entry, function, index); | ||
910 | ++*nent; | ||
911 | |||
912 | switch (function) { | ||
913 | case 0: | ||
914 | entry->eax = min(entry->eax, (u32)0xb); | ||
915 | break; | ||
916 | case 1: | ||
917 | entry->edx &= kvm_supported_word0_x86_features; | ||
918 | entry->ecx &= kvm_supported_word3_x86_features; | ||
919 | break; | ||
920 | /* function 2 entries are STATEFUL. That is, repeated cpuid commands | ||
921 | * may return different values. This forces us to get_cpu() before | ||
922 | * issuing the first command, and also to emulate this annoying behavior | ||
923 | * in kvm_emulate_cpuid() using KVM_CPUID_FLAG_STATE_READ_NEXT */ | ||
924 | case 2: { | ||
925 | int t, times = entry->eax & 0xff; | ||
926 | |||
927 | entry->flags |= KVM_CPUID_FLAG_STATEFUL_FUNC; | ||
928 | for (t = 1; t < times && *nent < maxnent; ++t) { | ||
929 | do_cpuid_1_ent(&entry[t], function, 0); | ||
930 | entry[t].flags |= KVM_CPUID_FLAG_STATEFUL_FUNC; | ||
931 | ++*nent; | ||
932 | } | ||
933 | break; | ||
934 | } | ||
935 | /* function 4 and 0xb have additional index. */ | ||
936 | case 4: { | ||
937 | int index, cache_type; | ||
938 | |||
939 | entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; | ||
940 | /* read more entries until cache_type is zero */ | ||
941 | for (index = 1; *nent < maxnent; ++index) { | ||
942 | cache_type = entry[index - 1].eax & 0x1f; | ||
943 | if (!cache_type) | ||
944 | break; | ||
945 | do_cpuid_1_ent(&entry[index], function, index); | ||
946 | entry[index].flags |= | ||
947 | KVM_CPUID_FLAG_SIGNIFCANT_INDEX; | ||
948 | ++*nent; | ||
949 | } | ||
950 | break; | ||
951 | } | ||
952 | case 0xb: { | ||
953 | int index, level_type; | ||
954 | |||
955 | entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; | ||
956 | /* read more entries until level_type is zero */ | ||
957 | for (index = 1; *nent < maxnent; ++index) { | ||
958 | level_type = entry[index - 1].ecx & 0xff; | ||
959 | if (!level_type) | ||
960 | break; | ||
961 | do_cpuid_1_ent(&entry[index], function, index); | ||
962 | entry[index].flags |= | ||
963 | KVM_CPUID_FLAG_SIGNIFCANT_INDEX; | ||
964 | ++*nent; | ||
965 | } | ||
966 | break; | ||
967 | } | ||
968 | case 0x80000000: | ||
969 | entry->eax = min(entry->eax, 0x8000001a); | ||
970 | break; | ||
971 | case 0x80000001: | ||
972 | entry->edx &= kvm_supported_word1_x86_features; | ||
973 | entry->ecx &= kvm_supported_word6_x86_features; | ||
974 | break; | ||
975 | } | ||
976 | put_cpu(); | ||
977 | } | ||
978 | |||
979 | static int kvm_vm_ioctl_get_supported_cpuid(struct kvm *kvm, | ||
980 | struct kvm_cpuid2 *cpuid, | ||
981 | struct kvm_cpuid_entry2 __user *entries) | ||
982 | { | ||
983 | struct kvm_cpuid_entry2 *cpuid_entries; | ||
984 | int limit, nent = 0, r = -E2BIG; | ||
985 | u32 func; | ||
986 | |||
987 | if (cpuid->nent < 1) | ||
988 | goto out; | ||
989 | r = -ENOMEM; | ||
990 | cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry2) * cpuid->nent); | ||
991 | if (!cpuid_entries) | ||
992 | goto out; | ||
993 | |||
994 | do_cpuid_ent(&cpuid_entries[0], 0, 0, &nent, cpuid->nent); | ||
995 | limit = cpuid_entries[0].eax; | ||
996 | for (func = 1; func <= limit && nent < cpuid->nent; ++func) | ||
997 | do_cpuid_ent(&cpuid_entries[nent], func, 0, | ||
998 | &nent, cpuid->nent); | ||
999 | r = -E2BIG; | ||
1000 | if (nent >= cpuid->nent) | ||
1001 | goto out_free; | ||
1002 | |||
1003 | do_cpuid_ent(&cpuid_entries[nent], 0x80000000, 0, &nent, cpuid->nent); | ||
1004 | limit = cpuid_entries[nent - 1].eax; | ||
1005 | for (func = 0x80000001; func <= limit && nent < cpuid->nent; ++func) | ||
1006 | do_cpuid_ent(&cpuid_entries[nent], func, 0, | ||
1007 | &nent, cpuid->nent); | ||
1008 | r = -EFAULT; | ||
1009 | if (copy_to_user(entries, cpuid_entries, | ||
1010 | nent * sizeof(struct kvm_cpuid_entry2))) | ||
1011 | goto out_free; | ||
1012 | cpuid->nent = nent; | ||
1013 | r = 0; | ||
1014 | |||
1015 | out_free: | ||
1016 | vfree(cpuid_entries); | ||
1017 | out: | ||
1018 | return r; | ||
1019 | } | ||
1020 | |||
1021 | static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, | ||
1022 | struct kvm_lapic_state *s) | ||
1023 | { | ||
1024 | vcpu_load(vcpu); | ||
1025 | memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s); | ||
1026 | vcpu_put(vcpu); | ||
1027 | |||
1028 | return 0; | ||
1029 | } | ||
1030 | |||
1031 | static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu, | ||
1032 | struct kvm_lapic_state *s) | ||
1033 | { | ||
1034 | vcpu_load(vcpu); | ||
1035 | memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s); | ||
1036 | kvm_apic_post_state_restore(vcpu); | ||
1037 | vcpu_put(vcpu); | ||
1038 | |||
1039 | return 0; | ||
1040 | } | ||
1041 | |||
1042 | static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, | ||
1043 | struct kvm_interrupt *irq) | ||
1044 | { | ||
1045 | if (irq->irq < 0 || irq->irq >= 256) | ||
1046 | return -EINVAL; | ||
1047 | if (irqchip_in_kernel(vcpu->kvm)) | ||
1048 | return -ENXIO; | ||
1049 | vcpu_load(vcpu); | ||
1050 | |||
1051 | set_bit(irq->irq, vcpu->arch.irq_pending); | ||
1052 | set_bit(irq->irq / BITS_PER_LONG, &vcpu->arch.irq_summary); | ||
1053 | |||
1054 | vcpu_put(vcpu); | ||
1055 | |||
1056 | return 0; | ||
1057 | } | ||
1058 | |||
1059 | long kvm_arch_vcpu_ioctl(struct file *filp, | ||
1060 | unsigned int ioctl, unsigned long arg) | ||
1061 | { | ||
1062 | struct kvm_vcpu *vcpu = filp->private_data; | ||
1063 | void __user *argp = (void __user *)arg; | ||
1064 | int r; | ||
1065 | |||
1066 | switch (ioctl) { | ||
1067 | case KVM_GET_LAPIC: { | ||
1068 | struct kvm_lapic_state lapic; | ||
1069 | |||
1070 | memset(&lapic, 0, sizeof lapic); | ||
1071 | r = kvm_vcpu_ioctl_get_lapic(vcpu, &lapic); | ||
1072 | if (r) | ||
1073 | goto out; | ||
1074 | r = -EFAULT; | ||
1075 | if (copy_to_user(argp, &lapic, sizeof lapic)) | ||
1076 | goto out; | ||
1077 | r = 0; | ||
1078 | break; | ||
1079 | } | ||
1080 | case KVM_SET_LAPIC: { | ||
1081 | struct kvm_lapic_state lapic; | ||
1082 | |||
1083 | r = -EFAULT; | ||
1084 | if (copy_from_user(&lapic, argp, sizeof lapic)) | ||
1085 | goto out; | ||
1086 | r = kvm_vcpu_ioctl_set_lapic(vcpu, &lapic); | ||
1087 | if (r) | ||
1088 | goto out; | ||
1089 | r = 0; | ||
1090 | break; | ||
1091 | } | ||
1092 | case KVM_INTERRUPT: { | ||
1093 | struct kvm_interrupt irq; | ||
1094 | |||
1095 | r = -EFAULT; | ||
1096 | if (copy_from_user(&irq, argp, sizeof irq)) | ||
1097 | goto out; | ||
1098 | r = kvm_vcpu_ioctl_interrupt(vcpu, &irq); | ||
1099 | if (r) | ||
1100 | goto out; | ||
1101 | r = 0; | ||
1102 | break; | ||
1103 | } | ||
1104 | case KVM_SET_CPUID: { | ||
1105 | struct kvm_cpuid __user *cpuid_arg = argp; | ||
1106 | struct kvm_cpuid cpuid; | ||
1107 | |||
1108 | r = -EFAULT; | ||
1109 | if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) | ||
1110 | goto out; | ||
1111 | r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries); | ||
1112 | if (r) | ||
1113 | goto out; | ||
1114 | break; | ||
1115 | } | ||
1116 | case KVM_SET_CPUID2: { | ||
1117 | struct kvm_cpuid2 __user *cpuid_arg = argp; | ||
1118 | struct kvm_cpuid2 cpuid; | ||
1119 | |||
1120 | r = -EFAULT; | ||
1121 | if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) | ||
1122 | goto out; | ||
1123 | r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid, | ||
1124 | cpuid_arg->entries); | ||
1125 | if (r) | ||
1126 | goto out; | ||
1127 | break; | ||
1128 | } | ||
1129 | case KVM_GET_CPUID2: { | ||
1130 | struct kvm_cpuid2 __user *cpuid_arg = argp; | ||
1131 | struct kvm_cpuid2 cpuid; | ||
1132 | |||
1133 | r = -EFAULT; | ||
1134 | if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) | ||
1135 | goto out; | ||
1136 | r = kvm_vcpu_ioctl_get_cpuid2(vcpu, &cpuid, | ||
1137 | cpuid_arg->entries); | ||
1138 | if (r) | ||
1139 | goto out; | ||
1140 | r = -EFAULT; | ||
1141 | if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid)) | ||
1142 | goto out; | ||
1143 | r = 0; | ||
1144 | break; | ||
1145 | } | ||
1146 | case KVM_GET_MSRS: | ||
1147 | r = msr_io(vcpu, argp, kvm_get_msr, 1); | ||
1148 | break; | ||
1149 | case KVM_SET_MSRS: | ||
1150 | r = msr_io(vcpu, argp, do_set_msr, 0); | ||
1151 | break; | ||
1152 | default: | ||
1153 | r = -EINVAL; | ||
1154 | } | ||
1155 | out: | ||
1156 | return r; | ||
1157 | } | ||
1158 | |||
1159 | static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr) | ||
1160 | { | ||
1161 | int ret; | ||
1162 | |||
1163 | if (addr > (unsigned int)(-3 * PAGE_SIZE)) | ||
1164 | return -1; | ||
1165 | ret = kvm_x86_ops->set_tss_addr(kvm, addr); | ||
1166 | return ret; | ||
1167 | } | ||
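/*
 * Note on the bound above: the real-mode TSS used by VMX occupies three
 * consecutive pages, so the base address must leave room for 3 * PAGE_SIZE
 * below the 4GB boundary; (unsigned int)(-3 * PAGE_SIZE) is exactly that
 * limit (0xffffd000 with 4K pages).
 */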
1168 | |||
1169 | static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm, | ||
1170 | u32 kvm_nr_mmu_pages) | ||
1171 | { | ||
1172 | if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES) | ||
1173 | return -EINVAL; | ||
1174 | |||
1175 | mutex_lock(&kvm->lock); | ||
1176 | |||
1177 | kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages); | ||
1178 | kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages; | ||
1179 | |||
1180 | mutex_unlock(&kvm->lock); | ||
1181 | return 0; | ||
1182 | } | ||
1183 | |||
1184 | static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm) | ||
1185 | { | ||
1186 | return kvm->arch.n_alloc_mmu_pages; | ||
1187 | } | ||
1188 | |||
1189 | gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn) | ||
1190 | { | ||
1191 | int i; | ||
1192 | struct kvm_mem_alias *alias; | ||
1193 | |||
1194 | for (i = 0; i < kvm->arch.naliases; ++i) { | ||
1195 | alias = &kvm->arch.aliases[i]; | ||
1196 | if (gfn >= alias->base_gfn | ||
1197 | && gfn < alias->base_gfn + alias->npages) | ||
1198 | return alias->target_gfn + gfn - alias->base_gfn; | ||
1199 | } | ||
1200 | return gfn; | ||
1201 | } | ||
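/*
 * Worked example (hypothetical values): with an alias of base_gfn 0xa0,
 * npages 0x10 and target_gfn 0x1000, a lookup of gfn 0xa5 falls inside the
 * alias and returns 0x1000 + (0xa5 - 0xa0) = 0x1005; gfn 0xb5 lies outside
 * every alias and is returned unchanged.
 */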
1202 | |||
1203 | /* | ||
1204 | * Set a new alias region. Aliases map a portion of physical memory into | ||
1205 | * another portion. This is useful for memory windows, for example the PC | ||
1206 | * VGA region. | ||
1207 | */ | ||
1208 | static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm, | ||
1209 | struct kvm_memory_alias *alias) | ||
1210 | { | ||
1211 | int r, n; | ||
1212 | struct kvm_mem_alias *p; | ||
1213 | |||
1214 | r = -EINVAL; | ||
1215 | /* General sanity checks */ | ||
1216 | if (alias->memory_size & (PAGE_SIZE - 1)) | ||
1217 | goto out; | ||
1218 | if (alias->guest_phys_addr & (PAGE_SIZE - 1)) | ||
1219 | goto out; | ||
1220 | if (alias->slot >= KVM_ALIAS_SLOTS) | ||
1221 | goto out; | ||
1222 | if (alias->guest_phys_addr + alias->memory_size | ||
1223 | < alias->guest_phys_addr) | ||
1224 | goto out; | ||
1225 | if (alias->target_phys_addr + alias->memory_size | ||
1226 | < alias->target_phys_addr) | ||
1227 | goto out; | ||
1228 | |||
1229 | mutex_lock(&kvm->lock); | ||
1230 | |||
1231 | p = &kvm->arch.aliases[alias->slot]; | ||
1232 | p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT; | ||
1233 | p->npages = alias->memory_size >> PAGE_SHIFT; | ||
1234 | p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT; | ||
1235 | |||
1236 | for (n = KVM_ALIAS_SLOTS; n > 0; --n) | ||
1237 | if (kvm->arch.aliases[n - 1].npages) | ||
1238 | break; | ||
1239 | kvm->arch.naliases = n; | ||
1240 | |||
1241 | kvm_mmu_zap_all(kvm); | ||
1242 | |||
1243 | mutex_unlock(&kvm->lock); | ||
1244 | |||
1245 | return 0; | ||
1246 | |||
1247 | out: | ||
1248 | return r; | ||
1249 | } | ||
1250 | |||
1251 | static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) | ||
1252 | { | ||
1253 | int r; | ||
1254 | |||
1255 | r = 0; | ||
1256 | switch (chip->chip_id) { | ||
1257 | case KVM_IRQCHIP_PIC_MASTER: | ||
1258 | memcpy(&chip->chip.pic, | ||
1259 | &pic_irqchip(kvm)->pics[0], | ||
1260 | sizeof(struct kvm_pic_state)); | ||
1261 | break; | ||
1262 | case KVM_IRQCHIP_PIC_SLAVE: | ||
1263 | memcpy(&chip->chip.pic, | ||
1264 | &pic_irqchip(kvm)->pics[1], | ||
1265 | sizeof(struct kvm_pic_state)); | ||
1266 | break; | ||
1267 | case KVM_IRQCHIP_IOAPIC: | ||
1268 | memcpy(&chip->chip.ioapic, | ||
1269 | ioapic_irqchip(kvm), | ||
1270 | sizeof(struct kvm_ioapic_state)); | ||
1271 | break; | ||
1272 | default: | ||
1273 | r = -EINVAL; | ||
1274 | break; | ||
1275 | } | ||
1276 | return r; | ||
1277 | } | ||
1278 | |||
1279 | static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) | ||
1280 | { | ||
1281 | int r; | ||
1282 | |||
1283 | r = 0; | ||
1284 | switch (chip->chip_id) { | ||
1285 | case KVM_IRQCHIP_PIC_MASTER: | ||
1286 | memcpy(&pic_irqchip(kvm)->pics[0], | ||
1287 | &chip->chip.pic, | ||
1288 | sizeof(struct kvm_pic_state)); | ||
1289 | break; | ||
1290 | case KVM_IRQCHIP_PIC_SLAVE: | ||
1291 | memcpy(&pic_irqchip(kvm)->pics[1], | ||
1292 | &chip->chip.pic, | ||
1293 | sizeof(struct kvm_pic_state)); | ||
1294 | break; | ||
1295 | case KVM_IRQCHIP_IOAPIC: | ||
1296 | memcpy(ioapic_irqchip(kvm), | ||
1297 | &chip->chip.ioapic, | ||
1298 | sizeof(struct kvm_ioapic_state)); | ||
1299 | break; | ||
1300 | default: | ||
1301 | r = -EINVAL; | ||
1302 | break; | ||
1303 | } | ||
1304 | kvm_pic_update_irq(pic_irqchip(kvm)); | ||
1305 | return r; | ||
1306 | } | ||
1307 | |||
1308 | /* | ||
1309 | * Get (and clear) the dirty memory log for a memory slot. | ||
1310 | */ | ||
1311 | int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, | ||
1312 | struct kvm_dirty_log *log) | ||
1313 | { | ||
1314 | int r; | ||
1315 | int n; | ||
1316 | struct kvm_memory_slot *memslot; | ||
1317 | int is_dirty = 0; | ||
1318 | |||
1319 | mutex_lock(&kvm->lock); | ||
1320 | |||
1321 | r = kvm_get_dirty_log(kvm, log, &is_dirty); | ||
1322 | if (r) | ||
1323 | goto out; | ||
1324 | |||
1325 | /* If nothing is dirty, don't bother messing with page tables. */ | ||
1326 | if (is_dirty) { | ||
1327 | kvm_mmu_slot_remove_write_access(kvm, log->slot); | ||
1328 | kvm_flush_remote_tlbs(kvm); | ||
1329 | memslot = &kvm->memslots[log->slot]; | ||
1330 | n = ALIGN(memslot->npages, BITS_PER_LONG) / 8; | ||
1331 | memset(memslot->dirty_bitmap, 0, n); | ||
1332 | } | ||
1333 | r = 0; | ||
1334 | out: | ||
1335 | mutex_unlock(&kvm->lock); | ||
1336 | return r; | ||
1337 | } | ||
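/*
 * Example of the bitmap size computation above (assuming BITS_PER_LONG is
 * 64): a slot of 100 pages gives ALIGN(100, 64) = 128 bits, i.e.
 * 128 / 8 = 16 bytes of dirty bitmap to clear.
 */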
1338 | |||
1339 | long kvm_arch_vm_ioctl(struct file *filp, | ||
1340 | unsigned int ioctl, unsigned long arg) | ||
1341 | { | ||
1342 | struct kvm *kvm = filp->private_data; | ||
1343 | void __user *argp = (void __user *)arg; | ||
1344 | int r = -EINVAL; | ||
1345 | |||
1346 | switch (ioctl) { | ||
1347 | case KVM_SET_TSS_ADDR: | ||
1348 | r = kvm_vm_ioctl_set_tss_addr(kvm, arg); | ||
1349 | if (r < 0) | ||
1350 | goto out; | ||
1351 | break; | ||
1352 | case KVM_SET_MEMORY_REGION: { | ||
1353 | struct kvm_memory_region kvm_mem; | ||
1354 | struct kvm_userspace_memory_region kvm_userspace_mem; | ||
1355 | |||
1356 | r = -EFAULT; | ||
1357 | if (copy_from_user(&kvm_mem, argp, sizeof kvm_mem)) | ||
1358 | goto out; | ||
1359 | kvm_userspace_mem.slot = kvm_mem.slot; | ||
1360 | kvm_userspace_mem.flags = kvm_mem.flags; | ||
1361 | kvm_userspace_mem.guest_phys_addr = kvm_mem.guest_phys_addr; | ||
1362 | kvm_userspace_mem.memory_size = kvm_mem.memory_size; | ||
1363 | r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 0); | ||
1364 | if (r) | ||
1365 | goto out; | ||
1366 | break; | ||
1367 | } | ||
1368 | case KVM_SET_NR_MMU_PAGES: | ||
1369 | r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg); | ||
1370 | if (r) | ||
1371 | goto out; | ||
1372 | break; | ||
1373 | case KVM_GET_NR_MMU_PAGES: | ||
1374 | r = kvm_vm_ioctl_get_nr_mmu_pages(kvm); | ||
1375 | break; | ||
1376 | case KVM_SET_MEMORY_ALIAS: { | ||
1377 | struct kvm_memory_alias alias; | ||
1378 | |||
1379 | r = -EFAULT; | ||
1380 | if (copy_from_user(&alias, argp, sizeof alias)) | ||
1381 | goto out; | ||
1382 | r = kvm_vm_ioctl_set_memory_alias(kvm, &alias); | ||
1383 | if (r) | ||
1384 | goto out; | ||
1385 | break; | ||
1386 | } | ||
1387 | case KVM_CREATE_IRQCHIP: | ||
1388 | r = -ENOMEM; | ||
1389 | kvm->arch.vpic = kvm_create_pic(kvm); | ||
1390 | if (kvm->arch.vpic) { | ||
1391 | r = kvm_ioapic_init(kvm); | ||
1392 | if (r) { | ||
1393 | kfree(kvm->arch.vpic); | ||
1394 | kvm->arch.vpic = NULL; | ||
1395 | goto out; | ||
1396 | } | ||
1397 | } else | ||
1398 | goto out; | ||
1399 | break; | ||
1400 | case KVM_IRQ_LINE: { | ||
1401 | struct kvm_irq_level irq_event; | ||
1402 | |||
1403 | r = -EFAULT; | ||
1404 | if (copy_from_user(&irq_event, argp, sizeof irq_event)) | ||
1405 | goto out; | ||
1406 | if (irqchip_in_kernel(kvm)) { | ||
1407 | mutex_lock(&kvm->lock); | ||
1408 | if (irq_event.irq < 16) | ||
1409 | kvm_pic_set_irq(pic_irqchip(kvm), | ||
1410 | irq_event.irq, | ||
1411 | irq_event.level); | ||
1412 | kvm_ioapic_set_irq(kvm->arch.vioapic, | ||
1413 | irq_event.irq, | ||
1414 | irq_event.level); | ||
1415 | mutex_unlock(&kvm->lock); | ||
1416 | r = 0; | ||
1417 | } | ||
1418 | break; | ||
1419 | } | ||
1420 | case KVM_GET_IRQCHIP: { | ||
1421 | /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ | ||
1422 | struct kvm_irqchip chip; | ||
1423 | |||
1424 | r = -EFAULT; | ||
1425 | if (copy_from_user(&chip, argp, sizeof chip)) | ||
1426 | goto out; | ||
1427 | r = -ENXIO; | ||
1428 | if (!irqchip_in_kernel(kvm)) | ||
1429 | goto out; | ||
1430 | r = kvm_vm_ioctl_get_irqchip(kvm, &chip); | ||
1431 | if (r) | ||
1432 | goto out; | ||
1433 | r = -EFAULT; | ||
1434 | if (copy_to_user(argp, &chip, sizeof chip)) | ||
1435 | goto out; | ||
1436 | r = 0; | ||
1437 | break; | ||
1438 | } | ||
1439 | case KVM_SET_IRQCHIP: { | ||
1440 | /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ | ||
1441 | struct kvm_irqchip chip; | ||
1442 | |||
1443 | r = -EFAULT; | ||
1444 | if (copy_from_user(&chip, argp, sizeof chip)) | ||
1445 | goto out; | ||
1446 | r = -ENXIO; | ||
1447 | if (!irqchip_in_kernel(kvm)) | ||
1448 | goto out; | ||
1449 | r = kvm_vm_ioctl_set_irqchip(kvm, &chip); | ||
1450 | if (r) | ||
1451 | goto out; | ||
1452 | r = 0; | ||
1453 | break; | ||
1454 | } | ||
1455 | case KVM_GET_SUPPORTED_CPUID: { | ||
1456 | struct kvm_cpuid2 __user *cpuid_arg = argp; | ||
1457 | struct kvm_cpuid2 cpuid; | ||
1458 | |||
1459 | r = -EFAULT; | ||
1460 | if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) | ||
1461 | goto out; | ||
1462 | r = kvm_vm_ioctl_get_supported_cpuid(kvm, &cpuid, | ||
1463 | cpuid_arg->entries); | ||
1464 | if (r) | ||
1465 | goto out; | ||
1466 | |||
1467 | r = -EFAULT; | ||
1468 | if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid)) | ||
1469 | goto out; | ||
1470 | r = 0; | ||
1471 | break; | ||
1472 | } | ||
1473 | default: | ||
1474 | ; | ||
1475 | } | ||
1476 | out: | ||
1477 | return r; | ||
1478 | } | ||
1479 | |||
1480 | static void kvm_init_msr_list(void) | ||
1481 | { | ||
1482 | u32 dummy[2]; | ||
1483 | unsigned i, j; | ||
1484 | |||
1485 | for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) { | ||
1486 | if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0) | ||
1487 | continue; | ||
1488 | if (j < i) | ||
1489 | msrs_to_save[j] = msrs_to_save[i]; | ||
1490 | j++; | ||
1491 | } | ||
1492 | num_msrs_to_save = j; | ||
1493 | } | ||
1494 | |||
1495 | /* | ||
1496 | * Only the apic needs an MMIO device hook, so shortcut now. | ||
1497 | */ | ||
1498 | static struct kvm_io_device *vcpu_find_pervcpu_dev(struct kvm_vcpu *vcpu, | ||
1499 | gpa_t addr) | ||
1500 | { | ||
1501 | struct kvm_io_device *dev; | ||
1502 | |||
1503 | if (vcpu->arch.apic) { | ||
1504 | dev = &vcpu->arch.apic->dev; | ||
1505 | if (dev->in_range(dev, addr)) | ||
1506 | return dev; | ||
1507 | } | ||
1508 | return NULL; | ||
1509 | } | ||
1510 | |||
1511 | |||
1512 | static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu, | ||
1513 | gpa_t addr) | ||
1514 | { | ||
1515 | struct kvm_io_device *dev; | ||
1516 | |||
1517 | dev = vcpu_find_pervcpu_dev(vcpu, addr); | ||
1518 | if (dev == NULL) | ||
1519 | dev = kvm_io_bus_find_dev(&vcpu->kvm->mmio_bus, addr); | ||
1520 | return dev; | ||
1521 | } | ||
1522 | |||
1523 | int emulator_read_std(unsigned long addr, | ||
1524 | void *val, | ||
1525 | unsigned int bytes, | ||
1526 | struct kvm_vcpu *vcpu) | ||
1527 | { | ||
1528 | void *data = val; | ||
1529 | |||
1530 | while (bytes) { | ||
1531 | gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); | ||
1532 | unsigned offset = addr & (PAGE_SIZE-1); | ||
1533 | unsigned tocopy = min(bytes, (unsigned)PAGE_SIZE - offset); | ||
1534 | int ret; | ||
1535 | |||
1536 | if (gpa == UNMAPPED_GVA) | ||
1537 | return X86EMUL_PROPAGATE_FAULT; | ||
1538 | ret = kvm_read_guest(vcpu->kvm, gpa, data, tocopy); | ||
1539 | if (ret < 0) | ||
1540 | return X86EMUL_UNHANDLEABLE; | ||
1541 | |||
1542 | bytes -= tocopy; | ||
1543 | data += tocopy; | ||
1544 | addr += tocopy; | ||
1545 | } | ||
1546 | |||
1547 | return X86EMUL_CONTINUE; | ||
1548 | } | ||
1549 | EXPORT_SYMBOL_GPL(emulator_read_std); | ||
1550 | |||
1551 | static int emulator_read_emulated(unsigned long addr, | ||
1552 | void *val, | ||
1553 | unsigned int bytes, | ||
1554 | struct kvm_vcpu *vcpu) | ||
1555 | { | ||
1556 | struct kvm_io_device *mmio_dev; | ||
1557 | gpa_t gpa; | ||
1558 | |||
1559 | if (vcpu->mmio_read_completed) { | ||
1560 | memcpy(val, vcpu->mmio_data, bytes); | ||
1561 | vcpu->mmio_read_completed = 0; | ||
1562 | return X86EMUL_CONTINUE; | ||
1563 | } | ||
1564 | |||
1565 | gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); | ||
1566 | |||
1567 | /* For APIC access vmexit */ | ||
1568 | if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) | ||
1569 | goto mmio; | ||
1570 | |||
1571 | if (emulator_read_std(addr, val, bytes, vcpu) | ||
1572 | == X86EMUL_CONTINUE) | ||
1573 | return X86EMUL_CONTINUE; | ||
1574 | if (gpa == UNMAPPED_GVA) | ||
1575 | return X86EMUL_PROPAGATE_FAULT; | ||
1576 | |||
1577 | mmio: | ||
1578 | /* | ||
1579 | * Is this MMIO handled locally? | ||
1580 | */ | ||
1581 | mmio_dev = vcpu_find_mmio_dev(vcpu, gpa); | ||
1582 | if (mmio_dev) { | ||
1583 | kvm_iodevice_read(mmio_dev, gpa, bytes, val); | ||
1584 | return X86EMUL_CONTINUE; | ||
1585 | } | ||
1586 | |||
1587 | vcpu->mmio_needed = 1; | ||
1588 | vcpu->mmio_phys_addr = gpa; | ||
1589 | vcpu->mmio_size = bytes; | ||
1590 | vcpu->mmio_is_write = 0; | ||
1591 | |||
1592 | return X86EMUL_UNHANDLEABLE; | ||
1593 | } | ||
1594 | |||
1595 | static int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, | ||
1596 | const void *val, int bytes) | ||
1597 | { | ||
1598 | int ret; | ||
1599 | |||
1600 | ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes); | ||
1601 | if (ret < 0) | ||
1602 | return 0; | ||
1603 | kvm_mmu_pte_write(vcpu, gpa, val, bytes); | ||
1604 | return 1; | ||
1605 | } | ||
1606 | |||
1607 | static int emulator_write_emulated_onepage(unsigned long addr, | ||
1608 | const void *val, | ||
1609 | unsigned int bytes, | ||
1610 | struct kvm_vcpu *vcpu) | ||
1611 | { | ||
1612 | struct kvm_io_device *mmio_dev; | ||
1613 | gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); | ||
1614 | |||
1615 | if (gpa == UNMAPPED_GVA) { | ||
1616 | kvm_inject_page_fault(vcpu, addr, 2); | ||
1617 | return X86EMUL_PROPAGATE_FAULT; | ||
1618 | } | ||
1619 | |||
1620 | /* For APIC access vmexit */ | ||
1621 | if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) | ||
1622 | goto mmio; | ||
1623 | |||
1624 | if (emulator_write_phys(vcpu, gpa, val, bytes)) | ||
1625 | return X86EMUL_CONTINUE; | ||
1626 | |||
1627 | mmio: | ||
1628 | /* | ||
1629 | * Is this MMIO handled locally? | ||
1630 | */ | ||
1631 | mmio_dev = vcpu_find_mmio_dev(vcpu, gpa); | ||
1632 | if (mmio_dev) { | ||
1633 | kvm_iodevice_write(mmio_dev, gpa, bytes, val); | ||
1634 | return X86EMUL_CONTINUE; | ||
1635 | } | ||
1636 | |||
1637 | vcpu->mmio_needed = 1; | ||
1638 | vcpu->mmio_phys_addr = gpa; | ||
1639 | vcpu->mmio_size = bytes; | ||
1640 | vcpu->mmio_is_write = 1; | ||
1641 | memcpy(vcpu->mmio_data, val, bytes); | ||
1642 | |||
1643 | return X86EMUL_CONTINUE; | ||
1644 | } | ||
1645 | |||
1646 | int emulator_write_emulated(unsigned long addr, | ||
1647 | const void *val, | ||
1648 | unsigned int bytes, | ||
1649 | struct kvm_vcpu *vcpu) | ||
1650 | { | ||
1651 | /* Crossing a page boundary? */ | ||
1652 | if (((addr + bytes - 1) ^ addr) & PAGE_MASK) { | ||
1653 | int rc, now; | ||
1654 | |||
1655 | now = -addr & ~PAGE_MASK; | ||
1656 | rc = emulator_write_emulated_onepage(addr, val, now, vcpu); | ||
1657 | if (rc != X86EMUL_CONTINUE) | ||
1658 | return rc; | ||
1659 | addr += now; | ||
1660 | val += now; | ||
1661 | bytes -= now; | ||
1662 | } | ||
1663 | return emulator_write_emulated_onepage(addr, val, bytes, vcpu); | ||
1664 | } | ||
1665 | EXPORT_SYMBOL_GPL(emulator_write_emulated); | ||
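/*
 * Worked example of the split above (hypothetical address, 4K pages): for
 * addr = 0x1ff8 and bytes = 16, now = -addr & ~PAGE_MASK = 8, so the first
 * 8 bytes are written up to the end of the first page and the remaining
 * 8 bytes go through a second emulator_write_emulated_onepage() call.
 */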
1666 | |||
1667 | static int emulator_cmpxchg_emulated(unsigned long addr, | ||
1668 | const void *old, | ||
1669 | const void *new, | ||
1670 | unsigned int bytes, | ||
1671 | struct kvm_vcpu *vcpu) | ||
1672 | { | ||
1673 | static int reported; | ||
1674 | |||
1675 | if (!reported) { | ||
1676 | reported = 1; | ||
1677 | printk(KERN_WARNING "kvm: emulating exchange as write\n"); | ||
1678 | } | ||
1679 | #ifndef CONFIG_X86_64 | ||
1680 | /* a guest's cmpxchg8b has to be emulated atomically */ | ||
1681 | if (bytes == 8) { | ||
1682 | gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); | ||
1683 | struct page *page; | ||
1684 | char *addr; | ||
1685 | u64 val; | ||
1686 | |||
1687 | if (gpa == UNMAPPED_GVA || | ||
1688 | (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) | ||
1689 | goto emul_write; | ||
1690 | |||
1691 | if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK)) | ||
1692 | goto emul_write; | ||
1693 | |||
1694 | val = *(u64 *)new; | ||
1695 | page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); | ||
1696 | addr = kmap_atomic(page, KM_USER0); | ||
1697 | set_64bit((u64 *)(addr + offset_in_page(gpa)), val); | ||
1698 | kunmap_atomic(addr, KM_USER0); | ||
1699 | kvm_release_page_dirty(page); | ||
1700 | } | ||
1701 | emul_write: | ||
1702 | #endif | ||
1703 | |||
1704 | return emulator_write_emulated(addr, new, bytes, vcpu); | ||
1705 | } | ||
1706 | |||
1707 | static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg) | ||
1708 | { | ||
1709 | return kvm_x86_ops->get_segment_base(vcpu, seg); | ||
1710 | } | ||
1711 | |||
1712 | int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address) | ||
1713 | { | ||
1714 | return X86EMUL_CONTINUE; | ||
1715 | } | ||
1716 | |||
1717 | int emulate_clts(struct kvm_vcpu *vcpu) | ||
1718 | { | ||
1719 | kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 & ~X86_CR0_TS); | ||
1720 | return X86EMUL_CONTINUE; | ||
1721 | } | ||
1722 | |||
1723 | int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest) | ||
1724 | { | ||
1725 | struct kvm_vcpu *vcpu = ctxt->vcpu; | ||
1726 | |||
1727 | switch (dr) { | ||
1728 | case 0 ... 3: | ||
1729 | *dest = kvm_x86_ops->get_dr(vcpu, dr); | ||
1730 | return X86EMUL_CONTINUE; | ||
1731 | default: | ||
1732 | pr_unimpl(vcpu, "%s: unexpected dr %u\n", __FUNCTION__, dr); | ||
1733 | return X86EMUL_UNHANDLEABLE; | ||
1734 | } | ||
1735 | } | ||
1736 | |||
1737 | int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value) | ||
1738 | { | ||
1739 | unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U; | ||
1740 | int exception; | ||
1741 | |||
1742 | kvm_x86_ops->set_dr(ctxt->vcpu, dr, value & mask, &exception); | ||
1743 | if (exception) { | ||
1744 | /* FIXME: better handling */ | ||
1745 | return X86EMUL_UNHANDLEABLE; | ||
1746 | } | ||
1747 | return X86EMUL_CONTINUE; | ||
1748 | } | ||
1749 | |||
1750 | void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context) | ||
1751 | { | ||
1752 | static int reported; | ||
1753 | u8 opcodes[4]; | ||
1754 | unsigned long rip = vcpu->arch.rip; | ||
1755 | unsigned long rip_linear; | ||
1756 | |||
1757 | rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS); | ||
1758 | |||
1759 | if (reported) | ||
1760 | return; | ||
1761 | |||
1762 | emulator_read_std(rip_linear, (void *)opcodes, 4, vcpu); | ||
1763 | |||
1764 | printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n", | ||
1765 | context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]); | ||
1766 | reported = 1; | ||
1767 | } | ||
1768 | EXPORT_SYMBOL_GPL(kvm_report_emulation_failure); | ||
1769 | |||
1770 | struct x86_emulate_ops emulate_ops = { | ||
1771 | .read_std = emulator_read_std, | ||
1772 | .read_emulated = emulator_read_emulated, | ||
1773 | .write_emulated = emulator_write_emulated, | ||
1774 | .cmpxchg_emulated = emulator_cmpxchg_emulated, | ||
1775 | }; | ||
1776 | |||
1777 | int emulate_instruction(struct kvm_vcpu *vcpu, | ||
1778 | struct kvm_run *run, | ||
1779 | unsigned long cr2, | ||
1780 | u16 error_code, | ||
1781 | int no_decode) | ||
1782 | { | ||
1783 | int r; | ||
1784 | |||
1785 | vcpu->arch.mmio_fault_cr2 = cr2; | ||
1786 | kvm_x86_ops->cache_regs(vcpu); | ||
1787 | |||
1788 | vcpu->mmio_is_write = 0; | ||
1789 | vcpu->arch.pio.string = 0; | ||
1790 | |||
1791 | if (!no_decode) { | ||
1792 | int cs_db, cs_l; | ||
1793 | kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); | ||
1794 | |||
1795 | vcpu->arch.emulate_ctxt.vcpu = vcpu; | ||
1796 | vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu); | ||
1797 | vcpu->arch.emulate_ctxt.mode = | ||
1798 | (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM) | ||
1799 | ? X86EMUL_MODE_REAL : cs_l | ||
1800 | ? X86EMUL_MODE_PROT64 : cs_db | ||
1801 | ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; | ||
1802 | |||
1803 | if (vcpu->arch.emulate_ctxt.mode == X86EMUL_MODE_PROT64) { | ||
1804 | vcpu->arch.emulate_ctxt.cs_base = 0; | ||
1805 | vcpu->arch.emulate_ctxt.ds_base = 0; | ||
1806 | vcpu->arch.emulate_ctxt.es_base = 0; | ||
1807 | vcpu->arch.emulate_ctxt.ss_base = 0; | ||
1808 | } else { | ||
1809 | vcpu->arch.emulate_ctxt.cs_base = | ||
1810 | get_segment_base(vcpu, VCPU_SREG_CS); | ||
1811 | vcpu->arch.emulate_ctxt.ds_base = | ||
1812 | get_segment_base(vcpu, VCPU_SREG_DS); | ||
1813 | vcpu->arch.emulate_ctxt.es_base = | ||
1814 | get_segment_base(vcpu, VCPU_SREG_ES); | ||
1815 | vcpu->arch.emulate_ctxt.ss_base = | ||
1816 | get_segment_base(vcpu, VCPU_SREG_SS); | ||
1817 | } | ||
1818 | |||
1819 | vcpu->arch.emulate_ctxt.gs_base = | ||
1820 | get_segment_base(vcpu, VCPU_SREG_GS); | ||
1821 | vcpu->arch.emulate_ctxt.fs_base = | ||
1822 | get_segment_base(vcpu, VCPU_SREG_FS); | ||
1823 | |||
1824 | r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); | ||
1825 | ++vcpu->stat.insn_emulation; | ||
1826 | if (r) { | ||
1827 | ++vcpu->stat.insn_emulation_fail; | ||
1828 | if (kvm_mmu_unprotect_page_virt(vcpu, cr2)) | ||
1829 | return EMULATE_DONE; | ||
1830 | return EMULATE_FAIL; | ||
1831 | } | ||
1832 | } | ||
1833 | |||
1834 | r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); | ||
1835 | |||
1836 | if (vcpu->arch.pio.string) | ||
1837 | return EMULATE_DO_MMIO; | ||
1838 | |||
1839 | if ((r || vcpu->mmio_is_write) && run) { | ||
1840 | run->exit_reason = KVM_EXIT_MMIO; | ||
1841 | run->mmio.phys_addr = vcpu->mmio_phys_addr; | ||
1842 | memcpy(run->mmio.data, vcpu->mmio_data, 8); | ||
1843 | run->mmio.len = vcpu->mmio_size; | ||
1844 | run->mmio.is_write = vcpu->mmio_is_write; | ||
1845 | } | ||
1846 | |||
1847 | if (r) { | ||
1848 | if (kvm_mmu_unprotect_page_virt(vcpu, cr2)) | ||
1849 | return EMULATE_DONE; | ||
1850 | if (!vcpu->mmio_needed) { | ||
1851 | kvm_report_emulation_failure(vcpu, "mmio"); | ||
1852 | return EMULATE_FAIL; | ||
1853 | } | ||
1854 | return EMULATE_DO_MMIO; | ||
1855 | } | ||
1856 | |||
1857 | kvm_x86_ops->decache_regs(vcpu); | ||
1858 | kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); | ||
1859 | |||
1860 | if (vcpu->mmio_is_write) { | ||
1861 | vcpu->mmio_needed = 0; | ||
1862 | return EMULATE_DO_MMIO; | ||
1863 | } | ||
1864 | |||
1865 | return EMULATE_DONE; | ||
1866 | } | ||
1867 | EXPORT_SYMBOL_GPL(emulate_instruction); | ||
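/*
 * Summary of the mode selection above: EFLAGS.VM set -> X86EMUL_MODE_REAL;
 * otherwise CS.L set -> X86EMUL_MODE_PROT64; otherwise CS.DB set ->
 * X86EMUL_MODE_PROT32; otherwise X86EMUL_MODE_PROT16.
 */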
1868 | |||
1869 | static void free_pio_guest_pages(struct kvm_vcpu *vcpu) | ||
1870 | { | ||
1871 | int i; | ||
1872 | |||
1873 | for (i = 0; i < ARRAY_SIZE(vcpu->arch.pio.guest_pages); ++i) | ||
1874 | if (vcpu->arch.pio.guest_pages[i]) { | ||
1875 | kvm_release_page_dirty(vcpu->arch.pio.guest_pages[i]); | ||
1876 | vcpu->arch.pio.guest_pages[i] = NULL; | ||
1877 | } | ||
1878 | } | ||
1879 | |||
1880 | static int pio_copy_data(struct kvm_vcpu *vcpu) | ||
1881 | { | ||
1882 | void *p = vcpu->arch.pio_data; | ||
1883 | void *q; | ||
1884 | unsigned bytes; | ||
1885 | int nr_pages = vcpu->arch.pio.guest_pages[1] ? 2 : 1; | ||
1886 | |||
1887 | q = vmap(vcpu->arch.pio.guest_pages, nr_pages, VM_READ|VM_WRITE, | ||
1888 | PAGE_KERNEL); | ||
1889 | if (!q) { | ||
1890 | free_pio_guest_pages(vcpu); | ||
1891 | return -ENOMEM; | ||
1892 | } | ||
1893 | q += vcpu->arch.pio.guest_page_offset; | ||
1894 | bytes = vcpu->arch.pio.size * vcpu->arch.pio.cur_count; | ||
1895 | if (vcpu->arch.pio.in) | ||
1896 | memcpy(q, p, bytes); | ||
1897 | else | ||
1898 | memcpy(p, q, bytes); | ||
1899 | q -= vcpu->arch.pio.guest_page_offset; | ||
1900 | vunmap(q); | ||
1901 | free_pio_guest_pages(vcpu); | ||
1902 | return 0; | ||
1903 | } | ||
1904 | |||
1905 | int complete_pio(struct kvm_vcpu *vcpu) | ||
1906 | { | ||
1907 | struct kvm_pio_request *io = &vcpu->arch.pio; | ||
1908 | long delta; | ||
1909 | int r; | ||
1910 | |||
1911 | kvm_x86_ops->cache_regs(vcpu); | ||
1912 | |||
1913 | if (!io->string) { | ||
1914 | if (io->in) | ||
1915 | memcpy(&vcpu->arch.regs[VCPU_REGS_RAX], vcpu->arch.pio_data, | ||
1916 | io->size); | ||
1917 | } else { | ||
1918 | if (io->in) { | ||
1919 | r = pio_copy_data(vcpu); | ||
1920 | if (r) { | ||
1921 | kvm_x86_ops->cache_regs(vcpu); | ||
1922 | return r; | ||
1923 | } | ||
1924 | } | ||
1925 | |||
1926 | delta = 1; | ||
1927 | if (io->rep) { | ||
1928 | delta *= io->cur_count; | ||
1929 | /* | ||
1930 | * The size of the register should really depend on | ||
1931 | * current address size. | ||
1932 | */ | ||
1933 | vcpu->arch.regs[VCPU_REGS_RCX] -= delta; | ||
1934 | } | ||
1935 | if (io->down) | ||
1936 | delta = -delta; | ||
1937 | delta *= io->size; | ||
1938 | if (io->in) | ||
1939 | vcpu->arch.regs[VCPU_REGS_RDI] += delta; | ||
1940 | else | ||
1941 | vcpu->arch.regs[VCPU_REGS_RSI] += delta; | ||
1942 | } | ||
1943 | |||
1944 | kvm_x86_ops->decache_regs(vcpu); | ||
1945 | |||
1946 | io->count -= io->cur_count; | ||
1947 | io->cur_count = 0; | ||
1948 | |||
1949 | return 0; | ||
1950 | } | ||
1951 | |||
1952 | static void kernel_pio(struct kvm_io_device *pio_dev, | ||
1953 | struct kvm_vcpu *vcpu, | ||
1954 | void *pd) | ||
1955 | { | ||
1956 | /* TODO: String I/O for in kernel device */ | ||
1957 | |||
1958 | mutex_lock(&vcpu->kvm->lock); | ||
1959 | if (vcpu->arch.pio.in) | ||
1960 | kvm_iodevice_read(pio_dev, vcpu->arch.pio.port, | ||
1961 | vcpu->arch.pio.size, | ||
1962 | pd); | ||
1963 | else | ||
1964 | kvm_iodevice_write(pio_dev, vcpu->arch.pio.port, | ||
1965 | vcpu->arch.pio.size, | ||
1966 | pd); | ||
1967 | mutex_unlock(&vcpu->kvm->lock); | ||
1968 | } | ||
1969 | |||
1970 | static void pio_string_write(struct kvm_io_device *pio_dev, | ||
1971 | struct kvm_vcpu *vcpu) | ||
1972 | { | ||
1973 | struct kvm_pio_request *io = &vcpu->arch.pio; | ||
1974 | void *pd = vcpu->arch.pio_data; | ||
1975 | int i; | ||
1976 | |||
1977 | mutex_lock(&vcpu->kvm->lock); | ||
1978 | for (i = 0; i < io->cur_count; i++) { | ||
1979 | kvm_iodevice_write(pio_dev, io->port, | ||
1980 | io->size, | ||
1981 | pd); | ||
1982 | pd += io->size; | ||
1983 | } | ||
1984 | mutex_unlock(&vcpu->kvm->lock); | ||
1985 | } | ||
1986 | |||
1987 | static struct kvm_io_device *vcpu_find_pio_dev(struct kvm_vcpu *vcpu, | ||
1988 | gpa_t addr) | ||
1989 | { | ||
1990 | return kvm_io_bus_find_dev(&vcpu->kvm->pio_bus, addr); | ||
1991 | } | ||
1992 | |||
1993 | int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, | ||
1994 | int size, unsigned port) | ||
1995 | { | ||
1996 | struct kvm_io_device *pio_dev; | ||
1997 | |||
1998 | vcpu->run->exit_reason = KVM_EXIT_IO; | ||
1999 | vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; | ||
2000 | vcpu->run->io.size = vcpu->arch.pio.size = size; | ||
2001 | vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE; | ||
2002 | vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = 1; | ||
2003 | vcpu->run->io.port = vcpu->arch.pio.port = port; | ||
2004 | vcpu->arch.pio.in = in; | ||
2005 | vcpu->arch.pio.string = 0; | ||
2006 | vcpu->arch.pio.down = 0; | ||
2007 | vcpu->arch.pio.guest_page_offset = 0; | ||
2008 | vcpu->arch.pio.rep = 0; | ||
2009 | |||
2010 | kvm_x86_ops->cache_regs(vcpu); | ||
2011 | memcpy(vcpu->arch.pio_data, &vcpu->arch.regs[VCPU_REGS_RAX], 4); | ||
2012 | kvm_x86_ops->decache_regs(vcpu); | ||
2013 | |||
2014 | kvm_x86_ops->skip_emulated_instruction(vcpu); | ||
2015 | |||
2016 | pio_dev = vcpu_find_pio_dev(vcpu, port); | ||
2017 | if (pio_dev) { | ||
2018 | kernel_pio(pio_dev, vcpu, vcpu->arch.pio_data); | ||
2019 | complete_pio(vcpu); | ||
2020 | return 1; | ||
2021 | } | ||
2022 | return 0; | ||
2023 | } | ||
2024 | EXPORT_SYMBOL_GPL(kvm_emulate_pio); | ||
2025 | |||
2026 | int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, | ||
2027 | int size, unsigned long count, int down, | ||
2028 | gva_t address, int rep, unsigned port) | ||
2029 | { | ||
2030 | unsigned now, in_page; | ||
2031 | int i, ret = 0; | ||
2032 | int nr_pages = 1; | ||
2033 | struct page *page; | ||
2034 | struct kvm_io_device *pio_dev; | ||
2035 | |||
2036 | vcpu->run->exit_reason = KVM_EXIT_IO; | ||
2037 | vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; | ||
2038 | vcpu->run->io.size = vcpu->arch.pio.size = size; | ||
2039 | vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE; | ||
2040 | vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = count; | ||
2041 | vcpu->run->io.port = vcpu->arch.pio.port = port; | ||
2042 | vcpu->arch.pio.in = in; | ||
2043 | vcpu->arch.pio.string = 1; | ||
2044 | vcpu->arch.pio.down = down; | ||
2045 | vcpu->arch.pio.guest_page_offset = offset_in_page(address); | ||
2046 | vcpu->arch.pio.rep = rep; | ||
2047 | |||
2048 | if (!count) { | ||
2049 | kvm_x86_ops->skip_emulated_instruction(vcpu); | ||
2050 | return 1; | ||
2051 | } | ||
2052 | |||
2053 | if (!down) | ||
2054 | in_page = PAGE_SIZE - offset_in_page(address); | ||
2055 | else | ||
2056 | in_page = offset_in_page(address) + size; | ||
2057 | now = min(count, (unsigned long)in_page / size); | ||
2058 | if (!now) { | ||
2059 | /* | ||
2060 | * String I/O straddles page boundary. Pin two guest pages | ||
2061 | * so that we satisfy atomicity constraints. Do just one | ||
2062 | * transaction to avoid complexity. | ||
2063 | */ | ||
2064 | nr_pages = 2; | ||
2065 | now = 1; | ||
2066 | } | ||
2067 | if (down) { | ||
2068 | /* | ||
2069 | * String I/O in reverse. Yuck. Kill the guest, fix later. | ||
2070 | */ | ||
2071 | pr_unimpl(vcpu, "guest string pio down\n"); | ||
2072 | kvm_inject_gp(vcpu, 0); | ||
2073 | return 1; | ||
2074 | } | ||
2075 | vcpu->run->io.count = now; | ||
2076 | vcpu->arch.pio.cur_count = now; | ||
2077 | |||
2078 | if (vcpu->arch.pio.cur_count == vcpu->arch.pio.count) | ||
2079 | kvm_x86_ops->skip_emulated_instruction(vcpu); | ||
2080 | |||
2081 | for (i = 0; i < nr_pages; ++i) { | ||
2082 | mutex_lock(&vcpu->kvm->lock); | ||
2083 | page = gva_to_page(vcpu, address + i * PAGE_SIZE); | ||
2084 | vcpu->arch.pio.guest_pages[i] = page; | ||
2085 | mutex_unlock(&vcpu->kvm->lock); | ||
2086 | if (!page) { | ||
2087 | kvm_inject_gp(vcpu, 0); | ||
2088 | free_pio_guest_pages(vcpu); | ||
2089 | return 1; | ||
2090 | } | ||
2091 | } | ||
2092 | |||
2093 | pio_dev = vcpu_find_pio_dev(vcpu, port); | ||
2094 | if (!vcpu->arch.pio.in) { | ||
2095 | /* string PIO write */ | ||
2096 | ret = pio_copy_data(vcpu); | ||
2097 | if (ret >= 0 && pio_dev) { | ||
2098 | pio_string_write(pio_dev, vcpu); | ||
2099 | complete_pio(vcpu); | ||
2100 | if (vcpu->arch.pio.count == 0) | ||
2101 | ret = 1; | ||
2102 | } | ||
2103 | } else if (pio_dev) | ||
2104 | pr_unimpl(vcpu, "no string pio read support yet, " | ||
2105 | "port %x size %d count %ld\n", | ||
2106 | port, size, count); | ||
2107 | |||
2108 | return ret; | ||
2109 | } | ||
2110 | EXPORT_SYMBOL_GPL(kvm_emulate_pio_string); | ||
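/*
 * Example of the page-straddle handling above (hypothetical values, 4K
 * pages): an "outsw" (size 2) at a guest address with page offset 0xffe and
 * count 4 gives in_page = PAGE_SIZE - 0xffe = 2, so now = min(4, 2 / 2) = 1
 * and a single element is transferred; had the offset been 0xfff,
 * in_page / size would be 0, two guest pages would be pinned and now forced
 * to 1.
 */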
2111 | |||
2112 | int kvm_arch_init(void *opaque) | ||
2113 | { | ||
2114 | int r; | ||
2115 | struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque; | ||
2116 | |||
2117 | r = kvm_mmu_module_init(); | ||
2118 | if (r) | ||
2119 | goto out_fail; | ||
2120 | |||
2121 | kvm_init_msr_list(); | ||
2122 | |||
2123 | if (kvm_x86_ops) { | ||
2124 | printk(KERN_ERR "kvm: already loaded the other module\n"); | ||
2125 | r = -EEXIST; | ||
2126 | goto out; | ||
2127 | } | ||
2128 | |||
2129 | if (!ops->cpu_has_kvm_support()) { | ||
2130 | printk(KERN_ERR "kvm: no hardware support\n"); | ||
2131 | r = -EOPNOTSUPP; | ||
2132 | goto out; | ||
2133 | } | ||
2134 | if (ops->disabled_by_bios()) { | ||
2135 | printk(KERN_ERR "kvm: disabled by bios\n"); | ||
2136 | r = -EOPNOTSUPP; | ||
2137 | goto out; | ||
2138 | } | ||
2139 | |||
2140 | kvm_x86_ops = ops; | ||
2141 | kvm_mmu_set_nonpresent_ptes(0ull, 0ull); | ||
2142 | return 0; | ||
2143 | |||
2144 | out: | ||
2145 | kvm_mmu_module_exit(); | ||
2146 | out_fail: | ||
2147 | return r; | ||
2148 | } | ||
2149 | |||
2150 | void kvm_arch_exit(void) | ||
2151 | { | ||
2152 | kvm_x86_ops = NULL; | ||
2153 | kvm_mmu_module_exit(); | ||
2154 | } | ||
2155 | |||
2156 | int kvm_emulate_halt(struct kvm_vcpu *vcpu) | ||
2157 | { | ||
2158 | ++vcpu->stat.halt_exits; | ||
2159 | if (irqchip_in_kernel(vcpu->kvm)) { | ||
2160 | vcpu->arch.mp_state = VCPU_MP_STATE_HALTED; | ||
2161 | kvm_vcpu_block(vcpu); | ||
2162 | if (vcpu->arch.mp_state != VCPU_MP_STATE_RUNNABLE) | ||
2163 | return -EINTR; | ||
2164 | return 1; | ||
2165 | } else { | ||
2166 | vcpu->run->exit_reason = KVM_EXIT_HLT; | ||
2167 | return 0; | ||
2168 | } | ||
2169 | } | ||
2170 | EXPORT_SYMBOL_GPL(kvm_emulate_halt); | ||
2171 | |||
2172 | int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) | ||
2173 | { | ||
2174 | unsigned long nr, a0, a1, a2, a3, ret; | ||
2175 | |||
2176 | kvm_x86_ops->cache_regs(vcpu); | ||
2177 | |||
2178 | nr = vcpu->arch.regs[VCPU_REGS_RAX]; | ||
2179 | a0 = vcpu->arch.regs[VCPU_REGS_RBX]; | ||
2180 | a1 = vcpu->arch.regs[VCPU_REGS_RCX]; | ||
2181 | a2 = vcpu->arch.regs[VCPU_REGS_RDX]; | ||
2182 | a3 = vcpu->arch.regs[VCPU_REGS_RSI]; | ||
2183 | |||
2184 | if (!is_long_mode(vcpu)) { | ||
2185 | nr &= 0xFFFFFFFF; | ||
2186 | a0 &= 0xFFFFFFFF; | ||
2187 | a1 &= 0xFFFFFFFF; | ||
2188 | a2 &= 0xFFFFFFFF; | ||
2189 | a3 &= 0xFFFFFFFF; | ||
2190 | } | ||
2191 | |||
2192 | switch (nr) { | ||
2193 | default: | ||
2194 | ret = -KVM_ENOSYS; | ||
2195 | break; | ||
2196 | } | ||
2197 | vcpu->arch.regs[VCPU_REGS_RAX] = ret; | ||
2198 | kvm_x86_ops->decache_regs(vcpu); | ||
2199 | return 0; | ||
2200 | } | ||
2201 | EXPORT_SYMBOL_GPL(kvm_emulate_hypercall); | ||
2202 | |||
2203 | int kvm_fix_hypercall(struct kvm_vcpu *vcpu) | ||
2204 | { | ||
2205 | char instruction[3]; | ||
2206 | int ret = 0; | ||
2207 | |||
2208 | mutex_lock(&vcpu->kvm->lock); | ||
2209 | |||
2210 | /* | ||
2211 | * Blow out the MMU so that no other VCPU keeps a stale mapping; this | ||
2212 | * ensures that the updated hypercall appears atomically across all | ||
2213 | * VCPUs. | ||
2214 | */ | ||
2215 | kvm_mmu_zap_all(vcpu->kvm); | ||
2216 | |||
2217 | kvm_x86_ops->cache_regs(vcpu); | ||
2218 | kvm_x86_ops->patch_hypercall(vcpu, instruction); | ||
2219 | if (emulator_write_emulated(vcpu->arch.rip, instruction, 3, vcpu) | ||
2220 | != X86EMUL_CONTINUE) | ||
2221 | ret = -EFAULT; | ||
2222 | |||
2223 | mutex_unlock(&vcpu->kvm->lock); | ||
2224 | |||
2225 | return ret; | ||
2226 | } | ||
2227 | |||
2228 | static u64 mk_cr_64(u64 curr_cr, u32 new_val) | ||
2229 | { | ||
2230 | return (curr_cr & ~((1ULL << 32) - 1)) | new_val; | ||
2231 | } | ||
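/*
 * Example (hypothetical values): mk_cr_64(0x0000000500000011, 0x80000001)
 * keeps the upper half of the current value and replaces the lower half,
 * yielding 0x0000000580000001.
 */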
2232 | |||
2233 | void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base) | ||
2234 | { | ||
2235 | struct descriptor_table dt = { limit, base }; | ||
2236 | |||
2237 | kvm_x86_ops->set_gdt(vcpu, &dt); | ||
2238 | } | ||
2239 | |||
2240 | void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base) | ||
2241 | { | ||
2242 | struct descriptor_table dt = { limit, base }; | ||
2243 | |||
2244 | kvm_x86_ops->set_idt(vcpu, &dt); | ||
2245 | } | ||
2246 | |||
2247 | void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw, | ||
2248 | unsigned long *rflags) | ||
2249 | { | ||
2250 | lmsw(vcpu, msw); | ||
2251 | *rflags = kvm_x86_ops->get_rflags(vcpu); | ||
2252 | } | ||
2253 | |||
2254 | unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr) | ||
2255 | { | ||
2256 | kvm_x86_ops->decache_cr4_guest_bits(vcpu); | ||
2257 | switch (cr) { | ||
2258 | case 0: | ||
2259 | return vcpu->arch.cr0; | ||
2260 | case 2: | ||
2261 | return vcpu->arch.cr2; | ||
2262 | case 3: | ||
2263 | return vcpu->arch.cr3; | ||
2264 | case 4: | ||
2265 | return vcpu->arch.cr4; | ||
2266 | case 8: | ||
2267 | return get_cr8(vcpu); | ||
2268 | default: | ||
2269 | vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr); | ||
2270 | return 0; | ||
2271 | } | ||
2272 | } | ||
2273 | |||
2274 | void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val, | ||
2275 | unsigned long *rflags) | ||
2276 | { | ||
2277 | switch (cr) { | ||
2278 | case 0: | ||
2279 | set_cr0(vcpu, mk_cr_64(vcpu->arch.cr0, val)); | ||
2280 | *rflags = kvm_x86_ops->get_rflags(vcpu); | ||
2281 | break; | ||
2282 | case 2: | ||
2283 | vcpu->arch.cr2 = val; | ||
2284 | break; | ||
2285 | case 3: | ||
2286 | set_cr3(vcpu, val); | ||
2287 | break; | ||
2288 | case 4: | ||
2289 | set_cr4(vcpu, mk_cr_64(vcpu->arch.cr4, val)); | ||
2290 | break; | ||
2291 | case 8: | ||
2292 | set_cr8(vcpu, val & 0xfUL); | ||
2293 | break; | ||
2294 | default: | ||
2295 | vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr); | ||
2296 | } | ||
2297 | } | ||
2298 | |||
2299 | static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i) | ||
2300 | { | ||
2301 | struct kvm_cpuid_entry2 *e = &vcpu->arch.cpuid_entries[i]; | ||
2302 | int j, nent = vcpu->arch.cpuid_nent; | ||
2303 | |||
2304 | e->flags &= ~KVM_CPUID_FLAG_STATE_READ_NEXT; | ||
2305 | /* when no next entry is found, the current entry[i] is reselected */ | ||
2306 | for (j = i + 1; ; j = (j + 1) % nent) { | ||
2307 | struct kvm_cpuid_entry2 *ej = &vcpu->arch.cpuid_entries[j]; | ||
2308 | if (ej->function == e->function) { | ||
2309 | ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT; | ||
2310 | return j; | ||
2311 | } | ||
2312 | } | ||
2313 | return 0; /* silence gcc, even though control never reaches here */ | ||
2314 | } | ||
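/*
 * Example of the circular scan above (hypothetical table): with entries for
 * functions {2, 2, 4} at indices 0, 1 and 2, a stateful read of function 2
 * starting at i = 0 marks index 1 as READ_NEXT; starting at i = 1 the scan
 * skips index 2, wraps around and selects index 0.
 */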
2315 | |||
2316 | /* find an entry with matching function, matching index (if needed), and that | ||
2317 | * should be read next (if it's stateful) */ | ||
2318 | static int is_matching_cpuid_entry(struct kvm_cpuid_entry2 *e, | ||
2319 | u32 function, u32 index) | ||
2320 | { | ||
2321 | if (e->function != function) | ||
2322 | return 0; | ||
2323 | if ((e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) && e->index != index) | ||
2324 | return 0; | ||
2325 | if ((e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) && | ||
2326 | !(e->flags & KVM_CPUID_FLAG_STATE_READ_NEXT)) | ||
2327 | return 0; | ||
2328 | return 1; | ||
2329 | } | ||
2330 | |||
2331 | void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) | ||
2332 | { | ||
2333 | int i; | ||
2334 | u32 function, index; | ||
2335 | struct kvm_cpuid_entry2 *e, *best; | ||
2336 | |||
2337 | kvm_x86_ops->cache_regs(vcpu); | ||
2338 | function = vcpu->arch.regs[VCPU_REGS_RAX]; | ||
2339 | index = vcpu->arch.regs[VCPU_REGS_RCX]; | ||
2340 | vcpu->arch.regs[VCPU_REGS_RAX] = 0; | ||
2341 | vcpu->arch.regs[VCPU_REGS_RBX] = 0; | ||
2342 | vcpu->arch.regs[VCPU_REGS_RCX] = 0; | ||
2343 | vcpu->arch.regs[VCPU_REGS_RDX] = 0; | ||
2344 | best = NULL; | ||
2345 | for (i = 0; i < vcpu->arch.cpuid_nent; ++i) { | ||
2346 | e = &vcpu->arch.cpuid_entries[i]; | ||
2347 | if (is_matching_cpuid_entry(e, function, index)) { | ||
2348 | if (e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) | ||
2349 | move_to_next_stateful_cpuid_entry(vcpu, i); | ||
2350 | best = e; | ||
2351 | break; | ||
2352 | } | ||
2353 | /* | ||
2354 | * Both basic or both extended? | ||
2355 | */ | ||
2356 | if (((e->function ^ function) & 0x80000000) == 0) | ||
2357 | if (!best || e->function > best->function) | ||
2358 | best = e; | ||
2359 | } | ||
2360 | if (best) { | ||
2361 | vcpu->arch.regs[VCPU_REGS_RAX] = best->eax; | ||
2362 | vcpu->arch.regs[VCPU_REGS_RBX] = best->ebx; | ||
2363 | vcpu->arch.regs[VCPU_REGS_RCX] = best->ecx; | ||
2364 | vcpu->arch.regs[VCPU_REGS_RDX] = best->edx; | ||
2365 | } | ||
2366 | kvm_x86_ops->decache_regs(vcpu); | ||
2367 | kvm_x86_ops->skip_emulated_instruction(vcpu); | ||
2368 | } | ||
2369 | EXPORT_SYMBOL_GPL(kvm_emulate_cpuid); | ||
2370 | |||
2371 | /* | ||
2372 | * Check if userspace requested an interrupt window, and that the | ||
2373 | * interrupt window is open. | ||
2374 | * | ||
2375 | * No need to exit to userspace if we already have an interrupt queued. | ||
2376 | */ | ||
2377 | static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu, | ||
2378 | struct kvm_run *kvm_run) | ||
2379 | { | ||
2380 | return (!vcpu->arch.irq_summary && | ||
2381 | kvm_run->request_interrupt_window && | ||
2382 | vcpu->arch.interrupt_window_open && | ||
2383 | (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF)); | ||
2384 | } | ||
2385 | |||
2386 | static void post_kvm_run_save(struct kvm_vcpu *vcpu, | ||
2387 | struct kvm_run *kvm_run) | ||
2388 | { | ||
2389 | kvm_run->if_flag = (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF) != 0; | ||
2390 | kvm_run->cr8 = get_cr8(vcpu); | ||
2391 | kvm_run->apic_base = kvm_get_apic_base(vcpu); | ||
2392 | if (irqchip_in_kernel(vcpu->kvm)) | ||
2393 | kvm_run->ready_for_interrupt_injection = 1; | ||
2394 | else | ||
2395 | kvm_run->ready_for_interrupt_injection = | ||
2396 | (vcpu->arch.interrupt_window_open && | ||
2397 | vcpu->arch.irq_summary == 0); | ||
2398 | } | ||
2399 | |||
2400 | static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | ||
2401 | { | ||
2402 | int r; | ||
2403 | |||
2404 | if (unlikely(vcpu->arch.mp_state == VCPU_MP_STATE_SIPI_RECEIVED)) { | ||
2405 | pr_debug("vcpu %d received sipi with vector # %x\n", | ||
2406 | vcpu->vcpu_id, vcpu->arch.sipi_vector); | ||
2407 | kvm_lapic_reset(vcpu); | ||
2408 | r = kvm_x86_ops->vcpu_reset(vcpu); | ||
2409 | if (r) | ||
2410 | return r; | ||
2411 | vcpu->arch.mp_state = VCPU_MP_STATE_RUNNABLE; | ||
2412 | } | ||
2413 | |||
2414 | preempted: | ||
2415 | if (vcpu->guest_debug.enabled) | ||
2416 | kvm_x86_ops->guest_debug_pre(vcpu); | ||
2417 | |||
2418 | again: | ||
2419 | r = kvm_mmu_reload(vcpu); | ||
2420 | if (unlikely(r)) | ||
2421 | goto out; | ||
2422 | |||
2423 | kvm_inject_pending_timer_irqs(vcpu); | ||
2424 | |||
2425 | preempt_disable(); | ||
2426 | |||
2427 | kvm_x86_ops->prepare_guest_switch(vcpu); | ||
2428 | kvm_load_guest_fpu(vcpu); | ||
2429 | |||
2430 | local_irq_disable(); | ||
2431 | |||
2432 | if (signal_pending(current)) { | ||
2433 | local_irq_enable(); | ||
2434 | preempt_enable(); | ||
2435 | r = -EINTR; | ||
2436 | kvm_run->exit_reason = KVM_EXIT_INTR; | ||
2437 | ++vcpu->stat.signal_exits; | ||
2438 | goto out; | ||
2439 | } | ||
2440 | |||
2441 | if (vcpu->arch.exception.pending) | ||
2442 | __queue_exception(vcpu); | ||
2443 | else if (irqchip_in_kernel(vcpu->kvm)) | ||
2444 | kvm_x86_ops->inject_pending_irq(vcpu); | ||
2445 | else | ||
2446 | kvm_x86_ops->inject_pending_vectors(vcpu, kvm_run); | ||
2447 | |||
2448 | vcpu->guest_mode = 1; | ||
2449 | kvm_guest_enter(); | ||
2450 | |||
2451 | if (vcpu->requests) | ||
2452 | if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests)) | ||
2453 | kvm_x86_ops->tlb_flush(vcpu); | ||
2454 | |||
2455 | kvm_x86_ops->run(vcpu, kvm_run); | ||
2456 | |||
2457 | vcpu->guest_mode = 0; | ||
2458 | local_irq_enable(); | ||
2459 | |||
2460 | ++vcpu->stat.exits; | ||
2461 | |||
2462 | /* | ||
2463 | * We must have an instruction between local_irq_enable() and | ||
2464 | * kvm_guest_exit(), so the timer interrupt isn't delayed by | ||
2465 | * the interrupt shadow. The stat.exits increment will do nicely. | ||
2466 | * But we need to prevent reordering, hence this barrier(): | ||
2467 | */ | ||
2468 | barrier(); | ||
2469 | |||
2470 | kvm_guest_exit(); | ||
2471 | |||
2472 | preempt_enable(); | ||
2473 | |||
2474 | /* | ||
2475 | * Profile KVM exit RIPs: | ||
2476 | */ | ||
2477 | if (unlikely(prof_on == KVM_PROFILING)) { | ||
2478 | kvm_x86_ops->cache_regs(vcpu); | ||
2479 | profile_hit(KVM_PROFILING, (void *)vcpu->arch.rip); | ||
2480 | } | ||
2481 | |||
2482 | if (vcpu->arch.exception.pending && kvm_x86_ops->exception_injected(vcpu)) | ||
2483 | vcpu->arch.exception.pending = false; | ||
2484 | |||
2485 | r = kvm_x86_ops->handle_exit(kvm_run, vcpu); | ||
2486 | |||
2487 | if (r > 0) { | ||
2488 | if (dm_request_for_irq_injection(vcpu, kvm_run)) { | ||
2489 | r = -EINTR; | ||
2490 | kvm_run->exit_reason = KVM_EXIT_INTR; | ||
2491 | ++vcpu->stat.request_irq_exits; | ||
2492 | goto out; | ||
2493 | } | ||
2494 | if (!need_resched()) | ||
2495 | goto again; | ||
2496 | } | ||
2497 | |||
2498 | out: | ||
2499 | if (r > 0) { | ||
2500 | kvm_resched(vcpu); | ||
2501 | goto preempted; | ||
2502 | } | ||
2503 | |||
2504 | post_kvm_run_save(vcpu, kvm_run); | ||
2505 | |||
2506 | return r; | ||
2507 | } | ||
2508 | |||
2509 | int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | ||
2510 | { | ||
2511 | int r; | ||
2512 | sigset_t sigsaved; | ||
2513 | |||
2514 | vcpu_load(vcpu); | ||
2515 | |||
2516 | if (unlikely(vcpu->arch.mp_state == VCPU_MP_STATE_UNINITIALIZED)) { | ||
2517 | kvm_vcpu_block(vcpu); | ||
2518 | vcpu_put(vcpu); | ||
2519 | return -EAGAIN; | ||
2520 | } | ||
2521 | |||
2522 | if (vcpu->sigset_active) | ||
2523 | sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved); | ||
2524 | |||
2525 | /* re-sync apic's tpr */ | ||
2526 | if (!irqchip_in_kernel(vcpu->kvm)) | ||
2527 | set_cr8(vcpu, kvm_run->cr8); | ||
2528 | |||
2529 | if (vcpu->arch.pio.cur_count) { | ||
2530 | r = complete_pio(vcpu); | ||
2531 | if (r) | ||
2532 | goto out; | ||
2533 | } | ||
2534 | #if CONFIG_HAS_IOMEM | ||
2535 | if (vcpu->mmio_needed) { | ||
2536 | memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8); | ||
2537 | vcpu->mmio_read_completed = 1; | ||
2538 | vcpu->mmio_needed = 0; | ||
2539 | r = emulate_instruction(vcpu, kvm_run, | ||
2540 | vcpu->arch.mmio_fault_cr2, 0, 1); | ||
2541 | if (r == EMULATE_DO_MMIO) { | ||
2542 | /* | ||
2543 | * Read-modify-write. Back to userspace. | ||
2544 | */ | ||
2545 | r = 0; | ||
2546 | goto out; | ||
2547 | } | ||
2548 | } | ||
2549 | #endif | ||
2550 | if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL) { | ||
2551 | kvm_x86_ops->cache_regs(vcpu); | ||
2552 | vcpu->arch.regs[VCPU_REGS_RAX] = kvm_run->hypercall.ret; | ||
2553 | kvm_x86_ops->decache_regs(vcpu); | ||
2554 | } | ||
2555 | |||
2556 | r = __vcpu_run(vcpu, kvm_run); | ||
2557 | |||
2558 | out: | ||
2559 | if (vcpu->sigset_active) | ||
2560 | sigprocmask(SIG_SETMASK, &sigsaved, NULL); | ||
2561 | |||
2562 | vcpu_put(vcpu); | ||
2563 | return r; | ||
2564 | } | ||
2565 | |||
2566 | int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) | ||
2567 | { | ||
2568 | vcpu_load(vcpu); | ||
2569 | |||
2570 | kvm_x86_ops->cache_regs(vcpu); | ||
2571 | |||
2572 | regs->rax = vcpu->arch.regs[VCPU_REGS_RAX]; | ||
2573 | regs->rbx = vcpu->arch.regs[VCPU_REGS_RBX]; | ||
2574 | regs->rcx = vcpu->arch.regs[VCPU_REGS_RCX]; | ||
2575 | regs->rdx = vcpu->arch.regs[VCPU_REGS_RDX]; | ||
2576 | regs->rsi = vcpu->arch.regs[VCPU_REGS_RSI]; | ||
2577 | regs->rdi = vcpu->arch.regs[VCPU_REGS_RDI]; | ||
2578 | regs->rsp = vcpu->arch.regs[VCPU_REGS_RSP]; | ||
2579 | regs->rbp = vcpu->arch.regs[VCPU_REGS_RBP]; | ||
2580 | #ifdef CONFIG_X86_64 | ||
2581 | regs->r8 = vcpu->arch.regs[VCPU_REGS_R8]; | ||
2582 | regs->r9 = vcpu->arch.regs[VCPU_REGS_R9]; | ||
2583 | regs->r10 = vcpu->arch.regs[VCPU_REGS_R10]; | ||
2584 | regs->r11 = vcpu->arch.regs[VCPU_REGS_R11]; | ||
2585 | regs->r12 = vcpu->arch.regs[VCPU_REGS_R12]; | ||
2586 | regs->r13 = vcpu->arch.regs[VCPU_REGS_R13]; | ||
2587 | regs->r14 = vcpu->arch.regs[VCPU_REGS_R14]; | ||
2588 | regs->r15 = vcpu->arch.regs[VCPU_REGS_R15]; | ||
2589 | #endif | ||
2590 | |||
2591 | regs->rip = vcpu->arch.rip; | ||
2592 | regs->rflags = kvm_x86_ops->get_rflags(vcpu); | ||
2593 | |||
2594 | /* | ||
2595 | * Don't leak debug flags in case they were set for guest debugging | ||
2596 | */ | ||
2597 | if (vcpu->guest_debug.enabled && vcpu->guest_debug.singlestep) | ||
2598 | regs->rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF); | ||
2599 | |||
2600 | vcpu_put(vcpu); | ||
2601 | |||
2602 | return 0; | ||
2603 | } | ||
2604 | |||
2605 | int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) | ||
2606 | { | ||
2607 | vcpu_load(vcpu); | ||
2608 | |||
2609 | vcpu->arch.regs[VCPU_REGS_RAX] = regs->rax; | ||
2610 | vcpu->arch.regs[VCPU_REGS_RBX] = regs->rbx; | ||
2611 | vcpu->arch.regs[VCPU_REGS_RCX] = regs->rcx; | ||
2612 | vcpu->arch.regs[VCPU_REGS_RDX] = regs->rdx; | ||
2613 | vcpu->arch.regs[VCPU_REGS_RSI] = regs->rsi; | ||
2614 | vcpu->arch.regs[VCPU_REGS_RDI] = regs->rdi; | ||
2615 | vcpu->arch.regs[VCPU_REGS_RSP] = regs->rsp; | ||
2616 | vcpu->arch.regs[VCPU_REGS_RBP] = regs->rbp; | ||
2617 | #ifdef CONFIG_X86_64 | ||
2618 | vcpu->arch.regs[VCPU_REGS_R8] = regs->r8; | ||
2619 | vcpu->arch.regs[VCPU_REGS_R9] = regs->r9; | ||
2620 | vcpu->arch.regs[VCPU_REGS_R10] = regs->r10; | ||
2621 | vcpu->arch.regs[VCPU_REGS_R11] = regs->r11; | ||
2622 | vcpu->arch.regs[VCPU_REGS_R12] = regs->r12; | ||
2623 | vcpu->arch.regs[VCPU_REGS_R13] = regs->r13; | ||
2624 | vcpu->arch.regs[VCPU_REGS_R14] = regs->r14; | ||
2625 | vcpu->arch.regs[VCPU_REGS_R15] = regs->r15; | ||
2626 | #endif | ||
2627 | |||
2628 | vcpu->arch.rip = regs->rip; | ||
2629 | kvm_x86_ops->set_rflags(vcpu, regs->rflags); | ||
2630 | |||
2631 | kvm_x86_ops->decache_regs(vcpu); | ||
2632 | |||
2633 | vcpu_put(vcpu); | ||
2634 | |||
2635 | return 0; | ||
2636 | } | ||
2637 | |||
2638 | static void get_segment(struct kvm_vcpu *vcpu, | ||
2639 | struct kvm_segment *var, int seg) | ||
2640 | { | ||
2641 | return kvm_x86_ops->get_segment(vcpu, var, seg); | ||
2642 | } | ||
2643 | |||
2644 | void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) | ||
2645 | { | ||
2646 | struct kvm_segment cs; | ||
2647 | |||
2648 | get_segment(vcpu, &cs, VCPU_SREG_CS); | ||
2649 | *db = cs.db; | ||
2650 | *l = cs.l; | ||
2651 | } | ||
2652 | EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits); | ||
2653 | |||
2654 | int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, | ||
2655 | struct kvm_sregs *sregs) | ||
2656 | { | ||
2657 | struct descriptor_table dt; | ||
2658 | int pending_vec; | ||
2659 | |||
2660 | vcpu_load(vcpu); | ||
2661 | |||
2662 | get_segment(vcpu, &sregs->cs, VCPU_SREG_CS); | ||
2663 | get_segment(vcpu, &sregs->ds, VCPU_SREG_DS); | ||
2664 | get_segment(vcpu, &sregs->es, VCPU_SREG_ES); | ||
2665 | get_segment(vcpu, &sregs->fs, VCPU_SREG_FS); | ||
2666 | get_segment(vcpu, &sregs->gs, VCPU_SREG_GS); | ||
2667 | get_segment(vcpu, &sregs->ss, VCPU_SREG_SS); | ||
2668 | |||
2669 | get_segment(vcpu, &sregs->tr, VCPU_SREG_TR); | ||
2670 | get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); | ||
2671 | |||
2672 | kvm_x86_ops->get_idt(vcpu, &dt); | ||
2673 | sregs->idt.limit = dt.limit; | ||
2674 | sregs->idt.base = dt.base; | ||
2675 | kvm_x86_ops->get_gdt(vcpu, &dt); | ||
2676 | sregs->gdt.limit = dt.limit; | ||
2677 | sregs->gdt.base = dt.base; | ||
2678 | |||
2679 | kvm_x86_ops->decache_cr4_guest_bits(vcpu); | ||
2680 | sregs->cr0 = vcpu->arch.cr0; | ||
2681 | sregs->cr2 = vcpu->arch.cr2; | ||
2682 | sregs->cr3 = vcpu->arch.cr3; | ||
2683 | sregs->cr4 = vcpu->arch.cr4; | ||
2684 | sregs->cr8 = get_cr8(vcpu); | ||
2685 | sregs->efer = vcpu->arch.shadow_efer; | ||
2686 | sregs->apic_base = kvm_get_apic_base(vcpu); | ||
2687 | |||
2688 | if (irqchip_in_kernel(vcpu->kvm)) { | ||
2689 | memset(sregs->interrupt_bitmap, 0, | ||
2690 | sizeof sregs->interrupt_bitmap); | ||
2691 | pending_vec = kvm_x86_ops->get_irq(vcpu); | ||
2692 | if (pending_vec >= 0) | ||
2693 | set_bit(pending_vec, | ||
2694 | (unsigned long *)sregs->interrupt_bitmap); | ||
2695 | } else | ||
2696 | memcpy(sregs->interrupt_bitmap, vcpu->arch.irq_pending, | ||
2697 | sizeof sregs->interrupt_bitmap); | ||
2698 | |||
2699 | vcpu_put(vcpu); | ||
2700 | |||
2701 | return 0; | ||
2702 | } | ||
2703 | |||
2704 | static void set_segment(struct kvm_vcpu *vcpu, | ||
2705 | struct kvm_segment *var, int seg) | ||
2706 | { | ||
2707 | return kvm_x86_ops->set_segment(vcpu, var, seg); | ||
2708 | } | ||
2709 | |||
2710 | int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, | ||
2711 | struct kvm_sregs *sregs) | ||
2712 | { | ||
2713 | int mmu_reset_needed = 0; | ||
2714 | int i, pending_vec, max_bits; | ||
2715 | struct descriptor_table dt; | ||
2716 | |||
2717 | vcpu_load(vcpu); | ||
2718 | |||
2719 | dt.limit = sregs->idt.limit; | ||
2720 | dt.base = sregs->idt.base; | ||
2721 | kvm_x86_ops->set_idt(vcpu, &dt); | ||
2722 | dt.limit = sregs->gdt.limit; | ||
2723 | dt.base = sregs->gdt.base; | ||
2724 | kvm_x86_ops->set_gdt(vcpu, &dt); | ||
2725 | |||
2726 | vcpu->arch.cr2 = sregs->cr2; | ||
2727 | mmu_reset_needed |= vcpu->arch.cr3 != sregs->cr3; | ||
2728 | vcpu->arch.cr3 = sregs->cr3; | ||
2729 | |||
2730 | set_cr8(vcpu, sregs->cr8); | ||
2731 | |||
2732 | mmu_reset_needed |= vcpu->arch.shadow_efer != sregs->efer; | ||
2733 | #ifdef CONFIG_X86_64 | ||
2734 | kvm_x86_ops->set_efer(vcpu, sregs->efer); | ||
2735 | #endif | ||
2736 | kvm_set_apic_base(vcpu, sregs->apic_base); | ||
2737 | |||
2738 | kvm_x86_ops->decache_cr4_guest_bits(vcpu); | ||
2739 | |||
2740 | mmu_reset_needed |= vcpu->arch.cr0 != sregs->cr0; | ||
2741 | vcpu->arch.cr0 = sregs->cr0; | ||
2742 | kvm_x86_ops->set_cr0(vcpu, sregs->cr0); | ||
2743 | |||
2744 | mmu_reset_needed |= vcpu->arch.cr4 != sregs->cr4; | ||
2745 | kvm_x86_ops->set_cr4(vcpu, sregs->cr4); | ||
2746 | if (!is_long_mode(vcpu) && is_pae(vcpu)) | ||
2747 | load_pdptrs(vcpu, vcpu->arch.cr3); | ||
2748 | |||
2749 | if (mmu_reset_needed) | ||
2750 | kvm_mmu_reset_context(vcpu); | ||
2751 | |||
2752 | if (!irqchip_in_kernel(vcpu->kvm)) { | ||
2753 | memcpy(vcpu->arch.irq_pending, sregs->interrupt_bitmap, | ||
2754 | sizeof vcpu->arch.irq_pending); | ||
2755 | vcpu->arch.irq_summary = 0; | ||
2756 | for (i = 0; i < ARRAY_SIZE(vcpu->arch.irq_pending); ++i) | ||
2757 | if (vcpu->arch.irq_pending[i]) | ||
2758 | __set_bit(i, &vcpu->arch.irq_summary); | ||
2759 | } else { | ||
2760 | max_bits = (sizeof sregs->interrupt_bitmap) << 3; | ||
2761 | pending_vec = find_first_bit( | ||
2762 | (const unsigned long *)sregs->interrupt_bitmap, | ||
2763 | max_bits); | ||
2764 | /* Only pending external irq is handled here */ | ||
2765 | if (pending_vec < max_bits) { | ||
2766 | kvm_x86_ops->set_irq(vcpu, pending_vec); | ||
2767 | pr_debug("Set back pending irq %d\n", | ||
2768 | pending_vec); | ||
2769 | } | ||
2770 | } | ||
2771 | |||
2772 | set_segment(vcpu, &sregs->cs, VCPU_SREG_CS); | ||
2773 | set_segment(vcpu, &sregs->ds, VCPU_SREG_DS); | ||
2774 | set_segment(vcpu, &sregs->es, VCPU_SREG_ES); | ||
2775 | set_segment(vcpu, &sregs->fs, VCPU_SREG_FS); | ||
2776 | set_segment(vcpu, &sregs->gs, VCPU_SREG_GS); | ||
2777 | set_segment(vcpu, &sregs->ss, VCPU_SREG_SS); | ||
2778 | |||
2779 | set_segment(vcpu, &sregs->tr, VCPU_SREG_TR); | ||
2780 | set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); | ||
2781 | |||
2782 | vcpu_put(vcpu); | ||
2783 | |||
2784 | return 0; | ||
2785 | } | ||
2786 | |||
2787 | int kvm_arch_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu, | ||
2788 | struct kvm_debug_guest *dbg) | ||
2789 | { | ||
2790 | int r; | ||
2791 | |||
2792 | vcpu_load(vcpu); | ||
2793 | |||
2794 | r = kvm_x86_ops->set_guest_debug(vcpu, dbg); | ||
2795 | |||
2796 | vcpu_put(vcpu); | ||
2797 | |||
2798 | return r; | ||
2799 | } | ||
2800 | |||
2801 | /* | ||
2802 | * fxsave fpu state. Taken from x86_64/processor.h. To be killed when | ||
2803 | * we have asm/x86/processor.h | ||
2804 | */ | ||
2805 | struct fxsave { | ||
2806 | u16 cwd; | ||
2807 | u16 swd; | ||
2808 | u16 twd; | ||
2809 | u16 fop; | ||
2810 | u64 rip; | ||
2811 | u64 rdp; | ||
2812 | u32 mxcsr; | ||
2813 | u32 mxcsr_mask; | ||
2814 | u32 st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */ | ||
2815 | #ifdef CONFIG_X86_64 | ||
2816 | u32 xmm_space[64]; /* 16*16 bytes for each XMM-reg = 256 bytes */ | ||
2817 | #else | ||
2818 | u32 xmm_space[32]; /* 8*16 bytes for each XMM-reg = 128 bytes */ | ||
2819 | #endif | ||
2820 | }; | ||
2821 | |||
2822 | /* | ||
2823 | * Translate a guest virtual address to a guest physical address. | ||
2824 | */ | ||
2825 | int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, | ||
2826 | struct kvm_translation *tr) | ||
2827 | { | ||
2828 | unsigned long vaddr = tr->linear_address; | ||
2829 | gpa_t gpa; | ||
2830 | |||
2831 | vcpu_load(vcpu); | ||
2832 | mutex_lock(&vcpu->kvm->lock); | ||
2833 | gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, vaddr); | ||
2834 | tr->physical_address = gpa; | ||
2835 | tr->valid = gpa != UNMAPPED_GVA; | ||
2836 | tr->writeable = 1; | ||
2837 | tr->usermode = 0; | ||
2838 | mutex_unlock(&vcpu->kvm->lock); | ||
2839 | vcpu_put(vcpu); | ||
2840 | |||
2841 | return 0; | ||
2842 | } | ||
2843 | |||
2844 | int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) | ||
2845 | { | ||
2846 | struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image; | ||
2847 | |||
2848 | vcpu_load(vcpu); | ||
2849 | |||
2850 | memcpy(fpu->fpr, fxsave->st_space, 128); | ||
2851 | fpu->fcw = fxsave->cwd; | ||
2852 | fpu->fsw = fxsave->swd; | ||
2853 | fpu->ftwx = fxsave->twd; | ||
2854 | fpu->last_opcode = fxsave->fop; | ||
2855 | fpu->last_ip = fxsave->rip; | ||
2856 | fpu->last_dp = fxsave->rdp; | ||
2857 | memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space); | ||
2858 | |||
2859 | vcpu_put(vcpu); | ||
2860 | |||
2861 | return 0; | ||
2862 | } | ||
2863 | |||
2864 | int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) | ||
2865 | { | ||
2866 | struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image; | ||
2867 | |||
2868 | vcpu_load(vcpu); | ||
2869 | |||
2870 | memcpy(fxsave->st_space, fpu->fpr, 128); | ||
2871 | fxsave->cwd = fpu->fcw; | ||
2872 | fxsave->swd = fpu->fsw; | ||
2873 | fxsave->twd = fpu->ftwx; | ||
2874 | fxsave->fop = fpu->last_opcode; | ||
2875 | fxsave->rip = fpu->last_ip; | ||
2876 | fxsave->rdp = fpu->last_dp; | ||
2877 | memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space); | ||
2878 | |||
2879 | vcpu_put(vcpu); | ||
2880 | |||
2881 | return 0; | ||
2882 | } | ||
2883 | |||
2884 | void fx_init(struct kvm_vcpu *vcpu) | ||
2885 | { | ||
2886 | unsigned after_mxcsr_mask; | ||
2887 | |||
2888 | /* Initialize guest FPU by resetting ours and saving into guest's */ | ||
2889 | preempt_disable(); | ||
2890 | fx_save(&vcpu->arch.host_fx_image); | ||
2891 | fpu_init(); | ||
2892 | fx_save(&vcpu->arch.guest_fx_image); | ||
2893 | fx_restore(&vcpu->arch.host_fx_image); | ||
2894 | preempt_enable(); | ||
2895 | |||
2896 | vcpu->arch.cr0 |= X86_CR0_ET; | ||
2897 | after_mxcsr_mask = offsetof(struct i387_fxsave_struct, st_space); | ||
2898 | vcpu->arch.guest_fx_image.mxcsr = 0x1f80; | ||
2899 | memset((void *)&vcpu->arch.guest_fx_image + after_mxcsr_mask, | ||
2900 | 0, sizeof(struct i387_fxsave_struct) - after_mxcsr_mask); | ||
2901 | } | ||
2902 | EXPORT_SYMBOL_GPL(fx_init); | ||
2903 | |||
2904 | void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) | ||
2905 | { | ||
2906 | if (!vcpu->fpu_active || vcpu->guest_fpu_loaded) | ||
2907 | return; | ||
2908 | |||
2909 | vcpu->guest_fpu_loaded = 1; | ||
2910 | fx_save(&vcpu->arch.host_fx_image); | ||
2911 | fx_restore(&vcpu->arch.guest_fx_image); | ||
2912 | } | ||
2913 | EXPORT_SYMBOL_GPL(kvm_load_guest_fpu); | ||
2914 | |||
2915 | void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) | ||
2916 | { | ||
2917 | if (!vcpu->guest_fpu_loaded) | ||
2918 | return; | ||
2919 | |||
2920 | vcpu->guest_fpu_loaded = 0; | ||
2921 | fx_save(&vcpu->arch.guest_fx_image); | ||
2922 | fx_restore(&vcpu->arch.host_fx_image); | ||
2923 | ++vcpu->stat.fpu_reload; | ||
2924 | } | ||
2925 | EXPORT_SYMBOL_GPL(kvm_put_guest_fpu); | ||
2926 | |||
2927 | void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) | ||
2928 | { | ||
2929 | kvm_x86_ops->vcpu_free(vcpu); | ||
2930 | } | ||
2931 | |||
2932 | struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, | ||
2933 | unsigned int id) | ||
2934 | { | ||
2935 | return kvm_x86_ops->vcpu_create(kvm, id); | ||
2936 | } | ||
2937 | |||
2938 | int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) | ||
2939 | { | ||
2940 | int r; | ||
2941 | |||
2942 | /* We do fxsave: this must be aligned. */ | ||
2943 | BUG_ON((unsigned long)&vcpu->arch.host_fx_image & 0xF); | ||
2944 | |||
2945 | vcpu_load(vcpu); | ||
2946 | r = kvm_arch_vcpu_reset(vcpu); | ||
2947 | if (r == 0) | ||
2948 | r = kvm_mmu_setup(vcpu); | ||
2949 | vcpu_put(vcpu); | ||
2950 | if (r < 0) | ||
2951 | goto free_vcpu; | ||
2952 | |||
2953 | return 0; | ||
2954 | free_vcpu: | ||
2955 | kvm_x86_ops->vcpu_free(vcpu); | ||
2956 | return r; | ||
2957 | } | ||
2958 | |||
2959 | void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) | ||
2960 | { | ||
2961 | vcpu_load(vcpu); | ||
2962 | kvm_mmu_unload(vcpu); | ||
2963 | vcpu_put(vcpu); | ||
2964 | |||
2965 | kvm_x86_ops->vcpu_free(vcpu); | ||
2966 | } | ||
2967 | |||
2968 | int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu) | ||
2969 | { | ||
2970 | return kvm_x86_ops->vcpu_reset(vcpu); | ||
2971 | } | ||
2972 | |||
2973 | void kvm_arch_hardware_enable(void *garbage) | ||
2974 | { | ||
2975 | kvm_x86_ops->hardware_enable(garbage); | ||
2976 | } | ||
2977 | |||
2978 | void kvm_arch_hardware_disable(void *garbage) | ||
2979 | { | ||
2980 | kvm_x86_ops->hardware_disable(garbage); | ||
2981 | } | ||
2982 | |||
2983 | int kvm_arch_hardware_setup(void) | ||
2984 | { | ||
2985 | return kvm_x86_ops->hardware_setup(); | ||
2986 | } | ||
2987 | |||
2988 | void kvm_arch_hardware_unsetup(void) | ||
2989 | { | ||
2990 | kvm_x86_ops->hardware_unsetup(); | ||
2991 | } | ||
2992 | |||
2993 | void kvm_arch_check_processor_compat(void *rtn) | ||
2994 | { | ||
2995 | kvm_x86_ops->check_processor_compatibility(rtn); | ||
2996 | } | ||
2997 | |||
2998 | int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) | ||
2999 | { | ||
3000 | struct page *page; | ||
3001 | struct kvm *kvm; | ||
3002 | int r; | ||
3003 | |||
3004 | BUG_ON(vcpu->kvm == NULL); | ||
3005 | kvm = vcpu->kvm; | ||
3006 | |||
3007 | vcpu->arch.mmu.root_hpa = INVALID_PAGE; | ||
3008 | if (!irqchip_in_kernel(kvm) || vcpu->vcpu_id == 0) | ||
3009 | vcpu->arch.mp_state = VCPU_MP_STATE_RUNNABLE; | ||
3010 | else | ||
3011 | vcpu->arch.mp_state = VCPU_MP_STATE_UNINITIALIZED; | ||
3012 | |||
3013 | page = alloc_page(GFP_KERNEL | __GFP_ZERO); | ||
3014 | if (!page) { | ||
3015 | r = -ENOMEM; | ||
3016 | goto fail; | ||
3017 | } | ||
3018 | vcpu->arch.pio_data = page_address(page); | ||
3019 | |||
3020 | r = kvm_mmu_create(vcpu); | ||
3021 | if (r < 0) | ||
3022 | goto fail_free_pio_data; | ||
3023 | |||
3024 | if (irqchip_in_kernel(kvm)) { | ||
3025 | r = kvm_create_lapic(vcpu); | ||
3026 | if (r < 0) | ||
3027 | goto fail_mmu_destroy; | ||
3028 | } | ||
3029 | |||
3030 | return 0; | ||
3031 | |||
3032 | fail_mmu_destroy: | ||
3033 | kvm_mmu_destroy(vcpu); | ||
3034 | fail_free_pio_data: | ||
3035 | free_page((unsigned long)vcpu->arch.pio_data); | ||
3036 | fail: | ||
3037 | return r; | ||
3038 | } | ||
3039 | |||
3040 | void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) | ||
3041 | { | ||
3042 | kvm_free_lapic(vcpu); | ||
3043 | kvm_mmu_destroy(vcpu); | ||
3044 | free_page((unsigned long)vcpu->arch.pio_data); | ||
3045 | } | ||
3046 | |||
3047 | struct kvm *kvm_arch_create_vm(void) | ||
3048 | { | ||
3049 | struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL); | ||
3050 | |||
3051 | if (!kvm) | ||
3052 | return ERR_PTR(-ENOMEM); | ||
3053 | |||
3054 | INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); | ||
3055 | |||
3056 | return kvm; | ||
3057 | } | ||
3058 | |||
3059 | static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu) | ||
3060 | { | ||
3061 | vcpu_load(vcpu); | ||
3062 | kvm_mmu_unload(vcpu); | ||
3063 | vcpu_put(vcpu); | ||
3064 | } | ||
3065 | |||
3066 | static void kvm_free_vcpus(struct kvm *kvm) | ||
3067 | { | ||
3068 | unsigned int i; | ||
3069 | |||
3070 | /* | ||
3071 | * Unpin any mmu pages first. | ||
3072 | */ | ||
3073 | for (i = 0; i < KVM_MAX_VCPUS; ++i) | ||
3074 | if (kvm->vcpus[i]) | ||
3075 | kvm_unload_vcpu_mmu(kvm->vcpus[i]); | ||
3076 | for (i = 0; i < KVM_MAX_VCPUS; ++i) { | ||
3077 | if (kvm->vcpus[i]) { | ||
3078 | kvm_arch_vcpu_free(kvm->vcpus[i]); | ||
3079 | kvm->vcpus[i] = NULL; | ||
3080 | } | ||
3081 | } | ||
3082 | |||
3083 | } | ||
3084 | |||
3085 | void kvm_arch_destroy_vm(struct kvm *kvm) | ||
3086 | { | ||
3087 | kfree(kvm->arch.vpic); | ||
3088 | kfree(kvm->arch.vioapic); | ||
3089 | kvm_free_vcpus(kvm); | ||
3090 | kvm_free_physmem(kvm); | ||
3091 | kfree(kvm); | ||
3092 | } | ||
3093 | |||
3094 | int kvm_arch_set_memory_region(struct kvm *kvm, | ||
3095 | struct kvm_userspace_memory_region *mem, | ||
3096 | struct kvm_memory_slot old, | ||
3097 | int user_alloc) | ||
3098 | { | ||
3099 | int npages = mem->memory_size >> PAGE_SHIFT; | ||
3100 | struct kvm_memory_slot *memslot = &kvm->memslots[mem->slot]; | ||
3101 | |||
3102 | /* To keep backward compatibility with older userspace, | ||
3103 | * x86 needs to handle the !user_alloc case. | ||
3104 | */ | ||
3105 | if (!user_alloc) { | ||
3106 | if (npages && !old.rmap) { | ||
3107 | down_write(¤t->mm->mmap_sem); | ||
3108 | memslot->userspace_addr = do_mmap(NULL, 0, | ||
3109 | npages * PAGE_SIZE, | ||
3110 | PROT_READ | PROT_WRITE, | ||
3111 | MAP_SHARED | MAP_ANONYMOUS, | ||
3112 | 0); | ||
3113 | up_write(¤t->mm->mmap_sem); | ||
3114 | |||
3115 | if (IS_ERR((void *)memslot->userspace_addr)) | ||
3116 | return PTR_ERR((void *)memslot->userspace_addr); | ||
3117 | } else { | ||
3118 | if (!old.user_alloc && old.rmap) { | ||
3119 | int ret; | ||
3120 | |||
3121 | down_write(¤t->mm->mmap_sem); | ||
3122 | ret = do_munmap(current->mm, old.userspace_addr, | ||
3123 | old.npages * PAGE_SIZE); | ||
3124 | up_write(¤t->mm->mmap_sem); | ||
3125 | if (ret < 0) | ||
3126 | printk(KERN_WARNING | ||
3127 | "kvm_vm_ioctl_set_memory_region: " | ||
3128 | "failed to munmap memory\n"); | ||
3129 | } | ||
3130 | } | ||
3131 | } | ||
3132 | |||
3133 | if (!kvm->arch.n_requested_mmu_pages) { | ||
3134 | unsigned int nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm); | ||
3135 | kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages); | ||
3136 | } | ||
3137 | |||
3138 | kvm_mmu_slot_remove_write_access(kvm, mem->slot); | ||
3139 | kvm_flush_remote_tlbs(kvm); | ||
3140 | |||
3141 | return 0; | ||
3142 | } | ||
3143 | |||
3144 | int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) | ||
3145 | { | ||
3146 | return vcpu->arch.mp_state == VCPU_MP_STATE_RUNNABLE | ||
3147 | || vcpu->arch.mp_state == VCPU_MP_STATE_SIPI_RECEIVED; | ||
3148 | } | ||
diff --git a/drivers/kvm/x86.h b/drivers/kvm/x86.h deleted file mode 100644 index dfb8091971a9..000000000000 --- a/drivers/kvm/x86.h +++ /dev/null | |||
@@ -1,602 +0,0 @@ | |||
1 | /* | ||
2 | * Kernel-based Virtual Machine driver for Linux | ||
3 | * | ||
4 | * This header defines architecture specific interfaces, x86 version | ||
5 | * | ||
6 | * This work is licensed under the terms of the GNU GPL, version 2. See | ||
7 | * the COPYING file in the top-level directory. | ||
8 | * | ||
9 | */ | ||
10 | |||
11 | #ifndef KVM_X86_H | ||
12 | #define KVM_X86_H | ||
13 | |||
14 | #include <linux/types.h> | ||
15 | #include <linux/mm.h> | ||
16 | |||
17 | #include <linux/kvm.h> | ||
18 | #include <linux/kvm_para.h> | ||
19 | |||
20 | #include <asm/desc.h> | ||
21 | |||
22 | #include "types.h" | ||
23 | |||
24 | #define CR3_PAE_RESERVED_BITS ((X86_CR3_PWT | X86_CR3_PCD) - 1) | ||
25 | #define CR3_NONPAE_RESERVED_BITS ((PAGE_SIZE-1) & ~(X86_CR3_PWT | X86_CR3_PCD)) | ||
26 | #define CR3_L_MODE_RESERVED_BITS (CR3_NONPAE_RESERVED_BITS|0xFFFFFF0000000000ULL) | ||
27 | |||
28 | #define KVM_GUEST_CR0_MASK \ | ||
29 | (X86_CR0_PG | X86_CR0_PE | X86_CR0_WP | X86_CR0_NE \ | ||
30 | | X86_CR0_NW | X86_CR0_CD) | ||
31 | #define KVM_VM_CR0_ALWAYS_ON \ | ||
32 | (X86_CR0_PG | X86_CR0_PE | X86_CR0_WP | X86_CR0_NE | X86_CR0_TS \ | ||
33 | | X86_CR0_MP) | ||
34 | #define KVM_GUEST_CR4_MASK \ | ||
35 | (X86_CR4_VME | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_PGE | X86_CR4_VMXE) | ||
36 | #define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE) | ||
37 | #define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE) | ||
38 | |||
39 | #define INVALID_PAGE (~(hpa_t)0) | ||
40 | #define UNMAPPED_GVA (~(gpa_t)0) | ||
41 | |||
42 | #define DE_VECTOR 0 | ||
43 | #define UD_VECTOR 6 | ||
44 | #define NM_VECTOR 7 | ||
45 | #define DF_VECTOR 8 | ||
46 | #define TS_VECTOR 10 | ||
47 | #define NP_VECTOR 11 | ||
48 | #define SS_VECTOR 12 | ||
49 | #define GP_VECTOR 13 | ||
50 | #define PF_VECTOR 14 | ||
51 | |||
52 | #define SELECTOR_TI_MASK (1 << 2) | ||
53 | #define SELECTOR_RPL_MASK 0x03 | ||
54 | |||
55 | #define IOPL_SHIFT 12 | ||
56 | |||
57 | #define KVM_ALIAS_SLOTS 4 | ||
58 | |||
59 | #define KVM_PERMILLE_MMU_PAGES 20 | ||
60 | #define KVM_MIN_ALLOC_MMU_PAGES 64 | ||
61 | #define KVM_NUM_MMU_PAGES 1024 | ||
62 | #define KVM_MIN_FREE_MMU_PAGES 5 | ||
63 | #define KVM_REFILL_PAGES 25 | ||
64 | #define KVM_MAX_CPUID_ENTRIES 40 | ||
65 | |||
66 | extern spinlock_t kvm_lock; | ||
67 | extern struct list_head vm_list; | ||
68 | |||
69 | struct kvm_vcpu; | ||
70 | struct kvm; | ||
71 | |||
72 | enum { | ||
73 | VCPU_REGS_RAX = 0, | ||
74 | VCPU_REGS_RCX = 1, | ||
75 | VCPU_REGS_RDX = 2, | ||
76 | VCPU_REGS_RBX = 3, | ||
77 | VCPU_REGS_RSP = 4, | ||
78 | VCPU_REGS_RBP = 5, | ||
79 | VCPU_REGS_RSI = 6, | ||
80 | VCPU_REGS_RDI = 7, | ||
81 | #ifdef CONFIG_X86_64 | ||
82 | VCPU_REGS_R8 = 8, | ||
83 | VCPU_REGS_R9 = 9, | ||
84 | VCPU_REGS_R10 = 10, | ||
85 | VCPU_REGS_R11 = 11, | ||
86 | VCPU_REGS_R12 = 12, | ||
87 | VCPU_REGS_R13 = 13, | ||
88 | VCPU_REGS_R14 = 14, | ||
89 | VCPU_REGS_R15 = 15, | ||
90 | #endif | ||
91 | NR_VCPU_REGS | ||
92 | }; | ||
93 | |||
94 | enum { | ||
95 | VCPU_SREG_CS, | ||
96 | VCPU_SREG_DS, | ||
97 | VCPU_SREG_ES, | ||
98 | VCPU_SREG_FS, | ||
99 | VCPU_SREG_GS, | ||
100 | VCPU_SREG_SS, | ||
101 | VCPU_SREG_TR, | ||
102 | VCPU_SREG_LDTR, | ||
103 | }; | ||
104 | |||
105 | #include "x86_emulate.h" | ||
106 | |||
107 | #define KVM_NR_MEM_OBJS 40 | ||
108 | |||
109 | /* | ||
110 | * We don't want allocation failures within the mmu code, so we preallocate | ||
111 | * enough memory for a single page fault in a cache. | ||
112 | */ | ||
113 | struct kvm_mmu_memory_cache { | ||
114 | int nobjs; | ||
115 | void *objects[KVM_NR_MEM_OBJS]; | ||
116 | }; | ||
117 | |||
118 | #define NR_PTE_CHAIN_ENTRIES 5 | ||
119 | |||
120 | struct kvm_pte_chain { | ||
121 | u64 *parent_ptes[NR_PTE_CHAIN_ENTRIES]; | ||
122 | struct hlist_node link; | ||
123 | }; | ||
124 | |||
125 | /* | ||
126 | * kvm_mmu_page_role, below, is defined as: | ||
127 | * | ||
128 | * bits 0:3 - total guest paging levels (2-4, or zero for real mode) | ||
129 | * bits 4:7 - page table level for this shadow (1-4) | ||
130 | * bits 8:9 - page table quadrant for 2-level guests | ||
131 | * bit 16 - "metaphysical" - gfn is not a real page (huge page/real mode) | ||
132 | * bits 17:19 - common access permissions for all ptes in this shadow page | ||
133 | */ | ||
134 | union kvm_mmu_page_role { | ||
135 | unsigned word; | ||
136 | struct { | ||
137 | unsigned glevels : 4; | ||
138 | unsigned level : 4; | ||
139 | unsigned quadrant : 2; | ||
140 | unsigned pad_for_nice_hex_output : 6; | ||
141 | unsigned metaphysical : 1; | ||
142 | unsigned access : 3; | ||
143 | }; | ||
144 | }; | ||
145 | |||
146 | struct kvm_mmu_page { | ||
147 | struct list_head link; | ||
148 | struct hlist_node hash_link; | ||
149 | |||
150 | /* | ||
151 | * The following two entries are used to key the shadow page in the | ||
152 | * hash table. | ||
153 | */ | ||
154 | gfn_t gfn; | ||
155 | union kvm_mmu_page_role role; | ||
156 | |||
157 | u64 *spt; | ||
158 | /* hold the gfn of each spte inside spt */ | ||
159 | gfn_t *gfns; | ||
160 | unsigned long slot_bitmap; /* One bit set per slot which has memory | ||
161 | * in this shadow page. | ||
162 | */ | ||
163 | int multimapped; /* More than one parent_pte? */ | ||
164 | int root_count; /* Currently serving as active root */ | ||
165 | union { | ||
166 | u64 *parent_pte; /* !multimapped */ | ||
167 | struct hlist_head parent_ptes; /* multimapped, kvm_pte_chain */ | ||
168 | }; | ||
169 | }; | ||
170 | |||
171 | /* | ||
172 | * x86 supports 3 paging modes (4-level 64-bit, 3-level 64-bit, and 2-level | ||
173 | * 32-bit). The kvm_mmu structure abstracts the details of the current mmu | ||
174 | * mode. | ||
175 | */ | ||
176 | struct kvm_mmu { | ||
177 | void (*new_cr3)(struct kvm_vcpu *vcpu); | ||
178 | int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err); | ||
179 | void (*free)(struct kvm_vcpu *vcpu); | ||
180 | gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva); | ||
181 | void (*prefetch_page)(struct kvm_vcpu *vcpu, | ||
182 | struct kvm_mmu_page *page); | ||
183 | hpa_t root_hpa; | ||
184 | int root_level; | ||
185 | int shadow_root_level; | ||
186 | |||
187 | u64 *pae_root; | ||
188 | }; | ||
189 | |||
190 | struct kvm_vcpu_arch { | ||
191 | u64 host_tsc; | ||
192 | int interrupt_window_open; | ||
193 | unsigned long irq_summary; /* bit vector: 1 per word in irq_pending */ | ||
194 | DECLARE_BITMAP(irq_pending, KVM_NR_INTERRUPTS); | ||
195 | unsigned long regs[NR_VCPU_REGS]; /* for rsp: vcpu_load_rsp_rip() */ | ||
196 | unsigned long rip; /* needs vcpu_load_rsp_rip() */ | ||
197 | |||
198 | unsigned long cr0; | ||
199 | unsigned long cr2; | ||
200 | unsigned long cr3; | ||
201 | unsigned long cr4; | ||
202 | unsigned long cr8; | ||
203 | u64 pdptrs[4]; /* pae */ | ||
204 | u64 shadow_efer; | ||
205 | u64 apic_base; | ||
206 | struct kvm_lapic *apic; /* kernel irqchip context */ | ||
207 | #define VCPU_MP_STATE_RUNNABLE 0 | ||
208 | #define VCPU_MP_STATE_UNINITIALIZED 1 | ||
209 | #define VCPU_MP_STATE_INIT_RECEIVED 2 | ||
210 | #define VCPU_MP_STATE_SIPI_RECEIVED 3 | ||
211 | #define VCPU_MP_STATE_HALTED 4 | ||
212 | int mp_state; | ||
213 | int sipi_vector; | ||
214 | u64 ia32_misc_enable_msr; | ||
215 | |||
216 | struct kvm_mmu mmu; | ||
217 | |||
218 | struct kvm_mmu_memory_cache mmu_pte_chain_cache; | ||
219 | struct kvm_mmu_memory_cache mmu_rmap_desc_cache; | ||
220 | struct kvm_mmu_memory_cache mmu_page_cache; | ||
221 | struct kvm_mmu_memory_cache mmu_page_header_cache; | ||
222 | |||
223 | gfn_t last_pt_write_gfn; | ||
224 | int last_pt_write_count; | ||
225 | u64 *last_pte_updated; | ||
226 | |||
227 | struct i387_fxsave_struct host_fx_image; | ||
228 | struct i387_fxsave_struct guest_fx_image; | ||
229 | |||
230 | gva_t mmio_fault_cr2; | ||
231 | struct kvm_pio_request pio; | ||
232 | void *pio_data; | ||
233 | |||
234 | struct kvm_queued_exception { | ||
235 | bool pending; | ||
236 | bool has_error_code; | ||
237 | u8 nr; | ||
238 | u32 error_code; | ||
239 | } exception; | ||
240 | |||
241 | struct { | ||
242 | int active; | ||
243 | u8 save_iopl; | ||
244 | struct kvm_save_segment { | ||
245 | u16 selector; | ||
246 | unsigned long base; | ||
247 | u32 limit; | ||
248 | u32 ar; | ||
249 | } tr, es, ds, fs, gs; | ||
250 | } rmode; | ||
251 | int halt_request; /* real mode on Intel only */ | ||
252 | |||
253 | int cpuid_nent; | ||
254 | struct kvm_cpuid_entry2 cpuid_entries[KVM_MAX_CPUID_ENTRIES]; | ||
255 | /* emulate context */ | ||
256 | |||
257 | struct x86_emulate_ctxt emulate_ctxt; | ||
258 | }; | ||
259 | |||
260 | struct kvm_mem_alias { | ||
261 | gfn_t base_gfn; | ||
262 | unsigned long npages; | ||
263 | gfn_t target_gfn; | ||
264 | }; | ||
265 | |||
266 | struct kvm_arch { | ||
267 | int naliases; | ||
268 | struct kvm_mem_alias aliases[KVM_ALIAS_SLOTS]; | ||
269 | |||
270 | unsigned int n_free_mmu_pages; | ||
271 | unsigned int n_requested_mmu_pages; | ||
272 | unsigned int n_alloc_mmu_pages; | ||
273 | struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES]; | ||
274 | /* | ||
275 | * Hash table of struct kvm_mmu_page. | ||
276 | */ | ||
277 | struct list_head active_mmu_pages; | ||
278 | struct kvm_pic *vpic; | ||
279 | struct kvm_ioapic *vioapic; | ||
280 | |||
281 | int round_robin_prev_vcpu; | ||
282 | unsigned int tss_addr; | ||
283 | struct page *apic_access_page; | ||
284 | }; | ||
285 | |||
286 | struct kvm_vm_stat { | ||
287 | u32 mmu_shadow_zapped; | ||
288 | u32 mmu_pte_write; | ||
289 | u32 mmu_pte_updated; | ||
290 | u32 mmu_pde_zapped; | ||
291 | u32 mmu_flooded; | ||
292 | u32 mmu_recycled; | ||
293 | u32 remote_tlb_flush; | ||
294 | }; | ||
295 | |||
296 | struct kvm_vcpu_stat { | ||
297 | u32 pf_fixed; | ||
298 | u32 pf_guest; | ||
299 | u32 tlb_flush; | ||
300 | u32 invlpg; | ||
301 | |||
302 | u32 exits; | ||
303 | u32 io_exits; | ||
304 | u32 mmio_exits; | ||
305 | u32 signal_exits; | ||
306 | u32 irq_window_exits; | ||
307 | u32 halt_exits; | ||
308 | u32 halt_wakeup; | ||
309 | u32 request_irq_exits; | ||
310 | u32 irq_exits; | ||
311 | u32 host_state_reload; | ||
312 | u32 efer_reload; | ||
313 | u32 fpu_reload; | ||
314 | u32 insn_emulation; | ||
315 | u32 insn_emulation_fail; | ||
316 | }; | ||
317 | |||
318 | struct descriptor_table { | ||
319 | u16 limit; | ||
320 | unsigned long base; | ||
321 | } __attribute__((packed)); | ||
322 | |||
323 | struct kvm_x86_ops { | ||
324 | int (*cpu_has_kvm_support)(void); /* __init */ | ||
325 | int (*disabled_by_bios)(void); /* __init */ | ||
326 | void (*hardware_enable)(void *dummy); /* __init */ | ||
327 | void (*hardware_disable)(void *dummy); | ||
328 | void (*check_processor_compatibility)(void *rtn); | ||
329 | int (*hardware_setup)(void); /* __init */ | ||
330 | void (*hardware_unsetup)(void); /* __exit */ | ||
331 | |||
332 | /* Create, but do not attach this VCPU */ | ||
333 | struct kvm_vcpu *(*vcpu_create)(struct kvm *kvm, unsigned id); | ||
334 | void (*vcpu_free)(struct kvm_vcpu *vcpu); | ||
335 | int (*vcpu_reset)(struct kvm_vcpu *vcpu); | ||
336 | |||
337 | void (*prepare_guest_switch)(struct kvm_vcpu *vcpu); | ||
338 | void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu); | ||
339 | void (*vcpu_put)(struct kvm_vcpu *vcpu); | ||
340 | void (*vcpu_decache)(struct kvm_vcpu *vcpu); | ||
341 | |||
342 | int (*set_guest_debug)(struct kvm_vcpu *vcpu, | ||
343 | struct kvm_debug_guest *dbg); | ||
344 | void (*guest_debug_pre)(struct kvm_vcpu *vcpu); | ||
345 | int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata); | ||
346 | int (*set_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 data); | ||
347 | u64 (*get_segment_base)(struct kvm_vcpu *vcpu, int seg); | ||
348 | void (*get_segment)(struct kvm_vcpu *vcpu, | ||
349 | struct kvm_segment *var, int seg); | ||
350 | void (*set_segment)(struct kvm_vcpu *vcpu, | ||
351 | struct kvm_segment *var, int seg); | ||
352 | void (*get_cs_db_l_bits)(struct kvm_vcpu *vcpu, int *db, int *l); | ||
353 | void (*decache_cr4_guest_bits)(struct kvm_vcpu *vcpu); | ||
354 | void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0); | ||
355 | void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3); | ||
356 | void (*set_cr4)(struct kvm_vcpu *vcpu, unsigned long cr4); | ||
357 | void (*set_efer)(struct kvm_vcpu *vcpu, u64 efer); | ||
358 | void (*get_idt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt); | ||
359 | void (*set_idt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt); | ||
360 | void (*get_gdt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt); | ||
361 | void (*set_gdt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt); | ||
362 | unsigned long (*get_dr)(struct kvm_vcpu *vcpu, int dr); | ||
363 | void (*set_dr)(struct kvm_vcpu *vcpu, int dr, unsigned long value, | ||
364 | int *exception); | ||
365 | void (*cache_regs)(struct kvm_vcpu *vcpu); | ||
366 | void (*decache_regs)(struct kvm_vcpu *vcpu); | ||
367 | unsigned long (*get_rflags)(struct kvm_vcpu *vcpu); | ||
368 | void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags); | ||
369 | |||
370 | void (*tlb_flush)(struct kvm_vcpu *vcpu); | ||
371 | |||
372 | void (*run)(struct kvm_vcpu *vcpu, struct kvm_run *run); | ||
373 | int (*handle_exit)(struct kvm_run *run, struct kvm_vcpu *vcpu); | ||
374 | void (*skip_emulated_instruction)(struct kvm_vcpu *vcpu); | ||
375 | void (*patch_hypercall)(struct kvm_vcpu *vcpu, | ||
376 | unsigned char *hypercall_addr); | ||
377 | int (*get_irq)(struct kvm_vcpu *vcpu); | ||
378 | void (*set_irq)(struct kvm_vcpu *vcpu, int vec); | ||
379 | void (*queue_exception)(struct kvm_vcpu *vcpu, unsigned nr, | ||
380 | bool has_error_code, u32 error_code); | ||
381 | bool (*exception_injected)(struct kvm_vcpu *vcpu); | ||
382 | void (*inject_pending_irq)(struct kvm_vcpu *vcpu); | ||
383 | void (*inject_pending_vectors)(struct kvm_vcpu *vcpu, | ||
384 | struct kvm_run *run); | ||
385 | |||
386 | int (*set_tss_addr)(struct kvm *kvm, unsigned int addr); | ||
387 | }; | ||
388 | |||
389 | extern struct kvm_x86_ops *kvm_x86_ops; | ||
390 | |||
391 | int kvm_mmu_module_init(void); | ||
392 | void kvm_mmu_module_exit(void); | ||
393 | |||
394 | void kvm_mmu_destroy(struct kvm_vcpu *vcpu); | ||
395 | int kvm_mmu_create(struct kvm_vcpu *vcpu); | ||
396 | int kvm_mmu_setup(struct kvm_vcpu *vcpu); | ||
397 | void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte); | ||
398 | |||
399 | int kvm_mmu_reset_context(struct kvm_vcpu *vcpu); | ||
400 | void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot); | ||
401 | void kvm_mmu_zap_all(struct kvm *kvm); | ||
402 | unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm); | ||
403 | void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages); | ||
404 | |||
405 | enum emulation_result { | ||
406 | EMULATE_DONE, /* no further processing */ | ||
407 | EMULATE_DO_MMIO, /* kvm_run filled with mmio request */ | ||
408 | EMULATE_FAIL, /* can't emulate this instruction */ | ||
409 | }; | ||
410 | |||
411 | int emulate_instruction(struct kvm_vcpu *vcpu, struct kvm_run *run, | ||
412 | unsigned long cr2, u16 error_code, int no_decode); | ||
413 | void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context); | ||
414 | void realmode_lgdt(struct kvm_vcpu *vcpu, u16 size, unsigned long address); | ||
415 | void realmode_lidt(struct kvm_vcpu *vcpu, u16 size, unsigned long address); | ||
416 | void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw, | ||
417 | unsigned long *rflags); | ||
418 | |||
419 | unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr); | ||
420 | void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long value, | ||
421 | unsigned long *rflags); | ||
422 | int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *data); | ||
423 | int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data); | ||
424 | |||
425 | struct x86_emulate_ctxt; | ||
426 | |||
427 | int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, | ||
428 | int size, unsigned port); | ||
429 | int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, | ||
430 | int size, unsigned long count, int down, | ||
431 | gva_t address, int rep, unsigned port); | ||
432 | void kvm_emulate_cpuid(struct kvm_vcpu *vcpu); | ||
433 | int kvm_emulate_halt(struct kvm_vcpu *vcpu); | ||
434 | int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address); | ||
435 | int emulate_clts(struct kvm_vcpu *vcpu); | ||
436 | int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, | ||
437 | unsigned long *dest); | ||
438 | int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, | ||
439 | unsigned long value); | ||
440 | |||
441 | void set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); | ||
442 | void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr0); | ||
443 | void set_cr4(struct kvm_vcpu *vcpu, unsigned long cr0); | ||
444 | void set_cr8(struct kvm_vcpu *vcpu, unsigned long cr0); | ||
445 | unsigned long get_cr8(struct kvm_vcpu *vcpu); | ||
446 | void lmsw(struct kvm_vcpu *vcpu, unsigned long msw); | ||
447 | void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l); | ||
448 | |||
449 | int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata); | ||
450 | int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data); | ||
451 | |||
452 | void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr); | ||
453 | void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code); | ||
454 | void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long cr2, | ||
455 | u32 error_code); | ||
456 | |||
457 | void fx_init(struct kvm_vcpu *vcpu); | ||
458 | |||
459 | int emulator_read_std(unsigned long addr, | ||
460 | void *val, | ||
461 | unsigned int bytes, | ||
462 | struct kvm_vcpu *vcpu); | ||
463 | int emulator_write_emulated(unsigned long addr, | ||
464 | const void *val, | ||
465 | unsigned int bytes, | ||
466 | struct kvm_vcpu *vcpu); | ||
467 | |||
468 | unsigned long segment_base(u16 selector); | ||
469 | |||
470 | void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu); | ||
471 | void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, | ||
472 | const u8 *new, int bytes); | ||
473 | int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva); | ||
474 | void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu); | ||
475 | int kvm_mmu_load(struct kvm_vcpu *vcpu); | ||
476 | void kvm_mmu_unload(struct kvm_vcpu *vcpu); | ||
477 | |||
478 | int kvm_emulate_hypercall(struct kvm_vcpu *vcpu); | ||
479 | |||
480 | int kvm_fix_hypercall(struct kvm_vcpu *vcpu); | ||
481 | |||
482 | int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code); | ||
483 | |||
484 | int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3); | ||
485 | int complete_pio(struct kvm_vcpu *vcpu); | ||
486 | |||
487 | static inline struct kvm_mmu_page *page_header(hpa_t shadow_page) | ||
488 | { | ||
489 | struct page *page = pfn_to_page(shadow_page >> PAGE_SHIFT); | ||
490 | |||
491 | return (struct kvm_mmu_page *)page_private(page); | ||
492 | } | ||
493 | |||
494 | static inline u16 read_fs(void) | ||
495 | { | ||
496 | u16 seg; | ||
497 | asm("mov %%fs, %0" : "=g"(seg)); | ||
498 | return seg; | ||
499 | } | ||
500 | |||
501 | static inline u16 read_gs(void) | ||
502 | { | ||
503 | u16 seg; | ||
504 | asm("mov %%gs, %0" : "=g"(seg)); | ||
505 | return seg; | ||
506 | } | ||
507 | |||
508 | static inline u16 read_ldt(void) | ||
509 | { | ||
510 | u16 ldt; | ||
511 | asm("sldt %0" : "=g"(ldt)); | ||
512 | return ldt; | ||
513 | } | ||
514 | |||
515 | static inline void load_fs(u16 sel) | ||
516 | { | ||
517 | asm("mov %0, %%fs" : : "rm"(sel)); | ||
518 | } | ||
519 | |||
520 | static inline void load_gs(u16 sel) | ||
521 | { | ||
522 | asm("mov %0, %%gs" : : "rm"(sel)); | ||
523 | } | ||
524 | |||
525 | #ifndef load_ldt | ||
526 | static inline void load_ldt(u16 sel) | ||
527 | { | ||
528 | asm("lldt %0" : : "rm"(sel)); | ||
529 | } | ||
530 | #endif | ||
531 | |||
532 | static inline void get_idt(struct descriptor_table *table) | ||
533 | { | ||
534 | asm("sidt %0" : "=m"(*table)); | ||
535 | } | ||
536 | |||
537 | static inline void get_gdt(struct descriptor_table *table) | ||
538 | { | ||
539 | asm("sgdt %0" : "=m"(*table)); | ||
540 | } | ||
541 | |||
542 | static inline unsigned long read_tr_base(void) | ||
543 | { | ||
544 | u16 tr; | ||
545 | asm("str %0" : "=g"(tr)); | ||
546 | return segment_base(tr); | ||
547 | } | ||
548 | |||
549 | #ifdef CONFIG_X86_64 | ||
550 | static inline unsigned long read_msr(unsigned long msr) | ||
551 | { | ||
552 | u64 value; | ||
553 | |||
554 | rdmsrl(msr, value); | ||
555 | return value; | ||
556 | } | ||
557 | #endif | ||
558 | |||
559 | static inline void fx_save(struct i387_fxsave_struct *image) | ||
560 | { | ||
561 | asm("fxsave (%0)":: "r" (image)); | ||
562 | } | ||
563 | |||
564 | static inline void fx_restore(struct i387_fxsave_struct *image) | ||
565 | { | ||
566 | asm("fxrstor (%0)":: "r" (image)); | ||
567 | } | ||
568 | |||
569 | static inline void fpu_init(void) | ||
570 | { | ||
571 | asm("finit"); | ||
572 | } | ||
573 | |||
574 | static inline u32 get_rdx_init_val(void) | ||
575 | { | ||
576 | return 0x600; /* P6 family */ | ||
577 | } | ||
578 | |||
579 | static inline void kvm_inject_gp(struct kvm_vcpu *vcpu, u32 error_code) | ||
580 | { | ||
581 | kvm_queue_exception_e(vcpu, GP_VECTOR, error_code); | ||
582 | } | ||
583 | |||
584 | #define ASM_VMX_VMCLEAR_RAX ".byte 0x66, 0x0f, 0xc7, 0x30" | ||
585 | #define ASM_VMX_VMLAUNCH ".byte 0x0f, 0x01, 0xc2" | ||
586 | #define ASM_VMX_VMRESUME ".byte 0x0f, 0x01, 0xc3" | ||
587 | #define ASM_VMX_VMPTRLD_RAX ".byte 0x0f, 0xc7, 0x30" | ||
588 | #define ASM_VMX_VMREAD_RDX_RAX ".byte 0x0f, 0x78, 0xd0" | ||
589 | #define ASM_VMX_VMWRITE_RAX_RDX ".byte 0x0f, 0x79, 0xd0" | ||
590 | #define ASM_VMX_VMWRITE_RSP_RDX ".byte 0x0f, 0x79, 0xd4" | ||
591 | #define ASM_VMX_VMXOFF ".byte 0x0f, 0x01, 0xc4" | ||
592 | #define ASM_VMX_VMXON_RAX ".byte 0xf3, 0x0f, 0xc7, 0x30" | ||
593 | |||
594 | #define MSR_IA32_TIME_STAMP_COUNTER 0x010 | ||
595 | |||
596 | #define TSS_IOPB_BASE_OFFSET 0x66 | ||
597 | #define TSS_BASE_SIZE 0x68 | ||
598 | #define TSS_IOPB_SIZE (65536 / 8) | ||
599 | #define TSS_REDIRECTION_SIZE (256 / 8) | ||
600 | #define RMODE_TSS_SIZE (TSS_BASE_SIZE + TSS_REDIRECTION_SIZE + TSS_IOPB_SIZE + 1) | ||
601 | |||
602 | #endif | ||
diff --git a/drivers/kvm/x86_emulate.c b/drivers/kvm/x86_emulate.c deleted file mode 100644 index 50b133f68743..000000000000 --- a/drivers/kvm/x86_emulate.c +++ /dev/null | |||
@@ -1,1913 +0,0 @@ | |||
1 | /****************************************************************************** | ||
2 | * x86_emulate.c | ||
3 | * | ||
4 | * Generic x86 (32-bit and 64-bit) instruction decoder and emulator. | ||
5 | * | ||
6 | * Copyright (c) 2005 Keir Fraser | ||
7 | * | ||
8 | * Linux coding style, mod r/m decoder, segment base fixes, real-mode | ||
9 | * privileged instructions: | ||
10 | * | ||
11 | * Copyright (C) 2006 Qumranet | ||
12 | * | ||
13 | * Avi Kivity <avi@qumranet.com> | ||
14 | * Yaniv Kamay <yaniv@qumranet.com> | ||
15 | * | ||
16 | * This work is licensed under the terms of the GNU GPL, version 2. See | ||
17 | * the COPYING file in the top-level directory. | ||
18 | * | ||
19 | * From: xen-unstable 10676:af9809f51f81a3c43f276f00c81a52ef558afda4 | ||
20 | */ | ||
21 | |||
22 | #ifndef __KERNEL__ | ||
23 | #include <stdio.h> | ||
24 | #include <stdint.h> | ||
25 | #include <public/xen.h> | ||
26 | #define DPRINTF(_f, _a ...) printf(_f , ## _a) | ||
27 | #else | ||
28 | #include "kvm.h" | ||
29 | #include "x86.h" | ||
30 | #define DPRINTF(x...) do {} while (0) | ||
31 | #endif | ||
32 | #include "x86_emulate.h" | ||
33 | #include <linux/module.h> | ||
34 | |||
35 | /* | ||
36 | * Opcode effective-address decode tables. | ||
37 | * Note that we only emulate instructions that have at least one memory | ||
38 | * operand (excluding implicit stack references). We assume that stack | ||
39 | * references and instruction fetches will never occur in special memory | ||
40 | * areas that require emulation. So, for example, 'mov <imm>,<reg>' need | ||
41 | * not be handled. | ||
42 | */ | ||
43 | |||
44 | /* Operand sizes: 8-bit operands or specified/overridden size. */ | ||
45 | #define ByteOp (1<<0) /* 8-bit operands. */ | ||
46 | /* Destination operand type. */ | ||
47 | #define ImplicitOps (1<<1) /* Implicit in opcode. No generic decode. */ | ||
48 | #define DstReg (2<<1) /* Register operand. */ | ||
49 | #define DstMem (3<<1) /* Memory operand. */ | ||
50 | #define DstMask (3<<1) | ||
51 | /* Source operand type. */ | ||
52 | #define SrcNone (0<<3) /* No source operand. */ | ||
53 | #define SrcImplicit (0<<3) /* Source operand is implicit in the opcode. */ | ||
54 | #define SrcReg (1<<3) /* Register operand. */ | ||
55 | #define SrcMem (2<<3) /* Memory operand. */ | ||
56 | #define SrcMem16 (3<<3) /* Memory operand (16-bit). */ | ||
57 | #define SrcMem32 (4<<3) /* Memory operand (32-bit). */ | ||
58 | #define SrcImm (5<<3) /* Immediate operand. */ | ||
59 | #define SrcImmByte (6<<3) /* 8-bit sign-extended immediate operand. */ | ||
60 | #define SrcMask (7<<3) | ||
61 | /* Generic ModRM decode. */ | ||
62 | #define ModRM (1<<6) | ||
63 | /* Destination is only written; never read. */ | ||
64 | #define Mov (1<<7) | ||
65 | #define BitOp (1<<8) | ||
66 | #define MemAbs (1<<9) /* Memory operand is absolute displacement */ | ||
67 | #define String (1<<10) /* String instruction (rep capable) */ | ||
68 | #define Stack (1<<11) /* Stack instruction (push/pop) */ | ||
69 | |||
70 | static u16 opcode_table[256] = { | ||
71 | /* 0x00 - 0x07 */ | ||
72 | ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, | ||
73 | ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, | ||
74 | 0, 0, 0, 0, | ||
75 | /* 0x08 - 0x0F */ | ||
76 | ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, | ||
77 | ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, | ||
78 | 0, 0, 0, 0, | ||
79 | /* 0x10 - 0x17 */ | ||
80 | ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, | ||
81 | ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, | ||
82 | 0, 0, 0, 0, | ||
83 | /* 0x18 - 0x1F */ | ||
84 | ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, | ||
85 | ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, | ||
86 | 0, 0, 0, 0, | ||
87 | /* 0x20 - 0x27 */ | ||
88 | ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, | ||
89 | ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, | ||
90 | SrcImmByte, SrcImm, 0, 0, | ||
91 | /* 0x28 - 0x2F */ | ||
92 | ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, | ||
93 | ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, | ||
94 | 0, 0, 0, 0, | ||
95 | /* 0x30 - 0x37 */ | ||
96 | ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, | ||
97 | ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, | ||
98 | 0, 0, 0, 0, | ||
99 | /* 0x38 - 0x3F */ | ||
100 | ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, | ||
101 | ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, | ||
102 | 0, 0, 0, 0, | ||
103 | /* 0x40 - 0x47 */ | ||
104 | DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, | ||
105 | /* 0x48 - 0x4F */ | ||
106 | DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, | ||
107 | /* 0x50 - 0x57 */ | ||
108 | SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, | ||
109 | SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, | ||
110 | /* 0x58 - 0x5F */ | ||
111 | DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack, | ||
112 | DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack, | ||
113 | /* 0x60 - 0x67 */ | ||
114 | 0, 0, 0, DstReg | SrcMem32 | ModRM | Mov /* movsxd (x86/64) */ , | ||
115 | 0, 0, 0, 0, | ||
116 | /* 0x68 - 0x6F */ | ||
117 | 0, 0, ImplicitOps | Mov | Stack, 0, | ||
118 | SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* insb, insw/insd */ | ||
119 | SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* outsb, outsw/outsd */ | ||
120 | /* 0x70 - 0x77 */ | ||
121 | ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, | ||
122 | ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, | ||
123 | /* 0x78 - 0x7F */ | ||
124 | ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, | ||
125 | ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, | ||
126 | /* 0x80 - 0x87 */ | ||
127 | ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM, | ||
128 | ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM, | ||
129 | ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, | ||
130 | ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, | ||
131 | /* 0x88 - 0x8F */ | ||
132 | ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov, | ||
133 | ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, | ||
134 | 0, ModRM | DstReg, 0, DstMem | SrcNone | ModRM | Mov | Stack, | ||
135 | /* 0x90 - 0x9F */ | ||
136 | 0, 0, 0, 0, 0, 0, 0, 0, | ||
137 | 0, 0, 0, 0, ImplicitOps | Stack, ImplicitOps | Stack, 0, 0, | ||
138 | /* 0xA0 - 0xA7 */ | ||
139 | ByteOp | DstReg | SrcMem | Mov | MemAbs, DstReg | SrcMem | Mov | MemAbs, | ||
140 | ByteOp | DstMem | SrcReg | Mov | MemAbs, DstMem | SrcReg | Mov | MemAbs, | ||
141 | ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String, | ||
142 | ByteOp | ImplicitOps | String, ImplicitOps | String, | ||
143 | /* 0xA8 - 0xAF */ | ||
144 | 0, 0, ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String, | ||
145 | ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String, | ||
146 | ByteOp | ImplicitOps | String, ImplicitOps | String, | ||
147 | /* 0xB0 - 0xBF */ | ||
148 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | ||
149 | /* 0xC0 - 0xC7 */ | ||
150 | ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM, | ||
151 | 0, ImplicitOps | Stack, 0, 0, | ||
152 | ByteOp | DstMem | SrcImm | ModRM | Mov, DstMem | SrcImm | ModRM | Mov, | ||
153 | /* 0xC8 - 0xCF */ | ||
154 | 0, 0, 0, 0, 0, 0, 0, 0, | ||
155 | /* 0xD0 - 0xD7 */ | ||
156 | ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM, | ||
157 | ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM, | ||
158 | 0, 0, 0, 0, | ||
159 | /* 0xD8 - 0xDF */ | ||
160 | 0, 0, 0, 0, 0, 0, 0, 0, | ||
161 | /* 0xE0 - 0xE7 */ | ||
162 | 0, 0, 0, 0, 0, 0, 0, 0, | ||
163 | /* 0xE8 - 0xEF */ | ||
164 | ImplicitOps | Stack, SrcImm|ImplicitOps, 0, SrcImmByte|ImplicitOps, | ||
165 | 0, 0, 0, 0, | ||
166 | /* 0xF0 - 0xF7 */ | ||
167 | 0, 0, 0, 0, | ||
168 | ImplicitOps, ImplicitOps, | ||
169 | ByteOp | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM, | ||
170 | /* 0xF8 - 0xFF */ | ||
171 | ImplicitOps, 0, ImplicitOps, ImplicitOps, | ||
172 | 0, 0, ByteOp | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM | ||
173 | }; | ||
174 | |||
175 | static u16 twobyte_table[256] = { | ||
176 | /* 0x00 - 0x0F */ | ||
177 | 0, SrcMem | ModRM | DstReg, 0, 0, 0, 0, ImplicitOps, 0, | ||
178 | ImplicitOps, ImplicitOps, 0, 0, 0, ImplicitOps | ModRM, 0, 0, | ||
179 | /* 0x10 - 0x1F */ | ||
180 | 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, 0, | ||
181 | /* 0x20 - 0x2F */ | ||
182 | ModRM | ImplicitOps, ModRM, ModRM | ImplicitOps, ModRM, 0, 0, 0, 0, | ||
183 | 0, 0, 0, 0, 0, 0, 0, 0, | ||
184 | /* 0x30 - 0x3F */ | ||
185 | ImplicitOps, 0, ImplicitOps, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | ||
186 | /* 0x40 - 0x47 */ | ||
187 | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, | ||
188 | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, | ||
189 | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, | ||
190 | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, | ||
191 | /* 0x48 - 0x4F */ | ||
192 | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, | ||
193 | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, | ||
194 | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, | ||
195 | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, | ||
196 | /* 0x50 - 0x5F */ | ||
197 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | ||
198 | /* 0x60 - 0x6F */ | ||
199 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | ||
200 | /* 0x70 - 0x7F */ | ||
201 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | ||
202 | /* 0x80 - 0x8F */ | ||
203 | ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, | ||
204 | ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, | ||
205 | ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, | ||
206 | ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, | ||
207 | /* 0x90 - 0x9F */ | ||
208 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | ||
209 | /* 0xA0 - 0xA7 */ | ||
210 | 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 0, 0, 0, 0, | ||
211 | /* 0xA8 - 0xAF */ | ||
212 | 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 0, 0, 0, 0, | ||
213 | /* 0xB0 - 0xB7 */ | ||
214 | ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 0, | ||
215 | DstMem | SrcReg | ModRM | BitOp, | ||
216 | 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov, | ||
217 | DstReg | SrcMem16 | ModRM | Mov, | ||
218 | /* 0xB8 - 0xBF */ | ||
219 | 0, 0, DstMem | SrcImmByte | ModRM, DstMem | SrcReg | ModRM | BitOp, | ||
220 | 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov, | ||
221 | DstReg | SrcMem16 | ModRM | Mov, | ||
222 | /* 0xC0 - 0xCF */ | ||
223 | 0, 0, 0, DstMem | SrcReg | ModRM | Mov, 0, 0, 0, ImplicitOps | ModRM, | ||
224 | 0, 0, 0, 0, 0, 0, 0, 0, | ||
225 | /* 0xD0 - 0xDF */ | ||
226 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | ||
227 | /* 0xE0 - 0xEF */ | ||
228 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | ||
229 | /* 0xF0 - 0xFF */ | ||
230 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 | ||
231 | }; | ||
232 | |||
233 | /* EFLAGS bit definitions. */ | ||
234 | #define EFLG_OF (1<<11) | ||
235 | #define EFLG_DF (1<<10) | ||
236 | #define EFLG_SF (1<<7) | ||
237 | #define EFLG_ZF (1<<6) | ||
238 | #define EFLG_AF (1<<4) | ||
239 | #define EFLG_PF (1<<2) | ||
240 | #define EFLG_CF (1<<0) | ||
241 | |||
242 | /* | ||
243 | * Instruction emulation: | ||
244 | * Most instructions are emulated directly via a fragment of inline assembly | ||
245 | * code. This allows us to save/restore EFLAGS and thus very easily pick up | ||
246 | * any modified flags. | ||
247 | */ | ||
248 | |||
249 | #if defined(CONFIG_X86_64) | ||
250 | #define _LO32 "k" /* force 32-bit operand */ | ||
251 | #define _STK "%%rsp" /* stack pointer */ | ||
252 | #elif defined(__i386__) | ||
253 | #define _LO32 "" /* force 32-bit operand */ | ||
254 | #define _STK "%%esp" /* stack pointer */ | ||
255 | #endif | ||
256 | |||
257 | /* | ||
258 | * These EFLAGS bits are restored from saved value during emulation, and | ||
259 | * any changes are written back to the saved value after emulation. | ||
260 | */ | ||
261 | #define EFLAGS_MASK (EFLG_OF|EFLG_SF|EFLG_ZF|EFLG_AF|EFLG_PF|EFLG_CF) | ||
262 | |||
263 | /* Before executing instruction: restore necessary bits in EFLAGS. */ | ||
264 | #define _PRE_EFLAGS(_sav, _msk, _tmp) \ | ||
265 | /* EFLAGS = (_sav & _msk) | (EFLAGS & ~_msk); _sav &= ~_msk; */ \ | ||
266 | "movl %"_sav",%"_LO32 _tmp"; " \ | ||
267 | "push %"_tmp"; " \ | ||
268 | "push %"_tmp"; " \ | ||
269 | "movl %"_msk",%"_LO32 _tmp"; " \ | ||
270 | "andl %"_LO32 _tmp",("_STK"); " \ | ||
271 | "pushf; " \ | ||
272 | "notl %"_LO32 _tmp"; " \ | ||
273 | "andl %"_LO32 _tmp",("_STK"); " \ | ||
274 | "andl %"_LO32 _tmp","__stringify(BITS_PER_LONG/4)"("_STK"); " \ | ||
275 | "pop %"_tmp"; " \ | ||
276 | "orl %"_LO32 _tmp",("_STK"); " \ | ||
277 | "popf; " \ | ||
278 | "pop %"_sav"; " | ||
279 | |||
280 | /* After executing instruction: write-back necessary bits in EFLAGS. */ | ||
281 | #define _POST_EFLAGS(_sav, _msk, _tmp) \ | ||
282 | /* _sav |= EFLAGS & _msk; */ \ | ||
283 | "pushf; " \ | ||
284 | "pop %"_tmp"; " \ | ||
285 | "andl %"_msk",%"_LO32 _tmp"; " \ | ||
286 | "orl %"_LO32 _tmp",%"_sav"; " | ||
287 | |||
288 | /* Raw emulation: instruction has two explicit operands. */ | ||
289 | #define __emulate_2op_nobyte(_op,_src,_dst,_eflags,_wx,_wy,_lx,_ly,_qx,_qy) \ | ||
290 | do { \ | ||
291 | unsigned long _tmp; \ | ||
292 | \ | ||
293 | switch ((_dst).bytes) { \ | ||
294 | case 2: \ | ||
295 | __asm__ __volatile__ ( \ | ||
296 | _PRE_EFLAGS("0", "4", "2") \ | ||
297 | _op"w %"_wx"3,%1; " \ | ||
298 | _POST_EFLAGS("0", "4", "2") \ | ||
299 | : "=m" (_eflags), "=m" ((_dst).val), \ | ||
300 | "=&r" (_tmp) \ | ||
301 | : _wy ((_src).val), "i" (EFLAGS_MASK)); \ | ||
302 | break; \ | ||
303 | case 4: \ | ||
304 | __asm__ __volatile__ ( \ | ||
305 | _PRE_EFLAGS("0", "4", "2") \ | ||
306 | _op"l %"_lx"3,%1; " \ | ||
307 | _POST_EFLAGS("0", "4", "2") \ | ||
308 | : "=m" (_eflags), "=m" ((_dst).val), \ | ||
309 | "=&r" (_tmp) \ | ||
310 | : _ly ((_src).val), "i" (EFLAGS_MASK)); \ | ||
311 | break; \ | ||
312 | case 8: \ | ||
313 | __emulate_2op_8byte(_op, _src, _dst, \ | ||
314 | _eflags, _qx, _qy); \ | ||
315 | break; \ | ||
316 | } \ | ||
317 | } while (0) | ||
318 | |||
319 | #define __emulate_2op(_op,_src,_dst,_eflags,_bx,_by,_wx,_wy,_lx,_ly,_qx,_qy) \ | ||
320 | do { \ | ||
321 | unsigned long _tmp; \ | ||
322 | switch ((_dst).bytes) { \ | ||
323 | case 1: \ | ||
324 | __asm__ __volatile__ ( \ | ||
325 | _PRE_EFLAGS("0", "4", "2") \ | ||
326 | _op"b %"_bx"3,%1; " \ | ||
327 | _POST_EFLAGS("0", "4", "2") \ | ||
328 | : "=m" (_eflags), "=m" ((_dst).val), \ | ||
329 | "=&r" (_tmp) \ | ||
330 | : _by ((_src).val), "i" (EFLAGS_MASK)); \ | ||
331 | break; \ | ||
332 | default: \ | ||
333 | __emulate_2op_nobyte(_op, _src, _dst, _eflags, \ | ||
334 | _wx, _wy, _lx, _ly, _qx, _qy); \ | ||
335 | break; \ | ||
336 | } \ | ||
337 | } while (0) | ||
338 | |||
339 | /* Source operand is byte-sized and may be restricted to just %cl. */ | ||
340 | #define emulate_2op_SrcB(_op, _src, _dst, _eflags) \ | ||
341 | __emulate_2op(_op, _src, _dst, _eflags, \ | ||
342 | "b", "c", "b", "c", "b", "c", "b", "c") | ||
343 | |||
344 | /* Source operand is byte, word, long or quad sized. */ | ||
345 | #define emulate_2op_SrcV(_op, _src, _dst, _eflags) \ | ||
346 | __emulate_2op(_op, _src, _dst, _eflags, \ | ||
347 | "b", "q", "w", "r", _LO32, "r", "", "r") | ||
348 | |||
349 | /* Source operand is word, long or quad sized. */ | ||
350 | #define emulate_2op_SrcV_nobyte(_op, _src, _dst, _eflags) \ | ||
351 | __emulate_2op_nobyte(_op, _src, _dst, _eflags, \ | ||
352 | "w", "r", _LO32, "r", "", "r") | ||
353 | |||
354 | /* Instruction has only one explicit operand (no source operand). */ | ||
355 | #define emulate_1op(_op, _dst, _eflags) \ | ||
356 | do { \ | ||
357 | unsigned long _tmp; \ | ||
358 | \ | ||
359 | switch ((_dst).bytes) { \ | ||
360 | case 1: \ | ||
361 | __asm__ __volatile__ ( \ | ||
362 | _PRE_EFLAGS("0", "3", "2") \ | ||
363 | _op"b %1; " \ | ||
364 | _POST_EFLAGS("0", "3", "2") \ | ||
365 | : "=m" (_eflags), "=m" ((_dst).val), \ | ||
366 | "=&r" (_tmp) \ | ||
367 | : "i" (EFLAGS_MASK)); \ | ||
368 | break; \ | ||
369 | case 2: \ | ||
370 | __asm__ __volatile__ ( \ | ||
371 | _PRE_EFLAGS("0", "3", "2") \ | ||
372 | _op"w %1; " \ | ||
373 | _POST_EFLAGS("0", "3", "2") \ | ||
374 | : "=m" (_eflags), "=m" ((_dst).val), \ | ||
375 | "=&r" (_tmp) \ | ||
376 | : "i" (EFLAGS_MASK)); \ | ||
377 | break; \ | ||
378 | case 4: \ | ||
379 | __asm__ __volatile__ ( \ | ||
380 | _PRE_EFLAGS("0", "3", "2") \ | ||
381 | _op"l %1; " \ | ||
382 | _POST_EFLAGS("0", "3", "2") \ | ||
383 | : "=m" (_eflags), "=m" ((_dst).val), \ | ||
384 | "=&r" (_tmp) \ | ||
385 | : "i" (EFLAGS_MASK)); \ | ||
386 | break; \ | ||
387 | case 8: \ | ||
388 | __emulate_1op_8byte(_op, _dst, _eflags); \ | ||
389 | break; \ | ||
390 | } \ | ||
391 | } while (0) | ||
392 | |||
393 | /* Emulate an instruction with quadword operands (x86/64 only). */ | ||
394 | #if defined(CONFIG_X86_64) | ||
395 | #define __emulate_2op_8byte(_op, _src, _dst, _eflags, _qx, _qy) \ | ||
396 | do { \ | ||
397 | __asm__ __volatile__ ( \ | ||
398 | _PRE_EFLAGS("0", "4", "2") \ | ||
399 | _op"q %"_qx"3,%1; " \ | ||
400 | _POST_EFLAGS("0", "4", "2") \ | ||
401 | : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \ | ||
402 | : _qy ((_src).val), "i" (EFLAGS_MASK)); \ | ||
403 | } while (0) | ||
404 | |||
405 | #define __emulate_1op_8byte(_op, _dst, _eflags) \ | ||
406 | do { \ | ||
407 | __asm__ __volatile__ ( \ | ||
408 | _PRE_EFLAGS("0", "3", "2") \ | ||
409 | _op"q %1; " \ | ||
410 | _POST_EFLAGS("0", "3", "2") \ | ||
411 | : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \ | ||
412 | : "i" (EFLAGS_MASK)); \ | ||
413 | } while (0) | ||
414 | |||
415 | #elif defined(__i386__) | ||
416 | #define __emulate_2op_8byte(_op, _src, _dst, _eflags, _qx, _qy) | ||
417 | #define __emulate_1op_8byte(_op, _dst, _eflags) | ||
418 | #endif /* __i386__ */ | ||
419 | |||
420 | /* Fetch next part of the instruction being emulated. */ | ||
421 | #define insn_fetch(_type, _size, _eip) \ | ||
422 | ({ unsigned long _x; \ | ||
423 | rc = do_insn_fetch(ctxt, ops, (_eip), &_x, (_size)); \ | ||
424 | if (rc != 0) \ | ||
425 | goto done; \ | ||
426 | (_eip) += (_size); \ | ||
427 | (_type)_x; \ | ||
428 | }) | ||
429 | |||
430 | /* Access/update address held in a register, based on addressing mode. */ | ||
431 | #define address_mask(reg) \ | ||
432 | ((c->ad_bytes == sizeof(unsigned long)) ? \ | ||
433 | (reg) : ((reg) & ((1UL << (c->ad_bytes << 3)) - 1))) | ||
434 | #define register_address(base, reg) \ | ||
435 | ((base) + address_mask(reg)) | ||
436 | #define register_address_increment(reg, inc) \ | ||
437 | do { \ | ||
438 | /* signed type ensures sign extension to long */ \ | ||
439 | int _inc = (inc); \ | ||
440 | if (c->ad_bytes == sizeof(unsigned long)) \ | ||
441 | (reg) += _inc; \ | ||
442 | else \ | ||
443 | (reg) = ((reg) & \ | ||
444 | ~((1UL << (c->ad_bytes << 3)) - 1)) | \ | ||
445 | (((reg) + _inc) & \ | ||
446 | ((1UL << (c->ad_bytes << 3)) - 1)); \ | ||
447 | } while (0) | ||
448 | |||
449 | #define JMP_REL(rel) \ | ||
450 | do { \ | ||
451 | register_address_increment(c->eip, rel); \ | ||
452 | } while (0) | ||
453 | |||
454 | static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt, | ||
455 | struct x86_emulate_ops *ops, | ||
456 | unsigned long linear, u8 *dest) | ||
457 | { | ||
458 | struct fetch_cache *fc = &ctxt->decode.fetch; | ||
459 | int rc; | ||
460 | int size; | ||
461 | |||
462 | if (linear < fc->start || linear >= fc->end) { | ||
463 | size = min(15UL, PAGE_SIZE - offset_in_page(linear)); | ||
464 | rc = ops->read_std(linear, fc->data, size, ctxt->vcpu); | ||
465 | if (rc) | ||
466 | return rc; | ||
467 | fc->start = linear; | ||
468 | fc->end = linear + size; | ||
469 | } | ||
470 | *dest = fc->data[linear - fc->start]; | ||
471 | return 0; | ||
472 | } | ||
473 | |||
474 | static int do_insn_fetch(struct x86_emulate_ctxt *ctxt, | ||
475 | struct x86_emulate_ops *ops, | ||
476 | unsigned long eip, void *dest, unsigned size) | ||
477 | { | ||
478 | int rc = 0; | ||
479 | |||
480 | eip += ctxt->cs_base; | ||
481 | while (size--) { | ||
482 | rc = do_fetch_insn_byte(ctxt, ops, eip++, dest++); | ||
483 | if (rc) | ||
484 | return rc; | ||
485 | } | ||
486 | return 0; | ||
487 | } | ||
488 | |||
489 | /* | ||
490 | * Given the 'reg' portion of a ModRM byte, and a register block, return a | ||
491 | * pointer into the block that addresses the relevant register. | ||
492 | * @highbyte_regs specifies whether to decode AH,CH,DH,BH. | ||
493 | */ | ||
494 | static void *decode_register(u8 modrm_reg, unsigned long *regs, | ||
495 | int highbyte_regs) | ||
496 | { | ||
497 | void *p; | ||
498 | |||
499 | p = ®s[modrm_reg]; | ||
500 | if (highbyte_regs && modrm_reg >= 4 && modrm_reg < 8) | ||
501 | p = (unsigned char *)®s[modrm_reg & 3] + 1; | ||
502 | return p; | ||
503 | } | ||
504 | |||
505 | static int read_descriptor(struct x86_emulate_ctxt *ctxt, | ||
506 | struct x86_emulate_ops *ops, | ||
507 | void *ptr, | ||
508 | u16 *size, unsigned long *address, int op_bytes) | ||
509 | { | ||
510 | int rc; | ||
511 | |||
512 | if (op_bytes == 2) | ||
513 | op_bytes = 3; | ||
514 | *address = 0; | ||
515 | rc = ops->read_std((unsigned long)ptr, (unsigned long *)size, 2, | ||
516 | ctxt->vcpu); | ||
517 | if (rc) | ||
518 | return rc; | ||
519 | rc = ops->read_std((unsigned long)ptr + 2, address, op_bytes, | ||
520 | ctxt->vcpu); | ||
521 | return rc; | ||
522 | } | ||
523 | |||
524 | static int test_cc(unsigned int condition, unsigned int flags) | ||
525 | { | ||
526 | int rc = 0; | ||
527 | |||
528 | switch ((condition & 15) >> 1) { | ||
529 | case 0: /* o */ | ||
530 | rc |= (flags & EFLG_OF); | ||
531 | break; | ||
532 | case 1: /* b/c/nae */ | ||
533 | rc |= (flags & EFLG_CF); | ||
534 | break; | ||
535 | case 2: /* z/e */ | ||
536 | rc |= (flags & EFLG_ZF); | ||
537 | break; | ||
538 | case 3: /* be/na */ | ||
539 | rc |= (flags & (EFLG_CF|EFLG_ZF)); | ||
540 | break; | ||
541 | case 4: /* s */ | ||
542 | rc |= (flags & EFLG_SF); | ||
543 | break; | ||
544 | case 5: /* p/pe */ | ||
545 | rc |= (flags & EFLG_PF); | ||
546 | break; | ||
547 | case 7: /* le/ng */ | ||
548 | rc |= (flags & EFLG_ZF); | ||
549 | /* fall through */ | ||
550 | case 6: /* l/nge */ | ||
551 | rc |= (!(flags & EFLG_SF) != !(flags & EFLG_OF)); | ||
552 | break; | ||
553 | } | ||
554 | |||
555 | /* Odd condition identifiers (lsb == 1) have inverted sense. */ | ||
556 | return (!!rc ^ (condition & 1)); | ||
557 | } | ||
558 | |||
559 | static void decode_register_operand(struct operand *op, | ||
560 | struct decode_cache *c, | ||
561 | int inhibit_bytereg) | ||
562 | { | ||
563 | unsigned reg = c->modrm_reg; | ||
564 | int highbyte_regs = c->rex_prefix == 0; | ||
565 | |||
566 | if (!(c->d & ModRM)) | ||
567 | reg = (c->b & 7) | ((c->rex_prefix & 1) << 3); | ||
568 | op->type = OP_REG; | ||
569 | if ((c->d & ByteOp) && !inhibit_bytereg) { | ||
570 | op->ptr = decode_register(reg, c->regs, highbyte_regs); | ||
571 | op->val = *(u8 *)op->ptr; | ||
572 | op->bytes = 1; | ||
573 | } else { | ||
574 | op->ptr = decode_register(reg, c->regs, 0); | ||
575 | op->bytes = c->op_bytes; | ||
576 | switch (op->bytes) { | ||
577 | case 2: | ||
578 | op->val = *(u16 *)op->ptr; | ||
579 | break; | ||
580 | case 4: | ||
581 | op->val = *(u32 *)op->ptr; | ||
582 | break; | ||
583 | case 8: | ||
584 | op->val = *(u64 *) op->ptr; | ||
585 | break; | ||
586 | } | ||
587 | } | ||
588 | op->orig_val = op->val; | ||
589 | } | ||
590 | |||
591 | static int decode_modrm(struct x86_emulate_ctxt *ctxt, | ||
592 | struct x86_emulate_ops *ops) | ||
593 | { | ||
594 | struct decode_cache *c = &ctxt->decode; | ||
595 | u8 sib; | ||
596 | int index_reg = 0, base_reg = 0, scale, rip_relative = 0; | ||
597 | int rc = 0; | ||
598 | |||
599 | if (c->rex_prefix) { | ||
600 | c->modrm_reg = (c->rex_prefix & 4) << 1; /* REX.R */ | ||
601 | index_reg = (c->rex_prefix & 2) << 2; /* REX.X */ | ||
602 | c->modrm_rm = base_reg = (c->rex_prefix & 1) << 3; /* REX.B */ | ||
603 | } | ||
604 | |||
605 | c->modrm = insn_fetch(u8, 1, c->eip); | ||
606 | c->modrm_mod |= (c->modrm & 0xc0) >> 6; | ||
607 | c->modrm_reg |= (c->modrm & 0x38) >> 3; | ||
608 | c->modrm_rm |= (c->modrm & 0x07); | ||
609 | c->modrm_ea = 0; | ||
610 | c->use_modrm_ea = 1; | ||
611 | |||
612 | if (c->modrm_mod == 3) { | ||
613 | c->modrm_val = *(unsigned long *) | ||
614 | decode_register(c->modrm_rm, c->regs, c->d & ByteOp); | ||
615 | return rc; | ||
616 | } | ||
617 | |||
618 | if (c->ad_bytes == 2) { | ||
619 | unsigned bx = c->regs[VCPU_REGS_RBX]; | ||
620 | unsigned bp = c->regs[VCPU_REGS_RBP]; | ||
621 | unsigned si = c->regs[VCPU_REGS_RSI]; | ||
622 | unsigned di = c->regs[VCPU_REGS_RDI]; | ||
623 | |||
624 | /* 16-bit ModR/M decode. */ | ||
625 | switch (c->modrm_mod) { | ||
626 | case 0: | ||
627 | if (c->modrm_rm == 6) | ||
628 | c->modrm_ea += insn_fetch(u16, 2, c->eip); | ||
629 | break; | ||
630 | case 1: | ||
631 | c->modrm_ea += insn_fetch(s8, 1, c->eip); | ||
632 | break; | ||
633 | case 2: | ||
634 | c->modrm_ea += insn_fetch(u16, 2, c->eip); | ||
635 | break; | ||
636 | } | ||
637 | switch (c->modrm_rm) { | ||
638 | case 0: | ||
639 | c->modrm_ea += bx + si; | ||
640 | break; | ||
641 | case 1: | ||
642 | c->modrm_ea += bx + di; | ||
643 | break; | ||
644 | case 2: | ||
645 | c->modrm_ea += bp + si; | ||
646 | break; | ||
647 | case 3: | ||
648 | c->modrm_ea += bp + di; | ||
649 | break; | ||
650 | case 4: | ||
651 | c->modrm_ea += si; | ||
652 | break; | ||
653 | case 5: | ||
654 | c->modrm_ea += di; | ||
655 | break; | ||
656 | case 6: | ||
657 | if (c->modrm_mod != 0) | ||
658 | c->modrm_ea += bp; | ||
659 | break; | ||
660 | case 7: | ||
661 | c->modrm_ea += bx; | ||
662 | break; | ||
663 | } | ||
664 | if (c->modrm_rm == 2 || c->modrm_rm == 3 || | ||
665 | (c->modrm_rm == 6 && c->modrm_mod != 0)) | ||
666 | if (!c->override_base) | ||
667 | c->override_base = &ctxt->ss_base; | ||
668 | c->modrm_ea = (u16)c->modrm_ea; | ||
669 | } else { | ||
670 | /* 32/64-bit ModR/M decode. */ | ||
671 | switch (c->modrm_rm) { | ||
672 | case 4: | ||
673 | case 12: | ||
674 | sib = insn_fetch(u8, 1, c->eip); | ||
675 | index_reg |= (sib >> 3) & 7; | ||
676 | base_reg |= sib & 7; | ||
677 | scale = sib >> 6; | ||
678 | |||
679 | switch (base_reg) { | ||
680 | case 5: | ||
681 | if (c->modrm_mod != 0) | ||
682 | c->modrm_ea += c->regs[base_reg]; | ||
683 | else | ||
684 | c->modrm_ea += | ||
685 | insn_fetch(s32, 4, c->eip); | ||
686 | break; | ||
687 | default: | ||
688 | c->modrm_ea += c->regs[base_reg]; | ||
689 | } | ||
690 | switch (index_reg) { | ||
691 | case 4: | ||
692 | break; | ||
693 | default: | ||
694 | c->modrm_ea += c->regs[index_reg] << scale; | ||
695 | } | ||
696 | break; | ||
697 | case 5: | ||
698 | if (c->modrm_mod != 0) | ||
699 | c->modrm_ea += c->regs[c->modrm_rm]; | ||
700 | else if (ctxt->mode == X86EMUL_MODE_PROT64) | ||
701 | rip_relative = 1; | ||
702 | break; | ||
703 | default: | ||
704 | c->modrm_ea += c->regs[c->modrm_rm]; | ||
705 | break; | ||
706 | } | ||
707 | switch (c->modrm_mod) { | ||
708 | case 0: | ||
709 | if (c->modrm_rm == 5) | ||
710 | c->modrm_ea += insn_fetch(s32, 4, c->eip); | ||
711 | break; | ||
712 | case 1: | ||
713 | c->modrm_ea += insn_fetch(s8, 1, c->eip); | ||
714 | break; | ||
715 | case 2: | ||
716 | c->modrm_ea += insn_fetch(s32, 4, c->eip); | ||
717 | break; | ||
718 | } | ||
719 | } | ||
720 | if (rip_relative) { | ||
721 | c->modrm_ea += c->eip; | ||
722 | switch (c->d & SrcMask) { | ||
723 | case SrcImmByte: | ||
724 | c->modrm_ea += 1; | ||
725 | break; | ||
726 | case SrcImm: | ||
727 | if (c->d & ByteOp) | ||
728 | c->modrm_ea += 1; | ||
729 | else | ||
730 | if (c->op_bytes == 8) | ||
731 | c->modrm_ea += 4; | ||
732 | else | ||
733 | c->modrm_ea += c->op_bytes; | ||
734 | } | ||
735 | } | ||
736 | done: | ||
737 | return rc; | ||
738 | } | ||
739 | |||
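A worked 16-bit example of the decode above (hypothetical values, not from the original file): ModRM byte 0x52 splits into mod = 01, reg = 010, rm = 010, i.e. [BP + SI + disp8], with SS becoming the default segment because the base register is BP.

    #include <stdio.h>

    int main(void)
    {
            unsigned bp = 0x1000, si = 0x0020;
            int disp8 = 0x10;                             /* sign-extended disp8 */
            unsigned short ea = (unsigned short)(bp + si + disp8);

            printf("modrm_ea = %#x\n", ea);               /* prints 0x1030 */
            return 0;
    }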
740 | static int decode_abs(struct x86_emulate_ctxt *ctxt, | ||
741 | struct x86_emulate_ops *ops) | ||
742 | { | ||
743 | struct decode_cache *c = &ctxt->decode; | ||
744 | int rc = 0; | ||
745 | |||
746 | switch (c->ad_bytes) { | ||
747 | case 2: | ||
748 | c->modrm_ea = insn_fetch(u16, 2, c->eip); | ||
749 | break; | ||
750 | case 4: | ||
751 | c->modrm_ea = insn_fetch(u32, 4, c->eip); | ||
752 | break; | ||
753 | case 8: | ||
754 | c->modrm_ea = insn_fetch(u64, 8, c->eip); | ||
755 | break; | ||
756 | } | ||
757 | done: | ||
758 | return rc; | ||
759 | } | ||
760 | |||
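decode_abs() handles the moffs forms (opcodes 0xa0-0xa3), where the operand is a bare absolute address whose width equals the current address size. A hypothetical 32-bit example (not from the original file): the byte sequence a1 44 33 22 11 is mov eax, [0x11223344], so decode_abs() consumes the four displacement bytes and sets modrm_ea accordingly.

    unsigned char insn[] = { 0xa1, 0x44, 0x33, 0x22, 0x11 };  /* mov eax, [0x11223344] */
    unsigned long ea = insn[1] | (insn[2] << 8) | (insn[3] << 16) |
                       ((unsigned long)insn[4] << 24);        /* ea == 0x11223344 */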
761 | int | ||
762 | x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) | ||
763 | { | ||
764 | struct decode_cache *c = &ctxt->decode; | ||
765 | int rc = 0; | ||
766 | int mode = ctxt->mode; | ||
767 | int def_op_bytes, def_ad_bytes; | ||
768 | |||
769 | /* Shadow copy of register state. Committed on successful emulation. */ | ||
770 | |||
771 | memset(c, 0, sizeof(struct decode_cache)); | ||
772 | c->eip = ctxt->vcpu->arch.rip; | ||
773 | memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs); | ||
774 | |||
775 | switch (mode) { | ||
776 | case X86EMUL_MODE_REAL: | ||
777 | case X86EMUL_MODE_PROT16: | ||
778 | def_op_bytes = def_ad_bytes = 2; | ||
779 | break; | ||
780 | case X86EMUL_MODE_PROT32: | ||
781 | def_op_bytes = def_ad_bytes = 4; | ||
782 | break; | ||
783 | #ifdef CONFIG_X86_64 | ||
784 | case X86EMUL_MODE_PROT64: | ||
785 | def_op_bytes = 4; | ||
786 | def_ad_bytes = 8; | ||
787 | break; | ||
788 | #endif | ||
789 | default: | ||
790 | return -1; | ||
791 | } | ||
792 | |||
793 | c->op_bytes = def_op_bytes; | ||
794 | c->ad_bytes = def_ad_bytes; | ||
795 | |||
796 | /* Legacy prefixes. */ | ||
797 | for (;;) { | ||
798 | switch (c->b = insn_fetch(u8, 1, c->eip)) { | ||
799 | case 0x66: /* operand-size override */ | ||
800 | /* switch between 2/4 bytes */ | ||
801 | c->op_bytes = def_op_bytes ^ 6; | ||
802 | break; | ||
803 | case 0x67: /* address-size override */ | ||
804 | if (mode == X86EMUL_MODE_PROT64) | ||
805 | /* switch between 4/8 bytes */ | ||
806 | c->ad_bytes = def_ad_bytes ^ 12; | ||
807 | else | ||
808 | /* switch between 2/4 bytes */ | ||
809 | c->ad_bytes = def_ad_bytes ^ 6; | ||
810 | break; | ||
811 | case 0x2e: /* CS override */ | ||
812 | c->override_base = &ctxt->cs_base; | ||
813 | break; | ||
814 | case 0x3e: /* DS override */ | ||
815 | c->override_base = &ctxt->ds_base; | ||
816 | break; | ||
817 | case 0x26: /* ES override */ | ||
818 | c->override_base = &ctxt->es_base; | ||
819 | break; | ||
820 | case 0x64: /* FS override */ | ||
821 | c->override_base = &ctxt->fs_base; | ||
822 | break; | ||
823 | case 0x65: /* GS override */ | ||
824 | c->override_base = &ctxt->gs_base; | ||
825 | break; | ||
826 | case 0x36: /* SS override */ | ||
827 | c->override_base = &ctxt->ss_base; | ||
828 | break; | ||
829 | case 0x40 ... 0x4f: /* REX */ | ||
830 | if (mode != X86EMUL_MODE_PROT64) | ||
831 | goto done_prefixes; | ||
832 | c->rex_prefix = c->b; | ||
833 | continue; | ||
834 | case 0xf0: /* LOCK */ | ||
835 | c->lock_prefix = 1; | ||
836 | break; | ||
837 | case 0xf2: /* REPNE/REPNZ */ | ||
838 | c->rep_prefix = REPNE_PREFIX; | ||
839 | break; | ||
840 | case 0xf3: /* REP/REPE/REPZ */ | ||
841 | c->rep_prefix = REPE_PREFIX; | ||
842 | break; | ||
843 | default: | ||
844 | goto done_prefixes; | ||
845 | } | ||
846 | |||
847 | /* Any legacy prefix after a REX prefix nullifies its effect. */ | ||
848 | |||
849 | c->rex_prefix = 0; | ||
850 | } | ||
851 | |||
852 | done_prefixes: | ||
853 | |||
854 | /* REX prefix. */ | ||
855 | if (c->rex_prefix) | ||
856 | if (c->rex_prefix & 8) | ||
857 | c->op_bytes = 8; /* REX.W */ | ||
858 | |||
859 | /* Opcode byte(s). */ | ||
860 | c->d = opcode_table[c->b]; | ||
861 | if (c->d == 0) { | ||
862 | /* Two-byte opcode? */ | ||
863 | if (c->b == 0x0f) { | ||
864 | c->twobyte = 1; | ||
865 | c->b = insn_fetch(u8, 1, c->eip); | ||
866 | c->d = twobyte_table[c->b]; | ||
867 | } | ||
868 | |||
869 | /* Unrecognised? */ | ||
870 | if (c->d == 0) { | ||
871 | DPRINTF("Cannot emulate %02x\n", c->b); | ||
872 | return -1; | ||
873 | } | ||
874 | } | ||
875 | |||
876 | if (mode == X86EMUL_MODE_PROT64 && (c->d & Stack)) | ||
877 | c->op_bytes = 8; | ||
878 | |||
879 | /* ModRM and SIB bytes. */ | ||
880 | if (c->d & ModRM) | ||
881 | rc = decode_modrm(ctxt, ops); | ||
882 | else if (c->d & MemAbs) | ||
883 | rc = decode_abs(ctxt, ops); | ||
884 | if (rc) | ||
885 | goto done; | ||
886 | |||
887 | if (!c->override_base) | ||
888 | c->override_base = &ctxt->ds_base; | ||
889 | if (mode == X86EMUL_MODE_PROT64 && | ||
890 | c->override_base != &ctxt->fs_base && | ||
891 | c->override_base != &ctxt->gs_base) | ||
892 | c->override_base = NULL; | ||
893 | |||
894 | if (c->override_base) | ||
895 | c->modrm_ea += *c->override_base; | ||
896 | |||
897 | if (c->ad_bytes != 8) | ||
898 | c->modrm_ea = (u32)c->modrm_ea; | ||
899 | /* | ||
900 | * Decode and fetch the source operand: register, memory | ||
901 | * or immediate. | ||
902 | */ | ||
903 | switch (c->d & SrcMask) { | ||
904 | case SrcNone: | ||
905 | break; | ||
906 | case SrcReg: | ||
907 | decode_register_operand(&c->src, c, 0); | ||
908 | break; | ||
909 | case SrcMem16: | ||
910 | c->src.bytes = 2; | ||
911 | goto srcmem_common; | ||
912 | case SrcMem32: | ||
913 | c->src.bytes = 4; | ||
914 | goto srcmem_common; | ||
915 | case SrcMem: | ||
916 | c->src.bytes = (c->d & ByteOp) ? 1 : | ||
917 | c->op_bytes; | ||
918 | /* Don't fetch the address for invlpg: it could be unmapped. */ | ||
919 | if (c->twobyte && c->b == 0x01 && c->modrm_reg == 7) | ||
920 | break; | ||
921 | srcmem_common: | ||
922 | /* | ||
923 | * For instructions with a ModR/M byte, switch to register | ||
924 | * access if Mod = 3. | ||
925 | */ | ||
926 | if ((c->d & ModRM) && c->modrm_mod == 3) { | ||
927 | c->src.type = OP_REG; | ||
928 | break; | ||
929 | } | ||
930 | c->src.type = OP_MEM; | ||
931 | break; | ||
932 | case SrcImm: | ||
933 | c->src.type = OP_IMM; | ||
934 | c->src.ptr = (unsigned long *)c->eip; | ||
935 | c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; | ||
936 | if (c->src.bytes == 8) | ||
937 | c->src.bytes = 4; | ||
938 | /* NB. Immediates are sign-extended as necessary. */ | ||
939 | switch (c->src.bytes) { | ||
940 | case 1: | ||
941 | c->src.val = insn_fetch(s8, 1, c->eip); | ||
942 | break; | ||
943 | case 2: | ||
944 | c->src.val = insn_fetch(s16, 2, c->eip); | ||
945 | break; | ||
946 | case 4: | ||
947 | c->src.val = insn_fetch(s32, 4, c->eip); | ||
948 | break; | ||
949 | } | ||
950 | break; | ||
951 | case SrcImmByte: | ||
952 | c->src.type = OP_IMM; | ||
953 | c->src.ptr = (unsigned long *)c->eip; | ||
954 | c->src.bytes = 1; | ||
955 | c->src.val = insn_fetch(s8, 1, c->eip); | ||
956 | break; | ||
957 | } | ||
958 | |||
959 | /* Decode and fetch the destination operand: register or memory. */ | ||
960 | switch (c->d & DstMask) { | ||
961 | case ImplicitOps: | ||
962 | /* Special instructions do their own operand decoding. */ | ||
963 | return 0; | ||
964 | case DstReg: | ||
965 | decode_register_operand(&c->dst, c, | ||
966 | c->twobyte && (c->b == 0xb6 || c->b == 0xb7)); | ||
967 | break; | ||
968 | case DstMem: | ||
969 | if ((c->d & ModRM) && c->modrm_mod == 3) { | ||
970 | c->dst.type = OP_REG; | ||
971 | break; | ||
972 | } | ||
973 | c->dst.type = OP_MEM; | ||
974 | break; | ||
975 | } | ||
976 | |||
977 | done: | ||
978 | return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0; | ||
979 | } | ||
980 | |||
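One detail of the prefix handling above that is easy to miss is the XOR trick used for the size overrides; the small stand-alone check below (not from the original file) makes it explicit.

    #include <assert.h>

    int main(void)
    {
            int def_op_bytes = 4, def_ad_bytes = 8;   /* 64-bit mode defaults */

            assert((def_op_bytes ^ 6) == 2);    /* 0x66: 4-byte -> 2-byte operands  */
            assert((def_ad_bytes ^ 12) == 4);   /* 0x67: 8-byte -> 4-byte addresses */
            assert((2 ^ 6) == 4 && (4 ^ 12) == 8);    /* and the reverse direction  */
            return 0;
    }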
981 | static inline void emulate_push(struct x86_emulate_ctxt *ctxt) | ||
982 | { | ||
983 | struct decode_cache *c = &ctxt->decode; | ||
984 | |||
985 | c->dst.type = OP_MEM; | ||
986 | c->dst.bytes = c->op_bytes; | ||
987 | c->dst.val = c->src.val; | ||
988 | register_address_increment(c->regs[VCPU_REGS_RSP], -c->op_bytes); | ||
989 | c->dst.ptr = (void *) register_address(ctxt->ss_base, | ||
990 | c->regs[VCPU_REGS_RSP]); | ||
991 | } | ||
992 | |||
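A worked example of the push helper above (hypothetical values, not from the original file): with op_bytes = 8, ss_base = 0 and RSP = 0x8000, the register_address_increment() call first lowers RSP to 0x7ff8, and the later writeback stores the 8-byte value at ss_base + RSP.

    unsigned long ss_base = 0, rsp = 0x8000, op_bytes = 8;

    rsp -= op_bytes;              /* register_address_increment(..., -op_bytes) */
    /* dst.ptr = ss_base + rsp = 0x7ff8; the value itself is written at writeback time */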
993 | static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt, | ||
994 | struct x86_emulate_ops *ops) | ||
995 | { | ||
996 | struct decode_cache *c = &ctxt->decode; | ||
997 | int rc; | ||
998 | |||
999 | rc = ops->read_std(register_address(ctxt->ss_base, | ||
1000 | c->regs[VCPU_REGS_RSP]), | ||
1001 | &c->dst.val, c->dst.bytes, ctxt->vcpu); | ||
1002 | if (rc != 0) | ||
1003 | return rc; | ||
1004 | |||
1005 | register_address_increment(c->regs[VCPU_REGS_RSP], c->dst.bytes); | ||
1006 | |||
1007 | return 0; | ||
1008 | } | ||
1009 | |||
1010 | static inline void emulate_grp2(struct x86_emulate_ctxt *ctxt) | ||
1011 | { | ||
1012 | struct decode_cache *c = &ctxt->decode; | ||
1013 | switch (c->modrm_reg) { | ||
1014 | case 0: /* rol */ | ||
1015 | emulate_2op_SrcB("rol", c->src, c->dst, ctxt->eflags); | ||
1016 | break; | ||
1017 | case 1: /* ror */ | ||
1018 | emulate_2op_SrcB("ror", c->src, c->dst, ctxt->eflags); | ||
1019 | break; | ||
1020 | case 2: /* rcl */ | ||
1021 | emulate_2op_SrcB("rcl", c->src, c->dst, ctxt->eflags); | ||
1022 | break; | ||
1023 | case 3: /* rcr */ | ||
1024 | emulate_2op_SrcB("rcr", c->src, c->dst, ctxt->eflags); | ||
1025 | break; | ||
1026 | case 4: /* sal/shl */ | ||
1027 | case 6: /* sal/shl */ | ||
1028 | emulate_2op_SrcB("sal", c->src, c->dst, ctxt->eflags); | ||
1029 | break; | ||
1030 | case 5: /* shr */ | ||
1031 | emulate_2op_SrcB("shr", c->src, c->dst, ctxt->eflags); | ||
1032 | break; | ||
1033 | case 7: /* sar */ | ||
1034 | emulate_2op_SrcB("sar", c->src, c->dst, ctxt->eflags); | ||
1035 | break; | ||
1036 | } | ||
1037 | } | ||
1038 | |||
1039 | static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt, | ||
1040 | struct x86_emulate_ops *ops) | ||
1041 | { | ||
1042 | struct decode_cache *c = &ctxt->decode; | ||
1043 | int rc = 0; | ||
1044 | |||
1045 | switch (c->modrm_reg) { | ||
1046 | case 0 ... 1: /* test */ | ||
1047 | /* | ||
1048 | * Special case in Grp3: test has an immediate | ||
1049 | * source operand. | ||
1050 | */ | ||
1051 | c->src.type = OP_IMM; | ||
1052 | c->src.ptr = (unsigned long *)c->eip; | ||
1053 | c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; | ||
1054 | if (c->src.bytes == 8) | ||
1055 | c->src.bytes = 4; | ||
1056 | switch (c->src.bytes) { | ||
1057 | case 1: | ||
1058 | c->src.val = insn_fetch(s8, 1, c->eip); | ||
1059 | break; | ||
1060 | case 2: | ||
1061 | c->src.val = insn_fetch(s16, 2, c->eip); | ||
1062 | break; | ||
1063 | case 4: | ||
1064 | c->src.val = insn_fetch(s32, 4, c->eip); | ||
1065 | break; | ||
1066 | } | ||
1067 | emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags); | ||
1068 | break; | ||
1069 | case 2: /* not */ | ||
1070 | c->dst.val = ~c->dst.val; | ||
1071 | break; | ||
1072 | case 3: /* neg */ | ||
1073 | emulate_1op("neg", c->dst, ctxt->eflags); | ||
1074 | break; | ||
1075 | default: | ||
1076 | DPRINTF("Cannot emulate %02x\n", c->b); | ||
1077 | rc = X86EMUL_UNHANDLEABLE; | ||
1078 | break; | ||
1079 | } | ||
1080 | done: | ||
1081 | return rc; | ||
1082 | } | ||
1083 | |||
1084 | static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt, | ||
1085 | struct x86_emulate_ops *ops) | ||
1086 | { | ||
1087 | struct decode_cache *c = &ctxt->decode; | ||
1088 | int rc; | ||
1089 | |||
1090 | switch (c->modrm_reg) { | ||
1091 | case 0: /* inc */ | ||
1092 | emulate_1op("inc", c->dst, ctxt->eflags); | ||
1093 | break; | ||
1094 | case 1: /* dec */ | ||
1095 | emulate_1op("dec", c->dst, ctxt->eflags); | ||
1096 | break; | ||
1097 | case 4: /* jmp abs */ | ||
1098 | if (c->b == 0xff) | ||
1099 | c->eip = c->dst.val; | ||
1100 | else { | ||
1101 | DPRINTF("Cannot emulate %02x\n", c->b); | ||
1102 | return X86EMUL_UNHANDLEABLE; | ||
1103 | } | ||
1104 | break; | ||
1105 | case 6: /* push */ | ||
1106 | |||
1107 | /* 64-bit mode: PUSH always pushes a 64-bit operand. */ | ||
1108 | |||
1109 | if (ctxt->mode == X86EMUL_MODE_PROT64) { | ||
1110 | c->dst.bytes = 8; | ||
1111 | rc = ops->read_std((unsigned long)c->dst.ptr, | ||
1112 | &c->dst.val, 8, ctxt->vcpu); | ||
1113 | if (rc != 0) | ||
1114 | return rc; | ||
1115 | } | ||
1116 | register_address_increment(c->regs[VCPU_REGS_RSP], | ||
1117 | -c->dst.bytes); | ||
1118 | rc = ops->write_emulated(register_address(ctxt->ss_base, | ||
1119 | c->regs[VCPU_REGS_RSP]), &c->dst.val, | ||
1120 | c->dst.bytes, ctxt->vcpu); | ||
1121 | if (rc != 0) | ||
1122 | return rc; | ||
1123 | c->dst.type = OP_NONE; | ||
1124 | break; | ||
1125 | default: | ||
1126 | DPRINTF("Cannot emulate %02x\n", c->b); | ||
1127 | return X86EMUL_UNHANDLEABLE; | ||
1128 | } | ||
1129 | return 0; | ||
1130 | } | ||
1131 | |||
1132 | static inline int emulate_grp9(struct x86_emulate_ctxt *ctxt, | ||
1133 | struct x86_emulate_ops *ops, | ||
1134 | unsigned long memop) | ||
1135 | { | ||
1136 | struct decode_cache *c = &ctxt->decode; | ||
1137 | u64 old, new; | ||
1138 | int rc; | ||
1139 | |||
1140 | rc = ops->read_emulated(memop, &old, 8, ctxt->vcpu); | ||
1141 | if (rc != 0) | ||
1142 | return rc; | ||
1143 | |||
1144 | if (((u32) (old >> 0) != (u32) c->regs[VCPU_REGS_RAX]) || | ||
1145 | ((u32) (old >> 32) != (u32) c->regs[VCPU_REGS_RDX])) { | ||
1146 | |||
1147 | c->regs[VCPU_REGS_RAX] = (u32) (old >> 0); | ||
1148 | c->regs[VCPU_REGS_RDX] = (u32) (old >> 32); | ||
1149 | ctxt->eflags &= ~EFLG_ZF; | ||
1150 | |||
1151 | } else { | ||
1152 | new = ((u64)c->regs[VCPU_REGS_RCX] << 32) | | ||
1153 | (u32) c->regs[VCPU_REGS_RBX]; | ||
1154 | |||
1155 | rc = ops->cmpxchg_emulated(memop, &old, &new, 8, ctxt->vcpu); | ||
1156 | if (rc != 0) | ||
1157 | return rc; | ||
1158 | ctxt->eflags |= EFLG_ZF; | ||
1159 | } | ||
1160 | return 0; | ||
1161 | } | ||
1162 | |||
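For reference, the CMPXCHG8B semantics that emulate_grp9() implements can be modelled by the small stand-alone function below (a sketch, not from the original file): EDX:EAX is compared with the 64-bit memory operand; on a match ECX:EBX is stored and ZF is set, otherwise the memory value is loaded into EDX:EAX and ZF is cleared.

    #include <stdint.h>

    static int cmpxchg8b_model(uint64_t *mem, uint32_t *eax, uint32_t *edx,
                               uint32_t ebx, uint32_t ecx)
    {
            uint64_t expected = ((uint64_t)*edx << 32) | *eax;

            if (*mem == expected) {
                    *mem = ((uint64_t)ecx << 32) | ebx;
                    return 1;                  /* ZF set */
            }
            *eax = (uint32_t)*mem;
            *edx = (uint32_t)(*mem >> 32);
            return 0;                          /* ZF clear */
    }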
1163 | static inline int writeback(struct x86_emulate_ctxt *ctxt, | ||
1164 | struct x86_emulate_ops *ops) | ||
1165 | { | ||
1166 | int rc; | ||
1167 | struct decode_cache *c = &ctxt->decode; | ||
1168 | |||
1169 | switch (c->dst.type) { | ||
1170 | case OP_REG: | ||
1171 | /* The 4-byte case *is* correct: | ||
1172 | * in 64-bit mode we zero-extend. | ||
1173 | */ | ||
1174 | switch (c->dst.bytes) { | ||
1175 | case 1: | ||
1176 | *(u8 *)c->dst.ptr = (u8)c->dst.val; | ||
1177 | break; | ||
1178 | case 2: | ||
1179 | *(u16 *)c->dst.ptr = (u16)c->dst.val; | ||
1180 | break; | ||
1181 | case 4: | ||
1182 | *c->dst.ptr = (u32)c->dst.val; | ||
1183 | break; /* 64b: zero-ext */ | ||
1184 | case 8: | ||
1185 | *c->dst.ptr = c->dst.val; | ||
1186 | break; | ||
1187 | } | ||
1188 | break; | ||
1189 | case OP_MEM: | ||
1190 | if (c->lock_prefix) | ||
1191 | rc = ops->cmpxchg_emulated( | ||
1192 | (unsigned long)c->dst.ptr, | ||
1193 | &c->dst.orig_val, | ||
1194 | &c->dst.val, | ||
1195 | c->dst.bytes, | ||
1196 | ctxt->vcpu); | ||
1197 | else | ||
1198 | rc = ops->write_emulated( | ||
1199 | (unsigned long)c->dst.ptr, | ||
1200 | &c->dst.val, | ||
1201 | c->dst.bytes, | ||
1202 | ctxt->vcpu); | ||
1203 | if (rc != 0) | ||
1204 | return rc; | ||
1205 | break; | ||
1206 | case OP_NONE: | ||
1207 | /* no writeback */ | ||
1208 | break; | ||
1209 | default: | ||
1210 | break; | ||
1211 | } | ||
1212 | return 0; | ||
1213 | } | ||
1214 | |||
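To see how the OP_MEM branch above preserves atomicity for LOCKed read-modify-write instructions, consider a hypothetical locked "add dword [mem], eax" (an illustration, not a trace from the original file): the destination is read once into dst.orig_val, the ALU macro produces dst.val, and writeback() then issues

    ops->cmpxchg_emulated((unsigned long)c->dst.ptr,
                          &c->dst.orig_val, &c->dst.val, 4, ctxt->vcpu);

so the store only lands if memory still holds the value that was originally read.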
1215 | int | ||
1216 | x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) | ||
1217 | { | ||
1218 | unsigned long memop = 0; | ||
1219 | u64 msr_data; | ||
1220 | unsigned long saved_eip = 0; | ||
1221 | struct decode_cache *c = &ctxt->decode; | ||
1222 | int rc = 0; | ||
1223 | |||
1224 | /* Shadow copy of register state. Committed on successful emulation. | ||
1225 | * NOTE: we can copy them from vcpu as x86_decode_insn() doesn't | ||
1226 | * modify them. | ||
1227 | */ | ||
1228 | |||
1229 | memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs); | ||
1230 | saved_eip = c->eip; | ||
1231 | |||
1232 | if (((c->d & ModRM) && (c->modrm_mod != 3)) || (c->d & MemAbs)) | ||
1233 | memop = c->modrm_ea; | ||
1234 | |||
1235 | if (c->rep_prefix && (c->d & String)) { | ||
1236 | /* All REP prefixes have the same first termination condition */ | ||
1237 | if (c->regs[VCPU_REGS_RCX] == 0) { | ||
1238 | ctxt->vcpu->arch.rip = c->eip; | ||
1239 | goto done; | ||
1240 | } | ||
1241 | /* The second termination condition only applies to REPE | ||
1242 | * and REPNE. If the repeat string operation prefix is | ||
1243 | * REPE/REPZ or REPNE/REPNZ, test the corresponding | ||
1244 | * termination condition: | ||
1245 | * - if REPE/REPZ and ZF = 0 then done | ||
1246 | * - if REPNE/REPNZ and ZF = 1 then done | ||
1247 | */ | ||
1248 | if ((c->b == 0xa6) || (c->b == 0xa7) || | ||
1249 | (c->b == 0xae) || (c->b == 0xaf)) { | ||
1250 | if ((c->rep_prefix == REPE_PREFIX) && | ||
1251 | ((ctxt->eflags & EFLG_ZF) == 0)) { | ||
1252 | ctxt->vcpu->arch.rip = c->eip; | ||
1253 | goto done; | ||
1254 | } | ||
1255 | if ((c->rep_prefix == REPNE_PREFIX) && | ||
1256 | ((ctxt->eflags & EFLG_ZF) == EFLG_ZF)) { | ||
1257 | ctxt->vcpu->arch.rip = c->eip; | ||
1258 | goto done; | ||
1259 | } | ||
1260 | } | ||
1261 | c->regs[VCPU_REGS_RCX]--; | ||
1262 | c->eip = ctxt->vcpu->arch.rip; | ||
1263 | } | ||
1264 | |||
1265 | if (c->src.type == OP_MEM) { | ||
1266 | c->src.ptr = (unsigned long *)memop; | ||
1267 | c->src.val = 0; | ||
1268 | rc = ops->read_emulated((unsigned long)c->src.ptr, | ||
1269 | &c->src.val, | ||
1270 | c->src.bytes, | ||
1271 | ctxt->vcpu); | ||
1272 | if (rc != 0) | ||
1273 | goto done; | ||
1274 | c->src.orig_val = c->src.val; | ||
1275 | } | ||
1276 | |||
1277 | if ((c->d & DstMask) == ImplicitOps) | ||
1278 | goto special_insn; | ||
1279 | |||
1280 | |||
1281 | if (c->dst.type == OP_MEM) { | ||
1282 | c->dst.ptr = (unsigned long *)memop; | ||
1283 | c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; | ||
1284 | c->dst.val = 0; | ||
1285 | if (c->d & BitOp) { | ||
1286 | unsigned long mask = ~(c->dst.bytes * 8 - 1); | ||
1287 | |||
1288 | c->dst.ptr = (void *)c->dst.ptr + | ||
1289 | (c->src.val & mask) / 8; | ||
1290 | } | ||
1291 | if (!(c->d & Mov) && | ||
1292 | /* optimisation - avoid slow emulated read */ | ||
1293 | ((rc = ops->read_emulated((unsigned long)c->dst.ptr, | ||
1294 | &c->dst.val, | ||
1295 | c->dst.bytes, ctxt->vcpu)) != 0)) | ||
1296 | goto done; | ||
1297 | } | ||
1298 | c->dst.orig_val = c->dst.val; | ||
1299 | |||
1300 | special_insn: | ||
1301 | |||
1302 | if (c->twobyte) | ||
1303 | goto twobyte_insn; | ||
1304 | |||
1305 | switch (c->b) { | ||
1306 | case 0x00 ... 0x05: | ||
1307 | add: /* add */ | ||
1308 | emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags); | ||
1309 | break; | ||
1310 | case 0x08 ... 0x0d: | ||
1311 | or: /* or */ | ||
1312 | emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags); | ||
1313 | break; | ||
1314 | case 0x10 ... 0x15: | ||
1315 | adc: /* adc */ | ||
1316 | emulate_2op_SrcV("adc", c->src, c->dst, ctxt->eflags); | ||
1317 | break; | ||
1318 | case 0x18 ... 0x1d: | ||
1319 | sbb: /* sbb */ | ||
1320 | emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags); | ||
1321 | break; | ||
1322 | case 0x20 ... 0x23: | ||
1323 | and: /* and */ | ||
1324 | emulate_2op_SrcV("and", c->src, c->dst, ctxt->eflags); | ||
1325 | break; | ||
1326 | case 0x24: /* and al imm8 */ | ||
1327 | c->dst.type = OP_REG; | ||
1328 | c->dst.ptr = &c->regs[VCPU_REGS_RAX]; | ||
1329 | c->dst.val = *(u8 *)c->dst.ptr; | ||
1330 | c->dst.bytes = 1; | ||
1331 | c->dst.orig_val = c->dst.val; | ||
1332 | goto and; | ||
1333 | case 0x25: /* and ax imm16, or eax imm32 */ | ||
1334 | c->dst.type = OP_REG; | ||
1335 | c->dst.bytes = c->op_bytes; | ||
1336 | c->dst.ptr = &c->regs[VCPU_REGS_RAX]; | ||
1337 | if (c->op_bytes == 2) | ||
1338 | c->dst.val = *(u16 *)c->dst.ptr; | ||
1339 | else | ||
1340 | c->dst.val = *(u32 *)c->dst.ptr; | ||
1341 | c->dst.orig_val = c->dst.val; | ||
1342 | goto and; | ||
1343 | case 0x28 ... 0x2d: | ||
1344 | sub: /* sub */ | ||
1345 | emulate_2op_SrcV("sub", c->src, c->dst, ctxt->eflags); | ||
1346 | break; | ||
1347 | case 0x30 ... 0x35: | ||
1348 | xor: /* xor */ | ||
1349 | emulate_2op_SrcV("xor", c->src, c->dst, ctxt->eflags); | ||
1350 | break; | ||
1351 | case 0x38 ... 0x3d: | ||
1352 | cmp: /* cmp */ | ||
1353 | emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags); | ||
1354 | break; | ||
1355 | case 0x40 ... 0x47: /* inc r16/r32 */ | ||
1356 | emulate_1op("inc", c->dst, ctxt->eflags); | ||
1357 | break; | ||
1358 | case 0x48 ... 0x4f: /* dec r16/r32 */ | ||
1359 | emulate_1op("dec", c->dst, ctxt->eflags); | ||
1360 | break; | ||
1361 | case 0x50 ... 0x57: /* push reg */ | ||
1362 | c->dst.type = OP_MEM; | ||
1363 | c->dst.bytes = c->op_bytes; | ||
1364 | c->dst.val = c->src.val; | ||
1365 | register_address_increment(c->regs[VCPU_REGS_RSP], | ||
1366 | -c->op_bytes); | ||
1367 | c->dst.ptr = (void *) register_address( | ||
1368 | ctxt->ss_base, c->regs[VCPU_REGS_RSP]); | ||
1369 | break; | ||
1370 | case 0x58 ... 0x5f: /* pop reg */ | ||
1371 | pop_instruction: | ||
1372 | if ((rc = ops->read_std(register_address(ctxt->ss_base, | ||
1373 | c->regs[VCPU_REGS_RSP]), c->dst.ptr, | ||
1374 | c->op_bytes, ctxt->vcpu)) != 0) | ||
1375 | goto done; | ||
1376 | |||
1377 | register_address_increment(c->regs[VCPU_REGS_RSP], | ||
1378 | c->op_bytes); | ||
1379 | c->dst.type = OP_NONE; /* Disable writeback. */ | ||
1380 | break; | ||
1381 | case 0x63: /* movsxd */ | ||
1382 | if (ctxt->mode != X86EMUL_MODE_PROT64) | ||
1383 | goto cannot_emulate; | ||
1384 | c->dst.val = (s32) c->src.val; | ||
1385 | break; | ||
1386 | case 0x6a: /* push imm8 */ | ||
1387 | c->src.val = 0L; | ||
1388 | c->src.val = insn_fetch(s8, 1, c->eip); | ||
1389 | emulate_push(ctxt); | ||
1390 | break; | ||
1391 | case 0x6c: /* insb */ | ||
1392 | case 0x6d: /* insw/insd */ | ||
1393 | if (kvm_emulate_pio_string(ctxt->vcpu, NULL, | ||
1394 | 1, | ||
1395 | (c->d & ByteOp) ? 1 : c->op_bytes, | ||
1396 | c->rep_prefix ? | ||
1397 | address_mask(c->regs[VCPU_REGS_RCX]) : 1, | ||
1398 | (ctxt->eflags & EFLG_DF), | ||
1399 | register_address(ctxt->es_base, | ||
1400 | c->regs[VCPU_REGS_RDI]), | ||
1401 | c->rep_prefix, | ||
1402 | c->regs[VCPU_REGS_RDX]) == 0) { | ||
1403 | c->eip = saved_eip; | ||
1404 | return -1; | ||
1405 | } | ||
1406 | return 0; | ||
1407 | case 0x6e: /* outsb */ | ||
1408 | case 0x6f: /* outsw/outsd */ | ||
1409 | if (kvm_emulate_pio_string(ctxt->vcpu, NULL, | ||
1410 | 0, | ||
1411 | (c->d & ByteOp) ? 1 : c->op_bytes, | ||
1412 | c->rep_prefix ? | ||
1413 | address_mask(c->regs[VCPU_REGS_RCX]) : 1, | ||
1414 | (ctxt->eflags & EFLG_DF), | ||
1415 | register_address(c->override_base ? | ||
1416 | *c->override_base : | ||
1417 | ctxt->ds_base, | ||
1418 | c->regs[VCPU_REGS_RSI]), | ||
1419 | c->rep_prefix, | ||
1420 | c->regs[VCPU_REGS_RDX]) == 0) { | ||
1421 | c->eip = saved_eip; | ||
1422 | return -1; | ||
1423 | } | ||
1424 | return 0; | ||
1425 | case 0x70 ... 0x7f: /* jcc (short) */ { | ||
1426 | int rel = insn_fetch(s8, 1, c->eip); | ||
1427 | |||
1428 | if (test_cc(c->b, ctxt->eflags)) | ||
1429 | JMP_REL(rel); | ||
1430 | break; | ||
1431 | } | ||
1432 | case 0x80 ... 0x83: /* Grp1 */ | ||
1433 | switch (c->modrm_reg) { | ||
1434 | case 0: | ||
1435 | goto add; | ||
1436 | case 1: | ||
1437 | goto or; | ||
1438 | case 2: | ||
1439 | goto adc; | ||
1440 | case 3: | ||
1441 | goto sbb; | ||
1442 | case 4: | ||
1443 | goto and; | ||
1444 | case 5: | ||
1445 | goto sub; | ||
1446 | case 6: | ||
1447 | goto xor; | ||
1448 | case 7: | ||
1449 | goto cmp; | ||
1450 | } | ||
1451 | break; | ||
1452 | case 0x84 ... 0x85: | ||
1453 | emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags); | ||
1454 | break; | ||
1455 | case 0x86 ... 0x87: /* xchg */ | ||
1456 | /* Write back the register source. */ | ||
1457 | switch (c->dst.bytes) { | ||
1458 | case 1: | ||
1459 | *(u8 *) c->src.ptr = (u8) c->dst.val; | ||
1460 | break; | ||
1461 | case 2: | ||
1462 | *(u16 *) c->src.ptr = (u16) c->dst.val; | ||
1463 | break; | ||
1464 | case 4: | ||
1465 | *c->src.ptr = (u32) c->dst.val; | ||
1466 | break; /* 64b reg: zero-extend */ | ||
1467 | case 8: | ||
1468 | *c->src.ptr = c->dst.val; | ||
1469 | break; | ||
1470 | } | ||
1471 | /* | ||
1472 | * Write back the memory destination with implicit LOCK | ||
1473 | * prefix. | ||
1474 | */ | ||
1475 | c->dst.val = c->src.val; | ||
1476 | c->lock_prefix = 1; | ||
1477 | break; | ||
1478 | case 0x88 ... 0x8b: /* mov */ | ||
1479 | goto mov; | ||
1480 | case 0x8d: /* lea r16/r32, m */ | ||
1481 | c->dst.val = c->modrm_val; | ||
1482 | break; | ||
1483 | case 0x8f: /* pop (sole member of Grp1a) */ | ||
1484 | rc = emulate_grp1a(ctxt, ops); | ||
1485 | if (rc != 0) | ||
1486 | goto done; | ||
1487 | break; | ||
1488 | case 0x9c: /* pushf */ | ||
1489 | c->src.val = (unsigned long) ctxt->eflags; | ||
1490 | emulate_push(ctxt); | ||
1491 | break; | ||
1492 | case 0x9d: /* popf */ | ||
1493 | c->dst.ptr = (unsigned long *) &ctxt->eflags; | ||
1494 | goto pop_instruction; | ||
1495 | case 0xa0 ... 0xa1: /* mov */ | ||
1496 | c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX]; | ||
1497 | c->dst.val = c->src.val; | ||
1498 | break; | ||
1499 | case 0xa2 ... 0xa3: /* mov */ | ||
1500 | c->dst.val = (unsigned long)c->regs[VCPU_REGS_RAX]; | ||
1501 | break; | ||
1502 | case 0xa4 ... 0xa5: /* movs */ | ||
1503 | c->dst.type = OP_MEM; | ||
1504 | c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; | ||
1505 | c->dst.ptr = (unsigned long *)register_address( | ||
1506 | ctxt->es_base, | ||
1507 | c->regs[VCPU_REGS_RDI]); | ||
1508 | if ((rc = ops->read_emulated(register_address( | ||
1509 | c->override_base ? *c->override_base : | ||
1510 | ctxt->ds_base, | ||
1511 | c->regs[VCPU_REGS_RSI]), | ||
1512 | &c->dst.val, | ||
1513 | c->dst.bytes, ctxt->vcpu)) != 0) | ||
1514 | goto done; | ||
1515 | register_address_increment(c->regs[VCPU_REGS_RSI], | ||
1516 | (ctxt->eflags & EFLG_DF) ? -c->dst.bytes | ||
1517 | : c->dst.bytes); | ||
1518 | register_address_increment(c->regs[VCPU_REGS_RDI], | ||
1519 | (ctxt->eflags & EFLG_DF) ? -c->dst.bytes | ||
1520 | : c->dst.bytes); | ||
1521 | break; | ||
1522 | case 0xa6 ... 0xa7: /* cmps */ | ||
1523 | c->src.type = OP_NONE; /* Disable writeback. */ | ||
1524 | c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; | ||
1525 | c->src.ptr = (unsigned long *)register_address( | ||
1526 | c->override_base ? *c->override_base : | ||
1527 | ctxt->ds_base, | ||
1528 | c->regs[VCPU_REGS_RSI]); | ||
1529 | if ((rc = ops->read_emulated((unsigned long)c->src.ptr, | ||
1530 | &c->src.val, | ||
1531 | c->src.bytes, | ||
1532 | ctxt->vcpu)) != 0) | ||
1533 | goto done; | ||
1534 | |||
1535 | c->dst.type = OP_NONE; /* Disable writeback. */ | ||
1536 | c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; | ||
1537 | c->dst.ptr = (unsigned long *)register_address( | ||
1538 | ctxt->es_base, | ||
1539 | c->regs[VCPU_REGS_RDI]); | ||
1540 | if ((rc = ops->read_emulated((unsigned long)c->dst.ptr, | ||
1541 | &c->dst.val, | ||
1542 | c->dst.bytes, | ||
1543 | ctxt->vcpu)) != 0) | ||
1544 | goto done; | ||
1545 | |||
1546 | DPRINTF("cmps: mem1=0x%p mem2=0x%p\n", c->src.ptr, c->dst.ptr); | ||
1547 | |||
1548 | emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags); | ||
1549 | |||
1550 | register_address_increment(c->regs[VCPU_REGS_RSI], | ||
1551 | (ctxt->eflags & EFLG_DF) ? -c->src.bytes | ||
1552 | : c->src.bytes); | ||
1553 | register_address_increment(c->regs[VCPU_REGS_RDI], | ||
1554 | (ctxt->eflags & EFLG_DF) ? -c->dst.bytes | ||
1555 | : c->dst.bytes); | ||
1556 | |||
1557 | break; | ||
1558 | case 0xaa ... 0xab: /* stos */ | ||
1559 | c->dst.type = OP_MEM; | ||
1560 | c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; | ||
1561 | c->dst.ptr = (unsigned long *)register_address( | ||
1562 | ctxt->es_base, | ||
1563 | c->regs[VCPU_REGS_RDI]); | ||
1564 | c->dst.val = c->regs[VCPU_REGS_RAX]; | ||
1565 | register_address_increment(c->regs[VCPU_REGS_RDI], | ||
1566 | (ctxt->eflags & EFLG_DF) ? -c->dst.bytes | ||
1567 | : c->dst.bytes); | ||
1568 | break; | ||
1569 | case 0xac ... 0xad: /* lods */ | ||
1570 | c->dst.type = OP_REG; | ||
1571 | c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; | ||
1572 | c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX]; | ||
1573 | if ((rc = ops->read_emulated(register_address( | ||
1574 | c->override_base ? *c->override_base : | ||
1575 | ctxt->ds_base, | ||
1576 | c->regs[VCPU_REGS_RSI]), | ||
1577 | &c->dst.val, | ||
1578 | c->dst.bytes, | ||
1579 | ctxt->vcpu)) != 0) | ||
1580 | goto done; | ||
1581 | register_address_increment(c->regs[VCPU_REGS_RSI], | ||
1582 | (ctxt->eflags & EFLG_DF) ? -c->dst.bytes | ||
1583 | : c->dst.bytes); | ||
1584 | break; | ||
1585 | case 0xae ... 0xaf: /* scas */ | ||
1586 | DPRINTF("Urk! I don't handle SCAS.\n"); | ||
1587 | goto cannot_emulate; | ||
1588 | case 0xc0 ... 0xc1: | ||
1589 | emulate_grp2(ctxt); | ||
1590 | break; | ||
1591 | case 0xc3: /* ret */ | ||
1592 | c->dst.ptr = &c->eip; | ||
1593 | goto pop_instruction; | ||
1594 | case 0xc6 ... 0xc7: /* mov (sole member of Grp11) */ | ||
1595 | mov: | ||
1596 | c->dst.val = c->src.val; | ||
1597 | break; | ||
1598 | case 0xd0 ... 0xd1: /* Grp2 */ | ||
1599 | c->src.val = 1; | ||
1600 | emulate_grp2(ctxt); | ||
1601 | break; | ||
1602 | case 0xd2 ... 0xd3: /* Grp2 */ | ||
1603 | c->src.val = c->regs[VCPU_REGS_RCX]; | ||
1604 | emulate_grp2(ctxt); | ||
1605 | break; | ||
1606 | case 0xe8: /* call (near) */ { | ||
1607 | long int rel; | ||
1608 | switch (c->op_bytes) { | ||
1609 | case 2: | ||
1610 | rel = insn_fetch(s16, 2, c->eip); | ||
1611 | break; | ||
1612 | case 4: | ||
1613 | rel = insn_fetch(s32, 4, c->eip); | ||
1614 | break; | ||
1615 | default: | ||
1616 | DPRINTF("Call: Invalid op_bytes\n"); | ||
1617 | goto cannot_emulate; | ||
1618 | } | ||
1619 | c->src.val = (unsigned long) c->eip; | ||
1620 | JMP_REL(rel); | ||
1621 | c->op_bytes = c->ad_bytes; | ||
1622 | emulate_push(ctxt); | ||
1623 | break; | ||
1624 | } | ||
1625 | case 0xe9: /* jmp rel */ | ||
1626 | case 0xeb: /* jmp rel short */ | ||
1627 | JMP_REL(c->src.val); | ||
1628 | c->dst.type = OP_NONE; /* Disable writeback. */ | ||
1629 | break; | ||
1630 | case 0xf4: /* hlt */ | ||
1631 | ctxt->vcpu->arch.halt_request = 1; | ||
1632 | goto done; | ||
1633 | case 0xf5: /* cmc */ | ||
1634 | /* complement carry flag from eflags reg */ | ||
1635 | ctxt->eflags ^= EFLG_CF; | ||
1636 | c->dst.type = OP_NONE; /* Disable writeback. */ | ||
1637 | break; | ||
1638 | case 0xf6 ... 0xf7: /* Grp3 */ | ||
1639 | rc = emulate_grp3(ctxt, ops); | ||
1640 | if (rc != 0) | ||
1641 | goto done; | ||
1642 | break; | ||
1643 | case 0xf8: /* clc */ | ||
1644 | ctxt->eflags &= ~EFLG_CF; | ||
1645 | c->dst.type = OP_NONE; /* Disable writeback. */ | ||
1646 | break; | ||
1647 | case 0xfa: /* cli */ | ||
1648 | ctxt->eflags &= ~X86_EFLAGS_IF; | ||
1649 | c->dst.type = OP_NONE; /* Disable writeback. */ | ||
1650 | break; | ||
1651 | case 0xfb: /* sti */ | ||
1652 | ctxt->eflags |= X86_EFLAGS_IF; | ||
1653 | c->dst.type = OP_NONE; /* Disable writeback. */ | ||
1654 | break; | ||
1655 | case 0xfe ... 0xff: /* Grp4/Grp5 */ | ||
1656 | rc = emulate_grp45(ctxt, ops); | ||
1657 | if (rc != 0) | ||
1658 | goto done; | ||
1659 | break; | ||
1660 | } | ||
1661 | |||
1662 | writeback: | ||
1663 | rc = writeback(ctxt, ops); | ||
1664 | if (rc != 0) | ||
1665 | goto done; | ||
1666 | |||
1667 | /* Commit shadow register state. */ | ||
1668 | memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs); | ||
1669 | ctxt->vcpu->arch.rip = c->eip; | ||
1670 | |||
1671 | done: | ||
1672 | if (rc == X86EMUL_UNHANDLEABLE) { | ||
1673 | c->eip = saved_eip; | ||
1674 | return -1; | ||
1675 | } | ||
1676 | return 0; | ||
1677 | |||
1678 | twobyte_insn: | ||
1679 | switch (c->b) { | ||
1680 | case 0x01: /* lgdt, lidt, lmsw */ | ||
1681 | switch (c->modrm_reg) { | ||
1682 | u16 size; | ||
1683 | unsigned long address; | ||
1684 | |||
1685 | case 0: /* vmcall */ | ||
1686 | if (c->modrm_mod != 3 || c->modrm_rm != 1) | ||
1687 | goto cannot_emulate; | ||
1688 | |||
1689 | rc = kvm_fix_hypercall(ctxt->vcpu); | ||
1690 | if (rc) | ||
1691 | goto done; | ||
1692 | |||
1693 | kvm_emulate_hypercall(ctxt->vcpu); | ||
1694 | break; | ||
1695 | case 2: /* lgdt */ | ||
1696 | rc = read_descriptor(ctxt, ops, c->src.ptr, | ||
1697 | &size, &address, c->op_bytes); | ||
1698 | if (rc) | ||
1699 | goto done; | ||
1700 | realmode_lgdt(ctxt->vcpu, size, address); | ||
1701 | break; | ||
1702 | case 3: /* lidt/vmmcall */ | ||
1703 | if (c->modrm_mod == 3 && c->modrm_rm == 1) { | ||
1704 | rc = kvm_fix_hypercall(ctxt->vcpu); | ||
1705 | if (rc) | ||
1706 | goto done; | ||
1707 | kvm_emulate_hypercall(ctxt->vcpu); | ||
1708 | } else { | ||
1709 | rc = read_descriptor(ctxt, ops, c->src.ptr, | ||
1710 | &size, &address, | ||
1711 | c->op_bytes); | ||
1712 | if (rc) | ||
1713 | goto done; | ||
1714 | realmode_lidt(ctxt->vcpu, size, address); | ||
1715 | } | ||
1716 | break; | ||
1717 | case 4: /* smsw */ | ||
1718 | if (c->modrm_mod != 3) | ||
1719 | goto cannot_emulate; | ||
1720 | *(u16 *)&c->regs[c->modrm_rm] | ||
1721 | = realmode_get_cr(ctxt->vcpu, 0); | ||
1722 | break; | ||
1723 | case 6: /* lmsw */ | ||
1724 | if (c->modrm_mod != 3) | ||
1725 | goto cannot_emulate; | ||
1726 | realmode_lmsw(ctxt->vcpu, (u16)c->modrm_val, | ||
1727 | &ctxt->eflags); | ||
1728 | break; | ||
1729 | case 7: /* invlpg*/ | ||
1730 | emulate_invlpg(ctxt->vcpu, memop); | ||
1731 | break; | ||
1732 | default: | ||
1733 | goto cannot_emulate; | ||
1734 | } | ||
1735 | /* Disable writeback. */ | ||
1736 | c->dst.type = OP_NONE; | ||
1737 | break; | ||
1738 | case 0x06: | ||
1739 | emulate_clts(ctxt->vcpu); | ||
1740 | c->dst.type = OP_NONE; | ||
1741 | break; | ||
1742 | case 0x08: /* invd */ | ||
1743 | case 0x09: /* wbinvd */ | ||
1744 | case 0x0d: /* GrpP (prefetch) */ | ||
1745 | case 0x18: /* Grp16 (prefetch/nop) */ | ||
1746 | c->dst.type = OP_NONE; | ||
1747 | break; | ||
1748 | case 0x20: /* mov cr, reg */ | ||
1749 | if (c->modrm_mod != 3) | ||
1750 | goto cannot_emulate; | ||
1751 | c->regs[c->modrm_rm] = | ||
1752 | realmode_get_cr(ctxt->vcpu, c->modrm_reg); | ||
1753 | c->dst.type = OP_NONE; /* no writeback */ | ||
1754 | break; | ||
1755 | case 0x21: /* mov from dr to reg */ | ||
1756 | if (c->modrm_mod != 3) | ||
1757 | goto cannot_emulate; | ||
1758 | rc = emulator_get_dr(ctxt, c->modrm_reg, &c->regs[c->modrm_rm]); | ||
1759 | if (rc) | ||
1760 | goto cannot_emulate; | ||
1761 | c->dst.type = OP_NONE; /* no writeback */ | ||
1762 | break; | ||
1763 | case 0x22: /* mov reg, cr */ | ||
1764 | if (c->modrm_mod != 3) | ||
1765 | goto cannot_emulate; | ||
1766 | realmode_set_cr(ctxt->vcpu, | ||
1767 | c->modrm_reg, c->modrm_val, &ctxt->eflags); | ||
1768 | c->dst.type = OP_NONE; | ||
1769 | break; | ||
1770 | case 0x23: /* mov from reg to dr */ | ||
1771 | if (c->modrm_mod != 3) | ||
1772 | goto cannot_emulate; | ||
1773 | rc = emulator_set_dr(ctxt, c->modrm_reg, | ||
1774 | c->regs[c->modrm_rm]); | ||
1775 | if (rc) | ||
1776 | goto cannot_emulate; | ||
1777 | c->dst.type = OP_NONE; /* no writeback */ | ||
1778 | break; | ||
1779 | case 0x30: | ||
1780 | /* wrmsr */ | ||
1781 | msr_data = (u32)c->regs[VCPU_REGS_RAX] | ||
1782 | | ((u64)c->regs[VCPU_REGS_RDX] << 32); | ||
1783 | rc = kvm_set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data); | ||
1784 | if (rc) { | ||
1785 | kvm_inject_gp(ctxt->vcpu, 0); | ||
1786 | c->eip = ctxt->vcpu->arch.rip; | ||
1787 | } | ||
1788 | rc = X86EMUL_CONTINUE; | ||
1789 | c->dst.type = OP_NONE; | ||
1790 | break; | ||
1791 | case 0x32: | ||
1792 | /* rdmsr */ | ||
1793 | rc = kvm_get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data); | ||
1794 | if (rc) { | ||
1795 | kvm_inject_gp(ctxt->vcpu, 0); | ||
1796 | c->eip = ctxt->vcpu->arch.rip; | ||
1797 | } else { | ||
1798 | c->regs[VCPU_REGS_RAX] = (u32)msr_data; | ||
1799 | c->regs[VCPU_REGS_RDX] = msr_data >> 32; | ||
1800 | } | ||
1801 | rc = X86EMUL_CONTINUE; | ||
1802 | c->dst.type = OP_NONE; | ||
1803 | break; | ||
1804 | case 0x40 ... 0x4f: /* cmov */ | ||
1805 | c->dst.val = c->dst.orig_val = c->src.val; | ||
1806 | if (!test_cc(c->b, ctxt->eflags)) | ||
1807 | c->dst.type = OP_NONE; /* no writeback */ | ||
1808 | break; | ||
1809 | case 0x80 ... 0x8f: /* jnz rel, etc*/ { | ||
1810 | long int rel; | ||
1811 | |||
1812 | switch (c->op_bytes) { | ||
1813 | case 2: | ||
1814 | rel = insn_fetch(s16, 2, c->eip); | ||
1815 | break; | ||
1816 | case 4: | ||
1817 | rel = insn_fetch(s32, 4, c->eip); | ||
1818 | break; | ||
1819 | case 8: | ||
1820 | rel = insn_fetch(s64, 8, c->eip); | ||
1821 | break; | ||
1822 | default: | ||
1823 | DPRINTF("jnz: Invalid op_bytes\n"); | ||
1824 | goto cannot_emulate; | ||
1825 | } | ||
1826 | if (test_cc(c->b, ctxt->eflags)) | ||
1827 | JMP_REL(rel); | ||
1828 | c->dst.type = OP_NONE; | ||
1829 | break; | ||
1830 | } | ||
1831 | case 0xa3: | ||
1832 | bt: /* bt */ | ||
1833 | c->dst.type = OP_NONE; | ||
1834 | /* only subword offset */ | ||
1835 | c->src.val &= (c->dst.bytes << 3) - 1; | ||
1836 | emulate_2op_SrcV_nobyte("bt", c->src, c->dst, ctxt->eflags); | ||
1837 | break; | ||
1838 | case 0xab: | ||
1839 | bts: /* bts */ | ||
1840 | /* only subword offset */ | ||
1841 | c->src.val &= (c->dst.bytes << 3) - 1; | ||
1842 | emulate_2op_SrcV_nobyte("bts", c->src, c->dst, ctxt->eflags); | ||
1843 | break; | ||
1844 | case 0xb0 ... 0xb1: /* cmpxchg */ | ||
1845 | /* | ||
1846 | * Save real source value, then compare EAX against | ||
1847 | * destination. | ||
1848 | */ | ||
1849 | c->src.orig_val = c->src.val; | ||
1850 | c->src.val = c->regs[VCPU_REGS_RAX]; | ||
1851 | emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags); | ||
1852 | if (ctxt->eflags & EFLG_ZF) { | ||
1853 | /* Success: write back to memory. */ | ||
1854 | c->dst.val = c->src.orig_val; | ||
1855 | } else { | ||
1856 | /* Failure: write the value we saw to EAX. */ | ||
1857 | c->dst.type = OP_REG; | ||
1858 | c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX]; | ||
1859 | } | ||
1860 | break; | ||
1861 | case 0xb3: | ||
1862 | btr: /* btr */ | ||
1863 | /* only subword offset */ | ||
1864 | c->src.val &= (c->dst.bytes << 3) - 1; | ||
1865 | emulate_2op_SrcV_nobyte("btr", c->src, c->dst, ctxt->eflags); | ||
1866 | break; | ||
1867 | case 0xb6 ... 0xb7: /* movzx */ | ||
1868 | c->dst.bytes = c->op_bytes; | ||
1869 | c->dst.val = (c->d & ByteOp) ? (u8) c->src.val | ||
1870 | : (u16) c->src.val; | ||
1871 | break; | ||
1872 | case 0xba: /* Grp8 */ | ||
1873 | switch (c->modrm_reg & 3) { | ||
1874 | case 0: | ||
1875 | goto bt; | ||
1876 | case 1: | ||
1877 | goto bts; | ||
1878 | case 2: | ||
1879 | goto btr; | ||
1880 | case 3: | ||
1881 | goto btc; | ||
1882 | } | ||
1883 | break; | ||
1884 | case 0xbb: | ||
1885 | btc: /* btc */ | ||
1886 | /* only subword offset */ | ||
1887 | c->src.val &= (c->dst.bytes << 3) - 1; | ||
1888 | emulate_2op_SrcV_nobyte("btc", c->src, c->dst, ctxt->eflags); | ||
1889 | break; | ||
1890 | case 0xbe ... 0xbf: /* movsx */ | ||
1891 | c->dst.bytes = c->op_bytes; | ||
1892 | c->dst.val = (c->d & ByteOp) ? (s8) c->src.val : | ||
1893 | (s16) c->src.val; | ||
1894 | break; | ||
1895 | case 0xc3: /* movnti */ | ||
1896 | c->dst.bytes = c->op_bytes; | ||
1897 | c->dst.val = (c->op_bytes == 4) ? (u32) c->src.val : | ||
1898 | (u64) c->src.val; | ||
1899 | break; | ||
1900 | case 0xc7: /* Grp9 (cmpxchg8b) */ | ||
1901 | rc = emulate_grp9(ctxt, ops, memop); | ||
1902 | if (rc != 0) | ||
1903 | goto done; | ||
1904 | c->dst.type = OP_NONE; | ||
1905 | break; | ||
1906 | } | ||
1907 | goto writeback; | ||
1908 | |||
1909 | cannot_emulate: | ||
1910 | DPRINTF("Cannot emulate %02x\n", c->b); | ||
1911 | c->eip = saved_eip; | ||
1912 | return -1; | ||
1913 | } | ||
diff --git a/drivers/kvm/x86_emulate.h b/drivers/kvm/x86_emulate.h deleted file mode 100644 index 7db91b9bdcd4..000000000000 --- a/drivers/kvm/x86_emulate.h +++ /dev/null | |||
@@ -1,186 +0,0 @@ | |||
1 | /****************************************************************************** | ||
2 | * x86_emulate.h | ||
3 | * | ||
4 | * Generic x86 (32-bit and 64-bit) instruction decoder and emulator. | ||
5 | * | ||
6 | * Copyright (c) 2005 Keir Fraser | ||
7 | * | ||
8 | * From: xen-unstable 10676:af9809f51f81a3c43f276f00c81a52ef558afda4 | ||
9 | */ | ||
10 | |||
11 | #ifndef __X86_EMULATE_H__ | ||
12 | #define __X86_EMULATE_H__ | ||
13 | |||
14 | struct x86_emulate_ctxt; | ||
15 | |||
16 | /* | ||
17 | * x86_emulate_ops: | ||
18 | * | ||
19 | * These operations represent the instruction emulator's interface to memory. | ||
20 | * There are two categories of operation: those that act on ordinary memory | ||
21 | * regions (*_std), and those that act on memory regions known to require | ||
22 | * special treatment or emulation (*_emulated). | ||
23 | * | ||
24 | * The emulator assumes that an instruction accesses only one 'emulated memory' | ||
25 | * location, that this location is the given linear faulting address (cr2), and | ||
26 | * that this is one of the instruction's data operands. Instruction fetches and | ||
27 | * stack operations are assumed never to access emulated memory. The emulator | ||
28 | * automatically deduces which operand of a string-move operation is accessing | ||
29 | * emulated memory, and assumes that the other operand accesses normal memory. | ||
30 | * | ||
31 | * NOTES: | ||
32 | * 1. The emulator isn't very smart about emulated vs. standard memory. | ||
33 | * 'Emulated memory' access addresses should be checked for sanity. | ||
34 | * 'Normal memory' accesses may fault, and the caller must arrange to | ||
35 | * detect and handle reentrancy into the emulator via recursive faults. | ||
36 | * Accesses may be unaligned and may cross page boundaries. | ||
37 | * 2. If the access fails (cannot emulate, or a standard access faults) then | ||
38 | * it is up to the memop to propagate the fault to the guest VM via | ||
39 | * some out-of-band mechanism, unknown to the emulator. The memop signals | ||
40 | * failure by returning X86EMUL_PROPAGATE_FAULT to the emulator, which will | ||
41 | * then immediately bail. | ||
42 | * 3. Valid access sizes are 1, 2, 4 and 8 bytes. On x86/32 systems only | ||
43 | * cmpxchg8b_emulated need support 8-byte accesses. | ||
44 | * 4. The emulator cannot handle 64-bit mode emulation on an x86/32 system. | ||
45 | */ | ||
46 | /* Access completed successfully: continue emulation as normal. */ | ||
47 | #define X86EMUL_CONTINUE 0 | ||
48 | /* Access is unhandleable: bail from emulation and return error to caller. */ | ||
49 | #define X86EMUL_UNHANDLEABLE 1 | ||
50 | /* Terminate emulation but return success to the caller. */ | ||
51 | #define X86EMUL_PROPAGATE_FAULT 2 /* propagate a generated fault to guest */ | ||
52 | #define X86EMUL_RETRY_INSTR 2 /* retry the instruction for some reason */ | ||
53 | #define X86EMUL_CMPXCHG_FAILED 2 /* cmpxchg did not see expected value */ | ||
54 | struct x86_emulate_ops { | ||
55 | /* | ||
56 | * read_std: Read bytes of standard (non-emulated/special) memory. | ||
57 | * Used for instruction fetch, stack operations, and others. | ||
58 | * @addr: [IN ] Linear address from which to read. | ||
59 | * @val: [OUT] Value read from memory, zero-extended to 'u_long'. | ||
60 | * @bytes: [IN ] Number of bytes to read from memory. | ||
61 | */ | ||
62 | int (*read_std)(unsigned long addr, void *val, | ||
63 | unsigned int bytes, struct kvm_vcpu *vcpu); | ||
64 | |||
65 | /* | ||
66 | * read_emulated: Read bytes from emulated/special memory area. | ||
67 | * @addr: [IN ] Linear address from which to read. | ||
68 | * @val: [OUT] Value read from memory, zero-extended to 'u_long'. | ||
69 | * @bytes: [IN ] Number of bytes to read from memory. | ||
70 | */ | ||
71 | int (*read_emulated) (unsigned long addr, | ||
72 | void *val, | ||
73 | unsigned int bytes, | ||
74 | struct kvm_vcpu *vcpu); | ||
75 | |||
76 | /* | ||
77 | * write_emulated: Write bytes to emulated/special memory area. | ||
78 | * @addr: [IN ] Linear address to which to write. | ||
79 | * @val: [IN ] Value to write to memory (low-order bytes used as | ||
80 | * required). | ||
81 | * @bytes: [IN ] Number of bytes to write to memory. | ||
82 | */ | ||
83 | int (*write_emulated) (unsigned long addr, | ||
84 | const void *val, | ||
85 | unsigned int bytes, | ||
86 | struct kvm_vcpu *vcpu); | ||
87 | |||
88 | /* | ||
89 | * cmpxchg_emulated: Emulate an atomic (LOCKed) CMPXCHG operation on an | ||
90 | * emulated/special memory area. | ||
91 | * @addr: [IN ] Linear address to access. | ||
92 | * @old: [IN ] Value expected to be current at @addr. | ||
93 | * @new: [IN ] Value to write to @addr. | ||
94 | * @bytes: [IN ] Number of bytes to access using CMPXCHG. | ||
95 | */ | ||
96 | int (*cmpxchg_emulated) (unsigned long addr, | ||
97 | const void *old, | ||
98 | const void *new, | ||
99 | unsigned int bytes, | ||
100 | struct kvm_vcpu *vcpu); | ||
101 | |||
102 | }; | ||
103 | |||
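As a rough illustration of how this interface is meant to be filled in, a minimal host-side stub could look like the sketch below (hypothetical and heavily simplified, not part of KVM: the flat guest_mem array stands in for real guest memory translation, and the surrounding kvm headers are assumed for struct kvm_vcpu and the X86EMUL_* codes).

    #include <stdint.h>
    #include <string.h>

    static uint8_t guest_mem[0x10000];         /* hypothetical flat guest memory */

    static int stub_read(unsigned long addr, void *val,
                         unsigned int bytes, struct kvm_vcpu *vcpu)
    {
            memcpy(val, &guest_mem[addr], bytes);
            return X86EMUL_CONTINUE;
    }

    static int stub_write(unsigned long addr, const void *val,
                          unsigned int bytes, struct kvm_vcpu *vcpu)
    {
            memcpy(&guest_mem[addr], val, bytes);
            return X86EMUL_CONTINUE;
    }

    static struct x86_emulate_ops stub_ops = {
            .read_std       = stub_read,
            .read_emulated  = stub_read,       /* same backing store in this toy */
            .write_emulated = stub_write,
            /* cmpxchg_emulated omitted for brevity */
    };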
104 | /* Type, address-of, and value of an instruction's operand. */ | ||
105 | struct operand { | ||
106 | enum { OP_REG, OP_MEM, OP_IMM, OP_NONE } type; | ||
107 | unsigned int bytes; | ||
108 | unsigned long val, orig_val, *ptr; | ||
109 | }; | ||
110 | |||
111 | struct fetch_cache { | ||
112 | u8 data[15]; | ||
113 | unsigned long start; | ||
114 | unsigned long end; | ||
115 | }; | ||
116 | |||
117 | struct decode_cache { | ||
118 | u8 twobyte; | ||
119 | u8 b; | ||
120 | u8 lock_prefix; | ||
121 | u8 rep_prefix; | ||
122 | u8 op_bytes; | ||
123 | u8 ad_bytes; | ||
124 | u8 rex_prefix; | ||
125 | struct operand src; | ||
126 | struct operand dst; | ||
127 | unsigned long *override_base; | ||
128 | unsigned int d; | ||
129 | unsigned long regs[NR_VCPU_REGS]; | ||
130 | unsigned long eip; | ||
131 | /* modrm */ | ||
132 | u8 modrm; | ||
133 | u8 modrm_mod; | ||
134 | u8 modrm_reg; | ||
135 | u8 modrm_rm; | ||
136 | u8 use_modrm_ea; | ||
137 | unsigned long modrm_ea; | ||
138 | unsigned long modrm_val; | ||
139 | struct fetch_cache fetch; | ||
140 | }; | ||
141 | |||
142 | struct x86_emulate_ctxt { | ||
143 | /* Register state before/after emulation. */ | ||
144 | struct kvm_vcpu *vcpu; | ||
145 | |||
146 | /* EFLAGS value before/after emulation. */ | ||
147 | unsigned long eflags; | ||
148 | |||
149 | /* Emulated execution mode, represented by an X86EMUL_MODE value. */ | ||
150 | int mode; | ||
151 | |||
152 | unsigned long cs_base; | ||
153 | unsigned long ds_base; | ||
154 | unsigned long es_base; | ||
155 | unsigned long ss_base; | ||
156 | unsigned long gs_base; | ||
157 | unsigned long fs_base; | ||
158 | |||
159 | /* decode cache */ | ||
160 | |||
161 | struct decode_cache decode; | ||
162 | }; | ||
163 | |||
164 | /* Repeat String Operation Prefix */ | ||
165 | #define REPE_PREFIX 1 | ||
166 | #define REPNE_PREFIX 2 | ||
167 | |||
168 | /* Execution mode, passed to the emulator. */ | ||
169 | #define X86EMUL_MODE_REAL 0 /* Real mode. */ | ||
170 | #define X86EMUL_MODE_PROT16 2 /* 16-bit protected mode. */ | ||
171 | #define X86EMUL_MODE_PROT32 4 /* 32-bit protected mode. */ | ||
172 | #define X86EMUL_MODE_PROT64 8 /* 64-bit (long) mode. */ | ||
173 | |||
174 | /* Host execution mode. */ | ||
175 | #if defined(__i386__) | ||
176 | #define X86EMUL_MODE_HOST X86EMUL_MODE_PROT32 | ||
177 | #elif defined(CONFIG_X86_64) | ||
178 | #define X86EMUL_MODE_HOST X86EMUL_MODE_PROT64 | ||
179 | #endif | ||
180 | |||
181 | int x86_decode_insn(struct x86_emulate_ctxt *ctxt, | ||
182 | struct x86_emulate_ops *ops); | ||
183 | int x86_emulate_insn(struct x86_emulate_ctxt *ctxt, | ||
184 | struct x86_emulate_ops *ops); | ||
185 | |||
186 | #endif /* __X86_EMULATE_H__ */ | ||
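For orientation, a caller is expected to drive the two entry points in sequence, roughly as in the hedged sketch below (the ctxt set-up is elided, stub_ops is the hypothetical ops table sketched earlier, and emulate_fail() is a hypothetical error path, not a KVM symbol):

    struct x86_emulate_ctxt ctxt;

    /* ... fill in ctxt.vcpu, ctxt.eflags, ctxt.mode and the segment bases ... */

    if (x86_decode_insn(&ctxt, &stub_ops) < 0)
            return emulate_fail();
    if (x86_emulate_insn(&ctxt, &stub_ops) < 0)
            return emulate_fail();
    /* on success the shadow registers and rip have been committed to the vcpu */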