diff options
Diffstat (limited to 'arch/x86')
-rw-r--r-- | arch/x86/Kconfig | 2 | ||||
-rw-r--r-- | arch/x86/Makefile | 2 | ||||
-rw-r--r-- | arch/x86/kvm/Kconfig | 57 | ||||
-rw-r--r-- | arch/x86/kvm/Makefile | 15 | ||||
-rw-r--r-- | arch/x86/kvm/i8259.c | 450 | ||||
-rw-r--r-- | arch/x86/kvm/ioapic.c | 400 | ||||
-rw-r--r-- | arch/x86/kvm/irq.c | 98 | ||||
-rw-r--r-- | arch/x86/kvm/irq.h | 195 | ||||
-rw-r--r-- | arch/x86/kvm/kvm_svm.h | 45 | ||||
-rw-r--r-- | arch/x86/kvm/lapic.c | 1085 | ||||
-rw-r--r-- | arch/x86/kvm/mmu.c | 1805 | ||||
-rw-r--r-- | arch/x86/kvm/mmu.h | 44 | ||||
-rw-r--r-- | arch/x86/kvm/paging_tmpl.h | 461 | ||||
-rw-r--r-- | arch/x86/kvm/segment_descriptor.h | 29 | ||||
-rw-r--r-- | arch/x86/kvm/svm.c | 1725 | ||||
-rw-r--r-- | arch/x86/kvm/svm.h | 325 | ||||
-rw-r--r-- | arch/x86/kvm/vmx.c | 2671 | ||||
-rw-r--r-- | arch/x86/kvm/vmx.h | 324 | ||||
-rw-r--r-- | arch/x86/kvm/x86.c | 3146 | ||||
-rw-r--r-- | arch/x86/kvm/x86_emulate.c | 1912 |
20 files changed, 14791 insertions, 0 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index d289cfcf92c4..65b449134cf7 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig | |||
@@ -1599,4 +1599,6 @@ source "security/Kconfig" | |||
1599 | 1599 | ||
1600 | source "crypto/Kconfig" | 1600 | source "crypto/Kconfig" |
1601 | 1601 | ||
1602 | source "arch/x86/kvm/Kconfig" | ||
1603 | |||
1602 | source "lib/Kconfig" | 1604 | source "lib/Kconfig" |
diff --git a/arch/x86/Makefile b/arch/x86/Makefile index b08f18261df6..da8f4129780b 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile | |||
@@ -7,6 +7,8 @@ else | |||
7 | KBUILD_DEFCONFIG := $(ARCH)_defconfig | 7 | KBUILD_DEFCONFIG := $(ARCH)_defconfig |
8 | endif | 8 | endif |
9 | 9 | ||
10 | core-$(CONFIG_KVM) += arch/x86/kvm/ | ||
11 | |||
10 | # BITS is used as extension for files which are available in a 32 bit | 12 | # BITS is used as extension for files which are available in a 32 bit |
11 | # and a 64 bit version to simplify shared Makefiles. | 13 | # and a 64 bit version to simplify shared Makefiles. |
12 | # e.g.: obj-y += foo_$(BITS).o | 14 | # e.g.: obj-y += foo_$(BITS).o |
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig new file mode 100644 index 000000000000..c83e1c9b5129 --- /dev/null +++ b/arch/x86/kvm/Kconfig | |||
@@ -0,0 +1,57 @@ | |||
1 | # | ||
2 | # KVM configuration | ||
3 | # | ||
4 | config HAVE_KVM | ||
5 | bool | ||
6 | |||
7 | menuconfig VIRTUALIZATION | ||
8 | bool "Virtualization" | ||
9 | depends on HAVE_KVM || X86 | ||
10 | default y | ||
11 | ---help--- | ||
12 | Say Y here to get to see options for using your Linux host to run other | ||
13 | operating systems inside virtual machines (guests). | ||
14 | This option alone does not add any kernel code. | ||
15 | |||
16 | If you say N, all options in this submenu will be skipped and disabled. | ||
17 | |||
18 | if VIRTUALIZATION | ||
19 | |||
20 | config KVM | ||
21 | tristate "Kernel-based Virtual Machine (KVM) support" | ||
22 | depends on HAVE_KVM && EXPERIMENTAL | ||
23 | select PREEMPT_NOTIFIERS | ||
24 | select ANON_INODES | ||
25 | ---help--- | ||
26 | Support hosting fully virtualized guest machines using hardware | ||
27 | virtualization extensions. You will need a fairly recent | ||
28 | processor equipped with virtualization extensions. You will also | ||
29 | need to select one or more of the processor modules below. | ||
30 | |||
31 | This module provides access to the hardware capabilities through | ||
32 | a character device node named /dev/kvm. | ||
33 | |||
34 | To compile this as a module, choose M here: the module | ||
35 | will be called kvm. | ||
36 | |||
37 | If unsure, say N. | ||
38 | |||
39 | config KVM_INTEL | ||
40 | tristate "KVM for Intel processors support" | ||
41 | depends on KVM | ||
42 | ---help--- | ||
43 | Provides support for KVM on Intel processors equipped with the VT | ||
44 | extensions. | ||
45 | |||
46 | config KVM_AMD | ||
47 | tristate "KVM for AMD processors support" | ||
48 | depends on KVM | ||
49 | ---help--- | ||
50 | Provides support for KVM on AMD processors equipped with the AMD-V | ||
51 | (SVM) extensions. | ||
52 | |||
53 | # OK, it's a little counter-intuitive to do this, but it puts it neatly under | ||
54 | # the virtualization menu. | ||
55 | source drivers/lguest/Kconfig | ||
56 | |||
57 | endif # VIRTUALIZATION | ||
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile new file mode 100644 index 000000000000..880ffe403b35 --- /dev/null +++ b/arch/x86/kvm/Makefile | |||
@@ -0,0 +1,15 @@ | |||
1 | # | ||
2 | # Makefile for Kernel-based Virtual Machine module | ||
3 | # | ||
4 | |||
5 | common-objs = $(addprefix ../../../drivers/kvm/, kvm_main.o) | ||
6 | |||
7 | EXTRA_CFLAGS += -I drivers/kvm | ||
8 | |||
9 | kvm-objs := $(common-objs) x86.o mmu.o x86_emulate.o i8259.o irq.o lapic.o \ | ||
10 | ioapic.o | ||
11 | obj-$(CONFIG_KVM) += kvm.o | ||
12 | kvm-intel-objs = vmx.o | ||
13 | obj-$(CONFIG_KVM_INTEL) += kvm-intel.o | ||
14 | kvm-amd-objs = svm.o | ||
15 | obj-$(CONFIG_KVM_AMD) += kvm-amd.o | ||
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c new file mode 100644 index 000000000000..ab29cf2def47 --- /dev/null +++ b/arch/x86/kvm/i8259.c | |||
@@ -0,0 +1,450 @@ | |||
1 | /* | ||
2 | * 8259 interrupt controller emulation | ||
3 | * | ||
4 | * Copyright (c) 2003-2004 Fabrice Bellard | ||
5 | * Copyright (c) 2007 Intel Corporation | ||
6 | * | ||
7 | * Permission is hereby granted, free of charge, to any person obtaining a copy | ||
8 | * of this software and associated documentation files (the "Software"), to deal | ||
9 | * in the Software without restriction, including without limitation the rights | ||
10 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | ||
11 | * copies of the Software, and to permit persons to whom the Software is | ||
12 | * furnished to do so, subject to the following conditions: | ||
13 | * | ||
14 | * The above copyright notice and this permission notice shall be included in | ||
15 | * all copies or substantial portions of the Software. | ||
16 | * | ||
17 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
18 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
19 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL | ||
20 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
21 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | ||
22 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN | ||
23 | * THE SOFTWARE. | ||
24 | * Authors: | ||
25 | * Yaozu (Eddie) Dong <Eddie.dong@intel.com> | ||
26 | * Port from Qemu. | ||
27 | */ | ||
28 | #include <linux/mm.h> | ||
29 | #include "irq.h" | ||
30 | |||
31 | #include <linux/kvm_host.h> | ||
32 | |||
33 | /* | ||
34 | * set irq level. If an edge is detected, then the IRR is set to 1 | ||
35 | */ | ||
36 | static inline void pic_set_irq1(struct kvm_kpic_state *s, int irq, int level) | ||
37 | { | ||
38 | int mask; | ||
39 | mask = 1 << irq; | ||
40 | if (s->elcr & mask) /* level triggered */ | ||
41 | if (level) { | ||
42 | s->irr |= mask; | ||
43 | s->last_irr |= mask; | ||
44 | } else { | ||
45 | s->irr &= ~mask; | ||
46 | s->last_irr &= ~mask; | ||
47 | } | ||
48 | else /* edge triggered */ | ||
49 | if (level) { | ||
50 | if ((s->last_irr & mask) == 0) | ||
51 | s->irr |= mask; | ||
52 | s->last_irr |= mask; | ||
53 | } else | ||
54 | s->last_irr &= ~mask; | ||
55 | } | ||
56 | |||
57 | /* | ||
58 | * return the highest priority found in mask (highest = smallest | ||
59 | * number). Return 8 if no irq | ||
60 | */ | ||
61 | static inline int get_priority(struct kvm_kpic_state *s, int mask) | ||
62 | { | ||
63 | int priority; | ||
64 | if (mask == 0) | ||
65 | return 8; | ||
66 | priority = 0; | ||
67 | while ((mask & (1 << ((priority + s->priority_add) & 7))) == 0) | ||
68 | priority++; | ||
69 | return priority; | ||
70 | } | ||
71 | |||
72 | /* | ||
73 | * return the pic wanted interrupt. return -1 if none | ||
74 | */ | ||
75 | static int pic_get_irq(struct kvm_kpic_state *s) | ||
76 | { | ||
77 | int mask, cur_priority, priority; | ||
78 | |||
79 | mask = s->irr & ~s->imr; | ||
80 | priority = get_priority(s, mask); | ||
81 | if (priority == 8) | ||
82 | return -1; | ||
83 | /* | ||
84 | * compute current priority. If special fully nested mode on the | ||
85 | * master, the IRQ coming from the slave is not taken into account | ||
86 | * for the priority computation. | ||
87 | */ | ||
88 | mask = s->isr; | ||
89 | if (s->special_fully_nested_mode && s == &s->pics_state->pics[0]) | ||
90 | mask &= ~(1 << 2); | ||
91 | cur_priority = get_priority(s, mask); | ||
92 | if (priority < cur_priority) | ||
93 | /* | ||
94 | * higher priority found: an irq should be generated | ||
95 | */ | ||
96 | return (priority + s->priority_add) & 7; | ||
97 | else | ||
98 | return -1; | ||
99 | } | ||
100 | |||
101 | /* | ||
102 | * raise irq to CPU if necessary. must be called every time the active | ||
103 | * irq may change | ||
104 | */ | ||
105 | static void pic_update_irq(struct kvm_pic *s) | ||
106 | { | ||
107 | int irq2, irq; | ||
108 | |||
109 | irq2 = pic_get_irq(&s->pics[1]); | ||
110 | if (irq2 >= 0) { | ||
111 | /* | ||
112 | * if irq request by slave pic, signal master PIC | ||
113 | */ | ||
114 | pic_set_irq1(&s->pics[0], 2, 1); | ||
115 | pic_set_irq1(&s->pics[0], 2, 0); | ||
116 | } | ||
117 | irq = pic_get_irq(&s->pics[0]); | ||
118 | if (irq >= 0) | ||
119 | s->irq_request(s->irq_request_opaque, 1); | ||
120 | else | ||
121 | s->irq_request(s->irq_request_opaque, 0); | ||
122 | } | ||
123 | |||
124 | void kvm_pic_update_irq(struct kvm_pic *s) | ||
125 | { | ||
126 | pic_update_irq(s); | ||
127 | } | ||
128 | |||
129 | void kvm_pic_set_irq(void *opaque, int irq, int level) | ||
130 | { | ||
131 | struct kvm_pic *s = opaque; | ||
132 | |||
133 | pic_set_irq1(&s->pics[irq >> 3], irq & 7, level); | ||
134 | pic_update_irq(s); | ||
135 | } | ||
136 | |||
137 | /* | ||
138 | * acknowledge interrupt 'irq' | ||
139 | */ | ||
140 | static inline void pic_intack(struct kvm_kpic_state *s, int irq) | ||
141 | { | ||
142 | if (s->auto_eoi) { | ||
143 | if (s->rotate_on_auto_eoi) | ||
144 | s->priority_add = (irq + 1) & 7; | ||
145 | } else | ||
146 | s->isr |= (1 << irq); | ||
147 | /* | ||
148 | * We don't clear a level sensitive interrupt here | ||
149 | */ | ||
150 | if (!(s->elcr & (1 << irq))) | ||
151 | s->irr &= ~(1 << irq); | ||
152 | } | ||
153 | |||
154 | int kvm_pic_read_irq(struct kvm_pic *s) | ||
155 | { | ||
156 | int irq, irq2, intno; | ||
157 | |||
158 | irq = pic_get_irq(&s->pics[0]); | ||
159 | if (irq >= 0) { | ||
160 | pic_intack(&s->pics[0], irq); | ||
161 | if (irq == 2) { | ||
162 | irq2 = pic_get_irq(&s->pics[1]); | ||
163 | if (irq2 >= 0) | ||
164 | pic_intack(&s->pics[1], irq2); | ||
165 | else | ||
166 | /* | ||
167 | * spurious IRQ on slave controller | ||
168 | */ | ||
169 | irq2 = 7; | ||
170 | intno = s->pics[1].irq_base + irq2; | ||
171 | irq = irq2 + 8; | ||
172 | } else | ||
173 | intno = s->pics[0].irq_base + irq; | ||
174 | } else { | ||
175 | /* | ||
176 | * spurious IRQ on host controller | ||
177 | */ | ||
178 | irq = 7; | ||
179 | intno = s->pics[0].irq_base + irq; | ||
180 | } | ||
181 | pic_update_irq(s); | ||
182 | |||
183 | return intno; | ||
184 | } | ||
185 | |||
186 | void kvm_pic_reset(struct kvm_kpic_state *s) | ||
187 | { | ||
188 | s->last_irr = 0; | ||
189 | s->irr = 0; | ||
190 | s->imr = 0; | ||
191 | s->isr = 0; | ||
192 | s->priority_add = 0; | ||
193 | s->irq_base = 0; | ||
194 | s->read_reg_select = 0; | ||
195 | s->poll = 0; | ||
196 | s->special_mask = 0; | ||
197 | s->init_state = 0; | ||
198 | s->auto_eoi = 0; | ||
199 | s->rotate_on_auto_eoi = 0; | ||
200 | s->special_fully_nested_mode = 0; | ||
201 | s->init4 = 0; | ||
202 | } | ||
203 | |||
204 | static void pic_ioport_write(void *opaque, u32 addr, u32 val) | ||
205 | { | ||
206 | struct kvm_kpic_state *s = opaque; | ||
207 | int priority, cmd, irq; | ||
208 | |||
209 | addr &= 1; | ||
210 | if (addr == 0) { | ||
211 | if (val & 0x10) { | ||
212 | kvm_pic_reset(s); /* init */ | ||
213 | /* | ||
214 | * deassert a pending interrupt | ||
215 | */ | ||
216 | s->pics_state->irq_request(s->pics_state-> | ||
217 | irq_request_opaque, 0); | ||
218 | s->init_state = 1; | ||
219 | s->init4 = val & 1; | ||
220 | if (val & 0x02) | ||
221 | printk(KERN_ERR "single mode not supported"); | ||
222 | if (val & 0x08) | ||
223 | printk(KERN_ERR | ||
224 | "level sensitive irq not supported"); | ||
225 | } else if (val & 0x08) { | ||
226 | if (val & 0x04) | ||
227 | s->poll = 1; | ||
228 | if (val & 0x02) | ||
229 | s->read_reg_select = val & 1; | ||
230 | if (val & 0x40) | ||
231 | s->special_mask = (val >> 5) & 1; | ||
232 | } else { | ||
233 | cmd = val >> 5; | ||
234 | switch (cmd) { | ||
235 | case 0: | ||
236 | case 4: | ||
237 | s->rotate_on_auto_eoi = cmd >> 2; | ||
238 | break; | ||
239 | case 1: /* end of interrupt */ | ||
240 | case 5: | ||
241 | priority = get_priority(s, s->isr); | ||
242 | if (priority != 8) { | ||
243 | irq = (priority + s->priority_add) & 7; | ||
244 | s->isr &= ~(1 << irq); | ||
245 | if (cmd == 5) | ||
246 | s->priority_add = (irq + 1) & 7; | ||
247 | pic_update_irq(s->pics_state); | ||
248 | } | ||
249 | break; | ||
250 | case 3: | ||
251 | irq = val & 7; | ||
252 | s->isr &= ~(1 << irq); | ||
253 | pic_update_irq(s->pics_state); | ||
254 | break; | ||
255 | case 6: | ||
256 | s->priority_add = (val + 1) & 7; | ||
257 | pic_update_irq(s->pics_state); | ||
258 | break; | ||
259 | case 7: | ||
260 | irq = val & 7; | ||
261 | s->isr &= ~(1 << irq); | ||
262 | s->priority_add = (irq + 1) & 7; | ||
263 | pic_update_irq(s->pics_state); | ||
264 | break; | ||
265 | default: | ||
266 | break; /* no operation */ | ||
267 | } | ||
268 | } | ||
269 | } else | ||
270 | switch (s->init_state) { | ||
271 | case 0: /* normal mode */ | ||
272 | s->imr = val; | ||
273 | pic_update_irq(s->pics_state); | ||
274 | break; | ||
275 | case 1: | ||
276 | s->irq_base = val & 0xf8; | ||
277 | s->init_state = 2; | ||
278 | break; | ||
279 | case 2: | ||
280 | if (s->init4) | ||
281 | s->init_state = 3; | ||
282 | else | ||
283 | s->init_state = 0; | ||
284 | break; | ||
285 | case 3: | ||
286 | s->special_fully_nested_mode = (val >> 4) & 1; | ||
287 | s->auto_eoi = (val >> 1) & 1; | ||
288 | s->init_state = 0; | ||
289 | break; | ||
290 | } | ||
291 | } | ||
292 | |||
293 | static u32 pic_poll_read(struct kvm_kpic_state *s, u32 addr1) | ||
294 | { | ||
295 | int ret; | ||
296 | |||
297 | ret = pic_get_irq(s); | ||
298 | if (ret >= 0) { | ||
299 | if (addr1 >> 7) { | ||
300 | s->pics_state->pics[0].isr &= ~(1 << 2); | ||
301 | s->pics_state->pics[0].irr &= ~(1 << 2); | ||
302 | } | ||
303 | s->irr &= ~(1 << ret); | ||
304 | s->isr &= ~(1 << ret); | ||
305 | if (addr1 >> 7 || ret != 2) | ||
306 | pic_update_irq(s->pics_state); | ||
307 | } else { | ||
308 | ret = 0x07; | ||
309 | pic_update_irq(s->pics_state); | ||
310 | } | ||
311 | |||
312 | return ret; | ||
313 | } | ||
314 | |||
315 | static u32 pic_ioport_read(void *opaque, u32 addr1) | ||
316 | { | ||
317 | struct kvm_kpic_state *s = opaque; | ||
318 | unsigned int addr; | ||
319 | int ret; | ||
320 | |||
321 | addr = addr1; | ||
322 | addr &= 1; | ||
323 | if (s->poll) { | ||
324 | ret = pic_poll_read(s, addr1); | ||
325 | s->poll = 0; | ||
326 | } else | ||
327 | if (addr == 0) | ||
328 | if (s->read_reg_select) | ||
329 | ret = s->isr; | ||
330 | else | ||
331 | ret = s->irr; | ||
332 | else | ||
333 | ret = s->imr; | ||
334 | return ret; | ||
335 | } | ||
336 | |||
337 | static void elcr_ioport_write(void *opaque, u32 addr, u32 val) | ||
338 | { | ||
339 | struct kvm_kpic_state *s = opaque; | ||
340 | s->elcr = val & s->elcr_mask; | ||
341 | } | ||
342 | |||
343 | static u32 elcr_ioport_read(void *opaque, u32 addr1) | ||
344 | { | ||
345 | struct kvm_kpic_state *s = opaque; | ||
346 | return s->elcr; | ||
347 | } | ||
348 | |||
349 | static int picdev_in_range(struct kvm_io_device *this, gpa_t addr) | ||
350 | { | ||
351 | switch (addr) { | ||
352 | case 0x20: | ||
353 | case 0x21: | ||
354 | case 0xa0: | ||
355 | case 0xa1: | ||
356 | case 0x4d0: | ||
357 | case 0x4d1: | ||
358 | return 1; | ||
359 | default: | ||
360 | return 0; | ||
361 | } | ||
362 | } | ||
363 | |||
364 | static void picdev_write(struct kvm_io_device *this, | ||
365 | gpa_t addr, int len, const void *val) | ||
366 | { | ||
367 | struct kvm_pic *s = this->private; | ||
368 | unsigned char data = *(unsigned char *)val; | ||
369 | |||
370 | if (len != 1) { | ||
371 | if (printk_ratelimit()) | ||
372 | printk(KERN_ERR "PIC: non byte write\n"); | ||
373 | return; | ||
374 | } | ||
375 | switch (addr) { | ||
376 | case 0x20: | ||
377 | case 0x21: | ||
378 | case 0xa0: | ||
379 | case 0xa1: | ||
380 | pic_ioport_write(&s->pics[addr >> 7], addr, data); | ||
381 | break; | ||
382 | case 0x4d0: | ||
383 | case 0x4d1: | ||
384 | elcr_ioport_write(&s->pics[addr & 1], addr, data); | ||
385 | break; | ||
386 | } | ||
387 | } | ||
388 | |||
389 | static void picdev_read(struct kvm_io_device *this, | ||
390 | gpa_t addr, int len, void *val) | ||
391 | { | ||
392 | struct kvm_pic *s = this->private; | ||
393 | unsigned char data = 0; | ||
394 | |||
395 | if (len != 1) { | ||
396 | if (printk_ratelimit()) | ||
397 | printk(KERN_ERR "PIC: non byte read\n"); | ||
398 | return; | ||
399 | } | ||
400 | switch (addr) { | ||
401 | case 0x20: | ||
402 | case 0x21: | ||
403 | case 0xa0: | ||
404 | case 0xa1: | ||
405 | data = pic_ioport_read(&s->pics[addr >> 7], addr); | ||
406 | break; | ||
407 | case 0x4d0: | ||
408 | case 0x4d1: | ||
409 | data = elcr_ioport_read(&s->pics[addr & 1], addr); | ||
410 | break; | ||
411 | } | ||
412 | *(unsigned char *)val = data; | ||
413 | } | ||
414 | |||
415 | /* | ||
416 | * callback when PIC0 irq status changed | ||
417 | */ | ||
418 | static void pic_irq_request(void *opaque, int level) | ||
419 | { | ||
420 | struct kvm *kvm = opaque; | ||
421 | struct kvm_vcpu *vcpu = kvm->vcpus[0]; | ||
422 | |||
423 | pic_irqchip(kvm)->output = level; | ||
424 | if (vcpu) | ||
425 | kvm_vcpu_kick(vcpu); | ||
426 | } | ||
427 | |||
428 | struct kvm_pic *kvm_create_pic(struct kvm *kvm) | ||
429 | { | ||
430 | struct kvm_pic *s; | ||
431 | s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL); | ||
432 | if (!s) | ||
433 | return NULL; | ||
434 | s->pics[0].elcr_mask = 0xf8; | ||
435 | s->pics[1].elcr_mask = 0xde; | ||
436 | s->irq_request = pic_irq_request; | ||
437 | s->irq_request_opaque = kvm; | ||
438 | s->pics[0].pics_state = s; | ||
439 | s->pics[1].pics_state = s; | ||
440 | |||
441 | /* | ||
442 | * Initialize PIO device | ||
443 | */ | ||
444 | s->dev.read = picdev_read; | ||
445 | s->dev.write = picdev_write; | ||
446 | s->dev.in_range = picdev_in_range; | ||
447 | s->dev.private = s; | ||
448 | kvm_io_bus_register_dev(&kvm->pio_bus, &s->dev); | ||
449 | return s; | ||
450 | } | ||
diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c new file mode 100644 index 000000000000..72f12f75495d --- /dev/null +++ b/arch/x86/kvm/ioapic.c | |||
@@ -0,0 +1,400 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2001 MandrakeSoft S.A. | ||
3 | * | ||
4 | * MandrakeSoft S.A. | ||
5 | * 43, rue d'Aboukir | ||
6 | * 75002 Paris - France | ||
7 | * http://www.linux-mandrake.com/ | ||
8 | * http://www.mandrakesoft.com/ | ||
9 | * | ||
10 | * This library is free software; you can redistribute it and/or | ||
11 | * modify it under the terms of the GNU Lesser General Public | ||
12 | * License as published by the Free Software Foundation; either | ||
13 | * version 2 of the License, or (at your option) any later version. | ||
14 | * | ||
15 | * This library is distributed in the hope that it will be useful, | ||
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
18 | * Lesser General Public License for more details. | ||
19 | * | ||
20 | * You should have received a copy of the GNU Lesser General Public | ||
21 | * License along with this library; if not, write to the Free Software | ||
22 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
23 | * | ||
24 | * Yunhong Jiang <yunhong.jiang@intel.com> | ||
25 | * Yaozu (Eddie) Dong <eddie.dong@intel.com> | ||
26 | * Based on Xen 3.1 code. | ||
27 | */ | ||
28 | |||
29 | #include <linux/kvm_host.h> | ||
30 | #include <linux/kvm.h> | ||
31 | #include <linux/mm.h> | ||
32 | #include <linux/highmem.h> | ||
33 | #include <linux/smp.h> | ||
34 | #include <linux/hrtimer.h> | ||
35 | #include <linux/io.h> | ||
36 | #include <asm/processor.h> | ||
37 | #include <asm/page.h> | ||
38 | #include <asm/current.h> | ||
39 | #include "irq.h" | ||
40 | #if 0 | ||
41 | #define ioapic_debug(fmt,arg...) printk(KERN_WARNING fmt,##arg) | ||
42 | #else | ||
43 | #define ioapic_debug(fmt, arg...) | ||
44 | #endif | ||
45 | static void ioapic_deliver(struct kvm_ioapic *vioapic, int irq); | ||
46 | |||
47 | static unsigned long ioapic_read_indirect(struct kvm_ioapic *ioapic, | ||
48 | unsigned long addr, | ||
49 | unsigned long length) | ||
50 | { | ||
51 | unsigned long result = 0; | ||
52 | |||
53 | switch (ioapic->ioregsel) { | ||
54 | case IOAPIC_REG_VERSION: | ||
55 | result = ((((IOAPIC_NUM_PINS - 1) & 0xff) << 16) | ||
56 | | (IOAPIC_VERSION_ID & 0xff)); | ||
57 | break; | ||
58 | |||
59 | case IOAPIC_REG_APIC_ID: | ||
60 | case IOAPIC_REG_ARB_ID: | ||
61 | result = ((ioapic->id & 0xf) << 24); | ||
62 | break; | ||
63 | |||
64 | default: | ||
65 | { | ||
66 | u32 redir_index = (ioapic->ioregsel - 0x10) >> 1; | ||
67 | u64 redir_content; | ||
68 | |||
69 | ASSERT(redir_index < IOAPIC_NUM_PINS); | ||
70 | |||
71 | redir_content = ioapic->redirtbl[redir_index].bits; | ||
72 | result = (ioapic->ioregsel & 0x1) ? | ||
73 | (redir_content >> 32) & 0xffffffff : | ||
74 | redir_content & 0xffffffff; | ||
75 | break; | ||
76 | } | ||
77 | } | ||
78 | |||
79 | return result; | ||
80 | } | ||
81 | |||
82 | static void ioapic_service(struct kvm_ioapic *ioapic, unsigned int idx) | ||
83 | { | ||
84 | union ioapic_redir_entry *pent; | ||
85 | |||
86 | pent = &ioapic->redirtbl[idx]; | ||
87 | |||
88 | if (!pent->fields.mask) { | ||
89 | ioapic_deliver(ioapic, idx); | ||
90 | if (pent->fields.trig_mode == IOAPIC_LEVEL_TRIG) | ||
91 | pent->fields.remote_irr = 1; | ||
92 | } | ||
93 | if (!pent->fields.trig_mode) | ||
94 | ioapic->irr &= ~(1 << idx); | ||
95 | } | ||
96 | |||
97 | static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val) | ||
98 | { | ||
99 | unsigned index; | ||
100 | |||
101 | switch (ioapic->ioregsel) { | ||
102 | case IOAPIC_REG_VERSION: | ||
103 | /* Writes are ignored. */ | ||
104 | break; | ||
105 | |||
106 | case IOAPIC_REG_APIC_ID: | ||
107 | ioapic->id = (val >> 24) & 0xf; | ||
108 | break; | ||
109 | |||
110 | case IOAPIC_REG_ARB_ID: | ||
111 | break; | ||
112 | |||
113 | default: | ||
114 | index = (ioapic->ioregsel - 0x10) >> 1; | ||
115 | |||
116 | ioapic_debug("change redir index %x val %x\n", index, val); | ||
117 | if (index >= IOAPIC_NUM_PINS) | ||
118 | return; | ||
119 | if (ioapic->ioregsel & 1) { | ||
120 | ioapic->redirtbl[index].bits &= 0xffffffff; | ||
121 | ioapic->redirtbl[index].bits |= (u64) val << 32; | ||
122 | } else { | ||
123 | ioapic->redirtbl[index].bits &= ~0xffffffffULL; | ||
124 | ioapic->redirtbl[index].bits |= (u32) val; | ||
125 | ioapic->redirtbl[index].fields.remote_irr = 0; | ||
126 | } | ||
127 | if (ioapic->irr & (1 << index)) | ||
128 | ioapic_service(ioapic, index); | ||
129 | break; | ||
130 | } | ||
131 | } | ||
132 | |||
133 | static void ioapic_inj_irq(struct kvm_ioapic *ioapic, | ||
134 | struct kvm_vcpu *vcpu, | ||
135 | u8 vector, u8 trig_mode, u8 delivery_mode) | ||
136 | { | ||
137 | ioapic_debug("irq %d trig %d deliv %d\n", vector, trig_mode, | ||
138 | delivery_mode); | ||
139 | |||
140 | ASSERT((delivery_mode == IOAPIC_FIXED) || | ||
141 | (delivery_mode == IOAPIC_LOWEST_PRIORITY)); | ||
142 | |||
143 | kvm_apic_set_irq(vcpu, vector, trig_mode); | ||
144 | } | ||
145 | |||
146 | static u32 ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest, | ||
147 | u8 dest_mode) | ||
148 | { | ||
149 | u32 mask = 0; | ||
150 | int i; | ||
151 | struct kvm *kvm = ioapic->kvm; | ||
152 | struct kvm_vcpu *vcpu; | ||
153 | |||
154 | ioapic_debug("dest %d dest_mode %d\n", dest, dest_mode); | ||
155 | |||
156 | if (dest_mode == 0) { /* Physical mode. */ | ||
157 | if (dest == 0xFF) { /* Broadcast. */ | ||
158 | for (i = 0; i < KVM_MAX_VCPUS; ++i) | ||
159 | if (kvm->vcpus[i] && kvm->vcpus[i]->arch.apic) | ||
160 | mask |= 1 << i; | ||
161 | return mask; | ||
162 | } | ||
163 | for (i = 0; i < KVM_MAX_VCPUS; ++i) { | ||
164 | vcpu = kvm->vcpus[i]; | ||
165 | if (!vcpu) | ||
166 | continue; | ||
167 | if (kvm_apic_match_physical_addr(vcpu->arch.apic, dest)) { | ||
168 | if (vcpu->arch.apic) | ||
169 | mask = 1 << i; | ||
170 | break; | ||
171 | } | ||
172 | } | ||
173 | } else if (dest != 0) /* Logical mode, MDA non-zero. */ | ||
174 | for (i = 0; i < KVM_MAX_VCPUS; ++i) { | ||
175 | vcpu = kvm->vcpus[i]; | ||
176 | if (!vcpu) | ||
177 | continue; | ||
178 | if (vcpu->arch.apic && | ||
179 | kvm_apic_match_logical_addr(vcpu->arch.apic, dest)) | ||
180 | mask |= 1 << vcpu->vcpu_id; | ||
181 | } | ||
182 | ioapic_debug("mask %x\n", mask); | ||
183 | return mask; | ||
184 | } | ||
185 | |||
186 | static void ioapic_deliver(struct kvm_ioapic *ioapic, int irq) | ||
187 | { | ||
188 | u8 dest = ioapic->redirtbl[irq].fields.dest_id; | ||
189 | u8 dest_mode = ioapic->redirtbl[irq].fields.dest_mode; | ||
190 | u8 delivery_mode = ioapic->redirtbl[irq].fields.delivery_mode; | ||
191 | u8 vector = ioapic->redirtbl[irq].fields.vector; | ||
192 | u8 trig_mode = ioapic->redirtbl[irq].fields.trig_mode; | ||
193 | u32 deliver_bitmask; | ||
194 | struct kvm_vcpu *vcpu; | ||
195 | int vcpu_id; | ||
196 | |||
197 | ioapic_debug("dest=%x dest_mode=%x delivery_mode=%x " | ||
198 | "vector=%x trig_mode=%x\n", | ||
199 | dest, dest_mode, delivery_mode, vector, trig_mode); | ||
200 | |||
201 | deliver_bitmask = ioapic_get_delivery_bitmask(ioapic, dest, dest_mode); | ||
202 | if (!deliver_bitmask) { | ||
203 | ioapic_debug("no target on destination\n"); | ||
204 | return; | ||
205 | } | ||
206 | |||
207 | switch (delivery_mode) { | ||
208 | case IOAPIC_LOWEST_PRIORITY: | ||
209 | vcpu = kvm_get_lowest_prio_vcpu(ioapic->kvm, vector, | ||
210 | deliver_bitmask); | ||
211 | if (vcpu != NULL) | ||
212 | ioapic_inj_irq(ioapic, vcpu, vector, | ||
213 | trig_mode, delivery_mode); | ||
214 | else | ||
215 | ioapic_debug("null lowest prio vcpu: " | ||
216 | "mask=%x vector=%x delivery_mode=%x\n", | ||
217 | deliver_bitmask, vector, IOAPIC_LOWEST_PRIORITY); | ||
218 | break; | ||
219 | case IOAPIC_FIXED: | ||
220 | for (vcpu_id = 0; deliver_bitmask != 0; vcpu_id++) { | ||
221 | if (!(deliver_bitmask & (1 << vcpu_id))) | ||
222 | continue; | ||
223 | deliver_bitmask &= ~(1 << vcpu_id); | ||
224 | vcpu = ioapic->kvm->vcpus[vcpu_id]; | ||
225 | if (vcpu) { | ||
226 | ioapic_inj_irq(ioapic, vcpu, vector, | ||
227 | trig_mode, delivery_mode); | ||
228 | } | ||
229 | } | ||
230 | break; | ||
231 | |||
232 | /* TODO: NMI */ | ||
233 | default: | ||
234 | printk(KERN_WARNING "Unsupported delivery mode %d\n", | ||
235 | delivery_mode); | ||
236 | break; | ||
237 | } | ||
238 | } | ||
239 | |||
240 | void kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level) | ||
241 | { | ||
242 | u32 old_irr = ioapic->irr; | ||
243 | u32 mask = 1 << irq; | ||
244 | union ioapic_redir_entry entry; | ||
245 | |||
246 | if (irq >= 0 && irq < IOAPIC_NUM_PINS) { | ||
247 | entry = ioapic->redirtbl[irq]; | ||
248 | level ^= entry.fields.polarity; | ||
249 | if (!level) | ||
250 | ioapic->irr &= ~mask; | ||
251 | else { | ||
252 | ioapic->irr |= mask; | ||
253 | if ((!entry.fields.trig_mode && old_irr != ioapic->irr) | ||
254 | || !entry.fields.remote_irr) | ||
255 | ioapic_service(ioapic, irq); | ||
256 | } | ||
257 | } | ||
258 | } | ||
259 | |||
260 | static int get_eoi_gsi(struct kvm_ioapic *ioapic, int vector) | ||
261 | { | ||
262 | int i; | ||
263 | |||
264 | for (i = 0; i < IOAPIC_NUM_PINS; i++) | ||
265 | if (ioapic->redirtbl[i].fields.vector == vector) | ||
266 | return i; | ||
267 | return -1; | ||
268 | } | ||
269 | |||
270 | void kvm_ioapic_update_eoi(struct kvm *kvm, int vector) | ||
271 | { | ||
272 | struct kvm_ioapic *ioapic = kvm->arch.vioapic; | ||
273 | union ioapic_redir_entry *ent; | ||
274 | int gsi; | ||
275 | |||
276 | gsi = get_eoi_gsi(ioapic, vector); | ||
277 | if (gsi == -1) { | ||
278 | printk(KERN_WARNING "Can't find redir item for %d EOI\n", | ||
279 | vector); | ||
280 | return; | ||
281 | } | ||
282 | |||
283 | ent = &ioapic->redirtbl[gsi]; | ||
284 | ASSERT(ent->fields.trig_mode == IOAPIC_LEVEL_TRIG); | ||
285 | |||
286 | ent->fields.remote_irr = 0; | ||
287 | if (!ent->fields.mask && (ioapic->irr & (1 << gsi))) | ||
288 | ioapic_deliver(ioapic, gsi); | ||
289 | } | ||
290 | |||
291 | static int ioapic_in_range(struct kvm_io_device *this, gpa_t addr) | ||
292 | { | ||
293 | struct kvm_ioapic *ioapic = (struct kvm_ioapic *)this->private; | ||
294 | |||
295 | return ((addr >= ioapic->base_address && | ||
296 | (addr < ioapic->base_address + IOAPIC_MEM_LENGTH))); | ||
297 | } | ||
298 | |||
299 | static void ioapic_mmio_read(struct kvm_io_device *this, gpa_t addr, int len, | ||
300 | void *val) | ||
301 | { | ||
302 | struct kvm_ioapic *ioapic = (struct kvm_ioapic *)this->private; | ||
303 | u32 result; | ||
304 | |||
305 | ioapic_debug("addr %lx\n", (unsigned long)addr); | ||
306 | ASSERT(!(addr & 0xf)); /* check alignment */ | ||
307 | |||
308 | addr &= 0xff; | ||
309 | switch (addr) { | ||
310 | case IOAPIC_REG_SELECT: | ||
311 | result = ioapic->ioregsel; | ||
312 | break; | ||
313 | |||
314 | case IOAPIC_REG_WINDOW: | ||
315 | result = ioapic_read_indirect(ioapic, addr, len); | ||
316 | break; | ||
317 | |||
318 | default: | ||
319 | result = 0; | ||
320 | break; | ||
321 | } | ||
322 | switch (len) { | ||
323 | case 8: | ||
324 | *(u64 *) val = result; | ||
325 | break; | ||
326 | case 1: | ||
327 | case 2: | ||
328 | case 4: | ||
329 | memcpy(val, (char *)&result, len); | ||
330 | break; | ||
331 | default: | ||
332 | printk(KERN_WARNING "ioapic: wrong length %d\n", len); | ||
333 | } | ||
334 | } | ||
335 | |||
336 | static void ioapic_mmio_write(struct kvm_io_device *this, gpa_t addr, int len, | ||
337 | const void *val) | ||
338 | { | ||
339 | struct kvm_ioapic *ioapic = (struct kvm_ioapic *)this->private; | ||
340 | u32 data; | ||
341 | |||
342 | ioapic_debug("ioapic_mmio_write addr=%p len=%d val=%p\n", | ||
343 | (void*)addr, len, val); | ||
344 | ASSERT(!(addr & 0xf)); /* check alignment */ | ||
345 | if (len == 4 || len == 8) | ||
346 | data = *(u32 *) val; | ||
347 | else { | ||
348 | printk(KERN_WARNING "ioapic: Unsupported size %d\n", len); | ||
349 | return; | ||
350 | } | ||
351 | |||
352 | addr &= 0xff; | ||
353 | switch (addr) { | ||
354 | case IOAPIC_REG_SELECT: | ||
355 | ioapic->ioregsel = data; | ||
356 | break; | ||
357 | |||
358 | case IOAPIC_REG_WINDOW: | ||
359 | ioapic_write_indirect(ioapic, data); | ||
360 | break; | ||
361 | #ifdef CONFIG_IA64 | ||
362 | case IOAPIC_REG_EOI: | ||
363 | kvm_ioapic_update_eoi(ioapic, data); | ||
364 | break; | ||
365 | #endif | ||
366 | |||
367 | default: | ||
368 | break; | ||
369 | } | ||
370 | } | ||
371 | |||
372 | void kvm_ioapic_reset(struct kvm_ioapic *ioapic) | ||
373 | { | ||
374 | int i; | ||
375 | |||
376 | for (i = 0; i < IOAPIC_NUM_PINS; i++) | ||
377 | ioapic->redirtbl[i].fields.mask = 1; | ||
378 | ioapic->base_address = IOAPIC_DEFAULT_BASE_ADDRESS; | ||
379 | ioapic->ioregsel = 0; | ||
380 | ioapic->irr = 0; | ||
381 | ioapic->id = 0; | ||
382 | } | ||
383 | |||
384 | int kvm_ioapic_init(struct kvm *kvm) | ||
385 | { | ||
386 | struct kvm_ioapic *ioapic; | ||
387 | |||
388 | ioapic = kzalloc(sizeof(struct kvm_ioapic), GFP_KERNEL); | ||
389 | if (!ioapic) | ||
390 | return -ENOMEM; | ||
391 | kvm->arch.vioapic = ioapic; | ||
392 | kvm_ioapic_reset(ioapic); | ||
393 | ioapic->dev.read = ioapic_mmio_read; | ||
394 | ioapic->dev.write = ioapic_mmio_write; | ||
395 | ioapic->dev.in_range = ioapic_in_range; | ||
396 | ioapic->dev.private = ioapic; | ||
397 | ioapic->kvm = kvm; | ||
398 | kvm_io_bus_register_dev(&kvm->mmio_bus, &ioapic->dev); | ||
399 | return 0; | ||
400 | } | ||
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c new file mode 100644 index 000000000000..07a09aad4fd6 --- /dev/null +++ b/arch/x86/kvm/irq.c | |||
@@ -0,0 +1,98 @@ | |||
1 | /* | ||
2 | * irq.c: API for in kernel interrupt controller | ||
3 | * Copyright (c) 2007, Intel Corporation. | ||
4 | * | ||
5 | * This program is free software; you can redistribute it and/or modify it | ||
6 | * under the terms and conditions of the GNU General Public License, | ||
7 | * version 2, as published by the Free Software Foundation. | ||
8 | * | ||
9 | * This program is distributed in the hope it will be useful, but WITHOUT | ||
10 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | ||
11 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for | ||
12 | * more details. | ||
13 | * | ||
14 | * You should have received a copy of the GNU General Public License along with | ||
15 | * this program; if not, write to the Free Software Foundation, Inc., 59 Temple | ||
16 | * Place - Suite 330, Boston, MA 02111-1307 USA. | ||
17 | * Authors: | ||
18 | * Yaozu (Eddie) Dong <Eddie.dong@intel.com> | ||
19 | * | ||
20 | */ | ||
21 | |||
22 | #include <linux/module.h> | ||
23 | #include <linux/kvm_host.h> | ||
24 | |||
25 | #include "irq.h" | ||
26 | |||
27 | /* | ||
28 | * check if there is pending interrupt without | ||
29 | * intack. | ||
30 | */ | ||
31 | int kvm_cpu_has_interrupt(struct kvm_vcpu *v) | ||
32 | { | ||
33 | struct kvm_pic *s; | ||
34 | |||
35 | if (kvm_apic_has_interrupt(v) == -1) { /* LAPIC */ | ||
36 | if (kvm_apic_accept_pic_intr(v)) { | ||
37 | s = pic_irqchip(v->kvm); /* PIC */ | ||
38 | return s->output; | ||
39 | } else | ||
40 | return 0; | ||
41 | } | ||
42 | return 1; | ||
43 | } | ||
44 | EXPORT_SYMBOL_GPL(kvm_cpu_has_interrupt); | ||
45 | |||
46 | /* | ||
47 | * Read pending interrupt vector and intack. | ||
48 | */ | ||
49 | int kvm_cpu_get_interrupt(struct kvm_vcpu *v) | ||
50 | { | ||
51 | struct kvm_pic *s; | ||
52 | int vector; | ||
53 | |||
54 | vector = kvm_get_apic_interrupt(v); /* APIC */ | ||
55 | if (vector == -1) { | ||
56 | if (kvm_apic_accept_pic_intr(v)) { | ||
57 | s = pic_irqchip(v->kvm); | ||
58 | s->output = 0; /* PIC */ | ||
59 | vector = kvm_pic_read_irq(s); | ||
60 | } | ||
61 | } | ||
62 | return vector; | ||
63 | } | ||
64 | EXPORT_SYMBOL_GPL(kvm_cpu_get_interrupt); | ||
65 | |||
/*
 * IPI handler used by kvm_vcpu_kick().  Intentionally (almost) empty:
 * the arrival of the interrupt itself is what forces the target CPU
 * out of guest mode; no work is needed in the handler body.
 */
static void vcpu_kick_intr(void *info)
{
#ifdef DEBUG
	struct kvm_vcpu *vcpu = (struct kvm_vcpu *)info;
	printk(KERN_DEBUG "vcpu_kick_intr %p \n", vcpu);
#endif
}
73 | |||
74 | void kvm_vcpu_kick(struct kvm_vcpu *vcpu) | ||
75 | { | ||
76 | int ipi_pcpu = vcpu->cpu; | ||
77 | |||
78 | if (waitqueue_active(&vcpu->wq)) { | ||
79 | wake_up_interruptible(&vcpu->wq); | ||
80 | ++vcpu->stat.halt_wakeup; | ||
81 | } | ||
82 | if (vcpu->guest_mode) | ||
83 | smp_call_function_single(ipi_pcpu, vcpu_kick_intr, vcpu, 0, 0); | ||
84 | } | ||
85 | |||
/*
 * Inject interrupts that accumulated from in-kernel timer sources
 * while the vcpu was not running.  Only the LAPIC timer exists so far.
 */
void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu)
{
	kvm_inject_apic_timer_irqs(vcpu);
	/* TODO: PIT, RTC etc. */
}
EXPORT_SYMBOL_GPL(kvm_inject_pending_timer_irqs);
92 | |||
/*
 * Notify in-kernel timer devices that vector @vec was just injected,
 * so they can update their pending state.  LAPIC timer only, for now.
 */
void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec)
{
	kvm_apic_timer_intr_post(vcpu, vec);
	/* TODO: PIT, RTC etc. */
}
EXPORT_SYMBOL_GPL(kvm_timer_intr_post);
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h new file mode 100644 index 000000000000..6316638eec9f --- /dev/null +++ b/arch/x86/kvm/irq.h | |||
@@ -0,0 +1,195 @@ | |||
1 | /* | ||
2 | * irq.h: in kernel interrupt controller related definitions | ||
3 | * Copyright (c) 2007, Intel Corporation. | ||
4 | * | ||
5 | * This program is free software; you can redistribute it and/or modify it | ||
6 | * under the terms and conditions of the GNU General Public License, | ||
7 | * version 2, as published by the Free Software Foundation. | ||
8 | * | ||
9 | * This program is distributed in the hope it will be useful, but WITHOUT | ||
10 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | ||
11 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for | ||
12 | * more details. | ||
13 | * | ||
14 | * You should have received a copy of the GNU General Public License along with | ||
15 | * this program; if not, write to the Free Software Foundation, Inc., 59 Temple | ||
16 | * Place - Suite 330, Boston, MA 02111-1307 USA. | ||
17 | * Authors: | ||
18 | * Yaozu (Eddie) Dong <Eddie.dong@intel.com> | ||
19 | * | ||
20 | */ | ||
21 | |||
22 | #ifndef __IRQ_H | ||
23 | #define __IRQ_H | ||
24 | |||
25 | #include <linux/mm_types.h> | ||
26 | #include <linux/hrtimer.h> | ||
27 | #include <linux/kvm_host.h> | ||
28 | #include "iodev.h" | ||
29 | |||
30 | struct kvm; | ||
31 | struct kvm_vcpu; | ||
32 | |||
33 | typedef void irq_request_func(void *opaque, int level); | ||
34 | |||
/*
 * Software state of one 8259A interrupt-controller chip; two instances
 * (master and slave) make up the virtual PIC.
 */
struct kvm_kpic_state {
	u8 last_irr;	/* edge detection */
	u8 irr;		/* interrupt request register */
	u8 imr;		/* interrupt mask register */
	u8 isr;		/* interrupt service register */
	u8 priority_add;	/* highest irq priority */
	u8 irq_base;		/* vector base added to the irq number */
	u8 read_reg_select;
	u8 poll;
	u8 special_mask;
	u8 init_state;		/* progress through the ICW init sequence */
	u8 auto_eoi;
	u8 rotate_on_auto_eoi;
	u8 special_fully_nested_mode;
	u8 init4;		/* true if 4 byte init */
	u8 elcr;		/* PIIX edge/trigger selection */
	u8 elcr_mask;
	struct kvm_pic *pics_state;	/* back-pointer to the owning pair */
};
54 | |||
/*
 * The in-kernel virtual PIC: a cascaded pair of 8259As exposed to the
 * guest as a kvm_io_device.
 */
struct kvm_pic {
	struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */
	irq_request_func *irq_request;		/* called with irq_request_opaque */
	void *irq_request_opaque;
	int output;		/* intr from master PIC */
	struct kvm_io_device dev;
};
62 | |||
63 | struct kvm_pic *kvm_create_pic(struct kvm *kvm); | ||
64 | void kvm_pic_set_irq(void *opaque, int irq, int level); | ||
65 | int kvm_pic_read_irq(struct kvm_pic *s); | ||
66 | void kvm_pic_update_irq(struct kvm_pic *s); | ||
67 | |||
68 | #define IOAPIC_NUM_PINS KVM_IOAPIC_NUM_PINS | ||
69 | #define IOAPIC_VERSION_ID 0x11 /* IOAPIC version */ | ||
70 | #define IOAPIC_EDGE_TRIG 0 | ||
71 | #define IOAPIC_LEVEL_TRIG 1 | ||
72 | |||
73 | #define IOAPIC_DEFAULT_BASE_ADDRESS 0xfec00000 | ||
74 | #define IOAPIC_MEM_LENGTH 0x100 | ||
75 | |||
76 | /* Direct registers. */ | ||
77 | #define IOAPIC_REG_SELECT 0x00 | ||
78 | #define IOAPIC_REG_WINDOW 0x10 | ||
79 | #define IOAPIC_REG_EOI 0x40 /* IA64 IOSAPIC only */ | ||
80 | |||
81 | /* Indirect registers. */ | ||
82 | #define IOAPIC_REG_APIC_ID 0x00 /* x86 IOAPIC only */ | ||
83 | #define IOAPIC_REG_VERSION 0x01 | ||
84 | #define IOAPIC_REG_ARB_ID 0x02 /* x86 IOAPIC only */ | ||
85 | |||
86 | /*ioapic delivery mode*/ | ||
87 | #define IOAPIC_FIXED 0x0 | ||
88 | #define IOAPIC_LOWEST_PRIORITY 0x1 | ||
89 | #define IOAPIC_PMI 0x2 | ||
90 | #define IOAPIC_NMI 0x4 | ||
91 | #define IOAPIC_INIT 0x5 | ||
92 | #define IOAPIC_EXTINT 0x7 | ||
93 | |||
/*
 * State of the in-kernel virtual IOAPIC.  The redirection table mirrors
 * the hardware 64-bit redirection-entry layout bit for bit.
 */
struct kvm_ioapic {
	u64 base_address;	/* guest-physical base of the MMIO window */
	u32 ioregsel;		/* currently selected indirect register */
	u32 id;
	u32 irr;		/* per-pin interrupt request bits */
	u32 pad;
	union ioapic_redir_entry {
		u64 bits;
		struct {
			u8 vector;
			u8 delivery_mode:3;
			u8 dest_mode:1;		/* 0 physical, 1 logical */
			u8 delivery_status:1;
			u8 polarity:1;
			u8 remote_irr:1;
			u8 trig_mode:1;		/* 0 edge, 1 level */
			u8 mask:1;
			u8 reserve:7;
			u8 reserved[4];
			u8 dest_id;
		} fields;
	} redirtbl[IOAPIC_NUM_PINS];
	struct kvm_io_device dev;
	struct kvm *kvm;	/* back-pointer to the owning VM */
};
119 | |||
/*
 * State of one in-kernel virtual local APIC.  The architectural
 * register file lives in a dedicated page (regs_page/regs); the APIC
 * timer is driven by an hrtimer.
 */
struct kvm_lapic {
	unsigned long base_address;	/* guest-physical MMIO base */
	struct kvm_io_device dev;
	struct {
		atomic_t pending;	/* timer ticks not yet injected */
		s64 period;		/* unit: ns */
		u32 divide_count;	/* decoded from APIC_TDCR */
		ktime_t last_update;	/* when TMICT was last armed */
		struct hrtimer dev;
	} timer;
	struct kvm_vcpu *vcpu;		/* owning vcpu */
	struct page *regs_page;		/* backing page for 'regs' */
	void *regs;			/* mapped APIC register file */
};
134 | |||
135 | #ifdef DEBUG | ||
136 | #define ASSERT(x) \ | ||
137 | do { \ | ||
138 | if (!(x)) { \ | ||
139 | printk(KERN_EMERG "assertion failed %s: %d: %s\n", \ | ||
140 | __FILE__, __LINE__, #x); \ | ||
141 | BUG(); \ | ||
142 | } \ | ||
143 | } while (0) | ||
144 | #else | ||
145 | #define ASSERT(x) do { } while (0) | ||
146 | #endif | ||
147 | |||
/* The in-kernel PIC of @kvm, or NULL when the irqchip is in userspace. */
static inline struct kvm_pic *pic_irqchip(struct kvm *kvm)
{
	return kvm->arch.vpic;
}

/* The in-kernel IOAPIC of @kvm, or NULL when not created. */
static inline struct kvm_ioapic *ioapic_irqchip(struct kvm *kvm)
{
	return kvm->arch.vioapic;
}

/*
 * True when the interrupt controllers are emulated in the kernel;
 * testing the PIC pointer stands in for the whole in-kernel irqchip.
 */
static inline int irqchip_in_kernel(struct kvm *kvm)
{
	return pic_irqchip(kvm) != NULL;
}
162 | |||
163 | void kvm_vcpu_kick(struct kvm_vcpu *vcpu); | ||
164 | int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu); | ||
165 | int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu); | ||
166 | int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu); | ||
167 | int kvm_create_lapic(struct kvm_vcpu *vcpu); | ||
168 | void kvm_lapic_reset(struct kvm_vcpu *vcpu); | ||
169 | void kvm_pic_reset(struct kvm_kpic_state *s); | ||
170 | void kvm_ioapic_reset(struct kvm_ioapic *ioapic); | ||
171 | void kvm_free_lapic(struct kvm_vcpu *vcpu); | ||
172 | u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu); | ||
173 | void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8); | ||
174 | void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value); | ||
175 | |||
176 | struct kvm_vcpu *kvm_get_lowest_prio_vcpu(struct kvm *kvm, u8 vector, | ||
177 | unsigned long bitmap); | ||
178 | u64 kvm_get_apic_base(struct kvm_vcpu *vcpu); | ||
179 | void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data); | ||
180 | int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest); | ||
181 | void kvm_ioapic_update_eoi(struct kvm *kvm, int vector); | ||
182 | int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda); | ||
183 | int kvm_apic_set_irq(struct kvm_vcpu *vcpu, u8 vec, u8 trig); | ||
184 | void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu); | ||
185 | int kvm_ioapic_init(struct kvm *kvm); | ||
186 | void kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level); | ||
187 | int kvm_lapic_enabled(struct kvm_vcpu *vcpu); | ||
188 | int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu); | ||
189 | void kvm_apic_timer_intr_post(struct kvm_vcpu *vcpu, int vec); | ||
190 | void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec); | ||
191 | void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu); | ||
192 | void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu); | ||
193 | void kvm_migrate_apic_timer(struct kvm_vcpu *vcpu); | ||
194 | |||
195 | #endif | ||
diff --git a/arch/x86/kvm/kvm_svm.h b/arch/x86/kvm/kvm_svm.h new file mode 100644 index 000000000000..ecdfe97e4635 --- /dev/null +++ b/arch/x86/kvm/kvm_svm.h | |||
@@ -0,0 +1,45 @@ | |||
1 | #ifndef __KVM_SVM_H | ||
2 | #define __KVM_SVM_H | ||
3 | |||
4 | #include <linux/kernel.h> | ||
5 | #include <linux/types.h> | ||
6 | #include <linux/list.h> | ||
7 | #include <linux/kvm_host.h> | ||
8 | #include <asm/msr.h> | ||
9 | |||
10 | #include "svm.h" | ||
11 | |||
/*
 * Host MSRs preserved across guest execution; their values are stored
 * in vcpu_svm::host_user_msrs (NR_HOST_SAVE_USER_MSRS entries).
 */
static const u32 host_save_user_msrs[] = {
#ifdef CONFIG_X86_64
	MSR_STAR, MSR_LSTAR, MSR_CSTAR, MSR_SYSCALL_MASK, MSR_KERNEL_GS_BASE,
	MSR_FS_BASE,
#endif
	MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
};
19 | |||
20 | #define NR_HOST_SAVE_USER_MSRS ARRAY_SIZE(host_save_user_msrs) | ||
21 | #define NUM_DB_REGS 4 | ||
22 | |||
23 | struct kvm_vcpu; | ||
24 | |||
/*
 * Per-vcpu SVM state.  The generic kvm_vcpu is embedded first so the
 * containing structure can be recovered from a kvm_vcpu pointer.
 */
struct vcpu_svm {
	struct kvm_vcpu vcpu;
	struct vmcb *vmcb;		/* mapped VM control block */
	unsigned long vmcb_pa;		/* its physical address (by name) */
	struct svm_cpu_data *svm_data;
	uint64_t asid_generation;

	unsigned long db_regs[NUM_DB_REGS];	/* guest debug registers */

	u64 next_rip;			/* rip to resume at after emulation */

	/* Host state saved around guest execution. */
	u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS];
	u64 host_gs_base;
	unsigned long host_cr2;
	unsigned long host_db_regs[NUM_DB_REGS];
	unsigned long host_dr6;
	unsigned long host_dr7;
};
43 | |||
44 | #endif | ||
45 | |||
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c new file mode 100644 index 000000000000..4076331b01ee --- /dev/null +++ b/arch/x86/kvm/lapic.c | |||
@@ -0,0 +1,1085 @@ | |||
1 | |||
2 | /* | ||
3 | * Local APIC virtualization | ||
4 | * | ||
5 | * Copyright (C) 2006 Qumranet, Inc. | ||
6 | * Copyright (C) 2007 Novell | ||
7 | * Copyright (C) 2007 Intel | ||
8 | * | ||
9 | * Authors: | ||
10 | * Dor Laor <dor.laor@qumranet.com> | ||
11 | * Gregory Haskins <ghaskins@novell.com> | ||
12 | * Yaozu (Eddie) Dong <eddie.dong@intel.com> | ||
13 | * | ||
14 | * Based on Xen 3.1 code, Copyright (c) 2004, Intel Corporation. | ||
15 | * | ||
16 | * This work is licensed under the terms of the GNU GPL, version 2. See | ||
17 | * the COPYING file in the top-level directory. | ||
18 | */ | ||
19 | |||
20 | #include <linux/kvm_host.h> | ||
21 | #include <linux/kvm.h> | ||
22 | #include <linux/mm.h> | ||
23 | #include <linux/highmem.h> | ||
24 | #include <linux/smp.h> | ||
25 | #include <linux/hrtimer.h> | ||
26 | #include <linux/io.h> | ||
27 | #include <linux/module.h> | ||
28 | #include <asm/processor.h> | ||
29 | #include <asm/msr.h> | ||
30 | #include <asm/page.h> | ||
31 | #include <asm/current.h> | ||
32 | #include <asm/apicdef.h> | ||
33 | #include <asm/atomic.h> | ||
34 | #include <asm/div64.h> | ||
35 | #include "irq.h" | ||
36 | |||
37 | #define PRId64 "d" | ||
38 | #define PRIx64 "llx" | ||
39 | #define PRIu64 "u" | ||
40 | #define PRIo64 "o" | ||
41 | |||
42 | #define APIC_BUS_CYCLE_NS 1 | ||
43 | |||
44 | /* #define apic_debug(fmt,arg...) printk(KERN_WARNING fmt,##arg) */ | ||
45 | #define apic_debug(fmt, arg...) | ||
46 | |||
47 | #define APIC_LVT_NUM 6 | ||
48 | /* 14 is the version for Xeon and Pentium 8.4.8*/ | ||
49 | #define APIC_VERSION (0x14UL | ((APIC_LVT_NUM - 1) << 16)) | ||
50 | #define LAPIC_MMIO_LENGTH (1 << 12) | ||
51 | /* followed define is not in apicdef.h */ | ||
52 | #define APIC_SHORT_MASK 0xc0000 | ||
53 | #define APIC_DEST_NOSHORT 0x0 | ||
54 | #define APIC_DEST_MASK 0x800 | ||
55 | #define MAX_APIC_VECTOR 256 | ||
56 | |||
57 | #define VEC_POS(v) ((v) & (32 - 1)) | ||
58 | #define REG_POS(v) (((v) >> 5) << 4) | ||
59 | |||
/* Read a 32-bit LAPIC register at byte offset @reg_off in the register page. */
static inline u32 apic_get_reg(struct kvm_lapic *apic, int reg_off)
{
	return *((u32 *) (apic->regs + reg_off));
}

/* Write a 32-bit LAPIC register at byte offset @reg_off. */
static inline void apic_set_reg(struct kvm_lapic *apic, int reg_off, u32 val)
{
	*((u32 *) (apic->regs + reg_off)) = val;
}

/*
 * The 256-entry vector bitmaps (IRR/ISR/TMR) mirror the hardware
 * layout: eight 32-bit registers spaced 16 bytes apart.  VEC_POS and
 * REG_POS translate a vector number into (bit index, byte offset).
 */
static inline int apic_test_and_set_vector(int vec, void *bitmap)
{
	return test_and_set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
}

static inline int apic_test_and_clear_vector(int vec, void *bitmap)
{
	return test_and_clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
}

static inline void apic_set_vector(int vec, void *bitmap)
{
	set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
}

static inline void apic_clear_vector(int vec, void *bitmap)
{
	clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
}
89 | |||
/* Hardware-enabled: the enable bit in the IA32_APICBASE MSR is set. */
static inline int apic_hw_enabled(struct kvm_lapic *apic)
{
	return (apic)->vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE;
}

/* Software-enabled: the enable bit in the spurious-interrupt vector reg. */
static inline int apic_sw_enabled(struct kvm_lapic *apic)
{
	return apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_APIC_ENABLED;
}

/* The APIC accepts interrupts only when enabled both in HW and SW. */
static inline int apic_enabled(struct kvm_lapic *apic)
{
	return apic_sw_enabled(apic) &&	apic_hw_enabled(apic);
}
104 | |||
105 | #define LVT_MASK \ | ||
106 | (APIC_LVT_MASKED | APIC_SEND_PENDING | APIC_VECTOR_MASK) | ||
107 | |||
108 | #define LINT_MASK \ | ||
109 | (LVT_MASK | APIC_MODE_MASK | APIC_INPUT_POLARITY | \ | ||
110 | APIC_LVT_REMOTE_IRR | APIC_LVT_LEVEL_TRIGGER) | ||
111 | |||
/* The 8-bit APIC id lives in bits 31:24 of the ID register. */
static inline int kvm_apic_id(struct kvm_lapic *apic)
{
	return (apic_get_reg(apic, APIC_ID) >> 24) & 0xff;
}

/* An LVT entry delivers interrupts only while its mask bit is clear. */
static inline int apic_lvt_enabled(struct kvm_lapic *apic, int lvt_type)
{
	return !(apic_get_reg(apic, lvt_type) & APIC_LVT_MASKED);
}

/* Vector field (low byte) of the given LVT register. */
static inline int apic_lvt_vector(struct kvm_lapic *apic, int lvt_type)
{
	return apic_get_reg(apic, lvt_type) & APIC_VECTOR_MASK;
}

/* True when the LVT timer entry is programmed for periodic mode. */
static inline int apic_lvtt_period(struct kvm_lapic *apic)
{
	return apic_get_reg(apic, APIC_LVTT) & APIC_LVT_TIMER_PERIODIC;
}

/* Writable-bit masks, one per LVT register, used to filter guest writes. */
static unsigned int apic_lvt_mask[APIC_LVT_NUM] = {
	LVT_MASK | APIC_LVT_TIMER_PERIODIC,	/* LVTT */
	LVT_MASK | APIC_MODE_MASK,	/* LVTTHMR */
	LVT_MASK | APIC_MODE_MASK,	/* LVTPC */
	LINT_MASK, LINT_MASK,	/* LVT0-1 */
	LVT_MASK		/* LVTERR */
};
139 | |||
/*
 * Return the highest vector set in a 256-bit APIC vector bitmap, or -1
 * if the bitmap is empty.  The registers are spaced 16 bytes (= four
 * u32 slots) apart, hence the << 2 when indexing 'word'.
 */
static int find_highest_vector(void *bitmap)
{
	u32 *word = bitmap;
	int word_offset = MAX_APIC_VECTOR >> 5;

	/* Scan downward for the highest non-empty 32-bit register. */
	while ((word_offset != 0) && (word[(--word_offset) << 2] == 0))
		continue;

	if (likely(!word_offset && !word[0]))
		return -1;	/* all registers empty */
	else
		return fls(word[word_offset << 2]) - 1 + (word_offset << 5);
}
153 | |||
/* Mark @vec pending in the IRR; returns the previous bit value. */
static inline int apic_test_and_set_irr(int vec, struct kvm_lapic *apic)
{
	return apic_test_and_set_vector(vec, apic->regs + APIC_IRR);
}

/* Clear @vec from the IRR (used when the interrupt is accepted). */
static inline void apic_clear_irr(int vec, struct kvm_lapic *apic)
{
	apic_clear_vector(vec, apic->regs + APIC_IRR);
}

/* Highest pending vector in the IRR, or -1.  Vectors 0-15 are illegal. */
static inline int apic_find_highest_irr(struct kvm_lapic *apic)
{
	int result;

	result = find_highest_vector(apic->regs + APIC_IRR);
	ASSERT(result == -1 || result >= 16);

	return result;
}
173 | |||
174 | int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu) | ||
175 | { | ||
176 | struct kvm_lapic *apic = vcpu->arch.apic; | ||
177 | int highest_irr; | ||
178 | |||
179 | if (!apic) | ||
180 | return 0; | ||
181 | highest_irr = apic_find_highest_irr(apic); | ||
182 | |||
183 | return highest_irr; | ||
184 | } | ||
185 | EXPORT_SYMBOL_GPL(kvm_lapic_find_highest_irr); | ||
186 | |||
/*
 * Queue vector @vec in the vcpu's LAPIC IRR.  @trig non-zero records
 * the vector in the TMR so a later EOI is forwarded to the IOAPIC
 * (see apic_set_eoi).  Returns 1 if a new interrupt was queued, 0 if
 * the vector was already pending.
 */
int kvm_apic_set_irq(struct kvm_vcpu *vcpu, u8 vec, u8 trig)
{
	struct kvm_lapic *apic = vcpu->arch.apic;

	if (!apic_test_and_set_irr(vec, apic)) {
		/* a new pending irq is set in IRR */
		if (trig)
			apic_set_vector(vec, apic->regs + APIC_TMR);
		else
			apic_clear_vector(vec, apic->regs + APIC_TMR);
		kvm_vcpu_kick(apic->vcpu);
		return 1;
	}
	return 0;
}
202 | |||
/* Highest in-service vector in the ISR, or -1.  Vectors 0-15 are illegal. */
static inline int apic_find_highest_isr(struct kvm_lapic *apic)
{
	int result;

	result = find_highest_vector(apic->regs + APIC_ISR);
	ASSERT(result == -1 || result >= 16);

	return result;
}
212 | |||
/*
 * Recompute the processor priority register: the maximum of the task
 * priority and the priority class of the highest in-service vector.
 */
static void apic_update_ppr(struct kvm_lapic *apic)
{
	u32 tpr, isrv, ppr;
	int isr;

	tpr = apic_get_reg(apic, APIC_TASKPRI);
	isr = apic_find_highest_isr(apic);
	isrv = (isr != -1) ? isr : 0;	/* -1 means no vector in service */

	/* Compare priority classes (bits 7:4); TPR wins ties. */
	if ((tpr & 0xf0) >= (isrv & 0xf0))
		ppr = tpr & 0xff;
	else
		ppr = isrv & 0xf0;

	apic_debug("vlapic %p, ppr 0x%x, isr 0x%x, isrv 0x%x",
		   apic, ppr, isr, isrv);

	apic_set_reg(apic, APIC_PROCPRI, ppr);
}
232 | |||
/* Set the task priority register and propagate the change to the PPR. */
static void apic_set_tpr(struct kvm_lapic *apic, u32 tpr)
{
	apic_set_reg(apic, APIC_TASKPRI, tpr);
	apic_update_ppr(apic);
}

/* Physical destination match: compare @dest against this APIC's id. */
int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest)
{
	return kvm_apic_id(apic) == dest;
}
243 | |||
/*
 * Logical destination match of @mda (message destination address)
 * against the APIC's logical id, under the programmed destination
 * format (flat or cluster).  Returns 1 on a match, 0 otherwise.
 */
int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda)
{
	int result = 0;
	u8 logical_id;

	logical_id = GET_APIC_LOGICAL_ID(apic_get_reg(apic, APIC_LDR));

	switch (apic_get_reg(apic, APIC_DFR)) {
	case APIC_DFR_FLAT:
		/* Flat model: mda is a bitmask of logical ids. */
		if (logical_id & mda)
			result = 1;
		break;
	case APIC_DFR_CLUSTER:
		/* Cluster model: high nibble selects the cluster,
		 * low nibble is a bitmask within it. */
		if (((logical_id >> 4) == (mda >> 0x4))
		    && (logical_id & mda & 0xf))
			result = 1;
		break;
	default:
		printk(KERN_WARNING "Bad DFR vcpu %d: %08x\n",
		       apic->vcpu->vcpu_id, apic_get_reg(apic, APIC_DFR));
		break;
	}

	return result;
}
269 | |||
270 | static int apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, | ||
271 | int short_hand, int dest, int dest_mode) | ||
272 | { | ||
273 | int result = 0; | ||
274 | struct kvm_lapic *target = vcpu->arch.apic; | ||
275 | |||
276 | apic_debug("target %p, source %p, dest 0x%x, " | ||
277 | "dest_mode 0x%x, short_hand 0x%x", | ||
278 | target, source, dest, dest_mode, short_hand); | ||
279 | |||
280 | ASSERT(!target); | ||
281 | switch (short_hand) { | ||
282 | case APIC_DEST_NOSHORT: | ||
283 | if (dest_mode == 0) { | ||
284 | /* Physical mode. */ | ||
285 | if ((dest == 0xFF) || (dest == kvm_apic_id(target))) | ||
286 | result = 1; | ||
287 | } else | ||
288 | /* Logical mode. */ | ||
289 | result = kvm_apic_match_logical_addr(target, dest); | ||
290 | break; | ||
291 | case APIC_DEST_SELF: | ||
292 | if (target == source) | ||
293 | result = 1; | ||
294 | break; | ||
295 | case APIC_DEST_ALLINC: | ||
296 | result = 1; | ||
297 | break; | ||
298 | case APIC_DEST_ALLBUT: | ||
299 | if (target != source) | ||
300 | result = 1; | ||
301 | break; | ||
302 | default: | ||
303 | printk(KERN_WARNING "Bad dest shorthand value %x\n", | ||
304 | short_hand); | ||
305 | break; | ||
306 | } | ||
307 | |||
308 | return result; | ||
309 | } | ||
310 | |||
311 | /* | ||
312 | * Add a pending IRQ into lapic. | ||
313 | * Return 1 if successfully added and 0 if discarded. | ||
314 | */ | ||
/*
 * Add a pending IRQ into lapic.
 * Return 1 if successfully added and 0 if discarded.
 */
static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
			     int vector, int level, int trig_mode)
{
	int orig_irr, result = 0;
	struct kvm_vcpu *vcpu = apic->vcpu;

	switch (delivery_mode) {
	case APIC_DM_FIXED:
	case APIC_DM_LOWEST:
		/* FIXME add logic for vcpu on reset */
		if (unlikely(!apic_enabled(apic)))
			break;

		/* A still-set IRR bit for a level interrupt means the
		 * previous occurrence was never serviced: drop this one. */
		orig_irr = apic_test_and_set_irr(vector, apic);
		if (orig_irr && trig_mode) {
			apic_debug("level trig mode repeatedly for vector %d",
				   vector);
			break;
		}

		/* Record the trigger mode in the TMR so EOI handling can
		 * forward level-triggered EOIs to the IOAPIC. */
		if (trig_mode) {
			apic_debug("level trig mode for vector %d", vector);
			apic_set_vector(vector, apic->regs + APIC_TMR);
		} else
			apic_clear_vector(vector, apic->regs + APIC_TMR);

		/* Make the target vcpu notice: kick it if running,
		 * or wake it if halted. */
		if (vcpu->arch.mp_state == VCPU_MP_STATE_RUNNABLE)
			kvm_vcpu_kick(vcpu);
		else if (vcpu->arch.mp_state == VCPU_MP_STATE_HALTED) {
			vcpu->arch.mp_state = VCPU_MP_STATE_RUNNABLE;
			if (waitqueue_active(&vcpu->wq))
				wake_up_interruptible(&vcpu->wq);
		}

		/* Delivered only if the IRR bit was newly set. */
		result = (orig_irr == 0);
		break;

	case APIC_DM_REMRD:
		printk(KERN_DEBUG "Ignoring delivery mode 3\n");
		break;

	case APIC_DM_SMI:
		printk(KERN_DEBUG "Ignoring guest SMI\n");
		break;
	case APIC_DM_NMI:
		printk(KERN_DEBUG "Ignoring guest NMI\n");
		break;

	case APIC_DM_INIT:
		/* Assert-level INIT puts the vcpu into wait-for-SIPI. */
		if (level) {
			if (vcpu->arch.mp_state == VCPU_MP_STATE_RUNNABLE)
				printk(KERN_DEBUG
				       "INIT on a runnable vcpu %d\n",
				       vcpu->vcpu_id);
			vcpu->arch.mp_state = VCPU_MP_STATE_INIT_RECEIVED;
			kvm_vcpu_kick(vcpu);
		} else {
			printk(KERN_DEBUG
			       "Ignoring de-assert INIT to vcpu %d\n",
			       vcpu->vcpu_id);
		}

		break;

	case APIC_DM_STARTUP:
		/* SIPI only takes effect on a vcpu waiting after INIT. */
		printk(KERN_DEBUG "SIPI to vcpu %d vector 0x%02x\n",
		       vcpu->vcpu_id, vector);
		if (vcpu->arch.mp_state == VCPU_MP_STATE_INIT_RECEIVED) {
			vcpu->arch.sipi_vector = vector;
			vcpu->arch.mp_state = VCPU_MP_STATE_SIPI_RECEIVED;
			if (waitqueue_active(&vcpu->wq))
				wake_up_interruptible(&vcpu->wq);
		}
		break;

	default:
		printk(KERN_ERR "TODO: unsupported delivery mode %x\n",
		       delivery_mode);
		break;
	}
	return result;
}
397 | |||
/*
 * Pick the next enabled LAPIC among the vcpus whose ids are set in
 * @bitmap, continuing round-robin from the previously chosen vcpu.
 * Returns NULL when no candidate vcpu has an enabled APIC.
 */
static struct kvm_lapic *kvm_apic_round_robin(struct kvm *kvm, u8 vector,
				       unsigned long bitmap)
{
	int last;
	int next;
	struct kvm_lapic *apic = NULL;

	last = kvm->arch.round_robin_prev_vcpu;
	next = last;

	/* Walk one full cycle starting after the previous choice. */
	do {
		if (++next == KVM_MAX_VCPUS)
			next = 0;
		if (kvm->vcpus[next] == NULL || !test_bit(next, &bitmap))
			continue;
		apic = kvm->vcpus[next]->arch.apic;
		if (apic && apic_enabled(apic))
			break;
		apic = NULL;	/* candidate exists but APIC is disabled */
	} while (next != last);
	kvm->arch.round_robin_prev_vcpu = next;

	if (!apic)
		printk(KERN_DEBUG "vcpu not ready for apic_round_robin\n");

	return apic;
}
425 | |||
426 | struct kvm_vcpu *kvm_get_lowest_prio_vcpu(struct kvm *kvm, u8 vector, | ||
427 | unsigned long bitmap) | ||
428 | { | ||
429 | struct kvm_lapic *apic; | ||
430 | |||
431 | apic = kvm_apic_round_robin(kvm, vector, bitmap); | ||
432 | if (apic) | ||
433 | return apic->vcpu; | ||
434 | return NULL; | ||
435 | } | ||
436 | |||
/*
 * Handle a guest EOI write: retire the highest in-service vector,
 * refresh the PPR, and for level-triggered interrupts (TMR bit set)
 * forward the EOI to the virtual IOAPIC.
 */
static void apic_set_eoi(struct kvm_lapic *apic)
{
	int vector = apic_find_highest_isr(apic);

	/*
	 * Not every write EOI will has corresponding ISR,
	 * one example is when Kernel check timer on setup_IO_APIC
	 */
	if (vector == -1)
		return;

	apic_clear_vector(vector, apic->regs + APIC_ISR);
	apic_update_ppr(apic);

	if (apic_test_and_clear_vector(vector, apic->regs + APIC_TMR))
		kvm_ioapic_update_eoi(apic->vcpu->kvm, vector);
}
454 | |||
455 | static void apic_send_ipi(struct kvm_lapic *apic) | ||
456 | { | ||
457 | u32 icr_low = apic_get_reg(apic, APIC_ICR); | ||
458 | u32 icr_high = apic_get_reg(apic, APIC_ICR2); | ||
459 | |||
460 | unsigned int dest = GET_APIC_DEST_FIELD(icr_high); | ||
461 | unsigned int short_hand = icr_low & APIC_SHORT_MASK; | ||
462 | unsigned int trig_mode = icr_low & APIC_INT_LEVELTRIG; | ||
463 | unsigned int level = icr_low & APIC_INT_ASSERT; | ||
464 | unsigned int dest_mode = icr_low & APIC_DEST_MASK; | ||
465 | unsigned int delivery_mode = icr_low & APIC_MODE_MASK; | ||
466 | unsigned int vector = icr_low & APIC_VECTOR_MASK; | ||
467 | |||
468 | struct kvm_vcpu *target; | ||
469 | struct kvm_vcpu *vcpu; | ||
470 | unsigned long lpr_map = 0; | ||
471 | int i; | ||
472 | |||
473 | apic_debug("icr_high 0x%x, icr_low 0x%x, " | ||
474 | "short_hand 0x%x, dest 0x%x, trig_mode 0x%x, level 0x%x, " | ||
475 | "dest_mode 0x%x, delivery_mode 0x%x, vector 0x%x\n", | ||
476 | icr_high, icr_low, short_hand, dest, | ||
477 | trig_mode, level, dest_mode, delivery_mode, vector); | ||
478 | |||
479 | for (i = 0; i < KVM_MAX_VCPUS; i++) { | ||
480 | vcpu = apic->vcpu->kvm->vcpus[i]; | ||
481 | if (!vcpu) | ||
482 | continue; | ||
483 | |||
484 | if (vcpu->arch.apic && | ||
485 | apic_match_dest(vcpu, apic, short_hand, dest, dest_mode)) { | ||
486 | if (delivery_mode == APIC_DM_LOWEST) | ||
487 | set_bit(vcpu->vcpu_id, &lpr_map); | ||
488 | else | ||
489 | __apic_accept_irq(vcpu->arch.apic, delivery_mode, | ||
490 | vector, level, trig_mode); | ||
491 | } | ||
492 | } | ||
493 | |||
494 | if (delivery_mode == APIC_DM_LOWEST) { | ||
495 | target = kvm_get_lowest_prio_vcpu(vcpu->kvm, vector, lpr_map); | ||
496 | if (target != NULL) | ||
497 | __apic_accept_irq(target->arch.apic, delivery_mode, | ||
498 | vector, level, trig_mode); | ||
499 | } | ||
500 | } | ||
501 | |||
/*
 * Synthesize the timer current-count register (TMCCT) from the host
 * clock: the initial count minus the bus cycles elapsed since the
 * timer was last armed, accounting for divider and periodic reload.
 */
static u32 apic_get_tmcct(struct kvm_lapic *apic)
{
	u64 counter_passed;
	ktime_t passed, now;
	u32 tmcct;

	ASSERT(apic != NULL);

	now = apic->timer.dev.base->get_time();
	tmcct = apic_get_reg(apic, APIC_TMICT);

	/* if initial count is 0, current count should also be 0 */
	if (tmcct == 0)
		return 0;

	if (unlikely(ktime_to_ns(now) <=
		ktime_to_ns(apic->timer.last_update))) {
		/* Wrap around */
		passed = ktime_add(( {
			(ktime_t) {
				.tv64 = KTIME_MAX -
				(apic->timer.last_update).tv64}; }
			), now);
		apic_debug("time elapsed\n");
	} else
		passed = ktime_sub(now, apic->timer.last_update);

	/* Convert elapsed ns to elapsed (divided) bus cycles. */
	counter_passed = div64_64(ktime_to_ns(passed),
				  (APIC_BUS_CYCLE_NS * apic->timer.divide_count));

	if (counter_passed > tmcct) {
		if (unlikely(!apic_lvtt_period(apic))) {
			/* one-shot timers stick at 0 until reset */
			tmcct = 0;
		} else {
			/*
			 * periodic timers reset to APIC_TMICT when they
			 * hit 0. The while loop simulates this happening N
			 * times. (counter_passed %= tmcct) would also work,
			 * but might be slower or not work on 32-bit??
			 */
			while (counter_passed > tmcct)
				counter_passed -= tmcct;
			tmcct -= counter_passed;
		}
	} else {
		tmcct -= counter_passed;
	}

	return tmcct;
}
553 | |||
/*
 * Read a 32-bit LAPIC register for MMIO emulation.  Most registers are
 * returned verbatim after refreshing the PPR; the timer current-count
 * register is synthesized from the host clock.
 */
static u32 __apic_read(struct kvm_lapic *apic, unsigned int offset)
{
	u32 val = 0;

	if (offset >= LAPIC_MMIO_LENGTH)
		return 0;

	switch (offset) {
	case APIC_ARBPRI:
		printk(KERN_WARNING "Access APIC ARBPRI register "
		       "which is for P6\n");
		break;

	case APIC_TMCCT:	/* Timer CCR */
		val = apic_get_tmcct(apic);
		break;

	default:
		apic_update_ppr(apic);
		val = apic_get_reg(apic, offset);
		break;
	}

	return val;
}
579 | |||
/*
 * MMIO read callback for the LAPIC window: reads the containing
 * 16-byte-aligned register and copies out the requested 1/2/4 bytes.
 * Accesses crossing a 4-byte register boundary are rejected.
 */
static void apic_mmio_read(struct kvm_io_device *this,
			   gpa_t address, int len, void *data)
{
	struct kvm_lapic *apic = (struct kvm_lapic *)this->private;
	unsigned int offset = address - apic->base_address;
	unsigned char alignment = offset & 0xf;
	u32 result;

	if ((alignment + len) > 4) {
		printk(KERN_ERR "KVM_APIC_READ: alignment error %lx %d",
		       (unsigned long)address, len);
		return;
	}
	result = __apic_read(apic, offset & ~0xf);

	switch (len) {
	case 1:
	case 2:
	case 4:
		memcpy(data, (char *)&result + alignment, len);
		break;
	default:
		printk(KERN_ERR "Local APIC read with len = %x, "
		       "should be 1,2, or 4 instead\n", len);
		break;
	}
}
607 | |||
608 | static void update_divide_count(struct kvm_lapic *apic) | ||
609 | { | ||
610 | u32 tmp1, tmp2, tdcr; | ||
611 | |||
612 | tdcr = apic_get_reg(apic, APIC_TDCR); | ||
613 | tmp1 = tdcr & 0xf; | ||
614 | tmp2 = ((tmp1 & 0x3) | ((tmp1 & 0x8) >> 1)) + 1; | ||
615 | apic->timer.divide_count = 0x1 << (tmp2 & 0x7); | ||
616 | |||
617 | apic_debug("timer divide count is 0x%x\n", | ||
618 | apic->timer.divide_count); | ||
619 | } | ||
620 | |||
/*
 * (Re)arm the emulated APIC timer from the current TMICT and divider.
 * The caller is expected to have cancelled any previously queued timer.
 */
static void start_apic_timer(struct kvm_lapic *apic)
{
	ktime_t now = apic->timer.dev.base->get_time();

	/* Remember when this countdown started, for TMCCT emulation. */
	apic->timer.last_update = now;

	/* One tick is APIC_BUS_CYCLE_NS scaled by the TDCR divider. */
	apic->timer.period = apic_get_reg(apic, APIC_TMICT) *
		APIC_BUS_CYCLE_NS * apic->timer.divide_count;
	/* Discard not-yet-injected ticks from any previous programming. */
	atomic_set(&apic->timer.pending, 0);
	hrtimer_start(&apic->timer.dev,
		      ktime_add_ns(now, apic->timer.period),
		      HRTIMER_MODE_ABS);

	apic_debug("%s: bus cycle is %" PRId64 "ns, now 0x%016"
		   PRIx64 ", "
		   "timer initial count 0x%x, period %lldns, "
		   "expire @ 0x%016" PRIx64 ".\n", __FUNCTION__,
		   APIC_BUS_CYCLE_NS, ktime_to_ns(now),
		   apic_get_reg(apic, APIC_TMICT),
		   apic->timer.period,
		   ktime_to_ns(ktime_add_ns(now,
				apic->timer.period)));
}
644 | |||
/*
 * MMIO write handler for the virtual local APIC window: decode the
 * target register and emulate its side effects.
 */
static void apic_mmio_write(struct kvm_io_device *this,
			    gpa_t address, int len, const void *data)
{
	struct kvm_lapic *apic = (struct kvm_lapic *)this->private;
	unsigned int offset = address - apic->base_address;
	unsigned char alignment = offset & 0xf;
	u32 val;

	/*
	 * APIC register must be aligned on 128-bits boundary.
	 * 32/64/128 bits registers must be accessed thru 32 bits.
	 * Refer SDM 8.4.1
	 */
	if (len != 4 || alignment) {
		if (printk_ratelimit())
			printk(KERN_ERR "apic write: bad size=%d %lx\n",
			       len, (long)address);
		return;
	}

	val = *(u32 *) data;

	/* too common printing */
	if (offset != APIC_EOI)
		apic_debug("%s: offset 0x%x with length 0x%x, and value is "
			   "0x%x\n", __FUNCTION__, offset, len, val);

	/* Collapse to the 16-byte-aligned register offset. */
	offset &= 0xff0;

	switch (offset) {
	case APIC_ID:		/* Local APIC ID */
		apic_set_reg(apic, APIC_ID, val);
		break;

	case APIC_TASKPRI:
		apic_set_tpr(apic, val & 0xff);
		break;

	case APIC_EOI:
		apic_set_eoi(apic);
		break;

	case APIC_LDR:
		apic_set_reg(apic, APIC_LDR, val & APIC_LDR_MASK);
		break;

	case APIC_DFR:
		/* Low 28 bits of DFR read as all-ones. */
		apic_set_reg(apic, APIC_DFR, val | 0x0FFFFFFF);
		break;

	case APIC_SPIV:
		apic_set_reg(apic, APIC_SPIV, val & 0x3ff);
		if (!(val & APIC_SPIV_APIC_ENABLED)) {
			/* Software-disable masks every LVT entry and
			 * drops pending timer ticks. */
			int i;
			u32 lvt_val;

			for (i = 0; i < APIC_LVT_NUM; i++) {
				lvt_val = apic_get_reg(apic,
						       APIC_LVTT + 0x10 * i);
				apic_set_reg(apic, APIC_LVTT + 0x10 * i,
					     lvt_val | APIC_LVT_MASKED);
			}
			atomic_set(&apic->timer.pending, 0);

		}
		break;

	case APIC_ICR:
		/* No delay here, so we always clear the pending bit */
		apic_set_reg(apic, APIC_ICR, val & ~(1 << 12));
		apic_send_ipi(apic);
		break;

	case APIC_ICR2:
		/* Only the destination field (bits 31:24) is writable. */
		apic_set_reg(apic, APIC_ICR2, val & 0xff000000);
		break;

	case APIC_LVTT:
	case APIC_LVTTHMR:
	case APIC_LVTPC:
	case APIC_LVT0:
	case APIC_LVT1:
	case APIC_LVTERR:
		/* TODO: Check vector */
		if (!apic_sw_enabled(apic))
			val |= APIC_LVT_MASKED;

		val &= apic_lvt_mask[(offset - APIC_LVTT) >> 4];
		apic_set_reg(apic, offset, val);

		break;

	case APIC_TMICT:
		/* Writing the initial count restarts the countdown. */
		hrtimer_cancel(&apic->timer.dev);
		apic_set_reg(apic, APIC_TMICT, val);
		start_apic_timer(apic);
		return;

	case APIC_TDCR:
		/* Bit 2 of TDCR is reserved; log attempts to set it. */
		if (val & 4)
			printk(KERN_ERR "KVM_WRITE:TDCR %x\n", val);
		apic_set_reg(apic, APIC_TDCR, val);
		update_divide_count(apic);
		break;

	default:
		apic_debug("Local APIC Write to read-only register %x\n",
			   offset);
		break;
	}

}
757 | |||
758 | static int apic_mmio_range(struct kvm_io_device *this, gpa_t addr) | ||
759 | { | ||
760 | struct kvm_lapic *apic = (struct kvm_lapic *)this->private; | ||
761 | int ret = 0; | ||
762 | |||
763 | |||
764 | if (apic_hw_enabled(apic) && | ||
765 | (addr >= apic->base_address) && | ||
766 | (addr < (apic->base_address + LAPIC_MMIO_LENGTH))) | ||
767 | ret = 1; | ||
768 | |||
769 | return ret; | ||
770 | } | ||
771 | |||
772 | void kvm_free_lapic(struct kvm_vcpu *vcpu) | ||
773 | { | ||
774 | if (!vcpu->arch.apic) | ||
775 | return; | ||
776 | |||
777 | hrtimer_cancel(&vcpu->arch.apic->timer.dev); | ||
778 | |||
779 | if (vcpu->arch.apic->regs_page) | ||
780 | __free_page(vcpu->arch.apic->regs_page); | ||
781 | |||
782 | kfree(vcpu->arch.apic); | ||
783 | } | ||
784 | |||
785 | /* | ||
786 | *---------------------------------------------------------------------- | ||
787 | * LAPIC interface | ||
788 | *---------------------------------------------------------------------- | ||
789 | */ | ||
790 | |||
791 | void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8) | ||
792 | { | ||
793 | struct kvm_lapic *apic = vcpu->arch.apic; | ||
794 | |||
795 | if (!apic) | ||
796 | return; | ||
797 | apic_set_tpr(apic, ((cr8 & 0x0f) << 4)); | ||
798 | } | ||
799 | |||
800 | u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu) | ||
801 | { | ||
802 | struct kvm_lapic *apic = vcpu->arch.apic; | ||
803 | u64 tpr; | ||
804 | |||
805 | if (!apic) | ||
806 | return 0; | ||
807 | tpr = (u64) apic_get_reg(apic, APIC_TASKPRI); | ||
808 | |||
809 | return (tpr & 0xf0) >> 4; | ||
810 | } | ||
811 | EXPORT_SYMBOL_GPL(kvm_lapic_get_cr8); | ||
812 | |||
813 | void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) | ||
814 | { | ||
815 | struct kvm_lapic *apic = vcpu->arch.apic; | ||
816 | |||
817 | if (!apic) { | ||
818 | value |= MSR_IA32_APICBASE_BSP; | ||
819 | vcpu->arch.apic_base = value; | ||
820 | return; | ||
821 | } | ||
822 | if (apic->vcpu->vcpu_id) | ||
823 | value &= ~MSR_IA32_APICBASE_BSP; | ||
824 | |||
825 | vcpu->arch.apic_base = value; | ||
826 | apic->base_address = apic->vcpu->arch.apic_base & | ||
827 | MSR_IA32_APICBASE_BASE; | ||
828 | |||
829 | /* with FSB delivery interrupt, we can restart APIC functionality */ | ||
830 | apic_debug("apic base msr is 0x%016" PRIx64 ", and base address is " | ||
831 | "0x%lx.\n", apic->vcpu->arch.apic_base, apic->base_address); | ||
832 | |||
833 | } | ||
834 | |||
/* Return the cached IA32_APICBASE MSR value for @vcpu. */
u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu)
{
	return vcpu->arch.apic_base;
}
EXPORT_SYMBOL_GPL(kvm_lapic_get_base);
840 | |||
/* Put the local APIC into its power-on register state. */
void kvm_lapic_reset(struct kvm_vcpu *vcpu)
{
	struct kvm_lapic *apic;
	int i;

	apic_debug("%s\n", __FUNCTION__);

	ASSERT(vcpu);
	apic = vcpu->arch.apic;
	ASSERT(apic != NULL);

	/* Stop the timer in case it's a reset to an active apic */
	hrtimer_cancel(&apic->timer.dev);

	apic_set_reg(apic, APIC_ID, vcpu->vcpu_id << 24);
	apic_set_reg(apic, APIC_LVR, APIC_VERSION);

	/* Mask every LVT entry ... */
	for (i = 0; i < APIC_LVT_NUM; i++)
		apic_set_reg(apic, APIC_LVTT + 0x10 * i, APIC_LVT_MASKED);
	/* ... then set LINT0 to ExtINT delivery (unmasked). */
	apic_set_reg(apic, APIC_LVT0,
		     SET_APIC_DELIVERY_MODE(0, APIC_MODE_EXTINT));

	apic_set_reg(apic, APIC_DFR, 0xffffffffU);
	apic_set_reg(apic, APIC_SPIV, 0xff);
	apic_set_reg(apic, APIC_TASKPRI, 0);
	apic_set_reg(apic, APIC_LDR, 0);
	apic_set_reg(apic, APIC_ESR, 0);
	apic_set_reg(apic, APIC_ICR, 0);
	apic_set_reg(apic, APIC_ICR2, 0);
	apic_set_reg(apic, APIC_TDCR, 0);
	apic_set_reg(apic, APIC_TMICT, 0);
	/* Clear all 256 bits of IRR, ISR and TMR. */
	for (i = 0; i < 8; i++) {
		apic_set_reg(apic, APIC_IRR + 0x10 * i, 0);
		apic_set_reg(apic, APIC_ISR + 0x10 * i, 0);
		apic_set_reg(apic, APIC_TMR + 0x10 * i, 0);
	}
	update_divide_count(apic);
	atomic_set(&apic->timer.pending, 0);
	/* vcpu 0 is the bootstrap processor. */
	if (vcpu->vcpu_id == 0)
		vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP;
	apic_update_ppr(apic);

	apic_debug(KERN_INFO "%s: vcpu=%p, id=%d, base_msr="
		   "0x%016" PRIx64 ", base_address=0x%0lx.\n", __FUNCTION__,
		   vcpu, kvm_apic_id(apic),
		   vcpu->arch.apic_base, apic->base_address);
}
EXPORT_SYMBOL_GPL(kvm_lapic_reset);
889 | |||
890 | int kvm_lapic_enabled(struct kvm_vcpu *vcpu) | ||
891 | { | ||
892 | struct kvm_lapic *apic = vcpu->arch.apic; | ||
893 | int ret = 0; | ||
894 | |||
895 | if (!apic) | ||
896 | return 0; | ||
897 | ret = apic_enabled(apic); | ||
898 | |||
899 | return ret; | ||
900 | } | ||
901 | EXPORT_SYMBOL_GPL(kvm_lapic_enabled); | ||
902 | |||
903 | /* | ||
904 | *---------------------------------------------------------------------- | ||
905 | * timer interface | ||
906 | *---------------------------------------------------------------------- | ||
907 | */ | ||
908 | |||
/* TODO: make sure __apic_timer_fn runs in current pCPU */
static int __apic_timer_fn(struct kvm_lapic *apic)
{
	int result = 0;
	wait_queue_head_t *q = &apic->vcpu->wq;

	/* Record the tick; it is injected later from vcpu context. */
	atomic_inc(&apic->timer.pending);
	if (waitqueue_active(q)) {
		/* A halted vcpu is waiting here: make it runnable. */
		apic->vcpu->arch.mp_state = VCPU_MP_STATE_RUNNABLE;
		wake_up_interruptible(q);
	}
	if (apic_lvtt_period(apic)) {
		/* Periodic mode: push the expiry one period forward and
		 * tell the caller to restart the hrtimer. */
		result = 1;
		apic->timer.dev.expires = ktime_add_ns(
					apic->timer.dev.expires,
					apic->timer.period);
	}
	return result;
}
928 | |||
929 | static int __inject_apic_timer_irq(struct kvm_lapic *apic) | ||
930 | { | ||
931 | int vector; | ||
932 | |||
933 | vector = apic_lvt_vector(apic, APIC_LVTT); | ||
934 | return __apic_accept_irq(apic, APIC_DM_FIXED, vector, 1, 0); | ||
935 | } | ||
936 | |||
937 | static enum hrtimer_restart apic_timer_fn(struct hrtimer *data) | ||
938 | { | ||
939 | struct kvm_lapic *apic; | ||
940 | int restart_timer = 0; | ||
941 | |||
942 | apic = container_of(data, struct kvm_lapic, timer.dev); | ||
943 | |||
944 | restart_timer = __apic_timer_fn(apic); | ||
945 | |||
946 | if (restart_timer) | ||
947 | return HRTIMER_RESTART; | ||
948 | else | ||
949 | return HRTIMER_NORESTART; | ||
950 | } | ||
951 | |||
952 | int kvm_create_lapic(struct kvm_vcpu *vcpu) | ||
953 | { | ||
954 | struct kvm_lapic *apic; | ||
955 | |||
956 | ASSERT(vcpu != NULL); | ||
957 | apic_debug("apic_init %d\n", vcpu->vcpu_id); | ||
958 | |||
959 | apic = kzalloc(sizeof(*apic), GFP_KERNEL); | ||
960 | if (!apic) | ||
961 | goto nomem; | ||
962 | |||
963 | vcpu->arch.apic = apic; | ||
964 | |||
965 | apic->regs_page = alloc_page(GFP_KERNEL); | ||
966 | if (apic->regs_page == NULL) { | ||
967 | printk(KERN_ERR "malloc apic regs error for vcpu %x\n", | ||
968 | vcpu->vcpu_id); | ||
969 | goto nomem_free_apic; | ||
970 | } | ||
971 | apic->regs = page_address(apic->regs_page); | ||
972 | memset(apic->regs, 0, PAGE_SIZE); | ||
973 | apic->vcpu = vcpu; | ||
974 | |||
975 | hrtimer_init(&apic->timer.dev, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); | ||
976 | apic->timer.dev.function = apic_timer_fn; | ||
977 | apic->base_address = APIC_DEFAULT_PHYS_BASE; | ||
978 | vcpu->arch.apic_base = APIC_DEFAULT_PHYS_BASE; | ||
979 | |||
980 | kvm_lapic_reset(vcpu); | ||
981 | apic->dev.read = apic_mmio_read; | ||
982 | apic->dev.write = apic_mmio_write; | ||
983 | apic->dev.in_range = apic_mmio_range; | ||
984 | apic->dev.private = apic; | ||
985 | |||
986 | return 0; | ||
987 | nomem_free_apic: | ||
988 | kfree(apic); | ||
989 | nomem: | ||
990 | return -ENOMEM; | ||
991 | } | ||
992 | EXPORT_SYMBOL_GPL(kvm_create_lapic); | ||
993 | |||
994 | int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu) | ||
995 | { | ||
996 | struct kvm_lapic *apic = vcpu->arch.apic; | ||
997 | int highest_irr; | ||
998 | |||
999 | if (!apic || !apic_enabled(apic)) | ||
1000 | return -1; | ||
1001 | |||
1002 | apic_update_ppr(apic); | ||
1003 | highest_irr = apic_find_highest_irr(apic); | ||
1004 | if ((highest_irr == -1) || | ||
1005 | ((highest_irr & 0xF0) <= apic_get_reg(apic, APIC_PROCPRI))) | ||
1006 | return -1; | ||
1007 | return highest_irr; | ||
1008 | } | ||
1009 | |||
1010 | int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu) | ||
1011 | { | ||
1012 | u32 lvt0 = apic_get_reg(vcpu->arch.apic, APIC_LVT0); | ||
1013 | int r = 0; | ||
1014 | |||
1015 | if (vcpu->vcpu_id == 0) { | ||
1016 | if (!apic_hw_enabled(vcpu->arch.apic)) | ||
1017 | r = 1; | ||
1018 | if ((lvt0 & APIC_LVT_MASKED) == 0 && | ||
1019 | GET_APIC_DELIVERY_MODE(lvt0) == APIC_MODE_EXTINT) | ||
1020 | r = 1; | ||
1021 | } | ||
1022 | return r; | ||
1023 | } | ||
1024 | |||
1025 | void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu) | ||
1026 | { | ||
1027 | struct kvm_lapic *apic = vcpu->arch.apic; | ||
1028 | |||
1029 | if (apic && apic_lvt_enabled(apic, APIC_LVTT) && | ||
1030 | atomic_read(&apic->timer.pending) > 0) { | ||
1031 | if (__inject_apic_timer_irq(apic)) | ||
1032 | atomic_dec(&apic->timer.pending); | ||
1033 | } | ||
1034 | } | ||
1035 | |||
1036 | void kvm_apic_timer_intr_post(struct kvm_vcpu *vcpu, int vec) | ||
1037 | { | ||
1038 | struct kvm_lapic *apic = vcpu->arch.apic; | ||
1039 | |||
1040 | if (apic && apic_lvt_vector(apic, APIC_LVTT) == vec) | ||
1041 | apic->timer.last_update = ktime_add_ns( | ||
1042 | apic->timer.last_update, | ||
1043 | apic->timer.period); | ||
1044 | } | ||
1045 | |||
/*
 * Acknowledge and return the next deliverable vector, or -1 if none:
 * the vector moves from IRR to ISR and PPR is recomputed.
 */
int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu)
{
	int vector = kvm_apic_has_interrupt(vcpu);
	struct kvm_lapic *apic = vcpu->arch.apic;

	if (vector == -1)
		return -1;

	apic_set_vector(vector, apic->regs + APIC_ISR);
	apic_update_ppr(apic);
	apic_clear_irr(vector, apic);
	return vector;
}
1059 | |||
1060 | void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu) | ||
1061 | { | ||
1062 | struct kvm_lapic *apic = vcpu->arch.apic; | ||
1063 | |||
1064 | apic->base_address = vcpu->arch.apic_base & | ||
1065 | MSR_IA32_APICBASE_BASE; | ||
1066 | apic_set_reg(apic, APIC_LVR, APIC_VERSION); | ||
1067 | apic_update_ppr(apic); | ||
1068 | hrtimer_cancel(&apic->timer.dev); | ||
1069 | update_divide_count(apic); | ||
1070 | start_apic_timer(apic); | ||
1071 | } | ||
1072 | |||
/*
 * Called when a vcpu moves to another physical cpu: if the apic timer
 * was queued, cancel and restart it so it is enqueued on the current
 * cpu's hrtimer base with the same absolute expiry.
 */
void kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
{
	struct kvm_lapic *apic = vcpu->arch.apic;
	struct hrtimer *timer;

	if (!apic)
		return;

	timer = &apic->timer.dev;
	if (hrtimer_cancel(timer))
		hrtimer_start(timer, timer->expires, HRTIMER_MODE_ABS);
}
EXPORT_SYMBOL_GPL(kvm_migrate_apic_timer);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c new file mode 100644 index 000000000000..401eb7ce3207 --- /dev/null +++ b/arch/x86/kvm/mmu.c | |||
@@ -0,0 +1,1805 @@ | |||
1 | /* | ||
2 | * Kernel-based Virtual Machine driver for Linux | ||
3 | * | ||
4 | * This module enables machines with Intel VT-x extensions to run virtual | ||
5 | * machines without emulation or binary translation. | ||
6 | * | ||
7 | * MMU support | ||
8 | * | ||
9 | * Copyright (C) 2006 Qumranet, Inc. | ||
10 | * | ||
11 | * Authors: | ||
12 | * Yaniv Kamay <yaniv@qumranet.com> | ||
13 | * Avi Kivity <avi@qumranet.com> | ||
14 | * | ||
15 | * This work is licensed under the terms of the GNU GPL, version 2. See | ||
16 | * the COPYING file in the top-level directory. | ||
17 | * | ||
18 | */ | ||
19 | |||
20 | #include "vmx.h" | ||
21 | #include "mmu.h" | ||
22 | |||
23 | #include <linux/kvm_host.h> | ||
24 | #include <linux/types.h> | ||
25 | #include <linux/string.h> | ||
26 | #include <linux/mm.h> | ||
27 | #include <linux/highmem.h> | ||
28 | #include <linux/module.h> | ||
29 | #include <linux/swap.h> | ||
30 | |||
31 | #include <asm/page.h> | ||
32 | #include <asm/cmpxchg.h> | ||
33 | #include <asm/io.h> | ||
34 | |||
35 | #undef MMU_DEBUG | ||
36 | |||
37 | #undef AUDIT | ||
38 | |||
39 | #ifdef AUDIT | ||
40 | static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg); | ||
41 | #else | ||
42 | static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) {} | ||
43 | #endif | ||
44 | |||
45 | #ifdef MMU_DEBUG | ||
46 | |||
47 | #define pgprintk(x...) do { if (dbg) printk(x); } while (0) | ||
48 | #define rmap_printk(x...) do { if (dbg) printk(x); } while (0) | ||
49 | |||
50 | #else | ||
51 | |||
52 | #define pgprintk(x...) do { } while (0) | ||
53 | #define rmap_printk(x...) do { } while (0) | ||
54 | |||
55 | #endif | ||
56 | |||
57 | #if defined(MMU_DEBUG) || defined(AUDIT) | ||
58 | static int dbg = 1; | ||
59 | #endif | ||
60 | |||
61 | #ifndef MMU_DEBUG | ||
62 | #define ASSERT(x) do { } while (0) | ||
63 | #else | ||
64 | #define ASSERT(x) \ | ||
65 | if (!(x)) { \ | ||
66 | printk(KERN_WARNING "assertion failed %s:%d: %s\n", \ | ||
67 | __FILE__, __LINE__, #x); \ | ||
68 | } | ||
69 | #endif | ||
70 | |||
71 | #define PT64_PT_BITS 9 | ||
72 | #define PT64_ENT_PER_PAGE (1 << PT64_PT_BITS) | ||
73 | #define PT32_PT_BITS 10 | ||
74 | #define PT32_ENT_PER_PAGE (1 << PT32_PT_BITS) | ||
75 | |||
76 | #define PT_WRITABLE_SHIFT 1 | ||
77 | |||
78 | #define PT_PRESENT_MASK (1ULL << 0) | ||
79 | #define PT_WRITABLE_MASK (1ULL << PT_WRITABLE_SHIFT) | ||
80 | #define PT_USER_MASK (1ULL << 2) | ||
81 | #define PT_PWT_MASK (1ULL << 3) | ||
82 | #define PT_PCD_MASK (1ULL << 4) | ||
83 | #define PT_ACCESSED_MASK (1ULL << 5) | ||
84 | #define PT_DIRTY_MASK (1ULL << 6) | ||
85 | #define PT_PAGE_SIZE_MASK (1ULL << 7) | ||
86 | #define PT_PAT_MASK (1ULL << 7) | ||
87 | #define PT_GLOBAL_MASK (1ULL << 8) | ||
88 | #define PT64_NX_SHIFT 63 | ||
89 | #define PT64_NX_MASK (1ULL << PT64_NX_SHIFT) | ||
90 | |||
91 | #define PT_PAT_SHIFT 7 | ||
92 | #define PT_DIR_PAT_SHIFT 12 | ||
93 | #define PT_DIR_PAT_MASK (1ULL << PT_DIR_PAT_SHIFT) | ||
94 | |||
95 | #define PT32_DIR_PSE36_SIZE 4 | ||
96 | #define PT32_DIR_PSE36_SHIFT 13 | ||
97 | #define PT32_DIR_PSE36_MASK \ | ||
98 | (((1ULL << PT32_DIR_PSE36_SIZE) - 1) << PT32_DIR_PSE36_SHIFT) | ||
99 | |||
100 | |||
101 | #define PT_FIRST_AVAIL_BITS_SHIFT 9 | ||
102 | #define PT64_SECOND_AVAIL_BITS_SHIFT 52 | ||
103 | |||
104 | #define PT_SHADOW_IO_MARK (1ULL << PT_FIRST_AVAIL_BITS_SHIFT) | ||
105 | |||
106 | #define VALID_PAGE(x) ((x) != INVALID_PAGE) | ||
107 | |||
108 | #define PT64_LEVEL_BITS 9 | ||
109 | |||
110 | #define PT64_LEVEL_SHIFT(level) \ | ||
111 | (PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS) | ||
112 | |||
113 | #define PT64_LEVEL_MASK(level) \ | ||
114 | (((1ULL << PT64_LEVEL_BITS) - 1) << PT64_LEVEL_SHIFT(level)) | ||
115 | |||
116 | #define PT64_INDEX(address, level)\ | ||
117 | (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1)) | ||
118 | |||
119 | |||
120 | #define PT32_LEVEL_BITS 10 | ||
121 | |||
122 | #define PT32_LEVEL_SHIFT(level) \ | ||
123 | (PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS) | ||
124 | |||
125 | #define PT32_LEVEL_MASK(level) \ | ||
126 | (((1ULL << PT32_LEVEL_BITS) - 1) << PT32_LEVEL_SHIFT(level)) | ||
127 | |||
128 | #define PT32_INDEX(address, level)\ | ||
129 | (((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1)) | ||
130 | |||
131 | |||
132 | #define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1)) | ||
133 | #define PT64_DIR_BASE_ADDR_MASK \ | ||
134 | (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + PT64_LEVEL_BITS)) - 1)) | ||
135 | |||
136 | #define PT32_BASE_ADDR_MASK PAGE_MASK | ||
137 | #define PT32_DIR_BASE_ADDR_MASK \ | ||
138 | (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1)) | ||
139 | |||
140 | #define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \ | ||
141 | | PT64_NX_MASK) | ||
142 | |||
143 | #define PFERR_PRESENT_MASK (1U << 0) | ||
144 | #define PFERR_WRITE_MASK (1U << 1) | ||
145 | #define PFERR_USER_MASK (1U << 2) | ||
146 | #define PFERR_FETCH_MASK (1U << 4) | ||
147 | |||
148 | #define PT64_ROOT_LEVEL 4 | ||
149 | #define PT32_ROOT_LEVEL 2 | ||
150 | #define PT32E_ROOT_LEVEL 3 | ||
151 | |||
152 | #define PT_DIRECTORY_LEVEL 2 | ||
153 | #define PT_PAGE_TABLE_LEVEL 1 | ||
154 | |||
155 | #define RMAP_EXT 4 | ||
156 | |||
157 | #define ACC_EXEC_MASK 1 | ||
158 | #define ACC_WRITE_MASK PT_WRITABLE_MASK | ||
159 | #define ACC_USER_MASK PT_USER_MASK | ||
160 | #define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK) | ||
161 | |||
/*
 * Overflow node for the reverse map: holds up to RMAP_EXT shadow-pte
 * pointers for one gfn, chained through ->more when a gfn has many
 * mappings.
 */
struct kvm_rmap_desc {
	u64 *shadow_ptes[RMAP_EXT];
	struct kvm_rmap_desc *more;
};
166 | |||
167 | static struct kmem_cache *pte_chain_cache; | ||
168 | static struct kmem_cache *rmap_desc_cache; | ||
169 | static struct kmem_cache *mmu_page_header_cache; | ||
170 | |||
171 | static u64 __read_mostly shadow_trap_nonpresent_pte; | ||
172 | static u64 __read_mostly shadow_notrap_nonpresent_pte; | ||
173 | |||
/*
 * Record the vendor-chosen spte encodings used for not-present entries
 * (trapping vs. non-trapping); the rest of the mmu compares against
 * these globals.
 */
void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte)
{
	shadow_trap_nonpresent_pte = trap_pte;
	shadow_notrap_nonpresent_pte = notrap_pte;
}
EXPORT_SYMBOL_GPL(kvm_mmu_set_nonpresent_ptes);
180 | |||
/* Nonzero iff the guest has CR0.WP set. */
static int is_write_protection(struct kvm_vcpu *vcpu)
{
	return vcpu->arch.cr0 & X86_CR0_WP;
}
185 | |||
/* Whether the guest cpuid reports PSE-36 support; hard-wired to yes. */
static int is_cpuid_PSE36(void)
{
	return 1;
}
190 | |||
/* Nonzero iff the guest has EFER.NX enabled. */
static int is_nx(struct kvm_vcpu *vcpu)
{
	return vcpu->arch.shadow_efer & EFER_NX;
}
195 | |||
/* Nonzero iff the (guest) pte has its present bit set. */
static int is_present_pte(unsigned long pte)
{
	return pte & PT_PRESENT_MASK;
}
200 | |||
201 | static int is_shadow_present_pte(u64 pte) | ||
202 | { | ||
203 | pte &= ~PT_SHADOW_IO_MARK; | ||
204 | return pte != shadow_trap_nonpresent_pte | ||
205 | && pte != shadow_notrap_nonpresent_pte; | ||
206 | } | ||
207 | |||
/* Nonzero iff the pte permits writes.  (Name misspelling is historic.) */
static int is_writeble_pte(unsigned long pte)
{
	return pte & PT_WRITABLE_MASK;
}
212 | |||
/* Nonzero iff the pte has its dirty bit set. */
static int is_dirty_pte(unsigned long pte)
{
	return pte & PT_DIRTY_MASK;
}
217 | |||
/* Nonzero iff the spte carries our software "maps mmio" mark bit. */
static int is_io_pte(unsigned long pte)
{
	return pte & PT_SHADOW_IO_MARK;
}
222 | |||
/* Nonzero iff the spte maps a page and so participates in the rmap. */
static int is_rmap_pte(u64 pte)
{
	return pte != shadow_trap_nonpresent_pte
		&& pte != shadow_notrap_nonpresent_pte;
}
228 | |||
229 | static gfn_t pse36_gfn_delta(u32 gpte) | ||
230 | { | ||
231 | int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT; | ||
232 | |||
233 | return (gpte & PT32_DIR_PSE36_MASK) << shift; | ||
234 | } | ||
235 | |||
/*
 * Store a shadow pte via set_64bit so the 64-bit write is not torn on
 * 32-bit hosts; the pointer cast differs only to satisfy the per-arch
 * set_64bit prototype.
 */
static void set_shadow_pte(u64 *sptep, u64 spte)
{
#ifdef CONFIG_X86_64
	set_64bit((unsigned long *)sptep, spte);
#else
	set_64bit((unsigned long long *)sptep, spte);
#endif
}
244 | |||
245 | static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, | ||
246 | struct kmem_cache *base_cache, int min) | ||
247 | { | ||
248 | void *obj; | ||
249 | |||
250 | if (cache->nobjs >= min) | ||
251 | return 0; | ||
252 | while (cache->nobjs < ARRAY_SIZE(cache->objects)) { | ||
253 | obj = kmem_cache_zalloc(base_cache, GFP_KERNEL); | ||
254 | if (!obj) | ||
255 | return -ENOMEM; | ||
256 | cache->objects[cache->nobjs++] = obj; | ||
257 | } | ||
258 | return 0; | ||
259 | } | ||
260 | |||
261 | static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc) | ||
262 | { | ||
263 | while (mc->nobjs) | ||
264 | kfree(mc->objects[--mc->nobjs]); | ||
265 | } | ||
266 | |||
267 | static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache, | ||
268 | int min) | ||
269 | { | ||
270 | struct page *page; | ||
271 | |||
272 | if (cache->nobjs >= min) | ||
273 | return 0; | ||
274 | while (cache->nobjs < ARRAY_SIZE(cache->objects)) { | ||
275 | page = alloc_page(GFP_KERNEL); | ||
276 | if (!page) | ||
277 | return -ENOMEM; | ||
278 | set_page_private(page, 0); | ||
279 | cache->objects[cache->nobjs++] = page_address(page); | ||
280 | } | ||
281 | return 0; | ||
282 | } | ||
283 | |||
284 | static void mmu_free_memory_cache_page(struct kvm_mmu_memory_cache *mc) | ||
285 | { | ||
286 | while (mc->nobjs) | ||
287 | free_page((unsigned long)mc->objects[--mc->nobjs]); | ||
288 | } | ||
289 | |||
290 | static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu) | ||
291 | { | ||
292 | int r; | ||
293 | |||
294 | kvm_mmu_free_some_pages(vcpu); | ||
295 | r = mmu_topup_memory_cache(&vcpu->arch.mmu_pte_chain_cache, | ||
296 | pte_chain_cache, 4); | ||
297 | if (r) | ||
298 | goto out; | ||
299 | r = mmu_topup_memory_cache(&vcpu->arch.mmu_rmap_desc_cache, | ||
300 | rmap_desc_cache, 1); | ||
301 | if (r) | ||
302 | goto out; | ||
303 | r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8); | ||
304 | if (r) | ||
305 | goto out; | ||
306 | r = mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache, | ||
307 | mmu_page_header_cache, 4); | ||
308 | out: | ||
309 | return r; | ||
310 | } | ||
311 | |||
/* Release everything still held in the per-vcpu mmu object caches. */
static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
{
	mmu_free_memory_cache(&vcpu->arch.mmu_pte_chain_cache);
	mmu_free_memory_cache(&vcpu->arch.mmu_rmap_desc_cache);
	mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache);
	mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache);
}
319 | |||
320 | static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc, | ||
321 | size_t size) | ||
322 | { | ||
323 | void *p; | ||
324 | |||
325 | BUG_ON(!mc->nobjs); | ||
326 | p = mc->objects[--mc->nobjs]; | ||
327 | memset(p, 0, size); | ||
328 | return p; | ||
329 | } | ||
330 | |||
/* Take a zeroed pte_chain from the vcpu's preallocated cache. */
static struct kvm_pte_chain *mmu_alloc_pte_chain(struct kvm_vcpu *vcpu)
{
	return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_chain_cache,
				      sizeof(struct kvm_pte_chain));
}
336 | |||
/* Return a pte_chain to the allocator. */
static void mmu_free_pte_chain(struct kvm_pte_chain *pc)
{
	kfree(pc);
}
341 | |||
/* Take a zeroed rmap descriptor from the vcpu's preallocated cache. */
static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu)
{
	return mmu_memory_cache_alloc(&vcpu->arch.mmu_rmap_desc_cache,
				      sizeof(struct kvm_rmap_desc));
}
347 | |||
/* Return an rmap descriptor to the allocator. */
static void mmu_free_rmap_desc(struct kvm_rmap_desc *rd)
{
	kfree(rd);
}
352 | |||
353 | /* | ||
354 | * Take gfn and return the reverse mapping to it. | ||
355 | * Note: gfn must be unaliased before this function get called | ||
356 | */ | ||
357 | |||
358 | static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn) | ||
359 | { | ||
360 | struct kvm_memory_slot *slot; | ||
361 | |||
362 | slot = gfn_to_memslot(kvm, gfn); | ||
363 | return &slot->rmap[gfn - slot->base_gfn]; | ||
364 | } | ||
365 | |||
/*
 * Reverse mapping data structures:
 *
 * If rmapp bit zero is zero, then rmapp point to the shadw page table entry
 * that points to page_address(page).
 *
 * If rmapp bit zero is one, (then rmap & ~1) points to a struct kvm_rmap_desc
 * containing more mappings.
 */
static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
{
	struct kvm_mmu_page *sp;
	struct kvm_rmap_desc *desc;
	unsigned long *rmapp;
	int i;

	if (!is_rmap_pte(*spte))
		return;
	gfn = unalias_gfn(vcpu->kvm, gfn);
	sp = page_header(__pa(spte));
	/* Remember which gfn this slot of the shadow page maps. */
	sp->gfns[spte - sp->spt] = gfn;
	rmapp = gfn_to_rmap(vcpu->kvm, gfn);
	if (!*rmapp) {
		/* First mapping: store the spte pointer directly. */
		rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte);
		*rmapp = (unsigned long)spte;
	} else if (!(*rmapp & 1)) {
		/* Second mapping: switch to a descriptor holding both. */
		rmap_printk("rmap_add: %p %llx 1->many\n", spte, *spte);
		desc = mmu_alloc_rmap_desc(vcpu);
		desc->shadow_ptes[0] = (u64 *)*rmapp;
		desc->shadow_ptes[1] = spte;
		*rmapp = (unsigned long)desc | 1;
	} else {
		/* Walk to the last descriptor, extend the chain if it is
		 * full, and drop the spte into the first free slot. */
		rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte);
		desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
		while (desc->shadow_ptes[RMAP_EXT-1] && desc->more)
			desc = desc->more;
		if (desc->shadow_ptes[RMAP_EXT-1]) {
			desc->more = mmu_alloc_rmap_desc(vcpu);
			desc = desc->more;
		}
		for (i = 0; desc->shadow_ptes[i]; ++i)
			;
		desc->shadow_ptes[i] = spte;
	}
}
411 | |||
/*
 * Remove slot @i from @desc, compacting by moving the last occupied
 * slot into the hole, and free/unchain the descriptor when it becomes
 * empty.
 */
static void rmap_desc_remove_entry(unsigned long *rmapp,
				   struct kvm_rmap_desc *desc,
				   int i,
				   struct kvm_rmap_desc *prev_desc)
{
	int j;

	/* Find the last occupied slot (>= i). */
	for (j = RMAP_EXT - 1; !desc->shadow_ptes[j] && j > i; --j)
		;
	desc->shadow_ptes[i] = desc->shadow_ptes[j];
	desc->shadow_ptes[j] = NULL;
	if (j != 0)
		return;
	/* Descriptor now empty: unlink it from the chain. */
	if (!prev_desc && !desc->more)
		/* Sole descriptor with its only entry gone: slot 0 was
		 * just cleared, so this leaves the rmap empty. */
		*rmapp = (unsigned long)desc->shadow_ptes[0];
	else
		if (prev_desc)
			prev_desc->more = desc->more;
		else
			*rmapp = (unsigned long)desc->more | 1;
	mmu_free_rmap_desc(desc);
}
434 | |||
/*
 * Drop the reverse-map entry for *spte and release the guest page it
 * referenced, marking the page dirty if the spte allowed writes.
 */
static void rmap_remove(struct kvm *kvm, u64 *spte)
{
	struct kvm_rmap_desc *desc;
	struct kvm_rmap_desc *prev_desc;
	struct kvm_mmu_page *sp;
	struct page *page;
	unsigned long *rmapp;
	int i;

	if (!is_rmap_pte(*spte))
		return;
	sp = page_header(__pa(spte));
	page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT);
	mark_page_accessed(page);
	if (is_writeble_pte(*spte))
		kvm_release_page_dirty(page);
	else
		kvm_release_page_clean(page);
	rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt]);
	if (!*rmapp) {
		/* Removing an spte that was never added is a bug. */
		printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte);
		BUG();
	} else if (!(*rmapp & 1)) {
		/* Direct pointer: it must match this spte. */
		rmap_printk("rmap_remove: %p %llx 1->0\n", spte, *spte);
		if ((u64 *)*rmapp != spte) {
			printk(KERN_ERR "rmap_remove: %p %llx 1->BUG\n",
			       spte, *spte);
			BUG();
		}
		*rmapp = 0;
	} else {
		/* Search the descriptor chain for the matching slot. */
		rmap_printk("rmap_remove: %p %llx many->many\n", spte, *spte);
		desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
		prev_desc = NULL;
		while (desc) {
			for (i = 0; i < RMAP_EXT && desc->shadow_ptes[i]; ++i)
				if (desc->shadow_ptes[i] == spte) {
					rmap_desc_remove_entry(rmapp,
							       desc, i,
							       prev_desc);
					return;
				}
			prev_desc = desc;
			desc = desc->more;
		}
		BUG();
	}
}
483 | |||
484 | static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte) | ||
485 | { | ||
486 | struct kvm_rmap_desc *desc; | ||
487 | struct kvm_rmap_desc *prev_desc; | ||
488 | u64 *prev_spte; | ||
489 | int i; | ||
490 | |||
491 | if (!*rmapp) | ||
492 | return NULL; | ||
493 | else if (!(*rmapp & 1)) { | ||
494 | if (!spte) | ||
495 | return (u64 *)*rmapp; | ||
496 | return NULL; | ||
497 | } | ||
498 | desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); | ||
499 | prev_desc = NULL; | ||
500 | prev_spte = NULL; | ||
501 | while (desc) { | ||
502 | for (i = 0; i < RMAP_EXT && desc->shadow_ptes[i]; ++i) { | ||
503 | if (prev_spte == spte) | ||
504 | return desc->shadow_ptes[i]; | ||
505 | prev_spte = desc->shadow_ptes[i]; | ||
506 | } | ||
507 | desc = desc->more; | ||
508 | } | ||
509 | return NULL; | ||
510 | } | ||
511 | |||
512 | static void rmap_write_protect(struct kvm *kvm, u64 gfn) | ||
513 | { | ||
514 | unsigned long *rmapp; | ||
515 | u64 *spte; | ||
516 | |||
517 | gfn = unalias_gfn(kvm, gfn); | ||
518 | rmapp = gfn_to_rmap(kvm, gfn); | ||
519 | |||
520 | spte = rmap_next(kvm, rmapp, NULL); | ||
521 | while (spte) { | ||
522 | BUG_ON(!spte); | ||
523 | BUG_ON(!(*spte & PT_PRESENT_MASK)); | ||
524 | rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte); | ||
525 | if (is_writeble_pte(*spte)) | ||
526 | set_shadow_pte(spte, *spte & ~PT_WRITABLE_MASK); | ||
527 | kvm_flush_remote_tlbs(kvm); | ||
528 | spte = rmap_next(kvm, rmapp, spte); | ||
529 | } | ||
530 | } | ||
531 | |||
#ifdef MMU_DEBUG
/*
 * Debug check: return 1 iff every entry of the shadow page table @spt
 * is the not-present trap value (ignoring the I/O marker bit).
 */
static int is_empty_shadow_page(u64 *spt)
{
	u64 *pos;
	u64 *end;

	for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++)
		if ((*pos & ~PT_SHADOW_IO_MARK) != shadow_trap_nonpresent_pte) {
			printk(KERN_ERR "%s: %p %llx\n", __FUNCTION__,
			       pos, *pos);
			return 0;
		}
	return 1;
}
#endif
547 | |||
548 | static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp) | ||
549 | { | ||
550 | ASSERT(is_empty_shadow_page(sp->spt)); | ||
551 | list_del(&sp->link); | ||
552 | __free_page(virt_to_page(sp->spt)); | ||
553 | __free_page(virt_to_page(sp->gfns)); | ||
554 | kfree(sp); | ||
555 | ++kvm->arch.n_free_mmu_pages; | ||
556 | } | ||
557 | |||
/*
 * Hash a gfn into the shadow-page hash table; callers reduce the result
 * modulo KVM_NUM_MMU_PAGES.  The identity is good enough here since
 * the modulo spreads consecutive gfns across buckets.
 */
static unsigned kvm_page_table_hashfn(gfn_t gfn)
{
	return gfn;
}
562 | |||
563 | static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, | ||
564 | u64 *parent_pte) | ||
565 | { | ||
566 | struct kvm_mmu_page *sp; | ||
567 | |||
568 | if (!vcpu->kvm->arch.n_free_mmu_pages) | ||
569 | return NULL; | ||
570 | |||
571 | sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache, sizeof *sp); | ||
572 | sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE); | ||
573 | sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE); | ||
574 | set_page_private(virt_to_page(sp->spt), (unsigned long)sp); | ||
575 | list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages); | ||
576 | ASSERT(is_empty_shadow_page(sp->spt)); | ||
577 | sp->slot_bitmap = 0; | ||
578 | sp->multimapped = 0; | ||
579 | sp->parent_pte = parent_pte; | ||
580 | --vcpu->kvm->arch.n_free_mmu_pages; | ||
581 | return sp; | ||
582 | } | ||
583 | |||
/*
 * Record @parent_pte as a parent of shadow page @sp.
 *
 * A page with a single parent stores it directly in sp->parent_pte;
 * the second parent promotes the page to 'multimapped', moving parents
 * into a chain of kvm_pte_chain descriptors hanging off sp->parent_ptes.
 */
static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu,
				    struct kvm_mmu_page *sp, u64 *parent_pte)
{
	struct kvm_pte_chain *pte_chain;
	struct hlist_node *node;
	int i;

	if (!parent_pte)
		return;
	if (!sp->multimapped) {
		u64 *old = sp->parent_pte;

		if (!old) {
			sp->parent_pte = parent_pte;
			return;
		}
		/* Second parent: convert to chained representation. */
		sp->multimapped = 1;
		pte_chain = mmu_alloc_pte_chain(vcpu);
		INIT_HLIST_HEAD(&sp->parent_ptes);
		hlist_add_head(&pte_chain->link, &sp->parent_ptes);
		pte_chain->parent_ptes[0] = old;
	}
	/* Try to reuse a free slot in an existing chain element. */
	hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link) {
		if (pte_chain->parent_ptes[NR_PTE_CHAIN_ENTRIES-1])
			continue;
		for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i)
			if (!pte_chain->parent_ptes[i]) {
				pte_chain->parent_ptes[i] = parent_pte;
				return;
			}
	}
	/* All chain elements full: prepend a fresh one. */
	pte_chain = mmu_alloc_pte_chain(vcpu);
	BUG_ON(!pte_chain);
	hlist_add_head(&pte_chain->link, &sp->parent_ptes);
	pte_chain->parent_ptes[0] = parent_pte;
}
620 | |||
/*
 * Remove @parent_pte from shadow page @sp's parent set; the inverse of
 * mmu_page_add_parent_pte().  BUGs if the parent is not found.
 */
static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
				       u64 *parent_pte)
{
	struct kvm_pte_chain *pte_chain;
	struct hlist_node *node;
	int i;

	if (!sp->multimapped) {
		/* Single-parent representation. */
		BUG_ON(sp->parent_pte != parent_pte);
		sp->parent_pte = NULL;
		return;
	}
	hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
		for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
			if (!pte_chain->parent_ptes[i])
				break;
			if (pte_chain->parent_ptes[i] != parent_pte)
				continue;
			/* Found: shift the remaining entries down. */
			while (i + 1 < NR_PTE_CHAIN_ENTRIES
			       && pte_chain->parent_ptes[i + 1]) {
				pte_chain->parent_ptes[i]
					= pte_chain->parent_ptes[i + 1];
				++i;
			}
			pte_chain->parent_ptes[i] = NULL;
			if (i == 0) {
				/* Chain element emptied: free it. */
				hlist_del(&pte_chain->link);
				mmu_free_pte_chain(pte_chain);
				if (hlist_empty(&sp->parent_ptes)) {
					sp->multimapped = 0;
					sp->parent_pte = NULL;
				}
			}
			return;
		}
	BUG();
}
658 | |||
659 | static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn) | ||
660 | { | ||
661 | unsigned index; | ||
662 | struct hlist_head *bucket; | ||
663 | struct kvm_mmu_page *sp; | ||
664 | struct hlist_node *node; | ||
665 | |||
666 | pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn); | ||
667 | index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES; | ||
668 | bucket = &kvm->arch.mmu_page_hash[index]; | ||
669 | hlist_for_each_entry(sp, node, bucket, hash_link) | ||
670 | if (sp->gfn == gfn && !sp->role.metaphysical) { | ||
671 | pgprintk("%s: found role %x\n", | ||
672 | __FUNCTION__, sp->role.word); | ||
673 | return sp; | ||
674 | } | ||
675 | return NULL; | ||
676 | } | ||
677 | |||
/*
 * Look up — or create — the shadow page for (@gfn, role).  The role
 * packs everything that must be distinct for separate shadow pages:
 * guest paging depth, shadow level, metaphysical flag, access bits and
 * (for 32-bit guests) the quadrant of the guest page being shadowed.
 *
 * On creation, *new_page is set (if non-NULL), the page is prefetched
 * and, unless metaphysical, the guest frame is write-protected.
 * Returns NULL only when the shadow page quota is exhausted.
 */
static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
					     gfn_t gfn,
					     gva_t gaddr,
					     unsigned level,
					     int metaphysical,
					     unsigned access,
					     u64 *parent_pte,
					     bool *new_page)
{
	union kvm_mmu_page_role role;
	unsigned index;
	unsigned quadrant;
	struct hlist_head *bucket;
	struct kvm_mmu_page *sp;
	struct hlist_node *node;

	role.word = 0;
	role.glevels = vcpu->arch.mmu.root_level;
	role.level = level;
	role.metaphysical = metaphysical;
	role.access = access;
	if (vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) {
		/*
		 * A 32-bit guest table covers less address space than a
		 * shadow table; the quadrant says which fraction of the
		 * guest page this shadow page covers.
		 */
		quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
		quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
		role.quadrant = quadrant;
	}
	pgprintk("%s: looking gfn %lx role %x\n", __FUNCTION__,
		 gfn, role.word);
	index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
	bucket = &vcpu->kvm->arch.mmu_page_hash[index];
	hlist_for_each_entry(sp, node, bucket, hash_link)
		if (sp->gfn == gfn && sp->role.word == role.word) {
			/* Existing page: just link the new parent. */
			mmu_page_add_parent_pte(vcpu, sp, parent_pte);
			pgprintk("%s: found\n", __FUNCTION__);
			return sp;
		}
	sp = kvm_mmu_alloc_page(vcpu, parent_pte);
	if (!sp)
		return sp;
	pgprintk("%s: adding gfn %lx role %x\n", __FUNCTION__, gfn, role.word);
	sp->gfn = gfn;
	sp->role = role;
	hlist_add_head(&sp->hash_link, bucket);
	vcpu->arch.mmu.prefetch_page(vcpu, sp);
	if (!metaphysical)
		rmap_write_protect(vcpu->kvm, gfn);
	if (new_page)
		*new_page = 1;
	return sp;
}
728 | |||
/*
 * Clear every entry of shadow page @sp, detaching children: leaf pages
 * drop rmaps (releasing guest page references); non-leaf pages unlink
 * themselves as parents of the child shadow pages.  Remote TLBs are
 * flushed afterwards since present translations were removed.
 */
static void kvm_mmu_page_unlink_children(struct kvm *kvm,
					 struct kvm_mmu_page *sp)
{
	unsigned i;
	u64 *pt;
	u64 ent;

	pt = sp->spt;

	if (sp->role.level == PT_PAGE_TABLE_LEVEL) {
		for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
			if (is_shadow_present_pte(pt[i]))
				rmap_remove(kvm, &pt[i]);
			pt[i] = shadow_trap_nonpresent_pte;
		}
		kvm_flush_remote_tlbs(kvm);
		return;
	}

	for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
		ent = pt[i];

		/* Clear first, then unlink from the child page. */
		pt[i] = shadow_trap_nonpresent_pte;
		if (!is_shadow_present_pte(ent))
			continue;
		ent &= PT64_BASE_ADDR_MASK;
		mmu_page_remove_parent_pte(page_header(ent), &pt[i]);
	}
	kvm_flush_remote_tlbs(kvm);
}
759 | |||
/* Drop one parent reference to shadow page @sp. */
static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte)
{
	mmu_page_remove_parent_pte(sp, parent_pte);
}
764 | |||
765 | static void kvm_mmu_reset_last_pte_updated(struct kvm *kvm) | ||
766 | { | ||
767 | int i; | ||
768 | |||
769 | for (i = 0; i < KVM_MAX_VCPUS; ++i) | ||
770 | if (kvm->vcpus[i]) | ||
771 | kvm->vcpus[i]->arch.last_pte_updated = NULL; | ||
772 | } | ||
773 | |||
/*
 * Tear down shadow page @sp: detach it from all parents (zapping their
 * sptes), unlink its children, and free it — unless it is still in use
 * as a root (root_count), in which case it stays allocated but is moved
 * to the head of the active list.
 */
static void kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
	u64 *parent_pte;

	++kvm->stat.mmu_shadow_zapped;
	while (sp->multimapped || sp->parent_pte) {
		if (!sp->multimapped)
			parent_pte = sp->parent_pte;
		else {
			struct kvm_pte_chain *chain;

			/* Peel parents off the chain one at a time. */
			chain = container_of(sp->parent_ptes.first,
					     struct kvm_pte_chain, link);
			parent_pte = chain->parent_ptes[0];
		}
		BUG_ON(!parent_pte);
		kvm_mmu_put_page(sp, parent_pte);
		set_shadow_pte(parent_pte, shadow_trap_nonpresent_pte);
	}
	kvm_mmu_page_unlink_children(kvm, sp);
	if (!sp->root_count) {
		hlist_del(&sp->hash_link);
		kvm_mmu_free_page(kvm, sp);
	} else
		list_move(&sp->link, &kvm->arch.active_mmu_pages);
	kvm_mmu_reset_last_pte_updated(kvm);
}
801 | |||
802 | /* | ||
803 | * Changing the number of mmu pages allocated to the vm | ||
804 | * Note: if kvm_nr_mmu_pages is too small, you will get dead lock | ||
805 | */ | ||
806 | void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages) | ||
807 | { | ||
808 | /* | ||
809 | * If we set the number of mmu pages to be smaller be than the | ||
810 | * number of actived pages , we must to free some mmu pages before we | ||
811 | * change the value | ||
812 | */ | ||
813 | |||
814 | if ((kvm->arch.n_alloc_mmu_pages - kvm->arch.n_free_mmu_pages) > | ||
815 | kvm_nr_mmu_pages) { | ||
816 | int n_used_mmu_pages = kvm->arch.n_alloc_mmu_pages | ||
817 | - kvm->arch.n_free_mmu_pages; | ||
818 | |||
819 | while (n_used_mmu_pages > kvm_nr_mmu_pages) { | ||
820 | struct kvm_mmu_page *page; | ||
821 | |||
822 | page = container_of(kvm->arch.active_mmu_pages.prev, | ||
823 | struct kvm_mmu_page, link); | ||
824 | kvm_mmu_zap_page(kvm, page); | ||
825 | n_used_mmu_pages--; | ||
826 | } | ||
827 | kvm->arch.n_free_mmu_pages = 0; | ||
828 | } | ||
829 | else | ||
830 | kvm->arch.n_free_mmu_pages += kvm_nr_mmu_pages | ||
831 | - kvm->arch.n_alloc_mmu_pages; | ||
832 | |||
833 | kvm->arch.n_alloc_mmu_pages = kvm_nr_mmu_pages; | ||
834 | } | ||
835 | |||
836 | static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) | ||
837 | { | ||
838 | unsigned index; | ||
839 | struct hlist_head *bucket; | ||
840 | struct kvm_mmu_page *sp; | ||
841 | struct hlist_node *node, *n; | ||
842 | int r; | ||
843 | |||
844 | pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn); | ||
845 | r = 0; | ||
846 | index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES; | ||
847 | bucket = &kvm->arch.mmu_page_hash[index]; | ||
848 | hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) | ||
849 | if (sp->gfn == gfn && !sp->role.metaphysical) { | ||
850 | pgprintk("%s: gfn %lx role %x\n", __FUNCTION__, gfn, | ||
851 | sp->role.word); | ||
852 | kvm_mmu_zap_page(kvm, sp); | ||
853 | r = 1; | ||
854 | } | ||
855 | return r; | ||
856 | } | ||
857 | |||
858 | static void mmu_unshadow(struct kvm *kvm, gfn_t gfn) | ||
859 | { | ||
860 | struct kvm_mmu_page *sp; | ||
861 | |||
862 | while ((sp = kvm_mmu_lookup_page(kvm, gfn)) != NULL) { | ||
863 | pgprintk("%s: zap %lx %x\n", __FUNCTION__, gfn, sp->role.word); | ||
864 | kvm_mmu_zap_page(kvm, sp); | ||
865 | } | ||
866 | } | ||
867 | |||
868 | static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn) | ||
869 | { | ||
870 | int slot = memslot_id(kvm, gfn_to_memslot(kvm, gfn)); | ||
871 | struct kvm_mmu_page *sp = page_header(__pa(pte)); | ||
872 | |||
873 | __set_bit(slot, &sp->slot_bitmap); | ||
874 | } | ||
875 | |||
876 | struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva) | ||
877 | { | ||
878 | gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva); | ||
879 | |||
880 | if (gpa == UNMAPPED_GVA) | ||
881 | return NULL; | ||
882 | return gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); | ||
883 | } | ||
884 | |||
/*
 * Install a leaf shadow pte for @gfn with the given access rights,
 * handling write-protection of shadowed guest page tables.
 *
 * @pt_access/@pte_access: accumulated/leaf guest access bits (ACC_*)
 * @user_fault/@write_fault: attributes of the faulting access
 * @dirty: guest pte dirty bit
 * @ptwrite: out — set to 1 when a write fault hit a shadowed page table
 */
static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
			 unsigned pt_access, unsigned pte_access,
			 int user_fault, int write_fault, int dirty,
			 int *ptwrite, gfn_t gfn)
{
	u64 spte;
	int was_rmapped = is_rmap_pte(*shadow_pte);
	struct page *page;

	pgprintk("%s: spte %llx access %x write_fault %d"
		 " user_fault %d gfn %lx\n",
		 __FUNCTION__, *shadow_pte, pt_access,
		 write_fault, user_fault, gfn);

	/*
	 * We don't set the accessed bit, since we sometimes want to see
	 * whether the guest actually used the pte (in order to detect
	 * demand paging).
	 */
	spte = PT_PRESENT_MASK | PT_DIRTY_MASK;
	if (!dirty)
		pte_access &= ~ACC_WRITE_MASK;
	if (!(pte_access & ACC_EXEC_MASK))
		spte |= PT64_NX_MASK;

	/* Takes a reference; released below once the spte is settled. */
	page = gfn_to_page(vcpu->kvm, gfn);

	spte |= PT_PRESENT_MASK;
	if (pte_access & ACC_USER_MASK)
		spte |= PT_USER_MASK;

	if (is_error_page(page)) {
		/* Not RAM: leave a trapping pte tagged as I/O. */
		set_shadow_pte(shadow_pte,
			       shadow_trap_nonpresent_pte | PT_SHADOW_IO_MARK);
		kvm_release_page_clean(page);
		return;
	}

	spte |= page_to_phys(page);

	if ((pte_access & ACC_WRITE_MASK)
	    || (write_fault && !is_write_protection(vcpu) && !user_fault)) {
		struct kvm_mmu_page *shadow;

		spte |= PT_WRITABLE_MASK;
		if (user_fault) {
			mmu_unshadow(vcpu->kvm, gfn);
			goto unshadowed;
		}

		/*
		 * Never allow a writable mapping of a frame that is
		 * itself a shadowed guest page table.
		 */
		shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn);
		if (shadow) {
			pgprintk("%s: found shadow page for %lx, marking ro\n",
				 __FUNCTION__, gfn);
			pte_access &= ~ACC_WRITE_MASK;
			if (is_writeble_pte(spte)) {
				spte &= ~PT_WRITABLE_MASK;
				kvm_x86_ops->tlb_flush(vcpu);
			}
			if (write_fault)
				*ptwrite = 1;
		}
	}

unshadowed:

	if (pte_access & ACC_WRITE_MASK)
		mark_page_dirty(vcpu->kvm, gfn);

	pgprintk("%s: setting spte %llx\n", __FUNCTION__, spte);
	set_shadow_pte(shadow_pte, spte);
	page_header_update_slot(vcpu->kvm, shadow_pte, gfn);
	if (!was_rmapped) {
		/* The rmap holds the page reference from here on. */
		rmap_add(vcpu, shadow_pte, gfn);
		if (!is_rmap_pte(*shadow_pte))
			kvm_release_page_clean(page);
	}
	else
		kvm_release_page_clean(page);
	if (!ptwrite || !*ptwrite)
		vcpu->arch.last_pte_updated = shadow_pte;
}
967 | |||
/* With paging disabled cr3 has no translation to invalidate: no-op. */
static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
{
}
971 | |||
/*
 * Handle a fault while the guest runs unpaged: walk the shadow table
 * down from the root, allocating intermediate (metaphysical) shadow
 * pages as needed, then install the leaf spte with full access.
 *
 * Returns 1 if the caller must emulate (page-table write or I/O pte),
 * 0 on plain success, -ENOMEM when out of shadow pages.
 */
static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
{
	int level = PT32E_ROOT_LEVEL;
	hpa_t table_addr = vcpu->arch.mmu.root_hpa;
	int pt_write = 0;

	for (; ; level--) {
		u32 index = PT64_INDEX(v, level);
		u64 *table;

		ASSERT(VALID_PAGE(table_addr));
		table = __va(table_addr);

		if (level == 1) {
			mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL,
				     0, write, 1, &pt_write, gfn);
			return pt_write || is_io_pte(table[index]);
		}

		if (table[index] == shadow_trap_nonpresent_pte) {
			/* Missing intermediate level: shadow it. */
			struct kvm_mmu_page *new_table;
			gfn_t pseudo_gfn;

			pseudo_gfn = (v & PT64_DIR_BASE_ADDR_MASK)
				>> PAGE_SHIFT;
			new_table = kvm_mmu_get_page(vcpu, pseudo_gfn,
						     v, level - 1,
						     1, ACC_ALL, &table[index],
						     NULL);
			if (!new_table) {
				pgprintk("nonpaging_map: ENOMEM\n");
				return -ENOMEM;
			}

			table[index] = __pa(new_table->spt) | PT_PRESENT_MASK
				| PT_WRITABLE_MASK | PT_USER_MASK;
		}
		table_addr = table[index] & PT64_BASE_ADDR_MASK;
	}
}
1012 | |||
1013 | static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu, | ||
1014 | struct kvm_mmu_page *sp) | ||
1015 | { | ||
1016 | int i; | ||
1017 | |||
1018 | for (i = 0; i < PT64_ENT_PER_PAGE; ++i) | ||
1019 | sp->spt[i] = shadow_trap_nonpresent_pte; | ||
1020 | } | ||
1021 | |||
/*
 * Drop the vcpu's references to its shadow root page(s): the single
 * 64-bit root, or the four PAE pdpte roots.  The pages themselves stay
 * allocated until zapped.
 */
static void mmu_free_roots(struct kvm_vcpu *vcpu)
{
	int i;
	struct kvm_mmu_page *sp;

	if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
		return;
#ifdef CONFIG_X86_64
	if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
		hpa_t root = vcpu->arch.mmu.root_hpa;

		sp = page_header(root);
		--sp->root_count;
		vcpu->arch.mmu.root_hpa = INVALID_PAGE;
		return;
	}
#endif
	for (i = 0; i < 4; ++i) {
		hpa_t root = vcpu->arch.mmu.pae_root[i];

		if (root) {
			/* Entry carries PT_PRESENT_MASK; strip flag bits. */
			root &= PT64_BASE_ADDR_MASK;
			sp = page_header(root);
			--sp->root_count;
		}
		vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
	}
	vcpu->arch.mmu.root_hpa = INVALID_PAGE;
}
1051 | |||
/*
 * Set up the vcpu's shadow root(s) for the current guest cr3: a single
 * level-4 root in long mode, otherwise four PAE roots (metaphysical
 * when the guest is unpaged).  Each root page's root_count is raised so
 * kvm_mmu_zap_page() will not free it.
 */
static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
{
	int i;
	gfn_t root_gfn;
	struct kvm_mmu_page *sp;

	root_gfn = vcpu->arch.cr3 >> PAGE_SHIFT;

#ifdef CONFIG_X86_64
	if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
		hpa_t root = vcpu->arch.mmu.root_hpa;

		ASSERT(!VALID_PAGE(root));
		sp = kvm_mmu_get_page(vcpu, root_gfn, 0,
				      PT64_ROOT_LEVEL, 0, ACC_ALL, NULL, NULL);
		root = __pa(sp->spt);
		++sp->root_count;
		vcpu->arch.mmu.root_hpa = root;
		return;
	}
#endif
	for (i = 0; i < 4; ++i) {
		hpa_t root = vcpu->arch.mmu.pae_root[i];

		ASSERT(!VALID_PAGE(root));
		if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) {
			/* PAE guest: roots track the four guest pdptes. */
			if (!is_present_pte(vcpu->arch.pdptrs[i])) {
				vcpu->arch.mmu.pae_root[i] = 0;
				continue;
			}
			root_gfn = vcpu->arch.pdptrs[i] >> PAGE_SHIFT;
		} else if (vcpu->arch.mmu.root_level == 0)
			root_gfn = 0;
		sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
				      PT32_ROOT_LEVEL, !is_paging(vcpu),
				      ACC_ALL, NULL, NULL);
		root = __pa(sp->spt);
		++sp->root_count;
		vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK;
	}
	vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
}
1094 | |||
/* Without guest paging, virtual and physical addresses coincide. */
static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr)
{
	return vaddr;
}
1099 | |||
1100 | static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, | ||
1101 | u32 error_code) | ||
1102 | { | ||
1103 | gfn_t gfn; | ||
1104 | int r; | ||
1105 | |||
1106 | pgprintk("%s: gva %lx error %x\n", __FUNCTION__, gva, error_code); | ||
1107 | r = mmu_topup_memory_caches(vcpu); | ||
1108 | if (r) | ||
1109 | return r; | ||
1110 | |||
1111 | ASSERT(vcpu); | ||
1112 | ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa)); | ||
1113 | |||
1114 | gfn = gva >> PAGE_SHIFT; | ||
1115 | |||
1116 | return nonpaging_map(vcpu, gva & PAGE_MASK, | ||
1117 | error_code & PFERR_WRITE_MASK, gfn); | ||
1118 | } | ||
1119 | |||
/* Teardown hook for the nonpaging context: just release the roots. */
static void nonpaging_free(struct kvm_vcpu *vcpu)
{
	mmu_free_roots(vcpu);
}
1124 | |||
1125 | static int nonpaging_init_context(struct kvm_vcpu *vcpu) | ||
1126 | { | ||
1127 | struct kvm_mmu *context = &vcpu->arch.mmu; | ||
1128 | |||
1129 | context->new_cr3 = nonpaging_new_cr3; | ||
1130 | context->page_fault = nonpaging_page_fault; | ||
1131 | context->gva_to_gpa = nonpaging_gva_to_gpa; | ||
1132 | context->free = nonpaging_free; | ||
1133 | context->prefetch_page = nonpaging_prefetch_page; | ||
1134 | context->root_level = 0; | ||
1135 | context->shadow_root_level = PT32E_ROOT_LEVEL; | ||
1136 | context->root_hpa = INVALID_PAGE; | ||
1137 | return 0; | ||
1138 | } | ||
1139 | |||
/* Flush this vcpu's TLB via the vendor backend and count the event. */
void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
{
	++vcpu->stat.tlb_flush;
	kvm_x86_ops->tlb_flush(vcpu);
}
1145 | |||
1146 | static void paging_new_cr3(struct kvm_vcpu *vcpu) | ||
1147 | { | ||
1148 | pgprintk("%s: cr3 %lx\n", __FUNCTION__, vcpu->cr3); | ||
1149 | mmu_free_roots(vcpu); | ||
1150 | } | ||
1151 | |||
/* Reflect a shadow-walk page fault back into the guest. */
static void inject_page_fault(struct kvm_vcpu *vcpu,
			      u64 addr,
			      u32 err_code)
{
	kvm_inject_page_fault(vcpu, addr, err_code);
}
1158 | |||
/* Paging contexts have nothing extra to free beyond the roots. */
static void paging_free(struct kvm_vcpu *vcpu)
{
	nonpaging_free(vcpu);
}
1163 | |||
1164 | #define PTTYPE 64 | ||
1165 | #include "paging_tmpl.h" | ||
1166 | #undef PTTYPE | ||
1167 | |||
1168 | #define PTTYPE 32 | ||
1169 | #include "paging_tmpl.h" | ||
1170 | #undef PTTYPE | ||
1171 | |||
1172 | static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level) | ||
1173 | { | ||
1174 | struct kvm_mmu *context = &vcpu->arch.mmu; | ||
1175 | |||
1176 | ASSERT(is_pae(vcpu)); | ||
1177 | context->new_cr3 = paging_new_cr3; | ||
1178 | context->page_fault = paging64_page_fault; | ||
1179 | context->gva_to_gpa = paging64_gva_to_gpa; | ||
1180 | context->prefetch_page = paging64_prefetch_page; | ||
1181 | context->free = paging_free; | ||
1182 | context->root_level = level; | ||
1183 | context->shadow_root_level = level; | ||
1184 | context->root_hpa = INVALID_PAGE; | ||
1185 | return 0; | ||
1186 | } | ||
1187 | |||
/* Long-mode guest: 4-level guest and shadow page tables. */
static int paging64_init_context(struct kvm_vcpu *vcpu)
{
	return paging64_init_context_common(vcpu, PT64_ROOT_LEVEL);
}
1192 | |||
1193 | static int paging32_init_context(struct kvm_vcpu *vcpu) | ||
1194 | { | ||
1195 | struct kvm_mmu *context = &vcpu->arch.mmu; | ||
1196 | |||
1197 | context->new_cr3 = paging_new_cr3; | ||
1198 | context->page_fault = paging32_page_fault; | ||
1199 | context->gva_to_gpa = paging32_gva_to_gpa; | ||
1200 | context->free = paging_free; | ||
1201 | context->prefetch_page = paging32_prefetch_page; | ||
1202 | context->root_level = PT32_ROOT_LEVEL; | ||
1203 | context->shadow_root_level = PT32E_ROOT_LEVEL; | ||
1204 | context->root_hpa = INVALID_PAGE; | ||
1205 | return 0; | ||
1206 | } | ||
1207 | |||
/* PAE guest: 3-level tables on both guest and shadow side. */
static int paging32E_init_context(struct kvm_vcpu *vcpu)
{
	return paging64_init_context_common(vcpu, PT32E_ROOT_LEVEL);
}
1212 | |||
1213 | static int init_kvm_mmu(struct kvm_vcpu *vcpu) | ||
1214 | { | ||
1215 | ASSERT(vcpu); | ||
1216 | ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); | ||
1217 | |||
1218 | if (!is_paging(vcpu)) | ||
1219 | return nonpaging_init_context(vcpu); | ||
1220 | else if (is_long_mode(vcpu)) | ||
1221 | return paging64_init_context(vcpu); | ||
1222 | else if (is_pae(vcpu)) | ||
1223 | return paging32E_init_context(vcpu); | ||
1224 | else | ||
1225 | return paging32_init_context(vcpu); | ||
1226 | } | ||
1227 | |||
1228 | static void destroy_kvm_mmu(struct kvm_vcpu *vcpu) | ||
1229 | { | ||
1230 | ASSERT(vcpu); | ||
1231 | if (VALID_PAGE(vcpu->arch.mmu.root_hpa)) { | ||
1232 | vcpu->arch.mmu.free(vcpu); | ||
1233 | vcpu->arch.mmu.root_hpa = INVALID_PAGE; | ||
1234 | } | ||
1235 | } | ||
1236 | |||
/*
 * Rebuild the MMU context from scratch (e.g. after a guest paging-mode
 * change).  Returns 0 on success.
 */
int kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
{
	destroy_kvm_mmu(vcpu);
	return init_kvm_mmu(vcpu);
}
EXPORT_SYMBOL_GPL(kvm_mmu_reset_context);
1243 | |||
1244 | int kvm_mmu_load(struct kvm_vcpu *vcpu) | ||
1245 | { | ||
1246 | int r; | ||
1247 | |||
1248 | mutex_lock(&vcpu->kvm->lock); | ||
1249 | r = mmu_topup_memory_caches(vcpu); | ||
1250 | if (r) | ||
1251 | goto out; | ||
1252 | mmu_alloc_roots(vcpu); | ||
1253 | kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa); | ||
1254 | kvm_mmu_flush_tlb(vcpu); | ||
1255 | out: | ||
1256 | mutex_unlock(&vcpu->kvm->lock); | ||
1257 | return r; | ||
1258 | } | ||
1259 | EXPORT_SYMBOL_GPL(kvm_mmu_load); | ||
1260 | |||
/* Detach the vcpu from its shadow roots (inverse of kvm_mmu_load). */
void kvm_mmu_unload(struct kvm_vcpu *vcpu)
{
	mmu_free_roots(vcpu);
}
1265 | |||
/*
 * Invalidate one spte of shadow page @sp in response to a guest pte
 * write: leaf sptes drop their rmap, non-leaf sptes detach the child
 * shadow page, then the slot becomes a not-present trap.
 */
static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
				  struct kvm_mmu_page *sp,
				  u64 *spte)
{
	u64 pte;
	struct kvm_mmu_page *child;

	pte = *spte;
	if (is_shadow_present_pte(pte)) {
		if (sp->role.level == PT_PAGE_TABLE_LEVEL)
			rmap_remove(vcpu->kvm, spte);
		else {
			child = page_header(pte & PT64_BASE_ADDR_MASK);
			mmu_page_remove_parent_pte(child, spte);
		}
	}
	set_shadow_pte(spte, shadow_trap_nonpresent_pte);
}
1284 | |||
1285 | static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu, | ||
1286 | struct kvm_mmu_page *sp, | ||
1287 | u64 *spte, | ||
1288 | const void *new, int bytes, | ||
1289 | int offset_in_pte) | ||
1290 | { | ||
1291 | if (sp->role.level != PT_PAGE_TABLE_LEVEL) { | ||
1292 | ++vcpu->kvm->stat.mmu_pde_zapped; | ||
1293 | return; | ||
1294 | } | ||
1295 | |||
1296 | ++vcpu->kvm->stat.mmu_pte_updated; | ||
1297 | if (sp->role.glevels == PT32_ROOT_LEVEL) | ||
1298 | paging32_update_pte(vcpu, sp, spte, new, bytes, offset_in_pte); | ||
1299 | else | ||
1300 | paging64_update_pte(vcpu, sp, spte, new, bytes, offset_in_pte); | ||
1301 | } | ||
1302 | |||
1303 | static bool need_remote_flush(u64 old, u64 new) | ||
1304 | { | ||
1305 | if (!is_shadow_present_pte(old)) | ||
1306 | return false; | ||
1307 | if (!is_shadow_present_pte(new)) | ||
1308 | return true; | ||
1309 | if ((old ^ new) & PT64_BASE_ADDR_MASK) | ||
1310 | return true; | ||
1311 | old ^= PT64_NX_MASK; | ||
1312 | new ^= PT64_NX_MASK; | ||
1313 | return (old & ~new & PT64_PERM_MASK) != 0; | ||
1314 | } | ||
1315 | |||
1316 | static void mmu_pte_write_flush_tlb(struct kvm_vcpu *vcpu, u64 old, u64 new) | ||
1317 | { | ||
1318 | if (need_remote_flush(old, new)) | ||
1319 | kvm_flush_remote_tlbs(vcpu->kvm); | ||
1320 | else | ||
1321 | kvm_mmu_flush_tlb(vcpu); | ||
1322 | } | ||
1323 | |||
1324 | static bool last_updated_pte_accessed(struct kvm_vcpu *vcpu) | ||
1325 | { | ||
1326 | u64 *spte = vcpu->arch.last_pte_updated; | ||
1327 | |||
1328 | return !!(spte && (*spte & PT_ACCESSED_MASK)); | ||
1329 | } | ||
1330 | |||
/*
 * The guest wrote @bytes at @gpa into a frame that may be a shadowed
 * page table.  Keep the shadow in sync: for each shadow page of that
 * frame, either zap-and-update the affected sptes, or — for misaligned
 * writes or pages being written too often (write flood, likely no
 * longer a page table) — zap the whole shadow page.
 */
void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
		       const u8 *new, int bytes)
{
	gfn_t gfn = gpa >> PAGE_SHIFT;
	struct kvm_mmu_page *sp;
	struct hlist_node *node, *n;
	struct hlist_head *bucket;
	unsigned index;
	u64 entry;
	u64 *spte;
	unsigned offset = offset_in_page(gpa);
	unsigned pte_size;
	unsigned page_offset;
	unsigned misaligned;
	unsigned quadrant;
	int level;
	int flooded = 0;
	int npte;

	pgprintk("%s: gpa %llx bytes %d\n", __FUNCTION__, gpa, bytes);
	++vcpu->kvm->stat.mmu_pte_write;
	kvm_mmu_audit(vcpu, "pre pte write");
	/*
	 * Flood detection: repeated writes to the same gfn without the
	 * last pte ever being accessed suggest it is not really used as
	 * a page table anymore.
	 */
	if (gfn == vcpu->arch.last_pt_write_gfn
	    && !last_updated_pte_accessed(vcpu)) {
		++vcpu->arch.last_pt_write_count;
		if (vcpu->arch.last_pt_write_count >= 3)
			flooded = 1;
	} else {
		vcpu->arch.last_pt_write_gfn = gfn;
		vcpu->arch.last_pt_write_count = 1;
		vcpu->arch.last_pte_updated = NULL;
	}
	index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
	bucket = &vcpu->kvm->arch.mmu_page_hash[index];
	hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) {
		if (sp->gfn != gfn || sp->role.metaphysical)
			continue;
		pte_size = sp->role.glevels == PT32_ROOT_LEVEL ? 4 : 8;
		misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
		misaligned |= bytes < 4;
		if (misaligned || flooded) {
			/*
			 * Misaligned accesses are too much trouble to fix
			 * up; also, they usually indicate a page is not used
			 * as a page table.
			 *
			 * If we're seeing too many writes to a page,
			 * it may no longer be a page table, or we may be
			 * forking, in which case it is better to unmap the
			 * page.
			 */
			pgprintk("misaligned: gpa %llx bytes %d role %x\n",
				 gpa, bytes, sp->role.word);
			kvm_mmu_zap_page(vcpu->kvm, sp);
			++vcpu->kvm->stat.mmu_flooded;
			continue;
		}
		page_offset = offset;
		level = sp->role.level;
		npte = 1;
		if (sp->role.glevels == PT32_ROOT_LEVEL) {
			page_offset <<= 1;	/* 32->64 */
			/*
			 * A 32-bit pde maps 4MB while the shadow pdes map
			 * only 2MB.  So we need to double the offset again
			 * and zap two pdes instead of one.
			 */
			if (level == PT32_ROOT_LEVEL) {
				page_offset &= ~7; /* kill rounding error */
				page_offset <<= 1;
				npte = 2;
			}
			quadrant = page_offset >> PAGE_SHIFT;
			page_offset &= ~PAGE_MASK;
			if (quadrant != sp->role.quadrant)
				continue;
		}
		spte = &sp->spt[page_offset / sizeof(*spte)];
		while (npte--) {
			entry = *spte;
			mmu_pte_write_zap_pte(vcpu, sp, spte);
			mmu_pte_write_new_pte(vcpu, sp, spte, new, bytes,
					      page_offset & (pte_size - 1));
			mmu_pte_write_flush_tlb(vcpu, entry, *spte);
			++spte;
		}
	}
	kvm_mmu_audit(vcpu, "post pte write");
}
1420 | |||
1421 | int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva) | ||
1422 | { | ||
1423 | gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva); | ||
1424 | |||
1425 | return kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT); | ||
1426 | } | ||
1427 | |||
1428 | void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) | ||
1429 | { | ||
1430 | while (vcpu->kvm->arch.n_free_mmu_pages < KVM_REFILL_PAGES) { | ||
1431 | struct kvm_mmu_page *sp; | ||
1432 | |||
1433 | sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev, | ||
1434 | struct kvm_mmu_page, link); | ||
1435 | kvm_mmu_zap_page(vcpu->kvm, sp); | ||
1436 | ++vcpu->kvm->stat.mmu_recycled; | ||
1437 | } | ||
1438 | } | ||
1439 | |||
/*
 * Top-level shadow-MMU fault handler, called from the vendor exit
 * handlers.  Returns 1 when the fault was resolved and the guest may
 * be resumed, 0 when userspace must complete an MMIO access, or a
 * negative value on internal error.
 */
int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
{
	int r;
	enum emulation_result er;

	mutex_lock(&vcpu->kvm->lock);
	r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code);
	if (r < 0)
		goto out;

	if (!r) {
		/* Fault fixed by the paging-mode handler; just resume. */
		r = 1;
		goto out;
	}

	/* r > 0: the faulting instruction must be emulated. */
	r = mmu_topup_memory_caches(vcpu);
	if (r)
		goto out;

	er = emulate_instruction(vcpu, vcpu->run, cr2, error_code, 0);
	/* Emulation done; drop the lock before translating the result. */
	mutex_unlock(&vcpu->kvm->lock);

	switch (er) {
	case EMULATE_DONE:
		return 1;
	case EMULATE_DO_MMIO:
		/* Hand the access to userspace for device emulation. */
		++vcpu->stat.mmio_exits;
		return 0;
	case EMULATE_FAIL:
		kvm_report_emulation_failure(vcpu, "pagetable");
		return 1;
	default:
		BUG();
	}
out:
	mutex_unlock(&vcpu->kvm->lock);
	return r;
}
EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
1479 | |||
1480 | static void free_mmu_pages(struct kvm_vcpu *vcpu) | ||
1481 | { | ||
1482 | struct kvm_mmu_page *sp; | ||
1483 | |||
1484 | while (!list_empty(&vcpu->kvm->arch.active_mmu_pages)) { | ||
1485 | sp = container_of(vcpu->kvm->arch.active_mmu_pages.next, | ||
1486 | struct kvm_mmu_page, link); | ||
1487 | kvm_mmu_zap_page(vcpu->kvm, sp); | ||
1488 | } | ||
1489 | free_page((unsigned long)vcpu->arch.mmu.pae_root); | ||
1490 | } | ||
1491 | |||
1492 | static int alloc_mmu_pages(struct kvm_vcpu *vcpu) | ||
1493 | { | ||
1494 | struct page *page; | ||
1495 | int i; | ||
1496 | |||
1497 | ASSERT(vcpu); | ||
1498 | |||
1499 | if (vcpu->kvm->arch.n_requested_mmu_pages) | ||
1500 | vcpu->kvm->arch.n_free_mmu_pages = | ||
1501 | vcpu->kvm->arch.n_requested_mmu_pages; | ||
1502 | else | ||
1503 | vcpu->kvm->arch.n_free_mmu_pages = | ||
1504 | vcpu->kvm->arch.n_alloc_mmu_pages; | ||
1505 | /* | ||
1506 | * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64. | ||
1507 | * Therefore we need to allocate shadow page tables in the first | ||
1508 | * 4GB of memory, which happens to fit the DMA32 zone. | ||
1509 | */ | ||
1510 | page = alloc_page(GFP_KERNEL | __GFP_DMA32); | ||
1511 | if (!page) | ||
1512 | goto error_1; | ||
1513 | vcpu->arch.mmu.pae_root = page_address(page); | ||
1514 | for (i = 0; i < 4; ++i) | ||
1515 | vcpu->arch.mmu.pae_root[i] = INVALID_PAGE; | ||
1516 | |||
1517 | return 0; | ||
1518 | |||
1519 | error_1: | ||
1520 | free_mmu_pages(vcpu); | ||
1521 | return -ENOMEM; | ||
1522 | } | ||
1523 | |||
/*
 * First-stage per-vcpu MMU initialisation: allocate backing pages
 * only; the paging context itself is built later by kvm_mmu_setup().
 */
int kvm_mmu_create(struct kvm_vcpu *vcpu)
{
	ASSERT(vcpu);
	ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));

	return alloc_mmu_pages(vcpu);
}
1531 | |||
/*
 * Second-stage per-vcpu MMU initialisation: build the paging context
 * matching the vcpu's current mode.  Must run after kvm_mmu_create().
 */
int kvm_mmu_setup(struct kvm_vcpu *vcpu)
{
	ASSERT(vcpu);
	ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));

	return init_kvm_mmu(vcpu);
}
1539 | |||
/*
 * Tear down a vcpu's MMU: drop the paging context first, then the
 * shadow pages and finally the per-vcpu memory caches.
 */
void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
{
	ASSERT(vcpu);

	destroy_kvm_mmu(vcpu);
	free_mmu_pages(vcpu);
	mmu_free_memory_caches(vcpu);
}
1548 | |||
1549 | void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) | ||
1550 | { | ||
1551 | struct kvm_mmu_page *sp; | ||
1552 | |||
1553 | list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) { | ||
1554 | int i; | ||
1555 | u64 *pt; | ||
1556 | |||
1557 | if (!test_bit(slot, &sp->slot_bitmap)) | ||
1558 | continue; | ||
1559 | |||
1560 | pt = sp->spt; | ||
1561 | for (i = 0; i < PT64_ENT_PER_PAGE; ++i) | ||
1562 | /* avoid RMW */ | ||
1563 | if (pt[i] & PT_WRITABLE_MASK) | ||
1564 | pt[i] &= ~PT_WRITABLE_MASK; | ||
1565 | } | ||
1566 | } | ||
1567 | |||
1568 | void kvm_mmu_zap_all(struct kvm *kvm) | ||
1569 | { | ||
1570 | struct kvm_mmu_page *sp, *node; | ||
1571 | |||
1572 | list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) | ||
1573 | kvm_mmu_zap_page(kvm, sp); | ||
1574 | |||
1575 | kvm_flush_remote_tlbs(kvm); | ||
1576 | } | ||
1577 | |||
/*
 * Destroy the MMU slab caches.  Also used as the failure path of
 * kvm_mmu_module_init(), so each cache pointer may still be NULL and
 * must be checked before kmem_cache_destroy().
 */
void kvm_mmu_module_exit(void)
{
	if (pte_chain_cache)
		kmem_cache_destroy(pte_chain_cache);
	if (rmap_desc_cache)
		kmem_cache_destroy(rmap_desc_cache);
	if (mmu_page_header_cache)
		kmem_cache_destroy(mmu_page_header_cache);
}
1587 | |||
1588 | int kvm_mmu_module_init(void) | ||
1589 | { | ||
1590 | pte_chain_cache = kmem_cache_create("kvm_pte_chain", | ||
1591 | sizeof(struct kvm_pte_chain), | ||
1592 | 0, 0, NULL); | ||
1593 | if (!pte_chain_cache) | ||
1594 | goto nomem; | ||
1595 | rmap_desc_cache = kmem_cache_create("kvm_rmap_desc", | ||
1596 | sizeof(struct kvm_rmap_desc), | ||
1597 | 0, 0, NULL); | ||
1598 | if (!rmap_desc_cache) | ||
1599 | goto nomem; | ||
1600 | |||
1601 | mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header", | ||
1602 | sizeof(struct kvm_mmu_page), | ||
1603 | 0, 0, NULL); | ||
1604 | if (!mmu_page_header_cache) | ||
1605 | goto nomem; | ||
1606 | |||
1607 | return 0; | ||
1608 | |||
1609 | nomem: | ||
1610 | kvm_mmu_module_exit(); | ||
1611 | return -ENOMEM; | ||
1612 | } | ||
1613 | |||
1614 | /* | ||
1615 | * Caculate mmu pages needed for kvm. | ||
1616 | */ | ||
1617 | unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm) | ||
1618 | { | ||
1619 | int i; | ||
1620 | unsigned int nr_mmu_pages; | ||
1621 | unsigned int nr_pages = 0; | ||
1622 | |||
1623 | for (i = 0; i < kvm->nmemslots; i++) | ||
1624 | nr_pages += kvm->memslots[i].npages; | ||
1625 | |||
1626 | nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000; | ||
1627 | nr_mmu_pages = max(nr_mmu_pages, | ||
1628 | (unsigned int) KVM_MIN_ALLOC_MMU_PAGES); | ||
1629 | |||
1630 | return nr_mmu_pages; | ||
1631 | } | ||
1632 | |||
1633 | #ifdef AUDIT | ||
1634 | |||
1635 | static const char *audit_msg; | ||
1636 | |||
/*
 * Sign-extend bit 47 of a 64-bit virtual address into the upper bits
 * (16 == 64 - 48), producing a canonical address; a no-op on 32-bit
 * hosts.
 */
static gva_t canonicalize(gva_t gva)
{
#ifdef CONFIG_X86_64
	gva = (long long)(gva << 16) >> 16;
#endif
	return gva;
}
1644 | |||
/*
 * Recursively walk one shadow page-table level, cross-checking each
 * present shadow pte against the guest translation for the same
 * virtual address and reporting mismatches.
 *
 * @page_pte: shadow pte referencing the table to audit
 * @va:       guest virtual address mapped by the table's first entry
 * @level:    shadow paging level of the referenced table
 */
static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
				gva_t va, int level)
{
	u64 *pt = __va(page_pte & PT64_BASE_ADDR_MASK);
	int i;
	/* Virtual-address span covered by one entry at this level. */
	gva_t va_delta = 1ul << (PAGE_SHIFT + 9 * (level - 1));

	for (i = 0; i < PT64_ENT_PER_PAGE; ++i, va += va_delta) {
		u64 ent = pt[i];

		if (ent == shadow_trap_nonpresent_pte)
			continue;

		va = canonicalize(va);
		if (level > 1) {
			/* "notrap" entries are only legal at the leaf. */
			if (ent == shadow_notrap_nonpresent_pte)
				printk(KERN_ERR "audit: (%s) nontrapping pte"
				       " in nonleaf level: levels %d gva %lx"
				       " level %d pte %llx\n", audit_msg,
				       vcpu->arch.mmu.root_level, va, level, ent);

			audit_mappings_page(vcpu, ent, va, level - 1);
		} else {
			/* Leaf: compare against the guest's own mapping. */
			gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, va);
			struct page *page = gpa_to_page(vcpu, gpa);
			hpa_t hpa = page_to_phys(page);

			if (is_shadow_present_pte(ent)
			    && (ent & PT64_BASE_ADDR_MASK) != hpa)
				printk(KERN_ERR "xx audit error: (%s) levels %d"
				       " gva %lx gpa %llx hpa %llx ent %llx %d\n",
				       audit_msg, vcpu->arch.mmu.root_level,
				       va, gpa, hpa, ent,
				       is_shadow_present_pte(ent));
			else if (ent == shadow_notrap_nonpresent_pte
				 && !is_error_hpa(hpa))
				printk(KERN_ERR "audit: (%s) notrap shadow,"
				       " valid guest gva %lx\n", audit_msg, va);
			kvm_release_page_clean(page);

		}
	}
}
1688 | |||
1689 | static void audit_mappings(struct kvm_vcpu *vcpu) | ||
1690 | { | ||
1691 | unsigned i; | ||
1692 | |||
1693 | if (vcpu->arch.mmu.root_level == 4) | ||
1694 | audit_mappings_page(vcpu, vcpu->arch.mmu.root_hpa, 0, 4); | ||
1695 | else | ||
1696 | for (i = 0; i < 4; ++i) | ||
1697 | if (vcpu->arch.mmu.pae_root[i] & PT_PRESENT_MASK) | ||
1698 | audit_mappings_page(vcpu, | ||
1699 | vcpu->arch.mmu.pae_root[i], | ||
1700 | i << 30, | ||
1701 | 2); | ||
1702 | } | ||
1703 | |||
1704 | static int count_rmaps(struct kvm_vcpu *vcpu) | ||
1705 | { | ||
1706 | int nmaps = 0; | ||
1707 | int i, j, k; | ||
1708 | |||
1709 | for (i = 0; i < KVM_MEMORY_SLOTS; ++i) { | ||
1710 | struct kvm_memory_slot *m = &vcpu->kvm->memslots[i]; | ||
1711 | struct kvm_rmap_desc *d; | ||
1712 | |||
1713 | for (j = 0; j < m->npages; ++j) { | ||
1714 | unsigned long *rmapp = &m->rmap[j]; | ||
1715 | |||
1716 | if (!*rmapp) | ||
1717 | continue; | ||
1718 | if (!(*rmapp & 1)) { | ||
1719 | ++nmaps; | ||
1720 | continue; | ||
1721 | } | ||
1722 | d = (struct kvm_rmap_desc *)(*rmapp & ~1ul); | ||
1723 | while (d) { | ||
1724 | for (k = 0; k < RMAP_EXT; ++k) | ||
1725 | if (d->shadow_ptes[k]) | ||
1726 | ++nmaps; | ||
1727 | else | ||
1728 | break; | ||
1729 | d = d->more; | ||
1730 | } | ||
1731 | } | ||
1732 | } | ||
1733 | return nmaps; | ||
1734 | } | ||
1735 | |||
1736 | static int count_writable_mappings(struct kvm_vcpu *vcpu) | ||
1737 | { | ||
1738 | int nmaps = 0; | ||
1739 | struct kvm_mmu_page *sp; | ||
1740 | int i; | ||
1741 | |||
1742 | list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) { | ||
1743 | u64 *pt = sp->spt; | ||
1744 | |||
1745 | if (sp->role.level != PT_PAGE_TABLE_LEVEL) | ||
1746 | continue; | ||
1747 | |||
1748 | for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { | ||
1749 | u64 ent = pt[i]; | ||
1750 | |||
1751 | if (!(ent & PT_PRESENT_MASK)) | ||
1752 | continue; | ||
1753 | if (!(ent & PT_WRITABLE_MASK)) | ||
1754 | continue; | ||
1755 | ++nmaps; | ||
1756 | } | ||
1757 | } | ||
1758 | return nmaps; | ||
1759 | } | ||
1760 | |||
1761 | static void audit_rmap(struct kvm_vcpu *vcpu) | ||
1762 | { | ||
1763 | int n_rmap = count_rmaps(vcpu); | ||
1764 | int n_actual = count_writable_mappings(vcpu); | ||
1765 | |||
1766 | if (n_rmap != n_actual) | ||
1767 | printk(KERN_ERR "%s: (%s) rmap %d actual %d\n", | ||
1768 | __FUNCTION__, audit_msg, n_rmap, n_actual); | ||
1769 | } | ||
1770 | |||
/*
 * Verify that no shadowed guest page table retains writable host
 * mappings: a non-empty rmap entry for a shadowed gfn means write
 * protection was not enforced.
 */
static void audit_write_protection(struct kvm_vcpu *vcpu)
{
	struct kvm_mmu_page *sp;
	struct kvm_memory_slot *slot;
	unsigned long *rmapp;
	gfn_t gfn;

	list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) {
		/* Metaphysical pages have no guest counterpart to protect. */
		if (sp->role.metaphysical)
			continue;

		slot = gfn_to_memslot(vcpu->kvm, sp->gfn);
		gfn = unalias_gfn(vcpu->kvm, sp->gfn);
		rmapp = &slot->rmap[gfn - slot->base_gfn];
		if (*rmapp)
			printk(KERN_ERR "%s: (%s) shadow page has writable"
			       " mappings: gfn %lx role %x\n",
			       __FUNCTION__, audit_msg, sp->gfn,
			       sp->role.word);
	}
}
1792 | |||
1793 | static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) | ||
1794 | { | ||
1795 | int olddbg = dbg; | ||
1796 | |||
1797 | dbg = 0; | ||
1798 | audit_msg = msg; | ||
1799 | audit_rmap(vcpu); | ||
1800 | audit_write_protection(vcpu); | ||
1801 | audit_mappings(vcpu); | ||
1802 | dbg = olddbg; | ||
1803 | } | ||
1804 | |||
1805 | #endif | ||
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h new file mode 100644 index 000000000000..1fce19ec7a23 --- /dev/null +++ b/arch/x86/kvm/mmu.h | |||
@@ -0,0 +1,44 @@ | |||
1 | #ifndef __KVM_X86_MMU_H | ||
2 | #define __KVM_X86_MMU_H | ||
3 | |||
4 | #include <linux/kvm_host.h> | ||
5 | |||
6 | static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) | ||
7 | { | ||
8 | if (unlikely(vcpu->kvm->arch.n_free_mmu_pages < KVM_MIN_FREE_MMU_PAGES)) | ||
9 | __kvm_mmu_free_some_pages(vcpu); | ||
10 | } | ||
11 | |||
12 | static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu) | ||
13 | { | ||
14 | if (likely(vcpu->arch.mmu.root_hpa != INVALID_PAGE)) | ||
15 | return 0; | ||
16 | |||
17 | return kvm_mmu_load(vcpu); | ||
18 | } | ||
19 | |||
/*
 * Nonzero when the guest has EFER.LME set (long mode enabled).
 * Returns the raw mask bit, not a normalized boolean.
 */
static inline int is_long_mode(struct kvm_vcpu *vcpu)
{
#ifdef CONFIG_X86_64
	return vcpu->arch.shadow_efer & EFER_LME;
#else
	return 0;
#endif
}
28 | |||
/* Nonzero (raw CR4.PAE mask bit) when PAE paging is enabled. */
static inline int is_pae(struct kvm_vcpu *vcpu)
{
	return vcpu->arch.cr4 & X86_CR4_PAE;
}
33 | |||
/* Nonzero (raw CR4.PSE mask bit) when 4MB pages are enabled. */
static inline int is_pse(struct kvm_vcpu *vcpu)
{
	return vcpu->arch.cr4 & X86_CR4_PSE;
}
38 | |||
/* Nonzero (raw CR0.PG mask bit) when guest paging is enabled. */
static inline int is_paging(struct kvm_vcpu *vcpu)
{
	return vcpu->arch.cr0 & X86_CR0_PG;
}
43 | |||
44 | #endif | ||
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h new file mode 100644 index 000000000000..56b88f7e83ef --- /dev/null +++ b/arch/x86/kvm/paging_tmpl.h | |||
@@ -0,0 +1,461 @@ | |||
1 | /* | ||
2 | * Kernel-based Virtual Machine driver for Linux | ||
3 | * | ||
4 | * This module enables machines with Intel VT-x extensions to run virtual | ||
5 | * machines without emulation or binary translation. | ||
6 | * | ||
7 | * MMU support | ||
8 | * | ||
9 | * Copyright (C) 2006 Qumranet, Inc. | ||
10 | * | ||
11 | * Authors: | ||
12 | * Yaniv Kamay <yaniv@qumranet.com> | ||
13 | * Avi Kivity <avi@qumranet.com> | ||
14 | * | ||
15 | * This work is licensed under the terms of the GNU GPL, version 2. See | ||
16 | * the COPYING file in the top-level directory. | ||
17 | * | ||
18 | */ | ||
19 | |||
20 | /* | ||
21 | * We need the mmu code to access both 32-bit and 64-bit guest ptes, | ||
22 | * so the code in this file is compiled twice, once per pte size. | ||
23 | */ | ||
24 | |||
25 | #if PTTYPE == 64 | ||
26 | #define pt_element_t u64 | ||
27 | #define guest_walker guest_walker64 | ||
28 | #define FNAME(name) paging##64_##name | ||
29 | #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK | ||
30 | #define PT_DIR_BASE_ADDR_MASK PT64_DIR_BASE_ADDR_MASK | ||
31 | #define PT_INDEX(addr, level) PT64_INDEX(addr, level) | ||
32 | #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) | ||
33 | #define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level) | ||
34 | #define PT_LEVEL_BITS PT64_LEVEL_BITS | ||
35 | #ifdef CONFIG_X86_64 | ||
36 | #define PT_MAX_FULL_LEVELS 4 | ||
37 | #define CMPXCHG cmpxchg | ||
38 | #else | ||
39 | #define CMPXCHG cmpxchg64 | ||
40 | #define PT_MAX_FULL_LEVELS 2 | ||
41 | #endif | ||
42 | #elif PTTYPE == 32 | ||
43 | #define pt_element_t u32 | ||
44 | #define guest_walker guest_walker32 | ||
45 | #define FNAME(name) paging##32_##name | ||
46 | #define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK | ||
47 | #define PT_DIR_BASE_ADDR_MASK PT32_DIR_BASE_ADDR_MASK | ||
48 | #define PT_INDEX(addr, level) PT32_INDEX(addr, level) | ||
49 | #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) | ||
50 | #define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level) | ||
51 | #define PT_LEVEL_BITS PT32_LEVEL_BITS | ||
52 | #define PT_MAX_FULL_LEVELS 2 | ||
53 | #define CMPXCHG cmpxchg | ||
54 | #else | ||
55 | #error Invalid PTTYPE value | ||
56 | #endif | ||
57 | |||
58 | #define gpte_to_gfn FNAME(gpte_to_gfn) | ||
59 | #define gpte_to_gfn_pde FNAME(gpte_to_gfn_pde) | ||
60 | |||
/*
 * The guest_walker structure emulates the behavior of the hardware page
 * table walker.  Filled in by FNAME(walk_addr) and consumed by
 * FNAME(fetch).
 */
struct guest_walker {
	int level;				/* level the walk terminated at */
	gfn_t table_gfn[PT_MAX_FULL_LEVELS];	/* gfn of the table at each level */
	pt_element_t ptes[PT_MAX_FULL_LEVELS];	/* gpte read at each level */
	gpa_t pte_gpa[PT_MAX_FULL_LEVELS];	/* guest-physical address of each gpte */
	unsigned pt_access;			/* accumulated parent-table access bits */
	unsigned pte_access;			/* effective access bits of the leaf */
	gfn_t gfn;				/* translated frame number */
	u32 error_code;				/* #PF error code when the walk fails */
};
75 | |||
76 | static gfn_t gpte_to_gfn(pt_element_t gpte) | ||
77 | { | ||
78 | return (gpte & PT_BASE_ADDR_MASK) >> PAGE_SHIFT; | ||
79 | } | ||
80 | |||
81 | static gfn_t gpte_to_gfn_pde(pt_element_t gpte) | ||
82 | { | ||
83 | return (gpte & PT_DIR_BASE_ADDR_MASK) >> PAGE_SHIFT; | ||
84 | } | ||
85 | |||
/*
 * Atomically update a guest pte in place.  Returns true when the pte
 * changed under us, i.e. the cmpxchg failed and the caller must
 * restart its walk.
 */
static bool FNAME(cmpxchg_gpte)(struct kvm *kvm,
				gfn_t table_gfn, unsigned index,
				pt_element_t orig_pte, pt_element_t new_pte)
{
	pt_element_t ret;
	pt_element_t *table;
	struct page *page;

	page = gfn_to_page(kvm, table_gfn);
	table = kmap_atomic(page, KM_USER0);

	ret = CMPXCHG(&table[index], orig_pte, new_pte);

	kunmap_atomic(table, KM_USER0);

	/* We may have written A/D bits: release the page as dirty. */
	kvm_release_page_dirty(page);

	return (ret != orig_pte);
}
105 | |||
106 | static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte) | ||
107 | { | ||
108 | unsigned access; | ||
109 | |||
110 | access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK; | ||
111 | #if PTTYPE == 64 | ||
112 | if (is_nx(vcpu)) | ||
113 | access &= ~(gpte >> PT64_NX_SHIFT); | ||
114 | #endif | ||
115 | return access; | ||
116 | } | ||
117 | |||
/*
 * Walk the guest page tables for @addr, emulating the hardware
 * walker: check present/write/user/NX permissions at each level and
 * set accessed/dirty bits atomically, restarting the whole walk when
 * a racing guest update is detected.
 *
 * Returns 1 and fills in @walker on success; returns 0 on failure
 * with walker->error_code holding a #PF error code.
 */
static int FNAME(walk_addr)(struct guest_walker *walker,
			    struct kvm_vcpu *vcpu, gva_t addr,
			    int write_fault, int user_fault, int fetch_fault)
{
	pt_element_t pte;
	gfn_t table_gfn;
	unsigned index, pt_access, pte_access;
	gpa_t pte_gpa;

	pgprintk("%s: addr %lx\n", __FUNCTION__, addr);
walk:
	walker->level = vcpu->arch.mmu.root_level;
	pte = vcpu->arch.cr3;
#if PTTYPE == 64
	if (!is_long_mode(vcpu)) {
		/* PAE mode: start from the cached pdptr for this address. */
		pte = vcpu->arch.pdptrs[(addr >> 30) & 3];
		if (!is_present_pte(pte))
			goto not_present;
		--walker->level;
	}
#endif
	ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) ||
	       (vcpu->cr3 & CR3_NONPAE_RESERVED_BITS) == 0);

	pt_access = ACC_ALL;

	for (;;) {
		index = PT_INDEX(addr, walker->level);

		table_gfn = gpte_to_gfn(pte);
		pte_gpa = gfn_to_gpa(table_gfn);
		pte_gpa += index * sizeof(pt_element_t);
		walker->table_gfn[walker->level - 1] = table_gfn;
		walker->pte_gpa[walker->level - 1] = pte_gpa;
		pgprintk("%s: table_gfn[%d] %lx\n", __FUNCTION__,
			 walker->level - 1, table_gfn);

		kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte));

		if (!is_present_pte(pte))
			goto not_present;

		/* Supervisor writes honor write protection only with CR0.WP. */
		if (write_fault && !is_writeble_pte(pte))
			if (user_fault || is_write_protection(vcpu))
				goto access_error;

		if (user_fault && !(pte & PT_USER_MASK))
			goto access_error;

#if PTTYPE == 64
		if (fetch_fault && is_nx(vcpu) && (pte & PT64_NX_MASK))
			goto access_error;
#endif

		if (!(pte & PT_ACCESSED_MASK)) {
			mark_page_dirty(vcpu->kvm, table_gfn);
			/* Restart the walk if the gpte changed under us. */
			if (FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn,
			    index, pte, pte|PT_ACCESSED_MASK))
				goto walk;
			pte |= PT_ACCESSED_MASK;
		}

		/* Permissions accumulate (AND) down the hierarchy. */
		pte_access = pt_access & FNAME(gpte_access)(vcpu, pte);

		walker->ptes[walker->level - 1] = pte;

		if (walker->level == PT_PAGE_TABLE_LEVEL) {
			walker->gfn = gpte_to_gfn(pte);
			break;
		}

		/* Large page: the pde itself is the leaf. */
		if (walker->level == PT_DIRECTORY_LEVEL
		    && (pte & PT_PAGE_SIZE_MASK)
		    && (PTTYPE == 64 || is_pse(vcpu))) {
			walker->gfn = gpte_to_gfn_pde(pte);
			walker->gfn += PT_INDEX(addr, PT_PAGE_TABLE_LEVEL);
			if (PTTYPE == 32 && is_cpuid_PSE36())
				walker->gfn += pse36_gfn_delta(pte);
			break;
		}

		pt_access = pte_access;
		--walker->level;
	}

	if (write_fault && !is_dirty_pte(pte)) {
		bool ret;

		mark_page_dirty(vcpu->kvm, table_gfn);
		ret = FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, index, pte,
			    pte|PT_DIRTY_MASK);
		if (ret)
			goto walk;
		pte |= PT_DIRTY_MASK;
		/* The gpte write may itself hit a shadowed page table. */
		kvm_mmu_pte_write(vcpu, pte_gpa, (u8 *)&pte, sizeof(pte));
		walker->ptes[walker->level - 1] = pte;
	}

	walker->pt_access = pt_access;
	walker->pte_access = pte_access;
	pgprintk("%s: pte %llx pte_access %x pt_access %x\n",
		 __FUNCTION__, (u64)pte, pt_access, pte_access);
	return 1;

not_present:
	walker->error_code = 0;
	goto err;

access_error:
	walker->error_code = PFERR_PRESENT_MASK;

err:
	if (write_fault)
		walker->error_code |= PFERR_WRITE_MASK;
	if (user_fault)
		walker->error_code |= PFERR_USER_MASK;
	if (fetch_fault)
		walker->error_code |= PFERR_FETCH_MASK;
	return 0;
}
241 | |||
/*
 * kvm_mmu_pte_write() callback: refresh a single shadow pte after
 * the guest wrote @bytes of the corresponding guest pte.
 */
static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
			      u64 *spte, const void *pte, int bytes,
			      int offset_in_pte)
{
	pt_element_t gpte;
	unsigned pte_access;

	gpte = *(const pt_element_t *)pte;
	if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) {
		/* Not present or not yet accessed: don't shadow it. */
		if (!offset_in_pte && !is_present_pte(gpte))
			set_shadow_pte(spte, shadow_notrap_nonpresent_pte);
		return;
	}
	/* A partial write leaves the gpte inconsistent; wait for the rest. */
	if (bytes < sizeof(pt_element_t))
		return;
	pgprintk("%s: gpte %llx spte %p\n", __FUNCTION__, (u64)gpte, spte);
	pte_access = page->role.access & FNAME(gpte_access)(vcpu, gpte);
	mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0,
		     gpte & PT_DIRTY_MASK, NULL, gpte_to_gfn(gpte));
}
262 | |||
/*
 * Fetch a shadow pte for a specific level in the paging hierarchy,
 * instantiating any missing intermediate shadow pages on the way
 * down, then install the final mapping via mmu_set_spte().
 *
 * Returns a pointer to the leaf spte, or NULL when the guest mapping
 * is not present or a racing guest pte update was detected.
 */
static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
			 struct guest_walker *walker,
			 int user_fault, int write_fault, int *ptwrite)
{
	hpa_t shadow_addr;
	int level;
	u64 *shadow_ent;
	unsigned access = walker->pt_access;

	if (!is_present_pte(walker->ptes[walker->level - 1]))
		return NULL;

	shadow_addr = vcpu->arch.mmu.root_hpa;
	level = vcpu->arch.mmu.shadow_root_level;
	if (level == PT32E_ROOT_LEVEL) {
		/* PAE: select the per-1GB root for this address. */
		shadow_addr = vcpu->arch.mmu.pae_root[(addr >> 30) & 3];
		shadow_addr &= PT64_BASE_ADDR_MASK;
		--level;
	}

	for (; ; level--) {
		u32 index = SHADOW_PT_INDEX(addr, level);
		struct kvm_mmu_page *shadow_page;
		u64 shadow_pte;
		int metaphysical;
		gfn_t table_gfn;
		bool new_page = 0;

		shadow_ent = ((u64 *)__va(shadow_addr)) + index;
		if (is_shadow_present_pte(*shadow_ent)) {
			if (level == PT_PAGE_TABLE_LEVEL)
				break;
			/* Descend through the existing shadow table. */
			shadow_addr = *shadow_ent & PT64_BASE_ADDR_MASK;
			continue;
		}

		if (level == PT_PAGE_TABLE_LEVEL)
			break;

		if (level - 1 == PT_PAGE_TABLE_LEVEL
		    && walker->level == PT_DIRECTORY_LEVEL) {
			/*
			 * Guest large page shadowed by small pages: the
			 * shadow table has no guest counterpart
			 * ("metaphysical").
			 */
			metaphysical = 1;
			if (!is_dirty_pte(walker->ptes[level - 1]))
				access &= ~ACC_WRITE_MASK;
			table_gfn = gpte_to_gfn(walker->ptes[level - 1]);
		} else {
			metaphysical = 0;
			table_gfn = walker->table_gfn[level - 2];
		}
		shadow_page = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1,
					       metaphysical, access,
					       shadow_ent, &new_page);
		if (new_page && !metaphysical) {
			/* Re-read the gpte to catch a racing guest update. */
			pt_element_t curr_pte;
			kvm_read_guest(vcpu->kvm, walker->pte_gpa[level - 2],
				       &curr_pte, sizeof(curr_pte));
			if (curr_pte != walker->ptes[level - 2])
				return NULL;
		}
		shadow_addr = __pa(shadow_page->spt);
		/* Non-leaf sptes carry full permissions; leaves restrict. */
		shadow_pte = shadow_addr | PT_PRESENT_MASK | PT_ACCESSED_MASK
			| PT_WRITABLE_MASK | PT_USER_MASK;
		*shadow_ent = shadow_pte;
	}

	mmu_set_spte(vcpu, shadow_ent, access, walker->pte_access & access,
		     user_fault, write_fault,
		     walker->ptes[walker->level-1] & PT_DIRTY_MASK,
		     ptwrite, walker->gfn);

	return shadow_ent;
}
338 | |||
/*
 * Page fault handler. There are several causes for a page fault:
 * - there is no shadow pte for the guest pte
 * - write access through a shadow pte marked read only so that we can set
 *   the dirty bit
 * - write access to a shadow pte marked read only so we can update the page
 *   dirty bitmap, when userspace requests it
 * - mmio access; in this case we will never install a present shadow pte
 * - normal guest page fault due to the guest pte marked not present, not
 *   writable, or not executable
 *
 * Returns: 1 if we need to emulate the instruction, 0 otherwise, or
 *          a negative value on error.
 */
static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
			     u32 error_code)
{
	int write_fault = error_code & PFERR_WRITE_MASK;
	int user_fault = error_code & PFERR_USER_MASK;
	int fetch_fault = error_code & PFERR_FETCH_MASK;
	struct guest_walker walker;
	u64 *shadow_pte;
	int write_pt = 0;
	int r;

	pgprintk("%s: addr %lx err %x\n", __FUNCTION__, addr, error_code);
	kvm_mmu_audit(vcpu, "pre page fault");

	r = mmu_topup_memory_caches(vcpu);
	if (r)
		return r;

	/*
	 * Look up the shadow pte for the faulting address.
	 */
	r = FNAME(walk_addr)(&walker, vcpu, addr, write_fault, user_fault,
			     fetch_fault);

	/*
	 * The page is not mapped by the guest.  Let the guest handle it.
	 */
	if (!r) {
		pgprintk("%s: guest page fault\n", __FUNCTION__);
		inject_page_fault(vcpu, addr, walker.error_code);
		vcpu->arch.last_pt_write_count = 0; /* reset fork detector */
		return 0;
	}

	shadow_pte = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
				  &write_pt);
	/*
	 * NOTE(review): this pgprintk dereferences shadow_pte before the
	 * NULL check below; harmless unless debug printing is enabled —
	 * confirm.
	 */
	pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __FUNCTION__,
		 shadow_pte, *shadow_pte, write_pt);

	if (!write_pt)
		vcpu->arch.last_pt_write_count = 0; /* reset fork detector */

	/*
	 * mmio: emulate if accessible, otherwise its a guest fault.
	 */
	if (shadow_pte && is_io_pte(*shadow_pte))
		return 1;

	++vcpu->stat.pf_fixed;
	kvm_mmu_audit(vcpu, "post page fault (fixed)");

	return write_pt;
}
406 | |||
407 | static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr) | ||
408 | { | ||
409 | struct guest_walker walker; | ||
410 | gpa_t gpa = UNMAPPED_GVA; | ||
411 | int r; | ||
412 | |||
413 | r = FNAME(walk_addr)(&walker, vcpu, vaddr, 0, 0, 0); | ||
414 | |||
415 | if (r) { | ||
416 | gpa = gfn_to_gpa(walker.gfn); | ||
417 | gpa |= vaddr & ~PAGE_MASK; | ||
418 | } | ||
419 | |||
420 | return gpa; | ||
421 | } | ||
422 | |||
/*
 * Pre-populate a freshly shadowed page table: entries backed by
 * present guest ptes are marked "trap" (fault into the shadow MMU),
 * the rest "notrap" so the guest takes those faults directly.
 */
static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu,
				 struct kvm_mmu_page *sp)
{
	int i, offset = 0;
	pt_element_t *gpt;
	struct page *page;

	if (sp->role.metaphysical
	    || (PTTYPE == 32 && sp->role.level > PT_PAGE_TABLE_LEVEL)) {
		/* No guest table to mirror: mark everything trapping. */
		nonpaging_prefetch_page(vcpu, sp);
		return;
	}

	/* 32-bit guests: each shadow page covers one quadrant of the gpt. */
	if (PTTYPE == 32)
		offset = sp->role.quadrant << PT64_LEVEL_BITS;
	page = gfn_to_page(vcpu->kvm, sp->gfn);
	gpt = kmap_atomic(page, KM_USER0);
	for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
		if (is_present_pte(gpt[offset + i]))
			sp->spt[i] = shadow_trap_nonpresent_pte;
		else
			sp->spt[i] = shadow_notrap_nonpresent_pte;
	kunmap_atomic(gpt, KM_USER0);
	kvm_release_page_clean(page);
}
448 | |||
449 | #undef pt_element_t | ||
450 | #undef guest_walker | ||
451 | #undef FNAME | ||
452 | #undef PT_BASE_ADDR_MASK | ||
453 | #undef PT_INDEX | ||
454 | #undef SHADOW_PT_INDEX | ||
455 | #undef PT_LEVEL_MASK | ||
456 | #undef PT_DIR_BASE_ADDR_MASK | ||
457 | #undef PT_LEVEL_BITS | ||
458 | #undef PT_MAX_FULL_LEVELS | ||
459 | #undef gpte_to_gfn | ||
460 | #undef gpte_to_gfn_pde | ||
461 | #undef CMPXCHG | ||
diff --git a/arch/x86/kvm/segment_descriptor.h b/arch/x86/kvm/segment_descriptor.h new file mode 100644 index 000000000000..56fc4c873389 --- /dev/null +++ b/arch/x86/kvm/segment_descriptor.h | |||
@@ -0,0 +1,29 @@ | |||
#ifndef __SEGMENT_DESCRIPTOR_H
#define __SEGMENT_DESCRIPTOR_H

/*
 * An 8-byte x86 segment descriptor as stored in the GDT/LDT,
 * decomposed into bit-fields.
 */
struct segment_descriptor {
	u16 limit_low;		/* limit bits 15:0 */
	u16 base_low;		/* base bits 15:0 */
	u8 base_mid;		/* base bits 23:16 */
	u8 type : 4;		/* segment type */
	u8 system : 1;		/* S bit: 0 = system, 1 = code/data */
	u8 dpl : 2;		/* descriptor privilege level */
	u8 present : 1;		/* P bit */
	u8 limit_high : 4;	/* limit bits 19:16 */
	u8 avl : 1;		/* available for software use */
	u8 long_mode : 1;	/* L bit: 64-bit code segment */
	u8 default_op : 1;	/* D/B bit */
	u8 granularity : 1;	/* G bit: limit in 4K units when set */
	u8 base_high;		/* base bits 31:24 */
} __attribute__((packed));

#ifdef CONFIG_X86_64
/* LDT or TSS descriptor in the GDT. 16 bytes. */
struct segment_descriptor_64 {
	struct segment_descriptor s;
	u32 base_higher;	/* base bits 63:32 */
	u32 pad_zero;		/* must be zero */
};

#endif
#endif
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c new file mode 100644 index 000000000000..3d4b71a94440 --- /dev/null +++ b/arch/x86/kvm/svm.c | |||
@@ -0,0 +1,1725 @@ | |||
1 | /* | ||
2 | * Kernel-based Virtual Machine driver for Linux | ||
3 | * | ||
4 | * AMD SVM support | ||
5 | * | ||
6 | * Copyright (C) 2006 Qumranet, Inc. | ||
7 | * | ||
8 | * Authors: | ||
9 | * Yaniv Kamay <yaniv@qumranet.com> | ||
10 | * Avi Kivity <avi@qumranet.com> | ||
11 | * | ||
12 | * This work is licensed under the terms of the GNU GPL, version 2. See | ||
13 | * the COPYING file in the top-level directory. | ||
14 | * | ||
15 | */ | ||
16 | #include <linux/kvm_host.h> | ||
17 | |||
18 | #include "kvm_svm.h" | ||
19 | #include "irq.h" | ||
20 | #include "mmu.h" | ||
21 | |||
22 | #include <linux/module.h> | ||
23 | #include <linux/kernel.h> | ||
24 | #include <linux/vmalloc.h> | ||
25 | #include <linux/highmem.h> | ||
26 | #include <linux/sched.h> | ||
27 | |||
28 | #include <asm/desc.h> | ||
29 | |||
30 | MODULE_AUTHOR("Qumranet"); | ||
31 | MODULE_LICENSE("GPL"); | ||
32 | |||
#define IOPM_ALLOC_ORDER 2		/* I/O permission map: 4 pages */
#define MSRPM_ALLOC_ORDER 1		/* MSR permission map: 2 pages */

#define DB_VECTOR 1
#define UD_VECTOR 6
#define GP_VECTOR 13

#define DR7_GD_MASK (1 << 13)		/* DR7 general-detect enable */
#define DR6_BD_MASK (1 << 13)		/* DR6 debug-register access detected */

#define SEG_TYPE_LDT 2
#define SEG_TYPE_BUSY_TSS16 3

/* SVM feature bits reported in CPUID 0x8000000a EDX. */
#define SVM_FEATURE_NPT  (1 << 0)
#define SVM_FEATURE_LBRV (1 << 1)
#define SVM_FEATURE_SVML (1 << 2)
/* Backward-compatible alias for the historical misspelling. */
#define SVM_DEATURE_SVML SVM_FEATURE_SVML

static void kvm_reput_irq(struct vcpu_svm *svm);
51 | |||
52 | static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu) | ||
53 | { | ||
54 | return container_of(vcpu, struct vcpu_svm, vcpu); | ||
55 | } | ||
56 | |||
/*
 * Physical addresses of the global I/O and MSR permission maps that
 * every VMCB points at.
 * NOTE(review): these appear to be used only in this file — confirm
 * no external reference before making them static.
 */
unsigned long iopm_base;
unsigned long msrpm_base;

/* 16-byte LDT/TSS descriptor layout as found in the host GDT. */
struct kvm_ldttss_desc {
	u16 limit0;
	u16 base0;
	unsigned base1 : 8, type : 5, dpl : 2, p : 1;
	unsigned limit1 : 4, zero0 : 3, g : 1, base2 : 8;
	u32 base3;
	u32 zero1;
} __attribute__((packed));

/* Per-cpu SVM state: ASID allocator plus the host state-save area. */
struct svm_cpu_data {
	int cpu;

	u64 asid_generation;	/* bumped each time the ASID space wraps */
	u32 max_asid;		/* highest ASID usable on this cpu */
	u32 next_asid;		/* next ASID to hand out */
	struct kvm_ldttss_desc *tss_desc;

	struct page *save_area;	/* written to MSR_VM_HSAVE_PA */
};
79 | |||
static DEFINE_PER_CPU(struct svm_cpu_data *, svm_data);
/* SVM_FEATURE_* bits read from CPUID 0x8000000a EDX at enable time. */
static uint32_t svm_features;

struct svm_init_data {
	int cpu;
	int r;
};

/* Base MSR numbers of the three ranges covered by the permission map. */
static u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000};

#define NUM_MSR_MAPS ARRAY_SIZE(msrpm_ranges)
#define MSRS_RANGE_SIZE 2048
/* Each MSR uses two bits (read/write intercept) in its 2K map chunk. */
#define MSRS_IN_RANGE (MSRS_RANGE_SIZE * 8 / 2)

#define MAX_INST_SIZE 15	/* longest legal x86 instruction, bytes */
95 | |||
/* Return the requested SVM_FEATURE_* bit(s) if supported, else 0. */
static inline u32 svm_has(u32 feat)
{
	return svm_features & feat;
}
100 | |||
101 | static inline u8 pop_irq(struct kvm_vcpu *vcpu) | ||
102 | { | ||
103 | int word_index = __ffs(vcpu->arch.irq_summary); | ||
104 | int bit_index = __ffs(vcpu->arch.irq_pending[word_index]); | ||
105 | int irq = word_index * BITS_PER_LONG + bit_index; | ||
106 | |||
107 | clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]); | ||
108 | if (!vcpu->arch.irq_pending[word_index]) | ||
109 | clear_bit(word_index, &vcpu->arch.irq_summary); | ||
110 | return irq; | ||
111 | } | ||
112 | |||
113 | static inline void push_irq(struct kvm_vcpu *vcpu, u8 irq) | ||
114 | { | ||
115 | set_bit(irq, vcpu->arch.irq_pending); | ||
116 | set_bit(irq / BITS_PER_LONG, &vcpu->arch.irq_summary); | ||
117 | } | ||
118 | |||
/* Clear the global interrupt flag (SVM CLGI instruction). */
static inline void clgi(void)
{
	asm volatile (SVM_CLGI);
}

/* Set the global interrupt flag (SVM STGI instruction). */
static inline void stgi(void)
{
	asm volatile (SVM_STGI);
}

/* Invalidate the TLB mapping of addr for the given guest ASID. */
static inline void invlpga(unsigned long addr, u32 asid)
{
	asm volatile (SVM_INVLPGA :: "a"(addr), "c"(asid));
}

/* Read the host's CR2 (page-fault linear address). */
static inline unsigned long kvm_read_cr2(void)
{
	unsigned long cr2;

	asm volatile ("mov %%cr2, %0" : "=r" (cr2));
	return cr2;
}

/* Write the host's CR2. */
static inline void kvm_write_cr2(unsigned long val)
{
	asm volatile ("mov %0, %%cr2" :: "r" (val));
}

/* Read debug status register DR6. */
static inline unsigned long read_dr6(void)
{
	unsigned long dr6;

	asm volatile ("mov %%dr6, %0" : "=r" (dr6));
	return dr6;
}

/* Write debug status register DR6. */
static inline void write_dr6(unsigned long val)
{
	asm volatile ("mov %0, %%dr6" :: "r" (val));
}

/* Read debug control register DR7. */
static inline unsigned long read_dr7(void)
{
	unsigned long dr7;

	asm volatile ("mov %%dr7, %0" : "=r" (dr7));
	return dr7;
}

/* Write debug control register DR7. */
static inline void write_dr7(unsigned long val)
{
	asm volatile ("mov %0, %%dr7" :: "r" (val));
}
172 | |||
/*
 * Make this vcpu's ASID generation stale so that a fresh ASID is
 * allocated before the next guest entry.
 */
static inline void force_new_asid(struct kvm_vcpu *vcpu)
{
	to_svm(vcpu)->asid_generation--;
}

/* On SVM, flushing the guest TLB is done by switching to a new ASID. */
static inline void flush_guest_tlb(struct kvm_vcpu *vcpu)
{
	force_new_asid(vcpu);
}
182 | |||
183 | static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer) | ||
184 | { | ||
185 | if (!(efer & EFER_LMA)) | ||
186 | efer &= ~EFER_LME; | ||
187 | |||
188 | to_svm(vcpu)->vmcb->save.efer = efer | MSR_EFER_SVME_MASK; | ||
189 | vcpu->arch.shadow_efer = efer; | ||
190 | } | ||
191 | |||
192 | static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, | ||
193 | bool has_error_code, u32 error_code) | ||
194 | { | ||
195 | struct vcpu_svm *svm = to_svm(vcpu); | ||
196 | |||
197 | svm->vmcb->control.event_inj = nr | ||
198 | | SVM_EVTINJ_VALID | ||
199 | | (has_error_code ? SVM_EVTINJ_VALID_ERR : 0) | ||
200 | | SVM_EVTINJ_TYPE_EXEPT; | ||
201 | svm->vmcb->control.event_inj_err = error_code; | ||
202 | } | ||
203 | |||
204 | static bool svm_exception_injected(struct kvm_vcpu *vcpu) | ||
205 | { | ||
206 | struct vcpu_svm *svm = to_svm(vcpu); | ||
207 | |||
208 | return !(svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_VALID); | ||
209 | } | ||
210 | |||
211 | static int is_external_interrupt(u32 info) | ||
212 | { | ||
213 | info &= SVM_EVTINJ_TYPE_MASK | SVM_EVTINJ_VALID; | ||
214 | return info == (SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR); | ||
215 | } | ||
216 | |||
217 | static void skip_emulated_instruction(struct kvm_vcpu *vcpu) | ||
218 | { | ||
219 | struct vcpu_svm *svm = to_svm(vcpu); | ||
220 | |||
221 | if (!svm->next_rip) { | ||
222 | printk(KERN_DEBUG "%s: NOP\n", __FUNCTION__); | ||
223 | return; | ||
224 | } | ||
225 | if (svm->next_rip - svm->vmcb->save.rip > MAX_INST_SIZE) | ||
226 | printk(KERN_ERR "%s: ip 0x%llx next 0x%llx\n", | ||
227 | __FUNCTION__, | ||
228 | svm->vmcb->save.rip, | ||
229 | svm->next_rip); | ||
230 | |||
231 | vcpu->arch.rip = svm->vmcb->save.rip = svm->next_rip; | ||
232 | svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK; | ||
233 | |||
234 | vcpu->arch.interrupt_window_open = 1; | ||
235 | } | ||
236 | |||
/*
 * Probe whether this cpu supports SVM: AMD vendor, CPUID leaf
 * 0x8000000a reachable, and the SVM feature bit set in leaf
 * 0x80000001 ECX.  Returns 1 if usable, 0 otherwise.
 */
static int has_svm(void)
{
	uint32_t eax, ebx, ecx, edx;

	if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) {
		printk(KERN_INFO "has_svm: not amd\n");
		return 0;
	}

	cpuid(0x80000000, &eax, &ebx, &ecx, &edx);
	if (eax < SVM_CPUID_FUNC) {
		printk(KERN_INFO "has_svm: can't execute cpuid_8000000a\n");
		return 0;
	}

	cpuid(0x80000001, &eax, &ebx, &ecx, &edx);
	if (!(ecx & (1 << SVM_CPUID_FEATURE_SHIFT))) {
		printk(KERN_DEBUG "has_svm: svm not available\n");
		return 0;
	}
	return 1;
}
259 | |||
260 | static void svm_hardware_disable(void *garbage) | ||
261 | { | ||
262 | struct svm_cpu_data *svm_data | ||
263 | = per_cpu(svm_data, raw_smp_processor_id()); | ||
264 | |||
265 | if (svm_data) { | ||
266 | uint64_t efer; | ||
267 | |||
268 | wrmsrl(MSR_VM_HSAVE_PA, 0); | ||
269 | rdmsrl(MSR_EFER, efer); | ||
270 | wrmsrl(MSR_EFER, efer & ~MSR_EFER_SVME_MASK); | ||
271 | per_cpu(svm_data, raw_smp_processor_id()) = NULL; | ||
272 | __free_page(svm_data->save_area); | ||
273 | kfree(svm_data); | ||
274 | } | ||
275 | } | ||
276 | |||
277 | static void svm_hardware_enable(void *garbage) | ||
278 | { | ||
279 | |||
280 | struct svm_cpu_data *svm_data; | ||
281 | uint64_t efer; | ||
282 | #ifdef CONFIG_X86_64 | ||
283 | struct desc_ptr gdt_descr; | ||
284 | #else | ||
285 | struct desc_ptr gdt_descr; | ||
286 | #endif | ||
287 | struct desc_struct *gdt; | ||
288 | int me = raw_smp_processor_id(); | ||
289 | |||
290 | if (!has_svm()) { | ||
291 | printk(KERN_ERR "svm_cpu_init: err EOPNOTSUPP on %d\n", me); | ||
292 | return; | ||
293 | } | ||
294 | svm_data = per_cpu(svm_data, me); | ||
295 | |||
296 | if (!svm_data) { | ||
297 | printk(KERN_ERR "svm_cpu_init: svm_data is NULL on %d\n", | ||
298 | me); | ||
299 | return; | ||
300 | } | ||
301 | |||
302 | svm_data->asid_generation = 1; | ||
303 | svm_data->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1; | ||
304 | svm_data->next_asid = svm_data->max_asid + 1; | ||
305 | svm_features = cpuid_edx(SVM_CPUID_FUNC); | ||
306 | |||
307 | asm volatile ("sgdt %0" : "=m"(gdt_descr)); | ||
308 | gdt = (struct desc_struct *)gdt_descr.address; | ||
309 | svm_data->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS); | ||
310 | |||
311 | rdmsrl(MSR_EFER, efer); | ||
312 | wrmsrl(MSR_EFER, efer | MSR_EFER_SVME_MASK); | ||
313 | |||
314 | wrmsrl(MSR_VM_HSAVE_PA, | ||
315 | page_to_pfn(svm_data->save_area) << PAGE_SHIFT); | ||
316 | } | ||
317 | |||
318 | static int svm_cpu_init(int cpu) | ||
319 | { | ||
320 | struct svm_cpu_data *svm_data; | ||
321 | int r; | ||
322 | |||
323 | svm_data = kzalloc(sizeof(struct svm_cpu_data), GFP_KERNEL); | ||
324 | if (!svm_data) | ||
325 | return -ENOMEM; | ||
326 | svm_data->cpu = cpu; | ||
327 | svm_data->save_area = alloc_page(GFP_KERNEL); | ||
328 | r = -ENOMEM; | ||
329 | if (!svm_data->save_area) | ||
330 | goto err_1; | ||
331 | |||
332 | per_cpu(svm_data, cpu) = svm_data; | ||
333 | |||
334 | return 0; | ||
335 | |||
336 | err_1: | ||
337 | kfree(svm_data); | ||
338 | return r; | ||
339 | |||
340 | } | ||
341 | |||
342 | static void set_msr_interception(u32 *msrpm, unsigned msr, | ||
343 | int read, int write) | ||
344 | { | ||
345 | int i; | ||
346 | |||
347 | for (i = 0; i < NUM_MSR_MAPS; i++) { | ||
348 | if (msr >= msrpm_ranges[i] && | ||
349 | msr < msrpm_ranges[i] + MSRS_IN_RANGE) { | ||
350 | u32 msr_offset = (i * MSRS_IN_RANGE + msr - | ||
351 | msrpm_ranges[i]) * 2; | ||
352 | |||
353 | u32 *base = msrpm + (msr_offset / 32); | ||
354 | u32 msr_shift = msr_offset % 32; | ||
355 | u32 mask = ((write) ? 0 : 2) | ((read) ? 0 : 1); | ||
356 | *base = (*base & ~(0x3 << msr_shift)) | | ||
357 | (mask << msr_shift); | ||
358 | return; | ||
359 | } | ||
360 | } | ||
361 | BUG(); | ||
362 | } | ||
363 | |||
/*
 * Module-load-time setup: build the global I/O and MSR permission
 * maps (default: intercept everything, then whitelist a few MSRs and
 * the PC debug port) and initialise per-cpu SVM data for every online
 * cpu.  Returns 0 or a negative errno.
 *
 * NOTE(review): if svm_cpu_init() fails on a later cpu, the per-cpu
 * data already allocated for earlier cpus is not freed here — verify
 * that the caller's error path (hardware_disable) covers it.
 */
static __init int svm_hardware_setup(void)
{
	int cpu;
	struct page *iopm_pages;
	struct page *msrpm_pages;
	void *iopm_va, *msrpm_va;
	int r;

	iopm_pages = alloc_pages(GFP_KERNEL, IOPM_ALLOC_ORDER);

	if (!iopm_pages)
		return -ENOMEM;

	/* All-ones = intercept every I/O port by default. */
	iopm_va = page_address(iopm_pages);
	memset(iopm_va, 0xff, PAGE_SIZE * (1 << IOPM_ALLOC_ORDER));
	clear_bit(0x80, iopm_va); /* allow direct access to PC debug port */
	iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT;


	msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER);

	r = -ENOMEM;
	if (!msrpm_pages)
		goto err_1;

	/* All-ones = intercept every MSR access by default. */
	msrpm_va = page_address(msrpm_pages);
	memset(msrpm_va, 0xff, PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER));
	msrpm_base = page_to_pfn(msrpm_pages) << PAGE_SHIFT;

	/* Pass through MSRs that are save/restored around each VMRUN. */
#ifdef CONFIG_X86_64
	set_msr_interception(msrpm_va, MSR_GS_BASE, 1, 1);
	set_msr_interception(msrpm_va, MSR_FS_BASE, 1, 1);
	set_msr_interception(msrpm_va, MSR_KERNEL_GS_BASE, 1, 1);
	set_msr_interception(msrpm_va, MSR_LSTAR, 1, 1);
	set_msr_interception(msrpm_va, MSR_CSTAR, 1, 1);
	set_msr_interception(msrpm_va, MSR_SYSCALL_MASK, 1, 1);
#endif
	set_msr_interception(msrpm_va, MSR_K6_STAR, 1, 1);
	set_msr_interception(msrpm_va, MSR_IA32_SYSENTER_CS, 1, 1);
	set_msr_interception(msrpm_va, MSR_IA32_SYSENTER_ESP, 1, 1);
	set_msr_interception(msrpm_va, MSR_IA32_SYSENTER_EIP, 1, 1);

	for_each_online_cpu(cpu) {
		r = svm_cpu_init(cpu);
		if (r)
			goto err_2;
	}
	return 0;

err_2:
	__free_pages(msrpm_pages, MSRPM_ALLOC_ORDER);
	msrpm_base = 0;
err_1:
	__free_pages(iopm_pages, IOPM_ALLOC_ORDER);
	iopm_base = 0;
	return r;
}
421 | |||
422 | static __exit void svm_hardware_unsetup(void) | ||
423 | { | ||
424 | __free_pages(pfn_to_page(msrpm_base >> PAGE_SHIFT), MSRPM_ALLOC_ORDER); | ||
425 | __free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT), IOPM_ALLOC_ORDER); | ||
426 | iopm_base = msrpm_base = 0; | ||
427 | } | ||
428 | |||
429 | static void init_seg(struct vmcb_seg *seg) | ||
430 | { | ||
431 | seg->selector = 0; | ||
432 | seg->attrib = SVM_SELECTOR_P_MASK | SVM_SELECTOR_S_MASK | | ||
433 | SVM_SELECTOR_WRITE_MASK; /* Read/Write Data Segment */ | ||
434 | seg->limit = 0xffff; | ||
435 | seg->base = 0; | ||
436 | } | ||
437 | |||
438 | static void init_sys_seg(struct vmcb_seg *seg, uint32_t type) | ||
439 | { | ||
440 | seg->selector = 0; | ||
441 | seg->attrib = SVM_SELECTOR_P_MASK | type; | ||
442 | seg->limit = 0xffff; | ||
443 | seg->base = 0; | ||
444 | } | ||
445 | |||
/*
 * Initialise a VMCB to a power-on-reset-like guest state with the
 * intercept configuration KVM needs: all control-register and most
 * debug-register accesses, #PF/#UD, and the privileged SVM/system
 * instructions are intercepted.
 */
static void init_vmcb(struct vmcb *vmcb)
{
	struct vmcb_control_area *control = &vmcb->control;
	struct vmcb_save_area *save = &vmcb->save;

	control->intercept_cr_read = 	INTERCEPT_CR0_MASK |
					INTERCEPT_CR3_MASK |
					INTERCEPT_CR4_MASK |
					INTERCEPT_CR8_MASK;

	control->intercept_cr_write = 	INTERCEPT_CR0_MASK |
					INTERCEPT_CR3_MASK |
					INTERCEPT_CR4_MASK |
					INTERCEPT_CR8_MASK;

	control->intercept_dr_read = 	INTERCEPT_DR0_MASK |
					INTERCEPT_DR1_MASK |
					INTERCEPT_DR2_MASK |
					INTERCEPT_DR3_MASK;

	control->intercept_dr_write = 	INTERCEPT_DR0_MASK |
					INTERCEPT_DR1_MASK |
					INTERCEPT_DR2_MASK |
					INTERCEPT_DR3_MASK |
					INTERCEPT_DR5_MASK |
					INTERCEPT_DR7_MASK;

	control->intercept_exceptions = (1 << PF_VECTOR) |
					(1 << UD_VECTOR);


	control->intercept = 	(1ULL << INTERCEPT_INTR) |
				(1ULL << INTERCEPT_NMI) |
				(1ULL << INTERCEPT_SMI) |
		/*
		 * selective cr0 intercept bug?
		 *    	0:   0f 22 d8                mov    %eax,%cr3
		 *	3:   0f 20 c0                mov    %cr0,%eax
		 *	6:   0d 00 00 00 80          or     $0x80000000,%eax
		 *	b:   0f 22 c0                mov    %eax,%cr0
		 * set cr3 ->interception
		 * get cr0 ->interception
		 * set cr0 -> no interception
		 */
		/*              (1ULL << INTERCEPT_SELECTIVE_CR0) | */
				(1ULL << INTERCEPT_CPUID) |
				(1ULL << INTERCEPT_INVD) |
				(1ULL << INTERCEPT_HLT) |
				(1ULL << INTERCEPT_INVLPGA) |
				(1ULL << INTERCEPT_IOIO_PROT) |
				(1ULL << INTERCEPT_MSR_PROT) |
				(1ULL << INTERCEPT_TASK_SWITCH) |
				(1ULL << INTERCEPT_SHUTDOWN) |
				(1ULL << INTERCEPT_VMRUN) |
				(1ULL << INTERCEPT_VMMCALL) |
				(1ULL << INTERCEPT_VMLOAD) |
				(1ULL << INTERCEPT_VMSAVE) |
				(1ULL << INTERCEPT_STGI) |
				(1ULL << INTERCEPT_CLGI) |
				(1ULL << INTERCEPT_SKINIT) |
				(1ULL << INTERCEPT_WBINVD) |
				(1ULL << INTERCEPT_MONITOR) |
				(1ULL << INTERCEPT_MWAIT);

	control->iopm_base_pa = iopm_base;
	control->msrpm_base_pa = msrpm_base;
	control->tsc_offset = 0;
	/* Host interrupts stay masked by the host, not the guest RFLAGS. */
	control->int_ctl = V_INTR_MASKING_MASK;

	init_seg(&save->es);
	init_seg(&save->ss);
	init_seg(&save->ds);
	init_seg(&save->fs);
	init_seg(&save->gs);

	/* Reset-vector cs:ip, as a real cpu would after power-on. */
	save->cs.selector = 0xf000;
	/* Executable/Readable Code Segment */
	save->cs.attrib = SVM_SELECTOR_READ_MASK | SVM_SELECTOR_P_MASK |
		SVM_SELECTOR_S_MASK | SVM_SELECTOR_CODE_MASK;
	save->cs.limit = 0xffff;
	/*
	 * cs.base should really be 0xffff0000, but vmx can't handle that, so
	 * be consistent with it.
	 *
	 * Replace when we have real mode working for vmx.
	 */
	save->cs.base = 0xf0000;

	save->gdtr.limit = 0xffff;
	save->idtr.limit = 0xffff;

	init_sys_seg(&save->ldtr, SEG_TYPE_LDT);
	init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16);

	save->efer = MSR_EFER_SVME_MASK;
	save->dr6 = 0xffff0ff0;
	save->dr7 = 0x400;
	save->rflags = 2;
	save->rip = 0x0000fff0;

	/*
	 * cr0 val on cpu init should be 0x60000010, we enable cpu
	 * cache by default. the orderly way is to enable cache in bios.
	 */
	save->cr0 = 0x00000010 | X86_CR0_PG | X86_CR0_WP;
	save->cr4 = X86_CR4_PAE;
	/* rdx = ?? */
}
554 | |||
/*
 * Reset a vcpu by rebuilding its VMCB.  Application processors
 * (vcpu_id != 0) start at their SIPI vector instead of the BSP's
 * reset vector.  Always returns 0.
 */
static int svm_vcpu_reset(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	init_vmcb(svm->vmcb);

	if (vcpu->vcpu_id != 0) {
		svm->vmcb->save.rip = 0;
		/* cs.base = vector << 12 mirrors real-mode SIPI startup. */
		svm->vmcb->save.cs.base = svm->vcpu.arch.sipi_vector << 12;
		svm->vmcb->save.cs.selector = svm->vcpu.arch.sipi_vector << 8;
	}

	return 0;
}
569 | |||
570 | static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) | ||
571 | { | ||
572 | struct vcpu_svm *svm; | ||
573 | struct page *page; | ||
574 | int err; | ||
575 | |||
576 | svm = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); | ||
577 | if (!svm) { | ||
578 | err = -ENOMEM; | ||
579 | goto out; | ||
580 | } | ||
581 | |||
582 | err = kvm_vcpu_init(&svm->vcpu, kvm, id); | ||
583 | if (err) | ||
584 | goto free_svm; | ||
585 | |||
586 | page = alloc_page(GFP_KERNEL); | ||
587 | if (!page) { | ||
588 | err = -ENOMEM; | ||
589 | goto uninit; | ||
590 | } | ||
591 | |||
592 | svm->vmcb = page_address(page); | ||
593 | clear_page(svm->vmcb); | ||
594 | svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT; | ||
595 | svm->asid_generation = 0; | ||
596 | memset(svm->db_regs, 0, sizeof(svm->db_regs)); | ||
597 | init_vmcb(svm->vmcb); | ||
598 | |||
599 | fx_init(&svm->vcpu); | ||
600 | svm->vcpu.fpu_active = 1; | ||
601 | svm->vcpu.arch.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; | ||
602 | if (svm->vcpu.vcpu_id == 0) | ||
603 | svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP; | ||
604 | |||
605 | return &svm->vcpu; | ||
606 | |||
607 | uninit: | ||
608 | kvm_vcpu_uninit(&svm->vcpu); | ||
609 | free_svm: | ||
610 | kmem_cache_free(kvm_vcpu_cache, svm); | ||
611 | out: | ||
612 | return ERR_PTR(err); | ||
613 | } | ||
614 | |||
615 | static void svm_free_vcpu(struct kvm_vcpu *vcpu) | ||
616 | { | ||
617 | struct vcpu_svm *svm = to_svm(vcpu); | ||
618 | |||
619 | __free_page(pfn_to_page(svm->vmcb_pa >> PAGE_SHIFT)); | ||
620 | kvm_vcpu_uninit(vcpu); | ||
621 | kmem_cache_free(kvm_vcpu_cache, svm); | ||
622 | } | ||
623 | |||
/*
 * Prepare a vcpu to run on cpu: compensate the VMCB TSC offset when
 * the vcpu migrates between cpus (so the guest TSC never goes
 * backwards) and save the host MSRs the guest may clobber.
 */
static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	int i;

	if (unlikely(cpu != vcpu->cpu)) {
		u64 tsc_this, delta;

		/*
		 * Make sure that the guest sees a monotonically
		 * increasing TSC.
		 */
		rdtscll(tsc_this);
		delta = vcpu->arch.host_tsc - tsc_this;
		svm->vmcb->control.tsc_offset += delta;
		vcpu->cpu = cpu;
		kvm_migrate_apic_timer(vcpu);
	}

	/* Stash the host values; restored in svm_vcpu_put(). */
	for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
		rdmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
}
646 | |||
/*
 * Undo svm_vcpu_load(): restore the saved host MSRs and record the
 * host TSC for migration compensation on the next load.
 */
static void svm_vcpu_put(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	int i;

	++vcpu->stat.host_state_reload;
	for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
		wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);

	rdtscll(vcpu->arch.host_tsc);
}
658 | |||
/* Intentionally empty: SVM keeps no decachable vcpu state here. */
static void svm_vcpu_decache(struct kvm_vcpu *vcpu)
{
}
662 | |||
663 | static void svm_cache_regs(struct kvm_vcpu *vcpu) | ||
664 | { | ||
665 | struct vcpu_svm *svm = to_svm(vcpu); | ||
666 | |||
667 | vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax; | ||
668 | vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp; | ||
669 | vcpu->arch.rip = svm->vmcb->save.rip; | ||
670 | } | ||
671 | |||
672 | static void svm_decache_regs(struct kvm_vcpu *vcpu) | ||
673 | { | ||
674 | struct vcpu_svm *svm = to_svm(vcpu); | ||
675 | svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX]; | ||
676 | svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP]; | ||
677 | svm->vmcb->save.rip = vcpu->arch.rip; | ||
678 | } | ||
679 | |||
/* Read the guest RFLAGS from the VMCB save area. */
static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
{
	return to_svm(vcpu)->vmcb->save.rflags;
}

/* Write the guest RFLAGS into the VMCB save area. */
static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
{
	to_svm(vcpu)->vmcb->save.rflags = rflags;
}
689 | |||
/*
 * Map a VCPU_SREG_* index to the corresponding VMCB segment register.
 * BUGs on an unknown index; the trailing return only placates the
 * compiler.
 */
static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg)
{
	struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;

	switch (seg) {
	case VCPU_SREG_CS: return &save->cs;
	case VCPU_SREG_DS: return &save->ds;
	case VCPU_SREG_ES: return &save->es;
	case VCPU_SREG_FS: return &save->fs;
	case VCPU_SREG_GS: return &save->gs;
	case VCPU_SREG_SS: return &save->ss;
	case VCPU_SREG_TR: return &save->tr;
	case VCPU_SREG_LDTR: return &save->ldtr;
	}
	BUG();
	return NULL;
}
707 | |||
708 | static u64 svm_get_segment_base(struct kvm_vcpu *vcpu, int seg) | ||
709 | { | ||
710 | struct vmcb_seg *s = svm_seg(vcpu, seg); | ||
711 | |||
712 | return s->base; | ||
713 | } | ||
714 | |||
/*
 * Unpack a VMCB segment register into the generic kvm_segment form,
 * extracting the individual attribute bits from the packed attrib
 * field.  A non-present segment is reported as unusable.
 */
static void svm_get_segment(struct kvm_vcpu *vcpu,
			    struct kvm_segment *var, int seg)
{
	struct vmcb_seg *s = svm_seg(vcpu, seg);

	var->base = s->base;
	var->limit = s->limit;
	var->selector = s->selector;
	var->type = s->attrib & SVM_SELECTOR_TYPE_MASK;
	var->s = (s->attrib >> SVM_SELECTOR_S_SHIFT) & 1;
	var->dpl = (s->attrib >> SVM_SELECTOR_DPL_SHIFT) & 3;
	var->present = (s->attrib >> SVM_SELECTOR_P_SHIFT) & 1;
	var->avl = (s->attrib >> SVM_SELECTOR_AVL_SHIFT) & 1;
	var->l = (s->attrib >> SVM_SELECTOR_L_SHIFT) & 1;
	var->db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1;
	var->g = (s->attrib >> SVM_SELECTOR_G_SHIFT) & 1;
	var->unusable = !var->present;
}
733 | |||
734 | static void svm_get_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) | ||
735 | { | ||
736 | struct vcpu_svm *svm = to_svm(vcpu); | ||
737 | |||
738 | dt->limit = svm->vmcb->save.idtr.limit; | ||
739 | dt->base = svm->vmcb->save.idtr.base; | ||
740 | } | ||
741 | |||
742 | static void svm_set_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) | ||
743 | { | ||
744 | struct vcpu_svm *svm = to_svm(vcpu); | ||
745 | |||
746 | svm->vmcb->save.idtr.limit = dt->limit; | ||
747 | svm->vmcb->save.idtr.base = dt->base ; | ||
748 | } | ||
749 | |||
750 | static void svm_get_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) | ||
751 | { | ||
752 | struct vcpu_svm *svm = to_svm(vcpu); | ||
753 | |||
754 | dt->limit = svm->vmcb->save.gdtr.limit; | ||
755 | dt->base = svm->vmcb->save.gdtr.base; | ||
756 | } | ||
757 | |||
758 | static void svm_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) | ||
759 | { | ||
760 | struct vcpu_svm *svm = to_svm(vcpu); | ||
761 | |||
762 | svm->vmcb->save.gdtr.limit = dt->limit; | ||
763 | svm->vmcb->save.gdtr.base = dt->base ; | ||
764 | } | ||
765 | |||
/* Intentionally empty: SVM does not shadow cr4 bits outside the VMCB. */
static void svm_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
{
}
769 | |||
/*
 * Update the guest's cr0, handling long-mode activation/deactivation
 * (LMA tracks PG while LME is set) and re-arming the FPU when the
 * guest clears TS.  The real cr0 always runs with PG and WP forced on
 * and caching enabled (CD/NW cleared) for the shadow-paging setup.
 */
static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
	struct vcpu_svm *svm = to_svm(vcpu);

#ifdef CONFIG_X86_64
	if (vcpu->arch.shadow_efer & EFER_LME) {
		/* Enabling paging with LME set activates long mode. */
		if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
			vcpu->arch.shadow_efer |= EFER_LMA;
			svm->vmcb->save.efer |= EFER_LMA | EFER_LME;
		}

		/* Disabling paging deactivates long mode again. */
		if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) {
			vcpu->arch.shadow_efer &= ~EFER_LMA;
			svm->vmcb->save.efer &= ~(EFER_LMA | EFER_LME);
		}
	}
#endif
	/* Guest cleared TS: stop intercepting #NM and hand back the FPU. */
	if ((vcpu->arch.cr0 & X86_CR0_TS) && !(cr0 & X86_CR0_TS)) {
		svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR);
		vcpu->fpu_active = 1;
	}

	vcpu->arch.cr0 = cr0;
	cr0 |= X86_CR0_PG | X86_CR0_WP;
	cr0 &= ~(X86_CR0_CD | X86_CR0_NW);
	svm->vmcb->save.cr0 = cr0;
}
797 | |||
798 | static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) | ||
799 | { | ||
800 | vcpu->arch.cr4 = cr4; | ||
801 | to_svm(vcpu)->vmcb->save.cr4 = cr4 | X86_CR4_PAE; | ||
802 | } | ||
803 | |||
/*
 * Pack a generic kvm_segment into the VMCB segment register, rebuilding
 * the attrib bit-field; an unusable segment gets attrib 0.  Loading CS
 * also refreshes the saved CPL from the new descriptor's DPL.
 */
static void svm_set_segment(struct kvm_vcpu *vcpu,
			    struct kvm_segment *var, int seg)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	struct vmcb_seg *s = svm_seg(vcpu, seg);

	s->base = var->base;
	s->limit = var->limit;
	s->selector = var->selector;
	if (var->unusable)
		s->attrib = 0;
	else {
		s->attrib = (var->type & SVM_SELECTOR_TYPE_MASK);
		s->attrib |= (var->s & 1) << SVM_SELECTOR_S_SHIFT;
		s->attrib |= (var->dpl & 3) << SVM_SELECTOR_DPL_SHIFT;
		s->attrib |= (var->present & 1) << SVM_SELECTOR_P_SHIFT;
		s->attrib |= (var->avl & 1) << SVM_SELECTOR_AVL_SHIFT;
		s->attrib |= (var->l & 1) << SVM_SELECTOR_L_SHIFT;
		s->attrib |= (var->db & 1) << SVM_SELECTOR_DB_SHIFT;
		s->attrib |= (var->g & 1) << SVM_SELECTOR_G_SHIFT;
	}
	if (seg == VCPU_SREG_CS)
		svm->vmcb->save.cpl
			= (svm->vmcb->save.cs.attrib
			   >> SVM_SELECTOR_DPL_SHIFT) & 3;

}
831 | |||
/* FIXME: the guest's cr8/TPR is not propagated into the VMCB when
   sregs are written; it should be folded in, e.g.:

	svm(vcpu)->vmcb->control.int_ctl &= ~V_TPR_MASK;
	svm(vcpu)->vmcb->control.int_ctl |= (sregs->cr8 & V_TPR_MASK);

*/
838 | |||
/* Guest debugging is not implemented for SVM. */
static int svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg)
{
	return -EOPNOTSUPP;
}
843 | |||
844 | static int svm_get_irq(struct kvm_vcpu *vcpu) | ||
845 | { | ||
846 | struct vcpu_svm *svm = to_svm(vcpu); | ||
847 | u32 exit_int_info = svm->vmcb->control.exit_int_info; | ||
848 | |||
849 | if (is_external_interrupt(exit_int_info)) | ||
850 | return exit_int_info & SVM_EVTINJ_VEC_MASK; | ||
851 | return -1; | ||
852 | } | ||
853 | |||
/* Restore the host's GS base (64-bit only; VMRUN clobbers it). */
static void load_host_msrs(struct kvm_vcpu *vcpu)
{
#ifdef CONFIG_X86_64
	wrmsrl(MSR_GS_BASE, to_svm(vcpu)->host_gs_base);
#endif
}

/* Save the host's GS base before entering the guest (64-bit only). */
static void save_host_msrs(struct kvm_vcpu *vcpu)
{
#ifdef CONFIG_X86_64
	rdmsrl(MSR_GS_BASE, to_svm(vcpu)->host_gs_base);
#endif
}
867 | |||
/*
 * Assign this vcpu a fresh ASID from the per-cpu allocator.  When the
 * pool is exhausted, start a new generation and request a full TLB
 * flush on the next VMRUN.  ASID 0 is never handed out (allocation
 * restarts at 1).
 */
static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *svm_data)
{
	if (svm_data->next_asid > svm_data->max_asid) {
		++svm_data->asid_generation;
		svm_data->next_asid = 1;
		svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ALL_ASID;
	}

	svm->vcpu.cpu = svm_data->cpu;
	svm->asid_generation = svm_data->asid_generation;
	svm->vmcb->control.asid = svm_data->next_asid++;
}
880 | |||
881 | static unsigned long svm_get_dr(struct kvm_vcpu *vcpu, int dr) | ||
882 | { | ||
883 | return to_svm(vcpu)->db_regs[dr]; | ||
884 | } | ||
885 | |||
/*
 * Write a guest debug register.  On success *exception is 0; otherwise
 * it holds the vector to inject (#DB when DR7.GD is armed, #UD for
 * dr4/5 with CR4.DE set or an unknown register, #GP for an out-of-range
 * dr7 value).
 */
static void svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value,
		       int *exception)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	*exception = 0;

	/* General-detect: any DR access raises #DB and sets DR6.BD. */
	if (svm->vmcb->save.dr7 & DR7_GD_MASK) {
		svm->vmcb->save.dr7 &= ~DR7_GD_MASK;
		svm->vmcb->save.dr6 |= DR6_BD_MASK;
		*exception = DB_VECTOR;
		return;
	}

	switch (dr) {
	case 0 ... 3:
		svm->db_regs[dr] = value;
		return;
	case 4 ... 5:
		if (vcpu->arch.cr4 & X86_CR4_DE) {
			*exception = UD_VECTOR;
			return;
		}
		/* fall through: without CR4.DE, dr4/5 alias dr6/7 */
	case 7: {
		/* dr7 upper 32 bits must be zero. */
		if (value & ~((1ULL << 32) - 1)) {
			*exception = GP_VECTOR;
			return;
		}
		svm->vmcb->save.dr7 = value;
		return;
	}
	default:
		printk(KERN_DEBUG "%s: unexpected dr %u\n",
		       __FUNCTION__, dr);
		*exception = UD_VECTOR;
		return;
	}
}
924 | |||
/*
 * #PF intercept: forward the faulting address (exit_info_2) and error
 * code (exit_info_1) to the shadow MMU.  With a userspace irqchip, an
 * external interrupt that was in flight when the fault hit is re-queued
 * so it is not lost.
 */
static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
{
	u32 exit_int_info = svm->vmcb->control.exit_int_info;
	struct kvm *kvm = svm->vcpu.kvm;
	u64 fault_address;
	u32 error_code;

	if (!irqchip_in_kernel(kvm) &&
		is_external_interrupt(exit_int_info))
		push_irq(&svm->vcpu, exit_int_info & SVM_EVTINJ_VEC_MASK);

	fault_address = svm->vmcb->control.exit_info_2;
	error_code = svm->vmcb->control.exit_info_1;
	return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code);
}
940 | |||
941 | static int ud_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) | ||
942 | { | ||
943 | int er; | ||
944 | |||
945 | er = emulate_instruction(&svm->vcpu, kvm_run, 0, 0, 0); | ||
946 | if (er != EMULATE_DONE) | ||
947 | kvm_queue_exception(&svm->vcpu, UD_VECTOR); | ||
948 | return 1; | ||
949 | } | ||
950 | |||
951 | static int nm_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) | ||
952 | { | ||
953 | svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR); | ||
954 | if (!(svm->vcpu.arch.cr0 & X86_CR0_TS)) | ||
955 | svm->vmcb->save.cr0 &= ~X86_CR0_TS; | ||
956 | svm->vcpu.fpu_active = 1; | ||
957 | |||
958 | return 1; | ||
959 | } | ||
960 | |||
/*
 * SHUTDOWN intercept (e.g. guest triple fault): the VMCB contents are
 * undefined afterwards, so rebuild it from scratch, then report the
 * shutdown to userspace (return 0 exits to userspace).
 */
static int shutdown_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
{
	/*
	 * VMCB is undefined after a SHUTDOWN intercept
	 * so reinitialize it.
	 */
	clear_page(svm->vmcb);
	init_vmcb(svm->vmcb);

	kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
	return 0;
}
973 | |||
974 | static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) | ||
975 | { | ||
976 | u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */ | ||
977 | int size, down, in, string, rep; | ||
978 | unsigned port; | ||
979 | |||
980 | ++svm->vcpu.stat.io_exits; | ||
981 | |||
982 | svm->next_rip = svm->vmcb->control.exit_info_2; | ||
983 | |||
984 | string = (io_info & SVM_IOIO_STR_MASK) != 0; | ||
985 | |||
986 | if (string) { | ||
987 | if (emulate_instruction(&svm->vcpu, | ||
988 | kvm_run, 0, 0, 0) == EMULATE_DO_MMIO) | ||
989 | return 0; | ||
990 | return 1; | ||
991 | } | ||
992 | |||
993 | in = (io_info & SVM_IOIO_TYPE_MASK) != 0; | ||
994 | port = io_info >> 16; | ||
995 | size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT; | ||
996 | rep = (io_info & SVM_IOIO_REP_MASK) != 0; | ||
997 | down = (svm->vmcb->save.rflags & X86_EFLAGS_DF) != 0; | ||
998 | |||
999 | return kvm_emulate_pio(&svm->vcpu, kvm_run, in, size, port); | ||
1000 | } | ||
1001 | |||
/* Intercepts that need no action; return 1 to resume the guest. */
static int nop_on_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
{
	return 1;
}
1006 | |||
/* HLT intercept: skip the 1-byte HLT opcode, then emulate the halt. */
static int halt_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
{
	svm->next_rip = svm->vmcb->save.rip + 1;
	skip_emulated_instruction(&svm->vcpu);
	return kvm_emulate_halt(&svm->vcpu);
}
1013 | |||
/* VMMCALL intercept: skip the 3-byte opcode and run the hypercall. */
static int vmmcall_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
{
	svm->next_rip = svm->vmcb->save.rip + 3;
	skip_emulated_instruction(&svm->vcpu);
	kvm_emulate_hypercall(&svm->vcpu);
	return 1;
}
1021 | |||
/*
 * Instructions we do not support inside a guest (VMRUN, STGI, MONITOR,
 * ...): inject #UD and resume.
 */
static int invalid_op_interception(struct vcpu_svm *svm,
				   struct kvm_run *kvm_run)
{
	kvm_queue_exception(&svm->vcpu, UD_VECTOR);
	return 1;
}
1028 | |||
/* Hardware task switches are not emulated yet; punt to userspace. */
static int task_switch_interception(struct vcpu_svm *svm,
				    struct kvm_run *kvm_run)
{
	pr_unimpl(&svm->vcpu, "%s: task switch is unsupported\n", __FUNCTION__);
	kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
	return 0;
}
1036 | |||
/*
 * CPUID intercept: advance past the 2-byte opcode and emulate CPUID
 * (kvm_emulate_cpuid() performs the instruction skip itself).
 */
static int cpuid_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
{
	svm->next_rip = svm->vmcb->save.rip + 2;
	kvm_emulate_cpuid(&svm->vcpu);
	return 1;
}
1043 | |||
/* Generic handler: run the faulting instruction through the emulator. */
static int emulate_on_interception(struct vcpu_svm *svm,
				   struct kvm_run *kvm_run)
{
	if (emulate_instruction(&svm->vcpu, NULL, 0, 0, 0) != EMULATE_DONE)
		pr_unimpl(&svm->vcpu, "%s: failed\n", __FUNCTION__);
	return 1;
}
1051 | |||
/*
 * CR8 (TPR) write intercept: emulate the write; with a userspace
 * irqchip the new TPR must additionally be reported to userspace.
 */
static int cr8_write_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
{
	emulate_instruction(&svm->vcpu, NULL, 0, 0, 0);
	if (irqchip_in_kernel(svm->vcpu.kvm))
		return 1;
	kvm_run->exit_reason = KVM_EXIT_SET_TPR;
	return 0;
}
1060 | |||
/*
 * Read a guest MSR.  MSRs held in the VMCB save area are read from
 * there; everything else falls back to kvm_get_msr_common().
 * Returns 0 on success, non-zero if the MSR is unhandled (caller
 * injects #GP).
 */
static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	switch (ecx) {
	case MSR_IA32_TIME_STAMP_COUNTER: {
		u64 tsc;

		/* Guest TSC = host TSC + VMCB tsc_offset. */
		rdtscll(tsc);
		*data = svm->vmcb->control.tsc_offset + tsc;
		break;
	}
	case MSR_K6_STAR:
		*data = svm->vmcb->save.star;
		break;
#ifdef CONFIG_X86_64
	case MSR_LSTAR:
		*data = svm->vmcb->save.lstar;
		break;
	case MSR_CSTAR:
		*data = svm->vmcb->save.cstar;
		break;
	case MSR_KERNEL_GS_BASE:
		*data = svm->vmcb->save.kernel_gs_base;
		break;
	case MSR_SYSCALL_MASK:
		*data = svm->vmcb->save.sfmask;
		break;
#endif
	case MSR_IA32_SYSENTER_CS:
		*data = svm->vmcb->save.sysenter_cs;
		break;
	case MSR_IA32_SYSENTER_EIP:
		*data = svm->vmcb->save.sysenter_eip;
		break;
	case MSR_IA32_SYSENTER_ESP:
		*data = svm->vmcb->save.sysenter_esp;
		break;
	default:
		return kvm_get_msr_common(vcpu, ecx, data);
	}
	return 0;
}
1104 | |||
/*
 * RDMSR intercept: read the MSR selected by guest ECX into EDX:EAX.
 * On failure inject #GP without advancing the instruction pointer.
 */
static int rdmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
{
	u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX];
	u64 data;

	if (svm_get_msr(&svm->vcpu, ecx, &data))
		kvm_inject_gp(&svm->vcpu, 0);
	else {
		/* Low half to RAX (held in the VMCB), high half to RDX. */
		svm->vmcb->save.rax = data & 0xffffffff;
		svm->vcpu.arch.regs[VCPU_REGS_RDX] = data >> 32;
		svm->next_rip = svm->vmcb->save.rip + 2;
		skip_emulated_instruction(&svm->vcpu);
	}
	return 1;
}
1120 | |||
/*
 * Write a guest MSR.  MSRs held in the VMCB save area are updated in
 * place; anything else is passed to kvm_set_msr_common().  Returns 0
 * on success, non-zero if unhandled (caller injects #GP).
 */
static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	switch (ecx) {
	case MSR_IA32_TIME_STAMP_COUNTER: {
		u64 tsc;

		/* Set tsc_offset so guest TSC (= host TSC + offset) == data. */
		rdtscll(tsc);
		svm->vmcb->control.tsc_offset = data - tsc;
		break;
	}
	case MSR_K6_STAR:
		svm->vmcb->save.star = data;
		break;
#ifdef CONFIG_X86_64
	case MSR_LSTAR:
		svm->vmcb->save.lstar = data;
		break;
	case MSR_CSTAR:
		svm->vmcb->save.cstar = data;
		break;
	case MSR_KERNEL_GS_BASE:
		svm->vmcb->save.kernel_gs_base = data;
		break;
	case MSR_SYSCALL_MASK:
		svm->vmcb->save.sfmask = data;
		break;
#endif
	case MSR_IA32_SYSENTER_CS:
		svm->vmcb->save.sysenter_cs = data;
		break;
	case MSR_IA32_SYSENTER_EIP:
		svm->vmcb->save.sysenter_eip = data;
		break;
	case MSR_IA32_SYSENTER_ESP:
		svm->vmcb->save.sysenter_esp = data;
		break;
	case MSR_K7_EVNTSEL0:
	case MSR_K7_EVNTSEL1:
	case MSR_K7_EVNTSEL2:
	case MSR_K7_EVNTSEL3:
		/*
		 * only support writing 0 to the performance counters for now
		 * to make Windows happy. Should be replaced by a real
		 * performance counter emulation later.
		 */
		if (data != 0)
			goto unhandled;
		break;
	default:
	unhandled:
		return kvm_set_msr_common(vcpu, ecx, data);
	}
	return 0;
}
1177 | |||
/*
 * WRMSR intercept: assemble the 64-bit value from EDX:EAX (the & -1u
 * masks each half to 32 bits) and write the MSR selected by ECX.
 * On failure inject #GP without advancing the instruction pointer.
 */
static int wrmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
{
	u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX];
	u64 data = (svm->vmcb->save.rax & -1u)
		| ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32);
	svm->next_rip = svm->vmcb->save.rip + 2;
	if (svm_set_msr(&svm->vcpu, ecx, data))
		kvm_inject_gp(&svm->vcpu, 0);
	else
		skip_emulated_instruction(&svm->vcpu);
	return 1;
}
1190 | |||
1191 | static int msr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) | ||
1192 | { | ||
1193 | if (svm->vmcb->control.exit_info_1) | ||
1194 | return wrmsr_interception(svm, kvm_run); | ||
1195 | else | ||
1196 | return rdmsr_interception(svm, kvm_run); | ||
1197 | } | ||
1198 | |||
/*
 * VINTR intercept: the guest can now take interrupts.  Drop the
 * virtual-interrupt request and the VINTR intercept; if userspace
 * asked to be notified when injection becomes possible, exit to it.
 */
static int interrupt_window_interception(struct vcpu_svm *svm,
				   struct kvm_run *kvm_run)
{
	svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_VINTR);
	svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
	/*
	 * If the user space waits to inject interrupts, exit as soon as
	 * possible
	 */
	if (kvm_run->request_interrupt_window &&
	    !svm->vcpu.arch.irq_summary) {
		++svm->vcpu.stat.irq_window_exits;
		kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
		return 0;
	}

	return 1;
}
1217 | |||
/*
 * Dispatch table indexed by VMCB exit code.  Codes that are out of
 * range or have a NULL entry are reported to userspace by handle_exit()
 * as KVM_EXIT_UNKNOWN.
 */
static int (*svm_exit_handlers[])(struct vcpu_svm *svm,
				  struct kvm_run *kvm_run) = {
	[SVM_EXIT_READ_CR0]           		= emulate_on_interception,
	[SVM_EXIT_READ_CR3]           		= emulate_on_interception,
	[SVM_EXIT_READ_CR4]           		= emulate_on_interception,
	[SVM_EXIT_READ_CR8]           		= emulate_on_interception,
	/* for now: */
	[SVM_EXIT_WRITE_CR0]          		= emulate_on_interception,
	[SVM_EXIT_WRITE_CR3]          		= emulate_on_interception,
	[SVM_EXIT_WRITE_CR4]          		= emulate_on_interception,
	[SVM_EXIT_WRITE_CR8]          		= cr8_write_interception,
	[SVM_EXIT_READ_DR0] 			= emulate_on_interception,
	[SVM_EXIT_READ_DR1]			= emulate_on_interception,
	[SVM_EXIT_READ_DR2]			= emulate_on_interception,
	[SVM_EXIT_READ_DR3]			= emulate_on_interception,
	[SVM_EXIT_WRITE_DR0]			= emulate_on_interception,
	[SVM_EXIT_WRITE_DR1]			= emulate_on_interception,
	[SVM_EXIT_WRITE_DR2]			= emulate_on_interception,
	[SVM_EXIT_WRITE_DR3]			= emulate_on_interception,
	[SVM_EXIT_WRITE_DR5]			= emulate_on_interception,
	[SVM_EXIT_WRITE_DR7]			= emulate_on_interception,
	[SVM_EXIT_EXCP_BASE + UD_VECTOR]	= ud_interception,
	[SVM_EXIT_EXCP_BASE + PF_VECTOR] 	= pf_interception,
	[SVM_EXIT_EXCP_BASE + NM_VECTOR] 	= nm_interception,
	[SVM_EXIT_INTR] 			= nop_on_interception,
	[SVM_EXIT_NMI]				= nop_on_interception,
	[SVM_EXIT_SMI]				= nop_on_interception,
	[SVM_EXIT_INIT]				= nop_on_interception,
	[SVM_EXIT_VINTR]			= interrupt_window_interception,
	/* [SVM_EXIT_CR0_SEL_WRITE]		= emulate_on_interception, */
	[SVM_EXIT_CPUID]			= cpuid_interception,
	[SVM_EXIT_INVD]                         = emulate_on_interception,
	[SVM_EXIT_HLT]				= halt_interception,
	[SVM_EXIT_INVLPG]			= emulate_on_interception,
	[SVM_EXIT_INVLPGA]			= invalid_op_interception,
	[SVM_EXIT_IOIO] 		  	= io_interception,
	[SVM_EXIT_MSR]				= msr_interception,
	[SVM_EXIT_TASK_SWITCH]			= task_switch_interception,
	[SVM_EXIT_SHUTDOWN]			= shutdown_interception,
	[SVM_EXIT_VMRUN]			= invalid_op_interception,
	[SVM_EXIT_VMMCALL]			= vmmcall_interception,
	[SVM_EXIT_VMLOAD]			= invalid_op_interception,
	[SVM_EXIT_VMSAVE]			= invalid_op_interception,
	[SVM_EXIT_STGI]				= invalid_op_interception,
	[SVM_EXIT_CLGI]				= invalid_op_interception,
	[SVM_EXIT_SKINIT]			= invalid_op_interception,
	[SVM_EXIT_WBINVD]                       = emulate_on_interception,
	[SVM_EXIT_MONITOR]			= invalid_op_interception,
	[SVM_EXIT_MWAIT]			= invalid_op_interception,
};
1268 | |||
1269 | |||
/*
 * Top-level #VMEXIT handler: re-queue any undelivered virtual interrupt,
 * then dispatch on the exit code.  Returns 1 to re-enter the guest, 0 to
 * exit to userspace.
 */
static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	u32 exit_code = svm->vmcb->control.exit_code;

	kvm_reput_irq(svm);

	/* SVM_EXIT_ERR means VMRUN itself failed (invalid guest state). */
	if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) {
		kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
		kvm_run->fail_entry.hardware_entry_failure_reason
			= svm->vmcb->control.exit_code;
		return 0;
	}

	/* An interrupt in flight at exit is only expected on #PF. */
	if (is_external_interrupt(svm->vmcb->control.exit_int_info) &&
	    exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR)
		printk(KERN_ERR "%s: unexpected exit_ini_info 0x%x "
		       "exit_code 0x%x\n",
		       __FUNCTION__, svm->vmcb->control.exit_int_info,
		       exit_code);

	if (exit_code >= ARRAY_SIZE(svm_exit_handlers)
	    || !svm_exit_handlers[exit_code]) {
		kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
		kvm_run->hw.hardware_exit_reason = exit_code;
		return 0;
	}

	return svm_exit_handlers[exit_code](svm, kvm_run);
}
1300 | |||
/*
 * VMSAVE/VMLOAD leave the host TSS descriptor marked busy; restore it
 * to "available" and reload TR so the host TSS is usable again.
 */
static void reload_tss(struct kvm_vcpu *vcpu)
{
	int cpu = raw_smp_processor_id();

	struct svm_cpu_data *svm_data = per_cpu(svm_data, cpu);
	svm_data->tss_desc->type = 9; /* available 32/64-bit TSS */
	load_TR_desc();
}
1309 | |||
/*
 * Per-entry housekeeping: allocate a fresh ASID when the vcpu migrated
 * to another cpu or the cpu's ASID generation rolled over.
 */
static void pre_svm_run(struct vcpu_svm *svm)
{
	int cpu = raw_smp_processor_id();

	struct svm_cpu_data *svm_data = per_cpu(svm_data, cpu);

	svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;
	if (svm->vcpu.cpu != cpu ||
	    svm->asid_generation != svm_data->asid_generation)
		new_asid(svm, svm_data);
}
1321 | |||
1322 | |||
/*
 * Request injection of @irq via the virtual interrupt mechanism.  The
 * priority is pinned at 0xf (maximum) instead of vector >> 4 so the
 * virtual interrupt is never masked by the guest's TPR.
 */
static inline void svm_inject_irq(struct vcpu_svm *svm, int irq)
{
	struct vmcb_control_area *control;

	control = &svm->vmcb->control;
	control->int_vector = irq;
	control->int_ctl &= ~V_INTR_PRIO_MASK;
	control->int_ctl |= V_IRQ_MASK |
		((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT);
}
1333 | |||
static void svm_set_irq(struct kvm_vcpu *vcpu, int irq)
{
	/* kvm_x86_ops entry point: inject via the V_IRQ mechanism. */
	svm_inject_irq(to_svm(vcpu), irq);
}
1340 | |||
/*
 * Decide, just before entry, whether a pending interrupt can be injected.
 * Order matters: first re-inject an interrupt that was cut short by the
 * last #VMEXIT, then bail if a V_IRQ is already pending, and only then
 * consider a new interrupt from the (in-kernel) irqchip.
 */
static void svm_intr_assist(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	struct vmcb *vmcb = svm->vmcb;
	int intr_vector = -1;

	/* Re-inject an external interrupt that was in flight at exit. */
	if ((vmcb->control.exit_int_info & SVM_EVTINJ_VALID) &&
	    ((vmcb->control.exit_int_info & SVM_EVTINJ_TYPE_MASK) == 0)) {
		intr_vector = vmcb->control.exit_int_info &
			      SVM_EVTINJ_VEC_MASK;
		vmcb->control.exit_int_info = 0;
		svm_inject_irq(svm, intr_vector);
		return;
	}

	/* A virtual interrupt is already queued. */
	if (vmcb->control.int_ctl & V_IRQ_MASK)
		return;

	if (!kvm_cpu_has_interrupt(vcpu))
		return;

	/*
	 * Injection blocked by IF=0, an interrupt shadow, or a pending
	 * event: arm the VINTR intercept so we get control back as soon
	 * as the window opens.
	 */
	if (!(vmcb->save.rflags & X86_EFLAGS_IF) ||
	    (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) ||
	    (vmcb->control.event_inj & SVM_EVTINJ_VALID)) {
		/* unable to deliver irq, set pending irq */
		vmcb->control.intercept |= (1ULL << INTERCEPT_VINTR);
		svm_inject_irq(svm, 0x0);
		return;
	}
	/* Okay, we can deliver the interrupt: grab it and update PIC state. */
	intr_vector = kvm_cpu_get_interrupt(vcpu);
	svm_inject_irq(svm, intr_vector);
	kvm_timer_intr_post(vcpu, intr_vector);
}
1375 | |||
/*
 * After a #VMEXIT, return an injected-but-undelivered virtual interrupt
 * to the userspace irqchip's pending set, and recompute whether the
 * guest can currently accept interrupts (no interrupt shadow).
 */
static void kvm_reput_irq(struct vcpu_svm *svm)
{
	struct vmcb_control_area *control = &svm->vmcb->control;

	if ((control->int_ctl & V_IRQ_MASK)
	    && !irqchip_in_kernel(svm->vcpu.kvm)) {
		control->int_ctl &= ~V_IRQ_MASK;
		push_irq(&svm->vcpu, control->int_vector);
	}

	svm->vcpu.arch.interrupt_window_open =
		!(control->int_state & SVM_INTERRUPT_SHADOW_MASK);
}
1389 | |||
/*
 * Pop the lowest pending vector from the userspace-irqchip bitmap
 * (irq_pending/irq_summary) and inject it.
 */
static void svm_do_inject_vector(struct vcpu_svm *svm)
{
	struct kvm_vcpu *vcpu = &svm->vcpu;
	int word_index = __ffs(vcpu->arch.irq_summary);
	int bit_index = __ffs(vcpu->arch.irq_pending[word_index]);
	int irq = word_index * BITS_PER_LONG + bit_index;

	clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]);
	/* Clear the summary bit once the word has no pending irqs left. */
	if (!vcpu->arch.irq_pending[word_index])
		clear_bit(word_index, &vcpu->arch.irq_summary);
	svm_inject_irq(svm, irq);
}
1402 | |||
/*
 * Userspace-irqchip path: inject a pending vector if the interrupt
 * window is open, otherwise arm the VINTR intercept so we regain
 * control when it opens.
 */
static void do_interrupt_requests(struct kvm_vcpu *vcpu,
				       struct kvm_run *kvm_run)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	struct vmcb_control_area *control = &svm->vmcb->control;

	/* Window is open when IF=1 and no interrupt shadow is active. */
	svm->vcpu.arch.interrupt_window_open =
		(!(control->int_state & SVM_INTERRUPT_SHADOW_MASK) &&
		 (svm->vmcb->save.rflags & X86_EFLAGS_IF));

	if (svm->vcpu.arch.interrupt_window_open && svm->vcpu.arch.irq_summary)
		/*
		 * If interrupts enabled, and not blocked by sti or mov ss. Good.
		 */
		svm_do_inject_vector(svm);

	/*
	 * Interrupts blocked.  Wait for unblock.
	 */
	if (!svm->vcpu.arch.interrupt_window_open &&
	    (svm->vcpu.arch.irq_summary || kvm_run->request_interrupt_window))
		control->intercept |= 1ULL << INTERCEPT_VINTR;
	else
		control->intercept &= ~(1ULL << INTERCEPT_VINTR);
}
1428 | |||
/* No-op on SVM: no real-mode TSS trickery is needed (unlike VMX). */
static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr)
{
	return 0;
}
1433 | |||
/* Read hardware DR0-DR3 into db_regs[0..3]. */
static void save_db_regs(unsigned long *db_regs)
{
	asm volatile ("mov %%dr0, %0" : "=r"(db_regs[0]));
	asm volatile ("mov %%dr1, %0" : "=r"(db_regs[1]));
	asm volatile ("mov %%dr2, %0" : "=r"(db_regs[2]));
	asm volatile ("mov %%dr3, %0" : "=r"(db_regs[3]));
}
1441 | |||
/* Write db_regs[0..3] into hardware DR0-DR3. */
static void load_db_regs(unsigned long *db_regs)
{
	asm volatile ("mov %0, %%dr0" : : "r"(db_regs[0]));
	asm volatile ("mov %0, %%dr1" : : "r"(db_regs[1]));
	asm volatile ("mov %0, %%dr2" : : "r"(db_regs[2]));
	asm volatile ("mov %0, %%dr3" : : "r"(db_regs[3]));
}
1449 | |||
/* Flush the guest TLB by forcing a new ASID on the next entry. */
static void svm_flush_tlb(struct kvm_vcpu *vcpu)
{
	force_new_asid(vcpu);
}
1454 | |||
/* Nothing to prepare on SVM; host state is handled in svm_vcpu_run(). */
static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu)
{
}
1458 | |||
/*
 * The world switch: save host state the hardware does not preserve
 * (segment selectors, CR2, DR6/DR7, host MSRs), load guest GPRs, execute
 * VMLOAD/VMRUN/VMSAVE, then restore the host.  The GPRs live in
 * vcpu.arch.regs and are moved in/out by the inline asm below; RAX/RSP/RIP
 * travel via the VMCB itself.
 */
static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	u16 fs_selector;
	u16 gs_selector;
	u16 ldt_selector;

	pre_svm_run(svm);

	save_host_msrs(vcpu);
	fs_selector = read_fs();
	gs_selector = read_gs();
	ldt_selector = read_ldt();
	svm->host_cr2 = kvm_read_cr2();
	svm->host_dr6 = read_dr6();
	svm->host_dr7 = read_dr7();
	svm->vmcb->save.cr2 = vcpu->arch.cr2;

	/* Guest breakpoints armed: swap host DR0-DR3 for the guest's. */
	if (svm->vmcb->save.dr7 & 0xff) {
		write_dr7(0);
		save_db_regs(svm->host_db_regs);
		load_db_regs(svm->db_regs);
	}

	/*
	 * CLGI masks interrupts via GIF, so the local IRQ flag can be
	 * enabled here; interrupts are delivered after STGI below.
	 */
	clgi();

	local_irq_enable();

	asm volatile (
#ifdef CONFIG_X86_64
		"push %%rbp; \n\t"
#else
		"push %%ebp; \n\t"
#endif

#ifdef CONFIG_X86_64
		"mov %c[rbx](%[svm]), %%rbx \n\t"
		"mov %c[rcx](%[svm]), %%rcx \n\t"
		"mov %c[rdx](%[svm]), %%rdx \n\t"
		"mov %c[rsi](%[svm]), %%rsi \n\t"
		"mov %c[rdi](%[svm]), %%rdi \n\t"
		"mov %c[rbp](%[svm]), %%rbp \n\t"
		"mov %c[r8](%[svm]), %%r8 \n\t"
		"mov %c[r9](%[svm]), %%r9 \n\t"
		"mov %c[r10](%[svm]), %%r10 \n\t"
		"mov %c[r11](%[svm]), %%r11 \n\t"
		"mov %c[r12](%[svm]), %%r12 \n\t"
		"mov %c[r13](%[svm]), %%r13 \n\t"
		"mov %c[r14](%[svm]), %%r14 \n\t"
		"mov %c[r15](%[svm]), %%r15 \n\t"
#else
		"mov %c[rbx](%[svm]), %%ebx \n\t"
		"mov %c[rcx](%[svm]), %%ecx \n\t"
		"mov %c[rdx](%[svm]), %%edx \n\t"
		"mov %c[rsi](%[svm]), %%esi \n\t"
		"mov %c[rdi](%[svm]), %%edi \n\t"
		"mov %c[rbp](%[svm]), %%ebp \n\t"
#endif

#ifdef CONFIG_X86_64
		/* Enter guest mode */
		"push %%rax \n\t"
		"mov %c[vmcb](%[svm]), %%rax \n\t"
		SVM_VMLOAD "\n\t"
		SVM_VMRUN "\n\t"
		SVM_VMSAVE "\n\t"
		"pop %%rax \n\t"
#else
		/* Enter guest mode */
		"push %%eax \n\t"
		"mov %c[vmcb](%[svm]), %%eax \n\t"
		SVM_VMLOAD "\n\t"
		SVM_VMRUN "\n\t"
		SVM_VMSAVE "\n\t"
		"pop %%eax \n\t"
#endif

		/* Save guest registers, load host registers */
#ifdef CONFIG_X86_64
		"mov %%rbx, %c[rbx](%[svm]) \n\t"
		"mov %%rcx, %c[rcx](%[svm]) \n\t"
		"mov %%rdx, %c[rdx](%[svm]) \n\t"
		"mov %%rsi, %c[rsi](%[svm]) \n\t"
		"mov %%rdi, %c[rdi](%[svm]) \n\t"
		"mov %%rbp, %c[rbp](%[svm]) \n\t"
		"mov %%r8, %c[r8](%[svm]) \n\t"
		"mov %%r9, %c[r9](%[svm]) \n\t"
		"mov %%r10, %c[r10](%[svm]) \n\t"
		"mov %%r11, %c[r11](%[svm]) \n\t"
		"mov %%r12, %c[r12](%[svm]) \n\t"
		"mov %%r13, %c[r13](%[svm]) \n\t"
		"mov %%r14, %c[r14](%[svm]) \n\t"
		"mov %%r15, %c[r15](%[svm]) \n\t"

		"pop  %%rbp; \n\t"
#else
		"mov %%ebx, %c[rbx](%[svm]) \n\t"
		"mov %%ecx, %c[rcx](%[svm]) \n\t"
		"mov %%edx, %c[rdx](%[svm]) \n\t"
		"mov %%esi, %c[rsi](%[svm]) \n\t"
		"mov %%edi, %c[rdi](%[svm]) \n\t"
		"mov %%ebp, %c[rbp](%[svm]) \n\t"

		"pop  %%ebp; \n\t"
#endif
		:
		: [svm]"a"(svm),
		  [vmcb]"i"(offsetof(struct vcpu_svm, vmcb_pa)),
		  [rbx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RBX])),
		  [rcx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RCX])),
		  [rdx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RDX])),
		  [rsi]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RSI])),
		  [rdi]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RDI])),
		  [rbp]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RBP]))
#ifdef CONFIG_X86_64
		  , [r8]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R8])),
		  [r9]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R9])),
		  [r10]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R10])),
		  [r11]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R11])),
		  [r12]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R12])),
		  [r13]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R13])),
		  [r14]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R14])),
		  [r15]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R15]))
#endif
		: "cc", "memory"
#ifdef CONFIG_X86_64
		, "rbx", "rcx", "rdx", "rsi", "rdi"
		, "r8", "r9", "r10", "r11" , "r12", "r13", "r14", "r15"
#else
		, "ebx", "ecx", "edx" , "esi", "edi"
#endif
		);

	/* Restore host DR0-DR3 if they were swapped out above. */
	if ((svm->vmcb->save.dr7 & 0xff))
		load_db_regs(svm->host_db_regs);

	vcpu->arch.cr2 = svm->vmcb->save.cr2;

	write_dr6(svm->host_dr6);
	write_dr7(svm->host_dr7);
	kvm_write_cr2(svm->host_cr2);

	load_fs(fs_selector);
	load_gs(gs_selector);
	load_ldt(ldt_selector);
	load_host_msrs(vcpu);

	reload_tss(vcpu);

	local_irq_disable();

	stgi();

	svm->next_rip = 0;
}
1614 | |||
/*
 * Load a new shadow page-table root and flush the TLB (new ASID).
 * The FPU is also deactivated: CR0.TS is set and #NM interception
 * re-armed so nm_interception() can lazily restore it.
 */
static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	svm->vmcb->save.cr3 = root;
	force_new_asid(vcpu);

	if (vcpu->fpu_active) {
		svm->vmcb->control.intercept_exceptions |= (1 << NM_VECTOR);
		svm->vmcb->save.cr0 |= X86_CR0_TS;
		vcpu->fpu_active = 0;
	}
}
1628 | |||
1629 | static int is_disabled(void) | ||
1630 | { | ||
1631 | u64 vm_cr; | ||
1632 | |||
1633 | rdmsrl(MSR_VM_CR, vm_cr); | ||
1634 | if (vm_cr & (1 << SVM_VM_CR_SVM_DISABLE)) | ||
1635 | return 1; | ||
1636 | |||
1637 | return 0; | ||
1638 | } | ||
1639 | |||
static void
svm_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
{
	/* VMMCALL opcode bytes: 0f 01 d9. */
	const unsigned char vmmcall[3] = { 0x0f, 0x01, 0xd9 };
	int i;

	for (i = 0; i < 3; i++)
		hypercall[i] = vmmcall[i];
}
1650 | |||
/* Per-cpu compatibility check: SVM has none, so always report success. */
static void svm_check_processor_compat(void *rtn)
{
	*(int *)rtn = 0;
}
1655 | |||
/* SVM implementation of the arch-independent kvm_x86_ops interface. */
static struct kvm_x86_ops svm_x86_ops = {
	.cpu_has_kvm_support = has_svm,
	.disabled_by_bios = is_disabled,
	.hardware_setup = svm_hardware_setup,
	.hardware_unsetup = svm_hardware_unsetup,
	.check_processor_compatibility = svm_check_processor_compat,
	.hardware_enable = svm_hardware_enable,
	.hardware_disable = svm_hardware_disable,

	.vcpu_create = svm_create_vcpu,
	.vcpu_free = svm_free_vcpu,
	.vcpu_reset = svm_vcpu_reset,

	.prepare_guest_switch = svm_prepare_guest_switch,
	.vcpu_load = svm_vcpu_load,
	.vcpu_put = svm_vcpu_put,
	.vcpu_decache = svm_vcpu_decache,

	.set_guest_debug = svm_guest_debug,
	.get_msr = svm_get_msr,
	.set_msr = svm_set_msr,
	.get_segment_base = svm_get_segment_base,
	.get_segment = svm_get_segment,
	.set_segment = svm_set_segment,
	.get_cs_db_l_bits = kvm_get_cs_db_l_bits,
	.decache_cr4_guest_bits = svm_decache_cr4_guest_bits,
	.set_cr0 = svm_set_cr0,
	.set_cr3 = svm_set_cr3,
	.set_cr4 = svm_set_cr4,
	.set_efer = svm_set_efer,
	.get_idt = svm_get_idt,
	.set_idt = svm_set_idt,
	.get_gdt = svm_get_gdt,
	.set_gdt = svm_set_gdt,
	.get_dr = svm_get_dr,
	.set_dr = svm_set_dr,
	.cache_regs = svm_cache_regs,
	.decache_regs = svm_decache_regs,
	.get_rflags = svm_get_rflags,
	.set_rflags = svm_set_rflags,

	.tlb_flush = svm_flush_tlb,

	.run = svm_vcpu_run,
	.handle_exit = handle_exit,
	.skip_emulated_instruction = skip_emulated_instruction,
	.patch_hypercall = svm_patch_hypercall,
	.get_irq = svm_get_irq,
	.set_irq = svm_set_irq,
	.queue_exception = svm_queue_exception,
	.exception_injected = svm_exception_injected,
	.inject_pending_irq = svm_intr_assist,
	.inject_pending_vectors = do_interrupt_requests,

	.set_tss_addr = svm_set_tss_addr,
};
1712 | |||
/* Module entry: register this backend with the generic KVM core. */
static int __init svm_init(void)
{
	return kvm_init(&svm_x86_ops, sizeof(struct vcpu_svm),
			THIS_MODULE);
}
1718 | |||
/* Module exit: unregister from the generic KVM core. */
static void __exit svm_exit(void)
{
	kvm_exit();
}
1723 | |||
/* Hook module load/unload to the init/exit routines above. */
module_init(svm_init)
module_exit(svm_exit)
diff --git a/arch/x86/kvm/svm.h b/arch/x86/kvm/svm.h new file mode 100644 index 000000000000..5fd50491b555 --- /dev/null +++ b/arch/x86/kvm/svm.h | |||
@@ -0,0 +1,325 @@ | |||
1 | #ifndef __SVM_H | ||
2 | #define __SVM_H | ||
3 | |||
/*
 * Bit numbers within vmcb_control_area.intercept: setting bit INTERCEPT_x
 * makes the corresponding guest action cause a #VMEXIT.  Order presumably
 * mirrors the hardware VMCB intercept vector (AMD APM) -- do not reorder.
 */
enum {
	INTERCEPT_INTR,
	INTERCEPT_NMI,
	INTERCEPT_SMI,
	INTERCEPT_INIT,
	INTERCEPT_VINTR,
	INTERCEPT_SELECTIVE_CR0,
	INTERCEPT_STORE_IDTR,
	INTERCEPT_STORE_GDTR,
	INTERCEPT_STORE_LDTR,
	INTERCEPT_STORE_TR,
	INTERCEPT_LOAD_IDTR,
	INTERCEPT_LOAD_GDTR,
	INTERCEPT_LOAD_LDTR,
	INTERCEPT_LOAD_TR,
	INTERCEPT_RDTSC,
	INTERCEPT_RDPMC,
	INTERCEPT_PUSHF,
	INTERCEPT_POPF,
	INTERCEPT_CPUID,
	INTERCEPT_RSM,
	INTERCEPT_IRET,
	INTERCEPT_INTn,
	INTERCEPT_INVD,
	INTERCEPT_PAUSE,
	INTERCEPT_HLT,
	INTERCEPT_INVLPG,
	INTERCEPT_INVLPGA,
	INTERCEPT_IOIO_PROT,
	INTERCEPT_MSR_PROT,
	INTERCEPT_TASK_SWITCH,
	INTERCEPT_FERR_FREEZE,
	INTERCEPT_SHUTDOWN,
	INTERCEPT_VMRUN,
	INTERCEPT_VMMCALL,
	INTERCEPT_VMLOAD,
	INTERCEPT_VMSAVE,
	INTERCEPT_STGI,
	INTERCEPT_CLGI,
	INTERCEPT_SKINIT,
	INTERCEPT_RDTSCP,
	INTERCEPT_ICEBP,
	INTERCEPT_WBINVD,
	INTERCEPT_MONITOR,
	INTERCEPT_MWAIT,
	INTERCEPT_MWAIT_COND,
};
51 | |||
52 | |||
/*
 * VMCB control area.  Packed: field order and reserved padding mirror the
 * hardware-defined layout, so nothing here may be reordered or resized.
 */
struct __attribute__ ((__packed__)) vmcb_control_area {
	u16 intercept_cr_read;
	u16 intercept_cr_write;
	u16 intercept_dr_read;
	u16 intercept_dr_write;
	u32 intercept_exceptions;	/* one bit per exception vector */
	u64 intercept;			/* bit numbers: enum INTERCEPT_* */
	u8 reserved_1[44];
	u64 iopm_base_pa;		/* I/O permission map, physical addr */
	u64 msrpm_base_pa;		/* MSR permission map, physical addr */
	u64 tsc_offset;
	u32 asid;			/* address space id (TLB tag) */
	u8 tlb_ctl;			/* TLB_CONTROL_* */
	u8 reserved_2[3];
	u32 int_ctl;			/* V_TPR/V_IRQ/... (defines below) */
	u32 int_vector;
	u32 int_state;
	u8 reserved_3[4];
	u32 exit_code;			/* SVM_EXIT_* */
	u32 exit_code_hi;
	u64 exit_info_1;
	u64 exit_info_2;
	u32 exit_int_info;		/* SVM_EXITINTINFO_* encoding */
	u32 exit_int_info_err;
	u64 nested_ctl;
	u8 reserved_4[16];
	u32 event_inj;			/* SVM_EVTINJ_* encoding */
	u32 event_inj_err;
	u64 nested_cr3;
	u64 lbr_ctl;
	u8 reserved_5[832];
};
85 | |||
86 | |||
/* tlb_ctl values */
#define TLB_CONTROL_DO_NOTHING 0
#define TLB_CONTROL_FLUSH_ALL_ASID 1

/* int_ctl fields: virtual TPR and virtual interrupt injection control */
#define V_TPR_MASK 0x0f

#define V_IRQ_SHIFT 8
#define V_IRQ_MASK (1 << V_IRQ_SHIFT)

#define V_INTR_PRIO_SHIFT 16
#define V_INTR_PRIO_MASK (0x0f << V_INTR_PRIO_SHIFT)

#define V_IGN_TPR_SHIFT 20
#define V_IGN_TPR_MASK (1 << V_IGN_TPR_SHIFT)

#define V_INTR_MASKING_SHIFT 24
#define V_INTR_MASKING_MASK (1 << V_INTR_MASKING_SHIFT)

/* int_state: guest is in an interrupt shadow (after STI / mov ss) */
#define SVM_INTERRUPT_SHADOW_MASK 1

/* exit_info_1 decode for SVM_EXIT_IOIO: direction/string/rep/sizes */
#define SVM_IOIO_STR_SHIFT 2
#define SVM_IOIO_REP_SHIFT 3
#define SVM_IOIO_SIZE_SHIFT 4
#define SVM_IOIO_ASIZE_SHIFT 7

#define SVM_IOIO_TYPE_MASK 1
#define SVM_IOIO_STR_MASK (1 << SVM_IOIO_STR_SHIFT)
#define SVM_IOIO_REP_MASK (1 << SVM_IOIO_REP_SHIFT)
#define SVM_IOIO_SIZE_MASK (7 << SVM_IOIO_SIZE_SHIFT)
#define SVM_IOIO_ASIZE_MASK (7 << SVM_IOIO_ASIZE_SHIFT)
116 | |||
/* Segment register image in the VMCB save area: selector + hidden state. */
struct __attribute__ ((__packed__)) vmcb_seg {
	u16 selector;
	u16 attrib;		/* packed access-rights bits (SVM_SELECTOR_*) */
	u32 limit;
	u64 base;
};
123 | |||
/*
 * VMCB state save area: architected guest register state loaded/stored
 * by VMRUN/#VMEXIT.  Packed -- layout is hardware-defined; do not touch
 * field order or the reserved padding.
 */
struct __attribute__ ((__packed__)) vmcb_save_area {
	struct vmcb_seg es;
	struct vmcb_seg cs;
	struct vmcb_seg ss;
	struct vmcb_seg ds;
	struct vmcb_seg fs;
	struct vmcb_seg gs;
	struct vmcb_seg gdtr;
	struct vmcb_seg ldtr;
	struct vmcb_seg idtr;
	struct vmcb_seg tr;
	u8 reserved_1[43];
	u8 cpl;			/* current privilege level */
	u8 reserved_2[4];
	u64 efer;
	u8 reserved_3[112];
	u64 cr4;
	u64 cr3;
	u64 cr0;
	u64 dr7;
	u64 dr6;
	u64 rflags;
	u64 rip;
	u8 reserved_4[88];
	u64 rsp;
	u8 reserved_5[24];
	u64 rax;
	u64 star;		/* syscall/sysret MSRs */
	u64 lstar;
	u64 cstar;
	u64 sfmask;
	u64 kernel_gs_base;
	u64 sysenter_cs;
	u64 sysenter_esp;
	u64 sysenter_eip;
	u64 cr2;
	u8 reserved_6[32];
	u64 g_pat;
	u64 dbgctl;		/* debug-control / last-branch records */
	u64 br_from;
	u64 br_to;
	u64 last_excp_from;
	u64 last_excp_to;
};
168 | |||
/* A complete VMCB: control area followed by the state save area. */
struct __attribute__ ((__packed__)) vmcb {
	struct vmcb_control_area control;
	struct vmcb_save_area save;
};
173 | |||
/* CPUID leaf reporting SVM support, and the SVM feature bit */
#define SVM_CPUID_FEATURE_SHIFT 2
#define SVM_CPUID_FUNC 0x8000000a

/* EFER.SVME enables SVM; VM_CR / VM_HSAVE_PA are SVM control MSRs */
#define MSR_EFER_SVME_MASK (1ULL << 12)
#define MSR_VM_CR 0xc0010114
#define MSR_VM_HSAVE_PA 0xc0010117ULL

/* VM_CR bit: BIOS has disabled SVM */
#define SVM_VM_CR_SVM_DISABLE 4

/* Bit positions inside vmcb_seg.attrib (compressed descriptor rights) */
#define SVM_SELECTOR_S_SHIFT 4
#define SVM_SELECTOR_DPL_SHIFT 5
#define SVM_SELECTOR_P_SHIFT 7
#define SVM_SELECTOR_AVL_SHIFT 8
#define SVM_SELECTOR_L_SHIFT 9
#define SVM_SELECTOR_DB_SHIFT 10
#define SVM_SELECTOR_G_SHIFT 11

#define SVM_SELECTOR_TYPE_MASK (0xf)
#define SVM_SELECTOR_S_MASK (1 << SVM_SELECTOR_S_SHIFT)
#define SVM_SELECTOR_DPL_MASK (3 << SVM_SELECTOR_DPL_SHIFT)
#define SVM_SELECTOR_P_MASK (1 << SVM_SELECTOR_P_SHIFT)
#define SVM_SELECTOR_AVL_MASK (1 << SVM_SELECTOR_AVL_SHIFT)
#define SVM_SELECTOR_L_MASK (1 << SVM_SELECTOR_L_SHIFT)
#define SVM_SELECTOR_DB_MASK (1 << SVM_SELECTOR_DB_SHIFT)
#define SVM_SELECTOR_G_MASK (1 << SVM_SELECTOR_G_SHIFT)

/* descriptor type bits within SVM_SELECTOR_TYPE_MASK */
#define SVM_SELECTOR_WRITE_MASK (1 << 1)
#define SVM_SELECTOR_READ_MASK SVM_SELECTOR_WRITE_MASK
#define SVM_SELECTOR_CODE_MASK (1 << 3)

/* Bits for intercept_cr_read/intercept_cr_write */
#define INTERCEPT_CR0_MASK 1
#define INTERCEPT_CR3_MASK (1 << 3)
#define INTERCEPT_CR4_MASK (1 << 4)
#define INTERCEPT_CR8_MASK (1 << 8)

/* Bits for intercept_dr_read/intercept_dr_write */
#define INTERCEPT_DR0_MASK 1
#define INTERCEPT_DR1_MASK (1 << 1)
#define INTERCEPT_DR2_MASK (1 << 2)
#define INTERCEPT_DR3_MASK (1 << 3)
#define INTERCEPT_DR4_MASK (1 << 4)
#define INTERCEPT_DR5_MASK (1 << 5)
#define INTERCEPT_DR6_MASK (1 << 6)
#define INTERCEPT_DR7_MASK (1 << 7)
217 | |||
/*
 * EVENTINJ / EXITINTINFO encoding: vector in bits 7:0, delivery type in
 * bits 10:8, error-code-valid in bit 11, valid in bit 31.
 */
#define SVM_EVTINJ_VEC_MASK 0xff

#define SVM_EVTINJ_TYPE_SHIFT 8
#define SVM_EVTINJ_TYPE_MASK (7 << SVM_EVTINJ_TYPE_SHIFT)

#define SVM_EVTINJ_TYPE_INTR (0 << SVM_EVTINJ_TYPE_SHIFT)
#define SVM_EVTINJ_TYPE_NMI (2 << SVM_EVTINJ_TYPE_SHIFT)
#define SVM_EVTINJ_TYPE_EXEPT (3 << SVM_EVTINJ_TYPE_SHIFT)
#define SVM_EVTINJ_TYPE_SOFT (4 << SVM_EVTINJ_TYPE_SHIFT)

/*
 * Use an unsigned constant: (1 << 31) shifts into the sign bit of int,
 * which is undefined behaviour in C.  Value is unchanged (0x80000000).
 */
#define SVM_EVTINJ_VALID (1U << 31)
#define SVM_EVTINJ_VALID_ERR (1 << 11)

/* EXITINTINFO uses the same encoding as EVENTINJ */
#define SVM_EXITINTINFO_VEC_MASK SVM_EVTINJ_VEC_MASK

#define SVM_EXITINTINFO_TYPE_INTR SVM_EVTINJ_TYPE_INTR
#define SVM_EXITINTINFO_TYPE_NMI SVM_EVTINJ_TYPE_NMI
#define SVM_EXITINTINFO_TYPE_EXEPT SVM_EVTINJ_TYPE_EXEPT
#define SVM_EXITINTINFO_TYPE_SOFT SVM_EVTINJ_TYPE_SOFT

#define SVM_EXITINTINFO_VALID SVM_EVTINJ_VALID
#define SVM_EXITINTINFO_VALID_ERR SVM_EVTINJ_VALID_ERR
240 | |||
/*
 * #VMEXIT codes reported in vmcb_control_area.exit_code.
 * CR/DR read/write exits encode the register number in the low bits.
 */
#define SVM_EXIT_READ_CR0 0x000
#define SVM_EXIT_READ_CR3 0x003
#define SVM_EXIT_READ_CR4 0x004
#define SVM_EXIT_READ_CR8 0x008
#define SVM_EXIT_WRITE_CR0 0x010
#define SVM_EXIT_WRITE_CR3 0x013
#define SVM_EXIT_WRITE_CR4 0x014
#define SVM_EXIT_WRITE_CR8 0x018
#define SVM_EXIT_READ_DR0 0x020
#define SVM_EXIT_READ_DR1 0x021
#define SVM_EXIT_READ_DR2 0x022
#define SVM_EXIT_READ_DR3 0x023
#define SVM_EXIT_READ_DR4 0x024
#define SVM_EXIT_READ_DR5 0x025
#define SVM_EXIT_READ_DR6 0x026
#define SVM_EXIT_READ_DR7 0x027
#define SVM_EXIT_WRITE_DR0 0x030
#define SVM_EXIT_WRITE_DR1 0x031
#define SVM_EXIT_WRITE_DR2 0x032
#define SVM_EXIT_WRITE_DR3 0x033
#define SVM_EXIT_WRITE_DR4 0x034
#define SVM_EXIT_WRITE_DR5 0x035
#define SVM_EXIT_WRITE_DR6 0x036
#define SVM_EXIT_WRITE_DR7 0x037
/* exception exits: SVM_EXIT_EXCP_BASE + vector number */
#define SVM_EXIT_EXCP_BASE 0x040
#define SVM_EXIT_INTR 0x060
#define SVM_EXIT_NMI 0x061
#define SVM_EXIT_SMI 0x062
#define SVM_EXIT_INIT 0x063
#define SVM_EXIT_VINTR 0x064
#define SVM_EXIT_CR0_SEL_WRITE 0x065
#define SVM_EXIT_IDTR_READ 0x066
#define SVM_EXIT_GDTR_READ 0x067
#define SVM_EXIT_LDTR_READ 0x068
#define SVM_EXIT_TR_READ 0x069
#define SVM_EXIT_IDTR_WRITE 0x06a
#define SVM_EXIT_GDTR_WRITE 0x06b
#define SVM_EXIT_LDTR_WRITE 0x06c
#define SVM_EXIT_TR_WRITE 0x06d
#define SVM_EXIT_RDTSC 0x06e
#define SVM_EXIT_RDPMC 0x06f
#define SVM_EXIT_PUSHF 0x070
#define SVM_EXIT_POPF 0x071
#define SVM_EXIT_CPUID 0x072
#define SVM_EXIT_RSM 0x073
#define SVM_EXIT_IRET 0x074
#define SVM_EXIT_SWINT 0x075
#define SVM_EXIT_INVD 0x076
#define SVM_EXIT_PAUSE 0x077
#define SVM_EXIT_HLT 0x078
#define SVM_EXIT_INVLPG 0x079
#define SVM_EXIT_INVLPGA 0x07a
#define SVM_EXIT_IOIO 0x07b
#define SVM_EXIT_MSR 0x07c
#define SVM_EXIT_TASK_SWITCH 0x07d
#define SVM_EXIT_FERR_FREEZE 0x07e
#define SVM_EXIT_SHUTDOWN 0x07f
#define SVM_EXIT_VMRUN 0x080
#define SVM_EXIT_VMMCALL 0x081
#define SVM_EXIT_VMLOAD 0x082
#define SVM_EXIT_VMSAVE 0x083
#define SVM_EXIT_STGI 0x084
#define SVM_EXIT_CLGI 0x085
#define SVM_EXIT_SKINIT 0x086
#define SVM_EXIT_RDTSCP 0x087
#define SVM_EXIT_ICEBP 0x088
#define SVM_EXIT_WBINVD 0x089
#define SVM_EXIT_MONITOR 0x08a
#define SVM_EXIT_MWAIT 0x08b
#define SVM_EXIT_MWAIT_COND 0x08c
#define SVM_EXIT_NPF 0x400

/* invalid guest state: VMRUN itself failed */
#define SVM_EXIT_ERR -1

/* CR0 bits covered by the selective-CR0-write intercept */
#define SVM_CR0_SELECTIVE_MASK (1 << 3 | 1) /* TS and MP */
316 | |||
/*
 * Raw opcode byte sequences for the SVM instructions, for inline asm on
 * assemblers that do not yet understand the mnemonics.
 */
#define SVM_VMLOAD ".byte 0x0f, 0x01, 0xda"
#define SVM_VMRUN  ".byte 0x0f, 0x01, 0xd8"
#define SVM_VMSAVE ".byte 0x0f, 0x01, 0xdb"
#define SVM_CLGI   ".byte 0x0f, 0x01, 0xdd"
#define SVM_STGI   ".byte 0x0f, 0x01, 0xdc"
#define SVM_INVLPGA ".byte 0x0f, 0x01, 0xdf"
323 | |||
324 | #endif | ||
325 | |||
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c new file mode 100644 index 000000000000..fc494aff5d8b --- /dev/null +++ b/arch/x86/kvm/vmx.c | |||
@@ -0,0 +1,2671 @@ | |||
1 | /* | ||
2 | * Kernel-based Virtual Machine driver for Linux | ||
3 | * | ||
4 | * This module enables machines with Intel VT-x extensions to run virtual | ||
5 | * machines without emulation or binary translation. | ||
6 | * | ||
7 | * Copyright (C) 2006 Qumranet, Inc. | ||
8 | * | ||
9 | * Authors: | ||
10 | * Avi Kivity <avi@qumranet.com> | ||
11 | * Yaniv Kamay <yaniv@qumranet.com> | ||
12 | * | ||
13 | * This work is licensed under the terms of the GNU GPL, version 2. See | ||
14 | * the COPYING file in the top-level directory. | ||
15 | * | ||
16 | */ | ||
17 | |||
18 | #include "irq.h" | ||
19 | #include "vmx.h" | ||
20 | #include "segment_descriptor.h" | ||
21 | #include "mmu.h" | ||
22 | |||
23 | #include <linux/kvm_host.h> | ||
24 | #include <linux/module.h> | ||
25 | #include <linux/kernel.h> | ||
26 | #include <linux/mm.h> | ||
27 | #include <linux/highmem.h> | ||
28 | #include <linux/sched.h> | ||
29 | #include <linux/moduleparam.h> | ||
30 | |||
31 | #include <asm/io.h> | ||
32 | #include <asm/desc.h> | ||
33 | |||
34 | MODULE_AUTHOR("Qumranet"); | ||
35 | MODULE_LICENSE("GPL"); | ||
36 | |||
/*
 * Module parameter; presumably gates the shadow-pte guest-#PF bypass
 * optimization (users of this flag are outside this chunk -- verify).
 * NOTE(review): declared int but exposed as module_param(..., bool, ...);
 * accepted by this era's module_param, but later kernels require the
 * variable itself to be bool -- confirm before changing either side.
 */
static int bypass_guest_pf = 1;
module_param(bypass_guest_pf, bool, 0);
39 | |||
40 | struct vmcs { | ||
41 | u32 revision_id; | ||
42 | u32 abort; | ||
43 | char data[0]; | ||
44 | }; | ||
45 | |||
/* VMX-specific vcpu state wrapped around the generic kvm_vcpu. */
struct vcpu_vmx {
	struct kvm_vcpu vcpu;	/* must stay embedded: to_vmx() container_of */
	int launched;		/* set once launched; cleared by vcpu_clear() */
	u8 fail;
	u32 idt_vectoring_info;	/* exit-time IDT-vectoring info snapshot */
	struct kvm_msr_entry *guest_msrs;	/* parallel arrays, indexed */
	struct kvm_msr_entry *host_msrs;	/* identically (setup_msrs) */
	int nmsrs;		/* entries in both arrays */
	int save_nmsrs;		/* leading entries switched on world switch */
	int msr_offset_efer;	/* index of EFER in the arrays, or -1 */
#ifdef CONFIG_X86_64
	int msr_offset_kernel_gs_base;	/* index of KERNEL_GS_BASE, or -1 */
#endif
	struct vmcs *vmcs;
	struct {		/* host state lazily saved/restored around runs */
		int loaded;
		u16 fs_sel, gs_sel, ldt_sel;
		int gs_ldt_reload_needed;
		int fs_reload_needed;
		int guest_efer_loaded;	/* EFER holds the guest value */
	} host_state;
	struct {		/* real-mode irq injection bookkeeping */
		struct {
			bool pending;
			u8 vector;
			unsigned rip;
		} irq;
	} rmode;
};
75 | |||
/* Convert a generic kvm_vcpu pointer to its containing vcpu_vmx. */
static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
{
	return container_of(vcpu, struct vcpu_vmx, vcpu);
}
80 | |||
static int init_rmode_tss(struct kvm *kvm);

/* per-cpu scratch VMX region and the VMCS currently loaded on each cpu */
static DEFINE_PER_CPU(struct vmcs *, vmxarea);
static DEFINE_PER_CPU(struct vmcs *, current_vmcs);

/* I/O permission bitmap pages (allocated/populated outside this chunk) */
static struct page *vmx_io_bitmap_a;
static struct page *vmx_io_bitmap_b;
88 | |||
/*
 * VMX capabilities and the control-field settings used for every VMCS:
 * region size and allocation order, revision id, and the pin/cpu-based,
 * secondary, exit and entry controls.  Filled in elsewhere (setup code
 * is not in this chunk).
 */
static struct vmcs_config {
	int size;
	int order;
	u32 revision_id;
	u32 pin_based_exec_ctrl;
	u32 cpu_based_exec_ctrl;
	u32 cpu_based_2nd_exec_ctrl;
	u32 vmexit_ctrl;
	u32 vmentry_ctrl;
} vmcs_config;
99 | |||
/* Map each VCPU_SREG_* to the four VMCS fields describing that segment. */
#define VMX_SEGMENT_FIELD(seg)					\
	[VCPU_SREG_##seg] = {					\
		.selector = GUEST_##seg##_SELECTOR,		\
		.base = GUEST_##seg##_BASE,			\
		.limit = GUEST_##seg##_LIMIT,			\
		.ar_bytes = GUEST_##seg##_AR_BYTES,		\
	}

static struct kvm_vmx_segment_field {
	unsigned selector;
	unsigned base;
	unsigned limit;
	unsigned ar_bytes;
} kvm_vmx_segment_fields[] = {
	VMX_SEGMENT_FIELD(CS),
	VMX_SEGMENT_FIELD(DS),
	VMX_SEGMENT_FIELD(ES),
	VMX_SEGMENT_FIELD(FS),
	VMX_SEGMENT_FIELD(GS),
	VMX_SEGMENT_FIELD(SS),
	VMX_SEGMENT_FIELD(TR),
	VMX_SEGMENT_FIELD(LDTR),
};
123 | |||
/*
 * MSRs switched between host and guest values around VM entry/exit
 * (see save_msrs()/load_msrs() and setup_msrs()).
 *
 * Keep MSR_K6_STAR at the end, as setup_msrs() will try to optimize it
 * away by decrementing the array size.
 */
static const u32 vmx_msr_index[] = {
#ifdef CONFIG_X86_64
	MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, MSR_KERNEL_GS_BASE,
#endif
	MSR_EFER, MSR_K6_STAR,
};
#define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index)
135 | |||
136 | static void load_msrs(struct kvm_msr_entry *e, int n) | ||
137 | { | ||
138 | int i; | ||
139 | |||
140 | for (i = 0; i < n; ++i) | ||
141 | wrmsrl(e[i].index, e[i].data); | ||
142 | } | ||
143 | |||
144 | static void save_msrs(struct kvm_msr_entry *e, int n) | ||
145 | { | ||
146 | int i; | ||
147 | |||
148 | for (i = 0; i < n; ++i) | ||
149 | rdmsrl(e[i].index, e[i].data); | ||
150 | } | ||
151 | |||
152 | static inline int is_page_fault(u32 intr_info) | ||
153 | { | ||
154 | return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | | ||
155 | INTR_INFO_VALID_MASK)) == | ||
156 | (INTR_TYPE_EXCEPTION | PF_VECTOR | INTR_INFO_VALID_MASK); | ||
157 | } | ||
158 | |||
159 | static inline int is_no_device(u32 intr_info) | ||
160 | { | ||
161 | return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | | ||
162 | INTR_INFO_VALID_MASK)) == | ||
163 | (INTR_TYPE_EXCEPTION | NM_VECTOR | INTR_INFO_VALID_MASK); | ||
164 | } | ||
165 | |||
166 | static inline int is_invalid_opcode(u32 intr_info) | ||
167 | { | ||
168 | return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | | ||
169 | INTR_INFO_VALID_MASK)) == | ||
170 | (INTR_TYPE_EXCEPTION | UD_VECTOR | INTR_INFO_VALID_MASK); | ||
171 | } | ||
172 | |||
173 | static inline int is_external_interrupt(u32 intr_info) | ||
174 | { | ||
175 | return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK)) | ||
176 | == (INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK); | ||
177 | } | ||
178 | |||
179 | static inline int cpu_has_vmx_tpr_shadow(void) | ||
180 | { | ||
181 | return (vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW); | ||
182 | } | ||
183 | |||
/* Use the TPR shadow for @kvm: supported and the irqchip is in-kernel. */
static inline int vm_need_tpr_shadow(struct kvm *kvm)
{
	return cpu_has_vmx_tpr_shadow() && irqchip_in_kernel(kvm);
}
188 | |||
189 | static inline int cpu_has_secondary_exec_ctrls(void) | ||
190 | { | ||
191 | return (vmcs_config.cpu_based_exec_ctrl & | ||
192 | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS); | ||
193 | } | ||
194 | |||
195 | static inline int cpu_has_vmx_virtualize_apic_accesses(void) | ||
196 | { | ||
197 | return (vmcs_config.cpu_based_2nd_exec_ctrl & | ||
198 | SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES); | ||
199 | } | ||
200 | |||
/* Virtualize APIC accesses for @kvm: supported and irqchip in-kernel. */
static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm)
{
	return cpu_has_vmx_virtualize_apic_accesses() &&
	       irqchip_in_kernel(kvm);
}
206 | |||
207 | static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr) | ||
208 | { | ||
209 | int i; | ||
210 | |||
211 | for (i = 0; i < vmx->nmsrs; ++i) | ||
212 | if (vmx->guest_msrs[i].index == msr) | ||
213 | return i; | ||
214 | return -1; | ||
215 | } | ||
216 | |||
217 | static struct kvm_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr) | ||
218 | { | ||
219 | int i; | ||
220 | |||
221 | i = __find_msr_index(vmx, msr); | ||
222 | if (i >= 0) | ||
223 | return &vmx->guest_msrs[i]; | ||
224 | return NULL; | ||
225 | } | ||
226 | |||
/*
 * VMCLEAR @vmcs: flush its cached state to memory and mark it inactive
 * and not-current, so it can subsequently be loaded on any cpu.
 */
static void vmcs_clear(struct vmcs *vmcs)
{
	u64 phys_addr = __pa(vmcs);
	u8 error;

	/* setna: error = 1 if VMCLEAR reported failure via CF/ZF */
	asm volatile (ASM_VMX_VMCLEAR_RAX "; setna %0"
		      : "=g"(error) : "a"(&phys_addr), "m"(phys_addr)
		      : "cc", "memory");
	if (error)
		printk(KERN_ERR "kvm: vmclear fail: %p/%llx\n",
		       vmcs, phys_addr);
}
239 | |||
/*
 * IPI target (see vcpu_clear()): runs on the cpu that last ran the vcpu.
 * Clears the vcpu's VMCS there and drops the per-cpu current-VMCS cache
 * if it points at it.
 */
static void __vcpu_clear(void *arg)
{
	struct vcpu_vmx *vmx = arg;
	int cpu = raw_smp_processor_id();

	if (vmx->vcpu.cpu == cpu)
		vmcs_clear(vmx->vmcs);
	if (per_cpu(current_vmcs, cpu) == vmx->vmcs)
		per_cpu(current_vmcs, cpu) = NULL;
	/* snapshot this cpu's tsc so vmx_vcpu_load() can compensate */
	rdtscll(vmx->vcpu.arch.host_tsc);
}
251 | |||
/*
 * Detach the vcpu's VMCS from whichever cpu currently holds it, forcing
 * a full relaunch next run.  No-op for a vcpu that has never run.
 */
static void vcpu_clear(struct vcpu_vmx *vmx)
{
	if (vmx->vcpu.cpu == -1)
		return;
	smp_call_function_single(vmx->vcpu.cpu, __vcpu_clear, vmx, 0, 1);
	vmx->launched = 0;
}
259 | |||
/* VMREAD: read a natural-width field from the current VMCS. */
static unsigned long vmcs_readl(unsigned long field)
{
	unsigned long value;

	asm volatile (ASM_VMX_VMREAD_RDX_RAX
		      : "=a"(value) : "d"(field) : "cc");
	return value;
}
268 | |||
269 | static u16 vmcs_read16(unsigned long field) | ||
270 | { | ||
271 | return vmcs_readl(field); | ||
272 | } | ||
273 | |||
274 | static u32 vmcs_read32(unsigned long field) | ||
275 | { | ||
276 | return vmcs_readl(field); | ||
277 | } | ||
278 | |||
/*
 * 64-bit VMCS field read.  On 32-bit hosts a 64-bit field is accessed as
 * two adjacent 32-bit halves at @field and @field + 1.
 */
static u64 vmcs_read64(unsigned long field)
{
#ifdef CONFIG_X86_64
	return vmcs_readl(field);
#else
	return vmcs_readl(field) | ((u64)vmcs_readl(field+1) << 32);
#endif
}
287 | |||
288 | static noinline void vmwrite_error(unsigned long field, unsigned long value) | ||
289 | { | ||
290 | printk(KERN_ERR "vmwrite error: reg %lx value %lx (err %d)\n", | ||
291 | field, value, vmcs_read32(VM_INSTRUCTION_ERROR)); | ||
292 | dump_stack(); | ||
293 | } | ||
294 | |||
/* VMWRITE: write a natural-width field in the current VMCS; log failure. */
static void vmcs_writel(unsigned long field, unsigned long value)
{
	u8 error;

	/* setna: error = 1 if VMWRITE reported failure via CF/ZF */
	asm volatile (ASM_VMX_VMWRITE_RAX_RDX "; setna %0"
		      : "=q"(error) : "a"(value), "d"(field) : "cc");
	if (unlikely(error))
		vmwrite_error(field, value);
}
304 | |||
/* 16-bit variant of vmcs_writel(). */
static void vmcs_write16(unsigned long field, u16 value)
{
	vmcs_writel(field, value);
}
309 | |||
/* 32-bit variant of vmcs_writel(). */
static void vmcs_write32(unsigned long field, u32 value)
{
	vmcs_writel(field, value);
}
314 | |||
/*
 * 64-bit VMCS field write.  On 32-bit hosts this is two 32-bit writes to
 * @field and @field + 1; the empty asm acts as a compiler barrier between
 * the two halves (presumably to keep them from being merged/reordered).
 */
static void vmcs_write64(unsigned long field, u64 value)
{
#ifdef CONFIG_X86_64
	vmcs_writel(field, value);
#else
	vmcs_writel(field, value);
	asm volatile ("");
	vmcs_writel(field+1, value >> 32);
#endif
}
325 | |||
326 | static void vmcs_clear_bits(unsigned long field, u32 mask) | ||
327 | { | ||
328 | vmcs_writel(field, vmcs_readl(field) & ~mask); | ||
329 | } | ||
330 | |||
331 | static void vmcs_set_bits(unsigned long field, u32 mask) | ||
332 | { | ||
333 | vmcs_writel(field, vmcs_readl(field) | mask); | ||
334 | } | ||
335 | |||
/*
 * Recompute which guest exceptions cause a VM exit.  #PF and #UD are
 * always intercepted; #NM is added while the guest fpu is not active;
 * vector 1 (#DB) while guest debugging is enabled; and in real mode
 * every exception is intercepted.
 */
static void update_exception_bitmap(struct kvm_vcpu *vcpu)
{
	u32 eb;

	eb = (1u << PF_VECTOR) | (1u << UD_VECTOR);
	if (!vcpu->fpu_active)
		eb |= 1u << NM_VECTOR;
	if (vcpu->guest_debug.enabled)
		eb |= 1u << 1;
	if (vcpu->arch.rmode.active)
		eb = ~0;
	vmcs_write32(EXCEPTION_BITMAP, eb);
}
349 | |||
/*
 * Reload the host TR after a VM exit (compiled out on 64-bit hosts).
 * The exit restores the TR selector but not the full descriptor state,
 * so mark the GDT entry "available TSS" again and reload TR.
 */
static void reload_tss(void)
{
#ifndef CONFIG_X86_64

	/*
	 * VT restores TR but not its size.  Useless.
	 */
	struct descriptor_table gdt;
	struct segment_descriptor *descs;

	get_gdt(&gdt);
	descs = (void *)gdt.base;
	descs[GDT_ENTRY_TSS].type = 9; /* available TSS */
	load_TR_desc();
#endif
}
366 | |||
367 | static void load_transition_efer(struct vcpu_vmx *vmx) | ||
368 | { | ||
369 | int efer_offset = vmx->msr_offset_efer; | ||
370 | u64 host_efer = vmx->host_msrs[efer_offset].data; | ||
371 | u64 guest_efer = vmx->guest_msrs[efer_offset].data; | ||
372 | u64 ignore_bits; | ||
373 | |||
374 | if (efer_offset < 0) | ||
375 | return; | ||
376 | /* | ||
377 | * NX is emulated; LMA and LME handled by hardware; SCE meaninless | ||
378 | * outside long mode | ||
379 | */ | ||
380 | ignore_bits = EFER_NX | EFER_SCE; | ||
381 | #ifdef CONFIG_X86_64 | ||
382 | ignore_bits |= EFER_LMA | EFER_LME; | ||
383 | /* SCE is meaningful only in long mode on Intel */ | ||
384 | if (guest_efer & EFER_LMA) | ||
385 | ignore_bits &= ~(u64)EFER_SCE; | ||
386 | #endif | ||
387 | if ((guest_efer & ~ignore_bits) == (host_efer & ~ignore_bits)) | ||
388 | return; | ||
389 | |||
390 | vmx->host_state.guest_efer_loaded = 1; | ||
391 | guest_efer &= ~ignore_bits; | ||
392 | guest_efer |= host_efer & ignore_bits; | ||
393 | wrmsrl(MSR_EFER, guest_efer); | ||
394 | vmx->vcpu.stat.efer_reload++; | ||
395 | } | ||
396 | |||
397 | static void reload_host_efer(struct vcpu_vmx *vmx) | ||
398 | { | ||
399 | if (vmx->host_state.guest_efer_loaded) { | ||
400 | vmx->host_state.guest_efer_loaded = 0; | ||
401 | load_msrs(vmx->host_msrs + vmx->msr_offset_efer, 1); | ||
402 | } | ||
403 | } | ||
404 | |||
/*
 * Save live host state (segment selectors, FS/GS bases, switched MSRs)
 * and load the guest's MSR values, once per host/guest transition.
 * Idempotent until vmx_load_host_state() clears host_state.loaded.
 */
static void vmx_save_host_state(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	if (vmx->host_state.loaded)
		return;

	vmx->host_state.loaded = 1;
	/*
	 * Set host fs and gs selectors.  Unfortunately, 22.2.3 does not
	 * allow segment selectors with cpl > 0 or ti == 1.
	 */
	vmx->host_state.ldt_sel = read_ldt();
	/* a nonzero LDT selector forces an LDT (and gs) reload on the way back */
	vmx->host_state.gs_ldt_reload_needed = vmx->host_state.ldt_sel;
	vmx->host_state.fs_sel = read_fs();
	if (!(vmx->host_state.fs_sel & 7)) {
		vmcs_write16(HOST_FS_SELECTOR, vmx->host_state.fs_sel);
		vmx->host_state.fs_reload_needed = 0;
	} else {
		/* selector has TI/RPL bits set: not storable in the VMCS */
		vmcs_write16(HOST_FS_SELECTOR, 0);
		vmx->host_state.fs_reload_needed = 1;
	}
	vmx->host_state.gs_sel = read_gs();
	if (!(vmx->host_state.gs_sel & 7))
		vmcs_write16(HOST_GS_SELECTOR, vmx->host_state.gs_sel);
	else {
		vmcs_write16(HOST_GS_SELECTOR, 0);
		vmx->host_state.gs_ldt_reload_needed = 1;
	}

#ifdef CONFIG_X86_64
	vmcs_writel(HOST_FS_BASE, read_msr(MSR_FS_BASE));
	vmcs_writel(HOST_GS_BASE, read_msr(MSR_GS_BASE));
#else
	vmcs_writel(HOST_FS_BASE, segment_base(vmx->host_state.fs_sel));
	vmcs_writel(HOST_GS_BASE, segment_base(vmx->host_state.gs_sel));
#endif

#ifdef CONFIG_X86_64
	if (is_long_mode(&vmx->vcpu))
		save_msrs(vmx->host_msrs +
			  vmx->msr_offset_kernel_gs_base, 1);

#endif
	/* switch the tracked MSRs to their guest values */
	load_msrs(vmx->guest_msrs, vmx->save_nmsrs);
	load_transition_efer(vmx);
}
452 | |||
/*
 * Undo vmx_save_host_state(): restore host segment registers, TR and the
 * switched MSRs after the guest has run (called from vmx_vcpu_put()).
 */
static void vmx_load_host_state(struct vcpu_vmx *vmx)
{
	unsigned long flags;

	if (!vmx->host_state.loaded)
		return;

	++vmx->vcpu.stat.host_state_reload;
	vmx->host_state.loaded = 0;
	if (vmx->host_state.fs_reload_needed)
		load_fs(vmx->host_state.fs_sel);
	if (vmx->host_state.gs_ldt_reload_needed) {
		load_ldt(vmx->host_state.ldt_sel);
		/*
		 * If we have to reload gs, we must take care to
		 * preserve our gs base.
		 */
		local_irq_save(flags);
		load_gs(vmx->host_state.gs_sel);
#ifdef CONFIG_X86_64
		/* loading gs clobbers GS_BASE; restore it from the VMCS copy */
		wrmsrl(MSR_GS_BASE, vmcs_readl(HOST_GS_BASE));
#endif
		local_irq_restore(flags);
	}
	reload_tss();
	/* save guest values, then restore the host's tracked MSRs */
	save_msrs(vmx->guest_msrs, vmx->save_nmsrs);
	load_msrs(vmx->host_msrs, vmx->save_nmsrs);
	reload_host_efer(vmx);
}
482 | |||
/*
 * Switches to specified vcpu, until a matching vcpu_put(), but assumes
 * vcpu mutex is already taken.  Migrating the vcpu to a new cpu clears
 * its VMCS on the old cpu, makes it current here (VMPTRLD), refreshes the
 * per-cpu host fields, and re-bases TSC_OFFSET so the guest-visible TSC
 * stays monotonic.
 */
static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	u64 phys_addr = __pa(vmx->vmcs);
	u64 tsc_this, delta;

	/* moving cpus: pull the VMCS off the previous cpu first */
	if (vcpu->cpu != cpu) {
		vcpu_clear(vmx);
		kvm_migrate_apic_timer(vcpu);
	}

	/* make this VMCS current on this cpu if it is not already */
	if (per_cpu(current_vmcs, cpu) != vmx->vmcs) {
		u8 error;

		per_cpu(current_vmcs, cpu) = vmx->vmcs;
		asm volatile (ASM_VMX_VMPTRLD_RAX "; setna %0"
			      : "=g"(error) : "a"(&phys_addr), "m"(phys_addr)
			      : "cc");
		if (error)
			printk(KERN_ERR "kvm: vmptrld %p/%llx fail\n",
			       vmx->vmcs, phys_addr);
	}

	if (vcpu->cpu != cpu) {
		struct descriptor_table dt;
		unsigned long sysenter_esp;

		vcpu->cpu = cpu;
		/*
		 * Linux uses per-cpu TSS and GDT, so set these when switching
		 * processors.
		 */
		vmcs_writel(HOST_TR_BASE, read_tr_base()); /* 22.2.4 */
		get_gdt(&dt);
		vmcs_writel(HOST_GDTR_BASE, dt.base);   /* 22.2.4 */

		rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
		vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */

		/*
		 * Make sure the time stamp counter is monotonous.
		 */
		rdtscll(tsc_this);
		/* host_tsc was snapshotted on the old cpu by __vcpu_clear() */
		delta = vcpu->arch.host_tsc - tsc_this;
		vmcs_write64(TSC_OFFSET, vmcs_read64(TSC_OFFSET) + delta);
	}
}
534 | |||
/* Counterpart of vmx_vcpu_load(): give the host its state back. */
static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
{
	vmx_load_host_state(to_vmx(vcpu));
}
539 | |||
/*
 * Give the guest direct fpu access: clear CR0.TS in the hardware guest
 * state (unless the guest's own CR0 has TS set, in which case it is kept)
 * and stop intercepting #NM via update_exception_bitmap().
 */
static void vmx_fpu_activate(struct kvm_vcpu *vcpu)
{
	if (vcpu->fpu_active)
		return;
	vcpu->fpu_active = 1;
	vmcs_clear_bits(GUEST_CR0, X86_CR0_TS);
	if (vcpu->arch.cr0 & X86_CR0_TS)
		vmcs_set_bits(GUEST_CR0, X86_CR0_TS);
	update_exception_bitmap(vcpu);
}
550 | |||
551 | static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu) | ||
552 | { | ||
553 | if (!vcpu->fpu_active) | ||
554 | return; | ||
555 | vcpu->fpu_active = 0; | ||
556 | vmcs_set_bits(GUEST_CR0, X86_CR0_TS); | ||
557 | update_exception_bitmap(vcpu); | ||
558 | } | ||
559 | |||
/* Drop any cpu-local cached VMCS state for this vcpu. */
static void vmx_vcpu_decache(struct kvm_vcpu *vcpu)
{
	vcpu_clear(to_vmx(vcpu));
}
564 | |||
/* Read the guest's RFLAGS from the VMCS. */
static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
{
	return vmcs_readl(GUEST_RFLAGS);
}
569 | |||
570 | static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) | ||
571 | { | ||
572 | if (vcpu->arch.rmode.active) | ||
573 | rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; | ||
574 | vmcs_writel(GUEST_RFLAGS, rflags); | ||
575 | } | ||
576 | |||
/*
 * Advance the guest RIP past the instruction that just exited, using the
 * hardware-reported instruction length, and clear the low two
 * interruptibility-state bits (STI / mov-ss blocking per the SDM) so a
 * pending interrupt can be delivered.
 */
static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
{
	unsigned long rip;
	u32 interruptibility;

	rip = vmcs_readl(GUEST_RIP);
	rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
	vmcs_writel(GUEST_RIP, rip);

	/*
	 * We emulated an instruction, so temporary interrupt blocking
	 * should be removed, if set.
	 */
	interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
	if (interruptibility & 3)
		vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
			     interruptibility & ~3);
	vcpu->arch.interrupt_window_open = 1;
}
596 | |||
/*
 * Queue exception @nr for injection on the next VM entry through the
 * VM-entry interruption-information field, with @error_code attached
 * when @has_error_code is set.
 */
static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
				bool has_error_code, u32 error_code)
{
	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
		     nr | INTR_TYPE_EXCEPTION
		     | (has_error_code ? INTR_INFO_DELIEVER_CODE_MASK : 0)
		     | INTR_INFO_VALID_MASK);
	if (has_error_code)
		vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
}
607 | |||
608 | static bool vmx_exception_injected(struct kvm_vcpu *vcpu) | ||
609 | { | ||
610 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
611 | |||
612 | return !(vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK); | ||
613 | } | ||
614 | |||
/*
 * Swap MSR entry in host/guest MSR entry array.
 */
#ifdef CONFIG_X86_64
static void move_msr_up(struct vcpu_vmx *vmx, int from, int to)
{
	struct kvm_msr_entry guest_tmp = vmx->guest_msrs[from];
	struct kvm_msr_entry host_tmp = vmx->host_msrs[from];

	vmx->guest_msrs[from] = vmx->guest_msrs[to];
	vmx->guest_msrs[to] = guest_tmp;
	vmx->host_msrs[from] = vmx->host_msrs[to];
	vmx->host_msrs[to] = host_tmp;
}
#endif
631 | |||
/*
 * Set up the vmcs to automatically save and restore system
 * msrs.  Don't touch the 64-bit msrs if the guest is in legacy
 * mode, as fiddling with msrs is very expensive.
 */
static void setup_msrs(struct vcpu_vmx *vmx)
{
	int save_nmsrs;

	save_nmsrs = 0;
#ifdef CONFIG_X86_64
	if (is_long_mode(&vmx->vcpu)) {
		int index;

		/*
		 * Compact the syscall-related MSRs to the front of the
		 * arrays so that only the first save_nmsrs entries need
		 * to be switched on guest/host transitions.
		 * __find_msr_index() returns a negative value when the
		 * MSR is absent.
		 */
		index = __find_msr_index(vmx, MSR_SYSCALL_MASK);
		if (index >= 0)
			move_msr_up(vmx, index, save_nmsrs++);
		index = __find_msr_index(vmx, MSR_LSTAR);
		if (index >= 0)
			move_msr_up(vmx, index, save_nmsrs++);
		index = __find_msr_index(vmx, MSR_CSTAR);
		if (index >= 0)
			move_msr_up(vmx, index, save_nmsrs++);
		index = __find_msr_index(vmx, MSR_KERNEL_GS_BASE);
		if (index >= 0)
			move_msr_up(vmx, index, save_nmsrs++);
		/*
		 * MSR_K6_STAR is only needed on long mode guests, and only
		 * if efer.sce is enabled.
		 */
		index = __find_msr_index(vmx, MSR_K6_STAR);
		if ((index >= 0) && (vmx->vcpu.arch.shadow_efer & EFER_SCE))
			move_msr_up(vmx, index, save_nmsrs++);
	}
#endif
	vmx->save_nmsrs = save_nmsrs;

	/* Cache offsets of frequently-accessed MSR entries. */
#ifdef CONFIG_X86_64
	vmx->msr_offset_kernel_gs_base =
		__find_msr_index(vmx, MSR_KERNEL_GS_BASE);
#endif
	vmx->msr_offset_efer = __find_msr_index(vmx, MSR_EFER);
}
675 | |||
/*
 * reads and returns guest's timestamp counter "register"
 * guest_tsc = host_tsc + tsc_offset    -- 21.3
 * (tsc_offset is the VMCS TSC-offset field applied by hardware.)
 */
static u64 guest_read_tsc(void)
{
	u64 host_tsc, tsc_offset;

	rdtscll(host_tsc);
	tsc_offset = vmcs_read64(TSC_OFFSET);
	return host_tsc + tsc_offset;
}
688 | |||
/*
 * writes 'guest_tsc' into guest's timestamp counter "register"
 * guest_tsc = host_tsc + tsc_offset ==> tsc_offset = guest_tsc - host_tsc
 * The offset is recomputed against the current host TSC value.
 */
static void guest_write_tsc(u64 guest_tsc)
{
	u64 host_tsc;

	rdtscll(host_tsc);
	vmcs_write64(TSC_OFFSET, guest_tsc - host_tsc);
}
700 | |||
/*
 * Reads an msr value (of 'msr_index') into 'pdata'.
 * Returns 0 on success, non-0 otherwise.
 * Assumes vcpu_load() was already called.
 *
 * MSRs shadowed in the VMCS are read from there; MSRs in the
 * auto-switched array come from the cached entry; everything else
 * falls back to the generic kvm_get_msr_common() path.
 */
static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
{
	u64 data;
	struct kvm_msr_entry *msr;

	if (!pdata) {
		printk(KERN_ERR "BUG: get_msr called with NULL pdata\n");
		return -EINVAL;
	}

	switch (msr_index) {
#ifdef CONFIG_X86_64
	case MSR_FS_BASE:
		data = vmcs_readl(GUEST_FS_BASE);
		break;
	case MSR_GS_BASE:
		data = vmcs_readl(GUEST_GS_BASE);
		break;
	case MSR_EFER:
		return kvm_get_msr_common(vcpu, msr_index, pdata);
#endif
	case MSR_IA32_TIME_STAMP_COUNTER:
		data = guest_read_tsc();
		break;
	case MSR_IA32_SYSENTER_CS:
		data = vmcs_read32(GUEST_SYSENTER_CS);
		break;
	case MSR_IA32_SYSENTER_EIP:
		data = vmcs_readl(GUEST_SYSENTER_EIP);
		break;
	case MSR_IA32_SYSENTER_ESP:
		data = vmcs_readl(GUEST_SYSENTER_ESP);
		break;
	default:
		/* Auto-switched MSR?  Otherwise use the common path. */
		msr = find_msr_entry(to_vmx(vcpu), msr_index);
		if (msr) {
			data = msr->data;
			break;
		}
		return kvm_get_msr_common(vcpu, msr_index, pdata);
	}

	*pdata = data;
	return 0;
}
751 | |||
/*
 * Writes msr value into into the appropriate "register".
 * Returns 0 on success, non-0 otherwise.
 * Assumes vcpu_load() was already called.
 */
static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct kvm_msr_entry *msr;
	int ret = 0;

	switch (msr_index) {
#ifdef CONFIG_X86_64
	case MSR_EFER:
		ret = kvm_set_msr_common(vcpu, msr_index, data);
		/*
		 * If host MSR state is currently loaded, re-evaluate the
		 * host/guest EFER transition so the new value takes
		 * effect immediately.
		 */
		if (vmx->host_state.loaded) {
			reload_host_efer(vmx);
			load_transition_efer(vmx);
		}
		break;
	case MSR_FS_BASE:
		vmcs_writel(GUEST_FS_BASE, data);
		break;
	case MSR_GS_BASE:
		vmcs_writel(GUEST_GS_BASE, data);
		break;
#endif
	case MSR_IA32_SYSENTER_CS:
		vmcs_write32(GUEST_SYSENTER_CS, data);
		break;
	case MSR_IA32_SYSENTER_EIP:
		vmcs_writel(GUEST_SYSENTER_EIP, data);
		break;
	case MSR_IA32_SYSENTER_ESP:
		vmcs_writel(GUEST_SYSENTER_ESP, data);
		break;
	case MSR_IA32_TIME_STAMP_COUNTER:
		guest_write_tsc(data);
		break;
	default:
		/* Auto-switched MSR?  Otherwise use the common path. */
		msr = find_msr_entry(vmx, msr_index);
		if (msr) {
			msr->data = data;
			/* Push the new value to hardware if loaded. */
			if (vmx->host_state.loaded)
				load_msrs(vmx->guest_msrs, vmx->save_nmsrs);
			break;
		}
		ret = kvm_set_msr_common(vcpu, msr_index, data);
	}

	return ret;
}
804 | |||
/*
 * Sync the rsp and rip registers into the vcpu structure.  This allows
 * registers to be accessed by indexing vcpu->arch.regs.
 */
static void vcpu_load_rsp_rip(struct kvm_vcpu *vcpu)
{
	vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP);
	vcpu->arch.rip = vmcs_readl(GUEST_RIP);
}
814 | |||
/*
 * Syncs rsp and rip back into the vmcs.  Should be called after possible
 * modification.
 */
static void vcpu_put_rsp_rip(struct kvm_vcpu *vcpu)
{
	vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
	vmcs_writel(GUEST_RIP, vcpu->arch.rip);
}
824 | |||
/*
 * Program guest hardware breakpoints and single-stepping from a debug
 * ioctl request.  Builds a DR7 value from the enabled breakpoints and
 * clears TF/RF when single-stepping is being switched off.
 * Always returns 0.
 */
static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg)
{
	unsigned long dr7 = 0x400;	/* DR7 bit 10 is reserved, must be 1 */
	int old_singlestep;

	old_singlestep = vcpu->guest_debug.singlestep;

	vcpu->guest_debug.enabled = dbg->enabled;
	if (vcpu->guest_debug.enabled) {
		int i;

		dr7 |= 0x200;  /* exact */
		for (i = 0; i < 4; ++i) {
			if (!dbg->breakpoints[i].enabled)
				continue;
			vcpu->guest_debug.bp[i] = dbg->breakpoints[i].address;
			dr7 |= 2 << (i*2);    /* global enable */
			dr7 |= 0 << (i*4+16); /* execution breakpoint */
		}

		vcpu->guest_debug.singlestep = dbg->singlestep;
	} else
		vcpu->guest_debug.singlestep = 0;

	/* Leaving single-step mode: drop the trap/resume flags. */
	if (old_singlestep && !vcpu->guest_debug.singlestep) {
		unsigned long flags;

		flags = vmcs_readl(GUEST_RFLAGS);
		flags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
		vmcs_writel(GUEST_RFLAGS, flags);
	}

	update_exception_bitmap(vcpu);
	vmcs_writel(GUEST_DR7, dr7);

	return 0;
}
862 | |||
863 | static int vmx_get_irq(struct kvm_vcpu *vcpu) | ||
864 | { | ||
865 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
866 | u32 idtv_info_field; | ||
867 | |||
868 | idtv_info_field = vmx->idt_vectoring_info; | ||
869 | if (idtv_info_field & INTR_INFO_VALID_MASK) { | ||
870 | if (is_external_interrupt(idtv_info_field)) | ||
871 | return idtv_info_field & VECTORING_INFO_VECTOR_MASK; | ||
872 | else | ||
873 | printk(KERN_DEBUG "pending exception: not handled yet\n"); | ||
874 | } | ||
875 | return -1; | ||
876 | } | ||
877 | |||
/* Report whether this CPU advertises VMX via CPUID. */
static __init int cpu_has_kvm_support(void)
{
	unsigned long ecx = cpuid_ecx(1);
	return test_bit(5, &ecx); /* CPUID.1:ECX.VMX[bit 5] -> VT */
}
883 | |||
/*
 * Report whether the BIOS locked the feature-control MSR with VMX
 * disabled, which prevents us from turning VMX on ourselves.
 */
static __init int vmx_disabled_by_bios(void)
{
	u64 msr;

	rdmsrl(MSR_IA32_FEATURE_CONTROL, msr);
	return (msr & (MSR_IA32_FEATURE_CONTROL_LOCKED |
		       MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED))
	    == MSR_IA32_FEATURE_CONTROL_LOCKED;
	/* locked but not enabled */
}
894 | |||
/*
 * Enable VMX operation on the current CPU: set (and lock) the
 * feature-control MSR if the BIOS has not already done so, set
 * CR4.VMXE, then execute VMXON with this CPU's vmxarea region.
 * Runs on each CPU via an IPI; 'garbage' is unused.
 */
static void hardware_enable(void *garbage)
{
	int cpu = raw_smp_processor_id();
	u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
	u64 old;

	rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
	if ((old & (MSR_IA32_FEATURE_CONTROL_LOCKED |
		    MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED))
	    != (MSR_IA32_FEATURE_CONTROL_LOCKED |
		MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED))
		/* enable and lock */
		wrmsrl(MSR_IA32_FEATURE_CONTROL, old |
		       MSR_IA32_FEATURE_CONTROL_LOCKED |
		       MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED);
	write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */
	asm volatile (ASM_VMX_VMXON_RAX : : "a"(&phys_addr), "m"(phys_addr)
		      : "memory", "cc");
}
914 | |||
/* Leave VMX operation on the current CPU.  'garbage' is unused. */
static void hardware_disable(void *garbage)
{
	asm volatile (ASM_VMX_VMXOFF : : : "cc");
}
919 | |||
/*
 * Reconcile the desired VMX control bits with what the hardware
 * supports, as reported by capability MSR 'msr'.
 *
 * @ctl_min: bits that must be settable; failure if the CPU lacks any.
 * @ctl_opt: bits we would like set if the CPU supports them.
 * @result:  receives the adjusted control value on success.
 *
 * Returns 0 on success, -EIO if a required bit is unsupported.
 */
static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
				      u32 msr, u32 *result)
{
	u32 vmx_msr_low, vmx_msr_high;
	u32 ctl = ctl_min | ctl_opt;

	rdmsr(msr, vmx_msr_low, vmx_msr_high);

	ctl &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */
	ctl |= vmx_msr_low;  /* bit == 1 in low word  ==> must be one  */

	/* Ensure minimum (required) set of control bits are supported. */
	if (ctl_min & ~ctl)
		return -EIO;

	*result = ctl;
	return 0;
}
938 | |||
939 | static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) | ||
940 | { | ||
941 | u32 vmx_msr_low, vmx_msr_high; | ||
942 | u32 min, opt; | ||
943 | u32 _pin_based_exec_control = 0; | ||
944 | u32 _cpu_based_exec_control = 0; | ||
945 | u32 _cpu_based_2nd_exec_control = 0; | ||
946 | u32 _vmexit_control = 0; | ||
947 | u32 _vmentry_control = 0; | ||
948 | |||
949 | min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING; | ||
950 | opt = 0; | ||
951 | if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS, | ||
952 | &_pin_based_exec_control) < 0) | ||
953 | return -EIO; | ||
954 | |||
955 | min = CPU_BASED_HLT_EXITING | | ||
956 | #ifdef CONFIG_X86_64 | ||
957 | CPU_BASED_CR8_LOAD_EXITING | | ||
958 | CPU_BASED_CR8_STORE_EXITING | | ||
959 | #endif | ||
960 | CPU_BASED_USE_IO_BITMAPS | | ||
961 | CPU_BASED_MOV_DR_EXITING | | ||
962 | CPU_BASED_USE_TSC_OFFSETING; | ||
963 | opt = CPU_BASED_TPR_SHADOW | | ||
964 | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; | ||
965 | if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS, | ||
966 | &_cpu_based_exec_control) < 0) | ||
967 | return -EIO; | ||
968 | #ifdef CONFIG_X86_64 | ||
969 | if ((_cpu_based_exec_control & CPU_BASED_TPR_SHADOW)) | ||
970 | _cpu_based_exec_control &= ~CPU_BASED_CR8_LOAD_EXITING & | ||
971 | ~CPU_BASED_CR8_STORE_EXITING; | ||
972 | #endif | ||
973 | if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) { | ||
974 | min = 0; | ||
975 | opt = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | | ||
976 | SECONDARY_EXEC_WBINVD_EXITING; | ||
977 | if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS2, | ||
978 | &_cpu_based_2nd_exec_control) < 0) | ||
979 | return -EIO; | ||
980 | } | ||
981 | #ifndef CONFIG_X86_64 | ||
982 | if (!(_cpu_based_2nd_exec_control & | ||
983 | SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) | ||
984 | _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW; | ||
985 | #endif | ||
986 | |||
987 | min = 0; | ||
988 | #ifdef CONFIG_X86_64 | ||
989 | min |= VM_EXIT_HOST_ADDR_SPACE_SIZE; | ||
990 | #endif | ||
991 | opt = 0; | ||
992 | if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS, | ||
993 | &_vmexit_control) < 0) | ||
994 | return -EIO; | ||
995 | |||
996 | min = opt = 0; | ||
997 | if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS, | ||
998 | &_vmentry_control) < 0) | ||
999 | return -EIO; | ||
1000 | |||
1001 | rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high); | ||
1002 | |||
1003 | /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */ | ||
1004 | if ((vmx_msr_high & 0x1fff) > PAGE_SIZE) | ||
1005 | return -EIO; | ||
1006 | |||
1007 | #ifdef CONFIG_X86_64 | ||
1008 | /* IA-32 SDM Vol 3B: 64-bit CPUs always have VMX_BASIC_MSR[48]==0. */ | ||
1009 | if (vmx_msr_high & (1u<<16)) | ||
1010 | return -EIO; | ||
1011 | #endif | ||
1012 | |||
1013 | /* Require Write-Back (WB) memory type for VMCS accesses. */ | ||
1014 | if (((vmx_msr_high >> 18) & 15) != 6) | ||
1015 | return -EIO; | ||
1016 | |||
1017 | vmcs_conf->size = vmx_msr_high & 0x1fff; | ||
1018 | vmcs_conf->order = get_order(vmcs_config.size); | ||
1019 | vmcs_conf->revision_id = vmx_msr_low; | ||
1020 | |||
1021 | vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control; | ||
1022 | vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control; | ||
1023 | vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control; | ||
1024 | vmcs_conf->vmexit_ctrl = _vmexit_control; | ||
1025 | vmcs_conf->vmentry_ctrl = _vmentry_control; | ||
1026 | |||
1027 | return 0; | ||
1028 | } | ||
1029 | |||
/*
 * Allocate and initialize a VMCS region on the node of the given CPU.
 * Returns NULL if the page allocation fails.
 */
static struct vmcs *alloc_vmcs_cpu(int cpu)
{
	int node = cpu_to_node(cpu);
	struct page *pages;
	struct vmcs *vmcs;

	pages = alloc_pages_node(node, GFP_KERNEL, vmcs_config.order);
	if (!pages)
		return NULL;
	vmcs = page_address(pages);
	memset(vmcs, 0, vmcs_config.size);
	vmcs->revision_id = vmcs_config.revision_id; /* vmcs revision id */
	return vmcs;
}
1044 | |||
/* Allocate a VMCS region local to the current CPU's node. */
static struct vmcs *alloc_vmcs(void)
{
	return alloc_vmcs_cpu(raw_smp_processor_id());
}

/* Free a VMCS region allocated by alloc_vmcs_cpu(). */
static void free_vmcs(struct vmcs *vmcs)
{
	free_pages((unsigned long)vmcs, vmcs_config.order);
}
1054 | |||
/* Free the per-CPU VMXON regions for all online CPUs. */
static void free_kvm_area(void)
{
	int cpu;

	for_each_online_cpu(cpu)
		free_vmcs(per_cpu(vmxarea, cpu));
}
1062 | |||
/*
 * Allocate a per-CPU VMXON region for every online CPU.
 * Returns 0 on success; on failure frees everything allocated so far
 * and returns -ENOMEM.
 */
static __init int alloc_kvm_area(void)
{
	int cpu;

	for_each_online_cpu(cpu) {
		struct vmcs *vmcs;

		vmcs = alloc_vmcs_cpu(cpu);
		if (!vmcs) {
			free_kvm_area();
			return -ENOMEM;
		}

		per_cpu(vmxarea, cpu) = vmcs;
	}
	return 0;
}
1080 | |||
/*
 * One-time module init: probe VMX capabilities into vmcs_config and
 * allocate the per-CPU VMXON regions.
 */
static __init int hardware_setup(void)
{
	if (setup_vmcs_config(&vmcs_config) < 0)
		return -EIO;
	return alloc_kvm_area();
}

/* Module-exit counterpart: release the per-CPU VMXON regions. */
static __exit void hardware_unsetup(void)
{
	free_kvm_area();
}
1092 | |||
/*
 * Restore a data segment's protected-mode state when leaving emulated
 * real mode.  If the saved state still matches, reload it verbatim;
 * otherwise synthesize access rights for a present, writable data
 * segment whose DPL matches the current selector's RPL.
 *
 * NOTE(review): testing AR_S_MASK against save->base looks suspicious;
 * the S bit lives in the access rights, so save->ar seems intended.
 * Confirm against hardware behavior before changing.
 */
static void fix_pmode_dataseg(int seg, struct kvm_save_segment *save)
{
	struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];

	if (vmcs_readl(sf->base) == save->base && (save->base & AR_S_MASK)) {
		vmcs_write16(sf->selector, save->selector);
		vmcs_writel(sf->base, save->base);
		vmcs_write32(sf->limit, save->limit);
		vmcs_write32(sf->ar_bytes, save->ar);
	} else {
		u32 dpl = (vmcs_read16(sf->selector) & SELECTOR_RPL_MASK)
			<< AR_DPL_SHIFT;
		vmcs_write32(sf->ar_bytes, 0x93 | dpl);
	}
}
1108 | |||
/*
 * Leave emulated real mode: restore the TR state and flags saved by
 * enter_rmode(), drop the vm86 VM/IOPL bits, and repair the data
 * segments so the guest runs in genuine protected mode.
 */
static void enter_pmode(struct kvm_vcpu *vcpu)
{
	unsigned long flags;

	vcpu->arch.rmode.active = 0;

	vmcs_writel(GUEST_TR_BASE, vcpu->arch.rmode.tr.base);
	vmcs_write32(GUEST_TR_LIMIT, vcpu->arch.rmode.tr.limit);
	vmcs_write32(GUEST_TR_AR_BYTES, vcpu->arch.rmode.tr.ar);

	/* Clear VM, restore the IOPL the guest had before vm86. */
	flags = vmcs_readl(GUEST_RFLAGS);
	flags &= ~(X86_EFLAGS_IOPL | X86_EFLAGS_VM);
	flags |= (vcpu->arch.rmode.save_iopl << IOPL_SHIFT);
	vmcs_writel(GUEST_RFLAGS, flags);

	/* Keep only the guest-visible (shadow) VME bit in CR4. */
	vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) |
			(vmcs_readl(CR4_READ_SHADOW) & X86_CR4_VME));

	update_exception_bitmap(vcpu);

	fix_pmode_dataseg(VCPU_SREG_ES, &vcpu->arch.rmode.es);
	fix_pmode_dataseg(VCPU_SREG_DS, &vcpu->arch.rmode.ds);
	fix_pmode_dataseg(VCPU_SREG_GS, &vcpu->arch.rmode.gs);
	fix_pmode_dataseg(VCPU_SREG_FS, &vcpu->arch.rmode.fs);

	vmcs_write16(GUEST_SS_SELECTOR, 0);
	vmcs_write32(GUEST_SS_AR_BYTES, 0x93);

	vmcs_write16(GUEST_CS_SELECTOR,
		     vmcs_read16(GUEST_CS_SELECTOR) & ~SELECTOR_RPL_MASK);
	vmcs_write32(GUEST_CS_AR_BYTES, 0x9b);
}
1141 | |||
1142 | static gva_t rmode_tss_base(struct kvm *kvm) | ||
1143 | { | ||
1144 | if (!kvm->arch.tss_addr) { | ||
1145 | gfn_t base_gfn = kvm->memslots[0].base_gfn + | ||
1146 | kvm->memslots[0].npages - 3; | ||
1147 | return base_gfn << PAGE_SHIFT; | ||
1148 | } | ||
1149 | return kvm->arch.tss_addr; | ||
1150 | } | ||
1151 | |||
/*
 * Save a segment's protected-mode state into 'save' and replace it
 * with a vm86-compatible one: selector = base >> 4, 64K limit, and
 * 0xf3 access rights (present, DPL 3, writable data segment).
 */
static void fix_rmode_seg(int seg, struct kvm_save_segment *save)
{
	struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];

	save->selector = vmcs_read16(sf->selector);
	save->base = vmcs_readl(sf->base);
	save->limit = vmcs_read32(sf->limit);
	save->ar = vmcs_read32(sf->ar_bytes);
	vmcs_write16(sf->selector, save->base >> 4);
	vmcs_write32(sf->base, save->base & 0xfffff);
	vmcs_write32(sf->limit, 0xffff);
	vmcs_write32(sf->ar_bytes, 0xf3);
}
1165 | |||
/*
 * Enter emulated real mode: save the guest's protected-mode TR, flags
 * and data segments, then reshape everything into a vm86 environment
 * (VM+IOPL3 in RFLAGS, CR4.VME, 16-bit style segments) backed by the
 * real-mode TSS.
 */
static void enter_rmode(struct kvm_vcpu *vcpu)
{
	unsigned long flags;

	vcpu->arch.rmode.active = 1;

	vcpu->arch.rmode.tr.base = vmcs_readl(GUEST_TR_BASE);
	vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm));

	vcpu->arch.rmode.tr.limit = vmcs_read32(GUEST_TR_LIMIT);
	vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1);

	vcpu->arch.rmode.tr.ar = vmcs_read32(GUEST_TR_AR_BYTES);
	vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);

	/* Remember the guest's IOPL so enter_pmode() can restore it. */
	flags = vmcs_readl(GUEST_RFLAGS);
	vcpu->arch.rmode.save_iopl
		= (flags & X86_EFLAGS_IOPL) >> IOPL_SHIFT;

	flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;

	vmcs_writel(GUEST_RFLAGS, flags);
	vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME);
	update_exception_bitmap(vcpu);

	vmcs_write16(GUEST_SS_SELECTOR, vmcs_readl(GUEST_SS_BASE) >> 4);
	vmcs_write32(GUEST_SS_LIMIT, 0xffff);
	vmcs_write32(GUEST_SS_AR_BYTES, 0xf3);

	vmcs_write32(GUEST_CS_AR_BYTES, 0xf3);
	vmcs_write32(GUEST_CS_LIMIT, 0xffff);
	/* The reset CS base 0xffff0000 is not vm86-expressible; remap. */
	if (vmcs_readl(GUEST_CS_BASE) == 0xffff0000)
		vmcs_writel(GUEST_CS_BASE, 0xf0000);
	vmcs_write16(GUEST_CS_SELECTOR, vmcs_readl(GUEST_CS_BASE) >> 4);

	fix_rmode_seg(VCPU_SREG_ES, &vcpu->arch.rmode.es);
	fix_rmode_seg(VCPU_SREG_DS, &vcpu->arch.rmode.ds);
	fix_rmode_seg(VCPU_SREG_GS, &vcpu->arch.rmode.gs);
	fix_rmode_seg(VCPU_SREG_FS, &vcpu->arch.rmode.fs);

	kvm_mmu_reset_context(vcpu);
	init_rmode_tss(vcpu->kvm);
}
1209 | |||
#ifdef CONFIG_X86_64

/*
 * Enter long mode: fix up the TSS type if needed (64-bit mode only
 * allows the busy 64-bit TSS type), set EFER.LMA/LME, and enable
 * IA-32e mode on VM entry.
 */
static void enter_lmode(struct kvm_vcpu *vcpu)
{
	u32 guest_tr_ar;

	guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES);
	if ((guest_tr_ar & AR_TYPE_MASK) != AR_TYPE_BUSY_64_TSS) {
		printk(KERN_DEBUG "%s: tss fixup for long mode. \n",
		       __FUNCTION__);
		vmcs_write32(GUEST_TR_AR_BYTES,
			     (guest_tr_ar & ~AR_TYPE_MASK)
			     | AR_TYPE_BUSY_64_TSS);
	}

	vcpu->arch.shadow_efer |= EFER_LMA;

	find_msr_entry(to_vmx(vcpu), MSR_EFER)->data |= EFER_LMA | EFER_LME;
	vmcs_write32(VM_ENTRY_CONTROLS,
		     vmcs_read32(VM_ENTRY_CONTROLS)
		     | VM_ENTRY_IA32E_MODE);
}

/* Leave long mode: clear EFER.LMA and the IA-32e entry control. */
static void exit_lmode(struct kvm_vcpu *vcpu)
{
	vcpu->arch.shadow_efer &= ~EFER_LMA;

	vmcs_write32(VM_ENTRY_CONTROLS,
		     vmcs_read32(VM_ENTRY_CONTROLS)
		     & ~VM_ENTRY_IA32E_MODE);
}

#endif
1243 | |||
/*
 * Refresh the cached CR4: keep the guest-owned bits we track and pull
 * the host-owned bits from the hardware CR4 in the VMCS.
 */
static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
{
	vcpu->arch.cr4 &= KVM_GUEST_CR4_MASK;
	vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & ~KVM_GUEST_CR4_MASK;
}
1249 | |||
/*
 * Update the guest's CR0, handling the real/protected and long-mode
 * transitions it may imply, and keep the FPU ownership state in sync
 * with TS/PE.
 */
static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
	vmx_fpu_deactivate(vcpu);

	/* PE toggling moves us between emulated real and protected mode. */
	if (vcpu->arch.rmode.active && (cr0 & X86_CR0_PE))
		enter_pmode(vcpu);

	if (!vcpu->arch.rmode.active && !(cr0 & X86_CR0_PE))
		enter_rmode(vcpu);

#ifdef CONFIG_X86_64
	/* With EFER.LME set, enabling/disabling paging toggles long mode. */
	if (vcpu->arch.shadow_efer & EFER_LME) {
		if (!is_paging(vcpu) && (cr0 & X86_CR0_PG))
			enter_lmode(vcpu);
		if (is_paging(vcpu) && !(cr0 & X86_CR0_PG))
			exit_lmode(vcpu);
	}
#endif

	/* Guest sees cr0; hardware runs with the always-on bits forced. */
	vmcs_writel(CR0_READ_SHADOW, cr0);
	vmcs_writel(GUEST_CR0,
		    (cr0 & ~KVM_GUEST_CR0_MASK) | KVM_VM_CR0_ALWAYS_ON);
	vcpu->arch.cr0 = cr0;

	if (!(cr0 & X86_CR0_TS) || !(cr0 & X86_CR0_PE))
		vmx_fpu_activate(vcpu);
}
1277 | |||
/*
 * Load the guest's CR3.  In protected mode, also give up FPU
 * ownership so CR0.TS faults are re-armed.
 */
static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
{
	vmcs_writel(GUEST_CR3, cr3);
	if (vcpu->arch.cr0 & X86_CR0_PE)
		vmx_fpu_deactivate(vcpu);
}
1284 | |||
/*
 * Update the guest's CR4: the guest-visible value goes into the read
 * shadow, while the hardware CR4 additionally carries the always-on
 * bits required for the current (real or protected) mode.
 */
static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
	vmcs_writel(CR4_READ_SHADOW, cr4);
	vmcs_writel(GUEST_CR4, cr4 | (vcpu->arch.rmode.active ?
		    KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON));
	vcpu->arch.cr4 = cr4;
}
1292 | |||
#ifdef CONFIG_X86_64

/*
 * Update the guest's EFER shadow, toggle the IA-32e VM-entry control
 * to match LMA, and refresh the auto-switched MSR layout (the set of
 * MSRs to swap depends on long mode and EFER.SCE).
 */
static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct kvm_msr_entry *msr = find_msr_entry(vmx, MSR_EFER);

	vcpu->arch.shadow_efer = efer;
	if (efer & EFER_LMA) {
		vmcs_write32(VM_ENTRY_CONTROLS,
			     vmcs_read32(VM_ENTRY_CONTROLS) |
			     VM_ENTRY_IA32E_MODE);
		msr->data = efer;

	} else {
		vmcs_write32(VM_ENTRY_CONTROLS,
			     vmcs_read32(VM_ENTRY_CONTROLS) &
			     ~VM_ENTRY_IA32E_MODE);

		/* Outside long mode, don't let the guest keep LME set. */
		msr->data = efer & ~EFER_LME;
	}
	setup_msrs(vmx);
}

#endif
1318 | |||
/* Return the base address of the given guest segment from the VMCS. */
static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg)
{
	struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];

	return vmcs_readl(sf->base);
}
1325 | |||
/*
 * Read a guest segment register from the VMCS and unpack its access
 * rights into the individual kvm_segment fields.  An unusable segment
 * reports all-zero attributes.
 */
static void vmx_get_segment(struct kvm_vcpu *vcpu,
			    struct kvm_segment *var, int seg)
{
	struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
	u32 ar;

	var->base = vmcs_readl(sf->base);
	var->limit = vmcs_read32(sf->limit);
	var->selector = vmcs_read16(sf->selector);
	ar = vmcs_read32(sf->ar_bytes);
	if (ar & AR_UNUSABLE_MASK)
		ar = 0;
	var->type = ar & 15;
	var->s = (ar >> 4) & 1;
	var->dpl = (ar >> 5) & 3;
	var->present = (ar >> 7) & 1;
	var->avl = (ar >> 12) & 1;
	var->l = (ar >> 13) & 1;
	var->db = (ar >> 14) & 1;
	var->g = (ar >> 15) & 1;
	var->unusable = (ar >> 16) & 1;
}
1348 | |||
1349 | static u32 vmx_segment_access_rights(struct kvm_segment *var) | ||
1350 | { | ||
1351 | u32 ar; | ||
1352 | |||
1353 | if (var->unusable) | ||
1354 | ar = 1 << 16; | ||
1355 | else { | ||
1356 | ar = var->type & 15; | ||
1357 | ar |= (var->s & 1) << 4; | ||
1358 | ar |= (var->dpl & 3) << 5; | ||
1359 | ar |= (var->present & 1) << 7; | ||
1360 | ar |= (var->avl & 1) << 12; | ||
1361 | ar |= (var->l & 1) << 13; | ||
1362 | ar |= (var->db & 1) << 14; | ||
1363 | ar |= (var->g & 1) << 15; | ||
1364 | } | ||
1365 | if (ar == 0) /* a 0 value means unusable */ | ||
1366 | ar = AR_UNUSABLE_MASK; | ||
1367 | |||
1368 | return ar; | ||
1369 | } | ||
1370 | |||
/*
 * Write a guest segment register into the VMCS.  While emulating real
 * mode, TR updates are only recorded (the hardware TR must stay on the
 * real-mode TSS) and data segments are forced vm86-compatible.
 */
static void vmx_set_segment(struct kvm_vcpu *vcpu,
			    struct kvm_segment *var, int seg)
{
	struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
	u32 ar;

	if (vcpu->arch.rmode.active && seg == VCPU_SREG_TR) {
		vcpu->arch.rmode.tr.selector = var->selector;
		vcpu->arch.rmode.tr.base = var->base;
		vcpu->arch.rmode.tr.limit = var->limit;
		vcpu->arch.rmode.tr.ar = vmx_segment_access_rights(var);
		return;
	}
	vmcs_writel(sf->base, var->base);
	vmcs_write32(sf->limit, var->limit);
	vmcs_write16(sf->selector, var->selector);
	if (vcpu->arch.rmode.active && var->s) {
		/*
		 * Hack real-mode segments into vm86 compatibility.
		 */
		if (var->base == 0xffff0000 && var->selector == 0xf000)
			vmcs_writel(sf->base, 0xf0000);
		ar = 0xf3;
	} else
		ar = vmx_segment_access_rights(var);
	vmcs_write32(sf->ar_bytes, ar);
}
1398 | |||
/* Extract the D/B and L bits from the guest CS access rights. */
static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
{
	u32 ar = vmcs_read32(GUEST_CS_AR_BYTES);

	*db = (ar >> 14) & 1;
	*l = (ar >> 13) & 1;
}
1406 | |||
/* Read the guest IDTR from the VMCS. */
static void vmx_get_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
{
	dt->limit = vmcs_read32(GUEST_IDTR_LIMIT);
	dt->base = vmcs_readl(GUEST_IDTR_BASE);
}

/* Write the guest IDTR into the VMCS. */
static void vmx_set_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
{
	vmcs_write32(GUEST_IDTR_LIMIT, dt->limit);
	vmcs_writel(GUEST_IDTR_BASE, dt->base);
}

/* Read the guest GDTR from the VMCS. */
static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
{
	dt->limit = vmcs_read32(GUEST_GDTR_LIMIT);
	dt->base = vmcs_readl(GUEST_GDTR_BASE);
}

/* Write the guest GDTR into the VMCS. */
static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
{
	vmcs_write32(GUEST_GDTR_LIMIT, dt->limit);
	vmcs_writel(GUEST_GDTR_BASE, dt->base);
}
1430 | |||
/*
 * Initialize the guest pages backing the real-mode TSS: zero them,
 * point the TSS I/O-map base (offset 0x66) past the redirection map,
 * and terminate the I/O bitmap with an all-ones byte.
 *
 * Returns 1 on success, 0 on failure (note: NOT the usual 0/-errno
 * convention).
 */
static int init_rmode_tss(struct kvm *kvm)
{
	gfn_t fn = rmode_tss_base(kvm) >> PAGE_SHIFT;
	u16 data = 0;
	int r;

	r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
	if (r < 0)
		return 0;
	data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE;
	r = kvm_write_guest_page(kvm, fn++, &data, 0x66, sizeof(u16));
	if (r < 0)
		return 0;
	r = kvm_clear_guest_page(kvm, fn++, 0, PAGE_SIZE);
	if (r < 0)
		return 0;
	r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
	if (r < 0)
		return 0;
	data = ~0;
	r = kvm_write_guest_page(kvm, fn, &data, RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1,
			sizeof(u8));
	if (r < 0)
		return 0;
	return 1;
}
1457 | |||
/*
 * Reset a guest segment to the canonical power-on state: null
 * selector, zero base, 64K limit, present writable data segment.
 */
static void seg_setup(int seg)
{
	struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];

	vmcs_write16(sf->selector, 0);
	vmcs_writel(sf->base, 0);
	vmcs_write32(sf->limit, 0xffff);
	vmcs_write32(sf->ar_bytes, 0x93);
}
1467 | |||
/*
 * Create the one-page private memslot at the APIC MMIO address
 * (0xfee00000) used for virtualized APIC accesses, and cache its
 * struct page.  Idempotent: returns 0 immediately if the page already
 * exists.  Returns 0 on success or a negative errno from
 * __kvm_set_memory_region().  Serialized by kvm->lock.
 */
static int alloc_apic_access_page(struct kvm *kvm)
{
	struct kvm_userspace_memory_region kvm_userspace_mem;
	int r = 0;

	mutex_lock(&kvm->lock);
	if (kvm->arch.apic_access_page)
		goto out;
	kvm_userspace_mem.slot = APIC_ACCESS_PAGE_PRIVATE_MEMSLOT;
	kvm_userspace_mem.flags = 0;
	kvm_userspace_mem.guest_phys_addr = 0xfee00000ULL;
	kvm_userspace_mem.memory_size = PAGE_SIZE;
	r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, 0);
	if (r)
		goto out;
	kvm->arch.apic_access_page = gfn_to_page(kvm, 0xfee00);
out:
	mutex_unlock(&kvm->lock);
	return r;
}
1488 | |||
/*
 * Sets up the vmcs for emulated real mode.
 *
 * Programs the constant (per-vcpu-lifetime) parts of the VMCS: execution
 * controls, host state to restore on VM exit, and the auto-load MSR area.
 * Numeric comments refer to sections of the Intel manual.
 *
 * Returns 0 on success, -ENOMEM if the APIC access page cannot be set up.
 */
static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
{
	u32 host_sysenter_cs;
	u32 junk;
	unsigned long a;
	struct descriptor_table dt;
	int i;
	unsigned long kvm_vmx_return;
	u32 exec_control;

	/* I/O */
	vmcs_write64(IO_BITMAP_A, page_to_phys(vmx_io_bitmap_a));
	vmcs_write64(IO_BITMAP_B, page_to_phys(vmx_io_bitmap_b));

	vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */

	/* Control */
	vmcs_write32(PIN_BASED_VM_EXEC_CONTROL,
		vmcs_config.pin_based_exec_ctrl);

	exec_control = vmcs_config.cpu_based_exec_ctrl;
	if (!vm_need_tpr_shadow(vmx->vcpu.kvm)) {
		/* No TPR shadow available: trap CR8 accesses instead. */
		exec_control &= ~CPU_BASED_TPR_SHADOW;
#ifdef CONFIG_X86_64
		exec_control |= CPU_BASED_CR8_STORE_EXITING |
				CPU_BASED_CR8_LOAD_EXITING;
#endif
	}
	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control);

	if (cpu_has_secondary_exec_ctrls()) {
		exec_control = vmcs_config.cpu_based_2nd_exec_ctrl;
		if (!vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))
			exec_control &=
				~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
		vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
	}

	/* With bypass_guest_pf, no guest #PF causes a VM exit. */
	vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, !!bypass_guest_pf);
	vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, !!bypass_guest_pf);
	vmcs_write32(CR3_TARGET_COUNT, 0);           /* 22.2.1 */

	/* Host state restored on VM exit. */
	vmcs_writel(HOST_CR0, read_cr0());  /* 22.2.3 */
	vmcs_writel(HOST_CR4, read_cr4());  /* 22.2.3, 22.2.5 */
	vmcs_writel(HOST_CR3, read_cr3());  /* 22.2.3  FIXME: shadow tables */

	vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS);  /* 22.2.4 */
	vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
	vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
	vmcs_write16(HOST_FS_SELECTOR, read_fs());    /* 22.2.4 */
	vmcs_write16(HOST_GS_SELECTOR, read_gs());    /* 22.2.4 */
	vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
#ifdef CONFIG_X86_64
	rdmsrl(MSR_FS_BASE, a);
	vmcs_writel(HOST_FS_BASE, a); /* 22.2.4 */
	rdmsrl(MSR_GS_BASE, a);
	vmcs_writel(HOST_GS_BASE, a); /* 22.2.4 */
#else
	vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */
	vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */
#endif

	vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8);  /* 22.2.4 */

	get_idt(&dt);
	vmcs_writel(HOST_IDTR_BASE, dt.base);   /* 22.2.4 */

	/* Resume host execution at .Lkvm_vmx_return in vmx_vcpu_run(). */
	asm("mov $.Lkvm_vmx_return, %0" : "=r"(kvm_vmx_return));
	vmcs_writel(HOST_RIP, kvm_vmx_return); /* 22.2.5 */
	vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
	vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
	vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);

	rdmsr(MSR_IA32_SYSENTER_CS, host_sysenter_cs, junk);
	vmcs_write32(HOST_IA32_SYSENTER_CS, host_sysenter_cs);
	rdmsrl(MSR_IA32_SYSENTER_ESP, a);
	vmcs_writel(HOST_IA32_SYSENTER_ESP, a);   /* 22.2.3 */
	rdmsrl(MSR_IA32_SYSENTER_EIP, a);
	vmcs_writel(HOST_IA32_SYSENTER_EIP, a);   /* 22.2.3 */

	/*
	 * Seed the save/restore MSR area with the host's current values,
	 * keeping only MSRs that this CPU actually supports (probe with
	 * rdmsr_safe/wrmsr_safe).
	 */
	for (i = 0; i < NR_VMX_MSR; ++i) {
		u32 index = vmx_msr_index[i];
		u32 data_low, data_high;
		u64 data;
		int j = vmx->nmsrs;

		if (rdmsr_safe(index, &data_low, &data_high) < 0)
			continue;
		if (wrmsr_safe(index, data_low, data_high) < 0)
			continue;
		data = data_low | ((u64)data_high << 32);
		vmx->host_msrs[j].index = index;
		vmx->host_msrs[j].reserved = 0;
		vmx->host_msrs[j].data = data;
		vmx->guest_msrs[j] = vmx->host_msrs[j];
		++vmx->nmsrs;
	}

	vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl);

	/* 22.2.1, 20.8.1 */
	vmcs_write32(VM_ENTRY_CONTROLS, vmcs_config.vmentry_ctrl);

	/* All CR0 bits are owned by the host; CR4 per KVM_GUEST_CR4_MASK. */
	vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL);
	vmcs_writel(CR4_GUEST_HOST_MASK, KVM_GUEST_CR4_MASK);

	if (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))
		if (alloc_apic_access_page(vmx->vcpu.kvm) != 0)
			return -ENOMEM;

	return 0;
}
1604 | |||
/*
 * Put the vcpu into its power-on / INIT state: real mode at the reset
 * vector for the BSP, or at the SIPI vector for APs.  Also (re)writes the
 * guest-state area of the VMCS.
 *
 * Returns 0 on success, -ENOMEM if the real-mode TSS cannot be set up.
 */
static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	u64 msr;
	int ret;

	if (!init_rmode_tss(vmx->vcpu.kvm)) {
		ret = -ENOMEM;
		goto out;
	}

	vmx->vcpu.arch.rmode.active = 0;

	vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val();
	set_cr8(&vmx->vcpu, 0);
	/* APIC enabled at the architectural base; vcpu 0 is the BSP. */
	msr = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
	if (vmx->vcpu.vcpu_id == 0)
		msr |= MSR_IA32_APICBASE_BSP;
	kvm_set_apic_base(&vmx->vcpu, msr);

	fx_init(&vmx->vcpu);

	/*
	 * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode
	 * insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4.  Sigh.
	 */
	if (vmx->vcpu.vcpu_id == 0) {
		vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
		vmcs_writel(GUEST_CS_BASE, 0x000f0000);
	} else {
		/* APs start where the SIPI pointed them. */
		vmcs_write16(GUEST_CS_SELECTOR, vmx->vcpu.arch.sipi_vector << 8);
		vmcs_writel(GUEST_CS_BASE, vmx->vcpu.arch.sipi_vector << 12);
	}
	vmcs_write32(GUEST_CS_LIMIT, 0xffff);
	vmcs_write32(GUEST_CS_AR_BYTES, 0x9b);

	seg_setup(VCPU_SREG_DS);
	seg_setup(VCPU_SREG_ES);
	seg_setup(VCPU_SREG_FS);
	seg_setup(VCPU_SREG_GS);
	seg_setup(VCPU_SREG_SS);

	vmcs_write16(GUEST_TR_SELECTOR, 0);
	vmcs_writel(GUEST_TR_BASE, 0);
	vmcs_write32(GUEST_TR_LIMIT, 0xffff);
	vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);

	vmcs_write16(GUEST_LDTR_SELECTOR, 0);
	vmcs_writel(GUEST_LDTR_BASE, 0);
	vmcs_write32(GUEST_LDTR_LIMIT, 0xffff);
	vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082);

	vmcs_write32(GUEST_SYSENTER_CS, 0);
	vmcs_writel(GUEST_SYSENTER_ESP, 0);
	vmcs_writel(GUEST_SYSENTER_EIP, 0);

	vmcs_writel(GUEST_RFLAGS, 0x02);	/* only the reserved bit set */
	if (vmx->vcpu.vcpu_id == 0)
		vmcs_writel(GUEST_RIP, 0xfff0);	/* reset vector */
	else
		vmcs_writel(GUEST_RIP, 0);
	vmcs_writel(GUEST_RSP, 0);

	/* todo: dr0 = dr1 = dr2 = dr3 = 0; dr6 = 0xffff0ff0 */
	vmcs_writel(GUEST_DR7, 0x400);

	vmcs_writel(GUEST_GDTR_BASE, 0);
	vmcs_write32(GUEST_GDTR_LIMIT, 0xffff);

	vmcs_writel(GUEST_IDTR_BASE, 0);
	vmcs_write32(GUEST_IDTR_LIMIT, 0xffff);

	vmcs_write32(GUEST_ACTIVITY_STATE, 0);
	vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
	vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0);

	guest_write_tsc(0);

	/* Special registers */
	vmcs_write64(GUEST_IA32_DEBUGCTL, 0);

	setup_msrs(vmx);

	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);  /* 22.2.1 */

	if (cpu_has_vmx_tpr_shadow()) {
		vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
		if (vm_need_tpr_shadow(vmx->vcpu.kvm))
			vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
				page_to_phys(vmx->vcpu.arch.apic->regs_page));
		vmcs_write32(TPR_THRESHOLD, 0);
	}

	if (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))
		vmcs_write64(APIC_ACCESS_ADDR,
			     page_to_phys(vmx->vcpu.kvm->arch.apic_access_page));

	vmx->vcpu.arch.cr0 = 0x60000010;	/* architectural CR0 reset value */
	vmx_set_cr0(&vmx->vcpu, vmx->vcpu.arch.cr0); /* enter rmode */
	vmx_set_cr4(&vmx->vcpu, 0);
#ifdef CONFIG_X86_64
	vmx_set_efer(&vmx->vcpu, 0);
#endif
	vmx_fpu_activate(&vmx->vcpu);
	update_exception_bitmap(&vmx->vcpu);

	return 0;

out:
	return ret;
}
1716 | |||
/*
 * Inject external interrupt @irq into the guest.
 *
 * In emulated real mode the CPU cannot inject a hardware interrupt
 * directly, so we inject it as a *software* interrupt and back RIP up by
 * one byte, as if an "int $n" instruction (length 1) were being executed;
 * fixup_rmode_irq() repairs state if the injection does not complete.
 */
static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	if (vcpu->arch.rmode.active) {
		vmx->rmode.irq.pending = true;
		vmx->rmode.irq.vector = irq;
		vmx->rmode.irq.rip = vmcs_readl(GUEST_RIP);
		vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
			     irq | INTR_TYPE_SOFT_INTR | INTR_INFO_VALID_MASK);
		/* pretend a 1-byte "int" instruction is being executed */
		vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1);
		vmcs_writel(GUEST_RIP, vmx->rmode.irq.rip - 1);
		return;
	}
	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
			irq | INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK);
}
1734 | |||
1735 | static void kvm_do_inject_irq(struct kvm_vcpu *vcpu) | ||
1736 | { | ||
1737 | int word_index = __ffs(vcpu->arch.irq_summary); | ||
1738 | int bit_index = __ffs(vcpu->arch.irq_pending[word_index]); | ||
1739 | int irq = word_index * BITS_PER_LONG + bit_index; | ||
1740 | |||
1741 | clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]); | ||
1742 | if (!vcpu->arch.irq_pending[word_index]) | ||
1743 | clear_bit(word_index, &vcpu->arch.irq_summary); | ||
1744 | vmx_inject_irq(vcpu, irq); | ||
1745 | } | ||
1746 | |||
1747 | |||
/*
 * Userspace-irqchip interrupt delivery: inject a pending irq if the guest
 * can take one now, otherwise arm the interrupt-window exit so we get
 * called back as soon as it can.
 */
static void do_interrupt_requests(struct kvm_vcpu *vcpu,
				       struct kvm_run *kvm_run)
{
	u32 cpu_based_vm_exec_control;

	/* Open iff IF is set and no sti/mov-ss interruptibility blocking. */
	vcpu->arch.interrupt_window_open =
		((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
		 (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0);

	if (vcpu->arch.interrupt_window_open &&
	    vcpu->arch.irq_summary &&
	    !(vmcs_read32(VM_ENTRY_INTR_INFO_FIELD) & INTR_INFO_VALID_MASK))
		/*
		 * If interrupts enabled, and not blocked by sti or mov ss. Good.
		 */
		kvm_do_inject_irq(vcpu);

	cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
	if (!vcpu->arch.interrupt_window_open &&
	    (vcpu->arch.irq_summary || kvm_run->request_interrupt_window))
		/*
		 * Interrupts blocked.  Wait for unblock.
		 */
		cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING;
	else
		cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
}
1776 | |||
1777 | static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr) | ||
1778 | { | ||
1779 | int ret; | ||
1780 | struct kvm_userspace_memory_region tss_mem = { | ||
1781 | .slot = 8, | ||
1782 | .guest_phys_addr = addr, | ||
1783 | .memory_size = PAGE_SIZE * 3, | ||
1784 | .flags = 0, | ||
1785 | }; | ||
1786 | |||
1787 | ret = kvm_set_memory_region(kvm, &tss_mem, 0); | ||
1788 | if (ret) | ||
1789 | return ret; | ||
1790 | kvm->arch.tss_addr = addr; | ||
1791 | return 0; | ||
1792 | } | ||
1793 | |||
/*
 * Arm host-side guest debugging before entry: load the requested
 * hardware breakpoints into dr0..dr3 and, for single-stepping, set
 * TF (trap after one instruction) and RF (don't re-trigger on resume)
 * in the guest's RFLAGS.
 */
static void kvm_guest_debug_pre(struct kvm_vcpu *vcpu)
{
	struct kvm_guest_debug *dbg = &vcpu->guest_debug;

	/* set_debugreg() takes a literal register number (macro). */
	set_debugreg(dbg->bp[0], 0);
	set_debugreg(dbg->bp[1], 1);
	set_debugreg(dbg->bp[2], 2);
	set_debugreg(dbg->bp[3], 3);

	if (dbg->singlestep) {
		unsigned long flags;

		flags = vmcs_readl(GUEST_RFLAGS);
		flags |= X86_EFLAGS_TF | X86_EFLAGS_RF;
		vmcs_writel(GUEST_RFLAGS, flags);
	}
}
1811 | |||
1812 | static int handle_rmode_exception(struct kvm_vcpu *vcpu, | ||
1813 | int vec, u32 err_code) | ||
1814 | { | ||
1815 | if (!vcpu->arch.rmode.active) | ||
1816 | return 0; | ||
1817 | |||
1818 | /* | ||
1819 | * Instruction with address size override prefix opcode 0x67 | ||
1820 | * Cause the #SS fault with 0 error code in VM86 mode. | ||
1821 | */ | ||
1822 | if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) | ||
1823 | if (emulate_instruction(vcpu, NULL, 0, 0, 0) == EMULATE_DONE) | ||
1824 | return 1; | ||
1825 | return 0; | ||
1826 | } | ||
1827 | |||
/*
 * VM-exit handler for exceptions and NMIs.  Dispatches on the vector:
 * NMIs were already handled on the entry path, #NM re-activates the FPU,
 * #UD goes through the emulator, #PF to the MMU, real-mode faults to the
 * emulator, and anything left is reported to userspace.
 *
 * Returns 1 to resume the guest, 0 to exit to userspace.
 */
static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	u32 intr_info, error_code;
	unsigned long cr2, rip;
	u32 vect_info;
	enum emulation_result er;

	vect_info = vmx->idt_vectoring_info;
	intr_info = vmcs_read32(VM_EXIT_INTR_INFO);

	/* Vectoring through the IDT should only fault on a #PF here. */
	if ((vect_info & VECTORING_INFO_VALID_MASK) &&
						!is_page_fault(intr_info))
		printk(KERN_ERR "%s: unexpected, vectoring info 0x%x "
		       "intr info 0x%x\n", __FUNCTION__, vect_info, intr_info);

	/* Requeue an external irq whose delivery was interrupted. */
	if (!irqchip_in_kernel(vcpu->kvm) && is_external_interrupt(vect_info)) {
		int irq = vect_info & VECTORING_INFO_VECTOR_MASK;
		set_bit(irq, vcpu->arch.irq_pending);
		set_bit(irq / BITS_PER_LONG, &vcpu->arch.irq_summary);
	}

	if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200) /* nmi */
		return 1;  /* already handled by vmx_vcpu_run() */

	if (is_no_device(intr_info)) {
		/* #NM: guest touched the FPU; give it back. */
		vmx_fpu_activate(vcpu);
		return 1;
	}

	if (is_invalid_opcode(intr_info)) {
		/* #UD: maybe an instruction we emulate (e.g. hypercall). */
		er = emulate_instruction(vcpu, kvm_run, 0, 0, 0);
		if (er != EMULATE_DONE)
			kvm_queue_exception(vcpu, UD_VECTOR);
		return 1;
	}

	error_code = 0;
	rip = vmcs_readl(GUEST_RIP);
	if (intr_info & INTR_INFO_DELIEVER_CODE_MASK)
		error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
	if (is_page_fault(intr_info)) {
		/* faulting address is in the exit qualification */
		cr2 = vmcs_readl(EXIT_QUALIFICATION);
		return kvm_mmu_page_fault(vcpu, cr2, error_code);
	}

	if (vcpu->arch.rmode.active &&
	    handle_rmode_exception(vcpu, intr_info & INTR_INFO_VECTOR_MASK,
								error_code)) {
		if (vcpu->arch.halt_request) {
			vcpu->arch.halt_request = 0;
			return kvm_emulate_halt(vcpu);
		}
		return 1;
	}

	/* #DB (vector 1) exception: report as a debug exit. */
	if ((intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK)) ==
	    (INTR_TYPE_EXCEPTION | 1)) {
		kvm_run->exit_reason = KVM_EXIT_DEBUG;
		return 0;
	}
	kvm_run->exit_reason = KVM_EXIT_EXCEPTION;
	kvm_run->ex.exception = intr_info & INTR_INFO_VECTOR_MASK;
	kvm_run->ex.error_code = error_code;
	return 0;
}
1894 | |||
1895 | static int handle_external_interrupt(struct kvm_vcpu *vcpu, | ||
1896 | struct kvm_run *kvm_run) | ||
1897 | { | ||
1898 | ++vcpu->stat.irq_exits; | ||
1899 | return 1; | ||
1900 | } | ||
1901 | |||
1902 | static int handle_triple_fault(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | ||
1903 | { | ||
1904 | kvm_run->exit_reason = KVM_EXIT_SHUTDOWN; | ||
1905 | return 0; | ||
1906 | } | ||
1907 | |||
1908 | static int handle_io(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | ||
1909 | { | ||
1910 | unsigned long exit_qualification; | ||
1911 | int size, down, in, string, rep; | ||
1912 | unsigned port; | ||
1913 | |||
1914 | ++vcpu->stat.io_exits; | ||
1915 | exit_qualification = vmcs_readl(EXIT_QUALIFICATION); | ||
1916 | string = (exit_qualification & 16) != 0; | ||
1917 | |||
1918 | if (string) { | ||
1919 | if (emulate_instruction(vcpu, | ||
1920 | kvm_run, 0, 0, 0) == EMULATE_DO_MMIO) | ||
1921 | return 0; | ||
1922 | return 1; | ||
1923 | } | ||
1924 | |||
1925 | size = (exit_qualification & 7) + 1; | ||
1926 | in = (exit_qualification & 8) != 0; | ||
1927 | down = (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_DF) != 0; | ||
1928 | rep = (exit_qualification & 32) != 0; | ||
1929 | port = exit_qualification >> 16; | ||
1930 | |||
1931 | return kvm_emulate_pio(vcpu, kvm_run, in, size, port); | ||
1932 | } | ||
1933 | |||
/*
 * Rewrite the guest's hypercall site with the VMX-native VMCALL
 * instruction (encoding 0f 01 c1).
 */
static void
vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
{
	hypercall[0] = 0x0f;	/* two-byte opcode escape */
	hypercall[1] = 0x01;
	hypercall[2] = 0xc1;	/* /1 c1 => VMCALL */
}
1944 | |||
/*
 * VM-exit handler for control-register accesses.  The exit qualification
 * encodes the CR number (bits 3:0), the access type (bits 5:4) and the
 * general-purpose register involved (bits 11:8).
 *
 * Returns 1 to resume the guest, 0 to exit to userspace.
 */
static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
	unsigned long exit_qualification;
	int cr;
	int reg;

	exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
	cr = exit_qualification & 15;
	reg = (exit_qualification >> 8) & 15;
	switch ((exit_qualification >> 4) & 3) {
	case 0: /* mov to cr */
		switch (cr) {
		case 0:
			vcpu_load_rsp_rip(vcpu);
			set_cr0(vcpu, vcpu->arch.regs[reg]);
			skip_emulated_instruction(vcpu);
			return 1;
		case 3:
			vcpu_load_rsp_rip(vcpu);
			set_cr3(vcpu, vcpu->arch.regs[reg]);
			skip_emulated_instruction(vcpu);
			return 1;
		case 4:
			vcpu_load_rsp_rip(vcpu);
			set_cr4(vcpu, vcpu->arch.regs[reg]);
			skip_emulated_instruction(vcpu);
			return 1;
		case 8:
			vcpu_load_rsp_rip(vcpu);
			set_cr8(vcpu, vcpu->arch.regs[reg]);
			skip_emulated_instruction(vcpu);
			/* With a userspace irqchip, TPR lives there. */
			if (irqchip_in_kernel(vcpu->kvm))
				return 1;
			kvm_run->exit_reason = KVM_EXIT_SET_TPR;
			return 0;
		};
		break;
	case 2: /* clts */
		vcpu_load_rsp_rip(vcpu);
		vmx_fpu_deactivate(vcpu);
		vcpu->arch.cr0 &= ~X86_CR0_TS;
		vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0);
		vmx_fpu_activate(vcpu);
		skip_emulated_instruction(vcpu);
		return 1;
	case 1: /*mov from cr*/
		switch (cr) {
		case 3:
			vcpu_load_rsp_rip(vcpu);
			vcpu->arch.regs[reg] = vcpu->arch.cr3;
			vcpu_put_rsp_rip(vcpu);
			skip_emulated_instruction(vcpu);
			return 1;
		case 8:
			vcpu_load_rsp_rip(vcpu);
			vcpu->arch.regs[reg] = get_cr8(vcpu);
			vcpu_put_rsp_rip(vcpu);
			skip_emulated_instruction(vcpu);
			return 1;
		}
		break;
	case 3: /* lmsw */
		lmsw(vcpu, (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f);

		skip_emulated_instruction(vcpu);
		return 1;
	default:
		break;
	}
	/* Anything else (e.g. mov from cr0/cr4) is unexpected here. */
	kvm_run->exit_reason = 0;
	pr_unimpl(vcpu, "unhandled control register: op %d cr %d\n",
		       (int)(exit_qualification >> 4) & 3, cr);
	return 0;
}
2019 | |||
2020 | static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | ||
2021 | { | ||
2022 | unsigned long exit_qualification; | ||
2023 | unsigned long val; | ||
2024 | int dr, reg; | ||
2025 | |||
2026 | /* | ||
2027 | * FIXME: this code assumes the host is debugging the guest. | ||
2028 | * need to deal with guest debugging itself too. | ||
2029 | */ | ||
2030 | exit_qualification = vmcs_readl(EXIT_QUALIFICATION); | ||
2031 | dr = exit_qualification & 7; | ||
2032 | reg = (exit_qualification >> 8) & 15; | ||
2033 | vcpu_load_rsp_rip(vcpu); | ||
2034 | if (exit_qualification & 16) { | ||
2035 | /* mov from dr */ | ||
2036 | switch (dr) { | ||
2037 | case 6: | ||
2038 | val = 0xffff0ff0; | ||
2039 | break; | ||
2040 | case 7: | ||
2041 | val = 0x400; | ||
2042 | break; | ||
2043 | default: | ||
2044 | val = 0; | ||
2045 | } | ||
2046 | vcpu->arch.regs[reg] = val; | ||
2047 | } else { | ||
2048 | /* mov to dr */ | ||
2049 | } | ||
2050 | vcpu_put_rsp_rip(vcpu); | ||
2051 | skip_emulated_instruction(vcpu); | ||
2052 | return 1; | ||
2053 | } | ||
2054 | |||
/* CPUID always exits under VMX; defer to the common emulation. */
static int handle_cpuid(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
	kvm_emulate_cpuid(vcpu);
	return 1;
}
2060 | |||
2061 | static int handle_rdmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | ||
2062 | { | ||
2063 | u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX]; | ||
2064 | u64 data; | ||
2065 | |||
2066 | if (vmx_get_msr(vcpu, ecx, &data)) { | ||
2067 | kvm_inject_gp(vcpu, 0); | ||
2068 | return 1; | ||
2069 | } | ||
2070 | |||
2071 | /* FIXME: handling of bits 32:63 of rax, rdx */ | ||
2072 | vcpu->arch.regs[VCPU_REGS_RAX] = data & -1u; | ||
2073 | vcpu->arch.regs[VCPU_REGS_RDX] = (data >> 32) & -1u; | ||
2074 | skip_emulated_instruction(vcpu); | ||
2075 | return 1; | ||
2076 | } | ||
2077 | |||
2078 | static int handle_wrmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | ||
2079 | { | ||
2080 | u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX]; | ||
2081 | u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u) | ||
2082 | | ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32); | ||
2083 | |||
2084 | if (vmx_set_msr(vcpu, ecx, data) != 0) { | ||
2085 | kvm_inject_gp(vcpu, 0); | ||
2086 | return 1; | ||
2087 | } | ||
2088 | |||
2089 | skip_emulated_instruction(vcpu); | ||
2090 | return 1; | ||
2091 | } | ||
2092 | |||
/*
 * TPR fell below the threshold we programmed: nothing to do directly —
 * resuming the guest re-runs interrupt injection with the new TPR.
 */
static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu,
				      struct kvm_run *kvm_run)
{
	return 1;
}
2098 | |||
2099 | static int handle_interrupt_window(struct kvm_vcpu *vcpu, | ||
2100 | struct kvm_run *kvm_run) | ||
2101 | { | ||
2102 | u32 cpu_based_vm_exec_control; | ||
2103 | |||
2104 | /* clear pending irq */ | ||
2105 | cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); | ||
2106 | cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING; | ||
2107 | vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); | ||
2108 | /* | ||
2109 | * If the user space waits to inject interrupts, exit as soon as | ||
2110 | * possible | ||
2111 | */ | ||
2112 | if (kvm_run->request_interrupt_window && | ||
2113 | !vcpu->arch.irq_summary) { | ||
2114 | kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; | ||
2115 | ++vcpu->stat.irq_window_exits; | ||
2116 | return 0; | ||
2117 | } | ||
2118 | return 1; | ||
2119 | } | ||
2120 | |||
/* HLT: advance past the instruction, then emulate the halt state. */
static int handle_halt(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
	skip_emulated_instruction(vcpu);
	return kvm_emulate_halt(vcpu);
}
2126 | |||
/* VMCALL: advance past the instruction and run the hypercall. */
static int handle_vmcall(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
	skip_emulated_instruction(vcpu);
	kvm_emulate_hypercall(vcpu);
	return 1;
}
2133 | |||
/* WBINVD: skip it; harmless while no device is passed through. */
static int handle_wbinvd(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
	skip_emulated_instruction(vcpu);
	/* TODO: Add support for VT-d/pass-through device */
	return 1;
}
2140 | |||
2141 | static int handle_apic_access(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | ||
2142 | { | ||
2143 | u64 exit_qualification; | ||
2144 | enum emulation_result er; | ||
2145 | unsigned long offset; | ||
2146 | |||
2147 | exit_qualification = vmcs_read64(EXIT_QUALIFICATION); | ||
2148 | offset = exit_qualification & 0xffful; | ||
2149 | |||
2150 | er = emulate_instruction(vcpu, kvm_run, 0, 0, 0); | ||
2151 | |||
2152 | if (er != EMULATE_DONE) { | ||
2153 | printk(KERN_ERR | ||
2154 | "Fail to handle apic access vmexit! Offset is 0x%lx\n", | ||
2155 | offset); | ||
2156 | return -ENOTSUPP; | ||
2157 | } | ||
2158 | return 1; | ||
2159 | } | ||
2160 | |||
/*
 * The exit handlers return 1 if the exit was handled fully and guest
 * execution may resume.  Otherwise they set the kvm_run parameter to
 * indicate what needs to be done to userspace and return 0.
 *
 * Table is indexed directly by the basic exit reason; unlisted reasons
 * leave a NULL entry and are reported as KVM_EXIT_UNKNOWN by
 * kvm_handle_exit().
 */
static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu,
				      struct kvm_run *kvm_run) = {
	[EXIT_REASON_EXCEPTION_NMI]           = handle_exception,
	[EXIT_REASON_EXTERNAL_INTERRUPT]      = handle_external_interrupt,
	[EXIT_REASON_TRIPLE_FAULT]            = handle_triple_fault,
	[EXIT_REASON_IO_INSTRUCTION]          = handle_io,
	[EXIT_REASON_CR_ACCESS]               = handle_cr,
	[EXIT_REASON_DR_ACCESS]               = handle_dr,
	[EXIT_REASON_CPUID]                   = handle_cpuid,
	[EXIT_REASON_MSR_READ]                = handle_rdmsr,
	[EXIT_REASON_MSR_WRITE]               = handle_wrmsr,
	[EXIT_REASON_PENDING_INTERRUPT]       = handle_interrupt_window,
	[EXIT_REASON_HLT]                     = handle_halt,
	[EXIT_REASON_VMCALL]                  = handle_vmcall,
	[EXIT_REASON_TPR_BELOW_THRESHOLD]     = handle_tpr_below_threshold,
	[EXIT_REASON_APIC_ACCESS]             = handle_apic_access,
	[EXIT_REASON_WBINVD]                  = handle_wbinvd,
};

/* Number of slots in the table (largest designated index + 1). */
static const int kvm_vmx_max_exit_handlers =
	ARRAY_SIZE(kvm_vmx_exit_handlers);
2187 | |||
/*
 * The guest has exited.  See if we can fix it or if we need userspace
 * assistance.
 *
 * Returns 1 to re-enter the guest, 0 to return to userspace (with
 * kvm_run->exit_reason filled in).
 */
static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
{
	u32 exit_reason = vmcs_read32(VM_EXIT_REASON);
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	u32 vectoring_info = vmx->idt_vectoring_info;

	/* vmlaunch/vmresume itself failed: report the VMX error code. */
	if (unlikely(vmx->fail)) {
		kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
		kvm_run->fail_entry.hardware_entry_failure_reason
			= vmcs_read32(VM_INSTRUCTION_ERROR);
		return 0;
	}

	/* An interrupted event delivery should only surface as #PF/NMI. */
	if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
				exit_reason != EXIT_REASON_EXCEPTION_NMI)
		printk(KERN_WARNING "%s: unexpected, valid vectoring info and "
		       "exit reason is 0x%x\n", __FUNCTION__, exit_reason);
	if (exit_reason < kvm_vmx_max_exit_handlers
	    && kvm_vmx_exit_handlers[exit_reason])
		return kvm_vmx_exit_handlers[exit_reason](vcpu, kvm_run);
	else {
		/* No handler: punt the raw reason to userspace. */
		kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
		kvm_run->hw.hardware_exit_reason = exit_reason;
	}
	return 0;
}
2218 | |||
static void vmx_flush_tlb(struct kvm_vcpu *vcpu)
{
	/*
	 * Intentionally empty.  NOTE(review): presumably no explicit flush
	 * is needed because guest-related TLB entries are flushed on
	 * VM entry/exit on this (pre-VPID/EPT) hardware — TODO confirm.
	 */
}
2222 | |||
2223 | static void update_tpr_threshold(struct kvm_vcpu *vcpu) | ||
2224 | { | ||
2225 | int max_irr, tpr; | ||
2226 | |||
2227 | if (!vm_need_tpr_shadow(vcpu->kvm)) | ||
2228 | return; | ||
2229 | |||
2230 | if (!kvm_lapic_enabled(vcpu) || | ||
2231 | ((max_irr = kvm_lapic_find_highest_irr(vcpu)) == -1)) { | ||
2232 | vmcs_write32(TPR_THRESHOLD, 0); | ||
2233 | return; | ||
2234 | } | ||
2235 | |||
2236 | tpr = (kvm_lapic_get_cr8(vcpu) & 0x0f) << 4; | ||
2237 | vmcs_write32(TPR_THRESHOLD, (max_irr > tpr) ? tpr >> 4 : max_irr >> 4); | ||
2238 | } | ||
2239 | |||
2240 | static void enable_irq_window(struct kvm_vcpu *vcpu) | ||
2241 | { | ||
2242 | u32 cpu_based_vm_exec_control; | ||
2243 | |||
2244 | cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); | ||
2245 | cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING; | ||
2246 | vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); | ||
2247 | } | ||
2248 | |||
/*
 * In-kernel-irqchip injection path, run before each VM entry:
 * re-deliver any event whose injection was interrupted by this exit
 * (idt_vectoring_info), then inject a new external interrupt if one is
 * pending and the guest can take it, otherwise arm the irq window.
 */
static void vmx_intr_assist(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	u32 idtv_info_field, intr_info_field;
	int has_ext_irq, interrupt_window_open;
	int vector;

	update_tpr_threshold(vcpu);

	has_ext_irq = kvm_cpu_has_interrupt(vcpu);
	intr_info_field = vmcs_read32(VM_ENTRY_INTR_INFO_FIELD);
	idtv_info_field = vmx->idt_vectoring_info;
	if (intr_info_field & INTR_INFO_VALID_MASK) {
		/* An injection is already queued for this entry. */
		if (idtv_info_field & INTR_INFO_VALID_MASK) {
			/* TODO: fault when IDT_Vectoring */
			if (printk_ratelimit())
				printk(KERN_ERR "Fault when IDT_Vectoring\n");
		}
		if (has_ext_irq)
			enable_irq_window(vcpu);
		return;
	}
	if (unlikely(idtv_info_field & INTR_INFO_VALID_MASK)) {
		/* Re-inject the event that was cut short by this exit. */
		if ((idtv_info_field & VECTORING_INFO_TYPE_MASK)
		    == INTR_TYPE_EXT_INTR
		    && vcpu->arch.rmode.active) {
			/* real mode needs the soft-injection path */
			u8 vect = idtv_info_field & VECTORING_INFO_VECTOR_MASK;

			vmx_inject_irq(vcpu, vect);
			if (unlikely(has_ext_irq))
				enable_irq_window(vcpu);
			return;
		}

		vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, idtv_info_field);
		vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
				vmcs_read32(VM_EXIT_INSTRUCTION_LEN));

		if (unlikely(idtv_info_field & INTR_INFO_DELIEVER_CODE_MASK))
			vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
				vmcs_read32(IDT_VECTORING_ERROR_CODE));
		if (unlikely(has_ext_irq))
			enable_irq_window(vcpu);
		return;
	}
	if (!has_ext_irq)
		return;
	/* Open iff IF set and no sti/mov-ss interruptibility blocking. */
	interrupt_window_open =
		((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
		 (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0);
	if (interrupt_window_open) {
		vector = kvm_cpu_get_interrupt(vcpu);
		vmx_inject_irq(vcpu, vector);
		kvm_timer_intr_post(vcpu, vector);
	} else
		enable_irq_window(vcpu);
}
2306 | |||
/*
 * Failure to inject an interrupt should give us the information
 * in IDT_VECTORING_INFO_FIELD.  However, if the failure occurs
 * when fetching the interrupt redirection bitmap in the real-mode
 * tss, this doesn't happen.  So we do it ourselves.
 *
 * Undo vmx_inject_irq()'s RIP-1 trick and synthesize the vectoring
 * info (as an external interrupt) so the irq gets re-injected on the
 * next entry.
 */
static void fixup_rmode_irq(struct vcpu_vmx *vmx)
{
	vmx->rmode.irq.pending = 0;
	/* If RIP moved past the faked "int" byte, injection completed. */
	if (vmcs_readl(GUEST_RIP) + 1 != vmx->rmode.irq.rip)
		return;
	vmcs_writel(GUEST_RIP, vmx->rmode.irq.rip);
	if (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK) {
		/* Hardware did report it; just flip soft -> external. */
		vmx->idt_vectoring_info &= ~VECTORING_INFO_TYPE_MASK;
		vmx->idt_vectoring_info |= INTR_TYPE_EXT_INTR;
		return;
	}
	vmx->idt_vectoring_info =
		VECTORING_INFO_VALID_MASK
		| INTR_TYPE_EXT_INTR
		| vmx->rmode.irq.vector;
}
2329 | |||
/*
 * Enter guest mode and run until the next vmexit.
 *
 * Saves the host's callee-visible state on the stack, loads the guest's
 * general-purpose registers from vcpu->arch.regs, executes VMLAUNCH on
 * the first run (vmx->launched == 0) or VMRESUME afterwards, then on
 * vmexit stores the guest registers back and restores the host's.
 *
 * NOTE(review): the whole world switch is one asm statement so the
 * compiler cannot schedule memory accesses between the guest-register
 * loads and the VM entry — confirm before restructuring.
 */
static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	u32 intr_info;

	/*
	 * Loading guest fpu may have cleared host cr0.ts
	 */
	vmcs_writel(HOST_CR0, read_cr0());

	asm(
		/* Store host registers */
#ifdef CONFIG_X86_64
		"push %%rdx; push %%rbp;"
		"push %%rcx \n\t"
#else
		"push %%edx; push %%ebp;"
		"push %%ecx \n\t"
#endif
		/* %edx/%rdx holds the HOST_RSP field encoding; record the
		 * current stack pointer in the VMCS so the host resumes here. */
		ASM_VMX_VMWRITE_RSP_RDX "\n\t"
		/* Check if vmlaunch or vmresume is needed */
		"cmpl $0, %c[launched](%0) \n\t"
		/* Load guest registers. Don't clobber flags. */
#ifdef CONFIG_X86_64
		"mov %c[cr2](%0), %%rax \n\t"
		"mov %%rax, %%cr2 \n\t"
		"mov %c[rax](%0), %%rax \n\t"
		"mov %c[rbx](%0), %%rbx \n\t"
		"mov %c[rdx](%0), %%rdx \n\t"
		"mov %c[rsi](%0), %%rsi \n\t"
		"mov %c[rdi](%0), %%rdi \n\t"
		"mov %c[rbp](%0), %%rbp \n\t"
		"mov %c[r8](%0), %%r8 \n\t"
		"mov %c[r9](%0), %%r9 \n\t"
		"mov %c[r10](%0), %%r10 \n\t"
		"mov %c[r11](%0), %%r11 \n\t"
		"mov %c[r12](%0), %%r12 \n\t"
		"mov %c[r13](%0), %%r13 \n\t"
		"mov %c[r14](%0), %%r14 \n\t"
		"mov %c[r15](%0), %%r15 \n\t"
		"mov %c[rcx](%0), %%rcx \n\t" /* kills %0 (rcx) */
#else
		"mov %c[cr2](%0), %%eax \n\t"
		"mov %%eax, %%cr2 \n\t"
		"mov %c[rax](%0), %%eax \n\t"
		"mov %c[rbx](%0), %%ebx \n\t"
		"mov %c[rdx](%0), %%edx \n\t"
		"mov %c[rsi](%0), %%esi \n\t"
		"mov %c[rdi](%0), %%edi \n\t"
		"mov %c[rbp](%0), %%ebp \n\t"
		"mov %c[rcx](%0), %%ecx \n\t" /* kills %0 (ecx) */
#endif
		/* Enter guest mode */
		"jne .Llaunched \n\t"
		ASM_VMX_VMLAUNCH "\n\t"
		"jmp .Lkvm_vmx_return \n\t"
		".Llaunched: " ASM_VMX_VMRESUME "\n\t"
		".Lkvm_vmx_return: "
		/* Save guest registers, load host registers, keep flags */
#ifdef CONFIG_X86_64
		/* Recover %0 (pushed as rcx above) without using any
		 * general register that still holds guest state. */
		"xchg %0, (%%rsp) \n\t"
		"mov %%rax, %c[rax](%0) \n\t"
		"mov %%rbx, %c[rbx](%0) \n\t"
		/* Guest rcx is on the stack (swapped there by the xchg). */
		"pushq (%%rsp); popq %c[rcx](%0) \n\t"
		"mov %%rdx, %c[rdx](%0) \n\t"
		"mov %%rsi, %c[rsi](%0) \n\t"
		"mov %%rdi, %c[rdi](%0) \n\t"
		"mov %%rbp, %c[rbp](%0) \n\t"
		"mov %%r8, %c[r8](%0) \n\t"
		"mov %%r9, %c[r9](%0) \n\t"
		"mov %%r10, %c[r10](%0) \n\t"
		"mov %%r11, %c[r11](%0) \n\t"
		"mov %%r12, %c[r12](%0) \n\t"
		"mov %%r13, %c[r13](%0) \n\t"
		"mov %%r14, %c[r14](%0) \n\t"
		"mov %%r15, %c[r15](%0) \n\t"
		"mov %%cr2, %%rax   \n\t"
		"mov %%rax, %c[cr2](%0) \n\t"

		/* First pop discards the xchg'd slot; then restore rbp, rdx. */
		"pop  %%rbp; pop  %%rbp; pop  %%rdx \n\t"
#else
		"xchg %0, (%%esp) \n\t"
		"mov %%eax, %c[rax](%0) \n\t"
		"mov %%ebx, %c[rbx](%0) \n\t"
		"pushl (%%esp); popl %c[rcx](%0) \n\t"
		"mov %%edx, %c[rdx](%0) \n\t"
		"mov %%esi, %c[rsi](%0) \n\t"
		"mov %%edi, %c[rdi](%0) \n\t"
		"mov %%ebp, %c[rbp](%0) \n\t"
		"mov %%cr2, %%eax  \n\t"
		"mov %%eax, %c[cr2](%0) \n\t"

		"pop %%ebp; pop %%ebp; pop %%edx \n\t"
#endif
		/* setbe: CF or ZF set means VMLAUNCH/VMRESUME failed. */
		"setbe %c[fail](%0) \n\t"
	      : : "c"(vmx), "d"((unsigned long)HOST_RSP),
		[launched]"i"(offsetof(struct vcpu_vmx, launched)),
		[fail]"i"(offsetof(struct vcpu_vmx, fail)),
		[rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])),
		[rbx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBX])),
		[rcx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RCX])),
		[rdx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDX])),
		[rsi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RSI])),
		[rdi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDI])),
		[rbp]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBP])),
#ifdef CONFIG_X86_64
		[r8]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R8])),
		[r9]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R9])),
		[r10]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R10])),
		[r11]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R11])),
		[r12]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R12])),
		[r13]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R13])),
		[r14]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R14])),
		[r15]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R15])),
#endif
		[cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2))
	      : "cc", "memory"
#ifdef CONFIG_X86_64
		, "rbx", "rdi", "rsi"
		, "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"
#else
		/* NOTE(review): "rsi" here looks like it should be "esi";
		 * GCC maps the 64-bit alias on i386 so it compiles — confirm. */
		, "ebx", "edi", "rsi"
#endif
	      );

	/* Latch vectoring info before anything can overwrite the VMCS view. */
	vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
	if (vmx->rmode.irq.pending)
		fixup_rmode_irq(vmx);

	/* Low two bits: interrupt blocking by STI / by MOV SS (Intel SDM). */
	vcpu->arch.interrupt_window_open =
		(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0;

	/* VMX entry leaves host %ds/%es undefined; reload them. */
	asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS));
	vmx->launched = 1;

	intr_info = vmcs_read32(VM_EXIT_INTR_INFO);

	/* We need to handle NMIs before interrupts are enabled */
	if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200) /* nmi */
		asm("int $2");	/* re-deliver the NMI to the host handler */
}
2471 | |||
2472 | static void vmx_free_vmcs(struct kvm_vcpu *vcpu) | ||
2473 | { | ||
2474 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
2475 | |||
2476 | if (vmx->vmcs) { | ||
2477 | on_each_cpu(__vcpu_clear, vmx, 0, 1); | ||
2478 | free_vmcs(vmx->vmcs); | ||
2479 | vmx->vmcs = NULL; | ||
2480 | } | ||
2481 | } | ||
2482 | |||
2483 | static void vmx_free_vcpu(struct kvm_vcpu *vcpu) | ||
2484 | { | ||
2485 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
2486 | |||
2487 | vmx_free_vmcs(vcpu); | ||
2488 | kfree(vmx->host_msrs); | ||
2489 | kfree(vmx->guest_msrs); | ||
2490 | kvm_vcpu_uninit(vcpu); | ||
2491 | kmem_cache_free(kvm_vcpu_cache, vmx); | ||
2492 | } | ||
2493 | |||
2494 | static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) | ||
2495 | { | ||
2496 | int err; | ||
2497 | struct vcpu_vmx *vmx = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); | ||
2498 | int cpu; | ||
2499 | |||
2500 | if (!vmx) | ||
2501 | return ERR_PTR(-ENOMEM); | ||
2502 | |||
2503 | err = kvm_vcpu_init(&vmx->vcpu, kvm, id); | ||
2504 | if (err) | ||
2505 | goto free_vcpu; | ||
2506 | |||
2507 | vmx->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL); | ||
2508 | if (!vmx->guest_msrs) { | ||
2509 | err = -ENOMEM; | ||
2510 | goto uninit_vcpu; | ||
2511 | } | ||
2512 | |||
2513 | vmx->host_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL); | ||
2514 | if (!vmx->host_msrs) | ||
2515 | goto free_guest_msrs; | ||
2516 | |||
2517 | vmx->vmcs = alloc_vmcs(); | ||
2518 | if (!vmx->vmcs) | ||
2519 | goto free_msrs; | ||
2520 | |||
2521 | vmcs_clear(vmx->vmcs); | ||
2522 | |||
2523 | cpu = get_cpu(); | ||
2524 | vmx_vcpu_load(&vmx->vcpu, cpu); | ||
2525 | err = vmx_vcpu_setup(vmx); | ||
2526 | vmx_vcpu_put(&vmx->vcpu); | ||
2527 | put_cpu(); | ||
2528 | if (err) | ||
2529 | goto free_vmcs; | ||
2530 | |||
2531 | return &vmx->vcpu; | ||
2532 | |||
2533 | free_vmcs: | ||
2534 | free_vmcs(vmx->vmcs); | ||
2535 | free_msrs: | ||
2536 | kfree(vmx->host_msrs); | ||
2537 | free_guest_msrs: | ||
2538 | kfree(vmx->guest_msrs); | ||
2539 | uninit_vcpu: | ||
2540 | kvm_vcpu_uninit(&vmx->vcpu); | ||
2541 | free_vcpu: | ||
2542 | kmem_cache_free(kvm_vcpu_cache, vmx); | ||
2543 | return ERR_PTR(err); | ||
2544 | } | ||
2545 | |||
2546 | static void __init vmx_check_processor_compat(void *rtn) | ||
2547 | { | ||
2548 | struct vmcs_config vmcs_conf; | ||
2549 | |||
2550 | *(int *)rtn = 0; | ||
2551 | if (setup_vmcs_config(&vmcs_conf) < 0) | ||
2552 | *(int *)rtn = -EIO; | ||
2553 | if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config)) != 0) { | ||
2554 | printk(KERN_ERR "kvm: CPU %d feature inconsistency!\n", | ||
2555 | smp_processor_id()); | ||
2556 | *(int *)rtn = -EIO; | ||
2557 | } | ||
2558 | } | ||
2559 | |||
/*
 * VMX implementation of the arch-neutral kvm_x86_ops vtable, registered
 * with the generic KVM layer via kvm_init().
 */
static struct kvm_x86_ops vmx_x86_ops = {
	/* Hardware bring-up / teardown */
	.cpu_has_kvm_support = cpu_has_kvm_support,
	.disabled_by_bios = vmx_disabled_by_bios,
	.hardware_setup = hardware_setup,
	.hardware_unsetup = hardware_unsetup,
	.check_processor_compatibility = vmx_check_processor_compat,
	.hardware_enable = hardware_enable,
	.hardware_disable = hardware_disable,

	/* vcpu lifecycle */
	.vcpu_create = vmx_create_vcpu,
	.vcpu_free = vmx_free_vcpu,
	.vcpu_reset = vmx_vcpu_reset,

	/* vcpu scheduling hooks */
	.prepare_guest_switch = vmx_save_host_state,
	.vcpu_load = vmx_vcpu_load,
	.vcpu_put = vmx_vcpu_put,
	.vcpu_decache = vmx_vcpu_decache,

	/* Guest state accessors */
	.set_guest_debug = set_guest_debug,
	.guest_debug_pre = kvm_guest_debug_pre,
	.get_msr = vmx_get_msr,
	.set_msr = vmx_set_msr,
	.get_segment_base = vmx_get_segment_base,
	.get_segment = vmx_get_segment,
	.set_segment = vmx_set_segment,
	.get_cs_db_l_bits = vmx_get_cs_db_l_bits,
	.decache_cr4_guest_bits = vmx_decache_cr4_guest_bits,
	.set_cr0 = vmx_set_cr0,
	.set_cr3 = vmx_set_cr3,
	.set_cr4 = vmx_set_cr4,
#ifdef CONFIG_X86_64
	.set_efer = vmx_set_efer,
#endif
	.get_idt = vmx_get_idt,
	.set_idt = vmx_set_idt,
	.get_gdt = vmx_get_gdt,
	.set_gdt = vmx_set_gdt,
	.cache_regs = vcpu_load_rsp_rip,
	.decache_regs = vcpu_put_rsp_rip,
	.get_rflags = vmx_get_rflags,
	.set_rflags = vmx_set_rflags,

	.tlb_flush = vmx_flush_tlb,

	/* Guest entry / exit and event injection */
	.run = vmx_vcpu_run,
	.handle_exit = kvm_handle_exit,
	.skip_emulated_instruction = skip_emulated_instruction,
	.patch_hypercall = vmx_patch_hypercall,
	.get_irq = vmx_get_irq,
	.set_irq = vmx_inject_irq,
	.queue_exception = vmx_queue_exception,
	.exception_injected = vmx_exception_injected,
	.inject_pending_irq = vmx_intr_assist,
	.inject_pending_vectors = do_interrupt_requests,

	.set_tss_addr = vmx_set_tss_addr,
};
2617 | |||
2618 | static int __init vmx_init(void) | ||
2619 | { | ||
2620 | void *iova; | ||
2621 | int r; | ||
2622 | |||
2623 | vmx_io_bitmap_a = alloc_page(GFP_KERNEL | __GFP_HIGHMEM); | ||
2624 | if (!vmx_io_bitmap_a) | ||
2625 | return -ENOMEM; | ||
2626 | |||
2627 | vmx_io_bitmap_b = alloc_page(GFP_KERNEL | __GFP_HIGHMEM); | ||
2628 | if (!vmx_io_bitmap_b) { | ||
2629 | r = -ENOMEM; | ||
2630 | goto out; | ||
2631 | } | ||
2632 | |||
2633 | /* | ||
2634 | * Allow direct access to the PC debug port (it is often used for I/O | ||
2635 | * delays, but the vmexits simply slow things down). | ||
2636 | */ | ||
2637 | iova = kmap(vmx_io_bitmap_a); | ||
2638 | memset(iova, 0xff, PAGE_SIZE); | ||
2639 | clear_bit(0x80, iova); | ||
2640 | kunmap(vmx_io_bitmap_a); | ||
2641 | |||
2642 | iova = kmap(vmx_io_bitmap_b); | ||
2643 | memset(iova, 0xff, PAGE_SIZE); | ||
2644 | kunmap(vmx_io_bitmap_b); | ||
2645 | |||
2646 | r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx), THIS_MODULE); | ||
2647 | if (r) | ||
2648 | goto out1; | ||
2649 | |||
2650 | if (bypass_guest_pf) | ||
2651 | kvm_mmu_set_nonpresent_ptes(~0xffeull, 0ull); | ||
2652 | |||
2653 | return 0; | ||
2654 | |||
2655 | out1: | ||
2656 | __free_page(vmx_io_bitmap_b); | ||
2657 | out: | ||
2658 | __free_page(vmx_io_bitmap_a); | ||
2659 | return r; | ||
2660 | } | ||
2661 | |||
/*
 * Module unload: release the I/O bitmap pages allocated in vmx_init()
 * and unregister from the generic KVM layer.
 */
static void __exit vmx_exit(void)
{
	__free_page(vmx_io_bitmap_b);
	__free_page(vmx_io_bitmap_a);

	kvm_exit();
}

module_init(vmx_init)
module_exit(vmx_exit)
diff --git a/arch/x86/kvm/vmx.h b/arch/x86/kvm/vmx.h new file mode 100644 index 000000000000..d52ae8d7303d --- /dev/null +++ b/arch/x86/kvm/vmx.h | |||
@@ -0,0 +1,324 @@ | |||
#ifndef VMX_H
#define VMX_H

/*
 * vmx.h: VMX Architecture related definitions
 * Copyright (c) 2004, Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 *
 * You should have received a copy of the GNU General Public License along with
 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
 * Place - Suite 330, Boston, MA 02111-1307 USA.
 *
 * A few random additions are:
 * Copyright (C) 2006 Qumranet
 *    Avi Kivity <avi@qumranet.com>
 *    Yaniv Kamay <yaniv@qumranet.com>
 *
 */

/*
 * Definitions of Primary Processor-Based VM-Execution Controls.
 */
#define CPU_BASED_VIRTUAL_INTR_PENDING          0x00000004
/* sic: "OFFSETING" — misspelled, but the name is referenced elsewhere,
 * so it must stay as-is. */
#define CPU_BASED_USE_TSC_OFFSETING             0x00000008
#define CPU_BASED_HLT_EXITING                   0x00000080
#define CPU_BASED_INVLPG_EXITING                0x00000200
#define CPU_BASED_MWAIT_EXITING                 0x00000400
#define CPU_BASED_RDPMC_EXITING                 0x00000800
#define CPU_BASED_RDTSC_EXITING                 0x00001000
#define CPU_BASED_CR8_LOAD_EXITING              0x00080000
#define CPU_BASED_CR8_STORE_EXITING             0x00100000
#define CPU_BASED_TPR_SHADOW                    0x00200000
#define CPU_BASED_MOV_DR_EXITING                0x00800000
#define CPU_BASED_UNCOND_IO_EXITING             0x01000000
#define CPU_BASED_USE_IO_BITMAPS                0x02000000
#define CPU_BASED_USE_MSR_BITMAPS               0x10000000
#define CPU_BASED_MONITOR_EXITING               0x20000000
#define CPU_BASED_PAUSE_EXITING                 0x40000000
#define CPU_BASED_ACTIVATE_SECONDARY_CONTROLS   0x80000000
/*
 * Definitions of Secondary Processor-Based VM-Execution Controls.
 */
#define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001
#define SECONDARY_EXEC_WBINVD_EXITING		0x00000040


/* Pin-based VM-execution controls. */
#define PIN_BASED_EXT_INTR_MASK                 0x00000001
#define PIN_BASED_NMI_EXITING                   0x00000008
#define PIN_BASED_VIRTUAL_NMIS                  0x00000020

/* VM-exit controls. */
#define VM_EXIT_HOST_ADDR_SPACE_SIZE            0x00000200
#define VM_EXIT_ACK_INTR_ON_EXIT                0x00008000

/* VM-entry controls. */
#define VM_ENTRY_IA32E_MODE                     0x00000200
#define VM_ENTRY_SMM                            0x00000400
#define VM_ENTRY_DEACT_DUAL_MONITOR             0x00000800
/* VMCS Encodings */
/*
 * Field encodings group by width (visible in the value ranges below;
 * per Intel SDM): 0x0800/0x0c00 are 16-bit selector fields, 0x2000 are
 * 64-bit fields (each with a *_HIGH companion for 32-bit hosts),
 * 0x4000 are 32-bit fields, and 0x6000 are natural-width fields.
 */
enum vmcs_field {
	GUEST_ES_SELECTOR               = 0x00000800,
	GUEST_CS_SELECTOR               = 0x00000802,
	GUEST_SS_SELECTOR               = 0x00000804,
	GUEST_DS_SELECTOR               = 0x00000806,
	GUEST_FS_SELECTOR               = 0x00000808,
	GUEST_GS_SELECTOR               = 0x0000080a,
	GUEST_LDTR_SELECTOR             = 0x0000080c,
	GUEST_TR_SELECTOR               = 0x0000080e,
	HOST_ES_SELECTOR                = 0x00000c00,
	HOST_CS_SELECTOR                = 0x00000c02,
	HOST_SS_SELECTOR                = 0x00000c04,
	HOST_DS_SELECTOR                = 0x00000c06,
	HOST_FS_SELECTOR                = 0x00000c08,
	HOST_GS_SELECTOR                = 0x00000c0a,
	HOST_TR_SELECTOR                = 0x00000c0c,
	IO_BITMAP_A                     = 0x00002000,
	IO_BITMAP_A_HIGH                = 0x00002001,
	IO_BITMAP_B                     = 0x00002002,
	IO_BITMAP_B_HIGH                = 0x00002003,
	MSR_BITMAP                      = 0x00002004,
	MSR_BITMAP_HIGH                 = 0x00002005,
	VM_EXIT_MSR_STORE_ADDR          = 0x00002006,
	VM_EXIT_MSR_STORE_ADDR_HIGH     = 0x00002007,
	VM_EXIT_MSR_LOAD_ADDR           = 0x00002008,
	VM_EXIT_MSR_LOAD_ADDR_HIGH      = 0x00002009,
	VM_ENTRY_MSR_LOAD_ADDR          = 0x0000200a,
	VM_ENTRY_MSR_LOAD_ADDR_HIGH     = 0x0000200b,
	TSC_OFFSET                      = 0x00002010,
	TSC_OFFSET_HIGH                 = 0x00002011,
	VIRTUAL_APIC_PAGE_ADDR          = 0x00002012,
	VIRTUAL_APIC_PAGE_ADDR_HIGH     = 0x00002013,
	APIC_ACCESS_ADDR		= 0x00002014,
	APIC_ACCESS_ADDR_HIGH		= 0x00002015,
	VMCS_LINK_POINTER               = 0x00002800,
	VMCS_LINK_POINTER_HIGH          = 0x00002801,
	GUEST_IA32_DEBUGCTL             = 0x00002802,
	GUEST_IA32_DEBUGCTL_HIGH        = 0x00002803,
	PIN_BASED_VM_EXEC_CONTROL       = 0x00004000,
	CPU_BASED_VM_EXEC_CONTROL       = 0x00004002,
	EXCEPTION_BITMAP                = 0x00004004,
	PAGE_FAULT_ERROR_CODE_MASK      = 0x00004006,
	PAGE_FAULT_ERROR_CODE_MATCH     = 0x00004008,
	CR3_TARGET_COUNT                = 0x0000400a,
	VM_EXIT_CONTROLS                = 0x0000400c,
	VM_EXIT_MSR_STORE_COUNT         = 0x0000400e,
	VM_EXIT_MSR_LOAD_COUNT          = 0x00004010,
	VM_ENTRY_CONTROLS               = 0x00004012,
	VM_ENTRY_MSR_LOAD_COUNT         = 0x00004014,
	VM_ENTRY_INTR_INFO_FIELD        = 0x00004016,
	VM_ENTRY_EXCEPTION_ERROR_CODE   = 0x00004018,
	VM_ENTRY_INSTRUCTION_LEN        = 0x0000401a,
	TPR_THRESHOLD                   = 0x0000401c,
	SECONDARY_VM_EXEC_CONTROL       = 0x0000401e,
	VM_INSTRUCTION_ERROR            = 0x00004400,
	VM_EXIT_REASON                  = 0x00004402,
	VM_EXIT_INTR_INFO               = 0x00004404,
	VM_EXIT_INTR_ERROR_CODE         = 0x00004406,
	IDT_VECTORING_INFO_FIELD        = 0x00004408,
	IDT_VECTORING_ERROR_CODE        = 0x0000440a,
	VM_EXIT_INSTRUCTION_LEN         = 0x0000440c,
	VMX_INSTRUCTION_INFO            = 0x0000440e,
	GUEST_ES_LIMIT                  = 0x00004800,
	GUEST_CS_LIMIT                  = 0x00004802,
	GUEST_SS_LIMIT                  = 0x00004804,
	GUEST_DS_LIMIT                  = 0x00004806,
	GUEST_FS_LIMIT                  = 0x00004808,
	GUEST_GS_LIMIT                  = 0x0000480a,
	GUEST_LDTR_LIMIT                = 0x0000480c,
	GUEST_TR_LIMIT                  = 0x0000480e,
	GUEST_GDTR_LIMIT                = 0x00004810,
	GUEST_IDTR_LIMIT                = 0x00004812,
	GUEST_ES_AR_BYTES               = 0x00004814,
	GUEST_CS_AR_BYTES               = 0x00004816,
	GUEST_SS_AR_BYTES               = 0x00004818,
	GUEST_DS_AR_BYTES               = 0x0000481a,
	GUEST_FS_AR_BYTES               = 0x0000481c,
	GUEST_GS_AR_BYTES               = 0x0000481e,
	GUEST_LDTR_AR_BYTES             = 0x00004820,
	GUEST_TR_AR_BYTES               = 0x00004822,
	GUEST_INTERRUPTIBILITY_INFO     = 0x00004824,
	GUEST_ACTIVITY_STATE            = 0X00004826, /* sic: capital "0X" */
	GUEST_SYSENTER_CS               = 0x0000482A,
	HOST_IA32_SYSENTER_CS           = 0x00004c00,
	CR0_GUEST_HOST_MASK             = 0x00006000,
	CR4_GUEST_HOST_MASK             = 0x00006002,
	CR0_READ_SHADOW                 = 0x00006004,
	CR4_READ_SHADOW                 = 0x00006006,
	CR3_TARGET_VALUE0               = 0x00006008,
	CR3_TARGET_VALUE1               = 0x0000600a,
	CR3_TARGET_VALUE2               = 0x0000600c,
	CR3_TARGET_VALUE3               = 0x0000600e,
	EXIT_QUALIFICATION              = 0x00006400,
	GUEST_LINEAR_ADDRESS            = 0x0000640a,
	GUEST_CR0                       = 0x00006800,
	GUEST_CR3                       = 0x00006802,
	GUEST_CR4                       = 0x00006804,
	GUEST_ES_BASE                   = 0x00006806,
	GUEST_CS_BASE                   = 0x00006808,
	GUEST_SS_BASE                   = 0x0000680a,
	GUEST_DS_BASE                   = 0x0000680c,
	GUEST_FS_BASE                   = 0x0000680e,
	GUEST_GS_BASE                   = 0x00006810,
	GUEST_LDTR_BASE                 = 0x00006812,
	GUEST_TR_BASE                   = 0x00006814,
	GUEST_GDTR_BASE                 = 0x00006816,
	GUEST_IDTR_BASE                 = 0x00006818,
	GUEST_DR7                       = 0x0000681a,
	GUEST_RSP                       = 0x0000681c,
	GUEST_RIP                       = 0x0000681e,
	GUEST_RFLAGS                    = 0x00006820,
	GUEST_PENDING_DBG_EXCEPTIONS    = 0x00006822,
	GUEST_SYSENTER_ESP              = 0x00006824,
	GUEST_SYSENTER_EIP              = 0x00006826,
	HOST_CR0                        = 0x00006c00,
	HOST_CR3                        = 0x00006c02,
	HOST_CR4                        = 0x00006c04,
	HOST_FS_BASE                    = 0x00006c06,
	HOST_GS_BASE                    = 0x00006c08,
	HOST_TR_BASE                    = 0x00006c0a,
	HOST_GDTR_BASE                  = 0x00006c0c,
	HOST_IDTR_BASE                  = 0x00006c0e,
	HOST_IA32_SYSENTER_ESP          = 0x00006c10,
	HOST_IA32_SYSENTER_EIP          = 0x00006c12,
	HOST_RSP                        = 0x00006c14,
	HOST_RIP                        = 0x00006c16,
};
194 | |||
/* Set in VM_EXIT_REASON when the vmexit was a failed VM entry. */
#define VMX_EXIT_REASONS_FAILED_VMENTRY         0x80000000

/* Basic exit reasons (low bits of VM_EXIT_REASON). */
#define EXIT_REASON_EXCEPTION_NMI       0
#define EXIT_REASON_EXTERNAL_INTERRUPT  1
#define EXIT_REASON_TRIPLE_FAULT        2

#define EXIT_REASON_PENDING_INTERRUPT   7

#define EXIT_REASON_TASK_SWITCH         9
#define EXIT_REASON_CPUID               10
#define EXIT_REASON_HLT                 12
#define EXIT_REASON_INVLPG              14
#define EXIT_REASON_RDPMC               15
#define EXIT_REASON_RDTSC               16
#define EXIT_REASON_VMCALL              18
#define EXIT_REASON_VMCLEAR             19
#define EXIT_REASON_VMLAUNCH            20
#define EXIT_REASON_VMPTRLD             21
#define EXIT_REASON_VMPTRST             22
#define EXIT_REASON_VMREAD              23
#define EXIT_REASON_VMRESUME            24
#define EXIT_REASON_VMWRITE             25
#define EXIT_REASON_VMOFF               26
#define EXIT_REASON_VMON                27
#define EXIT_REASON_CR_ACCESS           28
#define EXIT_REASON_DR_ACCESS           29
#define EXIT_REASON_IO_INSTRUCTION      30
#define EXIT_REASON_MSR_READ            31
#define EXIT_REASON_MSR_WRITE           32
#define EXIT_REASON_MWAIT_INSTRUCTION   36
#define EXIT_REASON_TPR_BELOW_THRESHOLD 43
#define EXIT_REASON_APIC_ACCESS         44
#define EXIT_REASON_WBINVD		54

/*
 * Interruption-information format
 */
#define INTR_INFO_VECTOR_MASK           0xff            /* 7:0 */
#define INTR_INFO_INTR_TYPE_MASK        0x700           /* 10:8 */
/* sic: "DELIEVER" — misspelled, but the name is referenced elsewhere. */
#define INTR_INFO_DELIEVER_CODE_MASK    0x800           /* 11 */
#define INTR_INFO_VALID_MASK            0x80000000      /* 31 */

/* The IDT-vectoring field shares the interruption-info layout. */
#define VECTORING_INFO_VECTOR_MASK           	INTR_INFO_VECTOR_MASK
#define VECTORING_INFO_TYPE_MASK        	INTR_INFO_INTR_TYPE_MASK
#define VECTORING_INFO_DELIEVER_CODE_MASK    	INTR_INFO_DELIEVER_CODE_MASK
#define VECTORING_INFO_VALID_MASK       	INTR_INFO_VALID_MASK

#define INTR_TYPE_EXT_INTR              (0 << 8) /* external interrupt */
#define INTR_TYPE_EXCEPTION             (3 << 8) /* processor exception */
#define INTR_TYPE_SOFT_INTR             (4 << 8) /* software interrupt */

/*
 * Exit Qualifications for MOV for Control Register Access
 */
#define CONTROL_REG_ACCESS_NUM          0x7     /* 2:0, number of control reg.*/
#define CONTROL_REG_ACCESS_TYPE         0x30    /* 5:4, access type */
#define CONTROL_REG_ACCESS_REG          0xf00   /* 10:8, general purpose reg. */
#define LMSW_SOURCE_DATA_SHIFT 16
#define LMSW_SOURCE_DATA  (0xFFFF << LMSW_SOURCE_DATA_SHIFT) /* 16:31 lmsw source */
/* GP register numbers as encoded in CONTROL_REG_ACCESS_REG. */
#define REG_EAX                         (0 << 8)
#define REG_ECX                         (1 << 8)
#define REG_EDX                         (2 << 8)
#define REG_EBX                         (3 << 8)
#define REG_ESP                         (4 << 8)
#define REG_EBP                         (5 << 8)
#define REG_ESI                         (6 << 8)
#define REG_EDI                         (7 << 8)
#define REG_R8                         (8 << 8)
#define REG_R9                         (9 << 8)
#define REG_R10                        (10 << 8)
#define REG_R11                        (11 << 8)
#define REG_R12                        (12 << 8)
#define REG_R13                        (13 << 8)
#define REG_R14                        (14 << 8)
#define REG_R15                        (15 << 8)

/*
 * Exit Qualifications for MOV for Debug Register Access
 */
#define DEBUG_REG_ACCESS_NUM            0x7     /* 2:0, number of debug reg. */
#define DEBUG_REG_ACCESS_TYPE           0x10    /* 4, direction of access */
#define TYPE_MOV_TO_DR                  (0 << 4)
#define TYPE_MOV_FROM_DR                (1 << 4)
#define DEBUG_REG_ACCESS_REG            0xf00   /* 11:8, general purpose reg. */


/* segment AR */
#define SEGMENT_AR_L_MASK (1 << 13)

/* Bits of the segment access-rights "type" nibble (bits 3:0). */
#define AR_TYPE_ACCESSES_MASK 1
#define AR_TYPE_READABLE_MASK (1 << 1)
#define AR_TYPE_WRITEABLE_MASK (1 << 2)
#define AR_TYPE_CODE_MASK (1 << 3)
#define AR_TYPE_MASK 0x0f
/* 64-bit and 32-bit busy TSS share descriptor type 11. */
#define AR_TYPE_BUSY_64_TSS 11
#define AR_TYPE_BUSY_32_TSS 11
#define AR_TYPE_BUSY_16_TSS 3
#define AR_TYPE_LDT 2

#define AR_UNUSABLE_MASK (1 << 16)
#define AR_S_MASK (1 << 4)
#define AR_P_MASK (1 << 7)
#define AR_L_MASK (1 << 13)
#define AR_DB_MASK (1 << 14)
#define AR_G_MASK (1 << 15)
#define AR_DPL_SHIFT 5
#define AR_DPL(ar) (((ar) >> AR_DPL_SHIFT) & 3)

#define AR_RESERVD_MASK 0xfffe0f00

/* VMX capability MSRs (Intel SDM appendix A). */
#define MSR_IA32_VMX_BASIC                      0x480
#define MSR_IA32_VMX_PINBASED_CTLS              0x481
#define MSR_IA32_VMX_PROCBASED_CTLS             0x482
#define MSR_IA32_VMX_EXIT_CTLS                  0x483
#define MSR_IA32_VMX_ENTRY_CTLS                 0x484
#define MSR_IA32_VMX_MISC                       0x485
#define MSR_IA32_VMX_CR0_FIXED0                 0x486
#define MSR_IA32_VMX_CR0_FIXED1                 0x487
#define MSR_IA32_VMX_CR4_FIXED0                 0x488
#define MSR_IA32_VMX_CR4_FIXED1                 0x489
#define MSR_IA32_VMX_VMCS_ENUM                  0x48a
#define MSR_IA32_VMX_PROCBASED_CTLS2            0x48b

#define MSR_IA32_FEATURE_CONTROL                0x3a
#define MSR_IA32_FEATURE_CONTROL_LOCKED         0x1
#define MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED  0x4

/* Internal memslot index used for the APIC-access page. */
#define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT	9

#endif
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c new file mode 100644 index 000000000000..5902c5cbc1bb --- /dev/null +++ b/arch/x86/kvm/x86.c | |||
@@ -0,0 +1,3146 @@ | |||
1 | /* | ||
2 | * Kernel-based Virtual Machine driver for Linux | ||
3 | * | ||
4 | * derived from drivers/kvm/kvm_main.c | ||
5 | * | ||
6 | * Copyright (C) 2006 Qumranet, Inc. | ||
7 | * | ||
8 | * Authors: | ||
9 | * Avi Kivity <avi@qumranet.com> | ||
10 | * Yaniv Kamay <yaniv@qumranet.com> | ||
11 | * | ||
12 | * This work is licensed under the terms of the GNU GPL, version 2. See | ||
13 | * the COPYING file in the top-level directory. | ||
14 | * | ||
15 | */ | ||
16 | |||
17 | #include <linux/kvm_host.h> | ||
18 | #include "segment_descriptor.h" | ||
19 | #include "irq.h" | ||
20 | #include "mmu.h" | ||
21 | |||
22 | #include <linux/kvm.h> | ||
23 | #include <linux/fs.h> | ||
24 | #include <linux/vmalloc.h> | ||
25 | #include <linux/module.h> | ||
26 | #include <linux/mman.h> | ||
27 | #include <linux/highmem.h> | ||
28 | |||
29 | #include <asm/uaccess.h> | ||
30 | #include <asm/msr.h> | ||
31 | |||
32 | #define MAX_IO_MSRS 256 | ||
33 | #define CR0_RESERVED_BITS \ | ||
34 | (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \ | ||
35 | | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \ | ||
36 | | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG)) | ||
37 | #define CR4_RESERVED_BITS \ | ||
38 | (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\ | ||
39 | | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \ | ||
40 | | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR \ | ||
41 | | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE)) | ||
42 | |||
43 | #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR) | ||
44 | #define EFER_RESERVED_BITS 0xfffffffffffff2fe | ||
45 | |||
46 | #define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM | ||
47 | #define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU | ||
48 | |||
49 | struct kvm_x86_ops *kvm_x86_ops; | ||
50 | |||
51 | struct kvm_stats_debugfs_item debugfs_entries[] = { | ||
52 | { "pf_fixed", VCPU_STAT(pf_fixed) }, | ||
53 | { "pf_guest", VCPU_STAT(pf_guest) }, | ||
54 | { "tlb_flush", VCPU_STAT(tlb_flush) }, | ||
55 | { "invlpg", VCPU_STAT(invlpg) }, | ||
56 | { "exits", VCPU_STAT(exits) }, | ||
57 | { "io_exits", VCPU_STAT(io_exits) }, | ||
58 | { "mmio_exits", VCPU_STAT(mmio_exits) }, | ||
59 | { "signal_exits", VCPU_STAT(signal_exits) }, | ||
60 | { "irq_window", VCPU_STAT(irq_window_exits) }, | ||
61 | { "halt_exits", VCPU_STAT(halt_exits) }, | ||
62 | { "halt_wakeup", VCPU_STAT(halt_wakeup) }, | ||
63 | { "request_irq", VCPU_STAT(request_irq_exits) }, | ||
64 | { "irq_exits", VCPU_STAT(irq_exits) }, | ||
65 | { "host_state_reload", VCPU_STAT(host_state_reload) }, | ||
66 | { "efer_reload", VCPU_STAT(efer_reload) }, | ||
67 | { "fpu_reload", VCPU_STAT(fpu_reload) }, | ||
68 | { "insn_emulation", VCPU_STAT(insn_emulation) }, | ||
69 | { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) }, | ||
70 | { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) }, | ||
71 | { "mmu_pte_write", VM_STAT(mmu_pte_write) }, | ||
72 | { "mmu_pte_updated", VM_STAT(mmu_pte_updated) }, | ||
73 | { "mmu_pde_zapped", VM_STAT(mmu_pde_zapped) }, | ||
74 | { "mmu_flooded", VM_STAT(mmu_flooded) }, | ||
75 | { "mmu_recycled", VM_STAT(mmu_recycled) }, | ||
76 | { "remote_tlb_flush", VM_STAT(remote_tlb_flush) }, | ||
77 | { NULL } | ||
78 | }; | ||
79 | |||
80 | |||
/*
 * Return the linear base address of the host segment described by
 * @selector, read from the host GDT (or, when bit 2 of the selector is
 * set, from the LDT).  A null selector yields base 0.
 */
unsigned long segment_base(u16 selector)
{
	struct descriptor_table gdt;
	struct segment_descriptor *d;
	unsigned long table_base;
	unsigned long v;

	if (selector == 0)
		return 0;

	/* Read the host's GDT register to locate the descriptor table. */
	asm("sgdt %0" : "=m"(gdt));
	table_base = gdt.base;

	if (selector & 4) {	/* from ldt */
		u16 ldt_selector;

		/* The LDT itself is described by a GDT entry; recurse once
		 * to find the LDT's base. */
		asm("sldt %0" : "=g"(ldt_selector));
		table_base = segment_base(ldt_selector);
	}
	/* Mask off the RPL/TI bits to get the byte offset of the descriptor. */
	d = (struct segment_descriptor *)(table_base + (selector & ~7));
	/* Assemble the 32-bit base from its three descriptor fields. */
	v = d->base_low | ((unsigned long)d->base_mid << 16) |
		((unsigned long)d->base_high << 24);
#ifdef CONFIG_X86_64
	/* System descriptors (types 2=LDT, 9/11=TSS) are 16 bytes wide in
	 * long mode and carry an extra 32 bits of base address. */
	if (d->system == 0 && (d->type == 2 || d->type == 9 || d->type == 11))
		v |= ((unsigned long) \
			((struct segment_descriptor_64 *)d)->base_higher) << 32;
#endif
	return v;
}
EXPORT_SYMBOL_GPL(segment_base);
111 | |||
112 | u64 kvm_get_apic_base(struct kvm_vcpu *vcpu) | ||
113 | { | ||
114 | if (irqchip_in_kernel(vcpu->kvm)) | ||
115 | return vcpu->arch.apic_base; | ||
116 | else | ||
117 | return vcpu->arch.apic_base; | ||
118 | } | ||
119 | EXPORT_SYMBOL_GPL(kvm_get_apic_base); | ||
120 | |||
121 | void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data) | ||
122 | { | ||
123 | /* TODO: reserve bits check */ | ||
124 | if (irqchip_in_kernel(vcpu->kvm)) | ||
125 | kvm_lapic_set_base(vcpu, data); | ||
126 | else | ||
127 | vcpu->arch.apic_base = data; | ||
128 | } | ||
129 | EXPORT_SYMBOL_GPL(kvm_set_apic_base); | ||
130 | |||
131 | void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr) | ||
132 | { | ||
133 | WARN_ON(vcpu->arch.exception.pending); | ||
134 | vcpu->arch.exception.pending = true; | ||
135 | vcpu->arch.exception.has_error_code = false; | ||
136 | vcpu->arch.exception.nr = nr; | ||
137 | } | ||
138 | EXPORT_SYMBOL_GPL(kvm_queue_exception); | ||
139 | |||
/*
 * Report a guest page fault at @addr with hardware error code
 * @error_code.  If a #PF is already pending, the second fault is
 * escalated to a double fault (#DF) instead of queueing a second #PF.
 */
void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr,
			   u32 error_code)
{
	++vcpu->stat.pf_guest;
	if (vcpu->arch.exception.pending && vcpu->arch.exception.nr == PF_VECTOR) {
		printk(KERN_DEBUG "kvm: inject_page_fault:"
		       " double fault 0x%lx\n", addr);
		/* Rewrite the pending #PF in place into a #DF (error code 0). */
		vcpu->arch.exception.nr = DF_VECTOR;
		vcpu->arch.exception.error_code = 0;
		return;
	}
	/* cr2 must hold the faulting address when the #PF is delivered. */
	vcpu->arch.cr2 = addr;
	kvm_queue_exception_e(vcpu, PF_VECTOR, error_code);
}
154 | |||
155 | void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code) | ||
156 | { | ||
157 | WARN_ON(vcpu->arch.exception.pending); | ||
158 | vcpu->arch.exception.pending = true; | ||
159 | vcpu->arch.exception.has_error_code = true; | ||
160 | vcpu->arch.exception.nr = nr; | ||
161 | vcpu->arch.exception.error_code = error_code; | ||
162 | } | ||
163 | EXPORT_SYMBOL_GPL(kvm_queue_exception_e); | ||
164 | |||
/* Hand the pending exception to the vendor backend for injection. */
static void __queue_exception(struct kvm_vcpu *vcpu)
{
	kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr,
				     vcpu->arch.exception.has_error_code,
				     vcpu->arch.exception.error_code);
}
171 | |||
/*
 * Load the pae pdptrs.  Return true if they are all valid.
 *
 * Reads the four PDPT entries from guest memory at @cr3, rejects any
 * present entry with reserved bits set, and on success caches them in
 * vcpu->arch.pdptrs.
 */
int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
{
	gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
	/* Offset of the 32-byte-aligned PDPT within its page, counted in
	 * u64 entries (hence the final * sizeof(u64) below). */
	unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2;
	int i;
	int ret;
	u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)];

	mutex_lock(&vcpu->kvm->lock);
	ret = kvm_read_guest_page(vcpu->kvm, pdpt_gfn, pdpte,
				  offset * sizeof(u64), sizeof(pdpte));
	if (ret < 0) {
		/* Unreadable PDPT: report "not valid". */
		ret = 0;
		goto out;
	}
	/* A present entry (bit 0) must have all reserved bits clear. */
	for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
		if ((pdpte[i] & 1) && (pdpte[i] & 0xfffffff0000001e6ull)) {
			ret = 0;
			goto out;
		}
	}
	ret = 1;

	/* Cache the validated entries for later comparison/injection. */
	memcpy(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs));
out:
	mutex_unlock(&vcpu->kvm->lock);

	return ret;
}
204 | |||
/*
 * Re-read the guest's PDPT entries and report whether they differ from
 * the values cached by load_pdptrs().  Only meaningful for PAE paging;
 * long mode and non-PAE modes always report "unchanged".  A failed
 * guest-memory read is conservatively treated as "changed".
 */
static bool pdptrs_changed(struct kvm_vcpu *vcpu)
{
	u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)];
	bool changed = true;
	int r;

	if (is_long_mode(vcpu) || !is_pae(vcpu))
		return false;

	mutex_lock(&vcpu->kvm->lock);
	/* PDPT base = cr3 with the low five control bits masked off. */
	r = kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u, pdpte, sizeof(pdpte));
	if (r < 0)
		goto out;
	changed = memcmp(pdpte, vcpu->arch.pdptrs, sizeof(pdpte)) != 0;
out:
	mutex_unlock(&vcpu->kvm->lock);

	return changed;
}
224 | |||
/*
 * Emulate a guest write to CR0.  Architecturally invalid combinations
 * inject #GP into the guest and leave CR0 untouched; a valid write is
 * propagated to the vendor backend and forces an MMU context reset,
 * since the paging mode may have changed.
 */
void set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
	if (cr0 & CR0_RESERVED_BITS) {
		printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n",
		       cr0, vcpu->arch.cr0);
		kvm_inject_gp(vcpu, 0);
		return;
	}

	/* NW set with CD clear is an invalid cache-control combination. */
	if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) {
		printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n");
		kvm_inject_gp(vcpu, 0);
		return;
	}

	/* Paging (PG) requires protected mode (PE). */
	if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) {
		printk(KERN_DEBUG "set_cr0: #GP, set PG flag "
		       "and a clear PE flag\n");
		kvm_inject_gp(vcpu, 0);
		return;
	}

	/* The guest is turning paging on. */
	if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
#ifdef CONFIG_X86_64
		/* EFER.LME set: entering long mode needs PAE and CS.L == 0. */
		if ((vcpu->arch.shadow_efer & EFER_LME)) {
			int cs_db, cs_l;

			if (!is_pae(vcpu)) {
				printk(KERN_DEBUG "set_cr0: #GP, start paging "
				       "in long mode while PAE is disabled\n");
				kvm_inject_gp(vcpu, 0);
				return;
			}
			kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
			if (cs_l) {
				printk(KERN_DEBUG "set_cr0: #GP, start paging "
				       "in long mode while CS.L == 1\n");
				kvm_inject_gp(vcpu, 0);
				return;

			}
		} else
#endif
		/* PAE paging: the PDPTEs are loaded and validated now. */
		if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3)) {
			printk(KERN_DEBUG "set_cr0: #GP, pdptrs "
			       "reserved bits\n");
			kvm_inject_gp(vcpu, 0);
			return;
		}

	}

	kvm_x86_ops->set_cr0(vcpu, cr0);
	vcpu->arch.cr0 = cr0;

	/* Paging mode may have changed; rebuild the MMU context. */
	mutex_lock(&vcpu->kvm->lock);
	kvm_mmu_reset_context(vcpu);
	mutex_unlock(&vcpu->kvm->lock);
	return;
}
EXPORT_SYMBOL_GPL(set_cr0);
286 | |||
287 | void lmsw(struct kvm_vcpu *vcpu, unsigned long msw) | ||
288 | { | ||
289 | set_cr0(vcpu, (vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f)); | ||
290 | } | ||
291 | EXPORT_SYMBOL_GPL(lmsw); | ||
292 | |||
/*
 * Emulate a guest write to CR4.  Reserved bits, clearing PAE while in
 * long mode, invalid PDPTEs when enabling PAE under paging, or any
 * attempt to set VMXE inject #GP and change nothing.  A valid write
 * reaches the vendor backend and resets the MMU context.
 */
void set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
	if (cr4 & CR4_RESERVED_BITS) {
		printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n");
		kvm_inject_gp(vcpu, 0);
		return;
	}

	if (is_long_mode(vcpu)) {
		/* PAE is mandatory in long mode and may not be cleared. */
		if (!(cr4 & X86_CR4_PAE)) {
			printk(KERN_DEBUG "set_cr4: #GP, clearing PAE while "
			       "in long mode\n");
			kvm_inject_gp(vcpu, 0);
			return;
		}
	} else if (is_paging(vcpu) && !is_pae(vcpu) && (cr4 & X86_CR4_PAE)
		   && !load_pdptrs(vcpu, vcpu->arch.cr3)) {
		/* Turning on PAE while paging: the PDPTEs must be valid. */
		printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n");
		kvm_inject_gp(vcpu, 0);
		return;
	}

	/* The guest is never allowed to enable VMX (CR4.VMXE). */
	if (cr4 & X86_CR4_VMXE) {
		printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n");
		kvm_inject_gp(vcpu, 0);
		return;
	}
	kvm_x86_ops->set_cr4(vcpu, cr4);
	vcpu->arch.cr4 = cr4;
	/* Paging-related bits may have changed; rebuild the MMU context. */
	mutex_lock(&vcpu->kvm->lock);
	kvm_mmu_reset_context(vcpu);
	mutex_unlock(&vcpu->kvm->lock);
}
EXPORT_SYMBOL_GPL(set_cr4);
327 | |||
/*
 * Emulate a guest write to CR3.  Rewriting the current value with
 * unchanged PDPTEs needs only a TLB flush; otherwise mode-specific
 * reserved bits (and, in PAE mode, the new PDPTEs) are validated and
 * the new root is handed to the MMU.
 */
void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
{
	if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) {
		kvm_mmu_flush_tlb(vcpu);
		return;
	}

	if (is_long_mode(vcpu)) {
		if (cr3 & CR3_L_MODE_RESERVED_BITS) {
			printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n");
			kvm_inject_gp(vcpu, 0);
			return;
		}
	} else {
		if (is_pae(vcpu)) {
			if (cr3 & CR3_PAE_RESERVED_BITS) {
				printk(KERN_DEBUG
				       "set_cr3: #GP, reserved bits\n");
				kvm_inject_gp(vcpu, 0);
				return;
			}
			if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) {
				printk(KERN_DEBUG "set_cr3: #GP, pdptrs "
				       "reserved bits\n");
				kvm_inject_gp(vcpu, 0);
				return;
			}
		}
		/*
		 * We don't check reserved bits in nonpae mode, because
		 * this isn't enforced, and VMware depends on this.
		 */
	}

	mutex_lock(&vcpu->kvm->lock);
	/*
	 * Does the new cr3 value map to physical memory? (Note, we
	 * catch an invalid cr3 even in real-mode, because it would
	 * cause trouble later on when we turn on paging anyway.)
	 *
	 * A real CPU would silently accept an invalid cr3 and would
	 * attempt to use it - with largely undefined (and often hard
	 * to debug) behavior on the guest side.
	 */
	if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT)))
		kvm_inject_gp(vcpu, 0);
	else {
		vcpu->arch.cr3 = cr3;
		/* Tell the MMU about the new root. */
		vcpu->arch.mmu.new_cr3(vcpu);
	}
	mutex_unlock(&vcpu->kvm->lock);
}
EXPORT_SYMBOL_GPL(set_cr3);
381 | |||
382 | void set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) | ||
383 | { | ||
384 | if (cr8 & CR8_RESERVED_BITS) { | ||
385 | printk(KERN_DEBUG "set_cr8: #GP, reserved bits 0x%lx\n", cr8); | ||
386 | kvm_inject_gp(vcpu, 0); | ||
387 | return; | ||
388 | } | ||
389 | if (irqchip_in_kernel(vcpu->kvm)) | ||
390 | kvm_lapic_set_tpr(vcpu, cr8); | ||
391 | else | ||
392 | vcpu->arch.cr8 = cr8; | ||
393 | } | ||
394 | EXPORT_SYMBOL_GPL(set_cr8); | ||
395 | |||
396 | unsigned long get_cr8(struct kvm_vcpu *vcpu) | ||
397 | { | ||
398 | if (irqchip_in_kernel(vcpu->kvm)) | ||
399 | return kvm_lapic_get_cr8(vcpu); | ||
400 | else | ||
401 | return vcpu->arch.cr8; | ||
402 | } | ||
403 | EXPORT_SYMBOL_GPL(get_cr8); | ||
404 | |||
/*
 * List of msr numbers which we expose to userspace through KVM_GET_MSRS
 * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
 *
 * This list is modified at module load time to reflect the
 * capabilities of the host cpu.
 */
static u32 msrs_to_save[] = {
	MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
	MSR_K6_STAR,
#ifdef CONFIG_X86_64
	MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
#endif
	MSR_IA32_TIME_STAMP_COUNTER,
};

/* Number of valid entries in msrs_to_save[].  NOTE(review): appears to
 * be filled in elsewhere (per the comment above, at module load) —
 * confirm against the module init path. */
static unsigned num_msrs_to_save;

/* MSRs kvm emulates itself rather than exposing host hardware for. */
static u32 emulated_msrs[] = {
	MSR_IA32_MISC_ENABLE,
};
426 | |||
#ifdef CONFIG_X86_64

/*
 * Emulate a guest write to the EFER MSR.  Reserved bits, or toggling
 * LME while paging is enabled, inject #GP.  LMA is treated as
 * read-only: the current shadow value is preserved regardless of what
 * the guest wrote.
 */
static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
{
	if (efer & EFER_RESERVED_BITS) {
		printk(KERN_DEBUG "set_efer: 0x%llx #GP, reserved bits\n",
		       efer);
		kvm_inject_gp(vcpu, 0);
		return;
	}

	if (is_paging(vcpu)
	    && (vcpu->arch.shadow_efer & EFER_LME) != (efer & EFER_LME)) {
		printk(KERN_DEBUG "set_efer: #GP, change LME while paging\n");
		kvm_inject_gp(vcpu, 0);
		return;
	}

	kvm_x86_ops->set_efer(vcpu, efer);

	/* Keep the existing LMA bit; it is not writable by the guest. */
	efer &= ~EFER_LMA;
	efer |= vcpu->arch.shadow_efer & EFER_LMA;

	vcpu->arch.shadow_efer = efer;
}

#endif
454 | |||
/*
 * Writes msr value into the appropriate "register".
 * Returns 0 on success, non-0 otherwise.
 * Assumes vcpu_load() was already called.
 */
int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
{
	/* Delegate to the vendor backend (VMX or SVM). */
	return kvm_x86_ops->set_msr(vcpu, msr_index, data);
}
464 | |||
/*
 * Adapt set_msr() to msr_io()'s calling convention
 * (the data pointer is only read for a write operation).
 */
static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
{
	return kvm_set_msr(vcpu, index, *data);
}
472 | |||
473 | |||
/*
 * Handle a wrmsr for MSRs common to both vendor backends.  Returns 0
 * when the write was handled (possibly as a logged no-op), 1 so the
 * caller can report an unhandled wrmsr.
 */
int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
{
	switch (msr) {
#ifdef CONFIG_X86_64
	case MSR_EFER:
		set_efer(vcpu, data);
		break;
#endif
	/* Machine-check status writes: accepted but ignored (logged). */
	case MSR_IA32_MC0_STATUS:
		pr_unimpl(vcpu, "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n",
			  __FUNCTION__, data);
		break;
	case MSR_IA32_MCG_STATUS:
		pr_unimpl(vcpu, "%s: MSR_IA32_MCG_STATUS 0x%llx, nop\n",
			  __FUNCTION__, data);
		break;
	/* Microcode and MTRR writes: silently accepted. */
	case MSR_IA32_UCODE_REV:
	case MSR_IA32_UCODE_WRITE:
	case 0x200 ... 0x2ff: /* MTRRs */
		break;
	case MSR_IA32_APICBASE:
		kvm_set_apic_base(vcpu, data);
		break;
	case MSR_IA32_MISC_ENABLE:
		vcpu->arch.ia32_misc_enable_msr = data;
		break;
	default:
		pr_unimpl(vcpu, "unhandled wrmsr: 0x%x\n", msr);
		return 1;
	}
	return 0;
}
EXPORT_SYMBOL_GPL(kvm_set_msr_common);
507 | |||
508 | |||
/*
 * Reads an msr value (of 'msr_index') into 'pdata'.
 * Returns 0 on success, non-0 otherwise.
 * Assumes vcpu_load() was already called.
 */
int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
{
	/* Delegate to the vendor backend (VMX or SVM). */
	return kvm_x86_ops->get_msr(vcpu, msr_index, pdata);
}
518 | |||
/*
 * Handle a rdmsr for MSRs common to both vendor backends.  Returns 0
 * with *pdata filled in when handled, 1 so the caller can report an
 * unhandled rdmsr.  Most machine-check and MTRR registers simply read
 * back as zero.
 */
int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
{
	u64 data;

	switch (msr) {
	case 0xc0010010: /* SYSCFG */
	case 0xc0010015: /* HWCR */
	case MSR_IA32_PLATFORM_ID:
	case MSR_IA32_P5_MC_ADDR:
	case MSR_IA32_P5_MC_TYPE:
	case MSR_IA32_MC0_CTL:
	case MSR_IA32_MCG_STATUS:
	case MSR_IA32_MCG_CAP:
	case MSR_IA32_MC0_MISC:
	case MSR_IA32_MC0_MISC+4:
	case MSR_IA32_MC0_MISC+8:
	case MSR_IA32_MC0_MISC+12:
	case MSR_IA32_MC0_MISC+16:
	case MSR_IA32_UCODE_REV:
	case MSR_IA32_PERF_STATUS:
	case MSR_IA32_EBL_CR_POWERON:
	/* MTRR registers */
	case 0xfe:
	case 0x200 ... 0x2ff:
		data = 0;
		break;
	case 0xcd: /* fsb frequency */
		data = 3;
		break;
	case MSR_IA32_APICBASE:
		data = kvm_get_apic_base(vcpu);
		break;
	case MSR_IA32_MISC_ENABLE:
		data = vcpu->arch.ia32_misc_enable_msr;
		break;
#ifdef CONFIG_X86_64
	case MSR_EFER:
		data = vcpu->arch.shadow_efer;
		break;
#endif
	default:
		pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
		return 1;
	}
	*pdata = data;
	return 0;
}
EXPORT_SYMBOL_GPL(kvm_get_msr_common);
567 | |||
568 | /* | ||
569 | * Read or write a bunch of msrs. All parameters are kernel addresses. | ||
570 | * | ||
571 | * @return number of msrs set successfully. | ||
572 | */ | ||
573 | static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs, | ||
574 | struct kvm_msr_entry *entries, | ||
575 | int (*do_msr)(struct kvm_vcpu *vcpu, | ||
576 | unsigned index, u64 *data)) | ||
577 | { | ||
578 | int i; | ||
579 | |||
580 | vcpu_load(vcpu); | ||
581 | |||
582 | for (i = 0; i < msrs->nmsrs; ++i) | ||
583 | if (do_msr(vcpu, entries[i].index, &entries[i].data)) | ||
584 | break; | ||
585 | |||
586 | vcpu_put(vcpu); | ||
587 | |||
588 | return i; | ||
589 | } | ||
590 | |||
/*
 * Read or write a bunch of msrs.  Parameters are user addresses.
 *
 * @return number of msrs set successfully.
 */
static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs,
		  int (*do_msr)(struct kvm_vcpu *vcpu,
				unsigned index, u64 *data),
		  int writeback)
{
	struct kvm_msrs msrs;
	struct kvm_msr_entry *entries;
	int r, n;
	unsigned size;

	r = -EFAULT;
	if (copy_from_user(&msrs, user_msrs, sizeof msrs))
		goto out;

	/* Bound the allocation below before trusting the user count. */
	r = -E2BIG;
	if (msrs.nmsrs >= MAX_IO_MSRS)
		goto out;

	r = -ENOMEM;
	size = sizeof(struct kvm_msr_entry) * msrs.nmsrs;
	entries = vmalloc(size);
	if (!entries)
		goto out;

	r = -EFAULT;
	if (copy_from_user(entries, user_msrs->entries, size))
		goto out_free;

	r = n = __msr_io(vcpu, &msrs, entries, do_msr);
	if (r < 0)
		goto out_free;

	/* For reads, copy the (possibly partial) results back to the user. */
	r = -EFAULT;
	if (writeback && copy_to_user(user_msrs->entries, entries, size))
		goto out_free;

	r = n;

out_free:
	vfree(entries);
out:
	return r;
}
639 | |||
/*
 * Make sure that a cpu that is being hot-unplugged does not have any vcpus
 * cached on it.
 */
void decache_vcpus_on_cpu(int cpu)
{
	struct kvm *vm;
	struct kvm_vcpu *vcpu;
	int i;

	spin_lock(&kvm_lock);
	/* Walk every vcpu of every VM registered in the system. */
	list_for_each_entry(vm, &vm_list, vm_list)
		for (i = 0; i < KVM_MAX_VCPUS; ++i) {
			vcpu = vm->vcpus[i];
			if (!vcpu)
				continue;
			/*
			 * If the vcpu is locked, then it is running on some
			 * other cpu and therefore it is not cached on the
			 * cpu in question.
			 *
			 * If it's not locked, check the last cpu it executed
			 * on.
			 */
			if (mutex_trylock(&vcpu->mutex)) {
				if (vcpu->cpu == cpu) {
					kvm_x86_ops->vcpu_decache(vcpu);
					/* Mark as "not loaded anywhere". */
					vcpu->cpu = -1;
				}
				mutex_unlock(&vcpu->mutex);
			}
		}
	spin_unlock(&kvm_lock);
}
674 | |||
675 | int kvm_dev_ioctl_check_extension(long ext) | ||
676 | { | ||
677 | int r; | ||
678 | |||
679 | switch (ext) { | ||
680 | case KVM_CAP_IRQCHIP: | ||
681 | case KVM_CAP_HLT: | ||
682 | case KVM_CAP_MMU_SHADOW_CACHE_CONTROL: | ||
683 | case KVM_CAP_USER_MEMORY: | ||
684 | case KVM_CAP_SET_TSS_ADDR: | ||
685 | case KVM_CAP_EXT_CPUID: | ||
686 | r = 1; | ||
687 | break; | ||
688 | default: | ||
689 | r = 0; | ||
690 | break; | ||
691 | } | ||
692 | return r; | ||
693 | |||
694 | } | ||
695 | |||
696 | long kvm_arch_dev_ioctl(struct file *filp, | ||
697 | unsigned int ioctl, unsigned long arg) | ||
698 | { | ||
699 | void __user *argp = (void __user *)arg; | ||
700 | long r; | ||
701 | |||
702 | switch (ioctl) { | ||
703 | case KVM_GET_MSR_INDEX_LIST: { | ||
704 | struct kvm_msr_list __user *user_msr_list = argp; | ||
705 | struct kvm_msr_list msr_list; | ||
706 | unsigned n; | ||
707 | |||
708 | r = -EFAULT; | ||
709 | if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list)) | ||
710 | goto out; | ||
711 | n = msr_list.nmsrs; | ||
712 | msr_list.nmsrs = num_msrs_to_save + ARRAY_SIZE(emulated_msrs); | ||
713 | if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list)) | ||
714 | goto out; | ||
715 | r = -E2BIG; | ||
716 | if (n < num_msrs_to_save) | ||
717 | goto out; | ||
718 | r = -EFAULT; | ||
719 | if (copy_to_user(user_msr_list->indices, &msrs_to_save, | ||
720 | num_msrs_to_save * sizeof(u32))) | ||
721 | goto out; | ||
722 | if (copy_to_user(user_msr_list->indices | ||
723 | + num_msrs_to_save * sizeof(u32), | ||
724 | &emulated_msrs, | ||
725 | ARRAY_SIZE(emulated_msrs) * sizeof(u32))) | ||
726 | goto out; | ||
727 | r = 0; | ||
728 | break; | ||
729 | } | ||
730 | default: | ||
731 | r = -EINVAL; | ||
732 | } | ||
733 | out: | ||
734 | return r; | ||
735 | } | ||
736 | |||
/* Arch hook: a vcpu is being scheduled in on @cpu; notify the backend. */
void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
	kvm_x86_ops->vcpu_load(vcpu, cpu);
}
741 | |||
/* Arch hook: a vcpu is being scheduled out; also release the guest FPU. */
void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
{
	kvm_x86_ops->vcpu_put(vcpu);
	kvm_put_guest_fpu(vcpu);
}
747 | |||
/* Does the host currently have the NX (no-execute) bit set in EFER? */
static int is_efer_nx(void)
{
	u64 efer;

	rdmsrl(MSR_EFER, efer);
	return efer & EFER_NX;
}
755 | |||
756 | static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu) | ||
757 | { | ||
758 | int i; | ||
759 | struct kvm_cpuid_entry2 *e, *entry; | ||
760 | |||
761 | entry = NULL; | ||
762 | for (i = 0; i < vcpu->arch.cpuid_nent; ++i) { | ||
763 | e = &vcpu->arch.cpuid_entries[i]; | ||
764 | if (e->function == 0x80000001) { | ||
765 | entry = e; | ||
766 | break; | ||
767 | } | ||
768 | } | ||
769 | if (entry && (entry->edx & (1 << 20)) && !is_efer_nx()) { | ||
770 | entry->edx &= ~(1 << 20); | ||
771 | printk(KERN_INFO "kvm: guest NX capability removed\n"); | ||
772 | } | ||
773 | } | ||
774 | |||
/* when an old userspace process fills a new kernel module */
/*
 * KVM_SET_CPUID: accept a cpuid table in the legacy kvm_cpuid_entry
 * format, convert each entry to kvm_cpuid_entry2 (index, flags and
 * padding zeroed), install it on the vcpu, and strip NX if the host
 * lacks it.
 */
static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
				    struct kvm_cpuid *cpuid,
				    struct kvm_cpuid_entry __user *entries)
{
	int r, i;
	struct kvm_cpuid_entry *cpuid_entries;

	r = -E2BIG;
	if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
		goto out;
	r = -ENOMEM;
	/* Temporary kernel copy of the legacy-format entries. */
	cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry) * cpuid->nent);
	if (!cpuid_entries)
		goto out;
	r = -EFAULT;
	if (copy_from_user(cpuid_entries, entries,
			   cpuid->nent * sizeof(struct kvm_cpuid_entry)))
		goto out_free;
	/* Widen each legacy entry into the entry2 layout. */
	for (i = 0; i < cpuid->nent; i++) {
		vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function;
		vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax;
		vcpu->arch.cpuid_entries[i].ebx = cpuid_entries[i].ebx;
		vcpu->arch.cpuid_entries[i].ecx = cpuid_entries[i].ecx;
		vcpu->arch.cpuid_entries[i].edx = cpuid_entries[i].edx;
		vcpu->arch.cpuid_entries[i].index = 0;
		vcpu->arch.cpuid_entries[i].flags = 0;
		vcpu->arch.cpuid_entries[i].padding[0] = 0;
		vcpu->arch.cpuid_entries[i].padding[1] = 0;
		vcpu->arch.cpuid_entries[i].padding[2] = 0;
	}
	vcpu->arch.cpuid_nent = cpuid->nent;
	/* Remove NX from the table if the host cannot provide it. */
	cpuid_fix_nx_cap(vcpu);
	r = 0;

out_free:
	vfree(cpuid_entries);
out:
	return r;
}
815 | |||
/*
 * KVM_SET_CPUID2: copy a cpuid table already in kvm_cpuid_entry2
 * format straight onto the vcpu.  Note that, unlike the legacy
 * KVM_SET_CPUID path, no cpuid_fix_nx_cap() adjustment is applied.
 */
static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
				     struct kvm_cpuid2 *cpuid,
				     struct kvm_cpuid_entry2 __user *entries)
{
	int r;

	r = -E2BIG;
	if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
		goto out;
	r = -EFAULT;
	if (copy_from_user(&vcpu->arch.cpuid_entries, entries,
			   cpuid->nent * sizeof(struct kvm_cpuid_entry2)))
		goto out;
	vcpu->arch.cpuid_nent = cpuid->nent;
	return 0;

out:
	return r;
}
835 | |||
/*
 * KVM_GET_CPUID2: copy the vcpu's cpuid table back to userspace.  On
 * failure (-E2BIG buffer too small, or -EFAULT) cpuid->nent is updated
 * to the actual entry count so the caller can size a retry.
 */
static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
				     struct kvm_cpuid2 *cpuid,
				     struct kvm_cpuid_entry2 __user *entries)
{
	int r;

	r = -E2BIG;
	if (cpuid->nent < vcpu->arch.cpuid_nent)
		goto out;
	r = -EFAULT;
	if (copy_to_user(entries, &vcpu->arch.cpuid_entries,
			 vcpu->arch.cpuid_nent * sizeof(struct kvm_cpuid_entry2)))
		goto out;
	return 0;

out:
	/* Tell userspace how many entries there really are. */
	cpuid->nent = vcpu->arch.cpuid_nent;
	return r;
}
855 | |||
856 | static inline u32 bit(int bitno) | ||
857 | { | ||
858 | return 1 << (bitno & 31); | ||
859 | } | ||
860 | |||
/*
 * Fill *entry with the host's raw cpuid output for (function, index).
 * Flags start cleared; the caller masks features and sets flags.
 */
static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function,
			   u32 index)
{
	entry->function = function;
	entry->index = index;
	cpuid_count(entry->function, entry->index,
		    &entry->eax, &entry->ebx, &entry->ecx, &entry->edx);
	entry->flags = 0;
}
870 | |||
871 | static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, | ||
872 | u32 index, int *nent, int maxnent) | ||
873 | { | ||
874 | const u32 kvm_supported_word0_x86_features = bit(X86_FEATURE_FPU) | | ||
875 | bit(X86_FEATURE_VME) | bit(X86_FEATURE_DE) | | ||
876 | bit(X86_FEATURE_PSE) | bit(X86_FEATURE_TSC) | | ||
877 | bit(X86_FEATURE_MSR) | bit(X86_FEATURE_PAE) | | ||
878 | bit(X86_FEATURE_CX8) | bit(X86_FEATURE_APIC) | | ||
879 | bit(X86_FEATURE_SEP) | bit(X86_FEATURE_PGE) | | ||
880 | bit(X86_FEATURE_CMOV) | bit(X86_FEATURE_PSE36) | | ||
881 | bit(X86_FEATURE_CLFLSH) | bit(X86_FEATURE_MMX) | | ||
882 | bit(X86_FEATURE_FXSR) | bit(X86_FEATURE_XMM) | | ||
883 | bit(X86_FEATURE_XMM2) | bit(X86_FEATURE_SELFSNOOP); | ||
884 | const u32 kvm_supported_word1_x86_features = bit(X86_FEATURE_FPU) | | ||
885 | bit(X86_FEATURE_VME) | bit(X86_FEATURE_DE) | | ||
886 | bit(X86_FEATURE_PSE) | bit(X86_FEATURE_TSC) | | ||
887 | bit(X86_FEATURE_MSR) | bit(X86_FEATURE_PAE) | | ||
888 | bit(X86_FEATURE_CX8) | bit(X86_FEATURE_APIC) | | ||
889 | bit(X86_FEATURE_PGE) | | ||
890 | bit(X86_FEATURE_CMOV) | bit(X86_FEATURE_PSE36) | | ||
891 | bit(X86_FEATURE_MMX) | bit(X86_FEATURE_FXSR) | | ||
892 | bit(X86_FEATURE_SYSCALL) | | ||
893 | (bit(X86_FEATURE_NX) && is_efer_nx()) | | ||
894 | #ifdef CONFIG_X86_64 | ||
895 | bit(X86_FEATURE_LM) | | ||
896 | #endif | ||
897 | bit(X86_FEATURE_MMXEXT) | | ||
898 | bit(X86_FEATURE_3DNOWEXT) | | ||
899 | bit(X86_FEATURE_3DNOW); | ||
900 | const u32 kvm_supported_word3_x86_features = | ||
901 | bit(X86_FEATURE_XMM3) | bit(X86_FEATURE_CX16); | ||
902 | const u32 kvm_supported_word6_x86_features = | ||
903 | bit(X86_FEATURE_LAHF_LM) | bit(X86_FEATURE_CMP_LEGACY); | ||
904 | |||
905 | /* all func 2 cpuid_count() should be called on the same cpu */ | ||
906 | get_cpu(); | ||
907 | do_cpuid_1_ent(entry, function, index); | ||
908 | ++*nent; | ||
909 | |||
910 | switch (function) { | ||
911 | case 0: | ||
912 | entry->eax = min(entry->eax, (u32)0xb); | ||
913 | break; | ||
914 | case 1: | ||
915 | entry->edx &= kvm_supported_word0_x86_features; | ||
916 | entry->ecx &= kvm_supported_word3_x86_features; | ||
917 | break; | ||
918 | /* function 2 entries are STATEFUL. That is, repeated cpuid commands | ||
919 | * may return different values. This forces us to get_cpu() before | ||
920 | * issuing the first command, and also to emulate this annoying behavior | ||
921 | * in kvm_emulate_cpuid() using KVM_CPUID_FLAG_STATE_READ_NEXT */ | ||
922 | case 2: { | ||
923 | int t, times = entry->eax & 0xff; | ||
924 | |||
925 | entry->flags |= KVM_CPUID_FLAG_STATEFUL_FUNC; | ||
926 | for (t = 1; t < times && *nent < maxnent; ++t) { | ||
927 | do_cpuid_1_ent(&entry[t], function, 0); | ||
928 | entry[t].flags |= KVM_CPUID_FLAG_STATEFUL_FUNC; | ||
929 | ++*nent; | ||
930 | } | ||
931 | break; | ||
932 | } | ||
933 | /* function 4 and 0xb have additional index. */ | ||
934 | case 4: { | ||
935 | int index, cache_type; | ||
936 | |||
937 | entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; | ||
938 | /* read more entries until cache_type is zero */ | ||
939 | for (index = 1; *nent < maxnent; ++index) { | ||
940 | cache_type = entry[index - 1].eax & 0x1f; | ||
941 | if (!cache_type) | ||
942 | break; | ||
943 | do_cpuid_1_ent(&entry[index], function, index); | ||
944 | entry[index].flags |= | ||
945 | KVM_CPUID_FLAG_SIGNIFCANT_INDEX; | ||
946 | ++*nent; | ||
947 | } | ||
948 | break; | ||
949 | } | ||
950 | case 0xb: { | ||
951 | int index, level_type; | ||
952 | |||
953 | entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; | ||
954 | /* read more entries until level_type is zero */ | ||
955 | for (index = 1; *nent < maxnent; ++index) { | ||
956 | level_type = entry[index - 1].ecx & 0xff; | ||
957 | if (!level_type) | ||
958 | break; | ||
959 | do_cpuid_1_ent(&entry[index], function, index); | ||
960 | entry[index].flags |= | ||
961 | KVM_CPUID_FLAG_SIGNIFCANT_INDEX; | ||
962 | ++*nent; | ||
963 | } | ||
964 | break; | ||
965 | } | ||
966 | case 0x80000000: | ||
967 | entry->eax = min(entry->eax, 0x8000001a); | ||
968 | break; | ||
969 | case 0x80000001: | ||
970 | entry->edx &= kvm_supported_word1_x86_features; | ||
971 | entry->ecx &= kvm_supported_word6_x86_features; | ||
972 | break; | ||
973 | } | ||
974 | put_cpu(); | ||
975 | } | ||
976 | |||
/*
 * KVM_GET_SUPPORTED_CPUID: build the table of CPUID leaves that KVM can
 * expose to a guest and copy it out to userspace.
 *
 * Returns 0 on success, -E2BIG if cpuid->nent is too small to hold the
 * standard leaves, -ENOMEM if the scratch buffer cannot be allocated,
 * -EFAULT if the copy to userspace fails.  On success cpuid->nent is
 * updated to the number of entries actually written.
 */
static int kvm_vm_ioctl_get_supported_cpuid(struct kvm *kvm,
				    struct kvm_cpuid2 *cpuid,
				    struct kvm_cpuid_entry2 __user *entries)
{
	struct kvm_cpuid_entry2 *cpuid_entries;
	int limit, nent = 0, r = -E2BIG;
	u32 func;

	if (cpuid->nent < 1)
		goto out;
	r = -ENOMEM;
	/* Scratch buffer sized by the caller's limit; filled by do_cpuid_ent. */
	cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry2) * cpuid->nent);
	if (!cpuid_entries)
		goto out;

	/* Standard leaves: leaf 0's EAX reports the highest supported leaf. */
	do_cpuid_ent(&cpuid_entries[0], 0, 0, &nent, cpuid->nent);
	limit = cpuid_entries[0].eax;
	for (func = 1; func <= limit && nent < cpuid->nent; ++func)
		do_cpuid_ent(&cpuid_entries[nent], func, 0,
				&nent, cpuid->nent);
	/* Need at least one free slot for the extended-leaf header below. */
	r = -E2BIG;
	if (nent >= cpuid->nent)
		goto out_free;

	/* Extended leaves: 0x80000000's EAX reports the highest extended leaf. */
	do_cpuid_ent(&cpuid_entries[nent], 0x80000000, 0, &nent, cpuid->nent);
	limit = cpuid_entries[nent - 1].eax;
	for (func = 0x80000001; func <= limit && nent < cpuid->nent; ++func)
		do_cpuid_ent(&cpuid_entries[nent], func, 0,
			       &nent, cpuid->nent);
	r = -EFAULT;
	if (copy_to_user(entries, cpuid_entries,
			nent * sizeof(struct kvm_cpuid_entry2)))
		goto out_free;
	cpuid->nent = nent;
	r = 0;

out_free:
	vfree(cpuid_entries);
out:
	return r;
}
1018 | |||
1019 | static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, | ||
1020 | struct kvm_lapic_state *s) | ||
1021 | { | ||
1022 | vcpu_load(vcpu); | ||
1023 | memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s); | ||
1024 | vcpu_put(vcpu); | ||
1025 | |||
1026 | return 0; | ||
1027 | } | ||
1028 | |||
1029 | static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu, | ||
1030 | struct kvm_lapic_state *s) | ||
1031 | { | ||
1032 | vcpu_load(vcpu); | ||
1033 | memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s); | ||
1034 | kvm_apic_post_state_restore(vcpu); | ||
1035 | vcpu_put(vcpu); | ||
1036 | |||
1037 | return 0; | ||
1038 | } | ||
1039 | |||
1040 | static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, | ||
1041 | struct kvm_interrupt *irq) | ||
1042 | { | ||
1043 | if (irq->irq < 0 || irq->irq >= 256) | ||
1044 | return -EINVAL; | ||
1045 | if (irqchip_in_kernel(vcpu->kvm)) | ||
1046 | return -ENXIO; | ||
1047 | vcpu_load(vcpu); | ||
1048 | |||
1049 | set_bit(irq->irq, vcpu->arch.irq_pending); | ||
1050 | set_bit(irq->irq / BITS_PER_LONG, &vcpu->arch.irq_summary); | ||
1051 | |||
1052 | vcpu_put(vcpu); | ||
1053 | |||
1054 | return 0; | ||
1055 | } | ||
1056 | |||
1057 | long kvm_arch_vcpu_ioctl(struct file *filp, | ||
1058 | unsigned int ioctl, unsigned long arg) | ||
1059 | { | ||
1060 | struct kvm_vcpu *vcpu = filp->private_data; | ||
1061 | void __user *argp = (void __user *)arg; | ||
1062 | int r; | ||
1063 | |||
1064 | switch (ioctl) { | ||
1065 | case KVM_GET_LAPIC: { | ||
1066 | struct kvm_lapic_state lapic; | ||
1067 | |||
1068 | memset(&lapic, 0, sizeof lapic); | ||
1069 | r = kvm_vcpu_ioctl_get_lapic(vcpu, &lapic); | ||
1070 | if (r) | ||
1071 | goto out; | ||
1072 | r = -EFAULT; | ||
1073 | if (copy_to_user(argp, &lapic, sizeof lapic)) | ||
1074 | goto out; | ||
1075 | r = 0; | ||
1076 | break; | ||
1077 | } | ||
1078 | case KVM_SET_LAPIC: { | ||
1079 | struct kvm_lapic_state lapic; | ||
1080 | |||
1081 | r = -EFAULT; | ||
1082 | if (copy_from_user(&lapic, argp, sizeof lapic)) | ||
1083 | goto out; | ||
1084 | r = kvm_vcpu_ioctl_set_lapic(vcpu, &lapic);; | ||
1085 | if (r) | ||
1086 | goto out; | ||
1087 | r = 0; | ||
1088 | break; | ||
1089 | } | ||
1090 | case KVM_INTERRUPT: { | ||
1091 | struct kvm_interrupt irq; | ||
1092 | |||
1093 | r = -EFAULT; | ||
1094 | if (copy_from_user(&irq, argp, sizeof irq)) | ||
1095 | goto out; | ||
1096 | r = kvm_vcpu_ioctl_interrupt(vcpu, &irq); | ||
1097 | if (r) | ||
1098 | goto out; | ||
1099 | r = 0; | ||
1100 | break; | ||
1101 | } | ||
1102 | case KVM_SET_CPUID: { | ||
1103 | struct kvm_cpuid __user *cpuid_arg = argp; | ||
1104 | struct kvm_cpuid cpuid; | ||
1105 | |||
1106 | r = -EFAULT; | ||
1107 | if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) | ||
1108 | goto out; | ||
1109 | r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries); | ||
1110 | if (r) | ||
1111 | goto out; | ||
1112 | break; | ||
1113 | } | ||
1114 | case KVM_SET_CPUID2: { | ||
1115 | struct kvm_cpuid2 __user *cpuid_arg = argp; | ||
1116 | struct kvm_cpuid2 cpuid; | ||
1117 | |||
1118 | r = -EFAULT; | ||
1119 | if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) | ||
1120 | goto out; | ||
1121 | r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid, | ||
1122 | cpuid_arg->entries); | ||
1123 | if (r) | ||
1124 | goto out; | ||
1125 | break; | ||
1126 | } | ||
1127 | case KVM_GET_CPUID2: { | ||
1128 | struct kvm_cpuid2 __user *cpuid_arg = argp; | ||
1129 | struct kvm_cpuid2 cpuid; | ||
1130 | |||
1131 | r = -EFAULT; | ||
1132 | if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) | ||
1133 | goto out; | ||
1134 | r = kvm_vcpu_ioctl_get_cpuid2(vcpu, &cpuid, | ||
1135 | cpuid_arg->entries); | ||
1136 | if (r) | ||
1137 | goto out; | ||
1138 | r = -EFAULT; | ||
1139 | if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid)) | ||
1140 | goto out; | ||
1141 | r = 0; | ||
1142 | break; | ||
1143 | } | ||
1144 | case KVM_GET_MSRS: | ||
1145 | r = msr_io(vcpu, argp, kvm_get_msr, 1); | ||
1146 | break; | ||
1147 | case KVM_SET_MSRS: | ||
1148 | r = msr_io(vcpu, argp, do_set_msr, 0); | ||
1149 | break; | ||
1150 | default: | ||
1151 | r = -EINVAL; | ||
1152 | } | ||
1153 | out: | ||
1154 | return r; | ||
1155 | } | ||
1156 | |||
1157 | static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr) | ||
1158 | { | ||
1159 | int ret; | ||
1160 | |||
1161 | if (addr > (unsigned int)(-3 * PAGE_SIZE)) | ||
1162 | return -1; | ||
1163 | ret = kvm_x86_ops->set_tss_addr(kvm, addr); | ||
1164 | return ret; | ||
1165 | } | ||
1166 | |||
1167 | static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm, | ||
1168 | u32 kvm_nr_mmu_pages) | ||
1169 | { | ||
1170 | if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES) | ||
1171 | return -EINVAL; | ||
1172 | |||
1173 | mutex_lock(&kvm->lock); | ||
1174 | |||
1175 | kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages); | ||
1176 | kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages; | ||
1177 | |||
1178 | mutex_unlock(&kvm->lock); | ||
1179 | return 0; | ||
1180 | } | ||
1181 | |||
1182 | static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm) | ||
1183 | { | ||
1184 | return kvm->arch.n_alloc_mmu_pages; | ||
1185 | } | ||
1186 | |||
1187 | gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn) | ||
1188 | { | ||
1189 | int i; | ||
1190 | struct kvm_mem_alias *alias; | ||
1191 | |||
1192 | for (i = 0; i < kvm->arch.naliases; ++i) { | ||
1193 | alias = &kvm->arch.aliases[i]; | ||
1194 | if (gfn >= alias->base_gfn | ||
1195 | && gfn < alias->base_gfn + alias->npages) | ||
1196 | return alias->target_gfn + gfn - alias->base_gfn; | ||
1197 | } | ||
1198 | return gfn; | ||
1199 | } | ||
1200 | |||
1201 | /* | ||
1202 | * Set a new alias region. Aliases map a portion of physical memory into | ||
1203 | * another portion. This is useful for memory windows, for example the PC | ||
1204 | * VGA region. | ||
1205 | */ | ||
1206 | static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm, | ||
1207 | struct kvm_memory_alias *alias) | ||
1208 | { | ||
1209 | int r, n; | ||
1210 | struct kvm_mem_alias *p; | ||
1211 | |||
1212 | r = -EINVAL; | ||
1213 | /* General sanity checks */ | ||
1214 | if (alias->memory_size & (PAGE_SIZE - 1)) | ||
1215 | goto out; | ||
1216 | if (alias->guest_phys_addr & (PAGE_SIZE - 1)) | ||
1217 | goto out; | ||
1218 | if (alias->slot >= KVM_ALIAS_SLOTS) | ||
1219 | goto out; | ||
1220 | if (alias->guest_phys_addr + alias->memory_size | ||
1221 | < alias->guest_phys_addr) | ||
1222 | goto out; | ||
1223 | if (alias->target_phys_addr + alias->memory_size | ||
1224 | < alias->target_phys_addr) | ||
1225 | goto out; | ||
1226 | |||
1227 | mutex_lock(&kvm->lock); | ||
1228 | |||
1229 | p = &kvm->arch.aliases[alias->slot]; | ||
1230 | p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT; | ||
1231 | p->npages = alias->memory_size >> PAGE_SHIFT; | ||
1232 | p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT; | ||
1233 | |||
1234 | for (n = KVM_ALIAS_SLOTS; n > 0; --n) | ||
1235 | if (kvm->arch.aliases[n - 1].npages) | ||
1236 | break; | ||
1237 | kvm->arch.naliases = n; | ||
1238 | |||
1239 | kvm_mmu_zap_all(kvm); | ||
1240 | |||
1241 | mutex_unlock(&kvm->lock); | ||
1242 | |||
1243 | return 0; | ||
1244 | |||
1245 | out: | ||
1246 | return r; | ||
1247 | } | ||
1248 | |||
1249 | static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) | ||
1250 | { | ||
1251 | int r; | ||
1252 | |||
1253 | r = 0; | ||
1254 | switch (chip->chip_id) { | ||
1255 | case KVM_IRQCHIP_PIC_MASTER: | ||
1256 | memcpy(&chip->chip.pic, | ||
1257 | &pic_irqchip(kvm)->pics[0], | ||
1258 | sizeof(struct kvm_pic_state)); | ||
1259 | break; | ||
1260 | case KVM_IRQCHIP_PIC_SLAVE: | ||
1261 | memcpy(&chip->chip.pic, | ||
1262 | &pic_irqchip(kvm)->pics[1], | ||
1263 | sizeof(struct kvm_pic_state)); | ||
1264 | break; | ||
1265 | case KVM_IRQCHIP_IOAPIC: | ||
1266 | memcpy(&chip->chip.ioapic, | ||
1267 | ioapic_irqchip(kvm), | ||
1268 | sizeof(struct kvm_ioapic_state)); | ||
1269 | break; | ||
1270 | default: | ||
1271 | r = -EINVAL; | ||
1272 | break; | ||
1273 | } | ||
1274 | return r; | ||
1275 | } | ||
1276 | |||
1277 | static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) | ||
1278 | { | ||
1279 | int r; | ||
1280 | |||
1281 | r = 0; | ||
1282 | switch (chip->chip_id) { | ||
1283 | case KVM_IRQCHIP_PIC_MASTER: | ||
1284 | memcpy(&pic_irqchip(kvm)->pics[0], | ||
1285 | &chip->chip.pic, | ||
1286 | sizeof(struct kvm_pic_state)); | ||
1287 | break; | ||
1288 | case KVM_IRQCHIP_PIC_SLAVE: | ||
1289 | memcpy(&pic_irqchip(kvm)->pics[1], | ||
1290 | &chip->chip.pic, | ||
1291 | sizeof(struct kvm_pic_state)); | ||
1292 | break; | ||
1293 | case KVM_IRQCHIP_IOAPIC: | ||
1294 | memcpy(ioapic_irqchip(kvm), | ||
1295 | &chip->chip.ioapic, | ||
1296 | sizeof(struct kvm_ioapic_state)); | ||
1297 | break; | ||
1298 | default: | ||
1299 | r = -EINVAL; | ||
1300 | break; | ||
1301 | } | ||
1302 | kvm_pic_update_irq(pic_irqchip(kvm)); | ||
1303 | return r; | ||
1304 | } | ||
1305 | |||
1306 | /* | ||
1307 | * Get (and clear) the dirty memory log for a memory slot. | ||
1308 | */ | ||
1309 | int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, | ||
1310 | struct kvm_dirty_log *log) | ||
1311 | { | ||
1312 | int r; | ||
1313 | int n; | ||
1314 | struct kvm_memory_slot *memslot; | ||
1315 | int is_dirty = 0; | ||
1316 | |||
1317 | mutex_lock(&kvm->lock); | ||
1318 | |||
1319 | r = kvm_get_dirty_log(kvm, log, &is_dirty); | ||
1320 | if (r) | ||
1321 | goto out; | ||
1322 | |||
1323 | /* If nothing is dirty, don't bother messing with page tables. */ | ||
1324 | if (is_dirty) { | ||
1325 | kvm_mmu_slot_remove_write_access(kvm, log->slot); | ||
1326 | kvm_flush_remote_tlbs(kvm); | ||
1327 | memslot = &kvm->memslots[log->slot]; | ||
1328 | n = ALIGN(memslot->npages, BITS_PER_LONG) / 8; | ||
1329 | memset(memslot->dirty_bitmap, 0, n); | ||
1330 | } | ||
1331 | r = 0; | ||
1332 | out: | ||
1333 | mutex_unlock(&kvm->lock); | ||
1334 | return r; | ||
1335 | } | ||
1336 | |||
/*
 * Arch-specific VM ioctl dispatcher: TSS placement, legacy memory-region
 * setup, shadow-MMU page budget, memory aliases, in-kernel irqchip
 * creation and state access, interrupt-line toggling, and the supported
 * CPUID table.  Returns 0 or a result value on success, negative errno
 * on failure; unknown ioctls fall through with -EINVAL.
 */
long kvm_arch_vm_ioctl(struct file *filp,
		       unsigned int ioctl, unsigned long arg)
{
	struct kvm *kvm = filp->private_data;
	void __user *argp = (void __user *)arg;
	int r = -EINVAL;

	switch (ioctl) {
	case KVM_SET_TSS_ADDR:
		r = kvm_vm_ioctl_set_tss_addr(kvm, arg);
		if (r < 0)
			goto out;
		break;
	case KVM_SET_MEMORY_REGION: {
		struct kvm_memory_region kvm_mem;
		struct kvm_userspace_memory_region kvm_userspace_mem;

		r = -EFAULT;
		if (copy_from_user(&kvm_mem, argp, sizeof kvm_mem))
			goto out;
		/* Translate the legacy region layout to the userspace one. */
		kvm_userspace_mem.slot = kvm_mem.slot;
		kvm_userspace_mem.flags = kvm_mem.flags;
		kvm_userspace_mem.guest_phys_addr = kvm_mem.guest_phys_addr;
		kvm_userspace_mem.memory_size = kvm_mem.memory_size;
		r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 0);
		if (r)
			goto out;
		break;
	}
	case KVM_SET_NR_MMU_PAGES:
		r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg);
		if (r)
			goto out;
		break;
	case KVM_GET_NR_MMU_PAGES:
		r = kvm_vm_ioctl_get_nr_mmu_pages(kvm);
		break;
	case KVM_SET_MEMORY_ALIAS: {
		struct kvm_memory_alias alias;

		r = -EFAULT;
		if (copy_from_user(&alias, argp, sizeof alias))
			goto out;
		r = kvm_vm_ioctl_set_memory_alias(kvm, &alias);
		if (r)
			goto out;
		break;
	}
	case KVM_CREATE_IRQCHIP:
		r = -ENOMEM;
		kvm->arch.vpic = kvm_create_pic(kvm);
		if (kvm->arch.vpic) {
			r = kvm_ioapic_init(kvm);
			if (r) {
				/* Roll back the PIC if IOAPIC setup failed. */
				kfree(kvm->arch.vpic);
				kvm->arch.vpic = NULL;
				goto out;
			}
		} else
			goto out;
		break;
	case KVM_IRQ_LINE: {
		struct kvm_irq_level irq_event;

		r = -EFAULT;
		if (copy_from_user(&irq_event, argp, sizeof irq_event))
			goto out;
		/* Only meaningful with an in-kernel irqchip; -EINVAL otherwise. */
		if (irqchip_in_kernel(kvm)) {
			mutex_lock(&kvm->lock);
			/* ISA IRQs (0-15) go to the PIC as well as the IOAPIC. */
			if (irq_event.irq < 16)
				kvm_pic_set_irq(pic_irqchip(kvm),
					irq_event.irq,
					irq_event.level);
			kvm_ioapic_set_irq(kvm->arch.vioapic,
					irq_event.irq,
					irq_event.level);
			mutex_unlock(&kvm->lock);
			r = 0;
		}
		break;
	}
	case KVM_GET_IRQCHIP: {
		/* 0: PIC master, 1: PIC slave, 2: IOAPIC */
		struct kvm_irqchip chip;

		r = -EFAULT;
		if (copy_from_user(&chip, argp, sizeof chip))
			goto out;
		r = -ENXIO;
		if (!irqchip_in_kernel(kvm))
			goto out;
		r = kvm_vm_ioctl_get_irqchip(kvm, &chip);
		if (r)
			goto out;
		r = -EFAULT;
		if (copy_to_user(argp, &chip, sizeof chip))
			goto out;
		r = 0;
		break;
	}
	case KVM_SET_IRQCHIP: {
		/* 0: PIC master, 1: PIC slave, 2: IOAPIC */
		struct kvm_irqchip chip;

		r = -EFAULT;
		if (copy_from_user(&chip, argp, sizeof chip))
			goto out;
		r = -ENXIO;
		if (!irqchip_in_kernel(kvm))
			goto out;
		r = kvm_vm_ioctl_set_irqchip(kvm, &chip);
		if (r)
			goto out;
		r = 0;
		break;
	}
	case KVM_GET_SUPPORTED_CPUID: {
		struct kvm_cpuid2 __user *cpuid_arg = argp;
		struct kvm_cpuid2 cpuid;

		r = -EFAULT;
		if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
			goto out;
		r = kvm_vm_ioctl_get_supported_cpuid(kvm, &cpuid,
			cpuid_arg->entries);
		if (r)
			goto out;

		/* Write back the header so userspace sees the entry count. */
		r = -EFAULT;
		if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid))
			goto out;
		r = 0;
		break;
	}
	default:
		;
	}
out:
	return r;
}
1477 | |||
1478 | static void kvm_init_msr_list(void) | ||
1479 | { | ||
1480 | u32 dummy[2]; | ||
1481 | unsigned i, j; | ||
1482 | |||
1483 | for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) { | ||
1484 | if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0) | ||
1485 | continue; | ||
1486 | if (j < i) | ||
1487 | msrs_to_save[j] = msrs_to_save[i]; | ||
1488 | j++; | ||
1489 | } | ||
1490 | num_msrs_to_save = j; | ||
1491 | } | ||
1492 | |||
1493 | /* | ||
1494 | * Only apic need an MMIO device hook, so shortcut now.. | ||
1495 | */ | ||
1496 | static struct kvm_io_device *vcpu_find_pervcpu_dev(struct kvm_vcpu *vcpu, | ||
1497 | gpa_t addr) | ||
1498 | { | ||
1499 | struct kvm_io_device *dev; | ||
1500 | |||
1501 | if (vcpu->arch.apic) { | ||
1502 | dev = &vcpu->arch.apic->dev; | ||
1503 | if (dev->in_range(dev, addr)) | ||
1504 | return dev; | ||
1505 | } | ||
1506 | return NULL; | ||
1507 | } | ||
1508 | |||
1509 | |||
1510 | static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu, | ||
1511 | gpa_t addr) | ||
1512 | { | ||
1513 | struct kvm_io_device *dev; | ||
1514 | |||
1515 | dev = vcpu_find_pervcpu_dev(vcpu, addr); | ||
1516 | if (dev == NULL) | ||
1517 | dev = kvm_io_bus_find_dev(&vcpu->kvm->mmio_bus, addr); | ||
1518 | return dev; | ||
1519 | } | ||
1520 | |||
1521 | int emulator_read_std(unsigned long addr, | ||
1522 | void *val, | ||
1523 | unsigned int bytes, | ||
1524 | struct kvm_vcpu *vcpu) | ||
1525 | { | ||
1526 | void *data = val; | ||
1527 | |||
1528 | while (bytes) { | ||
1529 | gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); | ||
1530 | unsigned offset = addr & (PAGE_SIZE-1); | ||
1531 | unsigned tocopy = min(bytes, (unsigned)PAGE_SIZE - offset); | ||
1532 | int ret; | ||
1533 | |||
1534 | if (gpa == UNMAPPED_GVA) | ||
1535 | return X86EMUL_PROPAGATE_FAULT; | ||
1536 | ret = kvm_read_guest(vcpu->kvm, gpa, data, tocopy); | ||
1537 | if (ret < 0) | ||
1538 | return X86EMUL_UNHANDLEABLE; | ||
1539 | |||
1540 | bytes -= tocopy; | ||
1541 | data += tocopy; | ||
1542 | addr += tocopy; | ||
1543 | } | ||
1544 | |||
1545 | return X86EMUL_CONTINUE; | ||
1546 | } | ||
1547 | EXPORT_SYMBOL_GPL(emulator_read_std); | ||
1548 | |||
/*
 * Emulator read hook that also handles MMIO.  Ordinary memory is read
 * via emulator_read_std(); addresses claimed by an in-kernel device are
 * serviced directly; anything else is handed to userspace by filling in
 * the vcpu's mmio_* request fields and returning X86EMUL_UNHANDLEABLE.
 */
static int emulator_read_emulated(unsigned long addr,
				  void *val,
				  unsigned int bytes,
				  struct kvm_vcpu *vcpu)
{
	struct kvm_io_device *mmio_dev;
	gpa_t gpa;

	/* A previous userspace MMIO read just completed: consume its data. */
	if (vcpu->mmio_read_completed) {
		memcpy(val, vcpu->mmio_data, bytes);
		vcpu->mmio_read_completed = 0;
		return X86EMUL_CONTINUE;
	}

	gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);

	/* For APIC access vmexit */
	if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
		goto mmio;

	/* Try a plain guest-memory read first; fault if unmapped. */
	if (emulator_read_std(addr, val, bytes, vcpu)
			== X86EMUL_CONTINUE)
		return X86EMUL_CONTINUE;
	if (gpa == UNMAPPED_GVA)
		return X86EMUL_PROPAGATE_FAULT;

mmio:
	/*
	 * Is this MMIO handled locally?
	 */
	mmio_dev = vcpu_find_mmio_dev(vcpu, gpa);
	if (mmio_dev) {
		kvm_iodevice_read(mmio_dev, gpa, bytes, val);
		return X86EMUL_CONTINUE;
	}

	/* No in-kernel handler: stage an MMIO exit for userspace. */
	vcpu->mmio_needed = 1;
	vcpu->mmio_phys_addr = gpa;
	vcpu->mmio_size = bytes;
	vcpu->mmio_is_write = 0;

	return X86EMUL_UNHANDLEABLE;
}
1592 | |||
1593 | static int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, | ||
1594 | const void *val, int bytes) | ||
1595 | { | ||
1596 | int ret; | ||
1597 | |||
1598 | ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes); | ||
1599 | if (ret < 0) | ||
1600 | return 0; | ||
1601 | kvm_mmu_pte_write(vcpu, gpa, val, bytes); | ||
1602 | return 1; | ||
1603 | } | ||
1604 | |||
/*
 * Write at most one page's worth of emulated data.  Unmapped addresses
 * inject a page fault into the guest; device addresses go to an
 * in-kernel handler when one exists, otherwise the write is staged as
 * an MMIO exit to userspace (and reported as already complete, since
 * writes need no reply data).
 */
static int emulator_write_emulated_onepage(unsigned long addr,
					   const void *val,
					   unsigned int bytes,
					   struct kvm_vcpu *vcpu)
{
	struct kvm_io_device *mmio_dev;
	gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);

	if (gpa == UNMAPPED_GVA) {
		/* Error code 2 — write access fault.  */
		kvm_inject_page_fault(vcpu, addr, 2);
		return X86EMUL_PROPAGATE_FAULT;
	}

	/* For APIC access vmexit */
	if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
		goto mmio;

	if (emulator_write_phys(vcpu, gpa, val, bytes))
		return X86EMUL_CONTINUE;

mmio:
	/*
	 * Is this MMIO handled locally?
	 */
	mmio_dev = vcpu_find_mmio_dev(vcpu, gpa);
	if (mmio_dev) {
		kvm_iodevice_write(mmio_dev, gpa, bytes, val);
		return X86EMUL_CONTINUE;
	}

	/* Stage an MMIO-write exit; data travels in vcpu->mmio_data. */
	vcpu->mmio_needed = 1;
	vcpu->mmio_phys_addr = gpa;
	vcpu->mmio_size = bytes;
	vcpu->mmio_is_write = 1;
	memcpy(vcpu->mmio_data, val, bytes);

	return X86EMUL_CONTINUE;
}
1643 | |||
/*
 * Emulator write hook.  Splits a write that straddles a page boundary
 * into two single-page writes, since each page translates separately.
 */
int emulator_write_emulated(unsigned long addr,
				   const void *val,
				   unsigned int bytes,
				   struct kvm_vcpu *vcpu)
{
	/* Crossing a page boundary? */
	if (((addr + bytes - 1) ^ addr) & PAGE_MASK) {
		int rc, now;

		/* Bytes remaining up to the end of the current page. */
		now = -addr & ~PAGE_MASK;
		rc = emulator_write_emulated_onepage(addr, val, now, vcpu);
		if (rc != X86EMUL_CONTINUE)
			return rc;
		addr += now;
		val += now;
		bytes -= now;
	}
	return emulator_write_emulated_onepage(addr, val, bytes, vcpu);
}
EXPORT_SYMBOL_GPL(emulator_write_emulated);
1664 | |||
/*
 * Emulator cmpxchg hook.  The compare step is NOT performed: the
 * exchange is emulated as a plain write (warned once at runtime).
 * On 32-bit hosts an 8-byte cmpxchg8b is additionally performed
 * atomically via set_64bit so the guest never observes a torn value;
 * the subsequent emulator_write_emulated call then keeps the shadow
 * MMU's write tracking up to date.
 */
static int emulator_cmpxchg_emulated(unsigned long addr,
				     const void *old,
				     const void *new,
				     unsigned int bytes,
				     struct kvm_vcpu *vcpu)
{
	static int reported;

	if (!reported) {
		reported = 1;
		printk(KERN_WARNING "kvm: emulating exchange as write\n");
	}
#ifndef CONFIG_X86_64
	/* guests cmpxchg8b have to be emulated atomically */
	if (bytes == 8) {
		gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
		struct page *page;
		char *addr;
		u64 val;

		/* Unmapped or APIC-page targets take the ordinary path. */
		if (gpa == UNMAPPED_GVA ||
		   (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
			goto emul_write;

		/* A page-straddling value cannot be written atomically. */
		if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK))
			goto emul_write;

		val = *(u64 *)new;
		page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
		addr = kmap_atomic(page, KM_USER0);
		set_64bit((u64 *)(addr + offset_in_page(gpa)), val);
		kunmap_atomic(addr, KM_USER0);
		kvm_release_page_dirty(page);
	}
emul_write:
#endif

	return emulator_write_emulated(addr, new, bytes, vcpu);
}
1704 | |||
1705 | static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg) | ||
1706 | { | ||
1707 | return kvm_x86_ops->get_segment_base(vcpu, seg); | ||
1708 | } | ||
1709 | |||
1710 | int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address) | ||
1711 | { | ||
1712 | return X86EMUL_CONTINUE; | ||
1713 | } | ||
1714 | |||
1715 | int emulate_clts(struct kvm_vcpu *vcpu) | ||
1716 | { | ||
1717 | kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 & ~X86_CR0_TS); | ||
1718 | return X86EMUL_CONTINUE; | ||
1719 | } | ||
1720 | |||
1721 | int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest) | ||
1722 | { | ||
1723 | struct kvm_vcpu *vcpu = ctxt->vcpu; | ||
1724 | |||
1725 | switch (dr) { | ||
1726 | case 0 ... 3: | ||
1727 | *dest = kvm_x86_ops->get_dr(vcpu, dr); | ||
1728 | return X86EMUL_CONTINUE; | ||
1729 | default: | ||
1730 | pr_unimpl(vcpu, "%s: unexpected dr %u\n", __FUNCTION__, dr); | ||
1731 | return X86EMUL_UNHANDLEABLE; | ||
1732 | } | ||
1733 | } | ||
1734 | |||
1735 | int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value) | ||
1736 | { | ||
1737 | unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U; | ||
1738 | int exception; | ||
1739 | |||
1740 | kvm_x86_ops->set_dr(ctxt->vcpu, dr, value & mask, &exception); | ||
1741 | if (exception) { | ||
1742 | /* FIXME: better handling */ | ||
1743 | return X86EMUL_UNHANDLEABLE; | ||
1744 | } | ||
1745 | return X86EMUL_CONTINUE; | ||
1746 | } | ||
1747 | |||
1748 | void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context) | ||
1749 | { | ||
1750 | static int reported; | ||
1751 | u8 opcodes[4]; | ||
1752 | unsigned long rip = vcpu->arch.rip; | ||
1753 | unsigned long rip_linear; | ||
1754 | |||
1755 | rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS); | ||
1756 | |||
1757 | if (reported) | ||
1758 | return; | ||
1759 | |||
1760 | emulator_read_std(rip_linear, (void *)opcodes, 4, vcpu); | ||
1761 | |||
1762 | printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n", | ||
1763 | context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]); | ||
1764 | reported = 1; | ||
1765 | } | ||
1766 | EXPORT_SYMBOL_GPL(kvm_report_emulation_failure); | ||
1767 | |||
/*
 * Memory-access callbacks handed to the x86 instruction emulator.
 * read_std reads plain guest memory; the *_emulated hooks additionally
 * handle MMIO and shadow-MMU write tracking.
 */
struct x86_emulate_ops emulate_ops = {
	.read_std            = emulator_read_std,
	.read_emulated       = emulator_read_emulated,
	.write_emulated      = emulator_write_emulated,
	.cmpxchg_emulated    = emulator_cmpxchg_emulated,
};
1774 | |||
/*
 * Decode and emulate one guest instruction at the current rip.
 *
 * @cr2:       faulting address (for page-fault driven emulation).
 * @no_decode: skip the decode step and re-run an already-decoded insn.
 *
 * Returns EMULATE_DONE when the instruction completed, EMULATE_DO_MMIO
 * when userspace must finish an MMIO or string-PIO access (exit info is
 * filled into *run), EMULATE_FAIL when the instruction could not be
 * handled at all.
 */
int emulate_instruction(struct kvm_vcpu *vcpu,
			struct kvm_run *run,
			unsigned long cr2,
			u16 error_code,
			int no_decode)
{
	int r;

	vcpu->arch.mmio_fault_cr2 = cr2;
	/* Pull current register state into vcpu->arch.regs for the emulator. */
	kvm_x86_ops->cache_regs(vcpu);

	vcpu->mmio_is_write = 0;
	vcpu->arch.pio.string = 0;

	if (!no_decode) {
		int cs_db, cs_l;
		kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);

		vcpu->arch.emulate_ctxt.vcpu = vcpu;
		vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu);
		/* Operating mode: VM86 -> real; else by CS.L / CS.D bits. */
		vcpu->arch.emulate_ctxt.mode =
			(vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM)
			? X86EMUL_MODE_REAL : cs_l
			? X86EMUL_MODE_PROT64 :	cs_db
			? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;

		/* Segment bases are architecturally zero in 64-bit mode. */
		if (vcpu->arch.emulate_ctxt.mode == X86EMUL_MODE_PROT64) {
			vcpu->arch.emulate_ctxt.cs_base = 0;
			vcpu->arch.emulate_ctxt.ds_base = 0;
			vcpu->arch.emulate_ctxt.es_base = 0;
			vcpu->arch.emulate_ctxt.ss_base = 0;
		} else {
			vcpu->arch.emulate_ctxt.cs_base =
					get_segment_base(vcpu, VCPU_SREG_CS);
			vcpu->arch.emulate_ctxt.ds_base =
					get_segment_base(vcpu, VCPU_SREG_DS);
			vcpu->arch.emulate_ctxt.es_base =
					get_segment_base(vcpu, VCPU_SREG_ES);
			vcpu->arch.emulate_ctxt.ss_base =
					get_segment_base(vcpu, VCPU_SREG_SS);
		}

		/* FS/GS keep their bases even in 64-bit mode. */
		vcpu->arch.emulate_ctxt.gs_base =
				get_segment_base(vcpu, VCPU_SREG_GS);
		vcpu->arch.emulate_ctxt.fs_base =
				get_segment_base(vcpu, VCPU_SREG_FS);

		r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
		++vcpu->stat.insn_emulation;
		if (r)  {
			++vcpu->stat.insn_emulation_fail;
			/* A guest page-table write may have confused decode;
			 * unprotect and let the guest retry natively. */
			if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
				return EMULATE_DONE;
			return EMULATE_FAIL;
		}
	}

	r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);

	/* String PIO is completed by userspace via complete_pio(). */
	if (vcpu->arch.pio.string)
		return EMULATE_DO_MMIO;

	/* Describe any pending MMIO access in the run structure. */
	if ((r || vcpu->mmio_is_write) && run) {
		run->exit_reason = KVM_EXIT_MMIO;
		run->mmio.phys_addr = vcpu->mmio_phys_addr;
		memcpy(run->mmio.data, vcpu->mmio_data, 8);
		run->mmio.len = vcpu->mmio_size;
		run->mmio.is_write = vcpu->mmio_is_write;
	}

	if (r) {
		if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
			return EMULATE_DONE;
		if (!vcpu->mmio_needed) {
			kvm_report_emulation_failure(vcpu, "mmio");
			return EMULATE_FAIL;
		}
		return EMULATE_DO_MMIO;
	}

	/* Push emulator register/flag state back into the vcpu. */
	kvm_x86_ops->decache_regs(vcpu);
	kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);

	if (vcpu->mmio_is_write) {
		vcpu->mmio_needed = 0;
		return EMULATE_DO_MMIO;
	}

	return EMULATE_DONE;
}
EXPORT_SYMBOL_GPL(emulate_instruction);
1866 | |||
1867 | static void free_pio_guest_pages(struct kvm_vcpu *vcpu) | ||
1868 | { | ||
1869 | int i; | ||
1870 | |||
1871 | for (i = 0; i < ARRAY_SIZE(vcpu->arch.pio.guest_pages); ++i) | ||
1872 | if (vcpu->arch.pio.guest_pages[i]) { | ||
1873 | kvm_release_page_dirty(vcpu->arch.pio.guest_pages[i]); | ||
1874 | vcpu->arch.pio.guest_pages[i] = NULL; | ||
1875 | } | ||
1876 | } | ||
1877 | |||
/*
 * Copy string-PIO data between the vcpu's pio_data bounce buffer and
 * the pinned guest pages.  For IN operations data flows bounce buffer
 * -> guest memory; for OUT it flows guest memory -> bounce buffer.
 * The pinned pages are released in all cases, including on failure.
 * Returns 0 on success, -ENOMEM if the pages could not be mapped.
 */
static int pio_copy_data(struct kvm_vcpu *vcpu)
{
	void *p = vcpu->arch.pio_data;
	void *q;
	unsigned bytes;
	int nr_pages = vcpu->arch.pio.guest_pages[1] ? 2 : 1;

	/* Map the one or two pinned pages into a contiguous kernel range. */
	q = vmap(vcpu->arch.pio.guest_pages, nr_pages, VM_READ|VM_WRITE,
		 PAGE_KERNEL);
	if (!q) {
		free_pio_guest_pages(vcpu);
		return -ENOMEM;
	}
	q += vcpu->arch.pio.guest_page_offset;
	bytes = vcpu->arch.pio.size * vcpu->arch.pio.cur_count;
	if (vcpu->arch.pio.in)
		memcpy(q, p, bytes);
	else
		memcpy(p, q, bytes);
	/* vunmap() needs the original page-aligned mapping address. */
	q -= vcpu->arch.pio.guest_page_offset;
	vunmap(q);
	free_pio_guest_pages(vcpu);
	return 0;
}
1902 | |||
/*
 * Finish a PIO operation after the data phase: for IN, move the data
 * into RAX (non-string) or guest memory (string); for string ops,
 * advance RSI/RDI (and RCX for REP) by the number of bytes transferred,
 * honoring the direction flag.  Returns 0 on success or the error from
 * pio_copy_data().
 */
int complete_pio(struct kvm_vcpu *vcpu)
{
	struct kvm_pio_request *io = &vcpu->arch.pio;
	long delta;
	int r;

	/* Work on a cached copy of the guest registers. */
	kvm_x86_ops->cache_regs(vcpu);

	if (!io->string) {
		if (io->in)
			memcpy(&vcpu->arch.regs[VCPU_REGS_RAX], vcpu->arch.pio_data,
			       io->size);
	} else {
		if (io->in) {
			r = pio_copy_data(vcpu);
			if (r) {
				kvm_x86_ops->cache_regs(vcpu);
				return r;
			}
		}

		delta = 1;
		if (io->rep) {
			delta *= io->cur_count;
			/*
			 * The size of the register should really depend on
			 * current address size.
			 */
			vcpu->arch.regs[VCPU_REGS_RCX] -= delta;
		}
		/* Direction flag set: the string op walks downward. */
		if (io->down)
			delta = -delta;
		delta *= io->size;
		/* IN advances the destination (RDI), OUT the source (RSI). */
		if (io->in)
			vcpu->arch.regs[VCPU_REGS_RDI] += delta;
		else
			vcpu->arch.regs[VCPU_REGS_RSI] += delta;
	}

	/* Write the updated registers back to the vcpu. */
	kvm_x86_ops->decache_regs(vcpu);

	io->count -= io->cur_count;
	io->cur_count = 0;

	return 0;
}
1949 | |||
1950 | static void kernel_pio(struct kvm_io_device *pio_dev, | ||
1951 | struct kvm_vcpu *vcpu, | ||
1952 | void *pd) | ||
1953 | { | ||
1954 | /* TODO: String I/O for in kernel device */ | ||
1955 | |||
1956 | mutex_lock(&vcpu->kvm->lock); | ||
1957 | if (vcpu->arch.pio.in) | ||
1958 | kvm_iodevice_read(pio_dev, vcpu->arch.pio.port, | ||
1959 | vcpu->arch.pio.size, | ||
1960 | pd); | ||
1961 | else | ||
1962 | kvm_iodevice_write(pio_dev, vcpu->arch.pio.port, | ||
1963 | vcpu->arch.pio.size, | ||
1964 | pd); | ||
1965 | mutex_unlock(&vcpu->kvm->lock); | ||
1966 | } | ||
1967 | |||
1968 | static void pio_string_write(struct kvm_io_device *pio_dev, | ||
1969 | struct kvm_vcpu *vcpu) | ||
1970 | { | ||
1971 | struct kvm_pio_request *io = &vcpu->arch.pio; | ||
1972 | void *pd = vcpu->arch.pio_data; | ||
1973 | int i; | ||
1974 | |||
1975 | mutex_lock(&vcpu->kvm->lock); | ||
1976 | for (i = 0; i < io->cur_count; i++) { | ||
1977 | kvm_iodevice_write(pio_dev, io->port, | ||
1978 | io->size, | ||
1979 | pd); | ||
1980 | pd += io->size; | ||
1981 | } | ||
1982 | mutex_unlock(&vcpu->kvm->lock); | ||
1983 | } | ||
1984 | |||
/* Look up the in-kernel device claiming PIO address @addr, or NULL. */
static struct kvm_io_device *vcpu_find_pio_dev(struct kvm_vcpu *vcpu,
					       gpa_t addr)
{
	return kvm_io_bus_find_dev(&vcpu->kvm->pio_bus, addr);
}
1990 | |||
/*
 * Emulate a single (non-string) in/out of @size bytes on @port.  The
 * kvm_run area is always prepared so userspace can service the access
 * if no in-kernel device claims the port.
 *
 * Returns 1 if the access was fully handled in the kernel, 0 if an
 * exit to userspace (KVM_EXIT_IO) is required.
 */
int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
		  int size, unsigned port)
{
	struct kvm_io_device *pio_dev;

	vcpu->run->exit_reason = KVM_EXIT_IO;
	vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
	vcpu->run->io.size = vcpu->arch.pio.size = size;
	vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
	vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = 1;
	vcpu->run->io.port = vcpu->arch.pio.port = port;
	vcpu->arch.pio.in = in;
	vcpu->arch.pio.string = 0;
	vcpu->arch.pio.down = 0;
	vcpu->arch.pio.guest_page_offset = 0;
	vcpu->arch.pio.rep = 0;

	/* stage RAX in the shared pio_data page (source data for "out") */
	kvm_x86_ops->cache_regs(vcpu);
	memcpy(vcpu->arch.pio_data, &vcpu->arch.regs[VCPU_REGS_RAX], 4);
	kvm_x86_ops->decache_regs(vcpu);

	kvm_x86_ops->skip_emulated_instruction(vcpu);

	pio_dev = vcpu_find_pio_dev(vcpu, port);
	if (pio_dev) {
		kernel_pio(pio_dev, vcpu, vcpu->arch.pio_data);
		complete_pio(vcpu);
		return 1;
	}
	return 0;
}
EXPORT_SYMBOL_GPL(kvm_emulate_pio);
2023 | |||
/*
 * Emulate a string in/out (ins/outs) of @count elements of @size bytes
 * on @port, transferring guest memory at @address.  Only as many
 * elements as fit in one page are handled per transaction (just one
 * element if the first one straddles a page boundary); the instruction
 * is skipped only when the whole count fits, so the guest re-executes
 * it for the remainder.
 *
 * Returns 1 if handled in the kernel (including the #GP-injection
 * paths), 0 to exit to userspace with KVM_EXIT_IO.
 */
int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
		  int size, unsigned long count, int down,
		  gva_t address, int rep, unsigned port)
{
	unsigned now, in_page;
	int i, ret = 0;
	int nr_pages = 1;
	struct page *page;
	struct kvm_io_device *pio_dev;

	vcpu->run->exit_reason = KVM_EXIT_IO;
	vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
	vcpu->run->io.size = vcpu->arch.pio.size = size;
	vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
	vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = count;
	vcpu->run->io.port = vcpu->arch.pio.port = port;
	vcpu->arch.pio.in = in;
	vcpu->arch.pio.string = 1;
	vcpu->arch.pio.down = down;
	vcpu->arch.pio.guest_page_offset = offset_in_page(address);
	vcpu->arch.pio.rep = rep;

	/* rep with RCX == 0 is a no-op */
	if (!count) {
		kvm_x86_ops->skip_emulated_instruction(vcpu);
		return 1;
	}

	if (!down)
		in_page = PAGE_SIZE - offset_in_page(address);
	else
		in_page = offset_in_page(address) + size;
	now = min(count, (unsigned long)in_page / size);
	if (!now) {
		/*
		 * String I/O straddles page boundary.  Pin two guest pages
		 * so that we satisfy atomicity constraints.  Do just one
		 * transaction to avoid complexity.
		 */
		nr_pages = 2;
		now = 1;
	}
	if (down) {
		/*
		 * String I/O in reverse.  Yuck.  Kill the guest, fix later.
		 */
		pr_unimpl(vcpu, "guest string pio down\n");
		kvm_inject_gp(vcpu, 0);
		return 1;
	}
	vcpu->run->io.count = now;
	vcpu->arch.pio.cur_count = now;

	/* skip only when this transaction covers the entire count */
	if (vcpu->arch.pio.cur_count == vcpu->arch.pio.count)
		kvm_x86_ops->skip_emulated_instruction(vcpu);

	/* pin the guest page(s) backing the transfer */
	for (i = 0; i < nr_pages; ++i) {
		mutex_lock(&vcpu->kvm->lock);
		page = gva_to_page(vcpu, address + i * PAGE_SIZE);
		vcpu->arch.pio.guest_pages[i] = page;
		mutex_unlock(&vcpu->kvm->lock);
		if (!page) {
			/* unmapped guest address: fault the access */
			kvm_inject_gp(vcpu, 0);
			free_pio_guest_pages(vcpu);
			return 1;
		}
	}

	pio_dev = vcpu_find_pio_dev(vcpu, port);
	if (!vcpu->arch.pio.in) {
		/* string PIO write */
		ret = pio_copy_data(vcpu);
		if (ret >= 0 && pio_dev) {
			pio_string_write(pio_dev, vcpu);
			complete_pio(vcpu);
			if (vcpu->arch.pio.count == 0)
				ret = 1;
		}
	} else if (pio_dev)
		pr_unimpl(vcpu, "no string pio read support yet, "
			  "port %x size %d count %ld\n",
			  port, size, count);

	return ret;
}
EXPORT_SYMBOL_GPL(kvm_emulate_pio_string);
2109 | |||
/*
 * Module-load-time arch initialization, called by the vendor module
 * (svm/vmx) with its kvm_x86_ops in @opaque.  Verifies that no other
 * vendor module has registered and that the hardware supports and
 * permits virtualization, then publishes the ops.
 *
 * Returns 0 on success or a negative errno.
 */
int kvm_arch_init(void *opaque)
{
	int r;
	struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque;

	r = kvm_mmu_module_init();
	if (r)
		goto out_fail;

	kvm_init_msr_list();

	/* only one of svm.ko/vmx.ko may provide the ops */
	if (kvm_x86_ops) {
		printk(KERN_ERR "kvm: already loaded the other module\n");
		r = -EEXIST;
		goto out;
	}

	if (!ops->cpu_has_kvm_support()) {
		printk(KERN_ERR "kvm: no hardware support\n");
		r = -EOPNOTSUPP;
		goto out;
	}
	if (ops->disabled_by_bios()) {
		printk(KERN_ERR "kvm: disabled by bios\n");
		r = -EOPNOTSUPP;
		goto out;
	}

	kvm_x86_ops = ops;
	kvm_mmu_set_nonpresent_ptes(0ull, 0ull);
	return 0;

out:
	kvm_mmu_module_exit();
out_fail:
	return r;
}
2147 | |||
/* Module unload: unregister the vendor ops and tear down mmu state. */
void kvm_arch_exit(void)
{
	kvm_x86_ops = NULL;
	kvm_mmu_module_exit();
}
2153 | |||
/*
 * Handle a guest "hlt".  With an in-kernel irqchip the vcpu blocks in
 * the kernel until it becomes runnable; otherwise userspace handles
 * the halt via KVM_EXIT_HLT.
 *
 * Returns 1 to resume the guest, 0 to exit to userspace, -EINTR if the
 * blocked vcpu was woken without becoming runnable.
 */
int kvm_emulate_halt(struct kvm_vcpu *vcpu)
{
	++vcpu->stat.halt_exits;
	if (irqchip_in_kernel(vcpu->kvm)) {
		vcpu->arch.mp_state = VCPU_MP_STATE_HALTED;
		kvm_vcpu_block(vcpu);
		if (vcpu->arch.mp_state != VCPU_MP_STATE_RUNNABLE)
			return -EINTR;
		return 1;
	} else {
		vcpu->run->exit_reason = KVM_EXIT_HLT;
		return 0;
	}
}
EXPORT_SYMBOL_GPL(kvm_emulate_halt);
2169 | |||
2170 | int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) | ||
2171 | { | ||
2172 | unsigned long nr, a0, a1, a2, a3, ret; | ||
2173 | |||
2174 | kvm_x86_ops->cache_regs(vcpu); | ||
2175 | |||
2176 | nr = vcpu->arch.regs[VCPU_REGS_RAX]; | ||
2177 | a0 = vcpu->arch.regs[VCPU_REGS_RBX]; | ||
2178 | a1 = vcpu->arch.regs[VCPU_REGS_RCX]; | ||
2179 | a2 = vcpu->arch.regs[VCPU_REGS_RDX]; | ||
2180 | a3 = vcpu->arch.regs[VCPU_REGS_RSI]; | ||
2181 | |||
2182 | if (!is_long_mode(vcpu)) { | ||
2183 | nr &= 0xFFFFFFFF; | ||
2184 | a0 &= 0xFFFFFFFF; | ||
2185 | a1 &= 0xFFFFFFFF; | ||
2186 | a2 &= 0xFFFFFFFF; | ||
2187 | a3 &= 0xFFFFFFFF; | ||
2188 | } | ||
2189 | |||
2190 | switch (nr) { | ||
2191 | default: | ||
2192 | ret = -KVM_ENOSYS; | ||
2193 | break; | ||
2194 | } | ||
2195 | vcpu->arch.regs[VCPU_REGS_RAX] = ret; | ||
2196 | kvm_x86_ops->decache_regs(vcpu); | ||
2197 | return 0; | ||
2198 | } | ||
2199 | EXPORT_SYMBOL_GPL(kvm_emulate_hypercall); | ||
2200 | |||
/*
 * Rewrite the instruction at the guest's rip with the hypercall opcode
 * appropriate for the host vendor (vmcall/vmmcall), so a guest image
 * built for one vendor works on the other.
 *
 * Returns 0 on success, -EFAULT if the patched bytes could not be
 * written back to guest memory.
 */
int kvm_fix_hypercall(struct kvm_vcpu *vcpu)
{
	char instruction[3];
	int ret = 0;

	mutex_lock(&vcpu->kvm->lock);

	/*
	 * Blow out the MMU to ensure that no other VCPU has an active mapping
	 * to ensure that the updated hypercall appears atomically across all
	 * VCPUs.
	 */
	kvm_mmu_zap_all(vcpu->kvm);

	kvm_x86_ops->cache_regs(vcpu);
	kvm_x86_ops->patch_hypercall(vcpu, instruction);
	if (emulator_write_emulated(vcpu->arch.rip, instruction, 3, vcpu)
	    != X86EMUL_CONTINUE)
		ret = -EFAULT;

	mutex_unlock(&vcpu->kvm->lock);

	return ret;
}
2225 | |||
2226 | static u64 mk_cr_64(u64 curr_cr, u32 new_val) | ||
2227 | { | ||
2228 | return (curr_cr & ~((1ULL << 32) - 1)) | new_val; | ||
2229 | } | ||
2230 | |||
/* Emulated lgdt: load the guest GDTR with @base/@limit. */
void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
{
	struct descriptor_table dt = { limit, base };

	kvm_x86_ops->set_gdt(vcpu, &dt);
}
2237 | |||
/* Emulated lidt: load the guest IDTR with @base/@limit. */
void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
{
	struct descriptor_table dt = { limit, base };

	kvm_x86_ops->set_idt(vcpu, &dt);
}
2244 | |||
/* Emulated lmsw: update CR0's low bits and report the resulting rflags. */
void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
		   unsigned long *rflags)
{
	lmsw(vcpu, msw);
	*rflags = kvm_x86_ops->get_rflags(vcpu);
}
2251 | |||
2252 | unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr) | ||
2253 | { | ||
2254 | kvm_x86_ops->decache_cr4_guest_bits(vcpu); | ||
2255 | switch (cr) { | ||
2256 | case 0: | ||
2257 | return vcpu->arch.cr0; | ||
2258 | case 2: | ||
2259 | return vcpu->arch.cr2; | ||
2260 | case 3: | ||
2261 | return vcpu->arch.cr3; | ||
2262 | case 4: | ||
2263 | return vcpu->arch.cr4; | ||
2264 | case 8: | ||
2265 | return get_cr8(vcpu); | ||
2266 | default: | ||
2267 | vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr); | ||
2268 | return 0; | ||
2269 | } | ||
2270 | } | ||
2271 | |||
/*
 * Write control register @cr for the real-mode emulator.  CR0 and CR4
 * writes only replace the low 32 bits; cr0 updates also report the
 * resulting rflags through *rflags.  An unexpected register number is
 * logged and ignored.
 */
void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val,
		     unsigned long *rflags)
{
	switch (cr) {
	case 0:
		set_cr0(vcpu, mk_cr_64(vcpu->arch.cr0, val));
		*rflags = kvm_x86_ops->get_rflags(vcpu);
		break;
	case 2:
		vcpu->arch.cr2 = val;
		break;
	case 3:
		set_cr3(vcpu, val);
		break;
	case 4:
		set_cr4(vcpu, mk_cr_64(vcpu->arch.cr4, val));
		break;
	case 8:
		/* only the low 4 bits of cr8 (TPR) are defined */
		set_cr8(vcpu, val & 0xfUL);
		break;
	default:
		vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);
	}
}
2296 | |||
2297 | static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i) | ||
2298 | { | ||
2299 | struct kvm_cpuid_entry2 *e = &vcpu->arch.cpuid_entries[i]; | ||
2300 | int j, nent = vcpu->arch.cpuid_nent; | ||
2301 | |||
2302 | e->flags &= ~KVM_CPUID_FLAG_STATE_READ_NEXT; | ||
2303 | /* when no next entry is found, the current entry[i] is reselected */ | ||
2304 | for (j = i + 1; j == i; j = (j + 1) % nent) { | ||
2305 | struct kvm_cpuid_entry2 *ej = &vcpu->arch.cpuid_entries[j]; | ||
2306 | if (ej->function == e->function) { | ||
2307 | ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT; | ||
2308 | return j; | ||
2309 | } | ||
2310 | } | ||
2311 | return 0; /* silence gcc, even though control never reaches here */ | ||
2312 | } | ||
2313 | |||
2314 | /* find an entry with matching function, matching index (if needed), and that | ||
2315 | * should be read next (if it's stateful) */ | ||
2316 | static int is_matching_cpuid_entry(struct kvm_cpuid_entry2 *e, | ||
2317 | u32 function, u32 index) | ||
2318 | { | ||
2319 | if (e->function != function) | ||
2320 | return 0; | ||
2321 | if ((e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) && e->index != index) | ||
2322 | return 0; | ||
2323 | if ((e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) && | ||
2324 | !(e->flags & KVM_CPUID_FLAG_STATE_READ_NEXT)) | ||
2325 | return 0; | ||
2326 | return 1; | ||
2327 | } | ||
2328 | |||
/*
 * Emulate the cpuid instruction from the vcpu's cached cpuid entries.
 * An exact (function, index) match wins; failing that, the entry with
 * the highest function number in the same class (basic vs extended,
 * distinguished by bit 31) is used, mirroring real hardware's
 * out-of-range behavior.  With no usable entry the outputs stay zero.
 */
void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
{
	int i;
	u32 function, index;
	struct kvm_cpuid_entry2 *e, *best;

	kvm_x86_ops->cache_regs(vcpu);
	function = vcpu->arch.regs[VCPU_REGS_RAX];
	index = vcpu->arch.regs[VCPU_REGS_RCX];
	vcpu->arch.regs[VCPU_REGS_RAX] = 0;
	vcpu->arch.regs[VCPU_REGS_RBX] = 0;
	vcpu->arch.regs[VCPU_REGS_RCX] = 0;
	vcpu->arch.regs[VCPU_REGS_RDX] = 0;
	best = NULL;
	for (i = 0; i < vcpu->arch.cpuid_nent; ++i) {
		e = &vcpu->arch.cpuid_entries[i];
		if (is_matching_cpuid_entry(e, function, index)) {
			/* stateful leaves advance to their next entry */
			if (e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC)
				move_to_next_stateful_cpuid_entry(vcpu, i);
			best = e;
			break;
		}
		/*
		 * Both basic or both extended?
		 */
		if (((e->function ^ function) & 0x80000000) == 0)
			if (!best || e->function > best->function)
				best = e;
	}
	if (best) {
		vcpu->arch.regs[VCPU_REGS_RAX] = best->eax;
		vcpu->arch.regs[VCPU_REGS_RBX] = best->ebx;
		vcpu->arch.regs[VCPU_REGS_RCX] = best->ecx;
		vcpu->arch.regs[VCPU_REGS_RDX] = best->edx;
	}
	kvm_x86_ops->decache_regs(vcpu);
	kvm_x86_ops->skip_emulated_instruction(vcpu);
}
EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
2368 | |||
2369 | /* | ||
2370 | * Check if userspace requested an interrupt window, and that the | ||
2371 | * interrupt window is open. | ||
2372 | * | ||
2373 | * No need to exit to userspace if we already have an interrupt queued. | ||
2374 | */ | ||
2375 | static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu, | ||
2376 | struct kvm_run *kvm_run) | ||
2377 | { | ||
2378 | return (!vcpu->arch.irq_summary && | ||
2379 | kvm_run->request_interrupt_window && | ||
2380 | vcpu->arch.interrupt_window_open && | ||
2381 | (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF)); | ||
2382 | } | ||
2383 | |||
/*
 * Refresh the fields of the shared kvm_run area that userspace reads
 * after every KVM_RUN: if_flag, cr8, apic base and whether an
 * interrupt may be injected now.  With an in-kernel irqchip userspace
 * never injects, so readiness is always reported.
 */
static void post_kvm_run_save(struct kvm_vcpu *vcpu,
			      struct kvm_run *kvm_run)
{
	kvm_run->if_flag = (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF) != 0;
	kvm_run->cr8 = get_cr8(vcpu);
	kvm_run->apic_base = kvm_get_apic_base(vcpu);
	if (irqchip_in_kernel(vcpu->kvm))
		kvm_run->ready_for_interrupt_injection = 1;
	else
		kvm_run->ready_for_interrupt_injection =
					(vcpu->arch.interrupt_window_open &&
					 vcpu->arch.irq_summary == 0);
}
2397 | |||
/*
 * Main vcpu execution loop: repeatedly enter the guest and handle
 * exits in the kernel until one requires userspace attention (r <= 0).
 * The sequence around guest entry (preempt_disable, irq-disable,
 * event injection, run) is order-critical.
 *
 * Returns > 0 to keep running, 0 for a userspace exit, negative errno
 * on error or interruption.
 */
static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
	int r;

	/* a SIPI restarts an AP: reset lapic and vcpu state first */
	if (unlikely(vcpu->arch.mp_state == VCPU_MP_STATE_SIPI_RECEIVED)) {
		pr_debug("vcpu %d received sipi with vector # %x\n",
		       vcpu->vcpu_id, vcpu->arch.sipi_vector);
		kvm_lapic_reset(vcpu);
		r = kvm_x86_ops->vcpu_reset(vcpu);
		if (r)
			return r;
		vcpu->arch.mp_state = VCPU_MP_STATE_RUNNABLE;
	}

preempted:
	if (vcpu->guest_debug.enabled)
		kvm_x86_ops->guest_debug_pre(vcpu);

again:
	r = kvm_mmu_reload(vcpu);
	if (unlikely(r))
		goto out;

	kvm_inject_pending_timer_irqs(vcpu);

	preempt_disable();

	kvm_x86_ops->prepare_guest_switch(vcpu);
	kvm_load_guest_fpu(vcpu);

	/* irqs stay off until after guest entry */
	local_irq_disable();

	if (signal_pending(current)) {
		local_irq_enable();
		preempt_enable();
		r = -EINTR;
		kvm_run->exit_reason = KVM_EXIT_INTR;
		++vcpu->stat.signal_exits;
		goto out;
	}

	/* exceptions take priority over interrupt injection */
	if (vcpu->arch.exception.pending)
		__queue_exception(vcpu);
	else if (irqchip_in_kernel(vcpu->kvm))
		kvm_x86_ops->inject_pending_irq(vcpu);
	else
		kvm_x86_ops->inject_pending_vectors(vcpu, kvm_run);

	vcpu->guest_mode = 1;
	kvm_guest_enter();

	if (vcpu->requests)
		if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests))
			kvm_x86_ops->tlb_flush(vcpu);

	kvm_x86_ops->run(vcpu, kvm_run);

	vcpu->guest_mode = 0;
	local_irq_enable();

	++vcpu->stat.exits;

	/*
	 * We must have an instruction between local_irq_enable() and
	 * kvm_guest_exit(), so the timer interrupt isn't delayed by
	 * the interrupt shadow.  The stat.exits increment will do nicely.
	 * But we need to prevent reordering, hence this barrier():
	 */
	barrier();

	kvm_guest_exit();

	preempt_enable();

	/*
	 * Profile KVM exit RIPs:
	 */
	if (unlikely(prof_on == KVM_PROFILING)) {
		kvm_x86_ops->cache_regs(vcpu);
		profile_hit(KVM_PROFILING, (void *)vcpu->arch.rip);
	}

	/* drop the exception if the hardware actually delivered it */
	if (vcpu->arch.exception.pending && kvm_x86_ops->exception_injected(vcpu))
		vcpu->arch.exception.pending = false;

	r = kvm_x86_ops->handle_exit(kvm_run, vcpu);

	if (r > 0) {
		if (dm_request_for_irq_injection(vcpu, kvm_run)) {
			r = -EINTR;
			kvm_run->exit_reason = KVM_EXIT_INTR;
			++vcpu->stat.request_irq_exits;
			goto out;
		}
		if (!need_resched())
			goto again;
	}

out:
	if (r > 0) {
		/* reschedule, then re-run debug setup and loop again */
		kvm_resched(vcpu);
		goto preempted;
	}

	post_kvm_run_save(vcpu, kvm_run);

	return r;
}
2506 | |||
/*
 * KVM_RUN ioctl entry point: complete any transaction userspace just
 * serviced (PIO, MMIO, hypercall result), then enter the guest loop.
 *
 * Returns 0 for a normal userspace exit, negative errno otherwise.
 */
int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
	int r;
	sigset_t sigsaved;

	vcpu_load(vcpu);

	/* an uninitialized AP just waits for INIT/SIPI */
	if (unlikely(vcpu->arch.mp_state == VCPU_MP_STATE_UNINITIALIZED)) {
		kvm_vcpu_block(vcpu);
		vcpu_put(vcpu);
		return -EAGAIN;
	}

	if (vcpu->sigset_active)
		sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);

	/* re-sync apic's tpr */
	if (!irqchip_in_kernel(vcpu->kvm))
		set_cr8(vcpu, kvm_run->cr8);

	/* finish a PIO transaction userspace has just serviced */
	if (vcpu->arch.pio.cur_count) {
		r = complete_pio(vcpu);
		if (r)
			goto out;
	}
#if CONFIG_HAS_IOMEM
	if (vcpu->mmio_needed) {
		memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
		vcpu->mmio_read_completed = 1;
		vcpu->mmio_needed = 0;
		r = emulate_instruction(vcpu, kvm_run,
					vcpu->arch.mmio_fault_cr2, 0, 1);
		if (r == EMULATE_DO_MMIO) {
			/*
			 * Read-modify-write.  Back to userspace.
			 */
			r = 0;
			goto out;
		}
	}
#endif
	if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL) {
		kvm_x86_ops->cache_regs(vcpu);
		vcpu->arch.regs[VCPU_REGS_RAX] = kvm_run->hypercall.ret;
		kvm_x86_ops->decache_regs(vcpu);
	}

	r = __vcpu_run(vcpu, kvm_run);

out:
	if (vcpu->sigset_active)
		sigprocmask(SIG_SETMASK, &sigsaved, NULL);

	vcpu_put(vcpu);
	return r;
}
2563 | |||
/*
 * KVM_GET_REGS: copy the vcpu's general-purpose registers, rip and
 * rflags out to userspace.  Always returns 0.
 */
int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
{
	vcpu_load(vcpu);

	kvm_x86_ops->cache_regs(vcpu);

	regs->rax = vcpu->arch.regs[VCPU_REGS_RAX];
	regs->rbx = vcpu->arch.regs[VCPU_REGS_RBX];
	regs->rcx = vcpu->arch.regs[VCPU_REGS_RCX];
	regs->rdx = vcpu->arch.regs[VCPU_REGS_RDX];
	regs->rsi = vcpu->arch.regs[VCPU_REGS_RSI];
	regs->rdi = vcpu->arch.regs[VCPU_REGS_RDI];
	regs->rsp = vcpu->arch.regs[VCPU_REGS_RSP];
	regs->rbp = vcpu->arch.regs[VCPU_REGS_RBP];
#ifdef CONFIG_X86_64
	regs->r8 = vcpu->arch.regs[VCPU_REGS_R8];
	regs->r9 = vcpu->arch.regs[VCPU_REGS_R9];
	regs->r10 = vcpu->arch.regs[VCPU_REGS_R10];
	regs->r11 = vcpu->arch.regs[VCPU_REGS_R11];
	regs->r12 = vcpu->arch.regs[VCPU_REGS_R12];
	regs->r13 = vcpu->arch.regs[VCPU_REGS_R13];
	regs->r14 = vcpu->arch.regs[VCPU_REGS_R14];
	regs->r15 = vcpu->arch.regs[VCPU_REGS_R15];
#endif

	regs->rip = vcpu->arch.rip;
	regs->rflags = kvm_x86_ops->get_rflags(vcpu);

	/*
	 * Don't leak debug flags in case they were set for guest debugging
	 */
	if (vcpu->guest_debug.enabled && vcpu->guest_debug.singlestep)
		regs->rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);

	vcpu_put(vcpu);

	return 0;
}
2602 | |||
/*
 * KVM_SET_REGS: load the vcpu's general-purpose registers, rip and
 * rflags from userspace.  Always returns 0.
 */
int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
{
	vcpu_load(vcpu);

	vcpu->arch.regs[VCPU_REGS_RAX] = regs->rax;
	vcpu->arch.regs[VCPU_REGS_RBX] = regs->rbx;
	vcpu->arch.regs[VCPU_REGS_RCX] = regs->rcx;
	vcpu->arch.regs[VCPU_REGS_RDX] = regs->rdx;
	vcpu->arch.regs[VCPU_REGS_RSI] = regs->rsi;
	vcpu->arch.regs[VCPU_REGS_RDI] = regs->rdi;
	vcpu->arch.regs[VCPU_REGS_RSP] = regs->rsp;
	vcpu->arch.regs[VCPU_REGS_RBP] = regs->rbp;
#ifdef CONFIG_X86_64
	vcpu->arch.regs[VCPU_REGS_R8] = regs->r8;
	vcpu->arch.regs[VCPU_REGS_R9] = regs->r9;
	vcpu->arch.regs[VCPU_REGS_R10] = regs->r10;
	vcpu->arch.regs[VCPU_REGS_R11] = regs->r11;
	vcpu->arch.regs[VCPU_REGS_R12] = regs->r12;
	vcpu->arch.regs[VCPU_REGS_R13] = regs->r13;
	vcpu->arch.regs[VCPU_REGS_R14] = regs->r14;
	vcpu->arch.regs[VCPU_REGS_R15] = regs->r15;
#endif

	vcpu->arch.rip = regs->rip;
	kvm_x86_ops->set_rflags(vcpu, regs->rflags);

	/* push the cached values into the vendor-specific state */
	kvm_x86_ops->decache_regs(vcpu);

	vcpu_put(vcpu);

	return 0;
}
2635 | |||
2636 | static void get_segment(struct kvm_vcpu *vcpu, | ||
2637 | struct kvm_segment *var, int seg) | ||
2638 | { | ||
2639 | return kvm_x86_ops->get_segment(vcpu, var, seg); | ||
2640 | } | ||
2641 | |||
2642 | void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) | ||
2643 | { | ||
2644 | struct kvm_segment cs; | ||
2645 | |||
2646 | get_segment(vcpu, &cs, VCPU_SREG_CS); | ||
2647 | *db = cs.db; | ||
2648 | *l = cs.l; | ||
2649 | } | ||
2650 | EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits); | ||
2651 | |||
/*
 * KVM_GET_SREGS: copy segment registers, descriptor tables, control
 * registers, EFER, apic base and the pending-interrupt bitmap out to
 * userspace.  Always returns 0.
 */
int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
				  struct kvm_sregs *sregs)
{
	struct descriptor_table dt;
	int pending_vec;

	vcpu_load(vcpu);

	get_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
	get_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
	get_segment(vcpu, &sregs->es, VCPU_SREG_ES);
	get_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
	get_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
	get_segment(vcpu, &sregs->ss, VCPU_SREG_SS);

	get_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
	get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);

	kvm_x86_ops->get_idt(vcpu, &dt);
	sregs->idt.limit = dt.limit;
	sregs->idt.base = dt.base;
	kvm_x86_ops->get_gdt(vcpu, &dt);
	sregs->gdt.limit = dt.limit;
	sregs->gdt.base = dt.base;

	/* refresh the cached cr4 bits before reading them */
	kvm_x86_ops->decache_cr4_guest_bits(vcpu);
	sregs->cr0 = vcpu->arch.cr0;
	sregs->cr2 = vcpu->arch.cr2;
	sregs->cr3 = vcpu->arch.cr3;
	sregs->cr4 = vcpu->arch.cr4;
	sregs->cr8 = get_cr8(vcpu);
	sregs->efer = vcpu->arch.shadow_efer;
	sregs->apic_base = kvm_get_apic_base(vcpu);

	/*
	 * With an in-kernel irqchip at most one injected vector is
	 * reported; otherwise the full userspace-managed pending bitmap
	 * is copied out.
	 */
	if (irqchip_in_kernel(vcpu->kvm)) {
		memset(sregs->interrupt_bitmap, 0,
		       sizeof sregs->interrupt_bitmap);
		pending_vec = kvm_x86_ops->get_irq(vcpu);
		if (pending_vec >= 0)
			set_bit(pending_vec,
				(unsigned long *)sregs->interrupt_bitmap);
	} else
		memcpy(sregs->interrupt_bitmap, vcpu->arch.irq_pending,
		       sizeof sregs->interrupt_bitmap);

	vcpu_put(vcpu);

	return 0;
}
2701 | |||
2702 | static void set_segment(struct kvm_vcpu *vcpu, | ||
2703 | struct kvm_segment *var, int seg) | ||
2704 | { | ||
2705 | return kvm_x86_ops->set_segment(vcpu, var, seg); | ||
2706 | } | ||
2707 | |||
/*
 * KVM_SET_SREGS: load segment registers, descriptor tables, control
 * registers, EFER, apic base and pending-interrupt state from
 * userspace, resetting the MMU context when paging-related state
 * changed.  Always returns 0.
 */
int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
				  struct kvm_sregs *sregs)
{
	int mmu_reset_needed = 0;
	int i, pending_vec, max_bits;
	struct descriptor_table dt;

	vcpu_load(vcpu);

	dt.limit = sregs->idt.limit;
	dt.base = sregs->idt.base;
	kvm_x86_ops->set_idt(vcpu, &dt);
	dt.limit = sregs->gdt.limit;
	dt.base = sregs->gdt.base;
	kvm_x86_ops->set_gdt(vcpu, &dt);

	vcpu->arch.cr2 = sregs->cr2;
	mmu_reset_needed |= vcpu->arch.cr3 != sregs->cr3;
	vcpu->arch.cr3 = sregs->cr3;

	set_cr8(vcpu, sregs->cr8);

	mmu_reset_needed |= vcpu->arch.shadow_efer != sregs->efer;
#ifdef CONFIG_X86_64
	kvm_x86_ops->set_efer(vcpu, sregs->efer);
#endif
	kvm_set_apic_base(vcpu, sregs->apic_base);

	/* refresh the cached cr4 bits before comparing them */
	kvm_x86_ops->decache_cr4_guest_bits(vcpu);

	mmu_reset_needed |= vcpu->arch.cr0 != sregs->cr0;
	vcpu->arch.cr0 = sregs->cr0;
	kvm_x86_ops->set_cr0(vcpu, sregs->cr0);

	mmu_reset_needed |= vcpu->arch.cr4 != sregs->cr4;
	kvm_x86_ops->set_cr4(vcpu, sregs->cr4);
	/* PAE paging caches the pdptrs; reload them from the new cr3 */
	if (!is_long_mode(vcpu) && is_pae(vcpu))
		load_pdptrs(vcpu, vcpu->arch.cr3);

	if (mmu_reset_needed)
		kvm_mmu_reset_context(vcpu);

	if (!irqchip_in_kernel(vcpu->kvm)) {
		/* userspace irqchip: restore the whole pending bitmap */
		memcpy(vcpu->arch.irq_pending, sregs->interrupt_bitmap,
		       sizeof vcpu->arch.irq_pending);
		vcpu->arch.irq_summary = 0;
		for (i = 0; i < ARRAY_SIZE(vcpu->arch.irq_pending); ++i)
			if (vcpu->arch.irq_pending[i])
				__set_bit(i, &vcpu->arch.irq_summary);
	} else {
		max_bits = (sizeof sregs->interrupt_bitmap) << 3;
		pending_vec = find_first_bit(
			(const unsigned long *)sregs->interrupt_bitmap,
			max_bits);
		/* Only pending external irq is handled here */
		if (pending_vec < max_bits) {
			kvm_x86_ops->set_irq(vcpu, pending_vec);
			pr_debug("Set back pending irq %d\n",
				 pending_vec);
		}
	}

	set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
	set_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
	set_segment(vcpu, &sregs->es, VCPU_SREG_ES);
	set_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
	set_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
	set_segment(vcpu, &sregs->ss, VCPU_SREG_SS);

	set_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
	set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);

	vcpu_put(vcpu);

	return 0;
}
2784 | |||
/*
 * KVM_DEBUG_GUEST: hand the debug configuration to the vendor module.
 * Returns whatever the vendor set_guest_debug op returns.
 */
int kvm_arch_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu,
				    struct kvm_debug_guest *dbg)
{
	int r;

	vcpu_load(vcpu);

	r = kvm_x86_ops->set_guest_debug(vcpu, dbg);

	vcpu_put(vcpu);

	return r;
}
2798 | |||
/*
 * fxsave fpu state.  Taken from x86_64/processor.h.  To be killed when
 * we have asm/x86/processor.h
 *
 * Memory layout of the fxsave/fxrstor instruction's save area; the
 * guest/host FPU images are cast to this to pick out individual fields.
 */
struct fxsave {
	u16	cwd;		/* FPU control word */
	u16	swd;		/* FPU status word */
	u16	twd;		/* FPU tag word (abridged) */
	u16	fop;		/* last FPU opcode */
	u64	rip;		/* last instruction pointer */
	u64	rdp;		/* last data pointer */
	u32	mxcsr;		/* SSE control/status */
	u32	mxcsr_mask;
	u32	st_space[32];	/* 8*16 bytes for each FP-reg = 128 bytes */
#ifdef CONFIG_X86_64
	u32	xmm_space[64];	/* 16*16 bytes for each XMM-reg = 256 bytes */
#else
	u32	xmm_space[32];	/* 8*16 bytes for each XMM-reg = 128 bytes */
#endif
};
2819 | |||
/*
 * Translate a guest virtual address to a guest physical address.
 */
int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
				  struct kvm_translation *tr)
{
	unsigned long vaddr = tr->linear_address;
	gpa_t gpa;

	vcpu_load(vcpu);
	mutex_lock(&vcpu->kvm->lock);
	gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, vaddr);
	tr->physical_address = gpa;
	tr->valid = gpa != UNMAPPED_GVA;
	/* writability/usermode are not computed; reported as fixed values */
	tr->writeable = 1;
	tr->usermode = 0;
	mutex_unlock(&vcpu->kvm->lock);
	vcpu_put(vcpu);

	return 0;
}
2841 | |||
2842 | int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) | ||
2843 | { | ||
2844 | struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image; | ||
2845 | |||
2846 | vcpu_load(vcpu); | ||
2847 | |||
2848 | memcpy(fpu->fpr, fxsave->st_space, 128); | ||
2849 | fpu->fcw = fxsave->cwd; | ||
2850 | fpu->fsw = fxsave->swd; | ||
2851 | fpu->ftwx = fxsave->twd; | ||
2852 | fpu->last_opcode = fxsave->fop; | ||
2853 | fpu->last_ip = fxsave->rip; | ||
2854 | fpu->last_dp = fxsave->rdp; | ||
2855 | memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space); | ||
2856 | |||
2857 | vcpu_put(vcpu); | ||
2858 | |||
2859 | return 0; | ||
2860 | } | ||
2861 | |||
2862 | int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) | ||
2863 | { | ||
2864 | struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image; | ||
2865 | |||
2866 | vcpu_load(vcpu); | ||
2867 | |||
2868 | memcpy(fxsave->st_space, fpu->fpr, 128); | ||
2869 | fxsave->cwd = fpu->fcw; | ||
2870 | fxsave->swd = fpu->fsw; | ||
2871 | fxsave->twd = fpu->ftwx; | ||
2872 | fxsave->fop = fpu->last_opcode; | ||
2873 | fxsave->rip = fpu->last_ip; | ||
2874 | fxsave->rdp = fpu->last_dp; | ||
2875 | memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space); | ||
2876 | |||
2877 | vcpu_put(vcpu); | ||
2878 | |||
2879 | return 0; | ||
2880 | } | ||
2881 | |||
2882 | void fx_init(struct kvm_vcpu *vcpu) | ||
2883 | { | ||
2884 | unsigned after_mxcsr_mask; | ||
2885 | |||
2886 | /* Initialize guest FPU by resetting ours and saving into guest's */ | ||
2887 | preempt_disable(); | ||
2888 | fx_save(&vcpu->arch.host_fx_image); | ||
2889 | fpu_init(); | ||
2890 | fx_save(&vcpu->arch.guest_fx_image); | ||
2891 | fx_restore(&vcpu->arch.host_fx_image); | ||
2892 | preempt_enable(); | ||
2893 | |||
2894 | vcpu->arch.cr0 |= X86_CR0_ET; | ||
2895 | after_mxcsr_mask = offsetof(struct i387_fxsave_struct, st_space); | ||
2896 | vcpu->arch.guest_fx_image.mxcsr = 0x1f80; | ||
2897 | memset((void *)&vcpu->arch.guest_fx_image + after_mxcsr_mask, | ||
2898 | 0, sizeof(struct i387_fxsave_struct) - after_mxcsr_mask); | ||
2899 | } | ||
2900 | EXPORT_SYMBOL_GPL(fx_init); | ||
2901 | |||
2902 | void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) | ||
2903 | { | ||
2904 | if (!vcpu->fpu_active || vcpu->guest_fpu_loaded) | ||
2905 | return; | ||
2906 | |||
2907 | vcpu->guest_fpu_loaded = 1; | ||
2908 | fx_save(&vcpu->arch.host_fx_image); | ||
2909 | fx_restore(&vcpu->arch.guest_fx_image); | ||
2910 | } | ||
2911 | EXPORT_SYMBOL_GPL(kvm_load_guest_fpu); | ||
2912 | |||
2913 | void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) | ||
2914 | { | ||
2915 | if (!vcpu->guest_fpu_loaded) | ||
2916 | return; | ||
2917 | |||
2918 | vcpu->guest_fpu_loaded = 0; | ||
2919 | fx_save(&vcpu->arch.guest_fx_image); | ||
2920 | fx_restore(&vcpu->arch.host_fx_image); | ||
2921 | ++vcpu->stat.fpu_reload; | ||
2922 | } | ||
2923 | EXPORT_SYMBOL_GPL(kvm_put_guest_fpu); | ||
2924 | |||
2925 | void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) | ||
2926 | { | ||
2927 | kvm_x86_ops->vcpu_free(vcpu); | ||
2928 | } | ||
2929 | |||
2930 | struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, | ||
2931 | unsigned int id) | ||
2932 | { | ||
2933 | return kvm_x86_ops->vcpu_create(kvm, id); | ||
2934 | } | ||
2935 | |||
2936 | int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) | ||
2937 | { | ||
2938 | int r; | ||
2939 | |||
2940 | /* We do fxsave: this must be aligned. */ | ||
2941 | BUG_ON((unsigned long)&vcpu->arch.host_fx_image & 0xF); | ||
2942 | |||
2943 | vcpu_load(vcpu); | ||
2944 | r = kvm_arch_vcpu_reset(vcpu); | ||
2945 | if (r == 0) | ||
2946 | r = kvm_mmu_setup(vcpu); | ||
2947 | vcpu_put(vcpu); | ||
2948 | if (r < 0) | ||
2949 | goto free_vcpu; | ||
2950 | |||
2951 | return 0; | ||
2952 | free_vcpu: | ||
2953 | kvm_x86_ops->vcpu_free(vcpu); | ||
2954 | return r; | ||
2955 | } | ||
2956 | |||
2957 | void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) | ||
2958 | { | ||
2959 | vcpu_load(vcpu); | ||
2960 | kvm_mmu_unload(vcpu); | ||
2961 | vcpu_put(vcpu); | ||
2962 | |||
2963 | kvm_x86_ops->vcpu_free(vcpu); | ||
2964 | } | ||
2965 | |||
2966 | int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu) | ||
2967 | { | ||
2968 | return kvm_x86_ops->vcpu_reset(vcpu); | ||
2969 | } | ||
2970 | |||
2971 | void kvm_arch_hardware_enable(void *garbage) | ||
2972 | { | ||
2973 | kvm_x86_ops->hardware_enable(garbage); | ||
2974 | } | ||
2975 | |||
2976 | void kvm_arch_hardware_disable(void *garbage) | ||
2977 | { | ||
2978 | kvm_x86_ops->hardware_disable(garbage); | ||
2979 | } | ||
2980 | |||
2981 | int kvm_arch_hardware_setup(void) | ||
2982 | { | ||
2983 | return kvm_x86_ops->hardware_setup(); | ||
2984 | } | ||
2985 | |||
2986 | void kvm_arch_hardware_unsetup(void) | ||
2987 | { | ||
2988 | kvm_x86_ops->hardware_unsetup(); | ||
2989 | } | ||
2990 | |||
2991 | void kvm_arch_check_processor_compat(void *rtn) | ||
2992 | { | ||
2993 | kvm_x86_ops->check_processor_compatibility(rtn); | ||
2994 | } | ||
2995 | |||
2996 | int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) | ||
2997 | { | ||
2998 | struct page *page; | ||
2999 | struct kvm *kvm; | ||
3000 | int r; | ||
3001 | |||
3002 | BUG_ON(vcpu->kvm == NULL); | ||
3003 | kvm = vcpu->kvm; | ||
3004 | |||
3005 | vcpu->arch.mmu.root_hpa = INVALID_PAGE; | ||
3006 | if (!irqchip_in_kernel(kvm) || vcpu->vcpu_id == 0) | ||
3007 | vcpu->arch.mp_state = VCPU_MP_STATE_RUNNABLE; | ||
3008 | else | ||
3009 | vcpu->arch.mp_state = VCPU_MP_STATE_UNINITIALIZED; | ||
3010 | |||
3011 | page = alloc_page(GFP_KERNEL | __GFP_ZERO); | ||
3012 | if (!page) { | ||
3013 | r = -ENOMEM; | ||
3014 | goto fail; | ||
3015 | } | ||
3016 | vcpu->arch.pio_data = page_address(page); | ||
3017 | |||
3018 | r = kvm_mmu_create(vcpu); | ||
3019 | if (r < 0) | ||
3020 | goto fail_free_pio_data; | ||
3021 | |||
3022 | if (irqchip_in_kernel(kvm)) { | ||
3023 | r = kvm_create_lapic(vcpu); | ||
3024 | if (r < 0) | ||
3025 | goto fail_mmu_destroy; | ||
3026 | } | ||
3027 | |||
3028 | return 0; | ||
3029 | |||
3030 | fail_mmu_destroy: | ||
3031 | kvm_mmu_destroy(vcpu); | ||
3032 | fail_free_pio_data: | ||
3033 | free_page((unsigned long)vcpu->arch.pio_data); | ||
3034 | fail: | ||
3035 | return r; | ||
3036 | } | ||
3037 | |||
3038 | void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) | ||
3039 | { | ||
3040 | kvm_free_lapic(vcpu); | ||
3041 | kvm_mmu_destroy(vcpu); | ||
3042 | free_page((unsigned long)vcpu->arch.pio_data); | ||
3043 | } | ||
3044 | |||
3045 | struct kvm *kvm_arch_create_vm(void) | ||
3046 | { | ||
3047 | struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL); | ||
3048 | |||
3049 | if (!kvm) | ||
3050 | return ERR_PTR(-ENOMEM); | ||
3051 | |||
3052 | INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); | ||
3053 | |||
3054 | return kvm; | ||
3055 | } | ||
3056 | |||
3057 | static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu) | ||
3058 | { | ||
3059 | vcpu_load(vcpu); | ||
3060 | kvm_mmu_unload(vcpu); | ||
3061 | vcpu_put(vcpu); | ||
3062 | } | ||
3063 | |||
3064 | static void kvm_free_vcpus(struct kvm *kvm) | ||
3065 | { | ||
3066 | unsigned int i; | ||
3067 | |||
3068 | /* | ||
3069 | * Unpin any mmu pages first. | ||
3070 | */ | ||
3071 | for (i = 0; i < KVM_MAX_VCPUS; ++i) | ||
3072 | if (kvm->vcpus[i]) | ||
3073 | kvm_unload_vcpu_mmu(kvm->vcpus[i]); | ||
3074 | for (i = 0; i < KVM_MAX_VCPUS; ++i) { | ||
3075 | if (kvm->vcpus[i]) { | ||
3076 | kvm_arch_vcpu_free(kvm->vcpus[i]); | ||
3077 | kvm->vcpus[i] = NULL; | ||
3078 | } | ||
3079 | } | ||
3080 | |||
3081 | } | ||
3082 | |||
3083 | void kvm_arch_destroy_vm(struct kvm *kvm) | ||
3084 | { | ||
3085 | kfree(kvm->arch.vpic); | ||
3086 | kfree(kvm->arch.vioapic); | ||
3087 | kvm_free_vcpus(kvm); | ||
3088 | kvm_free_physmem(kvm); | ||
3089 | kfree(kvm); | ||
3090 | } | ||
3091 | |||
3092 | int kvm_arch_set_memory_region(struct kvm *kvm, | ||
3093 | struct kvm_userspace_memory_region *mem, | ||
3094 | struct kvm_memory_slot old, | ||
3095 | int user_alloc) | ||
3096 | { | ||
3097 | int npages = mem->memory_size >> PAGE_SHIFT; | ||
3098 | struct kvm_memory_slot *memslot = &kvm->memslots[mem->slot]; | ||
3099 | |||
3100 | /*To keep backward compatibility with older userspace, | ||
3101 | *x86 needs to hanlde !user_alloc case. | ||
3102 | */ | ||
3103 | if (!user_alloc) { | ||
3104 | if (npages && !old.rmap) { | ||
3105 | down_write(¤t->mm->mmap_sem); | ||
3106 | memslot->userspace_addr = do_mmap(NULL, 0, | ||
3107 | npages * PAGE_SIZE, | ||
3108 | PROT_READ | PROT_WRITE, | ||
3109 | MAP_SHARED | MAP_ANONYMOUS, | ||
3110 | 0); | ||
3111 | up_write(¤t->mm->mmap_sem); | ||
3112 | |||
3113 | if (IS_ERR((void *)memslot->userspace_addr)) | ||
3114 | return PTR_ERR((void *)memslot->userspace_addr); | ||
3115 | } else { | ||
3116 | if (!old.user_alloc && old.rmap) { | ||
3117 | int ret; | ||
3118 | |||
3119 | down_write(¤t->mm->mmap_sem); | ||
3120 | ret = do_munmap(current->mm, old.userspace_addr, | ||
3121 | old.npages * PAGE_SIZE); | ||
3122 | up_write(¤t->mm->mmap_sem); | ||
3123 | if (ret < 0) | ||
3124 | printk(KERN_WARNING | ||
3125 | "kvm_vm_ioctl_set_memory_region: " | ||
3126 | "failed to munmap memory\n"); | ||
3127 | } | ||
3128 | } | ||
3129 | } | ||
3130 | |||
3131 | if (!kvm->arch.n_requested_mmu_pages) { | ||
3132 | unsigned int nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm); | ||
3133 | kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages); | ||
3134 | } | ||
3135 | |||
3136 | kvm_mmu_slot_remove_write_access(kvm, mem->slot); | ||
3137 | kvm_flush_remote_tlbs(kvm); | ||
3138 | |||
3139 | return 0; | ||
3140 | } | ||
3141 | |||
3142 | int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) | ||
3143 | { | ||
3144 | return vcpu->arch.mp_state == VCPU_MP_STATE_RUNNABLE | ||
3145 | || vcpu->arch.mp_state == VCPU_MP_STATE_SIPI_RECEIVED; | ||
3146 | } | ||
diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c new file mode 100644 index 000000000000..79586003397a --- /dev/null +++ b/arch/x86/kvm/x86_emulate.c | |||
@@ -0,0 +1,1912 @@ | |||
1 | /****************************************************************************** | ||
2 | * x86_emulate.c | ||
3 | * | ||
4 | * Generic x86 (32-bit and 64-bit) instruction decoder and emulator. | ||
5 | * | ||
6 | * Copyright (c) 2005 Keir Fraser | ||
7 | * | ||
8 | * Linux coding style, mod r/m decoder, segment base fixes, real-mode | ||
9 | * privileged instructions: | ||
10 | * | ||
11 | * Copyright (C) 2006 Qumranet | ||
12 | * | ||
13 | * Avi Kivity <avi@qumranet.com> | ||
14 | * Yaniv Kamay <yaniv@qumranet.com> | ||
15 | * | ||
16 | * This work is licensed under the terms of the GNU GPL, version 2. See | ||
17 | * the COPYING file in the top-level directory. | ||
18 | * | ||
19 | * From: xen-unstable 10676:af9809f51f81a3c43f276f00c81a52ef558afda4 | ||
20 | */ | ||
21 | |||
22 | #ifndef __KERNEL__ | ||
23 | #include <stdio.h> | ||
24 | #include <stdint.h> | ||
25 | #include <public/xen.h> | ||
26 | #define DPRINTF(_f, _a ...) printf(_f , ## _a) | ||
27 | #else | ||
28 | #include <linux/kvm_host.h> | ||
29 | #define DPRINTF(x...) do {} while (0) | ||
30 | #endif | ||
31 | #include <linux/module.h> | ||
32 | #include <asm/kvm_x86_emulate.h> | ||
33 | |||
34 | /* | ||
35 | * Opcode effective-address decode tables. | ||
36 | * Note that we only emulate instructions that have at least one memory | ||
37 | * operand (excluding implicit stack references). We assume that stack | ||
38 | * references and instruction fetches will never occur in special memory | ||
39 | * areas that require emulation. So, for example, 'mov <imm>,<reg>' need | ||
40 | * not be handled. | ||
41 | */ | ||
42 | |||
43 | /* Operand sizes: 8-bit operands or specified/overridden size. */ | ||
44 | #define ByteOp (1<<0) /* 8-bit operands. */ | ||
45 | /* Destination operand type. */ | ||
46 | #define ImplicitOps (1<<1) /* Implicit in opcode. No generic decode. */ | ||
47 | #define DstReg (2<<1) /* Register operand. */ | ||
48 | #define DstMem (3<<1) /* Memory operand. */ | ||
49 | #define DstMask (3<<1) | ||
50 | /* Source operand type. */ | ||
51 | #define SrcNone (0<<3) /* No source operand. */ | ||
52 | #define SrcImplicit (0<<3) /* Source operand is implicit in the opcode. */ | ||
53 | #define SrcReg (1<<3) /* Register operand. */ | ||
54 | #define SrcMem (2<<3) /* Memory operand. */ | ||
55 | #define SrcMem16 (3<<3) /* Memory operand (16-bit). */ | ||
56 | #define SrcMem32 (4<<3) /* Memory operand (32-bit). */ | ||
57 | #define SrcImm (5<<3) /* Immediate operand. */ | ||
58 | #define SrcImmByte (6<<3) /* 8-bit sign-extended immediate operand. */ | ||
59 | #define SrcMask (7<<3) | ||
60 | /* Generic ModRM decode. */ | ||
61 | #define ModRM (1<<6) | ||
62 | /* Destination is only written; never read. */ | ||
63 | #define Mov (1<<7) | ||
64 | #define BitOp (1<<8) | ||
65 | #define MemAbs (1<<9) /* Memory operand is absolute displacement */ | ||
66 | #define String (1<<10) /* String instruction (rep capable) */ | ||
67 | #define Stack (1<<11) /* Stack instruction (push/pop) */ | ||
68 | |||
69 | static u16 opcode_table[256] = { | ||
70 | /* 0x00 - 0x07 */ | ||
71 | ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, | ||
72 | ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, | ||
73 | 0, 0, 0, 0, | ||
74 | /* 0x08 - 0x0F */ | ||
75 | ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, | ||
76 | ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, | ||
77 | 0, 0, 0, 0, | ||
78 | /* 0x10 - 0x17 */ | ||
79 | ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, | ||
80 | ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, | ||
81 | 0, 0, 0, 0, | ||
82 | /* 0x18 - 0x1F */ | ||
83 | ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, | ||
84 | ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, | ||
85 | 0, 0, 0, 0, | ||
86 | /* 0x20 - 0x27 */ | ||
87 | ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, | ||
88 | ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, | ||
89 | SrcImmByte, SrcImm, 0, 0, | ||
90 | /* 0x28 - 0x2F */ | ||
91 | ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, | ||
92 | ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, | ||
93 | 0, 0, 0, 0, | ||
94 | /* 0x30 - 0x37 */ | ||
95 | ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, | ||
96 | ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, | ||
97 | 0, 0, 0, 0, | ||
98 | /* 0x38 - 0x3F */ | ||
99 | ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, | ||
100 | ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, | ||
101 | 0, 0, 0, 0, | ||
102 | /* 0x40 - 0x47 */ | ||
103 | DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, | ||
104 | /* 0x48 - 0x4F */ | ||
105 | DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, | ||
106 | /* 0x50 - 0x57 */ | ||
107 | SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, | ||
108 | SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, | ||
109 | /* 0x58 - 0x5F */ | ||
110 | DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack, | ||
111 | DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack, | ||
112 | /* 0x60 - 0x67 */ | ||
113 | 0, 0, 0, DstReg | SrcMem32 | ModRM | Mov /* movsxd (x86/64) */ , | ||
114 | 0, 0, 0, 0, | ||
115 | /* 0x68 - 0x6F */ | ||
116 | 0, 0, ImplicitOps | Mov | Stack, 0, | ||
117 | SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* insb, insw/insd */ | ||
118 | SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* outsb, outsw/outsd */ | ||
119 | /* 0x70 - 0x77 */ | ||
120 | ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, | ||
121 | ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, | ||
122 | /* 0x78 - 0x7F */ | ||
123 | ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, | ||
124 | ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, | ||
125 | /* 0x80 - 0x87 */ | ||
126 | ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM, | ||
127 | ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM, | ||
128 | ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, | ||
129 | ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, | ||
130 | /* 0x88 - 0x8F */ | ||
131 | ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov, | ||
132 | ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, | ||
133 | 0, ModRM | DstReg, 0, DstMem | SrcNone | ModRM | Mov | Stack, | ||
134 | /* 0x90 - 0x9F */ | ||
135 | 0, 0, 0, 0, 0, 0, 0, 0, | ||
136 | 0, 0, 0, 0, ImplicitOps | Stack, ImplicitOps | Stack, 0, 0, | ||
137 | /* 0xA0 - 0xA7 */ | ||
138 | ByteOp | DstReg | SrcMem | Mov | MemAbs, DstReg | SrcMem | Mov | MemAbs, | ||
139 | ByteOp | DstMem | SrcReg | Mov | MemAbs, DstMem | SrcReg | Mov | MemAbs, | ||
140 | ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String, | ||
141 | ByteOp | ImplicitOps | String, ImplicitOps | String, | ||
142 | /* 0xA8 - 0xAF */ | ||
143 | 0, 0, ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String, | ||
144 | ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String, | ||
145 | ByteOp | ImplicitOps | String, ImplicitOps | String, | ||
146 | /* 0xB0 - 0xBF */ | ||
147 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | ||
148 | /* 0xC0 - 0xC7 */ | ||
149 | ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM, | ||
150 | 0, ImplicitOps | Stack, 0, 0, | ||
151 | ByteOp | DstMem | SrcImm | ModRM | Mov, DstMem | SrcImm | ModRM | Mov, | ||
152 | /* 0xC8 - 0xCF */ | ||
153 | 0, 0, 0, 0, 0, 0, 0, 0, | ||
154 | /* 0xD0 - 0xD7 */ | ||
155 | ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM, | ||
156 | ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM, | ||
157 | 0, 0, 0, 0, | ||
158 | /* 0xD8 - 0xDF */ | ||
159 | 0, 0, 0, 0, 0, 0, 0, 0, | ||
160 | /* 0xE0 - 0xE7 */ | ||
161 | 0, 0, 0, 0, 0, 0, 0, 0, | ||
162 | /* 0xE8 - 0xEF */ | ||
163 | ImplicitOps | Stack, SrcImm|ImplicitOps, 0, SrcImmByte|ImplicitOps, | ||
164 | 0, 0, 0, 0, | ||
165 | /* 0xF0 - 0xF7 */ | ||
166 | 0, 0, 0, 0, | ||
167 | ImplicitOps, ImplicitOps, | ||
168 | ByteOp | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM, | ||
169 | /* 0xF8 - 0xFF */ | ||
170 | ImplicitOps, 0, ImplicitOps, ImplicitOps, | ||
171 | 0, 0, ByteOp | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM | ||
172 | }; | ||
173 | |||
174 | static u16 twobyte_table[256] = { | ||
175 | /* 0x00 - 0x0F */ | ||
176 | 0, SrcMem | ModRM | DstReg, 0, 0, 0, 0, ImplicitOps, 0, | ||
177 | ImplicitOps, ImplicitOps, 0, 0, 0, ImplicitOps | ModRM, 0, 0, | ||
178 | /* 0x10 - 0x1F */ | ||
179 | 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, 0, | ||
180 | /* 0x20 - 0x2F */ | ||
181 | ModRM | ImplicitOps, ModRM, ModRM | ImplicitOps, ModRM, 0, 0, 0, 0, | ||
182 | 0, 0, 0, 0, 0, 0, 0, 0, | ||
183 | /* 0x30 - 0x3F */ | ||
184 | ImplicitOps, 0, ImplicitOps, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | ||
185 | /* 0x40 - 0x47 */ | ||
186 | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, | ||
187 | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, | ||
188 | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, | ||
189 | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, | ||
190 | /* 0x48 - 0x4F */ | ||
191 | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, | ||
192 | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, | ||
193 | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, | ||
194 | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, | ||
195 | /* 0x50 - 0x5F */ | ||
196 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | ||
197 | /* 0x60 - 0x6F */ | ||
198 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | ||
199 | /* 0x70 - 0x7F */ | ||
200 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | ||
201 | /* 0x80 - 0x8F */ | ||
202 | ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, | ||
203 | ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, | ||
204 | ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, | ||
205 | ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, | ||
206 | /* 0x90 - 0x9F */ | ||
207 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | ||
208 | /* 0xA0 - 0xA7 */ | ||
209 | 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 0, 0, 0, 0, | ||
210 | /* 0xA8 - 0xAF */ | ||
211 | 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 0, 0, 0, 0, | ||
212 | /* 0xB0 - 0xB7 */ | ||
213 | ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 0, | ||
214 | DstMem | SrcReg | ModRM | BitOp, | ||
215 | 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov, | ||
216 | DstReg | SrcMem16 | ModRM | Mov, | ||
217 | /* 0xB8 - 0xBF */ | ||
218 | 0, 0, DstMem | SrcImmByte | ModRM, DstMem | SrcReg | ModRM | BitOp, | ||
219 | 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov, | ||
220 | DstReg | SrcMem16 | ModRM | Mov, | ||
221 | /* 0xC0 - 0xCF */ | ||
222 | 0, 0, 0, DstMem | SrcReg | ModRM | Mov, 0, 0, 0, ImplicitOps | ModRM, | ||
223 | 0, 0, 0, 0, 0, 0, 0, 0, | ||
224 | /* 0xD0 - 0xDF */ | ||
225 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | ||
226 | /* 0xE0 - 0xEF */ | ||
227 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | ||
228 | /* 0xF0 - 0xFF */ | ||
229 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 | ||
230 | }; | ||
231 | |||
232 | /* EFLAGS bit definitions. */ | ||
233 | #define EFLG_OF (1<<11) | ||
234 | #define EFLG_DF (1<<10) | ||
235 | #define EFLG_SF (1<<7) | ||
236 | #define EFLG_ZF (1<<6) | ||
237 | #define EFLG_AF (1<<4) | ||
238 | #define EFLG_PF (1<<2) | ||
239 | #define EFLG_CF (1<<0) | ||
240 | |||
241 | /* | ||
242 | * Instruction emulation: | ||
243 | * Most instructions are emulated directly via a fragment of inline assembly | ||
244 | * code. This allows us to save/restore EFLAGS and thus very easily pick up | ||
245 | * any modified flags. | ||
246 | */ | ||
247 | |||
248 | #if defined(CONFIG_X86_64) | ||
249 | #define _LO32 "k" /* force 32-bit operand */ | ||
250 | #define _STK "%%rsp" /* stack pointer */ | ||
251 | #elif defined(__i386__) | ||
252 | #define _LO32 "" /* force 32-bit operand */ | ||
253 | #define _STK "%%esp" /* stack pointer */ | ||
254 | #endif | ||
255 | |||
256 | /* | ||
257 | * These EFLAGS bits are restored from saved value during emulation, and | ||
258 | * any changes are written back to the saved value after emulation. | ||
259 | */ | ||
260 | #define EFLAGS_MASK (EFLG_OF|EFLG_SF|EFLG_ZF|EFLG_AF|EFLG_PF|EFLG_CF) | ||
261 | |||
262 | /* Before executing instruction: restore necessary bits in EFLAGS. */ | ||
263 | #define _PRE_EFLAGS(_sav, _msk, _tmp) \ | ||
264 | /* EFLAGS = (_sav & _msk) | (EFLAGS & ~_msk); _sav &= ~_msk; */ \ | ||
265 | "movl %"_sav",%"_LO32 _tmp"; " \ | ||
266 | "push %"_tmp"; " \ | ||
267 | "push %"_tmp"; " \ | ||
268 | "movl %"_msk",%"_LO32 _tmp"; " \ | ||
269 | "andl %"_LO32 _tmp",("_STK"); " \ | ||
270 | "pushf; " \ | ||
271 | "notl %"_LO32 _tmp"; " \ | ||
272 | "andl %"_LO32 _tmp",("_STK"); " \ | ||
273 | "andl %"_LO32 _tmp","__stringify(BITS_PER_LONG/4)"("_STK"); " \ | ||
274 | "pop %"_tmp"; " \ | ||
275 | "orl %"_LO32 _tmp",("_STK"); " \ | ||
276 | "popf; " \ | ||
277 | "pop %"_sav"; " | ||
278 | |||
279 | /* After executing instruction: write-back necessary bits in EFLAGS. */ | ||
280 | #define _POST_EFLAGS(_sav, _msk, _tmp) \ | ||
281 | /* _sav |= EFLAGS & _msk; */ \ | ||
282 | "pushf; " \ | ||
283 | "pop %"_tmp"; " \ | ||
284 | "andl %"_msk",%"_LO32 _tmp"; " \ | ||
285 | "orl %"_LO32 _tmp",%"_sav"; " | ||
286 | |||
287 | /* Raw emulation: instruction has two explicit operands. */ | ||
288 | #define __emulate_2op_nobyte(_op,_src,_dst,_eflags,_wx,_wy,_lx,_ly,_qx,_qy) \ | ||
289 | do { \ | ||
290 | unsigned long _tmp; \ | ||
291 | \ | ||
292 | switch ((_dst).bytes) { \ | ||
293 | case 2: \ | ||
294 | __asm__ __volatile__ ( \ | ||
295 | _PRE_EFLAGS("0", "4", "2") \ | ||
296 | _op"w %"_wx"3,%1; " \ | ||
297 | _POST_EFLAGS("0", "4", "2") \ | ||
298 | : "=m" (_eflags), "=m" ((_dst).val), \ | ||
299 | "=&r" (_tmp) \ | ||
300 | : _wy ((_src).val), "i" (EFLAGS_MASK)); \ | ||
301 | break; \ | ||
302 | case 4: \ | ||
303 | __asm__ __volatile__ ( \ | ||
304 | _PRE_EFLAGS("0", "4", "2") \ | ||
305 | _op"l %"_lx"3,%1; " \ | ||
306 | _POST_EFLAGS("0", "4", "2") \ | ||
307 | : "=m" (_eflags), "=m" ((_dst).val), \ | ||
308 | "=&r" (_tmp) \ | ||
309 | : _ly ((_src).val), "i" (EFLAGS_MASK)); \ | ||
310 | break; \ | ||
311 | case 8: \ | ||
312 | __emulate_2op_8byte(_op, _src, _dst, \ | ||
313 | _eflags, _qx, _qy); \ | ||
314 | break; \ | ||
315 | } \ | ||
316 | } while (0) | ||
317 | |||
318 | #define __emulate_2op(_op,_src,_dst,_eflags,_bx,_by,_wx,_wy,_lx,_ly,_qx,_qy) \ | ||
319 | do { \ | ||
320 | unsigned long _tmp; \ | ||
321 | switch ((_dst).bytes) { \ | ||
322 | case 1: \ | ||
323 | __asm__ __volatile__ ( \ | ||
324 | _PRE_EFLAGS("0", "4", "2") \ | ||
325 | _op"b %"_bx"3,%1; " \ | ||
326 | _POST_EFLAGS("0", "4", "2") \ | ||
327 | : "=m" (_eflags), "=m" ((_dst).val), \ | ||
328 | "=&r" (_tmp) \ | ||
329 | : _by ((_src).val), "i" (EFLAGS_MASK)); \ | ||
330 | break; \ | ||
331 | default: \ | ||
332 | __emulate_2op_nobyte(_op, _src, _dst, _eflags, \ | ||
333 | _wx, _wy, _lx, _ly, _qx, _qy); \ | ||
334 | break; \ | ||
335 | } \ | ||
336 | } while (0) | ||
337 | |||
338 | /* Source operand is byte-sized and may be restricted to just %cl. */ | ||
339 | #define emulate_2op_SrcB(_op, _src, _dst, _eflags) \ | ||
340 | __emulate_2op(_op, _src, _dst, _eflags, \ | ||
341 | "b", "c", "b", "c", "b", "c", "b", "c") | ||
342 | |||
343 | /* Source operand is byte, word, long or quad sized. */ | ||
344 | #define emulate_2op_SrcV(_op, _src, _dst, _eflags) \ | ||
345 | __emulate_2op(_op, _src, _dst, _eflags, \ | ||
346 | "b", "q", "w", "r", _LO32, "r", "", "r") | ||
347 | |||
348 | /* Source operand is word, long or quad sized. */ | ||
349 | #define emulate_2op_SrcV_nobyte(_op, _src, _dst, _eflags) \ | ||
350 | __emulate_2op_nobyte(_op, _src, _dst, _eflags, \ | ||
351 | "w", "r", _LO32, "r", "", "r") | ||
352 | |||
353 | /* Instruction has only one explicit operand (no source operand). */ | ||
354 | #define emulate_1op(_op, _dst, _eflags) \ | ||
355 | do { \ | ||
356 | unsigned long _tmp; \ | ||
357 | \ | ||
358 | switch ((_dst).bytes) { \ | ||
359 | case 1: \ | ||
360 | __asm__ __volatile__ ( \ | ||
361 | _PRE_EFLAGS("0", "3", "2") \ | ||
362 | _op"b %1; " \ | ||
363 | _POST_EFLAGS("0", "3", "2") \ | ||
364 | : "=m" (_eflags), "=m" ((_dst).val), \ | ||
365 | "=&r" (_tmp) \ | ||
366 | : "i" (EFLAGS_MASK)); \ | ||
367 | break; \ | ||
368 | case 2: \ | ||
369 | __asm__ __volatile__ ( \ | ||
370 | _PRE_EFLAGS("0", "3", "2") \ | ||
371 | _op"w %1; " \ | ||
372 | _POST_EFLAGS("0", "3", "2") \ | ||
373 | : "=m" (_eflags), "=m" ((_dst).val), \ | ||
374 | "=&r" (_tmp) \ | ||
375 | : "i" (EFLAGS_MASK)); \ | ||
376 | break; \ | ||
377 | case 4: \ | ||
378 | __asm__ __volatile__ ( \ | ||
379 | _PRE_EFLAGS("0", "3", "2") \ | ||
380 | _op"l %1; " \ | ||
381 | _POST_EFLAGS("0", "3", "2") \ | ||
382 | : "=m" (_eflags), "=m" ((_dst).val), \ | ||
383 | "=&r" (_tmp) \ | ||
384 | : "i" (EFLAGS_MASK)); \ | ||
385 | break; \ | ||
386 | case 8: \ | ||
387 | __emulate_1op_8byte(_op, _dst, _eflags); \ | ||
388 | break; \ | ||
389 | } \ | ||
390 | } while (0) | ||
391 | |||
392 | /* Emulate an instruction with quadword operands (x86/64 only). */ | ||
393 | #if defined(CONFIG_X86_64) | ||
394 | #define __emulate_2op_8byte(_op, _src, _dst, _eflags, _qx, _qy) \ | ||
395 | do { \ | ||
396 | __asm__ __volatile__ ( \ | ||
397 | _PRE_EFLAGS("0", "4", "2") \ | ||
398 | _op"q %"_qx"3,%1; " \ | ||
399 | _POST_EFLAGS("0", "4", "2") \ | ||
400 | : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \ | ||
401 | : _qy ((_src).val), "i" (EFLAGS_MASK)); \ | ||
402 | } while (0) | ||
403 | |||
404 | #define __emulate_1op_8byte(_op, _dst, _eflags) \ | ||
405 | do { \ | ||
406 | __asm__ __volatile__ ( \ | ||
407 | _PRE_EFLAGS("0", "3", "2") \ | ||
408 | _op"q %1; " \ | ||
409 | _POST_EFLAGS("0", "3", "2") \ | ||
410 | : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \ | ||
411 | : "i" (EFLAGS_MASK)); \ | ||
412 | } while (0) | ||
413 | |||
414 | #elif defined(__i386__) | ||
415 | #define __emulate_2op_8byte(_op, _src, _dst, _eflags, _qx, _qy) | ||
416 | #define __emulate_1op_8byte(_op, _dst, _eflags) | ||
417 | #endif /* __i386__ */ | ||
418 | |||
419 | /* Fetch next part of the instruction being emulated. */ | ||
420 | #define insn_fetch(_type, _size, _eip) \ | ||
421 | ({ unsigned long _x; \ | ||
422 | rc = do_insn_fetch(ctxt, ops, (_eip), &_x, (_size)); \ | ||
423 | if (rc != 0) \ | ||
424 | goto done; \ | ||
425 | (_eip) += (_size); \ | ||
426 | (_type)_x; \ | ||
427 | }) | ||
428 | |||
429 | /* Access/update address held in a register, based on addressing mode. */ | ||
430 | #define address_mask(reg) \ | ||
431 | ((c->ad_bytes == sizeof(unsigned long)) ? \ | ||
432 | (reg) : ((reg) & ((1UL << (c->ad_bytes << 3)) - 1))) | ||
433 | #define register_address(base, reg) \ | ||
434 | ((base) + address_mask(reg)) | ||
435 | #define register_address_increment(reg, inc) \ | ||
436 | do { \ | ||
437 | /* signed type ensures sign extension to long */ \ | ||
438 | int _inc = (inc); \ | ||
439 | if (c->ad_bytes == sizeof(unsigned long)) \ | ||
440 | (reg) += _inc; \ | ||
441 | else \ | ||
442 | (reg) = ((reg) & \ | ||
443 | ~((1UL << (c->ad_bytes << 3)) - 1)) | \ | ||
444 | (((reg) + _inc) & \ | ||
445 | ((1UL << (c->ad_bytes << 3)) - 1)); \ | ||
446 | } while (0) | ||
447 | |||
448 | #define JMP_REL(rel) \ | ||
449 | do { \ | ||
450 | register_address_increment(c->eip, rel); \ | ||
451 | } while (0) | ||
452 | |||
453 | static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt, | ||
454 | struct x86_emulate_ops *ops, | ||
455 | unsigned long linear, u8 *dest) | ||
456 | { | ||
457 | struct fetch_cache *fc = &ctxt->decode.fetch; | ||
458 | int rc; | ||
459 | int size; | ||
460 | |||
461 | if (linear < fc->start || linear >= fc->end) { | ||
462 | size = min(15UL, PAGE_SIZE - offset_in_page(linear)); | ||
463 | rc = ops->read_std(linear, fc->data, size, ctxt->vcpu); | ||
464 | if (rc) | ||
465 | return rc; | ||
466 | fc->start = linear; | ||
467 | fc->end = linear + size; | ||
468 | } | ||
469 | *dest = fc->data[linear - fc->start]; | ||
470 | return 0; | ||
471 | } | ||
472 | |||
473 | static int do_insn_fetch(struct x86_emulate_ctxt *ctxt, | ||
474 | struct x86_emulate_ops *ops, | ||
475 | unsigned long eip, void *dest, unsigned size) | ||
476 | { | ||
477 | int rc = 0; | ||
478 | |||
479 | eip += ctxt->cs_base; | ||
480 | while (size--) { | ||
481 | rc = do_fetch_insn_byte(ctxt, ops, eip++, dest++); | ||
482 | if (rc) | ||
483 | return rc; | ||
484 | } | ||
485 | return 0; | ||
486 | } | ||
487 | |||
488 | /* | ||
489 | * Given the 'reg' portion of a ModRM byte, and a register block, return a | ||
490 | * pointer into the block that addresses the relevant register. | ||
491 | * @highbyte_regs specifies whether to decode AH,CH,DH,BH. | ||
492 | */ | ||
493 | static void *decode_register(u8 modrm_reg, unsigned long *regs, | ||
494 | int highbyte_regs) | ||
495 | { | ||
496 | void *p; | ||
497 | |||
498 | p = ®s[modrm_reg]; | ||
499 | if (highbyte_regs && modrm_reg >= 4 && modrm_reg < 8) | ||
500 | p = (unsigned char *)®s[modrm_reg & 3] + 1; | ||
501 | return p; | ||
502 | } | ||
503 | |||
504 | static int read_descriptor(struct x86_emulate_ctxt *ctxt, | ||
505 | struct x86_emulate_ops *ops, | ||
506 | void *ptr, | ||
507 | u16 *size, unsigned long *address, int op_bytes) | ||
508 | { | ||
509 | int rc; | ||
510 | |||
511 | if (op_bytes == 2) | ||
512 | op_bytes = 3; | ||
513 | *address = 0; | ||
514 | rc = ops->read_std((unsigned long)ptr, (unsigned long *)size, 2, | ||
515 | ctxt->vcpu); | ||
516 | if (rc) | ||
517 | return rc; | ||
518 | rc = ops->read_std((unsigned long)ptr + 2, address, op_bytes, | ||
519 | ctxt->vcpu); | ||
520 | return rc; | ||
521 | } | ||
522 | |||
523 | static int test_cc(unsigned int condition, unsigned int flags) | ||
524 | { | ||
525 | int rc = 0; | ||
526 | |||
527 | switch ((condition & 15) >> 1) { | ||
528 | case 0: /* o */ | ||
529 | rc |= (flags & EFLG_OF); | ||
530 | break; | ||
531 | case 1: /* b/c/nae */ | ||
532 | rc |= (flags & EFLG_CF); | ||
533 | break; | ||
534 | case 2: /* z/e */ | ||
535 | rc |= (flags & EFLG_ZF); | ||
536 | break; | ||
537 | case 3: /* be/na */ | ||
538 | rc |= (flags & (EFLG_CF|EFLG_ZF)); | ||
539 | break; | ||
540 | case 4: /* s */ | ||
541 | rc |= (flags & EFLG_SF); | ||
542 | break; | ||
543 | case 5: /* p/pe */ | ||
544 | rc |= (flags & EFLG_PF); | ||
545 | break; | ||
546 | case 7: /* le/ng */ | ||
547 | rc |= (flags & EFLG_ZF); | ||
548 | /* fall through */ | ||
549 | case 6: /* l/nge */ | ||
550 | rc |= (!(flags & EFLG_SF) != !(flags & EFLG_OF)); | ||
551 | break; | ||
552 | } | ||
553 | |||
554 | /* Odd condition identifiers (lsb == 1) have inverted sense. */ | ||
555 | return (!!rc ^ (condition & 1)); | ||
556 | } | ||
557 | |||
558 | static void decode_register_operand(struct operand *op, | ||
559 | struct decode_cache *c, | ||
560 | int inhibit_bytereg) | ||
561 | { | ||
562 | unsigned reg = c->modrm_reg; | ||
563 | int highbyte_regs = c->rex_prefix == 0; | ||
564 | |||
565 | if (!(c->d & ModRM)) | ||
566 | reg = (c->b & 7) | ((c->rex_prefix & 1) << 3); | ||
567 | op->type = OP_REG; | ||
568 | if ((c->d & ByteOp) && !inhibit_bytereg) { | ||
569 | op->ptr = decode_register(reg, c->regs, highbyte_regs); | ||
570 | op->val = *(u8 *)op->ptr; | ||
571 | op->bytes = 1; | ||
572 | } else { | ||
573 | op->ptr = decode_register(reg, c->regs, 0); | ||
574 | op->bytes = c->op_bytes; | ||
575 | switch (op->bytes) { | ||
576 | case 2: | ||
577 | op->val = *(u16 *)op->ptr; | ||
578 | break; | ||
579 | case 4: | ||
580 | op->val = *(u32 *)op->ptr; | ||
581 | break; | ||
582 | case 8: | ||
583 | op->val = *(u64 *) op->ptr; | ||
584 | break; | ||
585 | } | ||
586 | } | ||
587 | op->orig_val = op->val; | ||
588 | } | ||
589 | |||
/*
 * Decode the instruction's ModRM byte, plus any SIB byte and displacement
 * it implies.  Fills in c->modrm_* and accumulates the (un-segmented)
 * effective address in c->modrm_ea.
 *
 * NOTE(review): insn_fetch() is a macro that jumps to the local 'done'
 * label when a byte fetch fails, so 'rc' can be set without any visible
 * assignment in this body.
 */
static int decode_modrm(struct x86_emulate_ctxt *ctxt,
			struct x86_emulate_ops *ops)
{
	struct decode_cache *c = &ctxt->decode;
	u8 sib;
	int index_reg = 0, base_reg = 0, scale, rip_relative = 0;
	int rc = 0;

	/* REX prefix bits extend the reg/index/base fields to 4 bits. */
	if (c->rex_prefix) {
		c->modrm_reg = (c->rex_prefix & 4) << 1;	/* REX.R */
		index_reg = (c->rex_prefix & 2) << 2; /* REX.X */
		c->modrm_rm = base_reg = (c->rex_prefix & 1) << 3; /* REG.B */
	}

	c->modrm = insn_fetch(u8, 1, c->eip);
	c->modrm_mod |= (c->modrm & 0xc0) >> 6;
	c->modrm_reg |= (c->modrm & 0x38) >> 3;
	c->modrm_rm |= (c->modrm & 0x07);
	c->modrm_ea = 0;
	c->use_modrm_ea = 1;

	/* Mod == 3 selects a register operand, not a memory one. */
	if (c->modrm_mod == 3) {
		c->modrm_val = *(unsigned long *)
			decode_register(c->modrm_rm, c->regs, c->d & ByteOp);
		return rc;
	}

	if (c->ad_bytes == 2) {
		unsigned bx = c->regs[VCPU_REGS_RBX];
		unsigned bp = c->regs[VCPU_REGS_RBP];
		unsigned si = c->regs[VCPU_REGS_RSI];
		unsigned di = c->regs[VCPU_REGS_RDI];

		/* 16-bit ModR/M decode. */
		switch (c->modrm_mod) {
		case 0:
			/* mod == 0, rm == 6 is a bare disp16, no base regs. */
			if (c->modrm_rm == 6)
				c->modrm_ea += insn_fetch(u16, 2, c->eip);
			break;
		case 1:
			c->modrm_ea += insn_fetch(s8, 1, c->eip);
			break;
		case 2:
			c->modrm_ea += insn_fetch(u16, 2, c->eip);
			break;
		}
		switch (c->modrm_rm) {
		case 0:
			c->modrm_ea += bx + si;
			break;
		case 1:
			c->modrm_ea += bx + di;
			break;
		case 2:
			c->modrm_ea += bp + si;
			break;
		case 3:
			c->modrm_ea += bp + di;
			break;
		case 4:
			c->modrm_ea += si;
			break;
		case 5:
			c->modrm_ea += di;
			break;
		case 6:
			if (c->modrm_mod != 0)
				c->modrm_ea += bp;
			break;
		case 7:
			c->modrm_ea += bx;
			break;
		}
		/* BP-relative forms default to the stack segment. */
		if (c->modrm_rm == 2 || c->modrm_rm == 3 ||
		    (c->modrm_rm == 6 && c->modrm_mod != 0))
			if (!c->override_base)
				c->override_base = &ctxt->ss_base;
		/* 16-bit effective addresses wrap at 64K. */
		c->modrm_ea = (u16)c->modrm_ea;
	} else {
		/* 32/64-bit ModR/M decode. */
		switch (c->modrm_rm) {
		case 4:
		case 12:
			/* rm == 4 (or 12 with REX.B): a SIB byte follows. */
			sib = insn_fetch(u8, 1, c->eip);
			index_reg |= (sib >> 3) & 7;
			base_reg |= sib & 7;
			scale = sib >> 6;

			switch (base_reg) {
			case 5:
				/* base 5 with mod 0 is disp32, no base. */
				if (c->modrm_mod != 0)
					c->modrm_ea += c->regs[base_reg];
				else
					c->modrm_ea +=
						insn_fetch(s32, 4, c->eip);
				break;
			default:
				c->modrm_ea += c->regs[base_reg];
			}
			switch (index_reg) {
			case 4:
				/* index 4 means "no index register". */
				break;
			default:
				c->modrm_ea += c->regs[index_reg] << scale;
			}
			break;
		case 5:
			if (c->modrm_mod != 0)
				c->modrm_ea += c->regs[c->modrm_rm];
			else if (ctxt->mode == X86EMUL_MODE_PROT64)
				/* mod 0, rm 5: RIP-relative in 64-bit mode. */
				rip_relative = 1;
			break;
		default:
			c->modrm_ea += c->regs[c->modrm_rm];
			break;
		}
		switch (c->modrm_mod) {
		case 0:
			if (c->modrm_rm == 5)
				c->modrm_ea += insn_fetch(s32, 4, c->eip);
			break;
		case 1:
			c->modrm_ea += insn_fetch(s8, 1, c->eip);
			break;
		case 2:
			c->modrm_ea += insn_fetch(s32, 4, c->eip);
			break;
		}
	}
	if (rip_relative) {
		/*
		 * RIP-relative addressing is relative to the end of the
		 * instruction; c->eip currently points past the
		 * displacement, so add the size of the immediate bytes
		 * that have not been fetched yet.
		 */
		c->modrm_ea += c->eip;
		switch (c->d & SrcMask) {
		case SrcImmByte:
			c->modrm_ea += 1;
			break;
		case SrcImm:
			if (c->d & ByteOp)
				c->modrm_ea += 1;
			else
				if (c->op_bytes == 8)
					/* 64-bit imms are 32 bits, sign-extended. */
					c->modrm_ea += 4;
				else
					c->modrm_ea += c->op_bytes;
		}
	}
done:
	return rc;
}
738 | |||
/*
 * Decode a moffs-style absolute memory operand (MemAbs, e.g. mov al,
 * moffs8): the address is an immediate whose width equals the
 * instruction's address size.  The result lands in c->modrm_ea.
 *
 * NOTE(review): insn_fetch() jumps to the local 'done' label on a
 * failed fetch, setting 'rc' as a side effect.
 */
static int decode_abs(struct x86_emulate_ctxt *ctxt,
		      struct x86_emulate_ops *ops)
{
	struct decode_cache *c = &ctxt->decode;
	int rc = 0;

	switch (c->ad_bytes) {
	case 2:
		c->modrm_ea = insn_fetch(u16, 2, c->eip);
		break;
	case 4:
		c->modrm_ea = insn_fetch(u32, 4, c->eip);
		break;
	case 8:
		c->modrm_ea = insn_fetch(u64, 8, c->eip);
		break;
	}
done:
	return rc;
}
759 | |||
/*
 * Decode one guest instruction at vcpu->arch.rip into ctxt->decode.
 *
 * Walks the instruction stream in order: legacy prefixes, optional REX
 * prefix, opcode byte(s), ModRM/SIB/displacement, then the source and
 * destination operands as directed by the opcode tables.  Returns 0 on
 * success, -1 when the instruction cannot be decoded/emulated.
 *
 * NOTE(review): insn_fetch() jumps to the local 'done' label on a
 * failed fetch, setting 'rc' as a side effect.
 */
int
x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
{
	struct decode_cache *c = &ctxt->decode;
	int rc = 0;
	int mode = ctxt->mode;
	int def_op_bytes, def_ad_bytes;

	/* Shadow copy of register state. Committed on successful emulation. */

	memset(c, 0, sizeof(struct decode_cache));
	c->eip = ctxt->vcpu->arch.rip;
	memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);

	/* Default operand/address sizes depend on the CPU mode. */
	switch (mode) {
	case X86EMUL_MODE_REAL:
	case X86EMUL_MODE_PROT16:
		def_op_bytes = def_ad_bytes = 2;
		break;
	case X86EMUL_MODE_PROT32:
		def_op_bytes = def_ad_bytes = 4;
		break;
#ifdef CONFIG_X86_64
	case X86EMUL_MODE_PROT64:
		def_op_bytes = 4;
		def_ad_bytes = 8;
		break;
#endif
	default:
		return -1;
	}

	c->op_bytes = def_op_bytes;
	c->ad_bytes = def_ad_bytes;

	/* Legacy prefixes. */
	for (;;) {
		switch (c->b = insn_fetch(u8, 1, c->eip)) {
		case 0x66:	/* operand-size override */
			/* switch between 2/4 bytes */
			c->op_bytes = def_op_bytes ^ 6;
			break;
		case 0x67:	/* address-size override */
			if (mode == X86EMUL_MODE_PROT64)
				/* switch between 4/8 bytes */
				c->ad_bytes = def_ad_bytes ^ 12;
			else
				/* switch between 2/4 bytes */
				c->ad_bytes = def_ad_bytes ^ 6;
			break;
		case 0x2e:	/* CS override */
			c->override_base = &ctxt->cs_base;
			break;
		case 0x3e:	/* DS override */
			c->override_base = &ctxt->ds_base;
			break;
		case 0x26:	/* ES override */
			c->override_base = &ctxt->es_base;
			break;
		case 0x64:	/* FS override */
			c->override_base = &ctxt->fs_base;
			break;
		case 0x65:	/* GS override */
			c->override_base = &ctxt->gs_base;
			break;
		case 0x36:	/* SS override */
			c->override_base = &ctxt->ss_base;
			break;
		case 0x40 ... 0x4f: /* REX */
			if (mode != X86EMUL_MODE_PROT64)
				goto done_prefixes;
			c->rex_prefix = c->b;
			/* 'continue' skips the rex_prefix reset below. */
			continue;
		case 0xf0:	/* LOCK */
			c->lock_prefix = 1;
			break;
		case 0xf2:	/* REPNE/REPNZ */
			c->rep_prefix = REPNE_PREFIX;
			break;
		case 0xf3:	/* REP/REPE/REPZ */
			c->rep_prefix = REPE_PREFIX;
			break;
		default:
			goto done_prefixes;
		}

		/* Any legacy prefix after a REX prefix nullifies its effect. */

		c->rex_prefix = 0;
	}

done_prefixes:

	/* REX prefix. */
	if (c->rex_prefix)
		if (c->rex_prefix & 8)
			c->op_bytes = 8;	/* REX.W */

	/* Opcode byte(s). */
	c->d = opcode_table[c->b];
	if (c->d == 0) {
		/* Two-byte opcode? */
		if (c->b == 0x0f) {
			c->twobyte = 1;
			c->b = insn_fetch(u8, 1, c->eip);
			c->d = twobyte_table[c->b];
		}

		/* Unrecognised? */
		if (c->d == 0) {
			DPRINTF("Cannot emulate %02x\n", c->b);
			return -1;
		}
	}

	/* Stack ops always use 64-bit operands in 64-bit mode. */
	if (mode == X86EMUL_MODE_PROT64 && (c->d & Stack))
		c->op_bytes = 8;

	/* ModRM and SIB bytes. */
	if (c->d & ModRM)
		rc = decode_modrm(ctxt, ops);
	else if (c->d & MemAbs)
		rc = decode_abs(ctxt, ops);
	if (rc)
		goto done;

	/*
	 * Resolve the effective segment base: DS unless overridden, and
	 * in 64-bit mode only FS/GS overrides are honoured (other
	 * segment bases are forced to zero by the architecture).
	 */
	if (!c->override_base)
		c->override_base = &ctxt->ds_base;
	if (mode == X86EMUL_MODE_PROT64 &&
	    c->override_base != &ctxt->fs_base &&
	    c->override_base != &ctxt->gs_base)
		c->override_base = NULL;

	if (c->override_base)
		c->modrm_ea += *c->override_base;

	/* Truncate the effective address to the address size. */
	if (c->ad_bytes != 8)
		c->modrm_ea = (u32)c->modrm_ea;
	/*
	 * Decode and fetch the source operand: register, memory
	 * or immediate.
	 */
	switch (c->d & SrcMask) {
	case SrcNone:
		break;
	case SrcReg:
		decode_register_operand(&c->src, c, 0);
		break;
	case SrcMem16:
		c->src.bytes = 2;
		goto srcmem_common;
	case SrcMem32:
		c->src.bytes = 4;
		goto srcmem_common;
	case SrcMem:
		c->src.bytes = (c->d & ByteOp) ? 1 :
		                                   c->op_bytes;
		/* Don't fetch the address for invlpg: it could be unmapped. */
		if (c->twobyte && c->b == 0x01 && c->modrm_reg == 7)
			break;
	srcmem_common:
		/*
		 * For instructions with a ModR/M byte, switch to register
		 * access if Mod = 3.
		 */
		if ((c->d & ModRM) && c->modrm_mod == 3) {
			c->src.type = OP_REG;
			break;
		}
		c->src.type = OP_MEM;
		break;
	case SrcImm:
		c->src.type = OP_IMM;
		c->src.ptr = (unsigned long *)c->eip;
		c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
		/* 64-bit forms still carry a 32-bit immediate. */
		if (c->src.bytes == 8)
			c->src.bytes = 4;
		/* NB. Immediates are sign-extended as necessary. */
		switch (c->src.bytes) {
		case 1:
			c->src.val = insn_fetch(s8, 1, c->eip);
			break;
		case 2:
			c->src.val = insn_fetch(s16, 2, c->eip);
			break;
		case 4:
			c->src.val = insn_fetch(s32, 4, c->eip);
			break;
		}
		break;
	case SrcImmByte:
		c->src.type = OP_IMM;
		c->src.ptr = (unsigned long *)c->eip;
		c->src.bytes = 1;
		c->src.val = insn_fetch(s8, 1, c->eip);
		break;
	}

	/* Decode and fetch the destination operand: register or memory. */
	switch (c->d & DstMask) {
	case ImplicitOps:
		/* Special instructions do their own operand decoding. */
		return 0;
	case DstReg:
		/* movzx/movsx (0f b6/b7) write a full-width register. */
		decode_register_operand(&c->dst, c,
			 c->twobyte && (c->b == 0xb6 || c->b == 0xb7));
		break;
	case DstMem:
		if ((c->d & ModRM) && c->modrm_mod == 3) {
			c->dst.type = OP_REG;
			break;
		}
		c->dst.type = OP_MEM;
		break;
	}

done:
	return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
}
979 | |||
980 | static inline void emulate_push(struct x86_emulate_ctxt *ctxt) | ||
981 | { | ||
982 | struct decode_cache *c = &ctxt->decode; | ||
983 | |||
984 | c->dst.type = OP_MEM; | ||
985 | c->dst.bytes = c->op_bytes; | ||
986 | c->dst.val = c->src.val; | ||
987 | register_address_increment(c->regs[VCPU_REGS_RSP], -c->op_bytes); | ||
988 | c->dst.ptr = (void *) register_address(ctxt->ss_base, | ||
989 | c->regs[VCPU_REGS_RSP]); | ||
990 | } | ||
991 | |||
992 | static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt, | ||
993 | struct x86_emulate_ops *ops) | ||
994 | { | ||
995 | struct decode_cache *c = &ctxt->decode; | ||
996 | int rc; | ||
997 | |||
998 | rc = ops->read_std(register_address(ctxt->ss_base, | ||
999 | c->regs[VCPU_REGS_RSP]), | ||
1000 | &c->dst.val, c->dst.bytes, ctxt->vcpu); | ||
1001 | if (rc != 0) | ||
1002 | return rc; | ||
1003 | |||
1004 | register_address_increment(c->regs[VCPU_REGS_RSP], c->dst.bytes); | ||
1005 | |||
1006 | return 0; | ||
1007 | } | ||
1008 | |||
1009 | static inline void emulate_grp2(struct x86_emulate_ctxt *ctxt) | ||
1010 | { | ||
1011 | struct decode_cache *c = &ctxt->decode; | ||
1012 | switch (c->modrm_reg) { | ||
1013 | case 0: /* rol */ | ||
1014 | emulate_2op_SrcB("rol", c->src, c->dst, ctxt->eflags); | ||
1015 | break; | ||
1016 | case 1: /* ror */ | ||
1017 | emulate_2op_SrcB("ror", c->src, c->dst, ctxt->eflags); | ||
1018 | break; | ||
1019 | case 2: /* rcl */ | ||
1020 | emulate_2op_SrcB("rcl", c->src, c->dst, ctxt->eflags); | ||
1021 | break; | ||
1022 | case 3: /* rcr */ | ||
1023 | emulate_2op_SrcB("rcr", c->src, c->dst, ctxt->eflags); | ||
1024 | break; | ||
1025 | case 4: /* sal/shl */ | ||
1026 | case 6: /* sal/shl */ | ||
1027 | emulate_2op_SrcB("sal", c->src, c->dst, ctxt->eflags); | ||
1028 | break; | ||
1029 | case 5: /* shr */ | ||
1030 | emulate_2op_SrcB("shr", c->src, c->dst, ctxt->eflags); | ||
1031 | break; | ||
1032 | case 7: /* sar */ | ||
1033 | emulate_2op_SrcB("sar", c->src, c->dst, ctxt->eflags); | ||
1034 | break; | ||
1035 | } | ||
1036 | } | ||
1037 | |||
/*
 * Grp3 (opcodes 0xf6/0xf7): test/not/neg, dispatched on the ModRM reg
 * field.  mul/imul/div/idiv (reg 4-7) are not implemented here and
 * return X86EMUL_UNHANDLEABLE.
 *
 * NOTE(review): insn_fetch() jumps to the local 'done' label on a
 * failed fetch, setting 'rc' as a side effect.
 */
static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt,
			       struct x86_emulate_ops *ops)
{
	struct decode_cache *c = &ctxt->decode;
	int rc = 0;

	switch (c->modrm_reg) {
	case 0 ... 1:	/* test */
		/*
		 * Special case in Grp3: test has an immediate
		 * source operand.
		 */
		c->src.type = OP_IMM;
		c->src.ptr = (unsigned long *)c->eip;
		c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
		/* 64-bit forms still carry a 32-bit immediate. */
		if (c->src.bytes == 8)
			c->src.bytes = 4;
		switch (c->src.bytes) {
		case 1:
			c->src.val = insn_fetch(s8, 1, c->eip);
			break;
		case 2:
			c->src.val = insn_fetch(s16, 2, c->eip);
			break;
		case 4:
			c->src.val = insn_fetch(s32, 4, c->eip);
			break;
		}
		emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags);
		break;
	case 2:	/* not */
		c->dst.val = ~c->dst.val;
		break;
	case 3:	/* neg */
		emulate_1op("neg", c->dst, ctxt->eflags);
		break;
	default:
		/* mul/imul/div/idiv: not emulated. */
		DPRINTF("Cannot emulate %02x\n", c->b);
		rc = X86EMUL_UNHANDLEABLE;
		break;
	}
done:
	return rc;
}
1082 | |||
1083 | static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt, | ||
1084 | struct x86_emulate_ops *ops) | ||
1085 | { | ||
1086 | struct decode_cache *c = &ctxt->decode; | ||
1087 | int rc; | ||
1088 | |||
1089 | switch (c->modrm_reg) { | ||
1090 | case 0: /* inc */ | ||
1091 | emulate_1op("inc", c->dst, ctxt->eflags); | ||
1092 | break; | ||
1093 | case 1: /* dec */ | ||
1094 | emulate_1op("dec", c->dst, ctxt->eflags); | ||
1095 | break; | ||
1096 | case 4: /* jmp abs */ | ||
1097 | if (c->b == 0xff) | ||
1098 | c->eip = c->dst.val; | ||
1099 | else { | ||
1100 | DPRINTF("Cannot emulate %02x\n", c->b); | ||
1101 | return X86EMUL_UNHANDLEABLE; | ||
1102 | } | ||
1103 | break; | ||
1104 | case 6: /* push */ | ||
1105 | |||
1106 | /* 64-bit mode: PUSH always pushes a 64-bit operand. */ | ||
1107 | |||
1108 | if (ctxt->mode == X86EMUL_MODE_PROT64) { | ||
1109 | c->dst.bytes = 8; | ||
1110 | rc = ops->read_std((unsigned long)c->dst.ptr, | ||
1111 | &c->dst.val, 8, ctxt->vcpu); | ||
1112 | if (rc != 0) | ||
1113 | return rc; | ||
1114 | } | ||
1115 | register_address_increment(c->regs[VCPU_REGS_RSP], | ||
1116 | -c->dst.bytes); | ||
1117 | rc = ops->write_emulated(register_address(ctxt->ss_base, | ||
1118 | c->regs[VCPU_REGS_RSP]), &c->dst.val, | ||
1119 | c->dst.bytes, ctxt->vcpu); | ||
1120 | if (rc != 0) | ||
1121 | return rc; | ||
1122 | c->dst.type = OP_NONE; | ||
1123 | break; | ||
1124 | default: | ||
1125 | DPRINTF("Cannot emulate %02x\n", c->b); | ||
1126 | return X86EMUL_UNHANDLEABLE; | ||
1127 | } | ||
1128 | return 0; | ||
1129 | } | ||
1130 | |||
1131 | static inline int emulate_grp9(struct x86_emulate_ctxt *ctxt, | ||
1132 | struct x86_emulate_ops *ops, | ||
1133 | unsigned long memop) | ||
1134 | { | ||
1135 | struct decode_cache *c = &ctxt->decode; | ||
1136 | u64 old, new; | ||
1137 | int rc; | ||
1138 | |||
1139 | rc = ops->read_emulated(memop, &old, 8, ctxt->vcpu); | ||
1140 | if (rc != 0) | ||
1141 | return rc; | ||
1142 | |||
1143 | if (((u32) (old >> 0) != (u32) c->regs[VCPU_REGS_RAX]) || | ||
1144 | ((u32) (old >> 32) != (u32) c->regs[VCPU_REGS_RDX])) { | ||
1145 | |||
1146 | c->regs[VCPU_REGS_RAX] = (u32) (old >> 0); | ||
1147 | c->regs[VCPU_REGS_RDX] = (u32) (old >> 32); | ||
1148 | ctxt->eflags &= ~EFLG_ZF; | ||
1149 | |||
1150 | } else { | ||
1151 | new = ((u64)c->regs[VCPU_REGS_RCX] << 32) | | ||
1152 | (u32) c->regs[VCPU_REGS_RBX]; | ||
1153 | |||
1154 | rc = ops->cmpxchg_emulated(memop, &old, &new, 8, ctxt->vcpu); | ||
1155 | if (rc != 0) | ||
1156 | return rc; | ||
1157 | ctxt->eflags |= EFLG_ZF; | ||
1158 | } | ||
1159 | return 0; | ||
1160 | } | ||
1161 | |||
1162 | static inline int writeback(struct x86_emulate_ctxt *ctxt, | ||
1163 | struct x86_emulate_ops *ops) | ||
1164 | { | ||
1165 | int rc; | ||
1166 | struct decode_cache *c = &ctxt->decode; | ||
1167 | |||
1168 | switch (c->dst.type) { | ||
1169 | case OP_REG: | ||
1170 | /* The 4-byte case *is* correct: | ||
1171 | * in 64-bit mode we zero-extend. | ||
1172 | */ | ||
1173 | switch (c->dst.bytes) { | ||
1174 | case 1: | ||
1175 | *(u8 *)c->dst.ptr = (u8)c->dst.val; | ||
1176 | break; | ||
1177 | case 2: | ||
1178 | *(u16 *)c->dst.ptr = (u16)c->dst.val; | ||
1179 | break; | ||
1180 | case 4: | ||
1181 | *c->dst.ptr = (u32)c->dst.val; | ||
1182 | break; /* 64b: zero-ext */ | ||
1183 | case 8: | ||
1184 | *c->dst.ptr = c->dst.val; | ||
1185 | break; | ||
1186 | } | ||
1187 | break; | ||
1188 | case OP_MEM: | ||
1189 | if (c->lock_prefix) | ||
1190 | rc = ops->cmpxchg_emulated( | ||
1191 | (unsigned long)c->dst.ptr, | ||
1192 | &c->dst.orig_val, | ||
1193 | &c->dst.val, | ||
1194 | c->dst.bytes, | ||
1195 | ctxt->vcpu); | ||
1196 | else | ||
1197 | rc = ops->write_emulated( | ||
1198 | (unsigned long)c->dst.ptr, | ||
1199 | &c->dst.val, | ||
1200 | c->dst.bytes, | ||
1201 | ctxt->vcpu); | ||
1202 | if (rc != 0) | ||
1203 | return rc; | ||
1204 | break; | ||
1205 | case OP_NONE: | ||
1206 | /* no writeback */ | ||
1207 | break; | ||
1208 | default: | ||
1209 | break; | ||
1210 | } | ||
1211 | return 0; | ||
1212 | } | ||
1213 | |||
1214 | int | ||
1215 | x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) | ||
1216 | { | ||
1217 | unsigned long memop = 0; | ||
1218 | u64 msr_data; | ||
1219 | unsigned long saved_eip = 0; | ||
1220 | struct decode_cache *c = &ctxt->decode; | ||
1221 | int rc = 0; | ||
1222 | |||
1223 | /* Shadow copy of register state. Committed on successful emulation. | ||
1224 | * NOTE: we can copy them from vcpu as x86_decode_insn() doesn't | ||
1225 | * modify them. | ||
1226 | */ | ||
1227 | |||
1228 | memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs); | ||
1229 | saved_eip = c->eip; | ||
1230 | |||
1231 | if (((c->d & ModRM) && (c->modrm_mod != 3)) || (c->d & MemAbs)) | ||
1232 | memop = c->modrm_ea; | ||
1233 | |||
1234 | if (c->rep_prefix && (c->d & String)) { | ||
1235 | /* All REP prefixes have the same first termination condition */ | ||
1236 | if (c->regs[VCPU_REGS_RCX] == 0) { | ||
1237 | ctxt->vcpu->arch.rip = c->eip; | ||
1238 | goto done; | ||
1239 | } | ||
1240 | /* The second termination condition only applies for REPE | ||
1241 | * and REPNE. Test if the repeat string operation prefix is | ||
1242 | * REPE/REPZ or REPNE/REPNZ and if it's the case it tests the | ||
1243 | * corresponding termination condition according to: | ||
1244 | * - if REPE/REPZ and ZF = 0 then done | ||
1245 | * - if REPNE/REPNZ and ZF = 1 then done | ||
1246 | */ | ||
1247 | if ((c->b == 0xa6) || (c->b == 0xa7) || | ||
1248 | (c->b == 0xae) || (c->b == 0xaf)) { | ||
1249 | if ((c->rep_prefix == REPE_PREFIX) && | ||
1250 | ((ctxt->eflags & EFLG_ZF) == 0)) { | ||
1251 | ctxt->vcpu->arch.rip = c->eip; | ||
1252 | goto done; | ||
1253 | } | ||
1254 | if ((c->rep_prefix == REPNE_PREFIX) && | ||
1255 | ((ctxt->eflags & EFLG_ZF) == EFLG_ZF)) { | ||
1256 | ctxt->vcpu->arch.rip = c->eip; | ||
1257 | goto done; | ||
1258 | } | ||
1259 | } | ||
1260 | c->regs[VCPU_REGS_RCX]--; | ||
1261 | c->eip = ctxt->vcpu->arch.rip; | ||
1262 | } | ||
1263 | |||
1264 | if (c->src.type == OP_MEM) { | ||
1265 | c->src.ptr = (unsigned long *)memop; | ||
1266 | c->src.val = 0; | ||
1267 | rc = ops->read_emulated((unsigned long)c->src.ptr, | ||
1268 | &c->src.val, | ||
1269 | c->src.bytes, | ||
1270 | ctxt->vcpu); | ||
1271 | if (rc != 0) | ||
1272 | goto done; | ||
1273 | c->src.orig_val = c->src.val; | ||
1274 | } | ||
1275 | |||
1276 | if ((c->d & DstMask) == ImplicitOps) | ||
1277 | goto special_insn; | ||
1278 | |||
1279 | |||
1280 | if (c->dst.type == OP_MEM) { | ||
1281 | c->dst.ptr = (unsigned long *)memop; | ||
1282 | c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; | ||
1283 | c->dst.val = 0; | ||
1284 | if (c->d & BitOp) { | ||
1285 | unsigned long mask = ~(c->dst.bytes * 8 - 1); | ||
1286 | |||
1287 | c->dst.ptr = (void *)c->dst.ptr + | ||
1288 | (c->src.val & mask) / 8; | ||
1289 | } | ||
1290 | if (!(c->d & Mov) && | ||
1291 | /* optimisation - avoid slow emulated read */ | ||
1292 | ((rc = ops->read_emulated((unsigned long)c->dst.ptr, | ||
1293 | &c->dst.val, | ||
1294 | c->dst.bytes, ctxt->vcpu)) != 0)) | ||
1295 | goto done; | ||
1296 | } | ||
1297 | c->dst.orig_val = c->dst.val; | ||
1298 | |||
1299 | special_insn: | ||
1300 | |||
1301 | if (c->twobyte) | ||
1302 | goto twobyte_insn; | ||
1303 | |||
1304 | switch (c->b) { | ||
1305 | case 0x00 ... 0x05: | ||
1306 | add: /* add */ | ||
1307 | emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags); | ||
1308 | break; | ||
1309 | case 0x08 ... 0x0d: | ||
1310 | or: /* or */ | ||
1311 | emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags); | ||
1312 | break; | ||
1313 | case 0x10 ... 0x15: | ||
1314 | adc: /* adc */ | ||
1315 | emulate_2op_SrcV("adc", c->src, c->dst, ctxt->eflags); | ||
1316 | break; | ||
1317 | case 0x18 ... 0x1d: | ||
1318 | sbb: /* sbb */ | ||
1319 | emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags); | ||
1320 | break; | ||
1321 | case 0x20 ... 0x23: | ||
1322 | and: /* and */ | ||
1323 | emulate_2op_SrcV("and", c->src, c->dst, ctxt->eflags); | ||
1324 | break; | ||
1325 | case 0x24: /* and al imm8 */ | ||
1326 | c->dst.type = OP_REG; | ||
1327 | c->dst.ptr = &c->regs[VCPU_REGS_RAX]; | ||
1328 | c->dst.val = *(u8 *)c->dst.ptr; | ||
1329 | c->dst.bytes = 1; | ||
1330 | c->dst.orig_val = c->dst.val; | ||
1331 | goto and; | ||
1332 | case 0x25: /* and ax imm16, or eax imm32 */ | ||
1333 | c->dst.type = OP_REG; | ||
1334 | c->dst.bytes = c->op_bytes; | ||
1335 | c->dst.ptr = &c->regs[VCPU_REGS_RAX]; | ||
1336 | if (c->op_bytes == 2) | ||
1337 | c->dst.val = *(u16 *)c->dst.ptr; | ||
1338 | else | ||
1339 | c->dst.val = *(u32 *)c->dst.ptr; | ||
1340 | c->dst.orig_val = c->dst.val; | ||
1341 | goto and; | ||
1342 | case 0x28 ... 0x2d: | ||
1343 | sub: /* sub */ | ||
1344 | emulate_2op_SrcV("sub", c->src, c->dst, ctxt->eflags); | ||
1345 | break; | ||
1346 | case 0x30 ... 0x35: | ||
1347 | xor: /* xor */ | ||
1348 | emulate_2op_SrcV("xor", c->src, c->dst, ctxt->eflags); | ||
1349 | break; | ||
1350 | case 0x38 ... 0x3d: | ||
1351 | cmp: /* cmp */ | ||
1352 | emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags); | ||
1353 | break; | ||
1354 | case 0x40 ... 0x47: /* inc r16/r32 */ | ||
1355 | emulate_1op("inc", c->dst, ctxt->eflags); | ||
1356 | break; | ||
1357 | case 0x48 ... 0x4f: /* dec r16/r32 */ | ||
1358 | emulate_1op("dec", c->dst, ctxt->eflags); | ||
1359 | break; | ||
1360 | case 0x50 ... 0x57: /* push reg */ | ||
1361 | c->dst.type = OP_MEM; | ||
1362 | c->dst.bytes = c->op_bytes; | ||
1363 | c->dst.val = c->src.val; | ||
1364 | register_address_increment(c->regs[VCPU_REGS_RSP], | ||
1365 | -c->op_bytes); | ||
1366 | c->dst.ptr = (void *) register_address( | ||
1367 | ctxt->ss_base, c->regs[VCPU_REGS_RSP]); | ||
1368 | break; | ||
1369 | case 0x58 ... 0x5f: /* pop reg */ | ||
1370 | pop_instruction: | ||
1371 | if ((rc = ops->read_std(register_address(ctxt->ss_base, | ||
1372 | c->regs[VCPU_REGS_RSP]), c->dst.ptr, | ||
1373 | c->op_bytes, ctxt->vcpu)) != 0) | ||
1374 | goto done; | ||
1375 | |||
1376 | register_address_increment(c->regs[VCPU_REGS_RSP], | ||
1377 | c->op_bytes); | ||
1378 | c->dst.type = OP_NONE; /* Disable writeback. */ | ||
1379 | break; | ||
1380 | case 0x63: /* movsxd */ | ||
1381 | if (ctxt->mode != X86EMUL_MODE_PROT64) | ||
1382 | goto cannot_emulate; | ||
1383 | c->dst.val = (s32) c->src.val; | ||
1384 | break; | ||
1385 | case 0x6a: /* push imm8 */ | ||
1386 | c->src.val = 0L; | ||
1387 | c->src.val = insn_fetch(s8, 1, c->eip); | ||
1388 | emulate_push(ctxt); | ||
1389 | break; | ||
1390 | case 0x6c: /* insb */ | ||
1391 | case 0x6d: /* insw/insd */ | ||
1392 | if (kvm_emulate_pio_string(ctxt->vcpu, NULL, | ||
1393 | 1, | ||
1394 | (c->d & ByteOp) ? 1 : c->op_bytes, | ||
1395 | c->rep_prefix ? | ||
1396 | address_mask(c->regs[VCPU_REGS_RCX]) : 1, | ||
1397 | (ctxt->eflags & EFLG_DF), | ||
1398 | register_address(ctxt->es_base, | ||
1399 | c->regs[VCPU_REGS_RDI]), | ||
1400 | c->rep_prefix, | ||
1401 | c->regs[VCPU_REGS_RDX]) == 0) { | ||
1402 | c->eip = saved_eip; | ||
1403 | return -1; | ||
1404 | } | ||
1405 | return 0; | ||
1406 | case 0x6e: /* outsb */ | ||
1407 | case 0x6f: /* outsw/outsd */ | ||
1408 | if (kvm_emulate_pio_string(ctxt->vcpu, NULL, | ||
1409 | 0, | ||
1410 | (c->d & ByteOp) ? 1 : c->op_bytes, | ||
1411 | c->rep_prefix ? | ||
1412 | address_mask(c->regs[VCPU_REGS_RCX]) : 1, | ||
1413 | (ctxt->eflags & EFLG_DF), | ||
1414 | register_address(c->override_base ? | ||
1415 | *c->override_base : | ||
1416 | ctxt->ds_base, | ||
1417 | c->regs[VCPU_REGS_RSI]), | ||
1418 | c->rep_prefix, | ||
1419 | c->regs[VCPU_REGS_RDX]) == 0) { | ||
1420 | c->eip = saved_eip; | ||
1421 | return -1; | ||
1422 | } | ||
1423 | return 0; | ||
1424 | case 0x70 ... 0x7f: /* jcc (short) */ { | ||
1425 | int rel = insn_fetch(s8, 1, c->eip); | ||
1426 | |||
1427 | if (test_cc(c->b, ctxt->eflags)) | ||
1428 | JMP_REL(rel); | ||
1429 | break; | ||
1430 | } | ||
1431 | case 0x80 ... 0x83: /* Grp1 */ | ||
1432 | switch (c->modrm_reg) { | ||
1433 | case 0: | ||
1434 | goto add; | ||
1435 | case 1: | ||
1436 | goto or; | ||
1437 | case 2: | ||
1438 | goto adc; | ||
1439 | case 3: | ||
1440 | goto sbb; | ||
1441 | case 4: | ||
1442 | goto and; | ||
1443 | case 5: | ||
1444 | goto sub; | ||
1445 | case 6: | ||
1446 | goto xor; | ||
1447 | case 7: | ||
1448 | goto cmp; | ||
1449 | } | ||
1450 | break; | ||
1451 | case 0x84 ... 0x85: | ||
1452 | emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags); | ||
1453 | break; | ||
1454 | case 0x86 ... 0x87: /* xchg */ | ||
1455 | /* Write back the register source. */ | ||
1456 | switch (c->dst.bytes) { | ||
1457 | case 1: | ||
1458 | *(u8 *) c->src.ptr = (u8) c->dst.val; | ||
1459 | break; | ||
1460 | case 2: | ||
1461 | *(u16 *) c->src.ptr = (u16) c->dst.val; | ||
1462 | break; | ||
1463 | case 4: | ||
1464 | *c->src.ptr = (u32) c->dst.val; | ||
1465 | break; /* 64b reg: zero-extend */ | ||
1466 | case 8: | ||
1467 | *c->src.ptr = c->dst.val; | ||
1468 | break; | ||
1469 | } | ||
1470 | /* | ||
1471 | * Write back the memory destination with implicit LOCK | ||
1472 | * prefix. | ||
1473 | */ | ||
1474 | c->dst.val = c->src.val; | ||
1475 | c->lock_prefix = 1; | ||
1476 | break; | ||
1477 | case 0x88 ... 0x8b: /* mov */ | ||
1478 | goto mov; | ||
1479 | case 0x8d: /* lea r16/r32, m */ | ||
1480 | c->dst.val = c->modrm_val; | ||
1481 | break; | ||
1482 | case 0x8f: /* pop (sole member of Grp1a) */ | ||
1483 | rc = emulate_grp1a(ctxt, ops); | ||
1484 | if (rc != 0) | ||
1485 | goto done; | ||
1486 | break; | ||
1487 | case 0x9c: /* pushf */ | ||
1488 | c->src.val = (unsigned long) ctxt->eflags; | ||
1489 | emulate_push(ctxt); | ||
1490 | break; | ||
1491 | case 0x9d: /* popf */ | ||
1492 | c->dst.ptr = (unsigned long *) &ctxt->eflags; | ||
1493 | goto pop_instruction; | ||
1494 | case 0xa0 ... 0xa1: /* mov */ | ||
1495 | c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX]; | ||
1496 | c->dst.val = c->src.val; | ||
1497 | break; | ||
1498 | case 0xa2 ... 0xa3: /* mov */ | ||
1499 | c->dst.val = (unsigned long)c->regs[VCPU_REGS_RAX]; | ||
1500 | break; | ||
1501 | case 0xa4 ... 0xa5: /* movs */ | ||
1502 | c->dst.type = OP_MEM; | ||
1503 | c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; | ||
1504 | c->dst.ptr = (unsigned long *)register_address( | ||
1505 | ctxt->es_base, | ||
1506 | c->regs[VCPU_REGS_RDI]); | ||
1507 | if ((rc = ops->read_emulated(register_address( | ||
1508 | c->override_base ? *c->override_base : | ||
1509 | ctxt->ds_base, | ||
1510 | c->regs[VCPU_REGS_RSI]), | ||
1511 | &c->dst.val, | ||
1512 | c->dst.bytes, ctxt->vcpu)) != 0) | ||
1513 | goto done; | ||
1514 | register_address_increment(c->regs[VCPU_REGS_RSI], | ||
1515 | (ctxt->eflags & EFLG_DF) ? -c->dst.bytes | ||
1516 | : c->dst.bytes); | ||
1517 | register_address_increment(c->regs[VCPU_REGS_RDI], | ||
1518 | (ctxt->eflags & EFLG_DF) ? -c->dst.bytes | ||
1519 | : c->dst.bytes); | ||
1520 | break; | ||
1521 | case 0xa6 ... 0xa7: /* cmps */ | ||
1522 | c->src.type = OP_NONE; /* Disable writeback. */ | ||
1523 | c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; | ||
1524 | c->src.ptr = (unsigned long *)register_address( | ||
1525 | c->override_base ? *c->override_base : | ||
1526 | ctxt->ds_base, | ||
1527 | c->regs[VCPU_REGS_RSI]); | ||
1528 | if ((rc = ops->read_emulated((unsigned long)c->src.ptr, | ||
1529 | &c->src.val, | ||
1530 | c->src.bytes, | ||
1531 | ctxt->vcpu)) != 0) | ||
1532 | goto done; | ||
1533 | |||
1534 | c->dst.type = OP_NONE; /* Disable writeback. */ | ||
1535 | c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; | ||
1536 | c->dst.ptr = (unsigned long *)register_address( | ||
1537 | ctxt->es_base, | ||
1538 | c->regs[VCPU_REGS_RDI]); | ||
1539 | if ((rc = ops->read_emulated((unsigned long)c->dst.ptr, | ||
1540 | &c->dst.val, | ||
1541 | c->dst.bytes, | ||
1542 | ctxt->vcpu)) != 0) | ||
1543 | goto done; | ||
1544 | |||
1545 | DPRINTF("cmps: mem1=0x%p mem2=0x%p\n", c->src.ptr, c->dst.ptr); | ||
1546 | |||
1547 | emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags); | ||
1548 | |||
1549 | register_address_increment(c->regs[VCPU_REGS_RSI], | ||
1550 | (ctxt->eflags & EFLG_DF) ? -c->src.bytes | ||
1551 | : c->src.bytes); | ||
1552 | register_address_increment(c->regs[VCPU_REGS_RDI], | ||
1553 | (ctxt->eflags & EFLG_DF) ? -c->dst.bytes | ||
1554 | : c->dst.bytes); | ||
1555 | |||
1556 | break; | ||
1557 | case 0xaa ... 0xab: /* stos */ | ||
1558 | c->dst.type = OP_MEM; | ||
1559 | c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; | ||
1560 | c->dst.ptr = (unsigned long *)register_address( | ||
1561 | ctxt->es_base, | ||
1562 | c->regs[VCPU_REGS_RDI]); | ||
1563 | c->dst.val = c->regs[VCPU_REGS_RAX]; | ||
1564 | register_address_increment(c->regs[VCPU_REGS_RDI], | ||
1565 | (ctxt->eflags & EFLG_DF) ? -c->dst.bytes | ||
1566 | : c->dst.bytes); | ||
1567 | break; | ||
1568 | case 0xac ... 0xad: /* lods */ | ||
1569 | c->dst.type = OP_REG; | ||
1570 | c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; | ||
1571 | c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX]; | ||
1572 | if ((rc = ops->read_emulated(register_address( | ||
1573 | c->override_base ? *c->override_base : | ||
1574 | ctxt->ds_base, | ||
1575 | c->regs[VCPU_REGS_RSI]), | ||
1576 | &c->dst.val, | ||
1577 | c->dst.bytes, | ||
1578 | ctxt->vcpu)) != 0) | ||
1579 | goto done; | ||
1580 | register_address_increment(c->regs[VCPU_REGS_RSI], | ||
1581 | (ctxt->eflags & EFLG_DF) ? -c->dst.bytes | ||
1582 | : c->dst.bytes); | ||
1583 | break; | ||
1584 | case 0xae ... 0xaf: /* scas */ | ||
1585 | DPRINTF("Urk! I don't handle SCAS.\n"); | ||
1586 | goto cannot_emulate; | ||
1587 | case 0xc0 ... 0xc1: | ||
1588 | emulate_grp2(ctxt); | ||
1589 | break; | ||
1590 | case 0xc3: /* ret */ | ||
1591 | c->dst.ptr = &c->eip; | ||
1592 | goto pop_instruction; | ||
1593 | case 0xc6 ... 0xc7: /* mov (sole member of Grp11) */ | ||
1594 | mov: | ||
1595 | c->dst.val = c->src.val; | ||
1596 | break; | ||
1597 | case 0xd0 ... 0xd1: /* Grp2 */ | ||
1598 | c->src.val = 1; | ||
1599 | emulate_grp2(ctxt); | ||
1600 | break; | ||
1601 | case 0xd2 ... 0xd3: /* Grp2 */ | ||
1602 | c->src.val = c->regs[VCPU_REGS_RCX]; | ||
1603 | emulate_grp2(ctxt); | ||
1604 | break; | ||
1605 | case 0xe8: /* call (near) */ { | ||
1606 | long int rel; | ||
1607 | switch (c->op_bytes) { | ||
1608 | case 2: | ||
1609 | rel = insn_fetch(s16, 2, c->eip); | ||
1610 | break; | ||
1611 | case 4: | ||
1612 | rel = insn_fetch(s32, 4, c->eip); | ||
1613 | break; | ||
1614 | default: | ||
1615 | DPRINTF("Call: Invalid op_bytes\n"); | ||
1616 | goto cannot_emulate; | ||
1617 | } | ||
1618 | c->src.val = (unsigned long) c->eip; | ||
1619 | JMP_REL(rel); | ||
1620 | c->op_bytes = c->ad_bytes; | ||
1621 | emulate_push(ctxt); | ||
1622 | break; | ||
1623 | } | ||
1624 | case 0xe9: /* jmp rel */ | ||
1625 | case 0xeb: /* jmp rel short */ | ||
1626 | JMP_REL(c->src.val); | ||
1627 | c->dst.type = OP_NONE; /* Disable writeback. */ | ||
1628 | break; | ||
1629 | case 0xf4: /* hlt */ | ||
1630 | ctxt->vcpu->arch.halt_request = 1; | ||
1631 | goto done; | ||
1632 | case 0xf5: /* cmc */ | ||
1633 | /* complement carry flag from eflags reg */ | ||
1634 | ctxt->eflags ^= EFLG_CF; | ||
1635 | c->dst.type = OP_NONE; /* Disable writeback. */ | ||
1636 | break; | ||
1637 | case 0xf6 ... 0xf7: /* Grp3 */ | ||
1638 | rc = emulate_grp3(ctxt, ops); | ||
1639 | if (rc != 0) | ||
1640 | goto done; | ||
1641 | break; | ||
1642 | case 0xf8: /* clc */ | ||
1643 | ctxt->eflags &= ~EFLG_CF; | ||
1644 | c->dst.type = OP_NONE; /* Disable writeback. */ | ||
1645 | break; | ||
1646 | case 0xfa: /* cli */ | ||
1647 | ctxt->eflags &= ~X86_EFLAGS_IF; | ||
1648 | c->dst.type = OP_NONE; /* Disable writeback. */ | ||
1649 | break; | ||
1650 | case 0xfb: /* sti */ | ||
1651 | ctxt->eflags |= X86_EFLAGS_IF; | ||
1652 | c->dst.type = OP_NONE; /* Disable writeback. */ | ||
1653 | break; | ||
1654 | case 0xfe ... 0xff: /* Grp4/Grp5 */ | ||
1655 | rc = emulate_grp45(ctxt, ops); | ||
1656 | if (rc != 0) | ||
1657 | goto done; | ||
1658 | break; | ||
1659 | } | ||
1660 | |||
1661 | writeback: | ||
1662 | rc = writeback(ctxt, ops); | ||
1663 | if (rc != 0) | ||
1664 | goto done; | ||
1665 | |||
1666 | /* Commit shadow register state. */ | ||
1667 | memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs); | ||
1668 | ctxt->vcpu->arch.rip = c->eip; | ||
1669 | |||
1670 | done: | ||
1671 | if (rc == X86EMUL_UNHANDLEABLE) { | ||
1672 | c->eip = saved_eip; | ||
1673 | return -1; | ||
1674 | } | ||
1675 | return 0; | ||
1676 | |||
1677 | twobyte_insn: | ||
1678 | switch (c->b) { | ||
1679 | case 0x01: /* lgdt, lidt, lmsw */ | ||
1680 | switch (c->modrm_reg) { | ||
1681 | u16 size; | ||
1682 | unsigned long address; | ||
1683 | |||
1684 | case 0: /* vmcall */ | ||
1685 | if (c->modrm_mod != 3 || c->modrm_rm != 1) | ||
1686 | goto cannot_emulate; | ||
1687 | |||
1688 | rc = kvm_fix_hypercall(ctxt->vcpu); | ||
1689 | if (rc) | ||
1690 | goto done; | ||
1691 | |||
1692 | kvm_emulate_hypercall(ctxt->vcpu); | ||
1693 | break; | ||
1694 | case 2: /* lgdt */ | ||
1695 | rc = read_descriptor(ctxt, ops, c->src.ptr, | ||
1696 | &size, &address, c->op_bytes); | ||
1697 | if (rc) | ||
1698 | goto done; | ||
1699 | realmode_lgdt(ctxt->vcpu, size, address); | ||
1700 | break; | ||
1701 | case 3: /* lidt/vmmcall */ | ||
1702 | if (c->modrm_mod == 3 && c->modrm_rm == 1) { | ||
1703 | rc = kvm_fix_hypercall(ctxt->vcpu); | ||
1704 | if (rc) | ||
1705 | goto done; | ||
1706 | kvm_emulate_hypercall(ctxt->vcpu); | ||
1707 | } else { | ||
1708 | rc = read_descriptor(ctxt, ops, c->src.ptr, | ||
1709 | &size, &address, | ||
1710 | c->op_bytes); | ||
1711 | if (rc) | ||
1712 | goto done; | ||
1713 | realmode_lidt(ctxt->vcpu, size, address); | ||
1714 | } | ||
1715 | break; | ||
1716 | case 4: /* smsw */ | ||
1717 | if (c->modrm_mod != 3) | ||
1718 | goto cannot_emulate; | ||
1719 | *(u16 *)&c->regs[c->modrm_rm] | ||
1720 | = realmode_get_cr(ctxt->vcpu, 0); | ||
1721 | break; | ||
1722 | case 6: /* lmsw */ | ||
1723 | if (c->modrm_mod != 3) | ||
1724 | goto cannot_emulate; | ||
1725 | realmode_lmsw(ctxt->vcpu, (u16)c->modrm_val, | ||
1726 | &ctxt->eflags); | ||
1727 | break; | ||
1728 | case 7: /* invlpg*/ | ||
1729 | emulate_invlpg(ctxt->vcpu, memop); | ||
1730 | break; | ||
1731 | default: | ||
1732 | goto cannot_emulate; | ||
1733 | } | ||
1734 | /* Disable writeback. */ | ||
1735 | c->dst.type = OP_NONE; | ||
1736 | break; | ||
1737 | case 0x06: | ||
1738 | emulate_clts(ctxt->vcpu); | ||
1739 | c->dst.type = OP_NONE; | ||
1740 | break; | ||
1741 | case 0x08: /* invd */ | ||
1742 | case 0x09: /* wbinvd */ | ||
1743 | case 0x0d: /* GrpP (prefetch) */ | ||
1744 | case 0x18: /* Grp16 (prefetch/nop) */ | ||
1745 | c->dst.type = OP_NONE; | ||
1746 | break; | ||
1747 | case 0x20: /* mov cr, reg */ | ||
1748 | if (c->modrm_mod != 3) | ||
1749 | goto cannot_emulate; | ||
1750 | c->regs[c->modrm_rm] = | ||
1751 | realmode_get_cr(ctxt->vcpu, c->modrm_reg); | ||
1752 | c->dst.type = OP_NONE; /* no writeback */ | ||
1753 | break; | ||
1754 | case 0x21: /* mov from dr to reg */ | ||
1755 | if (c->modrm_mod != 3) | ||
1756 | goto cannot_emulate; | ||
1757 | rc = emulator_get_dr(ctxt, c->modrm_reg, &c->regs[c->modrm_rm]); | ||
1758 | if (rc) | ||
1759 | goto cannot_emulate; | ||
1760 | c->dst.type = OP_NONE; /* no writeback */ | ||
1761 | break; | ||
1762 | case 0x22: /* mov reg, cr */ | ||
1763 | if (c->modrm_mod != 3) | ||
1764 | goto cannot_emulate; | ||
1765 | realmode_set_cr(ctxt->vcpu, | ||
1766 | c->modrm_reg, c->modrm_val, &ctxt->eflags); | ||
1767 | c->dst.type = OP_NONE; | ||
1768 | break; | ||
1769 | case 0x23: /* mov from reg to dr */ | ||
1770 | if (c->modrm_mod != 3) | ||
1771 | goto cannot_emulate; | ||
1772 | rc = emulator_set_dr(ctxt, c->modrm_reg, | ||
1773 | c->regs[c->modrm_rm]); | ||
1774 | if (rc) | ||
1775 | goto cannot_emulate; | ||
1776 | c->dst.type = OP_NONE; /* no writeback */ | ||
1777 | break; | ||
1778 | case 0x30: | ||
1779 | /* wrmsr */ | ||
1780 | msr_data = (u32)c->regs[VCPU_REGS_RAX] | ||
1781 | | ((u64)c->regs[VCPU_REGS_RDX] << 32); | ||
1782 | rc = kvm_set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data); | ||
1783 | if (rc) { | ||
1784 | kvm_inject_gp(ctxt->vcpu, 0); | ||
1785 | c->eip = ctxt->vcpu->arch.rip; | ||
1786 | } | ||
1787 | rc = X86EMUL_CONTINUE; | ||
1788 | c->dst.type = OP_NONE; | ||
1789 | break; | ||
1790 | case 0x32: | ||
1791 | /* rdmsr */ | ||
1792 | rc = kvm_get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data); | ||
1793 | if (rc) { | ||
1794 | kvm_inject_gp(ctxt->vcpu, 0); | ||
1795 | c->eip = ctxt->vcpu->arch.rip; | ||
1796 | } else { | ||
1797 | c->regs[VCPU_REGS_RAX] = (u32)msr_data; | ||
1798 | c->regs[VCPU_REGS_RDX] = msr_data >> 32; | ||
1799 | } | ||
1800 | rc = X86EMUL_CONTINUE; | ||
1801 | c->dst.type = OP_NONE; | ||
1802 | break; | ||
1803 | case 0x40 ... 0x4f: /* cmov */ | ||
1804 | c->dst.val = c->dst.orig_val = c->src.val; | ||
1805 | if (!test_cc(c->b, ctxt->eflags)) | ||
1806 | c->dst.type = OP_NONE; /* no writeback */ | ||
1807 | break; | ||
1808 | case 0x80 ... 0x8f: /* jnz rel, etc*/ { | ||
1809 | long int rel; | ||
1810 | |||
1811 | switch (c->op_bytes) { | ||
1812 | case 2: | ||
1813 | rel = insn_fetch(s16, 2, c->eip); | ||
1814 | break; | ||
1815 | case 4: | ||
1816 | rel = insn_fetch(s32, 4, c->eip); | ||
1817 | break; | ||
1818 | case 8: | ||
1819 | rel = insn_fetch(s64, 8, c->eip); | ||
1820 | break; | ||
1821 | default: | ||
1822 | DPRINTF("jnz: Invalid op_bytes\n"); | ||
1823 | goto cannot_emulate; | ||
1824 | } | ||
1825 | if (test_cc(c->b, ctxt->eflags)) | ||
1826 | JMP_REL(rel); | ||
1827 | c->dst.type = OP_NONE; | ||
1828 | break; | ||
1829 | } | ||
1830 | case 0xa3: | ||
1831 | bt: /* bt */ | ||
1832 | c->dst.type = OP_NONE; | ||
1833 | /* only subword offset */ | ||
1834 | c->src.val &= (c->dst.bytes << 3) - 1; | ||
1835 | emulate_2op_SrcV_nobyte("bt", c->src, c->dst, ctxt->eflags); | ||
1836 | break; | ||
1837 | case 0xab: | ||
1838 | bts: /* bts */ | ||
1839 | /* only subword offset */ | ||
1840 | c->src.val &= (c->dst.bytes << 3) - 1; | ||
1841 | emulate_2op_SrcV_nobyte("bts", c->src, c->dst, ctxt->eflags); | ||
1842 | break; | ||
1843 | case 0xb0 ... 0xb1: /* cmpxchg */ | ||
1844 | /* | ||
1845 | * Save real source value, then compare EAX against | ||
1846 | * destination. | ||
1847 | */ | ||
1848 | c->src.orig_val = c->src.val; | ||
1849 | c->src.val = c->regs[VCPU_REGS_RAX]; | ||
1850 | emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags); | ||
1851 | if (ctxt->eflags & EFLG_ZF) { | ||
1852 | /* Success: write back to memory. */ | ||
1853 | c->dst.val = c->src.orig_val; | ||
1854 | } else { | ||
1855 | /* Failure: write the value we saw to EAX. */ | ||
1856 | c->dst.type = OP_REG; | ||
1857 | c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX]; | ||
1858 | } | ||
1859 | break; | ||
1860 | case 0xb3: | ||
1861 | btr: /* btr */ | ||
1862 | /* only subword offset */ | ||
1863 | c->src.val &= (c->dst.bytes << 3) - 1; | ||
1864 | emulate_2op_SrcV_nobyte("btr", c->src, c->dst, ctxt->eflags); | ||
1865 | break; | ||
1866 | case 0xb6 ... 0xb7: /* movzx */ | ||
1867 | c->dst.bytes = c->op_bytes; | ||
1868 | c->dst.val = (c->d & ByteOp) ? (u8) c->src.val | ||
1869 | : (u16) c->src.val; | ||
1870 | break; | ||
1871 | case 0xba: /* Grp8 */ | ||
1872 | switch (c->modrm_reg & 3) { | ||
1873 | case 0: | ||
1874 | goto bt; | ||
1875 | case 1: | ||
1876 | goto bts; | ||
1877 | case 2: | ||
1878 | goto btr; | ||
1879 | case 3: | ||
1880 | goto btc; | ||
1881 | } | ||
1882 | break; | ||
1883 | case 0xbb: | ||
1884 | btc: /* btc */ | ||
1885 | /* only subword offset */ | ||
1886 | c->src.val &= (c->dst.bytes << 3) - 1; | ||
1887 | emulate_2op_SrcV_nobyte("btc", c->src, c->dst, ctxt->eflags); | ||
1888 | break; | ||
1889 | case 0xbe ... 0xbf: /* movsx */ | ||
1890 | c->dst.bytes = c->op_bytes; | ||
1891 | c->dst.val = (c->d & ByteOp) ? (s8) c->src.val : | ||
1892 | (s16) c->src.val; | ||
1893 | break; | ||
1894 | case 0xc3: /* movnti */ | ||
1895 | c->dst.bytes = c->op_bytes; | ||
1896 | c->dst.val = (c->op_bytes == 4) ? (u32) c->src.val : | ||
1897 | (u64) c->src.val; | ||
1898 | break; | ||
1899 | case 0xc7: /* Grp9 (cmpxchg8b) */ | ||
1900 | rc = emulate_grp9(ctxt, ops, memop); | ||
1901 | if (rc != 0) | ||
1902 | goto done; | ||
1903 | c->dst.type = OP_NONE; | ||
1904 | break; | ||
1905 | } | ||
1906 | goto writeback; | ||
1907 | |||
1908 | cannot_emulate: | ||
1909 | DPRINTF("Cannot emulate %02x\n", c->b); | ||
1910 | c->eip = saved_eip; | ||
1911 | return -1; | ||
1912 | } | ||