diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2008-01-30 17:30:10 -0500 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2008-01-30 17:30:10 -0500 |
commit | 2c57ee6f924c95e4dce61ed4776fb3f62e1b9f92 (patch) | |
tree | b9d92e52e8c0ee68a0f5012b470c6146a9f0b65a /drivers | |
parent | f389e9fcecdec4c4cb890ad28ea30a87a579ec3e (diff) | |
parent | 2f52d58c92d971bf421f461ad06eb93fb4f34981 (diff) |
Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/avi/kvm
* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/avi/kvm: (249 commits)
KVM: Move apic timer migration away from critical section
KVM: Put kvm_para.h include outside __KERNEL__
KVM: Fix unbounded preemption latency
KVM: Initialize the mmu caches only after verifying cpu support
KVM: MMU: Fix dirty page setting for pages removed from rmap
KVM: Portability: Move kvm_fpu to asm-x86/kvm.h
KVM: x86 emulator: Only allow VMCALL/VMMCALL trapped by #UD
KVM: MMU: Merge shadow level check in FNAME(fetch)
KVM: MMU: Move kvm_free_some_pages() into critical section
KVM: MMU: Switch to mmu spinlock
KVM: MMU: Avoid calling gfn_to_page() in mmu_set_spte()
KVM: Add kvm_read_guest_atomic()
KVM: MMU: Concurrent guest walkers
KVM: Disable vapic support on Intel machines with FlexPriority
KVM: Accelerated apic support
KVM: local APIC TPR access reporting facility
KVM: Print data for unimplemented wrmsr
KVM: MMU: Add cache miss statistic
KVM: MMU: Coalesce remote tlb flushes
KVM: Expose ioapic to ia64 save/restore APIs
...
Diffstat (limited to 'drivers')
-rw-r--r-- | drivers/Kconfig | 2 | ||||
-rw-r--r-- | drivers/Makefile | 1 | ||||
-rw-r--r-- | drivers/kvm/Kconfig | 54 | ||||
-rw-r--r-- | drivers/kvm/Makefile | 10 | ||||
-rw-r--r-- | drivers/kvm/i8259.c | 450 | ||||
-rw-r--r-- | drivers/kvm/ioapic.c | 388 | ||||
-rw-r--r-- | drivers/kvm/irq.c | 98 | ||||
-rw-r--r-- | drivers/kvm/irq.h | 165 | ||||
-rw-r--r-- | drivers/kvm/kvm.h | 796 | ||||
-rw-r--r-- | drivers/kvm/kvm_main.c | 3628 | ||||
-rw-r--r-- | drivers/kvm/kvm_svm.h | 45 | ||||
-rw-r--r-- | drivers/kvm/lapic.c | 1080 | ||||
-rw-r--r-- | drivers/kvm/mmu.c | 1498 | ||||
-rw-r--r-- | drivers/kvm/paging_tmpl.h | 511 | ||||
-rw-r--r-- | drivers/kvm/segment_descriptor.h | 17 | ||||
-rw-r--r-- | drivers/kvm/svm.c | 1754 | ||||
-rw-r--r-- | drivers/kvm/svm.h | 324 | ||||
-rw-r--r-- | drivers/kvm/vmx.c | 2566 | ||||
-rw-r--r-- | drivers/kvm/vmx.h | 310 | ||||
-rw-r--r-- | drivers/kvm/x86_emulate.c | 1662 | ||||
-rw-r--r-- | drivers/kvm/x86_emulate.h | 155 |
21 files changed, 0 insertions, 15514 deletions
diff --git a/drivers/Kconfig b/drivers/Kconfig index f4076d9e9902..08d4ae201597 100644 --- a/drivers/Kconfig +++ b/drivers/Kconfig | |||
@@ -90,8 +90,6 @@ source "drivers/dca/Kconfig" | |||
90 | 90 | ||
91 | source "drivers/auxdisplay/Kconfig" | 91 | source "drivers/auxdisplay/Kconfig" |
92 | 92 | ||
93 | source "drivers/kvm/Kconfig" | ||
94 | |||
95 | source "drivers/uio/Kconfig" | 93 | source "drivers/uio/Kconfig" |
96 | 94 | ||
97 | source "drivers/virtio/Kconfig" | 95 | source "drivers/virtio/Kconfig" |
diff --git a/drivers/Makefile b/drivers/Makefile index d92d4d82d001..9e1f808e43cf 100644 --- a/drivers/Makefile +++ b/drivers/Makefile | |||
@@ -47,7 +47,6 @@ obj-$(CONFIG_SPI) += spi/ | |||
47 | obj-$(CONFIG_PCCARD) += pcmcia/ | 47 | obj-$(CONFIG_PCCARD) += pcmcia/ |
48 | obj-$(CONFIG_DIO) += dio/ | 48 | obj-$(CONFIG_DIO) += dio/ |
49 | obj-$(CONFIG_SBUS) += sbus/ | 49 | obj-$(CONFIG_SBUS) += sbus/ |
50 | obj-$(CONFIG_KVM) += kvm/ | ||
51 | obj-$(CONFIG_ZORRO) += zorro/ | 50 | obj-$(CONFIG_ZORRO) += zorro/ |
52 | obj-$(CONFIG_MAC) += macintosh/ | 51 | obj-$(CONFIG_MAC) += macintosh/ |
53 | obj-$(CONFIG_ATA_OVER_ETH) += block/aoe/ | 52 | obj-$(CONFIG_ATA_OVER_ETH) += block/aoe/ |
diff --git a/drivers/kvm/Kconfig b/drivers/kvm/Kconfig deleted file mode 100644 index 656920636cb2..000000000000 --- a/drivers/kvm/Kconfig +++ /dev/null | |||
@@ -1,54 +0,0 @@ | |||
1 | # | ||
2 | # KVM configuration | ||
3 | # | ||
4 | menuconfig VIRTUALIZATION | ||
5 | bool "Virtualization" | ||
6 | depends on X86 | ||
7 | default y | ||
8 | ---help--- | ||
9 | Say Y here to get to see options for using your Linux host to run other | ||
10 | operating systems inside virtual machines (guests). | ||
11 | This option alone does not add any kernel code. | ||
12 | |||
13 | If you say N, all options in this submenu will be skipped and disabled. | ||
14 | |||
15 | if VIRTUALIZATION | ||
16 | |||
17 | config KVM | ||
18 | tristate "Kernel-based Virtual Machine (KVM) support" | ||
19 | depends on X86 && EXPERIMENTAL | ||
20 | select PREEMPT_NOTIFIERS | ||
21 | select ANON_INODES | ||
22 | ---help--- | ||
23 | Support hosting fully virtualized guest machines using hardware | ||
24 | virtualization extensions. You will need a fairly recent | ||
25 | processor equipped with virtualization extensions. You will also | ||
26 | need to select one or more of the processor modules below. | ||
27 | |||
28 | This module provides access to the hardware capabilities through | ||
29 | a character device node named /dev/kvm. | ||
30 | |||
31 | To compile this as a module, choose M here: the module | ||
32 | will be called kvm. | ||
33 | |||
34 | If unsure, say N. | ||
35 | |||
36 | config KVM_INTEL | ||
37 | tristate "KVM for Intel processors support" | ||
38 | depends on KVM | ||
39 | ---help--- | ||
40 | Provides support for KVM on Intel processors equipped with the VT | ||
41 | extensions. | ||
42 | |||
43 | config KVM_AMD | ||
44 | tristate "KVM for AMD processors support" | ||
45 | depends on KVM | ||
46 | ---help--- | ||
47 | Provides support for KVM on AMD processors equipped with the AMD-V | ||
48 | (SVM) extensions. | ||
49 | |||
50 | # OK, it's a little counter-intuitive to do this, but it puts it neatly under | ||
51 | # the virtualization menu. | ||
52 | source drivers/lguest/Kconfig | ||
53 | |||
54 | endif # VIRTUALIZATION | ||
diff --git a/drivers/kvm/Makefile b/drivers/kvm/Makefile deleted file mode 100644 index e5a8f4d3e973..000000000000 --- a/drivers/kvm/Makefile +++ /dev/null | |||
@@ -1,10 +0,0 @@ | |||
1 | # | ||
2 | # Makefile for Kernel-based Virtual Machine module | ||
3 | # | ||
4 | |||
5 | kvm-objs := kvm_main.o mmu.o x86_emulate.o i8259.o irq.o lapic.o ioapic.o | ||
6 | obj-$(CONFIG_KVM) += kvm.o | ||
7 | kvm-intel-objs = vmx.o | ||
8 | obj-$(CONFIG_KVM_INTEL) += kvm-intel.o | ||
9 | kvm-amd-objs = svm.o | ||
10 | obj-$(CONFIG_KVM_AMD) += kvm-amd.o | ||
diff --git a/drivers/kvm/i8259.c b/drivers/kvm/i8259.c deleted file mode 100644 index a679157bc599..000000000000 --- a/drivers/kvm/i8259.c +++ /dev/null | |||
@@ -1,450 +0,0 @@ | |||
1 | /* | ||
2 | * 8259 interrupt controller emulation | ||
3 | * | ||
4 | * Copyright (c) 2003-2004 Fabrice Bellard | ||
5 | * Copyright (c) 2007 Intel Corporation | ||
6 | * | ||
7 | * Permission is hereby granted, free of charge, to any person obtaining a copy | ||
8 | * of this software and associated documentation files (the "Software"), to deal | ||
9 | * in the Software without restriction, including without limitation the rights | ||
10 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | ||
11 | * copies of the Software, and to permit persons to whom the Software is | ||
12 | * furnished to do so, subject to the following conditions: | ||
13 | * | ||
14 | * The above copyright notice and this permission notice shall be included in | ||
15 | * all copies or substantial portions of the Software. | ||
16 | * | ||
17 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
18 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
19 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL | ||
20 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
21 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | ||
22 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN | ||
23 | * THE SOFTWARE. | ||
24 | * Authors: | ||
25 | * Yaozu (Eddie) Dong <Eddie.dong@intel.com> | ||
26 | * Port from Qemu. | ||
27 | */ | ||
28 | #include <linux/mm.h> | ||
29 | #include "irq.h" | ||
30 | |||
31 | /* | ||
32 | * set irq level. If an edge is detected, then the IRR is set to 1 | ||
33 | */ | ||
34 | static inline void pic_set_irq1(struct kvm_kpic_state *s, int irq, int level) | ||
35 | { | ||
36 | int mask; | ||
37 | mask = 1 << irq; | ||
38 | if (s->elcr & mask) /* level triggered */ | ||
39 | if (level) { | ||
40 | s->irr |= mask; | ||
41 | s->last_irr |= mask; | ||
42 | } else { | ||
43 | s->irr &= ~mask; | ||
44 | s->last_irr &= ~mask; | ||
45 | } | ||
46 | else /* edge triggered */ | ||
47 | if (level) { | ||
48 | if ((s->last_irr & mask) == 0) | ||
49 | s->irr |= mask; | ||
50 | s->last_irr |= mask; | ||
51 | } else | ||
52 | s->last_irr &= ~mask; | ||
53 | } | ||
54 | |||
55 | /* | ||
56 | * return the highest priority found in mask (highest = smallest | ||
57 | * number). Return 8 if no irq | ||
58 | */ | ||
59 | static inline int get_priority(struct kvm_kpic_state *s, int mask) | ||
60 | { | ||
61 | int priority; | ||
62 | if (mask == 0) | ||
63 | return 8; | ||
64 | priority = 0; | ||
65 | while ((mask & (1 << ((priority + s->priority_add) & 7))) == 0) | ||
66 | priority++; | ||
67 | return priority; | ||
68 | } | ||
69 | |||
70 | /* | ||
71 | * return the pic wanted interrupt. return -1 if none | ||
72 | */ | ||
73 | static int pic_get_irq(struct kvm_kpic_state *s) | ||
74 | { | ||
75 | int mask, cur_priority, priority; | ||
76 | |||
77 | mask = s->irr & ~s->imr; | ||
78 | priority = get_priority(s, mask); | ||
79 | if (priority == 8) | ||
80 | return -1; | ||
81 | /* | ||
82 | * compute current priority. If special fully nested mode on the | ||
83 | * master, the IRQ coming from the slave is not taken into account | ||
84 | * for the priority computation. | ||
85 | */ | ||
86 | mask = s->isr; | ||
87 | if (s->special_fully_nested_mode && s == &s->pics_state->pics[0]) | ||
88 | mask &= ~(1 << 2); | ||
89 | cur_priority = get_priority(s, mask); | ||
90 | if (priority < cur_priority) | ||
91 | /* | ||
92 | * higher priority found: an irq should be generated | ||
93 | */ | ||
94 | return (priority + s->priority_add) & 7; | ||
95 | else | ||
96 | return -1; | ||
97 | } | ||
98 | |||
99 | /* | ||
100 | * raise irq to CPU if necessary. must be called every time the active | ||
101 | * irq may change | ||
102 | */ | ||
103 | static void pic_update_irq(struct kvm_pic *s) | ||
104 | { | ||
105 | int irq2, irq; | ||
106 | |||
107 | irq2 = pic_get_irq(&s->pics[1]); | ||
108 | if (irq2 >= 0) { | ||
109 | /* | ||
110 | * if irq request by slave pic, signal master PIC | ||
111 | */ | ||
112 | pic_set_irq1(&s->pics[0], 2, 1); | ||
113 | pic_set_irq1(&s->pics[0], 2, 0); | ||
114 | } | ||
115 | irq = pic_get_irq(&s->pics[0]); | ||
116 | if (irq >= 0) | ||
117 | s->irq_request(s->irq_request_opaque, 1); | ||
118 | else | ||
119 | s->irq_request(s->irq_request_opaque, 0); | ||
120 | } | ||
121 | |||
122 | void kvm_pic_update_irq(struct kvm_pic *s) | ||
123 | { | ||
124 | pic_update_irq(s); | ||
125 | } | ||
126 | |||
127 | void kvm_pic_set_irq(void *opaque, int irq, int level) | ||
128 | { | ||
129 | struct kvm_pic *s = opaque; | ||
130 | |||
131 | pic_set_irq1(&s->pics[irq >> 3], irq & 7, level); | ||
132 | pic_update_irq(s); | ||
133 | } | ||
134 | |||
135 | /* | ||
136 | * acknowledge interrupt 'irq' | ||
137 | */ | ||
138 | static inline void pic_intack(struct kvm_kpic_state *s, int irq) | ||
139 | { | ||
140 | if (s->auto_eoi) { | ||
141 | if (s->rotate_on_auto_eoi) | ||
142 | s->priority_add = (irq + 1) & 7; | ||
143 | } else | ||
144 | s->isr |= (1 << irq); | ||
145 | /* | ||
146 | * We don't clear a level sensitive interrupt here | ||
147 | */ | ||
148 | if (!(s->elcr & (1 << irq))) | ||
149 | s->irr &= ~(1 << irq); | ||
150 | } | ||
151 | |||
152 | int kvm_pic_read_irq(struct kvm_pic *s) | ||
153 | { | ||
154 | int irq, irq2, intno; | ||
155 | |||
156 | irq = pic_get_irq(&s->pics[0]); | ||
157 | if (irq >= 0) { | ||
158 | pic_intack(&s->pics[0], irq); | ||
159 | if (irq == 2) { | ||
160 | irq2 = pic_get_irq(&s->pics[1]); | ||
161 | if (irq2 >= 0) | ||
162 | pic_intack(&s->pics[1], irq2); | ||
163 | else | ||
164 | /* | ||
165 | * spurious IRQ on slave controller | ||
166 | */ | ||
167 | irq2 = 7; | ||
168 | intno = s->pics[1].irq_base + irq2; | ||
169 | irq = irq2 + 8; | ||
170 | } else | ||
171 | intno = s->pics[0].irq_base + irq; | ||
172 | } else { | ||
173 | /* | ||
174 | * spurious IRQ on host controller | ||
175 | */ | ||
176 | irq = 7; | ||
177 | intno = s->pics[0].irq_base + irq; | ||
178 | } | ||
179 | pic_update_irq(s); | ||
180 | |||
181 | return intno; | ||
182 | } | ||
183 | |||
184 | static void pic_reset(void *opaque) | ||
185 | { | ||
186 | struct kvm_kpic_state *s = opaque; | ||
187 | |||
188 | s->last_irr = 0; | ||
189 | s->irr = 0; | ||
190 | s->imr = 0; | ||
191 | s->isr = 0; | ||
192 | s->priority_add = 0; | ||
193 | s->irq_base = 0; | ||
194 | s->read_reg_select = 0; | ||
195 | s->poll = 0; | ||
196 | s->special_mask = 0; | ||
197 | s->init_state = 0; | ||
198 | s->auto_eoi = 0; | ||
199 | s->rotate_on_auto_eoi = 0; | ||
200 | s->special_fully_nested_mode = 0; | ||
201 | s->init4 = 0; | ||
202 | } | ||
203 | |||
204 | static void pic_ioport_write(void *opaque, u32 addr, u32 val) | ||
205 | { | ||
206 | struct kvm_kpic_state *s = opaque; | ||
207 | int priority, cmd, irq; | ||
208 | |||
209 | addr &= 1; | ||
210 | if (addr == 0) { | ||
211 | if (val & 0x10) { | ||
212 | pic_reset(s); /* init */ | ||
213 | /* | ||
214 | * deassert a pending interrupt | ||
215 | */ | ||
216 | s->pics_state->irq_request(s->pics_state-> | ||
217 | irq_request_opaque, 0); | ||
218 | s->init_state = 1; | ||
219 | s->init4 = val & 1; | ||
220 | if (val & 0x02) | ||
221 | printk(KERN_ERR "single mode not supported"); | ||
222 | if (val & 0x08) | ||
223 | printk(KERN_ERR | ||
224 | "level sensitive irq not supported"); | ||
225 | } else if (val & 0x08) { | ||
226 | if (val & 0x04) | ||
227 | s->poll = 1; | ||
228 | if (val & 0x02) | ||
229 | s->read_reg_select = val & 1; | ||
230 | if (val & 0x40) | ||
231 | s->special_mask = (val >> 5) & 1; | ||
232 | } else { | ||
233 | cmd = val >> 5; | ||
234 | switch (cmd) { | ||
235 | case 0: | ||
236 | case 4: | ||
237 | s->rotate_on_auto_eoi = cmd >> 2; | ||
238 | break; | ||
239 | case 1: /* end of interrupt */ | ||
240 | case 5: | ||
241 | priority = get_priority(s, s->isr); | ||
242 | if (priority != 8) { | ||
243 | irq = (priority + s->priority_add) & 7; | ||
244 | s->isr &= ~(1 << irq); | ||
245 | if (cmd == 5) | ||
246 | s->priority_add = (irq + 1) & 7; | ||
247 | pic_update_irq(s->pics_state); | ||
248 | } | ||
249 | break; | ||
250 | case 3: | ||
251 | irq = val & 7; | ||
252 | s->isr &= ~(1 << irq); | ||
253 | pic_update_irq(s->pics_state); | ||
254 | break; | ||
255 | case 6: | ||
256 | s->priority_add = (val + 1) & 7; | ||
257 | pic_update_irq(s->pics_state); | ||
258 | break; | ||
259 | case 7: | ||
260 | irq = val & 7; | ||
261 | s->isr &= ~(1 << irq); | ||
262 | s->priority_add = (irq + 1) & 7; | ||
263 | pic_update_irq(s->pics_state); | ||
264 | break; | ||
265 | default: | ||
266 | break; /* no operation */ | ||
267 | } | ||
268 | } | ||
269 | } else | ||
270 | switch (s->init_state) { | ||
271 | case 0: /* normal mode */ | ||
272 | s->imr = val; | ||
273 | pic_update_irq(s->pics_state); | ||
274 | break; | ||
275 | case 1: | ||
276 | s->irq_base = val & 0xf8; | ||
277 | s->init_state = 2; | ||
278 | break; | ||
279 | case 2: | ||
280 | if (s->init4) | ||
281 | s->init_state = 3; | ||
282 | else | ||
283 | s->init_state = 0; | ||
284 | break; | ||
285 | case 3: | ||
286 | s->special_fully_nested_mode = (val >> 4) & 1; | ||
287 | s->auto_eoi = (val >> 1) & 1; | ||
288 | s->init_state = 0; | ||
289 | break; | ||
290 | } | ||
291 | } | ||
292 | |||
293 | static u32 pic_poll_read(struct kvm_kpic_state *s, u32 addr1) | ||
294 | { | ||
295 | int ret; | ||
296 | |||
297 | ret = pic_get_irq(s); | ||
298 | if (ret >= 0) { | ||
299 | if (addr1 >> 7) { | ||
300 | s->pics_state->pics[0].isr &= ~(1 << 2); | ||
301 | s->pics_state->pics[0].irr &= ~(1 << 2); | ||
302 | } | ||
303 | s->irr &= ~(1 << ret); | ||
304 | s->isr &= ~(1 << ret); | ||
305 | if (addr1 >> 7 || ret != 2) | ||
306 | pic_update_irq(s->pics_state); | ||
307 | } else { | ||
308 | ret = 0x07; | ||
309 | pic_update_irq(s->pics_state); | ||
310 | } | ||
311 | |||
312 | return ret; | ||
313 | } | ||
314 | |||
315 | static u32 pic_ioport_read(void *opaque, u32 addr1) | ||
316 | { | ||
317 | struct kvm_kpic_state *s = opaque; | ||
318 | unsigned int addr; | ||
319 | int ret; | ||
320 | |||
321 | addr = addr1; | ||
322 | addr &= 1; | ||
323 | if (s->poll) { | ||
324 | ret = pic_poll_read(s, addr1); | ||
325 | s->poll = 0; | ||
326 | } else | ||
327 | if (addr == 0) | ||
328 | if (s->read_reg_select) | ||
329 | ret = s->isr; | ||
330 | else | ||
331 | ret = s->irr; | ||
332 | else | ||
333 | ret = s->imr; | ||
334 | return ret; | ||
335 | } | ||
336 | |||
337 | static void elcr_ioport_write(void *opaque, u32 addr, u32 val) | ||
338 | { | ||
339 | struct kvm_kpic_state *s = opaque; | ||
340 | s->elcr = val & s->elcr_mask; | ||
341 | } | ||
342 | |||
343 | static u32 elcr_ioport_read(void *opaque, u32 addr1) | ||
344 | { | ||
345 | struct kvm_kpic_state *s = opaque; | ||
346 | return s->elcr; | ||
347 | } | ||
348 | |||
349 | static int picdev_in_range(struct kvm_io_device *this, gpa_t addr) | ||
350 | { | ||
351 | switch (addr) { | ||
352 | case 0x20: | ||
353 | case 0x21: | ||
354 | case 0xa0: | ||
355 | case 0xa1: | ||
356 | case 0x4d0: | ||
357 | case 0x4d1: | ||
358 | return 1; | ||
359 | default: | ||
360 | return 0; | ||
361 | } | ||
362 | } | ||
363 | |||
364 | static void picdev_write(struct kvm_io_device *this, | ||
365 | gpa_t addr, int len, const void *val) | ||
366 | { | ||
367 | struct kvm_pic *s = this->private; | ||
368 | unsigned char data = *(unsigned char *)val; | ||
369 | |||
370 | if (len != 1) { | ||
371 | if (printk_ratelimit()) | ||
372 | printk(KERN_ERR "PIC: non byte write\n"); | ||
373 | return; | ||
374 | } | ||
375 | switch (addr) { | ||
376 | case 0x20: | ||
377 | case 0x21: | ||
378 | case 0xa0: | ||
379 | case 0xa1: | ||
380 | pic_ioport_write(&s->pics[addr >> 7], addr, data); | ||
381 | break; | ||
382 | case 0x4d0: | ||
383 | case 0x4d1: | ||
384 | elcr_ioport_write(&s->pics[addr & 1], addr, data); | ||
385 | break; | ||
386 | } | ||
387 | } | ||
388 | |||
389 | static void picdev_read(struct kvm_io_device *this, | ||
390 | gpa_t addr, int len, void *val) | ||
391 | { | ||
392 | struct kvm_pic *s = this->private; | ||
393 | unsigned char data = 0; | ||
394 | |||
395 | if (len != 1) { | ||
396 | if (printk_ratelimit()) | ||
397 | printk(KERN_ERR "PIC: non byte read\n"); | ||
398 | return; | ||
399 | } | ||
400 | switch (addr) { | ||
401 | case 0x20: | ||
402 | case 0x21: | ||
403 | case 0xa0: | ||
404 | case 0xa1: | ||
405 | data = pic_ioport_read(&s->pics[addr >> 7], addr); | ||
406 | break; | ||
407 | case 0x4d0: | ||
408 | case 0x4d1: | ||
409 | data = elcr_ioport_read(&s->pics[addr & 1], addr); | ||
410 | break; | ||
411 | } | ||
412 | *(unsigned char *)val = data; | ||
413 | } | ||
414 | |||
415 | /* | ||
416 | * callback when PIC0 irq status changed | ||
417 | */ | ||
418 | static void pic_irq_request(void *opaque, int level) | ||
419 | { | ||
420 | struct kvm *kvm = opaque; | ||
421 | struct kvm_vcpu *vcpu = kvm->vcpus[0]; | ||
422 | |||
423 | pic_irqchip(kvm)->output = level; | ||
424 | if (vcpu) | ||
425 | kvm_vcpu_kick(vcpu); | ||
426 | } | ||
427 | |||
428 | struct kvm_pic *kvm_create_pic(struct kvm *kvm) | ||
429 | { | ||
430 | struct kvm_pic *s; | ||
431 | s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL); | ||
432 | if (!s) | ||
433 | return NULL; | ||
434 | s->pics[0].elcr_mask = 0xf8; | ||
435 | s->pics[1].elcr_mask = 0xde; | ||
436 | s->irq_request = pic_irq_request; | ||
437 | s->irq_request_opaque = kvm; | ||
438 | s->pics[0].pics_state = s; | ||
439 | s->pics[1].pics_state = s; | ||
440 | |||
441 | /* | ||
442 | * Initialize PIO device | ||
443 | */ | ||
444 | s->dev.read = picdev_read; | ||
445 | s->dev.write = picdev_write; | ||
446 | s->dev.in_range = picdev_in_range; | ||
447 | s->dev.private = s; | ||
448 | kvm_io_bus_register_dev(&kvm->pio_bus, &s->dev); | ||
449 | return s; | ||
450 | } | ||
diff --git a/drivers/kvm/ioapic.c b/drivers/kvm/ioapic.c deleted file mode 100644 index c7992e667fdb..000000000000 --- a/drivers/kvm/ioapic.c +++ /dev/null | |||
@@ -1,388 +0,0 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2001 MandrakeSoft S.A. | ||
3 | * | ||
4 | * MandrakeSoft S.A. | ||
5 | * 43, rue d'Aboukir | ||
6 | * 75002 Paris - France | ||
7 | * http://www.linux-mandrake.com/ | ||
8 | * http://www.mandrakesoft.com/ | ||
9 | * | ||
10 | * This library is free software; you can redistribute it and/or | ||
11 | * modify it under the terms of the GNU Lesser General Public | ||
12 | * License as published by the Free Software Foundation; either | ||
13 | * version 2 of the License, or (at your option) any later version. | ||
14 | * | ||
15 | * This library is distributed in the hope that it will be useful, | ||
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
18 | * Lesser General Public License for more details. | ||
19 | * | ||
20 | * You should have received a copy of the GNU Lesser General Public | ||
21 | * License along with this library; if not, write to the Free Software | ||
22 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
23 | * | ||
24 | * Yunhong Jiang <yunhong.jiang@intel.com> | ||
25 | * Yaozu (Eddie) Dong <eddie.dong@intel.com> | ||
26 | * Based on Xen 3.1 code. | ||
27 | */ | ||
28 | |||
29 | #include "kvm.h" | ||
30 | #include <linux/kvm.h> | ||
31 | #include <linux/mm.h> | ||
32 | #include <linux/highmem.h> | ||
33 | #include <linux/smp.h> | ||
34 | #include <linux/hrtimer.h> | ||
35 | #include <linux/io.h> | ||
36 | #include <asm/processor.h> | ||
37 | #include <asm/msr.h> | ||
38 | #include <asm/page.h> | ||
39 | #include <asm/current.h> | ||
40 | #include <asm/apicdef.h> | ||
41 | #include <asm/io_apic.h> | ||
42 | #include "irq.h" | ||
43 | /* #define ioapic_debug(fmt,arg...) printk(KERN_WARNING fmt,##arg) */ | ||
44 | #define ioapic_debug(fmt, arg...) | ||
45 | static void ioapic_deliver(struct kvm_ioapic *vioapic, int irq); | ||
46 | |||
47 | static unsigned long ioapic_read_indirect(struct kvm_ioapic *ioapic, | ||
48 | unsigned long addr, | ||
49 | unsigned long length) | ||
50 | { | ||
51 | unsigned long result = 0; | ||
52 | |||
53 | switch (ioapic->ioregsel) { | ||
54 | case IOAPIC_REG_VERSION: | ||
55 | result = ((((IOAPIC_NUM_PINS - 1) & 0xff) << 16) | ||
56 | | (IOAPIC_VERSION_ID & 0xff)); | ||
57 | break; | ||
58 | |||
59 | case IOAPIC_REG_APIC_ID: | ||
60 | case IOAPIC_REG_ARB_ID: | ||
61 | result = ((ioapic->id & 0xf) << 24); | ||
62 | break; | ||
63 | |||
64 | default: | ||
65 | { | ||
66 | u32 redir_index = (ioapic->ioregsel - 0x10) >> 1; | ||
67 | u64 redir_content; | ||
68 | |||
69 | ASSERT(redir_index < IOAPIC_NUM_PINS); | ||
70 | |||
71 | redir_content = ioapic->redirtbl[redir_index].bits; | ||
72 | result = (ioapic->ioregsel & 0x1) ? | ||
73 | (redir_content >> 32) & 0xffffffff : | ||
74 | redir_content & 0xffffffff; | ||
75 | break; | ||
76 | } | ||
77 | } | ||
78 | |||
79 | return result; | ||
80 | } | ||
81 | |||
82 | static void ioapic_service(struct kvm_ioapic *ioapic, unsigned int idx) | ||
83 | { | ||
84 | union ioapic_redir_entry *pent; | ||
85 | |||
86 | pent = &ioapic->redirtbl[idx]; | ||
87 | |||
88 | if (!pent->fields.mask) { | ||
89 | ioapic_deliver(ioapic, idx); | ||
90 | if (pent->fields.trig_mode == IOAPIC_LEVEL_TRIG) | ||
91 | pent->fields.remote_irr = 1; | ||
92 | } | ||
93 | if (!pent->fields.trig_mode) | ||
94 | ioapic->irr &= ~(1 << idx); | ||
95 | } | ||
96 | |||
97 | static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val) | ||
98 | { | ||
99 | unsigned index; | ||
100 | |||
101 | switch (ioapic->ioregsel) { | ||
102 | case IOAPIC_REG_VERSION: | ||
103 | /* Writes are ignored. */ | ||
104 | break; | ||
105 | |||
106 | case IOAPIC_REG_APIC_ID: | ||
107 | ioapic->id = (val >> 24) & 0xf; | ||
108 | break; | ||
109 | |||
110 | case IOAPIC_REG_ARB_ID: | ||
111 | break; | ||
112 | |||
113 | default: | ||
114 | index = (ioapic->ioregsel - 0x10) >> 1; | ||
115 | |||
116 | ioapic_debug("change redir index %x val %x", index, val); | ||
117 | if (index >= IOAPIC_NUM_PINS) | ||
118 | return; | ||
119 | if (ioapic->ioregsel & 1) { | ||
120 | ioapic->redirtbl[index].bits &= 0xffffffff; | ||
121 | ioapic->redirtbl[index].bits |= (u64) val << 32; | ||
122 | } else { | ||
123 | ioapic->redirtbl[index].bits &= ~0xffffffffULL; | ||
124 | ioapic->redirtbl[index].bits |= (u32) val; | ||
125 | ioapic->redirtbl[index].fields.remote_irr = 0; | ||
126 | } | ||
127 | if (ioapic->irr & (1 << index)) | ||
128 | ioapic_service(ioapic, index); | ||
129 | break; | ||
130 | } | ||
131 | } | ||
132 | |||
133 | static void ioapic_inj_irq(struct kvm_ioapic *ioapic, | ||
134 | struct kvm_lapic *target, | ||
135 | u8 vector, u8 trig_mode, u8 delivery_mode) | ||
136 | { | ||
137 | ioapic_debug("irq %d trig %d deliv %d", vector, trig_mode, | ||
138 | delivery_mode); | ||
139 | |||
140 | ASSERT((delivery_mode == dest_Fixed) || | ||
141 | (delivery_mode == dest_LowestPrio)); | ||
142 | |||
143 | kvm_apic_set_irq(target, vector, trig_mode); | ||
144 | } | ||
145 | |||
146 | static u32 ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest, | ||
147 | u8 dest_mode) | ||
148 | { | ||
149 | u32 mask = 0; | ||
150 | int i; | ||
151 | struct kvm *kvm = ioapic->kvm; | ||
152 | struct kvm_vcpu *vcpu; | ||
153 | |||
154 | ioapic_debug("dest %d dest_mode %d", dest, dest_mode); | ||
155 | |||
156 | if (dest_mode == 0) { /* Physical mode. */ | ||
157 | if (dest == 0xFF) { /* Broadcast. */ | ||
158 | for (i = 0; i < KVM_MAX_VCPUS; ++i) | ||
159 | if (kvm->vcpus[i] && kvm->vcpus[i]->apic) | ||
160 | mask |= 1 << i; | ||
161 | return mask; | ||
162 | } | ||
163 | for (i = 0; i < KVM_MAX_VCPUS; ++i) { | ||
164 | vcpu = kvm->vcpus[i]; | ||
165 | if (!vcpu) | ||
166 | continue; | ||
167 | if (kvm_apic_match_physical_addr(vcpu->apic, dest)) { | ||
168 | if (vcpu->apic) | ||
169 | mask = 1 << i; | ||
170 | break; | ||
171 | } | ||
172 | } | ||
173 | } else if (dest != 0) /* Logical mode, MDA non-zero. */ | ||
174 | for (i = 0; i < KVM_MAX_VCPUS; ++i) { | ||
175 | vcpu = kvm->vcpus[i]; | ||
176 | if (!vcpu) | ||
177 | continue; | ||
178 | if (vcpu->apic && | ||
179 | kvm_apic_match_logical_addr(vcpu->apic, dest)) | ||
180 | mask |= 1 << vcpu->vcpu_id; | ||
181 | } | ||
182 | ioapic_debug("mask %x", mask); | ||
183 | return mask; | ||
184 | } | ||
185 | |||
186 | static void ioapic_deliver(struct kvm_ioapic *ioapic, int irq) | ||
187 | { | ||
188 | u8 dest = ioapic->redirtbl[irq].fields.dest_id; | ||
189 | u8 dest_mode = ioapic->redirtbl[irq].fields.dest_mode; | ||
190 | u8 delivery_mode = ioapic->redirtbl[irq].fields.delivery_mode; | ||
191 | u8 vector = ioapic->redirtbl[irq].fields.vector; | ||
192 | u8 trig_mode = ioapic->redirtbl[irq].fields.trig_mode; | ||
193 | u32 deliver_bitmask; | ||
194 | struct kvm_lapic *target; | ||
195 | struct kvm_vcpu *vcpu; | ||
196 | int vcpu_id; | ||
197 | |||
198 | ioapic_debug("dest=%x dest_mode=%x delivery_mode=%x " | ||
199 | "vector=%x trig_mode=%x", | ||
200 | dest, dest_mode, delivery_mode, vector, trig_mode); | ||
201 | |||
202 | deliver_bitmask = ioapic_get_delivery_bitmask(ioapic, dest, dest_mode); | ||
203 | if (!deliver_bitmask) { | ||
204 | ioapic_debug("no target on destination"); | ||
205 | return; | ||
206 | } | ||
207 | |||
208 | switch (delivery_mode) { | ||
209 | case dest_LowestPrio: | ||
210 | target = | ||
211 | kvm_apic_round_robin(ioapic->kvm, vector, deliver_bitmask); | ||
212 | if (target != NULL) | ||
213 | ioapic_inj_irq(ioapic, target, vector, | ||
214 | trig_mode, delivery_mode); | ||
215 | else | ||
216 | ioapic_debug("null round robin: " | ||
217 | "mask=%x vector=%x delivery_mode=%x", | ||
218 | deliver_bitmask, vector, dest_LowestPrio); | ||
219 | break; | ||
220 | case dest_Fixed: | ||
221 | for (vcpu_id = 0; deliver_bitmask != 0; vcpu_id++) { | ||
222 | if (!(deliver_bitmask & (1 << vcpu_id))) | ||
223 | continue; | ||
224 | deliver_bitmask &= ~(1 << vcpu_id); | ||
225 | vcpu = ioapic->kvm->vcpus[vcpu_id]; | ||
226 | if (vcpu) { | ||
227 | target = vcpu->apic; | ||
228 | ioapic_inj_irq(ioapic, target, vector, | ||
229 | trig_mode, delivery_mode); | ||
230 | } | ||
231 | } | ||
232 | break; | ||
233 | |||
234 | /* TODO: NMI */ | ||
235 | default: | ||
236 | printk(KERN_WARNING "Unsupported delivery mode %d\n", | ||
237 | delivery_mode); | ||
238 | break; | ||
239 | } | ||
240 | } | ||
241 | |||
242 | void kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level) | ||
243 | { | ||
244 | u32 old_irr = ioapic->irr; | ||
245 | u32 mask = 1 << irq; | ||
246 | union ioapic_redir_entry entry; | ||
247 | |||
248 | if (irq >= 0 && irq < IOAPIC_NUM_PINS) { | ||
249 | entry = ioapic->redirtbl[irq]; | ||
250 | level ^= entry.fields.polarity; | ||
251 | if (!level) | ||
252 | ioapic->irr &= ~mask; | ||
253 | else { | ||
254 | ioapic->irr |= mask; | ||
255 | if ((!entry.fields.trig_mode && old_irr != ioapic->irr) | ||
256 | || !entry.fields.remote_irr) | ||
257 | ioapic_service(ioapic, irq); | ||
258 | } | ||
259 | } | ||
260 | } | ||
261 | |||
262 | static int get_eoi_gsi(struct kvm_ioapic *ioapic, int vector) | ||
263 | { | ||
264 | int i; | ||
265 | |||
266 | for (i = 0; i < IOAPIC_NUM_PINS; i++) | ||
267 | if (ioapic->redirtbl[i].fields.vector == vector) | ||
268 | return i; | ||
269 | return -1; | ||
270 | } | ||
271 | |||
272 | void kvm_ioapic_update_eoi(struct kvm *kvm, int vector) | ||
273 | { | ||
274 | struct kvm_ioapic *ioapic = kvm->vioapic; | ||
275 | union ioapic_redir_entry *ent; | ||
276 | int gsi; | ||
277 | |||
278 | gsi = get_eoi_gsi(ioapic, vector); | ||
279 | if (gsi == -1) { | ||
280 | printk(KERN_WARNING "Can't find redir item for %d EOI\n", | ||
281 | vector); | ||
282 | return; | ||
283 | } | ||
284 | |||
285 | ent = &ioapic->redirtbl[gsi]; | ||
286 | ASSERT(ent->fields.trig_mode == IOAPIC_LEVEL_TRIG); | ||
287 | |||
288 | ent->fields.remote_irr = 0; | ||
289 | if (!ent->fields.mask && (ioapic->irr & (1 << gsi))) | ||
290 | ioapic_deliver(ioapic, gsi); | ||
291 | } | ||
292 | |||
293 | static int ioapic_in_range(struct kvm_io_device *this, gpa_t addr) | ||
294 | { | ||
295 | struct kvm_ioapic *ioapic = (struct kvm_ioapic *)this->private; | ||
296 | |||
297 | return ((addr >= ioapic->base_address && | ||
298 | (addr < ioapic->base_address + IOAPIC_MEM_LENGTH))); | ||
299 | } | ||
300 | |||
301 | static void ioapic_mmio_read(struct kvm_io_device *this, gpa_t addr, int len, | ||
302 | void *val) | ||
303 | { | ||
304 | struct kvm_ioapic *ioapic = (struct kvm_ioapic *)this->private; | ||
305 | u32 result; | ||
306 | |||
307 | ioapic_debug("addr %lx", (unsigned long)addr); | ||
308 | ASSERT(!(addr & 0xf)); /* check alignment */ | ||
309 | |||
310 | addr &= 0xff; | ||
311 | switch (addr) { | ||
312 | case IOAPIC_REG_SELECT: | ||
313 | result = ioapic->ioregsel; | ||
314 | break; | ||
315 | |||
316 | case IOAPIC_REG_WINDOW: | ||
317 | result = ioapic_read_indirect(ioapic, addr, len); | ||
318 | break; | ||
319 | |||
320 | default: | ||
321 | result = 0; | ||
322 | break; | ||
323 | } | ||
324 | switch (len) { | ||
325 | case 8: | ||
326 | *(u64 *) val = result; | ||
327 | break; | ||
328 | case 1: | ||
329 | case 2: | ||
330 | case 4: | ||
331 | memcpy(val, (char *)&result, len); | ||
332 | break; | ||
333 | default: | ||
334 | printk(KERN_WARNING "ioapic: wrong length %d\n", len); | ||
335 | } | ||
336 | } | ||
337 | |||
338 | static void ioapic_mmio_write(struct kvm_io_device *this, gpa_t addr, int len, | ||
339 | const void *val) | ||
340 | { | ||
341 | struct kvm_ioapic *ioapic = (struct kvm_ioapic *)this->private; | ||
342 | u32 data; | ||
343 | |||
344 | ioapic_debug("ioapic_mmio_write addr=%lx len=%d val=%p\n", | ||
345 | addr, len, val); | ||
346 | ASSERT(!(addr & 0xf)); /* check alignment */ | ||
347 | if (len == 4 || len == 8) | ||
348 | data = *(u32 *) val; | ||
349 | else { | ||
350 | printk(KERN_WARNING "ioapic: Unsupported size %d\n", len); | ||
351 | return; | ||
352 | } | ||
353 | |||
354 | addr &= 0xff; | ||
355 | switch (addr) { | ||
356 | case IOAPIC_REG_SELECT: | ||
357 | ioapic->ioregsel = data; | ||
358 | break; | ||
359 | |||
360 | case IOAPIC_REG_WINDOW: | ||
361 | ioapic_write_indirect(ioapic, data); | ||
362 | break; | ||
363 | |||
364 | default: | ||
365 | break; | ||
366 | } | ||
367 | } | ||
368 | |||
369 | int kvm_ioapic_init(struct kvm *kvm) | ||
370 | { | ||
371 | struct kvm_ioapic *ioapic; | ||
372 | int i; | ||
373 | |||
374 | ioapic = kzalloc(sizeof(struct kvm_ioapic), GFP_KERNEL); | ||
375 | if (!ioapic) | ||
376 | return -ENOMEM; | ||
377 | kvm->vioapic = ioapic; | ||
378 | for (i = 0; i < IOAPIC_NUM_PINS; i++) | ||
379 | ioapic->redirtbl[i].fields.mask = 1; | ||
380 | ioapic->base_address = IOAPIC_DEFAULT_BASE_ADDRESS; | ||
381 | ioapic->dev.read = ioapic_mmio_read; | ||
382 | ioapic->dev.write = ioapic_mmio_write; | ||
383 | ioapic->dev.in_range = ioapic_in_range; | ||
384 | ioapic->dev.private = ioapic; | ||
385 | ioapic->kvm = kvm; | ||
386 | kvm_io_bus_register_dev(&kvm->mmio_bus, &ioapic->dev); | ||
387 | return 0; | ||
388 | } | ||
diff --git a/drivers/kvm/irq.c b/drivers/kvm/irq.c deleted file mode 100644 index 7628c7ff628f..000000000000 --- a/drivers/kvm/irq.c +++ /dev/null | |||
@@ -1,98 +0,0 @@ | |||
1 | /* | ||
2 | * irq.c: API for in kernel interrupt controller | ||
3 | * Copyright (c) 2007, Intel Corporation. | ||
4 | * | ||
5 | * This program is free software; you can redistribute it and/or modify it | ||
6 | * under the terms and conditions of the GNU General Public License, | ||
7 | * version 2, as published by the Free Software Foundation. | ||
8 | * | ||
9 | * This program is distributed in the hope it will be useful, but WITHOUT | ||
10 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | ||
11 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for | ||
12 | * more details. | ||
13 | * | ||
14 | * You should have received a copy of the GNU General Public License along with | ||
15 | * this program; if not, write to the Free Software Foundation, Inc., 59 Temple | ||
16 | * Place - Suite 330, Boston, MA 02111-1307 USA. | ||
17 | * Authors: | ||
18 | * Yaozu (Eddie) Dong <Eddie.dong@intel.com> | ||
19 | * | ||
20 | */ | ||
21 | |||
22 | #include <linux/module.h> | ||
23 | |||
24 | #include "kvm.h" | ||
25 | #include "irq.h" | ||
26 | |||
27 | /* | ||
28 | * check if there is pending interrupt without | ||
29 | * intack. | ||
30 | */ | ||
31 | int kvm_cpu_has_interrupt(struct kvm_vcpu *v) | ||
32 | { | ||
33 | struct kvm_pic *s; | ||
34 | |||
35 | if (kvm_apic_has_interrupt(v) == -1) { /* LAPIC */ | ||
36 | if (kvm_apic_accept_pic_intr(v)) { | ||
37 | s = pic_irqchip(v->kvm); /* PIC */ | ||
38 | return s->output; | ||
39 | } else | ||
40 | return 0; | ||
41 | } | ||
42 | return 1; | ||
43 | } | ||
44 | EXPORT_SYMBOL_GPL(kvm_cpu_has_interrupt); | ||
45 | |||
46 | /* | ||
47 | * Read pending interrupt vector and intack. | ||
48 | */ | ||
49 | int kvm_cpu_get_interrupt(struct kvm_vcpu *v) | ||
50 | { | ||
51 | struct kvm_pic *s; | ||
52 | int vector; | ||
53 | |||
54 | vector = kvm_get_apic_interrupt(v); /* APIC */ | ||
55 | if (vector == -1) { | ||
56 | if (kvm_apic_accept_pic_intr(v)) { | ||
57 | s = pic_irqchip(v->kvm); | ||
58 | s->output = 0; /* PIC */ | ||
59 | vector = kvm_pic_read_irq(s); | ||
60 | } | ||
61 | } | ||
62 | return vector; | ||
63 | } | ||
64 | EXPORT_SYMBOL_GPL(kvm_cpu_get_interrupt); | ||
65 | |||
66 | static void vcpu_kick_intr(void *info) | ||
67 | { | ||
68 | #ifdef DEBUG | ||
69 | struct kvm_vcpu *vcpu = (struct kvm_vcpu *)info; | ||
70 | printk(KERN_DEBUG "vcpu_kick_intr %p \n", vcpu); | ||
71 | #endif | ||
72 | } | ||
73 | |||
74 | void kvm_vcpu_kick(struct kvm_vcpu *vcpu) | ||
75 | { | ||
76 | int ipi_pcpu = vcpu->cpu; | ||
77 | |||
78 | if (waitqueue_active(&vcpu->wq)) { | ||
79 | wake_up_interruptible(&vcpu->wq); | ||
80 | ++vcpu->stat.halt_wakeup; | ||
81 | } | ||
82 | if (vcpu->guest_mode) | ||
83 | smp_call_function_single(ipi_pcpu, vcpu_kick_intr, vcpu, 0, 0); | ||
84 | } | ||
85 | |||
86 | void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu) | ||
87 | { | ||
88 | kvm_inject_apic_timer_irqs(vcpu); | ||
89 | /* TODO: PIT, RTC etc. */ | ||
90 | } | ||
91 | EXPORT_SYMBOL_GPL(kvm_inject_pending_timer_irqs); | ||
92 | |||
93 | void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec) | ||
94 | { | ||
95 | kvm_apic_timer_intr_post(vcpu, vec); | ||
96 | /* TODO: PIT, RTC etc. */ | ||
97 | } | ||
98 | EXPORT_SYMBOL_GPL(kvm_timer_intr_post); | ||
diff --git a/drivers/kvm/irq.h b/drivers/kvm/irq.h deleted file mode 100644 index 11fc014e2b30..000000000000 --- a/drivers/kvm/irq.h +++ /dev/null | |||
@@ -1,165 +0,0 @@ | |||
1 | /* | ||
2 | * irq.h: in kernel interrupt controller related definitions | ||
3 | * Copyright (c) 2007, Intel Corporation. | ||
4 | * | ||
5 | * This program is free software; you can redistribute it and/or modify it | ||
6 | * under the terms and conditions of the GNU General Public License, | ||
7 | * version 2, as published by the Free Software Foundation. | ||
8 | * | ||
9 | * This program is distributed in the hope it will be useful, but WITHOUT | ||
10 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | ||
11 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for | ||
12 | * more details. | ||
13 | * | ||
14 | * You should have received a copy of the GNU General Public License along with | ||
15 | * this program; if not, write to the Free Software Foundation, Inc., 59 Temple | ||
16 | * Place - Suite 330, Boston, MA 02111-1307 USA. | ||
17 | * Authors: | ||
18 | * Yaozu (Eddie) Dong <Eddie.dong@intel.com> | ||
19 | * | ||
20 | */ | ||
21 | |||
22 | #ifndef __IRQ_H | ||
23 | #define __IRQ_H | ||
24 | |||
25 | #include "kvm.h" | ||
26 | |||
27 | typedef void irq_request_func(void *opaque, int level); | ||
28 | |||
29 | struct kvm_kpic_state { | ||
30 | u8 last_irr; /* edge detection */ | ||
31 | u8 irr; /* interrupt request register */ | ||
32 | u8 imr; /* interrupt mask register */ | ||
33 | u8 isr; /* interrupt service register */ | ||
34 | u8 priority_add; /* highest irq priority */ | ||
35 | u8 irq_base; | ||
36 | u8 read_reg_select; | ||
37 | u8 poll; | ||
38 | u8 special_mask; | ||
39 | u8 init_state; | ||
40 | u8 auto_eoi; | ||
41 | u8 rotate_on_auto_eoi; | ||
42 | u8 special_fully_nested_mode; | ||
43 | u8 init4; /* true if 4 byte init */ | ||
44 | u8 elcr; /* PIIX edge/trigger selection */ | ||
45 | u8 elcr_mask; | ||
46 | struct kvm_pic *pics_state; | ||
47 | }; | ||
48 | |||
49 | struct kvm_pic { | ||
50 | struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */ | ||
51 | irq_request_func *irq_request; | ||
52 | void *irq_request_opaque; | ||
53 | int output; /* intr from master PIC */ | ||
54 | struct kvm_io_device dev; | ||
55 | }; | ||
56 | |||
57 | struct kvm_pic *kvm_create_pic(struct kvm *kvm); | ||
58 | void kvm_pic_set_irq(void *opaque, int irq, int level); | ||
59 | int kvm_pic_read_irq(struct kvm_pic *s); | ||
60 | int kvm_cpu_get_interrupt(struct kvm_vcpu *v); | ||
61 | int kvm_cpu_has_interrupt(struct kvm_vcpu *v); | ||
62 | void kvm_pic_update_irq(struct kvm_pic *s); | ||
63 | |||
64 | #define IOAPIC_NUM_PINS KVM_IOAPIC_NUM_PINS | ||
65 | #define IOAPIC_VERSION_ID 0x11 /* IOAPIC version */ | ||
66 | #define IOAPIC_EDGE_TRIG 0 | ||
67 | #define IOAPIC_LEVEL_TRIG 1 | ||
68 | |||
69 | #define IOAPIC_DEFAULT_BASE_ADDRESS 0xfec00000 | ||
70 | #define IOAPIC_MEM_LENGTH 0x100 | ||
71 | |||
72 | /* Direct registers. */ | ||
73 | #define IOAPIC_REG_SELECT 0x00 | ||
74 | #define IOAPIC_REG_WINDOW 0x10 | ||
75 | #define IOAPIC_REG_EOI 0x40 /* IA64 IOSAPIC only */ | ||
76 | |||
77 | /* Indirect registers. */ | ||
78 | #define IOAPIC_REG_APIC_ID 0x00 /* x86 IOAPIC only */ | ||
79 | #define IOAPIC_REG_VERSION 0x01 | ||
80 | #define IOAPIC_REG_ARB_ID 0x02 /* x86 IOAPIC only */ | ||
81 | |||
82 | struct kvm_ioapic { | ||
83 | u64 base_address; | ||
84 | u32 ioregsel; | ||
85 | u32 id; | ||
86 | u32 irr; | ||
87 | u32 pad; | ||
88 | union ioapic_redir_entry { | ||
89 | u64 bits; | ||
90 | struct { | ||
91 | u8 vector; | ||
92 | u8 delivery_mode:3; | ||
93 | u8 dest_mode:1; | ||
94 | u8 delivery_status:1; | ||
95 | u8 polarity:1; | ||
96 | u8 remote_irr:1; | ||
97 | u8 trig_mode:1; | ||
98 | u8 mask:1; | ||
99 | u8 reserve:7; | ||
100 | u8 reserved[4]; | ||
101 | u8 dest_id; | ||
102 | } fields; | ||
103 | } redirtbl[IOAPIC_NUM_PINS]; | ||
104 | struct kvm_io_device dev; | ||
105 | struct kvm *kvm; | ||
106 | }; | ||
107 | |||
108 | struct kvm_lapic { | ||
109 | unsigned long base_address; | ||
110 | struct kvm_io_device dev; | ||
111 | struct { | ||
112 | atomic_t pending; | ||
113 | s64 period; /* unit: ns */ | ||
114 | u32 divide_count; | ||
115 | ktime_t last_update; | ||
116 | struct hrtimer dev; | ||
117 | } timer; | ||
118 | struct kvm_vcpu *vcpu; | ||
119 | struct page *regs_page; | ||
120 | void *regs; | ||
121 | }; | ||
122 | |||
123 | #ifdef DEBUG | ||
124 | #define ASSERT(x) \ | ||
125 | do { \ | ||
126 | if (!(x)) { \ | ||
127 | printk(KERN_EMERG "assertion failed %s: %d: %s\n", \ | ||
128 | __FILE__, __LINE__, #x); \ | ||
129 | BUG(); \ | ||
130 | } \ | ||
131 | } while (0) | ||
132 | #else | ||
133 | #define ASSERT(x) do { } while (0) | ||
134 | #endif | ||
135 | |||
136 | void kvm_vcpu_kick(struct kvm_vcpu *vcpu); | ||
137 | int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu); | ||
138 | int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu); | ||
139 | int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu); | ||
140 | int kvm_create_lapic(struct kvm_vcpu *vcpu); | ||
141 | void kvm_lapic_reset(struct kvm_vcpu *vcpu); | ||
142 | void kvm_free_apic(struct kvm_lapic *apic); | ||
143 | u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu); | ||
144 | void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8); | ||
145 | void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value); | ||
146 | struct kvm_lapic *kvm_apic_round_robin(struct kvm *kvm, u8 vector, | ||
147 | unsigned long bitmap); | ||
148 | u64 kvm_get_apic_base(struct kvm_vcpu *vcpu); | ||
149 | void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data); | ||
150 | int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest); | ||
151 | void kvm_ioapic_update_eoi(struct kvm *kvm, int vector); | ||
152 | int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda); | ||
153 | int kvm_apic_set_irq(struct kvm_lapic *apic, u8 vec, u8 trig); | ||
154 | void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu); | ||
155 | int kvm_ioapic_init(struct kvm *kvm); | ||
156 | void kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level); | ||
157 | int kvm_lapic_enabled(struct kvm_vcpu *vcpu); | ||
158 | int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu); | ||
159 | void kvm_apic_timer_intr_post(struct kvm_vcpu *vcpu, int vec); | ||
160 | void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec); | ||
161 | void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu); | ||
162 | void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu); | ||
163 | void kvm_migrate_apic_timer(struct kvm_vcpu *vcpu); | ||
164 | |||
165 | #endif | ||
diff --git a/drivers/kvm/kvm.h b/drivers/kvm/kvm.h deleted file mode 100644 index 3b0bc4bda5f2..000000000000 --- a/drivers/kvm/kvm.h +++ /dev/null | |||
@@ -1,796 +0,0 @@ | |||
1 | #ifndef __KVM_H | ||
2 | #define __KVM_H | ||
3 | |||
4 | /* | ||
5 | * This work is licensed under the terms of the GNU GPL, version 2. See | ||
6 | * the COPYING file in the top-level directory. | ||
7 | */ | ||
8 | |||
9 | #include <linux/types.h> | ||
10 | #include <linux/list.h> | ||
11 | #include <linux/mutex.h> | ||
12 | #include <linux/spinlock.h> | ||
13 | #include <linux/signal.h> | ||
14 | #include <linux/sched.h> | ||
15 | #include <linux/mm.h> | ||
16 | #include <linux/preempt.h> | ||
17 | #include <asm/signal.h> | ||
18 | |||
19 | #include <linux/kvm.h> | ||
20 | #include <linux/kvm_para.h> | ||
21 | |||
22 | #define CR3_PAE_RESERVED_BITS ((X86_CR3_PWT | X86_CR3_PCD) - 1) | ||
23 | #define CR3_NONPAE_RESERVED_BITS ((PAGE_SIZE-1) & ~(X86_CR3_PWT | X86_CR3_PCD)) | ||
24 | #define CR3_L_MODE_RESERVED_BITS (CR3_NONPAE_RESERVED_BITS|0xFFFFFF0000000000ULL) | ||
25 | |||
26 | #define KVM_GUEST_CR0_MASK \ | ||
27 | (X86_CR0_PG | X86_CR0_PE | X86_CR0_WP | X86_CR0_NE \ | ||
28 | | X86_CR0_NW | X86_CR0_CD) | ||
29 | #define KVM_VM_CR0_ALWAYS_ON \ | ||
30 | (X86_CR0_PG | X86_CR0_PE | X86_CR0_WP | X86_CR0_NE | X86_CR0_TS \ | ||
31 | | X86_CR0_MP) | ||
32 | #define KVM_GUEST_CR4_MASK \ | ||
33 | (X86_CR4_VME | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_PGE | X86_CR4_VMXE) | ||
34 | #define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE) | ||
35 | #define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE) | ||
36 | |||
37 | #define INVALID_PAGE (~(hpa_t)0) | ||
38 | #define UNMAPPED_GVA (~(gpa_t)0) | ||
39 | |||
40 | #define KVM_MAX_VCPUS 4 | ||
41 | #define KVM_ALIAS_SLOTS 4 | ||
42 | #define KVM_MEMORY_SLOTS 8 | ||
43 | #define KVM_NUM_MMU_PAGES 1024 | ||
44 | #define KVM_MIN_FREE_MMU_PAGES 5 | ||
45 | #define KVM_REFILL_PAGES 25 | ||
46 | #define KVM_MAX_CPUID_ENTRIES 40 | ||
47 | |||
48 | #define DE_VECTOR 0 | ||
49 | #define NM_VECTOR 7 | ||
50 | #define DF_VECTOR 8 | ||
51 | #define TS_VECTOR 10 | ||
52 | #define NP_VECTOR 11 | ||
53 | #define SS_VECTOR 12 | ||
54 | #define GP_VECTOR 13 | ||
55 | #define PF_VECTOR 14 | ||
56 | |||
57 | #define SELECTOR_TI_MASK (1 << 2) | ||
58 | #define SELECTOR_RPL_MASK 0x03 | ||
59 | |||
60 | #define IOPL_SHIFT 12 | ||
61 | |||
62 | #define KVM_PIO_PAGE_OFFSET 1 | ||
63 | |||
64 | /* | ||
65 | * vcpu->requests bit members | ||
66 | */ | ||
67 | #define KVM_TLB_FLUSH 0 | ||
68 | |||
69 | /* | ||
70 | * Address types: | ||
71 | * | ||
72 | * gva - guest virtual address | ||
73 | * gpa - guest physical address | ||
74 | * gfn - guest frame number | ||
75 | * hva - host virtual address | ||
76 | * hpa - host physical address | ||
77 | * hfn - host frame number | ||
78 | */ | ||
79 | |||
80 | typedef unsigned long gva_t; | ||
81 | typedef u64 gpa_t; | ||
82 | typedef unsigned long gfn_t; | ||
83 | |||
84 | typedef unsigned long hva_t; | ||
85 | typedef u64 hpa_t; | ||
86 | typedef unsigned long hfn_t; | ||
87 | |||
88 | #define NR_PTE_CHAIN_ENTRIES 5 | ||
89 | |||
90 | struct kvm_pte_chain { | ||
91 | u64 *parent_ptes[NR_PTE_CHAIN_ENTRIES]; | ||
92 | struct hlist_node link; | ||
93 | }; | ||
94 | |||
95 | /* | ||
96 | * kvm_mmu_page_role, below, is defined as: | ||
97 | * | ||
98 | * bits 0:3 - total guest paging levels (2-4, or zero for real mode) | ||
99 | * bits 4:7 - page table level for this shadow (1-4) | ||
100 | * bits 8:9 - page table quadrant for 2-level guests | ||
101 | * bit 16 - "metaphysical" - gfn is not a real page (huge page/real mode) | ||
102 | * bits 17:19 - "access" - the user, writable, and nx bits of a huge page pde | ||
103 | */ | ||
104 | union kvm_mmu_page_role { | ||
105 | unsigned word; | ||
106 | struct { | ||
107 | unsigned glevels : 4; | ||
108 | unsigned level : 4; | ||
109 | unsigned quadrant : 2; | ||
110 | unsigned pad_for_nice_hex_output : 6; | ||
111 | unsigned metaphysical : 1; | ||
112 | unsigned hugepage_access : 3; | ||
113 | }; | ||
114 | }; | ||
115 | |||
116 | struct kvm_mmu_page { | ||
117 | struct list_head link; | ||
118 | struct hlist_node hash_link; | ||
119 | |||
120 | /* | ||
121 | * The following two entries are used to key the shadow page in the | ||
122 | * hash table. | ||
123 | */ | ||
124 | gfn_t gfn; | ||
125 | union kvm_mmu_page_role role; | ||
126 | |||
127 | u64 *spt; | ||
128 | unsigned long slot_bitmap; /* One bit set per slot which has memory | ||
129 | * in this shadow page. | ||
130 | */ | ||
131 | int multimapped; /* More than one parent_pte? */ | ||
132 | int root_count; /* Currently serving as active root */ | ||
133 | union { | ||
134 | u64 *parent_pte; /* !multimapped */ | ||
135 | struct hlist_head parent_ptes; /* multimapped, kvm_pte_chain */ | ||
136 | }; | ||
137 | }; | ||
138 | |||
139 | struct kvm_vcpu; | ||
140 | extern struct kmem_cache *kvm_vcpu_cache; | ||
141 | |||
142 | /* | ||
143 | * x86 supports 3 paging modes (4-level 64-bit, 3-level 64-bit, and 2-level | ||
144 | * 32-bit). The kvm_mmu structure abstracts the details of the current mmu | ||
145 | * mode. | ||
146 | */ | ||
147 | struct kvm_mmu { | ||
148 | void (*new_cr3)(struct kvm_vcpu *vcpu); | ||
149 | int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err); | ||
150 | void (*free)(struct kvm_vcpu *vcpu); | ||
151 | gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva); | ||
152 | hpa_t root_hpa; | ||
153 | int root_level; | ||
154 | int shadow_root_level; | ||
155 | |||
156 | u64 *pae_root; | ||
157 | }; | ||
158 | |||
159 | #define KVM_NR_MEM_OBJS 20 | ||
160 | |||
161 | struct kvm_mmu_memory_cache { | ||
162 | int nobjs; | ||
163 | void *objects[KVM_NR_MEM_OBJS]; | ||
164 | }; | ||
165 | |||
166 | /* | ||
167 | * We don't want allocation failures within the mmu code, so we preallocate | ||
168 | * enough memory for a single page fault in a cache. | ||
169 | */ | ||
170 | struct kvm_guest_debug { | ||
171 | int enabled; | ||
172 | unsigned long bp[4]; | ||
173 | int singlestep; | ||
174 | }; | ||
175 | |||
176 | enum { | ||
177 | VCPU_REGS_RAX = 0, | ||
178 | VCPU_REGS_RCX = 1, | ||
179 | VCPU_REGS_RDX = 2, | ||
180 | VCPU_REGS_RBX = 3, | ||
181 | VCPU_REGS_RSP = 4, | ||
182 | VCPU_REGS_RBP = 5, | ||
183 | VCPU_REGS_RSI = 6, | ||
184 | VCPU_REGS_RDI = 7, | ||
185 | #ifdef CONFIG_X86_64 | ||
186 | VCPU_REGS_R8 = 8, | ||
187 | VCPU_REGS_R9 = 9, | ||
188 | VCPU_REGS_R10 = 10, | ||
189 | VCPU_REGS_R11 = 11, | ||
190 | VCPU_REGS_R12 = 12, | ||
191 | VCPU_REGS_R13 = 13, | ||
192 | VCPU_REGS_R14 = 14, | ||
193 | VCPU_REGS_R15 = 15, | ||
194 | #endif | ||
195 | NR_VCPU_REGS | ||
196 | }; | ||
197 | |||
198 | enum { | ||
199 | VCPU_SREG_CS, | ||
200 | VCPU_SREG_DS, | ||
201 | VCPU_SREG_ES, | ||
202 | VCPU_SREG_FS, | ||
203 | VCPU_SREG_GS, | ||
204 | VCPU_SREG_SS, | ||
205 | VCPU_SREG_TR, | ||
206 | VCPU_SREG_LDTR, | ||
207 | }; | ||
208 | |||
209 | struct kvm_pio_request { | ||
210 | unsigned long count; | ||
211 | int cur_count; | ||
212 | struct page *guest_pages[2]; | ||
213 | unsigned guest_page_offset; | ||
214 | int in; | ||
215 | int port; | ||
216 | int size; | ||
217 | int string; | ||
218 | int down; | ||
219 | int rep; | ||
220 | }; | ||
221 | |||
222 | struct kvm_stat { | ||
223 | u32 pf_fixed; | ||
224 | u32 pf_guest; | ||
225 | u32 tlb_flush; | ||
226 | u32 invlpg; | ||
227 | |||
228 | u32 exits; | ||
229 | u32 io_exits; | ||
230 | u32 mmio_exits; | ||
231 | u32 signal_exits; | ||
232 | u32 irq_window_exits; | ||
233 | u32 halt_exits; | ||
234 | u32 halt_wakeup; | ||
235 | u32 request_irq_exits; | ||
236 | u32 irq_exits; | ||
237 | u32 light_exits; | ||
238 | u32 efer_reload; | ||
239 | }; | ||
240 | |||
241 | struct kvm_io_device { | ||
242 | void (*read)(struct kvm_io_device *this, | ||
243 | gpa_t addr, | ||
244 | int len, | ||
245 | void *val); | ||
246 | void (*write)(struct kvm_io_device *this, | ||
247 | gpa_t addr, | ||
248 | int len, | ||
249 | const void *val); | ||
250 | int (*in_range)(struct kvm_io_device *this, gpa_t addr); | ||
251 | void (*destructor)(struct kvm_io_device *this); | ||
252 | |||
253 | void *private; | ||
254 | }; | ||
255 | |||
256 | static inline void kvm_iodevice_read(struct kvm_io_device *dev, | ||
257 | gpa_t addr, | ||
258 | int len, | ||
259 | void *val) | ||
260 | { | ||
261 | dev->read(dev, addr, len, val); | ||
262 | } | ||
263 | |||
264 | static inline void kvm_iodevice_write(struct kvm_io_device *dev, | ||
265 | gpa_t addr, | ||
266 | int len, | ||
267 | const void *val) | ||
268 | { | ||
269 | dev->write(dev, addr, len, val); | ||
270 | } | ||
271 | |||
272 | static inline int kvm_iodevice_inrange(struct kvm_io_device *dev, gpa_t addr) | ||
273 | { | ||
274 | return dev->in_range(dev, addr); | ||
275 | } | ||
276 | |||
277 | static inline void kvm_iodevice_destructor(struct kvm_io_device *dev) | ||
278 | { | ||
279 | if (dev->destructor) | ||
280 | dev->destructor(dev); | ||
281 | } | ||
282 | |||
283 | /* | ||
284 | * It would be nice to use something smarter than a linear search, TBD... | ||
285 | * Thankfully we dont expect many devices to register (famous last words :), | ||
286 | * so until then it will suffice. At least its abstracted so we can change | ||
287 | * in one place. | ||
288 | */ | ||
289 | struct kvm_io_bus { | ||
290 | int dev_count; | ||
291 | #define NR_IOBUS_DEVS 6 | ||
292 | struct kvm_io_device *devs[NR_IOBUS_DEVS]; | ||
293 | }; | ||
294 | |||
295 | void kvm_io_bus_init(struct kvm_io_bus *bus); | ||
296 | void kvm_io_bus_destroy(struct kvm_io_bus *bus); | ||
297 | struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus, gpa_t addr); | ||
298 | void kvm_io_bus_register_dev(struct kvm_io_bus *bus, | ||
299 | struct kvm_io_device *dev); | ||
300 | |||
301 | struct kvm_vcpu { | ||
302 | struct kvm *kvm; | ||
303 | struct preempt_notifier preempt_notifier; | ||
304 | int vcpu_id; | ||
305 | struct mutex mutex; | ||
306 | int cpu; | ||
307 | u64 host_tsc; | ||
308 | struct kvm_run *run; | ||
309 | int interrupt_window_open; | ||
310 | int guest_mode; | ||
311 | unsigned long requests; | ||
312 | unsigned long irq_summary; /* bit vector: 1 per word in irq_pending */ | ||
313 | DECLARE_BITMAP(irq_pending, KVM_NR_INTERRUPTS); | ||
314 | unsigned long regs[NR_VCPU_REGS]; /* for rsp: vcpu_load_rsp_rip() */ | ||
315 | unsigned long rip; /* needs vcpu_load_rsp_rip() */ | ||
316 | |||
317 | unsigned long cr0; | ||
318 | unsigned long cr2; | ||
319 | unsigned long cr3; | ||
320 | gpa_t para_state_gpa; | ||
321 | struct page *para_state_page; | ||
322 | gpa_t hypercall_gpa; | ||
323 | unsigned long cr4; | ||
324 | unsigned long cr8; | ||
325 | u64 pdptrs[4]; /* pae */ | ||
326 | u64 shadow_efer; | ||
327 | u64 apic_base; | ||
328 | struct kvm_lapic *apic; /* kernel irqchip context */ | ||
329 | #define VCPU_MP_STATE_RUNNABLE 0 | ||
330 | #define VCPU_MP_STATE_UNINITIALIZED 1 | ||
331 | #define VCPU_MP_STATE_INIT_RECEIVED 2 | ||
332 | #define VCPU_MP_STATE_SIPI_RECEIVED 3 | ||
333 | #define VCPU_MP_STATE_HALTED 4 | ||
334 | int mp_state; | ||
335 | int sipi_vector; | ||
336 | u64 ia32_misc_enable_msr; | ||
337 | |||
338 | struct kvm_mmu mmu; | ||
339 | |||
340 | struct kvm_mmu_memory_cache mmu_pte_chain_cache; | ||
341 | struct kvm_mmu_memory_cache mmu_rmap_desc_cache; | ||
342 | struct kvm_mmu_memory_cache mmu_page_cache; | ||
343 | struct kvm_mmu_memory_cache mmu_page_header_cache; | ||
344 | |||
345 | gfn_t last_pt_write_gfn; | ||
346 | int last_pt_write_count; | ||
347 | |||
348 | struct kvm_guest_debug guest_debug; | ||
349 | |||
350 | struct i387_fxsave_struct host_fx_image; | ||
351 | struct i387_fxsave_struct guest_fx_image; | ||
352 | int fpu_active; | ||
353 | int guest_fpu_loaded; | ||
354 | |||
355 | int mmio_needed; | ||
356 | int mmio_read_completed; | ||
357 | int mmio_is_write; | ||
358 | int mmio_size; | ||
359 | unsigned char mmio_data[8]; | ||
360 | gpa_t mmio_phys_addr; | ||
361 | gva_t mmio_fault_cr2; | ||
362 | struct kvm_pio_request pio; | ||
363 | void *pio_data; | ||
364 | wait_queue_head_t wq; | ||
365 | |||
366 | int sigset_active; | ||
367 | sigset_t sigset; | ||
368 | |||
369 | struct kvm_stat stat; | ||
370 | |||
371 | struct { | ||
372 | int active; | ||
373 | u8 save_iopl; | ||
374 | struct kvm_save_segment { | ||
375 | u16 selector; | ||
376 | unsigned long base; | ||
377 | u32 limit; | ||
378 | u32 ar; | ||
379 | } tr, es, ds, fs, gs; | ||
380 | } rmode; | ||
381 | int halt_request; /* real mode on Intel only */ | ||
382 | |||
383 | int cpuid_nent; | ||
384 | struct kvm_cpuid_entry cpuid_entries[KVM_MAX_CPUID_ENTRIES]; | ||
385 | }; | ||
386 | |||
387 | struct kvm_mem_alias { | ||
388 | gfn_t base_gfn; | ||
389 | unsigned long npages; | ||
390 | gfn_t target_gfn; | ||
391 | }; | ||
392 | |||
393 | struct kvm_memory_slot { | ||
394 | gfn_t base_gfn; | ||
395 | unsigned long npages; | ||
396 | unsigned long flags; | ||
397 | struct page **phys_mem; | ||
398 | unsigned long *dirty_bitmap; | ||
399 | }; | ||
400 | |||
401 | struct kvm { | ||
402 | struct mutex lock; /* protects everything except vcpus */ | ||
403 | int naliases; | ||
404 | struct kvm_mem_alias aliases[KVM_ALIAS_SLOTS]; | ||
405 | int nmemslots; | ||
406 | struct kvm_memory_slot memslots[KVM_MEMORY_SLOTS]; | ||
407 | /* | ||
408 | * Active shadow pages; also hashed by gfn in mmu_page_hash below. | ||
409 | */ | ||
410 | struct list_head active_mmu_pages; | ||
411 | int n_free_mmu_pages; | ||
412 | struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES]; | ||
413 | struct kvm_vcpu *vcpus[KVM_MAX_VCPUS]; | ||
414 | unsigned long rmap_overflow; | ||
415 | struct list_head vm_list; | ||
416 | struct file *filp; | ||
417 | struct kvm_io_bus mmio_bus; | ||
418 | struct kvm_io_bus pio_bus; | ||
419 | struct kvm_pic *vpic; | ||
420 | struct kvm_ioapic *vioapic; | ||
421 | int round_robin_prev_vcpu; | ||
422 | }; | ||
423 | |||
424 | static inline struct kvm_pic *pic_irqchip(struct kvm *kvm) | ||
425 | { | ||
426 | return kvm->vpic; | ||
427 | } | ||
428 | |||
429 | static inline struct kvm_ioapic *ioapic_irqchip(struct kvm *kvm) | ||
430 | { | ||
431 | return kvm->vioapic; | ||
432 | } | ||
433 | |||
434 | static inline int irqchip_in_kernel(struct kvm *kvm) | ||
435 | { | ||
436 | return pic_irqchip(kvm) != 0; | ||
437 | } | ||
438 | |||
439 | struct descriptor_table { | ||
440 | u16 limit; | ||
441 | unsigned long base; | ||
442 | } __attribute__((packed)); | ||
443 | |||
444 | struct kvm_x86_ops { | ||
445 | int (*cpu_has_kvm_support)(void); /* __init */ | ||
446 | int (*disabled_by_bios)(void); /* __init */ | ||
447 | void (*hardware_enable)(void *dummy); /* __init */ | ||
448 | void (*hardware_disable)(void *dummy); | ||
449 | void (*check_processor_compatibility)(void *rtn); | ||
450 | int (*hardware_setup)(void); /* __init */ | ||
451 | void (*hardware_unsetup)(void); /* __exit */ | ||
452 | |||
453 | /* Create, but do not attach this VCPU */ | ||
454 | struct kvm_vcpu *(*vcpu_create)(struct kvm *kvm, unsigned id); | ||
455 | void (*vcpu_free)(struct kvm_vcpu *vcpu); | ||
456 | void (*vcpu_reset)(struct kvm_vcpu *vcpu); | ||
457 | |||
458 | void (*prepare_guest_switch)(struct kvm_vcpu *vcpu); | ||
459 | void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu); | ||
460 | void (*vcpu_put)(struct kvm_vcpu *vcpu); | ||
461 | void (*vcpu_decache)(struct kvm_vcpu *vcpu); | ||
462 | |||
463 | int (*set_guest_debug)(struct kvm_vcpu *vcpu, | ||
464 | struct kvm_debug_guest *dbg); | ||
465 | void (*guest_debug_pre)(struct kvm_vcpu *vcpu); | ||
466 | int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata); | ||
467 | int (*set_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 data); | ||
468 | u64 (*get_segment_base)(struct kvm_vcpu *vcpu, int seg); | ||
469 | void (*get_segment)(struct kvm_vcpu *vcpu, | ||
470 | struct kvm_segment *var, int seg); | ||
471 | void (*set_segment)(struct kvm_vcpu *vcpu, | ||
472 | struct kvm_segment *var, int seg); | ||
473 | void (*get_cs_db_l_bits)(struct kvm_vcpu *vcpu, int *db, int *l); | ||
474 | void (*decache_cr4_guest_bits)(struct kvm_vcpu *vcpu); | ||
475 | void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0); | ||
476 | void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3); | ||
477 | void (*set_cr4)(struct kvm_vcpu *vcpu, unsigned long cr4); | ||
478 | void (*set_efer)(struct kvm_vcpu *vcpu, u64 efer); | ||
479 | void (*get_idt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt); | ||
480 | void (*set_idt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt); | ||
481 | void (*get_gdt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt); | ||
482 | void (*set_gdt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt); | ||
483 | unsigned long (*get_dr)(struct kvm_vcpu *vcpu, int dr); | ||
484 | void (*set_dr)(struct kvm_vcpu *vcpu, int dr, unsigned long value, | ||
485 | int *exception); | ||
486 | void (*cache_regs)(struct kvm_vcpu *vcpu); | ||
487 | void (*decache_regs)(struct kvm_vcpu *vcpu); | ||
488 | unsigned long (*get_rflags)(struct kvm_vcpu *vcpu); | ||
489 | void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags); | ||
490 | |||
491 | void (*tlb_flush)(struct kvm_vcpu *vcpu); | ||
492 | void (*inject_page_fault)(struct kvm_vcpu *vcpu, | ||
493 | unsigned long addr, u32 err_code); | ||
494 | |||
495 | void (*inject_gp)(struct kvm_vcpu *vcpu, unsigned err_code); | ||
496 | |||
497 | void (*run)(struct kvm_vcpu *vcpu, struct kvm_run *run); | ||
498 | int (*handle_exit)(struct kvm_run *run, struct kvm_vcpu *vcpu); | ||
499 | void (*skip_emulated_instruction)(struct kvm_vcpu *vcpu); | ||
500 | void (*patch_hypercall)(struct kvm_vcpu *vcpu, | ||
501 | unsigned char *hypercall_addr); | ||
502 | int (*get_irq)(struct kvm_vcpu *vcpu); | ||
503 | void (*set_irq)(struct kvm_vcpu *vcpu, int vec); | ||
504 | void (*inject_pending_irq)(struct kvm_vcpu *vcpu); | ||
505 | void (*inject_pending_vectors)(struct kvm_vcpu *vcpu, | ||
506 | struct kvm_run *run); | ||
507 | }; | ||
508 | |||
509 | extern struct kvm_x86_ops *kvm_x86_ops; | ||
510 | |||
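/*
 * Illustrative sketch (hypothetical, not part of this header): a hardware
 * backend such as svm.c or vmx.c fills in a kvm_x86_ops table and passes it
 * to kvm_init_x86() from its module init. The example_* names and
 * struct example_vcpu are placeholders for the backend's own helpers:
 *
 *	static struct kvm_x86_ops example_x86_ops = {
 *		.cpu_has_kvm_support = example_has_kvm_support,
 *		.hardware_setup = example_hardware_setup,
 *		.vcpu_create = example_vcpu_create,
 *		.vcpu_load = example_vcpu_load,
 *		.run = example_vcpu_run,
 *		(... remaining callbacks ...)
 *	};
 *
 *	static int __init example_module_init(void)
 *	{
 *		return kvm_init_x86(&example_x86_ops,
 *				    sizeof(struct example_vcpu), THIS_MODULE);
 *	}
 */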
511 | /* The guest did something we don't support. */ | ||
512 | #define pr_unimpl(vcpu, fmt, ...) \ | ||
513 | do { \ | ||
514 | if (printk_ratelimit()) \ | ||
515 | printk(KERN_ERR "kvm: %i: cpu%i " fmt, \ | ||
516 | current->tgid, (vcpu)->vcpu_id , ## __VA_ARGS__); \ | ||
517 | } while(0) | ||
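/*
 * Example use (hypothetical, for illustration): an MSR handler with no
 * emulation for a guest-written MSR reports it through the rate-limited
 * macro above instead of flooding the log:
 *
 *	pr_unimpl(vcpu, "unhandled wrmsr: 0x%x\n", msr);
 */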
518 | |||
519 | #define kvm_printf(kvm, fmt ...) printk(KERN_DEBUG fmt) | ||
520 | #define vcpu_printf(vcpu, fmt...) kvm_printf(vcpu->kvm, fmt) | ||
521 | |||
522 | int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id); | ||
523 | void kvm_vcpu_uninit(struct kvm_vcpu *vcpu); | ||
524 | |||
525 | int kvm_init_x86(struct kvm_x86_ops *ops, unsigned int vcpu_size, | ||
526 | struct module *module); | ||
527 | void kvm_exit_x86(void); | ||
528 | |||
529 | int kvm_mmu_module_init(void); | ||
530 | void kvm_mmu_module_exit(void); | ||
531 | |||
532 | void kvm_mmu_destroy(struct kvm_vcpu *vcpu); | ||
533 | int kvm_mmu_create(struct kvm_vcpu *vcpu); | ||
534 | int kvm_mmu_setup(struct kvm_vcpu *vcpu); | ||
535 | |||
536 | int kvm_mmu_reset_context(struct kvm_vcpu *vcpu); | ||
537 | void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot); | ||
538 | void kvm_mmu_zap_all(struct kvm *kvm); | ||
539 | |||
540 | hpa_t gpa_to_hpa(struct kvm_vcpu *vcpu, gpa_t gpa); | ||
541 | #define HPA_MSB ((sizeof(hpa_t) * 8) - 1) | ||
542 | #define HPA_ERR_MASK ((hpa_t)1 << HPA_MSB) | ||
543 | static inline int is_error_hpa(hpa_t hpa) { return hpa >> HPA_MSB; } | ||
544 | hpa_t gva_to_hpa(struct kvm_vcpu *vcpu, gva_t gva); | ||
545 | struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva); | ||
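/*
 * Illustration (hypothetical caller, not part of this header): the
 * translation helpers report failure by setting the high bit of the
 * returned hpa (HPA_ERR_MASK), so a caller checks is_error_hpa() before
 * treating the value as a host physical address:
 *
 *	hpa_t hpa = gpa_to_hpa(vcpu, gpa);
 *	if (is_error_hpa(hpa))
 *		return;		(gpa is not backed by any memory slot)
 *	(otherwise hpa can be used as a host physical address)
 */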
546 | |||
547 | extern hpa_t bad_page_address; | ||
548 | |||
549 | struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn); | ||
550 | struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn); | ||
551 | void mark_page_dirty(struct kvm *kvm, gfn_t gfn); | ||
552 | |||
553 | enum emulation_result { | ||
554 | EMULATE_DONE, /* no further processing */ | ||
555 | EMULATE_DO_MMIO, /* kvm_run filled with mmio request */ | ||
556 | EMULATE_FAIL, /* can't emulate this instruction */ | ||
557 | }; | ||
558 | |||
559 | int emulate_instruction(struct kvm_vcpu *vcpu, struct kvm_run *run, | ||
560 | unsigned long cr2, u16 error_code); | ||
561 | void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context); | ||
562 | void realmode_lgdt(struct kvm_vcpu *vcpu, u16 size, unsigned long address); | ||
563 | void realmode_lidt(struct kvm_vcpu *vcpu, u16 size, unsigned long address); | ||
564 | void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw, | ||
565 | unsigned long *rflags); | ||
566 | |||
567 | unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr); | ||
568 | void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long value, | ||
569 | unsigned long *rflags); | ||
570 | int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *data); | ||
571 | int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data); | ||
572 | |||
573 | struct x86_emulate_ctxt; | ||
574 | |||
575 | int kvm_emulate_pio (struct kvm_vcpu *vcpu, struct kvm_run *run, int in, | ||
576 | int size, unsigned port); | ||
577 | int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, | ||
578 | int size, unsigned long count, int down, | ||
579 | gva_t address, int rep, unsigned port); | ||
580 | void kvm_emulate_cpuid(struct kvm_vcpu *vcpu); | ||
581 | int kvm_emulate_halt(struct kvm_vcpu *vcpu); | ||
582 | int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address); | ||
583 | int emulate_clts(struct kvm_vcpu *vcpu); | ||
584 | int emulator_get_dr(struct x86_emulate_ctxt* ctxt, int dr, | ||
585 | unsigned long *dest); | ||
586 | int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, | ||
587 | unsigned long value); | ||
588 | |||
589 | void set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); | ||
590 | void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3); | ||
591 | void set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); | ||
592 | void set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8); | ||
593 | unsigned long get_cr8(struct kvm_vcpu *vcpu); | ||
594 | void lmsw(struct kvm_vcpu *vcpu, unsigned long msw); | ||
595 | void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l); | ||
596 | |||
597 | int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata); | ||
598 | int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data); | ||
599 | |||
600 | void fx_init(struct kvm_vcpu *vcpu); | ||
601 | |||
602 | void kvm_resched(struct kvm_vcpu *vcpu); | ||
603 | void kvm_load_guest_fpu(struct kvm_vcpu *vcpu); | ||
604 | void kvm_put_guest_fpu(struct kvm_vcpu *vcpu); | ||
605 | void kvm_flush_remote_tlbs(struct kvm *kvm); | ||
606 | |||
607 | int emulator_read_std(unsigned long addr, | ||
608 | void *val, | ||
609 | unsigned int bytes, | ||
610 | struct kvm_vcpu *vcpu); | ||
611 | int emulator_write_emulated(unsigned long addr, | ||
612 | const void *val, | ||
613 | unsigned int bytes, | ||
614 | struct kvm_vcpu *vcpu); | ||
615 | |||
616 | unsigned long segment_base(u16 selector); | ||
617 | |||
618 | void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, | ||
619 | const u8 *new, int bytes); | ||
620 | int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva); | ||
621 | void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu); | ||
622 | int kvm_mmu_load(struct kvm_vcpu *vcpu); | ||
623 | void kvm_mmu_unload(struct kvm_vcpu *vcpu); | ||
624 | |||
625 | int kvm_hypercall(struct kvm_vcpu *vcpu, struct kvm_run *run); | ||
626 | |||
627 | static inline void kvm_guest_enter(void) | ||
628 | { | ||
629 | current->flags |= PF_VCPU; | ||
630 | } | ||
631 | |||
632 | static inline void kvm_guest_exit(void) | ||
633 | { | ||
634 | current->flags &= ~PF_VCPU; | ||
635 | } | ||
636 | |||
637 | static inline int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, | ||
638 | u32 error_code) | ||
639 | { | ||
640 | return vcpu->mmu.page_fault(vcpu, gva, error_code); | ||
641 | } | ||
642 | |||
643 | static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) | ||
644 | { | ||
645 | if (unlikely(vcpu->kvm->n_free_mmu_pages < KVM_MIN_FREE_MMU_PAGES)) | ||
646 | __kvm_mmu_free_some_pages(vcpu); | ||
647 | } | ||
648 | |||
649 | static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu) | ||
650 | { | ||
651 | if (likely(vcpu->mmu.root_hpa != INVALID_PAGE)) | ||
652 | return 0; | ||
653 | |||
654 | return kvm_mmu_load(vcpu); | ||
655 | } | ||
656 | |||
657 | static inline int is_long_mode(struct kvm_vcpu *vcpu) | ||
658 | { | ||
659 | #ifdef CONFIG_X86_64 | ||
660 | return vcpu->shadow_efer & EFER_LME; | ||
661 | #else | ||
662 | return 0; | ||
663 | #endif | ||
664 | } | ||
665 | |||
666 | static inline int is_pae(struct kvm_vcpu *vcpu) | ||
667 | { | ||
668 | return vcpu->cr4 & X86_CR4_PAE; | ||
669 | } | ||
670 | |||
671 | static inline int is_pse(struct kvm_vcpu *vcpu) | ||
672 | { | ||
673 | return vcpu->cr4 & X86_CR4_PSE; | ||
674 | } | ||
675 | |||
676 | static inline int is_paging(struct kvm_vcpu *vcpu) | ||
677 | { | ||
678 | return vcpu->cr0 & X86_CR0_PG; | ||
679 | } | ||
680 | |||
681 | static inline int memslot_id(struct kvm *kvm, struct kvm_memory_slot *slot) | ||
682 | { | ||
683 | return slot - kvm->memslots; | ||
684 | } | ||
685 | |||
686 | static inline struct kvm_mmu_page *page_header(hpa_t shadow_page) | ||
687 | { | ||
688 | struct page *page = pfn_to_page(shadow_page >> PAGE_SHIFT); | ||
689 | |||
690 | return (struct kvm_mmu_page *)page_private(page); | ||
691 | } | ||
692 | |||
693 | static inline u16 read_fs(void) | ||
694 | { | ||
695 | u16 seg; | ||
696 | asm ("mov %%fs, %0" : "=g"(seg)); | ||
697 | return seg; | ||
698 | } | ||
699 | |||
700 | static inline u16 read_gs(void) | ||
701 | { | ||
702 | u16 seg; | ||
703 | asm ("mov %%gs, %0" : "=g"(seg)); | ||
704 | return seg; | ||
705 | } | ||
706 | |||
707 | static inline u16 read_ldt(void) | ||
708 | { | ||
709 | u16 ldt; | ||
710 | asm ("sldt %0" : "=g"(ldt)); | ||
711 | return ldt; | ||
712 | } | ||
713 | |||
714 | static inline void load_fs(u16 sel) | ||
715 | { | ||
716 | asm ("mov %0, %%fs" : : "rm"(sel)); | ||
717 | } | ||
718 | |||
719 | static inline void load_gs(u16 sel) | ||
720 | { | ||
721 | asm ("mov %0, %%gs" : : "rm"(sel)); | ||
722 | } | ||
723 | |||
724 | #ifndef load_ldt | ||
725 | static inline void load_ldt(u16 sel) | ||
726 | { | ||
727 | asm ("lldt %0" : : "rm"(sel)); | ||
728 | } | ||
729 | #endif | ||
730 | |||
731 | static inline void get_idt(struct descriptor_table *table) | ||
732 | { | ||
733 | asm ("sidt %0" : "=m"(*table)); | ||
734 | } | ||
735 | |||
736 | static inline void get_gdt(struct descriptor_table *table) | ||
737 | { | ||
738 | asm ("sgdt %0" : "=m"(*table)); | ||
739 | } | ||
740 | |||
741 | static inline unsigned long read_tr_base(void) | ||
742 | { | ||
743 | u16 tr; | ||
744 | asm ("str %0" : "=g"(tr)); | ||
745 | return segment_base(tr); | ||
746 | } | ||
747 | |||
748 | #ifdef CONFIG_X86_64 | ||
749 | static inline unsigned long read_msr(unsigned long msr) | ||
750 | { | ||
751 | u64 value; | ||
752 | |||
753 | rdmsrl(msr, value); | ||
754 | return value; | ||
755 | } | ||
756 | #endif | ||
757 | |||
758 | static inline void fx_save(struct i387_fxsave_struct *image) | ||
759 | { | ||
760 | asm ("fxsave (%0)":: "r" (image)); | ||
761 | } | ||
762 | |||
763 | static inline void fx_restore(struct i387_fxsave_struct *image) | ||
764 | { | ||
765 | asm ("fxrstor (%0)":: "r" (image)); | ||
766 | } | ||
767 | |||
768 | static inline void fpu_init(void) | ||
769 | { | ||
770 | asm ("finit"); | ||
771 | } | ||
772 | |||
773 | static inline u32 get_rdx_init_val(void) | ||
774 | { | ||
775 | return 0x600; /* P6 family */ | ||
776 | } | ||
777 | |||
778 | #define ASM_VMX_VMCLEAR_RAX ".byte 0x66, 0x0f, 0xc7, 0x30" | ||
779 | #define ASM_VMX_VMLAUNCH ".byte 0x0f, 0x01, 0xc2" | ||
780 | #define ASM_VMX_VMRESUME ".byte 0x0f, 0x01, 0xc3" | ||
781 | #define ASM_VMX_VMPTRLD_RAX ".byte 0x0f, 0xc7, 0x30" | ||
782 | #define ASM_VMX_VMREAD_RDX_RAX ".byte 0x0f, 0x78, 0xd0" | ||
783 | #define ASM_VMX_VMWRITE_RAX_RDX ".byte 0x0f, 0x79, 0xd0" | ||
784 | #define ASM_VMX_VMWRITE_RSP_RDX ".byte 0x0f, 0x79, 0xd4" | ||
785 | #define ASM_VMX_VMXOFF ".byte 0x0f, 0x01, 0xc4" | ||
786 | #define ASM_VMX_VMXON_RAX ".byte 0xf3, 0x0f, 0xc7, 0x30" | ||
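/*
 * The macros above are the raw opcode encodings of the VMX instructions,
 * emitted as byte sequences so they do not depend on assembler support for
 * the VMX mnemonics. A hypothetical use, in the style of the vmx backend,
 * executes VMXON on the region whose physical address is in phys_addr:
 *
 *	asm volatile (ASM_VMX_VMXON_RAX
 *		      : : "a"(&phys_addr), "m"(phys_addr)
 *		      : "memory", "cc");
 */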
787 | |||
788 | #define MSR_IA32_TIME_STAMP_COUNTER 0x010 | ||
789 | |||
790 | #define TSS_IOPB_BASE_OFFSET 0x66 | ||
791 | #define TSS_BASE_SIZE 0x68 | ||
792 | #define TSS_IOPB_SIZE (65536 / 8) | ||
793 | #define TSS_REDIRECTION_SIZE (256 / 8) | ||
794 | #define RMODE_TSS_SIZE (TSS_BASE_SIZE + TSS_REDIRECTION_SIZE + TSS_IOPB_SIZE + 1) | ||
795 | |||
796 | #endif | ||
diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c deleted file mode 100644 index c0f372f1d761..000000000000 --- a/drivers/kvm/kvm_main.c +++ /dev/null | |||
@@ -1,3628 +0,0 @@ | |||
1 | /* | ||
2 | * Kernel-based Virtual Machine driver for Linux | ||
3 | * | ||
4 | * This module enables machines with Intel VT-x extensions to run virtual | ||
5 | * machines without emulation or binary translation. | ||
6 | * | ||
7 | * Copyright (C) 2006 Qumranet, Inc. | ||
8 | * | ||
9 | * Authors: | ||
10 | * Avi Kivity <avi@qumranet.com> | ||
11 | * Yaniv Kamay <yaniv@qumranet.com> | ||
12 | * | ||
13 | * This work is licensed under the terms of the GNU GPL, version 2. See | ||
14 | * the COPYING file in the top-level directory. | ||
15 | * | ||
16 | */ | ||
17 | |||
18 | #include "kvm.h" | ||
19 | #include "x86_emulate.h" | ||
20 | #include "segment_descriptor.h" | ||
21 | #include "irq.h" | ||
22 | |||
23 | #include <linux/kvm.h> | ||
24 | #include <linux/module.h> | ||
25 | #include <linux/errno.h> | ||
26 | #include <linux/percpu.h> | ||
27 | #include <linux/gfp.h> | ||
28 | #include <linux/mm.h> | ||
29 | #include <linux/miscdevice.h> | ||
30 | #include <linux/vmalloc.h> | ||
31 | #include <linux/reboot.h> | ||
32 | #include <linux/debugfs.h> | ||
33 | #include <linux/highmem.h> | ||
34 | #include <linux/file.h> | ||
35 | #include <linux/sysdev.h> | ||
36 | #include <linux/cpu.h> | ||
37 | #include <linux/sched.h> | ||
38 | #include <linux/cpumask.h> | ||
39 | #include <linux/smp.h> | ||
40 | #include <linux/anon_inodes.h> | ||
41 | #include <linux/profile.h> | ||
42 | |||
43 | #include <asm/processor.h> | ||
44 | #include <asm/msr.h> | ||
45 | #include <asm/io.h> | ||
46 | #include <asm/uaccess.h> | ||
47 | #include <asm/desc.h> | ||
48 | |||
49 | MODULE_AUTHOR("Qumranet"); | ||
50 | MODULE_LICENSE("GPL"); | ||
51 | |||
52 | static DEFINE_SPINLOCK(kvm_lock); | ||
53 | static LIST_HEAD(vm_list); | ||
54 | |||
55 | static cpumask_t cpus_hardware_enabled; | ||
56 | |||
57 | struct kvm_x86_ops *kvm_x86_ops; | ||
58 | struct kmem_cache *kvm_vcpu_cache; | ||
59 | EXPORT_SYMBOL_GPL(kvm_vcpu_cache); | ||
60 | |||
61 | static __read_mostly struct preempt_ops kvm_preempt_ops; | ||
62 | |||
63 | #define STAT_OFFSET(x) offsetof(struct kvm_vcpu, stat.x) | ||
64 | |||
65 | static struct kvm_stats_debugfs_item { | ||
66 | const char *name; | ||
67 | int offset; | ||
68 | struct dentry *dentry; | ||
69 | } debugfs_entries[] = { | ||
70 | { "pf_fixed", STAT_OFFSET(pf_fixed) }, | ||
71 | { "pf_guest", STAT_OFFSET(pf_guest) }, | ||
72 | { "tlb_flush", STAT_OFFSET(tlb_flush) }, | ||
73 | { "invlpg", STAT_OFFSET(invlpg) }, | ||
74 | { "exits", STAT_OFFSET(exits) }, | ||
75 | { "io_exits", STAT_OFFSET(io_exits) }, | ||
76 | { "mmio_exits", STAT_OFFSET(mmio_exits) }, | ||
77 | { "signal_exits", STAT_OFFSET(signal_exits) }, | ||
78 | { "irq_window", STAT_OFFSET(irq_window_exits) }, | ||
79 | { "halt_exits", STAT_OFFSET(halt_exits) }, | ||
80 | { "halt_wakeup", STAT_OFFSET(halt_wakeup) }, | ||
81 | { "request_irq", STAT_OFFSET(request_irq_exits) }, | ||
82 | { "irq_exits", STAT_OFFSET(irq_exits) }, | ||
83 | { "light_exits", STAT_OFFSET(light_exits) }, | ||
84 | { "efer_reload", STAT_OFFSET(efer_reload) }, | ||
85 | { NULL } | ||
86 | }; | ||
87 | |||
88 | static struct dentry *debugfs_dir; | ||
89 | |||
90 | #define MAX_IO_MSRS 256 | ||
91 | |||
92 | #define CR0_RESERVED_BITS \ | ||
93 | (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \ | ||
94 | | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \ | ||
95 | | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG)) | ||
96 | #define CR4_RESERVED_BITS \ | ||
97 | (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\ | ||
98 | | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \ | ||
99 | | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR \ | ||
100 | | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE)) | ||
101 | |||
102 | #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR) | ||
103 | #define EFER_RESERVED_BITS 0xfffffffffffff2fe | ||
104 | |||
105 | #ifdef CONFIG_X86_64 | ||
106 | /* LDT or TSS descriptor in the GDT. 16 bytes. */ | ||
107 | struct segment_descriptor_64 { | ||
108 | struct segment_descriptor s; | ||
109 | u32 base_higher; | ||
110 | u32 pad_zero; | ||
111 | }; | ||
112 | |||
113 | #endif | ||
114 | |||
115 | static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl, | ||
116 | unsigned long arg); | ||
117 | |||
118 | unsigned long segment_base(u16 selector) | ||
119 | { | ||
120 | struct descriptor_table gdt; | ||
121 | struct segment_descriptor *d; | ||
122 | unsigned long table_base; | ||
123 | typedef unsigned long ul; | ||
124 | unsigned long v; | ||
125 | |||
126 | if (selector == 0) | ||
127 | return 0; | ||
128 | |||
129 | asm ("sgdt %0" : "=m"(gdt)); | ||
130 | table_base = gdt.base; | ||
131 | |||
132 | if (selector & 4) { /* from ldt */ | ||
133 | u16 ldt_selector; | ||
134 | |||
135 | asm ("sldt %0" : "=g"(ldt_selector)); | ||
136 | table_base = segment_base(ldt_selector); | ||
137 | } | ||
138 | d = (struct segment_descriptor *)(table_base + (selector & ~7)); | ||
139 | v = d->base_low | ((ul)d->base_mid << 16) | ((ul)d->base_high << 24); | ||
140 | #ifdef CONFIG_X86_64 | ||
141 | if (d->system == 0 | ||
142 | && (d->type == 2 || d->type == 9 || d->type == 11)) | ||
143 | v |= ((ul)((struct segment_descriptor_64 *)d)->base_higher) << 32; | ||
144 | #endif | ||
145 | return v; | ||
146 | } | ||
147 | EXPORT_SYMBOL_GPL(segment_base); | ||
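/*
 * Reading segment_base() above: bit 2 of the selector (the TI bit) selects
 * the LDT instead of the GDT, and masking with ~7 strips the RPL/TI bits to
 * get the descriptor's byte offset within that table. The 32-bit base is
 * reassembled from base_low, base_mid << 16 and base_high << 24; on x86_64,
 * system descriptors (LDT and TSS types) are 16 bytes wide, so the upper 32
 * base bits come from the following 8 bytes (base_higher).
 */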
148 | |||
149 | static inline int valid_vcpu(int n) | ||
150 | { | ||
151 | return likely(n >= 0 && n < KVM_MAX_VCPUS); | ||
152 | } | ||
153 | |||
154 | void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) | ||
155 | { | ||
156 | if (!vcpu->fpu_active || vcpu->guest_fpu_loaded) | ||
157 | return; | ||
158 | |||
159 | vcpu->guest_fpu_loaded = 1; | ||
160 | fx_save(&vcpu->host_fx_image); | ||
161 | fx_restore(&vcpu->guest_fx_image); | ||
162 | } | ||
163 | EXPORT_SYMBOL_GPL(kvm_load_guest_fpu); | ||
164 | |||
165 | void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) | ||
166 | { | ||
167 | if (!vcpu->guest_fpu_loaded) | ||
168 | return; | ||
169 | |||
170 | vcpu->guest_fpu_loaded = 0; | ||
171 | fx_save(&vcpu->guest_fx_image); | ||
172 | fx_restore(&vcpu->host_fx_image); | ||
173 | } | ||
174 | EXPORT_SYMBOL_GPL(kvm_put_guest_fpu); | ||
175 | |||
176 | /* | ||
177 | * Switches to the specified vcpu, until a matching vcpu_put(). | ||
178 | */ | ||
179 | static void vcpu_load(struct kvm_vcpu *vcpu) | ||
180 | { | ||
181 | int cpu; | ||
182 | |||
183 | mutex_lock(&vcpu->mutex); | ||
184 | cpu = get_cpu(); | ||
185 | preempt_notifier_register(&vcpu->preempt_notifier); | ||
186 | kvm_x86_ops->vcpu_load(vcpu, cpu); | ||
187 | put_cpu(); | ||
188 | } | ||
189 | |||
190 | static void vcpu_put(struct kvm_vcpu *vcpu) | ||
191 | { | ||
192 | preempt_disable(); | ||
193 | kvm_x86_ops->vcpu_put(vcpu); | ||
194 | preempt_notifier_unregister(&vcpu->preempt_notifier); | ||
195 | preempt_enable(); | ||
196 | mutex_unlock(&vcpu->mutex); | ||
197 | } | ||
198 | |||
199 | static void ack_flush(void *_completed) | ||
200 | { | ||
201 | } | ||
202 | |||
203 | void kvm_flush_remote_tlbs(struct kvm *kvm) | ||
204 | { | ||
205 | int i, cpu; | ||
206 | cpumask_t cpus; | ||
207 | struct kvm_vcpu *vcpu; | ||
208 | |||
209 | cpus_clear(cpus); | ||
210 | for (i = 0; i < KVM_MAX_VCPUS; ++i) { | ||
211 | vcpu = kvm->vcpus[i]; | ||
212 | if (!vcpu) | ||
213 | continue; | ||
214 | if (test_and_set_bit(KVM_TLB_FLUSH, &vcpu->requests)) | ||
215 | continue; | ||
216 | cpu = vcpu->cpu; | ||
217 | if (cpu != -1 && cpu != raw_smp_processor_id()) | ||
218 | cpu_set(cpu, cpus); | ||
219 | } | ||
220 | smp_call_function_mask(cpus, ack_flush, NULL, 1); | ||
221 | } | ||
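/*
 * Note on the function above: ack_flush() is intentionally empty. Setting
 * KVM_TLB_FLUSH in vcpu->requests is what marks the flush as pending; the
 * IPI issued by smp_call_function_mask() merely kicks the target cpus out
 * of guest mode, and each vcpu performs the actual flush when it notices
 * the request bit before re-entering the guest. vcpus whose request bit
 * was already set, and vcpus not currently loaded on a cpu (cpu == -1) or
 * loaded on the calling cpu, are left out of the IPI mask.
 */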
222 | |||
223 | int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) | ||
224 | { | ||
225 | struct page *page; | ||
226 | int r; | ||
227 | |||
228 | mutex_init(&vcpu->mutex); | ||
229 | vcpu->cpu = -1; | ||
230 | vcpu->mmu.root_hpa = INVALID_PAGE; | ||
231 | vcpu->kvm = kvm; | ||
232 | vcpu->vcpu_id = id; | ||
233 | if (!irqchip_in_kernel(kvm) || id == 0) | ||
234 | vcpu->mp_state = VCPU_MP_STATE_RUNNABLE; | ||
235 | else | ||
236 | vcpu->mp_state = VCPU_MP_STATE_UNINITIALIZED; | ||
237 | init_waitqueue_head(&vcpu->wq); | ||
238 | |||
239 | page = alloc_page(GFP_KERNEL | __GFP_ZERO); | ||
240 | if (!page) { | ||
241 | r = -ENOMEM; | ||
242 | goto fail; | ||
243 | } | ||
244 | vcpu->run = page_address(page); | ||
245 | |||
246 | page = alloc_page(GFP_KERNEL | __GFP_ZERO); | ||
247 | if (!page) { | ||
248 | r = -ENOMEM; | ||
249 | goto fail_free_run; | ||
250 | } | ||
251 | vcpu->pio_data = page_address(page); | ||
252 | |||
253 | r = kvm_mmu_create(vcpu); | ||
254 | if (r < 0) | ||
255 | goto fail_free_pio_data; | ||
256 | |||
257 | return 0; | ||
258 | |||
259 | fail_free_pio_data: | ||
260 | free_page((unsigned long)vcpu->pio_data); | ||
261 | fail_free_run: | ||
262 | free_page((unsigned long)vcpu->run); | ||
263 | fail: | ||
264 | return -ENOMEM; | ||
265 | } | ||
266 | EXPORT_SYMBOL_GPL(kvm_vcpu_init); | ||
267 | |||
268 | void kvm_vcpu_uninit(struct kvm_vcpu *vcpu) | ||
269 | { | ||
270 | kvm_mmu_destroy(vcpu); | ||
271 | if (vcpu->apic) | ||
272 | hrtimer_cancel(&vcpu->apic->timer.dev); | ||
273 | kvm_free_apic(vcpu->apic); | ||
274 | free_page((unsigned long)vcpu->pio_data); | ||
275 | free_page((unsigned long)vcpu->run); | ||
276 | } | ||
277 | EXPORT_SYMBOL_GPL(kvm_vcpu_uninit); | ||
278 | |||
279 | static struct kvm *kvm_create_vm(void) | ||
280 | { | ||
281 | struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL); | ||
282 | |||
283 | if (!kvm) | ||
284 | return ERR_PTR(-ENOMEM); | ||
285 | |||
286 | kvm_io_bus_init(&kvm->pio_bus); | ||
287 | mutex_init(&kvm->lock); | ||
288 | INIT_LIST_HEAD(&kvm->active_mmu_pages); | ||
289 | kvm_io_bus_init(&kvm->mmio_bus); | ||
290 | spin_lock(&kvm_lock); | ||
291 | list_add(&kvm->vm_list, &vm_list); | ||
292 | spin_unlock(&kvm_lock); | ||
293 | return kvm; | ||
294 | } | ||
295 | |||
296 | /* | ||
297 | * Free any memory in @free but not in @dont. | ||
298 | */ | ||
299 | static void kvm_free_physmem_slot(struct kvm_memory_slot *free, | ||
300 | struct kvm_memory_slot *dont) | ||
301 | { | ||
302 | int i; | ||
303 | |||
304 | if (!dont || free->phys_mem != dont->phys_mem) | ||
305 | if (free->phys_mem) { | ||
306 | for (i = 0; i < free->npages; ++i) | ||
307 | if (free->phys_mem[i]) | ||
308 | __free_page(free->phys_mem[i]); | ||
309 | vfree(free->phys_mem); | ||
310 | } | ||
311 | |||
312 | if (!dont || free->dirty_bitmap != dont->dirty_bitmap) | ||
313 | vfree(free->dirty_bitmap); | ||
314 | |||
315 | free->phys_mem = NULL; | ||
316 | free->npages = 0; | ||
317 | free->dirty_bitmap = NULL; | ||
318 | } | ||
319 | |||
320 | static void kvm_free_physmem(struct kvm *kvm) | ||
321 | { | ||
322 | int i; | ||
323 | |||
324 | for (i = 0; i < kvm->nmemslots; ++i) | ||
325 | kvm_free_physmem_slot(&kvm->memslots[i], NULL); | ||
326 | } | ||
327 | |||
328 | static void free_pio_guest_pages(struct kvm_vcpu *vcpu) | ||
329 | { | ||
330 | int i; | ||
331 | |||
332 | for (i = 0; i < ARRAY_SIZE(vcpu->pio.guest_pages); ++i) | ||
333 | if (vcpu->pio.guest_pages[i]) { | ||
334 | __free_page(vcpu->pio.guest_pages[i]); | ||
335 | vcpu->pio.guest_pages[i] = NULL; | ||
336 | } | ||
337 | } | ||
338 | |||
339 | static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu) | ||
340 | { | ||
341 | vcpu_load(vcpu); | ||
342 | kvm_mmu_unload(vcpu); | ||
343 | vcpu_put(vcpu); | ||
344 | } | ||
345 | |||
346 | static void kvm_free_vcpus(struct kvm *kvm) | ||
347 | { | ||
348 | unsigned int i; | ||
349 | |||
350 | /* | ||
351 | * Unpin any mmu pages first. | ||
352 | */ | ||
353 | for (i = 0; i < KVM_MAX_VCPUS; ++i) | ||
354 | if (kvm->vcpus[i]) | ||
355 | kvm_unload_vcpu_mmu(kvm->vcpus[i]); | ||
356 | for (i = 0; i < KVM_MAX_VCPUS; ++i) { | ||
357 | if (kvm->vcpus[i]) { | ||
358 | kvm_x86_ops->vcpu_free(kvm->vcpus[i]); | ||
359 | kvm->vcpus[i] = NULL; | ||
360 | } | ||
361 | } | ||
362 | |||
363 | } | ||
364 | |||
365 | static void kvm_destroy_vm(struct kvm *kvm) | ||
366 | { | ||
367 | spin_lock(&kvm_lock); | ||
368 | list_del(&kvm->vm_list); | ||
369 | spin_unlock(&kvm_lock); | ||
370 | kvm_io_bus_destroy(&kvm->pio_bus); | ||
371 | kvm_io_bus_destroy(&kvm->mmio_bus); | ||
372 | kfree(kvm->vpic); | ||
373 | kfree(kvm->vioapic); | ||
374 | kvm_free_vcpus(kvm); | ||
375 | kvm_free_physmem(kvm); | ||
376 | kfree(kvm); | ||
377 | } | ||
378 | |||
379 | static int kvm_vm_release(struct inode *inode, struct file *filp) | ||
380 | { | ||
381 | struct kvm *kvm = filp->private_data; | ||
382 | |||
383 | kvm_destroy_vm(kvm); | ||
384 | return 0; | ||
385 | } | ||
386 | |||
387 | static void inject_gp(struct kvm_vcpu *vcpu) | ||
388 | { | ||
389 | kvm_x86_ops->inject_gp(vcpu, 0); | ||
390 | } | ||
391 | |||
392 | /* | ||
393 | * Load the pae pdptrs. Return true if they are all valid. | ||
394 | */ | ||
395 | static int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3) | ||
396 | { | ||
397 | gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT; | ||
398 | unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2; | ||
399 | int i; | ||
400 | u64 *pdpt; | ||
401 | int ret; | ||
402 | struct page *page; | ||
403 | u64 pdpte[ARRAY_SIZE(vcpu->pdptrs)]; | ||
404 | |||
405 | mutex_lock(&vcpu->kvm->lock); | ||
406 | page = gfn_to_page(vcpu->kvm, pdpt_gfn); | ||
407 | if (!page) { | ||
408 | ret = 0; | ||
409 | goto out; | ||
410 | } | ||
411 | |||
412 | pdpt = kmap_atomic(page, KM_USER0); | ||
413 | memcpy(pdpte, pdpt+offset, sizeof(pdpte)); | ||
414 | kunmap_atomic(pdpt, KM_USER0); | ||
415 | |||
416 | for (i = 0; i < ARRAY_SIZE(pdpte); ++i) { | ||
417 | if ((pdpte[i] & 1) && (pdpte[i] & 0xfffffff0000001e6ull)) { | ||
418 | ret = 0; | ||
419 | goto out; | ||
420 | } | ||
421 | } | ||
422 | ret = 1; | ||
423 | |||
424 | memcpy(vcpu->pdptrs, pdpte, sizeof(vcpu->pdptrs)); | ||
425 | out: | ||
426 | mutex_unlock(&vcpu->kvm->lock); | ||
427 | |||
428 | return ret; | ||
429 | } | ||
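/*
 * Worked example for the offset computation above: in PAE mode cr3 points
 * at a 32-byte-aligned table of four 8-byte pdptes, so
 * ((cr3 & (PAGE_SIZE-1)) >> 5) << 2 converts the table's byte offset within
 * its page into an index in u64 units; e.g. a cr3 page offset of 0xe0 gives
 * (0xe0 >> 5) << 2 = 28 u64 entries, i.e. byte offset 0xe0 again. A pdpte
 * is rejected if it is present (bit 0 set) but has any bit from the
 * reserved-bit mask 0xfffffff0000001e6 set.
 */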
430 | |||
431 | void set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) | ||
432 | { | ||
433 | if (cr0 & CR0_RESERVED_BITS) { | ||
434 | printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n", | ||
435 | cr0, vcpu->cr0); | ||
436 | inject_gp(vcpu); | ||
437 | return; | ||
438 | } | ||
439 | |||
440 | if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) { | ||
441 | printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n"); | ||
442 | inject_gp(vcpu); | ||
443 | return; | ||
444 | } | ||
445 | |||
446 | if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) { | ||
447 | printk(KERN_DEBUG "set_cr0: #GP, set PG flag " | ||
448 | "and a clear PE flag\n"); | ||
449 | inject_gp(vcpu); | ||
450 | return; | ||
451 | } | ||
452 | |||
453 | if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) { | ||
454 | #ifdef CONFIG_X86_64 | ||
455 | if ((vcpu->shadow_efer & EFER_LME)) { | ||
456 | int cs_db, cs_l; | ||
457 | |||
458 | if (!is_pae(vcpu)) { | ||
459 | printk(KERN_DEBUG "set_cr0: #GP, start paging " | ||
460 | "in long mode while PAE is disabled\n"); | ||
461 | inject_gp(vcpu); | ||
462 | return; | ||
463 | } | ||
464 | kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); | ||
465 | if (cs_l) { | ||
466 | printk(KERN_DEBUG "set_cr0: #GP, start paging " | ||
467 | "in long mode while CS.L == 1\n"); | ||
468 | inject_gp(vcpu); | ||
469 | return; | ||
470 | |||
471 | } | ||
472 | } else | ||
473 | #endif | ||
474 | if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->cr3)) { | ||
475 | printk(KERN_DEBUG "set_cr0: #GP, pdptrs " | ||
476 | "reserved bits\n"); | ||
477 | inject_gp(vcpu); | ||
478 | return; | ||
479 | } | ||
480 | |||
481 | } | ||
482 | |||
483 | kvm_x86_ops->set_cr0(vcpu, cr0); | ||
484 | vcpu->cr0 = cr0; | ||
485 | |||
486 | mutex_lock(&vcpu->kvm->lock); | ||
487 | kvm_mmu_reset_context(vcpu); | ||
488 | mutex_unlock(&vcpu->kvm->lock); | ||
489 | return; | ||
490 | } | ||
491 | EXPORT_SYMBOL_GPL(set_cr0); | ||
492 | |||
493 | void lmsw(struct kvm_vcpu *vcpu, unsigned long msw) | ||
494 | { | ||
495 | set_cr0(vcpu, (vcpu->cr0 & ~0x0ful) | (msw & 0x0f)); | ||
496 | } | ||
497 | EXPORT_SYMBOL_GPL(lmsw); | ||
498 | |||
499 | void set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) | ||
500 | { | ||
501 | if (cr4 & CR4_RESERVED_BITS) { | ||
502 | printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n"); | ||
503 | inject_gp(vcpu); | ||
504 | return; | ||
505 | } | ||
506 | |||
507 | if (is_long_mode(vcpu)) { | ||
508 | if (!(cr4 & X86_CR4_PAE)) { | ||
509 | printk(KERN_DEBUG "set_cr4: #GP, clearing PAE while " | ||
510 | "in long mode\n"); | ||
511 | inject_gp(vcpu); | ||
512 | return; | ||
513 | } | ||
514 | } else if (is_paging(vcpu) && !is_pae(vcpu) && (cr4 & X86_CR4_PAE) | ||
515 | && !load_pdptrs(vcpu, vcpu->cr3)) { | ||
516 | printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n"); | ||
517 | inject_gp(vcpu); | ||
518 | return; | ||
519 | } | ||
520 | |||
521 | if (cr4 & X86_CR4_VMXE) { | ||
522 | printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n"); | ||
523 | inject_gp(vcpu); | ||
524 | return; | ||
525 | } | ||
526 | kvm_x86_ops->set_cr4(vcpu, cr4); | ||
527 | vcpu->cr4 = cr4; | ||
528 | mutex_lock(&vcpu->kvm->lock); | ||
529 | kvm_mmu_reset_context(vcpu); | ||
530 | mutex_unlock(&vcpu->kvm->lock); | ||
531 | } | ||
532 | EXPORT_SYMBOL_GPL(set_cr4); | ||
533 | |||
534 | void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) | ||
535 | { | ||
536 | if (is_long_mode(vcpu)) { | ||
537 | if (cr3 & CR3_L_MODE_RESERVED_BITS) { | ||
538 | printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n"); | ||
539 | inject_gp(vcpu); | ||
540 | return; | ||
541 | } | ||
542 | } else { | ||
543 | if (is_pae(vcpu)) { | ||
544 | if (cr3 & CR3_PAE_RESERVED_BITS) { | ||
545 | printk(KERN_DEBUG | ||
546 | "set_cr3: #GP, reserved bits\n"); | ||
547 | inject_gp(vcpu); | ||
548 | return; | ||
549 | } | ||
550 | if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) { | ||
551 | printk(KERN_DEBUG "set_cr3: #GP, pdptrs " | ||
552 | "reserved bits\n"); | ||
553 | inject_gp(vcpu); | ||
554 | return; | ||
555 | } | ||
556 | } else { | ||
557 | if (cr3 & CR3_NONPAE_RESERVED_BITS) { | ||
558 | printk(KERN_DEBUG | ||
559 | "set_cr3: #GP, reserved bits\n"); | ||
560 | inject_gp(vcpu); | ||
561 | return; | ||
562 | } | ||
563 | } | ||
564 | } | ||
565 | |||
566 | mutex_lock(&vcpu->kvm->lock); | ||
567 | /* | ||
568 | * Does the new cr3 value map to physical memory? (Note, we | ||
569 | * catch an invalid cr3 even in real-mode, because it would | ||
570 | * cause trouble later on when we turn on paging anyway.) | ||
571 | * | ||
572 | * A real CPU would silently accept an invalid cr3 and would | ||
573 | * attempt to use it - with largely undefined (and often hard | ||
574 | * to debug) behavior on the guest side. | ||
575 | */ | ||
576 | if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT))) | ||
577 | inject_gp(vcpu); | ||
578 | else { | ||
579 | vcpu->cr3 = cr3; | ||
580 | vcpu->mmu.new_cr3(vcpu); | ||
581 | } | ||
582 | mutex_unlock(&vcpu->kvm->lock); | ||
583 | } | ||
584 | EXPORT_SYMBOL_GPL(set_cr3); | ||
585 | |||
586 | void set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) | ||
587 | { | ||
588 | if (cr8 & CR8_RESERVED_BITS) { | ||
589 | printk(KERN_DEBUG "set_cr8: #GP, reserved bits 0x%lx\n", cr8); | ||
590 | inject_gp(vcpu); | ||
591 | return; | ||
592 | } | ||
593 | if (irqchip_in_kernel(vcpu->kvm)) | ||
594 | kvm_lapic_set_tpr(vcpu, cr8); | ||
595 | else | ||
596 | vcpu->cr8 = cr8; | ||
597 | } | ||
598 | EXPORT_SYMBOL_GPL(set_cr8); | ||
599 | |||
600 | unsigned long get_cr8(struct kvm_vcpu *vcpu) | ||
601 | { | ||
602 | if (irqchip_in_kernel(vcpu->kvm)) | ||
603 | return kvm_lapic_get_cr8(vcpu); | ||
604 | else | ||
605 | return vcpu->cr8; | ||
606 | } | ||
607 | EXPORT_SYMBOL_GPL(get_cr8); | ||
608 | |||
609 | u64 kvm_get_apic_base(struct kvm_vcpu *vcpu) | ||
610 | { | ||
611 | if (irqchip_in_kernel(vcpu->kvm)) | ||
612 | return vcpu->apic_base; | ||
613 | else | ||
614 | return vcpu->apic_base; | ||
615 | } | ||
616 | EXPORT_SYMBOL_GPL(kvm_get_apic_base); | ||
617 | |||
618 | void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data) | ||
619 | { | ||
620 | /* TODO: reserve bits check */ | ||
621 | if (irqchip_in_kernel(vcpu->kvm)) | ||
622 | kvm_lapic_set_base(vcpu, data); | ||
623 | else | ||
624 | vcpu->apic_base = data; | ||
625 | } | ||
626 | EXPORT_SYMBOL_GPL(kvm_set_apic_base); | ||
627 | |||
628 | void fx_init(struct kvm_vcpu *vcpu) | ||
629 | { | ||
630 | unsigned after_mxcsr_mask; | ||
631 | |||
632 | /* Initialize guest FPU by resetting ours and saving into guest's */ | ||
633 | preempt_disable(); | ||
634 | fx_save(&vcpu->host_fx_image); | ||
635 | fpu_init(); | ||
636 | fx_save(&vcpu->guest_fx_image); | ||
637 | fx_restore(&vcpu->host_fx_image); | ||
638 | preempt_enable(); | ||
639 | |||
640 | vcpu->cr0 |= X86_CR0_ET; | ||
641 | after_mxcsr_mask = offsetof(struct i387_fxsave_struct, st_space); | ||
642 | vcpu->guest_fx_image.mxcsr = 0x1f80; | ||
643 | memset((void *)&vcpu->guest_fx_image + after_mxcsr_mask, | ||
644 | 0, sizeof(struct i387_fxsave_struct) - after_mxcsr_mask); | ||
645 | } | ||
646 | EXPORT_SYMBOL_GPL(fx_init); | ||
647 | |||
648 | /* | ||
649 | * Allocate some memory and give it an address in the guest physical address | ||
650 | * space. | ||
651 | * | ||
652 | * Discontiguous memory is allowed, mostly for framebuffers. | ||
653 | */ | ||
654 | static int kvm_vm_ioctl_set_memory_region(struct kvm *kvm, | ||
655 | struct kvm_memory_region *mem) | ||
656 | { | ||
657 | int r; | ||
658 | gfn_t base_gfn; | ||
659 | unsigned long npages; | ||
660 | unsigned long i; | ||
661 | struct kvm_memory_slot *memslot; | ||
662 | struct kvm_memory_slot old, new; | ||
663 | |||
664 | r = -EINVAL; | ||
665 | /* General sanity checks */ | ||
666 | if (mem->memory_size & (PAGE_SIZE - 1)) | ||
667 | goto out; | ||
668 | if (mem->guest_phys_addr & (PAGE_SIZE - 1)) | ||
669 | goto out; | ||
670 | if (mem->slot >= KVM_MEMORY_SLOTS) | ||
671 | goto out; | ||
672 | if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr) | ||
673 | goto out; | ||
674 | |||
675 | memslot = &kvm->memslots[mem->slot]; | ||
676 | base_gfn = mem->guest_phys_addr >> PAGE_SHIFT; | ||
677 | npages = mem->memory_size >> PAGE_SHIFT; | ||
678 | |||
679 | if (!npages) | ||
680 | mem->flags &= ~KVM_MEM_LOG_DIRTY_PAGES; | ||
681 | |||
682 | mutex_lock(&kvm->lock); | ||
683 | |||
684 | new = old = *memslot; | ||
685 | |||
686 | new.base_gfn = base_gfn; | ||
687 | new.npages = npages; | ||
688 | new.flags = mem->flags; | ||
689 | |||
690 | /* Disallow changing a memory slot's size. */ | ||
691 | r = -EINVAL; | ||
692 | if (npages && old.npages && npages != old.npages) | ||
693 | goto out_unlock; | ||
694 | |||
695 | /* Check for overlaps */ | ||
696 | r = -EEXIST; | ||
697 | for (i = 0; i < KVM_MEMORY_SLOTS; ++i) { | ||
698 | struct kvm_memory_slot *s = &kvm->memslots[i]; | ||
699 | |||
700 | if (s == memslot) | ||
701 | continue; | ||
702 | if (!((base_gfn + npages <= s->base_gfn) || | ||
703 | (base_gfn >= s->base_gfn + s->npages))) | ||
704 | goto out_unlock; | ||
705 | } | ||
706 | |||
707 | /* Deallocate if slot is being removed */ | ||
708 | if (!npages) | ||
709 | new.phys_mem = NULL; | ||
710 | |||
711 | /* Free page dirty bitmap if unneeded */ | ||
712 | if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES)) | ||
713 | new.dirty_bitmap = NULL; | ||
714 | |||
715 | r = -ENOMEM; | ||
716 | |||
717 | /* Allocate if a slot is being created */ | ||
718 | if (npages && !new.phys_mem) { | ||
719 | new.phys_mem = vmalloc(npages * sizeof(struct page *)); | ||
720 | |||
721 | if (!new.phys_mem) | ||
722 | goto out_unlock; | ||
723 | |||
724 | memset(new.phys_mem, 0, npages * sizeof(struct page *)); | ||
725 | for (i = 0; i < npages; ++i) { | ||
726 | new.phys_mem[i] = alloc_page(GFP_HIGHUSER | ||
727 | | __GFP_ZERO); | ||
728 | if (!new.phys_mem[i]) | ||
729 | goto out_unlock; | ||
730 | set_page_private(new.phys_mem[i], 0); | ||
731 | } | ||
732 | } | ||
733 | |||
734 | /* Allocate page dirty bitmap if needed */ | ||
735 | if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) { | ||
736 | unsigned dirty_bytes = ALIGN(npages, BITS_PER_LONG) / 8; | ||
737 | |||
738 | new.dirty_bitmap = vmalloc(dirty_bytes); | ||
739 | if (!new.dirty_bitmap) | ||
740 | goto out_unlock; | ||
741 | memset(new.dirty_bitmap, 0, dirty_bytes); | ||
742 | } | ||
743 | |||
744 | if (mem->slot >= kvm->nmemslots) | ||
745 | kvm->nmemslots = mem->slot + 1; | ||
746 | |||
747 | *memslot = new; | ||
748 | |||
749 | kvm_mmu_slot_remove_write_access(kvm, mem->slot); | ||
750 | kvm_flush_remote_tlbs(kvm); | ||
751 | |||
752 | mutex_unlock(&kvm->lock); | ||
753 | |||
754 | kvm_free_physmem_slot(&old, &new); | ||
755 | return 0; | ||
756 | |||
757 | out_unlock: | ||
758 | mutex_unlock(&kvm->lock); | ||
759 | kvm_free_physmem_slot(&new, &old); | ||
760 | out: | ||
761 | return r; | ||
762 | } | ||
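/*
 * Hypothetical userspace sketch (not part of this file): the handler above
 * backs the legacy KVM_SET_MEMORY_REGION ioctl on the VM fd. Registering
 * 16MB of guest RAM at guest physical address 0 looks roughly like:
 *
 *	struct kvm_memory_region mem = {
 *		.slot = 0,
 *		.flags = 0,
 *		.guest_phys_addr = 0,
 *		.memory_size = 16 << 20,
 *	};
 *	if (ioctl(vm_fd, KVM_SET_MEMORY_REGION, &mem) < 0)
 *		perror("KVM_SET_MEMORY_REGION");
 */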
763 | |||
764 | /* | ||
765 | * Get (and clear) the dirty memory log for a memory slot. | ||
766 | */ | ||
767 | static int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, | ||
768 | struct kvm_dirty_log *log) | ||
769 | { | ||
770 | struct kvm_memory_slot *memslot; | ||
771 | int r, i; | ||
772 | int n; | ||
773 | unsigned long any = 0; | ||
774 | |||
775 | mutex_lock(&kvm->lock); | ||
776 | |||
777 | r = -EINVAL; | ||
778 | if (log->slot >= KVM_MEMORY_SLOTS) | ||
779 | goto out; | ||
780 | |||
781 | memslot = &kvm->memslots[log->slot]; | ||
782 | r = -ENOENT; | ||
783 | if (!memslot->dirty_bitmap) | ||
784 | goto out; | ||
785 | |||
786 | n = ALIGN(memslot->npages, BITS_PER_LONG) / 8; | ||
787 | |||
788 | for (i = 0; !any && i < n/sizeof(long); ++i) | ||
789 | any = memslot->dirty_bitmap[i]; | ||
790 | |||
791 | r = -EFAULT; | ||
792 | if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n)) | ||
793 | goto out; | ||
794 | |||
795 | /* If nothing is dirty, don't bother messing with page tables. */ | ||
796 | if (any) { | ||
797 | kvm_mmu_slot_remove_write_access(kvm, log->slot); | ||
798 | kvm_flush_remote_tlbs(kvm); | ||
799 | memset(memslot->dirty_bitmap, 0, n); | ||
800 | } | ||
801 | |||
802 | r = 0; | ||
803 | |||
804 | out: | ||
805 | mutex_unlock(&kvm->lock); | ||
806 | return r; | ||
807 | } | ||
808 | |||
809 | /* | ||
810 | * Set a new alias region. Aliases map a portion of physical memory into | ||
811 | * another portion. This is useful for memory windows, for example the PC | ||
812 | * VGA region. | ||
813 | */ | ||
814 | static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm, | ||
815 | struct kvm_memory_alias *alias) | ||
816 | { | ||
817 | int r, n; | ||
818 | struct kvm_mem_alias *p; | ||
819 | |||
820 | r = -EINVAL; | ||
821 | /* General sanity checks */ | ||
822 | if (alias->memory_size & (PAGE_SIZE - 1)) | ||
823 | goto out; | ||
824 | if (alias->guest_phys_addr & (PAGE_SIZE - 1)) | ||
825 | goto out; | ||
826 | if (alias->slot >= KVM_ALIAS_SLOTS) | ||
827 | goto out; | ||
828 | if (alias->guest_phys_addr + alias->memory_size | ||
829 | < alias->guest_phys_addr) | ||
830 | goto out; | ||
831 | if (alias->target_phys_addr + alias->memory_size | ||
832 | < alias->target_phys_addr) | ||
833 | goto out; | ||
834 | |||
835 | mutex_lock(&kvm->lock); | ||
836 | |||
837 | p = &kvm->aliases[alias->slot]; | ||
838 | p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT; | ||
839 | p->npages = alias->memory_size >> PAGE_SHIFT; | ||
840 | p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT; | ||
841 | |||
842 | for (n = KVM_ALIAS_SLOTS; n > 0; --n) | ||
843 | if (kvm->aliases[n - 1].npages) | ||
844 | break; | ||
845 | kvm->naliases = n; | ||
846 | |||
847 | kvm_mmu_zap_all(kvm); | ||
848 | |||
849 | mutex_unlock(&kvm->lock); | ||
850 | |||
851 | return 0; | ||
852 | |||
853 | out: | ||
854 | return r; | ||
855 | } | ||
856 | |||
857 | static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) | ||
858 | { | ||
859 | int r; | ||
860 | |||
861 | r = 0; | ||
862 | switch (chip->chip_id) { | ||
863 | case KVM_IRQCHIP_PIC_MASTER: | ||
864 | memcpy (&chip->chip.pic, | ||
865 | &pic_irqchip(kvm)->pics[0], | ||
866 | sizeof(struct kvm_pic_state)); | ||
867 | break; | ||
868 | case KVM_IRQCHIP_PIC_SLAVE: | ||
869 | memcpy (&chip->chip.pic, | ||
870 | &pic_irqchip(kvm)->pics[1], | ||
871 | sizeof(struct kvm_pic_state)); | ||
872 | break; | ||
873 | case KVM_IRQCHIP_IOAPIC: | ||
874 | memcpy (&chip->chip.ioapic, | ||
875 | ioapic_irqchip(kvm), | ||
876 | sizeof(struct kvm_ioapic_state)); | ||
877 | break; | ||
878 | default: | ||
879 | r = -EINVAL; | ||
880 | break; | ||
881 | } | ||
882 | return r; | ||
883 | } | ||
884 | |||
885 | static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) | ||
886 | { | ||
887 | int r; | ||
888 | |||
889 | r = 0; | ||
890 | switch (chip->chip_id) { | ||
891 | case KVM_IRQCHIP_PIC_MASTER: | ||
892 | memcpy (&pic_irqchip(kvm)->pics[0], | ||
893 | &chip->chip.pic, | ||
894 | sizeof(struct kvm_pic_state)); | ||
895 | break; | ||
896 | case KVM_IRQCHIP_PIC_SLAVE: | ||
897 | memcpy (&pic_irqchip(kvm)->pics[1], | ||
898 | &chip->chip.pic, | ||
899 | sizeof(struct kvm_pic_state)); | ||
900 | break; | ||
901 | case KVM_IRQCHIP_IOAPIC: | ||
902 | memcpy (ioapic_irqchip(kvm), | ||
903 | &chip->chip.ioapic, | ||
904 | sizeof(struct kvm_ioapic_state)); | ||
905 | break; | ||
906 | default: | ||
907 | r = -EINVAL; | ||
908 | break; | ||
909 | } | ||
910 | kvm_pic_update_irq(pic_irqchip(kvm)); | ||
911 | return r; | ||
912 | } | ||
913 | |||
914 | static gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn) | ||
915 | { | ||
916 | int i; | ||
917 | struct kvm_mem_alias *alias; | ||
918 | |||
919 | for (i = 0; i < kvm->naliases; ++i) { | ||
920 | alias = &kvm->aliases[i]; | ||
921 | if (gfn >= alias->base_gfn | ||
922 | && gfn < alias->base_gfn + alias->npages) | ||
923 | return alias->target_gfn + gfn - alias->base_gfn; | ||
924 | } | ||
925 | return gfn; | ||
926 | } | ||
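/*
 * Worked example (hypothetical numbers): with one alias slot mapping
 * npages = 32 starting at base_gfn 0xa0 (the PC VGA window at 0xa0000)
 * onto target_gfn 0x1a0, unalias_gfn(kvm, 0xa5) falls inside the alias and
 * returns 0x1a0 + (0xa5 - 0xa0) = 0x1a5, while unalias_gfn(kvm, 0xc0) lies
 * outside every alias and is returned unchanged.
 */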
927 | |||
928 | static struct kvm_memory_slot *__gfn_to_memslot(struct kvm *kvm, gfn_t gfn) | ||
929 | { | ||
930 | int i; | ||
931 | |||
932 | for (i = 0; i < kvm->nmemslots; ++i) { | ||
933 | struct kvm_memory_slot *memslot = &kvm->memslots[i]; | ||
934 | |||
935 | if (gfn >= memslot->base_gfn | ||
936 | && gfn < memslot->base_gfn + memslot->npages) | ||
937 | return memslot; | ||
938 | } | ||
939 | return NULL; | ||
940 | } | ||
941 | |||
942 | struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn) | ||
943 | { | ||
944 | gfn = unalias_gfn(kvm, gfn); | ||
945 | return __gfn_to_memslot(kvm, gfn); | ||
946 | } | ||
947 | |||
948 | struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn) | ||
949 | { | ||
950 | struct kvm_memory_slot *slot; | ||
951 | |||
952 | gfn = unalias_gfn(kvm, gfn); | ||
953 | slot = __gfn_to_memslot(kvm, gfn); | ||
954 | if (!slot) | ||
955 | return NULL; | ||
956 | return slot->phys_mem[gfn - slot->base_gfn]; | ||
957 | } | ||
958 | EXPORT_SYMBOL_GPL(gfn_to_page); | ||
959 | |||
960 | /* WARNING: Does not work on aliased pages. */ | ||
961 | void mark_page_dirty(struct kvm *kvm, gfn_t gfn) | ||
962 | { | ||
963 | struct kvm_memory_slot *memslot; | ||
964 | |||
965 | memslot = __gfn_to_memslot(kvm, gfn); | ||
966 | if (memslot && memslot->dirty_bitmap) { | ||
967 | unsigned long rel_gfn = gfn - memslot->base_gfn; | ||
968 | |||
969 | /* avoid RMW */ | ||
970 | if (!test_bit(rel_gfn, memslot->dirty_bitmap)) | ||
971 | set_bit(rel_gfn, memslot->dirty_bitmap); | ||
972 | } | ||
973 | } | ||
974 | |||
975 | int emulator_read_std(unsigned long addr, | ||
976 | void *val, | ||
977 | unsigned int bytes, | ||
978 | struct kvm_vcpu *vcpu) | ||
979 | { | ||
980 | void *data = val; | ||
981 | |||
982 | while (bytes) { | ||
983 | gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr); | ||
984 | unsigned offset = addr & (PAGE_SIZE-1); | ||
985 | unsigned tocopy = min(bytes, (unsigned)PAGE_SIZE - offset); | ||
986 | unsigned long pfn; | ||
987 | struct page *page; | ||
988 | void *page_virt; | ||
989 | |||
990 | if (gpa == UNMAPPED_GVA) | ||
991 | return X86EMUL_PROPAGATE_FAULT; | ||
992 | pfn = gpa >> PAGE_SHIFT; | ||
993 | page = gfn_to_page(vcpu->kvm, pfn); | ||
994 | if (!page) | ||
995 | return X86EMUL_UNHANDLEABLE; | ||
996 | page_virt = kmap_atomic(page, KM_USER0); | ||
997 | |||
998 | memcpy(data, page_virt + offset, tocopy); | ||
999 | |||
1000 | kunmap_atomic(page_virt, KM_USER0); | ||
1001 | |||
1002 | bytes -= tocopy; | ||
1003 | data += tocopy; | ||
1004 | addr += tocopy; | ||
1005 | } | ||
1006 | |||
1007 | return X86EMUL_CONTINUE; | ||
1008 | } | ||
1009 | EXPORT_SYMBOL_GPL(emulator_read_std); | ||
1010 | |||
1011 | static int emulator_write_std(unsigned long addr, | ||
1012 | const void *val, | ||
1013 | unsigned int bytes, | ||
1014 | struct kvm_vcpu *vcpu) | ||
1015 | { | ||
1016 | pr_unimpl(vcpu, "emulator_write_std: addr %lx n %d\n", addr, bytes); | ||
1017 | return X86EMUL_UNHANDLEABLE; | ||
1018 | } | ||
1019 | |||
1020 | /* | ||
1021 | * Only the apic needs an MMIO device hook, so shortcut now. | ||
1022 | */ | ||
1023 | static struct kvm_io_device *vcpu_find_pervcpu_dev(struct kvm_vcpu *vcpu, | ||
1024 | gpa_t addr) | ||
1025 | { | ||
1026 | struct kvm_io_device *dev; | ||
1027 | |||
1028 | if (vcpu->apic) { | ||
1029 | dev = &vcpu->apic->dev; | ||
1030 | if (dev->in_range(dev, addr)) | ||
1031 | return dev; | ||
1032 | } | ||
1033 | return NULL; | ||
1034 | } | ||
1035 | |||
1036 | static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu, | ||
1037 | gpa_t addr) | ||
1038 | { | ||
1039 | struct kvm_io_device *dev; | ||
1040 | |||
1041 | dev = vcpu_find_pervcpu_dev(vcpu, addr); | ||
1042 | if (dev == NULL) | ||
1043 | dev = kvm_io_bus_find_dev(&vcpu->kvm->mmio_bus, addr); | ||
1044 | return dev; | ||
1045 | } | ||
1046 | |||
1047 | static struct kvm_io_device *vcpu_find_pio_dev(struct kvm_vcpu *vcpu, | ||
1048 | gpa_t addr) | ||
1049 | { | ||
1050 | return kvm_io_bus_find_dev(&vcpu->kvm->pio_bus, addr); | ||
1051 | } | ||
1052 | |||
1053 | static int emulator_read_emulated(unsigned long addr, | ||
1054 | void *val, | ||
1055 | unsigned int bytes, | ||
1056 | struct kvm_vcpu *vcpu) | ||
1057 | { | ||
1058 | struct kvm_io_device *mmio_dev; | ||
1059 | gpa_t gpa; | ||
1060 | |||
1061 | if (vcpu->mmio_read_completed) { | ||
1062 | memcpy(val, vcpu->mmio_data, bytes); | ||
1063 | vcpu->mmio_read_completed = 0; | ||
1064 | return X86EMUL_CONTINUE; | ||
1065 | } else if (emulator_read_std(addr, val, bytes, vcpu) | ||
1066 | == X86EMUL_CONTINUE) | ||
1067 | return X86EMUL_CONTINUE; | ||
1068 | |||
1069 | gpa = vcpu->mmu.gva_to_gpa(vcpu, addr); | ||
1070 | if (gpa == UNMAPPED_GVA) | ||
1071 | return X86EMUL_PROPAGATE_FAULT; | ||
1072 | |||
1073 | /* | ||
1074 | * Is this MMIO handled locally? | ||
1075 | */ | ||
1076 | mmio_dev = vcpu_find_mmio_dev(vcpu, gpa); | ||
1077 | if (mmio_dev) { | ||
1078 | kvm_iodevice_read(mmio_dev, gpa, bytes, val); | ||
1079 | return X86EMUL_CONTINUE; | ||
1080 | } | ||
1081 | |||
1082 | vcpu->mmio_needed = 1; | ||
1083 | vcpu->mmio_phys_addr = gpa; | ||
1084 | vcpu->mmio_size = bytes; | ||
1085 | vcpu->mmio_is_write = 0; | ||
1086 | |||
1087 | return X86EMUL_UNHANDLEABLE; | ||
1088 | } | ||
1089 | |||
1090 | static int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, | ||
1091 | const void *val, int bytes) | ||
1092 | { | ||
1093 | struct page *page; | ||
1094 | void *virt; | ||
1095 | |||
1096 | if (((gpa + bytes - 1) >> PAGE_SHIFT) != (gpa >> PAGE_SHIFT)) | ||
1097 | return 0; | ||
1098 | page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); | ||
1099 | if (!page) | ||
1100 | return 0; | ||
1101 | mark_page_dirty(vcpu->kvm, gpa >> PAGE_SHIFT); | ||
1102 | virt = kmap_atomic(page, KM_USER0); | ||
1103 | kvm_mmu_pte_write(vcpu, gpa, val, bytes); | ||
1104 | memcpy(virt + offset_in_page(gpa), val, bytes); | ||
1105 | kunmap_atomic(virt, KM_USER0); | ||
1106 | return 1; | ||
1107 | } | ||
1108 | |||
1109 | static int emulator_write_emulated_onepage(unsigned long addr, | ||
1110 | const void *val, | ||
1111 | unsigned int bytes, | ||
1112 | struct kvm_vcpu *vcpu) | ||
1113 | { | ||
1114 | struct kvm_io_device *mmio_dev; | ||
1115 | gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr); | ||
1116 | |||
1117 | if (gpa == UNMAPPED_GVA) { | ||
1118 | kvm_x86_ops->inject_page_fault(vcpu, addr, 2); | ||
1119 | return X86EMUL_PROPAGATE_FAULT; | ||
1120 | } | ||
1121 | |||
1122 | if (emulator_write_phys(vcpu, gpa, val, bytes)) | ||
1123 | return X86EMUL_CONTINUE; | ||
1124 | |||
1125 | /* | ||
1126 | * Is this MMIO handled locally? | ||
1127 | */ | ||
1128 | mmio_dev = vcpu_find_mmio_dev(vcpu, gpa); | ||
1129 | if (mmio_dev) { | ||
1130 | kvm_iodevice_write(mmio_dev, gpa, bytes, val); | ||
1131 | return X86EMUL_CONTINUE; | ||
1132 | } | ||
1133 | |||
1134 | vcpu->mmio_needed = 1; | ||
1135 | vcpu->mmio_phys_addr = gpa; | ||
1136 | vcpu->mmio_size = bytes; | ||
1137 | vcpu->mmio_is_write = 1; | ||
1138 | memcpy(vcpu->mmio_data, val, bytes); | ||
1139 | |||
1140 | return X86EMUL_CONTINUE; | ||
1141 | } | ||
1142 | |||
1143 | int emulator_write_emulated(unsigned long addr, | ||
1144 | const void *val, | ||
1145 | unsigned int bytes, | ||
1146 | struct kvm_vcpu *vcpu) | ||
1147 | { | ||
1148 | /* Crossing a page boundary? */ | ||
1149 | if (((addr + bytes - 1) ^ addr) & PAGE_MASK) { | ||
1150 | int rc, now; | ||
1151 | |||
1152 | now = -addr & ~PAGE_MASK; | ||
1153 | rc = emulator_write_emulated_onepage(addr, val, now, vcpu); | ||
1154 | if (rc != X86EMUL_CONTINUE) | ||
1155 | return rc; | ||
1156 | addr += now; | ||
1157 | val += now; | ||
1158 | bytes -= now; | ||
1159 | } | ||
1160 | return emulator_write_emulated_onepage(addr, val, bytes, vcpu); | ||
1161 | } | ||
1162 | EXPORT_SYMBOL_GPL(emulator_write_emulated); | ||
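/*
 * Worked example for the page-boundary split above (4K pages, hypothetical
 * addresses): for an 8-byte write at addr 0x1ffd, (addr + 7) ^ addr has
 * bits set above PAGE_MASK, so the write crosses a page; now =
 * -0x1ffd & 0xfff = 3, so the first call writes 3 bytes at 0x1ffd..0x1fff
 * and the second writes the remaining 5 bytes starting at 0x2000.
 */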
1163 | |||
1164 | static int emulator_cmpxchg_emulated(unsigned long addr, | ||
1165 | const void *old, | ||
1166 | const void *new, | ||
1167 | unsigned int bytes, | ||
1168 | struct kvm_vcpu *vcpu) | ||
1169 | { | ||
1170 | static int reported; | ||
1171 | |||
1172 | if (!reported) { | ||
1173 | reported = 1; | ||
1174 | printk(KERN_WARNING "kvm: emulating exchange as write\n"); | ||
1175 | } | ||
1176 | return emulator_write_emulated(addr, new, bytes, vcpu); | ||
1177 | } | ||
1178 | |||
1179 | static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg) | ||
1180 | { | ||
1181 | return kvm_x86_ops->get_segment_base(vcpu, seg); | ||
1182 | } | ||
1183 | |||
1184 | int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address) | ||
1185 | { | ||
1186 | return X86EMUL_CONTINUE; | ||
1187 | } | ||
1188 | |||
1189 | int emulate_clts(struct kvm_vcpu *vcpu) | ||
1190 | { | ||
1191 | kvm_x86_ops->set_cr0(vcpu, vcpu->cr0 & ~X86_CR0_TS); | ||
1192 | return X86EMUL_CONTINUE; | ||
1193 | } | ||
1194 | |||
1195 | int emulator_get_dr(struct x86_emulate_ctxt* ctxt, int dr, unsigned long *dest) | ||
1196 | { | ||
1197 | struct kvm_vcpu *vcpu = ctxt->vcpu; | ||
1198 | |||
1199 | switch (dr) { | ||
1200 | case 0 ... 3: | ||
1201 | *dest = kvm_x86_ops->get_dr(vcpu, dr); | ||
1202 | return X86EMUL_CONTINUE; | ||
1203 | default: | ||
1204 | pr_unimpl(vcpu, "%s: unexpected dr %u\n", __FUNCTION__, dr); | ||
1205 | return X86EMUL_UNHANDLEABLE; | ||
1206 | } | ||
1207 | } | ||
1208 | |||
1209 | int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value) | ||
1210 | { | ||
1211 | unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U; | ||
1212 | int exception; | ||
1213 | |||
1214 | kvm_x86_ops->set_dr(ctxt->vcpu, dr, value & mask, &exception); | ||
1215 | if (exception) { | ||
1216 | /* FIXME: better handling */ | ||
1217 | return X86EMUL_UNHANDLEABLE; | ||
1218 | } | ||
1219 | return X86EMUL_CONTINUE; | ||
1220 | } | ||
1221 | |||
1222 | void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context) | ||
1223 | { | ||
1224 | static int reported; | ||
1225 | u8 opcodes[4]; | ||
1226 | unsigned long rip = vcpu->rip; | ||
1227 | unsigned long rip_linear; | ||
1228 | |||
1229 | rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS); | ||
1230 | |||
1231 | if (reported) | ||
1232 | return; | ||
1233 | |||
1234 | emulator_read_std(rip_linear, (void *)opcodes, 4, vcpu); | ||
1235 | |||
1236 | printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n", | ||
1237 | context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]); | ||
1238 | reported = 1; | ||
1239 | } | ||
1240 | EXPORT_SYMBOL_GPL(kvm_report_emulation_failure); | ||
1241 | |||
1242 | struct x86_emulate_ops emulate_ops = { | ||
1243 | .read_std = emulator_read_std, | ||
1244 | .write_std = emulator_write_std, | ||
1245 | .read_emulated = emulator_read_emulated, | ||
1246 | .write_emulated = emulator_write_emulated, | ||
1247 | .cmpxchg_emulated = emulator_cmpxchg_emulated, | ||
1248 | }; | ||
1249 | |||
1250 | int emulate_instruction(struct kvm_vcpu *vcpu, | ||
1251 | struct kvm_run *run, | ||
1252 | unsigned long cr2, | ||
1253 | u16 error_code) | ||
1254 | { | ||
1255 | struct x86_emulate_ctxt emulate_ctxt; | ||
1256 | int r; | ||
1257 | int cs_db, cs_l; | ||
1258 | |||
1259 | vcpu->mmio_fault_cr2 = cr2; | ||
1260 | kvm_x86_ops->cache_regs(vcpu); | ||
1261 | |||
1262 | kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); | ||
1263 | |||
1264 | emulate_ctxt.vcpu = vcpu; | ||
1265 | emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu); | ||
1266 | emulate_ctxt.cr2 = cr2; | ||
1267 | emulate_ctxt.mode = (emulate_ctxt.eflags & X86_EFLAGS_VM) | ||
1268 | ? X86EMUL_MODE_REAL : cs_l | ||
1269 | ? X86EMUL_MODE_PROT64 : cs_db | ||
1270 | ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; | ||
1271 | |||
1272 | if (emulate_ctxt.mode == X86EMUL_MODE_PROT64) { | ||
1273 | emulate_ctxt.cs_base = 0; | ||
1274 | emulate_ctxt.ds_base = 0; | ||
1275 | emulate_ctxt.es_base = 0; | ||
1276 | emulate_ctxt.ss_base = 0; | ||
1277 | } else { | ||
1278 | emulate_ctxt.cs_base = get_segment_base(vcpu, VCPU_SREG_CS); | ||
1279 | emulate_ctxt.ds_base = get_segment_base(vcpu, VCPU_SREG_DS); | ||
1280 | emulate_ctxt.es_base = get_segment_base(vcpu, VCPU_SREG_ES); | ||
1281 | emulate_ctxt.ss_base = get_segment_base(vcpu, VCPU_SREG_SS); | ||
1282 | } | ||
1283 | |||
1284 | emulate_ctxt.gs_base = get_segment_base(vcpu, VCPU_SREG_GS); | ||
1285 | emulate_ctxt.fs_base = get_segment_base(vcpu, VCPU_SREG_FS); | ||
1286 | |||
1287 | vcpu->mmio_is_write = 0; | ||
1288 | vcpu->pio.string = 0; | ||
1289 | r = x86_emulate_memop(&emulate_ctxt, &emulate_ops); | ||
1290 | if (vcpu->pio.string) | ||
1291 | return EMULATE_DO_MMIO; | ||
1292 | |||
1293 | if ((r || vcpu->mmio_is_write) && run) { | ||
1294 | run->exit_reason = KVM_EXIT_MMIO; | ||
1295 | run->mmio.phys_addr = vcpu->mmio_phys_addr; | ||
1296 | memcpy(run->mmio.data, vcpu->mmio_data, 8); | ||
1297 | run->mmio.len = vcpu->mmio_size; | ||
1298 | run->mmio.is_write = vcpu->mmio_is_write; | ||
1299 | } | ||
1300 | |||
1301 | if (r) { | ||
1302 | if (kvm_mmu_unprotect_page_virt(vcpu, cr2)) | ||
1303 | return EMULATE_DONE; | ||
1304 | if (!vcpu->mmio_needed) { | ||
1305 | kvm_report_emulation_failure(vcpu, "mmio"); | ||
1306 | return EMULATE_FAIL; | ||
1307 | } | ||
1308 | return EMULATE_DO_MMIO; | ||
1309 | } | ||
1310 | |||
1311 | kvm_x86_ops->decache_regs(vcpu); | ||
1312 | kvm_x86_ops->set_rflags(vcpu, emulate_ctxt.eflags); | ||
1313 | |||
1314 | if (vcpu->mmio_is_write) { | ||
1315 | vcpu->mmio_needed = 0; | ||
1316 | return EMULATE_DO_MMIO; | ||
1317 | } | ||
1318 | |||
1319 | return EMULATE_DONE; | ||
1320 | } | ||
1321 | EXPORT_SYMBOL_GPL(emulate_instruction); | ||
1322 | |||
1323 | /* | ||
1324 | * The vCPU has executed a HLT instruction with in-kernel mode enabled. | ||
1325 | */ | ||
1326 | static void kvm_vcpu_block(struct kvm_vcpu *vcpu) | ||
1327 | { | ||
1328 | DECLARE_WAITQUEUE(wait, current); | ||
1329 | |||
1330 | add_wait_queue(&vcpu->wq, &wait); | ||
1331 | |||
1332 | /* | ||
1333 | * We will block until either an interrupt or a signal wakes us up | ||
1334 | */ | ||
1335 | while (!kvm_cpu_has_interrupt(vcpu) | ||
1336 | && !signal_pending(current) | ||
1337 | && vcpu->mp_state != VCPU_MP_STATE_RUNNABLE | ||
1338 | && vcpu->mp_state != VCPU_MP_STATE_SIPI_RECEIVED) { | ||
1339 | set_current_state(TASK_INTERRUPTIBLE); | ||
1340 | vcpu_put(vcpu); | ||
1341 | schedule(); | ||
1342 | vcpu_load(vcpu); | ||
1343 | } | ||
1344 | |||
1345 | __set_current_state(TASK_RUNNING); | ||
1346 | remove_wait_queue(&vcpu->wq, &wait); | ||
1347 | } | ||
1348 | |||
1349 | int kvm_emulate_halt(struct kvm_vcpu *vcpu) | ||
1350 | { | ||
1351 | ++vcpu->stat.halt_exits; | ||
1352 | if (irqchip_in_kernel(vcpu->kvm)) { | ||
1353 | vcpu->mp_state = VCPU_MP_STATE_HALTED; | ||
1354 | kvm_vcpu_block(vcpu); | ||
1355 | if (vcpu->mp_state != VCPU_MP_STATE_RUNNABLE) | ||
1356 | return -EINTR; | ||
1357 | return 1; | ||
1358 | } else { | ||
1359 | vcpu->run->exit_reason = KVM_EXIT_HLT; | ||
1360 | return 0; | ||
1361 | } | ||
1362 | } | ||
1363 | EXPORT_SYMBOL_GPL(kvm_emulate_halt); | ||
1364 | |||
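With the userspace irqchip, kvm_emulate_halt() above returns 0 and hands the HLT to the VMM as a KVM_EXIT_HLT exit. A minimal userspace sketch of how that exit might be serviced, assuming vcpu_fd and the mmap'ed run structure were set up elsewhere and using a hypothetical wait_for_guest_interrupt() helper:

        #include <sys/ioctl.h>
        #include <linux/kvm.h>

        /* Hedged sketch: service KVM_EXIT_HLT in a user-level VMM. */
        static void vcpu_loop(int vcpu_fd, struct kvm_run *run)
        {
                for (;;) {
                        if (ioctl(vcpu_fd, KVM_RUN, 0) < 0)
                                break;                  /* signal, or a real error */
                        switch (run->exit_reason) {
                        case KVM_EXIT_HLT:
                                /* The guest halted with interrupts enabled; block
                                 * here until the device model raises an interrupt,
                                 * then go back in and inject it. */
                                wait_for_guest_interrupt();     /* hypothetical */
                                break;
                        default:
                                break;
                        }
                }
        }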
1365 | int kvm_hypercall(struct kvm_vcpu *vcpu, struct kvm_run *run) | ||
1366 | { | ||
1367 | unsigned long nr, a0, a1, a2, a3, a4, a5, ret; | ||
1368 | |||
1369 | kvm_x86_ops->cache_regs(vcpu); | ||
1370 | ret = -KVM_EINVAL; | ||
1371 | #ifdef CONFIG_X86_64 | ||
1372 | if (is_long_mode(vcpu)) { | ||
1373 | nr = vcpu->regs[VCPU_REGS_RAX]; | ||
1374 | a0 = vcpu->regs[VCPU_REGS_RDI]; | ||
1375 | a1 = vcpu->regs[VCPU_REGS_RSI]; | ||
1376 | a2 = vcpu->regs[VCPU_REGS_RDX]; | ||
1377 | a3 = vcpu->regs[VCPU_REGS_RCX]; | ||
1378 | a4 = vcpu->regs[VCPU_REGS_R8]; | ||
1379 | a5 = vcpu->regs[VCPU_REGS_R9]; | ||
1380 | } else | ||
1381 | #endif | ||
1382 | { | ||
1383 | nr = vcpu->regs[VCPU_REGS_RBX] & -1u; | ||
1384 | a0 = vcpu->regs[VCPU_REGS_RAX] & -1u; | ||
1385 | a1 = vcpu->regs[VCPU_REGS_RCX] & -1u; | ||
1386 | a2 = vcpu->regs[VCPU_REGS_RDX] & -1u; | ||
1387 | a3 = vcpu->regs[VCPU_REGS_RSI] & -1u; | ||
1388 | a4 = vcpu->regs[VCPU_REGS_RDI] & -1u; | ||
1389 | a5 = vcpu->regs[VCPU_REGS_RBP] & -1u; | ||
1390 | } | ||
1391 | switch (nr) { | ||
1392 | default: | ||
1393 | run->hypercall.nr = nr; | ||
1394 | run->hypercall.args[0] = a0; | ||
1395 | run->hypercall.args[1] = a1; | ||
1396 | run->hypercall.args[2] = a2; | ||
1397 | run->hypercall.args[3] = a3; | ||
1398 | run->hypercall.args[4] = a4; | ||
1399 | run->hypercall.args[5] = a5; | ||
1400 | run->hypercall.ret = ret; | ||
1401 | run->hypercall.longmode = is_long_mode(vcpu); | ||
1402 | kvm_x86_ops->decache_regs(vcpu); | ||
1403 | return 0; | ||
1404 | } | ||
1405 | vcpu->regs[VCPU_REGS_RAX] = ret; | ||
1406 | kvm_x86_ops->decache_regs(vcpu); | ||
1407 | return 1; | ||
1408 | } | ||
1409 | EXPORT_SYMBOL_GPL(kvm_hypercall); | ||
1410 | |||
1411 | static u64 mk_cr_64(u64 curr_cr, u32 new_val) | ||
1412 | { | ||
1413 | return (curr_cr & ~((1ULL << 32) - 1)) | new_val; | ||
1414 | } | ||
1415 | |||
1416 | void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base) | ||
1417 | { | ||
1418 | struct descriptor_table dt = { limit, base }; | ||
1419 | |||
1420 | kvm_x86_ops->set_gdt(vcpu, &dt); | ||
1421 | } | ||
1422 | |||
1423 | void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base) | ||
1424 | { | ||
1425 | struct descriptor_table dt = { limit, base }; | ||
1426 | |||
1427 | kvm_x86_ops->set_idt(vcpu, &dt); | ||
1428 | } | ||
1429 | |||
1430 | void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw, | ||
1431 | unsigned long *rflags) | ||
1432 | { | ||
1433 | lmsw(vcpu, msw); | ||
1434 | *rflags = kvm_x86_ops->get_rflags(vcpu); | ||
1435 | } | ||
1436 | |||
1437 | unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr) | ||
1438 | { | ||
1439 | kvm_x86_ops->decache_cr4_guest_bits(vcpu); | ||
1440 | switch (cr) { | ||
1441 | case 0: | ||
1442 | return vcpu->cr0; | ||
1443 | case 2: | ||
1444 | return vcpu->cr2; | ||
1445 | case 3: | ||
1446 | return vcpu->cr3; | ||
1447 | case 4: | ||
1448 | return vcpu->cr4; | ||
1449 | default: | ||
1450 | vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr); | ||
1451 | return 0; | ||
1452 | } | ||
1453 | } | ||
1454 | |||
1455 | void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val, | ||
1456 | unsigned long *rflags) | ||
1457 | { | ||
1458 | switch (cr) { | ||
1459 | case 0: | ||
1460 | set_cr0(vcpu, mk_cr_64(vcpu->cr0, val)); | ||
1461 | *rflags = kvm_x86_ops->get_rflags(vcpu); | ||
1462 | break; | ||
1463 | case 2: | ||
1464 | vcpu->cr2 = val; | ||
1465 | break; | ||
1466 | case 3: | ||
1467 | set_cr3(vcpu, val); | ||
1468 | break; | ||
1469 | case 4: | ||
1470 | set_cr4(vcpu, mk_cr_64(vcpu->cr4, val)); | ||
1471 | break; | ||
1472 | default: | ||
1473 | vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr); | ||
1474 | } | ||
1475 | } | ||
1476 | |||
1477 | /* | ||
1478 | * Register the para guest with the host: | ||
1479 | */ | ||
1480 | static int vcpu_register_para(struct kvm_vcpu *vcpu, gpa_t para_state_gpa) | ||
1481 | { | ||
1482 | struct kvm_vcpu_para_state *para_state; | ||
1483 | hpa_t para_state_hpa, hypercall_hpa; | ||
1484 | struct page *para_state_page; | ||
1485 | unsigned char *hypercall; | ||
1486 | gpa_t hypercall_gpa; | ||
1487 | |||
1488 | printk(KERN_DEBUG "kvm: guest trying to enter paravirtual mode\n"); | ||
1489 | printk(KERN_DEBUG ".... para_state_gpa: %08Lx\n", para_state_gpa); | ||
1490 | |||
1491 | /* | ||
1492 | * Needs to be page aligned: | ||
1493 | */ | ||
1494 | if (para_state_gpa != PAGE_ALIGN(para_state_gpa)) | ||
1495 | goto err_gp; | ||
1496 | |||
1497 | para_state_hpa = gpa_to_hpa(vcpu, para_state_gpa); | ||
1498 | printk(KERN_DEBUG ".... para_state_hpa: %08Lx\n", para_state_hpa); | ||
1499 | if (is_error_hpa(para_state_hpa)) | ||
1500 | goto err_gp; | ||
1501 | |||
1502 | mark_page_dirty(vcpu->kvm, para_state_gpa >> PAGE_SHIFT); | ||
1503 | para_state_page = pfn_to_page(para_state_hpa >> PAGE_SHIFT); | ||
1504 | para_state = kmap(para_state_page); | ||
1505 | |||
1506 | printk(KERN_DEBUG ".... guest version: %d\n", para_state->guest_version); | ||
1507 | printk(KERN_DEBUG ".... size: %d\n", para_state->size); | ||
1508 | |||
1509 | para_state->host_version = KVM_PARA_API_VERSION; | ||
1510 | /* | ||
1511 | * We cannot support guests that try to register themselves | ||
1512 | * with a newer API version than the host supports: | ||
1513 | */ | ||
1514 | if (para_state->guest_version > KVM_PARA_API_VERSION) { | ||
1515 | para_state->ret = -KVM_EINVAL; | ||
1516 | goto err_kunmap_skip; | ||
1517 | } | ||
1518 | |||
1519 | hypercall_gpa = para_state->hypercall_gpa; | ||
1520 | hypercall_hpa = gpa_to_hpa(vcpu, hypercall_gpa); | ||
1521 | printk(KERN_DEBUG ".... hypercall_hpa: %08Lx\n", hypercall_hpa); | ||
1522 | if (is_error_hpa(hypercall_hpa)) { | ||
1523 | para_state->ret = -KVM_EINVAL; | ||
1524 | goto err_kunmap_skip; | ||
1525 | } | ||
1526 | |||
1527 | printk(KERN_DEBUG "kvm: para guest successfully registered.\n"); | ||
1528 | vcpu->para_state_page = para_state_page; | ||
1529 | vcpu->para_state_gpa = para_state_gpa; | ||
1530 | vcpu->hypercall_gpa = hypercall_gpa; | ||
1531 | |||
1532 | mark_page_dirty(vcpu->kvm, hypercall_gpa >> PAGE_SHIFT); | ||
1533 | hypercall = kmap_atomic(pfn_to_page(hypercall_hpa >> PAGE_SHIFT), | ||
1534 | KM_USER1) + (hypercall_hpa & ~PAGE_MASK); | ||
1535 | kvm_x86_ops->patch_hypercall(vcpu, hypercall); | ||
1536 | kunmap_atomic(hypercall, KM_USER1); | ||
1537 | |||
1538 | para_state->ret = 0; | ||
1539 | err_kunmap_skip: | ||
1540 | kunmap(para_state_page); | ||
1541 | return 0; | ||
1542 | err_gp: | ||
1543 | return 1; | ||
1544 | } | ||
1545 | |||
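For reference, the guest side of this handshake is a single MSR write: the guest fills a page-aligned struct kvm_vcpu_para_state and writes its guest-physical address to MSR_KVM_API_MAGIC, which lands in kvm_set_msr_common() below. A hedged guest-kernel sketch, where hypercall_page is an assumed page-aligned symbol provided by the guest:

        /* Hedged sketch of the guest-side registration; 'state' must sit in a
         * page-aligned page, per the PAGE_ALIGN() check in vcpu_register_para(). */
        static int kvm_guest_register_para(struct kvm_vcpu_para_state *state)
        {
                state->guest_version = KVM_PARA_API_VERSION;
                state->size = sizeof(*state);
                state->hypercall_gpa = __pa(hypercall_page);    /* assumed symbol */
                state->ret = -1;

                /* This write traps to the host, which patches the hypercall page,
                 * fills in host_version and sets ret to 0 on success. */
                wrmsrl(MSR_KVM_API_MAGIC, __pa(state));

                return state->ret;
        }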
1546 | int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) | ||
1547 | { | ||
1548 | u64 data; | ||
1549 | |||
1550 | switch (msr) { | ||
1551 | case 0xc0010010: /* SYSCFG */ | ||
1552 | case 0xc0010015: /* HWCR */ | ||
1553 | case MSR_IA32_PLATFORM_ID: | ||
1554 | case MSR_IA32_P5_MC_ADDR: | ||
1555 | case MSR_IA32_P5_MC_TYPE: | ||
1556 | case MSR_IA32_MC0_CTL: | ||
1557 | case MSR_IA32_MCG_STATUS: | ||
1558 | case MSR_IA32_MCG_CAP: | ||
1559 | case MSR_IA32_MC0_MISC: | ||
1560 | case MSR_IA32_MC0_MISC+4: | ||
1561 | case MSR_IA32_MC0_MISC+8: | ||
1562 | case MSR_IA32_MC0_MISC+12: | ||
1563 | case MSR_IA32_MC0_MISC+16: | ||
1564 | case MSR_IA32_UCODE_REV: | ||
1565 | case MSR_IA32_PERF_STATUS: | ||
1566 | case MSR_IA32_EBL_CR_POWERON: | ||
1567 | /* MTRR registers */ | ||
1568 | case 0xfe: | ||
1569 | case 0x200 ... 0x2ff: | ||
1570 | data = 0; | ||
1571 | break; | ||
1572 | case 0xcd: /* fsb frequency */ | ||
1573 | data = 3; | ||
1574 | break; | ||
1575 | case MSR_IA32_APICBASE: | ||
1576 | data = kvm_get_apic_base(vcpu); | ||
1577 | break; | ||
1578 | case MSR_IA32_MISC_ENABLE: | ||
1579 | data = vcpu->ia32_misc_enable_msr; | ||
1580 | break; | ||
1581 | #ifdef CONFIG_X86_64 | ||
1582 | case MSR_EFER: | ||
1583 | data = vcpu->shadow_efer; | ||
1584 | break; | ||
1585 | #endif | ||
1586 | default: | ||
1587 | pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr); | ||
1588 | return 1; | ||
1589 | } | ||
1590 | *pdata = data; | ||
1591 | return 0; | ||
1592 | } | ||
1593 | EXPORT_SYMBOL_GPL(kvm_get_msr_common); | ||
1594 | |||
1595 | /* | ||
1596 | * Reads an msr value (of 'msr_index') into 'pdata'. | ||
1597 | * Returns 0 on success, non-0 otherwise. | ||
1598 | * Assumes vcpu_load() was already called. | ||
1599 | */ | ||
1600 | int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) | ||
1601 | { | ||
1602 | return kvm_x86_ops->get_msr(vcpu, msr_index, pdata); | ||
1603 | } | ||
1604 | |||
1605 | #ifdef CONFIG_X86_64 | ||
1606 | |||
1607 | static void set_efer(struct kvm_vcpu *vcpu, u64 efer) | ||
1608 | { | ||
1609 | if (efer & EFER_RESERVED_BITS) { | ||
1610 | printk(KERN_DEBUG "set_efer: 0x%llx #GP, reserved bits\n", | ||
1611 | efer); | ||
1612 | inject_gp(vcpu); | ||
1613 | return; | ||
1614 | } | ||
1615 | |||
1616 | if (is_paging(vcpu) | ||
1617 | && (vcpu->shadow_efer & EFER_LME) != (efer & EFER_LME)) { | ||
1618 | printk(KERN_DEBUG "set_efer: #GP, change LME while paging\n"); | ||
1619 | inject_gp(vcpu); | ||
1620 | return; | ||
1621 | } | ||
1622 | |||
1623 | kvm_x86_ops->set_efer(vcpu, efer); | ||
1624 | |||
1625 | efer &= ~EFER_LMA; | ||
1626 | efer |= vcpu->shadow_efer & EFER_LMA; | ||
1627 | |||
1628 | vcpu->shadow_efer = efer; | ||
1629 | } | ||
1630 | |||
1631 | #endif | ||
1632 | |||
1633 | int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) | ||
1634 | { | ||
1635 | switch (msr) { | ||
1636 | #ifdef CONFIG_X86_64 | ||
1637 | case MSR_EFER: | ||
1638 | set_efer(vcpu, data); | ||
1639 | break; | ||
1640 | #endif | ||
1641 | case MSR_IA32_MC0_STATUS: | ||
1642 | pr_unimpl(vcpu, "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n", | ||
1643 | __FUNCTION__, data); | ||
1644 | break; | ||
1645 | case MSR_IA32_MCG_STATUS: | ||
1646 | pr_unimpl(vcpu, "%s: MSR_IA32_MCG_STATUS 0x%llx, nop\n", | ||
1647 | __FUNCTION__, data); | ||
1648 | break; | ||
1649 | case MSR_IA32_UCODE_REV: | ||
1650 | case MSR_IA32_UCODE_WRITE: | ||
1651 | case 0x200 ... 0x2ff: /* MTRRs */ | ||
1652 | break; | ||
1653 | case MSR_IA32_APICBASE: | ||
1654 | kvm_set_apic_base(vcpu, data); | ||
1655 | break; | ||
1656 | case MSR_IA32_MISC_ENABLE: | ||
1657 | vcpu->ia32_misc_enable_msr = data; | ||
1658 | break; | ||
1659 | /* | ||
1660 | * This is the 'probe whether the host is KVM' logic: | ||
1661 | */ | ||
1662 | case MSR_KVM_API_MAGIC: | ||
1663 | return vcpu_register_para(vcpu, data); | ||
1664 | |||
1665 | default: | ||
1666 | pr_unimpl(vcpu, "unhandled wrmsr: 0x%x\n", msr); | ||
1667 | return 1; | ||
1668 | } | ||
1669 | return 0; | ||
1670 | } | ||
1671 | EXPORT_SYMBOL_GPL(kvm_set_msr_common); | ||
1672 | |||
1673 | /* | ||
1674 | * Writes msr value into the appropriate "register". | ||
1675 | * Returns 0 on success, non-0 otherwise. | ||
1676 | * Assumes vcpu_load() was already called. | ||
1677 | */ | ||
1678 | int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) | ||
1679 | { | ||
1680 | return kvm_x86_ops->set_msr(vcpu, msr_index, data); | ||
1681 | } | ||
1682 | |||
1683 | void kvm_resched(struct kvm_vcpu *vcpu) | ||
1684 | { | ||
1685 | if (!need_resched()) | ||
1686 | return; | ||
1687 | cond_resched(); | ||
1688 | } | ||
1689 | EXPORT_SYMBOL_GPL(kvm_resched); | ||
1690 | |||
1691 | void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) | ||
1692 | { | ||
1693 | int i; | ||
1694 | u32 function; | ||
1695 | struct kvm_cpuid_entry *e, *best; | ||
1696 | |||
1697 | kvm_x86_ops->cache_regs(vcpu); | ||
1698 | function = vcpu->regs[VCPU_REGS_RAX]; | ||
1699 | vcpu->regs[VCPU_REGS_RAX] = 0; | ||
1700 | vcpu->regs[VCPU_REGS_RBX] = 0; | ||
1701 | vcpu->regs[VCPU_REGS_RCX] = 0; | ||
1702 | vcpu->regs[VCPU_REGS_RDX] = 0; | ||
1703 | best = NULL; | ||
1704 | for (i = 0; i < vcpu->cpuid_nent; ++i) { | ||
1705 | e = &vcpu->cpuid_entries[i]; | ||
1706 | if (e->function == function) { | ||
1707 | best = e; | ||
1708 | break; | ||
1709 | } | ||
1710 | /* | ||
1711 | * Both basic or both extended? | ||
1712 | */ | ||
1713 | if (((e->function ^ function) & 0x80000000) == 0) | ||
1714 | if (!best || e->function > best->function) | ||
1715 | best = e; | ||
1716 | } | ||
1717 | if (best) { | ||
1718 | vcpu->regs[VCPU_REGS_RAX] = best->eax; | ||
1719 | vcpu->regs[VCPU_REGS_RBX] = best->ebx; | ||
1720 | vcpu->regs[VCPU_REGS_RCX] = best->ecx; | ||
1721 | vcpu->regs[VCPU_REGS_RDX] = best->edx; | ||
1722 | } | ||
1723 | kvm_x86_ops->decache_regs(vcpu); | ||
1724 | kvm_x86_ops->skip_emulated_instruction(vcpu); | ||
1725 | } | ||
1726 | EXPORT_SYMBOL_GPL(kvm_emulate_cpuid); | ||
1727 | |||
1728 | static int pio_copy_data(struct kvm_vcpu *vcpu) | ||
1729 | { | ||
1730 | void *p = vcpu->pio_data; | ||
1731 | void *q; | ||
1732 | unsigned bytes; | ||
1733 | int nr_pages = vcpu->pio.guest_pages[1] ? 2 : 1; | ||
1734 | |||
1735 | q = vmap(vcpu->pio.guest_pages, nr_pages, VM_READ|VM_WRITE, | ||
1736 | PAGE_KERNEL); | ||
1737 | if (!q) { | ||
1738 | free_pio_guest_pages(vcpu); | ||
1739 | return -ENOMEM; | ||
1740 | } | ||
1741 | q += vcpu->pio.guest_page_offset; | ||
1742 | bytes = vcpu->pio.size * vcpu->pio.cur_count; | ||
1743 | if (vcpu->pio.in) | ||
1744 | memcpy(q, p, bytes); | ||
1745 | else | ||
1746 | memcpy(p, q, bytes); | ||
1747 | q -= vcpu->pio.guest_page_offset; | ||
1748 | vunmap(q); | ||
1749 | free_pio_guest_pages(vcpu); | ||
1750 | return 0; | ||
1751 | } | ||
1752 | |||
1753 | static int complete_pio(struct kvm_vcpu *vcpu) | ||
1754 | { | ||
1755 | struct kvm_pio_request *io = &vcpu->pio; | ||
1756 | long delta; | ||
1757 | int r; | ||
1758 | |||
1759 | kvm_x86_ops->cache_regs(vcpu); | ||
1760 | |||
1761 | if (!io->string) { | ||
1762 | if (io->in) | ||
1763 | memcpy(&vcpu->regs[VCPU_REGS_RAX], vcpu->pio_data, | ||
1764 | io->size); | ||
1765 | } else { | ||
1766 | if (io->in) { | ||
1767 | r = pio_copy_data(vcpu); | ||
1768 | if (r) { | ||
1769 | kvm_x86_ops->cache_regs(vcpu); | ||
1770 | return r; | ||
1771 | } | ||
1772 | } | ||
1773 | |||
1774 | delta = 1; | ||
1775 | if (io->rep) { | ||
1776 | delta *= io->cur_count; | ||
1777 | /* | ||
1778 | * The size of the register should really depend on | ||
1779 | * current address size. | ||
1780 | */ | ||
1781 | vcpu->regs[VCPU_REGS_RCX] -= delta; | ||
1782 | } | ||
1783 | if (io->down) | ||
1784 | delta = -delta; | ||
1785 | delta *= io->size; | ||
1786 | if (io->in) | ||
1787 | vcpu->regs[VCPU_REGS_RDI] += delta; | ||
1788 | else | ||
1789 | vcpu->regs[VCPU_REGS_RSI] += delta; | ||
1790 | } | ||
1791 | |||
1792 | kvm_x86_ops->decache_regs(vcpu); | ||
1793 | |||
1794 | io->count -= io->cur_count; | ||
1795 | io->cur_count = 0; | ||
1796 | |||
1797 | return 0; | ||
1798 | } | ||
1799 | |||
1800 | static void kernel_pio(struct kvm_io_device *pio_dev, | ||
1801 | struct kvm_vcpu *vcpu, | ||
1802 | void *pd) | ||
1803 | { | ||
1804 | /* TODO: String I/O for in kernel device */ | ||
1805 | |||
1806 | mutex_lock(&vcpu->kvm->lock); | ||
1807 | if (vcpu->pio.in) | ||
1808 | kvm_iodevice_read(pio_dev, vcpu->pio.port, | ||
1809 | vcpu->pio.size, | ||
1810 | pd); | ||
1811 | else | ||
1812 | kvm_iodevice_write(pio_dev, vcpu->pio.port, | ||
1813 | vcpu->pio.size, | ||
1814 | pd); | ||
1815 | mutex_unlock(&vcpu->kvm->lock); | ||
1816 | } | ||
1817 | |||
1818 | static void pio_string_write(struct kvm_io_device *pio_dev, | ||
1819 | struct kvm_vcpu *vcpu) | ||
1820 | { | ||
1821 | struct kvm_pio_request *io = &vcpu->pio; | ||
1822 | void *pd = vcpu->pio_data; | ||
1823 | int i; | ||
1824 | |||
1825 | mutex_lock(&vcpu->kvm->lock); | ||
1826 | for (i = 0; i < io->cur_count; i++) { | ||
1827 | kvm_iodevice_write(pio_dev, io->port, | ||
1828 | io->size, | ||
1829 | pd); | ||
1830 | pd += io->size; | ||
1831 | } | ||
1832 | mutex_unlock(&vcpu->kvm->lock); | ||
1833 | } | ||
1834 | |||
1835 | int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, | ||
1836 | int size, unsigned port) | ||
1837 | { | ||
1838 | struct kvm_io_device *pio_dev; | ||
1839 | |||
1840 | vcpu->run->exit_reason = KVM_EXIT_IO; | ||
1841 | vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; | ||
1842 | vcpu->run->io.size = vcpu->pio.size = size; | ||
1843 | vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE; | ||
1844 | vcpu->run->io.count = vcpu->pio.count = vcpu->pio.cur_count = 1; | ||
1845 | vcpu->run->io.port = vcpu->pio.port = port; | ||
1846 | vcpu->pio.in = in; | ||
1847 | vcpu->pio.string = 0; | ||
1848 | vcpu->pio.down = 0; | ||
1849 | vcpu->pio.guest_page_offset = 0; | ||
1850 | vcpu->pio.rep = 0; | ||
1851 | |||
1852 | kvm_x86_ops->cache_regs(vcpu); | ||
1853 | memcpy(vcpu->pio_data, &vcpu->regs[VCPU_REGS_RAX], 4); | ||
1854 | kvm_x86_ops->decache_regs(vcpu); | ||
1855 | |||
1856 | kvm_x86_ops->skip_emulated_instruction(vcpu); | ||
1857 | |||
1858 | pio_dev = vcpu_find_pio_dev(vcpu, port); | ||
1859 | if (pio_dev) { | ||
1860 | kernel_pio(pio_dev, vcpu, vcpu->pio_data); | ||
1861 | complete_pio(vcpu); | ||
1862 | return 1; | ||
1863 | } | ||
1864 | return 0; | ||
1865 | } | ||
1866 | EXPORT_SYMBOL_GPL(kvm_emulate_pio); | ||
1867 | |||
1868 | int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, | ||
1869 | int size, unsigned long count, int down, | ||
1870 | gva_t address, int rep, unsigned port) | ||
1871 | { | ||
1872 | unsigned now, in_page; | ||
1873 | int i, ret = 0; | ||
1874 | int nr_pages = 1; | ||
1875 | struct page *page; | ||
1876 | struct kvm_io_device *pio_dev; | ||
1877 | |||
1878 | vcpu->run->exit_reason = KVM_EXIT_IO; | ||
1879 | vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; | ||
1880 | vcpu->run->io.size = vcpu->pio.size = size; | ||
1881 | vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE; | ||
1882 | vcpu->run->io.count = vcpu->pio.count = vcpu->pio.cur_count = count; | ||
1883 | vcpu->run->io.port = vcpu->pio.port = port; | ||
1884 | vcpu->pio.in = in; | ||
1885 | vcpu->pio.string = 1; | ||
1886 | vcpu->pio.down = down; | ||
1887 | vcpu->pio.guest_page_offset = offset_in_page(address); | ||
1888 | vcpu->pio.rep = rep; | ||
1889 | |||
1890 | if (!count) { | ||
1891 | kvm_x86_ops->skip_emulated_instruction(vcpu); | ||
1892 | return 1; | ||
1893 | } | ||
1894 | |||
1895 | if (!down) | ||
1896 | in_page = PAGE_SIZE - offset_in_page(address); | ||
1897 | else | ||
1898 | in_page = offset_in_page(address) + size; | ||
1899 | now = min(count, (unsigned long)in_page / size); | ||
1900 | if (!now) { | ||
1901 | /* | ||
1902 | * String I/O straddles page boundary. Pin two guest pages | ||
1903 | * so that we satisfy atomicity constraints. Do just one | ||
1904 | * transaction to avoid complexity. | ||
1905 | */ | ||
1906 | nr_pages = 2; | ||
1907 | now = 1; | ||
1908 | } | ||
1909 | if (down) { | ||
1910 | /* | ||
1911 | * String I/O in reverse. Yuck. Kill the guest, fix later. | ||
1912 | */ | ||
1913 | pr_unimpl(vcpu, "guest string pio down\n"); | ||
1914 | inject_gp(vcpu); | ||
1915 | return 1; | ||
1916 | } | ||
1917 | vcpu->run->io.count = now; | ||
1918 | vcpu->pio.cur_count = now; | ||
1919 | |||
1920 | if (vcpu->pio.cur_count == vcpu->pio.count) | ||
1921 | kvm_x86_ops->skip_emulated_instruction(vcpu); | ||
1922 | |||
1923 | for (i = 0; i < nr_pages; ++i) { | ||
1924 | mutex_lock(&vcpu->kvm->lock); | ||
1925 | page = gva_to_page(vcpu, address + i * PAGE_SIZE); | ||
1926 | if (page) | ||
1927 | get_page(page); | ||
1928 | vcpu->pio.guest_pages[i] = page; | ||
1929 | mutex_unlock(&vcpu->kvm->lock); | ||
1930 | if (!page) { | ||
1931 | inject_gp(vcpu); | ||
1932 | free_pio_guest_pages(vcpu); | ||
1933 | return 1; | ||
1934 | } | ||
1935 | } | ||
1936 | |||
1937 | pio_dev = vcpu_find_pio_dev(vcpu, port); | ||
1938 | if (!vcpu->pio.in) { | ||
1939 | /* string PIO write */ | ||
1940 | ret = pio_copy_data(vcpu); | ||
1941 | if (ret >= 0 && pio_dev) { | ||
1942 | pio_string_write(pio_dev, vcpu); | ||
1943 | complete_pio(vcpu); | ||
1944 | if (vcpu->pio.count == 0) | ||
1945 | ret = 1; | ||
1946 | } | ||
1947 | } else if (pio_dev) | ||
1948 | pr_unimpl(vcpu, "no string pio read support yet, " | ||
1949 | "port %x size %d count %ld\n", | ||
1950 | port, size, count); | ||
1951 | |||
1952 | return ret; | ||
1953 | } | ||
1954 | EXPORT_SYMBOL_GPL(kvm_emulate_pio_string); | ||
1955 | |||
1956 | /* | ||
1957 | * Check if userspace requested an interrupt window, and that the | ||
1958 | * interrupt window is open. | ||
1959 | * | ||
1960 | * No need to exit to userspace if we already have an interrupt queued. | ||
1961 | */ | ||
1962 | static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu, | ||
1963 | struct kvm_run *kvm_run) | ||
1964 | { | ||
1965 | return (!vcpu->irq_summary && | ||
1966 | kvm_run->request_interrupt_window && | ||
1967 | vcpu->interrupt_window_open && | ||
1968 | (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF)); | ||
1969 | } | ||
1970 | |||
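In userspace terms (again only with the userspace irqchip), the VMM sets request_interrupt_window whenever its device model has an interrupt pending; once the window opens, KVM_RUN returns and the interrupt is delivered with KVM_INTERRUPT. A hedged sketch, with pending_irq()/pop_irq() standing in for a hypothetical device model:

        static void run_and_inject(int vcpu_fd, struct kvm_run *run)
        {
                struct kvm_interrupt intr;

                /* Ask for an exit as soon as the guest can accept an interrupt. */
                run->request_interrupt_window = pending_irq() ? 1 : 0;

                ioctl(vcpu_fd, KVM_RUN, 0);

                if (pending_irq() && run->ready_for_interrupt_injection &&
                    run->if_flag) {
                        intr.irq = pop_irq();           /* vector to deliver */
                        ioctl(vcpu_fd, KVM_INTERRUPT, &intr);
                }
        }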
1971 | static void post_kvm_run_save(struct kvm_vcpu *vcpu, | ||
1972 | struct kvm_run *kvm_run) | ||
1973 | { | ||
1974 | kvm_run->if_flag = (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF) != 0; | ||
1975 | kvm_run->cr8 = get_cr8(vcpu); | ||
1976 | kvm_run->apic_base = kvm_get_apic_base(vcpu); | ||
1977 | if (irqchip_in_kernel(vcpu->kvm)) | ||
1978 | kvm_run->ready_for_interrupt_injection = 1; | ||
1979 | else | ||
1980 | kvm_run->ready_for_interrupt_injection = | ||
1981 | (vcpu->interrupt_window_open && | ||
1982 | vcpu->irq_summary == 0); | ||
1983 | } | ||
1984 | |||
1985 | static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | ||
1986 | { | ||
1987 | int r; | ||
1988 | |||
1989 | if (unlikely(vcpu->mp_state == VCPU_MP_STATE_SIPI_RECEIVED)) { | ||
1990 | printk("vcpu %d received sipi with vector # %x\n", | ||
1991 | vcpu->vcpu_id, vcpu->sipi_vector); | ||
1992 | kvm_lapic_reset(vcpu); | ||
1993 | kvm_x86_ops->vcpu_reset(vcpu); | ||
1994 | vcpu->mp_state = VCPU_MP_STATE_RUNNABLE; | ||
1995 | } | ||
1996 | |||
1997 | preempted: | ||
1998 | if (vcpu->guest_debug.enabled) | ||
1999 | kvm_x86_ops->guest_debug_pre(vcpu); | ||
2000 | |||
2001 | again: | ||
2002 | r = kvm_mmu_reload(vcpu); | ||
2003 | if (unlikely(r)) | ||
2004 | goto out; | ||
2005 | |||
2006 | preempt_disable(); | ||
2007 | |||
2008 | kvm_x86_ops->prepare_guest_switch(vcpu); | ||
2009 | kvm_load_guest_fpu(vcpu); | ||
2010 | |||
2011 | local_irq_disable(); | ||
2012 | |||
2013 | if (signal_pending(current)) { | ||
2014 | local_irq_enable(); | ||
2015 | preempt_enable(); | ||
2016 | r = -EINTR; | ||
2017 | kvm_run->exit_reason = KVM_EXIT_INTR; | ||
2018 | ++vcpu->stat.signal_exits; | ||
2019 | goto out; | ||
2020 | } | ||
2021 | |||
2022 | if (irqchip_in_kernel(vcpu->kvm)) | ||
2023 | kvm_x86_ops->inject_pending_irq(vcpu); | ||
2024 | else if (!vcpu->mmio_read_completed) | ||
2025 | kvm_x86_ops->inject_pending_vectors(vcpu, kvm_run); | ||
2026 | |||
2027 | vcpu->guest_mode = 1; | ||
2028 | kvm_guest_enter(); | ||
2029 | |||
2030 | if (vcpu->requests) | ||
2031 | if (test_and_clear_bit(KVM_TLB_FLUSH, &vcpu->requests)) | ||
2032 | kvm_x86_ops->tlb_flush(vcpu); | ||
2033 | |||
2034 | kvm_x86_ops->run(vcpu, kvm_run); | ||
2035 | |||
2036 | vcpu->guest_mode = 0; | ||
2037 | local_irq_enable(); | ||
2038 | |||
2039 | ++vcpu->stat.exits; | ||
2040 | |||
2041 | /* | ||
2042 | * We must have an instruction between local_irq_enable() and | ||
2043 | * kvm_guest_exit(), so the timer interrupt isn't delayed by | ||
2044 | * the interrupt shadow. The stat.exits increment will do nicely. | ||
2045 | * But we need to prevent reordering, hence this barrier(): | ||
2046 | */ | ||
2047 | barrier(); | ||
2048 | |||
2049 | kvm_guest_exit(); | ||
2050 | |||
2051 | preempt_enable(); | ||
2052 | |||
2053 | /* | ||
2054 | * Profile KVM exit RIPs: | ||
2055 | */ | ||
2056 | if (unlikely(prof_on == KVM_PROFILING)) { | ||
2057 | kvm_x86_ops->cache_regs(vcpu); | ||
2058 | profile_hit(KVM_PROFILING, (void *)vcpu->rip); | ||
2059 | } | ||
2060 | |||
2061 | r = kvm_x86_ops->handle_exit(kvm_run, vcpu); | ||
2062 | |||
2063 | if (r > 0) { | ||
2064 | if (dm_request_for_irq_injection(vcpu, kvm_run)) { | ||
2065 | r = -EINTR; | ||
2066 | kvm_run->exit_reason = KVM_EXIT_INTR; | ||
2067 | ++vcpu->stat.request_irq_exits; | ||
2068 | goto out; | ||
2069 | } | ||
2070 | if (!need_resched()) { | ||
2071 | ++vcpu->stat.light_exits; | ||
2072 | goto again; | ||
2073 | } | ||
2074 | } | ||
2075 | |||
2076 | out: | ||
2077 | if (r > 0) { | ||
2078 | kvm_resched(vcpu); | ||
2079 | goto preempted; | ||
2080 | } | ||
2081 | |||
2082 | post_kvm_run_save(vcpu, kvm_run); | ||
2083 | |||
2084 | return r; | ||
2085 | } | ||
2086 | |||
2087 | |||
2088 | static int kvm_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | ||
2089 | { | ||
2090 | int r; | ||
2091 | sigset_t sigsaved; | ||
2092 | |||
2093 | vcpu_load(vcpu); | ||
2094 | |||
2095 | if (unlikely(vcpu->mp_state == VCPU_MP_STATE_UNINITIALIZED)) { | ||
2096 | kvm_vcpu_block(vcpu); | ||
2097 | vcpu_put(vcpu); | ||
2098 | return -EAGAIN; | ||
2099 | } | ||
2100 | |||
2101 | if (vcpu->sigset_active) | ||
2102 | sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved); | ||
2103 | |||
2104 | /* re-sync apic's tpr */ | ||
2105 | if (!irqchip_in_kernel(vcpu->kvm)) | ||
2106 | set_cr8(vcpu, kvm_run->cr8); | ||
2107 | |||
2108 | if (vcpu->pio.cur_count) { | ||
2109 | r = complete_pio(vcpu); | ||
2110 | if (r) | ||
2111 | goto out; | ||
2112 | } | ||
2113 | |||
2114 | if (vcpu->mmio_needed) { | ||
2115 | memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8); | ||
2116 | vcpu->mmio_read_completed = 1; | ||
2117 | vcpu->mmio_needed = 0; | ||
2118 | r = emulate_instruction(vcpu, kvm_run, | ||
2119 | vcpu->mmio_fault_cr2, 0); | ||
2120 | if (r == EMULATE_DO_MMIO) { | ||
2121 | /* | ||
2122 | * Read-modify-write. Back to userspace. | ||
2123 | */ | ||
2124 | r = 0; | ||
2125 | goto out; | ||
2126 | } | ||
2127 | } | ||
2128 | |||
2129 | if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL) { | ||
2130 | kvm_x86_ops->cache_regs(vcpu); | ||
2131 | vcpu->regs[VCPU_REGS_RAX] = kvm_run->hypercall.ret; | ||
2132 | kvm_x86_ops->decache_regs(vcpu); | ||
2133 | } | ||
2134 | |||
2135 | r = __vcpu_run(vcpu, kvm_run); | ||
2136 | |||
2137 | out: | ||
2138 | if (vcpu->sigset_active) | ||
2139 | sigprocmask(SIG_SETMASK, &sigsaved, NULL); | ||
2140 | |||
2141 | vcpu_put(vcpu); | ||
2142 | return r; | ||
2143 | } | ||
2144 | |||
2145 | static int kvm_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, | ||
2146 | struct kvm_regs *regs) | ||
2147 | { | ||
2148 | vcpu_load(vcpu); | ||
2149 | |||
2150 | kvm_x86_ops->cache_regs(vcpu); | ||
2151 | |||
2152 | regs->rax = vcpu->regs[VCPU_REGS_RAX]; | ||
2153 | regs->rbx = vcpu->regs[VCPU_REGS_RBX]; | ||
2154 | regs->rcx = vcpu->regs[VCPU_REGS_RCX]; | ||
2155 | regs->rdx = vcpu->regs[VCPU_REGS_RDX]; | ||
2156 | regs->rsi = vcpu->regs[VCPU_REGS_RSI]; | ||
2157 | regs->rdi = vcpu->regs[VCPU_REGS_RDI]; | ||
2158 | regs->rsp = vcpu->regs[VCPU_REGS_RSP]; | ||
2159 | regs->rbp = vcpu->regs[VCPU_REGS_RBP]; | ||
2160 | #ifdef CONFIG_X86_64 | ||
2161 | regs->r8 = vcpu->regs[VCPU_REGS_R8]; | ||
2162 | regs->r9 = vcpu->regs[VCPU_REGS_R9]; | ||
2163 | regs->r10 = vcpu->regs[VCPU_REGS_R10]; | ||
2164 | regs->r11 = vcpu->regs[VCPU_REGS_R11]; | ||
2165 | regs->r12 = vcpu->regs[VCPU_REGS_R12]; | ||
2166 | regs->r13 = vcpu->regs[VCPU_REGS_R13]; | ||
2167 | regs->r14 = vcpu->regs[VCPU_REGS_R14]; | ||
2168 | regs->r15 = vcpu->regs[VCPU_REGS_R15]; | ||
2169 | #endif | ||
2170 | |||
2171 | regs->rip = vcpu->rip; | ||
2172 | regs->rflags = kvm_x86_ops->get_rflags(vcpu); | ||
2173 | |||
2174 | /* | ||
2175 | * Don't leak debug flags in case they were set for guest debugging | ||
2176 | */ | ||
2177 | if (vcpu->guest_debug.enabled && vcpu->guest_debug.singlestep) | ||
2178 | regs->rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF); | ||
2179 | |||
2180 | vcpu_put(vcpu); | ||
2181 | |||
2182 | return 0; | ||
2183 | } | ||
2184 | |||
2185 | static int kvm_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, | ||
2186 | struct kvm_regs *regs) | ||
2187 | { | ||
2188 | vcpu_load(vcpu); | ||
2189 | |||
2190 | vcpu->regs[VCPU_REGS_RAX] = regs->rax; | ||
2191 | vcpu->regs[VCPU_REGS_RBX] = regs->rbx; | ||
2192 | vcpu->regs[VCPU_REGS_RCX] = regs->rcx; | ||
2193 | vcpu->regs[VCPU_REGS_RDX] = regs->rdx; | ||
2194 | vcpu->regs[VCPU_REGS_RSI] = regs->rsi; | ||
2195 | vcpu->regs[VCPU_REGS_RDI] = regs->rdi; | ||
2196 | vcpu->regs[VCPU_REGS_RSP] = regs->rsp; | ||
2197 | vcpu->regs[VCPU_REGS_RBP] = regs->rbp; | ||
2198 | #ifdef CONFIG_X86_64 | ||
2199 | vcpu->regs[VCPU_REGS_R8] = regs->r8; | ||
2200 | vcpu->regs[VCPU_REGS_R9] = regs->r9; | ||
2201 | vcpu->regs[VCPU_REGS_R10] = regs->r10; | ||
2202 | vcpu->regs[VCPU_REGS_R11] = regs->r11; | ||
2203 | vcpu->regs[VCPU_REGS_R12] = regs->r12; | ||
2204 | vcpu->regs[VCPU_REGS_R13] = regs->r13; | ||
2205 | vcpu->regs[VCPU_REGS_R14] = regs->r14; | ||
2206 | vcpu->regs[VCPU_REGS_R15] = regs->r15; | ||
2207 | #endif | ||
2208 | |||
2209 | vcpu->rip = regs->rip; | ||
2210 | kvm_x86_ops->set_rflags(vcpu, regs->rflags); | ||
2211 | |||
2212 | kvm_x86_ops->decache_regs(vcpu); | ||
2213 | |||
2214 | vcpu_put(vcpu); | ||
2215 | |||
2216 | return 0; | ||
2217 | } | ||
2218 | |||
2219 | static void get_segment(struct kvm_vcpu *vcpu, | ||
2220 | struct kvm_segment *var, int seg) | ||
2221 | { | ||
2222 | return kvm_x86_ops->get_segment(vcpu, var, seg); | ||
2223 | } | ||
2224 | |||
2225 | static int kvm_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, | ||
2226 | struct kvm_sregs *sregs) | ||
2227 | { | ||
2228 | struct descriptor_table dt; | ||
2229 | int pending_vec; | ||
2230 | |||
2231 | vcpu_load(vcpu); | ||
2232 | |||
2233 | get_segment(vcpu, &sregs->cs, VCPU_SREG_CS); | ||
2234 | get_segment(vcpu, &sregs->ds, VCPU_SREG_DS); | ||
2235 | get_segment(vcpu, &sregs->es, VCPU_SREG_ES); | ||
2236 | get_segment(vcpu, &sregs->fs, VCPU_SREG_FS); | ||
2237 | get_segment(vcpu, &sregs->gs, VCPU_SREG_GS); | ||
2238 | get_segment(vcpu, &sregs->ss, VCPU_SREG_SS); | ||
2239 | |||
2240 | get_segment(vcpu, &sregs->tr, VCPU_SREG_TR); | ||
2241 | get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); | ||
2242 | |||
2243 | kvm_x86_ops->get_idt(vcpu, &dt); | ||
2244 | sregs->idt.limit = dt.limit; | ||
2245 | sregs->idt.base = dt.base; | ||
2246 | kvm_x86_ops->get_gdt(vcpu, &dt); | ||
2247 | sregs->gdt.limit = dt.limit; | ||
2248 | sregs->gdt.base = dt.base; | ||
2249 | |||
2250 | kvm_x86_ops->decache_cr4_guest_bits(vcpu); | ||
2251 | sregs->cr0 = vcpu->cr0; | ||
2252 | sregs->cr2 = vcpu->cr2; | ||
2253 | sregs->cr3 = vcpu->cr3; | ||
2254 | sregs->cr4 = vcpu->cr4; | ||
2255 | sregs->cr8 = get_cr8(vcpu); | ||
2256 | sregs->efer = vcpu->shadow_efer; | ||
2257 | sregs->apic_base = kvm_get_apic_base(vcpu); | ||
2258 | |||
2259 | if (irqchip_in_kernel(vcpu->kvm)) { | ||
2260 | memset(sregs->interrupt_bitmap, 0, | ||
2261 | sizeof sregs->interrupt_bitmap); | ||
2262 | pending_vec = kvm_x86_ops->get_irq(vcpu); | ||
2263 | if (pending_vec >= 0) | ||
2264 | set_bit(pending_vec, (unsigned long *)sregs->interrupt_bitmap); | ||
2265 | } else | ||
2266 | memcpy(sregs->interrupt_bitmap, vcpu->irq_pending, | ||
2267 | sizeof sregs->interrupt_bitmap); | ||
2268 | |||
2269 | vcpu_put(vcpu); | ||
2270 | |||
2271 | return 0; | ||
2272 | } | ||
2273 | |||
2274 | static void set_segment(struct kvm_vcpu *vcpu, | ||
2275 | struct kvm_segment *var, int seg) | ||
2276 | { | ||
2277 | return kvm_x86_ops->set_segment(vcpu, var, seg); | ||
2278 | } | ||
2279 | |||
2280 | static int kvm_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, | ||
2281 | struct kvm_sregs *sregs) | ||
2282 | { | ||
2283 | int mmu_reset_needed = 0; | ||
2284 | int i, pending_vec, max_bits; | ||
2285 | struct descriptor_table dt; | ||
2286 | |||
2287 | vcpu_load(vcpu); | ||
2288 | |||
2289 | dt.limit = sregs->idt.limit; | ||
2290 | dt.base = sregs->idt.base; | ||
2291 | kvm_x86_ops->set_idt(vcpu, &dt); | ||
2292 | dt.limit = sregs->gdt.limit; | ||
2293 | dt.base = sregs->gdt.base; | ||
2294 | kvm_x86_ops->set_gdt(vcpu, &dt); | ||
2295 | |||
2296 | vcpu->cr2 = sregs->cr2; | ||
2297 | mmu_reset_needed |= vcpu->cr3 != sregs->cr3; | ||
2298 | vcpu->cr3 = sregs->cr3; | ||
2299 | |||
2300 | set_cr8(vcpu, sregs->cr8); | ||
2301 | |||
2302 | mmu_reset_needed |= vcpu->shadow_efer != sregs->efer; | ||
2303 | #ifdef CONFIG_X86_64 | ||
2304 | kvm_x86_ops->set_efer(vcpu, sregs->efer); | ||
2305 | #endif | ||
2306 | kvm_set_apic_base(vcpu, sregs->apic_base); | ||
2307 | |||
2308 | kvm_x86_ops->decache_cr4_guest_bits(vcpu); | ||
2309 | |||
2310 | mmu_reset_needed |= vcpu->cr0 != sregs->cr0; | ||
2311 | vcpu->cr0 = sregs->cr0; | ||
2312 | kvm_x86_ops->set_cr0(vcpu, sregs->cr0); | ||
2313 | |||
2314 | mmu_reset_needed |= vcpu->cr4 != sregs->cr4; | ||
2315 | kvm_x86_ops->set_cr4(vcpu, sregs->cr4); | ||
2316 | if (!is_long_mode(vcpu) && is_pae(vcpu)) | ||
2317 | load_pdptrs(vcpu, vcpu->cr3); | ||
2318 | |||
2319 | if (mmu_reset_needed) | ||
2320 | kvm_mmu_reset_context(vcpu); | ||
2321 | |||
2322 | if (!irqchip_in_kernel(vcpu->kvm)) { | ||
2323 | memcpy(vcpu->irq_pending, sregs->interrupt_bitmap, | ||
2324 | sizeof vcpu->irq_pending); | ||
2325 | vcpu->irq_summary = 0; | ||
2326 | for (i = 0; i < ARRAY_SIZE(vcpu->irq_pending); ++i) | ||
2327 | if (vcpu->irq_pending[i]) | ||
2328 | __set_bit(i, &vcpu->irq_summary); | ||
2329 | } else { | ||
2330 | max_bits = (sizeof sregs->interrupt_bitmap) << 3; | ||
2331 | pending_vec = find_first_bit( | ||
2332 | (const unsigned long *)sregs->interrupt_bitmap, | ||
2333 | max_bits); | ||
2334 | /* Only pending external irq is handled here */ | ||
2335 | if (pending_vec < max_bits) { | ||
2336 | kvm_x86_ops->set_irq(vcpu, pending_vec); | ||
2337 | printk("Set back pending irq %d\n", pending_vec); | ||
2338 | } | ||
2339 | } | ||
2340 | |||
2341 | set_segment(vcpu, &sregs->cs, VCPU_SREG_CS); | ||
2342 | set_segment(vcpu, &sregs->ds, VCPU_SREG_DS); | ||
2343 | set_segment(vcpu, &sregs->es, VCPU_SREG_ES); | ||
2344 | set_segment(vcpu, &sregs->fs, VCPU_SREG_FS); | ||
2345 | set_segment(vcpu, &sregs->gs, VCPU_SREG_GS); | ||
2346 | set_segment(vcpu, &sregs->ss, VCPU_SREG_SS); | ||
2347 | |||
2348 | set_segment(vcpu, &sregs->tr, VCPU_SREG_TR); | ||
2349 | set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); | ||
2350 | |||
2351 | vcpu_put(vcpu); | ||
2352 | |||
2353 | return 0; | ||
2354 | } | ||
2355 | |||
2356 | void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) | ||
2357 | { | ||
2358 | struct kvm_segment cs; | ||
2359 | |||
2360 | get_segment(vcpu, &cs, VCPU_SREG_CS); | ||
2361 | *db = cs.db; | ||
2362 | *l = cs.l; | ||
2363 | } | ||
2364 | EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits); | ||
2365 | |||
2366 | /* | ||
2367 | * List of msr numbers which we expose to userspace through KVM_GET_MSRS | ||
2368 | * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST. | ||
2369 | * | ||
2370 | * This list is modified at module load time to reflect the | ||
2371 | * capabilities of the host cpu. | ||
2372 | */ | ||
2373 | static u32 msrs_to_save[] = { | ||
2374 | MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, | ||
2375 | MSR_K6_STAR, | ||
2376 | #ifdef CONFIG_X86_64 | ||
2377 | MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, | ||
2378 | #endif | ||
2379 | MSR_IA32_TIME_STAMP_COUNTER, | ||
2380 | }; | ||
2381 | |||
2382 | static unsigned num_msrs_to_save; | ||
2383 | |||
2384 | static u32 emulated_msrs[] = { | ||
2385 | MSR_IA32_MISC_ENABLE, | ||
2386 | }; | ||
2387 | |||
2388 | static __init void kvm_init_msr_list(void) | ||
2389 | { | ||
2390 | u32 dummy[2]; | ||
2391 | unsigned i, j; | ||
2392 | |||
2393 | for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) { | ||
2394 | if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0) | ||
2395 | continue; | ||
2396 | if (j < i) | ||
2397 | msrs_to_save[j] = msrs_to_save[i]; | ||
2398 | j++; | ||
2399 | } | ||
2400 | num_msrs_to_save = j; | ||
2401 | } | ||
2402 | |||
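The filtered list built here is what the KVM_GET_MSR_INDEX_LIST ioctl (handled on the /dev/kvm fd elsewhere in this file) reports to userspace. A hedged sketch of the usual two-step query, relying on the convention that an undersized buffer fails with E2BIG after nmsrs has been updated:

        static struct kvm_msr_list *query_msr_list(int kvm_fd)
        {
                struct kvm_msr_list probe = { .nmsrs = 0 };
                struct kvm_msr_list *list;

                /* First call is expected to fail with E2BIG and set probe.nmsrs. */
                ioctl(kvm_fd, KVM_GET_MSR_INDEX_LIST, &probe);
                list = malloc(sizeof(*list) + probe.nmsrs * sizeof(__u32));
                if (!list)
                        return NULL;
                list->nmsrs = probe.nmsrs;
                if (ioctl(kvm_fd, KVM_GET_MSR_INDEX_LIST, list) < 0) {
                        free(list);
                        return NULL;
                }
                return list;
        }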
2403 | /* | ||
2404 | * Adapt set_msr() to msr_io()'s calling convention | ||
2405 | */ | ||
2406 | static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) | ||
2407 | { | ||
2408 | return kvm_set_msr(vcpu, index, *data); | ||
2409 | } | ||
2410 | |||
2411 | /* | ||
2412 | * Read or write a bunch of msrs. All parameters are kernel addresses. | ||
2413 | * | ||
2414 | * @return number of msrs set successfully. | ||
2415 | */ | ||
2416 | static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs, | ||
2417 | struct kvm_msr_entry *entries, | ||
2418 | int (*do_msr)(struct kvm_vcpu *vcpu, | ||
2419 | unsigned index, u64 *data)) | ||
2420 | { | ||
2421 | int i; | ||
2422 | |||
2423 | vcpu_load(vcpu); | ||
2424 | |||
2425 | for (i = 0; i < msrs->nmsrs; ++i) | ||
2426 | if (do_msr(vcpu, entries[i].index, &entries[i].data)) | ||
2427 | break; | ||
2428 | |||
2429 | vcpu_put(vcpu); | ||
2430 | |||
2431 | return i; | ||
2432 | } | ||
2433 | |||
2434 | /* | ||
2435 | * Read or write a bunch of msrs. Parameters are user addresses. | ||
2436 | * | ||
2437 | * @return number of msrs set successfully. | ||
2438 | */ | ||
2439 | static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs, | ||
2440 | int (*do_msr)(struct kvm_vcpu *vcpu, | ||
2441 | unsigned index, u64 *data), | ||
2442 | int writeback) | ||
2443 | { | ||
2444 | struct kvm_msrs msrs; | ||
2445 | struct kvm_msr_entry *entries; | ||
2446 | int r, n; | ||
2447 | unsigned size; | ||
2448 | |||
2449 | r = -EFAULT; | ||
2450 | if (copy_from_user(&msrs, user_msrs, sizeof msrs)) | ||
2451 | goto out; | ||
2452 | |||
2453 | r = -E2BIG; | ||
2454 | if (msrs.nmsrs >= MAX_IO_MSRS) | ||
2455 | goto out; | ||
2456 | |||
2457 | r = -ENOMEM; | ||
2458 | size = sizeof(struct kvm_msr_entry) * msrs.nmsrs; | ||
2459 | entries = vmalloc(size); | ||
2460 | if (!entries) | ||
2461 | goto out; | ||
2462 | |||
2463 | r = -EFAULT; | ||
2464 | if (copy_from_user(entries, user_msrs->entries, size)) | ||
2465 | goto out_free; | ||
2466 | |||
2467 | r = n = __msr_io(vcpu, &msrs, entries, do_msr); | ||
2468 | if (r < 0) | ||
2469 | goto out_free; | ||
2470 | |||
2471 | r = -EFAULT; | ||
2472 | if (writeback && copy_to_user(user_msrs->entries, entries, size)) | ||
2473 | goto out_free; | ||
2474 | |||
2475 | r = n; | ||
2476 | |||
2477 | out_free: | ||
2478 | vfree(entries); | ||
2479 | out: | ||
2480 | return r; | ||
2481 | } | ||
2482 | |||
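A hedged userspace sketch of the KVM_GET_MSRS path served by msr_io(), reading a single MSR from a vcpu fd; note the ioctl's return value is the count of MSRs processed, mirroring __msr_io() above:

        static int vcpu_get_msr(int vcpu_fd, __u32 index, __u64 *value)
        {
                struct {
                        struct kvm_msrs header;
                        struct kvm_msr_entry entry;
                } msr_data = {
                        .header.nmsrs = 1,
                        .entry.index  = index,
                };

                if (ioctl(vcpu_fd, KVM_GET_MSRS, &msr_data) != 1)
                        return -1;              /* fewer than 1 MSR was read */
                *value = msr_data.entry.data;
                return 0;
        }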
2483 | /* | ||
2484 | * Translate a guest virtual address to a guest physical address. | ||
2485 | */ | ||
2486 | static int kvm_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, | ||
2487 | struct kvm_translation *tr) | ||
2488 | { | ||
2489 | unsigned long vaddr = tr->linear_address; | ||
2490 | gpa_t gpa; | ||
2491 | |||
2492 | vcpu_load(vcpu); | ||
2493 | mutex_lock(&vcpu->kvm->lock); | ||
2494 | gpa = vcpu->mmu.gva_to_gpa(vcpu, vaddr); | ||
2495 | tr->physical_address = gpa; | ||
2496 | tr->valid = gpa != UNMAPPED_GVA; | ||
2497 | tr->writeable = 1; | ||
2498 | tr->usermode = 0; | ||
2499 | mutex_unlock(&vcpu->kvm->lock); | ||
2500 | vcpu_put(vcpu); | ||
2501 | |||
2502 | return 0; | ||
2503 | } | ||
2504 | |||
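The matching userspace call is straightforward; a hedged sketch, assuming vcpu_fd:

        static int guest_virt_to_phys(int vcpu_fd, __u64 gva, __u64 *gpa)
        {
                struct kvm_translation tr = { .linear_address = gva };

                if (ioctl(vcpu_fd, KVM_TRANSLATE, &tr) < 0)
                        return -1;
                if (!tr.valid)
                        return -1;      /* not mapped by the guest page tables */
                *gpa = tr.physical_address;
                return 0;
        }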
2505 | static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, | ||
2506 | struct kvm_interrupt *irq) | ||
2507 | { | ||
2508 | if (irq->irq < 0 || irq->irq >= 256) | ||
2509 | return -EINVAL; | ||
2510 | if (irqchip_in_kernel(vcpu->kvm)) | ||
2511 | return -ENXIO; | ||
2512 | vcpu_load(vcpu); | ||
2513 | |||
2514 | set_bit(irq->irq, vcpu->irq_pending); | ||
2515 | set_bit(irq->irq / BITS_PER_LONG, &vcpu->irq_summary); | ||
2516 | |||
2517 | vcpu_put(vcpu); | ||
2518 | |||
2519 | return 0; | ||
2520 | } | ||
2521 | |||
2522 | static int kvm_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu, | ||
2523 | struct kvm_debug_guest *dbg) | ||
2524 | { | ||
2525 | int r; | ||
2526 | |||
2527 | vcpu_load(vcpu); | ||
2528 | |||
2529 | r = kvm_x86_ops->set_guest_debug(vcpu, dbg); | ||
2530 | |||
2531 | vcpu_put(vcpu); | ||
2532 | |||
2533 | return r; | ||
2534 | } | ||
2535 | |||
2536 | static struct page *kvm_vcpu_nopage(struct vm_area_struct *vma, | ||
2537 | unsigned long address, | ||
2538 | int *type) | ||
2539 | { | ||
2540 | struct kvm_vcpu *vcpu = vma->vm_file->private_data; | ||
2541 | unsigned long pgoff; | ||
2542 | struct page *page; | ||
2543 | |||
2544 | pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; | ||
2545 | if (pgoff == 0) | ||
2546 | page = virt_to_page(vcpu->run); | ||
2547 | else if (pgoff == KVM_PIO_PAGE_OFFSET) | ||
2548 | page = virt_to_page(vcpu->pio_data); | ||
2549 | else | ||
2550 | return NOPAGE_SIGBUS; | ||
2551 | get_page(page); | ||
2552 | if (type != NULL) | ||
2553 | *type = VM_FAULT_MINOR; | ||
2554 | |||
2555 | return page; | ||
2556 | } | ||
2557 | |||
2558 | static struct vm_operations_struct kvm_vcpu_vm_ops = { | ||
2559 | .nopage = kvm_vcpu_nopage, | ||
2560 | }; | ||
2561 | |||
2562 | static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma) | ||
2563 | { | ||
2564 | vma->vm_ops = &kvm_vcpu_vm_ops; | ||
2565 | return 0; | ||
2566 | } | ||
2567 | |||
2568 | static int kvm_vcpu_release(struct inode *inode, struct file *filp) | ||
2569 | { | ||
2570 | struct kvm_vcpu *vcpu = filp->private_data; | ||
2571 | |||
2572 | fput(vcpu->kvm->filp); | ||
2573 | return 0; | ||
2574 | } | ||
2575 | |||
2576 | static struct file_operations kvm_vcpu_fops = { | ||
2577 | .release = kvm_vcpu_release, | ||
2578 | .unlocked_ioctl = kvm_vcpu_ioctl, | ||
2579 | .compat_ioctl = kvm_vcpu_ioctl, | ||
2580 | .mmap = kvm_vcpu_mmap, | ||
2581 | }; | ||
2582 | |||
2583 | /* | ||
2584 | * Allocates an inode for the vcpu. | ||
2585 | */ | ||
2586 | static int create_vcpu_fd(struct kvm_vcpu *vcpu) | ||
2587 | { | ||
2588 | int fd, r; | ||
2589 | struct inode *inode; | ||
2590 | struct file *file; | ||
2591 | |||
2592 | r = anon_inode_getfd(&fd, &inode, &file, | ||
2593 | "kvm-vcpu", &kvm_vcpu_fops, vcpu); | ||
2594 | if (r) | ||
2595 | return r; | ||
2596 | atomic_inc(&vcpu->kvm->filp->f_count); | ||
2597 | return fd; | ||
2598 | } | ||
2599 | |||
2600 | /* | ||
2601 | * Creates some virtual cpus. Good luck creating more than one. | ||
2602 | */ | ||
2603 | static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int n) | ||
2604 | { | ||
2605 | int r; | ||
2606 | struct kvm_vcpu *vcpu; | ||
2607 | |||
2608 | if (!valid_vcpu(n)) | ||
2609 | return -EINVAL; | ||
2610 | |||
2611 | vcpu = kvm_x86_ops->vcpu_create(kvm, n); | ||
2612 | if (IS_ERR(vcpu)) | ||
2613 | return PTR_ERR(vcpu); | ||
2614 | |||
2615 | preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops); | ||
2616 | |||
2617 | /* We do fxsave: this must be aligned. */ | ||
2618 | BUG_ON((unsigned long)&vcpu->host_fx_image & 0xF); | ||
2619 | |||
2620 | vcpu_load(vcpu); | ||
2621 | r = kvm_mmu_setup(vcpu); | ||
2622 | vcpu_put(vcpu); | ||
2623 | if (r < 0) | ||
2624 | goto free_vcpu; | ||
2625 | |||
2626 | mutex_lock(&kvm->lock); | ||
2627 | if (kvm->vcpus[n]) { | ||
2628 | r = -EEXIST; | ||
2629 | mutex_unlock(&kvm->lock); | ||
2630 | goto mmu_unload; | ||
2631 | } | ||
2632 | kvm->vcpus[n] = vcpu; | ||
2633 | mutex_unlock(&kvm->lock); | ||
2634 | |||
2635 | /* Now it's all set up, let userspace reach it */ | ||
2636 | r = create_vcpu_fd(vcpu); | ||
2637 | if (r < 0) | ||
2638 | goto unlink; | ||
2639 | return r; | ||
2640 | |||
2641 | unlink: | ||
2642 | mutex_lock(&kvm->lock); | ||
2643 | kvm->vcpus[n] = NULL; | ||
2644 | mutex_unlock(&kvm->lock); | ||
2645 | |||
2646 | mmu_unload: | ||
2647 | vcpu_load(vcpu); | ||
2648 | kvm_mmu_unload(vcpu); | ||
2649 | vcpu_put(vcpu); | ||
2650 | |||
2651 | free_vcpu: | ||
2652 | kvm_x86_ops->vcpu_free(vcpu); | ||
2653 | return r; | ||
2654 | } | ||
2655 | |||
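From userspace the flow is KVM_CREATE_VCPU on the VM fd followed by an mmap() of the returned vcpu fd; the pages served by kvm_vcpu_nopage() below (struct kvm_run at offset 0, the PIO data page at KVM_PIO_PAGE_OFFSET) live in that mapping. A hedged sketch, where mmap_size would normally come from KVM_GET_VCPU_MMAP_SIZE on /dev/kvm:

        static struct kvm_run *create_vcpu(int vm_fd, int id, size_t mmap_size,
                                           int *vcpu_fd_out)
        {
                int vcpu_fd = ioctl(vm_fd, KVM_CREATE_VCPU, id);
                struct kvm_run *run;

                if (vcpu_fd < 0)
                        return NULL;
                run = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
                           MAP_SHARED, vcpu_fd, 0);
                if (run == MAP_FAILED)
                        return NULL;
                *vcpu_fd_out = vcpu_fd;
                return run;
        }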
2656 | static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu) | ||
2657 | { | ||
2658 | u64 efer; | ||
2659 | int i; | ||
2660 | struct kvm_cpuid_entry *e, *entry; | ||
2661 | |||
2662 | rdmsrl(MSR_EFER, efer); | ||
2663 | entry = NULL; | ||
2664 | for (i = 0; i < vcpu->cpuid_nent; ++i) { | ||
2665 | e = &vcpu->cpuid_entries[i]; | ||
2666 | if (e->function == 0x80000001) { | ||
2667 | entry = e; | ||
2668 | break; | ||
2669 | } | ||
2670 | } | ||
2671 | if (entry && (entry->edx & (1 << 20)) && !(efer & EFER_NX)) { | ||
2672 | entry->edx &= ~(1 << 20); | ||
2673 | printk(KERN_INFO "kvm: guest NX capability removed\n"); | ||
2674 | } | ||
2675 | } | ||
2676 | |||
2677 | static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, | ||
2678 | struct kvm_cpuid *cpuid, | ||
2679 | struct kvm_cpuid_entry __user *entries) | ||
2680 | { | ||
2681 | int r; | ||
2682 | |||
2683 | r = -E2BIG; | ||
2684 | if (cpuid->nent > KVM_MAX_CPUID_ENTRIES) | ||
2685 | goto out; | ||
2686 | r = -EFAULT; | ||
2687 | if (copy_from_user(&vcpu->cpuid_entries, entries, | ||
2688 | cpuid->nent * sizeof(struct kvm_cpuid_entry))) | ||
2689 | goto out; | ||
2690 | vcpu->cpuid_nent = cpuid->nent; | ||
2691 | cpuid_fix_nx_cap(vcpu); | ||
2692 | return 0; | ||
2693 | |||
2694 | out: | ||
2695 | return r; | ||
2696 | } | ||
2697 | |||
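Userspace typically fills this table by running CPUID on the host and editing the result before handing it down; kvm_emulate_cpuid() earlier in this file answers guest CPUID from exactly these entries. A hedged sketch, with host_cpuid() as a hypothetical helper that executes the CPUID instruction into an entry:

        static int set_basic_cpuid(int vcpu_fd)
        {
                struct {
                        struct kvm_cpuid header;
                        struct kvm_cpuid_entry entries[2];
                } cpuid = { .header.nent = 2 };

                cpuid.entries[0].function = 0x00000000;
                host_cpuid(&cpuid.entries[0]);          /* hypothetical helper */
                cpuid.entries[1].function = 0x00000001;
                host_cpuid(&cpuid.entries[1]);

                return ioctl(vcpu_fd, KVM_SET_CPUID, &cpuid);
        }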
2698 | static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset) | ||
2699 | { | ||
2700 | if (sigset) { | ||
2701 | sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP)); | ||
2702 | vcpu->sigset_active = 1; | ||
2703 | vcpu->sigset = *sigset; | ||
2704 | } else | ||
2705 | vcpu->sigset_active = 0; | ||
2706 | return 0; | ||
2707 | } | ||
2708 | |||
2709 | /* | ||
2710 | * fxsave fpu state. Taken from x86_64/processor.h. To be killed when | ||
2711 | * we have asm/x86/processor.h | ||
2712 | */ | ||
2713 | struct fxsave { | ||
2714 | u16 cwd; | ||
2715 | u16 swd; | ||
2716 | u16 twd; | ||
2717 | u16 fop; | ||
2718 | u64 rip; | ||
2719 | u64 rdp; | ||
2720 | u32 mxcsr; | ||
2721 | u32 mxcsr_mask; | ||
2722 | u32 st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */ | ||
2723 | #ifdef CONFIG_X86_64 | ||
2724 | u32 xmm_space[64]; /* 16*16 bytes for each XMM-reg = 256 bytes */ | ||
2725 | #else | ||
2726 | u32 xmm_space[32]; /* 8*16 bytes for each XMM-reg = 128 bytes */ | ||
2727 | #endif | ||
2728 | }; | ||
2729 | |||
2730 | static int kvm_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) | ||
2731 | { | ||
2732 | struct fxsave *fxsave = (struct fxsave *)&vcpu->guest_fx_image; | ||
2733 | |||
2734 | vcpu_load(vcpu); | ||
2735 | |||
2736 | memcpy(fpu->fpr, fxsave->st_space, 128); | ||
2737 | fpu->fcw = fxsave->cwd; | ||
2738 | fpu->fsw = fxsave->swd; | ||
2739 | fpu->ftwx = fxsave->twd; | ||
2740 | fpu->last_opcode = fxsave->fop; | ||
2741 | fpu->last_ip = fxsave->rip; | ||
2742 | fpu->last_dp = fxsave->rdp; | ||
2743 | memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space); | ||
2744 | |||
2745 | vcpu_put(vcpu); | ||
2746 | |||
2747 | return 0; | ||
2748 | } | ||
2749 | |||
2750 | static int kvm_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) | ||
2751 | { | ||
2752 | struct fxsave *fxsave = (struct fxsave *)&vcpu->guest_fx_image; | ||
2753 | |||
2754 | vcpu_load(vcpu); | ||
2755 | |||
2756 | memcpy(fxsave->st_space, fpu->fpr, 128); | ||
2757 | fxsave->cwd = fpu->fcw; | ||
2758 | fxsave->swd = fpu->fsw; | ||
2759 | fxsave->twd = fpu->ftwx; | ||
2760 | fxsave->fop = fpu->last_opcode; | ||
2761 | fxsave->rip = fpu->last_ip; | ||
2762 | fxsave->rdp = fpu->last_dp; | ||
2763 | memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space); | ||
2764 | |||
2765 | vcpu_put(vcpu); | ||
2766 | |||
2767 | return 0; | ||
2768 | } | ||
2769 | |||
2770 | static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, | ||
2771 | struct kvm_lapic_state *s) | ||
2772 | { | ||
2773 | vcpu_load(vcpu); | ||
2774 | memcpy(s->regs, vcpu->apic->regs, sizeof *s); | ||
2775 | vcpu_put(vcpu); | ||
2776 | |||
2777 | return 0; | ||
2778 | } | ||
2779 | |||
2780 | static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu, | ||
2781 | struct kvm_lapic_state *s) | ||
2782 | { | ||
2783 | vcpu_load(vcpu); | ||
2784 | memcpy(vcpu->apic->regs, s->regs, sizeof *s); | ||
2785 | kvm_apic_post_state_restore(vcpu); | ||
2786 | vcpu_put(vcpu); | ||
2787 | |||
2788 | return 0; | ||
2789 | } | ||
2790 | |||
2791 | static long kvm_vcpu_ioctl(struct file *filp, | ||
2792 | unsigned int ioctl, unsigned long arg) | ||
2793 | { | ||
2794 | struct kvm_vcpu *vcpu = filp->private_data; | ||
2795 | void __user *argp = (void __user *)arg; | ||
2796 | int r = -EINVAL; | ||
2797 | |||
2798 | switch (ioctl) { | ||
2799 | case KVM_RUN: | ||
2800 | r = -EINVAL; | ||
2801 | if (arg) | ||
2802 | goto out; | ||
2803 | r = kvm_vcpu_ioctl_run(vcpu, vcpu->run); | ||
2804 | break; | ||
2805 | case KVM_GET_REGS: { | ||
2806 | struct kvm_regs kvm_regs; | ||
2807 | |||
2808 | memset(&kvm_regs, 0, sizeof kvm_regs); | ||
2809 | r = kvm_vcpu_ioctl_get_regs(vcpu, &kvm_regs); | ||
2810 | if (r) | ||
2811 | goto out; | ||
2812 | r = -EFAULT; | ||
2813 | if (copy_to_user(argp, &kvm_regs, sizeof kvm_regs)) | ||
2814 | goto out; | ||
2815 | r = 0; | ||
2816 | break; | ||
2817 | } | ||
2818 | case KVM_SET_REGS: { | ||
2819 | struct kvm_regs kvm_regs; | ||
2820 | |||
2821 | r = -EFAULT; | ||
2822 | if (copy_from_user(&kvm_regs, argp, sizeof kvm_regs)) | ||
2823 | goto out; | ||
2824 | r = kvm_vcpu_ioctl_set_regs(vcpu, &kvm_regs); | ||
2825 | if (r) | ||
2826 | goto out; | ||
2827 | r = 0; | ||
2828 | break; | ||
2829 | } | ||
2830 | case KVM_GET_SREGS: { | ||
2831 | struct kvm_sregs kvm_sregs; | ||
2832 | |||
2833 | memset(&kvm_sregs, 0, sizeof kvm_sregs); | ||
2834 | r = kvm_vcpu_ioctl_get_sregs(vcpu, &kvm_sregs); | ||
2835 | if (r) | ||
2836 | goto out; | ||
2837 | r = -EFAULT; | ||
2838 | if (copy_to_user(argp, &kvm_sregs, sizeof kvm_sregs)) | ||
2839 | goto out; | ||
2840 | r = 0; | ||
2841 | break; | ||
2842 | } | ||
2843 | case KVM_SET_SREGS: { | ||
2844 | struct kvm_sregs kvm_sregs; | ||
2845 | |||
2846 | r = -EFAULT; | ||
2847 | if (copy_from_user(&kvm_sregs, argp, sizeof kvm_sregs)) | ||
2848 | goto out; | ||
2849 | r = kvm_vcpu_ioctl_set_sregs(vcpu, &kvm_sregs); | ||
2850 | if (r) | ||
2851 | goto out; | ||
2852 | r = 0; | ||
2853 | break; | ||
2854 | } | ||
2855 | case KVM_TRANSLATE: { | ||
2856 | struct kvm_translation tr; | ||
2857 | |||
2858 | r = -EFAULT; | ||
2859 | if (copy_from_user(&tr, argp, sizeof tr)) | ||
2860 | goto out; | ||
2861 | r = kvm_vcpu_ioctl_translate(vcpu, &tr); | ||
2862 | if (r) | ||
2863 | goto out; | ||
2864 | r = -EFAULT; | ||
2865 | if (copy_to_user(argp, &tr, sizeof tr)) | ||
2866 | goto out; | ||
2867 | r = 0; | ||
2868 | break; | ||
2869 | } | ||
2870 | case KVM_INTERRUPT: { | ||
2871 | struct kvm_interrupt irq; | ||
2872 | |||
2873 | r = -EFAULT; | ||
2874 | if (copy_from_user(&irq, argp, sizeof irq)) | ||
2875 | goto out; | ||
2876 | r = kvm_vcpu_ioctl_interrupt(vcpu, &irq); | ||
2877 | if (r) | ||
2878 | goto out; | ||
2879 | r = 0; | ||
2880 | break; | ||
2881 | } | ||
2882 | case KVM_DEBUG_GUEST: { | ||
2883 | struct kvm_debug_guest dbg; | ||
2884 | |||
2885 | r = -EFAULT; | ||
2886 | if (copy_from_user(&dbg, argp, sizeof dbg)) | ||
2887 | goto out; | ||
2888 | r = kvm_vcpu_ioctl_debug_guest(vcpu, &dbg); | ||
2889 | if (r) | ||
2890 | goto out; | ||
2891 | r = 0; | ||
2892 | break; | ||
2893 | } | ||
2894 | case KVM_GET_MSRS: | ||
2895 | r = msr_io(vcpu, argp, kvm_get_msr, 1); | ||
2896 | break; | ||
2897 | case KVM_SET_MSRS: | ||
2898 | r = msr_io(vcpu, argp, do_set_msr, 0); | ||
2899 | break; | ||
2900 | case KVM_SET_CPUID: { | ||
2901 | struct kvm_cpuid __user *cpuid_arg = argp; | ||
2902 | struct kvm_cpuid cpuid; | ||
2903 | |||
2904 | r = -EFAULT; | ||
2905 | if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) | ||
2906 | goto out; | ||
2907 | r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries); | ||
2908 | if (r) | ||
2909 | goto out; | ||
2910 | break; | ||
2911 | } | ||
2912 | case KVM_SET_SIGNAL_MASK: { | ||
2913 | struct kvm_signal_mask __user *sigmask_arg = argp; | ||
2914 | struct kvm_signal_mask kvm_sigmask; | ||
2915 | sigset_t sigset, *p; | ||
2916 | |||
2917 | p = NULL; | ||
2918 | if (argp) { | ||
2919 | r = -EFAULT; | ||
2920 | if (copy_from_user(&kvm_sigmask, argp, | ||
2921 | sizeof kvm_sigmask)) | ||
2922 | goto out; | ||
2923 | r = -EINVAL; | ||
2924 | if (kvm_sigmask.len != sizeof sigset) | ||
2925 | goto out; | ||
2926 | r = -EFAULT; | ||
2927 | if (copy_from_user(&sigset, sigmask_arg->sigset, | ||
2928 | sizeof sigset)) | ||
2929 | goto out; | ||
2930 | p = &sigset; | ||
2931 | } | ||
2932 | r = kvm_vcpu_ioctl_set_sigmask(vcpu, p); | ||
2933 | break; | ||
2934 | } | ||
2935 | case KVM_GET_FPU: { | ||
2936 | struct kvm_fpu fpu; | ||
2937 | |||
2938 | memset(&fpu, 0, sizeof fpu); | ||
2939 | r = kvm_vcpu_ioctl_get_fpu(vcpu, &fpu); | ||
2940 | if (r) | ||
2941 | goto out; | ||
2942 | r = -EFAULT; | ||
2943 | if (copy_to_user(argp, &fpu, sizeof fpu)) | ||
2944 | goto out; | ||
2945 | r = 0; | ||
2946 | break; | ||
2947 | } | ||
2948 | case KVM_SET_FPU: { | ||
2949 | struct kvm_fpu fpu; | ||
2950 | |||
2951 | r = -EFAULT; | ||
2952 | if (copy_from_user(&fpu, argp, sizeof fpu)) | ||
2953 | goto out; | ||
2954 | r = kvm_vcpu_ioctl_set_fpu(vcpu, &fpu); | ||
2955 | if (r) | ||
2956 | goto out; | ||
2957 | r = 0; | ||
2958 | break; | ||
2959 | } | ||
2960 | case KVM_GET_LAPIC: { | ||
2961 | struct kvm_lapic_state lapic; | ||
2962 | |||
2963 | memset(&lapic, 0, sizeof lapic); | ||
2964 | r = kvm_vcpu_ioctl_get_lapic(vcpu, &lapic); | ||
2965 | if (r) | ||
2966 | goto out; | ||
2967 | r = -EFAULT; | ||
2968 | if (copy_to_user(argp, &lapic, sizeof lapic)) | ||
2969 | goto out; | ||
2970 | r = 0; | ||
2971 | break; | ||
2972 | } | ||
2973 | case KVM_SET_LAPIC: { | ||
2974 | struct kvm_lapic_state lapic; | ||
2975 | |||
2976 | r = -EFAULT; | ||
2977 | if (copy_from_user(&lapic, argp, sizeof lapic)) | ||
2978 | goto out; | ||
2979 | r = kvm_vcpu_ioctl_set_lapic(vcpu, &lapic); | ||
2980 | if (r) | ||
2981 | goto out; | ||
2982 | r = 0; | ||
2983 | break; | ||
2984 | } | ||
2985 | default: | ||
2986 | ; | ||
2987 | } | ||
2988 | out: | ||
2989 | return r; | ||
2990 | } | ||
2991 | |||
2992 | static long kvm_vm_ioctl(struct file *filp, | ||
2993 | unsigned int ioctl, unsigned long arg) | ||
2994 | { | ||
2995 | struct kvm *kvm = filp->private_data; | ||
2996 | void __user *argp = (void __user *)arg; | ||
2997 | int r = -EINVAL; | ||
2998 | |||
2999 | switch (ioctl) { | ||
3000 | case KVM_CREATE_VCPU: | ||
3001 | r = kvm_vm_ioctl_create_vcpu(kvm, arg); | ||
3002 | if (r < 0) | ||
3003 | goto out; | ||
3004 | break; | ||
3005 | case KVM_SET_MEMORY_REGION: { | ||
3006 | struct kvm_memory_region kvm_mem; | ||
3007 | |||
3008 | r = -EFAULT; | ||
3009 | if (copy_from_user(&kvm_mem, argp, sizeof kvm_mem)) | ||
3010 | goto out; | ||
3011 | r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_mem); | ||
3012 | if (r) | ||
3013 | goto out; | ||
3014 | break; | ||
3015 | } | ||
3016 | case KVM_GET_DIRTY_LOG: { | ||
3017 | struct kvm_dirty_log log; | ||
3018 | |||
3019 | r = -EFAULT; | ||
3020 | if (copy_from_user(&log, argp, sizeof log)) | ||
3021 | goto out; | ||
3022 | r = kvm_vm_ioctl_get_dirty_log(kvm, &log); | ||
3023 | if (r) | ||
3024 | goto out; | ||
3025 | break; | ||
3026 | } | ||
3027 | case KVM_SET_MEMORY_ALIAS: { | ||
3028 | struct kvm_memory_alias alias; | ||
3029 | |||
3030 | r = -EFAULT; | ||
3031 | if (copy_from_user(&alias, argp, sizeof alias)) | ||
3032 | goto out; | ||
3033 | r = kvm_vm_ioctl_set_memory_alias(kvm, &alias); | ||
3034 | if (r) | ||
3035 | goto out; | ||
3036 | break; | ||
3037 | } | ||
3038 | case KVM_CREATE_IRQCHIP: | ||
3039 | r = -ENOMEM; | ||
3040 | kvm->vpic = kvm_create_pic(kvm); | ||
3041 | if (kvm->vpic) { | ||
3042 | r = kvm_ioapic_init(kvm); | ||
3043 | if (r) { | ||
3044 | kfree(kvm->vpic); | ||
3045 | kvm->vpic = NULL; | ||
3046 | goto out; | ||
3047 | } | ||
3048 | } | ||
3049 | else | ||
3050 | goto out; | ||
3051 | break; | ||
3052 | case KVM_IRQ_LINE: { | ||
3053 | struct kvm_irq_level irq_event; | ||
3054 | |||
3055 | r = -EFAULT; | ||
3056 | if (copy_from_user(&irq_event, argp, sizeof irq_event)) | ||
3057 | goto out; | ||
3058 | if (irqchip_in_kernel(kvm)) { | ||
3059 | mutex_lock(&kvm->lock); | ||
3060 | if (irq_event.irq < 16) | ||
3061 | kvm_pic_set_irq(pic_irqchip(kvm), | ||
3062 | irq_event.irq, | ||
3063 | irq_event.level); | ||
3064 | kvm_ioapic_set_irq(kvm->vioapic, | ||
3065 | irq_event.irq, | ||
3066 | irq_event.level); | ||
3067 | mutex_unlock(&kvm->lock); | ||
3068 | r = 0; | ||
3069 | } | ||
3070 | break; | ||
3071 | } | ||
3072 | case KVM_GET_IRQCHIP: { | ||
3073 | /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ | ||
3074 | struct kvm_irqchip chip; | ||
3075 | |||
3076 | r = -EFAULT; | ||
3077 | if (copy_from_user(&chip, argp, sizeof chip)) | ||
3078 | goto out; | ||
3079 | r = -ENXIO; | ||
3080 | if (!irqchip_in_kernel(kvm)) | ||
3081 | goto out; | ||
3082 | r = kvm_vm_ioctl_get_irqchip(kvm, &chip); | ||
3083 | if (r) | ||
3084 | goto out; | ||
3085 | r = -EFAULT; | ||
3086 | if (copy_to_user(argp, &chip, sizeof chip)) | ||
3087 | goto out; | ||
3088 | r = 0; | ||
3089 | break; | ||
3090 | } | ||
3091 | case KVM_SET_IRQCHIP: { | ||
3092 | /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ | ||
3093 | struct kvm_irqchip chip; | ||
3094 | |||
3095 | r = -EFAULT; | ||
3096 | if (copy_from_user(&chip, argp, sizeof chip)) | ||
3097 | goto out; | ||
3098 | r = -ENXIO; | ||
3099 | if (!irqchip_in_kernel(kvm)) | ||
3100 | goto out; | ||
3101 | r = kvm_vm_ioctl_set_irqchip(kvm, &chip); | ||
3102 | if (r) | ||
3103 | goto out; | ||
3104 | r = 0; | ||
3105 | break; | ||
3106 | } | ||
3107 | default: | ||
3108 | ; | ||
3109 | } | ||
3110 | out: | ||
3111 | return r; | ||
3112 | } | ||
3113 | |||
3114 | static struct page *kvm_vm_nopage(struct vm_area_struct *vma, | ||
3115 | unsigned long address, | ||
3116 | int *type) | ||
3117 | { | ||
3118 | struct kvm *kvm = vma->vm_file->private_data; | ||
3119 | unsigned long pgoff; | ||
3120 | struct page *page; | ||
3121 | |||
3122 | pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; | ||
3123 | page = gfn_to_page(kvm, pgoff); | ||
3124 | if (!page) | ||
3125 | return NOPAGE_SIGBUS; | ||
3126 | get_page(page); | ||
3127 | if (type != NULL) | ||
3128 | *type = VM_FAULT_MINOR; | ||
3129 | |||
3130 | return page; | ||
3131 | } | ||
3132 | |||
3133 | static struct vm_operations_struct kvm_vm_vm_ops = { | ||
3134 | .nopage = kvm_vm_nopage, | ||
3135 | }; | ||
3136 | |||
3137 | static int kvm_vm_mmap(struct file *file, struct vm_area_struct *vma) | ||
3138 | { | ||
3139 | vma->vm_ops = &kvm_vm_vm_ops; | ||
3140 | return 0; | ||
3141 | } | ||
3142 | |||
3143 | static struct file_operations kvm_vm_fops = { | ||
3144 | .release = kvm_vm_release, | ||
3145 | .unlocked_ioctl = kvm_vm_ioctl, | ||
3146 | .compat_ioctl = kvm_vm_ioctl, | ||
3147 | .mmap = kvm_vm_mmap, | ||
3148 | }; | ||
3149 | |||
3150 | static int kvm_dev_ioctl_create_vm(void) | ||
3151 | { | ||
3152 | int fd, r; | ||
3153 | struct inode *inode; | ||
3154 | struct file *file; | ||
3155 | struct kvm *kvm; | ||
3156 | |||
3157 | kvm = kvm_create_vm(); | ||
3158 | if (IS_ERR(kvm)) | ||
3159 | return PTR_ERR(kvm); | ||
3160 | r = anon_inode_getfd(&fd, &inode, &file, "kvm-vm", &kvm_vm_fops, kvm); | ||
3161 | if (r) { | ||
3162 | kvm_destroy_vm(kvm); | ||
3163 | return r; | ||
3164 | } | ||
3165 | |||
3166 | kvm->filp = file; | ||
3167 | |||
3168 | return fd; | ||
3169 | } | ||
3170 | |||
3171 | static long kvm_dev_ioctl(struct file *filp, | ||
3172 | unsigned int ioctl, unsigned long arg) | ||
3173 | { | ||
3174 | void __user *argp = (void __user *)arg; | ||
3175 | long r = -EINVAL; | ||
3176 | |||
3177 | switch (ioctl) { | ||
3178 | case KVM_GET_API_VERSION: | ||
3179 | r = -EINVAL; | ||
3180 | if (arg) | ||
3181 | goto out; | ||
3182 | r = KVM_API_VERSION; | ||
3183 | break; | ||
3184 | case KVM_CREATE_VM: | ||
3185 | r = -EINVAL; | ||
3186 | if (arg) | ||
3187 | goto out; | ||
3188 | r = kvm_dev_ioctl_create_vm(); | ||
3189 | break; | ||
3190 | case KVM_GET_MSR_INDEX_LIST: { | ||
3191 | struct kvm_msr_list __user *user_msr_list = argp; | ||
3192 | struct kvm_msr_list msr_list; | ||
3193 | unsigned n; | ||
3194 | |||
3195 | r = -EFAULT; | ||
3196 | if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list)) | ||
3197 | goto out; | ||
3198 | n = msr_list.nmsrs; | ||
3199 | msr_list.nmsrs = num_msrs_to_save + ARRAY_SIZE(emulated_msrs); | ||
3200 | if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list)) | ||
3201 | goto out; | ||
3202 | r = -E2BIG; | ||
3203 | if (n < num_msrs_to_save) | ||
3204 | goto out; | ||
3205 | r = -EFAULT; | ||
3206 | if (copy_to_user(user_msr_list->indices, &msrs_to_save, | ||
3207 | num_msrs_to_save * sizeof(u32))) | ||
3208 | goto out; | ||
3209 | if (copy_to_user(user_msr_list->indices | ||
3210 | + num_msrs_to_save, | ||
3211 | &emulated_msrs, | ||
3212 | ARRAY_SIZE(emulated_msrs) * sizeof(u32))) | ||
3213 | goto out; | ||
3214 | r = 0; | ||
3215 | break; | ||
3216 | } | ||
3217 | case KVM_CHECK_EXTENSION: { | ||
3218 | int ext = (long)argp; | ||
3219 | |||
3220 | switch (ext) { | ||
3221 | case KVM_CAP_IRQCHIP: | ||
3222 | case KVM_CAP_HLT: | ||
3223 | r = 1; | ||
3224 | break; | ||
3225 | default: | ||
3226 | r = 0; | ||
3227 | break; | ||
3228 | } | ||
3229 | break; | ||
3230 | } | ||
3231 | case KVM_GET_VCPU_MMAP_SIZE: | ||
3232 | r = -EINVAL; | ||
3233 | if (arg) | ||
3234 | goto out; | ||
3235 | r = 2 * PAGE_SIZE; | ||
3236 | break; | ||
3237 | default: | ||
3238 | ; | ||
3239 | } | ||
3240 | out: | ||
3241 | return r; | ||
3242 | } | ||
3243 | |||
3244 | static struct file_operations kvm_chardev_ops = { | ||
3245 | .unlocked_ioctl = kvm_dev_ioctl, | ||
3246 | .compat_ioctl = kvm_dev_ioctl, | ||
3247 | }; | ||
3248 | |||
3249 | static struct miscdevice kvm_dev = { | ||
3250 | KVM_MINOR, | ||
3251 | "kvm", | ||
3252 | &kvm_chardev_ops, | ||
3253 | }; | ||
3254 | |||
3255 | /* | ||
3256 | * Make sure that a cpu that is being hot-unplugged does not have any vcpus | ||
3257 | * cached on it. | ||
3258 | */ | ||
3259 | static void decache_vcpus_on_cpu(int cpu) | ||
3260 | { | ||
3261 | struct kvm *vm; | ||
3262 | struct kvm_vcpu *vcpu; | ||
3263 | int i; | ||
3264 | |||
3265 | spin_lock(&kvm_lock); | ||
3266 | list_for_each_entry(vm, &vm_list, vm_list) | ||
3267 | for (i = 0; i < KVM_MAX_VCPUS; ++i) { | ||
3268 | vcpu = vm->vcpus[i]; | ||
3269 | if (!vcpu) | ||
3270 | continue; | ||
3271 | /* | ||
3272 | * If the vcpu is locked, then it is running on some | ||
3273 | * other cpu and therefore it is not cached on the | ||
3274 | * cpu in question. | ||
3275 | * | ||
3276 | * If it's not locked, check the last cpu it executed | ||
3277 | * on. | ||
3278 | */ | ||
3279 | if (mutex_trylock(&vcpu->mutex)) { | ||
3280 | if (vcpu->cpu == cpu) { | ||
3281 | kvm_x86_ops->vcpu_decache(vcpu); | ||
3282 | vcpu->cpu = -1; | ||
3283 | } | ||
3284 | mutex_unlock(&vcpu->mutex); | ||
3285 | } | ||
3286 | } | ||
3287 | spin_unlock(&kvm_lock); | ||
3288 | } | ||
3289 | |||
3290 | static void hardware_enable(void *junk) | ||
3291 | { | ||
3292 | int cpu = raw_smp_processor_id(); | ||
3293 | |||
3294 | if (cpu_isset(cpu, cpus_hardware_enabled)) | ||
3295 | return; | ||
3296 | cpu_set(cpu, cpus_hardware_enabled); | ||
3297 | kvm_x86_ops->hardware_enable(NULL); | ||
3298 | } | ||
3299 | |||
3300 | static void hardware_disable(void *junk) | ||
3301 | { | ||
3302 | int cpu = raw_smp_processor_id(); | ||
3303 | |||
3304 | if (!cpu_isset(cpu, cpus_hardware_enabled)) | ||
3305 | return; | ||
3306 | cpu_clear(cpu, cpus_hardware_enabled); | ||
3307 | decache_vcpus_on_cpu(cpu); | ||
3308 | kvm_x86_ops->hardware_disable(NULL); | ||
3309 | } | ||
3310 | |||
3311 | static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val, | ||
3312 | void *v) | ||
3313 | { | ||
3314 | int cpu = (long)v; | ||
3315 | |||
3316 | switch (val) { | ||
3317 | case CPU_DYING: | ||
3318 | case CPU_DYING_FROZEN: | ||
3319 | printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n", | ||
3320 | cpu); | ||
3321 | hardware_disable(NULL); | ||
3322 | break; | ||
3323 | case CPU_UP_CANCELED: | ||
3324 | case CPU_UP_CANCELED_FROZEN: | ||
3325 | printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n", | ||
3326 | cpu); | ||
3327 | smp_call_function_single(cpu, hardware_disable, NULL, 0, 1); | ||
3328 | break; | ||
3329 | case CPU_ONLINE: | ||
3330 | case CPU_ONLINE_FROZEN: | ||
3331 | printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n", | ||
3332 | cpu); | ||
3333 | smp_call_function_single(cpu, hardware_enable, NULL, 0, 1); | ||
3334 | break; | ||
3335 | } | ||
3336 | return NOTIFY_OK; | ||
3337 | } | ||
3338 | |||
3339 | static int kvm_reboot(struct notifier_block *notifier, unsigned long val, | ||
3340 | void *v) | ||
3341 | { | ||
3342 | if (val == SYS_RESTART) { | ||
3343 | /* | ||
3344 | * Some (well, at least mine) BIOSes hang on reboot if | ||
3345 | * in vmx root mode. | ||
3346 | */ | ||
3347 | printk(KERN_INFO "kvm: exiting hardware virtualization\n"); | ||
3348 | on_each_cpu(hardware_disable, NULL, 0, 1); | ||
3349 | } | ||
3350 | return NOTIFY_OK; | ||
3351 | } | ||
3352 | |||
3353 | static struct notifier_block kvm_reboot_notifier = { | ||
3354 | .notifier_call = kvm_reboot, | ||
3355 | .priority = 0, | ||
3356 | }; | ||
3357 | |||
3358 | void kvm_io_bus_init(struct kvm_io_bus *bus) | ||
3359 | { | ||
3360 | memset(bus, 0, sizeof(*bus)); | ||
3361 | } | ||
3362 | |||
3363 | void kvm_io_bus_destroy(struct kvm_io_bus *bus) | ||
3364 | { | ||
3365 | int i; | ||
3366 | |||
3367 | for (i = 0; i < bus->dev_count; i++) { | ||
3368 | struct kvm_io_device *pos = bus->devs[i]; | ||
3369 | |||
3370 | kvm_iodevice_destructor(pos); | ||
3371 | } | ||
3372 | } | ||
3373 | |||
3374 | struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus, gpa_t addr) | ||
3375 | { | ||
3376 | int i; | ||
3377 | |||
3378 | for (i = 0; i < bus->dev_count; i++) { | ||
3379 | struct kvm_io_device *pos = bus->devs[i]; | ||
3380 | |||
3381 | if (pos->in_range(pos, addr)) | ||
3382 | return pos; | ||
3383 | } | ||
3384 | |||
3385 | return NULL; | ||
3386 | } | ||
3387 | |||
3388 | void kvm_io_bus_register_dev(struct kvm_io_bus *bus, struct kvm_io_device *dev) | ||
3389 | { | ||
3390 | BUG_ON(bus->dev_count > (NR_IOBUS_DEVS-1)); | ||
3391 | |||
3392 | bus->devs[bus->dev_count++] = dev; | ||
3393 | } | ||
3394 | |||
3395 | static struct notifier_block kvm_cpu_notifier = { | ||
3396 | .notifier_call = kvm_cpu_hotplug, | ||
3397 | .priority = 20, /* must be > scheduler priority */ | ||
3398 | }; | ||
3399 | |||
3400 | static u64 stat_get(void *_offset) | ||
3401 | { | ||
3402 | unsigned offset = (long)_offset; | ||
3403 | u64 total = 0; | ||
3404 | struct kvm *kvm; | ||
3405 | struct kvm_vcpu *vcpu; | ||
3406 | int i; | ||
3407 | |||
3408 | spin_lock(&kvm_lock); | ||
3409 | list_for_each_entry(kvm, &vm_list, vm_list) | ||
3410 | for (i = 0; i < KVM_MAX_VCPUS; ++i) { | ||
3411 | vcpu = kvm->vcpus[i]; | ||
3412 | if (vcpu) | ||
3413 | total += *(u32 *)((void *)vcpu + offset); | ||
3414 | } | ||
3415 | spin_unlock(&kvm_lock); | ||
3416 | return total; | ||
3417 | } | ||
3418 | |||
3419 | DEFINE_SIMPLE_ATTRIBUTE(stat_fops, stat_get, NULL, "%llu\n"); | ||
3420 | |||
3421 | static __init void kvm_init_debug(void) | ||
3422 | { | ||
3423 | struct kvm_stats_debugfs_item *p; | ||
3424 | |||
3425 | debugfs_dir = debugfs_create_dir("kvm", NULL); | ||
3426 | for (p = debugfs_entries; p->name; ++p) | ||
3427 | p->dentry = debugfs_create_file(p->name, 0444, debugfs_dir, | ||
3428 | (void *)(long)p->offset, | ||
3429 | &stat_fops); | ||
3430 | } | ||
3431 | |||
3432 | static void kvm_exit_debug(void) | ||
3433 | { | ||
3434 | struct kvm_stats_debugfs_item *p; | ||
3435 | |||
3436 | for (p = debugfs_entries; p->name; ++p) | ||
3437 | debugfs_remove(p->dentry); | ||
3438 | debugfs_remove(debugfs_dir); | ||
3439 | } | ||
3440 | |||
3441 | static int kvm_suspend(struct sys_device *dev, pm_message_t state) | ||
3442 | { | ||
3443 | hardware_disable(NULL); | ||
3444 | return 0; | ||
3445 | } | ||
3446 | |||
3447 | static int kvm_resume(struct sys_device *dev) | ||
3448 | { | ||
3449 | hardware_enable(NULL); | ||
3450 | return 0; | ||
3451 | } | ||
3452 | |||
3453 | static struct sysdev_class kvm_sysdev_class = { | ||
3454 | .name = "kvm", | ||
3455 | .suspend = kvm_suspend, | ||
3456 | .resume = kvm_resume, | ||
3457 | }; | ||
3458 | |||
3459 | static struct sys_device kvm_sysdev = { | ||
3460 | .id = 0, | ||
3461 | .cls = &kvm_sysdev_class, | ||
3462 | }; | ||
3463 | |||
3464 | hpa_t bad_page_address; | ||
3465 | |||
3466 | static inline | ||
3467 | struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn) | ||
3468 | { | ||
3469 | return container_of(pn, struct kvm_vcpu, preempt_notifier); | ||
3470 | } | ||
3471 | |||
3472 | static void kvm_sched_in(struct preempt_notifier *pn, int cpu) | ||
3473 | { | ||
3474 | struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn); | ||
3475 | |||
3476 | kvm_x86_ops->vcpu_load(vcpu, cpu); | ||
3477 | } | ||
3478 | |||
3479 | static void kvm_sched_out(struct preempt_notifier *pn, | ||
3480 | struct task_struct *next) | ||
3481 | { | ||
3482 | struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn); | ||
3483 | |||
3484 | kvm_x86_ops->vcpu_put(vcpu); | ||
3485 | } | ||
3486 | |||
3487 | int kvm_init_x86(struct kvm_x86_ops *ops, unsigned int vcpu_size, | ||
3488 | struct module *module) | ||
3489 | { | ||
3490 | int r; | ||
3491 | int cpu; | ||
3492 | |||
3493 | if (kvm_x86_ops) { | ||
3494 | printk(KERN_ERR "kvm: already loaded the other module\n"); | ||
3495 | return -EEXIST; | ||
3496 | } | ||
3497 | |||
3498 | if (!ops->cpu_has_kvm_support()) { | ||
3499 | printk(KERN_ERR "kvm: no hardware support\n"); | ||
3500 | return -EOPNOTSUPP; | ||
3501 | } | ||
3502 | if (ops->disabled_by_bios()) { | ||
3503 | printk(KERN_ERR "kvm: disabled by bios\n"); | ||
3504 | return -EOPNOTSUPP; | ||
3505 | } | ||
3506 | |||
3507 | kvm_x86_ops = ops; | ||
3508 | |||
3509 | r = kvm_x86_ops->hardware_setup(); | ||
3510 | if (r < 0) | ||
3511 | goto out; | ||
3512 | |||
3513 | for_each_online_cpu(cpu) { | ||
3514 | smp_call_function_single(cpu, | ||
3515 | kvm_x86_ops->check_processor_compatibility, | ||
3516 | &r, 0, 1); | ||
3517 | if (r < 0) | ||
3518 | goto out_free_0; | ||
3519 | } | ||
3520 | |||
3521 | on_each_cpu(hardware_enable, NULL, 0, 1); | ||
3522 | r = register_cpu_notifier(&kvm_cpu_notifier); | ||
3523 | if (r) | ||
3524 | goto out_free_1; | ||
3525 | register_reboot_notifier(&kvm_reboot_notifier); | ||
3526 | |||
3527 | r = sysdev_class_register(&kvm_sysdev_class); | ||
3528 | if (r) | ||
3529 | goto out_free_2; | ||
3530 | |||
3531 | r = sysdev_register(&kvm_sysdev); | ||
3532 | if (r) | ||
3533 | goto out_free_3; | ||
3534 | |||
3535 | /* A kmem cache lets us meet the alignment requirements of fx_save. */ | ||
3536 | kvm_vcpu_cache = kmem_cache_create("kvm_vcpu", vcpu_size, | ||
3537 | __alignof__(struct kvm_vcpu), 0, 0); | ||
3538 | if (!kvm_vcpu_cache) { | ||
3539 | r = -ENOMEM; | ||
3540 | goto out_free_4; | ||
3541 | } | ||
3542 | |||
3543 | kvm_chardev_ops.owner = module; | ||
3544 | |||
3545 | r = misc_register(&kvm_dev); | ||
3546 | if (r) { | ||
3547 | printk(KERN_ERR "kvm: misc device register failed\n"); | ||
3548 | goto out_free; | ||
3549 | } | ||
3550 | |||
3551 | kvm_preempt_ops.sched_in = kvm_sched_in; | ||
3552 | kvm_preempt_ops.sched_out = kvm_sched_out; | ||
3553 | |||
3554 | return r; | ||
3555 | |||
3556 | out_free: | ||
3557 | kmem_cache_destroy(kvm_vcpu_cache); | ||
3558 | out_free_4: | ||
3559 | sysdev_unregister(&kvm_sysdev); | ||
3560 | out_free_3: | ||
3561 | sysdev_class_unregister(&kvm_sysdev_class); | ||
3562 | out_free_2: | ||
3563 | unregister_reboot_notifier(&kvm_reboot_notifier); | ||
3564 | unregister_cpu_notifier(&kvm_cpu_notifier); | ||
3565 | out_free_1: | ||
3566 | on_each_cpu(hardware_disable, NULL, 0, 1); | ||
3567 | out_free_0: | ||
3568 | kvm_x86_ops->hardware_unsetup(); | ||
3569 | out: | ||
3570 | kvm_x86_ops = NULL; | ||
3571 | return r; | ||
3572 | } | ||
3573 | |||
3574 | void kvm_exit_x86(void) | ||
3575 | { | ||
3576 | misc_deregister(&kvm_dev); | ||
3577 | kmem_cache_destroy(kvm_vcpu_cache); | ||
3578 | sysdev_unregister(&kvm_sysdev); | ||
3579 | sysdev_class_unregister(&kvm_sysdev_class); | ||
3580 | unregister_reboot_notifier(&kvm_reboot_notifier); | ||
3581 | unregister_cpu_notifier(&kvm_cpu_notifier); | ||
3582 | on_each_cpu(hardware_disable, NULL, 0, 1); | ||
3583 | kvm_x86_ops->hardware_unsetup(); | ||
3584 | kvm_x86_ops = NULL; | ||
3585 | } | ||
3586 | |||
3587 | static __init int kvm_init(void) | ||
3588 | { | ||
3589 | static struct page *bad_page; | ||
3590 | int r; | ||
3591 | |||
3592 | r = kvm_mmu_module_init(); | ||
3593 | if (r) | ||
3594 | goto out4; | ||
3595 | |||
3596 | kvm_init_debug(); | ||
3597 | |||
3598 | kvm_init_msr_list(); | ||
3599 | |||
3600 | if ((bad_page = alloc_page(GFP_KERNEL)) == NULL) { | ||
3601 | r = -ENOMEM; | ||
3602 | goto out; | ||
3603 | } | ||
3604 | |||
3605 | bad_page_address = page_to_pfn(bad_page) << PAGE_SHIFT; | ||
3606 | memset(__va(bad_page_address), 0, PAGE_SIZE); | ||
3607 | |||
3608 | return 0; | ||
3609 | |||
3610 | out: | ||
3611 | kvm_exit_debug(); | ||
3612 | kvm_mmu_module_exit(); | ||
3613 | out4: | ||
3614 | return r; | ||
3615 | } | ||
3616 | |||
3617 | static __exit void kvm_exit(void) | ||
3618 | { | ||
3619 | kvm_exit_debug(); | ||
3620 | __free_page(pfn_to_page(bad_page_address >> PAGE_SHIFT)); | ||
3621 | kvm_mmu_module_exit(); | ||
3622 | } | ||
3623 | |||
3624 | module_init(kvm_init) | ||
3625 | module_exit(kvm_exit) | ||
3626 | |||
3627 | EXPORT_SYMBOL_GPL(kvm_init_x86); | ||
3628 | EXPORT_SYMBOL_GPL(kvm_exit_x86); | ||
diff --git a/drivers/kvm/kvm_svm.h b/drivers/kvm/kvm_svm.h deleted file mode 100644 index a0e415daef5b..000000000000 --- a/drivers/kvm/kvm_svm.h +++ /dev/null | |||
@@ -1,45 +0,0 @@ | |||
1 | #ifndef __KVM_SVM_H | ||
2 | #define __KVM_SVM_H | ||
3 | |||
4 | #include <linux/kernel.h> | ||
5 | #include <linux/types.h> | ||
6 | #include <linux/list.h> | ||
7 | #include <asm/msr.h> | ||
8 | |||
9 | #include "svm.h" | ||
10 | #include "kvm.h" | ||
11 | |||
12 | static const u32 host_save_user_msrs[] = { | ||
13 | #ifdef CONFIG_X86_64 | ||
14 | MSR_STAR, MSR_LSTAR, MSR_CSTAR, MSR_SYSCALL_MASK, MSR_KERNEL_GS_BASE, | ||
15 | MSR_FS_BASE, | ||
16 | #endif | ||
17 | MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, | ||
18 | }; | ||
19 | |||
20 | #define NR_HOST_SAVE_USER_MSRS ARRAY_SIZE(host_save_user_msrs) | ||
21 | #define NUM_DB_REGS 4 | ||
22 | |||
23 | struct kvm_vcpu; | ||
24 | |||
25 | struct vcpu_svm { | ||
26 | struct kvm_vcpu vcpu; | ||
27 | struct vmcb *vmcb; | ||
28 | unsigned long vmcb_pa; | ||
29 | struct svm_cpu_data *svm_data; | ||
30 | uint64_t asid_generation; | ||
31 | |||
32 | unsigned long db_regs[NUM_DB_REGS]; | ||
33 | |||
34 | u64 next_rip; | ||
35 | |||
36 | u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS]; | ||
37 | u64 host_gs_base; | ||
38 | unsigned long host_cr2; | ||
39 | unsigned long host_db_regs[NUM_DB_REGS]; | ||
40 | unsigned long host_dr6; | ||
41 | unsigned long host_dr7; | ||
42 | }; | ||
43 | |||
44 | #endif | ||
45 | |||
diff --git a/drivers/kvm/lapic.c b/drivers/kvm/lapic.c deleted file mode 100644 index 238fcad3cece..000000000000 --- a/drivers/kvm/lapic.c +++ /dev/null | |||
@@ -1,1080 +0,0 @@ | |||
1 | |||
2 | /* | ||
3 | * Local APIC virtualization | ||
4 | * | ||
5 | * Copyright (C) 2006 Qumranet, Inc. | ||
6 | * Copyright (C) 2007 Novell | ||
7 | * Copyright (C) 2007 Intel | ||
8 | * | ||
9 | * Authors: | ||
10 | * Dor Laor <dor.laor@qumranet.com> | ||
11 | * Gregory Haskins <ghaskins@novell.com> | ||
12 | * Yaozu (Eddie) Dong <eddie.dong@intel.com> | ||
13 | * | ||
14 | * Based on Xen 3.1 code, Copyright (c) 2004, Intel Corporation. | ||
15 | * | ||
16 | * This work is licensed under the terms of the GNU GPL, version 2. See | ||
17 | * the COPYING file in the top-level directory. | ||
18 | */ | ||
19 | |||
20 | #include "kvm.h" | ||
21 | #include <linux/kvm.h> | ||
22 | #include <linux/mm.h> | ||
23 | #include <linux/highmem.h> | ||
24 | #include <linux/smp.h> | ||
25 | #include <linux/hrtimer.h> | ||
26 | #include <linux/io.h> | ||
27 | #include <linux/module.h> | ||
28 | #include <asm/processor.h> | ||
29 | #include <asm/msr.h> | ||
30 | #include <asm/page.h> | ||
31 | #include <asm/current.h> | ||
32 | #include <asm/apicdef.h> | ||
33 | #include <asm/atomic.h> | ||
34 | #include <asm/div64.h> | ||
35 | #include "irq.h" | ||
36 | |||
37 | #define PRId64 "d" | ||
38 | #define PRIx64 "llx" | ||
39 | #define PRIu64 "u" | ||
40 | #define PRIo64 "o" | ||
41 | |||
42 | #define APIC_BUS_CYCLE_NS 1 | ||
43 | |||
44 | /* #define apic_debug(fmt,arg...) printk(KERN_WARNING fmt,##arg) */ | ||
45 | #define apic_debug(fmt, arg...) | ||
46 | |||
47 | #define APIC_LVT_NUM 6 | ||
48 | /* 14 is the version for Xeon and Pentium; see SDM section 8.4.8 */ | ||
49 | #define APIC_VERSION (0x14UL | ((APIC_LVT_NUM - 1) << 16)) | ||
50 | #define LAPIC_MMIO_LENGTH (1 << 12) | ||
51 | /* the following defines are not in apicdef.h */ | ||
52 | #define APIC_SHORT_MASK 0xc0000 | ||
53 | #define APIC_DEST_NOSHORT 0x0 | ||
54 | #define APIC_DEST_MASK 0x800 | ||
55 | #define MAX_APIC_VECTOR 256 | ||
56 | |||
57 | #define VEC_POS(v) ((v) & (32 - 1)) | ||
58 | #define REG_POS(v) (((v) >> 5) << 4) | ||
59 | static inline u32 apic_get_reg(struct kvm_lapic *apic, int reg_off) | ||
60 | { | ||
61 | return *((u32 *) (apic->regs + reg_off)); | ||
62 | } | ||
63 | |||
64 | static inline void apic_set_reg(struct kvm_lapic *apic, int reg_off, u32 val) | ||
65 | { | ||
66 | *((u32 *) (apic->regs + reg_off)) = val; | ||
67 | } | ||
68 | |||
69 | static inline int apic_test_and_set_vector(int vec, void *bitmap) | ||
70 | { | ||
71 | return test_and_set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); | ||
72 | } | ||
73 | |||
74 | static inline int apic_test_and_clear_vector(int vec, void *bitmap) | ||
75 | { | ||
76 | return test_and_clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); | ||
77 | } | ||
78 | |||
79 | static inline void apic_set_vector(int vec, void *bitmap) | ||
80 | { | ||
81 | set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); | ||
82 | } | ||
83 | |||
84 | static inline void apic_clear_vector(int vec, void *bitmap) | ||
85 | { | ||
86 | clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); | ||
87 | } | ||
88 | |||
89 | static inline int apic_hw_enabled(struct kvm_lapic *apic) | ||
90 | { | ||
91 | return (apic)->vcpu->apic_base & MSR_IA32_APICBASE_ENABLE; | ||
92 | } | ||
93 | |||
94 | static inline int apic_sw_enabled(struct kvm_lapic *apic) | ||
95 | { | ||
96 | return apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_APIC_ENABLED; | ||
97 | } | ||
98 | |||
99 | static inline int apic_enabled(struct kvm_lapic *apic) | ||
100 | { | ||
101 | return apic_sw_enabled(apic) && apic_hw_enabled(apic); | ||
102 | } | ||
103 | |||
104 | #define LVT_MASK \ | ||
105 | (APIC_LVT_MASKED | APIC_SEND_PENDING | APIC_VECTOR_MASK) | ||
106 | |||
107 | #define LINT_MASK \ | ||
108 | (LVT_MASK | APIC_MODE_MASK | APIC_INPUT_POLARITY | \ | ||
109 | APIC_LVT_REMOTE_IRR | APIC_LVT_LEVEL_TRIGGER) | ||
110 | |||
111 | static inline int kvm_apic_id(struct kvm_lapic *apic) | ||
112 | { | ||
113 | return (apic_get_reg(apic, APIC_ID) >> 24) & 0xff; | ||
114 | } | ||
115 | |||
116 | static inline int apic_lvt_enabled(struct kvm_lapic *apic, int lvt_type) | ||
117 | { | ||
118 | return !(apic_get_reg(apic, lvt_type) & APIC_LVT_MASKED); | ||
119 | } | ||
120 | |||
121 | static inline int apic_lvt_vector(struct kvm_lapic *apic, int lvt_type) | ||
122 | { | ||
123 | return apic_get_reg(apic, lvt_type) & APIC_VECTOR_MASK; | ||
124 | } | ||
125 | |||
126 | static inline int apic_lvtt_period(struct kvm_lapic *apic) | ||
127 | { | ||
128 | return apic_get_reg(apic, APIC_LVTT) & APIC_LVT_TIMER_PERIODIC; | ||
129 | } | ||
130 | |||
131 | static unsigned int apic_lvt_mask[APIC_LVT_NUM] = { | ||
132 | LVT_MASK | APIC_LVT_TIMER_PERIODIC, /* LVTT */ | ||
133 | LVT_MASK | APIC_MODE_MASK, /* LVTTHMR */ | ||
134 | LVT_MASK | APIC_MODE_MASK, /* LVTPC */ | ||
135 | LINT_MASK, LINT_MASK, /* LVT0-1 */ | ||
136 | LVT_MASK /* LVTERR */ | ||
137 | }; | ||
138 | |||
139 | static int find_highest_vector(void *bitmap) | ||
140 | { | ||
141 | u32 *word = bitmap; | ||
142 | int word_offset = MAX_APIC_VECTOR >> 5; | ||
143 | |||
144 | while ((word_offset != 0) && (word[(--word_offset) << 2] == 0)) | ||
145 | continue; | ||
146 | |||
147 | if (likely(!word_offset && !word[0])) | ||
148 | return -1; | ||
149 | else | ||
150 | return fls(word[word_offset << 2]) - 1 + (word_offset << 5); | ||
151 | } | ||
152 | |||
153 | static inline int apic_test_and_set_irr(int vec, struct kvm_lapic *apic) | ||
154 | { | ||
155 | return apic_test_and_set_vector(vec, apic->regs + APIC_IRR); | ||
156 | } | ||
157 | |||
158 | static inline void apic_clear_irr(int vec, struct kvm_lapic *apic) | ||
159 | { | ||
160 | apic_clear_vector(vec, apic->regs + APIC_IRR); | ||
161 | } | ||
162 | |||
163 | static inline int apic_find_highest_irr(struct kvm_lapic *apic) | ||
164 | { | ||
165 | int result; | ||
166 | |||
167 | result = find_highest_vector(apic->regs + APIC_IRR); | ||
168 | ASSERT(result == -1 || result >= 16); | ||
169 | |||
170 | return result; | ||
171 | } | ||
172 | |||
173 | int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu) | ||
174 | { | ||
175 | struct kvm_lapic *apic = (struct kvm_lapic *)vcpu->apic; | ||
176 | int highest_irr; | ||
177 | |||
178 | if (!apic) | ||
179 | return 0; | ||
180 | highest_irr = apic_find_highest_irr(apic); | ||
181 | |||
182 | return highest_irr; | ||
183 | } | ||
184 | EXPORT_SYMBOL_GPL(kvm_lapic_find_highest_irr); | ||
185 | |||
186 | int kvm_apic_set_irq(struct kvm_lapic *apic, u8 vec, u8 trig) | ||
187 | { | ||
188 | if (!apic_test_and_set_irr(vec, apic)) { | ||
189 | /* a new pending irq is set in IRR */ | ||
190 | if (trig) | ||
191 | apic_set_vector(vec, apic->regs + APIC_TMR); | ||
192 | else | ||
193 | apic_clear_vector(vec, apic->regs + APIC_TMR); | ||
194 | kvm_vcpu_kick(apic->vcpu); | ||
195 | return 1; | ||
196 | } | ||
197 | return 0; | ||
198 | } | ||
199 | |||
200 | static inline int apic_find_highest_isr(struct kvm_lapic *apic) | ||
201 | { | ||
202 | int result; | ||
203 | |||
204 | result = find_highest_vector(apic->regs + APIC_ISR); | ||
205 | ASSERT(result == -1 || result >= 16); | ||
206 | |||
207 | return result; | ||
208 | } | ||
209 | |||
210 | static void apic_update_ppr(struct kvm_lapic *apic) | ||
211 | { | ||
212 | u32 tpr, isrv, ppr; | ||
213 | int isr; | ||
214 | |||
215 | tpr = apic_get_reg(apic, APIC_TASKPRI); | ||
216 | isr = apic_find_highest_isr(apic); | ||
217 | isrv = (isr != -1) ? isr : 0; | ||
218 | |||
219 | if ((tpr & 0xf0) >= (isrv & 0xf0)) | ||
220 | ppr = tpr & 0xff; | ||
221 | else | ||
222 | ppr = isrv & 0xf0; | ||
223 | |||
224 | apic_debug("vlapic %p, ppr 0x%x, isr 0x%x, isrv 0x%x", | ||
225 | apic, ppr, isr, isrv); | ||
226 | |||
227 | apic_set_reg(apic, APIC_PROCPRI, ppr); | ||
228 | } | ||
229 | |||
230 | static void apic_set_tpr(struct kvm_lapic *apic, u32 tpr) | ||
231 | { | ||
232 | apic_set_reg(apic, APIC_TASKPRI, tpr); | ||
233 | apic_update_ppr(apic); | ||
234 | } | ||
235 | |||
236 | int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest) | ||
237 | { | ||
238 | return kvm_apic_id(apic) == dest; | ||
239 | } | ||
240 | |||
241 | int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda) | ||
242 | { | ||
243 | int result = 0; | ||
244 | u8 logical_id; | ||
245 | |||
246 | logical_id = GET_APIC_LOGICAL_ID(apic_get_reg(apic, APIC_LDR)); | ||
247 | |||
248 | switch (apic_get_reg(apic, APIC_DFR)) { | ||
249 | case APIC_DFR_FLAT: | ||
250 | if (logical_id & mda) | ||
251 | result = 1; | ||
252 | break; | ||
253 | case APIC_DFR_CLUSTER: | ||
254 | if (((logical_id >> 4) == (mda >> 0x4)) | ||
255 | && (logical_id & mda & 0xf)) | ||
256 | result = 1; | ||
257 | break; | ||
258 | default: | ||
259 | printk(KERN_WARNING "Bad DFR vcpu %d: %08x\n", | ||
260 | apic->vcpu->vcpu_id, apic_get_reg(apic, APIC_DFR)); | ||
261 | break; | ||
262 | } | ||
263 | |||
264 | return result; | ||
265 | } | ||
266 | |||
267 | static int apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, | ||
268 | int short_hand, int dest, int dest_mode) | ||
269 | { | ||
270 | int result = 0; | ||
271 | struct kvm_lapic *target = vcpu->apic; | ||
272 | |||
273 | apic_debug("target %p, source %p, dest 0x%x, " | ||
274 | "dest_mode 0x%x, short_hand 0x%x", | ||
275 | target, source, dest, dest_mode, short_hand); | ||
276 | |||
277 | ASSERT(target); | ||
278 | switch (short_hand) { | ||
279 | case APIC_DEST_NOSHORT: | ||
280 | if (dest_mode == 0) { | ||
281 | /* Physical mode. */ | ||
282 | if ((dest == 0xFF) || (dest == kvm_apic_id(target))) | ||
283 | result = 1; | ||
284 | } else | ||
285 | /* Logical mode. */ | ||
286 | result = kvm_apic_match_logical_addr(target, dest); | ||
287 | break; | ||
288 | case APIC_DEST_SELF: | ||
289 | if (target == source) | ||
290 | result = 1; | ||
291 | break; | ||
292 | case APIC_DEST_ALLINC: | ||
293 | result = 1; | ||
294 | break; | ||
295 | case APIC_DEST_ALLBUT: | ||
296 | if (target != source) | ||
297 | result = 1; | ||
298 | break; | ||
299 | default: | ||
300 | printk(KERN_WARNING "Bad dest shorthand value %x\n", | ||
301 | short_hand); | ||
302 | break; | ||
303 | } | ||
304 | |||
305 | return result; | ||
306 | } | ||
307 | |||
308 | /* | ||
309 | * Add a pending IRQ into lapic. | ||
310 | * Return 1 if successfully added and 0 if discarded. | ||
311 | */ | ||
312 | static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, | ||
313 | int vector, int level, int trig_mode) | ||
314 | { | ||
315 | int orig_irr, result = 0; | ||
316 | struct kvm_vcpu *vcpu = apic->vcpu; | ||
317 | |||
318 | switch (delivery_mode) { | ||
319 | case APIC_DM_FIXED: | ||
320 | case APIC_DM_LOWEST: | ||
321 | /* FIXME add logic for vcpu on reset */ | ||
322 | if (unlikely(!apic_enabled(apic))) | ||
323 | break; | ||
324 | |||
325 | orig_irr = apic_test_and_set_irr(vector, apic); | ||
326 | if (orig_irr && trig_mode) { | ||
327 | apic_debug("level trig mode repeatedly for vector %d", | ||
328 | vector); | ||
329 | break; | ||
330 | } | ||
331 | |||
332 | if (trig_mode) { | ||
333 | apic_debug("level trig mode for vector %d", vector); | ||
334 | apic_set_vector(vector, apic->regs + APIC_TMR); | ||
335 | } else | ||
336 | apic_clear_vector(vector, apic->regs + APIC_TMR); | ||
337 | |||
338 | if (vcpu->mp_state == VCPU_MP_STATE_RUNNABLE) | ||
339 | kvm_vcpu_kick(vcpu); | ||
340 | else if (vcpu->mp_state == VCPU_MP_STATE_HALTED) { | ||
341 | vcpu->mp_state = VCPU_MP_STATE_RUNNABLE; | ||
342 | if (waitqueue_active(&vcpu->wq)) | ||
343 | wake_up_interruptible(&vcpu->wq); | ||
344 | } | ||
345 | |||
346 | result = (orig_irr == 0); | ||
347 | break; | ||
348 | |||
349 | case APIC_DM_REMRD: | ||
350 | printk(KERN_DEBUG "Ignoring delivery mode 3\n"); | ||
351 | break; | ||
352 | |||
353 | case APIC_DM_SMI: | ||
354 | printk(KERN_DEBUG "Ignoring guest SMI\n"); | ||
355 | break; | ||
356 | case APIC_DM_NMI: | ||
357 | printk(KERN_DEBUG "Ignoring guest NMI\n"); | ||
358 | break; | ||
359 | |||
360 | case APIC_DM_INIT: | ||
361 | if (level) { | ||
362 | if (vcpu->mp_state == VCPU_MP_STATE_RUNNABLE) | ||
363 | printk(KERN_DEBUG | ||
364 | "INIT on a runnable vcpu %d\n", | ||
365 | vcpu->vcpu_id); | ||
366 | vcpu->mp_state = VCPU_MP_STATE_INIT_RECEIVED; | ||
367 | kvm_vcpu_kick(vcpu); | ||
368 | } else { | ||
369 | printk(KERN_DEBUG | ||
370 | "Ignoring de-assert INIT to vcpu %d\n", | ||
371 | vcpu->vcpu_id); | ||
372 | } | ||
373 | |||
374 | break; | ||
375 | |||
376 | case APIC_DM_STARTUP: | ||
377 | printk(KERN_DEBUG "SIPI to vcpu %d vector 0x%02x\n", | ||
378 | vcpu->vcpu_id, vector); | ||
379 | if (vcpu->mp_state == VCPU_MP_STATE_INIT_RECEIVED) { | ||
380 | vcpu->sipi_vector = vector; | ||
381 | vcpu->mp_state = VCPU_MP_STATE_SIPI_RECEIVED; | ||
382 | if (waitqueue_active(&vcpu->wq)) | ||
383 | wake_up_interruptible(&vcpu->wq); | ||
384 | } | ||
385 | break; | ||
386 | |||
387 | default: | ||
388 | printk(KERN_ERR "TODO: unsupported delivery mode %x\n", | ||
389 | delivery_mode); | ||
390 | break; | ||
391 | } | ||
392 | return result; | ||
393 | } | ||
394 | |||
395 | struct kvm_lapic *kvm_apic_round_robin(struct kvm *kvm, u8 vector, | ||
396 | unsigned long bitmap) | ||
397 | { | ||
398 | int vcpu_id; | ||
399 | int last; | ||
400 | int next; | ||
401 | struct kvm_lapic *apic; | ||
402 | |||
403 | last = kvm->round_robin_prev_vcpu; | ||
404 | next = last; | ||
405 | |||
406 | do { | ||
407 | if (++next == KVM_MAX_VCPUS) | ||
408 | next = 0; | ||
409 | if (kvm->vcpus[next] == NULL || !test_bit(next, &bitmap)) | ||
410 | continue; | ||
411 | apic = kvm->vcpus[next]->apic; | ||
412 | if (apic && apic_enabled(apic)) | ||
413 | break; | ||
414 | apic = NULL; | ||
415 | } while (next != last); | ||
416 | kvm->round_robin_prev_vcpu = next; | ||
417 | |||
418 | if (!apic) { | ||
419 | vcpu_id = ffs(bitmap) - 1; | ||
420 | if (vcpu_id < 0) { | ||
421 | vcpu_id = 0; | ||
422 | printk(KERN_DEBUG "vcpu not ready for apic_round_robin\n"); | ||
423 | } | ||
424 | apic = kvm->vcpus[vcpu_id]->apic; | ||
425 | } | ||
426 | |||
427 | return apic; | ||
428 | } | ||
429 | |||
430 | static void apic_set_eoi(struct kvm_lapic *apic) | ||
431 | { | ||
432 | int vector = apic_find_highest_isr(apic); | ||
433 | |||
434 | /* | ||
435 | * Not every write to EOI has a corresponding bit set in the ISR; | ||
436 | * one example is when the kernel checks the timer in setup_IO_APIC. | ||
437 | */ | ||
438 | if (vector == -1) | ||
439 | return; | ||
440 | |||
441 | apic_clear_vector(vector, apic->regs + APIC_ISR); | ||
442 | apic_update_ppr(apic); | ||
443 | |||
444 | if (apic_test_and_clear_vector(vector, apic->regs + APIC_TMR)) | ||
445 | kvm_ioapic_update_eoi(apic->vcpu->kvm, vector); | ||
446 | } | ||
447 | |||
448 | static void apic_send_ipi(struct kvm_lapic *apic) | ||
449 | { | ||
450 | u32 icr_low = apic_get_reg(apic, APIC_ICR); | ||
451 | u32 icr_high = apic_get_reg(apic, APIC_ICR2); | ||
452 | |||
453 | unsigned int dest = GET_APIC_DEST_FIELD(icr_high); | ||
454 | unsigned int short_hand = icr_low & APIC_SHORT_MASK; | ||
455 | unsigned int trig_mode = icr_low & APIC_INT_LEVELTRIG; | ||
456 | unsigned int level = icr_low & APIC_INT_ASSERT; | ||
457 | unsigned int dest_mode = icr_low & APIC_DEST_MASK; | ||
458 | unsigned int delivery_mode = icr_low & APIC_MODE_MASK; | ||
459 | unsigned int vector = icr_low & APIC_VECTOR_MASK; | ||
460 | |||
461 | struct kvm_lapic *target; | ||
462 | struct kvm_vcpu *vcpu; | ||
463 | unsigned long lpr_map = 0; | ||
464 | int i; | ||
465 | |||
466 | apic_debug("icr_high 0x%x, icr_low 0x%x, " | ||
467 | "short_hand 0x%x, dest 0x%x, trig_mode 0x%x, level 0x%x, " | ||
468 | "dest_mode 0x%x, delivery_mode 0x%x, vector 0x%x\n", | ||
469 | icr_high, icr_low, short_hand, dest, | ||
470 | trig_mode, level, dest_mode, delivery_mode, vector); | ||
471 | |||
472 | for (i = 0; i < KVM_MAX_VCPUS; i++) { | ||
473 | vcpu = apic->vcpu->kvm->vcpus[i]; | ||
474 | if (!vcpu) | ||
475 | continue; | ||
476 | |||
477 | if (vcpu->apic && | ||
478 | apic_match_dest(vcpu, apic, short_hand, dest, dest_mode)) { | ||
479 | if (delivery_mode == APIC_DM_LOWEST) | ||
480 | set_bit(vcpu->vcpu_id, &lpr_map); | ||
481 | else | ||
482 | __apic_accept_irq(vcpu->apic, delivery_mode, | ||
483 | vector, level, trig_mode); | ||
484 | } | ||
485 | } | ||
486 | |||
487 | if (delivery_mode == APIC_DM_LOWEST) { | ||
488 | target = kvm_apic_round_robin(apic->vcpu->kvm, vector, lpr_map); | ||
489 | if (target != NULL) | ||
490 | __apic_accept_irq(target, delivery_mode, | ||
491 | vector, level, trig_mode); | ||
492 | } | ||
493 | } | ||
494 | |||
495 | static u32 apic_get_tmcct(struct kvm_lapic *apic) | ||
496 | { | ||
497 | u64 counter_passed; | ||
498 | ktime_t passed, now; | ||
499 | u32 tmcct; | ||
500 | |||
501 | ASSERT(apic != NULL); | ||
502 | |||
503 | now = apic->timer.dev.base->get_time(); | ||
504 | tmcct = apic_get_reg(apic, APIC_TMICT); | ||
505 | |||
506 | /* if initial count is 0, current count should also be 0 */ | ||
507 | if (tmcct == 0) | ||
508 | return 0; | ||
509 | |||
510 | if (unlikely(ktime_to_ns(now) <= | ||
511 | ktime_to_ns(apic->timer.last_update))) { | ||
512 | /* Wrap around */ | ||
513 | passed = ktime_add(( { | ||
514 | (ktime_t) { | ||
515 | .tv64 = KTIME_MAX - | ||
516 | (apic->timer.last_update).tv64}; } | ||
517 | ), now); | ||
518 | apic_debug("time elapsed\n"); | ||
519 | } else | ||
520 | passed = ktime_sub(now, apic->timer.last_update); | ||
521 | |||
522 | counter_passed = div64_64(ktime_to_ns(passed), | ||
523 | (APIC_BUS_CYCLE_NS * apic->timer.divide_count)); | ||
524 | |||
525 | if (counter_passed > tmcct) { | ||
526 | if (unlikely(!apic_lvtt_period(apic))) { | ||
527 | /* one-shot timers stick at 0 until reset */ | ||
528 | tmcct = 0; | ||
529 | } else { | ||
530 | /* | ||
531 | * periodic timers reload from APIC_TMICT when they | ||
532 | * hit 0. The while loop simulates this happening N | ||
533 | * times. (counter_passed %= tmcct) would also work, | ||
534 | * but a 64-bit modulo may be slower or unavailable on 32-bit. | ||
535 | */ | ||
536 | while (counter_passed > tmcct) | ||
537 | counter_passed -= tmcct; | ||
538 | tmcct -= counter_passed; | ||
539 | } | ||
540 | } else { | ||
541 | tmcct -= counter_passed; | ||
542 | } | ||
543 | |||
544 | return tmcct; | ||
545 | } | ||
546 | |||
547 | static u32 __apic_read(struct kvm_lapic *apic, unsigned int offset) | ||
548 | { | ||
549 | u32 val = 0; | ||
550 | |||
551 | if (offset >= LAPIC_MMIO_LENGTH) | ||
552 | return 0; | ||
553 | |||
554 | switch (offset) { | ||
555 | case APIC_ARBPRI: | ||
556 | printk(KERN_WARNING "Access APIC ARBPRI register " | ||
557 | "which is for P6\n"); | ||
558 | break; | ||
559 | |||
560 | case APIC_TMCCT: /* Timer CCR */ | ||
561 | val = apic_get_tmcct(apic); | ||
562 | break; | ||
563 | |||
564 | default: | ||
565 | apic_update_ppr(apic); | ||
566 | val = apic_get_reg(apic, offset); | ||
567 | break; | ||
568 | } | ||
569 | |||
570 | return val; | ||
571 | } | ||
572 | |||
573 | static void apic_mmio_read(struct kvm_io_device *this, | ||
574 | gpa_t address, int len, void *data) | ||
575 | { | ||
576 | struct kvm_lapic *apic = (struct kvm_lapic *)this->private; | ||
577 | unsigned int offset = address - apic->base_address; | ||
578 | unsigned char alignment = offset & 0xf; | ||
579 | u32 result; | ||
580 | |||
581 | if ((alignment + len) > 4) { | ||
582 | printk(KERN_ERR "KVM_APIC_READ: alignment error %lx %d", | ||
583 | (unsigned long)address, len); | ||
584 | return; | ||
585 | } | ||
586 | result = __apic_read(apic, offset & ~0xf); | ||
587 | |||
588 | switch (len) { | ||
589 | case 1: | ||
590 | case 2: | ||
591 | case 4: | ||
592 | memcpy(data, (char *)&result + alignment, len); | ||
593 | break; | ||
594 | default: | ||
595 | printk(KERN_ERR "Local APIC read with len = %x, " | ||
596 | "should be 1,2, or 4 instead\n", len); | ||
597 | break; | ||
598 | } | ||
599 | } | ||
600 | |||
601 | static void update_divide_count(struct kvm_lapic *apic) | ||
602 | { | ||
603 | u32 tmp1, tmp2, tdcr; | ||
604 | |||
605 | tdcr = apic_get_reg(apic, APIC_TDCR); | ||
606 | tmp1 = tdcr & 0xf; | ||
607 | tmp2 = ((tmp1 & 0x3) | ((tmp1 & 0x8) >> 1)) + 1; | ||
608 | apic->timer.divide_count = 0x1 << (tmp2 & 0x7); | ||
609 | |||
610 | apic_debug("timer divide count is 0x%x\n", | ||
611 | apic->timer.divide_count); | ||
612 | } | ||
613 | |||
614 | static void start_apic_timer(struct kvm_lapic *apic) | ||
615 | { | ||
616 | ktime_t now = apic->timer.dev.base->get_time(); | ||
617 | |||
618 | apic->timer.last_update = now; | ||
619 | |||
620 | apic->timer.period = apic_get_reg(apic, APIC_TMICT) * | ||
621 | APIC_BUS_CYCLE_NS * apic->timer.divide_count; | ||
622 | atomic_set(&apic->timer.pending, 0); | ||
623 | hrtimer_start(&apic->timer.dev, | ||
624 | ktime_add_ns(now, apic->timer.period), | ||
625 | HRTIMER_MODE_ABS); | ||
626 | |||
627 | apic_debug("%s: bus cycle is %" PRId64 "ns, now 0x%016" | ||
628 | PRIx64 ", " | ||
629 | "timer initial count 0x%x, period %lldns, " | ||
630 | "expire @ 0x%016" PRIx64 ".\n", __FUNCTION__, | ||
631 | APIC_BUS_CYCLE_NS, ktime_to_ns(now), | ||
632 | apic_get_reg(apic, APIC_TMICT), | ||
633 | apic->timer.period, | ||
634 | ktime_to_ns(ktime_add_ns(now, | ||
635 | apic->timer.period))); | ||
636 | } | ||
637 | |||
638 | static void apic_mmio_write(struct kvm_io_device *this, | ||
639 | gpa_t address, int len, const void *data) | ||
640 | { | ||
641 | struct kvm_lapic *apic = (struct kvm_lapic *)this->private; | ||
642 | unsigned int offset = address - apic->base_address; | ||
643 | unsigned char alignment = offset & 0xf; | ||
644 | u32 val; | ||
645 | |||
646 | /* | ||
647 | * APIC registers must be aligned on a 128-bit boundary. | ||
648 | * 32/64/128-bit registers must be accessed through 32-bit accesses. | ||
649 | * Refer to SDM 8.4.1. | ||
650 | */ | ||
651 | if (len != 4 || alignment) { | ||
652 | if (printk_ratelimit()) | ||
653 | printk(KERN_ERR "apic write: bad size=%d %lx\n", | ||
654 | len, (long)address); | ||
655 | return; | ||
656 | } | ||
657 | |||
658 | val = *(u32 *) data; | ||
659 | |||
660 | /* EOI writes are too common to be worth printing */ | ||
661 | if (offset != APIC_EOI) | ||
662 | apic_debug("%s: offset 0x%x with length 0x%x, and value is " | ||
663 | "0x%x\n", __FUNCTION__, offset, len, val); | ||
664 | |||
665 | offset &= 0xff0; | ||
666 | |||
667 | switch (offset) { | ||
668 | case APIC_ID: /* Local APIC ID */ | ||
669 | apic_set_reg(apic, APIC_ID, val); | ||
670 | break; | ||
671 | |||
672 | case APIC_TASKPRI: | ||
673 | apic_set_tpr(apic, val & 0xff); | ||
674 | break; | ||
675 | |||
676 | case APIC_EOI: | ||
677 | apic_set_eoi(apic); | ||
678 | break; | ||
679 | |||
680 | case APIC_LDR: | ||
681 | apic_set_reg(apic, APIC_LDR, val & APIC_LDR_MASK); | ||
682 | break; | ||
683 | |||
684 | case APIC_DFR: | ||
685 | apic_set_reg(apic, APIC_DFR, val | 0x0FFFFFFF); | ||
686 | break; | ||
687 | |||
688 | case APIC_SPIV: | ||
689 | apic_set_reg(apic, APIC_SPIV, val & 0x3ff); | ||
690 | if (!(val & APIC_SPIV_APIC_ENABLED)) { | ||
691 | int i; | ||
692 | u32 lvt_val; | ||
693 | |||
694 | for (i = 0; i < APIC_LVT_NUM; i++) { | ||
695 | lvt_val = apic_get_reg(apic, | ||
696 | APIC_LVTT + 0x10 * i); | ||
697 | apic_set_reg(apic, APIC_LVTT + 0x10 * i, | ||
698 | lvt_val | APIC_LVT_MASKED); | ||
699 | } | ||
700 | atomic_set(&apic->timer.pending, 0); | ||
701 | |||
702 | } | ||
703 | break; | ||
704 | |||
705 | case APIC_ICR: | ||
706 | /* No delay here, so we always clear the pending bit */ | ||
707 | apic_set_reg(apic, APIC_ICR, val & ~(1 << 12)); | ||
708 | apic_send_ipi(apic); | ||
709 | break; | ||
710 | |||
711 | case APIC_ICR2: | ||
712 | apic_set_reg(apic, APIC_ICR2, val & 0xff000000); | ||
713 | break; | ||
714 | |||
715 | case APIC_LVTT: | ||
716 | case APIC_LVTTHMR: | ||
717 | case APIC_LVTPC: | ||
718 | case APIC_LVT0: | ||
719 | case APIC_LVT1: | ||
720 | case APIC_LVTERR: | ||
721 | /* TODO: Check vector */ | ||
722 | if (!apic_sw_enabled(apic)) | ||
723 | val |= APIC_LVT_MASKED; | ||
724 | |||
725 | val &= apic_lvt_mask[(offset - APIC_LVTT) >> 4]; | ||
726 | apic_set_reg(apic, offset, val); | ||
727 | |||
728 | break; | ||
729 | |||
730 | case APIC_TMICT: | ||
731 | hrtimer_cancel(&apic->timer.dev); | ||
732 | apic_set_reg(apic, APIC_TMICT, val); | ||
733 | start_apic_timer(apic); | ||
734 | return; | ||
735 | |||
736 | case APIC_TDCR: | ||
737 | if (val & 4) | ||
738 | printk(KERN_ERR "KVM_WRITE:TDCR %x\n", val); | ||
739 | apic_set_reg(apic, APIC_TDCR, val); | ||
740 | update_divide_count(apic); | ||
741 | break; | ||
742 | |||
743 | default: | ||
744 | apic_debug("Local APIC Write to read-only register %x\n", | ||
745 | offset); | ||
746 | break; | ||
747 | } | ||
748 | |||
749 | } | ||
750 | |||
751 | static int apic_mmio_range(struct kvm_io_device *this, gpa_t addr) | ||
752 | { | ||
753 | struct kvm_lapic *apic = (struct kvm_lapic *)this->private; | ||
754 | int ret = 0; | ||
755 | |||
756 | |||
757 | if (apic_hw_enabled(apic) && | ||
758 | (addr >= apic->base_address) && | ||
759 | (addr < (apic->base_address + LAPIC_MMIO_LENGTH))) | ||
760 | ret = 1; | ||
761 | |||
762 | return ret; | ||
763 | } | ||
764 | |||
765 | void kvm_free_apic(struct kvm_lapic *apic) | ||
766 | { | ||
767 | if (!apic) | ||
768 | return; | ||
769 | |||
770 | hrtimer_cancel(&apic->timer.dev); | ||
771 | |||
772 | if (apic->regs_page) { | ||
773 | __free_page(apic->regs_page); | ||
774 | apic->regs_page = NULL; | ||
775 | } | ||
776 | |||
777 | kfree(apic); | ||
778 | } | ||
779 | |||
780 | /* | ||
781 | *---------------------------------------------------------------------- | ||
782 | * LAPIC interface | ||
783 | *---------------------------------------------------------------------- | ||
784 | */ | ||
785 | |||
786 | void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8) | ||
787 | { | ||
788 | struct kvm_lapic *apic = (struct kvm_lapic *)vcpu->apic; | ||
789 | |||
790 | if (!apic) | ||
791 | return; | ||
792 | apic_set_tpr(apic, ((cr8 & 0x0f) << 4)); | ||
793 | } | ||
794 | |||
795 | u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu) | ||
796 | { | ||
797 | struct kvm_lapic *apic = (struct kvm_lapic *)vcpu->apic; | ||
798 | u64 tpr; | ||
799 | |||
800 | if (!apic) | ||
801 | return 0; | ||
802 | tpr = (u64) apic_get_reg(apic, APIC_TASKPRI); | ||
803 | |||
804 | return (tpr & 0xf0) >> 4; | ||
805 | } | ||
806 | EXPORT_SYMBOL_GPL(kvm_lapic_get_cr8); | ||
807 | |||
808 | void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) | ||
809 | { | ||
810 | struct kvm_lapic *apic = (struct kvm_lapic *)vcpu->apic; | ||
811 | |||
812 | if (!apic) { | ||
813 | value |= MSR_IA32_APICBASE_BSP; | ||
814 | vcpu->apic_base = value; | ||
815 | return; | ||
816 | } | ||
817 | if (apic->vcpu->vcpu_id) | ||
818 | value &= ~MSR_IA32_APICBASE_BSP; | ||
819 | |||
820 | vcpu->apic_base = value; | ||
821 | apic->base_address = apic->vcpu->apic_base & | ||
822 | MSR_IA32_APICBASE_BASE; | ||
823 | |||
824 | /* with FSB interrupt delivery, APIC functionality can be restarted */ | ||
825 | apic_debug("apic base msr is 0x%016" PRIx64 ", and base address is " | ||
826 | "0x%lx.\n", apic->apic_base, apic->base_address); | ||
827 | |||
828 | } | ||
829 | |||
830 | u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu) | ||
831 | { | ||
832 | return vcpu->apic_base; | ||
833 | } | ||
834 | EXPORT_SYMBOL_GPL(kvm_lapic_get_base); | ||
835 | |||
836 | void kvm_lapic_reset(struct kvm_vcpu *vcpu) | ||
837 | { | ||
838 | struct kvm_lapic *apic; | ||
839 | int i; | ||
840 | |||
841 | apic_debug("%s\n", __FUNCTION__); | ||
842 | |||
843 | ASSERT(vcpu); | ||
844 | apic = vcpu->apic; | ||
845 | ASSERT(apic != NULL); | ||
846 | |||
847 | /* Stop the timer in case it's a reset to an active apic */ | ||
848 | hrtimer_cancel(&apic->timer.dev); | ||
849 | |||
850 | apic_set_reg(apic, APIC_ID, vcpu->vcpu_id << 24); | ||
851 | apic_set_reg(apic, APIC_LVR, APIC_VERSION); | ||
852 | |||
853 | for (i = 0; i < APIC_LVT_NUM; i++) | ||
854 | apic_set_reg(apic, APIC_LVTT + 0x10 * i, APIC_LVT_MASKED); | ||
855 | apic_set_reg(apic, APIC_LVT0, | ||
856 | SET_APIC_DELIVERY_MODE(0, APIC_MODE_EXTINT)); | ||
857 | |||
858 | apic_set_reg(apic, APIC_DFR, 0xffffffffU); | ||
859 | apic_set_reg(apic, APIC_SPIV, 0xff); | ||
860 | apic_set_reg(apic, APIC_TASKPRI, 0); | ||
861 | apic_set_reg(apic, APIC_LDR, 0); | ||
862 | apic_set_reg(apic, APIC_ESR, 0); | ||
863 | apic_set_reg(apic, APIC_ICR, 0); | ||
864 | apic_set_reg(apic, APIC_ICR2, 0); | ||
865 | apic_set_reg(apic, APIC_TDCR, 0); | ||
866 | apic_set_reg(apic, APIC_TMICT, 0); | ||
867 | for (i = 0; i < 8; i++) { | ||
868 | apic_set_reg(apic, APIC_IRR + 0x10 * i, 0); | ||
869 | apic_set_reg(apic, APIC_ISR + 0x10 * i, 0); | ||
870 | apic_set_reg(apic, APIC_TMR + 0x10 * i, 0); | ||
871 | } | ||
872 | update_divide_count(apic); | ||
873 | atomic_set(&apic->timer.pending, 0); | ||
874 | if (vcpu->vcpu_id == 0) | ||
875 | vcpu->apic_base |= MSR_IA32_APICBASE_BSP; | ||
876 | apic_update_ppr(apic); | ||
877 | |||
878 | apic_debug(KERN_INFO "%s: vcpu=%p, id=%d, base_msr=" | ||
879 | "0x%016" PRIx64 ", base_address=0x%0lx.\n", __FUNCTION__, | ||
880 | vcpu, kvm_apic_id(apic), | ||
881 | vcpu->apic_base, apic->base_address); | ||
882 | } | ||
883 | EXPORT_SYMBOL_GPL(kvm_lapic_reset); | ||
884 | |||
885 | int kvm_lapic_enabled(struct kvm_vcpu *vcpu) | ||
886 | { | ||
887 | struct kvm_lapic *apic = (struct kvm_lapic *)vcpu->apic; | ||
888 | int ret = 0; | ||
889 | |||
890 | if (!apic) | ||
891 | return 0; | ||
892 | ret = apic_enabled(apic); | ||
893 | |||
894 | return ret; | ||
895 | } | ||
896 | EXPORT_SYMBOL_GPL(kvm_lapic_enabled); | ||
897 | |||
898 | /* | ||
899 | *---------------------------------------------------------------------- | ||
900 | * timer interface | ||
901 | *---------------------------------------------------------------------- | ||
902 | */ | ||
903 | |||
904 | /* TODO: make sure __apic_timer_fn runs in current pCPU */ | ||
905 | static int __apic_timer_fn(struct kvm_lapic *apic) | ||
906 | { | ||
907 | int result = 0; | ||
908 | wait_queue_head_t *q = &apic->vcpu->wq; | ||
909 | |||
910 | atomic_inc(&apic->timer.pending); | ||
911 | if (waitqueue_active(q)) | ||
912 | { | ||
913 | apic->vcpu->mp_state = VCPU_MP_STATE_RUNNABLE; | ||
914 | wake_up_interruptible(q); | ||
915 | } | ||
916 | if (apic_lvtt_period(apic)) { | ||
917 | result = 1; | ||
918 | apic->timer.dev.expires = ktime_add_ns( | ||
919 | apic->timer.dev.expires, | ||
920 | apic->timer.period); | ||
921 | } | ||
922 | return result; | ||
923 | } | ||
924 | |||
925 | static int __inject_apic_timer_irq(struct kvm_lapic *apic) | ||
926 | { | ||
927 | int vector; | ||
928 | |||
929 | vector = apic_lvt_vector(apic, APIC_LVTT); | ||
930 | return __apic_accept_irq(apic, APIC_DM_FIXED, vector, 1, 0); | ||
931 | } | ||
932 | |||
933 | static enum hrtimer_restart apic_timer_fn(struct hrtimer *data) | ||
934 | { | ||
935 | struct kvm_lapic *apic; | ||
936 | int restart_timer = 0; | ||
937 | |||
938 | apic = container_of(data, struct kvm_lapic, timer.dev); | ||
939 | |||
940 | restart_timer = __apic_timer_fn(apic); | ||
941 | |||
942 | if (restart_timer) | ||
943 | return HRTIMER_RESTART; | ||
944 | else | ||
945 | return HRTIMER_NORESTART; | ||
946 | } | ||
947 | |||
948 | int kvm_create_lapic(struct kvm_vcpu *vcpu) | ||
949 | { | ||
950 | struct kvm_lapic *apic; | ||
951 | |||
952 | ASSERT(vcpu != NULL); | ||
953 | apic_debug("apic_init %d\n", vcpu->vcpu_id); | ||
954 | |||
955 | apic = kzalloc(sizeof(*apic), GFP_KERNEL); | ||
956 | if (!apic) | ||
957 | goto nomem; | ||
958 | |||
959 | vcpu->apic = apic; | ||
960 | |||
961 | apic->regs_page = alloc_page(GFP_KERNEL); | ||
962 | if (apic->regs_page == NULL) { | ||
963 | printk(KERN_ERR "malloc apic regs error for vcpu %x\n", | ||
964 | vcpu->vcpu_id); | ||
965 | goto nomem; | ||
966 | } | ||
967 | apic->regs = page_address(apic->regs_page); | ||
968 | memset(apic->regs, 0, PAGE_SIZE); | ||
969 | apic->vcpu = vcpu; | ||
970 | |||
971 | hrtimer_init(&apic->timer.dev, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); | ||
972 | apic->timer.dev.function = apic_timer_fn; | ||
973 | apic->base_address = APIC_DEFAULT_PHYS_BASE; | ||
974 | vcpu->apic_base = APIC_DEFAULT_PHYS_BASE; | ||
975 | |||
976 | kvm_lapic_reset(vcpu); | ||
977 | apic->dev.read = apic_mmio_read; | ||
978 | apic->dev.write = apic_mmio_write; | ||
979 | apic->dev.in_range = apic_mmio_range; | ||
980 | apic->dev.private = apic; | ||
981 | |||
982 | return 0; | ||
983 | nomem: | ||
984 | kvm_free_apic(apic); | ||
985 | return -ENOMEM; | ||
986 | } | ||
987 | EXPORT_SYMBOL_GPL(kvm_create_lapic); | ||
988 | |||
989 | int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu) | ||
990 | { | ||
991 | struct kvm_lapic *apic = vcpu->apic; | ||
992 | int highest_irr; | ||
993 | |||
994 | if (!apic || !apic_enabled(apic)) | ||
995 | return -1; | ||
996 | |||
997 | apic_update_ppr(apic); | ||
998 | highest_irr = apic_find_highest_irr(apic); | ||
999 | if ((highest_irr == -1) || | ||
1000 | ((highest_irr & 0xF0) <= apic_get_reg(apic, APIC_PROCPRI))) | ||
1001 | return -1; | ||
1002 | return highest_irr; | ||
1003 | } | ||
1004 | |||
1005 | int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu) | ||
1006 | { | ||
1007 | u32 lvt0 = apic_get_reg(vcpu->apic, APIC_LVT0); | ||
1008 | int r = 0; | ||
1009 | |||
1010 | if (vcpu->vcpu_id == 0) { | ||
1011 | if (!apic_hw_enabled(vcpu->apic)) | ||
1012 | r = 1; | ||
1013 | if ((lvt0 & APIC_LVT_MASKED) == 0 && | ||
1014 | GET_APIC_DELIVERY_MODE(lvt0) == APIC_MODE_EXTINT) | ||
1015 | r = 1; | ||
1016 | } | ||
1017 | return r; | ||
1018 | } | ||
1019 | |||
1020 | void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu) | ||
1021 | { | ||
1022 | struct kvm_lapic *apic = vcpu->apic; | ||
1023 | |||
1024 | if (apic && apic_lvt_enabled(apic, APIC_LVTT) && | ||
1025 | atomic_read(&apic->timer.pending) > 0) { | ||
1026 | if (__inject_apic_timer_irq(apic)) | ||
1027 | atomic_dec(&apic->timer.pending); | ||
1028 | } | ||
1029 | } | ||
1030 | |||
1031 | void kvm_apic_timer_intr_post(struct kvm_vcpu *vcpu, int vec) | ||
1032 | { | ||
1033 | struct kvm_lapic *apic = vcpu->apic; | ||
1034 | |||
1035 | if (apic && apic_lvt_vector(apic, APIC_LVTT) == vec) | ||
1036 | apic->timer.last_update = ktime_add_ns( | ||
1037 | apic->timer.last_update, | ||
1038 | apic->timer.period); | ||
1039 | } | ||
1040 | |||
1041 | int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu) | ||
1042 | { | ||
1043 | int vector = kvm_apic_has_interrupt(vcpu); | ||
1044 | struct kvm_lapic *apic = vcpu->apic; | ||
1045 | |||
1046 | if (vector == -1) | ||
1047 | return -1; | ||
1048 | |||
1049 | apic_set_vector(vector, apic->regs + APIC_ISR); | ||
1050 | apic_update_ppr(apic); | ||
1051 | apic_clear_irr(vector, apic); | ||
1052 | return vector; | ||
1053 | } | ||
1054 | |||
1055 | void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu) | ||
1056 | { | ||
1057 | struct kvm_lapic *apic = vcpu->apic; | ||
1058 | |||
1059 | apic->base_address = vcpu->apic_base & | ||
1060 | MSR_IA32_APICBASE_BASE; | ||
1061 | apic_set_reg(apic, APIC_LVR, APIC_VERSION); | ||
1062 | apic_update_ppr(apic); | ||
1063 | hrtimer_cancel(&apic->timer.dev); | ||
1064 | update_divide_count(apic); | ||
1065 | start_apic_timer(apic); | ||
1066 | } | ||
1067 | |||
1068 | void kvm_migrate_apic_timer(struct kvm_vcpu *vcpu) | ||
1069 | { | ||
1070 | struct kvm_lapic *apic = vcpu->apic; | ||
1071 | struct hrtimer *timer; | ||
1072 | |||
1073 | if (!apic) | ||
1074 | return; | ||
1075 | |||
1076 | timer = &apic->timer.dev; | ||
1077 | if (hrtimer_cancel(timer)) | ||
1078 | hrtimer_start(timer, timer->expires, HRTIMER_MODE_ABS); | ||
1079 | } | ||
1080 | EXPORT_SYMBOL_GPL(kvm_migrate_apic_timer); | ||
diff --git a/drivers/kvm/mmu.c b/drivers/kvm/mmu.c deleted file mode 100644 index feb5ac986c5d..000000000000 --- a/drivers/kvm/mmu.c +++ /dev/null | |||
@@ -1,1498 +0,0 @@ | |||
1 | /* | ||
2 | * Kernel-based Virtual Machine driver for Linux | ||
3 | * | ||
4 | * This module enables machines with Intel VT-x extensions to run virtual | ||
5 | * machines without emulation or binary translation. | ||
6 | * | ||
7 | * MMU support | ||
8 | * | ||
9 | * Copyright (C) 2006 Qumranet, Inc. | ||
10 | * | ||
11 | * Authors: | ||
12 | * Yaniv Kamay <yaniv@qumranet.com> | ||
13 | * Avi Kivity <avi@qumranet.com> | ||
14 | * | ||
15 | * This work is licensed under the terms of the GNU GPL, version 2. See | ||
16 | * the COPYING file in the top-level directory. | ||
17 | * | ||
18 | */ | ||
19 | |||
20 | #include "vmx.h" | ||
21 | #include "kvm.h" | ||
22 | |||
23 | #include <linux/types.h> | ||
24 | #include <linux/string.h> | ||
25 | #include <linux/mm.h> | ||
26 | #include <linux/highmem.h> | ||
27 | #include <linux/module.h> | ||
28 | |||
29 | #include <asm/page.h> | ||
30 | #include <asm/cmpxchg.h> | ||
31 | |||
32 | #undef MMU_DEBUG | ||
33 | |||
34 | #undef AUDIT | ||
35 | |||
36 | #ifdef AUDIT | ||
37 | static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg); | ||
38 | #else | ||
39 | static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) {} | ||
40 | #endif | ||
41 | |||
42 | #ifdef MMU_DEBUG | ||
43 | |||
44 | #define pgprintk(x...) do { if (dbg) printk(x); } while (0) | ||
45 | #define rmap_printk(x...) do { if (dbg) printk(x); } while (0) | ||
46 | |||
47 | #else | ||
48 | |||
49 | #define pgprintk(x...) do { } while (0) | ||
50 | #define rmap_printk(x...) do { } while (0) | ||
51 | |||
52 | #endif | ||
53 | |||
54 | #if defined(MMU_DEBUG) || defined(AUDIT) | ||
55 | static int dbg = 1; | ||
56 | #endif | ||
57 | |||
58 | #ifndef MMU_DEBUG | ||
59 | #define ASSERT(x) do { } while (0) | ||
60 | #else | ||
61 | #define ASSERT(x) \ | ||
62 | if (!(x)) { \ | ||
63 | printk(KERN_WARNING "assertion failed %s:%d: %s\n", \ | ||
64 | __FILE__, __LINE__, #x); \ | ||
65 | } | ||
66 | #endif | ||
67 | |||
68 | #define PT64_PT_BITS 9 | ||
69 | #define PT64_ENT_PER_PAGE (1 << PT64_PT_BITS) | ||
70 | #define PT32_PT_BITS 10 | ||
71 | #define PT32_ENT_PER_PAGE (1 << PT32_PT_BITS) | ||
72 | |||
73 | #define PT_WRITABLE_SHIFT 1 | ||
74 | |||
75 | #define PT_PRESENT_MASK (1ULL << 0) | ||
76 | #define PT_WRITABLE_MASK (1ULL << PT_WRITABLE_SHIFT) | ||
77 | #define PT_USER_MASK (1ULL << 2) | ||
78 | #define PT_PWT_MASK (1ULL << 3) | ||
79 | #define PT_PCD_MASK (1ULL << 4) | ||
80 | #define PT_ACCESSED_MASK (1ULL << 5) | ||
81 | #define PT_DIRTY_MASK (1ULL << 6) | ||
82 | #define PT_PAGE_SIZE_MASK (1ULL << 7) | ||
83 | #define PT_PAT_MASK (1ULL << 7) | ||
84 | #define PT_GLOBAL_MASK (1ULL << 8) | ||
85 | #define PT64_NX_MASK (1ULL << 63) | ||
86 | |||
87 | #define PT_PAT_SHIFT 7 | ||
88 | #define PT_DIR_PAT_SHIFT 12 | ||
89 | #define PT_DIR_PAT_MASK (1ULL << PT_DIR_PAT_SHIFT) | ||
90 | |||
91 | #define PT32_DIR_PSE36_SIZE 4 | ||
92 | #define PT32_DIR_PSE36_SHIFT 13 | ||
93 | #define PT32_DIR_PSE36_MASK (((1ULL << PT32_DIR_PSE36_SIZE) - 1) << PT32_DIR_PSE36_SHIFT) | ||
94 | |||
95 | |||
96 | #define PT_FIRST_AVAIL_BITS_SHIFT 9 | ||
97 | #define PT64_SECOND_AVAIL_BITS_SHIFT 52 | ||
98 | |||
99 | #define PT_SHADOW_IO_MARK (1ULL << PT_FIRST_AVAIL_BITS_SHIFT) | ||
100 | |||
101 | #define VALID_PAGE(x) ((x) != INVALID_PAGE) | ||
102 | |||
103 | #define PT64_LEVEL_BITS 9 | ||
104 | |||
105 | #define PT64_LEVEL_SHIFT(level) \ | ||
106 | ( PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS ) | ||
107 | |||
108 | #define PT64_LEVEL_MASK(level) \ | ||
109 | (((1ULL << PT64_LEVEL_BITS) - 1) << PT64_LEVEL_SHIFT(level)) | ||
110 | |||
111 | #define PT64_INDEX(address, level)\ | ||
112 | (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1)) | ||
113 | |||
114 | |||
115 | #define PT32_LEVEL_BITS 10 | ||
116 | |||
117 | #define PT32_LEVEL_SHIFT(level) \ | ||
118 | ( PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS ) | ||
119 | |||
120 | #define PT32_LEVEL_MASK(level) \ | ||
121 | (((1ULL << PT32_LEVEL_BITS) - 1) << PT32_LEVEL_SHIFT(level)) | ||
122 | |||
123 | #define PT32_INDEX(address, level)\ | ||
124 | (((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1)) | ||
125 | |||
126 | |||
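The PT64_LEVEL_SHIFT()/PT64_INDEX() macros above slice an address into 9-bit table indexes stacked on top of the 12-bit page offset. A minimal user-space sketch of that arithmetic, assuming 4 KiB pages and an example address:

/* Sketch of the per-level index split performed by PT64_INDEX() above. */
#include <stdio.h>
#include <stdint.h>

#define SKETCH_PAGE_SHIFT 12
#define SKETCH_LEVEL_BITS 9

static unsigned pt64_index(uint64_t addr, int level)
{
        unsigned shift = SKETCH_PAGE_SHIFT + (level - 1) * SKETCH_LEVEL_BITS;
        return (addr >> shift) & ((1u << SKETCH_LEVEL_BITS) - 1);
}

int main(void)
{
        uint64_t gva = 0x00007f1234567000ull; /* example address */
        for (int level = 4; level >= 1; level--)
                printf("level %d index %u\n", level, pt64_index(gva, level));
        return 0;
}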
127 | #define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1)) | ||
128 | #define PT64_DIR_BASE_ADDR_MASK \ | ||
129 | (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + PT64_LEVEL_BITS)) - 1)) | ||
130 | |||
131 | #define PT32_BASE_ADDR_MASK PAGE_MASK | ||
132 | #define PT32_DIR_BASE_ADDR_MASK \ | ||
133 | (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1)) | ||
134 | |||
135 | |||
136 | #define PFERR_PRESENT_MASK (1U << 0) | ||
137 | #define PFERR_WRITE_MASK (1U << 1) | ||
138 | #define PFERR_USER_MASK (1U << 2) | ||
139 | #define PFERR_FETCH_MASK (1U << 4) | ||
140 | |||
141 | #define PT64_ROOT_LEVEL 4 | ||
142 | #define PT32_ROOT_LEVEL 2 | ||
143 | #define PT32E_ROOT_LEVEL 3 | ||
144 | |||
145 | #define PT_DIRECTORY_LEVEL 2 | ||
146 | #define PT_PAGE_TABLE_LEVEL 1 | ||
147 | |||
148 | #define RMAP_EXT 4 | ||
149 | |||
150 | struct kvm_rmap_desc { | ||
151 | u64 *shadow_ptes[RMAP_EXT]; | ||
152 | struct kvm_rmap_desc *more; | ||
153 | }; | ||
154 | |||
155 | static struct kmem_cache *pte_chain_cache; | ||
156 | static struct kmem_cache *rmap_desc_cache; | ||
157 | static struct kmem_cache *mmu_page_header_cache; | ||
158 | |||
159 | static int is_write_protection(struct kvm_vcpu *vcpu) | ||
160 | { | ||
161 | return vcpu->cr0 & X86_CR0_WP; | ||
162 | } | ||
163 | |||
164 | static int is_cpuid_PSE36(void) | ||
165 | { | ||
166 | return 1; | ||
167 | } | ||
168 | |||
169 | static int is_nx(struct kvm_vcpu *vcpu) | ||
170 | { | ||
171 | return vcpu->shadow_efer & EFER_NX; | ||
172 | } | ||
173 | |||
174 | static int is_present_pte(unsigned long pte) | ||
175 | { | ||
176 | return pte & PT_PRESENT_MASK; | ||
177 | } | ||
178 | |||
179 | static int is_writeble_pte(unsigned long pte) | ||
180 | { | ||
181 | return pte & PT_WRITABLE_MASK; | ||
182 | } | ||
183 | |||
184 | static int is_io_pte(unsigned long pte) | ||
185 | { | ||
186 | return pte & PT_SHADOW_IO_MARK; | ||
187 | } | ||
188 | |||
189 | static int is_rmap_pte(u64 pte) | ||
190 | { | ||
191 | return (pte & (PT_WRITABLE_MASK | PT_PRESENT_MASK)) | ||
192 | == (PT_WRITABLE_MASK | PT_PRESENT_MASK); | ||
193 | } | ||
194 | |||
195 | static void set_shadow_pte(u64 *sptep, u64 spte) | ||
196 | { | ||
197 | #ifdef CONFIG_X86_64 | ||
198 | set_64bit((unsigned long *)sptep, spte); | ||
199 | #else | ||
200 | set_64bit((unsigned long long *)sptep, spte); | ||
201 | #endif | ||
202 | } | ||
203 | |||
204 | static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, | ||
205 | struct kmem_cache *base_cache, int min) | ||
206 | { | ||
207 | void *obj; | ||
208 | |||
209 | if (cache->nobjs >= min) | ||
210 | return 0; | ||
211 | while (cache->nobjs < ARRAY_SIZE(cache->objects)) { | ||
212 | obj = kmem_cache_zalloc(base_cache, GFP_KERNEL); | ||
213 | if (!obj) | ||
214 | return -ENOMEM; | ||
215 | cache->objects[cache->nobjs++] = obj; | ||
216 | } | ||
217 | return 0; | ||
218 | } | ||
219 | |||
220 | static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc) | ||
221 | { | ||
222 | while (mc->nobjs) | ||
223 | kfree(mc->objects[--mc->nobjs]); | ||
224 | } | ||
225 | |||
226 | static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache, | ||
227 | int min) | ||
228 | { | ||
229 | struct page *page; | ||
230 | |||
231 | if (cache->nobjs >= min) | ||
232 | return 0; | ||
233 | while (cache->nobjs < ARRAY_SIZE(cache->objects)) { | ||
234 | page = alloc_page(GFP_KERNEL); | ||
235 | if (!page) | ||
236 | return -ENOMEM; | ||
237 | set_page_private(page, 0); | ||
238 | cache->objects[cache->nobjs++] = page_address(page); | ||
239 | } | ||
240 | return 0; | ||
241 | } | ||
242 | |||
243 | static void mmu_free_memory_cache_page(struct kvm_mmu_memory_cache *mc) | ||
244 | { | ||
245 | while (mc->nobjs) | ||
246 | free_page((unsigned long)mc->objects[--mc->nobjs]); | ||
247 | } | ||
248 | |||
249 | static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu) | ||
250 | { | ||
251 | int r; | ||
252 | |||
253 | kvm_mmu_free_some_pages(vcpu); | ||
254 | r = mmu_topup_memory_cache(&vcpu->mmu_pte_chain_cache, | ||
255 | pte_chain_cache, 4); | ||
256 | if (r) | ||
257 | goto out; | ||
258 | r = mmu_topup_memory_cache(&vcpu->mmu_rmap_desc_cache, | ||
259 | rmap_desc_cache, 1); | ||
260 | if (r) | ||
261 | goto out; | ||
262 | r = mmu_topup_memory_cache_page(&vcpu->mmu_page_cache, 4); | ||
263 | if (r) | ||
264 | goto out; | ||
265 | r = mmu_topup_memory_cache(&vcpu->mmu_page_header_cache, | ||
266 | mmu_page_header_cache, 4); | ||
267 | out: | ||
268 | return r; | ||
269 | } | ||
270 | |||
271 | static void mmu_free_memory_caches(struct kvm_vcpu *vcpu) | ||
272 | { | ||
273 | mmu_free_memory_cache(&vcpu->mmu_pte_chain_cache); | ||
274 | mmu_free_memory_cache(&vcpu->mmu_rmap_desc_cache); | ||
275 | mmu_free_memory_cache_page(&vcpu->mmu_page_cache); | ||
276 | mmu_free_memory_cache(&vcpu->mmu_page_header_cache); | ||
277 | } | ||
278 | |||
279 | static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc, | ||
280 | size_t size) | ||
281 | { | ||
282 | void *p; | ||
283 | |||
284 | BUG_ON(!mc->nobjs); | ||
285 | p = mc->objects[--mc->nobjs]; | ||
286 | memset(p, 0, size); | ||
287 | return p; | ||
288 | } | ||
289 | |||
290 | static struct kvm_pte_chain *mmu_alloc_pte_chain(struct kvm_vcpu *vcpu) | ||
291 | { | ||
292 | return mmu_memory_cache_alloc(&vcpu->mmu_pte_chain_cache, | ||
293 | sizeof(struct kvm_pte_chain)); | ||
294 | } | ||
295 | |||
296 | static void mmu_free_pte_chain(struct kvm_pte_chain *pc) | ||
297 | { | ||
298 | kfree(pc); | ||
299 | } | ||
300 | |||
301 | static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu) | ||
302 | { | ||
303 | return mmu_memory_cache_alloc(&vcpu->mmu_rmap_desc_cache, | ||
304 | sizeof(struct kvm_rmap_desc)); | ||
305 | } | ||
306 | |||
307 | static void mmu_free_rmap_desc(struct kvm_rmap_desc *rd) | ||
308 | { | ||
309 | kfree(rd); | ||
310 | } | ||
311 | |||
312 | /* | ||
313 | * Reverse mapping data structures: | ||
314 | * | ||
315 | * If page->private bit zero is zero, then page->private points to the | ||
316 | * shadow page table entry that points to page_address(page). | ||
317 | * | ||
318 | * If page->private bit zero is one, (then page->private & ~1) points | ||
319 | * to a struct kvm_rmap_desc containing more mappings. | ||
320 | */ | ||
321 | static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte) | ||
322 | { | ||
323 | struct page *page; | ||
324 | struct kvm_rmap_desc *desc; | ||
325 | int i; | ||
326 | |||
327 | if (!is_rmap_pte(*spte)) | ||
328 | return; | ||
329 | page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT); | ||
330 | if (!page_private(page)) { | ||
331 | rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte); | ||
332 | set_page_private(page,(unsigned long)spte); | ||
333 | } else if (!(page_private(page) & 1)) { | ||
334 | rmap_printk("rmap_add: %p %llx 1->many\n", spte, *spte); | ||
335 | desc = mmu_alloc_rmap_desc(vcpu); | ||
336 | desc->shadow_ptes[0] = (u64 *)page_private(page); | ||
337 | desc->shadow_ptes[1] = spte; | ||
338 | set_page_private(page,(unsigned long)desc | 1); | ||
339 | } else { | ||
340 | rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte); | ||
341 | desc = (struct kvm_rmap_desc *)(page_private(page) & ~1ul); | ||
342 | while (desc->shadow_ptes[RMAP_EXT-1] && desc->more) | ||
343 | desc = desc->more; | ||
344 | if (desc->shadow_ptes[RMAP_EXT-1]) { | ||
345 | desc->more = mmu_alloc_rmap_desc(vcpu); | ||
346 | desc = desc->more; | ||
347 | } | ||
348 | for (i = 0; desc->shadow_ptes[i]; ++i) | ||
349 | ; | ||
350 | desc->shadow_ptes[i] = spte; | ||
351 | } | ||
352 | } | ||
353 | |||
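The comment before rmap_add() describes a tagged-pointer encoding: bit zero of page->private selects between "points directly at the single shadow pte" and "points at an overflow descriptor chain". A minimal user-space sketch of that encoding, with types and names invented for illustration:

/* Sketch of the rmap tagged-pointer scheme described above rmap_add(). */
#include <stdio.h>
#include <stdint.h>

struct rmap_desc { uint64_t *sptes[4]; struct rmap_desc *more; };

static const char *rmap_kind(unsigned long private)
{
        if (!private)
                return "no mappings";
        if (!(private & 1))
                return "one spte: private is the spte pointer itself";
        return "many sptes: private & ~1 points to a rmap_desc chain";
}

int main(void)
{
        uint64_t spte;
        struct rmap_desc desc = { { &spte }, NULL };

        printf("%s\n", rmap_kind(0));
        printf("%s\n", rmap_kind((unsigned long)&spte));
        printf("%s\n", rmap_kind((unsigned long)&desc | 1));
        return 0;
}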
354 | static void rmap_desc_remove_entry(struct page *page, | ||
355 | struct kvm_rmap_desc *desc, | ||
356 | int i, | ||
357 | struct kvm_rmap_desc *prev_desc) | ||
358 | { | ||
359 | int j; | ||
360 | |||
361 | for (j = RMAP_EXT - 1; !desc->shadow_ptes[j] && j > i; --j) | ||
362 | ; | ||
363 | desc->shadow_ptes[i] = desc->shadow_ptes[j]; | ||
364 | desc->shadow_ptes[j] = NULL; | ||
365 | if (j != 0) | ||
366 | return; | ||
367 | if (!prev_desc && !desc->more) | ||
368 | set_page_private(page,(unsigned long)desc->shadow_ptes[0]); | ||
369 | else | ||
370 | if (prev_desc) | ||
371 | prev_desc->more = desc->more; | ||
372 | else | ||
373 | set_page_private(page,(unsigned long)desc->more | 1); | ||
374 | mmu_free_rmap_desc(desc); | ||
375 | } | ||
376 | |||
377 | static void rmap_remove(u64 *spte) | ||
378 | { | ||
379 | struct page *page; | ||
380 | struct kvm_rmap_desc *desc; | ||
381 | struct kvm_rmap_desc *prev_desc; | ||
382 | int i; | ||
383 | |||
384 | if (!is_rmap_pte(*spte)) | ||
385 | return; | ||
386 | page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT); | ||
387 | if (!page_private(page)) { | ||
388 | printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte); | ||
389 | BUG(); | ||
390 | } else if (!(page_private(page) & 1)) { | ||
391 | rmap_printk("rmap_remove: %p %llx 1->0\n", spte, *spte); | ||
392 | if ((u64 *)page_private(page) != spte) { | ||
393 | printk(KERN_ERR "rmap_remove: %p %llx 1->BUG\n", | ||
394 | spte, *spte); | ||
395 | BUG(); | ||
396 | } | ||
397 | set_page_private(page,0); | ||
398 | } else { | ||
399 | rmap_printk("rmap_remove: %p %llx many->many\n", spte, *spte); | ||
400 | desc = (struct kvm_rmap_desc *)(page_private(page) & ~1ul); | ||
401 | prev_desc = NULL; | ||
402 | while (desc) { | ||
403 | for (i = 0; i < RMAP_EXT && desc->shadow_ptes[i]; ++i) | ||
404 | if (desc->shadow_ptes[i] == spte) { | ||
405 | rmap_desc_remove_entry(page, | ||
406 | desc, i, | ||
407 | prev_desc); | ||
408 | return; | ||
409 | } | ||
410 | prev_desc = desc; | ||
411 | desc = desc->more; | ||
412 | } | ||
413 | BUG(); | ||
414 | } | ||
415 | } | ||
416 | |||
417 | static void rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn) | ||
418 | { | ||
419 | struct kvm *kvm = vcpu->kvm; | ||
420 | struct page *page; | ||
421 | struct kvm_rmap_desc *desc; | ||
422 | u64 *spte; | ||
423 | |||
424 | page = gfn_to_page(kvm, gfn); | ||
425 | BUG_ON(!page); | ||
426 | |||
427 | while (page_private(page)) { | ||
428 | if (!(page_private(page) & 1)) | ||
429 | spte = (u64 *)page_private(page); | ||
430 | else { | ||
431 | desc = (struct kvm_rmap_desc *)(page_private(page) & ~1ul); | ||
432 | spte = desc->shadow_ptes[0]; | ||
433 | } | ||
434 | BUG_ON(!spte); | ||
435 | BUG_ON((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT | ||
436 | != page_to_pfn(page)); | ||
437 | BUG_ON(!(*spte & PT_PRESENT_MASK)); | ||
438 | BUG_ON(!(*spte & PT_WRITABLE_MASK)); | ||
439 | rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte); | ||
440 | rmap_remove(spte); | ||
441 | set_shadow_pte(spte, *spte & ~PT_WRITABLE_MASK); | ||
442 | kvm_flush_remote_tlbs(vcpu->kvm); | ||
443 | } | ||
444 | } | ||
445 | |||
446 | #ifdef MMU_DEBUG | ||
447 | static int is_empty_shadow_page(u64 *spt) | ||
448 | { | ||
449 | u64 *pos; | ||
450 | u64 *end; | ||
451 | |||
452 | for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++) | ||
453 | if (*pos != 0) { | ||
454 | printk(KERN_ERR "%s: %p %llx\n", __FUNCTION__, | ||
455 | pos, *pos); | ||
456 | return 0; | ||
457 | } | ||
458 | return 1; | ||
459 | } | ||
460 | #endif | ||
461 | |||
462 | static void kvm_mmu_free_page(struct kvm *kvm, | ||
463 | struct kvm_mmu_page *page_head) | ||
464 | { | ||
465 | ASSERT(is_empty_shadow_page(page_head->spt)); | ||
466 | list_del(&page_head->link); | ||
467 | __free_page(virt_to_page(page_head->spt)); | ||
468 | kfree(page_head); | ||
469 | ++kvm->n_free_mmu_pages; | ||
470 | } | ||
471 | |||
472 | static unsigned kvm_page_table_hashfn(gfn_t gfn) | ||
473 | { | ||
474 | return gfn; | ||
475 | } | ||
476 | |||
477 | static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, | ||
478 | u64 *parent_pte) | ||
479 | { | ||
480 | struct kvm_mmu_page *page; | ||
481 | |||
482 | if (!vcpu->kvm->n_free_mmu_pages) | ||
483 | return NULL; | ||
484 | |||
485 | page = mmu_memory_cache_alloc(&vcpu->mmu_page_header_cache, | ||
486 | sizeof *page); | ||
487 | page->spt = mmu_memory_cache_alloc(&vcpu->mmu_page_cache, PAGE_SIZE); | ||
488 | set_page_private(virt_to_page(page->spt), (unsigned long)page); | ||
489 | list_add(&page->link, &vcpu->kvm->active_mmu_pages); | ||
490 | ASSERT(is_empty_shadow_page(page->spt)); | ||
491 | page->slot_bitmap = 0; | ||
492 | page->multimapped = 0; | ||
493 | page->parent_pte = parent_pte; | ||
494 | --vcpu->kvm->n_free_mmu_pages; | ||
495 | return page; | ||
496 | } | ||
497 | |||
498 | static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu, | ||
499 | struct kvm_mmu_page *page, u64 *parent_pte) | ||
500 | { | ||
501 | struct kvm_pte_chain *pte_chain; | ||
502 | struct hlist_node *node; | ||
503 | int i; | ||
504 | |||
505 | if (!parent_pte) | ||
506 | return; | ||
507 | if (!page->multimapped) { | ||
508 | u64 *old = page->parent_pte; | ||
509 | |||
510 | if (!old) { | ||
511 | page->parent_pte = parent_pte; | ||
512 | return; | ||
513 | } | ||
514 | page->multimapped = 1; | ||
515 | pte_chain = mmu_alloc_pte_chain(vcpu); | ||
516 | INIT_HLIST_HEAD(&page->parent_ptes); | ||
517 | hlist_add_head(&pte_chain->link, &page->parent_ptes); | ||
518 | pte_chain->parent_ptes[0] = old; | ||
519 | } | ||
520 | hlist_for_each_entry(pte_chain, node, &page->parent_ptes, link) { | ||
521 | if (pte_chain->parent_ptes[NR_PTE_CHAIN_ENTRIES-1]) | ||
522 | continue; | ||
523 | for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) | ||
524 | if (!pte_chain->parent_ptes[i]) { | ||
525 | pte_chain->parent_ptes[i] = parent_pte; | ||
526 | return; | ||
527 | } | ||
528 | } | ||
529 | pte_chain = mmu_alloc_pte_chain(vcpu); | ||
530 | BUG_ON(!pte_chain); | ||
531 | hlist_add_head(&pte_chain->link, &page->parent_ptes); | ||
532 | pte_chain->parent_ptes[0] = parent_pte; | ||
533 | } | ||
534 | |||
535 | static void mmu_page_remove_parent_pte(struct kvm_mmu_page *page, | ||
536 | u64 *parent_pte) | ||
537 | { | ||
538 | struct kvm_pte_chain *pte_chain; | ||
539 | struct hlist_node *node; | ||
540 | int i; | ||
541 | |||
542 | if (!page->multimapped) { | ||
543 | BUG_ON(page->parent_pte != parent_pte); | ||
544 | page->parent_pte = NULL; | ||
545 | return; | ||
546 | } | ||
547 | hlist_for_each_entry(pte_chain, node, &page->parent_ptes, link) | ||
548 | for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) { | ||
549 | if (!pte_chain->parent_ptes[i]) | ||
550 | break; | ||
551 | if (pte_chain->parent_ptes[i] != parent_pte) | ||
552 | continue; | ||
553 | while (i + 1 < NR_PTE_CHAIN_ENTRIES | ||
554 | && pte_chain->parent_ptes[i + 1]) { | ||
555 | pte_chain->parent_ptes[i] | ||
556 | = pte_chain->parent_ptes[i + 1]; | ||
557 | ++i; | ||
558 | } | ||
559 | pte_chain->parent_ptes[i] = NULL; | ||
560 | if (i == 0) { | ||
561 | hlist_del(&pte_chain->link); | ||
562 | mmu_free_pte_chain(pte_chain); | ||
563 | if (hlist_empty(&page->parent_ptes)) { | ||
564 | page->multimapped = 0; | ||
565 | page->parent_pte = NULL; | ||
566 | } | ||
567 | } | ||
568 | return; | ||
569 | } | ||
570 | BUG(); | ||
571 | } | ||
572 | |||
573 | static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm_vcpu *vcpu, | ||
574 | gfn_t gfn) | ||
575 | { | ||
576 | unsigned index; | ||
577 | struct hlist_head *bucket; | ||
578 | struct kvm_mmu_page *page; | ||
579 | struct hlist_node *node; | ||
580 | |||
581 | pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn); | ||
582 | index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES; | ||
583 | bucket = &vcpu->kvm->mmu_page_hash[index]; | ||
584 | hlist_for_each_entry(page, node, bucket, hash_link) | ||
585 | if (page->gfn == gfn && !page->role.metaphysical) { | ||
586 | pgprintk("%s: found role %x\n", | ||
587 | __FUNCTION__, page->role.word); | ||
588 | return page; | ||
589 | } | ||
590 | return NULL; | ||
591 | } | ||
592 | |||
593 | static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, | ||
594 | gfn_t gfn, | ||
595 | gva_t gaddr, | ||
596 | unsigned level, | ||
597 | int metaphysical, | ||
598 | unsigned hugepage_access, | ||
599 | u64 *parent_pte) | ||
600 | { | ||
601 | union kvm_mmu_page_role role; | ||
602 | unsigned index; | ||
603 | unsigned quadrant; | ||
604 | struct hlist_head *bucket; | ||
605 | struct kvm_mmu_page *page; | ||
606 | struct hlist_node *node; | ||
607 | |||
608 | role.word = 0; | ||
609 | role.glevels = vcpu->mmu.root_level; | ||
610 | role.level = level; | ||
611 | role.metaphysical = metaphysical; | ||
612 | role.hugepage_access = hugepage_access; | ||
613 | if (vcpu->mmu.root_level <= PT32_ROOT_LEVEL) { | ||
614 | quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level)); | ||
615 | quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1; | ||
616 | role.quadrant = quadrant; | ||
617 | } | ||
618 | pgprintk("%s: looking gfn %lx role %x\n", __FUNCTION__, | ||
619 | gfn, role.word); | ||
620 | index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES; | ||
621 | bucket = &vcpu->kvm->mmu_page_hash[index]; | ||
622 | hlist_for_each_entry(page, node, bucket, hash_link) | ||
623 | if (page->gfn == gfn && page->role.word == role.word) { | ||
624 | mmu_page_add_parent_pte(vcpu, page, parent_pte); | ||
625 | pgprintk("%s: found\n", __FUNCTION__); | ||
626 | return page; | ||
627 | } | ||
628 | page = kvm_mmu_alloc_page(vcpu, parent_pte); | ||
629 | if (!page) | ||
630 | return page; | ||
631 | pgprintk("%s: adding gfn %lx role %x\n", __FUNCTION__, gfn, role.word); | ||
632 | page->gfn = gfn; | ||
633 | page->role = role; | ||
634 | hlist_add_head(&page->hash_link, bucket); | ||
635 | if (!metaphysical) | ||
636 | rmap_write_protect(vcpu, gfn); | ||
637 | return page; | ||
638 | } | ||
639 | |||
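In kvm_mmu_get_page() above, role.quadrant records which fragment of a 32-bit guest page table (10-bit indexes) a 64-bit shadow table (9-bit indexes) covers. A minimal sketch of that computation with example addresses:

/* Sketch of the role.quadrant arithmetic from kvm_mmu_get_page() above. */
#include <stdio.h>
#include <stdint.h>

static unsigned quadrant(uint64_t gaddr, int level)
{
        unsigned q = gaddr >> (12 + 9 * level);      /* PAGE_SHIFT + PT64_PT_BITS * level */
        return q & ((1u << ((10 - 9) * level)) - 1); /* PT32_PT_BITS - PT64_PT_BITS       */
}

int main(void)
{
        /* A 32-bit guest page table maps 4 MiB, so two 2 MiB shadow tables
         * (quadrants 0 and 1) are needed at level 1. */
        printf("%u %u\n", quadrant(0x00100000, 1), quadrant(0x00300000, 1)); /* 0 1 */
        /* At level 2 the quadrant ranges over 0..3. */
        printf("%u\n", quadrant(0xC0000000u, 2));                            /* 3   */
        return 0;
}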
640 | static void kvm_mmu_page_unlink_children(struct kvm *kvm, | ||
641 | struct kvm_mmu_page *page) | ||
642 | { | ||
643 | unsigned i; | ||
644 | u64 *pt; | ||
645 | u64 ent; | ||
646 | |||
647 | pt = page->spt; | ||
648 | |||
649 | if (page->role.level == PT_PAGE_TABLE_LEVEL) { | ||
650 | for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { | ||
651 | if (pt[i] & PT_PRESENT_MASK) | ||
652 | rmap_remove(&pt[i]); | ||
653 | pt[i] = 0; | ||
654 | } | ||
655 | kvm_flush_remote_tlbs(kvm); | ||
656 | return; | ||
657 | } | ||
658 | |||
659 | for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { | ||
660 | ent = pt[i]; | ||
661 | |||
662 | pt[i] = 0; | ||
663 | if (!(ent & PT_PRESENT_MASK)) | ||
664 | continue; | ||
665 | ent &= PT64_BASE_ADDR_MASK; | ||
666 | mmu_page_remove_parent_pte(page_header(ent), &pt[i]); | ||
667 | } | ||
668 | kvm_flush_remote_tlbs(kvm); | ||
669 | } | ||
670 | |||
671 | static void kvm_mmu_put_page(struct kvm_mmu_page *page, | ||
672 | u64 *parent_pte) | ||
673 | { | ||
674 | mmu_page_remove_parent_pte(page, parent_pte); | ||
675 | } | ||
676 | |||
677 | static void kvm_mmu_zap_page(struct kvm *kvm, | ||
678 | struct kvm_mmu_page *page) | ||
679 | { | ||
680 | u64 *parent_pte; | ||
681 | |||
682 | while (page->multimapped || page->parent_pte) { | ||
683 | if (!page->multimapped) | ||
684 | parent_pte = page->parent_pte; | ||
685 | else { | ||
686 | struct kvm_pte_chain *chain; | ||
687 | |||
688 | chain = container_of(page->parent_ptes.first, | ||
689 | struct kvm_pte_chain, link); | ||
690 | parent_pte = chain->parent_ptes[0]; | ||
691 | } | ||
692 | BUG_ON(!parent_pte); | ||
693 | kvm_mmu_put_page(page, parent_pte); | ||
694 | set_shadow_pte(parent_pte, 0); | ||
695 | } | ||
696 | kvm_mmu_page_unlink_children(kvm, page); | ||
697 | if (!page->root_count) { | ||
698 | hlist_del(&page->hash_link); | ||
699 | kvm_mmu_free_page(kvm, page); | ||
700 | } else | ||
701 | list_move(&page->link, &kvm->active_mmu_pages); | ||
702 | } | ||
703 | |||
704 | static int kvm_mmu_unprotect_page(struct kvm_vcpu *vcpu, gfn_t gfn) | ||
705 | { | ||
706 | unsigned index; | ||
707 | struct hlist_head *bucket; | ||
708 | struct kvm_mmu_page *page; | ||
709 | struct hlist_node *node, *n; | ||
710 | int r; | ||
711 | |||
712 | pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn); | ||
713 | r = 0; | ||
714 | index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES; | ||
715 | bucket = &vcpu->kvm->mmu_page_hash[index]; | ||
716 | hlist_for_each_entry_safe(page, node, n, bucket, hash_link) | ||
717 | if (page->gfn == gfn && !page->role.metaphysical) { | ||
718 | pgprintk("%s: gfn %lx role %x\n", __FUNCTION__, gfn, | ||
719 | page->role.word); | ||
720 | kvm_mmu_zap_page(vcpu->kvm, page); | ||
721 | r = 1; | ||
722 | } | ||
723 | return r; | ||
724 | } | ||
725 | |||
726 | static void mmu_unshadow(struct kvm_vcpu *vcpu, gfn_t gfn) | ||
727 | { | ||
728 | struct kvm_mmu_page *page; | ||
729 | |||
730 | while ((page = kvm_mmu_lookup_page(vcpu, gfn)) != NULL) { | ||
731 | pgprintk("%s: zap %lx %x\n", | ||
732 | __FUNCTION__, gfn, page->role.word); | ||
733 | kvm_mmu_zap_page(vcpu->kvm, page); | ||
734 | } | ||
735 | } | ||
736 | |||
737 | static void page_header_update_slot(struct kvm *kvm, void *pte, gpa_t gpa) | ||
738 | { | ||
739 | int slot = memslot_id(kvm, gfn_to_memslot(kvm, gpa >> PAGE_SHIFT)); | ||
740 | struct kvm_mmu_page *page_head = page_header(__pa(pte)); | ||
741 | |||
742 | __set_bit(slot, &page_head->slot_bitmap); | ||
743 | } | ||
744 | |||
745 | hpa_t safe_gpa_to_hpa(struct kvm_vcpu *vcpu, gpa_t gpa) | ||
746 | { | ||
747 | hpa_t hpa = gpa_to_hpa(vcpu, gpa); | ||
748 | |||
749 | return is_error_hpa(hpa) ? bad_page_address | (gpa & ~PAGE_MASK): hpa; | ||
750 | } | ||
751 | |||
752 | hpa_t gpa_to_hpa(struct kvm_vcpu *vcpu, gpa_t gpa) | ||
753 | { | ||
754 | struct page *page; | ||
755 | |||
756 | ASSERT((gpa & HPA_ERR_MASK) == 0); | ||
757 | page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); | ||
758 | if (!page) | ||
759 | return gpa | HPA_ERR_MASK; | ||
760 | return ((hpa_t)page_to_pfn(page) << PAGE_SHIFT) | ||
761 | | (gpa & (PAGE_SIZE-1)); | ||
762 | } | ||
763 | |||
764 | hpa_t gva_to_hpa(struct kvm_vcpu *vcpu, gva_t gva) | ||
765 | { | ||
766 | gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, gva); | ||
767 | |||
768 | if (gpa == UNMAPPED_GVA) | ||
769 | return UNMAPPED_GVA; | ||
770 | return gpa_to_hpa(vcpu, gpa); | ||
771 | } | ||
772 | |||
773 | struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva) | ||
774 | { | ||
775 | gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, gva); | ||
776 | |||
777 | if (gpa == UNMAPPED_GVA) | ||
778 | return NULL; | ||
779 | return pfn_to_page(gpa_to_hpa(vcpu, gpa) >> PAGE_SHIFT); | ||
780 | } | ||
781 | |||
782 | static void nonpaging_new_cr3(struct kvm_vcpu *vcpu) | ||
783 | { | ||
784 | } | ||
785 | |||
786 | static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, hpa_t p) | ||
787 | { | ||
788 | int level = PT32E_ROOT_LEVEL; | ||
789 | hpa_t table_addr = vcpu->mmu.root_hpa; | ||
790 | |||
791 | for (; ; level--) { | ||
792 | u32 index = PT64_INDEX(v, level); | ||
793 | u64 *table; | ||
794 | u64 pte; | ||
795 | |||
796 | ASSERT(VALID_PAGE(table_addr)); | ||
797 | table = __va(table_addr); | ||
798 | |||
799 | if (level == 1) { | ||
800 | pte = table[index]; | ||
801 | if (is_present_pte(pte) && is_writeble_pte(pte)) | ||
802 | return 0; | ||
803 | mark_page_dirty(vcpu->kvm, v >> PAGE_SHIFT); | ||
804 | page_header_update_slot(vcpu->kvm, table, v); | ||
805 | table[index] = p | PT_PRESENT_MASK | PT_WRITABLE_MASK | | ||
806 | PT_USER_MASK; | ||
807 | rmap_add(vcpu, &table[index]); | ||
808 | return 0; | ||
809 | } | ||
810 | |||
811 | if (table[index] == 0) { | ||
812 | struct kvm_mmu_page *new_table; | ||
813 | gfn_t pseudo_gfn; | ||
814 | |||
815 | pseudo_gfn = (v & PT64_DIR_BASE_ADDR_MASK) | ||
816 | >> PAGE_SHIFT; | ||
817 | new_table = kvm_mmu_get_page(vcpu, pseudo_gfn, | ||
818 | v, level - 1, | ||
819 | 1, 0, &table[index]); | ||
820 | if (!new_table) { | ||
821 | pgprintk("nonpaging_map: ENOMEM\n"); | ||
822 | return -ENOMEM; | ||
823 | } | ||
824 | |||
825 | table[index] = __pa(new_table->spt) | PT_PRESENT_MASK | ||
826 | | PT_WRITABLE_MASK | PT_USER_MASK; | ||
827 | } | ||
828 | table_addr = table[index] & PT64_BASE_ADDR_MASK; | ||
829 | } | ||
830 | } | ||
831 | |||
832 | static void mmu_free_roots(struct kvm_vcpu *vcpu) | ||
833 | { | ||
834 | int i; | ||
835 | struct kvm_mmu_page *page; | ||
836 | |||
837 | if (!VALID_PAGE(vcpu->mmu.root_hpa)) | ||
838 | return; | ||
839 | #ifdef CONFIG_X86_64 | ||
840 | if (vcpu->mmu.shadow_root_level == PT64_ROOT_LEVEL) { | ||
841 | hpa_t root = vcpu->mmu.root_hpa; | ||
842 | |||
843 | page = page_header(root); | ||
844 | --page->root_count; | ||
845 | vcpu->mmu.root_hpa = INVALID_PAGE; | ||
846 | return; | ||
847 | } | ||
848 | #endif | ||
849 | for (i = 0; i < 4; ++i) { | ||
850 | hpa_t root = vcpu->mmu.pae_root[i]; | ||
851 | |||
852 | if (root) { | ||
853 | root &= PT64_BASE_ADDR_MASK; | ||
854 | page = page_header(root); | ||
855 | --page->root_count; | ||
856 | } | ||
857 | vcpu->mmu.pae_root[i] = INVALID_PAGE; | ||
858 | } | ||
859 | vcpu->mmu.root_hpa = INVALID_PAGE; | ||
860 | } | ||
861 | |||
862 | static void mmu_alloc_roots(struct kvm_vcpu *vcpu) | ||
863 | { | ||
864 | int i; | ||
865 | gfn_t root_gfn; | ||
866 | struct kvm_mmu_page *page; | ||
867 | |||
868 | root_gfn = vcpu->cr3 >> PAGE_SHIFT; | ||
869 | |||
870 | #ifdef CONFIG_X86_64 | ||
871 | if (vcpu->mmu.shadow_root_level == PT64_ROOT_LEVEL) { | ||
872 | hpa_t root = vcpu->mmu.root_hpa; | ||
873 | |||
874 | ASSERT(!VALID_PAGE(root)); | ||
875 | page = kvm_mmu_get_page(vcpu, root_gfn, 0, | ||
876 | PT64_ROOT_LEVEL, 0, 0, NULL); | ||
877 | root = __pa(page->spt); | ||
878 | ++page->root_count; | ||
879 | vcpu->mmu.root_hpa = root; | ||
880 | return; | ||
881 | } | ||
882 | #endif | ||
883 | for (i = 0; i < 4; ++i) { | ||
884 | hpa_t root = vcpu->mmu.pae_root[i]; | ||
885 | |||
886 | ASSERT(!VALID_PAGE(root)); | ||
887 | if (vcpu->mmu.root_level == PT32E_ROOT_LEVEL) { | ||
888 | if (!is_present_pte(vcpu->pdptrs[i])) { | ||
889 | vcpu->mmu.pae_root[i] = 0; | ||
890 | continue; | ||
891 | } | ||
892 | root_gfn = vcpu->pdptrs[i] >> PAGE_SHIFT; | ||
893 | } else if (vcpu->mmu.root_level == 0) | ||
894 | root_gfn = 0; | ||
895 | page = kvm_mmu_get_page(vcpu, root_gfn, i << 30, | ||
896 | PT32_ROOT_LEVEL, !is_paging(vcpu), | ||
897 | 0, NULL); | ||
898 | root = __pa(page->spt); | ||
899 | ++page->root_count; | ||
900 | vcpu->mmu.pae_root[i] = root | PT_PRESENT_MASK; | ||
901 | } | ||
902 | vcpu->mmu.root_hpa = __pa(vcpu->mmu.pae_root); | ||
903 | } | ||
904 | |||
905 | static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr) | ||
906 | { | ||
907 | return vaddr; | ||
908 | } | ||
909 | |||
910 | static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, | ||
911 | u32 error_code) | ||
912 | { | ||
913 | gpa_t addr = gva; | ||
914 | hpa_t paddr; | ||
915 | int r; | ||
916 | |||
917 | r = mmu_topup_memory_caches(vcpu); | ||
918 | if (r) | ||
919 | return r; | ||
920 | |||
921 | ASSERT(vcpu); | ||
922 | ASSERT(VALID_PAGE(vcpu->mmu.root_hpa)); | ||
923 | |||
924 | |||
925 | paddr = gpa_to_hpa(vcpu , addr & PT64_BASE_ADDR_MASK); | ||
926 | |||
927 | if (is_error_hpa(paddr)) | ||
928 | return 1; | ||
929 | |||
930 | return nonpaging_map(vcpu, addr & PAGE_MASK, paddr); | ||
931 | } | ||
932 | |||
933 | static void nonpaging_free(struct kvm_vcpu *vcpu) | ||
934 | { | ||
935 | mmu_free_roots(vcpu); | ||
936 | } | ||
937 | |||
938 | static int nonpaging_init_context(struct kvm_vcpu *vcpu) | ||
939 | { | ||
940 | struct kvm_mmu *context = &vcpu->mmu; | ||
941 | |||
942 | context->new_cr3 = nonpaging_new_cr3; | ||
943 | context->page_fault = nonpaging_page_fault; | ||
944 | context->gva_to_gpa = nonpaging_gva_to_gpa; | ||
945 | context->free = nonpaging_free; | ||
946 | context->root_level = 0; | ||
947 | context->shadow_root_level = PT32E_ROOT_LEVEL; | ||
948 | context->root_hpa = INVALID_PAGE; | ||
949 | return 0; | ||
950 | } | ||
951 | |||
952 | static void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu) | ||
953 | { | ||
954 | ++vcpu->stat.tlb_flush; | ||
955 | kvm_x86_ops->tlb_flush(vcpu); | ||
956 | } | ||
957 | |||
958 | static void paging_new_cr3(struct kvm_vcpu *vcpu) | ||
959 | { | ||
960 | pgprintk("%s: cr3 %lx\n", __FUNCTION__, vcpu->cr3); | ||
961 | mmu_free_roots(vcpu); | ||
962 | } | ||
963 | |||
964 | static void inject_page_fault(struct kvm_vcpu *vcpu, | ||
965 | u64 addr, | ||
966 | u32 err_code) | ||
967 | { | ||
968 | kvm_x86_ops->inject_page_fault(vcpu, addr, err_code); | ||
969 | } | ||
970 | |||
971 | static void paging_free(struct kvm_vcpu *vcpu) | ||
972 | { | ||
973 | nonpaging_free(vcpu); | ||
974 | } | ||
975 | |||
976 | #define PTTYPE 64 | ||
977 | #include "paging_tmpl.h" | ||
978 | #undef PTTYPE | ||
979 | |||
980 | #define PTTYPE 32 | ||
981 | #include "paging_tmpl.h" | ||
982 | #undef PTTYPE | ||
983 | |||
984 | static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level) | ||
985 | { | ||
986 | struct kvm_mmu *context = &vcpu->mmu; | ||
987 | |||
988 | ASSERT(is_pae(vcpu)); | ||
989 | context->new_cr3 = paging_new_cr3; | ||
990 | context->page_fault = paging64_page_fault; | ||
991 | context->gva_to_gpa = paging64_gva_to_gpa; | ||
992 | context->free = paging_free; | ||
993 | context->root_level = level; | ||
994 | context->shadow_root_level = level; | ||
995 | context->root_hpa = INVALID_PAGE; | ||
996 | return 0; | ||
997 | } | ||
998 | |||
999 | static int paging64_init_context(struct kvm_vcpu *vcpu) | ||
1000 | { | ||
1001 | return paging64_init_context_common(vcpu, PT64_ROOT_LEVEL); | ||
1002 | } | ||
1003 | |||
1004 | static int paging32_init_context(struct kvm_vcpu *vcpu) | ||
1005 | { | ||
1006 | struct kvm_mmu *context = &vcpu->mmu; | ||
1007 | |||
1008 | context->new_cr3 = paging_new_cr3; | ||
1009 | context->page_fault = paging32_page_fault; | ||
1010 | context->gva_to_gpa = paging32_gva_to_gpa; | ||
1011 | context->free = paging_free; | ||
1012 | context->root_level = PT32_ROOT_LEVEL; | ||
1013 | context->shadow_root_level = PT32E_ROOT_LEVEL; | ||
1014 | context->root_hpa = INVALID_PAGE; | ||
1015 | return 0; | ||
1016 | } | ||
1017 | |||
1018 | static int paging32E_init_context(struct kvm_vcpu *vcpu) | ||
1019 | { | ||
1020 | return paging64_init_context_common(vcpu, PT32E_ROOT_LEVEL); | ||
1021 | } | ||
1022 | |||
1023 | static int init_kvm_mmu(struct kvm_vcpu *vcpu) | ||
1024 | { | ||
1025 | ASSERT(vcpu); | ||
1026 | ASSERT(!VALID_PAGE(vcpu->mmu.root_hpa)); | ||
1027 | |||
1028 | if (!is_paging(vcpu)) | ||
1029 | return nonpaging_init_context(vcpu); | ||
1030 | else if (is_long_mode(vcpu)) | ||
1031 | return paging64_init_context(vcpu); | ||
1032 | else if (is_pae(vcpu)) | ||
1033 | return paging32E_init_context(vcpu); | ||
1034 | else | ||
1035 | return paging32_init_context(vcpu); | ||
1036 | } | ||
1037 | |||
1038 | static void destroy_kvm_mmu(struct kvm_vcpu *vcpu) | ||
1039 | { | ||
1040 | ASSERT(vcpu); | ||
1041 | if (VALID_PAGE(vcpu->mmu.root_hpa)) { | ||
1042 | vcpu->mmu.free(vcpu); | ||
1043 | vcpu->mmu.root_hpa = INVALID_PAGE; | ||
1044 | } | ||
1045 | } | ||
1046 | |||
1047 | int kvm_mmu_reset_context(struct kvm_vcpu *vcpu) | ||
1048 | { | ||
1049 | destroy_kvm_mmu(vcpu); | ||
1050 | return init_kvm_mmu(vcpu); | ||
1051 | } | ||
1052 | EXPORT_SYMBOL_GPL(kvm_mmu_reset_context); | ||
1053 | |||
1054 | int kvm_mmu_load(struct kvm_vcpu *vcpu) | ||
1055 | { | ||
1056 | int r; | ||
1057 | |||
1058 | mutex_lock(&vcpu->kvm->lock); | ||
1059 | r = mmu_topup_memory_caches(vcpu); | ||
1060 | if (r) | ||
1061 | goto out; | ||
1062 | mmu_alloc_roots(vcpu); | ||
1063 | kvm_x86_ops->set_cr3(vcpu, vcpu->mmu.root_hpa); | ||
1064 | kvm_mmu_flush_tlb(vcpu); | ||
1065 | out: | ||
1066 | mutex_unlock(&vcpu->kvm->lock); | ||
1067 | return r; | ||
1068 | } | ||
1069 | EXPORT_SYMBOL_GPL(kvm_mmu_load); | ||
1070 | |||
1071 | void kvm_mmu_unload(struct kvm_vcpu *vcpu) | ||
1072 | { | ||
1073 | mmu_free_roots(vcpu); | ||
1074 | } | ||
1075 | |||
1076 | static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu, | ||
1077 | struct kvm_mmu_page *page, | ||
1078 | u64 *spte) | ||
1079 | { | ||
1080 | u64 pte; | ||
1081 | struct kvm_mmu_page *child; | ||
1082 | |||
1083 | pte = *spte; | ||
1084 | if (is_present_pte(pte)) { | ||
1085 | if (page->role.level == PT_PAGE_TABLE_LEVEL) | ||
1086 | rmap_remove(spte); | ||
1087 | else { | ||
1088 | child = page_header(pte & PT64_BASE_ADDR_MASK); | ||
1089 | mmu_page_remove_parent_pte(child, spte); | ||
1090 | } | ||
1091 | } | ||
1092 | set_shadow_pte(spte, 0); | ||
1093 | kvm_flush_remote_tlbs(vcpu->kvm); | ||
1094 | } | ||
1095 | |||
1096 | static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu, | ||
1097 | struct kvm_mmu_page *page, | ||
1098 | u64 *spte, | ||
1099 | const void *new, int bytes) | ||
1100 | { | ||
1101 | if (page->role.level != PT_PAGE_TABLE_LEVEL) | ||
1102 | return; | ||
1103 | |||
1104 | if (page->role.glevels == PT32_ROOT_LEVEL) | ||
1105 | paging32_update_pte(vcpu, page, spte, new, bytes); | ||
1106 | else | ||
1107 | paging64_update_pte(vcpu, page, spte, new, bytes); | ||
1108 | } | ||
1109 | |||
1110 | void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, | ||
1111 | const u8 *new, int bytes) | ||
1112 | { | ||
1113 | gfn_t gfn = gpa >> PAGE_SHIFT; | ||
1114 | struct kvm_mmu_page *page; | ||
1115 | struct hlist_node *node, *n; | ||
1116 | struct hlist_head *bucket; | ||
1117 | unsigned index; | ||
1118 | u64 *spte; | ||
1119 | unsigned offset = offset_in_page(gpa); | ||
1120 | unsigned pte_size; | ||
1121 | unsigned page_offset; | ||
1122 | unsigned misaligned; | ||
1123 | unsigned quadrant; | ||
1124 | int level; | ||
1125 | int flooded = 0; | ||
1126 | int npte; | ||
1127 | |||
1128 | pgprintk("%s: gpa %llx bytes %d\n", __FUNCTION__, gpa, bytes); | ||
1129 | if (gfn == vcpu->last_pt_write_gfn) { | ||
1130 | ++vcpu->last_pt_write_count; | ||
1131 | if (vcpu->last_pt_write_count >= 3) | ||
1132 | flooded = 1; | ||
1133 | } else { | ||
1134 | vcpu->last_pt_write_gfn = gfn; | ||
1135 | vcpu->last_pt_write_count = 1; | ||
1136 | } | ||
1137 | index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES; | ||
1138 | bucket = &vcpu->kvm->mmu_page_hash[index]; | ||
1139 | hlist_for_each_entry_safe(page, node, n, bucket, hash_link) { | ||
1140 | if (page->gfn != gfn || page->role.metaphysical) | ||
1141 | continue; | ||
1142 | pte_size = page->role.glevels == PT32_ROOT_LEVEL ? 4 : 8; | ||
1143 | misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1); | ||
1144 | misaligned |= bytes < 4; | ||
1145 | if (misaligned || flooded) { | ||
1146 | /* | ||
1147 | * Misaligned accesses are too much trouble to fix | ||
1148 | * up; also, they usually indicate a page is not used | ||
1149 | * as a page table. | ||
1150 | * | ||
1151 | * If we're seeing too many writes to a page, | ||
1152 | * it may no longer be a page table, or we may be | ||
1153 | * forking, in which case it is better to unmap the | ||
1154 | * page. | ||
1155 | */ | ||
1156 | pgprintk("misaligned: gpa %llx bytes %d role %x\n", | ||
1157 | gpa, bytes, page->role.word); | ||
1158 | kvm_mmu_zap_page(vcpu->kvm, page); | ||
1159 | continue; | ||
1160 | } | ||
1161 | page_offset = offset; | ||
1162 | level = page->role.level; | ||
1163 | npte = 1; | ||
1164 | if (page->role.glevels == PT32_ROOT_LEVEL) { | ||
1165 | page_offset <<= 1; /* 32->64 */ | ||
1166 | /* | ||
1167 | * A 32-bit pde maps 4MB while the shadow pdes map | ||
1168 | * only 2MB. So we need to double the offset again | ||
1169 | * and zap two pdes instead of one. | ||
1170 | */ | ||
1171 | if (level == PT32_ROOT_LEVEL) { | ||
1172 | page_offset &= ~7; /* kill rounding error */ | ||
1173 | page_offset <<= 1; | ||
1174 | npte = 2; | ||
1175 | } | ||
1176 | quadrant = page_offset >> PAGE_SHIFT; | ||
1177 | page_offset &= ~PAGE_MASK; | ||
1178 | if (quadrant != page->role.quadrant) | ||
1179 | continue; | ||
1180 | } | ||
1181 | spte = &page->spt[page_offset / sizeof(*spte)]; | ||
1182 | while (npte--) { | ||
1183 | mmu_pte_write_zap_pte(vcpu, page, spte); | ||
1184 | mmu_pte_write_new_pte(vcpu, page, spte, new, bytes); | ||
1185 | ++spte; | ||
1186 | } | ||
1187 | } | ||
1188 | } | ||
1189 | |||
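kvm_mmu_pte_write() above treats a guest write as "misaligned" when it does not stay within a single pte slot: the XOR/mask expression is non-zero exactly when the first and last byte written land in different pte-sized slots. A minimal sketch of that test with invented offsets:

/* Sketch of the straddle test used in kvm_mmu_pte_write() above. */
#include <stdio.h>

static int misaligned(unsigned offset, unsigned bytes, unsigned pte_size)
{
        /* Non-zero when the write crosses a pte_size-aligned boundary. */
        unsigned m = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
        return m || bytes < 4;
}

int main(void)
{
        printf("%d\n", misaligned(0x10, 8, 8)); /* 0: one whole 64-bit pte  */
        printf("%d\n", misaligned(0x14, 8, 8)); /* 1: straddles two ptes    */
        printf("%d\n", misaligned(0x10, 2, 8)); /* 1: too small to be a pte */
        return 0;
}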
1190 | int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva) | ||
1191 | { | ||
1192 | gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, gva); | ||
1193 | |||
1194 | return kvm_mmu_unprotect_page(vcpu, gpa >> PAGE_SHIFT); | ||
1195 | } | ||
1196 | |||
1197 | void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) | ||
1198 | { | ||
1199 | while (vcpu->kvm->n_free_mmu_pages < KVM_REFILL_PAGES) { | ||
1200 | struct kvm_mmu_page *page; | ||
1201 | |||
1202 | page = container_of(vcpu->kvm->active_mmu_pages.prev, | ||
1203 | struct kvm_mmu_page, link); | ||
1204 | kvm_mmu_zap_page(vcpu->kvm, page); | ||
1205 | } | ||
1206 | } | ||
1207 | |||
1208 | static void free_mmu_pages(struct kvm_vcpu *vcpu) | ||
1209 | { | ||
1210 | struct kvm_mmu_page *page; | ||
1211 | |||
1212 | while (!list_empty(&vcpu->kvm->active_mmu_pages)) { | ||
1213 | page = container_of(vcpu->kvm->active_mmu_pages.next, | ||
1214 | struct kvm_mmu_page, link); | ||
1215 | kvm_mmu_zap_page(vcpu->kvm, page); | ||
1216 | } | ||
1217 | free_page((unsigned long)vcpu->mmu.pae_root); | ||
1218 | } | ||
1219 | |||
1220 | static int alloc_mmu_pages(struct kvm_vcpu *vcpu) | ||
1221 | { | ||
1222 | struct page *page; | ||
1223 | int i; | ||
1224 | |||
1225 | ASSERT(vcpu); | ||
1226 | |||
1227 | vcpu->kvm->n_free_mmu_pages = KVM_NUM_MMU_PAGES; | ||
1228 | |||
1229 | /* | ||
1230 | * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64. | ||
1231 | * Therefore we need to allocate shadow page tables in the first | ||
1232 | * 4GB of memory, which happens to fit the DMA32 zone. | ||
1233 | */ | ||
1234 | page = alloc_page(GFP_KERNEL | __GFP_DMA32); | ||
1235 | if (!page) | ||
1236 | goto error_1; | ||
1237 | vcpu->mmu.pae_root = page_address(page); | ||
1238 | for (i = 0; i < 4; ++i) | ||
1239 | vcpu->mmu.pae_root[i] = INVALID_PAGE; | ||
1240 | |||
1241 | return 0; | ||
1242 | |||
1243 | error_1: | ||
1244 | free_mmu_pages(vcpu); | ||
1245 | return -ENOMEM; | ||
1246 | } | ||
1247 | |||
1248 | int kvm_mmu_create(struct kvm_vcpu *vcpu) | ||
1249 | { | ||
1250 | ASSERT(vcpu); | ||
1251 | ASSERT(!VALID_PAGE(vcpu->mmu.root_hpa)); | ||
1252 | |||
1253 | return alloc_mmu_pages(vcpu); | ||
1254 | } | ||
1255 | |||
1256 | int kvm_mmu_setup(struct kvm_vcpu *vcpu) | ||
1257 | { | ||
1258 | ASSERT(vcpu); | ||
1259 | ASSERT(!VALID_PAGE(vcpu->mmu.root_hpa)); | ||
1260 | |||
1261 | return init_kvm_mmu(vcpu); | ||
1262 | } | ||
1263 | |||
1264 | void kvm_mmu_destroy(struct kvm_vcpu *vcpu) | ||
1265 | { | ||
1266 | ASSERT(vcpu); | ||
1267 | |||
1268 | destroy_kvm_mmu(vcpu); | ||
1269 | free_mmu_pages(vcpu); | ||
1270 | mmu_free_memory_caches(vcpu); | ||
1271 | } | ||
1272 | |||
1273 | void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) | ||
1274 | { | ||
1275 | struct kvm_mmu_page *page; | ||
1276 | |||
1277 | list_for_each_entry(page, &kvm->active_mmu_pages, link) { | ||
1278 | int i; | ||
1279 | u64 *pt; | ||
1280 | |||
1281 | if (!test_bit(slot, &page->slot_bitmap)) | ||
1282 | continue; | ||
1283 | |||
1284 | pt = page->spt; | ||
1285 | for (i = 0; i < PT64_ENT_PER_PAGE; ++i) | ||
1286 | /* avoid RMW */ | ||
1287 | if (pt[i] & PT_WRITABLE_MASK) { | ||
1288 | rmap_remove(&pt[i]); | ||
1289 | pt[i] &= ~PT_WRITABLE_MASK; | ||
1290 | } | ||
1291 | } | ||
1292 | } | ||
1293 | |||
1294 | void kvm_mmu_zap_all(struct kvm *kvm) | ||
1295 | { | ||
1296 | struct kvm_mmu_page *page, *node; | ||
1297 | |||
1298 | list_for_each_entry_safe(page, node, &kvm->active_mmu_pages, link) | ||
1299 | kvm_mmu_zap_page(kvm, page); | ||
1300 | |||
1301 | kvm_flush_remote_tlbs(kvm); | ||
1302 | } | ||
1303 | |||
1304 | void kvm_mmu_module_exit(void) | ||
1305 | { | ||
1306 | if (pte_chain_cache) | ||
1307 | kmem_cache_destroy(pte_chain_cache); | ||
1308 | if (rmap_desc_cache) | ||
1309 | kmem_cache_destroy(rmap_desc_cache); | ||
1310 | if (mmu_page_header_cache) | ||
1311 | kmem_cache_destroy(mmu_page_header_cache); | ||
1312 | } | ||
1313 | |||
1314 | int kvm_mmu_module_init(void) | ||
1315 | { | ||
1316 | pte_chain_cache = kmem_cache_create("kvm_pte_chain", | ||
1317 | sizeof(struct kvm_pte_chain), | ||
1318 | 0, 0, NULL); | ||
1319 | if (!pte_chain_cache) | ||
1320 | goto nomem; | ||
1321 | rmap_desc_cache = kmem_cache_create("kvm_rmap_desc", | ||
1322 | sizeof(struct kvm_rmap_desc), | ||
1323 | 0, 0, NULL); | ||
1324 | if (!rmap_desc_cache) | ||
1325 | goto nomem; | ||
1326 | |||
1327 | mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header", | ||
1328 | sizeof(struct kvm_mmu_page), | ||
1329 | 0, 0, NULL); | ||
1330 | if (!mmu_page_header_cache) | ||
1331 | goto nomem; | ||
1332 | |||
1333 | return 0; | ||
1334 | |||
1335 | nomem: | ||
1336 | kvm_mmu_module_exit(); | ||
1337 | return -ENOMEM; | ||
1338 | } | ||
1339 | |||
1340 | #ifdef AUDIT | ||
1341 | |||
1342 | static const char *audit_msg; | ||
1343 | |||
1344 | static gva_t canonicalize(gva_t gva) | ||
1345 | { | ||
1346 | #ifdef CONFIG_X86_64 | ||
1347 | gva = (long long)(gva << 16) >> 16; | ||
1348 | #endif | ||
1349 | return gva; | ||
1350 | } | ||
1351 | |||
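canonicalize() above sign-extends bit 47 of a virtual address into the upper 16 bits, which is what "canonical" means on x86_64. A minimal sketch with an example value:

/* Sketch of the sign extension done by canonicalize() above. */
#include <stdio.h>
#include <stdint.h>

static uint64_t canonicalize(uint64_t gva)
{
        return (uint64_t)(((int64_t)(gva << 16)) >> 16);
}

int main(void)
{
        /* Bit 47 is set, so bits 48-63 are filled with ones. */
        printf("%#llx\n", (unsigned long long)canonicalize(0x0000800000000000ull));
        /* prints 0xffff800000000000 */
        return 0;
}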
1352 | static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte, | ||
1353 | gva_t va, int level) | ||
1354 | { | ||
1355 | u64 *pt = __va(page_pte & PT64_BASE_ADDR_MASK); | ||
1356 | int i; | ||
1357 | gva_t va_delta = 1ul << (PAGE_SHIFT + 9 * (level - 1)); | ||
1358 | |||
1359 | for (i = 0; i < PT64_ENT_PER_PAGE; ++i, va += va_delta) { | ||
1360 | u64 ent = pt[i]; | ||
1361 | |||
1362 | if (!(ent & PT_PRESENT_MASK)) | ||
1363 | continue; | ||
1364 | |||
1365 | va = canonicalize(va); | ||
1366 | if (level > 1) | ||
1367 | audit_mappings_page(vcpu, ent, va, level - 1); | ||
1368 | else { | ||
1369 | gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, va); | ||
1370 | hpa_t hpa = gpa_to_hpa(vcpu, gpa); | ||
1371 | |||
1372 | if ((ent & PT_PRESENT_MASK) | ||
1373 | && (ent & PT64_BASE_ADDR_MASK) != hpa) | ||
1374 | printk(KERN_ERR "audit error: (%s) levels %d" | ||
1375 | " gva %lx gpa %llx hpa %llx ent %llx\n", | ||
1376 | audit_msg, vcpu->mmu.root_level, | ||
1377 | va, gpa, hpa, ent); | ||
1378 | } | ||
1379 | } | ||
1380 | } | ||
1381 | |||
1382 | static void audit_mappings(struct kvm_vcpu *vcpu) | ||
1383 | { | ||
1384 | unsigned i; | ||
1385 | |||
1386 | if (vcpu->mmu.root_level == 4) | ||
1387 | audit_mappings_page(vcpu, vcpu->mmu.root_hpa, 0, 4); | ||
1388 | else | ||
1389 | for (i = 0; i < 4; ++i) | ||
1390 | if (vcpu->mmu.pae_root[i] & PT_PRESENT_MASK) | ||
1391 | audit_mappings_page(vcpu, | ||
1392 | vcpu->mmu.pae_root[i], | ||
1393 | i << 30, | ||
1394 | 2); | ||
1395 | } | ||
1396 | |||
1397 | static int count_rmaps(struct kvm_vcpu *vcpu) | ||
1398 | { | ||
1399 | int nmaps = 0; | ||
1400 | int i, j, k; | ||
1401 | |||
1402 | for (i = 0; i < KVM_MEMORY_SLOTS; ++i) { | ||
1403 | struct kvm_memory_slot *m = &vcpu->kvm->memslots[i]; | ||
1404 | struct kvm_rmap_desc *d; | ||
1405 | |||
1406 | for (j = 0; j < m->npages; ++j) { | ||
1407 | struct page *page = m->phys_mem[j]; | ||
1408 | |||
1409 | if (!page->private) | ||
1410 | continue; | ||
1411 | if (!(page->private & 1)) { | ||
1412 | ++nmaps; | ||
1413 | continue; | ||
1414 | } | ||
1415 | d = (struct kvm_rmap_desc *)(page->private & ~1ul); | ||
1416 | while (d) { | ||
1417 | for (k = 0; k < RMAP_EXT; ++k) | ||
1418 | if (d->shadow_ptes[k]) | ||
1419 | ++nmaps; | ||
1420 | else | ||
1421 | break; | ||
1422 | d = d->more; | ||
1423 | } | ||
1424 | } | ||
1425 | } | ||
1426 | return nmaps; | ||
1427 | } | ||
1428 | |||
1429 | static int count_writable_mappings(struct kvm_vcpu *vcpu) | ||
1430 | { | ||
1431 | int nmaps = 0; | ||
1432 | struct kvm_mmu_page *page; | ||
1433 | int i; | ||
1434 | |||
1435 | list_for_each_entry(page, &vcpu->kvm->active_mmu_pages, link) { | ||
1436 | u64 *pt = page->spt; | ||
1437 | |||
1438 | if (page->role.level != PT_PAGE_TABLE_LEVEL) | ||
1439 | continue; | ||
1440 | |||
1441 | for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { | ||
1442 | u64 ent = pt[i]; | ||
1443 | |||
1444 | if (!(ent & PT_PRESENT_MASK)) | ||
1445 | continue; | ||
1446 | if (!(ent & PT_WRITABLE_MASK)) | ||
1447 | continue; | ||
1448 | ++nmaps; | ||
1449 | } | ||
1450 | } | ||
1451 | return nmaps; | ||
1452 | } | ||
1453 | |||
1454 | static void audit_rmap(struct kvm_vcpu *vcpu) | ||
1455 | { | ||
1456 | int n_rmap = count_rmaps(vcpu); | ||
1457 | int n_actual = count_writable_mappings(vcpu); | ||
1458 | |||
1459 | if (n_rmap != n_actual) | ||
1460 | printk(KERN_ERR "%s: (%s) rmap %d actual %d\n", | ||
1461 | __FUNCTION__, audit_msg, n_rmap, n_actual); | ||
1462 | } | ||
1463 | |||
1464 | static void audit_write_protection(struct kvm_vcpu *vcpu) | ||
1465 | { | ||
1466 | struct kvm_mmu_page *page; | ||
1467 | |||
1468 | list_for_each_entry(page, &vcpu->kvm->active_mmu_pages, link) { | ||
1469 | hfn_t hfn; | ||
1470 | struct page *pg; | ||
1471 | |||
1472 | if (page->role.metaphysical) | ||
1473 | continue; | ||
1474 | |||
1475 | hfn = gpa_to_hpa(vcpu, (gpa_t)page->gfn << PAGE_SHIFT) | ||
1476 | >> PAGE_SHIFT; | ||
1477 | pg = pfn_to_page(hfn); | ||
1478 | if (pg->private) | ||
1479 | printk(KERN_ERR "%s: (%s) shadow page has writable" | ||
1480 | " mappings: gfn %lx role %x\n", | ||
1481 | __FUNCTION__, audit_msg, page->gfn, | ||
1482 | page->role.word); | ||
1483 | } | ||
1484 | } | ||
1485 | |||
1486 | static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) | ||
1487 | { | ||
1488 | int olddbg = dbg; | ||
1489 | |||
1490 | dbg = 0; | ||
1491 | audit_msg = msg; | ||
1492 | audit_rmap(vcpu); | ||
1493 | audit_write_protection(vcpu); | ||
1494 | audit_mappings(vcpu); | ||
1495 | dbg = olddbg; | ||
1496 | } | ||
1497 | |||
1498 | #endif | ||
diff --git a/drivers/kvm/paging_tmpl.h b/drivers/kvm/paging_tmpl.h deleted file mode 100644 index 6b094b44f8fb..000000000000 --- a/drivers/kvm/paging_tmpl.h +++ /dev/null | |||
@@ -1,511 +0,0 @@ | |||
1 | /* | ||
2 | * Kernel-based Virtual Machine driver for Linux | ||
3 | * | ||
4 | * This module enables machines with Intel VT-x extensions to run virtual | ||
5 | * machines without emulation or binary translation. | ||
6 | * | ||
7 | * MMU support | ||
8 | * | ||
9 | * Copyright (C) 2006 Qumranet, Inc. | ||
10 | * | ||
11 | * Authors: | ||
12 | * Yaniv Kamay <yaniv@qumranet.com> | ||
13 | * Avi Kivity <avi@qumranet.com> | ||
14 | * | ||
15 | * This work is licensed under the terms of the GNU GPL, version 2. See | ||
16 | * the COPYING file in the top-level directory. | ||
17 | * | ||
18 | */ | ||
19 | |||
20 | /* | ||
21 | * We need the mmu code to access both 32-bit and 64-bit guest ptes, | ||
22 | * so the code in this file is compiled twice, once per pte size. | ||
23 | */ | ||
24 | |||
25 | #if PTTYPE == 64 | ||
26 | #define pt_element_t u64 | ||
27 | #define guest_walker guest_walker64 | ||
28 | #define FNAME(name) paging##64_##name | ||
29 | #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK | ||
30 | #define PT_DIR_BASE_ADDR_MASK PT64_DIR_BASE_ADDR_MASK | ||
31 | #define PT_INDEX(addr, level) PT64_INDEX(addr, level) | ||
32 | #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) | ||
33 | #define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level) | ||
34 | #ifdef CONFIG_X86_64 | ||
35 | #define PT_MAX_FULL_LEVELS 4 | ||
36 | #else | ||
37 | #define PT_MAX_FULL_LEVELS 2 | ||
38 | #endif | ||
39 | #elif PTTYPE == 32 | ||
40 | #define pt_element_t u32 | ||
41 | #define guest_walker guest_walker32 | ||
42 | #define FNAME(name) paging##32_##name | ||
43 | #define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK | ||
44 | #define PT_DIR_BASE_ADDR_MASK PT32_DIR_BASE_ADDR_MASK | ||
45 | #define PT_INDEX(addr, level) PT32_INDEX(addr, level) | ||
46 | #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) | ||
47 | #define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level) | ||
48 | #define PT_MAX_FULL_LEVELS 2 | ||
49 | #else | ||
50 | #error Invalid PTTYPE value | ||
51 | #endif | ||
52 | |||
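The macro block above lets mmu.c include this file twice, once with PTTYPE 64 and once with PTTYPE 32, so FNAME() stamps out a paging64_* and a paging32_* copy of every function. A minimal stand-alone illustration of the same token-pasting pattern, with an invented function body:

/* Sketch of the instantiate-per-pte-width trick used by paging_tmpl.h. */
#include <stdio.h>
#include <stdint.h>

#define pt_element_t uint64_t
#define FNAME(name) paging64_##name
static pt_element_t FNAME(read_pte)(pt_element_t e) { return e; }
#undef FNAME
#undef pt_element_t

#define pt_element_t uint32_t
#define FNAME(name) paging32_##name
static pt_element_t FNAME(read_pte)(pt_element_t e) { return e; }
#undef FNAME
#undef pt_element_t

int main(void)
{
        printf("%llu %u\n",
               (unsigned long long)paging64_read_pte(1ull << 40),
               paging32_read_pte(1u << 20));
        return 0;
}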
53 | /* | ||
54 | * The guest_walker structure emulates the behavior of the hardware page | ||
55 | * table walker. | ||
56 | */ | ||
57 | struct guest_walker { | ||
58 | int level; | ||
59 | gfn_t table_gfn[PT_MAX_FULL_LEVELS]; | ||
60 | pt_element_t *table; | ||
61 | pt_element_t pte; | ||
62 | pt_element_t *ptep; | ||
63 | struct page *page; | ||
64 | int index; | ||
65 | pt_element_t inherited_ar; | ||
66 | gfn_t gfn; | ||
67 | u32 error_code; | ||
68 | }; | ||
69 | |||
70 | /* | ||
71 | * Fetch a guest pte for a guest virtual address | ||
72 | */ | ||
73 | static int FNAME(walk_addr)(struct guest_walker *walker, | ||
74 | struct kvm_vcpu *vcpu, gva_t addr, | ||
75 | int write_fault, int user_fault, int fetch_fault) | ||
76 | { | ||
77 | hpa_t hpa; | ||
78 | struct kvm_memory_slot *slot; | ||
79 | pt_element_t *ptep; | ||
80 | pt_element_t root; | ||
81 | gfn_t table_gfn; | ||
82 | |||
83 | pgprintk("%s: addr %lx\n", __FUNCTION__, addr); | ||
84 | walker->level = vcpu->mmu.root_level; | ||
85 | walker->table = NULL; | ||
86 | walker->page = NULL; | ||
87 | walker->ptep = NULL; | ||
88 | root = vcpu->cr3; | ||
89 | #if PTTYPE == 64 | ||
90 | if (!is_long_mode(vcpu)) { | ||
91 | walker->ptep = &vcpu->pdptrs[(addr >> 30) & 3]; | ||
92 | root = *walker->ptep; | ||
93 | walker->pte = root; | ||
94 | if (!(root & PT_PRESENT_MASK)) | ||
95 | goto not_present; | ||
96 | --walker->level; | ||
97 | } | ||
98 | #endif | ||
99 | table_gfn = (root & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; | ||
100 | walker->table_gfn[walker->level - 1] = table_gfn; | ||
101 | pgprintk("%s: table_gfn[%d] %lx\n", __FUNCTION__, | ||
102 | walker->level - 1, table_gfn); | ||
103 | slot = gfn_to_memslot(vcpu->kvm, table_gfn); | ||
104 | hpa = safe_gpa_to_hpa(vcpu, root & PT64_BASE_ADDR_MASK); | ||
105 | walker->page = pfn_to_page(hpa >> PAGE_SHIFT); | ||
106 | walker->table = kmap_atomic(walker->page, KM_USER0); | ||
107 | |||
108 | ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) || | ||
109 | (vcpu->cr3 & CR3_NONPAE_RESERVED_BITS) == 0); | ||
110 | |||
111 | walker->inherited_ar = PT_USER_MASK | PT_WRITABLE_MASK; | ||
112 | |||
113 | for (;;) { | ||
114 | int index = PT_INDEX(addr, walker->level); | ||
115 | hpa_t paddr; | ||
116 | |||
117 | ptep = &walker->table[index]; | ||
118 | walker->index = index; | ||
119 | ASSERT(((unsigned long)walker->table & PAGE_MASK) == | ||
120 | ((unsigned long)ptep & PAGE_MASK)); | ||
121 | |||
122 | if (!is_present_pte(*ptep)) | ||
123 | goto not_present; | ||
124 | |||
125 | if (write_fault && !is_writeble_pte(*ptep)) | ||
126 | if (user_fault || is_write_protection(vcpu)) | ||
127 | goto access_error; | ||
128 | |||
129 | if (user_fault && !(*ptep & PT_USER_MASK)) | ||
130 | goto access_error; | ||
131 | |||
132 | #if PTTYPE == 64 | ||
133 | if (fetch_fault && is_nx(vcpu) && (*ptep & PT64_NX_MASK)) | ||
134 | goto access_error; | ||
135 | #endif | ||
136 | |||
137 | if (!(*ptep & PT_ACCESSED_MASK)) { | ||
138 | mark_page_dirty(vcpu->kvm, table_gfn); | ||
139 | *ptep |= PT_ACCESSED_MASK; | ||
140 | } | ||
141 | |||
142 | if (walker->level == PT_PAGE_TABLE_LEVEL) { | ||
143 | walker->gfn = (*ptep & PT_BASE_ADDR_MASK) | ||
144 | >> PAGE_SHIFT; | ||
145 | break; | ||
146 | } | ||
147 | |||
148 | if (walker->level == PT_DIRECTORY_LEVEL | ||
149 | && (*ptep & PT_PAGE_SIZE_MASK) | ||
150 | && (PTTYPE == 64 || is_pse(vcpu))) { | ||
151 | walker->gfn = (*ptep & PT_DIR_BASE_ADDR_MASK) | ||
152 | >> PAGE_SHIFT; | ||
153 | walker->gfn += PT_INDEX(addr, PT_PAGE_TABLE_LEVEL); | ||
154 | break; | ||
155 | } | ||
156 | |||
157 | walker->inherited_ar &= walker->table[index]; | ||
158 | table_gfn = (*ptep & PT_BASE_ADDR_MASK) >> PAGE_SHIFT; | ||
159 | kunmap_atomic(walker->table, KM_USER0); | ||
160 | paddr = safe_gpa_to_hpa(vcpu, table_gfn << PAGE_SHIFT); | ||
161 | walker->page = pfn_to_page(paddr >> PAGE_SHIFT); | ||
162 | walker->table = kmap_atomic(walker->page, KM_USER0); | ||
163 | --walker->level; | ||
164 | walker->table_gfn[walker->level - 1] = table_gfn; | ||
165 | pgprintk("%s: table_gfn[%d] %lx\n", __FUNCTION__, | ||
166 | walker->level - 1, table_gfn); | ||
167 | } | ||
168 | walker->pte = *ptep; | ||
169 | if (walker->page) | ||
170 | walker->ptep = NULL; | ||
171 | if (walker->table) | ||
172 | kunmap_atomic(walker->table, KM_USER0); | ||
173 | pgprintk("%s: pte %llx\n", __FUNCTION__, (u64)*ptep); | ||
174 | return 1; | ||
175 | |||
176 | not_present: | ||
177 | walker->error_code = 0; | ||
178 | goto err; | ||
179 | |||
180 | access_error: | ||
181 | walker->error_code = PFERR_PRESENT_MASK; | ||
182 | |||
183 | err: | ||
184 | if (write_fault) | ||
185 | walker->error_code |= PFERR_WRITE_MASK; | ||
186 | if (user_fault) | ||
187 | walker->error_code |= PFERR_USER_MASK; | ||
188 | if (fetch_fault) | ||
189 | walker->error_code |= PFERR_FETCH_MASK; | ||
190 | if (walker->table) | ||
191 | kunmap_atomic(walker->table, KM_USER0); | ||
192 | return 0; | ||
193 | } | ||
194 | |||
195 | static void FNAME(mark_pagetable_dirty)(struct kvm *kvm, | ||
196 | struct guest_walker *walker) | ||
197 | { | ||
198 | mark_page_dirty(kvm, walker->table_gfn[walker->level - 1]); | ||
199 | } | ||
200 | |||
201 | static void FNAME(set_pte_common)(struct kvm_vcpu *vcpu, | ||
202 | u64 *shadow_pte, | ||
203 | gpa_t gaddr, | ||
204 | pt_element_t gpte, | ||
205 | u64 access_bits, | ||
206 | int user_fault, | ||
207 | int write_fault, | ||
208 | int *ptwrite, | ||
209 | struct guest_walker *walker, | ||
210 | gfn_t gfn) | ||
211 | { | ||
212 | hpa_t paddr; | ||
213 | int dirty = gpte & PT_DIRTY_MASK; | ||
214 | u64 spte = *shadow_pte; | ||
215 | int was_rmapped = is_rmap_pte(spte); | ||
216 | |||
217 | pgprintk("%s: spte %llx gpte %llx access %llx write_fault %d" | ||
218 | " user_fault %d gfn %lx\n", | ||
219 | __FUNCTION__, spte, (u64)gpte, access_bits, | ||
220 | write_fault, user_fault, gfn); | ||
221 | |||
222 | if (write_fault && !dirty) { | ||
223 | pt_element_t *guest_ent, *tmp = NULL; | ||
224 | |||
225 | if (walker->ptep) | ||
226 | guest_ent = walker->ptep; | ||
227 | else { | ||
228 | tmp = kmap_atomic(walker->page, KM_USER0); | ||
229 | guest_ent = &tmp[walker->index]; | ||
230 | } | ||
231 | |||
232 | *guest_ent |= PT_DIRTY_MASK; | ||
233 | if (!walker->ptep) | ||
234 | kunmap_atomic(tmp, KM_USER0); | ||
235 | dirty = 1; | ||
236 | FNAME(mark_pagetable_dirty)(vcpu->kvm, walker); | ||
237 | } | ||
238 | |||
239 | spte |= PT_PRESENT_MASK | PT_ACCESSED_MASK | PT_DIRTY_MASK; | ||
240 | spte |= gpte & PT64_NX_MASK; | ||
241 | if (!dirty) | ||
242 | access_bits &= ~PT_WRITABLE_MASK; | ||
243 | |||
244 | paddr = gpa_to_hpa(vcpu, gaddr & PT64_BASE_ADDR_MASK); | ||
245 | |||
246 | spte |= PT_PRESENT_MASK; | ||
247 | if (access_bits & PT_USER_MASK) | ||
248 | spte |= PT_USER_MASK; | ||
249 | |||
250 | if (is_error_hpa(paddr)) { | ||
251 | spte |= gaddr; | ||
252 | spte |= PT_SHADOW_IO_MARK; | ||
253 | spte &= ~PT_PRESENT_MASK; | ||
254 | set_shadow_pte(shadow_pte, spte); | ||
255 | return; | ||
256 | } | ||
257 | |||
258 | spte |= paddr; | ||
259 | |||
260 | if ((access_bits & PT_WRITABLE_MASK) | ||
261 | || (write_fault && !is_write_protection(vcpu) && !user_fault)) { | ||
262 | struct kvm_mmu_page *shadow; | ||
263 | |||
264 | spte |= PT_WRITABLE_MASK; | ||
265 | if (user_fault) { | ||
266 | mmu_unshadow(vcpu, gfn); | ||
267 | goto unshadowed; | ||
268 | } | ||
269 | |||
270 | shadow = kvm_mmu_lookup_page(vcpu, gfn); | ||
271 | if (shadow) { | ||
272 | pgprintk("%s: found shadow page for %lx, marking ro\n", | ||
273 | __FUNCTION__, gfn); | ||
274 | access_bits &= ~PT_WRITABLE_MASK; | ||
275 | if (is_writeble_pte(spte)) { | ||
276 | spte &= ~PT_WRITABLE_MASK; | ||
277 | kvm_x86_ops->tlb_flush(vcpu); | ||
278 | } | ||
279 | if (write_fault) | ||
280 | *ptwrite = 1; | ||
281 | } | ||
282 | } | ||
283 | |||
284 | unshadowed: | ||
285 | |||
286 | if (access_bits & PT_WRITABLE_MASK) | ||
287 | mark_page_dirty(vcpu->kvm, gaddr >> PAGE_SHIFT); | ||
288 | |||
289 | set_shadow_pte(shadow_pte, spte); | ||
290 | page_header_update_slot(vcpu->kvm, shadow_pte, gaddr); | ||
291 | if (!was_rmapped) | ||
292 | rmap_add(vcpu, shadow_pte); | ||
293 | } | ||
294 | |||
295 | static void FNAME(set_pte)(struct kvm_vcpu *vcpu, pt_element_t gpte, | ||
296 | u64 *shadow_pte, u64 access_bits, | ||
297 | int user_fault, int write_fault, int *ptwrite, | ||
298 | struct guest_walker *walker, gfn_t gfn) | ||
299 | { | ||
300 | access_bits &= gpte; | ||
301 | FNAME(set_pte_common)(vcpu, shadow_pte, gpte & PT_BASE_ADDR_MASK, | ||
302 | gpte, access_bits, user_fault, write_fault, | ||
303 | ptwrite, walker, gfn); | ||
304 | } | ||
305 | |||
306 | static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, | ||
307 | u64 *spte, const void *pte, int bytes) | ||
308 | { | ||
309 | pt_element_t gpte; | ||
310 | |||
311 | if (bytes < sizeof(pt_element_t)) | ||
312 | return; | ||
313 | gpte = *(const pt_element_t *)pte; | ||
314 | if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) | ||
315 | return; | ||
316 | pgprintk("%s: gpte %llx spte %p\n", __FUNCTION__, (u64)gpte, spte); | ||
317 | FNAME(set_pte)(vcpu, gpte, spte, PT_USER_MASK | PT_WRITABLE_MASK, 0, | ||
318 | 0, NULL, NULL, | ||
319 | (gpte & PT_BASE_ADDR_MASK) >> PAGE_SHIFT); | ||
320 | } | ||
321 | |||
322 | static void FNAME(set_pde)(struct kvm_vcpu *vcpu, pt_element_t gpde, | ||
323 | u64 *shadow_pte, u64 access_bits, | ||
324 | int user_fault, int write_fault, int *ptwrite, | ||
325 | struct guest_walker *walker, gfn_t gfn) | ||
326 | { | ||
327 | gpa_t gaddr; | ||
328 | |||
329 | access_bits &= gpde; | ||
330 | gaddr = (gpa_t)gfn << PAGE_SHIFT; | ||
331 | if (PTTYPE == 32 && is_cpuid_PSE36()) | ||
332 | gaddr |= (gpde & PT32_DIR_PSE36_MASK) << | ||
333 | (32 - PT32_DIR_PSE36_SHIFT); | ||
334 | FNAME(set_pte_common)(vcpu, shadow_pte, gaddr, | ||
335 | gpde, access_bits, user_fault, write_fault, | ||
336 | ptwrite, walker, gfn); | ||
337 | } | ||
338 | |||
339 | /* | ||
340 | * Fetch a shadow pte for a specific level in the paging hierarchy. | ||
341 | */ | ||
342 | static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, | ||
343 | struct guest_walker *walker, | ||
344 | int user_fault, int write_fault, int *ptwrite) | ||
345 | { | ||
346 | hpa_t shadow_addr; | ||
347 | int level; | ||
348 | u64 *shadow_ent; | ||
349 | u64 *prev_shadow_ent = NULL; | ||
350 | |||
351 | if (!is_present_pte(walker->pte)) | ||
352 | return NULL; | ||
353 | |||
354 | shadow_addr = vcpu->mmu.root_hpa; | ||
355 | level = vcpu->mmu.shadow_root_level; | ||
356 | if (level == PT32E_ROOT_LEVEL) { | ||
357 | shadow_addr = vcpu->mmu.pae_root[(addr >> 30) & 3]; | ||
358 | shadow_addr &= PT64_BASE_ADDR_MASK; | ||
359 | --level; | ||
360 | } | ||
361 | |||
362 | for (; ; level--) { | ||
363 | u32 index = SHADOW_PT_INDEX(addr, level); | ||
364 | struct kvm_mmu_page *shadow_page; | ||
365 | u64 shadow_pte; | ||
366 | int metaphysical; | ||
367 | gfn_t table_gfn; | ||
368 | unsigned hugepage_access = 0; | ||
369 | |||
370 | shadow_ent = ((u64 *)__va(shadow_addr)) + index; | ||
371 | if (is_present_pte(*shadow_ent) || is_io_pte(*shadow_ent)) { | ||
372 | if (level == PT_PAGE_TABLE_LEVEL) | ||
373 | break; | ||
374 | shadow_addr = *shadow_ent & PT64_BASE_ADDR_MASK; | ||
375 | prev_shadow_ent = shadow_ent; | ||
376 | continue; | ||
377 | } | ||
378 | |||
379 | if (level == PT_PAGE_TABLE_LEVEL) | ||
380 | break; | ||
381 | |||
382 | if (level - 1 == PT_PAGE_TABLE_LEVEL | ||
383 | && walker->level == PT_DIRECTORY_LEVEL) { | ||
384 | metaphysical = 1; | ||
385 | hugepage_access = walker->pte; | ||
386 | hugepage_access &= PT_USER_MASK | PT_WRITABLE_MASK; | ||
387 | if (walker->pte & PT64_NX_MASK) | ||
388 | hugepage_access |= (1 << 2); | ||
389 | hugepage_access >>= PT_WRITABLE_SHIFT; | ||
390 | table_gfn = (walker->pte & PT_BASE_ADDR_MASK) | ||
391 | >> PAGE_SHIFT; | ||
392 | } else { | ||
393 | metaphysical = 0; | ||
394 | table_gfn = walker->table_gfn[level - 2]; | ||
395 | } | ||
396 | shadow_page = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1, | ||
397 | metaphysical, hugepage_access, | ||
398 | shadow_ent); | ||
399 | shadow_addr = __pa(shadow_page->spt); | ||
400 | shadow_pte = shadow_addr | PT_PRESENT_MASK | PT_ACCESSED_MASK | ||
401 | | PT_WRITABLE_MASK | PT_USER_MASK; | ||
402 | *shadow_ent = shadow_pte; | ||
403 | prev_shadow_ent = shadow_ent; | ||
404 | } | ||
405 | |||
406 | if (walker->level == PT_DIRECTORY_LEVEL) { | ||
407 | FNAME(set_pde)(vcpu, walker->pte, shadow_ent, | ||
408 | walker->inherited_ar, user_fault, write_fault, | ||
409 | ptwrite, walker, walker->gfn); | ||
410 | } else { | ||
411 | ASSERT(walker->level == PT_PAGE_TABLE_LEVEL); | ||
412 | FNAME(set_pte)(vcpu, walker->pte, shadow_ent, | ||
413 | walker->inherited_ar, user_fault, write_fault, | ||
414 | ptwrite, walker, walker->gfn); | ||
415 | } | ||
416 | return shadow_ent; | ||
417 | } | ||
418 | |||
419 | /* | ||
420 | * Page fault handler. There are several causes for a page fault: | ||
421 | * - there is no shadow pte for the guest pte | ||
422 | * - write access through a shadow pte marked read only so that we can set | ||
423 | * the dirty bit | ||
424 | * - write access to a shadow pte marked read only so we can update the page | ||
425 | * dirty bitmap, when userspace requests it | ||
426 | * - mmio access; in this case we will never install a present shadow pte | ||
427 | * - normal guest page fault due to the guest pte marked not present, not | ||
428 | * writable, or not executable | ||
429 | * | ||
430 | * Returns: 1 if we need to emulate the instruction, 0 otherwise, or | ||
431 | * a negative value on error. | ||
432 | */ | ||
433 | static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, | ||
434 | u32 error_code) | ||
435 | { | ||
436 | int write_fault = error_code & PFERR_WRITE_MASK; | ||
437 | int user_fault = error_code & PFERR_USER_MASK; | ||
438 | int fetch_fault = error_code & PFERR_FETCH_MASK; | ||
439 | struct guest_walker walker; | ||
440 | u64 *shadow_pte; | ||
441 | int write_pt = 0; | ||
442 | int r; | ||
443 | |||
444 | pgprintk("%s: addr %lx err %x\n", __FUNCTION__, addr, error_code); | ||
445 | kvm_mmu_audit(vcpu, "pre page fault"); | ||
446 | |||
447 | r = mmu_topup_memory_caches(vcpu); | ||
448 | if (r) | ||
449 | return r; | ||
450 | |||
451 | /* | ||
452 | * Look up the shadow pte for the faulting address. | ||
453 | */ | ||
454 | r = FNAME(walk_addr)(&walker, vcpu, addr, write_fault, user_fault, | ||
455 | fetch_fault); | ||
456 | |||
457 | /* | ||
458 | * The page is not mapped by the guest. Let the guest handle it. | ||
459 | */ | ||
460 | if (!r) { | ||
461 | pgprintk("%s: guest page fault\n", __FUNCTION__); | ||
462 | inject_page_fault(vcpu, addr, walker.error_code); | ||
463 | vcpu->last_pt_write_count = 0; /* reset fork detector */ | ||
464 | return 0; | ||
465 | } | ||
466 | |||
467 | shadow_pte = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault, | ||
468 | &write_pt); | ||
469 | pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __FUNCTION__, | ||
470 | shadow_pte, *shadow_pte, write_pt); | ||
471 | |||
472 | if (!write_pt) | ||
473 | vcpu->last_pt_write_count = 0; /* reset fork detector */ | ||
474 | |||
475 | /* | ||
476 | * mmio: emulate if accessible, otherwise it's a guest fault. | ||
477 | */ | ||
478 | if (is_io_pte(*shadow_pte)) | ||
479 | return 1; | ||
480 | |||
481 | ++vcpu->stat.pf_fixed; | ||
482 | kvm_mmu_audit(vcpu, "post page fault (fixed)"); | ||
483 | |||
484 | return write_pt; | ||
485 | } | ||
486 | |||
487 | static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr) | ||
488 | { | ||
489 | struct guest_walker walker; | ||
490 | gpa_t gpa = UNMAPPED_GVA; | ||
491 | int r; | ||
492 | |||
493 | r = FNAME(walk_addr)(&walker, vcpu, vaddr, 0, 0, 0); | ||
494 | |||
495 | if (r) { | ||
496 | gpa = (gpa_t)walker.gfn << PAGE_SHIFT; | ||
497 | gpa |= vaddr & ~PAGE_MASK; | ||
498 | } | ||
499 | |||
500 | return gpa; | ||
501 | } | ||
502 | |||
503 | #undef pt_element_t | ||
504 | #undef guest_walker | ||
505 | #undef FNAME | ||
506 | #undef PT_BASE_ADDR_MASK | ||
507 | #undef PT_INDEX | ||
508 | #undef SHADOW_PT_INDEX | ||
509 | #undef PT_LEVEL_MASK | ||
510 | #undef PT_DIR_BASE_ADDR_MASK | ||
511 | #undef PT_MAX_FULL_LEVELS | ||
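The walker and fetch routines above slice the guest virtual address into one table index per paging level (PT_INDEX/SHADOW_PT_INDEX), nine bits at a time. A minimal stand-alone sketch of that index arithmetic for 4-level, 4 KiB-page x86-64 paging — the constants and helper names below are illustrative, not taken from the kernel sources:

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT    12
#define PT_LEVEL_BITS 9                 /* 512 entries per 4 KiB table */

/* Index into the page table at 'level' (1 = PTE, 4 = PML4) for 'addr'. */
static unsigned pt_index(uint64_t addr, int level)
{
	unsigned shift = PAGE_SHIFT + (level - 1) * PT_LEVEL_BITS;

	return (addr >> shift) & ((1u << PT_LEVEL_BITS) - 1);
}

int main(void)
{
	uint64_t gva = 0x00007f1234567000ull;
	int level;

	for (level = 4; level >= 1; level--)
		printf("level %d index %u\n", level, pt_index(gva, level));
	return 0;
}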
diff --git a/drivers/kvm/segment_descriptor.h b/drivers/kvm/segment_descriptor.h deleted file mode 100644 index 71fdf458619a..000000000000 --- a/drivers/kvm/segment_descriptor.h +++ /dev/null | |||
@@ -1,17 +0,0 @@ | |||
1 | struct segment_descriptor { | ||
2 | u16 limit_low; | ||
3 | u16 base_low; | ||
4 | u8 base_mid; | ||
5 | u8 type : 4; | ||
6 | u8 system : 1; | ||
7 | u8 dpl : 2; | ||
8 | u8 present : 1; | ||
9 | u8 limit_high : 4; | ||
10 | u8 avl : 1; | ||
11 | u8 long_mode : 1; | ||
12 | u8 default_op : 1; | ||
13 | u8 granularity : 1; | ||
14 | u8 base_high; | ||
15 | } __attribute__((packed)); | ||
16 | |||
17 | |||
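The packed layout above scatters the 32-bit segment base and 20-bit limit across several descriptor fields. As a rough, hedged illustration (not kernel code), decoding base and limit from a raw 8-byte GDT descriptor value looks like this:

#include <stdint.h>
#include <stdio.h>

/* Recover the 32-bit base spread across bits 16-39 and 56-63. */
static uint32_t desc_base(uint64_t d)
{
	return ((d >> 16) & 0xffffff) | (((d >> 56) & 0xff) << 24);
}

/* Recover the limit, scaling by 4 KiB when the granularity bit (55) is set. */
static uint32_t desc_limit(uint64_t d)
{
	uint32_t limit = (d & 0xffff) | (((d >> 48) & 0xf) << 16);

	return (d & (1ull << 55)) ? (limit << 12) | 0xfff : limit;
}

int main(void)
{
	uint64_t flat_code = 0x00cf9b000000ffffull;  /* flat 4 GiB code segment */

	printf("base 0x%x limit 0x%x\n", desc_base(flat_code), desc_limit(flat_code));
	return 0;
}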
diff --git a/drivers/kvm/svm.c b/drivers/kvm/svm.c deleted file mode 100644 index ced4ac1955db..000000000000 --- a/drivers/kvm/svm.c +++ /dev/null | |||
@@ -1,1754 +0,0 @@ | |||
1 | /* | ||
2 | * Kernel-based Virtual Machine driver for Linux | ||
3 | * | ||
4 | * AMD SVM support | ||
5 | * | ||
6 | * Copyright (C) 2006 Qumranet, Inc. | ||
7 | * | ||
8 | * Authors: | ||
9 | * Yaniv Kamay <yaniv@qumranet.com> | ||
10 | * Avi Kivity <avi@qumranet.com> | ||
11 | * | ||
12 | * This work is licensed under the terms of the GNU GPL, version 2. See | ||
13 | * the COPYING file in the top-level directory. | ||
14 | * | ||
15 | */ | ||
16 | |||
17 | #include "kvm_svm.h" | ||
18 | #include "x86_emulate.h" | ||
19 | #include "irq.h" | ||
20 | |||
21 | #include <linux/module.h> | ||
22 | #include <linux/kernel.h> | ||
23 | #include <linux/vmalloc.h> | ||
24 | #include <linux/highmem.h> | ||
25 | #include <linux/sched.h> | ||
26 | |||
27 | #include <asm/desc.h> | ||
28 | |||
29 | MODULE_AUTHOR("Qumranet"); | ||
30 | MODULE_LICENSE("GPL"); | ||
31 | |||
32 | #define IOPM_ALLOC_ORDER 2 | ||
33 | #define MSRPM_ALLOC_ORDER 1 | ||
34 | |||
35 | #define DB_VECTOR 1 | ||
36 | #define UD_VECTOR 6 | ||
37 | #define GP_VECTOR 13 | ||
38 | |||
39 | #define DR7_GD_MASK (1 << 13) | ||
40 | #define DR6_BD_MASK (1 << 13) | ||
41 | |||
42 | #define SEG_TYPE_LDT 2 | ||
43 | #define SEG_TYPE_BUSY_TSS16 3 | ||
44 | |||
45 | #define KVM_EFER_LMA (1 << 10) | ||
46 | #define KVM_EFER_LME (1 << 8) | ||
47 | |||
48 | #define SVM_FEATURE_NPT (1 << 0) | ||
49 | #define SVM_FEATURE_LBRV (1 << 1) | ||
50 | #define SVM_FEATURE_SVML (1 << 2) | ||
51 | |||
52 | static void kvm_reput_irq(struct vcpu_svm *svm); | ||
53 | |||
54 | static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu) | ||
55 | { | ||
56 | return container_of(vcpu, struct vcpu_svm, vcpu); | ||
57 | } | ||
58 | |||
59 | unsigned long iopm_base; | ||
60 | unsigned long msrpm_base; | ||
61 | |||
62 | struct kvm_ldttss_desc { | ||
63 | u16 limit0; | ||
64 | u16 base0; | ||
65 | unsigned base1 : 8, type : 5, dpl : 2, p : 1; | ||
66 | unsigned limit1 : 4, zero0 : 3, g : 1, base2 : 8; | ||
67 | u32 base3; | ||
68 | u32 zero1; | ||
69 | } __attribute__((packed)); | ||
70 | |||
71 | struct svm_cpu_data { | ||
72 | int cpu; | ||
73 | |||
74 | u64 asid_generation; | ||
75 | u32 max_asid; | ||
76 | u32 next_asid; | ||
77 | struct kvm_ldttss_desc *tss_desc; | ||
78 | |||
79 | struct page *save_area; | ||
80 | }; | ||
81 | |||
82 | static DEFINE_PER_CPU(struct svm_cpu_data *, svm_data); | ||
83 | static uint32_t svm_features; | ||
84 | |||
85 | struct svm_init_data { | ||
86 | int cpu; | ||
87 | int r; | ||
88 | }; | ||
89 | |||
90 | static u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000}; | ||
91 | |||
92 | #define NUM_MSR_MAPS ARRAY_SIZE(msrpm_ranges) | ||
93 | #define MSRS_RANGE_SIZE 2048 | ||
94 | #define MSRS_IN_RANGE (MSRS_RANGE_SIZE * 8 / 2) | ||
95 | |||
96 | #define MAX_INST_SIZE 15 | ||
97 | |||
98 | static inline u32 svm_has(u32 feat) | ||
99 | { | ||
100 | return svm_features & feat; | ||
101 | } | ||
102 | |||
103 | static inline u8 pop_irq(struct kvm_vcpu *vcpu) | ||
104 | { | ||
105 | int word_index = __ffs(vcpu->irq_summary); | ||
106 | int bit_index = __ffs(vcpu->irq_pending[word_index]); | ||
107 | int irq = word_index * BITS_PER_LONG + bit_index; | ||
108 | |||
109 | clear_bit(bit_index, &vcpu->irq_pending[word_index]); | ||
110 | if (!vcpu->irq_pending[word_index]) | ||
111 | clear_bit(word_index, &vcpu->irq_summary); | ||
112 | return irq; | ||
113 | } | ||
114 | |||
115 | static inline void push_irq(struct kvm_vcpu *vcpu, u8 irq) | ||
116 | { | ||
117 | set_bit(irq, vcpu->irq_pending); | ||
118 | set_bit(irq / BITS_PER_LONG, &vcpu->irq_summary); | ||
119 | } | ||
120 | |||
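pop_irq()/push_irq() above keep pending interrupts in a two-level bitmap: one bit per vector in irq_pending[], plus a summary bit per word so the lowest pending vector can be found with two find-first-set operations. A self-contained sketch of the same idea, using GCC builtins in place of the kernel's set_bit/__ffs helpers (names here are made up for illustration):

#include <stdio.h>
#include <string.h>

#define NR_IRQ_WORDS  4
#define LONG_BITS     (8 * sizeof(unsigned long))

struct irq_state {
	unsigned long summary;                 /* one bit per non-empty word */
	unsigned long pending[NR_IRQ_WORDS];
};

static void queue_irq(struct irq_state *s, unsigned irq)
{
	s->pending[irq / LONG_BITS] |= 1ul << (irq % LONG_BITS);
	s->summary |= 1ul << (irq / LONG_BITS);
}

/* Caller must check that something is pending (summary != 0) first. */
static int dequeue_irq(struct irq_state *s)
{
	int word = __builtin_ctzl(s->summary);
	int bit = __builtin_ctzl(s->pending[word]);

	s->pending[word] &= ~(1ul << bit);
	if (!s->pending[word])
		s->summary &= ~(1ul << word);
	return word * LONG_BITS + bit;
}

int main(void)
{
	struct irq_state s;

	memset(&s, 0, sizeof(s));
	queue_irq(&s, 75);
	queue_irq(&s, 3);
	printf("popped %d then %d\n", dequeue_irq(&s), dequeue_irq(&s));
	return 0;
}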
121 | static inline void clgi(void) | ||
122 | { | ||
123 | asm volatile (SVM_CLGI); | ||
124 | } | ||
125 | |||
126 | static inline void stgi(void) | ||
127 | { | ||
128 | asm volatile (SVM_STGI); | ||
129 | } | ||
130 | |||
131 | static inline void invlpga(unsigned long addr, u32 asid) | ||
132 | { | ||
133 | asm volatile (SVM_INVLPGA :: "a"(addr), "c"(asid)); | ||
134 | } | ||
135 | |||
136 | static inline unsigned long kvm_read_cr2(void) | ||
137 | { | ||
138 | unsigned long cr2; | ||
139 | |||
140 | asm volatile ("mov %%cr2, %0" : "=r" (cr2)); | ||
141 | return cr2; | ||
142 | } | ||
143 | |||
144 | static inline void kvm_write_cr2(unsigned long val) | ||
145 | { | ||
146 | asm volatile ("mov %0, %%cr2" :: "r" (val)); | ||
147 | } | ||
148 | |||
149 | static inline unsigned long read_dr6(void) | ||
150 | { | ||
151 | unsigned long dr6; | ||
152 | |||
153 | asm volatile ("mov %%dr6, %0" : "=r" (dr6)); | ||
154 | return dr6; | ||
155 | } | ||
156 | |||
157 | static inline void write_dr6(unsigned long val) | ||
158 | { | ||
159 | asm volatile ("mov %0, %%dr6" :: "r" (val)); | ||
160 | } | ||
161 | |||
162 | static inline unsigned long read_dr7(void) | ||
163 | { | ||
164 | unsigned long dr7; | ||
165 | |||
166 | asm volatile ("mov %%dr7, %0" : "=r" (dr7)); | ||
167 | return dr7; | ||
168 | } | ||
169 | |||
170 | static inline void write_dr7(unsigned long val) | ||
171 | { | ||
172 | asm volatile ("mov %0, %%dr7" :: "r" (val)); | ||
173 | } | ||
174 | |||
175 | static inline void force_new_asid(struct kvm_vcpu *vcpu) | ||
176 | { | ||
177 | to_svm(vcpu)->asid_generation--; | ||
178 | } | ||
179 | |||
180 | static inline void flush_guest_tlb(struct kvm_vcpu *vcpu) | ||
181 | { | ||
182 | force_new_asid(vcpu); | ||
183 | } | ||
184 | |||
185 | static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer) | ||
186 | { | ||
187 | if (!(efer & KVM_EFER_LMA)) | ||
188 | efer &= ~KVM_EFER_LME; | ||
189 | |||
190 | to_svm(vcpu)->vmcb->save.efer = efer | MSR_EFER_SVME_MASK; | ||
191 | vcpu->shadow_efer = efer; | ||
192 | } | ||
193 | |||
194 | static void svm_inject_gp(struct kvm_vcpu *vcpu, unsigned error_code) | ||
195 | { | ||
196 | struct vcpu_svm *svm = to_svm(vcpu); | ||
197 | |||
198 | svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | | ||
199 | SVM_EVTINJ_VALID_ERR | | ||
200 | SVM_EVTINJ_TYPE_EXEPT | | ||
201 | GP_VECTOR; | ||
202 | svm->vmcb->control.event_inj_err = error_code; | ||
203 | } | ||
204 | |||
205 | static void inject_ud(struct kvm_vcpu *vcpu) | ||
206 | { | ||
207 | to_svm(vcpu)->vmcb->control.event_inj = SVM_EVTINJ_VALID | | ||
208 | SVM_EVTINJ_TYPE_EXEPT | | ||
209 | UD_VECTOR; | ||
210 | } | ||
211 | |||
212 | static int is_page_fault(uint32_t info) | ||
213 | { | ||
214 | info &= SVM_EVTINJ_VEC_MASK | SVM_EVTINJ_TYPE_MASK | SVM_EVTINJ_VALID; | ||
215 | return info == (PF_VECTOR | SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_EXEPT); | ||
216 | } | ||
217 | |||
218 | static int is_external_interrupt(u32 info) | ||
219 | { | ||
220 | info &= SVM_EVTINJ_TYPE_MASK | SVM_EVTINJ_VALID; | ||
221 | return info == (SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR); | ||
222 | } | ||
223 | |||
224 | static void skip_emulated_instruction(struct kvm_vcpu *vcpu) | ||
225 | { | ||
226 | struct vcpu_svm *svm = to_svm(vcpu); | ||
227 | |||
228 | if (!svm->next_rip) { | ||
229 | printk(KERN_DEBUG "%s: NOP\n", __FUNCTION__); | ||
230 | return; | ||
231 | } | ||
232 | if (svm->next_rip - svm->vmcb->save.rip > MAX_INST_SIZE) { | ||
233 | printk(KERN_ERR "%s: ip 0x%llx next 0x%llx\n", | ||
234 | __FUNCTION__, | ||
235 | svm->vmcb->save.rip, | ||
236 | svm->next_rip); | ||
237 | } | ||
238 | |||
239 | vcpu->rip = svm->vmcb->save.rip = svm->next_rip; | ||
240 | svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK; | ||
241 | |||
242 | vcpu->interrupt_window_open = 1; | ||
243 | } | ||
244 | |||
245 | static int has_svm(void) | ||
246 | { | ||
247 | uint32_t eax, ebx, ecx, edx; | ||
248 | |||
249 | if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) { | ||
250 | printk(KERN_INFO "has_svm: not amd\n"); | ||
251 | return 0; | ||
252 | } | ||
253 | |||
254 | cpuid(0x80000000, &eax, &ebx, &ecx, &edx); | ||
255 | if (eax < SVM_CPUID_FUNC) { | ||
256 | printk(KERN_INFO "has_svm: can't execute cpuid_8000000a\n"); | ||
257 | return 0; | ||
258 | } | ||
259 | |||
260 | cpuid(0x80000001, &eax, &ebx, &ecx, &edx); | ||
261 | if (!(ecx & (1 << SVM_CPUID_FEATURE_SHIFT))) { | ||
262 | printk(KERN_DEBUG "has_svm: svm not available\n"); | ||
263 | return 0; | ||
264 | } | ||
265 | return 1; | ||
266 | } | ||
267 | |||
268 | static void svm_hardware_disable(void *garbage) | ||
269 | { | ||
270 | struct svm_cpu_data *svm_data | ||
271 | = per_cpu(svm_data, raw_smp_processor_id()); | ||
272 | |||
273 | if (svm_data) { | ||
274 | uint64_t efer; | ||
275 | |||
276 | wrmsrl(MSR_VM_HSAVE_PA, 0); | ||
277 | rdmsrl(MSR_EFER, efer); | ||
278 | wrmsrl(MSR_EFER, efer & ~MSR_EFER_SVME_MASK); | ||
279 | per_cpu(svm_data, raw_smp_processor_id()) = NULL; | ||
280 | __free_page(svm_data->save_area); | ||
281 | kfree(svm_data); | ||
282 | } | ||
283 | } | ||
284 | |||
285 | static void svm_hardware_enable(void *garbage) | ||
286 | { | ||
287 | |||
288 | struct svm_cpu_data *svm_data; | ||
289 | uint64_t efer; | ||
290 | #ifdef CONFIG_X86_64 | ||
291 | struct desc_ptr gdt_descr; | ||
292 | #else | ||
293 | struct desc_ptr gdt_descr; | ||
294 | #endif | ||
295 | struct desc_struct *gdt; | ||
296 | int me = raw_smp_processor_id(); | ||
297 | |||
298 | if (!has_svm()) { | ||
299 | printk(KERN_ERR "svm_cpu_init: err EOPNOTSUPP on %d\n", me); | ||
300 | return; | ||
301 | } | ||
302 | svm_data = per_cpu(svm_data, me); | ||
303 | |||
304 | if (!svm_data) { | ||
305 | printk(KERN_ERR "svm_cpu_init: svm_data is NULL on %d\n", | ||
306 | me); | ||
307 | return; | ||
308 | } | ||
309 | |||
310 | svm_data->asid_generation = 1; | ||
311 | svm_data->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1; | ||
312 | svm_data->next_asid = svm_data->max_asid + 1; | ||
313 | svm_features = cpuid_edx(SVM_CPUID_FUNC); | ||
314 | |||
315 | asm volatile ( "sgdt %0" : "=m"(gdt_descr) ); | ||
316 | gdt = (struct desc_struct *)gdt_descr.address; | ||
317 | svm_data->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS); | ||
318 | |||
319 | rdmsrl(MSR_EFER, efer); | ||
320 | wrmsrl(MSR_EFER, efer | MSR_EFER_SVME_MASK); | ||
321 | |||
322 | wrmsrl(MSR_VM_HSAVE_PA, | ||
323 | page_to_pfn(svm_data->save_area) << PAGE_SHIFT); | ||
324 | } | ||
325 | |||
326 | static int svm_cpu_init(int cpu) | ||
327 | { | ||
328 | struct svm_cpu_data *svm_data; | ||
329 | int r; | ||
330 | |||
331 | svm_data = kzalloc(sizeof(struct svm_cpu_data), GFP_KERNEL); | ||
332 | if (!svm_data) | ||
333 | return -ENOMEM; | ||
334 | svm_data->cpu = cpu; | ||
335 | svm_data->save_area = alloc_page(GFP_KERNEL); | ||
336 | r = -ENOMEM; | ||
337 | if (!svm_data->save_area) | ||
338 | goto err_1; | ||
339 | |||
340 | per_cpu(svm_data, cpu) = svm_data; | ||
341 | |||
342 | return 0; | ||
343 | |||
344 | err_1: | ||
345 | kfree(svm_data); | ||
346 | return r; | ||
347 | |||
348 | } | ||
349 | |||
350 | static void set_msr_interception(u32 *msrpm, unsigned msr, | ||
351 | int read, int write) | ||
352 | { | ||
353 | int i; | ||
354 | |||
355 | for (i = 0; i < NUM_MSR_MAPS; i++) { | ||
356 | if (msr >= msrpm_ranges[i] && | ||
357 | msr < msrpm_ranges[i] + MSRS_IN_RANGE) { | ||
358 | u32 msr_offset = (i * MSRS_IN_RANGE + msr - | ||
359 | msrpm_ranges[i]) * 2; | ||
360 | |||
361 | u32 *base = msrpm + (msr_offset / 32); | ||
362 | u32 msr_shift = msr_offset % 32; | ||
363 | u32 mask = ((write) ? 0 : 2) | ((read) ? 0 : 1); | ||
364 | *base = (*base & ~(0x3 << msr_shift)) | | ||
365 | (mask << msr_shift); | ||
366 | return; | ||
367 | } | ||
368 | } | ||
369 | BUG(); | ||
370 | } | ||
371 | |||
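set_msr_interception() above packs two bits per MSR (read intercept, write intercept) into the permission map, with each 2048-byte range covering MSRS_IN_RANGE consecutive MSRs starting at one of three base values. A hedged stand-alone sketch of the same offset arithmetic — the helper name is invented for illustration and the constants simply mirror the ones above:

#include <stdint.h>
#include <stdio.h>

#define MSRS_RANGE_SIZE 2048
#define MSRS_IN_RANGE  (MSRS_RANGE_SIZE * 8 / 2)     /* 2 bits per MSR */

static const uint32_t ranges[] = { 0, 0xc0000000, 0xc0010000 };

/* Bit offset of 'msr' inside the permission map, or -1 if unmapped. */
static long msrpm_bit_offset(uint32_t msr)
{
	unsigned i;

	for (i = 0; i < sizeof(ranges) / sizeof(ranges[0]); i++)
		if (msr >= ranges[i] && msr < ranges[i] + MSRS_IN_RANGE)
			return ((long)i * MSRS_IN_RANGE + (msr - ranges[i])) * 2;
	return -1;
}

int main(void)
{
	printf("EFER (0xc0000080) -> bit offset %ld\n",
	       msrpm_bit_offset(0xc0000080));
	return 0;
}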
372 | static __init int svm_hardware_setup(void) | ||
373 | { | ||
374 | int cpu; | ||
375 | struct page *iopm_pages; | ||
376 | struct page *msrpm_pages; | ||
377 | void *iopm_va, *msrpm_va; | ||
378 | int r; | ||
379 | |||
380 | iopm_pages = alloc_pages(GFP_KERNEL, IOPM_ALLOC_ORDER); | ||
381 | |||
382 | if (!iopm_pages) | ||
383 | return -ENOMEM; | ||
384 | |||
385 | iopm_va = page_address(iopm_pages); | ||
386 | memset(iopm_va, 0xff, PAGE_SIZE * (1 << IOPM_ALLOC_ORDER)); | ||
387 | clear_bit(0x80, iopm_va); /* allow direct access to PC debug port */ | ||
388 | iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT; | ||
389 | |||
390 | |||
391 | msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER); | ||
392 | |||
393 | r = -ENOMEM; | ||
394 | if (!msrpm_pages) | ||
395 | goto err_1; | ||
396 | |||
397 | msrpm_va = page_address(msrpm_pages); | ||
398 | memset(msrpm_va, 0xff, PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER)); | ||
399 | msrpm_base = page_to_pfn(msrpm_pages) << PAGE_SHIFT; | ||
400 | |||
401 | #ifdef CONFIG_X86_64 | ||
402 | set_msr_interception(msrpm_va, MSR_GS_BASE, 1, 1); | ||
403 | set_msr_interception(msrpm_va, MSR_FS_BASE, 1, 1); | ||
404 | set_msr_interception(msrpm_va, MSR_KERNEL_GS_BASE, 1, 1); | ||
405 | set_msr_interception(msrpm_va, MSR_LSTAR, 1, 1); | ||
406 | set_msr_interception(msrpm_va, MSR_CSTAR, 1, 1); | ||
407 | set_msr_interception(msrpm_va, MSR_SYSCALL_MASK, 1, 1); | ||
408 | #endif | ||
409 | set_msr_interception(msrpm_va, MSR_K6_STAR, 1, 1); | ||
410 | set_msr_interception(msrpm_va, MSR_IA32_SYSENTER_CS, 1, 1); | ||
411 | set_msr_interception(msrpm_va, MSR_IA32_SYSENTER_ESP, 1, 1); | ||
412 | set_msr_interception(msrpm_va, MSR_IA32_SYSENTER_EIP, 1, 1); | ||
413 | |||
414 | for_each_online_cpu(cpu) { | ||
415 | r = svm_cpu_init(cpu); | ||
416 | if (r) | ||
417 | goto err_2; | ||
418 | } | ||
419 | return 0; | ||
420 | |||
421 | err_2: | ||
422 | __free_pages(msrpm_pages, MSRPM_ALLOC_ORDER); | ||
423 | msrpm_base = 0; | ||
424 | err_1: | ||
425 | __free_pages(iopm_pages, IOPM_ALLOC_ORDER); | ||
426 | iopm_base = 0; | ||
427 | return r; | ||
428 | } | ||
429 | |||
430 | static __exit void svm_hardware_unsetup(void) | ||
431 | { | ||
432 | __free_pages(pfn_to_page(msrpm_base >> PAGE_SHIFT), MSRPM_ALLOC_ORDER); | ||
433 | __free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT), IOPM_ALLOC_ORDER); | ||
434 | iopm_base = msrpm_base = 0; | ||
435 | } | ||
436 | |||
437 | static void init_seg(struct vmcb_seg *seg) | ||
438 | { | ||
439 | seg->selector = 0; | ||
440 | seg->attrib = SVM_SELECTOR_P_MASK | SVM_SELECTOR_S_MASK | | ||
441 | SVM_SELECTOR_WRITE_MASK; /* Read/Write Data Segment */ | ||
442 | seg->limit = 0xffff; | ||
443 | seg->base = 0; | ||
444 | } | ||
445 | |||
446 | static void init_sys_seg(struct vmcb_seg *seg, uint32_t type) | ||
447 | { | ||
448 | seg->selector = 0; | ||
449 | seg->attrib = SVM_SELECTOR_P_MASK | type; | ||
450 | seg->limit = 0xffff; | ||
451 | seg->base = 0; | ||
452 | } | ||
453 | |||
454 | static void init_vmcb(struct vmcb *vmcb) | ||
455 | { | ||
456 | struct vmcb_control_area *control = &vmcb->control; | ||
457 | struct vmcb_save_area *save = &vmcb->save; | ||
458 | |||
459 | control->intercept_cr_read = INTERCEPT_CR0_MASK | | ||
460 | INTERCEPT_CR3_MASK | | ||
461 | INTERCEPT_CR4_MASK; | ||
462 | |||
463 | control->intercept_cr_write = INTERCEPT_CR0_MASK | | ||
464 | INTERCEPT_CR3_MASK | | ||
465 | INTERCEPT_CR4_MASK; | ||
466 | |||
467 | control->intercept_dr_read = INTERCEPT_DR0_MASK | | ||
468 | INTERCEPT_DR1_MASK | | ||
469 | INTERCEPT_DR2_MASK | | ||
470 | INTERCEPT_DR3_MASK; | ||
471 | |||
472 | control->intercept_dr_write = INTERCEPT_DR0_MASK | | ||
473 | INTERCEPT_DR1_MASK | | ||
474 | INTERCEPT_DR2_MASK | | ||
475 | INTERCEPT_DR3_MASK | | ||
476 | INTERCEPT_DR5_MASK | | ||
477 | INTERCEPT_DR7_MASK; | ||
478 | |||
479 | control->intercept_exceptions = 1 << PF_VECTOR; | ||
480 | |||
481 | |||
482 | control->intercept = (1ULL << INTERCEPT_INTR) | | ||
483 | (1ULL << INTERCEPT_NMI) | | ||
484 | (1ULL << INTERCEPT_SMI) | | ||
485 | /* | ||
486 | * selective cr0 intercept bug? | ||
487 | * 0: 0f 22 d8 mov %eax,%cr3 | ||
488 | * 3: 0f 20 c0 mov %cr0,%eax | ||
489 | * 6: 0d 00 00 00 80 or $0x80000000,%eax | ||
490 | * b: 0f 22 c0 mov %eax,%cr0 | ||
491 | * set cr3 ->interception | ||
492 | * get cr0 ->interception | ||
493 | * set cr0 -> no interception | ||
494 | */ | ||
495 | /* (1ULL << INTERCEPT_SELECTIVE_CR0) | */ | ||
496 | (1ULL << INTERCEPT_CPUID) | | ||
497 | (1ULL << INTERCEPT_INVD) | | ||
498 | (1ULL << INTERCEPT_HLT) | | ||
499 | (1ULL << INTERCEPT_INVLPGA) | | ||
500 | (1ULL << INTERCEPT_IOIO_PROT) | | ||
501 | (1ULL << INTERCEPT_MSR_PROT) | | ||
502 | (1ULL << INTERCEPT_TASK_SWITCH) | | ||
503 | (1ULL << INTERCEPT_SHUTDOWN) | | ||
504 | (1ULL << INTERCEPT_VMRUN) | | ||
505 | (1ULL << INTERCEPT_VMMCALL) | | ||
506 | (1ULL << INTERCEPT_VMLOAD) | | ||
507 | (1ULL << INTERCEPT_VMSAVE) | | ||
508 | (1ULL << INTERCEPT_STGI) | | ||
509 | (1ULL << INTERCEPT_CLGI) | | ||
510 | (1ULL << INTERCEPT_SKINIT) | | ||
511 | (1ULL << INTERCEPT_WBINVD) | | ||
512 | (1ULL << INTERCEPT_MONITOR) | | ||
513 | (1ULL << INTERCEPT_MWAIT); | ||
514 | |||
515 | control->iopm_base_pa = iopm_base; | ||
516 | control->msrpm_base_pa = msrpm_base; | ||
517 | control->tsc_offset = 0; | ||
518 | control->int_ctl = V_INTR_MASKING_MASK; | ||
519 | |||
520 | init_seg(&save->es); | ||
521 | init_seg(&save->ss); | ||
522 | init_seg(&save->ds); | ||
523 | init_seg(&save->fs); | ||
524 | init_seg(&save->gs); | ||
525 | |||
526 | save->cs.selector = 0xf000; | ||
527 | /* Executable/Readable Code Segment */ | ||
528 | save->cs.attrib = SVM_SELECTOR_READ_MASK | SVM_SELECTOR_P_MASK | | ||
529 | SVM_SELECTOR_S_MASK | SVM_SELECTOR_CODE_MASK; | ||
530 | save->cs.limit = 0xffff; | ||
531 | /* | ||
532 | * cs.base should really be 0xffff0000, but vmx can't handle that, so | ||
533 | * be consistent with it. | ||
534 | * | ||
535 | * Replace when we have real mode working for vmx. | ||
536 | */ | ||
537 | save->cs.base = 0xf0000; | ||
538 | |||
539 | save->gdtr.limit = 0xffff; | ||
540 | save->idtr.limit = 0xffff; | ||
541 | |||
542 | init_sys_seg(&save->ldtr, SEG_TYPE_LDT); | ||
543 | init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16); | ||
544 | |||
545 | save->efer = MSR_EFER_SVME_MASK; | ||
546 | |||
547 | save->dr6 = 0xffff0ff0; | ||
548 | save->dr7 = 0x400; | ||
549 | save->rflags = 2; | ||
550 | save->rip = 0x0000fff0; | ||
551 | |||
552 | /* | ||
553 | * cr0 val on cpu init should be 0x60000010, we enable cpu | ||
554 | * cache by default. the orderly way is to enable cache in bios. | ||
555 | */ | ||
556 | save->cr0 = 0x00000010 | X86_CR0_PG | X86_CR0_WP; | ||
557 | save->cr4 = X86_CR4_PAE; | ||
558 | /* rdx = ?? */ | ||
559 | } | ||
560 | |||
561 | static void svm_vcpu_reset(struct kvm_vcpu *vcpu) | ||
562 | { | ||
563 | struct vcpu_svm *svm = to_svm(vcpu); | ||
564 | |||
565 | init_vmcb(svm->vmcb); | ||
566 | |||
567 | if (vcpu->vcpu_id != 0) { | ||
568 | svm->vmcb->save.rip = 0; | ||
569 | svm->vmcb->save.cs.base = svm->vcpu.sipi_vector << 12; | ||
570 | svm->vmcb->save.cs.selector = svm->vcpu.sipi_vector << 8; | ||
571 | } | ||
572 | } | ||
573 | |||
574 | static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) | ||
575 | { | ||
576 | struct vcpu_svm *svm; | ||
577 | struct page *page; | ||
578 | int err; | ||
579 | |||
580 | svm = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); | ||
581 | if (!svm) { | ||
582 | err = -ENOMEM; | ||
583 | goto out; | ||
584 | } | ||
585 | |||
586 | err = kvm_vcpu_init(&svm->vcpu, kvm, id); | ||
587 | if (err) | ||
588 | goto free_svm; | ||
589 | |||
590 | if (irqchip_in_kernel(kvm)) { | ||
591 | err = kvm_create_lapic(&svm->vcpu); | ||
592 | if (err < 0) | ||
593 | goto free_svm; | ||
594 | } | ||
595 | |||
596 | page = alloc_page(GFP_KERNEL); | ||
597 | if (!page) { | ||
598 | err = -ENOMEM; | ||
599 | goto uninit; | ||
600 | } | ||
601 | |||
602 | svm->vmcb = page_address(page); | ||
603 | clear_page(svm->vmcb); | ||
604 | svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT; | ||
605 | svm->asid_generation = 0; | ||
606 | memset(svm->db_regs, 0, sizeof(svm->db_regs)); | ||
607 | init_vmcb(svm->vmcb); | ||
608 | |||
609 | fx_init(&svm->vcpu); | ||
610 | svm->vcpu.fpu_active = 1; | ||
611 | svm->vcpu.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; | ||
612 | if (svm->vcpu.vcpu_id == 0) | ||
613 | svm->vcpu.apic_base |= MSR_IA32_APICBASE_BSP; | ||
614 | |||
615 | return &svm->vcpu; | ||
616 | |||
617 | uninit: | ||
618 | kvm_vcpu_uninit(&svm->vcpu); | ||
619 | free_svm: | ||
620 | kmem_cache_free(kvm_vcpu_cache, svm); | ||
621 | out: | ||
622 | return ERR_PTR(err); | ||
623 | } | ||
624 | |||
625 | static void svm_free_vcpu(struct kvm_vcpu *vcpu) | ||
626 | { | ||
627 | struct vcpu_svm *svm = to_svm(vcpu); | ||
628 | |||
629 | __free_page(pfn_to_page(svm->vmcb_pa >> PAGE_SHIFT)); | ||
630 | kvm_vcpu_uninit(vcpu); | ||
631 | kmem_cache_free(kvm_vcpu_cache, svm); | ||
632 | } | ||
633 | |||
634 | static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) | ||
635 | { | ||
636 | struct vcpu_svm *svm = to_svm(vcpu); | ||
637 | int i; | ||
638 | |||
639 | if (unlikely(cpu != vcpu->cpu)) { | ||
640 | u64 tsc_this, delta; | ||
641 | |||
642 | /* | ||
643 | * Make sure that the guest sees a monotonically | ||
644 | * increasing TSC. | ||
645 | */ | ||
646 | rdtscll(tsc_this); | ||
647 | delta = vcpu->host_tsc - tsc_this; | ||
648 | svm->vmcb->control.tsc_offset += delta; | ||
649 | vcpu->cpu = cpu; | ||
650 | kvm_migrate_apic_timer(vcpu); | ||
651 | } | ||
652 | |||
653 | for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) | ||
654 | rdmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]); | ||
655 | } | ||
656 | |||
657 | static void svm_vcpu_put(struct kvm_vcpu *vcpu) | ||
658 | { | ||
659 | struct vcpu_svm *svm = to_svm(vcpu); | ||
660 | int i; | ||
661 | |||
662 | for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) | ||
663 | wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]); | ||
664 | |||
665 | rdtscll(vcpu->host_tsc); | ||
666 | kvm_put_guest_fpu(vcpu); | ||
667 | } | ||
668 | |||
669 | static void svm_vcpu_decache(struct kvm_vcpu *vcpu) | ||
670 | { | ||
671 | } | ||
672 | |||
673 | static void svm_cache_regs(struct kvm_vcpu *vcpu) | ||
674 | { | ||
675 | struct vcpu_svm *svm = to_svm(vcpu); | ||
676 | |||
677 | vcpu->regs[VCPU_REGS_RAX] = svm->vmcb->save.rax; | ||
678 | vcpu->regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp; | ||
679 | vcpu->rip = svm->vmcb->save.rip; | ||
680 | } | ||
681 | |||
682 | static void svm_decache_regs(struct kvm_vcpu *vcpu) | ||
683 | { | ||
684 | struct vcpu_svm *svm = to_svm(vcpu); | ||
685 | svm->vmcb->save.rax = vcpu->regs[VCPU_REGS_RAX]; | ||
686 | svm->vmcb->save.rsp = vcpu->regs[VCPU_REGS_RSP]; | ||
687 | svm->vmcb->save.rip = vcpu->rip; | ||
688 | } | ||
689 | |||
690 | static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu) | ||
691 | { | ||
692 | return to_svm(vcpu)->vmcb->save.rflags; | ||
693 | } | ||
694 | |||
695 | static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) | ||
696 | { | ||
697 | to_svm(vcpu)->vmcb->save.rflags = rflags; | ||
698 | } | ||
699 | |||
700 | static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg) | ||
701 | { | ||
702 | struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save; | ||
703 | |||
704 | switch (seg) { | ||
705 | case VCPU_SREG_CS: return &save->cs; | ||
706 | case VCPU_SREG_DS: return &save->ds; | ||
707 | case VCPU_SREG_ES: return &save->es; | ||
708 | case VCPU_SREG_FS: return &save->fs; | ||
709 | case VCPU_SREG_GS: return &save->gs; | ||
710 | case VCPU_SREG_SS: return &save->ss; | ||
711 | case VCPU_SREG_TR: return &save->tr; | ||
712 | case VCPU_SREG_LDTR: return &save->ldtr; | ||
713 | } | ||
714 | BUG(); | ||
715 | return NULL; | ||
716 | } | ||
717 | |||
718 | static u64 svm_get_segment_base(struct kvm_vcpu *vcpu, int seg) | ||
719 | { | ||
720 | struct vmcb_seg *s = svm_seg(vcpu, seg); | ||
721 | |||
722 | return s->base; | ||
723 | } | ||
724 | |||
725 | static void svm_get_segment(struct kvm_vcpu *vcpu, | ||
726 | struct kvm_segment *var, int seg) | ||
727 | { | ||
728 | struct vmcb_seg *s = svm_seg(vcpu, seg); | ||
729 | |||
730 | var->base = s->base; | ||
731 | var->limit = s->limit; | ||
732 | var->selector = s->selector; | ||
733 | var->type = s->attrib & SVM_SELECTOR_TYPE_MASK; | ||
734 | var->s = (s->attrib >> SVM_SELECTOR_S_SHIFT) & 1; | ||
735 | var->dpl = (s->attrib >> SVM_SELECTOR_DPL_SHIFT) & 3; | ||
736 | var->present = (s->attrib >> SVM_SELECTOR_P_SHIFT) & 1; | ||
737 | var->avl = (s->attrib >> SVM_SELECTOR_AVL_SHIFT) & 1; | ||
738 | var->l = (s->attrib >> SVM_SELECTOR_L_SHIFT) & 1; | ||
739 | var->db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1; | ||
740 | var->g = (s->attrib >> SVM_SELECTOR_G_SHIFT) & 1; | ||
741 | var->unusable = !var->present; | ||
742 | } | ||
743 | |||
744 | static void svm_get_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) | ||
745 | { | ||
746 | struct vcpu_svm *svm = to_svm(vcpu); | ||
747 | |||
748 | dt->limit = svm->vmcb->save.idtr.limit; | ||
749 | dt->base = svm->vmcb->save.idtr.base; | ||
750 | } | ||
751 | |||
752 | static void svm_set_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) | ||
753 | { | ||
754 | struct vcpu_svm *svm = to_svm(vcpu); | ||
755 | |||
756 | svm->vmcb->save.idtr.limit = dt->limit; | ||
757 | svm->vmcb->save.idtr.base = dt->base; | ||
758 | } | ||
759 | |||
760 | static void svm_get_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) | ||
761 | { | ||
762 | struct vcpu_svm *svm = to_svm(vcpu); | ||
763 | |||
764 | dt->limit = svm->vmcb->save.gdtr.limit; | ||
765 | dt->base = svm->vmcb->save.gdtr.base; | ||
766 | } | ||
767 | |||
768 | static void svm_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) | ||
769 | { | ||
770 | struct vcpu_svm *svm = to_svm(vcpu); | ||
771 | |||
772 | svm->vmcb->save.gdtr.limit = dt->limit; | ||
773 | svm->vmcb->save.gdtr.base = dt->base; | ||
774 | } | ||
775 | |||
776 | static void svm_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) | ||
777 | { | ||
778 | } | ||
779 | |||
780 | static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) | ||
781 | { | ||
782 | struct vcpu_svm *svm = to_svm(vcpu); | ||
783 | |||
784 | #ifdef CONFIG_X86_64 | ||
785 | if (vcpu->shadow_efer & KVM_EFER_LME) { | ||
786 | if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) { | ||
787 | vcpu->shadow_efer |= KVM_EFER_LMA; | ||
788 | svm->vmcb->save.efer |= KVM_EFER_LMA | KVM_EFER_LME; | ||
789 | } | ||
790 | |||
791 | if (is_paging(vcpu) && !(cr0 & X86_CR0_PG) ) { | ||
792 | vcpu->shadow_efer &= ~KVM_EFER_LMA; | ||
793 | svm->vmcb->save.efer &= ~(KVM_EFER_LMA | KVM_EFER_LME); | ||
794 | } | ||
795 | } | ||
796 | #endif | ||
797 | if ((vcpu->cr0 & X86_CR0_TS) && !(cr0 & X86_CR0_TS)) { | ||
798 | svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR); | ||
799 | vcpu->fpu_active = 1; | ||
800 | } | ||
801 | |||
802 | vcpu->cr0 = cr0; | ||
803 | cr0 |= X86_CR0_PG | X86_CR0_WP; | ||
804 | cr0 &= ~(X86_CR0_CD | X86_CR0_NW); | ||
805 | svm->vmcb->save.cr0 = cr0; | ||
806 | } | ||
807 | |||
808 | static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) | ||
809 | { | ||
810 | vcpu->cr4 = cr4; | ||
811 | to_svm(vcpu)->vmcb->save.cr4 = cr4 | X86_CR4_PAE; | ||
812 | } | ||
813 | |||
814 | static void svm_set_segment(struct kvm_vcpu *vcpu, | ||
815 | struct kvm_segment *var, int seg) | ||
816 | { | ||
817 | struct vcpu_svm *svm = to_svm(vcpu); | ||
818 | struct vmcb_seg *s = svm_seg(vcpu, seg); | ||
819 | |||
820 | s->base = var->base; | ||
821 | s->limit = var->limit; | ||
822 | s->selector = var->selector; | ||
823 | if (var->unusable) | ||
824 | s->attrib = 0; | ||
825 | else { | ||
826 | s->attrib = (var->type & SVM_SELECTOR_TYPE_MASK); | ||
827 | s->attrib |= (var->s & 1) << SVM_SELECTOR_S_SHIFT; | ||
828 | s->attrib |= (var->dpl & 3) << SVM_SELECTOR_DPL_SHIFT; | ||
829 | s->attrib |= (var->present & 1) << SVM_SELECTOR_P_SHIFT; | ||
830 | s->attrib |= (var->avl & 1) << SVM_SELECTOR_AVL_SHIFT; | ||
831 | s->attrib |= (var->l & 1) << SVM_SELECTOR_L_SHIFT; | ||
832 | s->attrib |= (var->db & 1) << SVM_SELECTOR_DB_SHIFT; | ||
833 | s->attrib |= (var->g & 1) << SVM_SELECTOR_G_SHIFT; | ||
834 | } | ||
835 | if (seg == VCPU_SREG_CS) | ||
836 | svm->vmcb->save.cpl | ||
837 | = (svm->vmcb->save.cs.attrib | ||
838 | >> SVM_SELECTOR_DPL_SHIFT) & 3; | ||
839 | |||
840 | } | ||
841 | |||
842 | /* FIXME: | ||
843 | |||
844 | svm(vcpu)->vmcb->control.int_ctl &= ~V_TPR_MASK; | ||
845 | svm(vcpu)->vmcb->control.int_ctl |= (sregs->cr8 & V_TPR_MASK); | ||
846 | |||
847 | */ | ||
848 | |||
849 | static int svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg) | ||
850 | { | ||
851 | return -EOPNOTSUPP; | ||
852 | } | ||
853 | |||
854 | static int svm_get_irq(struct kvm_vcpu *vcpu) | ||
855 | { | ||
856 | struct vcpu_svm *svm = to_svm(vcpu); | ||
857 | u32 exit_int_info = svm->vmcb->control.exit_int_info; | ||
858 | |||
859 | if (is_external_interrupt(exit_int_info)) | ||
860 | return exit_int_info & SVM_EVTINJ_VEC_MASK; | ||
861 | return -1; | ||
862 | } | ||
863 | |||
864 | static void load_host_msrs(struct kvm_vcpu *vcpu) | ||
865 | { | ||
866 | #ifdef CONFIG_X86_64 | ||
867 | wrmsrl(MSR_GS_BASE, to_svm(vcpu)->host_gs_base); | ||
868 | #endif | ||
869 | } | ||
870 | |||
871 | static void save_host_msrs(struct kvm_vcpu *vcpu) | ||
872 | { | ||
873 | #ifdef CONFIG_X86_64 | ||
874 | rdmsrl(MSR_GS_BASE, to_svm(vcpu)->host_gs_base); | ||
875 | #endif | ||
876 | } | ||
877 | |||
878 | static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *svm_data) | ||
879 | { | ||
880 | if (svm_data->next_asid > svm_data->max_asid) { | ||
881 | ++svm_data->asid_generation; | ||
882 | svm_data->next_asid = 1; | ||
883 | svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ALL_ASID; | ||
884 | } | ||
885 | |||
886 | svm->vcpu.cpu = svm_data->cpu; | ||
887 | svm->asid_generation = svm_data->asid_generation; | ||
888 | svm->vmcb->control.asid = svm_data->next_asid++; | ||
889 | } | ||
890 | |||
891 | static unsigned long svm_get_dr(struct kvm_vcpu *vcpu, int dr) | ||
892 | { | ||
893 | return to_svm(vcpu)->db_regs[dr]; | ||
894 | } | ||
895 | |||
896 | static void svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value, | ||
897 | int *exception) | ||
898 | { | ||
899 | struct vcpu_svm *svm = to_svm(vcpu); | ||
900 | |||
901 | *exception = 0; | ||
902 | |||
903 | if (svm->vmcb->save.dr7 & DR7_GD_MASK) { | ||
904 | svm->vmcb->save.dr7 &= ~DR7_GD_MASK; | ||
905 | svm->vmcb->save.dr6 |= DR6_BD_MASK; | ||
906 | *exception = DB_VECTOR; | ||
907 | return; | ||
908 | } | ||
909 | |||
910 | switch (dr) { | ||
911 | case 0 ... 3: | ||
912 | svm->db_regs[dr] = value; | ||
913 | return; | ||
914 | case 4 ... 5: | ||
915 | if (vcpu->cr4 & X86_CR4_DE) { | ||
916 | *exception = UD_VECTOR; | ||
917 | return; | ||
918 | } | ||
919 | case 7: { | ||
920 | if (value & ~((1ULL << 32) - 1)) { | ||
921 | *exception = GP_VECTOR; | ||
922 | return; | ||
923 | } | ||
924 | svm->vmcb->save.dr7 = value; | ||
925 | return; | ||
926 | } | ||
927 | default: | ||
928 | printk(KERN_DEBUG "%s: unexpected dr %u\n", | ||
929 | __FUNCTION__, dr); | ||
930 | *exception = UD_VECTOR; | ||
931 | return; | ||
932 | } | ||
933 | } | ||
934 | |||
935 | static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) | ||
936 | { | ||
937 | u32 exit_int_info = svm->vmcb->control.exit_int_info; | ||
938 | struct kvm *kvm = svm->vcpu.kvm; | ||
939 | u64 fault_address; | ||
940 | u32 error_code; | ||
941 | enum emulation_result er; | ||
942 | int r; | ||
943 | |||
944 | if (!irqchip_in_kernel(kvm) && | ||
945 | is_external_interrupt(exit_int_info)) | ||
946 | push_irq(&svm->vcpu, exit_int_info & SVM_EVTINJ_VEC_MASK); | ||
947 | |||
948 | mutex_lock(&kvm->lock); | ||
949 | |||
950 | fault_address = svm->vmcb->control.exit_info_2; | ||
951 | error_code = svm->vmcb->control.exit_info_1; | ||
952 | r = kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code); | ||
953 | if (r < 0) { | ||
954 | mutex_unlock(&kvm->lock); | ||
955 | return r; | ||
956 | } | ||
957 | if (!r) { | ||
958 | mutex_unlock(&kvm->lock); | ||
959 | return 1; | ||
960 | } | ||
961 | er = emulate_instruction(&svm->vcpu, kvm_run, fault_address, | ||
962 | error_code); | ||
963 | mutex_unlock(&kvm->lock); | ||
964 | |||
965 | switch (er) { | ||
966 | case EMULATE_DONE: | ||
967 | return 1; | ||
968 | case EMULATE_DO_MMIO: | ||
969 | ++svm->vcpu.stat.mmio_exits; | ||
970 | return 0; | ||
971 | case EMULATE_FAIL: | ||
972 | kvm_report_emulation_failure(&svm->vcpu, "pagetable"); | ||
973 | break; | ||
974 | default: | ||
975 | BUG(); | ||
976 | } | ||
977 | |||
978 | kvm_run->exit_reason = KVM_EXIT_UNKNOWN; | ||
979 | return 0; | ||
980 | } | ||
981 | |||
982 | static int nm_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) | ||
983 | { | ||
984 | svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR); | ||
985 | if (!(svm->vcpu.cr0 & X86_CR0_TS)) | ||
986 | svm->vmcb->save.cr0 &= ~X86_CR0_TS; | ||
987 | svm->vcpu.fpu_active = 1; | ||
988 | |||
989 | return 1; | ||
990 | } | ||
991 | |||
992 | static int shutdown_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) | ||
993 | { | ||
994 | /* | ||
995 | * VMCB is undefined after a SHUTDOWN intercept | ||
996 | * so reinitialize it. | ||
997 | */ | ||
998 | clear_page(svm->vmcb); | ||
999 | init_vmcb(svm->vmcb); | ||
1000 | |||
1001 | kvm_run->exit_reason = KVM_EXIT_SHUTDOWN; | ||
1002 | return 0; | ||
1003 | } | ||
1004 | |||
1005 | static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) | ||
1006 | { | ||
1007 | u32 io_info = svm->vmcb->control.exit_info_1; //address size bug? | ||
1008 | int size, down, in, string, rep; | ||
1009 | unsigned port; | ||
1010 | |||
1011 | ++svm->vcpu.stat.io_exits; | ||
1012 | |||
1013 | svm->next_rip = svm->vmcb->control.exit_info_2; | ||
1014 | |||
1015 | string = (io_info & SVM_IOIO_STR_MASK) != 0; | ||
1016 | |||
1017 | if (string) { | ||
1018 | if (emulate_instruction(&svm->vcpu, kvm_run, 0, 0) == EMULATE_DO_MMIO) | ||
1019 | return 0; | ||
1020 | return 1; | ||
1021 | } | ||
1022 | |||
1023 | in = (io_info & SVM_IOIO_TYPE_MASK) != 0; | ||
1024 | port = io_info >> 16; | ||
1025 | size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT; | ||
1026 | rep = (io_info & SVM_IOIO_REP_MASK) != 0; | ||
1027 | down = (svm->vmcb->save.rflags & X86_EFLAGS_DF) != 0; | ||
1028 | |||
1029 | return kvm_emulate_pio(&svm->vcpu, kvm_run, in, size, port); | ||
1030 | } | ||
1031 | |||
1032 | static int nop_on_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) | ||
1033 | { | ||
1034 | return 1; | ||
1035 | } | ||
1036 | |||
1037 | static int halt_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) | ||
1038 | { | ||
1039 | svm->next_rip = svm->vmcb->save.rip + 1; | ||
1040 | skip_emulated_instruction(&svm->vcpu); | ||
1041 | return kvm_emulate_halt(&svm->vcpu); | ||
1042 | } | ||
1043 | |||
1044 | static int vmmcall_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) | ||
1045 | { | ||
1046 | svm->next_rip = svm->vmcb->save.rip + 3; | ||
1047 | skip_emulated_instruction(&svm->vcpu); | ||
1048 | return kvm_hypercall(&svm->vcpu, kvm_run); | ||
1049 | } | ||
1050 | |||
1051 | static int invalid_op_interception(struct vcpu_svm *svm, | ||
1052 | struct kvm_run *kvm_run) | ||
1053 | { | ||
1054 | inject_ud(&svm->vcpu); | ||
1055 | return 1; | ||
1056 | } | ||
1057 | |||
1058 | static int task_switch_interception(struct vcpu_svm *svm, | ||
1059 | struct kvm_run *kvm_run) | ||
1060 | { | ||
1061 | pr_unimpl(&svm->vcpu, "%s: task switch is unsupported\n", __FUNCTION__); | ||
1062 | kvm_run->exit_reason = KVM_EXIT_UNKNOWN; | ||
1063 | return 0; | ||
1064 | } | ||
1065 | |||
1066 | static int cpuid_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) | ||
1067 | { | ||
1068 | svm->next_rip = svm->vmcb->save.rip + 2; | ||
1069 | kvm_emulate_cpuid(&svm->vcpu); | ||
1070 | return 1; | ||
1071 | } | ||
1072 | |||
1073 | static int emulate_on_interception(struct vcpu_svm *svm, | ||
1074 | struct kvm_run *kvm_run) | ||
1075 | { | ||
1076 | if (emulate_instruction(&svm->vcpu, NULL, 0, 0) != EMULATE_DONE) | ||
1077 | pr_unimpl(&svm->vcpu, "%s: failed\n", __FUNCTION__); | ||
1078 | return 1; | ||
1079 | } | ||
1080 | |||
1081 | static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data) | ||
1082 | { | ||
1083 | struct vcpu_svm *svm = to_svm(vcpu); | ||
1084 | |||
1085 | switch (ecx) { | ||
1086 | case MSR_IA32_TIME_STAMP_COUNTER: { | ||
1087 | u64 tsc; | ||
1088 | |||
1089 | rdtscll(tsc); | ||
1090 | *data = svm->vmcb->control.tsc_offset + tsc; | ||
1091 | break; | ||
1092 | } | ||
1093 | case MSR_K6_STAR: | ||
1094 | *data = svm->vmcb->save.star; | ||
1095 | break; | ||
1096 | #ifdef CONFIG_X86_64 | ||
1097 | case MSR_LSTAR: | ||
1098 | *data = svm->vmcb->save.lstar; | ||
1099 | break; | ||
1100 | case MSR_CSTAR: | ||
1101 | *data = svm->vmcb->save.cstar; | ||
1102 | break; | ||
1103 | case MSR_KERNEL_GS_BASE: | ||
1104 | *data = svm->vmcb->save.kernel_gs_base; | ||
1105 | break; | ||
1106 | case MSR_SYSCALL_MASK: | ||
1107 | *data = svm->vmcb->save.sfmask; | ||
1108 | break; | ||
1109 | #endif | ||
1110 | case MSR_IA32_SYSENTER_CS: | ||
1111 | *data = svm->vmcb->save.sysenter_cs; | ||
1112 | break; | ||
1113 | case MSR_IA32_SYSENTER_EIP: | ||
1114 | *data = svm->vmcb->save.sysenter_eip; | ||
1115 | break; | ||
1116 | case MSR_IA32_SYSENTER_ESP: | ||
1117 | *data = svm->vmcb->save.sysenter_esp; | ||
1118 | break; | ||
1119 | default: | ||
1120 | return kvm_get_msr_common(vcpu, ecx, data); | ||
1121 | } | ||
1122 | return 0; | ||
1123 | } | ||
1124 | |||
1125 | static int rdmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) | ||
1126 | { | ||
1127 | u32 ecx = svm->vcpu.regs[VCPU_REGS_RCX]; | ||
1128 | u64 data; | ||
1129 | |||
1130 | if (svm_get_msr(&svm->vcpu, ecx, &data)) | ||
1131 | svm_inject_gp(&svm->vcpu, 0); | ||
1132 | else { | ||
1133 | svm->vmcb->save.rax = data & 0xffffffff; | ||
1134 | svm->vcpu.regs[VCPU_REGS_RDX] = data >> 32; | ||
1135 | svm->next_rip = svm->vmcb->save.rip + 2; | ||
1136 | skip_emulated_instruction(&svm->vcpu); | ||
1137 | } | ||
1138 | return 1; | ||
1139 | } | ||
1140 | |||
1141 | static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data) | ||
1142 | { | ||
1143 | struct vcpu_svm *svm = to_svm(vcpu); | ||
1144 | |||
1145 | switch (ecx) { | ||
1146 | case MSR_IA32_TIME_STAMP_COUNTER: { | ||
1147 | u64 tsc; | ||
1148 | |||
1149 | rdtscll(tsc); | ||
1150 | svm->vmcb->control.tsc_offset = data - tsc; | ||
1151 | break; | ||
1152 | } | ||
1153 | case MSR_K6_STAR: | ||
1154 | svm->vmcb->save.star = data; | ||
1155 | break; | ||
1156 | #ifdef CONFIG_X86_64 | ||
1157 | case MSR_LSTAR: | ||
1158 | svm->vmcb->save.lstar = data; | ||
1159 | break; | ||
1160 | case MSR_CSTAR: | ||
1161 | svm->vmcb->save.cstar = data; | ||
1162 | break; | ||
1163 | case MSR_KERNEL_GS_BASE: | ||
1164 | svm->vmcb->save.kernel_gs_base = data; | ||
1165 | break; | ||
1166 | case MSR_SYSCALL_MASK: | ||
1167 | svm->vmcb->save.sfmask = data; | ||
1168 | break; | ||
1169 | #endif | ||
1170 | case MSR_IA32_SYSENTER_CS: | ||
1171 | svm->vmcb->save.sysenter_cs = data; | ||
1172 | break; | ||
1173 | case MSR_IA32_SYSENTER_EIP: | ||
1174 | svm->vmcb->save.sysenter_eip = data; | ||
1175 | break; | ||
1176 | case MSR_IA32_SYSENTER_ESP: | ||
1177 | svm->vmcb->save.sysenter_esp = data; | ||
1178 | break; | ||
1179 | default: | ||
1180 | return kvm_set_msr_common(vcpu, ecx, data); | ||
1181 | } | ||
1182 | return 0; | ||
1183 | } | ||
1184 | |||
1185 | static int wrmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) | ||
1186 | { | ||
1187 | u32 ecx = svm->vcpu.regs[VCPU_REGS_RCX]; | ||
1188 | u64 data = (svm->vmcb->save.rax & -1u) | ||
1189 | | ((u64)(svm->vcpu.regs[VCPU_REGS_RDX] & -1u) << 32); | ||
1190 | svm->next_rip = svm->vmcb->save.rip + 2; | ||
1191 | if (svm_set_msr(&svm->vcpu, ecx, data)) | ||
1192 | svm_inject_gp(&svm->vcpu, 0); | ||
1193 | else | ||
1194 | skip_emulated_instruction(&svm->vcpu); | ||
1195 | return 1; | ||
1196 | } | ||
1197 | |||
1198 | static int msr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) | ||
1199 | { | ||
1200 | if (svm->vmcb->control.exit_info_1) | ||
1201 | return wrmsr_interception(svm, kvm_run); | ||
1202 | else | ||
1203 | return rdmsr_interception(svm, kvm_run); | ||
1204 | } | ||
1205 | |||
1206 | static int interrupt_window_interception(struct vcpu_svm *svm, | ||
1207 | struct kvm_run *kvm_run) | ||
1208 | { | ||
1209 | svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_VINTR); | ||
1210 | svm->vmcb->control.int_ctl &= ~V_IRQ_MASK; | ||
1211 | /* | ||
1212 | * If the user space waits to inject interrupts, exit as soon as | ||
1213 | * possible | ||
1214 | */ | ||
1215 | if (kvm_run->request_interrupt_window && | ||
1216 | !svm->vcpu.irq_summary) { | ||
1217 | ++svm->vcpu.stat.irq_window_exits; | ||
1218 | kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; | ||
1219 | return 0; | ||
1220 | } | ||
1221 | |||
1222 | return 1; | ||
1223 | } | ||
1224 | |||
1225 | static int (*svm_exit_handlers[])(struct vcpu_svm *svm, | ||
1226 | struct kvm_run *kvm_run) = { | ||
1227 | [SVM_EXIT_READ_CR0] = emulate_on_interception, | ||
1228 | [SVM_EXIT_READ_CR3] = emulate_on_interception, | ||
1229 | [SVM_EXIT_READ_CR4] = emulate_on_interception, | ||
1230 | /* for now: */ | ||
1231 | [SVM_EXIT_WRITE_CR0] = emulate_on_interception, | ||
1232 | [SVM_EXIT_WRITE_CR3] = emulate_on_interception, | ||
1233 | [SVM_EXIT_WRITE_CR4] = emulate_on_interception, | ||
1234 | [SVM_EXIT_READ_DR0] = emulate_on_interception, | ||
1235 | [SVM_EXIT_READ_DR1] = emulate_on_interception, | ||
1236 | [SVM_EXIT_READ_DR2] = emulate_on_interception, | ||
1237 | [SVM_EXIT_READ_DR3] = emulate_on_interception, | ||
1238 | [SVM_EXIT_WRITE_DR0] = emulate_on_interception, | ||
1239 | [SVM_EXIT_WRITE_DR1] = emulate_on_interception, | ||
1240 | [SVM_EXIT_WRITE_DR2] = emulate_on_interception, | ||
1241 | [SVM_EXIT_WRITE_DR3] = emulate_on_interception, | ||
1242 | [SVM_EXIT_WRITE_DR5] = emulate_on_interception, | ||
1243 | [SVM_EXIT_WRITE_DR7] = emulate_on_interception, | ||
1244 | [SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception, | ||
1245 | [SVM_EXIT_EXCP_BASE + NM_VECTOR] = nm_interception, | ||
1246 | [SVM_EXIT_INTR] = nop_on_interception, | ||
1247 | [SVM_EXIT_NMI] = nop_on_interception, | ||
1248 | [SVM_EXIT_SMI] = nop_on_interception, | ||
1249 | [SVM_EXIT_INIT] = nop_on_interception, | ||
1250 | [SVM_EXIT_VINTR] = interrupt_window_interception, | ||
1251 | /* [SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception, */ | ||
1252 | [SVM_EXIT_CPUID] = cpuid_interception, | ||
1253 | [SVM_EXIT_INVD] = emulate_on_interception, | ||
1254 | [SVM_EXIT_HLT] = halt_interception, | ||
1255 | [SVM_EXIT_INVLPG] = emulate_on_interception, | ||
1256 | [SVM_EXIT_INVLPGA] = invalid_op_interception, | ||
1257 | [SVM_EXIT_IOIO] = io_interception, | ||
1258 | [SVM_EXIT_MSR] = msr_interception, | ||
1259 | [SVM_EXIT_TASK_SWITCH] = task_switch_interception, | ||
1260 | [SVM_EXIT_SHUTDOWN] = shutdown_interception, | ||
1261 | [SVM_EXIT_VMRUN] = invalid_op_interception, | ||
1262 | [SVM_EXIT_VMMCALL] = vmmcall_interception, | ||
1263 | [SVM_EXIT_VMLOAD] = invalid_op_interception, | ||
1264 | [SVM_EXIT_VMSAVE] = invalid_op_interception, | ||
1265 | [SVM_EXIT_STGI] = invalid_op_interception, | ||
1266 | [SVM_EXIT_CLGI] = invalid_op_interception, | ||
1267 | [SVM_EXIT_SKINIT] = invalid_op_interception, | ||
1268 | [SVM_EXIT_WBINVD] = emulate_on_interception, | ||
1269 | [SVM_EXIT_MONITOR] = invalid_op_interception, | ||
1270 | [SVM_EXIT_MWAIT] = invalid_op_interception, | ||
1271 | }; | ||
1272 | |||
1273 | |||
1274 | static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) | ||
1275 | { | ||
1276 | struct vcpu_svm *svm = to_svm(vcpu); | ||
1277 | u32 exit_code = svm->vmcb->control.exit_code; | ||
1278 | |||
1279 | kvm_reput_irq(svm); | ||
1280 | |||
1281 | if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) { | ||
1282 | kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; | ||
1283 | kvm_run->fail_entry.hardware_entry_failure_reason | ||
1284 | = svm->vmcb->control.exit_code; | ||
1285 | return 0; | ||
1286 | } | ||
1287 | |||
1288 | if (is_external_interrupt(svm->vmcb->control.exit_int_info) && | ||
1289 | exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR) | ||
1290 | printk(KERN_ERR "%s: unexpected exit_ini_info 0x%x " | ||
1291 | "exit_code 0x%x\n", | ||
1292 | __FUNCTION__, svm->vmcb->control.exit_int_info, | ||
1293 | exit_code); | ||
1294 | |||
1295 | if (exit_code >= ARRAY_SIZE(svm_exit_handlers) | ||
1296 | || svm_exit_handlers[exit_code] == 0) { | ||
1297 | kvm_run->exit_reason = KVM_EXIT_UNKNOWN; | ||
1298 | kvm_run->hw.hardware_exit_reason = exit_code; | ||
1299 | return 0; | ||
1300 | } | ||
1301 | |||
1302 | return svm_exit_handlers[exit_code](svm, kvm_run); | ||
1303 | } | ||
1304 | |||
1305 | static void reload_tss(struct kvm_vcpu *vcpu) | ||
1306 | { | ||
1307 | int cpu = raw_smp_processor_id(); | ||
1308 | |||
1309 | struct svm_cpu_data *svm_data = per_cpu(svm_data, cpu); | ||
1310 | svm_data->tss_desc->type = 9; /* available 32/64-bit TSS */ | ||
1311 | load_TR_desc(); | ||
1312 | } | ||
1313 | |||
1314 | static void pre_svm_run(struct vcpu_svm *svm) | ||
1315 | { | ||
1316 | int cpu = raw_smp_processor_id(); | ||
1317 | |||
1318 | struct svm_cpu_data *svm_data = per_cpu(svm_data, cpu); | ||
1319 | |||
1320 | svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING; | ||
1321 | if (svm->vcpu.cpu != cpu || | ||
1322 | svm->asid_generation != svm_data->asid_generation) | ||
1323 | new_asid(svm, svm_data); | ||
1324 | } | ||
1325 | |||
1326 | |||
1327 | static inline void svm_inject_irq(struct vcpu_svm *svm, int irq) | ||
1328 | { | ||
1329 | struct vmcb_control_area *control; | ||
1330 | |||
1331 | control = &svm->vmcb->control; | ||
1332 | control->int_vector = irq; | ||
1333 | control->int_ctl &= ~V_INTR_PRIO_MASK; | ||
1334 | control->int_ctl |= V_IRQ_MASK | | ||
1335 | ((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT); | ||
1336 | } | ||
1337 | |||
1338 | static void svm_set_irq(struct kvm_vcpu *vcpu, int irq) | ||
1339 | { | ||
1340 | struct vcpu_svm *svm = to_svm(vcpu); | ||
1341 | |||
1342 | svm_inject_irq(svm, irq); | ||
1343 | } | ||
1344 | |||
1345 | static void svm_intr_assist(struct kvm_vcpu *vcpu) | ||
1346 | { | ||
1347 | struct vcpu_svm *svm = to_svm(vcpu); | ||
1348 | struct vmcb *vmcb = svm->vmcb; | ||
1349 | int intr_vector = -1; | ||
1350 | |||
1351 | kvm_inject_pending_timer_irqs(vcpu); | ||
1352 | if ((vmcb->control.exit_int_info & SVM_EVTINJ_VALID) && | ||
1353 | ((vmcb->control.exit_int_info & SVM_EVTINJ_TYPE_MASK) == 0)) { | ||
1354 | intr_vector = vmcb->control.exit_int_info & | ||
1355 | SVM_EVTINJ_VEC_MASK; | ||
1356 | vmcb->control.exit_int_info = 0; | ||
1357 | svm_inject_irq(svm, intr_vector); | ||
1358 | return; | ||
1359 | } | ||
1360 | |||
1361 | if (vmcb->control.int_ctl & V_IRQ_MASK) | ||
1362 | return; | ||
1363 | |||
1364 | if (!kvm_cpu_has_interrupt(vcpu)) | ||
1365 | return; | ||
1366 | |||
1367 | if (!(vmcb->save.rflags & X86_EFLAGS_IF) || | ||
1368 | (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) || | ||
1369 | (vmcb->control.event_inj & SVM_EVTINJ_VALID)) { | ||
1370 | /* unable to deliver irq, set pending irq */ | ||
1371 | vmcb->control.intercept |= (1ULL << INTERCEPT_VINTR); | ||
1372 | svm_inject_irq(svm, 0x0); | ||
1373 | return; | ||
1374 | } | ||
1375 | /* Okay, we can deliver the interrupt: grab it and update PIC state. */ | ||
1376 | intr_vector = kvm_cpu_get_interrupt(vcpu); | ||
1377 | svm_inject_irq(svm, intr_vector); | ||
1378 | kvm_timer_intr_post(vcpu, intr_vector); | ||
1379 | } | ||
1380 | |||
1381 | static void kvm_reput_irq(struct vcpu_svm *svm) | ||
1382 | { | ||
1383 | struct vmcb_control_area *control = &svm->vmcb->control; | ||
1384 | |||
1385 | if ((control->int_ctl & V_IRQ_MASK) | ||
1386 | && !irqchip_in_kernel(svm->vcpu.kvm)) { | ||
1387 | control->int_ctl &= ~V_IRQ_MASK; | ||
1388 | push_irq(&svm->vcpu, control->int_vector); | ||
1389 | } | ||
1390 | |||
1391 | svm->vcpu.interrupt_window_open = | ||
1392 | !(control->int_state & SVM_INTERRUPT_SHADOW_MASK); | ||
1393 | } | ||
1394 | |||
1395 | static void svm_do_inject_vector(struct vcpu_svm *svm) | ||
1396 | { | ||
1397 | struct kvm_vcpu *vcpu = &svm->vcpu; | ||
1398 | int word_index = __ffs(vcpu->irq_summary); | ||
1399 | int bit_index = __ffs(vcpu->irq_pending[word_index]); | ||
1400 | int irq = word_index * BITS_PER_LONG + bit_index; | ||
1401 | |||
1402 | clear_bit(bit_index, &vcpu->irq_pending[word_index]); | ||
1403 | if (!vcpu->irq_pending[word_index]) | ||
1404 | clear_bit(word_index, &vcpu->irq_summary); | ||
1405 | svm_inject_irq(svm, irq); | ||
1406 | } | ||
1407 | |||
1408 | static void do_interrupt_requests(struct kvm_vcpu *vcpu, | ||
1409 | struct kvm_run *kvm_run) | ||
1410 | { | ||
1411 | struct vcpu_svm *svm = to_svm(vcpu); | ||
1412 | struct vmcb_control_area *control = &svm->vmcb->control; | ||
1413 | |||
1414 | svm->vcpu.interrupt_window_open = | ||
1415 | (!(control->int_state & SVM_INTERRUPT_SHADOW_MASK) && | ||
1416 | (svm->vmcb->save.rflags & X86_EFLAGS_IF)); | ||
1417 | |||
1418 | if (svm->vcpu.interrupt_window_open && svm->vcpu.irq_summary) | ||
1419 | /* | ||
1420 | * If interrupts are enabled and not blocked by sti or mov ss, inject now. | ||
1421 | */ | ||
1422 | svm_do_inject_vector(svm); | ||
1423 | |||
1424 | /* | ||
1425 | * Interrupts blocked. Wait for unblock. | ||
1426 | */ | ||
1427 | if (!svm->vcpu.interrupt_window_open && | ||
1428 | (svm->vcpu.irq_summary || kvm_run->request_interrupt_window)) { | ||
1429 | control->intercept |= 1ULL << INTERCEPT_VINTR; | ||
1430 | } else | ||
1431 | control->intercept &= ~(1ULL << INTERCEPT_VINTR); | ||
1432 | } | ||
1433 | |||
1434 | static void save_db_regs(unsigned long *db_regs) | ||
1435 | { | ||
1436 | asm volatile ("mov %%dr0, %0" : "=r"(db_regs[0])); | ||
1437 | asm volatile ("mov %%dr1, %0" : "=r"(db_regs[1])); | ||
1438 | asm volatile ("mov %%dr2, %0" : "=r"(db_regs[2])); | ||
1439 | asm volatile ("mov %%dr3, %0" : "=r"(db_regs[3])); | ||
1440 | } | ||
1441 | |||
1442 | static void load_db_regs(unsigned long *db_regs) | ||
1443 | { | ||
1444 | asm volatile ("mov %0, %%dr0" : : "r"(db_regs[0])); | ||
1445 | asm volatile ("mov %0, %%dr1" : : "r"(db_regs[1])); | ||
1446 | asm volatile ("mov %0, %%dr2" : : "r"(db_regs[2])); | ||
1447 | asm volatile ("mov %0, %%dr3" : : "r"(db_regs[3])); | ||
1448 | } | ||
1449 | |||
1450 | static void svm_flush_tlb(struct kvm_vcpu *vcpu) | ||
1451 | { | ||
1452 | force_new_asid(vcpu); | ||
1453 | } | ||
1454 | |||
1455 | static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu) | ||
1456 | { | ||
1457 | } | ||
1458 | |||
1459 | static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | ||
1460 | { | ||
1461 | struct vcpu_svm *svm = to_svm(vcpu); | ||
1462 | u16 fs_selector; | ||
1463 | u16 gs_selector; | ||
1464 | u16 ldt_selector; | ||
1465 | |||
1466 | pre_svm_run(svm); | ||
1467 | |||
1468 | save_host_msrs(vcpu); | ||
1469 | fs_selector = read_fs(); | ||
1470 | gs_selector = read_gs(); | ||
1471 | ldt_selector = read_ldt(); | ||
1472 | svm->host_cr2 = kvm_read_cr2(); | ||
1473 | svm->host_dr6 = read_dr6(); | ||
1474 | svm->host_dr7 = read_dr7(); | ||
1475 | svm->vmcb->save.cr2 = vcpu->cr2; | ||
1476 | |||
1477 | if (svm->vmcb->save.dr7 & 0xff) { | ||
1478 | write_dr7(0); | ||
1479 | save_db_regs(svm->host_db_regs); | ||
1480 | load_db_regs(svm->db_regs); | ||
1481 | } | ||
1482 | |||
1483 | clgi(); | ||
1484 | |||
1485 | local_irq_enable(); | ||
1486 | |||
1487 | asm volatile ( | ||
1488 | #ifdef CONFIG_X86_64 | ||
1489 | "push %%rbx; push %%rcx; push %%rdx;" | ||
1490 | "push %%rsi; push %%rdi; push %%rbp;" | ||
1491 | "push %%r8; push %%r9; push %%r10; push %%r11;" | ||
1492 | "push %%r12; push %%r13; push %%r14; push %%r15;" | ||
1493 | #else | ||
1494 | "push %%ebx; push %%ecx; push %%edx;" | ||
1495 | "push %%esi; push %%edi; push %%ebp;" | ||
1496 | #endif | ||
1497 | |||
1498 | #ifdef CONFIG_X86_64 | ||
1499 | "mov %c[rbx](%[svm]), %%rbx \n\t" | ||
1500 | "mov %c[rcx](%[svm]), %%rcx \n\t" | ||
1501 | "mov %c[rdx](%[svm]), %%rdx \n\t" | ||
1502 | "mov %c[rsi](%[svm]), %%rsi \n\t" | ||
1503 | "mov %c[rdi](%[svm]), %%rdi \n\t" | ||
1504 | "mov %c[rbp](%[svm]), %%rbp \n\t" | ||
1505 | "mov %c[r8](%[svm]), %%r8 \n\t" | ||
1506 | "mov %c[r9](%[svm]), %%r9 \n\t" | ||
1507 | "mov %c[r10](%[svm]), %%r10 \n\t" | ||
1508 | "mov %c[r11](%[svm]), %%r11 \n\t" | ||
1509 | "mov %c[r12](%[svm]), %%r12 \n\t" | ||
1510 | "mov %c[r13](%[svm]), %%r13 \n\t" | ||
1511 | "mov %c[r14](%[svm]), %%r14 \n\t" | ||
1512 | "mov %c[r15](%[svm]), %%r15 \n\t" | ||
1513 | #else | ||
1514 | "mov %c[rbx](%[svm]), %%ebx \n\t" | ||
1515 | "mov %c[rcx](%[svm]), %%ecx \n\t" | ||
1516 | "mov %c[rdx](%[svm]), %%edx \n\t" | ||
1517 | "mov %c[rsi](%[svm]), %%esi \n\t" | ||
1518 | "mov %c[rdi](%[svm]), %%edi \n\t" | ||
1519 | "mov %c[rbp](%[svm]), %%ebp \n\t" | ||
1520 | #endif | ||
1521 | |||
1522 | #ifdef CONFIG_X86_64 | ||
1523 | /* Enter guest mode */ | ||
1524 | "push %%rax \n\t" | ||
1525 | "mov %c[vmcb](%[svm]), %%rax \n\t" | ||
1526 | SVM_VMLOAD "\n\t" | ||
1527 | SVM_VMRUN "\n\t" | ||
1528 | SVM_VMSAVE "\n\t" | ||
1529 | "pop %%rax \n\t" | ||
1530 | #else | ||
1531 | /* Enter guest mode */ | ||
1532 | "push %%eax \n\t" | ||
1533 | "mov %c[vmcb](%[svm]), %%eax \n\t" | ||
1534 | SVM_VMLOAD "\n\t" | ||
1535 | SVM_VMRUN "\n\t" | ||
1536 | SVM_VMSAVE "\n\t" | ||
1537 | "pop %%eax \n\t" | ||
1538 | #endif | ||
1539 | |||
1540 | /* Save guest registers, load host registers */ | ||
1541 | #ifdef CONFIG_X86_64 | ||
1542 | "mov %%rbx, %c[rbx](%[svm]) \n\t" | ||
1543 | "mov %%rcx, %c[rcx](%[svm]) \n\t" | ||
1544 | "mov %%rdx, %c[rdx](%[svm]) \n\t" | ||
1545 | "mov %%rsi, %c[rsi](%[svm]) \n\t" | ||
1546 | "mov %%rdi, %c[rdi](%[svm]) \n\t" | ||
1547 | "mov %%rbp, %c[rbp](%[svm]) \n\t" | ||
1548 | "mov %%r8, %c[r8](%[svm]) \n\t" | ||
1549 | "mov %%r9, %c[r9](%[svm]) \n\t" | ||
1550 | "mov %%r10, %c[r10](%[svm]) \n\t" | ||
1551 | "mov %%r11, %c[r11](%[svm]) \n\t" | ||
1552 | "mov %%r12, %c[r12](%[svm]) \n\t" | ||
1553 | "mov %%r13, %c[r13](%[svm]) \n\t" | ||
1554 | "mov %%r14, %c[r14](%[svm]) \n\t" | ||
1555 | "mov %%r15, %c[r15](%[svm]) \n\t" | ||
1556 | |||
1557 | "pop %%r15; pop %%r14; pop %%r13; pop %%r12;" | ||
1558 | "pop %%r11; pop %%r10; pop %%r9; pop %%r8;" | ||
1559 | "pop %%rbp; pop %%rdi; pop %%rsi;" | ||
1560 | "pop %%rdx; pop %%rcx; pop %%rbx; \n\t" | ||
1561 | #else | ||
1562 | "mov %%ebx, %c[rbx](%[svm]) \n\t" | ||
1563 | "mov %%ecx, %c[rcx](%[svm]) \n\t" | ||
1564 | "mov %%edx, %c[rdx](%[svm]) \n\t" | ||
1565 | "mov %%esi, %c[rsi](%[svm]) \n\t" | ||
1566 | "mov %%edi, %c[rdi](%[svm]) \n\t" | ||
1567 | "mov %%ebp, %c[rbp](%[svm]) \n\t" | ||
1568 | |||
1569 | "pop %%ebp; pop %%edi; pop %%esi;" | ||
1570 | "pop %%edx; pop %%ecx; pop %%ebx; \n\t" | ||
1571 | #endif | ||
1572 | : | ||
1573 | : [svm]"a"(svm), | ||
1574 | [vmcb]"i"(offsetof(struct vcpu_svm, vmcb_pa)), | ||
1575 | [rbx]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_RBX])), | ||
1576 | [rcx]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_RCX])), | ||
1577 | [rdx]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_RDX])), | ||
1578 | [rsi]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_RSI])), | ||
1579 | [rdi]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_RDI])), | ||
1580 | [rbp]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_RBP])) | ||
1581 | #ifdef CONFIG_X86_64 | ||
1582 | ,[r8 ]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R8])), | ||
1583 | [r9 ]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R9 ])), | ||
1584 | [r10]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R10])), | ||
1585 | [r11]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R11])), | ||
1586 | [r12]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R12])), | ||
1587 | [r13]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R13])), | ||
1588 | [r14]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R14])), | ||
1589 | [r15]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R15])) | ||
1590 | #endif | ||
1591 | : "cc", "memory" ); | ||
1592 | |||
1593 | if ((svm->vmcb->save.dr7 & 0xff)) | ||
1594 | load_db_regs(svm->host_db_regs); | ||
1595 | |||
1596 | vcpu->cr2 = svm->vmcb->save.cr2; | ||
1597 | |||
1598 | write_dr6(svm->host_dr6); | ||
1599 | write_dr7(svm->host_dr7); | ||
1600 | kvm_write_cr2(svm->host_cr2); | ||
1601 | |||
1602 | load_fs(fs_selector); | ||
1603 | load_gs(gs_selector); | ||
1604 | load_ldt(ldt_selector); | ||
1605 | load_host_msrs(vcpu); | ||
1606 | |||
1607 | reload_tss(vcpu); | ||
1608 | |||
1609 | local_irq_disable(); | ||
1610 | |||
1611 | stgi(); | ||
1612 | |||
1613 | svm->next_rip = 0; | ||
1614 | } | ||
1615 | |||
1616 | static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root) | ||
1617 | { | ||
1618 | struct vcpu_svm *svm = to_svm(vcpu); | ||
1619 | |||
1620 | svm->vmcb->save.cr3 = root; | ||
1621 | force_new_asid(vcpu); | ||
1622 | |||
1623 | if (vcpu->fpu_active) { | ||
1624 | svm->vmcb->control.intercept_exceptions |= (1 << NM_VECTOR); | ||
1625 | svm->vmcb->save.cr0 |= X86_CR0_TS; | ||
1626 | vcpu->fpu_active = 0; | ||
1627 | } | ||
1628 | } | ||
1629 | |||
1630 | static void svm_inject_page_fault(struct kvm_vcpu *vcpu, | ||
1631 | unsigned long addr, | ||
1632 | uint32_t err_code) | ||
1633 | { | ||
1634 | struct vcpu_svm *svm = to_svm(vcpu); | ||
1635 | uint32_t exit_int_info = svm->vmcb->control.exit_int_info; | ||
1636 | |||
1637 | ++vcpu->stat.pf_guest; | ||
1638 | |||
1639 | if (is_page_fault(exit_int_info)) { | ||
1640 | |||
1641 | svm->vmcb->control.event_inj_err = 0; | ||
1642 | svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | | ||
1643 | SVM_EVTINJ_VALID_ERR | | ||
1644 | SVM_EVTINJ_TYPE_EXEPT | | ||
1645 | DF_VECTOR; | ||
1646 | return; | ||
1647 | } | ||
1648 | vcpu->cr2 = addr; | ||
1649 | svm->vmcb->save.cr2 = addr; | ||
1650 | svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | | ||
1651 | SVM_EVTINJ_VALID_ERR | | ||
1652 | SVM_EVTINJ_TYPE_EXEPT | | ||
1653 | PF_VECTOR; | ||
1654 | svm->vmcb->control.event_inj_err = err_code; | ||
1655 | } | ||
1656 | |||
1657 | |||
1658 | static int is_disabled(void) | ||
1659 | { | ||
1660 | u64 vm_cr; | ||
1661 | |||
1662 | rdmsrl(MSR_VM_CR, vm_cr); | ||
1663 | if (vm_cr & (1 << SVM_VM_CR_SVM_DISABLE)) | ||
1664 | return 1; | ||
1665 | |||
1666 | return 0; | ||
1667 | } | ||
1668 | |||
1669 | static void | ||
1670 | svm_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall) | ||
1671 | { | ||
1672 | /* | ||
1673 | * Patch in the VMMCALL instruction: | ||
1674 | */ | ||
1675 | hypercall[0] = 0x0f; | ||
1676 | hypercall[1] = 0x01; | ||
1677 | hypercall[2] = 0xd9; | ||
1678 | hypercall[3] = 0xc3; | ||
1679 | } | ||
1680 | |||
1681 | static void svm_check_processor_compat(void *rtn) | ||
1682 | { | ||
1683 | *(int *)rtn = 0; | ||
1684 | } | ||
1685 | |||
1686 | static struct kvm_x86_ops svm_x86_ops = { | ||
1687 | .cpu_has_kvm_support = has_svm, | ||
1688 | .disabled_by_bios = is_disabled, | ||
1689 | .hardware_setup = svm_hardware_setup, | ||
1690 | .hardware_unsetup = svm_hardware_unsetup, | ||
1691 | .check_processor_compatibility = svm_check_processor_compat, | ||
1692 | .hardware_enable = svm_hardware_enable, | ||
1693 | .hardware_disable = svm_hardware_disable, | ||
1694 | |||
1695 | .vcpu_create = svm_create_vcpu, | ||
1696 | .vcpu_free = svm_free_vcpu, | ||
1697 | .vcpu_reset = svm_vcpu_reset, | ||
1698 | |||
1699 | .prepare_guest_switch = svm_prepare_guest_switch, | ||
1700 | .vcpu_load = svm_vcpu_load, | ||
1701 | .vcpu_put = svm_vcpu_put, | ||
1702 | .vcpu_decache = svm_vcpu_decache, | ||
1703 | |||
1704 | .set_guest_debug = svm_guest_debug, | ||
1705 | .get_msr = svm_get_msr, | ||
1706 | .set_msr = svm_set_msr, | ||
1707 | .get_segment_base = svm_get_segment_base, | ||
1708 | .get_segment = svm_get_segment, | ||
1709 | .set_segment = svm_set_segment, | ||
1710 | .get_cs_db_l_bits = kvm_get_cs_db_l_bits, | ||
1711 | .decache_cr4_guest_bits = svm_decache_cr4_guest_bits, | ||
1712 | .set_cr0 = svm_set_cr0, | ||
1713 | .set_cr3 = svm_set_cr3, | ||
1714 | .set_cr4 = svm_set_cr4, | ||
1715 | .set_efer = svm_set_efer, | ||
1716 | .get_idt = svm_get_idt, | ||
1717 | .set_idt = svm_set_idt, | ||
1718 | .get_gdt = svm_get_gdt, | ||
1719 | .set_gdt = svm_set_gdt, | ||
1720 | .get_dr = svm_get_dr, | ||
1721 | .set_dr = svm_set_dr, | ||
1722 | .cache_regs = svm_cache_regs, | ||
1723 | .decache_regs = svm_decache_regs, | ||
1724 | .get_rflags = svm_get_rflags, | ||
1725 | .set_rflags = svm_set_rflags, | ||
1726 | |||
1727 | .tlb_flush = svm_flush_tlb, | ||
1728 | .inject_page_fault = svm_inject_page_fault, | ||
1729 | |||
1730 | .inject_gp = svm_inject_gp, | ||
1731 | |||
1732 | .run = svm_vcpu_run, | ||
1733 | .handle_exit = handle_exit, | ||
1734 | .skip_emulated_instruction = skip_emulated_instruction, | ||
1735 | .patch_hypercall = svm_patch_hypercall, | ||
1736 | .get_irq = svm_get_irq, | ||
1737 | .set_irq = svm_set_irq, | ||
1738 | .inject_pending_irq = svm_intr_assist, | ||
1739 | .inject_pending_vectors = do_interrupt_requests, | ||
1740 | }; | ||
1741 | |||
1742 | static int __init svm_init(void) | ||
1743 | { | ||
1744 | return kvm_init_x86(&svm_x86_ops, sizeof(struct vcpu_svm), | ||
1745 | THIS_MODULE); | ||
1746 | } | ||
1747 | |||
1748 | static void __exit svm_exit(void) | ||
1749 | { | ||
1750 | kvm_exit_x86(); | ||
1751 | } | ||
1752 | |||
1753 | module_init(svm_init) | ||
1754 | module_exit(svm_exit) | ||
diff --git a/drivers/kvm/svm.h b/drivers/kvm/svm.h deleted file mode 100644 index 3b1b0f35b6cb..000000000000 --- a/drivers/kvm/svm.h +++ /dev/null | |||
@@ -1,324 +0,0 @@ | |||
1 | #ifndef __SVM_H | ||
2 | #define __SVM_H | ||
3 | |||
4 | enum { | ||
5 | INTERCEPT_INTR, | ||
6 | INTERCEPT_NMI, | ||
7 | INTERCEPT_SMI, | ||
8 | INTERCEPT_INIT, | ||
9 | INTERCEPT_VINTR, | ||
10 | INTERCEPT_SELECTIVE_CR0, | ||
11 | INTERCEPT_STORE_IDTR, | ||
12 | INTERCEPT_STORE_GDTR, | ||
13 | INTERCEPT_STORE_LDTR, | ||
14 | INTERCEPT_STORE_TR, | ||
15 | INTERCEPT_LOAD_IDTR, | ||
16 | INTERCEPT_LOAD_GDTR, | ||
17 | INTERCEPT_LOAD_LDTR, | ||
18 | INTERCEPT_LOAD_TR, | ||
19 | INTERCEPT_RDTSC, | ||
20 | INTERCEPT_RDPMC, | ||
21 | INTERCEPT_PUSHF, | ||
22 | INTERCEPT_POPF, | ||
23 | INTERCEPT_CPUID, | ||
24 | INTERCEPT_RSM, | ||
25 | INTERCEPT_IRET, | ||
26 | INTERCEPT_INTn, | ||
27 | INTERCEPT_INVD, | ||
28 | INTERCEPT_PAUSE, | ||
29 | INTERCEPT_HLT, | ||
30 | INTERCEPT_INVLPG, | ||
31 | INTERCEPT_INVLPGA, | ||
32 | INTERCEPT_IOIO_PROT, | ||
33 | INTERCEPT_MSR_PROT, | ||
34 | INTERCEPT_TASK_SWITCH, | ||
35 | INTERCEPT_FERR_FREEZE, | ||
36 | INTERCEPT_SHUTDOWN, | ||
37 | INTERCEPT_VMRUN, | ||
38 | INTERCEPT_VMMCALL, | ||
39 | INTERCEPT_VMLOAD, | ||
40 | INTERCEPT_VMSAVE, | ||
41 | INTERCEPT_STGI, | ||
42 | INTERCEPT_CLGI, | ||
43 | INTERCEPT_SKINIT, | ||
44 | INTERCEPT_RDTSCP, | ||
45 | INTERCEPT_ICEBP, | ||
46 | INTERCEPT_WBINVD, | ||
47 | INTERCEPT_MONITOR, | ||
48 | INTERCEPT_MWAIT, | ||
49 | INTERCEPT_MWAIT_COND, | ||
50 | }; | ||
51 | |||
52 | |||
53 | struct __attribute__ ((__packed__)) vmcb_control_area { | ||
54 | u16 intercept_cr_read; | ||
55 | u16 intercept_cr_write; | ||
56 | u16 intercept_dr_read; | ||
57 | u16 intercept_dr_write; | ||
58 | u32 intercept_exceptions; | ||
59 | u64 intercept; | ||
60 | u8 reserved_1[44]; | ||
61 | u64 iopm_base_pa; | ||
62 | u64 msrpm_base_pa; | ||
63 | u64 tsc_offset; | ||
64 | u32 asid; | ||
65 | u8 tlb_ctl; | ||
66 | u8 reserved_2[3]; | ||
67 | u32 int_ctl; | ||
68 | u32 int_vector; | ||
69 | u32 int_state; | ||
70 | u8 reserved_3[4]; | ||
71 | u32 exit_code; | ||
72 | u32 exit_code_hi; | ||
73 | u64 exit_info_1; | ||
74 | u64 exit_info_2; | ||
75 | u32 exit_int_info; | ||
76 | u32 exit_int_info_err; | ||
77 | u64 nested_ctl; | ||
78 | u8 reserved_4[16]; | ||
79 | u32 event_inj; | ||
80 | u32 event_inj_err; | ||
81 | u64 nested_cr3; | ||
82 | u64 lbr_ctl; | ||
83 | u8 reserved_5[832]; | ||
84 | }; | ||
85 | |||
86 | |||
87 | #define TLB_CONTROL_DO_NOTHING 0 | ||
88 | #define TLB_CONTROL_FLUSH_ALL_ASID 1 | ||
89 | |||
90 | #define V_TPR_MASK 0x0f | ||
91 | |||
92 | #define V_IRQ_SHIFT 8 | ||
93 | #define V_IRQ_MASK (1 << V_IRQ_SHIFT) | ||
94 | |||
95 | #define V_INTR_PRIO_SHIFT 16 | ||
96 | #define V_INTR_PRIO_MASK (0x0f << V_INTR_PRIO_SHIFT) | ||
97 | |||
98 | #define V_IGN_TPR_SHIFT 20 | ||
99 | #define V_IGN_TPR_MASK (1 << V_IGN_TPR_SHIFT) | ||
100 | |||
101 | #define V_INTR_MASKING_SHIFT 24 | ||
102 | #define V_INTR_MASKING_MASK (1 << V_INTR_MASKING_SHIFT) | ||
103 | |||
104 | #define SVM_INTERRUPT_SHADOW_MASK 1 | ||
105 | |||
106 | #define SVM_IOIO_STR_SHIFT 2 | ||
107 | #define SVM_IOIO_REP_SHIFT 3 | ||
108 | #define SVM_IOIO_SIZE_SHIFT 4 | ||
109 | #define SVM_IOIO_ASIZE_SHIFT 7 | ||
110 | |||
111 | #define SVM_IOIO_TYPE_MASK 1 | ||
112 | #define SVM_IOIO_STR_MASK (1 << SVM_IOIO_STR_SHIFT) | ||
113 | #define SVM_IOIO_REP_MASK (1 << SVM_IOIO_REP_SHIFT) | ||
114 | #define SVM_IOIO_SIZE_MASK (7 << SVM_IOIO_SIZE_SHIFT) | ||
115 | #define SVM_IOIO_ASIZE_MASK (7 << SVM_IOIO_ASIZE_SHIFT) | ||
116 | |||
117 | struct __attribute__ ((__packed__)) vmcb_seg { | ||
118 | u16 selector; | ||
119 | u16 attrib; | ||
120 | u32 limit; | ||
121 | u64 base; | ||
122 | }; | ||
123 | |||
124 | struct __attribute__ ((__packed__)) vmcb_save_area { | ||
125 | struct vmcb_seg es; | ||
126 | struct vmcb_seg cs; | ||
127 | struct vmcb_seg ss; | ||
128 | struct vmcb_seg ds; | ||
129 | struct vmcb_seg fs; | ||
130 | struct vmcb_seg gs; | ||
131 | struct vmcb_seg gdtr; | ||
132 | struct vmcb_seg ldtr; | ||
133 | struct vmcb_seg idtr; | ||
134 | struct vmcb_seg tr; | ||
135 | u8 reserved_1[43]; | ||
136 | u8 cpl; | ||
137 | u8 reserved_2[4]; | ||
138 | u64 efer; | ||
139 | u8 reserved_3[112]; | ||
140 | u64 cr4; | ||
141 | u64 cr3; | ||
142 | u64 cr0; | ||
143 | u64 dr7; | ||
144 | u64 dr6; | ||
145 | u64 rflags; | ||
146 | u64 rip; | ||
147 | u8 reserved_4[88]; | ||
148 | u64 rsp; | ||
149 | u8 reserved_5[24]; | ||
150 | u64 rax; | ||
151 | u64 star; | ||
152 | u64 lstar; | ||
153 | u64 cstar; | ||
154 | u64 sfmask; | ||
155 | u64 kernel_gs_base; | ||
156 | u64 sysenter_cs; | ||
157 | u64 sysenter_esp; | ||
158 | u64 sysenter_eip; | ||
159 | u64 cr2; | ||
160 | u8 reserved_6[32]; | ||
161 | u64 g_pat; | ||
162 | u64 dbgctl; | ||
163 | u64 br_from; | ||
164 | u64 br_to; | ||
165 | u64 last_excp_from; | ||
166 | u64 last_excp_to; | ||
167 | }; | ||
168 | |||
169 | struct __attribute__ ((__packed__)) vmcb { | ||
170 | struct vmcb_control_area control; | ||
171 | struct vmcb_save_area save; | ||
172 | }; | ||
173 | |||
174 | #define SVM_CPUID_FEATURE_SHIFT 2 | ||
175 | #define SVM_CPUID_FUNC 0x8000000a | ||
176 | |||
177 | #define MSR_EFER_SVME_MASK (1ULL << 12) | ||
178 | #define MSR_VM_CR 0xc0010114 | ||
179 | #define MSR_VM_HSAVE_PA 0xc0010117ULL | ||
180 | |||
181 | #define SVM_VM_CR_SVM_DISABLE 4 | ||
182 | |||
183 | #define SVM_SELECTOR_S_SHIFT 4 | ||
184 | #define SVM_SELECTOR_DPL_SHIFT 5 | ||
185 | #define SVM_SELECTOR_P_SHIFT 7 | ||
186 | #define SVM_SELECTOR_AVL_SHIFT 8 | ||
187 | #define SVM_SELECTOR_L_SHIFT 9 | ||
188 | #define SVM_SELECTOR_DB_SHIFT 10 | ||
189 | #define SVM_SELECTOR_G_SHIFT 11 | ||
190 | |||
191 | #define SVM_SELECTOR_TYPE_MASK (0xf) | ||
192 | #define SVM_SELECTOR_S_MASK (1 << SVM_SELECTOR_S_SHIFT) | ||
193 | #define SVM_SELECTOR_DPL_MASK (3 << SVM_SELECTOR_DPL_SHIFT) | ||
194 | #define SVM_SELECTOR_P_MASK (1 << SVM_SELECTOR_P_SHIFT) | ||
195 | #define SVM_SELECTOR_AVL_MASK (1 << SVM_SELECTOR_AVL_SHIFT) | ||
196 | #define SVM_SELECTOR_L_MASK (1 << SVM_SELECTOR_L_SHIFT) | ||
197 | #define SVM_SELECTOR_DB_MASK (1 << SVM_SELECTOR_DB_SHIFT) | ||
198 | #define SVM_SELECTOR_G_MASK (1 << SVM_SELECTOR_G_SHIFT) | ||
199 | |||
200 | #define SVM_SELECTOR_WRITE_MASK (1 << 1) | ||
201 | #define SVM_SELECTOR_READ_MASK SVM_SELECTOR_WRITE_MASK | ||
202 | #define SVM_SELECTOR_CODE_MASK (1 << 3) | ||
203 | |||
204 | #define INTERCEPT_CR0_MASK 1 | ||
205 | #define INTERCEPT_CR3_MASK (1 << 3) | ||
206 | #define INTERCEPT_CR4_MASK (1 << 4) | ||
207 | |||
208 | #define INTERCEPT_DR0_MASK 1 | ||
209 | #define INTERCEPT_DR1_MASK (1 << 1) | ||
210 | #define INTERCEPT_DR2_MASK (1 << 2) | ||
211 | #define INTERCEPT_DR3_MASK (1 << 3) | ||
212 | #define INTERCEPT_DR4_MASK (1 << 4) | ||
213 | #define INTERCEPT_DR5_MASK (1 << 5) | ||
214 | #define INTERCEPT_DR6_MASK (1 << 6) | ||
215 | #define INTERCEPT_DR7_MASK (1 << 7) | ||
216 | |||
217 | #define SVM_EVTINJ_VEC_MASK 0xff | ||
218 | |||
219 | #define SVM_EVTINJ_TYPE_SHIFT 8 | ||
220 | #define SVM_EVTINJ_TYPE_MASK (7 << SVM_EVTINJ_TYPE_SHIFT) | ||
221 | |||
222 | #define SVM_EVTINJ_TYPE_INTR (0 << SVM_EVTINJ_TYPE_SHIFT) | ||
223 | #define SVM_EVTINJ_TYPE_NMI (2 << SVM_EVTINJ_TYPE_SHIFT) | ||
224 | #define SVM_EVTINJ_TYPE_EXEPT (3 << SVM_EVTINJ_TYPE_SHIFT) | ||
225 | #define SVM_EVTINJ_TYPE_SOFT (4 << SVM_EVTINJ_TYPE_SHIFT) | ||
226 | |||
227 | #define SVM_EVTINJ_VALID (1 << 31) | ||
228 | #define SVM_EVTINJ_VALID_ERR (1 << 11) | ||
229 | |||
230 | #define SVM_EXITINTINFO_VEC_MASK SVM_EVTINJ_VEC_MASK | ||
231 | |||
232 | #define SVM_EXITINTINFO_TYPE_INTR SVM_EVTINJ_TYPE_INTR | ||
233 | #define SVM_EXITINTINFO_TYPE_NMI SVM_EVTINJ_TYPE_NMI | ||
234 | #define SVM_EXITINTINFO_TYPE_EXEPT SVM_EVTINJ_TYPE_EXEPT | ||
235 | #define SVM_EXITINTINFO_TYPE_SOFT SVM_EVTINJ_TYPE_SOFT | ||
236 | |||
237 | #define SVM_EXITINTINFO_VALID SVM_EVTINJ_VALID | ||
238 | #define SVM_EXITINTINFO_VALID_ERR SVM_EVTINJ_VALID_ERR | ||
239 | |||
240 | #define SVM_EXIT_READ_CR0 0x000 | ||
241 | #define SVM_EXIT_READ_CR3 0x003 | ||
242 | #define SVM_EXIT_READ_CR4 0x004 | ||
243 | #define SVM_EXIT_READ_CR8 0x008 | ||
244 | #define SVM_EXIT_WRITE_CR0 0x010 | ||
245 | #define SVM_EXIT_WRITE_CR3 0x013 | ||
246 | #define SVM_EXIT_WRITE_CR4 0x014 | ||
247 | #define SVM_EXIT_WRITE_CR8 0x018 | ||
248 | #define SVM_EXIT_READ_DR0 0x020 | ||
249 | #define SVM_EXIT_READ_DR1 0x021 | ||
250 | #define SVM_EXIT_READ_DR2 0x022 | ||
251 | #define SVM_EXIT_READ_DR3 0x023 | ||
252 | #define SVM_EXIT_READ_DR4 0x024 | ||
253 | #define SVM_EXIT_READ_DR5 0x025 | ||
254 | #define SVM_EXIT_READ_DR6 0x026 | ||
255 | #define SVM_EXIT_READ_DR7 0x027 | ||
256 | #define SVM_EXIT_WRITE_DR0 0x030 | ||
257 | #define SVM_EXIT_WRITE_DR1 0x031 | ||
258 | #define SVM_EXIT_WRITE_DR2 0x032 | ||
259 | #define SVM_EXIT_WRITE_DR3 0x033 | ||
260 | #define SVM_EXIT_WRITE_DR4 0x034 | ||
261 | #define SVM_EXIT_WRITE_DR5 0x035 | ||
262 | #define SVM_EXIT_WRITE_DR6 0x036 | ||
263 | #define SVM_EXIT_WRITE_DR7 0x037 | ||
264 | #define SVM_EXIT_EXCP_BASE 0x040 | ||
265 | #define SVM_EXIT_INTR 0x060 | ||
266 | #define SVM_EXIT_NMI 0x061 | ||
267 | #define SVM_EXIT_SMI 0x062 | ||
268 | #define SVM_EXIT_INIT 0x063 | ||
269 | #define SVM_EXIT_VINTR 0x064 | ||
270 | #define SVM_EXIT_CR0_SEL_WRITE 0x065 | ||
271 | #define SVM_EXIT_IDTR_READ 0x066 | ||
272 | #define SVM_EXIT_GDTR_READ 0x067 | ||
273 | #define SVM_EXIT_LDTR_READ 0x068 | ||
274 | #define SVM_EXIT_TR_READ 0x069 | ||
275 | #define SVM_EXIT_IDTR_WRITE 0x06a | ||
276 | #define SVM_EXIT_GDTR_WRITE 0x06b | ||
277 | #define SVM_EXIT_LDTR_WRITE 0x06c | ||
278 | #define SVM_EXIT_TR_WRITE 0x06d | ||
279 | #define SVM_EXIT_RDTSC 0x06e | ||
280 | #define SVM_EXIT_RDPMC 0x06f | ||
281 | #define SVM_EXIT_PUSHF 0x070 | ||
282 | #define SVM_EXIT_POPF 0x071 | ||
283 | #define SVM_EXIT_CPUID 0x072 | ||
284 | #define SVM_EXIT_RSM 0x073 | ||
285 | #define SVM_EXIT_IRET 0x074 | ||
286 | #define SVM_EXIT_SWINT 0x075 | ||
287 | #define SVM_EXIT_INVD 0x076 | ||
288 | #define SVM_EXIT_PAUSE 0x077 | ||
289 | #define SVM_EXIT_HLT 0x078 | ||
290 | #define SVM_EXIT_INVLPG 0x079 | ||
291 | #define SVM_EXIT_INVLPGA 0x07a | ||
292 | #define SVM_EXIT_IOIO 0x07b | ||
293 | #define SVM_EXIT_MSR 0x07c | ||
294 | #define SVM_EXIT_TASK_SWITCH 0x07d | ||
295 | #define SVM_EXIT_FERR_FREEZE 0x07e | ||
296 | #define SVM_EXIT_SHUTDOWN 0x07f | ||
297 | #define SVM_EXIT_VMRUN 0x080 | ||
298 | #define SVM_EXIT_VMMCALL 0x081 | ||
299 | #define SVM_EXIT_VMLOAD 0x082 | ||
300 | #define SVM_EXIT_VMSAVE 0x083 | ||
301 | #define SVM_EXIT_STGI 0x084 | ||
302 | #define SVM_EXIT_CLGI 0x085 | ||
303 | #define SVM_EXIT_SKINIT 0x086 | ||
304 | #define SVM_EXIT_RDTSCP 0x087 | ||
305 | #define SVM_EXIT_ICEBP 0x088 | ||
306 | #define SVM_EXIT_WBINVD 0x089 | ||
307 | #define SVM_EXIT_MONITOR 0x08a | ||
308 | #define SVM_EXIT_MWAIT 0x08b | ||
309 | #define SVM_EXIT_MWAIT_COND 0x08c | ||
310 | #define SVM_EXIT_NPF 0x400 | ||
311 | |||
312 | #define SVM_EXIT_ERR -1 | ||
313 | |||
314 | #define SVM_CR0_SELECTIVE_MASK (1 << 3 | 1) /* TS and MP */ | ||
315 | |||
316 | #define SVM_VMLOAD ".byte 0x0f, 0x01, 0xda" | ||
317 | #define SVM_VMRUN ".byte 0x0f, 0x01, 0xd8" | ||
318 | #define SVM_VMSAVE ".byte 0x0f, 0x01, 0xdb" | ||
319 | #define SVM_CLGI ".byte 0x0f, 0x01, 0xdd" | ||
320 | #define SVM_STGI ".byte 0x0f, 0x01, 0xdc" | ||
321 | #define SVM_INVLPGA ".byte 0x0f, 0x01, 0xdf" | ||
322 | |||
323 | #endif | ||
324 | |||
diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c deleted file mode 100644 index 5b397b6c9f93..000000000000 --- a/drivers/kvm/vmx.c +++ /dev/null | |||
@@ -1,2566 +0,0 @@ | |||
1 | /* | ||
2 | * Kernel-based Virtual Machine driver for Linux | ||
3 | * | ||
4 | * This module enables machines with Intel VT-x extensions to run virtual | ||
5 | * machines without emulation or binary translation. | ||
6 | * | ||
7 | * Copyright (C) 2006 Qumranet, Inc. | ||
8 | * | ||
9 | * Authors: | ||
10 | * Avi Kivity <avi@qumranet.com> | ||
11 | * Yaniv Kamay <yaniv@qumranet.com> | ||
12 | * | ||
13 | * This work is licensed under the terms of the GNU GPL, version 2. See | ||
14 | * the COPYING file in the top-level directory. | ||
15 | * | ||
16 | */ | ||
17 | |||
18 | #include "kvm.h" | ||
19 | #include "x86_emulate.h" | ||
20 | #include "irq.h" | ||
21 | #include "vmx.h" | ||
22 | #include "segment_descriptor.h" | ||
23 | |||
24 | #include <linux/module.h> | ||
25 | #include <linux/kernel.h> | ||
26 | #include <linux/mm.h> | ||
27 | #include <linux/highmem.h> | ||
28 | #include <linux/sched.h> | ||
29 | |||
30 | #include <asm/io.h> | ||
31 | #include <asm/desc.h> | ||
32 | |||
33 | MODULE_AUTHOR("Qumranet"); | ||
34 | MODULE_LICENSE("GPL"); | ||
35 | |||
36 | struct vmcs { | ||
37 | u32 revision_id; | ||
38 | u32 abort; | ||
39 | char data[0]; | ||
40 | }; | ||
41 | |||
42 | struct vcpu_vmx { | ||
43 | struct kvm_vcpu vcpu; | ||
44 | int launched; | ||
45 | u8 fail; | ||
46 | struct kvm_msr_entry *guest_msrs; | ||
47 | struct kvm_msr_entry *host_msrs; | ||
48 | int nmsrs; | ||
49 | int save_nmsrs; | ||
50 | int msr_offset_efer; | ||
51 | #ifdef CONFIG_X86_64 | ||
52 | int msr_offset_kernel_gs_base; | ||
53 | #endif | ||
54 | struct vmcs *vmcs; | ||
55 | struct { | ||
56 | int loaded; | ||
57 | u16 fs_sel, gs_sel, ldt_sel; | ||
58 | int gs_ldt_reload_needed; | ||
59 | int fs_reload_needed; | ||
60 | } host_state; | ||
61 | |||
62 | }; | ||
63 | |||
64 | static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) | ||
65 | { | ||
66 | return container_of(vcpu, struct vcpu_vmx, vcpu); | ||
67 | } | ||
68 | |||
69 | static int init_rmode_tss(struct kvm *kvm); | ||
70 | |||
71 | static DEFINE_PER_CPU(struct vmcs *, vmxarea); | ||
72 | static DEFINE_PER_CPU(struct vmcs *, current_vmcs); | ||
73 | |||
74 | static struct page *vmx_io_bitmap_a; | ||
75 | static struct page *vmx_io_bitmap_b; | ||
76 | |||
77 | #define EFER_SAVE_RESTORE_BITS ((u64)EFER_SCE) | ||
78 | |||
79 | static struct vmcs_config { | ||
80 | int size; | ||
81 | int order; | ||
82 | u32 revision_id; | ||
83 | u32 pin_based_exec_ctrl; | ||
84 | u32 cpu_based_exec_ctrl; | ||
85 | u32 vmexit_ctrl; | ||
86 | u32 vmentry_ctrl; | ||
87 | } vmcs_config; | ||
88 | |||
89 | #define VMX_SEGMENT_FIELD(seg) \ | ||
90 | [VCPU_SREG_##seg] = { \ | ||
91 | .selector = GUEST_##seg##_SELECTOR, \ | ||
92 | .base = GUEST_##seg##_BASE, \ | ||
93 | .limit = GUEST_##seg##_LIMIT, \ | ||
94 | .ar_bytes = GUEST_##seg##_AR_BYTES, \ | ||
95 | } | ||
96 | |||
97 | static struct kvm_vmx_segment_field { | ||
98 | unsigned selector; | ||
99 | unsigned base; | ||
100 | unsigned limit; | ||
101 | unsigned ar_bytes; | ||
102 | } kvm_vmx_segment_fields[] = { | ||
103 | VMX_SEGMENT_FIELD(CS), | ||
104 | VMX_SEGMENT_FIELD(DS), | ||
105 | VMX_SEGMENT_FIELD(ES), | ||
106 | VMX_SEGMENT_FIELD(FS), | ||
107 | VMX_SEGMENT_FIELD(GS), | ||
108 | VMX_SEGMENT_FIELD(SS), | ||
109 | VMX_SEGMENT_FIELD(TR), | ||
110 | VMX_SEGMENT_FIELD(LDTR), | ||
111 | }; | ||
112 | |||
113 | /* | ||
114 | * Keep MSR_K6_STAR at the end, as setup_msrs() will try to optimize it | ||
115 | * away by decrementing the array size. | ||
116 | */ | ||
117 | static const u32 vmx_msr_index[] = { | ||
118 | #ifdef CONFIG_X86_64 | ||
119 | MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, MSR_KERNEL_GS_BASE, | ||
120 | #endif | ||
121 | MSR_EFER, MSR_K6_STAR, | ||
122 | }; | ||
123 | #define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index) | ||
124 | |||
125 | static void load_msrs(struct kvm_msr_entry *e, int n) | ||
126 | { | ||
127 | int i; | ||
128 | |||
129 | for (i = 0; i < n; ++i) | ||
130 | wrmsrl(e[i].index, e[i].data); | ||
131 | } | ||
132 | |||
133 | static void save_msrs(struct kvm_msr_entry *e, int n) | ||
134 | { | ||
135 | int i; | ||
136 | |||
137 | for (i = 0; i < n; ++i) | ||
138 | rdmsrl(e[i].index, e[i].data); | ||
139 | } | ||
140 | |||
141 | static inline u64 msr_efer_save_restore_bits(struct kvm_msr_entry msr) | ||
142 | { | ||
143 | return (u64)msr.data & EFER_SAVE_RESTORE_BITS; | ||
144 | } | ||
145 | |||
146 | static inline int msr_efer_need_save_restore(struct vcpu_vmx *vmx) | ||
147 | { | ||
148 | int efer_offset = vmx->msr_offset_efer; | ||
149 | return msr_efer_save_restore_bits(vmx->host_msrs[efer_offset]) != | ||
150 | msr_efer_save_restore_bits(vmx->guest_msrs[efer_offset]); | ||
151 | } | ||
152 | |||
153 | static inline int is_page_fault(u32 intr_info) | ||
154 | { | ||
155 | return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | | ||
156 | INTR_INFO_VALID_MASK)) == | ||
157 | (INTR_TYPE_EXCEPTION | PF_VECTOR | INTR_INFO_VALID_MASK); | ||
158 | } | ||
159 | |||
160 | static inline int is_no_device(u32 intr_info) | ||
161 | { | ||
162 | return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | | ||
163 | INTR_INFO_VALID_MASK)) == | ||
164 | (INTR_TYPE_EXCEPTION | NM_VECTOR | INTR_INFO_VALID_MASK); | ||
165 | } | ||
166 | |||
167 | static inline int is_external_interrupt(u32 intr_info) | ||
168 | { | ||
169 | return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK)) | ||
170 | == (INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK); | ||
171 | } | ||
172 | |||
173 | static inline int cpu_has_vmx_tpr_shadow(void) | ||
174 | { | ||
175 | return (vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW); | ||
176 | } | ||
177 | |||
178 | static inline int vm_need_tpr_shadow(struct kvm *kvm) | ||
179 | { | ||
180 | return ((cpu_has_vmx_tpr_shadow()) && (irqchip_in_kernel(kvm))); | ||
181 | } | ||
182 | |||
183 | static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr) | ||
184 | { | ||
185 | int i; | ||
186 | |||
187 | for (i = 0; i < vmx->nmsrs; ++i) | ||
188 | if (vmx->guest_msrs[i].index == msr) | ||
189 | return i; | ||
190 | return -1; | ||
191 | } | ||
192 | |||
193 | static struct kvm_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr) | ||
194 | { | ||
195 | int i; | ||
196 | |||
197 | i = __find_msr_index(vmx, msr); | ||
198 | if (i >= 0) | ||
199 | return &vmx->guest_msrs[i]; | ||
200 | return NULL; | ||
201 | } | ||
202 | |||
203 | static void vmcs_clear(struct vmcs *vmcs) | ||
204 | { | ||
205 | u64 phys_addr = __pa(vmcs); | ||
206 | u8 error; | ||
207 | |||
208 | asm volatile (ASM_VMX_VMCLEAR_RAX "; setna %0" | ||
209 | : "=g"(error) : "a"(&phys_addr), "m"(phys_addr) | ||
210 | : "cc", "memory"); | ||
211 | if (error) | ||
212 | printk(KERN_ERR "kvm: vmclear fail: %p/%llx\n", | ||
213 | vmcs, phys_addr); | ||
214 | } | ||
215 | |||
216 | static void __vcpu_clear(void *arg) | ||
217 | { | ||
218 | struct vcpu_vmx *vmx = arg; | ||
219 | int cpu = raw_smp_processor_id(); | ||
220 | |||
221 | if (vmx->vcpu.cpu == cpu) | ||
222 | vmcs_clear(vmx->vmcs); | ||
223 | if (per_cpu(current_vmcs, cpu) == vmx->vmcs) | ||
224 | per_cpu(current_vmcs, cpu) = NULL; | ||
225 | rdtscll(vmx->vcpu.host_tsc); | ||
226 | } | ||
227 | |||
228 | static void vcpu_clear(struct vcpu_vmx *vmx) | ||
229 | { | ||
230 | if (vmx->vcpu.cpu != raw_smp_processor_id() && vmx->vcpu.cpu != -1) | ||
231 | smp_call_function_single(vmx->vcpu.cpu, __vcpu_clear, | ||
232 | vmx, 0, 1); | ||
233 | else | ||
234 | __vcpu_clear(vmx); | ||
235 | vmx->launched = 0; | ||
236 | } | ||
237 | |||
238 | static unsigned long vmcs_readl(unsigned long field) | ||
239 | { | ||
240 | unsigned long value; | ||
241 | |||
242 | asm volatile (ASM_VMX_VMREAD_RDX_RAX | ||
243 | : "=a"(value) : "d"(field) : "cc"); | ||
244 | return value; | ||
245 | } | ||
246 | |||
247 | static u16 vmcs_read16(unsigned long field) | ||
248 | { | ||
249 | return vmcs_readl(field); | ||
250 | } | ||
251 | |||
252 | static u32 vmcs_read32(unsigned long field) | ||
253 | { | ||
254 | return vmcs_readl(field); | ||
255 | } | ||
256 | |||
257 | static u64 vmcs_read64(unsigned long field) | ||
258 | { | ||
259 | #ifdef CONFIG_X86_64 | ||
260 | return vmcs_readl(field); | ||
261 | #else | ||
262 | return vmcs_readl(field) | ((u64)vmcs_readl(field+1) << 32); | ||
263 | #endif | ||
264 | } | ||
265 | |||
266 | static noinline void vmwrite_error(unsigned long field, unsigned long value) | ||
267 | { | ||
268 | printk(KERN_ERR "vmwrite error: reg %lx value %lx (err %d)\n", | ||
269 | field, value, vmcs_read32(VM_INSTRUCTION_ERROR)); | ||
270 | dump_stack(); | ||
271 | } | ||
272 | |||
273 | static void vmcs_writel(unsigned long field, unsigned long value) | ||
274 | { | ||
275 | u8 error; | ||
276 | |||
277 | asm volatile (ASM_VMX_VMWRITE_RAX_RDX "; setna %0" | ||
278 | : "=q"(error) : "a"(value), "d"(field) : "cc" ); | ||
279 | if (unlikely(error)) | ||
280 | vmwrite_error(field, value); | ||
281 | } | ||
282 | |||
283 | static void vmcs_write16(unsigned long field, u16 value) | ||
284 | { | ||
285 | vmcs_writel(field, value); | ||
286 | } | ||
287 | |||
288 | static void vmcs_write32(unsigned long field, u32 value) | ||
289 | { | ||
290 | vmcs_writel(field, value); | ||
291 | } | ||
292 | |||
293 | static void vmcs_write64(unsigned long field, u64 value) | ||
294 | { | ||
295 | #ifdef CONFIG_X86_64 | ||
296 | vmcs_writel(field, value); | ||
297 | #else | ||
298 | vmcs_writel(field, value); | ||
299 | asm volatile (""); | ||
300 | vmcs_writel(field+1, value >> 32); | ||
301 | #endif | ||
302 | } | ||
303 | |||
304 | static void vmcs_clear_bits(unsigned long field, u32 mask) | ||
305 | { | ||
306 | vmcs_writel(field, vmcs_readl(field) & ~mask); | ||
307 | } | ||
308 | |||
309 | static void vmcs_set_bits(unsigned long field, u32 mask) | ||
310 | { | ||
311 | vmcs_writel(field, vmcs_readl(field) | mask); | ||
312 | } | ||
313 | |||
314 | static void update_exception_bitmap(struct kvm_vcpu *vcpu) | ||
315 | { | ||
316 | u32 eb; | ||
317 | |||
318 | eb = 1u << PF_VECTOR; | ||
319 | if (!vcpu->fpu_active) | ||
320 | eb |= 1u << NM_VECTOR; | ||
321 | if (vcpu->guest_debug.enabled) | ||
322 | eb |= 1u << 1; | ||
323 | if (vcpu->rmode.active) | ||
324 | eb = ~0; | ||
325 | vmcs_write32(EXCEPTION_BITMAP, eb); | ||
326 | } | ||
327 | |||
328 | static void reload_tss(void) | ||
329 | { | ||
330 | #ifndef CONFIG_X86_64 | ||
331 | |||
332 | /* | ||
333 | * VT restores TR but not its size. Useless. | ||
334 | */ | ||
335 | struct descriptor_table gdt; | ||
336 | struct segment_descriptor *descs; | ||
337 | |||
338 | get_gdt(&gdt); | ||
339 | descs = (void *)gdt.base; | ||
340 | descs[GDT_ENTRY_TSS].type = 9; /* available TSS */ | ||
341 | load_TR_desc(); | ||
342 | #endif | ||
343 | } | ||
344 | |||
345 | static void load_transition_efer(struct vcpu_vmx *vmx) | ||
346 | { | ||
347 | u64 trans_efer; | ||
348 | int efer_offset = vmx->msr_offset_efer; | ||
349 | |||
350 | trans_efer = vmx->host_msrs[efer_offset].data; | ||
351 | trans_efer &= ~EFER_SAVE_RESTORE_BITS; | ||
352 | trans_efer |= msr_efer_save_restore_bits(vmx->guest_msrs[efer_offset]); | ||
353 | wrmsrl(MSR_EFER, trans_efer); | ||
354 | vmx->vcpu.stat.efer_reload++; | ||
355 | } | ||
356 | |||
357 | static void vmx_save_host_state(struct kvm_vcpu *vcpu) | ||
358 | { | ||
359 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
360 | |||
361 | if (vmx->host_state.loaded) | ||
362 | return; | ||
363 | |||
364 | vmx->host_state.loaded = 1; | ||
365 | /* | ||
366 | * Set host fs and gs selectors. Unfortunately, 22.2.3 does not | ||
367 | * allow segment selectors with cpl > 0 or ti == 1. | ||
368 | */ | ||
369 | vmx->host_state.ldt_sel = read_ldt(); | ||
370 | vmx->host_state.gs_ldt_reload_needed = vmx->host_state.ldt_sel; | ||
371 | vmx->host_state.fs_sel = read_fs(); | ||
372 | if (!(vmx->host_state.fs_sel & 7)) { | ||
373 | vmcs_write16(HOST_FS_SELECTOR, vmx->host_state.fs_sel); | ||
374 | vmx->host_state.fs_reload_needed = 0; | ||
375 | } else { | ||
376 | vmcs_write16(HOST_FS_SELECTOR, 0); | ||
377 | vmx->host_state.fs_reload_needed = 1; | ||
378 | } | ||
379 | vmx->host_state.gs_sel = read_gs(); | ||
380 | if (!(vmx->host_state.gs_sel & 7)) | ||
381 | vmcs_write16(HOST_GS_SELECTOR, vmx->host_state.gs_sel); | ||
382 | else { | ||
383 | vmcs_write16(HOST_GS_SELECTOR, 0); | ||
384 | vmx->host_state.gs_ldt_reload_needed = 1; | ||
385 | } | ||
386 | |||
387 | #ifdef CONFIG_X86_64 | ||
388 | vmcs_writel(HOST_FS_BASE, read_msr(MSR_FS_BASE)); | ||
389 | vmcs_writel(HOST_GS_BASE, read_msr(MSR_GS_BASE)); | ||
390 | #else | ||
391 | vmcs_writel(HOST_FS_BASE, segment_base(vmx->host_state.fs_sel)); | ||
392 | vmcs_writel(HOST_GS_BASE, segment_base(vmx->host_state.gs_sel)); | ||
393 | #endif | ||
394 | |||
395 | #ifdef CONFIG_X86_64 | ||
396 | if (is_long_mode(&vmx->vcpu)) { | ||
397 | save_msrs(vmx->host_msrs + | ||
398 | vmx->msr_offset_kernel_gs_base, 1); | ||
399 | } | ||
400 | #endif | ||
401 | load_msrs(vmx->guest_msrs, vmx->save_nmsrs); | ||
402 | if (msr_efer_need_save_restore(vmx)) | ||
403 | load_transition_efer(vmx); | ||
404 | } | ||
405 | |||
406 | static void vmx_load_host_state(struct vcpu_vmx *vmx) | ||
407 | { | ||
408 | unsigned long flags; | ||
409 | |||
410 | if (!vmx->host_state.loaded) | ||
411 | return; | ||
412 | |||
413 | vmx->host_state.loaded = 0; | ||
414 | if (vmx->host_state.fs_reload_needed) | ||
415 | load_fs(vmx->host_state.fs_sel); | ||
416 | if (vmx->host_state.gs_ldt_reload_needed) { | ||
417 | load_ldt(vmx->host_state.ldt_sel); | ||
418 | /* | ||
419 | * If we have to reload gs, we must take care to | ||
420 | * preserve our gs base. | ||
421 | */ | ||
422 | local_irq_save(flags); | ||
423 | load_gs(vmx->host_state.gs_sel); | ||
424 | #ifdef CONFIG_X86_64 | ||
425 | wrmsrl(MSR_GS_BASE, vmcs_readl(HOST_GS_BASE)); | ||
426 | #endif | ||
427 | local_irq_restore(flags); | ||
428 | } | ||
429 | reload_tss(); | ||
430 | save_msrs(vmx->guest_msrs, vmx->save_nmsrs); | ||
431 | load_msrs(vmx->host_msrs, vmx->save_nmsrs); | ||
432 | if (msr_efer_need_save_restore(vmx)) | ||
433 | load_msrs(vmx->host_msrs + vmx->msr_offset_efer, 1); | ||
434 | } | ||
435 | |||
436 | /* | ||
437 | * Switches to specified vcpu, until a matching vcpu_put(), but assumes | ||
438 | * vcpu mutex is already taken. | ||
439 | */ | ||
440 | static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) | ||
441 | { | ||
442 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
443 | u64 phys_addr = __pa(vmx->vmcs); | ||
444 | u64 tsc_this, delta; | ||
445 | |||
446 | if (vcpu->cpu != cpu) { | ||
447 | vcpu_clear(vmx); | ||
448 | kvm_migrate_apic_timer(vcpu); | ||
449 | } | ||
450 | |||
451 | if (per_cpu(current_vmcs, cpu) != vmx->vmcs) { | ||
452 | u8 error; | ||
453 | |||
454 | per_cpu(current_vmcs, cpu) = vmx->vmcs; | ||
455 | asm volatile (ASM_VMX_VMPTRLD_RAX "; setna %0" | ||
456 | : "=g"(error) : "a"(&phys_addr), "m"(phys_addr) | ||
457 | : "cc"); | ||
458 | if (error) | ||
459 | printk(KERN_ERR "kvm: vmptrld %p/%llx fail\n", | ||
460 | vmx->vmcs, phys_addr); | ||
461 | } | ||
462 | |||
463 | if (vcpu->cpu != cpu) { | ||
464 | struct descriptor_table dt; | ||
465 | unsigned long sysenter_esp; | ||
466 | |||
467 | vcpu->cpu = cpu; | ||
468 | /* | ||
469 | * Linux uses per-cpu TSS and GDT, so set these when switching | ||
470 | * processors. | ||
471 | */ | ||
472 | vmcs_writel(HOST_TR_BASE, read_tr_base()); /* 22.2.4 */ | ||
473 | get_gdt(&dt); | ||
474 | vmcs_writel(HOST_GDTR_BASE, dt.base); /* 22.2.4 */ | ||
475 | |||
476 | rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp); | ||
477 | vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */ | ||
478 | |||
479 | /* | ||
480 | * Make sure the time stamp counter is monotonic. | ||
481 | */ | ||
482 | rdtscll(tsc_this); | ||
483 | delta = vcpu->host_tsc - tsc_this; | ||
484 | vmcs_write64(TSC_OFFSET, vmcs_read64(TSC_OFFSET) + delta); | ||
485 | } | ||
486 | } | ||
487 | |||
488 | static void vmx_vcpu_put(struct kvm_vcpu *vcpu) | ||
489 | { | ||
490 | vmx_load_host_state(to_vmx(vcpu)); | ||
491 | kvm_put_guest_fpu(vcpu); | ||
492 | } | ||
493 | |||
494 | static void vmx_fpu_activate(struct kvm_vcpu *vcpu) | ||
495 | { | ||
496 | if (vcpu->fpu_active) | ||
497 | return; | ||
498 | vcpu->fpu_active = 1; | ||
499 | vmcs_clear_bits(GUEST_CR0, X86_CR0_TS); | ||
500 | if (vcpu->cr0 & X86_CR0_TS) | ||
501 | vmcs_set_bits(GUEST_CR0, X86_CR0_TS); | ||
502 | update_exception_bitmap(vcpu); | ||
503 | } | ||
504 | |||
505 | static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu) | ||
506 | { | ||
507 | if (!vcpu->fpu_active) | ||
508 | return; | ||
509 | vcpu->fpu_active = 0; | ||
510 | vmcs_set_bits(GUEST_CR0, X86_CR0_TS); | ||
511 | update_exception_bitmap(vcpu); | ||
512 | } | ||
513 | |||
514 | static void vmx_vcpu_decache(struct kvm_vcpu *vcpu) | ||
515 | { | ||
516 | vcpu_clear(to_vmx(vcpu)); | ||
517 | } | ||
518 | |||
519 | static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) | ||
520 | { | ||
521 | return vmcs_readl(GUEST_RFLAGS); | ||
522 | } | ||
523 | |||
524 | static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) | ||
525 | { | ||
526 | if (vcpu->rmode.active) | ||
527 | rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; | ||
528 | vmcs_writel(GUEST_RFLAGS, rflags); | ||
529 | } | ||
530 | |||
531 | static void skip_emulated_instruction(struct kvm_vcpu *vcpu) | ||
532 | { | ||
533 | unsigned long rip; | ||
534 | u32 interruptibility; | ||
535 | |||
536 | rip = vmcs_readl(GUEST_RIP); | ||
537 | rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN); | ||
538 | vmcs_writel(GUEST_RIP, rip); | ||
539 | |||
540 | /* | ||
541 | * We emulated an instruction, so temporary interrupt blocking | ||
542 | * should be removed, if set. | ||
543 | */ | ||
544 | interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); | ||
545 | if (interruptibility & 3) | ||
546 | vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, | ||
547 | interruptibility & ~3); | ||
548 | vcpu->interrupt_window_open = 1; | ||
549 | } | ||
550 | |||
551 | static void vmx_inject_gp(struct kvm_vcpu *vcpu, unsigned error_code) | ||
552 | { | ||
553 | printk(KERN_DEBUG "inject_general_protection: rip 0x%lx\n", | ||
554 | vmcs_readl(GUEST_RIP)); | ||
555 | vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code); | ||
556 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, | ||
557 | GP_VECTOR | | ||
558 | INTR_TYPE_EXCEPTION | | ||
559 | INTR_INFO_DELIEVER_CODE_MASK | | ||
560 | INTR_INFO_VALID_MASK); | ||
561 | } | ||
562 | |||
563 | /* | ||
564 | * Swap MSR entry in host/guest MSR entry array. | ||
565 | */ | ||
566 | #ifdef CONFIG_X86_64 | ||
567 | static void move_msr_up(struct vcpu_vmx *vmx, int from, int to) | ||
568 | { | ||
569 | struct kvm_msr_entry tmp; | ||
570 | |||
571 | tmp = vmx->guest_msrs[to]; | ||
572 | vmx->guest_msrs[to] = vmx->guest_msrs[from]; | ||
573 | vmx->guest_msrs[from] = tmp; | ||
574 | tmp = vmx->host_msrs[to]; | ||
575 | vmx->host_msrs[to] = vmx->host_msrs[from]; | ||
576 | vmx->host_msrs[from] = tmp; | ||
577 | } | ||
578 | #endif | ||
579 | |||
580 | /* | ||
581 | * Set up the vmcs to automatically save and restore system | ||
582 | * msrs. Don't touch the 64-bit msrs if the guest is in legacy | ||
583 | * mode, as fiddling with msrs is very expensive. | ||
584 | */ | ||
585 | static void setup_msrs(struct vcpu_vmx *vmx) | ||
586 | { | ||
587 | int save_nmsrs; | ||
588 | |||
589 | save_nmsrs = 0; | ||
590 | #ifdef CONFIG_X86_64 | ||
591 | if (is_long_mode(&vmx->vcpu)) { | ||
592 | int index; | ||
593 | |||
594 | index = __find_msr_index(vmx, MSR_SYSCALL_MASK); | ||
595 | if (index >= 0) | ||
596 | move_msr_up(vmx, index, save_nmsrs++); | ||
597 | index = __find_msr_index(vmx, MSR_LSTAR); | ||
598 | if (index >= 0) | ||
599 | move_msr_up(vmx, index, save_nmsrs++); | ||
600 | index = __find_msr_index(vmx, MSR_CSTAR); | ||
601 | if (index >= 0) | ||
602 | move_msr_up(vmx, index, save_nmsrs++); | ||
603 | index = __find_msr_index(vmx, MSR_KERNEL_GS_BASE); | ||
604 | if (index >= 0) | ||
605 | move_msr_up(vmx, index, save_nmsrs++); | ||
606 | /* | ||
607 | * MSR_K6_STAR is only needed on long mode guests, and only | ||
608 | * if efer.sce is enabled. | ||
609 | */ | ||
610 | index = __find_msr_index(vmx, MSR_K6_STAR); | ||
611 | if ((index >= 0) && (vmx->vcpu.shadow_efer & EFER_SCE)) | ||
612 | move_msr_up(vmx, index, save_nmsrs++); | ||
613 | } | ||
614 | #endif | ||
615 | vmx->save_nmsrs = save_nmsrs; | ||
616 | |||
617 | #ifdef CONFIG_X86_64 | ||
618 | vmx->msr_offset_kernel_gs_base = | ||
619 | __find_msr_index(vmx, MSR_KERNEL_GS_BASE); | ||
620 | #endif | ||
621 | vmx->msr_offset_efer = __find_msr_index(vmx, MSR_EFER); | ||
622 | } | ||
623 | |||
624 | /* | ||
625 | * reads and returns guest's timestamp counter "register" | ||
626 | * guest_tsc = host_tsc + tsc_offset -- 21.3 | ||
627 | */ | ||
628 | static u64 guest_read_tsc(void) | ||
629 | { | ||
630 | u64 host_tsc, tsc_offset; | ||
631 | |||
632 | rdtscll(host_tsc); | ||
633 | tsc_offset = vmcs_read64(TSC_OFFSET); | ||
634 | return host_tsc + tsc_offset; | ||
635 | } | ||
636 | |||
637 | /* | ||
638 | * writes 'guest_tsc' into guest's timestamp counter "register" | ||
639 | * guest_tsc = host_tsc + tsc_offset ==> tsc_offset = guest_tsc - host_tsc | ||
640 | */ | ||
641 | static void guest_write_tsc(u64 guest_tsc) | ||
642 | { | ||
643 | u64 host_tsc; | ||
644 | |||
645 | rdtscll(host_tsc); | ||
646 | vmcs_write64(TSC_OFFSET, guest_tsc - host_tsc); | ||
647 | } | ||
648 | |||
649 | /* | ||
650 | * Reads an msr value (of 'msr_index') into 'pdata'. | ||
651 | * Returns 0 on success, non-0 otherwise. | ||
652 | * Assumes vcpu_load() was already called. | ||
653 | */ | ||
654 | static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) | ||
655 | { | ||
656 | u64 data; | ||
657 | struct kvm_msr_entry *msr; | ||
658 | |||
659 | if (!pdata) { | ||
660 | printk(KERN_ERR "BUG: get_msr called with NULL pdata\n"); | ||
661 | return -EINVAL; | ||
662 | } | ||
663 | |||
664 | switch (msr_index) { | ||
665 | #ifdef CONFIG_X86_64 | ||
666 | case MSR_FS_BASE: | ||
667 | data = vmcs_readl(GUEST_FS_BASE); | ||
668 | break; | ||
669 | case MSR_GS_BASE: | ||
670 | data = vmcs_readl(GUEST_GS_BASE); | ||
671 | break; | ||
672 | case MSR_EFER: | ||
673 | return kvm_get_msr_common(vcpu, msr_index, pdata); | ||
674 | #endif | ||
675 | case MSR_IA32_TIME_STAMP_COUNTER: | ||
676 | data = guest_read_tsc(); | ||
677 | break; | ||
678 | case MSR_IA32_SYSENTER_CS: | ||
679 | data = vmcs_read32(GUEST_SYSENTER_CS); | ||
680 | break; | ||
681 | case MSR_IA32_SYSENTER_EIP: | ||
682 | data = vmcs_readl(GUEST_SYSENTER_EIP); | ||
683 | break; | ||
684 | case MSR_IA32_SYSENTER_ESP: | ||
685 | data = vmcs_readl(GUEST_SYSENTER_ESP); | ||
686 | break; | ||
687 | default: | ||
688 | msr = find_msr_entry(to_vmx(vcpu), msr_index); | ||
689 | if (msr) { | ||
690 | data = msr->data; | ||
691 | break; | ||
692 | } | ||
693 | return kvm_get_msr_common(vcpu, msr_index, pdata); | ||
694 | } | ||
695 | |||
696 | *pdata = data; | ||
697 | return 0; | ||
698 | } | ||
699 | |||
700 | /* | ||
701 | * Writes msr value into the appropriate "register". | ||
702 | * Returns 0 on success, non-0 otherwise. | ||
703 | * Assumes vcpu_load() was already called. | ||
704 | */ | ||
705 | static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) | ||
706 | { | ||
707 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
708 | struct kvm_msr_entry *msr; | ||
709 | int ret = 0; | ||
710 | |||
711 | switch (msr_index) { | ||
712 | #ifdef CONFIG_X86_64 | ||
713 | case MSR_EFER: | ||
714 | ret = kvm_set_msr_common(vcpu, msr_index, data); | ||
715 | if (vmx->host_state.loaded) | ||
716 | load_transition_efer(vmx); | ||
717 | break; | ||
718 | case MSR_FS_BASE: | ||
719 | vmcs_writel(GUEST_FS_BASE, data); | ||
720 | break; | ||
721 | case MSR_GS_BASE: | ||
722 | vmcs_writel(GUEST_GS_BASE, data); | ||
723 | break; | ||
724 | #endif | ||
725 | case MSR_IA32_SYSENTER_CS: | ||
726 | vmcs_write32(GUEST_SYSENTER_CS, data); | ||
727 | break; | ||
728 | case MSR_IA32_SYSENTER_EIP: | ||
729 | vmcs_writel(GUEST_SYSENTER_EIP, data); | ||
730 | break; | ||
731 | case MSR_IA32_SYSENTER_ESP: | ||
732 | vmcs_writel(GUEST_SYSENTER_ESP, data); | ||
733 | break; | ||
734 | case MSR_IA32_TIME_STAMP_COUNTER: | ||
735 | guest_write_tsc(data); | ||
736 | break; | ||
737 | default: | ||
738 | msr = find_msr_entry(vmx, msr_index); | ||
739 | if (msr) { | ||
740 | msr->data = data; | ||
741 | if (vmx->host_state.loaded) | ||
742 | load_msrs(vmx->guest_msrs, vmx->save_nmsrs); | ||
743 | break; | ||
744 | } | ||
745 | ret = kvm_set_msr_common(vcpu, msr_index, data); | ||
746 | } | ||
747 | |||
748 | return ret; | ||
749 | } | ||
750 | |||
751 | /* | ||
752 | * Sync the rsp and rip registers into the vcpu structure. This allows | ||
753 | * registers to be accessed by indexing vcpu->regs. | ||
754 | */ | ||
755 | static void vcpu_load_rsp_rip(struct kvm_vcpu *vcpu) | ||
756 | { | ||
757 | vcpu->regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP); | ||
758 | vcpu->rip = vmcs_readl(GUEST_RIP); | ||
759 | } | ||
760 | |||
761 | /* | ||
762 | * Syncs rsp and rip back into the vmcs. Should be called after possible | ||
763 | * modification. | ||
764 | */ | ||
765 | static void vcpu_put_rsp_rip(struct kvm_vcpu *vcpu) | ||
766 | { | ||
767 | vmcs_writel(GUEST_RSP, vcpu->regs[VCPU_REGS_RSP]); | ||
768 | vmcs_writel(GUEST_RIP, vcpu->rip); | ||
769 | } | ||
770 | |||
771 | static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg) | ||
772 | { | ||
773 | unsigned long dr7 = 0x400; | ||
774 | int old_singlestep; | ||
775 | |||
776 | old_singlestep = vcpu->guest_debug.singlestep; | ||
777 | |||
778 | vcpu->guest_debug.enabled = dbg->enabled; | ||
779 | if (vcpu->guest_debug.enabled) { | ||
780 | int i; | ||
781 | |||
782 | dr7 |= 0x200; /* exact */ | ||
783 | for (i = 0; i < 4; ++i) { | ||
784 | if (!dbg->breakpoints[i].enabled) | ||
785 | continue; | ||
786 | vcpu->guest_debug.bp[i] = dbg->breakpoints[i].address; | ||
787 | dr7 |= 2 << (i*2); /* global enable */ | ||
788 | dr7 |= 0 << (i*4+16); /* execution breakpoint */ | ||
789 | } | ||
790 | |||
791 | vcpu->guest_debug.singlestep = dbg->singlestep; | ||
792 | } else | ||
793 | vcpu->guest_debug.singlestep = 0; | ||
794 | |||
795 | if (old_singlestep && !vcpu->guest_debug.singlestep) { | ||
796 | unsigned long flags; | ||
797 | |||
798 | flags = vmcs_readl(GUEST_RFLAGS); | ||
799 | flags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF); | ||
800 | vmcs_writel(GUEST_RFLAGS, flags); | ||
801 | } | ||
802 | |||
803 | update_exception_bitmap(vcpu); | ||
804 | vmcs_writel(GUEST_DR7, dr7); | ||
805 | |||
806 | return 0; | ||
807 | } | ||
808 | |||
809 | static int vmx_get_irq(struct kvm_vcpu *vcpu) | ||
810 | { | ||
811 | u32 idtv_info_field; | ||
812 | |||
813 | idtv_info_field = vmcs_read32(IDT_VECTORING_INFO_FIELD); | ||
814 | if (idtv_info_field & INTR_INFO_VALID_MASK) { | ||
815 | if (is_external_interrupt(idtv_info_field)) | ||
816 | return idtv_info_field & VECTORING_INFO_VECTOR_MASK; | ||
817 | else | ||
818 | printk("pending exception: not handled yet\n"); | ||
819 | } | ||
820 | return -1; | ||
821 | } | ||
822 | |||
823 | static __init int cpu_has_kvm_support(void) | ||
824 | { | ||
825 | unsigned long ecx = cpuid_ecx(1); | ||
826 | return test_bit(5, &ecx); /* CPUID.1:ECX.VMX[bit 5] -> VT */ | ||
827 | } | ||
828 | |||
829 | static __init int vmx_disabled_by_bios(void) | ||
830 | { | ||
831 | u64 msr; | ||
832 | |||
833 | rdmsrl(MSR_IA32_FEATURE_CONTROL, msr); | ||
834 | return (msr & (MSR_IA32_FEATURE_CONTROL_LOCKED | | ||
835 | MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED)) | ||
836 | == MSR_IA32_FEATURE_CONTROL_LOCKED; | ||
837 | /* locked but not enabled */ | ||
838 | } | ||
839 | |||
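cpu_has_kvm_support() tests CPUID.1:ECX bit 5 (VMX), and vmx_disabled_by_bios() reports the case where MSR_IA32_FEATURE_CONTROL has been locked by firmware without the VMXON-enable bit set, i.e. VMX is fenced off until the next reset. A standalone sketch of the same predicate on a raw MSR value; the macro names below are simplified stand-ins for the kernel constants, and reading the MSR itself (ring 0 or /dev/cpu/*/msr) is not shown:

#include <stdint.h>
#include <stdio.h>

#define FEATURE_CONTROL_LOCKED        (1ull << 0)
#define FEATURE_CONTROL_VMXON_ENABLED (1ull << 2)

/* Returns 1 when the BIOS locked the MSR without enabling VMXON: the
 * module can no longer set the enable bit, so VMX is unusable. */
static int vmx_disabled_by_firmware(uint64_t feature_control)
{
	return (feature_control &
		(FEATURE_CONTROL_LOCKED | FEATURE_CONTROL_VMXON_ENABLED))
		== FEATURE_CONTROL_LOCKED;
}

int main(void)
{
	printf("%d\n", vmx_disabled_by_firmware(0x1)); /* locked, no VMXON: 1 */
	printf("%d\n", vmx_disabled_by_firmware(0x5)); /* locked + enabled: 0 */
	printf("%d\n", vmx_disabled_by_firmware(0x0)); /* unlocked: 0, hardware_enable() may set it */
	return 0;
}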
840 | static void hardware_enable(void *garbage) | ||
841 | { | ||
842 | int cpu = raw_smp_processor_id(); | ||
843 | u64 phys_addr = __pa(per_cpu(vmxarea, cpu)); | ||
844 | u64 old; | ||
845 | |||
846 | rdmsrl(MSR_IA32_FEATURE_CONTROL, old); | ||
847 | if ((old & (MSR_IA32_FEATURE_CONTROL_LOCKED | | ||
848 | MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED)) | ||
849 | != (MSR_IA32_FEATURE_CONTROL_LOCKED | | ||
850 | MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED)) | ||
851 | /* enable and lock */ | ||
852 | wrmsrl(MSR_IA32_FEATURE_CONTROL, old | | ||
853 | MSR_IA32_FEATURE_CONTROL_LOCKED | | ||
854 | MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED); | ||
855 | write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */ | ||
856 | asm volatile (ASM_VMX_VMXON_RAX : : "a"(&phys_addr), "m"(phys_addr) | ||
857 | : "memory", "cc"); | ||
858 | } | ||
859 | |||
860 | static void hardware_disable(void *garbage) | ||
861 | { | ||
862 | asm volatile (ASM_VMX_VMXOFF : : : "cc"); | ||
863 | } | ||
864 | |||
865 | static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, | ||
866 | u32 msr, u32* result) | ||
867 | { | ||
868 | u32 vmx_msr_low, vmx_msr_high; | ||
869 | u32 ctl = ctl_min | ctl_opt; | ||
870 | |||
871 | rdmsr(msr, vmx_msr_low, vmx_msr_high); | ||
872 | |||
873 | ctl &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */ | ||
874 | ctl |= vmx_msr_low; /* bit == 1 in low word ==> must be one */ | ||
875 | |||
876 | /* Ensure minimum (required) set of control bits are supported. */ | ||
877 | if (ctl_min & ~ctl) | ||
878 | return -EIO; | ||
879 | |||
880 | *result = ctl; | ||
881 | return 0; | ||
882 | } | ||
883 | |||
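Each VMX capability MSR read here reports two masks: the low 32 bits are controls that must be 1 and the high 32 bits are controls that may be 1. adjust_vmx_controls() therefore clears every optional bit the CPU cannot set, forces the mandatory bits on, and fails only if a required bit ends up cleared. A worked standalone example of the same adjustment, with made-up mask values:

#include <stdint.h>
#include <stdio.h>

/* Same adjustment as above, but taking the two MSR halves directly. */
static int adjust_controls(uint32_t min, uint32_t opt,
			   uint32_t allowed0 /* low dword */,
			   uint32_t allowed1 /* high dword */,
			   uint32_t *result)
{
	uint32_t ctl = min | opt;

	ctl &= allowed1;	/* 0 in the high word ==> must be zero */
	ctl |= allowed0;	/* 1 in the low word  ==> must be one  */

	if (min & ~ctl)		/* a required control is unsupported */
		return -1;

	*result = ctl;
	return 0;
}

int main(void)
{
	uint32_t ctl;

	/* Hypothetical masks: bit 0 mandatory-one, bits 0-3 supported. */
	if (adjust_controls(0x2 /* min */, 0x8 /* opt */,
			    0x1 /* allowed0 */, 0xf /* allowed1 */, &ctl) == 0)
		printf("ctl = %#x\n", ctl);	/* (0x2|0x8) & 0xf | 0x1 = 0xb */
	return 0;
}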
884 | static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) | ||
885 | { | ||
886 | u32 vmx_msr_low, vmx_msr_high; | ||
887 | u32 min, opt; | ||
888 | u32 _pin_based_exec_control = 0; | ||
889 | u32 _cpu_based_exec_control = 0; | ||
890 | u32 _vmexit_control = 0; | ||
891 | u32 _vmentry_control = 0; | ||
892 | |||
893 | min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING; | ||
894 | opt = 0; | ||
895 | if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS, | ||
896 | &_pin_based_exec_control) < 0) | ||
897 | return -EIO; | ||
898 | |||
899 | min = CPU_BASED_HLT_EXITING | | ||
900 | #ifdef CONFIG_X86_64 | ||
901 | CPU_BASED_CR8_LOAD_EXITING | | ||
902 | CPU_BASED_CR8_STORE_EXITING | | ||
903 | #endif | ||
904 | CPU_BASED_USE_IO_BITMAPS | | ||
905 | CPU_BASED_MOV_DR_EXITING | | ||
906 | CPU_BASED_USE_TSC_OFFSETING; | ||
907 | #ifdef CONFIG_X86_64 | ||
908 | opt = CPU_BASED_TPR_SHADOW; | ||
909 | #else | ||
910 | opt = 0; | ||
911 | #endif | ||
912 | if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS, | ||
913 | &_cpu_based_exec_control) < 0) | ||
914 | return -EIO; | ||
915 | #ifdef CONFIG_X86_64 | ||
916 | if ((_cpu_based_exec_control & CPU_BASED_TPR_SHADOW)) | ||
917 | _cpu_based_exec_control &= ~CPU_BASED_CR8_LOAD_EXITING & | ||
918 | ~CPU_BASED_CR8_STORE_EXITING; | ||
919 | #endif | ||
920 | |||
921 | min = 0; | ||
922 | #ifdef CONFIG_X86_64 | ||
923 | min |= VM_EXIT_HOST_ADDR_SPACE_SIZE; | ||
924 | #endif | ||
925 | opt = 0; | ||
926 | if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS, | ||
927 | &_vmexit_control) < 0) | ||
928 | return -EIO; | ||
929 | |||
930 | min = opt = 0; | ||
931 | if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS, | ||
932 | &_vmentry_control) < 0) | ||
933 | return -EIO; | ||
934 | |||
935 | rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high); | ||
936 | |||
937 | /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */ | ||
938 | if ((vmx_msr_high & 0x1fff) > PAGE_SIZE) | ||
939 | return -EIO; | ||
940 | |||
941 | #ifdef CONFIG_X86_64 | ||
942 | /* IA-32 SDM Vol 3B: 64-bit CPUs always have VMX_BASIC_MSR[48]==0. */ | ||
943 | if (vmx_msr_high & (1u<<16)) | ||
944 | return -EIO; | ||
945 | #endif | ||
946 | |||
947 | /* Require Write-Back (WB) memory type for VMCS accesses. */ | ||
948 | if (((vmx_msr_high >> 18) & 15) != 6) | ||
949 | return -EIO; | ||
950 | |||
951 | vmcs_conf->size = vmx_msr_high & 0x1fff; | ||
952 | vmcs_conf->order = get_order(vmcs_config.size); | ||
953 | vmcs_conf->revision_id = vmx_msr_low; | ||
954 | |||
955 | vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control; | ||
956 | vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control; | ||
957 | vmcs_conf->vmexit_ctrl = _vmexit_control; | ||
958 | vmcs_conf->vmentry_ctrl = _vmentry_control; | ||
959 | |||
960 | return 0; | ||
961 | } | ||
962 | |||
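The MSR_IA32_VMX_BASIC checks above decode that MSR's standard field layout: the VMCS region size lives in bits 44:32 (the low 13 bits of the high dword), bit 48 indicates a 32-bit physical-address limit for VMX structures, and bits 53:50 give the required VMCS memory type (6 = write-back). A small sketch of the same field extraction on a sample value; the sample itself is fabricated:

#include <stdint.h>
#include <stdio.h>

struct vmx_basic {
	uint32_t revision_id;	/* bits 30:0  */
	uint32_t vmcs_size;	/* bits 44:32 */
	int	 addr_32bit;	/* bit  48    */
	uint32_t mem_type;	/* bits 53:50, 6 == write-back */
};

static struct vmx_basic decode_vmx_basic(uint64_t msr)
{
	uint32_t lo = (uint32_t)msr, hi = (uint32_t)(msr >> 32);
	struct vmx_basic b = {
		.revision_id = lo & 0x7fffffff,
		.vmcs_size   = hi & 0x1fff,
		.addr_32bit  = !!(hi & (1u << 16)),
		.mem_type    = (hi >> 18) & 15,
	};
	return b;
}

int main(void)
{
	/* made-up value: revision 0x11, 1024-byte VMCS, write-back */
	struct vmx_basic b = decode_vmx_basic(0x0018040000000011ull);

	printf("rev=%#x size=%u 32bit=%d memtype=%u\n",
	       b.revision_id, b.vmcs_size, b.addr_32bit, b.mem_type);
	return 0;
}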
963 | static struct vmcs *alloc_vmcs_cpu(int cpu) | ||
964 | { | ||
965 | int node = cpu_to_node(cpu); | ||
966 | struct page *pages; | ||
967 | struct vmcs *vmcs; | ||
968 | |||
969 | pages = alloc_pages_node(node, GFP_KERNEL, vmcs_config.order); | ||
970 | if (!pages) | ||
971 | return NULL; | ||
972 | vmcs = page_address(pages); | ||
973 | memset(vmcs, 0, vmcs_config.size); | ||
974 | vmcs->revision_id = vmcs_config.revision_id; /* vmcs revision id */ | ||
975 | return vmcs; | ||
976 | } | ||
977 | |||
978 | static struct vmcs *alloc_vmcs(void) | ||
979 | { | ||
980 | return alloc_vmcs_cpu(raw_smp_processor_id()); | ||
981 | } | ||
982 | |||
983 | static void free_vmcs(struct vmcs *vmcs) | ||
984 | { | ||
985 | free_pages((unsigned long)vmcs, vmcs_config.order); | ||
986 | } | ||
987 | |||
988 | static void free_kvm_area(void) | ||
989 | { | ||
990 | int cpu; | ||
991 | |||
992 | for_each_online_cpu(cpu) | ||
993 | free_vmcs(per_cpu(vmxarea, cpu)); | ||
994 | } | ||
995 | |||
996 | static __init int alloc_kvm_area(void) | ||
997 | { | ||
998 | int cpu; | ||
999 | |||
1000 | for_each_online_cpu(cpu) { | ||
1001 | struct vmcs *vmcs; | ||
1002 | |||
1003 | vmcs = alloc_vmcs_cpu(cpu); | ||
1004 | if (!vmcs) { | ||
1005 | free_kvm_area(); | ||
1006 | return -ENOMEM; | ||
1007 | } | ||
1008 | |||
1009 | per_cpu(vmxarea, cpu) = vmcs; | ||
1010 | } | ||
1011 | return 0; | ||
1012 | } | ||
1013 | |||
1014 | static __init int hardware_setup(void) | ||
1015 | { | ||
1016 | if (setup_vmcs_config(&vmcs_config) < 0) | ||
1017 | return -EIO; | ||
1018 | return alloc_kvm_area(); | ||
1019 | } | ||
1020 | |||
1021 | static __exit void hardware_unsetup(void) | ||
1022 | { | ||
1023 | free_kvm_area(); | ||
1024 | } | ||
1025 | |||
1026 | static void fix_pmode_dataseg(int seg, struct kvm_save_segment *save) | ||
1027 | { | ||
1028 | struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; | ||
1029 | |||
1030 | if (vmcs_readl(sf->base) == save->base && (save->base & AR_S_MASK)) { | ||
1031 | vmcs_write16(sf->selector, save->selector); | ||
1032 | vmcs_writel(sf->base, save->base); | ||
1033 | vmcs_write32(sf->limit, save->limit); | ||
1034 | vmcs_write32(sf->ar_bytes, save->ar); | ||
1035 | } else { | ||
1036 | u32 dpl = (vmcs_read16(sf->selector) & SELECTOR_RPL_MASK) | ||
1037 | << AR_DPL_SHIFT; | ||
1038 | vmcs_write32(sf->ar_bytes, 0x93 | dpl); | ||
1039 | } | ||
1040 | } | ||
1041 | |||
1042 | static void enter_pmode(struct kvm_vcpu *vcpu) | ||
1043 | { | ||
1044 | unsigned long flags; | ||
1045 | |||
1046 | vcpu->rmode.active = 0; | ||
1047 | |||
1048 | vmcs_writel(GUEST_TR_BASE, vcpu->rmode.tr.base); | ||
1049 | vmcs_write32(GUEST_TR_LIMIT, vcpu->rmode.tr.limit); | ||
1050 | vmcs_write32(GUEST_TR_AR_BYTES, vcpu->rmode.tr.ar); | ||
1051 | |||
1052 | flags = vmcs_readl(GUEST_RFLAGS); | ||
1053 | flags &= ~(X86_EFLAGS_IOPL | X86_EFLAGS_VM); | ||
1054 | flags |= (vcpu->rmode.save_iopl << IOPL_SHIFT); | ||
1055 | vmcs_writel(GUEST_RFLAGS, flags); | ||
1056 | |||
1057 | vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) | | ||
1058 | (vmcs_readl(CR4_READ_SHADOW) & X86_CR4_VME)); | ||
1059 | |||
1060 | update_exception_bitmap(vcpu); | ||
1061 | |||
1062 | fix_pmode_dataseg(VCPU_SREG_ES, &vcpu->rmode.es); | ||
1063 | fix_pmode_dataseg(VCPU_SREG_DS, &vcpu->rmode.ds); | ||
1064 | fix_pmode_dataseg(VCPU_SREG_GS, &vcpu->rmode.gs); | ||
1065 | fix_pmode_dataseg(VCPU_SREG_FS, &vcpu->rmode.fs); | ||
1066 | |||
1067 | vmcs_write16(GUEST_SS_SELECTOR, 0); | ||
1068 | vmcs_write32(GUEST_SS_AR_BYTES, 0x93); | ||
1069 | |||
1070 | vmcs_write16(GUEST_CS_SELECTOR, | ||
1071 | vmcs_read16(GUEST_CS_SELECTOR) & ~SELECTOR_RPL_MASK); | ||
1072 | vmcs_write32(GUEST_CS_AR_BYTES, 0x9b); | ||
1073 | } | ||
1074 | |||
1075 | static gva_t rmode_tss_base(struct kvm* kvm) | ||
1076 | { | ||
1077 | gfn_t base_gfn = kvm->memslots[0].base_gfn + kvm->memslots[0].npages - 3; | ||
1078 | return base_gfn << PAGE_SHIFT; | ||
1079 | } | ||
1080 | |||
1081 | static void fix_rmode_seg(int seg, struct kvm_save_segment *save) | ||
1082 | { | ||
1083 | struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; | ||
1084 | |||
1085 | save->selector = vmcs_read16(sf->selector); | ||
1086 | save->base = vmcs_readl(sf->base); | ||
1087 | save->limit = vmcs_read32(sf->limit); | ||
1088 | save->ar = vmcs_read32(sf->ar_bytes); | ||
1089 | vmcs_write16(sf->selector, vmcs_readl(sf->base) >> 4); | ||
1090 | vmcs_write32(sf->limit, 0xffff); | ||
1091 | vmcs_write32(sf->ar_bytes, 0xf3); | ||
1092 | } | ||
1093 | |||
1094 | static void enter_rmode(struct kvm_vcpu *vcpu) | ||
1095 | { | ||
1096 | unsigned long flags; | ||
1097 | |||
1098 | vcpu->rmode.active = 1; | ||
1099 | |||
1100 | vcpu->rmode.tr.base = vmcs_readl(GUEST_TR_BASE); | ||
1101 | vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm)); | ||
1102 | |||
1103 | vcpu->rmode.tr.limit = vmcs_read32(GUEST_TR_LIMIT); | ||
1104 | vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1); | ||
1105 | |||
1106 | vcpu->rmode.tr.ar = vmcs_read32(GUEST_TR_AR_BYTES); | ||
1107 | vmcs_write32(GUEST_TR_AR_BYTES, 0x008b); | ||
1108 | |||
1109 | flags = vmcs_readl(GUEST_RFLAGS); | ||
1110 | vcpu->rmode.save_iopl = (flags & X86_EFLAGS_IOPL) >> IOPL_SHIFT; | ||
1111 | |||
1112 | flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; | ||
1113 | |||
1114 | vmcs_writel(GUEST_RFLAGS, flags); | ||
1115 | vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME); | ||
1116 | update_exception_bitmap(vcpu); | ||
1117 | |||
1118 | vmcs_write16(GUEST_SS_SELECTOR, vmcs_readl(GUEST_SS_BASE) >> 4); | ||
1119 | vmcs_write32(GUEST_SS_LIMIT, 0xffff); | ||
1120 | vmcs_write32(GUEST_SS_AR_BYTES, 0xf3); | ||
1121 | |||
1122 | vmcs_write32(GUEST_CS_AR_BYTES, 0xf3); | ||
1123 | vmcs_write32(GUEST_CS_LIMIT, 0xffff); | ||
1124 | if (vmcs_readl(GUEST_CS_BASE) == 0xffff0000) | ||
1125 | vmcs_writel(GUEST_CS_BASE, 0xf0000); | ||
1126 | vmcs_write16(GUEST_CS_SELECTOR, vmcs_readl(GUEST_CS_BASE) >> 4); | ||
1127 | |||
1128 | fix_rmode_seg(VCPU_SREG_ES, &vcpu->rmode.es); | ||
1129 | fix_rmode_seg(VCPU_SREG_DS, &vcpu->rmode.ds); | ||
1130 | fix_rmode_seg(VCPU_SREG_GS, &vcpu->rmode.gs); | ||
1131 | fix_rmode_seg(VCPU_SREG_FS, &vcpu->rmode.fs); | ||
1132 | |||
1133 | kvm_mmu_reset_context(vcpu); | ||
1134 | init_rmode_tss(vcpu->kvm); | ||
1135 | } | ||
1136 | |||
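enter_rmode() fakes real mode by running the guest in vm86 mode: each segment gets base = selector << 4, a 64 KiB limit and a 0xf3 (present, ring-3 data) access-rights byte, so real-mode seg:offset addressing keeps producing the right linear addresses. A tiny sketch of that address calculation:

#include <stdint.h>
#include <stdio.h>

/* Real-mode (and vm86) linear address: the segment base is selector << 4. */
static uint32_t real_mode_linear(uint16_t seg, uint16_t off)
{
	return ((uint32_t)seg << 4) + off;
}

int main(void)
{
	/* The reset vector f000:fff0 lands at 0xffff0, matching the
	 * GUEST_CS_BASE/GUEST_RIP values programmed in vmx_vcpu_setup(). */
	printf("%#x\n", real_mode_linear(0xf000, 0xfff0));
	return 0;
}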
1137 | #ifdef CONFIG_X86_64 | ||
1138 | |||
1139 | static void enter_lmode(struct kvm_vcpu *vcpu) | ||
1140 | { | ||
1141 | u32 guest_tr_ar; | ||
1142 | |||
1143 | guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES); | ||
1144 | if ((guest_tr_ar & AR_TYPE_MASK) != AR_TYPE_BUSY_64_TSS) { | ||
1145 | printk(KERN_DEBUG "%s: tss fixup for long mode\n", | ||
1146 | __FUNCTION__); | ||
1147 | vmcs_write32(GUEST_TR_AR_BYTES, | ||
1148 | (guest_tr_ar & ~AR_TYPE_MASK) | ||
1149 | | AR_TYPE_BUSY_64_TSS); | ||
1150 | } | ||
1151 | |||
1152 | vcpu->shadow_efer |= EFER_LMA; | ||
1153 | |||
1154 | find_msr_entry(to_vmx(vcpu), MSR_EFER)->data |= EFER_LMA | EFER_LME; | ||
1155 | vmcs_write32(VM_ENTRY_CONTROLS, | ||
1156 | vmcs_read32(VM_ENTRY_CONTROLS) | ||
1157 | | VM_ENTRY_IA32E_MODE); | ||
1158 | } | ||
1159 | |||
1160 | static void exit_lmode(struct kvm_vcpu *vcpu) | ||
1161 | { | ||
1162 | vcpu->shadow_efer &= ~EFER_LMA; | ||
1163 | |||
1164 | vmcs_write32(VM_ENTRY_CONTROLS, | ||
1165 | vmcs_read32(VM_ENTRY_CONTROLS) | ||
1166 | & ~VM_ENTRY_IA32E_MODE); | ||
1167 | } | ||
1168 | |||
1169 | #endif | ||
1170 | |||
1171 | static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) | ||
1172 | { | ||
1173 | vcpu->cr4 &= KVM_GUEST_CR4_MASK; | ||
1174 | vcpu->cr4 |= vmcs_readl(GUEST_CR4) & ~KVM_GUEST_CR4_MASK; | ||
1175 | } | ||
1176 | |||
1177 | static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) | ||
1178 | { | ||
1179 | vmx_fpu_deactivate(vcpu); | ||
1180 | |||
1181 | if (vcpu->rmode.active && (cr0 & X86_CR0_PE)) | ||
1182 | enter_pmode(vcpu); | ||
1183 | |||
1184 | if (!vcpu->rmode.active && !(cr0 & X86_CR0_PE)) | ||
1185 | enter_rmode(vcpu); | ||
1186 | |||
1187 | #ifdef CONFIG_X86_64 | ||
1188 | if (vcpu->shadow_efer & EFER_LME) { | ||
1189 | if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) | ||
1190 | enter_lmode(vcpu); | ||
1191 | if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) | ||
1192 | exit_lmode(vcpu); | ||
1193 | } | ||
1194 | #endif | ||
1195 | |||
1196 | vmcs_writel(CR0_READ_SHADOW, cr0); | ||
1197 | vmcs_writel(GUEST_CR0, | ||
1198 | (cr0 & ~KVM_GUEST_CR0_MASK) | KVM_VM_CR0_ALWAYS_ON); | ||
1199 | vcpu->cr0 = cr0; | ||
1200 | |||
1201 | if (!(cr0 & X86_CR0_TS) || !(cr0 & X86_CR0_PE)) | ||
1202 | vmx_fpu_activate(vcpu); | ||
1203 | } | ||
1204 | |||
1205 | static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) | ||
1206 | { | ||
1207 | vmcs_writel(GUEST_CR3, cr3); | ||
1208 | if (vcpu->cr0 & X86_CR0_PE) | ||
1209 | vmx_fpu_deactivate(vcpu); | ||
1210 | } | ||
1211 | |||
1212 | static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) | ||
1213 | { | ||
1214 | vmcs_writel(CR4_READ_SHADOW, cr4); | ||
1215 | vmcs_writel(GUEST_CR4, cr4 | (vcpu->rmode.active ? | ||
1216 | KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON)); | ||
1217 | vcpu->cr4 = cr4; | ||
1218 | } | ||
1219 | |||
1220 | #ifdef CONFIG_X86_64 | ||
1221 | |||
1222 | static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) | ||
1223 | { | ||
1224 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
1225 | struct kvm_msr_entry *msr = find_msr_entry(vmx, MSR_EFER); | ||
1226 | |||
1227 | vcpu->shadow_efer = efer; | ||
1228 | if (efer & EFER_LMA) { | ||
1229 | vmcs_write32(VM_ENTRY_CONTROLS, | ||
1230 | vmcs_read32(VM_ENTRY_CONTROLS) | | ||
1231 | VM_ENTRY_IA32E_MODE); | ||
1232 | msr->data = efer; | ||
1233 | |||
1234 | } else { | ||
1235 | vmcs_write32(VM_ENTRY_CONTROLS, | ||
1236 | vmcs_read32(VM_ENTRY_CONTROLS) & | ||
1237 | ~VM_ENTRY_IA32E_MODE); | ||
1238 | |||
1239 | msr->data = efer & ~EFER_LME; | ||
1240 | } | ||
1241 | setup_msrs(vmx); | ||
1242 | } | ||
1243 | |||
1244 | #endif | ||
1245 | |||
1246 | static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg) | ||
1247 | { | ||
1248 | struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; | ||
1249 | |||
1250 | return vmcs_readl(sf->base); | ||
1251 | } | ||
1252 | |||
1253 | static void vmx_get_segment(struct kvm_vcpu *vcpu, | ||
1254 | struct kvm_segment *var, int seg) | ||
1255 | { | ||
1256 | struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; | ||
1257 | u32 ar; | ||
1258 | |||
1259 | var->base = vmcs_readl(sf->base); | ||
1260 | var->limit = vmcs_read32(sf->limit); | ||
1261 | var->selector = vmcs_read16(sf->selector); | ||
1262 | ar = vmcs_read32(sf->ar_bytes); | ||
1263 | if (ar & AR_UNUSABLE_MASK) | ||
1264 | ar = 0; | ||
1265 | var->type = ar & 15; | ||
1266 | var->s = (ar >> 4) & 1; | ||
1267 | var->dpl = (ar >> 5) & 3; | ||
1268 | var->present = (ar >> 7) & 1; | ||
1269 | var->avl = (ar >> 12) & 1; | ||
1270 | var->l = (ar >> 13) & 1; | ||
1271 | var->db = (ar >> 14) & 1; | ||
1272 | var->g = (ar >> 15) & 1; | ||
1273 | var->unusable = (ar >> 16) & 1; | ||
1274 | } | ||
1275 | |||
1276 | static u32 vmx_segment_access_rights(struct kvm_segment *var) | ||
1277 | { | ||
1278 | u32 ar; | ||
1279 | |||
1280 | if (var->unusable) | ||
1281 | ar = 1 << 16; | ||
1282 | else { | ||
1283 | ar = var->type & 15; | ||
1284 | ar |= (var->s & 1) << 4; | ||
1285 | ar |= (var->dpl & 3) << 5; | ||
1286 | ar |= (var->present & 1) << 7; | ||
1287 | ar |= (var->avl & 1) << 12; | ||
1288 | ar |= (var->l & 1) << 13; | ||
1289 | ar |= (var->db & 1) << 14; | ||
1290 | ar |= (var->g & 1) << 15; | ||
1291 | } | ||
1292 | if (ar == 0) /* a 0 value means unusable */ | ||
1293 | ar = AR_UNUSABLE_MASK; | ||
1294 | |||
1295 | return ar; | ||
1296 | } | ||
1297 | |||
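vmx_segment_access_rights() packs the kvm_segment flags into the VMCS access-rights layout: type in bits 3:0, S at bit 4, DPL at 6:5, P at 7, AVL at 12, L at 13, D/B at 14, G at 15, with bit 16 marking an unusable segment. A quick standalone check of the packing for a flat 32-bit code segment, which comes out as the familiar 0xc09b (the ar == 0 corner case handled above is omitted here):

#include <stdint.h>
#include <stdio.h>

struct seg {
	unsigned type, s, dpl, present, avl, l, db, g, unusable;
};

static uint32_t pack_ar(const struct seg *v)
{
	uint32_t ar;

	if (v->unusable)
		return 1u << 16;
	ar  = v->type & 15;
	ar |= (v->s & 1) << 4;
	ar |= (v->dpl & 3) << 5;
	ar |= (v->present & 1) << 7;
	ar |= (v->avl & 1) << 12;
	ar |= (v->l & 1) << 13;
	ar |= (v->db & 1) << 14;
	ar |= (v->g & 1) << 15;
	return ar;
}

int main(void)
{
	/* flat 32-bit ring-0 code segment: type=0xb, S=1, P=1, D/B=1, G=1 */
	struct seg cs = { .type = 0xb, .s = 1, .dpl = 0, .present = 1,
			  .avl = 0, .l = 0, .db = 1, .g = 1, .unusable = 0 };

	printf("ar = %#x\n", pack_ar(&cs));	/* 0xc09b */
	return 0;
}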
1298 | static void vmx_set_segment(struct kvm_vcpu *vcpu, | ||
1299 | struct kvm_segment *var, int seg) | ||
1300 | { | ||
1301 | struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; | ||
1302 | u32 ar; | ||
1303 | |||
1304 | if (vcpu->rmode.active && seg == VCPU_SREG_TR) { | ||
1305 | vcpu->rmode.tr.selector = var->selector; | ||
1306 | vcpu->rmode.tr.base = var->base; | ||
1307 | vcpu->rmode.tr.limit = var->limit; | ||
1308 | vcpu->rmode.tr.ar = vmx_segment_access_rights(var); | ||
1309 | return; | ||
1310 | } | ||
1311 | vmcs_writel(sf->base, var->base); | ||
1312 | vmcs_write32(sf->limit, var->limit); | ||
1313 | vmcs_write16(sf->selector, var->selector); | ||
1314 | if (vcpu->rmode.active && var->s) { | ||
1315 | /* | ||
1316 | * Hack real-mode segments into vm86 compatibility. | ||
1317 | */ | ||
1318 | if (var->base == 0xffff0000 && var->selector == 0xf000) | ||
1319 | vmcs_writel(sf->base, 0xf0000); | ||
1320 | ar = 0xf3; | ||
1321 | } else | ||
1322 | ar = vmx_segment_access_rights(var); | ||
1323 | vmcs_write32(sf->ar_bytes, ar); | ||
1324 | } | ||
1325 | |||
1326 | static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) | ||
1327 | { | ||
1328 | u32 ar = vmcs_read32(GUEST_CS_AR_BYTES); | ||
1329 | |||
1330 | *db = (ar >> 14) & 1; | ||
1331 | *l = (ar >> 13) & 1; | ||
1332 | } | ||
1333 | |||
1334 | static void vmx_get_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) | ||
1335 | { | ||
1336 | dt->limit = vmcs_read32(GUEST_IDTR_LIMIT); | ||
1337 | dt->base = vmcs_readl(GUEST_IDTR_BASE); | ||
1338 | } | ||
1339 | |||
1340 | static void vmx_set_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) | ||
1341 | { | ||
1342 | vmcs_write32(GUEST_IDTR_LIMIT, dt->limit); | ||
1343 | vmcs_writel(GUEST_IDTR_BASE, dt->base); | ||
1344 | } | ||
1345 | |||
1346 | static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) | ||
1347 | { | ||
1348 | dt->limit = vmcs_read32(GUEST_GDTR_LIMIT); | ||
1349 | dt->base = vmcs_readl(GUEST_GDTR_BASE); | ||
1350 | } | ||
1351 | |||
1352 | static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) | ||
1353 | { | ||
1354 | vmcs_write32(GUEST_GDTR_LIMIT, dt->limit); | ||
1355 | vmcs_writel(GUEST_GDTR_BASE, dt->base); | ||
1356 | } | ||
1357 | |||
1358 | static int init_rmode_tss(struct kvm* kvm) | ||
1359 | { | ||
1360 | struct page *p1, *p2, *p3; | ||
1361 | gfn_t fn = rmode_tss_base(kvm) >> PAGE_SHIFT; | ||
1362 | char *page; | ||
1363 | |||
1364 | p1 = gfn_to_page(kvm, fn++); | ||
1365 | p2 = gfn_to_page(kvm, fn++); | ||
1366 | p3 = gfn_to_page(kvm, fn); | ||
1367 | |||
1368 | if (!p1 || !p2 || !p3) { | ||
1369 | kvm_printf(kvm,"%s: gfn_to_page failed\n", __FUNCTION__); | ||
1370 | return 0; | ||
1371 | } | ||
1372 | |||
1373 | page = kmap_atomic(p1, KM_USER0); | ||
1374 | clear_page(page); | ||
1375 | *(u16*)(page + 0x66) = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE; | ||
1376 | kunmap_atomic(page, KM_USER0); | ||
1377 | |||
1378 | page = kmap_atomic(p2, KM_USER0); | ||
1379 | clear_page(page); | ||
1380 | kunmap_atomic(page, KM_USER0); | ||
1381 | |||
1382 | page = kmap_atomic(p3, KM_USER0); | ||
1383 | clear_page(page); | ||
1384 | *(page + RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1) = ~0; | ||
1385 | kunmap_atomic(page, KM_USER0); | ||
1386 | |||
1387 | return 1; | ||
1388 | } | ||
1389 | |||
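init_rmode_tss() builds a minimal 32-bit TSS in the top three pages of the first memory slot: the word at offset 0x66 is the I/O-map base, so pointing it just past the TSS body plus interrupt-redirection bitmap makes the I/O permission bitmap follow immediately, and the final all-ones byte terminates that bitmap as the architecture requires. A sketch of the same layout built in an ordinary buffer; the size constants are assumed to match the kernel's TSS_BASE_SIZE/TSS_REDIRECTION_SIZE/RMODE_TSS_SIZE definitions:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Assumed to mirror the kernel's emulated real-mode TSS sizing. */
#define TSS_BASE_SIZE		0x68
#define TSS_REDIRECTION_SIZE	(256 / 8)	/* int-redirection bitmap */
#define TSS_IOPB_SIZE		(65536 / 8)	/* one bit per I/O port   */
#define RMODE_TSS_SIZE		(TSS_BASE_SIZE + TSS_REDIRECTION_SIZE + \
				 TSS_IOPB_SIZE + 1)

int main(void)
{
	static uint8_t tss[RMODE_TSS_SIZE];

	memset(tss, 0, sizeof(tss));
	/* Word at offset 0x66 is the I/O-map base: place the bitmap right
	 * after the TSS body and the interrupt-redirection bitmap. */
	*(uint16_t *)(tss + 0x66) = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE;
	/* The I/O bitmap must be terminated by a byte of all ones. */
	tss[RMODE_TSS_SIZE - 1] = 0xff;

	printf("tss size %zu, iomap base %u\n",
	       sizeof(tss), (unsigned)*(uint16_t *)(tss + 0x66));
	return 0;
}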
1390 | static void seg_setup(int seg) | ||
1391 | { | ||
1392 | struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; | ||
1393 | |||
1394 | vmcs_write16(sf->selector, 0); | ||
1395 | vmcs_writel(sf->base, 0); | ||
1396 | vmcs_write32(sf->limit, 0xffff); | ||
1397 | vmcs_write32(sf->ar_bytes, 0x93); | ||
1398 | } | ||
1399 | |||
1400 | /* | ||
1401 | * Sets up the vmcs for emulated real mode. | ||
1402 | */ | ||
1403 | static int vmx_vcpu_setup(struct vcpu_vmx *vmx) | ||
1404 | { | ||
1405 | u32 host_sysenter_cs; | ||
1406 | u32 junk; | ||
1407 | unsigned long a; | ||
1408 | struct descriptor_table dt; | ||
1409 | int i; | ||
1410 | int ret = 0; | ||
1411 | unsigned long kvm_vmx_return; | ||
1412 | u64 msr; | ||
1413 | u32 exec_control; | ||
1414 | |||
1415 | if (!init_rmode_tss(vmx->vcpu.kvm)) { | ||
1416 | ret = -ENOMEM; | ||
1417 | goto out; | ||
1418 | } | ||
1419 | |||
1420 | vmx->vcpu.rmode.active = 0; | ||
1421 | |||
1422 | vmx->vcpu.regs[VCPU_REGS_RDX] = get_rdx_init_val(); | ||
1423 | set_cr8(&vmx->vcpu, 0); | ||
1424 | msr = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; | ||
1425 | if (vmx->vcpu.vcpu_id == 0) | ||
1426 | msr |= MSR_IA32_APICBASE_BSP; | ||
1427 | kvm_set_apic_base(&vmx->vcpu, msr); | ||
1428 | |||
1429 | fx_init(&vmx->vcpu); | ||
1430 | |||
1431 | /* | ||
1432 | * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode | ||
1433 | * insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4. Sigh. | ||
1434 | */ | ||
1435 | if (vmx->vcpu.vcpu_id == 0) { | ||
1436 | vmcs_write16(GUEST_CS_SELECTOR, 0xf000); | ||
1437 | vmcs_writel(GUEST_CS_BASE, 0x000f0000); | ||
1438 | } else { | ||
1439 | vmcs_write16(GUEST_CS_SELECTOR, vmx->vcpu.sipi_vector << 8); | ||
1440 | vmcs_writel(GUEST_CS_BASE, vmx->vcpu.sipi_vector << 12); | ||
1441 | } | ||
1442 | vmcs_write32(GUEST_CS_LIMIT, 0xffff); | ||
1443 | vmcs_write32(GUEST_CS_AR_BYTES, 0x9b); | ||
1444 | |||
1445 | seg_setup(VCPU_SREG_DS); | ||
1446 | seg_setup(VCPU_SREG_ES); | ||
1447 | seg_setup(VCPU_SREG_FS); | ||
1448 | seg_setup(VCPU_SREG_GS); | ||
1449 | seg_setup(VCPU_SREG_SS); | ||
1450 | |||
1451 | vmcs_write16(GUEST_TR_SELECTOR, 0); | ||
1452 | vmcs_writel(GUEST_TR_BASE, 0); | ||
1453 | vmcs_write32(GUEST_TR_LIMIT, 0xffff); | ||
1454 | vmcs_write32(GUEST_TR_AR_BYTES, 0x008b); | ||
1455 | |||
1456 | vmcs_write16(GUEST_LDTR_SELECTOR, 0); | ||
1457 | vmcs_writel(GUEST_LDTR_BASE, 0); | ||
1458 | vmcs_write32(GUEST_LDTR_LIMIT, 0xffff); | ||
1459 | vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082); | ||
1460 | |||
1461 | vmcs_write32(GUEST_SYSENTER_CS, 0); | ||
1462 | vmcs_writel(GUEST_SYSENTER_ESP, 0); | ||
1463 | vmcs_writel(GUEST_SYSENTER_EIP, 0); | ||
1464 | |||
1465 | vmcs_writel(GUEST_RFLAGS, 0x02); | ||
1466 | if (vmx->vcpu.vcpu_id == 0) | ||
1467 | vmcs_writel(GUEST_RIP, 0xfff0); | ||
1468 | else | ||
1469 | vmcs_writel(GUEST_RIP, 0); | ||
1470 | vmcs_writel(GUEST_RSP, 0); | ||
1471 | |||
1472 | /* TODO: dr0 = dr1 = dr2 = dr3 = 0; dr6 = 0xffff0ff0 */ | ||
1473 | vmcs_writel(GUEST_DR7, 0x400); | ||
1474 | |||
1475 | vmcs_writel(GUEST_GDTR_BASE, 0); | ||
1476 | vmcs_write32(GUEST_GDTR_LIMIT, 0xffff); | ||
1477 | |||
1478 | vmcs_writel(GUEST_IDTR_BASE, 0); | ||
1479 | vmcs_write32(GUEST_IDTR_LIMIT, 0xffff); | ||
1480 | |||
1481 | vmcs_write32(GUEST_ACTIVITY_STATE, 0); | ||
1482 | vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0); | ||
1483 | vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0); | ||
1484 | |||
1485 | /* I/O */ | ||
1486 | vmcs_write64(IO_BITMAP_A, page_to_phys(vmx_io_bitmap_a)); | ||
1487 | vmcs_write64(IO_BITMAP_B, page_to_phys(vmx_io_bitmap_b)); | ||
1488 | |||
1489 | guest_write_tsc(0); | ||
1490 | |||
1491 | vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */ | ||
1492 | |||
1493 | /* Special registers */ | ||
1494 | vmcs_write64(GUEST_IA32_DEBUGCTL, 0); | ||
1495 | |||
1496 | /* Control */ | ||
1497 | vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, | ||
1498 | vmcs_config.pin_based_exec_ctrl); | ||
1499 | |||
1500 | exec_control = vmcs_config.cpu_based_exec_ctrl; | ||
1501 | if (!vm_need_tpr_shadow(vmx->vcpu.kvm)) { | ||
1502 | exec_control &= ~CPU_BASED_TPR_SHADOW; | ||
1503 | #ifdef CONFIG_X86_64 | ||
1504 | exec_control |= CPU_BASED_CR8_STORE_EXITING | | ||
1505 | CPU_BASED_CR8_LOAD_EXITING; | ||
1506 | #endif | ||
1507 | } | ||
1508 | vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control); | ||
1509 | |||
1510 | vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0); | ||
1511 | vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0); | ||
1512 | vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */ | ||
1513 | |||
1514 | vmcs_writel(HOST_CR0, read_cr0()); /* 22.2.3 */ | ||
1515 | vmcs_writel(HOST_CR4, read_cr4()); /* 22.2.3, 22.2.5 */ | ||
1516 | vmcs_writel(HOST_CR3, read_cr3()); /* 22.2.3 FIXME: shadow tables */ | ||
1517 | |||
1518 | vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */ | ||
1519 | vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ | ||
1520 | vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */ | ||
1521 | vmcs_write16(HOST_FS_SELECTOR, read_fs()); /* 22.2.4 */ | ||
1522 | vmcs_write16(HOST_GS_SELECTOR, read_gs()); /* 22.2.4 */ | ||
1523 | vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ | ||
1524 | #ifdef CONFIG_X86_64 | ||
1525 | rdmsrl(MSR_FS_BASE, a); | ||
1526 | vmcs_writel(HOST_FS_BASE, a); /* 22.2.4 */ | ||
1527 | rdmsrl(MSR_GS_BASE, a); | ||
1528 | vmcs_writel(HOST_GS_BASE, a); /* 22.2.4 */ | ||
1529 | #else | ||
1530 | vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */ | ||
1531 | vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */ | ||
1532 | #endif | ||
1533 | |||
1534 | vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */ | ||
1535 | |||
1536 | get_idt(&dt); | ||
1537 | vmcs_writel(HOST_IDTR_BASE, dt.base); /* 22.2.4 */ | ||
1538 | |||
1539 | asm ("mov $.Lkvm_vmx_return, %0" : "=r"(kvm_vmx_return)); | ||
1540 | vmcs_writel(HOST_RIP, kvm_vmx_return); /* 22.2.5 */ | ||
1541 | vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0); | ||
1542 | vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0); | ||
1543 | vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0); | ||
1544 | |||
1545 | rdmsr(MSR_IA32_SYSENTER_CS, host_sysenter_cs, junk); | ||
1546 | vmcs_write32(HOST_IA32_SYSENTER_CS, host_sysenter_cs); | ||
1547 | rdmsrl(MSR_IA32_SYSENTER_ESP, a); | ||
1548 | vmcs_writel(HOST_IA32_SYSENTER_ESP, a); /* 22.2.3 */ | ||
1549 | rdmsrl(MSR_IA32_SYSENTER_EIP, a); | ||
1550 | vmcs_writel(HOST_IA32_SYSENTER_EIP, a); /* 22.2.3 */ | ||
1551 | |||
1552 | for (i = 0; i < NR_VMX_MSR; ++i) { | ||
1553 | u32 index = vmx_msr_index[i]; | ||
1554 | u32 data_low, data_high; | ||
1555 | u64 data; | ||
1556 | int j = vmx->nmsrs; | ||
1557 | |||
1558 | if (rdmsr_safe(index, &data_low, &data_high) < 0) | ||
1559 | continue; | ||
1560 | if (wrmsr_safe(index, data_low, data_high) < 0) | ||
1561 | continue; | ||
1562 | data = data_low | ((u64)data_high << 32); | ||
1563 | vmx->host_msrs[j].index = index; | ||
1564 | vmx->host_msrs[j].reserved = 0; | ||
1565 | vmx->host_msrs[j].data = data; | ||
1566 | vmx->guest_msrs[j] = vmx->host_msrs[j]; | ||
1567 | ++vmx->nmsrs; | ||
1568 | } | ||
1569 | |||
1570 | setup_msrs(vmx); | ||
1571 | |||
1572 | vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl); | ||
1573 | |||
1574 | /* 22.2.1, 20.8.1 */ | ||
1575 | vmcs_write32(VM_ENTRY_CONTROLS, vmcs_config.vmentry_ctrl); | ||
1576 | |||
1577 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); /* 22.2.1 */ | ||
1578 | |||
1579 | #ifdef CONFIG_X86_64 | ||
1580 | vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0); | ||
1581 | if (vm_need_tpr_shadow(vmx->vcpu.kvm)) | ||
1582 | vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, | ||
1583 | page_to_phys(vmx->vcpu.apic->regs_page)); | ||
1584 | vmcs_write32(TPR_THRESHOLD, 0); | ||
1585 | #endif | ||
1586 | |||
1587 | vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL); | ||
1588 | vmcs_writel(CR4_GUEST_HOST_MASK, KVM_GUEST_CR4_MASK); | ||
1589 | |||
1590 | vmx->vcpu.cr0 = 0x60000010; | ||
1591 | vmx_set_cr0(&vmx->vcpu, vmx->vcpu.cr0); // enter rmode | ||
1592 | vmx_set_cr4(&vmx->vcpu, 0); | ||
1593 | #ifdef CONFIG_X86_64 | ||
1594 | vmx_set_efer(&vmx->vcpu, 0); | ||
1595 | #endif | ||
1596 | vmx_fpu_activate(&vmx->vcpu); | ||
1597 | update_exception_bitmap(&vmx->vcpu); | ||
1598 | |||
1599 | return 0; | ||
1600 | |||
1601 | out: | ||
1602 | return ret; | ||
1603 | } | ||
1604 | |||
1605 | static void vmx_vcpu_reset(struct kvm_vcpu *vcpu) | ||
1606 | { | ||
1607 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
1608 | |||
1609 | vmx_vcpu_setup(vmx); | ||
1610 | } | ||
1611 | |||
1612 | static void inject_rmode_irq(struct kvm_vcpu *vcpu, int irq) | ||
1613 | { | ||
1614 | u16 ent[2]; | ||
1615 | u16 cs; | ||
1616 | u16 ip; | ||
1617 | unsigned long flags; | ||
1618 | unsigned long ss_base = vmcs_readl(GUEST_SS_BASE); | ||
1619 | u16 sp = vmcs_readl(GUEST_RSP); | ||
1620 | u32 ss_limit = vmcs_read32(GUEST_SS_LIMIT); | ||
1621 | |||
1622 | if (sp > ss_limit || sp < 6 ) { | ||
1623 | vcpu_printf(vcpu, "%s: #SS, rsp 0x%lx ss 0x%lx limit 0x%x\n", | ||
1624 | __FUNCTION__, | ||
1625 | vmcs_readl(GUEST_RSP), | ||
1626 | vmcs_readl(GUEST_SS_BASE), | ||
1627 | vmcs_read32(GUEST_SS_LIMIT)); | ||
1628 | return; | ||
1629 | } | ||
1630 | |||
1631 | if (emulator_read_std(irq * sizeof(ent), &ent, sizeof(ent), vcpu) != | ||
1632 | X86EMUL_CONTINUE) { | ||
1633 | vcpu_printf(vcpu, "%s: read guest err\n", __FUNCTION__); | ||
1634 | return; | ||
1635 | } | ||
1636 | |||
1637 | flags = vmcs_readl(GUEST_RFLAGS); | ||
1638 | cs = vmcs_readl(GUEST_CS_BASE) >> 4; | ||
1639 | ip = vmcs_readl(GUEST_RIP); | ||
1640 | |||
1641 | |||
1642 | if (emulator_write_emulated(ss_base + sp - 2, &flags, 2, vcpu) != X86EMUL_CONTINUE || | ||
1643 | emulator_write_emulated(ss_base + sp - 4, &cs, 2, vcpu) != X86EMUL_CONTINUE || | ||
1644 | emulator_write_emulated(ss_base + sp - 6, &ip, 2, vcpu) != X86EMUL_CONTINUE) { | ||
1645 | vcpu_printf(vcpu, "%s: write guest err\n", __FUNCTION__); | ||
1646 | return; | ||
1647 | } | ||
1648 | |||
1649 | vmcs_writel(GUEST_RFLAGS, flags & | ||
1650 | ~(X86_EFLAGS_IF | X86_EFLAGS_AC | X86_EFLAGS_TF)); | ||
1651 | vmcs_write16(GUEST_CS_SELECTOR, ent[1]); | ||
1652 | vmcs_writel(GUEST_CS_BASE, ent[1] << 4); | ||
1653 | vmcs_writel(GUEST_RIP, ent[0]); | ||
1654 | vmcs_writel(GUEST_RSP, (vmcs_readl(GUEST_RSP) & ~0xffff) | (sp - 6)); | ||
1655 | } | ||
1656 | |||
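inject_rmode_irq() reproduces what a real-mode CPU does on an interrupt: fetch the 4-byte IVT entry at linear address vector * 4 (an offset:segment pair), push FLAGS, CS and IP on the guest stack, clear IF, TF and AC, and vector to the handler. A standalone sketch of the same bookkeeping on a toy memory array, with fabricated guest state:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define IF_FLAG 0x0200
#define TF_FLAG 0x0100

static uint8_t mem[0x2000];	/* toy "guest memory" for the sketch */

static void put16(uint32_t addr, uint16_t val) { memcpy(mem + addr, &val, 2); }
static uint16_t get16(uint32_t addr) { uint16_t v; memcpy(&v, mem + addr, 2); return v; }

struct regs { uint16_t cs, ip, sp, flags; uint32_t ss_base; };

/* Mirror of the real-mode injection above: read the IVT entry, push
 * FLAGS/CS/IP, mask IF and TF, jump to the handler. */
static void inject_rmode_irq(struct regs *r, int irq)
{
	uint16_t new_ip = get16(irq * 4);
	uint16_t new_cs = get16(irq * 4 + 2);

	put16(r->ss_base + r->sp - 2, r->flags);
	put16(r->ss_base + r->sp - 4, r->cs);
	put16(r->ss_base + r->sp - 6, r->ip);
	r->sp -= 6;
	r->flags &= ~(IF_FLAG | TF_FLAG);
	r->cs = new_cs;
	r->ip = new_ip;
}

int main(void)
{
	struct regs r = { .cs = 0x0070, .ip = 0x01a3, .sp = 0x0ff0,
			  .flags = 0x0202, .ss_base = 0x1000 };

	put16(8 * 4, 0x0010);		/* fabricated IVT entry for vector 8 */
	put16(8 * 4 + 2, 0x0040);	/* handler at 0040:0010 */

	inject_rmode_irq(&r, 8);
	printf("cs:ip = %04x:%04x sp=%04x flags=%04x\n",
	       r.cs, r.ip, r.sp, r.flags);
	return 0;
}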
1657 | static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq) | ||
1658 | { | ||
1659 | if (vcpu->rmode.active) { | ||
1660 | inject_rmode_irq(vcpu, irq); | ||
1661 | return; | ||
1662 | } | ||
1663 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, | ||
1664 | irq | INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK); | ||
1665 | } | ||
1666 | |||
1667 | static void kvm_do_inject_irq(struct kvm_vcpu *vcpu) | ||
1668 | { | ||
1669 | int word_index = __ffs(vcpu->irq_summary); | ||
1670 | int bit_index = __ffs(vcpu->irq_pending[word_index]); | ||
1671 | int irq = word_index * BITS_PER_LONG + bit_index; | ||
1672 | |||
1673 | clear_bit(bit_index, &vcpu->irq_pending[word_index]); | ||
1674 | if (!vcpu->irq_pending[word_index]) | ||
1675 | clear_bit(word_index, &vcpu->irq_summary); | ||
1676 | vmx_inject_irq(vcpu, irq); | ||
1677 | } | ||
1678 | |||
1679 | |||
1680 | static void do_interrupt_requests(struct kvm_vcpu *vcpu, | ||
1681 | struct kvm_run *kvm_run) | ||
1682 | { | ||
1683 | u32 cpu_based_vm_exec_control; | ||
1684 | |||
1685 | vcpu->interrupt_window_open = | ||
1686 | ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) && | ||
1687 | (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0); | ||
1688 | |||
1689 | if (vcpu->interrupt_window_open && | ||
1690 | vcpu->irq_summary && | ||
1691 | !(vmcs_read32(VM_ENTRY_INTR_INFO_FIELD) & INTR_INFO_VALID_MASK)) | ||
1692 | /* | ||
1693 | * If interrupts are enabled and not blocked by sti or mov ss, inject now. | ||
1694 | */ | ||
1695 | kvm_do_inject_irq(vcpu); | ||
1696 | |||
1697 | cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); | ||
1698 | if (!vcpu->interrupt_window_open && | ||
1699 | (vcpu->irq_summary || kvm_run->request_interrupt_window)) | ||
1700 | /* | ||
1701 | * Interrupts blocked. Wait for unblock. | ||
1702 | */ | ||
1703 | cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING; | ||
1704 | else | ||
1705 | cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING; | ||
1706 | vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); | ||
1707 | } | ||
1708 | |||
1709 | static void kvm_guest_debug_pre(struct kvm_vcpu *vcpu) | ||
1710 | { | ||
1711 | struct kvm_guest_debug *dbg = &vcpu->guest_debug; | ||
1712 | |||
1713 | set_debugreg(dbg->bp[0], 0); | ||
1714 | set_debugreg(dbg->bp[1], 1); | ||
1715 | set_debugreg(dbg->bp[2], 2); | ||
1716 | set_debugreg(dbg->bp[3], 3); | ||
1717 | |||
1718 | if (dbg->singlestep) { | ||
1719 | unsigned long flags; | ||
1720 | |||
1721 | flags = vmcs_readl(GUEST_RFLAGS); | ||
1722 | flags |= X86_EFLAGS_TF | X86_EFLAGS_RF; | ||
1723 | vmcs_writel(GUEST_RFLAGS, flags); | ||
1724 | } | ||
1725 | } | ||
1726 | |||
1727 | static int handle_rmode_exception(struct kvm_vcpu *vcpu, | ||
1728 | int vec, u32 err_code) | ||
1729 | { | ||
1730 | if (!vcpu->rmode.active) | ||
1731 | return 0; | ||
1732 | |||
1733 | /* | ||
1734 | * Instructions with the address-size override prefix (opcode 0x67) | ||
1735 | * cause a #SS fault with error code 0 in VM86 mode. | ||
1736 | */ | ||
1737 | if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) | ||
1738 | if (emulate_instruction(vcpu, NULL, 0, 0) == EMULATE_DONE) | ||
1739 | return 1; | ||
1740 | return 0; | ||
1741 | } | ||
1742 | |||
1743 | static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | ||
1744 | { | ||
1745 | u32 intr_info, error_code; | ||
1746 | unsigned long cr2, rip; | ||
1747 | u32 vect_info; | ||
1748 | enum emulation_result er; | ||
1749 | int r; | ||
1750 | |||
1751 | vect_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); | ||
1752 | intr_info = vmcs_read32(VM_EXIT_INTR_INFO); | ||
1753 | |||
1754 | if ((vect_info & VECTORING_INFO_VALID_MASK) && | ||
1755 | !is_page_fault(intr_info)) { | ||
1756 | printk(KERN_ERR "%s: unexpected, vectoring info 0x%x " | ||
1757 | "intr info 0x%x\n", __FUNCTION__, vect_info, intr_info); | ||
1758 | } | ||
1759 | |||
1760 | if (!irqchip_in_kernel(vcpu->kvm) && is_external_interrupt(vect_info)) { | ||
1761 | int irq = vect_info & VECTORING_INFO_VECTOR_MASK; | ||
1762 | set_bit(irq, vcpu->irq_pending); | ||
1763 | set_bit(irq / BITS_PER_LONG, &vcpu->irq_summary); | ||
1764 | } | ||
1765 | |||
1766 | if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200) /* nmi */ | ||
1767 | return 1; /* already handled by vmx_vcpu_run() */ | ||
1768 | |||
1769 | if (is_no_device(intr_info)) { | ||
1770 | vmx_fpu_activate(vcpu); | ||
1771 | return 1; | ||
1772 | } | ||
1773 | |||
1774 | error_code = 0; | ||
1775 | rip = vmcs_readl(GUEST_RIP); | ||
1776 | if (intr_info & INTR_INFO_DELIEVER_CODE_MASK) | ||
1777 | error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); | ||
1778 | if (is_page_fault(intr_info)) { | ||
1779 | cr2 = vmcs_readl(EXIT_QUALIFICATION); | ||
1780 | |||
1781 | mutex_lock(&vcpu->kvm->lock); | ||
1782 | r = kvm_mmu_page_fault(vcpu, cr2, error_code); | ||
1783 | if (r < 0) { | ||
1784 | mutex_unlock(&vcpu->kvm->lock); | ||
1785 | return r; | ||
1786 | } | ||
1787 | if (!r) { | ||
1788 | mutex_unlock(&vcpu->kvm->lock); | ||
1789 | return 1; | ||
1790 | } | ||
1791 | |||
1792 | er = emulate_instruction(vcpu, kvm_run, cr2, error_code); | ||
1793 | mutex_unlock(&vcpu->kvm->lock); | ||
1794 | |||
1795 | switch (er) { | ||
1796 | case EMULATE_DONE: | ||
1797 | return 1; | ||
1798 | case EMULATE_DO_MMIO: | ||
1799 | ++vcpu->stat.mmio_exits; | ||
1800 | return 0; | ||
1801 | case EMULATE_FAIL: | ||
1802 | kvm_report_emulation_failure(vcpu, "pagetable"); | ||
1803 | break; | ||
1804 | default: | ||
1805 | BUG(); | ||
1806 | } | ||
1807 | } | ||
1808 | |||
1809 | if (vcpu->rmode.active && | ||
1810 | handle_rmode_exception(vcpu, intr_info & INTR_INFO_VECTOR_MASK, | ||
1811 | error_code)) { | ||
1812 | if (vcpu->halt_request) { | ||
1813 | vcpu->halt_request = 0; | ||
1814 | return kvm_emulate_halt(vcpu); | ||
1815 | } | ||
1816 | return 1; | ||
1817 | } | ||
1818 | |||
1819 | if ((intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK)) == (INTR_TYPE_EXCEPTION | 1)) { | ||
1820 | kvm_run->exit_reason = KVM_EXIT_DEBUG; | ||
1821 | return 0; | ||
1822 | } | ||
1823 | kvm_run->exit_reason = KVM_EXIT_EXCEPTION; | ||
1824 | kvm_run->ex.exception = intr_info & INTR_INFO_VECTOR_MASK; | ||
1825 | kvm_run->ex.error_code = error_code; | ||
1826 | return 0; | ||
1827 | } | ||
1828 | |||
1829 | static int handle_external_interrupt(struct kvm_vcpu *vcpu, | ||
1830 | struct kvm_run *kvm_run) | ||
1831 | { | ||
1832 | ++vcpu->stat.irq_exits; | ||
1833 | return 1; | ||
1834 | } | ||
1835 | |||
1836 | static int handle_triple_fault(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | ||
1837 | { | ||
1838 | kvm_run->exit_reason = KVM_EXIT_SHUTDOWN; | ||
1839 | return 0; | ||
1840 | } | ||
1841 | |||
1842 | static int handle_io(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | ||
1843 | { | ||
1844 | unsigned long exit_qualification; | ||
1845 | int size, down, in, string, rep; | ||
1846 | unsigned port; | ||
1847 | |||
1848 | ++vcpu->stat.io_exits; | ||
1849 | exit_qualification = vmcs_readl(EXIT_QUALIFICATION); | ||
1850 | string = (exit_qualification & 16) != 0; | ||
1851 | |||
1852 | if (string) { | ||
1853 | if (emulate_instruction(vcpu, kvm_run, 0, 0) == EMULATE_DO_MMIO) | ||
1854 | return 0; | ||
1855 | return 1; | ||
1856 | } | ||
1857 | |||
1858 | size = (exit_qualification & 7) + 1; | ||
1859 | in = (exit_qualification & 8) != 0; | ||
1860 | down = (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_DF) != 0; | ||
1861 | rep = (exit_qualification & 32) != 0; | ||
1862 | port = exit_qualification >> 16; | ||
1863 | |||
1864 | return kvm_emulate_pio(vcpu, kvm_run, in, size, port); | ||
1865 | } | ||
1866 | |||
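For non-string I/O the exit qualification encodes everything the PIO emulation needs: bits 2:0 hold the access size minus one, bit 3 distinguishes IN from OUT, bit 4 flags string instructions, bit 5 the REP prefix, and bits 31:16 the port number. A standalone decoder over the same bit layout, using a fabricated qualification value:

#include <stdint.h>
#include <stdio.h>

struct io_exit {
	int size, in, string, rep;
	unsigned port;
};

static struct io_exit decode_io_qualification(uint64_t q)
{
	struct io_exit io = {
		.size   = (int)(q & 7) + 1,	/* 1, 2 or 4 bytes */
		.in     = (q & 8) != 0,		/* 1 = IN, 0 = OUT */
		.string = (q & 16) != 0,	/* INS/OUTS        */
		.rep    = (q & 32) != 0,	/* REP prefix      */
		.port   = (unsigned)(q >> 16) & 0xffff,
	};
	return io;
}

int main(void)
{
	/* fabricated qualification for an "in al, 0x71" style access */
	struct io_exit io = decode_io_qualification(0x00710008);

	printf("size=%d in=%d string=%d rep=%d port=%#x\n",
	       io.size, io.in, io.string, io.rep, io.port);
	return 0;
}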
1867 | static void | ||
1868 | vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall) | ||
1869 | { | ||
1870 | /* | ||
1871 | * Patch in the VMCALL instruction: | ||
1872 | */ | ||
1873 | hypercall[0] = 0x0f; | ||
1874 | hypercall[1] = 0x01; | ||
1875 | hypercall[2] = 0xc1; | ||
1876 | hypercall[3] = 0xc3; | ||
1877 | } | ||
1878 | |||
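The bytes written here are the Intel VMCALL encoding, 0f 01 c1, followed by a ret (0xc3); the guest-visible hypercall stub is patched in place so the same guest code can run on either vendor (the SVM backend writes VMMCALL, 0f 01 d9, instead). A small sketch of a per-vendor patcher along those lines, with the AMD bytes assumed from the SVM side of the tree:

#include <stdio.h>

/* Patch a 4-byte hypercall stub: vendor-specific opcode plus a ret. */
static void patch_hypercall(unsigned char *stub, int is_intel)
{
	stub[0] = 0x0f;
	stub[1] = 0x01;
	stub[2] = is_intel ? 0xc1 : 0xd9;	/* VMCALL vs. VMMCALL */
	stub[3] = 0xc3;				/* ret */
}

int main(void)
{
	unsigned char stub[4];

	patch_hypercall(stub, 1);
	printf("%02x %02x %02x %02x\n", stub[0], stub[1], stub[2], stub[3]);
	return 0;
}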
1879 | static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | ||
1880 | { | ||
1881 | unsigned long exit_qualification; | ||
1882 | int cr; | ||
1883 | int reg; | ||
1884 | |||
1885 | exit_qualification = vmcs_readl(EXIT_QUALIFICATION); | ||
1886 | cr = exit_qualification & 15; | ||
1887 | reg = (exit_qualification >> 8) & 15; | ||
1888 | switch ((exit_qualification >> 4) & 3) { | ||
1889 | case 0: /* mov to cr */ | ||
1890 | switch (cr) { | ||
1891 | case 0: | ||
1892 | vcpu_load_rsp_rip(vcpu); | ||
1893 | set_cr0(vcpu, vcpu->regs[reg]); | ||
1894 | skip_emulated_instruction(vcpu); | ||
1895 | return 1; | ||
1896 | case 3: | ||
1897 | vcpu_load_rsp_rip(vcpu); | ||
1898 | set_cr3(vcpu, vcpu->regs[reg]); | ||
1899 | skip_emulated_instruction(vcpu); | ||
1900 | return 1; | ||
1901 | case 4: | ||
1902 | vcpu_load_rsp_rip(vcpu); | ||
1903 | set_cr4(vcpu, vcpu->regs[reg]); | ||
1904 | skip_emulated_instruction(vcpu); | ||
1905 | return 1; | ||
1906 | case 8: | ||
1907 | vcpu_load_rsp_rip(vcpu); | ||
1908 | set_cr8(vcpu, vcpu->regs[reg]); | ||
1909 | skip_emulated_instruction(vcpu); | ||
1910 | kvm_run->exit_reason = KVM_EXIT_SET_TPR; | ||
1911 | return 0; | ||
1912 | } | ||
1913 | break; | ||
1914 | case 2: /* clts */ | ||
1915 | vcpu_load_rsp_rip(vcpu); | ||
1916 | vmx_fpu_deactivate(vcpu); | ||
1917 | vcpu->cr0 &= ~X86_CR0_TS; | ||
1918 | vmcs_writel(CR0_READ_SHADOW, vcpu->cr0); | ||
1919 | vmx_fpu_activate(vcpu); | ||
1920 | skip_emulated_instruction(vcpu); | ||
1921 | return 1; | ||
1922 | case 1: /*mov from cr*/ | ||
1923 | switch (cr) { | ||
1924 | case 3: | ||
1925 | vcpu_load_rsp_rip(vcpu); | ||
1926 | vcpu->regs[reg] = vcpu->cr3; | ||
1927 | vcpu_put_rsp_rip(vcpu); | ||
1928 | skip_emulated_instruction(vcpu); | ||
1929 | return 1; | ||
1930 | case 8: | ||
1931 | vcpu_load_rsp_rip(vcpu); | ||
1932 | vcpu->regs[reg] = get_cr8(vcpu); | ||
1933 | vcpu_put_rsp_rip(vcpu); | ||
1934 | skip_emulated_instruction(vcpu); | ||
1935 | return 1; | ||
1936 | } | ||
1937 | break; | ||
1938 | case 3: /* lmsw */ | ||
1939 | lmsw(vcpu, (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f); | ||
1940 | |||
1941 | skip_emulated_instruction(vcpu); | ||
1942 | return 1; | ||
1943 | default: | ||
1944 | break; | ||
1945 | } | ||
1946 | kvm_run->exit_reason = 0; | ||
1947 | pr_unimpl(vcpu, "unhandled control register: op %d cr %d\n", | ||
1948 | (int)(exit_qualification >> 4) & 3, cr); | ||
1949 | return 0; | ||
1950 | } | ||
1951 | |||
1952 | static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | ||
1953 | { | ||
1954 | unsigned long exit_qualification; | ||
1955 | unsigned long val; | ||
1956 | int dr, reg; | ||
1957 | |||
1958 | /* | ||
1959 | * FIXME: this code assumes the host is debugging the guest. | ||
1960 | * need to deal with guest debugging itself too. | ||
1961 | */ | ||
1962 | exit_qualification = vmcs_readl(EXIT_QUALIFICATION); | ||
1963 | dr = exit_qualification & 7; | ||
1964 | reg = (exit_qualification >> 8) & 15; | ||
1965 | vcpu_load_rsp_rip(vcpu); | ||
1966 | if (exit_qualification & 16) { | ||
1967 | /* mov from dr */ | ||
1968 | switch (dr) { | ||
1969 | case 6: | ||
1970 | val = 0xffff0ff0; | ||
1971 | break; | ||
1972 | case 7: | ||
1973 | val = 0x400; | ||
1974 | break; | ||
1975 | default: | ||
1976 | val = 0; | ||
1977 | } | ||
1978 | vcpu->regs[reg] = val; | ||
1979 | } else { | ||
1980 | /* mov to dr */ | ||
1981 | } | ||
1982 | vcpu_put_rsp_rip(vcpu); | ||
1983 | skip_emulated_instruction(vcpu); | ||
1984 | return 1; | ||
1985 | } | ||
1986 | |||
1987 | static int handle_cpuid(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | ||
1988 | { | ||
1989 | kvm_emulate_cpuid(vcpu); | ||
1990 | return 1; | ||
1991 | } | ||
1992 | |||
1993 | static int handle_rdmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | ||
1994 | { | ||
1995 | u32 ecx = vcpu->regs[VCPU_REGS_RCX]; | ||
1996 | u64 data; | ||
1997 | |||
1998 | if (vmx_get_msr(vcpu, ecx, &data)) { | ||
1999 | vmx_inject_gp(vcpu, 0); | ||
2000 | return 1; | ||
2001 | } | ||
2002 | |||
2003 | /* FIXME: handling of bits 32:63 of rax, rdx */ | ||
2004 | vcpu->regs[VCPU_REGS_RAX] = data & -1u; | ||
2005 | vcpu->regs[VCPU_REGS_RDX] = (data >> 32) & -1u; | ||
2006 | skip_emulated_instruction(vcpu); | ||
2007 | return 1; | ||
2008 | } | ||
2009 | |||
2010 | static int handle_wrmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | ||
2011 | { | ||
2012 | u32 ecx = vcpu->regs[VCPU_REGS_RCX]; | ||
2013 | u64 data = (vcpu->regs[VCPU_REGS_RAX] & -1u) | ||
2014 | | ((u64)(vcpu->regs[VCPU_REGS_RDX] & -1u) << 32); | ||
2015 | |||
2016 | if (vmx_set_msr(vcpu, ecx, data) != 0) { | ||
2017 | vmx_inject_gp(vcpu, 0); | ||
2018 | return 1; | ||
2019 | } | ||
2020 | |||
2021 | skip_emulated_instruction(vcpu); | ||
2022 | return 1; | ||
2023 | } | ||
2024 | |||
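RDMSR and WRMSR move a 64-bit value through the EDX:EAX register pair, so the two handlers above just split and recombine the halves around vmx_get_msr()/vmx_set_msr(). The equivalent arithmetic in isolation:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t data = 0x1122334455667788ull;	/* arbitrary MSR value */

	/* rdmsr: split into edx:eax */
	uint32_t eax = (uint32_t)(data & 0xffffffffu);
	uint32_t edx = (uint32_t)(data >> 32);

	/* wrmsr: recombine */
	uint64_t back = (uint64_t)eax | ((uint64_t)edx << 32);

	printf("eax=%#x edx=%#x roundtrip=%#llx\n",
	       eax, edx, (unsigned long long)back);
	return 0;
}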
2025 | static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu, | ||
2026 | struct kvm_run *kvm_run) | ||
2027 | { | ||
2028 | return 1; | ||
2029 | } | ||
2030 | |||
2031 | static int handle_interrupt_window(struct kvm_vcpu *vcpu, | ||
2032 | struct kvm_run *kvm_run) | ||
2033 | { | ||
2034 | u32 cpu_based_vm_exec_control; | ||
2035 | |||
2036 | /* clear pending irq */ | ||
2037 | cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); | ||
2038 | cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING; | ||
2039 | vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); | ||
2040 | /* | ||
2041 | * If user space is waiting to inject interrupts, exit as soon as | ||
2042 | * possible. | ||
2043 | */ | ||
2044 | if (kvm_run->request_interrupt_window && | ||
2045 | !vcpu->irq_summary) { | ||
2046 | kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; | ||
2047 | ++vcpu->stat.irq_window_exits; | ||
2048 | return 0; | ||
2049 | } | ||
2050 | return 1; | ||
2051 | } | ||
2052 | |||
2053 | static int handle_halt(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | ||
2054 | { | ||
2055 | skip_emulated_instruction(vcpu); | ||
2056 | return kvm_emulate_halt(vcpu); | ||
2057 | } | ||
2058 | |||
2059 | static int handle_vmcall(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | ||
2060 | { | ||
2061 | skip_emulated_instruction(vcpu); | ||
2062 | return kvm_hypercall(vcpu, kvm_run); | ||
2063 | } | ||
2064 | |||
2065 | /* | ||
2066 | * The exit handlers return 1 if the exit was handled fully and guest execution | ||
2067 | * may resume. Otherwise they set the kvm_run parameter to indicate what needs | ||
2068 | * to be done to userspace and return 0. | ||
2069 | */ | ||
2070 | static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu, | ||
2071 | struct kvm_run *kvm_run) = { | ||
2072 | [EXIT_REASON_EXCEPTION_NMI] = handle_exception, | ||
2073 | [EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt, | ||
2074 | [EXIT_REASON_TRIPLE_FAULT] = handle_triple_fault, | ||
2075 | [EXIT_REASON_IO_INSTRUCTION] = handle_io, | ||
2076 | [EXIT_REASON_CR_ACCESS] = handle_cr, | ||
2077 | [EXIT_REASON_DR_ACCESS] = handle_dr, | ||
2078 | [EXIT_REASON_CPUID] = handle_cpuid, | ||
2079 | [EXIT_REASON_MSR_READ] = handle_rdmsr, | ||
2080 | [EXIT_REASON_MSR_WRITE] = handle_wrmsr, | ||
2081 | [EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window, | ||
2082 | [EXIT_REASON_HLT] = handle_halt, | ||
2083 | [EXIT_REASON_VMCALL] = handle_vmcall, | ||
2084 | [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold | ||
2085 | }; | ||
2086 | |||
2087 | static const int kvm_vmx_max_exit_handlers = | ||
2088 | ARRAY_SIZE(kvm_vmx_exit_handlers); | ||
2089 | |||
2090 | /* | ||
2091 | * The guest has exited. See if we can fix it or if we need userspace | ||
2092 | * assistance. | ||
2093 | */ | ||
2094 | static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) | ||
2095 | { | ||
2096 | u32 vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); | ||
2097 | u32 exit_reason = vmcs_read32(VM_EXIT_REASON); | ||
2098 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
2099 | |||
2100 | if (unlikely(vmx->fail)) { | ||
2101 | kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; | ||
2102 | kvm_run->fail_entry.hardware_entry_failure_reason | ||
2103 | = vmcs_read32(VM_INSTRUCTION_ERROR); | ||
2104 | return 0; | ||
2105 | } | ||
2106 | |||
2107 | if ((vectoring_info & VECTORING_INFO_VALID_MASK) && | ||
2108 | exit_reason != EXIT_REASON_EXCEPTION_NMI) | ||
2109 | printk(KERN_WARNING "%s: unexpected, valid vectoring info and " | ||
2110 | "exit reason is 0x%x\n", __FUNCTION__, exit_reason); | ||
2111 | if (exit_reason < kvm_vmx_max_exit_handlers | ||
2112 | && kvm_vmx_exit_handlers[exit_reason]) | ||
2113 | return kvm_vmx_exit_handlers[exit_reason](vcpu, kvm_run); | ||
2114 | else { | ||
2115 | kvm_run->exit_reason = KVM_EXIT_UNKNOWN; | ||
2116 | kvm_run->hw.hardware_exit_reason = exit_reason; | ||
2117 | } | ||
2118 | return 0; | ||
2119 | } | ||
2120 | |||
2121 | static void vmx_flush_tlb(struct kvm_vcpu *vcpu) | ||
2122 | { | ||
2123 | } | ||
2124 | |||
2125 | static void update_tpr_threshold(struct kvm_vcpu *vcpu) | ||
2126 | { | ||
2127 | int max_irr, tpr; | ||
2128 | |||
2129 | if (!vm_need_tpr_shadow(vcpu->kvm)) | ||
2130 | return; | ||
2131 | |||
2132 | if (!kvm_lapic_enabled(vcpu) || | ||
2133 | ((max_irr = kvm_lapic_find_highest_irr(vcpu)) == -1)) { | ||
2134 | vmcs_write32(TPR_THRESHOLD, 0); | ||
2135 | return; | ||
2136 | } | ||
2137 | |||
2138 | tpr = (kvm_lapic_get_cr8(vcpu) & 0x0f) << 4; | ||
2139 | vmcs_write32(TPR_THRESHOLD, (max_irr > tpr) ? tpr >> 4 : max_irr >> 4); | ||
2140 | } | ||
2141 | |||
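The TPR shadow works in 4-bit priority classes: CR8 holds bits 7:4 of the task priority, and an interrupt vector is compared by its upper nibble. update_tpr_threshold() therefore programs TPR_THRESHOLD with the smaller of the current TPR class and the class of the highest pending interrupt, so the guest exits once it lowers TPR far enough for that interrupt to become deliverable. A worked example of the same comparison, under that reading of the code:

#include <stdio.h>

/* Priority class of a vector, or of a TPR value, is its upper nibble. */
static int tpr_threshold(int cr8, int max_irr)
{
	int tpr = (cr8 & 0x0f) << 4;	/* scale CR8 back to TPR[7:4] */

	if (max_irr < 0)		/* nothing pending */
		return 0;
	return (max_irr > tpr) ? (tpr >> 4) : (max_irr >> 4);
}

int main(void)
{
	/* cr8 = 3 (TPR class 3), highest pending vector 0x51 (class 5):
	 * the interrupt already outranks TPR, threshold stays at class 3. */
	printf("%d\n", tpr_threshold(3, 0x51));	/* -> 3 */
	/* pending vector 0x21 (class 2) is blocked by TPR 3, so exit as
	 * soon as the guest drops TPR below class 2. */
	printf("%d\n", tpr_threshold(3, 0x21));	/* -> 2 */
	return 0;
}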
2142 | static void enable_irq_window(struct kvm_vcpu *vcpu) | ||
2143 | { | ||
2144 | u32 cpu_based_vm_exec_control; | ||
2145 | |||
2146 | cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); | ||
2147 | cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING; | ||
2148 | vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); | ||
2149 | } | ||
2150 | |||
2151 | static void vmx_intr_assist(struct kvm_vcpu *vcpu) | ||
2152 | { | ||
2153 | u32 idtv_info_field, intr_info_field; | ||
2154 | int has_ext_irq, interrupt_window_open; | ||
2155 | int vector; | ||
2156 | |||
2157 | kvm_inject_pending_timer_irqs(vcpu); | ||
2158 | update_tpr_threshold(vcpu); | ||
2159 | |||
2160 | has_ext_irq = kvm_cpu_has_interrupt(vcpu); | ||
2161 | intr_info_field = vmcs_read32(VM_ENTRY_INTR_INFO_FIELD); | ||
2162 | idtv_info_field = vmcs_read32(IDT_VECTORING_INFO_FIELD); | ||
2163 | if (intr_info_field & INTR_INFO_VALID_MASK) { | ||
2164 | if (idtv_info_field & INTR_INFO_VALID_MASK) { | ||
2165 | /* TODO: fault when IDT_Vectoring */ | ||
2166 | printk(KERN_ERR "Fault when IDT_Vectoring\n"); | ||
2167 | } | ||
2168 | if (has_ext_irq) | ||
2169 | enable_irq_window(vcpu); | ||
2170 | return; | ||
2171 | } | ||
2172 | if (unlikely(idtv_info_field & INTR_INFO_VALID_MASK)) { | ||
2173 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, idtv_info_field); | ||
2174 | vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, | ||
2175 | vmcs_read32(VM_EXIT_INSTRUCTION_LEN)); | ||
2176 | |||
2177 | if (unlikely(idtv_info_field & INTR_INFO_DELIEVER_CODE_MASK)) | ||
2178 | vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, | ||
2179 | vmcs_read32(IDT_VECTORING_ERROR_CODE)); | ||
2180 | if (unlikely(has_ext_irq)) | ||
2181 | enable_irq_window(vcpu); | ||
2182 | return; | ||
2183 | } | ||
2184 | if (!has_ext_irq) | ||
2185 | return; | ||
2186 | interrupt_window_open = | ||
2187 | ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) && | ||
2188 | (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0); | ||
2189 | if (interrupt_window_open) { | ||
2190 | vector = kvm_cpu_get_interrupt(vcpu); | ||
2191 | vmx_inject_irq(vcpu, vector); | ||
2192 | kvm_timer_intr_post(vcpu, vector); | ||
2193 | } else | ||
2194 | enable_irq_window(vcpu); | ||
2195 | } | ||
2196 | |||
2197 | static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | ||
2198 | { | ||
2199 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
2200 | u32 intr_info; | ||
2201 | |||
2202 | /* | ||
2203 | * Loading guest fpu may have cleared host cr0.ts | ||
2204 | */ | ||
2205 | vmcs_writel(HOST_CR0, read_cr0()); | ||
2206 | |||
2207 | asm ( | ||
2208 | /* Store host registers */ | ||
2209 | #ifdef CONFIG_X86_64 | ||
2210 | "push %%rax; push %%rbx; push %%rdx;" | ||
2211 | "push %%rsi; push %%rdi; push %%rbp;" | ||
2212 | "push %%r8; push %%r9; push %%r10; push %%r11;" | ||
2213 | "push %%r12; push %%r13; push %%r14; push %%r15;" | ||
2214 | "push %%rcx \n\t" | ||
2215 | ASM_VMX_VMWRITE_RSP_RDX "\n\t" | ||
2216 | #else | ||
2217 | "pusha; push %%ecx \n\t" | ||
2218 | ASM_VMX_VMWRITE_RSP_RDX "\n\t" | ||
2219 | #endif | ||
2220 | /* Check if vmlaunch or vmresume is needed */ | ||
2221 | "cmp $0, %1 \n\t" | ||
2222 | /* Load guest registers. Don't clobber flags. */ | ||
2223 | #ifdef CONFIG_X86_64 | ||
2224 | "mov %c[cr2](%3), %%rax \n\t" | ||
2225 | "mov %%rax, %%cr2 \n\t" | ||
2226 | "mov %c[rax](%3), %%rax \n\t" | ||
2227 | "mov %c[rbx](%3), %%rbx \n\t" | ||
2228 | "mov %c[rdx](%3), %%rdx \n\t" | ||
2229 | "mov %c[rsi](%3), %%rsi \n\t" | ||
2230 | "mov %c[rdi](%3), %%rdi \n\t" | ||
2231 | "mov %c[rbp](%3), %%rbp \n\t" | ||
2232 | "mov %c[r8](%3), %%r8 \n\t" | ||
2233 | "mov %c[r9](%3), %%r9 \n\t" | ||
2234 | "mov %c[r10](%3), %%r10 \n\t" | ||
2235 | "mov %c[r11](%3), %%r11 \n\t" | ||
2236 | "mov %c[r12](%3), %%r12 \n\t" | ||
2237 | "mov %c[r13](%3), %%r13 \n\t" | ||
2238 | "mov %c[r14](%3), %%r14 \n\t" | ||
2239 | "mov %c[r15](%3), %%r15 \n\t" | ||
2240 | "mov %c[rcx](%3), %%rcx \n\t" /* kills %3 (rcx) */ | ||
2241 | #else | ||
2242 | "mov %c[cr2](%3), %%eax \n\t" | ||
2243 | "mov %%eax, %%cr2 \n\t" | ||
2244 | "mov %c[rax](%3), %%eax \n\t" | ||
2245 | "mov %c[rbx](%3), %%ebx \n\t" | ||
2246 | "mov %c[rdx](%3), %%edx \n\t" | ||
2247 | "mov %c[rsi](%3), %%esi \n\t" | ||
2248 | "mov %c[rdi](%3), %%edi \n\t" | ||
2249 | "mov %c[rbp](%3), %%ebp \n\t" | ||
2250 | "mov %c[rcx](%3), %%ecx \n\t" /* kills %3 (ecx) */ | ||
2251 | #endif | ||
2252 | /* Enter guest mode */ | ||
2253 | "jne .Llaunched \n\t" | ||
2254 | ASM_VMX_VMLAUNCH "\n\t" | ||
2255 | "jmp .Lkvm_vmx_return \n\t" | ||
2256 | ".Llaunched: " ASM_VMX_VMRESUME "\n\t" | ||
2257 | ".Lkvm_vmx_return: " | ||
2258 | /* Save guest registers, load host registers, keep flags */ | ||
2259 | #ifdef CONFIG_X86_64 | ||
2260 | "xchg %3, (%%rsp) \n\t" | ||
2261 | "mov %%rax, %c[rax](%3) \n\t" | ||
2262 | "mov %%rbx, %c[rbx](%3) \n\t" | ||
2263 | "pushq (%%rsp); popq %c[rcx](%3) \n\t" | ||
2264 | "mov %%rdx, %c[rdx](%3) \n\t" | ||
2265 | "mov %%rsi, %c[rsi](%3) \n\t" | ||
2266 | "mov %%rdi, %c[rdi](%3) \n\t" | ||
2267 | "mov %%rbp, %c[rbp](%3) \n\t" | ||
2268 | "mov %%r8, %c[r8](%3) \n\t" | ||
2269 | "mov %%r9, %c[r9](%3) \n\t" | ||
2270 | "mov %%r10, %c[r10](%3) \n\t" | ||
2271 | "mov %%r11, %c[r11](%3) \n\t" | ||
2272 | "mov %%r12, %c[r12](%3) \n\t" | ||
2273 | "mov %%r13, %c[r13](%3) \n\t" | ||
2274 | "mov %%r14, %c[r14](%3) \n\t" | ||
2275 | "mov %%r15, %c[r15](%3) \n\t" | ||
2276 | "mov %%cr2, %%rax \n\t" | ||
2277 | "mov %%rax, %c[cr2](%3) \n\t" | ||
2278 | "mov (%%rsp), %3 \n\t" | ||
2279 | |||
2280 | "pop %%rcx; pop %%r15; pop %%r14; pop %%r13; pop %%r12;" | ||
2281 | "pop %%r11; pop %%r10; pop %%r9; pop %%r8;" | ||
2282 | "pop %%rbp; pop %%rdi; pop %%rsi;" | ||
2283 | "pop %%rdx; pop %%rbx; pop %%rax \n\t" | ||
2284 | #else | ||
2285 | "xchg %3, (%%esp) \n\t" | ||
2286 | "mov %%eax, %c[rax](%3) \n\t" | ||
2287 | "mov %%ebx, %c[rbx](%3) \n\t" | ||
2288 | "pushl (%%esp); popl %c[rcx](%3) \n\t" | ||
2289 | "mov %%edx, %c[rdx](%3) \n\t" | ||
2290 | "mov %%esi, %c[rsi](%3) \n\t" | ||
2291 | "mov %%edi, %c[rdi](%3) \n\t" | ||
2292 | "mov %%ebp, %c[rbp](%3) \n\t" | ||
2293 | "mov %%cr2, %%eax \n\t" | ||
2294 | "mov %%eax, %c[cr2](%3) \n\t" | ||
2295 | "mov (%%esp), %3 \n\t" | ||
2296 | |||
2297 | "pop %%ecx; popa \n\t" | ||
2298 | #endif | ||
2299 | "setbe %0 \n\t" | ||
2300 | : "=q" (vmx->fail) | ||
2301 | : "r"(vmx->launched), "d"((unsigned long)HOST_RSP), | ||
2302 | "c"(vcpu), | ||
2303 | [rax]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RAX])), | ||
2304 | [rbx]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RBX])), | ||
2305 | [rcx]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RCX])), | ||
2306 | [rdx]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RDX])), | ||
2307 | [rsi]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RSI])), | ||
2308 | [rdi]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RDI])), | ||
2309 | [rbp]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RBP])), | ||
2310 | #ifdef CONFIG_X86_64 | ||
2311 | [r8 ]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R8 ])), | ||
2312 | [r9 ]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R9 ])), | ||
2313 | [r10]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R10])), | ||
2314 | [r11]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R11])), | ||
2315 | [r12]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R12])), | ||
2316 | [r13]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R13])), | ||
2317 | [r14]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R14])), | ||
2318 | [r15]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R15])), | ||
2319 | #endif | ||
2320 | [cr2]"i"(offsetof(struct kvm_vcpu, cr2)) | ||
2321 | : "cc", "memory" ); | ||
2322 | |||
2323 | vcpu->interrupt_window_open = (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0; | ||
2324 | |||
2325 | asm ("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS)); | ||
2326 | vmx->launched = 1; | ||
2327 | |||
2328 | intr_info = vmcs_read32(VM_EXIT_INTR_INFO); | ||
2329 | |||
2330 | /* We need to handle NMIs before interrupts are enabled */ | ||
2331 | if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200) /* nmi */ | ||
2332 | asm("int $2"); | ||
2333 | } | ||
2334 | |||
2335 | static void vmx_inject_page_fault(struct kvm_vcpu *vcpu, | ||
2336 | unsigned long addr, | ||
2337 | u32 err_code) | ||
2338 | { | ||
2339 | u32 vect_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); | ||
2340 | |||
2341 | ++vcpu->stat.pf_guest; | ||
2342 | |||
2343 | if (is_page_fault(vect_info)) { | ||
2344 | printk(KERN_DEBUG "inject_page_fault: " | ||
2345 | "double fault 0x%lx @ 0x%lx\n", | ||
2346 | addr, vmcs_readl(GUEST_RIP)); | ||
2347 | vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, 0); | ||
2348 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, | ||
2349 | DF_VECTOR | | ||
2350 | INTR_TYPE_EXCEPTION | | ||
2351 | INTR_INFO_DELIEVER_CODE_MASK | | ||
2352 | INTR_INFO_VALID_MASK); | ||
2353 | return; | ||
2354 | } | ||
2355 | vcpu->cr2 = addr; | ||
2356 | vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, err_code); | ||
2357 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, | ||
2358 | PF_VECTOR | | ||
2359 | INTR_TYPE_EXCEPTION | | ||
2360 | INTR_INFO_DELIEVER_CODE_MASK | | ||
2361 | INTR_INFO_VALID_MASK); | ||
2362 | |||
2363 | } | ||
2364 | |||
2365 | static void vmx_free_vmcs(struct kvm_vcpu *vcpu) | ||
2366 | { | ||
2367 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
2368 | |||
2369 | if (vmx->vmcs) { | ||
2370 | on_each_cpu(__vcpu_clear, vmx, 0, 1); | ||
2371 | free_vmcs(vmx->vmcs); | ||
2372 | vmx->vmcs = NULL; | ||
2373 | } | ||
2374 | } | ||
2375 | |||
2376 | static void vmx_free_vcpu(struct kvm_vcpu *vcpu) | ||
2377 | { | ||
2378 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
2379 | |||
2380 | vmx_free_vmcs(vcpu); | ||
2381 | kfree(vmx->host_msrs); | ||
2382 | kfree(vmx->guest_msrs); | ||
2383 | kvm_vcpu_uninit(vcpu); | ||
2384 | kmem_cache_free(kvm_vcpu_cache, vmx); | ||
2385 | } | ||
2386 | |||
2387 | static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) | ||
2388 | { | ||
2389 | int err; | ||
2390 | struct vcpu_vmx *vmx = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); | ||
2391 | int cpu; | ||
2392 | |||
2393 | if (!vmx) | ||
2394 | return ERR_PTR(-ENOMEM); | ||
2395 | |||
2396 | err = kvm_vcpu_init(&vmx->vcpu, kvm, id); | ||
2397 | if (err) | ||
2398 | goto free_vcpu; | ||
2399 | |||
2400 | if (irqchip_in_kernel(kvm)) { | ||
2401 | err = kvm_create_lapic(&vmx->vcpu); | ||
2402 | if (err < 0) | ||
2403 | goto free_vcpu; | ||
2404 | } | ||
2405 | |||
2406 | vmx->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL); | ||
2407 | if (!vmx->guest_msrs) { | ||
2408 | err = -ENOMEM; | ||
2409 | goto uninit_vcpu; | ||
2410 | } | ||
2411 | |||
2412 | vmx->host_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL); | ||
2413 | if (!vmx->host_msrs) | ||
2414 | goto free_guest_msrs; | ||
2415 | |||
2416 | vmx->vmcs = alloc_vmcs(); | ||
2417 | if (!vmx->vmcs) | ||
2418 | goto free_msrs; | ||
2419 | |||
2420 | vmcs_clear(vmx->vmcs); | ||
2421 | |||
2422 | cpu = get_cpu(); | ||
2423 | vmx_vcpu_load(&vmx->vcpu, cpu); | ||
2424 | err = vmx_vcpu_setup(vmx); | ||
2425 | vmx_vcpu_put(&vmx->vcpu); | ||
2426 | put_cpu(); | ||
2427 | if (err) | ||
2428 | goto free_vmcs; | ||
2429 | |||
2430 | return &vmx->vcpu; | ||
2431 | |||
2432 | free_vmcs: | ||
2433 | free_vmcs(vmx->vmcs); | ||
2434 | free_msrs: | ||
2435 | kfree(vmx->host_msrs); | ||
2436 | free_guest_msrs: | ||
2437 | kfree(vmx->guest_msrs); | ||
2438 | uninit_vcpu: | ||
2439 | kvm_vcpu_uninit(&vmx->vcpu); | ||
2440 | free_vcpu: | ||
2441 | kmem_cache_free(kvm_vcpu_cache, vmx); | ||
2442 | return ERR_PTR(err); | ||
2443 | } | ||
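One subtle point in the unwind ladder above: on the host_msrs and vmcs allocation failures, err still holds 0 from the earlier successful calls, so the function can return ERR_PTR(0) rather than ERR_PTR(-ENOMEM). A minimal sketch of the missing error-code assignments:

        vmx->host_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL);
        if (!vmx->host_msrs) {
                err = -ENOMEM;
                goto free_guest_msrs;
        }

        vmx->vmcs = alloc_vmcs();
        if (!vmx->vmcs) {
                err = -ENOMEM;
                goto free_msrs;
        }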
2444 | |||
2445 | static void __init vmx_check_processor_compat(void *rtn) | ||
2446 | { | ||
2447 | struct vmcs_config vmcs_conf; | ||
2448 | |||
2449 | *(int *)rtn = 0; | ||
2450 | if (setup_vmcs_config(&vmcs_conf) < 0) | ||
2451 | *(int *)rtn = -EIO; | ||
2452 | if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config)) != 0) { | ||
2453 | printk(KERN_ERR "kvm: CPU %d feature inconsistency!\n", | ||
2454 | smp_processor_id()); | ||
2455 | *(int *)rtn = -EIO; | ||
2456 | } | ||
2457 | } | ||
2458 | |||
2459 | static struct kvm_x86_ops vmx_x86_ops = { | ||
2460 | .cpu_has_kvm_support = cpu_has_kvm_support, | ||
2461 | .disabled_by_bios = vmx_disabled_by_bios, | ||
2462 | .hardware_setup = hardware_setup, | ||
2463 | .hardware_unsetup = hardware_unsetup, | ||
2464 | .check_processor_compatibility = vmx_check_processor_compat, | ||
2465 | .hardware_enable = hardware_enable, | ||
2466 | .hardware_disable = hardware_disable, | ||
2467 | |||
2468 | .vcpu_create = vmx_create_vcpu, | ||
2469 | .vcpu_free = vmx_free_vcpu, | ||
2470 | .vcpu_reset = vmx_vcpu_reset, | ||
2471 | |||
2472 | .prepare_guest_switch = vmx_save_host_state, | ||
2473 | .vcpu_load = vmx_vcpu_load, | ||
2474 | .vcpu_put = vmx_vcpu_put, | ||
2475 | .vcpu_decache = vmx_vcpu_decache, | ||
2476 | |||
2477 | .set_guest_debug = set_guest_debug, | ||
2478 | .guest_debug_pre = kvm_guest_debug_pre, | ||
2479 | .get_msr = vmx_get_msr, | ||
2480 | .set_msr = vmx_set_msr, | ||
2481 | .get_segment_base = vmx_get_segment_base, | ||
2482 | .get_segment = vmx_get_segment, | ||
2483 | .set_segment = vmx_set_segment, | ||
2484 | .get_cs_db_l_bits = vmx_get_cs_db_l_bits, | ||
2485 | .decache_cr4_guest_bits = vmx_decache_cr4_guest_bits, | ||
2486 | .set_cr0 = vmx_set_cr0, | ||
2487 | .set_cr3 = vmx_set_cr3, | ||
2488 | .set_cr4 = vmx_set_cr4, | ||
2489 | #ifdef CONFIG_X86_64 | ||
2490 | .set_efer = vmx_set_efer, | ||
2491 | #endif | ||
2492 | .get_idt = vmx_get_idt, | ||
2493 | .set_idt = vmx_set_idt, | ||
2494 | .get_gdt = vmx_get_gdt, | ||
2495 | .set_gdt = vmx_set_gdt, | ||
2496 | .cache_regs = vcpu_load_rsp_rip, | ||
2497 | .decache_regs = vcpu_put_rsp_rip, | ||
2498 | .get_rflags = vmx_get_rflags, | ||
2499 | .set_rflags = vmx_set_rflags, | ||
2500 | |||
2501 | .tlb_flush = vmx_flush_tlb, | ||
2502 | .inject_page_fault = vmx_inject_page_fault, | ||
2503 | |||
2504 | .inject_gp = vmx_inject_gp, | ||
2505 | |||
2506 | .run = vmx_vcpu_run, | ||
2507 | .handle_exit = kvm_handle_exit, | ||
2508 | .skip_emulated_instruction = skip_emulated_instruction, | ||
2509 | .patch_hypercall = vmx_patch_hypercall, | ||
2510 | .get_irq = vmx_get_irq, | ||
2511 | .set_irq = vmx_inject_irq, | ||
2512 | .inject_pending_irq = vmx_intr_assist, | ||
2513 | .inject_pending_vectors = do_interrupt_requests, | ||
2514 | }; | ||
2515 | |||
2516 | static int __init vmx_init(void) | ||
2517 | { | ||
2518 | void *iova; | ||
2519 | int r; | ||
2520 | |||
2521 | vmx_io_bitmap_a = alloc_page(GFP_KERNEL | __GFP_HIGHMEM); | ||
2522 | if (!vmx_io_bitmap_a) | ||
2523 | return -ENOMEM; | ||
2524 | |||
2525 | vmx_io_bitmap_b = alloc_page(GFP_KERNEL | __GFP_HIGHMEM); | ||
2526 | if (!vmx_io_bitmap_b) { | ||
2527 | r = -ENOMEM; | ||
2528 | goto out; | ||
2529 | } | ||
2530 | |||
2531 | /* | ||
2532 | * Allow direct access to the PC debug port (it is often used for I/O | ||
2533 | * delays, but the vmexits simply slow things down). | ||
2534 | */ | ||
2535 | iova = kmap(vmx_io_bitmap_a); | ||
2536 | memset(iova, 0xff, PAGE_SIZE); | ||
2537 | clear_bit(0x80, iova); | ||
2538 | kunmap(vmx_io_bitmap_a); | ||
2539 | |||
2540 | iova = kmap(vmx_io_bitmap_b); | ||
2541 | memset(iova, 0xff, PAGE_SIZE); | ||
2542 | kunmap(vmx_io_bitmap_b); | ||
2543 | |||
2544 | r = kvm_init_x86(&vmx_x86_ops, sizeof(struct vcpu_vmx), THIS_MODULE); | ||
2545 | if (r) | ||
2546 | goto out1; | ||
2547 | |||
2548 | return 0; | ||
2549 | |||
2550 | out1: | ||
2551 | __free_page(vmx_io_bitmap_b); | ||
2552 | out: | ||
2553 | __free_page(vmx_io_bitmap_a); | ||
2554 | return r; | ||
2555 | } | ||
2556 | |||
2557 | static void __exit vmx_exit(void) | ||
2558 | { | ||
2559 | __free_page(vmx_io_bitmap_b); | ||
2560 | __free_page(vmx_io_bitmap_a); | ||
2561 | |||
2562 | kvm_exit_x86(); | ||
2563 | } | ||
2564 | |||
2565 | module_init(vmx_init) | ||
2566 | module_exit(vmx_exit) | ||
diff --git a/drivers/kvm/vmx.h b/drivers/kvm/vmx.h deleted file mode 100644 index fd4e14666088..000000000000 --- a/drivers/kvm/vmx.h +++ /dev/null | |||
@@ -1,310 +0,0 @@ | |||
1 | #ifndef VMX_H | ||
2 | #define VMX_H | ||
3 | |||
4 | /* | ||
5 | * vmx.h: VMX Architecture related definitions | ||
6 | * Copyright (c) 2004, Intel Corporation. | ||
7 | * | ||
8 | * This program is free software; you can redistribute it and/or modify it | ||
9 | * under the terms and conditions of the GNU General Public License, | ||
10 | * version 2, as published by the Free Software Foundation. | ||
11 | * | ||
12 | * This program is distributed in the hope it will be useful, but WITHOUT | ||
13 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | ||
14 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for | ||
15 | * more details. | ||
16 | * | ||
17 | * You should have received a copy of the GNU General Public License along with | ||
18 | * this program; if not, write to the Free Software Foundation, Inc., 59 Temple | ||
19 | * Place - Suite 330, Boston, MA 02111-1307 USA. | ||
20 | * | ||
21 | * A few random additions are: | ||
22 | * Copyright (C) 2006 Qumranet | ||
23 | * Avi Kivity <avi@qumranet.com> | ||
24 | * Yaniv Kamay <yaniv@qumranet.com> | ||
25 | * | ||
26 | */ | ||
27 | |||
28 | #define CPU_BASED_VIRTUAL_INTR_PENDING 0x00000004 | ||
29 | #define CPU_BASED_USE_TSC_OFFSETING 0x00000008 | ||
30 | #define CPU_BASED_HLT_EXITING 0x00000080 | ||
31 | #define CPU_BASED_INVLPG_EXITING 0x00000200 | ||
32 | #define CPU_BASED_MWAIT_EXITING 0x00000400 | ||
33 | #define CPU_BASED_RDPMC_EXITING 0x00000800 | ||
34 | #define CPU_BASED_RDTSC_EXITING 0x00001000 | ||
35 | #define CPU_BASED_CR8_LOAD_EXITING 0x00080000 | ||
36 | #define CPU_BASED_CR8_STORE_EXITING 0x00100000 | ||
37 | #define CPU_BASED_TPR_SHADOW 0x00200000 | ||
38 | #define CPU_BASED_MOV_DR_EXITING 0x00800000 | ||
39 | #define CPU_BASED_UNCOND_IO_EXITING 0x01000000 | ||
40 | #define CPU_BASED_USE_IO_BITMAPS 0x02000000 | ||
41 | #define CPU_BASED_USE_MSR_BITMAPS 0x10000000 | ||
42 | #define CPU_BASED_MONITOR_EXITING 0x20000000 | ||
43 | #define CPU_BASED_PAUSE_EXITING 0x40000000 | ||
44 | #define CPU_BASED_ACTIVATE_SECONDARY_CONTROLS 0x80000000 | ||
45 | |||
46 | #define PIN_BASED_EXT_INTR_MASK 0x00000001 | ||
47 | #define PIN_BASED_NMI_EXITING 0x00000008 | ||
48 | #define PIN_BASED_VIRTUAL_NMIS 0x00000020 | ||
49 | |||
50 | #define VM_EXIT_HOST_ADDR_SPACE_SIZE 0x00000200 | ||
51 | #define VM_EXIT_ACK_INTR_ON_EXIT 0x00008000 | ||
52 | |||
53 | #define VM_ENTRY_IA32E_MODE 0x00000200 | ||
54 | #define VM_ENTRY_SMM 0x00000400 | ||
55 | #define VM_ENTRY_DEACT_DUAL_MONITOR 0x00000800 | ||
56 | |||
57 | #define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001 | ||
58 | |||
59 | /* VMCS Encodings */ | ||
60 | enum vmcs_field { | ||
61 | GUEST_ES_SELECTOR = 0x00000800, | ||
62 | GUEST_CS_SELECTOR = 0x00000802, | ||
63 | GUEST_SS_SELECTOR = 0x00000804, | ||
64 | GUEST_DS_SELECTOR = 0x00000806, | ||
65 | GUEST_FS_SELECTOR = 0x00000808, | ||
66 | GUEST_GS_SELECTOR = 0x0000080a, | ||
67 | GUEST_LDTR_SELECTOR = 0x0000080c, | ||
68 | GUEST_TR_SELECTOR = 0x0000080e, | ||
69 | HOST_ES_SELECTOR = 0x00000c00, | ||
70 | HOST_CS_SELECTOR = 0x00000c02, | ||
71 | HOST_SS_SELECTOR = 0x00000c04, | ||
72 | HOST_DS_SELECTOR = 0x00000c06, | ||
73 | HOST_FS_SELECTOR = 0x00000c08, | ||
74 | HOST_GS_SELECTOR = 0x00000c0a, | ||
75 | HOST_TR_SELECTOR = 0x00000c0c, | ||
76 | IO_BITMAP_A = 0x00002000, | ||
77 | IO_BITMAP_A_HIGH = 0x00002001, | ||
78 | IO_BITMAP_B = 0x00002002, | ||
79 | IO_BITMAP_B_HIGH = 0x00002003, | ||
80 | MSR_BITMAP = 0x00002004, | ||
81 | MSR_BITMAP_HIGH = 0x00002005, | ||
82 | VM_EXIT_MSR_STORE_ADDR = 0x00002006, | ||
83 | VM_EXIT_MSR_STORE_ADDR_HIGH = 0x00002007, | ||
84 | VM_EXIT_MSR_LOAD_ADDR = 0x00002008, | ||
85 | VM_EXIT_MSR_LOAD_ADDR_HIGH = 0x00002009, | ||
86 | VM_ENTRY_MSR_LOAD_ADDR = 0x0000200a, | ||
87 | VM_ENTRY_MSR_LOAD_ADDR_HIGH = 0x0000200b, | ||
88 | TSC_OFFSET = 0x00002010, | ||
89 | TSC_OFFSET_HIGH = 0x00002011, | ||
90 | VIRTUAL_APIC_PAGE_ADDR = 0x00002012, | ||
91 | VIRTUAL_APIC_PAGE_ADDR_HIGH = 0x00002013, | ||
92 | VMCS_LINK_POINTER = 0x00002800, | ||
93 | VMCS_LINK_POINTER_HIGH = 0x00002801, | ||
94 | GUEST_IA32_DEBUGCTL = 0x00002802, | ||
95 | GUEST_IA32_DEBUGCTL_HIGH = 0x00002803, | ||
96 | PIN_BASED_VM_EXEC_CONTROL = 0x00004000, | ||
97 | CPU_BASED_VM_EXEC_CONTROL = 0x00004002, | ||
98 | EXCEPTION_BITMAP = 0x00004004, | ||
99 | PAGE_FAULT_ERROR_CODE_MASK = 0x00004006, | ||
100 | PAGE_FAULT_ERROR_CODE_MATCH = 0x00004008, | ||
101 | CR3_TARGET_COUNT = 0x0000400a, | ||
102 | VM_EXIT_CONTROLS = 0x0000400c, | ||
103 | VM_EXIT_MSR_STORE_COUNT = 0x0000400e, | ||
104 | VM_EXIT_MSR_LOAD_COUNT = 0x00004010, | ||
105 | VM_ENTRY_CONTROLS = 0x00004012, | ||
106 | VM_ENTRY_MSR_LOAD_COUNT = 0x00004014, | ||
107 | VM_ENTRY_INTR_INFO_FIELD = 0x00004016, | ||
108 | VM_ENTRY_EXCEPTION_ERROR_CODE = 0x00004018, | ||
109 | VM_ENTRY_INSTRUCTION_LEN = 0x0000401a, | ||
110 | TPR_THRESHOLD = 0x0000401c, | ||
111 | SECONDARY_VM_EXEC_CONTROL = 0x0000401e, | ||
112 | VM_INSTRUCTION_ERROR = 0x00004400, | ||
113 | VM_EXIT_REASON = 0x00004402, | ||
114 | VM_EXIT_INTR_INFO = 0x00004404, | ||
115 | VM_EXIT_INTR_ERROR_CODE = 0x00004406, | ||
116 | IDT_VECTORING_INFO_FIELD = 0x00004408, | ||
117 | IDT_VECTORING_ERROR_CODE = 0x0000440a, | ||
118 | VM_EXIT_INSTRUCTION_LEN = 0x0000440c, | ||
119 | VMX_INSTRUCTION_INFO = 0x0000440e, | ||
120 | GUEST_ES_LIMIT = 0x00004800, | ||
121 | GUEST_CS_LIMIT = 0x00004802, | ||
122 | GUEST_SS_LIMIT = 0x00004804, | ||
123 | GUEST_DS_LIMIT = 0x00004806, | ||
124 | GUEST_FS_LIMIT = 0x00004808, | ||
125 | GUEST_GS_LIMIT = 0x0000480a, | ||
126 | GUEST_LDTR_LIMIT = 0x0000480c, | ||
127 | GUEST_TR_LIMIT = 0x0000480e, | ||
128 | GUEST_GDTR_LIMIT = 0x00004810, | ||
129 | GUEST_IDTR_LIMIT = 0x00004812, | ||
130 | GUEST_ES_AR_BYTES = 0x00004814, | ||
131 | GUEST_CS_AR_BYTES = 0x00004816, | ||
132 | GUEST_SS_AR_BYTES = 0x00004818, | ||
133 | GUEST_DS_AR_BYTES = 0x0000481a, | ||
134 | GUEST_FS_AR_BYTES = 0x0000481c, | ||
135 | GUEST_GS_AR_BYTES = 0x0000481e, | ||
136 | GUEST_LDTR_AR_BYTES = 0x00004820, | ||
137 | GUEST_TR_AR_BYTES = 0x00004822, | ||
138 | GUEST_INTERRUPTIBILITY_INFO = 0x00004824, | ||
139 | GUEST_ACTIVITY_STATE = 0x00004826, | ||
140 | GUEST_SYSENTER_CS = 0x0000482a, | ||
141 | HOST_IA32_SYSENTER_CS = 0x00004c00, | ||
142 | CR0_GUEST_HOST_MASK = 0x00006000, | ||
143 | CR4_GUEST_HOST_MASK = 0x00006002, | ||
144 | CR0_READ_SHADOW = 0x00006004, | ||
145 | CR4_READ_SHADOW = 0x00006006, | ||
146 | CR3_TARGET_VALUE0 = 0x00006008, | ||
147 | CR3_TARGET_VALUE1 = 0x0000600a, | ||
148 | CR3_TARGET_VALUE2 = 0x0000600c, | ||
149 | CR3_TARGET_VALUE3 = 0x0000600e, | ||
150 | EXIT_QUALIFICATION = 0x00006400, | ||
151 | GUEST_LINEAR_ADDRESS = 0x0000640a, | ||
152 | GUEST_CR0 = 0x00006800, | ||
153 | GUEST_CR3 = 0x00006802, | ||
154 | GUEST_CR4 = 0x00006804, | ||
155 | GUEST_ES_BASE = 0x00006806, | ||
156 | GUEST_CS_BASE = 0x00006808, | ||
157 | GUEST_SS_BASE = 0x0000680a, | ||
158 | GUEST_DS_BASE = 0x0000680c, | ||
159 | GUEST_FS_BASE = 0x0000680e, | ||
160 | GUEST_GS_BASE = 0x00006810, | ||
161 | GUEST_LDTR_BASE = 0x00006812, | ||
162 | GUEST_TR_BASE = 0x00006814, | ||
163 | GUEST_GDTR_BASE = 0x00006816, | ||
164 | GUEST_IDTR_BASE = 0x00006818, | ||
165 | GUEST_DR7 = 0x0000681a, | ||
166 | GUEST_RSP = 0x0000681c, | ||
167 | GUEST_RIP = 0x0000681e, | ||
168 | GUEST_RFLAGS = 0x00006820, | ||
169 | GUEST_PENDING_DBG_EXCEPTIONS = 0x00006822, | ||
170 | GUEST_SYSENTER_ESP = 0x00006824, | ||
171 | GUEST_SYSENTER_EIP = 0x00006826, | ||
172 | HOST_CR0 = 0x00006c00, | ||
173 | HOST_CR3 = 0x00006c02, | ||
174 | HOST_CR4 = 0x00006c04, | ||
175 | HOST_FS_BASE = 0x00006c06, | ||
176 | HOST_GS_BASE = 0x00006c08, | ||
177 | HOST_TR_BASE = 0x00006c0a, | ||
178 | HOST_GDTR_BASE = 0x00006c0c, | ||
179 | HOST_IDTR_BASE = 0x00006c0e, | ||
180 | HOST_IA32_SYSENTER_ESP = 0x00006c10, | ||
181 | HOST_IA32_SYSENTER_EIP = 0x00006c12, | ||
182 | HOST_RSP = 0x00006c14, | ||
183 | HOST_RIP = 0x00006c16, | ||
184 | }; | ||
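The encodings above are only ever used through the width-specific accessors seen earlier in vmx.c (vmcs_read32(), vmcs_readl(), vmcs_write32(), ...). A minimal usage sketch, assuming those helpers:

        u32 reason = vmcs_read32(VM_EXIT_REASON);          /* 32-bit field        */
        unsigned long rip = vmcs_readl(GUEST_RIP);         /* natural-width field */

        vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
                     PF_VECTOR | INTR_TYPE_EXCEPTION |
                     INTR_INFO_DELIEVER_CODE_MASK | INTR_INFO_VALID_MASK);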
185 | |||
186 | #define VMX_EXIT_REASONS_FAILED_VMENTRY 0x80000000 | ||
187 | |||
188 | #define EXIT_REASON_EXCEPTION_NMI 0 | ||
189 | #define EXIT_REASON_EXTERNAL_INTERRUPT 1 | ||
190 | #define EXIT_REASON_TRIPLE_FAULT 2 | ||
191 | |||
192 | #define EXIT_REASON_PENDING_INTERRUPT 7 | ||
193 | |||
194 | #define EXIT_REASON_TASK_SWITCH 9 | ||
195 | #define EXIT_REASON_CPUID 10 | ||
196 | #define EXIT_REASON_HLT 12 | ||
197 | #define EXIT_REASON_INVLPG 14 | ||
198 | #define EXIT_REASON_RDPMC 15 | ||
199 | #define EXIT_REASON_RDTSC 16 | ||
200 | #define EXIT_REASON_VMCALL 18 | ||
201 | #define EXIT_REASON_VMCLEAR 19 | ||
202 | #define EXIT_REASON_VMLAUNCH 20 | ||
203 | #define EXIT_REASON_VMPTRLD 21 | ||
204 | #define EXIT_REASON_VMPTRST 22 | ||
205 | #define EXIT_REASON_VMREAD 23 | ||
206 | #define EXIT_REASON_VMRESUME 24 | ||
207 | #define EXIT_REASON_VMWRITE 25 | ||
208 | #define EXIT_REASON_VMOFF 26 | ||
209 | #define EXIT_REASON_VMON 27 | ||
210 | #define EXIT_REASON_CR_ACCESS 28 | ||
211 | #define EXIT_REASON_DR_ACCESS 29 | ||
212 | #define EXIT_REASON_IO_INSTRUCTION 30 | ||
213 | #define EXIT_REASON_MSR_READ 31 | ||
214 | #define EXIT_REASON_MSR_WRITE 32 | ||
215 | #define EXIT_REASON_MWAIT_INSTRUCTION 36 | ||
216 | #define EXIT_REASON_TPR_BELOW_THRESHOLD 43 | ||
217 | |||
218 | /* | ||
219 | * Interruption-information format | ||
220 | */ | ||
221 | #define INTR_INFO_VECTOR_MASK 0xff /* 7:0 */ | ||
222 | #define INTR_INFO_INTR_TYPE_MASK 0x700 /* 10:8 */ | ||
223 | #define INTR_INFO_DELIEVER_CODE_MASK 0x800 /* 11 */ | ||
224 | #define INTR_INFO_VALID_MASK 0x80000000 /* 31 */ | ||
225 | |||
226 | #define VECTORING_INFO_VECTOR_MASK INTR_INFO_VECTOR_MASK | ||
227 | #define VECTORING_INFO_TYPE_MASK INTR_INFO_INTR_TYPE_MASK | ||
228 | #define VECTORING_INFO_DELIEVER_CODE_MASK INTR_INFO_DELIEVER_CODE_MASK | ||
229 | #define VECTORING_INFO_VALID_MASK INTR_INFO_VALID_MASK | ||
230 | |||
231 | #define INTR_TYPE_EXT_INTR (0 << 8) /* external interrupt */ | ||
232 | #define INTR_TYPE_EXCEPTION (3 << 8) /* processor exception */ | ||
233 | |||
234 | /* | ||
235 | * Exit Qualifications for MOV for Control Register Access | ||
236 | */ | ||
237 | #define CONTROL_REG_ACCESS_NUM 0x7 /* 2:0, number of control register */ | ||
238 | #define CONTROL_REG_ACCESS_TYPE 0x30 /* 5:4, access type */ | ||
239 | #define CONTROL_REG_ACCESS_REG 0xf00 /* 11:8, general purpose register */ | ||
240 | #define LMSW_SOURCE_DATA_SHIFT 16 | ||
241 | #define LMSW_SOURCE_DATA (0xFFFF << LMSW_SOURCE_DATA_SHIFT) /* 16:31 lmsw source */ | ||
242 | #define REG_EAX (0 << 8) | ||
243 | #define REG_ECX (1 << 8) | ||
244 | #define REG_EDX (2 << 8) | ||
245 | #define REG_EBX (3 << 8) | ||
246 | #define REG_ESP (4 << 8) | ||
247 | #define REG_EBP (5 << 8) | ||
248 | #define REG_ESI (6 << 8) | ||
249 | #define REG_EDI (7 << 8) | ||
250 | #define REG_R8 (8 << 8) | ||
251 | #define REG_R9 (9 << 8) | ||
252 | #define REG_R10 (10 << 8) | ||
253 | #define REG_R11 (11 << 8) | ||
254 | #define REG_R12 (12 << 8) | ||
255 | #define REG_R13 (13 << 8) | ||
256 | #define REG_R14 (14 << 8) | ||
257 | #define REG_R15 (15 << 8) | ||
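A short sketch (local variable names assumed) of how a MOV-CR exit qualification is taken apart with the masks above; the register field lines up with the REG_* encodings, so shifting it down by 8 gives a general-purpose register index:

        unsigned long qual = vmcs_readl(EXIT_QUALIFICATION);
        int cr   = qual & CONTROL_REG_ACCESS_NUM;               /* which CRn                      */
        int type = (qual & CONTROL_REG_ACCESS_TYPE) >> 4;       /* 0 = mov to CR, 1 = mov from CR */
        int reg  = (qual & CONTROL_REG_ACCESS_REG) >> 8;        /* source/target GP register      */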
258 | |||
259 | /* | ||
260 | * Exit Qualifications for MOV for Debug Register Access | ||
261 | */ | ||
262 | #define DEBUG_REG_ACCESS_NUM 0x7 /* 2:0, number of debug register */ | ||
263 | #define DEBUG_REG_ACCESS_TYPE 0x10 /* 4, direction of access */ | ||
264 | #define TYPE_MOV_TO_DR (0 << 4) | ||
265 | #define TYPE_MOV_FROM_DR (1 << 4) | ||
266 | #define DEBUG_REG_ACCESS_REG 0xf00 /* 11:8, general purpose register */ | ||
267 | |||
268 | |||
269 | /* segment AR */ | ||
270 | #define SEGMENT_AR_L_MASK (1 << 13) | ||
271 | |||
272 | #define AR_TYPE_ACCESSES_MASK 1 | ||
273 | #define AR_TYPE_READABLE_MASK (1 << 1) | ||
274 | #define AR_TYPE_WRITEABLE_MASK (1 << 2) | ||
275 | #define AR_TYPE_CODE_MASK (1 << 3) | ||
276 | #define AR_TYPE_MASK 0x0f | ||
277 | #define AR_TYPE_BUSY_64_TSS 11 | ||
278 | #define AR_TYPE_BUSY_32_TSS 11 | ||
279 | #define AR_TYPE_BUSY_16_TSS 3 | ||
280 | #define AR_TYPE_LDT 2 | ||
281 | |||
282 | #define AR_UNUSABLE_MASK (1 << 16) | ||
283 | #define AR_S_MASK (1 << 4) | ||
284 | #define AR_P_MASK (1 << 7) | ||
285 | #define AR_L_MASK (1 << 13) | ||
286 | #define AR_DB_MASK (1 << 14) | ||
287 | #define AR_G_MASK (1 << 15) | ||
288 | #define AR_DPL_SHIFT 5 | ||
289 | #define AR_DPL(ar) (((ar) >> AR_DPL_SHIFT) & 3) | ||
290 | |||
291 | #define AR_RESERVD_MASK 0xfffe0f00 | ||
292 | |||
293 | #define MSR_IA32_VMX_BASIC 0x480 | ||
294 | #define MSR_IA32_VMX_PINBASED_CTLS 0x481 | ||
295 | #define MSR_IA32_VMX_PROCBASED_CTLS 0x482 | ||
296 | #define MSR_IA32_VMX_EXIT_CTLS 0x483 | ||
297 | #define MSR_IA32_VMX_ENTRY_CTLS 0x484 | ||
298 | #define MSR_IA32_VMX_MISC 0x485 | ||
299 | #define MSR_IA32_VMX_CR0_FIXED0 0x486 | ||
300 | #define MSR_IA32_VMX_CR0_FIXED1 0x487 | ||
301 | #define MSR_IA32_VMX_CR4_FIXED0 0x488 | ||
302 | #define MSR_IA32_VMX_CR4_FIXED1 0x489 | ||
303 | #define MSR_IA32_VMX_VMCS_ENUM 0x48a | ||
304 | #define MSR_IA32_VMX_PROCBASED_CTLS2 0x48b | ||
305 | |||
306 | #define MSR_IA32_FEATURE_CONTROL 0x3a | ||
307 | #define MSR_IA32_FEATURE_CONTROL_LOCKED 0x1 | ||
308 | #define MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED 0x4 | ||
309 | |||
310 | #endif | ||
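The feature-control bits at the end of the header are what the .disabled_by_bios hook in the ops table checks: the CPU refuses VMXON when the MSR is locked without the VMXON-enabled bit set. A minimal sketch of that test, mirroring vmx_disabled_by_bios():

        u64 msr;

        rdmsrl(MSR_IA32_FEATURE_CONTROL, msr);
        if ((msr & (MSR_IA32_FEATURE_CONTROL_LOCKED |
                    MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED))
            == MSR_IA32_FEATURE_CONTROL_LOCKED)
                return 1;       /* locked by firmware with VMX left disabled */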
diff --git a/drivers/kvm/x86_emulate.c b/drivers/kvm/x86_emulate.c deleted file mode 100644 index bd46de6bf891..000000000000 --- a/drivers/kvm/x86_emulate.c +++ /dev/null | |||
@@ -1,1662 +0,0 @@ | |||
1 | /****************************************************************************** | ||
2 | * x86_emulate.c | ||
3 | * | ||
4 | * Generic x86 (32-bit and 64-bit) instruction decoder and emulator. | ||
5 | * | ||
6 | * Copyright (c) 2005 Keir Fraser | ||
7 | * | ||
8 | * Linux coding style, mod r/m decoder, segment base fixes, real-mode | ||
9 | * privileged instructions: | ||
10 | * | ||
11 | * Copyright (C) 2006 Qumranet | ||
12 | * | ||
13 | * Avi Kivity <avi@qumranet.com> | ||
14 | * Yaniv Kamay <yaniv@qumranet.com> | ||
15 | * | ||
16 | * This work is licensed under the terms of the GNU GPL, version 2. See | ||
17 | * the COPYING file in the top-level directory. | ||
18 | * | ||
19 | * From: xen-unstable 10676:af9809f51f81a3c43f276f00c81a52ef558afda4 | ||
20 | */ | ||
21 | |||
22 | #ifndef __KERNEL__ | ||
23 | #include <stdio.h> | ||
24 | #include <stdint.h> | ||
25 | #include <public/xen.h> | ||
26 | #define DPRINTF(_f, _a ...) printf( _f , ## _a ) | ||
27 | #else | ||
28 | #include "kvm.h" | ||
29 | #define DPRINTF(x...) do {} while (0) | ||
30 | #endif | ||
31 | #include "x86_emulate.h" | ||
32 | #include <linux/module.h> | ||
33 | |||
34 | /* | ||
35 | * Opcode effective-address decode tables. | ||
36 | * Note that we only emulate instructions that have at least one memory | ||
37 | * operand (excluding implicit stack references). We assume that stack | ||
38 | * references and instruction fetches will never occur in special memory | ||
39 | * areas that require emulation. So, for example, 'mov <imm>,<reg>' need | ||
40 | * not be handled. | ||
41 | */ | ||
42 | |||
43 | /* Operand sizes: 8-bit operands or specified/overridden size. */ | ||
44 | #define ByteOp (1<<0) /* 8-bit operands. */ | ||
45 | /* Destination operand type. */ | ||
46 | #define ImplicitOps (1<<1) /* Implicit in opcode. No generic decode. */ | ||
47 | #define DstReg (2<<1) /* Register operand. */ | ||
48 | #define DstMem (3<<1) /* Memory operand. */ | ||
49 | #define DstMask (3<<1) | ||
50 | /* Source operand type. */ | ||
51 | #define SrcNone (0<<3) /* No source operand. */ | ||
52 | #define SrcImplicit (0<<3) /* Source operand is implicit in the opcode. */ | ||
53 | #define SrcReg (1<<3) /* Register operand. */ | ||
54 | #define SrcMem (2<<3) /* Memory operand. */ | ||
55 | #define SrcMem16 (3<<3) /* Memory operand (16-bit). */ | ||
56 | #define SrcMem32 (4<<3) /* Memory operand (32-bit). */ | ||
57 | #define SrcImm (5<<3) /* Immediate operand. */ | ||
58 | #define SrcImmByte (6<<3) /* 8-bit sign-extended immediate operand. */ | ||
59 | #define SrcMask (7<<3) | ||
60 | /* Generic ModRM decode. */ | ||
61 | #define ModRM (1<<6) | ||
62 | /* Destination is only written; never read. */ | ||
63 | #define Mov (1<<7) | ||
64 | #define BitOp (1<<8) | ||
65 | |||
66 | static u8 opcode_table[256] = { | ||
67 | /* 0x00 - 0x07 */ | ||
68 | ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, | ||
69 | ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, | ||
70 | 0, 0, 0, 0, | ||
71 | /* 0x08 - 0x0F */ | ||
72 | ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, | ||
73 | ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, | ||
74 | 0, 0, 0, 0, | ||
75 | /* 0x10 - 0x17 */ | ||
76 | ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, | ||
77 | ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, | ||
78 | 0, 0, 0, 0, | ||
79 | /* 0x18 - 0x1F */ | ||
80 | ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, | ||
81 | ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, | ||
82 | 0, 0, 0, 0, | ||
83 | /* 0x20 - 0x27 */ | ||
84 | ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, | ||
85 | ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, | ||
86 | SrcImmByte, SrcImm, 0, 0, | ||
87 | /* 0x28 - 0x2F */ | ||
88 | ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, | ||
89 | ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, | ||
90 | 0, 0, 0, 0, | ||
91 | /* 0x30 - 0x37 */ | ||
92 | ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, | ||
93 | ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, | ||
94 | 0, 0, 0, 0, | ||
95 | /* 0x38 - 0x3F */ | ||
96 | ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, | ||
97 | ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, | ||
98 | 0, 0, 0, 0, | ||
99 | /* 0x40 - 0x4F */ | ||
100 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | ||
101 | /* 0x50 - 0x57 */ | ||
102 | ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, | ||
103 | ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, | ||
104 | /* 0x58 - 0x5F */ | ||
105 | ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, | ||
106 | ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, | ||
107 | /* 0x60 - 0x67 */ | ||
108 | 0, 0, 0, DstReg | SrcMem32 | ModRM | Mov /* movsxd (x86/64) */ , | ||
109 | 0, 0, 0, 0, | ||
110 | /* 0x68 - 0x6F */ | ||
111 | 0, 0, ImplicitOps|Mov, 0, | ||
112 | SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* insb, insw/insd */ | ||
113 | SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* outsb, outsw/outsd */ | ||
114 | /* 0x70 - 0x77 */ | ||
115 | ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, | ||
116 | ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, | ||
117 | /* 0x78 - 0x7F */ | ||
118 | ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, | ||
119 | ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, | ||
120 | /* 0x80 - 0x87 */ | ||
121 | ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM, | ||
122 | ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM, | ||
123 | ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, | ||
124 | ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, | ||
125 | /* 0x88 - 0x8F */ | ||
126 | ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov, | ||
127 | ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, | ||
128 | 0, ModRM | DstReg, 0, DstMem | SrcNone | ModRM | Mov, | ||
129 | /* 0x90 - 0x9F */ | ||
130 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps, ImplicitOps, 0, 0, | ||
131 | /* 0xA0 - 0xA7 */ | ||
132 | ByteOp | DstReg | SrcMem | Mov, DstReg | SrcMem | Mov, | ||
133 | ByteOp | DstMem | SrcReg | Mov, DstMem | SrcReg | Mov, | ||
134 | ByteOp | ImplicitOps | Mov, ImplicitOps | Mov, | ||
135 | ByteOp | ImplicitOps, ImplicitOps, | ||
136 | /* 0xA8 - 0xAF */ | ||
137 | 0, 0, ByteOp | ImplicitOps | Mov, ImplicitOps | Mov, | ||
138 | ByteOp | ImplicitOps | Mov, ImplicitOps | Mov, | ||
139 | ByteOp | ImplicitOps, ImplicitOps, | ||
140 | /* 0xB0 - 0xBF */ | ||
141 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | ||
142 | /* 0xC0 - 0xC7 */ | ||
143 | ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM, | ||
144 | 0, ImplicitOps, 0, 0, | ||
145 | ByteOp | DstMem | SrcImm | ModRM | Mov, DstMem | SrcImm | ModRM | Mov, | ||
146 | /* 0xC8 - 0xCF */ | ||
147 | 0, 0, 0, 0, 0, 0, 0, 0, | ||
148 | /* 0xD0 - 0xD7 */ | ||
149 | ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM, | ||
150 | ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM, | ||
151 | 0, 0, 0, 0, | ||
152 | /* 0xD8 - 0xDF */ | ||
153 | 0, 0, 0, 0, 0, 0, 0, 0, | ||
154 | /* 0xE0 - 0xE7 */ | ||
155 | 0, 0, 0, 0, 0, 0, 0, 0, | ||
156 | /* 0xE8 - 0xEF */ | ||
157 | ImplicitOps, SrcImm|ImplicitOps, 0, SrcImmByte|ImplicitOps, 0, 0, 0, 0, | ||
158 | /* 0xF0 - 0xF7 */ | ||
159 | 0, 0, 0, 0, | ||
160 | ImplicitOps, 0, | ||
161 | ByteOp | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM, | ||
162 | /* 0xF8 - 0xFF */ | ||
163 | 0, 0, 0, 0, | ||
164 | 0, 0, ByteOp | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM | ||
165 | }; | ||
166 | |||
167 | static u16 twobyte_table[256] = { | ||
168 | /* 0x00 - 0x0F */ | ||
169 | 0, SrcMem | ModRM | DstReg, 0, 0, 0, 0, ImplicitOps, 0, | ||
170 | ImplicitOps, ImplicitOps, 0, 0, 0, ImplicitOps | ModRM, 0, 0, | ||
171 | /* 0x10 - 0x1F */ | ||
172 | 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, 0, | ||
173 | /* 0x20 - 0x2F */ | ||
174 | ModRM | ImplicitOps, ModRM, ModRM | ImplicitOps, ModRM, 0, 0, 0, 0, | ||
175 | 0, 0, 0, 0, 0, 0, 0, 0, | ||
176 | /* 0x30 - 0x3F */ | ||
177 | ImplicitOps, 0, ImplicitOps, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | ||
178 | /* 0x40 - 0x47 */ | ||
179 | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, | ||
180 | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, | ||
181 | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, | ||
182 | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, | ||
183 | /* 0x48 - 0x4F */ | ||
184 | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, | ||
185 | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, | ||
186 | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, | ||
187 | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, | ||
188 | /* 0x50 - 0x5F */ | ||
189 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | ||
190 | /* 0x60 - 0x6F */ | ||
191 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | ||
192 | /* 0x70 - 0x7F */ | ||
193 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | ||
194 | /* 0x80 - 0x8F */ | ||
195 | ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, | ||
196 | ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, | ||
197 | ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, | ||
198 | ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, | ||
199 | /* 0x90 - 0x9F */ | ||
200 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | ||
201 | /* 0xA0 - 0xA7 */ | ||
202 | 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 0, 0, 0, 0, | ||
203 | /* 0xA8 - 0xAF */ | ||
204 | 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 0, 0, 0, 0, | ||
205 | /* 0xB0 - 0xB7 */ | ||
206 | ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 0, | ||
207 | DstMem | SrcReg | ModRM | BitOp, | ||
208 | 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov, | ||
209 | DstReg | SrcMem16 | ModRM | Mov, | ||
210 | /* 0xB8 - 0xBF */ | ||
211 | 0, 0, DstMem | SrcImmByte | ModRM, DstMem | SrcReg | ModRM | BitOp, | ||
212 | 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov, | ||
213 | DstReg | SrcMem16 | ModRM | Mov, | ||
214 | /* 0xC0 - 0xCF */ | ||
215 | 0, 0, 0, DstMem | SrcReg | ModRM | Mov, 0, 0, 0, ImplicitOps | ModRM, | ||
216 | 0, 0, 0, 0, 0, 0, 0, 0, | ||
217 | /* 0xD0 - 0xDF */ | ||
218 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | ||
219 | /* 0xE0 - 0xEF */ | ||
220 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | ||
221 | /* 0xF0 - 0xFF */ | ||
222 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 | ||
223 | }; | ||
224 | |||
225 | /* Type, address-of, and value of an instruction's operand. */ | ||
226 | struct operand { | ||
227 | enum { OP_REG, OP_MEM, OP_IMM } type; | ||
228 | unsigned int bytes; | ||
229 | unsigned long val, orig_val, *ptr; | ||
230 | }; | ||
231 | |||
232 | /* EFLAGS bit definitions. */ | ||
233 | #define EFLG_OF (1<<11) | ||
234 | #define EFLG_DF (1<<10) | ||
235 | #define EFLG_SF (1<<7) | ||
236 | #define EFLG_ZF (1<<6) | ||
237 | #define EFLG_AF (1<<4) | ||
238 | #define EFLG_PF (1<<2) | ||
239 | #define EFLG_CF (1<<0) | ||
240 | |||
241 | /* | ||
242 | * Instruction emulation: | ||
243 | * Most instructions are emulated directly via a fragment of inline assembly | ||
244 | * code. This allows us to save/restore EFLAGS and thus very easily pick up | ||
245 | * any modified flags. | ||
246 | */ | ||
247 | |||
248 | #if defined(CONFIG_X86_64) | ||
249 | #define _LO32 "k" /* force 32-bit operand */ | ||
250 | #define _STK "%%rsp" /* stack pointer */ | ||
251 | #elif defined(__i386__) | ||
252 | #define _LO32 "" /* force 32-bit operand */ | ||
253 | #define _STK "%%esp" /* stack pointer */ | ||
254 | #endif | ||
255 | |||
256 | /* | ||
257 | * These EFLAGS bits are restored from saved value during emulation, and | ||
258 | * any changes are written back to the saved value after emulation. | ||
259 | */ | ||
260 | #define EFLAGS_MASK (EFLG_OF|EFLG_SF|EFLG_ZF|EFLG_AF|EFLG_PF|EFLG_CF) | ||
261 | |||
262 | /* Before executing instruction: restore necessary bits in EFLAGS. */ | ||
263 | #define _PRE_EFLAGS(_sav, _msk, _tmp) \ | ||
264 | /* EFLAGS = (_sav & _msk) | (EFLAGS & ~_msk); */ \ | ||
265 | "push %"_sav"; " \ | ||
266 | "movl %"_msk",%"_LO32 _tmp"; " \ | ||
267 | "andl %"_LO32 _tmp",("_STK"); " \ | ||
268 | "pushf; " \ | ||
269 | "notl %"_LO32 _tmp"; " \ | ||
270 | "andl %"_LO32 _tmp",("_STK"); " \ | ||
271 | "pop %"_tmp"; " \ | ||
272 | "orl %"_LO32 _tmp",("_STK"); " \ | ||
273 | "popf; " \ | ||
274 | /* _sav &= ~msk; */ \ | ||
275 | "movl %"_msk",%"_LO32 _tmp"; " \ | ||
276 | "notl %"_LO32 _tmp"; " \ | ||
277 | "andl %"_LO32 _tmp",%"_sav"; " | ||
278 | |||
279 | /* After executing instruction: write-back necessary bits in EFLAGS. */ | ||
280 | #define _POST_EFLAGS(_sav, _msk, _tmp) \ | ||
281 | /* _sav |= EFLAGS & _msk; */ \ | ||
282 | "pushf; " \ | ||
283 | "pop %"_tmp"; " \ | ||
284 | "andl %"_msk",%"_LO32 _tmp"; " \ | ||
285 | "orl %"_LO32 _tmp",%"_sav"; " | ||
286 | |||
287 | /* Raw emulation: instruction has two explicit operands. */ | ||
288 | #define __emulate_2op_nobyte(_op,_src,_dst,_eflags,_wx,_wy,_lx,_ly,_qx,_qy) \ | ||
289 | do { \ | ||
290 | unsigned long _tmp; \ | ||
291 | \ | ||
292 | switch ((_dst).bytes) { \ | ||
293 | case 2: \ | ||
294 | __asm__ __volatile__ ( \ | ||
295 | _PRE_EFLAGS("0","4","2") \ | ||
296 | _op"w %"_wx"3,%1; " \ | ||
297 | _POST_EFLAGS("0","4","2") \ | ||
298 | : "=m" (_eflags), "=m" ((_dst).val), \ | ||
299 | "=&r" (_tmp) \ | ||
300 | : _wy ((_src).val), "i" (EFLAGS_MASK) ); \ | ||
301 | break; \ | ||
302 | case 4: \ | ||
303 | __asm__ __volatile__ ( \ | ||
304 | _PRE_EFLAGS("0","4","2") \ | ||
305 | _op"l %"_lx"3,%1; " \ | ||
306 | _POST_EFLAGS("0","4","2") \ | ||
307 | : "=m" (_eflags), "=m" ((_dst).val), \ | ||
308 | "=&r" (_tmp) \ | ||
309 | : _ly ((_src).val), "i" (EFLAGS_MASK) ); \ | ||
310 | break; \ | ||
311 | case 8: \ | ||
312 | __emulate_2op_8byte(_op, _src, _dst, \ | ||
313 | _eflags, _qx, _qy); \ | ||
314 | break; \ | ||
315 | } \ | ||
316 | } while (0) | ||
317 | |||
318 | #define __emulate_2op(_op,_src,_dst,_eflags,_bx,_by,_wx,_wy,_lx,_ly,_qx,_qy) \ | ||
319 | do { \ | ||
320 | unsigned long _tmp; \ | ||
321 | switch ( (_dst).bytes ) \ | ||
322 | { \ | ||
323 | case 1: \ | ||
324 | __asm__ __volatile__ ( \ | ||
325 | _PRE_EFLAGS("0","4","2") \ | ||
326 | _op"b %"_bx"3,%1; " \ | ||
327 | _POST_EFLAGS("0","4","2") \ | ||
328 | : "=m" (_eflags), "=m" ((_dst).val), \ | ||
329 | "=&r" (_tmp) \ | ||
330 | : _by ((_src).val), "i" (EFLAGS_MASK) ); \ | ||
331 | break; \ | ||
332 | default: \ | ||
333 | __emulate_2op_nobyte(_op, _src, _dst, _eflags, \ | ||
334 | _wx, _wy, _lx, _ly, _qx, _qy); \ | ||
335 | break; \ | ||
336 | } \ | ||
337 | } while (0) | ||
338 | |||
339 | /* Source operand is byte-sized and may be restricted to just %cl. */ | ||
340 | #define emulate_2op_SrcB(_op, _src, _dst, _eflags) \ | ||
341 | __emulate_2op(_op, _src, _dst, _eflags, \ | ||
342 | "b", "c", "b", "c", "b", "c", "b", "c") | ||
343 | |||
344 | /* Source operand is byte, word, long or quad sized. */ | ||
345 | #define emulate_2op_SrcV(_op, _src, _dst, _eflags) \ | ||
346 | __emulate_2op(_op, _src, _dst, _eflags, \ | ||
347 | "b", "q", "w", "r", _LO32, "r", "", "r") | ||
348 | |||
349 | /* Source operand is word, long or quad sized. */ | ||
350 | #define emulate_2op_SrcV_nobyte(_op, _src, _dst, _eflags) \ | ||
351 | __emulate_2op_nobyte(_op, _src, _dst, _eflags, \ | ||
352 | "w", "r", _LO32, "r", "", "r") | ||
353 | |||
354 | /* Instruction has only one explicit operand (no source operand). */ | ||
355 | #define emulate_1op(_op, _dst, _eflags) \ | ||
356 | do { \ | ||
357 | unsigned long _tmp; \ | ||
358 | \ | ||
359 | switch ( (_dst).bytes ) \ | ||
360 | { \ | ||
361 | case 1: \ | ||
362 | __asm__ __volatile__ ( \ | ||
363 | _PRE_EFLAGS("0","3","2") \ | ||
364 | _op"b %1; " \ | ||
365 | _POST_EFLAGS("0","3","2") \ | ||
366 | : "=m" (_eflags), "=m" ((_dst).val), \ | ||
367 | "=&r" (_tmp) \ | ||
368 | : "i" (EFLAGS_MASK) ); \ | ||
369 | break; \ | ||
370 | case 2: \ | ||
371 | __asm__ __volatile__ ( \ | ||
372 | _PRE_EFLAGS("0","3","2") \ | ||
373 | _op"w %1; " \ | ||
374 | _POST_EFLAGS("0","3","2") \ | ||
375 | : "=m" (_eflags), "=m" ((_dst).val), \ | ||
376 | "=&r" (_tmp) \ | ||
377 | : "i" (EFLAGS_MASK) ); \ | ||
378 | break; \ | ||
379 | case 4: \ | ||
380 | __asm__ __volatile__ ( \ | ||
381 | _PRE_EFLAGS("0","3","2") \ | ||
382 | _op"l %1; " \ | ||
383 | _POST_EFLAGS("0","3","2") \ | ||
384 | : "=m" (_eflags), "=m" ((_dst).val), \ | ||
385 | "=&r" (_tmp) \ | ||
386 | : "i" (EFLAGS_MASK) ); \ | ||
387 | break; \ | ||
388 | case 8: \ | ||
389 | __emulate_1op_8byte(_op, _dst, _eflags); \ | ||
390 | break; \ | ||
391 | } \ | ||
392 | } while (0) | ||
393 | |||
394 | /* Emulate an instruction with quadword operands (x86/64 only). */ | ||
395 | #if defined(CONFIG_X86_64) | ||
396 | #define __emulate_2op_8byte(_op, _src, _dst, _eflags, _qx, _qy) \ | ||
397 | do { \ | ||
398 | __asm__ __volatile__ ( \ | ||
399 | _PRE_EFLAGS("0","4","2") \ | ||
400 | _op"q %"_qx"3,%1; " \ | ||
401 | _POST_EFLAGS("0","4","2") \ | ||
402 | : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \ | ||
403 | : _qy ((_src).val), "i" (EFLAGS_MASK) ); \ | ||
404 | } while (0) | ||
405 | |||
406 | #define __emulate_1op_8byte(_op, _dst, _eflags) \ | ||
407 | do { \ | ||
408 | __asm__ __volatile__ ( \ | ||
409 | _PRE_EFLAGS("0","3","2") \ | ||
410 | _op"q %1; " \ | ||
411 | _POST_EFLAGS("0","3","2") \ | ||
412 | : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \ | ||
413 | : "i" (EFLAGS_MASK) ); \ | ||
414 | } while (0) | ||
415 | |||
416 | #elif defined(__i386__) | ||
417 | #define __emulate_2op_8byte(_op, _src, _dst, _eflags, _qx, _qy) | ||
418 | #define __emulate_1op_8byte(_op, _dst, _eflags) | ||
419 | #endif /* __i386__ */ | ||
420 | |||
421 | /* Fetch next part of the instruction being emulated. */ | ||
422 | #define insn_fetch(_type, _size, _eip) \ | ||
423 | ({ unsigned long _x; \ | ||
424 | rc = ops->read_std((unsigned long)(_eip) + ctxt->cs_base, &_x, \ | ||
425 | (_size), ctxt->vcpu); \ | ||
426 | if ( rc != 0 ) \ | ||
427 | goto done; \ | ||
428 | (_eip) += (_size); \ | ||
429 | (_type)_x; \ | ||
430 | }) | ||
431 | |||
432 | /* Access/update address held in a register, based on addressing mode. */ | ||
433 | #define address_mask(reg) \ | ||
434 | ((ad_bytes == sizeof(unsigned long)) ? \ | ||
435 | (reg) : ((reg) & ((1UL << (ad_bytes << 3)) - 1))) | ||
436 | #define register_address(base, reg) \ | ||
437 | ((base) + address_mask(reg)) | ||
438 | #define register_address_increment(reg, inc) \ | ||
439 | do { \ | ||
440 | /* signed type ensures sign extension to long */ \ | ||
441 | int _inc = (inc); \ | ||
442 | if ( ad_bytes == sizeof(unsigned long) ) \ | ||
443 | (reg) += _inc; \ | ||
444 | else \ | ||
445 | (reg) = ((reg) & ~((1UL << (ad_bytes << 3)) - 1)) | \ | ||
446 | (((reg) + _inc) & ((1UL << (ad_bytes << 3)) - 1)); \ | ||
447 | } while (0) | ||
448 | |||
449 | #define JMP_REL(rel) \ | ||
450 | do { \ | ||
451 | register_address_increment(_eip, rel); \ | ||
452 | } while (0) | ||
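A quick worked example of the masking in register_address_increment(): with ad_bytes == 2 and reg == 0x1234ffff, an increment of 1 yields 0x12340000. Only the low 16 bits take part in the addition and the upper bits of the long are preserved, which is the 16-bit wrap-around behaviour JMP_REL relies on for relative jumps in 16-bit code.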
453 | |||
454 | /* | ||
455 | * Given the 'reg' portion of a ModRM byte, and a register block, return a | ||
456 | * pointer into the block that addresses the relevant register. | ||
457 | * @highbyte_regs specifies whether to decode AH,CH,DH,BH. | ||
458 | */ | ||
459 | static void *decode_register(u8 modrm_reg, unsigned long *regs, | ||
460 | int highbyte_regs) | ||
461 | { | ||
462 | void *p; | ||
463 | |||
464 | p = ®s[modrm_reg]; | ||
465 | if (highbyte_regs && modrm_reg >= 4 && modrm_reg < 8) | ||
466 | p = (unsigned char *)®s[modrm_reg & 3] + 1; | ||
467 | return p; | ||
468 | } | ||
469 | |||
470 | static int read_descriptor(struct x86_emulate_ctxt *ctxt, | ||
471 | struct x86_emulate_ops *ops, | ||
472 | void *ptr, | ||
473 | u16 *size, unsigned long *address, int op_bytes) | ||
474 | { | ||
475 | int rc; | ||
476 | |||
477 | if (op_bytes == 2) | ||
478 | op_bytes = 3; | ||
479 | *address = 0; | ||
480 | rc = ops->read_std((unsigned long)ptr, (unsigned long *)size, 2, | ||
481 | ctxt->vcpu); | ||
482 | if (rc) | ||
483 | return rc; | ||
484 | rc = ops->read_std((unsigned long)ptr + 2, address, op_bytes, | ||
485 | ctxt->vcpu); | ||
486 | return rc; | ||
487 | } | ||
488 | |||
489 | static int test_cc(unsigned int condition, unsigned int flags) | ||
490 | { | ||
491 | int rc = 0; | ||
492 | |||
493 | switch ((condition & 15) >> 1) { | ||
494 | case 0: /* o */ | ||
495 | rc |= (flags & EFLG_OF); | ||
496 | break; | ||
497 | case 1: /* b/c/nae */ | ||
498 | rc |= (flags & EFLG_CF); | ||
499 | break; | ||
500 | case 2: /* z/e */ | ||
501 | rc |= (flags & EFLG_ZF); | ||
502 | break; | ||
503 | case 3: /* be/na */ | ||
504 | rc |= (flags & (EFLG_CF|EFLG_ZF)); | ||
505 | break; | ||
506 | case 4: /* s */ | ||
507 | rc |= (flags & EFLG_SF); | ||
508 | break; | ||
509 | case 5: /* p/pe */ | ||
510 | rc |= (flags & EFLG_PF); | ||
511 | break; | ||
512 | case 7: /* le/ng */ | ||
513 | rc |= (flags & EFLG_ZF); | ||
514 | /* fall through */ | ||
515 | case 6: /* l/nge */ | ||
516 | rc |= (!(flags & EFLG_SF) != !(flags & EFLG_OF)); | ||
517 | break; | ||
518 | } | ||
519 | |||
520 | /* Odd condition identifiers (lsb == 1) have inverted sense. */ | ||
521 | return (!!rc ^ (condition & 1)); | ||
522 | } | ||
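As a worked example of the condition table above: condition 0x5 ("ne/nz") gives (condition & 15) >> 1 == 2, selecting the ZF test, and the odd low bit then inverts the sense, so test_cc(0x5, flags) is non-zero exactly when EFLG_ZF is clear.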
523 | |||
524 | int | ||
525 | x86_emulate_memop(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) | ||
526 | { | ||
527 | unsigned d; | ||
528 | u8 b, sib, twobyte = 0, rex_prefix = 0; | ||
529 | u8 modrm, modrm_mod = 0, modrm_reg = 0, modrm_rm = 0; | ||
530 | unsigned long *override_base = NULL; | ||
531 | unsigned int op_bytes, ad_bytes, lock_prefix = 0, rep_prefix = 0, i; | ||
532 | int rc = 0; | ||
533 | struct operand src, dst; | ||
534 | unsigned long cr2 = ctxt->cr2; | ||
535 | int mode = ctxt->mode; | ||
536 | unsigned long modrm_ea; | ||
537 | int use_modrm_ea, index_reg = 0, base_reg = 0, scale, rip_relative = 0; | ||
538 | int no_wb = 0; | ||
539 | u64 msr_data; | ||
540 | |||
541 | /* Shadow copy of register state. Committed on successful emulation. */ | ||
542 | unsigned long _regs[NR_VCPU_REGS]; | ||
543 | unsigned long _eip = ctxt->vcpu->rip, _eflags = ctxt->eflags; | ||
544 | unsigned long modrm_val = 0; | ||
545 | |||
546 | memcpy(_regs, ctxt->vcpu->regs, sizeof _regs); | ||
547 | |||
548 | switch (mode) { | ||
549 | case X86EMUL_MODE_REAL: | ||
550 | case X86EMUL_MODE_PROT16: | ||
551 | op_bytes = ad_bytes = 2; | ||
552 | break; | ||
553 | case X86EMUL_MODE_PROT32: | ||
554 | op_bytes = ad_bytes = 4; | ||
555 | break; | ||
556 | #ifdef CONFIG_X86_64 | ||
557 | case X86EMUL_MODE_PROT64: | ||
558 | op_bytes = 4; | ||
559 | ad_bytes = 8; | ||
560 | break; | ||
561 | #endif | ||
562 | default: | ||
563 | return -1; | ||
564 | } | ||
565 | |||
566 | /* Legacy prefixes. */ | ||
567 | for (i = 0; i < 8; i++) { | ||
568 | switch (b = insn_fetch(u8, 1, _eip)) { | ||
569 | case 0x66: /* operand-size override */ | ||
570 | op_bytes ^= 6; /* switch between 2/4 bytes */ | ||
571 | break; | ||
572 | case 0x67: /* address-size override */ | ||
573 | if (mode == X86EMUL_MODE_PROT64) | ||
574 | ad_bytes ^= 12; /* switch between 4/8 bytes */ | ||
575 | else | ||
576 | ad_bytes ^= 6; /* switch between 2/4 bytes */ | ||
577 | break; | ||
578 | case 0x2e: /* CS override */ | ||
579 | override_base = &ctxt->cs_base; | ||
580 | break; | ||
581 | case 0x3e: /* DS override */ | ||
582 | override_base = &ctxt->ds_base; | ||
583 | break; | ||
584 | case 0x26: /* ES override */ | ||
585 | override_base = &ctxt->es_base; | ||
586 | break; | ||
587 | case 0x64: /* FS override */ | ||
588 | override_base = &ctxt->fs_base; | ||
589 | break; | ||
590 | case 0x65: /* GS override */ | ||
591 | override_base = &ctxt->gs_base; | ||
592 | break; | ||
593 | case 0x36: /* SS override */ | ||
594 | override_base = &ctxt->ss_base; | ||
595 | break; | ||
596 | case 0xf0: /* LOCK */ | ||
597 | lock_prefix = 1; | ||
598 | break; | ||
599 | case 0xf2: /* REPNE/REPNZ */ | ||
600 | case 0xf3: /* REP/REPE/REPZ */ | ||
601 | rep_prefix = 1; | ||
602 | break; | ||
603 | default: | ||
604 | goto done_prefixes; | ||
605 | } | ||
606 | } | ||
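The XOR toggles above work because the two legal widths differ in exactly those bits: op_bytes ^= 6 flips between 2 and 4 (2 ^ 6 == 4 and 4 ^ 6 == 2), while ad_bytes ^= 12 flips between 4 and 8 and ad_bytes ^= 6 between 2 and 4, so a repeated size prefix simply toggles the width back again.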
607 | |||
608 | done_prefixes: | ||
609 | |||
610 | /* REX prefix. */ | ||
611 | if ((mode == X86EMUL_MODE_PROT64) && ((b & 0xf0) == 0x40)) { | ||
612 | rex_prefix = b; | ||
613 | if (b & 8) | ||
614 | op_bytes = 8; /* REX.W */ | ||
615 | modrm_reg = (b & 4) << 1; /* REX.R */ | ||
616 | index_reg = (b & 2) << 2; /* REX.X */ | ||
617 | modrm_rm = base_reg = (b & 1) << 3; /* REX.B */ | ||
618 | b = insn_fetch(u8, 1, _eip); | ||
619 | } | ||
620 | |||
621 | /* Opcode byte(s). */ | ||
622 | d = opcode_table[b]; | ||
623 | if (d == 0) { | ||
624 | /* Two-byte opcode? */ | ||
625 | if (b == 0x0f) { | ||
626 | twobyte = 1; | ||
627 | b = insn_fetch(u8, 1, _eip); | ||
628 | d = twobyte_table[b]; | ||
629 | } | ||
630 | |||
631 | /* Unrecognised? */ | ||
632 | if (d == 0) | ||
633 | goto cannot_emulate; | ||
634 | } | ||
635 | |||
636 | /* ModRM and SIB bytes. */ | ||
637 | if (d & ModRM) { | ||
638 | modrm = insn_fetch(u8, 1, _eip); | ||
639 | modrm_mod |= (modrm & 0xc0) >> 6; | ||
640 | modrm_reg |= (modrm & 0x38) >> 3; | ||
641 | modrm_rm |= (modrm & 0x07); | ||
642 | modrm_ea = 0; | ||
643 | use_modrm_ea = 1; | ||
644 | |||
645 | if (modrm_mod == 3) { | ||
646 | modrm_val = *(unsigned long *) | ||
647 | decode_register(modrm_rm, _regs, d & ByteOp); | ||
648 | goto modrm_done; | ||
649 | } | ||
650 | |||
651 | if (ad_bytes == 2) { | ||
652 | unsigned bx = _regs[VCPU_REGS_RBX]; | ||
653 | unsigned bp = _regs[VCPU_REGS_RBP]; | ||
654 | unsigned si = _regs[VCPU_REGS_RSI]; | ||
655 | unsigned di = _regs[VCPU_REGS_RDI]; | ||
656 | |||
657 | /* 16-bit ModR/M decode. */ | ||
658 | switch (modrm_mod) { | ||
659 | case 0: | ||
660 | if (modrm_rm == 6) | ||
661 | modrm_ea += insn_fetch(u16, 2, _eip); | ||
662 | break; | ||
663 | case 1: | ||
664 | modrm_ea += insn_fetch(s8, 1, _eip); | ||
665 | break; | ||
666 | case 2: | ||
667 | modrm_ea += insn_fetch(u16, 2, _eip); | ||
668 | break; | ||
669 | } | ||
670 | switch (modrm_rm) { | ||
671 | case 0: | ||
672 | modrm_ea += bx + si; | ||
673 | break; | ||
674 | case 1: | ||
675 | modrm_ea += bx + di; | ||
676 | break; | ||
677 | case 2: | ||
678 | modrm_ea += bp + si; | ||
679 | break; | ||
680 | case 3: | ||
681 | modrm_ea += bp + di; | ||
682 | break; | ||
683 | case 4: | ||
684 | modrm_ea += si; | ||
685 | break; | ||
686 | case 5: | ||
687 | modrm_ea += di; | ||
688 | break; | ||
689 | case 6: | ||
690 | if (modrm_mod != 0) | ||
691 | modrm_ea += bp; | ||
692 | break; | ||
693 | case 7: | ||
694 | modrm_ea += bx; | ||
695 | break; | ||
696 | } | ||
697 | if (modrm_rm == 2 || modrm_rm == 3 || | ||
698 | (modrm_rm == 6 && modrm_mod != 0)) | ||
699 | if (!override_base) | ||
700 | override_base = &ctxt->ss_base; | ||
701 | modrm_ea = (u16)modrm_ea; | ||
702 | } else { | ||
703 | /* 32/64-bit ModR/M decode. */ | ||
704 | switch (modrm_rm) { | ||
705 | case 4: | ||
706 | case 12: | ||
707 | sib = insn_fetch(u8, 1, _eip); | ||
708 | index_reg |= (sib >> 3) & 7; | ||
709 | base_reg |= sib & 7; | ||
710 | scale = sib >> 6; | ||
711 | |||
712 | switch (base_reg) { | ||
713 | case 5: | ||
714 | if (modrm_mod != 0) | ||
715 | modrm_ea += _regs[base_reg]; | ||
716 | else | ||
717 | modrm_ea += insn_fetch(s32, 4, _eip); | ||
718 | break; | ||
719 | default: | ||
720 | modrm_ea += _regs[base_reg]; | ||
721 | } | ||
722 | switch (index_reg) { | ||
723 | case 4: | ||
724 | break; | ||
725 | default: | ||
726 | modrm_ea += _regs[index_reg] << scale; | ||
727 | |||
728 | } | ||
729 | break; | ||
730 | case 5: | ||
731 | if (modrm_mod != 0) | ||
732 | modrm_ea += _regs[modrm_rm]; | ||
733 | else if (mode == X86EMUL_MODE_PROT64) | ||
734 | rip_relative = 1; | ||
735 | break; | ||
736 | default: | ||
737 | modrm_ea += _regs[modrm_rm]; | ||
738 | break; | ||
739 | } | ||
740 | switch (modrm_mod) { | ||
741 | case 0: | ||
742 | if (modrm_rm == 5) | ||
743 | modrm_ea += insn_fetch(s32, 4, _eip); | ||
744 | break; | ||
745 | case 1: | ||
746 | modrm_ea += insn_fetch(s8, 1, _eip); | ||
747 | break; | ||
748 | case 2: | ||
749 | modrm_ea += insn_fetch(s32, 4, _eip); | ||
750 | break; | ||
751 | } | ||
752 | } | ||
753 | if (!override_base) | ||
754 | override_base = &ctxt->ds_base; | ||
755 | if (mode == X86EMUL_MODE_PROT64 && | ||
756 | override_base != &ctxt->fs_base && | ||
757 | override_base != &ctxt->gs_base) | ||
758 | override_base = NULL; | ||
759 | |||
760 | if (override_base) | ||
761 | modrm_ea += *override_base; | ||
762 | |||
763 | if (rip_relative) { | ||
764 | modrm_ea += _eip; | ||
765 | switch (d & SrcMask) { | ||
766 | case SrcImmByte: | ||
767 | modrm_ea += 1; | ||
768 | break; | ||
769 | case SrcImm: | ||
770 | if (d & ByteOp) | ||
771 | modrm_ea += 1; | ||
772 | else | ||
773 | if (op_bytes == 8) | ||
774 | modrm_ea += 4; | ||
775 | else | ||
776 | modrm_ea += op_bytes; | ||
777 | } | ||
778 | } | ||
779 | if (ad_bytes != 8) | ||
780 | modrm_ea = (u32)modrm_ea; | ||
781 | cr2 = modrm_ea; | ||
782 | modrm_done: | ||
783 | ; | ||
784 | } | ||
785 | |||
786 | /* | ||
787 | * Decode and fetch the source operand: register, memory | ||
788 | * or immediate. | ||
789 | */ | ||
790 | switch (d & SrcMask) { | ||
791 | case SrcNone: | ||
792 | break; | ||
793 | case SrcReg: | ||
794 | src.type = OP_REG; | ||
795 | if (d & ByteOp) { | ||
796 | src.ptr = decode_register(modrm_reg, _regs, | ||
797 | (rex_prefix == 0)); | ||
798 | src.val = src.orig_val = *(u8 *) src.ptr; | ||
799 | src.bytes = 1; | ||
800 | } else { | ||
801 | src.ptr = decode_register(modrm_reg, _regs, 0); | ||
802 | switch ((src.bytes = op_bytes)) { | ||
803 | case 2: | ||
804 | src.val = src.orig_val = *(u16 *) src.ptr; | ||
805 | break; | ||
806 | case 4: | ||
807 | src.val = src.orig_val = *(u32 *) src.ptr; | ||
808 | break; | ||
809 | case 8: | ||
810 | src.val = src.orig_val = *(u64 *) src.ptr; | ||
811 | break; | ||
812 | } | ||
813 | } | ||
814 | break; | ||
815 | case SrcMem16: | ||
816 | src.bytes = 2; | ||
817 | goto srcmem_common; | ||
818 | case SrcMem32: | ||
819 | src.bytes = 4; | ||
820 | goto srcmem_common; | ||
821 | case SrcMem: | ||
822 | src.bytes = (d & ByteOp) ? 1 : op_bytes; | ||
823 | /* Don't fetch the address for invlpg: it could be unmapped. */ | ||
824 | if (twobyte && b == 0x01 && modrm_reg == 7) | ||
825 | break; | ||
826 | srcmem_common: | ||
827 | /* | ||
828 | * For instructions with a ModR/M byte, switch to register | ||
829 | * access if Mod = 3. | ||
830 | */ | ||
831 | if ((d & ModRM) && modrm_mod == 3) { | ||
832 | src.type = OP_REG; | ||
833 | break; | ||
834 | } | ||
835 | src.type = OP_MEM; | ||
836 | src.ptr = (unsigned long *)cr2; | ||
837 | src.val = 0; | ||
838 | if ((rc = ops->read_emulated((unsigned long)src.ptr, | ||
839 | &src.val, src.bytes, ctxt->vcpu)) != 0) | ||
840 | goto done; | ||
841 | src.orig_val = src.val; | ||
842 | break; | ||
843 | case SrcImm: | ||
844 | src.type = OP_IMM; | ||
845 | src.ptr = (unsigned long *)_eip; | ||
846 | src.bytes = (d & ByteOp) ? 1 : op_bytes; | ||
847 | if (src.bytes == 8) | ||
848 | src.bytes = 4; | ||
849 | /* NB. Immediates are sign-extended as necessary. */ | ||
850 | switch (src.bytes) { | ||
851 | case 1: | ||
852 | src.val = insn_fetch(s8, 1, _eip); | ||
853 | break; | ||
854 | case 2: | ||
855 | src.val = insn_fetch(s16, 2, _eip); | ||
856 | break; | ||
857 | case 4: | ||
858 | src.val = insn_fetch(s32, 4, _eip); | ||
859 | break; | ||
860 | } | ||
861 | break; | ||
862 | case SrcImmByte: | ||
863 | src.type = OP_IMM; | ||
864 | src.ptr = (unsigned long *)_eip; | ||
865 | src.bytes = 1; | ||
866 | src.val = insn_fetch(s8, 1, _eip); | ||
867 | break; | ||
868 | } | ||
869 | |||
870 | /* Decode and fetch the destination operand: register or memory. */ | ||
871 | switch (d & DstMask) { | ||
872 | case ImplicitOps: | ||
873 | /* Special instructions do their own operand decoding. */ | ||
874 | goto special_insn; | ||
875 | case DstReg: | ||
876 | dst.type = OP_REG; | ||
877 | if ((d & ByteOp) | ||
878 | && !(twobyte && (b == 0xb6 || b == 0xb7))) { | ||
879 | dst.ptr = decode_register(modrm_reg, _regs, | ||
880 | (rex_prefix == 0)); | ||
881 | dst.val = *(u8 *) dst.ptr; | ||
882 | dst.bytes = 1; | ||
883 | } else { | ||
884 | dst.ptr = decode_register(modrm_reg, _regs, 0); | ||
885 | switch ((dst.bytes = op_bytes)) { | ||
886 | case 2: | ||
887 | dst.val = *(u16 *)dst.ptr; | ||
888 | break; | ||
889 | case 4: | ||
890 | dst.val = *(u32 *)dst.ptr; | ||
891 | break; | ||
892 | case 8: | ||
893 | dst.val = *(u64 *)dst.ptr; | ||
894 | break; | ||
895 | } | ||
896 | } | ||
897 | break; | ||
898 | case DstMem: | ||
899 | dst.type = OP_MEM; | ||
900 | dst.ptr = (unsigned long *)cr2; | ||
901 | dst.bytes = (d & ByteOp) ? 1 : op_bytes; | ||
902 | dst.val = 0; | ||
903 | /* | ||
904 | * For instructions with a ModR/M byte, switch to register | ||
905 | * access if Mod = 3. | ||
906 | */ | ||
907 | if ((d & ModRM) && modrm_mod == 3) { | ||
908 | dst.type = OP_REG; | ||
909 | break; | ||
910 | } | ||
911 | if (d & BitOp) { | ||
912 | unsigned long mask = ~(dst.bytes * 8 - 1); | ||
913 | |||
914 | dst.ptr = (void *)dst.ptr + (src.val & mask) / 8; | ||
915 | } | ||
916 | if (!(d & Mov) && /* optimisation - avoid slow emulated read */ | ||
917 | ((rc = ops->read_emulated((unsigned long)dst.ptr, | ||
918 | &dst.val, dst.bytes, ctxt->vcpu)) != 0)) | ||
919 | goto done; | ||
920 | break; | ||
921 | } | ||
922 | dst.orig_val = dst.val; | ||
923 | |||
924 | if (twobyte) | ||
925 | goto twobyte_insn; | ||
926 | |||
927 | switch (b) { | ||
928 | case 0x00 ... 0x05: | ||
929 | add: /* add */ | ||
930 | emulate_2op_SrcV("add", src, dst, _eflags); | ||
931 | break; | ||
932 | case 0x08 ... 0x0d: | ||
933 | or: /* or */ | ||
934 | emulate_2op_SrcV("or", src, dst, _eflags); | ||
935 | break; | ||
936 | case 0x10 ... 0x15: | ||
937 | adc: /* adc */ | ||
938 | emulate_2op_SrcV("adc", src, dst, _eflags); | ||
939 | break; | ||
940 | case 0x18 ... 0x1d: | ||
941 | sbb: /* sbb */ | ||
942 | emulate_2op_SrcV("sbb", src, dst, _eflags); | ||
943 | break; | ||
944 | case 0x20 ... 0x23: | ||
945 | and: /* and */ | ||
946 | emulate_2op_SrcV("and", src, dst, _eflags); | ||
947 | break; | ||
948 | case 0x24: /* and al imm8 */ | ||
949 | dst.type = OP_REG; | ||
950 | dst.ptr = &_regs[VCPU_REGS_RAX]; | ||
951 | dst.val = *(u8 *)dst.ptr; | ||
952 | dst.bytes = 1; | ||
953 | dst.orig_val = dst.val; | ||
954 | goto and; | ||
955 | case 0x25: /* and ax imm16, or eax imm32 */ | ||
956 | dst.type = OP_REG; | ||
957 | dst.bytes = op_bytes; | ||
958 | dst.ptr = &_regs[VCPU_REGS_RAX]; | ||
959 | if (op_bytes == 2) | ||
960 | dst.val = *(u16 *)dst.ptr; | ||
961 | else | ||
962 | dst.val = *(u32 *)dst.ptr; | ||
963 | dst.orig_val = dst.val; | ||
964 | goto and; | ||
965 | case 0x28 ... 0x2d: | ||
966 | sub: /* sub */ | ||
967 | emulate_2op_SrcV("sub", src, dst, _eflags); | ||
968 | break; | ||
969 | case 0x30 ... 0x35: | ||
970 | xor: /* xor */ | ||
971 | emulate_2op_SrcV("xor", src, dst, _eflags); | ||
972 | break; | ||
973 | case 0x38 ... 0x3d: | ||
974 | cmp: /* cmp */ | ||
975 | emulate_2op_SrcV("cmp", src, dst, _eflags); | ||
976 | break; | ||
977 | case 0x63: /* movsxd */ | ||
978 | if (mode != X86EMUL_MODE_PROT64) | ||
979 | goto cannot_emulate; | ||
980 | dst.val = (s32) src.val; | ||
981 | break; | ||
982 | case 0x80 ... 0x83: /* Grp1 */ | ||
983 | switch (modrm_reg) { | ||
984 | case 0: | ||
985 | goto add; | ||
986 | case 1: | ||
987 | goto or; | ||
988 | case 2: | ||
989 | goto adc; | ||
990 | case 3: | ||
991 | goto sbb; | ||
992 | case 4: | ||
993 | goto and; | ||
994 | case 5: | ||
995 | goto sub; | ||
996 | case 6: | ||
997 | goto xor; | ||
998 | case 7: | ||
999 | goto cmp; | ||
1000 | } | ||
1001 | break; | ||
1002 | case 0x84 ... 0x85: | ||
1003 | test: /* test */ | ||
1004 | emulate_2op_SrcV("test", src, dst, _eflags); | ||
1005 | break; | ||
1006 | case 0x86 ... 0x87: /* xchg */ | ||
1007 | /* Write back the register source. */ | ||
1008 | switch (dst.bytes) { | ||
1009 | case 1: | ||
1010 | *(u8 *) src.ptr = (u8) dst.val; | ||
1011 | break; | ||
1012 | case 2: | ||
1013 | *(u16 *) src.ptr = (u16) dst.val; | ||
1014 | break; | ||
1015 | case 4: | ||
1016 | *src.ptr = (u32) dst.val; | ||
1017 | break; /* 64b reg: zero-extend */ | ||
1018 | case 8: | ||
1019 | *src.ptr = dst.val; | ||
1020 | break; | ||
1021 | } | ||
1022 | /* | ||
1023 | * Write back the memory destination with implicit LOCK | ||
1024 | * prefix. | ||
1025 | */ | ||
1026 | dst.val = src.val; | ||
1027 | lock_prefix = 1; | ||
1028 | break; | ||
1029 | case 0x88 ... 0x8b: /* mov */ | ||
1030 | goto mov; | ||
1031 | case 0x8d: /* lea r16/r32, m */ | ||
1032 | dst.val = modrm_val; | ||
1033 | break; | ||
1034 | case 0x8f: /* pop (sole member of Grp1a) */ | ||
1035 | /* 64-bit mode: POP always pops a 64-bit operand. */ | ||
1036 | if (mode == X86EMUL_MODE_PROT64) | ||
1037 | dst.bytes = 8; | ||
1038 | if ((rc = ops->read_std(register_address(ctxt->ss_base, | ||
1039 | _regs[VCPU_REGS_RSP]), | ||
1040 | &dst.val, dst.bytes, ctxt->vcpu)) != 0) | ||
1041 | goto done; | ||
1042 | register_address_increment(_regs[VCPU_REGS_RSP], dst.bytes); | ||
1043 | break; | ||
1044 | case 0xa0 ... 0xa1: /* mov */ | ||
1045 | dst.ptr = (unsigned long *)&_regs[VCPU_REGS_RAX]; | ||
1046 | dst.val = src.val; | ||
1047 | _eip += ad_bytes; /* skip src displacement */ | ||
1048 | break; | ||
1049 | case 0xa2 ... 0xa3: /* mov */ | ||
1050 | dst.val = (unsigned long)_regs[VCPU_REGS_RAX]; | ||
1051 | _eip += ad_bytes; /* skip dst displacement */ | ||
1052 | break; | ||
1053 | case 0xc0 ... 0xc1: | ||
1054 | grp2: /* Grp2 */ | ||
1055 | switch (modrm_reg) { | ||
1056 | case 0: /* rol */ | ||
1057 | emulate_2op_SrcB("rol", src, dst, _eflags); | ||
1058 | break; | ||
1059 | case 1: /* ror */ | ||
1060 | emulate_2op_SrcB("ror", src, dst, _eflags); | ||
1061 | break; | ||
1062 | case 2: /* rcl */ | ||
1063 | emulate_2op_SrcB("rcl", src, dst, _eflags); | ||
1064 | break; | ||
1065 | case 3: /* rcr */ | ||
1066 | emulate_2op_SrcB("rcr", src, dst, _eflags); | ||
1067 | break; | ||
1068 | case 4: /* sal/shl */ | ||
1069 | case 6: /* sal/shl */ | ||
1070 | emulate_2op_SrcB("sal", src, dst, _eflags); | ||
1071 | break; | ||
1072 | case 5: /* shr */ | ||
1073 | emulate_2op_SrcB("shr", src, dst, _eflags); | ||
1074 | break; | ||
1075 | case 7: /* sar */ | ||
1076 | emulate_2op_SrcB("sar", src, dst, _eflags); | ||
1077 | break; | ||
1078 | } | ||
1079 | break; | ||
1080 | case 0xc6 ... 0xc7: /* mov (sole member of Grp11) */ | ||
1081 | mov: | ||
1082 | dst.val = src.val; | ||
1083 | break; | ||
1084 | case 0xd0 ... 0xd1: /* Grp2 */ | ||
1085 | src.val = 1; | ||
1086 | goto grp2; | ||
1087 | case 0xd2 ... 0xd3: /* Grp2 */ | ||
1088 | src.val = _regs[VCPU_REGS_RCX]; | ||
1089 | goto grp2; | ||
1090 | case 0xf6 ... 0xf7: /* Grp3 */ | ||
1091 | switch (modrm_reg) { | ||
1092 | case 0 ... 1: /* test */ | ||
1093 | /* | ||
1094 | * Special case in Grp3: test has an immediate | ||
1095 | * source operand. | ||
1096 | */ | ||
1097 | src.type = OP_IMM; | ||
1098 | src.ptr = (unsigned long *)_eip; | ||
1099 | src.bytes = (d & ByteOp) ? 1 : op_bytes; | ||
1100 | if (src.bytes == 8) | ||
1101 | src.bytes = 4; | ||
1102 | switch (src.bytes) { | ||
1103 | case 1: | ||
1104 | src.val = insn_fetch(s8, 1, _eip); | ||
1105 | break; | ||
1106 | case 2: | ||
1107 | src.val = insn_fetch(s16, 2, _eip); | ||
1108 | break; | ||
1109 | case 4: | ||
1110 | src.val = insn_fetch(s32, 4, _eip); | ||
1111 | break; | ||
1112 | } | ||
1113 | goto test; | ||
1114 | case 2: /* not */ | ||
1115 | dst.val = ~dst.val; | ||
1116 | break; | ||
1117 | case 3: /* neg */ | ||
1118 | emulate_1op("neg", dst, _eflags); | ||
1119 | break; | ||
1120 | default: | ||
1121 | goto cannot_emulate; | ||
1122 | } | ||
1123 | break; | ||
1124 | case 0xfe ... 0xff: /* Grp4/Grp5 */ | ||
1125 | switch (modrm_reg) { | ||
1126 | case 0: /* inc */ | ||
1127 | emulate_1op("inc", dst, _eflags); | ||
1128 | break; | ||
1129 | case 1: /* dec */ | ||
1130 | emulate_1op("dec", dst, _eflags); | ||
1131 | break; | ||
1132 | case 4: /* jmp abs */ | ||
1133 | if (b == 0xff) | ||
1134 | _eip = dst.val; | ||
1135 | else | ||
1136 | goto cannot_emulate; | ||
1137 | break; | ||
1138 | case 6: /* push */ | ||
1139 | /* 64-bit mode: PUSH always pushes a 64-bit operand. */ | ||
1140 | if (mode == X86EMUL_MODE_PROT64) { | ||
1141 | dst.bytes = 8; | ||
1142 | if ((rc = ops->read_std((unsigned long)dst.ptr, | ||
1143 | &dst.val, 8, | ||
1144 | ctxt->vcpu)) != 0) | ||
1145 | goto done; | ||
1146 | } | ||
1147 | register_address_increment(_regs[VCPU_REGS_RSP], | ||
1148 | -dst.bytes); | ||
1149 | if ((rc = ops->write_emulated( | ||
1150 | register_address(ctxt->ss_base, | ||
1151 | _regs[VCPU_REGS_RSP]), | ||
1152 | &dst.val, dst.bytes, ctxt->vcpu)) != 0) | ||
1153 | goto done; | ||
1154 | no_wb = 1; | ||
1155 | break; | ||
1156 | default: | ||
1157 | goto cannot_emulate; | ||
1158 | } | ||
1159 | break; | ||
1160 | } | ||
1161 | |||
1162 | writeback: | ||
1163 | if (!no_wb) { | ||
1164 | switch (dst.type) { | ||
1165 | case OP_REG: | ||
1166 | /* The 4-byte case *is* correct: in 64-bit mode we zero-extend. */ | ||
1167 | switch (dst.bytes) { | ||
1168 | case 1: | ||
1169 | *(u8 *)dst.ptr = (u8)dst.val; | ||
1170 | break; | ||
1171 | case 2: | ||
1172 | *(u16 *)dst.ptr = (u16)dst.val; | ||
1173 | break; | ||
1174 | case 4: | ||
1175 | *dst.ptr = (u32)dst.val; | ||
1176 | break; /* 64b: zero-ext */ | ||
1177 | case 8: | ||
1178 | *dst.ptr = dst.val; | ||
1179 | break; | ||
1180 | } | ||
1181 | break; | ||
1182 | case OP_MEM: | ||
1183 | if (lock_prefix) | ||
1184 | rc = ops->cmpxchg_emulated((unsigned long)dst. | ||
1185 | ptr, &dst.orig_val, | ||
1186 | &dst.val, dst.bytes, | ||
1187 | ctxt->vcpu); | ||
1188 | else | ||
1189 | rc = ops->write_emulated((unsigned long)dst.ptr, | ||
1190 | &dst.val, dst.bytes, | ||
1191 | ctxt->vcpu); | ||
1192 | if (rc != 0) | ||
1193 | goto done; | ||
1194 | default: | ||
1195 | break; | ||
1196 | } | ||
1197 | } | ||
1198 | |||
1199 | /* Commit shadow register state. */ | ||
1200 | memcpy(ctxt->vcpu->regs, _regs, sizeof _regs); | ||
1201 | ctxt->eflags = _eflags; | ||
1202 | ctxt->vcpu->rip = _eip; | ||
1203 | |||
1204 | done: | ||
1205 | return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0; | ||
1206 | |||
1207 | special_insn: | ||
1208 | if (twobyte) | ||
1209 | goto twobyte_special_insn; | ||
1210 | switch (b) { | ||
1211 | case 0x50 ... 0x57: /* push reg */ | ||
1212 | if (op_bytes == 2) | ||
1213 | src.val = (u16) _regs[b & 0x7]; | ||
1214 | else | ||
1215 | src.val = (u32) _regs[b & 0x7]; | ||
1216 | dst.type = OP_MEM; | ||
1217 | dst.bytes = op_bytes; | ||
1218 | dst.val = src.val; | ||
1219 | register_address_increment(_regs[VCPU_REGS_RSP], -op_bytes); | ||
1220 | dst.ptr = (void *) register_address( | ||
1221 | ctxt->ss_base, _regs[VCPU_REGS_RSP]); | ||
1222 | break; | ||
1223 | case 0x58 ... 0x5f: /* pop reg */ | ||
1224 | dst.ptr = (unsigned long *)&_regs[b & 0x7]; | ||
1225 | pop_instruction: | ||
1226 | if ((rc = ops->read_std(register_address(ctxt->ss_base, | ||
1227 | _regs[VCPU_REGS_RSP]), dst.ptr, op_bytes, ctxt->vcpu)) | ||
1228 | != 0) | ||
1229 | goto done; | ||
1230 | |||
1231 | register_address_increment(_regs[VCPU_REGS_RSP], op_bytes); | ||
1232 | no_wb = 1; /* Disable writeback. */ | ||
1233 | break; | ||
1234 | case 0x6a: /* push imm8 */ | ||
1235 | src.val = 0L; | ||
1236 | src.val = insn_fetch(s8, 1, _eip); | ||
1237 | push: | ||
1238 | dst.type = OP_MEM; | ||
1239 | dst.bytes = op_bytes; | ||
1240 | dst.val = src.val; | ||
1241 | register_address_increment(_regs[VCPU_REGS_RSP], -op_bytes); | ||
1242 | dst.ptr = (void *) register_address(ctxt->ss_base, | ||
1243 | _regs[VCPU_REGS_RSP]); | ||
1244 | break; | ||
1245 | case 0x6c: /* insb */ | ||
1246 | case 0x6d: /* insw/insd */ | ||
1247 | if (kvm_emulate_pio_string(ctxt->vcpu, NULL, | ||
1248 | 1, /* in */ | ||
1249 | (d & ByteOp) ? 1 : op_bytes, /* size */ | ||
1250 | rep_prefix ? | ||
1251 | address_mask(_regs[VCPU_REGS_RCX]) : 1, /* count */ | ||
1252 | (_eflags & EFLG_DF), /* down */ | ||
1253 | register_address(ctxt->es_base, | ||
1254 | _regs[VCPU_REGS_RDI]), /* address */ | ||
1255 | rep_prefix, | ||
1256 | _regs[VCPU_REGS_RDX] /* port */ | ||
1257 | ) == 0) | ||
1258 | return -1; | ||
1259 | return 0; | ||
1260 | case 0x6e: /* outsb */ | ||
1261 | case 0x6f: /* outsw/outsd */ | ||
1262 | if (kvm_emulate_pio_string(ctxt->vcpu, NULL, | ||
1263 | 0, /* in */ | ||
1264 | (d & ByteOp) ? 1 : op_bytes, /* size */ | ||
1265 | rep_prefix ? | ||
1266 | address_mask(_regs[VCPU_REGS_RCX]) : 1, /* count */ | ||
1267 | (_eflags & EFLG_DF), /* down */ | ||
1268 | register_address(override_base ? | ||
1269 | *override_base : ctxt->ds_base, | ||
1270 | _regs[VCPU_REGS_RSI]), /* address */ | ||
1271 | rep_prefix, | ||
1272 | _regs[VCPU_REGS_RDX] /* port */ | ||
1273 | ) == 0) | ||
1274 | return -1; | ||
1275 | return 0; | ||
1276 | case 0x70 ... 0x7f: /* jcc (short) */ { | ||
1277 | int rel = insn_fetch(s8, 1, _eip); | ||
1278 | |||
1279 | if (test_cc(b, _eflags)) | ||
1280 | JMP_REL(rel); | ||
1281 | break; | ||
1282 | } | ||
1283 | case 0x9c: /* pushf */ | ||
1284 | src.val = (unsigned long) _eflags; | ||
1285 | goto push; | ||
1286 | case 0x9d: /* popf */ | ||
1287 | dst.ptr = (unsigned long *) &_eflags; | ||
1288 | goto pop_instruction; | ||
1289 | case 0xc3: /* ret */ | ||
1290 | dst.ptr = &_eip; | ||
1291 | goto pop_instruction; | ||
1292 | case 0xf4: /* hlt */ | ||
1293 | ctxt->vcpu->halt_request = 1; | ||
1294 | goto done; | ||
1295 | } | ||
1296 | if (rep_prefix) { | ||
1297 | if (_regs[VCPU_REGS_RCX] == 0) { | ||
1298 | ctxt->vcpu->rip = _eip; | ||
1299 | goto done; | ||
1300 | } | ||
1301 | _regs[VCPU_REGS_RCX]--; | ||
1302 | _eip = ctxt->vcpu->rip; | ||
1303 | } | ||
1304 | switch (b) { | ||
1305 | case 0xa4 ... 0xa5: /* movs */ | ||
1306 | dst.type = OP_MEM; | ||
1307 | dst.bytes = (d & ByteOp) ? 1 : op_bytes; | ||
1308 | dst.ptr = (unsigned long *)register_address(ctxt->es_base, | ||
1309 | _regs[VCPU_REGS_RDI]); | ||
1310 | if ((rc = ops->read_emulated(register_address( | ||
1311 | override_base ? *override_base : ctxt->ds_base, | ||
1312 | _regs[VCPU_REGS_RSI]), &dst.val, dst.bytes, ctxt->vcpu)) != 0) | ||
1313 | goto done; | ||
1314 | register_address_increment(_regs[VCPU_REGS_RSI], | ||
1315 | (_eflags & EFLG_DF) ? -dst.bytes : dst.bytes); | ||
1316 | register_address_increment(_regs[VCPU_REGS_RDI], | ||
1317 | (_eflags & EFLG_DF) ? -dst.bytes : dst.bytes); | ||
1318 | break; | ||
1319 | case 0xa6 ... 0xa7: /* cmps */ | ||
1320 | DPRINTF("Urk! I don't handle CMPS.\n"); | ||
1321 | goto cannot_emulate; | ||
1322 | case 0xaa ... 0xab: /* stos */ | ||
1323 | dst.type = OP_MEM; | ||
1324 | dst.bytes = (d & ByteOp) ? 1 : op_bytes; | ||
1325 | dst.ptr = (unsigned long *)cr2; | ||
1326 | dst.val = _regs[VCPU_REGS_RAX]; | ||
1327 | register_address_increment(_regs[VCPU_REGS_RDI], | ||
1328 | (_eflags & EFLG_DF) ? -dst.bytes : dst.bytes); | ||
1329 | break; | ||
1330 | case 0xac ... 0xad: /* lods */ | ||
1331 | dst.type = OP_REG; | ||
1332 | dst.bytes = (d & ByteOp) ? 1 : op_bytes; | ||
1333 | dst.ptr = (unsigned long *)&_regs[VCPU_REGS_RAX]; | ||
1334 | if ((rc = ops->read_emulated(cr2, &dst.val, dst.bytes, | ||
1335 | ctxt->vcpu)) != 0) | ||
1336 | goto done; | ||
1337 | register_address_increment(_regs[VCPU_REGS_RSI], | ||
1338 | (_eflags & EFLG_DF) ? -dst.bytes : dst.bytes); | ||
1339 | break; | ||
1340 | case 0xae ... 0xaf: /* scas */ | ||
1341 | DPRINTF("Urk! I don't handle SCAS.\n"); | ||
1342 | goto cannot_emulate; | ||
1343 | case 0xe8: /* call (near) */ { | ||
1344 | long int rel; | ||
1345 | switch (op_bytes) { | ||
1346 | case 2: | ||
1347 | rel = insn_fetch(s16, 2, _eip); | ||
1348 | break; | ||
1349 | case 4: | ||
1350 | rel = insn_fetch(s32, 4, _eip); | ||
1351 | break; | ||
1352 | case 8: | ||
1353 | rel = insn_fetch(s64, 8, _eip); | ||
1354 | break; | ||
1355 | default: | ||
1356 | DPRINTF("Call: Invalid op_bytes\n"); | ||
1357 | goto cannot_emulate; | ||
1358 | } | ||
1359 | src.val = (unsigned long) _eip; | ||
1360 | JMP_REL(rel); | ||
1361 | op_bytes = ad_bytes; | ||
1362 | goto push; | ||
1363 | } | ||
1364 | case 0xe9: /* jmp rel */ | ||
1365 | case 0xeb: /* jmp rel short */ | ||
1366 | JMP_REL(src.val); | ||
1367 | no_wb = 1; /* Disable writeback. */ | ||
1368 | break; | ||
1369 | |||
1370 | |||
1371 | } | ||
1372 | goto writeback; | ||
1373 | |||
1374 | twobyte_insn: | ||
1375 | switch (b) { | ||
1376 | case 0x01: /* lgdt, lidt, lmsw */ | ||
1377 | /* Disable writeback. */ | ||
1378 | no_wb = 1; | ||
1379 | switch (modrm_reg) { | ||
1380 | u16 size; | ||
1381 | unsigned long address; | ||
1382 | |||
1383 | case 2: /* lgdt */ | ||
1384 | rc = read_descriptor(ctxt, ops, src.ptr, | ||
1385 | &size, &address, op_bytes); | ||
1386 | if (rc) | ||
1387 | goto done; | ||
1388 | realmode_lgdt(ctxt->vcpu, size, address); | ||
1389 | break; | ||
1390 | case 3: /* lidt */ | ||
1391 | rc = read_descriptor(ctxt, ops, src.ptr, | ||
1392 | &size, &address, op_bytes); | ||
1393 | if (rc) | ||
1394 | goto done; | ||
1395 | realmode_lidt(ctxt->vcpu, size, address); | ||
1396 | break; | ||
1397 | case 4: /* smsw */ | ||
1398 | if (modrm_mod != 3) | ||
1399 | goto cannot_emulate; | ||
1400 | *(u16 *)&_regs[modrm_rm] | ||
1401 | = realmode_get_cr(ctxt->vcpu, 0); | ||
1402 | break; | ||
1403 | case 6: /* lmsw */ | ||
1404 | if (modrm_mod != 3) | ||
1405 | goto cannot_emulate; | ||
1406 | realmode_lmsw(ctxt->vcpu, (u16)modrm_val, &_eflags); | ||
1407 | break; | ||
1408 | case 7: /* invlpg */ | ||
1409 | emulate_invlpg(ctxt->vcpu, cr2); | ||
1410 | break; | ||
1411 | default: | ||
1412 | goto cannot_emulate; | ||
1413 | } | ||
1414 | break; | ||
1415 | case 0x21: /* mov from dr to reg */ | ||
1416 | no_wb = 1; | ||
1417 | if (modrm_mod != 3) | ||
1418 | goto cannot_emulate; | ||
1419 | rc = emulator_get_dr(ctxt, modrm_reg, &_regs[modrm_rm]); | ||
1420 | break; | ||
1421 | case 0x23: /* mov from reg to dr */ | ||
1422 | no_wb = 1; | ||
1423 | if (modrm_mod != 3) | ||
1424 | goto cannot_emulate; | ||
1425 | rc = emulator_set_dr(ctxt, modrm_reg, _regs[modrm_rm]); | ||
1426 | break; | ||
1427 | case 0x40 ... 0x4f: /* cmov */ | ||
1428 | dst.val = dst.orig_val = src.val; | ||
1429 | no_wb = 1; | ||
1430 | /* | ||
1431 | * First, assume we're decoding an even cmov opcode | ||
1432 | * (lsb == 0). | ||
1433 | */ | ||
1434 | switch ((b & 15) >> 1) { | ||
1435 | case 0: /* cmovo */ | ||
1436 | no_wb = (_eflags & EFLG_OF) ? 0 : 1; | ||
1437 | break; | ||
1438 | case 1: /* cmovb/cmovc/cmovnae */ | ||
1439 | no_wb = (_eflags & EFLG_CF) ? 0 : 1; | ||
1440 | break; | ||
1441 | case 2: /* cmovz/cmove */ | ||
1442 | no_wb = (_eflags & EFLG_ZF) ? 0 : 1; | ||
1443 | break; | ||
1444 | case 3: /* cmovbe/cmovna */ | ||
1445 | no_wb = (_eflags & (EFLG_CF | EFLG_ZF)) ? 0 : 1; | ||
1446 | break; | ||
1447 | case 4: /* cmovs */ | ||
1448 | no_wb = (_eflags & EFLG_SF) ? 0 : 1; | ||
1449 | break; | ||
1450 | case 5: /* cmovp/cmovpe */ | ||
1451 | no_wb = (_eflags & EFLG_PF) ? 0 : 1; | ||
1452 | break; | ||
1453 | case 7: /* cmovle/cmovng */ | ||
1454 | no_wb = (_eflags & EFLG_ZF) ? 0 : 1; | ||
1455 | /* fall through */ | ||
1456 | case 6: /* cmovl/cmovnge */ | ||
1457 | no_wb &= (!(_eflags & EFLG_SF) != | ||
1458 | !(_eflags & EFLG_OF)) ? 0 : 1; | ||
1459 | break; | ||
1460 | } | ||
1461 | /* Odd cmov opcodes (lsb == 1) have inverted sense. */ | ||
1462 | no_wb ^= b & 1; | ||
1463 | break; | ||
1464 | case 0xa3: | ||
1465 | bt: /* bt */ | ||
1466 | src.val &= (dst.bytes << 3) - 1; /* only subword offset */ | ||
1467 | emulate_2op_SrcV_nobyte("bt", src, dst, _eflags); | ||
1468 | break; | ||
1469 | case 0xab: | ||
1470 | bts: /* bts */ | ||
1471 | src.val &= (dst.bytes << 3) - 1; /* only subword offset */ | ||
1472 | emulate_2op_SrcV_nobyte("bts", src, dst, _eflags); | ||
1473 | break; | ||
1474 | case 0xb0 ... 0xb1: /* cmpxchg */ | ||
1475 | /* | ||
1476 | * Save real source value, then compare EAX against | ||
1477 | * destination. | ||
1478 | */ | ||
1479 | src.orig_val = src.val; | ||
1480 | src.val = _regs[VCPU_REGS_RAX]; | ||
1481 | emulate_2op_SrcV("cmp", src, dst, _eflags); | ||
1482 | if (_eflags & EFLG_ZF) { | ||
1483 | /* Success: write back to memory. */ | ||
1484 | dst.val = src.orig_val; | ||
1485 | } else { | ||
1486 | /* Failure: write the value we saw to EAX. */ | ||
1487 | dst.type = OP_REG; | ||
1488 | dst.ptr = (unsigned long *)&_regs[VCPU_REGS_RAX]; | ||
1489 | } | ||
1490 | break; | ||
1491 | case 0xb3: | ||
1492 | btr: /* btr */ | ||
1493 | src.val &= (dst.bytes << 3) - 1; /* only subword offset */ | ||
1494 | emulate_2op_SrcV_nobyte("btr", src, dst, _eflags); | ||
1495 | break; | ||
1496 | case 0xb6 ... 0xb7: /* movzx */ | ||
1497 | dst.bytes = op_bytes; | ||
1498 | dst.val = (d & ByteOp) ? (u8) src.val : (u16) src.val; | ||
1499 | break; | ||
1500 | case 0xba: /* Grp8 */ | ||
1501 | switch (modrm_reg & 3) { | ||
1502 | case 0: | ||
1503 | goto bt; | ||
1504 | case 1: | ||
1505 | goto bts; | ||
1506 | case 2: | ||
1507 | goto btr; | ||
1508 | case 3: | ||
1509 | goto btc; | ||
1510 | } | ||
1511 | break; | ||
1512 | case 0xbb: | ||
1513 | btc: /* btc */ | ||
1514 | src.val &= (dst.bytes << 3) - 1; /* only subword offset */ | ||
1515 | emulate_2op_SrcV_nobyte("btc", src, dst, _eflags); | ||
1516 | break; | ||
1517 | case 0xbe ... 0xbf: /* movsx */ | ||
1518 | dst.bytes = op_bytes; | ||
1519 | dst.val = (d & ByteOp) ? (s8) src.val : (s16) src.val; | ||
1520 | break; | ||
1521 | case 0xc3: /* movnti */ | ||
1522 | dst.bytes = op_bytes; | ||
1523 | dst.val = (op_bytes == 4) ? (u32) src.val : (u64) src.val; | ||
1524 | break; | ||
1525 | } | ||
1526 | goto writeback; | ||
1527 | |||
1528 | twobyte_special_insn: | ||
1529 | /* Disable writeback. */ | ||
1530 | no_wb = 1; | ||
1531 | switch (b) { | ||
1532 | case 0x06: | ||
1533 | emulate_clts(ctxt->vcpu); | ||
1534 | break; | ||
1535 | case 0x08: /* invd */ | ||
1536 | break; | ||
1537 | case 0x09: /* wbinvd */ | ||
1538 | break; | ||
1539 | case 0x0d: /* GrpP (prefetch) */ | ||
1540 | case 0x18: /* Grp16 (prefetch/nop) */ | ||
1541 | break; | ||
1542 | case 0x20: /* mov cr, reg */ | ||
1543 | if (modrm_mod != 3) | ||
1544 | goto cannot_emulate; | ||
1545 | _regs[modrm_rm] = realmode_get_cr(ctxt->vcpu, modrm_reg); | ||
1546 | break; | ||
1547 | case 0x22: /* mov reg, cr */ | ||
1548 | if (modrm_mod != 3) | ||
1549 | goto cannot_emulate; | ||
1550 | realmode_set_cr(ctxt->vcpu, modrm_reg, modrm_val, &_eflags); | ||
1551 | break; | ||
1552 | case 0x30: | ||
1553 | /* wrmsr */ | ||
1554 | msr_data = (u32)_regs[VCPU_REGS_RAX] | ||
1555 | | ((u64)_regs[VCPU_REGS_RDX] << 32); | ||
1556 | rc = kvm_set_msr(ctxt->vcpu, _regs[VCPU_REGS_RCX], msr_data); | ||
1557 | if (rc) { | ||
1558 | kvm_x86_ops->inject_gp(ctxt->vcpu, 0); | ||
1559 | _eip = ctxt->vcpu->rip; | ||
1560 | } | ||
1561 | rc = X86EMUL_CONTINUE; | ||
1562 | break; | ||
1563 | case 0x32: | ||
1564 | /* rdmsr */ | ||
1565 | rc = kvm_get_msr(ctxt->vcpu, _regs[VCPU_REGS_RCX], &msr_data); | ||
1566 | if (rc) { | ||
1567 | kvm_x86_ops->inject_gp(ctxt->vcpu, 0); | ||
1568 | _eip = ctxt->vcpu->rip; | ||
1569 | } else { | ||
1570 | _regs[VCPU_REGS_RAX] = (u32)msr_data; | ||
1571 | _regs[VCPU_REGS_RDX] = msr_data >> 32; | ||
1572 | } | ||
1573 | rc = X86EMUL_CONTINUE; | ||
1574 | break; | ||
1575 | case 0x80 ... 0x8f: /* jnz rel, etc */ { | ||
1576 | long int rel; | ||
1577 | |||
1578 | switch (op_bytes) { | ||
1579 | case 2: | ||
1580 | rel = insn_fetch(s16, 2, _eip); | ||
1581 | break; | ||
1582 | case 4: | ||
1583 | rel = insn_fetch(s32, 4, _eip); | ||
1584 | break; | ||
1585 | case 8: | ||
1586 | rel = insn_fetch(s64, 8, _eip); | ||
1587 | break; | ||
1588 | default: | ||
1589 | DPRINTF("jnz: Invalid op_bytes\n"); | ||
1590 | goto cannot_emulate; | ||
1591 | } | ||
1592 | if (test_cc(b, _eflags)) | ||
1593 | JMP_REL(rel); | ||
1594 | break; | ||
1595 | } | ||
1596 | case 0xc7: /* Grp9 (cmpxchg8b) */ | ||
1597 | { | ||
1598 | u64 old, new; | ||
1599 | if ((rc = ops->read_emulated(cr2, &old, 8, ctxt->vcpu)) | ||
1600 | != 0) | ||
1601 | goto done; | ||
1602 | if (((u32) (old >> 0) != (u32) _regs[VCPU_REGS_RAX]) || | ||
1603 | ((u32) (old >> 32) != (u32) _regs[VCPU_REGS_RDX])) { | ||
1604 | _regs[VCPU_REGS_RAX] = (u32) (old >> 0); | ||
1605 | _regs[VCPU_REGS_RDX] = (u32) (old >> 32); | ||
1606 | _eflags &= ~EFLG_ZF; | ||
1607 | } else { | ||
1608 | new = ((u64)_regs[VCPU_REGS_RCX] << 32) | ||
1609 | | (u32) _regs[VCPU_REGS_RBX]; | ||
1610 | if ((rc = ops->cmpxchg_emulated(cr2, &old, | ||
1611 | &new, 8, ctxt->vcpu)) != 0) | ||
1612 | goto done; | ||
1613 | _eflags |= EFLG_ZF; | ||
1614 | } | ||
1615 | break; | ||
1616 | } | ||
1617 | } | ||
1618 | goto writeback; | ||
1619 | |||
1620 | cannot_emulate: | ||
1621 | DPRINTF("Cannot emulate %02x\n", b); | ||
1622 | return -1; | ||
1623 | } | ||
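The jcc cases above (opcodes 0x70-0x7f and 0x80-0x8f) defer to test_cc(), which is defined earlier in this file and not shown in this hunk, while the cmov case spells the same condition evaluation out inline via no_wb. As a reading aid, the sketch below restates that evaluation as a stand-alone helper. It assumes only the EFLG_* masks used in the cmov case; the helper name cc_matches() is ours and does not appear in the original sources.

	/* Illustrative sketch, not part of the original file. */
	static int cc_matches(u8 cc, unsigned long flags)
	{
		int match = 0;

		/* Even condition codes first; mirrors the cmov case above. */
		switch ((cc & 15) >> 1) {
		case 0:				/* o */
			match = flags & EFLG_OF;
			break;
		case 1:				/* b/c/nae */
			match = flags & EFLG_CF;
			break;
		case 2:				/* z/e */
			match = flags & EFLG_ZF;
			break;
		case 3:				/* be/na */
			match = flags & (EFLG_CF | EFLG_ZF);
			break;
		case 4:				/* s */
			match = flags & EFLG_SF;
			break;
		case 5:				/* p/pe */
			match = flags & EFLG_PF;
			break;
		case 7:				/* le/ng */
			match = flags & EFLG_ZF;
			/* fall through */
		case 6:				/* l/nge */
			match |= (!(flags & EFLG_SF) != !(flags & EFLG_OF));
			break;
		}

		/* Odd condition codes (lsb == 1) have the inverted sense. */
		return !!match ^ (cc & 1);
	}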
1624 | |||
1625 | #ifdef __XEN__ | ||
1626 | |||
1627 | #include <asm/mm.h> | ||
1628 | #include <asm/uaccess.h> | ||
1629 | |||
1630 | int | ||
1631 | x86_emulate_read_std(unsigned long addr, | ||
1632 | unsigned long *val, | ||
1633 | unsigned int bytes, struct x86_emulate_ctxt *ctxt) | ||
1634 | { | ||
1635 | unsigned int rc; | ||
1636 | |||
1637 | *val = 0; | ||
1638 | |||
1639 | if ((rc = copy_from_user((void *)val, (void *)addr, bytes)) != 0) { | ||
1640 | propagate_page_fault(addr + bytes - rc, 0); /* read fault */ | ||
1641 | return X86EMUL_PROPAGATE_FAULT; | ||
1642 | } | ||
1643 | |||
1644 | return X86EMUL_CONTINUE; | ||
1645 | } | ||
1646 | |||
1647 | int | ||
1648 | x86_emulate_write_std(unsigned long addr, | ||
1649 | unsigned long val, | ||
1650 | unsigned int bytes, struct x86_emulate_ctxt *ctxt) | ||
1651 | { | ||
1652 | unsigned int rc; | ||
1653 | |||
1654 | if ((rc = copy_to_user((void *)addr, (void *)&val, bytes)) != 0) { | ||
1655 | propagate_page_fault(addr + bytes - rc, PGERR_write_access); | ||
1656 | return X86EMUL_PROPAGATE_FAULT; | ||
1657 | } | ||
1658 | |||
1659 | return X86EMUL_CONTINUE; | ||
1660 | } | ||
1661 | |||
1662 | #endif | ||
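One detail worth calling out in the __XEN__ helpers above: copy_from_user() and copy_to_user() return the number of bytes that could not be copied, so addr + bytes - rc is the address of the first uncopied byte, and that is what gets handed to propagate_page_fault(). A purely illustrative worked example:

	addr = 0x1000, bytes = 8, copy returns rc = 3
	  bytes successfully copied : 8 - 3 = 5        (0x1000 .. 0x1004)
	  first faulting address    : 0x1000 + 8 - 3 = 0x1005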
diff --git a/drivers/kvm/x86_emulate.h b/drivers/kvm/x86_emulate.h deleted file mode 100644 index 92c73aa7f9ac..000000000000 --- a/drivers/kvm/x86_emulate.h +++ /dev/null | |||
@@ -1,155 +0,0 @@ | |||
1 | /****************************************************************************** | ||
2 | * x86_emulate.h | ||
3 | * | ||
4 | * Generic x86 (32-bit and 64-bit) instruction decoder and emulator. | ||
5 | * | ||
6 | * Copyright (c) 2005 Keir Fraser | ||
7 | * | ||
8 | * From: xen-unstable 10676:af9809f51f81a3c43f276f00c81a52ef558afda4 | ||
9 | */ | ||
10 | |||
11 | #ifndef __X86_EMULATE_H__ | ||
12 | #define __X86_EMULATE_H__ | ||
13 | |||
14 | struct x86_emulate_ctxt; | ||
15 | |||
16 | /* | ||
17 | * x86_emulate_ops: | ||
18 | * | ||
19 | * These operations represent the instruction emulator's interface to memory. | ||
20 | * There are two categories of operation: those that act on ordinary memory | ||
21 | * regions (*_std), and those that act on memory regions known to require | ||
22 | * special treatment or emulation (*_emulated). | ||
23 | * | ||
24 | * The emulator assumes that an instruction accesses only one 'emulated memory' | ||
25 | * location, that this location is the given linear faulting address (cr2), and | ||
26 | * that this is one of the instruction's data operands. Instruction fetches and | ||
27 | * stack operations are assumed never to access emulated memory. The emulator | ||
28 | * automatically deduces which operand of a string-move operation is accessing | ||
29 | * emulated memory, and assumes that the other operand accesses normal memory. | ||
30 | * | ||
31 | * NOTES: | ||
32 | * 1. The emulator isn't very smart about emulated vs. standard memory. | ||
33 | * 'Emulated memory' access addresses should be checked for sanity. | ||
34 | * 'Normal memory' accesses may fault, and the caller must arrange to | ||
35 | * detect and handle reentrancy into the emulator via recursive faults. | ||
36 | * Accesses may be unaligned and may cross page boundaries. | ||
37 | * 2. If the access fails (cannot emulate, or a standard access faults) then | ||
38 | * it is up to the memop to propagate the fault to the guest VM via | ||
39 | * some out-of-band mechanism, unknown to the emulator. The memop signals | ||
40 | * failure by returning X86EMUL_PROPAGATE_FAULT to the emulator, which will | ||
41 | * then immediately bail. | ||
42 | * 3. Valid access sizes are 1, 2, 4 and 8 bytes. On x86/32 systems only | ||
43 | * cmpxchg8b_emulated need support 8-byte accesses. | ||
44 | * 4. The emulator cannot handle 64-bit mode emulation on an x86/32 system. | ||
45 | */ | ||
46 | /* Access completed successfully: continue emulation as normal. */ | ||
47 | #define X86EMUL_CONTINUE 0 | ||
48 | /* Access is unhandleable: bail from emulation and return error to caller. */ | ||
49 | #define X86EMUL_UNHANDLEABLE 1 | ||
50 | /* Terminate emulation but return success to the caller. */ | ||
51 | #define X86EMUL_PROPAGATE_FAULT 2 /* propagate a generated fault to guest */ | ||
52 | #define X86EMUL_RETRY_INSTR 2 /* retry the instruction for some reason */ | ||
53 | #define X86EMUL_CMPXCHG_FAILED 2 /* cmpxchg did not see expected value */ | ||
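Note that the three value-2 codes are deliberately interchangeable from the dispatcher's point of view: in x86_emulate_memop() above, only X86EMUL_UNHANDLEABLE is reported as a hard failure, while everything else, including a propagated fault, still counts as success because the fault or retry has already been arranged out of band. A sketch of that mapping (the helper name is ours, for illustration; it simply mirrors the 'done:' return in x86_emulate.c above):

	static inline int emul_result_to_errno(int rc)
	{
		/* Only "cannot emulate at all" is an error for the caller. */
		return rc == X86EMUL_UNHANDLEABLE ? -1 : 0;
	}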
54 | struct x86_emulate_ops { | ||
55 | /* | ||
56 | * read_std: Read bytes of standard (non-emulated/special) memory. | ||
57 | * Used for instruction fetch, stack operations, and others. | ||
58 | * @addr: [IN ] Linear address from which to read. | ||
59 | * @val: [OUT] Value read from memory, zero-extended to 'u_long'. | ||
60 | * @bytes: [IN ] Number of bytes to read from memory. | ||
61 | */ | ||
62 | int (*read_std)(unsigned long addr, void *val, | ||
63 | unsigned int bytes, struct kvm_vcpu *vcpu); | ||
64 | |||
65 | /* | ||
66 | * write_std: Write bytes of standard (non-emulated/special) memory. | ||
67 | * Used for stack operations, and others. | ||
68 | * @addr: [IN ] Linear address to which to write. | ||
69 | * @val: [IN ] Value to write to memory (low-order bytes used as | ||
70 | * required). | ||
71 | * @bytes: [IN ] Number of bytes to write to memory. | ||
72 | */ | ||
73 | int (*write_std)(unsigned long addr, const void *val, | ||
74 | unsigned int bytes, struct kvm_vcpu *vcpu); | ||
75 | |||
76 | /* | ||
77 | * read_emulated: Read bytes from emulated/special memory area. | ||
78 | * @addr: [IN ] Linear address from which to read. | ||
79 | * @val: [OUT] Value read from memory, zero-extended to 'u_long'. | ||
80 | * @bytes: [IN ] Number of bytes to read from memory. | ||
81 | */ | ||
82 | int (*read_emulated) (unsigned long addr, | ||
83 | void *val, | ||
84 | unsigned int bytes, | ||
85 | struct kvm_vcpu *vcpu); | ||
86 | |||
87 | /* | ||
88 | * write_emulated: Write bytes to emulated/special memory area. | ||
89 | * @addr: [IN ] Linear address to which to write. | ||
90 | * @val: [IN ] Value to write to memory (low-order bytes used as | ||
91 | * required). | ||
92 | * @bytes: [IN ] Number of bytes to write to memory. | ||
93 | */ | ||
94 | int (*write_emulated) (unsigned long addr, | ||
95 | const void *val, | ||
96 | unsigned int bytes, | ||
97 | struct kvm_vcpu *vcpu); | ||
98 | |||
99 | /* | ||
100 | * cmpxchg_emulated: Emulate an atomic (LOCKed) CMPXCHG operation on an | ||
101 | * emulated/special memory area. | ||
102 | * @addr: [IN ] Linear address to access. | ||
103 | * @old: [IN ] Value expected to be current at @addr. | ||
104 | * @new: [IN ] Value to write to @addr. | ||
105 | * @bytes: [IN ] Number of bytes to access using CMPXCHG. | ||
106 | */ | ||
107 | int (*cmpxchg_emulated) (unsigned long addr, | ||
108 | const void *old, | ||
109 | const void *new, | ||
110 | unsigned int bytes, | ||
111 | struct kvm_vcpu *vcpu); | ||
112 | |||
113 | }; | ||
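As a concrete, entirely hypothetical illustration of the contract documented above, a read_emulated callback backed by a single emulated 8-byte register might look like the sketch below. MMIO_DEMO_BASE, demo_reg and demo_read_emulated are invented for this example; the real KVM callbacks are defined elsewhere in the removed drivers/kvm code, not in this header.

	#define MMIO_DEMO_BASE	0xfee00300UL	/* made-up emulated address */

	static u64 demo_reg;			/* backing store for the sketch */

	static int demo_read_emulated(unsigned long addr, void *val,
				      unsigned int bytes, struct kvm_vcpu *vcpu)
	{
		/* Note 1 above: sanity-check the emulated address and size. */
		if (bytes > 8 || addr < MMIO_DEMO_BASE ||
		    addr + bytes > MMIO_DEMO_BASE + sizeof(demo_reg))
			return X86EMUL_UNHANDLEABLE;

		memcpy(val, (u8 *)&demo_reg + (addr - MMIO_DEMO_BASE), bytes);
		return X86EMUL_CONTINUE;
	}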
114 | |||
115 | struct x86_emulate_ctxt { | ||
116 | /* Register state before/after emulation. */ | ||
117 | struct kvm_vcpu *vcpu; | ||
118 | |||
119 | /* Guest EFLAGS, and the linear faulting address (if emulating a page-faulting instruction). */ | ||
120 | unsigned long eflags; | ||
121 | unsigned long cr2; | ||
122 | |||
123 | /* Emulated execution mode, represented by an X86EMUL_MODE value. */ | ||
124 | int mode; | ||
125 | |||
126 | unsigned long cs_base; | ||
127 | unsigned long ds_base; | ||
128 | unsigned long es_base; | ||
129 | unsigned long ss_base; | ||
130 | unsigned long gs_base; | ||
131 | unsigned long fs_base; | ||
132 | }; | ||
133 | |||
134 | /* Execution mode, passed to the emulator. */ | ||
135 | #define X86EMUL_MODE_REAL 0 /* Real mode. */ | ||
136 | #define X86EMUL_MODE_PROT16 2 /* 16-bit protected mode. */ | ||
137 | #define X86EMUL_MODE_PROT32 4 /* 32-bit protected mode. */ | ||
138 | #define X86EMUL_MODE_PROT64 8 /* 64-bit (long) mode. */ | ||
139 | |||
140 | /* Host execution mode. */ | ||
141 | #if defined(__i386__) | ||
142 | #define X86EMUL_MODE_HOST X86EMUL_MODE_PROT32 | ||
143 | #elif defined(CONFIG_X86_64) | ||
144 | #define X86EMUL_MODE_HOST X86EMUL_MODE_PROT64 | ||
145 | #endif | ||
146 | |||
147 | /* | ||
148 | * x86_emulate_memop: Emulate an instruction that faulted attempting to | ||
149 | * read/write a 'special' memory area. | ||
150 | * Returns -1 on failure, 0 on success. | ||
151 | */ | ||
152 | int x86_emulate_memop(struct x86_emulate_ctxt *ctxt, | ||
153 | struct x86_emulate_ops *ops); | ||
154 | |||
155 | #endif /* __X86_EMULATE_H__ */ | ||
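Finally, to tie the pieces together, here is a schematic of how a caller drives the emulator on a faulting access. This is a sketch only: it assumes a struct kvm_vcpu and segment bases obtained elsewhere, an ops table such as the one sketched above, and a fixed execution mode; the function name is ours.

	/* Schematic caller, not taken from the removed sources. */
	static int demo_emulate_fault(struct kvm_vcpu *vcpu, unsigned long cr2,
				      unsigned long eflags,
				      struct x86_emulate_ops *ops)
	{
		struct x86_emulate_ctxt ctxt = {
			.vcpu	= vcpu,
			.eflags	= eflags,
			.cr2	= cr2,			/* linear address that faulted */
			.mode	= X86EMUL_MODE_PROT32,	/* would be derived from guest state */
		};

		/* Segment bases would be filled in from the guest's descriptors. */
		ctxt.cs_base = ctxt.ds_base = ctxt.es_base = 0;
		ctxt.ss_base = ctxt.gs_base = ctxt.fs_base = 0;

		/* Returns -1 only for X86EMUL_UNHANDLEABLE; 0 otherwise. */
		return x86_emulate_memop(&ctxt, ops);
	}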