-rw-r--r--   drivers/kvm/Kconfig                  1
-rw-r--r--   drivers/kvm/Makefile                 2
-rw-r--r--   drivers/kvm/i8259.c                450
-rw-r--r--   drivers/kvm/ioapic.c               388
-rw-r--r--   drivers/kvm/irq.c                   98
-rw-r--r--   drivers/kvm/irq.h                  165
-rw-r--r--   drivers/kvm/kvm.h                  201
-rw-r--r--   drivers/kvm/kvm_main.c            1486
-rw-r--r--   drivers/kvm/kvm_svm.h                3
-rw-r--r--   drivers/kvm/lapic.c               1064
-rw-r--r--   drivers/kvm/mmu.c                   51
-rw-r--r--   drivers/kvm/paging_tmpl.h           84
-rw-r--r--   drivers/kvm/svm.c                 1046
-rw-r--r--   drivers/kvm/vmx.c                 1034
-rw-r--r--   drivers/kvm/vmx.h                   73
-rw-r--r--   drivers/kvm/x86_emulate.c          411
-rw-r--r--   drivers/kvm/x86_emulate.h           20
-rw-r--r--   include/asm-x86/io_apic_32.h        16
-rw-r--r--   include/asm-x86/processor-flags.h    2
-rw-r--r--   include/linux/kvm.h                128
20 files changed, 4848 insertions(+), 1875 deletions(-)
diff --git a/drivers/kvm/Kconfig b/drivers/kvm/Kconfig
index 0a419a0de603..8749fa4ffcee 100644
--- a/drivers/kvm/Kconfig
+++ b/drivers/kvm/Kconfig
@@ -17,6 +17,7 @@ if VIRTUALIZATION
 config KVM
 	tristate "Kernel-based Virtual Machine (KVM) support"
 	depends on X86 && EXPERIMENTAL
+	select PREEMPT_NOTIFIERS
 	select ANON_INODES
 	---help---
 	  Support hosting fully virtualized guest machines using hardware
diff --git a/drivers/kvm/Makefile b/drivers/kvm/Makefile
index c0a789fa9d65..e5a8f4d3e973 100644
--- a/drivers/kvm/Makefile
+++ b/drivers/kvm/Makefile
@@ -2,7 +2,7 @@
 # Makefile for Kernel-based Virtual Machine module
 #
 
-kvm-objs := kvm_main.o mmu.o x86_emulate.o
+kvm-objs := kvm_main.o mmu.o x86_emulate.o i8259.o irq.o lapic.o ioapic.o
 obj-$(CONFIG_KVM) += kvm.o
 kvm-intel-objs = vmx.o
 obj-$(CONFIG_KVM_INTEL) += kvm-intel.o
diff --git a/drivers/kvm/i8259.c b/drivers/kvm/i8259.c
new file mode 100644
index 000000000000..a679157bc599
--- /dev/null
+++ b/drivers/kvm/i8259.c
@@ -0,0 +1,450 @@
1/*
2 * 8259 interrupt controller emulation
3 *
4 * Copyright (c) 2003-2004 Fabrice Bellard
5 * Copyright (c) 2007 Intel Corporation
6 *
7 * Permission is hereby granted, free of charge, to any person obtaining a copy
8 * of this software and associated documentation files (the "Software"), to deal
9 * in the Software without restriction, including without limitation the rights
10 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11 * copies of the Software, and to permit persons to whom the Software is
12 * furnished to do so, subject to the following conditions:
13 *
14 * The above copyright notice and this permission notice shall be included in
15 * all copies or substantial portions of the Software.
16 *
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
23 * THE SOFTWARE.
24 * Authors:
25 * Yaozu (Eddie) Dong <Eddie.dong@intel.com>
26 * Port from Qemu.
27 */
28#include <linux/mm.h>
29#include "irq.h"
30
31/*
32 * set irq level. If an edge is detected, then the IRR is set to 1
33 */
34static inline void pic_set_irq1(struct kvm_kpic_state *s, int irq, int level)
35{
36 int mask;
37 mask = 1 << irq;
38 if (s->elcr & mask) /* level triggered */
39 if (level) {
40 s->irr |= mask;
41 s->last_irr |= mask;
42 } else {
43 s->irr &= ~mask;
44 s->last_irr &= ~mask;
45 }
46 else /* edge triggered */
47 if (level) {
48 if ((s->last_irr & mask) == 0)
49 s->irr |= mask;
50 s->last_irr |= mask;
51 } else
52 s->last_irr &= ~mask;
53}
54
55/*
56 * return the highest priority found in mask (highest = smallest
57 * number). Return 8 if no irq
58 */
59static inline int get_priority(struct kvm_kpic_state *s, int mask)
60{
61 int priority;
62 if (mask == 0)
63 return 8;
64 priority = 0;
65 while ((mask & (1 << ((priority + s->priority_add) & 7))) == 0)
66 priority++;
67 return priority;
68}
69
70/*
71 * return the pic wanted interrupt. return -1 if none
72 */
73static int pic_get_irq(struct kvm_kpic_state *s)
74{
75 int mask, cur_priority, priority;
76
77 mask = s->irr & ~s->imr;
78 priority = get_priority(s, mask);
79 if (priority == 8)
80 return -1;
81 /*
82 * compute current priority. If special fully nested mode on the
83 * master, the IRQ coming from the slave is not taken into account
84 * for the priority computation.
85 */
86 mask = s->isr;
87 if (s->special_fully_nested_mode && s == &s->pics_state->pics[0])
88 mask &= ~(1 << 2);
89 cur_priority = get_priority(s, mask);
90 if (priority < cur_priority)
91 /*
92 * higher priority found: an irq should be generated
93 */
94 return (priority + s->priority_add) & 7;
95 else
96 return -1;
97}
98
99/*
100 * raise irq to CPU if necessary. must be called every time the active
101 * irq may change
102 */
103static void pic_update_irq(struct kvm_pic *s)
104{
105 int irq2, irq;
106
107 irq2 = pic_get_irq(&s->pics[1]);
108 if (irq2 >= 0) {
109 /*
110 * if irq request by slave pic, signal master PIC
111 */
112 pic_set_irq1(&s->pics[0], 2, 1);
113 pic_set_irq1(&s->pics[0], 2, 0);
114 }
115 irq = pic_get_irq(&s->pics[0]);
116 if (irq >= 0)
117 s->irq_request(s->irq_request_opaque, 1);
118 else
119 s->irq_request(s->irq_request_opaque, 0);
120}
121
122void kvm_pic_update_irq(struct kvm_pic *s)
123{
124 pic_update_irq(s);
125}
126
127void kvm_pic_set_irq(void *opaque, int irq, int level)
128{
129 struct kvm_pic *s = opaque;
130
131 pic_set_irq1(&s->pics[irq >> 3], irq & 7, level);
132 pic_update_irq(s);
133}
134
135/*
136 * acknowledge interrupt 'irq'
137 */
138static inline void pic_intack(struct kvm_kpic_state *s, int irq)
139{
140 if (s->auto_eoi) {
141 if (s->rotate_on_auto_eoi)
142 s->priority_add = (irq + 1) & 7;
143 } else
144 s->isr |= (1 << irq);
145 /*
146 * We don't clear a level sensitive interrupt here
147 */
148 if (!(s->elcr & (1 << irq)))
149 s->irr &= ~(1 << irq);
150}
151
152int kvm_pic_read_irq(struct kvm_pic *s)
153{
154 int irq, irq2, intno;
155
156 irq = pic_get_irq(&s->pics[0]);
157 if (irq >= 0) {
158 pic_intack(&s->pics[0], irq);
159 if (irq == 2) {
160 irq2 = pic_get_irq(&s->pics[1]);
161 if (irq2 >= 0)
162 pic_intack(&s->pics[1], irq2);
163 else
164 /*
165 * spurious IRQ on slave controller
166 */
167 irq2 = 7;
168 intno = s->pics[1].irq_base + irq2;
169 irq = irq2 + 8;
170 } else
171 intno = s->pics[0].irq_base + irq;
172 } else {
173 /*
174 * spurious IRQ on host controller
175 */
176 irq = 7;
177 intno = s->pics[0].irq_base + irq;
178 }
179 pic_update_irq(s);
180
181 return intno;
182}
183
184static void pic_reset(void *opaque)
185{
186 struct kvm_kpic_state *s = opaque;
187
188 s->last_irr = 0;
189 s->irr = 0;
190 s->imr = 0;
191 s->isr = 0;
192 s->priority_add = 0;
193 s->irq_base = 0;
194 s->read_reg_select = 0;
195 s->poll = 0;
196 s->special_mask = 0;
197 s->init_state = 0;
198 s->auto_eoi = 0;
199 s->rotate_on_auto_eoi = 0;
200 s->special_fully_nested_mode = 0;
201 s->init4 = 0;
202}
203
204static void pic_ioport_write(void *opaque, u32 addr, u32 val)
205{
206 struct kvm_kpic_state *s = opaque;
207 int priority, cmd, irq;
208
209 addr &= 1;
210 if (addr == 0) {
211 if (val & 0x10) {
212 pic_reset(s); /* init */
213 /*
214 * deassert a pending interrupt
215 */
216 s->pics_state->irq_request(s->pics_state->
217 irq_request_opaque, 0);
218 s->init_state = 1;
219 s->init4 = val & 1;
220 if (val & 0x02)
221 printk(KERN_ERR "single mode not supported");
222 if (val & 0x08)
223 printk(KERN_ERR
224 "level sensitive irq not supported");
225 } else if (val & 0x08) {
226 if (val & 0x04)
227 s->poll = 1;
228 if (val & 0x02)
229 s->read_reg_select = val & 1;
230 if (val & 0x40)
231 s->special_mask = (val >> 5) & 1;
232 } else {
233 cmd = val >> 5;
234 switch (cmd) {
235 case 0:
236 case 4:
237 s->rotate_on_auto_eoi = cmd >> 2;
238 break;
239 case 1: /* end of interrupt */
240 case 5:
241 priority = get_priority(s, s->isr);
242 if (priority != 8) {
243 irq = (priority + s->priority_add) & 7;
244 s->isr &= ~(1 << irq);
245 if (cmd == 5)
246 s->priority_add = (irq + 1) & 7;
247 pic_update_irq(s->pics_state);
248 }
249 break;
250 case 3:
251 irq = val & 7;
252 s->isr &= ~(1 << irq);
253 pic_update_irq(s->pics_state);
254 break;
255 case 6:
256 s->priority_add = (val + 1) & 7;
257 pic_update_irq(s->pics_state);
258 break;
259 case 7:
260 irq = val & 7;
261 s->isr &= ~(1 << irq);
262 s->priority_add = (irq + 1) & 7;
263 pic_update_irq(s->pics_state);
264 break;
265 default:
266 break; /* no operation */
267 }
268 }
269 } else
270 switch (s->init_state) {
271 case 0: /* normal mode */
272 s->imr = val;
273 pic_update_irq(s->pics_state);
274 break;
275 case 1:
276 s->irq_base = val & 0xf8;
277 s->init_state = 2;
278 break;
279 case 2:
280 if (s->init4)
281 s->init_state = 3;
282 else
283 s->init_state = 0;
284 break;
285 case 3:
286 s->special_fully_nested_mode = (val >> 4) & 1;
287 s->auto_eoi = (val >> 1) & 1;
288 s->init_state = 0;
289 break;
290 }
291}
292
293static u32 pic_poll_read(struct kvm_kpic_state *s, u32 addr1)
294{
295 int ret;
296
297 ret = pic_get_irq(s);
298 if (ret >= 0) {
299 if (addr1 >> 7) {
300 s->pics_state->pics[0].isr &= ~(1 << 2);
301 s->pics_state->pics[0].irr &= ~(1 << 2);
302 }
303 s->irr &= ~(1 << ret);
304 s->isr &= ~(1 << ret);
305 if (addr1 >> 7 || ret != 2)
306 pic_update_irq(s->pics_state);
307 } else {
308 ret = 0x07;
309 pic_update_irq(s->pics_state);
310 }
311
312 return ret;
313}
314
315static u32 pic_ioport_read(void *opaque, u32 addr1)
316{
317 struct kvm_kpic_state *s = opaque;
318 unsigned int addr;
319 int ret;
320
321 addr = addr1;
322 addr &= 1;
323 if (s->poll) {
324 ret = pic_poll_read(s, addr1);
325 s->poll = 0;
326 } else
327 if (addr == 0)
328 if (s->read_reg_select)
329 ret = s->isr;
330 else
331 ret = s->irr;
332 else
333 ret = s->imr;
334 return ret;
335}
336
337static void elcr_ioport_write(void *opaque, u32 addr, u32 val)
338{
339 struct kvm_kpic_state *s = opaque;
340 s->elcr = val & s->elcr_mask;
341}
342
343static u32 elcr_ioport_read(void *opaque, u32 addr1)
344{
345 struct kvm_kpic_state *s = opaque;
346 return s->elcr;
347}
348
349static int picdev_in_range(struct kvm_io_device *this, gpa_t addr)
350{
351 switch (addr) {
352 case 0x20:
353 case 0x21:
354 case 0xa0:
355 case 0xa1:
356 case 0x4d0:
357 case 0x4d1:
358 return 1;
359 default:
360 return 0;
361 }
362}
363
364static void picdev_write(struct kvm_io_device *this,
365 gpa_t addr, int len, const void *val)
366{
367 struct kvm_pic *s = this->private;
368 unsigned char data = *(unsigned char *)val;
369
370 if (len != 1) {
371 if (printk_ratelimit())
372 printk(KERN_ERR "PIC: non byte write\n");
373 return;
374 }
375 switch (addr) {
376 case 0x20:
377 case 0x21:
378 case 0xa0:
379 case 0xa1:
380 pic_ioport_write(&s->pics[addr >> 7], addr, data);
381 break;
382 case 0x4d0:
383 case 0x4d1:
384 elcr_ioport_write(&s->pics[addr & 1], addr, data);
385 break;
386 }
387}
388
389static void picdev_read(struct kvm_io_device *this,
390 gpa_t addr, int len, void *val)
391{
392 struct kvm_pic *s = this->private;
393 unsigned char data = 0;
394
395 if (len != 1) {
396 if (printk_ratelimit())
397 printk(KERN_ERR "PIC: non byte read\n");
398 return;
399 }
400 switch (addr) {
401 case 0x20:
402 case 0x21:
403 case 0xa0:
404 case 0xa1:
405 data = pic_ioport_read(&s->pics[addr >> 7], addr);
406 break;
407 case 0x4d0:
408 case 0x4d1:
409 data = elcr_ioport_read(&s->pics[addr & 1], addr);
410 break;
411 }
412 *(unsigned char *)val = data;
413}
414
415/*
416 * callback when PIC0 irq status changed
417 */
418static void pic_irq_request(void *opaque, int level)
419{
420 struct kvm *kvm = opaque;
421 struct kvm_vcpu *vcpu = kvm->vcpus[0];
422
423 pic_irqchip(kvm)->output = level;
424 if (vcpu)
425 kvm_vcpu_kick(vcpu);
426}
427
428struct kvm_pic *kvm_create_pic(struct kvm *kvm)
429{
430 struct kvm_pic *s;
431 s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL);
432 if (!s)
433 return NULL;
434 s->pics[0].elcr_mask = 0xf8;
435 s->pics[1].elcr_mask = 0xde;
436 s->irq_request = pic_irq_request;
437 s->irq_request_opaque = kvm;
438 s->pics[0].pics_state = s;
439 s->pics[1].pics_state = s;
440
441 /*
442 * Initialize PIO device
443 */
444 s->dev.read = picdev_read;
445 s->dev.write = picdev_write;
446 s->dev.in_range = picdev_in_range;
447 s->dev.private = s;
448 kvm_io_bus_register_dev(&kvm->pio_bus, &s->dev);
449 return s;
450}
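
The get_priority()/pic_get_irq() pair above is the core of the 8259 model: pending IRQs are scanned starting at priority_add, which is how a rotating EOI demotes the line that was just serviced. A minimal standalone sketch of that arithmetic (illustrative only, not part of the patch; the names are invented) behaves as follows:

/*
 * Standalone illustration of the 8259 rotating-priority scan used by
 * get_priority() above. The highest-priority pending IRQ is the first
 * set bit found when scanning from priority_add upward, modulo 8.
 */
#include <stdio.h>

static int model_get_priority(unsigned char mask, int priority_add)
{
	int priority = 0;

	if (mask == 0)
		return 8;				/* nothing pending */
	while (!(mask & (1 << ((priority + priority_add) & 7))))
		priority++;
	return priority;
}

int main(void)
{
	/* IRQ0 and IRQ4 pending, no rotation: IRQ0 wins. */
	printf("winner %d\n", (model_get_priority(0x11, 0) + 0) & 7);
	/*
	 * Same pending set after a rotating EOI of IRQ0 (priority_add = 1):
	 * the scan starts at IRQ1, so IRQ4 now wins.
	 */
	printf("winner %d\n", (model_get_priority(0x11, 1) + 1) & 7);
	return 0;
}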
diff --git a/drivers/kvm/ioapic.c b/drivers/kvm/ioapic.c
new file mode 100644
index 000000000000..c7992e667fdb
--- /dev/null
+++ b/drivers/kvm/ioapic.c
@@ -0,0 +1,388 @@
1/*
2 * Copyright (C) 2001 MandrakeSoft S.A.
3 *
4 * MandrakeSoft S.A.
5 * 43, rue d'Aboukir
6 * 75002 Paris - France
7 * http://www.linux-mandrake.com/
8 * http://www.mandrakesoft.com/
9 *
10 * This library is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This library is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
19 *
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with this library; if not, write to the Free Software
22 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
23 *
24 * Yunhong Jiang <yunhong.jiang@intel.com>
25 * Yaozu (Eddie) Dong <eddie.dong@intel.com>
26 * Based on Xen 3.1 code.
27 */
28
29#include "kvm.h"
30#include <linux/kvm.h>
31#include <linux/mm.h>
32#include <linux/highmem.h>
33#include <linux/smp.h>
34#include <linux/hrtimer.h>
35#include <linux/io.h>
36#include <asm/processor.h>
37#include <asm/msr.h>
38#include <asm/page.h>
39#include <asm/current.h>
40#include <asm/apicdef.h>
41#include <asm/io_apic.h>
42#include "irq.h"
43/* #define ioapic_debug(fmt,arg...) printk(KERN_WARNING fmt,##arg) */
44#define ioapic_debug(fmt, arg...)
45static void ioapic_deliver(struct kvm_ioapic *vioapic, int irq);
46
47static unsigned long ioapic_read_indirect(struct kvm_ioapic *ioapic,
48 unsigned long addr,
49 unsigned long length)
50{
51 unsigned long result = 0;
52
53 switch (ioapic->ioregsel) {
54 case IOAPIC_REG_VERSION:
55 result = ((((IOAPIC_NUM_PINS - 1) & 0xff) << 16)
56 | (IOAPIC_VERSION_ID & 0xff));
57 break;
58
59 case IOAPIC_REG_APIC_ID:
60 case IOAPIC_REG_ARB_ID:
61 result = ((ioapic->id & 0xf) << 24);
62 break;
63
64 default:
65 {
66 u32 redir_index = (ioapic->ioregsel - 0x10) >> 1;
67 u64 redir_content;
68
69 ASSERT(redir_index < IOAPIC_NUM_PINS);
70
71 redir_content = ioapic->redirtbl[redir_index].bits;
72 result = (ioapic->ioregsel & 0x1) ?
73 (redir_content >> 32) & 0xffffffff :
74 redir_content & 0xffffffff;
75 break;
76 }
77 }
78
79 return result;
80}
81
82static void ioapic_service(struct kvm_ioapic *ioapic, unsigned int idx)
83{
84 union ioapic_redir_entry *pent;
85
86 pent = &ioapic->redirtbl[idx];
87
88 if (!pent->fields.mask) {
89 ioapic_deliver(ioapic, idx);
90 if (pent->fields.trig_mode == IOAPIC_LEVEL_TRIG)
91 pent->fields.remote_irr = 1;
92 }
93 if (!pent->fields.trig_mode)
94 ioapic->irr &= ~(1 << idx);
95}
96
97static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val)
98{
99 unsigned index;
100
101 switch (ioapic->ioregsel) {
102 case IOAPIC_REG_VERSION:
103 /* Writes are ignored. */
104 break;
105
106 case IOAPIC_REG_APIC_ID:
107 ioapic->id = (val >> 24) & 0xf;
108 break;
109
110 case IOAPIC_REG_ARB_ID:
111 break;
112
113 default:
114 index = (ioapic->ioregsel - 0x10) >> 1;
115
116 ioapic_debug("change redir index %x val %x", index, val);
117 if (index >= IOAPIC_NUM_PINS)
118 return;
119 if (ioapic->ioregsel & 1) {
120 ioapic->redirtbl[index].bits &= 0xffffffff;
121 ioapic->redirtbl[index].bits |= (u64) val << 32;
122 } else {
123 ioapic->redirtbl[index].bits &= ~0xffffffffULL;
124 ioapic->redirtbl[index].bits |= (u32) val;
125 ioapic->redirtbl[index].fields.remote_irr = 0;
126 }
127 if (ioapic->irr & (1 << index))
128 ioapic_service(ioapic, index);
129 break;
130 }
131}
132
133static void ioapic_inj_irq(struct kvm_ioapic *ioapic,
134 struct kvm_lapic *target,
135 u8 vector, u8 trig_mode, u8 delivery_mode)
136{
137 ioapic_debug("irq %d trig %d deliv %d", vector, trig_mode,
138 delivery_mode);
139
140 ASSERT((delivery_mode == dest_Fixed) ||
141 (delivery_mode == dest_LowestPrio));
142
143 kvm_apic_set_irq(target, vector, trig_mode);
144}
145
146static u32 ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest,
147 u8 dest_mode)
148{
149 u32 mask = 0;
150 int i;
151 struct kvm *kvm = ioapic->kvm;
152 struct kvm_vcpu *vcpu;
153
154 ioapic_debug("dest %d dest_mode %d", dest, dest_mode);
155
156 if (dest_mode == 0) { /* Physical mode. */
157 if (dest == 0xFF) { /* Broadcast. */
158 for (i = 0; i < KVM_MAX_VCPUS; ++i)
159 if (kvm->vcpus[i] && kvm->vcpus[i]->apic)
160 mask |= 1 << i;
161 return mask;
162 }
163 for (i = 0; i < KVM_MAX_VCPUS; ++i) {
164 vcpu = kvm->vcpus[i];
165 if (!vcpu)
166 continue;
167 if (kvm_apic_match_physical_addr(vcpu->apic, dest)) {
168 if (vcpu->apic)
169 mask = 1 << i;
170 break;
171 }
172 }
173 } else if (dest != 0) /* Logical mode, MDA non-zero. */
174 for (i = 0; i < KVM_MAX_VCPUS; ++i) {
175 vcpu = kvm->vcpus[i];
176 if (!vcpu)
177 continue;
178 if (vcpu->apic &&
179 kvm_apic_match_logical_addr(vcpu->apic, dest))
180 mask |= 1 << vcpu->vcpu_id;
181 }
182 ioapic_debug("mask %x", mask);
183 return mask;
184}
185
186static void ioapic_deliver(struct kvm_ioapic *ioapic, int irq)
187{
188 u8 dest = ioapic->redirtbl[irq].fields.dest_id;
189 u8 dest_mode = ioapic->redirtbl[irq].fields.dest_mode;
190 u8 delivery_mode = ioapic->redirtbl[irq].fields.delivery_mode;
191 u8 vector = ioapic->redirtbl[irq].fields.vector;
192 u8 trig_mode = ioapic->redirtbl[irq].fields.trig_mode;
193 u32 deliver_bitmask;
194 struct kvm_lapic *target;
195 struct kvm_vcpu *vcpu;
196 int vcpu_id;
197
198 ioapic_debug("dest=%x dest_mode=%x delivery_mode=%x "
199 "vector=%x trig_mode=%x",
200 dest, dest_mode, delivery_mode, vector, trig_mode);
201
202 deliver_bitmask = ioapic_get_delivery_bitmask(ioapic, dest, dest_mode);
203 if (!deliver_bitmask) {
204 ioapic_debug("no target on destination");
205 return;
206 }
207
208 switch (delivery_mode) {
209 case dest_LowestPrio:
210 target =
211 kvm_apic_round_robin(ioapic->kvm, vector, deliver_bitmask);
212 if (target != NULL)
213 ioapic_inj_irq(ioapic, target, vector,
214 trig_mode, delivery_mode);
215 else
216 ioapic_debug("null round robin: "
217 "mask=%x vector=%x delivery_mode=%x",
218 deliver_bitmask, vector, dest_LowestPrio);
219 break;
220 case dest_Fixed:
221 for (vcpu_id = 0; deliver_bitmask != 0; vcpu_id++) {
222 if (!(deliver_bitmask & (1 << vcpu_id)))
223 continue;
224 deliver_bitmask &= ~(1 << vcpu_id);
225 vcpu = ioapic->kvm->vcpus[vcpu_id];
226 if (vcpu) {
227 target = vcpu->apic;
228 ioapic_inj_irq(ioapic, target, vector,
229 trig_mode, delivery_mode);
230 }
231 }
232 break;
233
234 /* TODO: NMI */
235 default:
236 printk(KERN_WARNING "Unsupported delivery mode %d\n",
237 delivery_mode);
238 break;
239 }
240}
241
242void kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level)
243{
244 u32 old_irr = ioapic->irr;
245 u32 mask = 1 << irq;
246 union ioapic_redir_entry entry;
247
248 if (irq >= 0 && irq < IOAPIC_NUM_PINS) {
249 entry = ioapic->redirtbl[irq];
250 level ^= entry.fields.polarity;
251 if (!level)
252 ioapic->irr &= ~mask;
253 else {
254 ioapic->irr |= mask;
255 if ((!entry.fields.trig_mode && old_irr != ioapic->irr)
256 || !entry.fields.remote_irr)
257 ioapic_service(ioapic, irq);
258 }
259 }
260}
261
262static int get_eoi_gsi(struct kvm_ioapic *ioapic, int vector)
263{
264 int i;
265
266 for (i = 0; i < IOAPIC_NUM_PINS; i++)
267 if (ioapic->redirtbl[i].fields.vector == vector)
268 return i;
269 return -1;
270}
271
272void kvm_ioapic_update_eoi(struct kvm *kvm, int vector)
273{
274 struct kvm_ioapic *ioapic = kvm->vioapic;
275 union ioapic_redir_entry *ent;
276 int gsi;
277
278 gsi = get_eoi_gsi(ioapic, vector);
279 if (gsi == -1) {
280 printk(KERN_WARNING "Can't find redir item for %d EOI\n",
281 vector);
282 return;
283 }
284
285 ent = &ioapic->redirtbl[gsi];
286 ASSERT(ent->fields.trig_mode == IOAPIC_LEVEL_TRIG);
287
288 ent->fields.remote_irr = 0;
289 if (!ent->fields.mask && (ioapic->irr & (1 << gsi)))
290 ioapic_deliver(ioapic, gsi);
291}
292
293static int ioapic_in_range(struct kvm_io_device *this, gpa_t addr)
294{
295 struct kvm_ioapic *ioapic = (struct kvm_ioapic *)this->private;
296
297 return ((addr >= ioapic->base_address &&
298 (addr < ioapic->base_address + IOAPIC_MEM_LENGTH)));
299}
300
301static void ioapic_mmio_read(struct kvm_io_device *this, gpa_t addr, int len,
302 void *val)
303{
304 struct kvm_ioapic *ioapic = (struct kvm_ioapic *)this->private;
305 u32 result;
306
307 ioapic_debug("addr %lx", (unsigned long)addr);
308 ASSERT(!(addr & 0xf)); /* check alignment */
309
310 addr &= 0xff;
311 switch (addr) {
312 case IOAPIC_REG_SELECT:
313 result = ioapic->ioregsel;
314 break;
315
316 case IOAPIC_REG_WINDOW:
317 result = ioapic_read_indirect(ioapic, addr, len);
318 break;
319
320 default:
321 result = 0;
322 break;
323 }
324 switch (len) {
325 case 8:
326 *(u64 *) val = result;
327 break;
328 case 1:
329 case 2:
330 case 4:
331 memcpy(val, (char *)&result, len);
332 break;
333 default:
334 printk(KERN_WARNING "ioapic: wrong length %d\n", len);
335 }
336}
337
338static void ioapic_mmio_write(struct kvm_io_device *this, gpa_t addr, int len,
339 const void *val)
340{
341 struct kvm_ioapic *ioapic = (struct kvm_ioapic *)this->private;
342 u32 data;
343
344 ioapic_debug("ioapic_mmio_write addr=%lx len=%d val=%p\n",
345 addr, len, val);
346 ASSERT(!(addr & 0xf)); /* check alignment */
347 if (len == 4 || len == 8)
348 data = *(u32 *) val;
349 else {
350 printk(KERN_WARNING "ioapic: Unsupported size %d\n", len);
351 return;
352 }
353
354 addr &= 0xff;
355 switch (addr) {
356 case IOAPIC_REG_SELECT:
357 ioapic->ioregsel = data;
358 break;
359
360 case IOAPIC_REG_WINDOW:
361 ioapic_write_indirect(ioapic, data);
362 break;
363
364 default:
365 break;
366 }
367}
368
369int kvm_ioapic_init(struct kvm *kvm)
370{
371 struct kvm_ioapic *ioapic;
372 int i;
373
374 ioapic = kzalloc(sizeof(struct kvm_ioapic), GFP_KERNEL);
375 if (!ioapic)
376 return -ENOMEM;
377 kvm->vioapic = ioapic;
378 for (i = 0; i < IOAPIC_NUM_PINS; i++)
379 ioapic->redirtbl[i].fields.mask = 1;
380 ioapic->base_address = IOAPIC_DEFAULT_BASE_ADDRESS;
381 ioapic->dev.read = ioapic_mmio_read;
382 ioapic->dev.write = ioapic_mmio_write;
383 ioapic->dev.in_range = ioapic_in_range;
384 ioapic->dev.private = ioapic;
385 ioapic->kvm = kvm;
386 kvm_io_bus_register_dev(&kvm->mmio_bus, &ioapic->dev);
387 return 0;
388}
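
ioapic_deliver() above first turns a redirection entry's destination field into a VCPU bitmask via ioapic_get_delivery_bitmask(). A rough userspace model of that decoding, with flat logical IDs and invented names standing in for the patch's kvm_apic_match_physical_addr()/kvm_apic_match_logical_addr() helpers, might look like this:

/*
 * Rough model of I/O APIC destination decoding: physical mode selects a
 * single APIC ID (0xff broadcasts), logical mode ORs in every vcpu whose
 * logical ID intersects the message destination. Illustrative only.
 */
#include <stdio.h>

#define MODEL_MAX_VCPUS 4

/* Flat-model logical IDs, one bit per vcpu, purely illustrative. */
static const unsigned char logical_id[MODEL_MAX_VCPUS] = { 0x1, 0x2, 0x4, 0x8 };

static unsigned int model_delivery_bitmask(unsigned char dest, int dest_mode)
{
	unsigned int mask = 0;
	int i;

	if (dest_mode == 0) {				/* physical mode */
		if (dest == 0xff)			/* broadcast */
			return (1u << MODEL_MAX_VCPUS) - 1;
		if (dest < MODEL_MAX_VCPUS)		/* APIC ID == vcpu index here */
			mask = 1u << dest;
	} else if (dest != 0) {				/* logical mode, MDA non-zero */
		for (i = 0; i < MODEL_MAX_VCPUS; i++)
			if (logical_id[i] & dest)
				mask |= 1u << i;
	}
	return mask;
}

int main(void)
{
	printf("physical dest 2  -> %#x\n", model_delivery_bitmask(2, 0));
	printf("logical dest 0x6 -> %#x\n", model_delivery_bitmask(0x6, 1));
	return 0;
}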
diff --git a/drivers/kvm/irq.c b/drivers/kvm/irq.c
new file mode 100644
index 000000000000..7628c7ff628f
--- /dev/null
+++ b/drivers/kvm/irq.c
@@ -0,0 +1,98 @@
1/*
2 * irq.c: API for in kernel interrupt controller
3 * Copyright (c) 2007, Intel Corporation.
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms and conditions of the GNU General Public License,
7 * version 2, as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 * more details.
13 *
14 * You should have received a copy of the GNU General Public License along with
15 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
16 * Place - Suite 330, Boston, MA 02111-1307 USA.
17 * Authors:
18 * Yaozu (Eddie) Dong <Eddie.dong@intel.com>
19 *
20 */
21
22#include <linux/module.h>
23
24#include "kvm.h"
25#include "irq.h"
26
27/*
28 * check if there is pending interrupt without
29 * intack.
30 */
31int kvm_cpu_has_interrupt(struct kvm_vcpu *v)
32{
33 struct kvm_pic *s;
34
35 if (kvm_apic_has_interrupt(v) == -1) { /* LAPIC */
36 if (kvm_apic_accept_pic_intr(v)) {
37 s = pic_irqchip(v->kvm); /* PIC */
38 return s->output;
39 } else
40 return 0;
41 }
42 return 1;
43}
44EXPORT_SYMBOL_GPL(kvm_cpu_has_interrupt);
45
46/*
47 * Read pending interrupt vector and intack.
48 */
49int kvm_cpu_get_interrupt(struct kvm_vcpu *v)
50{
51 struct kvm_pic *s;
52 int vector;
53
54 vector = kvm_get_apic_interrupt(v); /* APIC */
55 if (vector == -1) {
56 if (kvm_apic_accept_pic_intr(v)) {
57 s = pic_irqchip(v->kvm);
58 s->output = 0; /* PIC */
59 vector = kvm_pic_read_irq(s);
60 }
61 }
62 return vector;
63}
64EXPORT_SYMBOL_GPL(kvm_cpu_get_interrupt);
65
66static void vcpu_kick_intr(void *info)
67{
68#ifdef DEBUG
69 struct kvm_vcpu *vcpu = (struct kvm_vcpu *)info;
70 printk(KERN_DEBUG "vcpu_kick_intr %p \n", vcpu);
71#endif
72}
73
74void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
75{
76 int ipi_pcpu = vcpu->cpu;
77
78 if (waitqueue_active(&vcpu->wq)) {
79 wake_up_interruptible(&vcpu->wq);
80 ++vcpu->stat.halt_wakeup;
81 }
82 if (vcpu->guest_mode)
83 smp_call_function_single(ipi_pcpu, vcpu_kick_intr, vcpu, 0, 0);
84}
85
86void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu)
87{
88 kvm_inject_apic_timer_irqs(vcpu);
89 /* TODO: PIT, RTC etc. */
90}
91EXPORT_SYMBOL_GPL(kvm_inject_pending_timer_irqs);
92
93void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec)
94{
95 kvm_apic_timer_intr_post(vcpu, vec);
96 /* TODO: PIT, RTC etc. */
97}
98EXPORT_SYMBOL_GPL(kvm_timer_intr_post);
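
kvm_cpu_get_interrupt() above encodes the usual x86 arbitration order: the local APIC is queried first, and the PIC output is only read when the APIC has nothing pending and would accept ExtINT. A toy model of that ordering (illustrative only, not part of the patch):

/*
 * Minimal model of the arbitration in kvm_cpu_get_interrupt(): LAPIC
 * first, fall back to the PIC only when the LAPIC is idle and ExtINT
 * delivery is allowed.
 */
#include <stdio.h>

struct model_cpu {
	int apic_vector;	/* -1 when the LAPIC has nothing pending */
	int accept_pic;		/* LVT0 programmed for ExtINT */
	int pic_vector;
};

static int model_get_interrupt(struct model_cpu *c)
{
	if (c->apic_vector != -1)
		return c->apic_vector;
	if (c->accept_pic)
		return c->pic_vector;
	return -1;
}

int main(void)
{
	struct model_cpu c = { .apic_vector = -1, .accept_pic = 1, .pic_vector = 0x08 };

	printf("vector %#x\n", model_get_interrupt(&c));	/* PIC wins: 0x8 */
	c.apic_vector = 0xec;
	printf("vector %#x\n", model_get_interrupt(&c));	/* LAPIC wins: 0xec */
	return 0;
}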
diff --git a/drivers/kvm/irq.h b/drivers/kvm/irq.h
new file mode 100644
index 000000000000..11fc014e2b30
--- /dev/null
+++ b/drivers/kvm/irq.h
@@ -0,0 +1,165 @@
1/*
2 * irq.h: in kernel interrupt controller related definitions
3 * Copyright (c) 2007, Intel Corporation.
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms and conditions of the GNU General Public License,
7 * version 2, as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 * more details.
13 *
14 * You should have received a copy of the GNU General Public License along with
15 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
16 * Place - Suite 330, Boston, MA 02111-1307 USA.
17 * Authors:
18 * Yaozu (Eddie) Dong <Eddie.dong@intel.com>
19 *
20 */
21
22#ifndef __IRQ_H
23#define __IRQ_H
24
25#include "kvm.h"
26
27typedef void irq_request_func(void *opaque, int level);
28
29struct kvm_kpic_state {
30 u8 last_irr; /* edge detection */
31 u8 irr; /* interrupt request register */
32 u8 imr; /* interrupt mask register */
33 u8 isr; /* interrupt service register */
34 u8 priority_add; /* highest irq priority */
35 u8 irq_base;
36 u8 read_reg_select;
37 u8 poll;
38 u8 special_mask;
39 u8 init_state;
40 u8 auto_eoi;
41 u8 rotate_on_auto_eoi;
42 u8 special_fully_nested_mode;
43 u8 init4; /* true if 4 byte init */
44 u8 elcr; /* PIIX edge/trigger selection */
45 u8 elcr_mask;
46 struct kvm_pic *pics_state;
47};
48
49struct kvm_pic {
50 struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */
51 irq_request_func *irq_request;
52 void *irq_request_opaque;
53 int output; /* intr from master PIC */
54 struct kvm_io_device dev;
55};
56
57struct kvm_pic *kvm_create_pic(struct kvm *kvm);
58void kvm_pic_set_irq(void *opaque, int irq, int level);
59int kvm_pic_read_irq(struct kvm_pic *s);
60int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
61int kvm_cpu_has_interrupt(struct kvm_vcpu *v);
62void kvm_pic_update_irq(struct kvm_pic *s);
63
64#define IOAPIC_NUM_PINS KVM_IOAPIC_NUM_PINS
65#define IOAPIC_VERSION_ID 0x11 /* IOAPIC version */
66#define IOAPIC_EDGE_TRIG 0
67#define IOAPIC_LEVEL_TRIG 1
68
69#define IOAPIC_DEFAULT_BASE_ADDRESS 0xfec00000
70#define IOAPIC_MEM_LENGTH 0x100
71
72/* Direct registers. */
73#define IOAPIC_REG_SELECT 0x00
74#define IOAPIC_REG_WINDOW 0x10
75#define IOAPIC_REG_EOI 0x40 /* IA64 IOSAPIC only */
76
77/* Indirect registers. */
78#define IOAPIC_REG_APIC_ID 0x00 /* x86 IOAPIC only */
79#define IOAPIC_REG_VERSION 0x01
80#define IOAPIC_REG_ARB_ID 0x02 /* x86 IOAPIC only */
81
82struct kvm_ioapic {
83 u64 base_address;
84 u32 ioregsel;
85 u32 id;
86 u32 irr;
87 u32 pad;
88 union ioapic_redir_entry {
89 u64 bits;
90 struct {
91 u8 vector;
92 u8 delivery_mode:3;
93 u8 dest_mode:1;
94 u8 delivery_status:1;
95 u8 polarity:1;
96 u8 remote_irr:1;
97 u8 trig_mode:1;
98 u8 mask:1;
99 u8 reserve:7;
100 u8 reserved[4];
101 u8 dest_id;
102 } fields;
103 } redirtbl[IOAPIC_NUM_PINS];
104 struct kvm_io_device dev;
105 struct kvm *kvm;
106};
107
108struct kvm_lapic {
109 unsigned long base_address;
110 struct kvm_io_device dev;
111 struct {
112 atomic_t pending;
113 s64 period; /* unit: ns */
114 u32 divide_count;
115 ktime_t last_update;
116 struct hrtimer dev;
117 } timer;
118 struct kvm_vcpu *vcpu;
119 struct page *regs_page;
120 void *regs;
121};
122
123#ifdef DEBUG
124#define ASSERT(x) \
125do { \
126 if (!(x)) { \
127 printk(KERN_EMERG "assertion failed %s: %d: %s\n", \
128 __FILE__, __LINE__, #x); \
129 BUG(); \
130 } \
131} while (0)
132#else
133#define ASSERT(x) do { } while (0)
134#endif
135
136void kvm_vcpu_kick(struct kvm_vcpu *vcpu);
137int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu);
138int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu);
139int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu);
140int kvm_create_lapic(struct kvm_vcpu *vcpu);
141void kvm_lapic_reset(struct kvm_vcpu *vcpu);
142void kvm_free_apic(struct kvm_lapic *apic);
143u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu);
144void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8);
145void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value);
146struct kvm_lapic *kvm_apic_round_robin(struct kvm *kvm, u8 vector,
147 unsigned long bitmap);
148u64 kvm_get_apic_base(struct kvm_vcpu *vcpu);
149void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data);
150int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest);
151void kvm_ioapic_update_eoi(struct kvm *kvm, int vector);
152int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda);
153int kvm_apic_set_irq(struct kvm_lapic *apic, u8 vec, u8 trig);
154void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu);
155int kvm_ioapic_init(struct kvm *kvm);
156void kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level);
157int kvm_lapic_enabled(struct kvm_vcpu *vcpu);
158int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu);
159void kvm_apic_timer_intr_post(struct kvm_vcpu *vcpu, int vec);
160void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec);
161void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu);
162void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu);
163void kvm_migrate_apic_timer(struct kvm_vcpu *vcpu);
164
165#endif
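
union ioapic_redir_entry above mirrors the 64-bit I/O APIC redirection-table entry: vector in bits 0-7, delivery mode in 8-10, destination mode in bit 11, trigger mode in bit 15, mask in bit 16, and the destination APIC ID in bits 56-63. A small standalone sketch of packing and reading such an entry (not part of the patch):

/*
 * Illustration of the redirection-table entry layout described by
 * union ioapic_redir_entry: build a raw 64-bit entry and read the same
 * fields back with shifts and masks.
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t ent = 0;

	ent |= 0x30;			/* vector 0x30 */
	ent |= (uint64_t)1 << 15;	/* level triggered */
	ent |= (uint64_t)2 << 56;	/* destination APIC ID 2 */

	printf("vector  %#llx\n", (unsigned long long)(ent & 0xff));
	printf("trig    %llu\n", (unsigned long long)((ent >> 15) & 1));
	printf("dest id %llu\n", (unsigned long long)(ent >> 56));
	return 0;
}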
diff --git a/drivers/kvm/kvm.h b/drivers/kvm/kvm.h
index 336be86c6f5a..ad0813843adc 100644
--- a/drivers/kvm/kvm.h
+++ b/drivers/kvm/kvm.h
@@ -13,60 +13,38 @@
13#include <linux/signal.h> 13#include <linux/signal.h>
14#include <linux/sched.h> 14#include <linux/sched.h>
15#include <linux/mm.h> 15#include <linux/mm.h>
16#include <linux/preempt.h>
16#include <asm/signal.h> 17#include <asm/signal.h>
17 18
18#include "vmx.h"
19#include <linux/kvm.h> 19#include <linux/kvm.h>
20#include <linux/kvm_para.h> 20#include <linux/kvm_para.h>
21 21
22#define CR0_PE_MASK (1ULL << 0) 22#define CR3_PAE_RESERVED_BITS ((X86_CR3_PWT | X86_CR3_PCD) - 1)
23#define CR0_MP_MASK (1ULL << 1) 23#define CR3_NONPAE_RESERVED_BITS ((PAGE_SIZE-1) & ~(X86_CR3_PWT | X86_CR3_PCD))
24#define CR0_TS_MASK (1ULL << 3) 24#define CR3_L_MODE_RESERVED_BITS (CR3_NONPAE_RESERVED_BITS|0xFFFFFF0000000000ULL)
25#define CR0_NE_MASK (1ULL << 5)
26#define CR0_WP_MASK (1ULL << 16)
27#define CR0_NW_MASK (1ULL << 29)
28#define CR0_CD_MASK (1ULL << 30)
29#define CR0_PG_MASK (1ULL << 31)
30
31#define CR3_WPT_MASK (1ULL << 3)
32#define CR3_PCD_MASK (1ULL << 4)
33
34#define CR3_RESEVED_BITS 0x07ULL
35#define CR3_L_MODE_RESEVED_BITS (~((1ULL << 40) - 1) | 0x0fe7ULL)
36#define CR3_FLAGS_MASK ((1ULL << 5) - 1)
37
38#define CR4_VME_MASK (1ULL << 0)
39#define CR4_PSE_MASK (1ULL << 4)
40#define CR4_PAE_MASK (1ULL << 5)
41#define CR4_PGE_MASK (1ULL << 7)
42#define CR4_VMXE_MASK (1ULL << 13)
43 25
44#define KVM_GUEST_CR0_MASK \ 26#define KVM_GUEST_CR0_MASK \
45 (CR0_PG_MASK | CR0_PE_MASK | CR0_WP_MASK | CR0_NE_MASK \ 27 (X86_CR0_PG | X86_CR0_PE | X86_CR0_WP | X86_CR0_NE \
46 | CR0_NW_MASK | CR0_CD_MASK) 28 | X86_CR0_NW | X86_CR0_CD)
47#define KVM_VM_CR0_ALWAYS_ON \ 29#define KVM_VM_CR0_ALWAYS_ON \
48 (CR0_PG_MASK | CR0_PE_MASK | CR0_WP_MASK | CR0_NE_MASK | CR0_TS_MASK \ 30 (X86_CR0_PG | X86_CR0_PE | X86_CR0_WP | X86_CR0_NE | X86_CR0_TS \
49 | CR0_MP_MASK) 31 | X86_CR0_MP)
50#define KVM_GUEST_CR4_MASK \ 32#define KVM_GUEST_CR4_MASK \
51 (CR4_PSE_MASK | CR4_PAE_MASK | CR4_PGE_MASK | CR4_VMXE_MASK | CR4_VME_MASK) 33 (X86_CR4_VME | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_PGE | X86_CR4_VMXE)
52#define KVM_PMODE_VM_CR4_ALWAYS_ON (CR4_VMXE_MASK | CR4_PAE_MASK) 34#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE)
53#define KVM_RMODE_VM_CR4_ALWAYS_ON (CR4_VMXE_MASK | CR4_PAE_MASK | CR4_VME_MASK) 35#define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE)
54 36
55#define INVALID_PAGE (~(hpa_t)0) 37#define INVALID_PAGE (~(hpa_t)0)
56#define UNMAPPED_GVA (~(gpa_t)0) 38#define UNMAPPED_GVA (~(gpa_t)0)
57 39
58#define KVM_MAX_VCPUS 4 40#define KVM_MAX_VCPUS 4
59#define KVM_ALIAS_SLOTS 4 41#define KVM_ALIAS_SLOTS 4
60#define KVM_MEMORY_SLOTS 4 42#define KVM_MEMORY_SLOTS 8
61#define KVM_NUM_MMU_PAGES 1024 43#define KVM_NUM_MMU_PAGES 1024
62#define KVM_MIN_FREE_MMU_PAGES 5 44#define KVM_MIN_FREE_MMU_PAGES 5
63#define KVM_REFILL_PAGES 25 45#define KVM_REFILL_PAGES 25
64#define KVM_MAX_CPUID_ENTRIES 40 46#define KVM_MAX_CPUID_ENTRIES 40
65 47
66#define FX_IMAGE_SIZE 512
67#define FX_IMAGE_ALIGN 16
68#define FX_BUF_SIZE (2 * FX_IMAGE_SIZE + FX_IMAGE_ALIGN)
69
70#define DE_VECTOR 0 48#define DE_VECTOR 0
71#define NM_VECTOR 7 49#define NM_VECTOR 7
72#define DF_VECTOR 8 50#define DF_VECTOR 8
@@ -158,15 +136,8 @@ struct kvm_mmu_page {
158 }; 136 };
159}; 137};
160 138
161struct vmcs {
162 u32 revision_id;
163 u32 abort;
164 char data[0];
165};
166
167#define vmx_msr_entry kvm_msr_entry
168
169struct kvm_vcpu; 139struct kvm_vcpu;
140extern struct kmem_cache *kvm_vcpu_cache;
170 141
171/* 142/*
172 * x86 supports 3 paging modes (4-level 64-bit, 3-level 64-bit, and 2-level 143 * x86 supports 3 paging modes (4-level 64-bit, 3-level 64-bit, and 2-level
@@ -260,6 +231,7 @@ struct kvm_stat {
260 u32 signal_exits; 231 u32 signal_exits;
261 u32 irq_window_exits; 232 u32 irq_window_exits;
262 u32 halt_exits; 233 u32 halt_exits;
234 u32 halt_wakeup;
263 u32 request_irq_exits; 235 u32 request_irq_exits;
264 u32 irq_exits; 236 u32 irq_exits;
265 u32 light_exits; 237 u32 light_exits;
@@ -328,21 +300,17 @@ void kvm_io_bus_register_dev(struct kvm_io_bus *bus,
328 300
329struct kvm_vcpu { 301struct kvm_vcpu {
330 struct kvm *kvm; 302 struct kvm *kvm;
331 union { 303 struct preempt_notifier preempt_notifier;
332 struct vmcs *vmcs; 304 int vcpu_id;
333 struct vcpu_svm *svm;
334 };
335 struct mutex mutex; 305 struct mutex mutex;
336 int cpu; 306 int cpu;
337 int launched;
338 u64 host_tsc; 307 u64 host_tsc;
339 struct kvm_run *run; 308 struct kvm_run *run;
340 int interrupt_window_open; 309 int interrupt_window_open;
341 int guest_mode; 310 int guest_mode;
342 unsigned long requests; 311 unsigned long requests;
343 unsigned long irq_summary; /* bit vector: 1 per word in irq_pending */ 312 unsigned long irq_summary; /* bit vector: 1 per word in irq_pending */
344#define NR_IRQ_WORDS KVM_IRQ_BITMAP_SIZE(unsigned long) 313 DECLARE_BITMAP(irq_pending, KVM_NR_INTERRUPTS);
345 unsigned long irq_pending[NR_IRQ_WORDS];
346 unsigned long regs[NR_VCPU_REGS]; /* for rsp: vcpu_load_rsp_rip() */ 314 unsigned long regs[NR_VCPU_REGS]; /* for rsp: vcpu_load_rsp_rip() */
347 unsigned long rip; /* needs vcpu_load_rsp_rip() */ 315 unsigned long rip; /* needs vcpu_load_rsp_rip() */
348 316
@@ -357,15 +325,15 @@ struct kvm_vcpu {
357 u64 pdptrs[4]; /* pae */ 325 u64 pdptrs[4]; /* pae */
358 u64 shadow_efer; 326 u64 shadow_efer;
359 u64 apic_base; 327 u64 apic_base;
328 struct kvm_lapic *apic; /* kernel irqchip context */
329#define VCPU_MP_STATE_RUNNABLE 0
330#define VCPU_MP_STATE_UNINITIALIZED 1
331#define VCPU_MP_STATE_INIT_RECEIVED 2
332#define VCPU_MP_STATE_SIPI_RECEIVED 3
333#define VCPU_MP_STATE_HALTED 4
334 int mp_state;
335 int sipi_vector;
360 u64 ia32_misc_enable_msr; 336 u64 ia32_misc_enable_msr;
361 int nmsrs;
362 int save_nmsrs;
363 int msr_offset_efer;
364#ifdef CONFIG_X86_64
365 int msr_offset_kernel_gs_base;
366#endif
367 struct vmx_msr_entry *guest_msrs;
368 struct vmx_msr_entry *host_msrs;
369 337
370 struct kvm_mmu mmu; 338 struct kvm_mmu mmu;
371 339
@@ -379,16 +347,10 @@ struct kvm_vcpu {
379 347
380 struct kvm_guest_debug guest_debug; 348 struct kvm_guest_debug guest_debug;
381 349
382 char fx_buf[FX_BUF_SIZE]; 350 struct i387_fxsave_struct host_fx_image;
383 char *host_fx_image; 351 struct i387_fxsave_struct guest_fx_image;
384 char *guest_fx_image;
385 int fpu_active; 352 int fpu_active;
386 int guest_fpu_loaded; 353 int guest_fpu_loaded;
387 struct vmx_host_state {
388 int loaded;
389 u16 fs_sel, gs_sel, ldt_sel;
390 int fs_gs_ldt_reload_needed;
391 } vmx_host_state;
392 354
393 int mmio_needed; 355 int mmio_needed;
394 int mmio_read_completed; 356 int mmio_read_completed;
@@ -399,6 +361,7 @@ struct kvm_vcpu {
399 gva_t mmio_fault_cr2; 361 gva_t mmio_fault_cr2;
400 struct kvm_pio_request pio; 362 struct kvm_pio_request pio;
401 void *pio_data; 363 void *pio_data;
364 wait_queue_head_t wq;
402 365
403 int sigset_active; 366 int sigset_active;
404 sigset_t sigset; 367 sigset_t sigset;
@@ -436,7 +399,7 @@ struct kvm_memory_slot {
436}; 399};
437 400
438struct kvm { 401struct kvm {
439 spinlock_t lock; /* protects everything except vcpus */ 402 struct mutex lock; /* protects everything except vcpus */
440 int naliases; 403 int naliases;
441 struct kvm_mem_alias aliases[KVM_ALIAS_SLOTS]; 404 struct kvm_mem_alias aliases[KVM_ALIAS_SLOTS];
442 int nmemslots; 405 int nmemslots;
@@ -447,39 +410,59 @@ struct kvm {
447 struct list_head active_mmu_pages; 410 struct list_head active_mmu_pages;
448 int n_free_mmu_pages; 411 int n_free_mmu_pages;
449 struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES]; 412 struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
450 int nvcpus; 413 struct kvm_vcpu *vcpus[KVM_MAX_VCPUS];
451 struct kvm_vcpu vcpus[KVM_MAX_VCPUS];
452 int memory_config_version;
453 int busy;
454 unsigned long rmap_overflow; 414 unsigned long rmap_overflow;
455 struct list_head vm_list; 415 struct list_head vm_list;
456 struct file *filp; 416 struct file *filp;
457 struct kvm_io_bus mmio_bus; 417 struct kvm_io_bus mmio_bus;
458 struct kvm_io_bus pio_bus; 418 struct kvm_io_bus pio_bus;
419 struct kvm_pic *vpic;
420 struct kvm_ioapic *vioapic;
421 int round_robin_prev_vcpu;
459}; 422};
460 423
424static inline struct kvm_pic *pic_irqchip(struct kvm *kvm)
425{
426 return kvm->vpic;
427}
428
429static inline struct kvm_ioapic *ioapic_irqchip(struct kvm *kvm)
430{
431 return kvm->vioapic;
432}
433
434static inline int irqchip_in_kernel(struct kvm *kvm)
435{
436 return pic_irqchip(kvm) != 0;
437}
438
461struct descriptor_table { 439struct descriptor_table {
462 u16 limit; 440 u16 limit;
463 unsigned long base; 441 unsigned long base;
464} __attribute__((packed)); 442} __attribute__((packed));
465 443
466struct kvm_arch_ops { 444struct kvm_x86_ops {
467 int (*cpu_has_kvm_support)(void); /* __init */ 445 int (*cpu_has_kvm_support)(void); /* __init */
468 int (*disabled_by_bios)(void); /* __init */ 446 int (*disabled_by_bios)(void); /* __init */
469 void (*hardware_enable)(void *dummy); /* __init */ 447 void (*hardware_enable)(void *dummy); /* __init */
470 void (*hardware_disable)(void *dummy); 448 void (*hardware_disable)(void *dummy);
449 void (*check_processor_compatibility)(void *rtn);
471 int (*hardware_setup)(void); /* __init */ 450 int (*hardware_setup)(void); /* __init */
472 void (*hardware_unsetup)(void); /* __exit */ 451 void (*hardware_unsetup)(void); /* __exit */
473 452
474 int (*vcpu_create)(struct kvm_vcpu *vcpu); 453 /* Create, but do not attach this VCPU */
454 struct kvm_vcpu *(*vcpu_create)(struct kvm *kvm, unsigned id);
475 void (*vcpu_free)(struct kvm_vcpu *vcpu); 455 void (*vcpu_free)(struct kvm_vcpu *vcpu);
456 void (*vcpu_reset)(struct kvm_vcpu *vcpu);
476 457
477 void (*vcpu_load)(struct kvm_vcpu *vcpu); 458 void (*prepare_guest_switch)(struct kvm_vcpu *vcpu);
459 void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu);
478 void (*vcpu_put)(struct kvm_vcpu *vcpu); 460 void (*vcpu_put)(struct kvm_vcpu *vcpu);
479 void (*vcpu_decache)(struct kvm_vcpu *vcpu); 461 void (*vcpu_decache)(struct kvm_vcpu *vcpu);
480 462
481 int (*set_guest_debug)(struct kvm_vcpu *vcpu, 463 int (*set_guest_debug)(struct kvm_vcpu *vcpu,
482 struct kvm_debug_guest *dbg); 464 struct kvm_debug_guest *dbg);
465 void (*guest_debug_pre)(struct kvm_vcpu *vcpu);
483 int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata); 466 int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata);
484 int (*set_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 data); 467 int (*set_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
485 u64 (*get_segment_base)(struct kvm_vcpu *vcpu, int seg); 468 u64 (*get_segment_base)(struct kvm_vcpu *vcpu, int seg);
@@ -505,27 +488,43 @@ struct kvm_arch_ops {
505 unsigned long (*get_rflags)(struct kvm_vcpu *vcpu); 488 unsigned long (*get_rflags)(struct kvm_vcpu *vcpu);
506 void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags); 489 void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags);
507 490
508 void (*invlpg)(struct kvm_vcpu *vcpu, gva_t addr);
509 void (*tlb_flush)(struct kvm_vcpu *vcpu); 491 void (*tlb_flush)(struct kvm_vcpu *vcpu);
510 void (*inject_page_fault)(struct kvm_vcpu *vcpu, 492 void (*inject_page_fault)(struct kvm_vcpu *vcpu,
511 unsigned long addr, u32 err_code); 493 unsigned long addr, u32 err_code);
512 494
513 void (*inject_gp)(struct kvm_vcpu *vcpu, unsigned err_code); 495 void (*inject_gp)(struct kvm_vcpu *vcpu, unsigned err_code);
514 496
515 int (*run)(struct kvm_vcpu *vcpu, struct kvm_run *run); 497 void (*run)(struct kvm_vcpu *vcpu, struct kvm_run *run);
516 int (*vcpu_setup)(struct kvm_vcpu *vcpu); 498 int (*handle_exit)(struct kvm_run *run, struct kvm_vcpu *vcpu);
517 void (*skip_emulated_instruction)(struct kvm_vcpu *vcpu); 499 void (*skip_emulated_instruction)(struct kvm_vcpu *vcpu);
518 void (*patch_hypercall)(struct kvm_vcpu *vcpu, 500 void (*patch_hypercall)(struct kvm_vcpu *vcpu,
519 unsigned char *hypercall_addr); 501 unsigned char *hypercall_addr);
502 int (*get_irq)(struct kvm_vcpu *vcpu);
503 void (*set_irq)(struct kvm_vcpu *vcpu, int vec);
504 void (*inject_pending_irq)(struct kvm_vcpu *vcpu);
505 void (*inject_pending_vectors)(struct kvm_vcpu *vcpu,
506 struct kvm_run *run);
520}; 507};
521 508
522extern struct kvm_arch_ops *kvm_arch_ops; 509extern struct kvm_x86_ops *kvm_x86_ops;
510
511/* The guest did something we don't support. */
512#define pr_unimpl(vcpu, fmt, ...) \
513 do { \
514 if (printk_ratelimit()) \
515 printk(KERN_ERR "kvm: %i: cpu%i " fmt, \
516 current->tgid, (vcpu)->vcpu_id , ## __VA_ARGS__); \
517 } while(0)
523 518
524#define kvm_printf(kvm, fmt ...) printk(KERN_DEBUG fmt) 519#define kvm_printf(kvm, fmt ...) printk(KERN_DEBUG fmt)
525#define vcpu_printf(vcpu, fmt...) kvm_printf(vcpu->kvm, fmt) 520#define vcpu_printf(vcpu, fmt...) kvm_printf(vcpu->kvm, fmt)
526 521
527int kvm_init_arch(struct kvm_arch_ops *ops, struct module *module); 522int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id);
528void kvm_exit_arch(void); 523void kvm_vcpu_uninit(struct kvm_vcpu *vcpu);
524
525int kvm_init_x86(struct kvm_x86_ops *ops, unsigned int vcpu_size,
526 struct module *module);
527void kvm_exit_x86(void);
529 528
530int kvm_mmu_module_init(void); 529int kvm_mmu_module_init(void);
531void kvm_mmu_module_exit(void); 530void kvm_mmu_module_exit(void);
@@ -545,8 +544,6 @@ static inline int is_error_hpa(hpa_t hpa) { return hpa >> HPA_MSB; }
545hpa_t gva_to_hpa(struct kvm_vcpu *vcpu, gva_t gva); 544hpa_t gva_to_hpa(struct kvm_vcpu *vcpu, gva_t gva);
546struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva); 545struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva);
547 546
548void kvm_emulator_want_group7_invlpg(void);
549
550extern hpa_t bad_page_address; 547extern hpa_t bad_page_address;
551 548
552struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn); 549struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn);
@@ -561,6 +558,7 @@ enum emulation_result {
561 558
562int emulate_instruction(struct kvm_vcpu *vcpu, struct kvm_run *run, 559int emulate_instruction(struct kvm_vcpu *vcpu, struct kvm_run *run,
563 unsigned long cr2, u16 error_code); 560 unsigned long cr2, u16 error_code);
561void kvm_report_emulation_failure(struct kvm_vcpu *cvpu, const char *context);
564void realmode_lgdt(struct kvm_vcpu *vcpu, u16 size, unsigned long address); 562void realmode_lgdt(struct kvm_vcpu *vcpu, u16 size, unsigned long address);
565void realmode_lidt(struct kvm_vcpu *vcpu, u16 size, unsigned long address); 563void realmode_lidt(struct kvm_vcpu *vcpu, u16 size, unsigned long address);
566void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw, 564void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
@@ -574,9 +572,11 @@ int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
574 572
575struct x86_emulate_ctxt; 573struct x86_emulate_ctxt;
576 574
577int kvm_setup_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, 575int kvm_emulate_pio (struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
578 int size, unsigned long count, int string, int down, 576 int size, unsigned port);
579 gva_t address, int rep, unsigned port); 577int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
578 int size, unsigned long count, int down,
579 gva_t address, int rep, unsigned port);
580void kvm_emulate_cpuid(struct kvm_vcpu *vcpu); 580void kvm_emulate_cpuid(struct kvm_vcpu *vcpu);
581int kvm_emulate_halt(struct kvm_vcpu *vcpu); 581int kvm_emulate_halt(struct kvm_vcpu *vcpu);
582int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address); 582int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address);
@@ -590,34 +590,33 @@ void set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
590void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr0); 590void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr0);
591void set_cr4(struct kvm_vcpu *vcpu, unsigned long cr0); 591void set_cr4(struct kvm_vcpu *vcpu, unsigned long cr0);
592void set_cr8(struct kvm_vcpu *vcpu, unsigned long cr0); 592void set_cr8(struct kvm_vcpu *vcpu, unsigned long cr0);
593unsigned long get_cr8(struct kvm_vcpu *vcpu);
593void lmsw(struct kvm_vcpu *vcpu, unsigned long msw); 594void lmsw(struct kvm_vcpu *vcpu, unsigned long msw);
595void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l);
594 596
595int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata); 597int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata);
596int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data); 598int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data);
597 599
598void fx_init(struct kvm_vcpu *vcpu); 600void fx_init(struct kvm_vcpu *vcpu);
599 601
600void load_msrs(struct vmx_msr_entry *e, int n);
601void save_msrs(struct vmx_msr_entry *e, int n);
602void kvm_resched(struct kvm_vcpu *vcpu); 602void kvm_resched(struct kvm_vcpu *vcpu);
603void kvm_load_guest_fpu(struct kvm_vcpu *vcpu); 603void kvm_load_guest_fpu(struct kvm_vcpu *vcpu);
604void kvm_put_guest_fpu(struct kvm_vcpu *vcpu); 604void kvm_put_guest_fpu(struct kvm_vcpu *vcpu);
605void kvm_flush_remote_tlbs(struct kvm *kvm); 605void kvm_flush_remote_tlbs(struct kvm *kvm);
606 606
607int kvm_read_guest(struct kvm_vcpu *vcpu, 607int emulator_read_std(unsigned long addr,
608 gva_t addr, 608 void *val,
609 unsigned long size, 609 unsigned int bytes,
610 void *dest); 610 struct kvm_vcpu *vcpu);
611 611int emulator_write_emulated(unsigned long addr,
612int kvm_write_guest(struct kvm_vcpu *vcpu, 612 const void *val,
613 gva_t addr, 613 unsigned int bytes,
614 unsigned long size, 614 struct kvm_vcpu *vcpu);
615 void *data);
616 615
617unsigned long segment_base(u16 selector); 616unsigned long segment_base(u16 selector);
618 617
619void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, 618void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
620 const u8 *old, const u8 *new, int bytes); 619 const u8 *new, int bytes);
621int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva); 620int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva);
622void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu); 621void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu);
623int kvm_mmu_load(struct kvm_vcpu *vcpu); 622int kvm_mmu_load(struct kvm_vcpu *vcpu);
@@ -656,17 +655,17 @@ static inline int is_long_mode(struct kvm_vcpu *vcpu)
656 655
657static inline int is_pae(struct kvm_vcpu *vcpu) 656static inline int is_pae(struct kvm_vcpu *vcpu)
658{ 657{
659 return vcpu->cr4 & CR4_PAE_MASK; 658 return vcpu->cr4 & X86_CR4_PAE;
660} 659}
661 660
662static inline int is_pse(struct kvm_vcpu *vcpu) 661static inline int is_pse(struct kvm_vcpu *vcpu)
663{ 662{
664 return vcpu->cr4 & CR4_PSE_MASK; 663 return vcpu->cr4 & X86_CR4_PSE;
665} 664}
666 665
667static inline int is_paging(struct kvm_vcpu *vcpu) 666static inline int is_paging(struct kvm_vcpu *vcpu)
668{ 667{
669 return vcpu->cr0 & CR0_PG_MASK; 668 return vcpu->cr0 & X86_CR0_PG;
670} 669}
671 670
672static inline int memslot_id(struct kvm *kvm, struct kvm_memory_slot *slot) 671static inline int memslot_id(struct kvm *kvm, struct kvm_memory_slot *slot)
@@ -746,12 +745,12 @@ static inline unsigned long read_msr(unsigned long msr)
746} 745}
747#endif 746#endif
748 747
749static inline void fx_save(void *image) 748static inline void fx_save(struct i387_fxsave_struct *image)
750{ 749{
751 asm ("fxsave (%0)":: "r" (image)); 750 asm ("fxsave (%0)":: "r" (image));
752} 751}
753 752
754static inline void fx_restore(void *image) 753static inline void fx_restore(struct i387_fxsave_struct *image)
755{ 754{
756 asm ("fxrstor (%0)":: "r" (image)); 755 asm ("fxrstor (%0)":: "r" (image));
757} 756}
diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c
index cd0557954e50..353e58527d15 100644
--- a/drivers/kvm/kvm_main.c
+++ b/drivers/kvm/kvm_main.c
@@ -18,6 +18,7 @@
18#include "kvm.h" 18#include "kvm.h"
19#include "x86_emulate.h" 19#include "x86_emulate.h"
20#include "segment_descriptor.h" 20#include "segment_descriptor.h"
21#include "irq.h"
21 22
22#include <linux/kvm.h> 23#include <linux/kvm.h>
23#include <linux/module.h> 24#include <linux/module.h>
@@ -37,6 +38,7 @@
37#include <linux/cpumask.h> 38#include <linux/cpumask.h>
38#include <linux/smp.h> 39#include <linux/smp.h>
39#include <linux/anon_inodes.h> 40#include <linux/anon_inodes.h>
41#include <linux/profile.h>
40 42
41#include <asm/processor.h> 43#include <asm/processor.h>
42#include <asm/msr.h> 44#include <asm/msr.h>
@@ -52,9 +54,11 @@ static LIST_HEAD(vm_list);
52 54
53static cpumask_t cpus_hardware_enabled; 55static cpumask_t cpus_hardware_enabled;
54 56
55struct kvm_arch_ops *kvm_arch_ops; 57struct kvm_x86_ops *kvm_x86_ops;
58struct kmem_cache *kvm_vcpu_cache;
59EXPORT_SYMBOL_GPL(kvm_vcpu_cache);
56 60
57static void hardware_disable(void *ignored); 61static __read_mostly struct preempt_ops kvm_preempt_ops;
58 62
59#define STAT_OFFSET(x) offsetof(struct kvm_vcpu, stat.x) 63#define STAT_OFFSET(x) offsetof(struct kvm_vcpu, stat.x)
60 64
@@ -73,6 +77,7 @@ static struct kvm_stats_debugfs_item {
73 { "signal_exits", STAT_OFFSET(signal_exits) }, 77 { "signal_exits", STAT_OFFSET(signal_exits) },
74 { "irq_window", STAT_OFFSET(irq_window_exits) }, 78 { "irq_window", STAT_OFFSET(irq_window_exits) },
75 { "halt_exits", STAT_OFFSET(halt_exits) }, 79 { "halt_exits", STAT_OFFSET(halt_exits) },
80 { "halt_wakeup", STAT_OFFSET(halt_wakeup) },
76 { "request_irq", STAT_OFFSET(request_irq_exits) }, 81 { "request_irq", STAT_OFFSET(request_irq_exits) },
77 { "irq_exits", STAT_OFFSET(irq_exits) }, 82 { "irq_exits", STAT_OFFSET(irq_exits) },
78 { "light_exits", STAT_OFFSET(light_exits) }, 83 { "light_exits", STAT_OFFSET(light_exits) },
@@ -84,10 +89,17 @@ static struct dentry *debugfs_dir;
84 89
85#define MAX_IO_MSRS 256 90#define MAX_IO_MSRS 256
86 91
87#define CR0_RESEVED_BITS 0xffffffff1ffaffc0ULL 92#define CR0_RESERVED_BITS \
88#define LMSW_GUEST_MASK 0x0eULL 93 (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
89#define CR4_RESEVED_BITS (~((1ULL << 11) - 1)) 94 | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \
90#define CR8_RESEVED_BITS (~0x0fULL) 95 | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG))
96#define CR4_RESERVED_BITS \
97 (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\
98 | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \
99 | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR \
100 | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE))
101
102#define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
91#define EFER_RESERVED_BITS 0xfffffffffffff2fe 103#define EFER_RESERVED_BITS 0xfffffffffffff2fe
92 104
93#ifdef CONFIG_X86_64 105#ifdef CONFIG_X86_64
@@ -139,82 +151,14 @@ static inline int valid_vcpu(int n)
139 return likely(n >= 0 && n < KVM_MAX_VCPUS); 151 return likely(n >= 0 && n < KVM_MAX_VCPUS);
140} 152}
141 153
142int kvm_read_guest(struct kvm_vcpu *vcpu, gva_t addr, unsigned long size,
143 void *dest)
144{
145 unsigned char *host_buf = dest;
146 unsigned long req_size = size;
147
148 while (size) {
149 hpa_t paddr;
150 unsigned now;
151 unsigned offset;
152 hva_t guest_buf;
153
154 paddr = gva_to_hpa(vcpu, addr);
155
156 if (is_error_hpa(paddr))
157 break;
158
159 guest_buf = (hva_t)kmap_atomic(
160 pfn_to_page(paddr >> PAGE_SHIFT),
161 KM_USER0);
162 offset = addr & ~PAGE_MASK;
163 guest_buf |= offset;
164 now = min(size, PAGE_SIZE - offset);
165 memcpy(host_buf, (void*)guest_buf, now);
166 host_buf += now;
167 addr += now;
168 size -= now;
169 kunmap_atomic((void *)(guest_buf & PAGE_MASK), KM_USER0);
170 }
171 return req_size - size;
172}
173EXPORT_SYMBOL_GPL(kvm_read_guest);
174
175int kvm_write_guest(struct kvm_vcpu *vcpu, gva_t addr, unsigned long size,
176 void *data)
177{
178 unsigned char *host_buf = data;
179 unsigned long req_size = size;
180
181 while (size) {
182 hpa_t paddr;
183 unsigned now;
184 unsigned offset;
185 hva_t guest_buf;
186 gfn_t gfn;
187
188 paddr = gva_to_hpa(vcpu, addr);
189
190 if (is_error_hpa(paddr))
191 break;
192
193 gfn = vcpu->mmu.gva_to_gpa(vcpu, addr) >> PAGE_SHIFT;
194 mark_page_dirty(vcpu->kvm, gfn);
195 guest_buf = (hva_t)kmap_atomic(
196 pfn_to_page(paddr >> PAGE_SHIFT), KM_USER0);
197 offset = addr & ~PAGE_MASK;
198 guest_buf |= offset;
199 now = min(size, PAGE_SIZE - offset);
200 memcpy((void*)guest_buf, host_buf, now);
201 host_buf += now;
202 addr += now;
203 size -= now;
204 kunmap_atomic((void *)(guest_buf & PAGE_MASK), KM_USER0);
205 }
206 return req_size - size;
207}
208EXPORT_SYMBOL_GPL(kvm_write_guest);
209
210void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) 154void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
211{ 155{
212 if (!vcpu->fpu_active || vcpu->guest_fpu_loaded) 156 if (!vcpu->fpu_active || vcpu->guest_fpu_loaded)
213 return; 157 return;
214 158
215 vcpu->guest_fpu_loaded = 1; 159 vcpu->guest_fpu_loaded = 1;
216 fx_save(vcpu->host_fx_image); 160 fx_save(&vcpu->host_fx_image);
217 fx_restore(vcpu->guest_fx_image); 161 fx_restore(&vcpu->guest_fx_image);
218} 162}
219EXPORT_SYMBOL_GPL(kvm_load_guest_fpu); 163EXPORT_SYMBOL_GPL(kvm_load_guest_fpu);
220 164
@@ -224,8 +168,8 @@ void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
224 return; 168 return;
225 169
226 vcpu->guest_fpu_loaded = 0; 170 vcpu->guest_fpu_loaded = 0;
227 fx_save(vcpu->guest_fx_image); 171 fx_save(&vcpu->guest_fx_image);
228 fx_restore(vcpu->host_fx_image); 172 fx_restore(&vcpu->host_fx_image);
229} 173}
230EXPORT_SYMBOL_GPL(kvm_put_guest_fpu); 174EXPORT_SYMBOL_GPL(kvm_put_guest_fpu);
231 175
@@ -234,13 +178,21 @@ EXPORT_SYMBOL_GPL(kvm_put_guest_fpu);
234 */ 178 */
235static void vcpu_load(struct kvm_vcpu *vcpu) 179static void vcpu_load(struct kvm_vcpu *vcpu)
236{ 180{
181 int cpu;
182
237 mutex_lock(&vcpu->mutex); 183 mutex_lock(&vcpu->mutex);
238 kvm_arch_ops->vcpu_load(vcpu); 184 cpu = get_cpu();
185 preempt_notifier_register(&vcpu->preempt_notifier);
186 kvm_x86_ops->vcpu_load(vcpu, cpu);
187 put_cpu();
239} 188}
240 189
241static void vcpu_put(struct kvm_vcpu *vcpu) 190static void vcpu_put(struct kvm_vcpu *vcpu)
242{ 191{
243 kvm_arch_ops->vcpu_put(vcpu); 192 preempt_disable();
193 kvm_x86_ops->vcpu_put(vcpu);
194 preempt_notifier_unregister(&vcpu->preempt_notifier);
195 preempt_enable();
244 mutex_unlock(&vcpu->mutex); 196 mutex_unlock(&vcpu->mutex);
245} 197}
246 198
@@ -261,8 +213,10 @@ void kvm_flush_remote_tlbs(struct kvm *kvm)
261 atomic_set(&completed, 0); 213 atomic_set(&completed, 0);
262 cpus_clear(cpus); 214 cpus_clear(cpus);
263 needed = 0; 215 needed = 0;
264 for (i = 0; i < kvm->nvcpus; ++i) { 216 for (i = 0; i < KVM_MAX_VCPUS; ++i) {
265 vcpu = &kvm->vcpus[i]; 217 vcpu = kvm->vcpus[i];
218 if (!vcpu)
219 continue;
266 if (test_and_set_bit(KVM_TLB_FLUSH, &vcpu->requests)) 220 if (test_and_set_bit(KVM_TLB_FLUSH, &vcpu->requests))
267 continue; 221 continue;
268 cpu = vcpu->cpu; 222 cpu = vcpu->cpu;
@@ -286,37 +240,79 @@ void kvm_flush_remote_tlbs(struct kvm *kvm)
286 } 240 }
287} 241}
288 242
243int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
244{
245 struct page *page;
246 int r;
247
248 mutex_init(&vcpu->mutex);
249 vcpu->cpu = -1;
250 vcpu->mmu.root_hpa = INVALID_PAGE;
251 vcpu->kvm = kvm;
252 vcpu->vcpu_id = id;
253 if (!irqchip_in_kernel(kvm) || id == 0)
254 vcpu->mp_state = VCPU_MP_STATE_RUNNABLE;
255 else
256 vcpu->mp_state = VCPU_MP_STATE_UNINITIALIZED;
257 init_waitqueue_head(&vcpu->wq);
258
259 page = alloc_page(GFP_KERNEL | __GFP_ZERO);
260 if (!page) {
261 r = -ENOMEM;
262 goto fail;
263 }
264 vcpu->run = page_address(page);
265
266 page = alloc_page(GFP_KERNEL | __GFP_ZERO);
267 if (!page) {
268 r = -ENOMEM;
269 goto fail_free_run;
270 }
271 vcpu->pio_data = page_address(page);
272
273 r = kvm_mmu_create(vcpu);
274 if (r < 0)
275 goto fail_free_pio_data;
276
277 return 0;
278
279fail_free_pio_data:
280 free_page((unsigned long)vcpu->pio_data);
281fail_free_run:
282 free_page((unsigned long)vcpu->run);
283fail:
284 return -ENOMEM;
285}
286EXPORT_SYMBOL_GPL(kvm_vcpu_init);
287
288void kvm_vcpu_uninit(struct kvm_vcpu *vcpu)
289{
290 kvm_mmu_destroy(vcpu);
291 if (vcpu->apic)
292 hrtimer_cancel(&vcpu->apic->timer.dev);
293 kvm_free_apic(vcpu->apic);
294 free_page((unsigned long)vcpu->pio_data);
295 free_page((unsigned long)vcpu->run);
296}
297EXPORT_SYMBOL_GPL(kvm_vcpu_uninit);
298
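[Editor's aside, not part of the patch] kvm_vcpu_init()/kvm_vcpu_uninit() pull the per-vcpu setup (run page, pio page, MMU, wait queue, mp_state) out of kvm_create_vm(), so vcpus can be allocated one at a time rather than as a fixed array. A vendor module's vcpu_create is expected to embed a struct kvm_vcpu in its own structure, call kvm_vcpu_init() first, and unwind with kfree()/kvm_vcpu_uninit() on failure; a rough sketch (illustrative, the real vmx.c/svm.c code differs in detail):

        struct sample_vcpu {
                struct kvm_vcpu vcpu;
                /* vendor-private state (VMCS/VMCB, MSR save areas, ...) follows */
        };

        static struct kvm_vcpu *sample_vcpu_create(struct kvm *kvm, unsigned id)
        {
                struct sample_vcpu *v = kzalloc(sizeof(*v), GFP_KERNEL);
                int err;

                if (!v)
                        return ERR_PTR(-ENOMEM);

                err = kvm_vcpu_init(&v->vcpu, kvm, id); /* common state + mmu */
                if (err)
                        goto free_vcpu;

                /* ... allocate and load vendor control structures here ... */

                return &v->vcpu;

        free_vcpu:
                kfree(v);
                return ERR_PTR(err);
        }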
289static struct kvm *kvm_create_vm(void) 299static struct kvm *kvm_create_vm(void)
290{ 300{
291 struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL); 301 struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
292 int i;
293 302
294 if (!kvm) 303 if (!kvm)
295 return ERR_PTR(-ENOMEM); 304 return ERR_PTR(-ENOMEM);
296 305
297 kvm_io_bus_init(&kvm->pio_bus); 306 kvm_io_bus_init(&kvm->pio_bus);
298 spin_lock_init(&kvm->lock); 307 mutex_init(&kvm->lock);
299 INIT_LIST_HEAD(&kvm->active_mmu_pages); 308 INIT_LIST_HEAD(&kvm->active_mmu_pages);
300 kvm_io_bus_init(&kvm->mmio_bus); 309 kvm_io_bus_init(&kvm->mmio_bus);
301 for (i = 0; i < KVM_MAX_VCPUS; ++i) {
302 struct kvm_vcpu *vcpu = &kvm->vcpus[i];
303
304 mutex_init(&vcpu->mutex);
305 vcpu->cpu = -1;
306 vcpu->kvm = kvm;
307 vcpu->mmu.root_hpa = INVALID_PAGE;
308 }
309 spin_lock(&kvm_lock); 310 spin_lock(&kvm_lock);
310 list_add(&kvm->vm_list, &vm_list); 311 list_add(&kvm->vm_list, &vm_list);
311 spin_unlock(&kvm_lock); 312 spin_unlock(&kvm_lock);
312 return kvm; 313 return kvm;
313} 314}
314 315
315static int kvm_dev_open(struct inode *inode, struct file *filp)
316{
317 return 0;
318}
319
320/* 316/*
321 * Free any memory in @free but not in @dont. 317 * Free any memory in @free but not in @dont.
322 */ 318 */
@@ -353,7 +349,7 @@ static void free_pio_guest_pages(struct kvm_vcpu *vcpu)
353{ 349{
354 int i; 350 int i;
355 351
356 for (i = 0; i < 2; ++i) 352 for (i = 0; i < ARRAY_SIZE(vcpu->pio.guest_pages); ++i)
357 if (vcpu->pio.guest_pages[i]) { 353 if (vcpu->pio.guest_pages[i]) {
358 __free_page(vcpu->pio.guest_pages[i]); 354 __free_page(vcpu->pio.guest_pages[i]);
359 vcpu->pio.guest_pages[i] = NULL; 355 vcpu->pio.guest_pages[i] = NULL;
@@ -362,30 +358,11 @@ static void free_pio_guest_pages(struct kvm_vcpu *vcpu)
362 358
363static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu) 359static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
364{ 360{
365 if (!vcpu->vmcs)
366 return;
367
368 vcpu_load(vcpu); 361 vcpu_load(vcpu);
369 kvm_mmu_unload(vcpu); 362 kvm_mmu_unload(vcpu);
370 vcpu_put(vcpu); 363 vcpu_put(vcpu);
371} 364}
372 365
373static void kvm_free_vcpu(struct kvm_vcpu *vcpu)
374{
375 if (!vcpu->vmcs)
376 return;
377
378 vcpu_load(vcpu);
379 kvm_mmu_destroy(vcpu);
380 vcpu_put(vcpu);
381 kvm_arch_ops->vcpu_free(vcpu);
382 free_page((unsigned long)vcpu->run);
383 vcpu->run = NULL;
384 free_page((unsigned long)vcpu->pio_data);
385 vcpu->pio_data = NULL;
386 free_pio_guest_pages(vcpu);
387}
388
389static void kvm_free_vcpus(struct kvm *kvm) 366static void kvm_free_vcpus(struct kvm *kvm)
390{ 367{
391 unsigned int i; 368 unsigned int i;
@@ -394,14 +371,15 @@ static void kvm_free_vcpus(struct kvm *kvm)
394 * Unpin any mmu pages first. 371 * Unpin any mmu pages first.
395 */ 372 */
396 for (i = 0; i < KVM_MAX_VCPUS; ++i) 373 for (i = 0; i < KVM_MAX_VCPUS; ++i)
397 kvm_unload_vcpu_mmu(&kvm->vcpus[i]); 374 if (kvm->vcpus[i])
398 for (i = 0; i < KVM_MAX_VCPUS; ++i) 375 kvm_unload_vcpu_mmu(kvm->vcpus[i]);
399 kvm_free_vcpu(&kvm->vcpus[i]); 376 for (i = 0; i < KVM_MAX_VCPUS; ++i) {
400} 377 if (kvm->vcpus[i]) {
378 kvm_x86_ops->vcpu_free(kvm->vcpus[i]);
379 kvm->vcpus[i] = NULL;
380 }
381 }
401 382
402static int kvm_dev_release(struct inode *inode, struct file *filp)
403{
404 return 0;
405} 383}
406 384
407static void kvm_destroy_vm(struct kvm *kvm) 385static void kvm_destroy_vm(struct kvm *kvm)
@@ -411,6 +389,8 @@ static void kvm_destroy_vm(struct kvm *kvm)
411 spin_unlock(&kvm_lock); 389 spin_unlock(&kvm_lock);
412 kvm_io_bus_destroy(&kvm->pio_bus); 390 kvm_io_bus_destroy(&kvm->pio_bus);
413 kvm_io_bus_destroy(&kvm->mmio_bus); 391 kvm_io_bus_destroy(&kvm->mmio_bus);
392 kfree(kvm->vpic);
393 kfree(kvm->vioapic);
414 kvm_free_vcpus(kvm); 394 kvm_free_vcpus(kvm);
415 kvm_free_physmem(kvm); 395 kvm_free_physmem(kvm);
416 kfree(kvm); 396 kfree(kvm);
@@ -426,7 +406,7 @@ static int kvm_vm_release(struct inode *inode, struct file *filp)
426 406
427static void inject_gp(struct kvm_vcpu *vcpu) 407static void inject_gp(struct kvm_vcpu *vcpu)
428{ 408{
429 kvm_arch_ops->inject_gp(vcpu, 0); 409 kvm_x86_ops->inject_gp(vcpu, 0);
430} 410}
431 411
432/* 412/*
@@ -437,58 +417,60 @@ static int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
437 gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT; 417 gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
438 unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2; 418 unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2;
439 int i; 419 int i;
440 u64 pdpte;
441 u64 *pdpt; 420 u64 *pdpt;
442 int ret; 421 int ret;
443 struct page *page; 422 struct page *page;
423 u64 pdpte[ARRAY_SIZE(vcpu->pdptrs)];
444 424
445 spin_lock(&vcpu->kvm->lock); 425 mutex_lock(&vcpu->kvm->lock);
446 page = gfn_to_page(vcpu->kvm, pdpt_gfn); 426 page = gfn_to_page(vcpu->kvm, pdpt_gfn);
447 /* FIXME: !page - emulate? 0xff? */ 427 if (!page) {
428 ret = 0;
429 goto out;
430 }
431
448 pdpt = kmap_atomic(page, KM_USER0); 432 pdpt = kmap_atomic(page, KM_USER0);
433 memcpy(pdpte, pdpt+offset, sizeof(pdpte));
434 kunmap_atomic(pdpt, KM_USER0);
449 435
450 ret = 1; 436 for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
451 for (i = 0; i < 4; ++i) { 437 if ((pdpte[i] & 1) && (pdpte[i] & 0xfffffff0000001e6ull)) {
452 pdpte = pdpt[offset + i];
453 if ((pdpte & 1) && (pdpte & 0xfffffff0000001e6ull)) {
454 ret = 0; 438 ret = 0;
455 goto out; 439 goto out;
456 } 440 }
457 } 441 }
442 ret = 1;
458 443
459 for (i = 0; i < 4; ++i) 444 memcpy(vcpu->pdptrs, pdpte, sizeof(vcpu->pdptrs));
460 vcpu->pdptrs[i] = pdpt[offset + i];
461
462out: 445out:
463 kunmap_atomic(pdpt, KM_USER0); 446 mutex_unlock(&vcpu->kvm->lock);
464 spin_unlock(&vcpu->kvm->lock);
465 447
466 return ret; 448 return ret;
467} 449}
468 450
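[Editor's aside, not part of the patch] The rewritten load_pdptrs() snapshots all four PDPTEs with one memcpy() under kmap_atomic() and validates them after dropping the mapping, instead of validating in place, and it bails out cleanly when the guest frame is not backed. An entry is rejected only if it is present (bit 0) and has a reserved bit set; the magic mask breaks down as follows (standalone check, illustrative, assuming the 36-bit physical address width the original constant encodes):

        #include <assert.h>
        #include <stdint.h>

        int main(void)
        {
                uint64_t above_maxphyaddr = 0xfffffff000000000ULL;   /* bits 63:36 */
                uint64_t pdpte_reserved   = 0x00000000000001e0ULL;   /* bits 8:5   */
                uint64_t rw_us            = 0x0000000000000006ULL;   /* bits 2:1   */

                assert((above_maxphyaddr | pdpte_reserved | rw_us)
                       == 0xfffffff0000001e6ULL);
                return 0;
        }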
469void set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) 451void set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
470{ 452{
471 if (cr0 & CR0_RESEVED_BITS) { 453 if (cr0 & CR0_RESERVED_BITS) {
472 printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n", 454 printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n",
473 cr0, vcpu->cr0); 455 cr0, vcpu->cr0);
474 inject_gp(vcpu); 456 inject_gp(vcpu);
475 return; 457 return;
476 } 458 }
477 459
478 if ((cr0 & CR0_NW_MASK) && !(cr0 & CR0_CD_MASK)) { 460 if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) {
479 printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n"); 461 printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n");
480 inject_gp(vcpu); 462 inject_gp(vcpu);
481 return; 463 return;
482 } 464 }
483 465
484 if ((cr0 & CR0_PG_MASK) && !(cr0 & CR0_PE_MASK)) { 466 if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) {
485 printk(KERN_DEBUG "set_cr0: #GP, set PG flag " 467 printk(KERN_DEBUG "set_cr0: #GP, set PG flag "
486 "and a clear PE flag\n"); 468 "and a clear PE flag\n");
487 inject_gp(vcpu); 469 inject_gp(vcpu);
488 return; 470 return;
489 } 471 }
490 472
491 if (!is_paging(vcpu) && (cr0 & CR0_PG_MASK)) { 473 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
492#ifdef CONFIG_X86_64 474#ifdef CONFIG_X86_64
493 if ((vcpu->shadow_efer & EFER_LME)) { 475 if ((vcpu->shadow_efer & EFER_LME)) {
494 int cs_db, cs_l; 476 int cs_db, cs_l;
@@ -499,7 +481,7 @@ void set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
499 inject_gp(vcpu); 481 inject_gp(vcpu);
500 return; 482 return;
501 } 483 }
502 kvm_arch_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); 484 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
503 if (cs_l) { 485 if (cs_l) {
504 printk(KERN_DEBUG "set_cr0: #GP, start paging " 486 printk(KERN_DEBUG "set_cr0: #GP, start paging "
505 "in long mode while CS.L == 1\n"); 487 "in long mode while CS.L == 1\n");
@@ -518,12 +500,12 @@ void set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
518 500
519 } 501 }
520 502
521 kvm_arch_ops->set_cr0(vcpu, cr0); 503 kvm_x86_ops->set_cr0(vcpu, cr0);
522 vcpu->cr0 = cr0; 504 vcpu->cr0 = cr0;
523 505
524 spin_lock(&vcpu->kvm->lock); 506 mutex_lock(&vcpu->kvm->lock);
525 kvm_mmu_reset_context(vcpu); 507 kvm_mmu_reset_context(vcpu);
526 spin_unlock(&vcpu->kvm->lock); 508 mutex_unlock(&vcpu->kvm->lock);
527 return; 509 return;
528} 510}
529EXPORT_SYMBOL_GPL(set_cr0); 511EXPORT_SYMBOL_GPL(set_cr0);
@@ -536,62 +518,72 @@ EXPORT_SYMBOL_GPL(lmsw);
536 518
537void set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) 519void set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
538{ 520{
539 if (cr4 & CR4_RESEVED_BITS) { 521 if (cr4 & CR4_RESERVED_BITS) {
540 printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n"); 522 printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n");
541 inject_gp(vcpu); 523 inject_gp(vcpu);
542 return; 524 return;
543 } 525 }
544 526
545 if (is_long_mode(vcpu)) { 527 if (is_long_mode(vcpu)) {
546 if (!(cr4 & CR4_PAE_MASK)) { 528 if (!(cr4 & X86_CR4_PAE)) {
547 printk(KERN_DEBUG "set_cr4: #GP, clearing PAE while " 529 printk(KERN_DEBUG "set_cr4: #GP, clearing PAE while "
548 "in long mode\n"); 530 "in long mode\n");
549 inject_gp(vcpu); 531 inject_gp(vcpu);
550 return; 532 return;
551 } 533 }
552 } else if (is_paging(vcpu) && !is_pae(vcpu) && (cr4 & CR4_PAE_MASK) 534 } else if (is_paging(vcpu) && !is_pae(vcpu) && (cr4 & X86_CR4_PAE)
553 && !load_pdptrs(vcpu, vcpu->cr3)) { 535 && !load_pdptrs(vcpu, vcpu->cr3)) {
554 printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n"); 536 printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n");
555 inject_gp(vcpu); 537 inject_gp(vcpu);
538 return;
556 } 539 }
557 540
558 if (cr4 & CR4_VMXE_MASK) { 541 if (cr4 & X86_CR4_VMXE) {
559 printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n"); 542 printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n");
560 inject_gp(vcpu); 543 inject_gp(vcpu);
561 return; 544 return;
562 } 545 }
563 kvm_arch_ops->set_cr4(vcpu, cr4); 546 kvm_x86_ops->set_cr4(vcpu, cr4);
564 spin_lock(&vcpu->kvm->lock); 547 vcpu->cr4 = cr4;
548 mutex_lock(&vcpu->kvm->lock);
565 kvm_mmu_reset_context(vcpu); 549 kvm_mmu_reset_context(vcpu);
566 spin_unlock(&vcpu->kvm->lock); 550 mutex_unlock(&vcpu->kvm->lock);
567} 551}
568EXPORT_SYMBOL_GPL(set_cr4); 552EXPORT_SYMBOL_GPL(set_cr4);
569 553
570void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) 554void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
571{ 555{
572 if (is_long_mode(vcpu)) { 556 if (is_long_mode(vcpu)) {
573 if (cr3 & CR3_L_MODE_RESEVED_BITS) { 557 if (cr3 & CR3_L_MODE_RESERVED_BITS) {
574 printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n"); 558 printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n");
575 inject_gp(vcpu); 559 inject_gp(vcpu);
576 return; 560 return;
577 } 561 }
578 } else { 562 } else {
579 if (cr3 & CR3_RESEVED_BITS) { 563 if (is_pae(vcpu)) {
580 printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n"); 564 if (cr3 & CR3_PAE_RESERVED_BITS) {
581 inject_gp(vcpu); 565 printk(KERN_DEBUG
582 return; 566 "set_cr3: #GP, reserved bits\n");
583 } 567 inject_gp(vcpu);
584 if (is_paging(vcpu) && is_pae(vcpu) && 568 return;
585 !load_pdptrs(vcpu, cr3)) { 569 }
586 printk(KERN_DEBUG "set_cr3: #GP, pdptrs " 570 if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) {
587 "reserved bits\n"); 571 printk(KERN_DEBUG "set_cr3: #GP, pdptrs "
588 inject_gp(vcpu); 572 "reserved bits\n");
589 return; 573 inject_gp(vcpu);
574 return;
575 }
576 } else {
577 if (cr3 & CR3_NONPAE_RESERVED_BITS) {
578 printk(KERN_DEBUG
579 "set_cr3: #GP, reserved bits\n");
580 inject_gp(vcpu);
581 return;
582 }
590 } 583 }
591 } 584 }
592 585
593 vcpu->cr3 = cr3; 586 mutex_lock(&vcpu->kvm->lock);
594 spin_lock(&vcpu->kvm->lock);
595 /* 587 /*
596 * Does the new cr3 value map to physical memory? (Note, we 588 * Does the new cr3 value map to physical memory? (Note, we
597 * catch an invalid cr3 even in real-mode, because it would 589 * catch an invalid cr3 even in real-mode, because it would
@@ -603,46 +595,73 @@ void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
603 */ 595 */
604 if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT))) 596 if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT)))
605 inject_gp(vcpu); 597 inject_gp(vcpu);
606 else 598 else {
599 vcpu->cr3 = cr3;
607 vcpu->mmu.new_cr3(vcpu); 600 vcpu->mmu.new_cr3(vcpu);
608 spin_unlock(&vcpu->kvm->lock); 601 }
602 mutex_unlock(&vcpu->kvm->lock);
609} 603}
610EXPORT_SYMBOL_GPL(set_cr3); 604EXPORT_SYMBOL_GPL(set_cr3);
611 605
612void set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) 606void set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
613{ 607{
614 if ( cr8 & CR8_RESEVED_BITS) { 608 if (cr8 & CR8_RESERVED_BITS) {
615 printk(KERN_DEBUG "set_cr8: #GP, reserved bits 0x%lx\n", cr8); 609 printk(KERN_DEBUG "set_cr8: #GP, reserved bits 0x%lx\n", cr8);
616 inject_gp(vcpu); 610 inject_gp(vcpu);
617 return; 611 return;
618 } 612 }
619 vcpu->cr8 = cr8; 613 if (irqchip_in_kernel(vcpu->kvm))
614 kvm_lapic_set_tpr(vcpu, cr8);
615 else
616 vcpu->cr8 = cr8;
620} 617}
621EXPORT_SYMBOL_GPL(set_cr8); 618EXPORT_SYMBOL_GPL(set_cr8);
622 619
623void fx_init(struct kvm_vcpu *vcpu) 620unsigned long get_cr8(struct kvm_vcpu *vcpu)
621{
622 if (irqchip_in_kernel(vcpu->kvm))
623 return kvm_lapic_get_cr8(vcpu);
624 else
625 return vcpu->cr8;
626}
627EXPORT_SYMBOL_GPL(get_cr8);
628
629u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
624{ 630{
625 struct __attribute__ ((__packed__)) fx_image_s { 631 if (irqchip_in_kernel(vcpu->kvm))
626 u16 control; //fcw 632 return vcpu->apic_base;
627 u16 status; //fsw 633 else
628 u16 tag; // ftw 634 return vcpu->apic_base;
629 u16 opcode; //fop 635}
630 u64 ip; // fpu ip 636EXPORT_SYMBOL_GPL(kvm_get_apic_base);
631 u64 operand;// fpu dp
632 u32 mxcsr;
633 u32 mxcsr_mask;
634 637
635 } *fx_image; 638void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data)
639{
640 /* TODO: reserve bits check */
641 if (irqchip_in_kernel(vcpu->kvm))
642 kvm_lapic_set_base(vcpu, data);
643 else
644 vcpu->apic_base = data;
645}
646EXPORT_SYMBOL_GPL(kvm_set_apic_base);
647
648void fx_init(struct kvm_vcpu *vcpu)
649{
650 unsigned after_mxcsr_mask;
636 651
637 fx_save(vcpu->host_fx_image); 652 /* Initialize guest FPU by resetting ours and saving into guest's */
653 preempt_disable();
654 fx_save(&vcpu->host_fx_image);
638 fpu_init(); 655 fpu_init();
639 fx_save(vcpu->guest_fx_image); 656 fx_save(&vcpu->guest_fx_image);
640 fx_restore(vcpu->host_fx_image); 657 fx_restore(&vcpu->host_fx_image);
658 preempt_enable();
641 659
642 fx_image = (struct fx_image_s *)vcpu->guest_fx_image; 660 vcpu->cr0 |= X86_CR0_ET;
643 fx_image->mxcsr = 0x1f80; 661 after_mxcsr_mask = offsetof(struct i387_fxsave_struct, st_space);
644 memset(vcpu->guest_fx_image + sizeof(struct fx_image_s), 662 vcpu->guest_fx_image.mxcsr = 0x1f80;
645 0, FX_IMAGE_SIZE - sizeof(struct fx_image_s)); 663 memset((void *)&vcpu->guest_fx_image + after_mxcsr_mask,
664 0, sizeof(struct i387_fxsave_struct) - after_mxcsr_mask);
646} 665}
647EXPORT_SYMBOL_GPL(fx_init); 666EXPORT_SYMBOL_GPL(fx_init);
648 667
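[Editor's aside, not part of the patch] With an in-kernel irqchip, CR8 is no longer an independent field: architecturally it mirrors bits 7:4 of the local APIC task-priority register, so set_cr8()/get_cr8() forward to the new lapic helpers when irqchip_in_kernel() is true and fall back to vcpu->cr8 otherwise. The conversion the lapic.c helpers perform (they are added elsewhere in this commit and not shown in this hunk) is roughly:

        /* illustrative only; the real helpers operate on the APIC register page */
        static inline unsigned long tpr_to_cr8(u32 tpr)
        {
                return (tpr >> 4) & 0x0f;       /* CR8 holds the priority class */
        }

        static inline u32 cr8_to_tpr(unsigned long cr8)
        {
                return (cr8 & 0x0f) << 4;
        }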
@@ -661,7 +680,6 @@ static int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
661 unsigned long i; 680 unsigned long i;
662 struct kvm_memory_slot *memslot; 681 struct kvm_memory_slot *memslot;
663 struct kvm_memory_slot old, new; 682 struct kvm_memory_slot old, new;
664 int memory_config_version;
665 683
666 r = -EINVAL; 684 r = -EINVAL;
667 /* General sanity checks */ 685 /* General sanity checks */
@@ -681,10 +699,8 @@ static int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
681 if (!npages) 699 if (!npages)
682 mem->flags &= ~KVM_MEM_LOG_DIRTY_PAGES; 700 mem->flags &= ~KVM_MEM_LOG_DIRTY_PAGES;
683 701
684raced: 702 mutex_lock(&kvm->lock);
685 spin_lock(&kvm->lock);
686 703
687 memory_config_version = kvm->memory_config_version;
688 new = old = *memslot; 704 new = old = *memslot;
689 705
690 new.base_gfn = base_gfn; 706 new.base_gfn = base_gfn;
@@ -707,11 +723,6 @@ raced:
707 (base_gfn >= s->base_gfn + s->npages))) 723 (base_gfn >= s->base_gfn + s->npages)))
708 goto out_unlock; 724 goto out_unlock;
709 } 725 }
710 /*
711 * Do memory allocations outside lock. memory_config_version will
712 * detect any races.
713 */
714 spin_unlock(&kvm->lock);
715 726
716 /* Deallocate if slot is being removed */ 727 /* Deallocate if slot is being removed */
717 if (!npages) 728 if (!npages)
@@ -728,14 +739,14 @@ raced:
728 new.phys_mem = vmalloc(npages * sizeof(struct page *)); 739 new.phys_mem = vmalloc(npages * sizeof(struct page *));
729 740
730 if (!new.phys_mem) 741 if (!new.phys_mem)
731 goto out_free; 742 goto out_unlock;
732 743
733 memset(new.phys_mem, 0, npages * sizeof(struct page *)); 744 memset(new.phys_mem, 0, npages * sizeof(struct page *));
734 for (i = 0; i < npages; ++i) { 745 for (i = 0; i < npages; ++i) {
735 new.phys_mem[i] = alloc_page(GFP_HIGHUSER 746 new.phys_mem[i] = alloc_page(GFP_HIGHUSER
736 | __GFP_ZERO); 747 | __GFP_ZERO);
737 if (!new.phys_mem[i]) 748 if (!new.phys_mem[i])
738 goto out_free; 749 goto out_unlock;
739 set_page_private(new.phys_mem[i],0); 750 set_page_private(new.phys_mem[i],0);
740 } 751 }
741 } 752 }
@@ -746,39 +757,25 @@ raced:
746 757
747 new.dirty_bitmap = vmalloc(dirty_bytes); 758 new.dirty_bitmap = vmalloc(dirty_bytes);
748 if (!new.dirty_bitmap) 759 if (!new.dirty_bitmap)
749 goto out_free; 760 goto out_unlock;
750 memset(new.dirty_bitmap, 0, dirty_bytes); 761 memset(new.dirty_bitmap, 0, dirty_bytes);
751 } 762 }
752 763
753 spin_lock(&kvm->lock);
754
755 if (memory_config_version != kvm->memory_config_version) {
756 spin_unlock(&kvm->lock);
757 kvm_free_physmem_slot(&new, &old);
758 goto raced;
759 }
760
761 r = -EAGAIN;
762 if (kvm->busy)
763 goto out_unlock;
764
765 if (mem->slot >= kvm->nmemslots) 764 if (mem->slot >= kvm->nmemslots)
766 kvm->nmemslots = mem->slot + 1; 765 kvm->nmemslots = mem->slot + 1;
767 766
768 *memslot = new; 767 *memslot = new;
769 ++kvm->memory_config_version;
770 768
771 kvm_mmu_slot_remove_write_access(kvm, mem->slot); 769 kvm_mmu_slot_remove_write_access(kvm, mem->slot);
772 kvm_flush_remote_tlbs(kvm); 770 kvm_flush_remote_tlbs(kvm);
773 771
774 spin_unlock(&kvm->lock); 772 mutex_unlock(&kvm->lock);
775 773
776 kvm_free_physmem_slot(&old, &new); 774 kvm_free_physmem_slot(&old, &new);
777 return 0; 775 return 0;
778 776
779out_unlock: 777out_unlock:
780 spin_unlock(&kvm->lock); 778 mutex_unlock(&kvm->lock);
781out_free:
782 kvm_free_physmem_slot(&new, &old); 779 kvm_free_physmem_slot(&new, &old);
783out: 780out:
784 return r; 781 return r;
@@ -795,14 +792,8 @@ static int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
795 int n; 792 int n;
796 unsigned long any = 0; 793 unsigned long any = 0;
797 794
798 spin_lock(&kvm->lock); 795 mutex_lock(&kvm->lock);
799 796
800 /*
801 * Prevent changes to guest memory configuration even while the lock
802 * is not taken.
803 */
804 ++kvm->busy;
805 spin_unlock(&kvm->lock);
806 r = -EINVAL; 797 r = -EINVAL;
807 if (log->slot >= KVM_MEMORY_SLOTS) 798 if (log->slot >= KVM_MEMORY_SLOTS)
808 goto out; 799 goto out;
@@ -821,18 +812,17 @@ static int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
821 if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n)) 812 if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n))
822 goto out; 813 goto out;
823 814
824 spin_lock(&kvm->lock); 815 /* If nothing is dirty, don't bother messing with page tables. */
825 kvm_mmu_slot_remove_write_access(kvm, log->slot); 816 if (any) {
826 kvm_flush_remote_tlbs(kvm); 817 kvm_mmu_slot_remove_write_access(kvm, log->slot);
827 memset(memslot->dirty_bitmap, 0, n); 818 kvm_flush_remote_tlbs(kvm);
828 spin_unlock(&kvm->lock); 819 memset(memslot->dirty_bitmap, 0, n);
820 }
829 821
830 r = 0; 822 r = 0;
831 823
832out: 824out:
833 spin_lock(&kvm->lock); 825 mutex_unlock(&kvm->lock);
834 --kvm->busy;
835 spin_unlock(&kvm->lock);
836 return r; 826 return r;
837} 827}
838 828
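[Editor's aside, not part of the patch] The dirty-log path no longer juggles the kvm->busy counter; it simply holds kvm->lock (now a mutex) for the whole operation and skips the write-protect and TLB-flush work when nothing was dirty. Nothing changes for userspace: the bitmap is still fetched with KVM_GET_DIRTY_LOG, roughly like this (illustrative sketch, error handling trimmed):

        #include <stdlib.h>
        #include <string.h>
        #include <sys/ioctl.h>
        #include <linux/kvm.h>

        static void *fetch_dirty_bitmap(int vm_fd, int slot, size_t npages)
        {
                /* the kernel copies out a long-aligned bitmap, so round up */
                size_t bytes = ((npages + 63) / 64) * 8;
                void *bitmap = calloc(1, bytes);
                struct kvm_dirty_log log;

                if (!bitmap)
                        return NULL;

                memset(&log, 0, sizeof(log));
                log.slot = slot;
                log.dirty_bitmap = bitmap;

                if (ioctl(vm_fd, KVM_GET_DIRTY_LOG, &log) < 0) {
                        free(bitmap);
                        return NULL;
                }
                return bitmap;          /* bit n set => page n of the slot is dirty */
        }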
@@ -862,7 +852,7 @@ static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm,
862 < alias->target_phys_addr) 852 < alias->target_phys_addr)
863 goto out; 853 goto out;
864 854
865 spin_lock(&kvm->lock); 855 mutex_lock(&kvm->lock);
866 856
867 p = &kvm->aliases[alias->slot]; 857 p = &kvm->aliases[alias->slot];
868 p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT; 858 p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT;
@@ -876,7 +866,7 @@ static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm,
876 866
877 kvm_mmu_zap_all(kvm); 867 kvm_mmu_zap_all(kvm);
878 868
879 spin_unlock(&kvm->lock); 869 mutex_unlock(&kvm->lock);
880 870
881 return 0; 871 return 0;
882 872
@@ -884,6 +874,63 @@ out:
884 return r; 874 return r;
885} 875}
886 876
877static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
878{
879 int r;
880
881 r = 0;
882 switch (chip->chip_id) {
883 case KVM_IRQCHIP_PIC_MASTER:
884 memcpy (&chip->chip.pic,
885 &pic_irqchip(kvm)->pics[0],
886 sizeof(struct kvm_pic_state));
887 break;
888 case KVM_IRQCHIP_PIC_SLAVE:
889 memcpy (&chip->chip.pic,
890 &pic_irqchip(kvm)->pics[1],
891 sizeof(struct kvm_pic_state));
892 break;
893 case KVM_IRQCHIP_IOAPIC:
894 memcpy (&chip->chip.ioapic,
895 ioapic_irqchip(kvm),
896 sizeof(struct kvm_ioapic_state));
897 break;
898 default:
899 r = -EINVAL;
900 break;
901 }
902 return r;
903}
904
905static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
906{
907 int r;
908
909 r = 0;
910 switch (chip->chip_id) {
911 case KVM_IRQCHIP_PIC_MASTER:
912 memcpy (&pic_irqchip(kvm)->pics[0],
913 &chip->chip.pic,
914 sizeof(struct kvm_pic_state));
915 break;
916 case KVM_IRQCHIP_PIC_SLAVE:
917 memcpy (&pic_irqchip(kvm)->pics[1],
918 &chip->chip.pic,
919 sizeof(struct kvm_pic_state));
920 break;
921 case KVM_IRQCHIP_IOAPIC:
922 memcpy (ioapic_irqchip(kvm),
923 &chip->chip.ioapic,
924 sizeof(struct kvm_ioapic_state));
925 break;
926 default:
927 r = -EINVAL;
928 break;
929 }
930 kvm_pic_update_irq(pic_irqchip(kvm));
931 return r;
932}
933
887static gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn) 934static gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
888{ 935{
889 int i; 936 int i;
@@ -930,37 +977,26 @@ struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
930} 977}
931EXPORT_SYMBOL_GPL(gfn_to_page); 978EXPORT_SYMBOL_GPL(gfn_to_page);
932 979
980/* WARNING: Does not work on aliased pages. */
933void mark_page_dirty(struct kvm *kvm, gfn_t gfn) 981void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
934{ 982{
935 int i;
936 struct kvm_memory_slot *memslot; 983 struct kvm_memory_slot *memslot;
937 unsigned long rel_gfn;
938 984
939 for (i = 0; i < kvm->nmemslots; ++i) { 985 memslot = __gfn_to_memslot(kvm, gfn);
940 memslot = &kvm->memslots[i]; 986 if (memslot && memslot->dirty_bitmap) {
941 987 unsigned long rel_gfn = gfn - memslot->base_gfn;
942 if (gfn >= memslot->base_gfn
943 && gfn < memslot->base_gfn + memslot->npages) {
944 988
945 if (!memslot->dirty_bitmap) 989 /* avoid RMW */
946 return; 990 if (!test_bit(rel_gfn, memslot->dirty_bitmap))
947 991 set_bit(rel_gfn, memslot->dirty_bitmap);
948 rel_gfn = gfn - memslot->base_gfn;
949
950 /* avoid RMW */
951 if (!test_bit(rel_gfn, memslot->dirty_bitmap))
952 set_bit(rel_gfn, memslot->dirty_bitmap);
953 return;
954 }
955 } 992 }
956} 993}
957 994
958static int emulator_read_std(unsigned long addr, 995int emulator_read_std(unsigned long addr,
959 void *val, 996 void *val,
960 unsigned int bytes, 997 unsigned int bytes,
961 struct x86_emulate_ctxt *ctxt) 998 struct kvm_vcpu *vcpu)
962{ 999{
963 struct kvm_vcpu *vcpu = ctxt->vcpu;
964 void *data = val; 1000 void *data = val;
965 1001
966 while (bytes) { 1002 while (bytes) {
@@ -990,26 +1026,42 @@ static int emulator_read_std(unsigned long addr,
990 1026
991 return X86EMUL_CONTINUE; 1027 return X86EMUL_CONTINUE;
992} 1028}
1029EXPORT_SYMBOL_GPL(emulator_read_std);
993 1030
994static int emulator_write_std(unsigned long addr, 1031static int emulator_write_std(unsigned long addr,
995 const void *val, 1032 const void *val,
996 unsigned int bytes, 1033 unsigned int bytes,
997 struct x86_emulate_ctxt *ctxt) 1034 struct kvm_vcpu *vcpu)
998{ 1035{
999 printk(KERN_ERR "emulator_write_std: addr %lx n %d\n", 1036 pr_unimpl(vcpu, "emulator_write_std: addr %lx n %d\n", addr, bytes);
1000 addr, bytes);
1001 return X86EMUL_UNHANDLEABLE; 1037 return X86EMUL_UNHANDLEABLE;
1002} 1038}
1003 1039
1040/*
1041 * Only apic need an MMIO device hook, so shortcut now..
1042 */
1043static struct kvm_io_device *vcpu_find_pervcpu_dev(struct kvm_vcpu *vcpu,
1044 gpa_t addr)
1045{
1046 struct kvm_io_device *dev;
1047
1048 if (vcpu->apic) {
1049 dev = &vcpu->apic->dev;
1050 if (dev->in_range(dev, addr))
1051 return dev;
1052 }
1053 return NULL;
1054}
1055
1004static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu, 1056static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu,
1005 gpa_t addr) 1057 gpa_t addr)
1006{ 1058{
1007 /* 1059 struct kvm_io_device *dev;
1008 * Note that its important to have this wrapper function because 1060
1009 * in the very near future we will be checking for MMIOs against 1061 dev = vcpu_find_pervcpu_dev(vcpu, addr);
1010 * the LAPIC as well as the general MMIO bus 1062 if (dev == NULL)
1011 */ 1063 dev = kvm_io_bus_find_dev(&vcpu->kvm->mmio_bus, addr);
1012 return kvm_io_bus_find_dev(&vcpu->kvm->mmio_bus, addr); 1064 return dev;
1013} 1065}
1014 1066
1015static struct kvm_io_device *vcpu_find_pio_dev(struct kvm_vcpu *vcpu, 1067static struct kvm_io_device *vcpu_find_pio_dev(struct kvm_vcpu *vcpu,
@@ -1021,9 +1073,8 @@ static struct kvm_io_device *vcpu_find_pio_dev(struct kvm_vcpu *vcpu,
1021static int emulator_read_emulated(unsigned long addr, 1073static int emulator_read_emulated(unsigned long addr,
1022 void *val, 1074 void *val,
1023 unsigned int bytes, 1075 unsigned int bytes,
1024 struct x86_emulate_ctxt *ctxt) 1076 struct kvm_vcpu *vcpu)
1025{ 1077{
1026 struct kvm_vcpu *vcpu = ctxt->vcpu;
1027 struct kvm_io_device *mmio_dev; 1078 struct kvm_io_device *mmio_dev;
1028 gpa_t gpa; 1079 gpa_t gpa;
1029 1080
@@ -1031,7 +1082,7 @@ static int emulator_read_emulated(unsigned long addr,
1031 memcpy(val, vcpu->mmio_data, bytes); 1082 memcpy(val, vcpu->mmio_data, bytes);
1032 vcpu->mmio_read_completed = 0; 1083 vcpu->mmio_read_completed = 0;
1033 return X86EMUL_CONTINUE; 1084 return X86EMUL_CONTINUE;
1034 } else if (emulator_read_std(addr, val, bytes, ctxt) 1085 } else if (emulator_read_std(addr, val, bytes, vcpu)
1035 == X86EMUL_CONTINUE) 1086 == X86EMUL_CONTINUE)
1036 return X86EMUL_CONTINUE; 1087 return X86EMUL_CONTINUE;
1037 1088
@@ -1061,7 +1112,6 @@ static int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
1061{ 1112{
1062 struct page *page; 1113 struct page *page;
1063 void *virt; 1114 void *virt;
1064 unsigned offset = offset_in_page(gpa);
1065 1115
1066 if (((gpa + bytes - 1) >> PAGE_SHIFT) != (gpa >> PAGE_SHIFT)) 1116 if (((gpa + bytes - 1) >> PAGE_SHIFT) != (gpa >> PAGE_SHIFT))
1067 return 0; 1117 return 0;
@@ -1070,7 +1120,7 @@ static int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
1070 return 0; 1120 return 0;
1071 mark_page_dirty(vcpu->kvm, gpa >> PAGE_SHIFT); 1121 mark_page_dirty(vcpu->kvm, gpa >> PAGE_SHIFT);
1072 virt = kmap_atomic(page, KM_USER0); 1122 virt = kmap_atomic(page, KM_USER0);
1073 kvm_mmu_pte_write(vcpu, gpa, virt + offset, val, bytes); 1123 kvm_mmu_pte_write(vcpu, gpa, val, bytes);
1074 memcpy(virt + offset_in_page(gpa), val, bytes); 1124 memcpy(virt + offset_in_page(gpa), val, bytes);
1075 kunmap_atomic(virt, KM_USER0); 1125 kunmap_atomic(virt, KM_USER0);
1076 return 1; 1126 return 1;
@@ -1079,14 +1129,13 @@ static int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
1079static int emulator_write_emulated_onepage(unsigned long addr, 1129static int emulator_write_emulated_onepage(unsigned long addr,
1080 const void *val, 1130 const void *val,
1081 unsigned int bytes, 1131 unsigned int bytes,
1082 struct x86_emulate_ctxt *ctxt) 1132 struct kvm_vcpu *vcpu)
1083{ 1133{
1084 struct kvm_vcpu *vcpu = ctxt->vcpu;
1085 struct kvm_io_device *mmio_dev; 1134 struct kvm_io_device *mmio_dev;
1086 gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr); 1135 gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
1087 1136
1088 if (gpa == UNMAPPED_GVA) { 1137 if (gpa == UNMAPPED_GVA) {
1089 kvm_arch_ops->inject_page_fault(vcpu, addr, 2); 1138 kvm_x86_ops->inject_page_fault(vcpu, addr, 2);
1090 return X86EMUL_PROPAGATE_FAULT; 1139 return X86EMUL_PROPAGATE_FAULT;
1091 } 1140 }
1092 1141
@@ -1111,31 +1160,32 @@ static int emulator_write_emulated_onepage(unsigned long addr,
1111 return X86EMUL_CONTINUE; 1160 return X86EMUL_CONTINUE;
1112} 1161}
1113 1162
1114static int emulator_write_emulated(unsigned long addr, 1163int emulator_write_emulated(unsigned long addr,
1115 const void *val, 1164 const void *val,
1116 unsigned int bytes, 1165 unsigned int bytes,
1117 struct x86_emulate_ctxt *ctxt) 1166 struct kvm_vcpu *vcpu)
1118{ 1167{
1119 /* Crossing a page boundary? */ 1168 /* Crossing a page boundary? */
1120 if (((addr + bytes - 1) ^ addr) & PAGE_MASK) { 1169 if (((addr + bytes - 1) ^ addr) & PAGE_MASK) {
1121 int rc, now; 1170 int rc, now;
1122 1171
1123 now = -addr & ~PAGE_MASK; 1172 now = -addr & ~PAGE_MASK;
1124 rc = emulator_write_emulated_onepage(addr, val, now, ctxt); 1173 rc = emulator_write_emulated_onepage(addr, val, now, vcpu);
1125 if (rc != X86EMUL_CONTINUE) 1174 if (rc != X86EMUL_CONTINUE)
1126 return rc; 1175 return rc;
1127 addr += now; 1176 addr += now;
1128 val += now; 1177 val += now;
1129 bytes -= now; 1178 bytes -= now;
1130 } 1179 }
1131 return emulator_write_emulated_onepage(addr, val, bytes, ctxt); 1180 return emulator_write_emulated_onepage(addr, val, bytes, vcpu);
1132} 1181}
1182EXPORT_SYMBOL_GPL(emulator_write_emulated);
1133 1183
1134static int emulator_cmpxchg_emulated(unsigned long addr, 1184static int emulator_cmpxchg_emulated(unsigned long addr,
1135 const void *old, 1185 const void *old,
1136 const void *new, 1186 const void *new,
1137 unsigned int bytes, 1187 unsigned int bytes,
1138 struct x86_emulate_ctxt *ctxt) 1188 struct kvm_vcpu *vcpu)
1139{ 1189{
1140 static int reported; 1190 static int reported;
1141 1191
@@ -1143,12 +1193,12 @@ static int emulator_cmpxchg_emulated(unsigned long addr,
1143 reported = 1; 1193 reported = 1;
1144 printk(KERN_WARNING "kvm: emulating exchange as write\n"); 1194 printk(KERN_WARNING "kvm: emulating exchange as write\n");
1145 } 1195 }
1146 return emulator_write_emulated(addr, new, bytes, ctxt); 1196 return emulator_write_emulated(addr, new, bytes, vcpu);
1147} 1197}
1148 1198
1149static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg) 1199static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
1150{ 1200{
1151 return kvm_arch_ops->get_segment_base(vcpu, seg); 1201 return kvm_x86_ops->get_segment_base(vcpu, seg);
1152} 1202}
1153 1203
1154int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address) 1204int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)
@@ -1158,10 +1208,8 @@ int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)
1158 1208
1159int emulate_clts(struct kvm_vcpu *vcpu) 1209int emulate_clts(struct kvm_vcpu *vcpu)
1160{ 1210{
1161 unsigned long cr0; 1211 vcpu->cr0 &= ~X86_CR0_TS;
1162 1212 kvm_x86_ops->set_cr0(vcpu, vcpu->cr0);
1163 cr0 = vcpu->cr0 & ~CR0_TS_MASK;
1164 kvm_arch_ops->set_cr0(vcpu, cr0);
1165 return X86EMUL_CONTINUE; 1213 return X86EMUL_CONTINUE;
1166} 1214}
1167 1215
@@ -1171,11 +1219,10 @@ int emulator_get_dr(struct x86_emulate_ctxt* ctxt, int dr, unsigned long *dest)
1171 1219
1172 switch (dr) { 1220 switch (dr) {
1173 case 0 ... 3: 1221 case 0 ... 3:
1174 *dest = kvm_arch_ops->get_dr(vcpu, dr); 1222 *dest = kvm_x86_ops->get_dr(vcpu, dr);
1175 return X86EMUL_CONTINUE; 1223 return X86EMUL_CONTINUE;
1176 default: 1224 default:
1177 printk(KERN_DEBUG "%s: unexpected dr %u\n", 1225 pr_unimpl(vcpu, "%s: unexpected dr %u\n", __FUNCTION__, dr);
1178 __FUNCTION__, dr);
1179 return X86EMUL_UNHANDLEABLE; 1226 return X86EMUL_UNHANDLEABLE;
1180 } 1227 }
1181} 1228}
@@ -1185,7 +1232,7 @@ int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value)
1185 unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U; 1232 unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U;
1186 int exception; 1233 int exception;
1187 1234
1188 kvm_arch_ops->set_dr(ctxt->vcpu, dr, value & mask, &exception); 1235 kvm_x86_ops->set_dr(ctxt->vcpu, dr, value & mask, &exception);
1189 if (exception) { 1236 if (exception) {
1190 /* FIXME: better handling */ 1237 /* FIXME: better handling */
1191 return X86EMUL_UNHANDLEABLE; 1238 return X86EMUL_UNHANDLEABLE;
@@ -1193,25 +1240,25 @@ int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value)
1193 return X86EMUL_CONTINUE; 1240 return X86EMUL_CONTINUE;
1194} 1241}
1195 1242
1196static void report_emulation_failure(struct x86_emulate_ctxt *ctxt) 1243void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
1197{ 1244{
1198 static int reported; 1245 static int reported;
1199 u8 opcodes[4]; 1246 u8 opcodes[4];
1200 unsigned long rip = ctxt->vcpu->rip; 1247 unsigned long rip = vcpu->rip;
1201 unsigned long rip_linear; 1248 unsigned long rip_linear;
1202 1249
1203 rip_linear = rip + get_segment_base(ctxt->vcpu, VCPU_SREG_CS); 1250 rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS);
1204 1251
1205 if (reported) 1252 if (reported)
1206 return; 1253 return;
1207 1254
1208 emulator_read_std(rip_linear, (void *)opcodes, 4, ctxt); 1255 emulator_read_std(rip_linear, (void *)opcodes, 4, vcpu);
1209 1256
1210 printk(KERN_ERR "emulation failed but !mmio_needed?" 1257 printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n",
1211 " rip %lx %02x %02x %02x %02x\n", 1258 context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]);
1212 rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]);
1213 reported = 1; 1259 reported = 1;
1214} 1260}
1261EXPORT_SYMBOL_GPL(kvm_report_emulation_failure);
1215 1262
1216struct x86_emulate_ops emulate_ops = { 1263struct x86_emulate_ops emulate_ops = {
1217 .read_std = emulator_read_std, 1264 .read_std = emulator_read_std,
@@ -1231,12 +1278,12 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
1231 int cs_db, cs_l; 1278 int cs_db, cs_l;
1232 1279
1233 vcpu->mmio_fault_cr2 = cr2; 1280 vcpu->mmio_fault_cr2 = cr2;
1234 kvm_arch_ops->cache_regs(vcpu); 1281 kvm_x86_ops->cache_regs(vcpu);
1235 1282
1236 kvm_arch_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); 1283 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
1237 1284
1238 emulate_ctxt.vcpu = vcpu; 1285 emulate_ctxt.vcpu = vcpu;
1239 emulate_ctxt.eflags = kvm_arch_ops->get_rflags(vcpu); 1286 emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu);
1240 emulate_ctxt.cr2 = cr2; 1287 emulate_ctxt.cr2 = cr2;
1241 emulate_ctxt.mode = (emulate_ctxt.eflags & X86_EFLAGS_VM) 1288 emulate_ctxt.mode = (emulate_ctxt.eflags & X86_EFLAGS_VM)
1242 ? X86EMUL_MODE_REAL : cs_l 1289 ? X86EMUL_MODE_REAL : cs_l
@@ -1259,9 +1306,13 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
1259 emulate_ctxt.fs_base = get_segment_base(vcpu, VCPU_SREG_FS); 1306 emulate_ctxt.fs_base = get_segment_base(vcpu, VCPU_SREG_FS);
1260 1307
1261 vcpu->mmio_is_write = 0; 1308 vcpu->mmio_is_write = 0;
1309 vcpu->pio.string = 0;
1262 r = x86_emulate_memop(&emulate_ctxt, &emulate_ops); 1310 r = x86_emulate_memop(&emulate_ctxt, &emulate_ops);
1311 if (vcpu->pio.string)
1312 return EMULATE_DO_MMIO;
1263 1313
1264 if ((r || vcpu->mmio_is_write) && run) { 1314 if ((r || vcpu->mmio_is_write) && run) {
1315 run->exit_reason = KVM_EXIT_MMIO;
1265 run->mmio.phys_addr = vcpu->mmio_phys_addr; 1316 run->mmio.phys_addr = vcpu->mmio_phys_addr;
1266 memcpy(run->mmio.data, vcpu->mmio_data, 8); 1317 memcpy(run->mmio.data, vcpu->mmio_data, 8);
1267 run->mmio.len = vcpu->mmio_size; 1318 run->mmio.len = vcpu->mmio_size;
@@ -1272,14 +1323,14 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
1272 if (kvm_mmu_unprotect_page_virt(vcpu, cr2)) 1323 if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
1273 return EMULATE_DONE; 1324 return EMULATE_DONE;
1274 if (!vcpu->mmio_needed) { 1325 if (!vcpu->mmio_needed) {
1275 report_emulation_failure(&emulate_ctxt); 1326 kvm_report_emulation_failure(vcpu, "mmio");
1276 return EMULATE_FAIL; 1327 return EMULATE_FAIL;
1277 } 1328 }
1278 return EMULATE_DO_MMIO; 1329 return EMULATE_DO_MMIO;
1279 } 1330 }
1280 1331
1281 kvm_arch_ops->decache_regs(vcpu); 1332 kvm_x86_ops->decache_regs(vcpu);
1282 kvm_arch_ops->set_rflags(vcpu, emulate_ctxt.eflags); 1333 kvm_x86_ops->set_rflags(vcpu, emulate_ctxt.eflags);
1283 1334
1284 if (vcpu->mmio_is_write) { 1335 if (vcpu->mmio_is_write) {
1285 vcpu->mmio_needed = 0; 1336 vcpu->mmio_needed = 0;
@@ -1290,14 +1341,45 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
1290} 1341}
1291EXPORT_SYMBOL_GPL(emulate_instruction); 1342EXPORT_SYMBOL_GPL(emulate_instruction);
1292 1343
1293int kvm_emulate_halt(struct kvm_vcpu *vcpu) 1344/*
1345 * The vCPU has executed a HLT instruction with in-kernel mode enabled.
1346 */
1347static void kvm_vcpu_block(struct kvm_vcpu *vcpu)
1294{ 1348{
1295 if (vcpu->irq_summary) 1349 DECLARE_WAITQUEUE(wait, current);
1296 return 1;
1297 1350
1298 vcpu->run->exit_reason = KVM_EXIT_HLT; 1351 add_wait_queue(&vcpu->wq, &wait);
1352
1353 /*
1354 * We will block until either an interrupt or a signal wakes us up
1355 */
1356 while (!kvm_cpu_has_interrupt(vcpu)
1357 && !signal_pending(current)
1358 && vcpu->mp_state != VCPU_MP_STATE_RUNNABLE
1359 && vcpu->mp_state != VCPU_MP_STATE_SIPI_RECEIVED) {
1360 set_current_state(TASK_INTERRUPTIBLE);
1361 vcpu_put(vcpu);
1362 schedule();
1363 vcpu_load(vcpu);
1364 }
1365
1366 __set_current_state(TASK_RUNNING);
1367 remove_wait_queue(&vcpu->wq, &wait);
1368}
1369
1370int kvm_emulate_halt(struct kvm_vcpu *vcpu)
1371{
1299 ++vcpu->stat.halt_exits; 1372 ++vcpu->stat.halt_exits;
1300 return 0; 1373 if (irqchip_in_kernel(vcpu->kvm)) {
1374 vcpu->mp_state = VCPU_MP_STATE_HALTED;
1375 kvm_vcpu_block(vcpu);
1376 if (vcpu->mp_state != VCPU_MP_STATE_RUNNABLE)
1377 return -EINTR;
1378 return 1;
1379 } else {
1380 vcpu->run->exit_reason = KVM_EXIT_HLT;
1381 return 0;
1382 }
1301} 1383}
1302EXPORT_SYMBOL_GPL(kvm_emulate_halt); 1384EXPORT_SYMBOL_GPL(kvm_emulate_halt);
1303 1385
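[Editor's aside, not part of the patch] kvm_vcpu_block() is the usual interruptible wait-queue idiom with one twist: the vcpu is unloaded (vcpu_put) around schedule() so it does not stay resident on a physical CPU while the thread sleeps, and is reloaded afterwards. For reference, the canonical form of the idiom looks like this (a generic sketch, not KVM code; it sets the task state before re-checking the condition to avoid missing a wakeup):

        #include <linux/sched.h>
        #include <linux/wait.h>

        static int wait_for_condition(wait_queue_head_t *wq,
                                      int (*cond)(void *arg), void *arg)
        {
                DECLARE_WAITQUEUE(wait, current);

                add_wait_queue(wq, &wait);
                for (;;) {
                        set_current_state(TASK_INTERRUPTIBLE);
                        if (cond(arg) || signal_pending(current))
                                break;
                        schedule();
                }
                __set_current_state(TASK_RUNNING);
                remove_wait_queue(wq, &wait);

                return signal_pending(current) ? -ERESTARTSYS : 0;
        }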
@@ -1305,7 +1387,7 @@ int kvm_hypercall(struct kvm_vcpu *vcpu, struct kvm_run *run)
1305{ 1387{
1306 unsigned long nr, a0, a1, a2, a3, a4, a5, ret; 1388 unsigned long nr, a0, a1, a2, a3, a4, a5, ret;
1307 1389
1308 kvm_arch_ops->cache_regs(vcpu); 1390 kvm_x86_ops->cache_regs(vcpu);
1309 ret = -KVM_EINVAL; 1391 ret = -KVM_EINVAL;
1310#ifdef CONFIG_X86_64 1392#ifdef CONFIG_X86_64
1311 if (is_long_mode(vcpu)) { 1393 if (is_long_mode(vcpu)) {
@@ -1329,6 +1411,7 @@ int kvm_hypercall(struct kvm_vcpu *vcpu, struct kvm_run *run)
1329 } 1411 }
1330 switch (nr) { 1412 switch (nr) {
1331 default: 1413 default:
1414 run->hypercall.nr = nr;
1332 run->hypercall.args[0] = a0; 1415 run->hypercall.args[0] = a0;
1333 run->hypercall.args[1] = a1; 1416 run->hypercall.args[1] = a1;
1334 run->hypercall.args[2] = a2; 1417 run->hypercall.args[2] = a2;
@@ -1337,11 +1420,11 @@ int kvm_hypercall(struct kvm_vcpu *vcpu, struct kvm_run *run)
1337 run->hypercall.args[5] = a5; 1420 run->hypercall.args[5] = a5;
1338 run->hypercall.ret = ret; 1421 run->hypercall.ret = ret;
1339 run->hypercall.longmode = is_long_mode(vcpu); 1422 run->hypercall.longmode = is_long_mode(vcpu);
1340 kvm_arch_ops->decache_regs(vcpu); 1423 kvm_x86_ops->decache_regs(vcpu);
1341 return 0; 1424 return 0;
1342 } 1425 }
1343 vcpu->regs[VCPU_REGS_RAX] = ret; 1426 vcpu->regs[VCPU_REGS_RAX] = ret;
1344 kvm_arch_ops->decache_regs(vcpu); 1427 kvm_x86_ops->decache_regs(vcpu);
1345 return 1; 1428 return 1;
1346} 1429}
1347EXPORT_SYMBOL_GPL(kvm_hypercall); 1430EXPORT_SYMBOL_GPL(kvm_hypercall);
@@ -1355,26 +1438,26 @@ void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
1355{ 1438{
1356 struct descriptor_table dt = { limit, base }; 1439 struct descriptor_table dt = { limit, base };
1357 1440
1358 kvm_arch_ops->set_gdt(vcpu, &dt); 1441 kvm_x86_ops->set_gdt(vcpu, &dt);
1359} 1442}
1360 1443
1361void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base) 1444void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
1362{ 1445{
1363 struct descriptor_table dt = { limit, base }; 1446 struct descriptor_table dt = { limit, base };
1364 1447
1365 kvm_arch_ops->set_idt(vcpu, &dt); 1448 kvm_x86_ops->set_idt(vcpu, &dt);
1366} 1449}
1367 1450
1368void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw, 1451void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
1369 unsigned long *rflags) 1452 unsigned long *rflags)
1370{ 1453{
1371 lmsw(vcpu, msw); 1454 lmsw(vcpu, msw);
1372 *rflags = kvm_arch_ops->get_rflags(vcpu); 1455 *rflags = kvm_x86_ops->get_rflags(vcpu);
1373} 1456}
1374 1457
1375unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr) 1458unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
1376{ 1459{
1377 kvm_arch_ops->decache_cr4_guest_bits(vcpu); 1460 kvm_x86_ops->decache_cr4_guest_bits(vcpu);
1378 switch (cr) { 1461 switch (cr) {
1379 case 0: 1462 case 0:
1380 return vcpu->cr0; 1463 return vcpu->cr0;
@@ -1396,7 +1479,7 @@ void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val,
1396 switch (cr) { 1479 switch (cr) {
1397 case 0: 1480 case 0:
1398 set_cr0(vcpu, mk_cr_64(vcpu->cr0, val)); 1481 set_cr0(vcpu, mk_cr_64(vcpu->cr0, val));
1399 *rflags = kvm_arch_ops->get_rflags(vcpu); 1482 *rflags = kvm_x86_ops->get_rflags(vcpu);
1400 break; 1483 break;
1401 case 2: 1484 case 2:
1402 vcpu->cr2 = val; 1485 vcpu->cr2 = val;
@@ -1439,7 +1522,7 @@ static int vcpu_register_para(struct kvm_vcpu *vcpu, gpa_t para_state_gpa)
1439 1522
1440 mark_page_dirty(vcpu->kvm, para_state_gpa >> PAGE_SHIFT); 1523 mark_page_dirty(vcpu->kvm, para_state_gpa >> PAGE_SHIFT);
1441 para_state_page = pfn_to_page(para_state_hpa >> PAGE_SHIFT); 1524 para_state_page = pfn_to_page(para_state_hpa >> PAGE_SHIFT);
1442 para_state = kmap_atomic(para_state_page, KM_USER0); 1525 para_state = kmap(para_state_page);
1443 1526
1444 printk(KERN_DEBUG ".... guest version: %d\n", para_state->guest_version); 1527 printk(KERN_DEBUG ".... guest version: %d\n", para_state->guest_version);
1445 printk(KERN_DEBUG ".... size: %d\n", para_state->size); 1528 printk(KERN_DEBUG ".... size: %d\n", para_state->size);
@@ -1470,12 +1553,12 @@ static int vcpu_register_para(struct kvm_vcpu *vcpu, gpa_t para_state_gpa)
1470 mark_page_dirty(vcpu->kvm, hypercall_gpa >> PAGE_SHIFT); 1553 mark_page_dirty(vcpu->kvm, hypercall_gpa >> PAGE_SHIFT);
1471 hypercall = kmap_atomic(pfn_to_page(hypercall_hpa >> PAGE_SHIFT), 1554 hypercall = kmap_atomic(pfn_to_page(hypercall_hpa >> PAGE_SHIFT),
1472 KM_USER1) + (hypercall_hpa & ~PAGE_MASK); 1555 KM_USER1) + (hypercall_hpa & ~PAGE_MASK);
1473 kvm_arch_ops->patch_hypercall(vcpu, hypercall); 1556 kvm_x86_ops->patch_hypercall(vcpu, hypercall);
1474 kunmap_atomic(hypercall, KM_USER1); 1557 kunmap_atomic(hypercall, KM_USER1);
1475 1558
1476 para_state->ret = 0; 1559 para_state->ret = 0;
1477err_kunmap_skip: 1560err_kunmap_skip:
1478 kunmap_atomic(para_state, KM_USER0); 1561 kunmap(para_state_page);
1479 return 0; 1562 return 0;
1480err_gp: 1563err_gp:
1481 return 1; 1564 return 1;
@@ -1511,7 +1594,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1511 data = 3; 1594 data = 3;
1512 break; 1595 break;
1513 case MSR_IA32_APICBASE: 1596 case MSR_IA32_APICBASE:
1514 data = vcpu->apic_base; 1597 data = kvm_get_apic_base(vcpu);
1515 break; 1598 break;
1516 case MSR_IA32_MISC_ENABLE: 1599 case MSR_IA32_MISC_ENABLE:
1517 data = vcpu->ia32_misc_enable_msr; 1600 data = vcpu->ia32_misc_enable_msr;
@@ -1522,7 +1605,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1522 break; 1605 break;
1523#endif 1606#endif
1524 default: 1607 default:
1525 printk(KERN_ERR "kvm: unhandled rdmsr: 0x%x\n", msr); 1608 pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
1526 return 1; 1609 return 1;
1527 } 1610 }
1528 *pdata = data; 1611 *pdata = data;
@@ -1537,7 +1620,7 @@ EXPORT_SYMBOL_GPL(kvm_get_msr_common);
1537 */ 1620 */
1538int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) 1621int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
1539{ 1622{
1540 return kvm_arch_ops->get_msr(vcpu, msr_index, pdata); 1623 return kvm_x86_ops->get_msr(vcpu, msr_index, pdata);
1541} 1624}
1542 1625
1543#ifdef CONFIG_X86_64 1626#ifdef CONFIG_X86_64
@@ -1558,7 +1641,7 @@ static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
1558 return; 1641 return;
1559 } 1642 }
1560 1643
1561 kvm_arch_ops->set_efer(vcpu, efer); 1644 kvm_x86_ops->set_efer(vcpu, efer);
1562 1645
1563 efer &= ~EFER_LMA; 1646 efer &= ~EFER_LMA;
1564 efer |= vcpu->shadow_efer & EFER_LMA; 1647 efer |= vcpu->shadow_efer & EFER_LMA;
@@ -1577,11 +1660,11 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1577 break; 1660 break;
1578#endif 1661#endif
1579 case MSR_IA32_MC0_STATUS: 1662 case MSR_IA32_MC0_STATUS:
1580 printk(KERN_WARNING "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n", 1663 pr_unimpl(vcpu, "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n",
1581 __FUNCTION__, data); 1664 __FUNCTION__, data);
1582 break; 1665 break;
1583 case MSR_IA32_MCG_STATUS: 1666 case MSR_IA32_MCG_STATUS:
1584 printk(KERN_WARNING "%s: MSR_IA32_MCG_STATUS 0x%llx, nop\n", 1667 pr_unimpl(vcpu, "%s: MSR_IA32_MCG_STATUS 0x%llx, nop\n",
1585 __FUNCTION__, data); 1668 __FUNCTION__, data);
1586 break; 1669 break;
1587 case MSR_IA32_UCODE_REV: 1670 case MSR_IA32_UCODE_REV:
@@ -1589,7 +1672,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1589 case 0x200 ... 0x2ff: /* MTRRs */ 1672 case 0x200 ... 0x2ff: /* MTRRs */
1590 break; 1673 break;
1591 case MSR_IA32_APICBASE: 1674 case MSR_IA32_APICBASE:
1592 vcpu->apic_base = data; 1675 kvm_set_apic_base(vcpu, data);
1593 break; 1676 break;
1594 case MSR_IA32_MISC_ENABLE: 1677 case MSR_IA32_MISC_ENABLE:
1595 vcpu->ia32_misc_enable_msr = data; 1678 vcpu->ia32_misc_enable_msr = data;
@@ -1601,7 +1684,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1601 return vcpu_register_para(vcpu, data); 1684 return vcpu_register_para(vcpu, data);
1602 1685
1603 default: 1686 default:
1604 printk(KERN_ERR "kvm: unhandled wrmsr: 0x%x\n", msr); 1687 pr_unimpl(vcpu, "unhandled wrmsr: 0x%x\n", msr);
1605 return 1; 1688 return 1;
1606 } 1689 }
1607 return 0; 1690 return 0;
@@ -1615,44 +1698,24 @@ EXPORT_SYMBOL_GPL(kvm_set_msr_common);
1615 */ 1698 */
1616int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) 1699int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
1617{ 1700{
1618 return kvm_arch_ops->set_msr(vcpu, msr_index, data); 1701 return kvm_x86_ops->set_msr(vcpu, msr_index, data);
1619} 1702}
1620 1703
1621void kvm_resched(struct kvm_vcpu *vcpu) 1704void kvm_resched(struct kvm_vcpu *vcpu)
1622{ 1705{
1623 if (!need_resched()) 1706 if (!need_resched())
1624 return; 1707 return;
1625 vcpu_put(vcpu);
1626 cond_resched(); 1708 cond_resched();
1627 vcpu_load(vcpu);
1628} 1709}
1629EXPORT_SYMBOL_GPL(kvm_resched); 1710EXPORT_SYMBOL_GPL(kvm_resched);
1630 1711
1631void load_msrs(struct vmx_msr_entry *e, int n)
1632{
1633 int i;
1634
1635 for (i = 0; i < n; ++i)
1636 wrmsrl(e[i].index, e[i].data);
1637}
1638EXPORT_SYMBOL_GPL(load_msrs);
1639
1640void save_msrs(struct vmx_msr_entry *e, int n)
1641{
1642 int i;
1643
1644 for (i = 0; i < n; ++i)
1645 rdmsrl(e[i].index, e[i].data);
1646}
1647EXPORT_SYMBOL_GPL(save_msrs);
1648
1649void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) 1712void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
1650{ 1713{
1651 int i; 1714 int i;
1652 u32 function; 1715 u32 function;
1653 struct kvm_cpuid_entry *e, *best; 1716 struct kvm_cpuid_entry *e, *best;
1654 1717
1655 kvm_arch_ops->cache_regs(vcpu); 1718 kvm_x86_ops->cache_regs(vcpu);
1656 function = vcpu->regs[VCPU_REGS_RAX]; 1719 function = vcpu->regs[VCPU_REGS_RAX];
1657 vcpu->regs[VCPU_REGS_RAX] = 0; 1720 vcpu->regs[VCPU_REGS_RAX] = 0;
1658 vcpu->regs[VCPU_REGS_RBX] = 0; 1721 vcpu->regs[VCPU_REGS_RBX] = 0;
@@ -1678,8 +1741,8 @@ void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
1678 vcpu->regs[VCPU_REGS_RCX] = best->ecx; 1741 vcpu->regs[VCPU_REGS_RCX] = best->ecx;
1679 vcpu->regs[VCPU_REGS_RDX] = best->edx; 1742 vcpu->regs[VCPU_REGS_RDX] = best->edx;
1680 } 1743 }
1681 kvm_arch_ops->decache_regs(vcpu); 1744 kvm_x86_ops->decache_regs(vcpu);
1682 kvm_arch_ops->skip_emulated_instruction(vcpu); 1745 kvm_x86_ops->skip_emulated_instruction(vcpu);
1683} 1746}
1684EXPORT_SYMBOL_GPL(kvm_emulate_cpuid); 1747EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
1685 1748
@@ -1690,11 +1753,9 @@ static int pio_copy_data(struct kvm_vcpu *vcpu)
1690 unsigned bytes; 1753 unsigned bytes;
1691 int nr_pages = vcpu->pio.guest_pages[1] ? 2 : 1; 1754 int nr_pages = vcpu->pio.guest_pages[1] ? 2 : 1;
1692 1755
1693 kvm_arch_ops->vcpu_put(vcpu);
1694 q = vmap(vcpu->pio.guest_pages, nr_pages, VM_READ|VM_WRITE, 1756 q = vmap(vcpu->pio.guest_pages, nr_pages, VM_READ|VM_WRITE,
1695 PAGE_KERNEL); 1757 PAGE_KERNEL);
1696 if (!q) { 1758 if (!q) {
1697 kvm_arch_ops->vcpu_load(vcpu);
1698 free_pio_guest_pages(vcpu); 1759 free_pio_guest_pages(vcpu);
1699 return -ENOMEM; 1760 return -ENOMEM;
1700 } 1761 }
@@ -1706,7 +1767,6 @@ static int pio_copy_data(struct kvm_vcpu *vcpu)
1706 memcpy(p, q, bytes); 1767 memcpy(p, q, bytes);
1707 q -= vcpu->pio.guest_page_offset; 1768 q -= vcpu->pio.guest_page_offset;
1708 vunmap(q); 1769 vunmap(q);
1709 kvm_arch_ops->vcpu_load(vcpu);
1710 free_pio_guest_pages(vcpu); 1770 free_pio_guest_pages(vcpu);
1711 return 0; 1771 return 0;
1712} 1772}
@@ -1717,7 +1777,7 @@ static int complete_pio(struct kvm_vcpu *vcpu)
1717 long delta; 1777 long delta;
1718 int r; 1778 int r;
1719 1779
1720 kvm_arch_ops->cache_regs(vcpu); 1780 kvm_x86_ops->cache_regs(vcpu);
1721 1781
1722 if (!io->string) { 1782 if (!io->string) {
1723 if (io->in) 1783 if (io->in)
@@ -1727,7 +1787,7 @@ static int complete_pio(struct kvm_vcpu *vcpu)
1727 if (io->in) { 1787 if (io->in) {
1728 r = pio_copy_data(vcpu); 1788 r = pio_copy_data(vcpu);
1729 if (r) { 1789 if (r) {
1730 kvm_arch_ops->cache_regs(vcpu); 1790 kvm_x86_ops->cache_regs(vcpu);
1731 return r; 1791 return r;
1732 } 1792 }
1733 } 1793 }
@@ -1750,79 +1810,109 @@ static int complete_pio(struct kvm_vcpu *vcpu)
1750 vcpu->regs[VCPU_REGS_RSI] += delta; 1810 vcpu->regs[VCPU_REGS_RSI] += delta;
1751 } 1811 }
1752 1812
1753 kvm_arch_ops->decache_regs(vcpu); 1813 kvm_x86_ops->decache_regs(vcpu);
1754 1814
1755 io->count -= io->cur_count; 1815 io->count -= io->cur_count;
1756 io->cur_count = 0; 1816 io->cur_count = 0;
1757 1817
1758 if (!io->count)
1759 kvm_arch_ops->skip_emulated_instruction(vcpu);
1760 return 0; 1818 return 0;
1761} 1819}
1762 1820
1763void kernel_pio(struct kvm_io_device *pio_dev, struct kvm_vcpu *vcpu) 1821static void kernel_pio(struct kvm_io_device *pio_dev,
1822 struct kvm_vcpu *vcpu,
1823 void *pd)
1764{ 1824{
1765 /* TODO: String I/O for in kernel device */ 1825 /* TODO: String I/O for in kernel device */
1766 1826
1827 mutex_lock(&vcpu->kvm->lock);
1767 if (vcpu->pio.in) 1828 if (vcpu->pio.in)
1768 kvm_iodevice_read(pio_dev, vcpu->pio.port, 1829 kvm_iodevice_read(pio_dev, vcpu->pio.port,
1769 vcpu->pio.size, 1830 vcpu->pio.size,
1770 vcpu->pio_data); 1831 pd);
1771 else 1832 else
1772 kvm_iodevice_write(pio_dev, vcpu->pio.port, 1833 kvm_iodevice_write(pio_dev, vcpu->pio.port,
1773 vcpu->pio.size, 1834 vcpu->pio.size,
1774 vcpu->pio_data); 1835 pd);
1836 mutex_unlock(&vcpu->kvm->lock);
1837}
1838
1839static void pio_string_write(struct kvm_io_device *pio_dev,
1840 struct kvm_vcpu *vcpu)
1841{
1842 struct kvm_pio_request *io = &vcpu->pio;
1843 void *pd = vcpu->pio_data;
1844 int i;
1845
1846 mutex_lock(&vcpu->kvm->lock);
1847 for (i = 0; i < io->cur_count; i++) {
1848 kvm_iodevice_write(pio_dev, io->port,
1849 io->size,
1850 pd);
1851 pd += io->size;
1852 }
1853 mutex_unlock(&vcpu->kvm->lock);
1775} 1854}
1776 1855
1777int kvm_setup_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, 1856int kvm_emulate_pio (struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
1778 int size, unsigned long count, int string, int down, 1857 int size, unsigned port)
1858{
1859 struct kvm_io_device *pio_dev;
1860
1861 vcpu->run->exit_reason = KVM_EXIT_IO;
1862 vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
1863 vcpu->run->io.size = vcpu->pio.size = size;
1864 vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
1865 vcpu->run->io.count = vcpu->pio.count = vcpu->pio.cur_count = 1;
1866 vcpu->run->io.port = vcpu->pio.port = port;
1867 vcpu->pio.in = in;
1868 vcpu->pio.string = 0;
1869 vcpu->pio.down = 0;
1870 vcpu->pio.guest_page_offset = 0;
1871 vcpu->pio.rep = 0;
1872
1873 kvm_x86_ops->cache_regs(vcpu);
1874 memcpy(vcpu->pio_data, &vcpu->regs[VCPU_REGS_RAX], 4);
1875 kvm_x86_ops->decache_regs(vcpu);
1876
1877 kvm_x86_ops->skip_emulated_instruction(vcpu);
1878
1879 pio_dev = vcpu_find_pio_dev(vcpu, port);
1880 if (pio_dev) {
1881 kernel_pio(pio_dev, vcpu, vcpu->pio_data);
1882 complete_pio(vcpu);
1883 return 1;
1884 }
1885 return 0;
1886}
1887EXPORT_SYMBOL_GPL(kvm_emulate_pio);
1888
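[Editor's aside, not part of the patch] kvm_emulate_pio() handles the single-word case entirely in the kernel: it latches RAX into the pio data page, skips the instruction up front, and returns 0 (exit to userspace) only when no in-kernel device claims the port. In that case userspace sees KVM_EXIT_IO and finds the data inside the same mmap as struct kvm_run, at io.data_offset. A hedged sketch of the userspace side follows; emulate_port_out()/emulate_port_in() are hypothetical helpers:

        #include <stdint.h>
        #include <linux/kvm.h>

        static void handle_pio_exit(struct kvm_run *run)
        {
                uint8_t *data = (uint8_t *)run + run->io.data_offset;
                uint32_t i;

                for (i = 0; i < run->io.count; i++, data += run->io.size) {
                        if (run->io.direction == KVM_EXIT_IO_OUT)
                                emulate_port_out(run->io.port, data, run->io.size);
                        else
                                emulate_port_in(run->io.port, data, run->io.size);
                }
        }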
1889int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
1890 int size, unsigned long count, int down,
1779 gva_t address, int rep, unsigned port) 1891 gva_t address, int rep, unsigned port)
1780{ 1892{
1781 unsigned now, in_page; 1893 unsigned now, in_page;
1782 int i; 1894 int i, ret = 0;
1783 int nr_pages = 1; 1895 int nr_pages = 1;
1784 struct page *page; 1896 struct page *page;
1785 struct kvm_io_device *pio_dev; 1897 struct kvm_io_device *pio_dev;
1786 1898
1787 vcpu->run->exit_reason = KVM_EXIT_IO; 1899 vcpu->run->exit_reason = KVM_EXIT_IO;
1788 vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; 1900 vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
1789 vcpu->run->io.size = size; 1901 vcpu->run->io.size = vcpu->pio.size = size;
1790 vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE; 1902 vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
1791 vcpu->run->io.count = count; 1903 vcpu->run->io.count = vcpu->pio.count = vcpu->pio.cur_count = count;
1792 vcpu->run->io.port = port; 1904 vcpu->run->io.port = vcpu->pio.port = port;
1793 vcpu->pio.count = count;
1794 vcpu->pio.cur_count = count;
1795 vcpu->pio.size = size;
1796 vcpu->pio.in = in; 1905 vcpu->pio.in = in;
1797 vcpu->pio.port = port; 1906 vcpu->pio.string = 1;
1798 vcpu->pio.string = string;
1799 vcpu->pio.down = down; 1907 vcpu->pio.down = down;
1800 vcpu->pio.guest_page_offset = offset_in_page(address); 1908 vcpu->pio.guest_page_offset = offset_in_page(address);
1801 vcpu->pio.rep = rep; 1909 vcpu->pio.rep = rep;
1802 1910
1803 pio_dev = vcpu_find_pio_dev(vcpu, port);
1804 if (!string) {
1805 kvm_arch_ops->cache_regs(vcpu);
1806 memcpy(vcpu->pio_data, &vcpu->regs[VCPU_REGS_RAX], 4);
1807 kvm_arch_ops->decache_regs(vcpu);
1808 if (pio_dev) {
1809 kernel_pio(pio_dev, vcpu);
1810 complete_pio(vcpu);
1811 return 1;
1812 }
1813 return 0;
1814 }
1815 /* TODO: String I/O for in kernel device */
1816 if (pio_dev)
1817 printk(KERN_ERR "kvm_setup_pio: no string io support\n");
1818
1819 if (!count) { 1911 if (!count) {
1820 kvm_arch_ops->skip_emulated_instruction(vcpu); 1912 kvm_x86_ops->skip_emulated_instruction(vcpu);
1821 return 1; 1913 return 1;
1822 } 1914 }
1823 1915
1824 now = min(count, PAGE_SIZE / size);
1825
1826 if (!down) 1916 if (!down)
1827 in_page = PAGE_SIZE - offset_in_page(address); 1917 in_page = PAGE_SIZE - offset_in_page(address);
1828 else 1918 else
@@ -1841,20 +1931,23 @@ int kvm_setup_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
1841 /* 1931 /*
1842 * String I/O in reverse. Yuck. Kill the guest, fix later. 1932 * String I/O in reverse. Yuck. Kill the guest, fix later.
1843 */ 1933 */
1844 printk(KERN_ERR "kvm: guest string pio down\n"); 1934 pr_unimpl(vcpu, "guest string pio down\n");
1845 inject_gp(vcpu); 1935 inject_gp(vcpu);
1846 return 1; 1936 return 1;
1847 } 1937 }
1848 vcpu->run->io.count = now; 1938 vcpu->run->io.count = now;
1849 vcpu->pio.cur_count = now; 1939 vcpu->pio.cur_count = now;
1850 1940
1941 if (vcpu->pio.cur_count == vcpu->pio.count)
1942 kvm_x86_ops->skip_emulated_instruction(vcpu);
1943
1851 for (i = 0; i < nr_pages; ++i) { 1944 for (i = 0; i < nr_pages; ++i) {
1852 spin_lock(&vcpu->kvm->lock); 1945 mutex_lock(&vcpu->kvm->lock);
1853 page = gva_to_page(vcpu, address + i * PAGE_SIZE); 1946 page = gva_to_page(vcpu, address + i * PAGE_SIZE);
1854 if (page) 1947 if (page)
1855 get_page(page); 1948 get_page(page);
1856 vcpu->pio.guest_pages[i] = page; 1949 vcpu->pio.guest_pages[i] = page;
1857 spin_unlock(&vcpu->kvm->lock); 1950 mutex_unlock(&vcpu->kvm->lock);
1858 if (!page) { 1951 if (!page) {
1859 inject_gp(vcpu); 1952 inject_gp(vcpu);
1860 free_pio_guest_pages(vcpu); 1953 free_pio_guest_pages(vcpu);
@@ -1862,11 +1955,145 @@ int kvm_setup_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
1862 } 1955 }
1863 } 1956 }
1864 1957
1865 if (!vcpu->pio.in) 1958 pio_dev = vcpu_find_pio_dev(vcpu, port);
1866 return pio_copy_data(vcpu); 1959 if (!vcpu->pio.in) {
1867 return 0; 1960 /* string PIO write */
1961 ret = pio_copy_data(vcpu);
1962 if (ret >= 0 && pio_dev) {
1963 pio_string_write(pio_dev, vcpu);
1964 complete_pio(vcpu);
1965 if (vcpu->pio.count == 0)
1966 ret = 1;
1967 }
1968 } else if (pio_dev)
1969 pr_unimpl(vcpu, "no string pio read support yet, "
1970 "port %x size %d count %ld\n",
1971 port, size, count);
1972
1973 return ret;
1974}
1975EXPORT_SYMBOL_GPL(kvm_emulate_pio_string);
1976
1977/*
1978 * Check if userspace requested an interrupt window, and that the
1979 * interrupt window is open.
1980 *
1981 * No need to exit to userspace if we already have an interrupt queued.
1982 */
1983static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu,
1984 struct kvm_run *kvm_run)
1985{
1986 return (!vcpu->irq_summary &&
1987 kvm_run->request_interrupt_window &&
1988 vcpu->interrupt_window_open &&
1989 (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF));
1990}
1991
1992static void post_kvm_run_save(struct kvm_vcpu *vcpu,
1993 struct kvm_run *kvm_run)
1994{
1995 kvm_run->if_flag = (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF) != 0;
1996 kvm_run->cr8 = get_cr8(vcpu);
1997 kvm_run->apic_base = kvm_get_apic_base(vcpu);
1998 if (irqchip_in_kernel(vcpu->kvm))
1999 kvm_run->ready_for_interrupt_injection = 1;
2000 else
2001 kvm_run->ready_for_interrupt_injection =
2002 (vcpu->interrupt_window_open &&
2003 vcpu->irq_summary == 0);
2004}
2005
2006static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2007{
2008 int r;
2009
2010 if (unlikely(vcpu->mp_state == VCPU_MP_STATE_SIPI_RECEIVED)) {
 2011 		printk(KERN_DEBUG "vcpu %d received sipi with vector # %x\n",
2012 vcpu->vcpu_id, vcpu->sipi_vector);
2013 kvm_lapic_reset(vcpu);
2014 kvm_x86_ops->vcpu_reset(vcpu);
2015 vcpu->mp_state = VCPU_MP_STATE_RUNNABLE;
2016 }
2017
2018preempted:
2019 if (vcpu->guest_debug.enabled)
2020 kvm_x86_ops->guest_debug_pre(vcpu);
2021
2022again:
2023 r = kvm_mmu_reload(vcpu);
2024 if (unlikely(r))
2025 goto out;
2026
2027 preempt_disable();
2028
2029 kvm_x86_ops->prepare_guest_switch(vcpu);
2030 kvm_load_guest_fpu(vcpu);
2031
2032 local_irq_disable();
2033
2034 if (signal_pending(current)) {
2035 local_irq_enable();
2036 preempt_enable();
2037 r = -EINTR;
2038 kvm_run->exit_reason = KVM_EXIT_INTR;
2039 ++vcpu->stat.signal_exits;
2040 goto out;
2041 }
2042
2043 if (irqchip_in_kernel(vcpu->kvm))
2044 kvm_x86_ops->inject_pending_irq(vcpu);
2045 else if (!vcpu->mmio_read_completed)
2046 kvm_x86_ops->inject_pending_vectors(vcpu, kvm_run);
2047
2048 vcpu->guest_mode = 1;
2049
2050 if (vcpu->requests)
2051 if (test_and_clear_bit(KVM_TLB_FLUSH, &vcpu->requests))
2052 kvm_x86_ops->tlb_flush(vcpu);
2053
2054 kvm_x86_ops->run(vcpu, kvm_run);
2055
2056 vcpu->guest_mode = 0;
2057 local_irq_enable();
2058
2059 ++vcpu->stat.exits;
2060
2061 preempt_enable();
2062
2063 /*
2064 * Profile KVM exit RIPs:
2065 */
2066 if (unlikely(prof_on == KVM_PROFILING)) {
2067 kvm_x86_ops->cache_regs(vcpu);
2068 profile_hit(KVM_PROFILING, (void *)vcpu->rip);
2069 }
2070
2071 r = kvm_x86_ops->handle_exit(kvm_run, vcpu);
2072
2073 if (r > 0) {
2074 if (dm_request_for_irq_injection(vcpu, kvm_run)) {
2075 r = -EINTR;
2076 kvm_run->exit_reason = KVM_EXIT_INTR;
2077 ++vcpu->stat.request_irq_exits;
2078 goto out;
2079 }
2080 if (!need_resched()) {
2081 ++vcpu->stat.light_exits;
2082 goto again;
2083 }
2084 }
2085
2086out:
2087 if (r > 0) {
2088 kvm_resched(vcpu);
2089 goto preempted;
2090 }
2091
2092 post_kvm_run_save(vcpu, kvm_run);
2093
2094 return r;
1868} 2095}
1869EXPORT_SYMBOL_GPL(kvm_setup_pio); 2096
1870 2097
1871static int kvm_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 2098static int kvm_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1872{ 2099{
@@ -1875,11 +2102,18 @@ static int kvm_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1875 2102
1876 vcpu_load(vcpu); 2103 vcpu_load(vcpu);
1877 2104
2105 if (unlikely(vcpu->mp_state == VCPU_MP_STATE_UNINITIALIZED)) {
2106 kvm_vcpu_block(vcpu);
2107 vcpu_put(vcpu);
2108 return -EAGAIN;
2109 }
2110
1878 if (vcpu->sigset_active) 2111 if (vcpu->sigset_active)
1879 sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved); 2112 sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
1880 2113
1881 /* re-sync apic's tpr */ 2114 /* re-sync apic's tpr */
1882 vcpu->cr8 = kvm_run->cr8; 2115 if (!irqchip_in_kernel(vcpu->kvm))
2116 set_cr8(vcpu, kvm_run->cr8);
1883 2117
1884 if (vcpu->pio.cur_count) { 2118 if (vcpu->pio.cur_count) {
1885 r = complete_pio(vcpu); 2119 r = complete_pio(vcpu);
@@ -1897,19 +2131,18 @@ static int kvm_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1897 /* 2131 /*
1898 * Read-modify-write. Back to userspace. 2132 * Read-modify-write. Back to userspace.
1899 */ 2133 */
1900 kvm_run->exit_reason = KVM_EXIT_MMIO;
1901 r = 0; 2134 r = 0;
1902 goto out; 2135 goto out;
1903 } 2136 }
1904 } 2137 }
1905 2138
1906 if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL) { 2139 if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL) {
1907 kvm_arch_ops->cache_regs(vcpu); 2140 kvm_x86_ops->cache_regs(vcpu);
1908 vcpu->regs[VCPU_REGS_RAX] = kvm_run->hypercall.ret; 2141 vcpu->regs[VCPU_REGS_RAX] = kvm_run->hypercall.ret;
1909 kvm_arch_ops->decache_regs(vcpu); 2142 kvm_x86_ops->decache_regs(vcpu);
1910 } 2143 }
1911 2144
1912 r = kvm_arch_ops->run(vcpu, kvm_run); 2145 r = __vcpu_run(vcpu, kvm_run);
1913 2146
1914out: 2147out:
1915 if (vcpu->sigset_active) 2148 if (vcpu->sigset_active)
@@ -1924,7 +2157,7 @@ static int kvm_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu,
1924{ 2157{
1925 vcpu_load(vcpu); 2158 vcpu_load(vcpu);
1926 2159
1927 kvm_arch_ops->cache_regs(vcpu); 2160 kvm_x86_ops->cache_regs(vcpu);
1928 2161
1929 regs->rax = vcpu->regs[VCPU_REGS_RAX]; 2162 regs->rax = vcpu->regs[VCPU_REGS_RAX];
1930 regs->rbx = vcpu->regs[VCPU_REGS_RBX]; 2163 regs->rbx = vcpu->regs[VCPU_REGS_RBX];
@@ -1946,7 +2179,7 @@ static int kvm_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu,
1946#endif 2179#endif
1947 2180
1948 regs->rip = vcpu->rip; 2181 regs->rip = vcpu->rip;
1949 regs->rflags = kvm_arch_ops->get_rflags(vcpu); 2182 regs->rflags = kvm_x86_ops->get_rflags(vcpu);
1950 2183
1951 /* 2184 /*
1952 * Don't leak debug flags in case they were set for guest debugging 2185 * Don't leak debug flags in case they were set for guest debugging
@@ -1984,9 +2217,9 @@ static int kvm_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu,
1984#endif 2217#endif
1985 2218
1986 vcpu->rip = regs->rip; 2219 vcpu->rip = regs->rip;
1987 kvm_arch_ops->set_rflags(vcpu, regs->rflags); 2220 kvm_x86_ops->set_rflags(vcpu, regs->rflags);
1988 2221
1989 kvm_arch_ops->decache_regs(vcpu); 2222 kvm_x86_ops->decache_regs(vcpu);
1990 2223
1991 vcpu_put(vcpu); 2224 vcpu_put(vcpu);
1992 2225
@@ -1996,13 +2229,14 @@ static int kvm_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu,
1996static void get_segment(struct kvm_vcpu *vcpu, 2229static void get_segment(struct kvm_vcpu *vcpu,
1997 struct kvm_segment *var, int seg) 2230 struct kvm_segment *var, int seg)
1998{ 2231{
1999 return kvm_arch_ops->get_segment(vcpu, var, seg); 2232 return kvm_x86_ops->get_segment(vcpu, var, seg);
2000} 2233}
2001 2234
2002static int kvm_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, 2235static int kvm_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
2003 struct kvm_sregs *sregs) 2236 struct kvm_sregs *sregs)
2004{ 2237{
2005 struct descriptor_table dt; 2238 struct descriptor_table dt;
2239 int pending_vec;
2006 2240
2007 vcpu_load(vcpu); 2241 vcpu_load(vcpu);
2008 2242
@@ -2016,24 +2250,31 @@ static int kvm_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
2016 get_segment(vcpu, &sregs->tr, VCPU_SREG_TR); 2250 get_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
2017 get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); 2251 get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
2018 2252
2019 kvm_arch_ops->get_idt(vcpu, &dt); 2253 kvm_x86_ops->get_idt(vcpu, &dt);
2020 sregs->idt.limit = dt.limit; 2254 sregs->idt.limit = dt.limit;
2021 sregs->idt.base = dt.base; 2255 sregs->idt.base = dt.base;
2022 kvm_arch_ops->get_gdt(vcpu, &dt); 2256 kvm_x86_ops->get_gdt(vcpu, &dt);
2023 sregs->gdt.limit = dt.limit; 2257 sregs->gdt.limit = dt.limit;
2024 sregs->gdt.base = dt.base; 2258 sregs->gdt.base = dt.base;
2025 2259
2026 kvm_arch_ops->decache_cr4_guest_bits(vcpu); 2260 kvm_x86_ops->decache_cr4_guest_bits(vcpu);
2027 sregs->cr0 = vcpu->cr0; 2261 sregs->cr0 = vcpu->cr0;
2028 sregs->cr2 = vcpu->cr2; 2262 sregs->cr2 = vcpu->cr2;
2029 sregs->cr3 = vcpu->cr3; 2263 sregs->cr3 = vcpu->cr3;
2030 sregs->cr4 = vcpu->cr4; 2264 sregs->cr4 = vcpu->cr4;
2031 sregs->cr8 = vcpu->cr8; 2265 sregs->cr8 = get_cr8(vcpu);
2032 sregs->efer = vcpu->shadow_efer; 2266 sregs->efer = vcpu->shadow_efer;
2033 sregs->apic_base = vcpu->apic_base; 2267 sregs->apic_base = kvm_get_apic_base(vcpu);
2034 2268
2035 memcpy(sregs->interrupt_bitmap, vcpu->irq_pending, 2269 if (irqchip_in_kernel(vcpu->kvm)) {
2036 sizeof sregs->interrupt_bitmap); 2270 memset(sregs->interrupt_bitmap, 0,
2271 sizeof sregs->interrupt_bitmap);
2272 pending_vec = kvm_x86_ops->get_irq(vcpu);
2273 if (pending_vec >= 0)
2274 set_bit(pending_vec, (unsigned long *)sregs->interrupt_bitmap);
2275 } else
2276 memcpy(sregs->interrupt_bitmap, vcpu->irq_pending,
2277 sizeof sregs->interrupt_bitmap);
2037 2278
2038 vcpu_put(vcpu); 2279 vcpu_put(vcpu);
2039 2280
@@ -2043,56 +2284,69 @@ static int kvm_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
2043static void set_segment(struct kvm_vcpu *vcpu, 2284static void set_segment(struct kvm_vcpu *vcpu,
2044 struct kvm_segment *var, int seg) 2285 struct kvm_segment *var, int seg)
2045{ 2286{
2046 return kvm_arch_ops->set_segment(vcpu, var, seg); 2287 return kvm_x86_ops->set_segment(vcpu, var, seg);
2047} 2288}
2048 2289
2049static int kvm_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, 2290static int kvm_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
2050 struct kvm_sregs *sregs) 2291 struct kvm_sregs *sregs)
2051{ 2292{
2052 int mmu_reset_needed = 0; 2293 int mmu_reset_needed = 0;
2053 int i; 2294 int i, pending_vec, max_bits;
2054 struct descriptor_table dt; 2295 struct descriptor_table dt;
2055 2296
2056 vcpu_load(vcpu); 2297 vcpu_load(vcpu);
2057 2298
2058 dt.limit = sregs->idt.limit; 2299 dt.limit = sregs->idt.limit;
2059 dt.base = sregs->idt.base; 2300 dt.base = sregs->idt.base;
2060 kvm_arch_ops->set_idt(vcpu, &dt); 2301 kvm_x86_ops->set_idt(vcpu, &dt);
2061 dt.limit = sregs->gdt.limit; 2302 dt.limit = sregs->gdt.limit;
2062 dt.base = sregs->gdt.base; 2303 dt.base = sregs->gdt.base;
2063 kvm_arch_ops->set_gdt(vcpu, &dt); 2304 kvm_x86_ops->set_gdt(vcpu, &dt);
2064 2305
2065 vcpu->cr2 = sregs->cr2; 2306 vcpu->cr2 = sregs->cr2;
2066 mmu_reset_needed |= vcpu->cr3 != sregs->cr3; 2307 mmu_reset_needed |= vcpu->cr3 != sregs->cr3;
2067 vcpu->cr3 = sregs->cr3; 2308 vcpu->cr3 = sregs->cr3;
2068 2309
2069 vcpu->cr8 = sregs->cr8; 2310 set_cr8(vcpu, sregs->cr8);
2070 2311
2071 mmu_reset_needed |= vcpu->shadow_efer != sregs->efer; 2312 mmu_reset_needed |= vcpu->shadow_efer != sregs->efer;
2072#ifdef CONFIG_X86_64 2313#ifdef CONFIG_X86_64
2073 kvm_arch_ops->set_efer(vcpu, sregs->efer); 2314 kvm_x86_ops->set_efer(vcpu, sregs->efer);
2074#endif 2315#endif
2075 vcpu->apic_base = sregs->apic_base; 2316 kvm_set_apic_base(vcpu, sregs->apic_base);
2076 2317
2077 kvm_arch_ops->decache_cr4_guest_bits(vcpu); 2318 kvm_x86_ops->decache_cr4_guest_bits(vcpu);
2078 2319
2079 mmu_reset_needed |= vcpu->cr0 != sregs->cr0; 2320 mmu_reset_needed |= vcpu->cr0 != sregs->cr0;
2080 kvm_arch_ops->set_cr0(vcpu, sregs->cr0); 2321 vcpu->cr0 = sregs->cr0;
2322 kvm_x86_ops->set_cr0(vcpu, sregs->cr0);
2081 2323
2082 mmu_reset_needed |= vcpu->cr4 != sregs->cr4; 2324 mmu_reset_needed |= vcpu->cr4 != sregs->cr4;
2083 kvm_arch_ops->set_cr4(vcpu, sregs->cr4); 2325 kvm_x86_ops->set_cr4(vcpu, sregs->cr4);
2084 if (!is_long_mode(vcpu) && is_pae(vcpu)) 2326 if (!is_long_mode(vcpu) && is_pae(vcpu))
2085 load_pdptrs(vcpu, vcpu->cr3); 2327 load_pdptrs(vcpu, vcpu->cr3);
2086 2328
2087 if (mmu_reset_needed) 2329 if (mmu_reset_needed)
2088 kvm_mmu_reset_context(vcpu); 2330 kvm_mmu_reset_context(vcpu);
2089 2331
2090 memcpy(vcpu->irq_pending, sregs->interrupt_bitmap, 2332 if (!irqchip_in_kernel(vcpu->kvm)) {
2091 sizeof vcpu->irq_pending); 2333 memcpy(vcpu->irq_pending, sregs->interrupt_bitmap,
2092 vcpu->irq_summary = 0; 2334 sizeof vcpu->irq_pending);
2093 for (i = 0; i < NR_IRQ_WORDS; ++i) 2335 vcpu->irq_summary = 0;
2094 if (vcpu->irq_pending[i]) 2336 for (i = 0; i < ARRAY_SIZE(vcpu->irq_pending); ++i)
2095 __set_bit(i, &vcpu->irq_summary); 2337 if (vcpu->irq_pending[i])
2338 __set_bit(i, &vcpu->irq_summary);
2339 } else {
2340 max_bits = (sizeof sregs->interrupt_bitmap) << 3;
2341 pending_vec = find_first_bit(
2342 (const unsigned long *)sregs->interrupt_bitmap,
2343 max_bits);
2344 /* Only pending external irq is handled here */
2345 if (pending_vec < max_bits) {
2346 kvm_x86_ops->set_irq(vcpu, pending_vec);
 2347 			printk(KERN_DEBUG "Set back pending irq %d\n", pending_vec);
2348 }
2349 }
2096 2350
2097 set_segment(vcpu, &sregs->cs, VCPU_SREG_CS); 2351 set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
2098 set_segment(vcpu, &sregs->ds, VCPU_SREG_DS); 2352 set_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
@@ -2109,6 +2363,16 @@ static int kvm_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
2109 return 0; 2363 return 0;
2110} 2364}
2111 2365
2366void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
2367{
2368 struct kvm_segment cs;
2369
2370 get_segment(vcpu, &cs, VCPU_SREG_CS);
2371 *db = cs.db;
2372 *l = cs.l;
2373}
2374EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits);
2375
2112/* 2376/*
2113 * List of msr numbers which we expose to userspace through KVM_GET_MSRS 2377 * List of msr numbers which we expose to userspace through KVM_GET_MSRS
2114 * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST. 2378 * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
@@ -2236,13 +2500,13 @@ static int kvm_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
2236 gpa_t gpa; 2500 gpa_t gpa;
2237 2501
2238 vcpu_load(vcpu); 2502 vcpu_load(vcpu);
2239 spin_lock(&vcpu->kvm->lock); 2503 mutex_lock(&vcpu->kvm->lock);
2240 gpa = vcpu->mmu.gva_to_gpa(vcpu, vaddr); 2504 gpa = vcpu->mmu.gva_to_gpa(vcpu, vaddr);
2241 tr->physical_address = gpa; 2505 tr->physical_address = gpa;
2242 tr->valid = gpa != UNMAPPED_GVA; 2506 tr->valid = gpa != UNMAPPED_GVA;
2243 tr->writeable = 1; 2507 tr->writeable = 1;
2244 tr->usermode = 0; 2508 tr->usermode = 0;
2245 spin_unlock(&vcpu->kvm->lock); 2509 mutex_unlock(&vcpu->kvm->lock);
2246 vcpu_put(vcpu); 2510 vcpu_put(vcpu);
2247 2511
2248 return 0; 2512 return 0;
@@ -2253,6 +2517,8 @@ static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
2253{ 2517{
2254 if (irq->irq < 0 || irq->irq >= 256) 2518 if (irq->irq < 0 || irq->irq >= 256)
2255 return -EINVAL; 2519 return -EINVAL;
2520 if (irqchip_in_kernel(vcpu->kvm))
2521 return -ENXIO;
2256 vcpu_load(vcpu); 2522 vcpu_load(vcpu);
2257 2523
2258 set_bit(irq->irq, vcpu->irq_pending); 2524 set_bit(irq->irq, vcpu->irq_pending);
@@ -2270,7 +2536,7 @@ static int kvm_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu,
2270 2536
2271 vcpu_load(vcpu); 2537 vcpu_load(vcpu);
2272 2538
2273 r = kvm_arch_ops->set_guest_debug(vcpu, dbg); 2539 r = kvm_x86_ops->set_guest_debug(vcpu, dbg);
2274 2540
2275 vcpu_put(vcpu); 2541 vcpu_put(vcpu);
2276 2542
@@ -2285,7 +2551,6 @@ static struct page *kvm_vcpu_nopage(struct vm_area_struct *vma,
2285 unsigned long pgoff; 2551 unsigned long pgoff;
2286 struct page *page; 2552 struct page *page;
2287 2553
2288 *type = VM_FAULT_MINOR;
2289 pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; 2554 pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
2290 if (pgoff == 0) 2555 if (pgoff == 0)
2291 page = virt_to_page(vcpu->run); 2556 page = virt_to_page(vcpu->run);
@@ -2294,6 +2559,9 @@ static struct page *kvm_vcpu_nopage(struct vm_area_struct *vma,
2294 else 2559 else
2295 return NOPAGE_SIGBUS; 2560 return NOPAGE_SIGBUS;
2296 get_page(page); 2561 get_page(page);
2562 if (type != NULL)
2563 *type = VM_FAULT_MINOR;
2564
2297 return page; 2565 return page;
2298} 2566}
2299 2567
@@ -2346,74 +2614,52 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int n)
2346{ 2614{
2347 int r; 2615 int r;
2348 struct kvm_vcpu *vcpu; 2616 struct kvm_vcpu *vcpu;
2349 struct page *page;
2350 2617
2351 r = -EINVAL;
2352 if (!valid_vcpu(n)) 2618 if (!valid_vcpu(n))
2353 goto out; 2619 return -EINVAL;
2354
2355 vcpu = &kvm->vcpus[n];
2356
2357 mutex_lock(&vcpu->mutex);
2358
2359 if (vcpu->vmcs) {
2360 mutex_unlock(&vcpu->mutex);
2361 return -EEXIST;
2362 }
2363
2364 page = alloc_page(GFP_KERNEL | __GFP_ZERO);
2365 r = -ENOMEM;
2366 if (!page)
2367 goto out_unlock;
2368 vcpu->run = page_address(page);
2369
2370 page = alloc_page(GFP_KERNEL | __GFP_ZERO);
2371 r = -ENOMEM;
2372 if (!page)
2373 goto out_free_run;
2374 vcpu->pio_data = page_address(page);
2375 2620
2376 vcpu->host_fx_image = (char*)ALIGN((hva_t)vcpu->fx_buf, 2621 vcpu = kvm_x86_ops->vcpu_create(kvm, n);
2377 FX_IMAGE_ALIGN); 2622 if (IS_ERR(vcpu))
2378 vcpu->guest_fx_image = vcpu->host_fx_image + FX_IMAGE_SIZE; 2623 return PTR_ERR(vcpu);
2379 vcpu->cr0 = 0x10;
2380 2624
2381 r = kvm_arch_ops->vcpu_create(vcpu); 2625 preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops);
2382 if (r < 0)
2383 goto out_free_vcpus;
2384 2626
2385 r = kvm_mmu_create(vcpu); 2627 /* We do fxsave: this must be aligned. */
2386 if (r < 0) 2628 BUG_ON((unsigned long)&vcpu->host_fx_image & 0xF);
2387 goto out_free_vcpus;
2388 2629
2389 kvm_arch_ops->vcpu_load(vcpu); 2630 vcpu_load(vcpu);
2390 r = kvm_mmu_setup(vcpu); 2631 r = kvm_mmu_setup(vcpu);
2391 if (r >= 0)
2392 r = kvm_arch_ops->vcpu_setup(vcpu);
2393 vcpu_put(vcpu); 2632 vcpu_put(vcpu);
2394
2395 if (r < 0) 2633 if (r < 0)
2396 goto out_free_vcpus; 2634 goto free_vcpu;
2397 2635
2636 mutex_lock(&kvm->lock);
2637 if (kvm->vcpus[n]) {
2638 r = -EEXIST;
2639 mutex_unlock(&kvm->lock);
2640 goto mmu_unload;
2641 }
2642 kvm->vcpus[n] = vcpu;
2643 mutex_unlock(&kvm->lock);
2644
2645 /* Now it's all set up, let userspace reach it */
2398 r = create_vcpu_fd(vcpu); 2646 r = create_vcpu_fd(vcpu);
2399 if (r < 0) 2647 if (r < 0)
2400 goto out_free_vcpus; 2648 goto unlink;
2649 return r;
2401 2650
2402 spin_lock(&kvm_lock); 2651unlink:
2403 if (n >= kvm->nvcpus) 2652 mutex_lock(&kvm->lock);
2404 kvm->nvcpus = n + 1; 2653 kvm->vcpus[n] = NULL;
2405 spin_unlock(&kvm_lock); 2654 mutex_unlock(&kvm->lock);
2406 2655
2407 return r; 2656mmu_unload:
2657 vcpu_load(vcpu);
2658 kvm_mmu_unload(vcpu);
2659 vcpu_put(vcpu);
2408 2660
2409out_free_vcpus: 2661free_vcpu:
2410 kvm_free_vcpu(vcpu); 2662 kvm_x86_ops->vcpu_free(vcpu);
2411out_free_run:
2412 free_page((unsigned long)vcpu->run);
2413 vcpu->run = NULL;
2414out_unlock:
2415 mutex_unlock(&vcpu->mutex);
2416out:
2417 return r; 2663 return r;
2418} 2664}
2419 2665
@@ -2493,7 +2739,7 @@ struct fxsave {
2493 2739
2494static int kvm_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) 2740static int kvm_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
2495{ 2741{
2496 struct fxsave *fxsave = (struct fxsave *)vcpu->guest_fx_image; 2742 struct fxsave *fxsave = (struct fxsave *)&vcpu->guest_fx_image;
2497 2743
2498 vcpu_load(vcpu); 2744 vcpu_load(vcpu);
2499 2745
@@ -2513,7 +2759,7 @@ static int kvm_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
2513 2759
2514static int kvm_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) 2760static int kvm_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
2515{ 2761{
2516 struct fxsave *fxsave = (struct fxsave *)vcpu->guest_fx_image; 2762 struct fxsave *fxsave = (struct fxsave *)&vcpu->guest_fx_image;
2517 2763
2518 vcpu_load(vcpu); 2764 vcpu_load(vcpu);
2519 2765
@@ -2531,6 +2777,27 @@ static int kvm_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
2531 return 0; 2777 return 0;
2532} 2778}
2533 2779
2780static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
2781 struct kvm_lapic_state *s)
2782{
2783 vcpu_load(vcpu);
2784 memcpy(s->regs, vcpu->apic->regs, sizeof *s);
2785 vcpu_put(vcpu);
2786
2787 return 0;
2788}
2789
2790static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
2791 struct kvm_lapic_state *s)
2792{
2793 vcpu_load(vcpu);
2794 memcpy(vcpu->apic->regs, s->regs, sizeof *s);
2795 kvm_apic_post_state_restore(vcpu);
2796 vcpu_put(vcpu);
2797
2798 return 0;
2799}
2800
2534static long kvm_vcpu_ioctl(struct file *filp, 2801static long kvm_vcpu_ioctl(struct file *filp,
2535 unsigned int ioctl, unsigned long arg) 2802 unsigned int ioctl, unsigned long arg)
2536{ 2803{
@@ -2700,6 +2967,31 @@ static long kvm_vcpu_ioctl(struct file *filp,
2700 r = 0; 2967 r = 0;
2701 break; 2968 break;
2702 } 2969 }
2970 case KVM_GET_LAPIC: {
2971 struct kvm_lapic_state lapic;
2972
2973 memset(&lapic, 0, sizeof lapic);
2974 r = kvm_vcpu_ioctl_get_lapic(vcpu, &lapic);
2975 if (r)
2976 goto out;
2977 r = -EFAULT;
2978 if (copy_to_user(argp, &lapic, sizeof lapic))
2979 goto out;
2980 r = 0;
2981 break;
2982 }
2983 case KVM_SET_LAPIC: {
2984 struct kvm_lapic_state lapic;
2985
2986 r = -EFAULT;
2987 if (copy_from_user(&lapic, argp, sizeof lapic))
2988 goto out;
 2989 		r = kvm_vcpu_ioctl_set_lapic(vcpu, &lapic);
2990 if (r)
2991 goto out;
2992 r = 0;
2993 break;
2994 }
2703 default: 2995 default:
2704 ; 2996 ;
2705 } 2997 }
@@ -2753,6 +3045,75 @@ static long kvm_vm_ioctl(struct file *filp,
2753 goto out; 3045 goto out;
2754 break; 3046 break;
2755 } 3047 }
3048 case KVM_CREATE_IRQCHIP:
3049 r = -ENOMEM;
3050 kvm->vpic = kvm_create_pic(kvm);
3051 if (kvm->vpic) {
3052 r = kvm_ioapic_init(kvm);
3053 if (r) {
3054 kfree(kvm->vpic);
3055 kvm->vpic = NULL;
3056 goto out;
3057 }
3058 }
3059 else
3060 goto out;
3061 break;
3062 case KVM_IRQ_LINE: {
3063 struct kvm_irq_level irq_event;
3064
3065 r = -EFAULT;
3066 if (copy_from_user(&irq_event, argp, sizeof irq_event))
3067 goto out;
3068 if (irqchip_in_kernel(kvm)) {
3069 mutex_lock(&kvm->lock);
3070 if (irq_event.irq < 16)
3071 kvm_pic_set_irq(pic_irqchip(kvm),
3072 irq_event.irq,
3073 irq_event.level);
3074 kvm_ioapic_set_irq(kvm->vioapic,
3075 irq_event.irq,
3076 irq_event.level);
3077 mutex_unlock(&kvm->lock);
3078 r = 0;
3079 }
3080 break;
3081 }
3082 case KVM_GET_IRQCHIP: {
3083 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
3084 struct kvm_irqchip chip;
3085
3086 r = -EFAULT;
3087 if (copy_from_user(&chip, argp, sizeof chip))
3088 goto out;
3089 r = -ENXIO;
3090 if (!irqchip_in_kernel(kvm))
3091 goto out;
3092 r = kvm_vm_ioctl_get_irqchip(kvm, &chip);
3093 if (r)
3094 goto out;
3095 r = -EFAULT;
3096 if (copy_to_user(argp, &chip, sizeof chip))
3097 goto out;
3098 r = 0;
3099 break;
3100 }
3101 case KVM_SET_IRQCHIP: {
3102 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
3103 struct kvm_irqchip chip;
3104
3105 r = -EFAULT;
3106 if (copy_from_user(&chip, argp, sizeof chip))
3107 goto out;
3108 r = -ENXIO;
3109 if (!irqchip_in_kernel(kvm))
3110 goto out;
3111 r = kvm_vm_ioctl_set_irqchip(kvm, &chip);
3112 if (r)
3113 goto out;
3114 r = 0;
3115 break;
3116 }
2756 default: 3117 default:
2757 ; 3118 ;
2758 } 3119 }
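Editorial note: the new KVM_CREATE_IRQCHIP / KVM_IRQ_LINE ioctls above are driven from the VM fd. A minimal usage sketch, assuming the usual open("/dev/kvm") / KVM_CREATE_VM setup; vm_fd and the GSI number are placeholders supplied by the caller.

#include <linux/kvm.h>
#include <sys/ioctl.h>

static int setup_in_kernel_irqchip(int vm_fd)
{
	/* instantiates the emulated PIC pair and the IOAPIC in the kernel */
	return ioctl(vm_fd, KVM_CREATE_IRQCHIP, 0);
}

static int set_irq_line(int vm_fd, unsigned int gsi, int level)
{
	struct kvm_irq_level irq = { .irq = gsi, .level = level };

	/* lines below 16 also reach the PIC; every line is routed to the IOAPIC */
	return ioctl(vm_fd, KVM_IRQ_LINE, &irq);
}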
@@ -2768,12 +3129,14 @@ static struct page *kvm_vm_nopage(struct vm_area_struct *vma,
2768 unsigned long pgoff; 3129 unsigned long pgoff;
2769 struct page *page; 3130 struct page *page;
2770 3131
2771 *type = VM_FAULT_MINOR;
2772 pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; 3132 pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
2773 page = gfn_to_page(kvm, pgoff); 3133 page = gfn_to_page(kvm, pgoff);
2774 if (!page) 3134 if (!page)
2775 return NOPAGE_SIGBUS; 3135 return NOPAGE_SIGBUS;
2776 get_page(page); 3136 get_page(page);
3137 if (type != NULL)
3138 *type = VM_FAULT_MINOR;
3139
2777 return page; 3140 return page;
2778} 3141}
2779 3142
@@ -2861,12 +3224,20 @@ static long kvm_dev_ioctl(struct file *filp,
2861 r = 0; 3224 r = 0;
2862 break; 3225 break;
2863 } 3226 }
2864 case KVM_CHECK_EXTENSION: 3227 case KVM_CHECK_EXTENSION: {
2865 /* 3228 int ext = (long)argp;
2866 * No extensions defined at present. 3229
2867 */ 3230 switch (ext) {
2868 r = 0; 3231 case KVM_CAP_IRQCHIP:
3232 case KVM_CAP_HLT:
3233 r = 1;
3234 break;
3235 default:
3236 r = 0;
3237 break;
3238 }
2869 break; 3239 break;
3240 }
2870 case KVM_GET_VCPU_MMAP_SIZE: 3241 case KVM_GET_VCPU_MMAP_SIZE:
2871 r = -EINVAL; 3242 r = -EINVAL;
2872 if (arg) 3243 if (arg)
@@ -2881,8 +3252,6 @@ out:
2881} 3252}
2882 3253
2883static struct file_operations kvm_chardev_ops = { 3254static struct file_operations kvm_chardev_ops = {
2884 .open = kvm_dev_open,
2885 .release = kvm_dev_release,
2886 .unlocked_ioctl = kvm_dev_ioctl, 3255 .unlocked_ioctl = kvm_dev_ioctl,
2887 .compat_ioctl = kvm_dev_ioctl, 3256 .compat_ioctl = kvm_dev_ioctl,
2888}; 3257};
@@ -2893,25 +3262,6 @@ static struct miscdevice kvm_dev = {
2893 &kvm_chardev_ops, 3262 &kvm_chardev_ops,
2894}; 3263};
2895 3264
2896static int kvm_reboot(struct notifier_block *notifier, unsigned long val,
2897 void *v)
2898{
2899 if (val == SYS_RESTART) {
2900 /*
2901 * Some (well, at least mine) BIOSes hang on reboot if
2902 * in vmx root mode.
2903 */
2904 printk(KERN_INFO "kvm: exiting hardware virtualization\n");
2905 on_each_cpu(hardware_disable, NULL, 0, 1);
2906 }
2907 return NOTIFY_OK;
2908}
2909
2910static struct notifier_block kvm_reboot_notifier = {
2911 .notifier_call = kvm_reboot,
2912 .priority = 0,
2913};
2914
2915/* 3265/*
2916 * Make sure that a cpu that is being hot-unplugged does not have any vcpus 3266 * Make sure that a cpu that is being hot-unplugged does not have any vcpus
2917 * cached on it. 3267 * cached on it.
@@ -2925,7 +3275,9 @@ static void decache_vcpus_on_cpu(int cpu)
2925 spin_lock(&kvm_lock); 3275 spin_lock(&kvm_lock);
2926 list_for_each_entry(vm, &vm_list, vm_list) 3276 list_for_each_entry(vm, &vm_list, vm_list)
2927 for (i = 0; i < KVM_MAX_VCPUS; ++i) { 3277 for (i = 0; i < KVM_MAX_VCPUS; ++i) {
2928 vcpu = &vm->vcpus[i]; 3278 vcpu = vm->vcpus[i];
3279 if (!vcpu)
3280 continue;
2929 /* 3281 /*
2930 * If the vcpu is locked, then it is running on some 3282 * If the vcpu is locked, then it is running on some
2931 * other cpu and therefore it is not cached on the 3283 * other cpu and therefore it is not cached on the
@@ -2936,7 +3288,7 @@ static void decache_vcpus_on_cpu(int cpu)
2936 */ 3288 */
2937 if (mutex_trylock(&vcpu->mutex)) { 3289 if (mutex_trylock(&vcpu->mutex)) {
2938 if (vcpu->cpu == cpu) { 3290 if (vcpu->cpu == cpu) {
2939 kvm_arch_ops->vcpu_decache(vcpu); 3291 kvm_x86_ops->vcpu_decache(vcpu);
2940 vcpu->cpu = -1; 3292 vcpu->cpu = -1;
2941 } 3293 }
2942 mutex_unlock(&vcpu->mutex); 3294 mutex_unlock(&vcpu->mutex);
@@ -2952,7 +3304,7 @@ static void hardware_enable(void *junk)
2952 if (cpu_isset(cpu, cpus_hardware_enabled)) 3304 if (cpu_isset(cpu, cpus_hardware_enabled))
2953 return; 3305 return;
2954 cpu_set(cpu, cpus_hardware_enabled); 3306 cpu_set(cpu, cpus_hardware_enabled);
2955 kvm_arch_ops->hardware_enable(NULL); 3307 kvm_x86_ops->hardware_enable(NULL);
2956} 3308}
2957 3309
2958static void hardware_disable(void *junk) 3310static void hardware_disable(void *junk)
@@ -2963,7 +3315,7 @@ static void hardware_disable(void *junk)
2963 return; 3315 return;
2964 cpu_clear(cpu, cpus_hardware_enabled); 3316 cpu_clear(cpu, cpus_hardware_enabled);
2965 decache_vcpus_on_cpu(cpu); 3317 decache_vcpus_on_cpu(cpu);
2966 kvm_arch_ops->hardware_disable(NULL); 3318 kvm_x86_ops->hardware_disable(NULL);
2967} 3319}
2968 3320
2969static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val, 3321static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val,
@@ -2994,6 +3346,25 @@ static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val,
2994 return NOTIFY_OK; 3346 return NOTIFY_OK;
2995} 3347}
2996 3348
3349static int kvm_reboot(struct notifier_block *notifier, unsigned long val,
3350 void *v)
3351{
3352 if (val == SYS_RESTART) {
3353 /*
3354 * Some (well, at least mine) BIOSes hang on reboot if
3355 * in vmx root mode.
3356 */
3357 printk(KERN_INFO "kvm: exiting hardware virtualization\n");
3358 on_each_cpu(hardware_disable, NULL, 0, 1);
3359 }
3360 return NOTIFY_OK;
3361}
3362
3363static struct notifier_block kvm_reboot_notifier = {
3364 .notifier_call = kvm_reboot,
3365 .priority = 0,
3366};
3367
2997void kvm_io_bus_init(struct kvm_io_bus *bus) 3368void kvm_io_bus_init(struct kvm_io_bus *bus)
2998{ 3369{
2999 memset(bus, 0, sizeof(*bus)); 3370 memset(bus, 0, sizeof(*bus));
@@ -3047,18 +3418,15 @@ static u64 stat_get(void *_offset)
3047 spin_lock(&kvm_lock); 3418 spin_lock(&kvm_lock);
3048 list_for_each_entry(kvm, &vm_list, vm_list) 3419 list_for_each_entry(kvm, &vm_list, vm_list)
3049 for (i = 0; i < KVM_MAX_VCPUS; ++i) { 3420 for (i = 0; i < KVM_MAX_VCPUS; ++i) {
3050 vcpu = &kvm->vcpus[i]; 3421 vcpu = kvm->vcpus[i];
3051 total += *(u32 *)((void *)vcpu + offset); 3422 if (vcpu)
3423 total += *(u32 *)((void *)vcpu + offset);
3052 } 3424 }
3053 spin_unlock(&kvm_lock); 3425 spin_unlock(&kvm_lock);
3054 return total; 3426 return total;
3055} 3427}
3056 3428
3057static void stat_set(void *offset, u64 val) 3429DEFINE_SIMPLE_ATTRIBUTE(stat_fops, stat_get, NULL, "%llu\n");
3058{
3059}
3060
3061DEFINE_SIMPLE_ATTRIBUTE(stat_fops, stat_get, stat_set, "%llu\n");
3062 3430
3063static __init void kvm_init_debug(void) 3431static __init void kvm_init_debug(void)
3064{ 3432{
@@ -3105,11 +3473,34 @@ static struct sys_device kvm_sysdev = {
3105 3473
3106hpa_t bad_page_address; 3474hpa_t bad_page_address;
3107 3475
3108int kvm_init_arch(struct kvm_arch_ops *ops, struct module *module) 3476static inline
3477struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn)
3478{
3479 return container_of(pn, struct kvm_vcpu, preempt_notifier);
3480}
3481
3482static void kvm_sched_in(struct preempt_notifier *pn, int cpu)
3483{
3484 struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
3485
3486 kvm_x86_ops->vcpu_load(vcpu, cpu);
3487}
3488
3489static void kvm_sched_out(struct preempt_notifier *pn,
3490 struct task_struct *next)
3491{
3492 struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
3493
3494 kvm_x86_ops->vcpu_put(vcpu);
3495}
3496
3497int kvm_init_x86(struct kvm_x86_ops *ops, unsigned int vcpu_size,
3498 struct module *module)
3109{ 3499{
3110 int r; 3500 int r;
3501 int cpu;
3111 3502
3112 if (kvm_arch_ops) { 3503 if (kvm_x86_ops) {
3113 printk(KERN_ERR "kvm: already loaded the other module\n"); 3504 printk(KERN_ERR "kvm: already loaded the other module\n");
3114 return -EEXIST; 3505 return -EEXIST;
3115 } 3506 }
@@ -3123,12 +3514,20 @@ int kvm_init_arch(struct kvm_arch_ops *ops, struct module *module)
3123 return -EOPNOTSUPP; 3514 return -EOPNOTSUPP;
3124 } 3515 }
3125 3516
3126 kvm_arch_ops = ops; 3517 kvm_x86_ops = ops;
3127 3518
3128 r = kvm_arch_ops->hardware_setup(); 3519 r = kvm_x86_ops->hardware_setup();
3129 if (r < 0) 3520 if (r < 0)
3130 goto out; 3521 goto out;
3131 3522
3523 for_each_online_cpu(cpu) {
3524 smp_call_function_single(cpu,
3525 kvm_x86_ops->check_processor_compatibility,
3526 &r, 0, 1);
3527 if (r < 0)
3528 goto out_free_0;
3529 }
3530
3132 on_each_cpu(hardware_enable, NULL, 0, 1); 3531 on_each_cpu(hardware_enable, NULL, 0, 1);
3133 r = register_cpu_notifier(&kvm_cpu_notifier); 3532 r = register_cpu_notifier(&kvm_cpu_notifier);
3134 if (r) 3533 if (r)
@@ -3143,6 +3542,14 @@ int kvm_init_arch(struct kvm_arch_ops *ops, struct module *module)
3143 if (r) 3542 if (r)
3144 goto out_free_3; 3543 goto out_free_3;
3145 3544
3545 /* A kmem cache lets us meet the alignment requirements of fx_save. */
3546 kvm_vcpu_cache = kmem_cache_create("kvm_vcpu", vcpu_size,
3547 __alignof__(struct kvm_vcpu), 0, 0);
3548 if (!kvm_vcpu_cache) {
3549 r = -ENOMEM;
3550 goto out_free_4;
3551 }
3552
3146 kvm_chardev_ops.owner = module; 3553 kvm_chardev_ops.owner = module;
3147 3554
3148 r = misc_register(&kvm_dev); 3555 r = misc_register(&kvm_dev);
@@ -3151,9 +3558,14 @@ int kvm_init_arch(struct kvm_arch_ops *ops, struct module *module)
3151 goto out_free; 3558 goto out_free;
3152 } 3559 }
3153 3560
3561 kvm_preempt_ops.sched_in = kvm_sched_in;
3562 kvm_preempt_ops.sched_out = kvm_sched_out;
3563
3154 return r; 3564 return r;
3155 3565
3156out_free: 3566out_free:
3567 kmem_cache_destroy(kvm_vcpu_cache);
3568out_free_4:
3157 sysdev_unregister(&kvm_sysdev); 3569 sysdev_unregister(&kvm_sysdev);
3158out_free_3: 3570out_free_3:
3159 sysdev_class_unregister(&kvm_sysdev_class); 3571 sysdev_class_unregister(&kvm_sysdev_class);
@@ -3162,22 +3574,24 @@ out_free_2:
3162 unregister_cpu_notifier(&kvm_cpu_notifier); 3574 unregister_cpu_notifier(&kvm_cpu_notifier);
3163out_free_1: 3575out_free_1:
3164 on_each_cpu(hardware_disable, NULL, 0, 1); 3576 on_each_cpu(hardware_disable, NULL, 0, 1);
3165 kvm_arch_ops->hardware_unsetup(); 3577out_free_0:
3578 kvm_x86_ops->hardware_unsetup();
3166out: 3579out:
3167 kvm_arch_ops = NULL; 3580 kvm_x86_ops = NULL;
3168 return r; 3581 return r;
3169} 3582}
3170 3583
3171void kvm_exit_arch(void) 3584void kvm_exit_x86(void)
3172{ 3585{
3173 misc_deregister(&kvm_dev); 3586 misc_deregister(&kvm_dev);
3587 kmem_cache_destroy(kvm_vcpu_cache);
3174 sysdev_unregister(&kvm_sysdev); 3588 sysdev_unregister(&kvm_sysdev);
3175 sysdev_class_unregister(&kvm_sysdev_class); 3589 sysdev_class_unregister(&kvm_sysdev_class);
3176 unregister_reboot_notifier(&kvm_reboot_notifier); 3590 unregister_reboot_notifier(&kvm_reboot_notifier);
3177 unregister_cpu_notifier(&kvm_cpu_notifier); 3591 unregister_cpu_notifier(&kvm_cpu_notifier);
3178 on_each_cpu(hardware_disable, NULL, 0, 1); 3592 on_each_cpu(hardware_disable, NULL, 0, 1);
3179 kvm_arch_ops->hardware_unsetup(); 3593 kvm_x86_ops->hardware_unsetup();
3180 kvm_arch_ops = NULL; 3594 kvm_x86_ops = NULL;
3181} 3595}
3182 3596
3183static __init int kvm_init(void) 3597static __init int kvm_init(void)
@@ -3220,5 +3634,5 @@ static __exit void kvm_exit(void)
3220module_init(kvm_init) 3634module_init(kvm_init)
3221module_exit(kvm_exit) 3635module_exit(kvm_exit)
3222 3636
3223EXPORT_SYMBOL_GPL(kvm_init_arch); 3637EXPORT_SYMBOL_GPL(kvm_init_x86);
3224EXPORT_SYMBOL_GPL(kvm_exit_arch); 3638EXPORT_SYMBOL_GPL(kvm_exit_x86);
diff --git a/drivers/kvm/kvm_svm.h b/drivers/kvm/kvm_svm.h
index a869983d683d..a0e415daef5b 100644
--- a/drivers/kvm/kvm_svm.h
+++ b/drivers/kvm/kvm_svm.h
@@ -20,7 +20,10 @@ static const u32 host_save_user_msrs[] = {
20#define NR_HOST_SAVE_USER_MSRS ARRAY_SIZE(host_save_user_msrs) 20#define NR_HOST_SAVE_USER_MSRS ARRAY_SIZE(host_save_user_msrs)
21#define NUM_DB_REGS 4 21#define NUM_DB_REGS 4
22 22
23struct kvm_vcpu;
24
23struct vcpu_svm { 25struct vcpu_svm {
26 struct kvm_vcpu vcpu;
24 struct vmcb *vmcb; 27 struct vmcb *vmcb;
25 unsigned long vmcb_pa; 28 unsigned long vmcb_pa;
26 struct svm_cpu_data *svm_data; 29 struct svm_cpu_data *svm_data;
diff --git a/drivers/kvm/lapic.c b/drivers/kvm/lapic.c
new file mode 100644
index 000000000000..a190587cf6a5
--- /dev/null
+++ b/drivers/kvm/lapic.c
@@ -0,0 +1,1064 @@
1
2/*
3 * Local APIC virtualization
4 *
5 * Copyright (C) 2006 Qumranet, Inc.
6 * Copyright (C) 2007 Novell
7 * Copyright (C) 2007 Intel
8 *
9 * Authors:
10 * Dor Laor <dor.laor@qumranet.com>
11 * Gregory Haskins <ghaskins@novell.com>
12 * Yaozu (Eddie) Dong <eddie.dong@intel.com>
13 *
14 * Based on Xen 3.1 code, Copyright (c) 2004, Intel Corporation.
15 *
16 * This work is licensed under the terms of the GNU GPL, version 2. See
17 * the COPYING file in the top-level directory.
18 */
19
20#include "kvm.h"
21#include <linux/kvm.h>
22#include <linux/mm.h>
23#include <linux/highmem.h>
24#include <linux/smp.h>
25#include <linux/hrtimer.h>
26#include <linux/io.h>
27#include <linux/module.h>
28#include <asm/processor.h>
29#include <asm/msr.h>
30#include <asm/page.h>
31#include <asm/current.h>
32#include <asm/apicdef.h>
33#include <asm/atomic.h>
34#include <asm/div64.h>
35#include "irq.h"
36
37#define PRId64 "d"
38#define PRIx64 "llx"
39#define PRIu64 "u"
40#define PRIo64 "o"
41
42#define APIC_BUS_CYCLE_NS 1
43
44/* #define apic_debug(fmt,arg...) printk(KERN_WARNING fmt,##arg) */
45#define apic_debug(fmt, arg...)
46
47#define APIC_LVT_NUM 6
 48/* 14 is the version for Xeon and Pentium; see SDM 8.4.8 */
49#define APIC_VERSION (0x14UL | ((APIC_LVT_NUM - 1) << 16))
50#define LAPIC_MMIO_LENGTH (1 << 12)
 51/* the following defines are not in apicdef.h */
52#define APIC_SHORT_MASK 0xc0000
53#define APIC_DEST_NOSHORT 0x0
54#define APIC_DEST_MASK 0x800
55#define MAX_APIC_VECTOR 256
56
57#define VEC_POS(v) ((v) & (32 - 1))
58#define REG_POS(v) (((v) >> 5) << 4)
59static inline u32 apic_get_reg(struct kvm_lapic *apic, int reg_off)
60{
61 return *((u32 *) (apic->regs + reg_off));
62}
63
64static inline void apic_set_reg(struct kvm_lapic *apic, int reg_off, u32 val)
65{
66 *((u32 *) (apic->regs + reg_off)) = val;
67}
68
69static inline int apic_test_and_set_vector(int vec, void *bitmap)
70{
71 return test_and_set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
72}
73
74static inline int apic_test_and_clear_vector(int vec, void *bitmap)
75{
76 return test_and_clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
77}
78
79static inline void apic_set_vector(int vec, void *bitmap)
80{
81 set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
82}
83
84static inline void apic_clear_vector(int vec, void *bitmap)
85{
86 clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
87}
88
89static inline int apic_hw_enabled(struct kvm_lapic *apic)
90{
91 return (apic)->vcpu->apic_base & MSR_IA32_APICBASE_ENABLE;
92}
93
94static inline int apic_sw_enabled(struct kvm_lapic *apic)
95{
96 return apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_APIC_ENABLED;
97}
98
99static inline int apic_enabled(struct kvm_lapic *apic)
100{
101 return apic_sw_enabled(apic) && apic_hw_enabled(apic);
102}
103
104#define LVT_MASK \
105 (APIC_LVT_MASKED | APIC_SEND_PENDING | APIC_VECTOR_MASK)
106
107#define LINT_MASK \
108 (LVT_MASK | APIC_MODE_MASK | APIC_INPUT_POLARITY | \
109 APIC_LVT_REMOTE_IRR | APIC_LVT_LEVEL_TRIGGER)
110
111static inline int kvm_apic_id(struct kvm_lapic *apic)
112{
113 return (apic_get_reg(apic, APIC_ID) >> 24) & 0xff;
114}
115
116static inline int apic_lvt_enabled(struct kvm_lapic *apic, int lvt_type)
117{
118 return !(apic_get_reg(apic, lvt_type) & APIC_LVT_MASKED);
119}
120
121static inline int apic_lvt_vector(struct kvm_lapic *apic, int lvt_type)
122{
123 return apic_get_reg(apic, lvt_type) & APIC_VECTOR_MASK;
124}
125
126static inline int apic_lvtt_period(struct kvm_lapic *apic)
127{
128 return apic_get_reg(apic, APIC_LVTT) & APIC_LVT_TIMER_PERIODIC;
129}
130
131static unsigned int apic_lvt_mask[APIC_LVT_NUM] = {
132 LVT_MASK | APIC_LVT_TIMER_PERIODIC, /* LVTT */
133 LVT_MASK | APIC_MODE_MASK, /* LVTTHMR */
134 LVT_MASK | APIC_MODE_MASK, /* LVTPC */
135 LINT_MASK, LINT_MASK, /* LVT0-1 */
136 LVT_MASK /* LVTERR */
137};
138
139static int find_highest_vector(void *bitmap)
140{
141 u32 *word = bitmap;
142 int word_offset = MAX_APIC_VECTOR >> 5;
143
144 while ((word_offset != 0) && (word[(--word_offset) << 2] == 0))
145 continue;
146
147 if (likely(!word_offset && !word[0]))
148 return -1;
149 else
150 return fls(word[word_offset << 2]) - 1 + (word_offset << 5);
151}
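Editorial note: the "<< 2" indexing in find_highest_vector() reflects the MMIO layout assumed by VEC_POS/REG_POS above: the 256-bit IRR/ISR/TMR banks are eight 32-bit registers spaced 16 bytes apart, so the i-th 32-bit word of the bitmap sits at u32 index 4 * i. A standalone sketch of where a vector lands under that layout:

#include <stdio.h>

#define VEC_POS(v) ((v) & (32 - 1))
#define REG_POS(v) (((v) >> 5) << 4)

int main(void)
{
	int vec = 0x31;

	/* prints: vector 0x31 -> byte offset 0x10, bit 17 */
	printf("vector 0x%x -> byte offset 0x%x, bit %d\n",
	       vec, REG_POS(vec), VEC_POS(vec));
	return 0;
}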
152
153static inline int apic_test_and_set_irr(int vec, struct kvm_lapic *apic)
154{
155 return apic_test_and_set_vector(vec, apic->regs + APIC_IRR);
156}
157
158static inline void apic_clear_irr(int vec, struct kvm_lapic *apic)
159{
160 apic_clear_vector(vec, apic->regs + APIC_IRR);
161}
162
163static inline int apic_find_highest_irr(struct kvm_lapic *apic)
164{
165 int result;
166
167 result = find_highest_vector(apic->regs + APIC_IRR);
168 ASSERT(result == -1 || result >= 16);
169
170 return result;
171}
172
173int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu)
174{
175 struct kvm_lapic *apic = (struct kvm_lapic *)vcpu->apic;
176 int highest_irr;
177
178 if (!apic)
179 return 0;
180 highest_irr = apic_find_highest_irr(apic);
181
182 return highest_irr;
183}
184EXPORT_SYMBOL_GPL(kvm_lapic_find_highest_irr);
185
186int kvm_apic_set_irq(struct kvm_lapic *apic, u8 vec, u8 trig)
187{
188 if (!apic_test_and_set_irr(vec, apic)) {
189 /* a new pending irq is set in IRR */
190 if (trig)
191 apic_set_vector(vec, apic->regs + APIC_TMR);
192 else
193 apic_clear_vector(vec, apic->regs + APIC_TMR);
194 kvm_vcpu_kick(apic->vcpu);
195 return 1;
196 }
197 return 0;
198}
199
200static inline int apic_find_highest_isr(struct kvm_lapic *apic)
201{
202 int result;
203
204 result = find_highest_vector(apic->regs + APIC_ISR);
205 ASSERT(result == -1 || result >= 16);
206
207 return result;
208}
209
210static void apic_update_ppr(struct kvm_lapic *apic)
211{
212 u32 tpr, isrv, ppr;
213 int isr;
214
215 tpr = apic_get_reg(apic, APIC_TASKPRI);
216 isr = apic_find_highest_isr(apic);
217 isrv = (isr != -1) ? isr : 0;
218
219 if ((tpr & 0xf0) >= (isrv & 0xf0))
220 ppr = tpr & 0xff;
221 else
222 ppr = isrv & 0xf0;
223
224 apic_debug("vlapic %p, ppr 0x%x, isr 0x%x, isrv 0x%x",
225 apic, ppr, isr, isrv);
226
227 apic_set_reg(apic, APIC_PROCPRI, ppr);
228}
229
230static void apic_set_tpr(struct kvm_lapic *apic, u32 tpr)
231{
232 apic_set_reg(apic, APIC_TASKPRI, tpr);
233 apic_update_ppr(apic);
234}
235
236int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest)
237{
238 return kvm_apic_id(apic) == dest;
239}
240
241int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda)
242{
243 int result = 0;
244 u8 logical_id;
245
246 logical_id = GET_APIC_LOGICAL_ID(apic_get_reg(apic, APIC_LDR));
247
248 switch (apic_get_reg(apic, APIC_DFR)) {
249 case APIC_DFR_FLAT:
250 if (logical_id & mda)
251 result = 1;
252 break;
253 case APIC_DFR_CLUSTER:
254 if (((logical_id >> 4) == (mda >> 0x4))
255 && (logical_id & mda & 0xf))
256 result = 1;
257 break;
258 default:
259 printk(KERN_WARNING "Bad DFR vcpu %d: %08x\n",
260 apic->vcpu->vcpu_id, apic_get_reg(apic, APIC_DFR));
261 break;
262 }
263
264 return result;
265}
266
267static int apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
268 int short_hand, int dest, int dest_mode)
269{
270 int result = 0;
271 struct kvm_lapic *target = vcpu->apic;
272
273 apic_debug("target %p, source %p, dest 0x%x, "
274 "dest_mode 0x%x, short_hand 0x%x",
275 target, source, dest, dest_mode, short_hand);
276
277 ASSERT(!target);
278 switch (short_hand) {
279 case APIC_DEST_NOSHORT:
280 if (dest_mode == 0) {
281 /* Physical mode. */
282 if ((dest == 0xFF) || (dest == kvm_apic_id(target)))
283 result = 1;
284 } else
285 /* Logical mode. */
286 result = kvm_apic_match_logical_addr(target, dest);
287 break;
288 case APIC_DEST_SELF:
289 if (target == source)
290 result = 1;
291 break;
292 case APIC_DEST_ALLINC:
293 result = 1;
294 break;
295 case APIC_DEST_ALLBUT:
296 if (target != source)
297 result = 1;
298 break;
299 default:
300 printk(KERN_WARNING "Bad dest shorthand value %x\n",
301 short_hand);
302 break;
303 }
304
305 return result;
306}
307
308/*
309 * Add a pending IRQ into lapic.
310 * Return 1 if successfully added and 0 if discarded.
311 */
312static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
313 int vector, int level, int trig_mode)
314{
315 int orig_irr, result = 0;
316 struct kvm_vcpu *vcpu = apic->vcpu;
317
318 switch (delivery_mode) {
319 case APIC_DM_FIXED:
320 case APIC_DM_LOWEST:
321 /* FIXME add logic for vcpu on reset */
322 if (unlikely(!apic_enabled(apic)))
323 break;
324
325 orig_irr = apic_test_and_set_irr(vector, apic);
326 if (orig_irr && trig_mode) {
327 apic_debug("level trig mode repeatedly for vector %d",
328 vector);
329 break;
330 }
331
332 if (trig_mode) {
333 apic_debug("level trig mode for vector %d", vector);
334 apic_set_vector(vector, apic->regs + APIC_TMR);
335 } else
336 apic_clear_vector(vector, apic->regs + APIC_TMR);
337
338 if (vcpu->mp_state == VCPU_MP_STATE_RUNNABLE)
339 kvm_vcpu_kick(vcpu);
340 else if (vcpu->mp_state == VCPU_MP_STATE_HALTED) {
341 vcpu->mp_state = VCPU_MP_STATE_RUNNABLE;
342 if (waitqueue_active(&vcpu->wq))
343 wake_up_interruptible(&vcpu->wq);
344 }
345
346 result = (orig_irr == 0);
347 break;
348
349 case APIC_DM_REMRD:
350 printk(KERN_DEBUG "Ignoring delivery mode 3\n");
351 break;
352
353 case APIC_DM_SMI:
354 printk(KERN_DEBUG "Ignoring guest SMI\n");
355 break;
356 case APIC_DM_NMI:
357 printk(KERN_DEBUG "Ignoring guest NMI\n");
358 break;
359
360 case APIC_DM_INIT:
361 if (level) {
362 if (vcpu->mp_state == VCPU_MP_STATE_RUNNABLE)
363 printk(KERN_DEBUG
364 "INIT on a runnable vcpu %d\n",
365 vcpu->vcpu_id);
366 vcpu->mp_state = VCPU_MP_STATE_INIT_RECEIVED;
367 kvm_vcpu_kick(vcpu);
368 } else {
369 printk(KERN_DEBUG
370 "Ignoring de-assert INIT to vcpu %d\n",
371 vcpu->vcpu_id);
372 }
373
374 break;
375
376 case APIC_DM_STARTUP:
377 printk(KERN_DEBUG "SIPI to vcpu %d vector 0x%02x\n",
378 vcpu->vcpu_id, vector);
379 if (vcpu->mp_state == VCPU_MP_STATE_INIT_RECEIVED) {
380 vcpu->sipi_vector = vector;
381 vcpu->mp_state = VCPU_MP_STATE_SIPI_RECEIVED;
382 if (waitqueue_active(&vcpu->wq))
383 wake_up_interruptible(&vcpu->wq);
384 }
385 break;
386
387 default:
388 printk(KERN_ERR "TODO: unsupported delivery mode %x\n",
389 delivery_mode);
390 break;
391 }
392 return result;
393}
394
395struct kvm_lapic *kvm_apic_round_robin(struct kvm *kvm, u8 vector,
396 unsigned long bitmap)
397{
398 int vcpu_id;
399 int last;
400 int next;
401 struct kvm_lapic *apic;
402
403 last = kvm->round_robin_prev_vcpu;
404 next = last;
405
406 do {
407 if (++next == KVM_MAX_VCPUS)
408 next = 0;
409 if (kvm->vcpus[next] == NULL || !test_bit(next, &bitmap))
410 continue;
411 apic = kvm->vcpus[next]->apic;
412 if (apic && apic_enabled(apic))
413 break;
414 apic = NULL;
415 } while (next != last);
416 kvm->round_robin_prev_vcpu = next;
417
418 if (!apic) {
419 vcpu_id = ffs(bitmap) - 1;
420 if (vcpu_id < 0) {
421 vcpu_id = 0;
422 printk(KERN_DEBUG "vcpu not ready for apic_round_robin\n");
423 }
424 apic = kvm->vcpus[vcpu_id]->apic;
425 }
426
427 return apic;
428}
429
430static void apic_set_eoi(struct kvm_lapic *apic)
431{
432 int vector = apic_find_highest_isr(apic);
433
434 /*
 435	 * Not every EOI write has a corresponding ISR bit set;
 436	 * one example is when the kernel checks the timer during setup_IO_APIC.
437 */
438 if (vector == -1)
439 return;
440
441 apic_clear_vector(vector, apic->regs + APIC_ISR);
442 apic_update_ppr(apic);
443
444 if (apic_test_and_clear_vector(vector, apic->regs + APIC_TMR))
445 kvm_ioapic_update_eoi(apic->vcpu->kvm, vector);
446}
447
448static void apic_send_ipi(struct kvm_lapic *apic)
449{
450 u32 icr_low = apic_get_reg(apic, APIC_ICR);
451 u32 icr_high = apic_get_reg(apic, APIC_ICR2);
452
453 unsigned int dest = GET_APIC_DEST_FIELD(icr_high);
454 unsigned int short_hand = icr_low & APIC_SHORT_MASK;
455 unsigned int trig_mode = icr_low & APIC_INT_LEVELTRIG;
456 unsigned int level = icr_low & APIC_INT_ASSERT;
457 unsigned int dest_mode = icr_low & APIC_DEST_MASK;
458 unsigned int delivery_mode = icr_low & APIC_MODE_MASK;
459 unsigned int vector = icr_low & APIC_VECTOR_MASK;
460
461 struct kvm_lapic *target;
462 struct kvm_vcpu *vcpu;
463 unsigned long lpr_map = 0;
464 int i;
465
466 apic_debug("icr_high 0x%x, icr_low 0x%x, "
467 "short_hand 0x%x, dest 0x%x, trig_mode 0x%x, level 0x%x, "
468 "dest_mode 0x%x, delivery_mode 0x%x, vector 0x%x\n",
469 icr_high, icr_low, short_hand, dest,
470 trig_mode, level, dest_mode, delivery_mode, vector);
471
472 for (i = 0; i < KVM_MAX_VCPUS; i++) {
473 vcpu = apic->vcpu->kvm->vcpus[i];
474 if (!vcpu)
475 continue;
476
477 if (vcpu->apic &&
478 apic_match_dest(vcpu, apic, short_hand, dest, dest_mode)) {
479 if (delivery_mode == APIC_DM_LOWEST)
480 set_bit(vcpu->vcpu_id, &lpr_map);
481 else
482 __apic_accept_irq(vcpu->apic, delivery_mode,
483 vector, level, trig_mode);
484 }
485 }
486
487 if (delivery_mode == APIC_DM_LOWEST) {
 488		target = kvm_apic_round_robin(apic->vcpu->kvm, vector, lpr_map);
489 if (target != NULL)
490 __apic_accept_irq(target, delivery_mode,
491 vector, level, trig_mode);
492 }
493}
494
495static u32 apic_get_tmcct(struct kvm_lapic *apic)
496{
497 u32 counter_passed;
498 ktime_t passed, now = apic->timer.dev.base->get_time();
499 u32 tmcct = apic_get_reg(apic, APIC_TMICT);
500
501 ASSERT(apic != NULL);
502
503 if (unlikely(ktime_to_ns(now) <=
504 ktime_to_ns(apic->timer.last_update))) {
505 /* Wrap around */
506 passed = ktime_add(( {
507 (ktime_t) {
508 .tv64 = KTIME_MAX -
509 (apic->timer.last_update).tv64}; }
510 ), now);
511 apic_debug("time elapsed\n");
512 } else
513 passed = ktime_sub(now, apic->timer.last_update);
514
515 counter_passed = div64_64(ktime_to_ns(passed),
516 (APIC_BUS_CYCLE_NS * apic->timer.divide_count));
517 tmcct -= counter_passed;
518
519 if (tmcct <= 0) {
520 if (unlikely(!apic_lvtt_period(apic)))
521 tmcct = 0;
522 else
523 do {
524 tmcct += apic_get_reg(apic, APIC_TMICT);
525 } while (tmcct <= 0);
526 }
527
528 return tmcct;
529}
530
531static u32 __apic_read(struct kvm_lapic *apic, unsigned int offset)
532{
533 u32 val = 0;
534
535 if (offset >= LAPIC_MMIO_LENGTH)
536 return 0;
537
538 switch (offset) {
539 case APIC_ARBPRI:
540 printk(KERN_WARNING "Access APIC ARBPRI register "
541 "which is for P6\n");
542 break;
543
544 case APIC_TMCCT: /* Timer CCR */
545 val = apic_get_tmcct(apic);
546 break;
547
548 default:
549 apic_update_ppr(apic);
550 val = apic_get_reg(apic, offset);
551 break;
552 }
553
554 return val;
555}
556
557static void apic_mmio_read(struct kvm_io_device *this,
558 gpa_t address, int len, void *data)
559{
560 struct kvm_lapic *apic = (struct kvm_lapic *)this->private;
561 unsigned int offset = address - apic->base_address;
562 unsigned char alignment = offset & 0xf;
563 u32 result;
564
565 if ((alignment + len) > 4) {
566 printk(KERN_ERR "KVM_APIC_READ: alignment error %lx %d",
567 (unsigned long)address, len);
568 return;
569 }
570 result = __apic_read(apic, offset & ~0xf);
571
572 switch (len) {
573 case 1:
574 case 2:
575 case 4:
576 memcpy(data, (char *)&result + alignment, len);
577 break;
578 default:
579 printk(KERN_ERR "Local APIC read with len = %x, "
580 "should be 1,2, or 4 instead\n", len);
581 break;
582 }
583}
584
585static void update_divide_count(struct kvm_lapic *apic)
586{
587 u32 tmp1, tmp2, tdcr;
588
589 tdcr = apic_get_reg(apic, APIC_TDCR);
590 tmp1 = tdcr & 0xf;
591 tmp2 = ((tmp1 & 0x3) | ((tmp1 & 0x8) >> 1)) + 1;
592 apic->timer.divide_count = 0x1 << (tmp2 & 0x7);
593
594 apic_debug("timer divide count is 0x%x\n",
595 apic->timer.divide_count);
596}
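Editorial note: the TDCR decoding in update_divide_count() follows the SDM encoding where bits 0-1 and bit 3 select the divisor. A small self-contained check of the two endpoints; this is a reviewer's sanity sketch, not part of the patch.

#include <assert.h>

static unsigned int decode_divide_count(unsigned int tdcr)
{
	unsigned int tmp1 = tdcr & 0xf;
	unsigned int tmp2 = ((tmp1 & 0x3) | ((tmp1 & 0x8) >> 1)) + 1;

	return 0x1 << (tmp2 & 0x7);
}

int main(void)
{
	assert(decode_divide_count(0x0) == 2);	/* 0b0000 -> divide by 2 */
	assert(decode_divide_count(0xb) == 1);	/* 0b1011 -> divide by 1 */
	return 0;
}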
597
598static void start_apic_timer(struct kvm_lapic *apic)
599{
600 ktime_t now = apic->timer.dev.base->get_time();
601
602 apic->timer.last_update = now;
603
604 apic->timer.period = apic_get_reg(apic, APIC_TMICT) *
605 APIC_BUS_CYCLE_NS * apic->timer.divide_count;
606 atomic_set(&apic->timer.pending, 0);
607 hrtimer_start(&apic->timer.dev,
608 ktime_add_ns(now, apic->timer.period),
609 HRTIMER_MODE_ABS);
610
611 apic_debug("%s: bus cycle is %" PRId64 "ns, now 0x%016"
612 PRIx64 ", "
613 "timer initial count 0x%x, period %lldns, "
614 "expire @ 0x%016" PRIx64 ".\n", __FUNCTION__,
615 APIC_BUS_CYCLE_NS, ktime_to_ns(now),
616 apic_get_reg(apic, APIC_TMICT),
617 apic->timer.period,
618 ktime_to_ns(ktime_add_ns(now,
619 apic->timer.period)));
620}
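Editorial note (worked example): with APIC_BUS_CYCLE_NS defined as 1 above, a guest that programs TMICT = 1000000 under a divide-by-4 configuration gets period = 1000000 * 1 * 4 = 4000000 ns, so the hrtimer armed here fires 4 ms after last_update.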
621
622static void apic_mmio_write(struct kvm_io_device *this,
623 gpa_t address, int len, const void *data)
624{
625 struct kvm_lapic *apic = (struct kvm_lapic *)this->private;
626 unsigned int offset = address - apic->base_address;
627 unsigned char alignment = offset & 0xf;
628 u32 val;
629
630 /*
 631	 * APIC registers must be aligned on a 128-bit boundary.
 632	 * 32/64/128-bit registers must be accessed through 32-bit loads and stores.
 633	 * Refer to SDM 8.4.1.
634 */
635 if (len != 4 || alignment) {
636 if (printk_ratelimit())
637 printk(KERN_ERR "apic write: bad size=%d %lx\n",
638 len, (long)address);
639 return;
640 }
641
642 val = *(u32 *) data;
643
 644	/* EOI writes are too common to be worth printing */
645 if (offset != APIC_EOI)
646 apic_debug("%s: offset 0x%x with length 0x%x, and value is "
647 "0x%x\n", __FUNCTION__, offset, len, val);
648
649 offset &= 0xff0;
650
651 switch (offset) {
652 case APIC_ID: /* Local APIC ID */
653 apic_set_reg(apic, APIC_ID, val);
654 break;
655
656 case APIC_TASKPRI:
657 apic_set_tpr(apic, val & 0xff);
658 break;
659
660 case APIC_EOI:
661 apic_set_eoi(apic);
662 break;
663
664 case APIC_LDR:
665 apic_set_reg(apic, APIC_LDR, val & APIC_LDR_MASK);
666 break;
667
668 case APIC_DFR:
669 apic_set_reg(apic, APIC_DFR, val | 0x0FFFFFFF);
670 break;
671
672 case APIC_SPIV:
673 apic_set_reg(apic, APIC_SPIV, val & 0x3ff);
674 if (!(val & APIC_SPIV_APIC_ENABLED)) {
675 int i;
676 u32 lvt_val;
677
678 for (i = 0; i < APIC_LVT_NUM; i++) {
679 lvt_val = apic_get_reg(apic,
680 APIC_LVTT + 0x10 * i);
681 apic_set_reg(apic, APIC_LVTT + 0x10 * i,
682 lvt_val | APIC_LVT_MASKED);
683 }
684 atomic_set(&apic->timer.pending, 0);
685
686 }
687 break;
688
689 case APIC_ICR:
690 /* No delay here, so we always clear the pending bit */
691 apic_set_reg(apic, APIC_ICR, val & ~(1 << 12));
692 apic_send_ipi(apic);
693 break;
694
695 case APIC_ICR2:
696 apic_set_reg(apic, APIC_ICR2, val & 0xff000000);
697 break;
698
699 case APIC_LVTT:
700 case APIC_LVTTHMR:
701 case APIC_LVTPC:
702 case APIC_LVT0:
703 case APIC_LVT1:
704 case APIC_LVTERR:
705 /* TODO: Check vector */
706 if (!apic_sw_enabled(apic))
707 val |= APIC_LVT_MASKED;
708
709 val &= apic_lvt_mask[(offset - APIC_LVTT) >> 4];
710 apic_set_reg(apic, offset, val);
711
712 break;
713
714 case APIC_TMICT:
715 hrtimer_cancel(&apic->timer.dev);
716 apic_set_reg(apic, APIC_TMICT, val);
717 start_apic_timer(apic);
718 return;
719
720 case APIC_TDCR:
721 if (val & 4)
722 printk(KERN_ERR "KVM_WRITE:TDCR %x\n", val);
723 apic_set_reg(apic, APIC_TDCR, val);
724 update_divide_count(apic);
725 break;
726
727 default:
728 apic_debug("Local APIC Write to read-only register %x\n",
729 offset);
730 break;
731 }
732
733}
734
735static int apic_mmio_range(struct kvm_io_device *this, gpa_t addr)
736{
737 struct kvm_lapic *apic = (struct kvm_lapic *)this->private;
738 int ret = 0;
739
740
741 if (apic_hw_enabled(apic) &&
742 (addr >= apic->base_address) &&
743 (addr < (apic->base_address + LAPIC_MMIO_LENGTH)))
744 ret = 1;
745
746 return ret;
747}
748
749void kvm_free_apic(struct kvm_lapic *apic)
750{
751 if (!apic)
752 return;
753
754 hrtimer_cancel(&apic->timer.dev);
755
756 if (apic->regs_page) {
757 __free_page(apic->regs_page);
758 apic->regs_page = 0;
759 }
760
761 kfree(apic);
762}
763
764/*
765 *----------------------------------------------------------------------
766 * LAPIC interface
767 *----------------------------------------------------------------------
768 */
769
770void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8)
771{
772 struct kvm_lapic *apic = (struct kvm_lapic *)vcpu->apic;
773
774 if (!apic)
775 return;
776 apic_set_tpr(apic, ((cr8 & 0x0f) << 4));
777}
778
779u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu)
780{
781 struct kvm_lapic *apic = (struct kvm_lapic *)vcpu->apic;
782 u64 tpr;
783
784 if (!apic)
785 return 0;
786 tpr = (u64) apic_get_reg(apic, APIC_TASKPRI);
787
788 return (tpr & 0xf0) >> 4;
789}
790EXPORT_SYMBOL_GPL(kvm_lapic_get_cr8);
791
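kvm_lapic_set_tpr() and kvm_lapic_get_cr8() translate between the 4-bit CR8 priority and bits 7:4 of APIC_TASKPRI. The stand-alone check below (not part of the patch) just confirms the two shifts round-trip.

/* sketch: CR8 <-> TASKPRI translation */
#include <assert.h>
#include <stdint.h>

static uint32_t cr8_to_tpr(uint64_t cr8) { return (uint32_t)((cr8 & 0x0f) << 4); }
static uint64_t tpr_to_cr8(uint32_t tpr) { return (tpr & 0xf0) >> 4; }

int main(void)
{
	for (uint64_t cr8 = 0; cr8 < 16; cr8++)
		assert(tpr_to_cr8(cr8_to_tpr(cr8)) == cr8);	/* lossless round trip */
	return 0;
}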
792void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
793{
794 struct kvm_lapic *apic = (struct kvm_lapic *)vcpu->apic;
795
796 if (!apic) {
797 value |= MSR_IA32_APICBASE_BSP;
798 vcpu->apic_base = value;
799 return;
800 }
801 if (apic->vcpu->vcpu_id)
802 value &= ~MSR_IA32_APICBASE_BSP;
803
804 vcpu->apic_base = value;
805 apic->base_address = apic->vcpu->apic_base &
806 MSR_IA32_APICBASE_BASE;
807
808	/* with interrupt delivery over the FSB, APIC functionality can be restarted */
809 apic_debug("apic base msr is 0x%016" PRIx64 ", and base address is "
810 "0x%lx.\n", apic->apic_base, apic->base_address);
811
812}
813
814u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu)
815{
816 return vcpu->apic_base;
817}
818EXPORT_SYMBOL_GPL(kvm_lapic_get_base);
819
820void kvm_lapic_reset(struct kvm_vcpu *vcpu)
821{
822 struct kvm_lapic *apic;
823 int i;
824
825 apic_debug("%s\n", __FUNCTION__);
826
827 ASSERT(vcpu);
828 apic = vcpu->apic;
829 ASSERT(apic != NULL);
830
831 /* Stop the timer in case it's a reset to an active apic */
832 hrtimer_cancel(&apic->timer.dev);
833
834 apic_set_reg(apic, APIC_ID, vcpu->vcpu_id << 24);
835 apic_set_reg(apic, APIC_LVR, APIC_VERSION);
836
837 for (i = 0; i < APIC_LVT_NUM; i++)
838 apic_set_reg(apic, APIC_LVTT + 0x10 * i, APIC_LVT_MASKED);
839 apic_set_reg(apic, APIC_LVT0,
840 SET_APIC_DELIVERY_MODE(0, APIC_MODE_EXTINT));
841
842 apic_set_reg(apic, APIC_DFR, 0xffffffffU);
843 apic_set_reg(apic, APIC_SPIV, 0xff);
844 apic_set_reg(apic, APIC_TASKPRI, 0);
845 apic_set_reg(apic, APIC_LDR, 0);
846 apic_set_reg(apic, APIC_ESR, 0);
847 apic_set_reg(apic, APIC_ICR, 0);
848 apic_set_reg(apic, APIC_ICR2, 0);
849 apic_set_reg(apic, APIC_TDCR, 0);
850 apic_set_reg(apic, APIC_TMICT, 0);
851 for (i = 0; i < 8; i++) {
852 apic_set_reg(apic, APIC_IRR + 0x10 * i, 0);
853 apic_set_reg(apic, APIC_ISR + 0x10 * i, 0);
854 apic_set_reg(apic, APIC_TMR + 0x10 * i, 0);
855 }
856 apic->timer.divide_count = 0;
857 atomic_set(&apic->timer.pending, 0);
858 if (vcpu->vcpu_id == 0)
859 vcpu->apic_base |= MSR_IA32_APICBASE_BSP;
860 apic_update_ppr(apic);
861
862 apic_debug(KERN_INFO "%s: vcpu=%p, id=%d, base_msr="
863 "0x%016" PRIx64 ", base_address=0x%0lx.\n", __FUNCTION__,
864 vcpu, kvm_apic_id(apic),
865 vcpu->apic_base, apic->base_address);
866}
867EXPORT_SYMBOL_GPL(kvm_lapic_reset);
868
869int kvm_lapic_enabled(struct kvm_vcpu *vcpu)
870{
871 struct kvm_lapic *apic = (struct kvm_lapic *)vcpu->apic;
872 int ret = 0;
873
874 if (!apic)
875 return 0;
876 ret = apic_enabled(apic);
877
878 return ret;
879}
880EXPORT_SYMBOL_GPL(kvm_lapic_enabled);
881
882/*
883 *----------------------------------------------------------------------
884 * timer interface
885 *----------------------------------------------------------------------
886 */
887
888/* TODO: make sure __apic_timer_fn runs on the current pCPU */
889static int __apic_timer_fn(struct kvm_lapic *apic)
890{
891 int result = 0;
892 wait_queue_head_t *q = &apic->vcpu->wq;
893
894 atomic_inc(&apic->timer.pending);
895 if (waitqueue_active(q))
896 {
897 apic->vcpu->mp_state = VCPU_MP_STATE_RUNNABLE;
898 wake_up_interruptible(q);
899 }
900 if (apic_lvtt_period(apic)) {
901 result = 1;
902 apic->timer.dev.expires = ktime_add_ns(
903 apic->timer.dev.expires,
904 apic->timer.period);
905 }
906 return result;
907}
908
909static int __inject_apic_timer_irq(struct kvm_lapic *apic)
910{
911 int vector;
912
913 vector = apic_lvt_vector(apic, APIC_LVTT);
914 return __apic_accept_irq(apic, APIC_DM_FIXED, vector, 1, 0);
915}
916
917static enum hrtimer_restart apic_timer_fn(struct hrtimer *data)
918{
919 struct kvm_lapic *apic;
920 int restart_timer = 0;
921
922 apic = container_of(data, struct kvm_lapic, timer.dev);
923
924 restart_timer = __apic_timer_fn(apic);
925
926 if (restart_timer)
927 return HRTIMER_RESTART;
928 else
929 return HRTIMER_NORESTART;
930}
931
932int kvm_create_lapic(struct kvm_vcpu *vcpu)
933{
934 struct kvm_lapic *apic;
935
936 ASSERT(vcpu != NULL);
937 apic_debug("apic_init %d\n", vcpu->vcpu_id);
938
939 apic = kzalloc(sizeof(*apic), GFP_KERNEL);
940 if (!apic)
941 goto nomem;
942
943 vcpu->apic = apic;
944
945 apic->regs_page = alloc_page(GFP_KERNEL);
946 if (apic->regs_page == NULL) {
947 printk(KERN_ERR "malloc apic regs error for vcpu %x\n",
948 vcpu->vcpu_id);
949 goto nomem;
950 }
951 apic->regs = page_address(apic->regs_page);
952 memset(apic->regs, 0, PAGE_SIZE);
953 apic->vcpu = vcpu;
954
955 hrtimer_init(&apic->timer.dev, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
956 apic->timer.dev.function = apic_timer_fn;
957 apic->base_address = APIC_DEFAULT_PHYS_BASE;
958 vcpu->apic_base = APIC_DEFAULT_PHYS_BASE;
959
960 kvm_lapic_reset(vcpu);
961 apic->dev.read = apic_mmio_read;
962 apic->dev.write = apic_mmio_write;
963 apic->dev.in_range = apic_mmio_range;
964 apic->dev.private = apic;
965
966 return 0;
967nomem:
968 kvm_free_apic(apic);
969 return -ENOMEM;
970}
971EXPORT_SYMBOL_GPL(kvm_create_lapic);
972
973int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu)
974{
975 struct kvm_lapic *apic = vcpu->apic;
976 int highest_irr;
977
978 if (!apic || !apic_enabled(apic))
979 return -1;
980
981 apic_update_ppr(apic);
982 highest_irr = apic_find_highest_irr(apic);
983 if ((highest_irr == -1) ||
984 ((highest_irr & 0xF0) <= apic_get_reg(apic, APIC_PROCPRI)))
985 return -1;
986 return highest_irr;
987}
988
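kvm_apic_has_interrupt() only reports a pending vector when its priority class (vector bits 7:4) is strictly above the current processor priority in APIC_PROCPRI. A stand-alone sketch of that test follows, with made-up vector and PPR values.

/* sketch: the IRR-vs-PPR delivery test */
#include <stdint.h>
#include <stdio.h>

static int deliverable(int highest_irr, uint32_t ppr)
{
	if (highest_irr == -1)
		return 0;			/* nothing pending */
	return (uint32_t)(highest_irr & 0xF0) > ppr;
}

int main(void)
{
	printf("%d\n", deliverable(0x41, 0x30));	/* class 4 beats PPR 0x30: deliver */
	printf("%d\n", deliverable(0x31, 0x30));	/* class 3 does not: held back */
	printf("%d\n", deliverable(-1, 0x00));		/* empty IRR: nothing to do */
	return 0;
}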
989int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu)
990{
991 u32 lvt0 = apic_get_reg(vcpu->apic, APIC_LVT0);
992 int r = 0;
993
994 if (vcpu->vcpu_id == 0) {
995 if (!apic_hw_enabled(vcpu->apic))
996 r = 1;
997 if ((lvt0 & APIC_LVT_MASKED) == 0 &&
998 GET_APIC_DELIVERY_MODE(lvt0) == APIC_MODE_EXTINT)
999 r = 1;
1000 }
1001 return r;
1002}
1003
1004void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu)
1005{
1006 struct kvm_lapic *apic = vcpu->apic;
1007
1008 if (apic && apic_lvt_enabled(apic, APIC_LVTT) &&
1009 atomic_read(&apic->timer.pending) > 0) {
1010 if (__inject_apic_timer_irq(apic))
1011 atomic_dec(&apic->timer.pending);
1012 }
1013}
1014
1015void kvm_apic_timer_intr_post(struct kvm_vcpu *vcpu, int vec)
1016{
1017 struct kvm_lapic *apic = vcpu->apic;
1018
1019 if (apic && apic_lvt_vector(apic, APIC_LVTT) == vec)
1020 apic->timer.last_update = ktime_add_ns(
1021 apic->timer.last_update,
1022 apic->timer.period);
1023}
1024
1025int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu)
1026{
1027 int vector = kvm_apic_has_interrupt(vcpu);
1028 struct kvm_lapic *apic = vcpu->apic;
1029
1030 if (vector == -1)
1031 return -1;
1032
1033 apic_set_vector(vector, apic->regs + APIC_ISR);
1034 apic_update_ppr(apic);
1035 apic_clear_irr(vector, apic);
1036 return vector;
1037}
1038
1039void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu)
1040{
1041 struct kvm_lapic *apic = vcpu->apic;
1042
1043 apic->base_address = vcpu->apic_base &
1044 MSR_IA32_APICBASE_BASE;
1045 apic_set_reg(apic, APIC_LVR, APIC_VERSION);
1046 apic_update_ppr(apic);
1047 hrtimer_cancel(&apic->timer.dev);
1048 update_divide_count(apic);
1049 start_apic_timer(apic);
1050}
1051
1052void kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
1053{
1054 struct kvm_lapic *apic = vcpu->apic;
1055 struct hrtimer *timer;
1056
1057 if (!apic)
1058 return;
1059
1060 timer = &apic->timer.dev;
1061 if (hrtimer_cancel(timer))
1062 hrtimer_start(timer, timer->expires, HRTIMER_MODE_ABS);
1063}
1064EXPORT_SYMBOL_GPL(kvm_migrate_apic_timer);
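kvm_create_lapic() above exposes the local APIC to the guest's MMIO path through the read/write/in_range callbacks of a kvm_io_device plus a private back-pointer. The sketch below mimics that wiring in plain user-space C; the struct and function names (mmio_dev, lapic_in_range, ...) and the 4 KiB window length are illustrative assumptions, and the read callback is omitted for brevity.

/* sketch: callback-table wiring in the style of kvm_create_lapic() */
#include <stdint.h>
#include <stdio.h>

struct mmio_dev {
	void (*write)(struct mmio_dev *dev, uint64_t addr, int len, const void *data);
	int (*in_range)(struct mmio_dev *dev, uint64_t addr);
	void *private;
};

struct lapic {
	uint64_t base_address;
	struct mmio_dev dev;
};

#define LAPIC_MMIO_LENGTH 0x1000	/* assumed 4 KiB register window */

static int lapic_in_range(struct mmio_dev *dev, uint64_t addr)
{
	struct lapic *apic = dev->private;

	return addr >= apic->base_address &&
	       addr < apic->base_address + LAPIC_MMIO_LENGTH;
}

static void lapic_write(struct mmio_dev *dev, uint64_t addr, int len, const void *data)
{
	printf("APIC write, %d bytes at 0x%llx\n", len, (unsigned long long)addr);
}

int main(void)
{
	struct lapic apic = { .base_address = 0xfee00000ULL };
	uint32_t val = 0;

	apic.dev.write = lapic_write;
	apic.dev.in_range = lapic_in_range;
	apic.dev.private = &apic;

	if (apic.dev.in_range(&apic.dev, 0xfee00080ULL))	/* TASKPRI offset 0x80 */
		apic.dev.write(&apic.dev, 0xfee00080ULL, 4, &val);
	return 0;
}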
diff --git a/drivers/kvm/mmu.c b/drivers/kvm/mmu.c
index 23965aa5ee78..6d84d30f5ed0 100644
--- a/drivers/kvm/mmu.c
+++ b/drivers/kvm/mmu.c
@@ -158,7 +158,7 @@ static struct kmem_cache *mmu_page_header_cache;
158 158
159static int is_write_protection(struct kvm_vcpu *vcpu) 159static int is_write_protection(struct kvm_vcpu *vcpu)
160{ 160{
161 return vcpu->cr0 & CR0_WP_MASK; 161 return vcpu->cr0 & X86_CR0_WP;
162} 162}
163 163
164static int is_cpuid_PSE36(void) 164static int is_cpuid_PSE36(void)
@@ -202,15 +202,14 @@ static void set_shadow_pte(u64 *sptep, u64 spte)
202} 202}
203 203
204static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, 204static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
205 struct kmem_cache *base_cache, int min, 205 struct kmem_cache *base_cache, int min)
206 gfp_t gfp_flags)
207{ 206{
208 void *obj; 207 void *obj;
209 208
210 if (cache->nobjs >= min) 209 if (cache->nobjs >= min)
211 return 0; 210 return 0;
212 while (cache->nobjs < ARRAY_SIZE(cache->objects)) { 211 while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
213 obj = kmem_cache_zalloc(base_cache, gfp_flags); 212 obj = kmem_cache_zalloc(base_cache, GFP_KERNEL);
214 if (!obj) 213 if (!obj)
215 return -ENOMEM; 214 return -ENOMEM;
216 cache->objects[cache->nobjs++] = obj; 215 cache->objects[cache->nobjs++] = obj;
@@ -225,14 +224,14 @@ static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
225} 224}
226 225
227static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache, 226static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache,
228 int min, gfp_t gfp_flags) 227 int min)
229{ 228{
230 struct page *page; 229 struct page *page;
231 230
232 if (cache->nobjs >= min) 231 if (cache->nobjs >= min)
233 return 0; 232 return 0;
234 while (cache->nobjs < ARRAY_SIZE(cache->objects)) { 233 while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
235 page = alloc_page(gfp_flags); 234 page = alloc_page(GFP_KERNEL);
236 if (!page) 235 if (!page)
237 return -ENOMEM; 236 return -ENOMEM;
238 set_page_private(page, 0); 237 set_page_private(page, 0);
@@ -247,44 +246,28 @@ static void mmu_free_memory_cache_page(struct kvm_mmu_memory_cache *mc)
247 free_page((unsigned long)mc->objects[--mc->nobjs]); 246 free_page((unsigned long)mc->objects[--mc->nobjs]);
248} 247}
249 248
250static int __mmu_topup_memory_caches(struct kvm_vcpu *vcpu, gfp_t gfp_flags) 249static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu)
251{ 250{
252 int r; 251 int r;
253 252
253 kvm_mmu_free_some_pages(vcpu);
254 r = mmu_topup_memory_cache(&vcpu->mmu_pte_chain_cache, 254 r = mmu_topup_memory_cache(&vcpu->mmu_pte_chain_cache,
255 pte_chain_cache, 4, gfp_flags); 255 pte_chain_cache, 4);
256 if (r) 256 if (r)
257 goto out; 257 goto out;
258 r = mmu_topup_memory_cache(&vcpu->mmu_rmap_desc_cache, 258 r = mmu_topup_memory_cache(&vcpu->mmu_rmap_desc_cache,
259 rmap_desc_cache, 1, gfp_flags); 259 rmap_desc_cache, 1);
260 if (r) 260 if (r)
261 goto out; 261 goto out;
262 r = mmu_topup_memory_cache_page(&vcpu->mmu_page_cache, 4, gfp_flags); 262 r = mmu_topup_memory_cache_page(&vcpu->mmu_page_cache, 4);
263 if (r) 263 if (r)
264 goto out; 264 goto out;
265 r = mmu_topup_memory_cache(&vcpu->mmu_page_header_cache, 265 r = mmu_topup_memory_cache(&vcpu->mmu_page_header_cache,
266 mmu_page_header_cache, 4, gfp_flags); 266 mmu_page_header_cache, 4);
267out: 267out:
268 return r; 268 return r;
269} 269}
270 270
271static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu)
272{
273 int r;
274
275 r = __mmu_topup_memory_caches(vcpu, GFP_NOWAIT);
276 kvm_mmu_free_some_pages(vcpu);
277 if (r < 0) {
278 spin_unlock(&vcpu->kvm->lock);
279 kvm_arch_ops->vcpu_put(vcpu);
280 r = __mmu_topup_memory_caches(vcpu, GFP_KERNEL);
281 kvm_arch_ops->vcpu_load(vcpu);
282 spin_lock(&vcpu->kvm->lock);
283 kvm_mmu_free_some_pages(vcpu);
284 }
285 return r;
286}
287
288static void mmu_free_memory_caches(struct kvm_vcpu *vcpu) 271static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
289{ 272{
290 mmu_free_memory_cache(&vcpu->mmu_pte_chain_cache); 273 mmu_free_memory_cache(&vcpu->mmu_pte_chain_cache);
@@ -969,7 +952,7 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu)
969static void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu) 952static void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
970{ 953{
971 ++vcpu->stat.tlb_flush; 954 ++vcpu->stat.tlb_flush;
972 kvm_arch_ops->tlb_flush(vcpu); 955 kvm_x86_ops->tlb_flush(vcpu);
973} 956}
974 957
975static void paging_new_cr3(struct kvm_vcpu *vcpu) 958static void paging_new_cr3(struct kvm_vcpu *vcpu)
@@ -982,7 +965,7 @@ static void inject_page_fault(struct kvm_vcpu *vcpu,
982 u64 addr, 965 u64 addr,
983 u32 err_code) 966 u32 err_code)
984{ 967{
985 kvm_arch_ops->inject_page_fault(vcpu, addr, err_code); 968 kvm_x86_ops->inject_page_fault(vcpu, addr, err_code);
986} 969}
987 970
988static void paging_free(struct kvm_vcpu *vcpu) 971static void paging_free(struct kvm_vcpu *vcpu)
@@ -1071,15 +1054,15 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
1071{ 1054{
1072 int r; 1055 int r;
1073 1056
1074 spin_lock(&vcpu->kvm->lock); 1057 mutex_lock(&vcpu->kvm->lock);
1075 r = mmu_topup_memory_caches(vcpu); 1058 r = mmu_topup_memory_caches(vcpu);
1076 if (r) 1059 if (r)
1077 goto out; 1060 goto out;
1078 mmu_alloc_roots(vcpu); 1061 mmu_alloc_roots(vcpu);
1079 kvm_arch_ops->set_cr3(vcpu, vcpu->mmu.root_hpa); 1062 kvm_x86_ops->set_cr3(vcpu, vcpu->mmu.root_hpa);
1080 kvm_mmu_flush_tlb(vcpu); 1063 kvm_mmu_flush_tlb(vcpu);
1081out: 1064out:
1082 spin_unlock(&vcpu->kvm->lock); 1065 mutex_unlock(&vcpu->kvm->lock);
1083 return r; 1066 return r;
1084} 1067}
1085EXPORT_SYMBOL_GPL(kvm_mmu_load); 1068EXPORT_SYMBOL_GPL(kvm_mmu_load);
@@ -1124,7 +1107,7 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
1124} 1107}
1125 1108
1126void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, 1109void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
1127 const u8 *old, const u8 *new, int bytes) 1110 const u8 *new, int bytes)
1128{ 1111{
1129 gfn_t gfn = gpa >> PAGE_SHIFT; 1112 gfn_t gfn = gpa >> PAGE_SHIFT;
1130 struct kvm_mmu_page *page; 1113 struct kvm_mmu_page *page;
diff --git a/drivers/kvm/paging_tmpl.h b/drivers/kvm/paging_tmpl.h
index 4b5391c717f8..6b094b44f8fb 100644
--- a/drivers/kvm/paging_tmpl.h
+++ b/drivers/kvm/paging_tmpl.h
@@ -58,7 +58,10 @@ struct guest_walker {
58 int level; 58 int level;
59 gfn_t table_gfn[PT_MAX_FULL_LEVELS]; 59 gfn_t table_gfn[PT_MAX_FULL_LEVELS];
60 pt_element_t *table; 60 pt_element_t *table;
61 pt_element_t pte;
61 pt_element_t *ptep; 62 pt_element_t *ptep;
63 struct page *page;
64 int index;
62 pt_element_t inherited_ar; 65 pt_element_t inherited_ar;
63 gfn_t gfn; 66 gfn_t gfn;
64 u32 error_code; 67 u32 error_code;
@@ -80,11 +83,14 @@ static int FNAME(walk_addr)(struct guest_walker *walker,
80 pgprintk("%s: addr %lx\n", __FUNCTION__, addr); 83 pgprintk("%s: addr %lx\n", __FUNCTION__, addr);
81 walker->level = vcpu->mmu.root_level; 84 walker->level = vcpu->mmu.root_level;
82 walker->table = NULL; 85 walker->table = NULL;
86 walker->page = NULL;
87 walker->ptep = NULL;
83 root = vcpu->cr3; 88 root = vcpu->cr3;
84#if PTTYPE == 64 89#if PTTYPE == 64
85 if (!is_long_mode(vcpu)) { 90 if (!is_long_mode(vcpu)) {
86 walker->ptep = &vcpu->pdptrs[(addr >> 30) & 3]; 91 walker->ptep = &vcpu->pdptrs[(addr >> 30) & 3];
87 root = *walker->ptep; 92 root = *walker->ptep;
93 walker->pte = root;
88 if (!(root & PT_PRESENT_MASK)) 94 if (!(root & PT_PRESENT_MASK))
89 goto not_present; 95 goto not_present;
90 --walker->level; 96 --walker->level;
@@ -96,10 +102,11 @@ static int FNAME(walk_addr)(struct guest_walker *walker,
96 walker->level - 1, table_gfn); 102 walker->level - 1, table_gfn);
97 slot = gfn_to_memslot(vcpu->kvm, table_gfn); 103 slot = gfn_to_memslot(vcpu->kvm, table_gfn);
98 hpa = safe_gpa_to_hpa(vcpu, root & PT64_BASE_ADDR_MASK); 104 hpa = safe_gpa_to_hpa(vcpu, root & PT64_BASE_ADDR_MASK);
99 walker->table = kmap_atomic(pfn_to_page(hpa >> PAGE_SHIFT), KM_USER0); 105 walker->page = pfn_to_page(hpa >> PAGE_SHIFT);
106 walker->table = kmap_atomic(walker->page, KM_USER0);
100 107
101 ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) || 108 ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) ||
102 (vcpu->cr3 & ~(PAGE_MASK | CR3_FLAGS_MASK)) == 0); 109 (vcpu->cr3 & CR3_NONPAE_RESERVED_BITS) == 0);
103 110
104 walker->inherited_ar = PT_USER_MASK | PT_WRITABLE_MASK; 111 walker->inherited_ar = PT_USER_MASK | PT_WRITABLE_MASK;
105 112
@@ -108,6 +115,7 @@ static int FNAME(walk_addr)(struct guest_walker *walker,
108 hpa_t paddr; 115 hpa_t paddr;
109 116
110 ptep = &walker->table[index]; 117 ptep = &walker->table[index];
118 walker->index = index;
111 ASSERT(((unsigned long)walker->table & PAGE_MASK) == 119 ASSERT(((unsigned long)walker->table & PAGE_MASK) ==
112 ((unsigned long)ptep & PAGE_MASK)); 120 ((unsigned long)ptep & PAGE_MASK));
113 121
@@ -148,16 +156,20 @@ static int FNAME(walk_addr)(struct guest_walker *walker,
148 156
149 walker->inherited_ar &= walker->table[index]; 157 walker->inherited_ar &= walker->table[index];
150 table_gfn = (*ptep & PT_BASE_ADDR_MASK) >> PAGE_SHIFT; 158 table_gfn = (*ptep & PT_BASE_ADDR_MASK) >> PAGE_SHIFT;
151 paddr = safe_gpa_to_hpa(vcpu, *ptep & PT_BASE_ADDR_MASK);
152 kunmap_atomic(walker->table, KM_USER0); 159 kunmap_atomic(walker->table, KM_USER0);
153 walker->table = kmap_atomic(pfn_to_page(paddr >> PAGE_SHIFT), 160 paddr = safe_gpa_to_hpa(vcpu, table_gfn << PAGE_SHIFT);
154 KM_USER0); 161 walker->page = pfn_to_page(paddr >> PAGE_SHIFT);
162 walker->table = kmap_atomic(walker->page, KM_USER0);
155 --walker->level; 163 --walker->level;
156 walker->table_gfn[walker->level - 1 ] = table_gfn; 164 walker->table_gfn[walker->level - 1 ] = table_gfn;
157 pgprintk("%s: table_gfn[%d] %lx\n", __FUNCTION__, 165 pgprintk("%s: table_gfn[%d] %lx\n", __FUNCTION__,
158 walker->level - 1, table_gfn); 166 walker->level - 1, table_gfn);
159 } 167 }
160 walker->ptep = ptep; 168 walker->pte = *ptep;
169 if (walker->page)
170 walker->ptep = NULL;
171 if (walker->table)
172 kunmap_atomic(walker->table, KM_USER0);
161 pgprintk("%s: pte %llx\n", __FUNCTION__, (u64)*ptep); 173 pgprintk("%s: pte %llx\n", __FUNCTION__, (u64)*ptep);
162 return 1; 174 return 1;
163 175
@@ -175,13 +187,9 @@ err:
175 walker->error_code |= PFERR_USER_MASK; 187 walker->error_code |= PFERR_USER_MASK;
176 if (fetch_fault) 188 if (fetch_fault)
177 walker->error_code |= PFERR_FETCH_MASK; 189 walker->error_code |= PFERR_FETCH_MASK;
178 return 0;
179}
180
181static void FNAME(release_walker)(struct guest_walker *walker)
182{
183 if (walker->table) 190 if (walker->table)
184 kunmap_atomic(walker->table, KM_USER0); 191 kunmap_atomic(walker->table, KM_USER0);
192 return 0;
185} 193}
186 194
187static void FNAME(mark_pagetable_dirty)(struct kvm *kvm, 195static void FNAME(mark_pagetable_dirty)(struct kvm *kvm,
@@ -193,7 +201,7 @@ static void FNAME(mark_pagetable_dirty)(struct kvm *kvm,
193static void FNAME(set_pte_common)(struct kvm_vcpu *vcpu, 201static void FNAME(set_pte_common)(struct kvm_vcpu *vcpu,
194 u64 *shadow_pte, 202 u64 *shadow_pte,
195 gpa_t gaddr, 203 gpa_t gaddr,
196 pt_element_t *gpte, 204 pt_element_t gpte,
197 u64 access_bits, 205 u64 access_bits,
198 int user_fault, 206 int user_fault,
199 int write_fault, 207 int write_fault,
@@ -202,23 +210,34 @@ static void FNAME(set_pte_common)(struct kvm_vcpu *vcpu,
202 gfn_t gfn) 210 gfn_t gfn)
203{ 211{
204 hpa_t paddr; 212 hpa_t paddr;
205 int dirty = *gpte & PT_DIRTY_MASK; 213 int dirty = gpte & PT_DIRTY_MASK;
206 u64 spte = *shadow_pte; 214 u64 spte = *shadow_pte;
207 int was_rmapped = is_rmap_pte(spte); 215 int was_rmapped = is_rmap_pte(spte);
208 216
209 pgprintk("%s: spte %llx gpte %llx access %llx write_fault %d" 217 pgprintk("%s: spte %llx gpte %llx access %llx write_fault %d"
210 " user_fault %d gfn %lx\n", 218 " user_fault %d gfn %lx\n",
211 __FUNCTION__, spte, (u64)*gpte, access_bits, 219 __FUNCTION__, spte, (u64)gpte, access_bits,
212 write_fault, user_fault, gfn); 220 write_fault, user_fault, gfn);
213 221
214 if (write_fault && !dirty) { 222 if (write_fault && !dirty) {
215 *gpte |= PT_DIRTY_MASK; 223 pt_element_t *guest_ent, *tmp = NULL;
224
225 if (walker->ptep)
226 guest_ent = walker->ptep;
227 else {
228 tmp = kmap_atomic(walker->page, KM_USER0);
229 guest_ent = &tmp[walker->index];
230 }
231
232 *guest_ent |= PT_DIRTY_MASK;
233 if (!walker->ptep)
234 kunmap_atomic(tmp, KM_USER0);
216 dirty = 1; 235 dirty = 1;
217 FNAME(mark_pagetable_dirty)(vcpu->kvm, walker); 236 FNAME(mark_pagetable_dirty)(vcpu->kvm, walker);
218 } 237 }
219 238
220 spte |= PT_PRESENT_MASK | PT_ACCESSED_MASK | PT_DIRTY_MASK; 239 spte |= PT_PRESENT_MASK | PT_ACCESSED_MASK | PT_DIRTY_MASK;
221 spte |= *gpte & PT64_NX_MASK; 240 spte |= gpte & PT64_NX_MASK;
222 if (!dirty) 241 if (!dirty)
223 access_bits &= ~PT_WRITABLE_MASK; 242 access_bits &= ~PT_WRITABLE_MASK;
224 243
@@ -255,7 +274,7 @@ static void FNAME(set_pte_common)(struct kvm_vcpu *vcpu,
255 access_bits &= ~PT_WRITABLE_MASK; 274 access_bits &= ~PT_WRITABLE_MASK;
256 if (is_writeble_pte(spte)) { 275 if (is_writeble_pte(spte)) {
257 spte &= ~PT_WRITABLE_MASK; 276 spte &= ~PT_WRITABLE_MASK;
258 kvm_arch_ops->tlb_flush(vcpu); 277 kvm_x86_ops->tlb_flush(vcpu);
259 } 278 }
260 if (write_fault) 279 if (write_fault)
261 *ptwrite = 1; 280 *ptwrite = 1;
@@ -273,13 +292,13 @@ unshadowed:
273 rmap_add(vcpu, shadow_pte); 292 rmap_add(vcpu, shadow_pte);
274} 293}
275 294
276static void FNAME(set_pte)(struct kvm_vcpu *vcpu, pt_element_t *gpte, 295static void FNAME(set_pte)(struct kvm_vcpu *vcpu, pt_element_t gpte,
277 u64 *shadow_pte, u64 access_bits, 296 u64 *shadow_pte, u64 access_bits,
278 int user_fault, int write_fault, int *ptwrite, 297 int user_fault, int write_fault, int *ptwrite,
279 struct guest_walker *walker, gfn_t gfn) 298 struct guest_walker *walker, gfn_t gfn)
280{ 299{
281 access_bits &= *gpte; 300 access_bits &= gpte;
282 FNAME(set_pte_common)(vcpu, shadow_pte, *gpte & PT_BASE_ADDR_MASK, 301 FNAME(set_pte_common)(vcpu, shadow_pte, gpte & PT_BASE_ADDR_MASK,
283 gpte, access_bits, user_fault, write_fault, 302 gpte, access_bits, user_fault, write_fault,
284 ptwrite, walker, gfn); 303 ptwrite, walker, gfn);
285} 304}
@@ -295,22 +314,22 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
295 if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) 314 if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK))
296 return; 315 return;
297 pgprintk("%s: gpte %llx spte %p\n", __FUNCTION__, (u64)gpte, spte); 316 pgprintk("%s: gpte %llx spte %p\n", __FUNCTION__, (u64)gpte, spte);
298 FNAME(set_pte)(vcpu, &gpte, spte, PT_USER_MASK | PT_WRITABLE_MASK, 0, 317 FNAME(set_pte)(vcpu, gpte, spte, PT_USER_MASK | PT_WRITABLE_MASK, 0,
299 0, NULL, NULL, 318 0, NULL, NULL,
300 (gpte & PT_BASE_ADDR_MASK) >> PAGE_SHIFT); 319 (gpte & PT_BASE_ADDR_MASK) >> PAGE_SHIFT);
301} 320}
302 321
303static void FNAME(set_pde)(struct kvm_vcpu *vcpu, pt_element_t *gpde, 322static void FNAME(set_pde)(struct kvm_vcpu *vcpu, pt_element_t gpde,
304 u64 *shadow_pte, u64 access_bits, 323 u64 *shadow_pte, u64 access_bits,
305 int user_fault, int write_fault, int *ptwrite, 324 int user_fault, int write_fault, int *ptwrite,
306 struct guest_walker *walker, gfn_t gfn) 325 struct guest_walker *walker, gfn_t gfn)
307{ 326{
308 gpa_t gaddr; 327 gpa_t gaddr;
309 328
310 access_bits &= *gpde; 329 access_bits &= gpde;
311 gaddr = (gpa_t)gfn << PAGE_SHIFT; 330 gaddr = (gpa_t)gfn << PAGE_SHIFT;
312 if (PTTYPE == 32 && is_cpuid_PSE36()) 331 if (PTTYPE == 32 && is_cpuid_PSE36())
313 gaddr |= (*gpde & PT32_DIR_PSE36_MASK) << 332 gaddr |= (gpde & PT32_DIR_PSE36_MASK) <<
314 (32 - PT32_DIR_PSE36_SHIFT); 333 (32 - PT32_DIR_PSE36_SHIFT);
315 FNAME(set_pte_common)(vcpu, shadow_pte, gaddr, 334 FNAME(set_pte_common)(vcpu, shadow_pte, gaddr,
316 gpde, access_bits, user_fault, write_fault, 335 gpde, access_bits, user_fault, write_fault,
@@ -328,9 +347,8 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
328 int level; 347 int level;
329 u64 *shadow_ent; 348 u64 *shadow_ent;
330 u64 *prev_shadow_ent = NULL; 349 u64 *prev_shadow_ent = NULL;
331 pt_element_t *guest_ent = walker->ptep;
332 350
333 if (!is_present_pte(*guest_ent)) 351 if (!is_present_pte(walker->pte))
334 return NULL; 352 return NULL;
335 353
336 shadow_addr = vcpu->mmu.root_hpa; 354 shadow_addr = vcpu->mmu.root_hpa;
@@ -364,12 +382,12 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
364 if (level - 1 == PT_PAGE_TABLE_LEVEL 382 if (level - 1 == PT_PAGE_TABLE_LEVEL
365 && walker->level == PT_DIRECTORY_LEVEL) { 383 && walker->level == PT_DIRECTORY_LEVEL) {
366 metaphysical = 1; 384 metaphysical = 1;
367 hugepage_access = *guest_ent; 385 hugepage_access = walker->pte;
368 hugepage_access &= PT_USER_MASK | PT_WRITABLE_MASK; 386 hugepage_access &= PT_USER_MASK | PT_WRITABLE_MASK;
369 if (*guest_ent & PT64_NX_MASK) 387 if (walker->pte & PT64_NX_MASK)
370 hugepage_access |= (1 << 2); 388 hugepage_access |= (1 << 2);
371 hugepage_access >>= PT_WRITABLE_SHIFT; 389 hugepage_access >>= PT_WRITABLE_SHIFT;
372 table_gfn = (*guest_ent & PT_BASE_ADDR_MASK) 390 table_gfn = (walker->pte & PT_BASE_ADDR_MASK)
373 >> PAGE_SHIFT; 391 >> PAGE_SHIFT;
374 } else { 392 } else {
375 metaphysical = 0; 393 metaphysical = 0;
@@ -386,12 +404,12 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
386 } 404 }
387 405
388 if (walker->level == PT_DIRECTORY_LEVEL) { 406 if (walker->level == PT_DIRECTORY_LEVEL) {
389 FNAME(set_pde)(vcpu, guest_ent, shadow_ent, 407 FNAME(set_pde)(vcpu, walker->pte, shadow_ent,
390 walker->inherited_ar, user_fault, write_fault, 408 walker->inherited_ar, user_fault, write_fault,
391 ptwrite, walker, walker->gfn); 409 ptwrite, walker, walker->gfn);
392 } else { 410 } else {
393 ASSERT(walker->level == PT_PAGE_TABLE_LEVEL); 411 ASSERT(walker->level == PT_PAGE_TABLE_LEVEL);
394 FNAME(set_pte)(vcpu, guest_ent, shadow_ent, 412 FNAME(set_pte)(vcpu, walker->pte, shadow_ent,
395 walker->inherited_ar, user_fault, write_fault, 413 walker->inherited_ar, user_fault, write_fault,
396 ptwrite, walker, walker->gfn); 414 ptwrite, walker, walker->gfn);
397 } 415 }
@@ -442,7 +460,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
442 if (!r) { 460 if (!r) {
443 pgprintk("%s: guest page fault\n", __FUNCTION__); 461 pgprintk("%s: guest page fault\n", __FUNCTION__);
444 inject_page_fault(vcpu, addr, walker.error_code); 462 inject_page_fault(vcpu, addr, walker.error_code);
445 FNAME(release_walker)(&walker);
446 vcpu->last_pt_write_count = 0; /* reset fork detector */ 463 vcpu->last_pt_write_count = 0; /* reset fork detector */
447 return 0; 464 return 0;
448 } 465 }
@@ -452,8 +469,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
452 pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __FUNCTION__, 469 pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __FUNCTION__,
453 shadow_pte, *shadow_pte, write_pt); 470 shadow_pte, *shadow_pte, write_pt);
454 471
455 FNAME(release_walker)(&walker);
456
457 if (!write_pt) 472 if (!write_pt)
458 vcpu->last_pt_write_count = 0; /* reset fork detector */ 473 vcpu->last_pt_write_count = 0; /* reset fork detector */
459 474
@@ -482,7 +497,6 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr)
482 gpa |= vaddr & ~PAGE_MASK; 497 gpa |= vaddr & ~PAGE_MASK;
483 } 498 }
484 499
485 FNAME(release_walker)(&walker);
486 return gpa; 500 return gpa;
487} 501}
488 502
diff --git a/drivers/kvm/svm.c b/drivers/kvm/svm.c
index bc818cc126e3..729f1cd93606 100644
--- a/drivers/kvm/svm.c
+++ b/drivers/kvm/svm.c
@@ -16,12 +16,12 @@
16 16
17#include "kvm_svm.h" 17#include "kvm_svm.h"
18#include "x86_emulate.h" 18#include "x86_emulate.h"
19#include "irq.h"
19 20
20#include <linux/module.h> 21#include <linux/module.h>
21#include <linux/kernel.h> 22#include <linux/kernel.h>
22#include <linux/vmalloc.h> 23#include <linux/vmalloc.h>
23#include <linux/highmem.h> 24#include <linux/highmem.h>
24#include <linux/profile.h>
25#include <linux/sched.h> 25#include <linux/sched.h>
26 26
27#include <asm/desc.h> 27#include <asm/desc.h>
@@ -38,7 +38,6 @@ MODULE_LICENSE("GPL");
38 38
39#define DR7_GD_MASK (1 << 13) 39#define DR7_GD_MASK (1 << 13)
40#define DR6_BD_MASK (1 << 13) 40#define DR6_BD_MASK (1 << 13)
41#define CR4_DE_MASK (1UL << 3)
42 41
43#define SEG_TYPE_LDT 2 42#define SEG_TYPE_LDT 2
44#define SEG_TYPE_BUSY_TSS16 3 43#define SEG_TYPE_BUSY_TSS16 3
@@ -50,6 +49,13 @@ MODULE_LICENSE("GPL");
50#define SVM_FEATURE_LBRV (1 << 1) 49#define SVM_FEATURE_LBRV (1 << 1)
51#define SVM_DEATURE_SVML (1 << 2) 50#define SVM_DEATURE_SVML (1 << 2)
52 51
52static void kvm_reput_irq(struct vcpu_svm *svm);
53
54static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu)
55{
56 return container_of(vcpu, struct vcpu_svm, vcpu);
57}
58
53unsigned long iopm_base; 59unsigned long iopm_base;
54unsigned long msrpm_base; 60unsigned long msrpm_base;
55 61
@@ -94,20 +100,6 @@ static inline u32 svm_has(u32 feat)
94 return svm_features & feat; 100 return svm_features & feat;
95} 101}
96 102
97static unsigned get_addr_size(struct kvm_vcpu *vcpu)
98{
99 struct vmcb_save_area *sa = &vcpu->svm->vmcb->save;
100 u16 cs_attrib;
101
102 if (!(sa->cr0 & CR0_PE_MASK) || (sa->rflags & X86_EFLAGS_VM))
103 return 2;
104
105 cs_attrib = sa->cs.attrib;
106
107 return (cs_attrib & SVM_SELECTOR_L_MASK) ? 8 :
108 (cs_attrib & SVM_SELECTOR_DB_MASK) ? 4 : 2;
109}
110
111static inline u8 pop_irq(struct kvm_vcpu *vcpu) 103static inline u8 pop_irq(struct kvm_vcpu *vcpu)
112{ 104{
113 int word_index = __ffs(vcpu->irq_summary); 105 int word_index = __ffs(vcpu->irq_summary);
@@ -182,7 +174,7 @@ static inline void write_dr7(unsigned long val)
182 174
183static inline void force_new_asid(struct kvm_vcpu *vcpu) 175static inline void force_new_asid(struct kvm_vcpu *vcpu)
184{ 176{
185 vcpu->svm->asid_generation--; 177 to_svm(vcpu)->asid_generation--;
186} 178}
187 179
188static inline void flush_guest_tlb(struct kvm_vcpu *vcpu) 180static inline void flush_guest_tlb(struct kvm_vcpu *vcpu)
@@ -195,22 +187,24 @@ static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
195 if (!(efer & KVM_EFER_LMA)) 187 if (!(efer & KVM_EFER_LMA))
196 efer &= ~KVM_EFER_LME; 188 efer &= ~KVM_EFER_LME;
197 189
198 vcpu->svm->vmcb->save.efer = efer | MSR_EFER_SVME_MASK; 190 to_svm(vcpu)->vmcb->save.efer = efer | MSR_EFER_SVME_MASK;
199 vcpu->shadow_efer = efer; 191 vcpu->shadow_efer = efer;
200} 192}
201 193
202static void svm_inject_gp(struct kvm_vcpu *vcpu, unsigned error_code) 194static void svm_inject_gp(struct kvm_vcpu *vcpu, unsigned error_code)
203{ 195{
204 vcpu->svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | 196 struct vcpu_svm *svm = to_svm(vcpu);
197
198 svm->vmcb->control.event_inj = SVM_EVTINJ_VALID |
205 SVM_EVTINJ_VALID_ERR | 199 SVM_EVTINJ_VALID_ERR |
206 SVM_EVTINJ_TYPE_EXEPT | 200 SVM_EVTINJ_TYPE_EXEPT |
207 GP_VECTOR; 201 GP_VECTOR;
208 vcpu->svm->vmcb->control.event_inj_err = error_code; 202 svm->vmcb->control.event_inj_err = error_code;
209} 203}
210 204
211static void inject_ud(struct kvm_vcpu *vcpu) 205static void inject_ud(struct kvm_vcpu *vcpu)
212{ 206{
213 vcpu->svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | 207 to_svm(vcpu)->vmcb->control.event_inj = SVM_EVTINJ_VALID |
214 SVM_EVTINJ_TYPE_EXEPT | 208 SVM_EVTINJ_TYPE_EXEPT |
215 UD_VECTOR; 209 UD_VECTOR;
216} 210}
@@ -229,19 +223,21 @@ static int is_external_interrupt(u32 info)
229 223
230static void skip_emulated_instruction(struct kvm_vcpu *vcpu) 224static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
231{ 225{
232 if (!vcpu->svm->next_rip) { 226 struct vcpu_svm *svm = to_svm(vcpu);
227
228 if (!svm->next_rip) {
233 printk(KERN_DEBUG "%s: NOP\n", __FUNCTION__); 229 printk(KERN_DEBUG "%s: NOP\n", __FUNCTION__);
234 return; 230 return;
235 } 231 }
236 if (vcpu->svm->next_rip - vcpu->svm->vmcb->save.rip > 15) { 232 if (svm->next_rip - svm->vmcb->save.rip > MAX_INST_SIZE) {
237 printk(KERN_ERR "%s: ip 0x%llx next 0x%llx\n", 233 printk(KERN_ERR "%s: ip 0x%llx next 0x%llx\n",
238 __FUNCTION__, 234 __FUNCTION__,
239 vcpu->svm->vmcb->save.rip, 235 svm->vmcb->save.rip,
240 vcpu->svm->next_rip); 236 svm->next_rip);
241 } 237 }
242 238
243 vcpu->rip = vcpu->svm->vmcb->save.rip = vcpu->svm->next_rip; 239 vcpu->rip = svm->vmcb->save.rip = svm->next_rip;
244 vcpu->svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK; 240 svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK;
245 241
246 vcpu->interrupt_window_open = 1; 242 vcpu->interrupt_window_open = 1;
247} 243}
@@ -351,8 +347,8 @@ err_1:
351 347
352} 348}
353 349
354static int set_msr_interception(u32 *msrpm, unsigned msr, 350static void set_msr_interception(u32 *msrpm, unsigned msr,
355 int read, int write) 351 int read, int write)
356{ 352{
357 int i; 353 int i;
358 354
@@ -367,11 +363,10 @@ static int set_msr_interception(u32 *msrpm, unsigned msr,
367 u32 mask = ((write) ? 0 : 2) | ((read) ? 0 : 1); 363 u32 mask = ((write) ? 0 : 2) | ((read) ? 0 : 1);
368 *base = (*base & ~(0x3 << msr_shift)) | 364 *base = (*base & ~(0x3 << msr_shift)) |
369 (mask << msr_shift); 365 (mask << msr_shift);
370 return 1; 366 return;
371 } 367 }
372 } 368 }
373 printk(KERN_DEBUG "%s: not found 0x%x\n", __FUNCTION__, msr); 369 BUG();
374 return 0;
375} 370}
376 371
377static __init int svm_hardware_setup(void) 372static __init int svm_hardware_setup(void)
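set_msr_interception() above maintains two permission bits per MSR inside a 32-bit word of the MSR permission map: in each pair, a set bit 0 intercepts reads and a set bit 1 intercepts writes, as the mask construction implies. A stand-alone sketch of that bit update follows; the computation of the word offset and shift for a real MSR number sits outside the hunk and is not reproduced, so msr_shift is simply assumed.

/* sketch: the two-bit read/write intercept update */
#include <stdint.h>
#include <stdio.h>

static void set_pair(uint32_t *base, unsigned int msr_shift, int read, int write)
{
	uint32_t mask = (write ? 0 : 2) | (read ? 0 : 1);	/* set bit => intercept */

	*base = (*base & ~(0x3u << msr_shift)) | (mask << msr_shift);
}

int main(void)
{
	uint32_t chunk = ~0u;		/* start with every access intercepted */

	set_pair(&chunk, 4, 1, 1);	/* allow reads and writes: bits 4-5 cleared */
	set_pair(&chunk, 6, 1, 0);	/* allow reads only: bit 6 cleared, bit 7 set */
	printf("chunk = 0x%08x\n", chunk);
	return 0;
}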
@@ -382,8 +377,6 @@ static __init int svm_hardware_setup(void)
382 void *iopm_va, *msrpm_va; 377 void *iopm_va, *msrpm_va;
383 int r; 378 int r;
384 379
385 kvm_emulator_want_group7_invlpg();
386
387 iopm_pages = alloc_pages(GFP_KERNEL, IOPM_ALLOC_ORDER); 380 iopm_pages = alloc_pages(GFP_KERNEL, IOPM_ALLOC_ORDER);
388 381
389 if (!iopm_pages) 382 if (!iopm_pages)
@@ -458,11 +451,6 @@ static void init_sys_seg(struct vmcb_seg *seg, uint32_t type)
458 seg->base = 0; 451 seg->base = 0;
459} 452}
460 453
461static int svm_vcpu_setup(struct kvm_vcpu *vcpu)
462{
463 return 0;
464}
465
466static void init_vmcb(struct vmcb *vmcb) 454static void init_vmcb(struct vmcb *vmcb)
467{ 455{
468 struct vmcb_control_area *control = &vmcb->control; 456 struct vmcb_control_area *control = &vmcb->control;
@@ -563,59 +551,83 @@ static void init_vmcb(struct vmcb *vmcb)
563 * cr0 val on cpu init should be 0x60000010, we enable cpu 551 * cr0 val on cpu init should be 0x60000010, we enable cpu
564 * cache by default. the orderly way is to enable cache in bios. 552 * cache by default. the orderly way is to enable cache in bios.
565 */ 553 */
566 save->cr0 = 0x00000010 | CR0_PG_MASK | CR0_WP_MASK; 554 save->cr0 = 0x00000010 | X86_CR0_PG | X86_CR0_WP;
567 save->cr4 = CR4_PAE_MASK; 555 save->cr4 = X86_CR4_PAE;
568 /* rdx = ?? */ 556 /* rdx = ?? */
569} 557}
570 558
571static int svm_create_vcpu(struct kvm_vcpu *vcpu) 559static void svm_vcpu_reset(struct kvm_vcpu *vcpu)
560{
561 struct vcpu_svm *svm = to_svm(vcpu);
562
563 init_vmcb(svm->vmcb);
564}
565
566static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
572{ 567{
568 struct vcpu_svm *svm;
573 struct page *page; 569 struct page *page;
574 int r; 570 int err;
571
572 svm = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
573 if (!svm) {
574 err = -ENOMEM;
575 goto out;
576 }
577
578 err = kvm_vcpu_init(&svm->vcpu, kvm, id);
579 if (err)
580 goto free_svm;
581
582 if (irqchip_in_kernel(kvm)) {
583 err = kvm_create_lapic(&svm->vcpu);
584 if (err < 0)
585 goto free_svm;
586 }
575 587
576 r = -ENOMEM;
577 vcpu->svm = kzalloc(sizeof *vcpu->svm, GFP_KERNEL);
578 if (!vcpu->svm)
579 goto out1;
580 page = alloc_page(GFP_KERNEL); 588 page = alloc_page(GFP_KERNEL);
581 if (!page) 589 if (!page) {
582 goto out2; 590 err = -ENOMEM;
583 591 goto uninit;
584 vcpu->svm->vmcb = page_address(page); 592 }
585 clear_page(vcpu->svm->vmcb);
586 vcpu->svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT;
587 vcpu->svm->asid_generation = 0;
588 memset(vcpu->svm->db_regs, 0, sizeof(vcpu->svm->db_regs));
589 init_vmcb(vcpu->svm->vmcb);
590
591 fx_init(vcpu);
592 vcpu->fpu_active = 1;
593 vcpu->apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
594 if (vcpu == &vcpu->kvm->vcpus[0])
595 vcpu->apic_base |= MSR_IA32_APICBASE_BSP;
596 593
597 return 0; 594 svm->vmcb = page_address(page);
595 clear_page(svm->vmcb);
596 svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT;
597 svm->asid_generation = 0;
598 memset(svm->db_regs, 0, sizeof(svm->db_regs));
599 init_vmcb(svm->vmcb);
598 600
599out2: 601 fx_init(&svm->vcpu);
600 kfree(vcpu->svm); 602 svm->vcpu.fpu_active = 1;
601out1: 603 svm->vcpu.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
602 return r; 604 if (svm->vcpu.vcpu_id == 0)
605 svm->vcpu.apic_base |= MSR_IA32_APICBASE_BSP;
606
607 return &svm->vcpu;
608
609uninit:
610 kvm_vcpu_uninit(&svm->vcpu);
611free_svm:
612 kmem_cache_free(kvm_vcpu_cache, svm);
613out:
614 return ERR_PTR(err);
603} 615}
604 616
605static void svm_free_vcpu(struct kvm_vcpu *vcpu) 617static void svm_free_vcpu(struct kvm_vcpu *vcpu)
606{ 618{
607 if (!vcpu->svm) 619 struct vcpu_svm *svm = to_svm(vcpu);
608 return; 620
609 if (vcpu->svm->vmcb) 621 __free_page(pfn_to_page(svm->vmcb_pa >> PAGE_SHIFT));
610 __free_page(pfn_to_page(vcpu->svm->vmcb_pa >> PAGE_SHIFT)); 622 kvm_vcpu_uninit(vcpu);
611 kfree(vcpu->svm); 623 kmem_cache_free(kvm_vcpu_cache, svm);
612} 624}
613 625
614static void svm_vcpu_load(struct kvm_vcpu *vcpu) 626static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
615{ 627{
616 int cpu, i; 628 struct vcpu_svm *svm = to_svm(vcpu);
629 int i;
617 630
618 cpu = get_cpu();
619 if (unlikely(cpu != vcpu->cpu)) { 631 if (unlikely(cpu != vcpu->cpu)) {
620 u64 tsc_this, delta; 632 u64 tsc_this, delta;
621 633
@@ -625,23 +637,24 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu)
625 */ 637 */
626 rdtscll(tsc_this); 638 rdtscll(tsc_this);
627 delta = vcpu->host_tsc - tsc_this; 639 delta = vcpu->host_tsc - tsc_this;
628 vcpu->svm->vmcb->control.tsc_offset += delta; 640 svm->vmcb->control.tsc_offset += delta;
629 vcpu->cpu = cpu; 641 vcpu->cpu = cpu;
642 kvm_migrate_apic_timer(vcpu);
630 } 643 }
631 644
632 for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) 645 for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
633 rdmsrl(host_save_user_msrs[i], vcpu->svm->host_user_msrs[i]); 646 rdmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
634} 647}
635 648
636static void svm_vcpu_put(struct kvm_vcpu *vcpu) 649static void svm_vcpu_put(struct kvm_vcpu *vcpu)
637{ 650{
651 struct vcpu_svm *svm = to_svm(vcpu);
638 int i; 652 int i;
639 653
640 for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) 654 for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
641 wrmsrl(host_save_user_msrs[i], vcpu->svm->host_user_msrs[i]); 655 wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
642 656
643 rdtscll(vcpu->host_tsc); 657 rdtscll(vcpu->host_tsc);
644 put_cpu();
645} 658}
646 659
647static void svm_vcpu_decache(struct kvm_vcpu *vcpu) 660static void svm_vcpu_decache(struct kvm_vcpu *vcpu)
@@ -650,31 +663,34 @@ static void svm_vcpu_decache(struct kvm_vcpu *vcpu)
650 663
651static void svm_cache_regs(struct kvm_vcpu *vcpu) 664static void svm_cache_regs(struct kvm_vcpu *vcpu)
652{ 665{
653 vcpu->regs[VCPU_REGS_RAX] = vcpu->svm->vmcb->save.rax; 666 struct vcpu_svm *svm = to_svm(vcpu);
654 vcpu->regs[VCPU_REGS_RSP] = vcpu->svm->vmcb->save.rsp; 667
655 vcpu->rip = vcpu->svm->vmcb->save.rip; 668 vcpu->regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
669 vcpu->regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
670 vcpu->rip = svm->vmcb->save.rip;
656} 671}
657 672
658static void svm_decache_regs(struct kvm_vcpu *vcpu) 673static void svm_decache_regs(struct kvm_vcpu *vcpu)
659{ 674{
660 vcpu->svm->vmcb->save.rax = vcpu->regs[VCPU_REGS_RAX]; 675 struct vcpu_svm *svm = to_svm(vcpu);
661 vcpu->svm->vmcb->save.rsp = vcpu->regs[VCPU_REGS_RSP]; 676 svm->vmcb->save.rax = vcpu->regs[VCPU_REGS_RAX];
662 vcpu->svm->vmcb->save.rip = vcpu->rip; 677 svm->vmcb->save.rsp = vcpu->regs[VCPU_REGS_RSP];
678 svm->vmcb->save.rip = vcpu->rip;
663} 679}
664 680
665static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu) 681static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
666{ 682{
667 return vcpu->svm->vmcb->save.rflags; 683 return to_svm(vcpu)->vmcb->save.rflags;
668} 684}
669 685
670static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) 686static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
671{ 687{
672 vcpu->svm->vmcb->save.rflags = rflags; 688 to_svm(vcpu)->vmcb->save.rflags = rflags;
673} 689}
674 690
675static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg) 691static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg)
676{ 692{
677 struct vmcb_save_area *save = &vcpu->svm->vmcb->save; 693 struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;
678 694
679 switch (seg) { 695 switch (seg) {
680 case VCPU_SREG_CS: return &save->cs; 696 case VCPU_SREG_CS: return &save->cs;
@@ -716,36 +732,36 @@ static void svm_get_segment(struct kvm_vcpu *vcpu,
716 var->unusable = !var->present; 732 var->unusable = !var->present;
717} 733}
718 734
719static void svm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
720{
721 struct vmcb_seg *s = svm_seg(vcpu, VCPU_SREG_CS);
722
723 *db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1;
724 *l = (s->attrib >> SVM_SELECTOR_L_SHIFT) & 1;
725}
726
727static void svm_get_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) 735static void svm_get_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
728{ 736{
729 dt->limit = vcpu->svm->vmcb->save.idtr.limit; 737 struct vcpu_svm *svm = to_svm(vcpu);
730 dt->base = vcpu->svm->vmcb->save.idtr.base; 738
739 dt->limit = svm->vmcb->save.idtr.limit;
740 dt->base = svm->vmcb->save.idtr.base;
731} 741}
732 742
733static void svm_set_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) 743static void svm_set_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
734{ 744{
735 vcpu->svm->vmcb->save.idtr.limit = dt->limit; 745 struct vcpu_svm *svm = to_svm(vcpu);
736 vcpu->svm->vmcb->save.idtr.base = dt->base ; 746
747 svm->vmcb->save.idtr.limit = dt->limit;
748 svm->vmcb->save.idtr.base = dt->base ;
737} 749}
738 750
739static void svm_get_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) 751static void svm_get_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
740{ 752{
741 dt->limit = vcpu->svm->vmcb->save.gdtr.limit; 753 struct vcpu_svm *svm = to_svm(vcpu);
742 dt->base = vcpu->svm->vmcb->save.gdtr.base; 754
755 dt->limit = svm->vmcb->save.gdtr.limit;
756 dt->base = svm->vmcb->save.gdtr.base;
743} 757}
744 758
745static void svm_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) 759static void svm_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
746{ 760{
747 vcpu->svm->vmcb->save.gdtr.limit = dt->limit; 761 struct vcpu_svm *svm = to_svm(vcpu);
748 vcpu->svm->vmcb->save.gdtr.base = dt->base ; 762
763 svm->vmcb->save.gdtr.limit = dt->limit;
764 svm->vmcb->save.gdtr.base = dt->base ;
749} 765}
750 766
751static void svm_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) 767static void svm_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
@@ -754,39 +770,42 @@ static void svm_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
754 770
755static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) 771static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
756{ 772{
773 struct vcpu_svm *svm = to_svm(vcpu);
774
757#ifdef CONFIG_X86_64 775#ifdef CONFIG_X86_64
758 if (vcpu->shadow_efer & KVM_EFER_LME) { 776 if (vcpu->shadow_efer & KVM_EFER_LME) {
759 if (!is_paging(vcpu) && (cr0 & CR0_PG_MASK)) { 777 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
760 vcpu->shadow_efer |= KVM_EFER_LMA; 778 vcpu->shadow_efer |= KVM_EFER_LMA;
761 vcpu->svm->vmcb->save.efer |= KVM_EFER_LMA | KVM_EFER_LME; 779 svm->vmcb->save.efer |= KVM_EFER_LMA | KVM_EFER_LME;
762 } 780 }
763 781
764 if (is_paging(vcpu) && !(cr0 & CR0_PG_MASK) ) { 782 if (is_paging(vcpu) && !(cr0 & X86_CR0_PG) ) {
765 vcpu->shadow_efer &= ~KVM_EFER_LMA; 783 vcpu->shadow_efer &= ~KVM_EFER_LMA;
766 vcpu->svm->vmcb->save.efer &= ~(KVM_EFER_LMA | KVM_EFER_LME); 784 svm->vmcb->save.efer &= ~(KVM_EFER_LMA | KVM_EFER_LME);
767 } 785 }
768 } 786 }
769#endif 787#endif
770 if ((vcpu->cr0 & CR0_TS_MASK) && !(cr0 & CR0_TS_MASK)) { 788 if ((vcpu->cr0 & X86_CR0_TS) && !(cr0 & X86_CR0_TS)) {
771 vcpu->svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR); 789 svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR);
772 vcpu->fpu_active = 1; 790 vcpu->fpu_active = 1;
773 } 791 }
774 792
775 vcpu->cr0 = cr0; 793 vcpu->cr0 = cr0;
776 cr0 |= CR0_PG_MASK | CR0_WP_MASK; 794 cr0 |= X86_CR0_PG | X86_CR0_WP;
777 cr0 &= ~(CR0_CD_MASK | CR0_NW_MASK); 795 cr0 &= ~(X86_CR0_CD | X86_CR0_NW);
778 vcpu->svm->vmcb->save.cr0 = cr0; 796 svm->vmcb->save.cr0 = cr0;
779} 797}
780 798
781static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) 799static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
782{ 800{
783 vcpu->cr4 = cr4; 801 vcpu->cr4 = cr4;
784 vcpu->svm->vmcb->save.cr4 = cr4 | CR4_PAE_MASK; 802 to_svm(vcpu)->vmcb->save.cr4 = cr4 | X86_CR4_PAE;
785} 803}
786 804
787static void svm_set_segment(struct kvm_vcpu *vcpu, 805static void svm_set_segment(struct kvm_vcpu *vcpu,
788 struct kvm_segment *var, int seg) 806 struct kvm_segment *var, int seg)
789{ 807{
808 struct vcpu_svm *svm = to_svm(vcpu);
790 struct vmcb_seg *s = svm_seg(vcpu, seg); 809 struct vmcb_seg *s = svm_seg(vcpu, seg);
791 810
792 s->base = var->base; 811 s->base = var->base;
@@ -805,16 +824,16 @@ static void svm_set_segment(struct kvm_vcpu *vcpu,
805 s->attrib |= (var->g & 1) << SVM_SELECTOR_G_SHIFT; 824 s->attrib |= (var->g & 1) << SVM_SELECTOR_G_SHIFT;
806 } 825 }
807 if (seg == VCPU_SREG_CS) 826 if (seg == VCPU_SREG_CS)
808 vcpu->svm->vmcb->save.cpl 827 svm->vmcb->save.cpl
809 = (vcpu->svm->vmcb->save.cs.attrib 828 = (svm->vmcb->save.cs.attrib
810 >> SVM_SELECTOR_DPL_SHIFT) & 3; 829 >> SVM_SELECTOR_DPL_SHIFT) & 3;
811 830
812} 831}
813 832
814/* FIXME: 833/* FIXME:
815 834
816 vcpu->svm->vmcb->control.int_ctl &= ~V_TPR_MASK; 835 svm(vcpu)->vmcb->control.int_ctl &= ~V_TPR_MASK;
817 vcpu->svm->vmcb->control.int_ctl |= (sregs->cr8 & V_TPR_MASK); 836 svm(vcpu)->vmcb->control.int_ctl |= (sregs->cr8 & V_TPR_MASK);
818 837
819*/ 838*/
820 839
@@ -823,61 +842,68 @@ static int svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg)
823 return -EOPNOTSUPP; 842 return -EOPNOTSUPP;
824} 843}
825 844
845static int svm_get_irq(struct kvm_vcpu *vcpu)
846{
847 struct vcpu_svm *svm = to_svm(vcpu);
848 u32 exit_int_info = svm->vmcb->control.exit_int_info;
849
850 if (is_external_interrupt(exit_int_info))
851 return exit_int_info & SVM_EVTINJ_VEC_MASK;
852 return -1;
853}
854
826static void load_host_msrs(struct kvm_vcpu *vcpu) 855static void load_host_msrs(struct kvm_vcpu *vcpu)
827{ 856{
828#ifdef CONFIG_X86_64 857#ifdef CONFIG_X86_64
829 wrmsrl(MSR_GS_BASE, vcpu->svm->host_gs_base); 858 wrmsrl(MSR_GS_BASE, to_svm(vcpu)->host_gs_base);
830#endif 859#endif
831} 860}
832 861
833static void save_host_msrs(struct kvm_vcpu *vcpu) 862static void save_host_msrs(struct kvm_vcpu *vcpu)
834{ 863{
835#ifdef CONFIG_X86_64 864#ifdef CONFIG_X86_64
836 rdmsrl(MSR_GS_BASE, vcpu->svm->host_gs_base); 865 rdmsrl(MSR_GS_BASE, to_svm(vcpu)->host_gs_base);
837#endif 866#endif
838} 867}
839 868
840static void new_asid(struct kvm_vcpu *vcpu, struct svm_cpu_data *svm_data) 869static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *svm_data)
841{ 870{
842 if (svm_data->next_asid > svm_data->max_asid) { 871 if (svm_data->next_asid > svm_data->max_asid) {
843 ++svm_data->asid_generation; 872 ++svm_data->asid_generation;
844 svm_data->next_asid = 1; 873 svm_data->next_asid = 1;
845 vcpu->svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ALL_ASID; 874 svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ALL_ASID;
846 } 875 }
847 876
848 vcpu->cpu = svm_data->cpu; 877 svm->vcpu.cpu = svm_data->cpu;
849 vcpu->svm->asid_generation = svm_data->asid_generation; 878 svm->asid_generation = svm_data->asid_generation;
850 vcpu->svm->vmcb->control.asid = svm_data->next_asid++; 879 svm->vmcb->control.asid = svm_data->next_asid++;
851}
852
853static void svm_invlpg(struct kvm_vcpu *vcpu, gva_t address)
854{
855 invlpga(address, vcpu->svm->vmcb->control.asid); // is needed?
856} 880}
857 881
858static unsigned long svm_get_dr(struct kvm_vcpu *vcpu, int dr) 882static unsigned long svm_get_dr(struct kvm_vcpu *vcpu, int dr)
859{ 883{
860 return vcpu->svm->db_regs[dr]; 884 return to_svm(vcpu)->db_regs[dr];
861} 885}
862 886
863static void svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value, 887static void svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value,
864 int *exception) 888 int *exception)
865{ 889{
890 struct vcpu_svm *svm = to_svm(vcpu);
891
866 *exception = 0; 892 *exception = 0;
867 893
868 if (vcpu->svm->vmcb->save.dr7 & DR7_GD_MASK) { 894 if (svm->vmcb->save.dr7 & DR7_GD_MASK) {
869 vcpu->svm->vmcb->save.dr7 &= ~DR7_GD_MASK; 895 svm->vmcb->save.dr7 &= ~DR7_GD_MASK;
870 vcpu->svm->vmcb->save.dr6 |= DR6_BD_MASK; 896 svm->vmcb->save.dr6 |= DR6_BD_MASK;
871 *exception = DB_VECTOR; 897 *exception = DB_VECTOR;
872 return; 898 return;
873 } 899 }
874 900
875 switch (dr) { 901 switch (dr) {
876 case 0 ... 3: 902 case 0 ... 3:
877 vcpu->svm->db_regs[dr] = value; 903 svm->db_regs[dr] = value;
878 return; 904 return;
879 case 4 ... 5: 905 case 4 ... 5:
880 if (vcpu->cr4 & CR4_DE_MASK) { 906 if (vcpu->cr4 & X86_CR4_DE) {
881 *exception = UD_VECTOR; 907 *exception = UD_VECTOR;
882 return; 908 return;
883 } 909 }
@@ -886,7 +912,7 @@ static void svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value,
886 *exception = GP_VECTOR; 912 *exception = GP_VECTOR;
887 return; 913 return;
888 } 914 }
889 vcpu->svm->vmcb->save.dr7 = value; 915 svm->vmcb->save.dr7 = value;
890 return; 916 return;
891 } 917 }
892 default: 918 default:
@@ -897,42 +923,44 @@ static void svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value,
897 } 923 }
898} 924}
899 925
900static int pf_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 926static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
901{ 927{
902 u32 exit_int_info = vcpu->svm->vmcb->control.exit_int_info; 928 u32 exit_int_info = svm->vmcb->control.exit_int_info;
929 struct kvm *kvm = svm->vcpu.kvm;
903 u64 fault_address; 930 u64 fault_address;
904 u32 error_code; 931 u32 error_code;
905 enum emulation_result er; 932 enum emulation_result er;
906 int r; 933 int r;
907 934
908 if (is_external_interrupt(exit_int_info)) 935 if (!irqchip_in_kernel(kvm) &&
909 push_irq(vcpu, exit_int_info & SVM_EVTINJ_VEC_MASK); 936 is_external_interrupt(exit_int_info))
937 push_irq(&svm->vcpu, exit_int_info & SVM_EVTINJ_VEC_MASK);
910 938
911 spin_lock(&vcpu->kvm->lock); 939 mutex_lock(&kvm->lock);
912 940
913 fault_address = vcpu->svm->vmcb->control.exit_info_2; 941 fault_address = svm->vmcb->control.exit_info_2;
914 error_code = vcpu->svm->vmcb->control.exit_info_1; 942 error_code = svm->vmcb->control.exit_info_1;
915 r = kvm_mmu_page_fault(vcpu, fault_address, error_code); 943 r = kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code);
916 if (r < 0) { 944 if (r < 0) {
917 spin_unlock(&vcpu->kvm->lock); 945 mutex_unlock(&kvm->lock);
918 return r; 946 return r;
919 } 947 }
920 if (!r) { 948 if (!r) {
921 spin_unlock(&vcpu->kvm->lock); 949 mutex_unlock(&kvm->lock);
922 return 1; 950 return 1;
923 } 951 }
924 er = emulate_instruction(vcpu, kvm_run, fault_address, error_code); 952 er = emulate_instruction(&svm->vcpu, kvm_run, fault_address,
925 spin_unlock(&vcpu->kvm->lock); 953 error_code);
954 mutex_unlock(&kvm->lock);
926 955
927 switch (er) { 956 switch (er) {
928 case EMULATE_DONE: 957 case EMULATE_DONE:
929 return 1; 958 return 1;
930 case EMULATE_DO_MMIO: 959 case EMULATE_DO_MMIO:
931 ++vcpu->stat.mmio_exits; 960 ++svm->vcpu.stat.mmio_exits;
932 kvm_run->exit_reason = KVM_EXIT_MMIO;
933 return 0; 961 return 0;
934 case EMULATE_FAIL: 962 case EMULATE_FAIL:
935 vcpu_printf(vcpu, "%s: emulate fail\n", __FUNCTION__); 963 kvm_report_emulation_failure(&svm->vcpu, "pagetable");
936 break; 964 break;
937 default: 965 default:
938 BUG(); 966 BUG();
@@ -942,252 +970,142 @@ static int pf_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
942 return 0; 970 return 0;
943} 971}
944 972
945static int nm_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 973static int nm_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
946{ 974{
947 vcpu->svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR); 975 svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR);
948 if (!(vcpu->cr0 & CR0_TS_MASK)) 976 if (!(svm->vcpu.cr0 & X86_CR0_TS))
949 vcpu->svm->vmcb->save.cr0 &= ~CR0_TS_MASK; 977 svm->vmcb->save.cr0 &= ~X86_CR0_TS;
950 vcpu->fpu_active = 1; 978 svm->vcpu.fpu_active = 1;
951 979
952 return 1; 980 return 1;
953} 981}
954 982
955static int shutdown_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 983static int shutdown_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
956{ 984{
957 /* 985 /*
958 * VMCB is undefined after a SHUTDOWN intercept 986 * VMCB is undefined after a SHUTDOWN intercept
959 * so reinitialize it. 987 * so reinitialize it.
960 */ 988 */
961 clear_page(vcpu->svm->vmcb); 989 clear_page(svm->vmcb);
962 init_vmcb(vcpu->svm->vmcb); 990 init_vmcb(svm->vmcb);
963 991
964 kvm_run->exit_reason = KVM_EXIT_SHUTDOWN; 992 kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
965 return 0; 993 return 0;
966} 994}
967 995
968static int io_get_override(struct kvm_vcpu *vcpu, 996static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
969 struct vmcb_seg **seg,
970 int *addr_override)
971{
972 u8 inst[MAX_INST_SIZE];
973 unsigned ins_length;
974 gva_t rip;
975 int i;
976
977 rip = vcpu->svm->vmcb->save.rip;
978 ins_length = vcpu->svm->next_rip - rip;
979 rip += vcpu->svm->vmcb->save.cs.base;
980
981 if (ins_length > MAX_INST_SIZE)
982 printk(KERN_DEBUG
983 "%s: inst length err, cs base 0x%llx rip 0x%llx "
984 "next rip 0x%llx ins_length %u\n",
985 __FUNCTION__,
986 vcpu->svm->vmcb->save.cs.base,
987 vcpu->svm->vmcb->save.rip,
988 vcpu->svm->vmcb->control.exit_info_2,
989 ins_length);
990
991 if (kvm_read_guest(vcpu, rip, ins_length, inst) != ins_length)
992 /* #PF */
993 return 0;
994
995 *addr_override = 0;
996 *seg = NULL;
997 for (i = 0; i < ins_length; i++)
998 switch (inst[i]) {
999 case 0xf0:
1000 case 0xf2:
1001 case 0xf3:
1002 case 0x66:
1003 continue;
1004 case 0x67:
1005 *addr_override = 1;
1006 continue;
1007 case 0x2e:
1008 *seg = &vcpu->svm->vmcb->save.cs;
1009 continue;
1010 case 0x36:
1011 *seg = &vcpu->svm->vmcb->save.ss;
1012 continue;
1013 case 0x3e:
1014 *seg = &vcpu->svm->vmcb->save.ds;
1015 continue;
1016 case 0x26:
1017 *seg = &vcpu->svm->vmcb->save.es;
1018 continue;
1019 case 0x64:
1020 *seg = &vcpu->svm->vmcb->save.fs;
1021 continue;
1022 case 0x65:
1023 *seg = &vcpu->svm->vmcb->save.gs;
1024 continue;
1025 default:
1026 return 1;
1027 }
1028 printk(KERN_DEBUG "%s: unexpected\n", __FUNCTION__);
1029 return 0;
1030}
1031
1032static unsigned long io_adress(struct kvm_vcpu *vcpu, int ins, gva_t *address)
1033{ 997{
1034 unsigned long addr_mask; 998 u32 io_info = svm->vmcb->control.exit_info_1; //address size bug?
1035 unsigned long *reg; 999 int size, down, in, string, rep;
1036 struct vmcb_seg *seg; 1000 unsigned port;
1037 int addr_override;
1038 struct vmcb_save_area *save_area = &vcpu->svm->vmcb->save;
1039 u16 cs_attrib = save_area->cs.attrib;
1040 unsigned addr_size = get_addr_size(vcpu);
1041
1042 if (!io_get_override(vcpu, &seg, &addr_override))
1043 return 0;
1044
1045 if (addr_override)
1046 addr_size = (addr_size == 2) ? 4: (addr_size >> 1);
1047 1001
1048 if (ins) { 1002 ++svm->vcpu.stat.io_exits;
1049 reg = &vcpu->regs[VCPU_REGS_RDI];
1050 seg = &vcpu->svm->vmcb->save.es;
1051 } else {
1052 reg = &vcpu->regs[VCPU_REGS_RSI];
1053 seg = (seg) ? seg : &vcpu->svm->vmcb->save.ds;
1054 }
1055 1003
1056 addr_mask = ~0ULL >> (64 - (addr_size * 8)); 1004 svm->next_rip = svm->vmcb->control.exit_info_2;
1057 1005
1058 if ((cs_attrib & SVM_SELECTOR_L_MASK) && 1006 string = (io_info & SVM_IOIO_STR_MASK) != 0;
1059 !(vcpu->svm->vmcb->save.rflags & X86_EFLAGS_VM)) {
1060 *address = (*reg & addr_mask);
1061 return addr_mask;
1062 }
1063 1007
1064 if (!(seg->attrib & SVM_SELECTOR_P_SHIFT)) { 1008 if (string) {
1065 svm_inject_gp(vcpu, 0); 1009 if (emulate_instruction(&svm->vcpu, kvm_run, 0, 0) == EMULATE_DO_MMIO)
1066 return 0; 1010 return 0;
1011 return 1;
1067 } 1012 }
1068 1013
1069 *address = (*reg & addr_mask) + seg->base;
1070 return addr_mask;
1071}
1072
1073static int io_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1074{
1075 u32 io_info = vcpu->svm->vmcb->control.exit_info_1; //address size bug?
1076 int size, down, in, string, rep;
1077 unsigned port;
1078 unsigned long count;
1079 gva_t address = 0;
1080
1081 ++vcpu->stat.io_exits;
1082
1083 vcpu->svm->next_rip = vcpu->svm->vmcb->control.exit_info_2;
1084
1085 in = (io_info & SVM_IOIO_TYPE_MASK) != 0; 1014 in = (io_info & SVM_IOIO_TYPE_MASK) != 0;
1086 port = io_info >> 16; 1015 port = io_info >> 16;
1087 size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT; 1016 size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT;
1088 string = (io_info & SVM_IOIO_STR_MASK) != 0;
1089 rep = (io_info & SVM_IOIO_REP_MASK) != 0; 1017 rep = (io_info & SVM_IOIO_REP_MASK) != 0;
1090 count = 1; 1018 down = (svm->vmcb->save.rflags & X86_EFLAGS_DF) != 0;
1091 down = (vcpu->svm->vmcb->save.rflags & X86_EFLAGS_DF) != 0;
1092 1019
1093 if (string) { 1020 return kvm_emulate_pio(&svm->vcpu, kvm_run, in, size, port);
1094 unsigned addr_mask;
1095
1096 addr_mask = io_adress(vcpu, in, &address);
1097 if (!addr_mask) {
1098 printk(KERN_DEBUG "%s: get io address failed\n",
1099 __FUNCTION__);
1100 return 1;
1101 }
1102
1103 if (rep)
1104 count = vcpu->regs[VCPU_REGS_RCX] & addr_mask;
1105 }
1106 return kvm_setup_pio(vcpu, kvm_run, in, size, count, string, down,
1107 address, rep, port);
1108} 1021}
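The new io_interception() above pulls everything it needs out of the VMCB's exit_info_1 word instead of re-decoding the guest instruction stream. A stand-alone sketch of that decode follows; the bit layout mirrors the SVM_IOIO_* masks in svm.h and the AMD manual, but the literal values are reproduced here only so the example builds on its own and should be treated as an assumption.

/*
 * Stand-alone sketch of the IOIO exit_info_1 decode done above.
 * Mask values are assumed to match svm.h; verify before relying on them.
 */
#include <stdint.h>
#include <stdio.h>

#define IOIO_TYPE_MASK  1u          /* bit 0: 1 = IN, 0 = OUT        */
#define IOIO_STR_MASK   (1u << 2)   /* bit 2: string op (INS/OUTS)   */
#define IOIO_REP_MASK   (1u << 3)   /* bit 3: REP prefix             */
#define IOIO_SIZE_MASK  (7u << 4)   /* bits 4-6: one-hot 1/2/4 bytes */
#define IOIO_SIZE_SHIFT 4

int main(void)
{
	uint32_t io_info = 0x03f80041; /* hypothetical: IN, 4 bytes, port 0x3f8 */

	int in       = (io_info & IOIO_TYPE_MASK) != 0;
	int string   = (io_info & IOIO_STR_MASK) != 0;
	int rep      = (io_info & IOIO_REP_MASK) != 0;
	int size     = (io_info & IOIO_SIZE_MASK) >> IOIO_SIZE_SHIFT;
	unsigned port = io_info >> 16;  /* bits 16-31 carry the port number */

	printf("%s port 0x%x, %d byte(s), string=%d rep=%d\n",
	       in ? "IN" : "OUT", port, size, string, rep);
	return 0;
}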
1109 1022
1110static int nop_on_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 1023static int nop_on_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1111{ 1024{
1112 return 1; 1025 return 1;
1113} 1026}
1114 1027
1115static int halt_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 1028static int halt_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1116{ 1029{
1117 vcpu->svm->next_rip = vcpu->svm->vmcb->save.rip + 1; 1030 svm->next_rip = svm->vmcb->save.rip + 1;
1118 skip_emulated_instruction(vcpu); 1031 skip_emulated_instruction(&svm->vcpu);
1119 return kvm_emulate_halt(vcpu); 1032 return kvm_emulate_halt(&svm->vcpu);
1120} 1033}
1121 1034
1122static int vmmcall_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 1035static int vmmcall_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1123{ 1036{
1124 vcpu->svm->next_rip = vcpu->svm->vmcb->save.rip + 3; 1037 svm->next_rip = svm->vmcb->save.rip + 3;
1125 skip_emulated_instruction(vcpu); 1038 skip_emulated_instruction(&svm->vcpu);
1126 return kvm_hypercall(vcpu, kvm_run); 1039 return kvm_hypercall(&svm->vcpu, kvm_run);
1127} 1040}
1128 1041
1129static int invalid_op_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 1042static int invalid_op_interception(struct vcpu_svm *svm,
1043 struct kvm_run *kvm_run)
1130{ 1044{
1131 inject_ud(vcpu); 1045 inject_ud(&svm->vcpu);
1132 return 1; 1046 return 1;
1133} 1047}
1134 1048
1135static int task_switch_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 1049static int task_switch_interception(struct vcpu_svm *svm,
1050 struct kvm_run *kvm_run)
1136{ 1051{
1137 printk(KERN_DEBUG "%s: task swiche is unsupported\n", __FUNCTION__); 1052 pr_unimpl(&svm->vcpu, "%s: task switch is unsupported\n", __FUNCTION__);
1138 kvm_run->exit_reason = KVM_EXIT_UNKNOWN; 1053 kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
1139 return 0; 1054 return 0;
1140} 1055}
1141 1056
1142static int cpuid_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 1057static int cpuid_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1143{ 1058{
1144 vcpu->svm->next_rip = vcpu->svm->vmcb->save.rip + 2; 1059 svm->next_rip = svm->vmcb->save.rip + 2;
1145 kvm_emulate_cpuid(vcpu); 1060 kvm_emulate_cpuid(&svm->vcpu);
1146 return 1; 1061 return 1;
1147} 1062}
1148 1063
1149static int emulate_on_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 1064static int emulate_on_interception(struct vcpu_svm *svm,
1065 struct kvm_run *kvm_run)
1150{ 1066{
1151 if (emulate_instruction(vcpu, NULL, 0, 0) != EMULATE_DONE) 1067 if (emulate_instruction(&svm->vcpu, NULL, 0, 0) != EMULATE_DONE)
1152 printk(KERN_ERR "%s: failed\n", __FUNCTION__); 1068 pr_unimpl(&svm->vcpu, "%s: failed\n", __FUNCTION__);
1153 return 1; 1069 return 1;
1154} 1070}
1155 1071
1156static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data) 1072static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
1157{ 1073{
1074 struct vcpu_svm *svm = to_svm(vcpu);
1075
1158 switch (ecx) { 1076 switch (ecx) {
1159 case MSR_IA32_TIME_STAMP_COUNTER: { 1077 case MSR_IA32_TIME_STAMP_COUNTER: {
1160 u64 tsc; 1078 u64 tsc;
1161 1079
1162 rdtscll(tsc); 1080 rdtscll(tsc);
1163 *data = vcpu->svm->vmcb->control.tsc_offset + tsc; 1081 *data = svm->vmcb->control.tsc_offset + tsc;
1164 break; 1082 break;
1165 } 1083 }
1166 case MSR_K6_STAR: 1084 case MSR_K6_STAR:
1167 *data = vcpu->svm->vmcb->save.star; 1085 *data = svm->vmcb->save.star;
1168 break; 1086 break;
1169#ifdef CONFIG_X86_64 1087#ifdef CONFIG_X86_64
1170 case MSR_LSTAR: 1088 case MSR_LSTAR:
1171 *data = vcpu->svm->vmcb->save.lstar; 1089 *data = svm->vmcb->save.lstar;
1172 break; 1090 break;
1173 case MSR_CSTAR: 1091 case MSR_CSTAR:
1174 *data = vcpu->svm->vmcb->save.cstar; 1092 *data = svm->vmcb->save.cstar;
1175 break; 1093 break;
1176 case MSR_KERNEL_GS_BASE: 1094 case MSR_KERNEL_GS_BASE:
1177 *data = vcpu->svm->vmcb->save.kernel_gs_base; 1095 *data = svm->vmcb->save.kernel_gs_base;
1178 break; 1096 break;
1179 case MSR_SYSCALL_MASK: 1097 case MSR_SYSCALL_MASK:
1180 *data = vcpu->svm->vmcb->save.sfmask; 1098 *data = svm->vmcb->save.sfmask;
1181 break; 1099 break;
1182#endif 1100#endif
1183 case MSR_IA32_SYSENTER_CS: 1101 case MSR_IA32_SYSENTER_CS:
1184 *data = vcpu->svm->vmcb->save.sysenter_cs; 1102 *data = svm->vmcb->save.sysenter_cs;
1185 break; 1103 break;
1186 case MSR_IA32_SYSENTER_EIP: 1104 case MSR_IA32_SYSENTER_EIP:
1187 *data = vcpu->svm->vmcb->save.sysenter_eip; 1105 *data = svm->vmcb->save.sysenter_eip;
1188 break; 1106 break;
1189 case MSR_IA32_SYSENTER_ESP: 1107 case MSR_IA32_SYSENTER_ESP:
1190 *data = vcpu->svm->vmcb->save.sysenter_esp; 1108 *data = svm->vmcb->save.sysenter_esp;
1191 break; 1109 break;
1192 default: 1110 default:
1193 return kvm_get_msr_common(vcpu, ecx, data); 1111 return kvm_get_msr_common(vcpu, ecx, data);
@@ -1195,57 +1113,59 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
1195 return 0; 1113 return 0;
1196} 1114}
1197 1115
1198static int rdmsr_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 1116static int rdmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1199{ 1117{
1200 u32 ecx = vcpu->regs[VCPU_REGS_RCX]; 1118 u32 ecx = svm->vcpu.regs[VCPU_REGS_RCX];
1201 u64 data; 1119 u64 data;
1202 1120
1203 if (svm_get_msr(vcpu, ecx, &data)) 1121 if (svm_get_msr(&svm->vcpu, ecx, &data))
1204 svm_inject_gp(vcpu, 0); 1122 svm_inject_gp(&svm->vcpu, 0);
1205 else { 1123 else {
1206 vcpu->svm->vmcb->save.rax = data & 0xffffffff; 1124 svm->vmcb->save.rax = data & 0xffffffff;
1207 vcpu->regs[VCPU_REGS_RDX] = data >> 32; 1125 svm->vcpu.regs[VCPU_REGS_RDX] = data >> 32;
1208 vcpu->svm->next_rip = vcpu->svm->vmcb->save.rip + 2; 1126 svm->next_rip = svm->vmcb->save.rip + 2;
1209 skip_emulated_instruction(vcpu); 1127 skip_emulated_instruction(&svm->vcpu);
1210 } 1128 }
1211 return 1; 1129 return 1;
1212} 1130}
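rdmsr_interception() above hands the 64-bit MSR value back to the guest as two 32-bit halves, the low half in RAX and the high half in RDX, and wrmsr_interception() further down reassembles it the same way. A minimal sketch of that split and join:

/* Sketch of the EDX:EAX split used by the RDMSR/WRMSR handlers. */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t msr = 0x1234567890abcdefULL;

	/* RDMSR direction: split the value for the guest's EAX/EDX. */
	uint64_t rax = msr & 0xffffffffu;
	uint64_t rdx = msr >> 32;

	/* WRMSR direction: reassemble it from the two registers. */
	uint64_t joined = (rax & 0xffffffffu) | ((rdx & 0xffffffffu) << 32);

	printf("low=%#llx high=%#llx joined=%#llx\n",
	       (unsigned long long)rax, (unsigned long long)rdx,
	       (unsigned long long)joined);
	return joined != msr;
}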
1213 1131
1214static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data) 1132static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
1215{ 1133{
1134 struct vcpu_svm *svm = to_svm(vcpu);
1135
1216 switch (ecx) { 1136 switch (ecx) {
1217 case MSR_IA32_TIME_STAMP_COUNTER: { 1137 case MSR_IA32_TIME_STAMP_COUNTER: {
1218 u64 tsc; 1138 u64 tsc;
1219 1139
1220 rdtscll(tsc); 1140 rdtscll(tsc);
1221 vcpu->svm->vmcb->control.tsc_offset = data - tsc; 1141 svm->vmcb->control.tsc_offset = data - tsc;
1222 break; 1142 break;
1223 } 1143 }
1224 case MSR_K6_STAR: 1144 case MSR_K6_STAR:
1225 vcpu->svm->vmcb->save.star = data; 1145 svm->vmcb->save.star = data;
1226 break; 1146 break;
1227#ifdef CONFIG_X86_64 1147#ifdef CONFIG_X86_64
1228 case MSR_LSTAR: 1148 case MSR_LSTAR:
1229 vcpu->svm->vmcb->save.lstar = data; 1149 svm->vmcb->save.lstar = data;
1230 break; 1150 break;
1231 case MSR_CSTAR: 1151 case MSR_CSTAR:
1232 vcpu->svm->vmcb->save.cstar = data; 1152 svm->vmcb->save.cstar = data;
1233 break; 1153 break;
1234 case MSR_KERNEL_GS_BASE: 1154 case MSR_KERNEL_GS_BASE:
1235 vcpu->svm->vmcb->save.kernel_gs_base = data; 1155 svm->vmcb->save.kernel_gs_base = data;
1236 break; 1156 break;
1237 case MSR_SYSCALL_MASK: 1157 case MSR_SYSCALL_MASK:
1238 vcpu->svm->vmcb->save.sfmask = data; 1158 svm->vmcb->save.sfmask = data;
1239 break; 1159 break;
1240#endif 1160#endif
1241 case MSR_IA32_SYSENTER_CS: 1161 case MSR_IA32_SYSENTER_CS:
1242 vcpu->svm->vmcb->save.sysenter_cs = data; 1162 svm->vmcb->save.sysenter_cs = data;
1243 break; 1163 break;
1244 case MSR_IA32_SYSENTER_EIP: 1164 case MSR_IA32_SYSENTER_EIP:
1245 vcpu->svm->vmcb->save.sysenter_eip = data; 1165 svm->vmcb->save.sysenter_eip = data;
1246 break; 1166 break;
1247 case MSR_IA32_SYSENTER_ESP: 1167 case MSR_IA32_SYSENTER_ESP:
1248 vcpu->svm->vmcb->save.sysenter_esp = data; 1168 svm->vmcb->save.sysenter_esp = data;
1249 break; 1169 break;
1250 default: 1170 default:
1251 return kvm_set_msr_common(vcpu, ecx, data); 1171 return kvm_set_msr_common(vcpu, ecx, data);
@@ -1253,37 +1173,39 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
1253 return 0; 1173 return 0;
1254} 1174}
1255 1175
1256static int wrmsr_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 1176static int wrmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1257{ 1177{
1258 u32 ecx = vcpu->regs[VCPU_REGS_RCX]; 1178 u32 ecx = svm->vcpu.regs[VCPU_REGS_RCX];
1259 u64 data = (vcpu->svm->vmcb->save.rax & -1u) 1179 u64 data = (svm->vmcb->save.rax & -1u)
1260 | ((u64)(vcpu->regs[VCPU_REGS_RDX] & -1u) << 32); 1180 | ((u64)(svm->vcpu.regs[VCPU_REGS_RDX] & -1u) << 32);
1261 vcpu->svm->next_rip = vcpu->svm->vmcb->save.rip + 2; 1181 svm->next_rip = svm->vmcb->save.rip + 2;
1262 if (svm_set_msr(vcpu, ecx, data)) 1182 if (svm_set_msr(&svm->vcpu, ecx, data))
1263 svm_inject_gp(vcpu, 0); 1183 svm_inject_gp(&svm->vcpu, 0);
1264 else 1184 else
1265 skip_emulated_instruction(vcpu); 1185 skip_emulated_instruction(&svm->vcpu);
1266 return 1; 1186 return 1;
1267} 1187}
1268 1188
1269static int msr_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 1189static int msr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1270{ 1190{
1271 if (vcpu->svm->vmcb->control.exit_info_1) 1191 if (svm->vmcb->control.exit_info_1)
1272 return wrmsr_interception(vcpu, kvm_run); 1192 return wrmsr_interception(svm, kvm_run);
1273 else 1193 else
1274 return rdmsr_interception(vcpu, kvm_run); 1194 return rdmsr_interception(svm, kvm_run);
1275} 1195}
1276 1196
1277static int interrupt_window_interception(struct kvm_vcpu *vcpu, 1197static int interrupt_window_interception(struct vcpu_svm *svm,
1278 struct kvm_run *kvm_run) 1198 struct kvm_run *kvm_run)
1279{ 1199{
1200 svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_VINTR);
1201 svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
1280 /* 1202 /*
1281 * If the user space waits to inject interrupts, exit as soon as 1203 * If the user space waits to inject interrupts, exit as soon as
1282 * possible 1204 * possible
1283 */ 1205 */
1284 if (kvm_run->request_interrupt_window && 1206 if (kvm_run->request_interrupt_window &&
1285 !vcpu->irq_summary) { 1207 !svm->vcpu.irq_summary) {
1286 ++vcpu->stat.irq_window_exits; 1208 ++svm->vcpu.stat.irq_window_exits;
1287 kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; 1209 kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
1288 return 0; 1210 return 0;
1289 } 1211 }
@@ -1291,7 +1213,7 @@ static int interrupt_window_interception(struct kvm_vcpu *vcpu,
1291 return 1; 1213 return 1;
1292} 1214}
1293 1215
1294static int (*svm_exit_handlers[])(struct kvm_vcpu *vcpu, 1216static int (*svm_exit_handlers[])(struct vcpu_svm *svm,
1295 struct kvm_run *kvm_run) = { 1217 struct kvm_run *kvm_run) = {
1296 [SVM_EXIT_READ_CR0] = emulate_on_interception, 1218 [SVM_EXIT_READ_CR0] = emulate_on_interception,
1297 [SVM_EXIT_READ_CR3] = emulate_on_interception, 1219 [SVM_EXIT_READ_CR3] = emulate_on_interception,
@@ -1338,15 +1260,25 @@ static int (*svm_exit_handlers[])(struct kvm_vcpu *vcpu,
1338}; 1260};
1339 1261
1340 1262
1341static int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 1263static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
1342{ 1264{
1343 u32 exit_code = vcpu->svm->vmcb->control.exit_code; 1265 struct vcpu_svm *svm = to_svm(vcpu);
1266 u32 exit_code = svm->vmcb->control.exit_code;
1267
1268 kvm_reput_irq(svm);
1344 1269
1345 if (is_external_interrupt(vcpu->svm->vmcb->control.exit_int_info) && 1270 if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) {
1271 kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
1272 kvm_run->fail_entry.hardware_entry_failure_reason
1273 = svm->vmcb->control.exit_code;
1274 return 0;
1275 }
1276
1277 if (is_external_interrupt(svm->vmcb->control.exit_int_info) &&
1346 exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR) 1278 exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR)
1347 printk(KERN_ERR "%s: unexpected exit_ini_info 0x%x " 1279 printk(KERN_ERR "%s: unexpected exit_ini_info 0x%x "
1348 "exit_code 0x%x\n", 1280 "exit_code 0x%x\n",
1349 __FUNCTION__, vcpu->svm->vmcb->control.exit_int_info, 1281 __FUNCTION__, svm->vmcb->control.exit_int_info,
1350 exit_code); 1282 exit_code);
1351 1283
1352 if (exit_code >= ARRAY_SIZE(svm_exit_handlers) 1284 if (exit_code >= ARRAY_SIZE(svm_exit_handlers)
@@ -1356,7 +1288,7 @@ static int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1356 return 0; 1288 return 0;
1357 } 1289 }
1358 1290
1359 return svm_exit_handlers[exit_code](vcpu, kvm_run); 1291 return svm_exit_handlers[exit_code](svm, kvm_run);
1360} 1292}
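handle_exit() dispatches through svm_exit_handlers[], indexing the array with the hardware exit code after a bounds and NULL check. A self-contained sketch of the same table-driven pattern, with illustrative names and types rather than the kernel's:

/* Minimal sketch of the exit-code -> handler dispatch used above. */
#include <stdio.h>

#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))

enum { EXIT_IOIO = 0, EXIT_MSR, EXIT_HLT, NR_EXITS };

typedef int (*exit_handler_t)(void *ctx);

static int handle_io(void *ctx)  { (void)ctx; puts("I/O exit"); return 1; }
static int handle_msr(void *ctx) { (void)ctx; puts("MSR exit"); return 1; }
static int handle_hlt(void *ctx) { (void)ctx; puts("HLT exit"); return 1; }

static exit_handler_t handlers[NR_EXITS] = {
	[EXIT_IOIO] = handle_io,
	[EXIT_MSR]  = handle_msr,
	[EXIT_HLT]  = handle_hlt,
};

static int dispatch(unsigned exit_code, void *ctx)
{
	/* Unknown codes bail out, like the KVM_EXIT_UNKNOWN path above. */
	if (exit_code >= ARRAY_SIZE(handlers) || !handlers[exit_code]) {
		fprintf(stderr, "unknown exit 0x%x\n", exit_code);
		return 0;
	}
	return handlers[exit_code](ctx);
}

int main(void)
{
	return dispatch(EXIT_HLT, NULL) ? 0 : 1;
}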
1361 1293
1362static void reload_tss(struct kvm_vcpu *vcpu) 1294static void reload_tss(struct kvm_vcpu *vcpu)
@@ -1368,93 +1300,126 @@ static void reload_tss(struct kvm_vcpu *vcpu)
1368 load_TR_desc(); 1300 load_TR_desc();
1369} 1301}
1370 1302
1371static void pre_svm_run(struct kvm_vcpu *vcpu) 1303static void pre_svm_run(struct vcpu_svm *svm)
1372{ 1304{
1373 int cpu = raw_smp_processor_id(); 1305 int cpu = raw_smp_processor_id();
1374 1306
1375 struct svm_cpu_data *svm_data = per_cpu(svm_data, cpu); 1307 struct svm_cpu_data *svm_data = per_cpu(svm_data, cpu);
1376 1308
1377 vcpu->svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING; 1309 svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;
1378 if (vcpu->cpu != cpu || 1310 if (svm->vcpu.cpu != cpu ||
1379 vcpu->svm->asid_generation != svm_data->asid_generation) 1311 svm->asid_generation != svm_data->asid_generation)
1380 new_asid(vcpu, svm_data); 1312 new_asid(svm, svm_data);
1381} 1313}
1382 1314
1383 1315
1384static inline void kvm_do_inject_irq(struct kvm_vcpu *vcpu) 1316static inline void svm_inject_irq(struct vcpu_svm *svm, int irq)
1385{ 1317{
1386 struct vmcb_control_area *control; 1318 struct vmcb_control_area *control;
1387 1319
1388 control = &vcpu->svm->vmcb->control; 1320 control = &svm->vmcb->control;
1389 control->int_vector = pop_irq(vcpu); 1321 control->int_vector = irq;
1390 control->int_ctl &= ~V_INTR_PRIO_MASK; 1322 control->int_ctl &= ~V_INTR_PRIO_MASK;
1391 control->int_ctl |= V_IRQ_MASK | 1323 control->int_ctl |= V_IRQ_MASK |
1392 ((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT); 1324 ((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT);
1393} 1325}
1394 1326
1395static void kvm_reput_irq(struct kvm_vcpu *vcpu) 1327static void svm_set_irq(struct kvm_vcpu *vcpu, int irq)
1328{
1329 struct vcpu_svm *svm = to_svm(vcpu);
1330
1331 svm_inject_irq(svm, irq);
1332}
1333
1334static void svm_intr_assist(struct kvm_vcpu *vcpu)
1335{
1336 struct vcpu_svm *svm = to_svm(vcpu);
1337 struct vmcb *vmcb = svm->vmcb;
1338 int intr_vector = -1;
1339
1340 kvm_inject_pending_timer_irqs(vcpu);
1341 if ((vmcb->control.exit_int_info & SVM_EVTINJ_VALID) &&
1342 ((vmcb->control.exit_int_info & SVM_EVTINJ_TYPE_MASK) == 0)) {
1343 intr_vector = vmcb->control.exit_int_info &
1344 SVM_EVTINJ_VEC_MASK;
1345 vmcb->control.exit_int_info = 0;
1346 svm_inject_irq(svm, intr_vector);
1347 return;
1348 }
1349
1350 if (vmcb->control.int_ctl & V_IRQ_MASK)
1351 return;
1352
1353 if (!kvm_cpu_has_interrupt(vcpu))
1354 return;
1355
1356 if (!(vmcb->save.rflags & X86_EFLAGS_IF) ||
1357 (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) ||
1358 (vmcb->control.event_inj & SVM_EVTINJ_VALID)) {
1359 /* unable to deliver irq, set pending irq */
1360 vmcb->control.intercept |= (1ULL << INTERCEPT_VINTR);
1361 svm_inject_irq(svm, 0x0);
1362 return;
1363 }
1364 /* Okay, we can deliver the interrupt: grab it and update PIC state. */
1365 intr_vector = kvm_cpu_get_interrupt(vcpu);
1366 svm_inject_irq(svm, intr_vector);
1367 kvm_timer_intr_post(vcpu, intr_vector);
1368}
1369
1370static void kvm_reput_irq(struct vcpu_svm *svm)
1396{ 1371{
1397 struct vmcb_control_area *control = &vcpu->svm->vmcb->control; 1372 struct vmcb_control_area *control = &svm->vmcb->control;
1398 1373
1399 if (control->int_ctl & V_IRQ_MASK) { 1374 if ((control->int_ctl & V_IRQ_MASK)
1375 && !irqchip_in_kernel(svm->vcpu.kvm)) {
1400 control->int_ctl &= ~V_IRQ_MASK; 1376 control->int_ctl &= ~V_IRQ_MASK;
1401 push_irq(vcpu, control->int_vector); 1377 push_irq(&svm->vcpu, control->int_vector);
1402 } 1378 }
1403 1379
1404 vcpu->interrupt_window_open = 1380 svm->vcpu.interrupt_window_open =
1405 !(control->int_state & SVM_INTERRUPT_SHADOW_MASK); 1381 !(control->int_state & SVM_INTERRUPT_SHADOW_MASK);
1406} 1382}
1407 1383
1384static void svm_do_inject_vector(struct vcpu_svm *svm)
1385{
1386 struct kvm_vcpu *vcpu = &svm->vcpu;
1387 int word_index = __ffs(vcpu->irq_summary);
1388 int bit_index = __ffs(vcpu->irq_pending[word_index]);
1389 int irq = word_index * BITS_PER_LONG + bit_index;
1390
1391 clear_bit(bit_index, &vcpu->irq_pending[word_index]);
1392 if (!vcpu->irq_pending[word_index])
1393 clear_bit(word_index, &vcpu->irq_summary);
1394 svm_inject_irq(svm, irq);
1395}
1396
1408static void do_interrupt_requests(struct kvm_vcpu *vcpu, 1397static void do_interrupt_requests(struct kvm_vcpu *vcpu,
1409 struct kvm_run *kvm_run) 1398 struct kvm_run *kvm_run)
1410{ 1399{
1411 struct vmcb_control_area *control = &vcpu->svm->vmcb->control; 1400 struct vcpu_svm *svm = to_svm(vcpu);
1401 struct vmcb_control_area *control = &svm->vmcb->control;
1412 1402
1413 vcpu->interrupt_window_open = 1403 svm->vcpu.interrupt_window_open =
1414 (!(control->int_state & SVM_INTERRUPT_SHADOW_MASK) && 1404 (!(control->int_state & SVM_INTERRUPT_SHADOW_MASK) &&
1415 (vcpu->svm->vmcb->save.rflags & X86_EFLAGS_IF)); 1405 (svm->vmcb->save.rflags & X86_EFLAGS_IF));
1416 1406
1417 if (vcpu->interrupt_window_open && vcpu->irq_summary) 1407 if (svm->vcpu.interrupt_window_open && svm->vcpu.irq_summary)
1418 /* 1408 /*
1419 * If interrupts enabled, and not blocked by sti or mov ss. Good. 1409 * If interrupts enabled, and not blocked by sti or mov ss. Good.
1420 */ 1410 */
1421 kvm_do_inject_irq(vcpu); 1411 svm_do_inject_vector(svm);
1422 1412
1423 /* 1413 /*
1424 * Interrupts blocked. Wait for unblock. 1414 * Interrupts blocked. Wait for unblock.
1425 */ 1415 */
1426 if (!vcpu->interrupt_window_open && 1416 if (!svm->vcpu.interrupt_window_open &&
1427 (vcpu->irq_summary || kvm_run->request_interrupt_window)) { 1417 (svm->vcpu.irq_summary || kvm_run->request_interrupt_window)) {
1428 control->intercept |= 1ULL << INTERCEPT_VINTR; 1418 control->intercept |= 1ULL << INTERCEPT_VINTR;
1429 } else 1419 } else
1430 control->intercept &= ~(1ULL << INTERCEPT_VINTR); 1420 control->intercept &= ~(1ULL << INTERCEPT_VINTR);
1431} 1421}
1432 1422
1433static void post_kvm_run_save(struct kvm_vcpu *vcpu,
1434 struct kvm_run *kvm_run)
1435{
1436 kvm_run->ready_for_interrupt_injection = (vcpu->interrupt_window_open &&
1437 vcpu->irq_summary == 0);
1438 kvm_run->if_flag = (vcpu->svm->vmcb->save.rflags & X86_EFLAGS_IF) != 0;
1439 kvm_run->cr8 = vcpu->cr8;
1440 kvm_run->apic_base = vcpu->apic_base;
1441}
1442
1443/*
1444 * Check if userspace requested an interrupt window, and that the
1445 * interrupt window is open.
1446 *
1447 * No need to exit to userspace if we already have an interrupt queued.
1448 */
1449static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu,
1450 struct kvm_run *kvm_run)
1451{
1452 return (!vcpu->irq_summary &&
1453 kvm_run->request_interrupt_window &&
1454 vcpu->interrupt_window_open &&
1455 (vcpu->svm->vmcb->save.rflags & X86_EFLAGS_IF));
1456}
1457
1458static void save_db_regs(unsigned long *db_regs) 1423static void save_db_regs(unsigned long *db_regs)
1459{ 1424{
1460 asm volatile ("mov %%dr0, %0" : "=r"(db_regs[0])); 1425 asm volatile ("mov %%dr0, %0" : "=r"(db_regs[0]));
@@ -1476,49 +1441,37 @@ static void svm_flush_tlb(struct kvm_vcpu *vcpu)
1476 force_new_asid(vcpu); 1441 force_new_asid(vcpu);
1477} 1442}
1478 1443
1479static int svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 1444static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu)
1445{
1446}
1447
1448static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1480{ 1449{
1450 struct vcpu_svm *svm = to_svm(vcpu);
1481 u16 fs_selector; 1451 u16 fs_selector;
1482 u16 gs_selector; 1452 u16 gs_selector;
1483 u16 ldt_selector; 1453 u16 ldt_selector;
1484 int r;
1485
1486again:
1487 r = kvm_mmu_reload(vcpu);
1488 if (unlikely(r))
1489 return r;
1490
1491 if (!vcpu->mmio_read_completed)
1492 do_interrupt_requests(vcpu, kvm_run);
1493 1454
1494 clgi(); 1455 pre_svm_run(svm);
1495
1496 vcpu->guest_mode = 1;
1497 if (vcpu->requests)
1498 if (test_and_clear_bit(KVM_TLB_FLUSH, &vcpu->requests))
1499 svm_flush_tlb(vcpu);
1500
1501 pre_svm_run(vcpu);
1502 1456
1503 save_host_msrs(vcpu); 1457 save_host_msrs(vcpu);
1504 fs_selector = read_fs(); 1458 fs_selector = read_fs();
1505 gs_selector = read_gs(); 1459 gs_selector = read_gs();
1506 ldt_selector = read_ldt(); 1460 ldt_selector = read_ldt();
1507 vcpu->svm->host_cr2 = kvm_read_cr2(); 1461 svm->host_cr2 = kvm_read_cr2();
1508 vcpu->svm->host_dr6 = read_dr6(); 1462 svm->host_dr6 = read_dr6();
1509 vcpu->svm->host_dr7 = read_dr7(); 1463 svm->host_dr7 = read_dr7();
1510 vcpu->svm->vmcb->save.cr2 = vcpu->cr2; 1464 svm->vmcb->save.cr2 = vcpu->cr2;
1511 1465
1512 if (vcpu->svm->vmcb->save.dr7 & 0xff) { 1466 if (svm->vmcb->save.dr7 & 0xff) {
1513 write_dr7(0); 1467 write_dr7(0);
1514 save_db_regs(vcpu->svm->host_db_regs); 1468 save_db_regs(svm->host_db_regs);
1515 load_db_regs(vcpu->svm->db_regs); 1469 load_db_regs(svm->db_regs);
1516 } 1470 }
1517 1471
1518 if (vcpu->fpu_active) { 1472 clgi();
1519 fx_save(vcpu->host_fx_image); 1473
1520 fx_restore(vcpu->guest_fx_image); 1474 local_irq_enable();
1521 }
1522 1475
1523 asm volatile ( 1476 asm volatile (
1524#ifdef CONFIG_X86_64 1477#ifdef CONFIG_X86_64
@@ -1532,34 +1485,33 @@ again:
1532#endif 1485#endif
1533 1486
1534#ifdef CONFIG_X86_64 1487#ifdef CONFIG_X86_64
1535 "mov %c[rbx](%[vcpu]), %%rbx \n\t" 1488 "mov %c[rbx](%[svm]), %%rbx \n\t"
1536 "mov %c[rcx](%[vcpu]), %%rcx \n\t" 1489 "mov %c[rcx](%[svm]), %%rcx \n\t"
1537 "mov %c[rdx](%[vcpu]), %%rdx \n\t" 1490 "mov %c[rdx](%[svm]), %%rdx \n\t"
1538 "mov %c[rsi](%[vcpu]), %%rsi \n\t" 1491 "mov %c[rsi](%[svm]), %%rsi \n\t"
1539 "mov %c[rdi](%[vcpu]), %%rdi \n\t" 1492 "mov %c[rdi](%[svm]), %%rdi \n\t"
1540 "mov %c[rbp](%[vcpu]), %%rbp \n\t" 1493 "mov %c[rbp](%[svm]), %%rbp \n\t"
1541 "mov %c[r8](%[vcpu]), %%r8 \n\t" 1494 "mov %c[r8](%[svm]), %%r8 \n\t"
1542 "mov %c[r9](%[vcpu]), %%r9 \n\t" 1495 "mov %c[r9](%[svm]), %%r9 \n\t"
1543 "mov %c[r10](%[vcpu]), %%r10 \n\t" 1496 "mov %c[r10](%[svm]), %%r10 \n\t"
1544 "mov %c[r11](%[vcpu]), %%r11 \n\t" 1497 "mov %c[r11](%[svm]), %%r11 \n\t"
1545 "mov %c[r12](%[vcpu]), %%r12 \n\t" 1498 "mov %c[r12](%[svm]), %%r12 \n\t"
1546 "mov %c[r13](%[vcpu]), %%r13 \n\t" 1499 "mov %c[r13](%[svm]), %%r13 \n\t"
1547 "mov %c[r14](%[vcpu]), %%r14 \n\t" 1500 "mov %c[r14](%[svm]), %%r14 \n\t"
1548 "mov %c[r15](%[vcpu]), %%r15 \n\t" 1501 "mov %c[r15](%[svm]), %%r15 \n\t"
1549#else 1502#else
1550 "mov %c[rbx](%[vcpu]), %%ebx \n\t" 1503 "mov %c[rbx](%[svm]), %%ebx \n\t"
1551 "mov %c[rcx](%[vcpu]), %%ecx \n\t" 1504 "mov %c[rcx](%[svm]), %%ecx \n\t"
1552 "mov %c[rdx](%[vcpu]), %%edx \n\t" 1505 "mov %c[rdx](%[svm]), %%edx \n\t"
1553 "mov %c[rsi](%[vcpu]), %%esi \n\t" 1506 "mov %c[rsi](%[svm]), %%esi \n\t"
1554 "mov %c[rdi](%[vcpu]), %%edi \n\t" 1507 "mov %c[rdi](%[svm]), %%edi \n\t"
1555 "mov %c[rbp](%[vcpu]), %%ebp \n\t" 1508 "mov %c[rbp](%[svm]), %%ebp \n\t"
1556#endif 1509#endif
1557 1510
1558#ifdef CONFIG_X86_64 1511#ifdef CONFIG_X86_64
1559 /* Enter guest mode */ 1512 /* Enter guest mode */
1560 "push %%rax \n\t" 1513 "push %%rax \n\t"
1561 "mov %c[svm](%[vcpu]), %%rax \n\t" 1514 "mov %c[vmcb](%[svm]), %%rax \n\t"
1562 "mov %c[vmcb](%%rax), %%rax \n\t"
1563 SVM_VMLOAD "\n\t" 1515 SVM_VMLOAD "\n\t"
1564 SVM_VMRUN "\n\t" 1516 SVM_VMRUN "\n\t"
1565 SVM_VMSAVE "\n\t" 1517 SVM_VMSAVE "\n\t"
@@ -1567,8 +1519,7 @@ again:
1567#else 1519#else
1568 /* Enter guest mode */ 1520 /* Enter guest mode */
1569 "push %%eax \n\t" 1521 "push %%eax \n\t"
1570 "mov %c[svm](%[vcpu]), %%eax \n\t" 1522 "mov %c[vmcb](%[svm]), %%eax \n\t"
1571 "mov %c[vmcb](%%eax), %%eax \n\t"
1572 SVM_VMLOAD "\n\t" 1523 SVM_VMLOAD "\n\t"
1573 SVM_VMRUN "\n\t" 1524 SVM_VMRUN "\n\t"
1574 SVM_VMSAVE "\n\t" 1525 SVM_VMSAVE "\n\t"
@@ -1577,73 +1528,69 @@ again:
1577 1528
1578 /* Save guest registers, load host registers */ 1529 /* Save guest registers, load host registers */
1579#ifdef CONFIG_X86_64 1530#ifdef CONFIG_X86_64
1580 "mov %%rbx, %c[rbx](%[vcpu]) \n\t" 1531 "mov %%rbx, %c[rbx](%[svm]) \n\t"
1581 "mov %%rcx, %c[rcx](%[vcpu]) \n\t" 1532 "mov %%rcx, %c[rcx](%[svm]) \n\t"
1582 "mov %%rdx, %c[rdx](%[vcpu]) \n\t" 1533 "mov %%rdx, %c[rdx](%[svm]) \n\t"
1583 "mov %%rsi, %c[rsi](%[vcpu]) \n\t" 1534 "mov %%rsi, %c[rsi](%[svm]) \n\t"
1584 "mov %%rdi, %c[rdi](%[vcpu]) \n\t" 1535 "mov %%rdi, %c[rdi](%[svm]) \n\t"
1585 "mov %%rbp, %c[rbp](%[vcpu]) \n\t" 1536 "mov %%rbp, %c[rbp](%[svm]) \n\t"
1586 "mov %%r8, %c[r8](%[vcpu]) \n\t" 1537 "mov %%r8, %c[r8](%[svm]) \n\t"
1587 "mov %%r9, %c[r9](%[vcpu]) \n\t" 1538 "mov %%r9, %c[r9](%[svm]) \n\t"
1588 "mov %%r10, %c[r10](%[vcpu]) \n\t" 1539 "mov %%r10, %c[r10](%[svm]) \n\t"
1589 "mov %%r11, %c[r11](%[vcpu]) \n\t" 1540 "mov %%r11, %c[r11](%[svm]) \n\t"
1590 "mov %%r12, %c[r12](%[vcpu]) \n\t" 1541 "mov %%r12, %c[r12](%[svm]) \n\t"
1591 "mov %%r13, %c[r13](%[vcpu]) \n\t" 1542 "mov %%r13, %c[r13](%[svm]) \n\t"
1592 "mov %%r14, %c[r14](%[vcpu]) \n\t" 1543 "mov %%r14, %c[r14](%[svm]) \n\t"
1593 "mov %%r15, %c[r15](%[vcpu]) \n\t" 1544 "mov %%r15, %c[r15](%[svm]) \n\t"
1594 1545
1595 "pop %%r15; pop %%r14; pop %%r13; pop %%r12;" 1546 "pop %%r15; pop %%r14; pop %%r13; pop %%r12;"
1596 "pop %%r11; pop %%r10; pop %%r9; pop %%r8;" 1547 "pop %%r11; pop %%r10; pop %%r9; pop %%r8;"
1597 "pop %%rbp; pop %%rdi; pop %%rsi;" 1548 "pop %%rbp; pop %%rdi; pop %%rsi;"
1598 "pop %%rdx; pop %%rcx; pop %%rbx; \n\t" 1549 "pop %%rdx; pop %%rcx; pop %%rbx; \n\t"
1599#else 1550#else
1600 "mov %%ebx, %c[rbx](%[vcpu]) \n\t" 1551 "mov %%ebx, %c[rbx](%[svm]) \n\t"
1601 "mov %%ecx, %c[rcx](%[vcpu]) \n\t" 1552 "mov %%ecx, %c[rcx](%[svm]) \n\t"
1602 "mov %%edx, %c[rdx](%[vcpu]) \n\t" 1553 "mov %%edx, %c[rdx](%[svm]) \n\t"
1603 "mov %%esi, %c[rsi](%[vcpu]) \n\t" 1554 "mov %%esi, %c[rsi](%[svm]) \n\t"
1604 "mov %%edi, %c[rdi](%[vcpu]) \n\t" 1555 "mov %%edi, %c[rdi](%[svm]) \n\t"
1605 "mov %%ebp, %c[rbp](%[vcpu]) \n\t" 1556 "mov %%ebp, %c[rbp](%[svm]) \n\t"
1606 1557
1607 "pop %%ebp; pop %%edi; pop %%esi;" 1558 "pop %%ebp; pop %%edi; pop %%esi;"
1608 "pop %%edx; pop %%ecx; pop %%ebx; \n\t" 1559 "pop %%edx; pop %%ecx; pop %%ebx; \n\t"
1609#endif 1560#endif
1610 : 1561 :
1611 : [vcpu]"a"(vcpu), 1562 : [svm]"a"(svm),
1612 [svm]"i"(offsetof(struct kvm_vcpu, svm)),
1613 [vmcb]"i"(offsetof(struct vcpu_svm, vmcb_pa)), 1563 [vmcb]"i"(offsetof(struct vcpu_svm, vmcb_pa)),
1614 [rbx]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RBX])), 1564 [rbx]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_RBX])),
1615 [rcx]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RCX])), 1565 [rcx]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_RCX])),
1616 [rdx]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RDX])), 1566 [rdx]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_RDX])),
1617 [rsi]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RSI])), 1567 [rsi]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_RSI])),
1618 [rdi]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RDI])), 1568 [rdi]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_RDI])),
1619 [rbp]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RBP])) 1569 [rbp]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_RBP]))
1620#ifdef CONFIG_X86_64 1570#ifdef CONFIG_X86_64
1621 ,[r8 ]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R8 ])), 1571 ,[r8 ]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R8])),
1622 [r9 ]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R9 ])), 1572 [r9 ]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R9 ])),
1623 [r10]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R10])), 1573 [r10]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R10])),
1624 [r11]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R11])), 1574 [r11]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R11])),
1625 [r12]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R12])), 1575 [r12]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R12])),
1626 [r13]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R13])), 1576 [r13]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R13])),
1627 [r14]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R14])), 1577 [r14]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R14])),
1628 [r15]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R15])) 1578 [r15]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R15]))
1629#endif 1579#endif
1630 : "cc", "memory" ); 1580 : "cc", "memory" );
1631 1581
1632 vcpu->guest_mode = 0; 1582 local_irq_disable();
1633 1583
1634 if (vcpu->fpu_active) { 1584 stgi();
1635 fx_save(vcpu->guest_fx_image);
1636 fx_restore(vcpu->host_fx_image);
1637 }
1638 1585
1639 if ((vcpu->svm->vmcb->save.dr7 & 0xff)) 1586 if ((svm->vmcb->save.dr7 & 0xff))
1640 load_db_regs(vcpu->svm->host_db_regs); 1587 load_db_regs(svm->host_db_regs);
1641 1588
1642 vcpu->cr2 = vcpu->svm->vmcb->save.cr2; 1589 vcpu->cr2 = svm->vmcb->save.cr2;
1643 1590
1644 write_dr6(vcpu->svm->host_dr6); 1591 write_dr6(svm->host_dr6);
1645 write_dr7(vcpu->svm->host_dr7); 1592 write_dr7(svm->host_dr7);
1646 kvm_write_cr2(vcpu->svm->host_cr2); 1593 kvm_write_cr2(svm->host_cr2);
1647 1594
1648 load_fs(fs_selector); 1595 load_fs(fs_selector);
1649 load_gs(gs_selector); 1596 load_gs(gs_selector);
@@ -1652,57 +1599,19 @@ again:
1652 1599
1653 reload_tss(vcpu); 1600 reload_tss(vcpu);
1654 1601
1655 /* 1602 svm->next_rip = 0;
1656 * Profile KVM exit RIPs:
1657 */
1658 if (unlikely(prof_on == KVM_PROFILING))
1659 profile_hit(KVM_PROFILING,
1660 (void *)(unsigned long)vcpu->svm->vmcb->save.rip);
1661
1662 stgi();
1663
1664 kvm_reput_irq(vcpu);
1665
1666 vcpu->svm->next_rip = 0;
1667
1668 if (vcpu->svm->vmcb->control.exit_code == SVM_EXIT_ERR) {
1669 kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
1670 kvm_run->fail_entry.hardware_entry_failure_reason
1671 = vcpu->svm->vmcb->control.exit_code;
1672 post_kvm_run_save(vcpu, kvm_run);
1673 return 0;
1674 }
1675
1676 r = handle_exit(vcpu, kvm_run);
1677 if (r > 0) {
1678 if (signal_pending(current)) {
1679 ++vcpu->stat.signal_exits;
1680 post_kvm_run_save(vcpu, kvm_run);
1681 kvm_run->exit_reason = KVM_EXIT_INTR;
1682 return -EINTR;
1683 }
1684
1685 if (dm_request_for_irq_injection(vcpu, kvm_run)) {
1686 ++vcpu->stat.request_irq_exits;
1687 post_kvm_run_save(vcpu, kvm_run);
1688 kvm_run->exit_reason = KVM_EXIT_INTR;
1689 return -EINTR;
1690 }
1691 kvm_resched(vcpu);
1692 goto again;
1693 }
1694 post_kvm_run_save(vcpu, kvm_run);
1695 return r;
1696} 1603}
1697 1604
1698static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root) 1605static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root)
1699{ 1606{
1700 vcpu->svm->vmcb->save.cr3 = root; 1607 struct vcpu_svm *svm = to_svm(vcpu);
1608
1609 svm->vmcb->save.cr3 = root;
1701 force_new_asid(vcpu); 1610 force_new_asid(vcpu);
1702 1611
1703 if (vcpu->fpu_active) { 1612 if (vcpu->fpu_active) {
1704 vcpu->svm->vmcb->control.intercept_exceptions |= (1 << NM_VECTOR); 1613 svm->vmcb->control.intercept_exceptions |= (1 << NM_VECTOR);
1705 vcpu->svm->vmcb->save.cr0 |= CR0_TS_MASK; 1614 svm->vmcb->save.cr0 |= X86_CR0_TS;
1706 vcpu->fpu_active = 0; 1615 vcpu->fpu_active = 0;
1707 } 1616 }
1708} 1617}
@@ -1711,26 +1620,27 @@ static void svm_inject_page_fault(struct kvm_vcpu *vcpu,
1711 unsigned long addr, 1620 unsigned long addr,
1712 uint32_t err_code) 1621 uint32_t err_code)
1713{ 1622{
1714 uint32_t exit_int_info = vcpu->svm->vmcb->control.exit_int_info; 1623 struct vcpu_svm *svm = to_svm(vcpu);
1624 uint32_t exit_int_info = svm->vmcb->control.exit_int_info;
1715 1625
1716 ++vcpu->stat.pf_guest; 1626 ++vcpu->stat.pf_guest;
1717 1627
1718 if (is_page_fault(exit_int_info)) { 1628 if (is_page_fault(exit_int_info)) {
1719 1629
1720 vcpu->svm->vmcb->control.event_inj_err = 0; 1630 svm->vmcb->control.event_inj_err = 0;
1721 vcpu->svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | 1631 svm->vmcb->control.event_inj = SVM_EVTINJ_VALID |
1722 SVM_EVTINJ_VALID_ERR | 1632 SVM_EVTINJ_VALID_ERR |
1723 SVM_EVTINJ_TYPE_EXEPT | 1633 SVM_EVTINJ_TYPE_EXEPT |
1724 DF_VECTOR; 1634 DF_VECTOR;
1725 return; 1635 return;
1726 } 1636 }
1727 vcpu->cr2 = addr; 1637 vcpu->cr2 = addr;
1728 vcpu->svm->vmcb->save.cr2 = addr; 1638 svm->vmcb->save.cr2 = addr;
1729 vcpu->svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | 1639 svm->vmcb->control.event_inj = SVM_EVTINJ_VALID |
1730 SVM_EVTINJ_VALID_ERR | 1640 SVM_EVTINJ_VALID_ERR |
1731 SVM_EVTINJ_TYPE_EXEPT | 1641 SVM_EVTINJ_TYPE_EXEPT |
1732 PF_VECTOR; 1642 PF_VECTOR;
1733 vcpu->svm->vmcb->control.event_inj_err = err_code; 1643 svm->vmcb->control.event_inj_err = err_code;
1734} 1644}
1735 1645
1736 1646
@@ -1757,17 +1667,25 @@ svm_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
1757 hypercall[3] = 0xc3; 1667 hypercall[3] = 0xc3;
1758} 1668}
1759 1669
1760static struct kvm_arch_ops svm_arch_ops = { 1670static void svm_check_processor_compat(void *rtn)
1671{
1672 *(int *)rtn = 0;
1673}
1674
1675static struct kvm_x86_ops svm_x86_ops = {
1761 .cpu_has_kvm_support = has_svm, 1676 .cpu_has_kvm_support = has_svm,
1762 .disabled_by_bios = is_disabled, 1677 .disabled_by_bios = is_disabled,
1763 .hardware_setup = svm_hardware_setup, 1678 .hardware_setup = svm_hardware_setup,
1764 .hardware_unsetup = svm_hardware_unsetup, 1679 .hardware_unsetup = svm_hardware_unsetup,
1680 .check_processor_compatibility = svm_check_processor_compat,
1765 .hardware_enable = svm_hardware_enable, 1681 .hardware_enable = svm_hardware_enable,
1766 .hardware_disable = svm_hardware_disable, 1682 .hardware_disable = svm_hardware_disable,
1767 1683
1768 .vcpu_create = svm_create_vcpu, 1684 .vcpu_create = svm_create_vcpu,
1769 .vcpu_free = svm_free_vcpu, 1685 .vcpu_free = svm_free_vcpu,
1686 .vcpu_reset = svm_vcpu_reset,
1770 1687
1688 .prepare_guest_switch = svm_prepare_guest_switch,
1771 .vcpu_load = svm_vcpu_load, 1689 .vcpu_load = svm_vcpu_load,
1772 .vcpu_put = svm_vcpu_put, 1690 .vcpu_put = svm_vcpu_put,
1773 .vcpu_decache = svm_vcpu_decache, 1691 .vcpu_decache = svm_vcpu_decache,
@@ -1778,7 +1696,7 @@ static struct kvm_arch_ops svm_arch_ops = {
1778 .get_segment_base = svm_get_segment_base, 1696 .get_segment_base = svm_get_segment_base,
1779 .get_segment = svm_get_segment, 1697 .get_segment = svm_get_segment,
1780 .set_segment = svm_set_segment, 1698 .set_segment = svm_set_segment,
1781 .get_cs_db_l_bits = svm_get_cs_db_l_bits, 1699 .get_cs_db_l_bits = kvm_get_cs_db_l_bits,
1782 .decache_cr4_guest_bits = svm_decache_cr4_guest_bits, 1700 .decache_cr4_guest_bits = svm_decache_cr4_guest_bits,
1783 .set_cr0 = svm_set_cr0, 1701 .set_cr0 = svm_set_cr0,
1784 .set_cr3 = svm_set_cr3, 1702 .set_cr3 = svm_set_cr3,
@@ -1795,26 +1713,30 @@ static struct kvm_arch_ops svm_arch_ops = {
1795 .get_rflags = svm_get_rflags, 1713 .get_rflags = svm_get_rflags,
1796 .set_rflags = svm_set_rflags, 1714 .set_rflags = svm_set_rflags,
1797 1715
1798 .invlpg = svm_invlpg,
1799 .tlb_flush = svm_flush_tlb, 1716 .tlb_flush = svm_flush_tlb,
1800 .inject_page_fault = svm_inject_page_fault, 1717 .inject_page_fault = svm_inject_page_fault,
1801 1718
1802 .inject_gp = svm_inject_gp, 1719 .inject_gp = svm_inject_gp,
1803 1720
1804 .run = svm_vcpu_run, 1721 .run = svm_vcpu_run,
1722 .handle_exit = handle_exit,
1805 .skip_emulated_instruction = skip_emulated_instruction, 1723 .skip_emulated_instruction = skip_emulated_instruction,
1806 .vcpu_setup = svm_vcpu_setup,
1807 .patch_hypercall = svm_patch_hypercall, 1724 .patch_hypercall = svm_patch_hypercall,
1725 .get_irq = svm_get_irq,
1726 .set_irq = svm_set_irq,
1727 .inject_pending_irq = svm_intr_assist,
1728 .inject_pending_vectors = do_interrupt_requests,
1808}; 1729};
1809 1730
1810static int __init svm_init(void) 1731static int __init svm_init(void)
1811{ 1732{
1812 return kvm_init_arch(&svm_arch_ops, THIS_MODULE); 1733 return kvm_init_x86(&svm_x86_ops, sizeof(struct vcpu_svm),
1734 THIS_MODULE);
1813} 1735}
1814 1736
1815static void __exit svm_exit(void) 1737static void __exit svm_exit(void)
1816{ 1738{
1817 kvm_exit_arch(); 1739 kvm_exit_x86();
1818} 1740}
1819 1741
1820module_init(svm_init) 1742module_init(svm_init)
diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c
index 80628f69916d..4f115a8e45ef 100644
--- a/drivers/kvm/vmx.c
+++ b/drivers/kvm/vmx.c
@@ -16,6 +16,8 @@
16 */ 16 */
17 17
18#include "kvm.h" 18#include "kvm.h"
19#include "x86_emulate.h"
20#include "irq.h"
19#include "vmx.h" 21#include "vmx.h"
20#include "segment_descriptor.h" 22#include "segment_descriptor.h"
21 23
@@ -23,7 +25,6 @@
23#include <linux/kernel.h> 25#include <linux/kernel.h>
24#include <linux/mm.h> 26#include <linux/mm.h>
25#include <linux/highmem.h> 27#include <linux/highmem.h>
26#include <linux/profile.h>
27#include <linux/sched.h> 28#include <linux/sched.h>
28 29
29#include <asm/io.h> 30#include <asm/io.h>
@@ -32,6 +33,39 @@
32MODULE_AUTHOR("Qumranet"); 33MODULE_AUTHOR("Qumranet");
33MODULE_LICENSE("GPL"); 34MODULE_LICENSE("GPL");
34 35
36struct vmcs {
37 u32 revision_id;
38 u32 abort;
39 char data[0];
40};
41
42struct vcpu_vmx {
43 struct kvm_vcpu vcpu;
44 int launched;
45 u8 fail;
46 struct kvm_msr_entry *guest_msrs;
47 struct kvm_msr_entry *host_msrs;
48 int nmsrs;
49 int save_nmsrs;
50 int msr_offset_efer;
51#ifdef CONFIG_X86_64
52 int msr_offset_kernel_gs_base;
53#endif
54 struct vmcs *vmcs;
55 struct {
56 int loaded;
57 u16 fs_sel, gs_sel, ldt_sel;
58 int gs_ldt_reload_needed;
59 int fs_reload_needed;
60 }host_state;
61
62};
63
64static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
65{
66 return container_of(vcpu, struct vcpu_vmx, vcpu);
67}
68
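to_vmx() relies on the usual container_of() idiom: struct kvm_vcpu is embedded as a member of struct vcpu_vmx, so the wrapper is recovered by subtracting the member's offset. A simplified user-space rendition of the idea (not the kernel macro itself):

/* Sketch of the container_of() idiom behind to_vmx()/to_svm(). */
#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct vcpu {                 /* stands in for struct kvm_vcpu  */
	int id;
};

struct vcpu_vendor {          /* stands in for struct vcpu_vmx  */
	struct vcpu vcpu;     /* embedded generic part          */
	int launched;
};

static struct vcpu_vendor *to_vendor(struct vcpu *v)
{
	return container_of(v, struct vcpu_vendor, vcpu);
}

int main(void)
{
	struct vcpu_vendor vx = { .vcpu = { .id = 3 }, .launched = 1 };
	struct vcpu *generic = &vx.vcpu;

	/* Recover the wrapper from a pointer to the embedded member. */
	printf("launched=%d id=%d\n",
	       to_vendor(generic)->launched, to_vendor(generic)->vcpu.id);
	return 0;
}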
35static int init_rmode_tss(struct kvm *kvm); 69static int init_rmode_tss(struct kvm *kvm);
36 70
37static DEFINE_PER_CPU(struct vmcs *, vmxarea); 71static DEFINE_PER_CPU(struct vmcs *, vmxarea);
@@ -40,18 +74,17 @@ static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
40static struct page *vmx_io_bitmap_a; 74static struct page *vmx_io_bitmap_a;
41static struct page *vmx_io_bitmap_b; 75static struct page *vmx_io_bitmap_b;
42 76
43#ifdef CONFIG_X86_64
44#define HOST_IS_64 1
45#else
46#define HOST_IS_64 0
47#endif
48#define EFER_SAVE_RESTORE_BITS ((u64)EFER_SCE) 77#define EFER_SAVE_RESTORE_BITS ((u64)EFER_SCE)
49 78
50static struct vmcs_descriptor { 79static struct vmcs_config {
51 int size; 80 int size;
52 int order; 81 int order;
53 u32 revision_id; 82 u32 revision_id;
54} vmcs_descriptor; 83 u32 pin_based_exec_ctrl;
84 u32 cpu_based_exec_ctrl;
85 u32 vmexit_ctrl;
86 u32 vmentry_ctrl;
87} vmcs_config;
55 88
56#define VMX_SEGMENT_FIELD(seg) \ 89#define VMX_SEGMENT_FIELD(seg) \
57 [VCPU_SREG_##seg] = { \ 90 [VCPU_SREG_##seg] = { \
@@ -89,16 +122,32 @@ static const u32 vmx_msr_index[] = {
89}; 122};
90#define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index) 123#define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index)
91 124
92static inline u64 msr_efer_save_restore_bits(struct vmx_msr_entry msr) 125static void load_msrs(struct kvm_msr_entry *e, int n)
126{
127 int i;
128
129 for (i = 0; i < n; ++i)
130 wrmsrl(e[i].index, e[i].data);
131}
132
133static void save_msrs(struct kvm_msr_entry *e, int n)
134{
135 int i;
136
137 for (i = 0; i < n; ++i)
138 rdmsrl(e[i].index, e[i].data);
139}
140
141static inline u64 msr_efer_save_restore_bits(struct kvm_msr_entry msr)
93{ 142{
94 return (u64)msr.data & EFER_SAVE_RESTORE_BITS; 143 return (u64)msr.data & EFER_SAVE_RESTORE_BITS;
95} 144}
96 145
97static inline int msr_efer_need_save_restore(struct kvm_vcpu *vcpu) 146static inline int msr_efer_need_save_restore(struct vcpu_vmx *vmx)
98{ 147{
99 int efer_offset = vcpu->msr_offset_efer; 148 int efer_offset = vmx->msr_offset_efer;
100 return msr_efer_save_restore_bits(vcpu->host_msrs[efer_offset]) != 149 return msr_efer_save_restore_bits(vmx->host_msrs[efer_offset]) !=
101 msr_efer_save_restore_bits(vcpu->guest_msrs[efer_offset]); 150 msr_efer_save_restore_bits(vmx->guest_msrs[efer_offset]);
102} 151}
103 152
104static inline int is_page_fault(u32 intr_info) 153static inline int is_page_fault(u32 intr_info)
@@ -121,23 +170,33 @@ static inline int is_external_interrupt(u32 intr_info)
121 == (INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK); 170 == (INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK);
122} 171}
123 172
124static int __find_msr_index(struct kvm_vcpu *vcpu, u32 msr) 173static inline int cpu_has_vmx_tpr_shadow(void)
174{
175 return (vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW);
176}
177
178static inline int vm_need_tpr_shadow(struct kvm *kvm)
179{
180 return ((cpu_has_vmx_tpr_shadow()) && (irqchip_in_kernel(kvm)));
181}
182
183static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
125{ 184{
126 int i; 185 int i;
127 186
128 for (i = 0; i < vcpu->nmsrs; ++i) 187 for (i = 0; i < vmx->nmsrs; ++i)
129 if (vcpu->guest_msrs[i].index == msr) 188 if (vmx->guest_msrs[i].index == msr)
130 return i; 189 return i;
131 return -1; 190 return -1;
132} 191}
133 192
134static struct vmx_msr_entry *find_msr_entry(struct kvm_vcpu *vcpu, u32 msr) 193static struct kvm_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr)
135{ 194{
136 int i; 195 int i;
137 196
138 i = __find_msr_index(vcpu, msr); 197 i = __find_msr_index(vmx, msr);
139 if (i >= 0) 198 if (i >= 0)
140 return &vcpu->guest_msrs[i]; 199 return &vmx->guest_msrs[i];
141 return NULL; 200 return NULL;
142} 201}
143 202
@@ -156,23 +215,24 @@ static void vmcs_clear(struct vmcs *vmcs)
156 215
157static void __vcpu_clear(void *arg) 216static void __vcpu_clear(void *arg)
158{ 217{
159 struct kvm_vcpu *vcpu = arg; 218 struct vcpu_vmx *vmx = arg;
160 int cpu = raw_smp_processor_id(); 219 int cpu = raw_smp_processor_id();
161 220
162 if (vcpu->cpu == cpu) 221 if (vmx->vcpu.cpu == cpu)
163 vmcs_clear(vcpu->vmcs); 222 vmcs_clear(vmx->vmcs);
164 if (per_cpu(current_vmcs, cpu) == vcpu->vmcs) 223 if (per_cpu(current_vmcs, cpu) == vmx->vmcs)
165 per_cpu(current_vmcs, cpu) = NULL; 224 per_cpu(current_vmcs, cpu) = NULL;
166 rdtscll(vcpu->host_tsc); 225 rdtscll(vmx->vcpu.host_tsc);
167} 226}
168 227
169static void vcpu_clear(struct kvm_vcpu *vcpu) 228static void vcpu_clear(struct vcpu_vmx *vmx)
170{ 229{
171 if (vcpu->cpu != raw_smp_processor_id() && vcpu->cpu != -1) 230 if (vmx->vcpu.cpu != raw_smp_processor_id() && vmx->vcpu.cpu != -1)
172 smp_call_function_single(vcpu->cpu, __vcpu_clear, vcpu, 0, 1); 231 smp_call_function_single(vmx->vcpu.cpu, __vcpu_clear,
232 vmx, 0, 1);
173 else 233 else
174 __vcpu_clear(vcpu); 234 __vcpu_clear(vmx);
175 vcpu->launched = 0; 235 vmx->launched = 0;
176} 236}
177 237
178static unsigned long vmcs_readl(unsigned long field) 238static unsigned long vmcs_readl(unsigned long field)
@@ -282,121 +342,122 @@ static void reload_tss(void)
282#endif 342#endif
283} 343}
284 344
285static void load_transition_efer(struct kvm_vcpu *vcpu) 345static void load_transition_efer(struct vcpu_vmx *vmx)
286{ 346{
287 u64 trans_efer; 347 u64 trans_efer;
288 int efer_offset = vcpu->msr_offset_efer; 348 int efer_offset = vmx->msr_offset_efer;
289 349
290 trans_efer = vcpu->host_msrs[efer_offset].data; 350 trans_efer = vmx->host_msrs[efer_offset].data;
291 trans_efer &= ~EFER_SAVE_RESTORE_BITS; 351 trans_efer &= ~EFER_SAVE_RESTORE_BITS;
292 trans_efer |= msr_efer_save_restore_bits( 352 trans_efer |= msr_efer_save_restore_bits(vmx->guest_msrs[efer_offset]);
293 vcpu->guest_msrs[efer_offset]);
294 wrmsrl(MSR_EFER, trans_efer); 353 wrmsrl(MSR_EFER, trans_efer);
295 vcpu->stat.efer_reload++; 354 vmx->vcpu.stat.efer_reload++;
296} 355}
297 356
298static void vmx_save_host_state(struct kvm_vcpu *vcpu) 357static void vmx_save_host_state(struct kvm_vcpu *vcpu)
299{ 358{
300 struct vmx_host_state *hs = &vcpu->vmx_host_state; 359 struct vcpu_vmx *vmx = to_vmx(vcpu);
301 360
302 if (hs->loaded) 361 if (vmx->host_state.loaded)
303 return; 362 return;
304 363
305 hs->loaded = 1; 364 vmx->host_state.loaded = 1;
306 /* 365 /*
307 * Set host fs and gs selectors. Unfortunately, 22.2.3 does not 366 * Set host fs and gs selectors. Unfortunately, 22.2.3 does not
308 * allow segment selectors with cpl > 0 or ti == 1. 367 * allow segment selectors with cpl > 0 or ti == 1.
309 */ 368 */
310 hs->ldt_sel = read_ldt(); 369 vmx->host_state.ldt_sel = read_ldt();
311 hs->fs_gs_ldt_reload_needed = hs->ldt_sel; 370 vmx->host_state.gs_ldt_reload_needed = vmx->host_state.ldt_sel;
312 hs->fs_sel = read_fs(); 371 vmx->host_state.fs_sel = read_fs();
313 if (!(hs->fs_sel & 7)) 372 if (!(vmx->host_state.fs_sel & 7)) {
314 vmcs_write16(HOST_FS_SELECTOR, hs->fs_sel); 373 vmcs_write16(HOST_FS_SELECTOR, vmx->host_state.fs_sel);
315 else { 374 vmx->host_state.fs_reload_needed = 0;
375 } else {
316 vmcs_write16(HOST_FS_SELECTOR, 0); 376 vmcs_write16(HOST_FS_SELECTOR, 0);
317 hs->fs_gs_ldt_reload_needed = 1; 377 vmx->host_state.fs_reload_needed = 1;
318 } 378 }
319 hs->gs_sel = read_gs(); 379 vmx->host_state.gs_sel = read_gs();
320 if (!(hs->gs_sel & 7)) 380 if (!(vmx->host_state.gs_sel & 7))
321 vmcs_write16(HOST_GS_SELECTOR, hs->gs_sel); 381 vmcs_write16(HOST_GS_SELECTOR, vmx->host_state.gs_sel);
322 else { 382 else {
323 vmcs_write16(HOST_GS_SELECTOR, 0); 383 vmcs_write16(HOST_GS_SELECTOR, 0);
324 hs->fs_gs_ldt_reload_needed = 1; 384 vmx->host_state.gs_ldt_reload_needed = 1;
325 } 385 }
326 386
327#ifdef CONFIG_X86_64 387#ifdef CONFIG_X86_64
328 vmcs_writel(HOST_FS_BASE, read_msr(MSR_FS_BASE)); 388 vmcs_writel(HOST_FS_BASE, read_msr(MSR_FS_BASE));
329 vmcs_writel(HOST_GS_BASE, read_msr(MSR_GS_BASE)); 389 vmcs_writel(HOST_GS_BASE, read_msr(MSR_GS_BASE));
330#else 390#else
331 vmcs_writel(HOST_FS_BASE, segment_base(hs->fs_sel)); 391 vmcs_writel(HOST_FS_BASE, segment_base(vmx->host_state.fs_sel));
332 vmcs_writel(HOST_GS_BASE, segment_base(hs->gs_sel)); 392 vmcs_writel(HOST_GS_BASE, segment_base(vmx->host_state.gs_sel));
333#endif 393#endif
334 394
335#ifdef CONFIG_X86_64 395#ifdef CONFIG_X86_64
336 if (is_long_mode(vcpu)) { 396 if (is_long_mode(&vmx->vcpu)) {
337 save_msrs(vcpu->host_msrs + vcpu->msr_offset_kernel_gs_base, 1); 397 save_msrs(vmx->host_msrs +
398 vmx->msr_offset_kernel_gs_base, 1);
338 } 399 }
339#endif 400#endif
340 load_msrs(vcpu->guest_msrs, vcpu->save_nmsrs); 401 load_msrs(vmx->guest_msrs, vmx->save_nmsrs);
341 if (msr_efer_need_save_restore(vcpu)) 402 if (msr_efer_need_save_restore(vmx))
342 load_transition_efer(vcpu); 403 load_transition_efer(vmx);
343} 404}
344 405
345static void vmx_load_host_state(struct kvm_vcpu *vcpu) 406static void vmx_load_host_state(struct vcpu_vmx *vmx)
346{ 407{
347 struct vmx_host_state *hs = &vcpu->vmx_host_state; 408 unsigned long flags;
348 409
349 if (!hs->loaded) 410 if (!vmx->host_state.loaded)
350 return; 411 return;
351 412
352 hs->loaded = 0; 413 vmx->host_state.loaded = 0;
353 if (hs->fs_gs_ldt_reload_needed) { 414 if (vmx->host_state.fs_reload_needed)
354 load_ldt(hs->ldt_sel); 415 load_fs(vmx->host_state.fs_sel);
355 load_fs(hs->fs_sel); 416 if (vmx->host_state.gs_ldt_reload_needed) {
417 load_ldt(vmx->host_state.ldt_sel);
356 /* 418 /*
357 * If we have to reload gs, we must take care to 419 * If we have to reload gs, we must take care to
358 * preserve our gs base. 420 * preserve our gs base.
359 */ 421 */
360 local_irq_disable(); 422 local_irq_save(flags);
361 load_gs(hs->gs_sel); 423 load_gs(vmx->host_state.gs_sel);
362#ifdef CONFIG_X86_64 424#ifdef CONFIG_X86_64
363 wrmsrl(MSR_GS_BASE, vmcs_readl(HOST_GS_BASE)); 425 wrmsrl(MSR_GS_BASE, vmcs_readl(HOST_GS_BASE));
364#endif 426#endif
365 local_irq_enable(); 427 local_irq_restore(flags);
366
367 reload_tss();
368 } 428 }
369 save_msrs(vcpu->guest_msrs, vcpu->save_nmsrs); 429 reload_tss();
370 load_msrs(vcpu->host_msrs, vcpu->save_nmsrs); 430 save_msrs(vmx->guest_msrs, vmx->save_nmsrs);
371 if (msr_efer_need_save_restore(vcpu)) 431 load_msrs(vmx->host_msrs, vmx->save_nmsrs);
372 load_msrs(vcpu->host_msrs + vcpu->msr_offset_efer, 1); 432 if (msr_efer_need_save_restore(vmx))
433 load_msrs(vmx->host_msrs + vmx->msr_offset_efer, 1);
373} 434}
374 435
375/* 436/*
376 * Switches to specified vcpu, until a matching vcpu_put(), but assumes 437 * Switches to specified vcpu, until a matching vcpu_put(), but assumes
377 * vcpu mutex is already taken. 438 * vcpu mutex is already taken.
378 */ 439 */
379static void vmx_vcpu_load(struct kvm_vcpu *vcpu) 440static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
380{ 441{
381 u64 phys_addr = __pa(vcpu->vmcs); 442 struct vcpu_vmx *vmx = to_vmx(vcpu);
382 int cpu; 443 u64 phys_addr = __pa(vmx->vmcs);
383 u64 tsc_this, delta; 444 u64 tsc_this, delta;
384 445
385 cpu = get_cpu(); 446 if (vcpu->cpu != cpu) {
386 447 vcpu_clear(vmx);
387 if (vcpu->cpu != cpu) 448 kvm_migrate_apic_timer(vcpu);
388 vcpu_clear(vcpu); 449 }
389 450
390 if (per_cpu(current_vmcs, cpu) != vcpu->vmcs) { 451 if (per_cpu(current_vmcs, cpu) != vmx->vmcs) {
391 u8 error; 452 u8 error;
392 453
393 per_cpu(current_vmcs, cpu) = vcpu->vmcs; 454 per_cpu(current_vmcs, cpu) = vmx->vmcs;
394 asm volatile (ASM_VMX_VMPTRLD_RAX "; setna %0" 455 asm volatile (ASM_VMX_VMPTRLD_RAX "; setna %0"
395 : "=g"(error) : "a"(&phys_addr), "m"(phys_addr) 456 : "=g"(error) : "a"(&phys_addr), "m"(phys_addr)
396 : "cc"); 457 : "cc");
397 if (error) 458 if (error)
398 printk(KERN_ERR "kvm: vmptrld %p/%llx fail\n", 459 printk(KERN_ERR "kvm: vmptrld %p/%llx fail\n",
399 vcpu->vmcs, phys_addr); 460 vmx->vmcs, phys_addr);
400 } 461 }
401 462
402 if (vcpu->cpu != cpu) { 463 if (vcpu->cpu != cpu) {
@@ -426,9 +487,8 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu)
426 487
427static void vmx_vcpu_put(struct kvm_vcpu *vcpu) 488static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
428{ 489{
429 vmx_load_host_state(vcpu); 490 vmx_load_host_state(to_vmx(vcpu));
430 kvm_put_guest_fpu(vcpu); 491 kvm_put_guest_fpu(vcpu);
431 put_cpu();
432} 492}
433 493
434static void vmx_fpu_activate(struct kvm_vcpu *vcpu) 494static void vmx_fpu_activate(struct kvm_vcpu *vcpu)
@@ -436,9 +496,9 @@ static void vmx_fpu_activate(struct kvm_vcpu *vcpu)
436 if (vcpu->fpu_active) 496 if (vcpu->fpu_active)
437 return; 497 return;
438 vcpu->fpu_active = 1; 498 vcpu->fpu_active = 1;
439 vmcs_clear_bits(GUEST_CR0, CR0_TS_MASK); 499 vmcs_clear_bits(GUEST_CR0, X86_CR0_TS);
440 if (vcpu->cr0 & CR0_TS_MASK) 500 if (vcpu->cr0 & X86_CR0_TS)
441 vmcs_set_bits(GUEST_CR0, CR0_TS_MASK); 501 vmcs_set_bits(GUEST_CR0, X86_CR0_TS);
442 update_exception_bitmap(vcpu); 502 update_exception_bitmap(vcpu);
443} 503}
444 504
@@ -447,13 +507,13 @@ static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu)
447 if (!vcpu->fpu_active) 507 if (!vcpu->fpu_active)
448 return; 508 return;
449 vcpu->fpu_active = 0; 509 vcpu->fpu_active = 0;
450 vmcs_set_bits(GUEST_CR0, CR0_TS_MASK); 510 vmcs_set_bits(GUEST_CR0, X86_CR0_TS);
451 update_exception_bitmap(vcpu); 511 update_exception_bitmap(vcpu);
452} 512}
453 513
454static void vmx_vcpu_decache(struct kvm_vcpu *vcpu) 514static void vmx_vcpu_decache(struct kvm_vcpu *vcpu)
455{ 515{
456 vcpu_clear(vcpu); 516 vcpu_clear(to_vmx(vcpu));
457} 517}
458 518
459static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) 519static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
@@ -501,59 +561,62 @@ static void vmx_inject_gp(struct kvm_vcpu *vcpu, unsigned error_code)
501/* 561/*
502 * Swap MSR entry in host/guest MSR entry array. 562 * Swap MSR entry in host/guest MSR entry array.
503 */ 563 */
504void move_msr_up(struct kvm_vcpu *vcpu, int from, int to) 564#ifdef CONFIG_X86_64
565static void move_msr_up(struct vcpu_vmx *vmx, int from, int to)
505{ 566{
506 struct vmx_msr_entry tmp; 567 struct kvm_msr_entry tmp;
507 tmp = vcpu->guest_msrs[to]; 568
508 vcpu->guest_msrs[to] = vcpu->guest_msrs[from]; 569 tmp = vmx->guest_msrs[to];
509 vcpu->guest_msrs[from] = tmp; 570 vmx->guest_msrs[to] = vmx->guest_msrs[from];
510 tmp = vcpu->host_msrs[to]; 571 vmx->guest_msrs[from] = tmp;
511 vcpu->host_msrs[to] = vcpu->host_msrs[from]; 572 tmp = vmx->host_msrs[to];
512 vcpu->host_msrs[from] = tmp; 573 vmx->host_msrs[to] = vmx->host_msrs[from];
574 vmx->host_msrs[from] = tmp;
513} 575}
576#endif
514 577
515/* 578/*
516 * Set up the vmcs to automatically save and restore system 579 * Set up the vmcs to automatically save and restore system
517 * msrs. Don't touch the 64-bit msrs if the guest is in legacy 580 * msrs. Don't touch the 64-bit msrs if the guest is in legacy
518 * mode, as fiddling with msrs is very expensive. 581 * mode, as fiddling with msrs is very expensive.
519 */ 582 */
520static void setup_msrs(struct kvm_vcpu *vcpu) 583static void setup_msrs(struct vcpu_vmx *vmx)
521{ 584{
522 int save_nmsrs; 585 int save_nmsrs;
523 586
524 save_nmsrs = 0; 587 save_nmsrs = 0;
525#ifdef CONFIG_X86_64 588#ifdef CONFIG_X86_64
526 if (is_long_mode(vcpu)) { 589 if (is_long_mode(&vmx->vcpu)) {
527 int index; 590 int index;
528 591
529 index = __find_msr_index(vcpu, MSR_SYSCALL_MASK); 592 index = __find_msr_index(vmx, MSR_SYSCALL_MASK);
530 if (index >= 0) 593 if (index >= 0)
531 move_msr_up(vcpu, index, save_nmsrs++); 594 move_msr_up(vmx, index, save_nmsrs++);
532 index = __find_msr_index(vcpu, MSR_LSTAR); 595 index = __find_msr_index(vmx, MSR_LSTAR);
533 if (index >= 0) 596 if (index >= 0)
534 move_msr_up(vcpu, index, save_nmsrs++); 597 move_msr_up(vmx, index, save_nmsrs++);
535 index = __find_msr_index(vcpu, MSR_CSTAR); 598 index = __find_msr_index(vmx, MSR_CSTAR);
536 if (index >= 0) 599 if (index >= 0)
537 move_msr_up(vcpu, index, save_nmsrs++); 600 move_msr_up(vmx, index, save_nmsrs++);
538 index = __find_msr_index(vcpu, MSR_KERNEL_GS_BASE); 601 index = __find_msr_index(vmx, MSR_KERNEL_GS_BASE);
539 if (index >= 0) 602 if (index >= 0)
540 move_msr_up(vcpu, index, save_nmsrs++); 603 move_msr_up(vmx, index, save_nmsrs++);
541 /* 604 /*
542 * MSR_K6_STAR is only needed on long mode guests, and only 605 * MSR_K6_STAR is only needed on long mode guests, and only
543 * if efer.sce is enabled. 606 * if efer.sce is enabled.
544 */ 607 */
545 index = __find_msr_index(vcpu, MSR_K6_STAR); 608 index = __find_msr_index(vmx, MSR_K6_STAR);
546 if ((index >= 0) && (vcpu->shadow_efer & EFER_SCE)) 609 if ((index >= 0) && (vmx->vcpu.shadow_efer & EFER_SCE))
547 move_msr_up(vcpu, index, save_nmsrs++); 610 move_msr_up(vmx, index, save_nmsrs++);
548 } 611 }
549#endif 612#endif
550 vcpu->save_nmsrs = save_nmsrs; 613 vmx->save_nmsrs = save_nmsrs;
551 614
552#ifdef CONFIG_X86_64 615#ifdef CONFIG_X86_64
553 vcpu->msr_offset_kernel_gs_base = 616 vmx->msr_offset_kernel_gs_base =
554 __find_msr_index(vcpu, MSR_KERNEL_GS_BASE); 617 __find_msr_index(vmx, MSR_KERNEL_GS_BASE);
555#endif 618#endif
556 vcpu->msr_offset_efer = __find_msr_index(vcpu, MSR_EFER); 619 vmx->msr_offset_efer = __find_msr_index(vmx, MSR_EFER);
557} 620}
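setup_msrs() uses move_msr_up() to pack the MSRs that actually need per-switch save/restore at the front of the array, so the world-switch path only walks the first save_nmsrs entries. A toy sketch of that packing idea, with stand-in types and an invented selection rule:

/* Sketch of the "swap the hot MSRs to the front" packing in setup_msrs(). */
#include <stdio.h>

struct msr_entry {
	unsigned index;
	unsigned long long data;
};

static void move_up(struct msr_entry *e, int from, int to)
{
	struct msr_entry tmp = e[to];

	e[to]   = e[from];
	e[from] = tmp;
}

int main(void)
{
	struct msr_entry msrs[] = {
		{ 0x174, 0 }, { 0xc0000081, 0 }, { 0x175, 0 }, { 0xc0000082, 0 },
	};
	int i, save_nmsrs = 0;

	/* Hypothetical rule: only the 0xc00000xx MSRs need per-switch reload. */
	for (i = 0; i < 4; i++)
		if (msrs[i].index >= 0xc0000000)
			move_up(msrs, i, save_nmsrs++);

	printf("save_nmsrs=%d, first index=%x\n", save_nmsrs, msrs[0].index);
	return 0;
}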
558 621
559/* 622/*
@@ -589,7 +652,7 @@ static void guest_write_tsc(u64 guest_tsc)
589static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) 652static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
590{ 653{
591 u64 data; 654 u64 data;
592 struct vmx_msr_entry *msr; 655 struct kvm_msr_entry *msr;
593 656
594 if (!pdata) { 657 if (!pdata) {
595 printk(KERN_ERR "BUG: get_msr called with NULL pdata\n"); 658 printk(KERN_ERR "BUG: get_msr called with NULL pdata\n");
@@ -620,7 +683,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
620 data = vmcs_readl(GUEST_SYSENTER_ESP); 683 data = vmcs_readl(GUEST_SYSENTER_ESP);
621 break; 684 break;
622 default: 685 default:
623 msr = find_msr_entry(vcpu, msr_index); 686 msr = find_msr_entry(to_vmx(vcpu), msr_index);
624 if (msr) { 687 if (msr) {
625 data = msr->data; 688 data = msr->data;
626 break; 689 break;
@@ -639,15 +702,16 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
639 */ 702 */
640static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) 703static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
641{ 704{
642 struct vmx_msr_entry *msr; 705 struct vcpu_vmx *vmx = to_vmx(vcpu);
706 struct kvm_msr_entry *msr;
643 int ret = 0; 707 int ret = 0;
644 708
645 switch (msr_index) { 709 switch (msr_index) {
646#ifdef CONFIG_X86_64 710#ifdef CONFIG_X86_64
647 case MSR_EFER: 711 case MSR_EFER:
648 ret = kvm_set_msr_common(vcpu, msr_index, data); 712 ret = kvm_set_msr_common(vcpu, msr_index, data);
649 if (vcpu->vmx_host_state.loaded) 713 if (vmx->host_state.loaded)
650 load_transition_efer(vcpu); 714 load_transition_efer(vmx);
651 break; 715 break;
652 case MSR_FS_BASE: 716 case MSR_FS_BASE:
653 vmcs_writel(GUEST_FS_BASE, data); 717 vmcs_writel(GUEST_FS_BASE, data);
@@ -669,11 +733,11 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
669 guest_write_tsc(data); 733 guest_write_tsc(data);
670 break; 734 break;
671 default: 735 default:
672 msr = find_msr_entry(vcpu, msr_index); 736 msr = find_msr_entry(vmx, msr_index);
673 if (msr) { 737 if (msr) {
674 msr->data = data; 738 msr->data = data;
675 if (vcpu->vmx_host_state.loaded) 739 if (vmx->host_state.loaded)
676 load_msrs(vcpu->guest_msrs, vcpu->save_nmsrs); 740 load_msrs(vmx->guest_msrs, vmx->save_nmsrs);
677 break; 741 break;
678 } 742 }
679 ret = kvm_set_msr_common(vcpu, msr_index, data); 743 ret = kvm_set_msr_common(vcpu, msr_index, data);
@@ -740,6 +804,20 @@ static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg)
740 return 0; 804 return 0;
741} 805}
742 806
807static int vmx_get_irq(struct kvm_vcpu *vcpu)
808{
809 u32 idtv_info_field;
810
811 idtv_info_field = vmcs_read32(IDT_VECTORING_INFO_FIELD);
812 if (idtv_info_field & INTR_INFO_VALID_MASK) {
813 if (is_external_interrupt(idtv_info_field))
814 return idtv_info_field & VECTORING_INFO_VECTOR_MASK;
815 else
816 printk("pending exception: not handled yet\n");
817 }
818 return -1;
819}
820
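A standalone sketch (not part of the patch) of the field layout vmx_get_irq() relies on: in the IDT-vectoring / interruption information format, bits 7:0 carry the vector, bits 10:8 the event type, and bit 31 the valid flag. The mask names mirror the kernel's but are redefined here purely for illustration.

#include <stdio.h>

#define VECTORING_INFO_VECTOR_MASK 0xffu
#define INTR_INFO_INTR_TYPE_MASK   0x700u
#define INTR_INFO_VALID_MASK       0x80000000u
#define INTR_TYPE_EXT_INTR         (0u << 8)

int main(void)
{
        unsigned int info = 0x80000020u;  /* valid, external interrupt, vector 0x20 */

        if ((info & INTR_INFO_VALID_MASK) &&
            (info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_EXT_INTR)
                printf("pending external irq, vector %u\n",
                       info & VECTORING_INFO_VECTOR_MASK);
        return 0;
}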
743static __init int cpu_has_kvm_support(void) 821static __init int cpu_has_kvm_support(void)
744{ 822{
745 unsigned long ecx = cpuid_ecx(1); 823 unsigned long ecx = cpuid_ecx(1);
@@ -751,7 +829,10 @@ static __init int vmx_disabled_by_bios(void)
751 u64 msr; 829 u64 msr;
752 830
753 rdmsrl(MSR_IA32_FEATURE_CONTROL, msr); 831 rdmsrl(MSR_IA32_FEATURE_CONTROL, msr);
754 return (msr & 5) == 1; /* locked but not enabled */ 832 return (msr & (MSR_IA32_FEATURE_CONTROL_LOCKED |
833 MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED))
834 == MSR_IA32_FEATURE_CONTROL_LOCKED;
835 /* locked but not enabled */
755} 836}
756 837
757static void hardware_enable(void *garbage) 838static void hardware_enable(void *garbage)
@@ -761,10 +842,15 @@ static void hardware_enable(void *garbage)
761 u64 old; 842 u64 old;
762 843
763 rdmsrl(MSR_IA32_FEATURE_CONTROL, old); 844 rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
764 if ((old & 5) != 5) 845 if ((old & (MSR_IA32_FEATURE_CONTROL_LOCKED |
846 MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED))
847 != (MSR_IA32_FEATURE_CONTROL_LOCKED |
848 MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED))
765 /* enable and lock */ 849 /* enable and lock */
766 wrmsrl(MSR_IA32_FEATURE_CONTROL, old | 5); 850 wrmsrl(MSR_IA32_FEATURE_CONTROL, old |
767 write_cr4(read_cr4() | CR4_VMXE); /* FIXME: not cpu hotplug safe */ 851 MSR_IA32_FEATURE_CONTROL_LOCKED |
852 MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED);
853 write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */
768 asm volatile (ASM_VMX_VMXON_RAX : : "a"(&phys_addr), "m"(phys_addr) 854 asm volatile (ASM_VMX_VMXON_RAX : : "a"(&phys_addr), "m"(phys_addr)
769 : "memory", "cc"); 855 : "memory", "cc");
770} 856}
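A standalone sketch (not part of the patch) of the IA32_FEATURE_CONTROL logic the two hunks above switch to symbolic names for: bit 0 is the lock bit, bit 2 enables VMXON outside SMX. "Locked but not enabled" means the BIOS disabled VMX; if the MSR is not yet locked, the host may set both bits itself before executing VMXON.

#include <stdio.h>

#define FC_LOCKED        0x1ull
#define FC_VMXON_ENABLED 0x4ull

int main(void)
{
        unsigned long long msr = FC_LOCKED;          /* sample BIOS setting */

        int disabled_by_bios =
                (msr & (FC_LOCKED | FC_VMXON_ENABLED)) == FC_LOCKED;

        if (!disabled_by_bios &&
            (msr & (FC_LOCKED | FC_VMXON_ENABLED)) != (FC_LOCKED | FC_VMXON_ENABLED))
                msr |= FC_LOCKED | FC_VMXON_ENABLED; /* enable and lock */

        printf("disabled_by_bios=%d msr=%#llx\n", disabled_by_bios, msr);
        return 0;
}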
@@ -774,14 +860,102 @@ static void hardware_disable(void *garbage)
774 asm volatile (ASM_VMX_VMXOFF : : : "cc"); 860 asm volatile (ASM_VMX_VMXOFF : : : "cc");
775} 861}
776 862
777static __init void setup_vmcs_descriptor(void) 863static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
864 u32 msr, u32* result)
865{
866 u32 vmx_msr_low, vmx_msr_high;
867 u32 ctl = ctl_min | ctl_opt;
868
869 rdmsr(msr, vmx_msr_low, vmx_msr_high);
870
871 ctl &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */
872 ctl |= vmx_msr_low; /* bit == 1 in low word ==> must be one */
873
874 /* Ensure minimum (required) set of control bits are supported. */
875 if (ctl_min & ~ctl)
876 return -EIO;
877
878 *result = ctl;
879 return 0;
880}
881
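A standalone sketch (not part of the patch) of the adjust_vmx_controls() rule: each VMX capability MSR reports, in its high dword, a 0 for every control bit that must be 0, and in its low dword a 1 for every bit that must be 1; the requested controls are masked accordingly and the call fails if a required bit does not survive. The sample MSR halves below are invented for illustration.

#include <stdio.h>

static int adjust(unsigned int min, unsigned int opt,
                  unsigned int msr_lo, unsigned int msr_hi, unsigned int *out)
{
        unsigned int ctl = min | opt;

        ctl &= msr_hi;          /* clear bits the CPU cannot set */
        ctl |= msr_lo;          /* set bits the CPU insists on */
        if (min & ~ctl)
                return -1;      /* a required bit is unsupported */
        *out = ctl;
        return 0;
}

int main(void)
{
        unsigned int ctl;

        if (adjust(0x84 /* min */, 0x20 /* opt */, 0x16, 0xff, &ctl) == 0)
                printf("resulting control word: %#x\n", ctl);
        return 0;
}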
882static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
778{ 883{
779 u32 vmx_msr_low, vmx_msr_high; 884 u32 vmx_msr_low, vmx_msr_high;
885 u32 min, opt;
886 u32 _pin_based_exec_control = 0;
887 u32 _cpu_based_exec_control = 0;
888 u32 _vmexit_control = 0;
889 u32 _vmentry_control = 0;
890
891 min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING;
892 opt = 0;
893 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS,
894 &_pin_based_exec_control) < 0)
895 return -EIO;
896
897 min = CPU_BASED_HLT_EXITING |
898#ifdef CONFIG_X86_64
899 CPU_BASED_CR8_LOAD_EXITING |
900 CPU_BASED_CR8_STORE_EXITING |
901#endif
902 CPU_BASED_USE_IO_BITMAPS |
903 CPU_BASED_MOV_DR_EXITING |
904 CPU_BASED_USE_TSC_OFFSETING;
905#ifdef CONFIG_X86_64
906 opt = CPU_BASED_TPR_SHADOW;
907#else
908 opt = 0;
909#endif
910 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS,
911 &_cpu_based_exec_control) < 0)
912 return -EIO;
913#ifdef CONFIG_X86_64
914 if ((_cpu_based_exec_control & CPU_BASED_TPR_SHADOW))
915 _cpu_based_exec_control &= ~CPU_BASED_CR8_LOAD_EXITING &
916 ~CPU_BASED_CR8_STORE_EXITING;
917#endif
918
919 min = 0;
920#ifdef CONFIG_X86_64
921 min |= VM_EXIT_HOST_ADDR_SPACE_SIZE;
922#endif
923 opt = 0;
924 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS,
925 &_vmexit_control) < 0)
926 return -EIO;
927
928 min = opt = 0;
929 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS,
930 &_vmentry_control) < 0)
931 return -EIO;
780 932
781 rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high); 933 rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high);
782 vmcs_descriptor.size = vmx_msr_high & 0x1fff; 934
783 vmcs_descriptor.order = get_order(vmcs_descriptor.size); 935 /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */
784 vmcs_descriptor.revision_id = vmx_msr_low; 936 if ((vmx_msr_high & 0x1fff) > PAGE_SIZE)
937 return -EIO;
938
939#ifdef CONFIG_X86_64
940 /* IA-32 SDM Vol 3B: 64-bit CPUs always have VMX_BASIC_MSR[48]==0. */
941 if (vmx_msr_high & (1u<<16))
942 return -EIO;
943#endif
944
945 /* Require Write-Back (WB) memory type for VMCS accesses. */
946 if (((vmx_msr_high >> 18) & 15) != 6)
947 return -EIO;
948
949 vmcs_conf->size = vmx_msr_high & 0x1fff;
950 vmcs_conf->order = get_order(vmcs_config.size);
951 vmcs_conf->revision_id = vmx_msr_low;
952
953 vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control;
954 vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control;
955 vmcs_conf->vmexit_ctrl = _vmexit_control;
956 vmcs_conf->vmentry_ctrl = _vmentry_control;
957
958 return 0;
785} 959}
786 960
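A standalone sketch (not part of the patch) decoding the IA32_VMX_BASIC fields that setup_vmcs_config() checks above: the high dword carries the VMCS region size (bits 12:0), the "32-bit physical addresses only" flag (bit 16), and the required VMCS memory type (bits 21:18, where 6 means write-back). The sample value is made up for illustration.

#include <stdio.h>

int main(void)
{
        unsigned int vmx_msr_high = (6u << 18) | 0x400;  /* WB, 1024-byte VMCS */

        unsigned int size     = vmx_msr_high & 0x1fff;
        int          only_32  = (vmx_msr_high >> 16) & 1;
        unsigned int mem_type = (vmx_msr_high >> 18) & 15;

        printf("vmcs size %u, 32-bit-only %d, memory type %u (%s)\n",
               size, only_32, mem_type, mem_type == 6 ? "WB" : "other");
        return 0;
}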
787static struct vmcs *alloc_vmcs_cpu(int cpu) 961static struct vmcs *alloc_vmcs_cpu(int cpu)
@@ -790,12 +964,12 @@ static struct vmcs *alloc_vmcs_cpu(int cpu)
790 struct page *pages; 964 struct page *pages;
791 struct vmcs *vmcs; 965 struct vmcs *vmcs;
792 966
793 pages = alloc_pages_node(node, GFP_KERNEL, vmcs_descriptor.order); 967 pages = alloc_pages_node(node, GFP_KERNEL, vmcs_config.order);
794 if (!pages) 968 if (!pages)
795 return NULL; 969 return NULL;
796 vmcs = page_address(pages); 970 vmcs = page_address(pages);
797 memset(vmcs, 0, vmcs_descriptor.size); 971 memset(vmcs, 0, vmcs_config.size);
798 vmcs->revision_id = vmcs_descriptor.revision_id; /* vmcs revision id */ 972 vmcs->revision_id = vmcs_config.revision_id; /* vmcs revision id */
799 return vmcs; 973 return vmcs;
800} 974}
801 975
@@ -806,7 +980,7 @@ static struct vmcs *alloc_vmcs(void)
806 980
807static void free_vmcs(struct vmcs *vmcs) 981static void free_vmcs(struct vmcs *vmcs)
808{ 982{
809 free_pages((unsigned long)vmcs, vmcs_descriptor.order); 983 free_pages((unsigned long)vmcs, vmcs_config.order);
810} 984}
811 985
812static void free_kvm_area(void) 986static void free_kvm_area(void)
@@ -817,8 +991,6 @@ static void free_kvm_area(void)
817 free_vmcs(per_cpu(vmxarea, cpu)); 991 free_vmcs(per_cpu(vmxarea, cpu));
818} 992}
819 993
820extern struct vmcs *alloc_vmcs_cpu(int cpu);
821
822static __init int alloc_kvm_area(void) 994static __init int alloc_kvm_area(void)
823{ 995{
824 int cpu; 996 int cpu;
@@ -839,7 +1011,8 @@ static __init int alloc_kvm_area(void)
839 1011
840static __init int hardware_setup(void) 1012static __init int hardware_setup(void)
841{ 1013{
842 setup_vmcs_descriptor(); 1014 if (setup_vmcs_config(&vmcs_config) < 0)
1015 return -EIO;
843 return alloc_kvm_area(); 1016 return alloc_kvm_area();
844} 1017}
845 1018
@@ -879,8 +1052,8 @@ static void enter_pmode(struct kvm_vcpu *vcpu)
879 flags |= (vcpu->rmode.save_iopl << IOPL_SHIFT); 1052 flags |= (vcpu->rmode.save_iopl << IOPL_SHIFT);
880 vmcs_writel(GUEST_RFLAGS, flags); 1053 vmcs_writel(GUEST_RFLAGS, flags);
881 1054
882 vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~CR4_VME_MASK) | 1055 vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) |
883 (vmcs_readl(CR4_READ_SHADOW) & CR4_VME_MASK)); 1056 (vmcs_readl(CR4_READ_SHADOW) & X86_CR4_VME));
884 1057
885 update_exception_bitmap(vcpu); 1058 update_exception_bitmap(vcpu);
886 1059
@@ -897,7 +1070,7 @@ static void enter_pmode(struct kvm_vcpu *vcpu)
897 vmcs_write32(GUEST_CS_AR_BYTES, 0x9b); 1070 vmcs_write32(GUEST_CS_AR_BYTES, 0x9b);
898} 1071}
899 1072
900static int rmode_tss_base(struct kvm* kvm) 1073static gva_t rmode_tss_base(struct kvm* kvm)
901{ 1074{
902 gfn_t base_gfn = kvm->memslots[0].base_gfn + kvm->memslots[0].npages - 3; 1075 gfn_t base_gfn = kvm->memslots[0].base_gfn + kvm->memslots[0].npages - 3;
903 return base_gfn << PAGE_SHIFT; 1076 return base_gfn << PAGE_SHIFT;
@@ -937,7 +1110,7 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
937 flags |= IOPL_MASK | X86_EFLAGS_VM; 1110 flags |= IOPL_MASK | X86_EFLAGS_VM;
938 1111
939 vmcs_writel(GUEST_RFLAGS, flags); 1112 vmcs_writel(GUEST_RFLAGS, flags);
940 vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | CR4_VME_MASK); 1113 vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME);
941 update_exception_bitmap(vcpu); 1114 update_exception_bitmap(vcpu);
942 1115
943 vmcs_write16(GUEST_SS_SELECTOR, vmcs_readl(GUEST_SS_BASE) >> 4); 1116 vmcs_write16(GUEST_SS_SELECTOR, vmcs_readl(GUEST_SS_BASE) >> 4);
@@ -975,10 +1148,10 @@ static void enter_lmode(struct kvm_vcpu *vcpu)
975 1148
976 vcpu->shadow_efer |= EFER_LMA; 1149 vcpu->shadow_efer |= EFER_LMA;
977 1150
978 find_msr_entry(vcpu, MSR_EFER)->data |= EFER_LMA | EFER_LME; 1151 find_msr_entry(to_vmx(vcpu), MSR_EFER)->data |= EFER_LMA | EFER_LME;
979 vmcs_write32(VM_ENTRY_CONTROLS, 1152 vmcs_write32(VM_ENTRY_CONTROLS,
980 vmcs_read32(VM_ENTRY_CONTROLS) 1153 vmcs_read32(VM_ENTRY_CONTROLS)
981 | VM_ENTRY_CONTROLS_IA32E_MASK); 1154 | VM_ENTRY_IA32E_MODE);
982} 1155}
983 1156
984static void exit_lmode(struct kvm_vcpu *vcpu) 1157static void exit_lmode(struct kvm_vcpu *vcpu)
@@ -987,7 +1160,7 @@ static void exit_lmode(struct kvm_vcpu *vcpu)
987 1160
988 vmcs_write32(VM_ENTRY_CONTROLS, 1161 vmcs_write32(VM_ENTRY_CONTROLS,
989 vmcs_read32(VM_ENTRY_CONTROLS) 1162 vmcs_read32(VM_ENTRY_CONTROLS)
990 & ~VM_ENTRY_CONTROLS_IA32E_MASK); 1163 & ~VM_ENTRY_IA32E_MODE);
991} 1164}
992 1165
993#endif 1166#endif
@@ -1002,17 +1175,17 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
1002{ 1175{
1003 vmx_fpu_deactivate(vcpu); 1176 vmx_fpu_deactivate(vcpu);
1004 1177
1005 if (vcpu->rmode.active && (cr0 & CR0_PE_MASK)) 1178 if (vcpu->rmode.active && (cr0 & X86_CR0_PE))
1006 enter_pmode(vcpu); 1179 enter_pmode(vcpu);
1007 1180
1008 if (!vcpu->rmode.active && !(cr0 & CR0_PE_MASK)) 1181 if (!vcpu->rmode.active && !(cr0 & X86_CR0_PE))
1009 enter_rmode(vcpu); 1182 enter_rmode(vcpu);
1010 1183
1011#ifdef CONFIG_X86_64 1184#ifdef CONFIG_X86_64
1012 if (vcpu->shadow_efer & EFER_LME) { 1185 if (vcpu->shadow_efer & EFER_LME) {
1013 if (!is_paging(vcpu) && (cr0 & CR0_PG_MASK)) 1186 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG))
1014 enter_lmode(vcpu); 1187 enter_lmode(vcpu);
1015 if (is_paging(vcpu) && !(cr0 & CR0_PG_MASK)) 1188 if (is_paging(vcpu) && !(cr0 & X86_CR0_PG))
1016 exit_lmode(vcpu); 1189 exit_lmode(vcpu);
1017 } 1190 }
1018#endif 1191#endif
@@ -1022,14 +1195,14 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
1022 (cr0 & ~KVM_GUEST_CR0_MASK) | KVM_VM_CR0_ALWAYS_ON); 1195 (cr0 & ~KVM_GUEST_CR0_MASK) | KVM_VM_CR0_ALWAYS_ON);
1023 vcpu->cr0 = cr0; 1196 vcpu->cr0 = cr0;
1024 1197
1025 if (!(cr0 & CR0_TS_MASK) || !(cr0 & CR0_PE_MASK)) 1198 if (!(cr0 & X86_CR0_TS) || !(cr0 & X86_CR0_PE))
1026 vmx_fpu_activate(vcpu); 1199 vmx_fpu_activate(vcpu);
1027} 1200}
1028 1201
1029static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) 1202static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
1030{ 1203{
1031 vmcs_writel(GUEST_CR3, cr3); 1204 vmcs_writel(GUEST_CR3, cr3);
1032 if (vcpu->cr0 & CR0_PE_MASK) 1205 if (vcpu->cr0 & X86_CR0_PE)
1033 vmx_fpu_deactivate(vcpu); 1206 vmx_fpu_deactivate(vcpu);
1034} 1207}
1035 1208
@@ -1045,23 +1218,24 @@ static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
1045 1218
1046static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) 1219static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
1047{ 1220{
1048 struct vmx_msr_entry *msr = find_msr_entry(vcpu, MSR_EFER); 1221 struct vcpu_vmx *vmx = to_vmx(vcpu);
1222 struct kvm_msr_entry *msr = find_msr_entry(vmx, MSR_EFER);
1049 1223
1050 vcpu->shadow_efer = efer; 1224 vcpu->shadow_efer = efer;
1051 if (efer & EFER_LMA) { 1225 if (efer & EFER_LMA) {
1052 vmcs_write32(VM_ENTRY_CONTROLS, 1226 vmcs_write32(VM_ENTRY_CONTROLS,
1053 vmcs_read32(VM_ENTRY_CONTROLS) | 1227 vmcs_read32(VM_ENTRY_CONTROLS) |
1054 VM_ENTRY_CONTROLS_IA32E_MASK); 1228 VM_ENTRY_IA32E_MODE);
1055 msr->data = efer; 1229 msr->data = efer;
1056 1230
1057 } else { 1231 } else {
1058 vmcs_write32(VM_ENTRY_CONTROLS, 1232 vmcs_write32(VM_ENTRY_CONTROLS,
1059 vmcs_read32(VM_ENTRY_CONTROLS) & 1233 vmcs_read32(VM_ENTRY_CONTROLS) &
1060 ~VM_ENTRY_CONTROLS_IA32E_MASK); 1234 ~VM_ENTRY_IA32E_MODE);
1061 1235
1062 msr->data = efer & ~EFER_LME; 1236 msr->data = efer & ~EFER_LME;
1063 } 1237 }
1064 setup_msrs(vcpu); 1238 setup_msrs(vmx);
1065} 1239}
1066 1240
1067#endif 1241#endif
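A standalone sketch (not part of the patch) of what enter_lmode(), exit_lmode() and vmx_set_efer() keep consistent above: the VM-entry controls' IA-32e mode bit (bit 9, renamed VM_ENTRY_IA32E_MODE in this patch) follows the guest's EFER.LMA. Bit positions are architectural; the surrounding scaffolding is illustrative.

#include <stdio.h>

#define EFER_LMA            (1u << 10)
#define VM_ENTRY_IA32E_MODE (1u << 9)

int main(void)
{
        unsigned int entry_controls = 0;
        unsigned int efer = EFER_LMA;    /* guest switching to long mode */

        if (efer & EFER_LMA)
                entry_controls |= VM_ENTRY_IA32E_MODE;
        else
                entry_controls &= ~VM_ENTRY_IA32E_MODE;

        printf("entry controls: %#x\n", entry_controls);
        return 0;
}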
@@ -1210,17 +1384,6 @@ static int init_rmode_tss(struct kvm* kvm)
1210 return 1; 1384 return 1;
1211} 1385}
1212 1386
1213static void vmcs_write32_fixedbits(u32 msr, u32 vmcs_field, u32 val)
1214{
1215 u32 msr_high, msr_low;
1216
1217 rdmsr(msr, msr_low, msr_high);
1218
1219 val &= msr_high;
1220 val |= msr_low;
1221 vmcs_write32(vmcs_field, val);
1222}
1223
1224static void seg_setup(int seg) 1387static void seg_setup(int seg)
1225{ 1388{
1226 struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; 1389 struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
@@ -1234,7 +1397,7 @@ static void seg_setup(int seg)
1234/* 1397/*
1235 * Sets up the vmcs for emulated real mode. 1398 * Sets up the vmcs for emulated real mode.
1236 */ 1399 */
1237static int vmx_vcpu_setup(struct kvm_vcpu *vcpu) 1400static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
1238{ 1401{
1239 u32 host_sysenter_cs; 1402 u32 host_sysenter_cs;
1240 u32 junk; 1403 u32 junk;
@@ -1243,27 +1406,36 @@ static int vmx_vcpu_setup(struct kvm_vcpu *vcpu)
1243 int i; 1406 int i;
1244 int ret = 0; 1407 int ret = 0;
1245 unsigned long kvm_vmx_return; 1408 unsigned long kvm_vmx_return;
1409 u64 msr;
1410 u32 exec_control;
1246 1411
1247 if (!init_rmode_tss(vcpu->kvm)) { 1412 if (!init_rmode_tss(vmx->vcpu.kvm)) {
1248 ret = -ENOMEM; 1413 ret = -ENOMEM;
1249 goto out; 1414 goto out;
1250 } 1415 }
1251 1416
1252 memset(vcpu->regs, 0, sizeof(vcpu->regs)); 1417 vmx->vcpu.rmode.active = 0;
1253 vcpu->regs[VCPU_REGS_RDX] = get_rdx_init_val();
1254 vcpu->cr8 = 0;
1255 vcpu->apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
1256 if (vcpu == &vcpu->kvm->vcpus[0])
1257 vcpu->apic_base |= MSR_IA32_APICBASE_BSP;
1258 1418
1259 fx_init(vcpu); 1419 vmx->vcpu.regs[VCPU_REGS_RDX] = get_rdx_init_val();
1420 set_cr8(&vmx->vcpu, 0);
1421 msr = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
1422 if (vmx->vcpu.vcpu_id == 0)
1423 msr |= MSR_IA32_APICBASE_BSP;
1424 kvm_set_apic_base(&vmx->vcpu, msr);
1425
1426 fx_init(&vmx->vcpu);
1260 1427
1261 /* 1428 /*
1262 * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode 1429 * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode
1263 * insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4. Sigh. 1430 * insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4. Sigh.
1264 */ 1431 */
1265 vmcs_write16(GUEST_CS_SELECTOR, 0xf000); 1432 if (vmx->vcpu.vcpu_id == 0) {
1266 vmcs_writel(GUEST_CS_BASE, 0x000f0000); 1433 vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
1434 vmcs_writel(GUEST_CS_BASE, 0x000f0000);
1435 } else {
1436 vmcs_write16(GUEST_CS_SELECTOR, vmx->vcpu.sipi_vector << 8);
1437 vmcs_writel(GUEST_CS_BASE, vmx->vcpu.sipi_vector << 12);
1438 }
1267 vmcs_write32(GUEST_CS_LIMIT, 0xffff); 1439 vmcs_write32(GUEST_CS_LIMIT, 0xffff);
1268 vmcs_write32(GUEST_CS_AR_BYTES, 0x9b); 1440 vmcs_write32(GUEST_CS_AR_BYTES, 0x9b);
1269 1441
@@ -1288,7 +1460,10 @@ static int vmx_vcpu_setup(struct kvm_vcpu *vcpu)
1288 vmcs_writel(GUEST_SYSENTER_EIP, 0); 1460 vmcs_writel(GUEST_SYSENTER_EIP, 0);
1289 1461
1290 vmcs_writel(GUEST_RFLAGS, 0x02); 1462 vmcs_writel(GUEST_RFLAGS, 0x02);
1291 vmcs_writel(GUEST_RIP, 0xfff0); 1463 if (vmx->vcpu.vcpu_id == 0)
1464 vmcs_writel(GUEST_RIP, 0xfff0);
1465 else
1466 vmcs_writel(GUEST_RIP, 0);
1292 vmcs_writel(GUEST_RSP, 0); 1467 vmcs_writel(GUEST_RSP, 0);
1293 1468
1294 //todo: dr0 = dr1 = dr2 = dr3 = 0; dr6 = 0xffff0ff0 1469 //todo: dr0 = dr1 = dr2 = dr3 = 0; dr6 = 0xffff0ff0
@@ -1316,20 +1491,18 @@ static int vmx_vcpu_setup(struct kvm_vcpu *vcpu)
1316 vmcs_write64(GUEST_IA32_DEBUGCTL, 0); 1491 vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
1317 1492
1318 /* Control */ 1493 /* Control */
1319 vmcs_write32_fixedbits(MSR_IA32_VMX_PINBASED_CTLS, 1494 vmcs_write32(PIN_BASED_VM_EXEC_CONTROL,
1320 PIN_BASED_VM_EXEC_CONTROL, 1495 vmcs_config.pin_based_exec_ctrl);
1321 PIN_BASED_EXT_INTR_MASK /* 20.6.1 */ 1496
1322 | PIN_BASED_NMI_EXITING /* 20.6.1 */ 1497 exec_control = vmcs_config.cpu_based_exec_ctrl;
1323 ); 1498 if (!vm_need_tpr_shadow(vmx->vcpu.kvm)) {
1324 vmcs_write32_fixedbits(MSR_IA32_VMX_PROCBASED_CTLS, 1499 exec_control &= ~CPU_BASED_TPR_SHADOW;
1325 CPU_BASED_VM_EXEC_CONTROL, 1500#ifdef CONFIG_X86_64
1326 CPU_BASED_HLT_EXITING /* 20.6.2 */ 1501 exec_control |= CPU_BASED_CR8_STORE_EXITING |
1327 | CPU_BASED_CR8_LOAD_EXITING /* 20.6.2 */ 1502 CPU_BASED_CR8_LOAD_EXITING;
1328 | CPU_BASED_CR8_STORE_EXITING /* 20.6.2 */ 1503#endif
1329 | CPU_BASED_ACTIVATE_IO_BITMAP /* 20.6.2 */ 1504 }
1330 | CPU_BASED_MOV_DR_EXITING 1505 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control);
1331 | CPU_BASED_USE_TSC_OFFSETING /* 21.3 */
1332 );
1333 1506
1334 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0); 1507 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
1335 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0); 1508 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
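A standalone sketch (not part of the patch) of the execution-control choice made just above: with an in-kernel local APIC the TPR shadow is used and CR8 exits are unnecessary; without it, the TPR shadow bit is cleared and CR8 load/store exiting is turned back on. The bit values are the ones defined in vmx.h; the helper is illustrative.

#include <stdio.h>

#define CPU_BASED_CR8_LOAD_EXITING  0x00080000u
#define CPU_BASED_CR8_STORE_EXITING 0x00100000u
#define CPU_BASED_TPR_SHADOW        0x00200000u

static unsigned int pick_exec_control(unsigned int base, int need_tpr_shadow)
{
        unsigned int exec_control = base;

        if (!need_tpr_shadow) {
                exec_control &= ~CPU_BASED_TPR_SHADOW;
                exec_control |= CPU_BASED_CR8_STORE_EXITING |
                                CPU_BASED_CR8_LOAD_EXITING;
        }
        return exec_control;
}

int main(void)
{
        printf("%#x\n", pick_exec_control(CPU_BASED_TPR_SHADOW, 0));
        return 0;
}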
@@ -1377,46 +1550,48 @@ static int vmx_vcpu_setup(struct kvm_vcpu *vcpu)
1377 u32 index = vmx_msr_index[i]; 1550 u32 index = vmx_msr_index[i];
1378 u32 data_low, data_high; 1551 u32 data_low, data_high;
1379 u64 data; 1552 u64 data;
1380 int j = vcpu->nmsrs; 1553 int j = vmx->nmsrs;
1381 1554
1382 if (rdmsr_safe(index, &data_low, &data_high) < 0) 1555 if (rdmsr_safe(index, &data_low, &data_high) < 0)
1383 continue; 1556 continue;
1384 if (wrmsr_safe(index, data_low, data_high) < 0) 1557 if (wrmsr_safe(index, data_low, data_high) < 0)
1385 continue; 1558 continue;
1386 data = data_low | ((u64)data_high << 32); 1559 data = data_low | ((u64)data_high << 32);
1387 vcpu->host_msrs[j].index = index; 1560 vmx->host_msrs[j].index = index;
1388 vcpu->host_msrs[j].reserved = 0; 1561 vmx->host_msrs[j].reserved = 0;
1389 vcpu->host_msrs[j].data = data; 1562 vmx->host_msrs[j].data = data;
1390 vcpu->guest_msrs[j] = vcpu->host_msrs[j]; 1563 vmx->guest_msrs[j] = vmx->host_msrs[j];
1391 ++vcpu->nmsrs; 1564 ++vmx->nmsrs;
1392 } 1565 }
1393 1566
1394 setup_msrs(vcpu); 1567 setup_msrs(vmx);
1395 1568
1396 vmcs_write32_fixedbits(MSR_IA32_VMX_EXIT_CTLS, VM_EXIT_CONTROLS, 1569 vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl);
1397 (HOST_IS_64 << 9)); /* 22.2,1, 20.7.1 */
1398 1570
1399 /* 22.2.1, 20.8.1 */ 1571 /* 22.2.1, 20.8.1 */
1400 vmcs_write32_fixedbits(MSR_IA32_VMX_ENTRY_CTLS, 1572 vmcs_write32(VM_ENTRY_CONTROLS, vmcs_config.vmentry_ctrl);
1401 VM_ENTRY_CONTROLS, 0); 1573
1402 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); /* 22.2.1 */ 1574 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); /* 22.2.1 */
1403 1575
1404#ifdef CONFIG_X86_64 1576#ifdef CONFIG_X86_64
1405 vmcs_writel(VIRTUAL_APIC_PAGE_ADDR, 0); 1577 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
1406 vmcs_writel(TPR_THRESHOLD, 0); 1578 if (vm_need_tpr_shadow(vmx->vcpu.kvm))
1579 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
1580 page_to_phys(vmx->vcpu.apic->regs_page));
1581 vmcs_write32(TPR_THRESHOLD, 0);
1407#endif 1582#endif
1408 1583
1409 vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL); 1584 vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL);
1410 vmcs_writel(CR4_GUEST_HOST_MASK, KVM_GUEST_CR4_MASK); 1585 vmcs_writel(CR4_GUEST_HOST_MASK, KVM_GUEST_CR4_MASK);
1411 1586
1412 vcpu->cr0 = 0x60000010; 1587 vmx->vcpu.cr0 = 0x60000010;
1413 vmx_set_cr0(vcpu, vcpu->cr0); // enter rmode 1588 vmx_set_cr0(&vmx->vcpu, vmx->vcpu.cr0); // enter rmode
1414 vmx_set_cr4(vcpu, 0); 1589 vmx_set_cr4(&vmx->vcpu, 0);
1415#ifdef CONFIG_X86_64 1590#ifdef CONFIG_X86_64
1416 vmx_set_efer(vcpu, 0); 1591 vmx_set_efer(&vmx->vcpu, 0);
1417#endif 1592#endif
1418 vmx_fpu_activate(vcpu); 1593 vmx_fpu_activate(&vmx->vcpu);
1419 update_exception_bitmap(vcpu); 1594 update_exception_bitmap(&vmx->vcpu);
1420 1595
1421 return 0; 1596 return 0;
1422 1597
@@ -1424,6 +1599,13 @@ out:
1424 return ret; 1599 return ret;
1425} 1600}
1426 1601
1602static void vmx_vcpu_reset(struct kvm_vcpu *vcpu)
1603{
1604 struct vcpu_vmx *vmx = to_vmx(vcpu);
1605
1606 vmx_vcpu_setup(vmx);
1607}
1608
1427static void inject_rmode_irq(struct kvm_vcpu *vcpu, int irq) 1609static void inject_rmode_irq(struct kvm_vcpu *vcpu, int irq)
1428{ 1610{
1429 u16 ent[2]; 1611 u16 ent[2];
@@ -1443,8 +1625,8 @@ static void inject_rmode_irq(struct kvm_vcpu *vcpu, int irq)
1443 return; 1625 return;
1444 } 1626 }
1445 1627
1446 if (kvm_read_guest(vcpu, irq * sizeof(ent), sizeof(ent), &ent) != 1628 if (emulator_read_std(irq * sizeof(ent), &ent, sizeof(ent), vcpu) !=
1447 sizeof(ent)) { 1629 X86EMUL_CONTINUE) {
1448 vcpu_printf(vcpu, "%s: read guest err\n", __FUNCTION__); 1630 vcpu_printf(vcpu, "%s: read guest err\n", __FUNCTION__);
1449 return; 1631 return;
1450 } 1632 }
@@ -1454,9 +1636,9 @@ static void inject_rmode_irq(struct kvm_vcpu *vcpu, int irq)
1454 ip = vmcs_readl(GUEST_RIP); 1636 ip = vmcs_readl(GUEST_RIP);
1455 1637
1456 1638
1457 if (kvm_write_guest(vcpu, ss_base + sp - 2, 2, &flags) != 2 || 1639 if (emulator_write_emulated(ss_base + sp - 2, &flags, 2, vcpu) != X86EMUL_CONTINUE ||
1458 kvm_write_guest(vcpu, ss_base + sp - 4, 2, &cs) != 2 || 1640 emulator_write_emulated(ss_base + sp - 4, &cs, 2, vcpu) != X86EMUL_CONTINUE ||
1459 kvm_write_guest(vcpu, ss_base + sp - 6, 2, &ip) != 2) { 1641 emulator_write_emulated(ss_base + sp - 6, &ip, 2, vcpu) != X86EMUL_CONTINUE) {
1460 vcpu_printf(vcpu, "%s: write guest err\n", __FUNCTION__); 1642 vcpu_printf(vcpu, "%s: write guest err\n", __FUNCTION__);
1461 return; 1643 return;
1462 } 1644 }
@@ -1469,6 +1651,16 @@ static void inject_rmode_irq(struct kvm_vcpu *vcpu, int irq)
1469 vmcs_writel(GUEST_RSP, (vmcs_readl(GUEST_RSP) & ~0xffff) | (sp - 6)); 1651 vmcs_writel(GUEST_RSP, (vmcs_readl(GUEST_RSP) & ~0xffff) | (sp - 6));
1470} 1652}
1471 1653
1654static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq)
1655{
1656 if (vcpu->rmode.active) {
1657 inject_rmode_irq(vcpu, irq);
1658 return;
1659 }
1660 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
1661 irq | INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK);
1662}
1663
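A standalone sketch (not part of the patch) of how vmx_inject_irq() composes the VM-entry interruption-information word: vector in bits 7:0, event type in bits 10:8 (0 for an external interrupt), valid flag in bit 31.

#include <stdio.h>

#define INTR_TYPE_EXT_INTR   (0u << 8)
#define INTR_INFO_VALID_MASK 0x80000000u

int main(void)
{
        int irq = 0x30;
        unsigned int field = irq | INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK;

        printf("VM_ENTRY_INTR_INFO_FIELD = %#x\n", field);
        return 0;
}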
1472static void kvm_do_inject_irq(struct kvm_vcpu *vcpu) 1664static void kvm_do_inject_irq(struct kvm_vcpu *vcpu)
1473{ 1665{
1474 int word_index = __ffs(vcpu->irq_summary); 1666 int word_index = __ffs(vcpu->irq_summary);
@@ -1478,13 +1670,7 @@ static void kvm_do_inject_irq(struct kvm_vcpu *vcpu)
1478 clear_bit(bit_index, &vcpu->irq_pending[word_index]); 1670 clear_bit(bit_index, &vcpu->irq_pending[word_index]);
1479 if (!vcpu->irq_pending[word_index]) 1671 if (!vcpu->irq_pending[word_index])
1480 clear_bit(word_index, &vcpu->irq_summary); 1672 clear_bit(word_index, &vcpu->irq_summary);
1481 1673 vmx_inject_irq(vcpu, irq);
1482 if (vcpu->rmode.active) {
1483 inject_rmode_irq(vcpu, irq);
1484 return;
1485 }
1486 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
1487 irq | INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK);
1488} 1674}
1489 1675
1490 1676
@@ -1568,7 +1754,7 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1568 "intr info 0x%x\n", __FUNCTION__, vect_info, intr_info); 1754 "intr info 0x%x\n", __FUNCTION__, vect_info, intr_info);
1569 } 1755 }
1570 1756
1571 if (is_external_interrupt(vect_info)) { 1757 if (!irqchip_in_kernel(vcpu->kvm) && is_external_interrupt(vect_info)) {
1572 int irq = vect_info & VECTORING_INFO_VECTOR_MASK; 1758 int irq = vect_info & VECTORING_INFO_VECTOR_MASK;
1573 set_bit(irq, vcpu->irq_pending); 1759 set_bit(irq, vcpu->irq_pending);
1574 set_bit(irq / BITS_PER_LONG, &vcpu->irq_summary); 1760 set_bit(irq / BITS_PER_LONG, &vcpu->irq_summary);
@@ -1591,29 +1777,28 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1591 if (is_page_fault(intr_info)) { 1777 if (is_page_fault(intr_info)) {
1592 cr2 = vmcs_readl(EXIT_QUALIFICATION); 1778 cr2 = vmcs_readl(EXIT_QUALIFICATION);
1593 1779
1594 spin_lock(&vcpu->kvm->lock); 1780 mutex_lock(&vcpu->kvm->lock);
1595 r = kvm_mmu_page_fault(vcpu, cr2, error_code); 1781 r = kvm_mmu_page_fault(vcpu, cr2, error_code);
1596 if (r < 0) { 1782 if (r < 0) {
1597 spin_unlock(&vcpu->kvm->lock); 1783 mutex_unlock(&vcpu->kvm->lock);
1598 return r; 1784 return r;
1599 } 1785 }
1600 if (!r) { 1786 if (!r) {
1601 spin_unlock(&vcpu->kvm->lock); 1787 mutex_unlock(&vcpu->kvm->lock);
1602 return 1; 1788 return 1;
1603 } 1789 }
1604 1790
1605 er = emulate_instruction(vcpu, kvm_run, cr2, error_code); 1791 er = emulate_instruction(vcpu, kvm_run, cr2, error_code);
1606 spin_unlock(&vcpu->kvm->lock); 1792 mutex_unlock(&vcpu->kvm->lock);
1607 1793
1608 switch (er) { 1794 switch (er) {
1609 case EMULATE_DONE: 1795 case EMULATE_DONE:
1610 return 1; 1796 return 1;
1611 case EMULATE_DO_MMIO: 1797 case EMULATE_DO_MMIO:
1612 ++vcpu->stat.mmio_exits; 1798 ++vcpu->stat.mmio_exits;
1613 kvm_run->exit_reason = KVM_EXIT_MMIO;
1614 return 0; 1799 return 0;
1615 case EMULATE_FAIL: 1800 case EMULATE_FAIL:
1616 vcpu_printf(vcpu, "%s: emulate fail\n", __FUNCTION__); 1801 kvm_report_emulation_failure(vcpu, "pagetable");
1617 break; 1802 break;
1618 default: 1803 default:
1619 BUG(); 1804 BUG();
@@ -1653,80 +1838,29 @@ static int handle_triple_fault(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1653 return 0; 1838 return 0;
1654} 1839}
1655 1840
1656static int get_io_count(struct kvm_vcpu *vcpu, unsigned long *count)
1657{
1658 u64 inst;
1659 gva_t rip;
1660 int countr_size;
1661 int i, n;
1662
1663 if ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_VM)) {
1664 countr_size = 2;
1665 } else {
1666 u32 cs_ar = vmcs_read32(GUEST_CS_AR_BYTES);
1667
1668 countr_size = (cs_ar & AR_L_MASK) ? 8:
1669 (cs_ar & AR_DB_MASK) ? 4: 2;
1670 }
1671
1672 rip = vmcs_readl(GUEST_RIP);
1673 if (countr_size != 8)
1674 rip += vmcs_readl(GUEST_CS_BASE);
1675
1676 n = kvm_read_guest(vcpu, rip, sizeof(inst), &inst);
1677
1678 for (i = 0; i < n; i++) {
1679 switch (((u8*)&inst)[i]) {
1680 case 0xf0:
1681 case 0xf2:
1682 case 0xf3:
1683 case 0x2e:
1684 case 0x36:
1685 case 0x3e:
1686 case 0x26:
1687 case 0x64:
1688 case 0x65:
1689 case 0x66:
1690 break;
1691 case 0x67:
1692 countr_size = (countr_size == 2) ? 4: (countr_size >> 1);
1693 default:
1694 goto done;
1695 }
1696 }
1697 return 0;
1698done:
1699 countr_size *= 8;
1700 *count = vcpu->regs[VCPU_REGS_RCX] & (~0ULL >> (64 - countr_size));
1701 //printk("cx: %lx\n", vcpu->regs[VCPU_REGS_RCX]);
1702 return 1;
1703}
1704
1705static int handle_io(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 1841static int handle_io(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1706{ 1842{
1707 u64 exit_qualification; 1843 unsigned long exit_qualification;
1708 int size, down, in, string, rep; 1844 int size, down, in, string, rep;
1709 unsigned port; 1845 unsigned port;
1710 unsigned long count;
1711 gva_t address;
1712 1846
1713 ++vcpu->stat.io_exits; 1847 ++vcpu->stat.io_exits;
1714 exit_qualification = vmcs_read64(EXIT_QUALIFICATION); 1848 exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
1715 in = (exit_qualification & 8) != 0;
1716 size = (exit_qualification & 7) + 1;
1717 string = (exit_qualification & 16) != 0; 1849 string = (exit_qualification & 16) != 0;
1850
1851 if (string) {
1852 if (emulate_instruction(vcpu, kvm_run, 0, 0) == EMULATE_DO_MMIO)
1853 return 0;
1854 return 1;
1855 }
1856
1857 size = (exit_qualification & 7) + 1;
1858 in = (exit_qualification & 8) != 0;
1718 down = (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_DF) != 0; 1859 down = (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_DF) != 0;
1719 count = 1;
1720 rep = (exit_qualification & 32) != 0; 1860 rep = (exit_qualification & 32) != 0;
1721 port = exit_qualification >> 16; 1861 port = exit_qualification >> 16;
1722 address = 0; 1862
1723 if (string) { 1863 return kvm_emulate_pio(vcpu, kvm_run, in, size, port);
1724 if (rep && !get_io_count(vcpu, &count))
1725 return 1;
1726 address = vmcs_readl(GUEST_LINEAR_ADDRESS);
1727 }
1728 return kvm_setup_pio(vcpu, kvm_run, in, size, count, string, down,
1729 address, rep, port);
1730} 1864}
1731 1865
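A standalone sketch (not part of the patch) of the exit-qualification decode performed by the new handle_io(): bits 2:0 give the access size minus one, bit 3 the direction (IN when set), bit 4 the string flag, bit 5 REP, and bits 31:16 the port number.

#include <stdio.h>

int main(void)
{
        unsigned long q = (0x3f8ul << 16) | 0x8;   /* IN, 1 byte, port 0x3f8 */

        unsigned size = (q & 7) + 1;
        int in        = (q & 8) != 0;
        int string    = (q & 16) != 0;
        int rep       = (q & 32) != 0;
        unsigned port = q >> 16;

        printf("%s port %#x, size %u, string=%d rep=%d\n",
               in ? "in" : "out", port, size, string, rep);
        return 0;
}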
1732static void 1866static void
@@ -1743,11 +1877,11 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
1743 1877
1744static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 1878static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1745{ 1879{
1746 u64 exit_qualification; 1880 unsigned long exit_qualification;
1747 int cr; 1881 int cr;
1748 int reg; 1882 int reg;
1749 1883
1750 exit_qualification = vmcs_read64(EXIT_QUALIFICATION); 1884 exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
1751 cr = exit_qualification & 15; 1885 cr = exit_qualification & 15;
1752 reg = (exit_qualification >> 8) & 15; 1886 reg = (exit_qualification >> 8) & 15;
1753 switch ((exit_qualification >> 4) & 3) { 1887 switch ((exit_qualification >> 4) & 3) {
@@ -1772,13 +1906,14 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1772 vcpu_load_rsp_rip(vcpu); 1906 vcpu_load_rsp_rip(vcpu);
1773 set_cr8(vcpu, vcpu->regs[reg]); 1907 set_cr8(vcpu, vcpu->regs[reg]);
1774 skip_emulated_instruction(vcpu); 1908 skip_emulated_instruction(vcpu);
1775 return 1; 1909 kvm_run->exit_reason = KVM_EXIT_SET_TPR;
1910 return 0;
1776 }; 1911 };
1777 break; 1912 break;
1778 case 2: /* clts */ 1913 case 2: /* clts */
1779 vcpu_load_rsp_rip(vcpu); 1914 vcpu_load_rsp_rip(vcpu);
1780 vmx_fpu_deactivate(vcpu); 1915 vmx_fpu_deactivate(vcpu);
1781 vcpu->cr0 &= ~CR0_TS_MASK; 1916 vcpu->cr0 &= ~X86_CR0_TS;
1782 vmcs_writel(CR0_READ_SHADOW, vcpu->cr0); 1917 vmcs_writel(CR0_READ_SHADOW, vcpu->cr0);
1783 vmx_fpu_activate(vcpu); 1918 vmx_fpu_activate(vcpu);
1784 skip_emulated_instruction(vcpu); 1919 skip_emulated_instruction(vcpu);
@@ -1793,7 +1928,7 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1793 return 1; 1928 return 1;
1794 case 8: 1929 case 8:
1795 vcpu_load_rsp_rip(vcpu); 1930 vcpu_load_rsp_rip(vcpu);
1796 vcpu->regs[reg] = vcpu->cr8; 1931 vcpu->regs[reg] = get_cr8(vcpu);
1797 vcpu_put_rsp_rip(vcpu); 1932 vcpu_put_rsp_rip(vcpu);
1798 skip_emulated_instruction(vcpu); 1933 skip_emulated_instruction(vcpu);
1799 return 1; 1934 return 1;
@@ -1808,14 +1943,14 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1808 break; 1943 break;
1809 } 1944 }
1810 kvm_run->exit_reason = 0; 1945 kvm_run->exit_reason = 0;
1811 printk(KERN_ERR "kvm: unhandled control register: op %d cr %d\n", 1946 pr_unimpl(vcpu, "unhandled control register: op %d cr %d\n",
1812 (int)(exit_qualification >> 4) & 3, cr); 1947 (int)(exit_qualification >> 4) & 3, cr);
1813 return 0; 1948 return 0;
1814} 1949}
1815 1950
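A standalone sketch (not part of the patch) of how handle_cr() extracts the control register, the access type and the general-purpose register from the exit qualification: bits 3:0, bits 5:4 and bits 11:8 respectively.

#include <stdio.h>

int main(void)
{
        unsigned long q = (1ul << 8) | (0ul << 4) | 8;   /* mov %rcx, %cr8 */

        int cr   = q & 15;
        int type = (q >> 4) & 3;   /* 0 mov-to-cr, 1 mov-from-cr, 2 clts, 3 lmsw */
        int reg  = (q >> 8) & 15;

        printf("cr%d, access type %d, reg %d\n", cr, type, reg);
        return 0;
}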
1816static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 1951static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1817{ 1952{
1818 u64 exit_qualification; 1953 unsigned long exit_qualification;
1819 unsigned long val; 1954 unsigned long val;
1820 int dr, reg; 1955 int dr, reg;
1821 1956
@@ -1823,7 +1958,7 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1823 * FIXME: this code assumes the host is debugging the guest. 1958 * FIXME: this code assumes the host is debugging the guest.
1824 * need to deal with guest debugging itself too. 1959 * need to deal with guest debugging itself too.
1825 */ 1960 */
1826 exit_qualification = vmcs_read64(EXIT_QUALIFICATION); 1961 exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
1827 dr = exit_qualification & 7; 1962 dr = exit_qualification & 7;
1828 reg = (exit_qualification >> 8) & 15; 1963 reg = (exit_qualification >> 8) & 15;
1829 vcpu_load_rsp_rip(vcpu); 1964 vcpu_load_rsp_rip(vcpu);
@@ -1886,19 +2021,21 @@ static int handle_wrmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1886 return 1; 2021 return 1;
1887} 2022}
1888 2023
1889static void post_kvm_run_save(struct kvm_vcpu *vcpu, 2024static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu,
1890 struct kvm_run *kvm_run) 2025 struct kvm_run *kvm_run)
1891{ 2026{
1892 kvm_run->if_flag = (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) != 0; 2027 return 1;
1893 kvm_run->cr8 = vcpu->cr8;
1894 kvm_run->apic_base = vcpu->apic_base;
1895 kvm_run->ready_for_interrupt_injection = (vcpu->interrupt_window_open &&
1896 vcpu->irq_summary == 0);
1897} 2028}
1898 2029
1899static int handle_interrupt_window(struct kvm_vcpu *vcpu, 2030static int handle_interrupt_window(struct kvm_vcpu *vcpu,
1900 struct kvm_run *kvm_run) 2031 struct kvm_run *kvm_run)
1901{ 2032{
2033 u32 cpu_based_vm_exec_control;
2034
2035 /* clear pending irq */
2036 cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
2037 cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
2038 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
1902 /* 2039 /*
1903 * If the user space waits to inject interrupts, exit as soon as 2040 * If the user space waits to inject interrupts, exit as soon as
1904 * possible 2041 * possible
@@ -1943,6 +2080,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu,
1943 [EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window, 2080 [EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window,
1944 [EXIT_REASON_HLT] = handle_halt, 2081 [EXIT_REASON_HLT] = handle_halt,
1945 [EXIT_REASON_VMCALL] = handle_vmcall, 2082 [EXIT_REASON_VMCALL] = handle_vmcall,
2083 [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold
1946}; 2084};
1947 2085
1948static const int kvm_vmx_max_exit_handlers = 2086static const int kvm_vmx_max_exit_handlers =
@@ -1956,6 +2094,14 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
1956{ 2094{
1957 u32 vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); 2095 u32 vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
1958 u32 exit_reason = vmcs_read32(VM_EXIT_REASON); 2096 u32 exit_reason = vmcs_read32(VM_EXIT_REASON);
2097 struct vcpu_vmx *vmx = to_vmx(vcpu);
2098
2099 if (unlikely(vmx->fail)) {
2100 kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
2101 kvm_run->fail_entry.hardware_entry_failure_reason
2102 = vmcs_read32(VM_INSTRUCTION_ERROR);
2103 return 0;
2104 }
1959 2105
1960 if ( (vectoring_info & VECTORING_INFO_VALID_MASK) && 2106 if ( (vectoring_info & VECTORING_INFO_VALID_MASK) &&
1961 exit_reason != EXIT_REASON_EXCEPTION_NMI ) 2107 exit_reason != EXIT_REASON_EXCEPTION_NMI )
@@ -1971,57 +2117,91 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
1971 return 0; 2117 return 0;
1972} 2118}
1973 2119
1974/* 2120static void vmx_flush_tlb(struct kvm_vcpu *vcpu)
1975 * Check if userspace requested an interrupt window, and that the
1976 * interrupt window is open.
1977 *
1978 * No need to exit to userspace if we already have an interrupt queued.
1979 */
1980static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu,
1981 struct kvm_run *kvm_run)
1982{ 2121{
1983 return (!vcpu->irq_summary &&
1984 kvm_run->request_interrupt_window &&
1985 vcpu->interrupt_window_open &&
1986 (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF));
1987} 2122}
1988 2123
1989static void vmx_flush_tlb(struct kvm_vcpu *vcpu) 2124static void update_tpr_threshold(struct kvm_vcpu *vcpu)
1990{ 2125{
2126 int max_irr, tpr;
2127
2128 if (!vm_need_tpr_shadow(vcpu->kvm))
2129 return;
2130
2131 if (!kvm_lapic_enabled(vcpu) ||
2132 ((max_irr = kvm_lapic_find_highest_irr(vcpu)) == -1)) {
2133 vmcs_write32(TPR_THRESHOLD, 0);
2134 return;
2135 }
2136
2137 tpr = (kvm_lapic_get_cr8(vcpu) & 0x0f) << 4;
2138 vmcs_write32(TPR_THRESHOLD, (max_irr > tpr) ? tpr >> 4 : max_irr >> 4);
1991} 2139}
1992 2140
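A standalone sketch (not part of the patch) of the update_tpr_threshold() arithmetic just above: CR8 holds the task-priority class in its low four bits, and the threshold written to the VMCS is the smaller of that class and the class of the highest pending interrupt, so the guest traps exactly when lowering its TPR could let a pending interrupt through.

#include <stdio.h>

int main(void)
{
        int cr8 = 0x5;        /* guest task-priority class 5 */
        int max_irr = 0x62;   /* highest pending vector */

        int tpr = (cr8 & 0x0f) << 4;
        int threshold = (max_irr > tpr) ? (tpr >> 4) : (max_irr >> 4);

        printf("TPR_THRESHOLD = %d\n", threshold);
        return 0;
}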
1993static int vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 2141static void enable_irq_window(struct kvm_vcpu *vcpu)
1994{ 2142{
1995 u8 fail; 2143 u32 cpu_based_vm_exec_control;
1996 int r;
1997 2144
1998preempted: 2145 cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
1999 if (vcpu->guest_debug.enabled) 2146 cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING;
2000 kvm_guest_debug_pre(vcpu); 2147 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
2148}
2001 2149
2002again: 2150static void vmx_intr_assist(struct kvm_vcpu *vcpu)
2003 if (!vcpu->mmio_read_completed) 2151{
2004 do_interrupt_requests(vcpu, kvm_run); 2152 u32 idtv_info_field, intr_info_field;
2153 int has_ext_irq, interrupt_window_open;
2154 int vector;
2005 2155
2006 vmx_save_host_state(vcpu); 2156 kvm_inject_pending_timer_irqs(vcpu);
2007 kvm_load_guest_fpu(vcpu); 2157 update_tpr_threshold(vcpu);
2008 2158
2009 r = kvm_mmu_reload(vcpu); 2159 has_ext_irq = kvm_cpu_has_interrupt(vcpu);
2010 if (unlikely(r)) 2160 intr_info_field = vmcs_read32(VM_ENTRY_INTR_INFO_FIELD);
2011 goto out; 2161 idtv_info_field = vmcs_read32(IDT_VECTORING_INFO_FIELD);
2162 if (intr_info_field & INTR_INFO_VALID_MASK) {
2163 if (idtv_info_field & INTR_INFO_VALID_MASK) {
2164 /* TODO: fault when IDT_Vectoring */
2165 printk(KERN_ERR "Fault when IDT_Vectoring\n");
2166 }
2167 if (has_ext_irq)
2168 enable_irq_window(vcpu);
2169 return;
2170 }
2171 if (unlikely(idtv_info_field & INTR_INFO_VALID_MASK)) {
2172 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, idtv_info_field);
2173 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
2174 vmcs_read32(VM_EXIT_INSTRUCTION_LEN));
2175
2176 if (unlikely(idtv_info_field & INTR_INFO_DELIEVER_CODE_MASK))
2177 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
2178 vmcs_read32(IDT_VECTORING_ERROR_CODE));
2179 if (unlikely(has_ext_irq))
2180 enable_irq_window(vcpu);
2181 return;
2182 }
2183 if (!has_ext_irq)
2184 return;
2185 interrupt_window_open =
2186 ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
2187 (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0);
2188 if (interrupt_window_open) {
2189 vector = kvm_cpu_get_interrupt(vcpu);
2190 vmx_inject_irq(vcpu, vector);
2191 kvm_timer_intr_post(vcpu, vector);
2192 } else
2193 enable_irq_window(vcpu);
2194}
2195
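A standalone sketch (not part of the patch) of the "window open" test used by vmx_intr_assist() above: an interrupt is injected only when IF is set in the guest's RFLAGS and the low two interruptibility-state bits (blocking by STI / by MOV SS) are clear; otherwise CPU_BASED_VIRTUAL_INTR_PENDING is set so the guest exits as soon as the window opens.

#include <stdio.h>

#define X86_EFLAGS_IF 0x200ul

int main(void)
{
        unsigned long rflags = 0x202;        /* IF set */
        unsigned int interruptibility = 0;   /* no STI/MOV-SS blocking */

        int window_open = (rflags & X86_EFLAGS_IF) &&
                          (interruptibility & 3) == 0;

        puts(window_open ? "inject now" : "enable irq-window exiting");
        return 0;
}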
2196static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2197{
2198 struct vcpu_vmx *vmx = to_vmx(vcpu);
2012 2199
2013 /* 2200 /*
2014 * Loading guest fpu may have cleared host cr0.ts 2201 * Loading guest fpu may have cleared host cr0.ts
2015 */ 2202 */
2016 vmcs_writel(HOST_CR0, read_cr0()); 2203 vmcs_writel(HOST_CR0, read_cr0());
2017 2204
2018 local_irq_disable();
2019
2020 vcpu->guest_mode = 1;
2021 if (vcpu->requests)
2022 if (test_and_clear_bit(KVM_TLB_FLUSH, &vcpu->requests))
2023 vmx_flush_tlb(vcpu);
2024
2025 asm ( 2205 asm (
2026 /* Store host registers */ 2206 /* Store host registers */
2027#ifdef CONFIG_X86_64 2207#ifdef CONFIG_X86_64
@@ -2115,8 +2295,8 @@ again:
2115 "pop %%ecx; popa \n\t" 2295 "pop %%ecx; popa \n\t"
2116#endif 2296#endif
2117 "setbe %0 \n\t" 2297 "setbe %0 \n\t"
2118 : "=q" (fail) 2298 : "=q" (vmx->fail)
2119 : "r"(vcpu->launched), "d"((unsigned long)HOST_RSP), 2299 : "r"(vmx->launched), "d"((unsigned long)HOST_RSP),
2120 "c"(vcpu), 2300 "c"(vcpu),
2121 [rax]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RAX])), 2301 [rax]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RAX])),
2122 [rbx]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RBX])), 2302 [rbx]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RBX])),
@@ -2138,59 +2318,10 @@ again:
2138 [cr2]"i"(offsetof(struct kvm_vcpu, cr2)) 2318 [cr2]"i"(offsetof(struct kvm_vcpu, cr2))
2139 : "cc", "memory" ); 2319 : "cc", "memory" );
2140 2320
2141 vcpu->guest_mode = 0;
2142 local_irq_enable();
2143
2144 ++vcpu->stat.exits;
2145
2146 vcpu->interrupt_window_open = (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0; 2321 vcpu->interrupt_window_open = (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0;
2147 2322
2148 asm ("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS)); 2323 asm ("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS));
2149 2324 vmx->launched = 1;
2150 if (unlikely(fail)) {
2151 kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
2152 kvm_run->fail_entry.hardware_entry_failure_reason
2153 = vmcs_read32(VM_INSTRUCTION_ERROR);
2154 r = 0;
2155 goto out;
2156 }
2157 /*
2158 * Profile KVM exit RIPs:
2159 */
2160 if (unlikely(prof_on == KVM_PROFILING))
2161 profile_hit(KVM_PROFILING, (void *)vmcs_readl(GUEST_RIP));
2162
2163 vcpu->launched = 1;
2164 r = kvm_handle_exit(kvm_run, vcpu);
2165 if (r > 0) {
2166 /* Give scheduler a change to reschedule. */
2167 if (signal_pending(current)) {
2168 r = -EINTR;
2169 kvm_run->exit_reason = KVM_EXIT_INTR;
2170 ++vcpu->stat.signal_exits;
2171 goto out;
2172 }
2173
2174 if (dm_request_for_irq_injection(vcpu, kvm_run)) {
2175 r = -EINTR;
2176 kvm_run->exit_reason = KVM_EXIT_INTR;
2177 ++vcpu->stat.request_irq_exits;
2178 goto out;
2179 }
2180 if (!need_resched()) {
2181 ++vcpu->stat.light_exits;
2182 goto again;
2183 }
2184 }
2185
2186out:
2187 if (r > 0) {
2188 kvm_resched(vcpu);
2189 goto preempted;
2190 }
2191
2192 post_kvm_run_save(vcpu, kvm_run);
2193 return r;
2194} 2325}
2195 2326
2196static void vmx_inject_page_fault(struct kvm_vcpu *vcpu, 2327static void vmx_inject_page_fault(struct kvm_vcpu *vcpu,
@@ -2225,67 +2356,118 @@ static void vmx_inject_page_fault(struct kvm_vcpu *vcpu,
2225 2356
2226static void vmx_free_vmcs(struct kvm_vcpu *vcpu) 2357static void vmx_free_vmcs(struct kvm_vcpu *vcpu)
2227{ 2358{
2228 if (vcpu->vmcs) { 2359 struct vcpu_vmx *vmx = to_vmx(vcpu);
2229 on_each_cpu(__vcpu_clear, vcpu, 0, 1); 2360
2230 free_vmcs(vcpu->vmcs); 2361 if (vmx->vmcs) {
2231 vcpu->vmcs = NULL; 2362 on_each_cpu(__vcpu_clear, vmx, 0, 1);
2363 free_vmcs(vmx->vmcs);
2364 vmx->vmcs = NULL;
2232 } 2365 }
2233} 2366}
2234 2367
2235static void vmx_free_vcpu(struct kvm_vcpu *vcpu) 2368static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
2236{ 2369{
2370 struct vcpu_vmx *vmx = to_vmx(vcpu);
2371
2237 vmx_free_vmcs(vcpu); 2372 vmx_free_vmcs(vcpu);
2373 kfree(vmx->host_msrs);
2374 kfree(vmx->guest_msrs);
2375 kvm_vcpu_uninit(vcpu);
2376 kmem_cache_free(kvm_vcpu_cache, vmx);
2238} 2377}
2239 2378
2240static int vmx_create_vcpu(struct kvm_vcpu *vcpu) 2379static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
2241{ 2380{
2242 struct vmcs *vmcs; 2381 int err;
2382 struct vcpu_vmx *vmx = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
2383 int cpu;
2243 2384
2244 vcpu->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL); 2385 if (!vmx)
2245 if (!vcpu->guest_msrs) 2386 return ERR_PTR(-ENOMEM);
2246 return -ENOMEM;
2247 2387
2248 vcpu->host_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL); 2388 err = kvm_vcpu_init(&vmx->vcpu, kvm, id);
2249 if (!vcpu->host_msrs) 2389 if (err)
2250 goto out_free_guest_msrs; 2390 goto free_vcpu;
2251 2391
2252 vmcs = alloc_vmcs(); 2392 if (irqchip_in_kernel(kvm)) {
2253 if (!vmcs) 2393 err = kvm_create_lapic(&vmx->vcpu);
2254 goto out_free_msrs; 2394 if (err < 0)
2395 goto free_vcpu;
2396 }
2255 2397
2256 vmcs_clear(vmcs); 2398 vmx->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL);
2257 vcpu->vmcs = vmcs; 2399 if (!vmx->guest_msrs) {
2258 vcpu->launched = 0; 2400 err = -ENOMEM;
2401 goto uninit_vcpu;
2402 }
2259 2403
2260 return 0; 2404 vmx->host_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL);
2405 if (!vmx->host_msrs)
2406 goto free_guest_msrs;
2261 2407
2262out_free_msrs: 2408 vmx->vmcs = alloc_vmcs();
2263 kfree(vcpu->host_msrs); 2409 if (!vmx->vmcs)
2264 vcpu->host_msrs = NULL; 2410 goto free_msrs;
2265 2411
2266out_free_guest_msrs: 2412 vmcs_clear(vmx->vmcs);
2267 kfree(vcpu->guest_msrs);
2268 vcpu->guest_msrs = NULL;
2269 2413
2270 return -ENOMEM; 2414 cpu = get_cpu();
2415 vmx_vcpu_load(&vmx->vcpu, cpu);
2416 err = vmx_vcpu_setup(vmx);
2417 vmx_vcpu_put(&vmx->vcpu);
2418 put_cpu();
2419 if (err)
2420 goto free_vmcs;
2421
2422 return &vmx->vcpu;
2423
2424free_vmcs:
2425 free_vmcs(vmx->vmcs);
2426free_msrs:
2427 kfree(vmx->host_msrs);
2428free_guest_msrs:
2429 kfree(vmx->guest_msrs);
2430uninit_vcpu:
2431 kvm_vcpu_uninit(&vmx->vcpu);
2432free_vcpu:
2433 kmem_cache_free(kvm_vcpu_cache, vmx);
2434 return ERR_PTR(err);
2435}
2436
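A standalone sketch (not part of the patch) of the unwind pattern the new vmx_create_vcpu() uses above: each allocation gets a label, and a failure jumps to the label that releases everything acquired so far, in reverse order. The widget type is purely illustrative.

#include <stdlib.h>
#include <stdio.h>

struct widget { int *buf; };

static struct widget *widget_create(void)
{
        struct widget *w = malloc(sizeof(*w));
        if (!w)
                goto fail;
        w->buf = malloc(64 * sizeof(int));
        if (!w->buf)
                goto free_widget;
        return w;                    /* success: caller owns w and w->buf */

free_widget:
        free(w);
fail:
        return NULL;
}

int main(void)
{
        struct widget *w = widget_create();
        printf("%s\n", w ? "created" : "failed");
        if (w) { free(w->buf); free(w); }
        return 0;
}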
2437static void __init vmx_check_processor_compat(void *rtn)
2438{
2439 struct vmcs_config vmcs_conf;
2440
2441 *(int *)rtn = 0;
2442 if (setup_vmcs_config(&vmcs_conf) < 0)
2443 *(int *)rtn = -EIO;
2444 if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config)) != 0) {
2445 printk(KERN_ERR "kvm: CPU %d feature inconsistency!\n",
2446 smp_processor_id());
2447 *(int *)rtn = -EIO;
2448 }
2271} 2449}
2272 2450
2273static struct kvm_arch_ops vmx_arch_ops = { 2451static struct kvm_x86_ops vmx_x86_ops = {
2274 .cpu_has_kvm_support = cpu_has_kvm_support, 2452 .cpu_has_kvm_support = cpu_has_kvm_support,
2275 .disabled_by_bios = vmx_disabled_by_bios, 2453 .disabled_by_bios = vmx_disabled_by_bios,
2276 .hardware_setup = hardware_setup, 2454 .hardware_setup = hardware_setup,
2277 .hardware_unsetup = hardware_unsetup, 2455 .hardware_unsetup = hardware_unsetup,
2456 .check_processor_compatibility = vmx_check_processor_compat,
2278 .hardware_enable = hardware_enable, 2457 .hardware_enable = hardware_enable,
2279 .hardware_disable = hardware_disable, 2458 .hardware_disable = hardware_disable,
2280 2459
2281 .vcpu_create = vmx_create_vcpu, 2460 .vcpu_create = vmx_create_vcpu,
2282 .vcpu_free = vmx_free_vcpu, 2461 .vcpu_free = vmx_free_vcpu,
2462 .vcpu_reset = vmx_vcpu_reset,
2283 2463
2464 .prepare_guest_switch = vmx_save_host_state,
2284 .vcpu_load = vmx_vcpu_load, 2465 .vcpu_load = vmx_vcpu_load,
2285 .vcpu_put = vmx_vcpu_put, 2466 .vcpu_put = vmx_vcpu_put,
2286 .vcpu_decache = vmx_vcpu_decache, 2467 .vcpu_decache = vmx_vcpu_decache,
2287 2468
2288 .set_guest_debug = set_guest_debug, 2469 .set_guest_debug = set_guest_debug,
2470 .guest_debug_pre = kvm_guest_debug_pre,
2289 .get_msr = vmx_get_msr, 2471 .get_msr = vmx_get_msr,
2290 .set_msr = vmx_set_msr, 2472 .set_msr = vmx_set_msr,
2291 .get_segment_base = vmx_get_segment_base, 2473 .get_segment_base = vmx_get_segment_base,
@@ -2314,9 +2496,13 @@ static struct kvm_arch_ops vmx_arch_ops = {
2314 .inject_gp = vmx_inject_gp, 2496 .inject_gp = vmx_inject_gp,
2315 2497
2316 .run = vmx_vcpu_run, 2498 .run = vmx_vcpu_run,
2499 .handle_exit = kvm_handle_exit,
2317 .skip_emulated_instruction = skip_emulated_instruction, 2500 .skip_emulated_instruction = skip_emulated_instruction,
2318 .vcpu_setup = vmx_vcpu_setup,
2319 .patch_hypercall = vmx_patch_hypercall, 2501 .patch_hypercall = vmx_patch_hypercall,
2502 .get_irq = vmx_get_irq,
2503 .set_irq = vmx_inject_irq,
2504 .inject_pending_irq = vmx_intr_assist,
2505 .inject_pending_vectors = do_interrupt_requests,
2320}; 2506};
2321 2507
2322static int __init vmx_init(void) 2508static int __init vmx_init(void)
@@ -2347,7 +2533,7 @@ static int __init vmx_init(void)
2347 memset(iova, 0xff, PAGE_SIZE); 2533 memset(iova, 0xff, PAGE_SIZE);
2348 kunmap(vmx_io_bitmap_b); 2534 kunmap(vmx_io_bitmap_b);
2349 2535
2350 r = kvm_init_arch(&vmx_arch_ops, THIS_MODULE); 2536 r = kvm_init_x86(&vmx_x86_ops, sizeof(struct vcpu_vmx), THIS_MODULE);
2351 if (r) 2537 if (r)
2352 goto out1; 2538 goto out1;
2353 2539
@@ -2365,7 +2551,7 @@ static void __exit vmx_exit(void)
2365 __free_page(vmx_io_bitmap_b); 2551 __free_page(vmx_io_bitmap_b);
2366 __free_page(vmx_io_bitmap_a); 2552 __free_page(vmx_io_bitmap_a);
2367 2553
2368 kvm_exit_arch(); 2554 kvm_exit_x86();
2369} 2555}
2370 2556
2371module_init(vmx_init) 2557module_init(vmx_init)
diff --git a/drivers/kvm/vmx.h b/drivers/kvm/vmx.h
index d0dc93df411b..fd4e14666088 100644
--- a/drivers/kvm/vmx.h
+++ b/drivers/kvm/vmx.h
@@ -25,29 +25,36 @@
25 * 25 *
26 */ 26 */
27 27
28#define CPU_BASED_VIRTUAL_INTR_PENDING 0x00000004 28#define CPU_BASED_VIRTUAL_INTR_PENDING 0x00000004
29#define CPU_BASED_USE_TSC_OFFSETING 0x00000008 29#define CPU_BASED_USE_TSC_OFFSETING 0x00000008
30#define CPU_BASED_HLT_EXITING 0x00000080 30#define CPU_BASED_HLT_EXITING 0x00000080
31#define CPU_BASED_INVDPG_EXITING 0x00000200 31#define CPU_BASED_INVLPG_EXITING 0x00000200
32#define CPU_BASED_MWAIT_EXITING 0x00000400 32#define CPU_BASED_MWAIT_EXITING 0x00000400
33#define CPU_BASED_RDPMC_EXITING 0x00000800 33#define CPU_BASED_RDPMC_EXITING 0x00000800
34#define CPU_BASED_RDTSC_EXITING 0x00001000 34#define CPU_BASED_RDTSC_EXITING 0x00001000
35#define CPU_BASED_CR8_LOAD_EXITING 0x00080000 35#define CPU_BASED_CR8_LOAD_EXITING 0x00080000
36#define CPU_BASED_CR8_STORE_EXITING 0x00100000 36#define CPU_BASED_CR8_STORE_EXITING 0x00100000
37#define CPU_BASED_TPR_SHADOW 0x00200000 37#define CPU_BASED_TPR_SHADOW 0x00200000
38#define CPU_BASED_MOV_DR_EXITING 0x00800000 38#define CPU_BASED_MOV_DR_EXITING 0x00800000
39#define CPU_BASED_UNCOND_IO_EXITING 0x01000000 39#define CPU_BASED_UNCOND_IO_EXITING 0x01000000
40#define CPU_BASED_ACTIVATE_IO_BITMAP 0x02000000 40#define CPU_BASED_USE_IO_BITMAPS 0x02000000
41#define CPU_BASED_MSR_BITMAPS 0x10000000 41#define CPU_BASED_USE_MSR_BITMAPS 0x10000000
42#define CPU_BASED_MONITOR_EXITING 0x20000000 42#define CPU_BASED_MONITOR_EXITING 0x20000000
43#define CPU_BASED_PAUSE_EXITING 0x40000000 43#define CPU_BASED_PAUSE_EXITING 0x40000000
44#define CPU_BASED_ACTIVATE_SECONDARY_CONTROLS 0x80000000
44 45
45#define PIN_BASED_EXT_INTR_MASK 0x1 46#define PIN_BASED_EXT_INTR_MASK 0x00000001
46#define PIN_BASED_NMI_EXITING 0x8 47#define PIN_BASED_NMI_EXITING 0x00000008
48#define PIN_BASED_VIRTUAL_NMIS 0x00000020
47 49
48#define VM_EXIT_ACK_INTR_ON_EXIT 0x00008000 50#define VM_EXIT_HOST_ADDR_SPACE_SIZE 0x00000200
49#define VM_EXIT_HOST_ADD_SPACE_SIZE 0x00000200 51#define VM_EXIT_ACK_INTR_ON_EXIT 0x00008000
50 52
53#define VM_ENTRY_IA32E_MODE 0x00000200
54#define VM_ENTRY_SMM 0x00000400
55#define VM_ENTRY_DEACT_DUAL_MONITOR 0x00000800
56
57#define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001
51 58
52/* VMCS Encodings */ 59/* VMCS Encodings */
53enum vmcs_field { 60enum vmcs_field {
@@ -206,6 +213,7 @@ enum vmcs_field {
206#define EXIT_REASON_MSR_READ 31 213#define EXIT_REASON_MSR_READ 31
207#define EXIT_REASON_MSR_WRITE 32 214#define EXIT_REASON_MSR_WRITE 32
208#define EXIT_REASON_MWAIT_INSTRUCTION 36 215#define EXIT_REASON_MWAIT_INSTRUCTION 36
216#define EXIT_REASON_TPR_BELOW_THRESHOLD 43
209 217
210/* 218/*
211 * Interruption-information format 219 * Interruption-information format
@@ -261,9 +269,6 @@ enum vmcs_field {
261/* segment AR */ 269/* segment AR */
262#define SEGMENT_AR_L_MASK (1 << 13) 270#define SEGMENT_AR_L_MASK (1 << 13)
263 271
264/* entry controls */
265#define VM_ENTRY_CONTROLS_IA32E_MASK (1 << 9)
266
267#define AR_TYPE_ACCESSES_MASK 1 272#define AR_TYPE_ACCESSES_MASK 1
268#define AR_TYPE_READABLE_MASK (1 << 1) 273#define AR_TYPE_READABLE_MASK (1 << 1)
269#define AR_TYPE_WRITEABLE_MASK (1 << 2) 274#define AR_TYPE_WRITEABLE_MASK (1 << 2)
@@ -285,13 +290,21 @@ enum vmcs_field {
285 290
286#define AR_RESERVD_MASK 0xfffe0f00 291#define AR_RESERVD_MASK 0xfffe0f00
287 292
288#define CR4_VMXE 0x2000 293#define MSR_IA32_VMX_BASIC 0x480
294#define MSR_IA32_VMX_PINBASED_CTLS 0x481
295#define MSR_IA32_VMX_PROCBASED_CTLS 0x482
296#define MSR_IA32_VMX_EXIT_CTLS 0x483
297#define MSR_IA32_VMX_ENTRY_CTLS 0x484
298#define MSR_IA32_VMX_MISC 0x485
299#define MSR_IA32_VMX_CR0_FIXED0 0x486
300#define MSR_IA32_VMX_CR0_FIXED1 0x487
301#define MSR_IA32_VMX_CR4_FIXED0 0x488
302#define MSR_IA32_VMX_CR4_FIXED1 0x489
303#define MSR_IA32_VMX_VMCS_ENUM 0x48a
304#define MSR_IA32_VMX_PROCBASED_CTLS2 0x48b
289 305
290#define MSR_IA32_VMX_BASIC 0x480 306#define MSR_IA32_FEATURE_CONTROL 0x3a
291#define MSR_IA32_FEATURE_CONTROL 0x03a 307#define MSR_IA32_FEATURE_CONTROL_LOCKED 0x1
292#define MSR_IA32_VMX_PINBASED_CTLS 0x481 308#define MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED 0x4
293#define MSR_IA32_VMX_PROCBASED_CTLS 0x482
294#define MSR_IA32_VMX_EXIT_CTLS 0x483
295#define MSR_IA32_VMX_ENTRY_CTLS 0x484
296 309
297#endif 310#endif
diff --git a/drivers/kvm/x86_emulate.c b/drivers/kvm/x86_emulate.c
index 4b8a0cc9665e..9737c3b2f48c 100644
--- a/drivers/kvm/x86_emulate.c
+++ b/drivers/kvm/x86_emulate.c
@@ -6,7 +6,7 @@
6 * Copyright (c) 2005 Keir Fraser 6 * Copyright (c) 2005 Keir Fraser
7 * 7 *
8 * Linux coding style, mod r/m decoder, segment base fixes, real-mode 8 * Linux coding style, mod r/m decoder, segment base fixes, real-mode
9 * privieged instructions: 9 * privileged instructions:
10 * 10 *
11 * Copyright (C) 2006 Qumranet 11 * Copyright (C) 2006 Qumranet
12 * 12 *
@@ -83,7 +83,7 @@ static u8 opcode_table[256] = {
83 /* 0x20 - 0x27 */ 83 /* 0x20 - 0x27 */
84 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 84 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
85 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, 85 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
86 0, 0, 0, 0, 86 SrcImmByte, SrcImm, 0, 0,
87 /* 0x28 - 0x2F */ 87 /* 0x28 - 0x2F */
88 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 88 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
89 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, 89 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
@@ -99,15 +99,24 @@ static u8 opcode_table[256] = {
99 /* 0x40 - 0x4F */ 99 /* 0x40 - 0x4F */
100 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 100 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
101 /* 0x50 - 0x57 */ 101 /* 0x50 - 0x57 */
102 0, 0, 0, 0, 0, 0, 0, 0, 102 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
103 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
103 /* 0x58 - 0x5F */ 104 /* 0x58 - 0x5F */
104 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, 105 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
105 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, 106 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
106 /* 0x60 - 0x6F */ 107 /* 0x60 - 0x67 */
107 0, 0, 0, DstReg | SrcMem32 | ModRM | Mov /* movsxd (x86/64) */ , 108 0, 0, 0, DstReg | SrcMem32 | ModRM | Mov /* movsxd (x86/64) */ ,
108 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 109 0, 0, 0, 0,
109 /* 0x70 - 0x7F */ 110 /* 0x68 - 0x6F */
110 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 111 0, 0, ImplicitOps|Mov, 0,
112 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* insb, insw/insd */
113 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* outsb, outsw/outsd */
114 /* 0x70 - 0x77 */
115 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
116 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
117 /* 0x78 - 0x7F */
118 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
119 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
111 /* 0x80 - 0x87 */ 120 /* 0x80 - 0x87 */
112 ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM, 121 ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM,
113 ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM, 122 ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM,
@@ -116,9 +125,9 @@ static u8 opcode_table[256] = {
116 /* 0x88 - 0x8F */ 125 /* 0x88 - 0x8F */
117 ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov, 126 ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov,
118 ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, 127 ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
119 0, 0, 0, DstMem | SrcNone | ModRM | Mov, 128 0, ModRM | DstReg, 0, DstMem | SrcNone | ModRM | Mov,
120 /* 0x90 - 0x9F */ 129 /* 0x90 - 0x9F */
121 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 130 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps, ImplicitOps, 0, 0,
122 /* 0xA0 - 0xA7 */ 131 /* 0xA0 - 0xA7 */
123 ByteOp | DstReg | SrcMem | Mov, DstReg | SrcMem | Mov, 132 ByteOp | DstReg | SrcMem | Mov, DstReg | SrcMem | Mov,
124 ByteOp | DstMem | SrcReg | Mov, DstMem | SrcReg | Mov, 133 ByteOp | DstMem | SrcReg | Mov, DstMem | SrcReg | Mov,
@@ -142,8 +151,10 @@ static u8 opcode_table[256] = {
142 0, 0, 0, 0, 151 0, 0, 0, 0,
143 /* 0xD8 - 0xDF */ 152 /* 0xD8 - 0xDF */
144 0, 0, 0, 0, 0, 0, 0, 0, 153 0, 0, 0, 0, 0, 0, 0, 0,
145 /* 0xE0 - 0xEF */ 154 /* 0xE0 - 0xE7 */
146 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 155 0, 0, 0, 0, 0, 0, 0, 0,
156 /* 0xE8 - 0xEF */
157 ImplicitOps, SrcImm|ImplicitOps, 0, SrcImmByte|ImplicitOps, 0, 0, 0, 0,
147 /* 0xF0 - 0xF7 */ 158 /* 0xF0 - 0xF7 */
148 0, 0, 0, 0, 159 0, 0, 0, 0,
149 ImplicitOps, 0, 160 ImplicitOps, 0,
@@ -181,7 +192,10 @@ static u16 twobyte_table[256] = {
181 /* 0x70 - 0x7F */ 192 /* 0x70 - 0x7F */
182 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 193 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
183 /* 0x80 - 0x8F */ 194 /* 0x80 - 0x8F */
184 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 195 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
196 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
197 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
198 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
185 /* 0x90 - 0x9F */ 199 /* 0x90 - 0x9F */
186 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 200 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
187 /* 0xA0 - 0xA7 */ 201 /* 0xA0 - 0xA7 */
@@ -207,19 +221,6 @@ static u16 twobyte_table[256] = {
207 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 221 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
208}; 222};
209 223
210/*
211 * Tell the emulator that of the Group 7 instructions (sgdt, lidt, etc.) we
212 * are interested only in invlpg and not in any of the rest.
213 *
214 * invlpg is a special instruction in that the data it references may not
215 * be mapped.
216 */
217void kvm_emulator_want_group7_invlpg(void)
218{
219 twobyte_table[1] &= ~SrcMem;
220}
221EXPORT_SYMBOL_GPL(kvm_emulator_want_group7_invlpg);
222
223/* Type, address-of, and value of an instruction's operand. */ 224/* Type, address-of, and value of an instruction's operand. */
224struct operand { 225struct operand {
225 enum { OP_REG, OP_MEM, OP_IMM } type; 226 enum { OP_REG, OP_MEM, OP_IMM } type;
@@ -420,7 +421,7 @@ struct operand {
420#define insn_fetch(_type, _size, _eip) \ 421#define insn_fetch(_type, _size, _eip) \
421({ unsigned long _x; \ 422({ unsigned long _x; \
422 rc = ops->read_std((unsigned long)(_eip) + ctxt->cs_base, &_x, \ 423 rc = ops->read_std((unsigned long)(_eip) + ctxt->cs_base, &_x, \
423 (_size), ctxt); \ 424 (_size), ctxt->vcpu); \
424 if ( rc != 0 ) \ 425 if ( rc != 0 ) \
425 goto done; \ 426 goto done; \
426 (_eip) += (_size); \ 427 (_eip) += (_size); \
@@ -428,10 +429,11 @@ struct operand {
428}) 429})
429 430
430/* Access/update address held in a register, based on addressing mode. */ 431/* Access/update address held in a register, based on addressing mode. */
432#define address_mask(reg) \
433 ((ad_bytes == sizeof(unsigned long)) ? \
434 (reg) : ((reg) & ((1UL << (ad_bytes << 3)) - 1)))
431#define register_address(base, reg) \ 435#define register_address(base, reg) \
432 ((base) + ((ad_bytes == sizeof(unsigned long)) ? (reg) : \ 436 ((base) + address_mask(reg))
433 ((reg) & ((1UL << (ad_bytes << 3)) - 1))))
434
435#define register_address_increment(reg, inc) \ 437#define register_address_increment(reg, inc) \
436 do { \ 438 do { \
437 /* signed type ensures sign extension to long */ \ 439 /* signed type ensures sign extension to long */ \
@@ -443,8 +445,19 @@ struct operand {
443 (((reg) + _inc) & ((1UL << (ad_bytes << 3)) - 1)); \ 445 (((reg) + _inc) & ((1UL << (ad_bytes << 3)) - 1)); \
444 } while (0) 446 } while (0)
445 447
446void *decode_register(u8 modrm_reg, unsigned long *regs, 448#define JMP_REL(rel) \
447 int highbyte_regs) 449 do { \
450 _eip += (int)(rel); \
451 _eip = ((op_bytes == 2) ? (uint16_t)_eip : (uint32_t)_eip); \
452 } while (0)
453
454/*
455 * Given the 'reg' portion of a ModRM byte, and a register block, return a
456 * pointer into the block that addresses the relevant register.
457 * @highbyte_regs specifies whether to decode AH,CH,DH,BH.
458 */
459static void *decode_register(u8 modrm_reg, unsigned long *regs,
460 int highbyte_regs)
448{ 461{
449 void *p; 462 void *p;
450 463
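
The new address_mask() and JMP_REL() helpers above both truncate values to the current address or operand size. A standalone illustration of the arithmetic with hypothetical values (not code from the patch):

#include <assert.h>
#include <stdint.h>

int main(void)
{
	int ad_bytes = 2;			/* 16-bit addressing mode */
	unsigned long reg = 0x12345678;
	unsigned long masked = (ad_bytes == sizeof(unsigned long)) ?
		reg : (reg & ((1UL << (ad_bytes << 3)) - 1));
	assert(masked == 0x5678);		/* address_mask() result */

	int op_bytes = 2;			/* 16-bit operand size */
	unsigned long _eip = 0xfff0;
	long rel = 0x20;
	_eip += rel;				/* JMP_REL(): wraps at 64K */
	_eip = (op_bytes == 2) ? (uint16_t)_eip : (uint32_t)_eip;
	assert(_eip == 0x0010);
	return 0;
}
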
@@ -464,13 +477,50 @@ static int read_descriptor(struct x86_emulate_ctxt *ctxt,
464 if (op_bytes == 2) 477 if (op_bytes == 2)
465 op_bytes = 3; 478 op_bytes = 3;
466 *address = 0; 479 *address = 0;
467 rc = ops->read_std((unsigned long)ptr, (unsigned long *)size, 2, ctxt); 480 rc = ops->read_std((unsigned long)ptr, (unsigned long *)size, 2,
481 ctxt->vcpu);
468 if (rc) 482 if (rc)
469 return rc; 483 return rc;
470 rc = ops->read_std((unsigned long)ptr + 2, address, op_bytes, ctxt); 484 rc = ops->read_std((unsigned long)ptr + 2, address, op_bytes,
485 ctxt->vcpu);
471 return rc; 486 return rc;
472} 487}
473 488
489static int test_cc(unsigned int condition, unsigned int flags)
490{
491 int rc = 0;
492
493 switch ((condition & 15) >> 1) {
494 case 0: /* o */
495 rc |= (flags & EFLG_OF);
496 break;
497 case 1: /* b/c/nae */
498 rc |= (flags & EFLG_CF);
499 break;
500 case 2: /* z/e */
501 rc |= (flags & EFLG_ZF);
502 break;
503 case 3: /* be/na */
504 rc |= (flags & (EFLG_CF|EFLG_ZF));
505 break;
506 case 4: /* s */
507 rc |= (flags & EFLG_SF);
508 break;
509 case 5: /* p/pe */
510 rc |= (flags & EFLG_PF);
511 break;
512 case 7: /* le/ng */
513 rc |= (flags & EFLG_ZF);
514 /* fall through */
515 case 6: /* l/nge */
516 rc |= (!(flags & EFLG_SF) != !(flags & EFLG_OF));
517 break;
518 }
519
520 /* Odd condition identifiers (lsb == 1) have inverted sense. */
521 return (!!rc ^ (condition & 1));
522}
523
474int 524int
475x86_emulate_memop(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) 525x86_emulate_memop(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
476{ 526{
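
test_cc(), added in the hunk above, evaluates an x86 condition code against EFLAGS: bits 3:1 of the opcode's low nibble select the flag group and bit 0 inverts the sense, which lets one helper serve both the short Jcc opcodes (0x70-0x7f) and the two-byte Jcc opcodes (0x0f 0x80-0x8f). A self-contained userspace rendition with a few spot checks (hypothetical harness, not part of the patch):

#include <assert.h>

#define EFLG_CF (1u << 0)
#define EFLG_PF (1u << 2)
#define EFLG_ZF (1u << 6)
#define EFLG_SF (1u << 7)
#define EFLG_OF (1u << 11)

static int test_cc(unsigned int condition, unsigned int flags)
{
	int rc = 0;

	switch ((condition & 15) >> 1) {
	case 0: rc |= flags & EFLG_OF; break;			/* o       */
	case 1: rc |= flags & EFLG_CF; break;			/* b/c/nae */
	case 2: rc |= flags & EFLG_ZF; break;			/* z/e     */
	case 3: rc |= flags & (EFLG_CF | EFLG_ZF); break;	/* be/na   */
	case 4: rc |= flags & EFLG_SF; break;			/* s       */
	case 5: rc |= flags & EFLG_PF; break;			/* p/pe    */
	case 7: rc |= flags & EFLG_ZF;				/* le/ng   */
		/* fall through */
	case 6: rc |= !(flags & EFLG_SF) != !(flags & EFLG_OF); break; /* l/nge */
	}
	/* Odd condition codes (lsb == 1) have inverted sense. */
	return !!rc ^ (condition & 1);
}

int main(void)
{
	assert(test_cc(0x74, EFLG_ZF));			/* jz taken: ZF set      */
	assert(!test_cc(0x75, EFLG_ZF));		/* jnz not taken         */
	assert(test_cc(0x7c, EFLG_SF));			/* jl taken: SF != OF    */
	assert(!test_cc(0x7c, EFLG_SF | EFLG_OF));	/* jl not taken: SF == OF */
	return 0;
}
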
@@ -771,11 +821,15 @@ done_prefixes:
771 goto srcmem_common; 821 goto srcmem_common;
772 case SrcMem: 822 case SrcMem:
773 src.bytes = (d & ByteOp) ? 1 : op_bytes; 823 src.bytes = (d & ByteOp) ? 1 : op_bytes;
824 /* Don't fetch the address for invlpg: it could be unmapped. */
825 if (twobyte && b == 0x01 && modrm_reg == 7)
826 break;
774 srcmem_common: 827 srcmem_common:
775 src.type = OP_MEM; 828 src.type = OP_MEM;
776 src.ptr = (unsigned long *)cr2; 829 src.ptr = (unsigned long *)cr2;
830 src.val = 0;
777 if ((rc = ops->read_emulated((unsigned long)src.ptr, 831 if ((rc = ops->read_emulated((unsigned long)src.ptr,
778 &src.val, src.bytes, ctxt)) != 0) 832 &src.val, src.bytes, ctxt->vcpu)) != 0)
779 goto done; 833 goto done;
780 src.orig_val = src.val; 834 src.orig_val = src.val;
781 break; 835 break;
@@ -814,7 +868,7 @@ done_prefixes:
814 case DstReg: 868 case DstReg:
815 dst.type = OP_REG; 869 dst.type = OP_REG;
816 if ((d & ByteOp) 870 if ((d & ByteOp)
817 && !(twobyte_table && (b == 0xb6 || b == 0xb7))) { 871 && !(twobyte && (b == 0xb6 || b == 0xb7))) {
818 dst.ptr = decode_register(modrm_reg, _regs, 872 dst.ptr = decode_register(modrm_reg, _regs,
819 (rex_prefix == 0)); 873 (rex_prefix == 0));
820 dst.val = *(u8 *) dst.ptr; 874 dst.val = *(u8 *) dst.ptr;
@@ -838,6 +892,7 @@ done_prefixes:
838 dst.type = OP_MEM; 892 dst.type = OP_MEM;
839 dst.ptr = (unsigned long *)cr2; 893 dst.ptr = (unsigned long *)cr2;
840 dst.bytes = (d & ByteOp) ? 1 : op_bytes; 894 dst.bytes = (d & ByteOp) ? 1 : op_bytes;
895 dst.val = 0;
841 if (d & BitOp) { 896 if (d & BitOp) {
842 unsigned long mask = ~(dst.bytes * 8 - 1); 897 unsigned long mask = ~(dst.bytes * 8 - 1);
843 898
@@ -845,7 +900,7 @@ done_prefixes:
845 } 900 }
846 if (!(d & Mov) && /* optimisation - avoid slow emulated read */ 901 if (!(d & Mov) && /* optimisation - avoid slow emulated read */
847 ((rc = ops->read_emulated((unsigned long)dst.ptr, 902 ((rc = ops->read_emulated((unsigned long)dst.ptr,
848 &dst.val, dst.bytes, ctxt)) != 0)) 903 &dst.val, dst.bytes, ctxt->vcpu)) != 0))
849 goto done; 904 goto done;
850 break; 905 break;
851 } 906 }
@@ -871,10 +926,27 @@ done_prefixes:
871 sbb: /* sbb */ 926 sbb: /* sbb */
872 emulate_2op_SrcV("sbb", src, dst, _eflags); 927 emulate_2op_SrcV("sbb", src, dst, _eflags);
873 break; 928 break;
874 case 0x20 ... 0x25: 929 case 0x20 ... 0x23:
875 and: /* and */ 930 and: /* and */
876 emulate_2op_SrcV("and", src, dst, _eflags); 931 emulate_2op_SrcV("and", src, dst, _eflags);
877 break; 932 break;
933 case 0x24: /* and al imm8 */
934 dst.type = OP_REG;
935 dst.ptr = &_regs[VCPU_REGS_RAX];
936 dst.val = *(u8 *)dst.ptr;
937 dst.bytes = 1;
938 dst.orig_val = dst.val;
939 goto and;
940 case 0x25: /* and ax imm16, or eax imm32 */
941 dst.type = OP_REG;
942 dst.bytes = op_bytes;
943 dst.ptr = &_regs[VCPU_REGS_RAX];
944 if (op_bytes == 2)
945 dst.val = *(u16 *)dst.ptr;
946 else
947 dst.val = *(u32 *)dst.ptr;
948 dst.orig_val = dst.val;
949 goto and;
878 case 0x28 ... 0x2d: 950 case 0x28 ... 0x2d:
879 sub: /* sub */ 951 sub: /* sub */
880 emulate_2op_SrcV("sub", src, dst, _eflags); 952 emulate_2op_SrcV("sub", src, dst, _eflags);
@@ -892,6 +964,17 @@ done_prefixes:
892 goto cannot_emulate; 964 goto cannot_emulate;
893 dst.val = (s32) src.val; 965 dst.val = (s32) src.val;
894 break; 966 break;
967 case 0x6a: /* push imm8 */
968 src.val = 0L;
969 src.val = insn_fetch(s8, 1, _eip);
970push:
971 dst.type = OP_MEM;
972 dst.bytes = op_bytes;
973 dst.val = src.val;
974 register_address_increment(_regs[VCPU_REGS_RSP], -op_bytes);
975 dst.ptr = (void *) register_address(ctxt->ss_base,
976 _regs[VCPU_REGS_RSP]);
977 break;
895 case 0x80 ... 0x83: /* Grp1 */ 978 case 0x80 ... 0x83: /* Grp1 */
896 switch (modrm_reg) { 979 switch (modrm_reg) {
897 case 0: 980 case 0:
@@ -939,18 +1022,10 @@ done_prefixes:
939 dst.val = src.val; 1022 dst.val = src.val;
940 lock_prefix = 1; 1023 lock_prefix = 1;
941 break; 1024 break;
942 case 0xa0 ... 0xa1: /* mov */
943 dst.ptr = (unsigned long *)&_regs[VCPU_REGS_RAX];
944 dst.val = src.val;
945 _eip += ad_bytes; /* skip src displacement */
946 break;
947 case 0xa2 ... 0xa3: /* mov */
948 dst.val = (unsigned long)_regs[VCPU_REGS_RAX];
949 _eip += ad_bytes; /* skip dst displacement */
950 break;
951 case 0x88 ... 0x8b: /* mov */ 1025 case 0x88 ... 0x8b: /* mov */
952 case 0xc6 ... 0xc7: /* mov (sole member of Grp11) */ 1026 goto mov;
953 dst.val = src.val; 1027 case 0x8d: /* lea r16/r32, m */
1028 dst.val = modrm_val;
954 break; 1029 break;
955 case 0x8f: /* pop (sole member of Grp1a) */ 1030 case 0x8f: /* pop (sole member of Grp1a) */
956 /* 64-bit mode: POP always pops a 64-bit operand. */ 1031 /* 64-bit mode: POP always pops a 64-bit operand. */
@@ -958,10 +1033,19 @@ done_prefixes:
958 dst.bytes = 8; 1033 dst.bytes = 8;
959 if ((rc = ops->read_std(register_address(ctxt->ss_base, 1034 if ((rc = ops->read_std(register_address(ctxt->ss_base,
960 _regs[VCPU_REGS_RSP]), 1035 _regs[VCPU_REGS_RSP]),
961 &dst.val, dst.bytes, ctxt)) != 0) 1036 &dst.val, dst.bytes, ctxt->vcpu)) != 0)
962 goto done; 1037 goto done;
963 register_address_increment(_regs[VCPU_REGS_RSP], dst.bytes); 1038 register_address_increment(_regs[VCPU_REGS_RSP], dst.bytes);
964 break; 1039 break;
1040 case 0xa0 ... 0xa1: /* mov */
1041 dst.ptr = (unsigned long *)&_regs[VCPU_REGS_RAX];
1042 dst.val = src.val;
1043 _eip += ad_bytes; /* skip src displacement */
1044 break;
1045 case 0xa2 ... 0xa3: /* mov */
1046 dst.val = (unsigned long)_regs[VCPU_REGS_RAX];
1047 _eip += ad_bytes; /* skip dst displacement */
1048 break;
965 case 0xc0 ... 0xc1: 1049 case 0xc0 ... 0xc1:
966 grp2: /* Grp2 */ 1050 grp2: /* Grp2 */
967 switch (modrm_reg) { 1051 switch (modrm_reg) {
@@ -989,12 +1073,41 @@ done_prefixes:
989 break; 1073 break;
990 } 1074 }
991 break; 1075 break;
1076 case 0xc6 ... 0xc7: /* mov (sole member of Grp11) */
1077 mov:
1078 dst.val = src.val;
1079 break;
992 case 0xd0 ... 0xd1: /* Grp2 */ 1080 case 0xd0 ... 0xd1: /* Grp2 */
993 src.val = 1; 1081 src.val = 1;
994 goto grp2; 1082 goto grp2;
995 case 0xd2 ... 0xd3: /* Grp2 */ 1083 case 0xd2 ... 0xd3: /* Grp2 */
996 src.val = _regs[VCPU_REGS_RCX]; 1084 src.val = _regs[VCPU_REGS_RCX];
997 goto grp2; 1085 goto grp2;
1086 case 0xe8: /* call (near) */ {
1087 long int rel;
1088 switch (op_bytes) {
1089 case 2:
1090 rel = insn_fetch(s16, 2, _eip);
1091 break;
1092 case 4:
1093 rel = insn_fetch(s32, 4, _eip);
1094 break;
1095 case 8:
1096 rel = insn_fetch(s64, 8, _eip);
1097 break;
1098 default:
1099 DPRINTF("Call: Invalid op_bytes\n");
1100 goto cannot_emulate;
1101 }
1102 src.val = (unsigned long) _eip;
1103 JMP_REL(rel);
1104 goto push;
1105 }
1106 case 0xe9: /* jmp rel */
1107 case 0xeb: /* jmp rel short */
1108 JMP_REL(src.val);
1109 no_wb = 1; /* Disable writeback. */
1110 break;
998 case 0xf6 ... 0xf7: /* Grp3 */ 1111 case 0xf6 ... 0xf7: /* Grp3 */
999 switch (modrm_reg) { 1112 switch (modrm_reg) {
1000 case 0 ... 1: /* test */ 1113 case 0 ... 1: /* test */
@@ -1037,13 +1150,19 @@ done_prefixes:
1037 case 1: /* dec */ 1150 case 1: /* dec */
1038 emulate_1op("dec", dst, _eflags); 1151 emulate_1op("dec", dst, _eflags);
1039 break; 1152 break;
1153 case 4: /* jmp abs */
1154 if (b == 0xff)
1155 _eip = dst.val;
1156 else
1157 goto cannot_emulate;
1158 break;
1040 case 6: /* push */ 1159 case 6: /* push */
1041 /* 64-bit mode: PUSH always pushes a 64-bit operand. */ 1160 /* 64-bit mode: PUSH always pushes a 64-bit operand. */
1042 if (mode == X86EMUL_MODE_PROT64) { 1161 if (mode == X86EMUL_MODE_PROT64) {
1043 dst.bytes = 8; 1162 dst.bytes = 8;
1044 if ((rc = ops->read_std((unsigned long)dst.ptr, 1163 if ((rc = ops->read_std((unsigned long)dst.ptr,
1045 &dst.val, 8, 1164 &dst.val, 8,
1046 ctxt)) != 0) 1165 ctxt->vcpu)) != 0)
1047 goto done; 1166 goto done;
1048 } 1167 }
1049 register_address_increment(_regs[VCPU_REGS_RSP], 1168 register_address_increment(_regs[VCPU_REGS_RSP],
@@ -1051,7 +1170,7 @@ done_prefixes:
1051 if ((rc = ops->write_std( 1170 if ((rc = ops->write_std(
1052 register_address(ctxt->ss_base, 1171 register_address(ctxt->ss_base,
1053 _regs[VCPU_REGS_RSP]), 1172 _regs[VCPU_REGS_RSP]),
1054 &dst.val, dst.bytes, ctxt)) != 0) 1173 &dst.val, dst.bytes, ctxt->vcpu)) != 0)
1055 goto done; 1174 goto done;
1056 no_wb = 1; 1175 no_wb = 1;
1057 break; 1176 break;
@@ -1086,11 +1205,11 @@ writeback:
1086 rc = ops->cmpxchg_emulated((unsigned long)dst. 1205 rc = ops->cmpxchg_emulated((unsigned long)dst.
1087 ptr, &dst.orig_val, 1206 ptr, &dst.orig_val,
1088 &dst.val, dst.bytes, 1207 &dst.val, dst.bytes,
1089 ctxt); 1208 ctxt->vcpu);
1090 else 1209 else
1091 rc = ops->write_emulated((unsigned long)dst.ptr, 1210 rc = ops->write_emulated((unsigned long)dst.ptr,
1092 &dst.val, dst.bytes, 1211 &dst.val, dst.bytes,
1093 ctxt); 1212 ctxt->vcpu);
1094 if (rc != 0) 1213 if (rc != 0)
1095 goto done; 1214 goto done;
1096 default: 1215 default:
@@ -1109,6 +1228,81 @@ done:
1109special_insn: 1228special_insn:
1110 if (twobyte) 1229 if (twobyte)
1111 goto twobyte_special_insn; 1230 goto twobyte_special_insn;
1231 switch(b) {
1232 case 0x50 ... 0x57: /* push reg */
1233 if (op_bytes == 2)
1234 src.val = (u16) _regs[b & 0x7];
1235 else
1236 src.val = (u32) _regs[b & 0x7];
1237 dst.type = OP_MEM;
1238 dst.bytes = op_bytes;
1239 dst.val = src.val;
1240 register_address_increment(_regs[VCPU_REGS_RSP], -op_bytes);
1241 dst.ptr = (void *) register_address(
1242 ctxt->ss_base, _regs[VCPU_REGS_RSP]);
1243 break;
1244 case 0x58 ... 0x5f: /* pop reg */
1245 dst.ptr = (unsigned long *)&_regs[b & 0x7];
1246 pop_instruction:
1247 if ((rc = ops->read_std(register_address(ctxt->ss_base,
1248 _regs[VCPU_REGS_RSP]), dst.ptr, op_bytes, ctxt->vcpu))
1249 != 0)
1250 goto done;
1251
1252 register_address_increment(_regs[VCPU_REGS_RSP], op_bytes);
1253 no_wb = 1; /* Disable writeback. */
1254 break;
1255 case 0x6c: /* insb */
1256 case 0x6d: /* insw/insd */
1257 if (kvm_emulate_pio_string(ctxt->vcpu, NULL,
1258 1, /* in */
1259 (d & ByteOp) ? 1 : op_bytes, /* size */
1260 rep_prefix ?
1261 address_mask(_regs[VCPU_REGS_RCX]) : 1, /* count */
1262 (_eflags & EFLG_DF), /* down */
1263 register_address(ctxt->es_base,
1264 _regs[VCPU_REGS_RDI]), /* address */
1265 rep_prefix,
1266 _regs[VCPU_REGS_RDX] /* port */
1267 ) == 0)
1268 return -1;
1269 return 0;
1270 case 0x6e: /* outsb */
1271 case 0x6f: /* outsw/outsd */
1272 if (kvm_emulate_pio_string(ctxt->vcpu, NULL,
1273 0, /* in */
1274 (d & ByteOp) ? 1 : op_bytes, /* size */
1275 rep_prefix ?
1276 address_mask(_regs[VCPU_REGS_RCX]) : 1, /* count */
1277 (_eflags & EFLG_DF), /* down */
1278 register_address(override_base ?
1279 *override_base : ctxt->ds_base,
1280 _regs[VCPU_REGS_RSI]), /* address */
1281 rep_prefix,
1282 _regs[VCPU_REGS_RDX] /* port */
1283 ) == 0)
1284 return -1;
1285 return 0;
1286 case 0x70 ... 0x7f: /* jcc (short) */ {
1287 int rel = insn_fetch(s8, 1, _eip);
1288
1289 if (test_cc(b, _eflags))
1290 JMP_REL(rel);
1291 break;
1292 }
1293 case 0x9c: /* pushf */
1294 src.val = (unsigned long) _eflags;
1295 goto push;
1296 case 0x9d: /* popf */
1297 dst.ptr = (unsigned long *) &_eflags;
1298 goto pop_instruction;
1299 case 0xc3: /* ret */
1300 dst.ptr = &_eip;
1301 goto pop_instruction;
1302 case 0xf4: /* hlt */
1303 ctxt->vcpu->halt_request = 1;
1304 goto done;
1305 }
1112 if (rep_prefix) { 1306 if (rep_prefix) {
1113 if (_regs[VCPU_REGS_RCX] == 0) { 1307 if (_regs[VCPU_REGS_RCX] == 0) {
1114 ctxt->vcpu->rip = _eip; 1308 ctxt->vcpu->rip = _eip;
@@ -1125,7 +1319,7 @@ special_insn:
1125 _regs[VCPU_REGS_RDI]); 1319 _regs[VCPU_REGS_RDI]);
1126 if ((rc = ops->read_emulated(register_address( 1320 if ((rc = ops->read_emulated(register_address(
1127 override_base ? *override_base : ctxt->ds_base, 1321 override_base ? *override_base : ctxt->ds_base,
1128 _regs[VCPU_REGS_RSI]), &dst.val, dst.bytes, ctxt)) != 0) 1322 _regs[VCPU_REGS_RSI]), &dst.val, dst.bytes, ctxt->vcpu)) != 0)
1129 goto done; 1323 goto done;
1130 register_address_increment(_regs[VCPU_REGS_RSI], 1324 register_address_increment(_regs[VCPU_REGS_RSI],
1131 (_eflags & EFLG_DF) ? -dst.bytes : dst.bytes); 1325 (_eflags & EFLG_DF) ? -dst.bytes : dst.bytes);
@@ -1147,7 +1341,8 @@ special_insn:
1147 dst.type = OP_REG; 1341 dst.type = OP_REG;
1148 dst.bytes = (d & ByteOp) ? 1 : op_bytes; 1342 dst.bytes = (d & ByteOp) ? 1 : op_bytes;
1149 dst.ptr = (unsigned long *)&_regs[VCPU_REGS_RAX]; 1343 dst.ptr = (unsigned long *)&_regs[VCPU_REGS_RAX];
1150 if ((rc = ops->read_emulated(cr2, &dst.val, dst.bytes, ctxt)) != 0) 1344 if ((rc = ops->read_emulated(cr2, &dst.val, dst.bytes,
1345 ctxt->vcpu)) != 0)
1151 goto done; 1346 goto done;
1152 register_address_increment(_regs[VCPU_REGS_RSI], 1347 register_address_increment(_regs[VCPU_REGS_RSI],
1153 (_eflags & EFLG_DF) ? -dst.bytes : dst.bytes); 1348 (_eflags & EFLG_DF) ? -dst.bytes : dst.bytes);
@@ -1155,23 +1350,7 @@ special_insn:
1155 case 0xae ... 0xaf: /* scas */ 1350 case 0xae ... 0xaf: /* scas */
1156 DPRINTF("Urk! I don't handle SCAS.\n"); 1351 DPRINTF("Urk! I don't handle SCAS.\n");
1157 goto cannot_emulate; 1352 goto cannot_emulate;
1158 case 0xf4: /* hlt */
1159 ctxt->vcpu->halt_request = 1;
1160 goto done;
1161 case 0xc3: /* ret */
1162 dst.ptr = &_eip;
1163 goto pop_instruction;
1164 case 0x58 ... 0x5f: /* pop reg */
1165 dst.ptr = (unsigned long *)&_regs[b & 0x7];
1166 1353
1167pop_instruction:
1168 if ((rc = ops->read_std(register_address(ctxt->ss_base,
1169 _regs[VCPU_REGS_RSP]), dst.ptr, op_bytes, ctxt)) != 0)
1170 goto done;
1171
1172 register_address_increment(_regs[VCPU_REGS_RSP], op_bytes);
1173 no_wb = 1; /* Disable writeback. */
1174 break;
1175 } 1354 }
1176 goto writeback; 1355 goto writeback;
1177 1356
@@ -1230,40 +1409,50 @@ twobyte_insn:
1230 break; 1409 break;
1231 case 0x40 ... 0x4f: /* cmov */ 1410 case 0x40 ... 0x4f: /* cmov */
1232 dst.val = dst.orig_val = src.val; 1411 dst.val = dst.orig_val = src.val;
1233 d &= ~Mov; /* default to no move */ 1412 no_wb = 1;
1234 /* 1413 /*
1235 * First, assume we're decoding an even cmov opcode 1414 * First, assume we're decoding an even cmov opcode
1236 * (lsb == 0). 1415 * (lsb == 0).
1237 */ 1416 */
1238 switch ((b & 15) >> 1) { 1417 switch ((b & 15) >> 1) {
1239 case 0: /* cmovo */ 1418 case 0: /* cmovo */
1240 d |= (_eflags & EFLG_OF) ? Mov : 0; 1419 no_wb = (_eflags & EFLG_OF) ? 0 : 1;
1241 break; 1420 break;
1242 case 1: /* cmovb/cmovc/cmovnae */ 1421 case 1: /* cmovb/cmovc/cmovnae */
1243 d |= (_eflags & EFLG_CF) ? Mov : 0; 1422 no_wb = (_eflags & EFLG_CF) ? 0 : 1;
1244 break; 1423 break;
1245 case 2: /* cmovz/cmove */ 1424 case 2: /* cmovz/cmove */
1246 d |= (_eflags & EFLG_ZF) ? Mov : 0; 1425 no_wb = (_eflags & EFLG_ZF) ? 0 : 1;
1247 break; 1426 break;
1248 case 3: /* cmovbe/cmovna */ 1427 case 3: /* cmovbe/cmovna */
1249 d |= (_eflags & (EFLG_CF | EFLG_ZF)) ? Mov : 0; 1428 no_wb = (_eflags & (EFLG_CF | EFLG_ZF)) ? 0 : 1;
1250 break; 1429 break;
1251 case 4: /* cmovs */ 1430 case 4: /* cmovs */
1252 d |= (_eflags & EFLG_SF) ? Mov : 0; 1431 no_wb = (_eflags & EFLG_SF) ? 0 : 1;
1253 break; 1432 break;
1254 case 5: /* cmovp/cmovpe */ 1433 case 5: /* cmovp/cmovpe */
1255 d |= (_eflags & EFLG_PF) ? Mov : 0; 1434 no_wb = (_eflags & EFLG_PF) ? 0 : 1;
1256 break; 1435 break;
1257 case 7: /* cmovle/cmovng */ 1436 case 7: /* cmovle/cmovng */
1258 d |= (_eflags & EFLG_ZF) ? Mov : 0; 1437 no_wb = (_eflags & EFLG_ZF) ? 0 : 1;
1259 /* fall through */ 1438 /* fall through */
1260 case 6: /* cmovl/cmovnge */ 1439 case 6: /* cmovl/cmovnge */
1261 d |= (!(_eflags & EFLG_SF) != 1440 no_wb &= (!(_eflags & EFLG_SF) !=
1262 !(_eflags & EFLG_OF)) ? Mov : 0; 1441 !(_eflags & EFLG_OF)) ? 0 : 1;
1263 break; 1442 break;
1264 } 1443 }
1265 /* Odd cmov opcodes (lsb == 1) have inverted sense. */ 1444 /* Odd cmov opcodes (lsb == 1) have inverted sense. */
1266 d ^= (b & 1) ? Mov : 0; 1445 no_wb ^= b & 1;
1446 break;
1447 case 0xa3:
1448 bt: /* bt */
1449 src.val &= (dst.bytes << 3) - 1; /* only subword offset */
1450 emulate_2op_SrcV_nobyte("bt", src, dst, _eflags);
1451 break;
1452 case 0xab:
1453 bts: /* bts */
1454 src.val &= (dst.bytes << 3) - 1; /* only subword offset */
1455 emulate_2op_SrcV_nobyte("bts", src, dst, _eflags);
1267 break; 1456 break;
1268 case 0xb0 ... 0xb1: /* cmpxchg */ 1457 case 0xb0 ... 0xb1: /* cmpxchg */
1269 /* 1458 /*
@@ -1273,8 +1462,6 @@ twobyte_insn:
1273 src.orig_val = src.val; 1462 src.orig_val = src.val;
1274 src.val = _regs[VCPU_REGS_RAX]; 1463 src.val = _regs[VCPU_REGS_RAX];
1275 emulate_2op_SrcV("cmp", src, dst, _eflags); 1464 emulate_2op_SrcV("cmp", src, dst, _eflags);
1276 /* Always write back. The question is: where to? */
1277 d |= Mov;
1278 if (_eflags & EFLG_ZF) { 1465 if (_eflags & EFLG_ZF) {
1279 /* Success: write back to memory. */ 1466 /* Success: write back to memory. */
1280 dst.val = src.orig_val; 1467 dst.val = src.orig_val;
@@ -1284,30 +1471,15 @@ twobyte_insn:
1284 dst.ptr = (unsigned long *)&_regs[VCPU_REGS_RAX]; 1471 dst.ptr = (unsigned long *)&_regs[VCPU_REGS_RAX];
1285 } 1472 }
1286 break; 1473 break;
1287 case 0xa3:
1288 bt: /* bt */
1289 src.val &= (dst.bytes << 3) - 1; /* only subword offset */
1290 emulate_2op_SrcV_nobyte("bt", src, dst, _eflags);
1291 break;
1292 case 0xb3: 1474 case 0xb3:
1293 btr: /* btr */ 1475 btr: /* btr */
1294 src.val &= (dst.bytes << 3) - 1; /* only subword offset */ 1476 src.val &= (dst.bytes << 3) - 1; /* only subword offset */
1295 emulate_2op_SrcV_nobyte("btr", src, dst, _eflags); 1477 emulate_2op_SrcV_nobyte("btr", src, dst, _eflags);
1296 break; 1478 break;
1297 case 0xab:
1298 bts: /* bts */
1299 src.val &= (dst.bytes << 3) - 1; /* only subword offset */
1300 emulate_2op_SrcV_nobyte("bts", src, dst, _eflags);
1301 break;
1302 case 0xb6 ... 0xb7: /* movzx */ 1479 case 0xb6 ... 0xb7: /* movzx */
1303 dst.bytes = op_bytes; 1480 dst.bytes = op_bytes;
1304 dst.val = (d & ByteOp) ? (u8) src.val : (u16) src.val; 1481 dst.val = (d & ByteOp) ? (u8) src.val : (u16) src.val;
1305 break; 1482 break;
1306 case 0xbb:
1307 btc: /* btc */
1308 src.val &= (dst.bytes << 3) - 1; /* only subword offset */
1309 emulate_2op_SrcV_nobyte("btc", src, dst, _eflags);
1310 break;
1311 case 0xba: /* Grp8 */ 1483 case 0xba: /* Grp8 */
1312 switch (modrm_reg & 3) { 1484 switch (modrm_reg & 3) {
1313 case 0: 1485 case 0:
@@ -1320,6 +1492,11 @@ twobyte_insn:
1320 goto btc; 1492 goto btc;
1321 } 1493 }
1322 break; 1494 break;
1495 case 0xbb:
1496 btc: /* btc */
1497 src.val &= (dst.bytes << 3) - 1; /* only subword offset */
1498 emulate_2op_SrcV_nobyte("btc", src, dst, _eflags);
1499 break;
1323 case 0xbe ... 0xbf: /* movsx */ 1500 case 0xbe ... 0xbf: /* movsx */
1324 dst.bytes = op_bytes; 1501 dst.bytes = op_bytes;
1325 dst.val = (d & ByteOp) ? (s8) src.val : (s16) src.val; 1502 dst.val = (d & ByteOp) ? (s8) src.val : (s16) src.val;
@@ -1331,14 +1508,14 @@ twobyte_special_insn:
1331 /* Disable writeback. */ 1508 /* Disable writeback. */
1332 no_wb = 1; 1509 no_wb = 1;
1333 switch (b) { 1510 switch (b) {
1511 case 0x06:
1512 emulate_clts(ctxt->vcpu);
1513 break;
1334 case 0x09: /* wbinvd */ 1514 case 0x09: /* wbinvd */
1335 break; 1515 break;
1336 case 0x0d: /* GrpP (prefetch) */ 1516 case 0x0d: /* GrpP (prefetch) */
1337 case 0x18: /* Grp16 (prefetch/nop) */ 1517 case 0x18: /* Grp16 (prefetch/nop) */
1338 break; 1518 break;
1339 case 0x06:
1340 emulate_clts(ctxt->vcpu);
1341 break;
1342 case 0x20: /* mov cr, reg */ 1519 case 0x20: /* mov cr, reg */
1343 if (modrm_mod != 3) 1520 if (modrm_mod != 3)
1344 goto cannot_emulate; 1521 goto cannot_emulate;
@@ -1355,7 +1532,7 @@ twobyte_special_insn:
1355 | ((u64)_regs[VCPU_REGS_RDX] << 32); 1532 | ((u64)_regs[VCPU_REGS_RDX] << 32);
1356 rc = kvm_set_msr(ctxt->vcpu, _regs[VCPU_REGS_RCX], msr_data); 1533 rc = kvm_set_msr(ctxt->vcpu, _regs[VCPU_REGS_RCX], msr_data);
1357 if (rc) { 1534 if (rc) {
1358 kvm_arch_ops->inject_gp(ctxt->vcpu, 0); 1535 kvm_x86_ops->inject_gp(ctxt->vcpu, 0);
1359 _eip = ctxt->vcpu->rip; 1536 _eip = ctxt->vcpu->rip;
1360 } 1537 }
1361 rc = X86EMUL_CONTINUE; 1538 rc = X86EMUL_CONTINUE;
@@ -1364,7 +1541,7 @@ twobyte_special_insn:
1364 /* rdmsr */ 1541 /* rdmsr */
1365 rc = kvm_get_msr(ctxt->vcpu, _regs[VCPU_REGS_RCX], &msr_data); 1542 rc = kvm_get_msr(ctxt->vcpu, _regs[VCPU_REGS_RCX], &msr_data);
1366 if (rc) { 1543 if (rc) {
1367 kvm_arch_ops->inject_gp(ctxt->vcpu, 0); 1544 kvm_x86_ops->inject_gp(ctxt->vcpu, 0);
1368 _eip = ctxt->vcpu->rip; 1545 _eip = ctxt->vcpu->rip;
1369 } else { 1546 } else {
1370 _regs[VCPU_REGS_RAX] = (u32)msr_data; 1547 _regs[VCPU_REGS_RAX] = (u32)msr_data;
@@ -1372,10 +1549,32 @@ twobyte_special_insn:
1372 } 1549 }
1373 rc = X86EMUL_CONTINUE; 1550 rc = X86EMUL_CONTINUE;
1374 break; 1551 break;
1552 case 0x80 ... 0x8f: /* jnz rel, etc*/ {
1553 long int rel;
1554
1555 switch (op_bytes) {
1556 case 2:
1557 rel = insn_fetch(s16, 2, _eip);
1558 break;
1559 case 4:
1560 rel = insn_fetch(s32, 4, _eip);
1561 break;
1562 case 8:
1563 rel = insn_fetch(s64, 8, _eip);
1564 break;
1565 default:
1566 DPRINTF("jnz: Invalid op_bytes\n");
1567 goto cannot_emulate;
1568 }
1569 if (test_cc(b, _eflags))
1570 JMP_REL(rel);
1571 break;
1572 }
1375 case 0xc7: /* Grp9 (cmpxchg8b) */ 1573 case 0xc7: /* Grp9 (cmpxchg8b) */
1376 { 1574 {
1377 u64 old, new; 1575 u64 old, new;
1378 if ((rc = ops->read_emulated(cr2, &old, 8, ctxt)) != 0) 1576 if ((rc = ops->read_emulated(cr2, &old, 8, ctxt->vcpu))
1577 != 0)
1379 goto done; 1578 goto done;
1380 if (((u32) (old >> 0) != (u32) _regs[VCPU_REGS_RAX]) || 1579 if (((u32) (old >> 0) != (u32) _regs[VCPU_REGS_RAX]) ||
1381 ((u32) (old >> 32) != (u32) _regs[VCPU_REGS_RDX])) { 1580 ((u32) (old >> 32) != (u32) _regs[VCPU_REGS_RDX])) {
@@ -1386,7 +1585,7 @@ twobyte_special_insn:
1386 new = ((u64)_regs[VCPU_REGS_RCX] << 32) 1585 new = ((u64)_regs[VCPU_REGS_RCX] << 32)
1387 | (u32) _regs[VCPU_REGS_RBX]; 1586 | (u32) _regs[VCPU_REGS_RBX];
1388 if ((rc = ops->cmpxchg_emulated(cr2, &old, 1587 if ((rc = ops->cmpxchg_emulated(cr2, &old,
1389 &new, 8, ctxt)) != 0) 1588 &new, 8, ctxt->vcpu)) != 0)
1390 goto done; 1589 goto done;
1391 _eflags |= EFLG_ZF; 1590 _eflags |= EFLG_ZF;
1392 } 1591 }
diff --git a/drivers/kvm/x86_emulate.h b/drivers/kvm/x86_emulate.h
index ea3407d7feee..92c73aa7f9ac 100644
--- a/drivers/kvm/x86_emulate.h
+++ b/drivers/kvm/x86_emulate.h
@@ -60,7 +60,7 @@ struct x86_emulate_ops {
60 * @bytes: [IN ] Number of bytes to read from memory. 60 * @bytes: [IN ] Number of bytes to read from memory.
61 */ 61 */
62 int (*read_std)(unsigned long addr, void *val, 62 int (*read_std)(unsigned long addr, void *val,
63 unsigned int bytes, struct x86_emulate_ctxt * ctxt); 63 unsigned int bytes, struct kvm_vcpu *vcpu);
64 64
65 /* 65 /*
66 * write_std: Write bytes of standard (non-emulated/special) memory. 66 * write_std: Write bytes of standard (non-emulated/special) memory.
@@ -71,7 +71,7 @@ struct x86_emulate_ops {
71 * @bytes: [IN ] Number of bytes to write to memory. 71 * @bytes: [IN ] Number of bytes to write to memory.
72 */ 72 */
73 int (*write_std)(unsigned long addr, const void *val, 73 int (*write_std)(unsigned long addr, const void *val,
74 unsigned int bytes, struct x86_emulate_ctxt * ctxt); 74 unsigned int bytes, struct kvm_vcpu *vcpu);
75 75
76 /* 76 /*
77 * read_emulated: Read bytes from emulated/special memory area. 77 * read_emulated: Read bytes from emulated/special memory area.
@@ -82,7 +82,7 @@ struct x86_emulate_ops {
82 int (*read_emulated) (unsigned long addr, 82 int (*read_emulated) (unsigned long addr,
83 void *val, 83 void *val,
84 unsigned int bytes, 84 unsigned int bytes,
85 struct x86_emulate_ctxt * ctxt); 85 struct kvm_vcpu *vcpu);
86 86
87 /* 87 /*
88 * write_emulated: Read bytes from emulated/special memory area. 88 * write_emulated: Read bytes from emulated/special memory area.
@@ -94,7 +94,7 @@ struct x86_emulate_ops {
94 int (*write_emulated) (unsigned long addr, 94 int (*write_emulated) (unsigned long addr,
95 const void *val, 95 const void *val,
96 unsigned int bytes, 96 unsigned int bytes,
97 struct x86_emulate_ctxt * ctxt); 97 struct kvm_vcpu *vcpu);
98 98
99 /* 99 /*
100 * cmpxchg_emulated: Emulate an atomic (LOCKed) CMPXCHG operation on an 100 * cmpxchg_emulated: Emulate an atomic (LOCKed) CMPXCHG operation on an
@@ -108,12 +108,10 @@ struct x86_emulate_ops {
108 const void *old, 108 const void *old,
109 const void *new, 109 const void *new,
110 unsigned int bytes, 110 unsigned int bytes,
111 struct x86_emulate_ctxt * ctxt); 111 struct kvm_vcpu *vcpu);
112 112
113}; 113};
114 114
115struct cpu_user_regs;
116
117struct x86_emulate_ctxt { 115struct x86_emulate_ctxt {
118 /* Register state before/after emulation. */ 116 /* Register state before/after emulation. */
119 struct kvm_vcpu *vcpu; 117 struct kvm_vcpu *vcpu;
@@ -154,12 +152,4 @@ struct x86_emulate_ctxt {
154int x86_emulate_memop(struct x86_emulate_ctxt *ctxt, 152int x86_emulate_memop(struct x86_emulate_ctxt *ctxt,
155 struct x86_emulate_ops *ops); 153 struct x86_emulate_ops *ops);
156 154
157/*
158 * Given the 'reg' portion of a ModRM byte, and a register block, return a
159 * pointer into the block that addresses the relevant register.
160 * @highbyte_regs specifies whether to decode AH,CH,DH,BH.
161 */
162void *decode_register(u8 modrm_reg, unsigned long *regs,
163 int highbyte_regs);
164
165#endif /* __X86_EMULATE_H__ */ 155#endif /* __X86_EMULATE_H__ */
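
With this change every x86_emulate_ops callback receives the struct kvm_vcpu directly instead of the emulation context. A hypothetical backend satisfying the new prototypes, e.g. for a userspace test harness that serves reads and writes from a flat buffer and ignores the vcpu argument (read_emulated, write_emulated and cmpxchg_emulated would follow the same shape; not code from the patch):

#include <string.h>

struct kvm_vcpu;			/* opaque in this sketch */

static unsigned char fake_guest_mem[0x10000];

static int flat_read_std(unsigned long addr, void *val,
			 unsigned int bytes, struct kvm_vcpu *vcpu)
{
	memcpy(val, fake_guest_mem + addr, bytes);
	return 0;			/* X86EMUL_CONTINUE */
}

static int flat_write_std(unsigned long addr, const void *val,
			  unsigned int bytes, struct kvm_vcpu *vcpu)
{
	memcpy(fake_guest_mem + addr, val, bytes);
	return 0;
}

/* These would be plugged into an ops table, e.g.
 *	struct x86_emulate_ops flat_ops = { .read_std = flat_read_std, ... };
 */
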
diff --git a/include/asm-x86/io_apic_32.h b/include/asm-x86/io_apic_32.h
index dbe734ddf2af..3f087883ea48 100644
--- a/include/asm-x86/io_apic_32.h
+++ b/include/asm-x86/io_apic_32.h
@@ -11,8 +11,6 @@
11 * Copyright (C) 1997, 1998, 1999, 2000 Ingo Molnar 11 * Copyright (C) 1997, 1998, 1999, 2000 Ingo Molnar
12 */ 12 */
13 13
14#ifdef CONFIG_X86_IO_APIC
15
16/* 14/*
17 * The structure of the IO-APIC: 15 * The structure of the IO-APIC:
18 */ 16 */
@@ -55,12 +53,6 @@ union IO_APIC_reg_03 {
55 } __attribute__ ((packed)) bits; 53 } __attribute__ ((packed)) bits;
56}; 54};
57 55
58/*
59 * # of IO-APICs and # of IRQ routing registers
60 */
61extern int nr_ioapics;
62extern int nr_ioapic_registers[MAX_IO_APICS];
63
64enum ioapic_irq_destination_types { 56enum ioapic_irq_destination_types {
65 dest_Fixed = 0, 57 dest_Fixed = 0,
66 dest_LowestPrio = 1, 58 dest_LowestPrio = 1,
@@ -100,6 +92,14 @@ struct IO_APIC_route_entry {
100 92
101} __attribute__ ((packed)); 93} __attribute__ ((packed));
102 94
95#ifdef CONFIG_X86_IO_APIC
96
97/*
98 * # of IO-APICs and # of IRQ routing registers
99 */
100extern int nr_ioapics;
101extern int nr_ioapic_registers[MAX_IO_APICS];
102
103/* 103/*
104 * MP-BIOS irq configuration table structures: 104 * MP-BIOS irq configuration table structures:
105 */ 105 */
diff --git a/include/asm-x86/processor-flags.h b/include/asm-x86/processor-flags.h
index 5404e90edd57..199cab107d85 100644
--- a/include/asm-x86/processor-flags.h
+++ b/include/asm-x86/processor-flags.h
@@ -63,7 +63,7 @@
63/* 63/*
64 * x86-64 Task Priority Register, CR8 64 * x86-64 Task Priority Register, CR8
65 */ 65 */
66#define X86_CR8_TPR 0x00000007 /* task priority register */ 66#define X86_CR8_TPR 0x0000000F /* task priority register */
67 67
68/* 68/*
69 * AMD and Transmeta use MSRs for configuration; see <asm/msr-index.h> 69 * AMD and Transmeta use MSRs for configuration; see <asm/msr-index.h>
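
Widening X86_CR8_TPR from 0x7 to 0xF matters because CR8 carries a full 4-bit task priority, which maps to bits 7:4 of the local APIC TPR that the new in-kernel APIC maintains. The conversion, as an illustration (hypothetical helpers, not code from the patch):

static inline unsigned long apic_tpr_to_cr8(unsigned int apic_tpr)
{
	return (apic_tpr >> 4) & 0x0f;		/* X86_CR8_TPR */
}

static inline unsigned int cr8_to_apic_tpr(unsigned long cr8)
{
	return (cr8 & 0x0f) << 4;
}
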
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index e6edca81ab84..057a7f34ee36 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -4,8 +4,7 @@
4/* 4/*
5 * Userspace interface for /dev/kvm - kernel based virtual machine 5 * Userspace interface for /dev/kvm - kernel based virtual machine
6 * 6 *
7 * Note: this interface is considered experimental and may change without 7 * Note: you must update KVM_API_VERSION if you change this interface.
8 * notice.
9 */ 8 */
10 9
11#include <asm/types.h> 10#include <asm/types.h>
@@ -13,14 +12,8 @@
13 12
14#define KVM_API_VERSION 12 13#define KVM_API_VERSION 12
15 14
16/* 15/* Architectural interrupt line count. */
17 * Architectural interrupt line count, and the size of the bitmap needed
18 * to hold them.
19 */
20#define KVM_NR_INTERRUPTS 256 16#define KVM_NR_INTERRUPTS 256
21#define KVM_IRQ_BITMAP_SIZE_BYTES ((KVM_NR_INTERRUPTS + 7) / 8)
22#define KVM_IRQ_BITMAP_SIZE(type) (KVM_IRQ_BITMAP_SIZE_BYTES / sizeof(type))
23
24 17
25/* for KVM_CREATE_MEMORY_REGION */ 18/* for KVM_CREATE_MEMORY_REGION */
26struct kvm_memory_region { 19struct kvm_memory_region {
@@ -41,20 +34,89 @@ struct kvm_memory_alias {
41 __u64 target_phys_addr; 34 __u64 target_phys_addr;
42}; 35};
43 36
44enum kvm_exit_reason { 37/* for KVM_IRQ_LINE */
45 KVM_EXIT_UNKNOWN = 0, 38struct kvm_irq_level {
46 KVM_EXIT_EXCEPTION = 1, 39 /*
47 KVM_EXIT_IO = 2, 40 * ACPI gsi notion of irq.
48 KVM_EXIT_HYPERCALL = 3, 41 * For IA-64 (APIC model) IOAPIC0: irq 0-23; IOAPIC1: irq 24-47..
49 KVM_EXIT_DEBUG = 4, 42 * For X86 (standard AT mode) PIC0/1: irq 0-15. IOAPIC0: 0-23..
50 KVM_EXIT_HLT = 5, 43 */
51 KVM_EXIT_MMIO = 6, 44 __u32 irq;
52 KVM_EXIT_IRQ_WINDOW_OPEN = 7, 45 __u32 level;
53 KVM_EXIT_SHUTDOWN = 8, 46};
54 KVM_EXIT_FAIL_ENTRY = 9, 47
55 KVM_EXIT_INTR = 10, 48/* for KVM_GET_IRQCHIP and KVM_SET_IRQCHIP */
49struct kvm_pic_state {
50 __u8 last_irr; /* edge detection */
51 __u8 irr; /* interrupt request register */
52 __u8 imr; /* interrupt mask register */
53 __u8 isr; /* interrupt service register */
54 __u8 priority_add; /* highest irq priority */
55 __u8 irq_base;
56 __u8 read_reg_select;
57 __u8 poll;
58 __u8 special_mask;
59 __u8 init_state;
60 __u8 auto_eoi;
61 __u8 rotate_on_auto_eoi;
62 __u8 special_fully_nested_mode;
63 __u8 init4; /* true if 4 byte init */
64 __u8 elcr; /* PIIX edge/trigger selection */
65 __u8 elcr_mask;
66};
67
68#define KVM_IOAPIC_NUM_PINS 24
69struct kvm_ioapic_state {
70 __u64 base_address;
71 __u32 ioregsel;
72 __u32 id;
73 __u32 irr;
74 __u32 pad;
75 union {
76 __u64 bits;
77 struct {
78 __u8 vector;
79 __u8 delivery_mode:3;
80 __u8 dest_mode:1;
81 __u8 delivery_status:1;
82 __u8 polarity:1;
83 __u8 remote_irr:1;
84 __u8 trig_mode:1;
85 __u8 mask:1;
86 __u8 reserve:7;
87 __u8 reserved[4];
88 __u8 dest_id;
89 } fields;
90 } redirtbl[KVM_IOAPIC_NUM_PINS];
56}; 91};
57 92
93#define KVM_IRQCHIP_PIC_MASTER 0
94#define KVM_IRQCHIP_PIC_SLAVE 1
95#define KVM_IRQCHIP_IOAPIC 2
96
97struct kvm_irqchip {
98 __u32 chip_id;
99 __u32 pad;
100 union {
101 char dummy[512]; /* reserving space */
102 struct kvm_pic_state pic;
103 struct kvm_ioapic_state ioapic;
104 } chip;
105};
106
107#define KVM_EXIT_UNKNOWN 0
108#define KVM_EXIT_EXCEPTION 1
109#define KVM_EXIT_IO 2
110#define KVM_EXIT_HYPERCALL 3
111#define KVM_EXIT_DEBUG 4
112#define KVM_EXIT_HLT 5
113#define KVM_EXIT_MMIO 6
114#define KVM_EXIT_IRQ_WINDOW_OPEN 7
115#define KVM_EXIT_SHUTDOWN 8
116#define KVM_EXIT_FAIL_ENTRY 9
117#define KVM_EXIT_INTR 10
118#define KVM_EXIT_SET_TPR 11
119
58/* for KVM_RUN, returned by mmap(vcpu_fd, offset=0) */ 120/* for KVM_RUN, returned by mmap(vcpu_fd, offset=0) */
59struct kvm_run { 121struct kvm_run {
60 /* in */ 122 /* in */
@@ -106,11 +168,14 @@ struct kvm_run {
106 } mmio; 168 } mmio;
107 /* KVM_EXIT_HYPERCALL */ 169 /* KVM_EXIT_HYPERCALL */
108 struct { 170 struct {
171 __u64 nr;
109 __u64 args[6]; 172 __u64 args[6];
110 __u64 ret; 173 __u64 ret;
111 __u32 longmode; 174 __u32 longmode;
112 __u32 pad; 175 __u32 pad;
113 } hypercall; 176 } hypercall;
177 /* Fix the size of the union. */
178 char padding[256];
114 }; 179 };
115}; 180};
116 181
@@ -139,6 +204,12 @@ struct kvm_fpu {
139 __u32 pad2; 204 __u32 pad2;
140}; 205};
141 206
207/* for KVM_GET_LAPIC and KVM_SET_LAPIC */
208#define KVM_APIC_REG_SIZE 0x400
209struct kvm_lapic_state {
210 char regs[KVM_APIC_REG_SIZE];
211};
212
142struct kvm_segment { 213struct kvm_segment {
143 __u64 base; 214 __u64 base;
144 __u32 limit; 215 __u32 limit;
@@ -164,7 +235,7 @@ struct kvm_sregs {
164 __u64 cr0, cr2, cr3, cr4, cr8; 235 __u64 cr0, cr2, cr3, cr4, cr8;
165 __u64 efer; 236 __u64 efer;
166 __u64 apic_base; 237 __u64 apic_base;
167 __u64 interrupt_bitmap[KVM_IRQ_BITMAP_SIZE(__u64)]; 238 __u64 interrupt_bitmap[(KVM_NR_INTERRUPTS + 63) / 64];
168}; 239};
169 240
170struct kvm_msr_entry { 241struct kvm_msr_entry {
@@ -272,6 +343,12 @@ struct kvm_signal_mask {
272#define KVM_GET_VCPU_MMAP_SIZE _IO(KVMIO, 0x04) /* in bytes */ 343#define KVM_GET_VCPU_MMAP_SIZE _IO(KVMIO, 0x04) /* in bytes */
273 344
274/* 345/*
346 * Extension capability list.
347 */
348#define KVM_CAP_IRQCHIP 0
349#define KVM_CAP_HLT 1
350
351/*
275 * ioctls for VM fds 352 * ioctls for VM fds
276 */ 353 */
277#define KVM_SET_MEMORY_REGION _IOW(KVMIO, 0x40, struct kvm_memory_region) 354#define KVM_SET_MEMORY_REGION _IOW(KVMIO, 0x40, struct kvm_memory_region)
@@ -282,6 +359,11 @@ struct kvm_signal_mask {
282#define KVM_CREATE_VCPU _IO(KVMIO, 0x41) 359#define KVM_CREATE_VCPU _IO(KVMIO, 0x41)
283#define KVM_GET_DIRTY_LOG _IOW(KVMIO, 0x42, struct kvm_dirty_log) 360#define KVM_GET_DIRTY_LOG _IOW(KVMIO, 0x42, struct kvm_dirty_log)
284#define KVM_SET_MEMORY_ALIAS _IOW(KVMIO, 0x43, struct kvm_memory_alias) 361#define KVM_SET_MEMORY_ALIAS _IOW(KVMIO, 0x43, struct kvm_memory_alias)
362/* Device model IOC */
363#define KVM_CREATE_IRQCHIP _IO(KVMIO, 0x60)
364#define KVM_IRQ_LINE _IOW(KVMIO, 0x61, struct kvm_irq_level)
365#define KVM_GET_IRQCHIP _IOWR(KVMIO, 0x62, struct kvm_irqchip)
366#define KVM_SET_IRQCHIP _IOR(KVMIO, 0x63, struct kvm_irqchip)
285 367
286/* 368/*
287 * ioctls for vcpu fds 369 * ioctls for vcpu fds
@@ -300,5 +382,7 @@ struct kvm_signal_mask {
300#define KVM_SET_SIGNAL_MASK _IOW(KVMIO, 0x8b, struct kvm_signal_mask) 382#define KVM_SET_SIGNAL_MASK _IOW(KVMIO, 0x8b, struct kvm_signal_mask)
301#define KVM_GET_FPU _IOR(KVMIO, 0x8c, struct kvm_fpu) 383#define KVM_GET_FPU _IOR(KVMIO, 0x8c, struct kvm_fpu)
302#define KVM_SET_FPU _IOW(KVMIO, 0x8d, struct kvm_fpu) 384#define KVM_SET_FPU _IOW(KVMIO, 0x8d, struct kvm_fpu)
385#define KVM_GET_LAPIC _IOR(KVMIO, 0x8e, struct kvm_lapic_state)
386#define KVM_SET_LAPIC _IOW(KVMIO, 0x8f, struct kvm_lapic_state)
303 387
304#endif 388#endif
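
Taken together, the new device-model ioctls let userspace create the in-kernel PIC/IOAPIC/LAPIC model, raise interrupt lines, and save or restore irqchip state. A hedged userspace sketch of the basic flow (error handling omitted; KVM_CREATE_VM is the pre-existing VM-creation ioctl and a kernel advertising KVM_CAP_IRQCHIP is assumed; not code from this patch):

#include <fcntl.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

int main(void)
{
	int kvm = open("/dev/kvm", O_RDWR);
	int vm  = ioctl(kvm, KVM_CREATE_VM, 0);

	/* Instantiate the in-kernel PIC, IOAPIC and local APIC model. */
	ioctl(vm, KVM_CREATE_IRQCHIP, 0);

	/* Pulse ISA IRQ 4 (e.g. a serial port interrupt). */
	struct kvm_irq_level irq = { .irq = 4, .level = 1 };
	ioctl(vm, KVM_IRQ_LINE, &irq);
	irq.level = 0;
	ioctl(vm, KVM_IRQ_LINE, &irq);

	/* Snapshot the master PIC, e.g. for save/restore or migration. */
	struct kvm_irqchip chip = { .chip_id = KVM_IRQCHIP_PIC_MASTER };
	ioctl(vm, KVM_GET_IRQCHIP, &chip);

	return 0;
}
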