-rw-r--r--  arch/x86/Kconfig | 3
-rw-r--r--  arch/x86/Makefile | 2
-rw-r--r--  arch/x86/kvm/Kconfig (renamed from drivers/kvm/Kconfig) | 7
-rw-r--r--  arch/x86/kvm/Makefile (renamed from drivers/kvm/Makefile) | 6
-rw-r--r--  arch/x86/kvm/i8259.c (renamed from drivers/kvm/i8259.c) | 8
-rw-r--r--  arch/x86/kvm/irq.c (renamed from drivers/kvm/irq.c) | 22
-rw-r--r--  arch/x86/kvm/irq.h | 88
-rw-r--r--  arch/x86/kvm/kvm_svm.h (renamed from drivers/kvm/kvm_svm.h) | 2
-rw-r--r--  arch/x86/kvm/lapic.c (renamed from drivers/kvm/lapic.c) | 216
-rw-r--r--  arch/x86/kvm/lapic.h | 50
-rw-r--r--  arch/x86/kvm/mmu.c | 1885
-rw-r--r--  arch/x86/kvm/mmu.h | 44
-rw-r--r--  arch/x86/kvm/paging_tmpl.h | 484
-rw-r--r--  arch/x86/kvm/segment_descriptor.h (renamed from drivers/kvm/segment_descriptor.h) | 12
-rw-r--r--  arch/x86/kvm/svm.c (renamed from drivers/kvm/svm.c) | 353
-rw-r--r--  arch/x86/kvm/svm.h (renamed from drivers/kvm/svm.h) | 3
-rw-r--r--  arch/x86/kvm/vmx.c (renamed from drivers/kvm/vmx.c) | 1079
-rw-r--r--  arch/x86/kvm/vmx.h (renamed from drivers/kvm/vmx.h) | 26
-rw-r--r--  arch/x86/kvm/x86.c (renamed from drivers/kvm/kvm_main.c) | 4243
-rw-r--r--  arch/x86/kvm/x86_emulate.c | 1912
-rw-r--r--  drivers/Kconfig | 2
-rw-r--r--  drivers/Makefile | 1
-rw-r--r--  drivers/kvm/irq.h | 165
-rw-r--r--  drivers/kvm/mmu.c | 1498
-rw-r--r--  drivers/kvm/paging_tmpl.h | 511
-rw-r--r--  drivers/kvm/x86_emulate.c | 1662
-rw-r--r--  include/asm-x86/Kbuild | 1
-rw-r--r--  include/asm-x86/kvm.h | 191
-rw-r--r--  include/asm-x86/kvm_host.h (renamed from drivers/kvm/kvm.h) | 537
-rw-r--r--  include/asm-x86/kvm_para.h | 105
-rw-r--r--  include/asm-x86/kvm_x86_emulate.h (renamed from drivers/kvm/x86_emulate.h) | 69
-rw-r--r--  include/linux/Kbuild | 2
-rw-r--r--  include/linux/kvm.h | 203
-rw-r--r--  include/linux/kvm_host.h | 299
-rw-r--r--  include/linux/kvm_para.h | 82
-rw-r--r--  include/linux/kvm_types.h | 54
-rw-r--r--  kernel/fork.c | 1
-rw-r--r--  virt/kvm/ioapic.c (renamed from drivers/kvm/ioapic.c) | 99
-rw-r--r--  virt/kvm/ioapic.h | 95
-rw-r--r--  virt/kvm/iodev.h | 63
-rw-r--r--  virt/kvm/kvm_main.c | 1400
41 files changed, 9938 insertions(+), 7547 deletions(-)
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index fb3eea3e38ee..65b449134cf7 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -107,6 +107,7 @@ config ARCH_SUPPORTS_OPROFILE
 	bool
 	default y
 
+select HAVE_KVM
 
 config ZONE_DMA32
 	bool
@@ -1598,4 +1599,6 @@ source "security/Kconfig"
 
 source "crypto/Kconfig"
 
+source "arch/x86/kvm/Kconfig"
+
 source "lib/Kconfig"
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index b08f18261df6..da8f4129780b 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -7,6 +7,8 @@ else
         KBUILD_DEFCONFIG := $(ARCH)_defconfig
 endif
 
+core-$(CONFIG_KVM) += arch/x86/kvm/
+
 # BITS is used as extension for files which are available in a 32 bit
 # and a 64 bit version to simplify shared Makefiles.
 # e.g.: obj-y += foo_$(BITS).o
diff --git a/drivers/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 656920636cb2..c83e1c9b5129 100644
--- a/drivers/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -1,9 +1,12 @@
 #
 # KVM configuration
 #
+config HAVE_KVM
+	bool
+
 menuconfig VIRTUALIZATION
 	bool "Virtualization"
-	depends on X86
+	depends on HAVE_KVM || X86
 	default y
 	---help---
 	  Say Y here to get to see options for using your Linux host to run other
@@ -16,7 +19,7 @@ if VIRTUALIZATION
 
 config KVM
 	tristate "Kernel-based Virtual Machine (KVM) support"
-	depends on X86 && EXPERIMENTAL
+	depends on HAVE_KVM && EXPERIMENTAL
 	select PREEMPT_NOTIFIERS
 	select ANON_INODES
 	---help---
diff --git a/drivers/kvm/Makefile b/arch/x86/kvm/Makefile
index e5a8f4d3e973..ffdd0b310784 100644
--- a/drivers/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -2,7 +2,11 @@
 # Makefile for Kernel-based Virtual Machine module
 #
 
-kvm-objs := kvm_main.o mmu.o x86_emulate.o i8259.o irq.o lapic.o ioapic.o
+common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o)
+
+EXTRA_CFLAGS += -Ivirt/kvm -Iarch/x86/kvm
+
+kvm-objs := $(common-objs) x86.o mmu.o x86_emulate.o i8259.o irq.o lapic.o
 obj-$(CONFIG_KVM) += kvm.o
 kvm-intel-objs = vmx.o
 obj-$(CONFIG_KVM_INTEL) += kvm-intel.o
diff --git a/drivers/kvm/i8259.c b/arch/x86/kvm/i8259.c
index a679157bc599..ab29cf2def47 100644
--- a/drivers/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -28,6 +28,8 @@
 #include <linux/mm.h>
 #include "irq.h"
 
+#include <linux/kvm_host.h>
+
 /*
  * set irq level. If an edge is detected, then the IRR is set to 1
  */
@@ -181,10 +183,8 @@ int kvm_pic_read_irq(struct kvm_pic *s)
 	return intno;
 }
 
-static void pic_reset(void *opaque)
+void kvm_pic_reset(struct kvm_kpic_state *s)
 {
-	struct kvm_kpic_state *s = opaque;
-
 	s->last_irr = 0;
 	s->irr = 0;
 	s->imr = 0;
@@ -209,7 +209,7 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val)
 	addr &= 1;
 	if (addr == 0) {
 		if (val & 0x10) {
-			pic_reset(s); /* init */
+			kvm_pic_reset(s); /* init */
 			/*
 			 * deassert a pending interrupt
 			 */
diff --git a/drivers/kvm/irq.c b/arch/x86/kvm/irq.c
index 7628c7ff628f..e5714759e97f 100644
--- a/drivers/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -20,8 +20,8 @@
  */
 
 #include <linux/module.h>
+#include <linux/kvm_host.h>
 
-#include "kvm.h"
 #include "irq.h"
 
 /*
@@ -63,26 +63,6 @@ int kvm_cpu_get_interrupt(struct kvm_vcpu *v)
 }
 EXPORT_SYMBOL_GPL(kvm_cpu_get_interrupt);
 
-static void vcpu_kick_intr(void *info)
-{
-#ifdef DEBUG
-	struct kvm_vcpu *vcpu = (struct kvm_vcpu *)info;
-	printk(KERN_DEBUG "vcpu_kick_intr %p \n", vcpu);
-#endif
-}
-
-void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
-{
-	int ipi_pcpu = vcpu->cpu;
-
-	if (waitqueue_active(&vcpu->wq)) {
-		wake_up_interruptible(&vcpu->wq);
-		++vcpu->stat.halt_wakeup;
-	}
-	if (vcpu->guest_mode)
-		smp_call_function_single(ipi_pcpu, vcpu_kick_intr, vcpu, 0, 0);
-}
-
 void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu)
 {
 	kvm_inject_apic_timer_irqs(vcpu);
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
new file mode 100644
index 000000000000..fa5ed5d59b5d
--- /dev/null
+++ b/arch/x86/kvm/irq.h
@@ -0,0 +1,88 @@
1/*
2 * irq.h: in kernel interrupt controller related definitions
3 * Copyright (c) 2007, Intel Corporation.
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms and conditions of the GNU General Public License,
7 * version 2, as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 * more details.
13 *
14 * You should have received a copy of the GNU General Public License along with
15 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
16 * Place - Suite 330, Boston, MA 02111-1307 USA.
17 * Authors:
18 * Yaozu (Eddie) Dong <Eddie.dong@intel.com>
19 *
20 */
21
22#ifndef __IRQ_H
23#define __IRQ_H
24
25#include <linux/mm_types.h>
26#include <linux/hrtimer.h>
27#include <linux/kvm_host.h>
28
29#include "iodev.h"
30#include "ioapic.h"
31#include "lapic.h"
32
33struct kvm;
34struct kvm_vcpu;
35
36typedef void irq_request_func(void *opaque, int level);
37
38struct kvm_kpic_state {
39 u8 last_irr; /* edge detection */
40 u8 irr; /* interrupt request register */
41 u8 imr; /* interrupt mask register */
42 u8 isr; /* interrupt service register */
43 u8 priority_add; /* highest irq priority */
44 u8 irq_base;
45 u8 read_reg_select;
46 u8 poll;
47 u8 special_mask;
48 u8 init_state;
49 u8 auto_eoi;
50 u8 rotate_on_auto_eoi;
51 u8 special_fully_nested_mode;
52 u8 init4; /* true if 4 byte init */
53 u8 elcr; /* PIIX edge/trigger selection */
54 u8 elcr_mask;
55 struct kvm_pic *pics_state;
56};
57
58struct kvm_pic {
59 struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */
60 irq_request_func *irq_request;
61 void *irq_request_opaque;
62 int output; /* intr from master PIC */
63 struct kvm_io_device dev;
64};
65
66struct kvm_pic *kvm_create_pic(struct kvm *kvm);
67void kvm_pic_set_irq(void *opaque, int irq, int level);
68int kvm_pic_read_irq(struct kvm_pic *s);
69void kvm_pic_update_irq(struct kvm_pic *s);
70
71static inline struct kvm_pic *pic_irqchip(struct kvm *kvm)
72{
73 return kvm->arch.vpic;
74}
75
76static inline int irqchip_in_kernel(struct kvm *kvm)
77{
78 return pic_irqchip(kvm) != NULL;
79}
80
81void kvm_pic_reset(struct kvm_kpic_state *s);
82
83void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec);
84void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu);
85void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu);
86void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu);
87
88#endif
diff --git a/drivers/kvm/kvm_svm.h b/arch/x86/kvm/kvm_svm.h
index a0e415daef5b..ecdfe97e4635 100644
--- a/drivers/kvm/kvm_svm.h
+++ b/arch/x86/kvm/kvm_svm.h
@@ -4,10 +4,10 @@
 #include <linux/kernel.h>
 #include <linux/types.h>
 #include <linux/list.h>
+#include <linux/kvm_host.h>
 #include <asm/msr.h>
 
 #include "svm.h"
-#include "kvm.h"
 
 static const u32 host_save_user_msrs[] = {
 #ifdef CONFIG_X86_64
diff --git a/drivers/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 238fcad3cece..2cbee9479ce4 100644
--- a/drivers/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -17,7 +17,7 @@
17 * the COPYING file in the top-level directory. 17 * the COPYING file in the top-level directory.
18 */ 18 */
19 19
20#include "kvm.h" 20#include <linux/kvm_host.h>
21#include <linux/kvm.h> 21#include <linux/kvm.h>
22#include <linux/mm.h> 22#include <linux/mm.h>
23#include <linux/highmem.h> 23#include <linux/highmem.h>
@@ -56,6 +56,7 @@
56 56
57#define VEC_POS(v) ((v) & (32 - 1)) 57#define VEC_POS(v) ((v) & (32 - 1))
58#define REG_POS(v) (((v) >> 5) << 4) 58#define REG_POS(v) (((v) >> 5) << 4)
59
59static inline u32 apic_get_reg(struct kvm_lapic *apic, int reg_off) 60static inline u32 apic_get_reg(struct kvm_lapic *apic, int reg_off)
60{ 61{
61 return *((u32 *) (apic->regs + reg_off)); 62 return *((u32 *) (apic->regs + reg_off));
@@ -88,7 +89,7 @@ static inline void apic_clear_vector(int vec, void *bitmap)
88 89
89static inline int apic_hw_enabled(struct kvm_lapic *apic) 90static inline int apic_hw_enabled(struct kvm_lapic *apic)
90{ 91{
91 return (apic)->vcpu->apic_base & MSR_IA32_APICBASE_ENABLE; 92 return (apic)->vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE;
92} 93}
93 94
94static inline int apic_sw_enabled(struct kvm_lapic *apic) 95static inline int apic_sw_enabled(struct kvm_lapic *apic)
@@ -172,7 +173,7 @@ static inline int apic_find_highest_irr(struct kvm_lapic *apic)
172 173
173int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu) 174int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu)
174{ 175{
175 struct kvm_lapic *apic = (struct kvm_lapic *)vcpu->apic; 176 struct kvm_lapic *apic = vcpu->arch.apic;
176 int highest_irr; 177 int highest_irr;
177 178
178 if (!apic) 179 if (!apic)
@@ -183,8 +184,10 @@ int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu)
183} 184}
184EXPORT_SYMBOL_GPL(kvm_lapic_find_highest_irr); 185EXPORT_SYMBOL_GPL(kvm_lapic_find_highest_irr);
185 186
186int kvm_apic_set_irq(struct kvm_lapic *apic, u8 vec, u8 trig) 187int kvm_apic_set_irq(struct kvm_vcpu *vcpu, u8 vec, u8 trig)
187{ 188{
189 struct kvm_lapic *apic = vcpu->arch.apic;
190
188 if (!apic_test_and_set_irr(vec, apic)) { 191 if (!apic_test_and_set_irr(vec, apic)) {
189 /* a new pending irq is set in IRR */ 192 /* a new pending irq is set in IRR */
190 if (trig) 193 if (trig)
@@ -268,7 +271,7 @@ static int apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
268 int short_hand, int dest, int dest_mode) 271 int short_hand, int dest, int dest_mode)
269{ 272{
270 int result = 0; 273 int result = 0;
271 struct kvm_lapic *target = vcpu->apic; 274 struct kvm_lapic *target = vcpu->arch.apic;
272 275
273 apic_debug("target %p, source %p, dest 0x%x, " 276 apic_debug("target %p, source %p, dest 0x%x, "
274 "dest_mode 0x%x, short_hand 0x%x", 277 "dest_mode 0x%x, short_hand 0x%x",
@@ -335,10 +338,10 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
335 } else 338 } else
336 apic_clear_vector(vector, apic->regs + APIC_TMR); 339 apic_clear_vector(vector, apic->regs + APIC_TMR);
337 340
338 if (vcpu->mp_state == VCPU_MP_STATE_RUNNABLE) 341 if (vcpu->arch.mp_state == VCPU_MP_STATE_RUNNABLE)
339 kvm_vcpu_kick(vcpu); 342 kvm_vcpu_kick(vcpu);
340 else if (vcpu->mp_state == VCPU_MP_STATE_HALTED) { 343 else if (vcpu->arch.mp_state == VCPU_MP_STATE_HALTED) {
341 vcpu->mp_state = VCPU_MP_STATE_RUNNABLE; 344 vcpu->arch.mp_state = VCPU_MP_STATE_RUNNABLE;
342 if (waitqueue_active(&vcpu->wq)) 345 if (waitqueue_active(&vcpu->wq))
343 wake_up_interruptible(&vcpu->wq); 346 wake_up_interruptible(&vcpu->wq);
344 } 347 }
@@ -359,11 +362,11 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
359 362
360 case APIC_DM_INIT: 363 case APIC_DM_INIT:
361 if (level) { 364 if (level) {
362 if (vcpu->mp_state == VCPU_MP_STATE_RUNNABLE) 365 if (vcpu->arch.mp_state == VCPU_MP_STATE_RUNNABLE)
363 printk(KERN_DEBUG 366 printk(KERN_DEBUG
364 "INIT on a runnable vcpu %d\n", 367 "INIT on a runnable vcpu %d\n",
365 vcpu->vcpu_id); 368 vcpu->vcpu_id);
366 vcpu->mp_state = VCPU_MP_STATE_INIT_RECEIVED; 369 vcpu->arch.mp_state = VCPU_MP_STATE_INIT_RECEIVED;
367 kvm_vcpu_kick(vcpu); 370 kvm_vcpu_kick(vcpu);
368 } else { 371 } else {
369 printk(KERN_DEBUG 372 printk(KERN_DEBUG
@@ -376,9 +379,9 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
376 case APIC_DM_STARTUP: 379 case APIC_DM_STARTUP:
377 printk(KERN_DEBUG "SIPI to vcpu %d vector 0x%02x\n", 380 printk(KERN_DEBUG "SIPI to vcpu %d vector 0x%02x\n",
378 vcpu->vcpu_id, vector); 381 vcpu->vcpu_id, vector);
379 if (vcpu->mp_state == VCPU_MP_STATE_INIT_RECEIVED) { 382 if (vcpu->arch.mp_state == VCPU_MP_STATE_INIT_RECEIVED) {
380 vcpu->sipi_vector = vector; 383 vcpu->arch.sipi_vector = vector;
381 vcpu->mp_state = VCPU_MP_STATE_SIPI_RECEIVED; 384 vcpu->arch.mp_state = VCPU_MP_STATE_SIPI_RECEIVED;
382 if (waitqueue_active(&vcpu->wq)) 385 if (waitqueue_active(&vcpu->wq))
383 wake_up_interruptible(&vcpu->wq); 386 wake_up_interruptible(&vcpu->wq);
384 } 387 }
@@ -392,15 +395,14 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
392 return result; 395 return result;
393} 396}
394 397
395struct kvm_lapic *kvm_apic_round_robin(struct kvm *kvm, u8 vector, 398static struct kvm_lapic *kvm_apic_round_robin(struct kvm *kvm, u8 vector,
396 unsigned long bitmap) 399 unsigned long bitmap)
397{ 400{
398 int vcpu_id;
399 int last; 401 int last;
400 int next; 402 int next;
401 struct kvm_lapic *apic; 403 struct kvm_lapic *apic = NULL;
402 404
403 last = kvm->round_robin_prev_vcpu; 405 last = kvm->arch.round_robin_prev_vcpu;
404 next = last; 406 next = last;
405 407
406 do { 408 do {
@@ -408,25 +410,30 @@ struct kvm_lapic *kvm_apic_round_robin(struct kvm *kvm, u8 vector,
408 next = 0; 410 next = 0;
409 if (kvm->vcpus[next] == NULL || !test_bit(next, &bitmap)) 411 if (kvm->vcpus[next] == NULL || !test_bit(next, &bitmap))
410 continue; 412 continue;
411 apic = kvm->vcpus[next]->apic; 413 apic = kvm->vcpus[next]->arch.apic;
412 if (apic && apic_enabled(apic)) 414 if (apic && apic_enabled(apic))
413 break; 415 break;
414 apic = NULL; 416 apic = NULL;
415 } while (next != last); 417 } while (next != last);
416 kvm->round_robin_prev_vcpu = next; 418 kvm->arch.round_robin_prev_vcpu = next;
417 419
418 if (!apic) { 420 if (!apic)
419 vcpu_id = ffs(bitmap) - 1; 421 printk(KERN_DEBUG "vcpu not ready for apic_round_robin\n");
420 if (vcpu_id < 0) {
421 vcpu_id = 0;
422 printk(KERN_DEBUG "vcpu not ready for apic_round_robin\n");
423 }
424 apic = kvm->vcpus[vcpu_id]->apic;
425 }
426 422
427 return apic; 423 return apic;
428} 424}
429 425
426struct kvm_vcpu *kvm_get_lowest_prio_vcpu(struct kvm *kvm, u8 vector,
427 unsigned long bitmap)
428{
429 struct kvm_lapic *apic;
430
431 apic = kvm_apic_round_robin(kvm, vector, bitmap);
432 if (apic)
433 return apic->vcpu;
434 return NULL;
435}
436
430static void apic_set_eoi(struct kvm_lapic *apic) 437static void apic_set_eoi(struct kvm_lapic *apic)
431{ 438{
432 int vector = apic_find_highest_isr(apic); 439 int vector = apic_find_highest_isr(apic);
@@ -458,7 +465,7 @@ static void apic_send_ipi(struct kvm_lapic *apic)
458 unsigned int delivery_mode = icr_low & APIC_MODE_MASK; 465 unsigned int delivery_mode = icr_low & APIC_MODE_MASK;
459 unsigned int vector = icr_low & APIC_VECTOR_MASK; 466 unsigned int vector = icr_low & APIC_VECTOR_MASK;
460 467
461 struct kvm_lapic *target; 468 struct kvm_vcpu *target;
462 struct kvm_vcpu *vcpu; 469 struct kvm_vcpu *vcpu;
463 unsigned long lpr_map = 0; 470 unsigned long lpr_map = 0;
464 int i; 471 int i;
@@ -474,20 +481,20 @@ static void apic_send_ipi(struct kvm_lapic *apic)
474 if (!vcpu) 481 if (!vcpu)
475 continue; 482 continue;
476 483
477 if (vcpu->apic && 484 if (vcpu->arch.apic &&
478 apic_match_dest(vcpu, apic, short_hand, dest, dest_mode)) { 485 apic_match_dest(vcpu, apic, short_hand, dest, dest_mode)) {
479 if (delivery_mode == APIC_DM_LOWEST) 486 if (delivery_mode == APIC_DM_LOWEST)
480 set_bit(vcpu->vcpu_id, &lpr_map); 487 set_bit(vcpu->vcpu_id, &lpr_map);
481 else 488 else
482 __apic_accept_irq(vcpu->apic, delivery_mode, 489 __apic_accept_irq(vcpu->arch.apic, delivery_mode,
483 vector, level, trig_mode); 490 vector, level, trig_mode);
484 } 491 }
485 } 492 }
486 493
487 if (delivery_mode == APIC_DM_LOWEST) { 494 if (delivery_mode == APIC_DM_LOWEST) {
488 target = kvm_apic_round_robin(vcpu->kvm, vector, lpr_map); 495 target = kvm_get_lowest_prio_vcpu(vcpu->kvm, vector, lpr_map);
489 if (target != NULL) 496 if (target != NULL)
490 __apic_accept_irq(target, delivery_mode, 497 __apic_accept_irq(target->arch.apic, delivery_mode,
491 vector, level, trig_mode); 498 vector, level, trig_mode);
492 } 499 }
493} 500}
@@ -544,6 +551,23 @@ static u32 apic_get_tmcct(struct kvm_lapic *apic)
544 return tmcct; 551 return tmcct;
545} 552}
546 553
554static void __report_tpr_access(struct kvm_lapic *apic, bool write)
555{
556 struct kvm_vcpu *vcpu = apic->vcpu;
557 struct kvm_run *run = vcpu->run;
558
559 set_bit(KVM_REQ_REPORT_TPR_ACCESS, &vcpu->requests);
560 kvm_x86_ops->cache_regs(vcpu);
561 run->tpr_access.rip = vcpu->arch.rip;
562 run->tpr_access.is_write = write;
563}
564
565static inline void report_tpr_access(struct kvm_lapic *apic, bool write)
566{
567 if (apic->vcpu->arch.tpr_access_reporting)
568 __report_tpr_access(apic, write);
569}
570
547static u32 __apic_read(struct kvm_lapic *apic, unsigned int offset) 571static u32 __apic_read(struct kvm_lapic *apic, unsigned int offset)
548{ 572{
549 u32 val = 0; 573 u32 val = 0;
@@ -561,6 +585,9 @@ static u32 __apic_read(struct kvm_lapic *apic, unsigned int offset)
561 val = apic_get_tmcct(apic); 585 val = apic_get_tmcct(apic);
562 break; 586 break;
563 587
588 case APIC_TASKPRI:
589 report_tpr_access(apic, false);
590 /* fall thru */
564 default: 591 default:
565 apic_update_ppr(apic); 592 apic_update_ppr(apic);
566 val = apic_get_reg(apic, offset); 593 val = apic_get_reg(apic, offset);
@@ -670,6 +697,7 @@ static void apic_mmio_write(struct kvm_io_device *this,
670 break; 697 break;
671 698
672 case APIC_TASKPRI: 699 case APIC_TASKPRI:
700 report_tpr_access(apic, true);
673 apic_set_tpr(apic, val & 0xff); 701 apic_set_tpr(apic, val & 0xff);
674 break; 702 break;
675 703
@@ -762,19 +790,17 @@ static int apic_mmio_range(struct kvm_io_device *this, gpa_t addr)
762 return ret; 790 return ret;
763} 791}
764 792
765void kvm_free_apic(struct kvm_lapic *apic) 793void kvm_free_lapic(struct kvm_vcpu *vcpu)
766{ 794{
767 if (!apic) 795 if (!vcpu->arch.apic)
768 return; 796 return;
769 797
770 hrtimer_cancel(&apic->timer.dev); 798 hrtimer_cancel(&vcpu->arch.apic->timer.dev);
771 799
772 if (apic->regs_page) { 800 if (vcpu->arch.apic->regs_page)
773 __free_page(apic->regs_page); 801 __free_page(vcpu->arch.apic->regs_page);
774 apic->regs_page = 0;
775 }
776 802
777 kfree(apic); 803 kfree(vcpu->arch.apic);
778} 804}
779 805
780/* 806/*
@@ -785,16 +811,17 @@ void kvm_free_apic(struct kvm_lapic *apic)
785 811
786void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8) 812void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8)
787{ 813{
788 struct kvm_lapic *apic = (struct kvm_lapic *)vcpu->apic; 814 struct kvm_lapic *apic = vcpu->arch.apic;
789 815
790 if (!apic) 816 if (!apic)
791 return; 817 return;
792 apic_set_tpr(apic, ((cr8 & 0x0f) << 4)); 818 apic_set_tpr(apic, ((cr8 & 0x0f) << 4)
819 | (apic_get_reg(apic, APIC_TASKPRI) & 4));
793} 820}
794 821
795u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu) 822u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu)
796{ 823{
797 struct kvm_lapic *apic = (struct kvm_lapic *)vcpu->apic; 824 struct kvm_lapic *apic = vcpu->arch.apic;
798 u64 tpr; 825 u64 tpr;
799 826
800 if (!apic) 827 if (!apic)
@@ -807,29 +834,29 @@ EXPORT_SYMBOL_GPL(kvm_lapic_get_cr8);
807 834
808void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) 835void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
809{ 836{
810 struct kvm_lapic *apic = (struct kvm_lapic *)vcpu->apic; 837 struct kvm_lapic *apic = vcpu->arch.apic;
811 838
812 if (!apic) { 839 if (!apic) {
813 value |= MSR_IA32_APICBASE_BSP; 840 value |= MSR_IA32_APICBASE_BSP;
814 vcpu->apic_base = value; 841 vcpu->arch.apic_base = value;
815 return; 842 return;
816 } 843 }
817 if (apic->vcpu->vcpu_id) 844 if (apic->vcpu->vcpu_id)
818 value &= ~MSR_IA32_APICBASE_BSP; 845 value &= ~MSR_IA32_APICBASE_BSP;
819 846
820 vcpu->apic_base = value; 847 vcpu->arch.apic_base = value;
821 apic->base_address = apic->vcpu->apic_base & 848 apic->base_address = apic->vcpu->arch.apic_base &
822 MSR_IA32_APICBASE_BASE; 849 MSR_IA32_APICBASE_BASE;
823 850
824 /* with FSB delivery interrupt, we can restart APIC functionality */ 851 /* with FSB delivery interrupt, we can restart APIC functionality */
825 apic_debug("apic base msr is 0x%016" PRIx64 ", and base address is " 852 apic_debug("apic base msr is 0x%016" PRIx64 ", and base address is "
826 "0x%lx.\n", apic->apic_base, apic->base_address); 853 "0x%lx.\n", apic->vcpu->arch.apic_base, apic->base_address);
827 854
828} 855}
829 856
830u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu) 857u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu)
831{ 858{
832 return vcpu->apic_base; 859 return vcpu->arch.apic_base;
833} 860}
834EXPORT_SYMBOL_GPL(kvm_lapic_get_base); 861EXPORT_SYMBOL_GPL(kvm_lapic_get_base);
835 862
@@ -841,7 +868,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
841 apic_debug("%s\n", __FUNCTION__); 868 apic_debug("%s\n", __FUNCTION__);
842 869
843 ASSERT(vcpu); 870 ASSERT(vcpu);
844 apic = vcpu->apic; 871 apic = vcpu->arch.apic;
845 ASSERT(apic != NULL); 872 ASSERT(apic != NULL);
846 873
847 /* Stop the timer in case it's a reset to an active apic */ 874 /* Stop the timer in case it's a reset to an active apic */
@@ -872,19 +899,19 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
872 update_divide_count(apic); 899 update_divide_count(apic);
873 atomic_set(&apic->timer.pending, 0); 900 atomic_set(&apic->timer.pending, 0);
874 if (vcpu->vcpu_id == 0) 901 if (vcpu->vcpu_id == 0)
875 vcpu->apic_base |= MSR_IA32_APICBASE_BSP; 902 vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP;
876 apic_update_ppr(apic); 903 apic_update_ppr(apic);
877 904
878 apic_debug(KERN_INFO "%s: vcpu=%p, id=%d, base_msr=" 905 apic_debug(KERN_INFO "%s: vcpu=%p, id=%d, base_msr="
879 "0x%016" PRIx64 ", base_address=0x%0lx.\n", __FUNCTION__, 906 "0x%016" PRIx64 ", base_address=0x%0lx.\n", __FUNCTION__,
880 vcpu, kvm_apic_id(apic), 907 vcpu, kvm_apic_id(apic),
881 vcpu->apic_base, apic->base_address); 908 vcpu->arch.apic_base, apic->base_address);
882} 909}
883EXPORT_SYMBOL_GPL(kvm_lapic_reset); 910EXPORT_SYMBOL_GPL(kvm_lapic_reset);
884 911
885int kvm_lapic_enabled(struct kvm_vcpu *vcpu) 912int kvm_lapic_enabled(struct kvm_vcpu *vcpu)
886{ 913{
887 struct kvm_lapic *apic = (struct kvm_lapic *)vcpu->apic; 914 struct kvm_lapic *apic = vcpu->arch.apic;
888 int ret = 0; 915 int ret = 0;
889 916
890 if (!apic) 917 if (!apic)
@@ -908,9 +935,8 @@ static int __apic_timer_fn(struct kvm_lapic *apic)
908 wait_queue_head_t *q = &apic->vcpu->wq; 935 wait_queue_head_t *q = &apic->vcpu->wq;
909 936
910 atomic_inc(&apic->timer.pending); 937 atomic_inc(&apic->timer.pending);
911 if (waitqueue_active(q)) 938 if (waitqueue_active(q)) {
912 { 939 apic->vcpu->arch.mp_state = VCPU_MP_STATE_RUNNABLE;
913 apic->vcpu->mp_state = VCPU_MP_STATE_RUNNABLE;
914 wake_up_interruptible(q); 940 wake_up_interruptible(q);
915 } 941 }
916 if (apic_lvtt_period(apic)) { 942 if (apic_lvtt_period(apic)) {
@@ -956,13 +982,13 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu)
956 if (!apic) 982 if (!apic)
957 goto nomem; 983 goto nomem;
958 984
959 vcpu->apic = apic; 985 vcpu->arch.apic = apic;
960 986
961 apic->regs_page = alloc_page(GFP_KERNEL); 987 apic->regs_page = alloc_page(GFP_KERNEL);
962 if (apic->regs_page == NULL) { 988 if (apic->regs_page == NULL) {
963 printk(KERN_ERR "malloc apic regs error for vcpu %x\n", 989 printk(KERN_ERR "malloc apic regs error for vcpu %x\n",
964 vcpu->vcpu_id); 990 vcpu->vcpu_id);
965 goto nomem; 991 goto nomem_free_apic;
966 } 992 }
967 apic->regs = page_address(apic->regs_page); 993 apic->regs = page_address(apic->regs_page);
968 memset(apic->regs, 0, PAGE_SIZE); 994 memset(apic->regs, 0, PAGE_SIZE);
@@ -971,7 +997,7 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu)
971 hrtimer_init(&apic->timer.dev, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); 997 hrtimer_init(&apic->timer.dev, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
972 apic->timer.dev.function = apic_timer_fn; 998 apic->timer.dev.function = apic_timer_fn;
973 apic->base_address = APIC_DEFAULT_PHYS_BASE; 999 apic->base_address = APIC_DEFAULT_PHYS_BASE;
974 vcpu->apic_base = APIC_DEFAULT_PHYS_BASE; 1000 vcpu->arch.apic_base = APIC_DEFAULT_PHYS_BASE;
975 1001
976 kvm_lapic_reset(vcpu); 1002 kvm_lapic_reset(vcpu);
977 apic->dev.read = apic_mmio_read; 1003 apic->dev.read = apic_mmio_read;
@@ -980,15 +1006,16 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu)
980 apic->dev.private = apic; 1006 apic->dev.private = apic;
981 1007
982 return 0; 1008 return 0;
1009nomem_free_apic:
1010 kfree(apic);
983nomem: 1011nomem:
984 kvm_free_apic(apic);
985 return -ENOMEM; 1012 return -ENOMEM;
986} 1013}
987EXPORT_SYMBOL_GPL(kvm_create_lapic); 1014EXPORT_SYMBOL_GPL(kvm_create_lapic);
988 1015
989int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu) 1016int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu)
990{ 1017{
991 struct kvm_lapic *apic = vcpu->apic; 1018 struct kvm_lapic *apic = vcpu->arch.apic;
992 int highest_irr; 1019 int highest_irr;
993 1020
994 if (!apic || !apic_enabled(apic)) 1021 if (!apic || !apic_enabled(apic))
@@ -1004,11 +1031,11 @@ int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu)
1004 1031
1005int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu) 1032int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu)
1006{ 1033{
1007 u32 lvt0 = apic_get_reg(vcpu->apic, APIC_LVT0); 1034 u32 lvt0 = apic_get_reg(vcpu->arch.apic, APIC_LVT0);
1008 int r = 0; 1035 int r = 0;
1009 1036
1010 if (vcpu->vcpu_id == 0) { 1037 if (vcpu->vcpu_id == 0) {
1011 if (!apic_hw_enabled(vcpu->apic)) 1038 if (!apic_hw_enabled(vcpu->arch.apic))
1012 r = 1; 1039 r = 1;
1013 if ((lvt0 & APIC_LVT_MASKED) == 0 && 1040 if ((lvt0 & APIC_LVT_MASKED) == 0 &&
1014 GET_APIC_DELIVERY_MODE(lvt0) == APIC_MODE_EXTINT) 1041 GET_APIC_DELIVERY_MODE(lvt0) == APIC_MODE_EXTINT)
@@ -1019,7 +1046,7 @@ int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu)
1019 1046
1020void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu) 1047void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu)
1021{ 1048{
1022 struct kvm_lapic *apic = vcpu->apic; 1049 struct kvm_lapic *apic = vcpu->arch.apic;
1023 1050
1024 if (apic && apic_lvt_enabled(apic, APIC_LVTT) && 1051 if (apic && apic_lvt_enabled(apic, APIC_LVTT) &&
1025 atomic_read(&apic->timer.pending) > 0) { 1052 atomic_read(&apic->timer.pending) > 0) {
@@ -1030,7 +1057,7 @@ void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu)
1030 1057
1031void kvm_apic_timer_intr_post(struct kvm_vcpu *vcpu, int vec) 1058void kvm_apic_timer_intr_post(struct kvm_vcpu *vcpu, int vec)
1032{ 1059{
1033 struct kvm_lapic *apic = vcpu->apic; 1060 struct kvm_lapic *apic = vcpu->arch.apic;
1034 1061
1035 if (apic && apic_lvt_vector(apic, APIC_LVTT) == vec) 1062 if (apic && apic_lvt_vector(apic, APIC_LVTT) == vec)
1036 apic->timer.last_update = ktime_add_ns( 1063 apic->timer.last_update = ktime_add_ns(
@@ -1041,7 +1068,7 @@ void kvm_apic_timer_intr_post(struct kvm_vcpu *vcpu, int vec)
1041int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu) 1068int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu)
1042{ 1069{
1043 int vector = kvm_apic_has_interrupt(vcpu); 1070 int vector = kvm_apic_has_interrupt(vcpu);
1044 struct kvm_lapic *apic = vcpu->apic; 1071 struct kvm_lapic *apic = vcpu->arch.apic;
1045 1072
1046 if (vector == -1) 1073 if (vector == -1)
1047 return -1; 1074 return -1;
@@ -1054,9 +1081,9 @@ int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu)
1054 1081
1055void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu) 1082void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu)
1056{ 1083{
1057 struct kvm_lapic *apic = vcpu->apic; 1084 struct kvm_lapic *apic = vcpu->arch.apic;
1058 1085
1059 apic->base_address = vcpu->apic_base & 1086 apic->base_address = vcpu->arch.apic_base &
1060 MSR_IA32_APICBASE_BASE; 1087 MSR_IA32_APICBASE_BASE;
1061 apic_set_reg(apic, APIC_LVR, APIC_VERSION); 1088 apic_set_reg(apic, APIC_LVR, APIC_VERSION);
1062 apic_update_ppr(apic); 1089 apic_update_ppr(apic);
@@ -1065,9 +1092,9 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu)
1065 start_apic_timer(apic); 1092 start_apic_timer(apic);
1066} 1093}
1067 1094
1068void kvm_migrate_apic_timer(struct kvm_vcpu *vcpu) 1095void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
1069{ 1096{
1070 struct kvm_lapic *apic = vcpu->apic; 1097 struct kvm_lapic *apic = vcpu->arch.apic;
1071 struct hrtimer *timer; 1098 struct hrtimer *timer;
1072 1099
1073 if (!apic) 1100 if (!apic)
@@ -1077,4 +1104,51 @@ void kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
1077 if (hrtimer_cancel(timer)) 1104 if (hrtimer_cancel(timer))
1078 hrtimer_start(timer, timer->expires, HRTIMER_MODE_ABS); 1105 hrtimer_start(timer, timer->expires, HRTIMER_MODE_ABS);
1079} 1106}
1080EXPORT_SYMBOL_GPL(kvm_migrate_apic_timer); 1107
1108void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu)
1109{
1110 u32 data;
1111 void *vapic;
1112
1113 if (!irqchip_in_kernel(vcpu->kvm) || !vcpu->arch.apic->vapic_addr)
1114 return;
1115
1116 vapic = kmap_atomic(vcpu->arch.apic->vapic_page, KM_USER0);
1117 data = *(u32 *)(vapic + offset_in_page(vcpu->arch.apic->vapic_addr));
1118 kunmap_atomic(vapic, KM_USER0);
1119
1120 apic_set_tpr(vcpu->arch.apic, data & 0xff);
1121}
1122
1123void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu)
1124{
1125 u32 data, tpr;
1126 int max_irr, max_isr;
1127 struct kvm_lapic *apic;
1128 void *vapic;
1129
1130 if (!irqchip_in_kernel(vcpu->kvm) || !vcpu->arch.apic->vapic_addr)
1131 return;
1132
1133 apic = vcpu->arch.apic;
1134 tpr = apic_get_reg(apic, APIC_TASKPRI) & 0xff;
1135 max_irr = apic_find_highest_irr(apic);
1136 if (max_irr < 0)
1137 max_irr = 0;
1138 max_isr = apic_find_highest_isr(apic);
1139 if (max_isr < 0)
1140 max_isr = 0;
1141 data = (tpr & 0xff) | ((max_isr & 0xf0) << 8) | (max_irr << 24);
1142
1143 vapic = kmap_atomic(vcpu->arch.apic->vapic_page, KM_USER0);
1144 *(u32 *)(vapic + offset_in_page(vcpu->arch.apic->vapic_addr)) = data;
1145 kunmap_atomic(vapic, KM_USER0);
1146}
1147
1148void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr)
1149{
1150 if (!irqchip_in_kernel(vcpu->kvm))
1151 return;
1152
1153 vcpu->arch.apic->vapic_addr = vapic_addr;
1154}
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
new file mode 100644
index 000000000000..676c396c9cee
--- /dev/null
+++ b/arch/x86/kvm/lapic.h
@@ -0,0 +1,50 @@
1#ifndef __KVM_X86_LAPIC_H
2#define __KVM_X86_LAPIC_H
3
4#include "iodev.h"
5
6#include <linux/kvm_host.h>
7
8struct kvm_lapic {
9 unsigned long base_address;
10 struct kvm_io_device dev;
11 struct {
12 atomic_t pending;
13 s64 period; /* unit: ns */
14 u32 divide_count;
15 ktime_t last_update;
16 struct hrtimer dev;
17 } timer;
18 struct kvm_vcpu *vcpu;
19 struct page *regs_page;
20 void *regs;
21 gpa_t vapic_addr;
22 struct page *vapic_page;
23};
24int kvm_create_lapic(struct kvm_vcpu *vcpu);
25void kvm_free_lapic(struct kvm_vcpu *vcpu);
26
27int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu);
28int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu);
29int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu);
30void kvm_lapic_reset(struct kvm_vcpu *vcpu);
31u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu);
32void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8);
33void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value);
34
35int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest);
36int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda);
37int kvm_apic_set_irq(struct kvm_vcpu *vcpu, u8 vec, u8 trig);
38
39u64 kvm_get_apic_base(struct kvm_vcpu *vcpu);
40void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data);
41void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu);
42int kvm_lapic_enabled(struct kvm_vcpu *vcpu);
43int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu);
44void kvm_apic_timer_intr_post(struct kvm_vcpu *vcpu, int vec);
45
46void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr);
47void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu);
48void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu);
49
50#endif
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
new file mode 100644
index 000000000000..8efdcdbebb03
--- /dev/null
+++ b/arch/x86/kvm/mmu.c
@@ -0,0 +1,1885 @@
1/*
2 * Kernel-based Virtual Machine driver for Linux
3 *
4 * This module enables machines with Intel VT-x extensions to run virtual
5 * machines without emulation or binary translation.
6 *
7 * MMU support
8 *
9 * Copyright (C) 2006 Qumranet, Inc.
10 *
11 * Authors:
12 * Yaniv Kamay <yaniv@qumranet.com>
13 * Avi Kivity <avi@qumranet.com>
14 *
15 * This work is licensed under the terms of the GNU GPL, version 2. See
16 * the COPYING file in the top-level directory.
17 *
18 */
19
20#include "vmx.h"
21#include "mmu.h"
22
23#include <linux/kvm_host.h>
24#include <linux/types.h>
25#include <linux/string.h>
26#include <linux/mm.h>
27#include <linux/highmem.h>
28#include <linux/module.h>
29#include <linux/swap.h>
30
31#include <asm/page.h>
32#include <asm/cmpxchg.h>
33#include <asm/io.h>
34
35#undef MMU_DEBUG
36
37#undef AUDIT
38
39#ifdef AUDIT
40static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg);
41#else
42static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) {}
43#endif
44
45#ifdef MMU_DEBUG
46
47#define pgprintk(x...) do { if (dbg) printk(x); } while (0)
48#define rmap_printk(x...) do { if (dbg) printk(x); } while (0)
49
50#else
51
52#define pgprintk(x...) do { } while (0)
53#define rmap_printk(x...) do { } while (0)
54
55#endif
56
57#if defined(MMU_DEBUG) || defined(AUDIT)
58static int dbg = 1;
59#endif
60
61#ifndef MMU_DEBUG
62#define ASSERT(x) do { } while (0)
63#else
64#define ASSERT(x) \
65 if (!(x)) { \
66 printk(KERN_WARNING "assertion failed %s:%d: %s\n", \
67 __FILE__, __LINE__, #x); \
68 }
69#endif
70
71#define PT64_PT_BITS 9
72#define PT64_ENT_PER_PAGE (1 << PT64_PT_BITS)
73#define PT32_PT_BITS 10
74#define PT32_ENT_PER_PAGE (1 << PT32_PT_BITS)
75
76#define PT_WRITABLE_SHIFT 1
77
78#define PT_PRESENT_MASK (1ULL << 0)
79#define PT_WRITABLE_MASK (1ULL << PT_WRITABLE_SHIFT)
80#define PT_USER_MASK (1ULL << 2)
81#define PT_PWT_MASK (1ULL << 3)
82#define PT_PCD_MASK (1ULL << 4)
83#define PT_ACCESSED_MASK (1ULL << 5)
84#define PT_DIRTY_MASK (1ULL << 6)
85#define PT_PAGE_SIZE_MASK (1ULL << 7)
86#define PT_PAT_MASK (1ULL << 7)
87#define PT_GLOBAL_MASK (1ULL << 8)
88#define PT64_NX_SHIFT 63
89#define PT64_NX_MASK (1ULL << PT64_NX_SHIFT)
90
91#define PT_PAT_SHIFT 7
92#define PT_DIR_PAT_SHIFT 12
93#define PT_DIR_PAT_MASK (1ULL << PT_DIR_PAT_SHIFT)
94
95#define PT32_DIR_PSE36_SIZE 4
96#define PT32_DIR_PSE36_SHIFT 13
97#define PT32_DIR_PSE36_MASK \
98 (((1ULL << PT32_DIR_PSE36_SIZE) - 1) << PT32_DIR_PSE36_SHIFT)
99
100
101#define PT_FIRST_AVAIL_BITS_SHIFT 9
102#define PT64_SECOND_AVAIL_BITS_SHIFT 52
103
104#define PT_SHADOW_IO_MARK (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
105
106#define VALID_PAGE(x) ((x) != INVALID_PAGE)
107
108#define PT64_LEVEL_BITS 9
109
110#define PT64_LEVEL_SHIFT(level) \
111 (PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS)
112
113#define PT64_LEVEL_MASK(level) \
114 (((1ULL << PT64_LEVEL_BITS) - 1) << PT64_LEVEL_SHIFT(level))
115
116#define PT64_INDEX(address, level)\
117 (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1))
118
119
120#define PT32_LEVEL_BITS 10
121
122#define PT32_LEVEL_SHIFT(level) \
123 (PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS)
124
125#define PT32_LEVEL_MASK(level) \
126 (((1ULL << PT32_LEVEL_BITS) - 1) << PT32_LEVEL_SHIFT(level))
127
128#define PT32_INDEX(address, level)\
129 (((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1))
130
131
132#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))
133#define PT64_DIR_BASE_ADDR_MASK \
134 (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + PT64_LEVEL_BITS)) - 1))
135
136#define PT32_BASE_ADDR_MASK PAGE_MASK
137#define PT32_DIR_BASE_ADDR_MASK \
138 (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1))
139
140#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \
141 | PT64_NX_MASK)
142
143#define PFERR_PRESENT_MASK (1U << 0)
144#define PFERR_WRITE_MASK (1U << 1)
145#define PFERR_USER_MASK (1U << 2)
146#define PFERR_FETCH_MASK (1U << 4)
147
148#define PT64_ROOT_LEVEL 4
149#define PT32_ROOT_LEVEL 2
150#define PT32E_ROOT_LEVEL 3
151
152#define PT_DIRECTORY_LEVEL 2
153#define PT_PAGE_TABLE_LEVEL 1
154
155#define RMAP_EXT 4
156
157#define ACC_EXEC_MASK 1
158#define ACC_WRITE_MASK PT_WRITABLE_MASK
159#define ACC_USER_MASK PT_USER_MASK
160#define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)
161
162struct kvm_rmap_desc {
163 u64 *shadow_ptes[RMAP_EXT];
164 struct kvm_rmap_desc *more;
165};
166
167static struct kmem_cache *pte_chain_cache;
168static struct kmem_cache *rmap_desc_cache;
169static struct kmem_cache *mmu_page_header_cache;
170
171static u64 __read_mostly shadow_trap_nonpresent_pte;
172static u64 __read_mostly shadow_notrap_nonpresent_pte;
173
174void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte)
175{
176 shadow_trap_nonpresent_pte = trap_pte;
177 shadow_notrap_nonpresent_pte = notrap_pte;
178}
179EXPORT_SYMBOL_GPL(kvm_mmu_set_nonpresent_ptes);
180
181static int is_write_protection(struct kvm_vcpu *vcpu)
182{
183 return vcpu->arch.cr0 & X86_CR0_WP;
184}
185
186static int is_cpuid_PSE36(void)
187{
188 return 1;
189}
190
191static int is_nx(struct kvm_vcpu *vcpu)
192{
193 return vcpu->arch.shadow_efer & EFER_NX;
194}
195
196static int is_present_pte(unsigned long pte)
197{
198 return pte & PT_PRESENT_MASK;
199}
200
201static int is_shadow_present_pte(u64 pte)
202{
203 pte &= ~PT_SHADOW_IO_MARK;
204 return pte != shadow_trap_nonpresent_pte
205 && pte != shadow_notrap_nonpresent_pte;
206}
207
208static int is_writeble_pte(unsigned long pte)
209{
210 return pte & PT_WRITABLE_MASK;
211}
212
213static int is_dirty_pte(unsigned long pte)
214{
215 return pte & PT_DIRTY_MASK;
216}
217
218static int is_io_pte(unsigned long pte)
219{
220 return pte & PT_SHADOW_IO_MARK;
221}
222
223static int is_rmap_pte(u64 pte)
224{
225 return pte != shadow_trap_nonpresent_pte
226 && pte != shadow_notrap_nonpresent_pte;
227}
228
229static gfn_t pse36_gfn_delta(u32 gpte)
230{
231 int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT;
232
233 return (gpte & PT32_DIR_PSE36_MASK) << shift;
234}
235
236static void set_shadow_pte(u64 *sptep, u64 spte)
237{
238#ifdef CONFIG_X86_64
239 set_64bit((unsigned long *)sptep, spte);
240#else
241 set_64bit((unsigned long long *)sptep, spte);
242#endif
243}
244
245static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
246 struct kmem_cache *base_cache, int min)
247{
248 void *obj;
249
250 if (cache->nobjs >= min)
251 return 0;
252 while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
253 obj = kmem_cache_zalloc(base_cache, GFP_KERNEL);
254 if (!obj)
255 return -ENOMEM;
256 cache->objects[cache->nobjs++] = obj;
257 }
258 return 0;
259}
260
261static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
262{
263 while (mc->nobjs)
264 kfree(mc->objects[--mc->nobjs]);
265}
266
267static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache,
268 int min)
269{
270 struct page *page;
271
272 if (cache->nobjs >= min)
273 return 0;
274 while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
275 page = alloc_page(GFP_KERNEL);
276 if (!page)
277 return -ENOMEM;
278 set_page_private(page, 0);
279 cache->objects[cache->nobjs++] = page_address(page);
280 }
281 return 0;
282}
283
284static void mmu_free_memory_cache_page(struct kvm_mmu_memory_cache *mc)
285{
286 while (mc->nobjs)
287 free_page((unsigned long)mc->objects[--mc->nobjs]);
288}
289
290static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu)
291{
292 int r;
293
294 r = mmu_topup_memory_cache(&vcpu->arch.mmu_pte_chain_cache,
295 pte_chain_cache, 4);
296 if (r)
297 goto out;
298 r = mmu_topup_memory_cache(&vcpu->arch.mmu_rmap_desc_cache,
299 rmap_desc_cache, 1);
300 if (r)
301 goto out;
302 r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8);
303 if (r)
304 goto out;
305 r = mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache,
306 mmu_page_header_cache, 4);
307out:
308 return r;
309}
310
311static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
312{
313 mmu_free_memory_cache(&vcpu->arch.mmu_pte_chain_cache);
314 mmu_free_memory_cache(&vcpu->arch.mmu_rmap_desc_cache);
315 mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache);
316 mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache);
317}
318
319static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc,
320 size_t size)
321{
322 void *p;
323
324 BUG_ON(!mc->nobjs);
325 p = mc->objects[--mc->nobjs];
326 memset(p, 0, size);
327 return p;
328}
329
330static struct kvm_pte_chain *mmu_alloc_pte_chain(struct kvm_vcpu *vcpu)
331{
332 return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_chain_cache,
333 sizeof(struct kvm_pte_chain));
334}
335
336static void mmu_free_pte_chain(struct kvm_pte_chain *pc)
337{
338 kfree(pc);
339}
340
341static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu)
342{
343 return mmu_memory_cache_alloc(&vcpu->arch.mmu_rmap_desc_cache,
344 sizeof(struct kvm_rmap_desc));
345}
346
347static void mmu_free_rmap_desc(struct kvm_rmap_desc *rd)
348{
349 kfree(rd);
350}
351
352/*
353 * Take gfn and return the reverse mapping to it.
354 * Note: gfn must be unaliased before this function get called
355 */
356
357static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn)
358{
359 struct kvm_memory_slot *slot;
360
361 slot = gfn_to_memslot(kvm, gfn);
362 return &slot->rmap[gfn - slot->base_gfn];
363}
364
365/*
366 * Reverse mapping data structures:
367 *
368 * If rmapp bit zero is zero, then rmapp point to the shadw page table entry
369 * that points to page_address(page).
370 *
371 * If rmapp bit zero is one, (then rmap & ~1) points to a struct kvm_rmap_desc
372 * containing more mappings.
373 */
374static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
375{
376 struct kvm_mmu_page *sp;
377 struct kvm_rmap_desc *desc;
378 unsigned long *rmapp;
379 int i;
380
381 if (!is_rmap_pte(*spte))
382 return;
383 gfn = unalias_gfn(vcpu->kvm, gfn);
384 sp = page_header(__pa(spte));
385 sp->gfns[spte - sp->spt] = gfn;
386 rmapp = gfn_to_rmap(vcpu->kvm, gfn);
387 if (!*rmapp) {
388 rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte);
389 *rmapp = (unsigned long)spte;
390 } else if (!(*rmapp & 1)) {
391 rmap_printk("rmap_add: %p %llx 1->many\n", spte, *spte);
392 desc = mmu_alloc_rmap_desc(vcpu);
393 desc->shadow_ptes[0] = (u64 *)*rmapp;
394 desc->shadow_ptes[1] = spte;
395 *rmapp = (unsigned long)desc | 1;
396 } else {
397 rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte);
398 desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
399 while (desc->shadow_ptes[RMAP_EXT-1] && desc->more)
400 desc = desc->more;
401 if (desc->shadow_ptes[RMAP_EXT-1]) {
402 desc->more = mmu_alloc_rmap_desc(vcpu);
403 desc = desc->more;
404 }
405 for (i = 0; desc->shadow_ptes[i]; ++i)
406 ;
407 desc->shadow_ptes[i] = spte;
408 }
409}
410
411static void rmap_desc_remove_entry(unsigned long *rmapp,
412 struct kvm_rmap_desc *desc,
413 int i,
414 struct kvm_rmap_desc *prev_desc)
415{
416 int j;
417
418 for (j = RMAP_EXT - 1; !desc->shadow_ptes[j] && j > i; --j)
419 ;
420 desc->shadow_ptes[i] = desc->shadow_ptes[j];
421 desc->shadow_ptes[j] = NULL;
422 if (j != 0)
423 return;
424 if (!prev_desc && !desc->more)
425 *rmapp = (unsigned long)desc->shadow_ptes[0];
426 else
427 if (prev_desc)
428 prev_desc->more = desc->more;
429 else
430 *rmapp = (unsigned long)desc->more | 1;
431 mmu_free_rmap_desc(desc);
432}
433
434static void rmap_remove(struct kvm *kvm, u64 *spte)
435{
436 struct kvm_rmap_desc *desc;
437 struct kvm_rmap_desc *prev_desc;
438 struct kvm_mmu_page *sp;
439 struct page *page;
440 unsigned long *rmapp;
441 int i;
442
443 if (!is_rmap_pte(*spte))
444 return;
445 sp = page_header(__pa(spte));
446 page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT);
447 mark_page_accessed(page);
448 if (is_writeble_pte(*spte))
449 kvm_release_page_dirty(page);
450 else
451 kvm_release_page_clean(page);
452 rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt]);
453 if (!*rmapp) {
454 printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte);
455 BUG();
456 } else if (!(*rmapp & 1)) {
457 rmap_printk("rmap_remove: %p %llx 1->0\n", spte, *spte);
458 if ((u64 *)*rmapp != spte) {
459 printk(KERN_ERR "rmap_remove: %p %llx 1->BUG\n",
460 spte, *spte);
461 BUG();
462 }
463 *rmapp = 0;
464 } else {
465 rmap_printk("rmap_remove: %p %llx many->many\n", spte, *spte);
466 desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
467 prev_desc = NULL;
468 while (desc) {
469 for (i = 0; i < RMAP_EXT && desc->shadow_ptes[i]; ++i)
470 if (desc->shadow_ptes[i] == spte) {
471 rmap_desc_remove_entry(rmapp,
472 desc, i,
473 prev_desc);
474 return;
475 }
476 prev_desc = desc;
477 desc = desc->more;
478 }
479 BUG();
480 }
481}
482
483static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte)
484{
485 struct kvm_rmap_desc *desc;
486 struct kvm_rmap_desc *prev_desc;
487 u64 *prev_spte;
488 int i;
489
490 if (!*rmapp)
491 return NULL;
492 else if (!(*rmapp & 1)) {
493 if (!spte)
494 return (u64 *)*rmapp;
495 return NULL;
496 }
497 desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
498 prev_desc = NULL;
499 prev_spte = NULL;
500 while (desc) {
501 for (i = 0; i < RMAP_EXT && desc->shadow_ptes[i]; ++i) {
502 if (prev_spte == spte)
503 return desc->shadow_ptes[i];
504 prev_spte = desc->shadow_ptes[i];
505 }
506 desc = desc->more;
507 }
508 return NULL;
509}
510
511static void rmap_write_protect(struct kvm *kvm, u64 gfn)
512{
513 unsigned long *rmapp;
514 u64 *spte;
515 int write_protected = 0;
516
517 gfn = unalias_gfn(kvm, gfn);
518 rmapp = gfn_to_rmap(kvm, gfn);
519
520 spte = rmap_next(kvm, rmapp, NULL);
521 while (spte) {
522 BUG_ON(!spte);
523 BUG_ON(!(*spte & PT_PRESENT_MASK));
524 rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
525 if (is_writeble_pte(*spte)) {
526 set_shadow_pte(spte, *spte & ~PT_WRITABLE_MASK);
527 write_protected = 1;
528 }
529 spte = rmap_next(kvm, rmapp, spte);
530 }
531 if (write_protected)
532 kvm_flush_remote_tlbs(kvm);
533}
534
535#ifdef MMU_DEBUG
536static int is_empty_shadow_page(u64 *spt)
537{
538 u64 *pos;
539 u64 *end;
540
541 for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++)
542 if ((*pos & ~PT_SHADOW_IO_MARK) != shadow_trap_nonpresent_pte) {
543 printk(KERN_ERR "%s: %p %llx\n", __FUNCTION__,
544 pos, *pos);
545 return 0;
546 }
547 return 1;
548}
549#endif
550
551static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp)
552{
553 ASSERT(is_empty_shadow_page(sp->spt));
554 list_del(&sp->link);
555 __free_page(virt_to_page(sp->spt));
556 __free_page(virt_to_page(sp->gfns));
557 kfree(sp);
558 ++kvm->arch.n_free_mmu_pages;
559}
560
561static unsigned kvm_page_table_hashfn(gfn_t gfn)
562{
563 return gfn;
564}
565
566static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
567 u64 *parent_pte)
568{
569 struct kvm_mmu_page *sp;
570
571 sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache, sizeof *sp);
572 sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
573 sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
574 set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
575 list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
576 ASSERT(is_empty_shadow_page(sp->spt));
577 sp->slot_bitmap = 0;
578 sp->multimapped = 0;
579 sp->parent_pte = parent_pte;
580 --vcpu->kvm->arch.n_free_mmu_pages;
581 return sp;
582}
583
584static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu,
585 struct kvm_mmu_page *sp, u64 *parent_pte)
586{
587 struct kvm_pte_chain *pte_chain;
588 struct hlist_node *node;
589 int i;
590
591 if (!parent_pte)
592 return;
593 if (!sp->multimapped) {
594 u64 *old = sp->parent_pte;
595
596 if (!old) {
597 sp->parent_pte = parent_pte;
598 return;
599 }
600 sp->multimapped = 1;
601 pte_chain = mmu_alloc_pte_chain(vcpu);
602 INIT_HLIST_HEAD(&sp->parent_ptes);
603 hlist_add_head(&pte_chain->link, &sp->parent_ptes);
604 pte_chain->parent_ptes[0] = old;
605 }
606 hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link) {
607 if (pte_chain->parent_ptes[NR_PTE_CHAIN_ENTRIES-1])
608 continue;
609 for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i)
610 if (!pte_chain->parent_ptes[i]) {
611 pte_chain->parent_ptes[i] = parent_pte;
612 return;
613 }
614 }
615 pte_chain = mmu_alloc_pte_chain(vcpu);
616 BUG_ON(!pte_chain);
617 hlist_add_head(&pte_chain->link, &sp->parent_ptes);
618 pte_chain->parent_ptes[0] = parent_pte;
619}
620
621static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
622 u64 *parent_pte)
623{
624 struct kvm_pte_chain *pte_chain;
625 struct hlist_node *node;
626 int i;
627
628 if (!sp->multimapped) {
629 BUG_ON(sp->parent_pte != parent_pte);
630 sp->parent_pte = NULL;
631 return;
632 }
633 hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
634 for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
635 if (!pte_chain->parent_ptes[i])
636 break;
637 if (pte_chain->parent_ptes[i] != parent_pte)
638 continue;
639 while (i + 1 < NR_PTE_CHAIN_ENTRIES
640 && pte_chain->parent_ptes[i + 1]) {
641 pte_chain->parent_ptes[i]
642 = pte_chain->parent_ptes[i + 1];
643 ++i;
644 }
645 pte_chain->parent_ptes[i] = NULL;
646 if (i == 0) {
647 hlist_del(&pte_chain->link);
648 mmu_free_pte_chain(pte_chain);
649 if (hlist_empty(&sp->parent_ptes)) {
650 sp->multimapped = 0;
651 sp->parent_pte = NULL;
652 }
653 }
654 return;
655 }
656 BUG();
657}
658
659static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn)
660{
661 unsigned index;
662 struct hlist_head *bucket;
663 struct kvm_mmu_page *sp;
664 struct hlist_node *node;
665
666 pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn);
667 index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
668 bucket = &kvm->arch.mmu_page_hash[index];
669 hlist_for_each_entry(sp, node, bucket, hash_link)
670 if (sp->gfn == gfn && !sp->role.metaphysical) {
671 pgprintk("%s: found role %x\n",
672 __FUNCTION__, sp->role.word);
673 return sp;
674 }
675 return NULL;
676}
677
678static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
679 gfn_t gfn,
680 gva_t gaddr,
681 unsigned level,
682 int metaphysical,
683 unsigned access,
684 u64 *parent_pte,
685 bool *new_page)
686{
687 union kvm_mmu_page_role role;
688 unsigned index;
689 unsigned quadrant;
690 struct hlist_head *bucket;
691 struct kvm_mmu_page *sp;
692 struct hlist_node *node;
693
694 role.word = 0;
695 role.glevels = vcpu->arch.mmu.root_level;
696 role.level = level;
697 role.metaphysical = metaphysical;
698 role.access = access;
699 if (vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) {
700 quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
701 quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
702 role.quadrant = quadrant;
703 }
704 pgprintk("%s: looking gfn %lx role %x\n", __FUNCTION__,
705 gfn, role.word);
706 index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
707 bucket = &vcpu->kvm->arch.mmu_page_hash[index];
708 hlist_for_each_entry(sp, node, bucket, hash_link)
709 if (sp->gfn == gfn && sp->role.word == role.word) {
710 mmu_page_add_parent_pte(vcpu, sp, parent_pte);
711 pgprintk("%s: found\n", __FUNCTION__);
712 return sp;
713 }
714 ++vcpu->kvm->stat.mmu_cache_miss;
715 sp = kvm_mmu_alloc_page(vcpu, parent_pte);
716 if (!sp)
717 return sp;
718 pgprintk("%s: adding gfn %lx role %x\n", __FUNCTION__, gfn, role.word);
719 sp->gfn = gfn;
720 sp->role = role;
721 hlist_add_head(&sp->hash_link, bucket);
722 vcpu->arch.mmu.prefetch_page(vcpu, sp);
723 if (!metaphysical)
724 rmap_write_protect(vcpu->kvm, gfn);
725 if (new_page)
726 *new_page = 1;
727 return sp;
728}
729
730static void kvm_mmu_page_unlink_children(struct kvm *kvm,
731 struct kvm_mmu_page *sp)
732{
733 unsigned i;
734 u64 *pt;
735 u64 ent;
736
737 pt = sp->spt;
738
739 if (sp->role.level == PT_PAGE_TABLE_LEVEL) {
740 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
741 if (is_shadow_present_pte(pt[i]))
742 rmap_remove(kvm, &pt[i]);
743 pt[i] = shadow_trap_nonpresent_pte;
744 }
745 kvm_flush_remote_tlbs(kvm);
746 return;
747 }
748
749 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
750 ent = pt[i];
751
752 pt[i] = shadow_trap_nonpresent_pte;
753 if (!is_shadow_present_pte(ent))
754 continue;
755 ent &= PT64_BASE_ADDR_MASK;
756 mmu_page_remove_parent_pte(page_header(ent), &pt[i]);
757 }
758 kvm_flush_remote_tlbs(kvm);
759}
760
761static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte)
762{
763 mmu_page_remove_parent_pte(sp, parent_pte);
764}
765
766static void kvm_mmu_reset_last_pte_updated(struct kvm *kvm)
767{
768 int i;
769
770 for (i = 0; i < KVM_MAX_VCPUS; ++i)
771 if (kvm->vcpus[i])
772 kvm->vcpus[i]->arch.last_pte_updated = NULL;
773}
774
775static void kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
776{
777 u64 *parent_pte;
778
779 ++kvm->stat.mmu_shadow_zapped;
780 while (sp->multimapped || sp->parent_pte) {
781 if (!sp->multimapped)
782 parent_pte = sp->parent_pte;
783 else {
784 struct kvm_pte_chain *chain;
785
786 chain = container_of(sp->parent_ptes.first,
787 struct kvm_pte_chain, link);
788 parent_pte = chain->parent_ptes[0];
789 }
790 BUG_ON(!parent_pte);
791 kvm_mmu_put_page(sp, parent_pte);
792 set_shadow_pte(parent_pte, shadow_trap_nonpresent_pte);
793 }
794 kvm_mmu_page_unlink_children(kvm, sp);
795 if (!sp->root_count) {
796 hlist_del(&sp->hash_link);
797 kvm_mmu_free_page(kvm, sp);
798 } else
799 list_move(&sp->link, &kvm->arch.active_mmu_pages);
800 kvm_mmu_reset_last_pte_updated(kvm);
801}
802
803/*
804 * Change the number of mmu pages allocated to the vm.
805 * Note: if kvm_nr_mmu_pages is too small, you will get a deadlock.
806 */
807void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages)
808{
809 /*
810 * If we set the number of mmu pages to be smaller than the
811 * number of active pages, we must free some mmu pages before we
812 * change the value.
813 */
814
815 if ((kvm->arch.n_alloc_mmu_pages - kvm->arch.n_free_mmu_pages) >
816 kvm_nr_mmu_pages) {
817 int n_used_mmu_pages = kvm->arch.n_alloc_mmu_pages
818 - kvm->arch.n_free_mmu_pages;
819
820 while (n_used_mmu_pages > kvm_nr_mmu_pages) {
821 struct kvm_mmu_page *page;
822
823 page = container_of(kvm->arch.active_mmu_pages.prev,
824 struct kvm_mmu_page, link);
825 kvm_mmu_zap_page(kvm, page);
826 n_used_mmu_pages--;
827 }
828 kvm->arch.n_free_mmu_pages = 0;
829 }
830 else
831 kvm->arch.n_free_mmu_pages += kvm_nr_mmu_pages
832 - kvm->arch.n_alloc_mmu_pages;
833
834 kvm->arch.n_alloc_mmu_pages = kvm_nr_mmu_pages;
835}
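
The shrink path above boils down to simple bookkeeping: the pages in use are n_alloc_mmu_pages minus n_free_mmu_pages, and anything over the new limit must be zapped before the limit can drop. A toy restatement of that arithmetic with plain counters follows; not kernel code.

static unsigned int pages_to_zap(unsigned int n_alloc, unsigned int n_free,
				 unsigned int new_limit)
{
	unsigned int n_used = n_alloc - n_free;	/* shadow pages currently live */

	return n_used > new_limit ? n_used - new_limit : 0;
}

/* e.g. pages_to_zap(1024, 100, 512) == 412: 924 pages live, only 512 allowed */
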
836
837static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
838{
839 unsigned index;
840 struct hlist_head *bucket;
841 struct kvm_mmu_page *sp;
842 struct hlist_node *node, *n;
843 int r;
844
845 pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn);
846 r = 0;
847 index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
848 bucket = &kvm->arch.mmu_page_hash[index];
849 hlist_for_each_entry_safe(sp, node, n, bucket, hash_link)
850 if (sp->gfn == gfn && !sp->role.metaphysical) {
851 pgprintk("%s: gfn %lx role %x\n", __FUNCTION__, gfn,
852 sp->role.word);
853 kvm_mmu_zap_page(kvm, sp);
854 r = 1;
855 }
856 return r;
857}
858
859static void mmu_unshadow(struct kvm *kvm, gfn_t gfn)
860{
861 struct kvm_mmu_page *sp;
862
863 while ((sp = kvm_mmu_lookup_page(kvm, gfn)) != NULL) {
864 pgprintk("%s: zap %lx %x\n", __FUNCTION__, gfn, sp->role.word);
865 kvm_mmu_zap_page(kvm, sp);
866 }
867}
868
869static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn)
870{
871 int slot = memslot_id(kvm, gfn_to_memslot(kvm, gfn));
872 struct kvm_mmu_page *sp = page_header(__pa(pte));
873
874 __set_bit(slot, &sp->slot_bitmap);
875}
876
877struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva)
878{
879 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva);
880
881 if (gpa == UNMAPPED_GVA)
882 return NULL;
883 return gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
884}
885
886static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
887 unsigned pt_access, unsigned pte_access,
888 int user_fault, int write_fault, int dirty,
889 int *ptwrite, gfn_t gfn, struct page *page)
890{
891 u64 spte;
892 int was_rmapped = is_rmap_pte(*shadow_pte);
893 int was_writeble = is_writeble_pte(*shadow_pte);
894
895 pgprintk("%s: spte %llx access %x write_fault %d"
896 " user_fault %d gfn %lx\n",
897 __FUNCTION__, *shadow_pte, pt_access,
898 write_fault, user_fault, gfn);
899
900 /*
901 * We don't set the accessed bit, since we sometimes want to see
902 * whether the guest actually used the pte (in order to detect
903 * demand paging).
904 */
905 spte = PT_PRESENT_MASK | PT_DIRTY_MASK;
906 if (!dirty)
907 pte_access &= ~ACC_WRITE_MASK;
908 if (!(pte_access & ACC_EXEC_MASK))
909 spte |= PT64_NX_MASK;
910
911 spte |= PT_PRESENT_MASK;
912 if (pte_access & ACC_USER_MASK)
913 spte |= PT_USER_MASK;
914
915 if (is_error_page(page)) {
916 set_shadow_pte(shadow_pte,
917 shadow_trap_nonpresent_pte | PT_SHADOW_IO_MARK);
918 kvm_release_page_clean(page);
919 return;
920 }
921
922 spte |= page_to_phys(page);
923
924 if ((pte_access & ACC_WRITE_MASK)
925 || (write_fault && !is_write_protection(vcpu) && !user_fault)) {
926 struct kvm_mmu_page *shadow;
927
928 spte |= PT_WRITABLE_MASK;
929 if (user_fault) {
930 mmu_unshadow(vcpu->kvm, gfn);
931 goto unshadowed;
932 }
933
934 shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn);
935 if (shadow) {
936 pgprintk("%s: found shadow page for %lx, marking ro\n",
937 __FUNCTION__, gfn);
938 pte_access &= ~ACC_WRITE_MASK;
939 if (is_writeble_pte(spte)) {
940 spte &= ~PT_WRITABLE_MASK;
941 kvm_x86_ops->tlb_flush(vcpu);
942 }
943 if (write_fault)
944 *ptwrite = 1;
945 }
946 }
947
948unshadowed:
949
950 if (pte_access & ACC_WRITE_MASK)
951 mark_page_dirty(vcpu->kvm, gfn);
952
953 pgprintk("%s: setting spte %llx\n", __FUNCTION__, spte);
954 set_shadow_pte(shadow_pte, spte);
955 page_header_update_slot(vcpu->kvm, shadow_pte, gfn);
956 if (!was_rmapped) {
957 rmap_add(vcpu, shadow_pte, gfn);
958 if (!is_rmap_pte(*shadow_pte))
959 kvm_release_page_clean(page);
960 } else {
961 if (was_writeble)
962 kvm_release_page_dirty(page);
963 else
964 kvm_release_page_clean(page);
965 }
966 if (!ptwrite || !*ptwrite)
967 vcpu->arch.last_pte_updated = shadow_pte;
968}
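
mmu_set_spte() above assembles the shadow pte from the access mask and the host page. The sketch below redoes that bit-packing standalone, assuming the architectural x86 pte bit positions (present = bit 0, writable = bit 1, user = bit 2, dirty = bit 6, NX = bit 63) and ignoring the write-protection and rmap bookkeeping.

#include <stdint.h>

#define SPTE_PRESENT  (1ull << 0)
#define SPTE_WRITABLE (1ull << 1)
#define SPTE_USER     (1ull << 2)
#define SPTE_DIRTY    (1ull << 6)
#define SPTE_NX       (1ull << 63)

#define ACC_EXEC  1u
#define ACC_WRITE 2u
#define ACC_USER  4u

static uint64_t make_spte(uint64_t host_pfn, unsigned int pte_access, int allow_write)
{
	uint64_t spte = SPTE_PRESENT | SPTE_DIRTY;

	if (!(pte_access & ACC_EXEC))
		spte |= SPTE_NX;	/* no execute permission granted */
	if (pte_access & ACC_USER)
		spte |= SPTE_USER;
	if ((pte_access & ACC_WRITE) && allow_write)
		spte |= SPTE_WRITABLE;	/* dropped when the gfn is itself a shadowed page table */

	return spte | (host_pfn << 12);	/* physical address of the backing host page */
}
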
969
970static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
971{
972}
973
974static int __nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write,
975 gfn_t gfn, struct page *page)
976{
977 int level = PT32E_ROOT_LEVEL;
978 hpa_t table_addr = vcpu->arch.mmu.root_hpa;
979 int pt_write = 0;
980
981 for (; ; level--) {
982 u32 index = PT64_INDEX(v, level);
983 u64 *table;
984
985 ASSERT(VALID_PAGE(table_addr));
986 table = __va(table_addr);
987
988 if (level == 1) {
989 mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL,
990 0, write, 1, &pt_write, gfn, page);
991 return pt_write || is_io_pte(table[index]);
992 }
993
994 if (table[index] == shadow_trap_nonpresent_pte) {
995 struct kvm_mmu_page *new_table;
996 gfn_t pseudo_gfn;
997
998 pseudo_gfn = (v & PT64_DIR_BASE_ADDR_MASK)
999 >> PAGE_SHIFT;
1000 new_table = kvm_mmu_get_page(vcpu, pseudo_gfn,
1001 v, level - 1,
1002 1, ACC_ALL, &table[index],
1003 NULL);
1004 if (!new_table) {
1005 pgprintk("nonpaging_map: ENOMEM\n");
1006 kvm_release_page_clean(page);
1007 return -ENOMEM;
1008 }
1009
1010 table[index] = __pa(new_table->spt) | PT_PRESENT_MASK
1011 | PT_WRITABLE_MASK | PT_USER_MASK;
1012 }
1013 table_addr = table[index] & PT64_BASE_ADDR_MASK;
1014 }
1015}
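
The loop above descends one paging level per iteration, using PT64_INDEX() to pick the slot at each level. Here is a standalone version of that index extraction, assuming 4KB pages and 9 index bits per level as on x86-64.

/* slot selected at each level of the shadow walk, 9 bits per level, 4KB pages */
static unsigned int pt64_index(unsigned long addr, int level)
{
	return (addr >> (12 + 9 * (level - 1))) & 0x1ff;
}

/* level 1 picks the page-table slot, level 2 the directory slot, and so on */
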
1016
1017static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
1018{
1019 int r;
1020
1021 struct page *page;
1022
1023 down_read(&current->mm->mmap_sem);
1024 page = gfn_to_page(vcpu->kvm, gfn);
1025
1026 spin_lock(&vcpu->kvm->mmu_lock);
1027 kvm_mmu_free_some_pages(vcpu);
1028 r = __nonpaging_map(vcpu, v, write, gfn, page);
1029 spin_unlock(&vcpu->kvm->mmu_lock);
1030
1031 up_read(&current->mm->mmap_sem);
1032
1033 return r;
1034}
1035
1036
1037static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
1038 struct kvm_mmu_page *sp)
1039{
1040 int i;
1041
1042 for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
1043 sp->spt[i] = shadow_trap_nonpresent_pte;
1044}
1045
1046static void mmu_free_roots(struct kvm_vcpu *vcpu)
1047{
1048 int i;
1049 struct kvm_mmu_page *sp;
1050
1051 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
1052 return;
1053 spin_lock(&vcpu->kvm->mmu_lock);
1054#ifdef CONFIG_X86_64
1055 if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
1056 hpa_t root = vcpu->arch.mmu.root_hpa;
1057
1058 sp = page_header(root);
1059 --sp->root_count;
1060 vcpu->arch.mmu.root_hpa = INVALID_PAGE;
1061 spin_unlock(&vcpu->kvm->mmu_lock);
1062 return;
1063 }
1064#endif
1065 for (i = 0; i < 4; ++i) {
1066 hpa_t root = vcpu->arch.mmu.pae_root[i];
1067
1068 if (root) {
1069 root &= PT64_BASE_ADDR_MASK;
1070 sp = page_header(root);
1071 --sp->root_count;
1072 }
1073 vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
1074 }
1075 spin_unlock(&vcpu->kvm->mmu_lock);
1076 vcpu->arch.mmu.root_hpa = INVALID_PAGE;
1077}
1078
1079static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
1080{
1081 int i;
1082 gfn_t root_gfn;
1083 struct kvm_mmu_page *sp;
1084
1085 root_gfn = vcpu->arch.cr3 >> PAGE_SHIFT;
1086
1087#ifdef CONFIG_X86_64
1088 if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
1089 hpa_t root = vcpu->arch.mmu.root_hpa;
1090
1091 ASSERT(!VALID_PAGE(root));
1092 sp = kvm_mmu_get_page(vcpu, root_gfn, 0,
1093 PT64_ROOT_LEVEL, 0, ACC_ALL, NULL, NULL);
1094 root = __pa(sp->spt);
1095 ++sp->root_count;
1096 vcpu->arch.mmu.root_hpa = root;
1097 return;
1098 }
1099#endif
1100 for (i = 0; i < 4; ++i) {
1101 hpa_t root = vcpu->arch.mmu.pae_root[i];
1102
1103 ASSERT(!VALID_PAGE(root));
1104 if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) {
1105 if (!is_present_pte(vcpu->arch.pdptrs[i])) {
1106 vcpu->arch.mmu.pae_root[i] = 0;
1107 continue;
1108 }
1109 root_gfn = vcpu->arch.pdptrs[i] >> PAGE_SHIFT;
1110 } else if (vcpu->arch.mmu.root_level == 0)
1111 root_gfn = 0;
1112 sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
1113 PT32_ROOT_LEVEL, !is_paging(vcpu),
1114 ACC_ALL, NULL, NULL);
1115 root = __pa(sp->spt);
1116 ++sp->root_count;
1117 vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK;
1118 }
1119 vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
1120}
1121
1122static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr)
1123{
1124 return vaddr;
1125}
1126
1127static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
1128 u32 error_code)
1129{
1130 gfn_t gfn;
1131 int r;
1132
1133 pgprintk("%s: gva %lx error %x\n", __FUNCTION__, gva, error_code);
1134 r = mmu_topup_memory_caches(vcpu);
1135 if (r)
1136 return r;
1137
1138 ASSERT(vcpu);
1139 ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
1140
1141 gfn = gva >> PAGE_SHIFT;
1142
1143 return nonpaging_map(vcpu, gva & PAGE_MASK,
1144 error_code & PFERR_WRITE_MASK, gfn);
1145}
1146
1147static void nonpaging_free(struct kvm_vcpu *vcpu)
1148{
1149 mmu_free_roots(vcpu);
1150}
1151
1152static int nonpaging_init_context(struct kvm_vcpu *vcpu)
1153{
1154 struct kvm_mmu *context = &vcpu->arch.mmu;
1155
1156 context->new_cr3 = nonpaging_new_cr3;
1157 context->page_fault = nonpaging_page_fault;
1158 context->gva_to_gpa = nonpaging_gva_to_gpa;
1159 context->free = nonpaging_free;
1160 context->prefetch_page = nonpaging_prefetch_page;
1161 context->root_level = 0;
1162 context->shadow_root_level = PT32E_ROOT_LEVEL;
1163 context->root_hpa = INVALID_PAGE;
1164 return 0;
1165}
1166
1167void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
1168{
1169 ++vcpu->stat.tlb_flush;
1170 kvm_x86_ops->tlb_flush(vcpu);
1171}
1172
1173static void paging_new_cr3(struct kvm_vcpu *vcpu)
1174{
1175 pgprintk("%s: cr3 %lx\n", __FUNCTION__, vcpu->cr3);
1176 mmu_free_roots(vcpu);
1177}
1178
1179static void inject_page_fault(struct kvm_vcpu *vcpu,
1180 u64 addr,
1181 u32 err_code)
1182{
1183 kvm_inject_page_fault(vcpu, addr, err_code);
1184}
1185
1186static void paging_free(struct kvm_vcpu *vcpu)
1187{
1188 nonpaging_free(vcpu);
1189}
1190
1191#define PTTYPE 64
1192#include "paging_tmpl.h"
1193#undef PTTYPE
1194
1195#define PTTYPE 32
1196#include "paging_tmpl.h"
1197#undef PTTYPE
1198
1199static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level)
1200{
1201 struct kvm_mmu *context = &vcpu->arch.mmu;
1202
1203 ASSERT(is_pae(vcpu));
1204 context->new_cr3 = paging_new_cr3;
1205 context->page_fault = paging64_page_fault;
1206 context->gva_to_gpa = paging64_gva_to_gpa;
1207 context->prefetch_page = paging64_prefetch_page;
1208 context->free = paging_free;
1209 context->root_level = level;
1210 context->shadow_root_level = level;
1211 context->root_hpa = INVALID_PAGE;
1212 return 0;
1213}
1214
1215static int paging64_init_context(struct kvm_vcpu *vcpu)
1216{
1217 return paging64_init_context_common(vcpu, PT64_ROOT_LEVEL);
1218}
1219
1220static int paging32_init_context(struct kvm_vcpu *vcpu)
1221{
1222 struct kvm_mmu *context = &vcpu->arch.mmu;
1223
1224 context->new_cr3 = paging_new_cr3;
1225 context->page_fault = paging32_page_fault;
1226 context->gva_to_gpa = paging32_gva_to_gpa;
1227 context->free = paging_free;
1228 context->prefetch_page = paging32_prefetch_page;
1229 context->root_level = PT32_ROOT_LEVEL;
1230 context->shadow_root_level = PT32E_ROOT_LEVEL;
1231 context->root_hpa = INVALID_PAGE;
1232 return 0;
1233}
1234
1235static int paging32E_init_context(struct kvm_vcpu *vcpu)
1236{
1237 return paging64_init_context_common(vcpu, PT32E_ROOT_LEVEL);
1238}
1239
1240static int init_kvm_mmu(struct kvm_vcpu *vcpu)
1241{
1242 ASSERT(vcpu);
1243 ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
1244
1245 if (!is_paging(vcpu))
1246 return nonpaging_init_context(vcpu);
1247 else if (is_long_mode(vcpu))
1248 return paging64_init_context(vcpu);
1249 else if (is_pae(vcpu))
1250 return paging32E_init_context(vcpu);
1251 else
1252 return paging32_init_context(vcpu);
1253}
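
init_kvm_mmu() above selects the shadow context from the guest's paging state. The same decision is shown as a tiny standalone table below; the predicates stand in for the is_paging()/is_long_mode()/is_pae() helpers and the enum names are invented for the sketch.

enum guest_mmu_mode { MODE_NONPAGING, MODE_32BIT, MODE_PAE, MODE_LONG };

static enum guest_mmu_mode pick_mmu_mode(int paging, int long_mode, int pae)
{
	if (!paging)
		return MODE_NONPAGING;	/* real mode / paging disabled */
	if (long_mode)
		return MODE_LONG;	/* 4-level guest and shadow */
	if (pae)
		return MODE_PAE;	/* 3-level guest and shadow */
	return MODE_32BIT;		/* 2-level guest, PAE-style shadow roots */
}
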
1254
1255static void destroy_kvm_mmu(struct kvm_vcpu *vcpu)
1256{
1257 ASSERT(vcpu);
1258 if (VALID_PAGE(vcpu->arch.mmu.root_hpa)) {
1259 vcpu->arch.mmu.free(vcpu);
1260 vcpu->arch.mmu.root_hpa = INVALID_PAGE;
1261 }
1262}
1263
1264int kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
1265{
1266 destroy_kvm_mmu(vcpu);
1267 return init_kvm_mmu(vcpu);
1268}
1269EXPORT_SYMBOL_GPL(kvm_mmu_reset_context);
1270
1271int kvm_mmu_load(struct kvm_vcpu *vcpu)
1272{
1273 int r;
1274
1275 r = mmu_topup_memory_caches(vcpu);
1276 if (r)
1277 goto out;
1278 spin_lock(&vcpu->kvm->mmu_lock);
1279 kvm_mmu_free_some_pages(vcpu);
1280 mmu_alloc_roots(vcpu);
1281 spin_unlock(&vcpu->kvm->mmu_lock);
1282 kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa);
1283 kvm_mmu_flush_tlb(vcpu);
1284out:
1285 return r;
1286}
1287EXPORT_SYMBOL_GPL(kvm_mmu_load);
1288
1289void kvm_mmu_unload(struct kvm_vcpu *vcpu)
1290{
1291 mmu_free_roots(vcpu);
1292}
1293
1294static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
1295 struct kvm_mmu_page *sp,
1296 u64 *spte)
1297{
1298 u64 pte;
1299 struct kvm_mmu_page *child;
1300
1301 pte = *spte;
1302 if (is_shadow_present_pte(pte)) {
1303 if (sp->role.level == PT_PAGE_TABLE_LEVEL)
1304 rmap_remove(vcpu->kvm, spte);
1305 else {
1306 child = page_header(pte & PT64_BASE_ADDR_MASK);
1307 mmu_page_remove_parent_pte(child, spte);
1308 }
1309 }
1310 set_shadow_pte(spte, shadow_trap_nonpresent_pte);
1311}
1312
1313static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
1314 struct kvm_mmu_page *sp,
1315 u64 *spte,
1316 const void *new, int bytes,
1317 int offset_in_pte)
1318{
1319 if (sp->role.level != PT_PAGE_TABLE_LEVEL) {
1320 ++vcpu->kvm->stat.mmu_pde_zapped;
1321 return;
1322 }
1323
1324 ++vcpu->kvm->stat.mmu_pte_updated;
1325 if (sp->role.glevels == PT32_ROOT_LEVEL)
1326 paging32_update_pte(vcpu, sp, spte, new, bytes, offset_in_pte);
1327 else
1328 paging64_update_pte(vcpu, sp, spte, new, bytes, offset_in_pte);
1329}
1330
1331static bool need_remote_flush(u64 old, u64 new)
1332{
1333 if (!is_shadow_present_pte(old))
1334 return false;
1335 if (!is_shadow_present_pte(new))
1336 return true;
1337 if ((old ^ new) & PT64_BASE_ADDR_MASK)
1338 return true;
1339 old ^= PT64_NX_MASK;
1340 new ^= PT64_NX_MASK;
1341 return (old & ~new & PT64_PERM_MASK) != 0;
1342}
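
A simplified standalone restatement of need_remote_flush() above: remote TLBs only need flushing when an existing translation lost its mapping, moved, or lost a permission. The mask values here are assumed stand-ins and the permission check is reduced to the writable bit.

#include <stdbool.h>
#include <stdint.h>

#define SPTE_PRESENT   (1ull << 0)
#define SPTE_WRITABLE  (1ull << 1)
#define SPTE_ADDR_MASK 0x000ffffffffff000ull	/* assumed base-address mask */

static bool needs_remote_flush(uint64_t old, uint64_t new)
{
	if (!(old & SPTE_PRESENT))
		return false;		/* nothing could be cached remotely */
	if (!(new & SPTE_PRESENT))
		return true;		/* the mapping went away */
	if ((old ^ new) & SPTE_ADDR_MASK)
		return true;		/* it points somewhere else now */
	return (old & ~new & SPTE_WRITABLE) != 0;	/* a permission was revoked */
}
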
1343
1344static void mmu_pte_write_flush_tlb(struct kvm_vcpu *vcpu, u64 old, u64 new)
1345{
1346 if (need_remote_flush(old, new))
1347 kvm_flush_remote_tlbs(vcpu->kvm);
1348 else
1349 kvm_mmu_flush_tlb(vcpu);
1350}
1351
1352static bool last_updated_pte_accessed(struct kvm_vcpu *vcpu)
1353{
1354 u64 *spte = vcpu->arch.last_pte_updated;
1355
1356 return !!(spte && (*spte & PT_ACCESSED_MASK));
1357}
1358
1359static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
1360 const u8 *new, int bytes)
1361{
1362 gfn_t gfn;
1363 int r;
1364 u64 gpte = 0;
1365
1366 if (bytes != 4 && bytes != 8)
1367 return;
1368
1369 /*
1370 * Assume that the pte write is on a page table of the same type
1371 * as the current vcpu paging mode. This is nearly always true
1372 * (it might be false while changing modes). Note that it is verified
1373 * later by update_pte().
1374 */
1375 if (is_pae(vcpu)) {
1376 /* Handle a 32-bit guest writing two halves of a 64-bit gpte */
1377 if ((bytes == 4) && (gpa % 4 == 0)) {
1378 r = kvm_read_guest(vcpu->kvm, gpa & ~(u64)7, &gpte, 8);
1379 if (r)
1380 return;
1381 memcpy((void *)&gpte + (gpa % 8), new, 4);
1382 } else if ((bytes == 8) && (gpa % 8 == 0)) {
1383 memcpy((void *)&gpte, new, 8);
1384 }
1385 } else {
1386 if ((bytes == 4) && (gpa % 4 == 0))
1387 memcpy((void *)&gpte, new, 4);
1388 }
1389 if (!is_present_pte(gpte))
1390 return;
1391 gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
1392 vcpu->arch.update_pte.gfn = gfn;
1393 vcpu->arch.update_pte.page = gfn_to_page(vcpu->kvm, gfn);
1394}
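
For a PAE guest, a 4-byte write only updates half of an 8-byte gpte, so the code above re-reads the full entry and patches the written half before guessing the target gfn. A standalone sketch of that reassembly follows; guest memory access is stubbed out and the current entry is passed in directly.

#include <stdint.h>
#include <string.h>

/* apply a 4- or 8-byte guest write at gpa onto the 8-byte entry it falls in */
static uint64_t patch_gpte(uint64_t gpte, uint64_t gpa, const void *new, int bytes)
{
	if (bytes == 4 && (gpa % 4) == 0)
		memcpy((uint8_t *)&gpte + (gpa % 8), new, 4);	/* low or high half */
	else if (bytes == 8 && (gpa % 8) == 0)
		memcpy(&gpte, new, 8);

	return gpte;
}
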
1395
1396void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
1397 const u8 *new, int bytes)
1398{
1399 gfn_t gfn = gpa >> PAGE_SHIFT;
1400 struct kvm_mmu_page *sp;
1401 struct hlist_node *node, *n;
1402 struct hlist_head *bucket;
1403 unsigned index;
1404 u64 entry;
1405 u64 *spte;
1406 unsigned offset = offset_in_page(gpa);
1407 unsigned pte_size;
1408 unsigned page_offset;
1409 unsigned misaligned;
1410 unsigned quadrant;
1411 int level;
1412 int flooded = 0;
1413 int npte;
1414
1415 pgprintk("%s: gpa %llx bytes %d\n", __FUNCTION__, gpa, bytes);
1416 mmu_guess_page_from_pte_write(vcpu, gpa, new, bytes);
1417 spin_lock(&vcpu->kvm->mmu_lock);
1418 kvm_mmu_free_some_pages(vcpu);
1419 ++vcpu->kvm->stat.mmu_pte_write;
1420 kvm_mmu_audit(vcpu, "pre pte write");
1421 if (gfn == vcpu->arch.last_pt_write_gfn
1422 && !last_updated_pte_accessed(vcpu)) {
1423 ++vcpu->arch.last_pt_write_count;
1424 if (vcpu->arch.last_pt_write_count >= 3)
1425 flooded = 1;
1426 } else {
1427 vcpu->arch.last_pt_write_gfn = gfn;
1428 vcpu->arch.last_pt_write_count = 1;
1429 vcpu->arch.last_pte_updated = NULL;
1430 }
1431 index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
1432 bucket = &vcpu->kvm->arch.mmu_page_hash[index];
1433 hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) {
1434 if (sp->gfn != gfn || sp->role.metaphysical)
1435 continue;
1436 pte_size = sp->role.glevels == PT32_ROOT_LEVEL ? 4 : 8;
1437 misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
1438 misaligned |= bytes < 4;
1439 if (misaligned || flooded) {
1440 /*
1441 * Misaligned accesses are too much trouble to fix
1442 * up; also, they usually indicate a page is not used
1443 * as a page table.
1444 *
1445 * If we're seeing too many writes to a page,
1446 * it may no longer be a page table, or we may be
1447 * forking, in which case it is better to unmap the
1448 * page.
1449 */
1450 pgprintk("misaligned: gpa %llx bytes %d role %x\n",
1451 gpa, bytes, sp->role.word);
1452 kvm_mmu_zap_page(vcpu->kvm, sp);
1453 ++vcpu->kvm->stat.mmu_flooded;
1454 continue;
1455 }
1456 page_offset = offset;
1457 level = sp->role.level;
1458 npte = 1;
1459 if (sp->role.glevels == PT32_ROOT_LEVEL) {
1460 page_offset <<= 1; /* 32->64 */
1461 /*
1462 * A 32-bit pde maps 4MB while the shadow pdes map
1463 * only 2MB. So we need to double the offset again
1464 * and zap two pdes instead of one.
1465 */
1466 if (level == PT32_ROOT_LEVEL) {
1467 page_offset &= ~7; /* kill rounding error */
1468 page_offset <<= 1;
1469 npte = 2;
1470 }
1471 quadrant = page_offset >> PAGE_SHIFT;
1472 page_offset &= ~PAGE_MASK;
1473 if (quadrant != sp->role.quadrant)
1474 continue;
1475 }
1476 spte = &sp->spt[page_offset / sizeof(*spte)];
1477 while (npte--) {
1478 entry = *spte;
1479 mmu_pte_write_zap_pte(vcpu, sp, spte);
1480 mmu_pte_write_new_pte(vcpu, sp, spte, new, bytes,
1481 page_offset & (pte_size - 1));
1482 mmu_pte_write_flush_tlb(vcpu, entry, *spte);
1483 ++spte;
1484 }
1485 }
1486 kvm_mmu_audit(vcpu, "post pte write");
1487 spin_unlock(&vcpu->kvm->mmu_lock);
1488 if (vcpu->arch.update_pte.page) {
1489 kvm_release_page_clean(vcpu->arch.update_pte.page);
1490 vcpu->arch.update_pte.page = NULL;
1491 }
1492}
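
The misalignment test inside kvm_mmu_pte_write() above is compact enough to be easy to misread: a write is treated as misaligned when it straddles a pte boundary for the page's pte size, or is shorter than 4 bytes. The same expression in isolation, as a sketch:

/* nonzero when the write crosses a pte boundary or is shorter than 4 bytes */
static int write_is_misaligned(unsigned int offset, unsigned int bytes,
			       unsigned int pte_size)
{
	unsigned int crosses = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);

	return crosses || bytes < 4;
}

/* write_is_misaligned(6, 4, 8) != 0; write_is_misaligned(8, 8, 8) == 0 */
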
1493
1494int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
1495{
1496 gpa_t gpa;
1497 int r;
1498
1499 down_read(&current->mm->mmap_sem);
1500 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva);
1501 up_read(&current->mm->mmap_sem);
1502
1503 spin_lock(&vcpu->kvm->mmu_lock);
1504 r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
1505 spin_unlock(&vcpu->kvm->mmu_lock);
1506 return r;
1507}
1508
1509void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
1510{
1511 while (vcpu->kvm->arch.n_free_mmu_pages < KVM_REFILL_PAGES) {
1512 struct kvm_mmu_page *sp;
1513
1514 sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev,
1515 struct kvm_mmu_page, link);
1516 kvm_mmu_zap_page(vcpu->kvm, sp);
1517 ++vcpu->kvm->stat.mmu_recycled;
1518 }
1519}
1520
1521int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
1522{
1523 int r;
1524 enum emulation_result er;
1525
1526 r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code);
1527 if (r < 0)
1528 goto out;
1529
1530 if (!r) {
1531 r = 1;
1532 goto out;
1533 }
1534
1535 r = mmu_topup_memory_caches(vcpu);
1536 if (r)
1537 goto out;
1538
1539 er = emulate_instruction(vcpu, vcpu->run, cr2, error_code, 0);
1540
1541 switch (er) {
1542 case EMULATE_DONE:
1543 return 1;
1544 case EMULATE_DO_MMIO:
1545 ++vcpu->stat.mmio_exits;
1546 return 0;
1547 case EMULATE_FAIL:
1548 kvm_report_emulation_failure(vcpu, "pagetable");
1549 return 1;
1550 default:
1551 BUG();
1552 }
1553out:
1554 return r;
1555}
1556EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
1557
1558static void free_mmu_pages(struct kvm_vcpu *vcpu)
1559{
1560 struct kvm_mmu_page *sp;
1561
1562 while (!list_empty(&vcpu->kvm->arch.active_mmu_pages)) {
1563 sp = container_of(vcpu->kvm->arch.active_mmu_pages.next,
1564 struct kvm_mmu_page, link);
1565 kvm_mmu_zap_page(vcpu->kvm, sp);
1566 }
1567 free_page((unsigned long)vcpu->arch.mmu.pae_root);
1568}
1569
1570static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
1571{
1572 struct page *page;
1573 int i;
1574
1575 ASSERT(vcpu);
1576
1577 if (vcpu->kvm->arch.n_requested_mmu_pages)
1578 vcpu->kvm->arch.n_free_mmu_pages =
1579 vcpu->kvm->arch.n_requested_mmu_pages;
1580 else
1581 vcpu->kvm->arch.n_free_mmu_pages =
1582 vcpu->kvm->arch.n_alloc_mmu_pages;
1583 /*
1584 * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64.
1585 * Therefore we need to allocate shadow page tables in the first
1586 * 4GB of memory, which happens to fit the DMA32 zone.
1587 */
1588 page = alloc_page(GFP_KERNEL | __GFP_DMA32);
1589 if (!page)
1590 goto error_1;
1591 vcpu->arch.mmu.pae_root = page_address(page);
1592 for (i = 0; i < 4; ++i)
1593 vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
1594
1595 return 0;
1596
1597error_1:
1598 free_mmu_pages(vcpu);
1599 return -ENOMEM;
1600}
1601
1602int kvm_mmu_create(struct kvm_vcpu *vcpu)
1603{
1604 ASSERT(vcpu);
1605 ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
1606
1607 return alloc_mmu_pages(vcpu);
1608}
1609
1610int kvm_mmu_setup(struct kvm_vcpu *vcpu)
1611{
1612 ASSERT(vcpu);
1613 ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
1614
1615 return init_kvm_mmu(vcpu);
1616}
1617
1618void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
1619{
1620 ASSERT(vcpu);
1621
1622 destroy_kvm_mmu(vcpu);
1623 free_mmu_pages(vcpu);
1624 mmu_free_memory_caches(vcpu);
1625}
1626
1627void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
1628{
1629 struct kvm_mmu_page *sp;
1630
1631 list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) {
1632 int i;
1633 u64 *pt;
1634
1635 if (!test_bit(slot, &sp->slot_bitmap))
1636 continue;
1637
1638 pt = sp->spt;
1639 for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
1640 /* avoid RMW */
1641 if (pt[i] & PT_WRITABLE_MASK)
1642 pt[i] &= ~PT_WRITABLE_MASK;
1643 }
1644}
1645
1646void kvm_mmu_zap_all(struct kvm *kvm)
1647{
1648 struct kvm_mmu_page *sp, *node;
1649
1650 spin_lock(&kvm->mmu_lock);
1651 list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link)
1652 kvm_mmu_zap_page(kvm, sp);
1653 spin_unlock(&kvm->mmu_lock);
1654
1655 kvm_flush_remote_tlbs(kvm);
1656}
1657
1658void kvm_mmu_module_exit(void)
1659{
1660 if (pte_chain_cache)
1661 kmem_cache_destroy(pte_chain_cache);
1662 if (rmap_desc_cache)
1663 kmem_cache_destroy(rmap_desc_cache);
1664 if (mmu_page_header_cache)
1665 kmem_cache_destroy(mmu_page_header_cache);
1666}
1667
1668int kvm_mmu_module_init(void)
1669{
1670 pte_chain_cache = kmem_cache_create("kvm_pte_chain",
1671 sizeof(struct kvm_pte_chain),
1672 0, 0, NULL);
1673 if (!pte_chain_cache)
1674 goto nomem;
1675 rmap_desc_cache = kmem_cache_create("kvm_rmap_desc",
1676 sizeof(struct kvm_rmap_desc),
1677 0, 0, NULL);
1678 if (!rmap_desc_cache)
1679 goto nomem;
1680
1681 mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header",
1682 sizeof(struct kvm_mmu_page),
1683 0, 0, NULL);
1684 if (!mmu_page_header_cache)
1685 goto nomem;
1686
1687 return 0;
1688
1689nomem:
1690 kvm_mmu_module_exit();
1691 return -ENOMEM;
1692}
1693
1694/*
1695 * Calculate mmu pages needed for kvm.
1696 */
1697unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm)
1698{
1699 int i;
1700 unsigned int nr_mmu_pages;
1701 unsigned int nr_pages = 0;
1702
1703 for (i = 0; i < kvm->nmemslots; i++)
1704 nr_pages += kvm->memslots[i].npages;
1705
1706 nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000;
1707 nr_mmu_pages = max(nr_mmu_pages,
1708 (unsigned int) KVM_MIN_ALLOC_MMU_PAGES);
1709
1710 return nr_mmu_pages;
1711}
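
The sizing rule above scales shadow pages with guest memory and clamps to a floor. A standalone restatement with illustrative numbers; the two constants here are assumed stand-ins, not the kernel's KVM_PERMILLE_MMU_PAGES or KVM_MIN_ALLOC_MMU_PAGES values.

#define PERMILLE_MMU_PAGES	20u	/* assumed: 20 shadow pages per 1000 guest pages */
#define MIN_ALLOC_MMU_PAGES	64u	/* assumed floor */

static unsigned int mmu_pages_for(unsigned int guest_pages)
{
	unsigned int n = guest_pages * PERMILLE_MMU_PAGES / 1000;

	return n > MIN_ALLOC_MMU_PAGES ? n : MIN_ALLOC_MMU_PAGES;
}

/* a 1GB guest (262144 pages) would get 5242 shadow pages with these numbers */
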
1712
1713#ifdef AUDIT
1714
1715static const char *audit_msg;
1716
1717static gva_t canonicalize(gva_t gva)
1718{
1719#ifdef CONFIG_X86_64
1720 gva = (long long)(gva << 16) >> 16;
1721#endif
1722 return gva;
1723}
1724
1725static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
1726 gva_t va, int level)
1727{
1728 u64 *pt = __va(page_pte & PT64_BASE_ADDR_MASK);
1729 int i;
1730 gva_t va_delta = 1ul << (PAGE_SHIFT + 9 * (level - 1));
1731
1732 for (i = 0; i < PT64_ENT_PER_PAGE; ++i, va += va_delta) {
1733 u64 ent = pt[i];
1734
1735 if (ent == shadow_trap_nonpresent_pte)
1736 continue;
1737
1738 va = canonicalize(va);
1739 if (level > 1) {
1740 if (ent == shadow_notrap_nonpresent_pte)
1741 printk(KERN_ERR "audit: (%s) nontrapping pte"
1742 " in nonleaf level: levels %d gva %lx"
1743 " level %d pte %llx\n", audit_msg,
1744 vcpu->arch.mmu.root_level, va, level, ent);
1745
1746 audit_mappings_page(vcpu, ent, va, level - 1);
1747 } else {
1748 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, va);
1749 struct page *page = gpa_to_page(vcpu, gpa);
1750 hpa_t hpa = page_to_phys(page);
1751
1752 if (is_shadow_present_pte(ent)
1753 && (ent & PT64_BASE_ADDR_MASK) != hpa)
1754 printk(KERN_ERR "xx audit error: (%s) levels %d"
1755 " gva %lx gpa %llx hpa %llx ent %llx %d\n",
1756 audit_msg, vcpu->arch.mmu.root_level,
1757 va, gpa, hpa, ent,
1758 is_shadow_present_pte(ent));
1759 else if (ent == shadow_notrap_nonpresent_pte
1760 && !is_error_hpa(hpa))
1761 printk(KERN_ERR "audit: (%s) notrap shadow,"
1762 " valid guest gva %lx\n", audit_msg, va);
1763 kvm_release_page_clean(page);
1764
1765 }
1766 }
1767}
1768
1769static void audit_mappings(struct kvm_vcpu *vcpu)
1770{
1771 unsigned i;
1772
1773 if (vcpu->arch.mmu.root_level == 4)
1774 audit_mappings_page(vcpu, vcpu->arch.mmu.root_hpa, 0, 4);
1775 else
1776 for (i = 0; i < 4; ++i)
1777 if (vcpu->arch.mmu.pae_root[i] & PT_PRESENT_MASK)
1778 audit_mappings_page(vcpu,
1779 vcpu->arch.mmu.pae_root[i],
1780 i << 30,
1781 2);
1782}
1783
1784static int count_rmaps(struct kvm_vcpu *vcpu)
1785{
1786 int nmaps = 0;
1787 int i, j, k;
1788
1789 for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
1790 struct kvm_memory_slot *m = &vcpu->kvm->memslots[i];
1791 struct kvm_rmap_desc *d;
1792
1793 for (j = 0; j < m->npages; ++j) {
1794 unsigned long *rmapp = &m->rmap[j];
1795
1796 if (!*rmapp)
1797 continue;
1798 if (!(*rmapp & 1)) {
1799 ++nmaps;
1800 continue;
1801 }
1802 d = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
1803 while (d) {
1804 for (k = 0; k < RMAP_EXT; ++k)
1805 if (d->shadow_ptes[k])
1806 ++nmaps;
1807 else
1808 break;
1809 d = d->more;
1810 }
1811 }
1812 }
1813 return nmaps;
1814}
1815
1816static int count_writable_mappings(struct kvm_vcpu *vcpu)
1817{
1818 int nmaps = 0;
1819 struct kvm_mmu_page *sp;
1820 int i;
1821
1822 list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) {
1823 u64 *pt = sp->spt;
1824
1825 if (sp->role.level != PT_PAGE_TABLE_LEVEL)
1826 continue;
1827
1828 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
1829 u64 ent = pt[i];
1830
1831 if (!(ent & PT_PRESENT_MASK))
1832 continue;
1833 if (!(ent & PT_WRITABLE_MASK))
1834 continue;
1835 ++nmaps;
1836 }
1837 }
1838 return nmaps;
1839}
1840
1841static void audit_rmap(struct kvm_vcpu *vcpu)
1842{
1843 int n_rmap = count_rmaps(vcpu);
1844 int n_actual = count_writable_mappings(vcpu);
1845
1846 if (n_rmap != n_actual)
1847 printk(KERN_ERR "%s: (%s) rmap %d actual %d\n",
1848 __FUNCTION__, audit_msg, n_rmap, n_actual);
1849}
1850
1851static void audit_write_protection(struct kvm_vcpu *vcpu)
1852{
1853 struct kvm_mmu_page *sp;
1854 struct kvm_memory_slot *slot;
1855 unsigned long *rmapp;
1856 gfn_t gfn;
1857
1858 list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) {
1859 if (sp->role.metaphysical)
1860 continue;
1861
1862 slot = gfn_to_memslot(vcpu->kvm, sp->gfn);
1863 gfn = unalias_gfn(vcpu->kvm, sp->gfn);
1864 rmapp = &slot->rmap[gfn - slot->base_gfn];
1865 if (*rmapp)
1866 printk(KERN_ERR "%s: (%s) shadow page has writable"
1867 " mappings: gfn %lx role %x\n",
1868 __FUNCTION__, audit_msg, sp->gfn,
1869 sp->role.word);
1870 }
1871}
1872
1873static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg)
1874{
1875 int olddbg = dbg;
1876
1877 dbg = 0;
1878 audit_msg = msg;
1879 audit_rmap(vcpu);
1880 audit_write_protection(vcpu);
1881 audit_mappings(vcpu);
1882 dbg = olddbg;
1883}
1884
1885#endif
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
new file mode 100644
index 000000000000..1fce19ec7a23
--- /dev/null
+++ b/arch/x86/kvm/mmu.h
@@ -0,0 +1,44 @@
1#ifndef __KVM_X86_MMU_H
2#define __KVM_X86_MMU_H
3
4#include <linux/kvm_host.h>
5
6static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
7{
8 if (unlikely(vcpu->kvm->arch.n_free_mmu_pages < KVM_MIN_FREE_MMU_PAGES))
9 __kvm_mmu_free_some_pages(vcpu);
10}
11
12static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu)
13{
14 if (likely(vcpu->arch.mmu.root_hpa != INVALID_PAGE))
15 return 0;
16
17 return kvm_mmu_load(vcpu);
18}
19
20static inline int is_long_mode(struct kvm_vcpu *vcpu)
21{
22#ifdef CONFIG_X86_64
23 return vcpu->arch.shadow_efer & EFER_LME;
24#else
25 return 0;
26#endif
27}
28
29static inline int is_pae(struct kvm_vcpu *vcpu)
30{
31 return vcpu->arch.cr4 & X86_CR4_PAE;
32}
33
34static inline int is_pse(struct kvm_vcpu *vcpu)
35{
36 return vcpu->arch.cr4 & X86_CR4_PSE;
37}
38
39static inline int is_paging(struct kvm_vcpu *vcpu)
40{
41 return vcpu->arch.cr0 & X86_CR0_PG;
42}
43
44#endif
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
new file mode 100644
index 000000000000..03ba8608fe0f
--- /dev/null
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -0,0 +1,484 @@
1/*
2 * Kernel-based Virtual Machine driver for Linux
3 *
4 * This module enables machines with Intel VT-x extensions to run virtual
5 * machines without emulation or binary translation.
6 *
7 * MMU support
8 *
9 * Copyright (C) 2006 Qumranet, Inc.
10 *
11 * Authors:
12 * Yaniv Kamay <yaniv@qumranet.com>
13 * Avi Kivity <avi@qumranet.com>
14 *
15 * This work is licensed under the terms of the GNU GPL, version 2. See
16 * the COPYING file in the top-level directory.
17 *
18 */
19
20/*
21 * We need the mmu code to access both 32-bit and 64-bit guest ptes,
22 * so the code in this file is compiled twice, once per pte size.
23 */
24
25#if PTTYPE == 64
26 #define pt_element_t u64
27 #define guest_walker guest_walker64
28 #define FNAME(name) paging##64_##name
29 #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK
30 #define PT_DIR_BASE_ADDR_MASK PT64_DIR_BASE_ADDR_MASK
31 #define PT_INDEX(addr, level) PT64_INDEX(addr, level)
32 #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
33 #define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level)
34 #define PT_LEVEL_BITS PT64_LEVEL_BITS
35 #ifdef CONFIG_X86_64
36 #define PT_MAX_FULL_LEVELS 4
37 #define CMPXCHG cmpxchg
38 #else
39 #define CMPXCHG cmpxchg64
40 #define PT_MAX_FULL_LEVELS 2
41 #endif
42#elif PTTYPE == 32
43 #define pt_element_t u32
44 #define guest_walker guest_walker32
45 #define FNAME(name) paging##32_##name
46 #define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK
47 #define PT_DIR_BASE_ADDR_MASK PT32_DIR_BASE_ADDR_MASK
48 #define PT_INDEX(addr, level) PT32_INDEX(addr, level)
49 #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
50 #define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level)
51 #define PT_LEVEL_BITS PT32_LEVEL_BITS
52 #define PT_MAX_FULL_LEVELS 2
53 #define CMPXCHG cmpxchg
54#else
55 #error Invalid PTTYPE value
56#endif
57
58#define gpte_to_gfn FNAME(gpte_to_gfn)
59#define gpte_to_gfn_pde FNAME(gpte_to_gfn_pde)
60
61/*
62 * The guest_walker structure emulates the behavior of the hardware page
63 * table walker.
64 */
65struct guest_walker {
66 int level;
67 gfn_t table_gfn[PT_MAX_FULL_LEVELS];
68 pt_element_t ptes[PT_MAX_FULL_LEVELS];
69 gpa_t pte_gpa[PT_MAX_FULL_LEVELS];
70 unsigned pt_access;
71 unsigned pte_access;
72 gfn_t gfn;
73 u32 error_code;
74};
75
76static gfn_t gpte_to_gfn(pt_element_t gpte)
77{
78 return (gpte & PT_BASE_ADDR_MASK) >> PAGE_SHIFT;
79}
80
81static gfn_t gpte_to_gfn_pde(pt_element_t gpte)
82{
83 return (gpte & PT_DIR_BASE_ADDR_MASK) >> PAGE_SHIFT;
84}
85
86static bool FNAME(cmpxchg_gpte)(struct kvm *kvm,
87 gfn_t table_gfn, unsigned index,
88 pt_element_t orig_pte, pt_element_t new_pte)
89{
90 pt_element_t ret;
91 pt_element_t *table;
92 struct page *page;
93
94 page = gfn_to_page(kvm, table_gfn);
95 table = kmap_atomic(page, KM_USER0);
96
97 ret = CMPXCHG(&table[index], orig_pte, new_pte);
98
99 kunmap_atomic(table, KM_USER0);
100
101 kvm_release_page_dirty(page);
102
103 return (ret != orig_pte);
104}
105
106static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte)
107{
108 unsigned access;
109
110 access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK;
111#if PTTYPE == 64
112 if (is_nx(vcpu))
113 access &= ~(gpte >> PT64_NX_SHIFT);
114#endif
115 return access;
116}
117
118/*
119 * Fetch a guest pte for a guest virtual address
120 */
121static int FNAME(walk_addr)(struct guest_walker *walker,
122 struct kvm_vcpu *vcpu, gva_t addr,
123 int write_fault, int user_fault, int fetch_fault)
124{
125 pt_element_t pte;
126 gfn_t table_gfn;
127 unsigned index, pt_access, pte_access;
128 gpa_t pte_gpa;
129
130 pgprintk("%s: addr %lx\n", __FUNCTION__, addr);
131walk:
132 walker->level = vcpu->arch.mmu.root_level;
133 pte = vcpu->arch.cr3;
134#if PTTYPE == 64
135 if (!is_long_mode(vcpu)) {
136 pte = vcpu->arch.pdptrs[(addr >> 30) & 3];
137 if (!is_present_pte(pte))
138 goto not_present;
139 --walker->level;
140 }
141#endif
142 ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) ||
143 (vcpu->cr3 & CR3_NONPAE_RESERVED_BITS) == 0);
144
145 pt_access = ACC_ALL;
146
147 for (;;) {
148 index = PT_INDEX(addr, walker->level);
149
150 table_gfn = gpte_to_gfn(pte);
151 pte_gpa = gfn_to_gpa(table_gfn);
152 pte_gpa += index * sizeof(pt_element_t);
153 walker->table_gfn[walker->level - 1] = table_gfn;
154 walker->pte_gpa[walker->level - 1] = pte_gpa;
155 pgprintk("%s: table_gfn[%d] %lx\n", __FUNCTION__,
156 walker->level - 1, table_gfn);
157
158 kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte));
159
160 if (!is_present_pte(pte))
161 goto not_present;
162
163 if (write_fault && !is_writeble_pte(pte))
164 if (user_fault || is_write_protection(vcpu))
165 goto access_error;
166
167 if (user_fault && !(pte & PT_USER_MASK))
168 goto access_error;
169
170#if PTTYPE == 64
171 if (fetch_fault && is_nx(vcpu) && (pte & PT64_NX_MASK))
172 goto access_error;
173#endif
174
175 if (!(pte & PT_ACCESSED_MASK)) {
176 mark_page_dirty(vcpu->kvm, table_gfn);
177 if (FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn,
178 index, pte, pte|PT_ACCESSED_MASK))
179 goto walk;
180 pte |= PT_ACCESSED_MASK;
181 }
182
183 pte_access = pt_access & FNAME(gpte_access)(vcpu, pte);
184
185 walker->ptes[walker->level - 1] = pte;
186
187 if (walker->level == PT_PAGE_TABLE_LEVEL) {
188 walker->gfn = gpte_to_gfn(pte);
189 break;
190 }
191
192 if (walker->level == PT_DIRECTORY_LEVEL
193 && (pte & PT_PAGE_SIZE_MASK)
194 && (PTTYPE == 64 || is_pse(vcpu))) {
195 walker->gfn = gpte_to_gfn_pde(pte);
196 walker->gfn += PT_INDEX(addr, PT_PAGE_TABLE_LEVEL);
197 if (PTTYPE == 32 && is_cpuid_PSE36())
198 walker->gfn += pse36_gfn_delta(pte);
199 break;
200 }
201
202 pt_access = pte_access;
203 --walker->level;
204 }
205
206 if (write_fault && !is_dirty_pte(pte)) {
207 bool ret;
208
209 mark_page_dirty(vcpu->kvm, table_gfn);
210 ret = FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, index, pte,
211 pte|PT_DIRTY_MASK);
212 if (ret)
213 goto walk;
214 pte |= PT_DIRTY_MASK;
215 kvm_mmu_pte_write(vcpu, pte_gpa, (u8 *)&pte, sizeof(pte));
216 walker->ptes[walker->level - 1] = pte;
217 }
218
219 walker->pt_access = pt_access;
220 walker->pte_access = pte_access;
221 pgprintk("%s: pte %llx pte_access %x pt_access %x\n",
222 __FUNCTION__, (u64)pte, pt_access, pte_access);
223 return 1;
224
225not_present:
226 walker->error_code = 0;
227 goto err;
228
229access_error:
230 walker->error_code = PFERR_PRESENT_MASK;
231
232err:
233 if (write_fault)
234 walker->error_code |= PFERR_WRITE_MASK;
235 if (user_fault)
236 walker->error_code |= PFERR_USER_MASK;
237 if (fetch_fault)
238 walker->error_code |= PFERR_FETCH_MASK;
239 return 0;
240}
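
walk_addr() above follows the guest's own page tables, checking present/write/user/NX at each level and restarting if a racing update is detected. The sketch below is a drastically reduced two-level model with the tables held in plain arrays and only the present and writable checks; it shows the shape of the walk, not the real walker.

#include <stdint.h>

#define PRESENT  (1ull << 0)
#define WRITABLE (1ull << 1)

struct toy_walker {
	uint64_t pte;	/* final leaf entry */
	uint64_t gfn;	/* frame the address maps to */
};

static int toy_walk(const uint64_t *pd, uint64_t pts[][512],
		    uint64_t addr, int write, struct toy_walker *w)
{
	uint64_t pde, pte;

	pde = pd[(addr >> 21) & 0x1ff];
	if (!(pde & PRESENT))
		return 0;		/* guest fault: directory not present */

	/* in this toy, bits 63:12 of the pde index the page-table array */
	pte = pts[pde >> 12][(addr >> 12) & 0x1ff];
	if (!(pte & PRESENT))
		return 0;		/* guest fault: page not present */
	if (write && !(pte & WRITABLE))
		return 0;		/* guest fault: write to read-only page */

	w->pte = pte;
	w->gfn = pte >> 12;
	return 1;
}
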
241
242static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
243 u64 *spte, const void *pte, int bytes,
244 int offset_in_pte)
245{
246 pt_element_t gpte;
247 unsigned pte_access;
248 struct page *npage;
249
250 gpte = *(const pt_element_t *)pte;
251 if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) {
252 if (!offset_in_pte && !is_present_pte(gpte))
253 set_shadow_pte(spte, shadow_notrap_nonpresent_pte);
254 return;
255 }
256 if (bytes < sizeof(pt_element_t))
257 return;
258 pgprintk("%s: gpte %llx spte %p\n", __FUNCTION__, (u64)gpte, spte);
259 pte_access = page->role.access & FNAME(gpte_access)(vcpu, gpte);
260 if (gpte_to_gfn(gpte) != vcpu->arch.update_pte.gfn)
261 return;
262 npage = vcpu->arch.update_pte.page;
263 if (!npage)
264 return;
265 get_page(npage);
266 mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0,
267 gpte & PT_DIRTY_MASK, NULL, gpte_to_gfn(gpte), npage);
268}
269
270/*
271 * Fetch a shadow pte for a specific level in the paging hierarchy.
272 */
273static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
274 struct guest_walker *walker,
275 int user_fault, int write_fault, int *ptwrite,
276 struct page *page)
277{
278 hpa_t shadow_addr;
279 int level;
280 u64 *shadow_ent;
281 unsigned access = walker->pt_access;
282
283 if (!is_present_pte(walker->ptes[walker->level - 1]))
284 return NULL;
285
286 shadow_addr = vcpu->arch.mmu.root_hpa;
287 level = vcpu->arch.mmu.shadow_root_level;
288 if (level == PT32E_ROOT_LEVEL) {
289 shadow_addr = vcpu->arch.mmu.pae_root[(addr >> 30) & 3];
290 shadow_addr &= PT64_BASE_ADDR_MASK;
291 --level;
292 }
293
294 for (; ; level--) {
295 u32 index = SHADOW_PT_INDEX(addr, level);
296 struct kvm_mmu_page *shadow_page;
297 u64 shadow_pte;
298 int metaphysical;
299 gfn_t table_gfn;
300 bool new_page = 0;
301
302 shadow_ent = ((u64 *)__va(shadow_addr)) + index;
303 if (level == PT_PAGE_TABLE_LEVEL)
304 break;
305 if (is_shadow_present_pte(*shadow_ent)) {
306 shadow_addr = *shadow_ent & PT64_BASE_ADDR_MASK;
307 continue;
308 }
309
310 if (level - 1 == PT_PAGE_TABLE_LEVEL
311 && walker->level == PT_DIRECTORY_LEVEL) {
312 metaphysical = 1;
313 if (!is_dirty_pte(walker->ptes[level - 1]))
314 access &= ~ACC_WRITE_MASK;
315 table_gfn = gpte_to_gfn(walker->ptes[level - 1]);
316 } else {
317 metaphysical = 0;
318 table_gfn = walker->table_gfn[level - 2];
319 }
320 shadow_page = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1,
321 metaphysical, access,
322 shadow_ent, &new_page);
323 if (new_page && !metaphysical) {
324 int r;
325 pt_element_t curr_pte;
326 r = kvm_read_guest_atomic(vcpu->kvm,
327 walker->pte_gpa[level - 2],
328 &curr_pte, sizeof(curr_pte));
329 if (r || curr_pte != walker->ptes[level - 2]) {
330 kvm_release_page_clean(page);
331 return NULL;
332 }
333 }
334 shadow_addr = __pa(shadow_page->spt);
335 shadow_pte = shadow_addr | PT_PRESENT_MASK | PT_ACCESSED_MASK
336 | PT_WRITABLE_MASK | PT_USER_MASK;
337 *shadow_ent = shadow_pte;
338 }
339
340 mmu_set_spte(vcpu, shadow_ent, access, walker->pte_access & access,
341 user_fault, write_fault,
342 walker->ptes[walker->level-1] & PT_DIRTY_MASK,
343 ptwrite, walker->gfn, page);
344
345 return shadow_ent;
346}
347
348/*
349 * Page fault handler. There are several causes for a page fault:
350 * - there is no shadow pte for the guest pte
351 * - write access through a shadow pte marked read only so that we can set
352 * the dirty bit
353 * - write access to a shadow pte marked read only so we can update the page
354 * dirty bitmap, when userspace requests it
355 * - mmio access; in this case we will never install a present shadow pte
356 * - normal guest page fault due to the guest pte marked not present, not
357 * writable, or not executable
358 *
359 * Returns: 1 if we need to emulate the instruction, 0 otherwise, or
360 * a negative value on error.
361 */
362static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
363 u32 error_code)
364{
365 int write_fault = error_code & PFERR_WRITE_MASK;
366 int user_fault = error_code & PFERR_USER_MASK;
367 int fetch_fault = error_code & PFERR_FETCH_MASK;
368 struct guest_walker walker;
369 u64 *shadow_pte;
370 int write_pt = 0;
371 int r;
372 struct page *page;
373
374 pgprintk("%s: addr %lx err %x\n", __FUNCTION__, addr, error_code);
375 kvm_mmu_audit(vcpu, "pre page fault");
376
377 r = mmu_topup_memory_caches(vcpu);
378 if (r)
379 return r;
380
381 down_read(&current->mm->mmap_sem);
382 /*
383 * Look up the shadow pte for the faulting address.
384 */
385 r = FNAME(walk_addr)(&walker, vcpu, addr, write_fault, user_fault,
386 fetch_fault);
387
388 /*
389 * The page is not mapped by the guest. Let the guest handle it.
390 */
391 if (!r) {
392 pgprintk("%s: guest page fault\n", __FUNCTION__);
393 inject_page_fault(vcpu, addr, walker.error_code);
394 vcpu->arch.last_pt_write_count = 0; /* reset fork detector */
395 up_read(&current->mm->mmap_sem);
396 return 0;
397 }
398
399 page = gfn_to_page(vcpu->kvm, walker.gfn);
400
401 spin_lock(&vcpu->kvm->mmu_lock);
402 kvm_mmu_free_some_pages(vcpu);
403 shadow_pte = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
404 &write_pt, page);
405 pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __FUNCTION__,
406 shadow_pte, *shadow_pte, write_pt);
407
408 if (!write_pt)
409 vcpu->arch.last_pt_write_count = 0; /* reset fork detector */
410
411 /*
412 * mmio: emulate if accessible, otherwise it's a guest fault.
413 */
414 if (shadow_pte && is_io_pte(*shadow_pte)) {
415 spin_unlock(&vcpu->kvm->mmu_lock);
416 up_read(&current->mm->mmap_sem);
417 return 1;
418 }
419
420 ++vcpu->stat.pf_fixed;
421 kvm_mmu_audit(vcpu, "post page fault (fixed)");
422 spin_unlock(&vcpu->kvm->mmu_lock);
423 up_read(&current->mm->mmap_sem);
424
425 return write_pt;
426}
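
page_fault() above starts by splitting the hardware error code into write/user/fetch flags. A standalone sketch of that decoding, assuming the architectural x86 page-fault error-code layout (P = bit 0, W/R = bit 1, U/S = bit 2, I/D = bit 4).

#define PF_PRESENT	(1u << 0)
#define PF_WRITE	(1u << 1)
#define PF_USER		(1u << 2)
#define PF_FETCH	(1u << 4)

struct fault_kind {
	int write_fault;	/* write access */
	int user_fault;		/* access from user mode */
	int fetch_fault;	/* instruction fetch (NX) */
};

static struct fault_kind decode_error_code(unsigned int error_code)
{
	struct fault_kind f = {
		.write_fault = !!(error_code & PF_WRITE),
		.user_fault  = !!(error_code & PF_USER),
		.fetch_fault = !!(error_code & PF_FETCH),
	};

	return f;
}
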
427
428static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr)
429{
430 struct guest_walker walker;
431 gpa_t gpa = UNMAPPED_GVA;
432 int r;
433
434 r = FNAME(walk_addr)(&walker, vcpu, vaddr, 0, 0, 0);
435
436 if (r) {
437 gpa = gfn_to_gpa(walker.gfn);
438 gpa |= vaddr & ~PAGE_MASK;
439 }
440
441 return gpa;
442}
443
444static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu,
445 struct kvm_mmu_page *sp)
446{
447 int i, offset = 0, r = 0;
448 pt_element_t pt;
449
450 if (sp->role.metaphysical
451 || (PTTYPE == 32 && sp->role.level > PT_PAGE_TABLE_LEVEL)) {
452 nonpaging_prefetch_page(vcpu, sp);
453 return;
454 }
455
456 if (PTTYPE == 32)
457 offset = sp->role.quadrant << PT64_LEVEL_BITS;
458
459 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
460 gpa_t pte_gpa = gfn_to_gpa(sp->gfn);
461 pte_gpa += (i+offset) * sizeof(pt_element_t);
462
463 r = kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &pt,
464 sizeof(pt_element_t));
465 if (r || is_present_pte(pt))
466 sp->spt[i] = shadow_trap_nonpresent_pte;
467 else
468 sp->spt[i] = shadow_notrap_nonpresent_pte;
469 }
470}
471
472#undef pt_element_t
473#undef guest_walker
474#undef FNAME
475#undef PT_BASE_ADDR_MASK
476#undef PT_INDEX
477#undef SHADOW_PT_INDEX
478#undef PT_LEVEL_MASK
479#undef PT_DIR_BASE_ADDR_MASK
480#undef PT_LEVEL_BITS
481#undef PT_MAX_FULL_LEVELS
482#undef gpte_to_gfn
483#undef gpte_to_gfn_pde
484#undef CMPXCHG
diff --git a/drivers/kvm/segment_descriptor.h b/arch/x86/kvm/segment_descriptor.h
index 71fdf458619a..56fc4c873389 100644
--- a/drivers/kvm/segment_descriptor.h
+++ b/arch/x86/kvm/segment_descriptor.h
@@ -1,3 +1,6 @@
1#ifndef __SEGMENT_DESCRIPTOR_H
2#define __SEGMENT_DESCRIPTOR_H
3
1struct segment_descriptor { 4struct segment_descriptor {
2 u16 limit_low; 5 u16 limit_low;
3 u16 base_low; 6 u16 base_low;
@@ -14,4 +17,13 @@ struct segment_descriptor {
14 u8 base_high; 17 u8 base_high;
15} __attribute__((packed)); 18} __attribute__((packed));
16 19
20#ifdef CONFIG_X86_64
21/* LDT or TSS descriptor in the GDT. 16 bytes. */
22struct segment_descriptor_64 {
23 struct segment_descriptor s;
24 u32 base_higher;
25 u32 pad_zero;
26};
17 27
28#endif
29#endif
diff --git a/drivers/kvm/svm.c b/arch/x86/kvm/svm.c
index ced4ac1955db..de755cb1431d 100644
--- a/drivers/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -13,10 +13,11 @@
13 * the COPYING file in the top-level directory. 13 * the COPYING file in the top-level directory.
14 * 14 *
15 */ 15 */
16#include <linux/kvm_host.h>
16 17
17#include "kvm_svm.h" 18#include "kvm_svm.h"
18#include "x86_emulate.h"
19#include "irq.h" 19#include "irq.h"
20#include "mmu.h"
20 21
21#include <linux/module.h> 22#include <linux/module.h>
22#include <linux/kernel.h> 23#include <linux/kernel.h>
@@ -42,9 +43,6 @@ MODULE_LICENSE("GPL");
42#define SEG_TYPE_LDT 2 43#define SEG_TYPE_LDT 2
43#define SEG_TYPE_BUSY_TSS16 3 44#define SEG_TYPE_BUSY_TSS16 3
44 45
45#define KVM_EFER_LMA (1 << 10)
46#define KVM_EFER_LME (1 << 8)
47
48#define SVM_FEATURE_NPT (1 << 0) 46#define SVM_FEATURE_NPT (1 << 0)
49#define SVM_FEATURE_LBRV (1 << 1) 47#define SVM_FEATURE_LBRV (1 << 1)
50#define SVM_DEATURE_SVML (1 << 2) 48#define SVM_DEATURE_SVML (1 << 2)
@@ -102,20 +100,20 @@ static inline u32 svm_has(u32 feat)
102 100
103static inline u8 pop_irq(struct kvm_vcpu *vcpu) 101static inline u8 pop_irq(struct kvm_vcpu *vcpu)
104{ 102{
105 int word_index = __ffs(vcpu->irq_summary); 103 int word_index = __ffs(vcpu->arch.irq_summary);
106 int bit_index = __ffs(vcpu->irq_pending[word_index]); 104 int bit_index = __ffs(vcpu->arch.irq_pending[word_index]);
107 int irq = word_index * BITS_PER_LONG + bit_index; 105 int irq = word_index * BITS_PER_LONG + bit_index;
108 106
109 clear_bit(bit_index, &vcpu->irq_pending[word_index]); 107 clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]);
110 if (!vcpu->irq_pending[word_index]) 108 if (!vcpu->arch.irq_pending[word_index])
111 clear_bit(word_index, &vcpu->irq_summary); 109 clear_bit(word_index, &vcpu->arch.irq_summary);
112 return irq; 110 return irq;
113} 111}
114 112
115static inline void push_irq(struct kvm_vcpu *vcpu, u8 irq) 113static inline void push_irq(struct kvm_vcpu *vcpu, u8 irq)
116{ 114{
117 set_bit(irq, vcpu->irq_pending); 115 set_bit(irq, vcpu->arch.irq_pending);
118 set_bit(irq / BITS_PER_LONG, &vcpu->irq_summary); 116 set_bit(irq / BITS_PER_LONG, &vcpu->arch.irq_summary);
119} 117}
120 118
121static inline void clgi(void) 119static inline void clgi(void)
@@ -184,35 +182,30 @@ static inline void flush_guest_tlb(struct kvm_vcpu *vcpu)
184 182
185static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer) 183static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
186{ 184{
187 if (!(efer & KVM_EFER_LMA)) 185 if (!(efer & EFER_LMA))
188 efer &= ~KVM_EFER_LME; 186 efer &= ~EFER_LME;
189 187
190 to_svm(vcpu)->vmcb->save.efer = efer | MSR_EFER_SVME_MASK; 188 to_svm(vcpu)->vmcb->save.efer = efer | MSR_EFER_SVME_MASK;
191 vcpu->shadow_efer = efer; 189 vcpu->arch.shadow_efer = efer;
192} 190}
193 191
194static void svm_inject_gp(struct kvm_vcpu *vcpu, unsigned error_code) 192static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
193 bool has_error_code, u32 error_code)
195{ 194{
196 struct vcpu_svm *svm = to_svm(vcpu); 195 struct vcpu_svm *svm = to_svm(vcpu);
197 196
198 svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | 197 svm->vmcb->control.event_inj = nr
199 SVM_EVTINJ_VALID_ERR | 198 | SVM_EVTINJ_VALID
200 SVM_EVTINJ_TYPE_EXEPT | 199 | (has_error_code ? SVM_EVTINJ_VALID_ERR : 0)
201 GP_VECTOR; 200 | SVM_EVTINJ_TYPE_EXEPT;
202 svm->vmcb->control.event_inj_err = error_code; 201 svm->vmcb->control.event_inj_err = error_code;
203} 202}
204 203
205static void inject_ud(struct kvm_vcpu *vcpu) 204static bool svm_exception_injected(struct kvm_vcpu *vcpu)
206{ 205{
207 to_svm(vcpu)->vmcb->control.event_inj = SVM_EVTINJ_VALID | 206 struct vcpu_svm *svm = to_svm(vcpu);
208 SVM_EVTINJ_TYPE_EXEPT |
209 UD_VECTOR;
210}
211 207
212static int is_page_fault(uint32_t info) 208 return !(svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_VALID);
213{
214 info &= SVM_EVTINJ_VEC_MASK | SVM_EVTINJ_TYPE_MASK | SVM_EVTINJ_VALID;
215 return info == (PF_VECTOR | SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_EXEPT);
216} 209}
217 210
218static int is_external_interrupt(u32 info) 211static int is_external_interrupt(u32 info)
@@ -229,17 +222,16 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
229 printk(KERN_DEBUG "%s: NOP\n", __FUNCTION__); 222 printk(KERN_DEBUG "%s: NOP\n", __FUNCTION__);
230 return; 223 return;
231 } 224 }
232 if (svm->next_rip - svm->vmcb->save.rip > MAX_INST_SIZE) { 225 if (svm->next_rip - svm->vmcb->save.rip > MAX_INST_SIZE)
233 printk(KERN_ERR "%s: ip 0x%llx next 0x%llx\n", 226 printk(KERN_ERR "%s: ip 0x%llx next 0x%llx\n",
234 __FUNCTION__, 227 __FUNCTION__,
235 svm->vmcb->save.rip, 228 svm->vmcb->save.rip,
236 svm->next_rip); 229 svm->next_rip);
237 }
238 230
239 vcpu->rip = svm->vmcb->save.rip = svm->next_rip; 231 vcpu->arch.rip = svm->vmcb->save.rip = svm->next_rip;
240 svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK; 232 svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK;
241 233
242 vcpu->interrupt_window_open = 1; 234 vcpu->arch.interrupt_window_open = 1;
243} 235}
244 236
245static int has_svm(void) 237static int has_svm(void)
@@ -312,7 +304,7 @@ static void svm_hardware_enable(void *garbage)
312 svm_data->next_asid = svm_data->max_asid + 1; 304 svm_data->next_asid = svm_data->max_asid + 1;
313 svm_features = cpuid_edx(SVM_CPUID_FUNC); 305 svm_features = cpuid_edx(SVM_CPUID_FUNC);
314 306
315 asm volatile ( "sgdt %0" : "=m"(gdt_descr) ); 307 asm volatile ("sgdt %0" : "=m"(gdt_descr));
316 gdt = (struct desc_struct *)gdt_descr.address; 308 gdt = (struct desc_struct *)gdt_descr.address;
317 svm_data->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS); 309 svm_data->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS);
318 310
@@ -458,11 +450,13 @@ static void init_vmcb(struct vmcb *vmcb)
458 450
459 control->intercept_cr_read = INTERCEPT_CR0_MASK | 451 control->intercept_cr_read = INTERCEPT_CR0_MASK |
460 INTERCEPT_CR3_MASK | 452 INTERCEPT_CR3_MASK |
461 INTERCEPT_CR4_MASK; 453 INTERCEPT_CR4_MASK |
454 INTERCEPT_CR8_MASK;
462 455
463 control->intercept_cr_write = INTERCEPT_CR0_MASK | 456 control->intercept_cr_write = INTERCEPT_CR0_MASK |
464 INTERCEPT_CR3_MASK | 457 INTERCEPT_CR3_MASK |
465 INTERCEPT_CR4_MASK; 458 INTERCEPT_CR4_MASK |
459 INTERCEPT_CR8_MASK;
466 460
467 control->intercept_dr_read = INTERCEPT_DR0_MASK | 461 control->intercept_dr_read = INTERCEPT_DR0_MASK |
468 INTERCEPT_DR1_MASK | 462 INTERCEPT_DR1_MASK |
@@ -476,7 +470,8 @@ static void init_vmcb(struct vmcb *vmcb)
476 INTERCEPT_DR5_MASK | 470 INTERCEPT_DR5_MASK |
477 INTERCEPT_DR7_MASK; 471 INTERCEPT_DR7_MASK;
478 472
479 control->intercept_exceptions = 1 << PF_VECTOR; 473 control->intercept_exceptions = (1 << PF_VECTOR) |
474 (1 << UD_VECTOR);
480 475
481 476
482 control->intercept = (1ULL << INTERCEPT_INTR) | 477 control->intercept = (1ULL << INTERCEPT_INTR) |
@@ -543,8 +538,7 @@ static void init_vmcb(struct vmcb *vmcb)
543 init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16); 538 init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16);
544 539
545 save->efer = MSR_EFER_SVME_MASK; 540 save->efer = MSR_EFER_SVME_MASK;
546 541 save->dr6 = 0xffff0ff0;
547 save->dr6 = 0xffff0ff0;
548 save->dr7 = 0x400; 542 save->dr7 = 0x400;
549 save->rflags = 2; 543 save->rflags = 2;
550 save->rip = 0x0000fff0; 544 save->rip = 0x0000fff0;
@@ -558,7 +552,7 @@ static void init_vmcb(struct vmcb *vmcb)
558 /* rdx = ?? */ 552 /* rdx = ?? */
559} 553}
560 554
561static void svm_vcpu_reset(struct kvm_vcpu *vcpu) 555static int svm_vcpu_reset(struct kvm_vcpu *vcpu)
562{ 556{
563 struct vcpu_svm *svm = to_svm(vcpu); 557 struct vcpu_svm *svm = to_svm(vcpu);
564 558
@@ -566,9 +560,11 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu)
566 560
567 if (vcpu->vcpu_id != 0) { 561 if (vcpu->vcpu_id != 0) {
568 svm->vmcb->save.rip = 0; 562 svm->vmcb->save.rip = 0;
569 svm->vmcb->save.cs.base = svm->vcpu.sipi_vector << 12; 563 svm->vmcb->save.cs.base = svm->vcpu.arch.sipi_vector << 12;
570 svm->vmcb->save.cs.selector = svm->vcpu.sipi_vector << 8; 564 svm->vmcb->save.cs.selector = svm->vcpu.arch.sipi_vector << 8;
571 } 565 }
566
567 return 0;
572} 568}
573 569
574static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) 570static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
@@ -587,12 +583,6 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
587 if (err) 583 if (err)
588 goto free_svm; 584 goto free_svm;
589 585
590 if (irqchip_in_kernel(kvm)) {
591 err = kvm_create_lapic(&svm->vcpu);
592 if (err < 0)
593 goto free_svm;
594 }
595
596 page = alloc_page(GFP_KERNEL); 586 page = alloc_page(GFP_KERNEL);
597 if (!page) { 587 if (!page) {
598 err = -ENOMEM; 588 err = -ENOMEM;
@@ -608,9 +598,9 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
608 598
609 fx_init(&svm->vcpu); 599 fx_init(&svm->vcpu);
610 svm->vcpu.fpu_active = 1; 600 svm->vcpu.fpu_active = 1;
611 svm->vcpu.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; 601 svm->vcpu.arch.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
612 if (svm->vcpu.vcpu_id == 0) 602 if (svm->vcpu.vcpu_id == 0)
613 svm->vcpu.apic_base |= MSR_IA32_APICBASE_BSP; 603 svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP;
614 604
615 return &svm->vcpu; 605 return &svm->vcpu;
616 606
@@ -644,7 +634,7 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
644 * increasing TSC. 634 * increasing TSC.
645 */ 635 */
646 rdtscll(tsc_this); 636 rdtscll(tsc_this);
647 delta = vcpu->host_tsc - tsc_this; 637 delta = vcpu->arch.host_tsc - tsc_this;
648 svm->vmcb->control.tsc_offset += delta; 638 svm->vmcb->control.tsc_offset += delta;
649 vcpu->cpu = cpu; 639 vcpu->cpu = cpu;
650 kvm_migrate_apic_timer(vcpu); 640 kvm_migrate_apic_timer(vcpu);
@@ -659,11 +649,11 @@ static void svm_vcpu_put(struct kvm_vcpu *vcpu)
659 struct vcpu_svm *svm = to_svm(vcpu); 649 struct vcpu_svm *svm = to_svm(vcpu);
660 int i; 650 int i;
661 651
652 ++vcpu->stat.host_state_reload;
662 for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) 653 for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
663 wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]); 654 wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
664 655
665 rdtscll(vcpu->host_tsc); 656 rdtscll(vcpu->arch.host_tsc);
666 kvm_put_guest_fpu(vcpu);
667} 657}
668 658
669static void svm_vcpu_decache(struct kvm_vcpu *vcpu) 659static void svm_vcpu_decache(struct kvm_vcpu *vcpu)
@@ -674,17 +664,17 @@ static void svm_cache_regs(struct kvm_vcpu *vcpu)
674{ 664{
675 struct vcpu_svm *svm = to_svm(vcpu); 665 struct vcpu_svm *svm = to_svm(vcpu);
676 666
677 vcpu->regs[VCPU_REGS_RAX] = svm->vmcb->save.rax; 667 vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
678 vcpu->regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp; 668 vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
679 vcpu->rip = svm->vmcb->save.rip; 669 vcpu->arch.rip = svm->vmcb->save.rip;
680} 670}
681 671
682static void svm_decache_regs(struct kvm_vcpu *vcpu) 672static void svm_decache_regs(struct kvm_vcpu *vcpu)
683{ 673{
684 struct vcpu_svm *svm = to_svm(vcpu); 674 struct vcpu_svm *svm = to_svm(vcpu);
685 svm->vmcb->save.rax = vcpu->regs[VCPU_REGS_RAX]; 675 svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
686 svm->vmcb->save.rsp = vcpu->regs[VCPU_REGS_RSP]; 676 svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
687 svm->vmcb->save.rip = vcpu->rip; 677 svm->vmcb->save.rip = vcpu->arch.rip;
688} 678}
689 679
690static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu) 680static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
@@ -782,24 +772,24 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
782 struct vcpu_svm *svm = to_svm(vcpu); 772 struct vcpu_svm *svm = to_svm(vcpu);
783 773
784#ifdef CONFIG_X86_64 774#ifdef CONFIG_X86_64
785 if (vcpu->shadow_efer & KVM_EFER_LME) { 775 if (vcpu->arch.shadow_efer & EFER_LME) {
786 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) { 776 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
787 vcpu->shadow_efer |= KVM_EFER_LMA; 777 vcpu->arch.shadow_efer |= EFER_LMA;
788 svm->vmcb->save.efer |= KVM_EFER_LMA | KVM_EFER_LME; 778 svm->vmcb->save.efer |= EFER_LMA | EFER_LME;
789 } 779 }
790 780
791 if (is_paging(vcpu) && !(cr0 & X86_CR0_PG) ) { 781 if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) {
792 vcpu->shadow_efer &= ~KVM_EFER_LMA; 782 vcpu->arch.shadow_efer &= ~EFER_LMA;
793 svm->vmcb->save.efer &= ~(KVM_EFER_LMA | KVM_EFER_LME); 783 svm->vmcb->save.efer &= ~(EFER_LMA | EFER_LME);
794 } 784 }
795 } 785 }
796#endif 786#endif
797 if ((vcpu->cr0 & X86_CR0_TS) && !(cr0 & X86_CR0_TS)) { 787 if ((vcpu->arch.cr0 & X86_CR0_TS) && !(cr0 & X86_CR0_TS)) {
798 svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR); 788 svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR);
799 vcpu->fpu_active = 1; 789 vcpu->fpu_active = 1;
800 } 790 }
801 791
802 vcpu->cr0 = cr0; 792 vcpu->arch.cr0 = cr0;
803 cr0 |= X86_CR0_PG | X86_CR0_WP; 793 cr0 |= X86_CR0_PG | X86_CR0_WP;
804 cr0 &= ~(X86_CR0_CD | X86_CR0_NW); 794 cr0 &= ~(X86_CR0_CD | X86_CR0_NW);
805 svm->vmcb->save.cr0 = cr0; 795 svm->vmcb->save.cr0 = cr0;
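
The svm_set_cr0() change above carries the long-mode activation rule: with EFER.LME set, enabling CR0.PG also sets EFER.LMA, and clearing CR0.PG drops it again. Below is a minimal user-space sketch of that rule; the bit constants are the architectural positions, written out only so the example is self-contained, and update_efer_on_cr0_write() is an illustrative helper, not a KVM function.

#include <stdint.h>
#include <stdio.h>

#define X86_CR0_PG (1UL << 31)
#define EFER_LME   (1ULL << 8)
#define EFER_LMA   (1ULL << 10)

/* Mirrors the EFER_LME/EFER_LMA handling in svm_set_cr0() above. */
static uint64_t update_efer_on_cr0_write(uint64_t efer, unsigned long old_cr0,
					 unsigned long new_cr0)
{
	if (efer & EFER_LME) {
		if (!(old_cr0 & X86_CR0_PG) && (new_cr0 & X86_CR0_PG))
			efer |= EFER_LMA;	/* paging on: long mode active */
		if ((old_cr0 & X86_CR0_PG) && !(new_cr0 & X86_CR0_PG))
			efer &= ~EFER_LMA;	/* paging off: long mode inactive */
	}
	return efer;
}

int main(void)
{
	uint64_t efer = EFER_LME;

	efer = update_efer_on_cr0_write(efer, 0, X86_CR0_PG);
	printf("LMA after enabling paging: %d\n", !!(efer & EFER_LMA));
	efer = update_efer_on_cr0_write(efer, X86_CR0_PG, 0);
	printf("LMA after disabling paging: %d\n", !!(efer & EFER_LMA));
	return 0;
}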
@@ -807,7 +797,7 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
807 797
808static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) 798static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
809{ 799{
810 vcpu->cr4 = cr4; 800 vcpu->arch.cr4 = cr4;
811 to_svm(vcpu)->vmcb->save.cr4 = cr4 | X86_CR4_PAE; 801 to_svm(vcpu)->vmcb->save.cr4 = cr4 | X86_CR4_PAE;
812} 802}
813 803
@@ -912,7 +902,7 @@ static void svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value,
912 svm->db_regs[dr] = value; 902 svm->db_regs[dr] = value;
913 return; 903 return;
914 case 4 ... 5: 904 case 4 ... 5:
915 if (vcpu->cr4 & X86_CR4_DE) { 905 if (vcpu->arch.cr4 & X86_CR4_DE) {
916 *exception = UD_VECTOR; 906 *exception = UD_VECTOR;
917 return; 907 return;
918 } 908 }
@@ -938,51 +928,30 @@ static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
938 struct kvm *kvm = svm->vcpu.kvm; 928 struct kvm *kvm = svm->vcpu.kvm;
939 u64 fault_address; 929 u64 fault_address;
940 u32 error_code; 930 u32 error_code;
941 enum emulation_result er;
942 int r;
943 931
944 if (!irqchip_in_kernel(kvm) && 932 if (!irqchip_in_kernel(kvm) &&
945 is_external_interrupt(exit_int_info)) 933 is_external_interrupt(exit_int_info))
946 push_irq(&svm->vcpu, exit_int_info & SVM_EVTINJ_VEC_MASK); 934 push_irq(&svm->vcpu, exit_int_info & SVM_EVTINJ_VEC_MASK);
947 935
948 mutex_lock(&kvm->lock);
949
950 fault_address = svm->vmcb->control.exit_info_2; 936 fault_address = svm->vmcb->control.exit_info_2;
951 error_code = svm->vmcb->control.exit_info_1; 937 error_code = svm->vmcb->control.exit_info_1;
952 r = kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code); 938 return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code);
953 if (r < 0) { 939}
954 mutex_unlock(&kvm->lock);
955 return r;
956 }
957 if (!r) {
958 mutex_unlock(&kvm->lock);
959 return 1;
960 }
961 er = emulate_instruction(&svm->vcpu, kvm_run, fault_address,
962 error_code);
963 mutex_unlock(&kvm->lock);
964 940
965 switch (er) { 941static int ud_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
966 case EMULATE_DONE: 942{
967 return 1; 943 int er;
968 case EMULATE_DO_MMIO:
969 ++svm->vcpu.stat.mmio_exits;
970 return 0;
971 case EMULATE_FAIL:
972 kvm_report_emulation_failure(&svm->vcpu, "pagetable");
973 break;
974 default:
975 BUG();
976 }
977 944
978 kvm_run->exit_reason = KVM_EXIT_UNKNOWN; 945 er = emulate_instruction(&svm->vcpu, kvm_run, 0, 0, EMULTYPE_TRAP_UD);
979 return 0; 946 if (er != EMULATE_DONE)
947 kvm_queue_exception(&svm->vcpu, UD_VECTOR);
948 return 1;
980} 949}
981 950
982static int nm_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 951static int nm_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
983{ 952{
984 svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR); 953 svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR);
985 if (!(svm->vcpu.cr0 & X86_CR0_TS)) 954 if (!(svm->vcpu.arch.cr0 & X86_CR0_TS))
986 svm->vmcb->save.cr0 &= ~X86_CR0_TS; 955 svm->vmcb->save.cr0 &= ~X86_CR0_TS;
987 svm->vcpu.fpu_active = 1; 956 svm->vcpu.fpu_active = 1;
988 957
@@ -1004,7 +973,7 @@ static int shutdown_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1004 973
1005static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 974static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1006{ 975{
1007 u32 io_info = svm->vmcb->control.exit_info_1; //address size bug? 976 u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */
1008 int size, down, in, string, rep; 977 int size, down, in, string, rep;
1009 unsigned port; 978 unsigned port;
1010 979
@@ -1015,7 +984,8 @@ static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1015 string = (io_info & SVM_IOIO_STR_MASK) != 0; 984 string = (io_info & SVM_IOIO_STR_MASK) != 0;
1016 985
1017 if (string) { 986 if (string) {
1018 if (emulate_instruction(&svm->vcpu, kvm_run, 0, 0) == EMULATE_DO_MMIO) 987 if (emulate_instruction(&svm->vcpu,
988 kvm_run, 0, 0, 0) == EMULATE_DO_MMIO)
1019 return 0; 989 return 0;
1020 return 1; 990 return 1;
1021 } 991 }
@@ -1045,13 +1015,14 @@ static int vmmcall_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1045{ 1015{
1046 svm->next_rip = svm->vmcb->save.rip + 3; 1016 svm->next_rip = svm->vmcb->save.rip + 3;
1047 skip_emulated_instruction(&svm->vcpu); 1017 skip_emulated_instruction(&svm->vcpu);
1048 return kvm_hypercall(&svm->vcpu, kvm_run); 1018 kvm_emulate_hypercall(&svm->vcpu);
1019 return 1;
1049} 1020}
1050 1021
1051static int invalid_op_interception(struct vcpu_svm *svm, 1022static int invalid_op_interception(struct vcpu_svm *svm,
1052 struct kvm_run *kvm_run) 1023 struct kvm_run *kvm_run)
1053{ 1024{
1054 inject_ud(&svm->vcpu); 1025 kvm_queue_exception(&svm->vcpu, UD_VECTOR);
1055 return 1; 1026 return 1;
1056} 1027}
1057 1028
@@ -1073,11 +1044,20 @@ static int cpuid_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1073static int emulate_on_interception(struct vcpu_svm *svm, 1044static int emulate_on_interception(struct vcpu_svm *svm,
1074 struct kvm_run *kvm_run) 1045 struct kvm_run *kvm_run)
1075{ 1046{
1076 if (emulate_instruction(&svm->vcpu, NULL, 0, 0) != EMULATE_DONE) 1047 if (emulate_instruction(&svm->vcpu, NULL, 0, 0, 0) != EMULATE_DONE)
1077 pr_unimpl(&svm->vcpu, "%s: failed\n", __FUNCTION__); 1048 pr_unimpl(&svm->vcpu, "%s: failed\n", __FUNCTION__);
1078 return 1; 1049 return 1;
1079} 1050}
1080 1051
1052static int cr8_write_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1053{
1054 emulate_instruction(&svm->vcpu, NULL, 0, 0, 0);
1055 if (irqchip_in_kernel(svm->vcpu.kvm))
1056 return 1;
1057 kvm_run->exit_reason = KVM_EXIT_SET_TPR;
1058 return 0;
1059}
1060
1081static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data) 1061static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
1082{ 1062{
1083 struct vcpu_svm *svm = to_svm(vcpu); 1063 struct vcpu_svm *svm = to_svm(vcpu);
@@ -1124,14 +1104,14 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
1124 1104
1125static int rdmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1105static int rdmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1126{ 1106{
1127 u32 ecx = svm->vcpu.regs[VCPU_REGS_RCX]; 1107 u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX];
1128 u64 data; 1108 u64 data;
1129 1109
1130 if (svm_get_msr(&svm->vcpu, ecx, &data)) 1110 if (svm_get_msr(&svm->vcpu, ecx, &data))
1131 svm_inject_gp(&svm->vcpu, 0); 1111 kvm_inject_gp(&svm->vcpu, 0);
1132 else { 1112 else {
1133 svm->vmcb->save.rax = data & 0xffffffff; 1113 svm->vmcb->save.rax = data & 0xffffffff;
1134 svm->vcpu.regs[VCPU_REGS_RDX] = data >> 32; 1114 svm->vcpu.arch.regs[VCPU_REGS_RDX] = data >> 32;
1135 svm->next_rip = svm->vmcb->save.rip + 2; 1115 svm->next_rip = svm->vmcb->save.rip + 2;
1136 skip_emulated_instruction(&svm->vcpu); 1116 skip_emulated_instruction(&svm->vcpu);
1137 } 1117 }
@@ -1176,7 +1156,20 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
1176 case MSR_IA32_SYSENTER_ESP: 1156 case MSR_IA32_SYSENTER_ESP:
1177 svm->vmcb->save.sysenter_esp = data; 1157 svm->vmcb->save.sysenter_esp = data;
1178 break; 1158 break;
1159 case MSR_K7_EVNTSEL0:
1160 case MSR_K7_EVNTSEL1:
1161 case MSR_K7_EVNTSEL2:
1162 case MSR_K7_EVNTSEL3:
1163 /*
1164 * only support writing 0 to the performance counters for now
1165 * to make Windows happy. Should be replaced by a real
1166 * performance counter emulation later.
1167 */
1168 if (data != 0)
1169 goto unhandled;
1170 break;
1179 default: 1171 default:
1172 unhandled:
1180 return kvm_set_msr_common(vcpu, ecx, data); 1173 return kvm_set_msr_common(vcpu, ecx, data);
1181 } 1174 }
1182 return 0; 1175 return 0;
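
The new MSR_K7_EVNTSEL* cases above accept only a write of zero and bounce anything else to kvm_set_msr_common() through the unhandled label. A rough user-space sketch of that accept-the-benign-value, defer-the-rest shape follows; the MSR indices follow the usual K7 event-select numbering, while the handler and the stub are invented for the example.

#include <stdint.h>
#include <stdio.h>

#define MSR_K7_EVNTSEL0 0xc0010000u
#define MSR_K7_EVNTSEL3 0xc0010003u

/* Stand-in for kvm_set_msr_common(): just report what was deferred. */
static int set_msr_common_stub(uint32_t ecx, uint64_t data)
{
	printf("deferred: msr 0x%x data 0x%llx\n",
	       (unsigned)ecx, (unsigned long long)data);
	return 0;
}

static int set_msr_sketch(uint32_t ecx, uint64_t data)
{
	if (ecx >= MSR_K7_EVNTSEL0 && ecx <= MSR_K7_EVNTSEL3) {
		if (data != 0)
			return set_msr_common_stub(ecx, data);	/* "unhandled" */
		return 0;	/* zero write: pretend the counter is programmed */
	}
	return set_msr_common_stub(ecx, data);
}

int main(void)
{
	set_msr_sketch(MSR_K7_EVNTSEL0, 0);		/* accepted silently */
	set_msr_sketch(MSR_K7_EVNTSEL0, 0x430076);	/* handed to the stub */
	return 0;
}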
@@ -1184,12 +1177,12 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
1184 1177
1185static int wrmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1178static int wrmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1186{ 1179{
1187 u32 ecx = svm->vcpu.regs[VCPU_REGS_RCX]; 1180 u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX];
1188 u64 data = (svm->vmcb->save.rax & -1u) 1181 u64 data = (svm->vmcb->save.rax & -1u)
1189 | ((u64)(svm->vcpu.regs[VCPU_REGS_RDX] & -1u) << 32); 1182 | ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32);
1190 svm->next_rip = svm->vmcb->save.rip + 2; 1183 svm->next_rip = svm->vmcb->save.rip + 2;
1191 if (svm_set_msr(&svm->vcpu, ecx, data)) 1184 if (svm_set_msr(&svm->vcpu, ecx, data))
1192 svm_inject_gp(&svm->vcpu, 0); 1185 kvm_inject_gp(&svm->vcpu, 0);
1193 else 1186 else
1194 skip_emulated_instruction(&svm->vcpu); 1187 skip_emulated_instruction(&svm->vcpu);
1195 return 1; 1188 return 1;
@@ -1213,7 +1206,7 @@ static int interrupt_window_interception(struct vcpu_svm *svm,
1213 * possible 1206 * possible
1214 */ 1207 */
1215 if (kvm_run->request_interrupt_window && 1208 if (kvm_run->request_interrupt_window &&
1216 !svm->vcpu.irq_summary) { 1209 !svm->vcpu.arch.irq_summary) {
1217 ++svm->vcpu.stat.irq_window_exits; 1210 ++svm->vcpu.stat.irq_window_exits;
1218 kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; 1211 kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
1219 return 0; 1212 return 0;
@@ -1227,10 +1220,12 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm,
1227 [SVM_EXIT_READ_CR0] = emulate_on_interception, 1220 [SVM_EXIT_READ_CR0] = emulate_on_interception,
1228 [SVM_EXIT_READ_CR3] = emulate_on_interception, 1221 [SVM_EXIT_READ_CR3] = emulate_on_interception,
1229 [SVM_EXIT_READ_CR4] = emulate_on_interception, 1222 [SVM_EXIT_READ_CR4] = emulate_on_interception,
1223 [SVM_EXIT_READ_CR8] = emulate_on_interception,
1230 /* for now: */ 1224 /* for now: */
1231 [SVM_EXIT_WRITE_CR0] = emulate_on_interception, 1225 [SVM_EXIT_WRITE_CR0] = emulate_on_interception,
1232 [SVM_EXIT_WRITE_CR3] = emulate_on_interception, 1226 [SVM_EXIT_WRITE_CR3] = emulate_on_interception,
1233 [SVM_EXIT_WRITE_CR4] = emulate_on_interception, 1227 [SVM_EXIT_WRITE_CR4] = emulate_on_interception,
1228 [SVM_EXIT_WRITE_CR8] = cr8_write_interception,
1234 [SVM_EXIT_READ_DR0] = emulate_on_interception, 1229 [SVM_EXIT_READ_DR0] = emulate_on_interception,
1235 [SVM_EXIT_READ_DR1] = emulate_on_interception, 1230 [SVM_EXIT_READ_DR1] = emulate_on_interception,
1236 [SVM_EXIT_READ_DR2] = emulate_on_interception, 1231 [SVM_EXIT_READ_DR2] = emulate_on_interception,
@@ -1241,6 +1236,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm,
1241 [SVM_EXIT_WRITE_DR3] = emulate_on_interception, 1236 [SVM_EXIT_WRITE_DR3] = emulate_on_interception,
1242 [SVM_EXIT_WRITE_DR5] = emulate_on_interception, 1237 [SVM_EXIT_WRITE_DR5] = emulate_on_interception,
1243 [SVM_EXIT_WRITE_DR7] = emulate_on_interception, 1238 [SVM_EXIT_WRITE_DR7] = emulate_on_interception,
1239 [SVM_EXIT_EXCP_BASE + UD_VECTOR] = ud_interception,
1244 [SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception, 1240 [SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception,
1245 [SVM_EXIT_EXCP_BASE + NM_VECTOR] = nm_interception, 1241 [SVM_EXIT_EXCP_BASE + NM_VECTOR] = nm_interception,
1246 [SVM_EXIT_INTR] = nop_on_interception, 1242 [SVM_EXIT_INTR] = nop_on_interception,
@@ -1293,7 +1289,7 @@ static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
1293 exit_code); 1289 exit_code);
1294 1290
1295 if (exit_code >= ARRAY_SIZE(svm_exit_handlers) 1291 if (exit_code >= ARRAY_SIZE(svm_exit_handlers)
1296 || svm_exit_handlers[exit_code] == 0) { 1292 || !svm_exit_handlers[exit_code]) {
1297 kvm_run->exit_reason = KVM_EXIT_UNKNOWN; 1293 kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
1298 kvm_run->hw.hardware_exit_reason = exit_code; 1294 kvm_run->hw.hardware_exit_reason = exit_code;
1299 return 0; 1295 return 0;
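
handle_exit() above is a table-driven dispatcher: the exit code indexes svm_exit_handlers[], with a bounds check and (after this patch) a plain NULL test for empty slots. A self-contained sketch of the same pattern follows; handler names and exit codes are invented.

#include <stddef.h>
#include <stdio.h>

#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))

typedef int (*exit_handler_t)(void);

static int handle_io(void)    { printf("io exit\n");    return 1; }
static int handle_cpuid(void) { printf("cpuid exit\n"); return 1; }

/* Sparse table: unlisted exit codes stay NULL, like the kernel array. */
static exit_handler_t handlers[] = {
	[1] = handle_io,
	[3] = handle_cpuid,
};

static int dispatch(unsigned int exit_code)
{
	if (exit_code >= ARRAY_SIZE(handlers) || !handlers[exit_code]) {
		printf("unknown exit %u\n", exit_code);
		return 0;
	}
	return handlers[exit_code]();
}

int main(void)
{
	dispatch(3);
	dispatch(7);	/* out of range / empty slot */
	return 0;
}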
@@ -1307,7 +1303,7 @@ static void reload_tss(struct kvm_vcpu *vcpu)
1307 int cpu = raw_smp_processor_id(); 1303 int cpu = raw_smp_processor_id();
1308 1304
1309 struct svm_cpu_data *svm_data = per_cpu(svm_data, cpu); 1305 struct svm_cpu_data *svm_data = per_cpu(svm_data, cpu);
1310 svm_data->tss_desc->type = 9; //available 32/64-bit TSS 1306 svm_data->tss_desc->type = 9; /* available 32/64-bit TSS */
1311 load_TR_desc(); 1307 load_TR_desc();
1312} 1308}
1313 1309
@@ -1348,7 +1344,6 @@ static void svm_intr_assist(struct kvm_vcpu *vcpu)
1348 struct vmcb *vmcb = svm->vmcb; 1344 struct vmcb *vmcb = svm->vmcb;
1349 int intr_vector = -1; 1345 int intr_vector = -1;
1350 1346
1351 kvm_inject_pending_timer_irqs(vcpu);
1352 if ((vmcb->control.exit_int_info & SVM_EVTINJ_VALID) && 1347 if ((vmcb->control.exit_int_info & SVM_EVTINJ_VALID) &&
1353 ((vmcb->control.exit_int_info & SVM_EVTINJ_TYPE_MASK) == 0)) { 1348 ((vmcb->control.exit_int_info & SVM_EVTINJ_TYPE_MASK) == 0)) {
1354 intr_vector = vmcb->control.exit_int_info & 1349 intr_vector = vmcb->control.exit_int_info &
@@ -1388,20 +1383,20 @@ static void kvm_reput_irq(struct vcpu_svm *svm)
1388 push_irq(&svm->vcpu, control->int_vector); 1383 push_irq(&svm->vcpu, control->int_vector);
1389 } 1384 }
1390 1385
1391 svm->vcpu.interrupt_window_open = 1386 svm->vcpu.arch.interrupt_window_open =
1392 !(control->int_state & SVM_INTERRUPT_SHADOW_MASK); 1387 !(control->int_state & SVM_INTERRUPT_SHADOW_MASK);
1393} 1388}
1394 1389
1395static void svm_do_inject_vector(struct vcpu_svm *svm) 1390static void svm_do_inject_vector(struct vcpu_svm *svm)
1396{ 1391{
1397 struct kvm_vcpu *vcpu = &svm->vcpu; 1392 struct kvm_vcpu *vcpu = &svm->vcpu;
1398 int word_index = __ffs(vcpu->irq_summary); 1393 int word_index = __ffs(vcpu->arch.irq_summary);
1399 int bit_index = __ffs(vcpu->irq_pending[word_index]); 1394 int bit_index = __ffs(vcpu->arch.irq_pending[word_index]);
1400 int irq = word_index * BITS_PER_LONG + bit_index; 1395 int irq = word_index * BITS_PER_LONG + bit_index;
1401 1396
1402 clear_bit(bit_index, &vcpu->irq_pending[word_index]); 1397 clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]);
1403 if (!vcpu->irq_pending[word_index]) 1398 if (!vcpu->arch.irq_pending[word_index])
1404 clear_bit(word_index, &vcpu->irq_summary); 1399 clear_bit(word_index, &vcpu->arch.irq_summary);
1405 svm_inject_irq(svm, irq); 1400 svm_inject_irq(svm, irq);
1406} 1401}
1407 1402
@@ -1411,11 +1406,11 @@ static void do_interrupt_requests(struct kvm_vcpu *vcpu,
1411 struct vcpu_svm *svm = to_svm(vcpu); 1406 struct vcpu_svm *svm = to_svm(vcpu);
1412 struct vmcb_control_area *control = &svm->vmcb->control; 1407 struct vmcb_control_area *control = &svm->vmcb->control;
1413 1408
1414 svm->vcpu.interrupt_window_open = 1409 svm->vcpu.arch.interrupt_window_open =
1415 (!(control->int_state & SVM_INTERRUPT_SHADOW_MASK) && 1410 (!(control->int_state & SVM_INTERRUPT_SHADOW_MASK) &&
1416 (svm->vmcb->save.rflags & X86_EFLAGS_IF)); 1411 (svm->vmcb->save.rflags & X86_EFLAGS_IF));
1417 1412
1418 if (svm->vcpu.interrupt_window_open && svm->vcpu.irq_summary) 1413 if (svm->vcpu.arch.interrupt_window_open && svm->vcpu.arch.irq_summary)
1419 /* 1414 /*
1420 * If interrupts enabled, and not blocked by sti or mov ss. Good. 1415 * If interrupts enabled, and not blocked by sti or mov ss. Good.
1421 */ 1416 */
@@ -1424,13 +1419,18 @@ static void do_interrupt_requests(struct kvm_vcpu *vcpu,
1424 /* 1419 /*
1425 * Interrupts blocked. Wait for unblock. 1420 * Interrupts blocked. Wait for unblock.
1426 */ 1421 */
1427 if (!svm->vcpu.interrupt_window_open && 1422 if (!svm->vcpu.arch.interrupt_window_open &&
1428 (svm->vcpu.irq_summary || kvm_run->request_interrupt_window)) { 1423 (svm->vcpu.arch.irq_summary || kvm_run->request_interrupt_window))
1429 control->intercept |= 1ULL << INTERCEPT_VINTR; 1424 control->intercept |= 1ULL << INTERCEPT_VINTR;
1430 } else 1425 else
1431 control->intercept &= ~(1ULL << INTERCEPT_VINTR); 1426 control->intercept &= ~(1ULL << INTERCEPT_VINTR);
1432} 1427}
1433 1428
1429static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr)
1430{
1431 return 0;
1432}
1433
1434static void save_db_regs(unsigned long *db_regs) 1434static void save_db_regs(unsigned long *db_regs)
1435{ 1435{
1436 asm volatile ("mov %%dr0, %0" : "=r"(db_regs[0])); 1436 asm volatile ("mov %%dr0, %0" : "=r"(db_regs[0]));
@@ -1472,7 +1472,7 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1472 svm->host_cr2 = kvm_read_cr2(); 1472 svm->host_cr2 = kvm_read_cr2();
1473 svm->host_dr6 = read_dr6(); 1473 svm->host_dr6 = read_dr6();
1474 svm->host_dr7 = read_dr7(); 1474 svm->host_dr7 = read_dr7();
1475 svm->vmcb->save.cr2 = vcpu->cr2; 1475 svm->vmcb->save.cr2 = vcpu->arch.cr2;
1476 1476
1477 if (svm->vmcb->save.dr7 & 0xff) { 1477 if (svm->vmcb->save.dr7 & 0xff) {
1478 write_dr7(0); 1478 write_dr7(0);
@@ -1486,13 +1486,9 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1486 1486
1487 asm volatile ( 1487 asm volatile (
1488#ifdef CONFIG_X86_64 1488#ifdef CONFIG_X86_64
1489 "push %%rbx; push %%rcx; push %%rdx;" 1489 "push %%rbp; \n\t"
1490 "push %%rsi; push %%rdi; push %%rbp;"
1491 "push %%r8; push %%r9; push %%r10; push %%r11;"
1492 "push %%r12; push %%r13; push %%r14; push %%r15;"
1493#else 1490#else
1494 "push %%ebx; push %%ecx; push %%edx;" 1491 "push %%ebp; \n\t"
1495 "push %%esi; push %%edi; push %%ebp;"
1496#endif 1492#endif
1497 1493
1498#ifdef CONFIG_X86_64 1494#ifdef CONFIG_X86_64
@@ -1554,10 +1550,7 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1554 "mov %%r14, %c[r14](%[svm]) \n\t" 1550 "mov %%r14, %c[r14](%[svm]) \n\t"
1555 "mov %%r15, %c[r15](%[svm]) \n\t" 1551 "mov %%r15, %c[r15](%[svm]) \n\t"
1556 1552
1557 "pop %%r15; pop %%r14; pop %%r13; pop %%r12;" 1553 "pop %%rbp; \n\t"
1558 "pop %%r11; pop %%r10; pop %%r9; pop %%r8;"
1559 "pop %%rbp; pop %%rdi; pop %%rsi;"
1560 "pop %%rdx; pop %%rcx; pop %%rbx; \n\t"
1561#else 1554#else
1562 "mov %%ebx, %c[rbx](%[svm]) \n\t" 1555 "mov %%ebx, %c[rbx](%[svm]) \n\t"
1563 "mov %%ecx, %c[rcx](%[svm]) \n\t" 1556 "mov %%ecx, %c[rcx](%[svm]) \n\t"
@@ -1566,34 +1559,40 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1566 "mov %%edi, %c[rdi](%[svm]) \n\t" 1559 "mov %%edi, %c[rdi](%[svm]) \n\t"
1567 "mov %%ebp, %c[rbp](%[svm]) \n\t" 1560 "mov %%ebp, %c[rbp](%[svm]) \n\t"
1568 1561
1569 "pop %%ebp; pop %%edi; pop %%esi;" 1562 "pop %%ebp; \n\t"
1570 "pop %%edx; pop %%ecx; pop %%ebx; \n\t"
1571#endif 1563#endif
1572 : 1564 :
1573 : [svm]"a"(svm), 1565 : [svm]"a"(svm),
1574 [vmcb]"i"(offsetof(struct vcpu_svm, vmcb_pa)), 1566 [vmcb]"i"(offsetof(struct vcpu_svm, vmcb_pa)),
1575 [rbx]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_RBX])), 1567 [rbx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RBX])),
1576 [rcx]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_RCX])), 1568 [rcx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RCX])),
1577 [rdx]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_RDX])), 1569 [rdx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RDX])),
1578 [rsi]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_RSI])), 1570 [rsi]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RSI])),
1579 [rdi]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_RDI])), 1571 [rdi]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RDI])),
1580 [rbp]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_RBP])) 1572 [rbp]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RBP]))
1581#ifdef CONFIG_X86_64 1573#ifdef CONFIG_X86_64
1582 ,[r8 ]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R8])), 1574 , [r8]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R8])),
1583 [r9 ]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R9 ])), 1575 [r9]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R9])),
1584 [r10]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R10])), 1576 [r10]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R10])),
1585 [r11]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R11])), 1577 [r11]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R11])),
1586 [r12]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R12])), 1578 [r12]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R12])),
1587 [r13]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R13])), 1579 [r13]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R13])),
1588 [r14]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R14])), 1580 [r14]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R14])),
1589 [r15]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R15])) 1581 [r15]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R15]))
1590#endif 1582#endif
1591 : "cc", "memory" ); 1583 : "cc", "memory"
1584#ifdef CONFIG_X86_64
1585 , "rbx", "rcx", "rdx", "rsi", "rdi"
1586 , "r8", "r9", "r10", "r11" , "r12", "r13", "r14", "r15"
1587#else
1588 , "ebx", "ecx", "edx" , "esi", "edi"
1589#endif
1590 );
1592 1591
1593 if ((svm->vmcb->save.dr7 & 0xff)) 1592 if ((svm->vmcb->save.dr7 & 0xff))
1594 load_db_regs(svm->host_db_regs); 1593 load_db_regs(svm->host_db_regs);
1595 1594
1596 vcpu->cr2 = svm->vmcb->save.cr2; 1595 vcpu->arch.cr2 = svm->vmcb->save.cr2;
1597 1596
1598 write_dr6(svm->host_dr6); 1597 write_dr6(svm->host_dr6);
1599 write_dr7(svm->host_dr7); 1598 write_dr7(svm->host_dr7);
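
The svm_vcpu_run() hunks above stop pushing and popping every scratch register by hand and instead list them as asm clobbers (only rbp is still saved explicitly), so the compiler preserves just what it actually has live. Below is a tiny, hypothetical illustration of a clobber list, assuming an x86-64 GCC-style toolchain; it is not related to the VMRUN sequence itself.

#include <stdio.h>

static long add_via_rcx(long a, long b)
{
	long sum;

	/* rcx is scratched inside the asm, so it is declared as a clobber
	 * instead of being pushed and popped around the block. */
	asm("mov %1, %%rcx \n\t"
	    "add %2, %%rcx \n\t"
	    "mov %%rcx, %0 \n\t"
	    : "=r"(sum)
	    : "r"(a), "r"(b)
	    : "rcx", "cc");
	return sum;
}

int main(void)
{
	printf("%ld\n", add_via_rcx(2, 3));
	return 0;
}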
@@ -1627,34 +1626,6 @@ static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root)
1627 } 1626 }
1628} 1627}
1629 1628
1630static void svm_inject_page_fault(struct kvm_vcpu *vcpu,
1631 unsigned long addr,
1632 uint32_t err_code)
1633{
1634 struct vcpu_svm *svm = to_svm(vcpu);
1635 uint32_t exit_int_info = svm->vmcb->control.exit_int_info;
1636
1637 ++vcpu->stat.pf_guest;
1638
1639 if (is_page_fault(exit_int_info)) {
1640
1641 svm->vmcb->control.event_inj_err = 0;
1642 svm->vmcb->control.event_inj = SVM_EVTINJ_VALID |
1643 SVM_EVTINJ_VALID_ERR |
1644 SVM_EVTINJ_TYPE_EXEPT |
1645 DF_VECTOR;
1646 return;
1647 }
1648 vcpu->cr2 = addr;
1649 svm->vmcb->save.cr2 = addr;
1650 svm->vmcb->control.event_inj = SVM_EVTINJ_VALID |
1651 SVM_EVTINJ_VALID_ERR |
1652 SVM_EVTINJ_TYPE_EXEPT |
1653 PF_VECTOR;
1654 svm->vmcb->control.event_inj_err = err_code;
1655}
1656
1657
1658static int is_disabled(void) 1629static int is_disabled(void)
1659{ 1630{
1660 u64 vm_cr; 1631 u64 vm_cr;
@@ -1675,7 +1646,6 @@ svm_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
1675 hypercall[0] = 0x0f; 1646 hypercall[0] = 0x0f;
1676 hypercall[1] = 0x01; 1647 hypercall[1] = 0x01;
1677 hypercall[2] = 0xd9; 1648 hypercall[2] = 0xd9;
1678 hypercall[3] = 0xc3;
1679} 1649}
1680 1650
1681static void svm_check_processor_compat(void *rtn) 1651static void svm_check_processor_compat(void *rtn)
@@ -1683,6 +1653,11 @@ static void svm_check_processor_compat(void *rtn)
1683 *(int *)rtn = 0; 1653 *(int *)rtn = 0;
1684} 1654}
1685 1655
1656static bool svm_cpu_has_accelerated_tpr(void)
1657{
1658 return false;
1659}
1660
1686static struct kvm_x86_ops svm_x86_ops = { 1661static struct kvm_x86_ops svm_x86_ops = {
1687 .cpu_has_kvm_support = has_svm, 1662 .cpu_has_kvm_support = has_svm,
1688 .disabled_by_bios = is_disabled, 1663 .disabled_by_bios = is_disabled,
@@ -1691,6 +1666,7 @@ static struct kvm_x86_ops svm_x86_ops = {
1691 .check_processor_compatibility = svm_check_processor_compat, 1666 .check_processor_compatibility = svm_check_processor_compat,
1692 .hardware_enable = svm_hardware_enable, 1667 .hardware_enable = svm_hardware_enable,
1693 .hardware_disable = svm_hardware_disable, 1668 .hardware_disable = svm_hardware_disable,
1669 .cpu_has_accelerated_tpr = svm_cpu_has_accelerated_tpr,
1694 1670
1695 .vcpu_create = svm_create_vcpu, 1671 .vcpu_create = svm_create_vcpu,
1696 .vcpu_free = svm_free_vcpu, 1672 .vcpu_free = svm_free_vcpu,
@@ -1725,9 +1701,6 @@ static struct kvm_x86_ops svm_x86_ops = {
1725 .set_rflags = svm_set_rflags, 1701 .set_rflags = svm_set_rflags,
1726 1702
1727 .tlb_flush = svm_flush_tlb, 1703 .tlb_flush = svm_flush_tlb,
1728 .inject_page_fault = svm_inject_page_fault,
1729
1730 .inject_gp = svm_inject_gp,
1731 1704
1732 .run = svm_vcpu_run, 1705 .run = svm_vcpu_run,
1733 .handle_exit = handle_exit, 1706 .handle_exit = handle_exit,
@@ -1735,19 +1708,23 @@ static struct kvm_x86_ops svm_x86_ops = {
1735 .patch_hypercall = svm_patch_hypercall, 1708 .patch_hypercall = svm_patch_hypercall,
1736 .get_irq = svm_get_irq, 1709 .get_irq = svm_get_irq,
1737 .set_irq = svm_set_irq, 1710 .set_irq = svm_set_irq,
1711 .queue_exception = svm_queue_exception,
1712 .exception_injected = svm_exception_injected,
1738 .inject_pending_irq = svm_intr_assist, 1713 .inject_pending_irq = svm_intr_assist,
1739 .inject_pending_vectors = do_interrupt_requests, 1714 .inject_pending_vectors = do_interrupt_requests,
1715
1716 .set_tss_addr = svm_set_tss_addr,
1740}; 1717};
1741 1718
1742static int __init svm_init(void) 1719static int __init svm_init(void)
1743{ 1720{
1744 return kvm_init_x86(&svm_x86_ops, sizeof(struct vcpu_svm), 1721 return kvm_init(&svm_x86_ops, sizeof(struct vcpu_svm),
1745 THIS_MODULE); 1722 THIS_MODULE);
1746} 1723}
1747 1724
1748static void __exit svm_exit(void) 1725static void __exit svm_exit(void)
1749{ 1726{
1750 kvm_exit_x86(); 1727 kvm_exit();
1751} 1728}
1752 1729
1753module_init(svm_init) 1730module_init(svm_init)
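
svm_init() and svm_exit() now call the generic kvm_init()/kvm_exit() entry points with svm_x86_ops rather than the old kvm_init_x86()/kvm_exit_x86(). The sketch below only illustrates the underlying ops-table idea, a vendor module registering a table of function pointers that the core calls through; struct hv_ops, core_init() and the rest are invented names, not the KVM interface.

#include <stdio.h>

struct hv_ops {
	const char *name;
	int  (*hardware_enable)(void);
	void (*run)(void);
};

static int  svm_hw_enable(void) { puts("enable SVM"); return 0; }
static void svm_run_stub(void)  { puts("VMRUN"); }

static struct hv_ops svm_ops = {
	.name            = "svm",
	.hardware_enable = svm_hw_enable,
	.run             = svm_run_stub,
};

/* Stand-in for kvm_init(): remember one registered ops table. */
static struct hv_ops *active_ops;

static int core_init(struct hv_ops *ops)
{
	active_ops = ops;
	printf("registered %s backend\n", active_ops->name);
	return active_ops->hardware_enable();
}

int main(void)
{
	if (core_init(&svm_ops) == 0)
		active_ops->run();
	return 0;
}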
diff --git a/drivers/kvm/svm.h b/arch/x86/kvm/svm.h
index 3b1b0f35b6cb..5fd50491b555 100644
--- a/drivers/kvm/svm.h
+++ b/arch/x86/kvm/svm.h
@@ -204,6 +204,7 @@ struct __attribute__ ((__packed__)) vmcb {
204#define INTERCEPT_CR0_MASK 1 204#define INTERCEPT_CR0_MASK 1
205#define INTERCEPT_CR3_MASK (1 << 3) 205#define INTERCEPT_CR3_MASK (1 << 3)
206#define INTERCEPT_CR4_MASK (1 << 4) 206#define INTERCEPT_CR4_MASK (1 << 4)
207#define INTERCEPT_CR8_MASK (1 << 8)
207 208
208#define INTERCEPT_DR0_MASK 1 209#define INTERCEPT_DR0_MASK 1
209#define INTERCEPT_DR1_MASK (1 << 1) 210#define INTERCEPT_DR1_MASK (1 << 1)
@@ -311,7 +312,7 @@ struct __attribute__ ((__packed__)) vmcb {
311 312
312#define SVM_EXIT_ERR -1 313#define SVM_EXIT_ERR -1
313 314
314#define SVM_CR0_SELECTIVE_MASK (1 << 3 | 1) // TS and MP 315#define SVM_CR0_SELECTIVE_MASK (1 << 3 | 1) /* TS and MP */
315 316
316#define SVM_VMLOAD ".byte 0x0f, 0x01, 0xda" 317#define SVM_VMLOAD ".byte 0x0f, 0x01, 0xda"
317#define SVM_VMRUN ".byte 0x0f, 0x01, 0xd8" 318#define SVM_VMRUN ".byte 0x0f, 0x01, 0xd8"
diff --git a/drivers/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 5b397b6c9f93..ad36447e696e 100644
--- a/drivers/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -15,17 +15,18 @@
15 * 15 *
16 */ 16 */
17 17
18#include "kvm.h"
19#include "x86_emulate.h"
20#include "irq.h" 18#include "irq.h"
21#include "vmx.h" 19#include "vmx.h"
22#include "segment_descriptor.h" 20#include "segment_descriptor.h"
21#include "mmu.h"
23 22
23#include <linux/kvm_host.h>
24#include <linux/module.h> 24#include <linux/module.h>
25#include <linux/kernel.h> 25#include <linux/kernel.h>
26#include <linux/mm.h> 26#include <linux/mm.h>
27#include <linux/highmem.h> 27#include <linux/highmem.h>
28#include <linux/sched.h> 28#include <linux/sched.h>
29#include <linux/moduleparam.h>
29 30
30#include <asm/io.h> 31#include <asm/io.h>
31#include <asm/desc.h> 32#include <asm/desc.h>
@@ -33,6 +34,9 @@
33MODULE_AUTHOR("Qumranet"); 34MODULE_AUTHOR("Qumranet");
34MODULE_LICENSE("GPL"); 35MODULE_LICENSE("GPL");
35 36
37static int bypass_guest_pf = 1;
38module_param(bypass_guest_pf, bool, 0);
39
36struct vmcs { 40struct vmcs {
37 u32 revision_id; 41 u32 revision_id;
38 u32 abort; 42 u32 abort;
@@ -43,6 +47,7 @@ struct vcpu_vmx {
43 struct kvm_vcpu vcpu; 47 struct kvm_vcpu vcpu;
44 int launched; 48 int launched;
45 u8 fail; 49 u8 fail;
50 u32 idt_vectoring_info;
46 struct kvm_msr_entry *guest_msrs; 51 struct kvm_msr_entry *guest_msrs;
47 struct kvm_msr_entry *host_msrs; 52 struct kvm_msr_entry *host_msrs;
48 int nmsrs; 53 int nmsrs;
@@ -57,8 +62,15 @@ struct vcpu_vmx {
57 u16 fs_sel, gs_sel, ldt_sel; 62 u16 fs_sel, gs_sel, ldt_sel;
58 int gs_ldt_reload_needed; 63 int gs_ldt_reload_needed;
59 int fs_reload_needed; 64 int fs_reload_needed;
60 }host_state; 65 int guest_efer_loaded;
61 66 } host_state;
67 struct {
68 struct {
69 bool pending;
70 u8 vector;
71 unsigned rip;
72 } irq;
73 } rmode;
62}; 74};
63 75
64static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) 76static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
@@ -74,14 +86,13 @@ static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
74static struct page *vmx_io_bitmap_a; 86static struct page *vmx_io_bitmap_a;
75static struct page *vmx_io_bitmap_b; 87static struct page *vmx_io_bitmap_b;
76 88
77#define EFER_SAVE_RESTORE_BITS ((u64)EFER_SCE)
78
79static struct vmcs_config { 89static struct vmcs_config {
80 int size; 90 int size;
81 int order; 91 int order;
82 u32 revision_id; 92 u32 revision_id;
83 u32 pin_based_exec_ctrl; 93 u32 pin_based_exec_ctrl;
84 u32 cpu_based_exec_ctrl; 94 u32 cpu_based_exec_ctrl;
95 u32 cpu_based_2nd_exec_ctrl;
85 u32 vmexit_ctrl; 96 u32 vmexit_ctrl;
86 u32 vmentry_ctrl; 97 u32 vmentry_ctrl;
87} vmcs_config; 98} vmcs_config;
@@ -138,18 +149,6 @@ static void save_msrs(struct kvm_msr_entry *e, int n)
138 rdmsrl(e[i].index, e[i].data); 149 rdmsrl(e[i].index, e[i].data);
139} 150}
140 151
141static inline u64 msr_efer_save_restore_bits(struct kvm_msr_entry msr)
142{
143 return (u64)msr.data & EFER_SAVE_RESTORE_BITS;
144}
145
146static inline int msr_efer_need_save_restore(struct vcpu_vmx *vmx)
147{
148 int efer_offset = vmx->msr_offset_efer;
149 return msr_efer_save_restore_bits(vmx->host_msrs[efer_offset]) !=
150 msr_efer_save_restore_bits(vmx->guest_msrs[efer_offset]);
151}
152
153static inline int is_page_fault(u32 intr_info) 152static inline int is_page_fault(u32 intr_info)
154{ 153{
155 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | 154 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
@@ -164,6 +163,13 @@ static inline int is_no_device(u32 intr_info)
164 (INTR_TYPE_EXCEPTION | NM_VECTOR | INTR_INFO_VALID_MASK); 163 (INTR_TYPE_EXCEPTION | NM_VECTOR | INTR_INFO_VALID_MASK);
165} 164}
166 165
166static inline int is_invalid_opcode(u32 intr_info)
167{
168 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
169 INTR_INFO_VALID_MASK)) ==
170 (INTR_TYPE_EXCEPTION | UD_VECTOR | INTR_INFO_VALID_MASK);
171}
172
167static inline int is_external_interrupt(u32 intr_info) 173static inline int is_external_interrupt(u32 intr_info)
168{ 174{
169 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK)) 175 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
@@ -180,6 +186,24 @@ static inline int vm_need_tpr_shadow(struct kvm *kvm)
180 return ((cpu_has_vmx_tpr_shadow()) && (irqchip_in_kernel(kvm))); 186 return ((cpu_has_vmx_tpr_shadow()) && (irqchip_in_kernel(kvm)));
181} 187}
182 188
189static inline int cpu_has_secondary_exec_ctrls(void)
190{
191 return (vmcs_config.cpu_based_exec_ctrl &
192 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS);
193}
194
195static inline bool cpu_has_vmx_virtualize_apic_accesses(void)
196{
197 return (vmcs_config.cpu_based_2nd_exec_ctrl &
198 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
199}
200
201static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm)
202{
203 return ((cpu_has_vmx_virtualize_apic_accesses()) &&
204 (irqchip_in_kernel(kvm)));
205}
206
183static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr) 207static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
184{ 208{
185 int i; 209 int i;
@@ -222,16 +246,14 @@ static void __vcpu_clear(void *arg)
222 vmcs_clear(vmx->vmcs); 246 vmcs_clear(vmx->vmcs);
223 if (per_cpu(current_vmcs, cpu) == vmx->vmcs) 247 if (per_cpu(current_vmcs, cpu) == vmx->vmcs)
224 per_cpu(current_vmcs, cpu) = NULL; 248 per_cpu(current_vmcs, cpu) = NULL;
225 rdtscll(vmx->vcpu.host_tsc); 249 rdtscll(vmx->vcpu.arch.host_tsc);
226} 250}
227 251
228static void vcpu_clear(struct vcpu_vmx *vmx) 252static void vcpu_clear(struct vcpu_vmx *vmx)
229{ 253{
230 if (vmx->vcpu.cpu != raw_smp_processor_id() && vmx->vcpu.cpu != -1) 254 if (vmx->vcpu.cpu == -1)
231 smp_call_function_single(vmx->vcpu.cpu, __vcpu_clear, 255 return;
232 vmx, 0, 1); 256 smp_call_function_single(vmx->vcpu.cpu, __vcpu_clear, vmx, 0, 1);
233 else
234 __vcpu_clear(vmx);
235 vmx->launched = 0; 257 vmx->launched = 0;
236} 258}
237 259
@@ -275,7 +297,7 @@ static void vmcs_writel(unsigned long field, unsigned long value)
275 u8 error; 297 u8 error;
276 298
277 asm volatile (ASM_VMX_VMWRITE_RAX_RDX "; setna %0" 299 asm volatile (ASM_VMX_VMWRITE_RAX_RDX "; setna %0"
278 : "=q"(error) : "a"(value), "d"(field) : "cc" ); 300 : "=q"(error) : "a"(value), "d"(field) : "cc");
279 if (unlikely(error)) 301 if (unlikely(error))
280 vmwrite_error(field, value); 302 vmwrite_error(field, value);
281} 303}
@@ -315,12 +337,12 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
315{ 337{
316 u32 eb; 338 u32 eb;
317 339
318 eb = 1u << PF_VECTOR; 340 eb = (1u << PF_VECTOR) | (1u << UD_VECTOR);
319 if (!vcpu->fpu_active) 341 if (!vcpu->fpu_active)
320 eb |= 1u << NM_VECTOR; 342 eb |= 1u << NM_VECTOR;
321 if (vcpu->guest_debug.enabled) 343 if (vcpu->guest_debug.enabled)
322 eb |= 1u << 1; 344 eb |= 1u << 1;
323 if (vcpu->rmode.active) 345 if (vcpu->arch.rmode.active)
324 eb = ~0; 346 eb = ~0;
325 vmcs_write32(EXCEPTION_BITMAP, eb); 347 vmcs_write32(EXCEPTION_BITMAP, eb);
326} 348}
@@ -344,16 +366,42 @@ static void reload_tss(void)
344 366
345static void load_transition_efer(struct vcpu_vmx *vmx) 367static void load_transition_efer(struct vcpu_vmx *vmx)
346{ 368{
347 u64 trans_efer;
348 int efer_offset = vmx->msr_offset_efer; 369 int efer_offset = vmx->msr_offset_efer;
370 u64 host_efer = vmx->host_msrs[efer_offset].data;
371 u64 guest_efer = vmx->guest_msrs[efer_offset].data;
372 u64 ignore_bits;
349 373
350 trans_efer = vmx->host_msrs[efer_offset].data; 374 if (efer_offset < 0)
351 trans_efer &= ~EFER_SAVE_RESTORE_BITS; 375 return;
352 trans_efer |= msr_efer_save_restore_bits(vmx->guest_msrs[efer_offset]); 376 /*
353 wrmsrl(MSR_EFER, trans_efer); 377 * NX is emulated; LMA and LME handled by hardware; SCE meaninless
378 * outside long mode
379 */
380 ignore_bits = EFER_NX | EFER_SCE;
381#ifdef CONFIG_X86_64
382 ignore_bits |= EFER_LMA | EFER_LME;
383 /* SCE is meaningful only in long mode on Intel */
384 if (guest_efer & EFER_LMA)
385 ignore_bits &= ~(u64)EFER_SCE;
386#endif
387 if ((guest_efer & ~ignore_bits) == (host_efer & ~ignore_bits))
388 return;
389
390 vmx->host_state.guest_efer_loaded = 1;
391 guest_efer &= ~ignore_bits;
392 guest_efer |= host_efer & ignore_bits;
393 wrmsrl(MSR_EFER, guest_efer);
354 vmx->vcpu.stat.efer_reload++; 394 vmx->vcpu.stat.efer_reload++;
355} 395}
356 396
397static void reload_host_efer(struct vcpu_vmx *vmx)
398{
399 if (vmx->host_state.guest_efer_loaded) {
400 vmx->host_state.guest_efer_loaded = 0;
401 load_msrs(vmx->host_msrs + vmx->msr_offset_efer, 1);
402 }
403}
404
357static void vmx_save_host_state(struct kvm_vcpu *vcpu) 405static void vmx_save_host_state(struct kvm_vcpu *vcpu)
358{ 406{
359 struct vcpu_vmx *vmx = to_vmx(vcpu); 407 struct vcpu_vmx *vmx = to_vmx(vcpu);
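
load_transition_efer() above skips the EFER write entirely when guest and host differ only in bits that do not matter at VM entry: NX is emulated, LMA/LME are handled by hardware, and SCE only matters once the guest is in long mode. The stand-alone sketch below reproduces that decision, assuming a 64-bit host; efer_switch_needed() is an illustrative helper and the EFER bit values are the architectural ones.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define EFER_SCE (1ULL << 0)
#define EFER_LME (1ULL << 8)
#define EFER_LMA (1ULL << 10)
#define EFER_NX  (1ULL << 11)

/* True when guest and host EFER differ in a bit that matters, i.e. when
 * the MSR really has to be switched on entry/exit. */
static bool efer_switch_needed(uint64_t host_efer, uint64_t guest_efer)
{
	uint64_t ignore_bits = EFER_NX | EFER_SCE;	/* NX emulated, SCE moot */

	ignore_bits |= EFER_LMA | EFER_LME;		/* handled by hardware */
	if (guest_efer & EFER_LMA)			/* SCE matters in long mode */
		ignore_bits &= ~EFER_SCE;

	return (guest_efer & ~ignore_bits) != (host_efer & ~ignore_bits);
}

int main(void)
{
	uint64_t host = EFER_SCE | EFER_LMA | EFER_LME | EFER_NX;

	/* 32-bit guest, syscall irrelevant: no switch needed. */
	printf("%d\n", efer_switch_needed(host, 0));
	/* long-mode guest with SCE clear: the difference is visible. */
	printf("%d\n", efer_switch_needed(host, EFER_LMA | EFER_LME));
	return 0;
}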
@@ -393,14 +441,13 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu)
393#endif 441#endif
394 442
395#ifdef CONFIG_X86_64 443#ifdef CONFIG_X86_64
396 if (is_long_mode(&vmx->vcpu)) { 444 if (is_long_mode(&vmx->vcpu))
397 save_msrs(vmx->host_msrs + 445 save_msrs(vmx->host_msrs +
398 vmx->msr_offset_kernel_gs_base, 1); 446 vmx->msr_offset_kernel_gs_base, 1);
399 } 447
400#endif 448#endif
401 load_msrs(vmx->guest_msrs, vmx->save_nmsrs); 449 load_msrs(vmx->guest_msrs, vmx->save_nmsrs);
402 if (msr_efer_need_save_restore(vmx)) 450 load_transition_efer(vmx);
403 load_transition_efer(vmx);
404} 451}
405 452
406static void vmx_load_host_state(struct vcpu_vmx *vmx) 453static void vmx_load_host_state(struct vcpu_vmx *vmx)
@@ -410,6 +457,7 @@ static void vmx_load_host_state(struct vcpu_vmx *vmx)
410 if (!vmx->host_state.loaded) 457 if (!vmx->host_state.loaded)
411 return; 458 return;
412 459
460 ++vmx->vcpu.stat.host_state_reload;
413 vmx->host_state.loaded = 0; 461 vmx->host_state.loaded = 0;
414 if (vmx->host_state.fs_reload_needed) 462 if (vmx->host_state.fs_reload_needed)
415 load_fs(vmx->host_state.fs_sel); 463 load_fs(vmx->host_state.fs_sel);
@@ -429,8 +477,7 @@ static void vmx_load_host_state(struct vcpu_vmx *vmx)
429 reload_tss(); 477 reload_tss();
430 save_msrs(vmx->guest_msrs, vmx->save_nmsrs); 478 save_msrs(vmx->guest_msrs, vmx->save_nmsrs);
431 load_msrs(vmx->host_msrs, vmx->save_nmsrs); 479 load_msrs(vmx->host_msrs, vmx->save_nmsrs);
432 if (msr_efer_need_save_restore(vmx)) 480 reload_host_efer(vmx);
433 load_msrs(vmx->host_msrs + vmx->msr_offset_efer, 1);
434} 481}
435 482
436/* 483/*
@@ -480,7 +527,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
480 * Make sure the time stamp counter is monotonous. 527 * Make sure the time stamp counter is monotonous.
481 */ 528 */
482 rdtscll(tsc_this); 529 rdtscll(tsc_this);
483 delta = vcpu->host_tsc - tsc_this; 530 delta = vcpu->arch.host_tsc - tsc_this;
484 vmcs_write64(TSC_OFFSET, vmcs_read64(TSC_OFFSET) + delta); 531 vmcs_write64(TSC_OFFSET, vmcs_read64(TSC_OFFSET) + delta);
485 } 532 }
486} 533}
@@ -488,7 +535,6 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
488static void vmx_vcpu_put(struct kvm_vcpu *vcpu) 535static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
489{ 536{
490 vmx_load_host_state(to_vmx(vcpu)); 537 vmx_load_host_state(to_vmx(vcpu));
491 kvm_put_guest_fpu(vcpu);
492} 538}
493 539
494static void vmx_fpu_activate(struct kvm_vcpu *vcpu) 540static void vmx_fpu_activate(struct kvm_vcpu *vcpu)
@@ -497,7 +543,7 @@ static void vmx_fpu_activate(struct kvm_vcpu *vcpu)
497 return; 543 return;
498 vcpu->fpu_active = 1; 544 vcpu->fpu_active = 1;
499 vmcs_clear_bits(GUEST_CR0, X86_CR0_TS); 545 vmcs_clear_bits(GUEST_CR0, X86_CR0_TS);
500 if (vcpu->cr0 & X86_CR0_TS) 546 if (vcpu->arch.cr0 & X86_CR0_TS)
501 vmcs_set_bits(GUEST_CR0, X86_CR0_TS); 547 vmcs_set_bits(GUEST_CR0, X86_CR0_TS);
502 update_exception_bitmap(vcpu); 548 update_exception_bitmap(vcpu);
503} 549}
@@ -523,7 +569,7 @@ static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
523 569
524static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) 570static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
525{ 571{
526 if (vcpu->rmode.active) 572 if (vcpu->arch.rmode.active)
527 rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; 573 rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
528 vmcs_writel(GUEST_RFLAGS, rflags); 574 vmcs_writel(GUEST_RFLAGS, rflags);
529} 575}
@@ -545,19 +591,25 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
545 if (interruptibility & 3) 591 if (interruptibility & 3)
546 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 592 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
547 interruptibility & ~3); 593 interruptibility & ~3);
548 vcpu->interrupt_window_open = 1; 594 vcpu->arch.interrupt_window_open = 1;
549} 595}
550 596
551static void vmx_inject_gp(struct kvm_vcpu *vcpu, unsigned error_code) 597static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
598 bool has_error_code, u32 error_code)
552{ 599{
553 printk(KERN_DEBUG "inject_general_protection: rip 0x%lx\n",
554 vmcs_readl(GUEST_RIP));
555 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
556 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 600 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
557 GP_VECTOR | 601 nr | INTR_TYPE_EXCEPTION
558 INTR_TYPE_EXCEPTION | 602 | (has_error_code ? INTR_INFO_DELIEVER_CODE_MASK : 0)
559 INTR_INFO_DELIEVER_CODE_MASK | 603 | INTR_INFO_VALID_MASK);
560 INTR_INFO_VALID_MASK); 604 if (has_error_code)
605 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
606}
607
608static bool vmx_exception_injected(struct kvm_vcpu *vcpu)
609{
610 struct vcpu_vmx *vmx = to_vmx(vcpu);
611
612 return !(vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK);
561} 613}
562 614
563/* 615/*
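
vmx_queue_exception() above assembles the VM-entry interruption-information field from the vector, the hardware-exception type, an optional deliver-error-code bit and the valid bit. The sketch below spells out that packing; the bit layout follows the Intel SDM, and the macro spellings here are local to the example (the kernel uses its own names such as INTR_INFO_DELIEVER_CODE_MASK).

#include <stdint.h>
#include <stdio.h>

#define INTR_INFO_VECTOR_MASK       0xffu
#define INTR_TYPE_EXCEPTION         (3u << 8)	/* hardware exception */
#define INTR_INFO_DELIVER_CODE_MASK (1u << 11)
#define INTR_INFO_VALID_MASK        (1u << 31)

static uint32_t pack_event(uint8_t vector, int has_error_code)
{
	return (vector & INTR_INFO_VECTOR_MASK) | INTR_TYPE_EXCEPTION
	       | (has_error_code ? INTR_INFO_DELIVER_CODE_MASK : 0)
	       | INTR_INFO_VALID_MASK;
}

int main(void)
{
	/* #GP (vector 13) with an error code, as kvm_inject_gp() would queue. */
	printf("0x%08x\n", (unsigned)pack_event(13, 1));
	return 0;
}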
@@ -608,7 +660,7 @@ static void setup_msrs(struct vcpu_vmx *vmx)
608 * if efer.sce is enabled. 660 * if efer.sce is enabled.
609 */ 661 */
610 index = __find_msr_index(vmx, MSR_K6_STAR); 662 index = __find_msr_index(vmx, MSR_K6_STAR);
611 if ((index >= 0) && (vmx->vcpu.shadow_efer & EFER_SCE)) 663 if ((index >= 0) && (vmx->vcpu.arch.shadow_efer & EFER_SCE))
612 move_msr_up(vmx, index, save_nmsrs++); 664 move_msr_up(vmx, index, save_nmsrs++);
613 } 665 }
614#endif 666#endif
@@ -712,8 +764,10 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
712#ifdef CONFIG_X86_64 764#ifdef CONFIG_X86_64
713 case MSR_EFER: 765 case MSR_EFER:
714 ret = kvm_set_msr_common(vcpu, msr_index, data); 766 ret = kvm_set_msr_common(vcpu, msr_index, data);
715 if (vmx->host_state.loaded) 767 if (vmx->host_state.loaded) {
768 reload_host_efer(vmx);
716 load_transition_efer(vmx); 769 load_transition_efer(vmx);
770 }
717 break; 771 break;
718 case MSR_FS_BASE: 772 case MSR_FS_BASE:
719 vmcs_writel(GUEST_FS_BASE, data); 773 vmcs_writel(GUEST_FS_BASE, data);
@@ -750,12 +804,12 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
750 804
751/* 805/*
752 * Sync the rsp and rip registers into the vcpu structure. This allows 806 * Sync the rsp and rip registers into the vcpu structure. This allows
753 * registers to be accessed by indexing vcpu->regs. 807 * registers to be accessed by indexing vcpu->arch.regs.
754 */ 808 */
755static void vcpu_load_rsp_rip(struct kvm_vcpu *vcpu) 809static void vcpu_load_rsp_rip(struct kvm_vcpu *vcpu)
756{ 810{
757 vcpu->regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP); 811 vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP);
758 vcpu->rip = vmcs_readl(GUEST_RIP); 812 vcpu->arch.rip = vmcs_readl(GUEST_RIP);
759} 813}
760 814
761/* 815/*
@@ -764,8 +818,8 @@ static void vcpu_load_rsp_rip(struct kvm_vcpu *vcpu)
764 */ 818 */
765static void vcpu_put_rsp_rip(struct kvm_vcpu *vcpu) 819static void vcpu_put_rsp_rip(struct kvm_vcpu *vcpu)
766{ 820{
767 vmcs_writel(GUEST_RSP, vcpu->regs[VCPU_REGS_RSP]); 821 vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
768 vmcs_writel(GUEST_RIP, vcpu->rip); 822 vmcs_writel(GUEST_RIP, vcpu->arch.rip);
769} 823}
770 824
771static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg) 825static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg)
@@ -808,14 +862,15 @@ static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg)
808 862
809static int vmx_get_irq(struct kvm_vcpu *vcpu) 863static int vmx_get_irq(struct kvm_vcpu *vcpu)
810{ 864{
865 struct vcpu_vmx *vmx = to_vmx(vcpu);
811 u32 idtv_info_field; 866 u32 idtv_info_field;
812 867
813 idtv_info_field = vmcs_read32(IDT_VECTORING_INFO_FIELD); 868 idtv_info_field = vmx->idt_vectoring_info;
814 if (idtv_info_field & INTR_INFO_VALID_MASK) { 869 if (idtv_info_field & INTR_INFO_VALID_MASK) {
815 if (is_external_interrupt(idtv_info_field)) 870 if (is_external_interrupt(idtv_info_field))
816 return idtv_info_field & VECTORING_INFO_VECTOR_MASK; 871 return idtv_info_field & VECTORING_INFO_VECTOR_MASK;
817 else 872 else
818 printk("pending exception: not handled yet\n"); 873 printk(KERN_DEBUG "pending exception: not handled yet\n");
819 } 874 }
820 return -1; 875 return -1;
821} 876}
@@ -863,7 +918,7 @@ static void hardware_disable(void *garbage)
863} 918}
864 919
865static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, 920static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
866 u32 msr, u32* result) 921 u32 msr, u32 *result)
867{ 922{
868 u32 vmx_msr_low, vmx_msr_high; 923 u32 vmx_msr_low, vmx_msr_high;
869 u32 ctl = ctl_min | ctl_opt; 924 u32 ctl = ctl_min | ctl_opt;
@@ -887,6 +942,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
887 u32 min, opt; 942 u32 min, opt;
888 u32 _pin_based_exec_control = 0; 943 u32 _pin_based_exec_control = 0;
889 u32 _cpu_based_exec_control = 0; 944 u32 _cpu_based_exec_control = 0;
945 u32 _cpu_based_2nd_exec_control = 0;
890 u32 _vmexit_control = 0; 946 u32 _vmexit_control = 0;
891 u32 _vmentry_control = 0; 947 u32 _vmentry_control = 0;
892 948
@@ -904,11 +960,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
904 CPU_BASED_USE_IO_BITMAPS | 960 CPU_BASED_USE_IO_BITMAPS |
905 CPU_BASED_MOV_DR_EXITING | 961 CPU_BASED_MOV_DR_EXITING |
906 CPU_BASED_USE_TSC_OFFSETING; 962 CPU_BASED_USE_TSC_OFFSETING;
907#ifdef CONFIG_X86_64 963 opt = CPU_BASED_TPR_SHADOW |
908 opt = CPU_BASED_TPR_SHADOW; 964 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
909#else
910 opt = 0;
911#endif
912 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS, 965 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS,
913 &_cpu_based_exec_control) < 0) 966 &_cpu_based_exec_control) < 0)
914 return -EIO; 967 return -EIO;
@@ -917,6 +970,19 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
917 _cpu_based_exec_control &= ~CPU_BASED_CR8_LOAD_EXITING & 970 _cpu_based_exec_control &= ~CPU_BASED_CR8_LOAD_EXITING &
918 ~CPU_BASED_CR8_STORE_EXITING; 971 ~CPU_BASED_CR8_STORE_EXITING;
919#endif 972#endif
973 if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) {
974 min = 0;
975 opt = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
976 SECONDARY_EXEC_WBINVD_EXITING;
977 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS2,
978 &_cpu_based_2nd_exec_control) < 0)
979 return -EIO;
980 }
981#ifndef CONFIG_X86_64
982 if (!(_cpu_based_2nd_exec_control &
983 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
984 _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW;
985#endif
920 986
921 min = 0; 987 min = 0;
922#ifdef CONFIG_X86_64 988#ifdef CONFIG_X86_64
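
setup_vmcs_config() above treats each control word as a required part plus an optional part: required bits must be supported or setup fails with -EIO, while optional bits, such as CPU_BASED_ACTIVATE_SECONDARY_CONTROLS and the new secondary controls, are silently dropped when the hardware lacks them. The sketch below models only that negotiation against a flat capability mask; it ignores the real VMX MSR encoding, and every name in it is invented.

#include <stdint.h>
#include <stdio.h>

/* Keep the required bits or fail; keep optional bits only if supported. */
static int negotiate_controls(uint32_t required, uint32_t optional,
			      uint32_t supported, uint32_t *result)
{
	uint32_t ctl = (required | optional) & supported;

	if (required & ~ctl)
		return -1;	/* a mandatory feature is missing */
	*result = ctl;
	return 0;
}

int main(void)
{
	uint32_t ctl;

	/* bit 0x1 required, bits 0x2|0x4 optional, hardware offers 0x3 */
	if (negotiate_controls(0x1, 0x6, 0x3, &ctl) == 0)
		printf("controls: 0x%x\n", (unsigned)ctl);	/* prints 0x3 */
	return 0;
}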
@@ -954,6 +1020,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
954 1020
955 vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control; 1021 vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control;
956 vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control; 1022 vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control;
1023 vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control;
957 vmcs_conf->vmexit_ctrl = _vmexit_control; 1024 vmcs_conf->vmexit_ctrl = _vmexit_control;
958 vmcs_conf->vmentry_ctrl = _vmentry_control; 1025 vmcs_conf->vmentry_ctrl = _vmentry_control;
959 1026
@@ -1043,15 +1110,15 @@ static void enter_pmode(struct kvm_vcpu *vcpu)
1043{ 1110{
1044 unsigned long flags; 1111 unsigned long flags;
1045 1112
1046 vcpu->rmode.active = 0; 1113 vcpu->arch.rmode.active = 0;
1047 1114
1048 vmcs_writel(GUEST_TR_BASE, vcpu->rmode.tr.base); 1115 vmcs_writel(GUEST_TR_BASE, vcpu->arch.rmode.tr.base);
1049 vmcs_write32(GUEST_TR_LIMIT, vcpu->rmode.tr.limit); 1116 vmcs_write32(GUEST_TR_LIMIT, vcpu->arch.rmode.tr.limit);
1050 vmcs_write32(GUEST_TR_AR_BYTES, vcpu->rmode.tr.ar); 1117 vmcs_write32(GUEST_TR_AR_BYTES, vcpu->arch.rmode.tr.ar);
1051 1118
1052 flags = vmcs_readl(GUEST_RFLAGS); 1119 flags = vmcs_readl(GUEST_RFLAGS);
1053 flags &= ~(X86_EFLAGS_IOPL | X86_EFLAGS_VM); 1120 flags &= ~(X86_EFLAGS_IOPL | X86_EFLAGS_VM);
1054 flags |= (vcpu->rmode.save_iopl << IOPL_SHIFT); 1121 flags |= (vcpu->arch.rmode.save_iopl << IOPL_SHIFT);
1055 vmcs_writel(GUEST_RFLAGS, flags); 1122 vmcs_writel(GUEST_RFLAGS, flags);
1056 1123
1057 vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) | 1124 vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) |
@@ -1059,10 +1126,10 @@ static void enter_pmode(struct kvm_vcpu *vcpu)
1059 1126
1060 update_exception_bitmap(vcpu); 1127 update_exception_bitmap(vcpu);
1061 1128
1062 fix_pmode_dataseg(VCPU_SREG_ES, &vcpu->rmode.es); 1129 fix_pmode_dataseg(VCPU_SREG_ES, &vcpu->arch.rmode.es);
1063 fix_pmode_dataseg(VCPU_SREG_DS, &vcpu->rmode.ds); 1130 fix_pmode_dataseg(VCPU_SREG_DS, &vcpu->arch.rmode.ds);
1064 fix_pmode_dataseg(VCPU_SREG_GS, &vcpu->rmode.gs); 1131 fix_pmode_dataseg(VCPU_SREG_GS, &vcpu->arch.rmode.gs);
1065 fix_pmode_dataseg(VCPU_SREG_FS, &vcpu->rmode.fs); 1132 fix_pmode_dataseg(VCPU_SREG_FS, &vcpu->arch.rmode.fs);
1066 1133
1067 vmcs_write16(GUEST_SS_SELECTOR, 0); 1134 vmcs_write16(GUEST_SS_SELECTOR, 0);
1068 vmcs_write32(GUEST_SS_AR_BYTES, 0x93); 1135 vmcs_write32(GUEST_SS_AR_BYTES, 0x93);
@@ -1072,10 +1139,14 @@ static void enter_pmode(struct kvm_vcpu *vcpu)
1072 vmcs_write32(GUEST_CS_AR_BYTES, 0x9b); 1139 vmcs_write32(GUEST_CS_AR_BYTES, 0x9b);
1073} 1140}
1074 1141
1075static gva_t rmode_tss_base(struct kvm* kvm) 1142static gva_t rmode_tss_base(struct kvm *kvm)
1076{ 1143{
1077 gfn_t base_gfn = kvm->memslots[0].base_gfn + kvm->memslots[0].npages - 3; 1144 if (!kvm->arch.tss_addr) {
1078 return base_gfn << PAGE_SHIFT; 1145 gfn_t base_gfn = kvm->memslots[0].base_gfn +
1146 kvm->memslots[0].npages - 3;
1147 return base_gfn << PAGE_SHIFT;
1148 }
1149 return kvm->arch.tss_addr;
1079} 1150}
1080 1151
1081static void fix_rmode_seg(int seg, struct kvm_save_segment *save) 1152static void fix_rmode_seg(int seg, struct kvm_save_segment *save)
@@ -1086,7 +1157,8 @@ static void fix_rmode_seg(int seg, struct kvm_save_segment *save)
1086 save->base = vmcs_readl(sf->base); 1157 save->base = vmcs_readl(sf->base);
1087 save->limit = vmcs_read32(sf->limit); 1158 save->limit = vmcs_read32(sf->limit);
1088 save->ar = vmcs_read32(sf->ar_bytes); 1159 save->ar = vmcs_read32(sf->ar_bytes);
1089 vmcs_write16(sf->selector, vmcs_readl(sf->base) >> 4); 1160 vmcs_write16(sf->selector, save->base >> 4);
1161 vmcs_write32(sf->base, save->base & 0xfffff);
1090 vmcs_write32(sf->limit, 0xffff); 1162 vmcs_write32(sf->limit, 0xffff);
1091 vmcs_write32(sf->ar_bytes, 0xf3); 1163 vmcs_write32(sf->ar_bytes, 0xf3);
1092} 1164}
@@ -1095,19 +1167,20 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
1095{ 1167{
1096 unsigned long flags; 1168 unsigned long flags;
1097 1169
1098 vcpu->rmode.active = 1; 1170 vcpu->arch.rmode.active = 1;
1099 1171
1100 vcpu->rmode.tr.base = vmcs_readl(GUEST_TR_BASE); 1172 vcpu->arch.rmode.tr.base = vmcs_readl(GUEST_TR_BASE);
1101 vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm)); 1173 vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm));
1102 1174
1103 vcpu->rmode.tr.limit = vmcs_read32(GUEST_TR_LIMIT); 1175 vcpu->arch.rmode.tr.limit = vmcs_read32(GUEST_TR_LIMIT);
1104 vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1); 1176 vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1);
1105 1177
1106 vcpu->rmode.tr.ar = vmcs_read32(GUEST_TR_AR_BYTES); 1178 vcpu->arch.rmode.tr.ar = vmcs_read32(GUEST_TR_AR_BYTES);
1107 vmcs_write32(GUEST_TR_AR_BYTES, 0x008b); 1179 vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
1108 1180
1109 flags = vmcs_readl(GUEST_RFLAGS); 1181 flags = vmcs_readl(GUEST_RFLAGS);
1110 vcpu->rmode.save_iopl = (flags & X86_EFLAGS_IOPL) >> IOPL_SHIFT; 1182 vcpu->arch.rmode.save_iopl
1183 = (flags & X86_EFLAGS_IOPL) >> IOPL_SHIFT;
1111 1184
1112 flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; 1185 flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
1113 1186
@@ -1125,10 +1198,10 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
1125 vmcs_writel(GUEST_CS_BASE, 0xf0000); 1198 vmcs_writel(GUEST_CS_BASE, 0xf0000);
1126 vmcs_write16(GUEST_CS_SELECTOR, vmcs_readl(GUEST_CS_BASE) >> 4); 1199 vmcs_write16(GUEST_CS_SELECTOR, vmcs_readl(GUEST_CS_BASE) >> 4);
1127 1200
1128 fix_rmode_seg(VCPU_SREG_ES, &vcpu->rmode.es); 1201 fix_rmode_seg(VCPU_SREG_ES, &vcpu->arch.rmode.es);
1129 fix_rmode_seg(VCPU_SREG_DS, &vcpu->rmode.ds); 1202 fix_rmode_seg(VCPU_SREG_DS, &vcpu->arch.rmode.ds);
1130 fix_rmode_seg(VCPU_SREG_GS, &vcpu->rmode.gs); 1203 fix_rmode_seg(VCPU_SREG_GS, &vcpu->arch.rmode.gs);
1131 fix_rmode_seg(VCPU_SREG_FS, &vcpu->rmode.fs); 1204 fix_rmode_seg(VCPU_SREG_FS, &vcpu->arch.rmode.fs);
1132 1205
1133 kvm_mmu_reset_context(vcpu); 1206 kvm_mmu_reset_context(vcpu);
1134 init_rmode_tss(vcpu->kvm); 1207 init_rmode_tss(vcpu->kvm);
@@ -1149,7 +1222,7 @@ static void enter_lmode(struct kvm_vcpu *vcpu)
1149 | AR_TYPE_BUSY_64_TSS); 1222 | AR_TYPE_BUSY_64_TSS);
1150 } 1223 }
1151 1224
1152 vcpu->shadow_efer |= EFER_LMA; 1225 vcpu->arch.shadow_efer |= EFER_LMA;
1153 1226
1154 find_msr_entry(to_vmx(vcpu), MSR_EFER)->data |= EFER_LMA | EFER_LME; 1227 find_msr_entry(to_vmx(vcpu), MSR_EFER)->data |= EFER_LMA | EFER_LME;
1155 vmcs_write32(VM_ENTRY_CONTROLS, 1228 vmcs_write32(VM_ENTRY_CONTROLS,
@@ -1159,7 +1232,7 @@ static void enter_lmode(struct kvm_vcpu *vcpu)
1159 1232
1160static void exit_lmode(struct kvm_vcpu *vcpu) 1233static void exit_lmode(struct kvm_vcpu *vcpu)
1161{ 1234{
1162 vcpu->shadow_efer &= ~EFER_LMA; 1235 vcpu->arch.shadow_efer &= ~EFER_LMA;
1163 1236
1164 vmcs_write32(VM_ENTRY_CONTROLS, 1237 vmcs_write32(VM_ENTRY_CONTROLS,
1165 vmcs_read32(VM_ENTRY_CONTROLS) 1238 vmcs_read32(VM_ENTRY_CONTROLS)
@@ -1170,22 +1243,22 @@ static void exit_lmode(struct kvm_vcpu *vcpu)
1170 1243
1171static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) 1244static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
1172{ 1245{
1173 vcpu->cr4 &= KVM_GUEST_CR4_MASK; 1246 vcpu->arch.cr4 &= KVM_GUEST_CR4_MASK;
1174 vcpu->cr4 |= vmcs_readl(GUEST_CR4) & ~KVM_GUEST_CR4_MASK; 1247 vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & ~KVM_GUEST_CR4_MASK;
1175} 1248}
1176 1249
1177static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) 1250static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
1178{ 1251{
1179 vmx_fpu_deactivate(vcpu); 1252 vmx_fpu_deactivate(vcpu);
1180 1253
1181 if (vcpu->rmode.active && (cr0 & X86_CR0_PE)) 1254 if (vcpu->arch.rmode.active && (cr0 & X86_CR0_PE))
1182 enter_pmode(vcpu); 1255 enter_pmode(vcpu);
1183 1256
1184 if (!vcpu->rmode.active && !(cr0 & X86_CR0_PE)) 1257 if (!vcpu->arch.rmode.active && !(cr0 & X86_CR0_PE))
1185 enter_rmode(vcpu); 1258 enter_rmode(vcpu);
1186 1259
1187#ifdef CONFIG_X86_64 1260#ifdef CONFIG_X86_64
1188 if (vcpu->shadow_efer & EFER_LME) { 1261 if (vcpu->arch.shadow_efer & EFER_LME) {
1189 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) 1262 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG))
1190 enter_lmode(vcpu); 1263 enter_lmode(vcpu);
1191 if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) 1264 if (is_paging(vcpu) && !(cr0 & X86_CR0_PG))
@@ -1196,7 +1269,7 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
1196 vmcs_writel(CR0_READ_SHADOW, cr0); 1269 vmcs_writel(CR0_READ_SHADOW, cr0);
1197 vmcs_writel(GUEST_CR0, 1270 vmcs_writel(GUEST_CR0,
1198 (cr0 & ~KVM_GUEST_CR0_MASK) | KVM_VM_CR0_ALWAYS_ON); 1271 (cr0 & ~KVM_GUEST_CR0_MASK) | KVM_VM_CR0_ALWAYS_ON);
1199 vcpu->cr0 = cr0; 1272 vcpu->arch.cr0 = cr0;
1200 1273
1201 if (!(cr0 & X86_CR0_TS) || !(cr0 & X86_CR0_PE)) 1274 if (!(cr0 & X86_CR0_TS) || !(cr0 & X86_CR0_PE))
1202 vmx_fpu_activate(vcpu); 1275 vmx_fpu_activate(vcpu);
@@ -1205,16 +1278,16 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
1205static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) 1278static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
1206{ 1279{
1207 vmcs_writel(GUEST_CR3, cr3); 1280 vmcs_writel(GUEST_CR3, cr3);
1208 if (vcpu->cr0 & X86_CR0_PE) 1281 if (vcpu->arch.cr0 & X86_CR0_PE)
1209 vmx_fpu_deactivate(vcpu); 1282 vmx_fpu_deactivate(vcpu);
1210} 1283}
1211 1284
1212static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) 1285static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
1213{ 1286{
1214 vmcs_writel(CR4_READ_SHADOW, cr4); 1287 vmcs_writel(CR4_READ_SHADOW, cr4);
1215 vmcs_writel(GUEST_CR4, cr4 | (vcpu->rmode.active ? 1288 vmcs_writel(GUEST_CR4, cr4 | (vcpu->arch.rmode.active ?
1216 KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON)); 1289 KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON));
1217 vcpu->cr4 = cr4; 1290 vcpu->arch.cr4 = cr4;
1218} 1291}
1219 1292
1220#ifdef CONFIG_X86_64 1293#ifdef CONFIG_X86_64
@@ -1224,7 +1297,7 @@ static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
1224 struct vcpu_vmx *vmx = to_vmx(vcpu); 1297 struct vcpu_vmx *vmx = to_vmx(vcpu);
1225 struct kvm_msr_entry *msr = find_msr_entry(vmx, MSR_EFER); 1298 struct kvm_msr_entry *msr = find_msr_entry(vmx, MSR_EFER);
1226 1299
1227 vcpu->shadow_efer = efer; 1300 vcpu->arch.shadow_efer = efer;
1228 if (efer & EFER_LMA) { 1301 if (efer & EFER_LMA) {
1229 vmcs_write32(VM_ENTRY_CONTROLS, 1302 vmcs_write32(VM_ENTRY_CONTROLS,
1230 vmcs_read32(VM_ENTRY_CONTROLS) | 1303 vmcs_read32(VM_ENTRY_CONTROLS) |
@@ -1301,17 +1374,17 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu,
1301 struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; 1374 struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
1302 u32 ar; 1375 u32 ar;
1303 1376
1304 if (vcpu->rmode.active && seg == VCPU_SREG_TR) { 1377 if (vcpu->arch.rmode.active && seg == VCPU_SREG_TR) {
1305 vcpu->rmode.tr.selector = var->selector; 1378 vcpu->arch.rmode.tr.selector = var->selector;
1306 vcpu->rmode.tr.base = var->base; 1379 vcpu->arch.rmode.tr.base = var->base;
1307 vcpu->rmode.tr.limit = var->limit; 1380 vcpu->arch.rmode.tr.limit = var->limit;
1308 vcpu->rmode.tr.ar = vmx_segment_access_rights(var); 1381 vcpu->arch.rmode.tr.ar = vmx_segment_access_rights(var);
1309 return; 1382 return;
1310 } 1383 }
1311 vmcs_writel(sf->base, var->base); 1384 vmcs_writel(sf->base, var->base);
1312 vmcs_write32(sf->limit, var->limit); 1385 vmcs_write32(sf->limit, var->limit);
1313 vmcs_write16(sf->selector, var->selector); 1386 vmcs_write16(sf->selector, var->selector);
1314 if (vcpu->rmode.active && var->s) { 1387 if (vcpu->arch.rmode.active && var->s) {
1315 /* 1388 /*
1316 * Hack real-mode segments into vm86 compatibility. 1389 * Hack real-mode segments into vm86 compatibility.
1317 */ 1390 */
@@ -1355,36 +1428,38 @@ static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
1355 vmcs_writel(GUEST_GDTR_BASE, dt->base); 1428 vmcs_writel(GUEST_GDTR_BASE, dt->base);
1356} 1429}
1357 1430
1358static int init_rmode_tss(struct kvm* kvm) 1431static int init_rmode_tss(struct kvm *kvm)
1359{ 1432{
1360 struct page *p1, *p2, *p3;
1361 gfn_t fn = rmode_tss_base(kvm) >> PAGE_SHIFT; 1433 gfn_t fn = rmode_tss_base(kvm) >> PAGE_SHIFT;
1362 char *page; 1434 u16 data = 0;
1363 1435 int ret = 0;
1364 p1 = gfn_to_page(kvm, fn++); 1436 int r;
1365 p2 = gfn_to_page(kvm, fn++);
1366 p3 = gfn_to_page(kvm, fn);
1367
1368 if (!p1 || !p2 || !p3) {
1369 kvm_printf(kvm,"%s: gfn_to_page failed\n", __FUNCTION__);
1370 return 0;
1371 }
1372
1373 page = kmap_atomic(p1, KM_USER0);
1374 clear_page(page);
1375 *(u16*)(page + 0x66) = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE;
1376 kunmap_atomic(page, KM_USER0);
1377
1378 page = kmap_atomic(p2, KM_USER0);
1379 clear_page(page);
1380 kunmap_atomic(page, KM_USER0);
1381 1437
1382 page = kmap_atomic(p3, KM_USER0); 1438 down_read(&current->mm->mmap_sem);
1383 clear_page(page); 1439 r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
1384 *(page + RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1) = ~0; 1440 if (r < 0)
1385 kunmap_atomic(page, KM_USER0); 1441 goto out;
1442 data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE;
1443 r = kvm_write_guest_page(kvm, fn++, &data, 0x66, sizeof(u16));
1444 if (r < 0)
1445 goto out;
1446 r = kvm_clear_guest_page(kvm, fn++, 0, PAGE_SIZE);
1447 if (r < 0)
1448 goto out;
1449 r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
1450 if (r < 0)
1451 goto out;
1452 data = ~0;
1453 r = kvm_write_guest_page(kvm, fn, &data,
1454 RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1,
1455 sizeof(u8));
1456 if (r < 0)
1457 goto out;
1386 1458
1387 return 1; 1459 ret = 1;
1460out:
1461 up_read(&current->mm->mmap_sem);
1462 return ret;
1388} 1463}
1389 1464
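The rewritten init_rmode_tss() above replaces the gfn_to_page()/kmap_atomic() juggling with kvm_clear_guest_page() and kvm_write_guest_page() calls made under mmap_sem. The two non-zero writes follow the 32-bit TSS layout: the 16-bit I/O-map base field sits at offset 0x66 and is pointed past the vm86 interrupt redirection bitmap, and the final byte written as ~0 is the terminating 0xff expected after the I/O permission bitmap. A rough sketch of the sizes involved; the macro values below are assumptions for orientation only, the real definitions live elsewhere in vmx.c and are not part of this hunk:

/* Assumed layout, for orientation only -- not copied from this patch. */
#define TSS_BASE_SIZE		0x68		/* 32-bit TSS proper	     */
#define TSS_REDIRECTION_SIZE	(256 / 8)	/* vm86 int redirection map  */
#define TSS_IOPB_SIZE		(65536 / 8)	/* I/O permission bitmap     */
#define RMODE_TSS_SIZE		(TSS_BASE_SIZE + TSS_REDIRECTION_SIZE + \
				 TSS_IOPB_SIZE + 1)	/* +1: 0xff terminator */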
1390static void seg_setup(int seg) 1465static void seg_setup(int seg)
@@ -1397,6 +1472,27 @@ static void seg_setup(int seg)
1397 vmcs_write32(sf->ar_bytes, 0x93); 1472 vmcs_write32(sf->ar_bytes, 0x93);
1398} 1473}
1399 1474
1475static int alloc_apic_access_page(struct kvm *kvm)
1476{
1477 struct kvm_userspace_memory_region kvm_userspace_mem;
1478 int r = 0;
1479
1480 down_write(&current->mm->mmap_sem);
1481 if (kvm->arch.apic_access_page)
1482 goto out;
1483 kvm_userspace_mem.slot = APIC_ACCESS_PAGE_PRIVATE_MEMSLOT;
1484 kvm_userspace_mem.flags = 0;
1485 kvm_userspace_mem.guest_phys_addr = 0xfee00000ULL;
1486 kvm_userspace_mem.memory_size = PAGE_SIZE;
1487 r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, 0);
1488 if (r)
1489 goto out;
1490 kvm->arch.apic_access_page = gfn_to_page(kvm, 0xfee00);
1491out:
1492 up_write(&current->mm->mmap_sem);
1493 return r;
1494}
1495
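alloc_apic_access_page() pins a backing page for the local-APIC MMIO window by registering a one-page private memslot at guest physical 0xfee00000 and then resolving its frame with gfn_to_page(kvm, 0xfee00). The gfn is simply the guest physical address shifted right by the page size; a trivial standalone check of that arithmetic (PAGE_SHIFT of 12 assumed, as on x86):

/* Stand-alone check of the gpa -> gfn arithmetic used above. */
#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT 12	/* 4 KiB pages */

int main(void)
{
	uint64_t apic_base = 0xfee00000ULL;

	/* prints fee00, matching the gfn_to_page(kvm, 0xfee00) call */
	printf("%llx\n", (unsigned long long)(apic_base >> PAGE_SHIFT));
	return 0;
}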
1400/* 1496/*
1401 * Sets up the vmcs for emulated real mode. 1497 * Sets up the vmcs for emulated real mode.
1402 */ 1498 */
@@ -1407,92 +1503,15 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
1407 unsigned long a; 1503 unsigned long a;
1408 struct descriptor_table dt; 1504 struct descriptor_table dt;
1409 int i; 1505 int i;
1410 int ret = 0;
1411 unsigned long kvm_vmx_return; 1506 unsigned long kvm_vmx_return;
1412 u64 msr;
1413 u32 exec_control; 1507 u32 exec_control;
1414 1508
1415 if (!init_rmode_tss(vmx->vcpu.kvm)) {
1416 ret = -ENOMEM;
1417 goto out;
1418 }
1419
1420 vmx->vcpu.rmode.active = 0;
1421
1422 vmx->vcpu.regs[VCPU_REGS_RDX] = get_rdx_init_val();
1423 set_cr8(&vmx->vcpu, 0);
1424 msr = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
1425 if (vmx->vcpu.vcpu_id == 0)
1426 msr |= MSR_IA32_APICBASE_BSP;
1427 kvm_set_apic_base(&vmx->vcpu, msr);
1428
1429 fx_init(&vmx->vcpu);
1430
1431 /*
1432 * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode
1433 * insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4. Sigh.
1434 */
1435 if (vmx->vcpu.vcpu_id == 0) {
1436 vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
1437 vmcs_writel(GUEST_CS_BASE, 0x000f0000);
1438 } else {
1439 vmcs_write16(GUEST_CS_SELECTOR, vmx->vcpu.sipi_vector << 8);
1440 vmcs_writel(GUEST_CS_BASE, vmx->vcpu.sipi_vector << 12);
1441 }
1442 vmcs_write32(GUEST_CS_LIMIT, 0xffff);
1443 vmcs_write32(GUEST_CS_AR_BYTES, 0x9b);
1444
1445 seg_setup(VCPU_SREG_DS);
1446 seg_setup(VCPU_SREG_ES);
1447 seg_setup(VCPU_SREG_FS);
1448 seg_setup(VCPU_SREG_GS);
1449 seg_setup(VCPU_SREG_SS);
1450
1451 vmcs_write16(GUEST_TR_SELECTOR, 0);
1452 vmcs_writel(GUEST_TR_BASE, 0);
1453 vmcs_write32(GUEST_TR_LIMIT, 0xffff);
1454 vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
1455
1456 vmcs_write16(GUEST_LDTR_SELECTOR, 0);
1457 vmcs_writel(GUEST_LDTR_BASE, 0);
1458 vmcs_write32(GUEST_LDTR_LIMIT, 0xffff);
1459 vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082);
1460
1461 vmcs_write32(GUEST_SYSENTER_CS, 0);
1462 vmcs_writel(GUEST_SYSENTER_ESP, 0);
1463 vmcs_writel(GUEST_SYSENTER_EIP, 0);
1464
1465 vmcs_writel(GUEST_RFLAGS, 0x02);
1466 if (vmx->vcpu.vcpu_id == 0)
1467 vmcs_writel(GUEST_RIP, 0xfff0);
1468 else
1469 vmcs_writel(GUEST_RIP, 0);
1470 vmcs_writel(GUEST_RSP, 0);
1471
1472 //todo: dr0 = dr1 = dr2 = dr3 = 0; dr6 = 0xffff0ff0
1473 vmcs_writel(GUEST_DR7, 0x400);
1474
1475 vmcs_writel(GUEST_GDTR_BASE, 0);
1476 vmcs_write32(GUEST_GDTR_LIMIT, 0xffff);
1477
1478 vmcs_writel(GUEST_IDTR_BASE, 0);
1479 vmcs_write32(GUEST_IDTR_LIMIT, 0xffff);
1480
1481 vmcs_write32(GUEST_ACTIVITY_STATE, 0);
1482 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
1483 vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0);
1484
1485 /* I/O */ 1509 /* I/O */
1486 vmcs_write64(IO_BITMAP_A, page_to_phys(vmx_io_bitmap_a)); 1510 vmcs_write64(IO_BITMAP_A, page_to_phys(vmx_io_bitmap_a));
1487 vmcs_write64(IO_BITMAP_B, page_to_phys(vmx_io_bitmap_b)); 1511 vmcs_write64(IO_BITMAP_B, page_to_phys(vmx_io_bitmap_b));
1488 1512
1489 guest_write_tsc(0);
1490
1491 vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */ 1513 vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */
1492 1514
1493 /* Special registers */
1494 vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
1495
1496 /* Control */ 1515 /* Control */
1497 vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, 1516 vmcs_write32(PIN_BASED_VM_EXEC_CONTROL,
1498 vmcs_config.pin_based_exec_ctrl); 1517 vmcs_config.pin_based_exec_ctrl);
@@ -1507,8 +1526,16 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
1507 } 1526 }
1508 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control); 1527 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control);
1509 1528
1510 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0); 1529 if (cpu_has_secondary_exec_ctrls()) {
1511 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0); 1530 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl;
1531 if (!vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))
1532 exec_control &=
1533 ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
1534 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
1535 }
1536
1537 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, !!bypass_guest_pf);
1538 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, !!bypass_guest_pf);
1512 vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */ 1539 vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */
1513 1540
1514 vmcs_writel(HOST_CR0, read_cr0()); /* 22.2.3 */ 1541 vmcs_writel(HOST_CR0, read_cr0()); /* 22.2.3 */
@@ -1536,7 +1563,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
1536 get_idt(&dt); 1563 get_idt(&dt);
1537 vmcs_writel(HOST_IDTR_BASE, dt.base); /* 22.2.4 */ 1564 vmcs_writel(HOST_IDTR_BASE, dt.base); /* 22.2.4 */
1538 1565
1539 asm ("mov $.Lkvm_vmx_return, %0" : "=r"(kvm_vmx_return)); 1566 asm("mov $.Lkvm_vmx_return, %0" : "=r"(kvm_vmx_return));
1540 vmcs_writel(HOST_RIP, kvm_vmx_return); /* 22.2.5 */ 1567 vmcs_writel(HOST_RIP, kvm_vmx_return); /* 22.2.5 */
1541 vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0); 1568 vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
1542 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0); 1569 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
@@ -1567,97 +1594,145 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
1567 ++vmx->nmsrs; 1594 ++vmx->nmsrs;
1568 } 1595 }
1569 1596
1570 setup_msrs(vmx);
1571
1572 vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl); 1597 vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl);
1573 1598
1574 /* 22.2.1, 20.8.1 */ 1599 /* 22.2.1, 20.8.1 */
1575 vmcs_write32(VM_ENTRY_CONTROLS, vmcs_config.vmentry_ctrl); 1600 vmcs_write32(VM_ENTRY_CONTROLS, vmcs_config.vmentry_ctrl);
1576 1601
1577 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); /* 22.2.1 */
1578
1579#ifdef CONFIG_X86_64
1580 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
1581 if (vm_need_tpr_shadow(vmx->vcpu.kvm))
1582 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
1583 page_to_phys(vmx->vcpu.apic->regs_page));
1584 vmcs_write32(TPR_THRESHOLD, 0);
1585#endif
1586
1587 vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL); 1602 vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL);
1588 vmcs_writel(CR4_GUEST_HOST_MASK, KVM_GUEST_CR4_MASK); 1603 vmcs_writel(CR4_GUEST_HOST_MASK, KVM_GUEST_CR4_MASK);
1589 1604
1590 vmx->vcpu.cr0 = 0x60000010; 1605 if (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))
1591 vmx_set_cr0(&vmx->vcpu, vmx->vcpu.cr0); // enter rmode 1606 if (alloc_apic_access_page(vmx->vcpu.kvm) != 0)
1592 vmx_set_cr4(&vmx->vcpu, 0); 1607 return -ENOMEM;
1593#ifdef CONFIG_X86_64
1594 vmx_set_efer(&vmx->vcpu, 0);
1595#endif
1596 vmx_fpu_activate(&vmx->vcpu);
1597 update_exception_bitmap(&vmx->vcpu);
1598 1608
1599 return 0; 1609 return 0;
1600
1601out:
1602 return ret;
1603} 1610}
1604 1611
1605static void vmx_vcpu_reset(struct kvm_vcpu *vcpu) 1612static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
1606{ 1613{
1607 struct vcpu_vmx *vmx = to_vmx(vcpu); 1614 struct vcpu_vmx *vmx = to_vmx(vcpu);
1615 u64 msr;
1616 int ret;
1608 1617
1609 vmx_vcpu_setup(vmx); 1618 if (!init_rmode_tss(vmx->vcpu.kvm)) {
1610} 1619 ret = -ENOMEM;
1611 1620 goto out;
1612static void inject_rmode_irq(struct kvm_vcpu *vcpu, int irq)
1613{
1614 u16 ent[2];
1615 u16 cs;
1616 u16 ip;
1617 unsigned long flags;
1618 unsigned long ss_base = vmcs_readl(GUEST_SS_BASE);
1619 u16 sp = vmcs_readl(GUEST_RSP);
1620 u32 ss_limit = vmcs_read32(GUEST_SS_LIMIT);
1621
1622 if (sp > ss_limit || sp < 6 ) {
1623 vcpu_printf(vcpu, "%s: #SS, rsp 0x%lx ss 0x%lx limit 0x%x\n",
1624 __FUNCTION__,
1625 vmcs_readl(GUEST_RSP),
1626 vmcs_readl(GUEST_SS_BASE),
1627 vmcs_read32(GUEST_SS_LIMIT));
1628 return;
1629 } 1621 }
1630 1622
1631 if (emulator_read_std(irq * sizeof(ent), &ent, sizeof(ent), vcpu) != 1623 vmx->vcpu.arch.rmode.active = 0;
1632 X86EMUL_CONTINUE) { 1624
1633 vcpu_printf(vcpu, "%s: read guest err\n", __FUNCTION__); 1625 vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val();
1634 return; 1626 set_cr8(&vmx->vcpu, 0);
1627 msr = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
1628 if (vmx->vcpu.vcpu_id == 0)
1629 msr |= MSR_IA32_APICBASE_BSP;
1630 kvm_set_apic_base(&vmx->vcpu, msr);
1631
1632 fx_init(&vmx->vcpu);
1633
1634 /*
1635 * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode
1636 * insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4. Sigh.
1637 */
1638 if (vmx->vcpu.vcpu_id == 0) {
1639 vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
1640 vmcs_writel(GUEST_CS_BASE, 0x000f0000);
1641 } else {
1642 vmcs_write16(GUEST_CS_SELECTOR, vmx->vcpu.arch.sipi_vector << 8);
1643 vmcs_writel(GUEST_CS_BASE, vmx->vcpu.arch.sipi_vector << 12);
1635 } 1644 }
1645 vmcs_write32(GUEST_CS_LIMIT, 0xffff);
1646 vmcs_write32(GUEST_CS_AR_BYTES, 0x9b);
1647
1648 seg_setup(VCPU_SREG_DS);
1649 seg_setup(VCPU_SREG_ES);
1650 seg_setup(VCPU_SREG_FS);
1651 seg_setup(VCPU_SREG_GS);
1652 seg_setup(VCPU_SREG_SS);
1653
1654 vmcs_write16(GUEST_TR_SELECTOR, 0);
1655 vmcs_writel(GUEST_TR_BASE, 0);
1656 vmcs_write32(GUEST_TR_LIMIT, 0xffff);
1657 vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
1636 1658
1637 flags = vmcs_readl(GUEST_RFLAGS); 1659 vmcs_write16(GUEST_LDTR_SELECTOR, 0);
1638 cs = vmcs_readl(GUEST_CS_BASE) >> 4; 1660 vmcs_writel(GUEST_LDTR_BASE, 0);
1639 ip = vmcs_readl(GUEST_RIP); 1661 vmcs_write32(GUEST_LDTR_LIMIT, 0xffff);
1662 vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082);
1640 1663
1664 vmcs_write32(GUEST_SYSENTER_CS, 0);
1665 vmcs_writel(GUEST_SYSENTER_ESP, 0);
1666 vmcs_writel(GUEST_SYSENTER_EIP, 0);
1641 1667
1642 if (emulator_write_emulated(ss_base + sp - 2, &flags, 2, vcpu) != X86EMUL_CONTINUE || 1668 vmcs_writel(GUEST_RFLAGS, 0x02);
1643 emulator_write_emulated(ss_base + sp - 4, &cs, 2, vcpu) != X86EMUL_CONTINUE || 1669 if (vmx->vcpu.vcpu_id == 0)
1644 emulator_write_emulated(ss_base + sp - 6, &ip, 2, vcpu) != X86EMUL_CONTINUE) { 1670 vmcs_writel(GUEST_RIP, 0xfff0);
1645 vcpu_printf(vcpu, "%s: write guest err\n", __FUNCTION__); 1671 else
1646 return; 1672 vmcs_writel(GUEST_RIP, 0);
1673 vmcs_writel(GUEST_RSP, 0);
1674
1675 /* todo: dr0 = dr1 = dr2 = dr3 = 0; dr6 = 0xffff0ff0 */
1676 vmcs_writel(GUEST_DR7, 0x400);
1677
1678 vmcs_writel(GUEST_GDTR_BASE, 0);
1679 vmcs_write32(GUEST_GDTR_LIMIT, 0xffff);
1680
1681 vmcs_writel(GUEST_IDTR_BASE, 0);
1682 vmcs_write32(GUEST_IDTR_LIMIT, 0xffff);
1683
1684 vmcs_write32(GUEST_ACTIVITY_STATE, 0);
1685 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
1686 vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0);
1687
1688 guest_write_tsc(0);
1689
1690 /* Special registers */
1691 vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
1692
1693 setup_msrs(vmx);
1694
1695 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); /* 22.2.1 */
1696
1697 if (cpu_has_vmx_tpr_shadow()) {
1698 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
1699 if (vm_need_tpr_shadow(vmx->vcpu.kvm))
1700 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
1701 page_to_phys(vmx->vcpu.arch.apic->regs_page));
1702 vmcs_write32(TPR_THRESHOLD, 0);
1647 } 1703 }
1648 1704
1649 vmcs_writel(GUEST_RFLAGS, flags & 1705 if (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))
1650 ~( X86_EFLAGS_IF | X86_EFLAGS_AC | X86_EFLAGS_TF)); 1706 vmcs_write64(APIC_ACCESS_ADDR,
1651 vmcs_write16(GUEST_CS_SELECTOR, ent[1]) ; 1707 page_to_phys(vmx->vcpu.kvm->arch.apic_access_page));
1652 vmcs_writel(GUEST_CS_BASE, ent[1] << 4); 1708
1653 vmcs_writel(GUEST_RIP, ent[0]); 1709 vmx->vcpu.arch.cr0 = 0x60000010;
1654 vmcs_writel(GUEST_RSP, (vmcs_readl(GUEST_RSP) & ~0xffff) | (sp - 6)); 1710 vmx_set_cr0(&vmx->vcpu, vmx->vcpu.arch.cr0); /* enter rmode */
1711 vmx_set_cr4(&vmx->vcpu, 0);
1712#ifdef CONFIG_X86_64
1713 vmx_set_efer(&vmx->vcpu, 0);
1714#endif
1715 vmx_fpu_activate(&vmx->vcpu);
1716 update_exception_bitmap(&vmx->vcpu);
1717
1718 return 0;
1719
1720out:
1721 return ret;
1655} 1722}
1656 1723
1657static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq) 1724static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq)
1658{ 1725{
1659 if (vcpu->rmode.active) { 1726 struct vcpu_vmx *vmx = to_vmx(vcpu);
1660 inject_rmode_irq(vcpu, irq); 1727
1728 if (vcpu->arch.rmode.active) {
1729 vmx->rmode.irq.pending = true;
1730 vmx->rmode.irq.vector = irq;
1731 vmx->rmode.irq.rip = vmcs_readl(GUEST_RIP);
1732 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
1733 irq | INTR_TYPE_SOFT_INTR | INTR_INFO_VALID_MASK);
1734 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1);
1735 vmcs_writel(GUEST_RIP, vmx->rmode.irq.rip - 1);
1661 return; 1736 return;
1662 } 1737 }
1663 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 1738 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
@@ -1666,13 +1741,13 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq)
1666 1741
1667static void kvm_do_inject_irq(struct kvm_vcpu *vcpu) 1742static void kvm_do_inject_irq(struct kvm_vcpu *vcpu)
1668{ 1743{
1669 int word_index = __ffs(vcpu->irq_summary); 1744 int word_index = __ffs(vcpu->arch.irq_summary);
1670 int bit_index = __ffs(vcpu->irq_pending[word_index]); 1745 int bit_index = __ffs(vcpu->arch.irq_pending[word_index]);
1671 int irq = word_index * BITS_PER_LONG + bit_index; 1746 int irq = word_index * BITS_PER_LONG + bit_index;
1672 1747
1673 clear_bit(bit_index, &vcpu->irq_pending[word_index]); 1748 clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]);
1674 if (!vcpu->irq_pending[word_index]) 1749 if (!vcpu->arch.irq_pending[word_index])
1675 clear_bit(word_index, &vcpu->irq_summary); 1750 clear_bit(word_index, &vcpu->arch.irq_summary);
1676 vmx_inject_irq(vcpu, irq); 1751 vmx_inject_irq(vcpu, irq);
1677} 1752}
1678 1753
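In the rewritten real-mode path above, vmx_inject_irq() no longer pushes flags/CS/IP by hand; it records the pending interrupt in vmx->rmode.irq and programs VM_ENTRY_INTR_INFO_FIELD so the processor delivers it as a software interrupt, reporting a one-byte instruction length and backing GUEST_RIP up by one to match. The interruption-information field packs the vector in bits 7:0, the delivery type in bits 10:8 and a valid bit in bit 31; a small sketch of that composition (the bit position of INTR_INFO_VALID_MASK is an assumption here, consistent with the rest of vmx.h):

/* Sketch of the interruption-information encoding written above;
 * INTR_INFO_VALID_MASK's bit position is an assumption. */
#include <stdint.h>

#define INTR_TYPE_SOFT_INTR	(4u << 8)	/* added by this patch */
#define INTR_INFO_VALID_MASK	(1u << 31)	/* assumed: bit 31     */

static uint32_t soft_intr_info(uint8_t vector)
{
	/* vector | type | valid -- the value vmx_inject_irq() writes
	 * to VM_ENTRY_INTR_INFO_FIELD in the real-mode branch. */
	return vector | INTR_TYPE_SOFT_INTR | INTR_INFO_VALID_MASK;
}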
@@ -1682,12 +1757,12 @@ static void do_interrupt_requests(struct kvm_vcpu *vcpu,
1682{ 1757{
1683 u32 cpu_based_vm_exec_control; 1758 u32 cpu_based_vm_exec_control;
1684 1759
1685 vcpu->interrupt_window_open = 1760 vcpu->arch.interrupt_window_open =
1686 ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) && 1761 ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
1687 (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0); 1762 (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0);
1688 1763
1689 if (vcpu->interrupt_window_open && 1764 if (vcpu->arch.interrupt_window_open &&
1690 vcpu->irq_summary && 1765 vcpu->arch.irq_summary &&
1691 !(vmcs_read32(VM_ENTRY_INTR_INFO_FIELD) & INTR_INFO_VALID_MASK)) 1766 !(vmcs_read32(VM_ENTRY_INTR_INFO_FIELD) & INTR_INFO_VALID_MASK))
1692 /* 1767 /*
1693 * If interrupts enabled, and not blocked by sti or mov ss. Good. 1768 * If interrupts enabled, and not blocked by sti or mov ss. Good.
@@ -1695,8 +1770,8 @@ static void do_interrupt_requests(struct kvm_vcpu *vcpu,
1695 kvm_do_inject_irq(vcpu); 1770 kvm_do_inject_irq(vcpu);
1696 1771
1697 cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); 1772 cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
1698 if (!vcpu->interrupt_window_open && 1773 if (!vcpu->arch.interrupt_window_open &&
1699 (vcpu->irq_summary || kvm_run->request_interrupt_window)) 1774 (vcpu->arch.irq_summary || kvm_run->request_interrupt_window))
1700 /* 1775 /*
1701 * Interrupts blocked. Wait for unblock. 1776 * Interrupts blocked. Wait for unblock.
1702 */ 1777 */
@@ -1706,6 +1781,23 @@ static void do_interrupt_requests(struct kvm_vcpu *vcpu,
1706 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); 1781 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
1707} 1782}
1708 1783
1784static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
1785{
1786 int ret;
1787 struct kvm_userspace_memory_region tss_mem = {
1788 .slot = 8,
1789 .guest_phys_addr = addr,
1790 .memory_size = PAGE_SIZE * 3,
1791 .flags = 0,
1792 };
1793
1794 ret = kvm_set_memory_region(kvm, &tss_mem, 0);
1795 if (ret)
1796 return ret;
1797 kvm->arch.tss_addr = addr;
1798 return 0;
1799}
1800
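vmx_set_tss_addr() above backs the three pages the real-mode TSS hack needs with a fixed private memslot (slot 8) and remembers the address in kvm->arch.tss_addr. On the userspace side this is normally reached through the KVM_SET_TSS_ADDR vm ioctl; a minimal sketch of that call, with error handling omitted and an address chosen by common convention just below 4 GiB rather than anything this patch mandates:

/* Userspace sketch; assumptions as noted above. A real VMM would check
 * every return value before proceeding. */
#include <fcntl.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

int give_vm_a_tss(void)
{
	int kvm = open("/dev/kvm", O_RDWR);
	int vm  = ioctl(kvm, KVM_CREATE_VM, 0);

	/* three pages starting here become the vm86 TSS backing store */
	return ioctl(vm, KVM_SET_TSS_ADDR, 0xfffbd000);
}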
1709static void kvm_guest_debug_pre(struct kvm_vcpu *vcpu) 1801static void kvm_guest_debug_pre(struct kvm_vcpu *vcpu)
1710{ 1802{
1711 struct kvm_guest_debug *dbg = &vcpu->guest_debug; 1803 struct kvm_guest_debug *dbg = &vcpu->guest_debug;
@@ -1727,7 +1819,7 @@ static void kvm_guest_debug_pre(struct kvm_vcpu *vcpu)
1727static int handle_rmode_exception(struct kvm_vcpu *vcpu, 1819static int handle_rmode_exception(struct kvm_vcpu *vcpu,
1728 int vec, u32 err_code) 1820 int vec, u32 err_code)
1729{ 1821{
1730 if (!vcpu->rmode.active) 1822 if (!vcpu->arch.rmode.active)
1731 return 0; 1823 return 0;
1732 1824
1733 /* 1825 /*
@@ -1735,32 +1827,31 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu,
1735 * Cause the #SS fault with 0 error code in VM86 mode. 1827 * Cause the #SS fault with 0 error code in VM86 mode.
1736 */ 1828 */
1737 if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) 1829 if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0)
1738 if (emulate_instruction(vcpu, NULL, 0, 0) == EMULATE_DONE) 1830 if (emulate_instruction(vcpu, NULL, 0, 0, 0) == EMULATE_DONE)
1739 return 1; 1831 return 1;
1740 return 0; 1832 return 0;
1741} 1833}
1742 1834
1743static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 1835static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1744{ 1836{
1837 struct vcpu_vmx *vmx = to_vmx(vcpu);
1745 u32 intr_info, error_code; 1838 u32 intr_info, error_code;
1746 unsigned long cr2, rip; 1839 unsigned long cr2, rip;
1747 u32 vect_info; 1840 u32 vect_info;
1748 enum emulation_result er; 1841 enum emulation_result er;
1749 int r;
1750 1842
1751 vect_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); 1843 vect_info = vmx->idt_vectoring_info;
1752 intr_info = vmcs_read32(VM_EXIT_INTR_INFO); 1844 intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
1753 1845
1754 if ((vect_info & VECTORING_INFO_VALID_MASK) && 1846 if ((vect_info & VECTORING_INFO_VALID_MASK) &&
1755 !is_page_fault(intr_info)) { 1847 !is_page_fault(intr_info))
1756 printk(KERN_ERR "%s: unexpected, vectoring info 0x%x " 1848 printk(KERN_ERR "%s: unexpected, vectoring info 0x%x "
1757 "intr info 0x%x\n", __FUNCTION__, vect_info, intr_info); 1849 "intr info 0x%x\n", __FUNCTION__, vect_info, intr_info);
1758 }
1759 1850
1760 if (!irqchip_in_kernel(vcpu->kvm) && is_external_interrupt(vect_info)) { 1851 if (!irqchip_in_kernel(vcpu->kvm) && is_external_interrupt(vect_info)) {
1761 int irq = vect_info & VECTORING_INFO_VECTOR_MASK; 1852 int irq = vect_info & VECTORING_INFO_VECTOR_MASK;
1762 set_bit(irq, vcpu->irq_pending); 1853 set_bit(irq, vcpu->arch.irq_pending);
1763 set_bit(irq / BITS_PER_LONG, &vcpu->irq_summary); 1854 set_bit(irq / BITS_PER_LONG, &vcpu->arch.irq_summary);
1764 } 1855 }
1765 1856
1766 if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200) /* nmi */ 1857 if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200) /* nmi */
@@ -1771,52 +1862,34 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1771 return 1; 1862 return 1;
1772 } 1863 }
1773 1864
1865 if (is_invalid_opcode(intr_info)) {
1866 er = emulate_instruction(vcpu, kvm_run, 0, 0, EMULTYPE_TRAP_UD);
1867 if (er != EMULATE_DONE)
1868 kvm_queue_exception(vcpu, UD_VECTOR);
1869 return 1;
1870 }
1871
1774 error_code = 0; 1872 error_code = 0;
1775 rip = vmcs_readl(GUEST_RIP); 1873 rip = vmcs_readl(GUEST_RIP);
1776 if (intr_info & INTR_INFO_DELIEVER_CODE_MASK) 1874 if (intr_info & INTR_INFO_DELIEVER_CODE_MASK)
1777 error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); 1875 error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
1778 if (is_page_fault(intr_info)) { 1876 if (is_page_fault(intr_info)) {
1779 cr2 = vmcs_readl(EXIT_QUALIFICATION); 1877 cr2 = vmcs_readl(EXIT_QUALIFICATION);
1780 1878 return kvm_mmu_page_fault(vcpu, cr2, error_code);
1781 mutex_lock(&vcpu->kvm->lock);
1782 r = kvm_mmu_page_fault(vcpu, cr2, error_code);
1783 if (r < 0) {
1784 mutex_unlock(&vcpu->kvm->lock);
1785 return r;
1786 }
1787 if (!r) {
1788 mutex_unlock(&vcpu->kvm->lock);
1789 return 1;
1790 }
1791
1792 er = emulate_instruction(vcpu, kvm_run, cr2, error_code);
1793 mutex_unlock(&vcpu->kvm->lock);
1794
1795 switch (er) {
1796 case EMULATE_DONE:
1797 return 1;
1798 case EMULATE_DO_MMIO:
1799 ++vcpu->stat.mmio_exits;
1800 return 0;
1801 case EMULATE_FAIL:
1802 kvm_report_emulation_failure(vcpu, "pagetable");
1803 break;
1804 default:
1805 BUG();
1806 }
1807 } 1879 }
1808 1880
1809 if (vcpu->rmode.active && 1881 if (vcpu->arch.rmode.active &&
1810 handle_rmode_exception(vcpu, intr_info & INTR_INFO_VECTOR_MASK, 1882 handle_rmode_exception(vcpu, intr_info & INTR_INFO_VECTOR_MASK,
1811 error_code)) { 1883 error_code)) {
1812 if (vcpu->halt_request) { 1884 if (vcpu->arch.halt_request) {
1813 vcpu->halt_request = 0; 1885 vcpu->arch.halt_request = 0;
1814 return kvm_emulate_halt(vcpu); 1886 return kvm_emulate_halt(vcpu);
1815 } 1887 }
1816 return 1; 1888 return 1;
1817 } 1889 }
1818 1890
1819 if ((intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK)) == (INTR_TYPE_EXCEPTION | 1)) { 1891 if ((intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK)) ==
1892 (INTR_TYPE_EXCEPTION | 1)) {
1820 kvm_run->exit_reason = KVM_EXIT_DEBUG; 1893 kvm_run->exit_reason = KVM_EXIT_DEBUG;
1821 return 0; 1894 return 0;
1822 } 1895 }
@@ -1850,7 +1923,8 @@ static int handle_io(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1850 string = (exit_qualification & 16) != 0; 1923 string = (exit_qualification & 16) != 0;
1851 1924
1852 if (string) { 1925 if (string) {
1853 if (emulate_instruction(vcpu, kvm_run, 0, 0) == EMULATE_DO_MMIO) 1926 if (emulate_instruction(vcpu,
1927 kvm_run, 0, 0, 0) == EMULATE_DO_MMIO)
1854 return 0; 1928 return 0;
1855 return 1; 1929 return 1;
1856 } 1930 }
@@ -1873,7 +1947,6 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
1873 hypercall[0] = 0x0f; 1947 hypercall[0] = 0x0f;
1874 hypercall[1] = 0x01; 1948 hypercall[1] = 0x01;
1875 hypercall[2] = 0xc1; 1949 hypercall[2] = 0xc1;
1876 hypercall[3] = 0xc3;
1877} 1950}
1878 1951
1879static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 1952static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
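vmx_patch_hypercall() now emits only three bytes: 0x0f 0x01 0xc1 encodes the VMCALL instruction, and the dropped fourth byte (0xc3) was a RET that is no longer appended. For reference, the emitted sequence as a plain byte array:

/* The instruction bytes the patch routine emits after this change. */
static const unsigned char vmcall_bytes[3] = { 0x0f, 0x01, 0xc1 }; /* vmcall */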
@@ -1890,23 +1963,25 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1890 switch (cr) { 1963 switch (cr) {
1891 case 0: 1964 case 0:
1892 vcpu_load_rsp_rip(vcpu); 1965 vcpu_load_rsp_rip(vcpu);
1893 set_cr0(vcpu, vcpu->regs[reg]); 1966 set_cr0(vcpu, vcpu->arch.regs[reg]);
1894 skip_emulated_instruction(vcpu); 1967 skip_emulated_instruction(vcpu);
1895 return 1; 1968 return 1;
1896 case 3: 1969 case 3:
1897 vcpu_load_rsp_rip(vcpu); 1970 vcpu_load_rsp_rip(vcpu);
1898 set_cr3(vcpu, vcpu->regs[reg]); 1971 set_cr3(vcpu, vcpu->arch.regs[reg]);
1899 skip_emulated_instruction(vcpu); 1972 skip_emulated_instruction(vcpu);
1900 return 1; 1973 return 1;
1901 case 4: 1974 case 4:
1902 vcpu_load_rsp_rip(vcpu); 1975 vcpu_load_rsp_rip(vcpu);
1903 set_cr4(vcpu, vcpu->regs[reg]); 1976 set_cr4(vcpu, vcpu->arch.regs[reg]);
1904 skip_emulated_instruction(vcpu); 1977 skip_emulated_instruction(vcpu);
1905 return 1; 1978 return 1;
1906 case 8: 1979 case 8:
1907 vcpu_load_rsp_rip(vcpu); 1980 vcpu_load_rsp_rip(vcpu);
1908 set_cr8(vcpu, vcpu->regs[reg]); 1981 set_cr8(vcpu, vcpu->arch.regs[reg]);
1909 skip_emulated_instruction(vcpu); 1982 skip_emulated_instruction(vcpu);
1983 if (irqchip_in_kernel(vcpu->kvm))
1984 return 1;
1910 kvm_run->exit_reason = KVM_EXIT_SET_TPR; 1985 kvm_run->exit_reason = KVM_EXIT_SET_TPR;
1911 return 0; 1986 return 0;
1912 }; 1987 };
@@ -1914,8 +1989,8 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1914 case 2: /* clts */ 1989 case 2: /* clts */
1915 vcpu_load_rsp_rip(vcpu); 1990 vcpu_load_rsp_rip(vcpu);
1916 vmx_fpu_deactivate(vcpu); 1991 vmx_fpu_deactivate(vcpu);
1917 vcpu->cr0 &= ~X86_CR0_TS; 1992 vcpu->arch.cr0 &= ~X86_CR0_TS;
1918 vmcs_writel(CR0_READ_SHADOW, vcpu->cr0); 1993 vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0);
1919 vmx_fpu_activate(vcpu); 1994 vmx_fpu_activate(vcpu);
1920 skip_emulated_instruction(vcpu); 1995 skip_emulated_instruction(vcpu);
1921 return 1; 1996 return 1;
@@ -1923,13 +1998,13 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1923 switch (cr) { 1998 switch (cr) {
1924 case 3: 1999 case 3:
1925 vcpu_load_rsp_rip(vcpu); 2000 vcpu_load_rsp_rip(vcpu);
1926 vcpu->regs[reg] = vcpu->cr3; 2001 vcpu->arch.regs[reg] = vcpu->arch.cr3;
1927 vcpu_put_rsp_rip(vcpu); 2002 vcpu_put_rsp_rip(vcpu);
1928 skip_emulated_instruction(vcpu); 2003 skip_emulated_instruction(vcpu);
1929 return 1; 2004 return 1;
1930 case 8: 2005 case 8:
1931 vcpu_load_rsp_rip(vcpu); 2006 vcpu_load_rsp_rip(vcpu);
1932 vcpu->regs[reg] = get_cr8(vcpu); 2007 vcpu->arch.regs[reg] = get_cr8(vcpu);
1933 vcpu_put_rsp_rip(vcpu); 2008 vcpu_put_rsp_rip(vcpu);
1934 skip_emulated_instruction(vcpu); 2009 skip_emulated_instruction(vcpu);
1935 return 1; 2010 return 1;
@@ -1975,7 +2050,7 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1975 default: 2050 default:
1976 val = 0; 2051 val = 0;
1977 } 2052 }
1978 vcpu->regs[reg] = val; 2053 vcpu->arch.regs[reg] = val;
1979 } else { 2054 } else {
1980 /* mov to dr */ 2055 /* mov to dr */
1981 } 2056 }
@@ -1992,29 +2067,29 @@ static int handle_cpuid(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1992 2067
1993static int handle_rdmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 2068static int handle_rdmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1994{ 2069{
1995 u32 ecx = vcpu->regs[VCPU_REGS_RCX]; 2070 u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX];
1996 u64 data; 2071 u64 data;
1997 2072
1998 if (vmx_get_msr(vcpu, ecx, &data)) { 2073 if (vmx_get_msr(vcpu, ecx, &data)) {
1999 vmx_inject_gp(vcpu, 0); 2074 kvm_inject_gp(vcpu, 0);
2000 return 1; 2075 return 1;
2001 } 2076 }
2002 2077
2003 /* FIXME: handling of bits 32:63 of rax, rdx */ 2078 /* FIXME: handling of bits 32:63 of rax, rdx */
2004 vcpu->regs[VCPU_REGS_RAX] = data & -1u; 2079 vcpu->arch.regs[VCPU_REGS_RAX] = data & -1u;
2005 vcpu->regs[VCPU_REGS_RDX] = (data >> 32) & -1u; 2080 vcpu->arch.regs[VCPU_REGS_RDX] = (data >> 32) & -1u;
2006 skip_emulated_instruction(vcpu); 2081 skip_emulated_instruction(vcpu);
2007 return 1; 2082 return 1;
2008} 2083}
2009 2084
2010static int handle_wrmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 2085static int handle_wrmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2011{ 2086{
2012 u32 ecx = vcpu->regs[VCPU_REGS_RCX]; 2087 u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX];
2013 u64 data = (vcpu->regs[VCPU_REGS_RAX] & -1u) 2088 u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u)
2014 | ((u64)(vcpu->regs[VCPU_REGS_RDX] & -1u) << 32); 2089 | ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32);
2015 2090
2016 if (vmx_set_msr(vcpu, ecx, data) != 0) { 2091 if (vmx_set_msr(vcpu, ecx, data) != 0) {
2017 vmx_inject_gp(vcpu, 0); 2092 kvm_inject_gp(vcpu, 0);
2018 return 1; 2093 return 1;
2019 } 2094 }
2020 2095
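handle_rdmsr() and handle_wrmsr() keep the architectural convention that a 64-bit MSR value travels split across EDX:EAX, which is why the write path reassembles the value with a shift and the read path splits it back (the FIXME about the upper halves of RAX/RDX notwithstanding). The arithmetic in isolation, as a trivial sketch:

/* Sketch of the EDX:EAX <-> 64-bit split used by the MSR handlers. */
#include <stdint.h>

static uint64_t msr_from_edx_eax(uint32_t eax, uint32_t edx)
{
	return (uint64_t)eax | ((uint64_t)edx << 32);
}

static void msr_to_edx_eax(uint64_t data, uint32_t *eax, uint32_t *edx)
{
	*eax = (uint32_t)data;		/* low  32 bits */
	*edx = (uint32_t)(data >> 32);	/* high 32 bits */
}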
@@ -2042,7 +2117,7 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu,
2042 * possible 2117 * possible
2043 */ 2118 */
2044 if (kvm_run->request_interrupt_window && 2119 if (kvm_run->request_interrupt_window &&
2045 !vcpu->irq_summary) { 2120 !vcpu->arch.irq_summary) {
2046 kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; 2121 kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
2047 ++vcpu->stat.irq_window_exits; 2122 ++vcpu->stat.irq_window_exits;
2048 return 0; 2123 return 0;
@@ -2059,7 +2134,35 @@ static int handle_halt(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2059static int handle_vmcall(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 2134static int handle_vmcall(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2060{ 2135{
2061 skip_emulated_instruction(vcpu); 2136 skip_emulated_instruction(vcpu);
2062 return kvm_hypercall(vcpu, kvm_run); 2137 kvm_emulate_hypercall(vcpu);
2138 return 1;
2139}
2140
2141static int handle_wbinvd(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2142{
2143 skip_emulated_instruction(vcpu);
2144 /* TODO: Add support for VT-d/pass-through device */
2145 return 1;
2146}
2147
2148static int handle_apic_access(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2149{
2150 u64 exit_qualification;
2151 enum emulation_result er;
2152 unsigned long offset;
2153
2154 exit_qualification = vmcs_read64(EXIT_QUALIFICATION);
2155 offset = exit_qualification & 0xffful;
2156
2157 er = emulate_instruction(vcpu, kvm_run, 0, 0, 0);
2158
2159 if (er != EMULATE_DONE) {
2160 printk(KERN_ERR
2161 "Fail to handle apic access vmexit! Offset is 0x%lx\n",
2162 offset);
2163 return -ENOTSUPP;
2164 }
2165 return 1;
2063} 2166}
2064 2167
2065/* 2168/*
@@ -2081,7 +2184,9 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu,
2081 [EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window, 2184 [EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window,
2082 [EXIT_REASON_HLT] = handle_halt, 2185 [EXIT_REASON_HLT] = handle_halt,
2083 [EXIT_REASON_VMCALL] = handle_vmcall, 2186 [EXIT_REASON_VMCALL] = handle_vmcall,
2084 [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold 2187 [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold,
2188 [EXIT_REASON_APIC_ACCESS] = handle_apic_access,
2189 [EXIT_REASON_WBINVD] = handle_wbinvd,
2085}; 2190};
2086 2191
2087static const int kvm_vmx_max_exit_handlers = 2192static const int kvm_vmx_max_exit_handlers =
@@ -2093,9 +2198,9 @@ static const int kvm_vmx_max_exit_handlers =
2093 */ 2198 */
2094static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) 2199static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
2095{ 2200{
2096 u32 vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
2097 u32 exit_reason = vmcs_read32(VM_EXIT_REASON); 2201 u32 exit_reason = vmcs_read32(VM_EXIT_REASON);
2098 struct vcpu_vmx *vmx = to_vmx(vcpu); 2202 struct vcpu_vmx *vmx = to_vmx(vcpu);
2203 u32 vectoring_info = vmx->idt_vectoring_info;
2099 2204
2100 if (unlikely(vmx->fail)) { 2205 if (unlikely(vmx->fail)) {
2101 kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; 2206 kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
@@ -2104,8 +2209,8 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
2104 return 0; 2209 return 0;
2105 } 2210 }
2106 2211
2107 if ( (vectoring_info & VECTORING_INFO_VALID_MASK) && 2212 if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
2108 exit_reason != EXIT_REASON_EXCEPTION_NMI ) 2213 exit_reason != EXIT_REASON_EXCEPTION_NMI)
2109 printk(KERN_WARNING "%s: unexpected, valid vectoring info and " 2214 printk(KERN_WARNING "%s: unexpected, valid vectoring info and "
2110 "exit reason is 0x%x\n", __FUNCTION__, exit_reason); 2215 "exit reason is 0x%x\n", __FUNCTION__, exit_reason);
2111 if (exit_reason < kvm_vmx_max_exit_handlers 2216 if (exit_reason < kvm_vmx_max_exit_handlers
@@ -2150,26 +2255,38 @@ static void enable_irq_window(struct kvm_vcpu *vcpu)
2150 2255
2151static void vmx_intr_assist(struct kvm_vcpu *vcpu) 2256static void vmx_intr_assist(struct kvm_vcpu *vcpu)
2152{ 2257{
2258 struct vcpu_vmx *vmx = to_vmx(vcpu);
2153 u32 idtv_info_field, intr_info_field; 2259 u32 idtv_info_field, intr_info_field;
2154 int has_ext_irq, interrupt_window_open; 2260 int has_ext_irq, interrupt_window_open;
2155 int vector; 2261 int vector;
2156 2262
2157 kvm_inject_pending_timer_irqs(vcpu);
2158 update_tpr_threshold(vcpu); 2263 update_tpr_threshold(vcpu);
2159 2264
2160 has_ext_irq = kvm_cpu_has_interrupt(vcpu); 2265 has_ext_irq = kvm_cpu_has_interrupt(vcpu);
2161 intr_info_field = vmcs_read32(VM_ENTRY_INTR_INFO_FIELD); 2266 intr_info_field = vmcs_read32(VM_ENTRY_INTR_INFO_FIELD);
2162 idtv_info_field = vmcs_read32(IDT_VECTORING_INFO_FIELD); 2267 idtv_info_field = vmx->idt_vectoring_info;
2163 if (intr_info_field & INTR_INFO_VALID_MASK) { 2268 if (intr_info_field & INTR_INFO_VALID_MASK) {
2164 if (idtv_info_field & INTR_INFO_VALID_MASK) { 2269 if (idtv_info_field & INTR_INFO_VALID_MASK) {
2165 /* TODO: fault when IDT_Vectoring */ 2270 /* TODO: fault when IDT_Vectoring */
2166 printk(KERN_ERR "Fault when IDT_Vectoring\n"); 2271 if (printk_ratelimit())
2272 printk(KERN_ERR "Fault when IDT_Vectoring\n");
2167 } 2273 }
2168 if (has_ext_irq) 2274 if (has_ext_irq)
2169 enable_irq_window(vcpu); 2275 enable_irq_window(vcpu);
2170 return; 2276 return;
2171 } 2277 }
2172 if (unlikely(idtv_info_field & INTR_INFO_VALID_MASK)) { 2278 if (unlikely(idtv_info_field & INTR_INFO_VALID_MASK)) {
2279 if ((idtv_info_field & VECTORING_INFO_TYPE_MASK)
2280 == INTR_TYPE_EXT_INTR
2281 && vcpu->arch.rmode.active) {
2282 u8 vect = idtv_info_field & VECTORING_INFO_VECTOR_MASK;
2283
2284 vmx_inject_irq(vcpu, vect);
2285 if (unlikely(has_ext_irq))
2286 enable_irq_window(vcpu);
2287 return;
2288 }
2289
2173 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, idtv_info_field); 2290 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, idtv_info_field);
2174 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 2291 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
2175 vmcs_read32(VM_EXIT_INSTRUCTION_LEN)); 2292 vmcs_read32(VM_EXIT_INSTRUCTION_LEN));
@@ -2194,6 +2311,29 @@ static void vmx_intr_assist(struct kvm_vcpu *vcpu)
2194 enable_irq_window(vcpu); 2311 enable_irq_window(vcpu);
2195} 2312}
2196 2313
2314/*
2315 * Failure to inject an interrupt should give us the information
2316 * in IDT_VECTORING_INFO_FIELD. However, if the failure occurs
2317 * when fetching the interrupt redirection bitmap in the real-mode
2318 * tss, this doesn't happen. So we do it ourselves.
2319 */
2320static void fixup_rmode_irq(struct vcpu_vmx *vmx)
2321{
2322 vmx->rmode.irq.pending = 0;
2323 if (vmcs_readl(GUEST_RIP) + 1 != vmx->rmode.irq.rip)
2324 return;
2325 vmcs_writel(GUEST_RIP, vmx->rmode.irq.rip);
2326 if (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK) {
2327 vmx->idt_vectoring_info &= ~VECTORING_INFO_TYPE_MASK;
2328 vmx->idt_vectoring_info |= INTR_TYPE_EXT_INTR;
2329 return;
2330 }
2331 vmx->idt_vectoring_info =
2332 VECTORING_INFO_VALID_MASK
2333 | INTR_TYPE_EXT_INTR
2334 | vmx->rmode.irq.vector;
2335}
2336
2197static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 2337static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2198{ 2338{
2199 struct vcpu_vmx *vmx = to_vmx(vcpu); 2339 struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -2204,50 +2344,47 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2204 */ 2344 */
2205 vmcs_writel(HOST_CR0, read_cr0()); 2345 vmcs_writel(HOST_CR0, read_cr0());
2206 2346
2207 asm ( 2347 asm(
2208 /* Store host registers */ 2348 /* Store host registers */
2209#ifdef CONFIG_X86_64 2349#ifdef CONFIG_X86_64
2210 "push %%rax; push %%rbx; push %%rdx;" 2350 "push %%rdx; push %%rbp;"
2211 "push %%rsi; push %%rdi; push %%rbp;"
2212 "push %%r8; push %%r9; push %%r10; push %%r11;"
2213 "push %%r12; push %%r13; push %%r14; push %%r15;"
2214 "push %%rcx \n\t" 2351 "push %%rcx \n\t"
2215 ASM_VMX_VMWRITE_RSP_RDX "\n\t"
2216#else 2352#else
2217 "pusha; push %%ecx \n\t" 2353 "push %%edx; push %%ebp;"
2218 ASM_VMX_VMWRITE_RSP_RDX "\n\t" 2354 "push %%ecx \n\t"
2219#endif 2355#endif
2356 ASM_VMX_VMWRITE_RSP_RDX "\n\t"
2220 /* Check if vmlaunch of vmresume is needed */ 2357 /* Check if vmlaunch of vmresume is needed */
2221 "cmp $0, %1 \n\t" 2358 "cmpl $0, %c[launched](%0) \n\t"
2222 /* Load guest registers. Don't clobber flags. */ 2359 /* Load guest registers. Don't clobber flags. */
2223#ifdef CONFIG_X86_64 2360#ifdef CONFIG_X86_64
2224 "mov %c[cr2](%3), %%rax \n\t" 2361 "mov %c[cr2](%0), %%rax \n\t"
2225 "mov %%rax, %%cr2 \n\t" 2362 "mov %%rax, %%cr2 \n\t"
2226 "mov %c[rax](%3), %%rax \n\t" 2363 "mov %c[rax](%0), %%rax \n\t"
2227 "mov %c[rbx](%3), %%rbx \n\t" 2364 "mov %c[rbx](%0), %%rbx \n\t"
2228 "mov %c[rdx](%3), %%rdx \n\t" 2365 "mov %c[rdx](%0), %%rdx \n\t"
2229 "mov %c[rsi](%3), %%rsi \n\t" 2366 "mov %c[rsi](%0), %%rsi \n\t"
2230 "mov %c[rdi](%3), %%rdi \n\t" 2367 "mov %c[rdi](%0), %%rdi \n\t"
2231 "mov %c[rbp](%3), %%rbp \n\t" 2368 "mov %c[rbp](%0), %%rbp \n\t"
2232 "mov %c[r8](%3), %%r8 \n\t" 2369 "mov %c[r8](%0), %%r8 \n\t"
2233 "mov %c[r9](%3), %%r9 \n\t" 2370 "mov %c[r9](%0), %%r9 \n\t"
2234 "mov %c[r10](%3), %%r10 \n\t" 2371 "mov %c[r10](%0), %%r10 \n\t"
2235 "mov %c[r11](%3), %%r11 \n\t" 2372 "mov %c[r11](%0), %%r11 \n\t"
2236 "mov %c[r12](%3), %%r12 \n\t" 2373 "mov %c[r12](%0), %%r12 \n\t"
2237 "mov %c[r13](%3), %%r13 \n\t" 2374 "mov %c[r13](%0), %%r13 \n\t"
2238 "mov %c[r14](%3), %%r14 \n\t" 2375 "mov %c[r14](%0), %%r14 \n\t"
2239 "mov %c[r15](%3), %%r15 \n\t" 2376 "mov %c[r15](%0), %%r15 \n\t"
2240 "mov %c[rcx](%3), %%rcx \n\t" /* kills %3 (rcx) */ 2377 "mov %c[rcx](%0), %%rcx \n\t" /* kills %0 (rcx) */
2241#else 2378#else
2242 "mov %c[cr2](%3), %%eax \n\t" 2379 "mov %c[cr2](%0), %%eax \n\t"
2243 "mov %%eax, %%cr2 \n\t" 2380 "mov %%eax, %%cr2 \n\t"
2244 "mov %c[rax](%3), %%eax \n\t" 2381 "mov %c[rax](%0), %%eax \n\t"
2245 "mov %c[rbx](%3), %%ebx \n\t" 2382 "mov %c[rbx](%0), %%ebx \n\t"
2246 "mov %c[rdx](%3), %%edx \n\t" 2383 "mov %c[rdx](%0), %%edx \n\t"
2247 "mov %c[rsi](%3), %%esi \n\t" 2384 "mov %c[rsi](%0), %%esi \n\t"
2248 "mov %c[rdi](%3), %%edi \n\t" 2385 "mov %c[rdi](%0), %%edi \n\t"
2249 "mov %c[rbp](%3), %%ebp \n\t" 2386 "mov %c[rbp](%0), %%ebp \n\t"
2250 "mov %c[rcx](%3), %%ecx \n\t" /* kills %3 (ecx) */ 2387 "mov %c[rcx](%0), %%ecx \n\t" /* kills %0 (ecx) */
2251#endif 2388#endif
2252 /* Enter guest mode */ 2389 /* Enter guest mode */
2253 "jne .Llaunched \n\t" 2390 "jne .Llaunched \n\t"
@@ -2257,72 +2394,79 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2257 ".Lkvm_vmx_return: " 2394 ".Lkvm_vmx_return: "
2258 /* Save guest registers, load host registers, keep flags */ 2395 /* Save guest registers, load host registers, keep flags */
2259#ifdef CONFIG_X86_64 2396#ifdef CONFIG_X86_64
2260 "xchg %3, (%%rsp) \n\t" 2397 "xchg %0, (%%rsp) \n\t"
2261 "mov %%rax, %c[rax](%3) \n\t" 2398 "mov %%rax, %c[rax](%0) \n\t"
2262 "mov %%rbx, %c[rbx](%3) \n\t" 2399 "mov %%rbx, %c[rbx](%0) \n\t"
2263 "pushq (%%rsp); popq %c[rcx](%3) \n\t" 2400 "pushq (%%rsp); popq %c[rcx](%0) \n\t"
2264 "mov %%rdx, %c[rdx](%3) \n\t" 2401 "mov %%rdx, %c[rdx](%0) \n\t"
2265 "mov %%rsi, %c[rsi](%3) \n\t" 2402 "mov %%rsi, %c[rsi](%0) \n\t"
2266 "mov %%rdi, %c[rdi](%3) \n\t" 2403 "mov %%rdi, %c[rdi](%0) \n\t"
2267 "mov %%rbp, %c[rbp](%3) \n\t" 2404 "mov %%rbp, %c[rbp](%0) \n\t"
2268 "mov %%r8, %c[r8](%3) \n\t" 2405 "mov %%r8, %c[r8](%0) \n\t"
2269 "mov %%r9, %c[r9](%3) \n\t" 2406 "mov %%r9, %c[r9](%0) \n\t"
2270 "mov %%r10, %c[r10](%3) \n\t" 2407 "mov %%r10, %c[r10](%0) \n\t"
2271 "mov %%r11, %c[r11](%3) \n\t" 2408 "mov %%r11, %c[r11](%0) \n\t"
2272 "mov %%r12, %c[r12](%3) \n\t" 2409 "mov %%r12, %c[r12](%0) \n\t"
2273 "mov %%r13, %c[r13](%3) \n\t" 2410 "mov %%r13, %c[r13](%0) \n\t"
2274 "mov %%r14, %c[r14](%3) \n\t" 2411 "mov %%r14, %c[r14](%0) \n\t"
2275 "mov %%r15, %c[r15](%3) \n\t" 2412 "mov %%r15, %c[r15](%0) \n\t"
2276 "mov %%cr2, %%rax \n\t" 2413 "mov %%cr2, %%rax \n\t"
2277 "mov %%rax, %c[cr2](%3) \n\t" 2414 "mov %%rax, %c[cr2](%0) \n\t"
2278 "mov (%%rsp), %3 \n\t"
2279 2415
2280 "pop %%rcx; pop %%r15; pop %%r14; pop %%r13; pop %%r12;" 2416 "pop %%rbp; pop %%rbp; pop %%rdx \n\t"
2281 "pop %%r11; pop %%r10; pop %%r9; pop %%r8;"
2282 "pop %%rbp; pop %%rdi; pop %%rsi;"
2283 "pop %%rdx; pop %%rbx; pop %%rax \n\t"
2284#else 2417#else
2285 "xchg %3, (%%esp) \n\t" 2418 "xchg %0, (%%esp) \n\t"
2286 "mov %%eax, %c[rax](%3) \n\t" 2419 "mov %%eax, %c[rax](%0) \n\t"
2287 "mov %%ebx, %c[rbx](%3) \n\t" 2420 "mov %%ebx, %c[rbx](%0) \n\t"
2288 "pushl (%%esp); popl %c[rcx](%3) \n\t" 2421 "pushl (%%esp); popl %c[rcx](%0) \n\t"
2289 "mov %%edx, %c[rdx](%3) \n\t" 2422 "mov %%edx, %c[rdx](%0) \n\t"
2290 "mov %%esi, %c[rsi](%3) \n\t" 2423 "mov %%esi, %c[rsi](%0) \n\t"
2291 "mov %%edi, %c[rdi](%3) \n\t" 2424 "mov %%edi, %c[rdi](%0) \n\t"
2292 "mov %%ebp, %c[rbp](%3) \n\t" 2425 "mov %%ebp, %c[rbp](%0) \n\t"
2293 "mov %%cr2, %%eax \n\t" 2426 "mov %%cr2, %%eax \n\t"
2294 "mov %%eax, %c[cr2](%3) \n\t" 2427 "mov %%eax, %c[cr2](%0) \n\t"
2295 "mov (%%esp), %3 \n\t"
2296 2428
2297 "pop %%ecx; popa \n\t" 2429 "pop %%ebp; pop %%ebp; pop %%edx \n\t"
2430#endif
2431 "setbe %c[fail](%0) \n\t"
2432 : : "c"(vmx), "d"((unsigned long)HOST_RSP),
2433 [launched]"i"(offsetof(struct vcpu_vmx, launched)),
2434 [fail]"i"(offsetof(struct vcpu_vmx, fail)),
2435 [rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])),
2436 [rbx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBX])),
2437 [rcx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RCX])),
2438 [rdx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDX])),
2439 [rsi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RSI])),
2440 [rdi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDI])),
2441 [rbp]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBP])),
2442#ifdef CONFIG_X86_64
2443 [r8]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R8])),
2444 [r9]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R9])),
2445 [r10]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R10])),
2446 [r11]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R11])),
2447 [r12]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R12])),
2448 [r13]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R13])),
2449 [r14]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R14])),
2450 [r15]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R15])),
2298#endif 2451#endif
2299 "setbe %0 \n\t" 2452 [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2))
2300 : "=q" (vmx->fail) 2453 : "cc", "memory"
2301 : "r"(vmx->launched), "d"((unsigned long)HOST_RSP),
2302 "c"(vcpu),
2303 [rax]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RAX])),
2304 [rbx]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RBX])),
2305 [rcx]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RCX])),
2306 [rdx]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RDX])),
2307 [rsi]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RSI])),
2308 [rdi]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RDI])),
2309 [rbp]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RBP])),
2310#ifdef CONFIG_X86_64 2454#ifdef CONFIG_X86_64
2311 [r8 ]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R8 ])), 2455 , "rbx", "rdi", "rsi"
2312 [r9 ]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R9 ])), 2456 , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"
2313 [r10]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R10])), 2457#else
2314 [r11]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R11])), 2458 , "ebx", "edi", "rsi"
2315 [r12]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R12])),
2316 [r13]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R13])),
2317 [r14]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R14])),
2318 [r15]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R15])),
2319#endif 2459#endif
2320 [cr2]"i"(offsetof(struct kvm_vcpu, cr2)) 2460 );
2321 : "cc", "memory" ); 2461
2462 vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
2463 if (vmx->rmode.irq.pending)
2464 fixup_rmode_irq(vmx);
2322 2465
2323 vcpu->interrupt_window_open = (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0; 2466 vcpu->arch.interrupt_window_open =
2467 (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0;
2324 2468
2325 asm ("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS)); 2469 asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS));
2326 vmx->launched = 1; 2470 vmx->launched = 1;
2327 2471
2328 intr_info = vmcs_read32(VM_EXIT_INTR_INFO); 2472 intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
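The reworked vmx_vcpu_run() asm block above reaches every guest register through a single pointer operand ("c"(vmx)) plus compile-time offsets, which is what lets the old push/pop lists shrink to a proper clobber list. Each %c[name] operand is just an offsetof() constant folded into the addressing mode; a standalone sketch of the idiom, with made-up structure and field names:

/* Sketch of the offsetof-as-asm-operand idiom (x86, GCC extended asm);
 * struct and field names are stand-ins, not kvm structures. */
#include <stddef.h>

struct demo_regs {
	unsigned long ax;
	unsigned long bx;
};

static unsigned long demo_load_ax(struct demo_regs *r)
{
	unsigned long val;

	asm("mov %c[ax](%1), %0"	/* expands to: mov 0(reg), reg */
	    : "=r"(val)
	    : "r"(r), [ax] "i" (offsetof(struct demo_regs, ax)));
	return val;
}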
@@ -2332,36 +2476,6 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2332 asm("int $2"); 2476 asm("int $2");
2333} 2477}
2334 2478
2335static void vmx_inject_page_fault(struct kvm_vcpu *vcpu,
2336 unsigned long addr,
2337 u32 err_code)
2338{
2339 u32 vect_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
2340
2341 ++vcpu->stat.pf_guest;
2342
2343 if (is_page_fault(vect_info)) {
2344 printk(KERN_DEBUG "inject_page_fault: "
2345 "double fault 0x%lx @ 0x%lx\n",
2346 addr, vmcs_readl(GUEST_RIP));
2347 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, 0);
2348 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
2349 DF_VECTOR |
2350 INTR_TYPE_EXCEPTION |
2351 INTR_INFO_DELIEVER_CODE_MASK |
2352 INTR_INFO_VALID_MASK);
2353 return;
2354 }
2355 vcpu->cr2 = addr;
2356 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, err_code);
2357 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
2358 PF_VECTOR |
2359 INTR_TYPE_EXCEPTION |
2360 INTR_INFO_DELIEVER_CODE_MASK |
2361 INTR_INFO_VALID_MASK);
2362
2363}
2364
2365static void vmx_free_vmcs(struct kvm_vcpu *vcpu) 2479static void vmx_free_vmcs(struct kvm_vcpu *vcpu)
2366{ 2480{
2367 struct vcpu_vmx *vmx = to_vmx(vcpu); 2481 struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -2397,12 +2511,6 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
2397 if (err) 2511 if (err)
2398 goto free_vcpu; 2512 goto free_vcpu;
2399 2513
2400 if (irqchip_in_kernel(kvm)) {
2401 err = kvm_create_lapic(&vmx->vcpu);
2402 if (err < 0)
2403 goto free_vcpu;
2404 }
2405
2406 vmx->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL); 2514 vmx->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL);
2407 if (!vmx->guest_msrs) { 2515 if (!vmx->guest_msrs) {
2408 err = -ENOMEM; 2516 err = -ENOMEM;
@@ -2464,6 +2572,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
2464 .check_processor_compatibility = vmx_check_processor_compat, 2572 .check_processor_compatibility = vmx_check_processor_compat,
2465 .hardware_enable = hardware_enable, 2573 .hardware_enable = hardware_enable,
2466 .hardware_disable = hardware_disable, 2574 .hardware_disable = hardware_disable,
2575 .cpu_has_accelerated_tpr = cpu_has_vmx_virtualize_apic_accesses,
2467 2576
2468 .vcpu_create = vmx_create_vcpu, 2577 .vcpu_create = vmx_create_vcpu,
2469 .vcpu_free = vmx_free_vcpu, 2578 .vcpu_free = vmx_free_vcpu,
@@ -2499,9 +2608,6 @@ static struct kvm_x86_ops vmx_x86_ops = {
2499 .set_rflags = vmx_set_rflags, 2608 .set_rflags = vmx_set_rflags,
2500 2609
2501 .tlb_flush = vmx_flush_tlb, 2610 .tlb_flush = vmx_flush_tlb,
2502 .inject_page_fault = vmx_inject_page_fault,
2503
2504 .inject_gp = vmx_inject_gp,
2505 2611
2506 .run = vmx_vcpu_run, 2612 .run = vmx_vcpu_run,
2507 .handle_exit = kvm_handle_exit, 2613 .handle_exit = kvm_handle_exit,
@@ -2509,8 +2615,12 @@ static struct kvm_x86_ops vmx_x86_ops = {
2509 .patch_hypercall = vmx_patch_hypercall, 2615 .patch_hypercall = vmx_patch_hypercall,
2510 .get_irq = vmx_get_irq, 2616 .get_irq = vmx_get_irq,
2511 .set_irq = vmx_inject_irq, 2617 .set_irq = vmx_inject_irq,
2618 .queue_exception = vmx_queue_exception,
2619 .exception_injected = vmx_exception_injected,
2512 .inject_pending_irq = vmx_intr_assist, 2620 .inject_pending_irq = vmx_intr_assist,
2513 .inject_pending_vectors = do_interrupt_requests, 2621 .inject_pending_vectors = do_interrupt_requests,
2622
2623 .set_tss_addr = vmx_set_tss_addr,
2514}; 2624};
2515 2625
2516static int __init vmx_init(void) 2626static int __init vmx_init(void)
@@ -2541,10 +2651,13 @@ static int __init vmx_init(void)
2541 memset(iova, 0xff, PAGE_SIZE); 2651 memset(iova, 0xff, PAGE_SIZE);
2542 kunmap(vmx_io_bitmap_b); 2652 kunmap(vmx_io_bitmap_b);
2543 2653
2544 r = kvm_init_x86(&vmx_x86_ops, sizeof(struct vcpu_vmx), THIS_MODULE); 2654 r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx), THIS_MODULE);
2545 if (r) 2655 if (r)
2546 goto out1; 2656 goto out1;
2547 2657
2658 if (bypass_guest_pf)
2659 kvm_mmu_set_nonpresent_ptes(~0xffeull, 0ull);
2660
2548 return 0; 2661 return 0;
2549 2662
2550out1: 2663out1:
@@ -2559,7 +2672,7 @@ static void __exit vmx_exit(void)
2559 __free_page(vmx_io_bitmap_b); 2672 __free_page(vmx_io_bitmap_b);
2560 __free_page(vmx_io_bitmap_a); 2673 __free_page(vmx_io_bitmap_a);
2561 2674
2562 kvm_exit_x86(); 2675 kvm_exit();
2563} 2676}
2564 2677
2565module_init(vmx_init) 2678module_init(vmx_init)
diff --git a/drivers/kvm/vmx.h b/arch/x86/kvm/vmx.h
index fd4e14666088..d52ae8d7303d 100644
--- a/drivers/kvm/vmx.h
+++ b/arch/x86/kvm/vmx.h
@@ -25,6 +25,9 @@
25 * 25 *
26 */ 26 */
27 27
28/*
29 * Definitions of Primary Processor-Based VM-Execution Controls.
30 */
28#define CPU_BASED_VIRTUAL_INTR_PENDING 0x00000004 31#define CPU_BASED_VIRTUAL_INTR_PENDING 0x00000004
29#define CPU_BASED_USE_TSC_OFFSETING 0x00000008 32#define CPU_BASED_USE_TSC_OFFSETING 0x00000008
30#define CPU_BASED_HLT_EXITING 0x00000080 33#define CPU_BASED_HLT_EXITING 0x00000080
@@ -42,6 +45,12 @@
42#define CPU_BASED_MONITOR_EXITING 0x20000000 45#define CPU_BASED_MONITOR_EXITING 0x20000000
43#define CPU_BASED_PAUSE_EXITING 0x40000000 46#define CPU_BASED_PAUSE_EXITING 0x40000000
44#define CPU_BASED_ACTIVATE_SECONDARY_CONTROLS 0x80000000 47#define CPU_BASED_ACTIVATE_SECONDARY_CONTROLS 0x80000000
48/*
49 * Definitions of Secondary Processor-Based VM-Execution Controls.
50 */
51#define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001
52#define SECONDARY_EXEC_WBINVD_EXITING 0x00000040
53
45 54
46#define PIN_BASED_EXT_INTR_MASK 0x00000001 55#define PIN_BASED_EXT_INTR_MASK 0x00000001
47#define PIN_BASED_NMI_EXITING 0x00000008 56#define PIN_BASED_NMI_EXITING 0x00000008
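The two SECONDARY_EXEC_* bits added above only take effect when CPU_BASED_ACTIVATE_SECONDARY_CONTROLS is set in the primary processor-based controls, which is why the vmx.c side guards its SECONDARY_VM_EXEC_CONTROL write with cpu_has_secondary_exec_ctrls(). A small sketch of that dependency using the constants from this header:

/* Sketch: a secondary execution control is only meaningful when the
 * primary controls opt in to the secondary control field at all. */
#define CPU_BASED_ACTIVATE_SECONDARY_CONTROLS	0x80000000 /* from this header */
#define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES	0x00000001 /* added above */

static int apic_accesses_virtualized(unsigned int primary, unsigned int secondary)
{
	if (!(primary & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS))
		return 0;	/* secondary controls not consulted at all */
	return !!(secondary & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
}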
@@ -54,8 +63,6 @@
54#define VM_ENTRY_SMM 0x00000400 63#define VM_ENTRY_SMM 0x00000400
55#define VM_ENTRY_DEACT_DUAL_MONITOR 0x00000800 64#define VM_ENTRY_DEACT_DUAL_MONITOR 0x00000800
56 65
57#define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001
58
59/* VMCS Encodings */ 66/* VMCS Encodings */
60enum vmcs_field { 67enum vmcs_field {
61 GUEST_ES_SELECTOR = 0x00000800, 68 GUEST_ES_SELECTOR = 0x00000800,
@@ -89,6 +96,8 @@ enum vmcs_field {
89 TSC_OFFSET_HIGH = 0x00002011, 96 TSC_OFFSET_HIGH = 0x00002011,
90 VIRTUAL_APIC_PAGE_ADDR = 0x00002012, 97 VIRTUAL_APIC_PAGE_ADDR = 0x00002012,
91 VIRTUAL_APIC_PAGE_ADDR_HIGH = 0x00002013, 98 VIRTUAL_APIC_PAGE_ADDR_HIGH = 0x00002013,
99 APIC_ACCESS_ADDR = 0x00002014,
100 APIC_ACCESS_ADDR_HIGH = 0x00002015,
92 VMCS_LINK_POINTER = 0x00002800, 101 VMCS_LINK_POINTER = 0x00002800,
93 VMCS_LINK_POINTER_HIGH = 0x00002801, 102 VMCS_LINK_POINTER_HIGH = 0x00002801,
94 GUEST_IA32_DEBUGCTL = 0x00002802, 103 GUEST_IA32_DEBUGCTL = 0x00002802,
@@ -214,6 +223,8 @@ enum vmcs_field {
214#define EXIT_REASON_MSR_WRITE 32 223#define EXIT_REASON_MSR_WRITE 32
215#define EXIT_REASON_MWAIT_INSTRUCTION 36 224#define EXIT_REASON_MWAIT_INSTRUCTION 36
216#define EXIT_REASON_TPR_BELOW_THRESHOLD 43 225#define EXIT_REASON_TPR_BELOW_THRESHOLD 43
226#define EXIT_REASON_APIC_ACCESS 44
227#define EXIT_REASON_WBINVD 54
217 228
218/* 229/*
219 * Interruption-information format 230 * Interruption-information format
@@ -230,13 +241,14 @@ enum vmcs_field {
230 241
231#define INTR_TYPE_EXT_INTR (0 << 8) /* external interrupt */ 242#define INTR_TYPE_EXT_INTR (0 << 8) /* external interrupt */
232#define INTR_TYPE_EXCEPTION (3 << 8) /* processor exception */ 243#define INTR_TYPE_EXCEPTION (3 << 8) /* processor exception */
244#define INTR_TYPE_SOFT_INTR (4 << 8) /* software interrupt */
233 245
234/* 246/*
235 * Exit Qualifications for MOV for Control Register Access 247 * Exit Qualifications for MOV for Control Register Access
236 */ 248 */
237#define CONTROL_REG_ACCESS_NUM 0x7 /* 2:0, number of control register */ 249#define CONTROL_REG_ACCESS_NUM 0x7 /* 2:0, number of control reg.*/
238#define CONTROL_REG_ACCESS_TYPE 0x30 /* 5:4, access type */ 250#define CONTROL_REG_ACCESS_TYPE 0x30 /* 5:4, access type */
239#define CONTROL_REG_ACCESS_REG 0xf00 /* 10:8, general purpose register */ 251#define CONTROL_REG_ACCESS_REG 0xf00 /* 10:8, general purpose reg. */
240#define LMSW_SOURCE_DATA_SHIFT 16 252#define LMSW_SOURCE_DATA_SHIFT 16
241#define LMSW_SOURCE_DATA (0xFFFF << LMSW_SOURCE_DATA_SHIFT) /* 16:31 lmsw source */ 253#define LMSW_SOURCE_DATA (0xFFFF << LMSW_SOURCE_DATA_SHIFT) /* 16:31 lmsw source */
242#define REG_EAX (0 << 8) 254#define REG_EAX (0 << 8)
@@ -259,11 +271,11 @@ enum vmcs_field {
259/* 271/*
260 * Exit Qualifications for MOV for Debug Register Access 272 * Exit Qualifications for MOV for Debug Register Access
261 */ 273 */
262#define DEBUG_REG_ACCESS_NUM 0x7 /* 2:0, number of debug register */ 274#define DEBUG_REG_ACCESS_NUM 0x7 /* 2:0, number of debug reg. */
263#define DEBUG_REG_ACCESS_TYPE 0x10 /* 4, direction of access */ 275#define DEBUG_REG_ACCESS_TYPE 0x10 /* 4, direction of access */
264#define TYPE_MOV_TO_DR (0 << 4) 276#define TYPE_MOV_TO_DR (0 << 4)
265#define TYPE_MOV_FROM_DR (1 << 4) 277#define TYPE_MOV_FROM_DR (1 << 4)
266#define DEBUG_REG_ACCESS_REG 0xf00 /* 11:8, general purpose register */ 278#define DEBUG_REG_ACCESS_REG 0xf00 /* 11:8, general purpose reg. */
267 279
268 280
269/* segment AR */ 281/* segment AR */
@@ -307,4 +319,6 @@ enum vmcs_field {
307#define MSR_IA32_FEATURE_CONTROL_LOCKED 0x1 319#define MSR_IA32_FEATURE_CONTROL_LOCKED 0x1
308#define MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED 0x4 320#define MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED 0x4
309 321
322#define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT 9
323
310#endif 324#endif
diff --git a/drivers/kvm/kvm_main.c b/arch/x86/kvm/x86.c
index c0f372f1d761..8f94a0b89dff 100644
--- a/drivers/kvm/kvm_main.c
+++ b/arch/x86/kvm/x86.c
@@ -1,8 +1,7 @@
1/* 1/*
2 * Kernel-based Virtual Machine driver for Linux 2 * Kernel-based Virtual Machine driver for Linux
3 * 3 *
4 * This module enables machines with Intel VT-x extensions to run virtual 4 * derived from drivers/kvm/kvm_main.c
5 * machines without emulation or binary translation.
6 * 5 *
7 * Copyright (C) 2006 Qumranet, Inc. 6 * Copyright (C) 2006 Qumranet, Inc.
8 * 7 *
@@ -15,80 +14,22 @@
15 * 14 *
16 */ 15 */
17 16
18#include "kvm.h" 17#include <linux/kvm_host.h>
19#include "x86_emulate.h"
20#include "segment_descriptor.h" 18#include "segment_descriptor.h"
21#include "irq.h" 19#include "irq.h"
20#include "mmu.h"
22 21
23#include <linux/kvm.h> 22#include <linux/kvm.h>
24#include <linux/module.h> 23#include <linux/fs.h>
25#include <linux/errno.h>
26#include <linux/percpu.h>
27#include <linux/gfp.h>
28#include <linux/mm.h>
29#include <linux/miscdevice.h>
30#include <linux/vmalloc.h> 24#include <linux/vmalloc.h>
31#include <linux/reboot.h> 25#include <linux/module.h>
32#include <linux/debugfs.h> 26#include <linux/mman.h>
33#include <linux/highmem.h> 27#include <linux/highmem.h>
34#include <linux/file.h>
35#include <linux/sysdev.h>
36#include <linux/cpu.h>
37#include <linux/sched.h>
38#include <linux/cpumask.h>
39#include <linux/smp.h>
40#include <linux/anon_inodes.h>
41#include <linux/profile.h>
42
43#include <asm/processor.h>
44#include <asm/msr.h>
45#include <asm/io.h>
46#include <asm/uaccess.h>
47#include <asm/desc.h>
48
49MODULE_AUTHOR("Qumranet");
50MODULE_LICENSE("GPL");
51 28
52static DEFINE_SPINLOCK(kvm_lock); 29#include <asm/uaccess.h>
53static LIST_HEAD(vm_list); 30#include <asm/msr.h>
54
55static cpumask_t cpus_hardware_enabled;
56
57struct kvm_x86_ops *kvm_x86_ops;
58struct kmem_cache *kvm_vcpu_cache;
59EXPORT_SYMBOL_GPL(kvm_vcpu_cache);
60
61static __read_mostly struct preempt_ops kvm_preempt_ops;
62
63#define STAT_OFFSET(x) offsetof(struct kvm_vcpu, stat.x)
64
65static struct kvm_stats_debugfs_item {
66 const char *name;
67 int offset;
68 struct dentry *dentry;
69} debugfs_entries[] = {
70 { "pf_fixed", STAT_OFFSET(pf_fixed) },
71 { "pf_guest", STAT_OFFSET(pf_guest) },
72 { "tlb_flush", STAT_OFFSET(tlb_flush) },
73 { "invlpg", STAT_OFFSET(invlpg) },
74 { "exits", STAT_OFFSET(exits) },
75 { "io_exits", STAT_OFFSET(io_exits) },
76 { "mmio_exits", STAT_OFFSET(mmio_exits) },
77 { "signal_exits", STAT_OFFSET(signal_exits) },
78 { "irq_window", STAT_OFFSET(irq_window_exits) },
79 { "halt_exits", STAT_OFFSET(halt_exits) },
80 { "halt_wakeup", STAT_OFFSET(halt_wakeup) },
81 { "request_irq", STAT_OFFSET(request_irq_exits) },
82 { "irq_exits", STAT_OFFSET(irq_exits) },
83 { "light_exits", STAT_OFFSET(light_exits) },
84 { "efer_reload", STAT_OFFSET(efer_reload) },
85 { NULL }
86};
87
88static struct dentry *debugfs_dir;
89 31
90#define MAX_IO_MSRS 256 32#define MAX_IO_MSRS 256
91
92#define CR0_RESERVED_BITS \ 33#define CR0_RESERVED_BITS \
93 (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \ 34 (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
94 | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \ 35 | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \
@@ -102,317 +43,151 @@ static struct dentry *debugfs_dir;
102#define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR) 43#define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
103#define EFER_RESERVED_BITS 0xfffffffffffff2fe 44#define EFER_RESERVED_BITS 0xfffffffffffff2fe
104 45
105#ifdef CONFIG_X86_64 46#define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM
106// LDT or TSS descriptor in the GDT. 16 bytes. 47#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
107struct segment_descriptor_64 {
108 struct segment_descriptor s;
109 u32 base_higher;
110 u32 pad_zero;
111};
112 48
113#endif 49struct kvm_x86_ops *kvm_x86_ops;
50
51struct kvm_stats_debugfs_item debugfs_entries[] = {
52 { "pf_fixed", VCPU_STAT(pf_fixed) },
53 { "pf_guest", VCPU_STAT(pf_guest) },
54 { "tlb_flush", VCPU_STAT(tlb_flush) },
55 { "invlpg", VCPU_STAT(invlpg) },
56 { "exits", VCPU_STAT(exits) },
57 { "io_exits", VCPU_STAT(io_exits) },
58 { "mmio_exits", VCPU_STAT(mmio_exits) },
59 { "signal_exits", VCPU_STAT(signal_exits) },
60 { "irq_window", VCPU_STAT(irq_window_exits) },
61 { "halt_exits", VCPU_STAT(halt_exits) },
62 { "halt_wakeup", VCPU_STAT(halt_wakeup) },
63 { "request_irq", VCPU_STAT(request_irq_exits) },
64 { "irq_exits", VCPU_STAT(irq_exits) },
65 { "host_state_reload", VCPU_STAT(host_state_reload) },
66 { "efer_reload", VCPU_STAT(efer_reload) },
67 { "fpu_reload", VCPU_STAT(fpu_reload) },
68 { "insn_emulation", VCPU_STAT(insn_emulation) },
69 { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) },
70 { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
71 { "mmu_pte_write", VM_STAT(mmu_pte_write) },
72 { "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
73 { "mmu_pde_zapped", VM_STAT(mmu_pde_zapped) },
74 { "mmu_flooded", VM_STAT(mmu_flooded) },
75 { "mmu_recycled", VM_STAT(mmu_recycled) },
76 { "mmu_cache_miss", VM_STAT(mmu_cache_miss) },
77 { "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
78 { NULL }
79};
114 80
115static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
116 unsigned long arg);
117 81
118unsigned long segment_base(u16 selector) 82unsigned long segment_base(u16 selector)
119{ 83{
120 struct descriptor_table gdt; 84 struct descriptor_table gdt;
121 struct segment_descriptor *d; 85 struct segment_descriptor *d;
122 unsigned long table_base; 86 unsigned long table_base;
123 typedef unsigned long ul;
124 unsigned long v; 87 unsigned long v;
125 88
126 if (selector == 0) 89 if (selector == 0)
127 return 0; 90 return 0;
128 91
129 asm ("sgdt %0" : "=m"(gdt)); 92 asm("sgdt %0" : "=m"(gdt));
130 table_base = gdt.base; 93 table_base = gdt.base;
131 94
132 if (selector & 4) { /* from ldt */ 95 if (selector & 4) { /* from ldt */
133 u16 ldt_selector; 96 u16 ldt_selector;
134 97
135 asm ("sldt %0" : "=g"(ldt_selector)); 98 asm("sldt %0" : "=g"(ldt_selector));
136 table_base = segment_base(ldt_selector); 99 table_base = segment_base(ldt_selector);
137 } 100 }
138 d = (struct segment_descriptor *)(table_base + (selector & ~7)); 101 d = (struct segment_descriptor *)(table_base + (selector & ~7));
139 v = d->base_low | ((ul)d->base_mid << 16) | ((ul)d->base_high << 24); 102 v = d->base_low | ((unsigned long)d->base_mid << 16) |
103 ((unsigned long)d->base_high << 24);
140#ifdef CONFIG_X86_64 104#ifdef CONFIG_X86_64
141 if (d->system == 0 105 if (d->system == 0 && (d->type == 2 || d->type == 9 || d->type == 11))
142 && (d->type == 2 || d->type == 9 || d->type == 11)) 106 v |= ((unsigned long) \
143 v |= ((ul)((struct segment_descriptor_64 *)d)->base_higher) << 32; 107 ((struct segment_descriptor_64 *)d)->base_higher) << 32;
144#endif 108#endif
145 return v; 109 return v;
146} 110}
147EXPORT_SYMBOL_GPL(segment_base); 111EXPORT_SYMBOL_GPL(segment_base);
148 112
149static inline int valid_vcpu(int n) 113u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
150{
151 return likely(n >= 0 && n < KVM_MAX_VCPUS);
152}
153
154void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
155{
156 if (!vcpu->fpu_active || vcpu->guest_fpu_loaded)
157 return;
158
159 vcpu->guest_fpu_loaded = 1;
160 fx_save(&vcpu->host_fx_image);
161 fx_restore(&vcpu->guest_fx_image);
162}
163EXPORT_SYMBOL_GPL(kvm_load_guest_fpu);
164
165void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
166{
167 if (!vcpu->guest_fpu_loaded)
168 return;
169
170 vcpu->guest_fpu_loaded = 0;
171 fx_save(&vcpu->guest_fx_image);
172 fx_restore(&vcpu->host_fx_image);
173}
174EXPORT_SYMBOL_GPL(kvm_put_guest_fpu);
175
176/*
177 * Switches to specified vcpu, until a matching vcpu_put()
178 */
179static void vcpu_load(struct kvm_vcpu *vcpu)
180{
181 int cpu;
182
183 mutex_lock(&vcpu->mutex);
184 cpu = get_cpu();
185 preempt_notifier_register(&vcpu->preempt_notifier);
186 kvm_x86_ops->vcpu_load(vcpu, cpu);
187 put_cpu();
188}
189
190static void vcpu_put(struct kvm_vcpu *vcpu)
191{
192 preempt_disable();
193 kvm_x86_ops->vcpu_put(vcpu);
194 preempt_notifier_unregister(&vcpu->preempt_notifier);
195 preempt_enable();
196 mutex_unlock(&vcpu->mutex);
197}
198
199static void ack_flush(void *_completed)
200{
201}
202
203void kvm_flush_remote_tlbs(struct kvm *kvm)
204{
205 int i, cpu;
206 cpumask_t cpus;
207 struct kvm_vcpu *vcpu;
208
209 cpus_clear(cpus);
210 for (i = 0; i < KVM_MAX_VCPUS; ++i) {
211 vcpu = kvm->vcpus[i];
212 if (!vcpu)
213 continue;
214 if (test_and_set_bit(KVM_TLB_FLUSH, &vcpu->requests))
215 continue;
216 cpu = vcpu->cpu;
217 if (cpu != -1 && cpu != raw_smp_processor_id())
218 cpu_set(cpu, cpus);
219 }
220 smp_call_function_mask(cpus, ack_flush, NULL, 1);
221}
222
223int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
224{ 114{
225 struct page *page; 115 if (irqchip_in_kernel(vcpu->kvm))
226 int r; 116 return vcpu->arch.apic_base;
227
228 mutex_init(&vcpu->mutex);
229 vcpu->cpu = -1;
230 vcpu->mmu.root_hpa = INVALID_PAGE;
231 vcpu->kvm = kvm;
232 vcpu->vcpu_id = id;
233 if (!irqchip_in_kernel(kvm) || id == 0)
234 vcpu->mp_state = VCPU_MP_STATE_RUNNABLE;
235 else 117 else
236 vcpu->mp_state = VCPU_MP_STATE_UNINITIALIZED; 118 return vcpu->arch.apic_base;
237 init_waitqueue_head(&vcpu->wq);
238
239 page = alloc_page(GFP_KERNEL | __GFP_ZERO);
240 if (!page) {
241 r = -ENOMEM;
242 goto fail;
243 }
244 vcpu->run = page_address(page);
245
246 page = alloc_page(GFP_KERNEL | __GFP_ZERO);
247 if (!page) {
248 r = -ENOMEM;
249 goto fail_free_run;
250 }
251 vcpu->pio_data = page_address(page);
252
253 r = kvm_mmu_create(vcpu);
254 if (r < 0)
255 goto fail_free_pio_data;
256
257 return 0;
258
259fail_free_pio_data:
260 free_page((unsigned long)vcpu->pio_data);
261fail_free_run:
262 free_page((unsigned long)vcpu->run);
263fail:
264 return -ENOMEM;
265}
266EXPORT_SYMBOL_GPL(kvm_vcpu_init);
267
268void kvm_vcpu_uninit(struct kvm_vcpu *vcpu)
269{
270 kvm_mmu_destroy(vcpu);
271 if (vcpu->apic)
272 hrtimer_cancel(&vcpu->apic->timer.dev);
273 kvm_free_apic(vcpu->apic);
274 free_page((unsigned long)vcpu->pio_data);
275 free_page((unsigned long)vcpu->run);
276}
277EXPORT_SYMBOL_GPL(kvm_vcpu_uninit);
278
279static struct kvm *kvm_create_vm(void)
280{
281 struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
282
283 if (!kvm)
284 return ERR_PTR(-ENOMEM);
285
286 kvm_io_bus_init(&kvm->pio_bus);
287 mutex_init(&kvm->lock);
288 INIT_LIST_HEAD(&kvm->active_mmu_pages);
289 kvm_io_bus_init(&kvm->mmio_bus);
290 spin_lock(&kvm_lock);
291 list_add(&kvm->vm_list, &vm_list);
292 spin_unlock(&kvm_lock);
293 return kvm;
294}
295
296/*
297 * Free any memory in @free but not in @dont.
298 */
299static void kvm_free_physmem_slot(struct kvm_memory_slot *free,
300 struct kvm_memory_slot *dont)
301{
302 int i;
303
304 if (!dont || free->phys_mem != dont->phys_mem)
305 if (free->phys_mem) {
306 for (i = 0; i < free->npages; ++i)
307 if (free->phys_mem[i])
308 __free_page(free->phys_mem[i]);
309 vfree(free->phys_mem);
310 }
311
312 if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
313 vfree(free->dirty_bitmap);
314
315 free->phys_mem = NULL;
316 free->npages = 0;
317 free->dirty_bitmap = NULL;
318}
319
320static void kvm_free_physmem(struct kvm *kvm)
321{
322 int i;
323
324 for (i = 0; i < kvm->nmemslots; ++i)
325 kvm_free_physmem_slot(&kvm->memslots[i], NULL);
326} 119}
120EXPORT_SYMBOL_GPL(kvm_get_apic_base);
327 121
328static void free_pio_guest_pages(struct kvm_vcpu *vcpu) 122void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data)
329{ 123{
330 int i; 124 /* TODO: reserve bits check */
331 125 if (irqchip_in_kernel(vcpu->kvm))
332 for (i = 0; i < ARRAY_SIZE(vcpu->pio.guest_pages); ++i) 126 kvm_lapic_set_base(vcpu, data);
333 if (vcpu->pio.guest_pages[i]) { 127 else
334 __free_page(vcpu->pio.guest_pages[i]); 128 vcpu->arch.apic_base = data;
335 vcpu->pio.guest_pages[i] = NULL;
336 }
337} 129}
130EXPORT_SYMBOL_GPL(kvm_set_apic_base);
338 131
339static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu) 132void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)
340{ 133{
341 vcpu_load(vcpu); 134 WARN_ON(vcpu->arch.exception.pending);
342 kvm_mmu_unload(vcpu); 135 vcpu->arch.exception.pending = true;
343 vcpu_put(vcpu); 136 vcpu->arch.exception.has_error_code = false;
137 vcpu->arch.exception.nr = nr;
344} 138}
139EXPORT_SYMBOL_GPL(kvm_queue_exception);
345 140
346static void kvm_free_vcpus(struct kvm *kvm) 141void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr,
142 u32 error_code)
347{ 143{
348 unsigned int i; 144 ++vcpu->stat.pf_guest;
349 145 if (vcpu->arch.exception.pending && vcpu->arch.exception.nr == PF_VECTOR) {
350 /* 146 printk(KERN_DEBUG "kvm: inject_page_fault:"
351 * Unpin any mmu pages first. 147 " double fault 0x%lx\n", addr);
352 */ 148 vcpu->arch.exception.nr = DF_VECTOR;
353 for (i = 0; i < KVM_MAX_VCPUS; ++i) 149 vcpu->arch.exception.error_code = 0;
354 if (kvm->vcpus[i]) 150 return;
355 kvm_unload_vcpu_mmu(kvm->vcpus[i]);
356 for (i = 0; i < KVM_MAX_VCPUS; ++i) {
357 if (kvm->vcpus[i]) {
358 kvm_x86_ops->vcpu_free(kvm->vcpus[i]);
359 kvm->vcpus[i] = NULL;
360 }
361 } 151 }
362 152 vcpu->arch.cr2 = addr;
153 kvm_queue_exception_e(vcpu, PF_VECTOR, error_code);
363} 154}
364 155
365static void kvm_destroy_vm(struct kvm *kvm) 156void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
366{ 157{
367 spin_lock(&kvm_lock); 158 WARN_ON(vcpu->arch.exception.pending);
368 list_del(&kvm->vm_list); 159 vcpu->arch.exception.pending = true;
369 spin_unlock(&kvm_lock); 160 vcpu->arch.exception.has_error_code = true;
370 kvm_io_bus_destroy(&kvm->pio_bus); 161 vcpu->arch.exception.nr = nr;
371 kvm_io_bus_destroy(&kvm->mmio_bus); 162 vcpu->arch.exception.error_code = error_code;
372 kfree(kvm->vpic);
373 kfree(kvm->vioapic);
374 kvm_free_vcpus(kvm);
375 kvm_free_physmem(kvm);
376 kfree(kvm);
377} 163}
164EXPORT_SYMBOL_GPL(kvm_queue_exception_e);
378 165
379static int kvm_vm_release(struct inode *inode, struct file *filp) 166static void __queue_exception(struct kvm_vcpu *vcpu)
380{ 167{
381 struct kvm *kvm = filp->private_data; 168 kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr,
382 169 vcpu->arch.exception.has_error_code,
383 kvm_destroy_vm(kvm); 170 vcpu->arch.exception.error_code);
384 return 0;
385}
386
387static void inject_gp(struct kvm_vcpu *vcpu)
388{
389 kvm_x86_ops->inject_gp(vcpu, 0);
390} 171}
391 172
392/* 173/*
393 * Load the pae pdptrs. Return true is they are all valid. 174 * Load the pae pdptrs. Return true is they are all valid.
394 */ 175 */
395static int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3) 176int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
396{ 177{
397 gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT; 178 gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
398 unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2; 179 unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2;
399 int i; 180 int i;
400 u64 *pdpt;
401 int ret; 181 int ret;
402 struct page *page; 182 u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)];
403 u64 pdpte[ARRAY_SIZE(vcpu->pdptrs)];
404 183
405 mutex_lock(&vcpu->kvm->lock); 184 down_read(&current->mm->mmap_sem);
406 page = gfn_to_page(vcpu->kvm, pdpt_gfn); 185 ret = kvm_read_guest_page(vcpu->kvm, pdpt_gfn, pdpte,
407 if (!page) { 186 offset * sizeof(u64), sizeof(pdpte));
187 if (ret < 0) {
408 ret = 0; 188 ret = 0;
409 goto out; 189 goto out;
410 } 190 }
411
412 pdpt = kmap_atomic(page, KM_USER0);
413 memcpy(pdpte, pdpt+offset, sizeof(pdpte));
414 kunmap_atomic(pdpt, KM_USER0);
415
416 for (i = 0; i < ARRAY_SIZE(pdpte); ++i) { 191 for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
417 if ((pdpte[i] & 1) && (pdpte[i] & 0xfffffff0000001e6ull)) { 192 if ((pdpte[i] & 1) && (pdpte[i] & 0xfffffff0000001e6ull)) {
418 ret = 0; 193 ret = 0;
@@ -421,78 +196,96 @@ static int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
421 } 196 }
422 ret = 1; 197 ret = 1;
423 198
424 memcpy(vcpu->pdptrs, pdpte, sizeof(vcpu->pdptrs)); 199 memcpy(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs));
425out: 200out:
426 mutex_unlock(&vcpu->kvm->lock); 201 up_read(&current->mm->mmap_sem);
427 202
428 return ret; 203 return ret;
429} 204}
430 205
206static bool pdptrs_changed(struct kvm_vcpu *vcpu)
207{
208 u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)];
209 bool changed = true;
210 int r;
211
212 if (is_long_mode(vcpu) || !is_pae(vcpu))
213 return false;
214
215 down_read(&current->mm->mmap_sem);
216 r = kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u, pdpte, sizeof(pdpte));
217 if (r < 0)
218 goto out;
219 changed = memcmp(pdpte, vcpu->arch.pdptrs, sizeof(pdpte)) != 0;
220out:
221 up_read(&current->mm->mmap_sem);
222
223 return changed;
224}
225
431void set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) 226void set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
432{ 227{
433 if (cr0 & CR0_RESERVED_BITS) { 228 if (cr0 & CR0_RESERVED_BITS) {
434 printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n", 229 printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n",
435 cr0, vcpu->cr0); 230 cr0, vcpu->arch.cr0);
436 inject_gp(vcpu); 231 kvm_inject_gp(vcpu, 0);
437 return; 232 return;
438 } 233 }
439 234
440 if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) { 235 if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) {
441 printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n"); 236 printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n");
442 inject_gp(vcpu); 237 kvm_inject_gp(vcpu, 0);
443 return; 238 return;
444 } 239 }
445 240
446 if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) { 241 if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) {
447 printk(KERN_DEBUG "set_cr0: #GP, set PG flag " 242 printk(KERN_DEBUG "set_cr0: #GP, set PG flag "
448 "and a clear PE flag\n"); 243 "and a clear PE flag\n");
449 inject_gp(vcpu); 244 kvm_inject_gp(vcpu, 0);
450 return; 245 return;
451 } 246 }
452 247
453 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) { 248 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
454#ifdef CONFIG_X86_64 249#ifdef CONFIG_X86_64
455 if ((vcpu->shadow_efer & EFER_LME)) { 250 if ((vcpu->arch.shadow_efer & EFER_LME)) {
456 int cs_db, cs_l; 251 int cs_db, cs_l;
457 252
458 if (!is_pae(vcpu)) { 253 if (!is_pae(vcpu)) {
459 printk(KERN_DEBUG "set_cr0: #GP, start paging " 254 printk(KERN_DEBUG "set_cr0: #GP, start paging "
460 "in long mode while PAE is disabled\n"); 255 "in long mode while PAE is disabled\n");
461 inject_gp(vcpu); 256 kvm_inject_gp(vcpu, 0);
462 return; 257 return;
463 } 258 }
464 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); 259 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
465 if (cs_l) { 260 if (cs_l) {
466 printk(KERN_DEBUG "set_cr0: #GP, start paging " 261 printk(KERN_DEBUG "set_cr0: #GP, start paging "
467 "in long mode while CS.L == 1\n"); 262 "in long mode while CS.L == 1\n");
468 inject_gp(vcpu); 263 kvm_inject_gp(vcpu, 0);
469 return; 264 return;
470 265
471 } 266 }
472 } else 267 } else
473#endif 268#endif
474 if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->cr3)) { 269 if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3)) {
475 printk(KERN_DEBUG "set_cr0: #GP, pdptrs " 270 printk(KERN_DEBUG "set_cr0: #GP, pdptrs "
476 "reserved bits\n"); 271 "reserved bits\n");
477 inject_gp(vcpu); 272 kvm_inject_gp(vcpu, 0);
478 return; 273 return;
479 } 274 }
480 275
481 } 276 }
482 277
483 kvm_x86_ops->set_cr0(vcpu, cr0); 278 kvm_x86_ops->set_cr0(vcpu, cr0);
484 vcpu->cr0 = cr0; 279 vcpu->arch.cr0 = cr0;
485 280
486 mutex_lock(&vcpu->kvm->lock);
487 kvm_mmu_reset_context(vcpu); 281 kvm_mmu_reset_context(vcpu);
488 mutex_unlock(&vcpu->kvm->lock);
489 return; 282 return;
490} 283}
491EXPORT_SYMBOL_GPL(set_cr0); 284EXPORT_SYMBOL_GPL(set_cr0);
492 285
493void lmsw(struct kvm_vcpu *vcpu, unsigned long msw) 286void lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
494{ 287{
495 set_cr0(vcpu, (vcpu->cr0 & ~0x0ful) | (msw & 0x0f)); 288 set_cr0(vcpu, (vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f));
496} 289}
497EXPORT_SYMBOL_GPL(lmsw); 290EXPORT_SYMBOL_GPL(lmsw);
498 291
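
For readers unfamiliar with the PAE PDPTE layout: the check in load_pdptrs() above rejects an entry that is marked present (bit 0) but has any bit of the 0xfffffff0000001e6 reserved mask set. A self-contained sketch of that test, with made-up sample values (this is an illustration, not kernel code):

#include <stdio.h>
#include <stdint.h>

/* A PDPTE is rejected when it is present (bit 0) and any bit of the
 * reserved mask used above is set. */
static int pdpte_acceptable(uint64_t pdpte)
{
    return !((pdpte & 1) && (pdpte & 0xfffffff0000001e6ull));
}

int main(void)
{
    /* present, points at 0x12345000, no reserved bits -> accepted */
    printf("%d\n", pdpte_acceptable(0x0000000012345001ull));
    /* present, but reserved bit 1 set -> rejected */
    printf("%d\n", pdpte_acceptable(0x0000000012345003ull));
    /* not present: reserved bits are not checked -> accepted */
    printf("%d\n", pdpte_acceptable(0x0000000000000002ull));
    return 0;
}
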
@@ -500,7 +293,7 @@ void set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
500{ 293{
501 if (cr4 & CR4_RESERVED_BITS) { 294 if (cr4 & CR4_RESERVED_BITS) {
502 printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n"); 295 printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n");
503 inject_gp(vcpu); 296 kvm_inject_gp(vcpu, 0);
504 return; 297 return;
505 } 298 }
506 299
@@ -508,35 +301,38 @@ void set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
508 if (!(cr4 & X86_CR4_PAE)) { 301 if (!(cr4 & X86_CR4_PAE)) {
509 printk(KERN_DEBUG "set_cr4: #GP, clearing PAE while " 302 printk(KERN_DEBUG "set_cr4: #GP, clearing PAE while "
510 "in long mode\n"); 303 "in long mode\n");
511 inject_gp(vcpu); 304 kvm_inject_gp(vcpu, 0);
512 return; 305 return;
513 } 306 }
514 } else if (is_paging(vcpu) && !is_pae(vcpu) && (cr4 & X86_CR4_PAE) 307 } else if (is_paging(vcpu) && !is_pae(vcpu) && (cr4 & X86_CR4_PAE)
515 && !load_pdptrs(vcpu, vcpu->cr3)) { 308 && !load_pdptrs(vcpu, vcpu->arch.cr3)) {
516 printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n"); 309 printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n");
517 inject_gp(vcpu); 310 kvm_inject_gp(vcpu, 0);
518 return; 311 return;
519 } 312 }
520 313
521 if (cr4 & X86_CR4_VMXE) { 314 if (cr4 & X86_CR4_VMXE) {
522 printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n"); 315 printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n");
523 inject_gp(vcpu); 316 kvm_inject_gp(vcpu, 0);
524 return; 317 return;
525 } 318 }
526 kvm_x86_ops->set_cr4(vcpu, cr4); 319 kvm_x86_ops->set_cr4(vcpu, cr4);
527 vcpu->cr4 = cr4; 320 vcpu->arch.cr4 = cr4;
528 mutex_lock(&vcpu->kvm->lock);
529 kvm_mmu_reset_context(vcpu); 321 kvm_mmu_reset_context(vcpu);
530 mutex_unlock(&vcpu->kvm->lock);
531} 322}
532EXPORT_SYMBOL_GPL(set_cr4); 323EXPORT_SYMBOL_GPL(set_cr4);
533 324
534void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) 325void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
535{ 326{
327 if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) {
328 kvm_mmu_flush_tlb(vcpu);
329 return;
330 }
331
536 if (is_long_mode(vcpu)) { 332 if (is_long_mode(vcpu)) {
537 if (cr3 & CR3_L_MODE_RESERVED_BITS) { 333 if (cr3 & CR3_L_MODE_RESERVED_BITS) {
538 printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n"); 334 printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n");
539 inject_gp(vcpu); 335 kvm_inject_gp(vcpu, 0);
540 return; 336 return;
541 } 337 }
542 } else { 338 } else {
@@ -544,26 +340,23 @@ void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
544 if (cr3 & CR3_PAE_RESERVED_BITS) { 340 if (cr3 & CR3_PAE_RESERVED_BITS) {
545 printk(KERN_DEBUG 341 printk(KERN_DEBUG
546 "set_cr3: #GP, reserved bits\n"); 342 "set_cr3: #GP, reserved bits\n");
547 inject_gp(vcpu); 343 kvm_inject_gp(vcpu, 0);
548 return; 344 return;
549 } 345 }
550 if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) { 346 if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) {
551 printk(KERN_DEBUG "set_cr3: #GP, pdptrs " 347 printk(KERN_DEBUG "set_cr3: #GP, pdptrs "
552 "reserved bits\n"); 348 "reserved bits\n");
553 inject_gp(vcpu); 349 kvm_inject_gp(vcpu, 0);
554 return;
555 }
556 } else {
557 if (cr3 & CR3_NONPAE_RESERVED_BITS) {
558 printk(KERN_DEBUG
559 "set_cr3: #GP, reserved bits\n");
560 inject_gp(vcpu);
561 return; 350 return;
562 } 351 }
563 } 352 }
353 /*
354 * We don't check reserved bits in nonpae mode, because
355 * this isn't enforced, and VMware depends on this.
356 */
564 } 357 }
565 358
566 mutex_lock(&vcpu->kvm->lock); 359 down_read(&current->mm->mmap_sem);
567 /* 360 /*
568 * Does the new cr3 value map to physical memory? (Note, we 361 * Does the new cr3 value map to physical memory? (Note, we
569 * catch an invalid cr3 even in real-mode, because it would 362 * catch an invalid cr3 even in real-mode, because it would
@@ -574,12 +367,12 @@ void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
574 * to debug) behavior on the guest side. 367 * to debug) behavior on the guest side.
575 */ 368 */
576 if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT))) 369 if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT)))
577 inject_gp(vcpu); 370 kvm_inject_gp(vcpu, 0);
578 else { 371 else {
579 vcpu->cr3 = cr3; 372 vcpu->arch.cr3 = cr3;
580 vcpu->mmu.new_cr3(vcpu); 373 vcpu->arch.mmu.new_cr3(vcpu);
581 } 374 }
582 mutex_unlock(&vcpu->kvm->lock); 375 up_read(&current->mm->mmap_sem);
583} 376}
584EXPORT_SYMBOL_GPL(set_cr3); 377EXPORT_SYMBOL_GPL(set_cr3);
585 378
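
The rewritten set_cr3() above gains a fast path: reloading an unchanged cr3 whose cached PDPTRs still match guest memory only needs a TLB flush rather than a full MMU reload. A toy, self-contained model of that decision follows; the struct and helpers are simplified stand-ins, not the kernel's types, and the long-mode/non-PAE cases are deliberately ignored:

#include <stdbool.h>
#include <stdio.h>
#include <string.h>
#include <stdint.h>

/* Simplified stand-ins; not the kernel's types. */
struct toy_vcpu {
    uint64_t cr3;
    uint64_t cached_pdptrs[4];
};

static bool toy_pdptrs_changed(const struct toy_vcpu *v, const uint64_t *guest)
{
    return memcmp(guest, v->cached_pdptrs, sizeof(v->cached_pdptrs)) != 0;
}

static const char *toy_set_cr3(struct toy_vcpu *v, uint64_t cr3,
                               const uint64_t *guest_pdptrs)
{
    if (cr3 == v->cr3 && !toy_pdptrs_changed(v, guest_pdptrs))
        return "tlb flush only";
    v->cr3 = cr3;
    memcpy(v->cached_pdptrs, guest_pdptrs, sizeof(v->cached_pdptrs));
    return "full mmu reload";
}

int main(void)
{
    uint64_t guest[4] = { 0x1001, 0x2001, 0x3001, 0x4001 };
    struct toy_vcpu v = { .cr3 = 0x5000 };

    memcpy(v.cached_pdptrs, guest, sizeof(guest));
    printf("%s\n", toy_set_cr3(&v, 0x5000, guest)); /* tlb flush only */
    guest[0] = 0x9001;                              /* guest rewrote a PDPTE */
    printf("%s\n", toy_set_cr3(&v, 0x5000, guest)); /* full mmu reload */
    return 0;
}
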
@@ -587,13 +380,13 @@ void set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
587{ 380{
588 if (cr8 & CR8_RESERVED_BITS) { 381 if (cr8 & CR8_RESERVED_BITS) {
589 printk(KERN_DEBUG "set_cr8: #GP, reserved bits 0x%lx\n", cr8); 382 printk(KERN_DEBUG "set_cr8: #GP, reserved bits 0x%lx\n", cr8);
590 inject_gp(vcpu); 383 kvm_inject_gp(vcpu, 0);
591 return; 384 return;
592 } 385 }
593 if (irqchip_in_kernel(vcpu->kvm)) 386 if (irqchip_in_kernel(vcpu->kvm))
594 kvm_lapic_set_tpr(vcpu, cr8); 387 kvm_lapic_set_tpr(vcpu, cr8);
595 else 388 else
596 vcpu->cr8 = cr8; 389 vcpu->arch.cr8 = cr8;
597} 390}
598EXPORT_SYMBOL_GPL(set_cr8); 391EXPORT_SYMBOL_GPL(set_cr8);
599 392
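
Architecturally, CR8 carries the task-priority class, i.e. bits 7:4 of the local APIC TPR, which is why set_cr8()/get_cr8() above can simply hand the value to the in-kernel lapic when one is present. A one-line conversion, shown purely for illustration and not taken from this patch:

#include <stdio.h>
#include <stdint.h>

/* CR8 holds TPR bits 7:4; the low TPR nibble is not visible through CR8. */
static uint32_t cr8_to_tpr(uint64_t cr8)
{
    return (uint32_t)(cr8 & 0xf) << 4;
}

static uint64_t tpr_to_cr8(uint32_t tpr)
{
    return (tpr >> 4) & 0xf;
}

int main(void)
{
    printf("cr8 0xa -> tpr 0x%02x\n", cr8_to_tpr(0xa));          /* 0xa0 */
    printf("tpr 0xa5 -> cr8 0x%llx\n",
           (unsigned long long)tpr_to_cr8(0xa5));                /* 0xa  */
    return 0;
}
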
@@ -602,210 +395,846 @@ unsigned long get_cr8(struct kvm_vcpu *vcpu)
602 if (irqchip_in_kernel(vcpu->kvm)) 395 if (irqchip_in_kernel(vcpu->kvm))
603 return kvm_lapic_get_cr8(vcpu); 396 return kvm_lapic_get_cr8(vcpu);
604 else 397 else
605 return vcpu->cr8; 398 return vcpu->arch.cr8;
606} 399}
607EXPORT_SYMBOL_GPL(get_cr8); 400EXPORT_SYMBOL_GPL(get_cr8);
608 401
609u64 kvm_get_apic_base(struct kvm_vcpu *vcpu) 402/*
403 * List of msr numbers which we expose to userspace through KVM_GET_MSRS
404 * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
405 *
406 * This list is modified at module load time to reflect the
407 * capabilities of the host cpu.
408 */
409static u32 msrs_to_save[] = {
410 MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
411 MSR_K6_STAR,
412#ifdef CONFIG_X86_64
413 MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
414#endif
415 MSR_IA32_TIME_STAMP_COUNTER,
416};
417
418static unsigned num_msrs_to_save;
419
420static u32 emulated_msrs[] = {
421 MSR_IA32_MISC_ENABLE,
422};
423
424#ifdef CONFIG_X86_64
425
426static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
610{ 427{
611 if (irqchip_in_kernel(vcpu->kvm)) 428 if (efer & EFER_RESERVED_BITS) {
612 return vcpu->apic_base; 429 printk(KERN_DEBUG "set_efer: 0x%llx #GP, reserved bits\n",
613 else 430 efer);
614 return vcpu->apic_base; 431 kvm_inject_gp(vcpu, 0);
432 return;
433 }
434
435 if (is_paging(vcpu)
436 && (vcpu->arch.shadow_efer & EFER_LME) != (efer & EFER_LME)) {
437 printk(KERN_DEBUG "set_efer: #GP, change LME while paging\n");
438 kvm_inject_gp(vcpu, 0);
439 return;
440 }
441
442 kvm_x86_ops->set_efer(vcpu, efer);
443
444 efer &= ~EFER_LMA;
445 efer |= vcpu->arch.shadow_efer & EFER_LMA;
446
447 vcpu->arch.shadow_efer = efer;
615} 448}
616EXPORT_SYMBOL_GPL(kvm_get_apic_base);
617 449
618void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data) 450#endif
451
452/*
453 * Writes msr value into into the appropriate "register".
454 * Returns 0 on success, non-0 otherwise.
455 * Assumes vcpu_load() was already called.
456 */
457int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
619{ 458{
620 /* TODO: reserve bits check */ 459 return kvm_x86_ops->set_msr(vcpu, msr_index, data);
621 if (irqchip_in_kernel(vcpu->kvm))
622 kvm_lapic_set_base(vcpu, data);
623 else
624 vcpu->apic_base = data;
625} 460}
626EXPORT_SYMBOL_GPL(kvm_set_apic_base);
627 461
628void fx_init(struct kvm_vcpu *vcpu) 462/*
463 * Adapt set_msr() to msr_io()'s calling convention
464 */
465static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
629{ 466{
630 unsigned after_mxcsr_mask; 467 return kvm_set_msr(vcpu, index, *data);
468}
631 469
632 /* Initialize guest FPU by resetting ours and saving into guest's */
633 preempt_disable();
634 fx_save(&vcpu->host_fx_image);
635 fpu_init();
636 fx_save(&vcpu->guest_fx_image);
637 fx_restore(&vcpu->host_fx_image);
638 preempt_enable();
639 470
640 vcpu->cr0 |= X86_CR0_ET; 471int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
641 after_mxcsr_mask = offsetof(struct i387_fxsave_struct, st_space); 472{
642 vcpu->guest_fx_image.mxcsr = 0x1f80; 473 switch (msr) {
643 memset((void *)&vcpu->guest_fx_image + after_mxcsr_mask, 474#ifdef CONFIG_X86_64
644 0, sizeof(struct i387_fxsave_struct) - after_mxcsr_mask); 475 case MSR_EFER:
476 set_efer(vcpu, data);
477 break;
478#endif
479 case MSR_IA32_MC0_STATUS:
480 pr_unimpl(vcpu, "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n",
481 __FUNCTION__, data);
482 break;
483 case MSR_IA32_MCG_STATUS:
484 pr_unimpl(vcpu, "%s: MSR_IA32_MCG_STATUS 0x%llx, nop\n",
485 __FUNCTION__, data);
486 break;
487 case MSR_IA32_UCODE_REV:
488 case MSR_IA32_UCODE_WRITE:
489 case 0x200 ... 0x2ff: /* MTRRs */
490 break;
491 case MSR_IA32_APICBASE:
492 kvm_set_apic_base(vcpu, data);
493 break;
494 case MSR_IA32_MISC_ENABLE:
495 vcpu->arch.ia32_misc_enable_msr = data;
496 break;
497 default:
498 pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", msr, data);
499 return 1;
500 }
501 return 0;
645} 502}
646EXPORT_SYMBOL_GPL(fx_init); 503EXPORT_SYMBOL_GPL(kvm_set_msr_common);
504
505
506/*
507 * Reads an msr value (of 'msr_index') into 'pdata'.
508 * Returns 0 on success, non-0 otherwise.
509 * Assumes vcpu_load() was already called.
510 */
511int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
512{
513 return kvm_x86_ops->get_msr(vcpu, msr_index, pdata);
514}
515
516int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
517{
518 u64 data;
519
520 switch (msr) {
521 case 0xc0010010: /* SYSCFG */
522 case 0xc0010015: /* HWCR */
523 case MSR_IA32_PLATFORM_ID:
524 case MSR_IA32_P5_MC_ADDR:
525 case MSR_IA32_P5_MC_TYPE:
526 case MSR_IA32_MC0_CTL:
527 case MSR_IA32_MCG_STATUS:
528 case MSR_IA32_MCG_CAP:
529 case MSR_IA32_MC0_MISC:
530 case MSR_IA32_MC0_MISC+4:
531 case MSR_IA32_MC0_MISC+8:
532 case MSR_IA32_MC0_MISC+12:
533 case MSR_IA32_MC0_MISC+16:
534 case MSR_IA32_UCODE_REV:
535 case MSR_IA32_PERF_STATUS:
536 case MSR_IA32_EBL_CR_POWERON:
537 /* MTRR registers */
538 case 0xfe:
539 case 0x200 ... 0x2ff:
540 data = 0;
541 break;
542 case 0xcd: /* fsb frequency */
543 data = 3;
544 break;
545 case MSR_IA32_APICBASE:
546 data = kvm_get_apic_base(vcpu);
547 break;
548 case MSR_IA32_MISC_ENABLE:
549 data = vcpu->arch.ia32_misc_enable_msr;
550 break;
551#ifdef CONFIG_X86_64
552 case MSR_EFER:
553 data = vcpu->arch.shadow_efer;
554 break;
555#endif
556 default:
557 pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
558 return 1;
559 }
560 *pdata = data;
561 return 0;
562}
563EXPORT_SYMBOL_GPL(kvm_get_msr_common);
647 564
648/* 565/*
649 * Allocate some memory and give it an address in the guest physical address 566 * Read or write a bunch of msrs. All parameters are kernel addresses.
650 * space.
651 * 567 *
652 * Discontiguous memory is allowed, mostly for framebuffers. 568 * @return number of msrs set successfully.
653 */ 569 */
654static int kvm_vm_ioctl_set_memory_region(struct kvm *kvm, 570static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,
655 struct kvm_memory_region *mem) 571 struct kvm_msr_entry *entries,
572 int (*do_msr)(struct kvm_vcpu *vcpu,
573 unsigned index, u64 *data))
656{ 574{
657 int r; 575 int i;
658 gfn_t base_gfn;
659 unsigned long npages;
660 unsigned long i;
661 struct kvm_memory_slot *memslot;
662 struct kvm_memory_slot old, new;
663 576
664 r = -EINVAL; 577 vcpu_load(vcpu);
665 /* General sanity checks */ 578
666 if (mem->memory_size & (PAGE_SIZE - 1)) 579 for (i = 0; i < msrs->nmsrs; ++i)
667 goto out; 580 if (do_msr(vcpu, entries[i].index, &entries[i].data))
668 if (mem->guest_phys_addr & (PAGE_SIZE - 1)) 581 break;
582
583 vcpu_put(vcpu);
584
585 return i;
586}
587
588/*
589 * Read or write a bunch of msrs. Parameters are user addresses.
590 *
591 * @return number of msrs set successfully.
592 */
593static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs,
594 int (*do_msr)(struct kvm_vcpu *vcpu,
595 unsigned index, u64 *data),
596 int writeback)
597{
598 struct kvm_msrs msrs;
599 struct kvm_msr_entry *entries;
600 int r, n;
601 unsigned size;
602
603 r = -EFAULT;
604 if (copy_from_user(&msrs, user_msrs, sizeof msrs))
669 goto out; 605 goto out;
670 if (mem->slot >= KVM_MEMORY_SLOTS) 606
607 r = -E2BIG;
608 if (msrs.nmsrs >= MAX_IO_MSRS)
671 goto out; 609 goto out;
672 if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr) 610
611 r = -ENOMEM;
612 size = sizeof(struct kvm_msr_entry) * msrs.nmsrs;
613 entries = vmalloc(size);
614 if (!entries)
673 goto out; 615 goto out;
674 616
675 memslot = &kvm->memslots[mem->slot]; 617 r = -EFAULT;
676 base_gfn = mem->guest_phys_addr >> PAGE_SHIFT; 618 if (copy_from_user(entries, user_msrs->entries, size))
677 npages = mem->memory_size >> PAGE_SHIFT; 619 goto out_free;
678 620
679 if (!npages) 621 r = n = __msr_io(vcpu, &msrs, entries, do_msr);
680 mem->flags &= ~KVM_MEM_LOG_DIRTY_PAGES; 622 if (r < 0)
623 goto out_free;
681 624
682 mutex_lock(&kvm->lock); 625 r = -EFAULT;
626 if (writeback && copy_to_user(user_msrs->entries, entries, size))
627 goto out_free;
683 628
684 new = old = *memslot; 629 r = n;
685 630
686 new.base_gfn = base_gfn; 631out_free:
687 new.npages = npages; 632 vfree(entries);
688 new.flags = mem->flags; 633out:
634 return r;
635}
689 636
690 /* Disallow changing a memory slot's size. */ 637/*
691 r = -EINVAL; 638 * Make sure that a cpu that is being hot-unplugged does not have any vcpus
692 if (npages && old.npages && npages != old.npages) 639 * cached on it.
693 goto out_unlock; 640 */
641void decache_vcpus_on_cpu(int cpu)
642{
643 struct kvm *vm;
644 struct kvm_vcpu *vcpu;
645 int i;
694 646
695 /* Check for overlaps */ 647 spin_lock(&kvm_lock);
696 r = -EEXIST; 648 list_for_each_entry(vm, &vm_list, vm_list)
697 for (i = 0; i < KVM_MEMORY_SLOTS; ++i) { 649 for (i = 0; i < KVM_MAX_VCPUS; ++i) {
698 struct kvm_memory_slot *s = &kvm->memslots[i]; 650 vcpu = vm->vcpus[i];
651 if (!vcpu)
652 continue;
653 /*
654 * If the vcpu is locked, then it is running on some
655 * other cpu and therefore it is not cached on the
656 * cpu in question.
657 *
658 * If it's not locked, check the last cpu it executed
659 * on.
660 */
661 if (mutex_trylock(&vcpu->mutex)) {
662 if (vcpu->cpu == cpu) {
663 kvm_x86_ops->vcpu_decache(vcpu);
664 vcpu->cpu = -1;
665 }
666 mutex_unlock(&vcpu->mutex);
667 }
668 }
669 spin_unlock(&kvm_lock);
670}
699 671
700 if (s == memslot) 672int kvm_dev_ioctl_check_extension(long ext)
701 continue; 673{
702 if (!((base_gfn + npages <= s->base_gfn) || 674 int r;
703 (base_gfn >= s->base_gfn + s->npages))) 675
704 goto out_unlock; 676 switch (ext) {
677 case KVM_CAP_IRQCHIP:
678 case KVM_CAP_HLT:
679 case KVM_CAP_MMU_SHADOW_CACHE_CONTROL:
680 case KVM_CAP_USER_MEMORY:
681 case KVM_CAP_SET_TSS_ADDR:
682 case KVM_CAP_EXT_CPUID:
683 r = 1;
684 break;
685 case KVM_CAP_VAPIC:
686 r = !kvm_x86_ops->cpu_has_accelerated_tpr();
687 break;
688 default:
689 r = 0;
690 break;
705 } 691 }
692 return r;
706 693
707 /* Deallocate if slot is being removed */ 694}
708 if (!npages)
709 new.phys_mem = NULL;
710 695
711 /* Free page dirty bitmap if unneeded */ 696long kvm_arch_dev_ioctl(struct file *filp,
712 if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES)) 697 unsigned int ioctl, unsigned long arg)
713 new.dirty_bitmap = NULL; 698{
699 void __user *argp = (void __user *)arg;
700 long r;
714 701
715 r = -ENOMEM; 702 switch (ioctl) {
703 case KVM_GET_MSR_INDEX_LIST: {
704 struct kvm_msr_list __user *user_msr_list = argp;
705 struct kvm_msr_list msr_list;
706 unsigned n;
716 707
717 /* Allocate if a slot is being created */ 708 r = -EFAULT;
718 if (npages && !new.phys_mem) { 709 if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list))
719 new.phys_mem = vmalloc(npages * sizeof(struct page *)); 710 goto out;
711 n = msr_list.nmsrs;
712 msr_list.nmsrs = num_msrs_to_save + ARRAY_SIZE(emulated_msrs);
713 if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list))
714 goto out;
715 r = -E2BIG;
716 if (n < num_msrs_to_save)
717 goto out;
718 r = -EFAULT;
719 if (copy_to_user(user_msr_list->indices, &msrs_to_save,
720 num_msrs_to_save * sizeof(u32)))
721 goto out;
722 if (copy_to_user(user_msr_list->indices
723 + num_msrs_to_save * sizeof(u32),
724 &emulated_msrs,
725 ARRAY_SIZE(emulated_msrs) * sizeof(u32)))
726 goto out;
727 r = 0;
728 break;
729 }
730 default:
731 r = -EINVAL;
732 }
733out:
734 return r;
735}
736
737void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
738{
739 kvm_x86_ops->vcpu_load(vcpu, cpu);
740}
720 741
721 if (!new.phys_mem) 742void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
722 goto out_unlock; 743{
744 kvm_x86_ops->vcpu_put(vcpu);
745 kvm_put_guest_fpu(vcpu);
746}
723 747
724 memset(new.phys_mem, 0, npages * sizeof(struct page *)); 748static int is_efer_nx(void)
725 for (i = 0; i < npages; ++i) { 749{
726 new.phys_mem[i] = alloc_page(GFP_HIGHUSER 750 u64 efer;
727 | __GFP_ZERO); 751
728 if (!new.phys_mem[i]) 752 rdmsrl(MSR_EFER, efer);
729 goto out_unlock; 753 return efer & EFER_NX;
730 set_page_private(new.phys_mem[i],0); 754}
731 }
732 }
733 755
734 /* Allocate page dirty bitmap if needed */ 756static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu)
735 if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) { 757{
736 unsigned dirty_bytes = ALIGN(npages, BITS_PER_LONG) / 8; 758 int i;
759 struct kvm_cpuid_entry2 *e, *entry;
737 760
738 new.dirty_bitmap = vmalloc(dirty_bytes); 761 entry = NULL;
739 if (!new.dirty_bitmap) 762 for (i = 0; i < vcpu->arch.cpuid_nent; ++i) {
740 goto out_unlock; 763 e = &vcpu->arch.cpuid_entries[i];
741 memset(new.dirty_bitmap, 0, dirty_bytes); 764 if (e->function == 0x80000001) {
765 entry = e;
766 break;
767 }
742 } 768 }
769 if (entry && (entry->edx & (1 << 20)) && !is_efer_nx()) {
770 entry->edx &= ~(1 << 20);
771 printk(KERN_INFO "kvm: guest NX capability removed\n");
772 }
773}
743 774
744 if (mem->slot >= kvm->nmemslots) 775/* when an old userspace process fills a new kernel module */
745 kvm->nmemslots = mem->slot + 1; 776static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
777 struct kvm_cpuid *cpuid,
778 struct kvm_cpuid_entry __user *entries)
779{
780 int r, i;
781 struct kvm_cpuid_entry *cpuid_entries;
746 782
747 *memslot = new; 783 r = -E2BIG;
784 if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
785 goto out;
786 r = -ENOMEM;
787 cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry) * cpuid->nent);
788 if (!cpuid_entries)
789 goto out;
790 r = -EFAULT;
791 if (copy_from_user(cpuid_entries, entries,
792 cpuid->nent * sizeof(struct kvm_cpuid_entry)))
793 goto out_free;
794 for (i = 0; i < cpuid->nent; i++) {
795 vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function;
796 vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax;
797 vcpu->arch.cpuid_entries[i].ebx = cpuid_entries[i].ebx;
798 vcpu->arch.cpuid_entries[i].ecx = cpuid_entries[i].ecx;
799 vcpu->arch.cpuid_entries[i].edx = cpuid_entries[i].edx;
800 vcpu->arch.cpuid_entries[i].index = 0;
801 vcpu->arch.cpuid_entries[i].flags = 0;
802 vcpu->arch.cpuid_entries[i].padding[0] = 0;
803 vcpu->arch.cpuid_entries[i].padding[1] = 0;
804 vcpu->arch.cpuid_entries[i].padding[2] = 0;
805 }
806 vcpu->arch.cpuid_nent = cpuid->nent;
807 cpuid_fix_nx_cap(vcpu);
808 r = 0;
748 809
749 kvm_mmu_slot_remove_write_access(kvm, mem->slot); 810out_free:
750 kvm_flush_remote_tlbs(kvm); 811 vfree(cpuid_entries);
812out:
813 return r;
814}
751 815
752 mutex_unlock(&kvm->lock); 816static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
817 struct kvm_cpuid2 *cpuid,
818 struct kvm_cpuid_entry2 __user *entries)
819{
820 int r;
753 821
754 kvm_free_physmem_slot(&old, &new); 822 r = -E2BIG;
823 if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
824 goto out;
825 r = -EFAULT;
826 if (copy_from_user(&vcpu->arch.cpuid_entries, entries,
827 cpuid->nent * sizeof(struct kvm_cpuid_entry2)))
828 goto out;
829 vcpu->arch.cpuid_nent = cpuid->nent;
755 return 0; 830 return 0;
756 831
757out_unlock:
758 mutex_unlock(&kvm->lock);
759 kvm_free_physmem_slot(&new, &old);
760out: 832out:
761 return r; 833 return r;
762} 834}
763 835
764/* 836static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
765 * Get (and clear) the dirty memory log for a memory slot. 837 struct kvm_cpuid2 *cpuid,
766 */ 838 struct kvm_cpuid_entry2 __user *entries)
767static int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
768 struct kvm_dirty_log *log)
769{ 839{
770 struct kvm_memory_slot *memslot; 840 int r;
771 int r, i;
772 int n;
773 unsigned long any = 0;
774
775 mutex_lock(&kvm->lock);
776 841
777 r = -EINVAL; 842 r = -E2BIG;
778 if (log->slot >= KVM_MEMORY_SLOTS) 843 if (cpuid->nent < vcpu->arch.cpuid_nent)
779 goto out; 844 goto out;
780 845 r = -EFAULT;
781 memslot = &kvm->memslots[log->slot]; 846 if (copy_to_user(entries, &vcpu->arch.cpuid_entries,
782 r = -ENOENT; 847 vcpu->arch.cpuid_nent * sizeof(struct kvm_cpuid_entry2)))
783 if (!memslot->dirty_bitmap)
784 goto out; 848 goto out;
849 return 0;
850
851out:
852 cpuid->nent = vcpu->arch.cpuid_nent;
853 return r;
854}
855
856static inline u32 bit(int bitno)
857{
858 return 1 << (bitno & 31);
859}
860
861static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function,
862 u32 index)
863{
864 entry->function = function;
865 entry->index = index;
866 cpuid_count(entry->function, entry->index,
867 &entry->eax, &entry->ebx, &entry->ecx, &entry->edx);
868 entry->flags = 0;
869}
870
871static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
872 u32 index, int *nent, int maxnent)
873{
874 const u32 kvm_supported_word0_x86_features = bit(X86_FEATURE_FPU) |
875 bit(X86_FEATURE_VME) | bit(X86_FEATURE_DE) |
876 bit(X86_FEATURE_PSE) | bit(X86_FEATURE_TSC) |
877 bit(X86_FEATURE_MSR) | bit(X86_FEATURE_PAE) |
878 bit(X86_FEATURE_CX8) | bit(X86_FEATURE_APIC) |
879 bit(X86_FEATURE_SEP) | bit(X86_FEATURE_PGE) |
880 bit(X86_FEATURE_CMOV) | bit(X86_FEATURE_PSE36) |
881 bit(X86_FEATURE_CLFLSH) | bit(X86_FEATURE_MMX) |
882 bit(X86_FEATURE_FXSR) | bit(X86_FEATURE_XMM) |
883 bit(X86_FEATURE_XMM2) | bit(X86_FEATURE_SELFSNOOP);
884 const u32 kvm_supported_word1_x86_features = bit(X86_FEATURE_FPU) |
885 bit(X86_FEATURE_VME) | bit(X86_FEATURE_DE) |
886 bit(X86_FEATURE_PSE) | bit(X86_FEATURE_TSC) |
887 bit(X86_FEATURE_MSR) | bit(X86_FEATURE_PAE) |
888 bit(X86_FEATURE_CX8) | bit(X86_FEATURE_APIC) |
889 bit(X86_FEATURE_PGE) |
890 bit(X86_FEATURE_CMOV) | bit(X86_FEATURE_PSE36) |
891 bit(X86_FEATURE_MMX) | bit(X86_FEATURE_FXSR) |
892 bit(X86_FEATURE_SYSCALL) |
893 (bit(X86_FEATURE_NX) && is_efer_nx()) |
894#ifdef CONFIG_X86_64
895 bit(X86_FEATURE_LM) |
896#endif
897 bit(X86_FEATURE_MMXEXT) |
898 bit(X86_FEATURE_3DNOWEXT) |
899 bit(X86_FEATURE_3DNOW);
900 const u32 kvm_supported_word3_x86_features =
901 bit(X86_FEATURE_XMM3) | bit(X86_FEATURE_CX16);
902 const u32 kvm_supported_word6_x86_features =
903 bit(X86_FEATURE_LAHF_LM) | bit(X86_FEATURE_CMP_LEGACY);
904
905 /* all func 2 cpuid_count() should be called on the same cpu */
906 get_cpu();
907 do_cpuid_1_ent(entry, function, index);
908 ++*nent;
909
910 switch (function) {
911 case 0:
912 entry->eax = min(entry->eax, (u32)0xb);
913 break;
914 case 1:
915 entry->edx &= kvm_supported_word0_x86_features;
916 entry->ecx &= kvm_supported_word3_x86_features;
917 break;
918 /* function 2 entries are STATEFUL. That is, repeated cpuid commands
919 * may return different values. This forces us to get_cpu() before
920 * issuing the first command, and also to emulate this annoying behavior
921 * in kvm_emulate_cpuid() using KVM_CPUID_FLAG_STATE_READ_NEXT */
922 case 2: {
923 int t, times = entry->eax & 0xff;
924
925 entry->flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
926 for (t = 1; t < times && *nent < maxnent; ++t) {
927 do_cpuid_1_ent(&entry[t], function, 0);
928 entry[t].flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
929 ++*nent;
930 }
931 break;
932 }
933 /* function 4 and 0xb have additional index. */
934 case 4: {
935 int index, cache_type;
936
937 entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
938 /* read more entries until cache_type is zero */
939 for (index = 1; *nent < maxnent; ++index) {
940 cache_type = entry[index - 1].eax & 0x1f;
941 if (!cache_type)
942 break;
943 do_cpuid_1_ent(&entry[index], function, index);
944 entry[index].flags |=
945 KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
946 ++*nent;
947 }
948 break;
949 }
950 case 0xb: {
951 int index, level_type;
952
953 entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
954 /* read more entries until level_type is zero */
955 for (index = 1; *nent < maxnent; ++index) {
956 level_type = entry[index - 1].ecx & 0xff;
957 if (!level_type)
958 break;
959 do_cpuid_1_ent(&entry[index], function, index);
960 entry[index].flags |=
961 KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
962 ++*nent;
963 }
964 break;
965 }
966 case 0x80000000:
967 entry->eax = min(entry->eax, 0x8000001a);
968 break;
969 case 0x80000001:
970 entry->edx &= kvm_supported_word1_x86_features;
971 entry->ecx &= kvm_supported_word6_x86_features;
972 break;
973 }
974 put_cpu();
975}
785 976
786 n = ALIGN(memslot->npages, BITS_PER_LONG) / 8; 977static int kvm_vm_ioctl_get_supported_cpuid(struct kvm *kvm,
978 struct kvm_cpuid2 *cpuid,
979 struct kvm_cpuid_entry2 __user *entries)
980{
981 struct kvm_cpuid_entry2 *cpuid_entries;
982 int limit, nent = 0, r = -E2BIG;
983 u32 func;
787 984
788 for (i = 0; !any && i < n/sizeof(long); ++i) 985 if (cpuid->nent < 1)
789 any = memslot->dirty_bitmap[i]; 986 goto out;
987 r = -ENOMEM;
988 cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry2) * cpuid->nent);
989 if (!cpuid_entries)
990 goto out;
790 991
992 do_cpuid_ent(&cpuid_entries[0], 0, 0, &nent, cpuid->nent);
993 limit = cpuid_entries[0].eax;
994 for (func = 1; func <= limit && nent < cpuid->nent; ++func)
995 do_cpuid_ent(&cpuid_entries[nent], func, 0,
996 &nent, cpuid->nent);
997 r = -E2BIG;
998 if (nent >= cpuid->nent)
999 goto out_free;
1000
1001 do_cpuid_ent(&cpuid_entries[nent], 0x80000000, 0, &nent, cpuid->nent);
1002 limit = cpuid_entries[nent - 1].eax;
1003 for (func = 0x80000001; func <= limit && nent < cpuid->nent; ++func)
1004 do_cpuid_ent(&cpuid_entries[nent], func, 0,
1005 &nent, cpuid->nent);
791 r = -EFAULT; 1006 r = -EFAULT;
792 if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n)) 1007 if (copy_to_user(entries, cpuid_entries,
793 goto out; 1008 nent * sizeof(struct kvm_cpuid_entry2)))
1009 goto out_free;
1010 cpuid->nent = nent;
1011 r = 0;
794 1012
795 /* If nothing is dirty, don't bother messing with page tables. */ 1013out_free:
796 if (any) { 1014 vfree(cpuid_entries);
797 kvm_mmu_slot_remove_write_access(kvm, log->slot); 1015out:
798 kvm_flush_remote_tlbs(kvm); 1016 return r;
799 memset(memslot->dirty_bitmap, 0, n); 1017}
1018
1019static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
1020 struct kvm_lapic_state *s)
1021{
1022 vcpu_load(vcpu);
1023 memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s);
1024 vcpu_put(vcpu);
1025
1026 return 0;
1027}
1028
1029static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
1030 struct kvm_lapic_state *s)
1031{
1032 vcpu_load(vcpu);
1033 memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s);
1034 kvm_apic_post_state_restore(vcpu);
1035 vcpu_put(vcpu);
1036
1037 return 0;
1038}
1039
1040static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
1041 struct kvm_interrupt *irq)
1042{
1043 if (irq->irq < 0 || irq->irq >= 256)
1044 return -EINVAL;
1045 if (irqchip_in_kernel(vcpu->kvm))
1046 return -ENXIO;
1047 vcpu_load(vcpu);
1048
1049 set_bit(irq->irq, vcpu->arch.irq_pending);
1050 set_bit(irq->irq / BITS_PER_LONG, &vcpu->arch.irq_summary);
1051
1052 vcpu_put(vcpu);
1053
1054 return 0;
1055}
1056
1057static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu,
1058 struct kvm_tpr_access_ctl *tac)
1059{
1060 if (tac->flags)
1061 return -EINVAL;
1062 vcpu->arch.tpr_access_reporting = !!tac->enabled;
1063 return 0;
1064}
1065
1066long kvm_arch_vcpu_ioctl(struct file *filp,
1067 unsigned int ioctl, unsigned long arg)
1068{
1069 struct kvm_vcpu *vcpu = filp->private_data;
1070 void __user *argp = (void __user *)arg;
1071 int r;
1072
1073 switch (ioctl) {
1074 case KVM_GET_LAPIC: {
1075 struct kvm_lapic_state lapic;
1076
1077 memset(&lapic, 0, sizeof lapic);
1078 r = kvm_vcpu_ioctl_get_lapic(vcpu, &lapic);
1079 if (r)
1080 goto out;
1081 r = -EFAULT;
1082 if (copy_to_user(argp, &lapic, sizeof lapic))
1083 goto out;
1084 r = 0;
1085 break;
800 } 1086 }
1087 case KVM_SET_LAPIC: {
1088 struct kvm_lapic_state lapic;
801 1089
802 r = 0; 1090 r = -EFAULT;
1091 if (copy_from_user(&lapic, argp, sizeof lapic))
1092 goto out;
1093 r = kvm_vcpu_ioctl_set_lapic(vcpu, &lapic);;
1094 if (r)
1095 goto out;
1096 r = 0;
1097 break;
1098 }
1099 case KVM_INTERRUPT: {
1100 struct kvm_interrupt irq;
1101
1102 r = -EFAULT;
1103 if (copy_from_user(&irq, argp, sizeof irq))
1104 goto out;
1105 r = kvm_vcpu_ioctl_interrupt(vcpu, &irq);
1106 if (r)
1107 goto out;
1108 r = 0;
1109 break;
1110 }
1111 case KVM_SET_CPUID: {
1112 struct kvm_cpuid __user *cpuid_arg = argp;
1113 struct kvm_cpuid cpuid;
803 1114
1115 r = -EFAULT;
1116 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
1117 goto out;
1118 r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries);
1119 if (r)
1120 goto out;
1121 break;
1122 }
1123 case KVM_SET_CPUID2: {
1124 struct kvm_cpuid2 __user *cpuid_arg = argp;
1125 struct kvm_cpuid2 cpuid;
1126
1127 r = -EFAULT;
1128 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
1129 goto out;
1130 r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid,
1131 cpuid_arg->entries);
1132 if (r)
1133 goto out;
1134 break;
1135 }
1136 case KVM_GET_CPUID2: {
1137 struct kvm_cpuid2 __user *cpuid_arg = argp;
1138 struct kvm_cpuid2 cpuid;
1139
1140 r = -EFAULT;
1141 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
1142 goto out;
1143 r = kvm_vcpu_ioctl_get_cpuid2(vcpu, &cpuid,
1144 cpuid_arg->entries);
1145 if (r)
1146 goto out;
1147 r = -EFAULT;
1148 if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid))
1149 goto out;
1150 r = 0;
1151 break;
1152 }
1153 case KVM_GET_MSRS:
1154 r = msr_io(vcpu, argp, kvm_get_msr, 1);
1155 break;
1156 case KVM_SET_MSRS:
1157 r = msr_io(vcpu, argp, do_set_msr, 0);
1158 break;
1159 case KVM_TPR_ACCESS_REPORTING: {
1160 struct kvm_tpr_access_ctl tac;
1161
1162 r = -EFAULT;
1163 if (copy_from_user(&tac, argp, sizeof tac))
1164 goto out;
1165 r = vcpu_ioctl_tpr_access_reporting(vcpu, &tac);
1166 if (r)
1167 goto out;
1168 r = -EFAULT;
1169 if (copy_to_user(argp, &tac, sizeof tac))
1170 goto out;
1171 r = 0;
1172 break;
1173 };
1174 case KVM_SET_VAPIC_ADDR: {
1175 struct kvm_vapic_addr va;
1176
1177 r = -EINVAL;
1178 if (!irqchip_in_kernel(vcpu->kvm))
1179 goto out;
1180 r = -EFAULT;
1181 if (copy_from_user(&va, argp, sizeof va))
1182 goto out;
1183 r = 0;
1184 kvm_lapic_set_vapic_addr(vcpu, va.vapic_addr);
1185 break;
1186 }
1187 default:
1188 r = -EINVAL;
1189 }
804out: 1190out:
805 mutex_unlock(&kvm->lock);
806 return r; 1191 return r;
807} 1192}
808 1193
1194static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr)
1195{
1196 int ret;
1197
1198 if (addr > (unsigned int)(-3 * PAGE_SIZE))
1199 return -1;
1200 ret = kvm_x86_ops->set_tss_addr(kvm, addr);
1201 return ret;
1202}
1203
1204static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
1205 u32 kvm_nr_mmu_pages)
1206{
1207 if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES)
1208 return -EINVAL;
1209
1210 down_write(&current->mm->mmap_sem);
1211
1212 kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages);
1213 kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages;
1214
1215 up_write(&current->mm->mmap_sem);
1216 return 0;
1217}
1218
1219static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm)
1220{
1221 return kvm->arch.n_alloc_mmu_pages;
1222}
1223
1224gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
1225{
1226 int i;
1227 struct kvm_mem_alias *alias;
1228
1229 for (i = 0; i < kvm->arch.naliases; ++i) {
1230 alias = &kvm->arch.aliases[i];
1231 if (gfn >= alias->base_gfn
1232 && gfn < alias->base_gfn + alias->npages)
1233 return alias->target_gfn + gfn - alias->base_gfn;
1234 }
1235 return gfn;
1236}
1237
809/* 1238/*
810 * Set a new alias region. Aliases map a portion of physical memory into 1239 * Set a new alias region. Aliases map a portion of physical memory into
811 * another portion. This is useful for memory windows, for example the PC 1240 * another portion. This is useful for memory windows, for example the PC
@@ -832,21 +1261,21 @@ static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm,
832 < alias->target_phys_addr) 1261 < alias->target_phys_addr)
833 goto out; 1262 goto out;
834 1263
835 mutex_lock(&kvm->lock); 1264 down_write(&current->mm->mmap_sem);
836 1265
837 p = &kvm->aliases[alias->slot]; 1266 p = &kvm->arch.aliases[alias->slot];
838 p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT; 1267 p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT;
839 p->npages = alias->memory_size >> PAGE_SHIFT; 1268 p->npages = alias->memory_size >> PAGE_SHIFT;
840 p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT; 1269 p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT;
841 1270
842 for (n = KVM_ALIAS_SLOTS; n > 0; --n) 1271 for (n = KVM_ALIAS_SLOTS; n > 0; --n)
843 if (kvm->aliases[n - 1].npages) 1272 if (kvm->arch.aliases[n - 1].npages)
844 break; 1273 break;
845 kvm->naliases = n; 1274 kvm->arch.naliases = n;
846 1275
847 kvm_mmu_zap_all(kvm); 1276 kvm_mmu_zap_all(kvm);
848 1277
849 mutex_unlock(&kvm->lock); 1278 up_write(&current->mm->mmap_sem);
850 1279
851 return 0; 1280 return 0;
852 1281
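
Several of the vcpu ioctls moved or introduced in the big hunk above (KVM_SET_CPUID2, KVM_GET_CPUID2, KVM_GET_MSRS/KVM_SET_MSRS, KVM_TPR_ACCESS_REPORTING) are driven from userspace. As a hedged sketch of how a caller might exercise the new KVM_SET_CPUID2 path, assuming vcpu_fd is an already-created vcpu file descriptor and using the struct layouts from the uapi headers added in this series:

#include <linux/kvm.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>

/* vcpu_fd is assumed to come from the usual /dev/kvm -> KVM_CREATE_VM ->
 * KVM_CREATE_VCPU sequence, which is omitted here. */
static int set_one_cpuid_leaf(int vcpu_fd)
{
    struct kvm_cpuid2 *cpuid;
    int r;

    cpuid = calloc(1, sizeof(*cpuid) + sizeof(struct kvm_cpuid_entry2));
    if (!cpuid)
        return -1;

    cpuid->nent = 1;
    cpuid->entries[0].function = 0; /* standard leaf 0 */
    cpuid->entries[0].eax = 1;      /* highest standard leaf exposed */

    r = ioctl(vcpu_fd, KVM_SET_CPUID2, cpuid);
    if (r < 0)
        perror("KVM_SET_CPUID2");
    free(cpuid);
    return r;
}

A real user would of course populate many leaves, typically starting from the table produced by the new get_supported_cpuid helper; on the kernel side the call lands in kvm_vcpu_ioctl_set_cpuid2() above, which copies the entries into vcpu->arch.cpuid_entries.
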
@@ -861,17 +1290,17 @@ static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
861 r = 0; 1290 r = 0;
862 switch (chip->chip_id) { 1291 switch (chip->chip_id) {
863 case KVM_IRQCHIP_PIC_MASTER: 1292 case KVM_IRQCHIP_PIC_MASTER:
864 memcpy (&chip->chip.pic, 1293 memcpy(&chip->chip.pic,
865 &pic_irqchip(kvm)->pics[0], 1294 &pic_irqchip(kvm)->pics[0],
866 sizeof(struct kvm_pic_state)); 1295 sizeof(struct kvm_pic_state));
867 break; 1296 break;
868 case KVM_IRQCHIP_PIC_SLAVE: 1297 case KVM_IRQCHIP_PIC_SLAVE:
869 memcpy (&chip->chip.pic, 1298 memcpy(&chip->chip.pic,
870 &pic_irqchip(kvm)->pics[1], 1299 &pic_irqchip(kvm)->pics[1],
871 sizeof(struct kvm_pic_state)); 1300 sizeof(struct kvm_pic_state));
872 break; 1301 break;
873 case KVM_IRQCHIP_IOAPIC: 1302 case KVM_IRQCHIP_IOAPIC:
874 memcpy (&chip->chip.ioapic, 1303 memcpy(&chip->chip.ioapic,
875 ioapic_irqchip(kvm), 1304 ioapic_irqchip(kvm),
876 sizeof(struct kvm_ioapic_state)); 1305 sizeof(struct kvm_ioapic_state));
877 break; 1306 break;
@@ -889,17 +1318,17 @@ static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
889 r = 0; 1318 r = 0;
890 switch (chip->chip_id) { 1319 switch (chip->chip_id) {
891 case KVM_IRQCHIP_PIC_MASTER: 1320 case KVM_IRQCHIP_PIC_MASTER:
892 memcpy (&pic_irqchip(kvm)->pics[0], 1321 memcpy(&pic_irqchip(kvm)->pics[0],
893 &chip->chip.pic, 1322 &chip->chip.pic,
894 sizeof(struct kvm_pic_state)); 1323 sizeof(struct kvm_pic_state));
895 break; 1324 break;
896 case KVM_IRQCHIP_PIC_SLAVE: 1325 case KVM_IRQCHIP_PIC_SLAVE:
897 memcpy (&pic_irqchip(kvm)->pics[1], 1326 memcpy(&pic_irqchip(kvm)->pics[1],
898 &chip->chip.pic, 1327 &chip->chip.pic,
899 sizeof(struct kvm_pic_state)); 1328 sizeof(struct kvm_pic_state));
900 break; 1329 break;
901 case KVM_IRQCHIP_IOAPIC: 1330 case KVM_IRQCHIP_IOAPIC:
902 memcpy (ioapic_irqchip(kvm), 1331 memcpy(ioapic_irqchip(kvm),
903 &chip->chip.ioapic, 1332 &chip->chip.ioapic,
904 sizeof(struct kvm_ioapic_state)); 1333 sizeof(struct kvm_ioapic_state));
905 break; 1334 break;
@@ -911,110 +1340,191 @@ static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
911 return r; 1340 return r;
912} 1341}
913 1342
914static gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn) 1343/*
1344 * Get (and clear) the dirty memory log for a memory slot.
1345 */
1346int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
1347 struct kvm_dirty_log *log)
915{ 1348{
916 int i; 1349 int r;
917 struct kvm_mem_alias *alias; 1350 int n;
918 1351 struct kvm_memory_slot *memslot;
919 for (i = 0; i < kvm->naliases; ++i) { 1352 int is_dirty = 0;
920 alias = &kvm->aliases[i];
921 if (gfn >= alias->base_gfn
922 && gfn < alias->base_gfn + alias->npages)
923 return alias->target_gfn + gfn - alias->base_gfn;
924 }
925 return gfn;
926}
927 1353
928static struct kvm_memory_slot *__gfn_to_memslot(struct kvm *kvm, gfn_t gfn) 1354 down_write(&current->mm->mmap_sem);
929{
930 int i;
931 1355
932 for (i = 0; i < kvm->nmemslots; ++i) { 1356 r = kvm_get_dirty_log(kvm, log, &is_dirty);
933 struct kvm_memory_slot *memslot = &kvm->memslots[i]; 1357 if (r)
1358 goto out;
934 1359
935 if (gfn >= memslot->base_gfn 1360 /* If nothing is dirty, don't bother messing with page tables. */
936 && gfn < memslot->base_gfn + memslot->npages) 1361 if (is_dirty) {
937 return memslot; 1362 kvm_mmu_slot_remove_write_access(kvm, log->slot);
1363 kvm_flush_remote_tlbs(kvm);
1364 memslot = &kvm->memslots[log->slot];
1365 n = ALIGN(memslot->npages, BITS_PER_LONG) / 8;
1366 memset(memslot->dirty_bitmap, 0, n);
938 } 1367 }
939 return NULL; 1368 r = 0;
940} 1369out:
941 1370 up_write(&current->mm->mmap_sem);
942struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn) 1371 return r;
943{
944 gfn = unalias_gfn(kvm, gfn);
945 return __gfn_to_memslot(kvm, gfn);
946}
947
948struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
949{
950 struct kvm_memory_slot *slot;
951
952 gfn = unalias_gfn(kvm, gfn);
953 slot = __gfn_to_memslot(kvm, gfn);
954 if (!slot)
955 return NULL;
956 return slot->phys_mem[gfn - slot->base_gfn];
957} 1372}
958EXPORT_SYMBOL_GPL(gfn_to_page);
959 1373
960/* WARNING: Does not work on aliased pages. */ 1374long kvm_arch_vm_ioctl(struct file *filp,
961void mark_page_dirty(struct kvm *kvm, gfn_t gfn) 1375 unsigned int ioctl, unsigned long arg)
962{ 1376{
963 struct kvm_memory_slot *memslot; 1377 struct kvm *kvm = filp->private_data;
1378 void __user *argp = (void __user *)arg;
1379 int r = -EINVAL;
964 1380
965 memslot = __gfn_to_memslot(kvm, gfn); 1381 switch (ioctl) {
966 if (memslot && memslot->dirty_bitmap) { 1382 case KVM_SET_TSS_ADDR:
967 unsigned long rel_gfn = gfn - memslot->base_gfn; 1383 r = kvm_vm_ioctl_set_tss_addr(kvm, arg);
1384 if (r < 0)
1385 goto out;
1386 break;
1387 case KVM_SET_MEMORY_REGION: {
1388 struct kvm_memory_region kvm_mem;
1389 struct kvm_userspace_memory_region kvm_userspace_mem;
968 1390
969 /* avoid RMW */ 1391 r = -EFAULT;
970 if (!test_bit(rel_gfn, memslot->dirty_bitmap)) 1392 if (copy_from_user(&kvm_mem, argp, sizeof kvm_mem))
971 set_bit(rel_gfn, memslot->dirty_bitmap); 1393 goto out;
1394 kvm_userspace_mem.slot = kvm_mem.slot;
1395 kvm_userspace_mem.flags = kvm_mem.flags;
1396 kvm_userspace_mem.guest_phys_addr = kvm_mem.guest_phys_addr;
1397 kvm_userspace_mem.memory_size = kvm_mem.memory_size;
1398 r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 0);
1399 if (r)
1400 goto out;
1401 break;
972 } 1402 }
973} 1403 case KVM_SET_NR_MMU_PAGES:
1404 r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg);
1405 if (r)
1406 goto out;
1407 break;
1408 case KVM_GET_NR_MMU_PAGES:
1409 r = kvm_vm_ioctl_get_nr_mmu_pages(kvm);
1410 break;
1411 case KVM_SET_MEMORY_ALIAS: {
1412 struct kvm_memory_alias alias;
974 1413
975int emulator_read_std(unsigned long addr, 1414 r = -EFAULT;
976 void *val, 1415 if (copy_from_user(&alias, argp, sizeof alias))
977 unsigned int bytes, 1416 goto out;
978 struct kvm_vcpu *vcpu) 1417 r = kvm_vm_ioctl_set_memory_alias(kvm, &alias);
979{ 1418 if (r)
980 void *data = val; 1419 goto out;
1420 break;
1421 }
1422 case KVM_CREATE_IRQCHIP:
1423 r = -ENOMEM;
1424 kvm->arch.vpic = kvm_create_pic(kvm);
1425 if (kvm->arch.vpic) {
1426 r = kvm_ioapic_init(kvm);
1427 if (r) {
1428 kfree(kvm->arch.vpic);
1429 kvm->arch.vpic = NULL;
1430 goto out;
1431 }
1432 } else
1433 goto out;
1434 break;
1435 case KVM_IRQ_LINE: {
1436 struct kvm_irq_level irq_event;
981 1437
982 while (bytes) { 1438 r = -EFAULT;
983 gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr); 1439 if (copy_from_user(&irq_event, argp, sizeof irq_event))
984 unsigned offset = addr & (PAGE_SIZE-1); 1440 goto out;
985 unsigned tocopy = min(bytes, (unsigned)PAGE_SIZE - offset); 1441 if (irqchip_in_kernel(kvm)) {
986 unsigned long pfn; 1442 mutex_lock(&kvm->lock);
987 struct page *page; 1443 if (irq_event.irq < 16)
988 void *page_virt; 1444 kvm_pic_set_irq(pic_irqchip(kvm),
1445 irq_event.irq,
1446 irq_event.level);
1447 kvm_ioapic_set_irq(kvm->arch.vioapic,
1448 irq_event.irq,
1449 irq_event.level);
1450 mutex_unlock(&kvm->lock);
1451 r = 0;
1452 }
1453 break;
1454 }
1455 case KVM_GET_IRQCHIP: {
1456 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
1457 struct kvm_irqchip chip;
989 1458
990 if (gpa == UNMAPPED_GVA) 1459 r = -EFAULT;
991 return X86EMUL_PROPAGATE_FAULT; 1460 if (copy_from_user(&chip, argp, sizeof chip))
992 pfn = gpa >> PAGE_SHIFT; 1461 goto out;
993 page = gfn_to_page(vcpu->kvm, pfn); 1462 r = -ENXIO;
994 if (!page) 1463 if (!irqchip_in_kernel(kvm))
995 return X86EMUL_UNHANDLEABLE; 1464 goto out;
996 page_virt = kmap_atomic(page, KM_USER0); 1465 r = kvm_vm_ioctl_get_irqchip(kvm, &chip);
1466 if (r)
1467 goto out;
1468 r = -EFAULT;
1469 if (copy_to_user(argp, &chip, sizeof chip))
1470 goto out;
1471 r = 0;
1472 break;
1473 }
1474 case KVM_SET_IRQCHIP: {
1475 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
1476 struct kvm_irqchip chip;
997 1477
998 memcpy(data, page_virt + offset, tocopy); 1478 r = -EFAULT;
1479 if (copy_from_user(&chip, argp, sizeof chip))
1480 goto out;
1481 r = -ENXIO;
1482 if (!irqchip_in_kernel(kvm))
1483 goto out;
1484 r = kvm_vm_ioctl_set_irqchip(kvm, &chip);
1485 if (r)
1486 goto out;
1487 r = 0;
1488 break;
1489 }
1490 case KVM_GET_SUPPORTED_CPUID: {
1491 struct kvm_cpuid2 __user *cpuid_arg = argp;
1492 struct kvm_cpuid2 cpuid;
999 1493
1000 kunmap_atomic(page_virt, KM_USER0); 1494 r = -EFAULT;
1495 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
1496 goto out;
1497 r = kvm_vm_ioctl_get_supported_cpuid(kvm, &cpuid,
1498 cpuid_arg->entries);
1499 if (r)
1500 goto out;
1001 1501
1002 bytes -= tocopy; 1502 r = -EFAULT;
1003 data += tocopy; 1503 if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid))
1004 addr += tocopy; 1504 goto out;
1505 r = 0;
1506 break;
1005 } 1507 }
1006 1508 default:
1007 return X86EMUL_CONTINUE; 1509 ;
1510 }
1511out:
1512 return r;
1008} 1513}
1009EXPORT_SYMBOL_GPL(emulator_read_std);
1010 1514
1011static int emulator_write_std(unsigned long addr, 1515static void kvm_init_msr_list(void)
1012 const void *val,
1013 unsigned int bytes,
1014 struct kvm_vcpu *vcpu)
1015{ 1516{
1016 pr_unimpl(vcpu, "emulator_write_std: addr %lx n %d\n", addr, bytes); 1517 u32 dummy[2];
1017 return X86EMUL_UNHANDLEABLE; 1518 unsigned i, j;
1519
1520 for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) {
1521 if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0)
1522 continue;
1523 if (j < i)
1524 msrs_to_save[j] = msrs_to_save[i];
1525 j++;
1526 }
1527 num_msrs_to_save = j;
1018} 1528}
1019 1529
1020/* 1530/*
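
kvm_init_msr_list(), added at the end of the hunk above, probes each candidate MSR with rdmsr_safe() and compacts msrs_to_save[] in place, so num_msrs_to_save ends up counting only MSRs the host can actually read. The same in-place filtering pattern, reduced to a standalone userspace sketch (compact() and is_even() are invented names):

#include <stdio.h>
#include <stddef.h>

/* Keep only elements that pass a predicate, compacting in place,
 * exactly like kvm_init_msr_list() compacts msrs_to_save[]. */
static size_t compact(unsigned int *v, size_t n, int (*keep)(unsigned int))
{
        size_t i, j;

        for (i = j = 0; i < n; i++) {
                if (!keep(v[i]))
                        continue;
                if (j < i)
                        v[j] = v[i];
                j++;
        }
        return j;               /* new length, like num_msrs_to_save */
}

static int is_even(unsigned int x) { return !(x & 1); }

int main(void)
{
        unsigned int v[] = { 1, 2, 3, 4, 6, 7 };
        size_t n = compact(v, 6, is_even);

        for (size_t i = 0; i < n; i++)
                printf("%u ", v[i]);    /* prints: 2 4 6 */
        printf("\n");
        return 0;
}
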
@@ -1025,14 +1535,15 @@ static struct kvm_io_device *vcpu_find_pervcpu_dev(struct kvm_vcpu *vcpu,
1025{ 1535{
1026 struct kvm_io_device *dev; 1536 struct kvm_io_device *dev;
1027 1537
1028 if (vcpu->apic) { 1538 if (vcpu->arch.apic) {
1029 dev = &vcpu->apic->dev; 1539 dev = &vcpu->arch.apic->dev;
1030 if (dev->in_range(dev, addr)) 1540 if (dev->in_range(dev, addr))
1031 return dev; 1541 return dev;
1032 } 1542 }
1033 return NULL; 1543 return NULL;
1034} 1544}
1035 1545
1546
1036static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu, 1547static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu,
1037 gpa_t addr) 1548 gpa_t addr)
1038{ 1549{
@@ -1044,11 +1555,40 @@ static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu,
1044 return dev; 1555 return dev;
1045} 1556}
1046 1557
1047static struct kvm_io_device *vcpu_find_pio_dev(struct kvm_vcpu *vcpu, 1558int emulator_read_std(unsigned long addr,
1048 gpa_t addr) 1559 void *val,
1560 unsigned int bytes,
1561 struct kvm_vcpu *vcpu)
1049{ 1562{
1050 return kvm_io_bus_find_dev(&vcpu->kvm->pio_bus, addr); 1563 void *data = val;
1564 int r = X86EMUL_CONTINUE;
1565
1566 down_read(&current->mm->mmap_sem);
1567 while (bytes) {
1568 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
1569 unsigned offset = addr & (PAGE_SIZE-1);
1570 unsigned tocopy = min(bytes, (unsigned)PAGE_SIZE - offset);
1571 int ret;
1572
1573 if (gpa == UNMAPPED_GVA) {
1574 r = X86EMUL_PROPAGATE_FAULT;
1575 goto out;
1576 }
1577 ret = kvm_read_guest(vcpu->kvm, gpa, data, tocopy);
1578 if (ret < 0) {
1579 r = X86EMUL_UNHANDLEABLE;
1580 goto out;
1581 }
1582
1583 bytes -= tocopy;
1584 data += tocopy;
1585 addr += tocopy;
1586 }
1587out:
1588 up_read(&current->mm->mmap_sem);
1589 return r;
1051} 1590}
1591EXPORT_SYMBOL_GPL(emulator_read_std);
1052 1592
1053static int emulator_read_emulated(unsigned long addr, 1593static int emulator_read_emulated(unsigned long addr,
1054 void *val, 1594 void *val,
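
The rewritten emulator_read_std() above walks the guest-virtual range one page at a time: translate, copy up to the next page boundary, advance. The sketch below keeps only that splitting logic, with a flat buffer and an identity mapping standing in for gva_to_gpa() plus kvm_read_guest() (read_std_copy() is an invented name):

#include <string.h>

#define PAGE_SIZE 4096u

/* Copy 'bytes' starting at linear address 'addr', never crossing a
 * page boundary in a single step; this is the split emulator_read_std()
 * performs before each kvm_read_guest() call. */
static void read_std_copy(unsigned char *dst, const unsigned char *guest,
                          unsigned long addr, unsigned int bytes)
{
        while (bytes) {
                unsigned int offset = addr & (PAGE_SIZE - 1);
                unsigned int tocopy = PAGE_SIZE - offset < bytes ?
                                      PAGE_SIZE - offset : bytes;

                memcpy(dst, guest + addr, tocopy);
                bytes -= tocopy;
                dst += tocopy;
                addr += tocopy;
        }
}
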
@@ -1062,22 +1602,34 @@ static int emulator_read_emulated(unsigned long addr,
1062 memcpy(val, vcpu->mmio_data, bytes); 1602 memcpy(val, vcpu->mmio_data, bytes);
1063 vcpu->mmio_read_completed = 0; 1603 vcpu->mmio_read_completed = 0;
1064 return X86EMUL_CONTINUE; 1604 return X86EMUL_CONTINUE;
1065 } else if (emulator_read_std(addr, val, bytes, vcpu) 1605 }
1066 == X86EMUL_CONTINUE) 1606
1067 return X86EMUL_CONTINUE; 1607 down_read(&current->mm->mmap_sem);
1608 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
1609 up_read(&current->mm->mmap_sem);
1610
1611 /* For APIC access vmexit */
1612 if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
1613 goto mmio;
1068 1614
1069 gpa = vcpu->mmu.gva_to_gpa(vcpu, addr); 1615 if (emulator_read_std(addr, val, bytes, vcpu)
1616 == X86EMUL_CONTINUE)
1617 return X86EMUL_CONTINUE;
1070 if (gpa == UNMAPPED_GVA) 1618 if (gpa == UNMAPPED_GVA)
1071 return X86EMUL_PROPAGATE_FAULT; 1619 return X86EMUL_PROPAGATE_FAULT;
1072 1620
1621mmio:
1073 /* 1622 /*
1074 * Is this MMIO handled locally? 1623 * Is this MMIO handled locally?
1075 */ 1624 */
1625 mutex_lock(&vcpu->kvm->lock);
1076 mmio_dev = vcpu_find_mmio_dev(vcpu, gpa); 1626 mmio_dev = vcpu_find_mmio_dev(vcpu, gpa);
1077 if (mmio_dev) { 1627 if (mmio_dev) {
1078 kvm_iodevice_read(mmio_dev, gpa, bytes, val); 1628 kvm_iodevice_read(mmio_dev, gpa, bytes, val);
1629 mutex_unlock(&vcpu->kvm->lock);
1079 return X86EMUL_CONTINUE; 1630 return X86EMUL_CONTINUE;
1080 } 1631 }
1632 mutex_unlock(&vcpu->kvm->lock);
1081 1633
1082 vcpu->mmio_needed = 1; 1634 vcpu->mmio_needed = 1;
1083 vcpu->mmio_phys_addr = gpa; 1635 vcpu->mmio_phys_addr = gpa;
@@ -1090,19 +1642,16 @@ static int emulator_read_emulated(unsigned long addr,
1090static int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, 1642static int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
1091 const void *val, int bytes) 1643 const void *val, int bytes)
1092{ 1644{
1093 struct page *page; 1645 int ret;
1094 void *virt;
1095 1646
1096 if (((gpa + bytes - 1) >> PAGE_SHIFT) != (gpa >> PAGE_SHIFT)) 1647 down_read(&current->mm->mmap_sem);
1648 ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes);
1649 if (ret < 0) {
1650 up_read(&current->mm->mmap_sem);
1097 return 0; 1651 return 0;
1098 page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); 1652 }
1099 if (!page)
1100 return 0;
1101 mark_page_dirty(vcpu->kvm, gpa >> PAGE_SHIFT);
1102 virt = kmap_atomic(page, KM_USER0);
1103 kvm_mmu_pte_write(vcpu, gpa, val, bytes); 1653 kvm_mmu_pte_write(vcpu, gpa, val, bytes);
1104 memcpy(virt + offset_in_page(gpa), val, bytes); 1654 up_read(&current->mm->mmap_sem);
1105 kunmap_atomic(virt, KM_USER0);
1106 return 1; 1655 return 1;
1107} 1656}
1108 1657
@@ -1112,24 +1661,36 @@ static int emulator_write_emulated_onepage(unsigned long addr,
1112 struct kvm_vcpu *vcpu) 1661 struct kvm_vcpu *vcpu)
1113{ 1662{
1114 struct kvm_io_device *mmio_dev; 1663 struct kvm_io_device *mmio_dev;
1115 gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr); 1664 gpa_t gpa;
1665
1666 down_read(&current->mm->mmap_sem);
1667 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
1668 up_read(&current->mm->mmap_sem);
1116 1669
1117 if (gpa == UNMAPPED_GVA) { 1670 if (gpa == UNMAPPED_GVA) {
1118 kvm_x86_ops->inject_page_fault(vcpu, addr, 2); 1671 kvm_inject_page_fault(vcpu, addr, 2);
1119 return X86EMUL_PROPAGATE_FAULT; 1672 return X86EMUL_PROPAGATE_FAULT;
1120 } 1673 }
1121 1674
1675 /* For APIC access vmexit */
1676 if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
1677 goto mmio;
1678
1122 if (emulator_write_phys(vcpu, gpa, val, bytes)) 1679 if (emulator_write_phys(vcpu, gpa, val, bytes))
1123 return X86EMUL_CONTINUE; 1680 return X86EMUL_CONTINUE;
1124 1681
1682mmio:
1125 /* 1683 /*
1126 * Is this MMIO handled locally? 1684 * Is this MMIO handled locally?
1127 */ 1685 */
1686 mutex_lock(&vcpu->kvm->lock);
1128 mmio_dev = vcpu_find_mmio_dev(vcpu, gpa); 1687 mmio_dev = vcpu_find_mmio_dev(vcpu, gpa);
1129 if (mmio_dev) { 1688 if (mmio_dev) {
1130 kvm_iodevice_write(mmio_dev, gpa, bytes, val); 1689 kvm_iodevice_write(mmio_dev, gpa, bytes, val);
1690 mutex_unlock(&vcpu->kvm->lock);
1131 return X86EMUL_CONTINUE; 1691 return X86EMUL_CONTINUE;
1132 } 1692 }
1693 mutex_unlock(&vcpu->kvm->lock);
1133 1694
1134 vcpu->mmio_needed = 1; 1695 vcpu->mmio_needed = 1;
1135 vcpu->mmio_phys_addr = gpa; 1696 vcpu->mmio_phys_addr = gpa;
@@ -1173,6 +1734,35 @@ static int emulator_cmpxchg_emulated(unsigned long addr,
1173 reported = 1; 1734 reported = 1;
1174 printk(KERN_WARNING "kvm: emulating exchange as write\n"); 1735 printk(KERN_WARNING "kvm: emulating exchange as write\n");
1175 } 1736 }
1737#ifndef CONFIG_X86_64
1738 /* guests cmpxchg8b have to be emulated atomically */
1739 if (bytes == 8) {
1740 gpa_t gpa;
1741 struct page *page;
1742 char *addr;
1743 u64 val;
1744
1745 down_read(&current->mm->mmap_sem);
1746 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
1747
1748 if (gpa == UNMAPPED_GVA ||
1749 (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
1750 goto emul_write;
1751
1752 if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK))
1753 goto emul_write;
1754
1755 val = *(u64 *)new;
1756 page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
1757 addr = kmap_atomic(page, KM_USER0);
1758 set_64bit((u64 *)(addr + offset_in_page(gpa)), val);
1759 kunmap_atomic(addr, KM_USER0);
1760 kvm_release_page_dirty(page);
1761 emul_write:
1762 up_read(&current->mm->mmap_sem);
1763 }
1764#endif
1765
1176 return emulator_write_emulated(addr, new, bytes, vcpu); 1766 return emulator_write_emulated(addr, new, bytes, vcpu);
1177} 1767}
1178 1768
@@ -1188,11 +1778,11 @@ int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)
1188 1778
1189int emulate_clts(struct kvm_vcpu *vcpu) 1779int emulate_clts(struct kvm_vcpu *vcpu)
1190{ 1780{
1191 kvm_x86_ops->set_cr0(vcpu, vcpu->cr0 & ~X86_CR0_TS); 1781 kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 & ~X86_CR0_TS);
1192 return X86EMUL_CONTINUE; 1782 return X86EMUL_CONTINUE;
1193} 1783}
1194 1784
1195int emulator_get_dr(struct x86_emulate_ctxt* ctxt, int dr, unsigned long *dest) 1785int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest)
1196{ 1786{
1197 struct kvm_vcpu *vcpu = ctxt->vcpu; 1787 struct kvm_vcpu *vcpu = ctxt->vcpu;
1198 1788
@@ -1223,7 +1813,7 @@ void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
1223{ 1813{
1224 static int reported; 1814 static int reported;
1225 u8 opcodes[4]; 1815 u8 opcodes[4];
1226 unsigned long rip = vcpu->rip; 1816 unsigned long rip = vcpu->arch.rip;
1227 unsigned long rip_linear; 1817 unsigned long rip_linear;
1228 1818
1229 rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS); 1819 rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS);
@@ -1241,7 +1831,6 @@ EXPORT_SYMBOL_GPL(kvm_report_emulation_failure);
1241 1831
1242struct x86_emulate_ops emulate_ops = { 1832struct x86_emulate_ops emulate_ops = {
1243 .read_std = emulator_read_std, 1833 .read_std = emulator_read_std,
1244 .write_std = emulator_write_std,
1245 .read_emulated = emulator_read_emulated, 1834 .read_emulated = emulator_read_emulated,
1246 .write_emulated = emulator_write_emulated, 1835 .write_emulated = emulator_write_emulated,
1247 .cmpxchg_emulated = emulator_cmpxchg_emulated, 1836 .cmpxchg_emulated = emulator_cmpxchg_emulated,
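
With write_std gone, emulate_ops is down to the callbacks the decoder and executor actually dispatch through. The struct-of-function-pointers pattern itself, shrunk to a toy example (all names here are invented):

#include <stdio.h>

/* A toy operations table in the style of struct x86_emulate_ops:
 * the core only ever calls through the pointers, never the backends. */
struct byte_ops {
        int (*read)(unsigned long addr, unsigned char *val);
        int (*write)(unsigned long addr, unsigned char val);
};

static unsigned char fake_ram[16];

static int ram_read(unsigned long addr, unsigned char *val)
{
        *val = fake_ram[addr & 15];
        return 0;
}

static int ram_write(unsigned long addr, unsigned char val)
{
        fake_ram[addr & 15] = val;
        return 0;
}

static const struct byte_ops ops = {
        .read  = ram_read,
        .write = ram_write,
};

int main(void)
{
        unsigned char v;

        ops.write(3, 0xab);
        ops.read(3, &v);
        printf("%#x\n", (unsigned)v);   /* 0xab */
        return 0;
}
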
@@ -1250,44 +1839,74 @@ struct x86_emulate_ops emulate_ops = {
1250int emulate_instruction(struct kvm_vcpu *vcpu, 1839int emulate_instruction(struct kvm_vcpu *vcpu,
1251 struct kvm_run *run, 1840 struct kvm_run *run,
1252 unsigned long cr2, 1841 unsigned long cr2,
1253 u16 error_code) 1842 u16 error_code,
1843 int emulation_type)
1254{ 1844{
1255 struct x86_emulate_ctxt emulate_ctxt;
1256 int r; 1845 int r;
1257 int cs_db, cs_l; 1846 struct decode_cache *c;
1258 1847
1259 vcpu->mmio_fault_cr2 = cr2; 1848 vcpu->arch.mmio_fault_cr2 = cr2;
1260 kvm_x86_ops->cache_regs(vcpu); 1849 kvm_x86_ops->cache_regs(vcpu);
1261 1850
1262 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); 1851 vcpu->mmio_is_write = 0;
1263 1852 vcpu->arch.pio.string = 0;
1264 emulate_ctxt.vcpu = vcpu; 1853
1265 emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu); 1854 if (!(emulation_type & EMULTYPE_NO_DECODE)) {
1266 emulate_ctxt.cr2 = cr2; 1855 int cs_db, cs_l;
1267 emulate_ctxt.mode = (emulate_ctxt.eflags & X86_EFLAGS_VM) 1856 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
1268 ? X86EMUL_MODE_REAL : cs_l 1857
1269 ? X86EMUL_MODE_PROT64 : cs_db 1858 vcpu->arch.emulate_ctxt.vcpu = vcpu;
1270 ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; 1859 vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu);
1271 1860 vcpu->arch.emulate_ctxt.mode =
1272 if (emulate_ctxt.mode == X86EMUL_MODE_PROT64) { 1861 (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM)
1273 emulate_ctxt.cs_base = 0; 1862 ? X86EMUL_MODE_REAL : cs_l
1274 emulate_ctxt.ds_base = 0; 1863 ? X86EMUL_MODE_PROT64 : cs_db
1275 emulate_ctxt.es_base = 0; 1864 ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
1276 emulate_ctxt.ss_base = 0; 1865
1277 } else { 1866 if (vcpu->arch.emulate_ctxt.mode == X86EMUL_MODE_PROT64) {
1278 emulate_ctxt.cs_base = get_segment_base(vcpu, VCPU_SREG_CS); 1867 vcpu->arch.emulate_ctxt.cs_base = 0;
1279 emulate_ctxt.ds_base = get_segment_base(vcpu, VCPU_SREG_DS); 1868 vcpu->arch.emulate_ctxt.ds_base = 0;
1280 emulate_ctxt.es_base = get_segment_base(vcpu, VCPU_SREG_ES); 1869 vcpu->arch.emulate_ctxt.es_base = 0;
1281 emulate_ctxt.ss_base = get_segment_base(vcpu, VCPU_SREG_SS); 1870 vcpu->arch.emulate_ctxt.ss_base = 0;
1871 } else {
1872 vcpu->arch.emulate_ctxt.cs_base =
1873 get_segment_base(vcpu, VCPU_SREG_CS);
1874 vcpu->arch.emulate_ctxt.ds_base =
1875 get_segment_base(vcpu, VCPU_SREG_DS);
1876 vcpu->arch.emulate_ctxt.es_base =
1877 get_segment_base(vcpu, VCPU_SREG_ES);
1878 vcpu->arch.emulate_ctxt.ss_base =
1879 get_segment_base(vcpu, VCPU_SREG_SS);
1880 }
1881
1882 vcpu->arch.emulate_ctxt.gs_base =
1883 get_segment_base(vcpu, VCPU_SREG_GS);
1884 vcpu->arch.emulate_ctxt.fs_base =
1885 get_segment_base(vcpu, VCPU_SREG_FS);
1886
1887 r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
1888
 1889		/* Reject instructions other than VMCALL/VMMCALL when trying
 1890		 * to emulate an invalid opcode */
1891 c = &vcpu->arch.emulate_ctxt.decode;
1892 if ((emulation_type & EMULTYPE_TRAP_UD) &&
1893 (!(c->twobyte && c->b == 0x01 &&
1894 (c->modrm_reg == 0 || c->modrm_reg == 3) &&
1895 c->modrm_mod == 3 && c->modrm_rm == 1)))
1896 return EMULATE_FAIL;
1897
1898 ++vcpu->stat.insn_emulation;
1899 if (r) {
1900 ++vcpu->stat.insn_emulation_fail;
1901 if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
1902 return EMULATE_DONE;
1903 return EMULATE_FAIL;
1904 }
1282 } 1905 }
1283 1906
1284 emulate_ctxt.gs_base = get_segment_base(vcpu, VCPU_SREG_GS); 1907 r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
1285 emulate_ctxt.fs_base = get_segment_base(vcpu, VCPU_SREG_FS);
1286 1908
1287 vcpu->mmio_is_write = 0; 1909 if (vcpu->arch.pio.string)
1288 vcpu->pio.string = 0;
1289 r = x86_emulate_memop(&emulate_ctxt, &emulate_ops);
1290 if (vcpu->pio.string)
1291 return EMULATE_DO_MMIO; 1910 return EMULATE_DO_MMIO;
1292 1911
1293 if ((r || vcpu->mmio_is_write) && run) { 1912 if ((r || vcpu->mmio_is_write) && run) {
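
The decode setup above picks the emulation mode from three bits of guest state: EFLAGS.VM forces real/vm86 handling, otherwise CS.L selects 64-bit and CS.D selects 32-bit over 16-bit. The nested ?: chain reads more easily unrolled; a sketch with invented enum and function names:

enum emul_mode { MODE_REAL, MODE_PROT16, MODE_PROT32, MODE_PROT64 };

/* Same decision as the ?: chain in emulate_instruction(). */
static enum emul_mode pick_mode(int eflags_vm, int cs_l, int cs_db)
{
        if (eflags_vm)
                return MODE_REAL;
        if (cs_l)
                return MODE_PROT64;
        return cs_db ? MODE_PROT32 : MODE_PROT16;
}
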
@@ -1309,7 +1928,7 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
1309 } 1928 }
1310 1929
1311 kvm_x86_ops->decache_regs(vcpu); 1930 kvm_x86_ops->decache_regs(vcpu);
1312 kvm_x86_ops->set_rflags(vcpu, emulate_ctxt.eflags); 1931 kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
1313 1932
1314 if (vcpu->mmio_is_write) { 1933 if (vcpu->mmio_is_write) {
1315 vcpu->mmio_needed = 0; 1934 vcpu->mmio_needed = 0;
@@ -1320,439 +1939,45 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
1320} 1939}
1321EXPORT_SYMBOL_GPL(emulate_instruction); 1940EXPORT_SYMBOL_GPL(emulate_instruction);
1322 1941
1323/* 1942static void free_pio_guest_pages(struct kvm_vcpu *vcpu)
1324 * The vCPU has executed a HLT instruction with in-kernel mode enabled.
1325 */
1326static void kvm_vcpu_block(struct kvm_vcpu *vcpu)
1327{
1328 DECLARE_WAITQUEUE(wait, current);
1329
1330 add_wait_queue(&vcpu->wq, &wait);
1331
1332 /*
1333 * We will block until either an interrupt or a signal wakes us up
1334 */
1335 while (!kvm_cpu_has_interrupt(vcpu)
1336 && !signal_pending(current)
1337 && vcpu->mp_state != VCPU_MP_STATE_RUNNABLE
1338 && vcpu->mp_state != VCPU_MP_STATE_SIPI_RECEIVED) {
1339 set_current_state(TASK_INTERRUPTIBLE);
1340 vcpu_put(vcpu);
1341 schedule();
1342 vcpu_load(vcpu);
1343 }
1344
1345 __set_current_state(TASK_RUNNING);
1346 remove_wait_queue(&vcpu->wq, &wait);
1347}
1348
1349int kvm_emulate_halt(struct kvm_vcpu *vcpu)
1350{
1351 ++vcpu->stat.halt_exits;
1352 if (irqchip_in_kernel(vcpu->kvm)) {
1353 vcpu->mp_state = VCPU_MP_STATE_HALTED;
1354 kvm_vcpu_block(vcpu);
1355 if (vcpu->mp_state != VCPU_MP_STATE_RUNNABLE)
1356 return -EINTR;
1357 return 1;
1358 } else {
1359 vcpu->run->exit_reason = KVM_EXIT_HLT;
1360 return 0;
1361 }
1362}
1363EXPORT_SYMBOL_GPL(kvm_emulate_halt);
1364
1365int kvm_hypercall(struct kvm_vcpu *vcpu, struct kvm_run *run)
1366{
1367 unsigned long nr, a0, a1, a2, a3, a4, a5, ret;
1368
1369 kvm_x86_ops->cache_regs(vcpu);
1370 ret = -KVM_EINVAL;
1371#ifdef CONFIG_X86_64
1372 if (is_long_mode(vcpu)) {
1373 nr = vcpu->regs[VCPU_REGS_RAX];
1374 a0 = vcpu->regs[VCPU_REGS_RDI];
1375 a1 = vcpu->regs[VCPU_REGS_RSI];
1376 a2 = vcpu->regs[VCPU_REGS_RDX];
1377 a3 = vcpu->regs[VCPU_REGS_RCX];
1378 a4 = vcpu->regs[VCPU_REGS_R8];
1379 a5 = vcpu->regs[VCPU_REGS_R9];
1380 } else
1381#endif
1382 {
1383 nr = vcpu->regs[VCPU_REGS_RBX] & -1u;
1384 a0 = vcpu->regs[VCPU_REGS_RAX] & -1u;
1385 a1 = vcpu->regs[VCPU_REGS_RCX] & -1u;
1386 a2 = vcpu->regs[VCPU_REGS_RDX] & -1u;
1387 a3 = vcpu->regs[VCPU_REGS_RSI] & -1u;
1388 a4 = vcpu->regs[VCPU_REGS_RDI] & -1u;
1389 a5 = vcpu->regs[VCPU_REGS_RBP] & -1u;
1390 }
1391 switch (nr) {
1392 default:
1393 run->hypercall.nr = nr;
1394 run->hypercall.args[0] = a0;
1395 run->hypercall.args[1] = a1;
1396 run->hypercall.args[2] = a2;
1397 run->hypercall.args[3] = a3;
1398 run->hypercall.args[4] = a4;
1399 run->hypercall.args[5] = a5;
1400 run->hypercall.ret = ret;
1401 run->hypercall.longmode = is_long_mode(vcpu);
1402 kvm_x86_ops->decache_regs(vcpu);
1403 return 0;
1404 }
1405 vcpu->regs[VCPU_REGS_RAX] = ret;
1406 kvm_x86_ops->decache_regs(vcpu);
1407 return 1;
1408}
1409EXPORT_SYMBOL_GPL(kvm_hypercall);
1410
1411static u64 mk_cr_64(u64 curr_cr, u32 new_val)
1412{
1413 return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
1414}
1415
1416void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
1417{
1418 struct descriptor_table dt = { limit, base };
1419
1420 kvm_x86_ops->set_gdt(vcpu, &dt);
1421}
1422
1423void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
1424{
1425 struct descriptor_table dt = { limit, base };
1426
1427 kvm_x86_ops->set_idt(vcpu, &dt);
1428}
1429
1430void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
1431 unsigned long *rflags)
1432{
1433 lmsw(vcpu, msw);
1434 *rflags = kvm_x86_ops->get_rflags(vcpu);
1435}
1436
1437unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
1438{
1439 kvm_x86_ops->decache_cr4_guest_bits(vcpu);
1440 switch (cr) {
1441 case 0:
1442 return vcpu->cr0;
1443 case 2:
1444 return vcpu->cr2;
1445 case 3:
1446 return vcpu->cr3;
1447 case 4:
1448 return vcpu->cr4;
1449 default:
1450 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);
1451 return 0;
1452 }
1453}
1454
1455void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val,
1456 unsigned long *rflags)
1457{
1458 switch (cr) {
1459 case 0:
1460 set_cr0(vcpu, mk_cr_64(vcpu->cr0, val));
1461 *rflags = kvm_x86_ops->get_rflags(vcpu);
1462 break;
1463 case 2:
1464 vcpu->cr2 = val;
1465 break;
1466 case 3:
1467 set_cr3(vcpu, val);
1468 break;
1469 case 4:
1470 set_cr4(vcpu, mk_cr_64(vcpu->cr4, val));
1471 break;
1472 default:
1473 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);
1474 }
1475}
1476
1477/*
1478 * Register the para guest with the host:
1479 */
1480static int vcpu_register_para(struct kvm_vcpu *vcpu, gpa_t para_state_gpa)
1481{
1482 struct kvm_vcpu_para_state *para_state;
1483 hpa_t para_state_hpa, hypercall_hpa;
1484 struct page *para_state_page;
1485 unsigned char *hypercall;
1486 gpa_t hypercall_gpa;
1487
1488 printk(KERN_DEBUG "kvm: guest trying to enter paravirtual mode\n");
1489 printk(KERN_DEBUG ".... para_state_gpa: %08Lx\n", para_state_gpa);
1490
1491 /*
1492 * Needs to be page aligned:
1493 */
1494 if (para_state_gpa != PAGE_ALIGN(para_state_gpa))
1495 goto err_gp;
1496
1497 para_state_hpa = gpa_to_hpa(vcpu, para_state_gpa);
1498 printk(KERN_DEBUG ".... para_state_hpa: %08Lx\n", para_state_hpa);
1499 if (is_error_hpa(para_state_hpa))
1500 goto err_gp;
1501
1502 mark_page_dirty(vcpu->kvm, para_state_gpa >> PAGE_SHIFT);
1503 para_state_page = pfn_to_page(para_state_hpa >> PAGE_SHIFT);
1504 para_state = kmap(para_state_page);
1505
1506 printk(KERN_DEBUG ".... guest version: %d\n", para_state->guest_version);
1507 printk(KERN_DEBUG ".... size: %d\n", para_state->size);
1508
1509 para_state->host_version = KVM_PARA_API_VERSION;
1510 /*
1511 * We cannot support guests that try to register themselves
1512 * with a newer API version than the host supports:
1513 */
1514 if (para_state->guest_version > KVM_PARA_API_VERSION) {
1515 para_state->ret = -KVM_EINVAL;
1516 goto err_kunmap_skip;
1517 }
1518
1519 hypercall_gpa = para_state->hypercall_gpa;
1520 hypercall_hpa = gpa_to_hpa(vcpu, hypercall_gpa);
1521 printk(KERN_DEBUG ".... hypercall_hpa: %08Lx\n", hypercall_hpa);
1522 if (is_error_hpa(hypercall_hpa)) {
1523 para_state->ret = -KVM_EINVAL;
1524 goto err_kunmap_skip;
1525 }
1526
1527 printk(KERN_DEBUG "kvm: para guest successfully registered.\n");
1528 vcpu->para_state_page = para_state_page;
1529 vcpu->para_state_gpa = para_state_gpa;
1530 vcpu->hypercall_gpa = hypercall_gpa;
1531
1532 mark_page_dirty(vcpu->kvm, hypercall_gpa >> PAGE_SHIFT);
1533 hypercall = kmap_atomic(pfn_to_page(hypercall_hpa >> PAGE_SHIFT),
1534 KM_USER1) + (hypercall_hpa & ~PAGE_MASK);
1535 kvm_x86_ops->patch_hypercall(vcpu, hypercall);
1536 kunmap_atomic(hypercall, KM_USER1);
1537
1538 para_state->ret = 0;
1539err_kunmap_skip:
1540 kunmap(para_state_page);
1541 return 0;
1542err_gp:
1543 return 1;
1544}
1545
1546int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1547{
1548 u64 data;
1549
1550 switch (msr) {
1551 case 0xc0010010: /* SYSCFG */
1552 case 0xc0010015: /* HWCR */
1553 case MSR_IA32_PLATFORM_ID:
1554 case MSR_IA32_P5_MC_ADDR:
1555 case MSR_IA32_P5_MC_TYPE:
1556 case MSR_IA32_MC0_CTL:
1557 case MSR_IA32_MCG_STATUS:
1558 case MSR_IA32_MCG_CAP:
1559 case MSR_IA32_MC0_MISC:
1560 case MSR_IA32_MC0_MISC+4:
1561 case MSR_IA32_MC0_MISC+8:
1562 case MSR_IA32_MC0_MISC+12:
1563 case MSR_IA32_MC0_MISC+16:
1564 case MSR_IA32_UCODE_REV:
1565 case MSR_IA32_PERF_STATUS:
1566 case MSR_IA32_EBL_CR_POWERON:
1567 /* MTRR registers */
1568 case 0xfe:
1569 case 0x200 ... 0x2ff:
1570 data = 0;
1571 break;
1572 case 0xcd: /* fsb frequency */
1573 data = 3;
1574 break;
1575 case MSR_IA32_APICBASE:
1576 data = kvm_get_apic_base(vcpu);
1577 break;
1578 case MSR_IA32_MISC_ENABLE:
1579 data = vcpu->ia32_misc_enable_msr;
1580 break;
1581#ifdef CONFIG_X86_64
1582 case MSR_EFER:
1583 data = vcpu->shadow_efer;
1584 break;
1585#endif
1586 default:
1587 pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
1588 return 1;
1589 }
1590 *pdata = data;
1591 return 0;
1592}
1593EXPORT_SYMBOL_GPL(kvm_get_msr_common);
1594
1595/*
1596 * Reads an msr value (of 'msr_index') into 'pdata'.
1597 * Returns 0 on success, non-0 otherwise.
1598 * Assumes vcpu_load() was already called.
1599 */
1600int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
1601{
1602 return kvm_x86_ops->get_msr(vcpu, msr_index, pdata);
1603}
1604
1605#ifdef CONFIG_X86_64
1606
1607static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
1608{
1609 if (efer & EFER_RESERVED_BITS) {
1610 printk(KERN_DEBUG "set_efer: 0x%llx #GP, reserved bits\n",
1611 efer);
1612 inject_gp(vcpu);
1613 return;
1614 }
1615
1616 if (is_paging(vcpu)
1617 && (vcpu->shadow_efer & EFER_LME) != (efer & EFER_LME)) {
1618 printk(KERN_DEBUG "set_efer: #GP, change LME while paging\n");
1619 inject_gp(vcpu);
1620 return;
1621 }
1622
1623 kvm_x86_ops->set_efer(vcpu, efer);
1624
1625 efer &= ~EFER_LMA;
1626 efer |= vcpu->shadow_efer & EFER_LMA;
1627
1628 vcpu->shadow_efer = efer;
1629}
1630
1631#endif
1632
1633int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1634{
1635 switch (msr) {
1636#ifdef CONFIG_X86_64
1637 case MSR_EFER:
1638 set_efer(vcpu, data);
1639 break;
1640#endif
1641 case MSR_IA32_MC0_STATUS:
1642 pr_unimpl(vcpu, "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n",
1643 __FUNCTION__, data);
1644 break;
1645 case MSR_IA32_MCG_STATUS:
1646 pr_unimpl(vcpu, "%s: MSR_IA32_MCG_STATUS 0x%llx, nop\n",
1647 __FUNCTION__, data);
1648 break;
1649 case MSR_IA32_UCODE_REV:
1650 case MSR_IA32_UCODE_WRITE:
1651 case 0x200 ... 0x2ff: /* MTRRs */
1652 break;
1653 case MSR_IA32_APICBASE:
1654 kvm_set_apic_base(vcpu, data);
1655 break;
1656 case MSR_IA32_MISC_ENABLE:
1657 vcpu->ia32_misc_enable_msr = data;
1658 break;
1659 /*
1660 * This is the 'probe whether the host is KVM' logic:
1661 */
1662 case MSR_KVM_API_MAGIC:
1663 return vcpu_register_para(vcpu, data);
1664
1665 default:
1666 pr_unimpl(vcpu, "unhandled wrmsr: 0x%x\n", msr);
1667 return 1;
1668 }
1669 return 0;
1670}
1671EXPORT_SYMBOL_GPL(kvm_set_msr_common);
1672
1673/*
1674 * Writes msr value into into the appropriate "register".
1675 * Returns 0 on success, non-0 otherwise.
1676 * Assumes vcpu_load() was already called.
1677 */
1678int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
1679{
1680 return kvm_x86_ops->set_msr(vcpu, msr_index, data);
1681}
1682
1683void kvm_resched(struct kvm_vcpu *vcpu)
1684{
1685 if (!need_resched())
1686 return;
1687 cond_resched();
1688}
1689EXPORT_SYMBOL_GPL(kvm_resched);
1690
1691void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
1692{ 1943{
1693 int i; 1944 int i;
1694 u32 function;
1695 struct kvm_cpuid_entry *e, *best;
1696 1945
1697 kvm_x86_ops->cache_regs(vcpu); 1946 for (i = 0; i < ARRAY_SIZE(vcpu->arch.pio.guest_pages); ++i)
1698 function = vcpu->regs[VCPU_REGS_RAX]; 1947 if (vcpu->arch.pio.guest_pages[i]) {
1699 vcpu->regs[VCPU_REGS_RAX] = 0; 1948 kvm_release_page_dirty(vcpu->arch.pio.guest_pages[i]);
1700 vcpu->regs[VCPU_REGS_RBX] = 0; 1949 vcpu->arch.pio.guest_pages[i] = NULL;
1701 vcpu->regs[VCPU_REGS_RCX] = 0;
1702 vcpu->regs[VCPU_REGS_RDX] = 0;
1703 best = NULL;
1704 for (i = 0; i < vcpu->cpuid_nent; ++i) {
1705 e = &vcpu->cpuid_entries[i];
1706 if (e->function == function) {
1707 best = e;
1708 break;
1709 } 1950 }
1710 /*
1711 * Both basic or both extended?
1712 */
1713 if (((e->function ^ function) & 0x80000000) == 0)
1714 if (!best || e->function > best->function)
1715 best = e;
1716 }
1717 if (best) {
1718 vcpu->regs[VCPU_REGS_RAX] = best->eax;
1719 vcpu->regs[VCPU_REGS_RBX] = best->ebx;
1720 vcpu->regs[VCPU_REGS_RCX] = best->ecx;
1721 vcpu->regs[VCPU_REGS_RDX] = best->edx;
1722 }
1723 kvm_x86_ops->decache_regs(vcpu);
1724 kvm_x86_ops->skip_emulated_instruction(vcpu);
1725} 1951}
1726EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
1727 1952
1728static int pio_copy_data(struct kvm_vcpu *vcpu) 1953static int pio_copy_data(struct kvm_vcpu *vcpu)
1729{ 1954{
1730 void *p = vcpu->pio_data; 1955 void *p = vcpu->arch.pio_data;
1731 void *q; 1956 void *q;
1732 unsigned bytes; 1957 unsigned bytes;
1733 int nr_pages = vcpu->pio.guest_pages[1] ? 2 : 1; 1958 int nr_pages = vcpu->arch.pio.guest_pages[1] ? 2 : 1;
1734 1959
1735 q = vmap(vcpu->pio.guest_pages, nr_pages, VM_READ|VM_WRITE, 1960 q = vmap(vcpu->arch.pio.guest_pages, nr_pages, VM_READ|VM_WRITE,
1736 PAGE_KERNEL); 1961 PAGE_KERNEL);
1737 if (!q) { 1962 if (!q) {
1738 free_pio_guest_pages(vcpu); 1963 free_pio_guest_pages(vcpu);
1739 return -ENOMEM; 1964 return -ENOMEM;
1740 } 1965 }
1741 q += vcpu->pio.guest_page_offset; 1966 q += vcpu->arch.pio.guest_page_offset;
1742 bytes = vcpu->pio.size * vcpu->pio.cur_count; 1967 bytes = vcpu->arch.pio.size * vcpu->arch.pio.cur_count;
1743 if (vcpu->pio.in) 1968 if (vcpu->arch.pio.in)
1744 memcpy(q, p, bytes); 1969 memcpy(q, p, bytes);
1745 else 1970 else
1746 memcpy(p, q, bytes); 1971 memcpy(p, q, bytes);
1747 q -= vcpu->pio.guest_page_offset; 1972 q -= vcpu->arch.pio.guest_page_offset;
1748 vunmap(q); 1973 vunmap(q);
1749 free_pio_guest_pages(vcpu); 1974 free_pio_guest_pages(vcpu);
1750 return 0; 1975 return 0;
1751} 1976}
1752 1977
1753static int complete_pio(struct kvm_vcpu *vcpu) 1978int complete_pio(struct kvm_vcpu *vcpu)
1754{ 1979{
1755 struct kvm_pio_request *io = &vcpu->pio; 1980 struct kvm_pio_request *io = &vcpu->arch.pio;
1756 long delta; 1981 long delta;
1757 int r; 1982 int r;
1758 1983
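
pio_copy_data(), kept largely intact above apart from the vcpu->arch renames, vmaps the one or two guest pages backing a string I/O buffer and copies between them and the per-vcpu pio_data page, with pio.in choosing the direction. A sketch of just that direction choice (pio_copy() is an invented name):

#include <string.h>

/* Direction of the copy in pio_copy_data(): for string IN, device data
 * already sitting in pio_data flows into guest memory; for string OUT,
 * guest memory is gathered into pio_data for the device. */
static void pio_copy(void *guest, void *pio_data, unsigned int bytes, int in)
{
        if (in)
                memcpy(guest, pio_data, bytes);
        else
                memcpy(pio_data, guest, bytes);
}
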
@@ -1760,7 +1985,7 @@ static int complete_pio(struct kvm_vcpu *vcpu)
1760 1985
1761 if (!io->string) { 1986 if (!io->string) {
1762 if (io->in) 1987 if (io->in)
1763 memcpy(&vcpu->regs[VCPU_REGS_RAX], vcpu->pio_data, 1988 memcpy(&vcpu->arch.regs[VCPU_REGS_RAX], vcpu->arch.pio_data,
1764 io->size); 1989 io->size);
1765 } else { 1990 } else {
1766 if (io->in) { 1991 if (io->in) {
@@ -1778,15 +2003,15 @@ static int complete_pio(struct kvm_vcpu *vcpu)
1778 * The size of the register should really depend on 2003 * The size of the register should really depend on
1779 * current address size. 2004 * current address size.
1780 */ 2005 */
1781 vcpu->regs[VCPU_REGS_RCX] -= delta; 2006 vcpu->arch.regs[VCPU_REGS_RCX] -= delta;
1782 } 2007 }
1783 if (io->down) 2008 if (io->down)
1784 delta = -delta; 2009 delta = -delta;
1785 delta *= io->size; 2010 delta *= io->size;
1786 if (io->in) 2011 if (io->in)
1787 vcpu->regs[VCPU_REGS_RDI] += delta; 2012 vcpu->arch.regs[VCPU_REGS_RDI] += delta;
1788 else 2013 else
1789 vcpu->regs[VCPU_REGS_RSI] += delta; 2014 vcpu->arch.regs[VCPU_REGS_RSI] += delta;
1790 } 2015 }
1791 2016
1792 kvm_x86_ops->decache_regs(vcpu); 2017 kvm_x86_ops->decache_regs(vcpu);
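
For string I/O, complete_pio() above adjusts the rep count and the index register after the transfer: RCX drops by the completed count, and RDI (for INS) or RSI (for OUTS) moves by count * size, backwards when the down/DF flag is set. The same arithmetic as a standalone sketch (struct and function names invented):

struct pio_regs { long rcx, rsi, rdi; };

/* Register adjustment mirroring the string branch of complete_pio(). */
static void adjust_string_pio(struct pio_regs *r, int in, int rep,
                              int down, unsigned int size, unsigned int count)
{
        long delta = 1;

        if (rep) {
                delta *= count;
                r->rcx -= delta;        /* rep count consumed */
        }
        if (down)
                delta = -delta;         /* EFLAGS.DF: move backwards */
        delta *= size;
        if (in)
                r->rdi += delta;        /* INS writes through ES:DI */
        else
                r->rsi += delta;        /* OUTS reads through DS:SI */
}
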
@@ -1804,13 +2029,13 @@ static void kernel_pio(struct kvm_io_device *pio_dev,
1804 /* TODO: String I/O for in kernel device */ 2029 /* TODO: String I/O for in kernel device */
1805 2030
1806 mutex_lock(&vcpu->kvm->lock); 2031 mutex_lock(&vcpu->kvm->lock);
1807 if (vcpu->pio.in) 2032 if (vcpu->arch.pio.in)
1808 kvm_iodevice_read(pio_dev, vcpu->pio.port, 2033 kvm_iodevice_read(pio_dev, vcpu->arch.pio.port,
1809 vcpu->pio.size, 2034 vcpu->arch.pio.size,
1810 pd); 2035 pd);
1811 else 2036 else
1812 kvm_iodevice_write(pio_dev, vcpu->pio.port, 2037 kvm_iodevice_write(pio_dev, vcpu->arch.pio.port,
1813 vcpu->pio.size, 2038 vcpu->arch.pio.size,
1814 pd); 2039 pd);
1815 mutex_unlock(&vcpu->kvm->lock); 2040 mutex_unlock(&vcpu->kvm->lock);
1816} 2041}
@@ -1818,8 +2043,8 @@ static void kernel_pio(struct kvm_io_device *pio_dev,
1818static void pio_string_write(struct kvm_io_device *pio_dev, 2043static void pio_string_write(struct kvm_io_device *pio_dev,
1819 struct kvm_vcpu *vcpu) 2044 struct kvm_vcpu *vcpu)
1820{ 2045{
1821 struct kvm_pio_request *io = &vcpu->pio; 2046 struct kvm_pio_request *io = &vcpu->arch.pio;
1822 void *pd = vcpu->pio_data; 2047 void *pd = vcpu->arch.pio_data;
1823 int i; 2048 int i;
1824 2049
1825 mutex_lock(&vcpu->kvm->lock); 2050 mutex_lock(&vcpu->kvm->lock);
@@ -1832,32 +2057,38 @@ static void pio_string_write(struct kvm_io_device *pio_dev,
1832 mutex_unlock(&vcpu->kvm->lock); 2057 mutex_unlock(&vcpu->kvm->lock);
1833} 2058}
1834 2059
1835int kvm_emulate_pio (struct kvm_vcpu *vcpu, struct kvm_run *run, int in, 2060static struct kvm_io_device *vcpu_find_pio_dev(struct kvm_vcpu *vcpu,
2061 gpa_t addr)
2062{
2063 return kvm_io_bus_find_dev(&vcpu->kvm->pio_bus, addr);
2064}
2065
2066int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
1836 int size, unsigned port) 2067 int size, unsigned port)
1837{ 2068{
1838 struct kvm_io_device *pio_dev; 2069 struct kvm_io_device *pio_dev;
1839 2070
1840 vcpu->run->exit_reason = KVM_EXIT_IO; 2071 vcpu->run->exit_reason = KVM_EXIT_IO;
1841 vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; 2072 vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
1842 vcpu->run->io.size = vcpu->pio.size = size; 2073 vcpu->run->io.size = vcpu->arch.pio.size = size;
1843 vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE; 2074 vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
1844 vcpu->run->io.count = vcpu->pio.count = vcpu->pio.cur_count = 1; 2075 vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = 1;
1845 vcpu->run->io.port = vcpu->pio.port = port; 2076 vcpu->run->io.port = vcpu->arch.pio.port = port;
1846 vcpu->pio.in = in; 2077 vcpu->arch.pio.in = in;
1847 vcpu->pio.string = 0; 2078 vcpu->arch.pio.string = 0;
1848 vcpu->pio.down = 0; 2079 vcpu->arch.pio.down = 0;
1849 vcpu->pio.guest_page_offset = 0; 2080 vcpu->arch.pio.guest_page_offset = 0;
1850 vcpu->pio.rep = 0; 2081 vcpu->arch.pio.rep = 0;
1851 2082
1852 kvm_x86_ops->cache_regs(vcpu); 2083 kvm_x86_ops->cache_regs(vcpu);
1853 memcpy(vcpu->pio_data, &vcpu->regs[VCPU_REGS_RAX], 4); 2084 memcpy(vcpu->arch.pio_data, &vcpu->arch.regs[VCPU_REGS_RAX], 4);
1854 kvm_x86_ops->decache_regs(vcpu); 2085 kvm_x86_ops->decache_regs(vcpu);
1855 2086
1856 kvm_x86_ops->skip_emulated_instruction(vcpu); 2087 kvm_x86_ops->skip_emulated_instruction(vcpu);
1857 2088
1858 pio_dev = vcpu_find_pio_dev(vcpu, port); 2089 pio_dev = vcpu_find_pio_dev(vcpu, port);
1859 if (pio_dev) { 2090 if (pio_dev) {
1860 kernel_pio(pio_dev, vcpu, vcpu->pio_data); 2091 kernel_pio(pio_dev, vcpu, vcpu->arch.pio_data);
1861 complete_pio(vcpu); 2092 complete_pio(vcpu);
1862 return 1; 2093 return 1;
1863 } 2094 }
@@ -1877,15 +2108,15 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
1877 2108
1878 vcpu->run->exit_reason = KVM_EXIT_IO; 2109 vcpu->run->exit_reason = KVM_EXIT_IO;
1879 vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; 2110 vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
1880 vcpu->run->io.size = vcpu->pio.size = size; 2111 vcpu->run->io.size = vcpu->arch.pio.size = size;
1881 vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE; 2112 vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
1882 vcpu->run->io.count = vcpu->pio.count = vcpu->pio.cur_count = count; 2113 vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = count;
1883 vcpu->run->io.port = vcpu->pio.port = port; 2114 vcpu->run->io.port = vcpu->arch.pio.port = port;
1884 vcpu->pio.in = in; 2115 vcpu->arch.pio.in = in;
1885 vcpu->pio.string = 1; 2116 vcpu->arch.pio.string = 1;
1886 vcpu->pio.down = down; 2117 vcpu->arch.pio.down = down;
1887 vcpu->pio.guest_page_offset = offset_in_page(address); 2118 vcpu->arch.pio.guest_page_offset = offset_in_page(address);
1888 vcpu->pio.rep = rep; 2119 vcpu->arch.pio.rep = rep;
1889 2120
1890 if (!count) { 2121 if (!count) {
1891 kvm_x86_ops->skip_emulated_instruction(vcpu); 2122 kvm_x86_ops->skip_emulated_instruction(vcpu);
@@ -1911,37 +2142,35 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
1911 * String I/O in reverse. Yuck. Kill the guest, fix later. 2142 * String I/O in reverse. Yuck. Kill the guest, fix later.
1912 */ 2143 */
1913 pr_unimpl(vcpu, "guest string pio down\n"); 2144 pr_unimpl(vcpu, "guest string pio down\n");
1914 inject_gp(vcpu); 2145 kvm_inject_gp(vcpu, 0);
1915 return 1; 2146 return 1;
1916 } 2147 }
1917 vcpu->run->io.count = now; 2148 vcpu->run->io.count = now;
1918 vcpu->pio.cur_count = now; 2149 vcpu->arch.pio.cur_count = now;
1919 2150
1920 if (vcpu->pio.cur_count == vcpu->pio.count) 2151 if (vcpu->arch.pio.cur_count == vcpu->arch.pio.count)
1921 kvm_x86_ops->skip_emulated_instruction(vcpu); 2152 kvm_x86_ops->skip_emulated_instruction(vcpu);
1922 2153
1923 for (i = 0; i < nr_pages; ++i) { 2154 for (i = 0; i < nr_pages; ++i) {
1924 mutex_lock(&vcpu->kvm->lock); 2155 down_read(&current->mm->mmap_sem);
1925 page = gva_to_page(vcpu, address + i * PAGE_SIZE); 2156 page = gva_to_page(vcpu, address + i * PAGE_SIZE);
1926 if (page) 2157 vcpu->arch.pio.guest_pages[i] = page;
1927 get_page(page); 2158 up_read(&current->mm->mmap_sem);
1928 vcpu->pio.guest_pages[i] = page;
1929 mutex_unlock(&vcpu->kvm->lock);
1930 if (!page) { 2159 if (!page) {
1931 inject_gp(vcpu); 2160 kvm_inject_gp(vcpu, 0);
1932 free_pio_guest_pages(vcpu); 2161 free_pio_guest_pages(vcpu);
1933 return 1; 2162 return 1;
1934 } 2163 }
1935 } 2164 }
1936 2165
1937 pio_dev = vcpu_find_pio_dev(vcpu, port); 2166 pio_dev = vcpu_find_pio_dev(vcpu, port);
1938 if (!vcpu->pio.in) { 2167 if (!vcpu->arch.pio.in) {
1939 /* string PIO write */ 2168 /* string PIO write */
1940 ret = pio_copy_data(vcpu); 2169 ret = pio_copy_data(vcpu);
1941 if (ret >= 0 && pio_dev) { 2170 if (ret >= 0 && pio_dev) {
1942 pio_string_write(pio_dev, vcpu); 2171 pio_string_write(pio_dev, vcpu);
1943 complete_pio(vcpu); 2172 complete_pio(vcpu);
1944 if (vcpu->pio.count == 0) 2173 if (vcpu->arch.pio.count == 0)
1945 ret = 1; 2174 ret = 1;
1946 } 2175 }
1947 } else if (pio_dev) 2176 } else if (pio_dev)
@@ -1953,6 +2182,263 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
1953} 2182}
1954EXPORT_SYMBOL_GPL(kvm_emulate_pio_string); 2183EXPORT_SYMBOL_GPL(kvm_emulate_pio_string);
1955 2184
2185int kvm_arch_init(void *opaque)
2186{
2187 int r;
2188 struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque;
2189
2190 if (kvm_x86_ops) {
2191 printk(KERN_ERR "kvm: already loaded the other module\n");
2192 r = -EEXIST;
2193 goto out;
2194 }
2195
2196 if (!ops->cpu_has_kvm_support()) {
2197 printk(KERN_ERR "kvm: no hardware support\n");
2198 r = -EOPNOTSUPP;
2199 goto out;
2200 }
2201 if (ops->disabled_by_bios()) {
2202 printk(KERN_ERR "kvm: disabled by bios\n");
2203 r = -EOPNOTSUPP;
2204 goto out;
2205 }
2206
2207 r = kvm_mmu_module_init();
2208 if (r)
2209 goto out;
2210
2211 kvm_init_msr_list();
2212
2213 kvm_x86_ops = ops;
2214 kvm_mmu_set_nonpresent_ptes(0ull, 0ull);
2215 return 0;
2216
2217out:
2218 return r;
2219}
2220
2221void kvm_arch_exit(void)
2222{
2223 kvm_x86_ops = NULL;
2224 kvm_mmu_module_exit();
2225}
2226
2227int kvm_emulate_halt(struct kvm_vcpu *vcpu)
2228{
2229 ++vcpu->stat.halt_exits;
2230 if (irqchip_in_kernel(vcpu->kvm)) {
2231 vcpu->arch.mp_state = VCPU_MP_STATE_HALTED;
2232 kvm_vcpu_block(vcpu);
2233 if (vcpu->arch.mp_state != VCPU_MP_STATE_RUNNABLE)
2234 return -EINTR;
2235 return 1;
2236 } else {
2237 vcpu->run->exit_reason = KVM_EXIT_HLT;
2238 return 0;
2239 }
2240}
2241EXPORT_SYMBOL_GPL(kvm_emulate_halt);
2242
2243int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
2244{
2245 unsigned long nr, a0, a1, a2, a3, ret;
2246
2247 kvm_x86_ops->cache_regs(vcpu);
2248
2249 nr = vcpu->arch.regs[VCPU_REGS_RAX];
2250 a0 = vcpu->arch.regs[VCPU_REGS_RBX];
2251 a1 = vcpu->arch.regs[VCPU_REGS_RCX];
2252 a2 = vcpu->arch.regs[VCPU_REGS_RDX];
2253 a3 = vcpu->arch.regs[VCPU_REGS_RSI];
2254
2255 if (!is_long_mode(vcpu)) {
2256 nr &= 0xFFFFFFFF;
2257 a0 &= 0xFFFFFFFF;
2258 a1 &= 0xFFFFFFFF;
2259 a2 &= 0xFFFFFFFF;
2260 a3 &= 0xFFFFFFFF;
2261 }
2262
2263 switch (nr) {
2264 case KVM_HC_VAPIC_POLL_IRQ:
2265 ret = 0;
2266 break;
2267 default:
2268 ret = -KVM_ENOSYS;
2269 break;
2270 }
2271 vcpu->arch.regs[VCPU_REGS_RAX] = ret;
2272 kvm_x86_ops->decache_regs(vcpu);
2273 return 0;
2274}
2275EXPORT_SYMBOL_GPL(kvm_emulate_hypercall);
2276
2277int kvm_fix_hypercall(struct kvm_vcpu *vcpu)
2278{
2279 char instruction[3];
2280 int ret = 0;
2281
2282
2283 /*
2284 * Blow out the MMU to ensure that no other VCPU has an active mapping
2285 * to ensure that the updated hypercall appears atomically across all
2286 * VCPUs.
2287 */
2288 kvm_mmu_zap_all(vcpu->kvm);
2289
2290 kvm_x86_ops->cache_regs(vcpu);
2291 kvm_x86_ops->patch_hypercall(vcpu, instruction);
2292 if (emulator_write_emulated(vcpu->arch.rip, instruction, 3, vcpu)
2293 != X86EMUL_CONTINUE)
2294 ret = -EFAULT;
2295
2296 return ret;
2297}
2298
2299static u64 mk_cr_64(u64 curr_cr, u32 new_val)
2300{
2301 return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
2302}
2303
2304void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
2305{
2306 struct descriptor_table dt = { limit, base };
2307
2308 kvm_x86_ops->set_gdt(vcpu, &dt);
2309}
2310
2311void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
2312{
2313 struct descriptor_table dt = { limit, base };
2314
2315 kvm_x86_ops->set_idt(vcpu, &dt);
2316}
2317
2318void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
2319 unsigned long *rflags)
2320{
2321 lmsw(vcpu, msw);
2322 *rflags = kvm_x86_ops->get_rflags(vcpu);
2323}
2324
2325unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
2326{
2327 kvm_x86_ops->decache_cr4_guest_bits(vcpu);
2328 switch (cr) {
2329 case 0:
2330 return vcpu->arch.cr0;
2331 case 2:
2332 return vcpu->arch.cr2;
2333 case 3:
2334 return vcpu->arch.cr3;
2335 case 4:
2336 return vcpu->arch.cr4;
2337 case 8:
2338 return get_cr8(vcpu);
2339 default:
2340 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);
2341 return 0;
2342 }
2343}
2344
2345void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val,
2346 unsigned long *rflags)
2347{
2348 switch (cr) {
2349 case 0:
2350 set_cr0(vcpu, mk_cr_64(vcpu->arch.cr0, val));
2351 *rflags = kvm_x86_ops->get_rflags(vcpu);
2352 break;
2353 case 2:
2354 vcpu->arch.cr2 = val;
2355 break;
2356 case 3:
2357 set_cr3(vcpu, val);
2358 break;
2359 case 4:
2360 set_cr4(vcpu, mk_cr_64(vcpu->arch.cr4, val));
2361 break;
2362 case 8:
2363 set_cr8(vcpu, val & 0xfUL);
2364 break;
2365 default:
2366 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);
2367 }
2368}
2369
2370static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i)
2371{
2372 struct kvm_cpuid_entry2 *e = &vcpu->arch.cpuid_entries[i];
2373 int j, nent = vcpu->arch.cpuid_nent;
2374
2375 e->flags &= ~KVM_CPUID_FLAG_STATE_READ_NEXT;
2376 /* when no next entry is found, the current entry[i] is reselected */
2377 for (j = i + 1; j == i; j = (j + 1) % nent) {
2378 struct kvm_cpuid_entry2 *ej = &vcpu->arch.cpuid_entries[j];
2379 if (ej->function == e->function) {
2380 ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT;
2381 return j;
2382 }
2383 }
2384 return 0; /* silence gcc, even though control never reaches here */
2385}
2386
2387/* find an entry with matching function, matching index (if needed), and that
2388 * should be read next (if it's stateful) */
2389static int is_matching_cpuid_entry(struct kvm_cpuid_entry2 *e,
2390 u32 function, u32 index)
2391{
2392 if (e->function != function)
2393 return 0;
2394 if ((e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) && e->index != index)
2395 return 0;
2396 if ((e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) &&
2397 !(e->flags & KVM_CPUID_FLAG_STATE_READ_NEXT))
2398 return 0;
2399 return 1;
2400}
2401
2402void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
2403{
2404 int i;
2405 u32 function, index;
2406 struct kvm_cpuid_entry2 *e, *best;
2407
2408 kvm_x86_ops->cache_regs(vcpu);
2409 function = vcpu->arch.regs[VCPU_REGS_RAX];
2410 index = vcpu->arch.regs[VCPU_REGS_RCX];
2411 vcpu->arch.regs[VCPU_REGS_RAX] = 0;
2412 vcpu->arch.regs[VCPU_REGS_RBX] = 0;
2413 vcpu->arch.regs[VCPU_REGS_RCX] = 0;
2414 vcpu->arch.regs[VCPU_REGS_RDX] = 0;
2415 best = NULL;
2416 for (i = 0; i < vcpu->arch.cpuid_nent; ++i) {
2417 e = &vcpu->arch.cpuid_entries[i];
2418 if (is_matching_cpuid_entry(e, function, index)) {
2419 if (e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC)
2420 move_to_next_stateful_cpuid_entry(vcpu, i);
2421 best = e;
2422 break;
2423 }
2424 /*
2425 * Both basic or both extended?
2426 */
2427 if (((e->function ^ function) & 0x80000000) == 0)
2428 if (!best || e->function > best->function)
2429 best = e;
2430 }
2431 if (best) {
2432 vcpu->arch.regs[VCPU_REGS_RAX] = best->eax;
2433 vcpu->arch.regs[VCPU_REGS_RBX] = best->ebx;
2434 vcpu->arch.regs[VCPU_REGS_RCX] = best->ecx;
2435 vcpu->arch.regs[VCPU_REGS_RDX] = best->edx;
2436 }
2437 kvm_x86_ops->decache_regs(vcpu);
2438 kvm_x86_ops->skip_emulated_instruction(vcpu);
2439}
2440EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
2441
1956/* 2442/*
1957 * Check if userspace requested an interrupt window, and that the 2443 * Check if userspace requested an interrupt window, and that the
1958 * interrupt window is open. 2444 * interrupt window is open.
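
kvm_emulate_cpuid() in the hunk above prefers an exact function match and otherwise falls back to the highest entry in the same half of the leaf space, basic versus 0x80000000+ extended, which is what the (e->function ^ function) & 0x80000000 test distinguishes. A standalone sketch of that selection (types and names invented):

#include <stdio.h>

struct cpuid_ent { unsigned int function; };

/* Pick an entry the way kvm_emulate_cpuid() does: exact match wins,
 * else the highest-numbered entry in the same basic/extended half. */
static const struct cpuid_ent *pick(const struct cpuid_ent *e, int n,
                                    unsigned int function)
{
        const struct cpuid_ent *best = NULL;
        int i;

        for (i = 0; i < n; i++) {
                if (e[i].function == function)
                        return &e[i];
                if (((e[i].function ^ function) & 0x80000000) == 0)
                        if (!best || e[i].function > best->function)
                                best = &e[i];
        }
        return best;            /* may be NULL if no same-half entry exists */
}

int main(void)
{
        static const struct cpuid_ent tab[] = {
                { 0x0 }, { 0x1 }, { 0x80000000 }, { 0x80000008 },
        };

        /* Unknown basic leaf 0x7 falls back to the highest basic entry,
         * unknown extended leaf 0x80000002 to the highest extended one. */
        printf("%#x\n", pick(tab, 4, 0x7)->function);           /* 0x1 */
        printf("%#x\n", pick(tab, 4, 0x80000002)->function);    /* 0x80000008 */
        return 0;
}
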
@@ -1962,9 +2448,9 @@ EXPORT_SYMBOL_GPL(kvm_emulate_pio_string);
1962static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu, 2448static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu,
1963 struct kvm_run *kvm_run) 2449 struct kvm_run *kvm_run)
1964{ 2450{
1965 return (!vcpu->irq_summary && 2451 return (!vcpu->arch.irq_summary &&
1966 kvm_run->request_interrupt_window && 2452 kvm_run->request_interrupt_window &&
1967 vcpu->interrupt_window_open && 2453 vcpu->arch.interrupt_window_open &&
1968 (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF)); 2454 (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF));
1969} 2455}
1970 2456
@@ -1978,22 +2464,51 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu,
1978 kvm_run->ready_for_interrupt_injection = 1; 2464 kvm_run->ready_for_interrupt_injection = 1;
1979 else 2465 else
1980 kvm_run->ready_for_interrupt_injection = 2466 kvm_run->ready_for_interrupt_injection =
1981 (vcpu->interrupt_window_open && 2467 (vcpu->arch.interrupt_window_open &&
1982 vcpu->irq_summary == 0); 2468 vcpu->arch.irq_summary == 0);
2469}
2470
2471static void vapic_enter(struct kvm_vcpu *vcpu)
2472{
2473 struct kvm_lapic *apic = vcpu->arch.apic;
2474 struct page *page;
2475
2476 if (!apic || !apic->vapic_addr)
2477 return;
2478
2479 down_read(&current->mm->mmap_sem);
2480 page = gfn_to_page(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT);
2481 vcpu->arch.apic->vapic_page = page;
2482 up_read(&current->mm->mmap_sem);
2483}
2484
2485static void vapic_exit(struct kvm_vcpu *vcpu)
2486{
2487 struct kvm_lapic *apic = vcpu->arch.apic;
2488
2489 if (!apic || !apic->vapic_addr)
2490 return;
2491
2492 kvm_release_page_dirty(apic->vapic_page);
2493 mark_page_dirty(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT);
1983} 2494}
1984 2495
1985static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 2496static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1986{ 2497{
1987 int r; 2498 int r;
1988 2499
1989 if (unlikely(vcpu->mp_state == VCPU_MP_STATE_SIPI_RECEIVED)) { 2500 if (unlikely(vcpu->arch.mp_state == VCPU_MP_STATE_SIPI_RECEIVED)) {
1990 printk("vcpu %d received sipi with vector # %x\n", 2501 pr_debug("vcpu %d received sipi with vector # %x\n",
1991 vcpu->vcpu_id, vcpu->sipi_vector); 2502 vcpu->vcpu_id, vcpu->arch.sipi_vector);
1992 kvm_lapic_reset(vcpu); 2503 kvm_lapic_reset(vcpu);
1993 kvm_x86_ops->vcpu_reset(vcpu); 2504 r = kvm_x86_ops->vcpu_reset(vcpu);
1994 vcpu->mp_state = VCPU_MP_STATE_RUNNABLE; 2505 if (r)
2506 return r;
2507 vcpu->arch.mp_state = VCPU_MP_STATE_RUNNABLE;
1995 } 2508 }
1996 2509
2510 vapic_enter(vcpu);
2511
1997preempted: 2512preempted:
1998 if (vcpu->guest_debug.enabled) 2513 if (vcpu->guest_debug.enabled)
1999 kvm_x86_ops->guest_debug_pre(vcpu); 2514 kvm_x86_ops->guest_debug_pre(vcpu);
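
__vcpu_run() now drains deferred work flagged in vcpu->requests before entering the guest; each request is a bit that is tested and cleared in one step. A single-threaded sketch of the pattern (the kernel's test_and_clear_bit() is atomic, the plain-C stand-in below is not, and all names are invented):

#include <stdio.h>

#define REQ_TLB_FLUSH     0
#define REQ_MIGRATE_TIMER 1

/* Non-atomic stand-in for the kernel's test_and_clear_bit(). */
static int test_and_clear(unsigned long *word, int bit)
{
        unsigned long mask = 1ul << bit;
        int was_set = (*word & mask) != 0;

        *word &= ~mask;
        return was_set;
}

int main(void)
{
        unsigned long requests = 1ul << REQ_MIGRATE_TIMER;

        if (test_and_clear(&requests, REQ_MIGRATE_TIMER))
                printf("migrate timer request handled\n");
        if (test_and_clear(&requests, REQ_TLB_FLUSH))
                printf("tlb flush request handled\n");   /* not printed */
        printf("remaining requests: %#lx\n", requests);  /* 0 */
        return 0;
}
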
@@ -2003,6 +2518,19 @@ again:
2003 if (unlikely(r)) 2518 if (unlikely(r))
2004 goto out; 2519 goto out;
2005 2520
2521 if (vcpu->requests) {
2522 if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests))
2523 __kvm_migrate_apic_timer(vcpu);
2524 if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS,
2525 &vcpu->requests)) {
2526 kvm_run->exit_reason = KVM_EXIT_TPR_ACCESS;
2527 r = 0;
2528 goto out;
2529 }
2530 }
2531
2532 kvm_inject_pending_timer_irqs(vcpu);
2533
2006 preempt_disable(); 2534 preempt_disable();
2007 2535
2008 kvm_x86_ops->prepare_guest_switch(vcpu); 2536 kvm_x86_ops->prepare_guest_switch(vcpu);
@@ -2010,6 +2538,13 @@ again:
2010 2538
2011 local_irq_disable(); 2539 local_irq_disable();
2012 2540
2541 if (need_resched()) {
2542 local_irq_enable();
2543 preempt_enable();
2544 r = 1;
2545 goto out;
2546 }
2547
2013 if (signal_pending(current)) { 2548 if (signal_pending(current)) {
2014 local_irq_enable(); 2549 local_irq_enable();
2015 preempt_enable(); 2550 preempt_enable();
@@ -2019,16 +2554,20 @@ again:
2019 goto out; 2554 goto out;
2020 } 2555 }
2021 2556
2022 if (irqchip_in_kernel(vcpu->kvm)) 2557 if (vcpu->arch.exception.pending)
2558 __queue_exception(vcpu);
2559 else if (irqchip_in_kernel(vcpu->kvm))
2023 kvm_x86_ops->inject_pending_irq(vcpu); 2560 kvm_x86_ops->inject_pending_irq(vcpu);
2024 else if (!vcpu->mmio_read_completed) 2561 else
2025 kvm_x86_ops->inject_pending_vectors(vcpu, kvm_run); 2562 kvm_x86_ops->inject_pending_vectors(vcpu, kvm_run);
2026 2563
2564 kvm_lapic_sync_to_vapic(vcpu);
2565
2027 vcpu->guest_mode = 1; 2566 vcpu->guest_mode = 1;
2028 kvm_guest_enter(); 2567 kvm_guest_enter();
2029 2568
2030 if (vcpu->requests) 2569 if (vcpu->requests)
2031 if (test_and_clear_bit(KVM_TLB_FLUSH, &vcpu->requests)) 2570 if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests))
2032 kvm_x86_ops->tlb_flush(vcpu); 2571 kvm_x86_ops->tlb_flush(vcpu);
2033 2572
2034 kvm_x86_ops->run(vcpu, kvm_run); 2573 kvm_x86_ops->run(vcpu, kvm_run);
@@ -2055,9 +2594,14 @@ again:
2055 */ 2594 */
2056 if (unlikely(prof_on == KVM_PROFILING)) { 2595 if (unlikely(prof_on == KVM_PROFILING)) {
2057 kvm_x86_ops->cache_regs(vcpu); 2596 kvm_x86_ops->cache_regs(vcpu);
2058 profile_hit(KVM_PROFILING, (void *)vcpu->rip); 2597 profile_hit(KVM_PROFILING, (void *)vcpu->arch.rip);
2059 } 2598 }
2060 2599
2600 if (vcpu->arch.exception.pending && kvm_x86_ops->exception_injected(vcpu))
2601 vcpu->arch.exception.pending = false;
2602
2603 kvm_lapic_sync_from_vapic(vcpu);
2604
2061 r = kvm_x86_ops->handle_exit(kvm_run, vcpu); 2605 r = kvm_x86_ops->handle_exit(kvm_run, vcpu);
2062 2606
2063 if (r > 0) { 2607 if (r > 0) {
@@ -2067,10 +2611,8 @@ again:
2067 ++vcpu->stat.request_irq_exits; 2611 ++vcpu->stat.request_irq_exits;
2068 goto out; 2612 goto out;
2069 } 2613 }
2070 if (!need_resched()) { 2614 if (!need_resched())
2071 ++vcpu->stat.light_exits;
2072 goto again; 2615 goto again;
2073 }
2074 } 2616 }
2075 2617
2076out: 2618out:
@@ -2081,18 +2623,19 @@ out:
2081 2623
2082 post_kvm_run_save(vcpu, kvm_run); 2624 post_kvm_run_save(vcpu, kvm_run);
2083 2625
2626 vapic_exit(vcpu);
2627
2084 return r; 2628 return r;
2085} 2629}
2086 2630
2087 2631int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2088static int kvm_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2089{ 2632{
2090 int r; 2633 int r;
2091 sigset_t sigsaved; 2634 sigset_t sigsaved;
2092 2635
2093 vcpu_load(vcpu); 2636 vcpu_load(vcpu);
2094 2637
2095 if (unlikely(vcpu->mp_state == VCPU_MP_STATE_UNINITIALIZED)) { 2638 if (unlikely(vcpu->arch.mp_state == VCPU_MP_STATE_UNINITIALIZED)) {
2096 kvm_vcpu_block(vcpu); 2639 kvm_vcpu_block(vcpu);
2097 vcpu_put(vcpu); 2640 vcpu_put(vcpu);
2098 return -EAGAIN; 2641 return -EAGAIN;
@@ -2105,18 +2648,19 @@ static int kvm_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2105 if (!irqchip_in_kernel(vcpu->kvm)) 2648 if (!irqchip_in_kernel(vcpu->kvm))
2106 set_cr8(vcpu, kvm_run->cr8); 2649 set_cr8(vcpu, kvm_run->cr8);
2107 2650
2108 if (vcpu->pio.cur_count) { 2651 if (vcpu->arch.pio.cur_count) {
2109 r = complete_pio(vcpu); 2652 r = complete_pio(vcpu);
2110 if (r) 2653 if (r)
2111 goto out; 2654 goto out;
2112 } 2655 }
2113 2656#if CONFIG_HAS_IOMEM
2114 if (vcpu->mmio_needed) { 2657 if (vcpu->mmio_needed) {
2115 memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8); 2658 memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
2116 vcpu->mmio_read_completed = 1; 2659 vcpu->mmio_read_completed = 1;
2117 vcpu->mmio_needed = 0; 2660 vcpu->mmio_needed = 0;
2118 r = emulate_instruction(vcpu, kvm_run, 2661 r = emulate_instruction(vcpu, kvm_run,
2119 vcpu->mmio_fault_cr2, 0); 2662 vcpu->arch.mmio_fault_cr2, 0,
2663 EMULTYPE_NO_DECODE);
2120 if (r == EMULATE_DO_MMIO) { 2664 if (r == EMULATE_DO_MMIO) {
2121 /* 2665 /*
2122 * Read-modify-write. Back to userspace. 2666 * Read-modify-write. Back to userspace.
@@ -2125,10 +2669,10 @@ static int kvm_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2125 goto out; 2669 goto out;
2126 } 2670 }
2127 } 2671 }
2128 2672#endif
2129 if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL) { 2673 if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL) {
2130 kvm_x86_ops->cache_regs(vcpu); 2674 kvm_x86_ops->cache_regs(vcpu);
2131 vcpu->regs[VCPU_REGS_RAX] = kvm_run->hypercall.ret; 2675 vcpu->arch.regs[VCPU_REGS_RAX] = kvm_run->hypercall.ret;
2132 kvm_x86_ops->decache_regs(vcpu); 2676 kvm_x86_ops->decache_regs(vcpu);
2133 } 2677 }
2134 2678
@@ -2142,33 +2686,32 @@ out:
2142 return r; 2686 return r;
2143} 2687}
2144 2688
2145static int kvm_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, 2689int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
2146 struct kvm_regs *regs)
2147{ 2690{
2148 vcpu_load(vcpu); 2691 vcpu_load(vcpu);
2149 2692
2150 kvm_x86_ops->cache_regs(vcpu); 2693 kvm_x86_ops->cache_regs(vcpu);
2151 2694
2152 regs->rax = vcpu->regs[VCPU_REGS_RAX]; 2695 regs->rax = vcpu->arch.regs[VCPU_REGS_RAX];
2153 regs->rbx = vcpu->regs[VCPU_REGS_RBX]; 2696 regs->rbx = vcpu->arch.regs[VCPU_REGS_RBX];
2154 regs->rcx = vcpu->regs[VCPU_REGS_RCX]; 2697 regs->rcx = vcpu->arch.regs[VCPU_REGS_RCX];
2155 regs->rdx = vcpu->regs[VCPU_REGS_RDX]; 2698 regs->rdx = vcpu->arch.regs[VCPU_REGS_RDX];
2156 regs->rsi = vcpu->regs[VCPU_REGS_RSI]; 2699 regs->rsi = vcpu->arch.regs[VCPU_REGS_RSI];
2157 regs->rdi = vcpu->regs[VCPU_REGS_RDI]; 2700 regs->rdi = vcpu->arch.regs[VCPU_REGS_RDI];
2158 regs->rsp = vcpu->regs[VCPU_REGS_RSP]; 2701 regs->rsp = vcpu->arch.regs[VCPU_REGS_RSP];
2159 regs->rbp = vcpu->regs[VCPU_REGS_RBP]; 2702 regs->rbp = vcpu->arch.regs[VCPU_REGS_RBP];
2160#ifdef CONFIG_X86_64 2703#ifdef CONFIG_X86_64
2161 regs->r8 = vcpu->regs[VCPU_REGS_R8]; 2704 regs->r8 = vcpu->arch.regs[VCPU_REGS_R8];
2162 regs->r9 = vcpu->regs[VCPU_REGS_R9]; 2705 regs->r9 = vcpu->arch.regs[VCPU_REGS_R9];
2163 regs->r10 = vcpu->regs[VCPU_REGS_R10]; 2706 regs->r10 = vcpu->arch.regs[VCPU_REGS_R10];
2164 regs->r11 = vcpu->regs[VCPU_REGS_R11]; 2707 regs->r11 = vcpu->arch.regs[VCPU_REGS_R11];
2165 regs->r12 = vcpu->regs[VCPU_REGS_R12]; 2708 regs->r12 = vcpu->arch.regs[VCPU_REGS_R12];
2166 regs->r13 = vcpu->regs[VCPU_REGS_R13]; 2709 regs->r13 = vcpu->arch.regs[VCPU_REGS_R13];
2167 regs->r14 = vcpu->regs[VCPU_REGS_R14]; 2710 regs->r14 = vcpu->arch.regs[VCPU_REGS_R14];
2168 regs->r15 = vcpu->regs[VCPU_REGS_R15]; 2711 regs->r15 = vcpu->arch.regs[VCPU_REGS_R15];
2169#endif 2712#endif
2170 2713
2171 regs->rip = vcpu->rip; 2714 regs->rip = vcpu->arch.rip;
2172 regs->rflags = kvm_x86_ops->get_rflags(vcpu); 2715 regs->rflags = kvm_x86_ops->get_rflags(vcpu);
2173 2716
2174 /* 2717 /*
@@ -2182,31 +2725,30 @@ static int kvm_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu,
2182 return 0; 2725 return 0;
2183} 2726}
2184 2727
2185static int kvm_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, 2728int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
2186 struct kvm_regs *regs)
2187{ 2729{
2188 vcpu_load(vcpu); 2730 vcpu_load(vcpu);
2189 2731
2190 vcpu->regs[VCPU_REGS_RAX] = regs->rax; 2732 vcpu->arch.regs[VCPU_REGS_RAX] = regs->rax;
2191 vcpu->regs[VCPU_REGS_RBX] = regs->rbx; 2733 vcpu->arch.regs[VCPU_REGS_RBX] = regs->rbx;
2192 vcpu->regs[VCPU_REGS_RCX] = regs->rcx; 2734 vcpu->arch.regs[VCPU_REGS_RCX] = regs->rcx;
2193 vcpu->regs[VCPU_REGS_RDX] = regs->rdx; 2735 vcpu->arch.regs[VCPU_REGS_RDX] = regs->rdx;
2194 vcpu->regs[VCPU_REGS_RSI] = regs->rsi; 2736 vcpu->arch.regs[VCPU_REGS_RSI] = regs->rsi;
2195 vcpu->regs[VCPU_REGS_RDI] = regs->rdi; 2737 vcpu->arch.regs[VCPU_REGS_RDI] = regs->rdi;
2196 vcpu->regs[VCPU_REGS_RSP] = regs->rsp; 2738 vcpu->arch.regs[VCPU_REGS_RSP] = regs->rsp;
2197 vcpu->regs[VCPU_REGS_RBP] = regs->rbp; 2739 vcpu->arch.regs[VCPU_REGS_RBP] = regs->rbp;
2198#ifdef CONFIG_X86_64 2740#ifdef CONFIG_X86_64
2199 vcpu->regs[VCPU_REGS_R8] = regs->r8; 2741 vcpu->arch.regs[VCPU_REGS_R8] = regs->r8;
2200 vcpu->regs[VCPU_REGS_R9] = regs->r9; 2742 vcpu->arch.regs[VCPU_REGS_R9] = regs->r9;
2201 vcpu->regs[VCPU_REGS_R10] = regs->r10; 2743 vcpu->arch.regs[VCPU_REGS_R10] = regs->r10;
2202 vcpu->regs[VCPU_REGS_R11] = regs->r11; 2744 vcpu->arch.regs[VCPU_REGS_R11] = regs->r11;
2203 vcpu->regs[VCPU_REGS_R12] = regs->r12; 2745 vcpu->arch.regs[VCPU_REGS_R12] = regs->r12;
2204 vcpu->regs[VCPU_REGS_R13] = regs->r13; 2746 vcpu->arch.regs[VCPU_REGS_R13] = regs->r13;
2205 vcpu->regs[VCPU_REGS_R14] = regs->r14; 2747 vcpu->arch.regs[VCPU_REGS_R14] = regs->r14;
2206 vcpu->regs[VCPU_REGS_R15] = regs->r15; 2748 vcpu->arch.regs[VCPU_REGS_R15] = regs->r15;
2207#endif 2749#endif
2208 2750
2209 vcpu->rip = regs->rip; 2751 vcpu->arch.rip = regs->rip;
2210 kvm_x86_ops->set_rflags(vcpu, regs->rflags); 2752 kvm_x86_ops->set_rflags(vcpu, regs->rflags);
2211 2753
2212 kvm_x86_ops->decache_regs(vcpu); 2754 kvm_x86_ops->decache_regs(vcpu);
@@ -2222,8 +2764,18 @@ static void get_segment(struct kvm_vcpu *vcpu,
2222 return kvm_x86_ops->get_segment(vcpu, var, seg); 2764 return kvm_x86_ops->get_segment(vcpu, var, seg);
2223} 2765}
2224 2766
2225static int kvm_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, 2767void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
2226 struct kvm_sregs *sregs) 2768{
2769 struct kvm_segment cs;
2770
2771 get_segment(vcpu, &cs, VCPU_SREG_CS);
2772 *db = cs.db;
2773 *l = cs.l;
2774}
2775EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits);
2776
2777int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
2778 struct kvm_sregs *sregs)
2227{ 2779{
2228 struct descriptor_table dt; 2780 struct descriptor_table dt;
2229 int pending_vec; 2781 int pending_vec;
@@ -2248,12 +2800,12 @@ static int kvm_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
2248 sregs->gdt.base = dt.base; 2800 sregs->gdt.base = dt.base;
2249 2801
2250 kvm_x86_ops->decache_cr4_guest_bits(vcpu); 2802 kvm_x86_ops->decache_cr4_guest_bits(vcpu);
2251 sregs->cr0 = vcpu->cr0; 2803 sregs->cr0 = vcpu->arch.cr0;
2252 sregs->cr2 = vcpu->cr2; 2804 sregs->cr2 = vcpu->arch.cr2;
2253 sregs->cr3 = vcpu->cr3; 2805 sregs->cr3 = vcpu->arch.cr3;
2254 sregs->cr4 = vcpu->cr4; 2806 sregs->cr4 = vcpu->arch.cr4;
2255 sregs->cr8 = get_cr8(vcpu); 2807 sregs->cr8 = get_cr8(vcpu);
2256 sregs->efer = vcpu->shadow_efer; 2808 sregs->efer = vcpu->arch.shadow_efer;
2257 sregs->apic_base = kvm_get_apic_base(vcpu); 2809 sregs->apic_base = kvm_get_apic_base(vcpu);
2258 2810
2259 if (irqchip_in_kernel(vcpu->kvm)) { 2811 if (irqchip_in_kernel(vcpu->kvm)) {
@@ -2261,9 +2813,10 @@ static int kvm_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
2261 sizeof sregs->interrupt_bitmap); 2813 sizeof sregs->interrupt_bitmap);
2262 pending_vec = kvm_x86_ops->get_irq(vcpu); 2814 pending_vec = kvm_x86_ops->get_irq(vcpu);
2263 if (pending_vec >= 0) 2815 if (pending_vec >= 0)
2264 set_bit(pending_vec, (unsigned long *)sregs->interrupt_bitmap); 2816 set_bit(pending_vec,
2817 (unsigned long *)sregs->interrupt_bitmap);
2265 } else 2818 } else
2266 memcpy(sregs->interrupt_bitmap, vcpu->irq_pending, 2819 memcpy(sregs->interrupt_bitmap, vcpu->arch.irq_pending,
2267 sizeof sregs->interrupt_bitmap); 2820 sizeof sregs->interrupt_bitmap);
2268 2821
2269 vcpu_put(vcpu); 2822 vcpu_put(vcpu);
@@ -2277,8 +2830,8 @@ static void set_segment(struct kvm_vcpu *vcpu,
2277 return kvm_x86_ops->set_segment(vcpu, var, seg); 2830 return kvm_x86_ops->set_segment(vcpu, var, seg);
2278} 2831}
2279 2832
2280static int kvm_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, 2833int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
2281 struct kvm_sregs *sregs) 2834 struct kvm_sregs *sregs)
2282{ 2835{
2283 int mmu_reset_needed = 0; 2836 int mmu_reset_needed = 0;
2284 int i, pending_vec, max_bits; 2837 int i, pending_vec, max_bits;
@@ -2293,13 +2846,13 @@ static int kvm_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
2293 dt.base = sregs->gdt.base; 2846 dt.base = sregs->gdt.base;
2294 kvm_x86_ops->set_gdt(vcpu, &dt); 2847 kvm_x86_ops->set_gdt(vcpu, &dt);
2295 2848
2296 vcpu->cr2 = sregs->cr2; 2849 vcpu->arch.cr2 = sregs->cr2;
2297 mmu_reset_needed |= vcpu->cr3 != sregs->cr3; 2850 mmu_reset_needed |= vcpu->arch.cr3 != sregs->cr3;
2298 vcpu->cr3 = sregs->cr3; 2851 vcpu->arch.cr3 = sregs->cr3;
2299 2852
2300 set_cr8(vcpu, sregs->cr8); 2853 set_cr8(vcpu, sregs->cr8);
2301 2854
2302 mmu_reset_needed |= vcpu->shadow_efer != sregs->efer; 2855 mmu_reset_needed |= vcpu->arch.shadow_efer != sregs->efer;
2303#ifdef CONFIG_X86_64 2856#ifdef CONFIG_X86_64
2304 kvm_x86_ops->set_efer(vcpu, sregs->efer); 2857 kvm_x86_ops->set_efer(vcpu, sregs->efer);
2305#endif 2858#endif
@@ -2307,25 +2860,25 @@ static int kvm_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
2307 2860
2308 kvm_x86_ops->decache_cr4_guest_bits(vcpu); 2861 kvm_x86_ops->decache_cr4_guest_bits(vcpu);
2309 2862
2310 mmu_reset_needed |= vcpu->cr0 != sregs->cr0; 2863 mmu_reset_needed |= vcpu->arch.cr0 != sregs->cr0;
2311 vcpu->cr0 = sregs->cr0; 2864 vcpu->arch.cr0 = sregs->cr0;
2312 kvm_x86_ops->set_cr0(vcpu, sregs->cr0); 2865 kvm_x86_ops->set_cr0(vcpu, sregs->cr0);
2313 2866
2314 mmu_reset_needed |= vcpu->cr4 != sregs->cr4; 2867 mmu_reset_needed |= vcpu->arch.cr4 != sregs->cr4;
2315 kvm_x86_ops->set_cr4(vcpu, sregs->cr4); 2868 kvm_x86_ops->set_cr4(vcpu, sregs->cr4);
2316 if (!is_long_mode(vcpu) && is_pae(vcpu)) 2869 if (!is_long_mode(vcpu) && is_pae(vcpu))
2317 load_pdptrs(vcpu, vcpu->cr3); 2870 load_pdptrs(vcpu, vcpu->arch.cr3);
2318 2871
2319 if (mmu_reset_needed) 2872 if (mmu_reset_needed)
2320 kvm_mmu_reset_context(vcpu); 2873 kvm_mmu_reset_context(vcpu);
2321 2874
2322 if (!irqchip_in_kernel(vcpu->kvm)) { 2875 if (!irqchip_in_kernel(vcpu->kvm)) {
2323 memcpy(vcpu->irq_pending, sregs->interrupt_bitmap, 2876 memcpy(vcpu->arch.irq_pending, sregs->interrupt_bitmap,
2324 sizeof vcpu->irq_pending); 2877 sizeof vcpu->arch.irq_pending);
2325 vcpu->irq_summary = 0; 2878 vcpu->arch.irq_summary = 0;
2326 for (i = 0; i < ARRAY_SIZE(vcpu->irq_pending); ++i) 2879 for (i = 0; i < ARRAY_SIZE(vcpu->arch.irq_pending); ++i)
2327 if (vcpu->irq_pending[i]) 2880 if (vcpu->arch.irq_pending[i])
2328 __set_bit(i, &vcpu->irq_summary); 2881 __set_bit(i, &vcpu->arch.irq_summary);
2329 } else { 2882 } else {
2330 max_bits = (sizeof sregs->interrupt_bitmap) << 3; 2883 max_bits = (sizeof sregs->interrupt_bitmap) << 3;
2331 pending_vec = find_first_bit( 2884 pending_vec = find_first_bit(
@@ -2334,7 +2887,8 @@ static int kvm_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
2334 /* Only pending external irq is handled here */ 2887 /* Only pending external irq is handled here */
2335 if (pending_vec < max_bits) { 2888 if (pending_vec < max_bits) {
2336 kvm_x86_ops->set_irq(vcpu, pending_vec); 2889 kvm_x86_ops->set_irq(vcpu, pending_vec);
2337 printk("Set back pending irq %d\n", pending_vec); 2890 pr_debug("Set back pending irq %d\n",
2891 pending_vec);
2338 } 2892 }
2339 } 2893 }
2340 2894
@@ -2353,174 +2907,8 @@ static int kvm_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
2353 return 0; 2907 return 0;
2354} 2908}
2355 2909
2356void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) 2910int kvm_arch_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu,
2357{ 2911 struct kvm_debug_guest *dbg)
2358 struct kvm_segment cs;
2359
2360 get_segment(vcpu, &cs, VCPU_SREG_CS);
2361 *db = cs.db;
2362 *l = cs.l;
2363}
2364EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits);
2365
2366/*
2367 * List of msr numbers which we expose to userspace through KVM_GET_MSRS
2368 * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
2369 *
2370 * This list is modified at module load time to reflect the
2371 * capabilities of the host cpu.
2372 */
2373static u32 msrs_to_save[] = {
2374 MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
2375 MSR_K6_STAR,
2376#ifdef CONFIG_X86_64
2377 MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
2378#endif
2379 MSR_IA32_TIME_STAMP_COUNTER,
2380};
2381
2382static unsigned num_msrs_to_save;
2383
2384static u32 emulated_msrs[] = {
2385 MSR_IA32_MISC_ENABLE,
2386};
2387
2388static __init void kvm_init_msr_list(void)
2389{
2390 u32 dummy[2];
2391 unsigned i, j;
2392
2393 for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) {
2394 if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0)
2395 continue;
2396 if (j < i)
2397 msrs_to_save[j] = msrs_to_save[i];
2398 j++;
2399 }
2400 num_msrs_to_save = j;
2401}
2402
2403/*
2404 * Adapt set_msr() to msr_io()'s calling convention
2405 */
2406static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
2407{
2408 return kvm_set_msr(vcpu, index, *data);
2409}
2410
2411/*
2412 * Read or write a bunch of msrs. All parameters are kernel addresses.
2413 *
2414 * @return number of msrs set successfully.
2415 */
2416static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,
2417 struct kvm_msr_entry *entries,
2418 int (*do_msr)(struct kvm_vcpu *vcpu,
2419 unsigned index, u64 *data))
2420{
2421 int i;
2422
2423 vcpu_load(vcpu);
2424
2425 for (i = 0; i < msrs->nmsrs; ++i)
2426 if (do_msr(vcpu, entries[i].index, &entries[i].data))
2427 break;
2428
2429 vcpu_put(vcpu);
2430
2431 return i;
2432}
2433
2434/*
2435 * Read or write a bunch of msrs. Parameters are user addresses.
2436 *
2437 * @return number of msrs set successfully.
2438 */
2439static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs,
2440 int (*do_msr)(struct kvm_vcpu *vcpu,
2441 unsigned index, u64 *data),
2442 int writeback)
2443{
2444 struct kvm_msrs msrs;
2445 struct kvm_msr_entry *entries;
2446 int r, n;
2447 unsigned size;
2448
2449 r = -EFAULT;
2450 if (copy_from_user(&msrs, user_msrs, sizeof msrs))
2451 goto out;
2452
2453 r = -E2BIG;
2454 if (msrs.nmsrs >= MAX_IO_MSRS)
2455 goto out;
2456
2457 r = -ENOMEM;
2458 size = sizeof(struct kvm_msr_entry) * msrs.nmsrs;
2459 entries = vmalloc(size);
2460 if (!entries)
2461 goto out;
2462
2463 r = -EFAULT;
2464 if (copy_from_user(entries, user_msrs->entries, size))
2465 goto out_free;
2466
2467 r = n = __msr_io(vcpu, &msrs, entries, do_msr);
2468 if (r < 0)
2469 goto out_free;
2470
2471 r = -EFAULT;
2472 if (writeback && copy_to_user(user_msrs->entries, entries, size))
2473 goto out_free;
2474
2475 r = n;
2476
2477out_free:
2478 vfree(entries);
2479out:
2480 return r;
2481}
2482
2483/*
2484 * Translate a guest virtual address to a guest physical address.
2485 */
2486static int kvm_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
2487 struct kvm_translation *tr)
2488{
2489 unsigned long vaddr = tr->linear_address;
2490 gpa_t gpa;
2491
2492 vcpu_load(vcpu);
2493 mutex_lock(&vcpu->kvm->lock);
2494 gpa = vcpu->mmu.gva_to_gpa(vcpu, vaddr);
2495 tr->physical_address = gpa;
2496 tr->valid = gpa != UNMAPPED_GVA;
2497 tr->writeable = 1;
2498 tr->usermode = 0;
2499 mutex_unlock(&vcpu->kvm->lock);
2500 vcpu_put(vcpu);
2501
2502 return 0;
2503}
2504
2505static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
2506 struct kvm_interrupt *irq)
2507{
2508 if (irq->irq < 0 || irq->irq >= 256)
2509 return -EINVAL;
2510 if (irqchip_in_kernel(vcpu->kvm))
2511 return -ENXIO;
2512 vcpu_load(vcpu);
2513
2514 set_bit(irq->irq, vcpu->irq_pending);
2515 set_bit(irq->irq / BITS_PER_LONG, &vcpu->irq_summary);
2516
2517 vcpu_put(vcpu);
2518
2519 return 0;
2520}
2521
2522static int kvm_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu,
2523 struct kvm_debug_guest *dbg)
2524{ 2912{
2525 int r; 2913 int r;
2526 2914
@@ -2533,179 +2921,6 @@ static int kvm_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu,
2533 return r; 2921 return r;
2534} 2922}
2535 2923
2536static struct page *kvm_vcpu_nopage(struct vm_area_struct *vma,
2537 unsigned long address,
2538 int *type)
2539{
2540 struct kvm_vcpu *vcpu = vma->vm_file->private_data;
2541 unsigned long pgoff;
2542 struct page *page;
2543
2544 pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
2545 if (pgoff == 0)
2546 page = virt_to_page(vcpu->run);
2547 else if (pgoff == KVM_PIO_PAGE_OFFSET)
2548 page = virt_to_page(vcpu->pio_data);
2549 else
2550 return NOPAGE_SIGBUS;
2551 get_page(page);
2552 if (type != NULL)
2553 *type = VM_FAULT_MINOR;
2554
2555 return page;
2556}
2557
2558static struct vm_operations_struct kvm_vcpu_vm_ops = {
2559 .nopage = kvm_vcpu_nopage,
2560};
2561
2562static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma)
2563{
2564 vma->vm_ops = &kvm_vcpu_vm_ops;
2565 return 0;
2566}
2567
2568static int kvm_vcpu_release(struct inode *inode, struct file *filp)
2569{
2570 struct kvm_vcpu *vcpu = filp->private_data;
2571
2572 fput(vcpu->kvm->filp);
2573 return 0;
2574}
2575
2576static struct file_operations kvm_vcpu_fops = {
2577 .release = kvm_vcpu_release,
2578 .unlocked_ioctl = kvm_vcpu_ioctl,
2579 .compat_ioctl = kvm_vcpu_ioctl,
2580 .mmap = kvm_vcpu_mmap,
2581};
2582
2583/*
2584 * Allocates an inode for the vcpu.
2585 */
2586static int create_vcpu_fd(struct kvm_vcpu *vcpu)
2587{
2588 int fd, r;
2589 struct inode *inode;
2590 struct file *file;
2591
2592 r = anon_inode_getfd(&fd, &inode, &file,
2593 "kvm-vcpu", &kvm_vcpu_fops, vcpu);
2594 if (r)
2595 return r;
2596 atomic_inc(&vcpu->kvm->filp->f_count);
2597 return fd;
2598}
2599
2600/*
2601 * Creates some virtual cpus. Good luck creating more than one.
2602 */
2603static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int n)
2604{
2605 int r;
2606 struct kvm_vcpu *vcpu;
2607
2608 if (!valid_vcpu(n))
2609 return -EINVAL;
2610
2611 vcpu = kvm_x86_ops->vcpu_create(kvm, n);
2612 if (IS_ERR(vcpu))
2613 return PTR_ERR(vcpu);
2614
2615 preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops);
2616
2617 /* We do fxsave: this must be aligned. */
2618 BUG_ON((unsigned long)&vcpu->host_fx_image & 0xF);
2619
2620 vcpu_load(vcpu);
2621 r = kvm_mmu_setup(vcpu);
2622 vcpu_put(vcpu);
2623 if (r < 0)
2624 goto free_vcpu;
2625
2626 mutex_lock(&kvm->lock);
2627 if (kvm->vcpus[n]) {
2628 r = -EEXIST;
2629 mutex_unlock(&kvm->lock);
2630 goto mmu_unload;
2631 }
2632 kvm->vcpus[n] = vcpu;
2633 mutex_unlock(&kvm->lock);
2634
2635 /* Now it's all set up, let userspace reach it */
2636 r = create_vcpu_fd(vcpu);
2637 if (r < 0)
2638 goto unlink;
2639 return r;
2640
2641unlink:
2642 mutex_lock(&kvm->lock);
2643 kvm->vcpus[n] = NULL;
2644 mutex_unlock(&kvm->lock);
2645
2646mmu_unload:
2647 vcpu_load(vcpu);
2648 kvm_mmu_unload(vcpu);
2649 vcpu_put(vcpu);
2650
2651free_vcpu:
2652 kvm_x86_ops->vcpu_free(vcpu);
2653 return r;
2654}
2655
2656static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu)
2657{
2658 u64 efer;
2659 int i;
2660 struct kvm_cpuid_entry *e, *entry;
2661
2662 rdmsrl(MSR_EFER, efer);
2663 entry = NULL;
2664 for (i = 0; i < vcpu->cpuid_nent; ++i) {
2665 e = &vcpu->cpuid_entries[i];
2666 if (e->function == 0x80000001) {
2667 entry = e;
2668 break;
2669 }
2670 }
2671 if (entry && (entry->edx & (1 << 20)) && !(efer & EFER_NX)) {
2672 entry->edx &= ~(1 << 20);
2673 printk(KERN_INFO "kvm: guest NX capability removed\n");
2674 }
2675}
2676
2677static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
2678 struct kvm_cpuid *cpuid,
2679 struct kvm_cpuid_entry __user *entries)
2680{
2681 int r;
2682
2683 r = -E2BIG;
2684 if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
2685 goto out;
2686 r = -EFAULT;
2687 if (copy_from_user(&vcpu->cpuid_entries, entries,
2688 cpuid->nent * sizeof(struct kvm_cpuid_entry)))
2689 goto out;
2690 vcpu->cpuid_nent = cpuid->nent;
2691 cpuid_fix_nx_cap(vcpu);
2692 return 0;
2693
2694out:
2695 return r;
2696}
2697
2698static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset)
2699{
2700 if (sigset) {
2701 sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP));
2702 vcpu->sigset_active = 1;
2703 vcpu->sigset = *sigset;
2704 } else
2705 vcpu->sigset_active = 0;
2706 return 0;
2707}
2708
2709/* 2924/*
2710 * fxsave fpu state. Taken from x86_64/processor.h. To be killed when 2925 * fxsave fpu state. Taken from x86_64/processor.h. To be killed when
2711 * we have asm/x86/processor.h 2926 * we have asm/x86/processor.h
@@ -2727,9 +2942,31 @@ struct fxsave {
2727#endif 2942#endif
2728}; 2943};
2729 2944
2730static int kvm_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) 2945/*
2946 * Translate a guest virtual address to a guest physical address.
2947 */
2948int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
2949 struct kvm_translation *tr)
2731{ 2950{
2732 struct fxsave *fxsave = (struct fxsave *)&vcpu->guest_fx_image; 2951 unsigned long vaddr = tr->linear_address;
2952 gpa_t gpa;
2953
2954 vcpu_load(vcpu);
2955 down_read(&current->mm->mmap_sem);
2956 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, vaddr);
2957 up_read(&current->mm->mmap_sem);
2958 tr->physical_address = gpa;
2959 tr->valid = gpa != UNMAPPED_GVA;
2960 tr->writeable = 1;
2961 tr->usermode = 0;
2962 vcpu_put(vcpu);
2963
2964 return 0;
2965}
2966
2967int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
2968{
2969 struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image;
2733 2970
2734 vcpu_load(vcpu); 2971 vcpu_load(vcpu);
2735 2972
@@ -2747,9 +2984,9 @@ static int kvm_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
2747 return 0; 2984 return 0;
2748} 2985}
2749 2986
2750static int kvm_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) 2987int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
2751{ 2988{
2752 struct fxsave *fxsave = (struct fxsave *)&vcpu->guest_fx_image; 2989 struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image;
2753 2990
2754 vcpu_load(vcpu); 2991 vcpu_load(vcpu);
2755 2992
@@ -2767,862 +3004,284 @@ static int kvm_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
2767 return 0; 3004 return 0;
2768} 3005}
2769 3006
2770static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, 3007void fx_init(struct kvm_vcpu *vcpu)
2771 struct kvm_lapic_state *s)
2772{ 3008{
2773 vcpu_load(vcpu); 3009 unsigned after_mxcsr_mask;
2774 memcpy(s->regs, vcpu->apic->regs, sizeof *s);
2775 vcpu_put(vcpu);
2776
2777 return 0;
2778}
2779 3010
2780static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu, 3011 /* Initialize guest FPU by resetting ours and saving into guest's */
2781 struct kvm_lapic_state *s) 3012 preempt_disable();
2782{ 3013 fx_save(&vcpu->arch.host_fx_image);
2783 vcpu_load(vcpu); 3014 fpu_init();
2784 memcpy(vcpu->apic->regs, s->regs, sizeof *s); 3015 fx_save(&vcpu->arch.guest_fx_image);
2785 kvm_apic_post_state_restore(vcpu); 3016 fx_restore(&vcpu->arch.host_fx_image);
2786 vcpu_put(vcpu); 3017 preempt_enable();
2787 3018
2788 return 0; 3019 vcpu->arch.cr0 |= X86_CR0_ET;
3020 after_mxcsr_mask = offsetof(struct i387_fxsave_struct, st_space);
3021 vcpu->arch.guest_fx_image.mxcsr = 0x1f80;
3022 memset((void *)&vcpu->arch.guest_fx_image + after_mxcsr_mask,
3023 0, sizeof(struct i387_fxsave_struct) - after_mxcsr_mask);
2789} 3024}
3025EXPORT_SYMBOL_GPL(fx_init);
2790 3026
2791static long kvm_vcpu_ioctl(struct file *filp, 3027void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
2792 unsigned int ioctl, unsigned long arg)
2793{ 3028{
2794 struct kvm_vcpu *vcpu = filp->private_data; 3029 if (!vcpu->fpu_active || vcpu->guest_fpu_loaded)
2795 void __user *argp = (void __user *)arg; 3030 return;
2796 int r = -EINVAL;
2797
2798 switch (ioctl) {
2799 case KVM_RUN:
2800 r = -EINVAL;
2801 if (arg)
2802 goto out;
2803 r = kvm_vcpu_ioctl_run(vcpu, vcpu->run);
2804 break;
2805 case KVM_GET_REGS: {
2806 struct kvm_regs kvm_regs;
2807
2808 memset(&kvm_regs, 0, sizeof kvm_regs);
2809 r = kvm_vcpu_ioctl_get_regs(vcpu, &kvm_regs);
2810 if (r)
2811 goto out;
2812 r = -EFAULT;
2813 if (copy_to_user(argp, &kvm_regs, sizeof kvm_regs))
2814 goto out;
2815 r = 0;
2816 break;
2817 }
2818 case KVM_SET_REGS: {
2819 struct kvm_regs kvm_regs;
2820
2821 r = -EFAULT;
2822 if (copy_from_user(&kvm_regs, argp, sizeof kvm_regs))
2823 goto out;
2824 r = kvm_vcpu_ioctl_set_regs(vcpu, &kvm_regs);
2825 if (r)
2826 goto out;
2827 r = 0;
2828 break;
2829 }
2830 case KVM_GET_SREGS: {
2831 struct kvm_sregs kvm_sregs;
2832
2833 memset(&kvm_sregs, 0, sizeof kvm_sregs);
2834 r = kvm_vcpu_ioctl_get_sregs(vcpu, &kvm_sregs);
2835 if (r)
2836 goto out;
2837 r = -EFAULT;
2838 if (copy_to_user(argp, &kvm_sregs, sizeof kvm_sregs))
2839 goto out;
2840 r = 0;
2841 break;
2842 }
2843 case KVM_SET_SREGS: {
2844 struct kvm_sregs kvm_sregs;
2845
2846 r = -EFAULT;
2847 if (copy_from_user(&kvm_sregs, argp, sizeof kvm_sregs))
2848 goto out;
2849 r = kvm_vcpu_ioctl_set_sregs(vcpu, &kvm_sregs);
2850 if (r)
2851 goto out;
2852 r = 0;
2853 break;
2854 }
2855 case KVM_TRANSLATE: {
2856 struct kvm_translation tr;
2857
2858 r = -EFAULT;
2859 if (copy_from_user(&tr, argp, sizeof tr))
2860 goto out;
2861 r = kvm_vcpu_ioctl_translate(vcpu, &tr);
2862 if (r)
2863 goto out;
2864 r = -EFAULT;
2865 if (copy_to_user(argp, &tr, sizeof tr))
2866 goto out;
2867 r = 0;
2868 break;
2869 }
2870 case KVM_INTERRUPT: {
2871 struct kvm_interrupt irq;
2872
2873 r = -EFAULT;
2874 if (copy_from_user(&irq, argp, sizeof irq))
2875 goto out;
2876 r = kvm_vcpu_ioctl_interrupt(vcpu, &irq);
2877 if (r)
2878 goto out;
2879 r = 0;
2880 break;
2881 }
2882 case KVM_DEBUG_GUEST: {
2883 struct kvm_debug_guest dbg;
2884
2885 r = -EFAULT;
2886 if (copy_from_user(&dbg, argp, sizeof dbg))
2887 goto out;
2888 r = kvm_vcpu_ioctl_debug_guest(vcpu, &dbg);
2889 if (r)
2890 goto out;
2891 r = 0;
2892 break;
2893 }
2894 case KVM_GET_MSRS:
2895 r = msr_io(vcpu, argp, kvm_get_msr, 1);
2896 break;
2897 case KVM_SET_MSRS:
2898 r = msr_io(vcpu, argp, do_set_msr, 0);
2899 break;
2900 case KVM_SET_CPUID: {
2901 struct kvm_cpuid __user *cpuid_arg = argp;
2902 struct kvm_cpuid cpuid;
2903
2904 r = -EFAULT;
2905 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
2906 goto out;
2907 r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries);
2908 if (r)
2909 goto out;
2910 break;
2911 }
2912 case KVM_SET_SIGNAL_MASK: {
2913 struct kvm_signal_mask __user *sigmask_arg = argp;
2914 struct kvm_signal_mask kvm_sigmask;
2915 sigset_t sigset, *p;
2916
2917 p = NULL;
2918 if (argp) {
2919 r = -EFAULT;
2920 if (copy_from_user(&kvm_sigmask, argp,
2921 sizeof kvm_sigmask))
2922 goto out;
2923 r = -EINVAL;
2924 if (kvm_sigmask.len != sizeof sigset)
2925 goto out;
2926 r = -EFAULT;
2927 if (copy_from_user(&sigset, sigmask_arg->sigset,
2928 sizeof sigset))
2929 goto out;
2930 p = &sigset;
2931 }
2932 r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset);
2933 break;
2934 }
2935 case KVM_GET_FPU: {
2936 struct kvm_fpu fpu;
2937
2938 memset(&fpu, 0, sizeof fpu);
2939 r = kvm_vcpu_ioctl_get_fpu(vcpu, &fpu);
2940 if (r)
2941 goto out;
2942 r = -EFAULT;
2943 if (copy_to_user(argp, &fpu, sizeof fpu))
2944 goto out;
2945 r = 0;
2946 break;
2947 }
2948 case KVM_SET_FPU: {
2949 struct kvm_fpu fpu;
2950
2951 r = -EFAULT;
2952 if (copy_from_user(&fpu, argp, sizeof fpu))
2953 goto out;
2954 r = kvm_vcpu_ioctl_set_fpu(vcpu, &fpu);
2955 if (r)
2956 goto out;
2957 r = 0;
2958 break;
2959 }
2960 case KVM_GET_LAPIC: {
2961 struct kvm_lapic_state lapic;
2962
2963 memset(&lapic, 0, sizeof lapic);
2964 r = kvm_vcpu_ioctl_get_lapic(vcpu, &lapic);
2965 if (r)
2966 goto out;
2967 r = -EFAULT;
2968 if (copy_to_user(argp, &lapic, sizeof lapic))
2969 goto out;
2970 r = 0;
2971 break;
2972 }
2973 case KVM_SET_LAPIC: {
2974 struct kvm_lapic_state lapic;
2975 3031
2976 r = -EFAULT; 3032 vcpu->guest_fpu_loaded = 1;
2977 if (copy_from_user(&lapic, argp, sizeof lapic)) 3033 fx_save(&vcpu->arch.host_fx_image);
2978 goto out; 3034 fx_restore(&vcpu->arch.guest_fx_image);
2979 r = kvm_vcpu_ioctl_set_lapic(vcpu, &lapic);;
2980 if (r)
2981 goto out;
2982 r = 0;
2983 break;
2984 }
2985 default:
2986 ;
2987 }
2988out:
2989 return r;
2990} 3035}
3036EXPORT_SYMBOL_GPL(kvm_load_guest_fpu);
2991 3037
2992static long kvm_vm_ioctl(struct file *filp, 3038void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
2993 unsigned int ioctl, unsigned long arg)
2994{ 3039{
2995 struct kvm *kvm = filp->private_data; 3040 if (!vcpu->guest_fpu_loaded)
2996 void __user *argp = (void __user *)arg; 3041 return;
2997 int r = -EINVAL;
2998
2999 switch (ioctl) {
3000 case KVM_CREATE_VCPU:
3001 r = kvm_vm_ioctl_create_vcpu(kvm, arg);
3002 if (r < 0)
3003 goto out;
3004 break;
3005 case KVM_SET_MEMORY_REGION: {
3006 struct kvm_memory_region kvm_mem;
3007
3008 r = -EFAULT;
3009 if (copy_from_user(&kvm_mem, argp, sizeof kvm_mem))
3010 goto out;
3011 r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_mem);
3012 if (r)
3013 goto out;
3014 break;
3015 }
3016 case KVM_GET_DIRTY_LOG: {
3017 struct kvm_dirty_log log;
3018
3019 r = -EFAULT;
3020 if (copy_from_user(&log, argp, sizeof log))
3021 goto out;
3022 r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
3023 if (r)
3024 goto out;
3025 break;
3026 }
3027 case KVM_SET_MEMORY_ALIAS: {
3028 struct kvm_memory_alias alias;
3029
3030 r = -EFAULT;
3031 if (copy_from_user(&alias, argp, sizeof alias))
3032 goto out;
3033 r = kvm_vm_ioctl_set_memory_alias(kvm, &alias);
3034 if (r)
3035 goto out;
3036 break;
3037 }
3038 case KVM_CREATE_IRQCHIP:
3039 r = -ENOMEM;
3040 kvm->vpic = kvm_create_pic(kvm);
3041 if (kvm->vpic) {
3042 r = kvm_ioapic_init(kvm);
3043 if (r) {
3044 kfree(kvm->vpic);
3045 kvm->vpic = NULL;
3046 goto out;
3047 }
3048 }
3049 else
3050 goto out;
3051 break;
3052 case KVM_IRQ_LINE: {
3053 struct kvm_irq_level irq_event;
3054
3055 r = -EFAULT;
3056 if (copy_from_user(&irq_event, argp, sizeof irq_event))
3057 goto out;
3058 if (irqchip_in_kernel(kvm)) {
3059 mutex_lock(&kvm->lock);
3060 if (irq_event.irq < 16)
3061 kvm_pic_set_irq(pic_irqchip(kvm),
3062 irq_event.irq,
3063 irq_event.level);
3064 kvm_ioapic_set_irq(kvm->vioapic,
3065 irq_event.irq,
3066 irq_event.level);
3067 mutex_unlock(&kvm->lock);
3068 r = 0;
3069 }
3070 break;
3071 }
3072 case KVM_GET_IRQCHIP: {
3073 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
3074 struct kvm_irqchip chip;
3075
3076 r = -EFAULT;
3077 if (copy_from_user(&chip, argp, sizeof chip))
3078 goto out;
3079 r = -ENXIO;
3080 if (!irqchip_in_kernel(kvm))
3081 goto out;
3082 r = kvm_vm_ioctl_get_irqchip(kvm, &chip);
3083 if (r)
3084 goto out;
3085 r = -EFAULT;
3086 if (copy_to_user(argp, &chip, sizeof chip))
3087 goto out;
3088 r = 0;
3089 break;
3090 }
3091 case KVM_SET_IRQCHIP: {
3092 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
3093 struct kvm_irqchip chip;
3094 3042
3095 r = -EFAULT; 3043 vcpu->guest_fpu_loaded = 0;
3096 if (copy_from_user(&chip, argp, sizeof chip)) 3044 fx_save(&vcpu->arch.guest_fx_image);
3097 goto out; 3045 fx_restore(&vcpu->arch.host_fx_image);
3098 r = -ENXIO; 3046 ++vcpu->stat.fpu_reload;
3099 if (!irqchip_in_kernel(kvm))
3100 goto out;
3101 r = kvm_vm_ioctl_set_irqchip(kvm, &chip);
3102 if (r)
3103 goto out;
3104 r = 0;
3105 break;
3106 }
3107 default:
3108 ;
3109 }
3110out:
3111 return r;
3112} 3047}
3048EXPORT_SYMBOL_GPL(kvm_put_guest_fpu);
3113 3049
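The two helpers exported above implement lazy FPU switching: the guest fxsave image is restored only when the vcpu actually needs it, and saved back (bumping the fpu_reload counter) when the host reclaims the FPU. A minimal sketch of how a run path could bracket guest execution with them; the wrapper below is illustrative only, the real call sites are elsewhere in this patch:

#include <linux/kvm_host.h>

/* Illustrative only -- not part of the patch. */
static void example_run_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
	kvm_load_guest_fpu(vcpu);	/* returns immediately if already loaded or !fpu_active */

	/* ... enter guest mode via kvm_x86_ops, handle the exit ... */

	kvm_put_guest_fpu(vcpu);	/* saves the guest image, restores the host's */
}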
3114static struct page *kvm_vm_nopage(struct vm_area_struct *vma, 3050void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
3115 unsigned long address,
3116 int *type)
3117{ 3051{
3118 struct kvm *kvm = vma->vm_file->private_data; 3052 kvm_x86_ops->vcpu_free(vcpu);
3119 unsigned long pgoff;
3120 struct page *page;
3121
3122 pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
3123 page = gfn_to_page(kvm, pgoff);
3124 if (!page)
3125 return NOPAGE_SIGBUS;
3126 get_page(page);
3127 if (type != NULL)
3128 *type = VM_FAULT_MINOR;
3129
3130 return page;
3131} 3053}
3132 3054
3133static struct vm_operations_struct kvm_vm_vm_ops = { 3055struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
3134 .nopage = kvm_vm_nopage, 3056 unsigned int id)
3135};
3136
3137static int kvm_vm_mmap(struct file *file, struct vm_area_struct *vma)
3138{ 3057{
3139 vma->vm_ops = &kvm_vm_vm_ops; 3058 return kvm_x86_ops->vcpu_create(kvm, id);
3140 return 0;
3141} 3059}
3142 3060
3143static struct file_operations kvm_vm_fops = { 3061int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
3144 .release = kvm_vm_release,
3145 .unlocked_ioctl = kvm_vm_ioctl,
3146 .compat_ioctl = kvm_vm_ioctl,
3147 .mmap = kvm_vm_mmap,
3148};
3149
3150static int kvm_dev_ioctl_create_vm(void)
3151{ 3062{
3152 int fd, r; 3063 int r;
3153 struct inode *inode;
3154 struct file *file;
3155 struct kvm *kvm;
3156 3064
3157 kvm = kvm_create_vm(); 3065 /* We do fxsave: this must be aligned. */
3158 if (IS_ERR(kvm)) 3066 BUG_ON((unsigned long)&vcpu->arch.host_fx_image & 0xF);
3159 return PTR_ERR(kvm);
3160 r = anon_inode_getfd(&fd, &inode, &file, "kvm-vm", &kvm_vm_fops, kvm);
3161 if (r) {
3162 kvm_destroy_vm(kvm);
3163 return r;
3164 }
3165 3067
3166 kvm->filp = file; 3068 vcpu_load(vcpu);
3069 r = kvm_arch_vcpu_reset(vcpu);
3070 if (r == 0)
3071 r = kvm_mmu_setup(vcpu);
3072 vcpu_put(vcpu);
3073 if (r < 0)
3074 goto free_vcpu;
3167 3075
3168 return fd; 3076 return 0;
3077free_vcpu:
3078 kvm_x86_ops->vcpu_free(vcpu);
3079 return r;
3169} 3080}
3170 3081
3171static long kvm_dev_ioctl(struct file *filp, 3082void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
3172 unsigned int ioctl, unsigned long arg)
3173{ 3083{
3174 void __user *argp = (void __user *)arg; 3084 vcpu_load(vcpu);
3175 long r = -EINVAL; 3085 kvm_mmu_unload(vcpu);
3176 3086 vcpu_put(vcpu);
3177 switch (ioctl) {
3178 case KVM_GET_API_VERSION:
3179 r = -EINVAL;
3180 if (arg)
3181 goto out;
3182 r = KVM_API_VERSION;
3183 break;
3184 case KVM_CREATE_VM:
3185 r = -EINVAL;
3186 if (arg)
3187 goto out;
3188 r = kvm_dev_ioctl_create_vm();
3189 break;
3190 case KVM_GET_MSR_INDEX_LIST: {
3191 struct kvm_msr_list __user *user_msr_list = argp;
3192 struct kvm_msr_list msr_list;
3193 unsigned n;
3194
3195 r = -EFAULT;
3196 if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list))
3197 goto out;
3198 n = msr_list.nmsrs;
3199 msr_list.nmsrs = num_msrs_to_save + ARRAY_SIZE(emulated_msrs);
3200 if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list))
3201 goto out;
3202 r = -E2BIG;
3203 if (n < num_msrs_to_save)
3204 goto out;
3205 r = -EFAULT;
3206 if (copy_to_user(user_msr_list->indices, &msrs_to_save,
3207 num_msrs_to_save * sizeof(u32)))
3208 goto out;
3209 if (copy_to_user(user_msr_list->indices
3210 + num_msrs_to_save * sizeof(u32),
3211 &emulated_msrs,
3212 ARRAY_SIZE(emulated_msrs) * sizeof(u32)))
3213 goto out;
3214 r = 0;
3215 break;
3216 }
3217 case KVM_CHECK_EXTENSION: {
3218 int ext = (long)argp;
3219 3087
3220 switch (ext) { 3088 kvm_x86_ops->vcpu_free(vcpu);
3221 case KVM_CAP_IRQCHIP:
3222 case KVM_CAP_HLT:
3223 r = 1;
3224 break;
3225 default:
3226 r = 0;
3227 break;
3228 }
3229 break;
3230 }
3231 case KVM_GET_VCPU_MMAP_SIZE:
3232 r = -EINVAL;
3233 if (arg)
3234 goto out;
3235 r = 2 * PAGE_SIZE;
3236 break;
3237 default:
3238 ;
3239 }
3240out:
3241 return r;
3242} 3089}
3243 3090
3244static struct file_operations kvm_chardev_ops = { 3091int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
3245 .unlocked_ioctl = kvm_dev_ioctl,
3246 .compat_ioctl = kvm_dev_ioctl,
3247};
3248
3249static struct miscdevice kvm_dev = {
3250 KVM_MINOR,
3251 "kvm",
3252 &kvm_chardev_ops,
3253};
3254
3255/*
3256 * Make sure that a cpu that is being hot-unplugged does not have any vcpus
3257 * cached on it.
3258 */
3259static void decache_vcpus_on_cpu(int cpu)
3260{ 3092{
3261 struct kvm *vm; 3093 return kvm_x86_ops->vcpu_reset(vcpu);
3262 struct kvm_vcpu *vcpu;
3263 int i;
3264
3265 spin_lock(&kvm_lock);
3266 list_for_each_entry(vm, &vm_list, vm_list)
3267 for (i = 0; i < KVM_MAX_VCPUS; ++i) {
3268 vcpu = vm->vcpus[i];
3269 if (!vcpu)
3270 continue;
3271 /*
3272 * If the vcpu is locked, then it is running on some
3273 * other cpu and therefore it is not cached on the
3274 * cpu in question.
3275 *
3276 * If it's not locked, check the last cpu it executed
3277 * on.
3278 */
3279 if (mutex_trylock(&vcpu->mutex)) {
3280 if (vcpu->cpu == cpu) {
3281 kvm_x86_ops->vcpu_decache(vcpu);
3282 vcpu->cpu = -1;
3283 }
3284 mutex_unlock(&vcpu->mutex);
3285 }
3286 }
3287 spin_unlock(&kvm_lock);
3288} 3094}
3289 3095
3290static void hardware_enable(void *junk) 3096void kvm_arch_hardware_enable(void *garbage)
3291{ 3097{
3292 int cpu = raw_smp_processor_id(); 3098 kvm_x86_ops->hardware_enable(garbage);
3293
3294 if (cpu_isset(cpu, cpus_hardware_enabled))
3295 return;
3296 cpu_set(cpu, cpus_hardware_enabled);
3297 kvm_x86_ops->hardware_enable(NULL);
3298} 3099}
3299 3100
3300static void hardware_disable(void *junk) 3101void kvm_arch_hardware_disable(void *garbage)
3301{ 3102{
3302 int cpu = raw_smp_processor_id(); 3103 kvm_x86_ops->hardware_disable(garbage);
3303
3304 if (!cpu_isset(cpu, cpus_hardware_enabled))
3305 return;
3306 cpu_clear(cpu, cpus_hardware_enabled);
3307 decache_vcpus_on_cpu(cpu);
3308 kvm_x86_ops->hardware_disable(NULL);
3309} 3104}
3310 3105
3311static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val, 3106int kvm_arch_hardware_setup(void)
3312 void *v)
3313{ 3107{
3314 int cpu = (long)v; 3108 return kvm_x86_ops->hardware_setup();
3315
3316 switch (val) {
3317 case CPU_DYING:
3318 case CPU_DYING_FROZEN:
3319 printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n",
3320 cpu);
3321 hardware_disable(NULL);
3322 break;
3323 case CPU_UP_CANCELED:
3324 case CPU_UP_CANCELED_FROZEN:
3325 printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n",
3326 cpu);
3327 smp_call_function_single(cpu, hardware_disable, NULL, 0, 1);
3328 break;
3329 case CPU_ONLINE:
3330 case CPU_ONLINE_FROZEN:
3331 printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n",
3332 cpu);
3333 smp_call_function_single(cpu, hardware_enable, NULL, 0, 1);
3334 break;
3335 }
3336 return NOTIFY_OK;
3337} 3109}
3338 3110
3339static int kvm_reboot(struct notifier_block *notifier, unsigned long val, 3111void kvm_arch_hardware_unsetup(void)
3340 void *v)
3341{ 3112{
3342 if (val == SYS_RESTART) { 3113 kvm_x86_ops->hardware_unsetup();
3343 /*
3344 * Some (well, at least mine) BIOSes hang on reboot if
3345 * in vmx root mode.
3346 */
3347 printk(KERN_INFO "kvm: exiting hardware virtualization\n");
3348 on_each_cpu(hardware_disable, NULL, 0, 1);
3349 }
3350 return NOTIFY_OK;
3351} 3114}
3352 3115
3353static struct notifier_block kvm_reboot_notifier = { 3116void kvm_arch_check_processor_compat(void *rtn)
3354 .notifier_call = kvm_reboot,
3355 .priority = 0,
3356};
3357
3358void kvm_io_bus_init(struct kvm_io_bus *bus)
3359{ 3117{
3360 memset(bus, 0, sizeof(*bus)); 3118 kvm_x86_ops->check_processor_compatibility(rtn);
3361} 3119}
3362 3120
3363void kvm_io_bus_destroy(struct kvm_io_bus *bus) 3121int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
3364{ 3122{
3365 int i; 3123 struct page *page;
3124 struct kvm *kvm;
3125 int r;
3366 3126
3367 for (i = 0; i < bus->dev_count; i++) { 3127 BUG_ON(vcpu->kvm == NULL);
3368 struct kvm_io_device *pos = bus->devs[i]; 3128 kvm = vcpu->kvm;
3369 3129
3370 kvm_iodevice_destructor(pos); 3130 vcpu->arch.mmu.root_hpa = INVALID_PAGE;
3371 } 3131 if (!irqchip_in_kernel(kvm) || vcpu->vcpu_id == 0)
3372} 3132 vcpu->arch.mp_state = VCPU_MP_STATE_RUNNABLE;
3133 else
3134 vcpu->arch.mp_state = VCPU_MP_STATE_UNINITIALIZED;
3373 3135
3374struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus, gpa_t addr) 3136 page = alloc_page(GFP_KERNEL | __GFP_ZERO);
3375{ 3137 if (!page) {
3376 int i; 3138 r = -ENOMEM;
3139 goto fail;
3140 }
3141 vcpu->arch.pio_data = page_address(page);
3377 3142
3378 for (i = 0; i < bus->dev_count; i++) { 3143 r = kvm_mmu_create(vcpu);
3379 struct kvm_io_device *pos = bus->devs[i]; 3144 if (r < 0)
3145 goto fail_free_pio_data;
3380 3146
3381 if (pos->in_range(pos, addr)) 3147 if (irqchip_in_kernel(kvm)) {
3382 return pos; 3148 r = kvm_create_lapic(vcpu);
3149 if (r < 0)
3150 goto fail_mmu_destroy;
3383 } 3151 }
3384 3152
3385 return NULL; 3153 return 0;
3386}
3387
3388void kvm_io_bus_register_dev(struct kvm_io_bus *bus, struct kvm_io_device *dev)
3389{
3390 BUG_ON(bus->dev_count > (NR_IOBUS_DEVS-1));
3391 3154
3392 bus->devs[bus->dev_count++] = dev; 3155fail_mmu_destroy:
3156 kvm_mmu_destroy(vcpu);
3157fail_free_pio_data:
3158 free_page((unsigned long)vcpu->arch.pio_data);
3159fail:
3160 return r;
3393} 3161}
3394 3162
3395static struct notifier_block kvm_cpu_notifier = { 3163void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
3396 .notifier_call = kvm_cpu_hotplug,
3397 .priority = 20, /* must be > scheduler priority */
3398};
3399
3400static u64 stat_get(void *_offset)
3401{ 3164{
3402 unsigned offset = (long)_offset; 3165 kvm_free_lapic(vcpu);
3403 u64 total = 0; 3166 kvm_mmu_destroy(vcpu);
3404 struct kvm *kvm; 3167 free_page((unsigned long)vcpu->arch.pio_data);
3405 struct kvm_vcpu *vcpu;
3406 int i;
3407
3408 spin_lock(&kvm_lock);
3409 list_for_each_entry(kvm, &vm_list, vm_list)
3410 for (i = 0; i < KVM_MAX_VCPUS; ++i) {
3411 vcpu = kvm->vcpus[i];
3412 if (vcpu)
3413 total += *(u32 *)((void *)vcpu + offset);
3414 }
3415 spin_unlock(&kvm_lock);
3416 return total;
3417} 3168}
3418 3169
3419DEFINE_SIMPLE_ATTRIBUTE(stat_fops, stat_get, NULL, "%llu\n"); 3170struct kvm *kvm_arch_create_vm(void)
3420
3421static __init void kvm_init_debug(void)
3422{ 3171{
3423 struct kvm_stats_debugfs_item *p; 3172 struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
3424
3425 debugfs_dir = debugfs_create_dir("kvm", NULL);
3426 for (p = debugfs_entries; p->name; ++p)
3427 p->dentry = debugfs_create_file(p->name, 0444, debugfs_dir,
3428 (void *)(long)p->offset,
3429 &stat_fops);
3430}
3431 3173
3432static void kvm_exit_debug(void) 3174 if (!kvm)
3433{ 3175 return ERR_PTR(-ENOMEM);
3434 struct kvm_stats_debugfs_item *p;
3435 3176
3436 for (p = debugfs_entries; p->name; ++p) 3177 INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
3437 debugfs_remove(p->dentry);
3438 debugfs_remove(debugfs_dir);
3439}
3440 3178
3441static int kvm_suspend(struct sys_device *dev, pm_message_t state) 3179 return kvm;
3442{
3443 hardware_disable(NULL);
3444 return 0;
3445} 3180}
3446 3181
3447static int kvm_resume(struct sys_device *dev) 3182static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
3448{ 3183{
3449 hardware_enable(NULL); 3184 vcpu_load(vcpu);
3450 return 0; 3185 kvm_mmu_unload(vcpu);
3186 vcpu_put(vcpu);
3451} 3187}
3452 3188
3453static struct sysdev_class kvm_sysdev_class = { 3189static void kvm_free_vcpus(struct kvm *kvm)
3454 .name = "kvm",
3455 .suspend = kvm_suspend,
3456 .resume = kvm_resume,
3457};
3458
3459static struct sys_device kvm_sysdev = {
3460 .id = 0,
3461 .cls = &kvm_sysdev_class,
3462};
3463
3464hpa_t bad_page_address;
3465
3466static inline
3467struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn)
3468{ 3190{
3469 return container_of(pn, struct kvm_vcpu, preempt_notifier); 3191 unsigned int i;
3470}
3471 3192
3472static void kvm_sched_in(struct preempt_notifier *pn, int cpu) 3193 /*
3473{ 3194 * Unpin any mmu pages first.
3474 struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn); 3195 */
3196 for (i = 0; i < KVM_MAX_VCPUS; ++i)
3197 if (kvm->vcpus[i])
3198 kvm_unload_vcpu_mmu(kvm->vcpus[i]);
3199 for (i = 0; i < KVM_MAX_VCPUS; ++i) {
3200 if (kvm->vcpus[i]) {
3201 kvm_arch_vcpu_free(kvm->vcpus[i]);
3202 kvm->vcpus[i] = NULL;
3203 }
3204 }
3475 3205
3476 kvm_x86_ops->vcpu_load(vcpu, cpu);
3477} 3206}
3478 3207
3479static void kvm_sched_out(struct preempt_notifier *pn, 3208void kvm_arch_destroy_vm(struct kvm *kvm)
3480 struct task_struct *next)
3481{ 3209{
3482 struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn); 3210 kfree(kvm->arch.vpic);
3483 3211 kfree(kvm->arch.vioapic);
3484 kvm_x86_ops->vcpu_put(vcpu); 3212 kvm_free_vcpus(kvm);
3213 kvm_free_physmem(kvm);
3214 kfree(kvm);
3485} 3215}
3486 3216
3487int kvm_init_x86(struct kvm_x86_ops *ops, unsigned int vcpu_size, 3217int kvm_arch_set_memory_region(struct kvm *kvm,
3488 struct module *module) 3218 struct kvm_userspace_memory_region *mem,
3219 struct kvm_memory_slot old,
3220 int user_alloc)
3489{ 3221{
3490 int r; 3222 int npages = mem->memory_size >> PAGE_SHIFT;
3491 int cpu; 3223 struct kvm_memory_slot *memslot = &kvm->memslots[mem->slot];
3492
3493 if (kvm_x86_ops) {
3494 printk(KERN_ERR "kvm: already loaded the other module\n");
3495 return -EEXIST;
3496 }
3497 3224
3498 if (!ops->cpu_has_kvm_support()) { 3225 /*To keep backward compatibility with older userspace,
3499 printk(KERN_ERR "kvm: no hardware support\n"); 3226 *x86 needs to hanlde !user_alloc case.
3499 printk(KERN_ERR "kvm: no hardware support\n"); 3226 *x86 needs to handle !user_alloc case.
3500 return -EOPNOTSUPP; 3227 */
3501 } 3228 if (!user_alloc) {
3502 if (ops->disabled_by_bios()) { 3229 if (npages && !old.rmap) {
3503 printk(KERN_ERR "kvm: disabled by bios\n"); 3230 memslot->userspace_addr = do_mmap(NULL, 0,
3504 return -EOPNOTSUPP; 3231 npages * PAGE_SIZE,
3505 } 3232 PROT_READ | PROT_WRITE,
3506 3233 MAP_SHARED | MAP_ANONYMOUS,
3507 kvm_x86_ops = ops; 3234 0);
3508 3235
3509 r = kvm_x86_ops->hardware_setup(); 3236 if (IS_ERR((void *)memslot->userspace_addr))
3510 if (r < 0) 3237 return PTR_ERR((void *)memslot->userspace_addr);
3511 goto out; 3238 } else {
3512 3239 if (!old.user_alloc && old.rmap) {
3513 for_each_online_cpu(cpu) { 3240 int ret;
3514 smp_call_function_single(cpu, 3241
3515 kvm_x86_ops->check_processor_compatibility, 3242 ret = do_munmap(current->mm, old.userspace_addr,
3516 &r, 0, 1); 3243 old.npages * PAGE_SIZE);
3517 if (r < 0) 3244 if (ret < 0)
3518 goto out_free_0; 3245 printk(KERN_WARNING
3519 } 3246 "kvm_vm_ioctl_set_memory_region: "
3520 3247 "failed to munmap memory\n");
3521 on_each_cpu(hardware_enable, NULL, 0, 1); 3248 }
3522 r = register_cpu_notifier(&kvm_cpu_notifier); 3249 }
3523 if (r)
3524 goto out_free_1;
3525 register_reboot_notifier(&kvm_reboot_notifier);
3526
3527 r = sysdev_class_register(&kvm_sysdev_class);
3528 if (r)
3529 goto out_free_2;
3530
3531 r = sysdev_register(&kvm_sysdev);
3532 if (r)
3533 goto out_free_3;
3534
3535 /* A kmem cache lets us meet the alignment requirements of fx_save. */
3536 kvm_vcpu_cache = kmem_cache_create("kvm_vcpu", vcpu_size,
3537 __alignof__(struct kvm_vcpu), 0, 0);
3538 if (!kvm_vcpu_cache) {
3539 r = -ENOMEM;
3540 goto out_free_4;
3541 } 3250 }
3542 3251
3543 kvm_chardev_ops.owner = module; 3252 if (!kvm->arch.n_requested_mmu_pages) {
3544 3253 unsigned int nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm);
3545 r = misc_register(&kvm_dev); 3254 kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
3546 if (r) {
3547 printk (KERN_ERR "kvm: misc device register failed\n");
3548 goto out_free;
3549 } 3255 }
3550 3256
3551 kvm_preempt_ops.sched_in = kvm_sched_in; 3257 kvm_mmu_slot_remove_write_access(kvm, mem->slot);
3552 kvm_preempt_ops.sched_out = kvm_sched_out; 3258 kvm_flush_remote_tlbs(kvm);
3553
3554 return r;
3555 3259
3556out_free: 3260 return 0;
3557 kmem_cache_destroy(kvm_vcpu_cache);
3558out_free_4:
3559 sysdev_unregister(&kvm_sysdev);
3560out_free_3:
3561 sysdev_class_unregister(&kvm_sysdev_class);
3562out_free_2:
3563 unregister_reboot_notifier(&kvm_reboot_notifier);
3564 unregister_cpu_notifier(&kvm_cpu_notifier);
3565out_free_1:
3566 on_each_cpu(hardware_disable, NULL, 0, 1);
3567out_free_0:
3568 kvm_x86_ops->hardware_unsetup();
3569out:
3570 kvm_x86_ops = NULL;
3571 return r;
3572} 3261}
3573 3262
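The !user_alloc branch in kvm_arch_set_memory_region() above only keeps the old kernel-allocated KVM_SET_MEMORY_REGION interface working; newer userspace is expected to hand KVM memory it allocated itself. A hedged userspace sketch of that preferred path, using the kvm_userspace_memory_region layout from include/linux/kvm.h (slot number, VM setup and error handling are placeholders):

#include <sys/ioctl.h>
#include <sys/mman.h>
#include <linux/kvm.h>

/* Sketch of the user_alloc path: userspace mmaps the backing store itself. */
static int set_guest_ram(int vm_fd, __u64 gpa, __u64 size)
{
	void *ram = mmap(NULL, size, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (ram == MAP_FAILED)
		return -1;

	struct kvm_userspace_memory_region region = {
		.slot            = 0,
		.flags           = 0,
		.guest_phys_addr = gpa,
		.memory_size     = size,
		.userspace_addr  = (__u64)(unsigned long)ram,
	};
	return ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, &region);
}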
3574void kvm_exit_x86(void) 3263int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
3575{ 3264{
3576 misc_deregister(&kvm_dev); 3265 return vcpu->arch.mp_state == VCPU_MP_STATE_RUNNABLE
3577 kmem_cache_destroy(kvm_vcpu_cache); 3266 || vcpu->arch.mp_state == VCPU_MP_STATE_SIPI_RECEIVED;
3578 sysdev_unregister(&kvm_sysdev);
3579 sysdev_class_unregister(&kvm_sysdev_class);
3580 unregister_reboot_notifier(&kvm_reboot_notifier);
3581 unregister_cpu_notifier(&kvm_cpu_notifier);
3582 on_each_cpu(hardware_disable, NULL, 0, 1);
3583 kvm_x86_ops->hardware_unsetup();
3584 kvm_x86_ops = NULL;
3585} 3267}
3586 3268
3587static __init int kvm_init(void) 3269static void vcpu_kick_intr(void *info)
3588{ 3270{
3589 static struct page *bad_page; 3271#ifdef DEBUG
3590 int r; 3272 struct kvm_vcpu *vcpu = (struct kvm_vcpu *)info;
3591 3273 printk(KERN_DEBUG "vcpu_kick_intr %p \n", vcpu);
3592 r = kvm_mmu_module_init(); 3274#endif
3593 if (r)
3594 goto out4;
3595
3596 kvm_init_debug();
3597
3598 kvm_init_msr_list();
3599
3600 if ((bad_page = alloc_page(GFP_KERNEL)) == NULL) {
3601 r = -ENOMEM;
3602 goto out;
3603 }
3604
3605 bad_page_address = page_to_pfn(bad_page) << PAGE_SHIFT;
3606 memset(__va(bad_page_address), 0, PAGE_SIZE);
3607
3608 return 0;
3609
3610out:
3611 kvm_exit_debug();
3612 kvm_mmu_module_exit();
3613out4:
3614 return r;
3615} 3275}
3616 3276
3617static __exit void kvm_exit(void) 3277void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
3618{ 3278{
3619 kvm_exit_debug(); 3279 int ipi_pcpu = vcpu->cpu;
3620 __free_page(pfn_to_page(bad_page_address >> PAGE_SHIFT));
3621 kvm_mmu_module_exit();
3622}
3623
3624module_init(kvm_init)
3625module_exit(kvm_exit)
3626 3280
3627EXPORT_SYMBOL_GPL(kvm_init_x86); 3281 if (waitqueue_active(&vcpu->wq)) {
3628EXPORT_SYMBOL_GPL(kvm_exit_x86); 3282 wake_up_interruptible(&vcpu->wq);
3283 ++vcpu->stat.halt_wakeup;
3284 }
3285 if (vcpu->guest_mode)
3286 smp_call_function_single(ipi_pcpu, vcpu_kick_intr, vcpu, 0, 0);
3287}
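Taken together, the kvm_arch_*() functions added at the tail of x86.c form the architecture hook layer that the now-generic virt/kvm/kvm_main.c calls into, with each hook forwarding to kvm_x86_ops. A rough sketch of the resulting layering during vcpu creation; the generic-side body below is paraphrased, not copied from kvm_main.c:

#include <linux/kvm_host.h>

/* Rough layering sketch -- not the literal virt/kvm/kvm_main.c code. */
static int generic_create_vcpu(struct kvm *kvm, unsigned int id)
{
	struct kvm_vcpu *vcpu;
	int r;

	vcpu = kvm_arch_vcpu_create(kvm, id);	/* forwards to kvm_x86_ops->vcpu_create() */
	if (IS_ERR(vcpu))
		return PTR_ERR(vcpu);

	r = kvm_arch_vcpu_setup(vcpu);		/* vcpu reset + kvm_mmu_setup() */
	if (r)
		return r;			/* setup already freed the vcpu on error */

	/* ... install into kvm->vcpus[] and expose a vcpu fd to userspace ... */
	return 0;
}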
diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c
new file mode 100644
index 000000000000..79586003397a
--- /dev/null
+++ b/arch/x86/kvm/x86_emulate.c
@@ -0,0 +1,1912 @@
1/******************************************************************************
2 * x86_emulate.c
3 *
4 * Generic x86 (32-bit and 64-bit) instruction decoder and emulator.
5 *
6 * Copyright (c) 2005 Keir Fraser
7 *
8 * Linux coding style, mod r/m decoder, segment base fixes, real-mode
9 * privileged instructions:
10 *
11 * Copyright (C) 2006 Qumranet
12 *
13 * Avi Kivity <avi@qumranet.com>
14 * Yaniv Kamay <yaniv@qumranet.com>
15 *
16 * This work is licensed under the terms of the GNU GPL, version 2. See
17 * the COPYING file in the top-level directory.
18 *
19 * From: xen-unstable 10676:af9809f51f81a3c43f276f00c81a52ef558afda4
20 */
21
22#ifndef __KERNEL__
23#include <stdio.h>
24#include <stdint.h>
25#include <public/xen.h>
26#define DPRINTF(_f, _a ...) printf(_f , ## _a)
27#else
28#include <linux/kvm_host.h>
29#define DPRINTF(x...) do {} while (0)
30#endif
31#include <linux/module.h>
32#include <asm/kvm_x86_emulate.h>
33
34/*
35 * Opcode effective-address decode tables.
36 * Note that we only emulate instructions that have at least one memory
37 * operand (excluding implicit stack references). We assume that stack
38 * references and instruction fetches will never occur in special memory
39 * areas that require emulation. So, for example, 'mov <imm>,<reg>' need
40 * not be handled.
41 */
42
43/* Operand sizes: 8-bit operands or specified/overridden size. */
44#define ByteOp (1<<0) /* 8-bit operands. */
45/* Destination operand type. */
46#define ImplicitOps (1<<1) /* Implicit in opcode. No generic decode. */
47#define DstReg (2<<1) /* Register operand. */
48#define DstMem (3<<1) /* Memory operand. */
49#define DstMask (3<<1)
50/* Source operand type. */
51#define SrcNone (0<<3) /* No source operand. */
52#define SrcImplicit (0<<3) /* Source operand is implicit in the opcode. */
53#define SrcReg (1<<3) /* Register operand. */
54#define SrcMem (2<<3) /* Memory operand. */
55#define SrcMem16 (3<<3) /* Memory operand (16-bit). */
56#define SrcMem32 (4<<3) /* Memory operand (32-bit). */
57#define SrcImm (5<<3) /* Immediate operand. */
58#define SrcImmByte (6<<3) /* 8-bit sign-extended immediate operand. */
59#define SrcMask (7<<3)
60/* Generic ModRM decode. */
61#define ModRM (1<<6)
62/* Destination is only written; never read. */
63#define Mov (1<<7)
64#define BitOp (1<<8)
65#define MemAbs (1<<9) /* Memory operand is absolute displacement */
66#define String (1<<10) /* String instruction (rep capable) */
67#define Stack (1<<11) /* Stack instruction (push/pop) */
68
69static u16 opcode_table[256] = {
70 /* 0x00 - 0x07 */
71 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
72 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
73 0, 0, 0, 0,
74 /* 0x08 - 0x0F */
75 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
76 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
77 0, 0, 0, 0,
78 /* 0x10 - 0x17 */
79 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
80 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
81 0, 0, 0, 0,
82 /* 0x18 - 0x1F */
83 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
84 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
85 0, 0, 0, 0,
86 /* 0x20 - 0x27 */
87 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
88 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
89 SrcImmByte, SrcImm, 0, 0,
90 /* 0x28 - 0x2F */
91 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
92 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
93 0, 0, 0, 0,
94 /* 0x30 - 0x37 */
95 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
96 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
97 0, 0, 0, 0,
98 /* 0x38 - 0x3F */
99 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
100 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
101 0, 0, 0, 0,
102 /* 0x40 - 0x47 */
103 DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg,
104 /* 0x48 - 0x4F */
105 DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg,
106 /* 0x50 - 0x57 */
107 SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, SrcReg | Stack,
108 SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, SrcReg | Stack,
109 /* 0x58 - 0x5F */
110 DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack,
111 DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack,
112 /* 0x60 - 0x67 */
113 0, 0, 0, DstReg | SrcMem32 | ModRM | Mov /* movsxd (x86/64) */ ,
114 0, 0, 0, 0,
115 /* 0x68 - 0x6F */
116 0, 0, ImplicitOps | Mov | Stack, 0,
117 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* insb, insw/insd */
118 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* outsb, outsw/outsd */
119 /* 0x70 - 0x77 */
120 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
121 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
122 /* 0x78 - 0x7F */
123 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
124 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
125 /* 0x80 - 0x87 */
126 ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM,
127 ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM,
128 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
129 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
130 /* 0x88 - 0x8F */
131 ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov,
132 ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
133 0, ModRM | DstReg, 0, DstMem | SrcNone | ModRM | Mov | Stack,
134 /* 0x90 - 0x9F */
135 0, 0, 0, 0, 0, 0, 0, 0,
136 0, 0, 0, 0, ImplicitOps | Stack, ImplicitOps | Stack, 0, 0,
137 /* 0xA0 - 0xA7 */
138 ByteOp | DstReg | SrcMem | Mov | MemAbs, DstReg | SrcMem | Mov | MemAbs,
139 ByteOp | DstMem | SrcReg | Mov | MemAbs, DstMem | SrcReg | Mov | MemAbs,
140 ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String,
141 ByteOp | ImplicitOps | String, ImplicitOps | String,
142 /* 0xA8 - 0xAF */
143 0, 0, ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String,
144 ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String,
145 ByteOp | ImplicitOps | String, ImplicitOps | String,
146 /* 0xB0 - 0xBF */
147 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
148 /* 0xC0 - 0xC7 */
149 ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM,
150 0, ImplicitOps | Stack, 0, 0,
151 ByteOp | DstMem | SrcImm | ModRM | Mov, DstMem | SrcImm | ModRM | Mov,
152 /* 0xC8 - 0xCF */
153 0, 0, 0, 0, 0, 0, 0, 0,
154 /* 0xD0 - 0xD7 */
155 ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM,
156 ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM,
157 0, 0, 0, 0,
158 /* 0xD8 - 0xDF */
159 0, 0, 0, 0, 0, 0, 0, 0,
160 /* 0xE0 - 0xE7 */
161 0, 0, 0, 0, 0, 0, 0, 0,
162 /* 0xE8 - 0xEF */
163 ImplicitOps | Stack, SrcImm|ImplicitOps, 0, SrcImmByte|ImplicitOps,
164 0, 0, 0, 0,
165 /* 0xF0 - 0xF7 */
166 0, 0, 0, 0,
167 ImplicitOps, ImplicitOps,
168 ByteOp | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM,
169 /* 0xF8 - 0xFF */
170 ImplicitOps, 0, ImplicitOps, ImplicitOps,
171 0, 0, ByteOp | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM
172};
173
174static u16 twobyte_table[256] = {
175 /* 0x00 - 0x0F */
176 0, SrcMem | ModRM | DstReg, 0, 0, 0, 0, ImplicitOps, 0,
177 ImplicitOps, ImplicitOps, 0, 0, 0, ImplicitOps | ModRM, 0, 0,
178 /* 0x10 - 0x1F */
179 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, 0,
180 /* 0x20 - 0x2F */
181 ModRM | ImplicitOps, ModRM, ModRM | ImplicitOps, ModRM, 0, 0, 0, 0,
182 0, 0, 0, 0, 0, 0, 0, 0,
183 /* 0x30 - 0x3F */
184 ImplicitOps, 0, ImplicitOps, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
185 /* 0x40 - 0x47 */
186 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
187 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
188 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
189 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
190 /* 0x48 - 0x4F */
191 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
192 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
193 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
194 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
195 /* 0x50 - 0x5F */
196 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
197 /* 0x60 - 0x6F */
198 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
199 /* 0x70 - 0x7F */
200 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
201 /* 0x80 - 0x8F */
202 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
203 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
204 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
205 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
206 /* 0x90 - 0x9F */
207 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
208 /* 0xA0 - 0xA7 */
209 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 0, 0, 0, 0,
210 /* 0xA8 - 0xAF */
211 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 0, 0, 0, 0,
212 /* 0xB0 - 0xB7 */
213 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 0,
214 DstMem | SrcReg | ModRM | BitOp,
215 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov,
216 DstReg | SrcMem16 | ModRM | Mov,
217 /* 0xB8 - 0xBF */
218 0, 0, DstMem | SrcImmByte | ModRM, DstMem | SrcReg | ModRM | BitOp,
219 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov,
220 DstReg | SrcMem16 | ModRM | Mov,
221 /* 0xC0 - 0xCF */
222 0, 0, 0, DstMem | SrcReg | ModRM | Mov, 0, 0, 0, ImplicitOps | ModRM,
223 0, 0, 0, 0, 0, 0, 0, 0,
224 /* 0xD0 - 0xDF */
225 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
226 /* 0xE0 - 0xEF */
227 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
228 /* 0xF0 - 0xFF */
229 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
230};
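/*
 * Editor's sketch (not part of the patch): how the tables above are
 * consumed.  The decoder indexes a table by opcode byte and tests the
 * individual attribute bits; an all-zero entry means the opcode is not
 * emulated.  The calls quoted below appear verbatim in x86_decode_insn()
 * further down in this file.
 *
 *	c->d = opcode_table[c->b];
 *	if (c->d & ModRM)
 *		rc = decode_modrm(ctxt, ops);
 *	else if (c->d & MemAbs)
 *		rc = decode_abs(ctxt, ops);
 */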
231
232/* EFLAGS bit definitions. */
233#define EFLG_OF (1<<11)
234#define EFLG_DF (1<<10)
235#define EFLG_SF (1<<7)
236#define EFLG_ZF (1<<6)
237#define EFLG_AF (1<<4)
238#define EFLG_PF (1<<2)
239#define EFLG_CF (1<<0)
240
241/*
242 * Instruction emulation:
243 * Most instructions are emulated directly via a fragment of inline assembly
244 * code. This allows us to save/restore EFLAGS and thus very easily pick up
245 * any modified flags.
246 */
247
248#if defined(CONFIG_X86_64)
249#define _LO32 "k" /* force 32-bit operand */
250#define _STK "%%rsp" /* stack pointer */
251#elif defined(__i386__)
252#define _LO32 "" /* force 32-bit operand */
253#define _STK "%%esp" /* stack pointer */
254#endif
255
256/*
257 * These EFLAGS bits are restored from saved value during emulation, and
258 * any changes are written back to the saved value after emulation.
259 */
260#define EFLAGS_MASK (EFLG_OF|EFLG_SF|EFLG_ZF|EFLG_AF|EFLG_PF|EFLG_CF)
261
262/* Before executing instruction: restore necessary bits in EFLAGS. */
263#define _PRE_EFLAGS(_sav, _msk, _tmp) \
264 /* EFLAGS = (_sav & _msk) | (EFLAGS & ~_msk); _sav &= ~_msk; */ \
265 "movl %"_sav",%"_LO32 _tmp"; " \
266 "push %"_tmp"; " \
267 "push %"_tmp"; " \
268 "movl %"_msk",%"_LO32 _tmp"; " \
269 "andl %"_LO32 _tmp",("_STK"); " \
270 "pushf; " \
271 "notl %"_LO32 _tmp"; " \
272 "andl %"_LO32 _tmp",("_STK"); " \
273 "andl %"_LO32 _tmp","__stringify(BITS_PER_LONG/4)"("_STK"); " \
274 "pop %"_tmp"; " \
275 "orl %"_LO32 _tmp",("_STK"); " \
276 "popf; " \
277 "pop %"_sav"; "
278
279/* After executing instruction: write-back necessary bits in EFLAGS. */
280#define _POST_EFLAGS(_sav, _msk, _tmp) \
281 /* _sav |= EFLAGS & _msk; */ \
282 "pushf; " \
283 "pop %"_tmp"; " \
284 "andl %"_msk",%"_LO32 _tmp"; " \
285 "orl %"_LO32 _tmp",%"_sav"; "
286
287/* Raw emulation: instruction has two explicit operands. */
288#define __emulate_2op_nobyte(_op,_src,_dst,_eflags,_wx,_wy,_lx,_ly,_qx,_qy) \
289 do { \
290 unsigned long _tmp; \
291 \
292 switch ((_dst).bytes) { \
293 case 2: \
294 __asm__ __volatile__ ( \
295 _PRE_EFLAGS("0", "4", "2") \
296 _op"w %"_wx"3,%1; " \
297 _POST_EFLAGS("0", "4", "2") \
298 : "=m" (_eflags), "=m" ((_dst).val), \
299 "=&r" (_tmp) \
300 : _wy ((_src).val), "i" (EFLAGS_MASK)); \
301 break; \
302 case 4: \
303 __asm__ __volatile__ ( \
304 _PRE_EFLAGS("0", "4", "2") \
305 _op"l %"_lx"3,%1; " \
306 _POST_EFLAGS("0", "4", "2") \
307 : "=m" (_eflags), "=m" ((_dst).val), \
308 "=&r" (_tmp) \
309 : _ly ((_src).val), "i" (EFLAGS_MASK)); \
310 break; \
311 case 8: \
312 __emulate_2op_8byte(_op, _src, _dst, \
313 _eflags, _qx, _qy); \
314 break; \
315 } \
316 } while (0)
317
318#define __emulate_2op(_op,_src,_dst,_eflags,_bx,_by,_wx,_wy,_lx,_ly,_qx,_qy) \
319 do { \
320 unsigned long _tmp; \
321 switch ((_dst).bytes) { \
322 case 1: \
323 __asm__ __volatile__ ( \
324 _PRE_EFLAGS("0", "4", "2") \
325 _op"b %"_bx"3,%1; " \
326 _POST_EFLAGS("0", "4", "2") \
327 : "=m" (_eflags), "=m" ((_dst).val), \
328 "=&r" (_tmp) \
329 : _by ((_src).val), "i" (EFLAGS_MASK)); \
330 break; \
331 default: \
332 __emulate_2op_nobyte(_op, _src, _dst, _eflags, \
333 _wx, _wy, _lx, _ly, _qx, _qy); \
334 break; \
335 } \
336 } while (0)
337
338/* Source operand is byte-sized and may be restricted to just %cl. */
339#define emulate_2op_SrcB(_op, _src, _dst, _eflags) \
340 __emulate_2op(_op, _src, _dst, _eflags, \
341 "b", "c", "b", "c", "b", "c", "b", "c")
342
343/* Source operand is byte, word, long or quad sized. */
344#define emulate_2op_SrcV(_op, _src, _dst, _eflags) \
345 __emulate_2op(_op, _src, _dst, _eflags, \
346 "b", "q", "w", "r", _LO32, "r", "", "r")
347
348/* Source operand is word, long or quad sized. */
349#define emulate_2op_SrcV_nobyte(_op, _src, _dst, _eflags) \
350 __emulate_2op_nobyte(_op, _src, _dst, _eflags, \
351 "w", "r", _LO32, "r", "", "r")
352
353/* Instruction has only one explicit operand (no source operand). */
354#define emulate_1op(_op, _dst, _eflags) \
355 do { \
356 unsigned long _tmp; \
357 \
358 switch ((_dst).bytes) { \
359 case 1: \
360 __asm__ __volatile__ ( \
361 _PRE_EFLAGS("0", "3", "2") \
362 _op"b %1; " \
363 _POST_EFLAGS("0", "3", "2") \
364 : "=m" (_eflags), "=m" ((_dst).val), \
365 "=&r" (_tmp) \
366 : "i" (EFLAGS_MASK)); \
367 break; \
368 case 2: \
369 __asm__ __volatile__ ( \
370 _PRE_EFLAGS("0", "3", "2") \
371 _op"w %1; " \
372 _POST_EFLAGS("0", "3", "2") \
373 : "=m" (_eflags), "=m" ((_dst).val), \
374 "=&r" (_tmp) \
375 : "i" (EFLAGS_MASK)); \
376 break; \
377 case 4: \
378 __asm__ __volatile__ ( \
379 _PRE_EFLAGS("0", "3", "2") \
380 _op"l %1; " \
381 _POST_EFLAGS("0", "3", "2") \
382 : "=m" (_eflags), "=m" ((_dst).val), \
383 "=&r" (_tmp) \
384 : "i" (EFLAGS_MASK)); \
385 break; \
386 case 8: \
387 __emulate_1op_8byte(_op, _dst, _eflags); \
388 break; \
389 } \
390 } while (0)
391
392/* Emulate an instruction with quadword operands (x86/64 only). */
393#if defined(CONFIG_X86_64)
394#define __emulate_2op_8byte(_op, _src, _dst, _eflags, _qx, _qy) \
395 do { \
396 __asm__ __volatile__ ( \
397 _PRE_EFLAGS("0", "4", "2") \
398 _op"q %"_qx"3,%1; " \
399 _POST_EFLAGS("0", "4", "2") \
400 : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \
401 : _qy ((_src).val), "i" (EFLAGS_MASK)); \
402 } while (0)
403
404#define __emulate_1op_8byte(_op, _dst, _eflags) \
405 do { \
406 __asm__ __volatile__ ( \
407 _PRE_EFLAGS("0", "3", "2") \
408 _op"q %1; " \
409 _POST_EFLAGS("0", "3", "2") \
410 : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \
411 : "i" (EFLAGS_MASK)); \
412 } while (0)
413
414#elif defined(__i386__)
415#define __emulate_2op_8byte(_op, _src, _dst, _eflags, _qx, _qy)
416#define __emulate_1op_8byte(_op, _dst, _eflags)
417#endif /* __i386__ */
418
419/* Fetch next part of the instruction being emulated. */
420#define insn_fetch(_type, _size, _eip) \
421({ unsigned long _x; \
422 rc = do_insn_fetch(ctxt, ops, (_eip), &_x, (_size)); \
423 if (rc != 0) \
424 goto done; \
425 (_eip) += (_size); \
426 (_type)_x; \
427})
428
429/* Access/update address held in a register, based on addressing mode. */
430#define address_mask(reg) \
431 ((c->ad_bytes == sizeof(unsigned long)) ? \
432 (reg) : ((reg) & ((1UL << (c->ad_bytes << 3)) - 1)))
433#define register_address(base, reg) \
434 ((base) + address_mask(reg))
435#define register_address_increment(reg, inc) \
436 do { \
437 /* signed type ensures sign extension to long */ \
438 int _inc = (inc); \
439 if (c->ad_bytes == sizeof(unsigned long)) \
440 (reg) += _inc; \
441 else \
442 (reg) = ((reg) & \
443 ~((1UL << (c->ad_bytes << 3)) - 1)) | \
444 (((reg) + _inc) & \
445 ((1UL << (c->ad_bytes << 3)) - 1)); \
446 } while (0)
447
448#define JMP_REL(rel) \
449 do { \
450 register_address_increment(c->eip, rel); \
451 } while (0)
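/*
 * Editor's sketch (not part of the patch): standalone illustration of the
 * address-size masking done by address_mask()/register_address_increment()
 * above.  The demo_* name and sample values are hypothetical; with 2-byte
 * addressing only the low 16 bits of the register are used and they wrap.
 */
#include <stdio.h>

static unsigned long demo_register_increment(unsigned long reg, int inc,
					     unsigned int ad_bytes)
{
	unsigned long mask;

	if (ad_bytes == sizeof(unsigned long))
		return reg + inc;
	mask = (1UL << (ad_bytes << 3)) - 1;
	/* only the low ad_bytes*8 bits change, and they wrap around */
	return (reg & ~mask) | ((reg + inc) & mask);
}

int main(void)
{
	/* 16-bit addressing: 0xffff + 1 wraps to 0x0000, upper bits kept */
	printf("%#lx\n", demo_register_increment(0x1234ffffUL, 1, 2));
	return 0;
}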
452
453static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt,
454 struct x86_emulate_ops *ops,
455 unsigned long linear, u8 *dest)
456{
457 struct fetch_cache *fc = &ctxt->decode.fetch;
458 int rc;
459 int size;
460
461 if (linear < fc->start || linear >= fc->end) {
462 size = min(15UL, PAGE_SIZE - offset_in_page(linear));
463 rc = ops->read_std(linear, fc->data, size, ctxt->vcpu);
464 if (rc)
465 return rc;
466 fc->start = linear;
467 fc->end = linear + size;
468 }
469 *dest = fc->data[linear - fc->start];
470 return 0;
471}
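/*
 * Editor's sketch (not part of the patch): the size clamp used by the fetch
 * cache above, with hypothetical DEMO_* names and values.  At most 15 bytes
 * (the architectural maximum instruction length) are read, and the read is
 * clipped at the page boundary so it never touches the following page.
 */
#include <stdio.h>

#define DEMO_PAGE_SIZE		4096UL
#define demo_offset_in_page(x)	((x) & (DEMO_PAGE_SIZE - 1))

int main(void)
{
	unsigned long linear = 0x7fffUL;	/* last byte of a page */
	unsigned long size = DEMO_PAGE_SIZE - demo_offset_in_page(linear);

	if (size > 15)
		size = 15;
	printf("fetch %lu byte(s) into the cache\n", size);	/* prints 1 */
	return 0;
}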
472
473static int do_insn_fetch(struct x86_emulate_ctxt *ctxt,
474 struct x86_emulate_ops *ops,
475 unsigned long eip, void *dest, unsigned size)
476{
477 int rc = 0;
478
479 eip += ctxt->cs_base;
480 while (size--) {
481 rc = do_fetch_insn_byte(ctxt, ops, eip++, dest++);
482 if (rc)
483 return rc;
484 }
485 return 0;
486}
487
488/*
489 * Given the 'reg' portion of a ModRM byte, and a register block, return a
490 * pointer into the block that addresses the relevant register.
491 * @highbyte_regs specifies whether to decode AH,CH,DH,BH.
492 */
493static void *decode_register(u8 modrm_reg, unsigned long *regs,
494 int highbyte_regs)
495{
496 void *p;
497
498 p = &regs[modrm_reg];
499 if (highbyte_regs && modrm_reg >= 4 && modrm_reg < 8)
500 p = (unsigned char *)&regs[modrm_reg & 3] + 1;
501 return p;
502}
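/*
 * Editor's sketch (not part of the patch): what the high-byte mapping above
 * does.  Without a REX prefix, byte-register numbers 4-7 name AH/CH/DH/BH,
 * i.e. bits 8-15 of RAX/RCX/RDX/RBX.  The demo value below is arbitrary and
 * the sketch assumes a 64-bit little-endian host.
 */
#include <stdio.h>

int main(void)
{
	unsigned long regs[8] = { 0x1122334455667788UL };	/* regs[0] ~ RAX */
	unsigned char *ah;

	/* modrm_reg == 4 with highbyte_regs: byte 1 of regs[4 & 3] */
	ah = (unsigned char *)&regs[4 & 3] + 1;
	printf("AH = %#x\n", *ah);		/* prints 0x77 */
	return 0;
}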
503
504static int read_descriptor(struct x86_emulate_ctxt *ctxt,
505 struct x86_emulate_ops *ops,
506 void *ptr,
507 u16 *size, unsigned long *address, int op_bytes)
508{
509 int rc;
510
511 if (op_bytes == 2)
512 op_bytes = 3;
513 *address = 0;
514 rc = ops->read_std((unsigned long)ptr, (unsigned long *)size, 2,
515 ctxt->vcpu);
516 if (rc)
517 return rc;
518 rc = ops->read_std((unsigned long)ptr + 2, address, op_bytes,
519 ctxt->vcpu);
520 return rc;
521}
522
523static int test_cc(unsigned int condition, unsigned int flags)
524{
525 int rc = 0;
526
527 switch ((condition & 15) >> 1) {
528 case 0: /* o */
529 rc |= (flags & EFLG_OF);
530 break;
531 case 1: /* b/c/nae */
532 rc |= (flags & EFLG_CF);
533 break;
534 case 2: /* z/e */
535 rc |= (flags & EFLG_ZF);
536 break;
537 case 3: /* be/na */
538 rc |= (flags & (EFLG_CF|EFLG_ZF));
539 break;
540 case 4: /* s */
541 rc |= (flags & EFLG_SF);
542 break;
543 case 5: /* p/pe */
544 rc |= (flags & EFLG_PF);
545 break;
546 case 7: /* le/ng */
547 rc |= (flags & EFLG_ZF);
548 /* fall through */
549 case 6: /* l/nge */
550 rc |= (!(flags & EFLG_SF) != !(flags & EFLG_OF));
551 break;
552 }
553
554 /* Odd condition identifiers (lsb == 1) have inverted sense. */
555 return (!!rc ^ (condition & 1));
556}
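/*
 * Editor's note (not part of the patch): test_cc() applied to the standard
 * Jcc encodings, which is how it is called later in this file.  For opcode
 * 0x74 (je) the condition nibble is 0x4: case 2 selects ZF and the even lsb
 * keeps the sense; 0x75 (jne) has an odd lsb and inverts it.
 *
 *	test_cc(0x74, eflags)  ->  true iff ZF is set
 *	test_cc(0x75, eflags)  ->  true iff ZF is clear
 */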
557
558static void decode_register_operand(struct operand *op,
559 struct decode_cache *c,
560 int inhibit_bytereg)
561{
562 unsigned reg = c->modrm_reg;
563 int highbyte_regs = c->rex_prefix == 0;
564
565 if (!(c->d & ModRM))
566 reg = (c->b & 7) | ((c->rex_prefix & 1) << 3);
567 op->type = OP_REG;
568 if ((c->d & ByteOp) && !inhibit_bytereg) {
569 op->ptr = decode_register(reg, c->regs, highbyte_regs);
570 op->val = *(u8 *)op->ptr;
571 op->bytes = 1;
572 } else {
573 op->ptr = decode_register(reg, c->regs, 0);
574 op->bytes = c->op_bytes;
575 switch (op->bytes) {
576 case 2:
577 op->val = *(u16 *)op->ptr;
578 break;
579 case 4:
580 op->val = *(u32 *)op->ptr;
581 break;
582 case 8:
583 op->val = *(u64 *) op->ptr;
584 break;
585 }
586 }
587 op->orig_val = op->val;
588}
589
590static int decode_modrm(struct x86_emulate_ctxt *ctxt,
591 struct x86_emulate_ops *ops)
592{
593 struct decode_cache *c = &ctxt->decode;
594 u8 sib;
595 int index_reg = 0, base_reg = 0, scale, rip_relative = 0;
596 int rc = 0;
597
598 if (c->rex_prefix) {
599 c->modrm_reg = (c->rex_prefix & 4) << 1; /* REX.R */
600 index_reg = (c->rex_prefix & 2) << 2; /* REX.X */
601 c->modrm_rm = base_reg = (c->rex_prefix & 1) << 3; /* REX.B */
602 }
603
604 c->modrm = insn_fetch(u8, 1, c->eip);
605 c->modrm_mod |= (c->modrm & 0xc0) >> 6;
606 c->modrm_reg |= (c->modrm & 0x38) >> 3;
607 c->modrm_rm |= (c->modrm & 0x07);
608 c->modrm_ea = 0;
609 c->use_modrm_ea = 1;
610
611 if (c->modrm_mod == 3) {
612 c->modrm_val = *(unsigned long *)
613 decode_register(c->modrm_rm, c->regs, c->d & ByteOp);
614 return rc;
615 }
616
617 if (c->ad_bytes == 2) {
618 unsigned bx = c->regs[VCPU_REGS_RBX];
619 unsigned bp = c->regs[VCPU_REGS_RBP];
620 unsigned si = c->regs[VCPU_REGS_RSI];
621 unsigned di = c->regs[VCPU_REGS_RDI];
622
623 /* 16-bit ModR/M decode. */
624 switch (c->modrm_mod) {
625 case 0:
626 if (c->modrm_rm == 6)
627 c->modrm_ea += insn_fetch(u16, 2, c->eip);
628 break;
629 case 1:
630 c->modrm_ea += insn_fetch(s8, 1, c->eip);
631 break;
632 case 2:
633 c->modrm_ea += insn_fetch(u16, 2, c->eip);
634 break;
635 }
636 switch (c->modrm_rm) {
637 case 0:
638 c->modrm_ea += bx + si;
639 break;
640 case 1:
641 c->modrm_ea += bx + di;
642 break;
643 case 2:
644 c->modrm_ea += bp + si;
645 break;
646 case 3:
647 c->modrm_ea += bp + di;
648 break;
649 case 4:
650 c->modrm_ea += si;
651 break;
652 case 5:
653 c->modrm_ea += di;
654 break;
655 case 6:
656 if (c->modrm_mod != 0)
657 c->modrm_ea += bp;
658 break;
659 case 7:
660 c->modrm_ea += bx;
661 break;
662 }
663 if (c->modrm_rm == 2 || c->modrm_rm == 3 ||
664 (c->modrm_rm == 6 && c->modrm_mod != 0))
665 if (!c->override_base)
666 c->override_base = &ctxt->ss_base;
667 c->modrm_ea = (u16)c->modrm_ea;
668 } else {
669 /* 32/64-bit ModR/M decode. */
670 switch (c->modrm_rm) {
671 case 4:
672 case 12:
673 sib = insn_fetch(u8, 1, c->eip);
674 index_reg |= (sib >> 3) & 7;
675 base_reg |= sib & 7;
676 scale = sib >> 6;
677
678 switch (base_reg) {
679 case 5:
680 if (c->modrm_mod != 0)
681 c->modrm_ea += c->regs[base_reg];
682 else
683 c->modrm_ea +=
684 insn_fetch(s32, 4, c->eip);
685 break;
686 default:
687 c->modrm_ea += c->regs[base_reg];
688 }
689 switch (index_reg) {
690 case 4:
691 break;
692 default:
693 c->modrm_ea += c->regs[index_reg] << scale;
694 }
695 break;
696 case 5:
697 if (c->modrm_mod != 0)
698 c->modrm_ea += c->regs[c->modrm_rm];
699 else if (ctxt->mode == X86EMUL_MODE_PROT64)
700 rip_relative = 1;
701 break;
702 default:
703 c->modrm_ea += c->regs[c->modrm_rm];
704 break;
705 }
706 switch (c->modrm_mod) {
707 case 0:
708 if (c->modrm_rm == 5)
709 c->modrm_ea += insn_fetch(s32, 4, c->eip);
710 break;
711 case 1:
712 c->modrm_ea += insn_fetch(s8, 1, c->eip);
713 break;
714 case 2:
715 c->modrm_ea += insn_fetch(s32, 4, c->eip);
716 break;
717 }
718 }
719 if (rip_relative) {
720 c->modrm_ea += c->eip;
721 switch (c->d & SrcMask) {
722 case SrcImmByte:
723 c->modrm_ea += 1;
724 break;
725 case SrcImm:
726 if (c->d & ByteOp)
727 c->modrm_ea += 1;
728 else
729 if (c->op_bytes == 8)
730 c->modrm_ea += 4;
731 else
732 c->modrm_ea += c->op_bytes;
733 }
734 }
735done:
736 return rc;
737}
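/*
 * Editor's sketch (not part of the patch): field extraction for one 16-bit
 * ModRM case handled above.  Byte 0x46 splits into mod=1, reg=0, rm=6,
 * i.e. the effective address [bp + disp8], which also makes SS the default
 * segment as in the override_base handling above.
 */
#include <stdio.h>

int main(void)
{
	unsigned char modrm = 0x46;

	printf("mod=%d reg=%d rm=%d\n",
	       (modrm & 0xc0) >> 6, (modrm & 0x38) >> 3, modrm & 0x07);
	return 0;
}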
738
739static int decode_abs(struct x86_emulate_ctxt *ctxt,
740 struct x86_emulate_ops *ops)
741{
742 struct decode_cache *c = &ctxt->decode;
743 int rc = 0;
744
745 switch (c->ad_bytes) {
746 case 2:
747 c->modrm_ea = insn_fetch(u16, 2, c->eip);
748 break;
749 case 4:
750 c->modrm_ea = insn_fetch(u32, 4, c->eip);
751 break;
752 case 8:
753 c->modrm_ea = insn_fetch(u64, 8, c->eip);
754 break;
755 }
756done:
757 return rc;
758}
759
760int
761x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
762{
763 struct decode_cache *c = &ctxt->decode;
764 int rc = 0;
765 int mode = ctxt->mode;
766 int def_op_bytes, def_ad_bytes;
767
768 /* Shadow copy of register state. Committed on successful emulation. */
769
770 memset(c, 0, sizeof(struct decode_cache));
771 c->eip = ctxt->vcpu->arch.rip;
772 memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
773
774 switch (mode) {
775 case X86EMUL_MODE_REAL:
776 case X86EMUL_MODE_PROT16:
777 def_op_bytes = def_ad_bytes = 2;
778 break;
779 case X86EMUL_MODE_PROT32:
780 def_op_bytes = def_ad_bytes = 4;
781 break;
782#ifdef CONFIG_X86_64
783 case X86EMUL_MODE_PROT64:
784 def_op_bytes = 4;
785 def_ad_bytes = 8;
786 break;
787#endif
788 default:
789 return -1;
790 }
791
792 c->op_bytes = def_op_bytes;
793 c->ad_bytes = def_ad_bytes;
794
795 /* Legacy prefixes. */
796 for (;;) {
797 switch (c->b = insn_fetch(u8, 1, c->eip)) {
798 case 0x66: /* operand-size override */
799 /* switch between 2/4 bytes */
800 c->op_bytes = def_op_bytes ^ 6;
801 break;
802 case 0x67: /* address-size override */
803 if (mode == X86EMUL_MODE_PROT64)
804 /* switch between 4/8 bytes */
805 c->ad_bytes = def_ad_bytes ^ 12;
806 else
807 /* switch between 2/4 bytes */
808 c->ad_bytes = def_ad_bytes ^ 6;
809 break;
810 case 0x2e: /* CS override */
811 c->override_base = &ctxt->cs_base;
812 break;
813 case 0x3e: /* DS override */
814 c->override_base = &ctxt->ds_base;
815 break;
816 case 0x26: /* ES override */
817 c->override_base = &ctxt->es_base;
818 break;
819 case 0x64: /* FS override */
820 c->override_base = &ctxt->fs_base;
821 break;
822 case 0x65: /* GS override */
823 c->override_base = &ctxt->gs_base;
824 break;
825 case 0x36: /* SS override */
826 c->override_base = &ctxt->ss_base;
827 break;
828 case 0x40 ... 0x4f: /* REX */
829 if (mode != X86EMUL_MODE_PROT64)
830 goto done_prefixes;
831 c->rex_prefix = c->b;
832 continue;
833 case 0xf0: /* LOCK */
834 c->lock_prefix = 1;
835 break;
836 case 0xf2: /* REPNE/REPNZ */
837 c->rep_prefix = REPNE_PREFIX;
838 break;
839 case 0xf3: /* REP/REPE/REPZ */
840 c->rep_prefix = REPE_PREFIX;
841 break;
842 default:
843 goto done_prefixes;
844 }
845
846 /* Any legacy prefix after a REX prefix nullifies its effect. */
847
848 c->rex_prefix = 0;
849 }
850
851done_prefixes:
852
853 /* REX prefix. */
854 if (c->rex_prefix)
855 if (c->rex_prefix & 8)
856 c->op_bytes = 8; /* REX.W */
857
858 /* Opcode byte(s). */
859 c->d = opcode_table[c->b];
860 if (c->d == 0) {
861 /* Two-byte opcode? */
862 if (c->b == 0x0f) {
863 c->twobyte = 1;
864 c->b = insn_fetch(u8, 1, c->eip);
865 c->d = twobyte_table[c->b];
866 }
867
868 /* Unrecognised? */
869 if (c->d == 0) {
870 DPRINTF("Cannot emulate %02x\n", c->b);
871 return -1;
872 }
873 }
874
875 if (mode == X86EMUL_MODE_PROT64 && (c->d & Stack))
876 c->op_bytes = 8;
877
878 /* ModRM and SIB bytes. */
879 if (c->d & ModRM)
880 rc = decode_modrm(ctxt, ops);
881 else if (c->d & MemAbs)
882 rc = decode_abs(ctxt, ops);
883 if (rc)
884 goto done;
885
886 if (!c->override_base)
887 c->override_base = &ctxt->ds_base;
888 if (mode == X86EMUL_MODE_PROT64 &&
889 c->override_base != &ctxt->fs_base &&
890 c->override_base != &ctxt->gs_base)
891 c->override_base = NULL;
892
893 if (c->override_base)
894 c->modrm_ea += *c->override_base;
895
896 if (c->ad_bytes != 8)
897 c->modrm_ea = (u32)c->modrm_ea;
898 /*
899 * Decode and fetch the source operand: register, memory
900 * or immediate.
901 */
902 switch (c->d & SrcMask) {
903 case SrcNone:
904 break;
905 case SrcReg:
906 decode_register_operand(&c->src, c, 0);
907 break;
908 case SrcMem16:
909 c->src.bytes = 2;
910 goto srcmem_common;
911 case SrcMem32:
912 c->src.bytes = 4;
913 goto srcmem_common;
914 case SrcMem:
915 c->src.bytes = (c->d & ByteOp) ? 1 :
916 c->op_bytes;
917 /* Don't fetch the address for invlpg: it could be unmapped. */
918 if (c->twobyte && c->b == 0x01 && c->modrm_reg == 7)
919 break;
920 srcmem_common:
921 /*
922 * For instructions with a ModR/M byte, switch to register
923 * access if Mod = 3.
924 */
925 if ((c->d & ModRM) && c->modrm_mod == 3) {
926 c->src.type = OP_REG;
927 break;
928 }
929 c->src.type = OP_MEM;
930 break;
931 case SrcImm:
932 c->src.type = OP_IMM;
933 c->src.ptr = (unsigned long *)c->eip;
934 c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
935 if (c->src.bytes == 8)
936 c->src.bytes = 4;
937 /* NB. Immediates are sign-extended as necessary. */
938 switch (c->src.bytes) {
939 case 1:
940 c->src.val = insn_fetch(s8, 1, c->eip);
941 break;
942 case 2:
943 c->src.val = insn_fetch(s16, 2, c->eip);
944 break;
945 case 4:
946 c->src.val = insn_fetch(s32, 4, c->eip);
947 break;
948 }
949 break;
950 case SrcImmByte:
951 c->src.type = OP_IMM;
952 c->src.ptr = (unsigned long *)c->eip;
953 c->src.bytes = 1;
954 c->src.val = insn_fetch(s8, 1, c->eip);
955 break;
956 }
957
958 /* Decode and fetch the destination operand: register or memory. */
959 switch (c->d & DstMask) {
960 case ImplicitOps:
961 /* Special instructions do their own operand decoding. */
962 return 0;
963 case DstReg:
964 decode_register_operand(&c->dst, c,
965 c->twobyte && (c->b == 0xb6 || c->b == 0xb7));
966 break;
967 case DstMem:
968 if ((c->d & ModRM) && c->modrm_mod == 3) {
969 c->dst.type = OP_REG;
970 break;
971 }
972 c->dst.type = OP_MEM;
973 break;
974 }
975
976done:
977 return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
978}
979
980static inline void emulate_push(struct x86_emulate_ctxt *ctxt)
981{
982 struct decode_cache *c = &ctxt->decode;
983
984 c->dst.type = OP_MEM;
985 c->dst.bytes = c->op_bytes;
986 c->dst.val = c->src.val;
987 register_address_increment(c->regs[VCPU_REGS_RSP], -c->op_bytes);
988 c->dst.ptr = (void *) register_address(ctxt->ss_base,
989 c->regs[VCPU_REGS_RSP]);
990}
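/*
 * Editor's sketch (not part of the patch): the net effect of emulate_push()
 * for a hypothetical flat 64-bit stack.  Note that the function above only
 * stages the store in c->dst; the value reaches guest memory later, when
 * writeback() handles the OP_MEM destination.
 */
#include <stdio.h>

int main(void)
{
	unsigned long rsp = 0x8000, ss_base = 0, op_bytes = 8;

	rsp -= op_bytes;		/* register_address_increment() */
	printf("stage a %lu-byte write at %#lx\n", op_bytes, ss_base + rsp);
	return 0;
}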
991
992static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt,
993 struct x86_emulate_ops *ops)
994{
995 struct decode_cache *c = &ctxt->decode;
996 int rc;
997
998 rc = ops->read_std(register_address(ctxt->ss_base,
999 c->regs[VCPU_REGS_RSP]),
1000 &c->dst.val, c->dst.bytes, ctxt->vcpu);
1001 if (rc != 0)
1002 return rc;
1003
1004 register_address_increment(c->regs[VCPU_REGS_RSP], c->dst.bytes);
1005
1006 return 0;
1007}
1008
1009static inline void emulate_grp2(struct x86_emulate_ctxt *ctxt)
1010{
1011 struct decode_cache *c = &ctxt->decode;
1012 switch (c->modrm_reg) {
1013 case 0: /* rol */
1014 emulate_2op_SrcB("rol", c->src, c->dst, ctxt->eflags);
1015 break;
1016 case 1: /* ror */
1017 emulate_2op_SrcB("ror", c->src, c->dst, ctxt->eflags);
1018 break;
1019 case 2: /* rcl */
1020 emulate_2op_SrcB("rcl", c->src, c->dst, ctxt->eflags);
1021 break;
1022 case 3: /* rcr */
1023 emulate_2op_SrcB("rcr", c->src, c->dst, ctxt->eflags);
1024 break;
1025 case 4: /* sal/shl */
1026 case 6: /* sal/shl */
1027 emulate_2op_SrcB("sal", c->src, c->dst, ctxt->eflags);
1028 break;
1029 case 5: /* shr */
1030 emulate_2op_SrcB("shr", c->src, c->dst, ctxt->eflags);
1031 break;
1032 case 7: /* sar */
1033 emulate_2op_SrcB("sar", c->src, c->dst, ctxt->eflags);
1034 break;
1035 }
1036}
1037
1038static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt,
1039 struct x86_emulate_ops *ops)
1040{
1041 struct decode_cache *c = &ctxt->decode;
1042 int rc = 0;
1043
1044 switch (c->modrm_reg) {
1045 case 0 ... 1: /* test */
1046 /*
1047 * Special case in Grp3: test has an immediate
1048 * source operand.
1049 */
1050 c->src.type = OP_IMM;
1051 c->src.ptr = (unsigned long *)c->eip;
1052 c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1053 if (c->src.bytes == 8)
1054 c->src.bytes = 4;
1055 switch (c->src.bytes) {
1056 case 1:
1057 c->src.val = insn_fetch(s8, 1, c->eip);
1058 break;
1059 case 2:
1060 c->src.val = insn_fetch(s16, 2, c->eip);
1061 break;
1062 case 4:
1063 c->src.val = insn_fetch(s32, 4, c->eip);
1064 break;
1065 }
1066 emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags);
1067 break;
1068 case 2: /* not */
1069 c->dst.val = ~c->dst.val;
1070 break;
1071 case 3: /* neg */
1072 emulate_1op("neg", c->dst, ctxt->eflags);
1073 break;
1074 default:
1075 DPRINTF("Cannot emulate %02x\n", c->b);
1076 rc = X86EMUL_UNHANDLEABLE;
1077 break;
1078 }
1079done:
1080 return rc;
1081}
1082
1083static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt,
1084 struct x86_emulate_ops *ops)
1085{
1086 struct decode_cache *c = &ctxt->decode;
1087 int rc;
1088
1089 switch (c->modrm_reg) {
1090 case 0: /* inc */
1091 emulate_1op("inc", c->dst, ctxt->eflags);
1092 break;
1093 case 1: /* dec */
1094 emulate_1op("dec", c->dst, ctxt->eflags);
1095 break;
1096 case 4: /* jmp abs */
1097 if (c->b == 0xff)
1098 c->eip = c->dst.val;
1099 else {
1100 DPRINTF("Cannot emulate %02x\n", c->b);
1101 return X86EMUL_UNHANDLEABLE;
1102 }
1103 break;
1104 case 6: /* push */
1105
1106 /* 64-bit mode: PUSH always pushes a 64-bit operand. */
1107
1108 if (ctxt->mode == X86EMUL_MODE_PROT64) {
1109 c->dst.bytes = 8;
1110 rc = ops->read_std((unsigned long)c->dst.ptr,
1111 &c->dst.val, 8, ctxt->vcpu);
1112 if (rc != 0)
1113 return rc;
1114 }
1115 register_address_increment(c->regs[VCPU_REGS_RSP],
1116 -c->dst.bytes);
1117 rc = ops->write_emulated(register_address(ctxt->ss_base,
1118 c->regs[VCPU_REGS_RSP]), &c->dst.val,
1119 c->dst.bytes, ctxt->vcpu);
1120 if (rc != 0)
1121 return rc;
1122 c->dst.type = OP_NONE;
1123 break;
1124 default:
1125 DPRINTF("Cannot emulate %02x\n", c->b);
1126 return X86EMUL_UNHANDLEABLE;
1127 }
1128 return 0;
1129}
1130
1131static inline int emulate_grp9(struct x86_emulate_ctxt *ctxt,
1132 struct x86_emulate_ops *ops,
1133 unsigned long memop)
1134{
1135 struct decode_cache *c = &ctxt->decode;
1136 u64 old, new;
1137 int rc;
1138
1139 rc = ops->read_emulated(memop, &old, 8, ctxt->vcpu);
1140 if (rc != 0)
1141 return rc;
1142
1143 if (((u32) (old >> 0) != (u32) c->regs[VCPU_REGS_RAX]) ||
1144 ((u32) (old >> 32) != (u32) c->regs[VCPU_REGS_RDX])) {
1145
1146 c->regs[VCPU_REGS_RAX] = (u32) (old >> 0);
1147 c->regs[VCPU_REGS_RDX] = (u32) (old >> 32);
1148 ctxt->eflags &= ~EFLG_ZF;
1149
1150 } else {
1151 new = ((u64)c->regs[VCPU_REGS_RCX] << 32) |
1152 (u32) c->regs[VCPU_REGS_RBX];
1153
1154 rc = ops->cmpxchg_emulated(memop, &old, &new, 8, ctxt->vcpu);
1155 if (rc != 0)
1156 return rc;
1157 ctxt->eflags |= EFLG_ZF;
1158 }
1159 return 0;
1160}
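/*
 * Editor's sketch (not part of the patch): the cmpxchg8b semantics that
 * emulate_grp9() implements, as a standalone demo with arbitrary values.
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t mem = 0x1111111122222222ULL;			/* guest memory */
	uint32_t eax = 0x22222222, edx = 0x11111111;		/* expected */
	uint32_t ebx = 0x44444444, ecx = 0x33333333;		/* replacement */
	int zf;

	if (mem == (((uint64_t)edx << 32) | eax)) {
		mem = ((uint64_t)ecx << 32) | ebx;	/* store ECX:EBX */
		zf = 1;
	} else {
		eax = (uint32_t)mem;			/* report old value */
		edx = (uint32_t)(mem >> 32);
		zf = 0;
	}
	printf("mem=%#llx zf=%d\n", (unsigned long long)mem, zf);
	return 0;
}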
1161
1162static inline int writeback(struct x86_emulate_ctxt *ctxt,
1163 struct x86_emulate_ops *ops)
1164{
1165 int rc;
1166 struct decode_cache *c = &ctxt->decode;
1167
1168 switch (c->dst.type) {
1169 case OP_REG:
1170 /* The 4-byte case *is* correct:
1171 * in 64-bit mode we zero-extend.
1172 */
1173 switch (c->dst.bytes) {
1174 case 1:
1175 *(u8 *)c->dst.ptr = (u8)c->dst.val;
1176 break;
1177 case 2:
1178 *(u16 *)c->dst.ptr = (u16)c->dst.val;
1179 break;
1180 case 4:
1181 *c->dst.ptr = (u32)c->dst.val;
1182 break; /* 64b: zero-ext */
1183 case 8:
1184 *c->dst.ptr = c->dst.val;
1185 break;
1186 }
1187 break;
1188 case OP_MEM:
1189 if (c->lock_prefix)
1190 rc = ops->cmpxchg_emulated(
1191 (unsigned long)c->dst.ptr,
1192 &c->dst.orig_val,
1193 &c->dst.val,
1194 c->dst.bytes,
1195 ctxt->vcpu);
1196 else
1197 rc = ops->write_emulated(
1198 (unsigned long)c->dst.ptr,
1199 &c->dst.val,
1200 c->dst.bytes,
1201 ctxt->vcpu);
1202 if (rc != 0)
1203 return rc;
1204 break;
1205 case OP_NONE:
1206 /* no writeback */
1207 break;
1208 default:
1209 break;
1210 }
1211 return 0;
1212}
1213
1214int
1215x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1216{
1217 unsigned long memop = 0;
1218 u64 msr_data;
1219 unsigned long saved_eip = 0;
1220 struct decode_cache *c = &ctxt->decode;
1221 int rc = 0;
1222
1223 /* Shadow copy of register state. Committed on successful emulation.
1224 * NOTE: it is safe to copy them from the vcpu here because
1225 * x86_decode_insn() does not modify them.
1226 */
1227
1228 memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
1229 saved_eip = c->eip;
1230
1231 if (((c->d & ModRM) && (c->modrm_mod != 3)) || (c->d & MemAbs))
1232 memop = c->modrm_ea;
1233
1234 if (c->rep_prefix && (c->d & String)) {
1235 /* All REP prefixes have the same first termination condition */
1236 if (c->regs[VCPU_REGS_RCX] == 0) {
1237 ctxt->vcpu->arch.rip = c->eip;
1238 goto done;
1239 }
1240 /* The second termination condition applies only to REPE
1241 * and REPNE. If the repeat prefix is REPE/REPZ or
1242 * REPNE/REPNZ, test the corresponding termination
1243 * condition:
1244 * - if REPE/REPZ and ZF = 0 then done
1245 * - if REPNE/REPNZ and ZF = 1 then done
1246 */
1247 if ((c->b == 0xa6) || (c->b == 0xa7) ||
1248 (c->b == 0xae) || (c->b == 0xaf)) {
1249 if ((c->rep_prefix == REPE_PREFIX) &&
1250 ((ctxt->eflags & EFLG_ZF) == 0)) {
1251 ctxt->vcpu->arch.rip = c->eip;
1252 goto done;
1253 }
1254 if ((c->rep_prefix == REPNE_PREFIX) &&
1255 ((ctxt->eflags & EFLG_ZF) == EFLG_ZF)) {
1256 ctxt->vcpu->arch.rip = c->eip;
1257 goto done;
1258 }
1259 }
1260 c->regs[VCPU_REGS_RCX]--;
1261 c->eip = ctxt->vcpu->arch.rip;
1262 }
1263
1264 if (c->src.type == OP_MEM) {
1265 c->src.ptr = (unsigned long *)memop;
1266 c->src.val = 0;
1267 rc = ops->read_emulated((unsigned long)c->src.ptr,
1268 &c->src.val,
1269 c->src.bytes,
1270 ctxt->vcpu);
1271 if (rc != 0)
1272 goto done;
1273 c->src.orig_val = c->src.val;
1274 }
1275
1276 if ((c->d & DstMask) == ImplicitOps)
1277 goto special_insn;
1278
1279
1280 if (c->dst.type == OP_MEM) {
1281 c->dst.ptr = (unsigned long *)memop;
1282 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1283 c->dst.val = 0;
1284 if (c->d & BitOp) {
1285 unsigned long mask = ~(c->dst.bytes * 8 - 1);
1286
1287 c->dst.ptr = (void *)c->dst.ptr +
1288 (c->src.val & mask) / 8;
1289 }
1290 if (!(c->d & Mov) &&
1291 /* optimisation - avoid slow emulated read */
1292 ((rc = ops->read_emulated((unsigned long)c->dst.ptr,
1293 &c->dst.val,
1294 c->dst.bytes, ctxt->vcpu)) != 0))
1295 goto done;
1296 }
1297 c->dst.orig_val = c->dst.val;
1298
1299special_insn:
1300
1301 if (c->twobyte)
1302 goto twobyte_insn;
1303
1304 switch (c->b) {
1305 case 0x00 ... 0x05:
1306 add: /* add */
1307 emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags);
1308 break;
1309 case 0x08 ... 0x0d:
1310 or: /* or */
1311 emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags);
1312 break;
1313 case 0x10 ... 0x15:
1314 adc: /* adc */
1315 emulate_2op_SrcV("adc", c->src, c->dst, ctxt->eflags);
1316 break;
1317 case 0x18 ... 0x1d:
1318 sbb: /* sbb */
1319 emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags);
1320 break;
1321 case 0x20 ... 0x23:
1322 and: /* and */
1323 emulate_2op_SrcV("and", c->src, c->dst, ctxt->eflags);
1324 break;
1325 case 0x24: /* and al imm8 */
1326 c->dst.type = OP_REG;
1327 c->dst.ptr = &c->regs[VCPU_REGS_RAX];
1328 c->dst.val = *(u8 *)c->dst.ptr;
1329 c->dst.bytes = 1;
1330 c->dst.orig_val = c->dst.val;
1331 goto and;
1332 case 0x25: /* and ax imm16, or eax imm32 */
1333 c->dst.type = OP_REG;
1334 c->dst.bytes = c->op_bytes;
1335 c->dst.ptr = &c->regs[VCPU_REGS_RAX];
1336 if (c->op_bytes == 2)
1337 c->dst.val = *(u16 *)c->dst.ptr;
1338 else
1339 c->dst.val = *(u32 *)c->dst.ptr;
1340 c->dst.orig_val = c->dst.val;
1341 goto and;
1342 case 0x28 ... 0x2d:
1343 sub: /* sub */
1344 emulate_2op_SrcV("sub", c->src, c->dst, ctxt->eflags);
1345 break;
1346 case 0x30 ... 0x35:
1347 xor: /* xor */
1348 emulate_2op_SrcV("xor", c->src, c->dst, ctxt->eflags);
1349 break;
1350 case 0x38 ... 0x3d:
1351 cmp: /* cmp */
1352 emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags);
1353 break;
1354 case 0x40 ... 0x47: /* inc r16/r32 */
1355 emulate_1op("inc", c->dst, ctxt->eflags);
1356 break;
1357 case 0x48 ... 0x4f: /* dec r16/r32 */
1358 emulate_1op("dec", c->dst, ctxt->eflags);
1359 break;
1360 case 0x50 ... 0x57: /* push reg */
1361 c->dst.type = OP_MEM;
1362 c->dst.bytes = c->op_bytes;
1363 c->dst.val = c->src.val;
1364 register_address_increment(c->regs[VCPU_REGS_RSP],
1365 -c->op_bytes);
1366 c->dst.ptr = (void *) register_address(
1367 ctxt->ss_base, c->regs[VCPU_REGS_RSP]);
1368 break;
1369 case 0x58 ... 0x5f: /* pop reg */
1370 pop_instruction:
1371 if ((rc = ops->read_std(register_address(ctxt->ss_base,
1372 c->regs[VCPU_REGS_RSP]), c->dst.ptr,
1373 c->op_bytes, ctxt->vcpu)) != 0)
1374 goto done;
1375
1376 register_address_increment(c->regs[VCPU_REGS_RSP],
1377 c->op_bytes);
1378 c->dst.type = OP_NONE; /* Disable writeback. */
1379 break;
1380 case 0x63: /* movsxd */
1381 if (ctxt->mode != X86EMUL_MODE_PROT64)
1382 goto cannot_emulate;
1383 c->dst.val = (s32) c->src.val;
1384 break;
1385 case 0x6a: /* push imm8 */
1386 c->src.val = 0L;
1387 c->src.val = insn_fetch(s8, 1, c->eip);
1388 emulate_push(ctxt);
1389 break;
1390 case 0x6c: /* insb */
1391 case 0x6d: /* insw/insd */
1392 if (kvm_emulate_pio_string(ctxt->vcpu, NULL,
1393 1,
1394 (c->d & ByteOp) ? 1 : c->op_bytes,
1395 c->rep_prefix ?
1396 address_mask(c->regs[VCPU_REGS_RCX]) : 1,
1397 (ctxt->eflags & EFLG_DF),
1398 register_address(ctxt->es_base,
1399 c->regs[VCPU_REGS_RDI]),
1400 c->rep_prefix,
1401 c->regs[VCPU_REGS_RDX]) == 0) {
1402 c->eip = saved_eip;
1403 return -1;
1404 }
1405 return 0;
1406 case 0x6e: /* outsb */
1407 case 0x6f: /* outsw/outsd */
1408 if (kvm_emulate_pio_string(ctxt->vcpu, NULL,
1409 0,
1410 (c->d & ByteOp) ? 1 : c->op_bytes,
1411 c->rep_prefix ?
1412 address_mask(c->regs[VCPU_REGS_RCX]) : 1,
1413 (ctxt->eflags & EFLG_DF),
1414 register_address(c->override_base ?
1415 *c->override_base :
1416 ctxt->ds_base,
1417 c->regs[VCPU_REGS_RSI]),
1418 c->rep_prefix,
1419 c->regs[VCPU_REGS_RDX]) == 0) {
1420 c->eip = saved_eip;
1421 return -1;
1422 }
1423 return 0;
1424 case 0x70 ... 0x7f: /* jcc (short) */ {
1425 int rel = insn_fetch(s8, 1, c->eip);
1426
1427 if (test_cc(c->b, ctxt->eflags))
1428 JMP_REL(rel);
1429 break;
1430 }
1431 case 0x80 ... 0x83: /* Grp1 */
1432 switch (c->modrm_reg) {
1433 case 0:
1434 goto add;
1435 case 1:
1436 goto or;
1437 case 2:
1438 goto adc;
1439 case 3:
1440 goto sbb;
1441 case 4:
1442 goto and;
1443 case 5:
1444 goto sub;
1445 case 6:
1446 goto xor;
1447 case 7:
1448 goto cmp;
1449 }
1450 break;
1451 case 0x84 ... 0x85:
1452 emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags);
1453 break;
1454 case 0x86 ... 0x87: /* xchg */
1455 /* Write back the register source. */
1456 switch (c->dst.bytes) {
1457 case 1:
1458 *(u8 *) c->src.ptr = (u8) c->dst.val;
1459 break;
1460 case 2:
1461 *(u16 *) c->src.ptr = (u16) c->dst.val;
1462 break;
1463 case 4:
1464 *c->src.ptr = (u32) c->dst.val;
1465 break; /* 64b reg: zero-extend */
1466 case 8:
1467 *c->src.ptr = c->dst.val;
1468 break;
1469 }
1470 /*
1471 * Write back the memory destination with implicit LOCK
1472 * prefix.
1473 */
1474 c->dst.val = c->src.val;
1475 c->lock_prefix = 1;
1476 break;
1477 case 0x88 ... 0x8b: /* mov */
1478 goto mov;
1479 case 0x8d: /* lea r16/r32, m */
1480 c->dst.val = c->modrm_val;
1481 break;
1482 case 0x8f: /* pop (sole member of Grp1a) */
1483 rc = emulate_grp1a(ctxt, ops);
1484 if (rc != 0)
1485 goto done;
1486 break;
1487 case 0x9c: /* pushf */
1488 c->src.val = (unsigned long) ctxt->eflags;
1489 emulate_push(ctxt);
1490 break;
1491 case 0x9d: /* popf */
1492 c->dst.ptr = (unsigned long *) &ctxt->eflags;
1493 goto pop_instruction;
1494 case 0xa0 ... 0xa1: /* mov */
1495 c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX];
1496 c->dst.val = c->src.val;
1497 break;
1498 case 0xa2 ... 0xa3: /* mov */
1499 c->dst.val = (unsigned long)c->regs[VCPU_REGS_RAX];
1500 break;
1501 case 0xa4 ... 0xa5: /* movs */
1502 c->dst.type = OP_MEM;
1503 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1504 c->dst.ptr = (unsigned long *)register_address(
1505 ctxt->es_base,
1506 c->regs[VCPU_REGS_RDI]);
1507 if ((rc = ops->read_emulated(register_address(
1508 c->override_base ? *c->override_base :
1509 ctxt->ds_base,
1510 c->regs[VCPU_REGS_RSI]),
1511 &c->dst.val,
1512 c->dst.bytes, ctxt->vcpu)) != 0)
1513 goto done;
1514 register_address_increment(c->regs[VCPU_REGS_RSI],
1515 (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
1516 : c->dst.bytes);
1517 register_address_increment(c->regs[VCPU_REGS_RDI],
1518 (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
1519 : c->dst.bytes);
1520 break;
1521 case 0xa6 ... 0xa7: /* cmps */
1522 c->src.type = OP_NONE; /* Disable writeback. */
1523 c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1524 c->src.ptr = (unsigned long *)register_address(
1525 c->override_base ? *c->override_base :
1526 ctxt->ds_base,
1527 c->regs[VCPU_REGS_RSI]);
1528 if ((rc = ops->read_emulated((unsigned long)c->src.ptr,
1529 &c->src.val,
1530 c->src.bytes,
1531 ctxt->vcpu)) != 0)
1532 goto done;
1533
1534 c->dst.type = OP_NONE; /* Disable writeback. */
1535 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1536 c->dst.ptr = (unsigned long *)register_address(
1537 ctxt->es_base,
1538 c->regs[VCPU_REGS_RDI]);
1539 if ((rc = ops->read_emulated((unsigned long)c->dst.ptr,
1540 &c->dst.val,
1541 c->dst.bytes,
1542 ctxt->vcpu)) != 0)
1543 goto done;
1544
1545 DPRINTF("cmps: mem1=0x%p mem2=0x%p\n", c->src.ptr, c->dst.ptr);
1546
1547 emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags);
1548
1549 register_address_increment(c->regs[VCPU_REGS_RSI],
1550 (ctxt->eflags & EFLG_DF) ? -c->src.bytes
1551 : c->src.bytes);
1552 register_address_increment(c->regs[VCPU_REGS_RDI],
1553 (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
1554 : c->dst.bytes);
1555
1556 break;
1557 case 0xaa ... 0xab: /* stos */
1558 c->dst.type = OP_MEM;
1559 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1560 c->dst.ptr = (unsigned long *)register_address(
1561 ctxt->es_base,
1562 c->regs[VCPU_REGS_RDI]);
1563 c->dst.val = c->regs[VCPU_REGS_RAX];
1564 register_address_increment(c->regs[VCPU_REGS_RDI],
1565 (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
1566 : c->dst.bytes);
1567 break;
1568 case 0xac ... 0xad: /* lods */
1569 c->dst.type = OP_REG;
1570 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1571 c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX];
1572 if ((rc = ops->read_emulated(register_address(
1573 c->override_base ? *c->override_base :
1574 ctxt->ds_base,
1575 c->regs[VCPU_REGS_RSI]),
1576 &c->dst.val,
1577 c->dst.bytes,
1578 ctxt->vcpu)) != 0)
1579 goto done;
1580 register_address_increment(c->regs[VCPU_REGS_RSI],
1581 (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
1582 : c->dst.bytes);
1583 break;
1584 case 0xae ... 0xaf: /* scas */
1585 DPRINTF("Urk! I don't handle SCAS.\n");
1586 goto cannot_emulate;
1587 case 0xc0 ... 0xc1:
1588 emulate_grp2(ctxt);
1589 break;
1590 case 0xc3: /* ret */
1591 c->dst.ptr = &c->eip;
1592 goto pop_instruction;
1593 case 0xc6 ... 0xc7: /* mov (sole member of Grp11) */
1594 mov:
1595 c->dst.val = c->src.val;
1596 break;
1597 case 0xd0 ... 0xd1: /* Grp2 */
1598 c->src.val = 1;
1599 emulate_grp2(ctxt);
1600 break;
1601 case 0xd2 ... 0xd3: /* Grp2 */
1602 c->src.val = c->regs[VCPU_REGS_RCX];
1603 emulate_grp2(ctxt);
1604 break;
1605 case 0xe8: /* call (near) */ {
1606 long int rel;
1607 switch (c->op_bytes) {
1608 case 2:
1609 rel = insn_fetch(s16, 2, c->eip);
1610 break;
1611 case 4:
1612 rel = insn_fetch(s32, 4, c->eip);
1613 break;
1614 default:
1615 DPRINTF("Call: Invalid op_bytes\n");
1616 goto cannot_emulate;
1617 }
1618 c->src.val = (unsigned long) c->eip;
1619 JMP_REL(rel);
1620 c->op_bytes = c->ad_bytes;
1621 emulate_push(ctxt);
1622 break;
1623 }
1624 case 0xe9: /* jmp rel */
1625 case 0xeb: /* jmp rel short */
1626 JMP_REL(c->src.val);
1627 c->dst.type = OP_NONE; /* Disable writeback. */
1628 break;
1629 case 0xf4: /* hlt */
1630 ctxt->vcpu->arch.halt_request = 1;
1631 goto done;
1632 case 0xf5: /* cmc */
1633 /* complement carry flag from eflags reg */
1634 ctxt->eflags ^= EFLG_CF;
1635 c->dst.type = OP_NONE; /* Disable writeback. */
1636 break;
1637 case 0xf6 ... 0xf7: /* Grp3 */
1638 rc = emulate_grp3(ctxt, ops);
1639 if (rc != 0)
1640 goto done;
1641 break;
1642 case 0xf8: /* clc */
1643 ctxt->eflags &= ~EFLG_CF;
1644 c->dst.type = OP_NONE; /* Disable writeback. */
1645 break;
1646 case 0xfa: /* cli */
1647 ctxt->eflags &= ~X86_EFLAGS_IF;
1648 c->dst.type = OP_NONE; /* Disable writeback. */
1649 break;
1650 case 0xfb: /* sti */
1651 ctxt->eflags |= X86_EFLAGS_IF;
1652 c->dst.type = OP_NONE; /* Disable writeback. */
1653 break;
1654 case 0xfe ... 0xff: /* Grp4/Grp5 */
1655 rc = emulate_grp45(ctxt, ops);
1656 if (rc != 0)
1657 goto done;
1658 break;
1659 }
1660
1661writeback:
1662 rc = writeback(ctxt, ops);
1663 if (rc != 0)
1664 goto done;
1665
1666 /* Commit shadow register state. */
1667 memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs);
1668 ctxt->vcpu->arch.rip = c->eip;
1669
1670done:
1671 if (rc == X86EMUL_UNHANDLEABLE) {
1672 c->eip = saved_eip;
1673 return -1;
1674 }
1675 return 0;
1676
1677twobyte_insn:
1678 switch (c->b) {
1679 case 0x01: /* lgdt, lidt, lmsw */
1680 switch (c->modrm_reg) {
1681 u16 size;
1682 unsigned long address;
1683
1684 case 0: /* vmcall */
1685 if (c->modrm_mod != 3 || c->modrm_rm != 1)
1686 goto cannot_emulate;
1687
1688 rc = kvm_fix_hypercall(ctxt->vcpu);
1689 if (rc)
1690 goto done;
1691
1692 kvm_emulate_hypercall(ctxt->vcpu);
1693 break;
1694 case 2: /* lgdt */
1695 rc = read_descriptor(ctxt, ops, c->src.ptr,
1696 &size, &address, c->op_bytes);
1697 if (rc)
1698 goto done;
1699 realmode_lgdt(ctxt->vcpu, size, address);
1700 break;
1701 case 3: /* lidt/vmmcall */
1702 if (c->modrm_mod == 3 && c->modrm_rm == 1) {
1703 rc = kvm_fix_hypercall(ctxt->vcpu);
1704 if (rc)
1705 goto done;
1706 kvm_emulate_hypercall(ctxt->vcpu);
1707 } else {
1708 rc = read_descriptor(ctxt, ops, c->src.ptr,
1709 &size, &address,
1710 c->op_bytes);
1711 if (rc)
1712 goto done;
1713 realmode_lidt(ctxt->vcpu, size, address);
1714 }
1715 break;
1716 case 4: /* smsw */
1717 if (c->modrm_mod != 3)
1718 goto cannot_emulate;
1719 *(u16 *)&c->regs[c->modrm_rm]
1720 = realmode_get_cr(ctxt->vcpu, 0);
1721 break;
1722 case 6: /* lmsw */
1723 if (c->modrm_mod != 3)
1724 goto cannot_emulate;
1725 realmode_lmsw(ctxt->vcpu, (u16)c->modrm_val,
1726 &ctxt->eflags);
1727 break;
1728 case 7: /* invlpg*/
1729 emulate_invlpg(ctxt->vcpu, memop);
1730 break;
1731 default:
1732 goto cannot_emulate;
1733 }
1734 /* Disable writeback. */
1735 c->dst.type = OP_NONE;
1736 break;
1737 case 0x06:
1738 emulate_clts(ctxt->vcpu);
1739 c->dst.type = OP_NONE;
1740 break;
1741 case 0x08: /* invd */
1742 case 0x09: /* wbinvd */
1743 case 0x0d: /* GrpP (prefetch) */
1744 case 0x18: /* Grp16 (prefetch/nop) */
1745 c->dst.type = OP_NONE;
1746 break;
1747 case 0x20: /* mov cr, reg */
1748 if (c->modrm_mod != 3)
1749 goto cannot_emulate;
1750 c->regs[c->modrm_rm] =
1751 realmode_get_cr(ctxt->vcpu, c->modrm_reg);
1752 c->dst.type = OP_NONE; /* no writeback */
1753 break;
1754 case 0x21: /* mov from dr to reg */
1755 if (c->modrm_mod != 3)
1756 goto cannot_emulate;
1757 rc = emulator_get_dr(ctxt, c->modrm_reg, &c->regs[c->modrm_rm]);
1758 if (rc)
1759 goto cannot_emulate;
1760 c->dst.type = OP_NONE; /* no writeback */
1761 break;
1762 case 0x22: /* mov reg, cr */
1763 if (c->modrm_mod != 3)
1764 goto cannot_emulate;
1765 realmode_set_cr(ctxt->vcpu,
1766 c->modrm_reg, c->modrm_val, &ctxt->eflags);
1767 c->dst.type = OP_NONE;
1768 break;
1769 case 0x23: /* mov from reg to dr */
1770 if (c->modrm_mod != 3)
1771 goto cannot_emulate;
1772 rc = emulator_set_dr(ctxt, c->modrm_reg,
1773 c->regs[c->modrm_rm]);
1774 if (rc)
1775 goto cannot_emulate;
1776 c->dst.type = OP_NONE; /* no writeback */
1777 break;
1778 case 0x30:
1779 /* wrmsr */
1780 msr_data = (u32)c->regs[VCPU_REGS_RAX]
1781 | ((u64)c->regs[VCPU_REGS_RDX] << 32);
1782 rc = kvm_set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data);
1783 if (rc) {
1784 kvm_inject_gp(ctxt->vcpu, 0);
1785 c->eip = ctxt->vcpu->arch.rip;
1786 }
1787 rc = X86EMUL_CONTINUE;
1788 c->dst.type = OP_NONE;
1789 break;
1790 case 0x32:
1791 /* rdmsr */
1792 rc = kvm_get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data);
1793 if (rc) {
1794 kvm_inject_gp(ctxt->vcpu, 0);
1795 c->eip = ctxt->vcpu->arch.rip;
1796 } else {
1797 c->regs[VCPU_REGS_RAX] = (u32)msr_data;
1798 c->regs[VCPU_REGS_RDX] = msr_data >> 32;
1799 }
1800 rc = X86EMUL_CONTINUE;
1801 c->dst.type = OP_NONE;
1802 break;
1803 case 0x40 ... 0x4f: /* cmov */
1804 c->dst.val = c->dst.orig_val = c->src.val;
1805 if (!test_cc(c->b, ctxt->eflags))
1806 c->dst.type = OP_NONE; /* no writeback */
1807 break;
1808 case 0x80 ... 0x8f: /* jnz rel, etc*/ {
1809 long int rel;
1810
1811 switch (c->op_bytes) {
1812 case 2:
1813 rel = insn_fetch(s16, 2, c->eip);
1814 break;
1815 case 4:
1816 rel = insn_fetch(s32, 4, c->eip);
1817 break;
1818 case 8:
1819 rel = insn_fetch(s64, 8, c->eip);
1820 break;
1821 default:
1822 DPRINTF("jnz: Invalid op_bytes\n");
1823 goto cannot_emulate;
1824 }
1825 if (test_cc(c->b, ctxt->eflags))
1826 JMP_REL(rel);
1827 c->dst.type = OP_NONE;
1828 break;
1829 }
1830 case 0xa3:
1831 bt: /* bt */
1832 c->dst.type = OP_NONE;
1833 /* only subword offset */
1834 c->src.val &= (c->dst.bytes << 3) - 1;
1835 emulate_2op_SrcV_nobyte("bt", c->src, c->dst, ctxt->eflags);
1836 break;
1837 case 0xab:
1838 bts: /* bts */
1839 /* only subword offset */
1840 c->src.val &= (c->dst.bytes << 3) - 1;
1841 emulate_2op_SrcV_nobyte("bts", c->src, c->dst, ctxt->eflags);
1842 break;
1843 case 0xb0 ... 0xb1: /* cmpxchg */
1844 /*
1845 * Save real source value, then compare EAX against
1846 * destination.
1847 */
1848 c->src.orig_val = c->src.val;
1849 c->src.val = c->regs[VCPU_REGS_RAX];
1850 emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags);
1851 if (ctxt->eflags & EFLG_ZF) {
1852 /* Success: write back to memory. */
1853 c->dst.val = c->src.orig_val;
1854 } else {
1855 /* Failure: write the value we saw to EAX. */
1856 c->dst.type = OP_REG;
1857 c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX];
1858 }
1859 break;
1860 case 0xb3:
1861 btr: /* btr */
1862 /* only subword offset */
1863 c->src.val &= (c->dst.bytes << 3) - 1;
1864 emulate_2op_SrcV_nobyte("btr", c->src, c->dst, ctxt->eflags);
1865 break;
1866 case 0xb6 ... 0xb7: /* movzx */
1867 c->dst.bytes = c->op_bytes;
1868 c->dst.val = (c->d & ByteOp) ? (u8) c->src.val
1869 : (u16) c->src.val;
1870 break;
1871 case 0xba: /* Grp8 */
1872 switch (c->modrm_reg & 3) {
1873 case 0:
1874 goto bt;
1875 case 1:
1876 goto bts;
1877 case 2:
1878 goto btr;
1879 case 3:
1880 goto btc;
1881 }
1882 break;
1883 case 0xbb:
1884 btc: /* btc */
1885 /* only subword offset */
1886 c->src.val &= (c->dst.bytes << 3) - 1;
1887 emulate_2op_SrcV_nobyte("btc", c->src, c->dst, ctxt->eflags);
1888 break;
1889 case 0xbe ... 0xbf: /* movsx */
1890 c->dst.bytes = c->op_bytes;
1891 c->dst.val = (c->d & ByteOp) ? (s8) c->src.val :
1892 (s16) c->src.val;
1893 break;
1894 case 0xc3: /* movnti */
1895 c->dst.bytes = c->op_bytes;
1896 c->dst.val = (c->op_bytes == 4) ? (u32) c->src.val :
1897 (u64) c->src.val;
1898 break;
1899 case 0xc7: /* Grp9 (cmpxchg8b) */
1900 rc = emulate_grp9(ctxt, ops, memop);
1901 if (rc != 0)
1902 goto done;
1903 c->dst.type = OP_NONE;
1904 break;
1905 }
1906 goto writeback;
1907
1908cannot_emulate:
1909 DPRINTF("Cannot emulate %02x\n", c->b);
1910 c->eip = saved_eip;
1911 return -1;
1912}
diff --git a/drivers/Kconfig b/drivers/Kconfig
index f4076d9e9902..08d4ae201597 100644
--- a/drivers/Kconfig
+++ b/drivers/Kconfig
@@ -90,8 +90,6 @@ source "drivers/dca/Kconfig"
90 90
91source "drivers/auxdisplay/Kconfig" 91source "drivers/auxdisplay/Kconfig"
92 92
93source "drivers/kvm/Kconfig"
94
95source "drivers/uio/Kconfig" 93source "drivers/uio/Kconfig"
96 94
97source "drivers/virtio/Kconfig" 95source "drivers/virtio/Kconfig"
diff --git a/drivers/Makefile b/drivers/Makefile
index d92d4d82d001..9e1f808e43cf 100644
--- a/drivers/Makefile
+++ b/drivers/Makefile
@@ -47,7 +47,6 @@ obj-$(CONFIG_SPI) += spi/
47obj-$(CONFIG_PCCARD) += pcmcia/ 47obj-$(CONFIG_PCCARD) += pcmcia/
48obj-$(CONFIG_DIO) += dio/ 48obj-$(CONFIG_DIO) += dio/
49obj-$(CONFIG_SBUS) += sbus/ 49obj-$(CONFIG_SBUS) += sbus/
50obj-$(CONFIG_KVM) += kvm/
51obj-$(CONFIG_ZORRO) += zorro/ 50obj-$(CONFIG_ZORRO) += zorro/
52obj-$(CONFIG_MAC) += macintosh/ 51obj-$(CONFIG_MAC) += macintosh/
53obj-$(CONFIG_ATA_OVER_ETH) += block/aoe/ 52obj-$(CONFIG_ATA_OVER_ETH) += block/aoe/
diff --git a/drivers/kvm/irq.h b/drivers/kvm/irq.h
deleted file mode 100644
index 11fc014e2b30..000000000000
--- a/drivers/kvm/irq.h
+++ /dev/null
@@ -1,165 +0,0 @@
1/*
2 * irq.h: in kernel interrupt controller related definitions
3 * Copyright (c) 2007, Intel Corporation.
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms and conditions of the GNU General Public License,
7 * version 2, as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 * more details.
13 *
14 * You should have received a copy of the GNU General Public License along with
15 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
16 * Place - Suite 330, Boston, MA 02111-1307 USA.
17 * Authors:
18 * Yaozu (Eddie) Dong <Eddie.dong@intel.com>
19 *
20 */
21
22#ifndef __IRQ_H
23#define __IRQ_H
24
25#include "kvm.h"
26
27typedef void irq_request_func(void *opaque, int level);
28
29struct kvm_kpic_state {
30 u8 last_irr; /* edge detection */
31 u8 irr; /* interrupt request register */
32 u8 imr; /* interrupt mask register */
33 u8 isr; /* interrupt service register */
34 u8 priority_add; /* highest irq priority */
35 u8 irq_base;
36 u8 read_reg_select;
37 u8 poll;
38 u8 special_mask;
39 u8 init_state;
40 u8 auto_eoi;
41 u8 rotate_on_auto_eoi;
42 u8 special_fully_nested_mode;
43 u8 init4; /* true if 4 byte init */
44 u8 elcr; /* PIIX edge/trigger selection */
45 u8 elcr_mask;
46 struct kvm_pic *pics_state;
47};
48
49struct kvm_pic {
50 struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */
51 irq_request_func *irq_request;
52 void *irq_request_opaque;
53 int output; /* intr from master PIC */
54 struct kvm_io_device dev;
55};
56
57struct kvm_pic *kvm_create_pic(struct kvm *kvm);
58void kvm_pic_set_irq(void *opaque, int irq, int level);
59int kvm_pic_read_irq(struct kvm_pic *s);
60int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
61int kvm_cpu_has_interrupt(struct kvm_vcpu *v);
62void kvm_pic_update_irq(struct kvm_pic *s);
63
64#define IOAPIC_NUM_PINS KVM_IOAPIC_NUM_PINS
65#define IOAPIC_VERSION_ID 0x11 /* IOAPIC version */
66#define IOAPIC_EDGE_TRIG 0
67#define IOAPIC_LEVEL_TRIG 1
68
69#define IOAPIC_DEFAULT_BASE_ADDRESS 0xfec00000
70#define IOAPIC_MEM_LENGTH 0x100
71
72/* Direct registers. */
73#define IOAPIC_REG_SELECT 0x00
74#define IOAPIC_REG_WINDOW 0x10
75#define IOAPIC_REG_EOI 0x40 /* IA64 IOSAPIC only */
76
77/* Indirect registers. */
78#define IOAPIC_REG_APIC_ID 0x00 /* x86 IOAPIC only */
79#define IOAPIC_REG_VERSION 0x01
80#define IOAPIC_REG_ARB_ID 0x02 /* x86 IOAPIC only */
81
82struct kvm_ioapic {
83 u64 base_address;
84 u32 ioregsel;
85 u32 id;
86 u32 irr;
87 u32 pad;
88 union ioapic_redir_entry {
89 u64 bits;
90 struct {
91 u8 vector;
92 u8 delivery_mode:3;
93 u8 dest_mode:1;
94 u8 delivery_status:1;
95 u8 polarity:1;
96 u8 remote_irr:1;
97 u8 trig_mode:1;
98 u8 mask:1;
99 u8 reserve:7;
100 u8 reserved[4];
101 u8 dest_id;
102 } fields;
103 } redirtbl[IOAPIC_NUM_PINS];
104 struct kvm_io_device dev;
105 struct kvm *kvm;
106};
107
108struct kvm_lapic {
109 unsigned long base_address;
110 struct kvm_io_device dev;
111 struct {
112 atomic_t pending;
113 s64 period; /* unit: ns */
114 u32 divide_count;
115 ktime_t last_update;
116 struct hrtimer dev;
117 } timer;
118 struct kvm_vcpu *vcpu;
119 struct page *regs_page;
120 void *regs;
121};
122
123#ifdef DEBUG
124#define ASSERT(x) \
125do { \
126 if (!(x)) { \
127 printk(KERN_EMERG "assertion failed %s: %d: %s\n", \
128 __FILE__, __LINE__, #x); \
129 BUG(); \
130 } \
131} while (0)
132#else
133#define ASSERT(x) do { } while (0)
134#endif
135
136void kvm_vcpu_kick(struct kvm_vcpu *vcpu);
137int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu);
138int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu);
139int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu);
140int kvm_create_lapic(struct kvm_vcpu *vcpu);
141void kvm_lapic_reset(struct kvm_vcpu *vcpu);
142void kvm_free_apic(struct kvm_lapic *apic);
143u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu);
144void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8);
145void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value);
146struct kvm_lapic *kvm_apic_round_robin(struct kvm *kvm, u8 vector,
147 unsigned long bitmap);
148u64 kvm_get_apic_base(struct kvm_vcpu *vcpu);
149void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data);
150int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest);
151void kvm_ioapic_update_eoi(struct kvm *kvm, int vector);
152int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda);
153int kvm_apic_set_irq(struct kvm_lapic *apic, u8 vec, u8 trig);
154void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu);
155int kvm_ioapic_init(struct kvm *kvm);
156void kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level);
157int kvm_lapic_enabled(struct kvm_vcpu *vcpu);
158int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu);
159void kvm_apic_timer_intr_post(struct kvm_vcpu *vcpu, int vec);
160void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec);
161void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu);
162void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu);
163void kvm_migrate_apic_timer(struct kvm_vcpu *vcpu);
164
165#endif
diff --git a/drivers/kvm/mmu.c b/drivers/kvm/mmu.c
deleted file mode 100644
index feb5ac986c5d..000000000000
--- a/drivers/kvm/mmu.c
+++ /dev/null
@@ -1,1498 +0,0 @@
1/*
2 * Kernel-based Virtual Machine driver for Linux
3 *
4 * This module enables machines with Intel VT-x extensions to run virtual
5 * machines without emulation or binary translation.
6 *
7 * MMU support
8 *
9 * Copyright (C) 2006 Qumranet, Inc.
10 *
11 * Authors:
12 * Yaniv Kamay <yaniv@qumranet.com>
13 * Avi Kivity <avi@qumranet.com>
14 *
15 * This work is licensed under the terms of the GNU GPL, version 2. See
16 * the COPYING file in the top-level directory.
17 *
18 */
19
20#include "vmx.h"
21#include "kvm.h"
22
23#include <linux/types.h>
24#include <linux/string.h>
25#include <linux/mm.h>
26#include <linux/highmem.h>
27#include <linux/module.h>
28
29#include <asm/page.h>
30#include <asm/cmpxchg.h>
31
32#undef MMU_DEBUG
33
34#undef AUDIT
35
36#ifdef AUDIT
37static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg);
38#else
39static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) {}
40#endif
41
42#ifdef MMU_DEBUG
43
44#define pgprintk(x...) do { if (dbg) printk(x); } while (0)
45#define rmap_printk(x...) do { if (dbg) printk(x); } while (0)
46
47#else
48
49#define pgprintk(x...) do { } while (0)
50#define rmap_printk(x...) do { } while (0)
51
52#endif
53
54#if defined(MMU_DEBUG) || defined(AUDIT)
55static int dbg = 1;
56#endif
57
58#ifndef MMU_DEBUG
59#define ASSERT(x) do { } while (0)
60#else
61#define ASSERT(x) \
62 if (!(x)) { \
63 printk(KERN_WARNING "assertion failed %s:%d: %s\n", \
64 __FILE__, __LINE__, #x); \
65 }
66#endif
67
68#define PT64_PT_BITS 9
69#define PT64_ENT_PER_PAGE (1 << PT64_PT_BITS)
70#define PT32_PT_BITS 10
71#define PT32_ENT_PER_PAGE (1 << PT32_PT_BITS)
72
73#define PT_WRITABLE_SHIFT 1
74
75#define PT_PRESENT_MASK (1ULL << 0)
76#define PT_WRITABLE_MASK (1ULL << PT_WRITABLE_SHIFT)
77#define PT_USER_MASK (1ULL << 2)
78#define PT_PWT_MASK (1ULL << 3)
79#define PT_PCD_MASK (1ULL << 4)
80#define PT_ACCESSED_MASK (1ULL << 5)
81#define PT_DIRTY_MASK (1ULL << 6)
82#define PT_PAGE_SIZE_MASK (1ULL << 7)
83#define PT_PAT_MASK (1ULL << 7)
84#define PT_GLOBAL_MASK (1ULL << 8)
85#define PT64_NX_MASK (1ULL << 63)
86
87#define PT_PAT_SHIFT 7
88#define PT_DIR_PAT_SHIFT 12
89#define PT_DIR_PAT_MASK (1ULL << PT_DIR_PAT_SHIFT)
90
91#define PT32_DIR_PSE36_SIZE 4
92#define PT32_DIR_PSE36_SHIFT 13
93#define PT32_DIR_PSE36_MASK (((1ULL << PT32_DIR_PSE36_SIZE) - 1) << PT32_DIR_PSE36_SHIFT)
94
95
96#define PT_FIRST_AVAIL_BITS_SHIFT 9
97#define PT64_SECOND_AVAIL_BITS_SHIFT 52
98
99#define PT_SHADOW_IO_MARK (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
100
101#define VALID_PAGE(x) ((x) != INVALID_PAGE)
102
103#define PT64_LEVEL_BITS 9
104
105#define PT64_LEVEL_SHIFT(level) \
106 ( PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS )
107
108#define PT64_LEVEL_MASK(level) \
109 (((1ULL << PT64_LEVEL_BITS) - 1) << PT64_LEVEL_SHIFT(level))
110
111#define PT64_INDEX(address, level)\
112 (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1))
113
114
115#define PT32_LEVEL_BITS 10
116
117#define PT32_LEVEL_SHIFT(level) \
118 ( PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS )
119
120#define PT32_LEVEL_MASK(level) \
121 (((1ULL << PT32_LEVEL_BITS) - 1) << PT32_LEVEL_SHIFT(level))
122
123#define PT32_INDEX(address, level)\
124 (((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1))
125
126
127#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))
128#define PT64_DIR_BASE_ADDR_MASK \
129 (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + PT64_LEVEL_BITS)) - 1))
130
131#define PT32_BASE_ADDR_MASK PAGE_MASK
132#define PT32_DIR_BASE_ADDR_MASK \
133 (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1))
134
135
136#define PFERR_PRESENT_MASK (1U << 0)
137#define PFERR_WRITE_MASK (1U << 1)
138#define PFERR_USER_MASK (1U << 2)
139#define PFERR_FETCH_MASK (1U << 4)
140
141#define PT64_ROOT_LEVEL 4
142#define PT32_ROOT_LEVEL 2
143#define PT32E_ROOT_LEVEL 3
144
145#define PT_DIRECTORY_LEVEL 2
146#define PT_PAGE_TABLE_LEVEL 1
147
148#define RMAP_EXT 4
149
150struct kvm_rmap_desc {
151 u64 *shadow_ptes[RMAP_EXT];
152 struct kvm_rmap_desc *more;
153};
154
155static struct kmem_cache *pte_chain_cache;
156static struct kmem_cache *rmap_desc_cache;
157static struct kmem_cache *mmu_page_header_cache;
158
159static int is_write_protection(struct kvm_vcpu *vcpu)
160{
161 return vcpu->cr0 & X86_CR0_WP;
162}
163
164static int is_cpuid_PSE36(void)
165{
166 return 1;
167}
168
169static int is_nx(struct kvm_vcpu *vcpu)
170{
171 return vcpu->shadow_efer & EFER_NX;
172}
173
174static int is_present_pte(unsigned long pte)
175{
176 return pte & PT_PRESENT_MASK;
177}
178
179static int is_writeble_pte(unsigned long pte)
180{
181 return pte & PT_WRITABLE_MASK;
182}
183
184static int is_io_pte(unsigned long pte)
185{
186 return pte & PT_SHADOW_IO_MARK;
187}
188
189static int is_rmap_pte(u64 pte)
190{
191 return (pte & (PT_WRITABLE_MASK | PT_PRESENT_MASK))
192 == (PT_WRITABLE_MASK | PT_PRESENT_MASK);
193}
194
195static void set_shadow_pte(u64 *sptep, u64 spte)
196{
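	/*
	 * Descriptive note: set_64bit() takes a different pointer type on
	 * i386 and x86_64, hence the two casts; in both cases the spte is
	 * updated as a single atomic 64-bit write so a concurrent hardware
	 * page walk never sees a torn entry.
	 */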
197#ifdef CONFIG_X86_64
198 set_64bit((unsigned long *)sptep, spte);
199#else
200 set_64bit((unsigned long long *)sptep, spte);
201#endif
202}
203
204static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
205 struct kmem_cache *base_cache, int min)
206{
207 void *obj;
208
209 if (cache->nobjs >= min)
210 return 0;
211 while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
212 obj = kmem_cache_zalloc(base_cache, GFP_KERNEL);
213 if (!obj)
214 return -ENOMEM;
215 cache->objects[cache->nobjs++] = obj;
216 }
217 return 0;
218}
219
220static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
221{
222 while (mc->nobjs)
223 kfree(mc->objects[--mc->nobjs]);
224}
225
226static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache,
227 int min)
228{
229 struct page *page;
230
231 if (cache->nobjs >= min)
232 return 0;
233 while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
234 page = alloc_page(GFP_KERNEL);
235 if (!page)
236 return -ENOMEM;
237 set_page_private(page, 0);
238 cache->objects[cache->nobjs++] = page_address(page);
239 }
240 return 0;
241}
242
243static void mmu_free_memory_cache_page(struct kvm_mmu_memory_cache *mc)
244{
245 while (mc->nobjs)
246 free_page((unsigned long)mc->objects[--mc->nobjs]);
247}
248
249static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu)
250{
251 int r;
252
253 kvm_mmu_free_some_pages(vcpu);
254 r = mmu_topup_memory_cache(&vcpu->mmu_pte_chain_cache,
255 pte_chain_cache, 4);
256 if (r)
257 goto out;
258 r = mmu_topup_memory_cache(&vcpu->mmu_rmap_desc_cache,
259 rmap_desc_cache, 1);
260 if (r)
261 goto out;
262 r = mmu_topup_memory_cache_page(&vcpu->mmu_page_cache, 4);
263 if (r)
264 goto out;
265 r = mmu_topup_memory_cache(&vcpu->mmu_page_header_cache,
266 mmu_page_header_cache, 4);
267out:
268 return r;
269}
270
271static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
272{
273 mmu_free_memory_cache(&vcpu->mmu_pte_chain_cache);
274 mmu_free_memory_cache(&vcpu->mmu_rmap_desc_cache);
275 mmu_free_memory_cache_page(&vcpu->mmu_page_cache);
276 mmu_free_memory_cache(&vcpu->mmu_page_header_cache);
277}
278
279static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc,
280 size_t size)
281{
282 void *p;
283
284 BUG_ON(!mc->nobjs);
285 p = mc->objects[--mc->nobjs];
286 memset(p, 0, size);
287 return p;
288}
289
290static struct kvm_pte_chain *mmu_alloc_pte_chain(struct kvm_vcpu *vcpu)
291{
292 return mmu_memory_cache_alloc(&vcpu->mmu_pte_chain_cache,
293 sizeof(struct kvm_pte_chain));
294}
295
296static void mmu_free_pte_chain(struct kvm_pte_chain *pc)
297{
298 kfree(pc);
299}
300
301static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu)
302{
303 return mmu_memory_cache_alloc(&vcpu->mmu_rmap_desc_cache,
304 sizeof(struct kvm_rmap_desc));
305}
306
307static void mmu_free_rmap_desc(struct kvm_rmap_desc *rd)
308{
309 kfree(rd);
310}
311
312/*
313 * Reverse mapping data structures:
314 *
315 * If page->private bit zero is zero, then page->private points to the
316 * shadow page table entry that points to page_address(page).
317 *
318 * If page->private bit zero is one, then (page->private & ~1) points
319 * to a struct kvm_rmap_desc containing more mappings.
320 */
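For reference, the tagged-pointer encoding described above can be read back with a small walker. This is only an illustrative sketch (rmap_for_each_spte is a made-up name, not part of this file), summarizing the structure that rmap_add() and rmap_remove() below maintain:

static void rmap_for_each_spte(struct page *page, void (*fn)(u64 *spte))
{
	struct kvm_rmap_desc *desc;
	int i;

	if (!page_private(page))
		return;					/* no reverse mappings */
	if (!(page_private(page) & 1)) {
		fn((u64 *)page_private(page));		/* single spte stored directly */
		return;
	}
	desc = (struct kvm_rmap_desc *)(page_private(page) & ~1ul);
	while (desc) {					/* chain of descriptors */
		for (i = 0; i < RMAP_EXT && desc->shadow_ptes[i]; ++i)
			fn(desc->shadow_ptes[i]);
		desc = desc->more;
	}
}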
321static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte)
322{
323 struct page *page;
324 struct kvm_rmap_desc *desc;
325 int i;
326
327 if (!is_rmap_pte(*spte))
328 return;
329 page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT);
330 if (!page_private(page)) {
331 rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte);
332 set_page_private(page,(unsigned long)spte);
333 } else if (!(page_private(page) & 1)) {
334 rmap_printk("rmap_add: %p %llx 1->many\n", spte, *spte);
335 desc = mmu_alloc_rmap_desc(vcpu);
336 desc->shadow_ptes[0] = (u64 *)page_private(page);
337 desc->shadow_ptes[1] = spte;
338 set_page_private(page,(unsigned long)desc | 1);
339 } else {
340 rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte);
341 desc = (struct kvm_rmap_desc *)(page_private(page) & ~1ul);
342 while (desc->shadow_ptes[RMAP_EXT-1] && desc->more)
343 desc = desc->more;
344 if (desc->shadow_ptes[RMAP_EXT-1]) {
345 desc->more = mmu_alloc_rmap_desc(vcpu);
346 desc = desc->more;
347 }
348 for (i = 0; desc->shadow_ptes[i]; ++i)
349 ;
350 desc->shadow_ptes[i] = spte;
351 }
352}
353
354static void rmap_desc_remove_entry(struct page *page,
355 struct kvm_rmap_desc *desc,
356 int i,
357 struct kvm_rmap_desc *prev_desc)
358{
359 int j;
360
361 for (j = RMAP_EXT - 1; !desc->shadow_ptes[j] && j > i; --j)
362 ;
363 desc->shadow_ptes[i] = desc->shadow_ptes[j];
364 desc->shadow_ptes[j] = NULL;
365 if (j != 0)
366 return;
367 if (!prev_desc && !desc->more)
368 set_page_private(page,(unsigned long)desc->shadow_ptes[0]);
369 else
370 if (prev_desc)
371 prev_desc->more = desc->more;
372 else
373 set_page_private(page,(unsigned long)desc->more | 1);
374 mmu_free_rmap_desc(desc);
375}
376
377static void rmap_remove(u64 *spte)
378{
379 struct page *page;
380 struct kvm_rmap_desc *desc;
381 struct kvm_rmap_desc *prev_desc;
382 int i;
383
384 if (!is_rmap_pte(*spte))
385 return;
386 page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT);
387 if (!page_private(page)) {
388 printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte);
389 BUG();
390 } else if (!(page_private(page) & 1)) {
391 rmap_printk("rmap_remove: %p %llx 1->0\n", spte, *spte);
392 if ((u64 *)page_private(page) != spte) {
393 printk(KERN_ERR "rmap_remove: %p %llx 1->BUG\n",
394 spte, *spte);
395 BUG();
396 }
397 set_page_private(page,0);
398 } else {
399 rmap_printk("rmap_remove: %p %llx many->many\n", spte, *spte);
400 desc = (struct kvm_rmap_desc *)(page_private(page) & ~1ul);
401 prev_desc = NULL;
402 while (desc) {
403 for (i = 0; i < RMAP_EXT && desc->shadow_ptes[i]; ++i)
404 if (desc->shadow_ptes[i] == spte) {
405 rmap_desc_remove_entry(page,
406 desc, i,
407 prev_desc);
408 return;
409 }
410 prev_desc = desc;
411 desc = desc->more;
412 }
413 BUG();
414 }
415}
416
417static void rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn)
418{
419 struct kvm *kvm = vcpu->kvm;
420 struct page *page;
421 struct kvm_rmap_desc *desc;
422 u64 *spte;
423
424 page = gfn_to_page(kvm, gfn);
425 BUG_ON(!page);
426
427 while (page_private(page)) {
428 if (!(page_private(page) & 1))
429 spte = (u64 *)page_private(page);
430 else {
431 desc = (struct kvm_rmap_desc *)(page_private(page) & ~1ul);
432 spte = desc->shadow_ptes[0];
433 }
434 BUG_ON(!spte);
435 BUG_ON((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT
436 != page_to_pfn(page));
437 BUG_ON(!(*spte & PT_PRESENT_MASK));
438 BUG_ON(!(*spte & PT_WRITABLE_MASK));
439 rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
440 rmap_remove(spte);
441 set_shadow_pte(spte, *spte & ~PT_WRITABLE_MASK);
442 kvm_flush_remote_tlbs(vcpu->kvm);
443 }
444}
445
446#ifdef MMU_DEBUG
447static int is_empty_shadow_page(u64 *spt)
448{
449 u64 *pos;
450 u64 *end;
451
452 for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++)
453 if (*pos != 0) {
454 printk(KERN_ERR "%s: %p %llx\n", __FUNCTION__,
455 pos, *pos);
456 return 0;
457 }
458 return 1;
459}
460#endif
461
462static void kvm_mmu_free_page(struct kvm *kvm,
463 struct kvm_mmu_page *page_head)
464{
465 ASSERT(is_empty_shadow_page(page_head->spt));
466 list_del(&page_head->link);
467 __free_page(virt_to_page(page_head->spt));
468 kfree(page_head);
469 ++kvm->n_free_mmu_pages;
470}
471
472static unsigned kvm_page_table_hashfn(gfn_t gfn)
473{
474 return gfn;
475}
476
477static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
478 u64 *parent_pte)
479{
480 struct kvm_mmu_page *page;
481
482 if (!vcpu->kvm->n_free_mmu_pages)
483 return NULL;
484
485 page = mmu_memory_cache_alloc(&vcpu->mmu_page_header_cache,
486 sizeof *page);
487 page->spt = mmu_memory_cache_alloc(&vcpu->mmu_page_cache, PAGE_SIZE);
488 set_page_private(virt_to_page(page->spt), (unsigned long)page);
489 list_add(&page->link, &vcpu->kvm->active_mmu_pages);
490 ASSERT(is_empty_shadow_page(page->spt));
491 page->slot_bitmap = 0;
492 page->multimapped = 0;
493 page->parent_pte = parent_pte;
494 --vcpu->kvm->n_free_mmu_pages;
495 return page;
496}
497
498static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu,
499 struct kvm_mmu_page *page, u64 *parent_pte)
500{
501 struct kvm_pte_chain *pte_chain;
502 struct hlist_node *node;
503 int i;
504
505 if (!parent_pte)
506 return;
507 if (!page->multimapped) {
508 u64 *old = page->parent_pte;
509
510 if (!old) {
511 page->parent_pte = parent_pte;
512 return;
513 }
514 page->multimapped = 1;
515 pte_chain = mmu_alloc_pte_chain(vcpu);
516 INIT_HLIST_HEAD(&page->parent_ptes);
517 hlist_add_head(&pte_chain->link, &page->parent_ptes);
518 pte_chain->parent_ptes[0] = old;
519 }
520 hlist_for_each_entry(pte_chain, node, &page->parent_ptes, link) {
521 if (pte_chain->parent_ptes[NR_PTE_CHAIN_ENTRIES-1])
522 continue;
523 for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i)
524 if (!pte_chain->parent_ptes[i]) {
525 pte_chain->parent_ptes[i] = parent_pte;
526 return;
527 }
528 }
529 pte_chain = mmu_alloc_pte_chain(vcpu);
530 BUG_ON(!pte_chain);
531 hlist_add_head(&pte_chain->link, &page->parent_ptes);
532 pte_chain->parent_ptes[0] = parent_pte;
533}
534
535static void mmu_page_remove_parent_pte(struct kvm_mmu_page *page,
536 u64 *parent_pte)
537{
538 struct kvm_pte_chain *pte_chain;
539 struct hlist_node *node;
540 int i;
541
542 if (!page->multimapped) {
543 BUG_ON(page->parent_pte != parent_pte);
544 page->parent_pte = NULL;
545 return;
546 }
547 hlist_for_each_entry(pte_chain, node, &page->parent_ptes, link)
548 for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
549 if (!pte_chain->parent_ptes[i])
550 break;
551 if (pte_chain->parent_ptes[i] != parent_pte)
552 continue;
553 while (i + 1 < NR_PTE_CHAIN_ENTRIES
554 && pte_chain->parent_ptes[i + 1]) {
555 pte_chain->parent_ptes[i]
556 = pte_chain->parent_ptes[i + 1];
557 ++i;
558 }
559 pte_chain->parent_ptes[i] = NULL;
560 if (i == 0) {
561 hlist_del(&pte_chain->link);
562 mmu_free_pte_chain(pte_chain);
563 if (hlist_empty(&page->parent_ptes)) {
564 page->multimapped = 0;
565 page->parent_pte = NULL;
566 }
567 }
568 return;
569 }
570 BUG();
571}
572
573static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm_vcpu *vcpu,
574 gfn_t gfn)
575{
576 unsigned index;
577 struct hlist_head *bucket;
578 struct kvm_mmu_page *page;
579 struct hlist_node *node;
580
581 pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn);
582 index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
583 bucket = &vcpu->kvm->mmu_page_hash[index];
584 hlist_for_each_entry(page, node, bucket, hash_link)
585 if (page->gfn == gfn && !page->role.metaphysical) {
586 pgprintk("%s: found role %x\n",
587 __FUNCTION__, page->role.word);
588 return page;
589 }
590 return NULL;
591}
592
593static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
594 gfn_t gfn,
595 gva_t gaddr,
596 unsigned level,
597 int metaphysical,
598 unsigned hugepage_access,
599 u64 *parent_pte)
600{
601 union kvm_mmu_page_role role;
602 unsigned index;
603 unsigned quadrant;
604 struct hlist_head *bucket;
605 struct kvm_mmu_page *page;
606 struct hlist_node *node;
607
608 role.word = 0;
609 role.glevels = vcpu->mmu.root_level;
610 role.level = level;
611 role.metaphysical = metaphysical;
612 role.hugepage_access = hugepage_access;
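	/*
	 * A 32-bit guest page table holds 1024 4-byte entries, twice what a
	 * shadow page (512 8-byte entries) can map, so each guest table is
	 * shadowed by two pages at the pte level and four at the root;
	 * role.quadrant records which piece of the guest table this shadow
	 * page covers.
	 */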
613 if (vcpu->mmu.root_level <= PT32_ROOT_LEVEL) {
614 quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
615 quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
616 role.quadrant = quadrant;
617 }
618 pgprintk("%s: looking gfn %lx role %x\n", __FUNCTION__,
619 gfn, role.word);
620 index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
621 bucket = &vcpu->kvm->mmu_page_hash[index];
622 hlist_for_each_entry(page, node, bucket, hash_link)
623 if (page->gfn == gfn && page->role.word == role.word) {
624 mmu_page_add_parent_pte(vcpu, page, parent_pte);
625 pgprintk("%s: found\n", __FUNCTION__);
626 return page;
627 }
628 page = kvm_mmu_alloc_page(vcpu, parent_pte);
629 if (!page)
630 return page;
631 pgprintk("%s: adding gfn %lx role %x\n", __FUNCTION__, gfn, role.word);
632 page->gfn = gfn;
633 page->role = role;
634 hlist_add_head(&page->hash_link, bucket);
635 if (!metaphysical)
636 rmap_write_protect(vcpu, gfn);
637 return page;
638}
639
640static void kvm_mmu_page_unlink_children(struct kvm *kvm,
641 struct kvm_mmu_page *page)
642{
643 unsigned i;
644 u64 *pt;
645 u64 ent;
646
647 pt = page->spt;
648
649 if (page->role.level == PT_PAGE_TABLE_LEVEL) {
650 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
651 if (pt[i] & PT_PRESENT_MASK)
652 rmap_remove(&pt[i]);
653 pt[i] = 0;
654 }
655 kvm_flush_remote_tlbs(kvm);
656 return;
657 }
658
659 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
660 ent = pt[i];
661
662 pt[i] = 0;
663 if (!(ent & PT_PRESENT_MASK))
664 continue;
665 ent &= PT64_BASE_ADDR_MASK;
666 mmu_page_remove_parent_pte(page_header(ent), &pt[i]);
667 }
668 kvm_flush_remote_tlbs(kvm);
669}
670
671static void kvm_mmu_put_page(struct kvm_mmu_page *page,
672 u64 *parent_pte)
673{
674 mmu_page_remove_parent_pte(page, parent_pte);
675}
676
677static void kvm_mmu_zap_page(struct kvm *kvm,
678 struct kvm_mmu_page *page)
679{
680 u64 *parent_pte;
681
682 while (page->multimapped || page->parent_pte) {
683 if (!page->multimapped)
684 parent_pte = page->parent_pte;
685 else {
686 struct kvm_pte_chain *chain;
687
688 chain = container_of(page->parent_ptes.first,
689 struct kvm_pte_chain, link);
690 parent_pte = chain->parent_ptes[0];
691 }
692 BUG_ON(!parent_pte);
693 kvm_mmu_put_page(page, parent_pte);
694 set_shadow_pte(parent_pte, 0);
695 }
696 kvm_mmu_page_unlink_children(kvm, page);
697 if (!page->root_count) {
698 hlist_del(&page->hash_link);
699 kvm_mmu_free_page(kvm, page);
700 } else
701 list_move(&page->link, &kvm->active_mmu_pages);
702}
703
704static int kvm_mmu_unprotect_page(struct kvm_vcpu *vcpu, gfn_t gfn)
705{
706 unsigned index;
707 struct hlist_head *bucket;
708 struct kvm_mmu_page *page;
709 struct hlist_node *node, *n;
710 int r;
711
712 pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn);
713 r = 0;
714 index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
715 bucket = &vcpu->kvm->mmu_page_hash[index];
716 hlist_for_each_entry_safe(page, node, n, bucket, hash_link)
717 if (page->gfn == gfn && !page->role.metaphysical) {
718 pgprintk("%s: gfn %lx role %x\n", __FUNCTION__, gfn,
719 page->role.word);
720 kvm_mmu_zap_page(vcpu->kvm, page);
721 r = 1;
722 }
723 return r;
724}
725
726static void mmu_unshadow(struct kvm_vcpu *vcpu, gfn_t gfn)
727{
728 struct kvm_mmu_page *page;
729
730 while ((page = kvm_mmu_lookup_page(vcpu, gfn)) != NULL) {
731 pgprintk("%s: zap %lx %x\n",
732 __FUNCTION__, gfn, page->role.word);
733 kvm_mmu_zap_page(vcpu->kvm, page);
734 }
735}
736
737static void page_header_update_slot(struct kvm *kvm, void *pte, gpa_t gpa)
738{
739 int slot = memslot_id(kvm, gfn_to_memslot(kvm, gpa >> PAGE_SHIFT));
740 struct kvm_mmu_page *page_head = page_header(__pa(pte));
741
742 __set_bit(slot, &page_head->slot_bitmap);
743}
744
745hpa_t safe_gpa_to_hpa(struct kvm_vcpu *vcpu, gpa_t gpa)
746{
747 hpa_t hpa = gpa_to_hpa(vcpu, gpa);
748
749	return is_error_hpa(hpa) ? bad_page_address | (gpa & ~PAGE_MASK) : hpa;
750}
751
752hpa_t gpa_to_hpa(struct kvm_vcpu *vcpu, gpa_t gpa)
753{
754 struct page *page;
755
756 ASSERT((gpa & HPA_ERR_MASK) == 0);
757 page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
758 if (!page)
759 return gpa | HPA_ERR_MASK;
760 return ((hpa_t)page_to_pfn(page) << PAGE_SHIFT)
761 | (gpa & (PAGE_SIZE-1));
762}
763
764hpa_t gva_to_hpa(struct kvm_vcpu *vcpu, gva_t gva)
765{
766 gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, gva);
767
768 if (gpa == UNMAPPED_GVA)
769 return UNMAPPED_GVA;
770 return gpa_to_hpa(vcpu, gpa);
771}
772
773struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva)
774{
775 gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, gva);
776
777 if (gpa == UNMAPPED_GVA)
778 return NULL;
779 return pfn_to_page(gpa_to_hpa(vcpu, gpa) >> PAGE_SHIFT);
780}
781
782static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
783{
784}
785
786static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, hpa_t p)
787{
788 int level = PT32E_ROOT_LEVEL;
789 hpa_t table_addr = vcpu->mmu.root_hpa;
790
791 for (; ; level--) {
792 u32 index = PT64_INDEX(v, level);
793 u64 *table;
794 u64 pte;
795
796 ASSERT(VALID_PAGE(table_addr));
797 table = __va(table_addr);
798
799 if (level == 1) {
800 pte = table[index];
801 if (is_present_pte(pte) && is_writeble_pte(pte))
802 return 0;
803 mark_page_dirty(vcpu->kvm, v >> PAGE_SHIFT);
804 page_header_update_slot(vcpu->kvm, table, v);
805 table[index] = p | PT_PRESENT_MASK | PT_WRITABLE_MASK |
806 PT_USER_MASK;
807 rmap_add(vcpu, &table[index]);
808 return 0;
809 }
810
811 if (table[index] == 0) {
812 struct kvm_mmu_page *new_table;
813 gfn_t pseudo_gfn;
814
815 pseudo_gfn = (v & PT64_DIR_BASE_ADDR_MASK)
816 >> PAGE_SHIFT;
817 new_table = kvm_mmu_get_page(vcpu, pseudo_gfn,
818 v, level - 1,
819 1, 0, &table[index]);
820 if (!new_table) {
821 pgprintk("nonpaging_map: ENOMEM\n");
822 return -ENOMEM;
823 }
824
825 table[index] = __pa(new_table->spt) | PT_PRESENT_MASK
826 | PT_WRITABLE_MASK | PT_USER_MASK;
827 }
828 table_addr = table[index] & PT64_BASE_ADDR_MASK;
829 }
830}
831
832static void mmu_free_roots(struct kvm_vcpu *vcpu)
833{
834 int i;
835 struct kvm_mmu_page *page;
836
837 if (!VALID_PAGE(vcpu->mmu.root_hpa))
838 return;
839#ifdef CONFIG_X86_64
840 if (vcpu->mmu.shadow_root_level == PT64_ROOT_LEVEL) {
841 hpa_t root = vcpu->mmu.root_hpa;
842
843 page = page_header(root);
844 --page->root_count;
845 vcpu->mmu.root_hpa = INVALID_PAGE;
846 return;
847 }
848#endif
849 for (i = 0; i < 4; ++i) {
850 hpa_t root = vcpu->mmu.pae_root[i];
851
852 if (root) {
853 root &= PT64_BASE_ADDR_MASK;
854 page = page_header(root);
855 --page->root_count;
856 }
857 vcpu->mmu.pae_root[i] = INVALID_PAGE;
858 }
859 vcpu->mmu.root_hpa = INVALID_PAGE;
860}
861
862static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
863{
864 int i;
865 gfn_t root_gfn;
866 struct kvm_mmu_page *page;
867
868 root_gfn = vcpu->cr3 >> PAGE_SHIFT;
869
870#ifdef CONFIG_X86_64
871 if (vcpu->mmu.shadow_root_level == PT64_ROOT_LEVEL) {
872 hpa_t root = vcpu->mmu.root_hpa;
873
874 ASSERT(!VALID_PAGE(root));
875 page = kvm_mmu_get_page(vcpu, root_gfn, 0,
876 PT64_ROOT_LEVEL, 0, 0, NULL);
877 root = __pa(page->spt);
878 ++page->root_count;
879 vcpu->mmu.root_hpa = root;
880 return;
881 }
882#endif
883 for (i = 0; i < 4; ++i) {
884 hpa_t root = vcpu->mmu.pae_root[i];
885
886 ASSERT(!VALID_PAGE(root));
887 if (vcpu->mmu.root_level == PT32E_ROOT_LEVEL) {
888 if (!is_present_pte(vcpu->pdptrs[i])) {
889 vcpu->mmu.pae_root[i] = 0;
890 continue;
891 }
892 root_gfn = vcpu->pdptrs[i] >> PAGE_SHIFT;
893 } else if (vcpu->mmu.root_level == 0)
894 root_gfn = 0;
895 page = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
896 PT32_ROOT_LEVEL, !is_paging(vcpu),
897 0, NULL);
898 root = __pa(page->spt);
899 ++page->root_count;
900 vcpu->mmu.pae_root[i] = root | PT_PRESENT_MASK;
901 }
902 vcpu->mmu.root_hpa = __pa(vcpu->mmu.pae_root);
903}
904
905static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr)
906{
907 return vaddr;
908}
909
910static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
911 u32 error_code)
912{
913 gpa_t addr = gva;
914 hpa_t paddr;
915 int r;
916
917 r = mmu_topup_memory_caches(vcpu);
918 if (r)
919 return r;
920
921 ASSERT(vcpu);
922 ASSERT(VALID_PAGE(vcpu->mmu.root_hpa));
923
924
925 paddr = gpa_to_hpa(vcpu , addr & PT64_BASE_ADDR_MASK);
926
927 if (is_error_hpa(paddr))
928 return 1;
929
930 return nonpaging_map(vcpu, addr & PAGE_MASK, paddr);
931}
932
933static void nonpaging_free(struct kvm_vcpu *vcpu)
934{
935 mmu_free_roots(vcpu);
936}
937
938static int nonpaging_init_context(struct kvm_vcpu *vcpu)
939{
940 struct kvm_mmu *context = &vcpu->mmu;
941
942 context->new_cr3 = nonpaging_new_cr3;
943 context->page_fault = nonpaging_page_fault;
944 context->gva_to_gpa = nonpaging_gva_to_gpa;
945 context->free = nonpaging_free;
946 context->root_level = 0;
947 context->shadow_root_level = PT32E_ROOT_LEVEL;
948 context->root_hpa = INVALID_PAGE;
949 return 0;
950}
951
952static void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
953{
954 ++vcpu->stat.tlb_flush;
955 kvm_x86_ops->tlb_flush(vcpu);
956}
957
958static void paging_new_cr3(struct kvm_vcpu *vcpu)
959{
960 pgprintk("%s: cr3 %lx\n", __FUNCTION__, vcpu->cr3);
961 mmu_free_roots(vcpu);
962}
963
964static void inject_page_fault(struct kvm_vcpu *vcpu,
965 u64 addr,
966 u32 err_code)
967{
968 kvm_x86_ops->inject_page_fault(vcpu, addr, err_code);
969}
970
971static void paging_free(struct kvm_vcpu *vcpu)
972{
973 nonpaging_free(vcpu);
974}
975
976#define PTTYPE 64
977#include "paging_tmpl.h"
978#undef PTTYPE
979
980#define PTTYPE 32
981#include "paging_tmpl.h"
982#undef PTTYPE
983
984static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level)
985{
986 struct kvm_mmu *context = &vcpu->mmu;
987
988 ASSERT(is_pae(vcpu));
989 context->new_cr3 = paging_new_cr3;
990 context->page_fault = paging64_page_fault;
991 context->gva_to_gpa = paging64_gva_to_gpa;
992 context->free = paging_free;
993 context->root_level = level;
994 context->shadow_root_level = level;
995 context->root_hpa = INVALID_PAGE;
996 return 0;
997}
998
999static int paging64_init_context(struct kvm_vcpu *vcpu)
1000{
1001 return paging64_init_context_common(vcpu, PT64_ROOT_LEVEL);
1002}
1003
1004static int paging32_init_context(struct kvm_vcpu *vcpu)
1005{
1006 struct kvm_mmu *context = &vcpu->mmu;
1007
1008 context->new_cr3 = paging_new_cr3;
1009 context->page_fault = paging32_page_fault;
1010 context->gva_to_gpa = paging32_gva_to_gpa;
1011 context->free = paging_free;
1012 context->root_level = PT32_ROOT_LEVEL;
1013 context->shadow_root_level = PT32E_ROOT_LEVEL;
1014 context->root_hpa = INVALID_PAGE;
1015 return 0;
1016}
1017
1018static int paging32E_init_context(struct kvm_vcpu *vcpu)
1019{
1020 return paging64_init_context_common(vcpu, PT32E_ROOT_LEVEL);
1021}
1022
1023static int init_kvm_mmu(struct kvm_vcpu *vcpu)
1024{
1025 ASSERT(vcpu);
1026 ASSERT(!VALID_PAGE(vcpu->mmu.root_hpa));
1027
1028 if (!is_paging(vcpu))
1029 return nonpaging_init_context(vcpu);
1030 else if (is_long_mode(vcpu))
1031 return paging64_init_context(vcpu);
1032 else if (is_pae(vcpu))
1033 return paging32E_init_context(vcpu);
1034 else
1035 return paging32_init_context(vcpu);
1036}
1037
1038static void destroy_kvm_mmu(struct kvm_vcpu *vcpu)
1039{
1040 ASSERT(vcpu);
1041 if (VALID_PAGE(vcpu->mmu.root_hpa)) {
1042 vcpu->mmu.free(vcpu);
1043 vcpu->mmu.root_hpa = INVALID_PAGE;
1044 }
1045}
1046
1047int kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
1048{
1049 destroy_kvm_mmu(vcpu);
1050 return init_kvm_mmu(vcpu);
1051}
1052EXPORT_SYMBOL_GPL(kvm_mmu_reset_context);
1053
1054int kvm_mmu_load(struct kvm_vcpu *vcpu)
1055{
1056 int r;
1057
1058 mutex_lock(&vcpu->kvm->lock);
1059 r = mmu_topup_memory_caches(vcpu);
1060 if (r)
1061 goto out;
1062 mmu_alloc_roots(vcpu);
1063 kvm_x86_ops->set_cr3(vcpu, vcpu->mmu.root_hpa);
1064 kvm_mmu_flush_tlb(vcpu);
1065out:
1066 mutex_unlock(&vcpu->kvm->lock);
1067 return r;
1068}
1069EXPORT_SYMBOL_GPL(kvm_mmu_load);
1070
1071void kvm_mmu_unload(struct kvm_vcpu *vcpu)
1072{
1073 mmu_free_roots(vcpu);
1074}
1075
1076static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
1077 struct kvm_mmu_page *page,
1078 u64 *spte)
1079{
1080 u64 pte;
1081 struct kvm_mmu_page *child;
1082
1083 pte = *spte;
1084 if (is_present_pte(pte)) {
1085 if (page->role.level == PT_PAGE_TABLE_LEVEL)
1086 rmap_remove(spte);
1087 else {
1088 child = page_header(pte & PT64_BASE_ADDR_MASK);
1089 mmu_page_remove_parent_pte(child, spte);
1090 }
1091 }
1092 set_shadow_pte(spte, 0);
1093 kvm_flush_remote_tlbs(vcpu->kvm);
1094}
1095
1096static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
1097 struct kvm_mmu_page *page,
1098 u64 *spte,
1099 const void *new, int bytes)
1100{
1101 if (page->role.level != PT_PAGE_TABLE_LEVEL)
1102 return;
1103
1104 if (page->role.glevels == PT32_ROOT_LEVEL)
1105 paging32_update_pte(vcpu, page, spte, new, bytes);
1106 else
1107 paging64_update_pte(vcpu, page, spte, new, bytes);
1108}
1109
1110void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
1111 const u8 *new, int bytes)
1112{
1113 gfn_t gfn = gpa >> PAGE_SHIFT;
1114 struct kvm_mmu_page *page;
1115 struct hlist_node *node, *n;
1116 struct hlist_head *bucket;
1117 unsigned index;
1118 u64 *spte;
1119 unsigned offset = offset_in_page(gpa);
1120 unsigned pte_size;
1121 unsigned page_offset;
1122 unsigned misaligned;
1123 unsigned quadrant;
1124 int level;
1125 int flooded = 0;
1126 int npte;
1127
1128 pgprintk("%s: gpa %llx bytes %d\n", __FUNCTION__, gpa, bytes);
1129 if (gfn == vcpu->last_pt_write_gfn) {
1130 ++vcpu->last_pt_write_count;
1131 if (vcpu->last_pt_write_count >= 3)
1132 flooded = 1;
1133 } else {
1134 vcpu->last_pt_write_gfn = gfn;
1135 vcpu->last_pt_write_count = 1;
1136 }
1137 index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
1138 bucket = &vcpu->kvm->mmu_page_hash[index];
1139 hlist_for_each_entry_safe(page, node, n, bucket, hash_link) {
1140 if (page->gfn != gfn || page->role.metaphysical)
1141 continue;
1142 pte_size = page->role.glevels == PT32_ROOT_LEVEL ? 4 : 8;
1143 misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
1144 misaligned |= bytes < 4;
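		/*
		 * Example: a 4-byte write at page offset 6 with pte_size 8
		 * gives (6 ^ 9) & ~7 = 8 (non-zero): the write straddles two
		 * sptes and is treated as misaligned.
		 */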
1145 if (misaligned || flooded) {
1146 /*
1147 * Misaligned accesses are too much trouble to fix
1148 * up; also, they usually indicate a page is not used
1149 * as a page table.
1150 *
1151 * If we're seeing too many writes to a page,
1152 * it may no longer be a page table, or we may be
1153 * forking, in which case it is better to unmap the
1154 * page.
1155 */
1156 pgprintk("misaligned: gpa %llx bytes %d role %x\n",
1157 gpa, bytes, page->role.word);
1158 kvm_mmu_zap_page(vcpu->kvm, page);
1159 continue;
1160 }
1161 page_offset = offset;
1162 level = page->role.level;
1163 npte = 1;
1164 if (page->role.glevels == PT32_ROOT_LEVEL) {
1165 page_offset <<= 1; /* 32->64 */
1166 /*
1167 * A 32-bit pde maps 4MB while the shadow pdes map
1168 * only 2MB. So we need to double the offset again
1169 * and zap two pdes instead of one.
1170 */
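			/*
			 * E.g. a 4-byte guest write at offset 0x200 of its
			 * page directory has page_offset 0x400 at this point;
			 * the second doubling below makes it 0x800 with
			 * npte = 2, so the shadow pdes at 0x800 and 0x808
			 * (the two 2MB halves of the guest's 4MB pde) are
			 * both zapped.
			 */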
1171 if (level == PT32_ROOT_LEVEL) {
1172 page_offset &= ~7; /* kill rounding error */
1173 page_offset <<= 1;
1174 npte = 2;
1175 }
1176 quadrant = page_offset >> PAGE_SHIFT;
1177 page_offset &= ~PAGE_MASK;
1178 if (quadrant != page->role.quadrant)
1179 continue;
1180 }
1181 spte = &page->spt[page_offset / sizeof(*spte)];
1182 while (npte--) {
1183 mmu_pte_write_zap_pte(vcpu, page, spte);
1184 mmu_pte_write_new_pte(vcpu, page, spte, new, bytes);
1185 ++spte;
1186 }
1187 }
1188}
1189
1190int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
1191{
1192 gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, gva);
1193
1194 return kvm_mmu_unprotect_page(vcpu, gpa >> PAGE_SHIFT);
1195}
1196
1197void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
1198{
1199 while (vcpu->kvm->n_free_mmu_pages < KVM_REFILL_PAGES) {
1200 struct kvm_mmu_page *page;
1201
1202 page = container_of(vcpu->kvm->active_mmu_pages.prev,
1203 struct kvm_mmu_page, link);
1204 kvm_mmu_zap_page(vcpu->kvm, page);
1205 }
1206}
1207
1208static void free_mmu_pages(struct kvm_vcpu *vcpu)
1209{
1210 struct kvm_mmu_page *page;
1211
1212 while (!list_empty(&vcpu->kvm->active_mmu_pages)) {
1213 page = container_of(vcpu->kvm->active_mmu_pages.next,
1214 struct kvm_mmu_page, link);
1215 kvm_mmu_zap_page(vcpu->kvm, page);
1216 }
1217 free_page((unsigned long)vcpu->mmu.pae_root);
1218}
1219
1220static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
1221{
1222 struct page *page;
1223 int i;
1224
1225 ASSERT(vcpu);
1226
1227 vcpu->kvm->n_free_mmu_pages = KVM_NUM_MMU_PAGES;
1228
1229 /*
1230 * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64.
1231 * Therefore we need to allocate shadow page tables in the first
1232 * 4GB of memory, which happens to fit the DMA32 zone.
1233 */
1234 page = alloc_page(GFP_KERNEL | __GFP_DMA32);
1235 if (!page)
1236 goto error_1;
1237 vcpu->mmu.pae_root = page_address(page);
1238 for (i = 0; i < 4; ++i)
1239 vcpu->mmu.pae_root[i] = INVALID_PAGE;
1240
1241 return 0;
1242
1243error_1:
1244 free_mmu_pages(vcpu);
1245 return -ENOMEM;
1246}
1247
1248int kvm_mmu_create(struct kvm_vcpu *vcpu)
1249{
1250 ASSERT(vcpu);
1251 ASSERT(!VALID_PAGE(vcpu->mmu.root_hpa));
1252
1253 return alloc_mmu_pages(vcpu);
1254}
1255
1256int kvm_mmu_setup(struct kvm_vcpu *vcpu)
1257{
1258 ASSERT(vcpu);
1259 ASSERT(!VALID_PAGE(vcpu->mmu.root_hpa));
1260
1261 return init_kvm_mmu(vcpu);
1262}
1263
1264void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
1265{
1266 ASSERT(vcpu);
1267
1268 destroy_kvm_mmu(vcpu);
1269 free_mmu_pages(vcpu);
1270 mmu_free_memory_caches(vcpu);
1271}
1272
1273void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
1274{
1275 struct kvm_mmu_page *page;
1276
1277 list_for_each_entry(page, &kvm->active_mmu_pages, link) {
1278 int i;
1279 u64 *pt;
1280
1281 if (!test_bit(slot, &page->slot_bitmap))
1282 continue;
1283
1284 pt = page->spt;
1285 for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
1286 /* avoid RMW */
1287 if (pt[i] & PT_WRITABLE_MASK) {
1288 rmap_remove(&pt[i]);
1289 pt[i] &= ~PT_WRITABLE_MASK;
1290 }
1291 }
1292}
1293
1294void kvm_mmu_zap_all(struct kvm *kvm)
1295{
1296 struct kvm_mmu_page *page, *node;
1297
1298 list_for_each_entry_safe(page, node, &kvm->active_mmu_pages, link)
1299 kvm_mmu_zap_page(kvm, page);
1300
1301 kvm_flush_remote_tlbs(kvm);
1302}
1303
1304void kvm_mmu_module_exit(void)
1305{
1306 if (pte_chain_cache)
1307 kmem_cache_destroy(pte_chain_cache);
1308 if (rmap_desc_cache)
1309 kmem_cache_destroy(rmap_desc_cache);
1310 if (mmu_page_header_cache)
1311 kmem_cache_destroy(mmu_page_header_cache);
1312}
1313
1314int kvm_mmu_module_init(void)
1315{
1316 pte_chain_cache = kmem_cache_create("kvm_pte_chain",
1317 sizeof(struct kvm_pte_chain),
1318 0, 0, NULL);
1319 if (!pte_chain_cache)
1320 goto nomem;
1321 rmap_desc_cache = kmem_cache_create("kvm_rmap_desc",
1322 sizeof(struct kvm_rmap_desc),
1323 0, 0, NULL);
1324 if (!rmap_desc_cache)
1325 goto nomem;
1326
1327 mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header",
1328 sizeof(struct kvm_mmu_page),
1329 0, 0, NULL);
1330 if (!mmu_page_header_cache)
1331 goto nomem;
1332
1333 return 0;
1334
1335nomem:
1336 kvm_mmu_module_exit();
1337 return -ENOMEM;
1338}
1339
1340#ifdef AUDIT
1341
1342static const char *audit_msg;
1343
1344static gva_t canonicalize(gva_t gva)
1345{
1346#ifdef CONFIG_X86_64
1347 gva = (long long)(gva << 16) >> 16;
1348#endif
1349 return gva;
1350}
1351
1352static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
1353 gva_t va, int level)
1354{
1355 u64 *pt = __va(page_pte & PT64_BASE_ADDR_MASK);
1356 int i;
1357 gva_t va_delta = 1ul << (PAGE_SHIFT + 9 * (level - 1));
1358
1359 for (i = 0; i < PT64_ENT_PER_PAGE; ++i, va += va_delta) {
1360 u64 ent = pt[i];
1361
1362 if (!(ent & PT_PRESENT_MASK))
1363 continue;
1364
1365 va = canonicalize(va);
1366 if (level > 1)
1367 audit_mappings_page(vcpu, ent, va, level - 1);
1368 else {
1369 gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, va);
1370 hpa_t hpa = gpa_to_hpa(vcpu, gpa);
1371
1372 if ((ent & PT_PRESENT_MASK)
1373 && (ent & PT64_BASE_ADDR_MASK) != hpa)
1374 printk(KERN_ERR "audit error: (%s) levels %d"
1375 " gva %lx gpa %llx hpa %llx ent %llx\n",
1376 audit_msg, vcpu->mmu.root_level,
1377 va, gpa, hpa, ent);
1378 }
1379 }
1380}
1381
1382static void audit_mappings(struct kvm_vcpu *vcpu)
1383{
1384 unsigned i;
1385
1386 if (vcpu->mmu.root_level == 4)
1387 audit_mappings_page(vcpu, vcpu->mmu.root_hpa, 0, 4);
1388 else
1389 for (i = 0; i < 4; ++i)
1390 if (vcpu->mmu.pae_root[i] & PT_PRESENT_MASK)
1391 audit_mappings_page(vcpu,
1392 vcpu->mmu.pae_root[i],
1393 i << 30,
1394 2);
1395}
1396
1397static int count_rmaps(struct kvm_vcpu *vcpu)
1398{
1399 int nmaps = 0;
1400 int i, j, k;
1401
1402 for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
1403 struct kvm_memory_slot *m = &vcpu->kvm->memslots[i];
1404 struct kvm_rmap_desc *d;
1405
1406 for (j = 0; j < m->npages; ++j) {
1407 struct page *page = m->phys_mem[j];
1408
1409 if (!page->private)
1410 continue;
1411 if (!(page->private & 1)) {
1412 ++nmaps;
1413 continue;
1414 }
1415 d = (struct kvm_rmap_desc *)(page->private & ~1ul);
1416 while (d) {
1417 for (k = 0; k < RMAP_EXT; ++k)
1418 if (d->shadow_ptes[k])
1419 ++nmaps;
1420 else
1421 break;
1422 d = d->more;
1423 }
1424 }
1425 }
1426 return nmaps;
1427}
1428
1429static int count_writable_mappings(struct kvm_vcpu *vcpu)
1430{
1431 int nmaps = 0;
1432 struct kvm_mmu_page *page;
1433 int i;
1434
1435 list_for_each_entry(page, &vcpu->kvm->active_mmu_pages, link) {
1436 u64 *pt = page->spt;
1437
1438 if (page->role.level != PT_PAGE_TABLE_LEVEL)
1439 continue;
1440
1441 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
1442 u64 ent = pt[i];
1443
1444 if (!(ent & PT_PRESENT_MASK))
1445 continue;
1446 if (!(ent & PT_WRITABLE_MASK))
1447 continue;
1448 ++nmaps;
1449 }
1450 }
1451 return nmaps;
1452}
1453
1454static void audit_rmap(struct kvm_vcpu *vcpu)
1455{
1456 int n_rmap = count_rmaps(vcpu);
1457 int n_actual = count_writable_mappings(vcpu);
1458
1459 if (n_rmap != n_actual)
1460 printk(KERN_ERR "%s: (%s) rmap %d actual %d\n",
1461 __FUNCTION__, audit_msg, n_rmap, n_actual);
1462}
1463
1464static void audit_write_protection(struct kvm_vcpu *vcpu)
1465{
1466 struct kvm_mmu_page *page;
1467
1468 list_for_each_entry(page, &vcpu->kvm->active_mmu_pages, link) {
1469 hfn_t hfn;
1470 struct page *pg;
1471
1472 if (page->role.metaphysical)
1473 continue;
1474
1475 hfn = gpa_to_hpa(vcpu, (gpa_t)page->gfn << PAGE_SHIFT)
1476 >> PAGE_SHIFT;
1477 pg = pfn_to_page(hfn);
1478 if (pg->private)
1479 printk(KERN_ERR "%s: (%s) shadow page has writable"
1480 " mappings: gfn %lx role %x\n",
1481 __FUNCTION__, audit_msg, page->gfn,
1482 page->role.word);
1483 }
1484}
1485
1486static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg)
1487{
1488 int olddbg = dbg;
1489
1490 dbg = 0;
1491 audit_msg = msg;
1492 audit_rmap(vcpu);
1493 audit_write_protection(vcpu);
1494 audit_mappings(vcpu);
1495 dbg = olddbg;
1496}
1497
1498#endif
diff --git a/drivers/kvm/paging_tmpl.h b/drivers/kvm/paging_tmpl.h
deleted file mode 100644
index 6b094b44f8fb..000000000000
--- a/drivers/kvm/paging_tmpl.h
+++ /dev/null
@@ -1,511 +0,0 @@
1/*
2 * Kernel-based Virtual Machine driver for Linux
3 *
4 * This module enables machines with Intel VT-x extensions to run virtual
5 * machines without emulation or binary translation.
6 *
7 * MMU support
8 *
9 * Copyright (C) 2006 Qumranet, Inc.
10 *
11 * Authors:
12 * Yaniv Kamay <yaniv@qumranet.com>
13 * Avi Kivity <avi@qumranet.com>
14 *
15 * This work is licensed under the terms of the GNU GPL, version 2. See
16 * the COPYING file in the top-level directory.
17 *
18 */
19
20/*
21 * We need the mmu code to access both 32-bit and 64-bit guest ptes,
22 * so the code in this file is compiled twice, once per pte size.
23 */
24
25#if PTTYPE == 64
26 #define pt_element_t u64
27 #define guest_walker guest_walker64
28 #define FNAME(name) paging##64_##name
29 #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK
30 #define PT_DIR_BASE_ADDR_MASK PT64_DIR_BASE_ADDR_MASK
31 #define PT_INDEX(addr, level) PT64_INDEX(addr, level)
32 #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
33 #define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level)
34 #ifdef CONFIG_X86_64
35 #define PT_MAX_FULL_LEVELS 4
36 #else
37 #define PT_MAX_FULL_LEVELS 2
38 #endif
39#elif PTTYPE == 32
40 #define pt_element_t u32
41 #define guest_walker guest_walker32
42 #define FNAME(name) paging##32_##name
43 #define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK
44 #define PT_DIR_BASE_ADDR_MASK PT32_DIR_BASE_ADDR_MASK
45 #define PT_INDEX(addr, level) PT32_INDEX(addr, level)
46 #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
47 #define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level)
48 #define PT_MAX_FULL_LEVELS 2
49#else
50 #error Invalid PTTYPE value
51#endif
52
53/*
54 * The guest_walker structure emulates the behavior of the hardware page
55 * table walker.
56 */
57struct guest_walker {
58 int level;
59 gfn_t table_gfn[PT_MAX_FULL_LEVELS];
60 pt_element_t *table;
61 pt_element_t pte;
62 pt_element_t *ptep;
63 struct page *page;
64 int index;
65 pt_element_t inherited_ar;
66 gfn_t gfn;
67 u32 error_code;
68};
69
70/*
71 * Fetch a guest pte for a guest virtual address
72 */
73static int FNAME(walk_addr)(struct guest_walker *walker,
74 struct kvm_vcpu *vcpu, gva_t addr,
75 int write_fault, int user_fault, int fetch_fault)
76{
77 hpa_t hpa;
78 struct kvm_memory_slot *slot;
79 pt_element_t *ptep;
80 pt_element_t root;
81 gfn_t table_gfn;
82
83 pgprintk("%s: addr %lx\n", __FUNCTION__, addr);
84 walker->level = vcpu->mmu.root_level;
85 walker->table = NULL;
86 walker->page = NULL;
87 walker->ptep = NULL;
88 root = vcpu->cr3;
89#if PTTYPE == 64
90 if (!is_long_mode(vcpu)) {
91 walker->ptep = &vcpu->pdptrs[(addr >> 30) & 3];
92 root = *walker->ptep;
93 walker->pte = root;
94 if (!(root & PT_PRESENT_MASK))
95 goto not_present;
96 --walker->level;
97 }
98#endif
99 table_gfn = (root & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
100 walker->table_gfn[walker->level - 1] = table_gfn;
101 pgprintk("%s: table_gfn[%d] %lx\n", __FUNCTION__,
102 walker->level - 1, table_gfn);
103 slot = gfn_to_memslot(vcpu->kvm, table_gfn);
104 hpa = safe_gpa_to_hpa(vcpu, root & PT64_BASE_ADDR_MASK);
105 walker->page = pfn_to_page(hpa >> PAGE_SHIFT);
106 walker->table = kmap_atomic(walker->page, KM_USER0);
107
108 ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) ||
109 (vcpu->cr3 & CR3_NONPAE_RESERVED_BITS) == 0);
110
111 walker->inherited_ar = PT_USER_MASK | PT_WRITABLE_MASK;
112
113 for (;;) {
114 int index = PT_INDEX(addr, walker->level);
115 hpa_t paddr;
116
117 ptep = &walker->table[index];
118 walker->index = index;
119 ASSERT(((unsigned long)walker->table & PAGE_MASK) ==
120 ((unsigned long)ptep & PAGE_MASK));
121
122 if (!is_present_pte(*ptep))
123 goto not_present;
124
125 if (write_fault && !is_writeble_pte(*ptep))
126 if (user_fault || is_write_protection(vcpu))
127 goto access_error;
128
129 if (user_fault && !(*ptep & PT_USER_MASK))
130 goto access_error;
131
132#if PTTYPE == 64
133 if (fetch_fault && is_nx(vcpu) && (*ptep & PT64_NX_MASK))
134 goto access_error;
135#endif
136
137 if (!(*ptep & PT_ACCESSED_MASK)) {
138 mark_page_dirty(vcpu->kvm, table_gfn);
139 *ptep |= PT_ACCESSED_MASK;
140 }
141
142 if (walker->level == PT_PAGE_TABLE_LEVEL) {
143 walker->gfn = (*ptep & PT_BASE_ADDR_MASK)
144 >> PAGE_SHIFT;
145 break;
146 }
147
148 if (walker->level == PT_DIRECTORY_LEVEL
149 && (*ptep & PT_PAGE_SIZE_MASK)
150 && (PTTYPE == 64 || is_pse(vcpu))) {
151 walker->gfn = (*ptep & PT_DIR_BASE_ADDR_MASK)
152 >> PAGE_SHIFT;
153 walker->gfn += PT_INDEX(addr, PT_PAGE_TABLE_LEVEL);
154 break;
155 }
156
157 walker->inherited_ar &= walker->table[index];
158 table_gfn = (*ptep & PT_BASE_ADDR_MASK) >> PAGE_SHIFT;
159 kunmap_atomic(walker->table, KM_USER0);
160 paddr = safe_gpa_to_hpa(vcpu, table_gfn << PAGE_SHIFT);
161 walker->page = pfn_to_page(paddr >> PAGE_SHIFT);
162 walker->table = kmap_atomic(walker->page, KM_USER0);
163 --walker->level;
164		walker->table_gfn[walker->level - 1] = table_gfn;
165 pgprintk("%s: table_gfn[%d] %lx\n", __FUNCTION__,
166 walker->level - 1, table_gfn);
167 }
168 walker->pte = *ptep;
169 if (walker->page)
170 walker->ptep = NULL;
171 if (walker->table)
172 kunmap_atomic(walker->table, KM_USER0);
173 pgprintk("%s: pte %llx\n", __FUNCTION__, (u64)*ptep);
174 return 1;
175
176not_present:
177 walker->error_code = 0;
178 goto err;
179
180access_error:
181 walker->error_code = PFERR_PRESENT_MASK;
182
183err:
184 if (write_fault)
185 walker->error_code |= PFERR_WRITE_MASK;
186 if (user_fault)
187 walker->error_code |= PFERR_USER_MASK;
188 if (fetch_fault)
189 walker->error_code |= PFERR_FETCH_MASK;
190 if (walker->table)
191 kunmap_atomic(walker->table, KM_USER0);
192 return 0;
193}
194
195static void FNAME(mark_pagetable_dirty)(struct kvm *kvm,
196 struct guest_walker *walker)
197{
198 mark_page_dirty(kvm, walker->table_gfn[walker->level - 1]);
199}
200
201static void FNAME(set_pte_common)(struct kvm_vcpu *vcpu,
202 u64 *shadow_pte,
203 gpa_t gaddr,
204 pt_element_t gpte,
205 u64 access_bits,
206 int user_fault,
207 int write_fault,
208 int *ptwrite,
209 struct guest_walker *walker,
210 gfn_t gfn)
211{
212 hpa_t paddr;
213 int dirty = gpte & PT_DIRTY_MASK;
214 u64 spte = *shadow_pte;
215 int was_rmapped = is_rmap_pte(spte);
216
217 pgprintk("%s: spte %llx gpte %llx access %llx write_fault %d"
218 " user_fault %d gfn %lx\n",
219 __FUNCTION__, spte, (u64)gpte, access_bits,
220 write_fault, user_fault, gfn);
221
222 if (write_fault && !dirty) {
223 pt_element_t *guest_ent, *tmp = NULL;
224
225 if (walker->ptep)
226 guest_ent = walker->ptep;
227 else {
228 tmp = kmap_atomic(walker->page, KM_USER0);
229 guest_ent = &tmp[walker->index];
230 }
231
232 *guest_ent |= PT_DIRTY_MASK;
233 if (!walker->ptep)
234 kunmap_atomic(tmp, KM_USER0);
235 dirty = 1;
236 FNAME(mark_pagetable_dirty)(vcpu->kvm, walker);
237 }
238
239 spte |= PT_PRESENT_MASK | PT_ACCESSED_MASK | PT_DIRTY_MASK;
240 spte |= gpte & PT64_NX_MASK;
241 if (!dirty)
242 access_bits &= ~PT_WRITABLE_MASK;
243
244 paddr = gpa_to_hpa(vcpu, gaddr & PT64_BASE_ADDR_MASK);
245
246 spte |= PT_PRESENT_MASK;
247 if (access_bits & PT_USER_MASK)
248 spte |= PT_USER_MASK;
249
250 if (is_error_hpa(paddr)) {
251 spte |= gaddr;
252 spte |= PT_SHADOW_IO_MARK;
253 spte &= ~PT_PRESENT_MASK;
254 set_shadow_pte(shadow_pte, spte);
255 return;
256 }
257
258 spte |= paddr;
259
260 if ((access_bits & PT_WRITABLE_MASK)
261 || (write_fault && !is_write_protection(vcpu) && !user_fault)) {
262 struct kvm_mmu_page *shadow;
263
264 spte |= PT_WRITABLE_MASK;
265 if (user_fault) {
266 mmu_unshadow(vcpu, gfn);
267 goto unshadowed;
268 }
269
270 shadow = kvm_mmu_lookup_page(vcpu, gfn);
271 if (shadow) {
272 pgprintk("%s: found shadow page for %lx, marking ro\n",
273 __FUNCTION__, gfn);
274 access_bits &= ~PT_WRITABLE_MASK;
275 if (is_writeble_pte(spte)) {
276 spte &= ~PT_WRITABLE_MASK;
277 kvm_x86_ops->tlb_flush(vcpu);
278 }
279 if (write_fault)
280 *ptwrite = 1;
281 }
282 }
283
284unshadowed:
285
286 if (access_bits & PT_WRITABLE_MASK)
287 mark_page_dirty(vcpu->kvm, gaddr >> PAGE_SHIFT);
288
289 set_shadow_pte(shadow_pte, spte);
290 page_header_update_slot(vcpu->kvm, shadow_pte, gaddr);
291 if (!was_rmapped)
292 rmap_add(vcpu, shadow_pte);
293}
294
295static void FNAME(set_pte)(struct kvm_vcpu *vcpu, pt_element_t gpte,
296 u64 *shadow_pte, u64 access_bits,
297 int user_fault, int write_fault, int *ptwrite,
298 struct guest_walker *walker, gfn_t gfn)
299{
300 access_bits &= gpte;
301 FNAME(set_pte_common)(vcpu, shadow_pte, gpte & PT_BASE_ADDR_MASK,
302 gpte, access_bits, user_fault, write_fault,
303 ptwrite, walker, gfn);
304}
305
306static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
307 u64 *spte, const void *pte, int bytes)
308{
309 pt_element_t gpte;
310
311 if (bytes < sizeof(pt_element_t))
312 return;
313 gpte = *(const pt_element_t *)pte;
314 if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK))
315 return;
316 pgprintk("%s: gpte %llx spte %p\n", __FUNCTION__, (u64)gpte, spte);
317 FNAME(set_pte)(vcpu, gpte, spte, PT_USER_MASK | PT_WRITABLE_MASK, 0,
318 0, NULL, NULL,
319 (gpte & PT_BASE_ADDR_MASK) >> PAGE_SHIFT);
320}
321
322static void FNAME(set_pde)(struct kvm_vcpu *vcpu, pt_element_t gpde,
323 u64 *shadow_pte, u64 access_bits,
324 int user_fault, int write_fault, int *ptwrite,
325 struct guest_walker *walker, gfn_t gfn)
326{
327 gpa_t gaddr;
328
329 access_bits &= gpde;
330 gaddr = (gpa_t)gfn << PAGE_SHIFT;
331 if (PTTYPE == 32 && is_cpuid_PSE36())
332 gaddr |= (gpde & PT32_DIR_PSE36_MASK) <<
333 (32 - PT32_DIR_PSE36_SHIFT);
334 FNAME(set_pte_common)(vcpu, shadow_pte, gaddr,
335 gpde, access_bits, user_fault, write_fault,
336 ptwrite, walker, gfn);
337}
338
339/*
340 * Fetch a shadow pte for a specific level in the paging hierarchy.
341 */
342static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
343 struct guest_walker *walker,
344 int user_fault, int write_fault, int *ptwrite)
345{
346 hpa_t shadow_addr;
347 int level;
348 u64 *shadow_ent;
349 u64 *prev_shadow_ent = NULL;
350
351 if (!is_present_pte(walker->pte))
352 return NULL;
353
354 shadow_addr = vcpu->mmu.root_hpa;
355 level = vcpu->mmu.shadow_root_level;
356 if (level == PT32E_ROOT_LEVEL) {
357 shadow_addr = vcpu->mmu.pae_root[(addr >> 30) & 3];
358 shadow_addr &= PT64_BASE_ADDR_MASK;
359 --level;
360 }
361
362 for (; ; level--) {
363 u32 index = SHADOW_PT_INDEX(addr, level);
364 struct kvm_mmu_page *shadow_page;
365 u64 shadow_pte;
366 int metaphysical;
367 gfn_t table_gfn;
368 unsigned hugepage_access = 0;
369
370 shadow_ent = ((u64 *)__va(shadow_addr)) + index;
371 if (is_present_pte(*shadow_ent) || is_io_pte(*shadow_ent)) {
372 if (level == PT_PAGE_TABLE_LEVEL)
373 break;
374 shadow_addr = *shadow_ent & PT64_BASE_ADDR_MASK;
375 prev_shadow_ent = shadow_ent;
376 continue;
377 }
378
379 if (level == PT_PAGE_TABLE_LEVEL)
380 break;
381
382 if (level - 1 == PT_PAGE_TABLE_LEVEL
383 && walker->level == PT_DIRECTORY_LEVEL) {
384 metaphysical = 1;
385 hugepage_access = walker->pte;
386 hugepage_access &= PT_USER_MASK | PT_WRITABLE_MASK;
387 if (walker->pte & PT64_NX_MASK)
388 hugepage_access |= (1 << 2);
389 hugepage_access >>= PT_WRITABLE_SHIFT;
390 table_gfn = (walker->pte & PT_BASE_ADDR_MASK)
391 >> PAGE_SHIFT;
392 } else {
393 metaphysical = 0;
394 table_gfn = walker->table_gfn[level - 2];
395 }
396 shadow_page = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1,
397 metaphysical, hugepage_access,
398 shadow_ent);
399 shadow_addr = __pa(shadow_page->spt);
400 shadow_pte = shadow_addr | PT_PRESENT_MASK | PT_ACCESSED_MASK
401 | PT_WRITABLE_MASK | PT_USER_MASK;
402 *shadow_ent = shadow_pte;
403 prev_shadow_ent = shadow_ent;
404 }
405
406 if (walker->level == PT_DIRECTORY_LEVEL) {
407 FNAME(set_pde)(vcpu, walker->pte, shadow_ent,
408 walker->inherited_ar, user_fault, write_fault,
409 ptwrite, walker, walker->gfn);
410 } else {
411 ASSERT(walker->level == PT_PAGE_TABLE_LEVEL);
412 FNAME(set_pte)(vcpu, walker->pte, shadow_ent,
413 walker->inherited_ar, user_fault, write_fault,
414 ptwrite, walker, walker->gfn);
415 }
416 return shadow_ent;
417}
418
419/*
420 * Page fault handler. There are several causes for a page fault:
421 * - there is no shadow pte for the guest pte
422 * - write access through a shadow pte marked read only so that we can set
423 * the dirty bit
424 * - write access to a shadow pte marked read only so we can update the page
425 * dirty bitmap, when userspace requests it
426 * - mmio access; in this case we will never install a present shadow pte
427 * - normal guest page fault due to the guest pte marked not present, not
428 * writable, or not executable
429 *
430 * Returns: 1 if we need to emulate the instruction, 0 otherwise, or
431 * a negative value on error.
432 */
433static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
434 u32 error_code)
435{
436 int write_fault = error_code & PFERR_WRITE_MASK;
437 int user_fault = error_code & PFERR_USER_MASK;
438 int fetch_fault = error_code & PFERR_FETCH_MASK;
439 struct guest_walker walker;
440 u64 *shadow_pte;
441 int write_pt = 0;
442 int r;
443
444 pgprintk("%s: addr %lx err %x\n", __FUNCTION__, addr, error_code);
445 kvm_mmu_audit(vcpu, "pre page fault");
446
447 r = mmu_topup_memory_caches(vcpu);
448 if (r)
449 return r;
450
451 /*
452 * Look up the shadow pte for the faulting address.
453 */
454 r = FNAME(walk_addr)(&walker, vcpu, addr, write_fault, user_fault,
455 fetch_fault);
456
457 /*
458 * The page is not mapped by the guest. Let the guest handle it.
459 */
460 if (!r) {
461 pgprintk("%s: guest page fault\n", __FUNCTION__);
462 inject_page_fault(vcpu, addr, walker.error_code);
463 vcpu->last_pt_write_count = 0; /* reset fork detector */
464 return 0;
465 }
466
467 shadow_pte = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
468 &write_pt);
469 pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __FUNCTION__,
470 shadow_pte, *shadow_pte, write_pt);
471
472 if (!write_pt)
473 vcpu->last_pt_write_count = 0; /* reset fork detector */
474
475 /*
476	 * mmio: emulate if accessible, otherwise it's a guest fault.
477 */
478 if (is_io_pte(*shadow_pte))
479 return 1;
480
481 ++vcpu->stat.pf_fixed;
482 kvm_mmu_audit(vcpu, "post page fault (fixed)");
483
484 return write_pt;
485}
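The "Returns:" contract documented above is what the generic fault path keys off. A minimal illustrative caller is sketched below; handle_shadow_fault and kvm_emulate_faulting_insn are hypothetical names used only for the sketch, while vcpu->mmu.page_fault is the real function pointer this handler is installed into:

static int handle_shadow_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
{
	int r = vcpu->mmu.page_fault(vcpu, cr2, error_code);

	if (r < 0)
		return r;	/* e.g. -ENOMEM from mmu_topup_memory_caches() */
	if (r == 0)
		return 0;	/* fault fixed, or reflected back into the guest */
	/* r == 1: mmio or a write to a shadowed page table -- emulate it */
	return kvm_emulate_faulting_insn(vcpu, cr2, error_code);	/* hypothetical */
}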
486
487static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr)
488{
489 struct guest_walker walker;
490 gpa_t gpa = UNMAPPED_GVA;
491 int r;
492
493 r = FNAME(walk_addr)(&walker, vcpu, vaddr, 0, 0, 0);
494
495 if (r) {
496 gpa = (gpa_t)walker.gfn << PAGE_SHIFT;
497 gpa |= vaddr & ~PAGE_MASK;
498 }
499
500 return gpa;
501}
502
503#undef pt_element_t
504#undef guest_walker
505#undef FNAME
506#undef PT_BASE_ADDR_MASK
507#undef PT_INDEX
508#undef SHADOW_PT_INDEX
509#undef PT_LEVEL_MASK
510#undef PT_DIR_BASE_ADDR_MASK
511#undef PT_MAX_FULL_LEVELS
diff --git a/drivers/kvm/x86_emulate.c b/drivers/kvm/x86_emulate.c
deleted file mode 100644
index bd46de6bf891..000000000000
--- a/drivers/kvm/x86_emulate.c
+++ /dev/null
@@ -1,1662 +0,0 @@
1/******************************************************************************
2 * x86_emulate.c
3 *
4 * Generic x86 (32-bit and 64-bit) instruction decoder and emulator.
5 *
6 * Copyright (c) 2005 Keir Fraser
7 *
8 * Linux coding style, mod r/m decoder, segment base fixes, real-mode
9 * privileged instructions:
10 *
11 * Copyright (C) 2006 Qumranet
12 *
13 * Avi Kivity <avi@qumranet.com>
14 * Yaniv Kamay <yaniv@qumranet.com>
15 *
16 * This work is licensed under the terms of the GNU GPL, version 2. See
17 * the COPYING file in the top-level directory.
18 *
19 * From: xen-unstable 10676:af9809f51f81a3c43f276f00c81a52ef558afda4
20 */
21
22#ifndef __KERNEL__
23#include <stdio.h>
24#include <stdint.h>
25#include <public/xen.h>
26#define DPRINTF(_f, _a ...) printf( _f , ## _a )
27#else
28#include "kvm.h"
29#define DPRINTF(x...) do {} while (0)
30#endif
31#include "x86_emulate.h"
32#include <linux/module.h>
33
34/*
35 * Opcode effective-address decode tables.
36 * Note that we only emulate instructions that have at least one memory
37 * operand (excluding implicit stack references). We assume that stack
38 * references and instruction fetches will never occur in special memory
39 * areas that require emulation. So, for example, 'mov <imm>,<reg>' need
40 * not be handled.
41 */
42
43/* Operand sizes: 8-bit operands or specified/overridden size. */
44#define ByteOp (1<<0) /* 8-bit operands. */
45/* Destination operand type. */
46#define ImplicitOps (1<<1) /* Implicit in opcode. No generic decode. */
47#define DstReg (2<<1) /* Register operand. */
48#define DstMem (3<<1) /* Memory operand. */
49#define DstMask (3<<1)
50/* Source operand type. */
51#define SrcNone (0<<3) /* No source operand. */
52#define SrcImplicit (0<<3) /* Source operand is implicit in the opcode. */
53#define SrcReg (1<<3) /* Register operand. */
54#define SrcMem (2<<3) /* Memory operand. */
55#define SrcMem16 (3<<3) /* Memory operand (16-bit). */
56#define SrcMem32 (4<<3) /* Memory operand (32-bit). */
57#define SrcImm (5<<3) /* Immediate operand. */
58#define SrcImmByte (6<<3) /* 8-bit sign-extended immediate operand. */
59#define SrcMask (7<<3)
60/* Generic ModRM decode. */
61#define ModRM (1<<6)
62/* Destination is only written; never read. */
63#define Mov (1<<7)
64#define BitOp (1<<8)
65
66static u8 opcode_table[256] = {
67 /* 0x00 - 0x07 */
68 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
69 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
70 0, 0, 0, 0,
71 /* 0x08 - 0x0F */
72 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
73 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
74 0, 0, 0, 0,
75 /* 0x10 - 0x17 */
76 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
77 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
78 0, 0, 0, 0,
79 /* 0x18 - 0x1F */
80 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
81 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
82 0, 0, 0, 0,
83 /* 0x20 - 0x27 */
84 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
85 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
86 SrcImmByte, SrcImm, 0, 0,
87 /* 0x28 - 0x2F */
88 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
89 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
90 0, 0, 0, 0,
91 /* 0x30 - 0x37 */
92 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
93 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
94 0, 0, 0, 0,
95 /* 0x38 - 0x3F */
96 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
97 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
98 0, 0, 0, 0,
99 /* 0x40 - 0x4F */
100 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
101 /* 0x50 - 0x57 */
102 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
103 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
104 /* 0x58 - 0x5F */
105 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
106 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
107 /* 0x60 - 0x67 */
108 0, 0, 0, DstReg | SrcMem32 | ModRM | Mov /* movsxd (x86/64) */ ,
109 0, 0, 0, 0,
110 /* 0x68 - 0x6F */
111 0, 0, ImplicitOps|Mov, 0,
112 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* insb, insw/insd */
113 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* outsb, outsw/outsd */
114 /* 0x70 - 0x77 */
115 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
116 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
117 /* 0x78 - 0x7F */
118 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
119 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
120 /* 0x80 - 0x87 */
121 ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM,
122 ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM,
123 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
124 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
125 /* 0x88 - 0x8F */
126 ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov,
127 ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
128 0, ModRM | DstReg, 0, DstMem | SrcNone | ModRM | Mov,
129 /* 0x90 - 0x9F */
130 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps, ImplicitOps, 0, 0,
131 /* 0xA0 - 0xA7 */
132 ByteOp | DstReg | SrcMem | Mov, DstReg | SrcMem | Mov,
133 ByteOp | DstMem | SrcReg | Mov, DstMem | SrcReg | Mov,
134 ByteOp | ImplicitOps | Mov, ImplicitOps | Mov,
135 ByteOp | ImplicitOps, ImplicitOps,
136 /* 0xA8 - 0xAF */
137 0, 0, ByteOp | ImplicitOps | Mov, ImplicitOps | Mov,
138 ByteOp | ImplicitOps | Mov, ImplicitOps | Mov,
139 ByteOp | ImplicitOps, ImplicitOps,
140 /* 0xB0 - 0xBF */
141 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
142 /* 0xC0 - 0xC7 */
143 ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM,
144 0, ImplicitOps, 0, 0,
145 ByteOp | DstMem | SrcImm | ModRM | Mov, DstMem | SrcImm | ModRM | Mov,
146 /* 0xC8 - 0xCF */
147 0, 0, 0, 0, 0, 0, 0, 0,
148 /* 0xD0 - 0xD7 */
149 ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM,
150 ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM,
151 0, 0, 0, 0,
152 /* 0xD8 - 0xDF */
153 0, 0, 0, 0, 0, 0, 0, 0,
154 /* 0xE0 - 0xE7 */
155 0, 0, 0, 0, 0, 0, 0, 0,
156 /* 0xE8 - 0xEF */
157 ImplicitOps, SrcImm|ImplicitOps, 0, SrcImmByte|ImplicitOps, 0, 0, 0, 0,
158 /* 0xF0 - 0xF7 */
159 0, 0, 0, 0,
160 ImplicitOps, 0,
161 ByteOp | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM,
162 /* 0xF8 - 0xFF */
163 0, 0, 0, 0,
164 0, 0, ByteOp | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM
165};
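As a hedged illustration of how the flag encoding above is meant to be consumed (describe_decode_bits is an invented name, not part of the emulator), an entry can be unpacked like this:

static void describe_decode_bits(u16 d)
{
	/* Destination and source operand classes are packed bit fields. */
	unsigned dst = d & DstMask;		/* ImplicitOps, DstReg or DstMem */
	unsigned src = d & SrcMask;		/* SrcNone ... SrcImmByte */

	DPRINTF("byteop=%d modrm=%d mov=%d dst=%#x src=%#x\n",
		!!(d & ByteOp), !!(d & ModRM), !!(d & Mov), dst, src);
}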
166
167static u16 twobyte_table[256] = {
168 /* 0x00 - 0x0F */
169 0, SrcMem | ModRM | DstReg, 0, 0, 0, 0, ImplicitOps, 0,
170 ImplicitOps, ImplicitOps, 0, 0, 0, ImplicitOps | ModRM, 0, 0,
171 /* 0x10 - 0x1F */
172 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, 0,
173 /* 0x20 - 0x2F */
174 ModRM | ImplicitOps, ModRM, ModRM | ImplicitOps, ModRM, 0, 0, 0, 0,
175 0, 0, 0, 0, 0, 0, 0, 0,
176 /* 0x30 - 0x3F */
177 ImplicitOps, 0, ImplicitOps, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
178 /* 0x40 - 0x47 */
179 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
180 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
181 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
182 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
183 /* 0x48 - 0x4F */
184 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
185 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
186 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
187 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
188 /* 0x50 - 0x5F */
189 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
190 /* 0x60 - 0x6F */
191 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
192 /* 0x70 - 0x7F */
193 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
194 /* 0x80 - 0x8F */
195 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
196 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
197 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
198 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
199 /* 0x90 - 0x9F */
200 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
201 /* 0xA0 - 0xA7 */
202 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 0, 0, 0, 0,
203 /* 0xA8 - 0xAF */
204 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 0, 0, 0, 0,
205 /* 0xB0 - 0xB7 */
206 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 0,
207 DstMem | SrcReg | ModRM | BitOp,
208 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov,
209 DstReg | SrcMem16 | ModRM | Mov,
210 /* 0xB8 - 0xBF */
211 0, 0, DstMem | SrcImmByte | ModRM, DstMem | SrcReg | ModRM | BitOp,
212 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov,
213 DstReg | SrcMem16 | ModRM | Mov,
214 /* 0xC0 - 0xCF */
215 0, 0, 0, DstMem | SrcReg | ModRM | Mov, 0, 0, 0, ImplicitOps | ModRM,
216 0, 0, 0, 0, 0, 0, 0, 0,
217 /* 0xD0 - 0xDF */
218 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
219 /* 0xE0 - 0xEF */
220 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
221 /* 0xF0 - 0xFF */
222 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
223};
224
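/*
 * Worked lookup (illustration only, not part of the patch): the two-byte
 * opcode 0f b6 (movzx r16/32, r/m8) indexes twobyte_table[0xb6] above and
 * picks up ByteOp | DstReg | SrcMem | ModRM | Mov, i.e. fetch a ModRM byte,
 * read a one-byte source, and write the full-width destination without
 * reading it first (the Mov attribute suppresses the destination read in
 * the decode code further down).
 */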
225/* Type, address-of, and value of an instruction's operand. */
226struct operand {
227 enum { OP_REG, OP_MEM, OP_IMM } type;
228 unsigned int bytes;
229 unsigned long val, orig_val, *ptr;
230};
231
232/* EFLAGS bit definitions. */
233#define EFLG_OF (1<<11)
234#define EFLG_DF (1<<10)
235#define EFLG_SF (1<<7)
236#define EFLG_ZF (1<<6)
237#define EFLG_AF (1<<4)
238#define EFLG_PF (1<<2)
239#define EFLG_CF (1<<0)
240
241/*
242 * Instruction emulation:
243 * Most instructions are emulated directly via a fragment of inline assembly
244 * code. This allows us to save/restore EFLAGS and thus very easily pick up
245 * any modified flags.
246 */
247
248#if defined(CONFIG_X86_64)
249#define _LO32 "k" /* force 32-bit operand */
250#define _STK "%%rsp" /* stack pointer */
251#elif defined(__i386__)
252#define _LO32 "" /* force 32-bit operand */
253#define _STK "%%esp" /* stack pointer */
254#endif
255
256/*
257 * These EFLAGS bits are restored from saved value during emulation, and
258 * any changes are written back to the saved value after emulation.
259 */
260#define EFLAGS_MASK (EFLG_OF|EFLG_SF|EFLG_ZF|EFLG_AF|EFLG_PF|EFLG_CF)
261
262/* Before executing instruction: restore necessary bits in EFLAGS. */
263#define _PRE_EFLAGS(_sav, _msk, _tmp) \
264 /* EFLAGS = (_sav & _msk) | (EFLAGS & ~_msk); */ \
265 "push %"_sav"; " \
266 "movl %"_msk",%"_LO32 _tmp"; " \
267 "andl %"_LO32 _tmp",("_STK"); " \
268 "pushf; " \
269 "notl %"_LO32 _tmp"; " \
270 "andl %"_LO32 _tmp",("_STK"); " \
271 "pop %"_tmp"; " \
272 "orl %"_LO32 _tmp",("_STK"); " \
273 "popf; " \
274 /* _sav &= ~msk; */ \
275 "movl %"_msk",%"_LO32 _tmp"; " \
276 "notl %"_LO32 _tmp"; " \
277 "andl %"_LO32 _tmp",%"_sav"; "
278
279/* After executing instruction: write-back necessary bits in EFLAGS. */
280#define _POST_EFLAGS(_sav, _msk, _tmp) \
281 /* _sav |= EFLAGS & _msk; */ \
282 "pushf; " \
283 "pop %"_tmp"; " \
284 "andl %"_msk",%"_LO32 _tmp"; " \
285 "orl %"_LO32 _tmp",%"_sav"; "
286
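/*
 * The save/execute/harvest idea above, written out by hand as a user-space
 * sketch (illustration only; assumes an x86-64 or i386 host with GCC inline
 * asm, and shows just the execute-and-harvest half, without the flag
 * restore that _PRE_EFLAGS performs).
 */
static inline unsigned long sketch_add32(unsigned int *dst, unsigned int src)
{
	unsigned long flags;

	__asm__ __volatile__ (
		"addl %2, %1; pushf; pop %0"
		: "=r" (flags), "+m" (*dst)
		: "r" (src)
		: "cc");
	return flags;	/* caller masks with EFLAGS_MASK, as _POST_EFLAGS does */
}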
287/* Raw emulation: instruction has two explicit operands. */
288#define __emulate_2op_nobyte(_op,_src,_dst,_eflags,_wx,_wy,_lx,_ly,_qx,_qy) \
289 do { \
290 unsigned long _tmp; \
291 \
292 switch ((_dst).bytes) { \
293 case 2: \
294 __asm__ __volatile__ ( \
295 _PRE_EFLAGS("0","4","2") \
296 _op"w %"_wx"3,%1; " \
297 _POST_EFLAGS("0","4","2") \
298 : "=m" (_eflags), "=m" ((_dst).val), \
299 "=&r" (_tmp) \
300 : _wy ((_src).val), "i" (EFLAGS_MASK) ); \
301 break; \
302 case 4: \
303 __asm__ __volatile__ ( \
304 _PRE_EFLAGS("0","4","2") \
305 _op"l %"_lx"3,%1; " \
306 _POST_EFLAGS("0","4","2") \
307 : "=m" (_eflags), "=m" ((_dst).val), \
308 "=&r" (_tmp) \
309 : _ly ((_src).val), "i" (EFLAGS_MASK) ); \
310 break; \
311 case 8: \
312 __emulate_2op_8byte(_op, _src, _dst, \
313 _eflags, _qx, _qy); \
314 break; \
315 } \
316 } while (0)
317
318#define __emulate_2op(_op,_src,_dst,_eflags,_bx,_by,_wx,_wy,_lx,_ly,_qx,_qy) \
319 do { \
320 unsigned long _tmp; \
321 switch ( (_dst).bytes ) \
322 { \
323 case 1: \
324 __asm__ __volatile__ ( \
325 _PRE_EFLAGS("0","4","2") \
326 _op"b %"_bx"3,%1; " \
327 _POST_EFLAGS("0","4","2") \
328 : "=m" (_eflags), "=m" ((_dst).val), \
329 "=&r" (_tmp) \
330 : _by ((_src).val), "i" (EFLAGS_MASK) ); \
331 break; \
332 default: \
333 __emulate_2op_nobyte(_op, _src, _dst, _eflags, \
334 _wx, _wy, _lx, _ly, _qx, _qy); \
335 break; \
336 } \
337 } while (0)
338
339/* Source operand is byte-sized and may be restricted to just %cl. */
340#define emulate_2op_SrcB(_op, _src, _dst, _eflags) \
341 __emulate_2op(_op, _src, _dst, _eflags, \
342 "b", "c", "b", "c", "b", "c", "b", "c")
343
344/* Source operand is byte, word, long or quad sized. */
345#define emulate_2op_SrcV(_op, _src, _dst, _eflags) \
346 __emulate_2op(_op, _src, _dst, _eflags, \
347 "b", "q", "w", "r", _LO32, "r", "", "r")
348
349/* Source operand is word, long or quad sized. */
350#define emulate_2op_SrcV_nobyte(_op, _src, _dst, _eflags) \
351 __emulate_2op_nobyte(_op, _src, _dst, _eflags, \
352 "w", "r", _LO32, "r", "", "r")
353
354/* Instruction has only one explicit operand (no source operand). */
355#define emulate_1op(_op, _dst, _eflags) \
356 do { \
357 unsigned long _tmp; \
358 \
359 switch ( (_dst).bytes ) \
360 { \
361 case 1: \
362 __asm__ __volatile__ ( \
363 _PRE_EFLAGS("0","3","2") \
364 _op"b %1; " \
365 _POST_EFLAGS("0","3","2") \
366 : "=m" (_eflags), "=m" ((_dst).val), \
367 "=&r" (_tmp) \
368 : "i" (EFLAGS_MASK) ); \
369 break; \
370 case 2: \
371 __asm__ __volatile__ ( \
372 _PRE_EFLAGS("0","3","2") \
373 _op"w %1; " \
374 _POST_EFLAGS("0","3","2") \
375 : "=m" (_eflags), "=m" ((_dst).val), \
376 "=&r" (_tmp) \
377 : "i" (EFLAGS_MASK) ); \
378 break; \
379 case 4: \
380 __asm__ __volatile__ ( \
381 _PRE_EFLAGS("0","3","2") \
382 _op"l %1; " \
383 _POST_EFLAGS("0","3","2") \
384 : "=m" (_eflags), "=m" ((_dst).val), \
385 "=&r" (_tmp) \
386 : "i" (EFLAGS_MASK) ); \
387 break; \
388 case 8: \
389 __emulate_1op_8byte(_op, _dst, _eflags); \
390 break; \
391 } \
392 } while (0)
393
394/* Emulate an instruction with quadword operands (x86/64 only). */
395#if defined(CONFIG_X86_64)
396#define __emulate_2op_8byte(_op, _src, _dst, _eflags, _qx, _qy) \
397 do { \
398 __asm__ __volatile__ ( \
399 _PRE_EFLAGS("0","4","2") \
400 _op"q %"_qx"3,%1; " \
401 _POST_EFLAGS("0","4","2") \
402 : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \
403 : _qy ((_src).val), "i" (EFLAGS_MASK) ); \
404 } while (0)
405
406#define __emulate_1op_8byte(_op, _dst, _eflags) \
407 do { \
408 __asm__ __volatile__ ( \
409 _PRE_EFLAGS("0","3","2") \
410 _op"q %1; " \
411 _POST_EFLAGS("0","3","2") \
412 : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \
413 : "i" (EFLAGS_MASK) ); \
414 } while (0)
415
416#elif defined(__i386__)
417#define __emulate_2op_8byte(_op, _src, _dst, _eflags, _qx, _qy)
418#define __emulate_1op_8byte(_op, _dst, _eflags)
419#endif /* __i386__ */
420
421/* Fetch next part of the instruction being emulated. */
422#define insn_fetch(_type, _size, _eip) \
423({ unsigned long _x; \
424 rc = ops->read_std((unsigned long)(_eip) + ctxt->cs_base, &_x, \
425 (_size), ctxt->vcpu); \
426 if ( rc != 0 ) \
427 goto done; \
428 (_eip) += (_size); \
429 (_type)_x; \
430})
431
432/* Access/update address held in a register, based on addressing mode. */
433#define address_mask(reg) \
434 ((ad_bytes == sizeof(unsigned long)) ? \
435 (reg) : ((reg) & ((1UL << (ad_bytes << 3)) - 1)))
436#define register_address(base, reg) \
437 ((base) + address_mask(reg))
438#define register_address_increment(reg, inc) \
439 do { \
440 /* signed type ensures sign extension to long */ \
441 int _inc = (inc); \
442 if ( ad_bytes == sizeof(unsigned long) ) \
443 (reg) += _inc; \
444 else \
445 (reg) = ((reg) & ~((1UL << (ad_bytes << 3)) - 1)) | \
446 (((reg) + _inc) & ((1UL << (ad_bytes << 3)) - 1)); \
447 } while (0)
448
449#define JMP_REL(rel) \
450 do { \
451 register_address_increment(_eip, rel); \
452 } while (0)
453
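/*
 * Illustration only (not part of the patch): with ad_bytes == 2 the update
 * above is confined to the low 16 bits, matching real-mode wrap-around.
 */
static unsigned long wrap16_increment(unsigned long reg, int inc)
{
	/* what register_address_increment() does when ad_bytes == 2 */
	return (reg & ~0xffffUL) | ((reg + inc) & 0xffffUL);
}
/*
 * wrap16_increment(0x0001ffffUL, 1) == 0x00010000UL: the low word wraps,
 * the upper bits of the host-sized register are left untouched.
 */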
454/*
455 * Given the 'reg' portion of a ModRM byte, and a register block, return a
456 * pointer into the block that addresses the relevant register.
457 * @highbyte_regs specifies whether to decode AH,CH,DH,BH.
458 */
459static void *decode_register(u8 modrm_reg, unsigned long *regs,
460 int highbyte_regs)
461{
462 void *p;
463
464 p = &regs[modrm_reg];
465 if (highbyte_regs && modrm_reg >= 4 && modrm_reg < 8)
466 p = (unsigned char *)&regs[modrm_reg & 3] + 1;
467 return p;
468}
469
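/*
 * Usage illustration (assumes a little-endian host, as the byte arithmetic
 * above does): without a REX prefix, byte-register encoding 4 selects AH,
 * i.e. byte 1 of the RAX slot.
 */
static void decode_register_example(void)
{
	unsigned long regs[NR_VCPU_REGS] = { 0 };
	u8 *ah = decode_register(4, regs, 1);

	*ah = 0x12;	/* regs[VCPU_REGS_RAX] now reads 0x1200 */
}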
470static int read_descriptor(struct x86_emulate_ctxt *ctxt,
471 struct x86_emulate_ops *ops,
472 void *ptr,
473 u16 *size, unsigned long *address, int op_bytes)
474{
475 int rc;
476
477 if (op_bytes == 2)
478 op_bytes = 3;
479 *address = 0;
480 rc = ops->read_std((unsigned long)ptr, (unsigned long *)size, 2,
481 ctxt->vcpu);
482 if (rc)
483 return rc;
484 rc = ops->read_std((unsigned long)ptr + 2, address, op_bytes,
485 ctxt->vcpu);
486 return rc;
487}
488
489static int test_cc(unsigned int condition, unsigned int flags)
490{
491 int rc = 0;
492
493 switch ((condition & 15) >> 1) {
494 case 0: /* o */
495 rc |= (flags & EFLG_OF);
496 break;
497 case 1: /* b/c/nae */
498 rc |= (flags & EFLG_CF);
499 break;
500 case 2: /* z/e */
501 rc |= (flags & EFLG_ZF);
502 break;
503 case 3: /* be/na */
504 rc |= (flags & (EFLG_CF|EFLG_ZF));
505 break;
506 case 4: /* s */
507 rc |= (flags & EFLG_SF);
508 break;
509 case 5: /* p/pe */
510 rc |= (flags & EFLG_PF);
511 break;
512 case 7: /* le/ng */
513 rc |= (flags & EFLG_ZF);
514 /* fall through */
515 case 6: /* l/nge */
516 rc |= (!(flags & EFLG_SF) != !(flags & EFLG_OF));
517 break;
518 }
519
520 /* Odd condition identifiers (lsb == 1) have inverted sense. */
521 return (!!rc ^ (condition & 1));
522}
523
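/*
 * Illustration only: the jcc and cmovcc handlers further down pass the raw
 * opcode byte straight to test_cc(), so for instance:
 */
static int is_jz_taken(unsigned int eflags)
{
	return test_cc(0x74, eflags);	/* 0x74 = jz: taken iff ZF is set */
}
/* test_cc(0x75, eflags) is the exact inverse (jnz), via the lsb flip above. */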
524int
525x86_emulate_memop(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
526{
527 unsigned d;
528 u8 b, sib, twobyte = 0, rex_prefix = 0;
529 u8 modrm, modrm_mod = 0, modrm_reg = 0, modrm_rm = 0;
530 unsigned long *override_base = NULL;
531 unsigned int op_bytes, ad_bytes, lock_prefix = 0, rep_prefix = 0, i;
532 int rc = 0;
533 struct operand src, dst;
534 unsigned long cr2 = ctxt->cr2;
535 int mode = ctxt->mode;
536 unsigned long modrm_ea;
537 int use_modrm_ea, index_reg = 0, base_reg = 0, scale, rip_relative = 0;
538 int no_wb = 0;
539 u64 msr_data;
540
541 /* Shadow copy of register state. Committed on successful emulation. */
542 unsigned long _regs[NR_VCPU_REGS];
543 unsigned long _eip = ctxt->vcpu->rip, _eflags = ctxt->eflags;
544 unsigned long modrm_val = 0;
545
546 memcpy(_regs, ctxt->vcpu->regs, sizeof _regs);
547
548 switch (mode) {
549 case X86EMUL_MODE_REAL:
550 case X86EMUL_MODE_PROT16:
551 op_bytes = ad_bytes = 2;
552 break;
553 case X86EMUL_MODE_PROT32:
554 op_bytes = ad_bytes = 4;
555 break;
556#ifdef CONFIG_X86_64
557 case X86EMUL_MODE_PROT64:
558 op_bytes = 4;
559 ad_bytes = 8;
560 break;
561#endif
562 default:
563 return -1;
564 }
565
566 /* Legacy prefixes. */
567 for (i = 0; i < 8; i++) {
568 switch (b = insn_fetch(u8, 1, _eip)) {
569 case 0x66: /* operand-size override */
570 op_bytes ^= 6; /* switch between 2/4 bytes */
571 break;
572 case 0x67: /* address-size override */
573 if (mode == X86EMUL_MODE_PROT64)
574 ad_bytes ^= 12; /* switch between 4/8 bytes */
575 else
576 ad_bytes ^= 6; /* switch between 2/4 bytes */
577 break;
578 case 0x2e: /* CS override */
579 override_base = &ctxt->cs_base;
580 break;
581 case 0x3e: /* DS override */
582 override_base = &ctxt->ds_base;
583 break;
584 case 0x26: /* ES override */
585 override_base = &ctxt->es_base;
586 break;
587 case 0x64: /* FS override */
588 override_base = &ctxt->fs_base;
589 break;
590 case 0x65: /* GS override */
591 override_base = &ctxt->gs_base;
592 break;
593 case 0x36: /* SS override */
594 override_base = &ctxt->ss_base;
595 break;
596 case 0xf0: /* LOCK */
597 lock_prefix = 1;
598 break;
599 case 0xf2: /* REPNE/REPNZ */
600 case 0xf3: /* REP/REPE/REPZ */
601 rep_prefix = 1;
602 break;
603 default:
604 goto done_prefixes;
605 }
606 }
607
608done_prefixes:
609
610 /* REX prefix. */
611 if ((mode == X86EMUL_MODE_PROT64) && ((b & 0xf0) == 0x40)) {
612 rex_prefix = b;
613 if (b & 8)
614 op_bytes = 8; /* REX.W */
615 modrm_reg = (b & 4) << 1; /* REX.R */
616 index_reg = (b & 2) << 2; /* REX.X */
617		modrm_rm = base_reg = (b & 1) << 3; /* REX.B */
618 b = insn_fetch(u8, 1, _eip);
619 }
620
621 /* Opcode byte(s). */
622 d = opcode_table[b];
623 if (d == 0) {
624 /* Two-byte opcode? */
625 if (b == 0x0f) {
626 twobyte = 1;
627 b = insn_fetch(u8, 1, _eip);
628 d = twobyte_table[b];
629 }
630
631 /* Unrecognised? */
632 if (d == 0)
633 goto cannot_emulate;
634 }
635
636 /* ModRM and SIB bytes. */
637 if (d & ModRM) {
638 modrm = insn_fetch(u8, 1, _eip);
639 modrm_mod |= (modrm & 0xc0) >> 6;
640 modrm_reg |= (modrm & 0x38) >> 3;
641 modrm_rm |= (modrm & 0x07);
642 modrm_ea = 0;
643 use_modrm_ea = 1;
644
645 if (modrm_mod == 3) {
646 modrm_val = *(unsigned long *)
647 decode_register(modrm_rm, _regs, d & ByteOp);
648 goto modrm_done;
649 }
650
651 if (ad_bytes == 2) {
652 unsigned bx = _regs[VCPU_REGS_RBX];
653 unsigned bp = _regs[VCPU_REGS_RBP];
654 unsigned si = _regs[VCPU_REGS_RSI];
655 unsigned di = _regs[VCPU_REGS_RDI];
656
657 /* 16-bit ModR/M decode. */
658 switch (modrm_mod) {
659 case 0:
660 if (modrm_rm == 6)
661 modrm_ea += insn_fetch(u16, 2, _eip);
662 break;
663 case 1:
664 modrm_ea += insn_fetch(s8, 1, _eip);
665 break;
666 case 2:
667 modrm_ea += insn_fetch(u16, 2, _eip);
668 break;
669 }
670 switch (modrm_rm) {
671 case 0:
672 modrm_ea += bx + si;
673 break;
674 case 1:
675 modrm_ea += bx + di;
676 break;
677 case 2:
678 modrm_ea += bp + si;
679 break;
680 case 3:
681 modrm_ea += bp + di;
682 break;
683 case 4:
684 modrm_ea += si;
685 break;
686 case 5:
687 modrm_ea += di;
688 break;
689 case 6:
690 if (modrm_mod != 0)
691 modrm_ea += bp;
692 break;
693 case 7:
694 modrm_ea += bx;
695 break;
696 }
697 if (modrm_rm == 2 || modrm_rm == 3 ||
698 (modrm_rm == 6 && modrm_mod != 0))
699 if (!override_base)
700 override_base = &ctxt->ss_base;
701 modrm_ea = (u16)modrm_ea;
702 } else {
703 /* 32/64-bit ModR/M decode. */
704 switch (modrm_rm) {
705 case 4:
706 case 12:
707 sib = insn_fetch(u8, 1, _eip);
708 index_reg |= (sib >> 3) & 7;
709 base_reg |= sib & 7;
710 scale = sib >> 6;
711
712 switch (base_reg) {
713 case 5:
714 if (modrm_mod != 0)
715 modrm_ea += _regs[base_reg];
716 else
717 modrm_ea += insn_fetch(s32, 4, _eip);
718 break;
719 default:
720 modrm_ea += _regs[base_reg];
721 }
722 switch (index_reg) {
723 case 4:
724 break;
725 default:
726 modrm_ea += _regs[index_reg] << scale;
727
728 }
729 break;
730 case 5:
731 if (modrm_mod != 0)
732 modrm_ea += _regs[modrm_rm];
733 else if (mode == X86EMUL_MODE_PROT64)
734 rip_relative = 1;
735 break;
736 default:
737 modrm_ea += _regs[modrm_rm];
738 break;
739 }
740 switch (modrm_mod) {
741 case 0:
742 if (modrm_rm == 5)
743 modrm_ea += insn_fetch(s32, 4, _eip);
744 break;
745 case 1:
746 modrm_ea += insn_fetch(s8, 1, _eip);
747 break;
748 case 2:
749 modrm_ea += insn_fetch(s32, 4, _eip);
750 break;
751 }
752 }
753 if (!override_base)
754 override_base = &ctxt->ds_base;
755 if (mode == X86EMUL_MODE_PROT64 &&
756 override_base != &ctxt->fs_base &&
757 override_base != &ctxt->gs_base)
758 override_base = NULL;
759
760 if (override_base)
761 modrm_ea += *override_base;
762
763 if (rip_relative) {
764 modrm_ea += _eip;
765 switch (d & SrcMask) {
766 case SrcImmByte:
767 modrm_ea += 1;
768 break;
769 case SrcImm:
770 if (d & ByteOp)
771 modrm_ea += 1;
772 else
773 if (op_bytes == 8)
774 modrm_ea += 4;
775 else
776 modrm_ea += op_bytes;
777 }
778 }
779 if (ad_bytes != 8)
780 modrm_ea = (u32)modrm_ea;
781 cr2 = modrm_ea;
782 modrm_done:
783 ;
784 }
785
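	/*
	 * Worked example (illustration only): in 32-bit protected mode the
	 * bytes 8b 44 9e 08 (mov eax, [esi+ebx*4+8]) decode as modrm 0x44
	 * (mod=1, reg=0, rm=4 -> SIB byte plus disp8) and sib 0x9e (scale=2,
	 * index=EBX, base=ESI), so the code above arrives at
	 * modrm_ea = ESI + (EBX << 2) + 8, adds the (default DS) segment
	 * base, and records the result in cr2.
	 */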
786 /*
787 * Decode and fetch the source operand: register, memory
788 * or immediate.
789 */
790 switch (d & SrcMask) {
791 case SrcNone:
792 break;
793 case SrcReg:
794 src.type = OP_REG;
795 if (d & ByteOp) {
796 src.ptr = decode_register(modrm_reg, _regs,
797 (rex_prefix == 0));
798 src.val = src.orig_val = *(u8 *) src.ptr;
799 src.bytes = 1;
800 } else {
801 src.ptr = decode_register(modrm_reg, _regs, 0);
802 switch ((src.bytes = op_bytes)) {
803 case 2:
804 src.val = src.orig_val = *(u16 *) src.ptr;
805 break;
806 case 4:
807 src.val = src.orig_val = *(u32 *) src.ptr;
808 break;
809 case 8:
810 src.val = src.orig_val = *(u64 *) src.ptr;
811 break;
812 }
813 }
814 break;
815 case SrcMem16:
816 src.bytes = 2;
817 goto srcmem_common;
818 case SrcMem32:
819 src.bytes = 4;
820 goto srcmem_common;
821 case SrcMem:
822 src.bytes = (d & ByteOp) ? 1 : op_bytes;
823 /* Don't fetch the address for invlpg: it could be unmapped. */
824 if (twobyte && b == 0x01 && modrm_reg == 7)
825 break;
826 srcmem_common:
827 /*
828 * For instructions with a ModR/M byte, switch to register
829 * access if Mod = 3.
830 */
831 if ((d & ModRM) && modrm_mod == 3) {
832 src.type = OP_REG;
833 break;
834 }
835 src.type = OP_MEM;
836 src.ptr = (unsigned long *)cr2;
837 src.val = 0;
838 if ((rc = ops->read_emulated((unsigned long)src.ptr,
839 &src.val, src.bytes, ctxt->vcpu)) != 0)
840 goto done;
841 src.orig_val = src.val;
842 break;
843 case SrcImm:
844 src.type = OP_IMM;
845 src.ptr = (unsigned long *)_eip;
846 src.bytes = (d & ByteOp) ? 1 : op_bytes;
847 if (src.bytes == 8)
848 src.bytes = 4;
849 /* NB. Immediates are sign-extended as necessary. */
850 switch (src.bytes) {
851 case 1:
852 src.val = insn_fetch(s8, 1, _eip);
853 break;
854 case 2:
855 src.val = insn_fetch(s16, 2, _eip);
856 break;
857 case 4:
858 src.val = insn_fetch(s32, 4, _eip);
859 break;
860 }
861 break;
862 case SrcImmByte:
863 src.type = OP_IMM;
864 src.ptr = (unsigned long *)_eip;
865 src.bytes = 1;
866 src.val = insn_fetch(s8, 1, _eip);
867 break;
868 }
869
870 /* Decode and fetch the destination operand: register or memory. */
871 switch (d & DstMask) {
872 case ImplicitOps:
873 /* Special instructions do their own operand decoding. */
874 goto special_insn;
875 case DstReg:
876 dst.type = OP_REG;
877 if ((d & ByteOp)
878 && !(twobyte && (b == 0xb6 || b == 0xb7))) {
879 dst.ptr = decode_register(modrm_reg, _regs,
880 (rex_prefix == 0));
881 dst.val = *(u8 *) dst.ptr;
882 dst.bytes = 1;
883 } else {
884 dst.ptr = decode_register(modrm_reg, _regs, 0);
885 switch ((dst.bytes = op_bytes)) {
886 case 2:
887 dst.val = *(u16 *)dst.ptr;
888 break;
889 case 4:
890 dst.val = *(u32 *)dst.ptr;
891 break;
892 case 8:
893 dst.val = *(u64 *)dst.ptr;
894 break;
895 }
896 }
897 break;
898 case DstMem:
899 dst.type = OP_MEM;
900 dst.ptr = (unsigned long *)cr2;
901 dst.bytes = (d & ByteOp) ? 1 : op_bytes;
902 dst.val = 0;
903 /*
904 * For instructions with a ModR/M byte, switch to register
905 * access if Mod = 3.
906 */
907 if ((d & ModRM) && modrm_mod == 3) {
908 dst.type = OP_REG;
909 break;
910 }
911 if (d & BitOp) {
912 unsigned long mask = ~(dst.bytes * 8 - 1);
913
914 dst.ptr = (void *)dst.ptr + (src.val & mask) / 8;
915 }
916 if (!(d & Mov) && /* optimisation - avoid slow emulated read */
917 ((rc = ops->read_emulated((unsigned long)dst.ptr,
918 &dst.val, dst.bytes, ctxt->vcpu)) != 0))
919 goto done;
920 break;
921 }
922 dst.orig_val = dst.val;
923
924 if (twobyte)
925 goto twobyte_insn;
926
927 switch (b) {
928 case 0x00 ... 0x05:
929 add: /* add */
930 emulate_2op_SrcV("add", src, dst, _eflags);
931 break;
932 case 0x08 ... 0x0d:
933 or: /* or */
934 emulate_2op_SrcV("or", src, dst, _eflags);
935 break;
936 case 0x10 ... 0x15:
937 adc: /* adc */
938 emulate_2op_SrcV("adc", src, dst, _eflags);
939 break;
940 case 0x18 ... 0x1d:
941 sbb: /* sbb */
942 emulate_2op_SrcV("sbb", src, dst, _eflags);
943 break;
944 case 0x20 ... 0x23:
945 and: /* and */
946 emulate_2op_SrcV("and", src, dst, _eflags);
947 break;
948 case 0x24: /* and al imm8 */
949 dst.type = OP_REG;
950 dst.ptr = &_regs[VCPU_REGS_RAX];
951 dst.val = *(u8 *)dst.ptr;
952 dst.bytes = 1;
953 dst.orig_val = dst.val;
954 goto and;
955 case 0x25: /* and ax imm16, or eax imm32 */
956 dst.type = OP_REG;
957 dst.bytes = op_bytes;
958 dst.ptr = &_regs[VCPU_REGS_RAX];
959 if (op_bytes == 2)
960 dst.val = *(u16 *)dst.ptr;
961 else
962 dst.val = *(u32 *)dst.ptr;
963 dst.orig_val = dst.val;
964 goto and;
965 case 0x28 ... 0x2d:
966 sub: /* sub */
967 emulate_2op_SrcV("sub", src, dst, _eflags);
968 break;
969 case 0x30 ... 0x35:
970 xor: /* xor */
971 emulate_2op_SrcV("xor", src, dst, _eflags);
972 break;
973 case 0x38 ... 0x3d:
974 cmp: /* cmp */
975 emulate_2op_SrcV("cmp", src, dst, _eflags);
976 break;
977 case 0x63: /* movsxd */
978 if (mode != X86EMUL_MODE_PROT64)
979 goto cannot_emulate;
980 dst.val = (s32) src.val;
981 break;
982 case 0x80 ... 0x83: /* Grp1 */
983 switch (modrm_reg) {
984 case 0:
985 goto add;
986 case 1:
987 goto or;
988 case 2:
989 goto adc;
990 case 3:
991 goto sbb;
992 case 4:
993 goto and;
994 case 5:
995 goto sub;
996 case 6:
997 goto xor;
998 case 7:
999 goto cmp;
1000 }
1001 break;
1002 case 0x84 ... 0x85:
1003 test: /* test */
1004 emulate_2op_SrcV("test", src, dst, _eflags);
1005 break;
1006 case 0x86 ... 0x87: /* xchg */
1007 /* Write back the register source. */
1008 switch (dst.bytes) {
1009 case 1:
1010 *(u8 *) src.ptr = (u8) dst.val;
1011 break;
1012 case 2:
1013 *(u16 *) src.ptr = (u16) dst.val;
1014 break;
1015 case 4:
1016 *src.ptr = (u32) dst.val;
1017 break; /* 64b reg: zero-extend */
1018 case 8:
1019 *src.ptr = dst.val;
1020 break;
1021 }
1022 /*
1023 * Write back the memory destination with implicit LOCK
1024 * prefix.
1025 */
1026 dst.val = src.val;
1027 lock_prefix = 1;
1028 break;
1029 case 0x88 ... 0x8b: /* mov */
1030 goto mov;
1031 case 0x8d: /* lea r16/r32, m */
1032 dst.val = modrm_val;
1033 break;
1034 case 0x8f: /* pop (sole member of Grp1a) */
1035 /* 64-bit mode: POP always pops a 64-bit operand. */
1036 if (mode == X86EMUL_MODE_PROT64)
1037 dst.bytes = 8;
1038 if ((rc = ops->read_std(register_address(ctxt->ss_base,
1039 _regs[VCPU_REGS_RSP]),
1040 &dst.val, dst.bytes, ctxt->vcpu)) != 0)
1041 goto done;
1042 register_address_increment(_regs[VCPU_REGS_RSP], dst.bytes);
1043 break;
1044 case 0xa0 ... 0xa1: /* mov */
1045 dst.ptr = (unsigned long *)&_regs[VCPU_REGS_RAX];
1046 dst.val = src.val;
1047 _eip += ad_bytes; /* skip src displacement */
1048 break;
1049 case 0xa2 ... 0xa3: /* mov */
1050 dst.val = (unsigned long)_regs[VCPU_REGS_RAX];
1051 _eip += ad_bytes; /* skip dst displacement */
1052 break;
1053 case 0xc0 ... 0xc1:
1054 grp2: /* Grp2 */
1055 switch (modrm_reg) {
1056 case 0: /* rol */
1057 emulate_2op_SrcB("rol", src, dst, _eflags);
1058 break;
1059 case 1: /* ror */
1060 emulate_2op_SrcB("ror", src, dst, _eflags);
1061 break;
1062 case 2: /* rcl */
1063 emulate_2op_SrcB("rcl", src, dst, _eflags);
1064 break;
1065 case 3: /* rcr */
1066 emulate_2op_SrcB("rcr", src, dst, _eflags);
1067 break;
1068 case 4: /* sal/shl */
1069 case 6: /* sal/shl */
1070 emulate_2op_SrcB("sal", src, dst, _eflags);
1071 break;
1072 case 5: /* shr */
1073 emulate_2op_SrcB("shr", src, dst, _eflags);
1074 break;
1075 case 7: /* sar */
1076 emulate_2op_SrcB("sar", src, dst, _eflags);
1077 break;
1078 }
1079 break;
1080 case 0xc6 ... 0xc7: /* mov (sole member of Grp11) */
1081 mov:
1082 dst.val = src.val;
1083 break;
1084 case 0xd0 ... 0xd1: /* Grp2 */
1085 src.val = 1;
1086 goto grp2;
1087 case 0xd2 ... 0xd3: /* Grp2 */
1088 src.val = _regs[VCPU_REGS_RCX];
1089 goto grp2;
1090 case 0xf6 ... 0xf7: /* Grp3 */
1091 switch (modrm_reg) {
1092 case 0 ... 1: /* test */
1093 /*
1094 * Special case in Grp3: test has an immediate
1095 * source operand.
1096 */
1097 src.type = OP_IMM;
1098 src.ptr = (unsigned long *)_eip;
1099 src.bytes = (d & ByteOp) ? 1 : op_bytes;
1100 if (src.bytes == 8)
1101 src.bytes = 4;
1102 switch (src.bytes) {
1103 case 1:
1104 src.val = insn_fetch(s8, 1, _eip);
1105 break;
1106 case 2:
1107 src.val = insn_fetch(s16, 2, _eip);
1108 break;
1109 case 4:
1110 src.val = insn_fetch(s32, 4, _eip);
1111 break;
1112 }
1113 goto test;
1114 case 2: /* not */
1115 dst.val = ~dst.val;
1116 break;
1117 case 3: /* neg */
1118 emulate_1op("neg", dst, _eflags);
1119 break;
1120 default:
1121 goto cannot_emulate;
1122 }
1123 break;
1124 case 0xfe ... 0xff: /* Grp4/Grp5 */
1125 switch (modrm_reg) {
1126 case 0: /* inc */
1127 emulate_1op("inc", dst, _eflags);
1128 break;
1129 case 1: /* dec */
1130 emulate_1op("dec", dst, _eflags);
1131 break;
1132 case 4: /* jmp abs */
1133 if (b == 0xff)
1134 _eip = dst.val;
1135 else
1136 goto cannot_emulate;
1137 break;
1138 case 6: /* push */
1139 /* 64-bit mode: PUSH always pushes a 64-bit operand. */
1140 if (mode == X86EMUL_MODE_PROT64) {
1141 dst.bytes = 8;
1142 if ((rc = ops->read_std((unsigned long)dst.ptr,
1143 &dst.val, 8,
1144 ctxt->vcpu)) != 0)
1145 goto done;
1146 }
1147 register_address_increment(_regs[VCPU_REGS_RSP],
1148 -dst.bytes);
1149 if ((rc = ops->write_emulated(
1150 register_address(ctxt->ss_base,
1151 _regs[VCPU_REGS_RSP]),
1152 &dst.val, dst.bytes, ctxt->vcpu)) != 0)
1153 goto done;
1154 no_wb = 1;
1155 break;
1156 default:
1157 goto cannot_emulate;
1158 }
1159 break;
1160 }
1161
1162writeback:
1163 if (!no_wb) {
1164 switch (dst.type) {
1165 case OP_REG:
1166 /* The 4-byte case *is* correct: in 64-bit mode we zero-extend. */
1167 switch (dst.bytes) {
1168 case 1:
1169 *(u8 *)dst.ptr = (u8)dst.val;
1170 break;
1171 case 2:
1172 *(u16 *)dst.ptr = (u16)dst.val;
1173 break;
1174 case 4:
1175 *dst.ptr = (u32)dst.val;
1176 break; /* 64b: zero-ext */
1177 case 8:
1178 *dst.ptr = dst.val;
1179 break;
1180 }
1181 break;
1182 case OP_MEM:
1183 if (lock_prefix)
1184 rc = ops->cmpxchg_emulated((unsigned long)dst.
1185 ptr, &dst.orig_val,
1186 &dst.val, dst.bytes,
1187 ctxt->vcpu);
1188 else
1189 rc = ops->write_emulated((unsigned long)dst.ptr,
1190 &dst.val, dst.bytes,
1191 ctxt->vcpu);
1192 if (rc != 0)
1193 goto done;
1194 default:
1195 break;
1196 }
1197 }
1198
1199 /* Commit shadow register state. */
1200 memcpy(ctxt->vcpu->regs, _regs, sizeof _regs);
1201 ctxt->eflags = _eflags;
1202 ctxt->vcpu->rip = _eip;
1203
1204done:
1205 return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
1206
1207special_insn:
1208 if (twobyte)
1209 goto twobyte_special_insn;
1210 switch(b) {
1211 case 0x50 ... 0x57: /* push reg */
1212 if (op_bytes == 2)
1213 src.val = (u16) _regs[b & 0x7];
1214 else
1215 src.val = (u32) _regs[b & 0x7];
1216 dst.type = OP_MEM;
1217 dst.bytes = op_bytes;
1218 dst.val = src.val;
1219 register_address_increment(_regs[VCPU_REGS_RSP], -op_bytes);
1220 dst.ptr = (void *) register_address(
1221 ctxt->ss_base, _regs[VCPU_REGS_RSP]);
1222 break;
1223 case 0x58 ... 0x5f: /* pop reg */
1224 dst.ptr = (unsigned long *)&_regs[b & 0x7];
1225 pop_instruction:
1226 if ((rc = ops->read_std(register_address(ctxt->ss_base,
1227 _regs[VCPU_REGS_RSP]), dst.ptr, op_bytes, ctxt->vcpu))
1228 != 0)
1229 goto done;
1230
1231 register_address_increment(_regs[VCPU_REGS_RSP], op_bytes);
1232 no_wb = 1; /* Disable writeback. */
1233 break;
1234 case 0x6a: /* push imm8 */
1235 src.val = 0L;
1236 src.val = insn_fetch(s8, 1, _eip);
1237 push:
1238 dst.type = OP_MEM;
1239 dst.bytes = op_bytes;
1240 dst.val = src.val;
1241 register_address_increment(_regs[VCPU_REGS_RSP], -op_bytes);
1242 dst.ptr = (void *) register_address(ctxt->ss_base,
1243 _regs[VCPU_REGS_RSP]);
1244 break;
1245 case 0x6c: /* insb */
1246 case 0x6d: /* insw/insd */
1247 if (kvm_emulate_pio_string(ctxt->vcpu, NULL,
1248 1, /* in */
1249 (d & ByteOp) ? 1 : op_bytes, /* size */
1250 rep_prefix ?
1251 address_mask(_regs[VCPU_REGS_RCX]) : 1, /* count */
1252 (_eflags & EFLG_DF), /* down */
1253 register_address(ctxt->es_base,
1254 _regs[VCPU_REGS_RDI]), /* address */
1255 rep_prefix,
1256 _regs[VCPU_REGS_RDX] /* port */
1257 ) == 0)
1258 return -1;
1259 return 0;
1260 case 0x6e: /* outsb */
1261 case 0x6f: /* outsw/outsd */
1262 if (kvm_emulate_pio_string(ctxt->vcpu, NULL,
1263 0, /* in */
1264 (d & ByteOp) ? 1 : op_bytes, /* size */
1265 rep_prefix ?
1266 address_mask(_regs[VCPU_REGS_RCX]) : 1, /* count */
1267 (_eflags & EFLG_DF), /* down */
1268 register_address(override_base ?
1269 *override_base : ctxt->ds_base,
1270 _regs[VCPU_REGS_RSI]), /* address */
1271 rep_prefix,
1272 _regs[VCPU_REGS_RDX] /* port */
1273 ) == 0)
1274 return -1;
1275 return 0;
1276 case 0x70 ... 0x7f: /* jcc (short) */ {
1277 int rel = insn_fetch(s8, 1, _eip);
1278
1279 if (test_cc(b, _eflags))
1280 JMP_REL(rel);
1281 break;
1282 }
1283 case 0x9c: /* pushf */
1284 src.val = (unsigned long) _eflags;
1285 goto push;
1286 case 0x9d: /* popf */
1287 dst.ptr = (unsigned long *) &_eflags;
1288 goto pop_instruction;
1289 case 0xc3: /* ret */
1290 dst.ptr = &_eip;
1291 goto pop_instruction;
1292 case 0xf4: /* hlt */
1293 ctxt->vcpu->halt_request = 1;
1294 goto done;
1295 }
1296 if (rep_prefix) {
1297 if (_regs[VCPU_REGS_RCX] == 0) {
1298 ctxt->vcpu->rip = _eip;
1299 goto done;
1300 }
1301 _regs[VCPU_REGS_RCX]--;
1302 _eip = ctxt->vcpu->rip;
1303 }
1304 switch (b) {
1305 case 0xa4 ... 0xa5: /* movs */
1306 dst.type = OP_MEM;
1307 dst.bytes = (d & ByteOp) ? 1 : op_bytes;
1308 dst.ptr = (unsigned long *)register_address(ctxt->es_base,
1309 _regs[VCPU_REGS_RDI]);
1310 if ((rc = ops->read_emulated(register_address(
1311 override_base ? *override_base : ctxt->ds_base,
1312 _regs[VCPU_REGS_RSI]), &dst.val, dst.bytes, ctxt->vcpu)) != 0)
1313 goto done;
1314 register_address_increment(_regs[VCPU_REGS_RSI],
1315 (_eflags & EFLG_DF) ? -dst.bytes : dst.bytes);
1316 register_address_increment(_regs[VCPU_REGS_RDI],
1317 (_eflags & EFLG_DF) ? -dst.bytes : dst.bytes);
1318 break;
1319 case 0xa6 ... 0xa7: /* cmps */
1320 DPRINTF("Urk! I don't handle CMPS.\n");
1321 goto cannot_emulate;
1322 case 0xaa ... 0xab: /* stos */
1323 dst.type = OP_MEM;
1324 dst.bytes = (d & ByteOp) ? 1 : op_bytes;
1325 dst.ptr = (unsigned long *)cr2;
1326 dst.val = _regs[VCPU_REGS_RAX];
1327 register_address_increment(_regs[VCPU_REGS_RDI],
1328 (_eflags & EFLG_DF) ? -dst.bytes : dst.bytes);
1329 break;
1330 case 0xac ... 0xad: /* lods */
1331 dst.type = OP_REG;
1332 dst.bytes = (d & ByteOp) ? 1 : op_bytes;
1333 dst.ptr = (unsigned long *)&_regs[VCPU_REGS_RAX];
1334 if ((rc = ops->read_emulated(cr2, &dst.val, dst.bytes,
1335 ctxt->vcpu)) != 0)
1336 goto done;
1337 register_address_increment(_regs[VCPU_REGS_RSI],
1338 (_eflags & EFLG_DF) ? -dst.bytes : dst.bytes);
1339 break;
1340 case 0xae ... 0xaf: /* scas */
1341 DPRINTF("Urk! I don't handle SCAS.\n");
1342 goto cannot_emulate;
1343 case 0xe8: /* call (near) */ {
1344 long int rel;
1345 switch (op_bytes) {
1346 case 2:
1347 rel = insn_fetch(s16, 2, _eip);
1348 break;
1349 case 4:
1350 rel = insn_fetch(s32, 4, _eip);
1351 break;
1352 case 8:
1353 rel = insn_fetch(s64, 8, _eip);
1354 break;
1355 default:
1356 DPRINTF("Call: Invalid op_bytes\n");
1357 goto cannot_emulate;
1358 }
1359 src.val = (unsigned long) _eip;
1360 JMP_REL(rel);
1361 op_bytes = ad_bytes;
1362 goto push;
1363 }
1364 case 0xe9: /* jmp rel */
1365 case 0xeb: /* jmp rel short */
1366 JMP_REL(src.val);
1367 no_wb = 1; /* Disable writeback. */
1368 break;
1369
1370
1371 }
1372 goto writeback;
1373
1374twobyte_insn:
1375 switch (b) {
1376 case 0x01: /* lgdt, lidt, lmsw */
1377 /* Disable writeback. */
1378 no_wb = 1;
1379 switch (modrm_reg) {
1380 u16 size;
1381 unsigned long address;
1382
1383 case 2: /* lgdt */
1384 rc = read_descriptor(ctxt, ops, src.ptr,
1385 &size, &address, op_bytes);
1386 if (rc)
1387 goto done;
1388 realmode_lgdt(ctxt->vcpu, size, address);
1389 break;
1390 case 3: /* lidt */
1391 rc = read_descriptor(ctxt, ops, src.ptr,
1392 &size, &address, op_bytes);
1393 if (rc)
1394 goto done;
1395 realmode_lidt(ctxt->vcpu, size, address);
1396 break;
1397 case 4: /* smsw */
1398 if (modrm_mod != 3)
1399 goto cannot_emulate;
1400 *(u16 *)&_regs[modrm_rm]
1401 = realmode_get_cr(ctxt->vcpu, 0);
1402 break;
1403 case 6: /* lmsw */
1404 if (modrm_mod != 3)
1405 goto cannot_emulate;
1406 realmode_lmsw(ctxt->vcpu, (u16)modrm_val, &_eflags);
1407 break;
1408 case 7: /* invlpg*/
1409 emulate_invlpg(ctxt->vcpu, cr2);
1410 break;
1411 default:
1412 goto cannot_emulate;
1413 }
1414 break;
1415 case 0x21: /* mov from dr to reg */
1416 no_wb = 1;
1417 if (modrm_mod != 3)
1418 goto cannot_emulate;
1419 rc = emulator_get_dr(ctxt, modrm_reg, &_regs[modrm_rm]);
1420 break;
1421 case 0x23: /* mov from reg to dr */
1422 no_wb = 1;
1423 if (modrm_mod != 3)
1424 goto cannot_emulate;
1425 rc = emulator_set_dr(ctxt, modrm_reg, _regs[modrm_rm]);
1426 break;
1427 case 0x40 ... 0x4f: /* cmov */
1428 dst.val = dst.orig_val = src.val;
1429 no_wb = 1;
1430 /*
1431 * First, assume we're decoding an even cmov opcode
1432 * (lsb == 0).
1433 */
1434 switch ((b & 15) >> 1) {
1435 case 0: /* cmovo */
1436 no_wb = (_eflags & EFLG_OF) ? 0 : 1;
1437 break;
1438 case 1: /* cmovb/cmovc/cmovnae */
1439 no_wb = (_eflags & EFLG_CF) ? 0 : 1;
1440 break;
1441 case 2: /* cmovz/cmove */
1442 no_wb = (_eflags & EFLG_ZF) ? 0 : 1;
1443 break;
1444 case 3: /* cmovbe/cmovna */
1445 no_wb = (_eflags & (EFLG_CF | EFLG_ZF)) ? 0 : 1;
1446 break;
1447 case 4: /* cmovs */
1448 no_wb = (_eflags & EFLG_SF) ? 0 : 1;
1449 break;
1450 case 5: /* cmovp/cmovpe */
1451 no_wb = (_eflags & EFLG_PF) ? 0 : 1;
1452 break;
1453 case 7: /* cmovle/cmovng */
1454 no_wb = (_eflags & EFLG_ZF) ? 0 : 1;
1455 /* fall through */
1456 case 6: /* cmovl/cmovnge */
1457 no_wb &= (!(_eflags & EFLG_SF) !=
1458 !(_eflags & EFLG_OF)) ? 0 : 1;
1459 break;
1460 }
1461 /* Odd cmov opcodes (lsb == 1) have inverted sense. */
1462 no_wb ^= b & 1;
1463 break;
1464 case 0xa3:
1465 bt: /* bt */
1466 src.val &= (dst.bytes << 3) - 1; /* only subword offset */
1467 emulate_2op_SrcV_nobyte("bt", src, dst, _eflags);
1468 break;
1469 case 0xab:
1470 bts: /* bts */
1471 src.val &= (dst.bytes << 3) - 1; /* only subword offset */
1472 emulate_2op_SrcV_nobyte("bts", src, dst, _eflags);
1473 break;
1474 case 0xb0 ... 0xb1: /* cmpxchg */
1475 /*
1476 * Save real source value, then compare EAX against
1477 * destination.
1478 */
1479 src.orig_val = src.val;
1480 src.val = _regs[VCPU_REGS_RAX];
1481 emulate_2op_SrcV("cmp", src, dst, _eflags);
1482 if (_eflags & EFLG_ZF) {
1483 /* Success: write back to memory. */
1484 dst.val = src.orig_val;
1485 } else {
1486 /* Failure: write the value we saw to EAX. */
1487 dst.type = OP_REG;
1488 dst.ptr = (unsigned long *)&_regs[VCPU_REGS_RAX];
1489 }
1490 break;
1491 case 0xb3:
1492 btr: /* btr */
1493 src.val &= (dst.bytes << 3) - 1; /* only subword offset */
1494 emulate_2op_SrcV_nobyte("btr", src, dst, _eflags);
1495 break;
1496 case 0xb6 ... 0xb7: /* movzx */
1497 dst.bytes = op_bytes;
1498 dst.val = (d & ByteOp) ? (u8) src.val : (u16) src.val;
1499 break;
1500 case 0xba: /* Grp8 */
1501 switch (modrm_reg & 3) {
1502 case 0:
1503 goto bt;
1504 case 1:
1505 goto bts;
1506 case 2:
1507 goto btr;
1508 case 3:
1509 goto btc;
1510 }
1511 break;
1512 case 0xbb:
1513 btc: /* btc */
1514 src.val &= (dst.bytes << 3) - 1; /* only subword offset */
1515 emulate_2op_SrcV_nobyte("btc", src, dst, _eflags);
1516 break;
1517 case 0xbe ... 0xbf: /* movsx */
1518 dst.bytes = op_bytes;
1519 dst.val = (d & ByteOp) ? (s8) src.val : (s16) src.val;
1520 break;
1521 case 0xc3: /* movnti */
1522 dst.bytes = op_bytes;
1523 dst.val = (op_bytes == 4) ? (u32) src.val : (u64) src.val;
1524 break;
1525 }
1526 goto writeback;
1527
1528twobyte_special_insn:
1529 /* Disable writeback. */
1530 no_wb = 1;
1531 switch (b) {
1532	case 0x06: /* clts */
1533 emulate_clts(ctxt->vcpu);
1534 break;
1535 case 0x08: /* invd */
1536 break;
1537 case 0x09: /* wbinvd */
1538 break;
1539 case 0x0d: /* GrpP (prefetch) */
1540 case 0x18: /* Grp16 (prefetch/nop) */
1541 break;
1542 case 0x20: /* mov cr, reg */
1543 if (modrm_mod != 3)
1544 goto cannot_emulate;
1545 _regs[modrm_rm] = realmode_get_cr(ctxt->vcpu, modrm_reg);
1546 break;
1547 case 0x22: /* mov reg, cr */
1548 if (modrm_mod != 3)
1549 goto cannot_emulate;
1550 realmode_set_cr(ctxt->vcpu, modrm_reg, modrm_val, &_eflags);
1551 break;
1552 case 0x30:
1553 /* wrmsr */
1554 msr_data = (u32)_regs[VCPU_REGS_RAX]
1555 | ((u64)_regs[VCPU_REGS_RDX] << 32);
1556 rc = kvm_set_msr(ctxt->vcpu, _regs[VCPU_REGS_RCX], msr_data);
1557 if (rc) {
1558 kvm_x86_ops->inject_gp(ctxt->vcpu, 0);
1559 _eip = ctxt->vcpu->rip;
1560 }
1561 rc = X86EMUL_CONTINUE;
1562 break;
1563 case 0x32:
1564 /* rdmsr */
1565 rc = kvm_get_msr(ctxt->vcpu, _regs[VCPU_REGS_RCX], &msr_data);
1566 if (rc) {
1567 kvm_x86_ops->inject_gp(ctxt->vcpu, 0);
1568 _eip = ctxt->vcpu->rip;
1569 } else {
1570 _regs[VCPU_REGS_RAX] = (u32)msr_data;
1571 _regs[VCPU_REGS_RDX] = msr_data >> 32;
1572 }
1573 rc = X86EMUL_CONTINUE;
1574 break;
1575 case 0x80 ... 0x8f: /* jnz rel, etc*/ {
1576 long int rel;
1577
1578 switch (op_bytes) {
1579 case 2:
1580 rel = insn_fetch(s16, 2, _eip);
1581 break;
1582 case 4:
1583 rel = insn_fetch(s32, 4, _eip);
1584 break;
1585 case 8:
1586 rel = insn_fetch(s64, 8, _eip);
1587 break;
1588 default:
1589 DPRINTF("jnz: Invalid op_bytes\n");
1590 goto cannot_emulate;
1591 }
1592 if (test_cc(b, _eflags))
1593 JMP_REL(rel);
1594 break;
1595 }
1596 case 0xc7: /* Grp9 (cmpxchg8b) */
1597 {
1598 u64 old, new;
1599 if ((rc = ops->read_emulated(cr2, &old, 8, ctxt->vcpu))
1600 != 0)
1601 goto done;
1602 if (((u32) (old >> 0) != (u32) _regs[VCPU_REGS_RAX]) ||
1603 ((u32) (old >> 32) != (u32) _regs[VCPU_REGS_RDX])) {
1604 _regs[VCPU_REGS_RAX] = (u32) (old >> 0);
1605 _regs[VCPU_REGS_RDX] = (u32) (old >> 32);
1606 _eflags &= ~EFLG_ZF;
1607 } else {
1608 new = ((u64)_regs[VCPU_REGS_RCX] << 32)
1609 | (u32) _regs[VCPU_REGS_RBX];
1610 if ((rc = ops->cmpxchg_emulated(cr2, &old,
1611 &new, 8, ctxt->vcpu)) != 0)
1612 goto done;
1613 _eflags |= EFLG_ZF;
1614 }
1615 break;
1616 }
1617 }
1618 goto writeback;
1619
1620cannot_emulate:
1621 DPRINTF("Cannot emulate %02x\n", b);
1622 return -1;
1623}
1624
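/*
 * Caller-side sketch (illustrative, not from this patch) of how the entry
 * point above is driven: the context and callback table come from the
 * surrounding KVM code.
 */
static int emulate_faulting_insn(struct x86_emulate_ctxt *ctxt,
				 struct x86_emulate_ops *ops)
{
	/*
	 * ctxt->vcpu, ctxt->cr2, ctxt->mode, ctxt->eflags and the segment
	 * bases must already describe the faulting guest state; ops must
	 * supply at least read_std, read_emulated, write_emulated and
	 * cmpxchg_emulated.  On success the shadow registers are committed
	 * back into ctxt->vcpu by x86_emulate_memop() itself.
	 */
	return x86_emulate_memop(ctxt, ops);	/* -1 only if unemulatable */
}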
1625#ifdef __XEN__
1626
1627#include <asm/mm.h>
1628#include <asm/uaccess.h>
1629
1630int
1631x86_emulate_read_std(unsigned long addr,
1632 unsigned long *val,
1633 unsigned int bytes, struct x86_emulate_ctxt *ctxt)
1634{
1635 unsigned int rc;
1636
1637 *val = 0;
1638
1639 if ((rc = copy_from_user((void *)val, (void *)addr, bytes)) != 0) {
1640 propagate_page_fault(addr + bytes - rc, 0); /* read fault */
1641 return X86EMUL_PROPAGATE_FAULT;
1642 }
1643
1644 return X86EMUL_CONTINUE;
1645}
1646
1647int
1648x86_emulate_write_std(unsigned long addr,
1649 unsigned long val,
1650 unsigned int bytes, struct x86_emulate_ctxt *ctxt)
1651{
1652 unsigned int rc;
1653
1654 if ((rc = copy_to_user((void *)addr, (void *)&val, bytes)) != 0) {
1655 propagate_page_fault(addr + bytes - rc, PGERR_write_access);
1656 return X86EMUL_PROPAGATE_FAULT;
1657 }
1658
1659 return X86EMUL_CONTINUE;
1660}
1661
1662#endif
diff --git a/include/asm-x86/Kbuild b/include/asm-x86/Kbuild
index e6189b229143..3c6f0f80e827 100644
--- a/include/asm-x86/Kbuild
+++ b/include/asm-x86/Kbuild
@@ -3,6 +3,7 @@ include include/asm-generic/Kbuild.asm
3header-y += boot.h 3header-y += boot.h
4header-y += bootparam.h 4header-y += bootparam.h
5header-y += debugreg.h 5header-y += debugreg.h
6header-y += kvm.h
6header-y += ldt.h 7header-y += ldt.h
7header-y += msr-index.h 8header-y += msr-index.h
8header-y += prctl.h 9header-y += prctl.h
diff --git a/include/asm-x86/kvm.h b/include/asm-x86/kvm.h
new file mode 100644
index 000000000000..7a71120426a3
--- /dev/null
+++ b/include/asm-x86/kvm.h
@@ -0,0 +1,191 @@
1#ifndef __LINUX_KVM_X86_H
2#define __LINUX_KVM_X86_H
3
4/*
5 * KVM x86 specific structures and definitions
6 *
7 */
8
9#include <asm/types.h>
10#include <linux/ioctl.h>
11
12/* Architectural interrupt line count. */
13#define KVM_NR_INTERRUPTS 256
14
15struct kvm_memory_alias {
16 __u32 slot; /* this has a different namespace than memory slots */
17 __u32 flags;
18 __u64 guest_phys_addr;
19 __u64 memory_size;
20 __u64 target_phys_addr;
21};
22
23/* for KVM_GET_IRQCHIP and KVM_SET_IRQCHIP */
24struct kvm_pic_state {
25 __u8 last_irr; /* edge detection */
26 __u8 irr; /* interrupt request register */
27 __u8 imr; /* interrupt mask register */
28 __u8 isr; /* interrupt service register */
29 __u8 priority_add; /* highest irq priority */
30 __u8 irq_base;
31 __u8 read_reg_select;
32 __u8 poll;
33 __u8 special_mask;
34 __u8 init_state;
35 __u8 auto_eoi;
36 __u8 rotate_on_auto_eoi;
37 __u8 special_fully_nested_mode;
38 __u8 init4; /* true if 4 byte init */
39 __u8 elcr; /* PIIX edge/trigger selection */
40 __u8 elcr_mask;
41};
42
43#define KVM_IOAPIC_NUM_PINS 24
44struct kvm_ioapic_state {
45 __u64 base_address;
46 __u32 ioregsel;
47 __u32 id;
48 __u32 irr;
49 __u32 pad;
50 union {
51 __u64 bits;
52 struct {
53 __u8 vector;
54 __u8 delivery_mode:3;
55 __u8 dest_mode:1;
56 __u8 delivery_status:1;
57 __u8 polarity:1;
58 __u8 remote_irr:1;
59 __u8 trig_mode:1;
60 __u8 mask:1;
61 __u8 reserve:7;
62 __u8 reserved[4];
63 __u8 dest_id;
64 } fields;
65 } redirtbl[KVM_IOAPIC_NUM_PINS];
66};
67
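/*
 * Userspace usage sketch (hypothetical values): filling one redirection
 * table entry before KVM_SET_IRQCHIP; the entry can equally be handled
 * through the bit-fields or as the raw 64-bit word.
 */
static inline void example_route_gsi1(struct kvm_ioapic_state *s)
{
	s->redirtbl[1].fields.vector = 0x31;		/* GSI 1 -> vector 0x31 */
	s->redirtbl[1].fields.delivery_mode = 0;	/* fixed delivery */
	s->redirtbl[1].fields.trig_mode = 0;		/* edge triggered */
	s->redirtbl[1].fields.mask = 0;			/* unmasked */
	/* s->redirtbl[1].bits now holds the same entry as a single __u64 */
}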
68#define KVM_IRQCHIP_PIC_MASTER 0
69#define KVM_IRQCHIP_PIC_SLAVE 1
70#define KVM_IRQCHIP_IOAPIC 2
71
72/* for KVM_GET_REGS and KVM_SET_REGS */
73struct kvm_regs {
74 /* out (KVM_GET_REGS) / in (KVM_SET_REGS) */
75 __u64 rax, rbx, rcx, rdx;
76 __u64 rsi, rdi, rsp, rbp;
77 __u64 r8, r9, r10, r11;
78 __u64 r12, r13, r14, r15;
79 __u64 rip, rflags;
80};
81
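/*
 * Userspace sketch (hypothetical helper; assumes <sys/ioctl.h> and a vcpu
 * file descriptor): the same layout is read back by KVM_GET_REGS and
 * written by KVM_SET_REGS, as the comment above notes.
 */
static int example_get_rip(int vcpu_fd, __u64 *rip)
{
	struct kvm_regs regs;

	if (ioctl(vcpu_fd, KVM_GET_REGS, &regs) < 0)
		return -1;
	*rip = regs.rip;
	return 0;
}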
82/* for KVM_GET_LAPIC and KVM_SET_LAPIC */
83#define KVM_APIC_REG_SIZE 0x400
84struct kvm_lapic_state {
85 char regs[KVM_APIC_REG_SIZE];
86};
87
88struct kvm_segment {
89 __u64 base;
90 __u32 limit;
91 __u16 selector;
92 __u8 type;
93 __u8 present, dpl, db, s, l, g, avl;
94 __u8 unusable;
95 __u8 padding;
96};
97
98struct kvm_dtable {
99 __u64 base;
100 __u16 limit;
101 __u16 padding[3];
102};
103
104
105/* for KVM_GET_SREGS and KVM_SET_SREGS */
106struct kvm_sregs {
107 /* out (KVM_GET_SREGS) / in (KVM_SET_SREGS) */
108 struct kvm_segment cs, ds, es, fs, gs, ss;
109 struct kvm_segment tr, ldt;
110 struct kvm_dtable gdt, idt;
111 __u64 cr0, cr2, cr3, cr4, cr8;
112 __u64 efer;
113 __u64 apic_base;
114 __u64 interrupt_bitmap[(KVM_NR_INTERRUPTS + 63) / 64];
115};
116
117/* for KVM_GET_FPU and KVM_SET_FPU */
118struct kvm_fpu {
119 __u8 fpr[8][16];
120 __u16 fcw;
121 __u16 fsw;
122 __u8 ftwx; /* in fxsave format */
123 __u8 pad1;
124 __u16 last_opcode;
125 __u64 last_ip;
126 __u64 last_dp;
127 __u8 xmm[16][16];
128 __u32 mxcsr;
129 __u32 pad2;
130};
131
132struct kvm_msr_entry {
133 __u32 index;
134 __u32 reserved;
135 __u64 data;
136};
137
138/* for KVM_GET_MSRS and KVM_SET_MSRS */
139struct kvm_msrs {
140 __u32 nmsrs; /* number of msrs in entries */
141 __u32 pad;
142
143 struct kvm_msr_entry entries[0];
144};
145
146/* for KVM_GET_MSR_INDEX_LIST */
147struct kvm_msr_list {
148 __u32 nmsrs; /* number of msrs in entries */
149 __u32 indices[0];
150};
151
152
153struct kvm_cpuid_entry {
154 __u32 function;
155 __u32 eax;
156 __u32 ebx;
157 __u32 ecx;
158 __u32 edx;
159 __u32 padding;
160};
161
162/* for KVM_SET_CPUID */
163struct kvm_cpuid {
164 __u32 nent;
165 __u32 padding;
166 struct kvm_cpuid_entry entries[0];
167};
168
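/*
 * Userspace allocation sketch (hypothetical helper; assumes <stdlib.h>):
 * the zero-length entries[] array means the header and its payload are
 * allocated as a single block before being handed to KVM_SET_CPUID.
 */
static inline struct kvm_cpuid *alloc_kvm_cpuid(__u32 nent)
{
	struct kvm_cpuid *cpuid;

	cpuid = malloc(sizeof(*cpuid) + nent * sizeof(cpuid->entries[0]));
	if (cpuid)
		cpuid->nent = nent;	/* entries[] to be filled by the caller */
	return cpuid;
}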
169struct kvm_cpuid_entry2 {
170 __u32 function;
171 __u32 index;
172 __u32 flags;
173 __u32 eax;
174 __u32 ebx;
175 __u32 ecx;
176 __u32 edx;
177 __u32 padding[3];
178};
179
180#define KVM_CPUID_FLAG_SIGNIFCANT_INDEX 1
181#define KVM_CPUID_FLAG_STATEFUL_FUNC 2
182#define KVM_CPUID_FLAG_STATE_READ_NEXT 4
183
184/* for KVM_SET_CPUID2 */
185struct kvm_cpuid2 {
186 __u32 nent;
187 __u32 padding;
188 struct kvm_cpuid_entry2 entries[0];
189};
190
191#endif
diff --git a/drivers/kvm/kvm.h b/include/asm-x86/kvm_host.h
index 3b0bc4bda5f2..4702b04b979a 100644
--- a/drivers/kvm/kvm.h
+++ b/include/asm-x86/kvm_host.h
@@ -1,23 +1,24 @@
1#ifndef __KVM_H 1#/*
2#define __KVM_H 2 * Kernel-based Virtual Machine driver for Linux
3 3 *
4/* 4 * This header defines architecture specific interfaces, x86 version
5 *
5 * This work is licensed under the terms of the GNU GPL, version 2. See 6 * This work is licensed under the terms of the GNU GPL, version 2. See
6 * the COPYING file in the top-level directory. 7 * the COPYING file in the top-level directory.
8 *
7 */ 9 */
8 10
11#ifndef ASM_KVM_HOST_H
12#define ASM_KVM_HOST_H
13
9#include <linux/types.h> 14#include <linux/types.h>
10#include <linux/list.h>
11#include <linux/mutex.h>
12#include <linux/spinlock.h>
13#include <linux/signal.h>
14#include <linux/sched.h>
15#include <linux/mm.h> 15#include <linux/mm.h>
16#include <linux/preempt.h>
17#include <asm/signal.h>
18 16
19#include <linux/kvm.h> 17#include <linux/kvm.h>
20#include <linux/kvm_para.h> 18#include <linux/kvm_para.h>
19#include <linux/kvm_types.h>
20
21#include <asm/desc.h>
21 22
22#define CR3_PAE_RESERVED_BITS ((X86_CR3_PWT | X86_CR3_PCD) - 1) 23#define CR3_PAE_RESERVED_BITS ((X86_CR3_PWT | X86_CR3_PCD) - 1)
23#define CR3_NONPAE_RESERVED_BITS ((PAGE_SIZE-1) & ~(X86_CR3_PWT | X86_CR3_PCD)) 24#define CR3_NONPAE_RESERVED_BITS ((PAGE_SIZE-1) & ~(X86_CR3_PWT | X86_CR3_PCD))
@@ -37,15 +38,8 @@
37#define INVALID_PAGE (~(hpa_t)0) 38#define INVALID_PAGE (~(hpa_t)0)
38#define UNMAPPED_GVA (~(gpa_t)0) 39#define UNMAPPED_GVA (~(gpa_t)0)
39 40
40#define KVM_MAX_VCPUS 4
41#define KVM_ALIAS_SLOTS 4
42#define KVM_MEMORY_SLOTS 8
43#define KVM_NUM_MMU_PAGES 1024
44#define KVM_MIN_FREE_MMU_PAGES 5
45#define KVM_REFILL_PAGES 25
46#define KVM_MAX_CPUID_ENTRIES 40
47
48#define DE_VECTOR 0 41#define DE_VECTOR 0
42#define UD_VECTOR 6
49#define NM_VECTOR 7 43#define NM_VECTOR 7
50#define DF_VECTOR 8 44#define DF_VECTOR 8
51#define TS_VECTOR 10 45#define TS_VECTOR 10
@@ -59,31 +53,66 @@
59 53
60#define IOPL_SHIFT 12 54#define IOPL_SHIFT 12
61 55
62#define KVM_PIO_PAGE_OFFSET 1 56#define KVM_ALIAS_SLOTS 4
63 57
64/* 58#define KVM_PERMILLE_MMU_PAGES 20
65 * vcpu->requests bit members 59#define KVM_MIN_ALLOC_MMU_PAGES 64
66 */ 60#define KVM_NUM_MMU_PAGES 1024
67#define KVM_TLB_FLUSH 0 61#define KVM_MIN_FREE_MMU_PAGES 5
62#define KVM_REFILL_PAGES 25
63#define KVM_MAX_CPUID_ENTRIES 40
68 64
69/* 65extern spinlock_t kvm_lock;
70 * Address types: 66extern struct list_head vm_list;
71 * 67
72 * gva - guest virtual address 68struct kvm_vcpu;
73 * gpa - guest physical address 69struct kvm;
74 * gfn - guest frame number 70
75 * hva - host virtual address 71enum {
76 * hpa - host physical address 72 VCPU_REGS_RAX = 0,
77 * hfn - host frame number 73 VCPU_REGS_RCX = 1,
78 */ 74 VCPU_REGS_RDX = 2,
75 VCPU_REGS_RBX = 3,
76 VCPU_REGS_RSP = 4,
77 VCPU_REGS_RBP = 5,
78 VCPU_REGS_RSI = 6,
79 VCPU_REGS_RDI = 7,
80#ifdef CONFIG_X86_64
81 VCPU_REGS_R8 = 8,
82 VCPU_REGS_R9 = 9,
83 VCPU_REGS_R10 = 10,
84 VCPU_REGS_R11 = 11,
85 VCPU_REGS_R12 = 12,
86 VCPU_REGS_R13 = 13,
87 VCPU_REGS_R14 = 14,
88 VCPU_REGS_R15 = 15,
89#endif
90 NR_VCPU_REGS
91};
92
93enum {
94 VCPU_SREG_CS,
95 VCPU_SREG_DS,
96 VCPU_SREG_ES,
97 VCPU_SREG_FS,
98 VCPU_SREG_GS,
99 VCPU_SREG_SS,
100 VCPU_SREG_TR,
101 VCPU_SREG_LDTR,
102};
79 103
80typedef unsigned long gva_t; 104#include <asm/kvm_x86_emulate.h>
81typedef u64 gpa_t;
82typedef unsigned long gfn_t;
83 105
84typedef unsigned long hva_t; 106#define KVM_NR_MEM_OBJS 40
85typedef u64 hpa_t; 107
86typedef unsigned long hfn_t; 108/*
109 * We don't want allocation failures within the mmu code, so we preallocate
110 * enough memory for a single page fault in a cache.
111 */
112struct kvm_mmu_memory_cache {
113 int nobjs;
114 void *objects[KVM_NR_MEM_OBJS];
115};
87 116
88#define NR_PTE_CHAIN_ENTRIES 5 117#define NR_PTE_CHAIN_ENTRIES 5
89 118
@@ -99,7 +128,7 @@ struct kvm_pte_chain {
99 * bits 4:7 - page table level for this shadow (1-4) 128 * bits 4:7 - page table level for this shadow (1-4)
100 * bits 8:9 - page table quadrant for 2-level guests 129 * bits 8:9 - page table quadrant for 2-level guests
101 * bit 16 - "metaphysical" - gfn is not a real page (huge page/real mode) 130 * bit 16 - "metaphysical" - gfn is not a real page (huge page/real mode)
102 * bits 17:19 - "access" - the user, writable, and nx bits of a huge page pde 131 * bits 17:19 - common access permissions for all ptes in this shadow page
103 */ 132 */
104union kvm_mmu_page_role { 133union kvm_mmu_page_role {
105 unsigned word; 134 unsigned word;
@@ -109,7 +138,7 @@ union kvm_mmu_page_role {
109 unsigned quadrant : 2; 138 unsigned quadrant : 2;
110 unsigned pad_for_nice_hex_output : 6; 139 unsigned pad_for_nice_hex_output : 6;
111 unsigned metaphysical : 1; 140 unsigned metaphysical : 1;
112 unsigned hugepage_access : 3; 141 unsigned access : 3;
113 }; 142 };
114}; 143};
115 144
@@ -125,6 +154,8 @@ struct kvm_mmu_page {
125 union kvm_mmu_page_role role; 154 union kvm_mmu_page_role role;
126 155
127 u64 *spt; 156 u64 *spt;
157 /* hold the gfn of each spte inside spt */
158 gfn_t *gfns;
128 unsigned long slot_bitmap; /* One bit set per slot which has memory 159 unsigned long slot_bitmap; /* One bit set per slot which has memory
129 * in this shadow page. 160 * in this shadow page.
130 */ 161 */
@@ -136,9 +167,6 @@ struct kvm_mmu_page {
136 }; 167 };
137}; 168};
138 169
139struct kvm_vcpu;
140extern struct kmem_cache *kvm_vcpu_cache;
141
142/* 170/*
143 * x86 supports 3 paging modes (4-level 64-bit, 3-level 64-bit, and 2-level 171 * x86 supports 3 paging modes (4-level 64-bit, 3-level 64-bit, and 2-level
144 * 32-bit). The kvm_mmu structure abstracts the details of the current mmu 172 * 32-bit). The kvm_mmu structure abstracts the details of the current mmu
@@ -149,6 +177,8 @@ struct kvm_mmu {
149 int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err); 177 int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err);
150 void (*free)(struct kvm_vcpu *vcpu); 178 void (*free)(struct kvm_vcpu *vcpu);
151 gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva); 179 gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva);
180 void (*prefetch_page)(struct kvm_vcpu *vcpu,
181 struct kvm_mmu_page *page);
152 hpa_t root_hpa; 182 hpa_t root_hpa;
153 int root_level; 183 int root_level;
154 int shadow_root_level; 184 int shadow_root_level;
@@ -156,159 +186,9 @@ struct kvm_mmu {
156 u64 *pae_root; 186 u64 *pae_root;
157}; 187};
158 188
159#define KVM_NR_MEM_OBJS 20 189struct kvm_vcpu_arch {
160
161struct kvm_mmu_memory_cache {
162 int nobjs;
163 void *objects[KVM_NR_MEM_OBJS];
164};
165
166/*
167 * We don't want allocation failures within the mmu code, so we preallocate
168 * enough memory for a single page fault in a cache.
169 */
170struct kvm_guest_debug {
171 int enabled;
172 unsigned long bp[4];
173 int singlestep;
174};
175
176enum {
177 VCPU_REGS_RAX = 0,
178 VCPU_REGS_RCX = 1,
179 VCPU_REGS_RDX = 2,
180 VCPU_REGS_RBX = 3,
181 VCPU_REGS_RSP = 4,
182 VCPU_REGS_RBP = 5,
183 VCPU_REGS_RSI = 6,
184 VCPU_REGS_RDI = 7,
185#ifdef CONFIG_X86_64
186 VCPU_REGS_R8 = 8,
187 VCPU_REGS_R9 = 9,
188 VCPU_REGS_R10 = 10,
189 VCPU_REGS_R11 = 11,
190 VCPU_REGS_R12 = 12,
191 VCPU_REGS_R13 = 13,
192 VCPU_REGS_R14 = 14,
193 VCPU_REGS_R15 = 15,
194#endif
195 NR_VCPU_REGS
196};
197
198enum {
199 VCPU_SREG_CS,
200 VCPU_SREG_DS,
201 VCPU_SREG_ES,
202 VCPU_SREG_FS,
203 VCPU_SREG_GS,
204 VCPU_SREG_SS,
205 VCPU_SREG_TR,
206 VCPU_SREG_LDTR,
207};
208
209struct kvm_pio_request {
210 unsigned long count;
211 int cur_count;
212 struct page *guest_pages[2];
213 unsigned guest_page_offset;
214 int in;
215 int port;
216 int size;
217 int string;
218 int down;
219 int rep;
220};
221
222struct kvm_stat {
223 u32 pf_fixed;
224 u32 pf_guest;
225 u32 tlb_flush;
226 u32 invlpg;
227
228 u32 exits;
229 u32 io_exits;
230 u32 mmio_exits;
231 u32 signal_exits;
232 u32 irq_window_exits;
233 u32 halt_exits;
234 u32 halt_wakeup;
235 u32 request_irq_exits;
236 u32 irq_exits;
237 u32 light_exits;
238 u32 efer_reload;
239};
240
241struct kvm_io_device {
242 void (*read)(struct kvm_io_device *this,
243 gpa_t addr,
244 int len,
245 void *val);
246 void (*write)(struct kvm_io_device *this,
247 gpa_t addr,
248 int len,
249 const void *val);
250 int (*in_range)(struct kvm_io_device *this, gpa_t addr);
251 void (*destructor)(struct kvm_io_device *this);
252
253 void *private;
254};
255
256static inline void kvm_iodevice_read(struct kvm_io_device *dev,
257 gpa_t addr,
258 int len,
259 void *val)
260{
261 dev->read(dev, addr, len, val);
262}
263
264static inline void kvm_iodevice_write(struct kvm_io_device *dev,
265 gpa_t addr,
266 int len,
267 const void *val)
268{
269 dev->write(dev, addr, len, val);
270}
271
272static inline int kvm_iodevice_inrange(struct kvm_io_device *dev, gpa_t addr)
273{
274 return dev->in_range(dev, addr);
275}
276
277static inline void kvm_iodevice_destructor(struct kvm_io_device *dev)
278{
279 if (dev->destructor)
280 dev->destructor(dev);
281}
282
283/*
284 * It would be nice to use something smarter than a linear search, TBD...
285 * Thankfully we dont expect many devices to register (famous last words :),
286 * so until then it will suffice. At least its abstracted so we can change
287 * in one place.
288 */
289struct kvm_io_bus {
290 int dev_count;
291#define NR_IOBUS_DEVS 6
292 struct kvm_io_device *devs[NR_IOBUS_DEVS];
293};
294
295void kvm_io_bus_init(struct kvm_io_bus *bus);
296void kvm_io_bus_destroy(struct kvm_io_bus *bus);
297struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus, gpa_t addr);
298void kvm_io_bus_register_dev(struct kvm_io_bus *bus,
299 struct kvm_io_device *dev);
300
301struct kvm_vcpu {
302 struct kvm *kvm;
303 struct preempt_notifier preempt_notifier;
304 int vcpu_id;
305 struct mutex mutex;
306 int cpu;
307 u64 host_tsc; 190 u64 host_tsc;
308 struct kvm_run *run;
309 int interrupt_window_open; 191 int interrupt_window_open;
310 int guest_mode;
311 unsigned long requests;
312 unsigned long irq_summary; /* bit vector: 1 per word in irq_pending */ 192 unsigned long irq_summary; /* bit vector: 1 per word in irq_pending */
313 DECLARE_BITMAP(irq_pending, KVM_NR_INTERRUPTS); 193 DECLARE_BITMAP(irq_pending, KVM_NR_INTERRUPTS);
314 unsigned long regs[NR_VCPU_REGS]; /* for rsp: vcpu_load_rsp_rip() */ 194 unsigned long regs[NR_VCPU_REGS]; /* for rsp: vcpu_load_rsp_rip() */
@@ -317,9 +197,6 @@ struct kvm_vcpu {
317 unsigned long cr0; 197 unsigned long cr0;
318 unsigned long cr2; 198 unsigned long cr2;
319 unsigned long cr3; 199 unsigned long cr3;
320 gpa_t para_state_gpa;
321 struct page *para_state_page;
322 gpa_t hypercall_gpa;
323 unsigned long cr4; 200 unsigned long cr4;
324 unsigned long cr8; 201 unsigned long cr8;
325 u64 pdptrs[4]; /* pae */ 202 u64 pdptrs[4]; /* pae */
@@ -334,6 +211,7 @@ struct kvm_vcpu {
334 int mp_state; 211 int mp_state;
335 int sipi_vector; 212 int sipi_vector;
336 u64 ia32_misc_enable_msr; 213 u64 ia32_misc_enable_msr;
214 bool tpr_access_reporting;
337 215
338 struct kvm_mmu mmu; 216 struct kvm_mmu mmu;
339 217
@@ -344,29 +222,26 @@ struct kvm_vcpu {
344 222
345 gfn_t last_pt_write_gfn; 223 gfn_t last_pt_write_gfn;
346 int last_pt_write_count; 224 int last_pt_write_count;
225 u64 *last_pte_updated;
347 226
348 struct kvm_guest_debug guest_debug; 227 struct {
228 gfn_t gfn; /* presumed gfn during guest pte update */
229 struct page *page; /* page corresponding to that gfn */
230 } update_pte;
349 231
350 struct i387_fxsave_struct host_fx_image; 232 struct i387_fxsave_struct host_fx_image;
351 struct i387_fxsave_struct guest_fx_image; 233 struct i387_fxsave_struct guest_fx_image;
352 int fpu_active; 234
353 int guest_fpu_loaded;
354
355 int mmio_needed;
356 int mmio_read_completed;
357 int mmio_is_write;
358 int mmio_size;
359 unsigned char mmio_data[8];
360 gpa_t mmio_phys_addr;
361 gva_t mmio_fault_cr2; 235 gva_t mmio_fault_cr2;
362 struct kvm_pio_request pio; 236 struct kvm_pio_request pio;
363 void *pio_data; 237 void *pio_data;
364 wait_queue_head_t wq;
365 238
366 int sigset_active; 239 struct kvm_queued_exception {
367 sigset_t sigset; 240 bool pending;
368 241 bool has_error_code;
369 struct kvm_stat stat; 242 u8 nr;
243 u32 error_code;
244 } exception;
370 245
371 struct { 246 struct {
372 int active; 247 int active;
@@ -381,7 +256,10 @@ struct kvm_vcpu {
381 int halt_request; /* real mode on Intel only */ 256 int halt_request; /* real mode on Intel only */
382 257
383 int cpuid_nent; 258 int cpuid_nent;
384 struct kvm_cpuid_entry cpuid_entries[KVM_MAX_CPUID_ENTRIES]; 259 struct kvm_cpuid_entry2 cpuid_entries[KVM_MAX_CPUID_ENTRIES];
260 /* emulate context */
261
262 struct x86_emulate_ctxt emulate_ctxt;
385}; 263};
386 264
387struct kvm_mem_alias { 265struct kvm_mem_alias {
@@ -390,51 +268,58 @@ struct kvm_mem_alias {
390 gfn_t target_gfn; 268 gfn_t target_gfn;
391}; 269};
392 270
393struct kvm_memory_slot { 271struct kvm_arch{
394 gfn_t base_gfn;
395 unsigned long npages;
396 unsigned long flags;
397 struct page **phys_mem;
398 unsigned long *dirty_bitmap;
399};
400
401struct kvm {
402 struct mutex lock; /* protects everything except vcpus */
403 int naliases; 272 int naliases;
404 struct kvm_mem_alias aliases[KVM_ALIAS_SLOTS]; 273 struct kvm_mem_alias aliases[KVM_ALIAS_SLOTS];
405 int nmemslots; 274
406 struct kvm_memory_slot memslots[KVM_MEMORY_SLOTS]; 275 unsigned int n_free_mmu_pages;
276 unsigned int n_requested_mmu_pages;
277 unsigned int n_alloc_mmu_pages;
278 struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
407 /* 279 /*
408 * Hash table of struct kvm_mmu_page. 280 * Hash table of struct kvm_mmu_page.
409 */ 281 */
410 struct list_head active_mmu_pages; 282 struct list_head active_mmu_pages;
411 int n_free_mmu_pages;
412 struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
413 struct kvm_vcpu *vcpus[KVM_MAX_VCPUS];
414 unsigned long rmap_overflow;
415 struct list_head vm_list;
416 struct file *filp;
417 struct kvm_io_bus mmio_bus;
418 struct kvm_io_bus pio_bus;
419 struct kvm_pic *vpic; 283 struct kvm_pic *vpic;
420 struct kvm_ioapic *vioapic; 284 struct kvm_ioapic *vioapic;
285
421 int round_robin_prev_vcpu; 286 int round_robin_prev_vcpu;
287 unsigned int tss_addr;
288 struct page *apic_access_page;
422}; 289};
423 290
424static inline struct kvm_pic *pic_irqchip(struct kvm *kvm) 291struct kvm_vm_stat {
425{ 292 u32 mmu_shadow_zapped;
426 return kvm->vpic; 293 u32 mmu_pte_write;
427} 294 u32 mmu_pte_updated;
295 u32 mmu_pde_zapped;
296 u32 mmu_flooded;
297 u32 mmu_recycled;
298 u32 mmu_cache_miss;
299 u32 remote_tlb_flush;
300};
428 301
429static inline struct kvm_ioapic *ioapic_irqchip(struct kvm *kvm) 302struct kvm_vcpu_stat {
430{ 303 u32 pf_fixed;
431 return kvm->vioapic; 304 u32 pf_guest;
432} 305 u32 tlb_flush;
306 u32 invlpg;
433 307
434static inline int irqchip_in_kernel(struct kvm *kvm) 308 u32 exits;
435{ 309 u32 io_exits;
436 return pic_irqchip(kvm) != 0; 310 u32 mmio_exits;
437} 311 u32 signal_exits;
312 u32 irq_window_exits;
313 u32 halt_exits;
314 u32 halt_wakeup;
315 u32 request_irq_exits;
316 u32 irq_exits;
317 u32 host_state_reload;
318 u32 efer_reload;
319 u32 fpu_reload;
320 u32 insn_emulation;
321 u32 insn_emulation_fail;
322};
438 323
439struct descriptor_table { 324struct descriptor_table {
440 u16 limit; 325 u16 limit;
@@ -449,11 +334,12 @@ struct kvm_x86_ops {
449 void (*check_processor_compatibility)(void *rtn); 334 void (*check_processor_compatibility)(void *rtn);
450 int (*hardware_setup)(void); /* __init */ 335 int (*hardware_setup)(void); /* __init */
451 void (*hardware_unsetup)(void); /* __exit */ 336 void (*hardware_unsetup)(void); /* __exit */
337 bool (*cpu_has_accelerated_tpr)(void);
452 338
453 /* Create, but do not attach this VCPU */ 339 /* Create, but do not attach this VCPU */
454 struct kvm_vcpu *(*vcpu_create)(struct kvm *kvm, unsigned id); 340 struct kvm_vcpu *(*vcpu_create)(struct kvm *kvm, unsigned id);
455 void (*vcpu_free)(struct kvm_vcpu *vcpu); 341 void (*vcpu_free)(struct kvm_vcpu *vcpu);
456 void (*vcpu_reset)(struct kvm_vcpu *vcpu); 342 int (*vcpu_reset)(struct kvm_vcpu *vcpu);
457 343
458 void (*prepare_guest_switch)(struct kvm_vcpu *vcpu); 344 void (*prepare_guest_switch)(struct kvm_vcpu *vcpu);
459 void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu); 345 void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu);
@@ -489,10 +375,6 @@ struct kvm_x86_ops {
489 void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags); 375 void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags);
490 376
491 void (*tlb_flush)(struct kvm_vcpu *vcpu); 377 void (*tlb_flush)(struct kvm_vcpu *vcpu);
492 void (*inject_page_fault)(struct kvm_vcpu *vcpu,
493 unsigned long addr, u32 err_code);
494
495 void (*inject_gp)(struct kvm_vcpu *vcpu, unsigned err_code);
496 378
497 void (*run)(struct kvm_vcpu *vcpu, struct kvm_run *run); 379 void (*run)(struct kvm_vcpu *vcpu, struct kvm_run *run);
498 int (*handle_exit)(struct kvm_run *run, struct kvm_vcpu *vcpu); 380 int (*handle_exit)(struct kvm_run *run, struct kvm_vcpu *vcpu);
@@ -501,54 +383,31 @@ struct kvm_x86_ops {
501 unsigned char *hypercall_addr); 383 unsigned char *hypercall_addr);
502 int (*get_irq)(struct kvm_vcpu *vcpu); 384 int (*get_irq)(struct kvm_vcpu *vcpu);
503 void (*set_irq)(struct kvm_vcpu *vcpu, int vec); 385 void (*set_irq)(struct kvm_vcpu *vcpu, int vec);
386 void (*queue_exception)(struct kvm_vcpu *vcpu, unsigned nr,
387 bool has_error_code, u32 error_code);
388 bool (*exception_injected)(struct kvm_vcpu *vcpu);
504 void (*inject_pending_irq)(struct kvm_vcpu *vcpu); 389 void (*inject_pending_irq)(struct kvm_vcpu *vcpu);
505 void (*inject_pending_vectors)(struct kvm_vcpu *vcpu, 390 void (*inject_pending_vectors)(struct kvm_vcpu *vcpu,
506 struct kvm_run *run); 391 struct kvm_run *run);
392
393 int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
507}; 394};
508 395
509extern struct kvm_x86_ops *kvm_x86_ops; 396extern struct kvm_x86_ops *kvm_x86_ops;
510 397
511/* The guest did something we don't support. */
512#define pr_unimpl(vcpu, fmt, ...) \
513 do { \
514 if (printk_ratelimit()) \
515 printk(KERN_ERR "kvm: %i: cpu%i " fmt, \
516 current->tgid, (vcpu)->vcpu_id , ## __VA_ARGS__); \
517 } while(0)
518
519#define kvm_printf(kvm, fmt ...) printk(KERN_DEBUG fmt)
520#define vcpu_printf(vcpu, fmt...) kvm_printf(vcpu->kvm, fmt)
521
522int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id);
523void kvm_vcpu_uninit(struct kvm_vcpu *vcpu);
524
525int kvm_init_x86(struct kvm_x86_ops *ops, unsigned int vcpu_size,
526 struct module *module);
527void kvm_exit_x86(void);
528
529int kvm_mmu_module_init(void); 398int kvm_mmu_module_init(void);
530void kvm_mmu_module_exit(void); 399void kvm_mmu_module_exit(void);
531 400
532void kvm_mmu_destroy(struct kvm_vcpu *vcpu); 401void kvm_mmu_destroy(struct kvm_vcpu *vcpu);
533int kvm_mmu_create(struct kvm_vcpu *vcpu); 402int kvm_mmu_create(struct kvm_vcpu *vcpu);
534int kvm_mmu_setup(struct kvm_vcpu *vcpu); 403int kvm_mmu_setup(struct kvm_vcpu *vcpu);
404void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte);
535 405
536int kvm_mmu_reset_context(struct kvm_vcpu *vcpu); 406int kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
537void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot); 407void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot);
538void kvm_mmu_zap_all(struct kvm *kvm); 408void kvm_mmu_zap_all(struct kvm *kvm);
539 409unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm);
540hpa_t gpa_to_hpa(struct kvm_vcpu *vcpu, gpa_t gpa); 410void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages);
541#define HPA_MSB ((sizeof(hpa_t) * 8) - 1)
542#define HPA_ERR_MASK ((hpa_t)1 << HPA_MSB)
543static inline int is_error_hpa(hpa_t hpa) { return hpa >> HPA_MSB; }
544hpa_t gva_to_hpa(struct kvm_vcpu *vcpu, gva_t gva);
545struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva);
546
547extern hpa_t bad_page_address;
548
549struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn);
550struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn);
551void mark_page_dirty(struct kvm *kvm, gfn_t gfn);
552 411
553enum emulation_result { 412enum emulation_result {
554 EMULATE_DONE, /* no further processing */ 413 EMULATE_DONE, /* no further processing */
@@ -556,8 +415,10 @@ enum emulation_result {
556 EMULATE_FAIL, /* can't emulate this instruction */ 415 EMULATE_FAIL, /* can't emulate this instruction */
557}; 416};
558 417
418#define EMULTYPE_NO_DECODE (1 << 0)
419#define EMULTYPE_TRAP_UD (1 << 1)
559int emulate_instruction(struct kvm_vcpu *vcpu, struct kvm_run *run, 420int emulate_instruction(struct kvm_vcpu *vcpu, struct kvm_run *run,
560 unsigned long cr2, u16 error_code); 421 unsigned long cr2, u16 error_code, int emulation_type);
561 void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context); 422 void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context);
562void realmode_lgdt(struct kvm_vcpu *vcpu, u16 size, unsigned long address); 423void realmode_lgdt(struct kvm_vcpu *vcpu, u16 size, unsigned long address);
563void realmode_lidt(struct kvm_vcpu *vcpu, u16 size, unsigned long address); 424void realmode_lidt(struct kvm_vcpu *vcpu, u16 size, unsigned long address);
@@ -572,7 +433,7 @@ int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
572 433
573struct x86_emulate_ctxt; 434struct x86_emulate_ctxt;
574 435
575int kvm_emulate_pio (struct kvm_vcpu *vcpu, struct kvm_run *run, int in, 436int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
576 int size, unsigned port); 437 int size, unsigned port);
577int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, 438int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
578 int size, unsigned long count, int down, 439 int size, unsigned long count, int down,
@@ -581,7 +442,7 @@ void kvm_emulate_cpuid(struct kvm_vcpu *vcpu);
581int kvm_emulate_halt(struct kvm_vcpu *vcpu); 442int kvm_emulate_halt(struct kvm_vcpu *vcpu);
582int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address); 443int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address);
583int emulate_clts(struct kvm_vcpu *vcpu); 444int emulate_clts(struct kvm_vcpu *vcpu);
584int emulator_get_dr(struct x86_emulate_ctxt* ctxt, int dr, 445int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr,
585 unsigned long *dest); 446 unsigned long *dest);
586int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, 447int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr,
587 unsigned long value); 448 unsigned long value);
@@ -597,15 +458,15 @@ void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l);
597int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata); 458int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata);
598int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data); 459int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data);
599 460
600void fx_init(struct kvm_vcpu *vcpu); 461void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr);
462void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code);
463void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long cr2,
464 u32 error_code);
601 465
602void kvm_resched(struct kvm_vcpu *vcpu); 466void fx_init(struct kvm_vcpu *vcpu);
603void kvm_load_guest_fpu(struct kvm_vcpu *vcpu);
604void kvm_put_guest_fpu(struct kvm_vcpu *vcpu);
605void kvm_flush_remote_tlbs(struct kvm *kvm);
606 467
607int emulator_read_std(unsigned long addr, 468int emulator_read_std(unsigned long addr,
608 void *val, 469 void *val,
609 unsigned int bytes, 470 unsigned int bytes,
610 struct kvm_vcpu *vcpu); 471 struct kvm_vcpu *vcpu);
611int emulator_write_emulated(unsigned long addr, 472int emulator_write_emulated(unsigned long addr,
@@ -615,6 +476,7 @@ int emulator_write_emulated(unsigned long addr,
615 476
616unsigned long segment_base(u16 selector); 477unsigned long segment_base(u16 selector);
617 478
479void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu);
618void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, 480void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
619 const u8 *new, int bytes); 481 const u8 *new, int bytes);
620int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva); 482int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva);
@@ -622,66 +484,14 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu);
622int kvm_mmu_load(struct kvm_vcpu *vcpu); 484int kvm_mmu_load(struct kvm_vcpu *vcpu);
623void kvm_mmu_unload(struct kvm_vcpu *vcpu); 485void kvm_mmu_unload(struct kvm_vcpu *vcpu);
624 486
625int kvm_hypercall(struct kvm_vcpu *vcpu, struct kvm_run *run); 487int kvm_emulate_hypercall(struct kvm_vcpu *vcpu);
626 488
627static inline void kvm_guest_enter(void) 489int kvm_fix_hypercall(struct kvm_vcpu *vcpu);
628{
629 current->flags |= PF_VCPU;
630}
631 490
632static inline void kvm_guest_exit(void) 491int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code);
633{
634 current->flags &= ~PF_VCPU;
635}
636 492
637static inline int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, 493int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3);
638 u32 error_code) 494int complete_pio(struct kvm_vcpu *vcpu);
639{
640 return vcpu->mmu.page_fault(vcpu, gva, error_code);
641}
642
643static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
644{
645 if (unlikely(vcpu->kvm->n_free_mmu_pages < KVM_MIN_FREE_MMU_PAGES))
646 __kvm_mmu_free_some_pages(vcpu);
647}
648
649static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu)
650{
651 if (likely(vcpu->mmu.root_hpa != INVALID_PAGE))
652 return 0;
653
654 return kvm_mmu_load(vcpu);
655}
656
657static inline int is_long_mode(struct kvm_vcpu *vcpu)
658{
659#ifdef CONFIG_X86_64
660 return vcpu->shadow_efer & EFER_LME;
661#else
662 return 0;
663#endif
664}
665
666static inline int is_pae(struct kvm_vcpu *vcpu)
667{
668 return vcpu->cr4 & X86_CR4_PAE;
669}
670
671static inline int is_pse(struct kvm_vcpu *vcpu)
672{
673 return vcpu->cr4 & X86_CR4_PSE;
674}
675
676static inline int is_paging(struct kvm_vcpu *vcpu)
677{
678 return vcpu->cr0 & X86_CR0_PG;
679}
680
681static inline int memslot_id(struct kvm *kvm, struct kvm_memory_slot *slot)
682{
683 return slot - kvm->memslots;
684}
685 495
686static inline struct kvm_mmu_page *page_header(hpa_t shadow_page) 496static inline struct kvm_mmu_page *page_header(hpa_t shadow_page)
687{ 497{
@@ -693,55 +503,55 @@ static inline struct kvm_mmu_page *page_header(hpa_t shadow_page)
693static inline u16 read_fs(void) 503static inline u16 read_fs(void)
694{ 504{
695 u16 seg; 505 u16 seg;
696 asm ("mov %%fs, %0" : "=g"(seg)); 506 asm("mov %%fs, %0" : "=g"(seg));
697 return seg; 507 return seg;
698} 508}
699 509
700static inline u16 read_gs(void) 510static inline u16 read_gs(void)
701{ 511{
702 u16 seg; 512 u16 seg;
703 asm ("mov %%gs, %0" : "=g"(seg)); 513 asm("mov %%gs, %0" : "=g"(seg));
704 return seg; 514 return seg;
705} 515}
706 516
707static inline u16 read_ldt(void) 517static inline u16 read_ldt(void)
708{ 518{
709 u16 ldt; 519 u16 ldt;
710 asm ("sldt %0" : "=g"(ldt)); 520 asm("sldt %0" : "=g"(ldt));
711 return ldt; 521 return ldt;
712} 522}
713 523
714static inline void load_fs(u16 sel) 524static inline void load_fs(u16 sel)
715{ 525{
716 asm ("mov %0, %%fs" : : "rm"(sel)); 526 asm("mov %0, %%fs" : : "rm"(sel));
717} 527}
718 528
719static inline void load_gs(u16 sel) 529static inline void load_gs(u16 sel)
720{ 530{
721 asm ("mov %0, %%gs" : : "rm"(sel)); 531 asm("mov %0, %%gs" : : "rm"(sel));
722} 532}
723 533
724#ifndef load_ldt 534#ifndef load_ldt
725static inline void load_ldt(u16 sel) 535static inline void load_ldt(u16 sel)
726{ 536{
727 asm ("lldt %0" : : "rm"(sel)); 537 asm("lldt %0" : : "rm"(sel));
728} 538}
729#endif 539#endif
730 540
731static inline void get_idt(struct descriptor_table *table) 541static inline void get_idt(struct descriptor_table *table)
732{ 542{
733 asm ("sidt %0" : "=m"(*table)); 543 asm("sidt %0" : "=m"(*table));
734} 544}
735 545
736static inline void get_gdt(struct descriptor_table *table) 546static inline void get_gdt(struct descriptor_table *table)
737{ 547{
738 asm ("sgdt %0" : "=m"(*table)); 548 asm("sgdt %0" : "=m"(*table));
739} 549}
740 550
741static inline unsigned long read_tr_base(void) 551static inline unsigned long read_tr_base(void)
742{ 552{
743 u16 tr; 553 u16 tr;
744 asm ("str %0" : "=g"(tr)); 554 asm("str %0" : "=g"(tr));
745 return segment_base(tr); 555 return segment_base(tr);
746} 556}
747 557
@@ -757,17 +567,17 @@ static inline unsigned long read_msr(unsigned long msr)
757 567
758static inline void fx_save(struct i387_fxsave_struct *image) 568static inline void fx_save(struct i387_fxsave_struct *image)
759{ 569{
760 asm ("fxsave (%0)":: "r" (image)); 570 asm("fxsave (%0)":: "r" (image));
761} 571}
762 572
763static inline void fx_restore(struct i387_fxsave_struct *image) 573static inline void fx_restore(struct i387_fxsave_struct *image)
764{ 574{
765 asm ("fxrstor (%0)":: "r" (image)); 575 asm("fxrstor (%0)":: "r" (image));
766} 576}
767 577
768static inline void fpu_init(void) 578static inline void fpu_init(void)
769{ 579{
770 asm ("finit"); 580 asm("finit");
771} 581}
772 582
773static inline u32 get_rdx_init_val(void) 583static inline u32 get_rdx_init_val(void)
@@ -775,6 +585,11 @@ static inline u32 get_rdx_init_val(void)
775 return 0x600; /* P6 family */ 585 return 0x600; /* P6 family */
776} 586}
777 587
588static inline void kvm_inject_gp(struct kvm_vcpu *vcpu, u32 error_code)
589{
590 kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
591}
592
778#define ASM_VMX_VMCLEAR_RAX ".byte 0x66, 0x0f, 0xc7, 0x30" 593#define ASM_VMX_VMCLEAR_RAX ".byte 0x66, 0x0f, 0xc7, 0x30"
779#define ASM_VMX_VMLAUNCH ".byte 0x0f, 0x01, 0xc2" 594#define ASM_VMX_VMLAUNCH ".byte 0x0f, 0x01, 0xc2"
780#define ASM_VMX_VMRESUME ".byte 0x0f, 0x01, 0xc3" 595#define ASM_VMX_VMRESUME ".byte 0x0f, 0x01, 0xc3"
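
Usage sketch (not part of the patch): the kvm_io_bus declarations near the top of this header feed the in-kernel MMIO/PIO dispatch path. A minimal illustration, assuming the kvm_io_device read/write members that virt/kvm/ioapic.c assigns; the real caller is the emulator glue in x86.c.

#include <linux/kvm_host.h>

static int example_dispatch_mmio(struct kvm_vcpu *vcpu, gpa_t addr, int len,
                                 void *data, int is_write)
{
	struct kvm_io_device *dev;

	/* linear scan over at most NR_IOBUS_DEVS registered devices */
	dev = kvm_io_bus_find_dev(&vcpu->kvm->mmio_bus, addr);
	if (!dev)
		return 0;		/* not handled in kernel: exit to userspace */
	if (is_write)
		dev->write(dev, addr, len, data);
	else
		dev->read(dev, addr, len, data);
	return 1;
}
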
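The inject_page_fault/inject_gp callbacks removed from kvm_x86_ops are replaced by the kvm_queue_exception*() helpers and the per-vcpu exception field. A hedged sketch of the new pattern from common code's point of view; the surrounding check is illustrative only.

#include <linux/kvm_host.h>

static void example_fail_guest_access(struct kvm_vcpu *vcpu, unsigned long addr,
                                      u32 error_code, int is_page_fault)
{
	if (is_page_fault)
		/* queues #PF; addr lands in cr2 when the fault is delivered */
		kvm_inject_page_fault(vcpu, addr, error_code);
	else
		/* kvm_inject_gp() wraps kvm_queue_exception_e(vcpu, GP_VECTOR, ...) */
		kvm_inject_gp(vcpu, error_code);
	/* svm.c/vmx.c deliver the queued exception on the next guest entry */
}
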
diff --git a/include/asm-x86/kvm_para.h b/include/asm-x86/kvm_para.h
new file mode 100644
index 000000000000..c6f3fd8d8c53
--- /dev/null
+++ b/include/asm-x86/kvm_para.h
@@ -0,0 +1,105 @@
1#ifndef __X86_KVM_PARA_H
2#define __X86_KVM_PARA_H
3
4/* This CPUID returns the signature 'KVMKVMKVM' in ebx, ecx, and edx. It
5 * should be used to determine that a VM is running under KVM.
6 */
7#define KVM_CPUID_SIGNATURE 0x40000000
8
9/* This CPUID returns a feature bitmap in eax. Before enabling a particular
10 * paravirtualization, the appropriate feature bit should be checked.
11 */
12#define KVM_CPUID_FEATURES 0x40000001
13
14#ifdef __KERNEL__
15#include <asm/processor.h>
16
17/* This instruction is vmcall. On non-VT architectures, it will generate a
18 * trap that we will then rewrite to the appropriate instruction.
19 */
20#define KVM_HYPERCALL ".byte 0x0f,0x01,0xc1"
21
22/* For KVM hypercalls, a three-byte sequence of either the vmcall or the vmmcall
23 * instruction. The hypervisor may replace it with something else but only the
24 * instructions are guaranteed to be supported.
25 *
26 * Up to four arguments may be passed in rbx, rcx, rdx, and rsi respectively.
27 * The hypercall number should be placed in rax and the return value will be
28 * placed in rax. No other registers will be clobbered unless explicitly
29 * noted by the particular hypercall.
30 */
31
32static inline long kvm_hypercall0(unsigned int nr)
33{
34 long ret;
35 asm volatile(KVM_HYPERCALL
36 : "=a"(ret)
37 : "a"(nr));
38 return ret;
39}
40
41static inline long kvm_hypercall1(unsigned int nr, unsigned long p1)
42{
43 long ret;
44 asm volatile(KVM_HYPERCALL
45 : "=a"(ret)
46 : "a"(nr), "b"(p1));
47 return ret;
48}
49
50static inline long kvm_hypercall2(unsigned int nr, unsigned long p1,
51 unsigned long p2)
52{
53 long ret;
54 asm volatile(KVM_HYPERCALL
55 : "=a"(ret)
56 : "a"(nr), "b"(p1), "c"(p2));
57 return ret;
58}
59
60static inline long kvm_hypercall3(unsigned int nr, unsigned long p1,
61 unsigned long p2, unsigned long p3)
62{
63 long ret;
64 asm volatile(KVM_HYPERCALL
65 : "=a"(ret)
66 : "a"(nr), "b"(p1), "c"(p2), "d"(p3));
67 return ret;
68}
69
70static inline long kvm_hypercall4(unsigned int nr, unsigned long p1,
71 unsigned long p2, unsigned long p3,
72 unsigned long p4)
73{
74 long ret;
75 asm volatile(KVM_HYPERCALL
76 : "=a"(ret)
77 : "a"(nr), "b"(p1), "c"(p2), "d"(p3), "S"(p4));
78 return ret;
79}
80
81static inline int kvm_para_available(void)
82{
83 unsigned int eax, ebx, ecx, edx;
84 char signature[13];
85
86 cpuid(KVM_CPUID_SIGNATURE, &eax, &ebx, &ecx, &edx);
87 memcpy(signature + 0, &ebx, 4);
88 memcpy(signature + 4, &ecx, 4);
89 memcpy(signature + 8, &edx, 4);
90 signature[12] = 0;
91
92 if (strcmp(signature, "KVMKVMKVM") == 0)
93 return 1;
94
95 return 0;
96}
97
98static inline unsigned int kvm_arch_para_features(void)
99{
100 return cpuid_eax(KVM_CPUID_FEATURES);
101}
102
103#endif
104
105#endif
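
Guest-side usage sketch, assuming a kernel context: probe for the KVM signature, then issue a hypercall with the number in rax and the result returned in rax. KVM_HC_VAPIC_POLL_IRQ and KVM_ENOSYS come from linux/kvm_para.h later in this series; whether the host implements a given hypercall is host-dependent.

#include <linux/errno.h>
#include <linux/kvm_para.h>

static int example_probe_and_call(void)
{
	long ret;

	if (!kvm_para_available())		/* CPUID 0x40000000 signature check */
		return -ENODEV;

	ret = kvm_hypercall0(KVM_HC_VAPIC_POLL_IRQ);
	if (ret == -KVM_ENOSYS)			/* host does not know this hypercall */
		return -ENOSYS;
	return 0;
}
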
diff --git a/drivers/kvm/x86_emulate.h b/include/asm-x86/kvm_x86_emulate.h
index 92c73aa7f9ac..7db91b9bdcd4 100644
--- a/drivers/kvm/x86_emulate.h
+++ b/include/asm-x86/kvm_x86_emulate.h
@@ -63,17 +63,6 @@ struct x86_emulate_ops {
63 unsigned int bytes, struct kvm_vcpu *vcpu); 63 unsigned int bytes, struct kvm_vcpu *vcpu);
64 64
65 /* 65 /*
66 * write_std: Write bytes of standard (non-emulated/special) memory.
67 * Used for stack operations, and others.
68 * @addr: [IN ] Linear address to which to write.
69 * @val: [IN ] Value to write to memory (low-order bytes used as
70 * required).
71 * @bytes: [IN ] Number of bytes to write to memory.
72 */
73 int (*write_std)(unsigned long addr, const void *val,
74 unsigned int bytes, struct kvm_vcpu *vcpu);
75
76 /*
77 * read_emulated: Read bytes from emulated/special memory area. 66 * read_emulated: Read bytes from emulated/special memory area.
78 * @addr: [IN ] Linear address from which to read. 67 * @addr: [IN ] Linear address from which to read.
79 * @val: [OUT] Value read from memory, zero-extended to 'u_long'. 68 * @val: [OUT] Value read from memory, zero-extended to 'u_long'.
@@ -112,13 +101,50 @@ struct x86_emulate_ops {
112 101
113}; 102};
114 103
104/* Type, address-of, and value of an instruction's operand. */
105struct operand {
106 enum { OP_REG, OP_MEM, OP_IMM, OP_NONE } type;
107 unsigned int bytes;
108 unsigned long val, orig_val, *ptr;
109};
110
111struct fetch_cache {
112 u8 data[15];
113 unsigned long start;
114 unsigned long end;
115};
116
117struct decode_cache {
118 u8 twobyte;
119 u8 b;
120 u8 lock_prefix;
121 u8 rep_prefix;
122 u8 op_bytes;
123 u8 ad_bytes;
124 u8 rex_prefix;
125 struct operand src;
126 struct operand dst;
127 unsigned long *override_base;
128 unsigned int d;
129 unsigned long regs[NR_VCPU_REGS];
130 unsigned long eip;
131 /* modrm */
132 u8 modrm;
133 u8 modrm_mod;
134 u8 modrm_reg;
135 u8 modrm_rm;
136 u8 use_modrm_ea;
137 unsigned long modrm_ea;
138 unsigned long modrm_val;
139 struct fetch_cache fetch;
140};
141
115struct x86_emulate_ctxt { 142struct x86_emulate_ctxt {
116 /* Register state before/after emulation. */ 143 /* Register state before/after emulation. */
117 struct kvm_vcpu *vcpu; 144 struct kvm_vcpu *vcpu;
118 145
119 /* Linear faulting address (if emulating a page-faulting instruction). */ 146 /* Linear faulting address (if emulating a page-faulting instruction). */
120 unsigned long eflags; 147 unsigned long eflags;
121 unsigned long cr2;
122 148
123 /* Emulated execution mode, represented by an X86EMUL_MODE value. */ 149 /* Emulated execution mode, represented by an X86EMUL_MODE value. */
124 int mode; 150 int mode;
@@ -129,8 +155,16 @@ struct x86_emulate_ctxt {
129 unsigned long ss_base; 155 unsigned long ss_base;
130 unsigned long gs_base; 156 unsigned long gs_base;
131 unsigned long fs_base; 157 unsigned long fs_base;
158
159 /* decode cache */
160
161 struct decode_cache decode;
132}; 162};
133 163
164/* Repeat String Operation Prefix */
165#define REPE_PREFIX 1
166#define REPNE_PREFIX 2
167
134/* Execution mode, passed to the emulator. */ 168/* Execution mode, passed to the emulator. */
135#define X86EMUL_MODE_REAL 0 /* Real mode. */ 169#define X86EMUL_MODE_REAL 0 /* Real mode. */
136#define X86EMUL_MODE_PROT16 2 /* 16-bit protected mode. */ 170#define X86EMUL_MODE_PROT16 2 /* 16-bit protected mode. */
@@ -144,12 +178,9 @@ struct x86_emulate_ctxt {
144#define X86EMUL_MODE_HOST X86EMUL_MODE_PROT64 178#define X86EMUL_MODE_HOST X86EMUL_MODE_PROT64
145#endif 179#endif
146 180
147/* 181int x86_decode_insn(struct x86_emulate_ctxt *ctxt,
148 * x86_emulate_memop: Emulate an instruction that faulted attempting to 182 struct x86_emulate_ops *ops);
149 * read/write a 'special' memory area. 183int x86_emulate_insn(struct x86_emulate_ctxt *ctxt,
150 * Returns -1 on failure, 0 on success. 184 struct x86_emulate_ops *ops);
151 */
152int x86_emulate_memop(struct x86_emulate_ctxt *ctxt,
153 struct x86_emulate_ops *ops);
154 185
155#endif /* __X86_EMULATE_H__ */ 186#endif /* __X86_EMULATE_H__ */
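
Usage sketch of the split interface (not part of the patch): callers now decode once and then execute, instead of calling the removed x86_emulate_memop(). The ops table below is assumed to be the x86_emulate_ops that x86.c supplies; EMULATE_* result handling is omitted.

static int example_emulate_one(struct x86_emulate_ctxt *ctxt,
                               struct x86_emulate_ops *ops)
{
	int rc;

	rc = x86_decode_insn(ctxt, ops);	/* fills ctxt->decode (fetch cache, modrm, operands) */
	if (rc)
		return rc;			/* fetch or decode failure */
	return x86_emulate_insn(ctxt, ops);	/* execute using the decode cache */
}
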
diff --git a/include/linux/Kbuild b/include/linux/Kbuild
index 27b9350052b4..85b2482cc736 100644
--- a/include/linux/Kbuild
+++ b/include/linux/Kbuild
@@ -100,7 +100,6 @@ header-y += iso_fs.h
100header-y += ixjuser.h 100header-y += ixjuser.h
101header-y += jffs2.h 101header-y += jffs2.h
102header-y += keyctl.h 102header-y += keyctl.h
103header-y += kvm.h
104header-y += limits.h 103header-y += limits.h
105header-y += lock_dlm_plock.h 104header-y += lock_dlm_plock.h
106header-y += magic.h 105header-y += magic.h
@@ -256,6 +255,7 @@ unifdef-y += kd.h
256unifdef-y += kernelcapi.h 255unifdef-y += kernelcapi.h
257unifdef-y += kernel.h 256unifdef-y += kernel.h
258unifdef-y += keyboard.h 257unifdef-y += keyboard.h
258unifdef-$(CONFIG_HAVE_KVM) += kvm.h
259unifdef-y += llc.h 259unifdef-y += llc.h
260unifdef-y += loop.h 260unifdef-y += loop.h
261unifdef-y += lp.h 261unifdef-y += lp.h
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 057a7f34ee36..4de4fd2d8607 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -9,12 +9,10 @@
9 9
10#include <asm/types.h> 10#include <asm/types.h>
11#include <linux/ioctl.h> 11#include <linux/ioctl.h>
12#include <asm/kvm.h>
12 13
13#define KVM_API_VERSION 12 14#define KVM_API_VERSION 12
14 15
15/* Architectural interrupt line count. */
16#define KVM_NR_INTERRUPTS 256
17
18/* for KVM_CREATE_MEMORY_REGION */ 16/* for KVM_CREATE_MEMORY_REGION */
19struct kvm_memory_region { 17struct kvm_memory_region {
20 __u32 slot; 18 __u32 slot;
@@ -23,17 +21,19 @@ struct kvm_memory_region {
23 __u64 memory_size; /* bytes */ 21 __u64 memory_size; /* bytes */
24}; 22};
25 23
26/* for kvm_memory_region::flags */ 24/* for KVM_SET_USER_MEMORY_REGION */
27#define KVM_MEM_LOG_DIRTY_PAGES 1UL 25struct kvm_userspace_memory_region {
28 26 __u32 slot;
29struct kvm_memory_alias {
30 __u32 slot; /* this has a different namespace than memory slots */
31 __u32 flags; 27 __u32 flags;
32 __u64 guest_phys_addr; 28 __u64 guest_phys_addr;
33 __u64 memory_size; 29 __u64 memory_size; /* bytes */
34 __u64 target_phys_addr; 30 __u64 userspace_addr; /* start of the userspace allocated memory */
35}; 31};
36 32
33/* for kvm_memory_region::flags */
34#define KVM_MEM_LOG_DIRTY_PAGES 1UL
35
36
37/* for KVM_IRQ_LINE */ 37/* for KVM_IRQ_LINE */
38struct kvm_irq_level { 38struct kvm_irq_level {
39 /* 39 /*
@@ -45,62 +45,18 @@ struct kvm_irq_level {
45 __u32 level; 45 __u32 level;
46}; 46};
47 47
48/* for KVM_GET_IRQCHIP and KVM_SET_IRQCHIP */
49struct kvm_pic_state {
50 __u8 last_irr; /* edge detection */
51 __u8 irr; /* interrupt request register */
52 __u8 imr; /* interrupt mask register */
53 __u8 isr; /* interrupt service register */
54 __u8 priority_add; /* highest irq priority */
55 __u8 irq_base;
56 __u8 read_reg_select;
57 __u8 poll;
58 __u8 special_mask;
59 __u8 init_state;
60 __u8 auto_eoi;
61 __u8 rotate_on_auto_eoi;
62 __u8 special_fully_nested_mode;
63 __u8 init4; /* true if 4 byte init */
64 __u8 elcr; /* PIIX edge/trigger selection */
65 __u8 elcr_mask;
66};
67
68#define KVM_IOAPIC_NUM_PINS 24
69struct kvm_ioapic_state {
70 __u64 base_address;
71 __u32 ioregsel;
72 __u32 id;
73 __u32 irr;
74 __u32 pad;
75 union {
76 __u64 bits;
77 struct {
78 __u8 vector;
79 __u8 delivery_mode:3;
80 __u8 dest_mode:1;
81 __u8 delivery_status:1;
82 __u8 polarity:1;
83 __u8 remote_irr:1;
84 __u8 trig_mode:1;
85 __u8 mask:1;
86 __u8 reserve:7;
87 __u8 reserved[4];
88 __u8 dest_id;
89 } fields;
90 } redirtbl[KVM_IOAPIC_NUM_PINS];
91};
92
93#define KVM_IRQCHIP_PIC_MASTER 0
94#define KVM_IRQCHIP_PIC_SLAVE 1
95#define KVM_IRQCHIP_IOAPIC 2
96 48
97struct kvm_irqchip { 49struct kvm_irqchip {
98 __u32 chip_id; 50 __u32 chip_id;
99 __u32 pad; 51 __u32 pad;
100 union { 52 union {
101 char dummy[512]; /* reserving space */ 53 char dummy[512]; /* reserving space */
54#ifdef CONFIG_X86
102 struct kvm_pic_state pic; 55 struct kvm_pic_state pic;
56#endif
57#if defined(CONFIG_X86) || defined(CONFIG_IA64)
103 struct kvm_ioapic_state ioapic; 58 struct kvm_ioapic_state ioapic;
59#endif
104 } chip; 60 } chip;
105}; 61};
106 62
@@ -116,6 +72,7 @@ struct kvm_irqchip {
116#define KVM_EXIT_FAIL_ENTRY 9 72#define KVM_EXIT_FAIL_ENTRY 9
117#define KVM_EXIT_INTR 10 73#define KVM_EXIT_INTR 10
118#define KVM_EXIT_SET_TPR 11 74#define KVM_EXIT_SET_TPR 11
75#define KVM_EXIT_TPR_ACCESS 12
119 76
120/* for KVM_RUN, returned by mmap(vcpu_fd, offset=0) */ 77/* for KVM_RUN, returned by mmap(vcpu_fd, offset=0) */
121struct kvm_run { 78struct kvm_run {
@@ -174,90 +131,17 @@ struct kvm_run {
174 __u32 longmode; 131 __u32 longmode;
175 __u32 pad; 132 __u32 pad;
176 } hypercall; 133 } hypercall;
134 /* KVM_EXIT_TPR_ACCESS */
135 struct {
136 __u64 rip;
137 __u32 is_write;
138 __u32 pad;
139 } tpr_access;
177 /* Fix the size of the union. */ 140 /* Fix the size of the union. */
178 char padding[256]; 141 char padding[256];
179 }; 142 };
180}; 143};
181 144
182/* for KVM_GET_REGS and KVM_SET_REGS */
183struct kvm_regs {
184 /* out (KVM_GET_REGS) / in (KVM_SET_REGS) */
185 __u64 rax, rbx, rcx, rdx;
186 __u64 rsi, rdi, rsp, rbp;
187 __u64 r8, r9, r10, r11;
188 __u64 r12, r13, r14, r15;
189 __u64 rip, rflags;
190};
191
192/* for KVM_GET_FPU and KVM_SET_FPU */
193struct kvm_fpu {
194 __u8 fpr[8][16];
195 __u16 fcw;
196 __u16 fsw;
197 __u8 ftwx; /* in fxsave format */
198 __u8 pad1;
199 __u16 last_opcode;
200 __u64 last_ip;
201 __u64 last_dp;
202 __u8 xmm[16][16];
203 __u32 mxcsr;
204 __u32 pad2;
205};
206
207/* for KVM_GET_LAPIC and KVM_SET_LAPIC */
208#define KVM_APIC_REG_SIZE 0x400
209struct kvm_lapic_state {
210 char regs[KVM_APIC_REG_SIZE];
211};
212
213struct kvm_segment {
214 __u64 base;
215 __u32 limit;
216 __u16 selector;
217 __u8 type;
218 __u8 present, dpl, db, s, l, g, avl;
219 __u8 unusable;
220 __u8 padding;
221};
222
223struct kvm_dtable {
224 __u64 base;
225 __u16 limit;
226 __u16 padding[3];
227};
228
229/* for KVM_GET_SREGS and KVM_SET_SREGS */
230struct kvm_sregs {
231 /* out (KVM_GET_SREGS) / in (KVM_SET_SREGS) */
232 struct kvm_segment cs, ds, es, fs, gs, ss;
233 struct kvm_segment tr, ldt;
234 struct kvm_dtable gdt, idt;
235 __u64 cr0, cr2, cr3, cr4, cr8;
236 __u64 efer;
237 __u64 apic_base;
238 __u64 interrupt_bitmap[(KVM_NR_INTERRUPTS + 63) / 64];
239};
240
241struct kvm_msr_entry {
242 __u32 index;
243 __u32 reserved;
244 __u64 data;
245};
246
247/* for KVM_GET_MSRS and KVM_SET_MSRS */
248struct kvm_msrs {
249 __u32 nmsrs; /* number of msrs in entries */
250 __u32 pad;
251
252 struct kvm_msr_entry entries[0];
253};
254
255/* for KVM_GET_MSR_INDEX_LIST */
256struct kvm_msr_list {
257 __u32 nmsrs; /* number of msrs in entries */
258 __u32 indices[0];
259};
260
261/* for KVM_TRANSLATE */ 145/* for KVM_TRANSLATE */
262struct kvm_translation { 146struct kvm_translation {
263 /* in */ 147 /* in */
@@ -302,28 +186,24 @@ struct kvm_dirty_log {
302 }; 186 };
303}; 187};
304 188
305struct kvm_cpuid_entry {
306 __u32 function;
307 __u32 eax;
308 __u32 ebx;
309 __u32 ecx;
310 __u32 edx;
311 __u32 padding;
312};
313
314/* for KVM_SET_CPUID */
315struct kvm_cpuid {
316 __u32 nent;
317 __u32 padding;
318 struct kvm_cpuid_entry entries[0];
319};
320
321/* for KVM_SET_SIGNAL_MASK */ 189/* for KVM_SET_SIGNAL_MASK */
322struct kvm_signal_mask { 190struct kvm_signal_mask {
323 __u32 len; 191 __u32 len;
324 __u8 sigset[0]; 192 __u8 sigset[0];
325}; 193};
326 194
195/* for KVM_TPR_ACCESS_REPORTING */
196struct kvm_tpr_access_ctl {
197 __u32 enabled;
198 __u32 flags;
199 __u32 reserved[8];
200};
201
202/* for KVM_SET_VAPIC_ADDR */
203struct kvm_vapic_addr {
204 __u64 vapic_addr;
205};
206
327#define KVMIO 0xAE 207#define KVMIO 0xAE
328 208
329/* 209/*
@@ -347,11 +227,21 @@ struct kvm_signal_mask {
347 */ 227 */
348#define KVM_CAP_IRQCHIP 0 228#define KVM_CAP_IRQCHIP 0
349#define KVM_CAP_HLT 1 229#define KVM_CAP_HLT 1
230#define KVM_CAP_MMU_SHADOW_CACHE_CONTROL 2
231#define KVM_CAP_USER_MEMORY 3
232#define KVM_CAP_SET_TSS_ADDR 4
233#define KVM_CAP_EXT_CPUID 5
234#define KVM_CAP_VAPIC 6
350 235
351/* 236/*
352 * ioctls for VM fds 237 * ioctls for VM fds
353 */ 238 */
354#define KVM_SET_MEMORY_REGION _IOW(KVMIO, 0x40, struct kvm_memory_region) 239#define KVM_SET_MEMORY_REGION _IOW(KVMIO, 0x40, struct kvm_memory_region)
240#define KVM_SET_NR_MMU_PAGES _IO(KVMIO, 0x44)
241#define KVM_GET_NR_MMU_PAGES _IO(KVMIO, 0x45)
242#define KVM_SET_USER_MEMORY_REGION _IOW(KVMIO, 0x46,\
243 struct kvm_userspace_memory_region)
244#define KVM_SET_TSS_ADDR _IO(KVMIO, 0x47)
355/* 245/*
356 * KVM_CREATE_VCPU receives as a parameter the vcpu slot, and returns 246 * KVM_CREATE_VCPU receives as a parameter the vcpu slot, and returns
357 * a vcpu fd. 247 * a vcpu fd.
@@ -359,6 +249,7 @@ struct kvm_signal_mask {
359#define KVM_CREATE_VCPU _IO(KVMIO, 0x41) 249#define KVM_CREATE_VCPU _IO(KVMIO, 0x41)
360#define KVM_GET_DIRTY_LOG _IOW(KVMIO, 0x42, struct kvm_dirty_log) 250#define KVM_GET_DIRTY_LOG _IOW(KVMIO, 0x42, struct kvm_dirty_log)
361#define KVM_SET_MEMORY_ALIAS _IOW(KVMIO, 0x43, struct kvm_memory_alias) 251#define KVM_SET_MEMORY_ALIAS _IOW(KVMIO, 0x43, struct kvm_memory_alias)
252#define KVM_GET_SUPPORTED_CPUID _IOWR(KVMIO, 0x48, struct kvm_cpuid2)
362/* Device model IOC */ 253/* Device model IOC */
363#define KVM_CREATE_IRQCHIP _IO(KVMIO, 0x60) 254#define KVM_CREATE_IRQCHIP _IO(KVMIO, 0x60)
364#define KVM_IRQ_LINE _IOW(KVMIO, 0x61, struct kvm_irq_level) 255#define KVM_IRQ_LINE _IOW(KVMIO, 0x61, struct kvm_irq_level)
@@ -384,5 +275,11 @@ struct kvm_signal_mask {
384#define KVM_SET_FPU _IOW(KVMIO, 0x8d, struct kvm_fpu) 275#define KVM_SET_FPU _IOW(KVMIO, 0x8d, struct kvm_fpu)
385#define KVM_GET_LAPIC _IOR(KVMIO, 0x8e, struct kvm_lapic_state) 276#define KVM_GET_LAPIC _IOR(KVMIO, 0x8e, struct kvm_lapic_state)
386#define KVM_SET_LAPIC _IOW(KVMIO, 0x8f, struct kvm_lapic_state) 277#define KVM_SET_LAPIC _IOW(KVMIO, 0x8f, struct kvm_lapic_state)
278#define KVM_SET_CPUID2 _IOW(KVMIO, 0x90, struct kvm_cpuid2)
279#define KVM_GET_CPUID2 _IOWR(KVMIO, 0x91, struct kvm_cpuid2)
280/* Available with KVM_CAP_VAPIC */
281#define KVM_TPR_ACCESS_REPORTING _IOWR(KVMIO, 0x92, struct kvm_tpr_access_ctl)
282/* Available with KVM_CAP_VAPIC */
283#define KVM_SET_VAPIC_ADDR _IOW(KVMIO, 0x93, struct kvm_vapic_addr)
387 284
388#endif 285#endif
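
Userspace usage sketch for the new memory-slot interface (not part of the patch): a VMM that has confirmed KVM_CAP_USER_MEMORY via KVM_CHECK_EXTENSION registers its own allocation instead of relying on the old KVM_SET_MEMORY_REGION kernel allocation. Error handling and the KVM_CREATE_VM step are assumed.

#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

int example_add_ram(int vm_fd, void *host_mem, unsigned long long size)
{
	struct kvm_userspace_memory_region region;

	memset(&region, 0, sizeof(region));
	region.slot = 0;
	region.flags = KVM_MEM_LOG_DIRTY_PAGES;	/* optional dirty-page tracking */
	region.guest_phys_addr = 0;		/* where the RAM appears to the guest */
	region.memory_size = size;		/* bytes */
	region.userspace_addr = (unsigned long long)(unsigned long)host_mem;
	return ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, &region);
}
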
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
new file mode 100644
index 000000000000..ea4764b0a2f4
--- /dev/null
+++ b/include/linux/kvm_host.h
@@ -0,0 +1,299 @@
1#ifndef __KVM_HOST_H
2#define __KVM_HOST_H
3
4/*
5 * This work is licensed under the terms of the GNU GPL, version 2. See
6 * the COPYING file in the top-level directory.
7 */
8
9#include <linux/types.h>
10#include <linux/hardirq.h>
11#include <linux/list.h>
12#include <linux/mutex.h>
13#include <linux/spinlock.h>
14#include <linux/signal.h>
15#include <linux/sched.h>
16#include <linux/mm.h>
17#include <linux/preempt.h>
18#include <asm/signal.h>
19
20#include <linux/kvm.h>
21#include <linux/kvm_para.h>
22
23#include <linux/kvm_types.h>
24
25#include <asm/kvm_host.h>
26
27#define KVM_MAX_VCPUS 4
28#define KVM_MEMORY_SLOTS 8
29/* memory slots that are not exposed to userspace */
30#define KVM_PRIVATE_MEM_SLOTS 4
31
32#define KVM_PIO_PAGE_OFFSET 1
33
34/*
35 * vcpu->requests bit members
36 */
37#define KVM_REQ_TLB_FLUSH 0
38#define KVM_REQ_MIGRATE_TIMER 1
39#define KVM_REQ_REPORT_TPR_ACCESS 2
40
41struct kvm_vcpu;
42extern struct kmem_cache *kvm_vcpu_cache;
43
44struct kvm_guest_debug {
45 int enabled;
46 unsigned long bp[4];
47 int singlestep;
48};
49
50/*
51 * It would be nice to use something smarter than a linear search, TBD...
52 * Thankfully we don't expect many devices to register (famous last words :),
53 * so until then it will suffice. At least it's abstracted so we can change
54 * in one place.
55 */
56struct kvm_io_bus {
57 int dev_count;
58#define NR_IOBUS_DEVS 6
59 struct kvm_io_device *devs[NR_IOBUS_DEVS];
60};
61
62void kvm_io_bus_init(struct kvm_io_bus *bus);
63void kvm_io_bus_destroy(struct kvm_io_bus *bus);
64struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus, gpa_t addr);
65void kvm_io_bus_register_dev(struct kvm_io_bus *bus,
66 struct kvm_io_device *dev);
67
68struct kvm_vcpu {
69 struct kvm *kvm;
70 struct preempt_notifier preempt_notifier;
71 int vcpu_id;
72 struct mutex mutex;
73 int cpu;
74 struct kvm_run *run;
75 int guest_mode;
76 unsigned long requests;
77 struct kvm_guest_debug guest_debug;
78 int fpu_active;
79 int guest_fpu_loaded;
80 wait_queue_head_t wq;
81 int sigset_active;
82 sigset_t sigset;
83 struct kvm_vcpu_stat stat;
84
85#ifdef CONFIG_HAS_IOMEM
86 int mmio_needed;
87 int mmio_read_completed;
88 int mmio_is_write;
89 int mmio_size;
90 unsigned char mmio_data[8];
91 gpa_t mmio_phys_addr;
92#endif
93
94 struct kvm_vcpu_arch arch;
95};
96
97struct kvm_memory_slot {
98 gfn_t base_gfn;
99 unsigned long npages;
100 unsigned long flags;
101 unsigned long *rmap;
102 unsigned long *dirty_bitmap;
103 unsigned long userspace_addr;
104 int user_alloc;
105};
106
107struct kvm {
108 struct mutex lock; /* protects the vcpus array and APIC accesses */
109 spinlock_t mmu_lock;
110 struct mm_struct *mm; /* userspace tied to this vm */
111 int nmemslots;
112 struct kvm_memory_slot memslots[KVM_MEMORY_SLOTS +
113 KVM_PRIVATE_MEM_SLOTS];
114 struct kvm_vcpu *vcpus[KVM_MAX_VCPUS];
115 struct list_head vm_list;
116 struct file *filp;
117 struct kvm_io_bus mmio_bus;
118 struct kvm_io_bus pio_bus;
119 struct kvm_vm_stat stat;
120 struct kvm_arch arch;
121};
122
123/* The guest did something we don't support. */
124#define pr_unimpl(vcpu, fmt, ...) \
125 do { \
126 if (printk_ratelimit()) \
127 printk(KERN_ERR "kvm: %i: cpu%i " fmt, \
128 current->tgid, (vcpu)->vcpu_id , ## __VA_ARGS__); \
129 } while (0)
130
131#define kvm_printf(kvm, fmt ...) printk(KERN_DEBUG fmt)
132#define vcpu_printf(vcpu, fmt...) kvm_printf(vcpu->kvm, fmt)
133
134int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id);
135void kvm_vcpu_uninit(struct kvm_vcpu *vcpu);
136
137void vcpu_load(struct kvm_vcpu *vcpu);
138void vcpu_put(struct kvm_vcpu *vcpu);
139
140void decache_vcpus_on_cpu(int cpu);
141
142
143int kvm_init(void *opaque, unsigned int vcpu_size,
144 struct module *module);
145void kvm_exit(void);
146
147#define HPA_MSB ((sizeof(hpa_t) * 8) - 1)
148#define HPA_ERR_MASK ((hpa_t)1 << HPA_MSB)
149static inline int is_error_hpa(hpa_t hpa) { return hpa >> HPA_MSB; }
150struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva);
151
152extern struct page *bad_page;
153
154int is_error_page(struct page *page);
155int kvm_is_error_hva(unsigned long addr);
156int kvm_set_memory_region(struct kvm *kvm,
157 struct kvm_userspace_memory_region *mem,
158 int user_alloc);
159int __kvm_set_memory_region(struct kvm *kvm,
160 struct kvm_userspace_memory_region *mem,
161 int user_alloc);
162int kvm_arch_set_memory_region(struct kvm *kvm,
163 struct kvm_userspace_memory_region *mem,
164 struct kvm_memory_slot old,
165 int user_alloc);
166gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn);
167struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn);
168void kvm_release_page_clean(struct page *page);
169void kvm_release_page_dirty(struct page *page);
170int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
171 int len);
172int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data,
173 unsigned long len);
174int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len);
175int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data,
176 int offset, int len);
177int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data,
178 unsigned long len);
179int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len);
180int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len);
181struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn);
182int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn);
183void mark_page_dirty(struct kvm *kvm, gfn_t gfn);
184
185void kvm_vcpu_block(struct kvm_vcpu *vcpu);
186void kvm_resched(struct kvm_vcpu *vcpu);
187void kvm_load_guest_fpu(struct kvm_vcpu *vcpu);
188void kvm_put_guest_fpu(struct kvm_vcpu *vcpu);
189void kvm_flush_remote_tlbs(struct kvm *kvm);
190
191long kvm_arch_dev_ioctl(struct file *filp,
192 unsigned int ioctl, unsigned long arg);
193long kvm_arch_vcpu_ioctl(struct file *filp,
194 unsigned int ioctl, unsigned long arg);
195void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
196void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu);
197
198int kvm_dev_ioctl_check_extension(long ext);
199
200int kvm_get_dirty_log(struct kvm *kvm,
201 struct kvm_dirty_log *log, int *is_dirty);
202int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
203 struct kvm_dirty_log *log);
204
205int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
206 struct
207 kvm_userspace_memory_region *mem,
208 int user_alloc);
209long kvm_arch_vm_ioctl(struct file *filp,
210 unsigned int ioctl, unsigned long arg);
211void kvm_arch_destroy_vm(struct kvm *kvm);
212
213int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu);
214int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu);
215
216int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
217 struct kvm_translation *tr);
218
219int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs);
220int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs);
221int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
222 struct kvm_sregs *sregs);
223int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
224 struct kvm_sregs *sregs);
225int kvm_arch_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu,
226 struct kvm_debug_guest *dbg);
227int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run);
228
229int kvm_arch_init(void *opaque);
230void kvm_arch_exit(void);
231
232int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu);
233void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu);
234
235void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu);
236void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
237void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu);
238struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id);
239int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu);
240void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu);
241
242int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu);
243void kvm_arch_hardware_enable(void *garbage);
244void kvm_arch_hardware_disable(void *garbage);
245int kvm_arch_hardware_setup(void);
246void kvm_arch_hardware_unsetup(void);
247void kvm_arch_check_processor_compat(void *rtn);
248int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu);
249
250void kvm_free_physmem(struct kvm *kvm);
251
252struct kvm *kvm_arch_create_vm(void);
253void kvm_arch_destroy_vm(struct kvm *kvm);
254
255int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
256int kvm_cpu_has_interrupt(struct kvm_vcpu *v);
257void kvm_vcpu_kick(struct kvm_vcpu *vcpu);
258
259static inline void kvm_guest_enter(void)
260{
261 account_system_vtime(current);
262 current->flags |= PF_VCPU;
263}
264
265static inline void kvm_guest_exit(void)
266{
267 account_system_vtime(current);
268 current->flags &= ~PF_VCPU;
269}
270
271static inline int memslot_id(struct kvm *kvm, struct kvm_memory_slot *slot)
272{
273 return slot - kvm->memslots;
274}
275
276static inline gpa_t gfn_to_gpa(gfn_t gfn)
277{
278 return (gpa_t)gfn << PAGE_SHIFT;
279}
280
281static inline void kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
282{
283 set_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests);
284}
285
286enum kvm_stat_kind {
287 KVM_STAT_VM,
288 KVM_STAT_VCPU,
289};
290
291struct kvm_stats_debugfs_item {
292 const char *name;
293 int offset;
294 enum kvm_stat_kind kind;
295 struct dentry *dentry;
296};
297extern struct kvm_stats_debugfs_item debugfs_entries[];
298
299#endif
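
Arch-code usage sketch for the copy helpers declared above (not part of the patch): the guest physical address and the structure layout are illustrative; kvm_write_guest() is expected to mark the touched pages dirty for KVM_GET_DIRTY_LOG.

#include <linux/kvm_host.h>

struct example_blob {			/* illustrative guest-visible layout */
	u32 flags;
	u32 pad;
};

static int example_update_blob(struct kvm *kvm, gpa_t gpa)
{
	struct example_blob blob;
	int r;

	r = kvm_read_guest(kvm, gpa, &blob, sizeof(blob));
	if (r)
		return r;		/* gpa not covered by any memslot */
	blob.flags |= 1;		/* illustrative modification */
	return kvm_write_guest(kvm, gpa, &blob, sizeof(blob));
}
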
diff --git a/include/linux/kvm_para.h b/include/linux/kvm_para.h
index 3b292565a693..5497aac0d2f8 100644
--- a/include/linux/kvm_para.h
+++ b/include/linux/kvm_para.h
@@ -2,72 +2,30 @@
2#define __LINUX_KVM_PARA_H 2#define __LINUX_KVM_PARA_H
3 3
4/* 4/*
5 * Guest OS interface for KVM paravirtualization 5 * This header file provides a method for making a hypercall to the host
6 * 6 * Architectures should define:
7 * Note: this interface is totally experimental, and is certain to change 7 * - kvm_hypercall0, kvm_hypercall1...
8 * as we make progress. 8 * - kvm_arch_para_features
9 * - kvm_para_available
9 */ 10 */
10 11
11/* 12/* Return values for hypercalls */
12 * Per-VCPU descriptor area shared between guest and host. Writable to 13#define KVM_ENOSYS 1000
13 * both guest and host. Registered with the host by the guest when
14 * a guest acknowledges paravirtual mode.
15 *
16 * NOTE: all addresses are guest-physical addresses (gpa), to make it
17 * easier for the hypervisor to map between the various addresses.
18 */
19struct kvm_vcpu_para_state {
20 /*
21 * API version information for compatibility. If there's any support
22 * mismatch (too old host trying to execute too new guest) then
23 * the host will deny entry into paravirtual mode. Any other
24 * combination (new host + old guest and new host + new guest)
25 * is supposed to work - new host versions will support all old
26 * guest API versions.
27 */
28 u32 guest_version;
29 u32 host_version;
30 u32 size;
31 u32 ret;
32
33 /*
34 * The address of the vm exit instruction (VMCALL or VMMCALL),
35 * which the host will patch according to the CPU model the
36 * VM runs on:
37 */
38 u64 hypercall_gpa;
39
40} __attribute__ ((aligned(PAGE_SIZE)));
41
42#define KVM_PARA_API_VERSION 1
43
44/*
45 * This is used for an RDMSR's ECX parameter to probe for a KVM host.
46 * Hopefully no CPU vendor will use up this number. This is placed well
47 * out of way of the typical space occupied by CPU vendors' MSR indices,
48 * and we think (or at least hope) it won't be occupied in the future
49 * either.
50 */
51#define MSR_KVM_API_MAGIC 0x87655678
52 14
53#define KVM_EINVAL 1 15#define KVM_HC_VAPIC_POLL_IRQ 1
54 16
55/* 17/*
56 * Hypercall calling convention: 18 * hypercalls use architecture specific
57 *
58 * Each hypercall may have 0-6 parameters.
59 *
60 * 64-bit hypercall index is in RAX, goes from 0 to __NR_hypercalls-1
61 *
62 * 64-bit parameters 1-6 are in the standard gcc x86_64 calling convention
63 * order: RDI, RSI, RDX, RCX, R8, R9.
64 *
65 * 32-bit index is EBX, parameters are: EAX, ECX, EDX, ESI, EDI, EBP.
66 * (the first 3 are according to the gcc regparm calling convention)
67 *
68 * No registers are clobbered by the hypercall, except that the
69 * return value is in RAX.
70 */ 19 */
71#define __NR_hypercalls 0 20#include <asm/kvm_para.h>
21
22#ifdef __KERNEL__
23static inline int kvm_para_has_feature(unsigned int feature)
24{
25 if (kvm_arch_para_features() & (1UL << feature))
26 return 1;
27 return 0;
28}
29#endif /* __KERNEL__ */
30#endif /* __LINUX_KVM_PARA_H */
72 31
73#endif
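
Guest-side sketch of the feature probe (not part of the patch): no feature bits are defined yet in this series, so the bit below is a placeholder to show how CPUID 0x40000001:EAX is consumed.

#include <linux/kvm_para.h>

#define KVM_FEATURE_EXAMPLE 0		/* placeholder bit, not defined by this patch */

static int example_use_paravirt(void)
{
	if (!kvm_para_available())
		return 0;
	return kvm_para_has_feature(KVM_FEATURE_EXAMPLE);
}
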
diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h
new file mode 100644
index 000000000000..1c4e46decb22
--- /dev/null
+++ b/include/linux/kvm_types.h
@@ -0,0 +1,54 @@
1/*
2 * This program is free software; you can redistribute it and/or modify
3 * it under the terms of the GNU General Public License as published by
4 * the Free Software Foundation; either version 2 of the License.
5 *
6 * This program is distributed in the hope that it will be useful,
7 * but WITHOUT ANY WARRANTY; without even the implied warranty of
8 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
9 * GNU General Public License for more details.
10 *
11 * You should have received a copy of the GNU General Public License
12 * along with this program; if not, write to the Free Software
13 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
14 *
15 */
16
17#ifndef __KVM_TYPES_H__
18#define __KVM_TYPES_H__
19
20#include <asm/types.h>
21
22/*
23 * Address types:
24 *
25 * gva - guest virtual address
26 * gpa - guest physical address
27 * gfn - guest frame number
28 * hva - host virtual address
29 * hpa - host physical address
30 * hfn - host frame number
31 */
32
33typedef unsigned long gva_t;
34typedef u64 gpa_t;
35typedef unsigned long gfn_t;
36
37typedef unsigned long hva_t;
38typedef u64 hpa_t;
39typedef unsigned long hfn_t;
40
41struct kvm_pio_request {
42 unsigned long count;
43 int cur_count;
44 struct page *guest_pages[2];
45 unsigned guest_page_offset;
46 int in;
47 int port;
48 int size;
49 int string;
50 int down;
51 int rep;
52};
53
54#endif /* __KVM_TYPES_H__ */
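
The address types above follow the usual page arithmetic; gfn_to_gpa() shown earlier in linux/kvm_host.h goes one way, and the inverse helpers here are illustrative only.

#include <linux/kvm_types.h>
#include <asm/page.h>

static inline gfn_t example_gpa_to_gfn(gpa_t gpa)
{
	return (gfn_t)(gpa >> PAGE_SHIFT);	/* guest frame number */
}

static inline unsigned int example_gpa_offset(gpa_t gpa)
{
	return gpa & (PAGE_SIZE - 1);		/* byte offset inside the frame */
}
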
diff --git a/kernel/fork.c b/kernel/fork.c
index 314f5101d2b0..05e0b6f4365b 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -393,6 +393,7 @@ void fastcall __mmdrop(struct mm_struct *mm)
393 destroy_context(mm); 393 destroy_context(mm);
394 free_mm(mm); 394 free_mm(mm);
395} 395}
396EXPORT_SYMBOL_GPL(__mmdrop);
396 397
397/* 398/*
398 * Decrement the use count and release all resources for an mm. 399 * Decrement the use count and release all resources for an mm.
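
The __mmdrop export exists so a module can hold an mm_struct for the lifetime of a VM (struct kvm gained an mm field above). A hedged sketch of the refcount pattern; the actual kvm_main.c code is not shown in this hunk.

#include <linux/sched.h>

struct example_vm {
	struct mm_struct *mm;
};

static void example_vm_create(struct example_vm *vm)
{
	atomic_inc(&current->mm->mm_count);	/* pin the creating process's mm */
	vm->mm = current->mm;
}

static void example_vm_destroy(struct example_vm *vm)
{
	mmdrop(vm->mm);		/* inline wrapper; calls __mmdrop() on the last reference */
}
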
diff --git a/drivers/kvm/ioapic.c b/virt/kvm/ioapic.c
index c7992e667fdb..317f8e211cd2 100644
--- a/drivers/kvm/ioapic.c
+++ b/virt/kvm/ioapic.c
@@ -26,7 +26,7 @@
26 * Based on Xen 3.1 code. 26 * Based on Xen 3.1 code.
27 */ 27 */
28 28
29#include "kvm.h" 29#include <linux/kvm_host.h>
30#include <linux/kvm.h> 30#include <linux/kvm.h>
31#include <linux/mm.h> 31#include <linux/mm.h>
32#include <linux/highmem.h> 32#include <linux/highmem.h>
@@ -34,14 +34,17 @@
34#include <linux/hrtimer.h> 34#include <linux/hrtimer.h>
35#include <linux/io.h> 35#include <linux/io.h>
36#include <asm/processor.h> 36#include <asm/processor.h>
37#include <asm/msr.h>
38#include <asm/page.h> 37#include <asm/page.h>
39#include <asm/current.h> 38#include <asm/current.h>
40#include <asm/apicdef.h> 39
41#include <asm/io_apic.h> 40#include "ioapic.h"
42#include "irq.h" 41#include "lapic.h"
43/* #define ioapic_debug(fmt,arg...) printk(KERN_WARNING fmt,##arg) */ 42
43#if 0
44#define ioapic_debug(fmt,arg...) printk(KERN_WARNING fmt,##arg)
45#else
44#define ioapic_debug(fmt, arg...) 46#define ioapic_debug(fmt, arg...)
47#endif
45static void ioapic_deliver(struct kvm_ioapic *vioapic, int irq); 48static void ioapic_deliver(struct kvm_ioapic *vioapic, int irq);
46 49
47static unsigned long ioapic_read_indirect(struct kvm_ioapic *ioapic, 50static unsigned long ioapic_read_indirect(struct kvm_ioapic *ioapic,
@@ -113,7 +116,7 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val)
113 default: 116 default:
114 index = (ioapic->ioregsel - 0x10) >> 1; 117 index = (ioapic->ioregsel - 0x10) >> 1;
115 118
116 ioapic_debug("change redir index %x val %x", index, val); 119 ioapic_debug("change redir index %x val %x\n", index, val);
117 if (index >= IOAPIC_NUM_PINS) 120 if (index >= IOAPIC_NUM_PINS)
118 return; 121 return;
119 if (ioapic->ioregsel & 1) { 122 if (ioapic->ioregsel & 1) {
@@ -131,16 +134,16 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val)
131} 134}
132 135
133static void ioapic_inj_irq(struct kvm_ioapic *ioapic, 136static void ioapic_inj_irq(struct kvm_ioapic *ioapic,
134 struct kvm_lapic *target, 137 struct kvm_vcpu *vcpu,
135 u8 vector, u8 trig_mode, u8 delivery_mode) 138 u8 vector, u8 trig_mode, u8 delivery_mode)
136{ 139{
137 ioapic_debug("irq %d trig %d deliv %d", vector, trig_mode, 140 ioapic_debug("irq %d trig %d deliv %d\n", vector, trig_mode,
138 delivery_mode); 141 delivery_mode);
139 142
140 ASSERT((delivery_mode == dest_Fixed) || 143 ASSERT((delivery_mode == IOAPIC_FIXED) ||
141 (delivery_mode == dest_LowestPrio)); 144 (delivery_mode == IOAPIC_LOWEST_PRIORITY));
142 145
143 kvm_apic_set_irq(target, vector, trig_mode); 146 kvm_apic_set_irq(vcpu, vector, trig_mode);
144} 147}
145 148
146static u32 ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest, 149static u32 ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest,
@@ -151,12 +154,12 @@ static u32 ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest,
151 struct kvm *kvm = ioapic->kvm; 154 struct kvm *kvm = ioapic->kvm;
152 struct kvm_vcpu *vcpu; 155 struct kvm_vcpu *vcpu;
153 156
154 ioapic_debug("dest %d dest_mode %d", dest, dest_mode); 157 ioapic_debug("dest %d dest_mode %d\n", dest, dest_mode);
155 158
156 if (dest_mode == 0) { /* Physical mode. */ 159 if (dest_mode == 0) { /* Physical mode. */
157 if (dest == 0xFF) { /* Broadcast. */ 160 if (dest == 0xFF) { /* Broadcast. */
158 for (i = 0; i < KVM_MAX_VCPUS; ++i) 161 for (i = 0; i < KVM_MAX_VCPUS; ++i)
159 if (kvm->vcpus[i] && kvm->vcpus[i]->apic) 162 if (kvm->vcpus[i] && kvm->vcpus[i]->arch.apic)
160 mask |= 1 << i; 163 mask |= 1 << i;
161 return mask; 164 return mask;
162 } 165 }
@@ -164,8 +167,8 @@ static u32 ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest,
164 vcpu = kvm->vcpus[i]; 167 vcpu = kvm->vcpus[i];
165 if (!vcpu) 168 if (!vcpu)
166 continue; 169 continue;
167 if (kvm_apic_match_physical_addr(vcpu->apic, dest)) { 170 if (kvm_apic_match_physical_addr(vcpu->arch.apic, dest)) {
168 if (vcpu->apic) 171 if (vcpu->arch.apic)
169 mask = 1 << i; 172 mask = 1 << i;
170 break; 173 break;
171 } 174 }
@@ -175,11 +178,11 @@ static u32 ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest,
175 vcpu = kvm->vcpus[i]; 178 vcpu = kvm->vcpus[i];
176 if (!vcpu) 179 if (!vcpu)
177 continue; 180 continue;
178 if (vcpu->apic && 181 if (vcpu->arch.apic &&
179 kvm_apic_match_logical_addr(vcpu->apic, dest)) 182 kvm_apic_match_logical_addr(vcpu->arch.apic, dest))
180 mask |= 1 << vcpu->vcpu_id; 183 mask |= 1 << vcpu->vcpu_id;
181 } 184 }
182 ioapic_debug("mask %x", mask); 185 ioapic_debug("mask %x\n", mask);
183 return mask; 186 return mask;
184} 187}
185 188
@@ -191,41 +194,39 @@ static void ioapic_deliver(struct kvm_ioapic *ioapic, int irq)
191 u8 vector = ioapic->redirtbl[irq].fields.vector; 194 u8 vector = ioapic->redirtbl[irq].fields.vector;
192 u8 trig_mode = ioapic->redirtbl[irq].fields.trig_mode; 195 u8 trig_mode = ioapic->redirtbl[irq].fields.trig_mode;
193 u32 deliver_bitmask; 196 u32 deliver_bitmask;
194 struct kvm_lapic *target;
195 struct kvm_vcpu *vcpu; 197 struct kvm_vcpu *vcpu;
196 int vcpu_id; 198 int vcpu_id;
197 199
198 ioapic_debug("dest=%x dest_mode=%x delivery_mode=%x " 200 ioapic_debug("dest=%x dest_mode=%x delivery_mode=%x "
199 "vector=%x trig_mode=%x", 201 "vector=%x trig_mode=%x\n",
200 dest, dest_mode, delivery_mode, vector, trig_mode); 202 dest, dest_mode, delivery_mode, vector, trig_mode);
201 203
202 deliver_bitmask = ioapic_get_delivery_bitmask(ioapic, dest, dest_mode); 204 deliver_bitmask = ioapic_get_delivery_bitmask(ioapic, dest, dest_mode);
203 if (!deliver_bitmask) { 205 if (!deliver_bitmask) {
204 ioapic_debug("no target on destination"); 206 ioapic_debug("no target on destination\n");
205 return; 207 return;
206 } 208 }
207 209
208 switch (delivery_mode) { 210 switch (delivery_mode) {
209 case dest_LowestPrio: 211 case IOAPIC_LOWEST_PRIORITY:
210 target = 212 vcpu = kvm_get_lowest_prio_vcpu(ioapic->kvm, vector,
211 kvm_apic_round_robin(ioapic->kvm, vector, deliver_bitmask); 213 deliver_bitmask);
212 if (target != NULL) 214 if (vcpu != NULL)
213 ioapic_inj_irq(ioapic, target, vector, 215 ioapic_inj_irq(ioapic, vcpu, vector,
214 trig_mode, delivery_mode); 216 trig_mode, delivery_mode);
215 else 217 else
216 ioapic_debug("null round robin: " 218 ioapic_debug("null lowest prio vcpu: "
217 "mask=%x vector=%x delivery_mode=%x", 219 "mask=%x vector=%x delivery_mode=%x\n",
218 deliver_bitmask, vector, dest_LowestPrio); 220 deliver_bitmask, vector, IOAPIC_LOWEST_PRIORITY);
219 break; 221 break;
220 case dest_Fixed: 222 case IOAPIC_FIXED:
221 for (vcpu_id = 0; deliver_bitmask != 0; vcpu_id++) { 223 for (vcpu_id = 0; deliver_bitmask != 0; vcpu_id++) {
222 if (!(deliver_bitmask & (1 << vcpu_id))) 224 if (!(deliver_bitmask & (1 << vcpu_id)))
223 continue; 225 continue;
224 deliver_bitmask &= ~(1 << vcpu_id); 226 deliver_bitmask &= ~(1 << vcpu_id);
225 vcpu = ioapic->kvm->vcpus[vcpu_id]; 227 vcpu = ioapic->kvm->vcpus[vcpu_id];
226 if (vcpu) { 228 if (vcpu) {
227 target = vcpu->apic; 229 ioapic_inj_irq(ioapic, vcpu, vector,
228 ioapic_inj_irq(ioapic, target, vector,
229 trig_mode, delivery_mode); 230 trig_mode, delivery_mode);
230 } 231 }
231 } 232 }
@@ -271,7 +272,7 @@ static int get_eoi_gsi(struct kvm_ioapic *ioapic, int vector)
271 272
272void kvm_ioapic_update_eoi(struct kvm *kvm, int vector) 273void kvm_ioapic_update_eoi(struct kvm *kvm, int vector)
273{ 274{
274 struct kvm_ioapic *ioapic = kvm->vioapic; 275 struct kvm_ioapic *ioapic = kvm->arch.vioapic;
275 union ioapic_redir_entry *ent; 276 union ioapic_redir_entry *ent;
276 int gsi; 277 int gsi;
277 278
@@ -304,7 +305,7 @@ static void ioapic_mmio_read(struct kvm_io_device *this, gpa_t addr, int len,
304 struct kvm_ioapic *ioapic = (struct kvm_ioapic *)this->private; 305 struct kvm_ioapic *ioapic = (struct kvm_ioapic *)this->private;
305 u32 result; 306 u32 result;
306 307
307 ioapic_debug("addr %lx", (unsigned long)addr); 308 ioapic_debug("addr %lx\n", (unsigned long)addr);
308 ASSERT(!(addr & 0xf)); /* check alignment */ 309 ASSERT(!(addr & 0xf)); /* check alignment */
309 310
310 addr &= 0xff; 311 addr &= 0xff;
@@ -341,8 +342,8 @@ static void ioapic_mmio_write(struct kvm_io_device *this, gpa_t addr, int len,
341 struct kvm_ioapic *ioapic = (struct kvm_ioapic *)this->private; 342 struct kvm_ioapic *ioapic = (struct kvm_ioapic *)this->private;
342 u32 data; 343 u32 data;
343 344
344 ioapic_debug("ioapic_mmio_write addr=%lx len=%d val=%p\n", 345 ioapic_debug("ioapic_mmio_write addr=%p len=%d val=%p\n",
345 addr, len, val); 346 (void*)addr, len, val);
346 ASSERT(!(addr & 0xf)); /* check alignment */ 347 ASSERT(!(addr & 0xf)); /* check alignment */
347 if (len == 4 || len == 8) 348 if (len == 4 || len == 8)
348 data = *(u32 *) val; 349 data = *(u32 *) val;
@@ -360,24 +361,38 @@ static void ioapic_mmio_write(struct kvm_io_device *this, gpa_t addr, int len,
360 case IOAPIC_REG_WINDOW: 361 case IOAPIC_REG_WINDOW:
361 ioapic_write_indirect(ioapic, data); 362 ioapic_write_indirect(ioapic, data);
362 break; 363 break;
364#ifdef CONFIG_IA64
365 case IOAPIC_REG_EOI:
366 kvm_ioapic_update_eoi(ioapic->kvm, data);
367 break;
368#endif
363 369
364 default: 370 default:
365 break; 371 break;
366 } 372 }
367} 373}
368 374
375void kvm_ioapic_reset(struct kvm_ioapic *ioapic)
376{
377 int i;
378
379 for (i = 0; i < IOAPIC_NUM_PINS; i++)
380 ioapic->redirtbl[i].fields.mask = 1;
381 ioapic->base_address = IOAPIC_DEFAULT_BASE_ADDRESS;
382 ioapic->ioregsel = 0;
383 ioapic->irr = 0;
384 ioapic->id = 0;
385}
386
369int kvm_ioapic_init(struct kvm *kvm) 387int kvm_ioapic_init(struct kvm *kvm)
370{ 388{
371 struct kvm_ioapic *ioapic; 389 struct kvm_ioapic *ioapic;
372 int i;
373 390
374 ioapic = kzalloc(sizeof(struct kvm_ioapic), GFP_KERNEL); 391 ioapic = kzalloc(sizeof(struct kvm_ioapic), GFP_KERNEL);
375 if (!ioapic) 392 if (!ioapic)
376 return -ENOMEM; 393 return -ENOMEM;
377 kvm->vioapic = ioapic; 394 kvm->arch.vioapic = ioapic;
378 for (i = 0; i < IOAPIC_NUM_PINS; i++) 395 kvm_ioapic_reset(ioapic);
379 ioapic->redirtbl[i].fields.mask = 1;
380 ioapic->base_address = IOAPIC_DEFAULT_BASE_ADDRESS;
381 ioapic->dev.read = ioapic_mmio_read; 396 ioapic->dev.read = ioapic_mmio_read;
382 ioapic->dev.write = ioapic_mmio_write; 397 ioapic->dev.write = ioapic_mmio_write;
383 ioapic->dev.in_range = ioapic_in_range; 398 ioapic->dev.in_range = ioapic_in_range;
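
Illustration (not from the patch): the IOAPIC_FIXED branch above walks the delivery bitmask one set bit at a time, clearing each bit as it injects the vector into the matching vcpu. A minimal standalone sketch of that walk, with a made-up inject() helper standing in for ioapic_inj_irq():

/* Sketch only -- mirrors the IOAPIC_FIXED loop above; inject() is a
 * made-up stand-in for ioapic_inj_irq(). */
#include <stdio.h>

static void inject(int vcpu_id, unsigned char vector)
{
	printf("deliver vector %#x to vcpu %d\n", (unsigned)vector, vcpu_id);
}

static void deliver_fixed(unsigned int deliver_bitmask, unsigned char vector)
{
	int vcpu_id;

	/* Visit each set bit, clearing it as it is consumed. */
	for (vcpu_id = 0; deliver_bitmask != 0; vcpu_id++) {
		if (!(deliver_bitmask & (1u << vcpu_id)))
			continue;
		deliver_bitmask &= ~(1u << vcpu_id);
		inject(vcpu_id, vector);
	}
}

int main(void)
{
	deliver_fixed(0x5, 0x30);	/* bits 0 and 2 -> vcpus 0 and 2 */
	return 0;
}
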
diff --git a/virt/kvm/ioapic.h b/virt/kvm/ioapic.h
new file mode 100644
index 000000000000..7f16675fe783
--- /dev/null
+++ b/virt/kvm/ioapic.h
@@ -0,0 +1,95 @@
1#ifndef __KVM_IO_APIC_H
2#define __KVM_IO_APIC_H
3
4#include <linux/kvm_host.h>
5
6#include "iodev.h"
7
8struct kvm;
9struct kvm_vcpu;
10
11#define IOAPIC_NUM_PINS KVM_IOAPIC_NUM_PINS
12#define IOAPIC_VERSION_ID 0x11 /* IOAPIC version */
13#define IOAPIC_EDGE_TRIG 0
14#define IOAPIC_LEVEL_TRIG 1
15
16#define IOAPIC_DEFAULT_BASE_ADDRESS 0xfec00000
17#define IOAPIC_MEM_LENGTH 0x100
18
19/* Direct registers. */
20#define IOAPIC_REG_SELECT 0x00
21#define IOAPIC_REG_WINDOW 0x10
22#define IOAPIC_REG_EOI 0x40 /* IA64 IOSAPIC only */
23
24/* Indirect registers. */
25#define IOAPIC_REG_APIC_ID 0x00 /* x86 IOAPIC only */
26#define IOAPIC_REG_VERSION 0x01
27#define IOAPIC_REG_ARB_ID 0x02 /* x86 IOAPIC only */
28
29/*ioapic delivery mode*/
30#define IOAPIC_FIXED 0x0
31#define IOAPIC_LOWEST_PRIORITY 0x1
32#define IOAPIC_PMI 0x2
33#define IOAPIC_NMI 0x4
34#define IOAPIC_INIT 0x5
35#define IOAPIC_EXTINT 0x7
36
37struct kvm_ioapic {
38 u64 base_address;
39 u32 ioregsel;
40 u32 id;
41 u32 irr;
42 u32 pad;
43 union ioapic_redir_entry {
44 u64 bits;
45 struct {
46 u8 vector;
47 u8 delivery_mode:3;
48 u8 dest_mode:1;
49 u8 delivery_status:1;
50 u8 polarity:1;
51 u8 remote_irr:1;
52 u8 trig_mode:1;
53 u8 mask:1;
54 u8 reserve:7;
55 u8 reserved[4];
56 u8 dest_id;
57 } fields;
58 } redirtbl[IOAPIC_NUM_PINS];
59 struct kvm_io_device dev;
60 struct kvm *kvm;
61};
62
63#ifdef DEBUG
64#define ASSERT(x) \
65do { \
66 if (!(x)) { \
67 printk(KERN_EMERG "assertion failed %s: %d: %s\n", \
68 __FILE__, __LINE__, #x); \
69 BUG(); \
70 } \
71} while (0)
72#else
73#define ASSERT(x) do { } while (0)
74#endif
75
76static inline struct kvm_ioapic *ioapic_irqchip(struct kvm *kvm)
77{
78 return kvm->arch.vioapic;
79}
80
81#ifdef CONFIG_IA64
82static inline int irqchip_in_kernel(struct kvm *kvm)
83{
84 return 1;
85}
86#endif
87
88struct kvm_vcpu *kvm_get_lowest_prio_vcpu(struct kvm *kvm, u8 vector,
89 unsigned long bitmap);
90void kvm_ioapic_update_eoi(struct kvm *kvm, int vector);
91int kvm_ioapic_init(struct kvm *kvm);
92void kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level);
93void kvm_ioapic_reset(struct kvm_ioapic *ioapic);
94
95#endif
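
Illustration (not from the patch): the redirection-table entry above overlays named bitfields on a 64-bit register, and kvm_ioapic_reset() masks every pin by setting fields.mask = 1. A small userspace sketch of the same union, assuming GCC's usual little-endian bitfield layout, showing that the mask bit lands at bit 16 of the register:

/* Sketch only -- a userspace copy of the redirection table entry,
 * with u8/u64 replaced by stdint types; bitfield layout as laid out
 * by GCC on little-endian x86. */
#include <stdint.h>
#include <stdio.h>

union redir_entry {
	uint64_t bits;
	struct {
		uint8_t vector;
		uint8_t delivery_mode:3;
		uint8_t dest_mode:1;
		uint8_t delivery_status:1;
		uint8_t polarity:1;
		uint8_t remote_irr:1;
		uint8_t trig_mode:1;
		uint8_t mask:1;
		uint8_t reserve:7;
		uint8_t reserved[4];
		uint8_t dest_id;
	} fields;
};

int main(void)
{
	union redir_entry e = { .bits = 0 };

	e.fields.mask = 1;	/* what kvm_ioapic_reset() does for every pin */
	printf("sizeof(entry) = %zu, bits = %#llx\n",
	       sizeof(e), (unsigned long long)e.bits);	/* expect 8, 0x10000 */
	return 0;
}
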
diff --git a/virt/kvm/iodev.h b/virt/kvm/iodev.h
new file mode 100644
index 000000000000..c14e642027b2
--- /dev/null
+++ b/virt/kvm/iodev.h
@@ -0,0 +1,63 @@
1/*
2 * This program is free software; you can redistribute it and/or modify
3 * it under the terms of the GNU General Public License as published by
4 * the Free Software Foundation; either version 2 of the License.
5 *
6 * This program is distributed in the hope that it will be useful,
7 * but WITHOUT ANY WARRANTY; without even the implied warranty of
8 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
9 * GNU General Public License for more details.
10 *
11 * You should have received a copy of the GNU General Public License
12 * along with this program; if not, write to the Free Software
13 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
14 */
15
16#ifndef __KVM_IODEV_H__
17#define __KVM_IODEV_H__
18
19#include <linux/kvm_types.h>
20
21struct kvm_io_device {
22 void (*read)(struct kvm_io_device *this,
23 gpa_t addr,
24 int len,
25 void *val);
26 void (*write)(struct kvm_io_device *this,
27 gpa_t addr,
28 int len,
29 const void *val);
30 int (*in_range)(struct kvm_io_device *this, gpa_t addr);
31 void (*destructor)(struct kvm_io_device *this);
32
33 void *private;
34};
35
36static inline void kvm_iodevice_read(struct kvm_io_device *dev,
37 gpa_t addr,
38 int len,
39 void *val)
40{
41 dev->read(dev, addr, len, val);
42}
43
44static inline void kvm_iodevice_write(struct kvm_io_device *dev,
45 gpa_t addr,
46 int len,
47 const void *val)
48{
49 dev->write(dev, addr, len, val);
50}
51
52static inline int kvm_iodevice_inrange(struct kvm_io_device *dev, gpa_t addr)
53{
54 return dev->in_range(dev, addr);
55}
56
57static inline void kvm_iodevice_destructor(struct kvm_io_device *dev)
58{
59 if (dev->destructor)
60 dev->destructor(dev);
61}
62
63#endif /* __KVM_IODEV_H__ */
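
Illustration (not from the patch): an MMIO device plugs into KVM's I/O bus by filling in the read/write/in_range callbacks of struct kvm_io_device and keeping its state behind the private pointer, just as ioapic.c does above for ioapic->dev. A self-contained sketch of such a device; gpa_t is typedef'd locally and the address window is arbitrary, chosen only for the example:

/* Sketch only -- in the kernel, gpa_t comes from linux/kvm_types.h and
 * the window from the device's guest-physical placement. */
#include <stdint.h>
#include <string.h>
#include <stdio.h>

typedef uint64_t gpa_t;

struct kvm_io_device {
	void (*read)(struct kvm_io_device *this, gpa_t addr, int len, void *val);
	void (*write)(struct kvm_io_device *this, gpa_t addr, int len,
		      const void *val);
	int (*in_range)(struct kvm_io_device *this, gpa_t addr);
	void (*destructor)(struct kvm_io_device *this);
	void *private;
};

#define SCRATCH_BASE	0xd0000000ull	/* arbitrary MMIO window for the sketch */
#define SCRATCH_LEN	0x100

struct scratch_dev {
	uint8_t regs[SCRATCH_LEN];
	struct kvm_io_device dev;
};

static void scratch_read(struct kvm_io_device *this, gpa_t addr, int len,
			 void *val)
{
	struct scratch_dev *s = this->private;

	memcpy(val, s->regs + (addr - SCRATCH_BASE), len);
}

static void scratch_write(struct kvm_io_device *this, gpa_t addr, int len,
			  const void *val)
{
	struct scratch_dev *s = this->private;

	memcpy(s->regs + (addr - SCRATCH_BASE), val, len);
}

static int scratch_in_range(struct kvm_io_device *this, gpa_t addr)
{
	return addr >= SCRATCH_BASE && addr < SCRATCH_BASE + SCRATCH_LEN;
}

int main(void)
{
	static struct scratch_dev s;
	uint32_t in = 0x12345678, out = 0;

	/* Wire up the callbacks, the way ioapic.c fills in ioapic->dev. */
	s.dev.read = scratch_read;
	s.dev.write = scratch_write;
	s.dev.in_range = scratch_in_range;
	s.dev.private = &s;

	if (s.dev.in_range(&s.dev, SCRATCH_BASE + 0x10)) {
		s.dev.write(&s.dev, SCRATCH_BASE + 0x10, sizeof(in), &in);
		s.dev.read(&s.dev, SCRATCH_BASE + 0x10, sizeof(out), &out);
	}
	printf("read back %#x\n", out);
	return 0;
}
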
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
new file mode 100644
index 000000000000..3c4fe26096fc
--- /dev/null
+++ b/virt/kvm/kvm_main.c
@@ -0,0 +1,1400 @@
1/*
2 * Kernel-based Virtual Machine driver for Linux
3 *
4 * This module enables machines with Intel VT-x extensions to run virtual
5 * machines without emulation or binary translation.
6 *
7 * Copyright (C) 2006 Qumranet, Inc.
8 *
9 * Authors:
10 * Avi Kivity <avi@qumranet.com>
11 * Yaniv Kamay <yaniv@qumranet.com>
12 *
13 * This work is licensed under the terms of the GNU GPL, version 2. See
14 * the COPYING file in the top-level directory.
15 *
16 */
17
18#include "iodev.h"
19
20#include <linux/kvm_host.h>
21#include <linux/kvm.h>
22#include <linux/module.h>
23#include <linux/errno.h>
24#include <linux/percpu.h>
25#include <linux/gfp.h>
26#include <linux/mm.h>
27#include <linux/miscdevice.h>
28#include <linux/vmalloc.h>
29#include <linux/reboot.h>
30#include <linux/debugfs.h>
31#include <linux/highmem.h>
32#include <linux/file.h>
33#include <linux/sysdev.h>
34#include <linux/cpu.h>
35#include <linux/sched.h>
36#include <linux/cpumask.h>
37#include <linux/smp.h>
38#include <linux/anon_inodes.h>
39#include <linux/profile.h>
40#include <linux/kvm_para.h>
41#include <linux/pagemap.h>
42#include <linux/mman.h>
43
44#include <asm/processor.h>
45#include <asm/io.h>
46#include <asm/uaccess.h>
47#include <asm/pgtable.h>
48
49MODULE_AUTHOR("Qumranet");
50MODULE_LICENSE("GPL");
51
52DEFINE_SPINLOCK(kvm_lock);
53LIST_HEAD(vm_list);
54
55static cpumask_t cpus_hardware_enabled;
56
57struct kmem_cache *kvm_vcpu_cache;
58EXPORT_SYMBOL_GPL(kvm_vcpu_cache);
59
60static __read_mostly struct preempt_ops kvm_preempt_ops;
61
62static struct dentry *debugfs_dir;
63
64static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
65 unsigned long arg);
66
67static inline int valid_vcpu(int n)
68{
69 return likely(n >= 0 && n < KVM_MAX_VCPUS);
70}
71
72/*
73 * Switches to specified vcpu, until a matching vcpu_put()
74 */
75void vcpu_load(struct kvm_vcpu *vcpu)
76{
77 int cpu;
78
79 mutex_lock(&vcpu->mutex);
80 cpu = get_cpu();
81 preempt_notifier_register(&vcpu->preempt_notifier);
82 kvm_arch_vcpu_load(vcpu, cpu);
83 put_cpu();
84}
85
86void vcpu_put(struct kvm_vcpu *vcpu)
87{
88 preempt_disable();
89 kvm_arch_vcpu_put(vcpu);
90 preempt_notifier_unregister(&vcpu->preempt_notifier);
91 preempt_enable();
92 mutex_unlock(&vcpu->mutex);
93}
94
95static void ack_flush(void *_completed)
96{
97}
98
99void kvm_flush_remote_tlbs(struct kvm *kvm)
100{
101 int i, cpu;
102 cpumask_t cpus;
103 struct kvm_vcpu *vcpu;
104
105 cpus_clear(cpus);
106 for (i = 0; i < KVM_MAX_VCPUS; ++i) {
107 vcpu = kvm->vcpus[i];
108 if (!vcpu)
109 continue;
110 if (test_and_set_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests))
111 continue;
112 cpu = vcpu->cpu;
113 if (cpu != -1 && cpu != raw_smp_processor_id())
114 cpu_set(cpu, cpus);
115 }
116 if (cpus_empty(cpus))
117 return;
118 ++kvm->stat.remote_tlb_flush;
119 smp_call_function_mask(cpus, ack_flush, NULL, 1);
120}
121
122int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
123{
124 struct page *page;
125 int r;
126
127 mutex_init(&vcpu->mutex);
128 vcpu->cpu = -1;
129 vcpu->kvm = kvm;
130 vcpu->vcpu_id = id;
131 init_waitqueue_head(&vcpu->wq);
132
133 page = alloc_page(GFP_KERNEL | __GFP_ZERO);
134 if (!page) {
135 r = -ENOMEM;
136 goto fail;
137 }
138 vcpu->run = page_address(page);
139
140 r = kvm_arch_vcpu_init(vcpu);
141 if (r < 0)
142 goto fail_free_run;
143 return 0;
144
145fail_free_run:
146 free_page((unsigned long)vcpu->run);
147fail:
148 return r;
149}
150EXPORT_SYMBOL_GPL(kvm_vcpu_init);
151
152void kvm_vcpu_uninit(struct kvm_vcpu *vcpu)
153{
154 kvm_arch_vcpu_uninit(vcpu);
155 free_page((unsigned long)vcpu->run);
156}
157EXPORT_SYMBOL_GPL(kvm_vcpu_uninit);
158
159static struct kvm *kvm_create_vm(void)
160{
161 struct kvm *kvm = kvm_arch_create_vm();
162
163 if (IS_ERR(kvm))
164 goto out;
165
166 kvm->mm = current->mm;
167 atomic_inc(&kvm->mm->mm_count);
168 spin_lock_init(&kvm->mmu_lock);
169 kvm_io_bus_init(&kvm->pio_bus);
170 mutex_init(&kvm->lock);
171 kvm_io_bus_init(&kvm->mmio_bus);
172 spin_lock(&kvm_lock);
173 list_add(&kvm->vm_list, &vm_list);
174 spin_unlock(&kvm_lock);
175out:
176 return kvm;
177}
178
179/*
180 * Free any memory in @free but not in @dont.
181 */
182static void kvm_free_physmem_slot(struct kvm_memory_slot *free,
183 struct kvm_memory_slot *dont)
184{
185 if (!dont || free->rmap != dont->rmap)
186 vfree(free->rmap);
187
188 if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
189 vfree(free->dirty_bitmap);
190
191 free->npages = 0;
192 free->dirty_bitmap = NULL;
193 free->rmap = NULL;
194}
195
196void kvm_free_physmem(struct kvm *kvm)
197{
198 int i;
199
200 for (i = 0; i < kvm->nmemslots; ++i)
201 kvm_free_physmem_slot(&kvm->memslots[i], NULL);
202}
203
204static void kvm_destroy_vm(struct kvm *kvm)
205{
206 struct mm_struct *mm = kvm->mm;
207
208 spin_lock(&kvm_lock);
209 list_del(&kvm->vm_list);
210 spin_unlock(&kvm_lock);
211 kvm_io_bus_destroy(&kvm->pio_bus);
212 kvm_io_bus_destroy(&kvm->mmio_bus);
213 kvm_arch_destroy_vm(kvm);
214 mmdrop(mm);
215}
216
217static int kvm_vm_release(struct inode *inode, struct file *filp)
218{
219 struct kvm *kvm = filp->private_data;
220
221 kvm_destroy_vm(kvm);
222 return 0;
223}
224
225/*
226 * Allocate some memory and give it an address in the guest physical address
227 * space.
228 *
229 * Discontiguous memory is allowed, mostly for framebuffers.
230 *
231 * Must be called holding mmap_sem for write.
232 */
233int __kvm_set_memory_region(struct kvm *kvm,
234 struct kvm_userspace_memory_region *mem,
235 int user_alloc)
236{
237 int r;
238 gfn_t base_gfn;
239 unsigned long npages;
240 unsigned long i;
241 struct kvm_memory_slot *memslot;
242 struct kvm_memory_slot old, new;
243
244 r = -EINVAL;
245 /* General sanity checks */
246 if (mem->memory_size & (PAGE_SIZE - 1))
247 goto out;
248 if (mem->guest_phys_addr & (PAGE_SIZE - 1))
249 goto out;
250 if (mem->slot >= KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS)
251 goto out;
252 if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
253 goto out;
254
255 memslot = &kvm->memslots[mem->slot];
256 base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;
257 npages = mem->memory_size >> PAGE_SHIFT;
258
259 if (!npages)
260 mem->flags &= ~KVM_MEM_LOG_DIRTY_PAGES;
261
262 new = old = *memslot;
263
264 new.base_gfn = base_gfn;
265 new.npages = npages;
266 new.flags = mem->flags;
267
268 /* Disallow changing a memory slot's size. */
269 r = -EINVAL;
270 if (npages && old.npages && npages != old.npages)
271 goto out_free;
272
273 /* Check for overlaps */
274 r = -EEXIST;
275 for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
276 struct kvm_memory_slot *s = &kvm->memslots[i];
277
278 if (s == memslot)
279 continue;
280 if (!((base_gfn + npages <= s->base_gfn) ||
281 (base_gfn >= s->base_gfn + s->npages)))
282 goto out_free;
283 }
284
285 /* Free page dirty bitmap if unneeded */
286 if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES))
287 new.dirty_bitmap = NULL;
288
289 r = -ENOMEM;
290
291 /* Allocate if a slot is being created */
292 if (npages && !new.rmap) {
293 new.rmap = vmalloc(npages * sizeof(struct page *));
294
295 if (!new.rmap)
296 goto out_free;
297
298 memset(new.rmap, 0, npages * sizeof(*new.rmap));
299
300 new.user_alloc = user_alloc;
301 new.userspace_addr = mem->userspace_addr;
302 }
303
304 /* Allocate page dirty bitmap if needed */
305 if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) {
306 unsigned dirty_bytes = ALIGN(npages, BITS_PER_LONG) / 8;
307
308 new.dirty_bitmap = vmalloc(dirty_bytes);
309 if (!new.dirty_bitmap)
310 goto out_free;
311 memset(new.dirty_bitmap, 0, dirty_bytes);
312 }
313
314 if (mem->slot >= kvm->nmemslots)
315 kvm->nmemslots = mem->slot + 1;
316
317 *memslot = new;
318
319 r = kvm_arch_set_memory_region(kvm, mem, old, user_alloc);
320 if (r) {
321 *memslot = old;
322 goto out_free;
323 }
324
325 kvm_free_physmem_slot(&old, &new);
326 return 0;
327
328out_free:
329 kvm_free_physmem_slot(&new, &old);
330out:
331 return r;
332
333}
334EXPORT_SYMBOL_GPL(__kvm_set_memory_region);
335
336int kvm_set_memory_region(struct kvm *kvm,
337 struct kvm_userspace_memory_region *mem,
338 int user_alloc)
339{
340 int r;
341
342 down_write(&current->mm->mmap_sem);
343 r = __kvm_set_memory_region(kvm, mem, user_alloc);
344 up_write(&current->mm->mmap_sem);
345 return r;
346}
347EXPORT_SYMBOL_GPL(kvm_set_memory_region);
348
349int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
350 struct
351 kvm_userspace_memory_region *mem,
352 int user_alloc)
353{
354 if (mem->slot >= KVM_MEMORY_SLOTS)
355 return -EINVAL;
356 return kvm_set_memory_region(kvm, mem, user_alloc);
357}
358
359int kvm_get_dirty_log(struct kvm *kvm,
360 struct kvm_dirty_log *log, int *is_dirty)
361{
362 struct kvm_memory_slot *memslot;
363 int r, i;
364 int n;
365 unsigned long any = 0;
366
367 r = -EINVAL;
368 if (log->slot >= KVM_MEMORY_SLOTS)
369 goto out;
370
371 memslot = &kvm->memslots[log->slot];
372 r = -ENOENT;
373 if (!memslot->dirty_bitmap)
374 goto out;
375
376 n = ALIGN(memslot->npages, BITS_PER_LONG) / 8;
377
378 for (i = 0; !any && i < n/sizeof(long); ++i)
379 any = memslot->dirty_bitmap[i];
380
381 r = -EFAULT;
382 if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n))
383 goto out;
384
385 if (any)
386 *is_dirty = 1;
387
388 r = 0;
389out:
390 return r;
391}
392
393int is_error_page(struct page *page)
394{
395 return page == bad_page;
396}
397EXPORT_SYMBOL_GPL(is_error_page);
398
399static inline unsigned long bad_hva(void)
400{
401 return PAGE_OFFSET;
402}
403
404int kvm_is_error_hva(unsigned long addr)
405{
406 return addr == bad_hva();
407}
408EXPORT_SYMBOL_GPL(kvm_is_error_hva);
409
410static struct kvm_memory_slot *__gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
411{
412 int i;
413
414 for (i = 0; i < kvm->nmemslots; ++i) {
415 struct kvm_memory_slot *memslot = &kvm->memslots[i];
416
417 if (gfn >= memslot->base_gfn
418 && gfn < memslot->base_gfn + memslot->npages)
419 return memslot;
420 }
421 return NULL;
422}
423
424struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
425{
426 gfn = unalias_gfn(kvm, gfn);
427 return __gfn_to_memslot(kvm, gfn);
428}
429
430int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
431{
432 int i;
433
434 gfn = unalias_gfn(kvm, gfn);
435 for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
436 struct kvm_memory_slot *memslot = &kvm->memslots[i];
437
438 if (gfn >= memslot->base_gfn
439 && gfn < memslot->base_gfn + memslot->npages)
440 return 1;
441 }
442 return 0;
443}
444EXPORT_SYMBOL_GPL(kvm_is_visible_gfn);
445
446static unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
447{
448 struct kvm_memory_slot *slot;
449
450 gfn = unalias_gfn(kvm, gfn);
451 slot = __gfn_to_memslot(kvm, gfn);
452 if (!slot)
453 return bad_hva();
454 return (slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE);
455}
456
457/*
458 * Requires current->mm->mmap_sem to be held
459 */
460struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
461{
462 struct page *page[1];
463 unsigned long addr;
464 int npages;
465
466 might_sleep();
467
468 addr = gfn_to_hva(kvm, gfn);
469 if (kvm_is_error_hva(addr)) {
470 get_page(bad_page);
471 return bad_page;
472 }
473
474 npages = get_user_pages(current, current->mm, addr, 1, 1, 1, page,
475 NULL);
476
477 if (npages != 1) {
478 get_page(bad_page);
479 return bad_page;
480 }
481
482 return page[0];
483}
484
485EXPORT_SYMBOL_GPL(gfn_to_page);
486
487void kvm_release_page_clean(struct page *page)
488{
489 put_page(page);
490}
491EXPORT_SYMBOL_GPL(kvm_release_page_clean);
492
493void kvm_release_page_dirty(struct page *page)
494{
495 if (!PageReserved(page))
496 SetPageDirty(page);
497 put_page(page);
498}
499EXPORT_SYMBOL_GPL(kvm_release_page_dirty);
500
501static int next_segment(unsigned long len, int offset)
502{
503 if (len > PAGE_SIZE - offset)
504 return PAGE_SIZE - offset;
505 else
506 return len;
507}
508
509int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
510 int len)
511{
512 int r;
513 unsigned long addr;
514
515 addr = gfn_to_hva(kvm, gfn);
516 if (kvm_is_error_hva(addr))
517 return -EFAULT;
518 r = copy_from_user(data, (void __user *)addr + offset, len);
519 if (r)
520 return -EFAULT;
521 return 0;
522}
523EXPORT_SYMBOL_GPL(kvm_read_guest_page);
524
525int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len)
526{
527 gfn_t gfn = gpa >> PAGE_SHIFT;
528 int seg;
529 int offset = offset_in_page(gpa);
530 int ret;
531
532 while ((seg = next_segment(len, offset)) != 0) {
533 ret = kvm_read_guest_page(kvm, gfn, data, offset, seg);
534 if (ret < 0)
535 return ret;
536 offset = 0;
537 len -= seg;
538 data += seg;
539 ++gfn;
540 }
541 return 0;
542}
543EXPORT_SYMBOL_GPL(kvm_read_guest);
544
545int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data,
546 unsigned long len)
547{
548 int r;
549 unsigned long addr;
550 gfn_t gfn = gpa >> PAGE_SHIFT;
551 int offset = offset_in_page(gpa);
552
553 addr = gfn_to_hva(kvm, gfn);
554 if (kvm_is_error_hva(addr))
555 return -EFAULT;
556 r = __copy_from_user_inatomic(data, (void __user *)addr + offset, len);
557 if (r)
558 return -EFAULT;
559 return 0;
560}
561EXPORT_SYMBOL(kvm_read_guest_atomic);
562
563int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data,
564 int offset, int len)
565{
566 int r;
567 unsigned long addr;
568
569 addr = gfn_to_hva(kvm, gfn);
570 if (kvm_is_error_hva(addr))
571 return -EFAULT;
572 r = copy_to_user((void __user *)addr + offset, data, len);
573 if (r)
574 return -EFAULT;
575 mark_page_dirty(kvm, gfn);
576 return 0;
577}
578EXPORT_SYMBOL_GPL(kvm_write_guest_page);
579
580int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data,
581 unsigned long len)
582{
583 gfn_t gfn = gpa >> PAGE_SHIFT;
584 int seg;
585 int offset = offset_in_page(gpa);
586 int ret;
587
588 while ((seg = next_segment(len, offset)) != 0) {
589 ret = kvm_write_guest_page(kvm, gfn, data, offset, seg);
590 if (ret < 0)
591 return ret;
592 offset = 0;
593 len -= seg;
594 data += seg;
595 ++gfn;
596 }
597 return 0;
598}
599
600int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len)
601{
602 return kvm_write_guest_page(kvm, gfn, empty_zero_page, offset, len);
603}
604EXPORT_SYMBOL_GPL(kvm_clear_guest_page);
605
606int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len)
607{
608 gfn_t gfn = gpa >> PAGE_SHIFT;
609 int seg;
610 int offset = offset_in_page(gpa);
611 int ret;
612
613 while ((seg = next_segment(len, offset)) != 0) {
614 ret = kvm_clear_guest_page(kvm, gfn, offset, seg);
615 if (ret < 0)
616 return ret;
617 offset = 0;
618 len -= seg;
619 ++gfn;
620 }
621 return 0;
622}
623EXPORT_SYMBOL_GPL(kvm_clear_guest);
624
625void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
626{
627 struct kvm_memory_slot *memslot;
628
629 gfn = unalias_gfn(kvm, gfn);
630 memslot = __gfn_to_memslot(kvm, gfn);
631 if (memslot && memslot->dirty_bitmap) {
632 unsigned long rel_gfn = gfn - memslot->base_gfn;
633
634 /* avoid RMW */
635 if (!test_bit(rel_gfn, memslot->dirty_bitmap))
636 set_bit(rel_gfn, memslot->dirty_bitmap);
637 }
638}
639
640/*
641 * The vCPU has executed a HLT instruction with in-kernel mode enabled.
642 */
643void kvm_vcpu_block(struct kvm_vcpu *vcpu)
644{
645 DECLARE_WAITQUEUE(wait, current);
646
647 add_wait_queue(&vcpu->wq, &wait);
648
649 /*
650 * We will block until either an interrupt or a signal wakes us up
651 */
652 while (!kvm_cpu_has_interrupt(vcpu)
653 && !signal_pending(current)
654 && !kvm_arch_vcpu_runnable(vcpu)) {
655 set_current_state(TASK_INTERRUPTIBLE);
656 vcpu_put(vcpu);
657 schedule();
658 vcpu_load(vcpu);
659 }
660
661 __set_current_state(TASK_RUNNING);
662 remove_wait_queue(&vcpu->wq, &wait);
663}
664
665void kvm_resched(struct kvm_vcpu *vcpu)
666{
667 if (!need_resched())
668 return;
669 cond_resched();
670}
671EXPORT_SYMBOL_GPL(kvm_resched);
672
673static int kvm_vcpu_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
674{
675 struct kvm_vcpu *vcpu = vma->vm_file->private_data;
676 struct page *page;
677
678 if (vmf->pgoff == 0)
679 page = virt_to_page(vcpu->run);
680 else if (vmf->pgoff == KVM_PIO_PAGE_OFFSET)
681 page = virt_to_page(vcpu->arch.pio_data);
682 else
683 return VM_FAULT_SIGBUS;
684 get_page(page);
685 vmf->page = page;
686 return 0;
687}
688
689static struct vm_operations_struct kvm_vcpu_vm_ops = {
690 .fault = kvm_vcpu_fault,
691};
692
693static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma)
694{
695 vma->vm_ops = &kvm_vcpu_vm_ops;
696 return 0;
697}
698
699static int kvm_vcpu_release(struct inode *inode, struct file *filp)
700{
701 struct kvm_vcpu *vcpu = filp->private_data;
702
703 fput(vcpu->kvm->filp);
704 return 0;
705}
706
707static struct file_operations kvm_vcpu_fops = {
708 .release = kvm_vcpu_release,
709 .unlocked_ioctl = kvm_vcpu_ioctl,
710 .compat_ioctl = kvm_vcpu_ioctl,
711 .mmap = kvm_vcpu_mmap,
712};
713
714/*
715 * Allocates an inode for the vcpu.
716 */
717static int create_vcpu_fd(struct kvm_vcpu *vcpu)
718{
719 int fd, r;
720 struct inode *inode;
721 struct file *file;
722
723 r = anon_inode_getfd(&fd, &inode, &file,
724 "kvm-vcpu", &kvm_vcpu_fops, vcpu);
725 if (r)
726 return r;
727 atomic_inc(&vcpu->kvm->filp->f_count);
728 return fd;
729}
730
731/*
732 * Creates some virtual cpus. Good luck creating more than one.
733 */
734static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int n)
735{
736 int r;
737 struct kvm_vcpu *vcpu;
738
739 if (!valid_vcpu(n))
740 return -EINVAL;
741
742 vcpu = kvm_arch_vcpu_create(kvm, n);
743 if (IS_ERR(vcpu))
744 return PTR_ERR(vcpu);
745
746 preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops);
747
748 r = kvm_arch_vcpu_setup(vcpu);
749 if (r)
750 goto vcpu_destroy;
751
752 mutex_lock(&kvm->lock);
753 if (kvm->vcpus[n]) {
754 r = -EEXIST;
755 mutex_unlock(&kvm->lock);
756 goto vcpu_destroy;
757 }
758 kvm->vcpus[n] = vcpu;
759 mutex_unlock(&kvm->lock);
760
761 /* Now it's all set up, let userspace reach it */
762 r = create_vcpu_fd(vcpu);
763 if (r < 0)
764 goto unlink;
765 return r;
766
767unlink:
768 mutex_lock(&kvm->lock);
769 kvm->vcpus[n] = NULL;
770 mutex_unlock(&kvm->lock);
771vcpu_destroy:
772 kvm_arch_vcpu_destroy(vcpu);
773 return r;
774}
775
776static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset)
777{
778 if (sigset) {
779 sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP));
780 vcpu->sigset_active = 1;
781 vcpu->sigset = *sigset;
782 } else
783 vcpu->sigset_active = 0;
784 return 0;
785}
786
787static long kvm_vcpu_ioctl(struct file *filp,
788 unsigned int ioctl, unsigned long arg)
789{
790 struct kvm_vcpu *vcpu = filp->private_data;
791 void __user *argp = (void __user *)arg;
792 int r;
793
794 if (vcpu->kvm->mm != current->mm)
795 return -EIO;
796 switch (ioctl) {
797 case KVM_RUN:
798 r = -EINVAL;
799 if (arg)
800 goto out;
801 r = kvm_arch_vcpu_ioctl_run(vcpu, vcpu->run);
802 break;
803 case KVM_GET_REGS: {
804 struct kvm_regs kvm_regs;
805
806 memset(&kvm_regs, 0, sizeof kvm_regs);
807 r = kvm_arch_vcpu_ioctl_get_regs(vcpu, &kvm_regs);
808 if (r)
809 goto out;
810 r = -EFAULT;
811 if (copy_to_user(argp, &kvm_regs, sizeof kvm_regs))
812 goto out;
813 r = 0;
814 break;
815 }
816 case KVM_SET_REGS: {
817 struct kvm_regs kvm_regs;
818
819 r = -EFAULT;
820 if (copy_from_user(&kvm_regs, argp, sizeof kvm_regs))
821 goto out;
822 r = kvm_arch_vcpu_ioctl_set_regs(vcpu, &kvm_regs);
823 if (r)
824 goto out;
825 r = 0;
826 break;
827 }
828 case KVM_GET_SREGS: {
829 struct kvm_sregs kvm_sregs;
830
831 memset(&kvm_sregs, 0, sizeof kvm_sregs);
832 r = kvm_arch_vcpu_ioctl_get_sregs(vcpu, &kvm_sregs);
833 if (r)
834 goto out;
835 r = -EFAULT;
836 if (copy_to_user(argp, &kvm_sregs, sizeof kvm_sregs))
837 goto out;
838 r = 0;
839 break;
840 }
841 case KVM_SET_SREGS: {
842 struct kvm_sregs kvm_sregs;
843
844 r = -EFAULT;
845 if (copy_from_user(&kvm_sregs, argp, sizeof kvm_sregs))
846 goto out;
847 r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, &kvm_sregs);
848 if (r)
849 goto out;
850 r = 0;
851 break;
852 }
853 case KVM_TRANSLATE: {
854 struct kvm_translation tr;
855
856 r = -EFAULT;
857 if (copy_from_user(&tr, argp, sizeof tr))
858 goto out;
859 r = kvm_arch_vcpu_ioctl_translate(vcpu, &tr);
860 if (r)
861 goto out;
862 r = -EFAULT;
863 if (copy_to_user(argp, &tr, sizeof tr))
864 goto out;
865 r = 0;
866 break;
867 }
868 case KVM_DEBUG_GUEST: {
869 struct kvm_debug_guest dbg;
870
871 r = -EFAULT;
872 if (copy_from_user(&dbg, argp, sizeof dbg))
873 goto out;
874 r = kvm_arch_vcpu_ioctl_debug_guest(vcpu, &dbg);
875 if (r)
876 goto out;
877 r = 0;
878 break;
879 }
880 case KVM_SET_SIGNAL_MASK: {
881 struct kvm_signal_mask __user *sigmask_arg = argp;
882 struct kvm_signal_mask kvm_sigmask;
883 sigset_t sigset, *p;
884
885 p = NULL;
886 if (argp) {
887 r = -EFAULT;
888 if (copy_from_user(&kvm_sigmask, argp,
889 sizeof kvm_sigmask))
890 goto out;
891 r = -EINVAL;
892 if (kvm_sigmask.len != sizeof sigset)
893 goto out;
894 r = -EFAULT;
895 if (copy_from_user(&sigset, sigmask_arg->sigset,
896 sizeof sigset))
897 goto out;
898 p = &sigset;
899 }
900 r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset);
901 break;
902 }
903 case KVM_GET_FPU: {
904 struct kvm_fpu fpu;
905
906 memset(&fpu, 0, sizeof fpu);
907 r = kvm_arch_vcpu_ioctl_get_fpu(vcpu, &fpu);
908 if (r)
909 goto out;
910 r = -EFAULT;
911 if (copy_to_user(argp, &fpu, sizeof fpu))
912 goto out;
913 r = 0;
914 break;
915 }
916 case KVM_SET_FPU: {
917 struct kvm_fpu fpu;
918
919 r = -EFAULT;
920 if (copy_from_user(&fpu, argp, sizeof fpu))
921 goto out;
922 r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, &fpu);
923 if (r)
924 goto out;
925 r = 0;
926 break;
927 }
928 default:
929 r = kvm_arch_vcpu_ioctl(filp, ioctl, arg);
930 }
931out:
932 return r;
933}
934
935static long kvm_vm_ioctl(struct file *filp,
936 unsigned int ioctl, unsigned long arg)
937{
938 struct kvm *kvm = filp->private_data;
939 void __user *argp = (void __user *)arg;
940 int r;
941
942 if (kvm->mm != current->mm)
943 return -EIO;
944 switch (ioctl) {
945 case KVM_CREATE_VCPU:
946 r = kvm_vm_ioctl_create_vcpu(kvm, arg);
947 if (r < 0)
948 goto out;
949 break;
950 case KVM_SET_USER_MEMORY_REGION: {
951 struct kvm_userspace_memory_region kvm_userspace_mem;
952
953 r = -EFAULT;
954 if (copy_from_user(&kvm_userspace_mem, argp,
955 sizeof kvm_userspace_mem))
956 goto out;
957
958 r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 1);
959 if (r)
960 goto out;
961 break;
962 }
963 case KVM_GET_DIRTY_LOG: {
964 struct kvm_dirty_log log;
965
966 r = -EFAULT;
967 if (copy_from_user(&log, argp, sizeof log))
968 goto out;
969 r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
970 if (r)
971 goto out;
972 break;
973 }
974 default:
975 r = kvm_arch_vm_ioctl(filp, ioctl, arg);
976 }
977out:
978 return r;
979}
980
981static int kvm_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
982{
983 struct kvm *kvm = vma->vm_file->private_data;
984 struct page *page;
985
986 if (!kvm_is_visible_gfn(kvm, vmf->pgoff))
987 return VM_FAULT_SIGBUS;
988 page = gfn_to_page(kvm, vmf->pgoff);
989 if (is_error_page(page)) {
990 kvm_release_page_clean(page);
991 return VM_FAULT_SIGBUS;
992 }
993 vmf->page = page;
994 return 0;
995}
996
997static struct vm_operations_struct kvm_vm_vm_ops = {
998 .fault = kvm_vm_fault,
999};
1000
1001static int kvm_vm_mmap(struct file *file, struct vm_area_struct *vma)
1002{
1003 vma->vm_ops = &kvm_vm_vm_ops;
1004 return 0;
1005}
1006
1007static struct file_operations kvm_vm_fops = {
1008 .release = kvm_vm_release,
1009 .unlocked_ioctl = kvm_vm_ioctl,
1010 .compat_ioctl = kvm_vm_ioctl,
1011 .mmap = kvm_vm_mmap,
1012};
1013
1014static int kvm_dev_ioctl_create_vm(void)
1015{
1016 int fd, r;
1017 struct inode *inode;
1018 struct file *file;
1019 struct kvm *kvm;
1020
1021 kvm = kvm_create_vm();
1022 if (IS_ERR(kvm))
1023 return PTR_ERR(kvm);
1024 r = anon_inode_getfd(&fd, &inode, &file, "kvm-vm", &kvm_vm_fops, kvm);
1025 if (r) {
1026 kvm_destroy_vm(kvm);
1027 return r;
1028 }
1029
1030 kvm->filp = file;
1031
1032 return fd;
1033}
1034
1035static long kvm_dev_ioctl(struct file *filp,
1036 unsigned int ioctl, unsigned long arg)
1037{
1038 void __user *argp = (void __user *)arg;
1039 long r = -EINVAL;
1040
1041 switch (ioctl) {
1042 case KVM_GET_API_VERSION:
1043 r = -EINVAL;
1044 if (arg)
1045 goto out;
1046 r = KVM_API_VERSION;
1047 break;
1048 case KVM_CREATE_VM:
1049 r = -EINVAL;
1050 if (arg)
1051 goto out;
1052 r = kvm_dev_ioctl_create_vm();
1053 break;
1054 case KVM_CHECK_EXTENSION:
1055 r = kvm_dev_ioctl_check_extension((long)argp);
1056 break;
1057 case KVM_GET_VCPU_MMAP_SIZE:
1058 r = -EINVAL;
1059 if (arg)
1060 goto out;
1061 r = 2 * PAGE_SIZE;
1062 break;
1063 default:
1064 return kvm_arch_dev_ioctl(filp, ioctl, arg);
1065 }
1066out:
1067 return r;
1068}
1069
1070static struct file_operations kvm_chardev_ops = {
1071 .unlocked_ioctl = kvm_dev_ioctl,
1072 .compat_ioctl = kvm_dev_ioctl,
1073};
1074
1075static struct miscdevice kvm_dev = {
1076 KVM_MINOR,
1077 "kvm",
1078 &kvm_chardev_ops,
1079};
1080
1081static void hardware_enable(void *junk)
1082{
1083 int cpu = raw_smp_processor_id();
1084
1085 if (cpu_isset(cpu, cpus_hardware_enabled))
1086 return;
1087 cpu_set(cpu, cpus_hardware_enabled);
1088 kvm_arch_hardware_enable(NULL);
1089}
1090
1091static void hardware_disable(void *junk)
1092{
1093 int cpu = raw_smp_processor_id();
1094
1095 if (!cpu_isset(cpu, cpus_hardware_enabled))
1096 return;
1097 cpu_clear(cpu, cpus_hardware_enabled);
1098 decache_vcpus_on_cpu(cpu);
1099 kvm_arch_hardware_disable(NULL);
1100}
1101
1102static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val,
1103 void *v)
1104{
1105 int cpu = (long)v;
1106
1107 val &= ~CPU_TASKS_FROZEN;
1108 switch (val) {
1109 case CPU_DYING:
1110 printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n",
1111 cpu);
1112 hardware_disable(NULL);
1113 break;
1114 case CPU_UP_CANCELED:
1115 printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n",
1116 cpu);
1117 smp_call_function_single(cpu, hardware_disable, NULL, 0, 1);
1118 break;
1119 case CPU_ONLINE:
1120 printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n",
1121 cpu);
1122 smp_call_function_single(cpu, hardware_enable, NULL, 0, 1);
1123 break;
1124 }
1125 return NOTIFY_OK;
1126}
1127
1128static int kvm_reboot(struct notifier_block *notifier, unsigned long val,
1129 void *v)
1130{
1131 if (val == SYS_RESTART) {
1132 /*
1133 * Some (well, at least mine) BIOSes hang on reboot if
1134 * in vmx root mode.
1135 */
1136 printk(KERN_INFO "kvm: exiting hardware virtualization\n");
1137 on_each_cpu(hardware_disable, NULL, 0, 1);
1138 }
1139 return NOTIFY_OK;
1140}
1141
1142static struct notifier_block kvm_reboot_notifier = {
1143 .notifier_call = kvm_reboot,
1144 .priority = 0,
1145};
1146
1147void kvm_io_bus_init(struct kvm_io_bus *bus)
1148{
1149 memset(bus, 0, sizeof(*bus));
1150}
1151
1152void kvm_io_bus_destroy(struct kvm_io_bus *bus)
1153{
1154 int i;
1155
1156 for (i = 0; i < bus->dev_count; i++) {
1157 struct kvm_io_device *pos = bus->devs[i];
1158
1159 kvm_iodevice_destructor(pos);
1160 }
1161}
1162
1163struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus, gpa_t addr)
1164{
1165 int i;
1166
1167 for (i = 0; i < bus->dev_count; i++) {
1168 struct kvm_io_device *pos = bus->devs[i];
1169
1170 if (pos->in_range(pos, addr))
1171 return pos;
1172 }
1173
1174 return NULL;
1175}
1176
1177void kvm_io_bus_register_dev(struct kvm_io_bus *bus, struct kvm_io_device *dev)
1178{
1179 BUG_ON(bus->dev_count > (NR_IOBUS_DEVS-1));
1180
1181 bus->devs[bus->dev_count++] = dev;
1182}
1183
1184static struct notifier_block kvm_cpu_notifier = {
1185 .notifier_call = kvm_cpu_hotplug,
1186 .priority = 20, /* must be > scheduler priority */
1187};
1188
1189static u64 vm_stat_get(void *_offset)
1190{
1191 unsigned offset = (long)_offset;
1192 u64 total = 0;
1193 struct kvm *kvm;
1194
1195 spin_lock(&kvm_lock);
1196 list_for_each_entry(kvm, &vm_list, vm_list)
1197 total += *(u32 *)((void *)kvm + offset);
1198 spin_unlock(&kvm_lock);
1199 return total;
1200}
1201
1202DEFINE_SIMPLE_ATTRIBUTE(vm_stat_fops, vm_stat_get, NULL, "%llu\n");
1203
1204static u64 vcpu_stat_get(void *_offset)
1205{
1206 unsigned offset = (long)_offset;
1207 u64 total = 0;
1208 struct kvm *kvm;
1209 struct kvm_vcpu *vcpu;
1210 int i;
1211
1212 spin_lock(&kvm_lock);
1213 list_for_each_entry(kvm, &vm_list, vm_list)
1214 for (i = 0; i < KVM_MAX_VCPUS; ++i) {
1215 vcpu = kvm->vcpus[i];
1216 if (vcpu)
1217 total += *(u32 *)((void *)vcpu + offset);
1218 }
1219 spin_unlock(&kvm_lock);
1220 return total;
1221}
1222
1223DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_fops, vcpu_stat_get, NULL, "%llu\n");
1224
1225static struct file_operations *stat_fops[] = {
1226 [KVM_STAT_VCPU] = &vcpu_stat_fops,
1227 [KVM_STAT_VM] = &vm_stat_fops,
1228};
1229
1230static void kvm_init_debug(void)
1231{
1232 struct kvm_stats_debugfs_item *p;
1233
1234 debugfs_dir = debugfs_create_dir("kvm", NULL);
1235 for (p = debugfs_entries; p->name; ++p)
1236 p->dentry = debugfs_create_file(p->name, 0444, debugfs_dir,
1237 (void *)(long)p->offset,
1238 stat_fops[p->kind]);
1239}
1240
1241static void kvm_exit_debug(void)
1242{
1243 struct kvm_stats_debugfs_item *p;
1244
1245 for (p = debugfs_entries; p->name; ++p)
1246 debugfs_remove(p->dentry);
1247 debugfs_remove(debugfs_dir);
1248}
1249
1250static int kvm_suspend(struct sys_device *dev, pm_message_t state)
1251{
1252 hardware_disable(NULL);
1253 return 0;
1254}
1255
1256static int kvm_resume(struct sys_device *dev)
1257{
1258 hardware_enable(NULL);
1259 return 0;
1260}
1261
1262static struct sysdev_class kvm_sysdev_class = {
1263 .name = "kvm",
1264 .suspend = kvm_suspend,
1265 .resume = kvm_resume,
1266};
1267
1268static struct sys_device kvm_sysdev = {
1269 .id = 0,
1270 .cls = &kvm_sysdev_class,
1271};
1272
1273struct page *bad_page;
1274
1275static inline
1276struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn)
1277{
1278 return container_of(pn, struct kvm_vcpu, preempt_notifier);
1279}
1280
1281static void kvm_sched_in(struct preempt_notifier *pn, int cpu)
1282{
1283 struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
1284
1285 kvm_arch_vcpu_load(vcpu, cpu);
1286}
1287
1288static void kvm_sched_out(struct preempt_notifier *pn,
1289 struct task_struct *next)
1290{
1291 struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
1292
1293 kvm_arch_vcpu_put(vcpu);
1294}
1295
1296int kvm_init(void *opaque, unsigned int vcpu_size,
1297 struct module *module)
1298{
1299 int r;
1300 int cpu;
1301
1302 kvm_init_debug();
1303
1304 r = kvm_arch_init(opaque);
1305 if (r)
1306 goto out_fail;
1307
1308 bad_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
1309
1310 if (bad_page == NULL) {
1311 r = -ENOMEM;
1312 goto out;
1313 }
1314
1315 r = kvm_arch_hardware_setup();
1316 if (r < 0)
1317 goto out_free_0;
1318
1319 for_each_online_cpu(cpu) {
1320 smp_call_function_single(cpu,
1321 kvm_arch_check_processor_compat,
1322 &r, 0, 1);
1323 if (r < 0)
1324 goto out_free_1;
1325 }
1326
1327 on_each_cpu(hardware_enable, NULL, 0, 1);
1328 r = register_cpu_notifier(&kvm_cpu_notifier);
1329 if (r)
1330 goto out_free_2;
1331 register_reboot_notifier(&kvm_reboot_notifier);
1332
1333 r = sysdev_class_register(&kvm_sysdev_class);
1334 if (r)
1335 goto out_free_3;
1336
1337 r = sysdev_register(&kvm_sysdev);
1338 if (r)
1339 goto out_free_4;
1340
1341 /* A kmem cache lets us meet the alignment requirements of fx_save. */
1342 kvm_vcpu_cache = kmem_cache_create("kvm_vcpu", vcpu_size,
1343 __alignof__(struct kvm_vcpu),
1344 0, NULL);
1345 if (!kvm_vcpu_cache) {
1346 r = -ENOMEM;
1347 goto out_free_5;
1348 }
1349
1350 kvm_chardev_ops.owner = module;
1351
1352 r = misc_register(&kvm_dev);
1353 if (r) {
1354 printk(KERN_ERR "kvm: misc device register failed\n");
1355 goto out_free;
1356 }
1357
1358 kvm_preempt_ops.sched_in = kvm_sched_in;
1359 kvm_preempt_ops.sched_out = kvm_sched_out;
1360
1361 return 0;
1362
1363out_free:
1364 kmem_cache_destroy(kvm_vcpu_cache);
1365out_free_5:
1366 sysdev_unregister(&kvm_sysdev);
1367out_free_4:
1368 sysdev_class_unregister(&kvm_sysdev_class);
1369out_free_3:
1370 unregister_reboot_notifier(&kvm_reboot_notifier);
1371 unregister_cpu_notifier(&kvm_cpu_notifier);
1372out_free_2:
1373 on_each_cpu(hardware_disable, NULL, 0, 1);
1374out_free_1:
1375 kvm_arch_hardware_unsetup();
1376out_free_0:
1377 __free_page(bad_page);
1378out:
1379 kvm_arch_exit();
1380 kvm_exit_debug();
1381out_fail:
1382 return r;
1383}
1384EXPORT_SYMBOL_GPL(kvm_init);
1385
1386void kvm_exit(void)
1387{
1388 misc_deregister(&kvm_dev);
1389 kmem_cache_destroy(kvm_vcpu_cache);
1390 sysdev_unregister(&kvm_sysdev);
1391 sysdev_class_unregister(&kvm_sysdev_class);
1392 unregister_reboot_notifier(&kvm_reboot_notifier);
1393 unregister_cpu_notifier(&kvm_cpu_notifier);
1394 on_each_cpu(hardware_disable, NULL, 0, 1);
1395 kvm_arch_hardware_unsetup();
1396 kvm_arch_exit();
1397 kvm_exit_debug();
1398 __free_page(bad_page);
1399}
1400EXPORT_SYMBOL_GPL(kvm_exit);
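
Illustration (not from the patch): gfn_to_hva() above resolves a guest frame number by scanning the memslot array for the slot whose [base_gfn, base_gfn + npages) range covers it, then computing userspace_addr + (gfn - base_gfn) * PAGE_SIZE. A standalone model of that lookup, with a made-up slot table for the example:

/* Sketch only -- the slot values are invented; the lookup and the
 * address arithmetic mirror __gfn_to_memslot()/gfn_to_hva() above. */
#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE	4096ull
#define BAD_HVA		0ull		/* stand-in for the kernel's bad_hva() */

struct memslot {
	uint64_t base_gfn;
	uint64_t npages;
	uint64_t userspace_addr;
};

static uint64_t gfn_to_hva(const struct memslot *slots, int nslots, uint64_t gfn)
{
	int i;

	for (i = 0; i < nslots; i++) {
		const struct memslot *s = &slots[i];

		if (gfn >= s->base_gfn && gfn < s->base_gfn + s->npages)
			return s->userspace_addr +
			       (gfn - s->base_gfn) * PAGE_SIZE;
	}
	return BAD_HVA;
}

int main(void)
{
	struct memslot slots[] = {
		{ .base_gfn = 0x000, .npages = 0x100,
		  .userspace_addr = 0x7f0000000000ull },
		{ .base_gfn = 0x100, .npages = 0x040,
		  .userspace_addr = 0x7f0000200000ull },
	};

	/* gfn 0x110 sits 0x10 pages into the second slot: expect 0x7f0000210000 */
	printf("gfn 0x110 -> hva %#llx\n",
	       (unsigned long long)gfn_to_hva(slots, 2, 0x110));
	return 0;
}
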