-rw-r--r--  arch/x86/Kconfig | 3
-rw-r--r--  arch/x86/Makefile | 2
-rw-r--r--  arch/x86/kvm/Kconfig (renamed from drivers/kvm/Kconfig) | 7
-rw-r--r--  arch/x86/kvm/Makefile (renamed from drivers/kvm/Makefile) | 6
-rw-r--r--  arch/x86/kvm/i8259.c (renamed from drivers/kvm/i8259.c) | 8
-rw-r--r--  arch/x86/kvm/irq.c (renamed from drivers/kvm/irq.c) | 22
-rw-r--r--  arch/x86/kvm/irq.h | 88
-rw-r--r--  arch/x86/kvm/kvm_svm.h (renamed from drivers/kvm/kvm_svm.h) | 2
-rw-r--r--  arch/x86/kvm/lapic.c (renamed from drivers/kvm/lapic.c) | 216
-rw-r--r--  arch/x86/kvm/lapic.h | 50
-rw-r--r--  arch/x86/kvm/mmu.c | 1885
-rw-r--r--  arch/x86/kvm/mmu.h | 44
-rw-r--r--  arch/x86/kvm/paging_tmpl.h | 484
-rw-r--r--  arch/x86/kvm/segment_descriptor.h (renamed from drivers/kvm/segment_descriptor.h) | 12
-rw-r--r--  arch/x86/kvm/svm.c (renamed from drivers/kvm/svm.c) | 353
-rw-r--r--  arch/x86/kvm/svm.h (renamed from drivers/kvm/svm.h) | 3
-rw-r--r--  arch/x86/kvm/vmx.c (renamed from drivers/kvm/vmx.c) | 1079
-rw-r--r--  arch/x86/kvm/vmx.h (renamed from drivers/kvm/vmx.h) | 26
-rw-r--r--  arch/x86/kvm/x86.c (renamed from drivers/kvm/kvm_main.c) | 4243
-rw-r--r--  arch/x86/kvm/x86_emulate.c | 1912
-rw-r--r--  drivers/Kconfig | 2
-rw-r--r--  drivers/Makefile | 1
-rw-r--r--  drivers/kvm/irq.h | 165
-rw-r--r--  drivers/kvm/mmu.c | 1498
-rw-r--r--  drivers/kvm/paging_tmpl.h | 511
-rw-r--r--  drivers/kvm/x86_emulate.c | 1662
-rw-r--r--  include/asm-x86/Kbuild | 1
-rw-r--r--  include/asm-x86/kvm.h | 191
-rw-r--r--  include/asm-x86/kvm_host.h (renamed from drivers/kvm/kvm.h) | 537
-rw-r--r--  include/asm-x86/kvm_para.h | 105
-rw-r--r--  include/asm-x86/kvm_x86_emulate.h (renamed from drivers/kvm/x86_emulate.h) | 69
-rw-r--r--  include/linux/Kbuild | 2
-rw-r--r--  include/linux/kvm.h | 203
-rw-r--r--  include/linux/kvm_host.h | 299
-rw-r--r--  include/linux/kvm_para.h | 82
-rw-r--r--  include/linux/kvm_types.h | 54
-rw-r--r--  kernel/fork.c | 1
-rw-r--r--  virt/kvm/ioapic.c (renamed from drivers/kvm/ioapic.c) | 99
-rw-r--r--  virt/kvm/ioapic.h | 95
-rw-r--r--  virt/kvm/iodev.h | 63
-rw-r--r--  virt/kvm/kvm_main.c | 1400
41 files changed, 9938 insertions, 7547 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index fb3eea3e38ee..65b449134cf7 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -107,6 +107,7 @@ config ARCH_SUPPORTS_OPROFILE
 	bool
 	default y
 
+select HAVE_KVM
 
 config ZONE_DMA32
 	bool
@@ -1598,4 +1599,6 @@ source "security/Kconfig"
 
 source "crypto/Kconfig"
 
+source "arch/x86/kvm/Kconfig"
+
 source "lib/Kconfig"
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index b08f18261df6..da8f4129780b 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -7,6 +7,8 @@ else
         KBUILD_DEFCONFIG := $(ARCH)_defconfig
 endif
 
+core-$(CONFIG_KVM) += arch/x86/kvm/
+
 # BITS is used as extension for files which are available in a 32 bit
 # and a 64 bit version to simplify shared Makefiles.
 # e.g.: obj-y += foo_$(BITS).o
diff --git a/drivers/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 656920636cb2..c83e1c9b5129 100644
--- a/drivers/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -1,9 +1,12 @@
 #
 # KVM configuration
 #
+config HAVE_KVM
+	bool
+
 menuconfig VIRTUALIZATION
 	bool "Virtualization"
-	depends on X86
+	depends on HAVE_KVM || X86
 	default y
 	---help---
 	  Say Y here to get to see options for using your Linux host to run other
@@ -16,7 +19,7 @@ if VIRTUALIZATION
 
 config KVM
 	tristate "Kernel-based Virtual Machine (KVM) support"
-	depends on X86 && EXPERIMENTAL
+	depends on HAVE_KVM && EXPERIMENTAL
 	select PREEMPT_NOTIFIERS
 	select ANON_INODES
 	---help---
diff --git a/drivers/kvm/Makefile b/arch/x86/kvm/Makefile
index e5a8f4d3e973..ffdd0b310784 100644
--- a/drivers/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -2,7 +2,11 @@
 # Makefile for Kernel-based Virtual Machine module
 #
 
-kvm-objs := kvm_main.o mmu.o x86_emulate.o i8259.o irq.o lapic.o ioapic.o
+common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o)
+
+EXTRA_CFLAGS += -Ivirt/kvm -Iarch/x86/kvm
+
+kvm-objs := $(common-objs) x86.o mmu.o x86_emulate.o i8259.o irq.o lapic.o
 obj-$(CONFIG_KVM) += kvm.o
 kvm-intel-objs = vmx.o
 obj-$(CONFIG_KVM_INTEL) += kvm-intel.o
diff --git a/drivers/kvm/i8259.c b/arch/x86/kvm/i8259.c
index a679157bc599..ab29cf2def47 100644
--- a/drivers/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -28,6 +28,8 @@
 #include <linux/mm.h>
 #include "irq.h"
 
+#include <linux/kvm_host.h>
+
 /*
  * set irq level. If an edge is detected, then the IRR is set to 1
  */
@@ -181,10 +183,8 @@ int kvm_pic_read_irq(struct kvm_pic *s)
 	return intno;
 }
 
-static void pic_reset(void *opaque)
+void kvm_pic_reset(struct kvm_kpic_state *s)
 {
-	struct kvm_kpic_state *s = opaque;
-
 	s->last_irr = 0;
 	s->irr = 0;
 	s->imr = 0;
@@ -209,7 +209,7 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val)
 	addr &= 1;
 	if (addr == 0) {
 		if (val & 0x10) {
-			pic_reset(s);	/* init */
+			kvm_pic_reset(s);	/* init */
 			/*
 			 * deassert a pending interrupt
 			 */
diff --git a/drivers/kvm/irq.c b/arch/x86/kvm/irq.c
index 7628c7ff628f..e5714759e97f 100644
--- a/drivers/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -20,8 +20,8 @@
  */
 
 #include <linux/module.h>
+#include <linux/kvm_host.h>
 
-#include "kvm.h"
 #include "irq.h"
 
 /*
@@ -63,26 +63,6 @@ int kvm_cpu_get_interrupt(struct kvm_vcpu *v)
 }
 EXPORT_SYMBOL_GPL(kvm_cpu_get_interrupt);
 
-static void vcpu_kick_intr(void *info)
-{
-#ifdef DEBUG
-	struct kvm_vcpu *vcpu = (struct kvm_vcpu *)info;
-	printk(KERN_DEBUG "vcpu_kick_intr %p \n", vcpu);
-#endif
-}
-
-void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
-{
-	int ipi_pcpu = vcpu->cpu;
-
-	if (waitqueue_active(&vcpu->wq)) {
-		wake_up_interruptible(&vcpu->wq);
-		++vcpu->stat.halt_wakeup;
-	}
-	if (vcpu->guest_mode)
-		smp_call_function_single(ipi_pcpu, vcpu_kick_intr, vcpu, 0, 0);
-}
-
 void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu)
 {
 	kvm_inject_apic_timer_irqs(vcpu);
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
new file mode 100644
index 000000000000..fa5ed5d59b5d
--- /dev/null
+++ b/arch/x86/kvm/irq.h
@@ -0,0 +1,88 @@
+/*
+ * irq.h: in kernel interrupt controller related definitions
+ * Copyright (c) 2007, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * Authors:
+ *   Yaozu (Eddie) Dong <Eddie.dong@intel.com>
+ *
+ */
+
+#ifndef __IRQ_H
+#define __IRQ_H
+
+#include <linux/mm_types.h>
+#include <linux/hrtimer.h>
+#include <linux/kvm_host.h>
+
+#include "iodev.h"
+#include "ioapic.h"
+#include "lapic.h"
+
+struct kvm;
+struct kvm_vcpu;
+
+typedef void irq_request_func(void *opaque, int level);
+
+struct kvm_kpic_state {
+	u8 last_irr;	/* edge detection */
+	u8 irr;		/* interrupt request register */
+	u8 imr;		/* interrupt mask register */
+	u8 isr;		/* interrupt service register */
+	u8 priority_add;	/* highest irq priority */
+	u8 irq_base;
+	u8 read_reg_select;
+	u8 poll;
+	u8 special_mask;
+	u8 init_state;
+	u8 auto_eoi;
+	u8 rotate_on_auto_eoi;
+	u8 special_fully_nested_mode;
+	u8 init4;		/* true if 4 byte init */
+	u8 elcr;		/* PIIX edge/trigger selection */
+	u8 elcr_mask;
+	struct kvm_pic *pics_state;
+};
+
+struct kvm_pic {
+	struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */
+	irq_request_func *irq_request;
+	void *irq_request_opaque;
+	int output;		/* intr from master PIC */
+	struct kvm_io_device dev;
+};
+
+struct kvm_pic *kvm_create_pic(struct kvm *kvm);
+void kvm_pic_set_irq(void *opaque, int irq, int level);
+int kvm_pic_read_irq(struct kvm_pic *s);
+void kvm_pic_update_irq(struct kvm_pic *s);
+
+static inline struct kvm_pic *pic_irqchip(struct kvm *kvm)
+{
+	return kvm->arch.vpic;
+}
+
+static inline int irqchip_in_kernel(struct kvm *kvm)
+{
+	return pic_irqchip(kvm) != NULL;
+}
+
+void kvm_pic_reset(struct kvm_kpic_state *s);
+
+void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec);
+void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu);
+void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu);
+void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu);
+
+#endif
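
The kvm_pic modeled above exists only when userspace asks for an in-kernel interrupt controller, which is what irqchip_in_kernel() tests. A minimal userspace sketch of that flow, assuming the KVM_CREATE_IRQCHIP and KVM_IRQ_LINE ioctls of this kernel generation (illustrative only; error handling omitted):

#include <fcntl.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

int main(void)
{
	int kvm = open("/dev/kvm", O_RDWR);
	int vm = ioctl(kvm, KVM_CREATE_VM, 0);
	struct kvm_irq_level line = { .irq = 4, .level = 1 };

	ioctl(vm, KVM_CREATE_IRQCHIP, 0); /* kernel allocates the PIC pair and IOAPIC */
	ioctl(vm, KVM_IRQ_LINE, &line);   /* raise IRQ 4 on the emulated i8259 */
	line.level = 0;
	ioctl(vm, KVM_IRQ_LINE, &line);   /* lower it again, completing an edge */
	return 0;
}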
diff --git a/drivers/kvm/kvm_svm.h b/arch/x86/kvm/kvm_svm.h
index a0e415daef5b..ecdfe97e4635 100644
--- a/drivers/kvm/kvm_svm.h
+++ b/arch/x86/kvm/kvm_svm.h
@@ -4,10 +4,10 @@
 #include <linux/kernel.h>
 #include <linux/types.h>
 #include <linux/list.h>
+#include <linux/kvm_host.h>
 #include <asm/msr.h>
 
 #include "svm.h"
-#include "kvm.h"
 
 static const u32 host_save_user_msrs[] = {
 #ifdef CONFIG_X86_64
diff --git a/drivers/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 238fcad3cece..2cbee9479ce4 100644
--- a/drivers/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -17,7 +17,7 @@
  * the COPYING file in the top-level directory.
  */
 
-#include "kvm.h"
+#include <linux/kvm_host.h>
 #include <linux/kvm.h>
 #include <linux/mm.h>
 #include <linux/highmem.h>
@@ -56,6 +56,7 @@
 
 #define VEC_POS(v) ((v) & (32 - 1))
 #define REG_POS(v) (((v) >> 5) << 4)
+
 static inline u32 apic_get_reg(struct kvm_lapic *apic, int reg_off)
 {
 	return *((u32 *) (apic->regs + reg_off));
@@ -88,7 +89,7 @@ static inline void apic_clear_vector(int vec, void *bitmap)
 
 static inline int apic_hw_enabled(struct kvm_lapic *apic)
 {
-	return (apic)->vcpu->apic_base & MSR_IA32_APICBASE_ENABLE;
+	return (apic)->vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE;
 }
 
 static inline int apic_sw_enabled(struct kvm_lapic *apic)
@@ -172,7 +173,7 @@ static inline int apic_find_highest_irr(struct kvm_lapic *apic)
 
 int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu)
 {
-	struct kvm_lapic *apic = (struct kvm_lapic *)vcpu->apic;
+	struct kvm_lapic *apic = vcpu->arch.apic;
 	int highest_irr;
 
 	if (!apic)
@@ -183,8 +184,10 @@ int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu)
 }
 EXPORT_SYMBOL_GPL(kvm_lapic_find_highest_irr);
 
-int kvm_apic_set_irq(struct kvm_lapic *apic, u8 vec, u8 trig)
+int kvm_apic_set_irq(struct kvm_vcpu *vcpu, u8 vec, u8 trig)
 {
+	struct kvm_lapic *apic = vcpu->arch.apic;
+
 	if (!apic_test_and_set_irr(vec, apic)) {
 		/* a new pending irq is set in IRR */
 		if (trig)
@@ -268,7 +271,7 @@ static int apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
 			   int short_hand, int dest, int dest_mode)
 {
 	int result = 0;
-	struct kvm_lapic *target = vcpu->apic;
+	struct kvm_lapic *target = vcpu->arch.apic;
 
 	apic_debug("target %p, source %p, dest 0x%x, "
 		   "dest_mode 0x%x, short_hand 0x%x",
@@ -335,10 +338,10 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
 		} else
 			apic_clear_vector(vector, apic->regs + APIC_TMR);
 
-		if (vcpu->mp_state == VCPU_MP_STATE_RUNNABLE)
+		if (vcpu->arch.mp_state == VCPU_MP_STATE_RUNNABLE)
 			kvm_vcpu_kick(vcpu);
-		else if (vcpu->mp_state == VCPU_MP_STATE_HALTED) {
-			vcpu->mp_state = VCPU_MP_STATE_RUNNABLE;
+		else if (vcpu->arch.mp_state == VCPU_MP_STATE_HALTED) {
+			vcpu->arch.mp_state = VCPU_MP_STATE_RUNNABLE;
 			if (waitqueue_active(&vcpu->wq))
 				wake_up_interruptible(&vcpu->wq);
 		}
@@ -359,11 +362,11 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
 
 	case APIC_DM_INIT:
 		if (level) {
-			if (vcpu->mp_state == VCPU_MP_STATE_RUNNABLE)
+			if (vcpu->arch.mp_state == VCPU_MP_STATE_RUNNABLE)
 				printk(KERN_DEBUG
 				       "INIT on a runnable vcpu %d\n",
 				       vcpu->vcpu_id);
-			vcpu->mp_state = VCPU_MP_STATE_INIT_RECEIVED;
+			vcpu->arch.mp_state = VCPU_MP_STATE_INIT_RECEIVED;
 			kvm_vcpu_kick(vcpu);
 		} else {
 			printk(KERN_DEBUG
@@ -376,9 +379,9 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
 	case APIC_DM_STARTUP:
 		printk(KERN_DEBUG "SIPI to vcpu %d vector 0x%02x\n",
 		       vcpu->vcpu_id, vector);
-		if (vcpu->mp_state == VCPU_MP_STATE_INIT_RECEIVED) {
-			vcpu->sipi_vector = vector;
-			vcpu->mp_state = VCPU_MP_STATE_SIPI_RECEIVED;
+		if (vcpu->arch.mp_state == VCPU_MP_STATE_INIT_RECEIVED) {
+			vcpu->arch.sipi_vector = vector;
+			vcpu->arch.mp_state = VCPU_MP_STATE_SIPI_RECEIVED;
 			if (waitqueue_active(&vcpu->wq))
 				wake_up_interruptible(&vcpu->wq);
 		}
@@ -392,15 +395,14 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
 	return result;
 }
 
-struct kvm_lapic *kvm_apic_round_robin(struct kvm *kvm, u8 vector,
+static struct kvm_lapic *kvm_apic_round_robin(struct kvm *kvm, u8 vector,
 				       unsigned long bitmap)
 {
-	int vcpu_id;
 	int last;
 	int next;
-	struct kvm_lapic *apic;
+	struct kvm_lapic *apic = NULL;
 
-	last = kvm->round_robin_prev_vcpu;
+	last = kvm->arch.round_robin_prev_vcpu;
 	next = last;
 
 	do {
@@ -408,25 +410,30 @@ struct kvm_lapic *kvm_apic_round_robin(struct kvm *kvm, u8 vector,
 			next = 0;
 		if (kvm->vcpus[next] == NULL || !test_bit(next, &bitmap))
 			continue;
-		apic = kvm->vcpus[next]->apic;
+		apic = kvm->vcpus[next]->arch.apic;
 		if (apic && apic_enabled(apic))
 			break;
 		apic = NULL;
 	} while (next != last);
-	kvm->round_robin_prev_vcpu = next;
+	kvm->arch.round_robin_prev_vcpu = next;
 
-	if (!apic) {
-		vcpu_id = ffs(bitmap) - 1;
-		if (vcpu_id < 0) {
-			vcpu_id = 0;
-			printk(KERN_DEBUG "vcpu not ready for apic_round_robin\n");
-		}
-		apic = kvm->vcpus[vcpu_id]->apic;
-	}
+	if (!apic)
+		printk(KERN_DEBUG "vcpu not ready for apic_round_robin\n");
 
 	return apic;
 }
 
+struct kvm_vcpu *kvm_get_lowest_prio_vcpu(struct kvm *kvm, u8 vector,
+				       unsigned long bitmap)
+{
+	struct kvm_lapic *apic;
+
+	apic = kvm_apic_round_robin(kvm, vector, bitmap);
+	if (apic)
+		return apic->vcpu;
+	return NULL;
+}
+
 static void apic_set_eoi(struct kvm_lapic *apic)
 {
 	int vector = apic_find_highest_isr(apic);
@@ -458,7 +465,7 @@ static void apic_send_ipi(struct kvm_lapic *apic)
 	unsigned int delivery_mode = icr_low & APIC_MODE_MASK;
 	unsigned int vector = icr_low & APIC_VECTOR_MASK;
 
-	struct kvm_lapic *target;
+	struct kvm_vcpu *target;
 	struct kvm_vcpu *vcpu;
 	unsigned long lpr_map = 0;
 	int i;
@@ -474,20 +481,20 @@ static void apic_send_ipi(struct kvm_lapic *apic)
 		if (!vcpu)
 			continue;
 
-		if (vcpu->apic &&
+		if (vcpu->arch.apic &&
 		    apic_match_dest(vcpu, apic, short_hand, dest, dest_mode)) {
 			if (delivery_mode == APIC_DM_LOWEST)
 				set_bit(vcpu->vcpu_id, &lpr_map);
 			else
-				__apic_accept_irq(vcpu->apic, delivery_mode,
+				__apic_accept_irq(vcpu->arch.apic, delivery_mode,
 						  vector, level, trig_mode);
 		}
 	}
 
 	if (delivery_mode == APIC_DM_LOWEST) {
-		target = kvm_apic_round_robin(vcpu->kvm, vector, lpr_map);
+		target = kvm_get_lowest_prio_vcpu(vcpu->kvm, vector, lpr_map);
 		if (target != NULL)
-			__apic_accept_irq(target, delivery_mode,
+			__apic_accept_irq(target->arch.apic, delivery_mode,
 					  vector, level, trig_mode);
 	}
 }
@@ -544,6 +551,23 @@ static u32 apic_get_tmcct(struct kvm_lapic *apic)
 	return tmcct;
 }
 
+static void __report_tpr_access(struct kvm_lapic *apic, bool write)
+{
+	struct kvm_vcpu *vcpu = apic->vcpu;
+	struct kvm_run *run = vcpu->run;
+
+	set_bit(KVM_REQ_REPORT_TPR_ACCESS, &vcpu->requests);
+	kvm_x86_ops->cache_regs(vcpu);
+	run->tpr_access.rip = vcpu->arch.rip;
+	run->tpr_access.is_write = write;
+}
+
+static inline void report_tpr_access(struct kvm_lapic *apic, bool write)
+{
+	if (apic->vcpu->arch.tpr_access_reporting)
+		__report_tpr_access(apic, write);
+}
+
 static u32 __apic_read(struct kvm_lapic *apic, unsigned int offset)
 {
 	u32 val = 0;
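
report_tpr_access() feeds a new userspace exit: when arch.tpr_access_reporting is set for a vcpu, every guest read or write of the task-priority register records the guest rip and the access direction in the shared kvm_run area. A hedged sketch of how a userspace monitor might consume that exit, assuming the KVM_EXIT_TPR_ACCESS exit reason and kvm_run.tpr_access fields introduced alongside this series:

#include <stdio.h>
#include <linux/kvm.h>

/* Illustrative only: "run" is the vcpu's mmap()ed struct kvm_run,
 * inspected after a KVM_RUN ioctl returns. */
static void trace_tpr_access(struct kvm_run *run)
{
	if (run->exit_reason != KVM_EXIT_TPR_ACCESS)
		return;
	fprintf(stderr, "guest %s TPR at rip 0x%llx\n",
		run->tpr_access.is_write ? "wrote" : "read",
		(unsigned long long)run->tpr_access.rip);
}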
@@ -561,6 +585,9 @@ static u32 __apic_read(struct kvm_lapic *apic, unsigned int offset)
 		val = apic_get_tmcct(apic);
 		break;
 
+	case APIC_TASKPRI:
+		report_tpr_access(apic, false);
+		/* fall thru */
 	default:
 		apic_update_ppr(apic);
 		val = apic_get_reg(apic, offset);
@@ -670,6 +697,7 @@ static void apic_mmio_write(struct kvm_io_device *this,
 		break;
 
 	case APIC_TASKPRI:
+		report_tpr_access(apic, true);
 		apic_set_tpr(apic, val & 0xff);
 		break;
 
@@ -762,19 +790,17 @@ static int apic_mmio_range(struct kvm_io_device *this, gpa_t addr)
 	return ret;
 }
 
-void kvm_free_apic(struct kvm_lapic *apic)
+void kvm_free_lapic(struct kvm_vcpu *vcpu)
 {
-	if (!apic)
+	if (!vcpu->arch.apic)
 		return;
 
-	hrtimer_cancel(&apic->timer.dev);
+	hrtimer_cancel(&vcpu->arch.apic->timer.dev);
 
-	if (apic->regs_page) {
-		__free_page(apic->regs_page);
-		apic->regs_page = 0;
-	}
+	if (vcpu->arch.apic->regs_page)
+		__free_page(vcpu->arch.apic->regs_page);
 
-	kfree(apic);
+	kfree(vcpu->arch.apic);
 }
 
 /*
@@ -785,16 +811,17 @@ void kvm_free_apic(struct kvm_lapic *apic)
 
 void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8)
 {
-	struct kvm_lapic *apic = (struct kvm_lapic *)vcpu->apic;
+	struct kvm_lapic *apic = vcpu->arch.apic;
 
 	if (!apic)
 		return;
-	apic_set_tpr(apic, ((cr8 & 0x0f) << 4));
+	apic_set_tpr(apic, ((cr8 & 0x0f) << 4)
+		     | (apic_get_reg(apic, APIC_TASKPRI) & 4));
 }
 
 u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu)
 {
-	struct kvm_lapic *apic = (struct kvm_lapic *)vcpu->apic;
+	struct kvm_lapic *apic = vcpu->arch.apic;
 	u64 tpr;
 
 	if (!apic)
@@ -807,29 +834,29 @@ EXPORT_SYMBOL_GPL(kvm_lapic_get_cr8);
 
 void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
 {
-	struct kvm_lapic *apic = (struct kvm_lapic *)vcpu->apic;
+	struct kvm_lapic *apic = vcpu->arch.apic;
 
 	if (!apic) {
 		value |= MSR_IA32_APICBASE_BSP;
-		vcpu->apic_base = value;
+		vcpu->arch.apic_base = value;
 		return;
 	}
 	if (apic->vcpu->vcpu_id)
 		value &= ~MSR_IA32_APICBASE_BSP;
 
-	vcpu->apic_base = value;
-	apic->base_address = apic->vcpu->apic_base &
+	vcpu->arch.apic_base = value;
+	apic->base_address = apic->vcpu->arch.apic_base &
 			     MSR_IA32_APICBASE_BASE;
 
 	/* with FSB delivery interrupt, we can restart APIC functionality */
 	apic_debug("apic base msr is 0x%016" PRIx64 ", and base address is "
-		   "0x%lx.\n", apic->apic_base, apic->base_address);
+		   "0x%lx.\n", apic->vcpu->arch.apic_base, apic->base_address);
 
 }
 
 u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu)
 {
-	return vcpu->apic_base;
+	return vcpu->arch.apic_base;
 }
 EXPORT_SYMBOL_GPL(kvm_lapic_get_base);
 
@@ -841,7 +868,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
 	apic_debug("%s\n", __FUNCTION__);
 
 	ASSERT(vcpu);
-	apic = vcpu->apic;
+	apic = vcpu->arch.apic;
 	ASSERT(apic != NULL);
 
 	/* Stop the timer in case it's a reset to an active apic */
@@ -872,19 +899,19 @@
 	update_divide_count(apic);
 	atomic_set(&apic->timer.pending, 0);
 	if (vcpu->vcpu_id == 0)
-		vcpu->apic_base |= MSR_IA32_APICBASE_BSP;
+		vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP;
 	apic_update_ppr(apic);
 
 	apic_debug(KERN_INFO "%s: vcpu=%p, id=%d, base_msr="
 		   "0x%016" PRIx64 ", base_address=0x%0lx.\n", __FUNCTION__,
 		   vcpu, kvm_apic_id(apic),
-		   vcpu->apic_base, apic->base_address);
+		   vcpu->arch.apic_base, apic->base_address);
 }
 EXPORT_SYMBOL_GPL(kvm_lapic_reset);
 
 int kvm_lapic_enabled(struct kvm_vcpu *vcpu)
 {
-	struct kvm_lapic *apic = (struct kvm_lapic *)vcpu->apic;
+	struct kvm_lapic *apic = vcpu->arch.apic;
 	int ret = 0;
 
 	if (!apic)
@@ -908,9 +935,8 @@ static int __apic_timer_fn(struct kvm_lapic *apic)
 	wait_queue_head_t *q = &apic->vcpu->wq;
 
 	atomic_inc(&apic->timer.pending);
-	if (waitqueue_active(q))
-	{
-		apic->vcpu->mp_state = VCPU_MP_STATE_RUNNABLE;
+	if (waitqueue_active(q)) {
+		apic->vcpu->arch.mp_state = VCPU_MP_STATE_RUNNABLE;
 		wake_up_interruptible(q);
 	}
 	if (apic_lvtt_period(apic)) {
@@ -956,13 +982,13 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu)
 	if (!apic)
 		goto nomem;
 
-	vcpu->apic = apic;
+	vcpu->arch.apic = apic;
 
 	apic->regs_page = alloc_page(GFP_KERNEL);
 	if (apic->regs_page == NULL) {
 		printk(KERN_ERR "malloc apic regs error for vcpu %x\n",
 		       vcpu->vcpu_id);
-		goto nomem;
+		goto nomem_free_apic;
 	}
 	apic->regs = page_address(apic->regs_page);
 	memset(apic->regs, 0, PAGE_SIZE);
@@ -971,7 +997,7 @@
 	hrtimer_init(&apic->timer.dev, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
 	apic->timer.dev.function = apic_timer_fn;
 	apic->base_address = APIC_DEFAULT_PHYS_BASE;
-	vcpu->apic_base = APIC_DEFAULT_PHYS_BASE;
+	vcpu->arch.apic_base = APIC_DEFAULT_PHYS_BASE;
 
 	kvm_lapic_reset(vcpu);
 	apic->dev.read = apic_mmio_read;
@@ -980,15 +1006,16 @@
 	apic->dev.private = apic;
 
 	return 0;
+nomem_free_apic:
+	kfree(apic);
 nomem:
-	kvm_free_apic(apic);
 	return -ENOMEM;
 }
 EXPORT_SYMBOL_GPL(kvm_create_lapic);
 
 int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu)
 {
-	struct kvm_lapic *apic = vcpu->apic;
+	struct kvm_lapic *apic = vcpu->arch.apic;
 	int highest_irr;
 
 	if (!apic || !apic_enabled(apic))
@@ -1004,11 +1031,11 @@ int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu)
 
 int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu)
 {
-	u32 lvt0 = apic_get_reg(vcpu->apic, APIC_LVT0);
+	u32 lvt0 = apic_get_reg(vcpu->arch.apic, APIC_LVT0);
 	int r = 0;
 
 	if (vcpu->vcpu_id == 0) {
-		if (!apic_hw_enabled(vcpu->apic))
+		if (!apic_hw_enabled(vcpu->arch.apic))
 			r = 1;
 		if ((lvt0 & APIC_LVT_MASKED) == 0 &&
 		    GET_APIC_DELIVERY_MODE(lvt0) == APIC_MODE_EXTINT)
@@ -1019,7 +1046,7 @@
 
 void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu)
 {
-	struct kvm_lapic *apic = vcpu->apic;
+	struct kvm_lapic *apic = vcpu->arch.apic;
 
 	if (apic && apic_lvt_enabled(apic, APIC_LVTT) &&
 	    atomic_read(&apic->timer.pending) > 0) {
@@ -1030,7 +1057,7 @@ void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu)
 
 void kvm_apic_timer_intr_post(struct kvm_vcpu *vcpu, int vec)
 {
-	struct kvm_lapic *apic = vcpu->apic;
+	struct kvm_lapic *apic = vcpu->arch.apic;
 
 	if (apic && apic_lvt_vector(apic, APIC_LVTT) == vec)
 		apic->timer.last_update = ktime_add_ns(
@@ -1041,7 +1068,7 @@ void kvm_apic_timer_intr_post(struct kvm_vcpu *vcpu, int vec)
 int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu)
 {
 	int vector = kvm_apic_has_interrupt(vcpu);
-	struct kvm_lapic *apic = vcpu->apic;
+	struct kvm_lapic *apic = vcpu->arch.apic;
 
 	if (vector == -1)
 		return -1;
@@ -1054,9 +1081,9 @@ int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu)
 
 void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu)
 {
-	struct kvm_lapic *apic = vcpu->apic;
+	struct kvm_lapic *apic = vcpu->arch.apic;
 
-	apic->base_address = vcpu->apic_base &
+	apic->base_address = vcpu->arch.apic_base &
 			     MSR_IA32_APICBASE_BASE;
 	apic_set_reg(apic, APIC_LVR, APIC_VERSION);
 	apic_update_ppr(apic);
@@ -1065,9 +1092,9 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu)
 	start_apic_timer(apic);
 }
 
-void kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
+void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
 {
-	struct kvm_lapic *apic = vcpu->apic;
+	struct kvm_lapic *apic = vcpu->arch.apic;
 	struct hrtimer *timer;
 
 	if (!apic)
@@ -1077,4 +1104,51 @@ void kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
 	if (hrtimer_cancel(timer))
 		hrtimer_start(timer, timer->expires, HRTIMER_MODE_ABS);
 }
-EXPORT_SYMBOL_GPL(kvm_migrate_apic_timer);
+
+void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu)
+{
+	u32 data;
+	void *vapic;
+
+	if (!irqchip_in_kernel(vcpu->kvm) || !vcpu->arch.apic->vapic_addr)
+		return;
+
+	vapic = kmap_atomic(vcpu->arch.apic->vapic_page, KM_USER0);
+	data = *(u32 *)(vapic + offset_in_page(vcpu->arch.apic->vapic_addr));
+	kunmap_atomic(vapic, KM_USER0);
+
+	apic_set_tpr(vcpu->arch.apic, data & 0xff);
+}
+
+void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu)
+{
+	u32 data, tpr;
+	int max_irr, max_isr;
+	struct kvm_lapic *apic;
+	void *vapic;
+
+	if (!irqchip_in_kernel(vcpu->kvm) || !vcpu->arch.apic->vapic_addr)
+		return;
+
+	apic = vcpu->arch.apic;
+	tpr = apic_get_reg(apic, APIC_TASKPRI) & 0xff;
+	max_irr = apic_find_highest_irr(apic);
+	if (max_irr < 0)
+		max_irr = 0;
+	max_isr = apic_find_highest_isr(apic);
+	if (max_isr < 0)
+		max_isr = 0;
+	data = (tpr & 0xff) | ((max_isr & 0xf0) << 8) | (max_irr << 24);
+
+	vapic = kmap_atomic(vcpu->arch.apic->vapic_page, KM_USER0);
+	*(u32 *)(vapic + offset_in_page(vcpu->arch.apic->vapic_addr)) = data;
+	kunmap_atomic(vapic, KM_USER0);
+}
+
+void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr)
+{
+	if (!irqchip_in_kernel(vcpu->kvm))
+		return;
+
+	vcpu->arch.apic->vapic_addr = vapic_addr;
+}
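
kvm_lapic_sync_to_vapic() packs three APIC fields into the single 32-bit word it writes to the guest-visible vapic page: TPR in bits 0-7, the masked high nibble of the highest in-service vector in bits 8-15, and the highest pending (IRR) vector in bits 24-31. A standalone arithmetic check of that packing, with made-up register values:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* Hypothetical values for illustration only. */
	uint32_t tpr = 0x20, max_isr = 0x31, max_irr = 0x45;

	/* Same expression as in kvm_lapic_sync_to_vapic(). */
	uint32_t data = (tpr & 0xff) | ((max_isr & 0xf0) << 8) | (max_irr << 24);

	printf("vapic word = 0x%08x\n", data); /* prints 0x45003020 */
	return 0;
}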
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
new file mode 100644
index 000000000000..676c396c9cee
--- /dev/null
+++ b/arch/x86/kvm/lapic.h
@@ -0,0 +1,50 @@
+#ifndef __KVM_X86_LAPIC_H
+#define __KVM_X86_LAPIC_H
+
+#include "iodev.h"
+
+#include <linux/kvm_host.h>
+
+struct kvm_lapic {
+	unsigned long base_address;
+	struct kvm_io_device dev;
+	struct {
+		atomic_t pending;
+		s64 period;	/* unit: ns */
+		u32 divide_count;
+		ktime_t last_update;
+		struct hrtimer dev;
+	} timer;
+	struct kvm_vcpu *vcpu;
+	struct page *regs_page;
+	void *regs;
+	gpa_t vapic_addr;
+	struct page *vapic_page;
+};
+int kvm_create_lapic(struct kvm_vcpu *vcpu);
+void kvm_free_lapic(struct kvm_vcpu *vcpu);
+
+int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu);
+int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu);
+int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu);
+void kvm_lapic_reset(struct kvm_vcpu *vcpu);
+u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu);
+void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8);
+void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value);
+
+int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest);
+int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda);
+int kvm_apic_set_irq(struct kvm_vcpu *vcpu, u8 vec, u8 trig);
+
+u64 kvm_get_apic_base(struct kvm_vcpu *vcpu);
+void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data);
+void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu);
+int kvm_lapic_enabled(struct kvm_vcpu *vcpu);
+int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu);
+void kvm_apic_timer_intr_post(struct kvm_vcpu *vcpu, int vec);
+
+void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr);
+void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu);
+void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu);
+
+#endif
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
new file mode 100644
index 000000000000..8efdcdbebb03
--- /dev/null
+++ b/arch/x86/kvm/mmu.c
@@ -0,0 +1,1885 @@
+/*
+ * Kernel-based Virtual Machine driver for Linux
+ *
+ * This module enables machines with Intel VT-x extensions to run virtual
+ * machines without emulation or binary translation.
+ *
+ * MMU support
+ *
+ * Copyright (C) 2006 Qumranet, Inc.
+ *
+ * Authors:
+ *   Yaniv Kamay  <yaniv@qumranet.com>
+ *   Avi Kivity   <avi@qumranet.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#include "vmx.h"
+#include "mmu.h"
+
+#include <linux/kvm_host.h>
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/mm.h>
+#include <linux/highmem.h>
+#include <linux/module.h>
+#include <linux/swap.h>
+
+#include <asm/page.h>
+#include <asm/cmpxchg.h>
+#include <asm/io.h>
+
+#undef MMU_DEBUG
+
+#undef AUDIT
+
+#ifdef AUDIT
+static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg);
+#else
+static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) {}
+#endif
+
+#ifdef MMU_DEBUG
+
+#define pgprintk(x...) do { if (dbg) printk(x); } while (0)
+#define rmap_printk(x...) do { if (dbg) printk(x); } while (0)
+
+#else
+
+#define pgprintk(x...) do { } while (0)
+#define rmap_printk(x...) do { } while (0)
+
+#endif
+
+#if defined(MMU_DEBUG) || defined(AUDIT)
+static int dbg = 1;
+#endif
+
+#ifndef MMU_DEBUG
+#define ASSERT(x) do { } while (0)
+#else
+#define ASSERT(x)							\
+	if (!(x)) {							\
+		printk(KERN_WARNING "assertion failed %s:%d: %s\n",	\
+		       __FILE__, __LINE__, #x);				\
+	}
+#endif
+
+#define PT64_PT_BITS 9
+#define PT64_ENT_PER_PAGE (1 << PT64_PT_BITS)
+#define PT32_PT_BITS 10
+#define PT32_ENT_PER_PAGE (1 << PT32_PT_BITS)
+
+#define PT_WRITABLE_SHIFT 1
+
+#define PT_PRESENT_MASK (1ULL << 0)
+#define PT_WRITABLE_MASK (1ULL << PT_WRITABLE_SHIFT)
+#define PT_USER_MASK (1ULL << 2)
+#define PT_PWT_MASK (1ULL << 3)
+#define PT_PCD_MASK (1ULL << 4)
+#define PT_ACCESSED_MASK (1ULL << 5)
+#define PT_DIRTY_MASK (1ULL << 6)
+#define PT_PAGE_SIZE_MASK (1ULL << 7)
+#define PT_PAT_MASK (1ULL << 7)
+#define PT_GLOBAL_MASK (1ULL << 8)
+#define PT64_NX_SHIFT 63
+#define PT64_NX_MASK (1ULL << PT64_NX_SHIFT)
+
+#define PT_PAT_SHIFT 7
+#define PT_DIR_PAT_SHIFT 12
+#define PT_DIR_PAT_MASK (1ULL << PT_DIR_PAT_SHIFT)
+
+#define PT32_DIR_PSE36_SIZE 4
+#define PT32_DIR_PSE36_SHIFT 13
+#define PT32_DIR_PSE36_MASK \
+	(((1ULL << PT32_DIR_PSE36_SIZE) - 1) << PT32_DIR_PSE36_SHIFT)
+
+
+#define PT_FIRST_AVAIL_BITS_SHIFT 9
+#define PT64_SECOND_AVAIL_BITS_SHIFT 52
+
+#define PT_SHADOW_IO_MARK (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
+
+#define VALID_PAGE(x) ((x) != INVALID_PAGE)
+
+#define PT64_LEVEL_BITS 9
+
+#define PT64_LEVEL_SHIFT(level) \
+		(PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS)
+
+#define PT64_LEVEL_MASK(level) \
+		(((1ULL << PT64_LEVEL_BITS) - 1) << PT64_LEVEL_SHIFT(level))
+
+#define PT64_INDEX(address, level)\
+	(((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1))
+
+
+#define PT32_LEVEL_BITS 10
+
+#define PT32_LEVEL_SHIFT(level) \
+		(PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS)
+
+#define PT32_LEVEL_MASK(level) \
+		(((1ULL << PT32_LEVEL_BITS) - 1) << PT32_LEVEL_SHIFT(level))
+
+#define PT32_INDEX(address, level)\
+	(((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1))
+
+
+#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))
+#define PT64_DIR_BASE_ADDR_MASK \
+	(PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + PT64_LEVEL_BITS)) - 1))
+
+#define PT32_BASE_ADDR_MASK PAGE_MASK
+#define PT32_DIR_BASE_ADDR_MASK \
+	(PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1))
+
+#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \
+			| PT64_NX_MASK)
+
+#define PFERR_PRESENT_MASK (1U << 0)
+#define PFERR_WRITE_MASK (1U << 1)
+#define PFERR_USER_MASK (1U << 2)
+#define PFERR_FETCH_MASK (1U << 4)
+
+#define PT64_ROOT_LEVEL 4
+#define PT32_ROOT_LEVEL 2
+#define PT32E_ROOT_LEVEL 3
+
+#define PT_DIRECTORY_LEVEL 2
+#define PT_PAGE_TABLE_LEVEL 1
+
+#define RMAP_EXT 4
+
+#define ACC_EXEC_MASK    1
+#define ACC_WRITE_MASK   PT_WRITABLE_MASK
+#define ACC_USER_MASK    PT_USER_MASK
+#define ACC_ALL          (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)
+
+struct kvm_rmap_desc {
+	u64 *shadow_ptes[RMAP_EXT];
+	struct kvm_rmap_desc *more;
+};
+
+static struct kmem_cache *pte_chain_cache;
+static struct kmem_cache *rmap_desc_cache;
+static struct kmem_cache *mmu_page_header_cache;
+
+static u64 __read_mostly shadow_trap_nonpresent_pte;
+static u64 __read_mostly shadow_notrap_nonpresent_pte;
+
+void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte)
+{
+	shadow_trap_nonpresent_pte = trap_pte;
+	shadow_notrap_nonpresent_pte = notrap_pte;
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_set_nonpresent_ptes);
+
+static int is_write_protection(struct kvm_vcpu *vcpu)
+{
+	return vcpu->arch.cr0 & X86_CR0_WP;
+}
+
+static int is_cpuid_PSE36(void)
+{
+	return 1;
+}
+
+static int is_nx(struct kvm_vcpu *vcpu)
+{
+	return vcpu->arch.shadow_efer & EFER_NX;
+}
+
+static int is_present_pte(unsigned long pte)
+{
+	return pte & PT_PRESENT_MASK;
+}
+
+static int is_shadow_present_pte(u64 pte)
+{
+	pte &= ~PT_SHADOW_IO_MARK;
+	return pte != shadow_trap_nonpresent_pte
+		&& pte != shadow_notrap_nonpresent_pte;
+}
+
+static int is_writeble_pte(unsigned long pte)
+{
+	return pte & PT_WRITABLE_MASK;
+}
+
+static int is_dirty_pte(unsigned long pte)
+{
+	return pte & PT_DIRTY_MASK;
+}
+
+static int is_io_pte(unsigned long pte)
+{
+	return pte & PT_SHADOW_IO_MARK;
+}
+
+static int is_rmap_pte(u64 pte)
+{
+	return pte != shadow_trap_nonpresent_pte
+		&& pte != shadow_notrap_nonpresent_pte;
+}
+
+static gfn_t pse36_gfn_delta(u32 gpte)
+{
+	int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT;
+
+	return (gpte & PT32_DIR_PSE36_MASK) << shift;
+}
+
+static void set_shadow_pte(u64 *sptep, u64 spte)
+{
+#ifdef CONFIG_X86_64
+	set_64bit((unsigned long *)sptep, spte);
+#else
+	set_64bit((unsigned long long *)sptep, spte);
+#endif
+}
+
+static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
+				  struct kmem_cache *base_cache, int min)
+{
+	void *obj;
+
+	if (cache->nobjs >= min)
+		return 0;
+	while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
+		obj = kmem_cache_zalloc(base_cache, GFP_KERNEL);
+		if (!obj)
+			return -ENOMEM;
+		cache->objects[cache->nobjs++] = obj;
+	}
+	return 0;
+}
+
+static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
+{
+	while (mc->nobjs)
+		kfree(mc->objects[--mc->nobjs]);
+}
+
+static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache,
+				       int min)
+{
+	struct page *page;
+
+	if (cache->nobjs >= min)
+		return 0;
+	while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
+		page = alloc_page(GFP_KERNEL);
+		if (!page)
+			return -ENOMEM;
+		set_page_private(page, 0);
+		cache->objects[cache->nobjs++] = page_address(page);
+	}
+	return 0;
+}
+
+static void mmu_free_memory_cache_page(struct kvm_mmu_memory_cache *mc)
+{
+	while (mc->nobjs)
+		free_page((unsigned long)mc->objects[--mc->nobjs]);
+}
+
+static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu)
+{
+	int r;
+
+	r = mmu_topup_memory_cache(&vcpu->arch.mmu_pte_chain_cache,
+				   pte_chain_cache, 4);
+	if (r)
+		goto out;
+	r = mmu_topup_memory_cache(&vcpu->arch.mmu_rmap_desc_cache,
+				   rmap_desc_cache, 1);
+	if (r)
+		goto out;
+	r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8);
+	if (r)
+		goto out;
+	r = mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache,
+				   mmu_page_header_cache, 4);
+out:
+	return r;
+}
+
+static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
+{
+	mmu_free_memory_cache(&vcpu->arch.mmu_pte_chain_cache);
+	mmu_free_memory_cache(&vcpu->arch.mmu_rmap_desc_cache);
+	mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache);
+	mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache);
+}
+
+static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc,
+				    size_t size)
+{
+	void *p;
+
+	BUG_ON(!mc->nobjs);
+	p = mc->objects[--mc->nobjs];
+	memset(p, 0, size);
+	return p;
+}
+
+static struct kvm_pte_chain *mmu_alloc_pte_chain(struct kvm_vcpu *vcpu)
+{
+	return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_chain_cache,
+				      sizeof(struct kvm_pte_chain));
+}
+
+static void mmu_free_pte_chain(struct kvm_pte_chain *pc)
+{
+	kfree(pc);
+}
+
+static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu)
+{
+	return mmu_memory_cache_alloc(&vcpu->arch.mmu_rmap_desc_cache,
+				      sizeof(struct kvm_rmap_desc));
+}
+
+static void mmu_free_rmap_desc(struct kvm_rmap_desc *rd)
+{
+	kfree(rd);
+}
+
+/*
+ * Take gfn and return the reverse mapping to it.
+ * Note: gfn must be unaliased before this function get called
+ */
+
+static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn)
+{
+	struct kvm_memory_slot *slot;
+
+	slot = gfn_to_memslot(kvm, gfn);
+	return &slot->rmap[gfn - slot->base_gfn];
+}
+
+/*
+ * Reverse mapping data structures:
+ *
+ * If rmapp bit zero is zero, then rmapp point to the shadw page table entry
+ * that points to page_address(page).
+ *
+ * If rmapp bit zero is one, (then rmap & ~1) points to a struct kvm_rmap_desc
+ * containing more mappings.
+ */
+static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
+{
+	struct kvm_mmu_page *sp;
+	struct kvm_rmap_desc *desc;
+	unsigned long *rmapp;
+	int i;
+
+	if (!is_rmap_pte(*spte))
+		return;
+	gfn = unalias_gfn(vcpu->kvm, gfn);
+	sp = page_header(__pa(spte));
+	sp->gfns[spte - sp->spt] = gfn;
+	rmapp = gfn_to_rmap(vcpu->kvm, gfn);
+	if (!*rmapp) {
+		rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte);
+		*rmapp = (unsigned long)spte;
+	} else if (!(*rmapp & 1)) {
+		rmap_printk("rmap_add: %p %llx 1->many\n", spte, *spte);
+		desc = mmu_alloc_rmap_desc(vcpu);
+		desc->shadow_ptes[0] = (u64 *)*rmapp;
+		desc->shadow_ptes[1] = spte;
+		*rmapp = (unsigned long)desc | 1;
+	} else {
+		rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte);
+		desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
+		while (desc->shadow_ptes[RMAP_EXT-1] && desc->more)
+			desc = desc->more;
+		if (desc->shadow_ptes[RMAP_EXT-1]) {
+			desc->more = mmu_alloc_rmap_desc(vcpu);
+			desc = desc->more;
+		}
+		for (i = 0; desc->shadow_ptes[i]; ++i)
+			;
+		desc->shadow_ptes[i] = spte;
+	}
+}
+
+static void rmap_desc_remove_entry(unsigned long *rmapp,
+				   struct kvm_rmap_desc *desc,
+				   int i,
+				   struct kvm_rmap_desc *prev_desc)
+{
+	int j;
+
+	for (j = RMAP_EXT - 1; !desc->shadow_ptes[j] && j > i; --j)
+		;
+	desc->shadow_ptes[i] = desc->shadow_ptes[j];
+	desc->shadow_ptes[j] = NULL;
+	if (j != 0)
+		return;
+	if (!prev_desc && !desc->more)
+		*rmapp = (unsigned long)desc->shadow_ptes[0];
+	else
+		if (prev_desc)
+			prev_desc->more = desc->more;
+		else
+			*rmapp = (unsigned long)desc->more | 1;
+	mmu_free_rmap_desc(desc);
+}
+
+static void rmap_remove(struct kvm *kvm, u64 *spte)
+{
+	struct kvm_rmap_desc *desc;
+	struct kvm_rmap_desc *prev_desc;
+	struct kvm_mmu_page *sp;
+	struct page *page;
+	unsigned long *rmapp;
+	int i;
+
+	if (!is_rmap_pte(*spte))
+		return;
+	sp = page_header(__pa(spte));
+	page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT);
+	mark_page_accessed(page);
+	if (is_writeble_pte(*spte))
+		kvm_release_page_dirty(page);
+	else
+		kvm_release_page_clean(page);
+	rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt]);
+	if (!*rmapp) {
+		printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte);
+		BUG();
+	} else if (!(*rmapp & 1)) {
+		rmap_printk("rmap_remove: %p %llx 1->0\n", spte, *spte);
+		if ((u64 *)*rmapp != spte) {
+			printk(KERN_ERR "rmap_remove: %p %llx 1->BUG\n",
+			       spte, *spte);
+			BUG();
+		}
+		*rmapp = 0;
+	} else {
+		rmap_printk("rmap_remove: %p %llx many->many\n", spte, *spte);
+		desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
+		prev_desc = NULL;
+		while (desc) {
+			for (i = 0; i < RMAP_EXT && desc->shadow_ptes[i]; ++i)
+				if (desc->shadow_ptes[i] == spte) {
+					rmap_desc_remove_entry(rmapp,
+							       desc, i,
+							       prev_desc);
+					return;
+				}
+			prev_desc = desc;
+			desc = desc->more;
+		}
+		BUG();
+	}
+}
+
+static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte)
+{
+	struct kvm_rmap_desc *desc;
+	struct kvm_rmap_desc *prev_desc;
+	u64 *prev_spte;
+	int i;
+
+	if (!*rmapp)
+		return NULL;
+	else if (!(*rmapp & 1)) {
+		if (!spte)
+			return (u64 *)*rmapp;
+		return NULL;
+	}
+	desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
+	prev_desc = NULL;
+	prev_spte = NULL;
+	while (desc) {
+		for (i = 0; i < RMAP_EXT && desc->shadow_ptes[i]; ++i) {
+			if (prev_spte == spte)
+				return desc->shadow_ptes[i];
+			prev_spte = desc->shadow_ptes[i];
+		}
+		desc = desc->more;
+	}
+	return NULL;
+}
+
511 | static void rmap_write_protect(struct kvm *kvm, u64 gfn) | ||
512 | { | ||
513 | unsigned long *rmapp; | ||
514 | u64 *spte; | ||
515 | int write_protected = 0; | ||
516 | |||
517 | gfn = unalias_gfn(kvm, gfn); | ||
518 | rmapp = gfn_to_rmap(kvm, gfn); | ||
519 | |||
520 | spte = rmap_next(kvm, rmapp, NULL); | ||
521 | while (spte) { | ||
522 | BUG_ON(!spte); | ||
523 | BUG_ON(!(*spte & PT_PRESENT_MASK)); | ||
524 | rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte); | ||
525 | if (is_writeble_pte(*spte)) { | ||
526 | set_shadow_pte(spte, *spte & ~PT_WRITABLE_MASK); | ||
527 | write_protected = 1; | ||
528 | } | ||
529 | spte = rmap_next(kvm, rmapp, spte); | ||
530 | } | ||
531 | if (write_protected) | ||
532 | kvm_flush_remote_tlbs(kvm); | ||
533 | } | ||
534 | |||
535 | #ifdef MMU_DEBUG | ||
536 | static int is_empty_shadow_page(u64 *spt) | ||
537 | { | ||
538 | u64 *pos; | ||
539 | u64 *end; | ||
540 | |||
541 | for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++) | ||
542 | if ((*pos & ~PT_SHADOW_IO_MARK) != shadow_trap_nonpresent_pte) { | ||
543 | printk(KERN_ERR "%s: %p %llx\n", __FUNCTION__, | ||
544 | pos, *pos); | ||
545 | return 0; | ||
546 | } | ||
547 | return 1; | ||
548 | } | ||
549 | #endif | ||
550 | |||
551 | static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp) | ||
552 | { | ||
553 | ASSERT(is_empty_shadow_page(sp->spt)); | ||
554 | list_del(&sp->link); | ||
555 | __free_page(virt_to_page(sp->spt)); | ||
556 | __free_page(virt_to_page(sp->gfns)); | ||
557 | kfree(sp); | ||
558 | ++kvm->arch.n_free_mmu_pages; | ||
559 | } | ||
560 | |||
561 | static unsigned kvm_page_table_hashfn(gfn_t gfn) | ||
562 | { | ||
563 | return gfn; | ||
564 | } | ||
565 | |||
566 | static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, | ||
567 | u64 *parent_pte) | ||
568 | { | ||
569 | struct kvm_mmu_page *sp; | ||
570 | |||
571 | sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache, sizeof *sp); | ||
572 | sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE); | ||
573 | sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE); | ||
574 | set_page_private(virt_to_page(sp->spt), (unsigned long)sp); | ||
575 | list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages); | ||
576 | ASSERT(is_empty_shadow_page(sp->spt)); | ||
577 | sp->slot_bitmap = 0; | ||
578 | sp->multimapped = 0; | ||
579 | sp->parent_pte = parent_pte; | ||
580 | --vcpu->kvm->arch.n_free_mmu_pages; | ||
581 | return sp; | ||
582 | } | ||
583 | |||
584 | static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu, | ||
585 | struct kvm_mmu_page *sp, u64 *parent_pte) | ||
586 | { | ||
587 | struct kvm_pte_chain *pte_chain; | ||
588 | struct hlist_node *node; | ||
589 | int i; | ||
590 | |||
591 | if (!parent_pte) | ||
592 | return; | ||
593 | if (!sp->multimapped) { | ||
594 | u64 *old = sp->parent_pte; | ||
595 | |||
596 | if (!old) { | ||
597 | sp->parent_pte = parent_pte; | ||
598 | return; | ||
599 | } | ||
600 | sp->multimapped = 1; | ||
601 | pte_chain = mmu_alloc_pte_chain(vcpu); | ||
602 | INIT_HLIST_HEAD(&sp->parent_ptes); | ||
603 | hlist_add_head(&pte_chain->link, &sp->parent_ptes); | ||
604 | pte_chain->parent_ptes[0] = old; | ||
605 | } | ||
606 | hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link) { | ||
607 | if (pte_chain->parent_ptes[NR_PTE_CHAIN_ENTRIES-1]) | ||
608 | continue; | ||
609 | for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) | ||
610 | if (!pte_chain->parent_ptes[i]) { | ||
611 | pte_chain->parent_ptes[i] = parent_pte; | ||
612 | return; | ||
613 | } | ||
614 | } | ||
615 | pte_chain = mmu_alloc_pte_chain(vcpu); | ||
616 | BUG_ON(!pte_chain); | ||
617 | hlist_add_head(&pte_chain->link, &sp->parent_ptes); | ||
618 | pte_chain->parent_ptes[0] = parent_pte; | ||
619 | } | ||
620 | |||
621 | static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp, | ||
622 | u64 *parent_pte) | ||
623 | { | ||
624 | struct kvm_pte_chain *pte_chain; | ||
625 | struct hlist_node *node; | ||
626 | int i; | ||
627 | |||
628 | if (!sp->multimapped) { | ||
629 | BUG_ON(sp->parent_pte != parent_pte); | ||
630 | sp->parent_pte = NULL; | ||
631 | return; | ||
632 | } | ||
633 | hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link) | ||
634 | for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) { | ||
635 | if (!pte_chain->parent_ptes[i]) | ||
636 | break; | ||
637 | if (pte_chain->parent_ptes[i] != parent_pte) | ||
638 | continue; | ||
639 | while (i + 1 < NR_PTE_CHAIN_ENTRIES | ||
640 | && pte_chain->parent_ptes[i + 1]) { | ||
641 | pte_chain->parent_ptes[i] | ||
642 | = pte_chain->parent_ptes[i + 1]; | ||
643 | ++i; | ||
644 | } | ||
645 | pte_chain->parent_ptes[i] = NULL; | ||
646 | if (i == 0) { | ||
647 | hlist_del(&pte_chain->link); | ||
648 | mmu_free_pte_chain(pte_chain); | ||
649 | if (hlist_empty(&sp->parent_ptes)) { | ||
650 | sp->multimapped = 0; | ||
651 | sp->parent_pte = NULL; | ||
652 | } | ||
653 | } | ||
654 | return; | ||
655 | } | ||
656 | BUG(); | ||
657 | } | ||
658 | |||
659 | static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn) | ||
660 | { | ||
661 | unsigned index; | ||
662 | struct hlist_head *bucket; | ||
663 | struct kvm_mmu_page *sp; | ||
664 | struct hlist_node *node; | ||
665 | |||
666 | pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn); | ||
667 | index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES; | ||
668 | bucket = &kvm->arch.mmu_page_hash[index]; | ||
669 | hlist_for_each_entry(sp, node, bucket, hash_link) | ||
670 | if (sp->gfn == gfn && !sp->role.metaphysical) { | ||
671 | pgprintk("%s: found role %x\n", | ||
672 | __FUNCTION__, sp->role.word); | ||
673 | return sp; | ||
674 | } | ||
675 | return NULL; | ||
676 | } | ||
677 | |||
678 | static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, | ||
679 | gfn_t gfn, | ||
680 | gva_t gaddr, | ||
681 | unsigned level, | ||
682 | int metaphysical, | ||
683 | unsigned access, | ||
684 | u64 *parent_pte, | ||
685 | bool *new_page) | ||
686 | { | ||
687 | union kvm_mmu_page_role role; | ||
688 | unsigned index; | ||
689 | unsigned quadrant; | ||
690 | struct hlist_head *bucket; | ||
691 | struct kvm_mmu_page *sp; | ||
692 | struct hlist_node *node; | ||
693 | |||
694 | role.word = 0; | ||
695 | role.glevels = vcpu->arch.mmu.root_level; | ||
696 | role.level = level; | ||
697 | role.metaphysical = metaphysical; | ||
698 | role.access = access; | ||
699 | if (vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) { | ||
700 | quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level)); | ||
701 | quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1; | ||
702 | role.quadrant = quadrant; | ||
703 | } | ||
704 | pgprintk("%s: looking gfn %lx role %x\n", __FUNCTION__, | ||
705 | gfn, role.word); | ||
706 | index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES; | ||
707 | bucket = &vcpu->kvm->arch.mmu_page_hash[index]; | ||
708 | hlist_for_each_entry(sp, node, bucket, hash_link) | ||
709 | if (sp->gfn == gfn && sp->role.word == role.word) { | ||
710 | mmu_page_add_parent_pte(vcpu, sp, parent_pte); | ||
711 | pgprintk("%s: found\n", __FUNCTION__); | ||
712 | return sp; | ||
713 | } | ||
714 | ++vcpu->kvm->stat.mmu_cache_miss; | ||
715 | sp = kvm_mmu_alloc_page(vcpu, parent_pte); | ||
716 | if (!sp) | ||
717 | return sp; | ||
718 | pgprintk("%s: adding gfn %lx role %x\n", __FUNCTION__, gfn, role.word); | ||
719 | sp->gfn = gfn; | ||
720 | sp->role = role; | ||
721 | hlist_add_head(&sp->hash_link, bucket); | ||
722 | vcpu->arch.mmu.prefetch_page(vcpu, sp); | ||
723 | if (!metaphysical) | ||
724 | rmap_write_protect(vcpu->kvm, gfn); | ||
725 | if (new_page) | ||
726 | *new_page = 1; | ||
727 | return sp; | ||
728 | } | ||
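
When a 32-bit guest is shadowed, each 1024-entry guest page table is covered by more than one 512-entry shadow table, and role.quadrant records which slice a given shadow page represents. A standalone sketch of the quadrant computation above, using the standard x86 constants (4 KiB pages, 9 index bits per 64-bit table, 10 per 32-bit table):

    #include <stdio.h>

    #define PAGE_SHIFT   12
    #define PT64_PT_BITS 9
    #define PT32_PT_BITS 10

    /* Which quadrant of the wider 32-bit guest table does this shadow
     * page cover?  Mirrors the computation in kvm_mmu_get_page(). */
    static unsigned quadrant(unsigned long gaddr, unsigned level)
    {
    	unsigned q = gaddr >> (PAGE_SHIFT + PT64_PT_BITS * level);

    	return q & ((1u << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1);
    }

    int main(void)
    {
    	/* At level 1, each 4 MiB guest page table is split between two
    	 * 2 MiB shadow tables, so the quadrant is 0 or 1. */
    	printf("level 1 quadrant of 0xc0000000: %u\n",
    	       quadrant(0xc0000000ul, 1));   /* 0: lower half */
    	printf("level 1 quadrant of 0xc0200000: %u\n",
    	       quadrant(0xc0200000ul, 1));   /* 1: upper half */
    	return 0;
    }

Because the quadrant is folded into role.word, the same guest gfn can legitimately have several shadow pages in the hash, one per quadrant.
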
729 | |||
730 | static void kvm_mmu_page_unlink_children(struct kvm *kvm, | ||
731 | struct kvm_mmu_page *sp) | ||
732 | { | ||
733 | unsigned i; | ||
734 | u64 *pt; | ||
735 | u64 ent; | ||
736 | |||
737 | pt = sp->spt; | ||
738 | |||
739 | if (sp->role.level == PT_PAGE_TABLE_LEVEL) { | ||
740 | for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { | ||
741 | if (is_shadow_present_pte(pt[i])) | ||
742 | rmap_remove(kvm, &pt[i]); | ||
743 | pt[i] = shadow_trap_nonpresent_pte; | ||
744 | } | ||
745 | kvm_flush_remote_tlbs(kvm); | ||
746 | return; | ||
747 | } | ||
748 | |||
749 | for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { | ||
750 | ent = pt[i]; | ||
751 | |||
752 | pt[i] = shadow_trap_nonpresent_pte; | ||
753 | if (!is_shadow_present_pte(ent)) | ||
754 | continue; | ||
755 | ent &= PT64_BASE_ADDR_MASK; | ||
756 | mmu_page_remove_parent_pte(page_header(ent), &pt[i]); | ||
757 | } | ||
758 | kvm_flush_remote_tlbs(kvm); | ||
759 | } | ||
760 | |||
761 | static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte) | ||
762 | { | ||
763 | mmu_page_remove_parent_pte(sp, parent_pte); | ||
764 | } | ||
765 | |||
766 | static void kvm_mmu_reset_last_pte_updated(struct kvm *kvm) | ||
767 | { | ||
768 | int i; | ||
769 | |||
770 | for (i = 0; i < KVM_MAX_VCPUS; ++i) | ||
771 | if (kvm->vcpus[i]) | ||
772 | kvm->vcpus[i]->arch.last_pte_updated = NULL; | ||
773 | } | ||
774 | |||
775 | static void kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp) | ||
776 | { | ||
777 | u64 *parent_pte; | ||
778 | |||
779 | ++kvm->stat.mmu_shadow_zapped; | ||
780 | while (sp->multimapped || sp->parent_pte) { | ||
781 | if (!sp->multimapped) | ||
782 | parent_pte = sp->parent_pte; | ||
783 | else { | ||
784 | struct kvm_pte_chain *chain; | ||
785 | |||
786 | chain = container_of(sp->parent_ptes.first, | ||
787 | struct kvm_pte_chain, link); | ||
788 | parent_pte = chain->parent_ptes[0]; | ||
789 | } | ||
790 | BUG_ON(!parent_pte); | ||
791 | kvm_mmu_put_page(sp, parent_pte); | ||
792 | set_shadow_pte(parent_pte, shadow_trap_nonpresent_pte); | ||
793 | } | ||
794 | kvm_mmu_page_unlink_children(kvm, sp); | ||
795 | if (!sp->root_count) { | ||
796 | hlist_del(&sp->hash_link); | ||
797 | kvm_mmu_free_page(kvm, sp); | ||
798 | } else | ||
799 | list_move(&sp->link, &kvm->arch.active_mmu_pages); | ||
800 | kvm_mmu_reset_last_pte_updated(kvm); | ||
801 | } | ||
802 | |||
803 | /* | ||
804 | * Changing the number of mmu pages allocated to the vm | ||
805 | * Note: if kvm_nr_mmu_pages is too small, you will get a deadlock | ||
806 | */ | ||
807 | void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages) | ||
808 | { | ||
809 | /* | ||
810 | * If we set the number of mmu pages to be smaller than the | ||
811 | * number of active pages, we must free some mmu pages before we | ||
812 | * change the value | ||
813 | */ | ||
814 | |||
815 | if ((kvm->arch.n_alloc_mmu_pages - kvm->arch.n_free_mmu_pages) > | ||
816 | kvm_nr_mmu_pages) { | ||
817 | int n_used_mmu_pages = kvm->arch.n_alloc_mmu_pages | ||
818 | - kvm->arch.n_free_mmu_pages; | ||
819 | |||
820 | while (n_used_mmu_pages > kvm_nr_mmu_pages) { | ||
821 | struct kvm_mmu_page *page; | ||
822 | |||
823 | page = container_of(kvm->arch.active_mmu_pages.prev, | ||
824 | struct kvm_mmu_page, link); | ||
825 | kvm_mmu_zap_page(kvm, page); | ||
826 | n_used_mmu_pages--; | ||
827 | } | ||
828 | kvm->arch.n_free_mmu_pages = 0; | ||
829 | } | ||
830 | else | ||
831 | kvm->arch.n_free_mmu_pages += kvm_nr_mmu_pages | ||
832 | - kvm->arch.n_alloc_mmu_pages; | ||
833 | |||
834 | kvm->arch.n_alloc_mmu_pages = kvm_nr_mmu_pages; | ||
835 | } | ||
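
The bookkeeping above maintains the invariant used = n_alloc_mmu_pages - n_free_mmu_pages: shrinking below current usage zaps pages from the tail of the active list and leaves no free budget, while growing simply credits the difference to the free count. A standalone sketch of just that arithmetic, with made-up numbers and no actual page zapping:

    #include <stdio.h>

    /* Recompute the free-page budget when the limit changes, mirroring
     * kvm_mmu_change_mmu_pages() (numbers only, no page freeing). */
    static void change_limit(unsigned *alloc, unsigned *nfree, unsigned limit)
    {
    	unsigned used = *alloc - *nfree;

    	if (used > limit)
    		*nfree = 0;        /* after zapping (used - limit) pages */
    	else
    		*nfree += limit - *alloc;
    	*alloc = limit;
    }

    int main(void)
    {
    	unsigned alloc = 1024, nfree = 200;          /* 824 pages in use */

    	change_limit(&alloc, &nfree, 2048);          /* grow the pool */
    	printf("alloc=%u nfree=%u\n", alloc, nfree); /* 2048, 1224 */

    	change_limit(&alloc, &nfree, 512);           /* shrink below usage */
    	printf("alloc=%u nfree=%u\n", alloc, nfree); /* 512, 0 */
    	return 0;
    }
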
836 | |||
837 | static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) | ||
838 | { | ||
839 | unsigned index; | ||
840 | struct hlist_head *bucket; | ||
841 | struct kvm_mmu_page *sp; | ||
842 | struct hlist_node *node, *n; | ||
843 | int r; | ||
844 | |||
845 | pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn); | ||
846 | r = 0; | ||
847 | index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES; | ||
848 | bucket = &kvm->arch.mmu_page_hash[index]; | ||
849 | hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) | ||
850 | if (sp->gfn == gfn && !sp->role.metaphysical) { | ||
851 | pgprintk("%s: gfn %lx role %x\n", __FUNCTION__, gfn, | ||
852 | sp->role.word); | ||
853 | kvm_mmu_zap_page(kvm, sp); | ||
854 | r = 1; | ||
855 | } | ||
856 | return r; | ||
857 | } | ||
858 | |||
859 | static void mmu_unshadow(struct kvm *kvm, gfn_t gfn) | ||
860 | { | ||
861 | struct kvm_mmu_page *sp; | ||
862 | |||
863 | while ((sp = kvm_mmu_lookup_page(kvm, gfn)) != NULL) { | ||
864 | pgprintk("%s: zap %lx %x\n", __FUNCTION__, gfn, sp->role.word); | ||
865 | kvm_mmu_zap_page(kvm, sp); | ||
866 | } | ||
867 | } | ||
868 | |||
869 | static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn) | ||
870 | { | ||
871 | int slot = memslot_id(kvm, gfn_to_memslot(kvm, gfn)); | ||
872 | struct kvm_mmu_page *sp = page_header(__pa(pte)); | ||
873 | |||
874 | __set_bit(slot, &sp->slot_bitmap); | ||
875 | } | ||
876 | |||
877 | struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva) | ||
878 | { | ||
879 | gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva); | ||
880 | |||
881 | if (gpa == UNMAPPED_GVA) | ||
882 | return NULL; | ||
883 | return gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); | ||
884 | } | ||
885 | |||
886 | static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, | ||
887 | unsigned pt_access, unsigned pte_access, | ||
888 | int user_fault, int write_fault, int dirty, | ||
889 | int *ptwrite, gfn_t gfn, struct page *page) | ||
890 | { | ||
891 | u64 spte; | ||
892 | int was_rmapped = is_rmap_pte(*shadow_pte); | ||
893 | int was_writeble = is_writeble_pte(*shadow_pte); | ||
894 | |||
895 | pgprintk("%s: spte %llx access %x write_fault %d" | ||
896 | " user_fault %d gfn %lx\n", | ||
897 | __FUNCTION__, *shadow_pte, pt_access, | ||
898 | write_fault, user_fault, gfn); | ||
899 | |||
900 | /* | ||
901 | * We don't set the accessed bit, since we sometimes want to see | ||
902 | * whether the guest actually used the pte (in order to detect | ||
903 | * demand paging). | ||
904 | */ | ||
905 | spte = PT_PRESENT_MASK | PT_DIRTY_MASK; | ||
906 | if (!dirty) | ||
907 | pte_access &= ~ACC_WRITE_MASK; | ||
908 | if (!(pte_access & ACC_EXEC_MASK)) | ||
909 | spte |= PT64_NX_MASK; | ||
910 | |||
911 | spte |= PT_PRESENT_MASK; | ||
912 | if (pte_access & ACC_USER_MASK) | ||
913 | spte |= PT_USER_MASK; | ||
914 | |||
915 | if (is_error_page(page)) { | ||
916 | set_shadow_pte(shadow_pte, | ||
917 | shadow_trap_nonpresent_pte | PT_SHADOW_IO_MARK); | ||
918 | kvm_release_page_clean(page); | ||
919 | return; | ||
920 | } | ||
921 | |||
922 | spte |= page_to_phys(page); | ||
923 | |||
924 | if ((pte_access & ACC_WRITE_MASK) | ||
925 | || (write_fault && !is_write_protection(vcpu) && !user_fault)) { | ||
926 | struct kvm_mmu_page *shadow; | ||
927 | |||
928 | spte |= PT_WRITABLE_MASK; | ||
929 | if (user_fault) { | ||
930 | mmu_unshadow(vcpu->kvm, gfn); | ||
931 | goto unshadowed; | ||
932 | } | ||
933 | |||
934 | shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn); | ||
935 | if (shadow) { | ||
936 | pgprintk("%s: found shadow page for %lx, marking ro\n", | ||
937 | __FUNCTION__, gfn); | ||
938 | pte_access &= ~ACC_WRITE_MASK; | ||
939 | if (is_writeble_pte(spte)) { | ||
940 | spte &= ~PT_WRITABLE_MASK; | ||
941 | kvm_x86_ops->tlb_flush(vcpu); | ||
942 | } | ||
943 | if (write_fault) | ||
944 | *ptwrite = 1; | ||
945 | } | ||
946 | } | ||
947 | |||
948 | unshadowed: | ||
949 | |||
950 | if (pte_access & ACC_WRITE_MASK) | ||
951 | mark_page_dirty(vcpu->kvm, gfn); | ||
952 | |||
953 | pgprintk("%s: setting spte %llx\n", __FUNCTION__, spte); | ||
954 | set_shadow_pte(shadow_pte, spte); | ||
955 | page_header_update_slot(vcpu->kvm, shadow_pte, gfn); | ||
956 | if (!was_rmapped) { | ||
957 | rmap_add(vcpu, shadow_pte, gfn); | ||
958 | if (!is_rmap_pte(*shadow_pte)) | ||
959 | kvm_release_page_clean(page); | ||
960 | } else { | ||
961 | if (was_writeble) | ||
962 | kvm_release_page_dirty(page); | ||
963 | else | ||
964 | kvm_release_page_clean(page); | ||
965 | } | ||
966 | if (!ptwrite || !*ptwrite) | ||
967 | vcpu->arch.last_pte_updated = shadow_pte; | ||
968 | } | ||
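
mmu_set_spte() composes the shadow pte from hardware-defined bits: present and dirty are always set (the accessed bit is deliberately left clear, per the comment above), NX is set when exec is not granted, and write access is suppressed while the guest pte is not yet dirty so that the first write faults and can be tracked. A reduced sketch of that composition, using the standard x86 pte bit positions and simplified ACC_* flags; the write-protection, rmap, and page-refcount logic is omitted:

    #include <stdio.h>
    #include <stdint.h>

    /* Hardware pte bits (x86): present, writable, user, dirty, NX. */
    #define PT_PRESENT  (1ull << 0)
    #define PT_WRITABLE (1ull << 1)
    #define PT_USER     (1ull << 2)
    #define PT_DIRTY    (1ull << 6)
    #define PT64_NX     (1ull << 63)

    /* Simplified access flags standing in for ACC_*_MASK. */
    #define ACC_EXEC  1u
    #define ACC_WRITE 2u
    #define ACC_USER  4u

    /* Compose a shadow pte the way mmu_set_spte() does, minus the
     * write-protection and rmap bookkeeping. */
    static uint64_t make_spte(unsigned access, int dirty, uint64_t phys)
    {
    	uint64_t spte = PT_PRESENT | PT_DIRTY;

    	if (!dirty)
    		access &= ~ACC_WRITE;   /* force a fault on first write */
    	if (!(access & ACC_EXEC))
    		spte |= PT64_NX;
    	if (access & ACC_USER)
    		spte |= PT_USER;
    	if (access & ACC_WRITE)
    		spte |= PT_WRITABLE;
    	return spte | phys;
    }

    int main(void)
    {
    	printf("spte = %#llx\n",
    	       (unsigned long long)make_spte(ACC_USER | ACC_WRITE, 1,
    					     0x12345000ull));
    	return 0;
    }
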
969 | |||
970 | static void nonpaging_new_cr3(struct kvm_vcpu *vcpu) | ||
971 | { | ||
972 | } | ||
973 | |||
974 | static int __nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, | ||
975 | gfn_t gfn, struct page *page) | ||
976 | { | ||
977 | int level = PT32E_ROOT_LEVEL; | ||
978 | hpa_t table_addr = vcpu->arch.mmu.root_hpa; | ||
979 | int pt_write = 0; | ||
980 | |||
981 | for (; ; level--) { | ||
982 | u32 index = PT64_INDEX(v, level); | ||
983 | u64 *table; | ||
984 | |||
985 | ASSERT(VALID_PAGE(table_addr)); | ||
986 | table = __va(table_addr); | ||
987 | |||
988 | if (level == 1) { | ||
989 | mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL, | ||
990 | 0, write, 1, &pt_write, gfn, page); | ||
991 | return pt_write || is_io_pte(table[index]); | ||
992 | } | ||
993 | |||
994 | if (table[index] == shadow_trap_nonpresent_pte) { | ||
995 | struct kvm_mmu_page *new_table; | ||
996 | gfn_t pseudo_gfn; | ||
997 | |||
998 | pseudo_gfn = (v & PT64_DIR_BASE_ADDR_MASK) | ||
999 | >> PAGE_SHIFT; | ||
1000 | new_table = kvm_mmu_get_page(vcpu, pseudo_gfn, | ||
1001 | v, level - 1, | ||
1002 | 1, ACC_ALL, &table[index], | ||
1003 | NULL); | ||
1004 | if (!new_table) { | ||
1005 | pgprintk("nonpaging_map: ENOMEM\n"); | ||
1006 | kvm_release_page_clean(page); | ||
1007 | return -ENOMEM; | ||
1008 | } | ||
1009 | |||
1010 | table[index] = __pa(new_table->spt) | PT_PRESENT_MASK | ||
1011 | | PT_WRITABLE_MASK | PT_USER_MASK; | ||
1012 | } | ||
1013 | table_addr = table[index] & PT64_BASE_ADDR_MASK; | ||
1014 | } | ||
1015 | } | ||
1016 | |||
1017 | static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) | ||
1018 | { | ||
1019 | int r; | ||
1020 | |||
1021 | struct page *page; | ||
1022 | |||
1023 | down_read(¤t->mm->mmap_sem); | ||
1024 | page = gfn_to_page(vcpu->kvm, gfn); | ||
1025 | |||
1026 | spin_lock(&vcpu->kvm->mmu_lock); | ||
1027 | kvm_mmu_free_some_pages(vcpu); | ||
1028 | r = __nonpaging_map(vcpu, v, write, gfn, page); | ||
1029 | spin_unlock(&vcpu->kvm->mmu_lock); | ||
1030 | |||
1031 | up_read(¤t->mm->mmap_sem); | ||
1032 | |||
1033 | return r; | ||
1034 | } | ||
1035 | |||
1036 | |||
1037 | static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu, | ||
1038 | struct kvm_mmu_page *sp) | ||
1039 | { | ||
1040 | int i; | ||
1041 | |||
1042 | for (i = 0; i < PT64_ENT_PER_PAGE; ++i) | ||
1043 | sp->spt[i] = shadow_trap_nonpresent_pte; | ||
1044 | } | ||
1045 | |||
1046 | static void mmu_free_roots(struct kvm_vcpu *vcpu) | ||
1047 | { | ||
1048 | int i; | ||
1049 | struct kvm_mmu_page *sp; | ||
1050 | |||
1051 | if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) | ||
1052 | return; | ||
1053 | spin_lock(&vcpu->kvm->mmu_lock); | ||
1054 | #ifdef CONFIG_X86_64 | ||
1055 | if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { | ||
1056 | hpa_t root = vcpu->arch.mmu.root_hpa; | ||
1057 | |||
1058 | sp = page_header(root); | ||
1059 | --sp->root_count; | ||
1060 | vcpu->arch.mmu.root_hpa = INVALID_PAGE; | ||
1061 | spin_unlock(&vcpu->kvm->mmu_lock); | ||
1062 | return; | ||
1063 | } | ||
1064 | #endif | ||
1065 | for (i = 0; i < 4; ++i) { | ||
1066 | hpa_t root = vcpu->arch.mmu.pae_root[i]; | ||
1067 | |||
1068 | if (root) { | ||
1069 | root &= PT64_BASE_ADDR_MASK; | ||
1070 | sp = page_header(root); | ||
1071 | --sp->root_count; | ||
1072 | } | ||
1073 | vcpu->arch.mmu.pae_root[i] = INVALID_PAGE; | ||
1074 | } | ||
1075 | spin_unlock(&vcpu->kvm->mmu_lock); | ||
1076 | vcpu->arch.mmu.root_hpa = INVALID_PAGE; | ||
1077 | } | ||
1078 | |||
1079 | static void mmu_alloc_roots(struct kvm_vcpu *vcpu) | ||
1080 | { | ||
1081 | int i; | ||
1082 | gfn_t root_gfn; | ||
1083 | struct kvm_mmu_page *sp; | ||
1084 | |||
1085 | root_gfn = vcpu->arch.cr3 >> PAGE_SHIFT; | ||
1086 | |||
1087 | #ifdef CONFIG_X86_64 | ||
1088 | if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { | ||
1089 | hpa_t root = vcpu->arch.mmu.root_hpa; | ||
1090 | |||
1091 | ASSERT(!VALID_PAGE(root)); | ||
1092 | sp = kvm_mmu_get_page(vcpu, root_gfn, 0, | ||
1093 | PT64_ROOT_LEVEL, 0, ACC_ALL, NULL, NULL); | ||
1094 | root = __pa(sp->spt); | ||
1095 | ++sp->root_count; | ||
1096 | vcpu->arch.mmu.root_hpa = root; | ||
1097 | return; | ||
1098 | } | ||
1099 | #endif | ||
1100 | for (i = 0; i < 4; ++i) { | ||
1101 | hpa_t root = vcpu->arch.mmu.pae_root[i]; | ||
1102 | |||
1103 | ASSERT(!VALID_PAGE(root)); | ||
1104 | if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) { | ||
1105 | if (!is_present_pte(vcpu->arch.pdptrs[i])) { | ||
1106 | vcpu->arch.mmu.pae_root[i] = 0; | ||
1107 | continue; | ||
1108 | } | ||
1109 | root_gfn = vcpu->arch.pdptrs[i] >> PAGE_SHIFT; | ||
1110 | } else if (vcpu->arch.mmu.root_level == 0) | ||
1111 | root_gfn = 0; | ||
1112 | sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, | ||
1113 | PT32_ROOT_LEVEL, !is_paging(vcpu), | ||
1114 | ACC_ALL, NULL, NULL); | ||
1115 | root = __pa(sp->spt); | ||
1116 | ++sp->root_count; | ||
1117 | vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK; | ||
1118 | } | ||
1119 | vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root); | ||
1120 | } | ||
1121 | |||
1122 | static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr) | ||
1123 | { | ||
1124 | return vaddr; | ||
1125 | } | ||
1126 | |||
1127 | static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, | ||
1128 | u32 error_code) | ||
1129 | { | ||
1130 | gfn_t gfn; | ||
1131 | int r; | ||
1132 | |||
1133 | pgprintk("%s: gva %lx error %x\n", __FUNCTION__, gva, error_code); | ||
1134 | r = mmu_topup_memory_caches(vcpu); | ||
1135 | if (r) | ||
1136 | return r; | ||
1137 | |||
1138 | ASSERT(vcpu); | ||
1139 | ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa)); | ||
1140 | |||
1141 | gfn = gva >> PAGE_SHIFT; | ||
1142 | |||
1143 | return nonpaging_map(vcpu, gva & PAGE_MASK, | ||
1144 | error_code & PFERR_WRITE_MASK, gfn); | ||
1145 | } | ||
1146 | |||
1147 | static void nonpaging_free(struct kvm_vcpu *vcpu) | ||
1148 | { | ||
1149 | mmu_free_roots(vcpu); | ||
1150 | } | ||
1151 | |||
1152 | static int nonpaging_init_context(struct kvm_vcpu *vcpu) | ||
1153 | { | ||
1154 | struct kvm_mmu *context = &vcpu->arch.mmu; | ||
1155 | |||
1156 | context->new_cr3 = nonpaging_new_cr3; | ||
1157 | context->page_fault = nonpaging_page_fault; | ||
1158 | context->gva_to_gpa = nonpaging_gva_to_gpa; | ||
1159 | context->free = nonpaging_free; | ||
1160 | context->prefetch_page = nonpaging_prefetch_page; | ||
1161 | context->root_level = 0; | ||
1162 | context->shadow_root_level = PT32E_ROOT_LEVEL; | ||
1163 | context->root_hpa = INVALID_PAGE; | ||
1164 | return 0; | ||
1165 | } | ||
1166 | |||
1167 | void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu) | ||
1168 | { | ||
1169 | ++vcpu->stat.tlb_flush; | ||
1170 | kvm_x86_ops->tlb_flush(vcpu); | ||
1171 | } | ||
1172 | |||
1173 | static void paging_new_cr3(struct kvm_vcpu *vcpu) | ||
1174 | { | ||
1175 | pgprintk("%s: cr3 %lx\n", __FUNCTION__, vcpu->cr3); | ||
1176 | mmu_free_roots(vcpu); | ||
1177 | } | ||
1178 | |||
1179 | static void inject_page_fault(struct kvm_vcpu *vcpu, | ||
1180 | u64 addr, | ||
1181 | u32 err_code) | ||
1182 | { | ||
1183 | kvm_inject_page_fault(vcpu, addr, err_code); | ||
1184 | } | ||
1185 | |||
1186 | static void paging_free(struct kvm_vcpu *vcpu) | ||
1187 | { | ||
1188 | nonpaging_free(vcpu); | ||
1189 | } | ||
1190 | |||
1191 | #define PTTYPE 64 | ||
1192 | #include "paging_tmpl.h" | ||
1193 | #undef PTTYPE | ||
1194 | |||
1195 | #define PTTYPE 32 | ||
1196 | #include "paging_tmpl.h" | ||
1197 | #undef PTTYPE | ||
1198 | |||
1199 | static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level) | ||
1200 | { | ||
1201 | struct kvm_mmu *context = &vcpu->arch.mmu; | ||
1202 | |||
1203 | ASSERT(is_pae(vcpu)); | ||
1204 | context->new_cr3 = paging_new_cr3; | ||
1205 | context->page_fault = paging64_page_fault; | ||
1206 | context->gva_to_gpa = paging64_gva_to_gpa; | ||
1207 | context->prefetch_page = paging64_prefetch_page; | ||
1208 | context->free = paging_free; | ||
1209 | context->root_level = level; | ||
1210 | context->shadow_root_level = level; | ||
1211 | context->root_hpa = INVALID_PAGE; | ||
1212 | return 0; | ||
1213 | } | ||
1214 | |||
1215 | static int paging64_init_context(struct kvm_vcpu *vcpu) | ||
1216 | { | ||
1217 | return paging64_init_context_common(vcpu, PT64_ROOT_LEVEL); | ||
1218 | } | ||
1219 | |||
1220 | static int paging32_init_context(struct kvm_vcpu *vcpu) | ||
1221 | { | ||
1222 | struct kvm_mmu *context = &vcpu->arch.mmu; | ||
1223 | |||
1224 | context->new_cr3 = paging_new_cr3; | ||
1225 | context->page_fault = paging32_page_fault; | ||
1226 | context->gva_to_gpa = paging32_gva_to_gpa; | ||
1227 | context->free = paging_free; | ||
1228 | context->prefetch_page = paging32_prefetch_page; | ||
1229 | context->root_level = PT32_ROOT_LEVEL; | ||
1230 | context->shadow_root_level = PT32E_ROOT_LEVEL; | ||
1231 | context->root_hpa = INVALID_PAGE; | ||
1232 | return 0; | ||
1233 | } | ||
1234 | |||
1235 | static int paging32E_init_context(struct kvm_vcpu *vcpu) | ||
1236 | { | ||
1237 | return paging64_init_context_common(vcpu, PT32E_ROOT_LEVEL); | ||
1238 | } | ||
1239 | |||
1240 | static int init_kvm_mmu(struct kvm_vcpu *vcpu) | ||
1241 | { | ||
1242 | ASSERT(vcpu); | ||
1243 | ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); | ||
1244 | |||
1245 | if (!is_paging(vcpu)) | ||
1246 | return nonpaging_init_context(vcpu); | ||
1247 | else if (is_long_mode(vcpu)) | ||
1248 | return paging64_init_context(vcpu); | ||
1249 | else if (is_pae(vcpu)) | ||
1250 | return paging32E_init_context(vcpu); | ||
1251 | else | ||
1252 | return paging32_init_context(vcpu); | ||
1253 | } | ||
1254 | |||
1255 | static void destroy_kvm_mmu(struct kvm_vcpu *vcpu) | ||
1256 | { | ||
1257 | ASSERT(vcpu); | ||
1258 | if (VALID_PAGE(vcpu->arch.mmu.root_hpa)) { | ||
1259 | vcpu->arch.mmu.free(vcpu); | ||
1260 | vcpu->arch.mmu.root_hpa = INVALID_PAGE; | ||
1261 | } | ||
1262 | } | ||
1263 | |||
1264 | int kvm_mmu_reset_context(struct kvm_vcpu *vcpu) | ||
1265 | { | ||
1266 | destroy_kvm_mmu(vcpu); | ||
1267 | return init_kvm_mmu(vcpu); | ||
1268 | } | ||
1269 | EXPORT_SYMBOL_GPL(kvm_mmu_reset_context); | ||
1270 | |||
1271 | int kvm_mmu_load(struct kvm_vcpu *vcpu) | ||
1272 | { | ||
1273 | int r; | ||
1274 | |||
1275 | r = mmu_topup_memory_caches(vcpu); | ||
1276 | if (r) | ||
1277 | goto out; | ||
1278 | spin_lock(&vcpu->kvm->mmu_lock); | ||
1279 | kvm_mmu_free_some_pages(vcpu); | ||
1280 | mmu_alloc_roots(vcpu); | ||
1281 | spin_unlock(&vcpu->kvm->mmu_lock); | ||
1282 | kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa); | ||
1283 | kvm_mmu_flush_tlb(vcpu); | ||
1284 | out: | ||
1285 | return r; | ||
1286 | } | ||
1287 | EXPORT_SYMBOL_GPL(kvm_mmu_load); | ||
1288 | |||
1289 | void kvm_mmu_unload(struct kvm_vcpu *vcpu) | ||
1290 | { | ||
1291 | mmu_free_roots(vcpu); | ||
1292 | } | ||
1293 | |||
1294 | static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu, | ||
1295 | struct kvm_mmu_page *sp, | ||
1296 | u64 *spte) | ||
1297 | { | ||
1298 | u64 pte; | ||
1299 | struct kvm_mmu_page *child; | ||
1300 | |||
1301 | pte = *spte; | ||
1302 | if (is_shadow_present_pte(pte)) { | ||
1303 | if (sp->role.level == PT_PAGE_TABLE_LEVEL) | ||
1304 | rmap_remove(vcpu->kvm, spte); | ||
1305 | else { | ||
1306 | child = page_header(pte & PT64_BASE_ADDR_MASK); | ||
1307 | mmu_page_remove_parent_pte(child, spte); | ||
1308 | } | ||
1309 | } | ||
1310 | set_shadow_pte(spte, shadow_trap_nonpresent_pte); | ||
1311 | } | ||
1312 | |||
1313 | static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu, | ||
1314 | struct kvm_mmu_page *sp, | ||
1315 | u64 *spte, | ||
1316 | const void *new, int bytes, | ||
1317 | int offset_in_pte) | ||
1318 | { | ||
1319 | if (sp->role.level != PT_PAGE_TABLE_LEVEL) { | ||
1320 | ++vcpu->kvm->stat.mmu_pde_zapped; | ||
1321 | return; | ||
1322 | } | ||
1323 | |||
1324 | ++vcpu->kvm->stat.mmu_pte_updated; | ||
1325 | if (sp->role.glevels == PT32_ROOT_LEVEL) | ||
1326 | paging32_update_pte(vcpu, sp, spte, new, bytes, offset_in_pte); | ||
1327 | else | ||
1328 | paging64_update_pte(vcpu, sp, spte, new, bytes, offset_in_pte); | ||
1329 | } | ||
1330 | |||
1331 | static bool need_remote_flush(u64 old, u64 new) | ||
1332 | { | ||
1333 | if (!is_shadow_present_pte(old)) | ||
1334 | return false; | ||
1335 | if (!is_shadow_present_pte(new)) | ||
1336 | return true; | ||
1337 | if ((old ^ new) & PT64_BASE_ADDR_MASK) | ||
1338 | return true; | ||
1339 | old ^= PT64_NX_MASK; | ||
1340 | new ^= PT64_NX_MASK; | ||
1341 | return (old & ~new & PT64_PERM_MASK) != 0; | ||
1342 | } | ||
1343 | |||
1344 | static void mmu_pte_write_flush_tlb(struct kvm_vcpu *vcpu, u64 old, u64 new) | ||
1345 | { | ||
1346 | if (need_remote_flush(old, new)) | ||
1347 | kvm_flush_remote_tlbs(vcpu->kvm); | ||
1348 | else | ||
1349 | kvm_mmu_flush_tlb(vcpu); | ||
1350 | } | ||
1351 | |||
1352 | static bool last_updated_pte_accessed(struct kvm_vcpu *vcpu) | ||
1353 | { | ||
1354 | u64 *spte = vcpu->arch.last_pte_updated; | ||
1355 | |||
1356 | return !!(spte && (*spte & PT_ACCESSED_MASK)); | ||
1357 | } | ||
1358 | |||
1359 | static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, | ||
1360 | const u8 *new, int bytes) | ||
1361 | { | ||
1362 | gfn_t gfn; | ||
1363 | int r; | ||
1364 | u64 gpte = 0; | ||
1365 | |||
1366 | if (bytes != 4 && bytes != 8) | ||
1367 | return; | ||
1368 | |||
1369 | /* | ||
1370 | * Assume that the pte write is on a page table of the same type | ||
1371 | * as the current vcpu's paging mode. This is nearly always true | ||
1372 | * (might be false while changing modes). Note it is verified later | ||
1373 | * by update_pte(). | ||
1374 | */ | ||
1375 | if (is_pae(vcpu)) { | ||
1376 | /* Handle a 32-bit guest writing two halves of a 64-bit gpte */ | ||
1377 | if ((bytes == 4) && (gpa % 4 == 0)) { | ||
1378 | r = kvm_read_guest(vcpu->kvm, gpa & ~(u64)7, &gpte, 8); | ||
1379 | if (r) | ||
1380 | return; | ||
1381 | memcpy((void *)&gpte + (gpa % 8), new, 4); | ||
1382 | } else if ((bytes == 8) && (gpa % 8 == 0)) { | ||
1383 | memcpy((void *)&gpte, new, 8); | ||
1384 | } | ||
1385 | } else { | ||
1386 | if ((bytes == 4) && (gpa % 4 == 0)) | ||
1387 | memcpy((void *)&gpte, new, 4); | ||
1388 | } | ||
1389 | if (!is_present_pte(gpte)) | ||
1390 | return; | ||
1391 | gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; | ||
1392 | vcpu->arch.update_pte.gfn = gfn; | ||
1393 | vcpu->arch.update_pte.page = gfn_to_page(vcpu->kvm, gfn); | ||
1394 | } | ||
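
For a PAE guest, a 4-byte write updates only half of an 8-byte gpte, so the function reads the aligned 8 bytes and splices the written half in at offset gpa % 8. A standalone sketch of that splice, where old_pte stands in for the kvm_read_guest() result and little-endian x86 layout is assumed:

    #include <stdio.h>
    #include <string.h>
    #include <stdint.h>

    /* Reassemble a 64-bit guest pte from a partial write, the way
     * mmu_guess_page_from_pte_write() does for PAE guests. */
    static uint64_t splice_gpte(uint64_t old_pte, uint64_t gpa,
    			    const uint8_t *new, int bytes)
    {
    	uint64_t gpte = old_pte;

    	if (bytes == 4 && gpa % 4 == 0)
    		memcpy((uint8_t *)&gpte + (gpa % 8), new, 4);
    	else if (bytes == 8 && gpa % 8 == 0)
    		memcpy(&gpte, new, 8);
    	return gpte;
    }

    int main(void)
    {
    	uint32_t high = 0xdeadbeef;

    	/* A 4-byte write at gpa 0x1004 lands in the upper half. */
    	printf("gpte = %#llx\n",
    	       (unsigned long long)splice_gpte(0x1111111122222222ull,
    					       0x1004, (uint8_t *)&high, 4));
    	/* prints 0xdeadbeef22222222 */
    	return 0;
    }
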
1395 | |||
1396 | void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, | ||
1397 | const u8 *new, int bytes) | ||
1398 | { | ||
1399 | gfn_t gfn = gpa >> PAGE_SHIFT; | ||
1400 | struct kvm_mmu_page *sp; | ||
1401 | struct hlist_node *node, *n; | ||
1402 | struct hlist_head *bucket; | ||
1403 | unsigned index; | ||
1404 | u64 entry; | ||
1405 | u64 *spte; | ||
1406 | unsigned offset = offset_in_page(gpa); | ||
1407 | unsigned pte_size; | ||
1408 | unsigned page_offset; | ||
1409 | unsigned misaligned; | ||
1410 | unsigned quadrant; | ||
1411 | int level; | ||
1412 | int flooded = 0; | ||
1413 | int npte; | ||
1414 | |||
1415 | pgprintk("%s: gpa %llx bytes %d\n", __FUNCTION__, gpa, bytes); | ||
1416 | mmu_guess_page_from_pte_write(vcpu, gpa, new, bytes); | ||
1417 | spin_lock(&vcpu->kvm->mmu_lock); | ||
1418 | kvm_mmu_free_some_pages(vcpu); | ||
1419 | ++vcpu->kvm->stat.mmu_pte_write; | ||
1420 | kvm_mmu_audit(vcpu, "pre pte write"); | ||
1421 | if (gfn == vcpu->arch.last_pt_write_gfn | ||
1422 | && !last_updated_pte_accessed(vcpu)) { | ||
1423 | ++vcpu->arch.last_pt_write_count; | ||
1424 | if (vcpu->arch.last_pt_write_count >= 3) | ||
1425 | flooded = 1; | ||
1426 | } else { | ||
1427 | vcpu->arch.last_pt_write_gfn = gfn; | ||
1428 | vcpu->arch.last_pt_write_count = 1; | ||
1429 | vcpu->arch.last_pte_updated = NULL; | ||
1430 | } | ||
1431 | index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES; | ||
1432 | bucket = &vcpu->kvm->arch.mmu_page_hash[index]; | ||
1433 | hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) { | ||
1434 | if (sp->gfn != gfn || sp->role.metaphysical) | ||
1435 | continue; | ||
1436 | pte_size = sp->role.glevels == PT32_ROOT_LEVEL ? 4 : 8; | ||
1437 | misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1); | ||
1438 | misaligned |= bytes < 4; | ||
1439 | if (misaligned || flooded) { | ||
1440 | /* | ||
1441 | * Misaligned accesses are too much trouble to fix | ||
1442 | * up; also, they usually indicate a page is not used | ||
1443 | * as a page table. | ||
1444 | * | ||
1445 | * If we're seeing too many writes to a page, | ||
1446 | * it may no longer be a page table, or we may be | ||
1447 | * forking, in which case it is better to unmap the | ||
1448 | * page. | ||
1449 | */ | ||
1450 | pgprintk("misaligned: gpa %llx bytes %d role %x\n", | ||
1451 | gpa, bytes, sp->role.word); | ||
1452 | kvm_mmu_zap_page(vcpu->kvm, sp); | ||
1453 | ++vcpu->kvm->stat.mmu_flooded; | ||
1454 | continue; | ||
1455 | } | ||
1456 | page_offset = offset; | ||
1457 | level = sp->role.level; | ||
1458 | npte = 1; | ||
1459 | if (sp->role.glevels == PT32_ROOT_LEVEL) { | ||
1460 | page_offset <<= 1; /* 32->64 */ | ||
1461 | /* | ||
1462 | * A 32-bit pde maps 4MB while the shadow pdes map | ||
1463 | * only 2MB. So we need to double the offset again | ||
1464 | * and zap two pdes instead of one. | ||
1465 | */ | ||
1466 | if (level == PT32_ROOT_LEVEL) { | ||
1467 | page_offset &= ~7; /* kill rounding error */ | ||
1468 | page_offset <<= 1; | ||
1469 | npte = 2; | ||
1470 | } | ||
1471 | quadrant = page_offset >> PAGE_SHIFT; | ||
1472 | page_offset &= ~PAGE_MASK; | ||
1473 | if (quadrant != sp->role.quadrant) | ||
1474 | continue; | ||
1475 | } | ||
1476 | spte = &sp->spt[page_offset / sizeof(*spte)]; | ||
1477 | while (npte--) { | ||
1478 | entry = *spte; | ||
1479 | mmu_pte_write_zap_pte(vcpu, sp, spte); | ||
1480 | mmu_pte_write_new_pte(vcpu, sp, spte, new, bytes, | ||
1481 | page_offset & (pte_size - 1)); | ||
1482 | mmu_pte_write_flush_tlb(vcpu, entry, *spte); | ||
1483 | ++spte; | ||
1484 | } | ||
1485 | } | ||
1486 | kvm_mmu_audit(vcpu, "post pte write"); | ||
1487 | spin_unlock(&vcpu->kvm->mmu_lock); | ||
1488 | if (vcpu->arch.update_pte.page) { | ||
1489 | kvm_release_page_clean(vcpu->arch.update_pte.page); | ||
1490 | vcpu->arch.update_pte.page = NULL; | ||
1491 | } | ||
1492 | } | ||
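
The misaligned test above relies on a bit trick: XORing the page offsets of the first and last written bytes keeps only the bits in which they differ, and masking off the low log2(pte_size) bits leaves a nonzero value exactly when those bytes fall in different ptes. A worked example:

    #include <stdio.h>

    /* Nonzero iff a write of 'bytes' at page offset 'offset' spans
     * more than one pte of size pte_size, as in kvm_mmu_pte_write(). */
    static unsigned misaligned(unsigned offset, unsigned bytes,
    			   unsigned pte_size)
    {
    	return (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
    }

    int main(void)
    {
    	printf("%u\n", misaligned(0x10, 8, 8)); /* 0: one 64-bit pte  */
    	printf("%u\n", misaligned(0x14, 8, 8)); /* !=0: straddles two */
    	printf("%u\n", misaligned(0x14, 4, 4)); /* 0: one 32-bit pte  */
    	return 0;
    }
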
1493 | |||
1494 | int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva) | ||
1495 | { | ||
1496 | gpa_t gpa; | ||
1497 | int r; | ||
1498 | |||
1499 | down_read(¤t->mm->mmap_sem); | ||
1500 | gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva); | ||
1501 | up_read(¤t->mm->mmap_sem); | ||
1502 | |||
1503 | spin_lock(&vcpu->kvm->mmu_lock); | ||
1504 | r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT); | ||
1505 | spin_unlock(&vcpu->kvm->mmu_lock); | ||
1506 | return r; | ||
1507 | } | ||
1508 | |||
1509 | void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) | ||
1510 | { | ||
1511 | while (vcpu->kvm->arch.n_free_mmu_pages < KVM_REFILL_PAGES) { | ||
1512 | struct kvm_mmu_page *sp; | ||
1513 | |||
1514 | sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev, | ||
1515 | struct kvm_mmu_page, link); | ||
1516 | kvm_mmu_zap_page(vcpu->kvm, sp); | ||
1517 | ++vcpu->kvm->stat.mmu_recycled; | ||
1518 | } | ||
1519 | } | ||
1520 | |||
1521 | int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code) | ||
1522 | { | ||
1523 | int r; | ||
1524 | enum emulation_result er; | ||
1525 | |||
1526 | r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code); | ||
1527 | if (r < 0) | ||
1528 | goto out; | ||
1529 | |||
1530 | if (!r) { | ||
1531 | r = 1; | ||
1532 | goto out; | ||
1533 | } | ||
1534 | |||
1535 | r = mmu_topup_memory_caches(vcpu); | ||
1536 | if (r) | ||
1537 | goto out; | ||
1538 | |||
1539 | er = emulate_instruction(vcpu, vcpu->run, cr2, error_code, 0); | ||
1540 | |||
1541 | switch (er) { | ||
1542 | case EMULATE_DONE: | ||
1543 | return 1; | ||
1544 | case EMULATE_DO_MMIO: | ||
1545 | ++vcpu->stat.mmio_exits; | ||
1546 | return 0; | ||
1547 | case EMULATE_FAIL: | ||
1548 | kvm_report_emulation_failure(vcpu, "pagetable"); | ||
1549 | return 1; | ||
1550 | default: | ||
1551 | BUG(); | ||
1552 | } | ||
1553 | out: | ||
1554 | return r; | ||
1555 | } | ||
1556 | EXPORT_SYMBOL_GPL(kvm_mmu_page_fault); | ||
1557 | |||
1558 | static void free_mmu_pages(struct kvm_vcpu *vcpu) | ||
1559 | { | ||
1560 | struct kvm_mmu_page *sp; | ||
1561 | |||
1562 | while (!list_empty(&vcpu->kvm->arch.active_mmu_pages)) { | ||
1563 | sp = container_of(vcpu->kvm->arch.active_mmu_pages.next, | ||
1564 | struct kvm_mmu_page, link); | ||
1565 | kvm_mmu_zap_page(vcpu->kvm, sp); | ||
1566 | } | ||
1567 | free_page((unsigned long)vcpu->arch.mmu.pae_root); | ||
1568 | } | ||
1569 | |||
1570 | static int alloc_mmu_pages(struct kvm_vcpu *vcpu) | ||
1571 | { | ||
1572 | struct page *page; | ||
1573 | int i; | ||
1574 | |||
1575 | ASSERT(vcpu); | ||
1576 | |||
1577 | if (vcpu->kvm->arch.n_requested_mmu_pages) | ||
1578 | vcpu->kvm->arch.n_free_mmu_pages = | ||
1579 | vcpu->kvm->arch.n_requested_mmu_pages; | ||
1580 | else | ||
1581 | vcpu->kvm->arch.n_free_mmu_pages = | ||
1582 | vcpu->kvm->arch.n_alloc_mmu_pages; | ||
1583 | /* | ||
1584 | * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64. | ||
1585 | * Therefore we need to allocate shadow page tables in the first | ||
1586 | * 4GB of memory, which happens to fit the DMA32 zone. | ||
1587 | */ | ||
1588 | page = alloc_page(GFP_KERNEL | __GFP_DMA32); | ||
1589 | if (!page) | ||
1590 | goto error_1; | ||
1591 | vcpu->arch.mmu.pae_root = page_address(page); | ||
1592 | for (i = 0; i < 4; ++i) | ||
1593 | vcpu->arch.mmu.pae_root[i] = INVALID_PAGE; | ||
1594 | |||
1595 | return 0; | ||
1596 | |||
1597 | error_1: | ||
1598 | free_mmu_pages(vcpu); | ||
1599 | return -ENOMEM; | ||
1600 | } | ||
1601 | |||
1602 | int kvm_mmu_create(struct kvm_vcpu *vcpu) | ||
1603 | { | ||
1604 | ASSERT(vcpu); | ||
1605 | ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); | ||
1606 | |||
1607 | return alloc_mmu_pages(vcpu); | ||
1608 | } | ||
1609 | |||
1610 | int kvm_mmu_setup(struct kvm_vcpu *vcpu) | ||
1611 | { | ||
1612 | ASSERT(vcpu); | ||
1613 | ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); | ||
1614 | |||
1615 | return init_kvm_mmu(vcpu); | ||
1616 | } | ||
1617 | |||
1618 | void kvm_mmu_destroy(struct kvm_vcpu *vcpu) | ||
1619 | { | ||
1620 | ASSERT(vcpu); | ||
1621 | |||
1622 | destroy_kvm_mmu(vcpu); | ||
1623 | free_mmu_pages(vcpu); | ||
1624 | mmu_free_memory_caches(vcpu); | ||
1625 | } | ||
1626 | |||
1627 | void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) | ||
1628 | { | ||
1629 | struct kvm_mmu_page *sp; | ||
1630 | |||
1631 | list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) { | ||
1632 | int i; | ||
1633 | u64 *pt; | ||
1634 | |||
1635 | if (!test_bit(slot, &sp->slot_bitmap)) | ||
1636 | continue; | ||
1637 | |||
1638 | pt = sp->spt; | ||
1639 | for (i = 0; i < PT64_ENT_PER_PAGE; ++i) | ||
1640 | /* avoid RMW */ | ||
1641 | if (pt[i] & PT_WRITABLE_MASK) | ||
1642 | pt[i] &= ~PT_WRITABLE_MASK; | ||
1643 | } | ||
1644 | } | ||
1645 | |||
1646 | void kvm_mmu_zap_all(struct kvm *kvm) | ||
1647 | { | ||
1648 | struct kvm_mmu_page *sp, *node; | ||
1649 | |||
1650 | spin_lock(&kvm->mmu_lock); | ||
1651 | list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) | ||
1652 | kvm_mmu_zap_page(kvm, sp); | ||
1653 | spin_unlock(&kvm->mmu_lock); | ||
1654 | |||
1655 | kvm_flush_remote_tlbs(kvm); | ||
1656 | } | ||
1657 | |||
1658 | void kvm_mmu_module_exit(void) | ||
1659 | { | ||
1660 | if (pte_chain_cache) | ||
1661 | kmem_cache_destroy(pte_chain_cache); | ||
1662 | if (rmap_desc_cache) | ||
1663 | kmem_cache_destroy(rmap_desc_cache); | ||
1664 | if (mmu_page_header_cache) | ||
1665 | kmem_cache_destroy(mmu_page_header_cache); | ||
1666 | } | ||
1667 | |||
1668 | int kvm_mmu_module_init(void) | ||
1669 | { | ||
1670 | pte_chain_cache = kmem_cache_create("kvm_pte_chain", | ||
1671 | sizeof(struct kvm_pte_chain), | ||
1672 | 0, 0, NULL); | ||
1673 | if (!pte_chain_cache) | ||
1674 | goto nomem; | ||
1675 | rmap_desc_cache = kmem_cache_create("kvm_rmap_desc", | ||
1676 | sizeof(struct kvm_rmap_desc), | ||
1677 | 0, 0, NULL); | ||
1678 | if (!rmap_desc_cache) | ||
1679 | goto nomem; | ||
1680 | |||
1681 | mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header", | ||
1682 | sizeof(struct kvm_mmu_page), | ||
1683 | 0, 0, NULL); | ||
1684 | if (!mmu_page_header_cache) | ||
1685 | goto nomem; | ||
1686 | |||
1687 | return 0; | ||
1688 | |||
1689 | nomem: | ||
1690 | kvm_mmu_module_exit(); | ||
1691 | return -ENOMEM; | ||
1692 | } | ||
1693 | |||
1694 | /* | ||
1695 | * Calculate mmu pages needed for kvm. | ||
1696 | */ | ||
1697 | unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm) | ||
1698 | { | ||
1699 | int i; | ||
1700 | unsigned int nr_mmu_pages; | ||
1701 | unsigned int nr_pages = 0; | ||
1702 | |||
1703 | for (i = 0; i < kvm->nmemslots; i++) | ||
1704 | nr_pages += kvm->memslots[i].npages; | ||
1705 | |||
1706 | nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000; | ||
1707 | nr_mmu_pages = max(nr_mmu_pages, | ||
1708 | (unsigned int) KVM_MIN_ALLOC_MMU_PAGES); | ||
1709 | |||
1710 | return nr_mmu_pages; | ||
1711 | } | ||
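
The pool is sized at KVM_PERMILLE_MMU_PAGES per thousand guest pages, clamped to a floor of KVM_MIN_ALLOC_MMU_PAGES. A standalone sketch with assumed constant values; the real definitions live in the KVM headers:

    #include <stdio.h>

    #define KVM_PERMILLE_MMU_PAGES   20   /* assumed: 2% of guest pages */
    #define KVM_MIN_ALLOC_MMU_PAGES  64   /* assumed floor */

    /* Size the shadow-page pool from the guest memory size, as in
     * kvm_mmu_calculate_mmu_pages(). */
    static unsigned mmu_pages_for(unsigned long guest_pages)
    {
    	unsigned long n = guest_pages * KVM_PERMILLE_MMU_PAGES / 1000;

    	return n > KVM_MIN_ALLOC_MMU_PAGES ? n : KVM_MIN_ALLOC_MMU_PAGES;
    }

    int main(void)
    {
    	/* 1 GiB guest = 262144 4-KiB pages -> 5242 mmu pages here. */
    	printf("%u mmu pages\n", mmu_pages_for(262144));
    	return 0;
    }
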
1712 | |||
1713 | #ifdef AUDIT | ||
1714 | |||
1715 | static const char *audit_msg; | ||
1716 | |||
1717 | static gva_t canonicalize(gva_t gva) | ||
1718 | { | ||
1719 | #ifdef CONFIG_X86_64 | ||
1720 | gva = (long long)(gva << 16) >> 16; | ||
1721 | #endif | ||
1722 | return gva; | ||
1723 | } | ||
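
canonicalize() sign-extends bit 47 into the upper 16 bits, turning a raw 48-bit value into a canonical x86-64 address: shift the low 48 bits to the top, then arithmetic-shift them back down. A worked example (this relies on an arithmetic right shift of signed values, as the kernel code itself does):

    #include <stdio.h>

    /* Sign-extend bit 47, as canonicalize() does above. */
    static unsigned long long canonical(unsigned long long gva)
    {
    	return (unsigned long long)((long long)(gva << 16) >> 16);
    }

    int main(void)
    {
    	/* Bit 47 set: it propagates into bits 48..63. */
    	printf("%#llx\n", canonical(0x0000800000000000ull));
    	/* prints 0xffff800000000000 */
    	return 0;
    }
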
1724 | |||
1725 | static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte, | ||
1726 | gva_t va, int level) | ||
1727 | { | ||
1728 | u64 *pt = __va(page_pte & PT64_BASE_ADDR_MASK); | ||
1729 | int i; | ||
1730 | gva_t va_delta = 1ul << (PAGE_SHIFT + 9 * (level - 1)); | ||
1731 | |||
1732 | for (i = 0; i < PT64_ENT_PER_PAGE; ++i, va += va_delta) { | ||
1733 | u64 ent = pt[i]; | ||
1734 | |||
1735 | if (ent == shadow_trap_nonpresent_pte) | ||
1736 | continue; | ||
1737 | |||
1738 | va = canonicalize(va); | ||
1739 | if (level > 1) { | ||
1740 | if (ent == shadow_notrap_nonpresent_pte) | ||
1741 | printk(KERN_ERR "audit: (%s) nontrapping pte" | ||
1742 | " in nonleaf level: levels %d gva %lx" | ||
1743 | " level %d pte %llx\n", audit_msg, | ||
1744 | vcpu->arch.mmu.root_level, va, level, ent); | ||
1745 | |||
1746 | audit_mappings_page(vcpu, ent, va, level - 1); | ||
1747 | } else { | ||
1748 | gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, va); | ||
1749 | struct page *page = gpa_to_page(vcpu, gpa); | ||
1750 | hpa_t hpa = page_to_phys(page); | ||
1751 | |||
1752 | if (is_shadow_present_pte(ent) | ||
1753 | && (ent & PT64_BASE_ADDR_MASK) != hpa) | ||
1754 | printk(KERN_ERR "xx audit error: (%s) levels %d" | ||
1755 | " gva %lx gpa %llx hpa %llx ent %llx %d\n", | ||
1756 | audit_msg, vcpu->arch.mmu.root_level, | ||
1757 | va, gpa, hpa, ent, | ||
1758 | is_shadow_present_pte(ent)); | ||
1759 | else if (ent == shadow_notrap_nonpresent_pte | ||
1760 | && !is_error_hpa(hpa)) | ||
1761 | printk(KERN_ERR "audit: (%s) notrap shadow," | ||
1762 | " valid guest gva %lx\n", audit_msg, va); | ||
1763 | kvm_release_page_clean(page); | ||
1764 | |||
1765 | } | ||
1766 | } | ||
1767 | } | ||
1768 | |||
1769 | static void audit_mappings(struct kvm_vcpu *vcpu) | ||
1770 | { | ||
1771 | unsigned i; | ||
1772 | |||
1773 | if (vcpu->arch.mmu.root_level == 4) | ||
1774 | audit_mappings_page(vcpu, vcpu->arch.mmu.root_hpa, 0, 4); | ||
1775 | else | ||
1776 | for (i = 0; i < 4; ++i) | ||
1777 | if (vcpu->arch.mmu.pae_root[i] & PT_PRESENT_MASK) | ||
1778 | audit_mappings_page(vcpu, | ||
1779 | vcpu->arch.mmu.pae_root[i], | ||
1780 | i << 30, | ||
1781 | 2); | ||
1782 | } | ||
1783 | |||
1784 | static int count_rmaps(struct kvm_vcpu *vcpu) | ||
1785 | { | ||
1786 | int nmaps = 0; | ||
1787 | int i, j, k; | ||
1788 | |||
1789 | for (i = 0; i < KVM_MEMORY_SLOTS; ++i) { | ||
1790 | struct kvm_memory_slot *m = &vcpu->kvm->memslots[i]; | ||
1791 | struct kvm_rmap_desc *d; | ||
1792 | |||
1793 | for (j = 0; j < m->npages; ++j) { | ||
1794 | unsigned long *rmapp = &m->rmap[j]; | ||
1795 | |||
1796 | if (!*rmapp) | ||
1797 | continue; | ||
1798 | if (!(*rmapp & 1)) { | ||
1799 | ++nmaps; | ||
1800 | continue; | ||
1801 | } | ||
1802 | d = (struct kvm_rmap_desc *)(*rmapp & ~1ul); | ||
1803 | while (d) { | ||
1804 | for (k = 0; k < RMAP_EXT; ++k) | ||
1805 | if (d->shadow_ptes[k]) | ||
1806 | ++nmaps; | ||
1807 | else | ||
1808 | break; | ||
1809 | d = d->more; | ||
1810 | } | ||
1811 | } | ||
1812 | } | ||
1813 | return nmaps; | ||
1814 | } | ||
1815 | |||
1816 | static int count_writable_mappings(struct kvm_vcpu *vcpu) | ||
1817 | { | ||
1818 | int nmaps = 0; | ||
1819 | struct kvm_mmu_page *sp; | ||
1820 | int i; | ||
1821 | |||
1822 | list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) { | ||
1823 | u64 *pt = sp->spt; | ||
1824 | |||
1825 | if (sp->role.level != PT_PAGE_TABLE_LEVEL) | ||
1826 | continue; | ||
1827 | |||
1828 | for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { | ||
1829 | u64 ent = pt[i]; | ||
1830 | |||
1831 | if (!(ent & PT_PRESENT_MASK)) | ||
1832 | continue; | ||
1833 | if (!(ent & PT_WRITABLE_MASK)) | ||
1834 | continue; | ||
1835 | ++nmaps; | ||
1836 | } | ||
1837 | } | ||
1838 | return nmaps; | ||
1839 | } | ||
1840 | |||
1841 | static void audit_rmap(struct kvm_vcpu *vcpu) | ||
1842 | { | ||
1843 | int n_rmap = count_rmaps(vcpu); | ||
1844 | int n_actual = count_writable_mappings(vcpu); | ||
1845 | |||
1846 | if (n_rmap != n_actual) | ||
1847 | printk(KERN_ERR "%s: (%s) rmap %d actual %d\n", | ||
1848 | __FUNCTION__, audit_msg, n_rmap, n_actual); | ||
1849 | } | ||
1850 | |||
1851 | static void audit_write_protection(struct kvm_vcpu *vcpu) | ||
1852 | { | ||
1853 | struct kvm_mmu_page *sp; | ||
1854 | struct kvm_memory_slot *slot; | ||
1855 | unsigned long *rmapp; | ||
1856 | gfn_t gfn; | ||
1857 | |||
1858 | list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) { | ||
1859 | if (sp->role.metaphysical) | ||
1860 | continue; | ||
1861 | |||
1862 | slot = gfn_to_memslot(vcpu->kvm, sp->gfn); | ||
1863 | gfn = unalias_gfn(vcpu->kvm, sp->gfn); | ||
1864 | rmapp = &slot->rmap[gfn - slot->base_gfn]; | ||
1865 | if (*rmapp) | ||
1866 | printk(KERN_ERR "%s: (%s) shadow page has writable" | ||
1867 | " mappings: gfn %lx role %x\n", | ||
1868 | __FUNCTION__, audit_msg, sp->gfn, | ||
1869 | sp->role.word); | ||
1870 | } | ||
1871 | } | ||
1872 | |||
1873 | static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) | ||
1874 | { | ||
1875 | int olddbg = dbg; | ||
1876 | |||
1877 | dbg = 0; | ||
1878 | audit_msg = msg; | ||
1879 | audit_rmap(vcpu); | ||
1880 | audit_write_protection(vcpu); | ||
1881 | audit_mappings(vcpu); | ||
1882 | dbg = olddbg; | ||
1883 | } | ||
1884 | |||
1885 | #endif | ||
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h new file mode 100644 index 000000000000..1fce19ec7a23 --- /dev/null +++ b/arch/x86/kvm/mmu.h | |||
@@ -0,0 +1,44 @@ | |||
1 | #ifndef __KVM_X86_MMU_H | ||
2 | #define __KVM_X86_MMU_H | ||
3 | |||
4 | #include <linux/kvm_host.h> | ||
5 | |||
6 | static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) | ||
7 | { | ||
8 | if (unlikely(vcpu->kvm->arch.n_free_mmu_pages < KVM_MIN_FREE_MMU_PAGES)) | ||
9 | __kvm_mmu_free_some_pages(vcpu); | ||
10 | } | ||
11 | |||
12 | static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu) | ||
13 | { | ||
14 | if (likely(vcpu->arch.mmu.root_hpa != INVALID_PAGE)) | ||
15 | return 0; | ||
16 | |||
17 | return kvm_mmu_load(vcpu); | ||
18 | } | ||
19 | |||
20 | static inline int is_long_mode(struct kvm_vcpu *vcpu) | ||
21 | { | ||
22 | #ifdef CONFIG_X86_64 | ||
23 | return vcpu->arch.shadow_efer & EFER_LME; | ||
24 | #else | ||
25 | return 0; | ||
26 | #endif | ||
27 | } | ||
28 | |||
29 | static inline int is_pae(struct kvm_vcpu *vcpu) | ||
30 | { | ||
31 | return vcpu->arch.cr4 & X86_CR4_PAE; | ||
32 | } | ||
33 | |||
34 | static inline int is_pse(struct kvm_vcpu *vcpu) | ||
35 | { | ||
36 | return vcpu->arch.cr4 & X86_CR4_PSE; | ||
37 | } | ||
38 | |||
39 | static inline int is_paging(struct kvm_vcpu *vcpu) | ||
40 | { | ||
41 | return vcpu->arch.cr0 & X86_CR0_PG; | ||
42 | } | ||
43 | |||
44 | #endif | ||
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h new file mode 100644 index 000000000000..03ba8608fe0f --- /dev/null +++ b/arch/x86/kvm/paging_tmpl.h | |||
@@ -0,0 +1,484 @@ | |||
1 | /* | ||
2 | * Kernel-based Virtual Machine driver for Linux | ||
3 | * | ||
4 | * This module enables machines with Intel VT-x extensions to run virtual | ||
5 | * machines without emulation or binary translation. | ||
6 | * | ||
7 | * MMU support | ||
8 | * | ||
9 | * Copyright (C) 2006 Qumranet, Inc. | ||
10 | * | ||
11 | * Authors: | ||
12 | * Yaniv Kamay <yaniv@qumranet.com> | ||
13 | * Avi Kivity <avi@qumranet.com> | ||
14 | * | ||
15 | * This work is licensed under the terms of the GNU GPL, version 2. See | ||
16 | * the COPYING file in the top-level directory. | ||
17 | * | ||
18 | */ | ||
19 | |||
20 | /* | ||
21 | * We need the mmu code to access both 32-bit and 64-bit guest ptes, | ||
22 | * so the code in this file is compiled twice, once per pte size. | ||
23 | */ | ||
24 | |||
25 | #if PTTYPE == 64 | ||
26 | #define pt_element_t u64 | ||
27 | #define guest_walker guest_walker64 | ||
28 | #define FNAME(name) paging##64_##name | ||
29 | #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK | ||
30 | #define PT_DIR_BASE_ADDR_MASK PT64_DIR_BASE_ADDR_MASK | ||
31 | #define PT_INDEX(addr, level) PT64_INDEX(addr, level) | ||
32 | #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) | ||
33 | #define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level) | ||
34 | #define PT_LEVEL_BITS PT64_LEVEL_BITS | ||
35 | #ifdef CONFIG_X86_64 | ||
36 | #define PT_MAX_FULL_LEVELS 4 | ||
37 | #define CMPXCHG cmpxchg | ||
38 | #else | ||
39 | #define CMPXCHG cmpxchg64 | ||
40 | #define PT_MAX_FULL_LEVELS 2 | ||
41 | #endif | ||
42 | #elif PTTYPE == 32 | ||
43 | #define pt_element_t u32 | ||
44 | #define guest_walker guest_walker32 | ||
45 | #define FNAME(name) paging##32_##name | ||
46 | #define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK | ||
47 | #define PT_DIR_BASE_ADDR_MASK PT32_DIR_BASE_ADDR_MASK | ||
48 | #define PT_INDEX(addr, level) PT32_INDEX(addr, level) | ||
49 | #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) | ||
50 | #define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level) | ||
51 | #define PT_LEVEL_BITS PT32_LEVEL_BITS | ||
52 | #define PT_MAX_FULL_LEVELS 2 | ||
53 | #define CMPXCHG cmpxchg | ||
54 | #else | ||
55 | #error Invalid PTTYPE value | ||
56 | #endif | ||
57 | |||
58 | #define gpte_to_gfn FNAME(gpte_to_gfn) | ||
59 | #define gpte_to_gfn_pde FNAME(gpte_to_gfn_pde) | ||
60 | |||
61 | /* | ||
62 | * The guest_walker structure emulates the behavior of the hardware page | ||
63 | * table walker. | ||
64 | */ | ||
65 | struct guest_walker { | ||
66 | int level; | ||
67 | gfn_t table_gfn[PT_MAX_FULL_LEVELS]; | ||
68 | pt_element_t ptes[PT_MAX_FULL_LEVELS]; | ||
69 | gpa_t pte_gpa[PT_MAX_FULL_LEVELS]; | ||
70 | unsigned pt_access; | ||
71 | unsigned pte_access; | ||
72 | gfn_t gfn; | ||
73 | u32 error_code; | ||
74 | }; | ||
75 | |||
76 | static gfn_t gpte_to_gfn(pt_element_t gpte) | ||
77 | { | ||
78 | return (gpte & PT_BASE_ADDR_MASK) >> PAGE_SHIFT; | ||
79 | } | ||
80 | |||
81 | static gfn_t gpte_to_gfn_pde(pt_element_t gpte) | ||
82 | { | ||
83 | return (gpte & PT_DIR_BASE_ADDR_MASK) >> PAGE_SHIFT; | ||
84 | } | ||
85 | |||
86 | static bool FNAME(cmpxchg_gpte)(struct kvm *kvm, | ||
87 | gfn_t table_gfn, unsigned index, | ||
88 | pt_element_t orig_pte, pt_element_t new_pte) | ||
89 | { | ||
90 | pt_element_t ret; | ||
91 | pt_element_t *table; | ||
92 | struct page *page; | ||
93 | |||
94 | page = gfn_to_page(kvm, table_gfn); | ||
95 | table = kmap_atomic(page, KM_USER0); | ||
96 | |||
97 | ret = CMPXCHG(&table[index], orig_pte, new_pte); | ||
98 | |||
99 | kunmap_atomic(table, KM_USER0); | ||
100 | |||
101 | kvm_release_page_dirty(page); | ||
102 | |||
103 | return (ret != orig_pte); | ||
104 | } | ||
105 | |||
106 | static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte) | ||
107 | { | ||
108 | unsigned access; | ||
109 | |||
110 | access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK; | ||
111 | #if PTTYPE == 64 | ||
112 | if (is_nx(vcpu)) | ||
113 | access &= ~(gpte >> PT64_NX_SHIFT); | ||
114 | #endif | ||
115 | return access; | ||
116 | } | ||
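
gpte_access() exploits the layout of the ACC_* flags: ACC_WRITE and ACC_USER sit at the same bit positions as the pte's W and U bits, so they can be masked straight out of the gpte, and the NX bit shifted down by PT64_NX_SHIFT lands on ACC_EXEC to strip execute permission. A standalone sketch, with the bit values restated here as assumptions that match the usual KVM definitions:

    #include <stdio.h>
    #include <stdint.h>

    #define PT_WRITABLE   (1ull << 1)
    #define PT_USER       (1ull << 2)
    #define PT64_NX_SHIFT 63

    #define ACC_EXEC  1ull          /* bit 0: no pte bit of its own    */
    #define ACC_WRITE PT_WRITABLE   /* bit 1: same position as the pte */
    #define ACC_USER  PT_USER       /* bit 2: same position as the pte */

    /* Derive the permissions a guest pte grants, mirroring
     * FNAME(gpte_access)(): W/U bits plus implied exec, then the NX
     * bit, shifted down to bit 0, strips exec again. */
    static unsigned gpte_access(uint64_t gpte)
    {
    	unsigned access = (gpte & (PT_WRITABLE | PT_USER)) | ACC_EXEC;

    	access &= ~(unsigned)(gpte >> PT64_NX_SHIFT);
    	return access;
    }

    int main(void)
    {
    	uint64_t gpte = (1ull << 63) | PT_USER | PT_WRITABLE; /* NX set */

    	printf("access = %#x\n", gpte_access(gpte)); /* 0x6: no exec */
    	return 0;
    }
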
117 | |||
118 | /* | ||
119 | * Fetch a guest pte for a guest virtual address | ||
120 | */ | ||
121 | static int FNAME(walk_addr)(struct guest_walker *walker, | ||
122 | struct kvm_vcpu *vcpu, gva_t addr, | ||
123 | int write_fault, int user_fault, int fetch_fault) | ||
124 | { | ||
125 | pt_element_t pte; | ||
126 | gfn_t table_gfn; | ||
127 | unsigned index, pt_access, pte_access; | ||
128 | gpa_t pte_gpa; | ||
129 | |||
130 | pgprintk("%s: addr %lx\n", __FUNCTION__, addr); | ||
131 | walk: | ||
132 | walker->level = vcpu->arch.mmu.root_level; | ||
133 | pte = vcpu->arch.cr3; | ||
134 | #if PTTYPE == 64 | ||
135 | if (!is_long_mode(vcpu)) { | ||
136 | pte = vcpu->arch.pdptrs[(addr >> 30) & 3]; | ||
137 | if (!is_present_pte(pte)) | ||
138 | goto not_present; | ||
139 | --walker->level; | ||
140 | } | ||
141 | #endif | ||
142 | ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) || | ||
143 | (vcpu->cr3 & CR3_NONPAE_RESERVED_BITS) == 0); | ||
144 | |||
145 | pt_access = ACC_ALL; | ||
146 | |||
147 | for (;;) { | ||
148 | index = PT_INDEX(addr, walker->level); | ||
149 | |||
150 | table_gfn = gpte_to_gfn(pte); | ||
151 | pte_gpa = gfn_to_gpa(table_gfn); | ||
152 | pte_gpa += index * sizeof(pt_element_t); | ||
153 | walker->table_gfn[walker->level - 1] = table_gfn; | ||
154 | walker->pte_gpa[walker->level - 1] = pte_gpa; | ||
155 | pgprintk("%s: table_gfn[%d] %lx\n", __FUNCTION__, | ||
156 | walker->level - 1, table_gfn); | ||
157 | |||
158 | kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte)); | ||
159 | |||
160 | if (!is_present_pte(pte)) | ||
161 | goto not_present; | ||
162 | |||
163 | if (write_fault && !is_writeble_pte(pte)) | ||
164 | if (user_fault || is_write_protection(vcpu)) | ||
165 | goto access_error; | ||
166 | |||
167 | if (user_fault && !(pte & PT_USER_MASK)) | ||
168 | goto access_error; | ||
169 | |||
170 | #if PTTYPE == 64 | ||
171 | if (fetch_fault && is_nx(vcpu) && (pte & PT64_NX_MASK)) | ||
172 | goto access_error; | ||
173 | #endif | ||
174 | |||
175 | if (!(pte & PT_ACCESSED_MASK)) { | ||
176 | mark_page_dirty(vcpu->kvm, table_gfn); | ||
177 | if (FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, | ||
178 | index, pte, pte|PT_ACCESSED_MASK)) | ||
179 | goto walk; | ||
180 | pte |= PT_ACCESSED_MASK; | ||
181 | } | ||
182 | |||
183 | pte_access = pt_access & FNAME(gpte_access)(vcpu, pte); | ||
184 | |||
185 | walker->ptes[walker->level - 1] = pte; | ||
186 | |||
187 | if (walker->level == PT_PAGE_TABLE_LEVEL) { | ||
188 | walker->gfn = gpte_to_gfn(pte); | ||
189 | break; | ||
190 | } | ||
191 | |||
192 | if (walker->level == PT_DIRECTORY_LEVEL | ||
193 | && (pte & PT_PAGE_SIZE_MASK) | ||
194 | && (PTTYPE == 64 || is_pse(vcpu))) { | ||
195 | walker->gfn = gpte_to_gfn_pde(pte); | ||
196 | walker->gfn += PT_INDEX(addr, PT_PAGE_TABLE_LEVEL); | ||
197 | if (PTTYPE == 32 && is_cpuid_PSE36()) | ||
198 | walker->gfn += pse36_gfn_delta(pte); | ||
199 | break; | ||
200 | } | ||
201 | |||
202 | pt_access = pte_access; | ||
203 | --walker->level; | ||
204 | } | ||
205 | |||
206 | if (write_fault && !is_dirty_pte(pte)) { | ||
207 | bool ret; | ||
208 | |||
209 | mark_page_dirty(vcpu->kvm, table_gfn); | ||
210 | ret = FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, index, pte, | ||
211 | pte|PT_DIRTY_MASK); | ||
212 | if (ret) | ||
213 | goto walk; | ||
214 | pte |= PT_DIRTY_MASK; | ||
215 | kvm_mmu_pte_write(vcpu, pte_gpa, (u8 *)&pte, sizeof(pte)); | ||
216 | walker->ptes[walker->level - 1] = pte; | ||
217 | } | ||
218 | |||
219 | walker->pt_access = pt_access; | ||
220 | walker->pte_access = pte_access; | ||
221 | pgprintk("%s: pte %llx pte_access %x pt_access %x\n", | ||
222 | __FUNCTION__, (u64)pte, pt_access, pte_access); | ||
223 | return 1; | ||
224 | |||
225 | not_present: | ||
226 | walker->error_code = 0; | ||
227 | goto err; | ||
228 | |||
229 | access_error: | ||
230 | walker->error_code = PFERR_PRESENT_MASK; | ||
231 | |||
232 | err: | ||
233 | if (write_fault) | ||
234 | walker->error_code |= PFERR_WRITE_MASK; | ||
235 | if (user_fault) | ||
236 | walker->error_code |= PFERR_USER_MASK; | ||
237 | if (fetch_fault) | ||
238 | walker->error_code |= PFERR_FETCH_MASK; | ||
239 | return 0; | ||
240 | } | ||
241 | |||
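/*
 * Annotation: the walk loop above peels one paging level per
 * iteration, re-reading the guest pte at each step and restarting the
 * whole walk ("goto walk") whenever a cmpxchg on the accessed or dirty
 * bit loses a race. The index math it delegates to PT_INDEX() is, for
 * the 64-bit instantiation, the standard nine-bits-per-level split
 * above the 12-bit page offset (a sketch, assuming PTTYPE == 64):
 */
static unsigned pt_index64_sketch(uint64_t addr, int level)
{
        /* level 1 indexes bits 12..20, level 2 bits 21..29, ... */
        return (addr >> (12 + 9 * (level - 1))) & 0x1ff;
}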
242 | static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, | ||
243 | u64 *spte, const void *pte, int bytes, | ||
244 | int offset_in_pte) | ||
245 | { | ||
246 | pt_element_t gpte; | ||
247 | unsigned pte_access; | ||
248 | struct page *npage; | ||
249 | |||
250 | gpte = *(const pt_element_t *)pte; | ||
251 | if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) { | ||
252 | if (!offset_in_pte && !is_present_pte(gpte)) | ||
253 | set_shadow_pte(spte, shadow_notrap_nonpresent_pte); | ||
254 | return; | ||
255 | } | ||
256 | if (bytes < sizeof(pt_element_t)) | ||
257 | return; | ||
258 | pgprintk("%s: gpte %llx spte %p\n", __FUNCTION__, (u64)gpte, spte); | ||
259 | pte_access = page->role.access & FNAME(gpte_access)(vcpu, gpte); | ||
260 | if (gpte_to_gfn(gpte) != vcpu->arch.update_pte.gfn) | ||
261 | return; | ||
262 | npage = vcpu->arch.update_pte.page; | ||
263 | if (!npage) | ||
264 | return; | ||
265 | get_page(npage); | ||
266 | mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0, | ||
267 | gpte & PT_DIRTY_MASK, NULL, gpte_to_gfn(gpte), npage); | ||
268 | } | ||
269 | |||
270 | /* | ||
271 | * Fetch a shadow pte for a specific level in the paging hierarchy. | ||
272 | */ | ||
273 | static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, | ||
274 | struct guest_walker *walker, | ||
275 | int user_fault, int write_fault, int *ptwrite, | ||
276 | struct page *page) | ||
277 | { | ||
278 | hpa_t shadow_addr; | ||
279 | int level; | ||
280 | u64 *shadow_ent; | ||
281 | unsigned access = walker->pt_access; | ||
282 | |||
283 | if (!is_present_pte(walker->ptes[walker->level - 1])) | ||
284 | return NULL; | ||
285 | |||
286 | shadow_addr = vcpu->arch.mmu.root_hpa; | ||
287 | level = vcpu->arch.mmu.shadow_root_level; | ||
288 | if (level == PT32E_ROOT_LEVEL) { | ||
289 | shadow_addr = vcpu->arch.mmu.pae_root[(addr >> 30) & 3]; | ||
290 | shadow_addr &= PT64_BASE_ADDR_MASK; | ||
291 | --level; | ||
292 | } | ||
293 | |||
294 | for (; ; level--) { | ||
295 | u32 index = SHADOW_PT_INDEX(addr, level); | ||
296 | struct kvm_mmu_page *shadow_page; | ||
297 | u64 shadow_pte; | ||
298 | int metaphysical; | ||
299 | gfn_t table_gfn; | ||
300 | bool new_page = 0; | ||
301 | |||
302 | shadow_ent = ((u64 *)__va(shadow_addr)) + index; | ||
303 | if (level == PT_PAGE_TABLE_LEVEL) | ||
304 | break; | ||
305 | if (is_shadow_present_pte(*shadow_ent)) { | ||
306 | shadow_addr = *shadow_ent & PT64_BASE_ADDR_MASK; | ||
307 | continue; | ||
308 | } | ||
309 | |||
310 | if (level - 1 == PT_PAGE_TABLE_LEVEL | ||
311 | && walker->level == PT_DIRECTORY_LEVEL) { | ||
312 | metaphysical = 1; | ||
313 | if (!is_dirty_pte(walker->ptes[level - 1])) | ||
314 | access &= ~ACC_WRITE_MASK; | ||
315 | table_gfn = gpte_to_gfn(walker->ptes[level - 1]); | ||
316 | } else { | ||
317 | metaphysical = 0; | ||
318 | table_gfn = walker->table_gfn[level - 2]; | ||
319 | } | ||
320 | shadow_page = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1, | ||
321 | metaphysical, access, | ||
322 | shadow_ent, &new_page); | ||
323 | if (new_page && !metaphysical) { | ||
324 | int r; | ||
325 | pt_element_t curr_pte; | ||
326 | r = kvm_read_guest_atomic(vcpu->kvm, | ||
327 | walker->pte_gpa[level - 2], | ||
328 | &curr_pte, sizeof(curr_pte)); | ||
329 | if (r || curr_pte != walker->ptes[level - 2]) { | ||
330 | kvm_release_page_clean(page); | ||
331 | return NULL; | ||
332 | } | ||
333 | } | ||
334 | shadow_addr = __pa(shadow_page->spt); | ||
335 | shadow_pte = shadow_addr | PT_PRESENT_MASK | PT_ACCESSED_MASK | ||
336 | | PT_WRITABLE_MASK | PT_USER_MASK; | ||
337 | *shadow_ent = shadow_pte; | ||
338 | } | ||
339 | |||
340 | mmu_set_spte(vcpu, shadow_ent, access, walker->pte_access & access, | ||
341 | user_fault, write_fault, | ||
342 | walker->ptes[walker->level-1] & PT_DIRTY_MASK, | ||
343 | ptwrite, walker->gfn, page); | ||
344 | |||
345 | return shadow_ent; | ||
346 | } | ||
347 | |||
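/*
 * Annotation: FNAME(fetch) mirrors the guest walk in the shadow
 * hierarchy: at each level it either follows an existing present
 * shadow entry or links in a table obtained from kvm_mmu_get_page().
 * Intermediate shadow entries are deliberately installed maximally
 * permissive (present | writable | user | accessed); the guest's real
 * permissions are enforced only at the leaf by mmu_set_spte(). A
 * sketch of the descend step, with the mask value assumed to mirror
 * PT64_BASE_ADDR_MASK (physical-address bits 12..51):
 */
#define SKETCH_BASE_ADDR_MASK   (((1ULL << 52) - 1) & ~0xfffULL)

static uint64_t next_shadow_table(uint64_t shadow_pte)
{
        return shadow_pte & SKETCH_BASE_ADDR_MASK; /* strip flag bits */
}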
348 | /* | ||
349 | * Page fault handler. There are several causes for a page fault: | ||
350 | * - there is no shadow pte for the guest pte | ||
351 | * - write access through a shadow pte marked read only so that we can set | ||
352 | * the dirty bit | ||
353 | * - write access to a shadow pte marked read only so we can update the page | ||
354 | * dirty bitmap, when userspace requests it | ||
355 | * - mmio access; in this case we will never install a present shadow pte | ||
356 | * - normal guest page fault due to the guest pte marked not present, not | ||
357 | * writable, or not executable | ||
358 | * | ||
359 | * Returns: 1 if we need to emulate the instruction, 0 otherwise, or | ||
360 | * a negative value on error. | ||
361 | */ | ||
362 | static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, | ||
363 | u32 error_code) | ||
364 | { | ||
365 | int write_fault = error_code & PFERR_WRITE_MASK; | ||
366 | int user_fault = error_code & PFERR_USER_MASK; | ||
367 | int fetch_fault = error_code & PFERR_FETCH_MASK; | ||
368 | struct guest_walker walker; | ||
369 | u64 *shadow_pte; | ||
370 | int write_pt = 0; | ||
371 | int r; | ||
372 | struct page *page; | ||
373 | |||
374 | pgprintk("%s: addr %lx err %x\n", __FUNCTION__, addr, error_code); | ||
375 | kvm_mmu_audit(vcpu, "pre page fault"); | ||
376 | |||
377 | r = mmu_topup_memory_caches(vcpu); | ||
378 | if (r) | ||
379 | return r; | ||
380 | |||
381 | down_read(¤t->mm->mmap_sem); | ||
382 | /* | ||
383 | * Walk the guest page tables for the faulting address. | ||
384 | */ | ||
385 | r = FNAME(walk_addr)(&walker, vcpu, addr, write_fault, user_fault, | ||
386 | fetch_fault); | ||
387 | |||
388 | /* | ||
389 | * The page is not mapped by the guest. Let the guest handle it. | ||
390 | */ | ||
391 | if (!r) { | ||
392 | pgprintk("%s: guest page fault\n", __FUNCTION__); | ||
393 | inject_page_fault(vcpu, addr, walker.error_code); | ||
394 | vcpu->arch.last_pt_write_count = 0; /* reset fork detector */ | ||
395 | up_read(¤t->mm->mmap_sem); | ||
396 | return 0; | ||
397 | } | ||
398 | |||
399 | page = gfn_to_page(vcpu->kvm, walker.gfn); | ||
400 | |||
401 | spin_lock(&vcpu->kvm->mmu_lock); | ||
402 | kvm_mmu_free_some_pages(vcpu); | ||
403 | shadow_pte = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault, | ||
404 | &write_pt, page); | ||
405 | pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __FUNCTION__, | ||
406 | shadow_pte, *shadow_pte, write_pt); | ||
407 | |||
408 | if (!write_pt) | ||
409 | vcpu->arch.last_pt_write_count = 0; /* reset fork detector */ | ||
410 | |||
411 | /* | ||
412 | * mmio: emulate if accessible, otherwise it's a guest fault. | ||
413 | */ | ||
414 | if (shadow_pte && is_io_pte(*shadow_pte)) { | ||
415 | spin_unlock(&vcpu->kvm->mmu_lock); | ||
416 | up_read(¤t->mm->mmap_sem); | ||
417 | return 1; | ||
418 | } | ||
419 | |||
420 | ++vcpu->stat.pf_fixed; | ||
421 | kvm_mmu_audit(vcpu, "post page fault (fixed)"); | ||
422 | spin_unlock(&vcpu->kvm->mmu_lock); | ||
423 | up_read(¤t->mm->mmap_sem); | ||
424 | |||
425 | return write_pt; | ||
426 | } | ||
427 | |||
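/*
 * Annotation: a hypothetical caller-side dispatch illustrating the
 * return contract documented above -- 0 means the fault was fixed or
 * reflected and the guest can resume, 1 means the instruction must be
 * emulated, negative is an error. The names here are illustrative;
 * FNAME(page_fault) expands to paging64_page_fault in the
 * PTTYPE == 64 instantiation.
 */
static int handle_fault_sketch(struct kvm_vcpu *vcpu, gva_t addr, u32 ec)
{
        int r = paging64_page_fault(vcpu, addr, ec);

        if (r < 0)
                return r;  /* internal error: propagate to userspace */
        if (r == 0)
                return 1;  /* fault fixed or reflected: resume guest */
        /* r == 1: mmio or shadowed-pagetable write -- emulate the
         * faulting instruction rather than re-entering directly. */
        return 0;
}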
428 | static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr) | ||
429 | { | ||
430 | struct guest_walker walker; | ||
431 | gpa_t gpa = UNMAPPED_GVA; | ||
432 | int r; | ||
433 | |||
434 | r = FNAME(walk_addr)(&walker, vcpu, vaddr, 0, 0, 0); | ||
435 | |||
436 | if (r) { | ||
437 | gpa = gfn_to_gpa(walker.gfn); | ||
438 | gpa |= vaddr & ~PAGE_MASK; | ||
439 | } | ||
440 | |||
441 | return gpa; | ||
442 | } | ||
443 | |||
444 | static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu, | ||
445 | struct kvm_mmu_page *sp) | ||
446 | { | ||
447 | int i, offset = 0, r = 0; | ||
448 | pt_element_t pt; | ||
449 | |||
450 | if (sp->role.metaphysical | ||
451 | || (PTTYPE == 32 && sp->role.level > PT_PAGE_TABLE_LEVEL)) { | ||
452 | nonpaging_prefetch_page(vcpu, sp); | ||
453 | return; | ||
454 | } | ||
455 | |||
456 | if (PTTYPE == 32) | ||
457 | offset = sp->role.quadrant << PT64_LEVEL_BITS; | ||
458 | |||
459 | for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { | ||
460 | gpa_t pte_gpa = gfn_to_gpa(sp->gfn); | ||
461 | pte_gpa += (i+offset) * sizeof(pt_element_t); | ||
462 | |||
463 | r = kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &pt, | ||
464 | sizeof(pt_element_t)); | ||
465 | if (r || is_present_pte(pt)) | ||
466 | sp->spt[i] = shadow_trap_nonpresent_pte; | ||
467 | else | ||
468 | sp->spt[i] = shadow_notrap_nonpresent_pte; | ||
469 | } | ||
470 | } | ||
471 | |||
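/*
 * Annotation: the two nonpresent encodings chosen above are the heart
 * of the bypass_guest_pf optimisation (see the module parameter in the
 * vmx.c hunk below). shadow_trap_nonpresent_pte is a plain clear
 * entry, so a fault on it VM-exits into KVM; shadow_notrap_nonpresent_pte
 * is present-but-reserved, so the resulting #PF carries PFERR_RSVD and
 * can be delivered straight to the guest with no exit -- safe exactly
 * when the guest pte is known to be not-present, which is what the
 * loop checks. Illustrative values (the real ones are installed per
 * vendor module via kvm_mmu_set_nonpresent_ptes()):
 *
 *      trap   = 0ULL;                              // fault -> VM exit
 *      notrap = PT_PRESENT_MASK | (0xffULL << 49); // guest-visible #PF
 */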
472 | #undef pt_element_t | ||
473 | #undef guest_walker | ||
474 | #undef FNAME | ||
475 | #undef PT_BASE_ADDR_MASK | ||
476 | #undef PT_INDEX | ||
477 | #undef SHADOW_PT_INDEX | ||
478 | #undef PT_LEVEL_MASK | ||
479 | #undef PT_DIR_BASE_ADDR_MASK | ||
480 | #undef PT_LEVEL_BITS | ||
481 | #undef PT_MAX_FULL_LEVELS | ||
482 | #undef gpte_to_gfn | ||
483 | #undef gpte_to_gfn_pde | ||
484 | #undef CMPXCHG | ||
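/*
 * Annotation: everything above is a preprocessor "template". mmu.c
 * includes this header once per guest paging flavour, FNAME() pastes
 * the width into every symbol, and the #undef block here resets the
 * namespace so the next inclusion can redefine it. In effect (a
 * sketch of the including side):
 */
#define PTTYPE 64
#include "paging_tmpl.h"    /* emits paging64_walk_addr(), ... */
#undef  PTTYPE

#define PTTYPE 32
#include "paging_tmpl.h"    /* emits paging32_walk_addr(), ... */
#undef  PTTYPE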
diff --git a/drivers/kvm/segment_descriptor.h b/arch/x86/kvm/segment_descriptor.h index 71fdf458619a..56fc4c873389 100644 --- a/drivers/kvm/segment_descriptor.h +++ b/arch/x86/kvm/segment_descriptor.h | |||
@@ -1,3 +1,6 @@ | |||
1 | #ifndef __SEGMENT_DESCRIPTOR_H | ||
2 | #define __SEGMENT_DESCRIPTOR_H | ||
3 | |||
1 | struct segment_descriptor { | 4 | struct segment_descriptor { |
2 | u16 limit_low; | 5 | u16 limit_low; |
3 | u16 base_low; | 6 | u16 base_low; |
@@ -14,4 +17,13 @@ struct segment_descriptor { | |||
14 | u8 base_high; | 17 | u8 base_high; |
15 | } __attribute__((packed)); | 18 | } __attribute__((packed)); |
16 | 19 | ||
20 | #ifdef CONFIG_X86_64 | ||
21 | /* LDT or TSS descriptor in the GDT. 16 bytes. */ | ||
22 | struct segment_descriptor_64 { | ||
23 | struct segment_descriptor s; | ||
24 | u32 base_higher; | ||
25 | u32 pad_zero; | ||
26 | }; | ||
17 | 27 | ||
28 | #endif | ||
29 | #endif | ||
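/*
 * Annotation: in long mode, LDT and TSS descriptors grow to 16 bytes
 * so they can hold a full 64-bit base; segment_descriptor_64 above
 * models that as the legacy 8-byte descriptor plus a third base field.
 * A hypothetical helper reassembling the base (base_mid/base_high sit
 * in the elided middle of the struct; names assumed from the usual
 * descriptor layout):
 */
static inline u64 seg_desc64_base(struct segment_descriptor_64 *d)
{
        return (u64)d->s.base_low
                | ((u64)d->s.base_mid  << 16)
                | ((u64)d->s.base_high << 24)
                | ((u64)d->base_higher << 32);
}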
diff --git a/drivers/kvm/svm.c b/arch/x86/kvm/svm.c index ced4ac1955db..de755cb1431d 100644 --- a/drivers/kvm/svm.c +++ b/arch/x86/kvm/svm.c | |||
@@ -13,10 +13,11 @@ | |||
13 | * the COPYING file in the top-level directory. | 13 | * the COPYING file in the top-level directory. |
14 | * | 14 | * |
15 | */ | 15 | */ |
16 | #include <linux/kvm_host.h> | ||
16 | 17 | ||
17 | #include "kvm_svm.h" | 18 | #include "kvm_svm.h" |
18 | #include "x86_emulate.h" | ||
19 | #include "irq.h" | 19 | #include "irq.h" |
20 | #include "mmu.h" | ||
20 | 21 | ||
21 | #include <linux/module.h> | 22 | #include <linux/module.h> |
22 | #include <linux/kernel.h> | 23 | #include <linux/kernel.h> |
@@ -42,9 +43,6 @@ MODULE_LICENSE("GPL"); | |||
42 | #define SEG_TYPE_LDT 2 | 43 | #define SEG_TYPE_LDT 2 |
43 | #define SEG_TYPE_BUSY_TSS16 3 | 44 | #define SEG_TYPE_BUSY_TSS16 3 |
44 | 45 | ||
45 | #define KVM_EFER_LMA (1 << 10) | ||
46 | #define KVM_EFER_LME (1 << 8) | ||
47 | |||
48 | #define SVM_FEATURE_NPT (1 << 0) | 46 | #define SVM_FEATURE_NPT (1 << 0) |
49 | #define SVM_FEATURE_LBRV (1 << 1) | 47 | #define SVM_FEATURE_LBRV (1 << 1) |
50 | #define SVM_DEATURE_SVML (1 << 2) | 48 | #define SVM_DEATURE_SVML (1 << 2) |
@@ -102,20 +100,20 @@ static inline u32 svm_has(u32 feat) | |||
102 | 100 | ||
103 | static inline u8 pop_irq(struct kvm_vcpu *vcpu) | 101 | static inline u8 pop_irq(struct kvm_vcpu *vcpu) |
104 | { | 102 | { |
105 | int word_index = __ffs(vcpu->irq_summary); | 103 | int word_index = __ffs(vcpu->arch.irq_summary); |
106 | int bit_index = __ffs(vcpu->irq_pending[word_index]); | 104 | int bit_index = __ffs(vcpu->arch.irq_pending[word_index]); |
107 | int irq = word_index * BITS_PER_LONG + bit_index; | 105 | int irq = word_index * BITS_PER_LONG + bit_index; |
108 | 106 | ||
109 | clear_bit(bit_index, &vcpu->irq_pending[word_index]); | 107 | clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]); |
110 | if (!vcpu->irq_pending[word_index]) | 108 | if (!vcpu->arch.irq_pending[word_index]) |
111 | clear_bit(word_index, &vcpu->irq_summary); | 109 | clear_bit(word_index, &vcpu->arch.irq_summary); |
112 | return irq; | 110 | return irq; |
113 | } | 111 | } |
114 | 112 | ||
115 | static inline void push_irq(struct kvm_vcpu *vcpu, u8 irq) | 113 | static inline void push_irq(struct kvm_vcpu *vcpu, u8 irq) |
116 | { | 114 | { |
117 | set_bit(irq, vcpu->irq_pending); | 115 | set_bit(irq, vcpu->arch.irq_pending); |
118 | set_bit(irq / BITS_PER_LONG, &vcpu->irq_summary); | 116 | set_bit(irq / BITS_PER_LONG, &vcpu->arch.irq_summary); |
119 | } | 117 | } |
120 | 118 | ||
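/*
 * Annotation: a worked example of the two-level bitmap this pair
 * maintains. With BITS_PER_LONG == 64, push_irq(vcpu, 70) sets bit 6
 * of irq_pending[1] and bit 1 of irq_summary; pop_irq() then computes
 * word_index = __ffs(summary) = 1, bit_index = __ffs(pending[1]) = 6,
 * returns 1 * 64 + 6 = 70, and clears the summary bit once the word
 * empties. Two ffs lookups thus find the lowest pending vector no
 * matter how many are queued.
 */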
121 | static inline void clgi(void) | 119 | static inline void clgi(void) |
@@ -184,35 +182,30 @@ static inline void flush_guest_tlb(struct kvm_vcpu *vcpu) | |||
184 | 182 | ||
185 | static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer) | 183 | static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer) |
186 | { | 184 | { |
187 | if (!(efer & KVM_EFER_LMA)) | 185 | if (!(efer & EFER_LMA)) |
188 | efer &= ~KVM_EFER_LME; | 186 | efer &= ~EFER_LME; |
189 | 187 | ||
190 | to_svm(vcpu)->vmcb->save.efer = efer | MSR_EFER_SVME_MASK; | 188 | to_svm(vcpu)->vmcb->save.efer = efer | MSR_EFER_SVME_MASK; |
191 | vcpu->shadow_efer = efer; | 189 | vcpu->arch.shadow_efer = efer; |
192 | } | 190 | } |
193 | 191 | ||
194 | static void svm_inject_gp(struct kvm_vcpu *vcpu, unsigned error_code) | 192 | static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, |
193 | bool has_error_code, u32 error_code) | ||
195 | { | 194 | { |
196 | struct vcpu_svm *svm = to_svm(vcpu); | 195 | struct vcpu_svm *svm = to_svm(vcpu); |
197 | 196 | ||
198 | svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | | 197 | svm->vmcb->control.event_inj = nr |
199 | SVM_EVTINJ_VALID_ERR | | 198 | | SVM_EVTINJ_VALID |
200 | SVM_EVTINJ_TYPE_EXEPT | | 199 | | (has_error_code ? SVM_EVTINJ_VALID_ERR : 0) |
201 | GP_VECTOR; | 200 | | SVM_EVTINJ_TYPE_EXEPT; |
202 | svm->vmcb->control.event_inj_err = error_code; | 201 | svm->vmcb->control.event_inj_err = error_code; |
203 | } | 202 | } |
204 | 203 | ||
205 | static void inject_ud(struct kvm_vcpu *vcpu) | 204 | static bool svm_exception_injected(struct kvm_vcpu *vcpu) |
206 | { | 205 | { |
207 | to_svm(vcpu)->vmcb->control.event_inj = SVM_EVTINJ_VALID | | 206 | struct vcpu_svm *svm = to_svm(vcpu); |
208 | SVM_EVTINJ_TYPE_EXEPT | | ||
209 | UD_VECTOR; | ||
210 | } | ||
211 | 207 | ||
212 | static int is_page_fault(uint32_t info) | 208 | return !(svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_VALID); |
213 | { | ||
214 | info &= SVM_EVTINJ_VEC_MASK | SVM_EVTINJ_TYPE_MASK | SVM_EVTINJ_VALID; | ||
215 | return info == (PF_VECTOR | SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_EXEPT); | ||
216 | } | 209 | } |
217 | 210 | ||
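/*
 * Annotation: the field svm_queue_exception() assembles follows the
 * SVM EVENTINJ layout -- vector in bits 7:0, type in bits 10:8
 * (3 = exception), error-code-valid in bit 11, valid in bit 31.
 * Injecting #GP with an error code would therefore encode as:
 */
u32 evt = 13            /* GP_VECTOR             */
        | (3u << 8)     /* SVM_EVTINJ_TYPE_EXEPT */
        | (1u << 11)    /* SVM_EVTINJ_VALID_ERR  */
        | (1u << 31);   /* SVM_EVTINJ_VALID      */
/* The error code itself goes in vmcb->control.event_inj_err. */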
218 | static int is_external_interrupt(u32 info) | 211 | static int is_external_interrupt(u32 info) |
@@ -229,17 +222,16 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu) | |||
229 | printk(KERN_DEBUG "%s: NOP\n", __FUNCTION__); | 222 | printk(KERN_DEBUG "%s: NOP\n", __FUNCTION__); |
230 | return; | 223 | return; |
231 | } | 224 | } |
232 | if (svm->next_rip - svm->vmcb->save.rip > MAX_INST_SIZE) { | 225 | if (svm->next_rip - svm->vmcb->save.rip > MAX_INST_SIZE) |
233 | printk(KERN_ERR "%s: ip 0x%llx next 0x%llx\n", | 226 | printk(KERN_ERR "%s: ip 0x%llx next 0x%llx\n", |
234 | __FUNCTION__, | 227 | __FUNCTION__, |
235 | svm->vmcb->save.rip, | 228 | svm->vmcb->save.rip, |
236 | svm->next_rip); | 229 | svm->next_rip); |
237 | } | ||
238 | 230 | ||
239 | vcpu->rip = svm->vmcb->save.rip = svm->next_rip; | 231 | vcpu->arch.rip = svm->vmcb->save.rip = svm->next_rip; |
240 | svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK; | 232 | svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK; |
241 | 233 | ||
242 | vcpu->interrupt_window_open = 1; | 234 | vcpu->arch.interrupt_window_open = 1; |
243 | } | 235 | } |
244 | 236 | ||
245 | static int has_svm(void) | 237 | static int has_svm(void) |
@@ -312,7 +304,7 @@ static void svm_hardware_enable(void *garbage) | |||
312 | svm_data->next_asid = svm_data->max_asid + 1; | 304 | svm_data->next_asid = svm_data->max_asid + 1; |
313 | svm_features = cpuid_edx(SVM_CPUID_FUNC); | 305 | svm_features = cpuid_edx(SVM_CPUID_FUNC); |
314 | 306 | ||
315 | asm volatile ( "sgdt %0" : "=m"(gdt_descr) ); | 307 | asm volatile ("sgdt %0" : "=m"(gdt_descr)); |
316 | gdt = (struct desc_struct *)gdt_descr.address; | 308 | gdt = (struct desc_struct *)gdt_descr.address; |
317 | svm_data->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS); | 309 | svm_data->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS); |
318 | 310 | ||
@@ -458,11 +450,13 @@ static void init_vmcb(struct vmcb *vmcb) | |||
458 | 450 | ||
459 | control->intercept_cr_read = INTERCEPT_CR0_MASK | | 451 | control->intercept_cr_read = INTERCEPT_CR0_MASK | |
460 | INTERCEPT_CR3_MASK | | 452 | INTERCEPT_CR3_MASK | |
461 | INTERCEPT_CR4_MASK; | 453 | INTERCEPT_CR4_MASK | |
454 | INTERCEPT_CR8_MASK; | ||
462 | 455 | ||
463 | control->intercept_cr_write = INTERCEPT_CR0_MASK | | 456 | control->intercept_cr_write = INTERCEPT_CR0_MASK | |
464 | INTERCEPT_CR3_MASK | | 457 | INTERCEPT_CR3_MASK | |
465 | INTERCEPT_CR4_MASK; | 458 | INTERCEPT_CR4_MASK | |
459 | INTERCEPT_CR8_MASK; | ||
466 | 460 | ||
467 | control->intercept_dr_read = INTERCEPT_DR0_MASK | | 461 | control->intercept_dr_read = INTERCEPT_DR0_MASK | |
468 | INTERCEPT_DR1_MASK | | 462 | INTERCEPT_DR1_MASK | |
@@ -476,7 +470,8 @@ static void init_vmcb(struct vmcb *vmcb) | |||
476 | INTERCEPT_DR5_MASK | | 470 | INTERCEPT_DR5_MASK | |
477 | INTERCEPT_DR7_MASK; | 471 | INTERCEPT_DR7_MASK; |
478 | 472 | ||
479 | control->intercept_exceptions = 1 << PF_VECTOR; | 473 | control->intercept_exceptions = (1 << PF_VECTOR) | |
474 | (1 << UD_VECTOR); | ||
480 | 475 | ||
481 | 476 | ||
482 | control->intercept = (1ULL << INTERCEPT_INTR) | | 477 | control->intercept = (1ULL << INTERCEPT_INTR) | |
@@ -543,8 +538,7 @@ static void init_vmcb(struct vmcb *vmcb) | |||
543 | init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16); | 538 | init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16); |
544 | 539 | ||
545 | save->efer = MSR_EFER_SVME_MASK; | 540 | save->efer = MSR_EFER_SVME_MASK; |
546 | 541 | save->dr6 = 0xffff0ff0; | |
547 | save->dr6 = 0xffff0ff0; | ||
548 | save->dr7 = 0x400; | 542 | save->dr7 = 0x400; |
549 | save->rflags = 2; | 543 | save->rflags = 2; |
550 | save->rip = 0x0000fff0; | 544 | save->rip = 0x0000fff0; |
@@ -558,7 +552,7 @@ static void init_vmcb(struct vmcb *vmcb) | |||
558 | /* rdx = ?? */ | 552 | /* rdx = ?? */ |
559 | } | 553 | } |
560 | 554 | ||
561 | static void svm_vcpu_reset(struct kvm_vcpu *vcpu) | 555 | static int svm_vcpu_reset(struct kvm_vcpu *vcpu) |
562 | { | 556 | { |
563 | struct vcpu_svm *svm = to_svm(vcpu); | 557 | struct vcpu_svm *svm = to_svm(vcpu); |
564 | 558 | ||
@@ -566,9 +560,11 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu) | |||
566 | 560 | ||
567 | if (vcpu->vcpu_id != 0) { | 561 | if (vcpu->vcpu_id != 0) { |
568 | svm->vmcb->save.rip = 0; | 562 | svm->vmcb->save.rip = 0; |
569 | svm->vmcb->save.cs.base = svm->vcpu.sipi_vector << 12; | 563 | svm->vmcb->save.cs.base = svm->vcpu.arch.sipi_vector << 12; |
570 | svm->vmcb->save.cs.selector = svm->vcpu.sipi_vector << 8; | 564 | svm->vmcb->save.cs.selector = svm->vcpu.arch.sipi_vector << 8; |
571 | } | 565 | } |
566 | |||
567 | return 0; | ||
572 | } | 568 | } |
573 | 569 | ||
574 | static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) | 570 | static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) |
@@ -587,12 +583,6 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) | |||
587 | if (err) | 583 | if (err) |
588 | goto free_svm; | 584 | goto free_svm; |
589 | 585 | ||
590 | if (irqchip_in_kernel(kvm)) { | ||
591 | err = kvm_create_lapic(&svm->vcpu); | ||
592 | if (err < 0) | ||
593 | goto free_svm; | ||
594 | } | ||
595 | |||
596 | page = alloc_page(GFP_KERNEL); | 586 | page = alloc_page(GFP_KERNEL); |
597 | if (!page) { | 587 | if (!page) { |
598 | err = -ENOMEM; | 588 | err = -ENOMEM; |
@@ -608,9 +598,9 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) | |||
608 | 598 | ||
609 | fx_init(&svm->vcpu); | 599 | fx_init(&svm->vcpu); |
610 | svm->vcpu.fpu_active = 1; | 600 | svm->vcpu.fpu_active = 1; |
611 | svm->vcpu.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; | 601 | svm->vcpu.arch.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; |
612 | if (svm->vcpu.vcpu_id == 0) | 602 | if (svm->vcpu.vcpu_id == 0) |
613 | svm->vcpu.apic_base |= MSR_IA32_APICBASE_BSP; | 603 | svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP; |
614 | 604 | ||
615 | return &svm->vcpu; | 605 | return &svm->vcpu; |
616 | 606 | ||
@@ -644,7 +634,7 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) | |||
644 | * increasing TSC. | 634 | * increasing TSC. |
645 | */ | 635 | */ |
646 | rdtscll(tsc_this); | 636 | rdtscll(tsc_this); |
647 | delta = vcpu->host_tsc - tsc_this; | 637 | delta = vcpu->arch.host_tsc - tsc_this; |
648 | svm->vmcb->control.tsc_offset += delta; | 638 | svm->vmcb->control.tsc_offset += delta; |
649 | vcpu->cpu = cpu; | 639 | vcpu->cpu = cpu; |
650 | kvm_migrate_apic_timer(vcpu); | 640 | kvm_migrate_apic_timer(vcpu); |
@@ -659,11 +649,11 @@ static void svm_vcpu_put(struct kvm_vcpu *vcpu) | |||
659 | struct vcpu_svm *svm = to_svm(vcpu); | 649 | struct vcpu_svm *svm = to_svm(vcpu); |
660 | int i; | 650 | int i; |
661 | 651 | ||
652 | ++vcpu->stat.host_state_reload; | ||
662 | for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) | 653 | for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) |
663 | wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]); | 654 | wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]); |
664 | 655 | ||
665 | rdtscll(vcpu->host_tsc); | 656 | rdtscll(vcpu->arch.host_tsc); |
666 | kvm_put_guest_fpu(vcpu); | ||
667 | } | 657 | } |
668 | 658 | ||
669 | static void svm_vcpu_decache(struct kvm_vcpu *vcpu) | 659 | static void svm_vcpu_decache(struct kvm_vcpu *vcpu) |
@@ -674,17 +664,17 @@ static void svm_cache_regs(struct kvm_vcpu *vcpu) | |||
674 | { | 664 | { |
675 | struct vcpu_svm *svm = to_svm(vcpu); | 665 | struct vcpu_svm *svm = to_svm(vcpu); |
676 | 666 | ||
677 | vcpu->regs[VCPU_REGS_RAX] = svm->vmcb->save.rax; | 667 | vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax; |
678 | vcpu->regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp; | 668 | vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp; |
679 | vcpu->rip = svm->vmcb->save.rip; | 669 | vcpu->arch.rip = svm->vmcb->save.rip; |
680 | } | 670 | } |
681 | 671 | ||
682 | static void svm_decache_regs(struct kvm_vcpu *vcpu) | 672 | static void svm_decache_regs(struct kvm_vcpu *vcpu) |
683 | { | 673 | { |
684 | struct vcpu_svm *svm = to_svm(vcpu); | 674 | struct vcpu_svm *svm = to_svm(vcpu); |
685 | svm->vmcb->save.rax = vcpu->regs[VCPU_REGS_RAX]; | 675 | svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX]; |
686 | svm->vmcb->save.rsp = vcpu->regs[VCPU_REGS_RSP]; | 676 | svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP]; |
687 | svm->vmcb->save.rip = vcpu->rip; | 677 | svm->vmcb->save.rip = vcpu->arch.rip; |
688 | } | 678 | } |
689 | 679 | ||
690 | static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu) | 680 | static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu) |
@@ -782,24 +772,24 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) | |||
782 | struct vcpu_svm *svm = to_svm(vcpu); | 772 | struct vcpu_svm *svm = to_svm(vcpu); |
783 | 773 | ||
784 | #ifdef CONFIG_X86_64 | 774 | #ifdef CONFIG_X86_64 |
785 | if (vcpu->shadow_efer & KVM_EFER_LME) { | 775 | if (vcpu->arch.shadow_efer & EFER_LME) { |
786 | if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) { | 776 | if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) { |
787 | vcpu->shadow_efer |= KVM_EFER_LMA; | 777 | vcpu->arch.shadow_efer |= EFER_LMA; |
788 | svm->vmcb->save.efer |= KVM_EFER_LMA | KVM_EFER_LME; | 778 | svm->vmcb->save.efer |= EFER_LMA | EFER_LME; |
789 | } | 779 | } |
790 | 780 | ||
791 | if (is_paging(vcpu) && !(cr0 & X86_CR0_PG) ) { | 781 | if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) { |
792 | vcpu->shadow_efer &= ~KVM_EFER_LMA; | 782 | vcpu->arch.shadow_efer &= ~EFER_LMA; |
793 | svm->vmcb->save.efer &= ~(KVM_EFER_LMA | KVM_EFER_LME); | 783 | svm->vmcb->save.efer &= ~(EFER_LMA | EFER_LME); |
794 | } | 784 | } |
795 | } | 785 | } |
796 | #endif | 786 | #endif |
797 | if ((vcpu->cr0 & X86_CR0_TS) && !(cr0 & X86_CR0_TS)) { | 787 | if ((vcpu->arch.cr0 & X86_CR0_TS) && !(cr0 & X86_CR0_TS)) { |
798 | svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR); | 788 | svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR); |
799 | vcpu->fpu_active = 1; | 789 | vcpu->fpu_active = 1; |
800 | } | 790 | } |
801 | 791 | ||
802 | vcpu->cr0 = cr0; | 792 | vcpu->arch.cr0 = cr0; |
803 | cr0 |= X86_CR0_PG | X86_CR0_WP; | 793 | cr0 |= X86_CR0_PG | X86_CR0_WP; |
804 | cr0 &= ~(X86_CR0_CD | X86_CR0_NW); | 794 | cr0 &= ~(X86_CR0_CD | X86_CR0_NW); |
805 | svm->vmcb->save.cr0 = cr0; | 795 | svm->vmcb->save.cr0 = cr0; |
@@ -807,7 +797,7 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) | |||
807 | 797 | ||
808 | static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) | 798 | static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) |
809 | { | 799 | { |
810 | vcpu->cr4 = cr4; | 800 | vcpu->arch.cr4 = cr4; |
811 | to_svm(vcpu)->vmcb->save.cr4 = cr4 | X86_CR4_PAE; | 801 | to_svm(vcpu)->vmcb->save.cr4 = cr4 | X86_CR4_PAE; |
812 | } | 802 | } |
813 | 803 | ||
@@ -912,7 +902,7 @@ static void svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value, | |||
912 | svm->db_regs[dr] = value; | 902 | svm->db_regs[dr] = value; |
913 | return; | 903 | return; |
914 | case 4 ... 5: | 904 | case 4 ... 5: |
915 | if (vcpu->cr4 & X86_CR4_DE) { | 905 | if (vcpu->arch.cr4 & X86_CR4_DE) { |
916 | *exception = UD_VECTOR; | 906 | *exception = UD_VECTOR; |
917 | return; | 907 | return; |
918 | } | 908 | } |
@@ -938,51 +928,30 @@ static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) | |||
938 | struct kvm *kvm = svm->vcpu.kvm; | 928 | struct kvm *kvm = svm->vcpu.kvm; |
939 | u64 fault_address; | 929 | u64 fault_address; |
940 | u32 error_code; | 930 | u32 error_code; |
941 | enum emulation_result er; | ||
942 | int r; | ||
943 | 931 | ||
944 | if (!irqchip_in_kernel(kvm) && | 932 | if (!irqchip_in_kernel(kvm) && |
945 | is_external_interrupt(exit_int_info)) | 933 | is_external_interrupt(exit_int_info)) |
946 | push_irq(&svm->vcpu, exit_int_info & SVM_EVTINJ_VEC_MASK); | 934 | push_irq(&svm->vcpu, exit_int_info & SVM_EVTINJ_VEC_MASK); |
947 | 935 | ||
948 | mutex_lock(&kvm->lock); | ||
949 | |||
950 | fault_address = svm->vmcb->control.exit_info_2; | 936 | fault_address = svm->vmcb->control.exit_info_2; |
951 | error_code = svm->vmcb->control.exit_info_1; | 937 | error_code = svm->vmcb->control.exit_info_1; |
952 | r = kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code); | 938 | return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code); |
953 | if (r < 0) { | 939 | } |
954 | mutex_unlock(&kvm->lock); | ||
955 | return r; | ||
956 | } | ||
957 | if (!r) { | ||
958 | mutex_unlock(&kvm->lock); | ||
959 | return 1; | ||
960 | } | ||
961 | er = emulate_instruction(&svm->vcpu, kvm_run, fault_address, | ||
962 | error_code); | ||
963 | mutex_unlock(&kvm->lock); | ||
964 | 940 | ||
965 | switch (er) { | 941 | static int ud_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) |
966 | case EMULATE_DONE: | 942 | { |
967 | return 1; | 943 | int er; |
968 | case EMULATE_DO_MMIO: | ||
969 | ++svm->vcpu.stat.mmio_exits; | ||
970 | return 0; | ||
971 | case EMULATE_FAIL: | ||
972 | kvm_report_emulation_failure(&svm->vcpu, "pagetable"); | ||
973 | break; | ||
974 | default: | ||
975 | BUG(); | ||
976 | } | ||
977 | 944 | ||
978 | kvm_run->exit_reason = KVM_EXIT_UNKNOWN; | 945 | er = emulate_instruction(&svm->vcpu, kvm_run, 0, 0, EMULTYPE_TRAP_UD); |
979 | return 0; | 946 | if (er != EMULATE_DONE) |
947 | kvm_queue_exception(&svm->vcpu, UD_VECTOR); | ||
948 | return 1; | ||
980 | } | 949 | } |
981 | 950 | ||
982 | static int nm_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) | 951 | static int nm_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) |
983 | { | 952 | { |
984 | svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR); | 953 | svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR); |
985 | if (!(svm->vcpu.cr0 & X86_CR0_TS)) | 954 | if (!(svm->vcpu.arch.cr0 & X86_CR0_TS)) |
986 | svm->vmcb->save.cr0 &= ~X86_CR0_TS; | 955 | svm->vmcb->save.cr0 &= ~X86_CR0_TS; |
987 | svm->vcpu.fpu_active = 1; | 956 | svm->vcpu.fpu_active = 1; |
988 | 957 | ||
@@ -1004,7 +973,7 @@ static int shutdown_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) | |||
1004 | 973 | ||
1005 | static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) | 974 | static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) |
1006 | { | 975 | { |
1007 | u32 io_info = svm->vmcb->control.exit_info_1; //address size bug? | 976 | u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */ |
1008 | int size, down, in, string, rep; | 977 | int size, down, in, string, rep; |
1009 | unsigned port; | 978 | unsigned port; |
1010 | 979 | ||
@@ -1015,7 +984,8 @@ static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) | |||
1015 | string = (io_info & SVM_IOIO_STR_MASK) != 0; | 984 | string = (io_info & SVM_IOIO_STR_MASK) != 0; |
1016 | 985 | ||
1017 | if (string) { | 986 | if (string) { |
1018 | if (emulate_instruction(&svm->vcpu, kvm_run, 0, 0) == EMULATE_DO_MMIO) | 987 | if (emulate_instruction(&svm->vcpu, |
988 | kvm_run, 0, 0, 0) == EMULATE_DO_MMIO) | ||
1019 | return 0; | 989 | return 0; |
1020 | return 1; | 990 | return 1; |
1021 | } | 991 | } |
@@ -1045,13 +1015,14 @@ static int vmmcall_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) | |||
1045 | { | 1015 | { |
1046 | svm->next_rip = svm->vmcb->save.rip + 3; | 1016 | svm->next_rip = svm->vmcb->save.rip + 3; |
1047 | skip_emulated_instruction(&svm->vcpu); | 1017 | skip_emulated_instruction(&svm->vcpu); |
1048 | return kvm_hypercall(&svm->vcpu, kvm_run); | 1018 | kvm_emulate_hypercall(&svm->vcpu); |
1019 | return 1; | ||
1049 | } | 1020 | } |
1050 | 1021 | ||
1051 | static int invalid_op_interception(struct vcpu_svm *svm, | 1022 | static int invalid_op_interception(struct vcpu_svm *svm, |
1052 | struct kvm_run *kvm_run) | 1023 | struct kvm_run *kvm_run) |
1053 | { | 1024 | { |
1054 | inject_ud(&svm->vcpu); | 1025 | kvm_queue_exception(&svm->vcpu, UD_VECTOR); |
1055 | return 1; | 1026 | return 1; |
1056 | } | 1027 | } |
1057 | 1028 | ||
@@ -1073,11 +1044,20 @@ static int cpuid_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) | |||
1073 | static int emulate_on_interception(struct vcpu_svm *svm, | 1044 | static int emulate_on_interception(struct vcpu_svm *svm, |
1074 | struct kvm_run *kvm_run) | 1045 | struct kvm_run *kvm_run) |
1075 | { | 1046 | { |
1076 | if (emulate_instruction(&svm->vcpu, NULL, 0, 0) != EMULATE_DONE) | 1047 | if (emulate_instruction(&svm->vcpu, NULL, 0, 0, 0) != EMULATE_DONE) |
1077 | pr_unimpl(&svm->vcpu, "%s: failed\n", __FUNCTION__); | 1048 | pr_unimpl(&svm->vcpu, "%s: failed\n", __FUNCTION__); |
1078 | return 1; | 1049 | return 1; |
1079 | } | 1050 | } |
1080 | 1051 | ||
1052 | static int cr8_write_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) | ||
1053 | { | ||
1054 | emulate_instruction(&svm->vcpu, NULL, 0, 0, 0); | ||
1055 | if (irqchip_in_kernel(svm->vcpu.kvm)) | ||
1056 | return 1; | ||
1057 | kvm_run->exit_reason = KVM_EXIT_SET_TPR; | ||
1058 | return 0; | ||
1059 | } | ||
1060 | |||
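/*
 * Annotation: CR8 is the 64-bit architectural alias of the local APIC
 * TPR, which is why the new handler above emulates the write and, when
 * there is no in-kernel irqchip, bounces to userspace with
 * KVM_EXIT_SET_TPR so the userspace APIC model can observe it. The
 * mapping (per the architecture; a background note, not in this hunk):
 *
 *      cr8 = tpr >> 4;   // CR8[3:0] holds TPR bits 7:4
 */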
1081 | static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data) | 1061 | static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data) |
1082 | { | 1062 | { |
1083 | struct vcpu_svm *svm = to_svm(vcpu); | 1063 | struct vcpu_svm *svm = to_svm(vcpu); |
@@ -1124,14 +1104,14 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data) | |||
1124 | 1104 | ||
1125 | static int rdmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) | 1105 | static int rdmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) |
1126 | { | 1106 | { |
1127 | u32 ecx = svm->vcpu.regs[VCPU_REGS_RCX]; | 1107 | u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX]; |
1128 | u64 data; | 1108 | u64 data; |
1129 | 1109 | ||
1130 | if (svm_get_msr(&svm->vcpu, ecx, &data)) | 1110 | if (svm_get_msr(&svm->vcpu, ecx, &data)) |
1131 | svm_inject_gp(&svm->vcpu, 0); | 1111 | kvm_inject_gp(&svm->vcpu, 0); |
1132 | else { | 1112 | else { |
1133 | svm->vmcb->save.rax = data & 0xffffffff; | 1113 | svm->vmcb->save.rax = data & 0xffffffff; |
1134 | svm->vcpu.regs[VCPU_REGS_RDX] = data >> 32; | 1114 | svm->vcpu.arch.regs[VCPU_REGS_RDX] = data >> 32; |
1135 | svm->next_rip = svm->vmcb->save.rip + 2; | 1115 | svm->next_rip = svm->vmcb->save.rip + 2; |
1136 | skip_emulated_instruction(&svm->vcpu); | 1116 | skip_emulated_instruction(&svm->vcpu); |
1137 | } | 1117 | } |
@@ -1176,7 +1156,20 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data) | |||
1176 | case MSR_IA32_SYSENTER_ESP: | 1156 | case MSR_IA32_SYSENTER_ESP: |
1177 | svm->vmcb->save.sysenter_esp = data; | 1157 | svm->vmcb->save.sysenter_esp = data; |
1178 | break; | 1158 | break; |
1159 | case MSR_K7_EVNTSEL0: | ||
1160 | case MSR_K7_EVNTSEL1: | ||
1161 | case MSR_K7_EVNTSEL2: | ||
1162 | case MSR_K7_EVNTSEL3: | ||
1163 | /* | ||
1164 | * Only support writing 0 to the performance counters for now, | ||
1165 | * to make Windows happy. This should be replaced by real | ||
1166 | * performance counter emulation later. | ||
1167 | */ | ||
1168 | if (data != 0) | ||
1169 | goto unhandled; | ||
1170 | break; | ||
1179 | default: | 1171 | default: |
1172 | unhandled: | ||
1180 | return kvm_set_msr_common(vcpu, ecx, data); | 1173 | return kvm_set_msr_common(vcpu, ecx, data); |
1181 | } | 1174 | } |
1182 | return 0; | 1175 | return 0; |
@@ -1184,12 +1177,12 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data) | |||
1184 | 1177 | ||
1185 | static int wrmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) | 1178 | static int wrmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) |
1186 | { | 1179 | { |
1187 | u32 ecx = svm->vcpu.regs[VCPU_REGS_RCX]; | 1180 | u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX]; |
1188 | u64 data = (svm->vmcb->save.rax & -1u) | 1181 | u64 data = (svm->vmcb->save.rax & -1u) |
1189 | | ((u64)(svm->vcpu.regs[VCPU_REGS_RDX] & -1u) << 32); | 1182 | | ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32); |
1190 | svm->next_rip = svm->vmcb->save.rip + 2; | 1183 | svm->next_rip = svm->vmcb->save.rip + 2; |
1191 | if (svm_set_msr(&svm->vcpu, ecx, data)) | 1184 | if (svm_set_msr(&svm->vcpu, ecx, data)) |
1192 | svm_inject_gp(&svm->vcpu, 0); | 1185 | kvm_inject_gp(&svm->vcpu, 0); |
1193 | else | 1186 | else |
1194 | skip_emulated_instruction(&svm->vcpu); | 1187 | skip_emulated_instruction(&svm->vcpu); |
1195 | return 1; | 1188 | return 1; |
@@ -1213,7 +1206,7 @@ static int interrupt_window_interception(struct vcpu_svm *svm, | |||
1213 | * possible | 1206 | * possible |
1214 | */ | 1207 | */ |
1215 | if (kvm_run->request_interrupt_window && | 1208 | if (kvm_run->request_interrupt_window && |
1216 | !svm->vcpu.irq_summary) { | 1209 | !svm->vcpu.arch.irq_summary) { |
1217 | ++svm->vcpu.stat.irq_window_exits; | 1210 | ++svm->vcpu.stat.irq_window_exits; |
1218 | kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; | 1211 | kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; |
1219 | return 0; | 1212 | return 0; |
@@ -1227,10 +1220,12 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm, | |||
1227 | [SVM_EXIT_READ_CR0] = emulate_on_interception, | 1220 | [SVM_EXIT_READ_CR0] = emulate_on_interception, |
1228 | [SVM_EXIT_READ_CR3] = emulate_on_interception, | 1221 | [SVM_EXIT_READ_CR3] = emulate_on_interception, |
1229 | [SVM_EXIT_READ_CR4] = emulate_on_interception, | 1222 | [SVM_EXIT_READ_CR4] = emulate_on_interception, |
1223 | [SVM_EXIT_READ_CR8] = emulate_on_interception, | ||
1230 | /* for now: */ | 1224 | /* for now: */ |
1231 | [SVM_EXIT_WRITE_CR0] = emulate_on_interception, | 1225 | [SVM_EXIT_WRITE_CR0] = emulate_on_interception, |
1232 | [SVM_EXIT_WRITE_CR3] = emulate_on_interception, | 1226 | [SVM_EXIT_WRITE_CR3] = emulate_on_interception, |
1233 | [SVM_EXIT_WRITE_CR4] = emulate_on_interception, | 1227 | [SVM_EXIT_WRITE_CR4] = emulate_on_interception, |
1228 | [SVM_EXIT_WRITE_CR8] = cr8_write_interception, | ||
1234 | [SVM_EXIT_READ_DR0] = emulate_on_interception, | 1229 | [SVM_EXIT_READ_DR0] = emulate_on_interception, |
1235 | [SVM_EXIT_READ_DR1] = emulate_on_interception, | 1230 | [SVM_EXIT_READ_DR1] = emulate_on_interception, |
1236 | [SVM_EXIT_READ_DR2] = emulate_on_interception, | 1231 | [SVM_EXIT_READ_DR2] = emulate_on_interception, |
@@ -1241,6 +1236,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm, | |||
1241 | [SVM_EXIT_WRITE_DR3] = emulate_on_interception, | 1236 | [SVM_EXIT_WRITE_DR3] = emulate_on_interception, |
1242 | [SVM_EXIT_WRITE_DR5] = emulate_on_interception, | 1237 | [SVM_EXIT_WRITE_DR5] = emulate_on_interception, |
1243 | [SVM_EXIT_WRITE_DR7] = emulate_on_interception, | 1238 | [SVM_EXIT_WRITE_DR7] = emulate_on_interception, |
1239 | [SVM_EXIT_EXCP_BASE + UD_VECTOR] = ud_interception, | ||
1244 | [SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception, | 1240 | [SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception, |
1245 | [SVM_EXIT_EXCP_BASE + NM_VECTOR] = nm_interception, | 1241 | [SVM_EXIT_EXCP_BASE + NM_VECTOR] = nm_interception, |
1246 | [SVM_EXIT_INTR] = nop_on_interception, | 1242 | [SVM_EXIT_INTR] = nop_on_interception, |
@@ -1293,7 +1289,7 @@ static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) | |||
1293 | exit_code); | 1289 | exit_code); |
1294 | 1290 | ||
1295 | if (exit_code >= ARRAY_SIZE(svm_exit_handlers) | 1291 | if (exit_code >= ARRAY_SIZE(svm_exit_handlers) |
1296 | || svm_exit_handlers[exit_code] == 0) { | 1292 | || !svm_exit_handlers[exit_code]) { |
1297 | kvm_run->exit_reason = KVM_EXIT_UNKNOWN; | 1293 | kvm_run->exit_reason = KVM_EXIT_UNKNOWN; |
1298 | kvm_run->hw.hardware_exit_reason = exit_code; | 1294 | kvm_run->hw.hardware_exit_reason = exit_code; |
1299 | return 0; | 1295 | return 0; |
@@ -1307,7 +1303,7 @@ static void reload_tss(struct kvm_vcpu *vcpu) | |||
1307 | int cpu = raw_smp_processor_id(); | 1303 | int cpu = raw_smp_processor_id(); |
1308 | 1304 | ||
1309 | struct svm_cpu_data *svm_data = per_cpu(svm_data, cpu); | 1305 | struct svm_cpu_data *svm_data = per_cpu(svm_data, cpu); |
1310 | svm_data->tss_desc->type = 9; //available 32/64-bit TSS | 1306 | svm_data->tss_desc->type = 9; /* available 32/64-bit TSS */ |
1311 | load_TR_desc(); | 1307 | load_TR_desc(); |
1312 | } | 1308 | } |
1313 | 1309 | ||
@@ -1348,7 +1344,6 @@ static void svm_intr_assist(struct kvm_vcpu *vcpu) | |||
1348 | struct vmcb *vmcb = svm->vmcb; | 1344 | struct vmcb *vmcb = svm->vmcb; |
1349 | int intr_vector = -1; | 1345 | int intr_vector = -1; |
1350 | 1346 | ||
1351 | kvm_inject_pending_timer_irqs(vcpu); | ||
1352 | if ((vmcb->control.exit_int_info & SVM_EVTINJ_VALID) && | 1347 | if ((vmcb->control.exit_int_info & SVM_EVTINJ_VALID) && |
1353 | ((vmcb->control.exit_int_info & SVM_EVTINJ_TYPE_MASK) == 0)) { | 1348 | ((vmcb->control.exit_int_info & SVM_EVTINJ_TYPE_MASK) == 0)) { |
1354 | intr_vector = vmcb->control.exit_int_info & | 1349 | intr_vector = vmcb->control.exit_int_info & |
@@ -1388,20 +1383,20 @@ static void kvm_reput_irq(struct vcpu_svm *svm) | |||
1388 | push_irq(&svm->vcpu, control->int_vector); | 1383 | push_irq(&svm->vcpu, control->int_vector); |
1389 | } | 1384 | } |
1390 | 1385 | ||
1391 | svm->vcpu.interrupt_window_open = | 1386 | svm->vcpu.arch.interrupt_window_open = |
1392 | !(control->int_state & SVM_INTERRUPT_SHADOW_MASK); | 1387 | !(control->int_state & SVM_INTERRUPT_SHADOW_MASK); |
1393 | } | 1388 | } |
1394 | 1389 | ||
1395 | static void svm_do_inject_vector(struct vcpu_svm *svm) | 1390 | static void svm_do_inject_vector(struct vcpu_svm *svm) |
1396 | { | 1391 | { |
1397 | struct kvm_vcpu *vcpu = &svm->vcpu; | 1392 | struct kvm_vcpu *vcpu = &svm->vcpu; |
1398 | int word_index = __ffs(vcpu->irq_summary); | 1393 | int word_index = __ffs(vcpu->arch.irq_summary); |
1399 | int bit_index = __ffs(vcpu->irq_pending[word_index]); | 1394 | int bit_index = __ffs(vcpu->arch.irq_pending[word_index]); |
1400 | int irq = word_index * BITS_PER_LONG + bit_index; | 1395 | int irq = word_index * BITS_PER_LONG + bit_index; |
1401 | 1396 | ||
1402 | clear_bit(bit_index, &vcpu->irq_pending[word_index]); | 1397 | clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]); |
1403 | if (!vcpu->irq_pending[word_index]) | 1398 | if (!vcpu->arch.irq_pending[word_index]) |
1404 | clear_bit(word_index, &vcpu->irq_summary); | 1399 | clear_bit(word_index, &vcpu->arch.irq_summary); |
1405 | svm_inject_irq(svm, irq); | 1400 | svm_inject_irq(svm, irq); |
1406 | } | 1401 | } |
1407 | 1402 | ||
@@ -1411,11 +1406,11 @@ static void do_interrupt_requests(struct kvm_vcpu *vcpu, | |||
1411 | struct vcpu_svm *svm = to_svm(vcpu); | 1406 | struct vcpu_svm *svm = to_svm(vcpu); |
1412 | struct vmcb_control_area *control = &svm->vmcb->control; | 1407 | struct vmcb_control_area *control = &svm->vmcb->control; |
1413 | 1408 | ||
1414 | svm->vcpu.interrupt_window_open = | 1409 | svm->vcpu.arch.interrupt_window_open = |
1415 | (!(control->int_state & SVM_INTERRUPT_SHADOW_MASK) && | 1410 | (!(control->int_state & SVM_INTERRUPT_SHADOW_MASK) && |
1416 | (svm->vmcb->save.rflags & X86_EFLAGS_IF)); | 1411 | (svm->vmcb->save.rflags & X86_EFLAGS_IF)); |
1417 | 1412 | ||
1418 | if (svm->vcpu.interrupt_window_open && svm->vcpu.irq_summary) | 1413 | if (svm->vcpu.arch.interrupt_window_open && svm->vcpu.arch.irq_summary) |
1419 | /* | 1414 | /* |
1420 | * If interrupts enabled, and not blocked by sti or mov ss. Good. | 1415 | * If interrupts enabled, and not blocked by sti or mov ss. Good. |
1421 | */ | 1416 | */ |
@@ -1424,13 +1419,18 @@ static void do_interrupt_requests(struct kvm_vcpu *vcpu, | |||
1424 | /* | 1419 | /* |
1425 | * Interrupts blocked. Wait for unblock. | 1420 | * Interrupts blocked. Wait for unblock. |
1426 | */ | 1421 | */ |
1427 | if (!svm->vcpu.interrupt_window_open && | 1422 | if (!svm->vcpu.arch.interrupt_window_open && |
1428 | (svm->vcpu.irq_summary || kvm_run->request_interrupt_window)) { | 1423 | (svm->vcpu.arch.irq_summary || kvm_run->request_interrupt_window)) |
1429 | control->intercept |= 1ULL << INTERCEPT_VINTR; | 1424 | control->intercept |= 1ULL << INTERCEPT_VINTR; |
1430 | } else | 1425 | else |
1431 | control->intercept &= ~(1ULL << INTERCEPT_VINTR); | 1426 | control->intercept &= ~(1ULL << INTERCEPT_VINTR); |
1432 | } | 1427 | } |
1433 | 1428 | ||
1429 | static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr) | ||
1430 | { | ||
1431 | return 0; | ||
1432 | } | ||
1433 | |||
1434 | static void save_db_regs(unsigned long *db_regs) | 1434 | static void save_db_regs(unsigned long *db_regs) |
1435 | { | 1435 | { |
1436 | asm volatile ("mov %%dr0, %0" : "=r"(db_regs[0])); | 1436 | asm volatile ("mov %%dr0, %0" : "=r"(db_regs[0])); |
@@ -1472,7 +1472,7 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
1472 | svm->host_cr2 = kvm_read_cr2(); | 1472 | svm->host_cr2 = kvm_read_cr2(); |
1473 | svm->host_dr6 = read_dr6(); | 1473 | svm->host_dr6 = read_dr6(); |
1474 | svm->host_dr7 = read_dr7(); | 1474 | svm->host_dr7 = read_dr7(); |
1475 | svm->vmcb->save.cr2 = vcpu->cr2; | 1475 | svm->vmcb->save.cr2 = vcpu->arch.cr2; |
1476 | 1476 | ||
1477 | if (svm->vmcb->save.dr7 & 0xff) { | 1477 | if (svm->vmcb->save.dr7 & 0xff) { |
1478 | write_dr7(0); | 1478 | write_dr7(0); |
@@ -1486,13 +1486,9 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
1486 | 1486 | ||
1487 | asm volatile ( | 1487 | asm volatile ( |
1488 | #ifdef CONFIG_X86_64 | 1488 | #ifdef CONFIG_X86_64 |
1489 | "push %%rbx; push %%rcx; push %%rdx;" | 1489 | "push %%rbp; \n\t" |
1490 | "push %%rsi; push %%rdi; push %%rbp;" | ||
1491 | "push %%r8; push %%r9; push %%r10; push %%r11;" | ||
1492 | "push %%r12; push %%r13; push %%r14; push %%r15;" | ||
1493 | #else | 1490 | #else |
1494 | "push %%ebx; push %%ecx; push %%edx;" | 1491 | "push %%ebp; \n\t" |
1495 | "push %%esi; push %%edi; push %%ebp;" | ||
1496 | #endif | 1492 | #endif |
1497 | 1493 | ||
1498 | #ifdef CONFIG_X86_64 | 1494 | #ifdef CONFIG_X86_64 |
@@ -1554,10 +1550,7 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
1554 | "mov %%r14, %c[r14](%[svm]) \n\t" | 1550 | "mov %%r14, %c[r14](%[svm]) \n\t" |
1555 | "mov %%r15, %c[r15](%[svm]) \n\t" | 1551 | "mov %%r15, %c[r15](%[svm]) \n\t" |
1556 | 1552 | ||
1557 | "pop %%r15; pop %%r14; pop %%r13; pop %%r12;" | 1553 | "pop %%rbp; \n\t" |
1558 | "pop %%r11; pop %%r10; pop %%r9; pop %%r8;" | ||
1559 | "pop %%rbp; pop %%rdi; pop %%rsi;" | ||
1560 | "pop %%rdx; pop %%rcx; pop %%rbx; \n\t" | ||
1561 | #else | 1554 | #else |
1562 | "mov %%ebx, %c[rbx](%[svm]) \n\t" | 1555 | "mov %%ebx, %c[rbx](%[svm]) \n\t" |
1563 | "mov %%ecx, %c[rcx](%[svm]) \n\t" | 1556 | "mov %%ecx, %c[rcx](%[svm]) \n\t" |
@@ -1566,34 +1559,40 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
1566 | "mov %%edi, %c[rdi](%[svm]) \n\t" | 1559 | "mov %%edi, %c[rdi](%[svm]) \n\t" |
1567 | "mov %%ebp, %c[rbp](%[svm]) \n\t" | 1560 | "mov %%ebp, %c[rbp](%[svm]) \n\t" |
1568 | 1561 | ||
1569 | "pop %%ebp; pop %%edi; pop %%esi;" | 1562 | "pop %%ebp; \n\t" |
1570 | "pop %%edx; pop %%ecx; pop %%ebx; \n\t" | ||
1571 | #endif | 1563 | #endif |
1572 | : | 1564 | : |
1573 | : [svm]"a"(svm), | 1565 | : [svm]"a"(svm), |
1574 | [vmcb]"i"(offsetof(struct vcpu_svm, vmcb_pa)), | 1566 | [vmcb]"i"(offsetof(struct vcpu_svm, vmcb_pa)), |
1575 | [rbx]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_RBX])), | 1567 | [rbx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RBX])), |
1576 | [rcx]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_RCX])), | 1568 | [rcx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RCX])), |
1577 | [rdx]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_RDX])), | 1569 | [rdx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RDX])), |
1578 | [rsi]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_RSI])), | 1570 | [rsi]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RSI])), |
1579 | [rdi]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_RDI])), | 1571 | [rdi]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RDI])), |
1580 | [rbp]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_RBP])) | 1572 | [rbp]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RBP])) |
1581 | #ifdef CONFIG_X86_64 | 1573 | #ifdef CONFIG_X86_64 |
1582 | ,[r8 ]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R8])), | 1574 | , [r8]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R8])), |
1583 | [r9 ]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R9 ])), | 1575 | [r9]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R9])), |
1584 | [r10]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R10])), | 1576 | [r10]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R10])), |
1585 | [r11]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R11])), | 1577 | [r11]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R11])), |
1586 | [r12]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R12])), | 1578 | [r12]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R12])), |
1587 | [r13]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R13])), | 1579 | [r13]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R13])), |
1588 | [r14]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R14])), | 1580 | [r14]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R14])), |
1589 | [r15]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R15])) | 1581 | [r15]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R15])) |
1590 | #endif | 1582 | #endif |
1591 | : "cc", "memory" ); | 1583 | : "cc", "memory" |
1584 | #ifdef CONFIG_X86_64 | ||
1585 | , "rbx", "rcx", "rdx", "rsi", "rdi" | ||
1586 | , "r8", "r9", "r10", "r11" , "r12", "r13", "r14", "r15" | ||
1587 | #else | ||
1588 | , "ebx", "ecx", "edx" , "esi", "edi" | ||
1589 | #endif | ||
1590 | ); | ||
1592 | 1591 | ||
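/*
 * Annotation: the hunk above replaces hand-written push/pop pairs
 * around the VMRUN sequence with an asm clobber list, so the compiler
 * spills only the registers that are actually live across the asm.
 * %rbp is the one register a clobber list may not name when it serves
 * as the frame pointer, hence the remaining manual push/pop. The
 * idiom, reduced to a sketch:
 */
asm volatile("push %%rbp \n\t"
             /* ... load guest state, VMLOAD/VMRUN/VMSAVE, save state ... */
             "pop %%rbp \n\t"
             : /* no outputs */
             : /* inputs elided in this sketch */
             : "cc", "memory",
               "rbx", "rcx", "rdx", "rsi", "rdi",
               "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15");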
1593 | if ((svm->vmcb->save.dr7 & 0xff)) | 1592 | if ((svm->vmcb->save.dr7 & 0xff)) |
1594 | load_db_regs(svm->host_db_regs); | 1593 | load_db_regs(svm->host_db_regs); |
1595 | 1594 | ||
1596 | vcpu->cr2 = svm->vmcb->save.cr2; | 1595 | vcpu->arch.cr2 = svm->vmcb->save.cr2; |
1597 | 1596 | ||
1598 | write_dr6(svm->host_dr6); | 1597 | write_dr6(svm->host_dr6); |
1599 | write_dr7(svm->host_dr7); | 1598 | write_dr7(svm->host_dr7); |
@@ -1627,34 +1626,6 @@ static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root) | |||
1627 | } | 1626 | } |
1628 | } | 1627 | } |
1629 | 1628 | ||
1630 | static void svm_inject_page_fault(struct kvm_vcpu *vcpu, | ||
1631 | unsigned long addr, | ||
1632 | uint32_t err_code) | ||
1633 | { | ||
1634 | struct vcpu_svm *svm = to_svm(vcpu); | ||
1635 | uint32_t exit_int_info = svm->vmcb->control.exit_int_info; | ||
1636 | |||
1637 | ++vcpu->stat.pf_guest; | ||
1638 | |||
1639 | if (is_page_fault(exit_int_info)) { | ||
1640 | |||
1641 | svm->vmcb->control.event_inj_err = 0; | ||
1642 | svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | | ||
1643 | SVM_EVTINJ_VALID_ERR | | ||
1644 | SVM_EVTINJ_TYPE_EXEPT | | ||
1645 | DF_VECTOR; | ||
1646 | return; | ||
1647 | } | ||
1648 | vcpu->cr2 = addr; | ||
1649 | svm->vmcb->save.cr2 = addr; | ||
1650 | svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | | ||
1651 | SVM_EVTINJ_VALID_ERR | | ||
1652 | SVM_EVTINJ_TYPE_EXEPT | | ||
1653 | PF_VECTOR; | ||
1654 | svm->vmcb->control.event_inj_err = err_code; | ||
1655 | } | ||
1656 | |||
1657 | |||
1658 | static int is_disabled(void) | 1629 | static int is_disabled(void) |
1659 | { | 1630 | { |
1660 | u64 vm_cr; | 1631 | u64 vm_cr; |
@@ -1675,7 +1646,6 @@ svm_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall) | |||
1675 | hypercall[0] = 0x0f; | 1646 | hypercall[0] = 0x0f; |
1676 | hypercall[1] = 0x01; | 1647 | hypercall[1] = 0x01; |
1677 | hypercall[2] = 0xd9; | 1648 | hypercall[2] = 0xd9; |
1678 | hypercall[3] = 0xc3; | ||
1679 | } | 1649 | } |
1680 | 1650 | ||
1681 | static void svm_check_processor_compat(void *rtn) | 1651 | static void svm_check_processor_compat(void *rtn) |
@@ -1683,6 +1653,11 @@ static void svm_check_processor_compat(void *rtn) | |||
1683 | *(int *)rtn = 0; | 1653 | *(int *)rtn = 0; |
1684 | } | 1654 | } |
1685 | 1655 | ||
1656 | static bool svm_cpu_has_accelerated_tpr(void) | ||
1657 | { | ||
1658 | return false; | ||
1659 | } | ||
1660 | |||
1686 | static struct kvm_x86_ops svm_x86_ops = { | 1661 | static struct kvm_x86_ops svm_x86_ops = { |
1687 | .cpu_has_kvm_support = has_svm, | 1662 | .cpu_has_kvm_support = has_svm, |
1688 | .disabled_by_bios = is_disabled, | 1663 | .disabled_by_bios = is_disabled, |
@@ -1691,6 +1666,7 @@ static struct kvm_x86_ops svm_x86_ops = { | |||
1691 | .check_processor_compatibility = svm_check_processor_compat, | 1666 | .check_processor_compatibility = svm_check_processor_compat, |
1692 | .hardware_enable = svm_hardware_enable, | 1667 | .hardware_enable = svm_hardware_enable, |
1693 | .hardware_disable = svm_hardware_disable, | 1668 | .hardware_disable = svm_hardware_disable, |
1669 | .cpu_has_accelerated_tpr = svm_cpu_has_accelerated_tpr, | ||
1694 | 1670 | ||
1695 | .vcpu_create = svm_create_vcpu, | 1671 | .vcpu_create = svm_create_vcpu, |
1696 | .vcpu_free = svm_free_vcpu, | 1672 | .vcpu_free = svm_free_vcpu, |
@@ -1725,9 +1701,6 @@ static struct kvm_x86_ops svm_x86_ops = { | |||
1725 | .set_rflags = svm_set_rflags, | 1701 | .set_rflags = svm_set_rflags, |
1726 | 1702 | ||
1727 | .tlb_flush = svm_flush_tlb, | 1703 | .tlb_flush = svm_flush_tlb, |
1728 | .inject_page_fault = svm_inject_page_fault, | ||
1729 | |||
1730 | .inject_gp = svm_inject_gp, | ||
1731 | 1704 | ||
1732 | .run = svm_vcpu_run, | 1705 | .run = svm_vcpu_run, |
1733 | .handle_exit = handle_exit, | 1706 | .handle_exit = handle_exit, |
@@ -1735,19 +1708,23 @@ static struct kvm_x86_ops svm_x86_ops = { | |||
1735 | .patch_hypercall = svm_patch_hypercall, | 1708 | .patch_hypercall = svm_patch_hypercall, |
1736 | .get_irq = svm_get_irq, | 1709 | .get_irq = svm_get_irq, |
1737 | .set_irq = svm_set_irq, | 1710 | .set_irq = svm_set_irq, |
1711 | .queue_exception = svm_queue_exception, | ||
1712 | .exception_injected = svm_exception_injected, | ||
1738 | .inject_pending_irq = svm_intr_assist, | 1713 | .inject_pending_irq = svm_intr_assist, |
1739 | .inject_pending_vectors = do_interrupt_requests, | 1714 | .inject_pending_vectors = do_interrupt_requests, |
1715 | |||
1716 | .set_tss_addr = svm_set_tss_addr, | ||
1740 | }; | 1717 | }; |
1741 | 1718 | ||
1742 | static int __init svm_init(void) | 1719 | static int __init svm_init(void) |
1743 | { | 1720 | { |
1744 | return kvm_init_x86(&svm_x86_ops, sizeof(struct vcpu_svm), | 1721 | return kvm_init(&svm_x86_ops, sizeof(struct vcpu_svm), |
1745 | THIS_MODULE); | 1722 | THIS_MODULE); |
1746 | } | 1723 | } |
1747 | 1724 | ||
1748 | static void __exit svm_exit(void) | 1725 | static void __exit svm_exit(void) |
1749 | { | 1726 | { |
1750 | kvm_exit_x86(); | 1727 | kvm_exit(); |
1751 | } | 1728 | } |
1752 | 1729 | ||
1753 | module_init(svm_init) | 1730 | module_init(svm_init) |
diff --git a/drivers/kvm/svm.h b/arch/x86/kvm/svm.h index 3b1b0f35b6cb..5fd50491b555 100644 --- a/drivers/kvm/svm.h +++ b/arch/x86/kvm/svm.h | |||
@@ -204,6 +204,7 @@ struct __attribute__ ((__packed__)) vmcb { | |||
204 | #define INTERCEPT_CR0_MASK 1 | 204 | #define INTERCEPT_CR0_MASK 1 |
205 | #define INTERCEPT_CR3_MASK (1 << 3) | 205 | #define INTERCEPT_CR3_MASK (1 << 3) |
206 | #define INTERCEPT_CR4_MASK (1 << 4) | 206 | #define INTERCEPT_CR4_MASK (1 << 4) |
207 | #define INTERCEPT_CR8_MASK (1 << 8) | ||
207 | 208 | ||
208 | #define INTERCEPT_DR0_MASK 1 | 209 | #define INTERCEPT_DR0_MASK 1 |
209 | #define INTERCEPT_DR1_MASK (1 << 1) | 210 | #define INTERCEPT_DR1_MASK (1 << 1) |
@@ -311,7 +312,7 @@ struct __attribute__ ((__packed__)) vmcb { | |||
311 | 312 | ||
312 | #define SVM_EXIT_ERR -1 | 313 | #define SVM_EXIT_ERR -1 |
313 | 314 | ||
314 | #define SVM_CR0_SELECTIVE_MASK (1 << 3 | 1) // TS and MP | 315 | #define SVM_CR0_SELECTIVE_MASK (1 << 3 | 1) /* TS and MP */ |
315 | 316 | ||
316 | #define SVM_VMLOAD ".byte 0x0f, 0x01, 0xda" | 317 | #define SVM_VMLOAD ".byte 0x0f, 0x01, 0xda" |
317 | #define SVM_VMRUN ".byte 0x0f, 0x01, 0xd8" | 318 | #define SVM_VMRUN ".byte 0x0f, 0x01, 0xd8" |
diff --git a/drivers/kvm/vmx.c b/arch/x86/kvm/vmx.c index 5b397b6c9f93..ad36447e696e 100644 --- a/drivers/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c | |||
@@ -15,17 +15,18 @@ | |||
15 | * | 15 | * |
16 | */ | 16 | */ |
17 | 17 | ||
18 | #include "kvm.h" | ||
19 | #include "x86_emulate.h" | ||
20 | #include "irq.h" | 18 | #include "irq.h" |
21 | #include "vmx.h" | 19 | #include "vmx.h" |
22 | #include "segment_descriptor.h" | 20 | #include "segment_descriptor.h" |
21 | #include "mmu.h" | ||
23 | 22 | ||
23 | #include <linux/kvm_host.h> | ||
24 | #include <linux/module.h> | 24 | #include <linux/module.h> |
25 | #include <linux/kernel.h> | 25 | #include <linux/kernel.h> |
26 | #include <linux/mm.h> | 26 | #include <linux/mm.h> |
27 | #include <linux/highmem.h> | 27 | #include <linux/highmem.h> |
28 | #include <linux/sched.h> | 28 | #include <linux/sched.h> |
29 | #include <linux/moduleparam.h> | ||
29 | 30 | ||
30 | #include <asm/io.h> | 31 | #include <asm/io.h> |
31 | #include <asm/desc.h> | 32 | #include <asm/desc.h> |
@@ -33,6 +34,9 @@ | |||
33 | MODULE_AUTHOR("Qumranet"); | 34 | MODULE_AUTHOR("Qumranet"); |
34 | MODULE_LICENSE("GPL"); | 35 | MODULE_LICENSE("GPL"); |
35 | 36 | ||
37 | static int bypass_guest_pf = 1; | ||
38 | module_param(bypass_guest_pf, bool, 0); | ||
39 | |||
36 | struct vmcs { | 40 | struct vmcs { |
37 | u32 revision_id; | 41 | u32 revision_id; |
38 | u32 abort; | 42 | u32 abort; |
@@ -43,6 +47,7 @@ struct vcpu_vmx { | |||
43 | struct kvm_vcpu vcpu; | 47 | struct kvm_vcpu vcpu; |
44 | int launched; | 48 | int launched; |
45 | u8 fail; | 49 | u8 fail; |
50 | u32 idt_vectoring_info; | ||
46 | struct kvm_msr_entry *guest_msrs; | 51 | struct kvm_msr_entry *guest_msrs; |
47 | struct kvm_msr_entry *host_msrs; | 52 | struct kvm_msr_entry *host_msrs; |
48 | int nmsrs; | 53 | int nmsrs; |
@@ -57,8 +62,15 @@ struct vcpu_vmx { | |||
57 | u16 fs_sel, gs_sel, ldt_sel; | 62 | u16 fs_sel, gs_sel, ldt_sel; |
58 | int gs_ldt_reload_needed; | 63 | int gs_ldt_reload_needed; |
59 | int fs_reload_needed; | 64 | int fs_reload_needed; |
60 | }host_state; | 65 | int guest_efer_loaded; |
61 | 66 | } host_state; | |
67 | struct { | ||
68 | struct { | ||
69 | bool pending; | ||
70 | u8 vector; | ||
71 | unsigned rip; | ||
72 | } irq; | ||
73 | } rmode; | ||
62 | }; | 74 | }; |
63 | 75 | ||
64 | static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) | 76 | static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) |
@@ -74,14 +86,13 @@ static DEFINE_PER_CPU(struct vmcs *, current_vmcs); | |||
74 | static struct page *vmx_io_bitmap_a; | 86 | static struct page *vmx_io_bitmap_a; |
75 | static struct page *vmx_io_bitmap_b; | 87 | static struct page *vmx_io_bitmap_b; |
76 | 88 | ||
77 | #define EFER_SAVE_RESTORE_BITS ((u64)EFER_SCE) | ||
78 | |||
79 | static struct vmcs_config { | 89 | static struct vmcs_config { |
80 | int size; | 90 | int size; |
81 | int order; | 91 | int order; |
82 | u32 revision_id; | 92 | u32 revision_id; |
83 | u32 pin_based_exec_ctrl; | 93 | u32 pin_based_exec_ctrl; |
84 | u32 cpu_based_exec_ctrl; | 94 | u32 cpu_based_exec_ctrl; |
95 | u32 cpu_based_2nd_exec_ctrl; | ||
85 | u32 vmexit_ctrl; | 96 | u32 vmexit_ctrl; |
86 | u32 vmentry_ctrl; | 97 | u32 vmentry_ctrl; |
87 | } vmcs_config; | 98 | } vmcs_config; |
@@ -138,18 +149,6 @@ static void save_msrs(struct kvm_msr_entry *e, int n) | |||
138 | rdmsrl(e[i].index, e[i].data); | 149 | rdmsrl(e[i].index, e[i].data); |
139 | } | 150 | } |
140 | 151 | ||
141 | static inline u64 msr_efer_save_restore_bits(struct kvm_msr_entry msr) | ||
142 | { | ||
143 | return (u64)msr.data & EFER_SAVE_RESTORE_BITS; | ||
144 | } | ||
145 | |||
146 | static inline int msr_efer_need_save_restore(struct vcpu_vmx *vmx) | ||
147 | { | ||
148 | int efer_offset = vmx->msr_offset_efer; | ||
149 | return msr_efer_save_restore_bits(vmx->host_msrs[efer_offset]) != | ||
150 | msr_efer_save_restore_bits(vmx->guest_msrs[efer_offset]); | ||
151 | } | ||
152 | |||
153 | static inline int is_page_fault(u32 intr_info) | 152 | static inline int is_page_fault(u32 intr_info) |
154 | { | 153 | { |
155 | return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | | 154 | return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | |
@@ -164,6 +163,13 @@ static inline int is_no_device(u32 intr_info) | |||
164 | (INTR_TYPE_EXCEPTION | NM_VECTOR | INTR_INFO_VALID_MASK); | 163 | (INTR_TYPE_EXCEPTION | NM_VECTOR | INTR_INFO_VALID_MASK); |
165 | } | 164 | } |
166 | 165 | ||
166 | static inline int is_invalid_opcode(u32 intr_info) | ||
167 | { | ||
168 | return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | | ||
169 | INTR_INFO_VALID_MASK)) == | ||
170 | (INTR_TYPE_EXCEPTION | UD_VECTOR | INTR_INFO_VALID_MASK); | ||
171 | } | ||
172 | |||
167 | static inline int is_external_interrupt(u32 intr_info) | 173 | static inline int is_external_interrupt(u32 intr_info) |
168 | { | 174 | { |
169 | return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK)) | 175 | return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK)) |
@@ -180,6 +186,24 @@ static inline int vm_need_tpr_shadow(struct kvm *kvm) | |||
180 | return ((cpu_has_vmx_tpr_shadow()) && (irqchip_in_kernel(kvm))); | 186 | return ((cpu_has_vmx_tpr_shadow()) && (irqchip_in_kernel(kvm))); |
181 | } | 187 | } |
182 | 188 | ||
189 | static inline int cpu_has_secondary_exec_ctrls(void) | ||
190 | { | ||
191 | return (vmcs_config.cpu_based_exec_ctrl & | ||
192 | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS); | ||
193 | } | ||
194 | |||
195 | static inline bool cpu_has_vmx_virtualize_apic_accesses(void) | ||
196 | { | ||
197 | return (vmcs_config.cpu_based_2nd_exec_ctrl & | ||
198 | SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES); | ||
199 | } | ||
200 | |||
201 | static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm) | ||
202 | { | ||
203 | return ((cpu_has_vmx_virtualize_apic_accesses()) && | ||
204 | (irqchip_in_kernel(kvm))); | ||
205 | } | ||
206 | |||
183 | static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr) | 207 | static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr) |
184 | { | 208 | { |
185 | int i; | 209 | int i; |
@@ -222,16 +246,14 @@ static void __vcpu_clear(void *arg) | |||
222 | vmcs_clear(vmx->vmcs); | 246 | vmcs_clear(vmx->vmcs); |
223 | if (per_cpu(current_vmcs, cpu) == vmx->vmcs) | 247 | if (per_cpu(current_vmcs, cpu) == vmx->vmcs) |
224 | per_cpu(current_vmcs, cpu) = NULL; | 248 | per_cpu(current_vmcs, cpu) = NULL; |
225 | rdtscll(vmx->vcpu.host_tsc); | 249 | rdtscll(vmx->vcpu.arch.host_tsc); |
226 | } | 250 | } |
227 | 251 | ||
228 | static void vcpu_clear(struct vcpu_vmx *vmx) | 252 | static void vcpu_clear(struct vcpu_vmx *vmx) |
229 | { | 253 | { |
230 | if (vmx->vcpu.cpu != raw_smp_processor_id() && vmx->vcpu.cpu != -1) | 254 | if (vmx->vcpu.cpu == -1) |
231 | smp_call_function_single(vmx->vcpu.cpu, __vcpu_clear, | 255 | return; |
232 | vmx, 0, 1); | 256 | smp_call_function_single(vmx->vcpu.cpu, __vcpu_clear, vmx, 0, 1); |
233 | else | ||
234 | __vcpu_clear(vmx); | ||
235 | vmx->launched = 0; | 257 | vmx->launched = 0; |
236 | } | 258 | } |
237 | 259 | ||
@@ -275,7 +297,7 @@ static void vmcs_writel(unsigned long field, unsigned long value) | |||
275 | u8 error; | 297 | u8 error; |
276 | 298 | ||
277 | asm volatile (ASM_VMX_VMWRITE_RAX_RDX "; setna %0" | 299 | asm volatile (ASM_VMX_VMWRITE_RAX_RDX "; setna %0" |
278 | : "=q"(error) : "a"(value), "d"(field) : "cc" ); | 300 | : "=q"(error) : "a"(value), "d"(field) : "cc"); |
279 | if (unlikely(error)) | 301 | if (unlikely(error)) |
280 | vmwrite_error(field, value); | 302 | vmwrite_error(field, value); |
281 | } | 303 | } |
@@ -315,12 +337,12 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu) | |||
315 | { | 337 | { |
316 | u32 eb; | 338 | u32 eb; |
317 | 339 | ||
318 | eb = 1u << PF_VECTOR; | 340 | eb = (1u << PF_VECTOR) | (1u << UD_VECTOR); |
319 | if (!vcpu->fpu_active) | 341 | if (!vcpu->fpu_active) |
320 | eb |= 1u << NM_VECTOR; | 342 | eb |= 1u << NM_VECTOR; |
321 | if (vcpu->guest_debug.enabled) | 343 | if (vcpu->guest_debug.enabled) |
322 | eb |= 1u << 1; | 344 | eb |= 1u << 1; |
323 | if (vcpu->rmode.active) | 345 | if (vcpu->arch.rmode.active) |
324 | eb = ~0; | 346 | eb = ~0; |
325 | vmcs_write32(EXCEPTION_BITMAP, eb); | 347 | vmcs_write32(EXCEPTION_BITMAP, eb); |
326 | } | 348 | } |
@@ -344,16 +366,42 @@ static void reload_tss(void) | |||
344 | 366 | ||
345 | static void load_transition_efer(struct vcpu_vmx *vmx) | 367 | static void load_transition_efer(struct vcpu_vmx *vmx) |
346 | { | 368 | { |
347 | u64 trans_efer; | ||
348 | int efer_offset = vmx->msr_offset_efer; | 369 | int efer_offset = vmx->msr_offset_efer; |
370 | u64 host_efer = vmx->host_msrs[efer_offset].data; | ||
371 | u64 guest_efer = vmx->guest_msrs[efer_offset].data; | ||
372 | u64 ignore_bits; | ||
349 | 373 | ||
350 | trans_efer = vmx->host_msrs[efer_offset].data; | 374 | if (efer_offset < 0) |
351 | trans_efer &= ~EFER_SAVE_RESTORE_BITS; | 375 | return; |
352 | trans_efer |= msr_efer_save_restore_bits(vmx->guest_msrs[efer_offset]); | 376 | /* |
353 | wrmsrl(MSR_EFER, trans_efer); | 377 | * NX is emulated; LMA and LME handled by hardware; SCE is meaningless
378 | * outside long mode | ||
379 | */ | ||
380 | ignore_bits = EFER_NX | EFER_SCE; | ||
381 | #ifdef CONFIG_X86_64 | ||
382 | ignore_bits |= EFER_LMA | EFER_LME; | ||
383 | /* SCE is meaningful only in long mode on Intel */ | ||
384 | if (guest_efer & EFER_LMA) | ||
385 | ignore_bits &= ~(u64)EFER_SCE; | ||
386 | #endif | ||
387 | if ((guest_efer & ~ignore_bits) == (host_efer & ~ignore_bits)) | ||
388 | return; | ||
389 | |||
390 | vmx->host_state.guest_efer_loaded = 1; | ||
391 | guest_efer &= ~ignore_bits; | ||
392 | guest_efer |= host_efer & ignore_bits; | ||
393 | wrmsrl(MSR_EFER, guest_efer); | ||
354 | vmx->vcpu.stat.efer_reload++; | 394 | vmx->vcpu.stat.efer_reload++; |
355 | } | 395 | } |
356 | 396 | ||
397 | static void reload_host_efer(struct vcpu_vmx *vmx) | ||
398 | { | ||
399 | if (vmx->host_state.guest_efer_loaded) { | ||
400 | vmx->host_state.guest_efer_loaded = 0; | ||
401 | load_msrs(vmx->host_msrs + vmx->msr_offset_efer, 1); | ||
402 | } | ||
403 | } | ||
404 | |||
357 | static void vmx_save_host_state(struct kvm_vcpu *vcpu) | 405 | static void vmx_save_host_state(struct kvm_vcpu *vcpu) |
358 | { | 406 | { |
359 | struct vcpu_vmx *vmx = to_vmx(vcpu); | 407 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
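
The rewritten load_transition_efer() above skips the wrmsrl entirely when guest and host EFER differ only in bits that hardware switches itself (LMA/LME) or that KVM emulates (NX), with SCE mattering only once the guest is in long mode. A standalone sketch of that test (plain user-space C; the EFER bit positions are the architectural ones, not quoted from kernel headers):

#include <stdint.h>
#include <stdio.h>

/* Architectural EFER bits (same positions the kernel headers use). */
#define EFER_SCE (1ULL << 0)   /* syscall/sysret enable */
#define EFER_LME (1ULL << 8)   /* long mode enable */
#define EFER_LMA (1ULL << 10)  /* long mode active */
#define EFER_NX  (1ULL << 11)  /* no-execute enable */

/* Returns nonzero iff the guest EFER really needs an MSR write,
 * mirroring the ignore_bits computation in load_transition_efer(). */
static int efer_switch_needed(uint64_t host_efer, uint64_t guest_efer)
{
	uint64_t ignore_bits = EFER_NX | EFER_SCE; /* NX emulated, SCE moot */

	ignore_bits |= EFER_LMA | EFER_LME;        /* handled by hardware */
	if (guest_efer & EFER_LMA)                 /* SCE matters in long mode */
		ignore_bits &= ~EFER_SCE;
	return (guest_efer & ~ignore_bits) != (host_efer & ~ignore_bits);
}

int main(void)
{
	/* 32-bit guest on a 64-bit host: no write needed. */
	printf("%d\n", efer_switch_needed(EFER_SCE | EFER_LME | EFER_LMA | EFER_NX,
					  EFER_NX));
	/* 64-bit guest with SCE enabled: the guest value must be loaded. */
	printf("%d\n", efer_switch_needed(EFER_LME | EFER_LMA | EFER_NX,
					  EFER_SCE | EFER_LME | EFER_LMA | EFER_NX));
	return 0;
}
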
@@ -393,14 +441,13 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu) | |||
393 | #endif | 441 | #endif |
394 | 442 | ||
395 | #ifdef CONFIG_X86_64 | 443 | #ifdef CONFIG_X86_64 |
396 | if (is_long_mode(&vmx->vcpu)) { | 444 | if (is_long_mode(&vmx->vcpu)) |
397 | save_msrs(vmx->host_msrs + | 445 | save_msrs(vmx->host_msrs + |
398 | vmx->msr_offset_kernel_gs_base, 1); | 446 | vmx->msr_offset_kernel_gs_base, 1); |
399 | } | 447 | |
400 | #endif | 448 | #endif |
401 | load_msrs(vmx->guest_msrs, vmx->save_nmsrs); | 449 | load_msrs(vmx->guest_msrs, vmx->save_nmsrs); |
402 | if (msr_efer_need_save_restore(vmx)) | 450 | load_transition_efer(vmx); |
403 | load_transition_efer(vmx); | ||
404 | } | 451 | } |
405 | 452 | ||
406 | static void vmx_load_host_state(struct vcpu_vmx *vmx) | 453 | static void vmx_load_host_state(struct vcpu_vmx *vmx) |
@@ -410,6 +457,7 @@ static void vmx_load_host_state(struct vcpu_vmx *vmx) | |||
410 | if (!vmx->host_state.loaded) | 457 | if (!vmx->host_state.loaded) |
411 | return; | 458 | return; |
412 | 459 | ||
460 | ++vmx->vcpu.stat.host_state_reload; | ||
413 | vmx->host_state.loaded = 0; | 461 | vmx->host_state.loaded = 0; |
414 | if (vmx->host_state.fs_reload_needed) | 462 | if (vmx->host_state.fs_reload_needed) |
415 | load_fs(vmx->host_state.fs_sel); | 463 | load_fs(vmx->host_state.fs_sel); |
@@ -429,8 +477,7 @@ static void vmx_load_host_state(struct vcpu_vmx *vmx) | |||
429 | reload_tss(); | 477 | reload_tss(); |
430 | save_msrs(vmx->guest_msrs, vmx->save_nmsrs); | 478 | save_msrs(vmx->guest_msrs, vmx->save_nmsrs); |
431 | load_msrs(vmx->host_msrs, vmx->save_nmsrs); | 479 | load_msrs(vmx->host_msrs, vmx->save_nmsrs); |
432 | if (msr_efer_need_save_restore(vmx)) | 480 | reload_host_efer(vmx); |
433 | load_msrs(vmx->host_msrs + vmx->msr_offset_efer, 1); | ||
434 | } | 481 | } |
435 | 482 | ||
436 | /* | 483 | /* |
@@ -480,7 +527,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) | |||
480 | * Make sure the time stamp counter is monotonous. | 527 | * Make sure the time stamp counter is monotonous. |
481 | */ | 528 | */ |
482 | rdtscll(tsc_this); | 529 | rdtscll(tsc_this); |
483 | delta = vcpu->host_tsc - tsc_this; | 530 | delta = vcpu->arch.host_tsc - tsc_this; |
484 | vmcs_write64(TSC_OFFSET, vmcs_read64(TSC_OFFSET) + delta); | 531 | vmcs_write64(TSC_OFFSET, vmcs_read64(TSC_OFFSET) + delta); |
485 | } | 532 | } |
486 | } | 533 | } |
@@ -488,7 +535,6 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) | |||
488 | static void vmx_vcpu_put(struct kvm_vcpu *vcpu) | 535 | static void vmx_vcpu_put(struct kvm_vcpu *vcpu) |
489 | { | 536 | { |
490 | vmx_load_host_state(to_vmx(vcpu)); | 537 | vmx_load_host_state(to_vmx(vcpu)); |
491 | kvm_put_guest_fpu(vcpu); | ||
492 | } | 538 | } |
493 | 539 | ||
494 | static void vmx_fpu_activate(struct kvm_vcpu *vcpu) | 540 | static void vmx_fpu_activate(struct kvm_vcpu *vcpu) |
@@ -497,7 +543,7 @@ static void vmx_fpu_activate(struct kvm_vcpu *vcpu) | |||
497 | return; | 543 | return; |
498 | vcpu->fpu_active = 1; | 544 | vcpu->fpu_active = 1; |
499 | vmcs_clear_bits(GUEST_CR0, X86_CR0_TS); | 545 | vmcs_clear_bits(GUEST_CR0, X86_CR0_TS); |
500 | if (vcpu->cr0 & X86_CR0_TS) | 546 | if (vcpu->arch.cr0 & X86_CR0_TS) |
501 | vmcs_set_bits(GUEST_CR0, X86_CR0_TS); | 547 | vmcs_set_bits(GUEST_CR0, X86_CR0_TS); |
502 | update_exception_bitmap(vcpu); | 548 | update_exception_bitmap(vcpu); |
503 | } | 549 | } |
@@ -523,7 +569,7 @@ static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) | |||
523 | 569 | ||
524 | static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) | 570 | static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) |
525 | { | 571 | { |
526 | if (vcpu->rmode.active) | 572 | if (vcpu->arch.rmode.active) |
527 | rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; | 573 | rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; |
528 | vmcs_writel(GUEST_RFLAGS, rflags); | 574 | vmcs_writel(GUEST_RFLAGS, rflags); |
529 | } | 575 | } |
@@ -545,19 +591,25 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu) | |||
545 | if (interruptibility & 3) | 591 | if (interruptibility & 3) |
546 | vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, | 592 | vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, |
547 | interruptibility & ~3); | 593 | interruptibility & ~3); |
548 | vcpu->interrupt_window_open = 1; | 594 | vcpu->arch.interrupt_window_open = 1; |
549 | } | 595 | } |
550 | 596 | ||
551 | static void vmx_inject_gp(struct kvm_vcpu *vcpu, unsigned error_code) | 597 | static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, |
598 | bool has_error_code, u32 error_code) | ||
552 | { | 599 | { |
553 | printk(KERN_DEBUG "inject_general_protection: rip 0x%lx\n", | ||
554 | vmcs_readl(GUEST_RIP)); | ||
555 | vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code); | ||
556 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, | 600 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, |
557 | GP_VECTOR | | 601 | nr | INTR_TYPE_EXCEPTION |
558 | INTR_TYPE_EXCEPTION | | 602 | | (has_error_code ? INTR_INFO_DELIEVER_CODE_MASK : 0) |
559 | INTR_INFO_DELIEVER_CODE_MASK | | 603 | | INTR_INFO_VALID_MASK); |
560 | INTR_INFO_VALID_MASK); | 604 | if (has_error_code) |
605 | vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code); | ||
606 | } | ||
607 | |||
608 | static bool vmx_exception_injected(struct kvm_vcpu *vcpu) | ||
609 | { | ||
610 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
611 | |||
612 | return !(vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK); | ||
561 | } | 613 | } |
562 | 614 | ||
563 | /* | 615 | /* |
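
vmx_queue_exception() above collapses the old inject_gp/inject_page_fault hooks into one write of the VM-entry interruption-information field. A minimal sketch of how that 32-bit word is assembled (user-space C; field layout per the Intel SDM — note the kernel spells the error-code bit INTR_INFO_DELIEVER_CODE_MASK):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* VM-entry interruption-information layout: bits 7:0 vector,
 * bits 10:8 event type, bit 11 deliver error code, bit 31 valid. */
#define INTR_INFO_VECTOR_MASK        0xffu
#define INTR_TYPE_EXCEPTION          (3u << 8)   /* hardware exception */
#define INTR_INFO_DELIVER_CODE_MASK  (1u << 11)  /* push error code */
#define INTR_INFO_VALID_MASK         (1u << 31)

static uint32_t queue_exception_word(unsigned nr, int has_error_code)
{
	return nr | INTR_TYPE_EXCEPTION
		  | (has_error_code ? INTR_INFO_DELIVER_CODE_MASK : 0)
		  | INTR_INFO_VALID_MASK;
}

int main(void)
{
	uint32_t gp = queue_exception_word(13, 1); /* #GP carries an error code */
	uint32_t ud = queue_exception_word(6, 0);  /* #UD does not */

	assert((gp & INTR_INFO_VECTOR_MASK) == 13);
	assert(gp & INTR_INFO_DELIVER_CODE_MASK);
	assert(!(ud & INTR_INFO_DELIVER_CODE_MASK));
	printf("#GP word: 0x%08x, #UD word: 0x%08x\n", gp, ud);
	return 0;
}
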
@@ -608,7 +660,7 @@ static void setup_msrs(struct vcpu_vmx *vmx) | |||
608 | * if efer.sce is enabled. | 660 | * if efer.sce is enabled. |
609 | */ | 661 | */ |
610 | index = __find_msr_index(vmx, MSR_K6_STAR); | 662 | index = __find_msr_index(vmx, MSR_K6_STAR); |
611 | if ((index >= 0) && (vmx->vcpu.shadow_efer & EFER_SCE)) | 663 | if ((index >= 0) && (vmx->vcpu.arch.shadow_efer & EFER_SCE)) |
612 | move_msr_up(vmx, index, save_nmsrs++); | 664 | move_msr_up(vmx, index, save_nmsrs++); |
613 | } | 665 | } |
614 | #endif | 666 | #endif |
@@ -712,8 +764,10 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) | |||
712 | #ifdef CONFIG_X86_64 | 764 | #ifdef CONFIG_X86_64 |
713 | case MSR_EFER: | 765 | case MSR_EFER: |
714 | ret = kvm_set_msr_common(vcpu, msr_index, data); | 766 | ret = kvm_set_msr_common(vcpu, msr_index, data); |
715 | if (vmx->host_state.loaded) | 767 | if (vmx->host_state.loaded) { |
768 | reload_host_efer(vmx); | ||
716 | load_transition_efer(vmx); | 769 | load_transition_efer(vmx); |
770 | } | ||
717 | break; | 771 | break; |
718 | case MSR_FS_BASE: | 772 | case MSR_FS_BASE: |
719 | vmcs_writel(GUEST_FS_BASE, data); | 773 | vmcs_writel(GUEST_FS_BASE, data); |
@@ -750,12 +804,12 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) | |||
750 | 804 | ||
751 | /* | 805 | /* |
752 | * Sync the rsp and rip registers into the vcpu structure. This allows | 806 | * Sync the rsp and rip registers into the vcpu structure. This allows |
753 | * registers to be accessed by indexing vcpu->regs. | 807 | * registers to be accessed by indexing vcpu->arch.regs. |
754 | */ | 808 | */ |
755 | static void vcpu_load_rsp_rip(struct kvm_vcpu *vcpu) | 809 | static void vcpu_load_rsp_rip(struct kvm_vcpu *vcpu) |
756 | { | 810 | { |
757 | vcpu->regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP); | 811 | vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP); |
758 | vcpu->rip = vmcs_readl(GUEST_RIP); | 812 | vcpu->arch.rip = vmcs_readl(GUEST_RIP); |
759 | } | 813 | } |
760 | 814 | ||
761 | /* | 815 | /* |
@@ -764,8 +818,8 @@ static void vcpu_load_rsp_rip(struct kvm_vcpu *vcpu) | |||
764 | */ | 818 | */ |
765 | static void vcpu_put_rsp_rip(struct kvm_vcpu *vcpu) | 819 | static void vcpu_put_rsp_rip(struct kvm_vcpu *vcpu) |
766 | { | 820 | { |
767 | vmcs_writel(GUEST_RSP, vcpu->regs[VCPU_REGS_RSP]); | 821 | vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]); |
768 | vmcs_writel(GUEST_RIP, vcpu->rip); | 822 | vmcs_writel(GUEST_RIP, vcpu->arch.rip); |
769 | } | 823 | } |
770 | 824 | ||
771 | static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg) | 825 | static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg) |
@@ -808,14 +862,15 @@ static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg) | |||
808 | 862 | ||
809 | static int vmx_get_irq(struct kvm_vcpu *vcpu) | 863 | static int vmx_get_irq(struct kvm_vcpu *vcpu) |
810 | { | 864 | { |
865 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
811 | u32 idtv_info_field; | 866 | u32 idtv_info_field; |
812 | 867 | ||
813 | idtv_info_field = vmcs_read32(IDT_VECTORING_INFO_FIELD); | 868 | idtv_info_field = vmx->idt_vectoring_info; |
814 | if (idtv_info_field & INTR_INFO_VALID_MASK) { | 869 | if (idtv_info_field & INTR_INFO_VALID_MASK) { |
815 | if (is_external_interrupt(idtv_info_field)) | 870 | if (is_external_interrupt(idtv_info_field)) |
816 | return idtv_info_field & VECTORING_INFO_VECTOR_MASK; | 871 | return idtv_info_field & VECTORING_INFO_VECTOR_MASK; |
817 | else | 872 | else |
818 | printk("pending exception: not handled yet\n"); | 873 | printk(KERN_DEBUG "pending exception: not handled yet\n"); |
819 | } | 874 | } |
820 | return -1; | 875 | return -1; |
821 | } | 876 | } |
@@ -863,7 +918,7 @@ static void hardware_disable(void *garbage) | |||
863 | } | 918 | } |
864 | 919 | ||
865 | static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, | 920 | static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, |
866 | u32 msr, u32* result) | 921 | u32 msr, u32 *result) |
867 | { | 922 | { |
868 | u32 vmx_msr_low, vmx_msr_high; | 923 | u32 vmx_msr_low, vmx_msr_high; |
869 | u32 ctl = ctl_min | ctl_opt; | 924 | u32 ctl = ctl_min | ctl_opt; |
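
adjust_vmx_controls() relies on the VMX capability-MSR convention: the low 32 bits of the MSR report control bits that must be 1, the high 32 bits those that may be 1; min bits are mandatory for KVM, opt bits best-effort. A self-contained sketch of that adjustment (the sample MSR value is made up, not read from hardware):

#include <stdint.h>
#include <stdio.h>

/* Mirror of the adjust_vmx_controls() logic for one capability MSR. */
static int adjust_controls(uint32_t ctl_min, uint32_t ctl_opt,
			   uint64_t cap_msr, uint32_t *result)
{
	uint32_t allowed0 = (uint32_t)cap_msr;         /* must-be-one bits */
	uint32_t allowed1 = (uint32_t)(cap_msr >> 32); /* may-be-one bits */
	uint32_t ctl = ctl_min | ctl_opt;

	ctl &= allowed1;   /* drop optional bits the CPU cannot set */
	ctl |= allowed0;   /* force bits the CPU requires */
	if (ctl_min & ~ctl)
		return -1;     /* a required feature is unsupported */
	*result = ctl;
	return 0;
}

int main(void)
{
	uint32_t ctl;
	/* CPU requires bit 1 and allows bits 1-3; we need bit 2, want bit 4. */
	uint64_t cap = ((uint64_t)0x0000000eULL << 32) | 0x00000002ULL;

	if (adjust_controls(0x4 /* min */, 0x10 /* opt */, cap, &ctl) == 0)
		printf("controls: 0x%x\n", ctl); /* 0x6: bit 4 dropped, bit 1 forced */
	return 0;
}
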
@@ -887,6 +942,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) | |||
887 | u32 min, opt; | 942 | u32 min, opt; |
888 | u32 _pin_based_exec_control = 0; | 943 | u32 _pin_based_exec_control = 0; |
889 | u32 _cpu_based_exec_control = 0; | 944 | u32 _cpu_based_exec_control = 0; |
945 | u32 _cpu_based_2nd_exec_control = 0; | ||
890 | u32 _vmexit_control = 0; | 946 | u32 _vmexit_control = 0; |
891 | u32 _vmentry_control = 0; | 947 | u32 _vmentry_control = 0; |
892 | 948 | ||
@@ -904,11 +960,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) | |||
904 | CPU_BASED_USE_IO_BITMAPS | | 960 | CPU_BASED_USE_IO_BITMAPS | |
905 | CPU_BASED_MOV_DR_EXITING | | 961 | CPU_BASED_MOV_DR_EXITING | |
906 | CPU_BASED_USE_TSC_OFFSETING; | 962 | CPU_BASED_USE_TSC_OFFSETING; |
907 | #ifdef CONFIG_X86_64 | 963 | opt = CPU_BASED_TPR_SHADOW | |
908 | opt = CPU_BASED_TPR_SHADOW; | 964 | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; |
909 | #else | ||
910 | opt = 0; | ||
911 | #endif | ||
912 | if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS, | 965 | if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS, |
913 | &_cpu_based_exec_control) < 0) | 966 | &_cpu_based_exec_control) < 0) |
914 | return -EIO; | 967 | return -EIO; |
@@ -917,6 +970,19 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) | |||
917 | _cpu_based_exec_control &= ~CPU_BASED_CR8_LOAD_EXITING & | 970 | _cpu_based_exec_control &= ~CPU_BASED_CR8_LOAD_EXITING & |
918 | ~CPU_BASED_CR8_STORE_EXITING; | 971 | ~CPU_BASED_CR8_STORE_EXITING; |
919 | #endif | 972 | #endif |
973 | if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) { | ||
974 | min = 0; | ||
975 | opt = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | | ||
976 | SECONDARY_EXEC_WBINVD_EXITING; | ||
977 | if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS2, | ||
978 | &_cpu_based_2nd_exec_control) < 0) | ||
979 | return -EIO; | ||
980 | } | ||
981 | #ifndef CONFIG_X86_64 | ||
982 | if (!(_cpu_based_2nd_exec_control & | ||
983 | SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) | ||
984 | _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW; | ||
985 | #endif | ||
920 | 986 | ||
921 | min = 0; | 987 | min = 0; |
922 | #ifdef CONFIG_X86_64 | 988 | #ifdef CONFIG_X86_64 |
@@ -954,6 +1020,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) | |||
954 | 1020 | ||
955 | vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control; | 1021 | vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control; |
956 | vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control; | 1022 | vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control; |
1023 | vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control; | ||
957 | vmcs_conf->vmexit_ctrl = _vmexit_control; | 1024 | vmcs_conf->vmexit_ctrl = _vmexit_control; |
958 | vmcs_conf->vmentry_ctrl = _vmentry_control; | 1025 | vmcs_conf->vmentry_ctrl = _vmentry_control; |
959 | 1026 | ||
@@ -1043,15 +1110,15 @@ static void enter_pmode(struct kvm_vcpu *vcpu) | |||
1043 | { | 1110 | { |
1044 | unsigned long flags; | 1111 | unsigned long flags; |
1045 | 1112 | ||
1046 | vcpu->rmode.active = 0; | 1113 | vcpu->arch.rmode.active = 0; |
1047 | 1114 | ||
1048 | vmcs_writel(GUEST_TR_BASE, vcpu->rmode.tr.base); | 1115 | vmcs_writel(GUEST_TR_BASE, vcpu->arch.rmode.tr.base); |
1049 | vmcs_write32(GUEST_TR_LIMIT, vcpu->rmode.tr.limit); | 1116 | vmcs_write32(GUEST_TR_LIMIT, vcpu->arch.rmode.tr.limit); |
1050 | vmcs_write32(GUEST_TR_AR_BYTES, vcpu->rmode.tr.ar); | 1117 | vmcs_write32(GUEST_TR_AR_BYTES, vcpu->arch.rmode.tr.ar); |
1051 | 1118 | ||
1052 | flags = vmcs_readl(GUEST_RFLAGS); | 1119 | flags = vmcs_readl(GUEST_RFLAGS); |
1053 | flags &= ~(X86_EFLAGS_IOPL | X86_EFLAGS_VM); | 1120 | flags &= ~(X86_EFLAGS_IOPL | X86_EFLAGS_VM); |
1054 | flags |= (vcpu->rmode.save_iopl << IOPL_SHIFT); | 1121 | flags |= (vcpu->arch.rmode.save_iopl << IOPL_SHIFT); |
1055 | vmcs_writel(GUEST_RFLAGS, flags); | 1122 | vmcs_writel(GUEST_RFLAGS, flags); |
1056 | 1123 | ||
1057 | vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) | | 1124 | vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) | |
@@ -1059,10 +1126,10 @@ static void enter_pmode(struct kvm_vcpu *vcpu) | |||
1059 | 1126 | ||
1060 | update_exception_bitmap(vcpu); | 1127 | update_exception_bitmap(vcpu); |
1061 | 1128 | ||
1062 | fix_pmode_dataseg(VCPU_SREG_ES, &vcpu->rmode.es); | 1129 | fix_pmode_dataseg(VCPU_SREG_ES, &vcpu->arch.rmode.es); |
1063 | fix_pmode_dataseg(VCPU_SREG_DS, &vcpu->rmode.ds); | 1130 | fix_pmode_dataseg(VCPU_SREG_DS, &vcpu->arch.rmode.ds); |
1064 | fix_pmode_dataseg(VCPU_SREG_GS, &vcpu->rmode.gs); | 1131 | fix_pmode_dataseg(VCPU_SREG_GS, &vcpu->arch.rmode.gs); |
1065 | fix_pmode_dataseg(VCPU_SREG_FS, &vcpu->rmode.fs); | 1132 | fix_pmode_dataseg(VCPU_SREG_FS, &vcpu->arch.rmode.fs); |
1066 | 1133 | ||
1067 | vmcs_write16(GUEST_SS_SELECTOR, 0); | 1134 | vmcs_write16(GUEST_SS_SELECTOR, 0); |
1068 | vmcs_write32(GUEST_SS_AR_BYTES, 0x93); | 1135 | vmcs_write32(GUEST_SS_AR_BYTES, 0x93); |
@@ -1072,10 +1139,14 @@ static void enter_pmode(struct kvm_vcpu *vcpu) | |||
1072 | vmcs_write32(GUEST_CS_AR_BYTES, 0x9b); | 1139 | vmcs_write32(GUEST_CS_AR_BYTES, 0x9b); |
1073 | } | 1140 | } |
1074 | 1141 | ||
1075 | static gva_t rmode_tss_base(struct kvm* kvm) | 1142 | static gva_t rmode_tss_base(struct kvm *kvm) |
1076 | { | 1143 | { |
1077 | gfn_t base_gfn = kvm->memslots[0].base_gfn + kvm->memslots[0].npages - 3; | 1144 | if (!kvm->arch.tss_addr) { |
1078 | return base_gfn << PAGE_SHIFT; | 1145 | gfn_t base_gfn = kvm->memslots[0].base_gfn + |
1146 | kvm->memslots[0].npages - 3; | ||
1147 | return base_gfn << PAGE_SHIFT; | ||
1148 | } | ||
1149 | return kvm->arch.tss_addr; | ||
1079 | } | 1150 | } |
1080 | 1151 | ||
1081 | static void fix_rmode_seg(int seg, struct kvm_save_segment *save) | 1152 | static void fix_rmode_seg(int seg, struct kvm_save_segment *save) |
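
When userspace has not configured a TSS address, rmode_tss_base() above falls back to carving the real-mode TSS out of the top three pages of memslot 0. A toy calculation of that placement (illustrative slot geometry; PAGE_SHIFT of 12 as on x86):

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT 12

int main(void)
{
	uint64_t base_gfn = 0;    /* slot 0 starts at guest physical 0 */
	uint64_t npages = 0x8000; /* 128 MiB of guest RAM, for example */
	uint64_t tss_base = (base_gfn + npages - 3) << PAGE_SHIFT;

	printf("default real-mode TSS at GPA 0x%llx..0x%llx\n",
	       (unsigned long long)tss_base,
	       (unsigned long long)(tss_base + 3 * (1ULL << PAGE_SHIFT) - 1));
	return 0;
}
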
@@ -1086,7 +1157,8 @@ static void fix_rmode_seg(int seg, struct kvm_save_segment *save) | |||
1086 | save->base = vmcs_readl(sf->base); | 1157 | save->base = vmcs_readl(sf->base); |
1087 | save->limit = vmcs_read32(sf->limit); | 1158 | save->limit = vmcs_read32(sf->limit); |
1088 | save->ar = vmcs_read32(sf->ar_bytes); | 1159 | save->ar = vmcs_read32(sf->ar_bytes); |
1089 | vmcs_write16(sf->selector, vmcs_readl(sf->base) >> 4); | 1160 | vmcs_write16(sf->selector, save->base >> 4); |
1161 | vmcs_write32(sf->base, save->base & 0xfffff); | ||
1090 | vmcs_write32(sf->limit, 0xffff); | 1162 | vmcs_write32(sf->limit, 0xffff); |
1091 | vmcs_write32(sf->ar_bytes, 0xf3); | 1163 | vmcs_write32(sf->ar_bytes, 0xf3); |
1092 | } | 1164 | } |
@@ -1095,19 +1167,20 @@ static void enter_rmode(struct kvm_vcpu *vcpu) | |||
1095 | { | 1167 | { |
1096 | unsigned long flags; | 1168 | unsigned long flags; |
1097 | 1169 | ||
1098 | vcpu->rmode.active = 1; | 1170 | vcpu->arch.rmode.active = 1; |
1099 | 1171 | ||
1100 | vcpu->rmode.tr.base = vmcs_readl(GUEST_TR_BASE); | 1172 | vcpu->arch.rmode.tr.base = vmcs_readl(GUEST_TR_BASE); |
1101 | vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm)); | 1173 | vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm)); |
1102 | 1174 | ||
1103 | vcpu->rmode.tr.limit = vmcs_read32(GUEST_TR_LIMIT); | 1175 | vcpu->arch.rmode.tr.limit = vmcs_read32(GUEST_TR_LIMIT); |
1104 | vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1); | 1176 | vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1); |
1105 | 1177 | ||
1106 | vcpu->rmode.tr.ar = vmcs_read32(GUEST_TR_AR_BYTES); | 1178 | vcpu->arch.rmode.tr.ar = vmcs_read32(GUEST_TR_AR_BYTES); |
1107 | vmcs_write32(GUEST_TR_AR_BYTES, 0x008b); | 1179 | vmcs_write32(GUEST_TR_AR_BYTES, 0x008b); |
1108 | 1180 | ||
1109 | flags = vmcs_readl(GUEST_RFLAGS); | 1181 | flags = vmcs_readl(GUEST_RFLAGS); |
1110 | vcpu->rmode.save_iopl = (flags & X86_EFLAGS_IOPL) >> IOPL_SHIFT; | 1182 | vcpu->arch.rmode.save_iopl |
1183 | = (flags & X86_EFLAGS_IOPL) >> IOPL_SHIFT; | ||
1111 | 1184 | ||
1112 | flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; | 1185 | flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; |
1113 | 1186 | ||
@@ -1125,10 +1198,10 @@ static void enter_rmode(struct kvm_vcpu *vcpu) | |||
1125 | vmcs_writel(GUEST_CS_BASE, 0xf0000); | 1198 | vmcs_writel(GUEST_CS_BASE, 0xf0000); |
1126 | vmcs_write16(GUEST_CS_SELECTOR, vmcs_readl(GUEST_CS_BASE) >> 4); | 1199 | vmcs_write16(GUEST_CS_SELECTOR, vmcs_readl(GUEST_CS_BASE) >> 4); |
1127 | 1200 | ||
1128 | fix_rmode_seg(VCPU_SREG_ES, &vcpu->rmode.es); | 1201 | fix_rmode_seg(VCPU_SREG_ES, &vcpu->arch.rmode.es); |
1129 | fix_rmode_seg(VCPU_SREG_DS, &vcpu->rmode.ds); | 1202 | fix_rmode_seg(VCPU_SREG_DS, &vcpu->arch.rmode.ds); |
1130 | fix_rmode_seg(VCPU_SREG_GS, &vcpu->rmode.gs); | 1203 | fix_rmode_seg(VCPU_SREG_GS, &vcpu->arch.rmode.gs); |
1131 | fix_rmode_seg(VCPU_SREG_FS, &vcpu->rmode.fs); | 1204 | fix_rmode_seg(VCPU_SREG_FS, &vcpu->arch.rmode.fs); |
1132 | 1205 | ||
1133 | kvm_mmu_reset_context(vcpu); | 1206 | kvm_mmu_reset_context(vcpu); |
1134 | init_rmode_tss(vcpu->kvm); | 1207 | init_rmode_tss(vcpu->kvm); |
@@ -1149,7 +1222,7 @@ static void enter_lmode(struct kvm_vcpu *vcpu) | |||
1149 | | AR_TYPE_BUSY_64_TSS); | 1222 | | AR_TYPE_BUSY_64_TSS); |
1150 | } | 1223 | } |
1151 | 1224 | ||
1152 | vcpu->shadow_efer |= EFER_LMA; | 1225 | vcpu->arch.shadow_efer |= EFER_LMA; |
1153 | 1226 | ||
1154 | find_msr_entry(to_vmx(vcpu), MSR_EFER)->data |= EFER_LMA | EFER_LME; | 1227 | find_msr_entry(to_vmx(vcpu), MSR_EFER)->data |= EFER_LMA | EFER_LME; |
1155 | vmcs_write32(VM_ENTRY_CONTROLS, | 1228 | vmcs_write32(VM_ENTRY_CONTROLS, |
@@ -1159,7 +1232,7 @@ static void enter_lmode(struct kvm_vcpu *vcpu) | |||
1159 | 1232 | ||
1160 | static void exit_lmode(struct kvm_vcpu *vcpu) | 1233 | static void exit_lmode(struct kvm_vcpu *vcpu) |
1161 | { | 1234 | { |
1162 | vcpu->shadow_efer &= ~EFER_LMA; | 1235 | vcpu->arch.shadow_efer &= ~EFER_LMA; |
1163 | 1236 | ||
1164 | vmcs_write32(VM_ENTRY_CONTROLS, | 1237 | vmcs_write32(VM_ENTRY_CONTROLS, |
1165 | vmcs_read32(VM_ENTRY_CONTROLS) | 1238 | vmcs_read32(VM_ENTRY_CONTROLS) |
@@ -1170,22 +1243,22 @@ static void exit_lmode(struct kvm_vcpu *vcpu) | |||
1170 | 1243 | ||
1171 | static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) | 1244 | static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) |
1172 | { | 1245 | { |
1173 | vcpu->cr4 &= KVM_GUEST_CR4_MASK; | 1246 | vcpu->arch.cr4 &= KVM_GUEST_CR4_MASK; |
1174 | vcpu->cr4 |= vmcs_readl(GUEST_CR4) & ~KVM_GUEST_CR4_MASK; | 1247 | vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & ~KVM_GUEST_CR4_MASK; |
1175 | } | 1248 | } |
1176 | 1249 | ||
1177 | static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) | 1250 | static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) |
1178 | { | 1251 | { |
1179 | vmx_fpu_deactivate(vcpu); | 1252 | vmx_fpu_deactivate(vcpu); |
1180 | 1253 | ||
1181 | if (vcpu->rmode.active && (cr0 & X86_CR0_PE)) | 1254 | if (vcpu->arch.rmode.active && (cr0 & X86_CR0_PE)) |
1182 | enter_pmode(vcpu); | 1255 | enter_pmode(vcpu); |
1183 | 1256 | ||
1184 | if (!vcpu->rmode.active && !(cr0 & X86_CR0_PE)) | 1257 | if (!vcpu->arch.rmode.active && !(cr0 & X86_CR0_PE)) |
1185 | enter_rmode(vcpu); | 1258 | enter_rmode(vcpu); |
1186 | 1259 | ||
1187 | #ifdef CONFIG_X86_64 | 1260 | #ifdef CONFIG_X86_64 |
1188 | if (vcpu->shadow_efer & EFER_LME) { | 1261 | if (vcpu->arch.shadow_efer & EFER_LME) { |
1189 | if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) | 1262 | if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) |
1190 | enter_lmode(vcpu); | 1263 | enter_lmode(vcpu); |
1191 | if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) | 1264 | if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) |
@@ -1196,7 +1269,7 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) | |||
1196 | vmcs_writel(CR0_READ_SHADOW, cr0); | 1269 | vmcs_writel(CR0_READ_SHADOW, cr0); |
1197 | vmcs_writel(GUEST_CR0, | 1270 | vmcs_writel(GUEST_CR0, |
1198 | (cr0 & ~KVM_GUEST_CR0_MASK) | KVM_VM_CR0_ALWAYS_ON); | 1271 | (cr0 & ~KVM_GUEST_CR0_MASK) | KVM_VM_CR0_ALWAYS_ON); |
1199 | vcpu->cr0 = cr0; | 1272 | vcpu->arch.cr0 = cr0; |
1200 | 1273 | ||
1201 | if (!(cr0 & X86_CR0_TS) || !(cr0 & X86_CR0_PE)) | 1274 | if (!(cr0 & X86_CR0_TS) || !(cr0 & X86_CR0_PE)) |
1202 | vmx_fpu_activate(vcpu); | 1275 | vmx_fpu_activate(vcpu); |
@@ -1205,16 +1278,16 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) | |||
1205 | static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) | 1278 | static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) |
1206 | { | 1279 | { |
1207 | vmcs_writel(GUEST_CR3, cr3); | 1280 | vmcs_writel(GUEST_CR3, cr3); |
1208 | if (vcpu->cr0 & X86_CR0_PE) | 1281 | if (vcpu->arch.cr0 & X86_CR0_PE) |
1209 | vmx_fpu_deactivate(vcpu); | 1282 | vmx_fpu_deactivate(vcpu); |
1210 | } | 1283 | } |
1211 | 1284 | ||
1212 | static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) | 1285 | static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) |
1213 | { | 1286 | { |
1214 | vmcs_writel(CR4_READ_SHADOW, cr4); | 1287 | vmcs_writel(CR4_READ_SHADOW, cr4); |
1215 | vmcs_writel(GUEST_CR4, cr4 | (vcpu->rmode.active ? | 1288 | vmcs_writel(GUEST_CR4, cr4 | (vcpu->arch.rmode.active ? |
1216 | KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON)); | 1289 | KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON)); |
1217 | vcpu->cr4 = cr4; | 1290 | vcpu->arch.cr4 = cr4; |
1218 | } | 1291 | } |
1219 | 1292 | ||
1220 | #ifdef CONFIG_X86_64 | 1293 | #ifdef CONFIG_X86_64 |
@@ -1224,7 +1297,7 @@ static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) | |||
1224 | struct vcpu_vmx *vmx = to_vmx(vcpu); | 1297 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
1225 | struct kvm_msr_entry *msr = find_msr_entry(vmx, MSR_EFER); | 1298 | struct kvm_msr_entry *msr = find_msr_entry(vmx, MSR_EFER); |
1226 | 1299 | ||
1227 | vcpu->shadow_efer = efer; | 1300 | vcpu->arch.shadow_efer = efer; |
1228 | if (efer & EFER_LMA) { | 1301 | if (efer & EFER_LMA) { |
1229 | vmcs_write32(VM_ENTRY_CONTROLS, | 1302 | vmcs_write32(VM_ENTRY_CONTROLS, |
1230 | vmcs_read32(VM_ENTRY_CONTROLS) | | 1303 | vmcs_read32(VM_ENTRY_CONTROLS) | |
@@ -1301,17 +1374,17 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu, | |||
1301 | struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; | 1374 | struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; |
1302 | u32 ar; | 1375 | u32 ar; |
1303 | 1376 | ||
1304 | if (vcpu->rmode.active && seg == VCPU_SREG_TR) { | 1377 | if (vcpu->arch.rmode.active && seg == VCPU_SREG_TR) { |
1305 | vcpu->rmode.tr.selector = var->selector; | 1378 | vcpu->arch.rmode.tr.selector = var->selector; |
1306 | vcpu->rmode.tr.base = var->base; | 1379 | vcpu->arch.rmode.tr.base = var->base; |
1307 | vcpu->rmode.tr.limit = var->limit; | 1380 | vcpu->arch.rmode.tr.limit = var->limit; |
1308 | vcpu->rmode.tr.ar = vmx_segment_access_rights(var); | 1381 | vcpu->arch.rmode.tr.ar = vmx_segment_access_rights(var); |
1309 | return; | 1382 | return; |
1310 | } | 1383 | } |
1311 | vmcs_writel(sf->base, var->base); | 1384 | vmcs_writel(sf->base, var->base); |
1312 | vmcs_write32(sf->limit, var->limit); | 1385 | vmcs_write32(sf->limit, var->limit); |
1313 | vmcs_write16(sf->selector, var->selector); | 1386 | vmcs_write16(sf->selector, var->selector); |
1314 | if (vcpu->rmode.active && var->s) { | 1387 | if (vcpu->arch.rmode.active && var->s) { |
1315 | /* | 1388 | /* |
1316 | * Hack real-mode segments into vm86 compatibility. | 1389 | * Hack real-mode segments into vm86 compatibility. |
1317 | */ | 1390 | */ |
@@ -1355,36 +1428,38 @@ static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) | |||
1355 | vmcs_writel(GUEST_GDTR_BASE, dt->base); | 1428 | vmcs_writel(GUEST_GDTR_BASE, dt->base); |
1356 | } | 1429 | } |
1357 | 1430 | ||
1358 | static int init_rmode_tss(struct kvm* kvm) | 1431 | static int init_rmode_tss(struct kvm *kvm) |
1359 | { | 1432 | { |
1360 | struct page *p1, *p2, *p3; | ||
1361 | gfn_t fn = rmode_tss_base(kvm) >> PAGE_SHIFT; | 1433 | gfn_t fn = rmode_tss_base(kvm) >> PAGE_SHIFT; |
1362 | char *page; | 1434 | u16 data = 0; |
1363 | 1435 | int ret = 0; | |
1364 | p1 = gfn_to_page(kvm, fn++); | 1436 | int r; |
1365 | p2 = gfn_to_page(kvm, fn++); | ||
1366 | p3 = gfn_to_page(kvm, fn); | ||
1367 | |||
1368 | if (!p1 || !p2 || !p3) { | ||
1369 | kvm_printf(kvm,"%s: gfn_to_page failed\n", __FUNCTION__); | ||
1370 | return 0; | ||
1371 | } | ||
1372 | |||
1373 | page = kmap_atomic(p1, KM_USER0); | ||
1374 | clear_page(page); | ||
1375 | *(u16*)(page + 0x66) = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE; | ||
1376 | kunmap_atomic(page, KM_USER0); | ||
1377 | |||
1378 | page = kmap_atomic(p2, KM_USER0); | ||
1379 | clear_page(page); | ||
1380 | kunmap_atomic(page, KM_USER0); | ||
1381 | 1437 | ||
1382 | page = kmap_atomic(p3, KM_USER0); | 1438 | down_read(&current->mm->mmap_sem);
1383 | clear_page(page); | 1439 | r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE); |
1384 | *(page + RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1) = ~0; | 1440 | if (r < 0) |
1385 | kunmap_atomic(page, KM_USER0); | 1441 | goto out; |
1442 | data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE; | ||
1443 | r = kvm_write_guest_page(kvm, fn++, &data, 0x66, sizeof(u16)); | ||
1444 | if (r < 0) | ||
1445 | goto out; | ||
1446 | r = kvm_clear_guest_page(kvm, fn++, 0, PAGE_SIZE); | ||
1447 | if (r < 0) | ||
1448 | goto out; | ||
1449 | r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE); | ||
1450 | if (r < 0) | ||
1451 | goto out; | ||
1452 | data = ~0; | ||
1453 | r = kvm_write_guest_page(kvm, fn, &data, | ||
1454 | RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1, | ||
1455 | sizeof(u8)); | ||
1456 | if (r < 0) | ||
1457 | goto out; | ||
1386 | 1458 | ||
1387 | return 1; | 1459 | ret = 1; |
1460 | out: | ||
1461 | up_read(&current->mm->mmap_sem); | ||
1462 | return ret; | ||
1388 | } | 1463 | } |
1389 | 1464 | ||
1390 | static void seg_setup(int seg) | 1465 | static void seg_setup(int seg) |
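
init_rmode_tss() now goes through the gfn-based guest-access helpers instead of kmapping three struct page pointers by hand, but the bytes it produces are unchanged: a zeroed three-page area, the I/O-map base at offset 0x66, and an all-ones terminator byte at the very end of the TSS. A user-space model of that layout (the TSS_* sizes are assumed from vmx.h of this era, not quoted from the patch):

#include <stdio.h>
#include <string.h>

#define PAGE_SIZE            4096
#define TSS_BASE_SIZE        0x68          /* assumed */
#define TSS_REDIRECTION_SIZE (256 / 8)     /* assumed */
#define TSS_IOPB_SIZE        (65536 / 8)   /* assumed */
#define RMODE_TSS_SIZE \
	(TSS_BASE_SIZE + TSS_REDIRECTION_SIZE + TSS_IOPB_SIZE + 1)

/* Model of the writes init_rmode_tss() issues through
 * kvm_clear_guest_page()/kvm_write_guest_page(). */
int main(void)
{
	static unsigned char tss[3 * PAGE_SIZE];
	unsigned short iomap_base = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE;

	memset(tss, 0, sizeof(tss));                         /* the three clears */
	memcpy(tss + 0x66, &iomap_base, sizeof(iomap_base)); /* I/O map base */
	tss[RMODE_TSS_SIZE - 1] = ~0;                        /* bitmap terminator */

	printf("TSS spans 0x%x bytes; I/O map base 0x%x; last byte at 0x%x\n",
	       RMODE_TSS_SIZE, iomap_base, RMODE_TSS_SIZE - 1);
	return 0;
}
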
@@ -1397,6 +1472,27 @@ static void seg_setup(int seg) | |||
1397 | vmcs_write32(sf->ar_bytes, 0x93); | 1472 | vmcs_write32(sf->ar_bytes, 0x93); |
1398 | } | 1473 | } |
1399 | 1474 | ||
1475 | static int alloc_apic_access_page(struct kvm *kvm) | ||
1476 | { | ||
1477 | struct kvm_userspace_memory_region kvm_userspace_mem; | ||
1478 | int r = 0; | ||
1479 | |||
1480 | down_write(&current->mm->mmap_sem); | ||
1481 | if (kvm->arch.apic_access_page) | ||
1482 | goto out; | ||
1483 | kvm_userspace_mem.slot = APIC_ACCESS_PAGE_PRIVATE_MEMSLOT; | ||
1484 | kvm_userspace_mem.flags = 0; | ||
1485 | kvm_userspace_mem.guest_phys_addr = 0xfee00000ULL; | ||
1486 | kvm_userspace_mem.memory_size = PAGE_SIZE; | ||
1487 | r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, 0); | ||
1488 | if (r) | ||
1489 | goto out; | ||
1490 | kvm->arch.apic_access_page = gfn_to_page(kvm, 0xfee00); | ||
1491 | out: | ||
1492 | up_write(&current->mm->mmap_sem); | ||
1493 | return r; | ||
1494 | } | ||
1495 | |||
1400 | /* | 1496 | /* |
1401 | * Sets up the vmcs for emulated real mode. | 1497 | * Sets up the vmcs for emulated real mode. |
1402 | */ | 1498 | */ |
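
alloc_apic_access_page() above backs the APIC MMIO window by registering a one-page private memslot at guest physical 0xfee00000 and then pinning the page behind it; with 4 KiB pages that address is frame 0xfee00, which is the literal passed to gfn_to_page(). A trivial check of the arithmetic:

#include <assert.h>
#include <stdio.h>

#define PAGE_SHIFT 12

int main(void)
{
	unsigned long long gpa = 0xfee00000ULL; /* APIC MMIO base */
	unsigned long long gfn = gpa >> PAGE_SHIFT;

	assert(gfn == 0xfee00);
	printf("APIC access slot: gpa 0x%llx -> gfn 0x%llx\n", gpa, gfn);
	return 0;
}
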
@@ -1407,92 +1503,15 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) | |||
1407 | unsigned long a; | 1503 | unsigned long a; |
1408 | struct descriptor_table dt; | 1504 | struct descriptor_table dt; |
1409 | int i; | 1505 | int i; |
1410 | int ret = 0; | ||
1411 | unsigned long kvm_vmx_return; | 1506 | unsigned long kvm_vmx_return; |
1412 | u64 msr; | ||
1413 | u32 exec_control; | 1507 | u32 exec_control; |
1414 | 1508 | ||
1415 | if (!init_rmode_tss(vmx->vcpu.kvm)) { | ||
1416 | ret = -ENOMEM; | ||
1417 | goto out; | ||
1418 | } | ||
1419 | |||
1420 | vmx->vcpu.rmode.active = 0; | ||
1421 | |||
1422 | vmx->vcpu.regs[VCPU_REGS_RDX] = get_rdx_init_val(); | ||
1423 | set_cr8(&vmx->vcpu, 0); | ||
1424 | msr = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; | ||
1425 | if (vmx->vcpu.vcpu_id == 0) | ||
1426 | msr |= MSR_IA32_APICBASE_BSP; | ||
1427 | kvm_set_apic_base(&vmx->vcpu, msr); | ||
1428 | |||
1429 | fx_init(&vmx->vcpu); | ||
1430 | |||
1431 | /* | ||
1432 | * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode | ||
1433 | * insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4. Sigh. | ||
1434 | */ | ||
1435 | if (vmx->vcpu.vcpu_id == 0) { | ||
1436 | vmcs_write16(GUEST_CS_SELECTOR, 0xf000); | ||
1437 | vmcs_writel(GUEST_CS_BASE, 0x000f0000); | ||
1438 | } else { | ||
1439 | vmcs_write16(GUEST_CS_SELECTOR, vmx->vcpu.sipi_vector << 8); | ||
1440 | vmcs_writel(GUEST_CS_BASE, vmx->vcpu.sipi_vector << 12); | ||
1441 | } | ||
1442 | vmcs_write32(GUEST_CS_LIMIT, 0xffff); | ||
1443 | vmcs_write32(GUEST_CS_AR_BYTES, 0x9b); | ||
1444 | |||
1445 | seg_setup(VCPU_SREG_DS); | ||
1446 | seg_setup(VCPU_SREG_ES); | ||
1447 | seg_setup(VCPU_SREG_FS); | ||
1448 | seg_setup(VCPU_SREG_GS); | ||
1449 | seg_setup(VCPU_SREG_SS); | ||
1450 | |||
1451 | vmcs_write16(GUEST_TR_SELECTOR, 0); | ||
1452 | vmcs_writel(GUEST_TR_BASE, 0); | ||
1453 | vmcs_write32(GUEST_TR_LIMIT, 0xffff); | ||
1454 | vmcs_write32(GUEST_TR_AR_BYTES, 0x008b); | ||
1455 | |||
1456 | vmcs_write16(GUEST_LDTR_SELECTOR, 0); | ||
1457 | vmcs_writel(GUEST_LDTR_BASE, 0); | ||
1458 | vmcs_write32(GUEST_LDTR_LIMIT, 0xffff); | ||
1459 | vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082); | ||
1460 | |||
1461 | vmcs_write32(GUEST_SYSENTER_CS, 0); | ||
1462 | vmcs_writel(GUEST_SYSENTER_ESP, 0); | ||
1463 | vmcs_writel(GUEST_SYSENTER_EIP, 0); | ||
1464 | |||
1465 | vmcs_writel(GUEST_RFLAGS, 0x02); | ||
1466 | if (vmx->vcpu.vcpu_id == 0) | ||
1467 | vmcs_writel(GUEST_RIP, 0xfff0); | ||
1468 | else | ||
1469 | vmcs_writel(GUEST_RIP, 0); | ||
1470 | vmcs_writel(GUEST_RSP, 0); | ||
1471 | |||
1472 | //todo: dr0 = dr1 = dr2 = dr3 = 0; dr6 = 0xffff0ff0 | ||
1473 | vmcs_writel(GUEST_DR7, 0x400); | ||
1474 | |||
1475 | vmcs_writel(GUEST_GDTR_BASE, 0); | ||
1476 | vmcs_write32(GUEST_GDTR_LIMIT, 0xffff); | ||
1477 | |||
1478 | vmcs_writel(GUEST_IDTR_BASE, 0); | ||
1479 | vmcs_write32(GUEST_IDTR_LIMIT, 0xffff); | ||
1480 | |||
1481 | vmcs_write32(GUEST_ACTIVITY_STATE, 0); | ||
1482 | vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0); | ||
1483 | vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0); | ||
1484 | |||
1485 | /* I/O */ | 1509 | /* I/O */ |
1486 | vmcs_write64(IO_BITMAP_A, page_to_phys(vmx_io_bitmap_a)); | 1510 | vmcs_write64(IO_BITMAP_A, page_to_phys(vmx_io_bitmap_a)); |
1487 | vmcs_write64(IO_BITMAP_B, page_to_phys(vmx_io_bitmap_b)); | 1511 | vmcs_write64(IO_BITMAP_B, page_to_phys(vmx_io_bitmap_b)); |
1488 | 1512 | ||
1489 | guest_write_tsc(0); | ||
1490 | |||
1491 | vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */ | 1513 | vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */ |
1492 | 1514 | ||
1493 | /* Special registers */ | ||
1494 | vmcs_write64(GUEST_IA32_DEBUGCTL, 0); | ||
1495 | |||
1496 | /* Control */ | 1515 | /* Control */ |
1497 | vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, | 1516 | vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, |
1498 | vmcs_config.pin_based_exec_ctrl); | 1517 | vmcs_config.pin_based_exec_ctrl); |
@@ -1507,8 +1526,16 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) | |||
1507 | } | 1526 | } |
1508 | vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control); | 1527 | vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control); |
1509 | 1528 | ||
1510 | vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0); | 1529 | if (cpu_has_secondary_exec_ctrls()) { |
1511 | vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0); | 1530 | exec_control = vmcs_config.cpu_based_2nd_exec_ctrl; |
1531 | if (!vm_need_virtualize_apic_accesses(vmx->vcpu.kvm)) | ||
1532 | exec_control &= | ||
1533 | ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; | ||
1534 | vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); | ||
1535 | } | ||
1536 | |||
1537 | vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, !!bypass_guest_pf); | ||
1538 | vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, !!bypass_guest_pf); | ||
1512 | vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */ | 1539 | vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */ |
1513 | 1540 | ||
1514 | vmcs_writel(HOST_CR0, read_cr0()); /* 22.2.3 */ | 1541 | vmcs_writel(HOST_CR0, read_cr0()); /* 22.2.3 */ |
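
Writing !!bypass_guest_pf into both PAGE_FAULT_ERROR_CODE_MASK and _MATCH changes which guest page faults exit: hardware compares (error code & mask) with match and exits only when the result agrees with the #PF bit in the exception bitmap, which update_exception_bitmap() always sets. With mask = match = 1, only faults whose error code carries the present bit leave the guest. A sketch of that decision rule (per the SDM; not kernel code):

#include <stdio.h>

#define PFERR_PRESENT_MASK 1u  /* bit 0 of the page-fault error code */

/* SDM rule: a guest #PF causes a VM exit iff the result of
 * (error_code & PFEC_MASK) == PFEC_MATCH agrees with the #PF bit
 * in the exception bitmap (assumed set here). */
static int pf_causes_vmexit(unsigned int error_code,
			    unsigned int pfec_mask, unsigned int pfec_match)
{
	int pf_bit_set = 1; /* eb always contains 1u << PF_VECTOR */
	int match = (error_code & pfec_mask) == pfec_match;

	return pf_bit_set ? match : !match;
}

int main(void)
{
	/* bypass_guest_pf=1 -> mask = match = 1 */
	printf("not-present fault exits: %d\n",
	       pf_causes_vmexit(0, 1, 1));                     /* 0 */
	printf("present (write-protect) fault exits: %d\n",
	       pf_causes_vmexit(PFERR_PRESENT_MASK | 2, 1, 1)); /* 1 */
	return 0;
}
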
@@ -1536,7 +1563,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) | |||
1536 | get_idt(&dt); | 1563 | get_idt(&dt); |
1537 | vmcs_writel(HOST_IDTR_BASE, dt.base); /* 22.2.4 */ | 1564 | vmcs_writel(HOST_IDTR_BASE, dt.base); /* 22.2.4 */ |
1538 | 1565 | ||
1539 | asm ("mov $.Lkvm_vmx_return, %0" : "=r"(kvm_vmx_return)); | 1566 | asm("mov $.Lkvm_vmx_return, %0" : "=r"(kvm_vmx_return)); |
1540 | vmcs_writel(HOST_RIP, kvm_vmx_return); /* 22.2.5 */ | 1567 | vmcs_writel(HOST_RIP, kvm_vmx_return); /* 22.2.5 */ |
1541 | vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0); | 1568 | vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0); |
1542 | vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0); | 1569 | vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0); |
@@ -1567,97 +1594,145 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) | |||
1567 | ++vmx->nmsrs; | 1594 | ++vmx->nmsrs; |
1568 | } | 1595 | } |
1569 | 1596 | ||
1570 | setup_msrs(vmx); | ||
1571 | |||
1572 | vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl); | 1597 | vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl); |
1573 | 1598 | ||
1574 | /* 22.2.1, 20.8.1 */ | 1599 | /* 22.2.1, 20.8.1 */ |
1575 | vmcs_write32(VM_ENTRY_CONTROLS, vmcs_config.vmentry_ctrl); | 1600 | vmcs_write32(VM_ENTRY_CONTROLS, vmcs_config.vmentry_ctrl); |
1576 | 1601 | ||
1577 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); /* 22.2.1 */ | ||
1578 | |||
1579 | #ifdef CONFIG_X86_64 | ||
1580 | vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0); | ||
1581 | if (vm_need_tpr_shadow(vmx->vcpu.kvm)) | ||
1582 | vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, | ||
1583 | page_to_phys(vmx->vcpu.apic->regs_page)); | ||
1584 | vmcs_write32(TPR_THRESHOLD, 0); | ||
1585 | #endif | ||
1586 | |||
1587 | vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL); | 1602 | vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL); |
1588 | vmcs_writel(CR4_GUEST_HOST_MASK, KVM_GUEST_CR4_MASK); | 1603 | vmcs_writel(CR4_GUEST_HOST_MASK, KVM_GUEST_CR4_MASK); |
1589 | 1604 | ||
1590 | vmx->vcpu.cr0 = 0x60000010; | 1605 | if (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm)) |
1591 | vmx_set_cr0(&vmx->vcpu, vmx->vcpu.cr0); // enter rmode | 1606 | if (alloc_apic_access_page(vmx->vcpu.kvm) != 0) |
1592 | vmx_set_cr4(&vmx->vcpu, 0); | 1607 | return -ENOMEM; |
1593 | #ifdef CONFIG_X86_64 | ||
1594 | vmx_set_efer(&vmx->vcpu, 0); | ||
1595 | #endif | ||
1596 | vmx_fpu_activate(&vmx->vcpu); | ||
1597 | update_exception_bitmap(&vmx->vcpu); | ||
1598 | 1608 | ||
1599 | return 0; | 1609 | return 0; |
1600 | |||
1601 | out: | ||
1602 | return ret; | ||
1603 | } | 1610 | } |
1604 | 1611 | ||
1605 | static void vmx_vcpu_reset(struct kvm_vcpu *vcpu) | 1612 | static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) |
1606 | { | 1613 | { |
1607 | struct vcpu_vmx *vmx = to_vmx(vcpu); | 1614 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
1615 | u64 msr; | ||
1616 | int ret; | ||
1608 | 1617 | ||
1609 | vmx_vcpu_setup(vmx); | 1618 | if (!init_rmode_tss(vmx->vcpu.kvm)) { |
1610 | } | 1619 | ret = -ENOMEM; |
1611 | 1620 | goto out; | |
1612 | static void inject_rmode_irq(struct kvm_vcpu *vcpu, int irq) | ||
1613 | { | ||
1614 | u16 ent[2]; | ||
1615 | u16 cs; | ||
1616 | u16 ip; | ||
1617 | unsigned long flags; | ||
1618 | unsigned long ss_base = vmcs_readl(GUEST_SS_BASE); | ||
1619 | u16 sp = vmcs_readl(GUEST_RSP); | ||
1620 | u32 ss_limit = vmcs_read32(GUEST_SS_LIMIT); | ||
1621 | |||
1622 | if (sp > ss_limit || sp < 6 ) { | ||
1623 | vcpu_printf(vcpu, "%s: #SS, rsp 0x%lx ss 0x%lx limit 0x%x\n", | ||
1624 | __FUNCTION__, | ||
1625 | vmcs_readl(GUEST_RSP), | ||
1626 | vmcs_readl(GUEST_SS_BASE), | ||
1627 | vmcs_read32(GUEST_SS_LIMIT)); | ||
1628 | return; | ||
1629 | } | 1621 | } |
1630 | 1622 | ||
1631 | if (emulator_read_std(irq * sizeof(ent), &ent, sizeof(ent), vcpu) != | 1623 | vmx->vcpu.arch.rmode.active = 0; |
1632 | X86EMUL_CONTINUE) { | 1624 | |
1633 | vcpu_printf(vcpu, "%s: read guest err\n", __FUNCTION__); | 1625 | vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val(); |
1634 | return; | 1626 | set_cr8(&vmx->vcpu, 0); |
1627 | msr = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; | ||
1628 | if (vmx->vcpu.vcpu_id == 0) | ||
1629 | msr |= MSR_IA32_APICBASE_BSP; | ||
1630 | kvm_set_apic_base(&vmx->vcpu, msr); | ||
1631 | |||
1632 | fx_init(&vmx->vcpu); | ||
1633 | |||
1634 | /* | ||
1635 | * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode | ||
1636 | * insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4. Sigh. | ||
1637 | */ | ||
1638 | if (vmx->vcpu.vcpu_id == 0) { | ||
1639 | vmcs_write16(GUEST_CS_SELECTOR, 0xf000); | ||
1640 | vmcs_writel(GUEST_CS_BASE, 0x000f0000); | ||
1641 | } else { | ||
1642 | vmcs_write16(GUEST_CS_SELECTOR, vmx->vcpu.arch.sipi_vector << 8); | ||
1643 | vmcs_writel(GUEST_CS_BASE, vmx->vcpu.arch.sipi_vector << 12); | ||
1635 | } | 1644 | } |
1645 | vmcs_write32(GUEST_CS_LIMIT, 0xffff); | ||
1646 | vmcs_write32(GUEST_CS_AR_BYTES, 0x9b); | ||
1647 | |||
1648 | seg_setup(VCPU_SREG_DS); | ||
1649 | seg_setup(VCPU_SREG_ES); | ||
1650 | seg_setup(VCPU_SREG_FS); | ||
1651 | seg_setup(VCPU_SREG_GS); | ||
1652 | seg_setup(VCPU_SREG_SS); | ||
1653 | |||
1654 | vmcs_write16(GUEST_TR_SELECTOR, 0); | ||
1655 | vmcs_writel(GUEST_TR_BASE, 0); | ||
1656 | vmcs_write32(GUEST_TR_LIMIT, 0xffff); | ||
1657 | vmcs_write32(GUEST_TR_AR_BYTES, 0x008b); | ||
1636 | 1658 | ||
1637 | flags = vmcs_readl(GUEST_RFLAGS); | 1659 | vmcs_write16(GUEST_LDTR_SELECTOR, 0); |
1638 | cs = vmcs_readl(GUEST_CS_BASE) >> 4; | 1660 | vmcs_writel(GUEST_LDTR_BASE, 0); |
1639 | ip = vmcs_readl(GUEST_RIP); | 1661 | vmcs_write32(GUEST_LDTR_LIMIT, 0xffff); |
1662 | vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082); | ||
1640 | 1663 | ||
1664 | vmcs_write32(GUEST_SYSENTER_CS, 0); | ||
1665 | vmcs_writel(GUEST_SYSENTER_ESP, 0); | ||
1666 | vmcs_writel(GUEST_SYSENTER_EIP, 0); | ||
1641 | 1667 | ||
1642 | if (emulator_write_emulated(ss_base + sp - 2, &flags, 2, vcpu) != X86EMUL_CONTINUE || | 1668 | vmcs_writel(GUEST_RFLAGS, 0x02); |
1643 | emulator_write_emulated(ss_base + sp - 4, &cs, 2, vcpu) != X86EMUL_CONTINUE || | 1669 | if (vmx->vcpu.vcpu_id == 0) |
1644 | emulator_write_emulated(ss_base + sp - 6, &ip, 2, vcpu) != X86EMUL_CONTINUE) { | 1670 | vmcs_writel(GUEST_RIP, 0xfff0); |
1645 | vcpu_printf(vcpu, "%s: write guest err\n", __FUNCTION__); | 1671 | else |
1646 | return; | 1672 | vmcs_writel(GUEST_RIP, 0); |
1673 | vmcs_writel(GUEST_RSP, 0); | ||
1674 | |||
1675 | /* todo: dr0 = dr1 = dr2 = dr3 = 0; dr6 = 0xffff0ff0 */ | ||
1676 | vmcs_writel(GUEST_DR7, 0x400); | ||
1677 | |||
1678 | vmcs_writel(GUEST_GDTR_BASE, 0); | ||
1679 | vmcs_write32(GUEST_GDTR_LIMIT, 0xffff); | ||
1680 | |||
1681 | vmcs_writel(GUEST_IDTR_BASE, 0); | ||
1682 | vmcs_write32(GUEST_IDTR_LIMIT, 0xffff); | ||
1683 | |||
1684 | vmcs_write32(GUEST_ACTIVITY_STATE, 0); | ||
1685 | vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0); | ||
1686 | vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0); | ||
1687 | |||
1688 | guest_write_tsc(0); | ||
1689 | |||
1690 | /* Special registers */ | ||
1691 | vmcs_write64(GUEST_IA32_DEBUGCTL, 0); | ||
1692 | |||
1693 | setup_msrs(vmx); | ||
1694 | |||
1695 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); /* 22.2.1 */ | ||
1696 | |||
1697 | if (cpu_has_vmx_tpr_shadow()) { | ||
1698 | vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0); | ||
1699 | if (vm_need_tpr_shadow(vmx->vcpu.kvm)) | ||
1700 | vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, | ||
1701 | page_to_phys(vmx->vcpu.arch.apic->regs_page)); | ||
1702 | vmcs_write32(TPR_THRESHOLD, 0); | ||
1647 | } | 1703 | } |
1648 | 1704 | ||
1649 | vmcs_writel(GUEST_RFLAGS, flags & | 1705 | if (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm)) |
1650 | ~( X86_EFLAGS_IF | X86_EFLAGS_AC | X86_EFLAGS_TF)); | 1706 | vmcs_write64(APIC_ACCESS_ADDR, |
1651 | vmcs_write16(GUEST_CS_SELECTOR, ent[1]) ; | 1707 | page_to_phys(vmx->vcpu.kvm->arch.apic_access_page)); |
1652 | vmcs_writel(GUEST_CS_BASE, ent[1] << 4); | 1708 | |
1653 | vmcs_writel(GUEST_RIP, ent[0]); | 1709 | vmx->vcpu.arch.cr0 = 0x60000010; |
1654 | vmcs_writel(GUEST_RSP, (vmcs_readl(GUEST_RSP) & ~0xffff) | (sp - 6)); | 1710 | vmx_set_cr0(&vmx->vcpu, vmx->vcpu.arch.cr0); /* enter rmode */ |
1711 | vmx_set_cr4(&vmx->vcpu, 0); | ||
1712 | #ifdef CONFIG_X86_64 | ||
1713 | vmx_set_efer(&vmx->vcpu, 0); | ||
1714 | #endif | ||
1715 | vmx_fpu_activate(&vmx->vcpu); | ||
1716 | update_exception_bitmap(&vmx->vcpu); | ||
1717 | |||
1718 | return 0; | ||
1719 | |||
1720 | out: | ||
1721 | return ret; | ||
1655 | } | 1722 | } |
1656 | 1723 | ||
1657 | static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq) | 1724 | static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq) |
1658 | { | 1725 | { |
1659 | if (vcpu->rmode.active) { | 1726 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
1660 | inject_rmode_irq(vcpu, irq); | 1727 | |
1728 | if (vcpu->arch.rmode.active) { | ||
1729 | vmx->rmode.irq.pending = true; | ||
1730 | vmx->rmode.irq.vector = irq; | ||
1731 | vmx->rmode.irq.rip = vmcs_readl(GUEST_RIP); | ||
1732 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, | ||
1733 | irq | INTR_TYPE_SOFT_INTR | INTR_INFO_VALID_MASK); | ||
1734 | vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1); | ||
1735 | vmcs_writel(GUEST_RIP, vmx->rmode.irq.rip - 1); | ||
1661 | return; | 1736 | return; |
1662 | } | 1737 | } |
1663 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, | 1738 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, |
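
The replacement for inject_rmode_irq() above leans on hardware instead of emulating the IVT push sequence byte by byte: RIP is wound back one byte, the VM-entry instruction length is set to 1, and the event is injected as a software interrupt (event type 4), so the CPU itself performs the real-mode INT semantics and resumes at the original RIP. A sketch of the interruption word (field layout per the SDM; vector and RIP values illustrative):

#include <stdint.h>
#include <stdio.h>

#define INTR_TYPE_SOFT_INTR  (4u << 8)   /* software interrupt */
#define INTR_INFO_VALID_MASK (1u << 31)

int main(void)
{
	unsigned irq = 0x08;        /* e.g. the PIT vector, for illustration */
	unsigned long rip = 0x2f34; /* guest RIP when the irq is injected */
	uint32_t word = irq | INTR_TYPE_SOFT_INTR | INTR_INFO_VALID_MASK;

	/* RIP - 1 plus instruction length 1 makes the injected "INT"
	 * appear to occupy one byte ending at the original RIP. */
	printf("intr info 0x%08x, entry rip 0x%lx, instr len 1\n",
	       word, rip - 1);
	return 0;
}
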
@@ -1666,13 +1741,13 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq) | |||
1666 | 1741 | ||
1667 | static void kvm_do_inject_irq(struct kvm_vcpu *vcpu) | 1742 | static void kvm_do_inject_irq(struct kvm_vcpu *vcpu) |
1668 | { | 1743 | { |
1669 | int word_index = __ffs(vcpu->irq_summary); | 1744 | int word_index = __ffs(vcpu->arch.irq_summary); |
1670 | int bit_index = __ffs(vcpu->irq_pending[word_index]); | 1745 | int bit_index = __ffs(vcpu->arch.irq_pending[word_index]); |
1671 | int irq = word_index * BITS_PER_LONG + bit_index; | 1746 | int irq = word_index * BITS_PER_LONG + bit_index; |
1672 | 1747 | ||
1673 | clear_bit(bit_index, &vcpu->irq_pending[word_index]); | 1748 | clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]); |
1674 | if (!vcpu->irq_pending[word_index]) | 1749 | if (!vcpu->arch.irq_pending[word_index]) |
1675 | clear_bit(word_index, &vcpu->irq_summary); | 1750 | clear_bit(word_index, &vcpu->arch.irq_summary); |
1676 | vmx_inject_irq(vcpu, irq); | 1751 | vmx_inject_irq(vcpu, irq); |
1677 | } | 1752 | } |
1678 | 1753 | ||
@@ -1682,12 +1757,12 @@ static void do_interrupt_requests(struct kvm_vcpu *vcpu, | |||
1682 | { | 1757 | { |
1683 | u32 cpu_based_vm_exec_control; | 1758 | u32 cpu_based_vm_exec_control; |
1684 | 1759 | ||
1685 | vcpu->interrupt_window_open = | 1760 | vcpu->arch.interrupt_window_open = |
1686 | ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) && | 1761 | ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) && |
1687 | (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0); | 1762 | (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0); |
1688 | 1763 | ||
1689 | if (vcpu->interrupt_window_open && | 1764 | if (vcpu->arch.interrupt_window_open && |
1690 | vcpu->irq_summary && | 1765 | vcpu->arch.irq_summary && |
1691 | !(vmcs_read32(VM_ENTRY_INTR_INFO_FIELD) & INTR_INFO_VALID_MASK)) | 1766 | !(vmcs_read32(VM_ENTRY_INTR_INFO_FIELD) & INTR_INFO_VALID_MASK)) |
1692 | /* | 1767 | /* |
1693 | * If interrupts enabled, and not blocked by sti or mov ss. Good. | 1768 | * If interrupts enabled, and not blocked by sti or mov ss. Good. |
@@ -1695,8 +1770,8 @@ static void do_interrupt_requests(struct kvm_vcpu *vcpu, | |||
1695 | kvm_do_inject_irq(vcpu); | 1770 | kvm_do_inject_irq(vcpu); |
1696 | 1771 | ||
1697 | cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); | 1772 | cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); |
1698 | if (!vcpu->interrupt_window_open && | 1773 | if (!vcpu->arch.interrupt_window_open && |
1699 | (vcpu->irq_summary || kvm_run->request_interrupt_window)) | 1774 | (vcpu->arch.irq_summary || kvm_run->request_interrupt_window)) |
1700 | /* | 1775 | /* |
1701 | * Interrupts blocked. Wait for unblock. | 1776 | * Interrupts blocked. Wait for unblock. |
1702 | */ | 1777 | */ |
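
do_interrupt_requests() derives interrupt_window_open from two pieces of guest state: RFLAGS.IF and the STI/MOV-SS interruptibility shadow in the low two bits of GUEST_INTERRUPTIBILITY_INFO. A standalone version of that predicate:

#include <stdio.h>

#define X86_EFLAGS_IF (1u << 9)

/* Mirrors the interrupt_window_open test above: IF must be set and
 * the guest must not sit in the STI / MOV-SS shadow. */
static int interrupt_window_open(unsigned long rflags,
				 unsigned int interruptibility)
{
	return (rflags & X86_EFLAGS_IF) && (interruptibility & 3) == 0;
}

int main(void)
{
	printf("%d\n", interrupt_window_open(X86_EFLAGS_IF, 0)); /* 1: open */
	printf("%d\n", interrupt_window_open(X86_EFLAGS_IF, 1)); /* 0: STI shadow */
	printf("%d\n", interrupt_window_open(0, 0));             /* 0: IF clear */
	return 0;
}
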
@@ -1706,6 +1781,23 @@ static void do_interrupt_requests(struct kvm_vcpu *vcpu, | |||
1706 | vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); | 1781 | vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); |
1707 | } | 1782 | } |
1708 | 1783 | ||
1784 | static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr) | ||
1785 | { | ||
1786 | int ret; | ||
1787 | struct kvm_userspace_memory_region tss_mem = { | ||
1788 | .slot = 8, | ||
1789 | .guest_phys_addr = addr, | ||
1790 | .memory_size = PAGE_SIZE * 3, | ||
1791 | .flags = 0, | ||
1792 | }; | ||
1793 | |||
1794 | ret = kvm_set_memory_region(kvm, &tss_mem, 0); | ||
1795 | if (ret) | ||
1796 | return ret; | ||
1797 | kvm->arch.tss_addr = addr; | ||
1798 | return 0; | ||
1799 | } | ||
1800 | |||
1709 | static void kvm_guest_debug_pre(struct kvm_vcpu *vcpu) | 1801 | static void kvm_guest_debug_pre(struct kvm_vcpu *vcpu) |
1710 | { | 1802 | { |
1711 | struct kvm_guest_debug *dbg = &vcpu->guest_debug; | 1803 | struct kvm_guest_debug *dbg = &vcpu->guest_debug; |
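
vmx_set_tss_addr() above is the kernel half of the KVM_SET_TSS_ADDR vm ioctl, reserving private slot 8 and three pages of guest physical space for the real-mode TSS. A minimal sketch of the userspace side (0xfffbd000 is a conventional choice just below 4 GiB, not mandated by the API):

#include <fcntl.h>
#include <linux/kvm.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>

int main(void)
{
	int kvm = open("/dev/kvm", O_RDWR);
	int vm;

	if (kvm < 0)
		return 1;
	vm = ioctl(kvm, KVM_CREATE_VM, 0);
	if (vm < 0)
		return 1;
	/* Any unused 3-page guest-physical range is acceptable. */
	if (ioctl(vm, KVM_SET_TSS_ADDR, 0xfffbd000UL) < 0)
		perror("KVM_SET_TSS_ADDR");
	close(vm);
	close(kvm);
	return 0;
}
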
@@ -1727,7 +1819,7 @@ static void kvm_guest_debug_pre(struct kvm_vcpu *vcpu) | |||
1727 | static int handle_rmode_exception(struct kvm_vcpu *vcpu, | 1819 | static int handle_rmode_exception(struct kvm_vcpu *vcpu, |
1728 | int vec, u32 err_code) | 1820 | int vec, u32 err_code) |
1729 | { | 1821 | { |
1730 | if (!vcpu->rmode.active) | 1822 | if (!vcpu->arch.rmode.active) |
1731 | return 0; | 1823 | return 0; |
1732 | 1824 | ||
1733 | /* | 1825 | /* |
@@ -1735,32 +1827,31 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu, | |||
1735 | * Cause the #SS fault with 0 error code in VM86 mode. | 1827 | * Cause the #SS fault with 0 error code in VM86 mode. |
1736 | */ | 1828 | */ |
1737 | if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) | 1829 | if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) |
1738 | if (emulate_instruction(vcpu, NULL, 0, 0) == EMULATE_DONE) | 1830 | if (emulate_instruction(vcpu, NULL, 0, 0, 0) == EMULATE_DONE) |
1739 | return 1; | 1831 | return 1; |
1740 | return 0; | 1832 | return 0; |
1741 | } | 1833 | } |
1742 | 1834 | ||
1743 | static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | 1835 | static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) |
1744 | { | 1836 | { |
1837 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
1745 | u32 intr_info, error_code; | 1838 | u32 intr_info, error_code; |
1746 | unsigned long cr2, rip; | 1839 | unsigned long cr2, rip; |
1747 | u32 vect_info; | 1840 | u32 vect_info; |
1748 | enum emulation_result er; | 1841 | enum emulation_result er; |
1749 | int r; | ||
1750 | 1842 | ||
1751 | vect_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); | 1843 | vect_info = vmx->idt_vectoring_info; |
1752 | intr_info = vmcs_read32(VM_EXIT_INTR_INFO); | 1844 | intr_info = vmcs_read32(VM_EXIT_INTR_INFO); |
1753 | 1845 | ||
1754 | if ((vect_info & VECTORING_INFO_VALID_MASK) && | 1846 | if ((vect_info & VECTORING_INFO_VALID_MASK) && |
1755 | !is_page_fault(intr_info)) { | 1847 | !is_page_fault(intr_info)) |
1756 | printk(KERN_ERR "%s: unexpected, vectoring info 0x%x " | 1848 | printk(KERN_ERR "%s: unexpected, vectoring info 0x%x " |
1757 | "intr info 0x%x\n", __FUNCTION__, vect_info, intr_info); | 1849 | "intr info 0x%x\n", __FUNCTION__, vect_info, intr_info); |
1758 | } | ||
1759 | 1850 | ||
1760 | if (!irqchip_in_kernel(vcpu->kvm) && is_external_interrupt(vect_info)) { | 1851 | if (!irqchip_in_kernel(vcpu->kvm) && is_external_interrupt(vect_info)) { |
1761 | int irq = vect_info & VECTORING_INFO_VECTOR_MASK; | 1852 | int irq = vect_info & VECTORING_INFO_VECTOR_MASK; |
1762 | set_bit(irq, vcpu->irq_pending); | 1853 | set_bit(irq, vcpu->arch.irq_pending); |
1763 | set_bit(irq / BITS_PER_LONG, &vcpu->irq_summary); | 1854 | set_bit(irq / BITS_PER_LONG, &vcpu->arch.irq_summary); |
1764 | } | 1855 | } |
1765 | 1856 | ||
1766 | if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200) /* nmi */ | 1857 | if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200) /* nmi */ |
@@ -1771,52 +1862,34 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
1771 | return 1; | 1862 | return 1; |
1772 | } | 1863 | } |
1773 | 1864 | ||
1865 | if (is_invalid_opcode(intr_info)) { | ||
1866 | er = emulate_instruction(vcpu, kvm_run, 0, 0, EMULTYPE_TRAP_UD); | ||
1867 | if (er != EMULATE_DONE) | ||
1868 | kvm_queue_exception(vcpu, UD_VECTOR); | ||
1869 | return 1; | ||
1870 | } | ||
1871 | |||
1774 | error_code = 0; | 1872 | error_code = 0; |
1775 | rip = vmcs_readl(GUEST_RIP); | 1873 | rip = vmcs_readl(GUEST_RIP); |
1776 | if (intr_info & INTR_INFO_DELIEVER_CODE_MASK) | 1874 | if (intr_info & INTR_INFO_DELIEVER_CODE_MASK) |
1777 | error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); | 1875 | error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); |
1778 | if (is_page_fault(intr_info)) { | 1876 | if (is_page_fault(intr_info)) { |
1779 | cr2 = vmcs_readl(EXIT_QUALIFICATION); | 1877 | cr2 = vmcs_readl(EXIT_QUALIFICATION); |
1780 | 1878 | return kvm_mmu_page_fault(vcpu, cr2, error_code); | |
1781 | mutex_lock(&vcpu->kvm->lock); | ||
1782 | r = kvm_mmu_page_fault(vcpu, cr2, error_code); | ||
1783 | if (r < 0) { | ||
1784 | mutex_unlock(&vcpu->kvm->lock); | ||
1785 | return r; | ||
1786 | } | ||
1787 | if (!r) { | ||
1788 | mutex_unlock(&vcpu->kvm->lock); | ||
1789 | return 1; | ||
1790 | } | ||
1791 | |||
1792 | er = emulate_instruction(vcpu, kvm_run, cr2, error_code); | ||
1793 | mutex_unlock(&vcpu->kvm->lock); | ||
1794 | |||
1795 | switch (er) { | ||
1796 | case EMULATE_DONE: | ||
1797 | return 1; | ||
1798 | case EMULATE_DO_MMIO: | ||
1799 | ++vcpu->stat.mmio_exits; | ||
1800 | return 0; | ||
1801 | case EMULATE_FAIL: | ||
1802 | kvm_report_emulation_failure(vcpu, "pagetable"); | ||
1803 | break; | ||
1804 | default: | ||
1805 | BUG(); | ||
1806 | } | ||
1807 | } | 1879 | } |
1808 | 1880 | ||
1809 | if (vcpu->rmode.active && | 1881 | if (vcpu->arch.rmode.active && |
1810 | handle_rmode_exception(vcpu, intr_info & INTR_INFO_VECTOR_MASK, | 1882 | handle_rmode_exception(vcpu, intr_info & INTR_INFO_VECTOR_MASK, |
1811 | error_code)) { | 1883 | error_code)) { |
1812 | if (vcpu->halt_request) { | 1884 | if (vcpu->arch.halt_request) { |
1813 | vcpu->halt_request = 0; | 1885 | vcpu->arch.halt_request = 0; |
1814 | return kvm_emulate_halt(vcpu); | 1886 | return kvm_emulate_halt(vcpu); |
1815 | } | 1887 | } |
1816 | return 1; | 1888 | return 1; |
1817 | } | 1889 | } |
1818 | 1890 | ||
1819 | if ((intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK)) == (INTR_TYPE_EXCEPTION | 1)) { | 1891 | if ((intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK)) == |
1892 | (INTR_TYPE_EXCEPTION | 1)) { | ||
1820 | kvm_run->exit_reason = KVM_EXIT_DEBUG; | 1893 | kvm_run->exit_reason = KVM_EXIT_DEBUG; |
1821 | return 0; | 1894 | return 0; |
1822 | } | 1895 | } |
@@ -1850,7 +1923,8 @@ static int handle_io(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
1850 | string = (exit_qualification & 16) != 0; | 1923 | string = (exit_qualification & 16) != 0; |
1851 | 1924 | ||
1852 | if (string) { | 1925 | if (string) { |
1853 | if (emulate_instruction(vcpu, kvm_run, 0, 0) == EMULATE_DO_MMIO) | 1926 | if (emulate_instruction(vcpu, |
1927 | kvm_run, 0, 0, 0) == EMULATE_DO_MMIO) | ||
1854 | return 0; | 1928 | return 0; |
1855 | return 1; | 1929 | return 1; |
1856 | } | 1930 | } |
@@ -1873,7 +1947,6 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall) | |||
1873 | hypercall[0] = 0x0f; | 1947 | hypercall[0] = 0x0f; |
1874 | hypercall[1] = 0x01; | 1948 | hypercall[1] = 0x01; |
1875 | hypercall[2] = 0xc1; | 1949 | hypercall[2] = 0xc1; |
1876 | hypercall[3] = 0xc3; | ||
1877 | } | 1950 | } |
1878 | 1951 | ||
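For reference, the three bytes left behind in vmx_patch_hypercall() are exactly the VMCALL encoding; the dropped fourth byte 0xc3 was a RET, presumably unneeded now that handle_vmcall() routes through kvm_emulate_hypercall() and the patched sequence executes inline. A one-line illustration (the array name is made up):

	/* 0f 01 c1 encodes VMCALL; the removed trailing 0xc3 was RET */
	static const unsigned char vmcall_insn[3] = { 0x0f, 0x01, 0xc1 };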
1879 | static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | 1952 | static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) |
@@ -1890,23 +1963,25 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
1890 | switch (cr) { | 1963 | switch (cr) { |
1891 | case 0: | 1964 | case 0: |
1892 | vcpu_load_rsp_rip(vcpu); | 1965 | vcpu_load_rsp_rip(vcpu); |
1893 | set_cr0(vcpu, vcpu->regs[reg]); | 1966 | set_cr0(vcpu, vcpu->arch.regs[reg]); |
1894 | skip_emulated_instruction(vcpu); | 1967 | skip_emulated_instruction(vcpu); |
1895 | return 1; | 1968 | return 1; |
1896 | case 3: | 1969 | case 3: |
1897 | vcpu_load_rsp_rip(vcpu); | 1970 | vcpu_load_rsp_rip(vcpu); |
1898 | set_cr3(vcpu, vcpu->regs[reg]); | 1971 | set_cr3(vcpu, vcpu->arch.regs[reg]); |
1899 | skip_emulated_instruction(vcpu); | 1972 | skip_emulated_instruction(vcpu); |
1900 | return 1; | 1973 | return 1; |
1901 | case 4: | 1974 | case 4: |
1902 | vcpu_load_rsp_rip(vcpu); | 1975 | vcpu_load_rsp_rip(vcpu); |
1903 | set_cr4(vcpu, vcpu->regs[reg]); | 1976 | set_cr4(vcpu, vcpu->arch.regs[reg]); |
1904 | skip_emulated_instruction(vcpu); | 1977 | skip_emulated_instruction(vcpu); |
1905 | return 1; | 1978 | return 1; |
1906 | case 8: | 1979 | case 8: |
1907 | vcpu_load_rsp_rip(vcpu); | 1980 | vcpu_load_rsp_rip(vcpu); |
1908 | set_cr8(vcpu, vcpu->regs[reg]); | 1981 | set_cr8(vcpu, vcpu->arch.regs[reg]); |
1909 | skip_emulated_instruction(vcpu); | 1982 | skip_emulated_instruction(vcpu); |
1983 | if (irqchip_in_kernel(vcpu->kvm)) | ||
1984 | return 1; | ||
1910 | kvm_run->exit_reason = KVM_EXIT_SET_TPR; | 1985 | kvm_run->exit_reason = KVM_EXIT_SET_TPR; |
1911 | return 0; | 1986 | return 0; |
1912 | }; | 1987 | }; |
@@ -1914,8 +1989,8 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
1914 | case 2: /* clts */ | 1989 | case 2: /* clts */ |
1915 | vcpu_load_rsp_rip(vcpu); | 1990 | vcpu_load_rsp_rip(vcpu); |
1916 | vmx_fpu_deactivate(vcpu); | 1991 | vmx_fpu_deactivate(vcpu); |
1917 | vcpu->cr0 &= ~X86_CR0_TS; | 1992 | vcpu->arch.cr0 &= ~X86_CR0_TS; |
1918 | vmcs_writel(CR0_READ_SHADOW, vcpu->cr0); | 1993 | vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0); |
1919 | vmx_fpu_activate(vcpu); | 1994 | vmx_fpu_activate(vcpu); |
1920 | skip_emulated_instruction(vcpu); | 1995 | skip_emulated_instruction(vcpu); |
1921 | return 1; | 1996 | return 1; |
@@ -1923,13 +1998,13 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
1923 | switch (cr) { | 1998 | switch (cr) { |
1924 | case 3: | 1999 | case 3: |
1925 | vcpu_load_rsp_rip(vcpu); | 2000 | vcpu_load_rsp_rip(vcpu); |
1926 | vcpu->regs[reg] = vcpu->cr3; | 2001 | vcpu->arch.regs[reg] = vcpu->arch.cr3; |
1927 | vcpu_put_rsp_rip(vcpu); | 2002 | vcpu_put_rsp_rip(vcpu); |
1928 | skip_emulated_instruction(vcpu); | 2003 | skip_emulated_instruction(vcpu); |
1929 | return 1; | 2004 | return 1; |
1930 | case 8: | 2005 | case 8: |
1931 | vcpu_load_rsp_rip(vcpu); | 2006 | vcpu_load_rsp_rip(vcpu); |
1932 | vcpu->regs[reg] = get_cr8(vcpu); | 2007 | vcpu->arch.regs[reg] = get_cr8(vcpu); |
1933 | vcpu_put_rsp_rip(vcpu); | 2008 | vcpu_put_rsp_rip(vcpu); |
1934 | skip_emulated_instruction(vcpu); | 2009 | skip_emulated_instruction(vcpu); |
1935 | return 1; | 2010 | return 1; |
@@ -1975,7 +2050,7 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
1975 | default: | 2050 | default: |
1976 | val = 0; | 2051 | val = 0; |
1977 | } | 2052 | } |
1978 | vcpu->regs[reg] = val; | 2053 | vcpu->arch.regs[reg] = val; |
1979 | } else { | 2054 | } else { |
1980 | /* mov to dr */ | 2055 | /* mov to dr */ |
1981 | } | 2056 | } |
@@ -1992,29 +2067,29 @@ static int handle_cpuid(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
1992 | 2067 | ||
1993 | static int handle_rdmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | 2068 | static int handle_rdmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) |
1994 | { | 2069 | { |
1995 | u32 ecx = vcpu->regs[VCPU_REGS_RCX]; | 2070 | u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX]; |
1996 | u64 data; | 2071 | u64 data; |
1997 | 2072 | ||
1998 | if (vmx_get_msr(vcpu, ecx, &data)) { | 2073 | if (vmx_get_msr(vcpu, ecx, &data)) { |
1999 | vmx_inject_gp(vcpu, 0); | 2074 | kvm_inject_gp(vcpu, 0); |
2000 | return 1; | 2075 | return 1; |
2001 | } | 2076 | } |
2002 | 2077 | ||
2003 | /* FIXME: handling of bits 32:63 of rax, rdx */ | 2078 | /* FIXME: handling of bits 32:63 of rax, rdx */ |
2004 | vcpu->regs[VCPU_REGS_RAX] = data & -1u; | 2079 | vcpu->arch.regs[VCPU_REGS_RAX] = data & -1u; |
2005 | vcpu->regs[VCPU_REGS_RDX] = (data >> 32) & -1u; | 2080 | vcpu->arch.regs[VCPU_REGS_RDX] = (data >> 32) & -1u; |
2006 | skip_emulated_instruction(vcpu); | 2081 | skip_emulated_instruction(vcpu); |
2007 | return 1; | 2082 | return 1; |
2008 | } | 2083 | } |
2009 | 2084 | ||
2010 | static int handle_wrmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | 2085 | static int handle_wrmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) |
2011 | { | 2086 | { |
2012 | u32 ecx = vcpu->regs[VCPU_REGS_RCX]; | 2087 | u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX]; |
2013 | u64 data = (vcpu->regs[VCPU_REGS_RAX] & -1u) | 2088 | u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u) |
2014 | | ((u64)(vcpu->regs[VCPU_REGS_RDX] & -1u) << 32); | 2089 | | ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32); |
2015 | 2090 | ||
2016 | if (vmx_set_msr(vcpu, ecx, data) != 0) { | 2091 | if (vmx_set_msr(vcpu, ecx, data) != 0) { |
2017 | vmx_inject_gp(vcpu, 0); | 2092 | kvm_inject_gp(vcpu, 0); |
2018 | return 1; | 2093 | return 1; |
2019 | } | 2094 | } |
2020 | 2095 | ||
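The rdmsr/wrmsr handlers above move a 64-bit MSR value through the EDX:EAX pair, low half in EAX, high half in EDX. A self-contained sketch of the same packing (names are illustrative):

	#include <stdint.h>
	#include <assert.h>

	int main(void)
	{
		uint64_t data = 0x1122334455667788ULL;
		uint32_t eax = (uint32_t)(data & 0xffffffffu); /* low 32 bits */
		uint32_t edx = (uint32_t)(data >> 32);         /* high 32 bits */

		/* round trip, as handle_wrmsr reassembles it */
		assert((((uint64_t)edx << 32) | eax) == data);
		return 0;
	}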
@@ -2042,7 +2117,7 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu, | |||
2042 | * possible | 2117 | * possible |
2043 | */ | 2118 | */ |
2044 | if (kvm_run->request_interrupt_window && | 2119 | if (kvm_run->request_interrupt_window && |
2045 | !vcpu->irq_summary) { | 2120 | !vcpu->arch.irq_summary) { |
2046 | kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; | 2121 | kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; |
2047 | ++vcpu->stat.irq_window_exits; | 2122 | ++vcpu->stat.irq_window_exits; |
2048 | return 0; | 2123 | return 0; |
@@ -2059,7 +2134,35 @@ static int handle_halt(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
2059 | static int handle_vmcall(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | 2134 | static int handle_vmcall(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) |
2060 | { | 2135 | { |
2061 | skip_emulated_instruction(vcpu); | 2136 | skip_emulated_instruction(vcpu); |
2062 | return kvm_hypercall(vcpu, kvm_run); | 2137 | kvm_emulate_hypercall(vcpu); |
2138 | return 1; | ||
2139 | } | ||
2140 | |||
2141 | static int handle_wbinvd(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | ||
2142 | { | ||
2143 | skip_emulated_instruction(vcpu); | ||
2144 | /* TODO: Add support for VT-d/pass-through device */ | ||
2145 | return 1; | ||
2146 | } | ||
2147 | |||
2148 | static int handle_apic_access(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | ||
2149 | { | ||
2150 | u64 exit_qualification; | ||
2151 | enum emulation_result er; | ||
2152 | unsigned long offset; | ||
2153 | |||
2154 | exit_qualification = vmcs_read64(EXIT_QUALIFICATION); | ||
2155 | offset = exit_qualification & 0xffful; | ||
2156 | |||
2157 | er = emulate_instruction(vcpu, kvm_run, 0, 0, 0); | ||
2158 | |||
2159 | if (er != EMULATE_DONE) { | ||
2160 | printk(KERN_ERR | ||
2161 | "Fail to handle apic access vmexit! Offset is 0x%lx\n", | ||
2162 | offset); | ||
2163 | return -ENOTSUPP; | ||
2164 | } | ||
2165 | return 1; | ||
2063 | } | 2166 | } |
2064 | 2167 | ||
2065 | /* | 2168 | /* |
@@ -2081,7 +2184,9 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu, | |||
2081 | [EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window, | 2184 | [EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window, |
2082 | [EXIT_REASON_HLT] = handle_halt, | 2185 | [EXIT_REASON_HLT] = handle_halt, |
2083 | [EXIT_REASON_VMCALL] = handle_vmcall, | 2186 | [EXIT_REASON_VMCALL] = handle_vmcall, |
2084 | [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold | 2187 | [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold, |
2188 | [EXIT_REASON_APIC_ACCESS] = handle_apic_access, | ||
2189 | [EXIT_REASON_WBINVD] = handle_wbinvd, | ||
2085 | }; | 2190 | }; |
2086 | 2191 | ||
2087 | static const int kvm_vmx_max_exit_handlers = | 2192 | static const int kvm_vmx_max_exit_handlers = |
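kvm_vmx_exit_handlers is a sparse designated-initializer table indexed directly by the hardware exit reason; kvm_handle_exit() below bounds-checks the reason and bails out when the slot is empty. A minimal stand-alone sketch of the idiom, with hypothetical handlers (the two index values match EXIT_REASON_HLT and EXIT_REASON_IO_INSTRUCTION in vmx.h):

	#include <stdio.h>

	typedef int (*exit_handler_t)(void);

	static int demo_halt(void) { puts("hlt"); return 1; }
	static int demo_io(void)   { puts("io");  return 1; }

	static exit_handler_t demo_handlers[] = {
		[12] = demo_halt,	/* EXIT_REASON_HLT */
		[30] = demo_io,		/* EXIT_REASON_IO_INSTRUCTION */
	};

	static int demo_dispatch(unsigned int reason)
	{
		if (reason < sizeof(demo_handlers) / sizeof(demo_handlers[0])
		    && demo_handlers[reason])
			return demo_handlers[reason]();
		return -1;		/* unhandled exit reason */
	}

	int main(void)
	{
		return demo_dispatch(12) == 1 ? 0 : 1;
	}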
@@ -2093,9 +2198,9 @@ static const int kvm_vmx_max_exit_handlers = | |||
2093 | */ | 2198 | */ |
2094 | static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) | 2199 | static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) |
2095 | { | 2200 | { |
2096 | u32 vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); | ||
2097 | u32 exit_reason = vmcs_read32(VM_EXIT_REASON); | 2201 | u32 exit_reason = vmcs_read32(VM_EXIT_REASON); |
2098 | struct vcpu_vmx *vmx = to_vmx(vcpu); | 2202 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
2203 | u32 vectoring_info = vmx->idt_vectoring_info; | ||
2099 | 2204 | ||
2100 | if (unlikely(vmx->fail)) { | 2205 | if (unlikely(vmx->fail)) { |
2101 | kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; | 2206 | kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; |
@@ -2104,8 +2209,8 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) | |||
2104 | return 0; | 2209 | return 0; |
2105 | } | 2210 | } |
2106 | 2211 | ||
2107 | if ( (vectoring_info & VECTORING_INFO_VALID_MASK) && | 2212 | if ((vectoring_info & VECTORING_INFO_VALID_MASK) && |
2108 | exit_reason != EXIT_REASON_EXCEPTION_NMI ) | 2213 | exit_reason != EXIT_REASON_EXCEPTION_NMI) |
2109 | printk(KERN_WARNING "%s: unexpected, valid vectoring info and " | 2214 | printk(KERN_WARNING "%s: unexpected, valid vectoring info and " |
2110 | "exit reason is 0x%x\n", __FUNCTION__, exit_reason); | 2215 | "exit reason is 0x%x\n", __FUNCTION__, exit_reason); |
2111 | if (exit_reason < kvm_vmx_max_exit_handlers | 2216 | if (exit_reason < kvm_vmx_max_exit_handlers |
@@ -2150,26 +2255,38 @@ static void enable_irq_window(struct kvm_vcpu *vcpu) | |||
2150 | 2255 | ||
2151 | static void vmx_intr_assist(struct kvm_vcpu *vcpu) | 2256 | static void vmx_intr_assist(struct kvm_vcpu *vcpu) |
2152 | { | 2257 | { |
2258 | struct vcpu_vmx *vmx = to_vmx(vcpu); | ||
2153 | u32 idtv_info_field, intr_info_field; | 2259 | u32 idtv_info_field, intr_info_field; |
2154 | int has_ext_irq, interrupt_window_open; | 2260 | int has_ext_irq, interrupt_window_open; |
2155 | int vector; | 2261 | int vector; |
2156 | 2262 | ||
2157 | kvm_inject_pending_timer_irqs(vcpu); | ||
2158 | update_tpr_threshold(vcpu); | 2263 | update_tpr_threshold(vcpu); |
2159 | 2264 | ||
2160 | has_ext_irq = kvm_cpu_has_interrupt(vcpu); | 2265 | has_ext_irq = kvm_cpu_has_interrupt(vcpu); |
2161 | intr_info_field = vmcs_read32(VM_ENTRY_INTR_INFO_FIELD); | 2266 | intr_info_field = vmcs_read32(VM_ENTRY_INTR_INFO_FIELD); |
2162 | idtv_info_field = vmcs_read32(IDT_VECTORING_INFO_FIELD); | 2267 | idtv_info_field = vmx->idt_vectoring_info; |
2163 | if (intr_info_field & INTR_INFO_VALID_MASK) { | 2268 | if (intr_info_field & INTR_INFO_VALID_MASK) { |
2164 | if (idtv_info_field & INTR_INFO_VALID_MASK) { | 2269 | if (idtv_info_field & INTR_INFO_VALID_MASK) { |
2165 | /* TODO: fault when IDT_Vectoring */ | 2270 | /* TODO: fault when IDT_Vectoring */ |
2166 | printk(KERN_ERR "Fault when IDT_Vectoring\n"); | 2271 | if (printk_ratelimit()) |
2272 | printk(KERN_ERR "Fault when IDT_Vectoring\n"); | ||
2167 | } | 2273 | } |
2168 | if (has_ext_irq) | 2274 | if (has_ext_irq) |
2169 | enable_irq_window(vcpu); | 2275 | enable_irq_window(vcpu); |
2170 | return; | 2276 | return; |
2171 | } | 2277 | } |
2172 | if (unlikely(idtv_info_field & INTR_INFO_VALID_MASK)) { | 2278 | if (unlikely(idtv_info_field & INTR_INFO_VALID_MASK)) { |
2279 | if ((idtv_info_field & VECTORING_INFO_TYPE_MASK) | ||
2280 | == INTR_TYPE_EXT_INTR | ||
2281 | && vcpu->arch.rmode.active) { | ||
2282 | u8 vect = idtv_info_field & VECTORING_INFO_VECTOR_MASK; | ||
2283 | |||
2284 | vmx_inject_irq(vcpu, vect); | ||
2285 | if (unlikely(has_ext_irq)) | ||
2286 | enable_irq_window(vcpu); | ||
2287 | return; | ||
2288 | } | ||
2289 | |||
2173 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, idtv_info_field); | 2290 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, idtv_info_field); |
2174 | vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, | 2291 | vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, |
2175 | vmcs_read32(VM_EXIT_INSTRUCTION_LEN)); | 2292 | vmcs_read32(VM_EXIT_INSTRUCTION_LEN)); |
@@ -2194,6 +2311,29 @@ static void vmx_intr_assist(struct kvm_vcpu *vcpu) | |||
2194 | enable_irq_window(vcpu); | 2311 | enable_irq_window(vcpu); |
2195 | } | 2312 | } |
2196 | 2313 | ||
2314 | /* | ||
2315 | * Failure to inject an interrupt should give us the information | ||
2316 | * in IDT_VECTORING_INFO_FIELD. However, if the failure occurs | ||
2317 | * when fetching the interrupt redirection bitmap in the real-mode | ||
2318 | * tss, this doesn't happen. So we do it ourselves. | ||
2319 | */ | ||
2320 | static void fixup_rmode_irq(struct vcpu_vmx *vmx) | ||
2321 | { | ||
2322 | vmx->rmode.irq.pending = 0; | ||
2323 | if (vmcs_readl(GUEST_RIP) + 1 != vmx->rmode.irq.rip) | ||
2324 | return; | ||
2325 | vmcs_writel(GUEST_RIP, vmx->rmode.irq.rip); | ||
2326 | if (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK) { | ||
2327 | vmx->idt_vectoring_info &= ~VECTORING_INFO_TYPE_MASK; | ||
2328 | vmx->idt_vectoring_info |= INTR_TYPE_EXT_INTR; | ||
2329 | return; | ||
2330 | } | ||
2331 | vmx->idt_vectoring_info = | ||
2332 | VECTORING_INFO_VALID_MASK | ||
2333 | | INTR_TYPE_EXT_INTR | ||
2334 | | vmx->rmode.irq.vector; | ||
2335 | } | ||
2336 | |||
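The word fixup_rmode_irq() synthesizes is simply valid-bit | type | vector, following the interruption-information layout in vmx.h (bit 31 valid, bits 10:8 type, bits 7:0 vector). A stand-alone illustration with an arbitrary example vector:

	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		uint32_t valid    = 1u << 31; /* VECTORING_INFO_VALID_MASK */
		uint32_t ext_intr = 0u << 8;  /* INTR_TYPE_EXT_INTR */
		uint8_t  vector   = 0x20;     /* e.g. IRQ0 after PIC remap */

		printf("idt_vectoring_info = %#x\n",
		       (unsigned)(valid | ext_intr | vector));
		return 0;
	}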
2197 | static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | 2337 | static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) |
2198 | { | 2338 | { |
2199 | struct vcpu_vmx *vmx = to_vmx(vcpu); | 2339 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
@@ -2204,50 +2344,47 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
2204 | */ | 2344 | */ |
2205 | vmcs_writel(HOST_CR0, read_cr0()); | 2345 | vmcs_writel(HOST_CR0, read_cr0()); |
2206 | 2346 | ||
2207 | asm ( | 2347 | asm( |
2208 | /* Store host registers */ | 2348 | /* Store host registers */ |
2209 | #ifdef CONFIG_X86_64 | 2349 | #ifdef CONFIG_X86_64 |
2210 | "push %%rax; push %%rbx; push %%rdx;" | 2350 | "push %%rdx; push %%rbp;" |
2211 | "push %%rsi; push %%rdi; push %%rbp;" | ||
2212 | "push %%r8; push %%r9; push %%r10; push %%r11;" | ||
2213 | "push %%r12; push %%r13; push %%r14; push %%r15;" | ||
2214 | "push %%rcx \n\t" | 2351 | "push %%rcx \n\t" |
2215 | ASM_VMX_VMWRITE_RSP_RDX "\n\t" | ||
2216 | #else | 2352 | #else |
2217 | "pusha; push %%ecx \n\t" | 2353 | "push %%edx; push %%ebp;" |
2218 | ASM_VMX_VMWRITE_RSP_RDX "\n\t" | 2354 | "push %%ecx \n\t" |
2219 | #endif | 2355 | #endif |
2356 | ASM_VMX_VMWRITE_RSP_RDX "\n\t" | ||
2220 | /* Check if vmlaunch or vmresume is needed */ | 2357 | /* Check if vmlaunch or vmresume is needed */ |
2221 | "cmp $0, %1 \n\t" | 2358 | "cmpl $0, %c[launched](%0) \n\t" |
2222 | /* Load guest registers. Don't clobber flags. */ | 2359 | /* Load guest registers. Don't clobber flags. */ |
2223 | #ifdef CONFIG_X86_64 | 2360 | #ifdef CONFIG_X86_64 |
2224 | "mov %c[cr2](%3), %%rax \n\t" | 2361 | "mov %c[cr2](%0), %%rax \n\t" |
2225 | "mov %%rax, %%cr2 \n\t" | 2362 | "mov %%rax, %%cr2 \n\t" |
2226 | "mov %c[rax](%3), %%rax \n\t" | 2363 | "mov %c[rax](%0), %%rax \n\t" |
2227 | "mov %c[rbx](%3), %%rbx \n\t" | 2364 | "mov %c[rbx](%0), %%rbx \n\t" |
2228 | "mov %c[rdx](%3), %%rdx \n\t" | 2365 | "mov %c[rdx](%0), %%rdx \n\t" |
2229 | "mov %c[rsi](%3), %%rsi \n\t" | 2366 | "mov %c[rsi](%0), %%rsi \n\t" |
2230 | "mov %c[rdi](%3), %%rdi \n\t" | 2367 | "mov %c[rdi](%0), %%rdi \n\t" |
2231 | "mov %c[rbp](%3), %%rbp \n\t" | 2368 | "mov %c[rbp](%0), %%rbp \n\t" |
2232 | "mov %c[r8](%3), %%r8 \n\t" | 2369 | "mov %c[r8](%0), %%r8 \n\t" |
2233 | "mov %c[r9](%3), %%r9 \n\t" | 2370 | "mov %c[r9](%0), %%r9 \n\t" |
2234 | "mov %c[r10](%3), %%r10 \n\t" | 2371 | "mov %c[r10](%0), %%r10 \n\t" |
2235 | "mov %c[r11](%3), %%r11 \n\t" | 2372 | "mov %c[r11](%0), %%r11 \n\t" |
2236 | "mov %c[r12](%3), %%r12 \n\t" | 2373 | "mov %c[r12](%0), %%r12 \n\t" |
2237 | "mov %c[r13](%3), %%r13 \n\t" | 2374 | "mov %c[r13](%0), %%r13 \n\t" |
2238 | "mov %c[r14](%3), %%r14 \n\t" | 2375 | "mov %c[r14](%0), %%r14 \n\t" |
2239 | "mov %c[r15](%3), %%r15 \n\t" | 2376 | "mov %c[r15](%0), %%r15 \n\t" |
2240 | "mov %c[rcx](%3), %%rcx \n\t" /* kills %3 (rcx) */ | 2377 | "mov %c[rcx](%0), %%rcx \n\t" /* kills %0 (rcx) */ |
2241 | #else | 2378 | #else |
2242 | "mov %c[cr2](%3), %%eax \n\t" | 2379 | "mov %c[cr2](%0), %%eax \n\t" |
2243 | "mov %%eax, %%cr2 \n\t" | 2380 | "mov %%eax, %%cr2 \n\t" |
2244 | "mov %c[rax](%3), %%eax \n\t" | 2381 | "mov %c[rax](%0), %%eax \n\t" |
2245 | "mov %c[rbx](%3), %%ebx \n\t" | 2382 | "mov %c[rbx](%0), %%ebx \n\t" |
2246 | "mov %c[rdx](%3), %%edx \n\t" | 2383 | "mov %c[rdx](%0), %%edx \n\t" |
2247 | "mov %c[rsi](%3), %%esi \n\t" | 2384 | "mov %c[rsi](%0), %%esi \n\t" |
2248 | "mov %c[rdi](%3), %%edi \n\t" | 2385 | "mov %c[rdi](%0), %%edi \n\t" |
2249 | "mov %c[rbp](%3), %%ebp \n\t" | 2386 | "mov %c[rbp](%0), %%ebp \n\t" |
2250 | "mov %c[rcx](%3), %%ecx \n\t" /* kills %3 (ecx) */ | 2387 | "mov %c[rcx](%0), %%ecx \n\t" /* kills %0 (ecx) */ |
2251 | #endif | 2388 | #endif |
2252 | /* Enter guest mode */ | 2389 | /* Enter guest mode */ |
2253 | "jne .Llaunched \n\t" | 2390 | "jne .Llaunched \n\t" |
@@ -2257,72 +2394,79 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
2257 | ".Lkvm_vmx_return: " | 2394 | ".Lkvm_vmx_return: " |
2258 | /* Save guest registers, load host registers, keep flags */ | 2395 | /* Save guest registers, load host registers, keep flags */ |
2259 | #ifdef CONFIG_X86_64 | 2396 | #ifdef CONFIG_X86_64 |
2260 | "xchg %3, (%%rsp) \n\t" | 2397 | "xchg %0, (%%rsp) \n\t" |
2261 | "mov %%rax, %c[rax](%3) \n\t" | 2398 | "mov %%rax, %c[rax](%0) \n\t" |
2262 | "mov %%rbx, %c[rbx](%3) \n\t" | 2399 | "mov %%rbx, %c[rbx](%0) \n\t" |
2263 | "pushq (%%rsp); popq %c[rcx](%3) \n\t" | 2400 | "pushq (%%rsp); popq %c[rcx](%0) \n\t" |
2264 | "mov %%rdx, %c[rdx](%3) \n\t" | 2401 | "mov %%rdx, %c[rdx](%0) \n\t" |
2265 | "mov %%rsi, %c[rsi](%3) \n\t" | 2402 | "mov %%rsi, %c[rsi](%0) \n\t" |
2266 | "mov %%rdi, %c[rdi](%3) \n\t" | 2403 | "mov %%rdi, %c[rdi](%0) \n\t" |
2267 | "mov %%rbp, %c[rbp](%3) \n\t" | 2404 | "mov %%rbp, %c[rbp](%0) \n\t" |
2268 | "mov %%r8, %c[r8](%3) \n\t" | 2405 | "mov %%r8, %c[r8](%0) \n\t" |
2269 | "mov %%r9, %c[r9](%3) \n\t" | 2406 | "mov %%r9, %c[r9](%0) \n\t" |
2270 | "mov %%r10, %c[r10](%3) \n\t" | 2407 | "mov %%r10, %c[r10](%0) \n\t" |
2271 | "mov %%r11, %c[r11](%3) \n\t" | 2408 | "mov %%r11, %c[r11](%0) \n\t" |
2272 | "mov %%r12, %c[r12](%3) \n\t" | 2409 | "mov %%r12, %c[r12](%0) \n\t" |
2273 | "mov %%r13, %c[r13](%3) \n\t" | 2410 | "mov %%r13, %c[r13](%0) \n\t" |
2274 | "mov %%r14, %c[r14](%3) \n\t" | 2411 | "mov %%r14, %c[r14](%0) \n\t" |
2275 | "mov %%r15, %c[r15](%3) \n\t" | 2412 | "mov %%r15, %c[r15](%0) \n\t" |
2276 | "mov %%cr2, %%rax \n\t" | 2413 | "mov %%cr2, %%rax \n\t" |
2277 | "mov %%rax, %c[cr2](%3) \n\t" | 2414 | "mov %%rax, %c[cr2](%0) \n\t" |
2278 | "mov (%%rsp), %3 \n\t" | ||
2279 | 2415 | ||
2280 | "pop %%rcx; pop %%r15; pop %%r14; pop %%r13; pop %%r12;" | 2416 | "pop %%rbp; pop %%rbp; pop %%rdx \n\t" |
2281 | "pop %%r11; pop %%r10; pop %%r9; pop %%r8;" | ||
2282 | "pop %%rbp; pop %%rdi; pop %%rsi;" | ||
2283 | "pop %%rdx; pop %%rbx; pop %%rax \n\t" | ||
2284 | #else | 2417 | #else |
2285 | "xchg %3, (%%esp) \n\t" | 2418 | "xchg %0, (%%esp) \n\t" |
2286 | "mov %%eax, %c[rax](%3) \n\t" | 2419 | "mov %%eax, %c[rax](%0) \n\t" |
2287 | "mov %%ebx, %c[rbx](%3) \n\t" | 2420 | "mov %%ebx, %c[rbx](%0) \n\t" |
2288 | "pushl (%%esp); popl %c[rcx](%3) \n\t" | 2421 | "pushl (%%esp); popl %c[rcx](%0) \n\t" |
2289 | "mov %%edx, %c[rdx](%3) \n\t" | 2422 | "mov %%edx, %c[rdx](%0) \n\t" |
2290 | "mov %%esi, %c[rsi](%3) \n\t" | 2423 | "mov %%esi, %c[rsi](%0) \n\t" |
2291 | "mov %%edi, %c[rdi](%3) \n\t" | 2424 | "mov %%edi, %c[rdi](%0) \n\t" |
2292 | "mov %%ebp, %c[rbp](%3) \n\t" | 2425 | "mov %%ebp, %c[rbp](%0) \n\t" |
2293 | "mov %%cr2, %%eax \n\t" | 2426 | "mov %%cr2, %%eax \n\t" |
2294 | "mov %%eax, %c[cr2](%3) \n\t" | 2427 | "mov %%eax, %c[cr2](%0) \n\t" |
2295 | "mov (%%esp), %3 \n\t" | ||
2296 | 2428 | ||
2297 | "pop %%ecx; popa \n\t" | 2429 | "pop %%ebp; pop %%ebp; pop %%edx \n\t" |
2430 | #endif | ||
2431 | "setbe %c[fail](%0) \n\t" | ||
2432 | : : "c"(vmx), "d"((unsigned long)HOST_RSP), | ||
2433 | [launched]"i"(offsetof(struct vcpu_vmx, launched)), | ||
2434 | [fail]"i"(offsetof(struct vcpu_vmx, fail)), | ||
2435 | [rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])), | ||
2436 | [rbx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBX])), | ||
2437 | [rcx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RCX])), | ||
2438 | [rdx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDX])), | ||
2439 | [rsi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RSI])), | ||
2440 | [rdi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDI])), | ||
2441 | [rbp]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBP])), | ||
2442 | #ifdef CONFIG_X86_64 | ||
2443 | [r8]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R8])), | ||
2444 | [r9]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R9])), | ||
2445 | [r10]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R10])), | ||
2446 | [r11]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R11])), | ||
2447 | [r12]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R12])), | ||
2448 | [r13]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R13])), | ||
2449 | [r14]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R14])), | ||
2450 | [r15]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R15])), | ||
2298 | #endif | 2451 | #endif |
2299 | "setbe %0 \n\t" | 2452 | [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2)) |
2300 | : "=q" (vmx->fail) | 2453 | : "cc", "memory" |
2301 | : "r"(vmx->launched), "d"((unsigned long)HOST_RSP), | ||
2302 | "c"(vcpu), | ||
2303 | [rax]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RAX])), | ||
2304 | [rbx]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RBX])), | ||
2305 | [rcx]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RCX])), | ||
2306 | [rdx]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RDX])), | ||
2307 | [rsi]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RSI])), | ||
2308 | [rdi]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RDI])), | ||
2309 | [rbp]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RBP])), | ||
2310 | #ifdef CONFIG_X86_64 | 2454 | #ifdef CONFIG_X86_64 |
2311 | [r8 ]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R8 ])), | 2455 | , "rbx", "rdi", "rsi" |
2312 | [r9 ]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R9 ])), | 2456 | , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15" |
2313 | [r10]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R10])), | 2457 | #else |
2314 | [r11]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R11])), | 2458 | , "ebx", "edi", "rsi" |
2315 | [r12]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R12])), | ||
2316 | [r13]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R13])), | ||
2317 | [r14]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R14])), | ||
2318 | [r15]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R15])), | ||
2319 | #endif | 2459 | #endif |
2320 | [cr2]"i"(offsetof(struct kvm_vcpu, cr2)) | 2460 | ); |
2321 | : "cc", "memory" ); | 2461 | |
2462 | vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); | ||
2463 | if (vmx->rmode.irq.pending) | ||
2464 | fixup_rmode_irq(vmx); | ||
2322 | 2465 | ||
2323 | vcpu->interrupt_window_open = (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0; | 2466 | vcpu->arch.interrupt_window_open = |
2467 | (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0; | ||
2324 | 2468 | ||
2325 | asm ("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS)); | 2469 | asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS)); |
2326 | vmx->launched = 1; | 2470 | vmx->launched = 1; |
2327 | 2471 | ||
2328 | intr_info = vmcs_read32(VM_EXIT_INTR_INFO); | 2472 | intr_info = vmcs_read32(VM_EXIT_INTR_INFO); |
@@ -2332,36 +2476,6 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
2332 | asm("int $2"); | 2476 | asm("int $2"); |
2333 | } | 2477 | } |
2334 | 2478 | ||
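The rewritten asm above replaces one operand per guest register with a single pointer ("c"(vmx)) plus "i" operands carrying offsetof() constants: %c[name] prints such a constant bare, so each field becomes a literal displacement off the pointer, and everything the asm scribbles on is declared in the clobber list instead of being saved and restored by hand. A minimal x86-64 sketch of that constraint trick, with a hypothetical struct:

	#include <stddef.h>
	#include <stdio.h>

	struct demo_regs {
		unsigned long rax;
		unsigned long rbx;
	};

	int main(void)
	{
		struct demo_regs r = { 0, 0 };

		/* %c[rax] emits the offset as a bare number, so this
		 * assembles to "movq $42, 0(%reg)" */
		asm("movq $42, %c[rax](%0)"
		    : : "r"(&r), [rax]"i"(offsetof(struct demo_regs, rax))
		    : "memory");

		printf("%lu\n", r.rax);	/* prints 42 */
		return 0;
	}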
2335 | static void vmx_inject_page_fault(struct kvm_vcpu *vcpu, | ||
2336 | unsigned long addr, | ||
2337 | u32 err_code) | ||
2338 | { | ||
2339 | u32 vect_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); | ||
2340 | |||
2341 | ++vcpu->stat.pf_guest; | ||
2342 | |||
2343 | if (is_page_fault(vect_info)) { | ||
2344 | printk(KERN_DEBUG "inject_page_fault: " | ||
2345 | "double fault 0x%lx @ 0x%lx\n", | ||
2346 | addr, vmcs_readl(GUEST_RIP)); | ||
2347 | vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, 0); | ||
2348 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, | ||
2349 | DF_VECTOR | | ||
2350 | INTR_TYPE_EXCEPTION | | ||
2351 | INTR_INFO_DELIEVER_CODE_MASK | | ||
2352 | INTR_INFO_VALID_MASK); | ||
2353 | return; | ||
2354 | } | ||
2355 | vcpu->cr2 = addr; | ||
2356 | vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, err_code); | ||
2357 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, | ||
2358 | PF_VECTOR | | ||
2359 | INTR_TYPE_EXCEPTION | | ||
2360 | INTR_INFO_DELIEVER_CODE_MASK | | ||
2361 | INTR_INFO_VALID_MASK); | ||
2362 | |||
2363 | } | ||
2364 | |||
2365 | static void vmx_free_vmcs(struct kvm_vcpu *vcpu) | 2479 | static void vmx_free_vmcs(struct kvm_vcpu *vcpu) |
2366 | { | 2480 | { |
2367 | struct vcpu_vmx *vmx = to_vmx(vcpu); | 2481 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
@@ -2397,12 +2511,6 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) | |||
2397 | if (err) | 2511 | if (err) |
2398 | goto free_vcpu; | 2512 | goto free_vcpu; |
2399 | 2513 | ||
2400 | if (irqchip_in_kernel(kvm)) { | ||
2401 | err = kvm_create_lapic(&vmx->vcpu); | ||
2402 | if (err < 0) | ||
2403 | goto free_vcpu; | ||
2404 | } | ||
2405 | |||
2406 | vmx->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL); | 2514 | vmx->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL); |
2407 | if (!vmx->guest_msrs) { | 2515 | if (!vmx->guest_msrs) { |
2408 | err = -ENOMEM; | 2516 | err = -ENOMEM; |
@@ -2464,6 +2572,7 @@ static struct kvm_x86_ops vmx_x86_ops = { | |||
2464 | .check_processor_compatibility = vmx_check_processor_compat, | 2572 | .check_processor_compatibility = vmx_check_processor_compat, |
2465 | .hardware_enable = hardware_enable, | 2573 | .hardware_enable = hardware_enable, |
2466 | .hardware_disable = hardware_disable, | 2574 | .hardware_disable = hardware_disable, |
2575 | .cpu_has_accelerated_tpr = cpu_has_vmx_virtualize_apic_accesses, | ||
2467 | 2576 | ||
2468 | .vcpu_create = vmx_create_vcpu, | 2577 | .vcpu_create = vmx_create_vcpu, |
2469 | .vcpu_free = vmx_free_vcpu, | 2578 | .vcpu_free = vmx_free_vcpu, |
@@ -2499,9 +2608,6 @@ static struct kvm_x86_ops vmx_x86_ops = { | |||
2499 | .set_rflags = vmx_set_rflags, | 2608 | .set_rflags = vmx_set_rflags, |
2500 | 2609 | ||
2501 | .tlb_flush = vmx_flush_tlb, | 2610 | .tlb_flush = vmx_flush_tlb, |
2502 | .inject_page_fault = vmx_inject_page_fault, | ||
2503 | |||
2504 | .inject_gp = vmx_inject_gp, | ||
2505 | 2611 | ||
2506 | .run = vmx_vcpu_run, | 2612 | .run = vmx_vcpu_run, |
2507 | .handle_exit = kvm_handle_exit, | 2613 | .handle_exit = kvm_handle_exit, |
@@ -2509,8 +2615,12 @@ static struct kvm_x86_ops vmx_x86_ops = { | |||
2509 | .patch_hypercall = vmx_patch_hypercall, | 2615 | .patch_hypercall = vmx_patch_hypercall, |
2510 | .get_irq = vmx_get_irq, | 2616 | .get_irq = vmx_get_irq, |
2511 | .set_irq = vmx_inject_irq, | 2617 | .set_irq = vmx_inject_irq, |
2618 | .queue_exception = vmx_queue_exception, | ||
2619 | .exception_injected = vmx_exception_injected, | ||
2512 | .inject_pending_irq = vmx_intr_assist, | 2620 | .inject_pending_irq = vmx_intr_assist, |
2513 | .inject_pending_vectors = do_interrupt_requests, | 2621 | .inject_pending_vectors = do_interrupt_requests, |
2622 | |||
2623 | .set_tss_addr = vmx_set_tss_addr, | ||
2514 | }; | 2624 | }; |
2515 | 2625 | ||
2516 | static int __init vmx_init(void) | 2626 | static int __init vmx_init(void) |
@@ -2541,10 +2651,13 @@ static int __init vmx_init(void) | |||
2541 | memset(iova, 0xff, PAGE_SIZE); | 2651 | memset(iova, 0xff, PAGE_SIZE); |
2542 | kunmap(vmx_io_bitmap_b); | 2652 | kunmap(vmx_io_bitmap_b); |
2543 | 2653 | ||
2544 | r = kvm_init_x86(&vmx_x86_ops, sizeof(struct vcpu_vmx), THIS_MODULE); | 2654 | r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx), THIS_MODULE); |
2545 | if (r) | 2655 | if (r) |
2546 | goto out1; | 2656 | goto out1; |
2547 | 2657 | ||
2658 | if (bypass_guest_pf) | ||
2659 | kvm_mmu_set_nonpresent_ptes(~0xffeull, 0ull); | ||
2660 | |||
2548 | return 0; | 2661 | return 0; |
2549 | 2662 | ||
2550 | out1: | 2663 | out1: |
@@ -2559,7 +2672,7 @@ static void __exit vmx_exit(void) | |||
2559 | __free_page(vmx_io_bitmap_b); | 2672 | __free_page(vmx_io_bitmap_b); |
2560 | __free_page(vmx_io_bitmap_a); | 2673 | __free_page(vmx_io_bitmap_a); |
2561 | 2674 | ||
2562 | kvm_exit_x86(); | 2675 | kvm_exit(); |
2563 | } | 2676 | } |
2564 | 2677 | ||
2565 | module_init(vmx_init) | 2678 | module_init(vmx_init) |
diff --git a/drivers/kvm/vmx.h b/arch/x86/kvm/vmx.h index fd4e14666088..d52ae8d7303d 100644 --- a/drivers/kvm/vmx.h +++ b/arch/x86/kvm/vmx.h | |||
@@ -25,6 +25,9 @@ | |||
25 | * | 25 | * |
26 | */ | 26 | */ |
27 | 27 | ||
28 | /* | ||
29 | * Definitions of Primary Processor-Based VM-Execution Controls. | ||
30 | */ | ||
28 | #define CPU_BASED_VIRTUAL_INTR_PENDING 0x00000004 | 31 | #define CPU_BASED_VIRTUAL_INTR_PENDING 0x00000004 |
29 | #define CPU_BASED_USE_TSC_OFFSETING 0x00000008 | 32 | #define CPU_BASED_USE_TSC_OFFSETING 0x00000008 |
30 | #define CPU_BASED_HLT_EXITING 0x00000080 | 33 | #define CPU_BASED_HLT_EXITING 0x00000080 |
@@ -42,6 +45,12 @@ | |||
42 | #define CPU_BASED_MONITOR_EXITING 0x20000000 | 45 | #define CPU_BASED_MONITOR_EXITING 0x20000000 |
43 | #define CPU_BASED_PAUSE_EXITING 0x40000000 | 46 | #define CPU_BASED_PAUSE_EXITING 0x40000000 |
44 | #define CPU_BASED_ACTIVATE_SECONDARY_CONTROLS 0x80000000 | 47 | #define CPU_BASED_ACTIVATE_SECONDARY_CONTROLS 0x80000000 |
48 | /* | ||
49 | * Definitions of Secondary Processor-Based VM-Execution Controls. | ||
50 | */ | ||
51 | #define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001 | ||
52 | #define SECONDARY_EXEC_WBINVD_EXITING 0x00000040 | ||
53 | |||
45 | 54 | ||
46 | #define PIN_BASED_EXT_INTR_MASK 0x00000001 | 55 | #define PIN_BASED_EXT_INTR_MASK 0x00000001 |
47 | #define PIN_BASED_NMI_EXITING 0x00000008 | 56 | #define PIN_BASED_NMI_EXITING 0x00000008 |
@@ -54,8 +63,6 @@ | |||
54 | #define VM_ENTRY_SMM 0x00000400 | 63 | #define VM_ENTRY_SMM 0x00000400 |
55 | #define VM_ENTRY_DEACT_DUAL_MONITOR 0x00000800 | 64 | #define VM_ENTRY_DEACT_DUAL_MONITOR 0x00000800 |
56 | 65 | ||
57 | #define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001 | ||
58 | |||
59 | /* VMCS Encodings */ | 66 | /* VMCS Encodings */ |
60 | enum vmcs_field { | 67 | enum vmcs_field { |
61 | GUEST_ES_SELECTOR = 0x00000800, | 68 | GUEST_ES_SELECTOR = 0x00000800, |
@@ -89,6 +96,8 @@ enum vmcs_field { | |||
89 | TSC_OFFSET_HIGH = 0x00002011, | 96 | TSC_OFFSET_HIGH = 0x00002011, |
90 | VIRTUAL_APIC_PAGE_ADDR = 0x00002012, | 97 | VIRTUAL_APIC_PAGE_ADDR = 0x00002012, |
91 | VIRTUAL_APIC_PAGE_ADDR_HIGH = 0x00002013, | 98 | VIRTUAL_APIC_PAGE_ADDR_HIGH = 0x00002013, |
99 | APIC_ACCESS_ADDR = 0x00002014, | ||
100 | APIC_ACCESS_ADDR_HIGH = 0x00002015, | ||
92 | VMCS_LINK_POINTER = 0x00002800, | 101 | VMCS_LINK_POINTER = 0x00002800, |
93 | VMCS_LINK_POINTER_HIGH = 0x00002801, | 102 | VMCS_LINK_POINTER_HIGH = 0x00002801, |
94 | GUEST_IA32_DEBUGCTL = 0x00002802, | 103 | GUEST_IA32_DEBUGCTL = 0x00002802, |
@@ -214,6 +223,8 @@ enum vmcs_field { | |||
214 | #define EXIT_REASON_MSR_WRITE 32 | 223 | #define EXIT_REASON_MSR_WRITE 32 |
215 | #define EXIT_REASON_MWAIT_INSTRUCTION 36 | 224 | #define EXIT_REASON_MWAIT_INSTRUCTION 36 |
216 | #define EXIT_REASON_TPR_BELOW_THRESHOLD 43 | 225 | #define EXIT_REASON_TPR_BELOW_THRESHOLD 43 |
226 | #define EXIT_REASON_APIC_ACCESS 44 | ||
227 | #define EXIT_REASON_WBINVD 54 | ||
217 | 228 | ||
218 | /* | 229 | /* |
219 | * Interruption-information format | 230 | * Interruption-information format |
@@ -230,13 +241,14 @@ enum vmcs_field { | |||
230 | 241 | ||
231 | #define INTR_TYPE_EXT_INTR (0 << 8) /* external interrupt */ | 242 | #define INTR_TYPE_EXT_INTR (0 << 8) /* external interrupt */ |
232 | #define INTR_TYPE_EXCEPTION (3 << 8) /* processor exception */ | 243 | #define INTR_TYPE_EXCEPTION (3 << 8) /* processor exception */ |
244 | #define INTR_TYPE_SOFT_INTR (4 << 8) /* software interrupt */ | ||
233 | 245 | ||
234 | /* | 246 | /* |
235 | * Exit Qualifications for MOV for Control Register Access | 247 | * Exit Qualifications for MOV for Control Register Access |
236 | */ | 248 | */ |
237 | #define CONTROL_REG_ACCESS_NUM 0x7 /* 2:0, number of control register */ | 249 | #define CONTROL_REG_ACCESS_NUM 0x7 /* 2:0, number of control reg.*/ |
238 | #define CONTROL_REG_ACCESS_TYPE 0x30 /* 5:4, access type */ | 250 | #define CONTROL_REG_ACCESS_TYPE 0x30 /* 5:4, access type */ |
239 | #define CONTROL_REG_ACCESS_REG 0xf00 /* 10:8, general purpose register */ | 251 | #define CONTROL_REG_ACCESS_REG 0xf00 /* 10:8, general purpose reg. */ |
240 | #define LMSW_SOURCE_DATA_SHIFT 16 | 252 | #define LMSW_SOURCE_DATA_SHIFT 16 |
241 | #define LMSW_SOURCE_DATA (0xFFFF << LMSW_SOURCE_DATA_SHIFT) /* 16:31 lmsw source */ | 253 | #define LMSW_SOURCE_DATA (0xFFFF << LMSW_SOURCE_DATA_SHIFT) /* 16:31 lmsw source */ |
242 | #define REG_EAX (0 << 8) | 254 | #define REG_EAX (0 << 8) |
@@ -259,11 +271,11 @@ enum vmcs_field { | |||
259 | /* | 271 | /* |
260 | * Exit Qualifications for MOV for Debug Register Access | 272 | * Exit Qualifications for MOV for Debug Register Access |
261 | */ | 273 | */ |
262 | #define DEBUG_REG_ACCESS_NUM 0x7 /* 2:0, number of debug register */ | 274 | #define DEBUG_REG_ACCESS_NUM 0x7 /* 2:0, number of debug reg. */ |
263 | #define DEBUG_REG_ACCESS_TYPE 0x10 /* 4, direction of access */ | 275 | #define DEBUG_REG_ACCESS_TYPE 0x10 /* 4, direction of access */ |
264 | #define TYPE_MOV_TO_DR (0 << 4) | 276 | #define TYPE_MOV_TO_DR (0 << 4) |
265 | #define TYPE_MOV_FROM_DR (1 << 4) | 277 | #define TYPE_MOV_FROM_DR (1 << 4) |
266 | #define DEBUG_REG_ACCESS_REG 0xf00 /* 11:8, general purpose register */ | 278 | #define DEBUG_REG_ACCESS_REG 0xf00 /* 11:8, general purpose reg. */ |
267 | 279 | ||
268 | 280 | ||
269 | /* segment AR */ | 281 | /* segment AR */ |
@@ -307,4 +319,6 @@ enum vmcs_field { | |||
307 | #define MSR_IA32_FEATURE_CONTROL_LOCKED 0x1 | 319 | #define MSR_IA32_FEATURE_CONTROL_LOCKED 0x1 |
308 | #define MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED 0x4 | 320 | #define MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED 0x4 |
309 | 321 | ||
322 | #define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT 9 | ||
323 | |||
310 | #endif | 324 | #endif |
diff --git a/drivers/kvm/kvm_main.c b/arch/x86/kvm/x86.c index c0f372f1d761..8f94a0b89dff 100644 --- a/drivers/kvm/kvm_main.c +++ b/arch/x86/kvm/x86.c | |||
@@ -1,8 +1,7 @@ | |||
1 | /* | 1 | /* |
2 | * Kernel-based Virtual Machine driver for Linux | 2 | * Kernel-based Virtual Machine driver for Linux |
3 | * | 3 | * |
4 | * This module enables machines with Intel VT-x extensions to run virtual | 4 | * derived from drivers/kvm/kvm_main.c |
5 | * machines without emulation or binary translation. | ||
6 | * | 5 | * |
7 | * Copyright (C) 2006 Qumranet, Inc. | 6 | * Copyright (C) 2006 Qumranet, Inc. |
8 | * | 7 | * |
@@ -15,80 +14,22 @@ | |||
15 | * | 14 | * |
16 | */ | 15 | */ |
17 | 16 | ||
18 | #include "kvm.h" | 17 | #include <linux/kvm_host.h> |
19 | #include "x86_emulate.h" | ||
20 | #include "segment_descriptor.h" | 18 | #include "segment_descriptor.h" |
21 | #include "irq.h" | 19 | #include "irq.h" |
20 | #include "mmu.h" | ||
22 | 21 | ||
23 | #include <linux/kvm.h> | 22 | #include <linux/kvm.h> |
24 | #include <linux/module.h> | 23 | #include <linux/fs.h> |
25 | #include <linux/errno.h> | ||
26 | #include <linux/percpu.h> | ||
27 | #include <linux/gfp.h> | ||
28 | #include <linux/mm.h> | ||
29 | #include <linux/miscdevice.h> | ||
30 | #include <linux/vmalloc.h> | 24 | #include <linux/vmalloc.h> |
31 | #include <linux/reboot.h> | 25 | #include <linux/module.h> |
32 | #include <linux/debugfs.h> | 26 | #include <linux/mman.h> |
33 | #include <linux/highmem.h> | 27 | #include <linux/highmem.h> |
34 | #include <linux/file.h> | ||
35 | #include <linux/sysdev.h> | ||
36 | #include <linux/cpu.h> | ||
37 | #include <linux/sched.h> | ||
38 | #include <linux/cpumask.h> | ||
39 | #include <linux/smp.h> | ||
40 | #include <linux/anon_inodes.h> | ||
41 | #include <linux/profile.h> | ||
42 | |||
43 | #include <asm/processor.h> | ||
44 | #include <asm/msr.h> | ||
45 | #include <asm/io.h> | ||
46 | #include <asm/uaccess.h> | ||
47 | #include <asm/desc.h> | ||
48 | |||
49 | MODULE_AUTHOR("Qumranet"); | ||
50 | MODULE_LICENSE("GPL"); | ||
51 | 28 | ||
52 | static DEFINE_SPINLOCK(kvm_lock); | 29 | #include <asm/uaccess.h> |
53 | static LIST_HEAD(vm_list); | 30 | #include <asm/msr.h> |
54 | |||
55 | static cpumask_t cpus_hardware_enabled; | ||
56 | |||
57 | struct kvm_x86_ops *kvm_x86_ops; | ||
58 | struct kmem_cache *kvm_vcpu_cache; | ||
59 | EXPORT_SYMBOL_GPL(kvm_vcpu_cache); | ||
60 | |||
61 | static __read_mostly struct preempt_ops kvm_preempt_ops; | ||
62 | |||
63 | #define STAT_OFFSET(x) offsetof(struct kvm_vcpu, stat.x) | ||
64 | |||
65 | static struct kvm_stats_debugfs_item { | ||
66 | const char *name; | ||
67 | int offset; | ||
68 | struct dentry *dentry; | ||
69 | } debugfs_entries[] = { | ||
70 | { "pf_fixed", STAT_OFFSET(pf_fixed) }, | ||
71 | { "pf_guest", STAT_OFFSET(pf_guest) }, | ||
72 | { "tlb_flush", STAT_OFFSET(tlb_flush) }, | ||
73 | { "invlpg", STAT_OFFSET(invlpg) }, | ||
74 | { "exits", STAT_OFFSET(exits) }, | ||
75 | { "io_exits", STAT_OFFSET(io_exits) }, | ||
76 | { "mmio_exits", STAT_OFFSET(mmio_exits) }, | ||
77 | { "signal_exits", STAT_OFFSET(signal_exits) }, | ||
78 | { "irq_window", STAT_OFFSET(irq_window_exits) }, | ||
79 | { "halt_exits", STAT_OFFSET(halt_exits) }, | ||
80 | { "halt_wakeup", STAT_OFFSET(halt_wakeup) }, | ||
81 | { "request_irq", STAT_OFFSET(request_irq_exits) }, | ||
82 | { "irq_exits", STAT_OFFSET(irq_exits) }, | ||
83 | { "light_exits", STAT_OFFSET(light_exits) }, | ||
84 | { "efer_reload", STAT_OFFSET(efer_reload) }, | ||
85 | { NULL } | ||
86 | }; | ||
87 | |||
88 | static struct dentry *debugfs_dir; | ||
89 | 31 | ||
90 | #define MAX_IO_MSRS 256 | 32 | #define MAX_IO_MSRS 256 |
91 | |||
92 | #define CR0_RESERVED_BITS \ | 33 | #define CR0_RESERVED_BITS \ |
93 | (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \ | 34 | (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \ |
94 | | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \ | 35 | | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \ |
@@ -102,317 +43,151 @@ static struct dentry *debugfs_dir; | |||
102 | #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR) | 43 | #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR) |
103 | #define EFER_RESERVED_BITS 0xfffffffffffff2fe | 44 | #define EFER_RESERVED_BITS 0xfffffffffffff2fe |
104 | 45 | ||
105 | #ifdef CONFIG_X86_64 | 46 | #define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM |
106 | // LDT or TSS descriptor in the GDT. 16 bytes. | 47 | #define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU |
107 | struct segment_descriptor_64 { | ||
108 | struct segment_descriptor s; | ||
109 | u32 base_higher; | ||
110 | u32 pad_zero; | ||
111 | }; | ||
112 | 48 | ||
113 | #endif | 49 | struct kvm_x86_ops *kvm_x86_ops; |
50 | |||
51 | struct kvm_stats_debugfs_item debugfs_entries[] = { | ||
52 | { "pf_fixed", VCPU_STAT(pf_fixed) }, | ||
53 | { "pf_guest", VCPU_STAT(pf_guest) }, | ||
54 | { "tlb_flush", VCPU_STAT(tlb_flush) }, | ||
55 | { "invlpg", VCPU_STAT(invlpg) }, | ||
56 | { "exits", VCPU_STAT(exits) }, | ||
57 | { "io_exits", VCPU_STAT(io_exits) }, | ||
58 | { "mmio_exits", VCPU_STAT(mmio_exits) }, | ||
59 | { "signal_exits", VCPU_STAT(signal_exits) }, | ||
60 | { "irq_window", VCPU_STAT(irq_window_exits) }, | ||
61 | { "halt_exits", VCPU_STAT(halt_exits) }, | ||
62 | { "halt_wakeup", VCPU_STAT(halt_wakeup) }, | ||
63 | { "request_irq", VCPU_STAT(request_irq_exits) }, | ||
64 | { "irq_exits", VCPU_STAT(irq_exits) }, | ||
65 | { "host_state_reload", VCPU_STAT(host_state_reload) }, | ||
66 | { "efer_reload", VCPU_STAT(efer_reload) }, | ||
67 | { "fpu_reload", VCPU_STAT(fpu_reload) }, | ||
68 | { "insn_emulation", VCPU_STAT(insn_emulation) }, | ||
69 | { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) }, | ||
70 | { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) }, | ||
71 | { "mmu_pte_write", VM_STAT(mmu_pte_write) }, | ||
72 | { "mmu_pte_updated", VM_STAT(mmu_pte_updated) }, | ||
73 | { "mmu_pde_zapped", VM_STAT(mmu_pde_zapped) }, | ||
74 | { "mmu_flooded", VM_STAT(mmu_flooded) }, | ||
75 | { "mmu_recycled", VM_STAT(mmu_recycled) }, | ||
76 | { "mmu_cache_miss", VM_STAT(mmu_cache_miss) }, | ||
77 | { "remote_tlb_flush", VM_STAT(remote_tlb_flush) }, | ||
78 | { NULL } | ||
79 | }; | ||
114 | 80 | ||
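The VM_STAT/VCPU_STAT macros above work because each expands to two initializer fields, an offset and a kind, so one debugfs table can mix per-VM and per-vcpu counters and resolve each against the right structure later. A compact sketch of the expansion, with stand-in structures and names:

	#include <stddef.h>

	enum demo_stat_kind { DEMO_STAT_VM, DEMO_STAT_VCPU };

	struct demo_vm   { unsigned long remote_tlb_flush; };
	struct demo_vcpu { unsigned long halt_exits; };

	struct demo_item {
		const char *name;
		int offset;
		enum demo_stat_kind kind;
	};

	/* each macro supplies two fields of the initializer */
	#define DEMO_VM_STAT(x)   offsetof(struct demo_vm, x),   DEMO_STAT_VM
	#define DEMO_VCPU_STAT(x) offsetof(struct demo_vcpu, x), DEMO_STAT_VCPU

	static struct demo_item demo_entries[] = {
		{ "halt_exits",       DEMO_VCPU_STAT(halt_exits) },
		{ "remote_tlb_flush", DEMO_VM_STAT(remote_tlb_flush) },
		{ NULL }
	};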
115 | static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl, | ||
116 | unsigned long arg); | ||
117 | 81 | ||
118 | unsigned long segment_base(u16 selector) | 82 | unsigned long segment_base(u16 selector) |
119 | { | 83 | { |
120 | struct descriptor_table gdt; | 84 | struct descriptor_table gdt; |
121 | struct segment_descriptor *d; | 85 | struct segment_descriptor *d; |
122 | unsigned long table_base; | 86 | unsigned long table_base; |
123 | typedef unsigned long ul; | ||
124 | unsigned long v; | 87 | unsigned long v; |
125 | 88 | ||
126 | if (selector == 0) | 89 | if (selector == 0) |
127 | return 0; | 90 | return 0; |
128 | 91 | ||
129 | asm ("sgdt %0" : "=m"(gdt)); | 92 | asm("sgdt %0" : "=m"(gdt)); |
130 | table_base = gdt.base; | 93 | table_base = gdt.base; |
131 | 94 | ||
132 | if (selector & 4) { /* from ldt */ | 95 | if (selector & 4) { /* from ldt */ |
133 | u16 ldt_selector; | 96 | u16 ldt_selector; |
134 | 97 | ||
135 | asm ("sldt %0" : "=g"(ldt_selector)); | 98 | asm("sldt %0" : "=g"(ldt_selector)); |
136 | table_base = segment_base(ldt_selector); | 99 | table_base = segment_base(ldt_selector); |
137 | } | 100 | } |
138 | d = (struct segment_descriptor *)(table_base + (selector & ~7)); | 101 | d = (struct segment_descriptor *)(table_base + (selector & ~7)); |
139 | v = d->base_low | ((ul)d->base_mid << 16) | ((ul)d->base_high << 24); | 102 | v = d->base_low | ((unsigned long)d->base_mid << 16) | |
103 | ((unsigned long)d->base_high << 24); | ||
140 | #ifdef CONFIG_X86_64 | 104 | #ifdef CONFIG_X86_64 |
141 | if (d->system == 0 | 105 | if (d->system == 0 && (d->type == 2 || d->type == 9 || d->type == 11)) |
142 | && (d->type == 2 || d->type == 9 || d->type == 11)) | 106 | v |= ((unsigned long) \ |
143 | v |= ((ul)((struct segment_descriptor_64 *)d)->base_higher) << 32; | 107 | ((struct segment_descriptor_64 *)d)->base_higher) << 32; |
144 | #endif | 108 | #endif |
145 | return v; | 109 | return v; |
146 | } | 110 | } |
147 | EXPORT_SYMBOL_GPL(segment_base); | 111 | EXPORT_SYMBOL_GPL(segment_base); |
148 | 112 | ||
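segment_base() reassembles a base address that the descriptor format scatters across three fields (plus a fourth for 64-bit system descriptors). An illustrative round trip for the 32-bit part, using a simplified stand-in layout:

	#include <stdint.h>
	#include <assert.h>

	struct demo_desc {
		uint16_t base_low;	/* base bits 15:0  */
		uint8_t  base_mid;	/* base bits 23:16 */
		uint8_t  base_high;	/* base bits 31:24 */
	};

	int main(void)
	{
		struct demo_desc d = { 0x5678, 0x34, 0x12 };
		unsigned long v = d.base_low
				| ((unsigned long)d.base_mid << 16)
				| ((unsigned long)d.base_high << 24);

		assert(v == 0x12345678UL);
		return 0;
	}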
149 | static inline int valid_vcpu(int n) | 113 | u64 kvm_get_apic_base(struct kvm_vcpu *vcpu) |
150 | { | ||
151 | return likely(n >= 0 && n < KVM_MAX_VCPUS); | ||
152 | } | ||
153 | |||
154 | void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) | ||
155 | { | ||
156 | if (!vcpu->fpu_active || vcpu->guest_fpu_loaded) | ||
157 | return; | ||
158 | |||
159 | vcpu->guest_fpu_loaded = 1; | ||
160 | fx_save(&vcpu->host_fx_image); | ||
161 | fx_restore(&vcpu->guest_fx_image); | ||
162 | } | ||
163 | EXPORT_SYMBOL_GPL(kvm_load_guest_fpu); | ||
164 | |||
165 | void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) | ||
166 | { | ||
167 | if (!vcpu->guest_fpu_loaded) | ||
168 | return; | ||
169 | |||
170 | vcpu->guest_fpu_loaded = 0; | ||
171 | fx_save(&vcpu->guest_fx_image); | ||
172 | fx_restore(&vcpu->host_fx_image); | ||
173 | } | ||
174 | EXPORT_SYMBOL_GPL(kvm_put_guest_fpu); | ||
175 | |||
176 | /* | ||
177 | * Switches to specified vcpu, until a matching vcpu_put() | ||
178 | */ | ||
179 | static void vcpu_load(struct kvm_vcpu *vcpu) | ||
180 | { | ||
181 | int cpu; | ||
182 | |||
183 | mutex_lock(&vcpu->mutex); | ||
184 | cpu = get_cpu(); | ||
185 | preempt_notifier_register(&vcpu->preempt_notifier); | ||
186 | kvm_x86_ops->vcpu_load(vcpu, cpu); | ||
187 | put_cpu(); | ||
188 | } | ||
189 | |||
190 | static void vcpu_put(struct kvm_vcpu *vcpu) | ||
191 | { | ||
192 | preempt_disable(); | ||
193 | kvm_x86_ops->vcpu_put(vcpu); | ||
194 | preempt_notifier_unregister(&vcpu->preempt_notifier); | ||
195 | preempt_enable(); | ||
196 | mutex_unlock(&vcpu->mutex); | ||
197 | } | ||
198 | |||
199 | static void ack_flush(void *_completed) | ||
200 | { | ||
201 | } | ||
202 | |||
203 | void kvm_flush_remote_tlbs(struct kvm *kvm) | ||
204 | { | ||
205 | int i, cpu; | ||
206 | cpumask_t cpus; | ||
207 | struct kvm_vcpu *vcpu; | ||
208 | |||
209 | cpus_clear(cpus); | ||
210 | for (i = 0; i < KVM_MAX_VCPUS; ++i) { | ||
211 | vcpu = kvm->vcpus[i]; | ||
212 | if (!vcpu) | ||
213 | continue; | ||
214 | if (test_and_set_bit(KVM_TLB_FLUSH, &vcpu->requests)) | ||
215 | continue; | ||
216 | cpu = vcpu->cpu; | ||
217 | if (cpu != -1 && cpu != raw_smp_processor_id()) | ||
218 | cpu_set(cpu, cpus); | ||
219 | } | ||
220 | smp_call_function_mask(cpus, ack_flush, NULL, 1); | ||
221 | } | ||
222 | |||
223 | int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) | ||
224 | { | 114 | { |
225 | struct page *page; | 115 | if (irqchip_in_kernel(vcpu->kvm)) |
226 | int r; | 116 | return vcpu->arch.apic_base; |
227 | |||
228 | mutex_init(&vcpu->mutex); | ||
229 | vcpu->cpu = -1; | ||
230 | vcpu->mmu.root_hpa = INVALID_PAGE; | ||
231 | vcpu->kvm = kvm; | ||
232 | vcpu->vcpu_id = id; | ||
233 | if (!irqchip_in_kernel(kvm) || id == 0) | ||
234 | vcpu->mp_state = VCPU_MP_STATE_RUNNABLE; | ||
235 | else | 117 | else |
236 | vcpu->mp_state = VCPU_MP_STATE_UNINITIALIZED; | 118 | return vcpu->arch.apic_base; |
237 | init_waitqueue_head(&vcpu->wq); | ||
238 | |||
239 | page = alloc_page(GFP_KERNEL | __GFP_ZERO); | ||
240 | if (!page) { | ||
241 | r = -ENOMEM; | ||
242 | goto fail; | ||
243 | } | ||
244 | vcpu->run = page_address(page); | ||
245 | |||
246 | page = alloc_page(GFP_KERNEL | __GFP_ZERO); | ||
247 | if (!page) { | ||
248 | r = -ENOMEM; | ||
249 | goto fail_free_run; | ||
250 | } | ||
251 | vcpu->pio_data = page_address(page); | ||
252 | |||
253 | r = kvm_mmu_create(vcpu); | ||
254 | if (r < 0) | ||
255 | goto fail_free_pio_data; | ||
256 | |||
257 | return 0; | ||
258 | |||
259 | fail_free_pio_data: | ||
260 | free_page((unsigned long)vcpu->pio_data); | ||
261 | fail_free_run: | ||
262 | free_page((unsigned long)vcpu->run); | ||
263 | fail: | ||
264 | return -ENOMEM; | ||
265 | } | ||
266 | EXPORT_SYMBOL_GPL(kvm_vcpu_init); | ||
267 | |||
268 | void kvm_vcpu_uninit(struct kvm_vcpu *vcpu) | ||
269 | { | ||
270 | kvm_mmu_destroy(vcpu); | ||
271 | if (vcpu->apic) | ||
272 | hrtimer_cancel(&vcpu->apic->timer.dev); | ||
273 | kvm_free_apic(vcpu->apic); | ||
274 | free_page((unsigned long)vcpu->pio_data); | ||
275 | free_page((unsigned long)vcpu->run); | ||
276 | } | ||
277 | EXPORT_SYMBOL_GPL(kvm_vcpu_uninit); | ||
278 | |||
279 | static struct kvm *kvm_create_vm(void) | ||
280 | { | ||
281 | struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL); | ||
282 | |||
283 | if (!kvm) | ||
284 | return ERR_PTR(-ENOMEM); | ||
285 | |||
286 | kvm_io_bus_init(&kvm->pio_bus); | ||
287 | mutex_init(&kvm->lock); | ||
288 | INIT_LIST_HEAD(&kvm->active_mmu_pages); | ||
289 | kvm_io_bus_init(&kvm->mmio_bus); | ||
290 | spin_lock(&kvm_lock); | ||
291 | list_add(&kvm->vm_list, &vm_list); | ||
292 | spin_unlock(&kvm_lock); | ||
293 | return kvm; | ||
294 | } | ||
295 | |||
296 | /* | ||
297 | * Free any memory in @free but not in @dont. | ||
298 | */ | ||
299 | static void kvm_free_physmem_slot(struct kvm_memory_slot *free, | ||
300 | struct kvm_memory_slot *dont) | ||
301 | { | ||
302 | int i; | ||
303 | |||
304 | if (!dont || free->phys_mem != dont->phys_mem) | ||
305 | if (free->phys_mem) { | ||
306 | for (i = 0; i < free->npages; ++i) | ||
307 | if (free->phys_mem[i]) | ||
308 | __free_page(free->phys_mem[i]); | ||
309 | vfree(free->phys_mem); | ||
310 | } | ||
311 | |||
312 | if (!dont || free->dirty_bitmap != dont->dirty_bitmap) | ||
313 | vfree(free->dirty_bitmap); | ||
314 | |||
315 | free->phys_mem = NULL; | ||
316 | free->npages = 0; | ||
317 | free->dirty_bitmap = NULL; | ||
318 | } | ||
319 | |||
320 | static void kvm_free_physmem(struct kvm *kvm) | ||
321 | { | ||
322 | int i; | ||
323 | |||
324 | for (i = 0; i < kvm->nmemslots; ++i) | ||
325 | kvm_free_physmem_slot(&kvm->memslots[i], NULL); | ||
326 | } | 119 | } |
120 | EXPORT_SYMBOL_GPL(kvm_get_apic_base); | ||
327 | 121 | ||
328 | static void free_pio_guest_pages(struct kvm_vcpu *vcpu) | 122 | void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data) |
329 | { | 123 | { |
330 | int i; | 124 | /* TODO: reserve bits check */ |
331 | 125 | if (irqchip_in_kernel(vcpu->kvm)) | |
332 | for (i = 0; i < ARRAY_SIZE(vcpu->pio.guest_pages); ++i) | 126 | kvm_lapic_set_base(vcpu, data); |
333 | if (vcpu->pio.guest_pages[i]) { | 127 | else |
334 | __free_page(vcpu->pio.guest_pages[i]); | 128 | vcpu->arch.apic_base = data; |
335 | vcpu->pio.guest_pages[i] = NULL; | ||
336 | } | ||
337 | } | 129 | } |
130 | EXPORT_SYMBOL_GPL(kvm_set_apic_base); | ||
338 | 131 | ||
339 | static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu) | 132 | void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr) |
340 | { | 133 | { |
341 | vcpu_load(vcpu); | 134 | WARN_ON(vcpu->arch.exception.pending); |
342 | kvm_mmu_unload(vcpu); | 135 | vcpu->arch.exception.pending = true; |
343 | vcpu_put(vcpu); | 136 | vcpu->arch.exception.has_error_code = false; |
137 | vcpu->arch.exception.nr = nr; | ||
344 | } | 138 | } |
139 | EXPORT_SYMBOL_GPL(kvm_queue_exception); | ||
345 | 140 | ||
346 | static void kvm_free_vcpus(struct kvm *kvm) | 141 | void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr, |
142 | u32 error_code) | ||
347 | { | 143 | { |
348 | unsigned int i; | 144 | ++vcpu->stat.pf_guest; |
349 | 145 | if (vcpu->arch.exception.pending && vcpu->arch.exception.nr == PF_VECTOR) { | |
350 | /* | 146 | printk(KERN_DEBUG "kvm: inject_page_fault:" |
351 | * Unpin any mmu pages first. | 147 | " double fault 0x%lx\n", addr); |
352 | */ | 148 | vcpu->arch.exception.nr = DF_VECTOR; |
353 | for (i = 0; i < KVM_MAX_VCPUS; ++i) | 149 | vcpu->arch.exception.error_code = 0; |
354 | if (kvm->vcpus[i]) | 150 | return; |
355 | kvm_unload_vcpu_mmu(kvm->vcpus[i]); | ||
356 | for (i = 0; i < KVM_MAX_VCPUS; ++i) { | ||
357 | if (kvm->vcpus[i]) { | ||
358 | kvm_x86_ops->vcpu_free(kvm->vcpus[i]); | ||
359 | kvm->vcpus[i] = NULL; | ||
360 | } | ||
361 | } | 151 | } |
362 | 152 | vcpu->arch.cr2 = addr; | |
153 | kvm_queue_exception_e(vcpu, PF_VECTOR, error_code); | ||
363 | } | 154 | } |
364 | 155 | ||
365 | static void kvm_destroy_vm(struct kvm *kvm) | 156 | void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code) |
366 | { | 157 | { |
367 | spin_lock(&kvm_lock); | 158 | WARN_ON(vcpu->arch.exception.pending); |
368 | list_del(&kvm->vm_list); | 159 | vcpu->arch.exception.pending = true; |
369 | spin_unlock(&kvm_lock); | 160 | vcpu->arch.exception.has_error_code = true; |
370 | kvm_io_bus_destroy(&kvm->pio_bus); | 161 | vcpu->arch.exception.nr = nr; |
371 | kvm_io_bus_destroy(&kvm->mmio_bus); | 162 | vcpu->arch.exception.error_code = error_code; |
372 | kfree(kvm->vpic); | ||
373 | kfree(kvm->vioapic); | ||
374 | kvm_free_vcpus(kvm); | ||
375 | kvm_free_physmem(kvm); | ||
376 | kfree(kvm); | ||
377 | } | 163 | } |
164 | EXPORT_SYMBOL_GPL(kvm_queue_exception_e); | ||
378 | 165 | ||
379 | static int kvm_vm_release(struct inode *inode, struct file *filp) | 166 | static void __queue_exception(struct kvm_vcpu *vcpu) |
380 | { | 167 | { |
381 | struct kvm *kvm = filp->private_data; | 168 | kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr, |
382 | 169 | vcpu->arch.exception.has_error_code, | |
383 | kvm_destroy_vm(kvm); | 170 | vcpu->arch.exception.error_code); |
384 | return 0; | ||
385 | } | ||
386 | |||
387 | static void inject_gp(struct kvm_vcpu *vcpu) | ||
388 | { | ||
389 | kvm_x86_ops->inject_gp(vcpu, 0); | ||
390 | } | 171 | } |
391 | 172 | ||
392 | /* | 173 | /* |
393 | * Load the pae pdptrs. Return true if they are all valid. | 174 | * Load the pae pdptrs. Return true if they are all valid. |
394 | */ | 175 | */ |
395 | static int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3) | 176 | int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3) |
396 | { | 177 | { |
397 | gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT; | 178 | gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT; |
398 | unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2; | 179 | unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2; |
399 | int i; | 180 | int i; |
400 | u64 *pdpt; | ||
401 | int ret; | 181 | int ret; |
402 | struct page *page; | 182 | u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)]; |
403 | u64 pdpte[ARRAY_SIZE(vcpu->pdptrs)]; | ||
404 | 183 | ||
405 | mutex_lock(&vcpu->kvm->lock); | 184 | down_read(&current->mm->mmap_sem); |
406 | page = gfn_to_page(vcpu->kvm, pdpt_gfn); | 185 | ret = kvm_read_guest_page(vcpu->kvm, pdpt_gfn, pdpte, |
407 | if (!page) { | 186 | offset * sizeof(u64), sizeof(pdpte)); |
187 | if (ret < 0) { | ||
408 | ret = 0; | 188 | ret = 0; |
409 | goto out; | 189 | goto out; |
410 | } | 190 | } |
411 | |||
412 | pdpt = kmap_atomic(page, KM_USER0); | ||
413 | memcpy(pdpte, pdpt+offset, sizeof(pdpte)); | ||
414 | kunmap_atomic(pdpt, KM_USER0); | ||
415 | |||
416 | for (i = 0; i < ARRAY_SIZE(pdpte); ++i) { | 191 | for (i = 0; i < ARRAY_SIZE(pdpte); ++i) { |
417 | if ((pdpte[i] & 1) && (pdpte[i] & 0xfffffff0000001e6ull)) { | 192 | if ((pdpte[i] & 1) && (pdpte[i] & 0xfffffff0000001e6ull)) { |
418 | ret = 0; | 193 | ret = 0; |
@@ -421,78 +196,96 @@ static int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3) | |||
421 | } | 196 | } |
422 | ret = 1; | 197 | ret = 1; |
423 | 198 | ||
424 | memcpy(vcpu->pdptrs, pdpte, sizeof(vcpu->pdptrs)); | 199 | memcpy(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs)); |
425 | out: | 200 | out: |
426 | mutex_unlock(&vcpu->kvm->lock); | 201 | up_read(&current->mm->mmap_sem); |
427 | 202 | ||
428 | return ret; | 203 | return ret; |
429 | } | 204 | } |
430 | 205 | ||
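The PDPT arithmetic in load_pdptrs() is compact enough to deserve unpacking: in PAE mode CR3 holds a 32-byte-aligned pointer, so bits 5..11 of CR3 select one of the 128 possible four-entry PDPTs within a page, and a PDPTE only fails validation if it is present (bit 0) and has a reserved bit set. A minimal standalone sketch of the same two computations (helper names here are illustrative, not kernel API):

#include <assert.h>
#include <stdint.h>

#define PAGE_SIZE 4096u

/* Index (in u64 units) of the four-entry PDPT inside its page, exactly
 * as computed above: bits 5..11 of CR3 pick a 32-byte slot, and each
 * slot holds four 8-byte PDPTEs. */
static unsigned pdpt_u64_index(unsigned long cr3)
{
	return ((cr3 & (PAGE_SIZE - 1)) >> 5) << 2;
}

/* A PDPTE fails the check above if it is present (bit 0) and any of
 * the bits in the 0xfffffff0000001e6 reserved mask are set. */
static int pdpte_ok(uint64_t pdpte)
{
	return !((pdpte & 1) && (pdpte & 0xfffffff0000001e6ull));
}

int main(void)
{
	assert(pdpt_u64_index(0x12345020) == 4);   /* second 32-byte slot */
	assert(pdpte_ok(0));                       /* not present: ignored */
	assert(pdpte_ok(0x1000 | 1));              /* present, clean */
	assert(!pdpte_ok(0x1000 | 1 | (1 << 1)));  /* reserved bit 1 set */
	return 0;
}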
206 | static bool pdptrs_changed(struct kvm_vcpu *vcpu) | ||
207 | { | ||
208 | u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)]; | ||
209 | bool changed = true; | ||
210 | int r; | ||
211 | |||
212 | if (is_long_mode(vcpu) || !is_pae(vcpu)) | ||
213 | return false; | ||
214 | |||
215 | down_read(&current->mm->mmap_sem); | ||
216 | r = kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u, pdpte, sizeof(pdpte)); | ||
217 | if (r < 0) | ||
218 | goto out; | ||
219 | changed = memcmp(pdpte, vcpu->arch.pdptrs, sizeof(pdpte)) != 0; | ||
220 | out: | ||
221 | up_read(&current->mm->mmap_sem); | ||
222 | |||
223 | return changed; | ||
224 | } | ||
225 | |||
431 | void set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) | 226 | void set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) |
432 | { | 227 | { |
433 | if (cr0 & CR0_RESERVED_BITS) { | 228 | if (cr0 & CR0_RESERVED_BITS) { |
434 | printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n", | 229 | printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n", |
435 | cr0, vcpu->cr0); | 230 | cr0, vcpu->arch.cr0); |
436 | inject_gp(vcpu); | 231 | kvm_inject_gp(vcpu, 0); |
437 | return; | 232 | return; |
438 | } | 233 | } |
439 | 234 | ||
440 | if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) { | 235 | if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) { |
441 | printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n"); | 236 | printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n"); |
442 | inject_gp(vcpu); | 237 | kvm_inject_gp(vcpu, 0); |
443 | return; | 238 | return; |
444 | } | 239 | } |
445 | 240 | ||
446 | if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) { | 241 | if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) { |
447 | printk(KERN_DEBUG "set_cr0: #GP, set PG flag " | 242 | printk(KERN_DEBUG "set_cr0: #GP, set PG flag " |
448 | "and a clear PE flag\n"); | 243 | "and a clear PE flag\n"); |
449 | inject_gp(vcpu); | 244 | kvm_inject_gp(vcpu, 0); |
450 | return; | 245 | return; |
451 | } | 246 | } |
452 | 247 | ||
453 | if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) { | 248 | if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) { |
454 | #ifdef CONFIG_X86_64 | 249 | #ifdef CONFIG_X86_64 |
455 | if ((vcpu->shadow_efer & EFER_LME)) { | 250 | if ((vcpu->arch.shadow_efer & EFER_LME)) { |
456 | int cs_db, cs_l; | 251 | int cs_db, cs_l; |
457 | 252 | ||
458 | if (!is_pae(vcpu)) { | 253 | if (!is_pae(vcpu)) { |
459 | printk(KERN_DEBUG "set_cr0: #GP, start paging " | 254 | printk(KERN_DEBUG "set_cr0: #GP, start paging " |
460 | "in long mode while PAE is disabled\n"); | 255 | "in long mode while PAE is disabled\n"); |
461 | inject_gp(vcpu); | 256 | kvm_inject_gp(vcpu, 0); |
462 | return; | 257 | return; |
463 | } | 258 | } |
464 | kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); | 259 | kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); |
465 | if (cs_l) { | 260 | if (cs_l) { |
466 | printk(KERN_DEBUG "set_cr0: #GP, start paging " | 261 | printk(KERN_DEBUG "set_cr0: #GP, start paging " |
467 | "in long mode while CS.L == 1\n"); | 262 | "in long mode while CS.L == 1\n"); |
468 | inject_gp(vcpu); | 263 | kvm_inject_gp(vcpu, 0); |
469 | return; | 264 | return; |
470 | 265 | ||
471 | } | 266 | } |
472 | } else | 267 | } else |
473 | #endif | 268 | #endif |
474 | if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->cr3)) { | 269 | if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3)) { |
475 | printk(KERN_DEBUG "set_cr0: #GP, pdptrs " | 270 | printk(KERN_DEBUG "set_cr0: #GP, pdptrs " |
476 | "reserved bits\n"); | 271 | "reserved bits\n"); |
477 | inject_gp(vcpu); | 272 | kvm_inject_gp(vcpu, 0); |
478 | return; | 273 | return; |
479 | } | 274 | } |
480 | 275 | ||
481 | } | 276 | } |
482 | 277 | ||
483 | kvm_x86_ops->set_cr0(vcpu, cr0); | 278 | kvm_x86_ops->set_cr0(vcpu, cr0); |
484 | vcpu->cr0 = cr0; | 279 | vcpu->arch.cr0 = cr0; |
485 | 280 | ||
486 | mutex_lock(&vcpu->kvm->lock); | ||
487 | kvm_mmu_reset_context(vcpu); | 281 | kvm_mmu_reset_context(vcpu); |
488 | mutex_unlock(&vcpu->kvm->lock); | ||
489 | return; | 282 | return; |
490 | } | 283 | } |
491 | EXPORT_SYMBOL_GPL(set_cr0); | 284 | EXPORT_SYMBOL_GPL(set_cr0); |
492 | 285 | ||
493 | void lmsw(struct kvm_vcpu *vcpu, unsigned long msw) | 286 | void lmsw(struct kvm_vcpu *vcpu, unsigned long msw) |
494 | { | 287 | { |
495 | set_cr0(vcpu, (vcpu->cr0 & ~0x0ful) | (msw & 0x0f)); | 288 | set_cr0(vcpu, (vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f)); |
496 | } | 289 | } |
497 | EXPORT_SYMBOL_GPL(lmsw); | 290 | EXPORT_SYMBOL_GPL(lmsw); |
498 | 291 | ||
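lmsw only ever touches the low four CR0 bits (PE, MP, EM, TS), which is exactly what the mask-and-splice in lmsw() above implements; everything above bit 3 survives untouched. A worked example of the same expression (standalone; note that on real hardware LMSW additionally cannot clear PE, a wrinkle this sketch ignores):

#include <assert.h>

int main(void)
{
	unsigned long cr0 = 0x8000003bul;	/* PG set, assorted low bits set */
	unsigned long msw = 0x1;		/* keep only PE */

	cr0 = (cr0 & ~0x0ful) | (msw & 0x0f);	/* the lmsw() splice above */
	assert(cr0 == 0x80000031ul);		/* upper bits untouched */
	return 0;
}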
@@ -500,7 +293,7 @@ void set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) | |||
500 | { | 293 | { |
501 | if (cr4 & CR4_RESERVED_BITS) { | 294 | if (cr4 & CR4_RESERVED_BITS) { |
502 | printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n"); | 295 | printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n"); |
503 | inject_gp(vcpu); | 296 | kvm_inject_gp(vcpu, 0); |
504 | return; | 297 | return; |
505 | } | 298 | } |
506 | 299 | ||
@@ -508,35 +301,38 @@ void set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) | |||
508 | if (!(cr4 & X86_CR4_PAE)) { | 301 | if (!(cr4 & X86_CR4_PAE)) { |
509 | printk(KERN_DEBUG "set_cr4: #GP, clearing PAE while " | 302 | printk(KERN_DEBUG "set_cr4: #GP, clearing PAE while " |
510 | "in long mode\n"); | 303 | "in long mode\n"); |
511 | inject_gp(vcpu); | 304 | kvm_inject_gp(vcpu, 0); |
512 | return; | 305 | return; |
513 | } | 306 | } |
514 | } else if (is_paging(vcpu) && !is_pae(vcpu) && (cr4 & X86_CR4_PAE) | 307 | } else if (is_paging(vcpu) && !is_pae(vcpu) && (cr4 & X86_CR4_PAE) |
515 | && !load_pdptrs(vcpu, vcpu->cr3)) { | 308 | && !load_pdptrs(vcpu, vcpu->arch.cr3)) { |
516 | printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n"); | 309 | printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n"); |
517 | inject_gp(vcpu); | 310 | kvm_inject_gp(vcpu, 0); |
518 | return; | 311 | return; |
519 | } | 312 | } |
520 | 313 | ||
521 | if (cr4 & X86_CR4_VMXE) { | 314 | if (cr4 & X86_CR4_VMXE) { |
522 | printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n"); | 315 | printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n"); |
523 | inject_gp(vcpu); | 316 | kvm_inject_gp(vcpu, 0); |
524 | return; | 317 | return; |
525 | } | 318 | } |
526 | kvm_x86_ops->set_cr4(vcpu, cr4); | 319 | kvm_x86_ops->set_cr4(vcpu, cr4); |
527 | vcpu->cr4 = cr4; | 320 | vcpu->arch.cr4 = cr4; |
528 | mutex_lock(&vcpu->kvm->lock); | ||
529 | kvm_mmu_reset_context(vcpu); | 321 | kvm_mmu_reset_context(vcpu); |
530 | mutex_unlock(&vcpu->kvm->lock); | ||
531 | } | 322 | } |
532 | EXPORT_SYMBOL_GPL(set_cr4); | 323 | EXPORT_SYMBOL_GPL(set_cr4); |
533 | 324 | ||
534 | void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) | 325 | void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) |
535 | { | 326 | { |
327 | if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) { | ||
328 | kvm_mmu_flush_tlb(vcpu); | ||
329 | return; | ||
330 | } | ||
331 | |||
536 | if (is_long_mode(vcpu)) { | 332 | if (is_long_mode(vcpu)) { |
537 | if (cr3 & CR3_L_MODE_RESERVED_BITS) { | 333 | if (cr3 & CR3_L_MODE_RESERVED_BITS) { |
538 | printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n"); | 334 | printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n"); |
539 | inject_gp(vcpu); | 335 | kvm_inject_gp(vcpu, 0); |
540 | return; | 336 | return; |
541 | } | 337 | } |
542 | } else { | 338 | } else { |
@@ -544,26 +340,23 @@ void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) | |||
544 | if (cr3 & CR3_PAE_RESERVED_BITS) { | 340 | if (cr3 & CR3_PAE_RESERVED_BITS) { |
545 | printk(KERN_DEBUG | 341 | printk(KERN_DEBUG |
546 | "set_cr3: #GP, reserved bits\n"); | 342 | "set_cr3: #GP, reserved bits\n"); |
547 | inject_gp(vcpu); | 343 | kvm_inject_gp(vcpu, 0); |
548 | return; | 344 | return; |
549 | } | 345 | } |
550 | if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) { | 346 | if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) { |
551 | printk(KERN_DEBUG "set_cr3: #GP, pdptrs " | 347 | printk(KERN_DEBUG "set_cr3: #GP, pdptrs " |
552 | "reserved bits\n"); | 348 | "reserved bits\n"); |
553 | inject_gp(vcpu); | 349 | kvm_inject_gp(vcpu, 0); |
554 | return; | ||
555 | } | ||
556 | } else { | ||
557 | if (cr3 & CR3_NONPAE_RESERVED_BITS) { | ||
558 | printk(KERN_DEBUG | ||
559 | "set_cr3: #GP, reserved bits\n"); | ||
560 | inject_gp(vcpu); | ||
561 | return; | 350 | return; |
562 | } | 351 | } |
563 | } | 352 | } |
353 | /* | ||
354 | * We don't check reserved bits in nonpae mode, because | ||
355 | * this isn't enforced, and VMware depends on this. | ||
356 | */ | ||
564 | } | 357 | } |
565 | 358 | ||
566 | mutex_lock(&vcpu->kvm->lock); | 359 | down_read(&current->mm->mmap_sem); |
567 | /* | 360 | /* |
568 | * Does the new cr3 value map to physical memory? (Note, we | 361 | * Does the new cr3 value map to physical memory? (Note, we |
569 | * catch an invalid cr3 even in real-mode, because it would | 362 | * catch an invalid cr3 even in real-mode, because it would |
@@ -574,12 +367,12 @@ void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) | |||
574 | * to debug) behavior on the guest side. | 367 | * to debug) behavior on the guest side. |
575 | */ | 368 | */ |
576 | if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT))) | 369 | if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT))) |
577 | inject_gp(vcpu); | 370 | kvm_inject_gp(vcpu, 0); |
578 | else { | 371 | else { |
579 | vcpu->cr3 = cr3; | 372 | vcpu->arch.cr3 = cr3; |
580 | vcpu->mmu.new_cr3(vcpu); | 373 | vcpu->arch.mmu.new_cr3(vcpu); |
581 | } | 374 | } |
582 | mutex_unlock(&vcpu->kvm->lock); | 375 | up_read(&current->mm->mmap_sem); |
583 | } | 376 | } |
584 | EXPORT_SYMBOL_GPL(set_cr3); | 377 | EXPORT_SYMBOL_GPL(set_cr3); |
585 | 378 | ||
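Together with pdptrs_changed() above, the early-out at the top of set_cr3() turns the overwhelmingly common case, a guest context switch reloading the CR3 value the MMU already has, into a plain TLB flush instead of a full shadow reload; the PDPTE comparison is what makes that safe in PAE mode. A toy model of the decision (hypothetical names, standalone):

#include <stdbool.h>
#include <stdint.h>
#include <string.h>

struct toy_mmu {
	uint64_t cr3;		/* value the shadow MMU was built for */
	uint64_t pdptrs[4];	/* PDPTE snapshot from the last full load */
};

/* Same CR3 plus unchanged PDPTEs: the cached translation is still
 * valid, so only the TLB needs flushing. Anything else forces the
 * slow path set_cr3() continues with. */
static bool tlb_flush_suffices(const struct toy_mmu *m, uint64_t new_cr3,
			       const uint64_t *guest_pdpt)
{
	return new_cr3 == m->cr3 &&
	       memcmp(guest_pdpt, m->pdptrs, sizeof(m->pdptrs)) == 0;
}

int main(void)
{
	struct toy_mmu m = { 0, { 0 } };
	uint64_t p[4] = { 0 };

	return !tlb_flush_suffices(&m, 0, p);	/* exits 0: fast path taken */
}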
@@ -587,13 +380,13 @@ void set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) | |||
587 | { | 380 | { |
588 | if (cr8 & CR8_RESERVED_BITS) { | 381 | if (cr8 & CR8_RESERVED_BITS) { |
589 | printk(KERN_DEBUG "set_cr8: #GP, reserved bits 0x%lx\n", cr8); | 382 | printk(KERN_DEBUG "set_cr8: #GP, reserved bits 0x%lx\n", cr8); |
590 | inject_gp(vcpu); | 383 | kvm_inject_gp(vcpu, 0); |
591 | return; | 384 | return; |
592 | } | 385 | } |
593 | if (irqchip_in_kernel(vcpu->kvm)) | 386 | if (irqchip_in_kernel(vcpu->kvm)) |
594 | kvm_lapic_set_tpr(vcpu, cr8); | 387 | kvm_lapic_set_tpr(vcpu, cr8); |
595 | else | 388 | else |
596 | vcpu->cr8 = cr8; | 389 | vcpu->arch.cr8 = cr8; |
597 | } | 390 | } |
598 | EXPORT_SYMBOL_GPL(set_cr8); | 391 | EXPORT_SYMBOL_GPL(set_cr8); |
599 | 392 | ||
@@ -602,210 +395,846 @@ unsigned long get_cr8(struct kvm_vcpu *vcpu) | |||
602 | if (irqchip_in_kernel(vcpu->kvm)) | 395 | if (irqchip_in_kernel(vcpu->kvm)) |
603 | return kvm_lapic_get_cr8(vcpu); | 396 | return kvm_lapic_get_cr8(vcpu); |
604 | else | 397 | else |
605 | return vcpu->cr8; | 398 | return vcpu->arch.cr8; |
606 | } | 399 | } |
607 | EXPORT_SYMBOL_GPL(get_cr8); | 400 | EXPORT_SYMBOL_GPL(get_cr8); |
608 | 401 | ||
609 | u64 kvm_get_apic_base(struct kvm_vcpu *vcpu) | 402 | /* |
403 | * List of msr numbers which we expose to userspace through KVM_GET_MSRS | ||
404 | * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST. | ||
405 | * | ||
406 | * This list is modified at module load time to reflect the | ||
407 | * capabilities of the host cpu. | ||
408 | */ | ||
409 | static u32 msrs_to_save[] = { | ||
410 | MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, | ||
411 | MSR_K6_STAR, | ||
412 | #ifdef CONFIG_X86_64 | ||
413 | MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, | ||
414 | #endif | ||
415 | MSR_IA32_TIME_STAMP_COUNTER, | ||
416 | }; | ||
417 | |||
418 | static unsigned num_msrs_to_save; | ||
419 | |||
420 | static u32 emulated_msrs[] = { | ||
421 | MSR_IA32_MISC_ENABLE, | ||
422 | }; | ||
423 | |||
424 | #ifdef CONFIG_X86_64 | ||
425 | |||
426 | static void set_efer(struct kvm_vcpu *vcpu, u64 efer) | ||
610 | { | 427 | { |
611 | if (irqchip_in_kernel(vcpu->kvm)) | 428 | if (efer & EFER_RESERVED_BITS) { |
612 | return vcpu->apic_base; | 429 | printk(KERN_DEBUG "set_efer: 0x%llx #GP, reserved bits\n", |
613 | else | 430 | efer); |
614 | return vcpu->apic_base; | 431 | kvm_inject_gp(vcpu, 0); |
432 | return; | ||
433 | } | ||
434 | |||
435 | if (is_paging(vcpu) | ||
436 | && (vcpu->arch.shadow_efer & EFER_LME) != (efer & EFER_LME)) { | ||
437 | printk(KERN_DEBUG "set_efer: #GP, change LME while paging\n"); | ||
438 | kvm_inject_gp(vcpu, 0); | ||
439 | return; | ||
440 | } | ||
441 | |||
442 | kvm_x86_ops->set_efer(vcpu, efer); | ||
443 | |||
444 | efer &= ~EFER_LMA; | ||
445 | efer |= vcpu->arch.shadow_efer & EFER_LMA; | ||
446 | |||
447 | vcpu->arch.shadow_efer = efer; | ||
615 | } | 448 | } |
616 | EXPORT_SYMBOL_GPL(kvm_get_apic_base); | ||
617 | 449 | ||
618 | void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data) | 450 | #endif |
451 | |||
452 | /* | ||
453 | * Writes msr value into the appropriate "register". | ||
454 | * Returns 0 on success, non-0 otherwise. | ||
455 | * Assumes vcpu_load() was already called. | ||
456 | */ | ||
457 | int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) | ||
619 | { | 458 | { |
620 | /* TODO: reserve bits check */ | 459 | return kvm_x86_ops->set_msr(vcpu, msr_index, data); |
621 | if (irqchip_in_kernel(vcpu->kvm)) | ||
622 | kvm_lapic_set_base(vcpu, data); | ||
623 | else | ||
624 | vcpu->apic_base = data; | ||
625 | } | 460 | } |
626 | EXPORT_SYMBOL_GPL(kvm_set_apic_base); | ||
627 | 461 | ||
628 | void fx_init(struct kvm_vcpu *vcpu) | 462 | /* |
463 | * Adapt set_msr() to msr_io()'s calling convention | ||
464 | */ | ||
465 | static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) | ||
629 | { | 466 | { |
630 | unsigned after_mxcsr_mask; | 467 | return kvm_set_msr(vcpu, index, *data); |
468 | } | ||
631 | 469 | ||
632 | /* Initialize guest FPU by resetting ours and saving into guest's */ | ||
633 | preempt_disable(); | ||
634 | fx_save(&vcpu->host_fx_image); | ||
635 | fpu_init(); | ||
636 | fx_save(&vcpu->guest_fx_image); | ||
637 | fx_restore(&vcpu->host_fx_image); | ||
638 | preempt_enable(); | ||
639 | 470 | ||
640 | vcpu->cr0 |= X86_CR0_ET; | 471 | int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) |
641 | after_mxcsr_mask = offsetof(struct i387_fxsave_struct, st_space); | 472 | { |
642 | vcpu->guest_fx_image.mxcsr = 0x1f80; | 473 | switch (msr) { |
643 | memset((void *)&vcpu->guest_fx_image + after_mxcsr_mask, | 474 | #ifdef CONFIG_X86_64 |
644 | 0, sizeof(struct i387_fxsave_struct) - after_mxcsr_mask); | 475 | case MSR_EFER: |
476 | set_efer(vcpu, data); | ||
477 | break; | ||
478 | #endif | ||
479 | case MSR_IA32_MC0_STATUS: | ||
480 | pr_unimpl(vcpu, "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n", | ||
481 | __FUNCTION__, data); | ||
482 | break; | ||
483 | case MSR_IA32_MCG_STATUS: | ||
484 | pr_unimpl(vcpu, "%s: MSR_IA32_MCG_STATUS 0x%llx, nop\n", | ||
485 | __FUNCTION__, data); | ||
486 | break; | ||
487 | case MSR_IA32_UCODE_REV: | ||
488 | case MSR_IA32_UCODE_WRITE: | ||
489 | case 0x200 ... 0x2ff: /* MTRRs */ | ||
490 | break; | ||
491 | case MSR_IA32_APICBASE: | ||
492 | kvm_set_apic_base(vcpu, data); | ||
493 | break; | ||
494 | case MSR_IA32_MISC_ENABLE: | ||
495 | vcpu->arch.ia32_misc_enable_msr = data; | ||
496 | break; | ||
497 | default: | ||
498 | pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", msr, data); | ||
499 | return 1; | ||
500 | } | ||
501 | return 0; | ||
645 | } | 502 | } |
646 | EXPORT_SYMBOL_GPL(fx_init); | 503 | EXPORT_SYMBOL_GPL(kvm_set_msr_common); |
504 | |||
505 | |||
506 | /* | ||
507 | * Reads an msr value (of 'msr_index') into 'pdata'. | ||
508 | * Returns 0 on success, non-0 otherwise. | ||
509 | * Assumes vcpu_load() was already called. | ||
510 | */ | ||
511 | int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) | ||
512 | { | ||
513 | return kvm_x86_ops->get_msr(vcpu, msr_index, pdata); | ||
514 | } | ||
515 | |||
516 | int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) | ||
517 | { | ||
518 | u64 data; | ||
519 | |||
520 | switch (msr) { | ||
521 | case 0xc0010010: /* SYSCFG */ | ||
522 | case 0xc0010015: /* HWCR */ | ||
523 | case MSR_IA32_PLATFORM_ID: | ||
524 | case MSR_IA32_P5_MC_ADDR: | ||
525 | case MSR_IA32_P5_MC_TYPE: | ||
526 | case MSR_IA32_MC0_CTL: | ||
527 | case MSR_IA32_MCG_STATUS: | ||
528 | case MSR_IA32_MCG_CAP: | ||
529 | case MSR_IA32_MC0_MISC: | ||
530 | case MSR_IA32_MC0_MISC+4: | ||
531 | case MSR_IA32_MC0_MISC+8: | ||
532 | case MSR_IA32_MC0_MISC+12: | ||
533 | case MSR_IA32_MC0_MISC+16: | ||
534 | case MSR_IA32_UCODE_REV: | ||
535 | case MSR_IA32_PERF_STATUS: | ||
536 | case MSR_IA32_EBL_CR_POWERON: | ||
537 | /* MTRR registers */ | ||
538 | case 0xfe: | ||
539 | case 0x200 ... 0x2ff: | ||
540 | data = 0; | ||
541 | break; | ||
542 | case 0xcd: /* fsb frequency */ | ||
543 | data = 3; | ||
544 | break; | ||
545 | case MSR_IA32_APICBASE: | ||
546 | data = kvm_get_apic_base(vcpu); | ||
547 | break; | ||
548 | case MSR_IA32_MISC_ENABLE: | ||
549 | data = vcpu->arch.ia32_misc_enable_msr; | ||
550 | break; | ||
551 | #ifdef CONFIG_X86_64 | ||
552 | case MSR_EFER: | ||
553 | data = vcpu->arch.shadow_efer; | ||
554 | break; | ||
555 | #endif | ||
556 | default: | ||
557 | pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr); | ||
558 | return 1; | ||
559 | } | ||
560 | *pdata = data; | ||
561 | return 0; | ||
562 | } | ||
563 | EXPORT_SYMBOL_GPL(kvm_get_msr_common); | ||
647 | 564 | ||
648 | /* | 565 | /* |
649 | * Allocate some memory and give it an address in the guest physical address | 566 | * Read or write a bunch of msrs. All parameters are kernel addresses. |
650 | * space. | ||
651 | * | 567 | * |
652 | * Discontiguous memory is allowed, mostly for framebuffers. | 568 | * @return number of msrs set successfully. |
653 | */ | 569 | */ |
654 | static int kvm_vm_ioctl_set_memory_region(struct kvm *kvm, | 570 | static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs, |
655 | struct kvm_memory_region *mem) | 571 | struct kvm_msr_entry *entries, |
572 | int (*do_msr)(struct kvm_vcpu *vcpu, | ||
573 | unsigned index, u64 *data)) | ||
656 | { | 574 | { |
657 | int r; | 575 | int i; |
658 | gfn_t base_gfn; | ||
659 | unsigned long npages; | ||
660 | unsigned long i; | ||
661 | struct kvm_memory_slot *memslot; | ||
662 | struct kvm_memory_slot old, new; | ||
663 | 576 | ||
664 | r = -EINVAL; | 577 | vcpu_load(vcpu); |
665 | /* General sanity checks */ | 578 | |
666 | if (mem->memory_size & (PAGE_SIZE - 1)) | 579 | for (i = 0; i < msrs->nmsrs; ++i) |
667 | goto out; | 580 | if (do_msr(vcpu, entries[i].index, &entries[i].data)) |
668 | if (mem->guest_phys_addr & (PAGE_SIZE - 1)) | 581 | break; |
582 | |||
583 | vcpu_put(vcpu); | ||
584 | |||
585 | return i; | ||
586 | } | ||
587 | |||
588 | /* | ||
589 | * Read or write a bunch of msrs. Parameters are user addresses. | ||
590 | * | ||
591 | * @return number of msrs set successfully. | ||
592 | */ | ||
593 | static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs, | ||
594 | int (*do_msr)(struct kvm_vcpu *vcpu, | ||
595 | unsigned index, u64 *data), | ||
596 | int writeback) | ||
597 | { | ||
598 | struct kvm_msrs msrs; | ||
599 | struct kvm_msr_entry *entries; | ||
600 | int r, n; | ||
601 | unsigned size; | ||
602 | |||
603 | r = -EFAULT; | ||
604 | if (copy_from_user(&msrs, user_msrs, sizeof msrs)) | ||
669 | goto out; | 605 | goto out; |
670 | if (mem->slot >= KVM_MEMORY_SLOTS) | 606 | |
607 | r = -E2BIG; | ||
608 | if (msrs.nmsrs >= MAX_IO_MSRS) | ||
671 | goto out; | 609 | goto out; |
672 | if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr) | 610 | |
611 | r = -ENOMEM; | ||
612 | size = sizeof(struct kvm_msr_entry) * msrs.nmsrs; | ||
613 | entries = vmalloc(size); | ||
614 | if (!entries) | ||
673 | goto out; | 615 | goto out; |
674 | 616 | ||
675 | memslot = &kvm->memslots[mem->slot]; | 617 | r = -EFAULT; |
676 | base_gfn = mem->guest_phys_addr >> PAGE_SHIFT; | 618 | if (copy_from_user(entries, user_msrs->entries, size)) |
677 | npages = mem->memory_size >> PAGE_SHIFT; | 619 | goto out_free; |
678 | 620 | ||
679 | if (!npages) | 621 | r = n = __msr_io(vcpu, &msrs, entries, do_msr); |
680 | mem->flags &= ~KVM_MEM_LOG_DIRTY_PAGES; | 622 | if (r < 0) |
623 | goto out_free; | ||
681 | 624 | ||
682 | mutex_lock(&kvm->lock); | 625 | r = -EFAULT; |
626 | if (writeback && copy_to_user(user_msrs->entries, entries, size)) | ||
627 | goto out_free; | ||
683 | 628 | ||
684 | new = old = *memslot; | 629 | r = n; |
685 | 630 | ||
686 | new.base_gfn = base_gfn; | 631 | out_free: |
687 | new.npages = npages; | 632 | vfree(entries); |
688 | new.flags = mem->flags; | 633 | out: |
634 | return r; | ||
635 | } | ||
689 | 636 | ||
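msr_io() expects a struct kvm_msrs header immediately followed by the entry array in one user buffer, and returns the number of MSRs processed rather than 0 on success. A sketch of the matching userspace call for a single MSR; vcpu_fd is assumed to come from KVM_CREATE_VCPU elsewhere, and set_one_msr() is an illustrative name, not a real API:

#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Returns the number of MSRs processed (1 on success), mirroring the
 * __msr_io() return convention above. */
static int set_one_msr(int vcpu_fd, uint32_t index, uint64_t value)
{
	struct {
		struct kvm_msrs hdr;
		struct kvm_msr_entry entry;	/* becomes hdr.entries[0] */
	} buf;

	memset(&buf, 0, sizeof(buf));
	buf.hdr.nmsrs = 1;
	buf.entry.index = index;
	buf.entry.data = value;
	return ioctl(vcpu_fd, KVM_SET_MSRS, &buf);
}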
690 | /* Disallow changing a memory slot's size. */ | 637 | /* |
691 | r = -EINVAL; | 638 | * Make sure that a cpu that is being hot-unplugged does not have any vcpus |
692 | if (npages && old.npages && npages != old.npages) | 639 | * cached on it. |
693 | goto out_unlock; | 640 | */ |
641 | void decache_vcpus_on_cpu(int cpu) | ||
642 | { | ||
643 | struct kvm *vm; | ||
644 | struct kvm_vcpu *vcpu; | ||
645 | int i; | ||
694 | 646 | ||
695 | /* Check for overlaps */ | 647 | spin_lock(&kvm_lock); |
696 | r = -EEXIST; | 648 | list_for_each_entry(vm, &vm_list, vm_list) |
697 | for (i = 0; i < KVM_MEMORY_SLOTS; ++i) { | 649 | for (i = 0; i < KVM_MAX_VCPUS; ++i) { |
698 | struct kvm_memory_slot *s = &kvm->memslots[i]; | 650 | vcpu = vm->vcpus[i]; |
651 | if (!vcpu) | ||
652 | continue; | ||
653 | /* | ||
654 | * If the vcpu is locked, then it is running on some | ||
655 | * other cpu and therefore it is not cached on the | ||
656 | * cpu in question. | ||
657 | * | ||
658 | * If it's not locked, check the last cpu it executed | ||
659 | * on. | ||
660 | */ | ||
661 | if (mutex_trylock(&vcpu->mutex)) { | ||
662 | if (vcpu->cpu == cpu) { | ||
663 | kvm_x86_ops->vcpu_decache(vcpu); | ||
664 | vcpu->cpu = -1; | ||
665 | } | ||
666 | mutex_unlock(&vcpu->mutex); | ||
667 | } | ||
668 | } | ||
669 | spin_unlock(&kvm_lock); | ||
670 | } | ||
699 | 671 | ||
700 | if (s == memslot) | 672 | int kvm_dev_ioctl_check_extension(long ext) |
701 | continue; | 673 | { |
702 | if (!((base_gfn + npages <= s->base_gfn) || | 674 | int r; |
703 | (base_gfn >= s->base_gfn + s->npages))) | 675 | |
704 | goto out_unlock; | 676 | switch (ext) { |
677 | case KVM_CAP_IRQCHIP: | ||
678 | case KVM_CAP_HLT: | ||
679 | case KVM_CAP_MMU_SHADOW_CACHE_CONTROL: | ||
680 | case KVM_CAP_USER_MEMORY: | ||
681 | case KVM_CAP_SET_TSS_ADDR: | ||
682 | case KVM_CAP_EXT_CPUID: | ||
683 | r = 1; | ||
684 | break; | ||
685 | case KVM_CAP_VAPIC: | ||
686 | r = !kvm_x86_ops->cpu_has_accelerated_tpr(); | ||
687 | break; | ||
688 | default: | ||
689 | r = 0; | ||
690 | break; | ||
705 | } | 691 | } |
692 | return r; | ||
706 | 693 | ||
707 | /* Deallocate if slot is being removed */ | 694 | } |
708 | if (!npages) | ||
709 | new.phys_mem = NULL; | ||
710 | 695 | ||
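kvm_dev_ioctl_check_extension() above is the backend of the KVM_CHECK_EXTENSION ioctl: 0 means the capability is absent, nonzero (usually 1) means present. A minimal userspace probe, runnable as-is on a host with /dev/kvm:

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

int main(void)
{
	int fd = open("/dev/kvm", O_RDWR);

	if (fd < 0) {
		perror("/dev/kvm");
		return 1;
	}
	printf("irqchip: %d\n", ioctl(fd, KVM_CHECK_EXTENSION, KVM_CAP_IRQCHIP));
	printf("set_tss_addr: %d\n", ioctl(fd, KVM_CHECK_EXTENSION, KVM_CAP_SET_TSS_ADDR));
	return 0;
}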
711 | /* Free page dirty bitmap if unneeded */ | 696 | long kvm_arch_dev_ioctl(struct file *filp, |
712 | if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES)) | 697 | unsigned int ioctl, unsigned long arg) |
713 | new.dirty_bitmap = NULL; | 698 | { |
699 | void __user *argp = (void __user *)arg; | ||
700 | long r; | ||
714 | 701 | ||
715 | r = -ENOMEM; | 702 | switch (ioctl) { |
703 | case KVM_GET_MSR_INDEX_LIST: { | ||
704 | struct kvm_msr_list __user *user_msr_list = argp; | ||
705 | struct kvm_msr_list msr_list; | ||
706 | unsigned n; | ||
716 | 707 | ||
717 | /* Allocate if a slot is being created */ | 708 | r = -EFAULT; |
718 | if (npages && !new.phys_mem) { | 709 | if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list)) |
719 | new.phys_mem = vmalloc(npages * sizeof(struct page *)); | 710 | goto out; |
711 | n = msr_list.nmsrs; | ||
712 | msr_list.nmsrs = num_msrs_to_save + ARRAY_SIZE(emulated_msrs); | ||
713 | if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list)) | ||
714 | goto out; | ||
715 | r = -E2BIG; | ||
716 | if (n < num_msrs_to_save) | ||
717 | goto out; | ||
718 | r = -EFAULT; | ||
719 | if (copy_to_user(user_msr_list->indices, &msrs_to_save, | ||
720 | num_msrs_to_save * sizeof(u32))) | ||
721 | goto out; | ||
722 | if (copy_to_user(user_msr_list->indices | ||
723 | + num_msrs_to_save * sizeof(u32), | ||
724 | &emulated_msrs, | ||
725 | ARRAY_SIZE(emulated_msrs) * sizeof(u32))) | ||
726 | goto out; | ||
727 | r = 0; | ||
728 | break; | ||
729 | } | ||
730 | default: | ||
731 | r = -EINVAL; | ||
732 | } | ||
733 | out: | ||
734 | return r; | ||
735 | } | ||
736 | |||
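The E2BIG dance in the KVM_GET_MSR_INDEX_LIST handler above implies a two-call pattern in userspace: probe with nmsrs = 0 to learn the count (the kernel writes the real total back into the header before failing), then call again with a big-enough buffer. A sketch:

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

int main(void)
{
	struct kvm_msr_list probe = { .nmsrs = 0 }, *list;
	int fd = open("/dev/kvm", O_RDWR);
	unsigned i;

	ioctl(fd, KVM_GET_MSR_INDEX_LIST, &probe);	/* fails with E2BIG... */
	/* ...but probe.nmsrs now holds the real count. */
	list = calloc(1, sizeof(*list) + probe.nmsrs * sizeof(list->indices[0]));
	list->nmsrs = probe.nmsrs;
	if (ioctl(fd, KVM_GET_MSR_INDEX_LIST, list) < 0) {
		perror("KVM_GET_MSR_INDEX_LIST");
		return 1;
	}
	for (i = 0; i < list->nmsrs; ++i)
		printf("msr %#x\n", list->indices[i]);
	return 0;
}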
737 | void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) | ||
738 | { | ||
739 | kvm_x86_ops->vcpu_load(vcpu, cpu); | ||
740 | } | ||
720 | 741 | ||
721 | if (!new.phys_mem) | 742 | void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) |
722 | goto out_unlock; | 743 | { |
744 | kvm_x86_ops->vcpu_put(vcpu); | ||
745 | kvm_put_guest_fpu(vcpu); | ||
746 | } | ||
723 | 747 | ||
724 | memset(new.phys_mem, 0, npages * sizeof(struct page *)); | 748 | static int is_efer_nx(void) |
725 | for (i = 0; i < npages; ++i) { | 749 | { |
726 | new.phys_mem[i] = alloc_page(GFP_HIGHUSER | 750 | u64 efer; |
727 | | __GFP_ZERO); | 751 | |
728 | if (!new.phys_mem[i]) | 752 | rdmsrl(MSR_EFER, efer); |
729 | goto out_unlock; | 753 | return efer & EFER_NX; |
730 | set_page_private(new.phys_mem[i],0); | 754 | } |
731 | } | ||
732 | } | ||
733 | 755 | ||
734 | /* Allocate page dirty bitmap if needed */ | 756 | static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu) |
735 | if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) { | 757 | { |
736 | unsigned dirty_bytes = ALIGN(npages, BITS_PER_LONG) / 8; | 758 | int i; |
759 | struct kvm_cpuid_entry2 *e, *entry; | ||
737 | 760 | ||
738 | new.dirty_bitmap = vmalloc(dirty_bytes); | 761 | entry = NULL; |
739 | if (!new.dirty_bitmap) | 762 | for (i = 0; i < vcpu->arch.cpuid_nent; ++i) { |
740 | goto out_unlock; | 763 | e = &vcpu->arch.cpuid_entries[i]; |
741 | memset(new.dirty_bitmap, 0, dirty_bytes); | 764 | if (e->function == 0x80000001) { |
765 | entry = e; | ||
766 | break; | ||
767 | } | ||
742 | } | 768 | } |
769 | if (entry && (entry->edx & (1 << 20)) && !is_efer_nx()) { | ||
770 | entry->edx &= ~(1 << 20); | ||
771 | printk(KERN_INFO "kvm: guest NX capability removed\n"); | ||
772 | } | ||
773 | } | ||
743 | 774 | ||
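cpuid_fix_nx_cap() masks bit 20 of CPUID.80000001h:EDX (the NX flag) out of the guest's table when the host runs with EFER.NX clear, presumably because a guest that saw NX advertised could set page-table bits the host configuration would fault on; the fixup strips the bit and logs it. A toy version of the same test (EFER.NXE is bit 11; fix_nx() is an illustrative name):

#include <stdint.h>
#include <stdio.h>

#define NX_BIT	20	/* CPUID.80000001h:EDX bit 20 */
#define EFER_NX	(1ull << 11)

/* Given the guest's 0x80000001 EDX leaf and the host EFER value,
 * strip NX exactly like cpuid_fix_nx_cap() does. */
static uint32_t fix_nx(uint32_t guest_edx, uint64_t host_efer)
{
	if ((guest_edx & (1u << NX_BIT)) && !(host_efer & EFER_NX))
		guest_edx &= ~(1u << NX_BIT);
	return guest_edx;
}

int main(void)
{
	printf("%#x\n", fix_nx(1u << NX_BIT, 0));	/* host NX off: stripped */
	printf("%#x\n", fix_nx(1u << NX_BIT, EFER_NX));	/* host NX on: kept */
	return 0;
}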
744 | if (mem->slot >= kvm->nmemslots) | 775 | /* when an old userspace process fills a new kernel module */ |
745 | kvm->nmemslots = mem->slot + 1; | 776 | static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, |
777 | struct kvm_cpuid *cpuid, | ||
778 | struct kvm_cpuid_entry __user *entries) | ||
779 | { | ||
780 | int r, i; | ||
781 | struct kvm_cpuid_entry *cpuid_entries; | ||
746 | 782 | ||
747 | *memslot = new; | 783 | r = -E2BIG; |
784 | if (cpuid->nent > KVM_MAX_CPUID_ENTRIES) | ||
785 | goto out; | ||
786 | r = -ENOMEM; | ||
787 | cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry) * cpuid->nent); | ||
788 | if (!cpuid_entries) | ||
789 | goto out; | ||
790 | r = -EFAULT; | ||
791 | if (copy_from_user(cpuid_entries, entries, | ||
792 | cpuid->nent * sizeof(struct kvm_cpuid_entry))) | ||
793 | goto out_free; | ||
794 | for (i = 0; i < cpuid->nent; i++) { | ||
795 | vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function; | ||
796 | vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax; | ||
797 | vcpu->arch.cpuid_entries[i].ebx = cpuid_entries[i].ebx; | ||
798 | vcpu->arch.cpuid_entries[i].ecx = cpuid_entries[i].ecx; | ||
799 | vcpu->arch.cpuid_entries[i].edx = cpuid_entries[i].edx; | ||
800 | vcpu->arch.cpuid_entries[i].index = 0; | ||
801 | vcpu->arch.cpuid_entries[i].flags = 0; | ||
802 | vcpu->arch.cpuid_entries[i].padding[0] = 0; | ||
803 | vcpu->arch.cpuid_entries[i].padding[1] = 0; | ||
804 | vcpu->arch.cpuid_entries[i].padding[2] = 0; | ||
805 | } | ||
806 | vcpu->arch.cpuid_nent = cpuid->nent; | ||
807 | cpuid_fix_nx_cap(vcpu); | ||
808 | r = 0; | ||
748 | 809 | ||
749 | kvm_mmu_slot_remove_write_access(kvm, mem->slot); | 810 | out_free: |
750 | kvm_flush_remote_tlbs(kvm); | 811 | vfree(cpuid_entries); |
812 | out: | ||
813 | return r; | ||
814 | } | ||
751 | 815 | ||
752 | mutex_unlock(&kvm->lock); | 816 | static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu, |
817 | struct kvm_cpuid2 *cpuid, | ||
818 | struct kvm_cpuid_entry2 __user *entries) | ||
819 | { | ||
820 | int r; | ||
753 | 821 | ||
754 | kvm_free_physmem_slot(&old, &new); | 822 | r = -E2BIG; |
823 | if (cpuid->nent > KVM_MAX_CPUID_ENTRIES) | ||
824 | goto out; | ||
825 | r = -EFAULT; | ||
826 | if (copy_from_user(&vcpu->arch.cpuid_entries, entries, | ||
827 | cpuid->nent * sizeof(struct kvm_cpuid_entry2))) | ||
828 | goto out; | ||
829 | vcpu->arch.cpuid_nent = cpuid->nent; | ||
755 | return 0; | 830 | return 0; |
756 | 831 | ||
757 | out_unlock: | ||
758 | mutex_unlock(&kvm->lock); | ||
759 | kvm_free_physmem_slot(&new, &old); | ||
760 | out: | 832 | out: |
761 | return r; | 833 | return r; |
762 | } | 834 | } |
763 | 835 | ||
764 | /* | 836 | static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, |
765 | * Get (and clear) the dirty memory log for a memory slot. | 837 | struct kvm_cpuid2 *cpuid, |
766 | */ | 838 | struct kvm_cpuid_entry2 __user *entries) |
767 | static int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, | ||
768 | struct kvm_dirty_log *log) | ||
769 | { | 839 | { |
770 | struct kvm_memory_slot *memslot; | 840 | int r; |
771 | int r, i; | ||
772 | int n; | ||
773 | unsigned long any = 0; | ||
774 | |||
775 | mutex_lock(&kvm->lock); | ||
776 | 841 | ||
777 | r = -EINVAL; | 842 | r = -E2BIG; |
778 | if (log->slot >= KVM_MEMORY_SLOTS) | 843 | if (cpuid->nent < vcpu->arch.cpuid_nent) |
779 | goto out; | 844 | goto out; |
780 | 845 | r = -EFAULT; | |
781 | memslot = &kvm->memslots[log->slot]; | 846 | if (copy_to_user(entries, &vcpu->arch.cpuid_entries, |
782 | r = -ENOENT; | 847 | vcpu->arch.cpuid_nent * sizeof(struct kvm_cpuid_entry2))) |
783 | if (!memslot->dirty_bitmap) | ||
784 | goto out; | 848 | goto out; |
849 | return 0; | ||
850 | |||
851 | out: | ||
852 | cpuid->nent = vcpu->arch.cpuid_nent; | ||
853 | return r; | ||
854 | } | ||
855 | |||
856 | static inline u32 bit(int bitno) | ||
857 | { | ||
858 | return 1 << (bitno & 31); | ||
859 | } | ||
860 | |||
861 | static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function, | ||
862 | u32 index) | ||
863 | { | ||
864 | entry->function = function; | ||
865 | entry->index = index; | ||
866 | cpuid_count(entry->function, entry->index, | ||
867 | &entry->eax, &entry->ebx, &entry->ecx, &entry->edx); | ||
868 | entry->flags = 0; | ||
869 | } | ||
870 | |||
871 | static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, | ||
872 | u32 index, int *nent, int maxnent) | ||
873 | { | ||
874 | const u32 kvm_supported_word0_x86_features = bit(X86_FEATURE_FPU) | | ||
875 | bit(X86_FEATURE_VME) | bit(X86_FEATURE_DE) | | ||
876 | bit(X86_FEATURE_PSE) | bit(X86_FEATURE_TSC) | | ||
877 | bit(X86_FEATURE_MSR) | bit(X86_FEATURE_PAE) | | ||
878 | bit(X86_FEATURE_CX8) | bit(X86_FEATURE_APIC) | | ||
879 | bit(X86_FEATURE_SEP) | bit(X86_FEATURE_PGE) | | ||
880 | bit(X86_FEATURE_CMOV) | bit(X86_FEATURE_PSE36) | | ||
881 | bit(X86_FEATURE_CLFLSH) | bit(X86_FEATURE_MMX) | | ||
882 | bit(X86_FEATURE_FXSR) | bit(X86_FEATURE_XMM) | | ||
883 | bit(X86_FEATURE_XMM2) | bit(X86_FEATURE_SELFSNOOP); | ||
884 | const u32 kvm_supported_word1_x86_features = bit(X86_FEATURE_FPU) | | ||
885 | bit(X86_FEATURE_VME) | bit(X86_FEATURE_DE) | | ||
886 | bit(X86_FEATURE_PSE) | bit(X86_FEATURE_TSC) | | ||
887 | bit(X86_FEATURE_MSR) | bit(X86_FEATURE_PAE) | | ||
888 | bit(X86_FEATURE_CX8) | bit(X86_FEATURE_APIC) | | ||
889 | bit(X86_FEATURE_PGE) | | ||
890 | bit(X86_FEATURE_CMOV) | bit(X86_FEATURE_PSE36) | | ||
891 | bit(X86_FEATURE_MMX) | bit(X86_FEATURE_FXSR) | | ||
892 | bit(X86_FEATURE_SYSCALL) | | ||
893 | (bit(X86_FEATURE_NX) && is_efer_nx()) | | ||
894 | #ifdef CONFIG_X86_64 | ||
895 | bit(X86_FEATURE_LM) | | ||
896 | #endif | ||
897 | bit(X86_FEATURE_MMXEXT) | | ||
898 | bit(X86_FEATURE_3DNOWEXT) | | ||
899 | bit(X86_FEATURE_3DNOW); | ||
900 | const u32 kvm_supported_word3_x86_features = | ||
901 | bit(X86_FEATURE_XMM3) | bit(X86_FEATURE_CX16); | ||
902 | const u32 kvm_supported_word6_x86_features = | ||
903 | bit(X86_FEATURE_LAHF_LM) | bit(X86_FEATURE_CMP_LEGACY); | ||
904 | |||
905 | /* all func 2 cpuid_count() should be called on the same cpu */ | ||
906 | get_cpu(); | ||
907 | do_cpuid_1_ent(entry, function, index); | ||
908 | ++*nent; | ||
909 | |||
910 | switch (function) { | ||
911 | case 0: | ||
912 | entry->eax = min(entry->eax, (u32)0xb); | ||
913 | break; | ||
914 | case 1: | ||
915 | entry->edx &= kvm_supported_word0_x86_features; | ||
916 | entry->ecx &= kvm_supported_word3_x86_features; | ||
917 | break; | ||
918 | /* function 2 entries are STATEFUL. That is, repeated cpuid commands | ||
919 | * may return different values. This forces us to get_cpu() before | ||
920 | * issuing the first command, and also to emulate this annoying behavior | ||
921 | * in kvm_emulate_cpuid() using KVM_CPUID_FLAG_STATE_READ_NEXT */ | ||
922 | case 2: { | ||
923 | int t, times = entry->eax & 0xff; | ||
924 | |||
925 | entry->flags |= KVM_CPUID_FLAG_STATEFUL_FUNC; | ||
926 | for (t = 1; t < times && *nent < maxnent; ++t) { | ||
927 | do_cpuid_1_ent(&entry[t], function, 0); | ||
928 | entry[t].flags |= KVM_CPUID_FLAG_STATEFUL_FUNC; | ||
929 | ++*nent; | ||
930 | } | ||
931 | break; | ||
932 | } | ||
933 | /* function 4 and 0xb have additional index. */ | ||
934 | case 4: { | ||
935 | int index, cache_type; | ||
936 | |||
937 | entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; | ||
938 | /* read more entries until cache_type is zero */ | ||
939 | for (index = 1; *nent < maxnent; ++index) { | ||
940 | cache_type = entry[index - 1].eax & 0x1f; | ||
941 | if (!cache_type) | ||
942 | break; | ||
943 | do_cpuid_1_ent(&entry[index], function, index); | ||
944 | entry[index].flags |= | ||
945 | KVM_CPUID_FLAG_SIGNIFCANT_INDEX; | ||
946 | ++*nent; | ||
947 | } | ||
948 | break; | ||
949 | } | ||
950 | case 0xb: { | ||
951 | int index, level_type; | ||
952 | |||
953 | entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; | ||
954 | /* read more entries until level_type is zero */ | ||
955 | for (index = 1; *nent < maxnent; ++index) { | ||
956 | level_type = entry[index - 1].ecx & 0xff; | ||
957 | if (!level_type) | ||
958 | break; | ||
959 | do_cpuid_1_ent(&entry[index], function, index); | ||
960 | entry[index].flags |= | ||
961 | KVM_CPUID_FLAG_SIGNIFCANT_INDEX; | ||
962 | ++*nent; | ||
963 | } | ||
964 | break; | ||
965 | } | ||
966 | case 0x80000000: | ||
967 | entry->eax = min(entry->eax, 0x8000001a); | ||
968 | break; | ||
969 | case 0x80000001: | ||
970 | entry->edx &= kvm_supported_word1_x86_features; | ||
971 | entry->ecx &= kvm_supported_word6_x86_features; | ||
972 | break; | ||
973 | } | ||
974 | put_cpu(); | ||
975 | } | ||
785 | 976 | ||
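The leaf-2 statefulness the comment above warns about works like this: AL of the first CPUID.2 invocation gives the total number of times the instruction must be executed, and each pass may return different cache-descriptor bytes, which is why the loop re-runs do_cpuid_1_ent() times-1 more times on one pinned CPU. A standalone host-side demonstration using GCC inline asm on x86 (the cpuid2() helper is ad hoc, not kernel code):

#include <stdint.h>
#include <stdio.h>

static void cpuid2(uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx)
{
	*eax = 2;
	*ecx = 0;
	__asm__ volatile("cpuid"
			 : "+a"(*eax), "=b"(*ebx), "+c"(*ecx), "=d"(*edx));
}

int main(void)
{
	uint32_t a, b, c, d;
	int t, times;

	cpuid2(&a, &b, &c, &d);
	times = a & 0xff;	/* AL: total number of CPUID.2 invocations */
	printf("pass 1/%d: %08x %08x %08x %08x\n", times, a, b, c, d);
	for (t = 1; t < times; ++t) {
		cpuid2(&a, &b, &c, &d);
		printf("pass %d/%d: %08x %08x %08x %08x\n",
		       t + 1, times, a, b, c, d);
	}
	return 0;
}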
786 | n = ALIGN(memslot->npages, BITS_PER_LONG) / 8; | 977 | static int kvm_vm_ioctl_get_supported_cpuid(struct kvm *kvm, |
978 | struct kvm_cpuid2 *cpuid, | ||
979 | struct kvm_cpuid_entry2 __user *entries) | ||
980 | { | ||
981 | struct kvm_cpuid_entry2 *cpuid_entries; | ||
982 | int limit, nent = 0, r = -E2BIG; | ||
983 | u32 func; | ||
787 | 984 | ||
788 | for (i = 0; !any && i < n/sizeof(long); ++i) | 985 | if (cpuid->nent < 1) |
789 | any = memslot->dirty_bitmap[i]; | 986 | goto out; |
987 | r = -ENOMEM; | ||
988 | cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry2) * cpuid->nent); | ||
989 | if (!cpuid_entries) | ||
990 | goto out; | ||
790 | 991 | ||
992 | do_cpuid_ent(&cpuid_entries[0], 0, 0, &nent, cpuid->nent); | ||
993 | limit = cpuid_entries[0].eax; | ||
994 | for (func = 1; func <= limit && nent < cpuid->nent; ++func) | ||
995 | do_cpuid_ent(&cpuid_entries[nent], func, 0, | ||
996 | &nent, cpuid->nent); | ||
997 | r = -E2BIG; | ||
998 | if (nent >= cpuid->nent) | ||
999 | goto out_free; | ||
1000 | |||
1001 | do_cpuid_ent(&cpuid_entries[nent], 0x80000000, 0, &nent, cpuid->nent); | ||
1002 | limit = cpuid_entries[nent - 1].eax; | ||
1003 | for (func = 0x80000001; func <= limit && nent < cpuid->nent; ++func) | ||
1004 | do_cpuid_ent(&cpuid_entries[nent], func, 0, | ||
1005 | &nent, cpuid->nent); | ||
791 | r = -EFAULT; | 1006 | r = -EFAULT; |
792 | if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n)) | 1007 | if (copy_to_user(entries, cpuid_entries, |
793 | goto out; | 1008 | nent * sizeof(struct kvm_cpuid_entry2))) |
1009 | goto out_free; | ||
1010 | cpuid->nent = nent; | ||
1011 | r = 0; | ||
794 | 1012 | ||
795 | /* If nothing is dirty, don't bother messing with page tables. */ | 1013 | out_free: |
796 | if (any) { | 1014 | vfree(cpuid_entries); |
797 | kvm_mmu_slot_remove_write_access(kvm, log->slot); | 1015 | out: |
798 | kvm_flush_remote_tlbs(kvm); | 1016 | return r; |
799 | memset(memslot->dirty_bitmap, 0, n); | 1017 | } |
1018 | |||
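In this version of the code KVM_GET_SUPPORTED_CPUID is a VM ioctl (it is dispatched from kvm_arch_vm_ioctl() further down), taking a struct kvm_cpuid2 header followed by the caller's entry array. A userspace sketch with a fixed guess for nent; real code would grow the buffer and retry on E2BIG, and error handling is mostly elided:

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

int main(void)
{
	int kvm_fd = open("/dev/kvm", O_RDWR);
	int vm_fd = ioctl(kvm_fd, KVM_CREATE_VM, 0);
	struct kvm_cpuid2 *cpuid;
	unsigned i, nent = 64;	/* guess; retry bigger on E2BIG */

	cpuid = calloc(1, sizeof(*cpuid) + nent * sizeof(cpuid->entries[0]));
	cpuid->nent = nent;
	if (ioctl(vm_fd, KVM_GET_SUPPORTED_CPUID, cpuid) < 0) {
		perror("KVM_GET_SUPPORTED_CPUID");
		return 1;
	}
	for (i = 0; i < cpuid->nent; ++i)	/* kernel wrote back real nent */
		printf("func %#x idx %u: eax=%08x edx=%08x\n",
		       cpuid->entries[i].function, cpuid->entries[i].index,
		       cpuid->entries[i].eax, cpuid->entries[i].edx);
	return 0;
}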
1019 | static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, | ||
1020 | struct kvm_lapic_state *s) | ||
1021 | { | ||
1022 | vcpu_load(vcpu); | ||
1023 | memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s); | ||
1024 | vcpu_put(vcpu); | ||
1025 | |||
1026 | return 0; | ||
1027 | } | ||
1028 | |||
1029 | static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu, | ||
1030 | struct kvm_lapic_state *s) | ||
1031 | { | ||
1032 | vcpu_load(vcpu); | ||
1033 | memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s); | ||
1034 | kvm_apic_post_state_restore(vcpu); | ||
1035 | vcpu_put(vcpu); | ||
1036 | |||
1037 | return 0; | ||
1038 | } | ||
1039 | |||
1040 | static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, | ||
1041 | struct kvm_interrupt *irq) | ||
1042 | { | ||
1043 | if (irq->irq < 0 || irq->irq >= 256) | ||
1044 | return -EINVAL; | ||
1045 | if (irqchip_in_kernel(vcpu->kvm)) | ||
1046 | return -ENXIO; | ||
1047 | vcpu_load(vcpu); | ||
1048 | |||
1049 | set_bit(irq->irq, vcpu->arch.irq_pending); | ||
1050 | set_bit(irq->irq / BITS_PER_LONG, &vcpu->arch.irq_summary); | ||
1051 | |||
1052 | vcpu_put(vcpu); | ||
1053 | |||
1054 | return 0; | ||
1055 | } | ||
1056 | |||
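The pair of set_bit() calls in kvm_vcpu_ioctl_interrupt() maintains a two-level structure: irq_pending is the full 256-bit vector bitmap and irq_summary keeps one bit per word of it, so the delivery path can locate a pending vector with two find-first-set operations instead of scanning 256 bits. A toy model (64-bit words for simplicity; the kernel uses unsigned long and BITS_PER_LONG):

#include <stdint.h>
#include <stdio.h>

#define NR_VECTORS	256
#define BITS_PER_WORD	64

struct pending {
	uint64_t vec[NR_VECTORS / BITS_PER_WORD];	/* like irq_pending */
	uint64_t summary;				/* like irq_summary */
};

static void post_irq(struct pending *p, unsigned irq)
{
	p->vec[irq / BITS_PER_WORD] |= 1ull << (irq % BITS_PER_WORD);
	p->summary |= 1ull << (irq / BITS_PER_WORD);
}

static int lowest_pending(const struct pending *p)
{
	unsigned word;

	if (!p->summary)
		return -1;
	word = __builtin_ctzll(p->summary);	/* first non-empty word */
	return word * BITS_PER_WORD + __builtin_ctzll(p->vec[word]);
}

int main(void)
{
	struct pending p = { { 0 }, 0 };

	post_irq(&p, 32);
	post_irq(&p, 200);
	printf("lowest pending vector: %d\n", lowest_pending(&p));	/* 32 */
	return 0;
}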
1057 | static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu, | ||
1058 | struct kvm_tpr_access_ctl *tac) | ||
1059 | { | ||
1060 | if (tac->flags) | ||
1061 | return -EINVAL; | ||
1062 | vcpu->arch.tpr_access_reporting = !!tac->enabled; | ||
1063 | return 0; | ||
1064 | } | ||
1065 | |||
1066 | long kvm_arch_vcpu_ioctl(struct file *filp, | ||
1067 | unsigned int ioctl, unsigned long arg) | ||
1068 | { | ||
1069 | struct kvm_vcpu *vcpu = filp->private_data; | ||
1070 | void __user *argp = (void __user *)arg; | ||
1071 | int r; | ||
1072 | |||
1073 | switch (ioctl) { | ||
1074 | case KVM_GET_LAPIC: { | ||
1075 | struct kvm_lapic_state lapic; | ||
1076 | |||
1077 | memset(&lapic, 0, sizeof lapic); | ||
1078 | r = kvm_vcpu_ioctl_get_lapic(vcpu, &lapic); | ||
1079 | if (r) | ||
1080 | goto out; | ||
1081 | r = -EFAULT; | ||
1082 | if (copy_to_user(argp, &lapic, sizeof lapic)) | ||
1083 | goto out; | ||
1084 | r = 0; | ||
1085 | break; | ||
800 | } | 1086 | } |
1087 | case KVM_SET_LAPIC: { | ||
1088 | struct kvm_lapic_state lapic; | ||
801 | 1089 | ||
802 | r = 0; | 1090 | r = -EFAULT; |
1091 | if (copy_from_user(&lapic, argp, sizeof lapic)) | ||
1092 | goto out; | ||
1093 | r = kvm_vcpu_ioctl_set_lapic(vcpu, &lapic); | ||
1094 | if (r) | ||
1095 | goto out; | ||
1096 | r = 0; | ||
1097 | break; | ||
1098 | } | ||
1099 | case KVM_INTERRUPT: { | ||
1100 | struct kvm_interrupt irq; | ||
1101 | |||
1102 | r = -EFAULT; | ||
1103 | if (copy_from_user(&irq, argp, sizeof irq)) | ||
1104 | goto out; | ||
1105 | r = kvm_vcpu_ioctl_interrupt(vcpu, &irq); | ||
1106 | if (r) | ||
1107 | goto out; | ||
1108 | r = 0; | ||
1109 | break; | ||
1110 | } | ||
1111 | case KVM_SET_CPUID: { | ||
1112 | struct kvm_cpuid __user *cpuid_arg = argp; | ||
1113 | struct kvm_cpuid cpuid; | ||
803 | 1114 | ||
1115 | r = -EFAULT; | ||
1116 | if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) | ||
1117 | goto out; | ||
1118 | r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries); | ||
1119 | if (r) | ||
1120 | goto out; | ||
1121 | break; | ||
1122 | } | ||
1123 | case KVM_SET_CPUID2: { | ||
1124 | struct kvm_cpuid2 __user *cpuid_arg = argp; | ||
1125 | struct kvm_cpuid2 cpuid; | ||
1126 | |||
1127 | r = -EFAULT; | ||
1128 | if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) | ||
1129 | goto out; | ||
1130 | r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid, | ||
1131 | cpuid_arg->entries); | ||
1132 | if (r) | ||
1133 | goto out; | ||
1134 | break; | ||
1135 | } | ||
1136 | case KVM_GET_CPUID2: { | ||
1137 | struct kvm_cpuid2 __user *cpuid_arg = argp; | ||
1138 | struct kvm_cpuid2 cpuid; | ||
1139 | |||
1140 | r = -EFAULT; | ||
1141 | if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) | ||
1142 | goto out; | ||
1143 | r = kvm_vcpu_ioctl_get_cpuid2(vcpu, &cpuid, | ||
1144 | cpuid_arg->entries); | ||
1145 | if (r) | ||
1146 | goto out; | ||
1147 | r = -EFAULT; | ||
1148 | if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid)) | ||
1149 | goto out; | ||
1150 | r = 0; | ||
1151 | break; | ||
1152 | } | ||
1153 | case KVM_GET_MSRS: | ||
1154 | r = msr_io(vcpu, argp, kvm_get_msr, 1); | ||
1155 | break; | ||
1156 | case KVM_SET_MSRS: | ||
1157 | r = msr_io(vcpu, argp, do_set_msr, 0); | ||
1158 | break; | ||
1159 | case KVM_TPR_ACCESS_REPORTING: { | ||
1160 | struct kvm_tpr_access_ctl tac; | ||
1161 | |||
1162 | r = -EFAULT; | ||
1163 | if (copy_from_user(&tac, argp, sizeof tac)) | ||
1164 | goto out; | ||
1165 | r = vcpu_ioctl_tpr_access_reporting(vcpu, &tac); | ||
1166 | if (r) | ||
1167 | goto out; | ||
1168 | r = -EFAULT; | ||
1169 | if (copy_to_user(argp, &tac, sizeof tac)) | ||
1170 | goto out; | ||
1171 | r = 0; | ||
1172 | break; | ||
1173 | } | ||
1174 | case KVM_SET_VAPIC_ADDR: { | ||
1175 | struct kvm_vapic_addr va; | ||
1176 | |||
1177 | r = -EINVAL; | ||
1178 | if (!irqchip_in_kernel(vcpu->kvm)) | ||
1179 | goto out; | ||
1180 | r = -EFAULT; | ||
1181 | if (copy_from_user(&va, argp, sizeof va)) | ||
1182 | goto out; | ||
1183 | r = 0; | ||
1184 | kvm_lapic_set_vapic_addr(vcpu, va.vapic_addr); | ||
1185 | break; | ||
1186 | } | ||
1187 | default: | ||
1188 | r = -EINVAL; | ||
1189 | } | ||
804 | out: | 1190 | out: |
805 | mutex_unlock(&kvm->lock); | ||
806 | return r; | 1191 | return r; |
807 | } | 1192 | } |
808 | 1193 | ||
1194 | static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr) | ||
1195 | { | ||
1196 | int ret; | ||
1197 | |||
1198 | if (addr > (unsigned int)(-3 * PAGE_SIZE)) | ||
1199 | return -1; | ||
1200 | ret = kvm_x86_ops->set_tss_addr(kvm, addr); | ||
1201 | return ret; | ||
1202 | } | ||
1203 | |||
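The odd-looking bound in kvm_vm_ioctl_set_tss_addr() is (unsigned int)(-3 * PAGE_SIZE) = 0xffffd000: the address must leave room for three whole pages below the 4 GiB boundary, matching the three pages the VMX real-mode TSS setup occupies. A two-line check of the arithmetic:

#include <assert.h>
#include <stdint.h>

#define PAGE_SIZE 4096

int main(void)
{
	uint32_t limit = (uint32_t)(-3 * PAGE_SIZE);

	assert(limit == 0xffffd000u);	/* 4 GiB minus three pages */
	/* so addr must satisfy addr + 3*PAGE_SIZE <= 4 GiB */
	return 0;
}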
1204 | static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm, | ||
1205 | u32 kvm_nr_mmu_pages) | ||
1206 | { | ||
1207 | if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES) | ||
1208 | return -EINVAL; | ||
1209 | |||
1210 | down_write(&current->mm->mmap_sem); | ||
1211 | |||
1212 | kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages); | ||
1213 | kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages; | ||
1214 | |||
1215 | up_write(&current->mm->mmap_sem); | ||
1216 | return 0; | ||
1217 | } | ||
1218 | |||
1219 | static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm) | ||
1220 | { | ||
1221 | return kvm->arch.n_alloc_mmu_pages; | ||
1222 | } | ||
1223 | |||
1224 | gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn) | ||
1225 | { | ||
1226 | int i; | ||
1227 | struct kvm_mem_alias *alias; | ||
1228 | |||
1229 | for (i = 0; i < kvm->arch.naliases; ++i) { | ||
1230 | alias = &kvm->arch.aliases[i]; | ||
1231 | if (gfn >= alias->base_gfn | ||
1232 | && gfn < alias->base_gfn + alias->npages) | ||
1233 | return alias->target_gfn + gfn - alias->base_gfn; | ||
1234 | } | ||
1235 | return gfn; | ||
1236 | } | ||
1237 | |||
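unalias_gfn() is a plain linear remap: a gfn inside [base_gfn, base_gfn + npages) is shifted by (target_gfn - base_gfn), anything else passes through unchanged. A worked standalone example (toy_unalias() is illustrative, not kernel API):

#include <assert.h>

typedef unsigned long long gfn_t;

struct toy_alias {
	gfn_t base_gfn, npages, target_gfn;
};

static gfn_t toy_unalias(const struct toy_alias *a, gfn_t gfn)
{
	if (gfn >= a->base_gfn && gfn < a->base_gfn + a->npages)
		return a->target_gfn + gfn - a->base_gfn;
	return gfn;
}

int main(void)
{
	/* a 0x20-page window at gfn 0xa0 aliased onto gfn 0x100 */
	struct toy_alias win = { 0xa0, 0x20, 0x100 };

	assert(toy_unalias(&win, 0xb0) == 0x110);	/* inside: remapped */
	assert(toy_unalias(&win, 0x50) == 0x50);	/* outside: unchanged */
	return 0;
}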
809 | /* | 1238 | /* |
810 | * Set a new alias region. Aliases map a portion of physical memory into | 1239 | * Set a new alias region. Aliases map a portion of physical memory into |
811 | * another portion. This is useful for memory windows, for example the PC | 1240 | * another portion. This is useful for memory windows, for example the PC |
@@ -832,21 +1261,21 @@ static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm, | |||
832 | < alias->target_phys_addr) | 1261 | < alias->target_phys_addr) |
833 | goto out; | 1262 | goto out; |
834 | 1263 | ||
835 | mutex_lock(&kvm->lock); | 1264 | down_write(&current->mm->mmap_sem); |
836 | 1265 | ||
837 | p = &kvm->aliases[alias->slot]; | 1266 | p = &kvm->arch.aliases[alias->slot]; |
838 | p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT; | 1267 | p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT; |
839 | p->npages = alias->memory_size >> PAGE_SHIFT; | 1268 | p->npages = alias->memory_size >> PAGE_SHIFT; |
840 | p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT; | 1269 | p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT; |
841 | 1270 | ||
842 | for (n = KVM_ALIAS_SLOTS; n > 0; --n) | 1271 | for (n = KVM_ALIAS_SLOTS; n > 0; --n) |
843 | if (kvm->aliases[n - 1].npages) | 1272 | if (kvm->arch.aliases[n - 1].npages) |
844 | break; | 1273 | break; |
845 | kvm->naliases = n; | 1274 | kvm->arch.naliases = n; |
846 | 1275 | ||
847 | kvm_mmu_zap_all(kvm); | 1276 | kvm_mmu_zap_all(kvm); |
848 | 1277 | ||
849 | mutex_unlock(&kvm->lock); | 1278 | up_write(&current->mm->mmap_sem); |
850 | 1279 | ||
851 | return 0; | 1280 | return 0; |
852 | 1281 | ||
@@ -861,17 +1290,17 @@ static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) | |||
861 | r = 0; | 1290 | r = 0; |
862 | switch (chip->chip_id) { | 1291 | switch (chip->chip_id) { |
863 | case KVM_IRQCHIP_PIC_MASTER: | 1292 | case KVM_IRQCHIP_PIC_MASTER: |
864 | memcpy (&chip->chip.pic, | 1293 | memcpy(&chip->chip.pic, |
865 | &pic_irqchip(kvm)->pics[0], | 1294 | &pic_irqchip(kvm)->pics[0], |
866 | sizeof(struct kvm_pic_state)); | 1295 | sizeof(struct kvm_pic_state)); |
867 | break; | 1296 | break; |
868 | case KVM_IRQCHIP_PIC_SLAVE: | 1297 | case KVM_IRQCHIP_PIC_SLAVE: |
869 | memcpy (&chip->chip.pic, | 1298 | memcpy(&chip->chip.pic, |
870 | &pic_irqchip(kvm)->pics[1], | 1299 | &pic_irqchip(kvm)->pics[1], |
871 | sizeof(struct kvm_pic_state)); | 1300 | sizeof(struct kvm_pic_state)); |
872 | break; | 1301 | break; |
873 | case KVM_IRQCHIP_IOAPIC: | 1302 | case KVM_IRQCHIP_IOAPIC: |
874 | memcpy (&chip->chip.ioapic, | 1303 | memcpy(&chip->chip.ioapic, |
875 | ioapic_irqchip(kvm), | 1304 | ioapic_irqchip(kvm), |
876 | sizeof(struct kvm_ioapic_state)); | 1305 | sizeof(struct kvm_ioapic_state)); |
877 | break; | 1306 | break; |
@@ -889,17 +1318,17 @@ static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) | |||
889 | r = 0; | 1318 | r = 0; |
890 | switch (chip->chip_id) { | 1319 | switch (chip->chip_id) { |
891 | case KVM_IRQCHIP_PIC_MASTER: | 1320 | case KVM_IRQCHIP_PIC_MASTER: |
892 | memcpy (&pic_irqchip(kvm)->pics[0], | 1321 | memcpy(&pic_irqchip(kvm)->pics[0], |
893 | &chip->chip.pic, | 1322 | &chip->chip.pic, |
894 | sizeof(struct kvm_pic_state)); | 1323 | sizeof(struct kvm_pic_state)); |
895 | break; | 1324 | break; |
896 | case KVM_IRQCHIP_PIC_SLAVE: | 1325 | case KVM_IRQCHIP_PIC_SLAVE: |
897 | memcpy (&pic_irqchip(kvm)->pics[1], | 1326 | memcpy(&pic_irqchip(kvm)->pics[1], |
898 | &chip->chip.pic, | 1327 | &chip->chip.pic, |
899 | sizeof(struct kvm_pic_state)); | 1328 | sizeof(struct kvm_pic_state)); |
900 | break; | 1329 | break; |
901 | case KVM_IRQCHIP_IOAPIC: | 1330 | case KVM_IRQCHIP_IOAPIC: |
902 | memcpy (ioapic_irqchip(kvm), | 1331 | memcpy(ioapic_irqchip(kvm), |
903 | &chip->chip.ioapic, | 1332 | &chip->chip.ioapic, |
904 | sizeof(struct kvm_ioapic_state)); | 1333 | sizeof(struct kvm_ioapic_state)); |
905 | break; | 1334 | break; |
@@ -911,110 +1340,191 @@ static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) | |||
911 | return r; | 1340 | return r; |
912 | } | 1341 | } |
913 | 1342 | ||
914 | static gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn) | 1343 | /* |
1344 | * Get (and clear) the dirty memory log for a memory slot. | ||
1345 | */ | ||
1346 | int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, | ||
1347 | struct kvm_dirty_log *log) | ||
915 | { | 1348 | { |
916 | int i; | 1349 | int r; |
917 | struct kvm_mem_alias *alias; | 1350 | int n; |
918 | 1351 | struct kvm_memory_slot *memslot; | |
919 | for (i = 0; i < kvm->naliases; ++i) { | 1352 | int is_dirty = 0; |
920 | alias = &kvm->aliases[i]; | ||
921 | if (gfn >= alias->base_gfn | ||
922 | && gfn < alias->base_gfn + alias->npages) | ||
923 | return alias->target_gfn + gfn - alias->base_gfn; | ||
924 | } | ||
925 | return gfn; | ||
926 | } | ||
927 | 1353 | ||
928 | static struct kvm_memory_slot *__gfn_to_memslot(struct kvm *kvm, gfn_t gfn) | 1354 | down_write(&current->mm->mmap_sem); |
929 | { | ||
930 | int i; | ||
931 | 1355 | ||
932 | for (i = 0; i < kvm->nmemslots; ++i) { | 1356 | r = kvm_get_dirty_log(kvm, log, &is_dirty); |
933 | struct kvm_memory_slot *memslot = &kvm->memslots[i]; | 1357 | if (r) |
1358 | goto out; | ||
934 | 1359 | ||
935 | if (gfn >= memslot->base_gfn | 1360 | /* If nothing is dirty, don't bother messing with page tables. */ |
936 | && gfn < memslot->base_gfn + memslot->npages) | 1361 | if (is_dirty) { |
937 | return memslot; | 1362 | kvm_mmu_slot_remove_write_access(kvm, log->slot); |
1363 | kvm_flush_remote_tlbs(kvm); | ||
1364 | memslot = &kvm->memslots[log->slot]; | ||
1365 | n = ALIGN(memslot->npages, BITS_PER_LONG) / 8; | ||
1366 | memset(memslot->dirty_bitmap, 0, n); | ||
938 | } | 1367 | } |
939 | return NULL; | 1368 | r = 0; |
940 | } | 1369 | out: |
941 | 1370 | up_write(&current->mm->mmap_sem); | ||
942 | struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn) | 1371 | return r; |
943 | { | ||
944 | gfn = unalias_gfn(kvm, gfn); | ||
945 | return __gfn_to_memslot(kvm, gfn); | ||
946 | } | ||
947 | |||
948 | struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn) | ||
949 | { | ||
950 | struct kvm_memory_slot *slot; | ||
951 | |||
952 | gfn = unalias_gfn(kvm, gfn); | ||
953 | slot = __gfn_to_memslot(kvm, gfn); | ||
954 | if (!slot) | ||
955 | return NULL; | ||
956 | return slot->phys_mem[gfn - slot->base_gfn]; | ||
957 | } | 1372 | } |
958 | EXPORT_SYMBOL_GPL(gfn_to_page); | ||
959 | 1373 | ||
960 | /* WARNING: Does not work on aliased pages. */ | 1374 | long kvm_arch_vm_ioctl(struct file *filp, |
961 | void mark_page_dirty(struct kvm *kvm, gfn_t gfn) | 1375 | unsigned int ioctl, unsigned long arg) |
962 | { | 1376 | { |
963 | struct kvm_memory_slot *memslot; | 1377 | struct kvm *kvm = filp->private_data; |
1378 | void __user *argp = (void __user *)arg; | ||
1379 | int r = -EINVAL; | ||
964 | 1380 | ||
965 | memslot = __gfn_to_memslot(kvm, gfn); | 1381 | switch (ioctl) { |
966 | if (memslot && memslot->dirty_bitmap) { | 1382 | case KVM_SET_TSS_ADDR: |
967 | unsigned long rel_gfn = gfn - memslot->base_gfn; | 1383 | r = kvm_vm_ioctl_set_tss_addr(kvm, arg); |
1384 | if (r < 0) | ||
1385 | goto out; | ||
1386 | break; | ||
1387 | case KVM_SET_MEMORY_REGION: { | ||
1388 | struct kvm_memory_region kvm_mem; | ||
1389 | struct kvm_userspace_memory_region kvm_userspace_mem; | ||
968 | 1390 | ||
969 | /* avoid RMW */ | 1391 | r = -EFAULT; |
970 | if (!test_bit(rel_gfn, memslot->dirty_bitmap)) | 1392 | if (copy_from_user(&kvm_mem, argp, sizeof kvm_mem)) |
971 | set_bit(rel_gfn, memslot->dirty_bitmap); | 1393 | goto out; |
1394 | kvm_userspace_mem.slot = kvm_mem.slot; | ||
1395 | kvm_userspace_mem.flags = kvm_mem.flags; | ||
1396 | kvm_userspace_mem.guest_phys_addr = kvm_mem.guest_phys_addr; | ||
1397 | kvm_userspace_mem.memory_size = kvm_mem.memory_size; | ||
1398 | r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 0); | ||
1399 | if (r) | ||
1400 | goto out; | ||
1401 | break; | ||
972 | } | 1402 | } |
973 | } | 1403 | case KVM_SET_NR_MMU_PAGES: |
1404 | r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg); | ||
1405 | if (r) | ||
1406 | goto out; | ||
1407 | break; | ||
1408 | case KVM_GET_NR_MMU_PAGES: | ||
1409 | r = kvm_vm_ioctl_get_nr_mmu_pages(kvm); | ||
1410 | break; | ||
1411 | case KVM_SET_MEMORY_ALIAS: { | ||
1412 | struct kvm_memory_alias alias; | ||
974 | 1413 | ||
975 | int emulator_read_std(unsigned long addr, | 1414 | r = -EFAULT; |
976 | void *val, | 1415 | if (copy_from_user(&alias, argp, sizeof alias)) |
977 | unsigned int bytes, | 1416 | goto out; |
978 | struct kvm_vcpu *vcpu) | 1417 | r = kvm_vm_ioctl_set_memory_alias(kvm, &alias); |
979 | { | 1418 | if (r) |
980 | void *data = val; | 1419 | goto out; |
1420 | break; | ||
1421 | } | ||
1422 | case KVM_CREATE_IRQCHIP: | ||
1423 | r = -ENOMEM; | ||
1424 | kvm->arch.vpic = kvm_create_pic(kvm); | ||
1425 | if (kvm->arch.vpic) { | ||
1426 | r = kvm_ioapic_init(kvm); | ||
1427 | if (r) { | ||
1428 | kfree(kvm->arch.vpic); | ||
1429 | kvm->arch.vpic = NULL; | ||
1430 | goto out; | ||
1431 | } | ||
1432 | } else | ||
1433 | goto out; | ||
1434 | break; | ||
1435 | case KVM_IRQ_LINE: { | ||
1436 | struct kvm_irq_level irq_event; | ||
981 | 1437 | ||
982 | while (bytes) { | 1438 | r = -EFAULT; |
983 | gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr); | 1439 | if (copy_from_user(&irq_event, argp, sizeof irq_event)) |
984 | unsigned offset = addr & (PAGE_SIZE-1); | 1440 | goto out; |
985 | unsigned tocopy = min(bytes, (unsigned)PAGE_SIZE - offset); | 1441 | if (irqchip_in_kernel(kvm)) { |
986 | unsigned long pfn; | 1442 | mutex_lock(&kvm->lock); |
987 | struct page *page; | 1443 | if (irq_event.irq < 16) |
988 | void *page_virt; | 1444 | kvm_pic_set_irq(pic_irqchip(kvm), |
1445 | irq_event.irq, | ||
1446 | irq_event.level); | ||
1447 | kvm_ioapic_set_irq(kvm->arch.vioapic, | ||
1448 | irq_event.irq, | ||
1449 | irq_event.level); | ||
1450 | mutex_unlock(&kvm->lock); | ||
1451 | r = 0; | ||
1452 | } | ||
1453 | break; | ||
1454 | } | ||
1455 | case KVM_GET_IRQCHIP: { | ||
1456 | /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ | ||
1457 | struct kvm_irqchip chip; | ||
989 | 1458 | ||
990 | if (gpa == UNMAPPED_GVA) | 1459 | r = -EFAULT; |
991 | return X86EMUL_PROPAGATE_FAULT; | 1460 | if (copy_from_user(&chip, argp, sizeof chip)) |
992 | pfn = gpa >> PAGE_SHIFT; | 1461 | goto out; |
993 | page = gfn_to_page(vcpu->kvm, pfn); | 1462 | r = -ENXIO; |
994 | if (!page) | 1463 | if (!irqchip_in_kernel(kvm)) |
995 | return X86EMUL_UNHANDLEABLE; | 1464 | goto out; |
996 | page_virt = kmap_atomic(page, KM_USER0); | 1465 | r = kvm_vm_ioctl_get_irqchip(kvm, &chip); |
1466 | if (r) | ||
1467 | goto out; | ||
1468 | r = -EFAULT; | ||
1469 | if (copy_to_user(argp, &chip, sizeof chip)) | ||
1470 | goto out; | ||
1471 | r = 0; | ||
1472 | break; | ||
1473 | } | ||
1474 | case KVM_SET_IRQCHIP: { | ||
1475 | /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ | ||
1476 | struct kvm_irqchip chip; | ||
997 | 1477 | ||
998 | memcpy(data, page_virt + offset, tocopy); | 1478 | r = -EFAULT; |
1479 | if (copy_from_user(&chip, argp, sizeof chip)) | ||
1480 | goto out; | ||
1481 | r = -ENXIO; | ||
1482 | if (!irqchip_in_kernel(kvm)) | ||
1483 | goto out; | ||
1484 | r = kvm_vm_ioctl_set_irqchip(kvm, &chip); | ||
1485 | if (r) | ||
1486 | goto out; | ||
1487 | r = 0; | ||
1488 | break; | ||
1489 | } | ||
1490 | case KVM_GET_SUPPORTED_CPUID: { | ||
1491 | struct kvm_cpuid2 __user *cpuid_arg = argp; | ||
1492 | struct kvm_cpuid2 cpuid; | ||
999 | 1493 | ||
1000 | kunmap_atomic(page_virt, KM_USER0); | 1494 | r = -EFAULT; |
1495 | if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) | ||
1496 | goto out; | ||
1497 | r = kvm_vm_ioctl_get_supported_cpuid(kvm, &cpuid, | ||
1498 | cpuid_arg->entries); | ||
1499 | if (r) | ||
1500 | goto out; | ||
1001 | 1501 | ||
1002 | bytes -= tocopy; | 1502 | r = -EFAULT; |
1003 | data += tocopy; | 1503 | if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid)) |
1004 | addr += tocopy; | 1504 | goto out; |
1505 | r = 0; | ||
1506 | break; | ||
1005 | } | 1507 | } |
1006 | 1508 | default: | |
1007 | return X86EMUL_CONTINUE; | 1509 | ; |
1510 | } | ||
1511 | out: | ||
1512 | return r; | ||
1008 | } | 1513 | } |
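
The kvm_arch_vm_ioctl() dispatcher above is driven from userspace by plain ioctl() calls on the VM file descriptor. A minimal userspace sketch, assuming a vm_fd already obtained via KVM_CREATE_VM, with error handling abbreviated:

    #include <linux/kvm.h>
    #include <sys/ioctl.h>

    /* Create the in-kernel irqchip, then assert IRQ 4. With the in-kernel
     * irqchip, KVM_IRQ_LINE feeds both the PIC and the IOAPIC, mirroring
     * the KVM_IRQ_LINE case in kvm_arch_vm_ioctl() above. */
    static int raise_guest_irq(int vm_fd)
    {
            struct kvm_irq_level irq_event = { .irq = 4, .level = 1 };

            if (ioctl(vm_fd, KVM_CREATE_IRQCHIP, 0) < 0)
                    return -1;
            return ioctl(vm_fd, KVM_IRQ_LINE, &irq_event);
    }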
1009 | EXPORT_SYMBOL_GPL(emulator_read_std); | ||
1010 | 1514 | ||
1011 | static int emulator_write_std(unsigned long addr, | 1515 | static void kvm_init_msr_list(void) |
1012 | const void *val, | ||
1013 | unsigned int bytes, | ||
1014 | struct kvm_vcpu *vcpu) | ||
1015 | { | 1516 | { |
1016 | pr_unimpl(vcpu, "emulator_write_std: addr %lx n %d\n", addr, bytes); | 1517 | u32 dummy[2]; |
1017 | return X86EMUL_UNHANDLEABLE; | 1518 | unsigned i, j; |
1519 | |||
1520 | for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) { | ||
1521 | if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0) | ||
1522 | continue; | ||
1523 | if (j < i) | ||
1524 | msrs_to_save[j] = msrs_to_save[i]; | ||
1525 | j++; | ||
1526 | } | ||
1527 | num_msrs_to_save = j; | ||
1018 | } | 1528 | } |
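
kvm_init_msr_list() probes each candidate MSR with rdmsr_safe(), which catches the #GP an unimplemented MSR would raise, and compacts the survivors in place. The two-cursor filter idiom it uses generalizes; a standalone sketch:

    #include <stdint.h>

    /* Keep elements satisfying keep(), preserving order; returns the new
     * length. 'j' trails 'i' exactly as in kvm_init_msr_list() above. */
    static unsigned filter_in_place(uint32_t *v, unsigned n,
                                    int (*keep)(uint32_t))
    {
            unsigned i, j;

            for (i = j = 0; i < n; i++) {
                    if (!keep(v[i]))
                            continue;
                    if (j < i)
                            v[j] = v[i];
                    j++;
            }
            return j;
    }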
1019 | 1529 | ||
1020 | /* | 1530 | /* |
@@ -1025,14 +1535,15 @@ static struct kvm_io_device *vcpu_find_pervcpu_dev(struct kvm_vcpu *vcpu, | |||
1025 | { | 1535 | { |
1026 | struct kvm_io_device *dev; | 1536 | struct kvm_io_device *dev; |
1027 | 1537 | ||
1028 | if (vcpu->apic) { | 1538 | if (vcpu->arch.apic) { |
1029 | dev = &vcpu->apic->dev; | 1539 | dev = &vcpu->arch.apic->dev; |
1030 | if (dev->in_range(dev, addr)) | 1540 | if (dev->in_range(dev, addr)) |
1031 | return dev; | 1541 | return dev; |
1032 | } | 1542 | } |
1033 | return NULL; | 1543 | return NULL; |
1034 | } | 1544 | } |
1035 | 1545 | ||
1546 | |||
1036 | static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu, | 1547 | static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu, |
1037 | gpa_t addr) | 1548 | gpa_t addr) |
1038 | { | 1549 | { |
@@ -1044,11 +1555,40 @@ static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu, | |||
1044 | return dev; | 1555 | return dev; |
1045 | } | 1556 | } |
1046 | 1557 | ||
1047 | static struct kvm_io_device *vcpu_find_pio_dev(struct kvm_vcpu *vcpu, | 1558 | int emulator_read_std(unsigned long addr, |
1048 | gpa_t addr) | 1559 | void *val, |
1560 | unsigned int bytes, | ||
1561 | struct kvm_vcpu *vcpu) | ||
1049 | { | 1562 | { |
1050 | return kvm_io_bus_find_dev(&vcpu->kvm->pio_bus, addr); | 1563 | void *data = val; |
1564 | int r = X86EMUL_CONTINUE; | ||
1565 | |||
1566 | down_read(&current->mm->mmap_sem); | ||
1567 | while (bytes) { | ||
1568 | gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); | ||
1569 | unsigned offset = addr & (PAGE_SIZE-1); | ||
1570 | unsigned tocopy = min(bytes, (unsigned)PAGE_SIZE - offset); | ||
1571 | int ret; | ||
1572 | |||
1573 | if (gpa == UNMAPPED_GVA) { | ||
1574 | r = X86EMUL_PROPAGATE_FAULT; | ||
1575 | goto out; | ||
1576 | } | ||
1577 | ret = kvm_read_guest(vcpu->kvm, gpa, data, tocopy); | ||
1578 | if (ret < 0) { | ||
1579 | r = X86EMUL_UNHANDLEABLE; | ||
1580 | goto out; | ||
1581 | } | ||
1582 | |||
1583 | bytes -= tocopy; | ||
1584 | data += tocopy; | ||
1585 | addr += tocopy; | ||
1586 | } | ||
1587 | out: | ||
1588 | up_read(&current->mm->mmap_sem); | ||
1589 | return r; | ||
1051 | } | 1590 | } |
1591 | EXPORT_SYMBOL_GPL(emulator_read_std); | ||
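
emulator_read_std() copies at most one page per loop iteration because consecutive guest-virtual pages may translate to discontiguous guest-physical pages; tocopy is clamped to the distance from addr to the next page boundary. For example, a 16-byte read starting at page offset 0xffa splits into a 6-byte chunk and a 10-byte chunk. A standalone sketch of just that arithmetic:

    #include <stdio.h>

    #define PAGE_SIZE 4096u /* assumed power of two */

    int main(void)
    {
            unsigned long addr = 0x1000ffa;
            unsigned bytes = 16;

            while (bytes) {
                    unsigned offset = addr & (PAGE_SIZE - 1);
                    unsigned room = PAGE_SIZE - offset;
                    unsigned tocopy = bytes < room ? bytes : room;

                    /* prints: 6 bytes at 0x1000ffa, then 10 at 0x1001000 */
                    printf("copy %u bytes at %#lx\n", tocopy, addr);
                    bytes -= tocopy;
                    addr += tocopy;
            }
            return 0;
    }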
1052 | 1592 | ||
1053 | static int emulator_read_emulated(unsigned long addr, | 1593 | static int emulator_read_emulated(unsigned long addr, |
1054 | void *val, | 1594 | void *val, |
@@ -1062,22 +1602,34 @@ static int emulator_read_emulated(unsigned long addr, | |||
1062 | memcpy(val, vcpu->mmio_data, bytes); | 1602 | memcpy(val, vcpu->mmio_data, bytes); |
1063 | vcpu->mmio_read_completed = 0; | 1603 | vcpu->mmio_read_completed = 0; |
1064 | return X86EMUL_CONTINUE; | 1604 | return X86EMUL_CONTINUE; |
1065 | } else if (emulator_read_std(addr, val, bytes, vcpu) | 1605 | } |
1066 | == X86EMUL_CONTINUE) | 1606 | |
1067 | return X86EMUL_CONTINUE; | 1607 | down_read(&current->mm->mmap_sem);
1608 | gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); | ||
1609 | up_read(&current->mm->mmap_sem); | ||
1610 | |||
1611 | /* For APIC access vmexit */ | ||
1612 | if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) | ||
1613 | goto mmio; | ||
1068 | 1614 | ||
1069 | gpa = vcpu->mmu.gva_to_gpa(vcpu, addr); | 1615 | if (emulator_read_std(addr, val, bytes, vcpu) |
1616 | == X86EMUL_CONTINUE) | ||
1617 | return X86EMUL_CONTINUE; | ||
1070 | if (gpa == UNMAPPED_GVA) | 1618 | if (gpa == UNMAPPED_GVA) |
1071 | return X86EMUL_PROPAGATE_FAULT; | 1619 | return X86EMUL_PROPAGATE_FAULT; |
1072 | 1620 | ||
1621 | mmio: | ||
1073 | /* | 1622 | /* |
1074 | * Is this MMIO handled locally? | 1623 | * Is this MMIO handled locally? |
1075 | */ | 1624 | */ |
1625 | mutex_lock(&vcpu->kvm->lock); | ||
1076 | mmio_dev = vcpu_find_mmio_dev(vcpu, gpa); | 1626 | mmio_dev = vcpu_find_mmio_dev(vcpu, gpa); |
1077 | if (mmio_dev) { | 1627 | if (mmio_dev) { |
1078 | kvm_iodevice_read(mmio_dev, gpa, bytes, val); | 1628 | kvm_iodevice_read(mmio_dev, gpa, bytes, val); |
1629 | mutex_unlock(&vcpu->kvm->lock); | ||
1079 | return X86EMUL_CONTINUE; | 1630 | return X86EMUL_CONTINUE; |
1080 | } | 1631 | } |
1632 | mutex_unlock(&vcpu->kvm->lock); | ||
1081 | 1633 | ||
1082 | vcpu->mmio_needed = 1; | 1634 | vcpu->mmio_needed = 1; |
1083 | vcpu->mmio_phys_addr = gpa; | 1635 | vcpu->mmio_phys_addr = gpa; |
@@ -1090,19 +1642,16 @@ static int emulator_read_emulated(unsigned long addr, | |||
1090 | static int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, | 1642 | static int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, |
1091 | const void *val, int bytes) | 1643 | const void *val, int bytes) |
1092 | { | 1644 | { |
1093 | struct page *page; | 1645 | int ret; |
1094 | void *virt; | ||
1095 | 1646 | ||
1096 | if (((gpa + bytes - 1) >> PAGE_SHIFT) != (gpa >> PAGE_SHIFT)) | 1647 | down_read(&current->mm->mmap_sem);
1648 | ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes); | ||
1649 | if (ret < 0) { | ||
1650 | up_read(&current->mm->mmap_sem); | ||
1097 | return 0; | 1651 | return 0; |
1098 | page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); | 1652 | } |
1099 | if (!page) | ||
1100 | return 0; | ||
1101 | mark_page_dirty(vcpu->kvm, gpa >> PAGE_SHIFT); | ||
1102 | virt = kmap_atomic(page, KM_USER0); | ||
1103 | kvm_mmu_pte_write(vcpu, gpa, val, bytes); | 1653 | kvm_mmu_pte_write(vcpu, gpa, val, bytes); |
1104 | memcpy(virt + offset_in_page(gpa), val, bytes); | 1654 | up_read(&current->mm->mmap_sem);
1105 | kunmap_atomic(virt, KM_USER0); | ||
1106 | return 1; | 1655 | return 1; |
1107 | } | 1656 | } |
1108 | 1657 | ||
@@ -1112,24 +1661,36 @@ static int emulator_write_emulated_onepage(unsigned long addr, | |||
1112 | struct kvm_vcpu *vcpu) | 1661 | struct kvm_vcpu *vcpu) |
1113 | { | 1662 | { |
1114 | struct kvm_io_device *mmio_dev; | 1663 | struct kvm_io_device *mmio_dev; |
1115 | gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr); | 1664 | gpa_t gpa; |
1665 | |||
1666 | down_read(&current->mm->mmap_sem); | ||
1667 | gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); | ||
1668 | up_read(&current->mm->mmap_sem); | ||
1116 | 1669 | ||
1117 | if (gpa == UNMAPPED_GVA) { | 1670 | if (gpa == UNMAPPED_GVA) { |
1118 | kvm_x86_ops->inject_page_fault(vcpu, addr, 2); | 1671 | kvm_inject_page_fault(vcpu, addr, 2); |
1119 | return X86EMUL_PROPAGATE_FAULT; | 1672 | return X86EMUL_PROPAGATE_FAULT; |
1120 | } | 1673 | } |
1121 | 1674 | ||
1675 | /* For APIC access vmexit */ | ||
1676 | if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) | ||
1677 | goto mmio; | ||
1678 | |||
1122 | if (emulator_write_phys(vcpu, gpa, val, bytes)) | 1679 | if (emulator_write_phys(vcpu, gpa, val, bytes)) |
1123 | return X86EMUL_CONTINUE; | 1680 | return X86EMUL_CONTINUE; |
1124 | 1681 | ||
1682 | mmio: | ||
1125 | /* | 1683 | /* |
1126 | * Is this MMIO handled locally? | 1684 | * Is this MMIO handled locally? |
1127 | */ | 1685 | */ |
1686 | mutex_lock(&vcpu->kvm->lock); | ||
1128 | mmio_dev = vcpu_find_mmio_dev(vcpu, gpa); | 1687 | mmio_dev = vcpu_find_mmio_dev(vcpu, gpa); |
1129 | if (mmio_dev) { | 1688 | if (mmio_dev) { |
1130 | kvm_iodevice_write(mmio_dev, gpa, bytes, val); | 1689 | kvm_iodevice_write(mmio_dev, gpa, bytes, val); |
1690 | mutex_unlock(&vcpu->kvm->lock); | ||
1131 | return X86EMUL_CONTINUE; | 1691 | return X86EMUL_CONTINUE; |
1132 | } | 1692 | } |
1693 | mutex_unlock(&vcpu->kvm->lock); | ||
1133 | 1694 | ||
1134 | vcpu->mmio_needed = 1; | 1695 | vcpu->mmio_needed = 1; |
1135 | vcpu->mmio_phys_addr = gpa; | 1696 | vcpu->mmio_phys_addr = gpa; |
@@ -1173,6 +1734,35 @@ static int emulator_cmpxchg_emulated(unsigned long addr, | |||
1173 | reported = 1; | 1734 | reported = 1; |
1174 | printk(KERN_WARNING "kvm: emulating exchange as write\n"); | 1735 | printk(KERN_WARNING "kvm: emulating exchange as write\n"); |
1175 | } | 1736 | } |
1737 | #ifndef CONFIG_X86_64 | ||
1738 | /* a guest's cmpxchg8b has to be emulated atomically */ | ||
1739 | if (bytes == 8) { | ||
1740 | gpa_t gpa; | ||
1741 | struct page *page; | ||
1742 | char *addr; | ||
1743 | u64 val; | ||
1744 | |||
1745 | down_read(&current->mm->mmap_sem); | ||
1746 | gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); | ||
1747 | |||
1748 | if (gpa == UNMAPPED_GVA || | ||
1749 | (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) | ||
1750 | goto emul_write; | ||
1751 | |||
1752 | if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK)) | ||
1753 | goto emul_write; | ||
1754 | |||
1755 | val = *(u64 *)new; | ||
1756 | page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); | ||
1757 | addr = kmap_atomic(page, KM_USER0); | ||
1758 | set_64bit((u64 *)(addr + offset_in_page(gpa)), val); | ||
1759 | kunmap_atomic(addr, KM_USER0); | ||
1760 | kvm_release_page_dirty(page); | ||
1761 | emul_write: | ||
1762 | up_read(&current->mm->mmap_sem); | ||
1763 | } | ||
1764 | #endif | ||
1765 | |||
1176 | return emulator_write_emulated(addr, new, bytes, vcpu); | 1766 | return emulator_write_emulated(addr, new, bytes, vcpu); |
1177 | } | 1767 | } |
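
The CONFIG_X86_64-guarded block exists because a 32-bit host would otherwise emulate a guest cmpxchg8b as two 32-bit stores, letting another VCPU observe a torn value; set_64bit() performs the whole eight-byte store atomically. Outside the kernel the same guarantee can be sketched with compiler atomics (illustrative only, not the kernel's implementation):

    #include <stdint.h>

    /* Publish a 64-bit value in a single atomic store so that no
     * concurrent reader can see half of the old and half of the new. */
    static void store_u64_atomic(uint64_t *p, uint64_t val)
    {
            __atomic_store_n(p, val, __ATOMIC_SEQ_CST);
    }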
1178 | 1768 | ||
@@ -1188,11 +1778,11 @@ int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address) | |||
1188 | 1778 | ||
1189 | int emulate_clts(struct kvm_vcpu *vcpu) | 1779 | int emulate_clts(struct kvm_vcpu *vcpu) |
1190 | { | 1780 | { |
1191 | kvm_x86_ops->set_cr0(vcpu, vcpu->cr0 & ~X86_CR0_TS); | 1781 | kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 & ~X86_CR0_TS); |
1192 | return X86EMUL_CONTINUE; | 1782 | return X86EMUL_CONTINUE; |
1193 | } | 1783 | } |
1194 | 1784 | ||
1195 | int emulator_get_dr(struct x86_emulate_ctxt* ctxt, int dr, unsigned long *dest) | 1785 | int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest) |
1196 | { | 1786 | { |
1197 | struct kvm_vcpu *vcpu = ctxt->vcpu; | 1787 | struct kvm_vcpu *vcpu = ctxt->vcpu; |
1198 | 1788 | ||
@@ -1223,7 +1813,7 @@ void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context) | |||
1223 | { | 1813 | { |
1224 | static int reported; | 1814 | static int reported; |
1225 | u8 opcodes[4]; | 1815 | u8 opcodes[4]; |
1226 | unsigned long rip = vcpu->rip; | 1816 | unsigned long rip = vcpu->arch.rip; |
1227 | unsigned long rip_linear; | 1817 | unsigned long rip_linear; |
1228 | 1818 | ||
1229 | rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS); | 1819 | rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS); |
@@ -1241,7 +1831,6 @@ EXPORT_SYMBOL_GPL(kvm_report_emulation_failure); | |||
1241 | 1831 | ||
1242 | struct x86_emulate_ops emulate_ops = { | 1832 | struct x86_emulate_ops emulate_ops = { |
1243 | .read_std = emulator_read_std, | 1833 | .read_std = emulator_read_std, |
1244 | .write_std = emulator_write_std, | ||
1245 | .read_emulated = emulator_read_emulated, | 1834 | .read_emulated = emulator_read_emulated, |
1246 | .write_emulated = emulator_write_emulated, | 1835 | .write_emulated = emulator_write_emulated, |
1247 | .cmpxchg_emulated = emulator_cmpxchg_emulated, | 1836 | .cmpxchg_emulated = emulator_cmpxchg_emulated, |
@@ -1250,44 +1839,74 @@ struct x86_emulate_ops emulate_ops = { | |||
1250 | int emulate_instruction(struct kvm_vcpu *vcpu, | 1839 | int emulate_instruction(struct kvm_vcpu *vcpu, |
1251 | struct kvm_run *run, | 1840 | struct kvm_run *run, |
1252 | unsigned long cr2, | 1841 | unsigned long cr2, |
1253 | u16 error_code) | 1842 | u16 error_code, |
1843 | int emulation_type) | ||
1254 | { | 1844 | { |
1255 | struct x86_emulate_ctxt emulate_ctxt; | ||
1256 | int r; | 1845 | int r; |
1257 | int cs_db, cs_l; | 1846 | struct decode_cache *c; |
1258 | 1847 | ||
1259 | vcpu->mmio_fault_cr2 = cr2; | 1848 | vcpu->arch.mmio_fault_cr2 = cr2; |
1260 | kvm_x86_ops->cache_regs(vcpu); | 1849 | kvm_x86_ops->cache_regs(vcpu); |
1261 | 1850 | ||
1262 | kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); | 1851 | vcpu->mmio_is_write = 0; |
1263 | 1852 | vcpu->arch.pio.string = 0; | |
1264 | emulate_ctxt.vcpu = vcpu; | 1853 | |
1265 | emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu); | 1854 | if (!(emulation_type & EMULTYPE_NO_DECODE)) { |
1266 | emulate_ctxt.cr2 = cr2; | 1855 | int cs_db, cs_l; |
1267 | emulate_ctxt.mode = (emulate_ctxt.eflags & X86_EFLAGS_VM) | 1856 | kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); |
1268 | ? X86EMUL_MODE_REAL : cs_l | 1857 | |
1269 | ? X86EMUL_MODE_PROT64 : cs_db | 1858 | vcpu->arch.emulate_ctxt.vcpu = vcpu; |
1270 | ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; | 1859 | vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu); |
1271 | 1860 | vcpu->arch.emulate_ctxt.mode = | |
1272 | if (emulate_ctxt.mode == X86EMUL_MODE_PROT64) { | 1861 | (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM) |
1273 | emulate_ctxt.cs_base = 0; | 1862 | ? X86EMUL_MODE_REAL : cs_l |
1274 | emulate_ctxt.ds_base = 0; | 1863 | ? X86EMUL_MODE_PROT64 : cs_db |
1275 | emulate_ctxt.es_base = 0; | 1864 | ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; |
1276 | emulate_ctxt.ss_base = 0; | 1865 | |
1277 | } else { | 1866 | if (vcpu->arch.emulate_ctxt.mode == X86EMUL_MODE_PROT64) { |
1278 | emulate_ctxt.cs_base = get_segment_base(vcpu, VCPU_SREG_CS); | 1867 | vcpu->arch.emulate_ctxt.cs_base = 0; |
1279 | emulate_ctxt.ds_base = get_segment_base(vcpu, VCPU_SREG_DS); | 1868 | vcpu->arch.emulate_ctxt.ds_base = 0; |
1280 | emulate_ctxt.es_base = get_segment_base(vcpu, VCPU_SREG_ES); | 1869 | vcpu->arch.emulate_ctxt.es_base = 0; |
1281 | emulate_ctxt.ss_base = get_segment_base(vcpu, VCPU_SREG_SS); | 1870 | vcpu->arch.emulate_ctxt.ss_base = 0; |
1871 | } else { | ||
1872 | vcpu->arch.emulate_ctxt.cs_base = | ||
1873 | get_segment_base(vcpu, VCPU_SREG_CS); | ||
1874 | vcpu->arch.emulate_ctxt.ds_base = | ||
1875 | get_segment_base(vcpu, VCPU_SREG_DS); | ||
1876 | vcpu->arch.emulate_ctxt.es_base = | ||
1877 | get_segment_base(vcpu, VCPU_SREG_ES); | ||
1878 | vcpu->arch.emulate_ctxt.ss_base = | ||
1879 | get_segment_base(vcpu, VCPU_SREG_SS); | ||
1880 | } | ||
1881 | |||
1882 | vcpu->arch.emulate_ctxt.gs_base = | ||
1883 | get_segment_base(vcpu, VCPU_SREG_GS); | ||
1884 | vcpu->arch.emulate_ctxt.fs_base = | ||
1885 | get_segment_base(vcpu, VCPU_SREG_FS); | ||
1886 | |||
1887 | r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); | ||
1888 | |||
1889 | /* Reject instructions other than VMCALL/VMMCALL when | ||
1890 | * trying to emulate an invalid opcode */ | ||
1891 | c = &vcpu->arch.emulate_ctxt.decode; | ||
1892 | if ((emulation_type & EMULTYPE_TRAP_UD) && | ||
1893 | (!(c->twobyte && c->b == 0x01 && | ||
1894 | (c->modrm_reg == 0 || c->modrm_reg == 3) && | ||
1895 | c->modrm_mod == 3 && c->modrm_rm == 1))) | ||
1896 | return EMULATE_FAIL; | ||
1897 | |||
1898 | ++vcpu->stat.insn_emulation; | ||
1899 | if (r) { | ||
1900 | ++vcpu->stat.insn_emulation_fail; | ||
1901 | if (kvm_mmu_unprotect_page_virt(vcpu, cr2)) | ||
1902 | return EMULATE_DONE; | ||
1903 | return EMULATE_FAIL; | ||
1904 | } | ||
1282 | } | 1905 | } |
1283 | 1906 | ||
1284 | emulate_ctxt.gs_base = get_segment_base(vcpu, VCPU_SREG_GS); | 1907 | r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); |
1285 | emulate_ctxt.fs_base = get_segment_base(vcpu, VCPU_SREG_FS); | ||
1286 | 1908 | ||
1287 | vcpu->mmio_is_write = 0; | 1909 | if (vcpu->arch.pio.string) |
1288 | vcpu->pio.string = 0; | ||
1289 | r = x86_emulate_memop(&emulate_ctxt, &emulate_ops); | ||
1290 | if (vcpu->pio.string) | ||
1291 | return EMULATE_DO_MMIO; | 1910 | return EMULATE_DO_MMIO; |
1292 | 1911 | ||
1293 | if ((r || vcpu->mmio_is_write) && run) { | 1912 | if ((r || vcpu->mmio_is_write) && run) { |
@@ -1309,7 +1928,7 @@ int emulate_instruction(struct kvm_vcpu *vcpu, | |||
1309 | } | 1928 | } |
1310 | 1929 | ||
1311 | kvm_x86_ops->decache_regs(vcpu); | 1930 | kvm_x86_ops->decache_regs(vcpu); |
1312 | kvm_x86_ops->set_rflags(vcpu, emulate_ctxt.eflags); | 1931 | kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); |
1313 | 1932 | ||
1314 | if (vcpu->mmio_is_write) { | 1933 | if (vcpu->mmio_is_write) { |
1315 | vcpu->mmio_needed = 0; | 1934 | vcpu->mmio_needed = 0; |
@@ -1320,439 +1939,45 @@ int emulate_instruction(struct kvm_vcpu *vcpu, | |||
1320 | } | 1939 | } |
1321 | EXPORT_SYMBOL_GPL(emulate_instruction); | 1940 | EXPORT_SYMBOL_GPL(emulate_instruction); |
1322 | 1941 | ||
1323 | /* | 1942 | static void free_pio_guest_pages(struct kvm_vcpu *vcpu) |
1324 | * The vCPU has executed a HLT instruction with in-kernel mode enabled. | ||
1325 | */ | ||
1326 | static void kvm_vcpu_block(struct kvm_vcpu *vcpu) | ||
1327 | { | ||
1328 | DECLARE_WAITQUEUE(wait, current); | ||
1329 | |||
1330 | add_wait_queue(&vcpu->wq, &wait); | ||
1331 | |||
1332 | /* | ||
1333 | * We will block until either an interrupt or a signal wakes us up | ||
1334 | */ | ||
1335 | while (!kvm_cpu_has_interrupt(vcpu) | ||
1336 | && !signal_pending(current) | ||
1337 | && vcpu->mp_state != VCPU_MP_STATE_RUNNABLE | ||
1338 | && vcpu->mp_state != VCPU_MP_STATE_SIPI_RECEIVED) { | ||
1339 | set_current_state(TASK_INTERRUPTIBLE); | ||
1340 | vcpu_put(vcpu); | ||
1341 | schedule(); | ||
1342 | vcpu_load(vcpu); | ||
1343 | } | ||
1344 | |||
1345 | __set_current_state(TASK_RUNNING); | ||
1346 | remove_wait_queue(&vcpu->wq, &wait); | ||
1347 | } | ||
1348 | |||
1349 | int kvm_emulate_halt(struct kvm_vcpu *vcpu) | ||
1350 | { | ||
1351 | ++vcpu->stat.halt_exits; | ||
1352 | if (irqchip_in_kernel(vcpu->kvm)) { | ||
1353 | vcpu->mp_state = VCPU_MP_STATE_HALTED; | ||
1354 | kvm_vcpu_block(vcpu); | ||
1355 | if (vcpu->mp_state != VCPU_MP_STATE_RUNNABLE) | ||
1356 | return -EINTR; | ||
1357 | return 1; | ||
1358 | } else { | ||
1359 | vcpu->run->exit_reason = KVM_EXIT_HLT; | ||
1360 | return 0; | ||
1361 | } | ||
1362 | } | ||
1363 | EXPORT_SYMBOL_GPL(kvm_emulate_halt); | ||
1364 | |||
1365 | int kvm_hypercall(struct kvm_vcpu *vcpu, struct kvm_run *run) | ||
1366 | { | ||
1367 | unsigned long nr, a0, a1, a2, a3, a4, a5, ret; | ||
1368 | |||
1369 | kvm_x86_ops->cache_regs(vcpu); | ||
1370 | ret = -KVM_EINVAL; | ||
1371 | #ifdef CONFIG_X86_64 | ||
1372 | if (is_long_mode(vcpu)) { | ||
1373 | nr = vcpu->regs[VCPU_REGS_RAX]; | ||
1374 | a0 = vcpu->regs[VCPU_REGS_RDI]; | ||
1375 | a1 = vcpu->regs[VCPU_REGS_RSI]; | ||
1376 | a2 = vcpu->regs[VCPU_REGS_RDX]; | ||
1377 | a3 = vcpu->regs[VCPU_REGS_RCX]; | ||
1378 | a4 = vcpu->regs[VCPU_REGS_R8]; | ||
1379 | a5 = vcpu->regs[VCPU_REGS_R9]; | ||
1380 | } else | ||
1381 | #endif | ||
1382 | { | ||
1383 | nr = vcpu->regs[VCPU_REGS_RBX] & -1u; | ||
1384 | a0 = vcpu->regs[VCPU_REGS_RAX] & -1u; | ||
1385 | a1 = vcpu->regs[VCPU_REGS_RCX] & -1u; | ||
1386 | a2 = vcpu->regs[VCPU_REGS_RDX] & -1u; | ||
1387 | a3 = vcpu->regs[VCPU_REGS_RSI] & -1u; | ||
1388 | a4 = vcpu->regs[VCPU_REGS_RDI] & -1u; | ||
1389 | a5 = vcpu->regs[VCPU_REGS_RBP] & -1u; | ||
1390 | } | ||
1391 | switch (nr) { | ||
1392 | default: | ||
1393 | run->hypercall.nr = nr; | ||
1394 | run->hypercall.args[0] = a0; | ||
1395 | run->hypercall.args[1] = a1; | ||
1396 | run->hypercall.args[2] = a2; | ||
1397 | run->hypercall.args[3] = a3; | ||
1398 | run->hypercall.args[4] = a4; | ||
1399 | run->hypercall.args[5] = a5; | ||
1400 | run->hypercall.ret = ret; | ||
1401 | run->hypercall.longmode = is_long_mode(vcpu); | ||
1402 | kvm_x86_ops->decache_regs(vcpu); | ||
1403 | return 0; | ||
1404 | } | ||
1405 | vcpu->regs[VCPU_REGS_RAX] = ret; | ||
1406 | kvm_x86_ops->decache_regs(vcpu); | ||
1407 | return 1; | ||
1408 | } | ||
1409 | EXPORT_SYMBOL_GPL(kvm_hypercall); | ||
1410 | |||
1411 | static u64 mk_cr_64(u64 curr_cr, u32 new_val) | ||
1412 | { | ||
1413 | return (curr_cr & ~((1ULL << 32) - 1)) | new_val; | ||
1414 | } | ||
1415 | |||
1416 | void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base) | ||
1417 | { | ||
1418 | struct descriptor_table dt = { limit, base }; | ||
1419 | |||
1420 | kvm_x86_ops->set_gdt(vcpu, &dt); | ||
1421 | } | ||
1422 | |||
1423 | void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base) | ||
1424 | { | ||
1425 | struct descriptor_table dt = { limit, base }; | ||
1426 | |||
1427 | kvm_x86_ops->set_idt(vcpu, &dt); | ||
1428 | } | ||
1429 | |||
1430 | void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw, | ||
1431 | unsigned long *rflags) | ||
1432 | { | ||
1433 | lmsw(vcpu, msw); | ||
1434 | *rflags = kvm_x86_ops->get_rflags(vcpu); | ||
1435 | } | ||
1436 | |||
1437 | unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr) | ||
1438 | { | ||
1439 | kvm_x86_ops->decache_cr4_guest_bits(vcpu); | ||
1440 | switch (cr) { | ||
1441 | case 0: | ||
1442 | return vcpu->cr0; | ||
1443 | case 2: | ||
1444 | return vcpu->cr2; | ||
1445 | case 3: | ||
1446 | return vcpu->cr3; | ||
1447 | case 4: | ||
1448 | return vcpu->cr4; | ||
1449 | default: | ||
1450 | vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr); | ||
1451 | return 0; | ||
1452 | } | ||
1453 | } | ||
1454 | |||
1455 | void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val, | ||
1456 | unsigned long *rflags) | ||
1457 | { | ||
1458 | switch (cr) { | ||
1459 | case 0: | ||
1460 | set_cr0(vcpu, mk_cr_64(vcpu->cr0, val)); | ||
1461 | *rflags = kvm_x86_ops->get_rflags(vcpu); | ||
1462 | break; | ||
1463 | case 2: | ||
1464 | vcpu->cr2 = val; | ||
1465 | break; | ||
1466 | case 3: | ||
1467 | set_cr3(vcpu, val); | ||
1468 | break; | ||
1469 | case 4: | ||
1470 | set_cr4(vcpu, mk_cr_64(vcpu->cr4, val)); | ||
1471 | break; | ||
1472 | default: | ||
1473 | vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr); | ||
1474 | } | ||
1475 | } | ||
1476 | |||
1477 | /* | ||
1478 | * Register the para guest with the host: | ||
1479 | */ | ||
1480 | static int vcpu_register_para(struct kvm_vcpu *vcpu, gpa_t para_state_gpa) | ||
1481 | { | ||
1482 | struct kvm_vcpu_para_state *para_state; | ||
1483 | hpa_t para_state_hpa, hypercall_hpa; | ||
1484 | struct page *para_state_page; | ||
1485 | unsigned char *hypercall; | ||
1486 | gpa_t hypercall_gpa; | ||
1487 | |||
1488 | printk(KERN_DEBUG "kvm: guest trying to enter paravirtual mode\n"); | ||
1489 | printk(KERN_DEBUG ".... para_state_gpa: %08Lx\n", para_state_gpa); | ||
1490 | |||
1491 | /* | ||
1492 | * Needs to be page aligned: | ||
1493 | */ | ||
1494 | if (para_state_gpa != PAGE_ALIGN(para_state_gpa)) | ||
1495 | goto err_gp; | ||
1496 | |||
1497 | para_state_hpa = gpa_to_hpa(vcpu, para_state_gpa); | ||
1498 | printk(KERN_DEBUG ".... para_state_hpa: %08Lx\n", para_state_hpa); | ||
1499 | if (is_error_hpa(para_state_hpa)) | ||
1500 | goto err_gp; | ||
1501 | |||
1502 | mark_page_dirty(vcpu->kvm, para_state_gpa >> PAGE_SHIFT); | ||
1503 | para_state_page = pfn_to_page(para_state_hpa >> PAGE_SHIFT); | ||
1504 | para_state = kmap(para_state_page); | ||
1505 | |||
1506 | printk(KERN_DEBUG ".... guest version: %d\n", para_state->guest_version); | ||
1507 | printk(KERN_DEBUG ".... size: %d\n", para_state->size); | ||
1508 | |||
1509 | para_state->host_version = KVM_PARA_API_VERSION; | ||
1510 | /* | ||
1511 | * We cannot support guests that try to register themselves | ||
1512 | * with a newer API version than the host supports: | ||
1513 | */ | ||
1514 | if (para_state->guest_version > KVM_PARA_API_VERSION) { | ||
1515 | para_state->ret = -KVM_EINVAL; | ||
1516 | goto err_kunmap_skip; | ||
1517 | } | ||
1518 | |||
1519 | hypercall_gpa = para_state->hypercall_gpa; | ||
1520 | hypercall_hpa = gpa_to_hpa(vcpu, hypercall_gpa); | ||
1521 | printk(KERN_DEBUG ".... hypercall_hpa: %08Lx\n", hypercall_hpa); | ||
1522 | if (is_error_hpa(hypercall_hpa)) { | ||
1523 | para_state->ret = -KVM_EINVAL; | ||
1524 | goto err_kunmap_skip; | ||
1525 | } | ||
1526 | |||
1527 | printk(KERN_DEBUG "kvm: para guest successfully registered.\n"); | ||
1528 | vcpu->para_state_page = para_state_page; | ||
1529 | vcpu->para_state_gpa = para_state_gpa; | ||
1530 | vcpu->hypercall_gpa = hypercall_gpa; | ||
1531 | |||
1532 | mark_page_dirty(vcpu->kvm, hypercall_gpa >> PAGE_SHIFT); | ||
1533 | hypercall = kmap_atomic(pfn_to_page(hypercall_hpa >> PAGE_SHIFT), | ||
1534 | KM_USER1) + (hypercall_hpa & ~PAGE_MASK); | ||
1535 | kvm_x86_ops->patch_hypercall(vcpu, hypercall); | ||
1536 | kunmap_atomic(hypercall, KM_USER1); | ||
1537 | |||
1538 | para_state->ret = 0; | ||
1539 | err_kunmap_skip: | ||
1540 | kunmap(para_state_page); | ||
1541 | return 0; | ||
1542 | err_gp: | ||
1543 | return 1; | ||
1544 | } | ||
1545 | |||
1546 | int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) | ||
1547 | { | ||
1548 | u64 data; | ||
1549 | |||
1550 | switch (msr) { | ||
1551 | case 0xc0010010: /* SYSCFG */ | ||
1552 | case 0xc0010015: /* HWCR */ | ||
1553 | case MSR_IA32_PLATFORM_ID: | ||
1554 | case MSR_IA32_P5_MC_ADDR: | ||
1555 | case MSR_IA32_P5_MC_TYPE: | ||
1556 | case MSR_IA32_MC0_CTL: | ||
1557 | case MSR_IA32_MCG_STATUS: | ||
1558 | case MSR_IA32_MCG_CAP: | ||
1559 | case MSR_IA32_MC0_MISC: | ||
1560 | case MSR_IA32_MC0_MISC+4: | ||
1561 | case MSR_IA32_MC0_MISC+8: | ||
1562 | case MSR_IA32_MC0_MISC+12: | ||
1563 | case MSR_IA32_MC0_MISC+16: | ||
1564 | case MSR_IA32_UCODE_REV: | ||
1565 | case MSR_IA32_PERF_STATUS: | ||
1566 | case MSR_IA32_EBL_CR_POWERON: | ||
1567 | /* MTRR registers */ | ||
1568 | case 0xfe: | ||
1569 | case 0x200 ... 0x2ff: | ||
1570 | data = 0; | ||
1571 | break; | ||
1572 | case 0xcd: /* fsb frequency */ | ||
1573 | data = 3; | ||
1574 | break; | ||
1575 | case MSR_IA32_APICBASE: | ||
1576 | data = kvm_get_apic_base(vcpu); | ||
1577 | break; | ||
1578 | case MSR_IA32_MISC_ENABLE: | ||
1579 | data = vcpu->ia32_misc_enable_msr; | ||
1580 | break; | ||
1581 | #ifdef CONFIG_X86_64 | ||
1582 | case MSR_EFER: | ||
1583 | data = vcpu->shadow_efer; | ||
1584 | break; | ||
1585 | #endif | ||
1586 | default: | ||
1587 | pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr); | ||
1588 | return 1; | ||
1589 | } | ||
1590 | *pdata = data; | ||
1591 | return 0; | ||
1592 | } | ||
1593 | EXPORT_SYMBOL_GPL(kvm_get_msr_common); | ||
1594 | |||
1595 | /* | ||
1596 | * Reads an msr value (of 'msr_index') into 'pdata'. | ||
1597 | * Returns 0 on success, non-0 otherwise. | ||
1598 | * Assumes vcpu_load() was already called. | ||
1599 | */ | ||
1600 | int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) | ||
1601 | { | ||
1602 | return kvm_x86_ops->get_msr(vcpu, msr_index, pdata); | ||
1603 | } | ||
1604 | |||
1605 | #ifdef CONFIG_X86_64 | ||
1606 | |||
1607 | static void set_efer(struct kvm_vcpu *vcpu, u64 efer) | ||
1608 | { | ||
1609 | if (efer & EFER_RESERVED_BITS) { | ||
1610 | printk(KERN_DEBUG "set_efer: 0x%llx #GP, reserved bits\n", | ||
1611 | efer); | ||
1612 | inject_gp(vcpu); | ||
1613 | return; | ||
1614 | } | ||
1615 | |||
1616 | if (is_paging(vcpu) | ||
1617 | && (vcpu->shadow_efer & EFER_LME) != (efer & EFER_LME)) { | ||
1618 | printk(KERN_DEBUG "set_efer: #GP, change LME while paging\n"); | ||
1619 | inject_gp(vcpu); | ||
1620 | return; | ||
1621 | } | ||
1622 | |||
1623 | kvm_x86_ops->set_efer(vcpu, efer); | ||
1624 | |||
1625 | efer &= ~EFER_LMA; | ||
1626 | efer |= vcpu->shadow_efer & EFER_LMA; | ||
1627 | |||
1628 | vcpu->shadow_efer = efer; | ||
1629 | } | ||
1630 | |||
1631 | #endif | ||
1632 | |||
1633 | int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) | ||
1634 | { | ||
1635 | switch (msr) { | ||
1636 | #ifdef CONFIG_X86_64 | ||
1637 | case MSR_EFER: | ||
1638 | set_efer(vcpu, data); | ||
1639 | break; | ||
1640 | #endif | ||
1641 | case MSR_IA32_MC0_STATUS: | ||
1642 | pr_unimpl(vcpu, "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n", | ||
1643 | __FUNCTION__, data); | ||
1644 | break; | ||
1645 | case MSR_IA32_MCG_STATUS: | ||
1646 | pr_unimpl(vcpu, "%s: MSR_IA32_MCG_STATUS 0x%llx, nop\n", | ||
1647 | __FUNCTION__, data); | ||
1648 | break; | ||
1649 | case MSR_IA32_UCODE_REV: | ||
1650 | case MSR_IA32_UCODE_WRITE: | ||
1651 | case 0x200 ... 0x2ff: /* MTRRs */ | ||
1652 | break; | ||
1653 | case MSR_IA32_APICBASE: | ||
1654 | kvm_set_apic_base(vcpu, data); | ||
1655 | break; | ||
1656 | case MSR_IA32_MISC_ENABLE: | ||
1657 | vcpu->ia32_misc_enable_msr = data; | ||
1658 | break; | ||
1659 | /* | ||
1660 | * This is the 'probe whether the host is KVM' logic: | ||
1661 | */ | ||
1662 | case MSR_KVM_API_MAGIC: | ||
1663 | return vcpu_register_para(vcpu, data); | ||
1664 | |||
1665 | default: | ||
1666 | pr_unimpl(vcpu, "unhandled wrmsr: 0x%x\n", msr); | ||
1667 | return 1; | ||
1668 | } | ||
1669 | return 0; | ||
1670 | } | ||
1671 | EXPORT_SYMBOL_GPL(kvm_set_msr_common); | ||
1672 | |||
1673 | /* | ||
1674 | * Writes msr value into the appropriate "register". | ||
1675 | * Returns 0 on success, non-0 otherwise. | ||
1676 | * Assumes vcpu_load() was already called. | ||
1677 | */ | ||
1678 | int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) | ||
1679 | { | ||
1680 | return kvm_x86_ops->set_msr(vcpu, msr_index, data); | ||
1681 | } | ||
1682 | |||
1683 | void kvm_resched(struct kvm_vcpu *vcpu) | ||
1684 | { | ||
1685 | if (!need_resched()) | ||
1686 | return; | ||
1687 | cond_resched(); | ||
1688 | } | ||
1689 | EXPORT_SYMBOL_GPL(kvm_resched); | ||
1690 | |||
1691 | void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) | ||
1692 | { | 1943 | { |
1693 | int i; | 1944 | int i; |
1694 | u32 function; | ||
1695 | struct kvm_cpuid_entry *e, *best; | ||
1696 | 1945 | ||
1697 | kvm_x86_ops->cache_regs(vcpu); | 1946 | for (i = 0; i < ARRAY_SIZE(vcpu->arch.pio.guest_pages); ++i) |
1698 | function = vcpu->regs[VCPU_REGS_RAX]; | 1947 | if (vcpu->arch.pio.guest_pages[i]) { |
1699 | vcpu->regs[VCPU_REGS_RAX] = 0; | 1948 | kvm_release_page_dirty(vcpu->arch.pio.guest_pages[i]); |
1700 | vcpu->regs[VCPU_REGS_RBX] = 0; | 1949 | vcpu->arch.pio.guest_pages[i] = NULL; |
1701 | vcpu->regs[VCPU_REGS_RCX] = 0; | ||
1702 | vcpu->regs[VCPU_REGS_RDX] = 0; | ||
1703 | best = NULL; | ||
1704 | for (i = 0; i < vcpu->cpuid_nent; ++i) { | ||
1705 | e = &vcpu->cpuid_entries[i]; | ||
1706 | if (e->function == function) { | ||
1707 | best = e; | ||
1708 | break; | ||
1709 | } | 1950 | } |
1710 | /* | ||
1711 | * Both basic or both extended? | ||
1712 | */ | ||
1713 | if (((e->function ^ function) & 0x80000000) == 0) | ||
1714 | if (!best || e->function > best->function) | ||
1715 | best = e; | ||
1716 | } | ||
1717 | if (best) { | ||
1718 | vcpu->regs[VCPU_REGS_RAX] = best->eax; | ||
1719 | vcpu->regs[VCPU_REGS_RBX] = best->ebx; | ||
1720 | vcpu->regs[VCPU_REGS_RCX] = best->ecx; | ||
1721 | vcpu->regs[VCPU_REGS_RDX] = best->edx; | ||
1722 | } | ||
1723 | kvm_x86_ops->decache_regs(vcpu); | ||
1724 | kvm_x86_ops->skip_emulated_instruction(vcpu); | ||
1725 | } | 1951 | } |
1726 | EXPORT_SYMBOL_GPL(kvm_emulate_cpuid); | ||
1727 | 1952 | ||
1728 | static int pio_copy_data(struct kvm_vcpu *vcpu) | 1953 | static int pio_copy_data(struct kvm_vcpu *vcpu) |
1729 | { | 1954 | { |
1730 | void *p = vcpu->pio_data; | 1955 | void *p = vcpu->arch.pio_data; |
1731 | void *q; | 1956 | void *q; |
1732 | unsigned bytes; | 1957 | unsigned bytes; |
1733 | int nr_pages = vcpu->pio.guest_pages[1] ? 2 : 1; | 1958 | int nr_pages = vcpu->arch.pio.guest_pages[1] ? 2 : 1; |
1734 | 1959 | ||
1735 | q = vmap(vcpu->pio.guest_pages, nr_pages, VM_READ|VM_WRITE, | 1960 | q = vmap(vcpu->arch.pio.guest_pages, nr_pages, VM_READ|VM_WRITE, |
1736 | PAGE_KERNEL); | 1961 | PAGE_KERNEL); |
1737 | if (!q) { | 1962 | if (!q) { |
1738 | free_pio_guest_pages(vcpu); | 1963 | free_pio_guest_pages(vcpu); |
1739 | return -ENOMEM; | 1964 | return -ENOMEM; |
1740 | } | 1965 | } |
1741 | q += vcpu->pio.guest_page_offset; | 1966 | q += vcpu->arch.pio.guest_page_offset; |
1742 | bytes = vcpu->pio.size * vcpu->pio.cur_count; | 1967 | bytes = vcpu->arch.pio.size * vcpu->arch.pio.cur_count; |
1743 | if (vcpu->pio.in) | 1968 | if (vcpu->arch.pio.in) |
1744 | memcpy(q, p, bytes); | 1969 | memcpy(q, p, bytes); |
1745 | else | 1970 | else |
1746 | memcpy(p, q, bytes); | 1971 | memcpy(p, q, bytes); |
1747 | q -= vcpu->pio.guest_page_offset; | 1972 | q -= vcpu->arch.pio.guest_page_offset; |
1748 | vunmap(q); | 1973 | vunmap(q); |
1749 | free_pio_guest_pages(vcpu); | 1974 | free_pio_guest_pages(vcpu); |
1750 | return 0; | 1975 | return 0; |
1751 | } | 1976 | } |
1752 | 1977 | ||
1753 | static int complete_pio(struct kvm_vcpu *vcpu) | 1978 | int complete_pio(struct kvm_vcpu *vcpu) |
1754 | { | 1979 | { |
1755 | struct kvm_pio_request *io = &vcpu->pio; | 1980 | struct kvm_pio_request *io = &vcpu->arch.pio; |
1756 | long delta; | 1981 | long delta; |
1757 | int r; | 1982 | int r; |
1758 | 1983 | ||
@@ -1760,7 +1985,7 @@ static int complete_pio(struct kvm_vcpu *vcpu) | |||
1760 | 1985 | ||
1761 | if (!io->string) { | 1986 | if (!io->string) { |
1762 | if (io->in) | 1987 | if (io->in) |
1763 | memcpy(&vcpu->regs[VCPU_REGS_RAX], vcpu->pio_data, | 1988 | memcpy(&vcpu->arch.regs[VCPU_REGS_RAX], vcpu->arch.pio_data, |
1764 | io->size); | 1989 | io->size); |
1765 | } else { | 1990 | } else { |
1766 | if (io->in) { | 1991 | if (io->in) { |
@@ -1778,15 +2003,15 @@ static int complete_pio(struct kvm_vcpu *vcpu) | |||
1778 | * The size of the register should really depend on | 2003 | * The size of the register should really depend on |
1779 | * current address size. | 2004 | * current address size. |
1780 | */ | 2005 | */ |
1781 | vcpu->regs[VCPU_REGS_RCX] -= delta; | 2006 | vcpu->arch.regs[VCPU_REGS_RCX] -= delta; |
1782 | } | 2007 | } |
1783 | if (io->down) | 2008 | if (io->down) |
1784 | delta = -delta; | 2009 | delta = -delta; |
1785 | delta *= io->size; | 2010 | delta *= io->size; |
1786 | if (io->in) | 2011 | if (io->in) |
1787 | vcpu->regs[VCPU_REGS_RDI] += delta; | 2012 | vcpu->arch.regs[VCPU_REGS_RDI] += delta; |
1788 | else | 2013 | else |
1789 | vcpu->regs[VCPU_REGS_RSI] += delta; | 2014 | vcpu->arch.regs[VCPU_REGS_RSI] += delta; |
1790 | } | 2015 | } |
1791 | 2016 | ||
1792 | kvm_x86_ops->decache_regs(vcpu); | 2017 | kvm_x86_ops->decache_regs(vcpu); |
@@ -1804,13 +2029,13 @@ static void kernel_pio(struct kvm_io_device *pio_dev, | |||
1804 | /* TODO: String I/O for in kernel device */ | 2029 | /* TODO: String I/O for in kernel device */ |
1805 | 2030 | ||
1806 | mutex_lock(&vcpu->kvm->lock); | 2031 | mutex_lock(&vcpu->kvm->lock); |
1807 | if (vcpu->pio.in) | 2032 | if (vcpu->arch.pio.in) |
1808 | kvm_iodevice_read(pio_dev, vcpu->pio.port, | 2033 | kvm_iodevice_read(pio_dev, vcpu->arch.pio.port, |
1809 | vcpu->pio.size, | 2034 | vcpu->arch.pio.size, |
1810 | pd); | 2035 | pd); |
1811 | else | 2036 | else |
1812 | kvm_iodevice_write(pio_dev, vcpu->pio.port, | 2037 | kvm_iodevice_write(pio_dev, vcpu->arch.pio.port, |
1813 | vcpu->pio.size, | 2038 | vcpu->arch.pio.size, |
1814 | pd); | 2039 | pd); |
1815 | mutex_unlock(&vcpu->kvm->lock); | 2040 | mutex_unlock(&vcpu->kvm->lock); |
1816 | } | 2041 | } |
@@ -1818,8 +2043,8 @@ static void kernel_pio(struct kvm_io_device *pio_dev, | |||
1818 | static void pio_string_write(struct kvm_io_device *pio_dev, | 2043 | static void pio_string_write(struct kvm_io_device *pio_dev, |
1819 | struct kvm_vcpu *vcpu) | 2044 | struct kvm_vcpu *vcpu) |
1820 | { | 2045 | { |
1821 | struct kvm_pio_request *io = &vcpu->pio; | 2046 | struct kvm_pio_request *io = &vcpu->arch.pio; |
1822 | void *pd = vcpu->pio_data; | 2047 | void *pd = vcpu->arch.pio_data; |
1823 | int i; | 2048 | int i; |
1824 | 2049 | ||
1825 | mutex_lock(&vcpu->kvm->lock); | 2050 | mutex_lock(&vcpu->kvm->lock); |
@@ -1832,32 +2057,38 @@ static void pio_string_write(struct kvm_io_device *pio_dev, | |||
1832 | mutex_unlock(&vcpu->kvm->lock); | 2057 | mutex_unlock(&vcpu->kvm->lock); |
1833 | } | 2058 | } |
1834 | 2059 | ||
1835 | int kvm_emulate_pio (struct kvm_vcpu *vcpu, struct kvm_run *run, int in, | 2060 | static struct kvm_io_device *vcpu_find_pio_dev(struct kvm_vcpu *vcpu, |
2061 | gpa_t addr) | ||
2062 | { | ||
2063 | return kvm_io_bus_find_dev(&vcpu->kvm->pio_bus, addr); | ||
2064 | } | ||
2065 | |||
2066 | int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, | ||
1836 | int size, unsigned port) | 2067 | int size, unsigned port) |
1837 | { | 2068 | { |
1838 | struct kvm_io_device *pio_dev; | 2069 | struct kvm_io_device *pio_dev; |
1839 | 2070 | ||
1840 | vcpu->run->exit_reason = KVM_EXIT_IO; | 2071 | vcpu->run->exit_reason = KVM_EXIT_IO; |
1841 | vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; | 2072 | vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; |
1842 | vcpu->run->io.size = vcpu->pio.size = size; | 2073 | vcpu->run->io.size = vcpu->arch.pio.size = size; |
1843 | vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE; | 2074 | vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE; |
1844 | vcpu->run->io.count = vcpu->pio.count = vcpu->pio.cur_count = 1; | 2075 | vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = 1; |
1845 | vcpu->run->io.port = vcpu->pio.port = port; | 2076 | vcpu->run->io.port = vcpu->arch.pio.port = port; |
1846 | vcpu->pio.in = in; | 2077 | vcpu->arch.pio.in = in; |
1847 | vcpu->pio.string = 0; | 2078 | vcpu->arch.pio.string = 0; |
1848 | vcpu->pio.down = 0; | 2079 | vcpu->arch.pio.down = 0; |
1849 | vcpu->pio.guest_page_offset = 0; | 2080 | vcpu->arch.pio.guest_page_offset = 0; |
1850 | vcpu->pio.rep = 0; | 2081 | vcpu->arch.pio.rep = 0; |
1851 | 2082 | ||
1852 | kvm_x86_ops->cache_regs(vcpu); | 2083 | kvm_x86_ops->cache_regs(vcpu); |
1853 | memcpy(vcpu->pio_data, &vcpu->regs[VCPU_REGS_RAX], 4); | 2084 | memcpy(vcpu->arch.pio_data, &vcpu->arch.regs[VCPU_REGS_RAX], 4); |
1854 | kvm_x86_ops->decache_regs(vcpu); | 2085 | kvm_x86_ops->decache_regs(vcpu); |
1855 | 2086 | ||
1856 | kvm_x86_ops->skip_emulated_instruction(vcpu); | 2087 | kvm_x86_ops->skip_emulated_instruction(vcpu); |
1857 | 2088 | ||
1858 | pio_dev = vcpu_find_pio_dev(vcpu, port); | 2089 | pio_dev = vcpu_find_pio_dev(vcpu, port); |
1859 | if (pio_dev) { | 2090 | if (pio_dev) { |
1860 | kernel_pio(pio_dev, vcpu, vcpu->pio_data); | 2091 | kernel_pio(pio_dev, vcpu, vcpu->arch.pio_data); |
1861 | complete_pio(vcpu); | 2092 | complete_pio(vcpu); |
1862 | return 1; | 2093 | return 1; |
1863 | } | 2094 | } |
@@ -1877,15 +2108,15 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, | |||
1877 | 2108 | ||
1878 | vcpu->run->exit_reason = KVM_EXIT_IO; | 2109 | vcpu->run->exit_reason = KVM_EXIT_IO; |
1879 | vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; | 2110 | vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; |
1880 | vcpu->run->io.size = vcpu->pio.size = size; | 2111 | vcpu->run->io.size = vcpu->arch.pio.size = size; |
1881 | vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE; | 2112 | vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE; |
1882 | vcpu->run->io.count = vcpu->pio.count = vcpu->pio.cur_count = count; | 2113 | vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = count; |
1883 | vcpu->run->io.port = vcpu->pio.port = port; | 2114 | vcpu->run->io.port = vcpu->arch.pio.port = port; |
1884 | vcpu->pio.in = in; | 2115 | vcpu->arch.pio.in = in; |
1885 | vcpu->pio.string = 1; | 2116 | vcpu->arch.pio.string = 1; |
1886 | vcpu->pio.down = down; | 2117 | vcpu->arch.pio.down = down; |
1887 | vcpu->pio.guest_page_offset = offset_in_page(address); | 2118 | vcpu->arch.pio.guest_page_offset = offset_in_page(address); |
1888 | vcpu->pio.rep = rep; | 2119 | vcpu->arch.pio.rep = rep; |
1889 | 2120 | ||
1890 | if (!count) { | 2121 | if (!count) { |
1891 | kvm_x86_ops->skip_emulated_instruction(vcpu); | 2122 | kvm_x86_ops->skip_emulated_instruction(vcpu); |
@@ -1911,37 +2142,35 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, | |||
1911 | * String I/O in reverse. Yuck. Kill the guest, fix later. | 2142 | * String I/O in reverse. Yuck. Kill the guest, fix later. |
1912 | */ | 2143 | */ |
1913 | pr_unimpl(vcpu, "guest string pio down\n"); | 2144 | pr_unimpl(vcpu, "guest string pio down\n"); |
1914 | inject_gp(vcpu); | 2145 | kvm_inject_gp(vcpu, 0); |
1915 | return 1; | 2146 | return 1; |
1916 | } | 2147 | } |
1917 | vcpu->run->io.count = now; | 2148 | vcpu->run->io.count = now; |
1918 | vcpu->pio.cur_count = now; | 2149 | vcpu->arch.pio.cur_count = now; |
1919 | 2150 | ||
1920 | if (vcpu->pio.cur_count == vcpu->pio.count) | 2151 | if (vcpu->arch.pio.cur_count == vcpu->arch.pio.count) |
1921 | kvm_x86_ops->skip_emulated_instruction(vcpu); | 2152 | kvm_x86_ops->skip_emulated_instruction(vcpu); |
1922 | 2153 | ||
1923 | for (i = 0; i < nr_pages; ++i) { | 2154 | for (i = 0; i < nr_pages; ++i) { |
1924 | mutex_lock(&vcpu->kvm->lock); | 2155 | down_read(&current->mm->mmap_sem);
1925 | page = gva_to_page(vcpu, address + i * PAGE_SIZE); | 2156 | page = gva_to_page(vcpu, address + i * PAGE_SIZE); |
1926 | if (page) | 2157 | vcpu->arch.pio.guest_pages[i] = page; |
1927 | get_page(page); | 2158 | up_read(&current->mm->mmap_sem);
1928 | vcpu->pio.guest_pages[i] = page; | ||
1929 | mutex_unlock(&vcpu->kvm->lock); | ||
1930 | if (!page) { | 2159 | if (!page) { |
1931 | inject_gp(vcpu); | 2160 | kvm_inject_gp(vcpu, 0); |
1932 | free_pio_guest_pages(vcpu); | 2161 | free_pio_guest_pages(vcpu); |
1933 | return 1; | 2162 | return 1; |
1934 | } | 2163 | } |
1935 | } | 2164 | } |
1936 | 2165 | ||
1937 | pio_dev = vcpu_find_pio_dev(vcpu, port); | 2166 | pio_dev = vcpu_find_pio_dev(vcpu, port); |
1938 | if (!vcpu->pio.in) { | 2167 | if (!vcpu->arch.pio.in) { |
1939 | /* string PIO write */ | 2168 | /* string PIO write */ |
1940 | ret = pio_copy_data(vcpu); | 2169 | ret = pio_copy_data(vcpu); |
1941 | if (ret >= 0 && pio_dev) { | 2170 | if (ret >= 0 && pio_dev) { |
1942 | pio_string_write(pio_dev, vcpu); | 2171 | pio_string_write(pio_dev, vcpu); |
1943 | complete_pio(vcpu); | 2172 | complete_pio(vcpu); |
1944 | if (vcpu->pio.count == 0) | 2173 | if (vcpu->arch.pio.count == 0) |
1945 | ret = 1; | 2174 | ret = 1; |
1946 | } | 2175 | } |
1947 | } else if (pio_dev) | 2176 | } else if (pio_dev) |
@@ -1953,6 +2182,263 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, | |||
1953 | } | 2182 | } |
1954 | EXPORT_SYMBOL_GPL(kvm_emulate_pio_string); | 2183 | EXPORT_SYMBOL_GPL(kvm_emulate_pio_string); |
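
When no in-kernel device claims the port, the exit state filled in above (exit_reason, io.direction, io.size, io.port, io.count) is handed to userspace, and the data itself sits in the shared kvm_run mapping at io.data_offset. A minimal sketch of the userspace side, with handle_out() standing in for a hypothetical emulated device:

    #include <linux/kvm.h>
    #include <stdint.h>

    void handle_out(uint16_t port, const void *data, int size); /* hypothetical */

    /* 'run' is the mmap()ed struct kvm_run of the exiting vcpu. */
    static void handle_io_exit(struct kvm_run *run)
    {
            uint8_t *p = (uint8_t *)run + run->io.data_offset;
            uint32_t i;

            if (run->exit_reason != KVM_EXIT_IO ||
                run->io.direction != KVM_EXIT_IO_OUT)
                    return;
            for (i = 0; i < run->io.count; i++, p += run->io.size)
                    handle_out(run->io.port, p, run->io.size);
    }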
1955 | 2184 | ||
2185 | int kvm_arch_init(void *opaque) | ||
2186 | { | ||
2187 | int r; | ||
2188 | struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque; | ||
2189 | |||
2190 | if (kvm_x86_ops) { | ||
2191 | printk(KERN_ERR "kvm: already loaded the other module\n"); | ||
2192 | r = -EEXIST; | ||
2193 | goto out; | ||
2194 | } | ||
2195 | |||
2196 | if (!ops->cpu_has_kvm_support()) { | ||
2197 | printk(KERN_ERR "kvm: no hardware support\n"); | ||
2198 | r = -EOPNOTSUPP; | ||
2199 | goto out; | ||
2200 | } | ||
2201 | if (ops->disabled_by_bios()) { | ||
2202 | printk(KERN_ERR "kvm: disabled by bios\n"); | ||
2203 | r = -EOPNOTSUPP; | ||
2204 | goto out; | ||
2205 | } | ||
2206 | |||
2207 | r = kvm_mmu_module_init(); | ||
2208 | if (r) | ||
2209 | goto out; | ||
2210 | |||
2211 | kvm_init_msr_list(); | ||
2212 | |||
2213 | kvm_x86_ops = ops; | ||
2214 | kvm_mmu_set_nonpresent_ptes(0ull, 0ull); | ||
2215 | return 0; | ||
2216 | |||
2217 | out: | ||
2218 | return r; | ||
2219 | } | ||
2220 | |||
2221 | void kvm_arch_exit(void) | ||
2222 | { | ||
2223 | kvm_x86_ops = NULL; | ||
2224 | kvm_mmu_module_exit(); | ||
2225 | } | ||
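
kvm_arch_init() receives the vendor module's kvm_x86_ops as the opaque pointer and refuses a second registration, which is what keeps kvm-intel and kvm-amd mutually exclusive. A hedged sketch of a vendor module's entry point under this scheme; my_x86_ops and struct vcpu_mine are placeholders, and the assumption is that in this tree kvm_init() forwards the opaque argument to kvm_arch_init():

    /* Hypothetical vendor module registering its ops with kvm.ko. */
    static int __init my_kvm_init(void)
    {
            return kvm_init(&my_x86_ops, sizeof(struct vcpu_mine),
                            THIS_MODULE);
    }
    module_init(my_kvm_init);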
2226 | |||
2227 | int kvm_emulate_halt(struct kvm_vcpu *vcpu) | ||
2228 | { | ||
2229 | ++vcpu->stat.halt_exits; | ||
2230 | if (irqchip_in_kernel(vcpu->kvm)) { | ||
2231 | vcpu->arch.mp_state = VCPU_MP_STATE_HALTED; | ||
2232 | kvm_vcpu_block(vcpu); | ||
2233 | if (vcpu->arch.mp_state != VCPU_MP_STATE_RUNNABLE) | ||
2234 | return -EINTR; | ||
2235 | return 1; | ||
2236 | } else { | ||
2237 | vcpu->run->exit_reason = KVM_EXIT_HLT; | ||
2238 | return 0; | ||
2239 | } | ||
2240 | } | ||
2241 | EXPORT_SYMBOL_GPL(kvm_emulate_halt); | ||
2242 | |||
2243 | int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) | ||
2244 | { | ||
2245 | unsigned long nr, a0, a1, a2, a3, ret; | ||
2246 | |||
2247 | kvm_x86_ops->cache_regs(vcpu); | ||
2248 | |||
2249 | nr = vcpu->arch.regs[VCPU_REGS_RAX]; | ||
2250 | a0 = vcpu->arch.regs[VCPU_REGS_RBX]; | ||
2251 | a1 = vcpu->arch.regs[VCPU_REGS_RCX]; | ||
2252 | a2 = vcpu->arch.regs[VCPU_REGS_RDX]; | ||
2253 | a3 = vcpu->arch.regs[VCPU_REGS_RSI]; | ||
2254 | |||
2255 | if (!is_long_mode(vcpu)) { | ||
2256 | nr &= 0xFFFFFFFF; | ||
2257 | a0 &= 0xFFFFFFFF; | ||
2258 | a1 &= 0xFFFFFFFF; | ||
2259 | a2 &= 0xFFFFFFFF; | ||
2260 | a3 &= 0xFFFFFFFF; | ||
2261 | } | ||
2262 | |||
2263 | switch (nr) { | ||
2264 | case KVM_HC_VAPIC_POLL_IRQ: | ||
2265 | ret = 0; | ||
2266 | break; | ||
2267 | default: | ||
2268 | ret = -KVM_ENOSYS; | ||
2269 | break; | ||
2270 | } | ||
2271 | vcpu->arch.regs[VCPU_REGS_RAX] = ret; | ||
2272 | kvm_x86_ops->decache_regs(vcpu); | ||
2273 | return 0; | ||
2274 | } | ||
2275 | EXPORT_SYMBOL_GPL(kvm_emulate_hypercall); | ||
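
The register reads above define the hypercall ABI: number in RAX, arguments in RBX, RCX, RDX and RSI, result returned in RAX, everything truncated to 32 bits outside long mode. Viewed from the guest, the convention looks roughly like this sketch (VMX guests use vmcall, SVM guests vmmcall; kvm_fix_hypercall() below patches the instruction to the native form as needed):

    /* Guest-side sketch of a one-argument hypercall; illustrative only. */
    static inline long kvm_hypercall1_sketch(unsigned int nr, unsigned long a0)
    {
            long ret;

            asm volatile("vmcall"
                         : "=a"(ret)
                         : "a"(nr), "b"(a0)
                         : "memory");
            return ret;
    }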
2276 | |||
2277 | int kvm_fix_hypercall(struct kvm_vcpu *vcpu) | ||
2278 | { | ||
2279 | char instruction[3]; | ||
2280 | int ret = 0; | ||
2281 | |||
2282 | |||
2283 | /* | ||
2284 | * Blow out the MMU so that no other VCPU has an active mapping; this | ||
2285 | * ensures that the updated hypercall appears atomically across all | ||
2286 | * VCPUs. | ||
2287 | */ | ||
2288 | kvm_mmu_zap_all(vcpu->kvm); | ||
2289 | |||
2290 | kvm_x86_ops->cache_regs(vcpu); | ||
2291 | kvm_x86_ops->patch_hypercall(vcpu, instruction); | ||
2292 | if (emulator_write_emulated(vcpu->arch.rip, instruction, 3, vcpu) | ||
2293 | != X86EMUL_CONTINUE) | ||
2294 | ret = -EFAULT; | ||
2295 | |||
2296 | return ret; | ||
2297 | } | ||
2298 | |||
2299 | static u64 mk_cr_64(u64 curr_cr, u32 new_val) | ||
2300 | { | ||
2301 | return (curr_cr & ~((1ULL << 32) - 1)) | new_val; | ||
2302 | } | ||
2303 | |||
2304 | void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base) | ||
2305 | { | ||
2306 | struct descriptor_table dt = { limit, base }; | ||
2307 | |||
2308 | kvm_x86_ops->set_gdt(vcpu, &dt); | ||
2309 | } | ||
2310 | |||
2311 | void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base) | ||
2312 | { | ||
2313 | struct descriptor_table dt = { limit, base }; | ||
2314 | |||
2315 | kvm_x86_ops->set_idt(vcpu, &dt); | ||
2316 | } | ||
2317 | |||
2318 | void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw, | ||
2319 | unsigned long *rflags) | ||
2320 | { | ||
2321 | lmsw(vcpu, msw); | ||
2322 | *rflags = kvm_x86_ops->get_rflags(vcpu); | ||
2323 | } | ||
2324 | |||
2325 | unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr) | ||
2326 | { | ||
2327 | kvm_x86_ops->decache_cr4_guest_bits(vcpu); | ||
2328 | switch (cr) { | ||
2329 | case 0: | ||
2330 | return vcpu->arch.cr0; | ||
2331 | case 2: | ||
2332 | return vcpu->arch.cr2; | ||
2333 | case 3: | ||
2334 | return vcpu->arch.cr3; | ||
2335 | case 4: | ||
2336 | return vcpu->arch.cr4; | ||
2337 | case 8: | ||
2338 | return get_cr8(vcpu); | ||
2339 | default: | ||
2340 | vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr); | ||
2341 | return 0; | ||
2342 | } | ||
2343 | } | ||
2344 | |||
2345 | void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val, | ||
2346 | unsigned long *rflags) | ||
2347 | { | ||
2348 | switch (cr) { | ||
2349 | case 0: | ||
2350 | set_cr0(vcpu, mk_cr_64(vcpu->arch.cr0, val)); | ||
2351 | *rflags = kvm_x86_ops->get_rflags(vcpu); | ||
2352 | break; | ||
2353 | case 2: | ||
2354 | vcpu->arch.cr2 = val; | ||
2355 | break; | ||
2356 | case 3: | ||
2357 | set_cr3(vcpu, val); | ||
2358 | break; | ||
2359 | case 4: | ||
2360 | set_cr4(vcpu, mk_cr_64(vcpu->arch.cr4, val)); | ||
2361 | break; | ||
2362 | case 8: | ||
2363 | set_cr8(vcpu, val & 0xfUL); | ||
2364 | break; | ||
2365 | default: | ||
2366 | vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr); | ||
2367 | } | ||
2368 | } | ||
2369 | |||
2370 | static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i) | ||
2371 | { | ||
2372 | struct kvm_cpuid_entry2 *e = &vcpu->arch.cpuid_entries[i]; | ||
2373 | int j, nent = vcpu->arch.cpuid_nent; | ||
2374 | |||
2375 | e->flags &= ~KVM_CPUID_FLAG_STATE_READ_NEXT; | ||
2376 | /* when no next entry is found, the current entry[i] is reselected */ | ||
2377 | for (j = (i + 1) % nent; j != i; j = (j + 1) % nent) { | ||
2378 | struct kvm_cpuid_entry2 *ej = &vcpu->arch.cpuid_entries[j]; | ||
2379 | if (ej->function == e->function) { | ||
2380 | ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT; | ||
2381 | return j; | ||
2382 | } | ||
2383 | } | ||
2384 | return 0; /* reached only if no other entry shares this function */ | ||
2385 | } | ||
2386 | |||
2387 | /* find an entry with matching function, matching index (if needed), and that | ||
2388 | * should be read next (if it's stateful) */ | ||
2389 | static int is_matching_cpuid_entry(struct kvm_cpuid_entry2 *e, | ||
2390 | u32 function, u32 index) | ||
2391 | { | ||
2392 | if (e->function != function) | ||
2393 | return 0; | ||
2394 | if ((e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) && e->index != index) | ||
2395 | return 0; | ||
2396 | if ((e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) && | ||
2397 | !(e->flags & KVM_CPUID_FLAG_STATE_READ_NEXT)) | ||
2398 | return 0; | ||
2399 | return 1; | ||
2400 | } | ||
2401 | |||
2402 | void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) | ||
2403 | { | ||
2404 | int i; | ||
2405 | u32 function, index; | ||
2406 | struct kvm_cpuid_entry2 *e, *best; | ||
2407 | |||
2408 | kvm_x86_ops->cache_regs(vcpu); | ||
2409 | function = vcpu->arch.regs[VCPU_REGS_RAX]; | ||
2410 | index = vcpu->arch.regs[VCPU_REGS_RCX]; | ||
2411 | vcpu->arch.regs[VCPU_REGS_RAX] = 0; | ||
2412 | vcpu->arch.regs[VCPU_REGS_RBX] = 0; | ||
2413 | vcpu->arch.regs[VCPU_REGS_RCX] = 0; | ||
2414 | vcpu->arch.regs[VCPU_REGS_RDX] = 0; | ||
2415 | best = NULL; | ||
2416 | for (i = 0; i < vcpu->arch.cpuid_nent; ++i) { | ||
2417 | e = &vcpu->arch.cpuid_entries[i]; | ||
2418 | if (is_matching_cpuid_entry(e, function, index)) { | ||
2419 | if (e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) | ||
2420 | move_to_next_stateful_cpuid_entry(vcpu, i); | ||
2421 | best = e; | ||
2422 | break; | ||
2423 | } | ||
2424 | /* | ||
2425 | * Both basic or both extended? | ||
2426 | */ | ||
2427 | if (((e->function ^ function) & 0x80000000) == 0) | ||
2428 | if (!best || e->function > best->function) | ||
2429 | best = e; | ||
2430 | } | ||
2431 | if (best) { | ||
2432 | vcpu->arch.regs[VCPU_REGS_RAX] = best->eax; | ||
2433 | vcpu->arch.regs[VCPU_REGS_RBX] = best->ebx; | ||
2434 | vcpu->arch.regs[VCPU_REGS_RCX] = best->ecx; | ||
2435 | vcpu->arch.regs[VCPU_REGS_RDX] = best->edx; | ||
2436 | } | ||
2437 | kvm_x86_ops->decache_regs(vcpu); | ||
2438 | kvm_x86_ops->skip_emulated_instruction(vcpu); | ||
2439 | } | ||
2440 | EXPORT_SYMBOL_GPL(kvm_emulate_cpuid); | ||
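
Two details in the lookup above deserve a note: the (e->function ^ function) & 0x80000000 test lets the fallback pick the highest leaf only within the same namespace, basic (0x0000xxxx) versus extended (0x8000xxxx), and KVM_CPUID_FLAG_STATE_READ_NEXT gives multi-read leaves such as CPUID 2 round-robin behavior. The range test in isolation:

    #include <stdint.h>

    /* A leaf may only stand in for a query from its own namespace. */
    static int same_cpuid_range(uint32_t a, uint32_t b)
    {
            return ((a ^ b) & 0x80000000u) == 0;
    }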
2441 | |||
1956 | /* | 2442 | /* |
1957 | * Check if userspace requested an interrupt window, and that the | 2443 | * Check if userspace requested an interrupt window, and that the |
1958 | * interrupt window is open. | 2444 | * interrupt window is open. |
@@ -1962,9 +2448,9 @@ EXPORT_SYMBOL_GPL(kvm_emulate_pio_string); | |||
1962 | static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu, | 2448 | static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu, |
1963 | struct kvm_run *kvm_run) | 2449 | struct kvm_run *kvm_run) |
1964 | { | 2450 | { |
1965 | return (!vcpu->irq_summary && | 2451 | return (!vcpu->arch.irq_summary && |
1966 | kvm_run->request_interrupt_window && | 2452 | kvm_run->request_interrupt_window && |
1967 | vcpu->interrupt_window_open && | 2453 | vcpu->arch.interrupt_window_open && |
1968 | (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF)); | 2454 | (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF)); |
1969 | } | 2455 | } |
1970 | 2456 | ||
@@ -1978,22 +2464,51 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu, | |||
1978 | kvm_run->ready_for_interrupt_injection = 1; | 2464 | kvm_run->ready_for_interrupt_injection = 1; |
1979 | else | 2465 | else |
1980 | kvm_run->ready_for_interrupt_injection = | 2466 | kvm_run->ready_for_interrupt_injection = |
1981 | (vcpu->interrupt_window_open && | 2467 | (vcpu->arch.interrupt_window_open && |
1982 | vcpu->irq_summary == 0); | 2468 | vcpu->arch.irq_summary == 0); |
2469 | } | ||
2470 | |||
2471 | static void vapic_enter(struct kvm_vcpu *vcpu) | ||
2472 | { | ||
2473 | struct kvm_lapic *apic = vcpu->arch.apic; | ||
2474 | struct page *page; | ||
2475 | |||
2476 | if (!apic || !apic->vapic_addr) | ||
2477 | return; | ||
2478 | |||
2479 | down_read(¤t->mm->mmap_sem); | ||
2480 | page = gfn_to_page(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT); | ||
2481 | vcpu->arch.apic->vapic_page = page; | ||
2482 | up_read(¤t->mm->mmap_sem); | ||
2483 | } | ||
2484 | |||
2485 | static void vapic_exit(struct kvm_vcpu *vcpu) | ||
2486 | { | ||
2487 | struct kvm_lapic *apic = vcpu->arch.apic; | ||
2488 | |||
2489 | if (!apic || !apic->vapic_addr) | ||
2490 | return; | ||
2491 | |||
2492 | kvm_release_page_dirty(apic->vapic_page); | ||
2493 | mark_page_dirty(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT); | ||
1983 | } | 2494 | } |
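vapic_enter()/vapic_exit() bracket the run loop that follows: the page backing the guest's virtual-APIC area is pinned up front (under mmap_sem, since gfn_to_page() may fault it in) and released as dirty afterwards, so dirty-page tracking sees the kernel's writes to it. The lifecycle in outline, with hypothetical pin/unpin helpers standing in for gfn_to_page() and kvm_release_page_dirty()/mark_page_dirty():

#include <stdint.h>

/* hypothetical helpers, standing in for the kernel primitives */
void *pin_guest_page(uint64_t gfn);
void  unpin_guest_page_dirty(void *page, uint64_t gfn);
void  run_vcpu_loop(void);

void run_with_vapic(uint64_t vapic_gpa)
{
        uint64_t gfn = vapic_gpa >> 12;         /* PAGE_SHIFT on x86 */

        void *page = pin_guest_page(gfn);       /* cf. gfn_to_page() */
        run_vcpu_loop();                        /* guest may run many times */
        unpin_guest_page_dirty(page, gfn);      /* cf. kvm_release_page_dirty()
                                                 * + mark_page_dirty() */
}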
1984 | 2495 | ||
1985 | static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | 2496 | static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) |
1986 | { | 2497 | { |
1987 | int r; | 2498 | int r; |
1988 | 2499 | ||
1989 | if (unlikely(vcpu->mp_state == VCPU_MP_STATE_SIPI_RECEIVED)) { | 2500 | if (unlikely(vcpu->arch.mp_state == VCPU_MP_STATE_SIPI_RECEIVED)) { |
1990 | printk("vcpu %d received sipi with vector # %x\n", | 2501 | pr_debug("vcpu %d received sipi with vector # %x\n", |
1991 | vcpu->vcpu_id, vcpu->sipi_vector); | 2502 | vcpu->vcpu_id, vcpu->arch.sipi_vector); |
1992 | kvm_lapic_reset(vcpu); | 2503 | kvm_lapic_reset(vcpu); |
1993 | kvm_x86_ops->vcpu_reset(vcpu); | 2504 | r = kvm_x86_ops->vcpu_reset(vcpu); |
1994 | vcpu->mp_state = VCPU_MP_STATE_RUNNABLE; | 2505 | if (r) |
2506 | return r; | ||
2507 | vcpu->arch.mp_state = VCPU_MP_STATE_RUNNABLE; | ||
1995 | } | 2508 | } |
1996 | 2509 | ||
2510 | vapic_enter(vcpu); | ||
2511 | |||
1997 | preempted: | 2512 | preempted: |
1998 | if (vcpu->guest_debug.enabled) | 2513 | if (vcpu->guest_debug.enabled) |
1999 | kvm_x86_ops->guest_debug_pre(vcpu); | 2514 | kvm_x86_ops->guest_debug_pre(vcpu); |
@@ -2003,6 +2518,19 @@ again: | |||
2003 | if (unlikely(r)) | 2518 | if (unlikely(r)) |
2004 | goto out; | 2519 | goto out; |
2005 | 2520 | ||
2521 | if (vcpu->requests) { | ||
2522 | if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests)) | ||
2523 | __kvm_migrate_apic_timer(vcpu); | ||
2524 | if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS, | ||
2525 | &vcpu->requests)) { | ||
2526 | kvm_run->exit_reason = KVM_EXIT_TPR_ACCESS; | ||
2527 | r = 0; | ||
2528 | goto out; | ||
2529 | } | ||
2530 | } | ||
2531 | |||
2532 | kvm_inject_pending_timer_irqs(vcpu); | ||
2533 | |||
2006 | preempt_disable(); | 2534 | preempt_disable(); |
2007 | 2535 | ||
2008 | kvm_x86_ops->prepare_guest_switch(vcpu); | 2536 | kvm_x86_ops->prepare_guest_switch(vcpu); |
@@ -2010,6 +2538,13 @@ again: | |||
2010 | 2538 | ||
2011 | local_irq_disable(); | 2539 | local_irq_disable(); |
2012 | 2540 | ||
2541 | if (need_resched()) { | ||
2542 | local_irq_enable(); | ||
2543 | preempt_enable(); | ||
2544 | r = 1; | ||
2545 | goto out; | ||
2546 | } | ||
2547 | |||
2013 | if (signal_pending(current)) { | 2548 | if (signal_pending(current)) { |
2014 | local_irq_enable(); | 2549 | local_irq_enable(); |
2015 | preempt_enable(); | 2550 | preempt_enable(); |
@@ -2019,16 +2554,20 @@ again: | |||
2019 | goto out; | 2554 | goto out; |
2020 | } | 2555 | } |
2021 | 2556 | ||
2022 | if (irqchip_in_kernel(vcpu->kvm)) | 2557 | if (vcpu->arch.exception.pending) |
2558 | __queue_exception(vcpu); | ||
2559 | else if (irqchip_in_kernel(vcpu->kvm)) | ||
2023 | kvm_x86_ops->inject_pending_irq(vcpu); | 2560 | kvm_x86_ops->inject_pending_irq(vcpu); |
2024 | else if (!vcpu->mmio_read_completed) | 2561 | else |
2025 | kvm_x86_ops->inject_pending_vectors(vcpu, kvm_run); | 2562 | kvm_x86_ops->inject_pending_vectors(vcpu, kvm_run); |
2026 | 2563 | ||
2564 | kvm_lapic_sync_to_vapic(vcpu); | ||
2565 | |||
2027 | vcpu->guest_mode = 1; | 2566 | vcpu->guest_mode = 1; |
2028 | kvm_guest_enter(); | 2567 | kvm_guest_enter(); |
2029 | 2568 | ||
2030 | if (vcpu->requests) | 2569 | if (vcpu->requests) |
2031 | if (test_and_clear_bit(KVM_TLB_FLUSH, &vcpu->requests)) | 2570 | if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests)) |
2032 | kvm_x86_ops->tlb_flush(vcpu); | 2571 | kvm_x86_ops->tlb_flush(vcpu); |
2033 | 2572 | ||
2034 | kvm_x86_ops->run(vcpu, kvm_run); | 2573 | kvm_x86_ops->run(vcpu, kvm_run); |
@@ -2055,9 +2594,14 @@ again: | |||
2055 | */ | 2594 | */ |
2056 | if (unlikely(prof_on == KVM_PROFILING)) { | 2595 | if (unlikely(prof_on == KVM_PROFILING)) { |
2057 | kvm_x86_ops->cache_regs(vcpu); | 2596 | kvm_x86_ops->cache_regs(vcpu); |
2058 | profile_hit(KVM_PROFILING, (void *)vcpu->rip); | 2597 | profile_hit(KVM_PROFILING, (void *)vcpu->arch.rip); |
2059 | } | 2598 | } |
2060 | 2599 | ||
2600 | if (vcpu->arch.exception.pending && kvm_x86_ops->exception_injected(vcpu)) | ||
2601 | vcpu->arch.exception.pending = false; | ||
2602 | |||
2603 | kvm_lapic_sync_from_vapic(vcpu); | ||
2604 | |||
2061 | r = kvm_x86_ops->handle_exit(kvm_run, vcpu); | 2605 | r = kvm_x86_ops->handle_exit(kvm_run, vcpu); |
2062 | 2606 | ||
2063 | if (r > 0) { | 2607 | if (r > 0) { |
@@ -2067,10 +2611,8 @@ again: | |||
2067 | ++vcpu->stat.request_irq_exits; | 2611 | ++vcpu->stat.request_irq_exits; |
2068 | goto out; | 2612 | goto out; |
2069 | } | 2613 | } |
2070 | if (!need_resched()) { | 2614 | if (!need_resched()) |
2071 | ++vcpu->stat.light_exits; | ||
2072 | goto again; | 2615 | goto again; |
2073 | } | ||
2074 | } | 2616 | } |
2075 | 2617 | ||
2076 | out: | 2618 | out: |
@@ -2081,18 +2623,19 @@ out: | |||
2081 | 2623 | ||
2082 | post_kvm_run_save(vcpu, kvm_run); | 2624 | post_kvm_run_save(vcpu, kvm_run); |
2083 | 2625 | ||
2626 | vapic_exit(vcpu); | ||
2627 | |||
2084 | return r; | 2628 | return r; |
2085 | } | 2629 | } |
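The vcpu->requests bits drained near the top of __vcpu_run() above (KVM_REQ_MIGRATE_TIMER, KVM_REQ_REPORT_TPR_ACCESS, and KVM_REQ_TLB_FLUSH just before guest entry) follow a lock-free produce/consume idiom: any thread may set a bit; only the vcpu thread clears it, atomically, before re-entering the guest. A user-space approximation with C11 atomics — the names are illustrative, not the kernel API:

#include <stdatomic.h>

enum { REQ_TLB_FLUSH, REQ_MIGRATE_TIMER, REQ_REPORT_TPR_ACCESS };

static atomic_ulong requests;

/* producer side: any thread may post a request to the vcpu */
static void post_request(int req)
{
        atomic_fetch_or(&requests, 1UL << req);
}

/* consumer side: the vcpu thread drains bits before guest entry */
static int test_and_clear_request(int req)
{
        unsigned long mask = 1UL << req;

        return (atomic_fetch_and(&requests, ~mask) & mask) != 0;
}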
2086 | 2630 | ||
2087 | 2631 | int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |
2088 | static int kvm_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | ||
2089 | { | 2632 | { |
2090 | int r; | 2633 | int r; |
2091 | sigset_t sigsaved; | 2634 | sigset_t sigsaved; |
2092 | 2635 | ||
2093 | vcpu_load(vcpu); | 2636 | vcpu_load(vcpu); |
2094 | 2637 | ||
2095 | if (unlikely(vcpu->mp_state == VCPU_MP_STATE_UNINITIALIZED)) { | 2638 | if (unlikely(vcpu->arch.mp_state == VCPU_MP_STATE_UNINITIALIZED)) { |
2096 | kvm_vcpu_block(vcpu); | 2639 | kvm_vcpu_block(vcpu); |
2097 | vcpu_put(vcpu); | 2640 | vcpu_put(vcpu); |
2098 | return -EAGAIN; | 2641 | return -EAGAIN; |
@@ -2105,18 +2648,19 @@ static int kvm_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
2105 | if (!irqchip_in_kernel(vcpu->kvm)) | 2648 | if (!irqchip_in_kernel(vcpu->kvm)) |
2106 | set_cr8(vcpu, kvm_run->cr8); | 2649 | set_cr8(vcpu, kvm_run->cr8); |
2107 | 2650 | ||
2108 | if (vcpu->pio.cur_count) { | 2651 | if (vcpu->arch.pio.cur_count) { |
2109 | r = complete_pio(vcpu); | 2652 | r = complete_pio(vcpu); |
2110 | if (r) | 2653 | if (r) |
2111 | goto out; | 2654 | goto out; |
2112 | } | 2655 | } |
2113 | 2656 | #ifdef CONFIG_HAS_IOMEM | 
2113 | 2656 | #ifdef CONFIG_HAS_IOMEM | 
2114 | if (vcpu->mmio_needed) { | 2657 | if (vcpu->mmio_needed) { |
2115 | memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8); | 2658 | memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8); |
2116 | vcpu->mmio_read_completed = 1; | 2659 | vcpu->mmio_read_completed = 1; |
2117 | vcpu->mmio_needed = 0; | 2660 | vcpu->mmio_needed = 0; |
2118 | r = emulate_instruction(vcpu, kvm_run, | 2661 | r = emulate_instruction(vcpu, kvm_run, |
2119 | vcpu->mmio_fault_cr2, 0); | 2662 | vcpu->arch.mmio_fault_cr2, 0, |
2663 | EMULTYPE_NO_DECODE); | ||
2120 | if (r == EMULATE_DO_MMIO) { | 2664 | if (r == EMULATE_DO_MMIO) { |
2121 | /* | 2665 | /* |
2122 | * Read-modify-write. Back to userspace. | 2666 | * Read-modify-write. Back to userspace. |
@@ -2125,10 +2669,10 @@ static int kvm_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
2125 | goto out; | 2669 | goto out; |
2126 | } | 2670 | } |
2127 | } | 2671 | } |
2128 | 2672 | #endif | |
2129 | if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL) { | 2673 | if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL) { |
2130 | kvm_x86_ops->cache_regs(vcpu); | 2674 | kvm_x86_ops->cache_regs(vcpu); |
2131 | vcpu->regs[VCPU_REGS_RAX] = kvm_run->hypercall.ret; | 2675 | vcpu->arch.regs[VCPU_REGS_RAX] = kvm_run->hypercall.ret; |
2132 | kvm_x86_ops->decache_regs(vcpu); | 2676 | kvm_x86_ops->decache_regs(vcpu); |
2133 | } | 2677 | } |
2134 | 2678 | ||
@@ -2142,33 +2686,32 @@ out: | |||
2142 | return r; | 2686 | return r; |
2143 | } | 2687 | } |
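The mmio_needed block in kvm_arch_vcpu_ioctl_run() above is the kernel half of a round trip: a load the in-kernel emulator could not satisfy exits to userspace as KVM_EXIT_MMIO; userspace fills kvm_run->mmio.data and calls KVM_RUN again, which feeds the data back through emulate_instruction() with EMULTYPE_NO_DECODE so the interrupted instruction is resumed rather than re-decoded. A minimal sketch of the user-space side (error handling elided; device_read() is a hypothetical VMM helper):

#include <linux/kvm.h>
#include <sys/ioctl.h>

/* hypothetical device-model read: fills data from emulated MMIO space */
void device_read(unsigned long long addr, void *data, unsigned int len);

void vcpu_loop(int vcpu_fd, struct kvm_run *run)
{
        for (;;) {
                ioctl(vcpu_fd, KVM_RUN, 0);
                if (run->exit_reason == KVM_EXIT_MMIO && !run->mmio.is_write)
                        /* supply the data the guest load expects; the
                         * next KVM_RUN completes the instruction */
                        device_read(run->mmio.phys_addr,
                                    run->mmio.data, run->mmio.len);
        }
}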
2144 | 2688 | ||
2145 | static int kvm_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, | 2689 | int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) |
2146 | struct kvm_regs *regs) | ||
2147 | { | 2690 | { |
2148 | vcpu_load(vcpu); | 2691 | vcpu_load(vcpu); |
2149 | 2692 | ||
2150 | kvm_x86_ops->cache_regs(vcpu); | 2693 | kvm_x86_ops->cache_regs(vcpu); |
2151 | 2694 | ||
2152 | regs->rax = vcpu->regs[VCPU_REGS_RAX]; | 2695 | regs->rax = vcpu->arch.regs[VCPU_REGS_RAX]; |
2153 | regs->rbx = vcpu->regs[VCPU_REGS_RBX]; | 2696 | regs->rbx = vcpu->arch.regs[VCPU_REGS_RBX]; |
2154 | regs->rcx = vcpu->regs[VCPU_REGS_RCX]; | 2697 | regs->rcx = vcpu->arch.regs[VCPU_REGS_RCX]; |
2155 | regs->rdx = vcpu->regs[VCPU_REGS_RDX]; | 2698 | regs->rdx = vcpu->arch.regs[VCPU_REGS_RDX]; |
2156 | regs->rsi = vcpu->regs[VCPU_REGS_RSI]; | 2699 | regs->rsi = vcpu->arch.regs[VCPU_REGS_RSI]; |
2157 | regs->rdi = vcpu->regs[VCPU_REGS_RDI]; | 2700 | regs->rdi = vcpu->arch.regs[VCPU_REGS_RDI]; |
2158 | regs->rsp = vcpu->regs[VCPU_REGS_RSP]; | 2701 | regs->rsp = vcpu->arch.regs[VCPU_REGS_RSP]; |
2159 | regs->rbp = vcpu->regs[VCPU_REGS_RBP]; | 2702 | regs->rbp = vcpu->arch.regs[VCPU_REGS_RBP]; |
2160 | #ifdef CONFIG_X86_64 | 2703 | #ifdef CONFIG_X86_64 |
2161 | regs->r8 = vcpu->regs[VCPU_REGS_R8]; | 2704 | regs->r8 = vcpu->arch.regs[VCPU_REGS_R8]; |
2162 | regs->r9 = vcpu->regs[VCPU_REGS_R9]; | 2705 | regs->r9 = vcpu->arch.regs[VCPU_REGS_R9]; |
2163 | regs->r10 = vcpu->regs[VCPU_REGS_R10]; | 2706 | regs->r10 = vcpu->arch.regs[VCPU_REGS_R10]; |
2164 | regs->r11 = vcpu->regs[VCPU_REGS_R11]; | 2707 | regs->r11 = vcpu->arch.regs[VCPU_REGS_R11]; |
2165 | regs->r12 = vcpu->regs[VCPU_REGS_R12]; | 2708 | regs->r12 = vcpu->arch.regs[VCPU_REGS_R12]; |
2166 | regs->r13 = vcpu->regs[VCPU_REGS_R13]; | 2709 | regs->r13 = vcpu->arch.regs[VCPU_REGS_R13]; |
2167 | regs->r14 = vcpu->regs[VCPU_REGS_R14]; | 2710 | regs->r14 = vcpu->arch.regs[VCPU_REGS_R14]; |
2168 | regs->r15 = vcpu->regs[VCPU_REGS_R15]; | 2711 | regs->r15 = vcpu->arch.regs[VCPU_REGS_R15]; |
2169 | #endif | 2712 | #endif |
2170 | 2713 | ||
2171 | regs->rip = vcpu->rip; | 2714 | regs->rip = vcpu->arch.rip; |
2172 | regs->rflags = kvm_x86_ops->get_rflags(vcpu); | 2715 | regs->rflags = kvm_x86_ops->get_rflags(vcpu); |
2173 | 2716 | ||
2174 | /* | 2717 | /* |
@@ -2182,31 +2725,30 @@ static int kvm_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, | |||
2182 | return 0; | 2725 | return 0; |
2183 | } | 2726 | } |
2184 | 2727 | ||
2185 | static int kvm_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, | 2728 | int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) |
2186 | struct kvm_regs *regs) | ||
2187 | { | 2729 | { |
2188 | vcpu_load(vcpu); | 2730 | vcpu_load(vcpu); |
2189 | 2731 | ||
2190 | vcpu->regs[VCPU_REGS_RAX] = regs->rax; | 2732 | vcpu->arch.regs[VCPU_REGS_RAX] = regs->rax; |
2191 | vcpu->regs[VCPU_REGS_RBX] = regs->rbx; | 2733 | vcpu->arch.regs[VCPU_REGS_RBX] = regs->rbx; |
2192 | vcpu->regs[VCPU_REGS_RCX] = regs->rcx; | 2734 | vcpu->arch.regs[VCPU_REGS_RCX] = regs->rcx; |
2193 | vcpu->regs[VCPU_REGS_RDX] = regs->rdx; | 2735 | vcpu->arch.regs[VCPU_REGS_RDX] = regs->rdx; |
2194 | vcpu->regs[VCPU_REGS_RSI] = regs->rsi; | 2736 | vcpu->arch.regs[VCPU_REGS_RSI] = regs->rsi; |
2195 | vcpu->regs[VCPU_REGS_RDI] = regs->rdi; | 2737 | vcpu->arch.regs[VCPU_REGS_RDI] = regs->rdi; |
2196 | vcpu->regs[VCPU_REGS_RSP] = regs->rsp; | 2738 | vcpu->arch.regs[VCPU_REGS_RSP] = regs->rsp; |
2197 | vcpu->regs[VCPU_REGS_RBP] = regs->rbp; | 2739 | vcpu->arch.regs[VCPU_REGS_RBP] = regs->rbp; |
2198 | #ifdef CONFIG_X86_64 | 2740 | #ifdef CONFIG_X86_64 |
2199 | vcpu->regs[VCPU_REGS_R8] = regs->r8; | 2741 | vcpu->arch.regs[VCPU_REGS_R8] = regs->r8; |
2200 | vcpu->regs[VCPU_REGS_R9] = regs->r9; | 2742 | vcpu->arch.regs[VCPU_REGS_R9] = regs->r9; |
2201 | vcpu->regs[VCPU_REGS_R10] = regs->r10; | 2743 | vcpu->arch.regs[VCPU_REGS_R10] = regs->r10; |
2202 | vcpu->regs[VCPU_REGS_R11] = regs->r11; | 2744 | vcpu->arch.regs[VCPU_REGS_R11] = regs->r11; |
2203 | vcpu->regs[VCPU_REGS_R12] = regs->r12; | 2745 | vcpu->arch.regs[VCPU_REGS_R12] = regs->r12; |
2204 | vcpu->regs[VCPU_REGS_R13] = regs->r13; | 2746 | vcpu->arch.regs[VCPU_REGS_R13] = regs->r13; |
2205 | vcpu->regs[VCPU_REGS_R14] = regs->r14; | 2747 | vcpu->arch.regs[VCPU_REGS_R14] = regs->r14; |
2206 | vcpu->regs[VCPU_REGS_R15] = regs->r15; | 2748 | vcpu->arch.regs[VCPU_REGS_R15] = regs->r15; |
2207 | #endif | 2749 | #endif |
2208 | 2750 | ||
2209 | vcpu->rip = regs->rip; | 2751 | vcpu->arch.rip = regs->rip; |
2210 | kvm_x86_ops->set_rflags(vcpu, regs->rflags); | 2752 | kvm_x86_ops->set_rflags(vcpu, regs->rflags); |
2211 | 2753 | ||
2212 | kvm_x86_ops->decache_regs(vcpu); | 2754 | kvm_x86_ops->decache_regs(vcpu); |
@@ -2222,8 +2764,18 @@ static void get_segment(struct kvm_vcpu *vcpu, | |||
2222 | return kvm_x86_ops->get_segment(vcpu, var, seg); | 2764 | return kvm_x86_ops->get_segment(vcpu, var, seg); |
2223 | } | 2765 | } |
2224 | 2766 | ||
2225 | static int kvm_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, | 2767 | void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) |
2226 | struct kvm_sregs *sregs) | 2768 | { |
2769 | struct kvm_segment cs; | ||
2770 | |||
2771 | get_segment(vcpu, &cs, VCPU_SREG_CS); | ||
2772 | *db = cs.db; | ||
2773 | *l = cs.l; | ||
2774 | } | ||
2775 | EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits); | ||
2776 | |||
2777 | int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, | ||
2778 | struct kvm_sregs *sregs) | ||
2227 | { | 2779 | { |
2228 | struct descriptor_table dt; | 2780 | struct descriptor_table dt; |
2229 | int pending_vec; | 2781 | int pending_vec; |
@@ -2248,12 +2800,12 @@ static int kvm_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, | |||
2248 | sregs->gdt.base = dt.base; | 2800 | sregs->gdt.base = dt.base; |
2249 | 2801 | ||
2250 | kvm_x86_ops->decache_cr4_guest_bits(vcpu); | 2802 | kvm_x86_ops->decache_cr4_guest_bits(vcpu); |
2251 | sregs->cr0 = vcpu->cr0; | 2803 | sregs->cr0 = vcpu->arch.cr0; |
2252 | sregs->cr2 = vcpu->cr2; | 2804 | sregs->cr2 = vcpu->arch.cr2; |
2253 | sregs->cr3 = vcpu->cr3; | 2805 | sregs->cr3 = vcpu->arch.cr3; |
2254 | sregs->cr4 = vcpu->cr4; | 2806 | sregs->cr4 = vcpu->arch.cr4; |
2255 | sregs->cr8 = get_cr8(vcpu); | 2807 | sregs->cr8 = get_cr8(vcpu); |
2256 | sregs->efer = vcpu->shadow_efer; | 2808 | sregs->efer = vcpu->arch.shadow_efer; |
2257 | sregs->apic_base = kvm_get_apic_base(vcpu); | 2809 | sregs->apic_base = kvm_get_apic_base(vcpu); |
2258 | 2810 | ||
2259 | if (irqchip_in_kernel(vcpu->kvm)) { | 2811 | if (irqchip_in_kernel(vcpu->kvm)) { |
@@ -2261,9 +2813,10 @@ static int kvm_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, | |||
2261 | sizeof sregs->interrupt_bitmap); | 2813 | sizeof sregs->interrupt_bitmap); |
2262 | pending_vec = kvm_x86_ops->get_irq(vcpu); | 2814 | pending_vec = kvm_x86_ops->get_irq(vcpu); |
2263 | if (pending_vec >= 0) | 2815 | if (pending_vec >= 0) |
2264 | set_bit(pending_vec, (unsigned long *)sregs->interrupt_bitmap); | 2816 | set_bit(pending_vec, |
2817 | (unsigned long *)sregs->interrupt_bitmap); | ||
2265 | } else | 2818 | } else |
2266 | memcpy(sregs->interrupt_bitmap, vcpu->irq_pending, | 2819 | memcpy(sregs->interrupt_bitmap, vcpu->arch.irq_pending, |
2267 | sizeof sregs->interrupt_bitmap); | 2820 | sizeof sregs->interrupt_bitmap); |
2268 | 2821 | ||
2269 | vcpu_put(vcpu); | 2822 | vcpu_put(vcpu); |
@@ -2277,8 +2830,8 @@ static void set_segment(struct kvm_vcpu *vcpu, | |||
2277 | return kvm_x86_ops->set_segment(vcpu, var, seg); | 2830 | return kvm_x86_ops->set_segment(vcpu, var, seg); |
2278 | } | 2831 | } |
2279 | 2832 | ||
2280 | static int kvm_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, | 2833 | int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, |
2281 | struct kvm_sregs *sregs) | 2834 | struct kvm_sregs *sregs) |
2282 | { | 2835 | { |
2283 | int mmu_reset_needed = 0; | 2836 | int mmu_reset_needed = 0; |
2284 | int i, pending_vec, max_bits; | 2837 | int i, pending_vec, max_bits; |
@@ -2293,13 +2846,13 @@ static int kvm_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, | |||
2293 | dt.base = sregs->gdt.base; | 2846 | dt.base = sregs->gdt.base; |
2294 | kvm_x86_ops->set_gdt(vcpu, &dt); | 2847 | kvm_x86_ops->set_gdt(vcpu, &dt); |
2295 | 2848 | ||
2296 | vcpu->cr2 = sregs->cr2; | 2849 | vcpu->arch.cr2 = sregs->cr2; |
2297 | mmu_reset_needed |= vcpu->cr3 != sregs->cr3; | 2850 | mmu_reset_needed |= vcpu->arch.cr3 != sregs->cr3; |
2298 | vcpu->cr3 = sregs->cr3; | 2851 | vcpu->arch.cr3 = sregs->cr3; |
2299 | 2852 | ||
2300 | set_cr8(vcpu, sregs->cr8); | 2853 | set_cr8(vcpu, sregs->cr8); |
2301 | 2854 | ||
2302 | mmu_reset_needed |= vcpu->shadow_efer != sregs->efer; | 2855 | mmu_reset_needed |= vcpu->arch.shadow_efer != sregs->efer; |
2303 | #ifdef CONFIG_X86_64 | 2856 | #ifdef CONFIG_X86_64 |
2304 | kvm_x86_ops->set_efer(vcpu, sregs->efer); | 2857 | kvm_x86_ops->set_efer(vcpu, sregs->efer); |
2305 | #endif | 2858 | #endif |
@@ -2307,25 +2860,25 @@ static int kvm_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, | |||
2307 | 2860 | ||
2308 | kvm_x86_ops->decache_cr4_guest_bits(vcpu); | 2861 | kvm_x86_ops->decache_cr4_guest_bits(vcpu); |
2309 | 2862 | ||
2310 | mmu_reset_needed |= vcpu->cr0 != sregs->cr0; | 2863 | mmu_reset_needed |= vcpu->arch.cr0 != sregs->cr0; |
2311 | vcpu->cr0 = sregs->cr0; | 2864 | vcpu->arch.cr0 = sregs->cr0; |
2312 | kvm_x86_ops->set_cr0(vcpu, sregs->cr0); | 2865 | kvm_x86_ops->set_cr0(vcpu, sregs->cr0); |
2313 | 2866 | ||
2314 | mmu_reset_needed |= vcpu->cr4 != sregs->cr4; | 2867 | mmu_reset_needed |= vcpu->arch.cr4 != sregs->cr4; |
2315 | kvm_x86_ops->set_cr4(vcpu, sregs->cr4); | 2868 | kvm_x86_ops->set_cr4(vcpu, sregs->cr4); |
2316 | if (!is_long_mode(vcpu) && is_pae(vcpu)) | 2869 | if (!is_long_mode(vcpu) && is_pae(vcpu)) |
2317 | load_pdptrs(vcpu, vcpu->cr3); | 2870 | load_pdptrs(vcpu, vcpu->arch.cr3); |
2318 | 2871 | ||
2319 | if (mmu_reset_needed) | 2872 | if (mmu_reset_needed) |
2320 | kvm_mmu_reset_context(vcpu); | 2873 | kvm_mmu_reset_context(vcpu); |
2321 | 2874 | ||
2322 | if (!irqchip_in_kernel(vcpu->kvm)) { | 2875 | if (!irqchip_in_kernel(vcpu->kvm)) { |
2323 | memcpy(vcpu->irq_pending, sregs->interrupt_bitmap, | 2876 | memcpy(vcpu->arch.irq_pending, sregs->interrupt_bitmap, |
2324 | sizeof vcpu->irq_pending); | 2877 | sizeof vcpu->arch.irq_pending); |
2325 | vcpu->irq_summary = 0; | 2878 | vcpu->arch.irq_summary = 0; |
2326 | for (i = 0; i < ARRAY_SIZE(vcpu->irq_pending); ++i) | 2879 | for (i = 0; i < ARRAY_SIZE(vcpu->arch.irq_pending); ++i) |
2327 | if (vcpu->irq_pending[i]) | 2880 | if (vcpu->arch.irq_pending[i]) |
2328 | __set_bit(i, &vcpu->irq_summary); | 2881 | __set_bit(i, &vcpu->arch.irq_summary); |
2329 | } else { | 2882 | } else { |
2330 | max_bits = (sizeof sregs->interrupt_bitmap) << 3; | 2883 | max_bits = (sizeof sregs->interrupt_bitmap) << 3; |
2331 | pending_vec = find_first_bit( | 2884 | pending_vec = find_first_bit( |
@@ -2334,7 +2887,8 @@ static int kvm_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, | |||
2334 | /* Only pending external irq is handled here */ | 2887 | /* Only pending external irq is handled here */ |
2335 | if (pending_vec < max_bits) { | 2888 | if (pending_vec < max_bits) { |
2336 | kvm_x86_ops->set_irq(vcpu, pending_vec); | 2889 | kvm_x86_ops->set_irq(vcpu, pending_vec); |
2337 | printk("Set back pending irq %d\n", pending_vec); | 2890 | pr_debug("Set back pending irq %d\n", |
2891 | pending_vec); | ||
2338 | } | 2892 | } |
2339 | } | 2893 | } |
2340 | 2894 | ||
@@ -2353,174 +2907,8 @@ static int kvm_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, | |||
2353 | return 0; | 2907 | return 0; |
2354 | } | 2908 | } |
2355 | 2909 | ||
2356 | void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) | 2910 | int kvm_arch_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu, |
2357 | { | 2911 | struct kvm_debug_guest *dbg) |
2358 | struct kvm_segment cs; | ||
2359 | |||
2360 | get_segment(vcpu, &cs, VCPU_SREG_CS); | ||
2361 | *db = cs.db; | ||
2362 | *l = cs.l; | ||
2363 | } | ||
2364 | EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits); | ||
2365 | |||
2366 | /* | ||
2367 | * List of msr numbers which we expose to userspace through KVM_GET_MSRS | ||
2368 | * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST. | ||
2369 | * | ||
2370 | * This list is modified at module load time to reflect the | ||
2371 | * capabilities of the host cpu. | ||
2372 | */ | ||
2373 | static u32 msrs_to_save[] = { | ||
2374 | MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, | ||
2375 | MSR_K6_STAR, | ||
2376 | #ifdef CONFIG_X86_64 | ||
2377 | MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, | ||
2378 | #endif | ||
2379 | MSR_IA32_TIME_STAMP_COUNTER, | ||
2380 | }; | ||
2381 | |||
2382 | static unsigned num_msrs_to_save; | ||
2383 | |||
2384 | static u32 emulated_msrs[] = { | ||
2385 | MSR_IA32_MISC_ENABLE, | ||
2386 | }; | ||
2387 | |||
2388 | static __init void kvm_init_msr_list(void) | ||
2389 | { | ||
2390 | u32 dummy[2]; | ||
2391 | unsigned i, j; | ||
2392 | |||
2393 | for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) { | ||
2394 | if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0) | ||
2395 | continue; | ||
2396 | if (j < i) | ||
2397 | msrs_to_save[j] = msrs_to_save[i]; | ||
2398 | j++; | ||
2399 | } | ||
2400 | num_msrs_to_save = j; | ||
2401 | } | ||
2402 | |||
2403 | /* | ||
2404 | * Adapt set_msr() to msr_io()'s calling convention | ||
2405 | */ | ||
2406 | static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) | ||
2407 | { | ||
2408 | return kvm_set_msr(vcpu, index, *data); | ||
2409 | } | ||
2410 | |||
2411 | /* | ||
2412 | * Read or write a bunch of msrs. All parameters are kernel addresses. | ||
2413 | * | ||
2414 | * @return number of msrs set successfully. | ||
2415 | */ | ||
2416 | static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs, | ||
2417 | struct kvm_msr_entry *entries, | ||
2418 | int (*do_msr)(struct kvm_vcpu *vcpu, | ||
2419 | unsigned index, u64 *data)) | ||
2420 | { | ||
2421 | int i; | ||
2422 | |||
2423 | vcpu_load(vcpu); | ||
2424 | |||
2425 | for (i = 0; i < msrs->nmsrs; ++i) | ||
2426 | if (do_msr(vcpu, entries[i].index, &entries[i].data)) | ||
2427 | break; | ||
2428 | |||
2429 | vcpu_put(vcpu); | ||
2430 | |||
2431 | return i; | ||
2432 | } | ||
2433 | |||
2434 | /* | ||
2435 | * Read or write a bunch of msrs. Parameters are user addresses. | ||
2436 | * | ||
2437 | * @return number of msrs set successfully. | ||
2438 | */ | ||
2439 | static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs, | ||
2440 | int (*do_msr)(struct kvm_vcpu *vcpu, | ||
2441 | unsigned index, u64 *data), | ||
2442 | int writeback) | ||
2443 | { | ||
2444 | struct kvm_msrs msrs; | ||
2445 | struct kvm_msr_entry *entries; | ||
2446 | int r, n; | ||
2447 | unsigned size; | ||
2448 | |||
2449 | r = -EFAULT; | ||
2450 | if (copy_from_user(&msrs, user_msrs, sizeof msrs)) | ||
2451 | goto out; | ||
2452 | |||
2453 | r = -E2BIG; | ||
2454 | if (msrs.nmsrs >= MAX_IO_MSRS) | ||
2455 | goto out; | ||
2456 | |||
2457 | r = -ENOMEM; | ||
2458 | size = sizeof(struct kvm_msr_entry) * msrs.nmsrs; | ||
2459 | entries = vmalloc(size); | ||
2460 | if (!entries) | ||
2461 | goto out; | ||
2462 | |||
2463 | r = -EFAULT; | ||
2464 | if (copy_from_user(entries, user_msrs->entries, size)) | ||
2465 | goto out_free; | ||
2466 | |||
2467 | r = n = __msr_io(vcpu, &msrs, entries, do_msr); | ||
2468 | if (r < 0) | ||
2469 | goto out_free; | ||
2470 | |||
2471 | r = -EFAULT; | ||
2472 | if (writeback && copy_to_user(user_msrs->entries, entries, size)) | ||
2473 | goto out_free; | ||
2474 | |||
2475 | r = n; | ||
2476 | |||
2477 | out_free: | ||
2478 | vfree(entries); | ||
2479 | out: | ||
2480 | return r; | ||
2481 | } | ||
2482 | |||
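msr_io() above implements the variable-length convention behind KVM_GET_MSRS/KVM_SET_MSRS: a struct kvm_msrs header immediately followed by nmsrs struct kvm_msr_entry slots, with the ioctl returning how many entries were actually processed. A typical caller from userspace might look like this — a sketch reading the TSC by its raw MSR index (0x10), since the MSR_* constants above are kernel-internal:

#include <linux/kvm.h>
#include <sys/ioctl.h>

int read_tsc_msr(int vcpu_fd, unsigned long long *val)
{
        struct {
                struct kvm_msrs hdr;
                struct kvm_msr_entry entry;     /* lands in entries[0] */
        } req = {
                .hdr.nmsrs   = 1,
                .entry.index = 0x10,            /* IA32_TIME_STAMP_COUNTER */
        };

        /* return value is the number of MSRs processed */
        if (ioctl(vcpu_fd, KVM_GET_MSRS, &req) != 1)
                return -1;
        *val = req.entry.data;
        return 0;
}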
2483 | /* | ||
2484 | * Translate a guest virtual address to a guest physical address. | ||
2485 | */ | ||
2486 | static int kvm_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, | ||
2487 | struct kvm_translation *tr) | ||
2488 | { | ||
2489 | unsigned long vaddr = tr->linear_address; | ||
2490 | gpa_t gpa; | ||
2491 | |||
2492 | vcpu_load(vcpu); | ||
2493 | mutex_lock(&vcpu->kvm->lock); | ||
2494 | gpa = vcpu->mmu.gva_to_gpa(vcpu, vaddr); | ||
2495 | tr->physical_address = gpa; | ||
2496 | tr->valid = gpa != UNMAPPED_GVA; | ||
2497 | tr->writeable = 1; | ||
2498 | tr->usermode = 0; | ||
2499 | mutex_unlock(&vcpu->kvm->lock); | ||
2500 | vcpu_put(vcpu); | ||
2501 | |||
2502 | return 0; | ||
2503 | } | ||
2504 | |||
2505 | static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, | ||
2506 | struct kvm_interrupt *irq) | ||
2507 | { | ||
2508 | if (irq->irq < 0 || irq->irq >= 256) | ||
2509 | return -EINVAL; | ||
2510 | if (irqchip_in_kernel(vcpu->kvm)) | ||
2511 | return -ENXIO; | ||
2512 | vcpu_load(vcpu); | ||
2513 | |||
2514 | set_bit(irq->irq, vcpu->irq_pending); | ||
2515 | set_bit(irq->irq / BITS_PER_LONG, &vcpu->irq_summary); | ||
2516 | |||
2517 | vcpu_put(vcpu); | ||
2518 | |||
2519 | return 0; | ||
2520 | } | ||
2521 | |||
2522 | static int kvm_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu, | ||
2523 | struct kvm_debug_guest *dbg) | ||
2524 | { | 2912 | { |
2525 | int r; | 2913 | int r; |
2526 | 2914 | ||
@@ -2533,179 +2921,6 @@ static int kvm_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu, | |||
2533 | return r; | 2921 | return r; |
2534 | } | 2922 | } |
2535 | 2923 | ||
2536 | static struct page *kvm_vcpu_nopage(struct vm_area_struct *vma, | ||
2537 | unsigned long address, | ||
2538 | int *type) | ||
2539 | { | ||
2540 | struct kvm_vcpu *vcpu = vma->vm_file->private_data; | ||
2541 | unsigned long pgoff; | ||
2542 | struct page *page; | ||
2543 | |||
2544 | pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; | ||
2545 | if (pgoff == 0) | ||
2546 | page = virt_to_page(vcpu->run); | ||
2547 | else if (pgoff == KVM_PIO_PAGE_OFFSET) | ||
2548 | page = virt_to_page(vcpu->pio_data); | ||
2549 | else | ||
2550 | return NOPAGE_SIGBUS; | ||
2551 | get_page(page); | ||
2552 | if (type != NULL) | ||
2553 | *type = VM_FAULT_MINOR; | ||
2554 | |||
2555 | return page; | ||
2556 | } | ||
2557 | |||
2558 | static struct vm_operations_struct kvm_vcpu_vm_ops = { | ||
2559 | .nopage = kvm_vcpu_nopage, | ||
2560 | }; | ||
2561 | |||
2562 | static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma) | ||
2563 | { | ||
2564 | vma->vm_ops = &kvm_vcpu_vm_ops; | ||
2565 | return 0; | ||
2566 | } | ||
2567 | |||
2568 | static int kvm_vcpu_release(struct inode *inode, struct file *filp) | ||
2569 | { | ||
2570 | struct kvm_vcpu *vcpu = filp->private_data; | ||
2571 | |||
2572 | fput(vcpu->kvm->filp); | ||
2573 | return 0; | ||
2574 | } | ||
2575 | |||
2576 | static struct file_operations kvm_vcpu_fops = { | ||
2577 | .release = kvm_vcpu_release, | ||
2578 | .unlocked_ioctl = kvm_vcpu_ioctl, | ||
2579 | .compat_ioctl = kvm_vcpu_ioctl, | ||
2580 | .mmap = kvm_vcpu_mmap, | ||
2581 | }; | ||
2582 | |||
2583 | /* | ||
2584 | * Allocates an inode for the vcpu. | ||
2585 | */ | ||
2586 | static int create_vcpu_fd(struct kvm_vcpu *vcpu) | ||
2587 | { | ||
2588 | int fd, r; | ||
2589 | struct inode *inode; | ||
2590 | struct file *file; | ||
2591 | |||
2592 | r = anon_inode_getfd(&fd, &inode, &file, | ||
2593 | "kvm-vcpu", &kvm_vcpu_fops, vcpu); | ||
2594 | if (r) | ||
2595 | return r; | ||
2596 | atomic_inc(&vcpu->kvm->filp->f_count); | ||
2597 | return fd; | ||
2598 | } | ||
2599 | |||
2600 | /* | ||
2601 | * Creates some virtual cpus. Good luck creating more than one. | ||
2602 | */ | ||
2603 | static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int n) | ||
2604 | { | ||
2605 | int r; | ||
2606 | struct kvm_vcpu *vcpu; | ||
2607 | |||
2608 | if (!valid_vcpu(n)) | ||
2609 | return -EINVAL; | ||
2610 | |||
2611 | vcpu = kvm_x86_ops->vcpu_create(kvm, n); | ||
2612 | if (IS_ERR(vcpu)) | ||
2613 | return PTR_ERR(vcpu); | ||
2614 | |||
2615 | preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops); | ||
2616 | |||
2617 | /* We do fxsave: this must be aligned. */ | ||
2618 | BUG_ON((unsigned long)&vcpu->host_fx_image & 0xF); | ||
2619 | |||
2620 | vcpu_load(vcpu); | ||
2621 | r = kvm_mmu_setup(vcpu); | ||
2622 | vcpu_put(vcpu); | ||
2623 | if (r < 0) | ||
2624 | goto free_vcpu; | ||
2625 | |||
2626 | mutex_lock(&kvm->lock); | ||
2627 | if (kvm->vcpus[n]) { | ||
2628 | r = -EEXIST; | ||
2629 | mutex_unlock(&kvm->lock); | ||
2630 | goto mmu_unload; | ||
2631 | } | ||
2632 | kvm->vcpus[n] = vcpu; | ||
2633 | mutex_unlock(&kvm->lock); | ||
2634 | |||
2635 | /* Now it's all set up, let userspace reach it */ | ||
2636 | r = create_vcpu_fd(vcpu); | ||
2637 | if (r < 0) | ||
2638 | goto unlink; | ||
2639 | return r; | ||
2640 | |||
2641 | unlink: | ||
2642 | mutex_lock(&kvm->lock); | ||
2643 | kvm->vcpus[n] = NULL; | ||
2644 | mutex_unlock(&kvm->lock); | ||
2645 | |||
2646 | mmu_unload: | ||
2647 | vcpu_load(vcpu); | ||
2648 | kvm_mmu_unload(vcpu); | ||
2649 | vcpu_put(vcpu); | ||
2650 | |||
2651 | free_vcpu: | ||
2652 | kvm_x86_ops->vcpu_free(vcpu); | ||
2653 | return r; | ||
2654 | } | ||
2655 | |||
2656 | static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu) | ||
2657 | { | ||
2658 | u64 efer; | ||
2659 | int i; | ||
2660 | struct kvm_cpuid_entry *e, *entry; | ||
2661 | |||
2662 | rdmsrl(MSR_EFER, efer); | ||
2663 | entry = NULL; | ||
2664 | for (i = 0; i < vcpu->cpuid_nent; ++i) { | ||
2665 | e = &vcpu->cpuid_entries[i]; | ||
2666 | if (e->function == 0x80000001) { | ||
2667 | entry = e; | ||
2668 | break; | ||
2669 | } | ||
2670 | } | ||
2671 | if (entry && (entry->edx & (1 << 20)) && !(efer & EFER_NX)) { | ||
2672 | entry->edx &= ~(1 << 20); | ||
2673 | printk(KERN_INFO "kvm: guest NX capability removed\n"); | ||
2674 | } | ||
2675 | } | ||
2676 | |||
2677 | static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, | ||
2678 | struct kvm_cpuid *cpuid, | ||
2679 | struct kvm_cpuid_entry __user *entries) | ||
2680 | { | ||
2681 | int r; | ||
2682 | |||
2683 | r = -E2BIG; | ||
2684 | if (cpuid->nent > KVM_MAX_CPUID_ENTRIES) | ||
2685 | goto out; | ||
2686 | r = -EFAULT; | ||
2687 | if (copy_from_user(&vcpu->cpuid_entries, entries, | ||
2688 | cpuid->nent * sizeof(struct kvm_cpuid_entry))) | ||
2689 | goto out; | ||
2690 | vcpu->cpuid_nent = cpuid->nent; | ||
2691 | cpuid_fix_nx_cap(vcpu); | ||
2692 | return 0; | ||
2693 | |||
2694 | out: | ||
2695 | return r; | ||
2696 | } | ||
2697 | |||
2698 | static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset) | ||
2699 | { | ||
2700 | if (sigset) { | ||
2701 | sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP)); | ||
2702 | vcpu->sigset_active = 1; | ||
2703 | vcpu->sigset = *sigset; | ||
2704 | } else | ||
2705 | vcpu->sigset_active = 0; | ||
2706 | return 0; | ||
2707 | } | ||
2708 | |||
2709 | /* | 2924 | /* |
2710 | * fxsave fpu state. Taken from x86_64/processor.h. To be killed when | 2925 | * fxsave fpu state. Taken from x86_64/processor.h. To be killed when |
2711 | * we have asm/x86/processor.h | 2926 | * we have asm/x86/processor.h |
@@ -2727,9 +2942,31 @@ struct fxsave { | |||
2727 | #endif | 2942 | #endif |
2728 | }; | 2943 | }; |
2729 | 2944 | ||
2730 | static int kvm_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) | 2945 | /* |
2946 | * Translate a guest virtual address to a guest physical address. | ||
2947 | */ | ||
2948 | int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, | ||
2949 | struct kvm_translation *tr) | ||
2731 | { | 2950 | { |
2732 | struct fxsave *fxsave = (struct fxsave *)&vcpu->guest_fx_image; | 2951 | unsigned long vaddr = tr->linear_address; |
2952 | gpa_t gpa; | ||
2953 | |||
2954 | vcpu_load(vcpu); | ||
2955 | down_read(¤t->mm->mmap_sem); | ||
2956 | gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, vaddr); | ||
2957 | up_read(¤t->mm->mmap_sem); | ||
2958 | tr->physical_address = gpa; | ||
2959 | tr->valid = gpa != UNMAPPED_GVA; | ||
2960 | tr->writeable = 1; | ||
2961 | tr->usermode = 0; | ||
2962 | vcpu_put(vcpu); | ||
2963 | |||
2964 | return 0; | ||
2965 | } | ||
2966 | |||
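kvm_arch_vcpu_ioctl_translate() above backs the KVM_TRANSLATE ioctl (note the switch from kvm->lock to mmap_sem around the page-table walk, consistent with the rest of this patch). From userspace it is a single-struct round trip; a sketch:

#include <linux/kvm.h>
#include <sys/ioctl.h>

/* Ask KVM to walk the guest page tables for one linear address. */
int translate_gva(int vcpu_fd, unsigned long long gva,
                  unsigned long long *gpa)
{
        struct kvm_translation tr = { .linear_address = gva };

        if (ioctl(vcpu_fd, KVM_TRANSLATE, &tr) < 0)
                return -1;
        if (!tr.valid)
                return -1;              /* the UNMAPPED_GVA case above */
        *gpa = tr.physical_address;
        return 0;
}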
2967 | int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) | ||
2968 | { | ||
2969 | struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image; | ||
2733 | 2970 | ||
2734 | vcpu_load(vcpu); | 2971 | vcpu_load(vcpu); |
2735 | 2972 | ||
@@ -2747,9 +2984,9 @@ static int kvm_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) | |||
2747 | return 0; | 2984 | return 0; |
2748 | } | 2985 | } |
2749 | 2986 | ||
2750 | static int kvm_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) | 2987 | int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) |
2751 | { | 2988 | { |
2752 | struct fxsave *fxsave = (struct fxsave *)&vcpu->guest_fx_image; | 2989 | struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image; |
2753 | 2990 | ||
2754 | vcpu_load(vcpu); | 2991 | vcpu_load(vcpu); |
2755 | 2992 | ||
@@ -2767,862 +3004,284 @@ static int kvm_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) | |||
2767 | return 0; | 3004 | return 0; |
2768 | } | 3005 | } |
2769 | 3006 | ||
2770 | static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, | 3007 | void fx_init(struct kvm_vcpu *vcpu) |
2771 | struct kvm_lapic_state *s) | ||
2772 | { | 3008 | { |
2773 | vcpu_load(vcpu); | 3009 | unsigned after_mxcsr_mask; |
2774 | memcpy(s->regs, vcpu->apic->regs, sizeof *s); | ||
2775 | vcpu_put(vcpu); | ||
2776 | |||
2777 | return 0; | ||
2778 | } | ||
2779 | 3010 | ||
2780 | static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu, | 3011 | /* Initialize guest FPU by resetting ours and saving into guest's */ |
2781 | struct kvm_lapic_state *s) | 3012 | preempt_disable(); |
2782 | { | 3013 | fx_save(&vcpu->arch.host_fx_image); |
2783 | vcpu_load(vcpu); | 3014 | fpu_init(); |
2784 | memcpy(vcpu->apic->regs, s->regs, sizeof *s); | 3015 | fx_save(&vcpu->arch.guest_fx_image); |
2785 | kvm_apic_post_state_restore(vcpu); | 3016 | fx_restore(&vcpu->arch.host_fx_image); |
2786 | vcpu_put(vcpu); | 3017 | preempt_enable(); |
2787 | 3018 | ||
2788 | return 0; | 3019 | vcpu->arch.cr0 |= X86_CR0_ET; |
3020 | after_mxcsr_mask = offsetof(struct i387_fxsave_struct, st_space); | ||
3021 | vcpu->arch.guest_fx_image.mxcsr = 0x1f80; | ||
3022 | memset((void *)&vcpu->arch.guest_fx_image + after_mxcsr_mask, | ||
3023 | 0, sizeof(struct i387_fxsave_struct) - after_mxcsr_mask); | ||
2789 | } | 3024 | } |
3025 | EXPORT_SYMBOL_GPL(fx_init); | ||
2790 | 3026 | ||
2791 | static long kvm_vcpu_ioctl(struct file *filp, | 3027 | void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) |
2792 | unsigned int ioctl, unsigned long arg) | ||
2793 | { | 3028 | { |
2794 | struct kvm_vcpu *vcpu = filp->private_data; | 3029 | if (!vcpu->fpu_active || vcpu->guest_fpu_loaded) |
2795 | void __user *argp = (void __user *)arg; | 3030 | return; |
2796 | int r = -EINVAL; | ||
2797 | |||
2798 | switch (ioctl) { | ||
2799 | case KVM_RUN: | ||
2800 | r = -EINVAL; | ||
2801 | if (arg) | ||
2802 | goto out; | ||
2803 | r = kvm_vcpu_ioctl_run(vcpu, vcpu->run); | ||
2804 | break; | ||
2805 | case KVM_GET_REGS: { | ||
2806 | struct kvm_regs kvm_regs; | ||
2807 | |||
2808 | memset(&kvm_regs, 0, sizeof kvm_regs); | ||
2809 | r = kvm_vcpu_ioctl_get_regs(vcpu, &kvm_regs); | ||
2810 | if (r) | ||
2811 | goto out; | ||
2812 | r = -EFAULT; | ||
2813 | if (copy_to_user(argp, &kvm_regs, sizeof kvm_regs)) | ||
2814 | goto out; | ||
2815 | r = 0; | ||
2816 | break; | ||
2817 | } | ||
2818 | case KVM_SET_REGS: { | ||
2819 | struct kvm_regs kvm_regs; | ||
2820 | |||
2821 | r = -EFAULT; | ||
2822 | if (copy_from_user(&kvm_regs, argp, sizeof kvm_regs)) | ||
2823 | goto out; | ||
2824 | r = kvm_vcpu_ioctl_set_regs(vcpu, &kvm_regs); | ||
2825 | if (r) | ||
2826 | goto out; | ||
2827 | r = 0; | ||
2828 | break; | ||
2829 | } | ||
2830 | case KVM_GET_SREGS: { | ||
2831 | struct kvm_sregs kvm_sregs; | ||
2832 | |||
2833 | memset(&kvm_sregs, 0, sizeof kvm_sregs); | ||
2834 | r = kvm_vcpu_ioctl_get_sregs(vcpu, &kvm_sregs); | ||
2835 | if (r) | ||
2836 | goto out; | ||
2837 | r = -EFAULT; | ||
2838 | if (copy_to_user(argp, &kvm_sregs, sizeof kvm_sregs)) | ||
2839 | goto out; | ||
2840 | r = 0; | ||
2841 | break; | ||
2842 | } | ||
2843 | case KVM_SET_SREGS: { | ||
2844 | struct kvm_sregs kvm_sregs; | ||
2845 | |||
2846 | r = -EFAULT; | ||
2847 | if (copy_from_user(&kvm_sregs, argp, sizeof kvm_sregs)) | ||
2848 | goto out; | ||
2849 | r = kvm_vcpu_ioctl_set_sregs(vcpu, &kvm_sregs); | ||
2850 | if (r) | ||
2851 | goto out; | ||
2852 | r = 0; | ||
2853 | break; | ||
2854 | } | ||
2855 | case KVM_TRANSLATE: { | ||
2856 | struct kvm_translation tr; | ||
2857 | |||
2858 | r = -EFAULT; | ||
2859 | if (copy_from_user(&tr, argp, sizeof tr)) | ||
2860 | goto out; | ||
2861 | r = kvm_vcpu_ioctl_translate(vcpu, &tr); | ||
2862 | if (r) | ||
2863 | goto out; | ||
2864 | r = -EFAULT; | ||
2865 | if (copy_to_user(argp, &tr, sizeof tr)) | ||
2866 | goto out; | ||
2867 | r = 0; | ||
2868 | break; | ||
2869 | } | ||
2870 | case KVM_INTERRUPT: { | ||
2871 | struct kvm_interrupt irq; | ||
2872 | |||
2873 | r = -EFAULT; | ||
2874 | if (copy_from_user(&irq, argp, sizeof irq)) | ||
2875 | goto out; | ||
2876 | r = kvm_vcpu_ioctl_interrupt(vcpu, &irq); | ||
2877 | if (r) | ||
2878 | goto out; | ||
2879 | r = 0; | ||
2880 | break; | ||
2881 | } | ||
2882 | case KVM_DEBUG_GUEST: { | ||
2883 | struct kvm_debug_guest dbg; | ||
2884 | |||
2885 | r = -EFAULT; | ||
2886 | if (copy_from_user(&dbg, argp, sizeof dbg)) | ||
2887 | goto out; | ||
2888 | r = kvm_vcpu_ioctl_debug_guest(vcpu, &dbg); | ||
2889 | if (r) | ||
2890 | goto out; | ||
2891 | r = 0; | ||
2892 | break; | ||
2893 | } | ||
2894 | case KVM_GET_MSRS: | ||
2895 | r = msr_io(vcpu, argp, kvm_get_msr, 1); | ||
2896 | break; | ||
2897 | case KVM_SET_MSRS: | ||
2898 | r = msr_io(vcpu, argp, do_set_msr, 0); | ||
2899 | break; | ||
2900 | case KVM_SET_CPUID: { | ||
2901 | struct kvm_cpuid __user *cpuid_arg = argp; | ||
2902 | struct kvm_cpuid cpuid; | ||
2903 | |||
2904 | r = -EFAULT; | ||
2905 | if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) | ||
2906 | goto out; | ||
2907 | r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries); | ||
2908 | if (r) | ||
2909 | goto out; | ||
2910 | break; | ||
2911 | } | ||
2912 | case KVM_SET_SIGNAL_MASK: { | ||
2913 | struct kvm_signal_mask __user *sigmask_arg = argp; | ||
2914 | struct kvm_signal_mask kvm_sigmask; | ||
2915 | sigset_t sigset, *p; | ||
2916 | |||
2917 | p = NULL; | ||
2918 | if (argp) { | ||
2919 | r = -EFAULT; | ||
2920 | if (copy_from_user(&kvm_sigmask, argp, | ||
2921 | sizeof kvm_sigmask)) | ||
2922 | goto out; | ||
2923 | r = -EINVAL; | ||
2924 | if (kvm_sigmask.len != sizeof sigset) | ||
2925 | goto out; | ||
2926 | r = -EFAULT; | ||
2927 | if (copy_from_user(&sigset, sigmask_arg->sigset, | ||
2928 | sizeof sigset)) | ||
2929 | goto out; | ||
2930 | p = &sigset; | ||
2931 | } | ||
2932 | r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset); | ||
2933 | break; | ||
2934 | } | ||
2935 | case KVM_GET_FPU: { | ||
2936 | struct kvm_fpu fpu; | ||
2937 | |||
2938 | memset(&fpu, 0, sizeof fpu); | ||
2939 | r = kvm_vcpu_ioctl_get_fpu(vcpu, &fpu); | ||
2940 | if (r) | ||
2941 | goto out; | ||
2942 | r = -EFAULT; | ||
2943 | if (copy_to_user(argp, &fpu, sizeof fpu)) | ||
2944 | goto out; | ||
2945 | r = 0; | ||
2946 | break; | ||
2947 | } | ||
2948 | case KVM_SET_FPU: { | ||
2949 | struct kvm_fpu fpu; | ||
2950 | |||
2951 | r = -EFAULT; | ||
2952 | if (copy_from_user(&fpu, argp, sizeof fpu)) | ||
2953 | goto out; | ||
2954 | r = kvm_vcpu_ioctl_set_fpu(vcpu, &fpu); | ||
2955 | if (r) | ||
2956 | goto out; | ||
2957 | r = 0; | ||
2958 | break; | ||
2959 | } | ||
2960 | case KVM_GET_LAPIC: { | ||
2961 | struct kvm_lapic_state lapic; | ||
2962 | |||
2963 | memset(&lapic, 0, sizeof lapic); | ||
2964 | r = kvm_vcpu_ioctl_get_lapic(vcpu, &lapic); | ||
2965 | if (r) | ||
2966 | goto out; | ||
2967 | r = -EFAULT; | ||
2968 | if (copy_to_user(argp, &lapic, sizeof lapic)) | ||
2969 | goto out; | ||
2970 | r = 0; | ||
2971 | break; | ||
2972 | } | ||
2973 | case KVM_SET_LAPIC: { | ||
2974 | struct kvm_lapic_state lapic; | ||
2975 | 3031 | ||
2976 | r = -EFAULT; | 3032 | vcpu->guest_fpu_loaded = 1; |
2977 | if (copy_from_user(&lapic, argp, sizeof lapic)) | 3033 | fx_save(&vcpu->arch.host_fx_image); |
2978 | goto out; | 3034 | fx_restore(&vcpu->arch.guest_fx_image); |
2979 | r = kvm_vcpu_ioctl_set_lapic(vcpu, &lapic);; | ||
2980 | if (r) | ||
2981 | goto out; | ||
2982 | r = 0; | ||
2983 | break; | ||
2984 | } | ||
2985 | default: | ||
2986 | ; | ||
2987 | } | ||
2988 | out: | ||
2989 | return r; | ||
2990 | } | 3035 | } |
3036 | EXPORT_SYMBOL_GPL(kvm_load_guest_fpu); | ||
2991 | 3037 | ||
2992 | static long kvm_vm_ioctl(struct file *filp, | 3038 | void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) |
2993 | unsigned int ioctl, unsigned long arg) | ||
2994 | { | 3039 | { |
2995 | struct kvm *kvm = filp->private_data; | 3040 | if (!vcpu->guest_fpu_loaded) |
2996 | void __user *argp = (void __user *)arg; | 3041 | return; |
2997 | int r = -EINVAL; | ||
2998 | |||
2999 | switch (ioctl) { | ||
3000 | case KVM_CREATE_VCPU: | ||
3001 | r = kvm_vm_ioctl_create_vcpu(kvm, arg); | ||
3002 | if (r < 0) | ||
3003 | goto out; | ||
3004 | break; | ||
3005 | case KVM_SET_MEMORY_REGION: { | ||
3006 | struct kvm_memory_region kvm_mem; | ||
3007 | |||
3008 | r = -EFAULT; | ||
3009 | if (copy_from_user(&kvm_mem, argp, sizeof kvm_mem)) | ||
3010 | goto out; | ||
3011 | r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_mem); | ||
3012 | if (r) | ||
3013 | goto out; | ||
3014 | break; | ||
3015 | } | ||
3016 | case KVM_GET_DIRTY_LOG: { | ||
3017 | struct kvm_dirty_log log; | ||
3018 | |||
3019 | r = -EFAULT; | ||
3020 | if (copy_from_user(&log, argp, sizeof log)) | ||
3021 | goto out; | ||
3022 | r = kvm_vm_ioctl_get_dirty_log(kvm, &log); | ||
3023 | if (r) | ||
3024 | goto out; | ||
3025 | break; | ||
3026 | } | ||
3027 | case KVM_SET_MEMORY_ALIAS: { | ||
3028 | struct kvm_memory_alias alias; | ||
3029 | |||
3030 | r = -EFAULT; | ||
3031 | if (copy_from_user(&alias, argp, sizeof alias)) | ||
3032 | goto out; | ||
3033 | r = kvm_vm_ioctl_set_memory_alias(kvm, &alias); | ||
3034 | if (r) | ||
3035 | goto out; | ||
3036 | break; | ||
3037 | } | ||
3038 | case KVM_CREATE_IRQCHIP: | ||
3039 | r = -ENOMEM; | ||
3040 | kvm->vpic = kvm_create_pic(kvm); | ||
3041 | if (kvm->vpic) { | ||
3042 | r = kvm_ioapic_init(kvm); | ||
3043 | if (r) { | ||
3044 | kfree(kvm->vpic); | ||
3045 | kvm->vpic = NULL; | ||
3046 | goto out; | ||
3047 | } | ||
3048 | } | ||
3049 | else | ||
3050 | goto out; | ||
3051 | break; | ||
3052 | case KVM_IRQ_LINE: { | ||
3053 | struct kvm_irq_level irq_event; | ||
3054 | |||
3055 | r = -EFAULT; | ||
3056 | if (copy_from_user(&irq_event, argp, sizeof irq_event)) | ||
3057 | goto out; | ||
3058 | if (irqchip_in_kernel(kvm)) { | ||
3059 | mutex_lock(&kvm->lock); | ||
3060 | if (irq_event.irq < 16) | ||
3061 | kvm_pic_set_irq(pic_irqchip(kvm), | ||
3062 | irq_event.irq, | ||
3063 | irq_event.level); | ||
3064 | kvm_ioapic_set_irq(kvm->vioapic, | ||
3065 | irq_event.irq, | ||
3066 | irq_event.level); | ||
3067 | mutex_unlock(&kvm->lock); | ||
3068 | r = 0; | ||
3069 | } | ||
3070 | break; | ||
3071 | } | ||
3072 | case KVM_GET_IRQCHIP: { | ||
3073 | /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ | ||
3074 | struct kvm_irqchip chip; | ||
3075 | |||
3076 | r = -EFAULT; | ||
3077 | if (copy_from_user(&chip, argp, sizeof chip)) | ||
3078 | goto out; | ||
3079 | r = -ENXIO; | ||
3080 | if (!irqchip_in_kernel(kvm)) | ||
3081 | goto out; | ||
3082 | r = kvm_vm_ioctl_get_irqchip(kvm, &chip); | ||
3083 | if (r) | ||
3084 | goto out; | ||
3085 | r = -EFAULT; | ||
3086 | if (copy_to_user(argp, &chip, sizeof chip)) | ||
3087 | goto out; | ||
3088 | r = 0; | ||
3089 | break; | ||
3090 | } | ||
3091 | case KVM_SET_IRQCHIP: { | ||
3092 | /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ | ||
3093 | struct kvm_irqchip chip; | ||
3094 | 3042 | ||
3095 | r = -EFAULT; | 3043 | vcpu->guest_fpu_loaded = 0; |
3096 | if (copy_from_user(&chip, argp, sizeof chip)) | 3044 | fx_save(&vcpu->arch.guest_fx_image); |
3097 | goto out; | 3045 | fx_restore(&vcpu->arch.host_fx_image); |
3098 | r = -ENXIO; | 3046 | ++vcpu->stat.fpu_reload; |
3099 | if (!irqchip_in_kernel(kvm)) | ||
3100 | goto out; | ||
3101 | r = kvm_vm_ioctl_set_irqchip(kvm, &chip); | ||
3102 | if (r) | ||
3103 | goto out; | ||
3104 | r = 0; | ||
3105 | break; | ||
3106 | } | ||
3107 | default: | ||
3108 | ; | ||
3109 | } | ||
3110 | out: | ||
3111 | return r; | ||
3112 | } | 3047 | } |
3048 | EXPORT_SYMBOL_GPL(kvm_put_guest_fpu); | ||
3113 | 3049 | ||
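fx_init(), kvm_load_guest_fpu() and kvm_put_guest_fpu() above all operate on fxsave images, which the hardware requires to be 512 bytes and 16-byte aligned — the constraint behind the BUG_ON(... & 0xF) in kvm_arch_vcpu_setup() below. A freestanding x86 illustration of the save/restore pairing (a user-space sketch, not the kernel's fx_save()/fx_restore()):

#include <stdint.h>

/* fxsave area: 512 bytes, 16-byte aligned, or the CPU faults */
struct fx_image {
        uint8_t data[512];
} __attribute__((aligned(16)));

static inline void fx_save(struct fx_image *img)
{
        __asm__ __volatile__("fxsave %0" : "=m" (*img));
}

static inline void fx_restore(const struct fx_image *img)
{
        __asm__ __volatile__("fxrstor %0" : : "m" (*img));
}

/* the load/put pairing: park one FPU context, activate the other */
void switch_fpu(struct fx_image *from, const struct fx_image *to)
{
        fx_save(from);
        fx_restore(to);
}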
3114 | static struct page *kvm_vm_nopage(struct vm_area_struct *vma, | 3050 | void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) |
3115 | unsigned long address, | ||
3116 | int *type) | ||
3117 | { | 3051 | { |
3118 | struct kvm *kvm = vma->vm_file->private_data; | 3052 | kvm_x86_ops->vcpu_free(vcpu); |
3119 | unsigned long pgoff; | ||
3120 | struct page *page; | ||
3121 | |||
3122 | pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; | ||
3123 | page = gfn_to_page(kvm, pgoff); | ||
3124 | if (!page) | ||
3125 | return NOPAGE_SIGBUS; | ||
3126 | get_page(page); | ||
3127 | if (type != NULL) | ||
3128 | *type = VM_FAULT_MINOR; | ||
3129 | |||
3130 | return page; | ||
3131 | } | 3053 | } |
3132 | 3054 | ||
3133 | static struct vm_operations_struct kvm_vm_vm_ops = { | 3055 | struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, |
3134 | .nopage = kvm_vm_nopage, | 3056 | unsigned int id) |
3135 | }; | ||
3136 | |||
3137 | static int kvm_vm_mmap(struct file *file, struct vm_area_struct *vma) | ||
3138 | { | 3057 | { |
3139 | vma->vm_ops = &kvm_vm_vm_ops; | 3058 | return kvm_x86_ops->vcpu_create(kvm, id); |
3140 | return 0; | ||
3141 | } | 3059 | } |
3142 | 3060 | ||
3143 | static struct file_operations kvm_vm_fops = { | 3061 | int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) |
3144 | .release = kvm_vm_release, | ||
3145 | .unlocked_ioctl = kvm_vm_ioctl, | ||
3146 | .compat_ioctl = kvm_vm_ioctl, | ||
3147 | .mmap = kvm_vm_mmap, | ||
3148 | }; | ||
3149 | |||
3150 | static int kvm_dev_ioctl_create_vm(void) | ||
3151 | { | 3062 | { |
3152 | int fd, r; | 3063 | int r; |
3153 | struct inode *inode; | ||
3154 | struct file *file; | ||
3155 | struct kvm *kvm; | ||
3156 | 3064 | ||
3157 | kvm = kvm_create_vm(); | 3065 | /* We do fxsave: this must be aligned. */ |
3158 | if (IS_ERR(kvm)) | 3066 | BUG_ON((unsigned long)&vcpu->arch.host_fx_image & 0xF); |
3159 | return PTR_ERR(kvm); | ||
3160 | r = anon_inode_getfd(&fd, &inode, &file, "kvm-vm", &kvm_vm_fops, kvm); | ||
3161 | if (r) { | ||
3162 | kvm_destroy_vm(kvm); | ||
3163 | return r; | ||
3164 | } | ||
3165 | 3067 | ||
3166 | kvm->filp = file; | 3068 | vcpu_load(vcpu); |
3069 | r = kvm_arch_vcpu_reset(vcpu); | ||
3070 | if (r == 0) | ||
3071 | r = kvm_mmu_setup(vcpu); | ||
3072 | vcpu_put(vcpu); | ||
3073 | if (r < 0) | ||
3074 | goto free_vcpu; | ||
3167 | 3075 | ||
3168 | return fd; | 3076 | return 0; |
3077 | free_vcpu: | ||
3078 | kvm_x86_ops->vcpu_free(vcpu); | ||
3079 | return r; | ||
3169 | } | 3080 | } |
3170 | 3081 | ||
3171 | static long kvm_dev_ioctl(struct file *filp, | 3082 | void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) |
3172 | unsigned int ioctl, unsigned long arg) | ||
3173 | { | 3083 | { |
3174 | void __user *argp = (void __user *)arg; | 3084 | vcpu_load(vcpu); |
3175 | long r = -EINVAL; | 3085 | kvm_mmu_unload(vcpu); |
3176 | 3086 | vcpu_put(vcpu); | |
3177 | switch (ioctl) { | ||
3178 | case KVM_GET_API_VERSION: | ||
3179 | r = -EINVAL; | ||
3180 | if (arg) | ||
3181 | goto out; | ||
3182 | r = KVM_API_VERSION; | ||
3183 | break; | ||
3184 | case KVM_CREATE_VM: | ||
3185 | r = -EINVAL; | ||
3186 | if (arg) | ||
3187 | goto out; | ||
3188 | r = kvm_dev_ioctl_create_vm(); | ||
3189 | break; | ||
3190 | case KVM_GET_MSR_INDEX_LIST: { | ||
3191 | struct kvm_msr_list __user *user_msr_list = argp; | ||
3192 | struct kvm_msr_list msr_list; | ||
3193 | unsigned n; | ||
3194 | |||
3195 | r = -EFAULT; | ||
3196 | if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list)) | ||
3197 | goto out; | ||
3198 | n = msr_list.nmsrs; | ||
3199 | msr_list.nmsrs = num_msrs_to_save + ARRAY_SIZE(emulated_msrs); | ||
3200 | if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list)) | ||
3201 | goto out; | ||
3202 | r = -E2BIG; | ||
3203 | if (n < num_msrs_to_save) | ||
3204 | goto out; | ||
3205 | r = -EFAULT; | ||
3206 | if (copy_to_user(user_msr_list->indices, &msrs_to_save, | ||
3207 | num_msrs_to_save * sizeof(u32))) | ||
3208 | goto out; | ||
3209 | if (copy_to_user(user_msr_list->indices | ||
3210 | + num_msrs_to_save * sizeof(u32), | ||
3211 | &emulated_msrs, | ||
3212 | ARRAY_SIZE(emulated_msrs) * sizeof(u32))) | ||
3213 | goto out; | ||
3214 | r = 0; | ||
3215 | break; | ||
3216 | } | ||
3217 | case KVM_CHECK_EXTENSION: { | ||
3218 | int ext = (long)argp; | ||
3219 | 3087 | ||
3220 | switch (ext) { | 3088 | kvm_x86_ops->vcpu_free(vcpu); |
3221 | case KVM_CAP_IRQCHIP: | ||
3222 | case KVM_CAP_HLT: | ||
3223 | r = 1; | ||
3224 | break; | ||
3225 | default: | ||
3226 | r = 0; | ||
3227 | break; | ||
3228 | } | ||
3229 | break; | ||
3230 | } | ||
3231 | case KVM_GET_VCPU_MMAP_SIZE: | ||
3232 | r = -EINVAL; | ||
3233 | if (arg) | ||
3234 | goto out; | ||
3235 | r = 2 * PAGE_SIZE; | ||
3236 | break; | ||
3237 | default: | ||
3238 | ; | ||
3239 | } | ||
3240 | out: | ||
3241 | return r; | ||
3242 | } | 3089 | } |
3243 | 3090 | ||
3244 | static struct file_operations kvm_chardev_ops = { | 3091 | int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu) |
3245 | .unlocked_ioctl = kvm_dev_ioctl, | ||
3246 | .compat_ioctl = kvm_dev_ioctl, | ||
3247 | }; | ||
3248 | |||
3249 | static struct miscdevice kvm_dev = { | ||
3250 | KVM_MINOR, | ||
3251 | "kvm", | ||
3252 | &kvm_chardev_ops, | ||
3253 | }; | ||
3254 | |||
3255 | /* | ||
3256 | * Make sure that a cpu that is being hot-unplugged does not have any vcpus | ||
3257 | * cached on it. | ||
3258 | */ | ||
3259 | static void decache_vcpus_on_cpu(int cpu) | ||
3260 | { | 3092 | { |
3261 | struct kvm *vm; | 3093 | return kvm_x86_ops->vcpu_reset(vcpu); |
3262 | struct kvm_vcpu *vcpu; | ||
3263 | int i; | ||
3264 | |||
3265 | spin_lock(&kvm_lock); | ||
3266 | list_for_each_entry(vm, &vm_list, vm_list) | ||
3267 | for (i = 0; i < KVM_MAX_VCPUS; ++i) { | ||
3268 | vcpu = vm->vcpus[i]; | ||
3269 | if (!vcpu) | ||
3270 | continue; | ||
3271 | /* | ||
3272 | * If the vcpu is locked, then it is running on some | ||
3273 | * other cpu and therefore it is not cached on the | ||
3274 | * cpu in question. | ||
3275 | * | ||
3276 | * If it's not locked, check the last cpu it executed | ||
3277 | * on. | ||
3278 | */ | ||
3279 | if (mutex_trylock(&vcpu->mutex)) { | ||
3280 | if (vcpu->cpu == cpu) { | ||
3281 | kvm_x86_ops->vcpu_decache(vcpu); | ||
3282 | vcpu->cpu = -1; | ||
3283 | } | ||
3284 | mutex_unlock(&vcpu->mutex); | ||
3285 | } | ||
3286 | } | ||
3287 | spin_unlock(&kvm_lock); | ||
3288 | } | 3094 | } |
3289 | 3095 | ||
3290 | static void hardware_enable(void *junk) | 3096 | void kvm_arch_hardware_enable(void *garbage) |
3291 | { | 3097 | { |
3292 | int cpu = raw_smp_processor_id(); | 3098 | kvm_x86_ops->hardware_enable(garbage); |
3293 | |||
3294 | if (cpu_isset(cpu, cpus_hardware_enabled)) | ||
3295 | return; | ||
3296 | cpu_set(cpu, cpus_hardware_enabled); | ||
3297 | kvm_x86_ops->hardware_enable(NULL); | ||
3298 | } | 3099 | } |
3299 | 3100 | ||
3300 | static void hardware_disable(void *junk) | 3101 | void kvm_arch_hardware_disable(void *garbage) |
3301 | { | 3102 | { |
3302 | int cpu = raw_smp_processor_id(); | 3103 | kvm_x86_ops->hardware_disable(garbage); |
3303 | |||
3304 | if (!cpu_isset(cpu, cpus_hardware_enabled)) | ||
3305 | return; | ||
3306 | cpu_clear(cpu, cpus_hardware_enabled); | ||
3307 | decache_vcpus_on_cpu(cpu); | ||
3308 | kvm_x86_ops->hardware_disable(NULL); | ||
3309 | } | 3104 | } |
3310 | 3105 | ||
3311 | static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val, | 3106 | int kvm_arch_hardware_setup(void) |
3312 | void *v) | ||
3313 | { | 3107 | { |
3314 | int cpu = (long)v; | 3108 | return kvm_x86_ops->hardware_setup(); |
3315 | |||
3316 | switch (val) { | ||
3317 | case CPU_DYING: | ||
3318 | case CPU_DYING_FROZEN: | ||
3319 | printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n", | ||
3320 | cpu); | ||
3321 | hardware_disable(NULL); | ||
3322 | break; | ||
3323 | case CPU_UP_CANCELED: | ||
3324 | case CPU_UP_CANCELED_FROZEN: | ||
3325 | printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n", | ||
3326 | cpu); | ||
3327 | smp_call_function_single(cpu, hardware_disable, NULL, 0, 1); | ||
3328 | break; | ||
3329 | case CPU_ONLINE: | ||
3330 | case CPU_ONLINE_FROZEN: | ||
3331 | printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n", | ||
3332 | cpu); | ||
3333 | smp_call_function_single(cpu, hardware_enable, NULL, 0, 1); | ||
3334 | break; | ||
3335 | } | ||
3336 | return NOTIFY_OK; | ||
3337 | } | 3109 | } |
3338 | 3110 | ||
3339 | static int kvm_reboot(struct notifier_block *notifier, unsigned long val, | 3111 | void kvm_arch_hardware_unsetup(void) |
3340 | void *v) | ||
3341 | { | 3112 | { |
3342 | if (val == SYS_RESTART) { | 3113 | kvm_x86_ops->hardware_unsetup(); |
3343 | /* | ||
3344 | * Some (well, at least mine) BIOSes hang on reboot if | ||
3345 | * in vmx root mode. | ||
3346 | */ | ||
3347 | printk(KERN_INFO "kvm: exiting hardware virtualization\n"); | ||
3348 | on_each_cpu(hardware_disable, NULL, 0, 1); | ||
3349 | } | ||
3350 | return NOTIFY_OK; | ||
3351 | } | 3114 | } |
3352 | 3115 | ||
3353 | static struct notifier_block kvm_reboot_notifier = { | 3116 | void kvm_arch_check_processor_compat(void *rtn) |
3354 | .notifier_call = kvm_reboot, | ||
3355 | .priority = 0, | ||
3356 | }; | ||
3357 | |||
3358 | void kvm_io_bus_init(struct kvm_io_bus *bus) | ||
3359 | { | 3117 | { |
3360 | memset(bus, 0, sizeof(*bus)); | 3118 | kvm_x86_ops->check_processor_compatibility(rtn); |
3361 | } | 3119 | } |
3362 | 3120 | ||
3363 | void kvm_io_bus_destroy(struct kvm_io_bus *bus) | 3121 | int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) |
3364 | { | 3122 | { |
3365 | int i; | 3123 | struct page *page; |
3124 | struct kvm *kvm; | ||
3125 | int r; | ||
3366 | 3126 | ||
3367 | for (i = 0; i < bus->dev_count; i++) { | 3127 | BUG_ON(vcpu->kvm == NULL); |
3368 | struct kvm_io_device *pos = bus->devs[i]; | 3128 | kvm = vcpu->kvm; |
3369 | 3129 | ||
3370 | kvm_iodevice_destructor(pos); | 3130 | vcpu->arch.mmu.root_hpa = INVALID_PAGE; |
3371 | } | 3131 | if (!irqchip_in_kernel(kvm) || vcpu->vcpu_id == 0) |
3372 | } | 3132 | vcpu->arch.mp_state = VCPU_MP_STATE_RUNNABLE; |
3133 | else | ||
3134 | vcpu->arch.mp_state = VCPU_MP_STATE_UNINITIALIZED; | ||
3373 | 3135 | ||
3374 | struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus, gpa_t addr) | 3136 | page = alloc_page(GFP_KERNEL | __GFP_ZERO); |
3375 | { | 3137 | if (!page) { |
3376 | int i; | 3138 | r = -ENOMEM; |
3139 | goto fail; | ||
3140 | } | ||
3141 | vcpu->arch.pio_data = page_address(page); | ||
3377 | 3142 | ||
3378 | for (i = 0; i < bus->dev_count; i++) { | 3143 | r = kvm_mmu_create(vcpu); |
3379 | struct kvm_io_device *pos = bus->devs[i]; | 3144 | if (r < 0) |
3145 | goto fail_free_pio_data; | ||
3380 | 3146 | ||
3381 | if (pos->in_range(pos, addr)) | 3147 | if (irqchip_in_kernel(kvm)) { |
3382 | return pos; | 3148 | r = kvm_create_lapic(vcpu); |
3149 | if (r < 0) | ||
3150 | goto fail_mmu_destroy; | ||
3383 | } | 3151 | } |
3384 | 3152 | ||
3385 | return NULL; | 3153 | return 0; |
3386 | } | ||
3387 | |||
3388 | void kvm_io_bus_register_dev(struct kvm_io_bus *bus, struct kvm_io_device *dev) | ||
3389 | { | ||
3390 | BUG_ON(bus->dev_count > (NR_IOBUS_DEVS-1)); | ||
3391 | 3154 | ||
3392 | bus->devs[bus->dev_count++] = dev; | 3155 | fail_mmu_destroy: |
3156 | kvm_mmu_destroy(vcpu); | ||
3157 | fail_free_pio_data: | ||
3158 | free_page((unsigned long)vcpu->arch.pio_data); | ||
3159 | fail: | ||
3160 | return r; | ||
3393 | } | 3161 | } |
3394 | 3162 | ||
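The fail_* labels in kvm_arch_vcpu_init() above form the standard kernel unwind ladder: each label releases exactly what was set up before the failing step, in reverse order, so every exit path frees a consistent set of resources. The generic shape, with hypothetical alloc/free helpers:

int alloc_a(void);              /* hypothetical step helpers */
int alloc_b(void);
void free_a(void);

int setup(void)
{
        int r;

        r = alloc_a();
        if (r < 0)
                goto fail;
        r = alloc_b();
        if (r < 0)
                goto fail_free_a;       /* undo only what succeeded */
        return 0;

fail_free_a:
        free_a();
fail:
        return r;
}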
3395 | static struct notifier_block kvm_cpu_notifier = { | 3163 | void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) |
3396 | .notifier_call = kvm_cpu_hotplug, | ||
3397 | .priority = 20, /* must be > scheduler priority */ | ||
3398 | }; | ||
3399 | |||
3400 | static u64 stat_get(void *_offset) | ||
3401 | { | 3164 | { |
3402 | unsigned offset = (long)_offset; | 3165 | kvm_free_lapic(vcpu); |
3403 | u64 total = 0; | 3166 | kvm_mmu_destroy(vcpu); |
3404 | struct kvm *kvm; | 3167 | free_page((unsigned long)vcpu->arch.pio_data); |
3405 | struct kvm_vcpu *vcpu; | ||
3406 | int i; | ||
3407 | |||
3408 | spin_lock(&kvm_lock); | ||
3409 | list_for_each_entry(kvm, &vm_list, vm_list) | ||
3410 | for (i = 0; i < KVM_MAX_VCPUS; ++i) { | ||
3411 | vcpu = kvm->vcpus[i]; | ||
3412 | if (vcpu) | ||
3413 | total += *(u32 *)((void *)vcpu + offset); | ||
3414 | } | ||
3415 | spin_unlock(&kvm_lock); | ||
3416 | return total; | ||
3417 | } | 3168 | } |
3418 | 3169 | ||
3419 | DEFINE_SIMPLE_ATTRIBUTE(stat_fops, stat_get, NULL, "%llu\n"); | 3170 | struct kvm *kvm_arch_create_vm(void) |
3420 | |||
3421 | static __init void kvm_init_debug(void) | ||
3422 | { | 3171 | { |
3423 | struct kvm_stats_debugfs_item *p; | 3172 | struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL); |
3424 | |||
3425 | debugfs_dir = debugfs_create_dir("kvm", NULL); | ||
3426 | for (p = debugfs_entries; p->name; ++p) | ||
3427 | p->dentry = debugfs_create_file(p->name, 0444, debugfs_dir, | ||
3428 | (void *)(long)p->offset, | ||
3429 | &stat_fops); | ||
3430 | } | ||
3431 | 3173 | ||
3432 | static void kvm_exit_debug(void) | 3174 | if (!kvm) |
3433 | { | 3175 | return ERR_PTR(-ENOMEM); |
3434 | struct kvm_stats_debugfs_item *p; | ||
3435 | 3176 | ||
3436 | for (p = debugfs_entries; p->name; ++p) | 3177 | INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); |
3437 | debugfs_remove(p->dentry); | ||
3438 | debugfs_remove(debugfs_dir); | ||
3439 | } | ||
3440 | 3178 | ||
3441 | static int kvm_suspend(struct sys_device *dev, pm_message_t state) | 3179 | return kvm; |
3442 | { | ||
3443 | hardware_disable(NULL); | ||
3444 | return 0; | ||
3445 | } | 3180 | } |
3446 | 3181 | ||
3447 | static int kvm_resume(struct sys_device *dev) | 3182 | static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu) |
3448 | { | 3183 | { |
3449 | hardware_enable(NULL); | 3184 | vcpu_load(vcpu); |
3450 | return 0; | 3185 | kvm_mmu_unload(vcpu); |
3186 | vcpu_put(vcpu); | ||
3451 | } | 3187 | } |
3452 | 3188 | ||
3453 | static struct sysdev_class kvm_sysdev_class = { | 3189 | static void kvm_free_vcpus(struct kvm *kvm) |
3454 | .name = "kvm", | ||
3455 | .suspend = kvm_suspend, | ||
3456 | .resume = kvm_resume, | ||
3457 | }; | ||
3458 | |||
3459 | static struct sys_device kvm_sysdev = { | ||
3460 | .id = 0, | ||
3461 | .cls = &kvm_sysdev_class, | ||
3462 | }; | ||
3463 | |||
3464 | hpa_t bad_page_address; | ||
3465 | |||
3466 | static inline | ||
3467 | struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn) | ||
3468 | { | 3190 | { |
3469 | return container_of(pn, struct kvm_vcpu, preempt_notifier); | 3191 | unsigned int i; |
3470 | } | ||
3471 | 3192 | ||
3472 | static void kvm_sched_in(struct preempt_notifier *pn, int cpu) | 3193 | /* |
3473 | { | 3194 | * Unpin any mmu pages first. |
3474 | struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn); | 3195 | */ |
3196 | for (i = 0; i < KVM_MAX_VCPUS; ++i) | ||
3197 | if (kvm->vcpus[i]) | ||
3198 | kvm_unload_vcpu_mmu(kvm->vcpus[i]); | ||
3199 | for (i = 0; i < KVM_MAX_VCPUS; ++i) { | ||
3200 | if (kvm->vcpus[i]) { | ||
3201 | kvm_arch_vcpu_free(kvm->vcpus[i]); | ||
3202 | kvm->vcpus[i] = NULL; | ||
3203 | } | ||
3204 | } | ||
3475 | 3205 | ||
3476 | kvm_x86_ops->vcpu_load(vcpu, cpu); | ||
3477 | } | 3206 | } |
3478 | 3207 | ||
3479 | static void kvm_sched_out(struct preempt_notifier *pn, | 3208 | void kvm_arch_destroy_vm(struct kvm *kvm) |
3480 | struct task_struct *next) | ||
3481 | { | 3209 | { |
3482 | struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn); | 3210 | kfree(kvm->arch.vpic); |
3483 | 3211 | kfree(kvm->arch.vioapic); | |
3484 | kvm_x86_ops->vcpu_put(vcpu); | 3212 | kvm_free_vcpus(kvm); |
3213 | kvm_free_physmem(kvm); | ||
3214 | kfree(kvm); | ||
3485 | } | 3215 | } |
3486 | 3216 | ||
3487 | int kvm_init_x86(struct kvm_x86_ops *ops, unsigned int vcpu_size, | 3217 | int kvm_arch_set_memory_region(struct kvm *kvm, |
3488 | struct module *module) | 3218 | struct kvm_userspace_memory_region *mem, |
3219 | struct kvm_memory_slot old, | ||
3220 | int user_alloc) | ||
3489 | { | 3221 | { |
3490 | int r; | 3222 | int npages = mem->memory_size >> PAGE_SHIFT; |
3491 | int cpu; | 3223 | struct kvm_memory_slot *memslot = &kvm->memslots[mem->slot]; |
3492 | |||
3493 | if (kvm_x86_ops) { | ||
3494 | printk(KERN_ERR "kvm: already loaded the other module\n"); | ||
3495 | return -EEXIST; | ||
3496 | } | ||
3497 | 3224 | ||
3498 | if (!ops->cpu_has_kvm_support()) { | 3225 | /* To keep backward compatibility with older userspace, |
3499 | printk(KERN_ERR "kvm: no hardware support\n"); | 3226 | * x86 needs to handle the !user_alloc case. |
3500 | return -EOPNOTSUPP; | 3227 | */ |
3501 | } | 3228 | if (!user_alloc) { |
3502 | if (ops->disabled_by_bios()) { | 3229 | if (npages && !old.rmap) { |
3503 | printk(KERN_ERR "kvm: disabled by bios\n"); | 3230 | memslot->userspace_addr = do_mmap(NULL, 0, |
3504 | return -EOPNOTSUPP; | 3231 | npages * PAGE_SIZE, |
3505 | } | 3232 | PROT_READ | PROT_WRITE, |
3506 | 3233 | MAP_SHARED | MAP_ANONYMOUS, | |
3507 | kvm_x86_ops = ops; | 3234 | 0); |
3508 | 3235 | ||
3509 | r = kvm_x86_ops->hardware_setup(); | 3236 | if (IS_ERR((void *)memslot->userspace_addr)) |
3510 | if (r < 0) | 3237 | return PTR_ERR((void *)memslot->userspace_addr); |
3511 | goto out; | 3238 | } else { |
3512 | 3239 | if (!old.user_alloc && old.rmap) { | |
3513 | for_each_online_cpu(cpu) { | 3240 | int ret; |
3514 | smp_call_function_single(cpu, | 3241 | |
3515 | kvm_x86_ops->check_processor_compatibility, | 3242 | ret = do_munmap(current->mm, old.userspace_addr, |
3516 | &r, 0, 1); | 3243 | old.npages * PAGE_SIZE); |
3517 | if (r < 0) | 3244 | if (ret < 0) |
3518 | goto out_free_0; | 3245 | printk(KERN_WARNING |
3519 | } | 3246 | "kvm_vm_ioctl_set_memory_region: " |
3520 | 3247 | "failed to munmap memory\n"); | |
3521 | on_each_cpu(hardware_enable, NULL, 0, 1); | 3248 | } |
3522 | r = register_cpu_notifier(&kvm_cpu_notifier); | 3249 | } |
3523 | if (r) | ||
3524 | goto out_free_1; | ||
3525 | register_reboot_notifier(&kvm_reboot_notifier); | ||
3526 | |||
3527 | r = sysdev_class_register(&kvm_sysdev_class); | ||
3528 | if (r) | ||
3529 | goto out_free_2; | ||
3530 | |||
3531 | r = sysdev_register(&kvm_sysdev); | ||
3532 | if (r) | ||
3533 | goto out_free_3; | ||
3534 | |||
3535 | /* A kmem cache lets us meet the alignment requirements of fx_save. */ | ||
3536 | kvm_vcpu_cache = kmem_cache_create("kvm_vcpu", vcpu_size, | ||
3537 | __alignof__(struct kvm_vcpu), 0, 0); | ||
3538 | if (!kvm_vcpu_cache) { | ||
3539 | r = -ENOMEM; | ||
3540 | goto out_free_4; | ||
3541 | } | 3250 | } |
3542 | 3251 | ||
3543 | kvm_chardev_ops.owner = module; | 3252 | if (!kvm->arch.n_requested_mmu_pages) { |
3544 | 3253 | unsigned int nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm); | |
3545 | r = misc_register(&kvm_dev); | 3254 | kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages); |
3546 | if (r) { | ||
3547 | printk (KERN_ERR "kvm: misc device register failed\n"); | ||
3548 | goto out_free; | ||
3549 | } | 3255 | } |
3550 | 3256 | ||
3551 | kvm_preempt_ops.sched_in = kvm_sched_in; | 3257 | kvm_mmu_slot_remove_write_access(kvm, mem->slot); |
3552 | kvm_preempt_ops.sched_out = kvm_sched_out; | 3258 | kvm_flush_remote_tlbs(kvm); |
3553 | |||
3554 | return r; | ||
3555 | 3259 | ||
3556 | out_free: | 3260 | return 0; |
3557 | kmem_cache_destroy(kvm_vcpu_cache); | ||
3558 | out_free_4: | ||
3559 | sysdev_unregister(&kvm_sysdev); | ||
3560 | out_free_3: | ||
3561 | sysdev_class_unregister(&kvm_sysdev_class); | ||
3562 | out_free_2: | ||
3563 | unregister_reboot_notifier(&kvm_reboot_notifier); | ||
3564 | unregister_cpu_notifier(&kvm_cpu_notifier); | ||
3565 | out_free_1: | ||
3566 | on_each_cpu(hardware_disable, NULL, 0, 1); | ||
3567 | out_free_0: | ||
3568 | kvm_x86_ops->hardware_unsetup(); | ||
3569 | out: | ||
3570 | kvm_x86_ops = NULL; | ||
3571 | return r; | ||
3572 | } | 3261 | } |
3573 | 3262 | ||
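A note on the compatibility branch in the right-hand column above: with the legacy (!user_alloc) interface the kernel, not userspace, owns the slot's backing memory. Paraphrased (a summary of the code above, not kernel code):

	/* !user_alloc:
	 *   new slot (npages && !old.rmap) -> mmap npages * PAGE_SIZE of
	 *	anonymous memory on the guest's behalf and record it in
	 *	memslot->userspace_addr;
	 *   dropping a kernel-allocated slot (!old.user_alloc && old.rmap)
	 *	-> munmap the old range, warning (but carrying on) if that
	 *	fails.
	 * Afterwards, unless userspace pinned n_requested_mmu_pages, the
	 * shadow-page budget is resized via kvm_mmu_calculate_mmu_pages().
	 */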
3574 | void kvm_exit_x86(void) | 3263 | int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) |
3575 | { | 3264 | { |
3576 | misc_deregister(&kvm_dev); | 3265 | return vcpu->arch.mp_state == VCPU_MP_STATE_RUNNABLE |
3577 | kmem_cache_destroy(kvm_vcpu_cache); | 3266 | || vcpu->arch.mp_state == VCPU_MP_STATE_SIPI_RECEIVED; |
3578 | sysdev_unregister(&kvm_sysdev); | ||
3579 | sysdev_class_unregister(&kvm_sysdev_class); | ||
3580 | unregister_reboot_notifier(&kvm_reboot_notifier); | ||
3581 | unregister_cpu_notifier(&kvm_cpu_notifier); | ||
3582 | on_each_cpu(hardware_disable, NULL, 0, 1); | ||
3583 | kvm_x86_ops->hardware_unsetup(); | ||
3584 | kvm_x86_ops = NULL; | ||
3585 | } | 3267 | } |
3586 | 3268 | ||
3587 | static __init int kvm_init(void) | 3269 | static void vcpu_kick_intr(void *info) |
3588 | { | 3270 | { |
3589 | static struct page *bad_page; | 3271 | #ifdef DEBUG |
3590 | int r; | 3272 | struct kvm_vcpu *vcpu = (struct kvm_vcpu *)info; |
3591 | 3273 | printk(KERN_DEBUG "vcpu_kick_intr %p\n", vcpu); | |
3592 | r = kvm_mmu_module_init(); | 3274 | #endif |
3593 | if (r) | ||
3594 | goto out4; | ||
3595 | |||
3596 | kvm_init_debug(); | ||
3597 | |||
3598 | kvm_init_msr_list(); | ||
3599 | |||
3600 | if ((bad_page = alloc_page(GFP_KERNEL)) == NULL) { | ||
3601 | r = -ENOMEM; | ||
3602 | goto out; | ||
3603 | } | ||
3604 | |||
3605 | bad_page_address = page_to_pfn(bad_page) << PAGE_SHIFT; | ||
3606 | memset(__va(bad_page_address), 0, PAGE_SIZE); | ||
3607 | |||
3608 | return 0; | ||
3609 | |||
3610 | out: | ||
3611 | kvm_exit_debug(); | ||
3612 | kvm_mmu_module_exit(); | ||
3613 | out4: | ||
3614 | return r; | ||
3615 | } | 3275 | } |
3616 | 3276 | ||
3617 | static __exit void kvm_exit(void) | 3277 | void kvm_vcpu_kick(struct kvm_vcpu *vcpu) |
3618 | { | 3278 | { |
3619 | kvm_exit_debug(); | 3279 | int ipi_pcpu = vcpu->cpu; |
3620 | __free_page(pfn_to_page(bad_page_address >> PAGE_SHIFT)); | ||
3621 | kvm_mmu_module_exit(); | ||
3622 | } | ||
3623 | |||
3624 | module_init(kvm_init) | ||
3625 | module_exit(kvm_exit) | ||
3626 | 3280 | ||
3627 | EXPORT_SYMBOL_GPL(kvm_init_x86); | 3281 | if (waitqueue_active(&vcpu->wq)) { |
3628 | EXPORT_SYMBOL_GPL(kvm_exit_x86); | 3282 | wake_up_interruptible(&vcpu->wq); |
3283 | ++vcpu->stat.halt_wakeup; | ||
3284 | } | ||
3285 | if (vcpu->guest_mode) | ||
3286 | smp_call_function_single(ipi_pcpu, vcpu_kick_intr, vcpu, 0, 0); | ||
3287 | } | ||
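kvm_vcpu_kick() above distinguishes the two states the target vcpu can be in; compressed (a restatement, no new logic):

	/* halted  -> it sleeps on vcpu->wq, so waking the queue resumes it
	 *	      (accounted in stat.halt_wakeup);
	 * running -> guest_mode is set, so an IPI to vcpu->cpu forces a
	 *	      VM exit and the pending event is noticed promptly.
	 */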
diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c new file mode 100644 index 000000000000..79586003397a --- /dev/null +++ b/arch/x86/kvm/x86_emulate.c | |||
@@ -0,0 +1,1912 @@ | |||
1 | /****************************************************************************** | ||
2 | * x86_emulate.c | ||
3 | * | ||
4 | * Generic x86 (32-bit and 64-bit) instruction decoder and emulator. | ||
5 | * | ||
6 | * Copyright (c) 2005 Keir Fraser | ||
7 | * | ||
8 | * Linux coding style, mod r/m decoder, segment base fixes, real-mode | ||
9 | * privileged instructions: | ||
10 | * | ||
11 | * Copyright (C) 2006 Qumranet | ||
12 | * | ||
13 | * Avi Kivity <avi@qumranet.com> | ||
14 | * Yaniv Kamay <yaniv@qumranet.com> | ||
15 | * | ||
16 | * This work is licensed under the terms of the GNU GPL, version 2. See | ||
17 | * the COPYING file in the top-level directory. | ||
18 | * | ||
19 | * From: xen-unstable 10676:af9809f51f81a3c43f276f00c81a52ef558afda4 | ||
20 | */ | ||
21 | |||
22 | #ifndef __KERNEL__ | ||
23 | #include <stdio.h> | ||
24 | #include <stdint.h> | ||
25 | #include <public/xen.h> | ||
26 | #define DPRINTF(_f, _a ...) printf(_f , ## _a) | ||
27 | #else | ||
28 | #include <linux/kvm_host.h> | ||
29 | #define DPRINTF(x...) do {} while (0) | ||
30 | #endif | ||
31 | #include <linux/module.h> | ||
32 | #include <asm/kvm_x86_emulate.h> | ||
33 | |||
34 | /* | ||
35 | * Opcode effective-address decode tables. | ||
36 | * Note that we only emulate instructions that have at least one memory | ||
37 | * operand (excluding implicit stack references). We assume that stack | ||
38 | * references and instruction fetches will never occur in special memory | ||
39 | * areas that require emulation. So, for example, 'mov <imm>,<reg>' need | ||
40 | * not be handled. | ||
41 | */ | ||
42 | |||
43 | /* Operand sizes: 8-bit operands or specified/overridden size. */ | ||
44 | #define ByteOp (1<<0) /* 8-bit operands. */ | ||
45 | /* Destination operand type. */ | ||
46 | #define ImplicitOps (1<<1) /* Implicit in opcode. No generic decode. */ | ||
47 | #define DstReg (2<<1) /* Register operand. */ | ||
48 | #define DstMem (3<<1) /* Memory operand. */ | ||
49 | #define DstMask (3<<1) | ||
50 | /* Source operand type. */ | ||
51 | #define SrcNone (0<<3) /* No source operand. */ | ||
52 | #define SrcImplicit (0<<3) /* Source operand is implicit in the opcode. */ | ||
53 | #define SrcReg (1<<3) /* Register operand. */ | ||
54 | #define SrcMem (2<<3) /* Memory operand. */ | ||
55 | #define SrcMem16 (3<<3) /* Memory operand (16-bit). */ | ||
56 | #define SrcMem32 (4<<3) /* Memory operand (32-bit). */ | ||
57 | #define SrcImm (5<<3) /* Immediate operand. */ | ||
58 | #define SrcImmByte (6<<3) /* 8-bit sign-extended immediate operand. */ | ||
59 | #define SrcMask (7<<3) | ||
60 | /* Generic ModRM decode. */ | ||
61 | #define ModRM (1<<6) | ||
62 | /* Destination is only written; never read. */ | ||
63 | #define Mov (1<<7) | ||
64 | #define BitOp (1<<8) | ||
65 | #define MemAbs (1<<9) /* Memory operand is absolute displacement */ | ||
66 | #define String (1<<10) /* String instruction (rep capable) */ | ||
67 | #define Stack (1<<11) /* Stack instruction (push/pop) */ | ||
68 | |||
69 | static u16 opcode_table[256] = { | ||
70 | /* 0x00 - 0x07 */ | ||
71 | ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, | ||
72 | ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, | ||
73 | 0, 0, 0, 0, | ||
74 | /* 0x08 - 0x0F */ | ||
75 | ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, | ||
76 | ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, | ||
77 | 0, 0, 0, 0, | ||
78 | /* 0x10 - 0x17 */ | ||
79 | ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, | ||
80 | ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, | ||
81 | 0, 0, 0, 0, | ||
82 | /* 0x18 - 0x1F */ | ||
83 | ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, | ||
84 | ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, | ||
85 | 0, 0, 0, 0, | ||
86 | /* 0x20 - 0x27 */ | ||
87 | ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, | ||
88 | ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, | ||
89 | SrcImmByte, SrcImm, 0, 0, | ||
90 | /* 0x28 - 0x2F */ | ||
91 | ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, | ||
92 | ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, | ||
93 | 0, 0, 0, 0, | ||
94 | /* 0x30 - 0x37 */ | ||
95 | ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, | ||
96 | ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, | ||
97 | 0, 0, 0, 0, | ||
98 | /* 0x38 - 0x3F */ | ||
99 | ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, | ||
100 | ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, | ||
101 | 0, 0, 0, 0, | ||
102 | /* 0x40 - 0x47 */ | ||
103 | DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, | ||
104 | /* 0x48 - 0x4F */ | ||
105 | DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, | ||
106 | /* 0x50 - 0x57 */ | ||
107 | SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, | ||
108 | SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, | ||
109 | /* 0x58 - 0x5F */ | ||
110 | DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack, | ||
111 | DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack, | ||
112 | /* 0x60 - 0x67 */ | ||
113 | 0, 0, 0, DstReg | SrcMem32 | ModRM | Mov /* movsxd (x86/64) */ , | ||
114 | 0, 0, 0, 0, | ||
115 | /* 0x68 - 0x6F */ | ||
116 | 0, 0, ImplicitOps | Mov | Stack, 0, | ||
117 | SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* insb, insw/insd */ | ||
118 | SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* outsb, outsw/outsd */ | ||
119 | /* 0x70 - 0x77 */ | ||
120 | ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, | ||
121 | ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, | ||
122 | /* 0x78 - 0x7F */ | ||
123 | ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, | ||
124 | ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, | ||
125 | /* 0x80 - 0x87 */ | ||
126 | ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM, | ||
127 | ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM, | ||
128 | ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, | ||
129 | ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, | ||
130 | /* 0x88 - 0x8F */ | ||
131 | ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov, | ||
132 | ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, | ||
133 | 0, ModRM | DstReg, 0, DstMem | SrcNone | ModRM | Mov | Stack, | ||
134 | /* 0x90 - 0x9F */ | ||
135 | 0, 0, 0, 0, 0, 0, 0, 0, | ||
136 | 0, 0, 0, 0, ImplicitOps | Stack, ImplicitOps | Stack, 0, 0, | ||
137 | /* 0xA0 - 0xA7 */ | ||
138 | ByteOp | DstReg | SrcMem | Mov | MemAbs, DstReg | SrcMem | Mov | MemAbs, | ||
139 | ByteOp | DstMem | SrcReg | Mov | MemAbs, DstMem | SrcReg | Mov | MemAbs, | ||
140 | ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String, | ||
141 | ByteOp | ImplicitOps | String, ImplicitOps | String, | ||
142 | /* 0xA8 - 0xAF */ | ||
143 | 0, 0, ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String, | ||
144 | ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String, | ||
145 | ByteOp | ImplicitOps | String, ImplicitOps | String, | ||
146 | /* 0xB0 - 0xBF */ | ||
147 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | ||
148 | /* 0xC0 - 0xC7 */ | ||
149 | ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM, | ||
150 | 0, ImplicitOps | Stack, 0, 0, | ||
151 | ByteOp | DstMem | SrcImm | ModRM | Mov, DstMem | SrcImm | ModRM | Mov, | ||
152 | /* 0xC8 - 0xCF */ | ||
153 | 0, 0, 0, 0, 0, 0, 0, 0, | ||
154 | /* 0xD0 - 0xD7 */ | ||
155 | ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM, | ||
156 | ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM, | ||
157 | 0, 0, 0, 0, | ||
158 | /* 0xD8 - 0xDF */ | ||
159 | 0, 0, 0, 0, 0, 0, 0, 0, | ||
160 | /* 0xE0 - 0xE7 */ | ||
161 | 0, 0, 0, 0, 0, 0, 0, 0, | ||
162 | /* 0xE8 - 0xEF */ | ||
163 | ImplicitOps | Stack, SrcImm|ImplicitOps, 0, SrcImmByte|ImplicitOps, | ||
164 | 0, 0, 0, 0, | ||
165 | /* 0xF0 - 0xF7 */ | ||
166 | 0, 0, 0, 0, | ||
167 | ImplicitOps, ImplicitOps, | ||
168 | ByteOp | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM, | ||
169 | /* 0xF8 - 0xFF */ | ||
170 | ImplicitOps, 0, ImplicitOps, ImplicitOps, | ||
171 | 0, 0, ByteOp | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM | ||
172 | }; | ||
173 | |||
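As a concrete reading of the flag words in opcode_table, here is a stand-alone snippet (illustrative only; the masks are copied from the definitions above) that decodes the table's entry for opcode 0x88, "mov r/m8, r8":

	#include <assert.h>
	#include <stdio.h>

	#define ByteOp  (1 << 0)
	#define DstMem  (3 << 1)
	#define DstMask (3 << 1)
	#define SrcReg  (1 << 3)
	#define SrcMask (7 << 3)
	#define ModRM   (1 << 6)
	#define Mov     (1 << 7)

	int main(void)
	{
		unsigned d = ByteOp | DstMem | SrcReg | ModRM | Mov; /* 0x0cf */

		assert((d & DstMask) == DstMem); /* destination: memory    */
		assert((d & SrcMask) == SrcReg); /* source: register       */
		assert(d & Mov);                 /* write-only destination */
		printf("opcode 0x88 decodes with flags %#x\n", d);
		return 0;
	}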
174 | static u16 twobyte_table[256] = { | ||
175 | /* 0x00 - 0x0F */ | ||
176 | 0, SrcMem | ModRM | DstReg, 0, 0, 0, 0, ImplicitOps, 0, | ||
177 | ImplicitOps, ImplicitOps, 0, 0, 0, ImplicitOps | ModRM, 0, 0, | ||
178 | /* 0x10 - 0x1F */ | ||
179 | 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, 0, | ||
180 | /* 0x20 - 0x2F */ | ||
181 | ModRM | ImplicitOps, ModRM, ModRM | ImplicitOps, ModRM, 0, 0, 0, 0, | ||
182 | 0, 0, 0, 0, 0, 0, 0, 0, | ||
183 | /* 0x30 - 0x3F */ | ||
184 | ImplicitOps, 0, ImplicitOps, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | ||
185 | /* 0x40 - 0x47 */ | ||
186 | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, | ||
187 | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, | ||
188 | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, | ||
189 | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, | ||
190 | /* 0x48 - 0x4F */ | ||
191 | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, | ||
192 | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, | ||
193 | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, | ||
194 | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, | ||
195 | /* 0x50 - 0x5F */ | ||
196 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | ||
197 | /* 0x60 - 0x6F */ | ||
198 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | ||
199 | /* 0x70 - 0x7F */ | ||
200 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | ||
201 | /* 0x80 - 0x8F */ | ||
202 | ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, | ||
203 | ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, | ||
204 | ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, | ||
205 | ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, | ||
206 | /* 0x90 - 0x9F */ | ||
207 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | ||
208 | /* 0xA0 - 0xA7 */ | ||
209 | 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 0, 0, 0, 0, | ||
210 | /* 0xA8 - 0xAF */ | ||
211 | 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 0, 0, 0, 0, | ||
212 | /* 0xB0 - 0xB7 */ | ||
213 | ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 0, | ||
214 | DstMem | SrcReg | ModRM | BitOp, | ||
215 | 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov, | ||
216 | DstReg | SrcMem16 | ModRM | Mov, | ||
217 | /* 0xB8 - 0xBF */ | ||
218 | 0, 0, DstMem | SrcImmByte | ModRM, DstMem | SrcReg | ModRM | BitOp, | ||
219 | 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov, | ||
220 | DstReg | SrcMem16 | ModRM | Mov, | ||
221 | /* 0xC0 - 0xCF */ | ||
222 | 0, 0, 0, DstMem | SrcReg | ModRM | Mov, 0, 0, 0, ImplicitOps | ModRM, | ||
223 | 0, 0, 0, 0, 0, 0, 0, 0, | ||
224 | /* 0xD0 - 0xDF */ | ||
225 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | ||
226 | /* 0xE0 - 0xEF */ | ||
227 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | ||
228 | /* 0xF0 - 0xFF */ | ||
229 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 | ||
230 | }; | ||
231 | |||
232 | /* EFLAGS bit definitions. */ | ||
233 | #define EFLG_OF (1<<11) | ||
234 | #define EFLG_DF (1<<10) | ||
235 | #define EFLG_SF (1<<7) | ||
236 | #define EFLG_ZF (1<<6) | ||
237 | #define EFLG_AF (1<<4) | ||
238 | #define EFLG_PF (1<<2) | ||
239 | #define EFLG_CF (1<<0) | ||
240 | |||
241 | /* | ||
242 | * Instruction emulation: | ||
243 | * Most instructions are emulated directly via a fragment of inline assembly | ||
244 | * code. This allows us to save/restore EFLAGS and thus very easily pick up | ||
245 | * any modified flags. | ||
246 | */ | ||
247 | |||
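Before the assembly plumbing below, it may help to see what one such fragment computes. A minimal user-space model of a 32-bit "add" with flag capture, i.e. what emulate_2op_SrcV("add", ...) achieves (an approximation: AF and PF are omitted and nothing here is atomic):

	#include <stdint.h>
	#include <stdio.h>

	#define EFLG_OF (1 << 11)	/* copied from the definitions above */
	#define EFLG_SF (1 << 7)
	#define EFLG_ZF (1 << 6)
	#define EFLG_CF (1 << 0)

	static unsigned long add32_flags(uint32_t src, uint32_t *dst)
	{
		uint64_t wide = (uint64_t)*dst + src;
		uint32_t res = (uint32_t)wide;
		unsigned long fl = 0;

		if (wide >> 32)
			fl |= EFLG_CF;		/* unsigned carry out */
		if (res == 0)
			fl |= EFLG_ZF;
		if (res & 0x80000000u)
			fl |= EFLG_SF;
		if (~(*dst ^ src) & (*dst ^ res) & 0x80000000u)
			fl |= EFLG_OF;		/* signed overflow */
		*dst = res;
		return fl;
	}

	int main(void)
	{
		uint32_t d = 0x7fffffffu;
		unsigned long fl = add32_flags(1, &d);

		printf("res=%#x flags=%#lx\n", (unsigned)d, fl); /* SF|OF */
		return 0;
	}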
248 | #if defined(CONFIG_X86_64) | ||
249 | #define _LO32 "k" /* force 32-bit operand */ | ||
250 | #define _STK "%%rsp" /* stack pointer */ | ||
251 | #elif defined(__i386__) | ||
252 | #define _LO32 "" /* force 32-bit operand */ | ||
253 | #define _STK "%%esp" /* stack pointer */ | ||
254 | #endif | ||
255 | |||
256 | /* | ||
257 | * These EFLAGS bits are restored from saved value during emulation, and | ||
258 | * any changes are written back to the saved value after emulation. | ||
259 | */ | ||
260 | #define EFLAGS_MASK (EFLG_OF|EFLG_SF|EFLG_ZF|EFLG_AF|EFLG_PF|EFLG_CF) | ||
261 | |||
262 | /* Before executing instruction: restore necessary bits in EFLAGS. */ | ||
263 | #define _PRE_EFLAGS(_sav, _msk, _tmp) \ | ||
264 | /* EFLAGS = (_sav & _msk) | (EFLAGS & ~_msk); _sav &= ~_msk; */ \ | ||
265 | "movl %"_sav",%"_LO32 _tmp"; " \ | ||
266 | "push %"_tmp"; " \ | ||
267 | "push %"_tmp"; " \ | ||
268 | "movl %"_msk",%"_LO32 _tmp"; " \ | ||
269 | "andl %"_LO32 _tmp",("_STK"); " \ | ||
270 | "pushf; " \ | ||
271 | "notl %"_LO32 _tmp"; " \ | ||
272 | "andl %"_LO32 _tmp",("_STK"); " \ | ||
273 | "andl %"_LO32 _tmp","__stringify(BITS_PER_LONG/4)"("_STK"); " \ | ||
274 | "pop %"_tmp"; " \ | ||
275 | "orl %"_LO32 _tmp",("_STK"); " \ | ||
276 | "popf; " \ | ||
277 | "pop %"_sav"; " | ||
278 | |||
279 | /* After executing instruction: write-back necessary bits in EFLAGS. */ | ||
280 | #define _POST_EFLAGS(_sav, _msk, _tmp) \ | ||
281 | /* _sav |= EFLAGS & _msk; */ \ | ||
282 | "pushf; " \ | ||
283 | "pop %"_tmp"; " \ | ||
284 | "andl %"_msk",%"_LO32 _tmp"; " \ | ||
285 | "orl %"_LO32 _tmp",%"_sav"; " | ||
286 | |||
287 | /* Raw emulation: instruction has two explicit operands. */ | ||
288 | #define __emulate_2op_nobyte(_op,_src,_dst,_eflags,_wx,_wy,_lx,_ly,_qx,_qy) \ | ||
289 | do { \ | ||
290 | unsigned long _tmp; \ | ||
291 | \ | ||
292 | switch ((_dst).bytes) { \ | ||
293 | case 2: \ | ||
294 | __asm__ __volatile__ ( \ | ||
295 | _PRE_EFLAGS("0", "4", "2") \ | ||
296 | _op"w %"_wx"3,%1; " \ | ||
297 | _POST_EFLAGS("0", "4", "2") \ | ||
298 | : "=m" (_eflags), "=m" ((_dst).val), \ | ||
299 | "=&r" (_tmp) \ | ||
300 | : _wy ((_src).val), "i" (EFLAGS_MASK)); \ | ||
301 | break; \ | ||
302 | case 4: \ | ||
303 | __asm__ __volatile__ ( \ | ||
304 | _PRE_EFLAGS("0", "4", "2") \ | ||
305 | _op"l %"_lx"3,%1; " \ | ||
306 | _POST_EFLAGS("0", "4", "2") \ | ||
307 | : "=m" (_eflags), "=m" ((_dst).val), \ | ||
308 | "=&r" (_tmp) \ | ||
309 | : _ly ((_src).val), "i" (EFLAGS_MASK)); \ | ||
310 | break; \ | ||
311 | case 8: \ | ||
312 | __emulate_2op_8byte(_op, _src, _dst, \ | ||
313 | _eflags, _qx, _qy); \ | ||
314 | break; \ | ||
315 | } \ | ||
316 | } while (0) | ||
317 | |||
318 | #define __emulate_2op(_op,_src,_dst,_eflags,_bx,_by,_wx,_wy,_lx,_ly,_qx,_qy) \ | ||
319 | do { \ | ||
320 | unsigned long _tmp; \ | ||
321 | switch ((_dst).bytes) { \ | ||
322 | case 1: \ | ||
323 | __asm__ __volatile__ ( \ | ||
324 | _PRE_EFLAGS("0", "4", "2") \ | ||
325 | _op"b %"_bx"3,%1; " \ | ||
326 | _POST_EFLAGS("0", "4", "2") \ | ||
327 | : "=m" (_eflags), "=m" ((_dst).val), \ | ||
328 | "=&r" (_tmp) \ | ||
329 | : _by ((_src).val), "i" (EFLAGS_MASK)); \ | ||
330 | break; \ | ||
331 | default: \ | ||
332 | __emulate_2op_nobyte(_op, _src, _dst, _eflags, \ | ||
333 | _wx, _wy, _lx, _ly, _qx, _qy); \ | ||
334 | break; \ | ||
335 | } \ | ||
336 | } while (0) | ||
337 | |||
338 | /* Source operand is byte-sized and may be restricted to just %cl. */ | ||
339 | #define emulate_2op_SrcB(_op, _src, _dst, _eflags) \ | ||
340 | __emulate_2op(_op, _src, _dst, _eflags, \ | ||
341 | "b", "c", "b", "c", "b", "c", "b", "c") | ||
342 | |||
343 | /* Source operand is byte, word, long or quad sized. */ | ||
344 | #define emulate_2op_SrcV(_op, _src, _dst, _eflags) \ | ||
345 | __emulate_2op(_op, _src, _dst, _eflags, \ | ||
346 | "b", "q", "w", "r", _LO32, "r", "", "r") | ||
347 | |||
348 | /* Source operand is word, long or quad sized. */ | ||
349 | #define emulate_2op_SrcV_nobyte(_op, _src, _dst, _eflags) \ | ||
350 | __emulate_2op_nobyte(_op, _src, _dst, _eflags, \ | ||
351 | "w", "r", _LO32, "r", "", "r") | ||
352 | |||
353 | /* Instruction has only one explicit operand (no source operand). */ | ||
354 | #define emulate_1op(_op, _dst, _eflags) \ | ||
355 | do { \ | ||
356 | unsigned long _tmp; \ | ||
357 | \ | ||
358 | switch ((_dst).bytes) { \ | ||
359 | case 1: \ | ||
360 | __asm__ __volatile__ ( \ | ||
361 | _PRE_EFLAGS("0", "3", "2") \ | ||
362 | _op"b %1; " \ | ||
363 | _POST_EFLAGS("0", "3", "2") \ | ||
364 | : "=m" (_eflags), "=m" ((_dst).val), \ | ||
365 | "=&r" (_tmp) \ | ||
366 | : "i" (EFLAGS_MASK)); \ | ||
367 | break; \ | ||
368 | case 2: \ | ||
369 | __asm__ __volatile__ ( \ | ||
370 | _PRE_EFLAGS("0", "3", "2") \ | ||
371 | _op"w %1; " \ | ||
372 | _POST_EFLAGS("0", "3", "2") \ | ||
373 | : "=m" (_eflags), "=m" ((_dst).val), \ | ||
374 | "=&r" (_tmp) \ | ||
375 | : "i" (EFLAGS_MASK)); \ | ||
376 | break; \ | ||
377 | case 4: \ | ||
378 | __asm__ __volatile__ ( \ | ||
379 | _PRE_EFLAGS("0", "3", "2") \ | ||
380 | _op"l %1; " \ | ||
381 | _POST_EFLAGS("0", "3", "2") \ | ||
382 | : "=m" (_eflags), "=m" ((_dst).val), \ | ||
383 | "=&r" (_tmp) \ | ||
384 | : "i" (EFLAGS_MASK)); \ | ||
385 | break; \ | ||
386 | case 8: \ | ||
387 | __emulate_1op_8byte(_op, _dst, _eflags); \ | ||
388 | break; \ | ||
389 | } \ | ||
390 | } while (0) | ||
391 | |||
392 | /* Emulate an instruction with quadword operands (x86/64 only). */ | ||
393 | #if defined(CONFIG_X86_64) | ||
394 | #define __emulate_2op_8byte(_op, _src, _dst, _eflags, _qx, _qy) \ | ||
395 | do { \ | ||
396 | __asm__ __volatile__ ( \ | ||
397 | _PRE_EFLAGS("0", "4", "2") \ | ||
398 | _op"q %"_qx"3,%1; " \ | ||
399 | _POST_EFLAGS("0", "4", "2") \ | ||
400 | : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \ | ||
401 | : _qy ((_src).val), "i" (EFLAGS_MASK)); \ | ||
402 | } while (0) | ||
403 | |||
404 | #define __emulate_1op_8byte(_op, _dst, _eflags) \ | ||
405 | do { \ | ||
406 | __asm__ __volatile__ ( \ | ||
407 | _PRE_EFLAGS("0", "3", "2") \ | ||
408 | _op"q %1; " \ | ||
409 | _POST_EFLAGS("0", "3", "2") \ | ||
410 | : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \ | ||
411 | : "i" (EFLAGS_MASK)); \ | ||
412 | } while (0) | ||
413 | |||
414 | #elif defined(__i386__) | ||
415 | #define __emulate_2op_8byte(_op, _src, _dst, _eflags, _qx, _qy) | ||
416 | #define __emulate_1op_8byte(_op, _dst, _eflags) | ||
417 | #endif /* __i386__ */ | ||
418 | |||
419 | /* Fetch next part of the instruction being emulated. */ | ||
420 | #define insn_fetch(_type, _size, _eip) \ | ||
421 | ({ unsigned long _x; \ | ||
422 | rc = do_insn_fetch(ctxt, ops, (_eip), &_x, (_size)); \ | ||
423 | if (rc != 0) \ | ||
424 | goto done; \ | ||
425 | (_eip) += (_size); \ | ||
426 | (_type)_x; \ | ||
427 | }) | ||
428 | |||
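Because insn_fetch() is a statement expression, it both advances _eip and yields the fetched value, and on any fetch failure it jumps to the caller's done label, which every caller below therefore provides. Two call sites taken from later in this file:

	c->modrm = insn_fetch(u8, 1, c->eip);		/* the ModRM byte */
	c->modrm_ea += insn_fetch(s8, 1, c->eip);	/* a disp8        */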
429 | /* Access/update address held in a register, based on addressing mode. */ | ||
430 | #define address_mask(reg) \ | ||
431 | ((c->ad_bytes == sizeof(unsigned long)) ? \ | ||
432 | (reg) : ((reg) & ((1UL << (c->ad_bytes << 3)) - 1))) | ||
433 | #define register_address(base, reg) \ | ||
434 | ((base) + address_mask(reg)) | ||
435 | #define register_address_increment(reg, inc) \ | ||
436 | do { \ | ||
437 | /* signed type ensures sign extension to long */ \ | ||
438 | int _inc = (inc); \ | ||
439 | if (c->ad_bytes == sizeof(unsigned long)) \ | ||
440 | (reg) += _inc; \ | ||
441 | else \ | ||
442 | (reg) = ((reg) & \ | ||
443 | ~((1UL << (c->ad_bytes << 3)) - 1)) | \ | ||
444 | (((reg) + _inc) & \ | ||
445 | ((1UL << (c->ad_bytes << 3)) - 1)); \ | ||
446 | } while (0) | ||
447 | |||
448 | #define JMP_REL(rel) \ | ||
449 | do { \ | ||
450 | register_address_increment(c->eip, rel); \ | ||
451 | } while (0) | ||
452 | |||
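The masking matters most for 16-bit guests: an increment must wrap inside the low ad_bytes * 8 bits while leaving the rest of the register alone. A stand-alone demonstration (illustrative only) of the ad_bytes == 2 branch of register_address_increment():

	#include <stdio.h>

	int main(void)
	{
		unsigned ad_bytes = 2;
		unsigned long reg = 0x1234ffffUL;	/* low word at 0xffff */
		int inc = 1;
		unsigned long mask = (1UL << (ad_bytes << 3)) - 1;

		if (ad_bytes == sizeof(unsigned long))
			reg += inc;
		else
			reg = (reg & ~mask) | ((reg + inc) & mask);

		printf("%#lx\n", reg);	/* 0x12340000: wrapped, high bits kept */
		return 0;
	}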
453 | static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt, | ||
454 | struct x86_emulate_ops *ops, | ||
455 | unsigned long linear, u8 *dest) | ||
456 | { | ||
457 | struct fetch_cache *fc = &ctxt->decode.fetch; | ||
458 | int rc; | ||
459 | int size; | ||
460 | |||
461 | if (linear < fc->start || linear >= fc->end) { | ||
462 | size = min(15UL, PAGE_SIZE - offset_in_page(linear)); | ||
463 | rc = ops->read_std(linear, fc->data, size, ctxt->vcpu); | ||
464 | if (rc) | ||
465 | return rc; | ||
466 | fc->start = linear; | ||
467 | fc->end = linear + size; | ||
468 | } | ||
469 | *dest = fc->data[linear - fc->start]; | ||
470 | return 0; | ||
471 | } | ||
472 | |||
473 | static int do_insn_fetch(struct x86_emulate_ctxt *ctxt, | ||
474 | struct x86_emulate_ops *ops, | ||
475 | unsigned long eip, void *dest, unsigned size) | ||
476 | { | ||
477 | int rc = 0; | ||
478 | |||
479 | eip += ctxt->cs_base; | ||
480 | while (size--) { | ||
481 | rc = do_fetch_insn_byte(ctxt, ops, eip++, dest++); | ||
482 | if (rc) | ||
483 | return rc; | ||
484 | } | ||
485 | return 0; | ||
486 | } | ||
487 | |||
488 | /* | ||
489 | * Given the 'reg' portion of a ModRM byte, and a register block, return a | ||
490 | * pointer into the block that addresses the relevant register. | ||
491 | * @highbyte_regs specifies whether to decode AH,CH,DH,BH. | ||
492 | */ | ||
493 | static void *decode_register(u8 modrm_reg, unsigned long *regs, | ||
494 | int highbyte_regs) | ||
495 | { | ||
496 | void *p; | ||
497 | |||
498 | p = &regs[modrm_reg]; ||
499 | if (highbyte_regs && modrm_reg >= 4 && modrm_reg < 8) | ||
500 | p = (unsigned char *)®s[modrm_reg & 3] + 1; | ||
501 | return p; | ||
502 | } | ||
503 | |||
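The high-byte case folds AH/CH/DH/BH onto byte 1 of the first four GPRs. A small host-side check (an illustration, assuming a little-endian host as on x86):

	#include <stdio.h>

	int main(void)
	{
		unsigned long regs[8] = { 0x11223344UL };	/* regs[0] = RAX */
		int modrm_reg = 4;				/* AH, no REX    */
		unsigned char *p = (unsigned char *)&regs[modrm_reg & 3] + 1;

		printf("AH = %#x\n", *p);	/* 0x33 on little-endian */
		return 0;
	}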
504 | static int read_descriptor(struct x86_emulate_ctxt *ctxt, | ||
505 | struct x86_emulate_ops *ops, | ||
506 | void *ptr, | ||
507 | u16 *size, unsigned long *address, int op_bytes) | ||
508 | { | ||
509 | int rc; | ||
510 | |||
511 | if (op_bytes == 2) | ||
512 | op_bytes = 3; | ||
513 | *address = 0; | ||
514 | rc = ops->read_std((unsigned long)ptr, (unsigned long *)size, 2, | ||
515 | ctxt->vcpu); | ||
516 | if (rc) | ||
517 | return rc; | ||
518 | rc = ops->read_std((unsigned long)ptr + 2, address, op_bytes, | ||
519 | ctxt->vcpu); | ||
520 | return rc; | ||
521 | } | ||
522 | |||
523 | static int test_cc(unsigned int condition, unsigned int flags) | ||
524 | { | ||
525 | int rc = 0; | ||
526 | |||
527 | switch ((condition & 15) >> 1) { | ||
528 | case 0: /* o */ | ||
529 | rc |= (flags & EFLG_OF); | ||
530 | break; | ||
531 | case 1: /* b/c/nae */ | ||
532 | rc |= (flags & EFLG_CF); | ||
533 | break; | ||
534 | case 2: /* z/e */ | ||
535 | rc |= (flags & EFLG_ZF); | ||
536 | break; | ||
537 | case 3: /* be/na */ | ||
538 | rc |= (flags & (EFLG_CF|EFLG_ZF)); | ||
539 | break; | ||
540 | case 4: /* s */ | ||
541 | rc |= (flags & EFLG_SF); | ||
542 | break; | ||
543 | case 5: /* p/pe */ | ||
544 | rc |= (flags & EFLG_PF); | ||
545 | break; | ||
546 | case 7: /* le/ng */ | ||
547 | rc |= (flags & EFLG_ZF); | ||
548 | /* fall through */ | ||
549 | case 6: /* l/nge */ | ||
550 | rc |= (!(flags & EFLG_SF) != !(flags & EFLG_OF)); | ||
551 | break; | ||
552 | } | ||
553 | |||
554 | /* Odd condition identifiers (lsb == 1) have inverted sense. */ | ||
555 | return (!!rc ^ (condition & 1)); | ||
556 | } | ||
557 | |||
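Two worked calls make the final inversion concrete (illustrative usage; test_cc() and EFLG_ZF are as defined above):

	unsigned int flags = EFLG_ZF;

	test_cc(0x4, flags);	/* "z/e" (je):   ZF test, even lsb -> 1 */
	test_cc(0x5, flags);	/* "nz/ne" (jne): same test, odd lsb -> 0 */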
558 | static void decode_register_operand(struct operand *op, | ||
559 | struct decode_cache *c, | ||
560 | int inhibit_bytereg) | ||
561 | { | ||
562 | unsigned reg = c->modrm_reg; | ||
563 | int highbyte_regs = c->rex_prefix == 0; | ||
564 | |||
565 | if (!(c->d & ModRM)) | ||
566 | reg = (c->b & 7) | ((c->rex_prefix & 1) << 3); | ||
567 | op->type = OP_REG; | ||
568 | if ((c->d & ByteOp) && !inhibit_bytereg) { | ||
569 | op->ptr = decode_register(reg, c->regs, highbyte_regs); | ||
570 | op->val = *(u8 *)op->ptr; | ||
571 | op->bytes = 1; | ||
572 | } else { | ||
573 | op->ptr = decode_register(reg, c->regs, 0); | ||
574 | op->bytes = c->op_bytes; | ||
575 | switch (op->bytes) { | ||
576 | case 2: | ||
577 | op->val = *(u16 *)op->ptr; | ||
578 | break; | ||
579 | case 4: | ||
580 | op->val = *(u32 *)op->ptr; | ||
581 | break; | ||
582 | case 8: | ||
583 | op->val = *(u64 *) op->ptr; | ||
584 | break; | ||
585 | } | ||
586 | } | ||
587 | op->orig_val = op->val; | ||
588 | } | ||
589 | |||
590 | static int decode_modrm(struct x86_emulate_ctxt *ctxt, | ||
591 | struct x86_emulate_ops *ops) | ||
592 | { | ||
593 | struct decode_cache *c = &ctxt->decode; | ||
594 | u8 sib; | ||
595 | int index_reg = 0, base_reg = 0, scale, rip_relative = 0; | ||
596 | int rc = 0; | ||
597 | |||
598 | if (c->rex_prefix) { | ||
599 | c->modrm_reg = (c->rex_prefix & 4) << 1; /* REX.R */ | ||
600 | index_reg = (c->rex_prefix & 2) << 2; /* REX.X */ | ||
601 | c->modrm_rm = base_reg = (c->rex_prefix & 1) << 3; /* REX.B */ ||
602 | } | ||
603 | |||
604 | c->modrm = insn_fetch(u8, 1, c->eip); | ||
605 | c->modrm_mod |= (c->modrm & 0xc0) >> 6; | ||
606 | c->modrm_reg |= (c->modrm & 0x38) >> 3; | ||
607 | c->modrm_rm |= (c->modrm & 0x07); | ||
608 | c->modrm_ea = 0; | ||
609 | c->use_modrm_ea = 1; | ||
610 | |||
611 | if (c->modrm_mod == 3) { | ||
612 | c->modrm_val = *(unsigned long *) | ||
613 | decode_register(c->modrm_rm, c->regs, c->d & ByteOp); | ||
614 | return rc; | ||
615 | } | ||
616 | |||
617 | if (c->ad_bytes == 2) { | ||
618 | unsigned bx = c->regs[VCPU_REGS_RBX]; | ||
619 | unsigned bp = c->regs[VCPU_REGS_RBP]; | ||
620 | unsigned si = c->regs[VCPU_REGS_RSI]; | ||
621 | unsigned di = c->regs[VCPU_REGS_RDI]; | ||
622 | |||
623 | /* 16-bit ModR/M decode. */ | ||
624 | switch (c->modrm_mod) { | ||
625 | case 0: | ||
626 | if (c->modrm_rm == 6) | ||
627 | c->modrm_ea += insn_fetch(u16, 2, c->eip); | ||
628 | break; | ||
629 | case 1: | ||
630 | c->modrm_ea += insn_fetch(s8, 1, c->eip); | ||
631 | break; | ||
632 | case 2: | ||
633 | c->modrm_ea += insn_fetch(u16, 2, c->eip); | ||
634 | break; | ||
635 | } | ||
636 | switch (c->modrm_rm) { | ||
637 | case 0: | ||
638 | c->modrm_ea += bx + si; | ||
639 | break; | ||
640 | case 1: | ||
641 | c->modrm_ea += bx + di; | ||
642 | break; | ||
643 | case 2: | ||
644 | c->modrm_ea += bp + si; | ||
645 | break; | ||
646 | case 3: | ||
647 | c->modrm_ea += bp + di; | ||
648 | break; | ||
649 | case 4: | ||
650 | c->modrm_ea += si; | ||
651 | break; | ||
652 | case 5: | ||
653 | c->modrm_ea += di; | ||
654 | break; | ||
655 | case 6: | ||
656 | if (c->modrm_mod != 0) | ||
657 | c->modrm_ea += bp; | ||
658 | break; | ||
659 | case 7: | ||
660 | c->modrm_ea += bx; | ||
661 | break; | ||
662 | } | ||
663 | if (c->modrm_rm == 2 || c->modrm_rm == 3 || | ||
664 | (c->modrm_rm == 6 && c->modrm_mod != 0)) | ||
665 | if (!c->override_base) | ||
666 | c->override_base = &ctxt->ss_base; | ||
667 | c->modrm_ea = (u16)c->modrm_ea; | ||
668 | } else { | ||
669 | /* 32/64-bit ModR/M decode. */ | ||
670 | switch (c->modrm_rm) { | ||
671 | case 4: | ||
672 | case 12: | ||
673 | sib = insn_fetch(u8, 1, c->eip); | ||
674 | index_reg |= (sib >> 3) & 7; | ||
675 | base_reg |= sib & 7; | ||
676 | scale = sib >> 6; | ||
677 | |||
678 | switch (base_reg) { | ||
679 | case 5: | ||
680 | if (c->modrm_mod != 0) | ||
681 | c->modrm_ea += c->regs[base_reg]; | ||
682 | else | ||
683 | c->modrm_ea += | ||
684 | insn_fetch(s32, 4, c->eip); | ||
685 | break; | ||
686 | default: | ||
687 | c->modrm_ea += c->regs[base_reg]; | ||
688 | } | ||
689 | switch (index_reg) { | ||
690 | case 4: | ||
691 | break; | ||
692 | default: | ||
693 | c->modrm_ea += c->regs[index_reg] << scale; | ||
694 | } | ||
695 | break; | ||
696 | case 5: | ||
697 | if (c->modrm_mod != 0) | ||
698 | c->modrm_ea += c->regs[c->modrm_rm]; | ||
699 | else if (ctxt->mode == X86EMUL_MODE_PROT64) | ||
700 | rip_relative = 1; | ||
701 | break; | ||
702 | default: | ||
703 | c->modrm_ea += c->regs[c->modrm_rm]; | ||
704 | break; | ||
705 | } | ||
706 | switch (c->modrm_mod) { | ||
707 | case 0: | ||
708 | if (c->modrm_rm == 5) | ||
709 | c->modrm_ea += insn_fetch(s32, 4, c->eip); | ||
710 | break; | ||
711 | case 1: | ||
712 | c->modrm_ea += insn_fetch(s8, 1, c->eip); | ||
713 | break; | ||
714 | case 2: | ||
715 | c->modrm_ea += insn_fetch(s32, 4, c->eip); | ||
716 | break; | ||
717 | } | ||
718 | } | ||
719 | if (rip_relative) { | ||
720 | c->modrm_ea += c->eip; | ||
721 | switch (c->d & SrcMask) { | ||
722 | case SrcImmByte: | ||
723 | c->modrm_ea += 1; | ||
724 | break; | ||
725 | case SrcImm: | ||
726 | if (c->d & ByteOp) | ||
727 | c->modrm_ea += 1; | ||
728 | else | ||
729 | if (c->op_bytes == 8) | ||
730 | c->modrm_ea += 4; | ||
731 | else | ||
732 | c->modrm_ea += c->op_bytes; | ||
733 | } | ||
734 | } | ||
735 | done: | ||
736 | return rc; | ||
737 | } | ||
738 | |||
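As a worked example of decode_modrm()'s 32/64-bit path (register indices per the VCPU_REGS_* names used above):

	/* mov rax, [rsp+8]  =  48 8b 44 24 08
	 *   48  REX, W=1                       -> op_bytes = 8
	 *   8b  opcode                         -> DstReg | SrcMem | ModRM | Mov
	 *   44  ModRM, mod=01 reg=000 rm=100   -> SIB and disp8 follow
	 *   24  SIB, scale=0 index=100 base=100 -> no index, base = RSP
	 *   08  disp8                          -> modrm_ea = regs[VCPU_REGS_RSP] + 8
	 */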
739 | static int decode_abs(struct x86_emulate_ctxt *ctxt, | ||
740 | struct x86_emulate_ops *ops) | ||
741 | { | ||
742 | struct decode_cache *c = &ctxt->decode; | ||
743 | int rc = 0; | ||
744 | |||
745 | switch (c->ad_bytes) { | ||
746 | case 2: | ||
747 | c->modrm_ea = insn_fetch(u16, 2, c->eip); | ||
748 | break; | ||
749 | case 4: | ||
750 | c->modrm_ea = insn_fetch(u32, 4, c->eip); | ||
751 | break; | ||
752 | case 8: | ||
753 | c->modrm_ea = insn_fetch(u64, 8, c->eip); | ||
754 | break; | ||
755 | } | ||
756 | done: | ||
757 | return rc; | ||
758 | } | ||
759 | |||
760 | int | ||
761 | x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) | ||
762 | { | ||
763 | struct decode_cache *c = &ctxt->decode; | ||
764 | int rc = 0; | ||
765 | int mode = ctxt->mode; | ||
766 | int def_op_bytes, def_ad_bytes; | ||
767 | |||
768 | /* Shadow copy of register state. Committed on successful emulation. */ | ||
769 | |||
770 | memset(c, 0, sizeof(struct decode_cache)); | ||
771 | c->eip = ctxt->vcpu->arch.rip; | ||
772 | memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs); | ||
773 | |||
774 | switch (mode) { | ||
775 | case X86EMUL_MODE_REAL: | ||
776 | case X86EMUL_MODE_PROT16: | ||
777 | def_op_bytes = def_ad_bytes = 2; | ||
778 | break; | ||
779 | case X86EMUL_MODE_PROT32: | ||
780 | def_op_bytes = def_ad_bytes = 4; | ||
781 | break; | ||
782 | #ifdef CONFIG_X86_64 | ||
783 | case X86EMUL_MODE_PROT64: | ||
784 | def_op_bytes = 4; | ||
785 | def_ad_bytes = 8; | ||
786 | break; | ||
787 | #endif | ||
788 | default: | ||
789 | return -1; | ||
790 | } | ||
791 | |||
792 | c->op_bytes = def_op_bytes; | ||
793 | c->ad_bytes = def_ad_bytes; | ||
794 | |||
795 | /* Legacy prefixes. */ | ||
796 | for (;;) { | ||
797 | switch (c->b = insn_fetch(u8, 1, c->eip)) { | ||
798 | case 0x66: /* operand-size override */ | ||
799 | /* switch between 2/4 bytes */ | ||
800 | c->op_bytes = def_op_bytes ^ 6; | ||
801 | break; | ||
802 | case 0x67: /* address-size override */ | ||
803 | if (mode == X86EMUL_MODE_PROT64) | ||
804 | /* switch between 4/8 bytes */ | ||
805 | c->ad_bytes = def_ad_bytes ^ 12; | ||
806 | else | ||
807 | /* switch between 2/4 bytes */ | ||
808 | c->ad_bytes = def_ad_bytes ^ 6; | ||
809 | break; | ||
810 | case 0x2e: /* CS override */ | ||
811 | c->override_base = &ctxt->cs_base; | ||
812 | break; | ||
813 | case 0x3e: /* DS override */ | ||
814 | c->override_base = &ctxt->ds_base; | ||
815 | break; | ||
816 | case 0x26: /* ES override */ | ||
817 | c->override_base = &ctxt->es_base; | ||
818 | break; | ||
819 | case 0x64: /* FS override */ | ||
820 | c->override_base = &ctxt->fs_base; | ||
821 | break; | ||
822 | case 0x65: /* GS override */ | ||
823 | c->override_base = &ctxt->gs_base; | ||
824 | break; | ||
825 | case 0x36: /* SS override */ | ||
826 | c->override_base = &ctxt->ss_base; | ||
827 | break; | ||
828 | case 0x40 ... 0x4f: /* REX */ | ||
829 | if (mode != X86EMUL_MODE_PROT64) | ||
830 | goto done_prefixes; | ||
831 | c->rex_prefix = c->b; | ||
832 | continue; | ||
833 | case 0xf0: /* LOCK */ | ||
834 | c->lock_prefix = 1; | ||
835 | break; | ||
836 | case 0xf2: /* REPNE/REPNZ */ | ||
837 | c->rep_prefix = REPNE_PREFIX; | ||
838 | break; | ||
839 | case 0xf3: /* REP/REPE/REPZ */ | ||
840 | c->rep_prefix = REPE_PREFIX; | ||
841 | break; | ||
842 | default: | ||
843 | goto done_prefixes; | ||
844 | } | ||
845 | |||
846 | /* Any legacy prefix after a REX prefix nullifies its effect. */ | ||
847 | |||
848 | c->rex_prefix = 0; | ||
849 | } | ||
850 | |||
851 | done_prefixes: | ||
852 | |||
853 | /* REX prefix. */ | ||
854 | if (c->rex_prefix) | ||
855 | if (c->rex_prefix & 8) | ||
856 | c->op_bytes = 8; /* REX.W */ | ||
857 | |||
858 | /* Opcode byte(s). */ | ||
859 | c->d = opcode_table[c->b]; | ||
860 | if (c->d == 0) { | ||
861 | /* Two-byte opcode? */ | ||
862 | if (c->b == 0x0f) { | ||
863 | c->twobyte = 1; | ||
864 | c->b = insn_fetch(u8, 1, c->eip); | ||
865 | c->d = twobyte_table[c->b]; | ||
866 | } | ||
867 | |||
868 | /* Unrecognised? */ | ||
869 | if (c->d == 0) { | ||
870 | DPRINTF("Cannot emulate %02x\n", c->b); | ||
871 | return -1; | ||
872 | } | ||
873 | } | ||
874 | |||
875 | if (mode == X86EMUL_MODE_PROT64 && (c->d & Stack)) | ||
876 | c->op_bytes = 8; | ||
877 | |||
878 | /* ModRM and SIB bytes. */ | ||
879 | if (c->d & ModRM) | ||
880 | rc = decode_modrm(ctxt, ops); | ||
881 | else if (c->d & MemAbs) | ||
882 | rc = decode_abs(ctxt, ops); | ||
883 | if (rc) | ||
884 | goto done; | ||
885 | |||
886 | if (!c->override_base) | ||
887 | c->override_base = &ctxt->ds_base; | ||
888 | if (mode == X86EMUL_MODE_PROT64 && | ||
889 | c->override_base != &ctxt->fs_base && | ||
890 | c->override_base != &ctxt->gs_base) | ||
891 | c->override_base = NULL; | ||
892 | |||
893 | if (c->override_base) | ||
894 | c->modrm_ea += *c->override_base; | ||
895 | |||
896 | if (c->ad_bytes != 8) | ||
897 | c->modrm_ea = (u32)c->modrm_ea; | ||
898 | /* | ||
899 | * Decode and fetch the source operand: register, memory | ||
900 | * or immediate. | ||
901 | */ | ||
902 | switch (c->d & SrcMask) { | ||
903 | case SrcNone: | ||
904 | break; | ||
905 | case SrcReg: | ||
906 | decode_register_operand(&c->src, c, 0); | ||
907 | break; | ||
908 | case SrcMem16: | ||
909 | c->src.bytes = 2; | ||
910 | goto srcmem_common; | ||
911 | case SrcMem32: | ||
912 | c->src.bytes = 4; | ||
913 | goto srcmem_common; | ||
914 | case SrcMem: | ||
915 | c->src.bytes = (c->d & ByteOp) ? 1 : | ||
916 | c->op_bytes; | ||
917 | /* Don't fetch the address for invlpg: it could be unmapped. */ | ||
918 | if (c->twobyte && c->b == 0x01 && c->modrm_reg == 7) | ||
919 | break; | ||
920 | srcmem_common: | ||
921 | /* | ||
922 | * For instructions with a ModR/M byte, switch to register | ||
923 | * access if Mod = 3. | ||
924 | */ | ||
925 | if ((c->d & ModRM) && c->modrm_mod == 3) { | ||
926 | c->src.type = OP_REG; | ||
927 | break; | ||
928 | } | ||
929 | c->src.type = OP_MEM; | ||
930 | break; | ||
931 | case SrcImm: | ||
932 | c->src.type = OP_IMM; | ||
933 | c->src.ptr = (unsigned long *)c->eip; | ||
934 | c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; | ||
935 | if (c->src.bytes == 8) | ||
936 | c->src.bytes = 4; | ||
937 | /* NB. Immediates are sign-extended as necessary. */ | ||
938 | switch (c->src.bytes) { | ||
939 | case 1: | ||
940 | c->src.val = insn_fetch(s8, 1, c->eip); | ||
941 | break; | ||
942 | case 2: | ||
943 | c->src.val = insn_fetch(s16, 2, c->eip); | ||
944 | break; | ||
945 | case 4: | ||
946 | c->src.val = insn_fetch(s32, 4, c->eip); | ||
947 | break; | ||
948 | } | ||
949 | break; | ||
950 | case SrcImmByte: | ||
951 | c->src.type = OP_IMM; | ||
952 | c->src.ptr = (unsigned long *)c->eip; | ||
953 | c->src.bytes = 1; | ||
954 | c->src.val = insn_fetch(s8, 1, c->eip); | ||
955 | break; | ||
956 | } | ||
957 | |||
958 | /* Decode and fetch the destination operand: register or memory. */ | ||
959 | switch (c->d & DstMask) { | ||
960 | case ImplicitOps: | ||
961 | /* Special instructions do their own operand decoding. */ | ||
962 | return 0; | ||
963 | case DstReg: | ||
964 | decode_register_operand(&c->dst, c, | ||
965 | c->twobyte && (c->b == 0xb6 || c->b == 0xb7)); | ||
966 | break; | ||
967 | case DstMem: | ||
968 | if ((c->d & ModRM) && c->modrm_mod == 3) { | ||
969 | c->dst.type = OP_REG; | ||
970 | break; | ||
971 | } | ||
972 | c->dst.type = OP_MEM; | ||
973 | break; | ||
974 | } | ||
975 | |||
976 | done: | ||
977 | return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0; | ||
978 | } | ||
979 | |||
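One detail of the prefix loop above that is easy to misread: the 0x66/0x67 overrides toggle sizes by XOR against the mode's default rather than assigning a fixed value (plain arithmetic, all cases):

	/* 0x66: op_bytes = def_op_bytes ^ 6    2 <-> 4
	 * 0x67: ad_bytes = def_ad_bytes ^ 12   4 <-> 8  (64-bit mode)
	 *       ad_bytes = def_ad_bytes ^ 6    2 <-> 4  (16/32-bit modes)
	 */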
980 | static inline void emulate_push(struct x86_emulate_ctxt *ctxt) | ||
981 | { | ||
982 | struct decode_cache *c = &ctxt->decode; | ||
983 | |||
984 | c->dst.type = OP_MEM; | ||
985 | c->dst.bytes = c->op_bytes; | ||
986 | c->dst.val = c->src.val; | ||
987 | register_address_increment(c->regs[VCPU_REGS_RSP], -c->op_bytes); | ||
988 | c->dst.ptr = (void *) register_address(ctxt->ss_base, | ||
989 | c->regs[VCPU_REGS_RSP]); | ||
990 | } | ||
991 | |||
992 | static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt, | ||
993 | struct x86_emulate_ops *ops) | ||
994 | { | ||
995 | struct decode_cache *c = &ctxt->decode; | ||
996 | int rc; | ||
997 | |||
998 | rc = ops->read_std(register_address(ctxt->ss_base, | ||
999 | c->regs[VCPU_REGS_RSP]), | ||
1000 | &c->dst.val, c->dst.bytes, ctxt->vcpu); | ||
1001 | if (rc != 0) | ||
1002 | return rc; | ||
1003 | |||
1004 | register_address_increment(c->regs[VCPU_REGS_RSP], c->dst.bytes); | ||
1005 | |||
1006 | return 0; | ||
1007 | } | ||
1008 | |||
1009 | static inline void emulate_grp2(struct x86_emulate_ctxt *ctxt) | ||
1010 | { | ||
1011 | struct decode_cache *c = &ctxt->decode; | ||
1012 | switch (c->modrm_reg) { | ||
1013 | case 0: /* rol */ | ||
1014 | emulate_2op_SrcB("rol", c->src, c->dst, ctxt->eflags); | ||
1015 | break; | ||
1016 | case 1: /* ror */ | ||
1017 | emulate_2op_SrcB("ror", c->src, c->dst, ctxt->eflags); | ||
1018 | break; | ||
1019 | case 2: /* rcl */ | ||
1020 | emulate_2op_SrcB("rcl", c->src, c->dst, ctxt->eflags); | ||
1021 | break; | ||
1022 | case 3: /* rcr */ | ||
1023 | emulate_2op_SrcB("rcr", c->src, c->dst, ctxt->eflags); | ||
1024 | break; | ||
1025 | case 4: /* sal/shl */ | ||
1026 | case 6: /* sal/shl */ | ||
1027 | emulate_2op_SrcB("sal", c->src, c->dst, ctxt->eflags); | ||
1028 | break; | ||
1029 | case 5: /* shr */ | ||
1030 | emulate_2op_SrcB("shr", c->src, c->dst, ctxt->eflags); | ||
1031 | break; | ||
1032 | case 7: /* sar */ | ||
1033 | emulate_2op_SrcB("sar", c->src, c->dst, ctxt->eflags); | ||
1034 | break; | ||
1035 | } | ||
1036 | } | ||
1037 | |||
1038 | static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt, | ||
1039 | struct x86_emulate_ops *ops) | ||
1040 | { | ||
1041 | struct decode_cache *c = &ctxt->decode; | ||
1042 | int rc = 0; | ||
1043 | |||
1044 | switch (c->modrm_reg) { | ||
1045 | case 0 ... 1: /* test */ | ||
1046 | /* | ||
1047 | * Special case in Grp3: test has an immediate | ||
1048 | * source operand. | ||
1049 | */ | ||
1050 | c->src.type = OP_IMM; | ||
1051 | c->src.ptr = (unsigned long *)c->eip; | ||
1052 | c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; | ||
1053 | if (c->src.bytes == 8) | ||
1054 | c->src.bytes = 4; | ||
1055 | switch (c->src.bytes) { | ||
1056 | case 1: | ||
1057 | c->src.val = insn_fetch(s8, 1, c->eip); | ||
1058 | break; | ||
1059 | case 2: | ||
1060 | c->src.val = insn_fetch(s16, 2, c->eip); | ||
1061 | break; | ||
1062 | case 4: | ||
1063 | c->src.val = insn_fetch(s32, 4, c->eip); | ||
1064 | break; | ||
1065 | } | ||
1066 | emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags); | ||
1067 | break; | ||
1068 | case 2: /* not */ | ||
1069 | c->dst.val = ~c->dst.val; | ||
1070 | break; | ||
1071 | case 3: /* neg */ | ||
1072 | emulate_1op("neg", c->dst, ctxt->eflags); | ||
1073 | break; | ||
1074 | default: | ||
1075 | DPRINTF("Cannot emulate %02x\n", c->b); | ||
1076 | rc = X86EMUL_UNHANDLEABLE; | ||
1077 | break; | ||
1078 | } | ||
1079 | done: | ||
1080 | return rc; | ||
1081 | } | ||
1082 | |||
1083 | static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt, | ||
1084 | struct x86_emulate_ops *ops) | ||
1085 | { | ||
1086 | struct decode_cache *c = &ctxt->decode; | ||
1087 | int rc; | ||
1088 | |||
1089 | switch (c->modrm_reg) { | ||
1090 | case 0: /* inc */ | ||
1091 | emulate_1op("inc", c->dst, ctxt->eflags); | ||
1092 | break; | ||
1093 | case 1: /* dec */ | ||
1094 | emulate_1op("dec", c->dst, ctxt->eflags); | ||
1095 | break; | ||
1096 | case 4: /* jmp abs */ | ||
1097 | if (c->b == 0xff) | ||
1098 | c->eip = c->dst.val; | ||
1099 | else { | ||
1100 | DPRINTF("Cannot emulate %02x\n", c->b); | ||
1101 | return X86EMUL_UNHANDLEABLE; | ||
1102 | } | ||
1103 | break; | ||
1104 | case 6: /* push */ | ||
1105 | |||
1106 | /* 64-bit mode: PUSH always pushes a 64-bit operand. */ | ||
1107 | |||
1108 | if (ctxt->mode == X86EMUL_MODE_PROT64) { | ||
1109 | c->dst.bytes = 8; | ||
1110 | rc = ops->read_std((unsigned long)c->dst.ptr, | ||
1111 | &c->dst.val, 8, ctxt->vcpu); | ||
1112 | if (rc != 0) | ||
1113 | return rc; | ||
1114 | } | ||
1115 | register_address_increment(c->regs[VCPU_REGS_RSP], | ||
1116 | -c->dst.bytes); | ||
1117 | rc = ops->write_emulated(register_address(ctxt->ss_base, | ||
1118 | c->regs[VCPU_REGS_RSP]), &c->dst.val, | ||
1119 | c->dst.bytes, ctxt->vcpu); | ||
1120 | if (rc != 0) | ||
1121 | return rc; | ||
1122 | c->dst.type = OP_NONE; | ||
1123 | break; | ||
1124 | default: | ||
1125 | DPRINTF("Cannot emulate %02x\n", c->b); | ||
1126 | return X86EMUL_UNHANDLEABLE; | ||
1127 | } | ||
1128 | return 0; | ||
1129 | } | ||
1130 | |||
1131 | static inline int emulate_grp9(struct x86_emulate_ctxt *ctxt, | ||
1132 | struct x86_emulate_ops *ops, | ||
1133 | unsigned long memop) | ||
1134 | { | ||
1135 | struct decode_cache *c = &ctxt->decode; | ||
1136 | u64 old, new; | ||
1137 | int rc; | ||
1138 | |||
1139 | rc = ops->read_emulated(memop, &old, 8, ctxt->vcpu); | ||
1140 | if (rc != 0) | ||
1141 | return rc; | ||
1142 | |||
1143 | if (((u32) (old >> 0) != (u32) c->regs[VCPU_REGS_RAX]) || | ||
1144 | ((u32) (old >> 32) != (u32) c->regs[VCPU_REGS_RDX])) { | ||
1145 | |||
1146 | c->regs[VCPU_REGS_RAX] = (u32) (old >> 0); | ||
1147 | c->regs[VCPU_REGS_RDX] = (u32) (old >> 32); | ||
1148 | ctxt->eflags &= ~EFLG_ZF; | ||
1149 | |||
1150 | } else { | ||
1151 | new = ((u64)c->regs[VCPU_REGS_RCX] << 32) | | ||
1152 | (u32) c->regs[VCPU_REGS_RBX]; | ||
1153 | |||
1154 | rc = ops->cmpxchg_emulated(memop, &old, &new, 8, ctxt->vcpu); | ||
1155 | if (rc != 0) | ||
1156 | return rc; | ||
1157 | ctxt->eflags |= EFLG_ZF; | ||
1158 | } | ||
1159 | return 0; | ||
1160 | } | ||
1161 | |||
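emulate_grp9() above implements CMPXCHG8B. A minimal user-space model of the same semantics (illustrative: plain C, without the LOCK-prefix atomicity the real instruction provides):

	#include <stdint.h>
	#include <stdio.h>

	static int cmpxchg8b(uint64_t *mem, uint32_t *eax, uint32_t *edx,
			     uint32_t ebx, uint32_t ecx)
	{
		uint64_t expect = ((uint64_t)*edx << 32) | *eax;

		if (*mem == expect) {
			*mem = ((uint64_t)ecx << 32) | ebx;
			return 1;			/* ZF = 1 */
		}
		*eax = (uint32_t)*mem;
		*edx = (uint32_t)(*mem >> 32);
		return 0;				/* ZF = 0 */
	}

	int main(void)
	{
		uint64_t mem = 0x1111111122222222ULL;
		uint32_t eax = 0x22222222, edx = 0x11111111;

		printf("zf=%d mem=%#llx\n",
		       cmpxchg8b(&mem, &eax, &edx, 0xbbbbbbbb, 0xaaaaaaaa),
		       (unsigned long long)mem); /* zf=1 mem=0xaaaaaaaabbbbbbbb */
		return 0;
	}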
1162 | static inline int writeback(struct x86_emulate_ctxt *ctxt, | ||
1163 | struct x86_emulate_ops *ops) | ||
1164 | { | ||
1165 | int rc; | ||
1166 | struct decode_cache *c = &ctxt->decode; | ||
1167 | |||
1168 | switch (c->dst.type) { | ||
1169 | case OP_REG: | ||
1170 | /* The 4-byte case *is* correct: | ||
1171 | * in 64-bit mode we zero-extend. | ||
1172 | */ | ||
1173 | switch (c->dst.bytes) { | ||
1174 | case 1: | ||
1175 | *(u8 *)c->dst.ptr = (u8)c->dst.val; | ||
1176 | break; | ||
1177 | case 2: | ||
1178 | *(u16 *)c->dst.ptr = (u16)c->dst.val; | ||
1179 | break; | ||
1180 | case 4: | ||
1181 | *c->dst.ptr = (u32)c->dst.val; | ||
1182 | break; /* 64b: zero-ext */ | ||
1183 | case 8: | ||
1184 | *c->dst.ptr = c->dst.val; | ||
1185 | break; | ||
1186 | } | ||
1187 | break; | ||
1188 | case OP_MEM: | ||
1189 | if (c->lock_prefix) | ||
1190 | rc = ops->cmpxchg_emulated( | ||
1191 | (unsigned long)c->dst.ptr, | ||
1192 | &c->dst.orig_val, | ||
1193 | &c->dst.val, | ||
1194 | c->dst.bytes, | ||
1195 | ctxt->vcpu); | ||
1196 | else | ||
1197 | rc = ops->write_emulated( | ||
1198 | (unsigned long)c->dst.ptr, | ||
1199 | &c->dst.val, | ||
1200 | c->dst.bytes, | ||
1201 | ctxt->vcpu); | ||
1202 | if (rc != 0) | ||
1203 | return rc; | ||
1204 | break; | ||
1205 | case OP_NONE: | ||
1206 | /* no writeback */ | ||
1207 | break; | ||
1208 | default: | ||
1209 | break; | ||
1210 | } | ||
1211 | return 0; | ||
1212 | } | ||
1213 | |||
1214 | int | ||
1215 | x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) | ||
1216 | { | ||
1217 | unsigned long memop = 0; | ||
1218 | u64 msr_data; | ||
1219 | unsigned long saved_eip = 0; | ||
1220 | struct decode_cache *c = &ctxt->decode; | ||
1221 | int rc = 0; | ||
1222 | |||
1223 | /* Shadow copy of register state. Committed on successful emulation. | ||
1224 | * NOTE: we can copy them from vcpu as x86_decode_insn() doesn't | ||
1225 | * modify them. | ||
1226 | */ | ||
1227 | |||
1228 | memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs); | ||
1229 | saved_eip = c->eip; | ||
1230 | |||
1231 | if (((c->d & ModRM) && (c->modrm_mod != 3)) || (c->d & MemAbs)) | ||
1232 | memop = c->modrm_ea; | ||
1233 | |||
1234 | if (c->rep_prefix && (c->d & String)) { | ||
1235 | /* All REP prefixes have the same first termination condition */ | ||
1236 | if (c->regs[VCPU_REGS_RCX] == 0) { | ||
1237 | ctxt->vcpu->arch.rip = c->eip; | ||
1238 | goto done; | ||
1239 | } | ||
1240 | /* The second termination condition only applies for REPE | ||
1241 | * and REPNE. If the repeat string operation prefix is | ||
1242 | * REPE/REPZ or REPNE/REPNZ, check the corresponding | ||
1243 | * termination condition: | ||
1244 | * - if REPE/REPZ and ZF = 0 then done | ||
1245 | * - if REPNE/REPNZ and ZF = 1 then done | ||
1246 | */ | ||
1247 | if ((c->b == 0xa6) || (c->b == 0xa7) || | ||
1248 | (c->b == 0xae) || (c->b == 0xaf)) { | ||
1249 | if ((c->rep_prefix == REPE_PREFIX) && | ||
1250 | ((ctxt->eflags & EFLG_ZF) == 0)) { | ||
1251 | ctxt->vcpu->arch.rip = c->eip; | ||
1252 | goto done; | ||
1253 | } | ||
1254 | if ((c->rep_prefix == REPNE_PREFIX) && | ||
1255 | ((ctxt->eflags & EFLG_ZF) == EFLG_ZF)) { | ||
1256 | ctxt->vcpu->arch.rip = c->eip; | ||
1257 | goto done; | ||
1258 | } | ||
1259 | } | ||
1260 | c->regs[VCPU_REGS_RCX]--; | ||
1261 | c->eip = ctxt->vcpu->arch.rip; | ||
1262 | } | ||
1263 | |||
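	/* Worked examples of the two termination rules above (illustration
	 * only):
	 *   f3 a4  rep movsb   - 0xa4 is not one of a6/a7/ae/af, so only
	 *			  RCX reaching zero ends the loop;
	 *   f3 a6  repe cmpsb  - additionally ends once ZF == 0;
	 *   f2 ae  repne scasb - additionally ends once ZF == 1.
	 */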
1264 | if (c->src.type == OP_MEM) { | ||
1265 | c->src.ptr = (unsigned long *)memop; | ||
1266 | c->src.val = 0; | ||
1267 | rc = ops->read_emulated((unsigned long)c->src.ptr, | ||
1268 | &c->src.val, | ||
1269 | c->src.bytes, | ||
1270 | ctxt->vcpu); | ||
1271 | if (rc != 0) | ||
1272 | goto done; | ||
1273 | c->src.orig_val = c->src.val; | ||
1274 | } | ||
1275 | |||
1276 | if ((c->d & DstMask) == ImplicitOps) | ||
1277 | goto special_insn; | ||
1278 | |||
1279 | |||
1280 | if (c->dst.type == OP_MEM) { | ||
1281 | c->dst.ptr = (unsigned long *)memop; | ||
1282 | c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; | ||
1283 | c->dst.val = 0; | ||
1284 | if (c->d & BitOp) { | ||
1285 | unsigned long mask = ~(c->dst.bytes * 8 - 1); | ||
1286 | |||
1287 | c->dst.ptr = (void *)c->dst.ptr + | ||
1288 | (c->src.val & mask) / 8; | ||
1289 | } | ||
1290 | if (!(c->d & Mov) && | ||
1291 | /* optimisation - avoid slow emulated read */ | ||
1292 | ((rc = ops->read_emulated((unsigned long)c->dst.ptr, | ||
1293 | &c->dst.val, | ||
1294 | c->dst.bytes, ctxt->vcpu)) != 0)) | ||
1295 | goto done; | ||
1296 | } | ||
1297 | c->dst.orig_val = c->dst.val; | ||
1298 | |||
1299 | special_insn: | ||
1300 | |||
1301 | if (c->twobyte) | ||
1302 | goto twobyte_insn; | ||
1303 | |||
1304 | switch (c->b) { | ||
1305 | case 0x00 ... 0x05: | ||
1306 | add: /* add */ | ||
1307 | emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags); | ||
1308 | break; | ||
1309 | case 0x08 ... 0x0d: | ||
1310 | or: /* or */ | ||
1311 | emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags); | ||
1312 | break; | ||
1313 | case 0x10 ... 0x15: | ||
1314 | adc: /* adc */ | ||
1315 | emulate_2op_SrcV("adc", c->src, c->dst, ctxt->eflags); | ||
1316 | break; | ||
1317 | case 0x18 ... 0x1d: | ||
1318 | sbb: /* sbb */ | ||
1319 | emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags); | ||
1320 | break; | ||
1321 | case 0x20 ... 0x23: | ||
1322 | and: /* and */ | ||
1323 | emulate_2op_SrcV("and", c->src, c->dst, ctxt->eflags); | ||
1324 | break; | ||
1325 | case 0x24: /* and al imm8 */ | ||
1326 | c->dst.type = OP_REG; | ||
1327 | c->dst.ptr = &c->regs[VCPU_REGS_RAX]; | ||
1328 | c->dst.val = *(u8 *)c->dst.ptr; | ||
1329 | c->dst.bytes = 1; | ||
1330 | c->dst.orig_val = c->dst.val; | ||
1331 | goto and; | ||
1332 | case 0x25: /* and ax imm16, or eax imm32 */ | ||
1333 | c->dst.type = OP_REG; | ||
1334 | c->dst.bytes = c->op_bytes; | ||
1335 | c->dst.ptr = &c->regs[VCPU_REGS_RAX]; | ||
1336 | if (c->op_bytes == 2) | ||
1337 | c->dst.val = *(u16 *)c->dst.ptr; | ||
1338 | else | ||
1339 | c->dst.val = *(u32 *)c->dst.ptr; | ||
1340 | c->dst.orig_val = c->dst.val; | ||
1341 | goto and; | ||
1342 | case 0x28 ... 0x2d: | ||
1343 | sub: /* sub */ | ||
1344 | emulate_2op_SrcV("sub", c->src, c->dst, ctxt->eflags); | ||
1345 | break; | ||
1346 | case 0x30 ... 0x35: | ||
1347 | xor: /* xor */ | ||
1348 | emulate_2op_SrcV("xor", c->src, c->dst, ctxt->eflags); | ||
1349 | break; | ||
1350 | case 0x38 ... 0x3d: | ||
1351 | cmp: /* cmp */ | ||
1352 | emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags); | ||
1353 | break; | ||
1354 | case 0x40 ... 0x47: /* inc r16/r32 */ | ||
1355 | emulate_1op("inc", c->dst, ctxt->eflags); | ||
1356 | break; | ||
1357 | case 0x48 ... 0x4f: /* dec r16/r32 */ | ||
1358 | emulate_1op("dec", c->dst, ctxt->eflags); | ||
1359 | break; | ||
1360 | case 0x50 ... 0x57: /* push reg */ | ||
1361 | c->dst.type = OP_MEM; | ||
1362 | c->dst.bytes = c->op_bytes; | ||
1363 | c->dst.val = c->src.val; | ||
1364 | register_address_increment(c->regs[VCPU_REGS_RSP], | ||
1365 | -c->op_bytes); | ||
1366 | c->dst.ptr = (void *) register_address( | ||
1367 | ctxt->ss_base, c->regs[VCPU_REGS_RSP]); | ||
1368 | break; | ||
1369 | case 0x58 ... 0x5f: /* pop reg */ | ||
1370 | pop_instruction: | ||
1371 | if ((rc = ops->read_std(register_address(ctxt->ss_base, | ||
1372 | c->regs[VCPU_REGS_RSP]), c->dst.ptr, | ||
1373 | c->op_bytes, ctxt->vcpu)) != 0) | ||
1374 | goto done; | ||
1375 | |||
1376 | register_address_increment(c->regs[VCPU_REGS_RSP], | ||
1377 | c->op_bytes); | ||
1378 | c->dst.type = OP_NONE; /* Disable writeback. */ | ||
1379 | break; | ||
1380 | case 0x63: /* movsxd */ | ||
1381 | if (ctxt->mode != X86EMUL_MODE_PROT64) | ||
1382 | goto cannot_emulate; | ||
1383 | c->dst.val = (s32) c->src.val; | ||
1384 | break; | ||
1385 | case 0x6a: /* push imm8 */ | ||
1386 | c->src.val = 0L; | ||
1387 | c->src.val = insn_fetch(s8, 1, c->eip); | ||
1388 | emulate_push(ctxt); | ||
1389 | break; | ||
1390 | case 0x6c: /* insb */ | ||
1391 | case 0x6d: /* insw/insd */ | ||
1392 | if (kvm_emulate_pio_string(ctxt->vcpu, NULL, | ||
1393 | 1, | ||
1394 | (c->d & ByteOp) ? 1 : c->op_bytes, | ||
1395 | c->rep_prefix ? | ||
1396 | address_mask(c->regs[VCPU_REGS_RCX]) : 1, | ||
1397 | (ctxt->eflags & EFLG_DF), | ||
1398 | register_address(ctxt->es_base, | ||
1399 | c->regs[VCPU_REGS_RDI]), | ||
1400 | c->rep_prefix, | ||
1401 | c->regs[VCPU_REGS_RDX]) == 0) { | ||
1402 | c->eip = saved_eip; | ||
1403 | return -1; | ||
1404 | } | ||
1405 | return 0; | ||
1406 | case 0x6e: /* outsb */ | ||
1407 | case 0x6f: /* outsw/outsd */ | ||
1408 | if (kvm_emulate_pio_string(ctxt->vcpu, NULL, | ||
1409 | 0, | ||
1410 | (c->d & ByteOp) ? 1 : c->op_bytes, | ||
1411 | c->rep_prefix ? | ||
1412 | address_mask(c->regs[VCPU_REGS_RCX]) : 1, | ||
1413 | (ctxt->eflags & EFLG_DF), | ||
1414 | register_address(c->override_base ? | ||
1415 | *c->override_base : | ||
1416 | ctxt->ds_base, | ||
1417 | c->regs[VCPU_REGS_RSI]), | ||
1418 | c->rep_prefix, | ||
1419 | c->regs[VCPU_REGS_RDX]) == 0) { | ||
1420 | c->eip = saved_eip; | ||
1421 | return -1; | ||
1422 | } | ||
1423 | return 0; | ||
1424 | case 0x70 ... 0x7f: /* jcc (short) */ { | ||
1425 | int rel = insn_fetch(s8, 1, c->eip); | ||
1426 | |||
1427 | if (test_cc(c->b, ctxt->eflags)) | ||
1428 | JMP_REL(rel); | ||
1429 | break; | ||
1430 | } | ||
1431 | case 0x80 ... 0x83: /* Grp1 */ | ||
1432 | switch (c->modrm_reg) { | ||
1433 | case 0: | ||
1434 | goto add; | ||
1435 | case 1: | ||
1436 | goto or; | ||
1437 | case 2: | ||
1438 | goto adc; | ||
1439 | case 3: | ||
1440 | goto sbb; | ||
1441 | case 4: | ||
1442 | goto and; | ||
1443 | case 5: | ||
1444 | goto sub; | ||
1445 | case 6: | ||
1446 | goto xor; | ||
1447 | case 7: | ||
1448 | goto cmp; | ||
1449 | } | ||
1450 | break; | ||
1451 | case 0x84 ... 0x85: | ||
1452 | emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags); | ||
1453 | break; | ||
1454 | case 0x86 ... 0x87: /* xchg */ | ||
1455 | /* Write back the register source. */ | ||
1456 | switch (c->dst.bytes) { | ||
1457 | case 1: | ||
1458 | *(u8 *) c->src.ptr = (u8) c->dst.val; | ||
1459 | break; | ||
1460 | case 2: | ||
1461 | *(u16 *) c->src.ptr = (u16) c->dst.val; | ||
1462 | break; | ||
1463 | case 4: | ||
1464 | *c->src.ptr = (u32) c->dst.val; | ||
1465 | break; /* 64b reg: zero-extend */ | ||
1466 | case 8: | ||
1467 | *c->src.ptr = c->dst.val; | ||
1468 | break; | ||
1469 | } | ||
1470 | /* | ||
1471 | * Write back the memory destination with implicit LOCK | ||
1472 | * prefix. | ||
1473 | */ | ||
1474 | c->dst.val = c->src.val; | ||
1475 | c->lock_prefix = 1; | ||
1476 | break; | ||
1477 | case 0x88 ... 0x8b: /* mov */ | ||
1478 | goto mov; | ||
1479 | case 0x8d: /* lea r16/r32, m */ | ||
1480 | c->dst.val = c->modrm_val; | ||
1481 | break; | ||
1482 | case 0x8f: /* pop (sole member of Grp1a) */ | ||
1483 | rc = emulate_grp1a(ctxt, ops); | ||
1484 | if (rc != 0) | ||
1485 | goto done; | ||
1486 | break; | ||
1487 | case 0x9c: /* pushf */ | ||
1488 | c->src.val = (unsigned long) ctxt->eflags; | ||
1489 | emulate_push(ctxt); | ||
1490 | break; | ||
1491 | case 0x9d: /* popf */ | ||
1492 | c->dst.ptr = (unsigned long *) &ctxt->eflags; | ||
1493 | goto pop_instruction; | ||
1494 | case 0xa0 ... 0xa1: /* mov */ | ||
1495 | c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX]; | ||
1496 | c->dst.val = c->src.val; | ||
1497 | break; | ||
1498 | case 0xa2 ... 0xa3: /* mov */ | ||
1499 | c->dst.val = (unsigned long)c->regs[VCPU_REGS_RAX]; | ||
1500 | break; | ||
1501 | case 0xa4 ... 0xa5: /* movs */ | ||
1502 | c->dst.type = OP_MEM; | ||
1503 | c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; | ||
1504 | c->dst.ptr = (unsigned long *)register_address( | ||
1505 | ctxt->es_base, | ||
1506 | c->regs[VCPU_REGS_RDI]); | ||
1507 | if ((rc = ops->read_emulated(register_address( | ||
1508 | c->override_base ? *c->override_base : | ||
1509 | ctxt->ds_base, | ||
1510 | c->regs[VCPU_REGS_RSI]), | ||
1511 | &c->dst.val, | ||
1512 | c->dst.bytes, ctxt->vcpu)) != 0) | ||
1513 | goto done; | ||
1514 | register_address_increment(c->regs[VCPU_REGS_RSI], | ||
1515 | (ctxt->eflags & EFLG_DF) ? -c->dst.bytes | ||
1516 | : c->dst.bytes); | ||
1517 | register_address_increment(c->regs[VCPU_REGS_RDI], | ||
1518 | (ctxt->eflags & EFLG_DF) ? -c->dst.bytes | ||
1519 | : c->dst.bytes); | ||
1520 | break; | ||
1521 | case 0xa6 ... 0xa7: /* cmps */ | ||
1522 | c->src.type = OP_NONE; /* Disable writeback. */ | ||
1523 | c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; | ||
1524 | c->src.ptr = (unsigned long *)register_address( | ||
1525 | c->override_base ? *c->override_base : | ||
1526 | ctxt->ds_base, | ||
1527 | c->regs[VCPU_REGS_RSI]); | ||
1528 | if ((rc = ops->read_emulated((unsigned long)c->src.ptr, | ||
1529 | &c->src.val, | ||
1530 | c->src.bytes, | ||
1531 | ctxt->vcpu)) != 0) | ||
1532 | goto done; | ||
1533 | |||
1534 | c->dst.type = OP_NONE; /* Disable writeback. */ | ||
1535 | c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; | ||
1536 | c->dst.ptr = (unsigned long *)register_address( | ||
1537 | ctxt->es_base, | ||
1538 | c->regs[VCPU_REGS_RDI]); | ||
1539 | if ((rc = ops->read_emulated((unsigned long)c->dst.ptr, | ||
1540 | &c->dst.val, | ||
1541 | c->dst.bytes, | ||
1542 | ctxt->vcpu)) != 0) | ||
1543 | goto done; | ||
1544 | |||
1545 | DPRINTF("cmps: mem1=0x%p mem2=0x%p\n", c->src.ptr, c->dst.ptr); | ||
1546 | |||
1547 | emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags); | ||
1548 | |||
1549 | register_address_increment(c->regs[VCPU_REGS_RSI], | ||
1550 | (ctxt->eflags & EFLG_DF) ? -c->src.bytes | ||
1551 | : c->src.bytes); | ||
1552 | register_address_increment(c->regs[VCPU_REGS_RDI], | ||
1553 | (ctxt->eflags & EFLG_DF) ? -c->dst.bytes | ||
1554 | : c->dst.bytes); | ||
1555 | |||
1556 | break; | ||
1557 | case 0xaa ... 0xab: /* stos */ | ||
1558 | c->dst.type = OP_MEM; | ||
1559 | c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; | ||
1560 | c->dst.ptr = (unsigned long *)register_address( | ||
1561 | ctxt->es_base, | ||
1562 | c->regs[VCPU_REGS_RDI]); | ||
1563 | c->dst.val = c->regs[VCPU_REGS_RAX]; | ||
1564 | register_address_increment(c->regs[VCPU_REGS_RDI], | ||
1565 | (ctxt->eflags & EFLG_DF) ? -c->dst.bytes | ||
1566 | : c->dst.bytes); | ||
1567 | break; | ||
1568 | case 0xac ... 0xad: /* lods */ | ||
1569 | c->dst.type = OP_REG; | ||
1570 | c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; | ||
1571 | c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX]; | ||
1572 | if ((rc = ops->read_emulated(register_address( | ||
1573 | c->override_base ? *c->override_base : | ||
1574 | ctxt->ds_base, | ||
1575 | c->regs[VCPU_REGS_RSI]), | ||
1576 | &c->dst.val, | ||
1577 | c->dst.bytes, | ||
1578 | ctxt->vcpu)) != 0) | ||
1579 | goto done; | ||
1580 | register_address_increment(c->regs[VCPU_REGS_RSI], | ||
1581 | (ctxt->eflags & EFLG_DF) ? -c->dst.bytes | ||
1582 | : c->dst.bytes); | ||
1583 | break; | ||
1584 | case 0xae ... 0xaf: /* scas */ | ||
1585 | DPRINTF("Urk! I don't handle SCAS.\n"); | ||
1586 | goto cannot_emulate; | ||
1587 | case 0xc0 ... 0xc1: | ||
1588 | emulate_grp2(ctxt); | ||
1589 | break; | ||
1590 | case 0xc3: /* ret */ | ||
1591 | c->dst.ptr = &c->eip; | ||
1592 | goto pop_instruction; | ||
1593 | case 0xc6 ... 0xc7: /* mov (sole member of Grp11) */ | ||
1594 | mov: | ||
1595 | c->dst.val = c->src.val; | ||
1596 | break; | ||
1597 | case 0xd0 ... 0xd1: /* Grp2 */ | ||
1598 | c->src.val = 1; | ||
1599 | emulate_grp2(ctxt); | ||
1600 | break; | ||
1601 | case 0xd2 ... 0xd3: /* Grp2 */ | ||
1602 | c->src.val = c->regs[VCPU_REGS_RCX]; | ||
1603 | emulate_grp2(ctxt); | ||
1604 | break; | ||
1605 | case 0xe8: /* call (near) */ { | ||
1606 | long int rel; | ||
1607 | switch (c->op_bytes) { | ||
1608 | case 2: | ||
1609 | rel = insn_fetch(s16, 2, c->eip); | ||
1610 | break; | ||
1611 | case 4: | ||
1612 | rel = insn_fetch(s32, 4, c->eip); | ||
1613 | break; | ||
1614 | default: | ||
1615 | DPRINTF("Call: Invalid op_bytes\n"); | ||
1616 | goto cannot_emulate; | ||
1617 | } | ||
1618 | c->src.val = (unsigned long) c->eip; | ||
1619 | JMP_REL(rel); | ||
1620 | c->op_bytes = c->ad_bytes; | ||
1621 | emulate_push(ctxt); | ||
1622 | break; | ||
1623 | } | ||
1624 | case 0xe9: /* jmp rel */ | ||
1625 | case 0xeb: /* jmp rel short */ | ||
1626 | JMP_REL(c->src.val); | ||
1627 | c->dst.type = OP_NONE; /* Disable writeback. */ | ||
1628 | break; | ||
1629 | case 0xf4: /* hlt */ | ||
1630 | ctxt->vcpu->arch.halt_request = 1; | ||
1631 | goto done; | ||
1632 | case 0xf5: /* cmc */ | ||
1633 | /* complement carry flag from eflags reg */ | ||
1634 | ctxt->eflags ^= EFLG_CF; | ||
1635 | c->dst.type = OP_NONE; /* Disable writeback. */ | ||
1636 | break; | ||
1637 | case 0xf6 ... 0xf7: /* Grp3 */ | ||
1638 | rc = emulate_grp3(ctxt, ops); | ||
1639 | if (rc != 0) | ||
1640 | goto done; | ||
1641 | break; | ||
1642 | case 0xf8: /* clc */ | ||
1643 | ctxt->eflags &= ~EFLG_CF; | ||
1644 | c->dst.type = OP_NONE; /* Disable writeback. */ | ||
1645 | break; | ||
1646 | case 0xfa: /* cli */ | ||
1647 | ctxt->eflags &= ~X86_EFLAGS_IF; | ||
1648 | c->dst.type = OP_NONE; /* Disable writeback. */ | ||
1649 | break; | ||
1650 | case 0xfb: /* sti */ | ||
1651 | ctxt->eflags |= X86_EFLAGS_IF; | ||
1652 | c->dst.type = OP_NONE; /* Disable writeback. */ | ||
1653 | break; | ||
1654 | case 0xfe ... 0xff: /* Grp4/Grp5 */ | ||
1655 | rc = emulate_grp45(ctxt, ops); | ||
1656 | if (rc != 0) | ||
1657 | goto done; | ||
1658 | break; | ||
1659 | } | ||
1660 | |||
1661 | writeback: | ||
1662 | rc = writeback(ctxt, ops); | ||
1663 | if (rc != 0) | ||
1664 | goto done; | ||
1665 | |||
1666 | /* Commit shadow register state. */ | ||
1667 | memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs); | ||
1668 | ctxt->vcpu->arch.rip = c->eip; | ||
1669 | |||
1670 | done: | ||
1671 | if (rc == X86EMUL_UNHANDLEABLE) { | ||
1672 | c->eip = saved_eip; | ||
1673 | return -1; | ||
1674 | } | ||
1675 | return 0; | ||
1676 | |||
1677 | twobyte_insn: | ||
1678 | switch (c->b) { | ||
1679 | case 0x01: /* lgdt, lidt, lmsw */ | ||
1680 | switch (c->modrm_reg) { | ||
1681 | u16 size; | ||
1682 | unsigned long address; | ||
1683 | |||
1684 | case 0: /* vmcall */ | ||
1685 | if (c->modrm_mod != 3 || c->modrm_rm != 1) | ||
1686 | goto cannot_emulate; | ||
1687 | |||
1688 | rc = kvm_fix_hypercall(ctxt->vcpu); | ||
1689 | if (rc) | ||
1690 | goto done; | ||
1691 | |||
1692 | kvm_emulate_hypercall(ctxt->vcpu); | ||
1693 | break; | ||
1694 | case 2: /* lgdt */ | ||
1695 | rc = read_descriptor(ctxt, ops, c->src.ptr, | ||
1696 | &size, &address, c->op_bytes); | ||
1697 | if (rc) | ||
1698 | goto done; | ||
1699 | realmode_lgdt(ctxt->vcpu, size, address); | ||
1700 | break; | ||
1701 | case 3: /* lidt/vmmcall */ | ||
1702 | if (c->modrm_mod == 3 && c->modrm_rm == 1) { | ||
1703 | rc = kvm_fix_hypercall(ctxt->vcpu); | ||
1704 | if (rc) | ||
1705 | goto done; | ||
1706 | kvm_emulate_hypercall(ctxt->vcpu); | ||
1707 | } else { | ||
1708 | rc = read_descriptor(ctxt, ops, c->src.ptr, | ||
1709 | &size, &address, | ||
1710 | c->op_bytes); | ||
1711 | if (rc) | ||
1712 | goto done; | ||
1713 | realmode_lidt(ctxt->vcpu, size, address); | ||
1714 | } | ||
1715 | break; | ||
1716 | case 4: /* smsw */ | ||
1717 | if (c->modrm_mod != 3) | ||
1718 | goto cannot_emulate; | ||
1719 | *(u16 *)&c->regs[c->modrm_rm] | ||
1720 | = realmode_get_cr(ctxt->vcpu, 0); | ||
1721 | break; | ||
1722 | case 6: /* lmsw */ | ||
1723 | if (c->modrm_mod != 3) | ||
1724 | goto cannot_emulate; | ||
1725 | realmode_lmsw(ctxt->vcpu, (u16)c->modrm_val, | ||
1726 | &ctxt->eflags); | ||
1727 | break; | ||
1728 | case 7: /* invlpg */ | ||
1729 | emulate_invlpg(ctxt->vcpu, memop); | ||
1730 | break; | ||
1731 | default: | ||
1732 | goto cannot_emulate; | ||
1733 | } | ||
1734 | /* Disable writeback. */ | ||
1735 | c->dst.type = OP_NONE; | ||
1736 | break; | ||
1737 | case 0x06: | ||
1738 | emulate_clts(ctxt->vcpu); | ||
1739 | c->dst.type = OP_NONE; | ||
1740 | break; | ||
1741 | case 0x08: /* invd */ | ||
1742 | case 0x09: /* wbinvd */ | ||
1743 | case 0x0d: /* GrpP (prefetch) */ | ||
1744 | case 0x18: /* Grp16 (prefetch/nop) */ | ||
1745 | c->dst.type = OP_NONE; | ||
1746 | break; | ||
1747 | case 0x20: /* mov cr, reg */ | ||
1748 | if (c->modrm_mod != 3) | ||
1749 | goto cannot_emulate; | ||
1750 | c->regs[c->modrm_rm] = | ||
1751 | realmode_get_cr(ctxt->vcpu, c->modrm_reg); | ||
1752 | c->dst.type = OP_NONE; /* no writeback */ | ||
1753 | break; | ||
1754 | case 0x21: /* mov from dr to reg */ | ||
1755 | if (c->modrm_mod != 3) | ||
1756 | goto cannot_emulate; | ||
1757 | rc = emulator_get_dr(ctxt, c->modrm_reg, &c->regs[c->modrm_rm]); | ||
1758 | if (rc) | ||
1759 | goto cannot_emulate; | ||
1760 | c->dst.type = OP_NONE; /* no writeback */ | ||
1761 | break; | ||
1762 | case 0x22: /* mov reg, cr */ | ||
1763 | if (c->modrm_mod != 3) | ||
1764 | goto cannot_emulate; | ||
1765 | realmode_set_cr(ctxt->vcpu, | ||
1766 | c->modrm_reg, c->modrm_val, &ctxt->eflags); | ||
1767 | c->dst.type = OP_NONE; | ||
1768 | break; | ||
1769 | case 0x23: /* mov from reg to dr */ | ||
1770 | if (c->modrm_mod != 3) | ||
1771 | goto cannot_emulate; | ||
1772 | rc = emulator_set_dr(ctxt, c->modrm_reg, | ||
1773 | c->regs[c->modrm_rm]); | ||
1774 | if (rc) | ||
1775 | goto cannot_emulate; | ||
1776 | c->dst.type = OP_NONE; /* no writeback */ | ||
1777 | break; | ||
1778 | case 0x30: | ||
1779 | /* wrmsr */ | ||
1780 | msr_data = (u32)c->regs[VCPU_REGS_RAX] | ||
1781 | | ((u64)c->regs[VCPU_REGS_RDX] << 32); | ||
1782 | rc = kvm_set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data); | ||
1783 | if (rc) { | ||
1784 | kvm_inject_gp(ctxt->vcpu, 0); | ||
1785 | c->eip = ctxt->vcpu->arch.rip; | ||
1786 | } | ||
1787 | rc = X86EMUL_CONTINUE; | ||
1788 | c->dst.type = OP_NONE; | ||
1789 | break; | ||
1790 | case 0x32: | ||
1791 | /* rdmsr */ | ||
1792 | rc = kvm_get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data); | ||
1793 | if (rc) { | ||
1794 | kvm_inject_gp(ctxt->vcpu, 0); | ||
1795 | c->eip = ctxt->vcpu->arch.rip; | ||
1796 | } else { | ||
1797 | c->regs[VCPU_REGS_RAX] = (u32)msr_data; | ||
1798 | c->regs[VCPU_REGS_RDX] = msr_data >> 32; | ||
1799 | } | ||
1800 | rc = X86EMUL_CONTINUE; | ||
1801 | c->dst.type = OP_NONE; | ||
1802 | break; | ||
1803 | case 0x40 ... 0x4f: /* cmov */ | ||
1804 | c->dst.val = c->dst.orig_val = c->src.val; | ||
1805 | if (!test_cc(c->b, ctxt->eflags)) | ||
1806 | c->dst.type = OP_NONE; /* no writeback */ | ||
1807 | break; | ||
1808 | case 0x80 ... 0x8f: /* jnz rel, etc */ { | ||
1809 | long int rel; | ||
1810 | |||
1811 | switch (c->op_bytes) { | ||
1812 | case 2: | ||
1813 | rel = insn_fetch(s16, 2, c->eip); | ||
1814 | break; | ||
1815 | case 4: | ||
1816 | rel = insn_fetch(s32, 4, c->eip); | ||
1817 | break; | ||
1818 | case 8: | ||
1819 | rel = insn_fetch(s64, 8, c->eip); | ||
1820 | break; | ||
1821 | default: | ||
1822 | DPRINTF("jnz: Invalid op_bytes\n"); | ||
1823 | goto cannot_emulate; | ||
1824 | } | ||
1825 | if (test_cc(c->b, ctxt->eflags)) | ||
1826 | JMP_REL(rel); | ||
1827 | c->dst.type = OP_NONE; | ||
1828 | break; | ||
1829 | } | ||
1830 | case 0xa3: | ||
1831 | bt: /* bt */ | ||
1832 | c->dst.type = OP_NONE; | ||
1833 | /* only subword offset */ | ||
1834 | c->src.val &= (c->dst.bytes << 3) - 1; | ||
1835 | emulate_2op_SrcV_nobyte("bt", c->src, c->dst, ctxt->eflags); | ||
1836 | break; | ||
1837 | case 0xab: | ||
1838 | bts: /* bts */ | ||
1839 | /* only subword offset */ | ||
1840 | c->src.val &= (c->dst.bytes << 3) - 1; | ||
1841 | emulate_2op_SrcV_nobyte("bts", c->src, c->dst, ctxt->eflags); | ||
1842 | break; | ||
1843 | case 0xb0 ... 0xb1: /* cmpxchg */ | ||
1844 | /* | ||
1845 | * Save real source value, then compare EAX against | ||
1846 | * destination. | ||
1847 | */ | ||
1848 | c->src.orig_val = c->src.val; | ||
1849 | c->src.val = c->regs[VCPU_REGS_RAX]; | ||
1850 | emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags); | ||
1851 | if (ctxt->eflags & EFLG_ZF) { | ||
1852 | /* Success: write back to memory. */ | ||
1853 | c->dst.val = c->src.orig_val; | ||
1854 | } else { | ||
1855 | /* Failure: write the value we saw to EAX. */ | ||
1856 | c->dst.type = OP_REG; | ||
1857 | c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX]; | ||
1858 | } | ||
1859 | break; | ||
1860 | case 0xb3: | ||
1861 | btr: /* btr */ | ||
1862 | /* only subword offset */ | ||
1863 | c->src.val &= (c->dst.bytes << 3) - 1; | ||
1864 | emulate_2op_SrcV_nobyte("btr", c->src, c->dst, ctxt->eflags); | ||
1865 | break; | ||
1866 | case 0xb6 ... 0xb7: /* movzx */ | ||
1867 | c->dst.bytes = c->op_bytes; | ||
1868 | c->dst.val = (c->d & ByteOp) ? (u8) c->src.val | ||
1869 | : (u16) c->src.val; | ||
1870 | break; | ||
1871 | case 0xba: /* Grp8 */ | ||
1872 | switch (c->modrm_reg & 3) { | ||
1873 | case 0: | ||
1874 | goto bt; | ||
1875 | case 1: | ||
1876 | goto bts; | ||
1877 | case 2: | ||
1878 | goto btr; | ||
1879 | case 3: | ||
1880 | goto btc; | ||
1881 | } | ||
1882 | break; | ||
1883 | case 0xbb: | ||
1884 | btc: /* btc */ | ||
1885 | /* only subword offset */ | ||
1886 | c->src.val &= (c->dst.bytes << 3) - 1; | ||
1887 | emulate_2op_SrcV_nobyte("btc", c->src, c->dst, ctxt->eflags); | ||
1888 | break; | ||
1889 | case 0xbe ... 0xbf: /* movsx */ | ||
1890 | c->dst.bytes = c->op_bytes; | ||
1891 | c->dst.val = (c->d & ByteOp) ? (s8) c->src.val : | ||
1892 | (s16) c->src.val; | ||
1893 | break; | ||
1894 | case 0xc3: /* movnti */ | ||
1895 | c->dst.bytes = c->op_bytes; | ||
1896 | c->dst.val = (c->op_bytes == 4) ? (u32) c->src.val : | ||
1897 | (u64) c->src.val; | ||
1898 | break; | ||
1899 | case 0xc7: /* Grp9 (cmpxchg8b) */ | ||
1900 | rc = emulate_grp9(ctxt, ops, memop); | ||
1901 | if (rc != 0) | ||
1902 | goto done; | ||
1903 | c->dst.type = OP_NONE; | ||
1904 | break; | ||
1905 | } | ||
1906 | goto writeback; | ||
1907 | |||
1908 | cannot_emulate: | ||
1909 | DPRINTF("Cannot emulate %02x\n", c->b); | ||
1910 | c->eip = saved_eip; | ||
1911 | return -1; | ||
1912 | } | ||
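
[Editor's note: x86_emulate_insn() above works on a shadow copy of the guest registers and commits it only after writeback() succeeds; on X86EMUL_UNHANDLEABLE the saved eip is restored and the caller falls back to userspace. A self-contained sketch of that commit-on-success pattern, with names invented for the example:]

#include <string.h>
#include <stdio.h>

#define NREGS 16

/* Work on a scratch copy; copy back only if emulation succeeded. */
static int emulate_one(unsigned long *regs, int (*body)(unsigned long *))
{
	unsigned long shadow[NREGS];

	memcpy(shadow, regs, sizeof(shadow));
	if (body(shadow) != 0)
		return -1;              /* architectural state untouched */
	memcpy(regs, shadow, sizeof(shadow));
	return 0;
}

static int inc_rax(unsigned long *r) { r[0]++; return 0; }

int main(void)
{
	unsigned long regs[NREGS] = { 41 };

	emulate_one(regs, inc_rax);
	printf("rax=%lu\n", regs[0]);   /* prints 42 */
	return 0;
}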
diff --git a/drivers/Kconfig b/drivers/Kconfig index f4076d9e9902..08d4ae201597 100644 --- a/drivers/Kconfig +++ b/drivers/Kconfig | |||
@@ -90,8 +90,6 @@ source "drivers/dca/Kconfig" | |||
90 | 90 | ||
91 | source "drivers/auxdisplay/Kconfig" | 91 | source "drivers/auxdisplay/Kconfig" |
92 | 92 | ||
93 | source "drivers/kvm/Kconfig" | ||
94 | |||
95 | source "drivers/uio/Kconfig" | 93 | source "drivers/uio/Kconfig" |
96 | 94 | ||
97 | source "drivers/virtio/Kconfig" | 95 | source "drivers/virtio/Kconfig" |
diff --git a/drivers/Makefile b/drivers/Makefile index d92d4d82d001..9e1f808e43cf 100644 --- a/drivers/Makefile +++ b/drivers/Makefile | |||
@@ -47,7 +47,6 @@ obj-$(CONFIG_SPI) += spi/ | |||
47 | obj-$(CONFIG_PCCARD) += pcmcia/ | 47 | obj-$(CONFIG_PCCARD) += pcmcia/ |
48 | obj-$(CONFIG_DIO) += dio/ | 48 | obj-$(CONFIG_DIO) += dio/ |
49 | obj-$(CONFIG_SBUS) += sbus/ | 49 | obj-$(CONFIG_SBUS) += sbus/ |
50 | obj-$(CONFIG_KVM) += kvm/ | ||
51 | obj-$(CONFIG_ZORRO) += zorro/ | 50 | obj-$(CONFIG_ZORRO) += zorro/ |
52 | obj-$(CONFIG_MAC) += macintosh/ | 51 | obj-$(CONFIG_MAC) += macintosh/ |
53 | obj-$(CONFIG_ATA_OVER_ETH) += block/aoe/ | 52 | obj-$(CONFIG_ATA_OVER_ETH) += block/aoe/ |
diff --git a/drivers/kvm/irq.h b/drivers/kvm/irq.h deleted file mode 100644 index 11fc014e2b30..000000000000 --- a/drivers/kvm/irq.h +++ /dev/null | |||
@@ -1,165 +0,0 @@ | |||
1 | /* | ||
2 | * irq.h: in kernel interrupt controller related definitions | ||
3 | * Copyright (c) 2007, Intel Corporation. | ||
4 | * | ||
5 | * This program is free software; you can redistribute it and/or modify it | ||
6 | * under the terms and conditions of the GNU General Public License, | ||
7 | * version 2, as published by the Free Software Foundation. | ||
8 | * | ||
9 | * This program is distributed in the hope it will be useful, but WITHOUT | ||
10 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | ||
11 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for | ||
12 | * more details. | ||
13 | * | ||
14 | * You should have received a copy of the GNU General Public License along with | ||
15 | * this program; if not, write to the Free Software Foundation, Inc., 59 Temple | ||
16 | * Place - Suite 330, Boston, MA 02111-1307 USA. | ||
17 | * Authors: | ||
18 | * Yaozu (Eddie) Dong <Eddie.dong@intel.com> | ||
19 | * | ||
20 | */ | ||
21 | |||
22 | #ifndef __IRQ_H | ||
23 | #define __IRQ_H | ||
24 | |||
25 | #include "kvm.h" | ||
26 | |||
27 | typedef void irq_request_func(void *opaque, int level); | ||
28 | |||
29 | struct kvm_kpic_state { | ||
30 | u8 last_irr; /* edge detection */ | ||
31 | u8 irr; /* interrupt request register */ | ||
32 | u8 imr; /* interrupt mask register */ | ||
33 | u8 isr; /* interrupt service register */ | ||
34 | u8 priority_add; /* highest irq priority */ | ||
35 | u8 irq_base; | ||
36 | u8 read_reg_select; | ||
37 | u8 poll; | ||
38 | u8 special_mask; | ||
39 | u8 init_state; | ||
40 | u8 auto_eoi; | ||
41 | u8 rotate_on_auto_eoi; | ||
42 | u8 special_fully_nested_mode; | ||
43 | u8 init4; /* true if 4 byte init */ | ||
44 | u8 elcr; /* PIIX edge/trigger selection */ | ||
45 | u8 elcr_mask; | ||
46 | struct kvm_pic *pics_state; | ||
47 | }; | ||
48 | |||
49 | struct kvm_pic { | ||
50 | struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */ | ||
51 | irq_request_func *irq_request; | ||
52 | void *irq_request_opaque; | ||
53 | int output; /* intr from master PIC */ | ||
54 | struct kvm_io_device dev; | ||
55 | }; | ||
56 | |||
57 | struct kvm_pic *kvm_create_pic(struct kvm *kvm); | ||
58 | void kvm_pic_set_irq(void *opaque, int irq, int level); | ||
59 | int kvm_pic_read_irq(struct kvm_pic *s); | ||
60 | int kvm_cpu_get_interrupt(struct kvm_vcpu *v); | ||
61 | int kvm_cpu_has_interrupt(struct kvm_vcpu *v); | ||
62 | void kvm_pic_update_irq(struct kvm_pic *s); | ||
63 | |||
64 | #define IOAPIC_NUM_PINS KVM_IOAPIC_NUM_PINS | ||
65 | #define IOAPIC_VERSION_ID 0x11 /* IOAPIC version */ | ||
66 | #define IOAPIC_EDGE_TRIG 0 | ||
67 | #define IOAPIC_LEVEL_TRIG 1 | ||
68 | |||
69 | #define IOAPIC_DEFAULT_BASE_ADDRESS 0xfec00000 | ||
70 | #define IOAPIC_MEM_LENGTH 0x100 | ||
71 | |||
72 | /* Direct registers. */ | ||
73 | #define IOAPIC_REG_SELECT 0x00 | ||
74 | #define IOAPIC_REG_WINDOW 0x10 | ||
75 | #define IOAPIC_REG_EOI 0x40 /* IA64 IOSAPIC only */ | ||
76 | |||
77 | /* Indirect registers. */ | ||
78 | #define IOAPIC_REG_APIC_ID 0x00 /* x86 IOAPIC only */ | ||
79 | #define IOAPIC_REG_VERSION 0x01 | ||
80 | #define IOAPIC_REG_ARB_ID 0x02 /* x86 IOAPIC only */ | ||
81 | |||
82 | struct kvm_ioapic { | ||
83 | u64 base_address; | ||
84 | u32 ioregsel; | ||
85 | u32 id; | ||
86 | u32 irr; | ||
87 | u32 pad; | ||
88 | union ioapic_redir_entry { | ||
89 | u64 bits; | ||
90 | struct { | ||
91 | u8 vector; | ||
92 | u8 delivery_mode:3; | ||
93 | u8 dest_mode:1; | ||
94 | u8 delivery_status:1; | ||
95 | u8 polarity:1; | ||
96 | u8 remote_irr:1; | ||
97 | u8 trig_mode:1; | ||
98 | u8 mask:1; | ||
99 | u8 reserve:7; | ||
100 | u8 reserved[4]; | ||
101 | u8 dest_id; | ||
102 | } fields; | ||
103 | } redirtbl[IOAPIC_NUM_PINS]; | ||
104 | struct kvm_io_device dev; | ||
105 | struct kvm *kvm; | ||
106 | }; | ||
107 | |||
108 | struct kvm_lapic { | ||
109 | unsigned long base_address; | ||
110 | struct kvm_io_device dev; | ||
111 | struct { | ||
112 | atomic_t pending; | ||
113 | s64 period; /* unit: ns */ | ||
114 | u32 divide_count; | ||
115 | ktime_t last_update; | ||
116 | struct hrtimer dev; | ||
117 | } timer; | ||
118 | struct kvm_vcpu *vcpu; | ||
119 | struct page *regs_page; | ||
120 | void *regs; | ||
121 | }; | ||
122 | |||
123 | #ifdef DEBUG | ||
124 | #define ASSERT(x) \ | ||
125 | do { \ | ||
126 | if (!(x)) { \ | ||
127 | printk(KERN_EMERG "assertion failed %s: %d: %s\n", \ | ||
128 | __FILE__, __LINE__, #x); \ | ||
129 | BUG(); \ | ||
130 | } \ | ||
131 | } while (0) | ||
132 | #else | ||
133 | #define ASSERT(x) do { } while (0) | ||
134 | #endif | ||
135 | |||
136 | void kvm_vcpu_kick(struct kvm_vcpu *vcpu); | ||
137 | int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu); | ||
138 | int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu); | ||
139 | int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu); | ||
140 | int kvm_create_lapic(struct kvm_vcpu *vcpu); | ||
141 | void kvm_lapic_reset(struct kvm_vcpu *vcpu); | ||
142 | void kvm_free_apic(struct kvm_lapic *apic); | ||
143 | u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu); | ||
144 | void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8); | ||
145 | void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value); | ||
146 | struct kvm_lapic *kvm_apic_round_robin(struct kvm *kvm, u8 vector, | ||
147 | unsigned long bitmap); | ||
148 | u64 kvm_get_apic_base(struct kvm_vcpu *vcpu); | ||
149 | void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data); | ||
150 | int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest); | ||
151 | void kvm_ioapic_update_eoi(struct kvm *kvm, int vector); | ||
152 | int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda); | ||
153 | int kvm_apic_set_irq(struct kvm_lapic *apic, u8 vec, u8 trig); | ||
154 | void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu); | ||
155 | int kvm_ioapic_init(struct kvm *kvm); | ||
156 | void kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level); | ||
157 | int kvm_lapic_enabled(struct kvm_vcpu *vcpu); | ||
158 | int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu); | ||
159 | void kvm_apic_timer_intr_post(struct kvm_vcpu *vcpu, int vec); | ||
160 | void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec); | ||
161 | void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu); | ||
162 | void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu); | ||
163 | void kvm_migrate_apic_timer(struct kvm_vcpu *vcpu); | ||
164 | |||
165 | #endif | ||
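
[Editor's note: the ioapic_redir_entry union above overlays bitfields on a 64-bit redirection entry in the layout the I/O APIC spec defines: vector in bits 0-7, delivery mode in 8-10, trigger mode in bit 15, mask in bit 16. A small standalone demonstration of packing and unpacking those fields:]

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t entry = 0;

	entry |= 0x31;                 /* vector 0x31 */
	entry |= 1ULL << 15;           /* level triggered */
	entry |= 1ULL << 16;           /* masked */

	printf("vector=0x%02x trig=%u mask=%u\n",
	       (unsigned)(entry & 0xff),
	       (unsigned)((entry >> 15) & 1),
	       (unsigned)((entry >> 16) & 1));
	return 0;
}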
diff --git a/drivers/kvm/mmu.c b/drivers/kvm/mmu.c deleted file mode 100644 index feb5ac986c5d..000000000000 --- a/drivers/kvm/mmu.c +++ /dev/null | |||
@@ -1,1498 +0,0 @@ | |||
1 | /* | ||
2 | * Kernel-based Virtual Machine driver for Linux | ||
3 | * | ||
4 | * This module enables machines with Intel VT-x extensions to run virtual | ||
5 | * machines without emulation or binary translation. | ||
6 | * | ||
7 | * MMU support | ||
8 | * | ||
9 | * Copyright (C) 2006 Qumranet, Inc. | ||
10 | * | ||
11 | * Authors: | ||
12 | * Yaniv Kamay <yaniv@qumranet.com> | ||
13 | * Avi Kivity <avi@qumranet.com> | ||
14 | * | ||
15 | * This work is licensed under the terms of the GNU GPL, version 2. See | ||
16 | * the COPYING file in the top-level directory. | ||
17 | * | ||
18 | */ | ||
19 | |||
20 | #include "vmx.h" | ||
21 | #include "kvm.h" | ||
22 | |||
23 | #include <linux/types.h> | ||
24 | #include <linux/string.h> | ||
25 | #include <linux/mm.h> | ||
26 | #include <linux/highmem.h> | ||
27 | #include <linux/module.h> | ||
28 | |||
29 | #include <asm/page.h> | ||
30 | #include <asm/cmpxchg.h> | ||
31 | |||
32 | #undef MMU_DEBUG | ||
33 | |||
34 | #undef AUDIT | ||
35 | |||
36 | #ifdef AUDIT | ||
37 | static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg); | ||
38 | #else | ||
39 | static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) {} | ||
40 | #endif | ||
41 | |||
42 | #ifdef MMU_DEBUG | ||
43 | |||
44 | #define pgprintk(x...) do { if (dbg) printk(x); } while (0) | ||
45 | #define rmap_printk(x...) do { if (dbg) printk(x); } while (0) | ||
46 | |||
47 | #else | ||
48 | |||
49 | #define pgprintk(x...) do { } while (0) | ||
50 | #define rmap_printk(x...) do { } while (0) | ||
51 | |||
52 | #endif | ||
53 | |||
54 | #if defined(MMU_DEBUG) || defined(AUDIT) | ||
55 | static int dbg = 1; | ||
56 | #endif | ||
57 | |||
58 | #ifndef MMU_DEBUG | ||
59 | #define ASSERT(x) do { } while (0) | ||
60 | #else | ||
61 | #define ASSERT(x) \ | ||
62 | if (!(x)) { \ | ||
63 | printk(KERN_WARNING "assertion failed %s:%d: %s\n", \ | ||
64 | __FILE__, __LINE__, #x); \ | ||
65 | } | ||
66 | #endif | ||
67 | |||
68 | #define PT64_PT_BITS 9 | ||
69 | #define PT64_ENT_PER_PAGE (1 << PT64_PT_BITS) | ||
70 | #define PT32_PT_BITS 10 | ||
71 | #define PT32_ENT_PER_PAGE (1 << PT32_PT_BITS) | ||
72 | |||
73 | #define PT_WRITABLE_SHIFT 1 | ||
74 | |||
75 | #define PT_PRESENT_MASK (1ULL << 0) | ||
76 | #define PT_WRITABLE_MASK (1ULL << PT_WRITABLE_SHIFT) | ||
77 | #define PT_USER_MASK (1ULL << 2) | ||
78 | #define PT_PWT_MASK (1ULL << 3) | ||
79 | #define PT_PCD_MASK (1ULL << 4) | ||
80 | #define PT_ACCESSED_MASK (1ULL << 5) | ||
81 | #define PT_DIRTY_MASK (1ULL << 6) | ||
82 | #define PT_PAGE_SIZE_MASK (1ULL << 7) | ||
83 | #define PT_PAT_MASK (1ULL << 7) | ||
84 | #define PT_GLOBAL_MASK (1ULL << 8) | ||
85 | #define PT64_NX_MASK (1ULL << 63) | ||
86 | |||
87 | #define PT_PAT_SHIFT 7 | ||
88 | #define PT_DIR_PAT_SHIFT 12 | ||
89 | #define PT_DIR_PAT_MASK (1ULL << PT_DIR_PAT_SHIFT) | ||
90 | |||
91 | #define PT32_DIR_PSE36_SIZE 4 | ||
92 | #define PT32_DIR_PSE36_SHIFT 13 | ||
93 | #define PT32_DIR_PSE36_MASK (((1ULL << PT32_DIR_PSE36_SIZE) - 1) << PT32_DIR_PSE36_SHIFT) | ||
94 | |||
95 | |||
96 | #define PT_FIRST_AVAIL_BITS_SHIFT 9 | ||
97 | #define PT64_SECOND_AVAIL_BITS_SHIFT 52 | ||
98 | |||
99 | #define PT_SHADOW_IO_MARK (1ULL << PT_FIRST_AVAIL_BITS_SHIFT) | ||
100 | |||
101 | #define VALID_PAGE(x) ((x) != INVALID_PAGE) | ||
102 | |||
103 | #define PT64_LEVEL_BITS 9 | ||
104 | |||
105 | #define PT64_LEVEL_SHIFT(level) \ | ||
106 | ( PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS ) | ||
107 | |||
108 | #define PT64_LEVEL_MASK(level) \ | ||
109 | (((1ULL << PT64_LEVEL_BITS) - 1) << PT64_LEVEL_SHIFT(level)) | ||
110 | |||
111 | #define PT64_INDEX(address, level)\ | ||
112 | (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1)) | ||
113 | |||
114 | |||
115 | #define PT32_LEVEL_BITS 10 | ||
116 | |||
117 | #define PT32_LEVEL_SHIFT(level) \ | ||
118 | ( PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS ) | ||
119 | |||
120 | #define PT32_LEVEL_MASK(level) \ | ||
121 | (((1ULL << PT32_LEVEL_BITS) - 1) << PT32_LEVEL_SHIFT(level)) | ||
122 | |||
123 | #define PT32_INDEX(address, level)\ | ||
124 | (((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1)) | ||
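
[Editor's note: worked example of the index macros above. With PAGE_SHIFT = 12 and 9 bits per level, PT64_INDEX() extracts bits 12-20 at level 1, bits 21-29 at level 2, and so on. The standalone snippet below reproduces the arithmetic:]

#include <stdint.h>
#include <stdio.h>

#define LEVEL_BITS 9
#define LEVEL_SHIFT(level) (12 + ((level) - 1) * LEVEL_BITS)
#define LEVEL_INDEX(addr, level) \
	(((addr) >> LEVEL_SHIFT(level)) & ((1 << LEVEL_BITS) - 1))

/* Same arithmetic as PT64_INDEX() with PAGE_SHIFT = 12: each level of a
 * 4-level long-mode walk consumes 9 bits of the virtual address. */
int main(void)
{
	uint64_t addr = 0x00007f1234567000ull;

	for (int level = 4; level >= 1; level--)
		printf("level %d index = %llu\n", level,
		       (unsigned long long)LEVEL_INDEX(addr, level));
	return 0;
}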
125 | |||
126 | |||
127 | #define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1)) | ||
128 | #define PT64_DIR_BASE_ADDR_MASK \ | ||
129 | (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + PT64_LEVEL_BITS)) - 1)) | ||
130 | |||
131 | #define PT32_BASE_ADDR_MASK PAGE_MASK | ||
132 | #define PT32_DIR_BASE_ADDR_MASK \ | ||
133 | (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1)) | ||
134 | |||
135 | |||
136 | #define PFERR_PRESENT_MASK (1U << 0) | ||
137 | #define PFERR_WRITE_MASK (1U << 1) | ||
138 | #define PFERR_USER_MASK (1U << 2) | ||
139 | #define PFERR_FETCH_MASK (1U << 4) | ||
140 | |||
141 | #define PT64_ROOT_LEVEL 4 | ||
142 | #define PT32_ROOT_LEVEL 2 | ||
143 | #define PT32E_ROOT_LEVEL 3 | ||
144 | |||
145 | #define PT_DIRECTORY_LEVEL 2 | ||
146 | #define PT_PAGE_TABLE_LEVEL 1 | ||
147 | |||
148 | #define RMAP_EXT 4 | ||
149 | |||
150 | struct kvm_rmap_desc { | ||
151 | u64 *shadow_ptes[RMAP_EXT]; | ||
152 | struct kvm_rmap_desc *more; | ||
153 | }; | ||
154 | |||
155 | static struct kmem_cache *pte_chain_cache; | ||
156 | static struct kmem_cache *rmap_desc_cache; | ||
157 | static struct kmem_cache *mmu_page_header_cache; | ||
158 | |||
159 | static int is_write_protection(struct kvm_vcpu *vcpu) | ||
160 | { | ||
161 | return vcpu->cr0 & X86_CR0_WP; | ||
162 | } | ||
163 | |||
164 | static int is_cpuid_PSE36(void) | ||
165 | { | ||
166 | return 1; | ||
167 | } | ||
168 | |||
169 | static int is_nx(struct kvm_vcpu *vcpu) | ||
170 | { | ||
171 | return vcpu->shadow_efer & EFER_NX; | ||
172 | } | ||
173 | |||
174 | static int is_present_pte(unsigned long pte) | ||
175 | { | ||
176 | return pte & PT_PRESENT_MASK; | ||
177 | } | ||
178 | |||
179 | static int is_writeble_pte(unsigned long pte) | ||
180 | { | ||
181 | return pte & PT_WRITABLE_MASK; | ||
182 | } | ||
183 | |||
184 | static int is_io_pte(unsigned long pte) | ||
185 | { | ||
186 | return pte & PT_SHADOW_IO_MARK; | ||
187 | } | ||
188 | |||
189 | static int is_rmap_pte(u64 pte) | ||
190 | { | ||
191 | return (pte & (PT_WRITABLE_MASK | PT_PRESENT_MASK)) | ||
192 | == (PT_WRITABLE_MASK | PT_PRESENT_MASK); | ||
193 | } | ||
194 | |||
195 | static void set_shadow_pte(u64 *sptep, u64 spte) | ||
196 | { | ||
197 | #ifdef CONFIG_X86_64 | ||
198 | set_64bit((unsigned long *)sptep, spte); | ||
199 | #else | ||
200 | set_64bit((unsigned long long *)sptep, spte); | ||
201 | #endif | ||
202 | } | ||
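
[Editor's note: set_shadow_pte() exists because on a 32-bit PAE host a plain 64-bit store compiles to two 32-bit writes, and the hardware page walker could observe a torn entry in between; set_64bit() (a cmpxchg8b loop on i386) makes the update single-shot. A rough user-space equivalent using GCC's atomic builtins rather than the kernel primitive, offered only as an assumption-laden illustration:]

#include <stdint.h>

/* Single atomic 64-bit store, so a concurrent reader can never observe
 * half of the new entry. Uses GCC builtins, not the kernel's set_64bit(). */
static void set_spte_atomic(uint64_t *sptep, uint64_t spte)
{
	__atomic_store_n(sptep, spte, __ATOMIC_RELAXED);
}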
203 | |||
204 | static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, | ||
205 | struct kmem_cache *base_cache, int min) | ||
206 | { | ||
207 | void *obj; | ||
208 | |||
209 | if (cache->nobjs >= min) | ||
210 | return 0; | ||
211 | while (cache->nobjs < ARRAY_SIZE(cache->objects)) { | ||
212 | obj = kmem_cache_zalloc(base_cache, GFP_KERNEL); | ||
213 | if (!obj) | ||
214 | return -ENOMEM; | ||
215 | cache->objects[cache->nobjs++] = obj; | ||
216 | } | ||
217 | return 0; | ||
218 | } | ||
219 | |||
220 | static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc) | ||
221 | { | ||
222 | while (mc->nobjs) | ||
223 | kfree(mc->objects[--mc->nobjs]); | ||
224 | } | ||
225 | |||
226 | static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache, | ||
227 | int min) | ||
228 | { | ||
229 | struct page *page; | ||
230 | |||
231 | if (cache->nobjs >= min) | ||
232 | return 0; | ||
233 | while (cache->nobjs < ARRAY_SIZE(cache->objects)) { | ||
234 | page = alloc_page(GFP_KERNEL); | ||
235 | if (!page) | ||
236 | return -ENOMEM; | ||
237 | set_page_private(page, 0); | ||
238 | cache->objects[cache->nobjs++] = page_address(page); | ||
239 | } | ||
240 | return 0; | ||
241 | } | ||
242 | |||
243 | static void mmu_free_memory_cache_page(struct kvm_mmu_memory_cache *mc) | ||
244 | { | ||
245 | while (mc->nobjs) | ||
246 | free_page((unsigned long)mc->objects[--mc->nobjs]); | ||
247 | } | ||
248 | |||
249 | static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu) | ||
250 | { | ||
251 | int r; | ||
252 | |||
253 | kvm_mmu_free_some_pages(vcpu); | ||
254 | r = mmu_topup_memory_cache(&vcpu->mmu_pte_chain_cache, | ||
255 | pte_chain_cache, 4); | ||
256 | if (r) | ||
257 | goto out; | ||
258 | r = mmu_topup_memory_cache(&vcpu->mmu_rmap_desc_cache, | ||
259 | rmap_desc_cache, 1); | ||
260 | if (r) | ||
261 | goto out; | ||
262 | r = mmu_topup_memory_cache_page(&vcpu->mmu_page_cache, 4); | ||
263 | if (r) | ||
264 | goto out; | ||
265 | r = mmu_topup_memory_cache(&vcpu->mmu_page_header_cache, | ||
266 | mmu_page_header_cache, 4); | ||
267 | out: | ||
268 | return r; | ||
269 | } | ||
270 | |||
271 | static void mmu_free_memory_caches(struct kvm_vcpu *vcpu) | ||
272 | { | ||
273 | mmu_free_memory_cache(&vcpu->mmu_pte_chain_cache); | ||
274 | mmu_free_memory_cache(&vcpu->mmu_rmap_desc_cache); | ||
275 | mmu_free_memory_cache_page(&vcpu->mmu_page_cache); | ||
276 | mmu_free_memory_cache(&vcpu->mmu_page_header_cache); | ||
277 | } | ||
278 | |||
279 | static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc, | ||
280 | size_t size) | ||
281 | { | ||
282 | void *p; | ||
283 | |||
284 | BUG_ON(!mc->nobjs); | ||
285 | p = mc->objects[--mc->nobjs]; | ||
286 | memset(p, 0, size); | ||
287 | return p; | ||
288 | } | ||
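
[Editor's note: mmu_memory_cache_alloc() above can simply pop an object and BUG_ON() emptiness because mmu_topup_memory_caches() pre-fills every cache with GFP_KERNEL allocations before the fault paths, where sleeping allocations are not allowed, start consuming them. A simplified sketch of the same two-phase pattern, with types and names invented for the example:]

#include <stdlib.h>
#include <assert.h>

#define CACHE_MAX 4

struct obj_cache { void *objects[CACHE_MAX]; int nobjs; };

/* Phase 1: allocate while sleeping is allowed; fail here, not at use time. */
static int cache_topup(struct obj_cache *c, size_t size, int min)
{
	while (c->nobjs < min && c->nobjs < CACHE_MAX) {
		void *p = calloc(1, size);
		if (!p)
			return -1;
		c->objects[c->nobjs++] = p;
	}
	return 0;
}

/* Phase 2: consumption cannot fail (mirrors the BUG_ON() above). */
static void *cache_alloc(struct obj_cache *c)
{
	assert(c->nobjs > 0);
	return c->objects[--c->nobjs];
}

int main(void)
{
	struct obj_cache cache = { { 0 }, 0 };

	if (cache_topup(&cache, 32, 2) == 0)
		free(cache_alloc(&cache));
	return 0;
}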
289 | |||
290 | static struct kvm_pte_chain *mmu_alloc_pte_chain(struct kvm_vcpu *vcpu) | ||
291 | { | ||
292 | return mmu_memory_cache_alloc(&vcpu->mmu_pte_chain_cache, | ||
293 | sizeof(struct kvm_pte_chain)); | ||
294 | } | ||
295 | |||
296 | static void mmu_free_pte_chain(struct kvm_pte_chain *pc) | ||
297 | { | ||
298 | kfree(pc); | ||
299 | } | ||
300 | |||
301 | static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu) | ||
302 | { | ||
303 | return mmu_memory_cache_alloc(&vcpu->mmu_rmap_desc_cache, | ||
304 | sizeof(struct kvm_rmap_desc)); | ||
305 | } | ||
306 | |||
307 | static void mmu_free_rmap_desc(struct kvm_rmap_desc *rd) | ||
308 | { | ||
309 | kfree(rd); | ||
310 | } | ||
311 | |||
312 | /* | ||
313 | * Reverse mapping data structures: | ||
314 | * | ||
315 | * If page->private bit zero is zero, then page->private points to the | ||
316 | * shadow page table entry that points to page_address(page). | ||
317 | * | ||
318 | * If page->private bit zero is one, (then page->private & ~1) points | ||
319 | * to a struct kvm_rmap_desc containing more mappings. | ||
320 | */ | ||
321 | static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte) | ||
322 | { | ||
323 | struct page *page; | ||
324 | struct kvm_rmap_desc *desc; | ||
325 | int i; | ||
326 | |||
327 | if (!is_rmap_pte(*spte)) | ||
328 | return; | ||
329 | page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT); | ||
330 | if (!page_private(page)) { | ||
331 | rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte); | ||
332 | set_page_private(page, (unsigned long)spte); | ||
333 | } else if (!(page_private(page) & 1)) { | ||
334 | rmap_printk("rmap_add: %p %llx 1->many\n", spte, *spte); | ||
335 | desc = mmu_alloc_rmap_desc(vcpu); | ||
336 | desc->shadow_ptes[0] = (u64 *)page_private(page); | ||
337 | desc->shadow_ptes[1] = spte; | ||
338 | set_page_private(page, (unsigned long)desc | 1); | ||
339 | } else { | ||
340 | rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte); | ||
341 | desc = (struct kvm_rmap_desc *)(page_private(page) & ~1ul); | ||
342 | while (desc->shadow_ptes[RMAP_EXT-1] && desc->more) | ||
343 | desc = desc->more; | ||
344 | if (desc->shadow_ptes[RMAP_EXT-1]) { | ||
345 | desc->more = mmu_alloc_rmap_desc(vcpu); | ||
346 | desc = desc->more; | ||
347 | } | ||
348 | for (i = 0; desc->shadow_ptes[i]; ++i) | ||
349 | ; | ||
350 | desc->shadow_ptes[i] = spte; | ||
351 | } | ||
352 | } | ||
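
[Editor's note: rmap_add() above and rmap_remove() below keep page->private as a tagged word, per the comment preceding rmap_add(): zero means no shadow ptes, a cleared low bit means the word is a single spte pointer, and a set low bit means it points (minus the tag) to a kvm_rmap_desc chain. A decoding sketch with simplified stand-in types:]

#include <stdint.h>
#include <stddef.h>

struct rmap_desc;   /* stand-in for struct kvm_rmap_desc */

enum rmap_state { RMAP_NONE, RMAP_ONE, RMAP_MANY };

static enum rmap_state rmap_decode(unsigned long priv,
				   uint64_t **spte, struct rmap_desc **desc)
{
	if (!priv)
		return RMAP_NONE;                   /* no shadow ptes */
	if (!(priv & 1)) {
		*spte = (uint64_t *)priv;           /* exactly one spte */
		return RMAP_ONE;
	}
	*desc = (struct rmap_desc *)(priv & ~1ul);  /* descriptor chain */
	return RMAP_MANY;
}

int main(void)
{
	uint64_t spte_storage = 0, *spte = NULL;
	struct rmap_desc *desc = NULL;

	/* Single mapping: page->private holds the spte pointer directly. */
	unsigned long priv = (unsigned long)&spte_storage;
	return rmap_decode(priv, &spte, &desc) == RMAP_ONE ? 0 : 1;
}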
353 | |||
354 | static void rmap_desc_remove_entry(struct page *page, | ||
355 | struct kvm_rmap_desc *desc, | ||
356 | int i, | ||
357 | struct kvm_rmap_desc *prev_desc) | ||
358 | { | ||
359 | int j; | ||
360 | |||
361 | for (j = RMAP_EXT - 1; !desc->shadow_ptes[j] && j > i; --j) | ||
362 | ; | ||
363 | desc->shadow_ptes[i] = desc->shadow_ptes[j]; | ||
364 | desc->shadow_ptes[j] = NULL; | ||
365 | if (j != 0) | ||
366 | return; | ||
367 | if (!prev_desc && !desc->more) | ||
368 | set_page_private(page, (unsigned long)desc->shadow_ptes[0]); | ||
369 | else | ||
370 | if (prev_desc) | ||
371 | prev_desc->more = desc->more; | ||
372 | else | ||
373 | set_page_private(page, (unsigned long)desc->more | 1); | ||
374 | mmu_free_rmap_desc(desc); | ||
375 | } | ||
376 | |||
377 | static void rmap_remove(u64 *spte) | ||
378 | { | ||
379 | struct page *page; | ||
380 | struct kvm_rmap_desc *desc; | ||
381 | struct kvm_rmap_desc *prev_desc; | ||
382 | int i; | ||
383 | |||
384 | if (!is_rmap_pte(*spte)) | ||
385 | return; | ||
386 | page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT); | ||
387 | if (!page_private(page)) { | ||
388 | printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte); | ||
389 | BUG(); | ||
390 | } else if (!(page_private(page) & 1)) { | ||
391 | rmap_printk("rmap_remove: %p %llx 1->0\n", spte, *spte); | ||
392 | if ((u64 *)page_private(page) != spte) { | ||
393 | printk(KERN_ERR "rmap_remove: %p %llx 1->BUG\n", | ||
394 | spte, *spte); | ||
395 | BUG(); | ||
396 | } | ||
397 | set_page_private(page, 0); | ||
398 | } else { | ||
399 | rmap_printk("rmap_remove: %p %llx many->many\n", spte, *spte); | ||
400 | desc = (struct kvm_rmap_desc *)(page_private(page) & ~1ul); | ||
401 | prev_desc = NULL; | ||
402 | while (desc) { | ||
403 | for (i = 0; i < RMAP_EXT && desc->shadow_ptes[i]; ++i) | ||
404 | if (desc->shadow_ptes[i] == spte) { | ||
405 | rmap_desc_remove_entry(page, | ||
406 | desc, i, | ||
407 | prev_desc); | ||
408 | return; | ||
409 | } | ||
410 | prev_desc = desc; | ||
411 | desc = desc->more; | ||
412 | } | ||
413 | BUG(); | ||
414 | } | ||
415 | } | ||
416 | |||
417 | static void rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn) | ||
418 | { | ||
419 | struct kvm *kvm = vcpu->kvm; | ||
420 | struct page *page; | ||
421 | struct kvm_rmap_desc *desc; | ||
422 | u64 *spte; | ||
423 | |||
424 | page = gfn_to_page(kvm, gfn); | ||
425 | BUG_ON(!page); | ||
426 | |||
427 | while (page_private(page)) { | ||
428 | if (!(page_private(page) & 1)) | ||
429 | spte = (u64 *)page_private(page); | ||
430 | else { | ||
431 | desc = (struct kvm_rmap_desc *)(page_private(page) & ~1ul); | ||
432 | spte = desc->shadow_ptes[0]; | ||
433 | } | ||
434 | BUG_ON(!spte); | ||
435 | BUG_ON((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT | ||
436 | != page_to_pfn(page)); | ||
437 | BUG_ON(!(*spte & PT_PRESENT_MASK)); | ||
438 | BUG_ON(!(*spte & PT_WRITABLE_MASK)); | ||
439 | rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte); | ||
440 | rmap_remove(spte); | ||
441 | set_shadow_pte(spte, *spte & ~PT_WRITABLE_MASK); | ||
442 | kvm_flush_remote_tlbs(vcpu->kvm); | ||
443 | } | ||
444 | } | ||
445 | |||
446 | #ifdef MMU_DEBUG | ||
447 | static int is_empty_shadow_page(u64 *spt) | ||
448 | { | ||
449 | u64 *pos; | ||
450 | u64 *end; | ||
451 | |||
452 | for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++) | ||
453 | if (*pos != 0) { | ||
454 | printk(KERN_ERR "%s: %p %llx\n", __FUNCTION__, | ||
455 | pos, *pos); | ||
456 | return 0; | ||
457 | } | ||
458 | return 1; | ||
459 | } | ||
460 | #endif | ||
461 | |||
462 | static void kvm_mmu_free_page(struct kvm *kvm, | ||
463 | struct kvm_mmu_page *page_head) | ||
464 | { | ||
465 | ASSERT(is_empty_shadow_page(page_head->spt)); | ||
466 | list_del(&page_head->link); | ||
467 | __free_page(virt_to_page(page_head->spt)); | ||
468 | kfree(page_head); | ||
469 | ++kvm->n_free_mmu_pages; | ||
470 | } | ||
471 | |||
472 | static unsigned kvm_page_table_hashfn(gfn_t gfn) | ||
473 | { | ||
474 | return gfn; | ||
475 | } | ||
476 | |||
477 | static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, | ||
478 | u64 *parent_pte) | ||
479 | { | ||
480 | struct kvm_mmu_page *page; | ||
481 | |||
482 | if (!vcpu->kvm->n_free_mmu_pages) | ||
483 | return NULL; | ||
484 | |||
485 | page = mmu_memory_cache_alloc(&vcpu->mmu_page_header_cache, | ||
486 | sizeof *page); | ||
487 | page->spt = mmu_memory_cache_alloc(&vcpu->mmu_page_cache, PAGE_SIZE); | ||
488 | set_page_private(virt_to_page(page->spt), (unsigned long)page); | ||
489 | list_add(&page->link, &vcpu->kvm->active_mmu_pages); | ||
490 | ASSERT(is_empty_shadow_page(page->spt)); | ||
491 | page->slot_bitmap = 0; | ||
492 | page->multimapped = 0; | ||
493 | page->parent_pte = parent_pte; | ||
494 | --vcpu->kvm->n_free_mmu_pages; | ||
495 | return page; | ||
496 | } | ||
497 | |||
498 | static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu, | ||
499 | struct kvm_mmu_page *page, u64 *parent_pte) | ||
500 | { | ||
501 | struct kvm_pte_chain *pte_chain; | ||
502 | struct hlist_node *node; | ||
503 | int i; | ||
504 | |||
505 | if (!parent_pte) | ||
506 | return; | ||
507 | if (!page->multimapped) { | ||
508 | u64 *old = page->parent_pte; | ||
509 | |||
510 | if (!old) { | ||
511 | page->parent_pte = parent_pte; | ||
512 | return; | ||
513 | } | ||
514 | page->multimapped = 1; | ||
515 | pte_chain = mmu_alloc_pte_chain(vcpu); | ||
516 | INIT_HLIST_HEAD(&page->parent_ptes); | ||
517 | hlist_add_head(&pte_chain->link, &page->parent_ptes); | ||
518 | pte_chain->parent_ptes[0] = old; | ||
519 | } | ||
520 | hlist_for_each_entry(pte_chain, node, &page->parent_ptes, link) { | ||
521 | if (pte_chain->parent_ptes[NR_PTE_CHAIN_ENTRIES-1]) | ||
522 | continue; | ||
523 | for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) | ||
524 | if (!pte_chain->parent_ptes[i]) { | ||
525 | pte_chain->parent_ptes[i] = parent_pte; | ||
526 | return; | ||
527 | } | ||
528 | } | ||
529 | pte_chain = mmu_alloc_pte_chain(vcpu); | ||
530 | BUG_ON(!pte_chain); | ||
531 | hlist_add_head(&pte_chain->link, &page->parent_ptes); | ||
532 | pte_chain->parent_ptes[0] = parent_pte; | ||
533 | } | ||
534 | |||
535 | static void mmu_page_remove_parent_pte(struct kvm_mmu_page *page, | ||
536 | u64 *parent_pte) | ||
537 | { | ||
538 | struct kvm_pte_chain *pte_chain; | ||
539 | struct hlist_node *node; | ||
540 | int i; | ||
541 | |||
542 | if (!page->multimapped) { | ||
543 | BUG_ON(page->parent_pte != parent_pte); | ||
544 | page->parent_pte = NULL; | ||
545 | return; | ||
546 | } | ||
547 | hlist_for_each_entry(pte_chain, node, &page->parent_ptes, link) | ||
548 | for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) { | ||
549 | if (!pte_chain->parent_ptes[i]) | ||
550 | break; | ||
551 | if (pte_chain->parent_ptes[i] != parent_pte) | ||
552 | continue; | ||
553 | while (i + 1 < NR_PTE_CHAIN_ENTRIES | ||
554 | && pte_chain->parent_ptes[i + 1]) { | ||
555 | pte_chain->parent_ptes[i] | ||
556 | = pte_chain->parent_ptes[i + 1]; | ||
557 | ++i; | ||
558 | } | ||
559 | pte_chain->parent_ptes[i] = NULL; | ||
560 | if (i == 0) { | ||
561 | hlist_del(&pte_chain->link); | ||
562 | mmu_free_pte_chain(pte_chain); | ||
563 | if (hlist_empty(&page->parent_ptes)) { | ||
564 | page->multimapped = 0; | ||
565 | page->parent_pte = NULL; | ||
566 | } | ||
567 | } | ||
568 | return; | ||
569 | } | ||
570 | BUG(); | ||
571 | } | ||
572 | |||
573 | static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm_vcpu *vcpu, | ||
574 | gfn_t gfn) | ||
575 | { | ||
576 | unsigned index; | ||
577 | struct hlist_head *bucket; | ||
578 | struct kvm_mmu_page *page; | ||
579 | struct hlist_node *node; | ||
580 | |||
581 | pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn); | ||
582 | index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES; | ||
583 | bucket = &vcpu->kvm->mmu_page_hash[index]; | ||
584 | hlist_for_each_entry(page, node, bucket, hash_link) | ||
585 | if (page->gfn == gfn && !page->role.metaphysical) { | ||
586 | pgprintk("%s: found role %x\n", | ||
587 | __FUNCTION__, page->role.word); | ||
588 | return page; | ||
589 | } | ||
590 | return NULL; | ||
591 | } | ||
592 | |||
593 | static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, | ||
594 | gfn_t gfn, | ||
595 | gva_t gaddr, | ||
596 | unsigned level, | ||
597 | int metaphysical, | ||
598 | unsigned hugepage_access, | ||
599 | u64 *parent_pte) | ||
600 | { | ||
601 | union kvm_mmu_page_role role; | ||
602 | unsigned index; | ||
603 | unsigned quadrant; | ||
604 | struct hlist_head *bucket; | ||
605 | struct kvm_mmu_page *page; | ||
606 | struct hlist_node *node; | ||
607 | |||
608 | role.word = 0; | ||
609 | role.glevels = vcpu->mmu.root_level; | ||
610 | role.level = level; | ||
611 | role.metaphysical = metaphysical; | ||
612 | role.hugepage_access = hugepage_access; | ||
613 | if (vcpu->mmu.root_level <= PT32_ROOT_LEVEL) { | ||
614 | quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level)); | ||
615 | quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1; | ||
616 | role.quadrant = quadrant; | ||
617 | } | ||
618 | pgprintk("%s: looking gfn %lx role %x\n", __FUNCTION__, | ||
619 | gfn, role.word); | ||
620 | index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES; | ||
621 | bucket = &vcpu->kvm->mmu_page_hash[index]; | ||
622 | hlist_for_each_entry(page, node, bucket, hash_link) | ||
623 | if (page->gfn == gfn && page->role.word == role.word) { | ||
624 | mmu_page_add_parent_pte(vcpu, page, parent_pte); | ||
625 | pgprintk("%s: found\n", __FUNCTION__); | ||
626 | return page; | ||
627 | } | ||
628 | page = kvm_mmu_alloc_page(vcpu, parent_pte); | ||
629 | if (!page) | ||
630 | return page; | ||
631 | pgprintk("%s: adding gfn %lx role %x\n", __FUNCTION__, gfn, role.word); | ||
632 | page->gfn = gfn; | ||
633 | page->role = role; | ||
634 | hlist_add_head(&page->hash_link, bucket); | ||
635 | if (!metaphysical) | ||
636 | rmap_write_protect(vcpu, gfn); | ||
637 | return page; | ||
638 | } | ||
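
[Editor's note: the quadrant computed in kvm_mmu_get_page() above exists because a 32-bit guest page table has 10 index bits while a shadow page table has only 9, so one guest table is split across 2^level shadow pages; the quadrant records which slice a shadow page covers. The arithmetic, extracted into a standalone snippet:]

#include <stdint.h>
#include <stdio.h>

/* Same computation as the quadrant lines in kvm_mmu_get_page(), with
 * PAGE_SHIFT = 12, PT64_PT_BITS = 9, PT32_PT_BITS = 10 substituted in. */
static unsigned quadrant(uint64_t gaddr, unsigned level)
{
	unsigned q = gaddr >> (12 + 9 * level);

	return q & ((1u << ((10 - 9) * level)) - 1);
}

int main(void)
{
	/* gva 0x00300000 sits in the second half of its guest page table,
	 * so its level-1 shadow page gets quadrant 1. */
	printf("q(level 1) = %u\n", quadrant(0x00300000, 1));
	printf("q(level 2) = %u\n", quadrant(0xc0000000, 2));
	return 0;
}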
639 | |||
640 | static void kvm_mmu_page_unlink_children(struct kvm *kvm, | ||
641 | struct kvm_mmu_page *page) | ||
642 | { | ||
643 | unsigned i; | ||
644 | u64 *pt; | ||
645 | u64 ent; | ||
646 | |||
647 | pt = page->spt; | ||
648 | |||
649 | if (page->role.level == PT_PAGE_TABLE_LEVEL) { | ||
650 | for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { | ||
651 | if (pt[i] & PT_PRESENT_MASK) | ||
652 | rmap_remove(&pt[i]); | ||
653 | pt[i] = 0; | ||
654 | } | ||
655 | kvm_flush_remote_tlbs(kvm); | ||
656 | return; | ||
657 | } | ||
658 | |||
659 | for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { | ||
660 | ent = pt[i]; | ||
661 | |||
662 | pt[i] = 0; | ||
663 | if (!(ent & PT_PRESENT_MASK)) | ||
664 | continue; | ||
665 | ent &= PT64_BASE_ADDR_MASK; | ||
666 | mmu_page_remove_parent_pte(page_header(ent), &pt[i]); | ||
667 | } | ||
668 | kvm_flush_remote_tlbs(kvm); | ||
669 | } | ||
670 | |||
671 | static void kvm_mmu_put_page(struct kvm_mmu_page *page, | ||
672 | u64 *parent_pte) | ||
673 | { | ||
674 | mmu_page_remove_parent_pte(page, parent_pte); | ||
675 | } | ||
676 | |||
677 | static void kvm_mmu_zap_page(struct kvm *kvm, | ||
678 | struct kvm_mmu_page *page) | ||
679 | { | ||
680 | u64 *parent_pte; | ||
681 | |||
682 | while (page->multimapped || page->parent_pte) { | ||
683 | if (!page->multimapped) | ||
684 | parent_pte = page->parent_pte; | ||
685 | else { | ||
686 | struct kvm_pte_chain *chain; | ||
687 | |||
688 | chain = container_of(page->parent_ptes.first, | ||
689 | struct kvm_pte_chain, link); | ||
690 | parent_pte = chain->parent_ptes[0]; | ||
691 | } | ||
692 | BUG_ON(!parent_pte); | ||
693 | kvm_mmu_put_page(page, parent_pte); | ||
694 | set_shadow_pte(parent_pte, 0); | ||
695 | } | ||
696 | kvm_mmu_page_unlink_children(kvm, page); | ||
697 | if (!page->root_count) { | ||
698 | hlist_del(&page->hash_link); | ||
699 | kvm_mmu_free_page(kvm, page); | ||
700 | } else | ||
701 | list_move(&page->link, &kvm->active_mmu_pages); | ||
702 | } | ||
703 | |||
704 | static int kvm_mmu_unprotect_page(struct kvm_vcpu *vcpu, gfn_t gfn) | ||
705 | { | ||
706 | unsigned index; | ||
707 | struct hlist_head *bucket; | ||
708 | struct kvm_mmu_page *page; | ||
709 | struct hlist_node *node, *n; | ||
710 | int r; | ||
711 | |||
712 | pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn); | ||
713 | r = 0; | ||
714 | index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES; | ||
715 | bucket = &vcpu->kvm->mmu_page_hash[index]; | ||
716 | hlist_for_each_entry_safe(page, node, n, bucket, hash_link) | ||
717 | if (page->gfn == gfn && !page->role.metaphysical) { | ||
718 | pgprintk("%s: gfn %lx role %x\n", __FUNCTION__, gfn, | ||
719 | page->role.word); | ||
720 | kvm_mmu_zap_page(vcpu->kvm, page); | ||
721 | r = 1; | ||
722 | } | ||
723 | return r; | ||
724 | } | ||
725 | |||
726 | static void mmu_unshadow(struct kvm_vcpu *vcpu, gfn_t gfn) | ||
727 | { | ||
728 | struct kvm_mmu_page *page; | ||
729 | |||
730 | while ((page = kvm_mmu_lookup_page(vcpu, gfn)) != NULL) { | ||
731 | pgprintk("%s: zap %lx %x\n", | ||
732 | __FUNCTION__, gfn, page->role.word); | ||
733 | kvm_mmu_zap_page(vcpu->kvm, page); | ||
734 | } | ||
735 | } | ||
736 | |||
737 | static void page_header_update_slot(struct kvm *kvm, void *pte, gpa_t gpa) | ||
738 | { | ||
739 | int slot = memslot_id(kvm, gfn_to_memslot(kvm, gpa >> PAGE_SHIFT)); | ||
740 | struct kvm_mmu_page *page_head = page_header(__pa(pte)); | ||
741 | |||
742 | __set_bit(slot, &page_head->slot_bitmap); | ||
743 | } | ||
744 | |||
745 | hpa_t safe_gpa_to_hpa(struct kvm_vcpu *vcpu, gpa_t gpa) | ||
746 | { | ||
747 | hpa_t hpa = gpa_to_hpa(vcpu, gpa); | ||
748 | |||
749 | return is_error_hpa(hpa) ? bad_page_address | (gpa & ~PAGE_MASK) : hpa; | ||
750 | } | ||
751 | |||
752 | hpa_t gpa_to_hpa(struct kvm_vcpu *vcpu, gpa_t gpa) | ||
753 | { | ||
754 | struct page *page; | ||
755 | |||
756 | ASSERT((gpa & HPA_ERR_MASK) == 0); | ||
757 | page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); | ||
758 | if (!page) | ||
759 | return gpa | HPA_ERR_MASK; | ||
760 | return ((hpa_t)page_to_pfn(page) << PAGE_SHIFT) | ||
761 | | (gpa & (PAGE_SIZE-1)); | ||
762 | } | ||
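
[Editor's note: gpa_to_hpa() above composes the host physical address by keeping the low 12 offset bits of the gpa and substituting the host frame number. A one-line arithmetic check, 4 KiB pages assumed:]

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t gpa = 0x12345678, host_pfn = 0x9abcd;
	uint64_t hpa = (host_pfn << 12) | (gpa & 0xfff);

	printf("hpa = 0x%llx\n", (unsigned long long)hpa); /* 0x9abcd678 */
	return 0;
}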
763 | |||
764 | hpa_t gva_to_hpa(struct kvm_vcpu *vcpu, gva_t gva) | ||
765 | { | ||
766 | gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, gva); | ||
767 | |||
768 | if (gpa == UNMAPPED_GVA) | ||
769 | return UNMAPPED_GVA; | ||
770 | return gpa_to_hpa(vcpu, gpa); | ||
771 | } | ||
772 | |||
773 | struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva) | ||
774 | { | ||
775 | gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, gva); | ||
776 | |||
777 | if (gpa == UNMAPPED_GVA) | ||
778 | return NULL; | ||
779 | return pfn_to_page(gpa_to_hpa(vcpu, gpa) >> PAGE_SHIFT); | ||
780 | } | ||
781 | |||
782 | static void nonpaging_new_cr3(struct kvm_vcpu *vcpu) | ||
783 | { | ||
784 | } | ||
785 | |||
786 | static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, hpa_t p) | ||
787 | { | ||
788 | int level = PT32E_ROOT_LEVEL; | ||
789 | hpa_t table_addr = vcpu->mmu.root_hpa; | ||
790 | |||
791 | for (; ; level--) { | ||
792 | u32 index = PT64_INDEX(v, level); | ||
793 | u64 *table; | ||
794 | u64 pte; | ||
795 | |||
796 | ASSERT(VALID_PAGE(table_addr)); | ||
797 | table = __va(table_addr); | ||
798 | |||
799 | if (level == 1) { | ||
800 | pte = table[index]; | ||
801 | if (is_present_pte(pte) && is_writeble_pte(pte)) | ||
802 | return 0; | ||
803 | mark_page_dirty(vcpu->kvm, v >> PAGE_SHIFT); | ||
804 | page_header_update_slot(vcpu->kvm, table, v); | ||
805 | table[index] = p | PT_PRESENT_MASK | PT_WRITABLE_MASK | | ||
806 | PT_USER_MASK; | ||
807 | rmap_add(vcpu, &table[index]); | ||
808 | return 0; | ||
809 | } | ||
810 | |||
811 | if (table[index] == 0) { | ||
812 | struct kvm_mmu_page *new_table; | ||
813 | gfn_t pseudo_gfn; | ||
814 | |||
815 | pseudo_gfn = (v & PT64_DIR_BASE_ADDR_MASK) | ||
816 | >> PAGE_SHIFT; | ||
817 | new_table = kvm_mmu_get_page(vcpu, pseudo_gfn, | ||
818 | v, level - 1, | ||
819 | 1, 0, &table[index]); | ||
820 | if (!new_table) { | ||
821 | pgprintk("nonpaging_map: ENOMEM\n"); | ||
822 | return -ENOMEM; | ||
823 | } | ||
824 | |||
825 | table[index] = __pa(new_table->spt) | PT_PRESENT_MASK | ||
826 | | PT_WRITABLE_MASK | PT_USER_MASK; | ||
827 | } | ||
828 | table_addr = table[index] & PT64_BASE_ADDR_MASK; | ||
829 | } | ||
830 | } | ||
831 | |||
832 | static void mmu_free_roots(struct kvm_vcpu *vcpu) | ||
833 | { | ||
834 | int i; | ||
835 | struct kvm_mmu_page *page; | ||
836 | |||
837 | if (!VALID_PAGE(vcpu->mmu.root_hpa)) | ||
838 | return; | ||
839 | #ifdef CONFIG_X86_64 | ||
840 | if (vcpu->mmu.shadow_root_level == PT64_ROOT_LEVEL) { | ||
841 | hpa_t root = vcpu->mmu.root_hpa; | ||
842 | |||
843 | page = page_header(root); | ||
844 | --page->root_count; | ||
845 | vcpu->mmu.root_hpa = INVALID_PAGE; | ||
846 | return; | ||
847 | } | ||
848 | #endif | ||
849 | for (i = 0; i < 4; ++i) { | ||
850 | hpa_t root = vcpu->mmu.pae_root[i]; | ||
851 | |||
852 | if (root) { | ||
853 | root &= PT64_BASE_ADDR_MASK; | ||
854 | page = page_header(root); | ||
855 | --page->root_count; | ||
856 | } | ||
857 | vcpu->mmu.pae_root[i] = INVALID_PAGE; | ||
858 | } | ||
859 | vcpu->mmu.root_hpa = INVALID_PAGE; | ||
860 | } | ||
861 | |||
862 | static void mmu_alloc_roots(struct kvm_vcpu *vcpu) | ||
863 | { | ||
864 | int i; | ||
865 | gfn_t root_gfn; | ||
866 | struct kvm_mmu_page *page; | ||
867 | |||
868 | root_gfn = vcpu->cr3 >> PAGE_SHIFT; | ||
869 | |||
870 | #ifdef CONFIG_X86_64 | ||
871 | if (vcpu->mmu.shadow_root_level == PT64_ROOT_LEVEL) { | ||
872 | hpa_t root = vcpu->mmu.root_hpa; | ||
873 | |||
874 | ASSERT(!VALID_PAGE(root)); | ||
875 | page = kvm_mmu_get_page(vcpu, root_gfn, 0, | ||
876 | PT64_ROOT_LEVEL, 0, 0, NULL); | ||
877 | root = __pa(page->spt); | ||
878 | ++page->root_count; | ||
879 | vcpu->mmu.root_hpa = root; | ||
880 | return; | ||
881 | } | ||
882 | #endif | ||
883 | for (i = 0; i < 4; ++i) { | ||
884 | hpa_t root = vcpu->mmu.pae_root[i]; | ||
885 | |||
886 | ASSERT(!VALID_PAGE(root)); | ||
887 | if (vcpu->mmu.root_level == PT32E_ROOT_LEVEL) { | ||
888 | if (!is_present_pte(vcpu->pdptrs[i])) { | ||
889 | vcpu->mmu.pae_root[i] = 0; | ||
890 | continue; | ||
891 | } | ||
892 | root_gfn = vcpu->pdptrs[i] >> PAGE_SHIFT; | ||
893 | } else if (vcpu->mmu.root_level == 0) | ||
894 | root_gfn = 0; | ||
895 | page = kvm_mmu_get_page(vcpu, root_gfn, i << 30, | ||
896 | PT32_ROOT_LEVEL, !is_paging(vcpu), | ||
897 | 0, NULL); | ||
898 | root = __pa(page->spt); | ||
899 | ++page->root_count; | ||
900 | vcpu->mmu.pae_root[i] = root | PT_PRESENT_MASK; | ||
901 | } | ||
902 | vcpu->mmu.root_hpa = __pa(vcpu->mmu.pae_root); | ||
903 | } | ||
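
In the PAE path each pae_root[i] shadows one guest PDPTR, i.e. one 1 GiB quarter of the address space; that is why `i << 30` is passed as the shadowed address above. A sketch of the resulting lookup, with a hypothetical fault address:

        /* Illustration: which PAE root covers a given guest virtual address? */
        gva_t gva = 0x80001000;
        int idx = (gva >> 30) & 3;                      /* == 2: third 1 GiB region */
        hpa_t root = vcpu->mmu.pae_root[idx] & PT64_BASE_ADDR_MASK;
        /* the shadow page-table walk for this fault starts at 'root' */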
904 | |||
905 | static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr) | ||
906 | { | ||
907 | return vaddr; | ||
908 | } | ||
909 | |||
910 | static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, | ||
911 | u32 error_code) | ||
912 | { | ||
913 | gpa_t addr = gva; | ||
914 | hpa_t paddr; | ||
915 | int r; | ||
916 | |||
917 | r = mmu_topup_memory_caches(vcpu); | ||
918 | if (r) | ||
919 | return r; | ||
920 | |||
921 | ASSERT(vcpu); | ||
922 | ASSERT(VALID_PAGE(vcpu->mmu.root_hpa)); | ||
923 | |||
924 | |||
925 | 	paddr = gpa_to_hpa(vcpu, addr & PT64_BASE_ADDR_MASK); | ||
926 | |||
927 | if (is_error_hpa(paddr)) | ||
928 | return 1; | ||
929 | |||
930 | return nonpaging_map(vcpu, addr & PAGE_MASK, paddr); | ||
931 | } | ||
932 | |||
933 | static void nonpaging_free(struct kvm_vcpu *vcpu) | ||
934 | { | ||
935 | mmu_free_roots(vcpu); | ||
936 | } | ||
937 | |||
938 | static int nonpaging_init_context(struct kvm_vcpu *vcpu) | ||
939 | { | ||
940 | struct kvm_mmu *context = &vcpu->mmu; | ||
941 | |||
942 | context->new_cr3 = nonpaging_new_cr3; | ||
943 | context->page_fault = nonpaging_page_fault; | ||
944 | context->gva_to_gpa = nonpaging_gva_to_gpa; | ||
945 | context->free = nonpaging_free; | ||
946 | context->root_level = 0; | ||
947 | context->shadow_root_level = PT32E_ROOT_LEVEL; | ||
948 | context->root_hpa = INVALID_PAGE; | ||
949 | return 0; | ||
950 | } | ||
951 | |||
952 | static void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu) | ||
953 | { | ||
954 | ++vcpu->stat.tlb_flush; | ||
955 | kvm_x86_ops->tlb_flush(vcpu); | ||
956 | } | ||
957 | |||
958 | static void paging_new_cr3(struct kvm_vcpu *vcpu) | ||
959 | { | ||
960 | pgprintk("%s: cr3 %lx\n", __FUNCTION__, vcpu->cr3); | ||
961 | mmu_free_roots(vcpu); | ||
962 | } | ||
963 | |||
964 | static void inject_page_fault(struct kvm_vcpu *vcpu, | ||
965 | u64 addr, | ||
966 | u32 err_code) | ||
967 | { | ||
968 | kvm_x86_ops->inject_page_fault(vcpu, addr, err_code); | ||
969 | } | ||
970 | |||
971 | static void paging_free(struct kvm_vcpu *vcpu) | ||
972 | { | ||
973 | nonpaging_free(vcpu); | ||
974 | } | ||
975 | |||
976 | #define PTTYPE 64 | ||
977 | #include "paging_tmpl.h" | ||
978 | #undef PTTYPE | ||
979 | |||
980 | #define PTTYPE 32 | ||
981 | #include "paging_tmpl.h" | ||
982 | #undef PTTYPE | ||
983 | |||
984 | static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level) | ||
985 | { | ||
986 | struct kvm_mmu *context = &vcpu->mmu; | ||
987 | |||
988 | ASSERT(is_pae(vcpu)); | ||
989 | context->new_cr3 = paging_new_cr3; | ||
990 | context->page_fault = paging64_page_fault; | ||
991 | context->gva_to_gpa = paging64_gva_to_gpa; | ||
992 | context->free = paging_free; | ||
993 | context->root_level = level; | ||
994 | context->shadow_root_level = level; | ||
995 | context->root_hpa = INVALID_PAGE; | ||
996 | return 0; | ||
997 | } | ||
998 | |||
999 | static int paging64_init_context(struct kvm_vcpu *vcpu) | ||
1000 | { | ||
1001 | return paging64_init_context_common(vcpu, PT64_ROOT_LEVEL); | ||
1002 | } | ||
1003 | |||
1004 | static int paging32_init_context(struct kvm_vcpu *vcpu) | ||
1005 | { | ||
1006 | struct kvm_mmu *context = &vcpu->mmu; | ||
1007 | |||
1008 | context->new_cr3 = paging_new_cr3; | ||
1009 | context->page_fault = paging32_page_fault; | ||
1010 | context->gva_to_gpa = paging32_gva_to_gpa; | ||
1011 | context->free = paging_free; | ||
1012 | context->root_level = PT32_ROOT_LEVEL; | ||
1013 | context->shadow_root_level = PT32E_ROOT_LEVEL; | ||
1014 | context->root_hpa = INVALID_PAGE; | ||
1015 | return 0; | ||
1016 | } | ||
1017 | |||
1018 | static int paging32E_init_context(struct kvm_vcpu *vcpu) | ||
1019 | { | ||
1020 | return paging64_init_context_common(vcpu, PT32E_ROOT_LEVEL); | ||
1021 | } | ||
1022 | |||
1023 | static int init_kvm_mmu(struct kvm_vcpu *vcpu) | ||
1024 | { | ||
1025 | ASSERT(vcpu); | ||
1026 | ASSERT(!VALID_PAGE(vcpu->mmu.root_hpa)); | ||
1027 | |||
1028 | if (!is_paging(vcpu)) | ||
1029 | return nonpaging_init_context(vcpu); | ||
1030 | else if (is_long_mode(vcpu)) | ||
1031 | return paging64_init_context(vcpu); | ||
1032 | else if (is_pae(vcpu)) | ||
1033 | return paging32E_init_context(vcpu); | ||
1034 | else | ||
1035 | return paging32_init_context(vcpu); | ||
1036 | } | ||
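
The dispatch mirrors the guest's paging mode. Note that even the nonpaging and plain 32-bit cases shadow with a PAE (3-level) table, which is why their shadow_root_level is PT32E_ROOT_LEVEL. Summarizing the four init functions above:

        guest mode                context     root_level        shadow_root_level
        CR0.PG=0                  nonpaging   0                 PT32E_ROOT_LEVEL
        long mode                 paging64    PT64_ROOT_LEVEL   PT64_ROOT_LEVEL
        CR0.PG=1, CR4.PAE=1       paging32E   PT32E_ROOT_LEVEL  PT32E_ROOT_LEVEL
        CR0.PG=1, CR4.PAE=0       paging32    PT32_ROOT_LEVEL   PT32E_ROOT_LEVEL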
1037 | |||
1038 | static void destroy_kvm_mmu(struct kvm_vcpu *vcpu) | ||
1039 | { | ||
1040 | ASSERT(vcpu); | ||
1041 | if (VALID_PAGE(vcpu->mmu.root_hpa)) { | ||
1042 | vcpu->mmu.free(vcpu); | ||
1043 | vcpu->mmu.root_hpa = INVALID_PAGE; | ||
1044 | } | ||
1045 | } | ||
1046 | |||
1047 | int kvm_mmu_reset_context(struct kvm_vcpu *vcpu) | ||
1048 | { | ||
1049 | destroy_kvm_mmu(vcpu); | ||
1050 | return init_kvm_mmu(vcpu); | ||
1051 | } | ||
1052 | EXPORT_SYMBOL_GPL(kvm_mmu_reset_context); | ||
1053 | |||
1054 | int kvm_mmu_load(struct kvm_vcpu *vcpu) | ||
1055 | { | ||
1056 | int r; | ||
1057 | |||
1058 | mutex_lock(&vcpu->kvm->lock); | ||
1059 | r = mmu_topup_memory_caches(vcpu); | ||
1060 | if (r) | ||
1061 | goto out; | ||
1062 | mmu_alloc_roots(vcpu); | ||
1063 | kvm_x86_ops->set_cr3(vcpu, vcpu->mmu.root_hpa); | ||
1064 | kvm_mmu_flush_tlb(vcpu); | ||
1065 | out: | ||
1066 | mutex_unlock(&vcpu->kvm->lock); | ||
1067 | return r; | ||
1068 | } | ||
1069 | EXPORT_SYMBOL_GPL(kvm_mmu_load); | ||
1070 | |||
1071 | void kvm_mmu_unload(struct kvm_vcpu *vcpu) | ||
1072 | { | ||
1073 | mmu_free_roots(vcpu); | ||
1074 | } | ||
1075 | |||
1076 | static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu, | ||
1077 | struct kvm_mmu_page *page, | ||
1078 | u64 *spte) | ||
1079 | { | ||
1080 | u64 pte; | ||
1081 | struct kvm_mmu_page *child; | ||
1082 | |||
1083 | pte = *spte; | ||
1084 | if (is_present_pte(pte)) { | ||
1085 | if (page->role.level == PT_PAGE_TABLE_LEVEL) | ||
1086 | rmap_remove(spte); | ||
1087 | else { | ||
1088 | child = page_header(pte & PT64_BASE_ADDR_MASK); | ||
1089 | mmu_page_remove_parent_pte(child, spte); | ||
1090 | } | ||
1091 | } | ||
1092 | set_shadow_pte(spte, 0); | ||
1093 | kvm_flush_remote_tlbs(vcpu->kvm); | ||
1094 | } | ||
1095 | |||
1096 | static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu, | ||
1097 | struct kvm_mmu_page *page, | ||
1098 | u64 *spte, | ||
1099 | const void *new, int bytes) | ||
1100 | { | ||
1101 | if (page->role.level != PT_PAGE_TABLE_LEVEL) | ||
1102 | return; | ||
1103 | |||
1104 | if (page->role.glevels == PT32_ROOT_LEVEL) | ||
1105 | paging32_update_pte(vcpu, page, spte, new, bytes); | ||
1106 | else | ||
1107 | paging64_update_pte(vcpu, page, spte, new, bytes); | ||
1108 | } | ||
1109 | |||
1110 | void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, | ||
1111 | const u8 *new, int bytes) | ||
1112 | { | ||
1113 | gfn_t gfn = gpa >> PAGE_SHIFT; | ||
1114 | struct kvm_mmu_page *page; | ||
1115 | struct hlist_node *node, *n; | ||
1116 | struct hlist_head *bucket; | ||
1117 | unsigned index; | ||
1118 | u64 *spte; | ||
1119 | unsigned offset = offset_in_page(gpa); | ||
1120 | unsigned pte_size; | ||
1121 | unsigned page_offset; | ||
1122 | unsigned misaligned; | ||
1123 | unsigned quadrant; | ||
1124 | int level; | ||
1125 | int flooded = 0; | ||
1126 | int npte; | ||
1127 | |||
1128 | pgprintk("%s: gpa %llx bytes %d\n", __FUNCTION__, gpa, bytes); | ||
1129 | if (gfn == vcpu->last_pt_write_gfn) { | ||
1130 | ++vcpu->last_pt_write_count; | ||
1131 | if (vcpu->last_pt_write_count >= 3) | ||
1132 | flooded = 1; | ||
1133 | } else { | ||
1134 | vcpu->last_pt_write_gfn = gfn; | ||
1135 | vcpu->last_pt_write_count = 1; | ||
1136 | } | ||
1137 | index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES; | ||
1138 | bucket = &vcpu->kvm->mmu_page_hash[index]; | ||
1139 | hlist_for_each_entry_safe(page, node, n, bucket, hash_link) { | ||
1140 | if (page->gfn != gfn || page->role.metaphysical) | ||
1141 | continue; | ||
1142 | pte_size = page->role.glevels == PT32_ROOT_LEVEL ? 4 : 8; | ||
1143 | misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1); | ||
1144 | misaligned |= bytes < 4; | ||
1145 | if (misaligned || flooded) { | ||
1146 | /* | ||
1147 | * Misaligned accesses are too much trouble to fix | ||
1148 | * up; also, they usually indicate a page is not used | ||
1149 | * as a page table. | ||
1150 | * | ||
1151 | * If we're seeing too many writes to a page, | ||
1152 | * it may no longer be a page table, or we may be | ||
1153 | * forking, in which case it is better to unmap the | ||
1154 | * page. | ||
1155 | */ | ||
1156 | pgprintk("misaligned: gpa %llx bytes %d role %x\n", | ||
1157 | gpa, bytes, page->role.word); | ||
1158 | kvm_mmu_zap_page(vcpu->kvm, page); | ||
1159 | continue; | ||
1160 | } | ||
1161 | page_offset = offset; | ||
1162 | level = page->role.level; | ||
1163 | npte = 1; | ||
1164 | if (page->role.glevels == PT32_ROOT_LEVEL) { | ||
1165 | page_offset <<= 1; /* 32->64 */ | ||
1166 | /* | ||
1167 | * A 32-bit pde maps 4MB while the shadow pdes map | ||
1168 | * only 2MB. So we need to double the offset again | ||
1169 | * and zap two pdes instead of one. | ||
1170 | */ | ||
1171 | if (level == PT32_ROOT_LEVEL) { | ||
1172 | page_offset &= ~7; /* kill rounding error */ | ||
1173 | page_offset <<= 1; | ||
1174 | npte = 2; | ||
1175 | } | ||
1176 | quadrant = page_offset >> PAGE_SHIFT; | ||
1177 | page_offset &= ~PAGE_MASK; | ||
1178 | if (quadrant != page->role.quadrant) | ||
1179 | continue; | ||
1180 | } | ||
1181 | spte = &page->spt[page_offset / sizeof(*spte)]; | ||
1182 | while (npte--) { | ||
1183 | mmu_pte_write_zap_pte(vcpu, page, spte); | ||
1184 | mmu_pte_write_new_pte(vcpu, page, spte, new, bytes); | ||
1185 | ++spte; | ||
1186 | } | ||
1187 | } | ||
1188 | } | ||
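
The misalignment test is worth unpacking: offset ^ (offset + bytes - 1) has a bit set at or above log2(pte_size) exactly when the write does not fall within one naturally aligned guest pte. With hypothetical values for a 64-bit guest (pte_size == 8):

        /* A 4-byte write at page offset 6 straddles two 8-byte ptes: */
        unsigned offset = 6, bytes = 4, pte_size = 8;
        unsigned misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
        /* (6 ^ 9) == 15; 15 & ~7 == 8 != 0, so the page is zapped rather
         * than patched.  An aligned 8-byte write at offset 8 gives
         * (8 ^ 15) & ~7 == 0 and is fixed up in place. */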
1189 | |||
1190 | int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva) | ||
1191 | { | ||
1192 | gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, gva); | ||
1193 | |||
1194 | return kvm_mmu_unprotect_page(vcpu, gpa >> PAGE_SHIFT); | ||
1195 | } | ||
1196 | |||
1197 | void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) | ||
1198 | { | ||
1199 | while (vcpu->kvm->n_free_mmu_pages < KVM_REFILL_PAGES) { | ||
1200 | struct kvm_mmu_page *page; | ||
1201 | |||
1202 | page = container_of(vcpu->kvm->active_mmu_pages.prev, | ||
1203 | struct kvm_mmu_page, link); | ||
1204 | kvm_mmu_zap_page(vcpu->kvm, page); | ||
1205 | } | ||
1206 | } | ||
1207 | |||
1208 | static void free_mmu_pages(struct kvm_vcpu *vcpu) | ||
1209 | { | ||
1210 | struct kvm_mmu_page *page; | ||
1211 | |||
1212 | while (!list_empty(&vcpu->kvm->active_mmu_pages)) { | ||
1213 | page = container_of(vcpu->kvm->active_mmu_pages.next, | ||
1214 | struct kvm_mmu_page, link); | ||
1215 | kvm_mmu_zap_page(vcpu->kvm, page); | ||
1216 | } | ||
1217 | free_page((unsigned long)vcpu->mmu.pae_root); | ||
1218 | } | ||
1219 | |||
1220 | static int alloc_mmu_pages(struct kvm_vcpu *vcpu) | ||
1221 | { | ||
1222 | struct page *page; | ||
1223 | int i; | ||
1224 | |||
1225 | ASSERT(vcpu); | ||
1226 | |||
1227 | vcpu->kvm->n_free_mmu_pages = KVM_NUM_MMU_PAGES; | ||
1228 | |||
1229 | /* | ||
1230 | * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64. | ||
1231 | * Therefore we need to allocate shadow page tables in the first | ||
1232 | * 4GB of memory, which happens to fit the DMA32 zone. | ||
1233 | */ | ||
1234 | page = alloc_page(GFP_KERNEL | __GFP_DMA32); | ||
1235 | if (!page) | ||
1236 | goto error_1; | ||
1237 | vcpu->mmu.pae_root = page_address(page); | ||
1238 | for (i = 0; i < 4; ++i) | ||
1239 | vcpu->mmu.pae_root[i] = INVALID_PAGE; | ||
1240 | |||
1241 | return 0; | ||
1242 | |||
1243 | error_1: | ||
1244 | free_mmu_pages(vcpu); | ||
1245 | return -ENOMEM; | ||
1246 | } | ||
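
One consequence of the __GFP_DMA32 constraint described in the comment: the PDPT handed to hardware through a 32-bit cr3 must itself live below 4 GiB. A hypothetical assertion (illustration only, not part of the code) makes the invariant explicit:

        /* Illustration: the invariant the __GFP_DMA32 allocation buys us. */
        BUG_ON(__pa(vcpu->mmu.pae_root) >= (1ULL << 32));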
1247 | |||
1248 | int kvm_mmu_create(struct kvm_vcpu *vcpu) | ||
1249 | { | ||
1250 | ASSERT(vcpu); | ||
1251 | ASSERT(!VALID_PAGE(vcpu->mmu.root_hpa)); | ||
1252 | |||
1253 | return alloc_mmu_pages(vcpu); | ||
1254 | } | ||
1255 | |||
1256 | int kvm_mmu_setup(struct kvm_vcpu *vcpu) | ||
1257 | { | ||
1258 | ASSERT(vcpu); | ||
1259 | ASSERT(!VALID_PAGE(vcpu->mmu.root_hpa)); | ||
1260 | |||
1261 | return init_kvm_mmu(vcpu); | ||
1262 | } | ||
1263 | |||
1264 | void kvm_mmu_destroy(struct kvm_vcpu *vcpu) | ||
1265 | { | ||
1266 | ASSERT(vcpu); | ||
1267 | |||
1268 | destroy_kvm_mmu(vcpu); | ||
1269 | free_mmu_pages(vcpu); | ||
1270 | mmu_free_memory_caches(vcpu); | ||
1271 | } | ||
1272 | |||
1273 | void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) | ||
1274 | { | ||
1275 | struct kvm_mmu_page *page; | ||
1276 | |||
1277 | list_for_each_entry(page, &kvm->active_mmu_pages, link) { | ||
1278 | int i; | ||
1279 | u64 *pt; | ||
1280 | |||
1281 | if (!test_bit(slot, &page->slot_bitmap)) | ||
1282 | continue; | ||
1283 | |||
1284 | pt = page->spt; | ||
1285 | for (i = 0; i < PT64_ENT_PER_PAGE; ++i) | ||
1286 | /* avoid RMW */ | ||
1287 | if (pt[i] & PT_WRITABLE_MASK) { | ||
1288 | rmap_remove(&pt[i]); | ||
1289 | pt[i] &= ~PT_WRITABLE_MASK; | ||
1290 | } | ||
1291 | } | ||
1292 | } | ||
1293 | |||
1294 | void kvm_mmu_zap_all(struct kvm *kvm) | ||
1295 | { | ||
1296 | struct kvm_mmu_page *page, *node; | ||
1297 | |||
1298 | list_for_each_entry_safe(page, node, &kvm->active_mmu_pages, link) | ||
1299 | kvm_mmu_zap_page(kvm, page); | ||
1300 | |||
1301 | kvm_flush_remote_tlbs(kvm); | ||
1302 | } | ||
1303 | |||
1304 | void kvm_mmu_module_exit(void) | ||
1305 | { | ||
1306 | if (pte_chain_cache) | ||
1307 | kmem_cache_destroy(pte_chain_cache); | ||
1308 | if (rmap_desc_cache) | ||
1309 | kmem_cache_destroy(rmap_desc_cache); | ||
1310 | if (mmu_page_header_cache) | ||
1311 | kmem_cache_destroy(mmu_page_header_cache); | ||
1312 | } | ||
1313 | |||
1314 | int kvm_mmu_module_init(void) | ||
1315 | { | ||
1316 | pte_chain_cache = kmem_cache_create("kvm_pte_chain", | ||
1317 | sizeof(struct kvm_pte_chain), | ||
1318 | 0, 0, NULL); | ||
1319 | if (!pte_chain_cache) | ||
1320 | goto nomem; | ||
1321 | rmap_desc_cache = kmem_cache_create("kvm_rmap_desc", | ||
1322 | sizeof(struct kvm_rmap_desc), | ||
1323 | 0, 0, NULL); | ||
1324 | if (!rmap_desc_cache) | ||
1325 | goto nomem; | ||
1326 | |||
1327 | mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header", | ||
1328 | sizeof(struct kvm_mmu_page), | ||
1329 | 0, 0, NULL); | ||
1330 | if (!mmu_page_header_cache) | ||
1331 | goto nomem; | ||
1332 | |||
1333 | return 0; | ||
1334 | |||
1335 | nomem: | ||
1336 | kvm_mmu_module_exit(); | ||
1337 | return -ENOMEM; | ||
1338 | } | ||
1339 | |||
1340 | #ifdef AUDIT | ||
1341 | |||
1342 | static const char *audit_msg; | ||
1343 | |||
1344 | static gva_t canonicalize(gva_t gva) | ||
1345 | { | ||
1346 | #ifdef CONFIG_X86_64 | ||
1347 | gva = (long long)(gva << 16) >> 16; | ||
1348 | #endif | ||
1349 | return gva; | ||
1350 | } | ||
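
The shift pair sign-extends bit 47 into bits 48-63, yielding a canonical x86-64 address. For example, with a hypothetical non-canonical input:

        /* gva = 0x0000800000000000  (bit 47 set, upper bits clear)  */
        /* gva << 16             == 0x8000000000000000               */
        /* arithmetic >> 16      == 0xffff800000000000  (canonical)  */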
1351 | |||
1352 | static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte, | ||
1353 | gva_t va, int level) | ||
1354 | { | ||
1355 | u64 *pt = __va(page_pte & PT64_BASE_ADDR_MASK); | ||
1356 | int i; | ||
1357 | gva_t va_delta = 1ul << (PAGE_SHIFT + 9 * (level - 1)); | ||
1358 | |||
1359 | for (i = 0; i < PT64_ENT_PER_PAGE; ++i, va += va_delta) { | ||
1360 | u64 ent = pt[i]; | ||
1361 | |||
1362 | if (!(ent & PT_PRESENT_MASK)) | ||
1363 | continue; | ||
1364 | |||
1365 | va = canonicalize(va); | ||
1366 | if (level > 1) | ||
1367 | audit_mappings_page(vcpu, ent, va, level - 1); | ||
1368 | else { | ||
1369 | gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, va); | ||
1370 | hpa_t hpa = gpa_to_hpa(vcpu, gpa); | ||
1371 | |||
1372 | if ((ent & PT_PRESENT_MASK) | ||
1373 | && (ent & PT64_BASE_ADDR_MASK) != hpa) | ||
1374 | printk(KERN_ERR "audit error: (%s) levels %d" | ||
1375 | " gva %lx gpa %llx hpa %llx ent %llx\n", | ||
1376 | audit_msg, vcpu->mmu.root_level, | ||
1377 | va, gpa, hpa, ent); | ||
1378 | } | ||
1379 | } | ||
1380 | } | ||
1381 | |||
1382 | static void audit_mappings(struct kvm_vcpu *vcpu) | ||
1383 | { | ||
1384 | unsigned i; | ||
1385 | |||
1386 | if (vcpu->mmu.root_level == 4) | ||
1387 | audit_mappings_page(vcpu, vcpu->mmu.root_hpa, 0, 4); | ||
1388 | else | ||
1389 | for (i = 0; i < 4; ++i) | ||
1390 | if (vcpu->mmu.pae_root[i] & PT_PRESENT_MASK) | ||
1391 | audit_mappings_page(vcpu, | ||
1392 | vcpu->mmu.pae_root[i], | ||
1393 | i << 30, | ||
1394 | 2); | ||
1395 | } | ||
1396 | |||
1397 | static int count_rmaps(struct kvm_vcpu *vcpu) | ||
1398 | { | ||
1399 | int nmaps = 0; | ||
1400 | int i, j, k; | ||
1401 | |||
1402 | for (i = 0; i < KVM_MEMORY_SLOTS; ++i) { | ||
1403 | struct kvm_memory_slot *m = &vcpu->kvm->memslots[i]; | ||
1404 | struct kvm_rmap_desc *d; | ||
1405 | |||
1406 | for (j = 0; j < m->npages; ++j) { | ||
1407 | struct page *page = m->phys_mem[j]; | ||
1408 | |||
1409 | if (!page->private) | ||
1410 | continue; | ||
1411 | if (!(page->private & 1)) { | ||
1412 | ++nmaps; | ||
1413 | continue; | ||
1414 | } | ||
1415 | d = (struct kvm_rmap_desc *)(page->private & ~1ul); | ||
1416 | while (d) { | ||
1417 | for (k = 0; k < RMAP_EXT; ++k) | ||
1418 | if (d->shadow_ptes[k]) | ||
1419 | ++nmaps; | ||
1420 | else | ||
1421 | break; | ||
1422 | d = d->more; | ||
1423 | } | ||
1424 | } | ||
1425 | } | ||
1426 | return nmaps; | ||
1427 | } | ||
1428 | |||
1429 | static int count_writable_mappings(struct kvm_vcpu *vcpu) | ||
1430 | { | ||
1431 | int nmaps = 0; | ||
1432 | struct kvm_mmu_page *page; | ||
1433 | int i; | ||
1434 | |||
1435 | list_for_each_entry(page, &vcpu->kvm->active_mmu_pages, link) { | ||
1436 | u64 *pt = page->spt; | ||
1437 | |||
1438 | if (page->role.level != PT_PAGE_TABLE_LEVEL) | ||
1439 | continue; | ||
1440 | |||
1441 | for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { | ||
1442 | u64 ent = pt[i]; | ||
1443 | |||
1444 | if (!(ent & PT_PRESENT_MASK)) | ||
1445 | continue; | ||
1446 | if (!(ent & PT_WRITABLE_MASK)) | ||
1447 | continue; | ||
1448 | ++nmaps; | ||
1449 | } | ||
1450 | } | ||
1451 | return nmaps; | ||
1452 | } | ||
1453 | |||
1454 | static void audit_rmap(struct kvm_vcpu *vcpu) | ||
1455 | { | ||
1456 | int n_rmap = count_rmaps(vcpu); | ||
1457 | int n_actual = count_writable_mappings(vcpu); | ||
1458 | |||
1459 | if (n_rmap != n_actual) | ||
1460 | printk(KERN_ERR "%s: (%s) rmap %d actual %d\n", | ||
1461 | __FUNCTION__, audit_msg, n_rmap, n_actual); | ||
1462 | } | ||
1463 | |||
1464 | static void audit_write_protection(struct kvm_vcpu *vcpu) | ||
1465 | { | ||
1466 | struct kvm_mmu_page *page; | ||
1467 | |||
1468 | list_for_each_entry(page, &vcpu->kvm->active_mmu_pages, link) { | ||
1469 | hfn_t hfn; | ||
1470 | struct page *pg; | ||
1471 | |||
1472 | if (page->role.metaphysical) | ||
1473 | continue; | ||
1474 | |||
1475 | hfn = gpa_to_hpa(vcpu, (gpa_t)page->gfn << PAGE_SHIFT) | ||
1476 | >> PAGE_SHIFT; | ||
1477 | pg = pfn_to_page(hfn); | ||
1478 | if (pg->private) | ||
1479 | printk(KERN_ERR "%s: (%s) shadow page has writable" | ||
1480 | " mappings: gfn %lx role %x\n", | ||
1481 | __FUNCTION__, audit_msg, page->gfn, | ||
1482 | page->role.word); | ||
1483 | } | ||
1484 | } | ||
1485 | |||
1486 | static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) | ||
1487 | { | ||
1488 | int olddbg = dbg; | ||
1489 | |||
1490 | dbg = 0; | ||
1491 | audit_msg = msg; | ||
1492 | audit_rmap(vcpu); | ||
1493 | audit_write_protection(vcpu); | ||
1494 | audit_mappings(vcpu); | ||
1495 | dbg = olddbg; | ||
1496 | } | ||
1497 | |||
1498 | #endif | ||
diff --git a/drivers/kvm/paging_tmpl.h b/drivers/kvm/paging_tmpl.h deleted file mode 100644 index 6b094b44f8fb..000000000000 --- a/drivers/kvm/paging_tmpl.h +++ /dev/null | |||
@@ -1,511 +0,0 @@ | |||
1 | /* | ||
2 | * Kernel-based Virtual Machine driver for Linux | ||
3 | * | ||
4 | * This module enables machines with Intel VT-x extensions to run virtual | ||
5 | * machines without emulation or binary translation. | ||
6 | * | ||
7 | * MMU support | ||
8 | * | ||
9 | * Copyright (C) 2006 Qumranet, Inc. | ||
10 | * | ||
11 | * Authors: | ||
12 | * Yaniv Kamay <yaniv@qumranet.com> | ||
13 | * Avi Kivity <avi@qumranet.com> | ||
14 | * | ||
15 | * This work is licensed under the terms of the GNU GPL, version 2. See | ||
16 | * the COPYING file in the top-level directory. | ||
17 | * | ||
18 | */ | ||
19 | |||
20 | /* | ||
21 | * We need the mmu code to access both 32-bit and 64-bit guest ptes, | ||
22 | * so the code in this file is compiled twice, once per pte size. | ||
23 | */ | ||
24 | |||
25 | #if PTTYPE == 64 | ||
26 | #define pt_element_t u64 | ||
27 | #define guest_walker guest_walker64 | ||
28 | #define FNAME(name) paging##64_##name | ||
29 | #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK | ||
30 | #define PT_DIR_BASE_ADDR_MASK PT64_DIR_BASE_ADDR_MASK | ||
31 | #define PT_INDEX(addr, level) PT64_INDEX(addr, level) | ||
32 | #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) | ||
33 | #define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level) | ||
34 | #ifdef CONFIG_X86_64 | ||
35 | #define PT_MAX_FULL_LEVELS 4 | ||
36 | #else | ||
37 | #define PT_MAX_FULL_LEVELS 2 | ||
38 | #endif | ||
39 | #elif PTTYPE == 32 | ||
40 | #define pt_element_t u32 | ||
41 | #define guest_walker guest_walker32 | ||
42 | #define FNAME(name) paging##32_##name | ||
43 | #define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK | ||
44 | #define PT_DIR_BASE_ADDR_MASK PT32_DIR_BASE_ADDR_MASK | ||
45 | #define PT_INDEX(addr, level) PT32_INDEX(addr, level) | ||
46 | #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) | ||
47 | #define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level) | ||
48 | #define PT_MAX_FULL_LEVELS 2 | ||
49 | #else | ||
50 | #error Invalid PTTYPE value | ||
51 | #endif | ||
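
The FNAME() token-pasting is what lets mmu.c instantiate this template twice, each inclusion emitting an independent set of functions. Concretely, the including file does (as seen in mmu.c earlier in this diff):

        #define PTTYPE 64
        #include "paging_tmpl.h"  /* emits paging64_walk_addr(), paging64_page_fault(), ... */
        #undef PTTYPE

        #define PTTYPE 32
        #include "paging_tmpl.h"  /* emits paging32_walk_addr(), paging32_page_fault(), ... */
        #undef PTTYPE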
52 | |||
53 | /* | ||
54 | * The guest_walker structure emulates the behavior of the hardware page | ||
55 | * table walker. | ||
56 | */ | ||
57 | struct guest_walker { | ||
58 | int level; | ||
59 | gfn_t table_gfn[PT_MAX_FULL_LEVELS]; | ||
60 | pt_element_t *table; | ||
61 | pt_element_t pte; | ||
62 | pt_element_t *ptep; | ||
63 | struct page *page; | ||
64 | int index; | ||
65 | pt_element_t inherited_ar; | ||
66 | gfn_t gfn; | ||
67 | u32 error_code; | ||
68 | }; | ||
69 | |||
70 | /* | ||
71 | * Fetch a guest pte for a guest virtual address | ||
72 | */ | ||
73 | static int FNAME(walk_addr)(struct guest_walker *walker, | ||
74 | struct kvm_vcpu *vcpu, gva_t addr, | ||
75 | int write_fault, int user_fault, int fetch_fault) | ||
76 | { | ||
77 | hpa_t hpa; | ||
78 | struct kvm_memory_slot *slot; | ||
79 | pt_element_t *ptep; | ||
80 | pt_element_t root; | ||
81 | gfn_t table_gfn; | ||
82 | |||
83 | pgprintk("%s: addr %lx\n", __FUNCTION__, addr); | ||
84 | walker->level = vcpu->mmu.root_level; | ||
85 | walker->table = NULL; | ||
86 | walker->page = NULL; | ||
87 | walker->ptep = NULL; | ||
88 | root = vcpu->cr3; | ||
89 | #if PTTYPE == 64 | ||
90 | if (!is_long_mode(vcpu)) { | ||
91 | walker->ptep = &vcpu->pdptrs[(addr >> 30) & 3]; | ||
92 | root = *walker->ptep; | ||
93 | walker->pte = root; | ||
94 | if (!(root & PT_PRESENT_MASK)) | ||
95 | goto not_present; | ||
96 | --walker->level; | ||
97 | } | ||
98 | #endif | ||
99 | table_gfn = (root & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; | ||
100 | walker->table_gfn[walker->level - 1] = table_gfn; | ||
101 | pgprintk("%s: table_gfn[%d] %lx\n", __FUNCTION__, | ||
102 | walker->level - 1, table_gfn); | ||
103 | slot = gfn_to_memslot(vcpu->kvm, table_gfn); | ||
104 | hpa = safe_gpa_to_hpa(vcpu, root & PT64_BASE_ADDR_MASK); | ||
105 | walker->page = pfn_to_page(hpa >> PAGE_SHIFT); | ||
106 | walker->table = kmap_atomic(walker->page, KM_USER0); | ||
107 | |||
108 | ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) || | ||
109 | (vcpu->cr3 & CR3_NONPAE_RESERVED_BITS) == 0); | ||
110 | |||
111 | walker->inherited_ar = PT_USER_MASK | PT_WRITABLE_MASK; | ||
112 | |||
113 | for (;;) { | ||
114 | int index = PT_INDEX(addr, walker->level); | ||
115 | hpa_t paddr; | ||
116 | |||
117 | ptep = &walker->table[index]; | ||
118 | walker->index = index; | ||
119 | ASSERT(((unsigned long)walker->table & PAGE_MASK) == | ||
120 | ((unsigned long)ptep & PAGE_MASK)); | ||
121 | |||
122 | if (!is_present_pte(*ptep)) | ||
123 | goto not_present; | ||
124 | |||
125 | if (write_fault && !is_writeble_pte(*ptep)) | ||
126 | if (user_fault || is_write_protection(vcpu)) | ||
127 | goto access_error; | ||
128 | |||
129 | if (user_fault && !(*ptep & PT_USER_MASK)) | ||
130 | goto access_error; | ||
131 | |||
132 | #if PTTYPE == 64 | ||
133 | if (fetch_fault && is_nx(vcpu) && (*ptep & PT64_NX_MASK)) | ||
134 | goto access_error; | ||
135 | #endif | ||
136 | |||
137 | if (!(*ptep & PT_ACCESSED_MASK)) { | ||
138 | mark_page_dirty(vcpu->kvm, table_gfn); | ||
139 | *ptep |= PT_ACCESSED_MASK; | ||
140 | } | ||
141 | |||
142 | if (walker->level == PT_PAGE_TABLE_LEVEL) { | ||
143 | walker->gfn = (*ptep & PT_BASE_ADDR_MASK) | ||
144 | >> PAGE_SHIFT; | ||
145 | break; | ||
146 | } | ||
147 | |||
148 | if (walker->level == PT_DIRECTORY_LEVEL | ||
149 | && (*ptep & PT_PAGE_SIZE_MASK) | ||
150 | && (PTTYPE == 64 || is_pse(vcpu))) { | ||
151 | walker->gfn = (*ptep & PT_DIR_BASE_ADDR_MASK) | ||
152 | >> PAGE_SHIFT; | ||
153 | walker->gfn += PT_INDEX(addr, PT_PAGE_TABLE_LEVEL); | ||
154 | break; | ||
155 | } | ||
156 | |||
157 | walker->inherited_ar &= walker->table[index]; | ||
158 | table_gfn = (*ptep & PT_BASE_ADDR_MASK) >> PAGE_SHIFT; | ||
159 | kunmap_atomic(walker->table, KM_USER0); | ||
160 | paddr = safe_gpa_to_hpa(vcpu, table_gfn << PAGE_SHIFT); | ||
161 | walker->page = pfn_to_page(paddr >> PAGE_SHIFT); | ||
162 | walker->table = kmap_atomic(walker->page, KM_USER0); | ||
163 | --walker->level; | ||
164 | 		walker->table_gfn[walker->level - 1] = table_gfn; | ||

165 | pgprintk("%s: table_gfn[%d] %lx\n", __FUNCTION__, | ||
166 | walker->level - 1, table_gfn); | ||
167 | } | ||
168 | walker->pte = *ptep; | ||
169 | if (walker->page) | ||
170 | walker->ptep = NULL; | ||
171 | if (walker->table) | ||
172 | kunmap_atomic(walker->table, KM_USER0); | ||
173 | pgprintk("%s: pte %llx\n", __FUNCTION__, (u64)*ptep); | ||
174 | return 1; | ||
175 | |||
176 | not_present: | ||
177 | walker->error_code = 0; | ||
178 | goto err; | ||
179 | |||
180 | access_error: | ||
181 | walker->error_code = PFERR_PRESENT_MASK; | ||
182 | |||
183 | err: | ||
184 | if (write_fault) | ||
185 | walker->error_code |= PFERR_WRITE_MASK; | ||
186 | if (user_fault) | ||
187 | walker->error_code |= PFERR_USER_MASK; | ||
188 | if (fetch_fault) | ||
189 | walker->error_code |= PFERR_FETCH_MASK; | ||
190 | if (walker->table) | ||
191 | kunmap_atomic(walker->table, KM_USER0); | ||
192 | return 0; | ||
193 | } | ||
194 | |||
195 | static void FNAME(mark_pagetable_dirty)(struct kvm *kvm, | ||
196 | struct guest_walker *walker) | ||
197 | { | ||
198 | mark_page_dirty(kvm, walker->table_gfn[walker->level - 1]); | ||
199 | } | ||
200 | |||
201 | static void FNAME(set_pte_common)(struct kvm_vcpu *vcpu, | ||
202 | u64 *shadow_pte, | ||
203 | gpa_t gaddr, | ||
204 | pt_element_t gpte, | ||
205 | u64 access_bits, | ||
206 | int user_fault, | ||
207 | int write_fault, | ||
208 | int *ptwrite, | ||
209 | struct guest_walker *walker, | ||
210 | gfn_t gfn) | ||
211 | { | ||
212 | hpa_t paddr; | ||
213 | int dirty = gpte & PT_DIRTY_MASK; | ||
214 | u64 spte = *shadow_pte; | ||
215 | int was_rmapped = is_rmap_pte(spte); | ||
216 | |||
217 | pgprintk("%s: spte %llx gpte %llx access %llx write_fault %d" | ||
218 | " user_fault %d gfn %lx\n", | ||
219 | __FUNCTION__, spte, (u64)gpte, access_bits, | ||
220 | write_fault, user_fault, gfn); | ||
221 | |||
222 | if (write_fault && !dirty) { | ||
223 | pt_element_t *guest_ent, *tmp = NULL; | ||
224 | |||
225 | if (walker->ptep) | ||
226 | guest_ent = walker->ptep; | ||
227 | else { | ||
228 | tmp = kmap_atomic(walker->page, KM_USER0); | ||
229 | guest_ent = &tmp[walker->index]; | ||
230 | } | ||
231 | |||
232 | *guest_ent |= PT_DIRTY_MASK; | ||
233 | if (!walker->ptep) | ||
234 | kunmap_atomic(tmp, KM_USER0); | ||
235 | dirty = 1; | ||
236 | FNAME(mark_pagetable_dirty)(vcpu->kvm, walker); | ||
237 | } | ||
238 | |||
239 | spte |= PT_PRESENT_MASK | PT_ACCESSED_MASK | PT_DIRTY_MASK; | ||
240 | spte |= gpte & PT64_NX_MASK; | ||
241 | if (!dirty) | ||
242 | access_bits &= ~PT_WRITABLE_MASK; | ||
243 | |||
244 | paddr = gpa_to_hpa(vcpu, gaddr & PT64_BASE_ADDR_MASK); | ||
245 | |||
246 | spte |= PT_PRESENT_MASK; | ||
247 | if (access_bits & PT_USER_MASK) | ||
248 | spte |= PT_USER_MASK; | ||
249 | |||
250 | if (is_error_hpa(paddr)) { | ||
251 | spte |= gaddr; | ||
252 | spte |= PT_SHADOW_IO_MARK; | ||
253 | spte &= ~PT_PRESENT_MASK; | ||
254 | set_shadow_pte(shadow_pte, spte); | ||
255 | return; | ||
256 | } | ||
257 | |||
258 | spte |= paddr; | ||
259 | |||
260 | if ((access_bits & PT_WRITABLE_MASK) | ||
261 | || (write_fault && !is_write_protection(vcpu) && !user_fault)) { | ||
262 | struct kvm_mmu_page *shadow; | ||
263 | |||
264 | spte |= PT_WRITABLE_MASK; | ||
265 | if (user_fault) { | ||
266 | mmu_unshadow(vcpu, gfn); | ||
267 | goto unshadowed; | ||
268 | } | ||
269 | |||
270 | shadow = kvm_mmu_lookup_page(vcpu, gfn); | ||
271 | if (shadow) { | ||
272 | pgprintk("%s: found shadow page for %lx, marking ro\n", | ||
273 | __FUNCTION__, gfn); | ||
274 | access_bits &= ~PT_WRITABLE_MASK; | ||
275 | if (is_writeble_pte(spte)) { | ||
276 | spte &= ~PT_WRITABLE_MASK; | ||
277 | kvm_x86_ops->tlb_flush(vcpu); | ||
278 | } | ||
279 | if (write_fault) | ||
280 | *ptwrite = 1; | ||
281 | } | ||
282 | } | ||
283 | |||
284 | unshadowed: | ||
285 | |||
286 | if (access_bits & PT_WRITABLE_MASK) | ||
287 | mark_page_dirty(vcpu->kvm, gaddr >> PAGE_SHIFT); | ||
288 | |||
289 | set_shadow_pte(shadow_pte, spte); | ||
290 | page_header_update_slot(vcpu->kvm, shadow_pte, gaddr); | ||
291 | if (!was_rmapped) | ||
292 | rmap_add(vcpu, shadow_pte); | ||
293 | } | ||
294 | |||
295 | static void FNAME(set_pte)(struct kvm_vcpu *vcpu, pt_element_t gpte, | ||
296 | u64 *shadow_pte, u64 access_bits, | ||
297 | int user_fault, int write_fault, int *ptwrite, | ||
298 | struct guest_walker *walker, gfn_t gfn) | ||
299 | { | ||
300 | access_bits &= gpte; | ||
301 | FNAME(set_pte_common)(vcpu, shadow_pte, gpte & PT_BASE_ADDR_MASK, | ||
302 | gpte, access_bits, user_fault, write_fault, | ||
303 | ptwrite, walker, gfn); | ||
304 | } | ||
305 | |||
306 | static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, | ||
307 | u64 *spte, const void *pte, int bytes) | ||
308 | { | ||
309 | pt_element_t gpte; | ||
310 | |||
311 | if (bytes < sizeof(pt_element_t)) | ||
312 | return; | ||
313 | gpte = *(const pt_element_t *)pte; | ||
314 | if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) | ||
315 | return; | ||
316 | pgprintk("%s: gpte %llx spte %p\n", __FUNCTION__, (u64)gpte, spte); | ||
317 | FNAME(set_pte)(vcpu, gpte, spte, PT_USER_MASK | PT_WRITABLE_MASK, 0, | ||
318 | 0, NULL, NULL, | ||
319 | (gpte & PT_BASE_ADDR_MASK) >> PAGE_SHIFT); | ||
320 | } | ||
321 | |||
322 | static void FNAME(set_pde)(struct kvm_vcpu *vcpu, pt_element_t gpde, | ||
323 | u64 *shadow_pte, u64 access_bits, | ||
324 | int user_fault, int write_fault, int *ptwrite, | ||
325 | struct guest_walker *walker, gfn_t gfn) | ||
326 | { | ||
327 | gpa_t gaddr; | ||
328 | |||
329 | access_bits &= gpde; | ||
330 | gaddr = (gpa_t)gfn << PAGE_SHIFT; | ||
331 | if (PTTYPE == 32 && is_cpuid_PSE36()) | ||
332 | gaddr |= (gpde & PT32_DIR_PSE36_MASK) << | ||
333 | (32 - PT32_DIR_PSE36_SHIFT); | ||
334 | FNAME(set_pte_common)(vcpu, shadow_pte, gaddr, | ||
335 | gpde, access_bits, user_fault, write_fault, | ||
336 | ptwrite, walker, gfn); | ||
337 | } | ||
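
The PSE-36 fixup recovers the extra physical-address bits that a 32-bit large-page pde stores in its low half: shifting by (32 - PT32_DIR_PSE36_SHIFT) repositions them above bit 31. A sketch, assuming PT32_DIR_PSE36_SHIFT is 13 as defined in mmu.c:

        /* A 4 MB guest pde with bit 13 set contributes physical bit 32:
         * (gpde & PT32_DIR_PSE36_MASK) << (32 - 13) moves pde bit 13 to
         * gaddr bit 32, so guest frames above 4 GiB remain reachable on
         * PSE-36 capable processors. */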
338 | |||
339 | /* | ||
340 | * Fetch a shadow pte for a specific level in the paging hierarchy. | ||
341 | */ | ||
342 | static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, | ||
343 | struct guest_walker *walker, | ||
344 | int user_fault, int write_fault, int *ptwrite) | ||
345 | { | ||
346 | hpa_t shadow_addr; | ||
347 | int level; | ||
348 | u64 *shadow_ent; | ||
349 | u64 *prev_shadow_ent = NULL; | ||
350 | |||
351 | if (!is_present_pte(walker->pte)) | ||
352 | return NULL; | ||
353 | |||
354 | shadow_addr = vcpu->mmu.root_hpa; | ||
355 | level = vcpu->mmu.shadow_root_level; | ||
356 | if (level == PT32E_ROOT_LEVEL) { | ||
357 | shadow_addr = vcpu->mmu.pae_root[(addr >> 30) & 3]; | ||
358 | shadow_addr &= PT64_BASE_ADDR_MASK; | ||
359 | --level; | ||
360 | } | ||
361 | |||
362 | for (; ; level--) { | ||
363 | u32 index = SHADOW_PT_INDEX(addr, level); | ||
364 | struct kvm_mmu_page *shadow_page; | ||
365 | u64 shadow_pte; | ||
366 | int metaphysical; | ||
367 | gfn_t table_gfn; | ||
368 | unsigned hugepage_access = 0; | ||
369 | |||
370 | shadow_ent = ((u64 *)__va(shadow_addr)) + index; | ||
371 | if (is_present_pte(*shadow_ent) || is_io_pte(*shadow_ent)) { | ||
372 | if (level == PT_PAGE_TABLE_LEVEL) | ||
373 | break; | ||
374 | shadow_addr = *shadow_ent & PT64_BASE_ADDR_MASK; | ||
375 | prev_shadow_ent = shadow_ent; | ||
376 | continue; | ||
377 | } | ||
378 | |||
379 | if (level == PT_PAGE_TABLE_LEVEL) | ||
380 | break; | ||
381 | |||
382 | if (level - 1 == PT_PAGE_TABLE_LEVEL | ||
383 | && walker->level == PT_DIRECTORY_LEVEL) { | ||
384 | metaphysical = 1; | ||
385 | hugepage_access = walker->pte; | ||
386 | hugepage_access &= PT_USER_MASK | PT_WRITABLE_MASK; | ||
387 | if (walker->pte & PT64_NX_MASK) | ||
388 | hugepage_access |= (1 << 2); | ||
389 | hugepage_access >>= PT_WRITABLE_SHIFT; | ||
390 | table_gfn = (walker->pte & PT_BASE_ADDR_MASK) | ||
391 | >> PAGE_SHIFT; | ||
392 | } else { | ||
393 | metaphysical = 0; | ||
394 | table_gfn = walker->table_gfn[level - 2]; | ||
395 | } | ||
396 | shadow_page = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1, | ||
397 | metaphysical, hugepage_access, | ||
398 | shadow_ent); | ||
399 | shadow_addr = __pa(shadow_page->spt); | ||
400 | shadow_pte = shadow_addr | PT_PRESENT_MASK | PT_ACCESSED_MASK | ||
401 | | PT_WRITABLE_MASK | PT_USER_MASK; | ||
402 | *shadow_ent = shadow_pte; | ||
403 | prev_shadow_ent = shadow_ent; | ||
404 | } | ||
405 | |||
406 | if (walker->level == PT_DIRECTORY_LEVEL) { | ||
407 | FNAME(set_pde)(vcpu, walker->pte, shadow_ent, | ||
408 | walker->inherited_ar, user_fault, write_fault, | ||
409 | ptwrite, walker, walker->gfn); | ||
410 | } else { | ||
411 | ASSERT(walker->level == PT_PAGE_TABLE_LEVEL); | ||
412 | FNAME(set_pte)(vcpu, walker->pte, shadow_ent, | ||
413 | walker->inherited_ar, user_fault, write_fault, | ||
414 | ptwrite, walker, walker->gfn); | ||
415 | } | ||
416 | return shadow_ent; | ||
417 | } | ||
418 | |||
419 | /* | ||
420 | * Page fault handler. There are several causes for a page fault: | ||
421 | * - there is no shadow pte for the guest pte | ||
422 | * - write access through a shadow pte marked read only so that we can set | ||
423 | * the dirty bit | ||
424 | * - write access to a shadow pte marked read only so we can update the page | ||
425 | * dirty bitmap, when userspace requests it | ||
426 | * - mmio access; in this case we will never install a present shadow pte | ||
427 | * - normal guest page fault due to the guest pte marked not present, not | ||
428 | * writable, or not executable | ||
429 | * | ||
430 | * Returns: 1 if we need to emulate the instruction, 0 otherwise, or | ||
431 | * a negative value on error. | ||
432 | */ | ||
433 | static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, | ||
434 | u32 error_code) | ||
435 | { | ||
436 | int write_fault = error_code & PFERR_WRITE_MASK; | ||
437 | int user_fault = error_code & PFERR_USER_MASK; | ||
438 | int fetch_fault = error_code & PFERR_FETCH_MASK; | ||
439 | struct guest_walker walker; | ||
440 | u64 *shadow_pte; | ||
441 | int write_pt = 0; | ||
442 | int r; | ||
443 | |||
444 | pgprintk("%s: addr %lx err %x\n", __FUNCTION__, addr, error_code); | ||
445 | kvm_mmu_audit(vcpu, "pre page fault"); | ||
446 | |||
447 | r = mmu_topup_memory_caches(vcpu); | ||
448 | if (r) | ||
449 | return r; | ||
450 | |||
451 | /* | ||
452 | 	 * Walk the guest page tables for the faulting address. | ||
453 | */ | ||
454 | r = FNAME(walk_addr)(&walker, vcpu, addr, write_fault, user_fault, | ||
455 | fetch_fault); | ||
456 | |||
457 | /* | ||
458 | * The page is not mapped by the guest. Let the guest handle it. | ||
459 | */ | ||
460 | if (!r) { | ||
461 | pgprintk("%s: guest page fault\n", __FUNCTION__); | ||
462 | inject_page_fault(vcpu, addr, walker.error_code); | ||
463 | vcpu->last_pt_write_count = 0; /* reset fork detector */ | ||
464 | return 0; | ||
465 | } | ||
466 | |||
467 | shadow_pte = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault, | ||
468 | &write_pt); | ||
469 | pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __FUNCTION__, | ||
470 | shadow_pte, *shadow_pte, write_pt); | ||
471 | |||
472 | if (!write_pt) | ||
473 | vcpu->last_pt_write_count = 0; /* reset fork detector */ | ||
474 | |||
475 | /* | ||
476 | 	 * mmio: emulate if accessible, otherwise it's a guest fault. | ||
477 | */ | ||
478 | if (is_io_pte(*shadow_pte)) | ||
479 | return 1; | ||
480 | |||
481 | ++vcpu->stat.pf_fixed; | ||
482 | kvm_mmu_audit(vcpu, "post page fault (fixed)"); | ||
483 | |||
484 | return write_pt; | ||
485 | } | ||
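
A sketch of how a caller might consume the three-way return value described in the comment above (illustrative only; `cr2` is a hypothetical local and the real dispatch lives in the arch code):

        r = vcpu->mmu.page_fault(vcpu, cr2, error_code);
        if (r < 0)
                return r;       /* e.g. -ENOMEM from the memory caches */
        if (r == 0)
                return 0;       /* fixed up, or injected into the guest */
        /* r == 1: mmio or a write to a shadowed page table; emulate */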
486 | |||
487 | static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr) | ||
488 | { | ||
489 | struct guest_walker walker; | ||
490 | gpa_t gpa = UNMAPPED_GVA; | ||
491 | int r; | ||
492 | |||
493 | r = FNAME(walk_addr)(&walker, vcpu, vaddr, 0, 0, 0); | ||
494 | |||
495 | if (r) { | ||
496 | gpa = (gpa_t)walker.gfn << PAGE_SHIFT; | ||
497 | gpa |= vaddr & ~PAGE_MASK; | ||
498 | } | ||
499 | |||
500 | return gpa; | ||
501 | } | ||
502 | |||
503 | #undef pt_element_t | ||
504 | #undef guest_walker | ||
505 | #undef FNAME | ||
506 | #undef PT_BASE_ADDR_MASK | ||
507 | #undef PT_INDEX | ||
508 | #undef SHADOW_PT_INDEX | ||
509 | #undef PT_LEVEL_MASK | ||
510 | #undef PT_DIR_BASE_ADDR_MASK | ||
511 | #undef PT_MAX_FULL_LEVELS | ||
diff --git a/drivers/kvm/x86_emulate.c b/drivers/kvm/x86_emulate.c deleted file mode 100644 index bd46de6bf891..000000000000 --- a/drivers/kvm/x86_emulate.c +++ /dev/null | |||
@@ -1,1662 +0,0 @@ | |||
1 | /****************************************************************************** | ||
2 | * x86_emulate.c | ||
3 | * | ||
4 | * Generic x86 (32-bit and 64-bit) instruction decoder and emulator. | ||
5 | * | ||
6 | * Copyright (c) 2005 Keir Fraser | ||
7 | * | ||
8 | * Linux coding style, mod r/m decoder, segment base fixes, real-mode | ||
9 | * privileged instructions: | ||
10 | * | ||
11 | * Copyright (C) 2006 Qumranet | ||
12 | * | ||
13 | * Avi Kivity <avi@qumranet.com> | ||
14 | * Yaniv Kamay <yaniv@qumranet.com> | ||
15 | * | ||
16 | * This work is licensed under the terms of the GNU GPL, version 2. See | ||
17 | * the COPYING file in the top-level directory. | ||
18 | * | ||
19 | * From: xen-unstable 10676:af9809f51f81a3c43f276f00c81a52ef558afda4 | ||
20 | */ | ||
21 | |||
22 | #ifndef __KERNEL__ | ||
23 | #include <stdio.h> | ||
24 | #include <stdint.h> | ||
25 | #include <public/xen.h> | ||
26 | #define DPRINTF(_f, _a ...) printf( _f , ## _a ) | ||
27 | #else | ||
28 | #include "kvm.h" | ||
29 | #define DPRINTF(x...) do {} while (0) | ||
30 | #endif | ||
31 | #include "x86_emulate.h" | ||
32 | #include <linux/module.h> | ||
33 | |||
34 | /* | ||
35 | * Opcode effective-address decode tables. | ||
36 | * Note that we only emulate instructions that have at least one memory | ||
37 | * operand (excluding implicit stack references). We assume that stack | ||
38 | * references and instruction fetches will never occur in special memory | ||
39 | * areas that require emulation. So, for example, 'mov <imm>,<reg>' need | ||
40 | * not be handled. | ||
41 | */ | ||
42 | |||
43 | /* Operand sizes: 8-bit operands or specified/overridden size. */ | ||
44 | #define ByteOp (1<<0) /* 8-bit operands. */ | ||
45 | /* Destination operand type. */ | ||
46 | #define ImplicitOps (1<<1) /* Implicit in opcode. No generic decode. */ | ||
47 | #define DstReg (2<<1) /* Register operand. */ | ||
48 | #define DstMem (3<<1) /* Memory operand. */ | ||
49 | #define DstMask (3<<1) | ||
50 | /* Source operand type. */ | ||
51 | #define SrcNone (0<<3) /* No source operand. */ | ||
52 | #define SrcImplicit (0<<3) /* Source operand is implicit in the opcode. */ | ||
53 | #define SrcReg (1<<3) /* Register operand. */ | ||
54 | #define SrcMem (2<<3) /* Memory operand. */ | ||
55 | #define SrcMem16 (3<<3) /* Memory operand (16-bit). */ | ||
56 | #define SrcMem32 (4<<3) /* Memory operand (32-bit). */ | ||
57 | #define SrcImm (5<<3) /* Immediate operand. */ | ||
58 | #define SrcImmByte (6<<3) /* 8-bit sign-extended immediate operand. */ | ||
59 | #define SrcMask (7<<3) | ||
60 | /* Generic ModRM decode. */ | ||
61 | #define ModRM (1<<6) | ||
62 | /* Destination is only written; never read. */ | ||
63 | #define Mov (1<<7) | ||
64 | #define BitOp (1<<8) | ||
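
These flags pack an opcode's whole operand-decode recipe into one small integer. Decoding one entry of the table below makes the encoding concrete: opcode 0x88 (mov r/m8, reg8) is listed as ByteOp | DstMem | SrcReg | ModRM | Mov, i.e.:

        /* ByteOp (1<<0)  -> 8-bit operands
         * DstMem (3<<1)  -> destination comes from the ModRM r/m field
         * SrcReg (1<<3)  -> source comes from the ModRM reg field
         * ModRM  (1<<6)  -> a ModRM byte follows the opcode
         * Mov    (1<<7)  -> destination is written only, never read back
         * total: 0x01 | 0x06 | 0x08 | 0x40 | 0x80 == 0xcf
         */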
65 | |||
66 | static u8 opcode_table[256] = { | ||
67 | /* 0x00 - 0x07 */ | ||
68 | ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, | ||
69 | ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, | ||
70 | 0, 0, 0, 0, | ||
71 | /* 0x08 - 0x0F */ | ||
72 | ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, | ||
73 | ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, | ||
74 | 0, 0, 0, 0, | ||
75 | /* 0x10 - 0x17 */ | ||
76 | ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, | ||
77 | ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, | ||
78 | 0, 0, 0, 0, | ||
79 | /* 0x18 - 0x1F */ | ||
80 | ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, | ||
81 | ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, | ||
82 | 0, 0, 0, 0, | ||
83 | /* 0x20 - 0x27 */ | ||
84 | ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, | ||
85 | ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, | ||
86 | SrcImmByte, SrcImm, 0, 0, | ||
87 | /* 0x28 - 0x2F */ | ||
88 | ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, | ||
89 | ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, | ||
90 | 0, 0, 0, 0, | ||
91 | /* 0x30 - 0x37 */ | ||
92 | ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, | ||
93 | ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, | ||
94 | 0, 0, 0, 0, | ||
95 | /* 0x38 - 0x3F */ | ||
96 | ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, | ||
97 | ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, | ||
98 | 0, 0, 0, 0, | ||
99 | /* 0x40 - 0x4F */ | ||
100 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | ||
101 | /* 0x50 - 0x57 */ | ||
102 | ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, | ||
103 | ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, | ||
104 | /* 0x58 - 0x5F */ | ||
105 | ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, | ||
106 | ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, | ||
107 | /* 0x60 - 0x67 */ | ||
108 | 0, 0, 0, DstReg | SrcMem32 | ModRM | Mov /* movsxd (x86/64) */ , | ||
109 | 0, 0, 0, 0, | ||
110 | /* 0x68 - 0x6F */ | ||
111 | 0, 0, ImplicitOps|Mov, 0, | ||
112 | SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* insb, insw/insd */ | ||
113 | SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* outsb, outsw/outsd */ | ||
114 | /* 0x70 - 0x77 */ | ||
115 | ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, | ||
116 | ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, | ||
117 | /* 0x78 - 0x7F */ | ||
118 | ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, | ||
119 | ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, | ||
120 | /* 0x80 - 0x87 */ | ||
121 | ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM, | ||
122 | ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM, | ||
123 | ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, | ||
124 | ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, | ||
125 | /* 0x88 - 0x8F */ | ||
126 | ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov, | ||
127 | ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, | ||
128 | 0, ModRM | DstReg, 0, DstMem | SrcNone | ModRM | Mov, | ||
129 | /* 0x90 - 0x9F */ | ||
130 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps, ImplicitOps, 0, 0, | ||
131 | /* 0xA0 - 0xA7 */ | ||
132 | ByteOp | DstReg | SrcMem | Mov, DstReg | SrcMem | Mov, | ||
133 | ByteOp | DstMem | SrcReg | Mov, DstMem | SrcReg | Mov, | ||
134 | ByteOp | ImplicitOps | Mov, ImplicitOps | Mov, | ||
135 | ByteOp | ImplicitOps, ImplicitOps, | ||
136 | /* 0xA8 - 0xAF */ | ||
137 | 0, 0, ByteOp | ImplicitOps | Mov, ImplicitOps | Mov, | ||
138 | ByteOp | ImplicitOps | Mov, ImplicitOps | Mov, | ||
139 | ByteOp | ImplicitOps, ImplicitOps, | ||
140 | /* 0xB0 - 0xBF */ | ||
141 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | ||
142 | /* 0xC0 - 0xC7 */ | ||
143 | ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM, | ||
144 | 0, ImplicitOps, 0, 0, | ||
145 | ByteOp | DstMem | SrcImm | ModRM | Mov, DstMem | SrcImm | ModRM | Mov, | ||
146 | /* 0xC8 - 0xCF */ | ||
147 | 0, 0, 0, 0, 0, 0, 0, 0, | ||
148 | /* 0xD0 - 0xD7 */ | ||
149 | ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM, | ||
150 | ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM, | ||
151 | 0, 0, 0, 0, | ||
152 | /* 0xD8 - 0xDF */ | ||
153 | 0, 0, 0, 0, 0, 0, 0, 0, | ||
154 | /* 0xE0 - 0xE7 */ | ||
155 | 0, 0, 0, 0, 0, 0, 0, 0, | ||
156 | /* 0xE8 - 0xEF */ | ||
157 | ImplicitOps, SrcImm|ImplicitOps, 0, SrcImmByte|ImplicitOps, 0, 0, 0, 0, | ||
158 | /* 0xF0 - 0xF7 */ | ||
159 | 0, 0, 0, 0, | ||
160 | ImplicitOps, 0, | ||
161 | ByteOp | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM, | ||
162 | /* 0xF8 - 0xFF */ | ||
163 | 0, 0, 0, 0, | ||
164 | 0, 0, ByteOp | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM | ||
165 | }; | ||
166 | |||
167 | static u16 twobyte_table[256] = { | ||
168 | /* 0x00 - 0x0F */ | ||
169 | 0, SrcMem | ModRM | DstReg, 0, 0, 0, 0, ImplicitOps, 0, | ||
170 | ImplicitOps, ImplicitOps, 0, 0, 0, ImplicitOps | ModRM, 0, 0, | ||
171 | /* 0x10 - 0x1F */ | ||
172 | 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, 0, | ||
173 | /* 0x20 - 0x2F */ | ||
174 | ModRM | ImplicitOps, ModRM, ModRM | ImplicitOps, ModRM, 0, 0, 0, 0, | ||
175 | 0, 0, 0, 0, 0, 0, 0, 0, | ||
176 | /* 0x30 - 0x3F */ | ||
177 | ImplicitOps, 0, ImplicitOps, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | ||
178 | /* 0x40 - 0x47 */ | ||
179 | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, | ||
180 | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, | ||
181 | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, | ||
182 | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, | ||
183 | /* 0x48 - 0x4F */ | ||
184 | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, | ||
185 | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, | ||
186 | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, | ||
187 | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, | ||
188 | /* 0x50 - 0x5F */ | ||
189 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | ||
190 | /* 0x60 - 0x6F */ | ||
191 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | ||
192 | /* 0x70 - 0x7F */ | ||
193 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | ||
194 | /* 0x80 - 0x8F */ | ||
195 | ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, | ||
196 | ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, | ||
197 | ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, | ||
198 | ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, | ||
199 | /* 0x90 - 0x9F */ | ||
200 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | ||
201 | /* 0xA0 - 0xA7 */ | ||
202 | 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 0, 0, 0, 0, | ||
203 | /* 0xA8 - 0xAF */ | ||
204 | 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 0, 0, 0, 0, | ||
205 | /* 0xB0 - 0xB7 */ | ||
206 | ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 0, | ||
207 | DstMem | SrcReg | ModRM | BitOp, | ||
208 | 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov, | ||
209 | DstReg | SrcMem16 | ModRM | Mov, | ||
210 | /* 0xB8 - 0xBF */ | ||
211 | 0, 0, DstMem | SrcImmByte | ModRM, DstMem | SrcReg | ModRM | BitOp, | ||
212 | 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov, | ||
213 | DstReg | SrcMem16 | ModRM | Mov, | ||
214 | /* 0xC0 - 0xCF */ | ||
215 | 0, 0, 0, DstMem | SrcReg | ModRM | Mov, 0, 0, 0, ImplicitOps | ModRM, | ||
216 | 0, 0, 0, 0, 0, 0, 0, 0, | ||
217 | /* 0xD0 - 0xDF */ | ||
218 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | ||
219 | /* 0xE0 - 0xEF */ | ||
220 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | ||
221 | /* 0xF0 - 0xFF */ | ||
222 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 | ||
223 | }; | ||
224 | |||
225 | /* Type, address-of, and value of an instruction's operand. */ | ||
226 | struct operand { | ||
227 | enum { OP_REG, OP_MEM, OP_IMM } type; | ||
228 | unsigned int bytes; | ||
229 | unsigned long val, orig_val, *ptr; | ||
230 | }; | ||
231 | |||
232 | /* EFLAGS bit definitions. */ | ||
233 | #define EFLG_OF (1<<11) | ||
234 | #define EFLG_DF (1<<10) | ||
235 | #define EFLG_SF (1<<7) | ||
236 | #define EFLG_ZF (1<<6) | ||
237 | #define EFLG_AF (1<<4) | ||
238 | #define EFLG_PF (1<<2) | ||
239 | #define EFLG_CF (1<<0) | ||
240 | |||
241 | /* | ||
242 | * Instruction emulation: | ||
243 | * Most instructions are emulated directly via a fragment of inline assembly | ||
244 | * code. This allows us to save/restore EFLAGS and thus very easily pick up | ||
245 | * any modified flags. | ||
246 | */ | ||
247 | |||
248 | #if defined(CONFIG_X86_64) | ||
249 | #define _LO32 "k" /* force 32-bit operand */ | ||
250 | #define _STK "%%rsp" /* stack pointer */ | ||
251 | #elif defined(__i386__) | ||
252 | #define _LO32 "" /* force 32-bit operand */ | ||
253 | #define _STK "%%esp" /* stack pointer */ | ||
254 | #endif | ||
255 | |||
256 | /* | ||
257 |  * These EFLAGS bits are restored from the saved value during emulation, and | ||
258 | * any changes are written back to the saved value after emulation. | ||
259 | */ | ||
260 | #define EFLAGS_MASK (EFLG_OF|EFLG_SF|EFLG_ZF|EFLG_AF|EFLG_PF|EFLG_CF) | ||
261 | |||
262 | /* Before executing instruction: restore necessary bits in EFLAGS. */ | ||
263 | #define _PRE_EFLAGS(_sav, _msk, _tmp) \ | ||
264 | /* EFLAGS = (_sav & _msk) | (EFLAGS & ~_msk); */ \ | ||
265 | "push %"_sav"; " \ | ||
266 | "movl %"_msk",%"_LO32 _tmp"; " \ | ||
267 | "andl %"_LO32 _tmp",("_STK"); " \ | ||
268 | "pushf; " \ | ||
269 | "notl %"_LO32 _tmp"; " \ | ||
270 | "andl %"_LO32 _tmp",("_STK"); " \ | ||
271 | "pop %"_tmp"; " \ | ||
272 | "orl %"_LO32 _tmp",("_STK"); " \ | ||
273 | "popf; " \ | ||
274 | /* _sav &= ~msk; */ \ | ||
275 | "movl %"_msk",%"_LO32 _tmp"; " \ | ||
276 | "notl %"_LO32 _tmp"; " \ | ||
277 | "andl %"_LO32 _tmp",%"_sav"; " | ||
278 | |||
279 | /* After executing instruction: write-back necessary bits in EFLAGS. */ | ||
280 | #define _POST_EFLAGS(_sav, _msk, _tmp) \ | ||
281 | /* _sav |= EFLAGS & _msk; */ \ | ||
282 | "pushf; " \ | ||
283 | "pop %"_tmp"; " \ | ||
284 | "andl %"_msk",%"_LO32 _tmp"; " \ | ||
285 | "orl %"_LO32 _tmp",%"_sav"; " | ||
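
In C terms, the two macros bracket the emulated instruction like this (a sketch of the assembly's effect, not a drop-in replacement):

        /* _PRE_EFLAGS:  eflags = (saved & mask) | (eflags & ~mask);
         *               saved &= ~mask;
         *   ...the emulated "add"/"or"/etc. runs and updates EFLAGS...
         * _POST_EFLAGS: saved |= eflags & mask;
         *
         * so only the arithmetic flags in EFLAGS_MASK leak in either
         * direction between host and emulated guest state. */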
286 | |||
287 | /* Raw emulation: instruction has two explicit operands. */ | ||
288 | #define __emulate_2op_nobyte(_op,_src,_dst,_eflags,_wx,_wy,_lx,_ly,_qx,_qy) \ | ||
289 | do { \ | ||
290 | unsigned long _tmp; \ | ||
291 | \ | ||
292 | switch ((_dst).bytes) { \ | ||
293 | case 2: \ | ||
294 | __asm__ __volatile__ ( \ | ||
295 | _PRE_EFLAGS("0","4","2") \ | ||
296 | _op"w %"_wx"3,%1; " \ | ||
297 | _POST_EFLAGS("0","4","2") \ | ||
298 | : "=m" (_eflags), "=m" ((_dst).val), \ | ||
299 | "=&r" (_tmp) \ | ||
300 | : _wy ((_src).val), "i" (EFLAGS_MASK) ); \ | ||
301 | break; \ | ||
302 | case 4: \ | ||
303 | __asm__ __volatile__ ( \ | ||
304 | _PRE_EFLAGS("0","4","2") \ | ||
305 | _op"l %"_lx"3,%1; " \ | ||
306 | _POST_EFLAGS("0","4","2") \ | ||
307 | : "=m" (_eflags), "=m" ((_dst).val), \ | ||
308 | "=&r" (_tmp) \ | ||
309 | : _ly ((_src).val), "i" (EFLAGS_MASK) ); \ | ||
310 | break; \ | ||
311 | case 8: \ | ||
312 | __emulate_2op_8byte(_op, _src, _dst, \ | ||
313 | _eflags, _qx, _qy); \ | ||
314 | break; \ | ||
315 | } \ | ||
316 | } while (0) | ||
317 | |||
318 | #define __emulate_2op(_op,_src,_dst,_eflags,_bx,_by,_wx,_wy,_lx,_ly,_qx,_qy) \ | ||
319 | do { \ | ||
320 | unsigned long _tmp; \ | ||
321 | switch ( (_dst).bytes ) \ | ||
322 | { \ | ||
323 | case 1: \ | ||
324 | __asm__ __volatile__ ( \ | ||
325 | _PRE_EFLAGS("0","4","2") \ | ||
326 | _op"b %"_bx"3,%1; " \ | ||
327 | _POST_EFLAGS("0","4","2") \ | ||
328 | : "=m" (_eflags), "=m" ((_dst).val), \ | ||
329 | "=&r" (_tmp) \ | ||
330 | : _by ((_src).val), "i" (EFLAGS_MASK) ); \ | ||
331 | break; \ | ||
332 | default: \ | ||
333 | __emulate_2op_nobyte(_op, _src, _dst, _eflags, \ | ||
334 | _wx, _wy, _lx, _ly, _qx, _qy); \ | ||
335 | break; \ | ||
336 | } \ | ||
337 | } while (0) | ||
338 | |||
339 | /* Source operand is byte-sized and may be restricted to just %cl. */ | ||
340 | #define emulate_2op_SrcB(_op, _src, _dst, _eflags) \ | ||
341 | __emulate_2op(_op, _src, _dst, _eflags, \ | ||
342 | "b", "c", "b", "c", "b", "c", "b", "c") | ||
343 | |||
344 | /* Source operand is byte, word, long or quad sized. */ | ||
345 | #define emulate_2op_SrcV(_op, _src, _dst, _eflags) \ | ||
346 | __emulate_2op(_op, _src, _dst, _eflags, \ | ||
347 | "b", "q", "w", "r", _LO32, "r", "", "r") | ||
348 | |||
349 | /* Source operand is word, long or quad sized. */ | ||
350 | #define emulate_2op_SrcV_nobyte(_op, _src, _dst, _eflags) \ | ||
351 | __emulate_2op_nobyte(_op, _src, _dst, _eflags, \ | ||
352 | "w", "r", _LO32, "r", "", "r") | ||
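/*
 * Note: the paired strings handed down to __emulate_2op() are gcc inline
 * asm operand modifiers and register constraints, chosen per operand
 * width. In emulate_2op_SrcB, for instance, the "c" constraint pins the
 * source operand (a shift/rotate count) to %cl, as the hardware encodings
 * of those instructions require.
 */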
353 | |||
354 | /* Instruction has only one explicit operand (no source operand). */ | ||
355 | #define emulate_1op(_op, _dst, _eflags) \ | ||
356 | do { \ | ||
357 | unsigned long _tmp; \ | ||
358 | \ | ||
359 | switch ( (_dst).bytes ) \ | ||
360 | { \ | ||
361 | case 1: \ | ||
362 | __asm__ __volatile__ ( \ | ||
363 | _PRE_EFLAGS("0","3","2") \ | ||
364 | _op"b %1; " \ | ||
365 | _POST_EFLAGS("0","3","2") \ | ||
366 | : "=m" (_eflags), "=m" ((_dst).val), \ | ||
367 | "=&r" (_tmp) \ | ||
368 | : "i" (EFLAGS_MASK) ); \ | ||
369 | break; \ | ||
370 | case 2: \ | ||
371 | __asm__ __volatile__ ( \ | ||
372 | _PRE_EFLAGS("0","3","2") \ | ||
373 | _op"w %1; " \ | ||
374 | _POST_EFLAGS("0","3","2") \ | ||
375 | : "=m" (_eflags), "=m" ((_dst).val), \ | ||
376 | "=&r" (_tmp) \ | ||
377 | : "i" (EFLAGS_MASK) ); \ | ||
378 | break; \ | ||
379 | case 4: \ | ||
380 | __asm__ __volatile__ ( \ | ||
381 | _PRE_EFLAGS("0","3","2") \ | ||
382 | _op"l %1; " \ | ||
383 | _POST_EFLAGS("0","3","2") \ | ||
384 | : "=m" (_eflags), "=m" ((_dst).val), \ | ||
385 | "=&r" (_tmp) \ | ||
386 | : "i" (EFLAGS_MASK) ); \ | ||
387 | break; \ | ||
388 | case 8: \ | ||
389 | __emulate_1op_8byte(_op, _dst, _eflags); \ | ||
390 | break; \ | ||
391 | } \ | ||
392 | } while (0) | ||
393 | |||
394 | /* Emulate an instruction with quadword operands (x86/64 only). */ | ||
395 | #if defined(CONFIG_X86_64) | ||
396 | #define __emulate_2op_8byte(_op, _src, _dst, _eflags, _qx, _qy) \ | ||
397 | do { \ | ||
398 | __asm__ __volatile__ ( \ | ||
399 | _PRE_EFLAGS("0","4","2") \ | ||
400 | _op"q %"_qx"3,%1; " \ | ||
401 | _POST_EFLAGS("0","4","2") \ | ||
402 | : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \ | ||
403 | : _qy ((_src).val), "i" (EFLAGS_MASK) ); \ | ||
404 | } while (0) | ||
405 | |||
406 | #define __emulate_1op_8byte(_op, _dst, _eflags) \ | ||
407 | do { \ | ||
408 | __asm__ __volatile__ ( \ | ||
409 | _PRE_EFLAGS("0","3","2") \ | ||
410 | _op"q %1; " \ | ||
411 | _POST_EFLAGS("0","3","2") \ | ||
412 | : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \ | ||
413 | : "i" (EFLAGS_MASK) ); \ | ||
414 | } while (0) | ||
415 | |||
416 | #elif defined(__i386__) | ||
417 | #define __emulate_2op_8byte(_op, _src, _dst, _eflags, _qx, _qy) | ||
418 | #define __emulate_1op_8byte(_op, _dst, _eflags) | ||
419 | #endif /* __i386__ */ | ||
420 | |||
421 | /* Fetch next part of the instruction being emulated. */ | ||
422 | #define insn_fetch(_type, _size, _eip) \ | ||
423 | ({ unsigned long _x; \ | ||
424 | rc = ops->read_std((unsigned long)(_eip) + ctxt->cs_base, &_x, \ | ||
425 | (_size), ctxt->vcpu); \ | ||
426 | if ( rc != 0 ) \ | ||
427 | goto done; \ | ||
428 | (_eip) += (_size); \ | ||
429 | (_type)_x; \ | ||
430 | }) | ||
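/*
 * insn_fetch() expects rc, ops, ctxt and a "done" label to be in scope in
 * the enclosing function. Typical uses, as seen in x86_emulate_memop()
 * below:
 *
 *	b = insn_fetch(u8, 1, _eip);		opcode byte
 *	disp = insn_fetch(s8, 1, _eip);		sign-extended disp8
 */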
431 | |||
432 | /* Access/update address held in a register, based on addressing mode. */ | ||
433 | #define address_mask(reg) \ | ||
434 | ((ad_bytes == sizeof(unsigned long)) ? \ | ||
435 | (reg) : ((reg) & ((1UL << (ad_bytes << 3)) - 1))) | ||
436 | #define register_address(base, reg) \ | ||
437 | ((base) + address_mask(reg)) | ||
438 | #define register_address_increment(reg, inc) \ | ||
439 | do { \ | ||
440 | /* signed type ensures sign extension to long */ \ | ||
441 | int _inc = (inc); \ | ||
442 | if ( ad_bytes == sizeof(unsigned long) ) \ | ||
443 | (reg) += _inc; \ | ||
444 | else \ | ||
445 | (reg) = ((reg) & ~((1UL << (ad_bytes << 3)) - 1)) | \ | ||
446 | (((reg) + _inc) & ((1UL << (ad_bytes << 3)) - 1)); \ | ||
447 | } while (0) | ||
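/*
 * Worked example: with ad_bytes == 2 and reg == 0x0001ffff,
 * register_address_increment(reg, 1) wraps only the low 16 bits,
 * yielding 0x00010000; bits above the current address size survive.
 */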
448 | |||
449 | #define JMP_REL(rel) \ | ||
450 | do { \ | ||
451 | register_address_increment(_eip, rel); \ | ||
452 | } while (0) | ||
453 | |||
454 | /* | ||
455 | * Given the 'reg' portion of a ModRM byte and a register block, return a | ||
456 | * pointer into the block that addresses the relevant register. | ||
457 | * @highbyte_regs specifies whether to decode AH, CH, DH and BH. | ||
458 | */ | ||
459 | static void *decode_register(u8 modrm_reg, unsigned long *regs, | ||
460 | int highbyte_regs) | ||
461 | { | ||
462 | void *p; | ||
463 | |||
464 | p = ®s[modrm_reg]; | ||
465 | if (highbyte_regs && modrm_reg >= 4 && modrm_reg < 8) | ||
466 | p = (unsigned char *)®s[modrm_reg & 3] + 1; | ||
467 | return p; | ||
468 | } | ||
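/*
 * Illustrative: without a REX prefix, reg encodings 4-7 name the legacy
 * high-byte registers, so decode_register(4, _regs, 1) points at byte 1
 * of the RAX slot (AH on a little-endian host); with any REX prefix
 * present, decode_register(4, _regs, 0) instead yields the low byte of
 * the RSP slot (SPL).
 */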
469 | |||
470 | static int read_descriptor(struct x86_emulate_ctxt *ctxt, | ||
471 | struct x86_emulate_ops *ops, | ||
472 | void *ptr, | ||
473 | u16 *size, unsigned long *address, int op_bytes) | ||
474 | { | ||
475 | int rc; | ||
476 | |||
477 | if (op_bytes == 2) | ||
478 | op_bytes = 3; /* 16-bit operand implies a 24-bit descriptor base */ | ||
479 | *address = 0; | ||
480 | rc = ops->read_std((unsigned long)ptr, (unsigned long *)size, 2, | ||
481 | ctxt->vcpu); | ||
482 | if (rc) | ||
483 | return rc; | ||
484 | rc = ops->read_std((unsigned long)ptr + 2, address, op_bytes, | ||
485 | ctxt->vcpu); | ||
486 | return rc; | ||
487 | } | ||
488 | |||
489 | static int test_cc(unsigned int condition, unsigned int flags) | ||
490 | { | ||
491 | int rc = 0; | ||
492 | |||
493 | switch ((condition & 15) >> 1) { | ||
494 | case 0: /* o */ | ||
495 | rc |= (flags & EFLG_OF); | ||
496 | break; | ||
497 | case 1: /* b/c/nae */ | ||
498 | rc |= (flags & EFLG_CF); | ||
499 | break; | ||
500 | case 2: /* z/e */ | ||
501 | rc |= (flags & EFLG_ZF); | ||
502 | break; | ||
503 | case 3: /* be/na */ | ||
504 | rc |= (flags & (EFLG_CF|EFLG_ZF)); | ||
505 | break; | ||
506 | case 4: /* s */ | ||
507 | rc |= (flags & EFLG_SF); | ||
508 | break; | ||
509 | case 5: /* p/pe */ | ||
510 | rc |= (flags & EFLG_PF); | ||
511 | break; | ||
512 | case 7: /* le/ng */ | ||
513 | rc |= (flags & EFLG_ZF); | ||
514 | /* fall through */ | ||
515 | case 6: /* l/nge */ | ||
516 | rc |= (!(flags & EFLG_SF) != !(flags & EFLG_OF)); | ||
517 | break; | ||
518 | } | ||
519 | |||
520 | /* Odd condition identifiers (lsb == 1) have inverted sense. */ | ||
521 | return (!!rc ^ (condition & 1)); | ||
522 | } | ||
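/*
 * Example: jz is opcode 0x74, so (0x74 & 15) >> 1 == 2 selects the ZF
 * test and the clear low bit keeps the sense: test_cc(0x74, EFLG_ZF)
 * returns 1. jnz (0x75) differs only in the low bit, which inverts the
 * result: test_cc(0x75, EFLG_ZF) returns 0.
 */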
523 | |||
524 | int | ||
525 | x86_emulate_memop(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) | ||
526 | { | ||
527 | unsigned d; | ||
528 | u8 b, sib, twobyte = 0, rex_prefix = 0; | ||
529 | u8 modrm, modrm_mod = 0, modrm_reg = 0, modrm_rm = 0; | ||
530 | unsigned long *override_base = NULL; | ||
531 | unsigned int op_bytes, ad_bytes, lock_prefix = 0, rep_prefix = 0, i; | ||
532 | int rc = 0; | ||
533 | struct operand src, dst; | ||
534 | unsigned long cr2 = ctxt->cr2; | ||
535 | int mode = ctxt->mode; | ||
536 | unsigned long modrm_ea; | ||
537 | int use_modrm_ea, index_reg = 0, base_reg = 0, scale, rip_relative = 0; | ||
538 | int no_wb = 0; | ||
539 | u64 msr_data; | ||
540 | |||
541 | /* Shadow copy of register state. Committed on successful emulation. */ | ||
542 | unsigned long _regs[NR_VCPU_REGS]; | ||
543 | unsigned long _eip = ctxt->vcpu->rip, _eflags = ctxt->eflags; | ||
544 | unsigned long modrm_val = 0; | ||
545 | |||
546 | memcpy(_regs, ctxt->vcpu->regs, sizeof _regs); | ||
547 | |||
548 | switch (mode) { | ||
549 | case X86EMUL_MODE_REAL: | ||
550 | case X86EMUL_MODE_PROT16: | ||
551 | op_bytes = ad_bytes = 2; | ||
552 | break; | ||
553 | case X86EMUL_MODE_PROT32: | ||
554 | op_bytes = ad_bytes = 4; | ||
555 | break; | ||
556 | #ifdef CONFIG_X86_64 | ||
557 | case X86EMUL_MODE_PROT64: | ||
558 | op_bytes = 4; | ||
559 | ad_bytes = 8; | ||
560 | break; | ||
561 | #endif | ||
562 | default: | ||
563 | return -1; | ||
564 | } | ||
565 | |||
566 | /* Legacy prefixes. */ | ||
567 | for (i = 0; i < 8; i++) { | ||
568 | switch (b = insn_fetch(u8, 1, _eip)) { | ||
569 | case 0x66: /* operand-size override */ | ||
570 | op_bytes ^= 6; /* switch between 2/4 bytes */ | ||
571 | break; | ||
572 | case 0x67: /* address-size override */ | ||
573 | if (mode == X86EMUL_MODE_PROT64) | ||
574 | ad_bytes ^= 12; /* switch between 4/8 bytes */ | ||
575 | else | ||
576 | ad_bytes ^= 6; /* switch between 2/4 bytes */ | ||
577 | break; | ||
578 | case 0x2e: /* CS override */ | ||
579 | override_base = &ctxt->cs_base; | ||
580 | break; | ||
581 | case 0x3e: /* DS override */ | ||
582 | override_base = &ctxt->ds_base; | ||
583 | break; | ||
584 | case 0x26: /* ES override */ | ||
585 | override_base = &ctxt->es_base; | ||
586 | break; | ||
587 | case 0x64: /* FS override */ | ||
588 | override_base = &ctxt->fs_base; | ||
589 | break; | ||
590 | case 0x65: /* GS override */ | ||
591 | override_base = &ctxt->gs_base; | ||
592 | break; | ||
593 | case 0x36: /* SS override */ | ||
594 | override_base = &ctxt->ss_base; | ||
595 | break; | ||
596 | case 0xf0: /* LOCK */ | ||
597 | lock_prefix = 1; | ||
598 | break; | ||
599 | case 0xf2: /* REPNE/REPNZ */ | ||
600 | case 0xf3: /* REP/REPE/REPZ */ | ||
601 | rep_prefix = 1; | ||
602 | break; | ||
603 | default: | ||
604 | goto done_prefixes; | ||
605 | } | ||
606 | } | ||
607 | |||
608 | done_prefixes: | ||
609 | |||
610 | /* REX prefix. */ | ||
611 | if ((mode == X86EMUL_MODE_PROT64) && ((b & 0xf0) == 0x40)) { | ||
612 | rex_prefix = b; | ||
613 | if (b & 8) | ||
614 | op_bytes = 8; /* REX.W */ | ||
615 | modrm_reg = (b & 4) << 1; /* REX.R */ | ||
616 | index_reg = (b & 2) << 2; /* REX.X */ | ||
617 | modrm_rm = base_reg = (b & 1) << 3; /* REX.B */ | ||
618 | b = insn_fetch(u8, 1, _eip); | ||
619 | } | ||
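	/*
	 * Example: REX.W alone (0x48) makes "48 89 c8" decode as
	 * "mov %rcx,%rax" with 64-bit operands; 0x4c adds REX.R,
	 * extending the ModRM reg field to reach %r8-%r15.
	 */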
620 | |||
621 | /* Opcode byte(s). */ | ||
622 | d = opcode_table[b]; | ||
623 | if (d == 0) { | ||
624 | /* Two-byte opcode? */ | ||
625 | if (b == 0x0f) { | ||
626 | twobyte = 1; | ||
627 | b = insn_fetch(u8, 1, _eip); | ||
628 | d = twobyte_table[b]; | ||
629 | } | ||
630 | |||
631 | /* Unrecognised? */ | ||
632 | if (d == 0) | ||
633 | goto cannot_emulate; | ||
634 | } | ||
635 | |||
636 | /* ModRM and SIB bytes. */ | ||
637 | if (d & ModRM) { | ||
638 | modrm = insn_fetch(u8, 1, _eip); | ||
639 | modrm_mod |= (modrm & 0xc0) >> 6; | ||
640 | modrm_reg |= (modrm & 0x38) >> 3; | ||
641 | modrm_rm |= (modrm & 0x07); | ||
642 | modrm_ea = 0; | ||
643 | use_modrm_ea = 1; | ||
644 | |||
645 | if (modrm_mod == 3) { | ||
646 | modrm_val = *(unsigned long *) | ||
647 | decode_register(modrm_rm, _regs, d & ByteOp); | ||
648 | goto modrm_done; | ||
649 | } | ||
650 | |||
651 | if (ad_bytes == 2) { | ||
652 | unsigned bx = _regs[VCPU_REGS_RBX]; | ||
653 | unsigned bp = _regs[VCPU_REGS_RBP]; | ||
654 | unsigned si = _regs[VCPU_REGS_RSI]; | ||
655 | unsigned di = _regs[VCPU_REGS_RDI]; | ||
656 | |||
657 | /* 16-bit ModR/M decode. */ | ||
658 | switch (modrm_mod) { | ||
659 | case 0: | ||
660 | if (modrm_rm == 6) | ||
661 | modrm_ea += insn_fetch(u16, 2, _eip); | ||
662 | break; | ||
663 | case 1: | ||
664 | modrm_ea += insn_fetch(s8, 1, _eip); | ||
665 | break; | ||
666 | case 2: | ||
667 | modrm_ea += insn_fetch(u16, 2, _eip); | ||
668 | break; | ||
669 | } | ||
670 | switch (modrm_rm) { | ||
671 | case 0: | ||
672 | modrm_ea += bx + si; | ||
673 | break; | ||
674 | case 1: | ||
675 | modrm_ea += bx + di; | ||
676 | break; | ||
677 | case 2: | ||
678 | modrm_ea += bp + si; | ||
679 | break; | ||
680 | case 3: | ||
681 | modrm_ea += bp + di; | ||
682 | break; | ||
683 | case 4: | ||
684 | modrm_ea += si; | ||
685 | break; | ||
686 | case 5: | ||
687 | modrm_ea += di; | ||
688 | break; | ||
689 | case 6: | ||
690 | if (modrm_mod != 0) | ||
691 | modrm_ea += bp; | ||
692 | break; | ||
693 | case 7: | ||
694 | modrm_ea += bx; | ||
695 | break; | ||
696 | } | ||
697 | if (modrm_rm == 2 || modrm_rm == 3 || | ||
698 | (modrm_rm == 6 && modrm_mod != 0)) | ||
699 | if (!override_base) | ||
700 | override_base = &ctxt->ss_base; | ||
701 | modrm_ea = (u16)modrm_ea; | ||
702 | } else { | ||
703 | /* 32/64-bit ModR/M decode. */ | ||
704 | switch (modrm_rm) { | ||
705 | case 4: | ||
706 | case 12: | ||
707 | sib = insn_fetch(u8, 1, _eip); | ||
708 | index_reg |= (sib >> 3) & 7; | ||
709 | base_reg |= sib & 7; | ||
710 | scale = sib >> 6; | ||
711 | |||
712 | switch (base_reg) { | ||
713 | case 5: | ||
714 | if (modrm_mod != 0) | ||
715 | modrm_ea += _regs[base_reg]; | ||
716 | else | ||
717 | modrm_ea += insn_fetch(s32, 4, _eip); | ||
718 | break; | ||
719 | default: | ||
720 | modrm_ea += _regs[base_reg]; | ||
721 | } | ||
722 | switch (index_reg) { | ||
723 | case 4: | ||
724 | break; | ||
725 | default: | ||
726 | modrm_ea += _regs[index_reg] << scale; | ||
727 | |||
728 | } | ||
729 | break; | ||
730 | case 5: | ||
731 | if (modrm_mod != 0) | ||
732 | modrm_ea += _regs[modrm_rm]; | ||
733 | else if (mode == X86EMUL_MODE_PROT64) | ||
734 | rip_relative = 1; | ||
735 | break; | ||
736 | default: | ||
737 | modrm_ea += _regs[modrm_rm]; | ||
738 | break; | ||
739 | } | ||
740 | switch (modrm_mod) { | ||
741 | case 0: | ||
742 | if (modrm_rm == 5) | ||
743 | modrm_ea += insn_fetch(s32, 4, _eip); | ||
744 | break; | ||
745 | case 1: | ||
746 | modrm_ea += insn_fetch(s8, 1, _eip); | ||
747 | break; | ||
748 | case 2: | ||
749 | modrm_ea += insn_fetch(s32, 4, _eip); | ||
750 | break; | ||
751 | } | ||
752 | } | ||
753 | if (!override_base) | ||
754 | override_base = &ctxt->ds_base; | ||
755 | if (mode == X86EMUL_MODE_PROT64 && | ||
756 | override_base != &ctxt->fs_base && | ||
757 | override_base != &ctxt->gs_base) | ||
758 | override_base = NULL; | ||
759 | |||
760 | if (override_base) | ||
761 | modrm_ea += *override_base; | ||
762 | |||
763 | if (rip_relative) { | ||
764 | modrm_ea += _eip; | ||
765 | switch (d & SrcMask) { | ||
766 | case SrcImmByte: | ||
767 | modrm_ea += 1; | ||
768 | break; | ||
769 | case SrcImm: | ||
770 | if (d & ByteOp) | ||
771 | modrm_ea += 1; | ||
772 | else | ||
773 | if (op_bytes == 8) | ||
774 | modrm_ea += 4; | ||
775 | else | ||
776 | modrm_ea += op_bytes; | ||
777 | } | ||
778 | } | ||
779 | if (ad_bytes != 8) | ||
780 | modrm_ea = (u32)modrm_ea; | ||
781 | cr2 = modrm_ea; | ||
782 | modrm_done: | ||
783 | ; | ||
784 | } | ||
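	/*
	 * Worked example of the 16-bit path: "mov ax,[bp+si+5]" is
	 * 8b 42 05, i.e. mod == 1, reg == 0, rm == 2, so modrm_ea becomes
	 * bp + si + 5 (sign-extended disp8), truncated to 16 bits, with
	 * SS as the default segment because the BP-based forms imply a
	 * stack reference unless an override prefix was seen.
	 */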
785 | |||
786 | /* | ||
787 | * Decode and fetch the source operand: register, memory | ||
788 | * or immediate. | ||
789 | */ | ||
790 | switch (d & SrcMask) { | ||
791 | case SrcNone: | ||
792 | break; | ||
793 | case SrcReg: | ||
794 | src.type = OP_REG; | ||
795 | if (d & ByteOp) { | ||
796 | src.ptr = decode_register(modrm_reg, _regs, | ||
797 | (rex_prefix == 0)); | ||
798 | src.val = src.orig_val = *(u8 *) src.ptr; | ||
799 | src.bytes = 1; | ||
800 | } else { | ||
801 | src.ptr = decode_register(modrm_reg, _regs, 0); | ||
802 | switch ((src.bytes = op_bytes)) { | ||
803 | case 2: | ||
804 | src.val = src.orig_val = *(u16 *) src.ptr; | ||
805 | break; | ||
806 | case 4: | ||
807 | src.val = src.orig_val = *(u32 *) src.ptr; | ||
808 | break; | ||
809 | case 8: | ||
810 | src.val = src.orig_val = *(u64 *) src.ptr; | ||
811 | break; | ||
812 | } | ||
813 | } | ||
814 | break; | ||
815 | case SrcMem16: | ||
816 | src.bytes = 2; | ||
817 | goto srcmem_common; | ||
818 | case SrcMem32: | ||
819 | src.bytes = 4; | ||
820 | goto srcmem_common; | ||
821 | case SrcMem: | ||
822 | src.bytes = (d & ByteOp) ? 1 : op_bytes; | ||
823 | /* Don't fetch the address for invlpg: it could be unmapped. */ | ||
824 | if (twobyte && b == 0x01 && modrm_reg == 7) | ||
825 | break; | ||
826 | srcmem_common: | ||
827 | /* | ||
828 | * For instructions with a ModR/M byte, switch to register | ||
829 | * access if Mod = 3. | ||
830 | */ | ||
831 | if ((d & ModRM) && modrm_mod == 3) { | ||
832 | src.type = OP_REG; | ||
833 | break; | ||
834 | } | ||
835 | src.type = OP_MEM; | ||
836 | src.ptr = (unsigned long *)cr2; | ||
837 | src.val = 0; | ||
838 | if ((rc = ops->read_emulated((unsigned long)src.ptr, | ||
839 | &src.val, src.bytes, ctxt->vcpu)) != 0) | ||
840 | goto done; | ||
841 | src.orig_val = src.val; | ||
842 | break; | ||
843 | case SrcImm: | ||
844 | src.type = OP_IMM; | ||
845 | src.ptr = (unsigned long *)_eip; | ||
846 | src.bytes = (d & ByteOp) ? 1 : op_bytes; | ||
847 | if (src.bytes == 8) | ||
848 | src.bytes = 4; | ||
849 | /* NB. Immediates are sign-extended as necessary. */ | ||
850 | switch (src.bytes) { | ||
851 | case 1: | ||
852 | src.val = insn_fetch(s8, 1, _eip); | ||
853 | break; | ||
854 | case 2: | ||
855 | src.val = insn_fetch(s16, 2, _eip); | ||
856 | break; | ||
857 | case 4: | ||
858 | src.val = insn_fetch(s32, 4, _eip); | ||
859 | break; | ||
860 | } | ||
861 | break; | ||
862 | case SrcImmByte: | ||
863 | src.type = OP_IMM; | ||
864 | src.ptr = (unsigned long *)_eip; | ||
865 | src.bytes = 1; | ||
866 | src.val = insn_fetch(s8, 1, _eip); | ||
867 | break; | ||
868 | } | ||
869 | |||
870 | /* Decode and fetch the destination operand: register or memory. */ | ||
871 | switch (d & DstMask) { | ||
872 | case ImplicitOps: | ||
873 | /* Special instructions do their own operand decoding. */ | ||
874 | goto special_insn; | ||
875 | case DstReg: | ||
876 | dst.type = OP_REG; | ||
877 | if ((d & ByteOp) | ||
878 | && !(twobyte && (b == 0xb6 || b == 0xb7))) { | ||
879 | dst.ptr = decode_register(modrm_reg, _regs, | ||
880 | (rex_prefix == 0)); | ||
881 | dst.val = *(u8 *) dst.ptr; | ||
882 | dst.bytes = 1; | ||
883 | } else { | ||
884 | dst.ptr = decode_register(modrm_reg, _regs, 0); | ||
885 | switch ((dst.bytes = op_bytes)) { | ||
886 | case 2: | ||
887 | dst.val = *(u16 *)dst.ptr; | ||
888 | break; | ||
889 | case 4: | ||
890 | dst.val = *(u32 *)dst.ptr; | ||
891 | break; | ||
892 | case 8: | ||
893 | dst.val = *(u64 *)dst.ptr; | ||
894 | break; | ||
895 | } | ||
896 | } | ||
897 | break; | ||
898 | case DstMem: | ||
899 | dst.type = OP_MEM; | ||
900 | dst.ptr = (unsigned long *)cr2; | ||
901 | dst.bytes = (d & ByteOp) ? 1 : op_bytes; | ||
902 | dst.val = 0; | ||
903 | /* | ||
904 | * For instructions with a ModR/M byte, switch to register | ||
905 | * access if Mod = 3. | ||
906 | */ | ||
907 | if ((d & ModRM) && modrm_mod == 3) { | ||
908 | dst.type = OP_REG; | ||
909 | break; | ||
910 | } | ||
911 | if (d & BitOp) { | ||
912 | unsigned long mask = ~(dst.bytes * 8 - 1); | ||
913 | |||
914 | dst.ptr = (void *)dst.ptr + (src.val & mask) / 8; | ||
915 | } | ||
916 | if (!(d & Mov) && /* optimisation - avoid slow emulated read */ | ||
917 | ((rc = ops->read_emulated((unsigned long)dst.ptr, | ||
918 | &dst.val, dst.bytes, ctxt->vcpu)) != 0)) | ||
919 | goto done; | ||
920 | break; | ||
921 | } | ||
922 | dst.orig_val = dst.val; | ||
923 | |||
924 | if (twobyte) | ||
925 | goto twobyte_insn; | ||
926 | |||
927 | switch (b) { | ||
928 | case 0x00 ... 0x05: | ||
929 | add: /* add */ | ||
930 | emulate_2op_SrcV("add", src, dst, _eflags); | ||
931 | break; | ||
932 | case 0x08 ... 0x0d: | ||
933 | or: /* or */ | ||
934 | emulate_2op_SrcV("or", src, dst, _eflags); | ||
935 | break; | ||
936 | case 0x10 ... 0x15: | ||
937 | adc: /* adc */ | ||
938 | emulate_2op_SrcV("adc", src, dst, _eflags); | ||
939 | break; | ||
940 | case 0x18 ... 0x1d: | ||
941 | sbb: /* sbb */ | ||
942 | emulate_2op_SrcV("sbb", src, dst, _eflags); | ||
943 | break; | ||
944 | case 0x20 ... 0x23: | ||
945 | and: /* and */ | ||
946 | emulate_2op_SrcV("and", src, dst, _eflags); | ||
947 | break; | ||
948 | case 0x24: /* and al imm8 */ | ||
949 | dst.type = OP_REG; | ||
950 | dst.ptr = &_regs[VCPU_REGS_RAX]; | ||
951 | dst.val = *(u8 *)dst.ptr; | ||
952 | dst.bytes = 1; | ||
953 | dst.orig_val = dst.val; | ||
954 | goto and; | ||
955 | case 0x25: /* and ax imm16, or eax imm32 */ | ||
956 | dst.type = OP_REG; | ||
957 | dst.bytes = op_bytes; | ||
958 | dst.ptr = &_regs[VCPU_REGS_RAX]; | ||
959 | if (op_bytes == 2) | ||
960 | dst.val = *(u16 *)dst.ptr; | ||
961 | else | ||
962 | dst.val = *(u32 *)dst.ptr; | ||
963 | dst.orig_val = dst.val; | ||
964 | goto and; | ||
965 | case 0x28 ... 0x2d: | ||
966 | sub: /* sub */ | ||
967 | emulate_2op_SrcV("sub", src, dst, _eflags); | ||
968 | break; | ||
969 | case 0x30 ... 0x35: | ||
970 | xor: /* xor */ | ||
971 | emulate_2op_SrcV("xor", src, dst, _eflags); | ||
972 | break; | ||
973 | case 0x38 ... 0x3d: | ||
974 | cmp: /* cmp */ | ||
975 | emulate_2op_SrcV("cmp", src, dst, _eflags); | ||
976 | break; | ||
977 | case 0x63: /* movsxd */ | ||
978 | if (mode != X86EMUL_MODE_PROT64) | ||
979 | goto cannot_emulate; | ||
980 | dst.val = (s32) src.val; | ||
981 | break; | ||
982 | case 0x80 ... 0x83: /* Grp1 */ | ||
983 | switch (modrm_reg) { | ||
984 | case 0: | ||
985 | goto add; | ||
986 | case 1: | ||
987 | goto or; | ||
988 | case 2: | ||
989 | goto adc; | ||
990 | case 3: | ||
991 | goto sbb; | ||
992 | case 4: | ||
993 | goto and; | ||
994 | case 5: | ||
995 | goto sub; | ||
996 | case 6: | ||
997 | goto xor; | ||
998 | case 7: | ||
999 | goto cmp; | ||
1000 | } | ||
1001 | break; | ||
1002 | case 0x84 ... 0x85: | ||
1003 | test: /* test */ | ||
1004 | emulate_2op_SrcV("test", src, dst, _eflags); | ||
1005 | break; | ||
1006 | case 0x86 ... 0x87: /* xchg */ | ||
1007 | /* Write back the register source. */ | ||
1008 | switch (dst.bytes) { | ||
1009 | case 1: | ||
1010 | *(u8 *) src.ptr = (u8) dst.val; | ||
1011 | break; | ||
1012 | case 2: | ||
1013 | *(u16 *) src.ptr = (u16) dst.val; | ||
1014 | break; | ||
1015 | case 4: | ||
1016 | *src.ptr = (u32) dst.val; | ||
1017 | break; /* 64b reg: zero-extend */ | ||
1018 | case 8: | ||
1019 | *src.ptr = dst.val; | ||
1020 | break; | ||
1021 | } | ||
1022 | /* | ||
1023 | * Write back the memory destination with implicit LOCK | ||
1024 | * prefix. | ||
1025 | */ | ||
1026 | dst.val = src.val; | ||
1027 | lock_prefix = 1; | ||
1028 | break; | ||
1029 | case 0x88 ... 0x8b: /* mov */ | ||
1030 | goto mov; | ||
1031 | case 0x8d: /* lea r16/r32, m */ | ||
1032 | dst.val = modrm_ea; /* lea loads the effective address, not the value */ | ||
1033 | break; | ||
1034 | case 0x8f: /* pop (sole member of Grp1a) */ | ||
1035 | /* 64-bit mode: POP always pops a 64-bit operand. */ | ||
1036 | if (mode == X86EMUL_MODE_PROT64) | ||
1037 | dst.bytes = 8; | ||
1038 | if ((rc = ops->read_std(register_address(ctxt->ss_base, | ||
1039 | _regs[VCPU_REGS_RSP]), | ||
1040 | &dst.val, dst.bytes, ctxt->vcpu)) != 0) | ||
1041 | goto done; | ||
1042 | register_address_increment(_regs[VCPU_REGS_RSP], dst.bytes); | ||
1043 | break; | ||
1044 | case 0xa0 ... 0xa1: /* mov */ | ||
1045 | dst.ptr = (unsigned long *)&_regs[VCPU_REGS_RAX]; | ||
1046 | dst.val = src.val; | ||
1047 | _eip += ad_bytes; /* skip src displacement */ | ||
1048 | break; | ||
1049 | case 0xa2 ... 0xa3: /* mov */ | ||
1050 | dst.val = (unsigned long)_regs[VCPU_REGS_RAX]; | ||
1051 | _eip += ad_bytes; /* skip dst displacement */ | ||
1052 | break; | ||
1053 | case 0xc0 ... 0xc1: | ||
1054 | grp2: /* Grp2 */ | ||
1055 | switch (modrm_reg) { | ||
1056 | case 0: /* rol */ | ||
1057 | emulate_2op_SrcB("rol", src, dst, _eflags); | ||
1058 | break; | ||
1059 | case 1: /* ror */ | ||
1060 | emulate_2op_SrcB("ror", src, dst, _eflags); | ||
1061 | break; | ||
1062 | case 2: /* rcl */ | ||
1063 | emulate_2op_SrcB("rcl", src, dst, _eflags); | ||
1064 | break; | ||
1065 | case 3: /* rcr */ | ||
1066 | emulate_2op_SrcB("rcr", src, dst, _eflags); | ||
1067 | break; | ||
1068 | case 4: /* sal/shl */ | ||
1069 | case 6: /* sal/shl */ | ||
1070 | emulate_2op_SrcB("sal", src, dst, _eflags); | ||
1071 | break; | ||
1072 | case 5: /* shr */ | ||
1073 | emulate_2op_SrcB("shr", src, dst, _eflags); | ||
1074 | break; | ||
1075 | case 7: /* sar */ | ||
1076 | emulate_2op_SrcB("sar", src, dst, _eflags); | ||
1077 | break; | ||
1078 | } | ||
1079 | break; | ||
1080 | case 0xc6 ... 0xc7: /* mov (sole member of Grp11) */ | ||
1081 | mov: | ||
1082 | dst.val = src.val; | ||
1083 | break; | ||
1084 | case 0xd0 ... 0xd1: /* Grp2 */ | ||
1085 | src.val = 1; | ||
1086 | goto grp2; | ||
1087 | case 0xd2 ... 0xd3: /* Grp2 */ | ||
1088 | src.val = _regs[VCPU_REGS_RCX]; | ||
1089 | goto grp2; | ||
1090 | case 0xf6 ... 0xf7: /* Grp3 */ | ||
1091 | switch (modrm_reg) { | ||
1092 | case 0 ... 1: /* test */ | ||
1093 | /* | ||
1094 | * Special case in Grp3: test has an immediate | ||
1095 | * source operand. | ||
1096 | */ | ||
1097 | src.type = OP_IMM; | ||
1098 | src.ptr = (unsigned long *)_eip; | ||
1099 | src.bytes = (d & ByteOp) ? 1 : op_bytes; | ||
1100 | if (src.bytes == 8) | ||
1101 | src.bytes = 4; | ||
1102 | switch (src.bytes) { | ||
1103 | case 1: | ||
1104 | src.val = insn_fetch(s8, 1, _eip); | ||
1105 | break; | ||
1106 | case 2: | ||
1107 | src.val = insn_fetch(s16, 2, _eip); | ||
1108 | break; | ||
1109 | case 4: | ||
1110 | src.val = insn_fetch(s32, 4, _eip); | ||
1111 | break; | ||
1112 | } | ||
1113 | goto test; | ||
1114 | case 2: /* not */ | ||
1115 | dst.val = ~dst.val; | ||
1116 | break; | ||
1117 | case 3: /* neg */ | ||
1118 | emulate_1op("neg", dst, _eflags); | ||
1119 | break; | ||
1120 | default: | ||
1121 | goto cannot_emulate; | ||
1122 | } | ||
1123 | break; | ||
1124 | case 0xfe ... 0xff: /* Grp4/Grp5 */ | ||
1125 | switch (modrm_reg) { | ||
1126 | case 0: /* inc */ | ||
1127 | emulate_1op("inc", dst, _eflags); | ||
1128 | break; | ||
1129 | case 1: /* dec */ | ||
1130 | emulate_1op("dec", dst, _eflags); | ||
1131 | break; | ||
1132 | case 4: /* jmp abs */ | ||
1133 | if (b == 0xff) | ||
1134 | _eip = dst.val; | ||
1135 | else | ||
1136 | goto cannot_emulate; | ||
1137 | break; | ||
1138 | case 6: /* push */ | ||
1139 | /* 64-bit mode: PUSH always pushes a 64-bit operand. */ | ||
1140 | if (mode == X86EMUL_MODE_PROT64) { | ||
1141 | dst.bytes = 8; | ||
1142 | if ((rc = ops->read_std((unsigned long)dst.ptr, | ||
1143 | &dst.val, 8, | ||
1144 | ctxt->vcpu)) != 0) | ||
1145 | goto done; | ||
1146 | } | ||
1147 | register_address_increment(_regs[VCPU_REGS_RSP], | ||
1148 | -dst.bytes); | ||
1149 | if ((rc = ops->write_emulated( | ||
1150 | register_address(ctxt->ss_base, | ||
1151 | _regs[VCPU_REGS_RSP]), | ||
1152 | &dst.val, dst.bytes, ctxt->vcpu)) != 0) | ||
1153 | goto done; | ||
1154 | no_wb = 1; | ||
1155 | break; | ||
1156 | default: | ||
1157 | goto cannot_emulate; | ||
1158 | } | ||
1159 | break; | ||
1160 | } | ||
1161 | |||
1162 | writeback: | ||
1163 | if (!no_wb) { | ||
1164 | switch (dst.type) { | ||
1165 | case OP_REG: | ||
1166 | /* The 4-byte case *is* correct: in 64-bit mode we zero-extend. */ | ||
1167 | switch (dst.bytes) { | ||
1168 | case 1: | ||
1169 | *(u8 *)dst.ptr = (u8)dst.val; | ||
1170 | break; | ||
1171 | case 2: | ||
1172 | *(u16 *)dst.ptr = (u16)dst.val; | ||
1173 | break; | ||
1174 | case 4: | ||
1175 | *dst.ptr = (u32)dst.val; | ||
1176 | break; /* 64b: zero-ext */ | ||
1177 | case 8: | ||
1178 | *dst.ptr = dst.val; | ||
1179 | break; | ||
1180 | } | ||
1181 | break; | ||
1182 | case OP_MEM: | ||
1183 | if (lock_prefix) | ||
1184 | rc = ops->cmpxchg_emulated((unsigned long)dst. | ||
1185 | ptr, &dst.orig_val, | ||
1186 | &dst.val, dst.bytes, | ||
1187 | ctxt->vcpu); | ||
1188 | else | ||
1189 | rc = ops->write_emulated((unsigned long)dst.ptr, | ||
1190 | &dst.val, dst.bytes, | ||
1191 | ctxt->vcpu); | ||
1192 | if (rc != 0) | ||
1193 | goto done; | ||
1194 | default: | ||
1195 | break; | ||
1196 | } | ||
1197 | } | ||
1198 | |||
1199 | /* Commit shadow register state. */ | ||
1200 | memcpy(ctxt->vcpu->regs, _regs, sizeof _regs); | ||
1201 | ctxt->eflags = _eflags; | ||
1202 | ctxt->vcpu->rip = _eip; | ||
1203 | |||
1204 | done: | ||
1205 | return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0; | ||
1206 | |||
1207 | special_insn: | ||
1208 | if (twobyte) | ||
1209 | goto twobyte_special_insn; | ||
1210 | switch (b) { | ||
1211 | case 0x50 ... 0x57: /* push reg */ | ||
1212 | if (op_bytes == 2) | ||
1213 | src.val = (u16) _regs[b & 0x7]; | ||
1214 | else | ||
1215 | src.val = (u32) _regs[b & 0x7]; | ||
1216 | dst.type = OP_MEM; | ||
1217 | dst.bytes = op_bytes; | ||
1218 | dst.val = src.val; | ||
1219 | register_address_increment(_regs[VCPU_REGS_RSP], -op_bytes); | ||
1220 | dst.ptr = (void *) register_address( | ||
1221 | ctxt->ss_base, _regs[VCPU_REGS_RSP]); | ||
1222 | break; | ||
1223 | case 0x58 ... 0x5f: /* pop reg */ | ||
1224 | dst.ptr = (unsigned long *)&_regs[b & 0x7]; | ||
1225 | pop_instruction: | ||
1226 | if ((rc = ops->read_std(register_address(ctxt->ss_base, | ||
1227 | _regs[VCPU_REGS_RSP]), dst.ptr, op_bytes, ctxt->vcpu)) | ||
1228 | != 0) | ||
1229 | goto done; | ||
1230 | |||
1231 | register_address_increment(_regs[VCPU_REGS_RSP], op_bytes); | ||
1232 | no_wb = 1; /* Disable writeback. */ | ||
1233 | break; | ||
1234 | case 0x6a: /* push imm8 */ | ||
1235 | src.val = 0L; | ||
1236 | src.val = insn_fetch(s8, 1, _eip); | ||
1237 | push: | ||
1238 | dst.type = OP_MEM; | ||
1239 | dst.bytes = op_bytes; | ||
1240 | dst.val = src.val; | ||
1241 | register_address_increment(_regs[VCPU_REGS_RSP], -op_bytes); | ||
1242 | dst.ptr = (void *) register_address(ctxt->ss_base, | ||
1243 | _regs[VCPU_REGS_RSP]); | ||
1244 | break; | ||
1245 | case 0x6c: /* insb */ | ||
1246 | case 0x6d: /* insw/insd */ | ||
1247 | if (kvm_emulate_pio_string(ctxt->vcpu, NULL, | ||
1248 | 1, /* in */ | ||
1249 | (d & ByteOp) ? 1 : op_bytes, /* size */ | ||
1250 | rep_prefix ? | ||
1251 | address_mask(_regs[VCPU_REGS_RCX]) : 1, /* count */ | ||
1252 | (_eflags & EFLG_DF), /* down */ | ||
1253 | register_address(ctxt->es_base, | ||
1254 | _regs[VCPU_REGS_RDI]), /* address */ | ||
1255 | rep_prefix, | ||
1256 | _regs[VCPU_REGS_RDX] /* port */ | ||
1257 | ) == 0) | ||
1258 | return -1; | ||
1259 | return 0; | ||
1260 | case 0x6e: /* outsb */ | ||
1261 | case 0x6f: /* outsw/outsd */ | ||
1262 | if (kvm_emulate_pio_string(ctxt->vcpu, NULL, | ||
1263 | 0, /* in */ | ||
1264 | (d & ByteOp) ? 1 : op_bytes, /* size */ | ||
1265 | rep_prefix ? | ||
1266 | address_mask(_regs[VCPU_REGS_RCX]) : 1, /* count */ | ||
1267 | (_eflags & EFLG_DF), /* down */ | ||
1268 | register_address(override_base ? | ||
1269 | *override_base : ctxt->ds_base, | ||
1270 | _regs[VCPU_REGS_RSI]), /* address */ | ||
1271 | rep_prefix, | ||
1272 | _regs[VCPU_REGS_RDX] /* port */ | ||
1273 | ) == 0) | ||
1274 | return -1; | ||
1275 | return 0; | ||
1276 | case 0x70 ... 0x7f: /* jcc (short) */ { | ||
1277 | int rel = insn_fetch(s8, 1, _eip); | ||
1278 | |||
1279 | if (test_cc(b, _eflags)) | ||
1280 | JMP_REL(rel); | ||
1281 | break; | ||
1282 | } | ||
1283 | case 0x9c: /* pushf */ | ||
1284 | src.val = (unsigned long) _eflags; | ||
1285 | goto push; | ||
1286 | case 0x9d: /* popf */ | ||
1287 | dst.ptr = (unsigned long *) &_eflags; | ||
1288 | goto pop_instruction; | ||
1289 | case 0xc3: /* ret */ | ||
1290 | dst.ptr = &_eip; | ||
1291 | goto pop_instruction; | ||
1292 | case 0xf4: /* hlt */ | ||
1293 | ctxt->vcpu->halt_request = 1; | ||
1294 | goto done; | ||
1295 | } | ||
1296 | if (rep_prefix) { | ||
1297 | if (_regs[VCPU_REGS_RCX] == 0) { | ||
1298 | ctxt->vcpu->rip = _eip; | ||
1299 | goto done; | ||
1300 | } | ||
1301 | _regs[VCPU_REGS_RCX]--; | ||
1302 | _eip = ctxt->vcpu->rip; | ||
1303 | } | ||
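	/*
	 * Note that the rep handling above emulates one element per pass:
	 * RCX is decremented and _eip is rewound to the instruction start,
	 * so "rep movsb" with a count of N re-enters the emulator N times
	 * before RCX reaches zero and the instruction finally retires.
	 */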
1304 | switch (b) { | ||
1305 | case 0xa4 ... 0xa5: /* movs */ | ||
1306 | dst.type = OP_MEM; | ||
1307 | dst.bytes = (d & ByteOp) ? 1 : op_bytes; | ||
1308 | dst.ptr = (unsigned long *)register_address(ctxt->es_base, | ||
1309 | _regs[VCPU_REGS_RDI]); | ||
1310 | if ((rc = ops->read_emulated(register_address( | ||
1311 | override_base ? *override_base : ctxt->ds_base, | ||
1312 | _regs[VCPU_REGS_RSI]), &dst.val, dst.bytes, ctxt->vcpu)) != 0) | ||
1313 | goto done; | ||
1314 | register_address_increment(_regs[VCPU_REGS_RSI], | ||
1315 | (_eflags & EFLG_DF) ? -dst.bytes : dst.bytes); | ||
1316 | register_address_increment(_regs[VCPU_REGS_RDI], | ||
1317 | (_eflags & EFLG_DF) ? -dst.bytes : dst.bytes); | ||
1318 | break; | ||
1319 | case 0xa6 ... 0xa7: /* cmps */ | ||
1320 | DPRINTF("Urk! I don't handle CMPS.\n"); | ||
1321 | goto cannot_emulate; | ||
1322 | case 0xaa ... 0xab: /* stos */ | ||
1323 | dst.type = OP_MEM; | ||
1324 | dst.bytes = (d & ByteOp) ? 1 : op_bytes; | ||
1325 | dst.ptr = (unsigned long *)cr2; | ||
1326 | dst.val = _regs[VCPU_REGS_RAX]; | ||
1327 | register_address_increment(_regs[VCPU_REGS_RDI], | ||
1328 | (_eflags & EFLG_DF) ? -dst.bytes : dst.bytes); | ||
1329 | break; | ||
1330 | case 0xac ... 0xad: /* lods */ | ||
1331 | dst.type = OP_REG; | ||
1332 | dst.bytes = (d & ByteOp) ? 1 : op_bytes; | ||
1333 | dst.ptr = (unsigned long *)&_regs[VCPU_REGS_RAX]; | ||
1334 | if ((rc = ops->read_emulated(cr2, &dst.val, dst.bytes, | ||
1335 | ctxt->vcpu)) != 0) | ||
1336 | goto done; | ||
1337 | register_address_increment(_regs[VCPU_REGS_RSI], | ||
1338 | (_eflags & EFLG_DF) ? -dst.bytes : dst.bytes); | ||
1339 | break; | ||
1340 | case 0xae ... 0xaf: /* scas */ | ||
1341 | DPRINTF("Urk! I don't handle SCAS.\n"); | ||
1342 | goto cannot_emulate; | ||
1343 | case 0xe8: /* call (near) */ { | ||
1344 | long int rel; | ||
1345 | switch (op_bytes) { | ||
1346 | case 2: | ||
1347 | rel = insn_fetch(s16, 2, _eip); | ||
1348 | break; | ||
1349 | case 4: | ||
1350 | rel = insn_fetch(s32, 4, _eip); | ||
1351 | break; | ||
1352 | case 8: | ||
1353 | rel = insn_fetch(s64, 8, _eip); | ||
1354 | break; | ||
1355 | default: | ||
1356 | DPRINTF("Call: Invalid op_bytes\n"); | ||
1357 | goto cannot_emulate; | ||
1358 | } | ||
1359 | src.val = (unsigned long) _eip; | ||
1360 | JMP_REL(rel); | ||
1361 | op_bytes = ad_bytes; | ||
1362 | goto push; | ||
1363 | } | ||
1364 | case 0xe9: /* jmp rel */ | ||
1365 | case 0xeb: /* jmp rel short */ | ||
1366 | JMP_REL(src.val); | ||
1367 | no_wb = 1; /* Disable writeback. */ | ||
1368 | break; | ||
1369 | |||
1370 | |||
1371 | } | ||
1372 | goto writeback; | ||
1373 | |||
1374 | twobyte_insn: | ||
1375 | switch (b) { | ||
1376 | case 0x01: /* lgdt, lidt, lmsw */ | ||
1377 | /* Disable writeback. */ | ||
1378 | no_wb = 1; | ||
1379 | switch (modrm_reg) { | ||
1380 | u16 size; | ||
1381 | unsigned long address; | ||
1382 | |||
1383 | case 2: /* lgdt */ | ||
1384 | rc = read_descriptor(ctxt, ops, src.ptr, | ||
1385 | &size, &address, op_bytes); | ||
1386 | if (rc) | ||
1387 | goto done; | ||
1388 | realmode_lgdt(ctxt->vcpu, size, address); | ||
1389 | break; | ||
1390 | case 3: /* lidt */ | ||
1391 | rc = read_descriptor(ctxt, ops, src.ptr, | ||
1392 | &size, &address, op_bytes); | ||
1393 | if (rc) | ||
1394 | goto done; | ||
1395 | realmode_lidt(ctxt->vcpu, size, address); | ||
1396 | break; | ||
1397 | case 4: /* smsw */ | ||
1398 | if (modrm_mod != 3) | ||
1399 | goto cannot_emulate; | ||
1400 | *(u16 *)&_regs[modrm_rm] | ||
1401 | = realmode_get_cr(ctxt->vcpu, 0); | ||
1402 | break; | ||
1403 | case 6: /* lmsw */ | ||
1404 | if (modrm_mod != 3) | ||
1405 | goto cannot_emulate; | ||
1406 | realmode_lmsw(ctxt->vcpu, (u16)modrm_val, &_eflags); | ||
1407 | break; | ||
1408 | case 7: /* invlpg */ | ||
1409 | emulate_invlpg(ctxt->vcpu, cr2); | ||
1410 | break; | ||
1411 | default: | ||
1412 | goto cannot_emulate; | ||
1413 | } | ||
1414 | break; | ||
1415 | case 0x21: /* mov from dr to reg */ | ||
1416 | no_wb = 1; | ||
1417 | if (modrm_mod != 3) | ||
1418 | goto cannot_emulate; | ||
1419 | rc = emulator_get_dr(ctxt, modrm_reg, &_regs[modrm_rm]); | ||
1420 | break; | ||
1421 | case 0x23: /* mov from reg to dr */ | ||
1422 | no_wb = 1; | ||
1423 | if (modrm_mod != 3) | ||
1424 | goto cannot_emulate; | ||
1425 | rc = emulator_set_dr(ctxt, modrm_reg, _regs[modrm_rm]); | ||
1426 | break; | ||
1427 | case 0x40 ... 0x4f: /* cmov */ | ||
1428 | dst.val = dst.orig_val = src.val; | ||
1429 | no_wb = 1; | ||
1430 | /* | ||
1431 | * First, assume we're decoding an even cmov opcode | ||
1432 | * (lsb == 0). | ||
1433 | */ | ||
1434 | switch ((b & 15) >> 1) { | ||
1435 | case 0: /* cmovo */ | ||
1436 | no_wb = (_eflags & EFLG_OF) ? 0 : 1; | ||
1437 | break; | ||
1438 | case 1: /* cmovb/cmovc/cmovnae */ | ||
1439 | no_wb = (_eflags & EFLG_CF) ? 0 : 1; | ||
1440 | break; | ||
1441 | case 2: /* cmovz/cmove */ | ||
1442 | no_wb = (_eflags & EFLG_ZF) ? 0 : 1; | ||
1443 | break; | ||
1444 | case 3: /* cmovbe/cmovna */ | ||
1445 | no_wb = (_eflags & (EFLG_CF | EFLG_ZF)) ? 0 : 1; | ||
1446 | break; | ||
1447 | case 4: /* cmovs */ | ||
1448 | no_wb = (_eflags & EFLG_SF) ? 0 : 1; | ||
1449 | break; | ||
1450 | case 5: /* cmovp/cmovpe */ | ||
1451 | no_wb = (_eflags & EFLG_PF) ? 0 : 1; | ||
1452 | break; | ||
1453 | case 7: /* cmovle/cmovng */ | ||
1454 | no_wb = (_eflags & EFLG_ZF) ? 0 : 1; | ||
1455 | /* fall through */ | ||
1456 | case 6: /* cmovl/cmovnge */ | ||
1457 | no_wb &= (!(_eflags & EFLG_SF) != | ||
1458 | !(_eflags & EFLG_OF)) ? 0 : 1; | ||
1459 | break; | ||
1460 | } | ||
1461 | /* Odd cmov opcodes (lsb == 1) have inverted sense. */ | ||
1462 | no_wb ^= b & 1; | ||
1463 | break; | ||
1464 | case 0xa3: | ||
1465 | bt: /* bt */ | ||
1466 | src.val &= (dst.bytes << 3) - 1; /* only subword offset */ | ||
1467 | emulate_2op_SrcV_nobyte("bt", src, dst, _eflags); | ||
1468 | break; | ||
1469 | case 0xab: | ||
1470 | bts: /* bts */ | ||
1471 | src.val &= (dst.bytes << 3) - 1; /* only subword offset */ | ||
1472 | emulate_2op_SrcV_nobyte("bts", src, dst, _eflags); | ||
1473 | break; | ||
1474 | case 0xb0 ... 0xb1: /* cmpxchg */ | ||
1475 | /* | ||
1476 | * Save real source value, then compare EAX against | ||
1477 | * destination. | ||
1478 | */ | ||
1479 | src.orig_val = src.val; | ||
1480 | src.val = _regs[VCPU_REGS_RAX]; | ||
1481 | emulate_2op_SrcV("cmp", src, dst, _eflags); | ||
1482 | if (_eflags & EFLG_ZF) { | ||
1483 | /* Success: write back to memory. */ | ||
1484 | dst.val = src.orig_val; | ||
1485 | } else { | ||
1486 | /* Failure: write the value we saw to EAX. */ | ||
1487 | dst.type = OP_REG; | ||
1488 | dst.ptr = (unsigned long *)&_regs[VCPU_REGS_RAX]; | ||
1489 | } | ||
1490 | break; | ||
1491 | case 0xb3: | ||
1492 | btr: /* btr */ | ||
1493 | src.val &= (dst.bytes << 3) - 1; /* only subword offset */ | ||
1494 | emulate_2op_SrcV_nobyte("btr", src, dst, _eflags); | ||
1495 | break; | ||
1496 | case 0xb6 ... 0xb7: /* movzx */ | ||
1497 | dst.bytes = op_bytes; | ||
1498 | dst.val = (d & ByteOp) ? (u8) src.val : (u16) src.val; | ||
1499 | break; | ||
1500 | case 0xba: /* Grp8 */ | ||
1501 | switch (modrm_reg & 3) { /* Grp8: reg 4-7 select bt/bts/btr/btc */ | ||
1502 | case 0: | ||
1503 | goto bt; | ||
1504 | case 1: | ||
1505 | goto bts; | ||
1506 | case 2: | ||
1507 | goto btr; | ||
1508 | case 3: | ||
1509 | goto btc; | ||
1510 | } | ||
1511 | break; | ||
1512 | case 0xbb: | ||
1513 | btc: /* btc */ | ||
1514 | src.val &= (dst.bytes << 3) - 1; /* only subword offset */ | ||
1515 | emulate_2op_SrcV_nobyte("btc", src, dst, _eflags); | ||
1516 | break; | ||
1517 | case 0xbe ... 0xbf: /* movsx */ | ||
1518 | dst.bytes = op_bytes; | ||
1519 | dst.val = (d & ByteOp) ? (s8) src.val : (s16) src.val; | ||
1520 | break; | ||
1521 | case 0xc3: /* movnti */ | ||
1522 | dst.bytes = op_bytes; | ||
1523 | dst.val = (op_bytes == 4) ? (u32) src.val : (u64) src.val; | ||
1524 | break; | ||
1525 | } | ||
1526 | goto writeback; | ||
1527 | |||
1528 | twobyte_special_insn: | ||
1529 | /* Disable writeback. */ | ||
1530 | no_wb = 1; | ||
1531 | switch (b) { | ||
1532 | case 0x06: | ||
1533 | emulate_clts(ctxt->vcpu); | ||
1534 | break; | ||
1535 | case 0x08: /* invd */ | ||
1536 | break; | ||
1537 | case 0x09: /* wbinvd */ | ||
1538 | break; | ||
1539 | case 0x0d: /* GrpP (prefetch) */ | ||
1540 | case 0x18: /* Grp16 (prefetch/nop) */ | ||
1541 | break; | ||
1542 | case 0x20: /* mov cr, reg */ | ||
1543 | if (modrm_mod != 3) | ||
1544 | goto cannot_emulate; | ||
1545 | _regs[modrm_rm] = realmode_get_cr(ctxt->vcpu, modrm_reg); | ||
1546 | break; | ||
1547 | case 0x22: /* mov reg, cr */ | ||
1548 | if (modrm_mod != 3) | ||
1549 | goto cannot_emulate; | ||
1550 | realmode_set_cr(ctxt->vcpu, modrm_reg, modrm_val, &_eflags); | ||
1551 | break; | ||
1552 | case 0x30: | ||
1553 | /* wrmsr */ | ||
1554 | msr_data = (u32)_regs[VCPU_REGS_RAX] | ||
1555 | | ((u64)_regs[VCPU_REGS_RDX] << 32); | ||
1556 | rc = kvm_set_msr(ctxt->vcpu, _regs[VCPU_REGS_RCX], msr_data); | ||
1557 | if (rc) { | ||
1558 | kvm_x86_ops->inject_gp(ctxt->vcpu, 0); | ||
1559 | _eip = ctxt->vcpu->rip; | ||
1560 | } | ||
1561 | rc = X86EMUL_CONTINUE; | ||
1562 | break; | ||
1563 | case 0x32: | ||
1564 | /* rdmsr */ | ||
1565 | rc = kvm_get_msr(ctxt->vcpu, _regs[VCPU_REGS_RCX], &msr_data); | ||
1566 | if (rc) { | ||
1567 | kvm_x86_ops->inject_gp(ctxt->vcpu, 0); | ||
1568 | _eip = ctxt->vcpu->rip; | ||
1569 | } else { | ||
1570 | _regs[VCPU_REGS_RAX] = (u32)msr_data; | ||
1571 | _regs[VCPU_REGS_RDX] = msr_data >> 32; | ||
1572 | } | ||
1573 | rc = X86EMUL_CONTINUE; | ||
1574 | break; | ||
1575 | case 0x80 ... 0x8f: /* jcc rel (near) */ { | ||
1576 | long int rel; | ||
1577 | |||
1578 | switch (op_bytes) { | ||
1579 | case 2: | ||
1580 | rel = insn_fetch(s16, 2, _eip); | ||
1581 | break; | ||
1582 | case 4: | ||
1583 | rel = insn_fetch(s32, 4, _eip); | ||
1584 | break; | ||
1585 | case 8: | ||
1586 | rel = insn_fetch(s64, 8, _eip); | ||
1587 | break; | ||
1588 | default: | ||
1589 | DPRINTF("jnz: Invalid op_bytes\n"); | ||
1590 | goto cannot_emulate; | ||
1591 | } | ||
1592 | if (test_cc(b, _eflags)) | ||
1593 | JMP_REL(rel); | ||
1594 | break; | ||
1595 | } | ||
1596 | case 0xc7: /* Grp9 (cmpxchg8b) */ | ||
1597 | { | ||
1598 | u64 old, new; | ||
1599 | if ((rc = ops->read_emulated(cr2, &old, 8, ctxt->vcpu)) | ||
1600 | != 0) | ||
1601 | goto done; | ||
1602 | if (((u32) (old >> 0) != (u32) _regs[VCPU_REGS_RAX]) || | ||
1603 | ((u32) (old >> 32) != (u32) _regs[VCPU_REGS_RDX])) { | ||
1604 | _regs[VCPU_REGS_RAX] = (u32) (old >> 0); | ||
1605 | _regs[VCPU_REGS_RDX] = (u32) (old >> 32); | ||
1606 | _eflags &= ~EFLG_ZF; | ||
1607 | } else { | ||
1608 | new = ((u64)_regs[VCPU_REGS_RCX] << 32) | ||
1609 | | (u32) _regs[VCPU_REGS_RBX]; | ||
1610 | if ((rc = ops->cmpxchg_emulated(cr2, &old, | ||
1611 | &new, 8, ctxt->vcpu)) != 0) | ||
1612 | goto done; | ||
1613 | _eflags |= EFLG_ZF; | ||
1614 | } | ||
1615 | break; | ||
1616 | } | ||
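		/*
		 * In C terms, the block above is cmpxchg8b: compare
		 * EDX:EAX with the 64-bit memory operand; on a match,
		 * store ECX:EBX and set ZF, otherwise load the old value
		 * into EDX:EAX and clear ZF.
		 */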
1617 | } | ||
1618 | goto writeback; | ||
1619 | |||
1620 | cannot_emulate: | ||
1621 | DPRINTF("Cannot emulate %02x\n", b); | ||
1622 | return -1; | ||
1623 | } | ||
1624 | |||
1625 | #ifdef __XEN__ | ||
1626 | |||
1627 | #include <asm/mm.h> | ||
1628 | #include <asm/uaccess.h> | ||
1629 | |||
1630 | int | ||
1631 | x86_emulate_read_std(unsigned long addr, | ||
1632 | unsigned long *val, | ||
1633 | unsigned int bytes, struct x86_emulate_ctxt *ctxt) | ||
1634 | { | ||
1635 | unsigned int rc; | ||
1636 | |||
1637 | *val = 0; | ||
1638 | |||
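	/*
	 * copy_from_user() returns the number of bytes it could not copy,
	 * so addr + bytes - rc is the first faulting byte; the write path
	 * below uses the same arithmetic.
	 */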
1639 | if ((rc = copy_from_user((void *)val, (void *)addr, bytes)) != 0) { | ||
1640 | propagate_page_fault(addr + bytes - rc, 0); /* read fault */ | ||
1641 | return X86EMUL_PROPAGATE_FAULT; | ||
1642 | } | ||
1643 | |||
1644 | return X86EMUL_CONTINUE; | ||
1645 | } | ||
1646 | |||
1647 | int | ||
1648 | x86_emulate_write_std(unsigned long addr, | ||
1649 | unsigned long val, | ||
1650 | unsigned int bytes, struct x86_emulate_ctxt *ctxt) | ||
1651 | { | ||
1652 | unsigned int rc; | ||
1653 | |||
1654 | if ((rc = copy_to_user((void *)addr, (void *)&val, bytes)) != 0) { | ||
1655 | propagate_page_fault(addr + bytes - rc, PGERR_write_access); | ||
1656 | return X86EMUL_PROPAGATE_FAULT; | ||
1657 | } | ||
1658 | |||
1659 | return X86EMUL_CONTINUE; | ||
1660 | } | ||
1661 | |||
1662 | #endif | ||
diff --git a/include/asm-x86/Kbuild b/include/asm-x86/Kbuild index e6189b229143..3c6f0f80e827 100644 --- a/include/asm-x86/Kbuild +++ b/include/asm-x86/Kbuild | |||
@@ -3,6 +3,7 @@ include include/asm-generic/Kbuild.asm | |||
3 | header-y += boot.h | 3 | header-y += boot.h |
4 | header-y += bootparam.h | 4 | header-y += bootparam.h |
5 | header-y += debugreg.h | 5 | header-y += debugreg.h |
6 | header-y += kvm.h | ||
6 | header-y += ldt.h | 7 | header-y += ldt.h |
7 | header-y += msr-index.h | 8 | header-y += msr-index.h |
8 | header-y += prctl.h | 9 | header-y += prctl.h |
diff --git a/include/asm-x86/kvm.h b/include/asm-x86/kvm.h new file mode 100644 index 000000000000..7a71120426a3 --- /dev/null +++ b/include/asm-x86/kvm.h | |||
@@ -0,0 +1,191 @@ | |||
1 | #ifndef __LINUX_KVM_X86_H | ||
2 | #define __LINUX_KVM_X86_H | ||
3 | |||
4 | /* | ||
5 | * KVM x86 specific structures and definitions | ||
6 | * | ||
7 | */ | ||
8 | |||
9 | #include <asm/types.h> | ||
10 | #include <linux/ioctl.h> | ||
11 | |||
12 | /* Architectural interrupt line count. */ | ||
13 | #define KVM_NR_INTERRUPTS 256 | ||
14 | |||
15 | struct kvm_memory_alias { | ||
16 | __u32 slot; /* this has a different namespace than memory slots */ | ||
17 | __u32 flags; | ||
18 | __u64 guest_phys_addr; | ||
19 | __u64 memory_size; | ||
20 | __u64 target_phys_addr; | ||
21 | }; | ||
22 | |||
23 | /* for KVM_GET_IRQCHIP and KVM_SET_IRQCHIP */ | ||
24 | struct kvm_pic_state { | ||
25 | __u8 last_irr; /* edge detection */ | ||
26 | __u8 irr; /* interrupt request register */ | ||
27 | __u8 imr; /* interrupt mask register */ | ||
28 | __u8 isr; /* interrupt service register */ | ||
29 | __u8 priority_add; /* highest irq priority */ | ||
30 | __u8 irq_base; | ||
31 | __u8 read_reg_select; | ||
32 | __u8 poll; | ||
33 | __u8 special_mask; | ||
34 | __u8 init_state; | ||
35 | __u8 auto_eoi; | ||
36 | __u8 rotate_on_auto_eoi; | ||
37 | __u8 special_fully_nested_mode; | ||
38 | __u8 init4; /* true if 4-byte init */ | ||
39 | __u8 elcr; /* PIIX edge/trigger selection */ | ||
40 | __u8 elcr_mask; | ||
41 | }; | ||
42 | |||
43 | #define KVM_IOAPIC_NUM_PINS 24 | ||
44 | struct kvm_ioapic_state { | ||
45 | __u64 base_address; | ||
46 | __u32 ioregsel; | ||
47 | __u32 id; | ||
48 | __u32 irr; | ||
49 | __u32 pad; | ||
50 | union { | ||
51 | __u64 bits; | ||
52 | struct { | ||
53 | __u8 vector; | ||
54 | __u8 delivery_mode:3; | ||
55 | __u8 dest_mode:1; | ||
56 | __u8 delivery_status:1; | ||
57 | __u8 polarity:1; | ||
58 | __u8 remote_irr:1; | ||
59 | __u8 trig_mode:1; | ||
60 | __u8 mask:1; | ||
61 | __u8 reserve:7; | ||
62 | __u8 reserved[4]; | ||
63 | __u8 dest_id; | ||
64 | } fields; | ||
65 | } redirtbl[KVM_IOAPIC_NUM_PINS]; | ||
66 | }; | ||
67 | |||
68 | #define KVM_IRQCHIP_PIC_MASTER 0 | ||
69 | #define KVM_IRQCHIP_PIC_SLAVE 1 | ||
70 | #define KVM_IRQCHIP_IOAPIC 2 | ||
71 | |||
72 | /* for KVM_GET_REGS and KVM_SET_REGS */ | ||
73 | struct kvm_regs { | ||
74 | /* out (KVM_GET_REGS) / in (KVM_SET_REGS) */ | ||
75 | __u64 rax, rbx, rcx, rdx; | ||
76 | __u64 rsi, rdi, rsp, rbp; | ||
77 | __u64 r8, r9, r10, r11; | ||
78 | __u64 r12, r13, r14, r15; | ||
79 | __u64 rip, rflags; | ||
80 | }; | ||
81 | |||
82 | /* for KVM_GET_LAPIC and KVM_SET_LAPIC */ | ||
83 | #define KVM_APIC_REG_SIZE 0x400 | ||
84 | struct kvm_lapic_state { | ||
85 | char regs[KVM_APIC_REG_SIZE]; | ||
86 | }; | ||
87 | |||
88 | struct kvm_segment { | ||
89 | __u64 base; | ||
90 | __u32 limit; | ||
91 | __u16 selector; | ||
92 | __u8 type; | ||
93 | __u8 present, dpl, db, s, l, g, avl; | ||
94 | __u8 unusable; | ||
95 | __u8 padding; | ||
96 | }; | ||
97 | |||
98 | struct kvm_dtable { | ||
99 | __u64 base; | ||
100 | __u16 limit; | ||
101 | __u16 padding[3]; | ||
102 | }; | ||
103 | |||
104 | |||
105 | /* for KVM_GET_SREGS and KVM_SET_SREGS */ | ||
106 | struct kvm_sregs { | ||
107 | /* out (KVM_GET_SREGS) / in (KVM_SET_SREGS) */ | ||
108 | struct kvm_segment cs, ds, es, fs, gs, ss; | ||
109 | struct kvm_segment tr, ldt; | ||
110 | struct kvm_dtable gdt, idt; | ||
111 | __u64 cr0, cr2, cr3, cr4, cr8; | ||
112 | __u64 efer; | ||
113 | __u64 apic_base; | ||
114 | __u64 interrupt_bitmap[(KVM_NR_INTERRUPTS + 63) / 64]; | ||
115 | }; | ||
116 | |||
117 | /* for KVM_GET_FPU and KVM_SET_FPU */ | ||
118 | struct kvm_fpu { | ||
119 | __u8 fpr[8][16]; | ||
120 | __u16 fcw; | ||
121 | __u16 fsw; | ||
122 | __u8 ftwx; /* in fxsave format */ | ||
123 | __u8 pad1; | ||
124 | __u16 last_opcode; | ||
125 | __u64 last_ip; | ||
126 | __u64 last_dp; | ||
127 | __u8 xmm[16][16]; | ||
128 | __u32 mxcsr; | ||
129 | __u32 pad2; | ||
130 | }; | ||
131 | |||
132 | struct kvm_msr_entry { | ||
133 | __u32 index; | ||
134 | __u32 reserved; | ||
135 | __u64 data; | ||
136 | }; | ||
137 | |||
138 | /* for KVM_GET_MSRS and KVM_SET_MSRS */ | ||
139 | struct kvm_msrs { | ||
140 | __u32 nmsrs; /* number of msrs in entries */ | ||
141 | __u32 pad; | ||
142 | |||
143 | struct kvm_msr_entry entries[0]; | ||
144 | }; | ||
145 | |||
146 | /* for KVM_GET_MSR_INDEX_LIST */ | ||
147 | struct kvm_msr_list { | ||
148 | __u32 nmsrs; /* number of msrs in entries */ | ||
149 | __u32 indices[0]; | ||
150 | }; | ||
151 | |||
152 | |||
153 | struct kvm_cpuid_entry { | ||
154 | __u32 function; | ||
155 | __u32 eax; | ||
156 | __u32 ebx; | ||
157 | __u32 ecx; | ||
158 | __u32 edx; | ||
159 | __u32 padding; | ||
160 | }; | ||
161 | |||
162 | /* for KVM_SET_CPUID */ | ||
163 | struct kvm_cpuid { | ||
164 | __u32 nent; | ||
165 | __u32 padding; | ||
166 | struct kvm_cpuid_entry entries[0]; | ||
167 | }; | ||
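/*
 * Usage sketch (userspace, illustrative): the trailing entries[0] is the
 * old zero-length-array idiom, sized by the caller, e.g.:
 *
 *	struct kvm_cpuid *c = malloc(sizeof(*c) + n * sizeof(c->entries[0]));
 *	c->nent = n;
 *	... fill c->entries[0..n-1], then ioctl(vcpu_fd, KVM_SET_CPUID, c);
 */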
168 | |||
169 | struct kvm_cpuid_entry2 { | ||
170 | __u32 function; | ||
171 | __u32 index; | ||
172 | __u32 flags; | ||
173 | __u32 eax; | ||
174 | __u32 ebx; | ||
175 | __u32 ecx; | ||
176 | __u32 edx; | ||
177 | __u32 padding[3]; | ||
178 | }; | ||
179 | |||
180 | #define KVM_CPUID_FLAG_SIGNIFCANT_INDEX 1 | ||
181 | #define KVM_CPUID_FLAG_STATEFUL_FUNC 2 | ||
182 | #define KVM_CPUID_FLAG_STATE_READ_NEXT 4 | ||
183 | |||
184 | /* for KVM_SET_CPUID2 */ | ||
185 | struct kvm_cpuid2 { | ||
186 | __u32 nent; | ||
187 | __u32 padding; | ||
188 | struct kvm_cpuid_entry2 entries[0]; | ||
189 | }; | ||
190 | |||
191 | #endif | ||
diff --git a/drivers/kvm/kvm.h b/include/asm-x86/kvm_host.h index 3b0bc4bda5f2..4702b04b979a 100644 --- a/drivers/kvm/kvm.h +++ b/include/asm-x86/kvm_host.h | |||
@@ -1,23 +1,24 @@ | |||
1 | #ifndef __KVM_H | 1 | /* |
2 | #define __KVM_H | 2 | * Kernel-based Virtual Machine driver for Linux |
3 | 3 | * | |
4 | /* | 4 | * This header defines architecture specific interfaces, x86 version |
5 | * | ||
5 | * This work is licensed under the terms of the GNU GPL, version 2. See | 6 | * This work is licensed under the terms of the GNU GPL, version 2. See |
6 | * the COPYING file in the top-level directory. | 7 | * the COPYING file in the top-level directory. |
8 | * | ||
7 | */ | 9 | */ |
8 | 10 | ||
11 | #ifndef ASM_KVM_HOST_H | ||
12 | #define ASM_KVM_HOST_H | ||
13 | |||
9 | #include <linux/types.h> | 14 | #include <linux/types.h> |
10 | #include <linux/list.h> | ||
11 | #include <linux/mutex.h> | ||
12 | #include <linux/spinlock.h> | ||
13 | #include <linux/signal.h> | ||
14 | #include <linux/sched.h> | ||
15 | #include <linux/mm.h> | 15 | #include <linux/mm.h> |
16 | #include <linux/preempt.h> | ||
17 | #include <asm/signal.h> | ||
18 | 16 | ||
19 | #include <linux/kvm.h> | 17 | #include <linux/kvm.h> |
20 | #include <linux/kvm_para.h> | 18 | #include <linux/kvm_para.h> |
19 | #include <linux/kvm_types.h> | ||
20 | |||
21 | #include <asm/desc.h> | ||
21 | 22 | ||
22 | #define CR3_PAE_RESERVED_BITS ((X86_CR3_PWT | X86_CR3_PCD) - 1) | 23 | #define CR3_PAE_RESERVED_BITS ((X86_CR3_PWT | X86_CR3_PCD) - 1) |
23 | #define CR3_NONPAE_RESERVED_BITS ((PAGE_SIZE-1) & ~(X86_CR3_PWT | X86_CR3_PCD)) | 24 | #define CR3_NONPAE_RESERVED_BITS ((PAGE_SIZE-1) & ~(X86_CR3_PWT | X86_CR3_PCD)) |
@@ -37,15 +38,8 @@ | |||
37 | #define INVALID_PAGE (~(hpa_t)0) | 38 | #define INVALID_PAGE (~(hpa_t)0) |
38 | #define UNMAPPED_GVA (~(gpa_t)0) | 39 | #define UNMAPPED_GVA (~(gpa_t)0) |
39 | 40 | ||
40 | #define KVM_MAX_VCPUS 4 | ||
41 | #define KVM_ALIAS_SLOTS 4 | ||
42 | #define KVM_MEMORY_SLOTS 8 | ||
43 | #define KVM_NUM_MMU_PAGES 1024 | ||
44 | #define KVM_MIN_FREE_MMU_PAGES 5 | ||
45 | #define KVM_REFILL_PAGES 25 | ||
46 | #define KVM_MAX_CPUID_ENTRIES 40 | ||
47 | |||
48 | #define DE_VECTOR 0 | 41 | #define DE_VECTOR 0 |
42 | #define UD_VECTOR 6 | ||
49 | #define NM_VECTOR 7 | 43 | #define NM_VECTOR 7 |
50 | #define DF_VECTOR 8 | 44 | #define DF_VECTOR 8 |
51 | #define TS_VECTOR 10 | 45 | #define TS_VECTOR 10 |
@@ -59,31 +53,66 @@ | |||
59 | 53 | ||
60 | #define IOPL_SHIFT 12 | 54 | #define IOPL_SHIFT 12 |
61 | 55 | ||
62 | #define KVM_PIO_PAGE_OFFSET 1 | 56 | #define KVM_ALIAS_SLOTS 4 |
63 | 57 | ||
64 | /* | 58 | #define KVM_PERMILLE_MMU_PAGES 20 |
65 | * vcpu->requests bit members | 59 | #define KVM_MIN_ALLOC_MMU_PAGES 64 |
66 | */ | 60 | #define KVM_NUM_MMU_PAGES 1024 |
67 | #define KVM_TLB_FLUSH 0 | 61 | #define KVM_MIN_FREE_MMU_PAGES 5 |
62 | #define KVM_REFILL_PAGES 25 | ||
63 | #define KVM_MAX_CPUID_ENTRIES 40 | ||
68 | 64 | ||
69 | /* | 65 | extern spinlock_t kvm_lock; |
70 | * Address types: | 66 | extern struct list_head vm_list; |
71 | * | 67 | |
72 | * gva - guest virtual address | 68 | struct kvm_vcpu; |
73 | * gpa - guest physical address | 69 | struct kvm; |
74 | * gfn - guest frame number | 70 | |
75 | * hva - host virtual address | 71 | enum { |
76 | * hpa - host physical address | 72 | VCPU_REGS_RAX = 0, |
77 | * hfn - host frame number | 73 | VCPU_REGS_RCX = 1, |
78 | */ | 74 | VCPU_REGS_RDX = 2, |
75 | VCPU_REGS_RBX = 3, | ||
76 | VCPU_REGS_RSP = 4, | ||
77 | VCPU_REGS_RBP = 5, | ||
78 | VCPU_REGS_RSI = 6, | ||
79 | VCPU_REGS_RDI = 7, | ||
80 | #ifdef CONFIG_X86_64 | ||
81 | VCPU_REGS_R8 = 8, | ||
82 | VCPU_REGS_R9 = 9, | ||
83 | VCPU_REGS_R10 = 10, | ||
84 | VCPU_REGS_R11 = 11, | ||
85 | VCPU_REGS_R12 = 12, | ||
86 | VCPU_REGS_R13 = 13, | ||
87 | VCPU_REGS_R14 = 14, | ||
88 | VCPU_REGS_R15 = 15, | ||
89 | #endif | ||
90 | NR_VCPU_REGS | ||
91 | }; | ||
92 | |||
93 | enum { | ||
94 | VCPU_SREG_CS, | ||
95 | VCPU_SREG_DS, | ||
96 | VCPU_SREG_ES, | ||
97 | VCPU_SREG_FS, | ||
98 | VCPU_SREG_GS, | ||
99 | VCPU_SREG_SS, | ||
100 | VCPU_SREG_TR, | ||
101 | VCPU_SREG_LDTR, | ||
102 | }; | ||
79 | 103 | ||
80 | typedef unsigned long gva_t; | 104 | #include <asm/kvm_x86_emulate.h> |
81 | typedef u64 gpa_t; | ||
82 | typedef unsigned long gfn_t; | ||
83 | 105 | ||
84 | typedef unsigned long hva_t; | 106 | #define KVM_NR_MEM_OBJS 40 |
85 | typedef u64 hpa_t; | 107 | |
86 | typedef unsigned long hfn_t; | 108 | /* |
109 | * We don't want allocation failures within the mmu code, so we preallocate | ||
110 | * enough memory for a single page fault in a cache. | ||
111 | */ | ||
112 | struct kvm_mmu_memory_cache { | ||
113 | int nobjs; | ||
114 | void *objects[KVM_NR_MEM_OBJS]; | ||
115 | }; | ||
87 | 116 | ||
88 | #define NR_PTE_CHAIN_ENTRIES 5 | 117 | #define NR_PTE_CHAIN_ENTRIES 5 |
89 | 118 | ||
@@ -99,7 +128,7 @@ struct kvm_pte_chain { | |||
99 | * bits 4:7 - page table level for this shadow (1-4) | 128 | * bits 4:7 - page table level for this shadow (1-4) |
100 | * bits 8:9 - page table quadrant for 2-level guests | 129 | * bits 8:9 - page table quadrant for 2-level guests |
101 | * bit 16 - "metaphysical" - gfn is not a real page (huge page/real mode) | 130 | * bit 16 - "metaphysical" - gfn is not a real page (huge page/real mode) |
102 | * bits 17:19 - "access" - the user, writable, and nx bits of a huge page pde | 131 | * bits 17:19 - common access permissions for all ptes in this shadow page |
103 | */ | 132 | */ |
104 | union kvm_mmu_page_role { | 133 | union kvm_mmu_page_role { |
105 | unsigned word; | 134 | unsigned word; |
@@ -109,7 +138,7 @@ union kvm_mmu_page_role { | |||
109 | unsigned quadrant : 2; | 138 | unsigned quadrant : 2; |
110 | unsigned pad_for_nice_hex_output : 6; | 139 | unsigned pad_for_nice_hex_output : 6; |
111 | unsigned metaphysical : 1; | 140 | unsigned metaphysical : 1; |
112 | unsigned hugepage_access : 3; | 141 | unsigned access : 3; |
113 | }; | 142 | }; |
114 | }; | 143 | }; |
115 | 144 | ||
@@ -125,6 +154,8 @@ struct kvm_mmu_page { | |||
125 | union kvm_mmu_page_role role; | 154 | union kvm_mmu_page_role role; |
126 | 155 | ||
127 | u64 *spt; | 156 | u64 *spt; |
157 | /* hold the gfn of each spte inside spt */ | ||
158 | gfn_t *gfns; | ||
128 | unsigned long slot_bitmap; /* One bit set per slot which has memory | 159 | unsigned long slot_bitmap; /* One bit set per slot which has memory |
129 | * in this shadow page. | 160 | * in this shadow page. |
130 | */ | 161 | */ |
@@ -136,9 +167,6 @@ struct kvm_mmu_page { | |||
136 | }; | 167 | }; |
137 | }; | 168 | }; |
138 | 169 | ||
139 | struct kvm_vcpu; | ||
140 | extern struct kmem_cache *kvm_vcpu_cache; | ||
141 | |||
142 | /* | 170 | /* |
143 | * x86 supports 3 paging modes (4-level 64-bit, 3-level 64-bit, and 2-level | 171 | * x86 supports 3 paging modes (4-level 64-bit, 3-level 64-bit, and 2-level |
144 | * 32-bit). The kvm_mmu structure abstracts the details of the current mmu | 172 | * 32-bit). The kvm_mmu structure abstracts the details of the current mmu |
@@ -149,6 +177,8 @@ struct kvm_mmu { | |||
149 | int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err); | 177 | int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err); |
150 | void (*free)(struct kvm_vcpu *vcpu); | 178 | void (*free)(struct kvm_vcpu *vcpu); |
151 | gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva); | 179 | gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva); |
180 | void (*prefetch_page)(struct kvm_vcpu *vcpu, | ||
181 | struct kvm_mmu_page *page); | ||
152 | hpa_t root_hpa; | 182 | hpa_t root_hpa; |
153 | int root_level; | 183 | int root_level; |
154 | int shadow_root_level; | 184 | int shadow_root_level; |
@@ -156,159 +186,9 @@ struct kvm_mmu { | |||
156 | u64 *pae_root; | 186 | u64 *pae_root; |
157 | }; | 187 | }; |
158 | 188 | ||
159 | #define KVM_NR_MEM_OBJS 20 | 189 | struct kvm_vcpu_arch { |
160 | |||
161 | struct kvm_mmu_memory_cache { | ||
162 | int nobjs; | ||
163 | void *objects[KVM_NR_MEM_OBJS]; | ||
164 | }; | ||
165 | |||
166 | /* | ||
167 | * We don't want allocation failures within the mmu code, so we preallocate | ||
168 | * enough memory for a single page fault in a cache. | ||
169 | */ | ||
170 | struct kvm_guest_debug { | ||
171 | int enabled; | ||
172 | unsigned long bp[4]; | ||
173 | int singlestep; | ||
174 | }; | ||
175 | |||
176 | enum { | ||
177 | VCPU_REGS_RAX = 0, | ||
178 | VCPU_REGS_RCX = 1, | ||
179 | VCPU_REGS_RDX = 2, | ||
180 | VCPU_REGS_RBX = 3, | ||
181 | VCPU_REGS_RSP = 4, | ||
182 | VCPU_REGS_RBP = 5, | ||
183 | VCPU_REGS_RSI = 6, | ||
184 | VCPU_REGS_RDI = 7, | ||
185 | #ifdef CONFIG_X86_64 | ||
186 | VCPU_REGS_R8 = 8, | ||
187 | VCPU_REGS_R9 = 9, | ||
188 | VCPU_REGS_R10 = 10, | ||
189 | VCPU_REGS_R11 = 11, | ||
190 | VCPU_REGS_R12 = 12, | ||
191 | VCPU_REGS_R13 = 13, | ||
192 | VCPU_REGS_R14 = 14, | ||
193 | VCPU_REGS_R15 = 15, | ||
194 | #endif | ||
195 | NR_VCPU_REGS | ||
196 | }; | ||
197 | |||
198 | enum { | ||
199 | VCPU_SREG_CS, | ||
200 | VCPU_SREG_DS, | ||
201 | VCPU_SREG_ES, | ||
202 | VCPU_SREG_FS, | ||
203 | VCPU_SREG_GS, | ||
204 | VCPU_SREG_SS, | ||
205 | VCPU_SREG_TR, | ||
206 | VCPU_SREG_LDTR, | ||
207 | }; | ||
208 | |||
209 | struct kvm_pio_request { | ||
210 | unsigned long count; | ||
211 | int cur_count; | ||
212 | struct page *guest_pages[2]; | ||
213 | unsigned guest_page_offset; | ||
214 | int in; | ||
215 | int port; | ||
216 | int size; | ||
217 | int string; | ||
218 | int down; | ||
219 | int rep; | ||
220 | }; | ||
221 | |||
222 | struct kvm_stat { | ||
223 | u32 pf_fixed; | ||
224 | u32 pf_guest; | ||
225 | u32 tlb_flush; | ||
226 | u32 invlpg; | ||
227 | |||
228 | u32 exits; | ||
229 | u32 io_exits; | ||
230 | u32 mmio_exits; | ||
231 | u32 signal_exits; | ||
232 | u32 irq_window_exits; | ||
233 | u32 halt_exits; | ||
234 | u32 halt_wakeup; | ||
235 | u32 request_irq_exits; | ||
236 | u32 irq_exits; | ||
237 | u32 light_exits; | ||
238 | u32 efer_reload; | ||
239 | }; | ||
240 | |||
241 | struct kvm_io_device { | ||
242 | void (*read)(struct kvm_io_device *this, | ||
243 | gpa_t addr, | ||
244 | int len, | ||
245 | void *val); | ||
246 | void (*write)(struct kvm_io_device *this, | ||
247 | gpa_t addr, | ||
248 | int len, | ||
249 | const void *val); | ||
250 | int (*in_range)(struct kvm_io_device *this, gpa_t addr); | ||
251 | void (*destructor)(struct kvm_io_device *this); | ||
252 | |||
253 | void *private; | ||
254 | }; | ||
255 | |||
256 | static inline void kvm_iodevice_read(struct kvm_io_device *dev, | ||
257 | gpa_t addr, | ||
258 | int len, | ||
259 | void *val) | ||
260 | { | ||
261 | dev->read(dev, addr, len, val); | ||
262 | } | ||
263 | |||
264 | static inline void kvm_iodevice_write(struct kvm_io_device *dev, | ||
265 | gpa_t addr, | ||
266 | int len, | ||
267 | const void *val) | ||
268 | { | ||
269 | dev->write(dev, addr, len, val); | ||
270 | } | ||
271 | |||
272 | static inline int kvm_iodevice_inrange(struct kvm_io_device *dev, gpa_t addr) | ||
273 | { | ||
274 | return dev->in_range(dev, addr); | ||
275 | } | ||
276 | |||
277 | static inline void kvm_iodevice_destructor(struct kvm_io_device *dev) | ||
278 | { | ||
279 | if (dev->destructor) | ||
280 | dev->destructor(dev); | ||
281 | } | ||
282 | |||
283 | /* | ||
284 | * It would be nice to use something smarter than a linear search, TBD... | ||
285 | * Thankfully we don't expect many devices to register (famous last words :),
286 | * so until then it will suffice. At least it's abstracted so we can change
287 | * it in one place.
288 | */ | ||
289 | struct kvm_io_bus { | ||
290 | int dev_count; | ||
291 | #define NR_IOBUS_DEVS 6 | ||
292 | struct kvm_io_device *devs[NR_IOBUS_DEVS]; | ||
293 | }; | ||
294 | |||
295 | void kvm_io_bus_init(struct kvm_io_bus *bus); | ||
296 | void kvm_io_bus_destroy(struct kvm_io_bus *bus); | ||
297 | struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus, gpa_t addr); | ||
298 | void kvm_io_bus_register_dev(struct kvm_io_bus *bus, | ||
299 | struct kvm_io_device *dev); | ||
300 | |||
301 | struct kvm_vcpu { | ||
302 | struct kvm *kvm; | ||
303 | struct preempt_notifier preempt_notifier; | ||
304 | int vcpu_id; | ||
305 | struct mutex mutex; | ||
306 | int cpu; | ||
307 | u64 host_tsc; | 190 | u64 host_tsc; |
308 | struct kvm_run *run; | ||
309 | int interrupt_window_open; | 191 | int interrupt_window_open; |
310 | int guest_mode; | ||
311 | unsigned long requests; | ||
312 | unsigned long irq_summary; /* bit vector: 1 per word in irq_pending */ | 192 | unsigned long irq_summary; /* bit vector: 1 per word in irq_pending */ |
313 | DECLARE_BITMAP(irq_pending, KVM_NR_INTERRUPTS); | 193 | DECLARE_BITMAP(irq_pending, KVM_NR_INTERRUPTS); |
314 | unsigned long regs[NR_VCPU_REGS]; /* for rsp: vcpu_load_rsp_rip() */ | 194 | unsigned long regs[NR_VCPU_REGS]; /* for rsp: vcpu_load_rsp_rip() */ |
@@ -317,9 +197,6 @@ struct kvm_vcpu { | |||
317 | unsigned long cr0; | 197 | unsigned long cr0; |
318 | unsigned long cr2; | 198 | unsigned long cr2; |
319 | unsigned long cr3; | 199 | unsigned long cr3; |
320 | gpa_t para_state_gpa; | ||
321 | struct page *para_state_page; | ||
322 | gpa_t hypercall_gpa; | ||
323 | unsigned long cr4; | 200 | unsigned long cr4; |
324 | unsigned long cr8; | 201 | unsigned long cr8; |
325 | u64 pdptrs[4]; /* pae */ | 202 | u64 pdptrs[4]; /* pae */ |
@@ -334,6 +211,7 @@ struct kvm_vcpu { | |||
334 | int mp_state; | 211 | int mp_state; |
335 | int sipi_vector; | 212 | int sipi_vector; |
336 | u64 ia32_misc_enable_msr; | 213 | u64 ia32_misc_enable_msr; |
214 | bool tpr_access_reporting; | ||
337 | 215 | ||
338 | struct kvm_mmu mmu; | 216 | struct kvm_mmu mmu; |
339 | 217 | ||
@@ -344,29 +222,26 @@ struct kvm_vcpu { | |||
344 | 222 | ||
345 | gfn_t last_pt_write_gfn; | 223 | gfn_t last_pt_write_gfn; |
346 | int last_pt_write_count; | 224 | int last_pt_write_count; |
225 | u64 *last_pte_updated; | ||
347 | 226 | ||
348 | struct kvm_guest_debug guest_debug; | 227 | struct { |
228 | gfn_t gfn; /* presumed gfn during guest pte update */ | ||
229 | struct page *page; /* page corresponding to that gfn */ | ||
230 | } update_pte; | ||
349 | 231 | ||
350 | struct i387_fxsave_struct host_fx_image; | 232 | struct i387_fxsave_struct host_fx_image; |
351 | struct i387_fxsave_struct guest_fx_image; | 233 | struct i387_fxsave_struct guest_fx_image; |
352 | int fpu_active; | 234 | |
353 | int guest_fpu_loaded; | ||
354 | |||
355 | int mmio_needed; | ||
356 | int mmio_read_completed; | ||
357 | int mmio_is_write; | ||
358 | int mmio_size; | ||
359 | unsigned char mmio_data[8]; | ||
360 | gpa_t mmio_phys_addr; | ||
361 | gva_t mmio_fault_cr2; | 235 | gva_t mmio_fault_cr2; |
362 | struct kvm_pio_request pio; | 236 | struct kvm_pio_request pio; |
363 | void *pio_data; | 237 | void *pio_data; |
364 | wait_queue_head_t wq; | ||
365 | 238 | ||
366 | int sigset_active; | 239 | struct kvm_queued_exception { |
367 | sigset_t sigset; | 240 | bool pending; |
368 | 241 | bool has_error_code; | |
369 | struct kvm_stat stat; | 242 | u8 nr; |
243 | u32 error_code; | ||
244 | } exception; | ||
370 | 245 | ||
371 | struct { | 246 | struct { |
372 | int active; | 247 | int active; |
@@ -381,7 +256,10 @@ struct kvm_vcpu { | |||
381 | int halt_request; /* real mode on Intel only */ | 256 | int halt_request; /* real mode on Intel only */ |
382 | 257 | ||
383 | int cpuid_nent; | 258 | int cpuid_nent; |
384 | struct kvm_cpuid_entry cpuid_entries[KVM_MAX_CPUID_ENTRIES]; | 259 | struct kvm_cpuid_entry2 cpuid_entries[KVM_MAX_CPUID_ENTRIES]; |
260 | /* emulate context */ | ||
261 | |||
262 | struct x86_emulate_ctxt emulate_ctxt; | ||
385 | }; | 263 | }; |
386 | 264 | ||
387 | struct kvm_mem_alias { | 265 | struct kvm_mem_alias { |
@@ -390,51 +268,58 @@ struct kvm_mem_alias { | |||
390 | gfn_t target_gfn; | 268 | gfn_t target_gfn; |
391 | }; | 269 | }; |
392 | 270 | ||
393 | struct kvm_memory_slot { | 271 | struct kvm_arch{ |
394 | gfn_t base_gfn; | ||
395 | unsigned long npages; | ||
396 | unsigned long flags; | ||
397 | struct page **phys_mem; | ||
398 | unsigned long *dirty_bitmap; | ||
399 | }; | ||
400 | |||
401 | struct kvm { | ||
402 | struct mutex lock; /* protects everything except vcpus */ | ||
403 | int naliases; | 272 | int naliases; |
404 | struct kvm_mem_alias aliases[KVM_ALIAS_SLOTS]; | 273 | struct kvm_mem_alias aliases[KVM_ALIAS_SLOTS]; |
405 | int nmemslots; | 274 | |
406 | struct kvm_memory_slot memslots[KVM_MEMORY_SLOTS]; | 275 | unsigned int n_free_mmu_pages; |
276 | unsigned int n_requested_mmu_pages; | ||
277 | unsigned int n_alloc_mmu_pages; | ||
278 | struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES]; | ||
407 | /* | 279 | /* |
408 | * Hash table of struct kvm_mmu_page. | 280 | * Hash table of struct kvm_mmu_page. |
409 | */ | 281 | */ |
410 | struct list_head active_mmu_pages; | 282 | struct list_head active_mmu_pages; |
411 | int n_free_mmu_pages; | ||
412 | struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES]; | ||
413 | struct kvm_vcpu *vcpus[KVM_MAX_VCPUS]; | ||
414 | unsigned long rmap_overflow; | ||
415 | struct list_head vm_list; | ||
416 | struct file *filp; | ||
417 | struct kvm_io_bus mmio_bus; | ||
418 | struct kvm_io_bus pio_bus; | ||
419 | struct kvm_pic *vpic; | 283 | struct kvm_pic *vpic; |
420 | struct kvm_ioapic *vioapic; | 284 | struct kvm_ioapic *vioapic; |
285 | |||
421 | int round_robin_prev_vcpu; | 286 | int round_robin_prev_vcpu; |
287 | unsigned int tss_addr; | ||
288 | struct page *apic_access_page; | ||
422 | }; | 289 | }; |
423 | 290 | ||
424 | static inline struct kvm_pic *pic_irqchip(struct kvm *kvm) | 291 | struct kvm_vm_stat { |
425 | { | 292 | u32 mmu_shadow_zapped; |
426 | return kvm->vpic; | 293 | u32 mmu_pte_write; |
427 | } | 294 | u32 mmu_pte_updated; |
295 | u32 mmu_pde_zapped; | ||
296 | u32 mmu_flooded; | ||
297 | u32 mmu_recycled; | ||
298 | u32 mmu_cache_miss; | ||
299 | u32 remote_tlb_flush; | ||
300 | }; | ||
428 | 301 | ||
429 | static inline struct kvm_ioapic *ioapic_irqchip(struct kvm *kvm) | 302 | struct kvm_vcpu_stat { |
430 | { | 303 | u32 pf_fixed; |
431 | return kvm->vioapic; | 304 | u32 pf_guest; |
432 | } | 305 | u32 tlb_flush; |
306 | u32 invlpg; | ||
433 | 307 | ||
434 | static inline int irqchip_in_kernel(struct kvm *kvm) | 308 | u32 exits; |
435 | { | 309 | u32 io_exits; |
436 | return pic_irqchip(kvm) != 0; | 310 | u32 mmio_exits; |
437 | } | 311 | u32 signal_exits; |
312 | u32 irq_window_exits; | ||
313 | u32 halt_exits; | ||
314 | u32 halt_wakeup; | ||
315 | u32 request_irq_exits; | ||
316 | u32 irq_exits; | ||
317 | u32 host_state_reload; | ||
318 | u32 efer_reload; | ||
319 | u32 fpu_reload; | ||
320 | u32 insn_emulation; | ||
321 | u32 insn_emulation_fail; | ||
322 | }; | ||
438 | 323 | ||
439 | struct descriptor_table { | 324 | struct descriptor_table { |
440 | u16 limit; | 325 | u16 limit; |
@@ -449,11 +334,12 @@ struct kvm_x86_ops { | |||
449 | void (*check_processor_compatibility)(void *rtn); | 334 | void (*check_processor_compatibility)(void *rtn); |
450 | int (*hardware_setup)(void); /* __init */ | 335 | int (*hardware_setup)(void); /* __init */ |
451 | void (*hardware_unsetup)(void); /* __exit */ | 336 | void (*hardware_unsetup)(void); /* __exit */ |
337 | bool (*cpu_has_accelerated_tpr)(void); | ||
452 | 338 | ||
453 | /* Create, but do not attach this VCPU */ | 339 | /* Create, but do not attach this VCPU */ |
454 | struct kvm_vcpu *(*vcpu_create)(struct kvm *kvm, unsigned id); | 340 | struct kvm_vcpu *(*vcpu_create)(struct kvm *kvm, unsigned id); |
455 | void (*vcpu_free)(struct kvm_vcpu *vcpu); | 341 | void (*vcpu_free)(struct kvm_vcpu *vcpu); |
456 | void (*vcpu_reset)(struct kvm_vcpu *vcpu); | 342 | int (*vcpu_reset)(struct kvm_vcpu *vcpu); |
457 | 343 | ||
458 | void (*prepare_guest_switch)(struct kvm_vcpu *vcpu); | 344 | void (*prepare_guest_switch)(struct kvm_vcpu *vcpu); |
459 | void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu); | 345 | void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu); |
@@ -489,10 +375,6 @@ struct kvm_x86_ops { | |||
489 | void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags); | 375 | void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags); |
490 | 376 | ||
491 | void (*tlb_flush)(struct kvm_vcpu *vcpu); | 377 | void (*tlb_flush)(struct kvm_vcpu *vcpu); |
492 | void (*inject_page_fault)(struct kvm_vcpu *vcpu, | ||
493 | unsigned long addr, u32 err_code); | ||
494 | |||
495 | void (*inject_gp)(struct kvm_vcpu *vcpu, unsigned err_code); | ||
496 | 378 | ||
497 | void (*run)(struct kvm_vcpu *vcpu, struct kvm_run *run); | 379 | void (*run)(struct kvm_vcpu *vcpu, struct kvm_run *run); |
498 | int (*handle_exit)(struct kvm_run *run, struct kvm_vcpu *vcpu); | 380 | int (*handle_exit)(struct kvm_run *run, struct kvm_vcpu *vcpu); |
@@ -501,54 +383,31 @@ struct kvm_x86_ops { | |||
501 | unsigned char *hypercall_addr); | 383 | unsigned char *hypercall_addr); |
502 | int (*get_irq)(struct kvm_vcpu *vcpu); | 384 | int (*get_irq)(struct kvm_vcpu *vcpu); |
503 | void (*set_irq)(struct kvm_vcpu *vcpu, int vec); | 385 | void (*set_irq)(struct kvm_vcpu *vcpu, int vec); |
386 | void (*queue_exception)(struct kvm_vcpu *vcpu, unsigned nr, | ||
387 | bool has_error_code, u32 error_code); | ||
388 | bool (*exception_injected)(struct kvm_vcpu *vcpu); | ||
504 | void (*inject_pending_irq)(struct kvm_vcpu *vcpu); | 389 | void (*inject_pending_irq)(struct kvm_vcpu *vcpu); |
505 | void (*inject_pending_vectors)(struct kvm_vcpu *vcpu, | 390 | void (*inject_pending_vectors)(struct kvm_vcpu *vcpu, |
506 | struct kvm_run *run); | 391 | struct kvm_run *run); |
392 | |||
393 | int (*set_tss_addr)(struct kvm *kvm, unsigned int addr); | ||
507 | }; | 394 | }; |
508 | 395 | ||
509 | extern struct kvm_x86_ops *kvm_x86_ops; | 396 | extern struct kvm_x86_ops *kvm_x86_ops; |
510 | 397 | ||
511 | /* The guest did something we don't support. */ | ||
512 | #define pr_unimpl(vcpu, fmt, ...) \ | ||
513 | do { \ | ||
514 | if (printk_ratelimit()) \ | ||
515 | printk(KERN_ERR "kvm: %i: cpu%i " fmt, \ | ||
516 | current->tgid, (vcpu)->vcpu_id , ## __VA_ARGS__); \ | ||
517 | } while(0) | ||
518 | |||
519 | #define kvm_printf(kvm, fmt ...) printk(KERN_DEBUG fmt) | ||
520 | #define vcpu_printf(vcpu, fmt...) kvm_printf(vcpu->kvm, fmt) | ||
521 | |||
522 | int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id); | ||
523 | void kvm_vcpu_uninit(struct kvm_vcpu *vcpu); | ||
524 | |||
525 | int kvm_init_x86(struct kvm_x86_ops *ops, unsigned int vcpu_size, | ||
526 | struct module *module); | ||
527 | void kvm_exit_x86(void); | ||
528 | |||
529 | int kvm_mmu_module_init(void); | 398 | int kvm_mmu_module_init(void); |
530 | void kvm_mmu_module_exit(void); | 399 | void kvm_mmu_module_exit(void); |
531 | 400 | ||
532 | void kvm_mmu_destroy(struct kvm_vcpu *vcpu); | 401 | void kvm_mmu_destroy(struct kvm_vcpu *vcpu); |
533 | int kvm_mmu_create(struct kvm_vcpu *vcpu); | 402 | int kvm_mmu_create(struct kvm_vcpu *vcpu); |
534 | int kvm_mmu_setup(struct kvm_vcpu *vcpu); | 403 | int kvm_mmu_setup(struct kvm_vcpu *vcpu); |
404 | void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte); | ||
535 | 405 | ||
536 | int kvm_mmu_reset_context(struct kvm_vcpu *vcpu); | 406 | int kvm_mmu_reset_context(struct kvm_vcpu *vcpu); |
537 | void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot); | 407 | void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot); |
538 | void kvm_mmu_zap_all(struct kvm *kvm); | 408 | void kvm_mmu_zap_all(struct kvm *kvm); |
539 | 409 | unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm); | |
540 | hpa_t gpa_to_hpa(struct kvm_vcpu *vcpu, gpa_t gpa); | 410 | void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages); |
541 | #define HPA_MSB ((sizeof(hpa_t) * 8) - 1) | ||
542 | #define HPA_ERR_MASK ((hpa_t)1 << HPA_MSB) | ||
543 | static inline int is_error_hpa(hpa_t hpa) { return hpa >> HPA_MSB; } | ||
544 | hpa_t gva_to_hpa(struct kvm_vcpu *vcpu, gva_t gva); | ||
545 | struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva); | ||
546 | |||
547 | extern hpa_t bad_page_address; | ||
548 | |||
549 | struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn); | ||
550 | struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn); | ||
551 | void mark_page_dirty(struct kvm *kvm, gfn_t gfn); | ||
552 | 411 | ||
553 | enum emulation_result { | 412 | enum emulation_result { |
554 | EMULATE_DONE, /* no further processing */ | 413 | EMULATE_DONE, /* no further processing */ |
@@ -556,8 +415,10 @@ enum emulation_result { | |||
556 | EMULATE_FAIL, /* can't emulate this instruction */ | 415 | EMULATE_FAIL, /* can't emulate this instruction */ |
557 | }; | 416 | }; |
558 | 417 | ||
418 | #define EMULTYPE_NO_DECODE (1 << 0) | ||
419 | #define EMULTYPE_TRAP_UD (1 << 1) | ||
559 | int emulate_instruction(struct kvm_vcpu *vcpu, struct kvm_run *run, | 420 | int emulate_instruction(struct kvm_vcpu *vcpu, struct kvm_run *run, |
560 | unsigned long cr2, u16 error_code); | 421 | unsigned long cr2, u16 error_code, int emulation_type); |
561 | void kvm_report_emulation_failure(struct kvm_vcpu *cvpu, const char *context); | 422 | void kvm_report_emulation_failure(struct kvm_vcpu *cvpu, const char *context); |
562 | void realmode_lgdt(struct kvm_vcpu *vcpu, u16 size, unsigned long address); | 423 | void realmode_lgdt(struct kvm_vcpu *vcpu, u16 size, unsigned long address); |
563 | void realmode_lidt(struct kvm_vcpu *vcpu, u16 size, unsigned long address); | 424 | void realmode_lidt(struct kvm_vcpu *vcpu, u16 size, unsigned long address); |
@@ -572,7 +433,7 @@ int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data); | |||
572 | 433 | ||
573 | struct x86_emulate_ctxt; | 434 | struct x86_emulate_ctxt; |
574 | 435 | ||
575 | int kvm_emulate_pio (struct kvm_vcpu *vcpu, struct kvm_run *run, int in, | 436 | int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, |
576 | int size, unsigned port); | 437 | int size, unsigned port); |
577 | int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, | 438 | int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, |
578 | int size, unsigned long count, int down, | 439 | int size, unsigned long count, int down, |
@@ -581,7 +442,7 @@ void kvm_emulate_cpuid(struct kvm_vcpu *vcpu); | |||
581 | int kvm_emulate_halt(struct kvm_vcpu *vcpu); | 442 | int kvm_emulate_halt(struct kvm_vcpu *vcpu); |
582 | int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address); | 443 | int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address); |
583 | int emulate_clts(struct kvm_vcpu *vcpu); | 444 | int emulate_clts(struct kvm_vcpu *vcpu); |
584 | int emulator_get_dr(struct x86_emulate_ctxt* ctxt, int dr, | 445 | int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, |
585 | unsigned long *dest); | 446 | unsigned long *dest); |
586 | int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, | 447 | int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, |
587 | unsigned long value); | 448 | unsigned long value); |
@@ -597,15 +458,15 @@ void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l); | |||
597 | int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata); | 458 | int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata); |
598 | int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data); | 459 | int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data); |
599 | 460 | ||
600 | void fx_init(struct kvm_vcpu *vcpu); | 461 | void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr); |
462 | void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code); | ||
463 | void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long cr2, | ||
464 | u32 error_code); | ||
601 | 465 | ||
602 | void kvm_resched(struct kvm_vcpu *vcpu); | 466 | void fx_init(struct kvm_vcpu *vcpu); |
603 | void kvm_load_guest_fpu(struct kvm_vcpu *vcpu); | ||
604 | void kvm_put_guest_fpu(struct kvm_vcpu *vcpu); | ||
605 | void kvm_flush_remote_tlbs(struct kvm *kvm); | ||
606 | 467 | ||
607 | int emulator_read_std(unsigned long addr, | 468 | int emulator_read_std(unsigned long addr, |
608 | void *val, | 469 | void *val, |
609 | unsigned int bytes, | 470 | unsigned int bytes, |
610 | struct kvm_vcpu *vcpu); | 471 | struct kvm_vcpu *vcpu); |
611 | int emulator_write_emulated(unsigned long addr, | 472 | int emulator_write_emulated(unsigned long addr, |
@@ -615,6 +476,7 @@ int emulator_write_emulated(unsigned long addr, | |||
615 | 476 | ||
616 | unsigned long segment_base(u16 selector); | 477 | unsigned long segment_base(u16 selector); |
617 | 478 | ||
479 | void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu); | ||
618 | void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, | 480 | void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, |
619 | const u8 *new, int bytes); | 481 | const u8 *new, int bytes); |
620 | int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva); | 482 | int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva); |
@@ -622,66 +484,14 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu); | |||
622 | int kvm_mmu_load(struct kvm_vcpu *vcpu); | 484 | int kvm_mmu_load(struct kvm_vcpu *vcpu); |
623 | void kvm_mmu_unload(struct kvm_vcpu *vcpu); | 485 | void kvm_mmu_unload(struct kvm_vcpu *vcpu); |
624 | 486 | ||
625 | int kvm_hypercall(struct kvm_vcpu *vcpu, struct kvm_run *run); | 487 | int kvm_emulate_hypercall(struct kvm_vcpu *vcpu); |
626 | 488 | ||
627 | static inline void kvm_guest_enter(void) | 489 | int kvm_fix_hypercall(struct kvm_vcpu *vcpu); |
628 | { | ||
629 | current->flags |= PF_VCPU; | ||
630 | } | ||
631 | 490 | ||
632 | static inline void kvm_guest_exit(void) | 491 | int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code); |
633 | { | ||
634 | current->flags &= ~PF_VCPU; | ||
635 | } | ||
636 | 492 | ||
637 | static inline int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, | 493 | int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3); |
638 | u32 error_code) | 494 | int complete_pio(struct kvm_vcpu *vcpu); |
639 | { | ||
640 | return vcpu->mmu.page_fault(vcpu, gva, error_code); | ||
641 | } | ||
642 | |||
643 | static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) | ||
644 | { | ||
645 | if (unlikely(vcpu->kvm->n_free_mmu_pages < KVM_MIN_FREE_MMU_PAGES)) | ||
646 | __kvm_mmu_free_some_pages(vcpu); | ||
647 | } | ||
648 | |||
649 | static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu) | ||
650 | { | ||
651 | if (likely(vcpu->mmu.root_hpa != INVALID_PAGE)) | ||
652 | return 0; | ||
653 | |||
654 | return kvm_mmu_load(vcpu); | ||
655 | } | ||
656 | |||
657 | static inline int is_long_mode(struct kvm_vcpu *vcpu) | ||
658 | { | ||
659 | #ifdef CONFIG_X86_64 | ||
660 | return vcpu->shadow_efer & EFER_LME; | ||
661 | #else | ||
662 | return 0; | ||
663 | #endif | ||
664 | } | ||
665 | |||
666 | static inline int is_pae(struct kvm_vcpu *vcpu) | ||
667 | { | ||
668 | return vcpu->cr4 & X86_CR4_PAE; | ||
669 | } | ||
670 | |||
671 | static inline int is_pse(struct kvm_vcpu *vcpu) | ||
672 | { | ||
673 | return vcpu->cr4 & X86_CR4_PSE; | ||
674 | } | ||
675 | |||
676 | static inline int is_paging(struct kvm_vcpu *vcpu) | ||
677 | { | ||
678 | return vcpu->cr0 & X86_CR0_PG; | ||
679 | } | ||
680 | |||
681 | static inline int memslot_id(struct kvm *kvm, struct kvm_memory_slot *slot) | ||
682 | { | ||
683 | return slot - kvm->memslots; | ||
684 | } | ||
685 | 495 | ||
686 | static inline struct kvm_mmu_page *page_header(hpa_t shadow_page) | 496 | static inline struct kvm_mmu_page *page_header(hpa_t shadow_page) |
687 | { | 497 | { |
@@ -693,55 +503,55 @@ static inline struct kvm_mmu_page *page_header(hpa_t shadow_page) | |||
693 | static inline u16 read_fs(void) | 503 | static inline u16 read_fs(void) |
694 | { | 504 | { |
695 | u16 seg; | 505 | u16 seg; |
696 | asm ("mov %%fs, %0" : "=g"(seg)); | 506 | asm("mov %%fs, %0" : "=g"(seg)); |
697 | return seg; | 507 | return seg; |
698 | } | 508 | } |
699 | 509 | ||
700 | static inline u16 read_gs(void) | 510 | static inline u16 read_gs(void) |
701 | { | 511 | { |
702 | u16 seg; | 512 | u16 seg; |
703 | asm ("mov %%gs, %0" : "=g"(seg)); | 513 | asm("mov %%gs, %0" : "=g"(seg)); |
704 | return seg; | 514 | return seg; |
705 | } | 515 | } |
706 | 516 | ||
707 | static inline u16 read_ldt(void) | 517 | static inline u16 read_ldt(void) |
708 | { | 518 | { |
709 | u16 ldt; | 519 | u16 ldt; |
710 | asm ("sldt %0" : "=g"(ldt)); | 520 | asm("sldt %0" : "=g"(ldt)); |
711 | return ldt; | 521 | return ldt; |
712 | } | 522 | } |
713 | 523 | ||
714 | static inline void load_fs(u16 sel) | 524 | static inline void load_fs(u16 sel) |
715 | { | 525 | { |
716 | asm ("mov %0, %%fs" : : "rm"(sel)); | 526 | asm("mov %0, %%fs" : : "rm"(sel)); |
717 | } | 527 | } |
718 | 528 | ||
719 | static inline void load_gs(u16 sel) | 529 | static inline void load_gs(u16 sel) |
720 | { | 530 | { |
721 | asm ("mov %0, %%gs" : : "rm"(sel)); | 531 | asm("mov %0, %%gs" : : "rm"(sel)); |
722 | } | 532 | } |
723 | 533 | ||
724 | #ifndef load_ldt | 534 | #ifndef load_ldt |
725 | static inline void load_ldt(u16 sel) | 535 | static inline void load_ldt(u16 sel) |
726 | { | 536 | { |
727 | asm ("lldt %0" : : "rm"(sel)); | 537 | asm("lldt %0" : : "rm"(sel)); |
728 | } | 538 | } |
729 | #endif | 539 | #endif |
730 | 540 | ||
731 | static inline void get_idt(struct descriptor_table *table) | 541 | static inline void get_idt(struct descriptor_table *table) |
732 | { | 542 | { |
733 | asm ("sidt %0" : "=m"(*table)); | 543 | asm("sidt %0" : "=m"(*table)); |
734 | } | 544 | } |
735 | 545 | ||
736 | static inline void get_gdt(struct descriptor_table *table) | 546 | static inline void get_gdt(struct descriptor_table *table) |
737 | { | 547 | { |
738 | asm ("sgdt %0" : "=m"(*table)); | 548 | asm("sgdt %0" : "=m"(*table)); |
739 | } | 549 | } |
740 | 550 | ||
741 | static inline unsigned long read_tr_base(void) | 551 | static inline unsigned long read_tr_base(void) |
742 | { | 552 | { |
743 | u16 tr; | 553 | u16 tr; |
744 | asm ("str %0" : "=g"(tr)); | 554 | asm("str %0" : "=g"(tr)); |
745 | return segment_base(tr); | 555 | return segment_base(tr); |
746 | } | 556 | } |
747 | 557 | ||
@@ -757,17 +567,17 @@ static inline unsigned long read_msr(unsigned long msr) | |||
757 | 567 | ||
758 | static inline void fx_save(struct i387_fxsave_struct *image) | 568 | static inline void fx_save(struct i387_fxsave_struct *image) |
759 | { | 569 | { |
760 | asm ("fxsave (%0)":: "r" (image)); | 570 | asm("fxsave (%0)":: "r" (image)); |
761 | } | 571 | } |
762 | 572 | ||
763 | static inline void fx_restore(struct i387_fxsave_struct *image) | 573 | static inline void fx_restore(struct i387_fxsave_struct *image) |
764 | { | 574 | { |
765 | asm ("fxrstor (%0)":: "r" (image)); | 575 | asm("fxrstor (%0)":: "r" (image)); |
766 | } | 576 | } |
767 | 577 | ||
768 | static inline void fpu_init(void) | 578 | static inline void fpu_init(void) |
769 | { | 579 | { |
770 | asm ("finit"); | 580 | asm("finit"); |
771 | } | 581 | } |
772 | 582 | ||
773 | static inline u32 get_rdx_init_val(void) | 583 | static inline u32 get_rdx_init_val(void) |
@@ -775,6 +585,11 @@ static inline u32 get_rdx_init_val(void) | |||
775 | return 0x600; /* P6 family */ | 585 | return 0x600; /* P6 family */ |
776 | } | 586 | } |
777 | 587 | ||
588 | static inline void kvm_inject_gp(struct kvm_vcpu *vcpu, u32 error_code) | ||
589 | { | ||
590 | kvm_queue_exception_e(vcpu, GP_VECTOR, error_code); | ||
591 | } | ||
592 | |||
778 | #define ASM_VMX_VMCLEAR_RAX ".byte 0x66, 0x0f, 0xc7, 0x30" | 593 | #define ASM_VMX_VMCLEAR_RAX ".byte 0x66, 0x0f, 0xc7, 0x30" |
779 | #define ASM_VMX_VMLAUNCH ".byte 0x0f, 0x01, 0xc2" | 594 | #define ASM_VMX_VMLAUNCH ".byte 0x0f, 0x01, 0xc2" |
780 | #define ASM_VMX_VMRESUME ".byte 0x0f, 0x01, 0xc3" | 595 | #define ASM_VMX_VMRESUME ".byte 0x0f, 0x01, 0xc3" |
diff --git a/include/asm-x86/kvm_para.h b/include/asm-x86/kvm_para.h new file mode 100644 index 000000000000..c6f3fd8d8c53 --- /dev/null +++ b/include/asm-x86/kvm_para.h | |||
@@ -0,0 +1,105 @@ | |||
1 | #ifndef __X86_KVM_PARA_H | ||
2 | #define __X86_KVM_PARA_H | ||
3 | |||
4 | /* This CPUID returns the signature 'KVMKVMKVM' in ebx, ecx, and edx. It | ||
5 | * should be used to determine that a VM is running under KVM. | ||
6 | */ | ||
7 | #define KVM_CPUID_SIGNATURE 0x40000000 | ||
8 | |||
9 | /* This CPUID returns a feature bitmap in eax. Before enabling a particular | ||
10 | * paravirtualization, the appropriate feature bit should be checked. | ||
11 | */ | ||
12 | #define KVM_CPUID_FEATURES 0x40000001 | ||
13 | |||
14 | #ifdef __KERNEL__ | ||
15 | #include <asm/processor.h> | ||
16 | |||
17 | /* This instruction is vmcall. On non-VT architectures, it will generate a | ||
18 | * trap that we will then rewrite to the appropriate instruction. | ||
19 | */ | ||
20 | #define KVM_HYPERCALL ".byte 0x0f,0x01,0xc1" | ||
21 | |||
22 | /* For KVM hypercalls, a three-byte sequence of either the vmcall or the vmmcall
23 | * instruction. The hypervisor may replace it with something else but only these
24 | * instructions are guaranteed to be supported.
25 | * | ||
26 | * Up to four arguments may be passed in rbx, rcx, rdx, and rsi respectively. | ||
27 | * The hypercall number should be placed in rax and the return value will be | ||
28 | * placed in rax. No other registers will be clobbered unless explicitly
29 | * noted by the particular hypercall. | ||
30 | */ | ||
31 | |||
32 | static inline long kvm_hypercall0(unsigned int nr) | ||
33 | { | ||
34 | long ret; | ||
35 | asm volatile(KVM_HYPERCALL | ||
36 | : "=a"(ret) | ||
37 | : "a"(nr)); | ||
38 | return ret; | ||
39 | } | ||
40 | |||
41 | static inline long kvm_hypercall1(unsigned int nr, unsigned long p1) | ||
42 | { | ||
43 | long ret; | ||
44 | asm volatile(KVM_HYPERCALL | ||
45 | : "=a"(ret) | ||
46 | : "a"(nr), "b"(p1)); | ||
47 | return ret; | ||
48 | } | ||
49 | |||
50 | static inline long kvm_hypercall2(unsigned int nr, unsigned long p1, | ||
51 | unsigned long p2) | ||
52 | { | ||
53 | long ret; | ||
54 | asm volatile(KVM_HYPERCALL | ||
55 | : "=a"(ret) | ||
56 | : "a"(nr), "b"(p1), "c"(p2)); | ||
57 | return ret; | ||
58 | } | ||
59 | |||
60 | static inline long kvm_hypercall3(unsigned int nr, unsigned long p1, | ||
61 | unsigned long p2, unsigned long p3) | ||
62 | { | ||
63 | long ret; | ||
64 | asm volatile(KVM_HYPERCALL | ||
65 | : "=a"(ret) | ||
66 | : "a"(nr), "b"(p1), "c"(p2), "d"(p3)); | ||
67 | return ret; | ||
68 | } | ||
69 | |||
70 | static inline long kvm_hypercall4(unsigned int nr, unsigned long p1, | ||
71 | unsigned long p2, unsigned long p3, | ||
72 | unsigned long p4) | ||
73 | { | ||
74 | long ret; | ||
75 | asm volatile(KVM_HYPERCALL | ||
76 | : "=a"(ret) | ||
77 | : "a"(nr), "b"(p1), "c"(p2), "d"(p3), "S"(p4)); | ||
78 | return ret; | ||
79 | } | ||
80 | |||
81 | static inline int kvm_para_available(void) | ||
82 | { | ||
83 | unsigned int eax, ebx, ecx, edx; | ||
84 | char signature[13]; | ||
85 | |||
86 | cpuid(KVM_CPUID_SIGNATURE, &eax, &ebx, &ecx, &edx); | ||
87 | memcpy(signature + 0, &ebx, 4); | ||
88 | memcpy(signature + 4, &ecx, 4); | ||
89 | memcpy(signature + 8, &edx, 4); | ||
90 | signature[12] = 0; | ||
91 | |||
92 | if (strcmp(signature, "KVMKVMKVM") == 0) | ||
93 | return 1; | ||
94 | |||
95 | return 0; | ||
96 | } | ||
97 | |||
98 | static inline unsigned int kvm_arch_para_features(void) | ||
99 | { | ||
100 | return cpuid_eax(KVM_CPUID_FEATURES); | ||
101 | } | ||
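A hypothetical guest-side caller would gate any paravirt path on both checks; KVM_FEATURE_EXAMPLE, EXAMPLE_HC_NR, and arg below are placeholders, not constants defined by this patch:

	if (kvm_para_available() &&
	    (kvm_arch_para_features() & (1 << KVM_FEATURE_EXAMPLE)))
		/* only now is the hypercall guaranteed to be understood */
		kvm_hypercall1(EXAMPLE_HC_NR, arg);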
102 | |||
103 | #endif | ||
104 | |||
105 | #endif | ||
diff --git a/drivers/kvm/x86_emulate.h b/include/asm-x86/kvm_x86_emulate.h index 92c73aa7f9ac..7db91b9bdcd4 100644 --- a/drivers/kvm/x86_emulate.h +++ b/include/asm-x86/kvm_x86_emulate.h | |||
@@ -63,17 +63,6 @@ struct x86_emulate_ops { | |||
63 | unsigned int bytes, struct kvm_vcpu *vcpu); | 63 | unsigned int bytes, struct kvm_vcpu *vcpu); |
64 | 64 | ||
65 | /* | 65 | /* |
66 | * write_std: Write bytes of standard (non-emulated/special) memory. | ||
67 | * Used for stack operations, and others. | ||
68 | * @addr: [IN ] Linear address to which to write. | ||
69 | * @val: [IN ] Value to write to memory (low-order bytes used as | ||
70 | * required). | ||
71 | * @bytes: [IN ] Number of bytes to write to memory. | ||
72 | */ | ||
73 | int (*write_std)(unsigned long addr, const void *val, | ||
74 | unsigned int bytes, struct kvm_vcpu *vcpu); | ||
75 | |||
76 | /* | ||
77 | * read_emulated: Read bytes from emulated/special memory area. | 66 | * read_emulated: Read bytes from emulated/special memory area. |
78 | * @addr: [IN ] Linear address from which to read. | 67 | * @addr: [IN ] Linear address from which to read. |
79 | * @val: [OUT] Value read from memory, zero-extended to 'u_long'. | 68 | * @val: [OUT] Value read from memory, zero-extended to 'u_long'. |
@@ -112,13 +101,50 @@ struct x86_emulate_ops { | |||
112 | 101 | ||
113 | }; | 102 | }; |
114 | 103 | ||
104 | /* Type, address-of, and value of an instruction's operand. */ | ||
105 | struct operand { | ||
106 | enum { OP_REG, OP_MEM, OP_IMM, OP_NONE } type; | ||
107 | unsigned int bytes; | ||
108 | unsigned long val, orig_val, *ptr; | ||
109 | }; | ||
110 | |||
111 | struct fetch_cache { | ||
112 | u8 data[15]; | ||
113 | unsigned long start; | ||
114 | unsigned long end; | ||
115 | }; | ||
116 | |||
117 | struct decode_cache { | ||
118 | u8 twobyte; | ||
119 | u8 b; | ||
120 | u8 lock_prefix; | ||
121 | u8 rep_prefix; | ||
122 | u8 op_bytes; | ||
123 | u8 ad_bytes; | ||
124 | u8 rex_prefix; | ||
125 | struct operand src; | ||
126 | struct operand dst; | ||
127 | unsigned long *override_base; | ||
128 | unsigned int d; | ||
129 | unsigned long regs[NR_VCPU_REGS]; | ||
130 | unsigned long eip; | ||
131 | /* modrm */ | ||
132 | u8 modrm; | ||
133 | u8 modrm_mod; | ||
134 | u8 modrm_reg; | ||
135 | u8 modrm_rm; | ||
136 | u8 use_modrm_ea; | ||
137 | unsigned long modrm_ea; | ||
138 | unsigned long modrm_val; | ||
139 | struct fetch_cache fetch; | ||
140 | }; | ||
141 | |||
115 | struct x86_emulate_ctxt { | 142 | struct x86_emulate_ctxt { |
116 | /* Register state before/after emulation. */ | 143 | /* Register state before/after emulation. */ |
117 | struct kvm_vcpu *vcpu; | 144 | struct kvm_vcpu *vcpu; |
118 | 145 | ||
119 | /* Linear faulting address (if emulating a page-faulting instruction). */ | 146 | /* Linear faulting address (if emulating a page-faulting instruction). */ |
120 | unsigned long eflags; | 147 | unsigned long eflags; |
121 | unsigned long cr2; | ||
122 | 148 | ||
123 | /* Emulated execution mode, represented by an X86EMUL_MODE value. */ | 149 | /* Emulated execution mode, represented by an X86EMUL_MODE value. */ |
124 | int mode; | 150 | int mode; |
@@ -129,8 +155,16 @@ struct x86_emulate_ctxt { | |||
129 | unsigned long ss_base; | 155 | unsigned long ss_base; |
130 | unsigned long gs_base; | 156 | unsigned long gs_base; |
131 | unsigned long fs_base; | 157 | unsigned long fs_base; |
158 | |||
159 | /* decode cache */ | ||
160 | |||
161 | struct decode_cache decode; | ||
132 | }; | 162 | }; |
133 | 163 | ||
164 | /* Repeat String Operation Prefix */ | ||
165 | #define REPE_PREFIX 1 | ||
166 | #define REPNE_PREFIX 2 | ||
167 | |||
134 | /* Execution mode, passed to the emulator. */ | 168 | /* Execution mode, passed to the emulator. */ |
135 | #define X86EMUL_MODE_REAL 0 /* Real mode. */ | 169 | #define X86EMUL_MODE_REAL 0 /* Real mode. */ |
136 | #define X86EMUL_MODE_PROT16 2 /* 16-bit protected mode. */ | 170 | #define X86EMUL_MODE_PROT16 2 /* 16-bit protected mode. */ |
@@ -144,12 +178,9 @@ struct x86_emulate_ctxt { | |||
144 | #define X86EMUL_MODE_HOST X86EMUL_MODE_PROT64 | 178 | #define X86EMUL_MODE_HOST X86EMUL_MODE_PROT64 |
145 | #endif | 179 | #endif |
146 | 180 | ||
147 | /* | 181 | int x86_decode_insn(struct x86_emulate_ctxt *ctxt, |
148 | * x86_emulate_memop: Emulate an instruction that faulted attempting to | 182 | struct x86_emulate_ops *ops); |
149 | * read/write a 'special' memory area. | 183 | int x86_emulate_insn(struct x86_emulate_ctxt *ctxt, |
150 | * Returns -1 on failure, 0 on success. | 184 | struct x86_emulate_ops *ops); |
151 | */ | ||
152 | int x86_emulate_memop(struct x86_emulate_ctxt *ctxt, | ||
153 | struct x86_emulate_ops *ops); | ||
154 | 185 | ||
155 | #endif /* __X86_EMULATE_H__ */ | 186 | #endif /* __X86_EMULATE_H__ */ |
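The old single-shot x86_emulate_memop() gives way to a decode/execute split. A hedged sketch of how a caller is expected to drive the pair, assuming an x86_emulate_ctxt ctxt and x86_emulate_ops ops already populated (emulate_instruction() in x86.c is the in-tree caller per this series):

	int rc;

	rc = x86_decode_insn(&ctxt, &ops);	/* fill ctxt.decode from the guest rip */
	if (rc == 0)
		rc = x86_emulate_insn(&ctxt, &ops);	/* execute the cached decode */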
diff --git a/include/linux/Kbuild b/include/linux/Kbuild index 27b9350052b4..85b2482cc736 100644 --- a/include/linux/Kbuild +++ b/include/linux/Kbuild | |||
@@ -100,7 +100,6 @@ header-y += iso_fs.h | |||
100 | header-y += ixjuser.h | 100 | header-y += ixjuser.h |
101 | header-y += jffs2.h | 101 | header-y += jffs2.h |
102 | header-y += keyctl.h | 102 | header-y += keyctl.h |
103 | header-y += kvm.h | ||
104 | header-y += limits.h | 103 | header-y += limits.h |
105 | header-y += lock_dlm_plock.h | 104 | header-y += lock_dlm_plock.h |
106 | header-y += magic.h | 105 | header-y += magic.h |
@@ -256,6 +255,7 @@ unifdef-y += kd.h | |||
256 | unifdef-y += kernelcapi.h | 255 | unifdef-y += kernelcapi.h |
257 | unifdef-y += kernel.h | 256 | unifdef-y += kernel.h |
258 | unifdef-y += keyboard.h | 257 | unifdef-y += keyboard.h |
258 | unifdef-$(CONFIG_HAVE_KVM) += kvm.h | ||
259 | unifdef-y += llc.h | 259 | unifdef-y += llc.h |
260 | unifdef-y += loop.h | 260 | unifdef-y += loop.h |
261 | unifdef-y += lp.h | 261 | unifdef-y += lp.h |
diff --git a/include/linux/kvm.h b/include/linux/kvm.h index 057a7f34ee36..4de4fd2d8607 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h | |||
@@ -9,12 +9,10 @@ | |||
9 | 9 | ||
10 | #include <asm/types.h> | 10 | #include <asm/types.h> |
11 | #include <linux/ioctl.h> | 11 | #include <linux/ioctl.h> |
12 | #include <asm/kvm.h> | ||
12 | 13 | ||
13 | #define KVM_API_VERSION 12 | 14 | #define KVM_API_VERSION 12 |
14 | 15 | ||
15 | /* Architectural interrupt line count. */ | ||
16 | #define KVM_NR_INTERRUPTS 256 | ||
17 | |||
18 | /* for KVM_CREATE_MEMORY_REGION */ | 16 | /* for KVM_CREATE_MEMORY_REGION */ |
19 | struct kvm_memory_region { | 17 | struct kvm_memory_region { |
20 | __u32 slot; | 18 | __u32 slot; |
@@ -23,17 +21,19 @@ struct kvm_memory_region { | |||
23 | __u64 memory_size; /* bytes */ | 21 | __u64 memory_size; /* bytes */ |
24 | }; | 22 | }; |
25 | 23 | ||
26 | /* for kvm_memory_region::flags */ | 24 | /* for KVM_SET_USER_MEMORY_REGION */ |
27 | #define KVM_MEM_LOG_DIRTY_PAGES 1UL | 25 | struct kvm_userspace_memory_region { |
28 | 26 | __u32 slot; | |
29 | struct kvm_memory_alias { | ||
30 | __u32 slot; /* this has a different namespace than memory slots */ | ||
31 | __u32 flags; | 27 | __u32 flags; |
32 | __u64 guest_phys_addr; | 28 | __u64 guest_phys_addr; |
33 | __u64 memory_size; | 29 | __u64 memory_size; /* bytes */ |
34 | __u64 target_phys_addr; | 30 | __u64 userspace_addr; /* start of the userspace allocated memory */ |
35 | }; | 31 | }; |
36 | 32 | ||
33 | /* for kvm_memory_region::flags */ | ||
34 | #define KVM_MEM_LOG_DIRTY_PAGES 1UL | ||
35 | |||
36 | |||
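A hedged userspace sketch of the new call (vm_fd from KVM_CREATE_VM and ram_size are assumed, not shown by this patch): mmap anonymous memory, then register it with the kernel as slot 0.

	struct kvm_userspace_memory_region region = {
		.slot = 0,
		.flags = KVM_MEM_LOG_DIRTY_PAGES,	/* optional dirty tracking */
		.guest_phys_addr = 0,
		.memory_size = ram_size,
		.userspace_addr = (__u64)(unsigned long)mmap(NULL, ram_size,
					PROT_READ | PROT_WRITE,
					MAP_PRIVATE | MAP_ANONYMOUS, -1, 0),
	};

	if (ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, &region) < 0)
		perror("KVM_SET_USER_MEMORY_REGION");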
37 | /* for KVM_IRQ_LINE */ | 37 | /* for KVM_IRQ_LINE */ |
38 | struct kvm_irq_level { | 38 | struct kvm_irq_level { |
39 | /* | 39 | /* |
@@ -45,62 +45,18 @@ struct kvm_irq_level { | |||
45 | __u32 level; | 45 | __u32 level; |
46 | }; | 46 | }; |
47 | 47 | ||
48 | /* for KVM_GET_IRQCHIP and KVM_SET_IRQCHIP */ | ||
49 | struct kvm_pic_state { | ||
50 | __u8 last_irr; /* edge detection */ | ||
51 | __u8 irr; /* interrupt request register */ | ||
52 | __u8 imr; /* interrupt mask register */ | ||
53 | __u8 isr; /* interrupt service register */ | ||
54 | __u8 priority_add; /* highest irq priority */ | ||
55 | __u8 irq_base; | ||
56 | __u8 read_reg_select; | ||
57 | __u8 poll; | ||
58 | __u8 special_mask; | ||
59 | __u8 init_state; | ||
60 | __u8 auto_eoi; | ||
61 | __u8 rotate_on_auto_eoi; | ||
62 | __u8 special_fully_nested_mode; | ||
63 | __u8 init4; /* true if 4 byte init */ | ||
64 | __u8 elcr; /* PIIX edge/trigger selection */ | ||
65 | __u8 elcr_mask; | ||
66 | }; | ||
67 | |||
68 | #define KVM_IOAPIC_NUM_PINS 24 | ||
69 | struct kvm_ioapic_state { | ||
70 | __u64 base_address; | ||
71 | __u32 ioregsel; | ||
72 | __u32 id; | ||
73 | __u32 irr; | ||
74 | __u32 pad; | ||
75 | union { | ||
76 | __u64 bits; | ||
77 | struct { | ||
78 | __u8 vector; | ||
79 | __u8 delivery_mode:3; | ||
80 | __u8 dest_mode:1; | ||
81 | __u8 delivery_status:1; | ||
82 | __u8 polarity:1; | ||
83 | __u8 remote_irr:1; | ||
84 | __u8 trig_mode:1; | ||
85 | __u8 mask:1; | ||
86 | __u8 reserve:7; | ||
87 | __u8 reserved[4]; | ||
88 | __u8 dest_id; | ||
89 | } fields; | ||
90 | } redirtbl[KVM_IOAPIC_NUM_PINS]; | ||
91 | }; | ||
92 | |||
93 | #define KVM_IRQCHIP_PIC_MASTER 0 | ||
94 | #define KVM_IRQCHIP_PIC_SLAVE 1 | ||
95 | #define KVM_IRQCHIP_IOAPIC 2 | ||
96 | 48 | ||
97 | struct kvm_irqchip { | 49 | struct kvm_irqchip { |
98 | __u32 chip_id; | 50 | __u32 chip_id; |
99 | __u32 pad; | 51 | __u32 pad; |
100 | union { | 52 | union { |
101 | char dummy[512]; /* reserving space */ | 53 | char dummy[512]; /* reserving space */ |
54 | #ifdef CONFIG_X86 | ||
102 | struct kvm_pic_state pic; | 55 | struct kvm_pic_state pic; |
56 | #endif | ||
57 | #if defined(CONFIG_X86) || defined(CONFIG_IA64) | ||
103 | struct kvm_ioapic_state ioapic; | 58 | struct kvm_ioapic_state ioapic; |
59 | #endif | ||
104 | } chip; | 60 | } chip; |
105 | }; | 61 | }; |
106 | 62 | ||
@@ -116,6 +72,7 @@ struct kvm_irqchip { | |||
116 | #define KVM_EXIT_FAIL_ENTRY 9 | 72 | #define KVM_EXIT_FAIL_ENTRY 9 |
117 | #define KVM_EXIT_INTR 10 | 73 | #define KVM_EXIT_INTR 10 |
118 | #define KVM_EXIT_SET_TPR 11 | 74 | #define KVM_EXIT_SET_TPR 11 |
75 | #define KVM_EXIT_TPR_ACCESS 12 | ||
119 | 76 | ||
120 | /* for KVM_RUN, returned by mmap(vcpu_fd, offset=0) */ | 77 | /* for KVM_RUN, returned by mmap(vcpu_fd, offset=0) */ |
121 | struct kvm_run { | 78 | struct kvm_run { |
@@ -174,90 +131,17 @@ struct kvm_run { | |||
174 | __u32 longmode; | 131 | __u32 longmode; |
175 | __u32 pad; | 132 | __u32 pad; |
176 | } hypercall; | 133 | } hypercall; |
134 | /* KVM_EXIT_TPR_ACCESS */ | ||
135 | struct { | ||
136 | __u64 rip; | ||
137 | __u32 is_write; | ||
138 | __u32 pad; | ||
139 | } tpr_access; | ||
177 | /* Fix the size of the union. */ | 140 | /* Fix the size of the union. */ |
178 | char padding[256]; | 141 | char padding[256]; |
179 | }; | 142 | }; |
180 | }; | 143 | }; |
181 | 144 | ||
182 | /* for KVM_GET_REGS and KVM_SET_REGS */ | ||
183 | struct kvm_regs { | ||
184 | /* out (KVM_GET_REGS) / in (KVM_SET_REGS) */ | ||
185 | __u64 rax, rbx, rcx, rdx; | ||
186 | __u64 rsi, rdi, rsp, rbp; | ||
187 | __u64 r8, r9, r10, r11; | ||
188 | __u64 r12, r13, r14, r15; | ||
189 | __u64 rip, rflags; | ||
190 | }; | ||
191 | |||
192 | /* for KVM_GET_FPU and KVM_SET_FPU */ | ||
193 | struct kvm_fpu { | ||
194 | __u8 fpr[8][16]; | ||
195 | __u16 fcw; | ||
196 | __u16 fsw; | ||
197 | __u8 ftwx; /* in fxsave format */ | ||
198 | __u8 pad1; | ||
199 | __u16 last_opcode; | ||
200 | __u64 last_ip; | ||
201 | __u64 last_dp; | ||
202 | __u8 xmm[16][16]; | ||
203 | __u32 mxcsr; | ||
204 | __u32 pad2; | ||
205 | }; | ||
206 | |||
207 | /* for KVM_GET_LAPIC and KVM_SET_LAPIC */ | ||
208 | #define KVM_APIC_REG_SIZE 0x400 | ||
209 | struct kvm_lapic_state { | ||
210 | char regs[KVM_APIC_REG_SIZE]; | ||
211 | }; | ||
212 | |||
213 | struct kvm_segment { | ||
214 | __u64 base; | ||
215 | __u32 limit; | ||
216 | __u16 selector; | ||
217 | __u8 type; | ||
218 | __u8 present, dpl, db, s, l, g, avl; | ||
219 | __u8 unusable; | ||
220 | __u8 padding; | ||
221 | }; | ||
222 | |||
223 | struct kvm_dtable { | ||
224 | __u64 base; | ||
225 | __u16 limit; | ||
226 | __u16 padding[3]; | ||
227 | }; | ||
228 | |||
229 | /* for KVM_GET_SREGS and KVM_SET_SREGS */ | ||
230 | struct kvm_sregs { | ||
231 | /* out (KVM_GET_SREGS) / in (KVM_SET_SREGS) */ | ||
232 | struct kvm_segment cs, ds, es, fs, gs, ss; | ||
233 | struct kvm_segment tr, ldt; | ||
234 | struct kvm_dtable gdt, idt; | ||
235 | __u64 cr0, cr2, cr3, cr4, cr8; | ||
236 | __u64 efer; | ||
237 | __u64 apic_base; | ||
238 | __u64 interrupt_bitmap[(KVM_NR_INTERRUPTS + 63) / 64]; | ||
239 | }; | ||
240 | |||
241 | struct kvm_msr_entry { | ||
242 | __u32 index; | ||
243 | __u32 reserved; | ||
244 | __u64 data; | ||
245 | }; | ||
246 | |||
247 | /* for KVM_GET_MSRS and KVM_SET_MSRS */ | ||
248 | struct kvm_msrs { | ||
249 | __u32 nmsrs; /* number of msrs in entries */ | ||
250 | __u32 pad; | ||
251 | |||
252 | struct kvm_msr_entry entries[0]; | ||
253 | }; | ||
254 | |||
255 | /* for KVM_GET_MSR_INDEX_LIST */ | ||
256 | struct kvm_msr_list { | ||
257 | __u32 nmsrs; /* number of msrs in entries */ | ||
258 | __u32 indices[0]; | ||
259 | }; | ||
260 | |||
261 | /* for KVM_TRANSLATE */ | 145 | /* for KVM_TRANSLATE */ |
262 | struct kvm_translation { | 146 | struct kvm_translation { |
263 | /* in */ | 147 | /* in */ |
@@ -302,28 +186,24 @@ struct kvm_dirty_log { | |||
302 | }; | 186 | }; |
303 | }; | 187 | }; |
304 | 188 | ||
305 | struct kvm_cpuid_entry { | ||
306 | __u32 function; | ||
307 | __u32 eax; | ||
308 | __u32 ebx; | ||
309 | __u32 ecx; | ||
310 | __u32 edx; | ||
311 | __u32 padding; | ||
312 | }; | ||
313 | |||
314 | /* for KVM_SET_CPUID */ | ||
315 | struct kvm_cpuid { | ||
316 | __u32 nent; | ||
317 | __u32 padding; | ||
318 | struct kvm_cpuid_entry entries[0]; | ||
319 | }; | ||
320 | |||
321 | /* for KVM_SET_SIGNAL_MASK */ | 189 | /* for KVM_SET_SIGNAL_MASK */ |
322 | struct kvm_signal_mask { | 190 | struct kvm_signal_mask { |
323 | __u32 len; | 191 | __u32 len; |
324 | __u8 sigset[0]; | 192 | __u8 sigset[0]; |
325 | }; | 193 | }; |
326 | 194 | ||
195 | /* for KVM_TPR_ACCESS_REPORTING */ | ||
196 | struct kvm_tpr_access_ctl { | ||
197 | __u32 enabled; | ||
198 | __u32 flags; | ||
199 | __u32 reserved[8]; | ||
200 | }; | ||
201 | |||
202 | /* for KVM_SET_VAPIC_ADDR */ | ||
203 | struct kvm_vapic_addr { | ||
204 | __u64 vapic_addr; | ||
205 | }; | ||
206 | |||
327 | #define KVMIO 0xAE | 207 | #define KVMIO 0xAE |
328 | 208 | ||
329 | /* | 209 | /* |
@@ -347,11 +227,21 @@ struct kvm_signal_mask { | |||
347 | */ | 227 | */ |
348 | #define KVM_CAP_IRQCHIP 0 | 228 | #define KVM_CAP_IRQCHIP 0 |
349 | #define KVM_CAP_HLT 1 | 229 | #define KVM_CAP_HLT 1 |
230 | #define KVM_CAP_MMU_SHADOW_CACHE_CONTROL 2 | ||
231 | #define KVM_CAP_USER_MEMORY 3 | ||
232 | #define KVM_CAP_SET_TSS_ADDR 4 | ||
233 | #define KVM_CAP_EXT_CPUID 5 | ||
234 | #define KVM_CAP_VAPIC 6 | ||
350 | 235 | ||
351 | /* | 236 | /* |
352 | * ioctls for VM fds | 237 | * ioctls for VM fds |
353 | */ | 238 | */ |
354 | #define KVM_SET_MEMORY_REGION _IOW(KVMIO, 0x40, struct kvm_memory_region) | 239 | #define KVM_SET_MEMORY_REGION _IOW(KVMIO, 0x40, struct kvm_memory_region) |
240 | #define KVM_SET_NR_MMU_PAGES _IO(KVMIO, 0x44) | ||
241 | #define KVM_GET_NR_MMU_PAGES _IO(KVMIO, 0x45) | ||
242 | #define KVM_SET_USER_MEMORY_REGION _IOW(KVMIO, 0x46,\ | ||
243 | struct kvm_userspace_memory_region) | ||
244 | #define KVM_SET_TSS_ADDR _IO(KVMIO, 0x47) | ||
355 | /* | 245 | /* |
356 | * KVM_CREATE_VCPU receives as a parameter the vcpu slot, and returns | 246 | * KVM_CREATE_VCPU receives as a parameter the vcpu slot, and returns |
357 | * a vcpu fd. | 247 | * a vcpu fd. |
@@ -359,6 +249,7 @@ struct kvm_signal_mask { | |||
359 | #define KVM_CREATE_VCPU _IO(KVMIO, 0x41) | 249 | #define KVM_CREATE_VCPU _IO(KVMIO, 0x41) |
360 | #define KVM_GET_DIRTY_LOG _IOW(KVMIO, 0x42, struct kvm_dirty_log) | 250 | #define KVM_GET_DIRTY_LOG _IOW(KVMIO, 0x42, struct kvm_dirty_log) |
361 | #define KVM_SET_MEMORY_ALIAS _IOW(KVMIO, 0x43, struct kvm_memory_alias) | 251 | #define KVM_SET_MEMORY_ALIAS _IOW(KVMIO, 0x43, struct kvm_memory_alias) |
252 | #define KVM_GET_SUPPORTED_CPUID _IOWR(KVMIO, 0x48, struct kvm_cpuid2) | ||
362 | /* Device model IOC */ | 253 | /* Device model IOC */ |
363 | #define KVM_CREATE_IRQCHIP _IO(KVMIO, 0x60) | 254 | #define KVM_CREATE_IRQCHIP _IO(KVMIO, 0x60) |
364 | #define KVM_IRQ_LINE _IOW(KVMIO, 0x61, struct kvm_irq_level) | 255 | #define KVM_IRQ_LINE _IOW(KVMIO, 0x61, struct kvm_irq_level) |
@@ -384,5 +275,11 @@ struct kvm_signal_mask { | |||
384 | #define KVM_SET_FPU _IOW(KVMIO, 0x8d, struct kvm_fpu) | 275 | #define KVM_SET_FPU _IOW(KVMIO, 0x8d, struct kvm_fpu) |
385 | #define KVM_GET_LAPIC _IOR(KVMIO, 0x8e, struct kvm_lapic_state) | 276 | #define KVM_GET_LAPIC _IOR(KVMIO, 0x8e, struct kvm_lapic_state) |
386 | #define KVM_SET_LAPIC _IOW(KVMIO, 0x8f, struct kvm_lapic_state) | 277 | #define KVM_SET_LAPIC _IOW(KVMIO, 0x8f, struct kvm_lapic_state) |
278 | #define KVM_SET_CPUID2 _IOW(KVMIO, 0x90, struct kvm_cpuid2) | ||
279 | #define KVM_GET_CPUID2 _IOWR(KVMIO, 0x91, struct kvm_cpuid2) | ||
280 | /* Available with KVM_CAP_VAPIC */ | ||
281 | #define KVM_TPR_ACCESS_REPORTING _IOWR(KVMIO, 0x92, struct kvm_tpr_access_ctl) | ||
282 | /* Available with KVM_CAP_VAPIC */ | ||
283 | #define KVM_SET_VAPIC_ADDR _IOW(KVMIO, 0x93, struct kvm_vapic_addr) | ||
387 | 284 | ||
388 | #endif | 285 | #endif |
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h new file mode 100644 index 000000000000..ea4764b0a2f4 --- /dev/null +++ b/include/linux/kvm_host.h | |||
@@ -0,0 +1,299 @@ | |||
1 | #ifndef __KVM_HOST_H | ||
2 | #define __KVM_HOST_H | ||
3 | |||
4 | /* | ||
5 | * This work is licensed under the terms of the GNU GPL, version 2. See | ||
6 | * the COPYING file in the top-level directory. | ||
7 | */ | ||
8 | |||
9 | #include <linux/types.h> | ||
10 | #include <linux/hardirq.h> | ||
11 | #include <linux/list.h> | ||
12 | #include <linux/mutex.h> | ||
13 | #include <linux/spinlock.h> | ||
14 | #include <linux/signal.h> | ||
15 | #include <linux/sched.h> | ||
16 | #include <linux/mm.h> | ||
17 | #include <linux/preempt.h> | ||
18 | #include <asm/signal.h> | ||
19 | |||
20 | #include <linux/kvm.h> | ||
21 | #include <linux/kvm_para.h> | ||
22 | |||
23 | #include <linux/kvm_types.h> | ||
24 | |||
25 | #include <asm/kvm_host.h> | ||
26 | |||
27 | #define KVM_MAX_VCPUS 4 | ||
28 | #define KVM_MEMORY_SLOTS 8 | ||
29 | /* memory slots that are not exposed to userspace */
30 | #define KVM_PRIVATE_MEM_SLOTS 4 | ||
31 | |||
32 | #define KVM_PIO_PAGE_OFFSET 1 | ||
33 | |||
34 | /* | ||
35 | * vcpu->requests bit members | ||
36 | */ | ||
37 | #define KVM_REQ_TLB_FLUSH 0 | ||
38 | #define KVM_REQ_MIGRATE_TIMER 1 | ||
39 | #define KVM_REQ_REPORT_TPR_ACCESS 2 | ||
40 | |||
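The request bits are manipulated with ordinary atomic bitops on vcpu->requests; a minimal sketch of the raise/service pattern (illustrative, not lifted from this patch):

	/* raising side, possibly from another cpu: */
	set_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests);

	/* servicing side, just before reentering the guest: */
	if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests))
		kvm_x86_ops->tlb_flush(vcpu);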
41 | struct kvm_vcpu; | ||
42 | extern struct kmem_cache *kvm_vcpu_cache; | ||
43 | |||
44 | struct kvm_guest_debug { | ||
45 | int enabled; | ||
46 | unsigned long bp[4]; | ||
47 | int singlestep; | ||
48 | }; | ||
49 | |||
50 | /* | ||
51 | * It would be nice to use something smarter than a linear search, TBD... | ||
52 | * Thankfully we dont expect many devices to register (famous last words :), | ||
53 | * so until then it will suffice. At least its abstracted so we can change | ||
54 | * in one place. | ||
55 | */ | ||
56 | struct kvm_io_bus { | ||
57 | int dev_count; | ||
58 | #define NR_IOBUS_DEVS 6 | ||
59 | struct kvm_io_device *devs[NR_IOBUS_DEVS]; | ||
60 | }; | ||
61 | |||
62 | void kvm_io_bus_init(struct kvm_io_bus *bus); | ||
63 | void kvm_io_bus_destroy(struct kvm_io_bus *bus); | ||
64 | struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus, gpa_t addr); | ||
65 | void kvm_io_bus_register_dev(struct kvm_io_bus *bus, | ||
66 | struct kvm_io_device *dev); | ||
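The search the comment owns up to really is that plain; a sketch of what kvm_io_bus_find_dev() amounts to (the in-tree body lives in virt/kvm/kvm_main.c and is assumed to use the kvm_iodevice_inrange() helper now housed in virt/kvm/iodev.h):

	struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus, gpa_t addr)
	{
		int i;

		for (i = 0; i < bus->dev_count; i++) {
			struct kvm_io_device *pos = bus->devs[i];

			if (kvm_iodevice_inrange(pos, addr))
				return pos;	/* first match wins */
		}
		return NULL;
	}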
67 | |||
68 | struct kvm_vcpu { | ||
69 | struct kvm *kvm; | ||
70 | struct preempt_notifier preempt_notifier; | ||
71 | int vcpu_id; | ||
72 | struct mutex mutex; | ||
73 | int cpu; | ||
74 | struct kvm_run *run; | ||
75 | int guest_mode; | ||
76 | unsigned long requests; | ||
77 | struct kvm_guest_debug guest_debug; | ||
78 | int fpu_active; | ||
79 | int guest_fpu_loaded; | ||
80 | wait_queue_head_t wq; | ||
81 | int sigset_active; | ||
82 | sigset_t sigset; | ||
83 | struct kvm_vcpu_stat stat; | ||
84 | |||
85 | #ifdef CONFIG_HAS_IOMEM | ||
86 | int mmio_needed; | ||
87 | int mmio_read_completed; | ||
88 | int mmio_is_write; | ||
89 | int mmio_size; | ||
90 | unsigned char mmio_data[8]; | ||
91 | gpa_t mmio_phys_addr; | ||
92 | #endif | ||
93 | |||
94 | struct kvm_vcpu_arch arch; | ||
95 | }; | ||
96 | |||
97 | struct kvm_memory_slot { | ||
98 | gfn_t base_gfn; | ||
99 | unsigned long npages; | ||
100 | unsigned long flags; | ||
101 | unsigned long *rmap; | ||
102 | unsigned long *dirty_bitmap; | ||
103 | unsigned long userspace_addr; | ||
104 | int user_alloc; | ||
105 | }; | ||
106 | |||
107 | struct kvm { | ||
108 | struct mutex lock; /* protects the vcpus array and APIC accesses */ | ||
109 | spinlock_t mmu_lock; | ||
110 | struct mm_struct *mm; /* userspace tied to this vm */ | ||
111 | int nmemslots; | ||
112 | struct kvm_memory_slot memslots[KVM_MEMORY_SLOTS + | ||
113 | KVM_PRIVATE_MEM_SLOTS]; | ||
114 | struct kvm_vcpu *vcpus[KVM_MAX_VCPUS]; | ||
115 | struct list_head vm_list; | ||
116 | struct file *filp; | ||
117 | struct kvm_io_bus mmio_bus; | ||
118 | struct kvm_io_bus pio_bus; | ||
119 | struct kvm_vm_stat stat; | ||
120 | struct kvm_arch arch; | ||
121 | }; | ||
122 | |||
123 | /* The guest did something we don't support. */ | ||
124 | #define pr_unimpl(vcpu, fmt, ...) \ | ||
125 | do { \ | ||
126 | if (printk_ratelimit()) \ | ||
127 | printk(KERN_ERR "kvm: %i: cpu%i " fmt, \ | ||
128 | current->tgid, (vcpu)->vcpu_id , ## __VA_ARGS__); \ | ||
129 | } while (0) | ||
130 | |||
131 | #define kvm_printf(kvm, fmt ...) printk(KERN_DEBUG fmt) | ||
132 | #define vcpu_printf(vcpu, fmt...) kvm_printf(vcpu->kvm, fmt) | ||
133 | |||
134 | int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id); | ||
135 | void kvm_vcpu_uninit(struct kvm_vcpu *vcpu); | ||
136 | |||
137 | void vcpu_load(struct kvm_vcpu *vcpu); | ||
138 | void vcpu_put(struct kvm_vcpu *vcpu); | ||
139 | |||
140 | void decache_vcpus_on_cpu(int cpu); | ||
141 | |||
142 | |||
143 | int kvm_init(void *opaque, unsigned int vcpu_size, | ||
144 | struct module *module); | ||
145 | void kvm_exit(void); | ||
146 | |||
147 | #define HPA_MSB ((sizeof(hpa_t) * 8) - 1) | ||
148 | #define HPA_ERR_MASK ((hpa_t)1 << HPA_MSB) | ||
149 | static inline int is_error_hpa(hpa_t hpa) { return hpa >> HPA_MSB; } | ||
150 | struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva); | ||
151 | |||
152 | extern struct page *bad_page; | ||
153 | |||
154 | int is_error_page(struct page *page); | ||
155 | int kvm_is_error_hva(unsigned long addr); | ||
156 | int kvm_set_memory_region(struct kvm *kvm, | ||
157 | struct kvm_userspace_memory_region *mem, | ||
158 | int user_alloc); | ||
159 | int __kvm_set_memory_region(struct kvm *kvm, | ||
160 | struct kvm_userspace_memory_region *mem, | ||
161 | int user_alloc); | ||
162 | int kvm_arch_set_memory_region(struct kvm *kvm, | ||
163 | struct kvm_userspace_memory_region *mem, | ||
164 | struct kvm_memory_slot old, | ||
165 | int user_alloc); | ||
166 | gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn); | ||
167 | struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn); | ||
168 | void kvm_release_page_clean(struct page *page); | ||
169 | void kvm_release_page_dirty(struct page *page); | ||
170 | int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset, | ||
171 | int len); | ||
172 | int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data, | ||
173 | unsigned long len); | ||
174 | int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len); | ||
175 | int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data, | ||
176 | int offset, int len); | ||
177 | int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data, | ||
178 | unsigned long len); | ||
179 | int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len); | ||
180 | int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len); | ||
181 | struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn); | ||
182 | int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn); | ||
183 | void mark_page_dirty(struct kvm *kvm, gfn_t gfn); | ||
184 | |||
185 | void kvm_vcpu_block(struct kvm_vcpu *vcpu); | ||
186 | void kvm_resched(struct kvm_vcpu *vcpu); | ||
187 | void kvm_load_guest_fpu(struct kvm_vcpu *vcpu); | ||
188 | void kvm_put_guest_fpu(struct kvm_vcpu *vcpu); | ||
189 | void kvm_flush_remote_tlbs(struct kvm *kvm); | ||
190 | |||
191 | long kvm_arch_dev_ioctl(struct file *filp, | ||
192 | unsigned int ioctl, unsigned long arg); | ||
193 | long kvm_arch_vcpu_ioctl(struct file *filp, | ||
194 | unsigned int ioctl, unsigned long arg); | ||
195 | void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu); | ||
196 | void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu); | ||
197 | |||
198 | int kvm_dev_ioctl_check_extension(long ext); | ||
199 | |||
200 | int kvm_get_dirty_log(struct kvm *kvm, | ||
201 | struct kvm_dirty_log *log, int *is_dirty); | ||
202 | int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, | ||
203 | struct kvm_dirty_log *log); | ||
204 | |||
205 | int kvm_vm_ioctl_set_memory_region(struct kvm *kvm, | ||
206 | struct | ||
207 | kvm_userspace_memory_region *mem, | ||
208 | int user_alloc); | ||
209 | long kvm_arch_vm_ioctl(struct file *filp, | ||
210 | unsigned int ioctl, unsigned long arg); | ||
211 | void kvm_arch_destroy_vm(struct kvm *kvm); | ||
212 | |||
213 | int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu); | ||
214 | int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu); | ||
215 | |||
216 | int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, | ||
217 | struct kvm_translation *tr); | ||
218 | |||
219 | int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs); | ||
220 | int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs); | ||
221 | int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, | ||
222 | struct kvm_sregs *sregs); | ||
223 | int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, | ||
224 | struct kvm_sregs *sregs); | ||
225 | int kvm_arch_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu, | ||
226 | struct kvm_debug_guest *dbg); | ||
227 | int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run); | ||
228 | |||
229 | int kvm_arch_init(void *opaque); | ||
230 | void kvm_arch_exit(void); | ||
231 | |||
232 | int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu); | ||
233 | void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu); | ||
234 | |||
235 | void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu); | ||
236 | void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu); | ||
237 | void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu); | ||
238 | struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id); | ||
239 | int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu); | ||
240 | void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu); | ||
241 | |||
242 | int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu); | ||
243 | void kvm_arch_hardware_enable(void *garbage); | ||
244 | void kvm_arch_hardware_disable(void *garbage); | ||
245 | int kvm_arch_hardware_setup(void); | ||
246 | void kvm_arch_hardware_unsetup(void); | ||
247 | void kvm_arch_check_processor_compat(void *rtn); | ||
248 | int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu); | ||
249 | |||
250 | void kvm_free_physmem(struct kvm *kvm); | ||
251 | |||
252 | struct kvm *kvm_arch_create_vm(void); | ||
253 | void kvm_arch_destroy_vm(struct kvm *kvm); | ||
254 | |||
255 | int kvm_cpu_get_interrupt(struct kvm_vcpu *v); | ||
256 | int kvm_cpu_has_interrupt(struct kvm_vcpu *v); | ||
257 | void kvm_vcpu_kick(struct kvm_vcpu *vcpu); | ||
258 | |||
259 | static inline void kvm_guest_enter(void) | ||
260 | { | ||
261 | account_system_vtime(current); | ||
262 | current->flags |= PF_VCPU; | ||
263 | } | ||
264 | |||
265 | static inline void kvm_guest_exit(void) | ||
266 | { | ||
267 | account_system_vtime(current); | ||
268 | current->flags &= ~PF_VCPU; | ||
269 | } | ||
270 | |||
271 | static inline int memslot_id(struct kvm *kvm, struct kvm_memory_slot *slot) | ||
272 | { | ||
273 | return slot - kvm->memslots; | ||
274 | } | ||
275 | |||
276 | static inline gpa_t gfn_to_gpa(gfn_t gfn) | ||
277 | { | ||
278 | return (gpa_t)gfn << PAGE_SHIFT; | ||
279 | } | ||
280 | |||
281 | static inline void kvm_migrate_apic_timer(struct kvm_vcpu *vcpu) | ||
282 | { | ||
283 | set_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests); | ||
284 | } | ||
285 | |||
286 | enum kvm_stat_kind { | ||
287 | KVM_STAT_VM, | ||
288 | KVM_STAT_VCPU, | ||
289 | }; | ||
290 | |||
291 | struct kvm_stats_debugfs_item { | ||
292 | const char *name; | ||
293 | int offset; | ||
294 | enum kvm_stat_kind kind; | ||
295 | struct dentry *dentry; | ||
296 | }; | ||
297 | extern struct kvm_stats_debugfs_item debugfs_entries[]; | ||
298 | |||
299 | #endif | ||
diff --git a/include/linux/kvm_para.h b/include/linux/kvm_para.h index 3b292565a693..5497aac0d2f8 100644 --- a/include/linux/kvm_para.h +++ b/include/linux/kvm_para.h | |||
@@ -2,72 +2,30 @@ | |||
2 | #define __LINUX_KVM_PARA_H | 2 | #define __LINUX_KVM_PARA_H |
3 | 3 | ||
4 | /* | 4 | /* |
5 | * Guest OS interface for KVM paravirtualization | 5 | * This header file provides a method for making a hypercall to the host |
6 | * | 6 | * Architectures should define: |
7 | * Note: this interface is totally experimental, and is certain to change | 7 | * - kvm_hypercall0, kvm_hypercall1... |
8 | * as we make progress. | 8 | * - kvm_arch_para_features |
9 | * - kvm_para_available | ||
9 | */ | 10 | */ |
10 | 11 | ||
11 | /* | 12 | /* Return values for hypercalls */ |
12 | * Per-VCPU descriptor area shared between guest and host. Writable to | 13 | #define KVM_ENOSYS 1000 |
13 | * both guest and host. Registered with the host by the guest when | ||
14 | * a guest acknowledges paravirtual mode. | ||
15 | * | ||
16 | * NOTE: all addresses are guest-physical addresses (gpa), to make it | ||
17 | * easier for the hypervisor to map between the various addresses. | ||
18 | */ | ||
19 | struct kvm_vcpu_para_state { | ||
20 | /* | ||
21 | * API version information for compatibility. If there's any support | ||
22 | * mismatch (too old host trying to execute too new guest) then | ||
23 | * the host will deny entry into paravirtual mode. Any other | ||
24 | * combination (new host + old guest and new host + new guest) | ||
25 | * is supposed to work - new host versions will support all old | ||
26 | * guest API versions. | ||
27 | */ | ||
28 | u32 guest_version; | ||
29 | u32 host_version; | ||
30 | u32 size; | ||
31 | u32 ret; | ||
32 | |||
33 | /* | ||
34 | * The address of the vm exit instruction (VMCALL or VMMCALL), | ||
35 | * which the host will patch according to the CPU model the | ||
36 | * VM runs on: | ||
37 | */ | ||
38 | u64 hypercall_gpa; | ||
39 | |||
40 | } __attribute__ ((aligned(PAGE_SIZE))); | ||
41 | |||
42 | #define KVM_PARA_API_VERSION 1 | ||
43 | |||
44 | /* | ||
45 | * This is used for an RDMSR's ECX parameter to probe for a KVM host. | ||
46 | * Hopefully no CPU vendor will use up this number. This is placed well | ||
47 | * out of way of the typical space occupied by CPU vendors' MSR indices, | ||
48 | * and we think (or at least hope) it wont be occupied in the future | ||
49 | * either. | ||
50 | */ | ||
51 | #define MSR_KVM_API_MAGIC 0x87655678 | ||
52 | 14 | ||
53 | #define KVM_EINVAL 1 | 15 | #define KVM_HC_VAPIC_POLL_IRQ 1 |
54 | 16 | ||
55 | /* | 17 | /* |
56 | * Hypercall calling convention: | 18 | * hypercalls use architecture specific |
57 | * | ||
58 | * Each hypercall may have 0-6 parameters. | ||
59 | * | ||
60 | * 64-bit hypercall index is in RAX, goes from 0 to __NR_hypercalls-1 | ||
61 | * | ||
62 | * 64-bit parameters 1-6 are in the standard gcc x86_64 calling convention | ||
63 | * order: RDI, RSI, RDX, RCX, R8, R9. | ||
64 | * | ||
65 | * 32-bit index is EBX, parameters are: EAX, ECX, EDX, ESI, EDI, EBP. | ||
66 | * (the first 3 are according to the gcc regparm calling convention) | ||
67 | * | ||
68 | * No registers are clobbered by the hypercall, except that the | ||
69 | * return value is in RAX. | ||
70 | */ | 19 | */ |
71 | #define __NR_hypercalls 0 | 20 | #include <asm/kvm_para.h> |
21 | |||
22 | #ifdef __KERNEL__ | ||
23 | static inline int kvm_para_has_feature(unsigned int feature) | ||
24 | { | ||
25 | if (kvm_arch_para_features() & (1UL << feature)) | ||
26 | return 1; | ||
27 | return 0; | ||
28 | } | ||
29 | #endif /* __KERNEL__ */ | ||
30 | #endif /* __LINUX_KVM_PARA_H */ | ||
72 | 31 | ||
73 | #endif | ||
diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h new file mode 100644 index 000000000000..1c4e46decb22 --- /dev/null +++ b/include/linux/kvm_types.h | |||
@@ -0,0 +1,54 @@ | |||
1 | /* | ||
2 | * This program is free software; you can redistribute it and/or modify | ||
3 | * it under the terms of the GNU General Public License as published by | ||
4 | * the Free Software Foundation; either version 2 of the License. | ||
5 | * | ||
6 | * This program is distributed in the hope that it will be useful, | ||
7 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
8 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
9 | * GNU General Public License for more details. | ||
10 | * | ||
11 | * You should have received a copy of the GNU General Public License | ||
12 | * along with this program; if not, write to the Free Software | ||
13 | * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. | ||
14 | * | ||
15 | */ | ||
16 | |||
17 | #ifndef __KVM_TYPES_H__ | ||
18 | #define __KVM_TYPES_H__ | ||
19 | |||
20 | #include <asm/types.h> | ||
21 | |||
22 | /* | ||
23 | * Address types: | ||
24 | * | ||
25 | * gva - guest virtual address | ||
26 | * gpa - guest physical address | ||
27 | * gfn - guest frame number | ||
28 | * hva - host virtual address | ||
29 | * hpa - host physical address | ||
30 | * hfn - host frame number | ||
31 | */ | ||
32 | |||
33 | typedef unsigned long gva_t; | ||
34 | typedef u64 gpa_t; | ||
35 | typedef unsigned long gfn_t; | ||
36 | |||
37 | typedef unsigned long hva_t; | ||
38 | typedef u64 hpa_t; | ||
39 | typedef unsigned long hfn_t; | ||
40 | |||
41 | struct kvm_pio_request { | ||
42 | unsigned long count; | ||
43 | int cur_count; | ||
44 | struct page *guest_pages[2]; | ||
45 | unsigned guest_page_offset; | ||
46 | int in; | ||
47 | int port; | ||
48 | int size; | ||
49 | int string; | ||
50 | int down; | ||
51 | int rep; | ||
52 | }; | ||
53 | |||
54 | #endif /* __KVM_TYPES_H__ */ | ||
diff --git a/kernel/fork.c b/kernel/fork.c index 314f5101d2b0..05e0b6f4365b 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
@@ -393,6 +393,7 @@ void fastcall __mmdrop(struct mm_struct *mm) | |||
393 | destroy_context(mm); | 393 | destroy_context(mm); |
394 | free_mm(mm); | 394 | free_mm(mm); |
395 | } | 395 | } |
396 | EXPORT_SYMBOL_GPL(__mmdrop); | ||
396 | 397 | ||
397 | /* | 398 | /* |
398 | * Decrement the use count and release all resources for an mm. | 399 | * Decrement the use count and release all resources for an mm. |
diff --git a/drivers/kvm/ioapic.c b/virt/kvm/ioapic.c index c7992e667fdb..317f8e211cd2 100644 --- a/drivers/kvm/ioapic.c +++ b/virt/kvm/ioapic.c | |||
@@ -26,7 +26,7 @@ | |||
26 | * Based on Xen 3.1 code. | 26 | * Based on Xen 3.1 code. |
27 | */ | 27 | */ |
28 | 28 | ||
29 | #include "kvm.h" | 29 | #include <linux/kvm_host.h> |
30 | #include <linux/kvm.h> | 30 | #include <linux/kvm.h> |
31 | #include <linux/mm.h> | 31 | #include <linux/mm.h> |
32 | #include <linux/highmem.h> | 32 | #include <linux/highmem.h> |
@@ -34,14 +34,17 @@ | |||
34 | #include <linux/hrtimer.h> | 34 | #include <linux/hrtimer.h> |
35 | #include <linux/io.h> | 35 | #include <linux/io.h> |
36 | #include <asm/processor.h> | 36 | #include <asm/processor.h> |
37 | #include <asm/msr.h> | ||
38 | #include <asm/page.h> | 37 | #include <asm/page.h> |
39 | #include <asm/current.h> | 38 | #include <asm/current.h> |
40 | #include <asm/apicdef.h> | 39 | |
41 | #include <asm/io_apic.h> | 40 | #include "ioapic.h" |
42 | #include "irq.h" | 41 | #include "lapic.h" |
43 | /* #define ioapic_debug(fmt,arg...) printk(KERN_WARNING fmt,##arg) */ | 42 | |
43 | #if 0 | ||
44 | #define ioapic_debug(fmt,arg...) printk(KERN_WARNING fmt,##arg) | ||
45 | #else | ||
44 | #define ioapic_debug(fmt, arg...) | 46 | #define ioapic_debug(fmt, arg...) |
47 | #endif | ||
45 | static void ioapic_deliver(struct kvm_ioapic *vioapic, int irq); | 48 | static void ioapic_deliver(struct kvm_ioapic *vioapic, int irq); |
46 | 49 | ||
47 | static unsigned long ioapic_read_indirect(struct kvm_ioapic *ioapic, | 50 | static unsigned long ioapic_read_indirect(struct kvm_ioapic *ioapic, |
@@ -113,7 +116,7 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val) | |||
113 | default: | 116 | default: |
114 | index = (ioapic->ioregsel - 0x10) >> 1; | 117 | index = (ioapic->ioregsel - 0x10) >> 1; |
115 | 118 | ||
116 | ioapic_debug("change redir index %x val %x", index, val); | 119 | ioapic_debug("change redir index %x val %x\n", index, val); |
117 | if (index >= IOAPIC_NUM_PINS) | 120 | if (index >= IOAPIC_NUM_PINS) |
118 | return; | 121 | return; |
119 | if (ioapic->ioregsel & 1) { | 122 | if (ioapic->ioregsel & 1) { |
@@ -131,16 +134,16 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val) | |||
131 | } | 134 | } |
132 | 135 | ||
133 | static void ioapic_inj_irq(struct kvm_ioapic *ioapic, | 136 | static void ioapic_inj_irq(struct kvm_ioapic *ioapic, |
134 | struct kvm_lapic *target, | 137 | struct kvm_vcpu *vcpu, |
135 | u8 vector, u8 trig_mode, u8 delivery_mode) | 138 | u8 vector, u8 trig_mode, u8 delivery_mode) |
136 | { | 139 | { |
137 | ioapic_debug("irq %d trig %d deliv %d", vector, trig_mode, | 140 | ioapic_debug("irq %d trig %d deliv %d\n", vector, trig_mode, |
138 | delivery_mode); | 141 | delivery_mode); |
139 | 142 | ||
140 | ASSERT((delivery_mode == dest_Fixed) || | 143 | ASSERT((delivery_mode == IOAPIC_FIXED) || |
141 | (delivery_mode == dest_LowestPrio)); | 144 | (delivery_mode == IOAPIC_LOWEST_PRIORITY)); |
142 | 145 | ||
143 | kvm_apic_set_irq(target, vector, trig_mode); | 146 | kvm_apic_set_irq(vcpu, vector, trig_mode); |
144 | } | 147 | } |
145 | 148 | ||
146 | static u32 ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest, | 149 | static u32 ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest, |
@@ -151,12 +154,12 @@ static u32 ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest, | |||
151 | struct kvm *kvm = ioapic->kvm; | 154 | struct kvm *kvm = ioapic->kvm; |
152 | struct kvm_vcpu *vcpu; | 155 | struct kvm_vcpu *vcpu; |
153 | 156 | ||
154 | ioapic_debug("dest %d dest_mode %d", dest, dest_mode); | 157 | ioapic_debug("dest %d dest_mode %d\n", dest, dest_mode); |
155 | 158 | ||
156 | if (dest_mode == 0) { /* Physical mode. */ | 159 | if (dest_mode == 0) { /* Physical mode. */ |
157 | if (dest == 0xFF) { /* Broadcast. */ | 160 | if (dest == 0xFF) { /* Broadcast. */ |
158 | for (i = 0; i < KVM_MAX_VCPUS; ++i) | 161 | for (i = 0; i < KVM_MAX_VCPUS; ++i) |
159 | if (kvm->vcpus[i] && kvm->vcpus[i]->apic) | 162 | if (kvm->vcpus[i] && kvm->vcpus[i]->arch.apic) |
160 | mask |= 1 << i; | 163 | mask |= 1 << i; |
161 | return mask; | 164 | return mask; |
162 | } | 165 | } |
@@ -164,8 +167,8 @@ static u32 ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest, | |||
164 | vcpu = kvm->vcpus[i]; | 167 | vcpu = kvm->vcpus[i]; |
165 | if (!vcpu) | 168 | if (!vcpu) |
166 | continue; | 169 | continue; |
167 | if (kvm_apic_match_physical_addr(vcpu->apic, dest)) { | 170 | if (kvm_apic_match_physical_addr(vcpu->arch.apic, dest)) { |
168 | if (vcpu->apic) | 171 | if (vcpu->arch.apic) |
169 | mask = 1 << i; | 172 | mask = 1 << i; |
170 | break; | 173 | break; |
171 | } | 174 | } |
@@ -175,11 +178,11 @@ static u32 ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest, | |||
175 | vcpu = kvm->vcpus[i]; | 178 | vcpu = kvm->vcpus[i]; |
176 | if (!vcpu) | 179 | if (!vcpu) |
177 | continue; | 180 | continue; |
178 | if (vcpu->apic && | 181 | if (vcpu->arch.apic && |
179 | kvm_apic_match_logical_addr(vcpu->apic, dest)) | 182 | kvm_apic_match_logical_addr(vcpu->arch.apic, dest)) |
180 | mask |= 1 << vcpu->vcpu_id; | 183 | mask |= 1 << vcpu->vcpu_id; |
181 | } | 184 | } |
182 | ioapic_debug("mask %x", mask); | 185 | ioapic_debug("mask %x\n", mask); |
183 | return mask; | 186 | return mask; |
184 | } | 187 | } |
185 | 188 | ||
@@ -191,41 +194,39 @@ static void ioapic_deliver(struct kvm_ioapic *ioapic, int irq) | |||
191 | u8 vector = ioapic->redirtbl[irq].fields.vector; | 194 | u8 vector = ioapic->redirtbl[irq].fields.vector; |
192 | u8 trig_mode = ioapic->redirtbl[irq].fields.trig_mode; | 195 | u8 trig_mode = ioapic->redirtbl[irq].fields.trig_mode; |
193 | u32 deliver_bitmask; | 196 | u32 deliver_bitmask; |
194 | struct kvm_lapic *target; | ||
195 | struct kvm_vcpu *vcpu; | 197 | struct kvm_vcpu *vcpu; |
196 | int vcpu_id; | 198 | int vcpu_id; |
197 | 199 | ||
198 | ioapic_debug("dest=%x dest_mode=%x delivery_mode=%x " | 200 | ioapic_debug("dest=%x dest_mode=%x delivery_mode=%x " |
199 | "vector=%x trig_mode=%x", | 201 | "vector=%x trig_mode=%x\n", |
200 | dest, dest_mode, delivery_mode, vector, trig_mode); | 202 | dest, dest_mode, delivery_mode, vector, trig_mode); |
201 | 203 | ||
202 | deliver_bitmask = ioapic_get_delivery_bitmask(ioapic, dest, dest_mode); | 204 | deliver_bitmask = ioapic_get_delivery_bitmask(ioapic, dest, dest_mode); |
203 | if (!deliver_bitmask) { | 205 | if (!deliver_bitmask) { |
204 | ioapic_debug("no target on destination"); | 206 | ioapic_debug("no target on destination\n"); |
205 | return; | 207 | return; |
206 | } | 208 | } |
207 | 209 | ||
208 | switch (delivery_mode) { | 210 | switch (delivery_mode) { |
209 | case dest_LowestPrio: | 211 | case IOAPIC_LOWEST_PRIORITY: |
210 | target = | 212 | vcpu = kvm_get_lowest_prio_vcpu(ioapic->kvm, vector, |
211 | kvm_apic_round_robin(ioapic->kvm, vector, deliver_bitmask); | 213 | deliver_bitmask); |
212 | if (target != NULL) | 214 | if (vcpu != NULL) |
213 | ioapic_inj_irq(ioapic, target, vector, | 215 | ioapic_inj_irq(ioapic, vcpu, vector, |
214 | trig_mode, delivery_mode); | 216 | trig_mode, delivery_mode); |
215 | else | 217 | else |
216 | ioapic_debug("null round robin: " | 218 | ioapic_debug("null lowest prio vcpu: " |
217 | "mask=%x vector=%x delivery_mode=%x", | 219 | "mask=%x vector=%x delivery_mode=%x\n", |
218 | deliver_bitmask, vector, dest_LowestPrio); | 220 | deliver_bitmask, vector, IOAPIC_LOWEST_PRIORITY); |
219 | break; | 221 | break; |
220 | case dest_Fixed: | 222 | case IOAPIC_FIXED: |
221 | for (vcpu_id = 0; deliver_bitmask != 0; vcpu_id++) { | 223 | for (vcpu_id = 0; deliver_bitmask != 0; vcpu_id++) { |
222 | if (!(deliver_bitmask & (1 << vcpu_id))) | 224 | if (!(deliver_bitmask & (1 << vcpu_id))) |
223 | continue; | 225 | continue; |
224 | deliver_bitmask &= ~(1 << vcpu_id); | 226 | deliver_bitmask &= ~(1 << vcpu_id); |
225 | vcpu = ioapic->kvm->vcpus[vcpu_id]; | 227 | vcpu = ioapic->kvm->vcpus[vcpu_id]; |
226 | if (vcpu) { | 228 | if (vcpu) { |
227 | target = vcpu->apic; | 229 | ioapic_inj_irq(ioapic, vcpu, vector, |
228 | ioapic_inj_irq(ioapic, target, vector, | ||
229 | trig_mode, delivery_mode); | 230 | trig_mode, delivery_mode); |
230 | } | 231 | } |
231 | } | 232 | } |
@@ -271,7 +272,7 @@ static int get_eoi_gsi(struct kvm_ioapic *ioapic, int vector) | |||
271 | 272 | ||
272 | void kvm_ioapic_update_eoi(struct kvm *kvm, int vector) | 273 | void kvm_ioapic_update_eoi(struct kvm *kvm, int vector) |
273 | { | 274 | { |
274 | struct kvm_ioapic *ioapic = kvm->vioapic; | 275 | struct kvm_ioapic *ioapic = kvm->arch.vioapic; |
275 | union ioapic_redir_entry *ent; | 276 | union ioapic_redir_entry *ent; |
276 | int gsi; | 277 | int gsi; |
277 | 278 | ||
@@ -304,7 +305,7 @@ static void ioapic_mmio_read(struct kvm_io_device *this, gpa_t addr, int len, | |||
304 | struct kvm_ioapic *ioapic = (struct kvm_ioapic *)this->private; | 305 | struct kvm_ioapic *ioapic = (struct kvm_ioapic *)this->private; |
305 | u32 result; | 306 | u32 result; |
306 | 307 | ||
307 | ioapic_debug("addr %lx", (unsigned long)addr); | 308 | ioapic_debug("addr %lx\n", (unsigned long)addr); |
308 | ASSERT(!(addr & 0xf)); /* check alignment */ | 309 | ASSERT(!(addr & 0xf)); /* check alignment */ |
309 | 310 | ||
310 | addr &= 0xff; | 311 | addr &= 0xff; |
@@ -341,8 +342,8 @@ static void ioapic_mmio_write(struct kvm_io_device *this, gpa_t addr, int len, | |||
341 | struct kvm_ioapic *ioapic = (struct kvm_ioapic *)this->private; | 342 | struct kvm_ioapic *ioapic = (struct kvm_ioapic *)this->private; |
342 | u32 data; | 343 | u32 data; |
343 | 344 | ||
344 | ioapic_debug("ioapic_mmio_write addr=%lx len=%d val=%p\n", | 345 | ioapic_debug("ioapic_mmio_write addr=%p len=%d val=%p\n", |
345 | addr, len, val); | 346 | (void*)addr, len, val); |
346 | ASSERT(!(addr & 0xf)); /* check alignment */ | 347 | ASSERT(!(addr & 0xf)); /* check alignment */ |
347 | if (len == 4 || len == 8) | 348 | if (len == 4 || len == 8) |
348 | data = *(u32 *) val; | 349 | data = *(u32 *) val; |
@@ -360,24 +361,38 @@ static void ioapic_mmio_write(struct kvm_io_device *this, gpa_t addr, int len, | |||
360 | case IOAPIC_REG_WINDOW: | 361 | case IOAPIC_REG_WINDOW: |
361 | ioapic_write_indirect(ioapic, data); | 362 | ioapic_write_indirect(ioapic, data); |
362 | break; | 363 | break; |
364 | #ifdef CONFIG_IA64 | ||
365 | case IOAPIC_REG_EOI: | ||
366 | kvm_ioapic_update_eoi(ioapic->kvm, data); | ||
367 | break; | ||
368 | #endif | ||
363 | 369 | ||
364 | default: | 370 | default: |
365 | break; | 371 | break; |
366 | } | 372 | } |
367 | } | 373 | } |
368 | 374 | ||
375 | void kvm_ioapic_reset(struct kvm_ioapic *ioapic) | ||
376 | { | ||
377 | int i; | ||
378 | |||
379 | for (i = 0; i < IOAPIC_NUM_PINS; i++) | ||
380 | ioapic->redirtbl[i].fields.mask = 1; | ||
381 | ioapic->base_address = IOAPIC_DEFAULT_BASE_ADDRESS; | ||
382 | ioapic->ioregsel = 0; | ||
383 | ioapic->irr = 0; | ||
384 | ioapic->id = 0; | ||
385 | } | ||
386 | |||
369 | int kvm_ioapic_init(struct kvm *kvm) | 387 | int kvm_ioapic_init(struct kvm *kvm) |
370 | { | 388 | { |
371 | struct kvm_ioapic *ioapic; | 389 | struct kvm_ioapic *ioapic; |
372 | int i; | ||
373 | 390 | ||
374 | ioapic = kzalloc(sizeof(struct kvm_ioapic), GFP_KERNEL); | 391 | ioapic = kzalloc(sizeof(struct kvm_ioapic), GFP_KERNEL); |
375 | if (!ioapic) | 392 | if (!ioapic) |
376 | return -ENOMEM; | 393 | return -ENOMEM; |
377 | kvm->vioapic = ioapic; | 394 | kvm->arch.vioapic = ioapic; |
378 | for (i = 0; i < IOAPIC_NUM_PINS; i++) | 395 | kvm_ioapic_reset(ioapic); |
379 | ioapic->redirtbl[i].fields.mask = 1; | ||
380 | ioapic->base_address = IOAPIC_DEFAULT_BASE_ADDRESS; | ||
381 | ioapic->dev.read = ioapic_mmio_read; | 396 | ioapic->dev.read = ioapic_mmio_read; |
382 | ioapic->dev.write = ioapic_mmio_write; | 397 | ioapic->dev.write = ioapic_mmio_write; |
383 | ioapic->dev.in_range = ioapic_in_range; | 398 | ioapic->dev.in_range = ioapic_in_range; |
diff --git a/virt/kvm/ioapic.h b/virt/kvm/ioapic.h new file mode 100644 index 000000000000..7f16675fe783 --- /dev/null +++ b/virt/kvm/ioapic.h | |||
@@ -0,0 +1,95 @@ | |||
1 | #ifndef __KVM_IO_APIC_H | ||
2 | #define __KVM_IO_APIC_H | ||
3 | |||
4 | #include <linux/kvm_host.h> | ||
5 | |||
6 | #include "iodev.h" | ||
7 | |||
8 | struct kvm; | ||
9 | struct kvm_vcpu; | ||
10 | |||
11 | #define IOAPIC_NUM_PINS KVM_IOAPIC_NUM_PINS | ||
12 | #define IOAPIC_VERSION_ID 0x11 /* IOAPIC version */ | ||
13 | #define IOAPIC_EDGE_TRIG 0 | ||
14 | #define IOAPIC_LEVEL_TRIG 1 | ||
15 | |||
16 | #define IOAPIC_DEFAULT_BASE_ADDRESS 0xfec00000 | ||
17 | #define IOAPIC_MEM_LENGTH 0x100 | ||
18 | |||
19 | /* Direct registers. */ | ||
20 | #define IOAPIC_REG_SELECT 0x00 | ||
21 | #define IOAPIC_REG_WINDOW 0x10 | ||
22 | #define IOAPIC_REG_EOI 0x40 /* IA64 IOSAPIC only */ | ||
23 | |||
24 | /* Indirect registers. */ | ||
25 | #define IOAPIC_REG_APIC_ID 0x00 /* x86 IOAPIC only */ | ||
26 | #define IOAPIC_REG_VERSION 0x01 | ||
27 | #define IOAPIC_REG_ARB_ID 0x02 /* x86 IOAPIC only */ | ||
28 | |||
29 | /*ioapic delivery mode*/ | ||
30 | #define IOAPIC_FIXED 0x0 | ||
31 | #define IOAPIC_LOWEST_PRIORITY 0x1 | ||
32 | #define IOAPIC_PMI 0x2 | ||
33 | #define IOAPIC_NMI 0x4 | ||
34 | #define IOAPIC_INIT 0x5 | ||
35 | #define IOAPIC_EXTINT 0x7 | ||
36 | |||
37 | struct kvm_ioapic { | ||
38 | u64 base_address; | ||
39 | u32 ioregsel; | ||
40 | u32 id; | ||
41 | u32 irr; | ||
42 | u32 pad; | ||
43 | union ioapic_redir_entry { | ||
44 | u64 bits; | ||
45 | struct { | ||
46 | u8 vector; | ||
47 | u8 delivery_mode:3; | ||
48 | u8 dest_mode:1; | ||
49 | u8 delivery_status:1; | ||
50 | u8 polarity:1; | ||
51 | u8 remote_irr:1; | ||
52 | u8 trig_mode:1; | ||
53 | u8 mask:1; | ||
54 | u8 reserve:7; | ||
55 | u8 reserved[4]; | ||
56 | u8 dest_id; | ||
57 | } fields; | ||
58 | } redirtbl[IOAPIC_NUM_PINS]; | ||
59 | struct kvm_io_device dev; | ||
60 | struct kvm *kvm; | ||
61 | }; | ||
62 | |||
63 | #ifdef DEBUG | ||
64 | #define ASSERT(x) \ | ||
65 | do { \ | ||
66 | if (!(x)) { \ | ||
67 | printk(KERN_EMERG "assertion failed %s: %d: %s\n", \ | ||
68 | __FILE__, __LINE__, #x); \ | ||
69 | BUG(); \ | ||
70 | } \ | ||
71 | } while (0) | ||
72 | #else | ||
73 | #define ASSERT(x) do { } while (0) | ||
74 | #endif | ||
75 | |||
76 | static inline struct kvm_ioapic *ioapic_irqchip(struct kvm *kvm) | ||
77 | { | ||
78 | return kvm->arch.vioapic; | ||
79 | } | ||
80 | |||
81 | #ifdef CONFIG_IA64 | ||
82 | static inline int irqchip_in_kernel(struct kvm *kvm) | ||
83 | { | ||
84 | return 1; | ||
85 | } | ||
86 | #endif | ||
87 | |||
88 | struct kvm_vcpu *kvm_get_lowest_prio_vcpu(struct kvm *kvm, u8 vector, | ||
89 | unsigned long bitmap); | ||
90 | void kvm_ioapic_update_eoi(struct kvm *kvm, int vector); | ||
91 | int kvm_ioapic_init(struct kvm *kvm); | ||
92 | void kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level); | ||
93 | void kvm_ioapic_reset(struct kvm_ioapic *ioapic); | ||
94 | |||
95 | #endif | ||
diff --git a/virt/kvm/iodev.h b/virt/kvm/iodev.h new file mode 100644 index 000000000000..c14e642027b2 --- /dev/null +++ b/virt/kvm/iodev.h | |||
@@ -0,0 +1,63 @@ | |||
1 | /* | ||
2 | * This program is free software; you can redistribute it and/or modify | ||
3 | * it under the terms of the GNU General Public License as published by | ||
4 | * the Free Software Foundation; either version 2 of the License. | ||
5 | * | ||
6 | * This program is distributed in the hope that it will be useful, | ||
7 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
8 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
9 | * GNU General Public License for more details. | ||
10 | * | ||
11 | * You should have received a copy of the GNU General Public License | ||
12 | * along with this program; if not, write to the Free Software | ||
13 | * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. | ||
14 | */ | ||
15 | |||
16 | #ifndef __KVM_IODEV_H__ | ||
17 | #define __KVM_IODEV_H__ | ||
18 | |||
19 | #include <linux/kvm_types.h> | ||
20 | |||
21 | struct kvm_io_device { | ||
22 | void (*read)(struct kvm_io_device *this, | ||
23 | gpa_t addr, | ||
24 | int len, | ||
25 | void *val); | ||
26 | void (*write)(struct kvm_io_device *this, | ||
27 | gpa_t addr, | ||
28 | int len, | ||
29 | const void *val); | ||
30 | int (*in_range)(struct kvm_io_device *this, gpa_t addr); | ||
31 | void (*destructor)(struct kvm_io_device *this); | ||
32 | |||
33 | void *private; | ||
34 | }; | ||
35 | |||
36 | static inline void kvm_iodevice_read(struct kvm_io_device *dev, | ||
37 | gpa_t addr, | ||
38 | int len, | ||
39 | void *val) | ||
40 | { | ||
41 | dev->read(dev, addr, len, val); | ||
42 | } | ||
43 | |||
44 | static inline void kvm_iodevice_write(struct kvm_io_device *dev, | ||
45 | gpa_t addr, | ||
46 | int len, | ||
47 | const void *val) | ||
48 | { | ||
49 | dev->write(dev, addr, len, val); | ||
50 | } | ||
51 | |||
52 | static inline int kvm_iodevice_inrange(struct kvm_io_device *dev, gpa_t addr) | ||
53 | { | ||
54 | return dev->in_range(dev, addr); | ||
55 | } | ||
56 | |||
57 | static inline void kvm_iodevice_destructor(struct kvm_io_device *dev) | ||
58 | { | ||
59 | if (dev->destructor) | ||
60 | dev->destructor(dev); | ||
61 | } | ||
62 | |||
63 | #endif /* __KVM_IODEV_H__ */ | ||
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c new file mode 100644 index 000000000000..3c4fe26096fc --- /dev/null +++ b/virt/kvm/kvm_main.c | |||
@@ -0,0 +1,1400 @@ | |||
1 | /* | ||
2 | * Kernel-based Virtual Machine driver for Linux | ||
3 | * | ||
4 | * This module enables machines with Intel VT-x extensions to run virtual | ||
5 | * machines without emulation or binary translation. | ||
6 | * | ||
7 | * Copyright (C) 2006 Qumranet, Inc. | ||
8 | * | ||
9 | * Authors: | ||
10 | * Avi Kivity <avi@qumranet.com> | ||
11 | * Yaniv Kamay <yaniv@qumranet.com> | ||
12 | * | ||
13 | * This work is licensed under the terms of the GNU GPL, version 2. See | ||
14 | * the COPYING file in the top-level directory. | ||
15 | * | ||
16 | */ | ||
17 | |||
18 | #include "iodev.h" | ||
19 | |||
20 | #include <linux/kvm_host.h> | ||
21 | #include <linux/kvm.h> | ||
22 | #include <linux/module.h> | ||
23 | #include <linux/errno.h> | ||
24 | #include <linux/percpu.h> | ||
25 | #include <linux/gfp.h> | ||
26 | #include <linux/mm.h> | ||
27 | #include <linux/miscdevice.h> | ||
28 | #include <linux/vmalloc.h> | ||
29 | #include <linux/reboot.h> | ||
30 | #include <linux/debugfs.h> | ||
31 | #include <linux/highmem.h> | ||
32 | #include <linux/file.h> | ||
33 | #include <linux/sysdev.h> | ||
34 | #include <linux/cpu.h> | ||
35 | #include <linux/sched.h> | ||
36 | #include <linux/cpumask.h> | ||
37 | #include <linux/smp.h> | ||
38 | #include <linux/anon_inodes.h> | ||
39 | #include <linux/profile.h> | ||
40 | #include <linux/kvm_para.h> | ||
41 | #include <linux/pagemap.h> | ||
42 | #include <linux/mman.h> | ||
43 | |||
44 | #include <asm/processor.h> | ||
45 | #include <asm/io.h> | ||
46 | #include <asm/uaccess.h> | ||
47 | #include <asm/pgtable.h> | ||
48 | |||
49 | MODULE_AUTHOR("Qumranet"); | ||
50 | MODULE_LICENSE("GPL"); | ||
51 | |||
52 | DEFINE_SPINLOCK(kvm_lock); | ||
53 | LIST_HEAD(vm_list); | ||
54 | |||
55 | static cpumask_t cpus_hardware_enabled; | ||
56 | |||
57 | struct kmem_cache *kvm_vcpu_cache; | ||
58 | EXPORT_SYMBOL_GPL(kvm_vcpu_cache); | ||
59 | |||
60 | static __read_mostly struct preempt_ops kvm_preempt_ops; | ||
61 | |||
62 | static struct dentry *debugfs_dir; | ||
63 | |||
64 | static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl, | ||
65 | unsigned long arg); | ||
66 | |||
67 | static inline int valid_vcpu(int n) | ||
68 | { | ||
69 | return likely(n >= 0 && n < KVM_MAX_VCPUS); | ||
70 | } | ||
71 | |||
72 | /* | ||
73 | * Switches to specified vcpu, until a matching vcpu_put() | ||
74 | */ | ||
75 | void vcpu_load(struct kvm_vcpu *vcpu) | ||
76 | { | ||
77 | int cpu; | ||
78 | |||
79 | mutex_lock(&vcpu->mutex); | ||
80 | cpu = get_cpu(); | ||
81 | preempt_notifier_register(&vcpu->preempt_notifier); | ||
82 | kvm_arch_vcpu_load(vcpu, cpu); | ||
83 | put_cpu(); | ||
84 | } | ||
85 | |||
86 | void vcpu_put(struct kvm_vcpu *vcpu) | ||
87 | { | ||
88 | preempt_disable(); | ||
89 | kvm_arch_vcpu_put(vcpu); | ||
90 | preempt_notifier_unregister(&vcpu->preempt_notifier); | ||
91 | preempt_enable(); | ||
92 | mutex_unlock(&vcpu->mutex); | ||
93 | } | ||
94 | |||
95 | static void ack_flush(void *_completed) | ||
96 | { | ||
97 | } | ||
98 | |||
99 | void kvm_flush_remote_tlbs(struct kvm *kvm) | ||
100 | { | ||
101 | int i, cpu; | ||
102 | cpumask_t cpus; | ||
103 | struct kvm_vcpu *vcpu; | ||
104 | |||
105 | cpus_clear(cpus); | ||
106 | for (i = 0; i < KVM_MAX_VCPUS; ++i) { | ||
107 | vcpu = kvm->vcpus[i]; | ||
108 | if (!vcpu) | ||
109 | continue; | ||
110 | if (test_and_set_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests)) | ||
111 | continue; | ||
112 | cpu = vcpu->cpu; | ||
113 | if (cpu != -1 && cpu != raw_smp_processor_id()) | ||
114 | cpu_set(cpu, cpus); | ||
115 | } | ||
116 | if (cpus_empty(cpus)) | ||
117 | return; | ||
118 | ++kvm->stat.remote_tlb_flush; | ||
119 | smp_call_function_mask(cpus, ack_flush, NULL, 1); | ||
120 | } | ||
121 | |||
122 | int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) | ||
123 | { | ||
124 | struct page *page; | ||
125 | int r; | ||
126 | |||
127 | mutex_init(&vcpu->mutex); | ||
128 | vcpu->cpu = -1; | ||
129 | vcpu->kvm = kvm; | ||
130 | vcpu->vcpu_id = id; | ||
131 | init_waitqueue_head(&vcpu->wq); | ||
132 | |||
133 | page = alloc_page(GFP_KERNEL | __GFP_ZERO); | ||
134 | if (!page) { | ||
135 | r = -ENOMEM; | ||
136 | goto fail; | ||
137 | } | ||
138 | vcpu->run = page_address(page); | ||
139 | |||
140 | r = kvm_arch_vcpu_init(vcpu); | ||
141 | if (r < 0) | ||
142 | goto fail_free_run; | ||
143 | return 0; | ||
144 | |||
145 | fail_free_run: | ||
146 | free_page((unsigned long)vcpu->run); | ||
147 | fail: | ||
148 | return r; | ||
149 | } | ||
150 | EXPORT_SYMBOL_GPL(kvm_vcpu_init); | ||
151 | |||
152 | void kvm_vcpu_uninit(struct kvm_vcpu *vcpu) | ||
153 | { | ||
154 | kvm_arch_vcpu_uninit(vcpu); | ||
155 | free_page((unsigned long)vcpu->run); | ||
156 | } | ||
157 | EXPORT_SYMBOL_GPL(kvm_vcpu_uninit); | ||
158 | |||
159 | static struct kvm *kvm_create_vm(void) | ||
160 | { | ||
161 | struct kvm *kvm = kvm_arch_create_vm(); | ||
162 | |||
163 | if (IS_ERR(kvm)) | ||
164 | goto out; | ||
165 | |||
166 | kvm->mm = current->mm; | ||
167 | atomic_inc(&kvm->mm->mm_count); | ||
168 | spin_lock_init(&kvm->mmu_lock); | ||
169 | kvm_io_bus_init(&kvm->pio_bus); | ||
170 | mutex_init(&kvm->lock); | ||
171 | kvm_io_bus_init(&kvm->mmio_bus); | ||
172 | spin_lock(&kvm_lock); | ||
173 | list_add(&kvm->vm_list, &vm_list); | ||
174 | spin_unlock(&kvm_lock); | ||
175 | out: | ||
176 | return kvm; | ||
177 | } | ||
178 | |||
179 | /* | ||
180 | * Free any memory in @free but not in @dont. | ||
181 | */ | ||
182 | static void kvm_free_physmem_slot(struct kvm_memory_slot *free, | ||
183 | struct kvm_memory_slot *dont) | ||
184 | { | ||
185 | if (!dont || free->rmap != dont->rmap) | ||
186 | vfree(free->rmap); | ||
187 | |||
188 | if (!dont || free->dirty_bitmap != dont->dirty_bitmap) | ||
189 | vfree(free->dirty_bitmap); | ||
190 | |||
191 | free->npages = 0; | ||
192 | free->dirty_bitmap = NULL; | ||
193 | free->rmap = NULL; | ||
194 | } | ||
195 | |||
196 | void kvm_free_physmem(struct kvm *kvm) | ||
197 | { | ||
198 | int i; | ||
199 | |||
200 | for (i = 0; i < kvm->nmemslots; ++i) | ||
201 | kvm_free_physmem_slot(&kvm->memslots[i], NULL); | ||
202 | } | ||
203 | |||
204 | static void kvm_destroy_vm(struct kvm *kvm) | ||
205 | { | ||
206 | struct mm_struct *mm = kvm->mm; | ||
207 | |||
208 | spin_lock(&kvm_lock); | ||
209 | list_del(&kvm->vm_list); | ||
210 | spin_unlock(&kvm_lock); | ||
211 | kvm_io_bus_destroy(&kvm->pio_bus); | ||
212 | kvm_io_bus_destroy(&kvm->mmio_bus); | ||
213 | kvm_arch_destroy_vm(kvm); | ||
214 | mmdrop(mm); | ||
215 | } | ||
216 | |||
217 | static int kvm_vm_release(struct inode *inode, struct file *filp) | ||
218 | { | ||
219 | struct kvm *kvm = filp->private_data; | ||
220 | |||
221 | kvm_destroy_vm(kvm); | ||
222 | return 0; | ||
223 | } | ||
224 | |||
225 | /* | ||
226 | * Allocate some memory and give it an address in the guest physical address | ||
227 | * space. | ||
228 | * | ||
229 | * Discontiguous memory is allowed, mostly for framebuffers. | ||
230 | * | ||
231 | * Must be called holding mmap_sem for write. | ||
232 | */ | ||
233 | int __kvm_set_memory_region(struct kvm *kvm, | ||
234 | struct kvm_userspace_memory_region *mem, | ||
235 | int user_alloc) | ||
236 | { | ||
237 | int r; | ||
238 | gfn_t base_gfn; | ||
239 | unsigned long npages; | ||
240 | unsigned long i; | ||
241 | struct kvm_memory_slot *memslot; | ||
242 | struct kvm_memory_slot old, new; | ||
243 | |||
244 | r = -EINVAL; | ||
245 | /* General sanity checks */ | ||
246 | if (mem->memory_size & (PAGE_SIZE - 1)) | ||
247 | goto out; | ||
248 | if (mem->guest_phys_addr & (PAGE_SIZE - 1)) | ||
249 | goto out; | ||
250 | if (mem->slot >= KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS) | ||
251 | goto out; | ||
252 | if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr) | ||
253 | goto out; | ||
254 | |||
255 | memslot = &kvm->memslots[mem->slot]; | ||
256 | base_gfn = mem->guest_phys_addr >> PAGE_SHIFT; | ||
257 | npages = mem->memory_size >> PAGE_SHIFT; | ||
258 | |||
259 | if (!npages) | ||
260 | mem->flags &= ~KVM_MEM_LOG_DIRTY_PAGES; | ||
261 | |||
262 | new = old = *memslot; | ||
263 | |||
264 | new.base_gfn = base_gfn; | ||
265 | new.npages = npages; | ||
266 | new.flags = mem->flags; | ||
267 | |||
268 | /* Disallow changing a memory slot's size. */ | ||
269 | r = -EINVAL; | ||
270 | if (npages && old.npages && npages != old.npages) | ||
271 | goto out_free; | ||
272 | |||
273 | /* Check for overlaps */ | ||
274 | r = -EEXIST; | ||
275 | for (i = 0; i < KVM_MEMORY_SLOTS; ++i) { | ||
276 | struct kvm_memory_slot *s = &kvm->memslots[i]; | ||
277 | |||
278 | if (s == memslot) | ||
279 | continue; | ||
280 | if (!((base_gfn + npages <= s->base_gfn) || | ||
281 | (base_gfn >= s->base_gfn + s->npages))) | ||
282 | goto out_free; | ||
283 | } | ||
284 | |||
285 | /* Free page dirty bitmap if unneeded */ | ||
286 | if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES)) | ||
287 | new.dirty_bitmap = NULL; | ||
288 | |||
289 | r = -ENOMEM; | ||
290 | |||
291 | /* Allocate if a slot is being created */ | ||
292 | if (npages && !new.rmap) { | ||
293 | new.rmap = vmalloc(npages * sizeof(struct page *)); | ||
294 | |||
295 | if (!new.rmap) | ||
296 | goto out_free; | ||
297 | |||
298 | memset(new.rmap, 0, npages * sizeof(*new.rmap)); | ||
299 | |||
300 | new.user_alloc = user_alloc; | ||
301 | new.userspace_addr = mem->userspace_addr; | ||
302 | } | ||
303 | |||
304 | /* Allocate page dirty bitmap if needed */ | ||
305 | if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) { | ||
306 | unsigned dirty_bytes = ALIGN(npages, BITS_PER_LONG) / 8; | ||
307 | |||
308 | new.dirty_bitmap = vmalloc(dirty_bytes); | ||
309 | if (!new.dirty_bitmap) | ||
310 | goto out_free; | ||
311 | memset(new.dirty_bitmap, 0, dirty_bytes); | ||
312 | } | ||
313 | |||
314 | if (mem->slot >= kvm->nmemslots) | ||
315 | kvm->nmemslots = mem->slot + 1; | ||
316 | |||
317 | *memslot = new; | ||
318 | |||
319 | r = kvm_arch_set_memory_region(kvm, mem, old, user_alloc); | ||
320 | if (r) { | ||
321 | *memslot = old; | ||
322 | goto out_free; | ||
323 | } | ||
324 | |||
325 | kvm_free_physmem_slot(&old, &new); | ||
326 | return 0; | ||
327 | |||
328 | out_free: | ||
329 | kvm_free_physmem_slot(&new, &old); | ||
330 | out: | ||
331 | return r; | ||
332 | |||
333 | } | ||
334 | EXPORT_SYMBOL_GPL(__kvm_set_memory_region); | ||
335 | |||
336 | int kvm_set_memory_region(struct kvm *kvm, | ||
337 | struct kvm_userspace_memory_region *mem, | ||
338 | int user_alloc) | ||
339 | { | ||
340 | int r; | ||
341 | |||
342 | down_write(¤t->mm->mmap_sem); | ||
343 | r = __kvm_set_memory_region(kvm, mem, user_alloc); | ||
344 | up_write(¤t->mm->mmap_sem); | ||
345 | return r; | ||
346 | } | ||
347 | EXPORT_SYMBOL_GPL(kvm_set_memory_region); | ||
348 | |||
349 | int kvm_vm_ioctl_set_memory_region(struct kvm *kvm, | ||
350 | struct | ||
351 | kvm_userspace_memory_region *mem, | ||
352 | int user_alloc) | ||
353 | { | ||
354 | if (mem->slot >= KVM_MEMORY_SLOTS) | ||
355 | return -EINVAL; | ||
356 | return kvm_set_memory_region(kvm, mem, user_alloc); | ||
357 | } | ||
358 | |||
359 | int kvm_get_dirty_log(struct kvm *kvm, | ||
360 | struct kvm_dirty_log *log, int *is_dirty) | ||
361 | { | ||
362 | struct kvm_memory_slot *memslot; | ||
363 | int r, i; | ||
364 | int n; | ||
365 | unsigned long any = 0; | ||
366 | |||
367 | r = -EINVAL; | ||
368 | if (log->slot >= KVM_MEMORY_SLOTS) | ||
369 | goto out; | ||
370 | |||
371 | memslot = &kvm->memslots[log->slot]; | ||
372 | r = -ENOENT; | ||
373 | if (!memslot->dirty_bitmap) | ||
374 | goto out; | ||
375 | |||
376 | n = ALIGN(memslot->npages, BITS_PER_LONG) / 8; | ||
377 | |||
378 | for (i = 0; !any && i < n/sizeof(long); ++i) | ||
379 | any = memslot->dirty_bitmap[i]; | ||
380 | |||
381 | r = -EFAULT; | ||
382 | if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n)) | ||
383 | goto out; | ||
384 | |||
385 | if (any) | ||
386 | *is_dirty = 1; | ||
387 | |||
388 | r = 0; | ||
389 | out: | ||
390 | return r; | ||
391 | } | ||
392 | |||
393 | int is_error_page(struct page *page) | ||
394 | { | ||
395 | return page == bad_page; | ||
396 | } | ||
397 | EXPORT_SYMBOL_GPL(is_error_page); | ||
398 | |||
399 | static inline unsigned long bad_hva(void) | ||
400 | { | ||
401 | return PAGE_OFFSET; | ||
402 | } | ||
403 | |||
404 | int kvm_is_error_hva(unsigned long addr) | ||
405 | { | ||
406 | return addr == bad_hva(); | ||
407 | } | ||
408 | EXPORT_SYMBOL_GPL(kvm_is_error_hva); | ||
409 | |||
410 | static struct kvm_memory_slot *__gfn_to_memslot(struct kvm *kvm, gfn_t gfn) | ||
411 | { | ||
412 | int i; | ||
413 | |||
414 | for (i = 0; i < kvm->nmemslots; ++i) { | ||
415 | struct kvm_memory_slot *memslot = &kvm->memslots[i]; | ||
416 | |||
417 | if (gfn >= memslot->base_gfn | ||
418 | && gfn < memslot->base_gfn + memslot->npages) | ||
419 | return memslot; | ||
420 | } | ||
421 | return NULL; | ||
422 | } | ||
423 | |||
424 | struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn) | ||
425 | { | ||
426 | gfn = unalias_gfn(kvm, gfn); | ||
427 | return __gfn_to_memslot(kvm, gfn); | ||
428 | } | ||
429 | |||
430 | int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn) | ||
431 | { | ||
432 | int i; | ||
433 | |||
434 | gfn = unalias_gfn(kvm, gfn); | ||
435 | for (i = 0; i < KVM_MEMORY_SLOTS; ++i) { | ||
436 | struct kvm_memory_slot *memslot = &kvm->memslots[i]; | ||
437 | |||
438 | if (gfn >= memslot->base_gfn | ||
439 | && gfn < memslot->base_gfn + memslot->npages) | ||
440 | return 1; | ||
441 | } | ||
442 | return 0; | ||
443 | } | ||
444 | EXPORT_SYMBOL_GPL(kvm_is_visible_gfn); | ||
445 | |||
446 | static unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn) | ||
447 | { | ||
448 | struct kvm_memory_slot *slot; | ||
449 | |||
450 | gfn = unalias_gfn(kvm, gfn); | ||
451 | slot = __gfn_to_memslot(kvm, gfn); | ||
452 | if (!slot) | ||
453 | return bad_hva(); | ||
454 | return (slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE); | ||
455 | } | ||
456 | |||
457 | /* | ||
458 | * Requires current->mm->mmap_sem to be held | ||
459 | */ | ||
460 | struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn) | ||
461 | { | ||
462 | struct page *page[1]; | ||
463 | unsigned long addr; | ||
464 | int npages; | ||
465 | |||
466 | might_sleep(); | ||
467 | |||
468 | addr = gfn_to_hva(kvm, gfn); | ||
469 | if (kvm_is_error_hva(addr)) { | ||
470 | get_page(bad_page); | ||
471 | return bad_page; | ||
472 | } | ||
473 | |||
474 | npages = get_user_pages(current, current->mm, addr, 1, 1, 1, page, | ||
475 | NULL); | ||
476 | |||
477 | if (npages != 1) { | ||
478 | get_page(bad_page); | ||
479 | return bad_page; | ||
480 | } | ||
481 | |||
482 | return page[0]; | ||
483 | } | ||
484 | |||
485 | EXPORT_SYMBOL_GPL(gfn_to_page); | ||
486 | |||
487 | void kvm_release_page_clean(struct page *page) | ||
488 | { | ||
489 | put_page(page); | ||
490 | } | ||
491 | EXPORT_SYMBOL_GPL(kvm_release_page_clean); | ||
492 | |||
493 | void kvm_release_page_dirty(struct page *page) | ||
494 | { | ||
495 | if (!PageReserved(page)) | ||
496 | SetPageDirty(page); | ||
497 | put_page(page); | ||
498 | } | ||
499 | EXPORT_SYMBOL_GPL(kvm_release_page_dirty); | ||
500 | |||
501 | static int next_segment(unsigned long len, int offset) | ||
502 | { | ||
503 | if (len > PAGE_SIZE - offset) | ||
504 | return PAGE_SIZE - offset; | ||
505 | else | ||
506 | return len; | ||
507 | } | ||
508 | |||
509 | int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset, | ||
510 | int len) | ||
511 | { | ||
512 | int r; | ||
513 | unsigned long addr; | ||
514 | |||
515 | addr = gfn_to_hva(kvm, gfn); | ||
516 | if (kvm_is_error_hva(addr)) | ||
517 | return -EFAULT; | ||
518 | r = copy_from_user(data, (void __user *)addr + offset, len); | ||
519 | if (r) | ||
520 | return -EFAULT; | ||
521 | return 0; | ||
522 | } | ||
523 | EXPORT_SYMBOL_GPL(kvm_read_guest_page); | ||
524 | |||
525 | int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len) | ||
526 | { | ||
527 | gfn_t gfn = gpa >> PAGE_SHIFT; | ||
528 | int seg; | ||
529 | int offset = offset_in_page(gpa); | ||
530 | int ret; | ||
531 | |||
532 | while ((seg = next_segment(len, offset)) != 0) { | ||
533 | ret = kvm_read_guest_page(kvm, gfn, data, offset, seg); | ||
534 | if (ret < 0) | ||
535 | return ret; | ||
536 | offset = 0; | ||
537 | len -= seg; | ||
538 | data += seg; | ||
539 | ++gfn; | ||
540 | } | ||
541 | return 0; | ||
542 | } | ||
543 | EXPORT_SYMBOL_GPL(kvm_read_guest); | ||
544 | |||
545 | int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data, | ||
546 | unsigned long len) | ||
547 | { | ||
548 | int r; | ||
549 | unsigned long addr; | ||
550 | gfn_t gfn = gpa >> PAGE_SHIFT; | ||
551 | int offset = offset_in_page(gpa); | ||
552 | |||
553 | addr = gfn_to_hva(kvm, gfn); | ||
554 | if (kvm_is_error_hva(addr)) | ||
555 | return -EFAULT; | ||
556 | r = __copy_from_user_inatomic(data, (void __user *)addr + offset, len); | ||
557 | if (r) | ||
558 | return -EFAULT; | ||
559 | return 0; | ||
560 | } | ||
561 | EXPORT_SYMBOL(kvm_read_guest_atomic); | ||
562 | |||
563 | int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data, | ||
564 | int offset, int len) | ||
565 | { | ||
566 | int r; | ||
567 | unsigned long addr; | ||
568 | |||
569 | addr = gfn_to_hva(kvm, gfn); | ||
570 | if (kvm_is_error_hva(addr)) | ||
571 | return -EFAULT; | ||
572 | r = copy_to_user((void __user *)addr + offset, data, len); | ||
573 | if (r) | ||
574 | return -EFAULT; | ||
575 | mark_page_dirty(kvm, gfn); | ||
576 | return 0; | ||
577 | } | ||
578 | EXPORT_SYMBOL_GPL(kvm_write_guest_page); | ||
579 | |||
580 | int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data, | ||
581 | unsigned long len) | ||
582 | { | ||
583 | gfn_t gfn = gpa >> PAGE_SHIFT; | ||
584 | int seg; | ||
585 | int offset = offset_in_page(gpa); | ||
586 | int ret; | ||
587 | |||
588 | while ((seg = next_segment(len, offset)) != 0) { | ||
589 | ret = kvm_write_guest_page(kvm, gfn, data, offset, seg); | ||
590 | if (ret < 0) | ||
591 | return ret; | ||
592 | offset = 0; | ||
593 | len -= seg; | ||
594 | data += seg; | ||
595 | ++gfn; | ||
596 | } | ||
597 | return 0; | ||
598 | } | ||
599 | |||
600 | int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len) | ||
601 | { | ||
602 | return kvm_write_guest_page(kvm, gfn, empty_zero_page, offset, len); | ||
603 | } | ||
604 | EXPORT_SYMBOL_GPL(kvm_clear_guest_page); | ||
605 | |||
606 | int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len) | ||
607 | { | ||
608 | gfn_t gfn = gpa >> PAGE_SHIFT; | ||
609 | int seg; | ||
610 | int offset = offset_in_page(gpa); | ||
611 | int ret; | ||
612 | |||
613 | while ((seg = next_segment(len, offset)) != 0) { | ||
614 | ret = kvm_clear_guest_page(kvm, gfn, offset, seg); | ||
615 | if (ret < 0) | ||
616 | return ret; | ||
617 | offset = 0; | ||
618 | len -= seg; | ||
619 | ++gfn; | ||
620 | } | ||
621 | return 0; | ||
622 | } | ||
623 | EXPORT_SYMBOL_GPL(kvm_clear_guest); | ||
624 | |||
625 | void mark_page_dirty(struct kvm *kvm, gfn_t gfn) | ||
626 | { | ||
627 | struct kvm_memory_slot *memslot; | ||
628 | |||
629 | gfn = unalias_gfn(kvm, gfn); | ||
630 | memslot = __gfn_to_memslot(kvm, gfn); | ||
631 | if (memslot && memslot->dirty_bitmap) { | ||
632 | unsigned long rel_gfn = gfn - memslot->base_gfn; | ||
633 | |||
634 | /* avoid RMW */ | ||
635 | if (!test_bit(rel_gfn, memslot->dirty_bitmap)) | ||
636 | set_bit(rel_gfn, memslot->dirty_bitmap); | ||
637 | } | ||
638 | } | ||
639 | |||
640 | /* | ||
641 | * The vCPU has executed a HLT instruction with in-kernel mode enabled. | ||
642 | */ | ||
643 | void kvm_vcpu_block(struct kvm_vcpu *vcpu) | ||
644 | { | ||
645 | DECLARE_WAITQUEUE(wait, current); | ||
646 | |||
647 | add_wait_queue(&vcpu->wq, &wait); | ||
648 | |||
649 | /* | ||
650 | * We will block until either an interrupt or a signal wakes us up | ||
651 | */ | ||
652 | while (!kvm_cpu_has_interrupt(vcpu) | ||
653 | && !signal_pending(current) | ||
654 | && !kvm_arch_vcpu_runnable(vcpu)) { | ||
655 | set_current_state(TASK_INTERRUPTIBLE); | ||
656 | vcpu_put(vcpu); | ||
657 | schedule(); | ||
658 | vcpu_load(vcpu); | ||
659 | } | ||
660 | |||
661 | __set_current_state(TASK_RUNNING); | ||
662 | remove_wait_queue(&vcpu->wq, &wait); | ||
663 | } | ||
664 | |||
665 | void kvm_resched(struct kvm_vcpu *vcpu) | ||
666 | { | ||
667 | if (!need_resched()) | ||
668 | return; | ||
669 | cond_resched(); | ||
670 | } | ||
671 | EXPORT_SYMBOL_GPL(kvm_resched); | ||
672 | |||
673 | static int kvm_vcpu_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | ||
674 | { | ||
675 | struct kvm_vcpu *vcpu = vma->vm_file->private_data; | ||
676 | struct page *page; | ||
677 | |||
678 | if (vmf->pgoff == 0) | ||
679 | page = virt_to_page(vcpu->run); | ||
680 | else if (vmf->pgoff == KVM_PIO_PAGE_OFFSET) | ||
681 | page = virt_to_page(vcpu->arch.pio_data); | ||
682 | else | ||
683 | return VM_FAULT_SIGBUS; | ||
684 | get_page(page); | ||
685 | vmf->page = page; | ||
686 | return 0; | ||
687 | } | ||
688 | |||
689 | static struct vm_operations_struct kvm_vcpu_vm_ops = { | ||
690 | .fault = kvm_vcpu_fault, | ||
691 | }; | ||
692 | |||
693 | static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma) | ||
694 | { | ||
695 | vma->vm_ops = &kvm_vcpu_vm_ops; | ||
696 | return 0; | ||
697 | } | ||
698 | |||
699 | static int kvm_vcpu_release(struct inode *inode, struct file *filp) | ||
700 | { | ||
701 | struct kvm_vcpu *vcpu = filp->private_data; | ||
702 | |||
703 | fput(vcpu->kvm->filp); | ||
704 | return 0; | ||
705 | } | ||
706 | |||
707 | static struct file_operations kvm_vcpu_fops = { | ||
708 | .release = kvm_vcpu_release, | ||
709 | .unlocked_ioctl = kvm_vcpu_ioctl, | ||
710 | .compat_ioctl = kvm_vcpu_ioctl, | ||
711 | .mmap = kvm_vcpu_mmap, | ||
712 | }; | ||
713 | |||
714 | /* | ||
715 | * Allocates an inode for the vcpu. | ||
716 | */ | ||
717 | static int create_vcpu_fd(struct kvm_vcpu *vcpu) | ||
718 | { | ||
719 | int fd, r; | ||
720 | struct inode *inode; | ||
721 | struct file *file; | ||
722 | |||
723 | r = anon_inode_getfd(&fd, &inode, &file, | ||
724 | "kvm-vcpu", &kvm_vcpu_fops, vcpu); | ||
725 | if (r) | ||
726 | return r; | ||
727 | atomic_inc(&vcpu->kvm->filp->f_count); | ||
728 | return fd; | ||
729 | } | ||
730 | |||
731 | /* | ||
732 | * Creates some virtual cpus. Good luck creating more than one. | ||
733 | */ | ||
734 | static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int n) | ||
735 | { | ||
736 | int r; | ||
737 | struct kvm_vcpu *vcpu; | ||
738 | |||
739 | if (!valid_vcpu(n)) | ||
740 | return -EINVAL; | ||
741 | |||
742 | vcpu = kvm_arch_vcpu_create(kvm, n); | ||
743 | if (IS_ERR(vcpu)) | ||
744 | return PTR_ERR(vcpu); | ||
745 | |||
746 | preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops); | ||
747 | |||
748 | r = kvm_arch_vcpu_setup(vcpu); | ||
749 | if (r) | ||
750 | goto vcpu_destroy; | ||
751 | |||
752 | mutex_lock(&kvm->lock); | ||
753 | if (kvm->vcpus[n]) { | ||
754 | r = -EEXIST; | ||
755 | mutex_unlock(&kvm->lock); | ||
756 | goto vcpu_destroy; | ||
757 | } | ||
758 | kvm->vcpus[n] = vcpu; | ||
759 | mutex_unlock(&kvm->lock); | ||
760 | |||
761 | /* Now it's all set up, let userspace reach it */ | ||
762 | r = create_vcpu_fd(vcpu); | ||
763 | if (r < 0) | ||
764 | goto unlink; | ||
765 | return r; | ||
766 | |||
767 | unlink: | ||
768 | mutex_lock(&kvm->lock); | ||
769 | kvm->vcpus[n] = NULL; | ||
770 | mutex_unlock(&kvm->lock); | ||
771 | vcpu_destroy: | ||
772 | kvm_arch_vcpu_destroy(vcpu); | ||
773 | return r; | ||
774 | } | ||
775 | |||
776 | static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset) | ||
777 | { | ||
778 | if (sigset) { | ||
779 | sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP)); | ||
780 | vcpu->sigset_active = 1; | ||
781 | vcpu->sigset = *sigset; | ||
782 | } else | ||
783 | vcpu->sigset_active = 0; | ||
784 | return 0; | ||
785 | } | ||
786 | |||
787 | static long kvm_vcpu_ioctl(struct file *filp, | ||
788 | unsigned int ioctl, unsigned long arg) | ||
789 | { | ||
790 | struct kvm_vcpu *vcpu = filp->private_data; | ||
791 | void __user *argp = (void __user *)arg; | ||
792 | int r; | ||
793 | |||
794 | if (vcpu->kvm->mm != current->mm) | ||
795 | return -EIO; | ||
796 | switch (ioctl) { | ||
797 | case KVM_RUN: | ||
798 | r = -EINVAL; | ||
799 | if (arg) | ||
800 | goto out; | ||
801 | r = kvm_arch_vcpu_ioctl_run(vcpu, vcpu->run); | ||
802 | break; | ||
803 | case KVM_GET_REGS: { | ||
804 | struct kvm_regs kvm_regs; | ||
805 | |||
806 | memset(&kvm_regs, 0, sizeof kvm_regs); | ||
807 | r = kvm_arch_vcpu_ioctl_get_regs(vcpu, &kvm_regs); | ||
808 | if (r) | ||
809 | goto out; | ||
810 | r = -EFAULT; | ||
811 | if (copy_to_user(argp, &kvm_regs, sizeof kvm_regs)) | ||
812 | goto out; | ||
813 | r = 0; | ||
814 | break; | ||
815 | } | ||
816 | case KVM_SET_REGS: { | ||
817 | struct kvm_regs kvm_regs; | ||
818 | |||
819 | r = -EFAULT; | ||
820 | if (copy_from_user(&kvm_regs, argp, sizeof kvm_regs)) | ||
821 | goto out; | ||
822 | r = kvm_arch_vcpu_ioctl_set_regs(vcpu, &kvm_regs); | ||
823 | if (r) | ||
824 | goto out; | ||
825 | r = 0; | ||
826 | break; | ||
827 | } | ||
828 | case KVM_GET_SREGS: { | ||
829 | struct kvm_sregs kvm_sregs; | ||
830 | |||
831 | memset(&kvm_sregs, 0, sizeof kvm_sregs); | ||
832 | r = kvm_arch_vcpu_ioctl_get_sregs(vcpu, &kvm_sregs); | ||
833 | if (r) | ||
834 | goto out; | ||
835 | r = -EFAULT; | ||
836 | if (copy_to_user(argp, &kvm_sregs, sizeof kvm_sregs)) | ||
837 | goto out; | ||
838 | r = 0; | ||
839 | break; | ||
840 | } | ||
841 | case KVM_SET_SREGS: { | ||
842 | struct kvm_sregs kvm_sregs; | ||
843 | |||
844 | r = -EFAULT; | ||
845 | if (copy_from_user(&kvm_sregs, argp, sizeof kvm_sregs)) | ||
846 | goto out; | ||
847 | r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, &kvm_sregs); | ||
848 | if (r) | ||
849 | goto out; | ||
850 | r = 0; | ||
851 | break; | ||
852 | } | ||
853 | case KVM_TRANSLATE: { | ||
854 | struct kvm_translation tr; | ||
855 | |||
856 | r = -EFAULT; | ||
857 | if (copy_from_user(&tr, argp, sizeof tr)) | ||
858 | goto out; | ||
859 | r = kvm_arch_vcpu_ioctl_translate(vcpu, &tr); | ||
860 | if (r) | ||
861 | goto out; | ||
862 | r = -EFAULT; | ||
863 | if (copy_to_user(argp, &tr, sizeof tr)) | ||
864 | goto out; | ||
865 | r = 0; | ||
866 | break; | ||
867 | } | ||
868 | case KVM_DEBUG_GUEST: { | ||
869 | struct kvm_debug_guest dbg; | ||
870 | |||
871 | r = -EFAULT; | ||
872 | if (copy_from_user(&dbg, argp, sizeof dbg)) | ||
873 | goto out; | ||
874 | r = kvm_arch_vcpu_ioctl_debug_guest(vcpu, &dbg); | ||
875 | if (r) | ||
876 | goto out; | ||
877 | r = 0; | ||
878 | break; | ||
879 | } | ||
880 | case KVM_SET_SIGNAL_MASK: { | ||
881 | struct kvm_signal_mask __user *sigmask_arg = argp; | ||
882 | struct kvm_signal_mask kvm_sigmask; | ||
883 | sigset_t sigset, *p; | ||
884 | |||
885 | p = NULL; | ||
886 | if (argp) { | ||
887 | r = -EFAULT; | ||
888 | if (copy_from_user(&kvm_sigmask, argp, | ||
889 | sizeof kvm_sigmask)) | ||
890 | goto out; | ||
891 | r = -EINVAL; | ||
892 | if (kvm_sigmask.len != sizeof sigset) | ||
893 | goto out; | ||
894 | r = -EFAULT; | ||
895 | if (copy_from_user(&sigset, sigmask_arg->sigset, | ||
896 | sizeof sigset)) | ||
897 | goto out; | ||
898 | p = &sigset; | ||
899 | } | ||
900 | r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset); | ||
901 | break; | ||
902 | } | ||
903 | case KVM_GET_FPU: { | ||
904 | struct kvm_fpu fpu; | ||
905 | |||
906 | memset(&fpu, 0, sizeof fpu); | ||
907 | r = kvm_arch_vcpu_ioctl_get_fpu(vcpu, &fpu); | ||
908 | if (r) | ||
909 | goto out; | ||
910 | r = -EFAULT; | ||
911 | if (copy_to_user(argp, &fpu, sizeof fpu)) | ||
912 | goto out; | ||
913 | r = 0; | ||
914 | break; | ||
915 | } | ||
916 | case KVM_SET_FPU: { | ||
917 | struct kvm_fpu fpu; | ||
918 | |||
919 | r = -EFAULT; | ||
920 | if (copy_from_user(&fpu, argp, sizeof fpu)) | ||
921 | goto out; | ||
922 | r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, &fpu); | ||
923 | if (r) | ||
924 | goto out; | ||
925 | r = 0; | ||
926 | break; | ||
927 | } | ||
928 | default: | ||
929 | r = kvm_arch_vcpu_ioctl(filp, ioctl, arg); | ||
930 | } | ||
931 | out: | ||
932 | return r; | ||
933 | } | ||
934 | |||
static long kvm_vm_ioctl(struct file *filp,
                         unsigned int ioctl, unsigned long arg)
{
        struct kvm *kvm = filp->private_data;
        void __user *argp = (void __user *)arg;
        int r;

        if (kvm->mm != current->mm)
                return -EIO;
        switch (ioctl) {
        case KVM_CREATE_VCPU:
                r = kvm_vm_ioctl_create_vcpu(kvm, arg);
                if (r < 0)
                        goto out;
                break;
        case KVM_SET_USER_MEMORY_REGION: {
                struct kvm_userspace_memory_region kvm_userspace_mem;

                r = -EFAULT;
                if (copy_from_user(&kvm_userspace_mem, argp,
                                   sizeof kvm_userspace_mem))
                        goto out;

                r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 1);
                if (r)
                        goto out;
                break;
        }
        case KVM_GET_DIRTY_LOG: {
                struct kvm_dirty_log log;

                r = -EFAULT;
                if (copy_from_user(&log, argp, sizeof log))
                        goto out;
                r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
                if (r)
                        goto out;
                break;
        }
        default:
                r = kvm_arch_vm_ioctl(filp, ioctl, arg);
        }
out:
        return r;
}

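/*
 * Fault handler backing mmap() of a VM fd: the page offset is
 * interpreted as a guest frame number, and frames that are invisible
 * to the guest (or map to the error page) raise SIGBUS.
 */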
static int kvm_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
        struct kvm *kvm = vma->vm_file->private_data;
        struct page *page;

        if (!kvm_is_visible_gfn(kvm, vmf->pgoff))
                return VM_FAULT_SIGBUS;
        page = gfn_to_page(kvm, vmf->pgoff);
        if (is_error_page(page)) {
                kvm_release_page_clean(page);
                return VM_FAULT_SIGBUS;
        }
        vmf->page = page;
        return 0;
}

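/*
 * mmap() of a VM fd sets up no pages eagerly; everything is faulted in
 * through kvm_vm_fault() above.
 */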
static struct vm_operations_struct kvm_vm_vm_ops = {
        .fault = kvm_vm_fault,
};

static int kvm_vm_mmap(struct file *file, struct vm_area_struct *vma)
{
        vma->vm_ops = &kvm_vm_vm_ops;
        return 0;
}

static struct file_operations kvm_vm_fops = {
        .release        = kvm_vm_release,
        .unlocked_ioctl = kvm_vm_ioctl,
        .compat_ioctl   = kvm_vm_ioctl,
        .mmap           = kvm_vm_mmap,
};

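/*
 * KVM_CREATE_VM: materialize a struct kvm and wrap it in an anonymous
 * inode so that the VM's lifetime follows ordinary fd refcounting.
 * Returns the new fd, or a negative errno.
 */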
static int kvm_dev_ioctl_create_vm(void)
{
        int fd, r;
        struct inode *inode;
        struct file *file;
        struct kvm *kvm;

        kvm = kvm_create_vm();
        if (IS_ERR(kvm))
                return PTR_ERR(kvm);
        r = anon_inode_getfd(&fd, &inode, &file, "kvm-vm", &kvm_vm_fops, kvm);
        if (r) {
                kvm_destroy_vm(kvm);
                return r;
        }

        kvm->filp = file;

        return fd;
}

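/*
 * Dispatcher for ioctls on /dev/kvm itself.  A userspace bring-up
 * sequence typically looks roughly like this (illustrative sketch,
 * not part of this patch):
 *
 *      int kvm = open("/dev/kvm", O_RDWR);
 *      if (ioctl(kvm, KVM_GET_API_VERSION, 0) != KVM_API_VERSION)
 *              exit(1);        (refuse to run on a mismatched ABI)
 *      int vm = ioctl(kvm, KVM_CREATE_VM, 0);
 */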
static long kvm_dev_ioctl(struct file *filp,
                          unsigned int ioctl, unsigned long arg)
{
        void __user *argp = (void __user *)arg;
        long r = -EINVAL;

        switch (ioctl) {
        case KVM_GET_API_VERSION:
                r = -EINVAL;
                if (arg)
                        goto out;
                r = KVM_API_VERSION;
                break;
        case KVM_CREATE_VM:
                r = -EINVAL;
                if (arg)
                        goto out;
                r = kvm_dev_ioctl_create_vm();
                break;
        case KVM_CHECK_EXTENSION:
                r = kvm_dev_ioctl_check_extension((long)argp);
                break;
        case KVM_GET_VCPU_MMAP_SIZE:
                r = -EINVAL;
                if (arg)
                        goto out;
                r = 2 * PAGE_SIZE;
                break;
        default:
                return kvm_arch_dev_ioctl(filp, ioctl, arg);
        }
out:
        return r;
}

static struct file_operations kvm_chardev_ops = {
        .unlocked_ioctl = kvm_dev_ioctl,
        .compat_ioctl   = kvm_dev_ioctl,
};

static struct miscdevice kvm_dev = {
        KVM_MINOR,
        "kvm",
        &kvm_chardev_ops,
};

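/*
 * Per-cpu enable/disable of hardware virtualization.  The
 * cpus_hardware_enabled mask keeps both operations idempotent, which
 * the hotplug, reboot and suspend paths below rely on.
 */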
static void hardware_enable(void *junk)
{
        int cpu = raw_smp_processor_id();

        if (cpu_isset(cpu, cpus_hardware_enabled))
                return;
        cpu_set(cpu, cpus_hardware_enabled);
        kvm_arch_hardware_enable(NULL);
}

static void hardware_disable(void *junk)
{
        int cpu = raw_smp_processor_id();

        if (!cpu_isset(cpu, cpus_hardware_enabled))
                return;
        cpu_clear(cpu, cpus_hardware_enabled);
        decache_vcpus_on_cpu(cpu);
        kvm_arch_hardware_disable(NULL);
}

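/*
 * CPU hotplug notifier.  A dying cpu must leave VMX/SVM root mode
 * before it goes away; CPU_DYING runs on the affected cpu itself, so
 * hardware_disable() is called directly, while the other transitions
 * reach the target cpu via smp_call_function_single().
 */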
static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val,
                           void *v)
{
        int cpu = (long)v;

        val &= ~CPU_TASKS_FROZEN;
        switch (val) {
        case CPU_DYING:
                printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n",
                       cpu);
                hardware_disable(NULL);
                break;
        case CPU_UP_CANCELED:
                printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n",
                       cpu);
                smp_call_function_single(cpu, hardware_disable, NULL, 0, 1);
                break;
        case CPU_ONLINE:
                printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n",
                       cpu);
                smp_call_function_single(cpu, hardware_enable, NULL, 0, 1);
                break;
        }
        return NOTIFY_OK;
}

static int kvm_reboot(struct notifier_block *notifier, unsigned long val,
                      void *v)
{
        if (val == SYS_RESTART) {
                /*
                 * Some (well, at least mine) BIOSes hang on reboot if
                 * in vmx root mode.
                 */
                printk(KERN_INFO "kvm: exiting hardware virtualization\n");
                on_each_cpu(hardware_disable, NULL, 0, 1);
        }
        return NOTIFY_OK;
}

static struct notifier_block kvm_reboot_notifier = {
        .notifier_call = kvm_reboot,
        .priority = 0,
};

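/*
 * kvm_io_bus is a flat table of in-kernel device models (such as the
 * i8259 and ioapic emulation elsewhere in this patch).  Lookup is a
 * linear scan that delegates the address check to each device's
 * in_range() callback; registration is append-only, bounded by
 * NR_IOBUS_DEVS.
 */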
void kvm_io_bus_init(struct kvm_io_bus *bus)
{
        memset(bus, 0, sizeof(*bus));
}

void kvm_io_bus_destroy(struct kvm_io_bus *bus)
{
        int i;

        for (i = 0; i < bus->dev_count; i++) {
                struct kvm_io_device *pos = bus->devs[i];

                kvm_iodevice_destructor(pos);
        }
}

struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus, gpa_t addr)
{
        int i;

        for (i = 0; i < bus->dev_count; i++) {
                struct kvm_io_device *pos = bus->devs[i];

                if (pos->in_range(pos, addr))
                        return pos;
        }

        return NULL;
}

void kvm_io_bus_register_dev(struct kvm_io_bus *bus, struct kvm_io_device *dev)
{
        BUG_ON(bus->dev_count > (NR_IOBUS_DEVS-1));

        bus->devs[bus->dev_count++] = dev;
}

static struct notifier_block kvm_cpu_notifier = {
        .notifier_call = kvm_cpu_hotplug,
        .priority = 20, /* must be > scheduler priority */
};

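/*
 * debugfs statistics.  Each kvm_stats_debugfs_item names a counter by
 * its byte offset into struct kvm or struct kvm_vcpu; the getters
 * below walk vm_list under kvm_lock and sum the u32 found at that
 * offset.  kvm_init_debug() publishes one read-only file per item
 * under debugfs (normally /sys/kernel/debug/kvm/).
 */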
static u64 vm_stat_get(void *_offset)
{
        unsigned offset = (long)_offset;
        u64 total = 0;
        struct kvm *kvm;

        spin_lock(&kvm_lock);
        list_for_each_entry(kvm, &vm_list, vm_list)
                total += *(u32 *)((void *)kvm + offset);
        spin_unlock(&kvm_lock);
        return total;
}

DEFINE_SIMPLE_ATTRIBUTE(vm_stat_fops, vm_stat_get, NULL, "%llu\n");

static u64 vcpu_stat_get(void *_offset)
{
        unsigned offset = (long)_offset;
        u64 total = 0;
        struct kvm *kvm;
        struct kvm_vcpu *vcpu;
        int i;

        spin_lock(&kvm_lock);
        list_for_each_entry(kvm, &vm_list, vm_list)
                for (i = 0; i < KVM_MAX_VCPUS; ++i) {
                        vcpu = kvm->vcpus[i];
                        if (vcpu)
                                total += *(u32 *)((void *)vcpu + offset);
                }
        spin_unlock(&kvm_lock);
        return total;
}

DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_fops, vcpu_stat_get, NULL, "%llu\n");

static struct file_operations *stat_fops[] = {
        [KVM_STAT_VCPU] = &vcpu_stat_fops,
        [KVM_STAT_VM]   = &vm_stat_fops,
};

static void kvm_init_debug(void)
{
        struct kvm_stats_debugfs_item *p;

        debugfs_dir = debugfs_create_dir("kvm", NULL);
        for (p = debugfs_entries; p->name; ++p)
                p->dentry = debugfs_create_file(p->name, 0444, debugfs_dir,
                                                (void *)(long)p->offset,
                                                stat_fops[p->kind]);
}

static void kvm_exit_debug(void)
{
        struct kvm_stats_debugfs_item *p;

        for (p = debugfs_entries; p->name; ++p)
                debugfs_remove(p->dentry);
        debugfs_remove(debugfs_dir);
}

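/*
 * System suspend/resume.  By the time the sysdev class methods run,
 * only one cpu is still active, so disabling and re-enabling
 * virtualization locally is presumably sufficient here.
 */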
static int kvm_suspend(struct sys_device *dev, pm_message_t state)
{
        hardware_disable(NULL);
        return 0;
}

static int kvm_resume(struct sys_device *dev)
{
        hardware_enable(NULL);
        return 0;
}

static struct sysdev_class kvm_sysdev_class = {
        .name    = "kvm",
        .suspend = kvm_suspend,
        .resume  = kvm_resume,
};

static struct sys_device kvm_sysdev = {
        .id  = 0,
        .cls = &kvm_sysdev_class,
};

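/*
 * Preempt notifiers make vcpu migration between host cpus transparent:
 * arch state is saved on sched-out and reloaded for the new cpu on
 * sched-in.  bad_page backs guest frames that have no real memory
 * behind them.
 */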
struct page *bad_page;

static inline
struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn)
{
        return container_of(pn, struct kvm_vcpu, preempt_notifier);
}

static void kvm_sched_in(struct preempt_notifier *pn, int cpu)
{
        struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);

        kvm_arch_vcpu_load(vcpu, cpu);
}

static void kvm_sched_out(struct preempt_notifier *pn,
                          struct task_struct *next)
{
        struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);

        kvm_arch_vcpu_put(vcpu);
}

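/*
 * Common init, called from the arch backend's own module_init; e.g.
 * kvm-intel passes roughly (&vmx_x86_ops, sizeof(struct vcpu_vmx),
 * THIS_MODULE).  The unwind labels below mirror the setup order.
 */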
int kvm_init(void *opaque, unsigned int vcpu_size,
             struct module *module)
{
        int r;
        int cpu;

        kvm_init_debug();

        r = kvm_arch_init(opaque);
        if (r)
                goto out_fail;

        bad_page = alloc_page(GFP_KERNEL | __GFP_ZERO);

        if (bad_page == NULL) {
                r = -ENOMEM;
                goto out;
        }

        r = kvm_arch_hardware_setup();
        if (r < 0)
                goto out_free_0;

        for_each_online_cpu(cpu) {
                smp_call_function_single(cpu,
                                kvm_arch_check_processor_compat,
                                &r, 0, 1);
                if (r < 0)
                        goto out_free_1;
        }

        on_each_cpu(hardware_enable, NULL, 0, 1);
        r = register_cpu_notifier(&kvm_cpu_notifier);
        if (r)
                goto out_free_2;
        register_reboot_notifier(&kvm_reboot_notifier);

        r = sysdev_class_register(&kvm_sysdev_class);
        if (r)
                goto out_free_3;

        r = sysdev_register(&kvm_sysdev);
        if (r)
                goto out_free_4;

        /* A kmem cache lets us meet the alignment requirements of fx_save. */
        kvm_vcpu_cache = kmem_cache_create("kvm_vcpu", vcpu_size,
                                           __alignof__(struct kvm_vcpu),
                                           0, NULL);
        if (!kvm_vcpu_cache) {
                r = -ENOMEM;
                goto out_free_5;
        }

        kvm_chardev_ops.owner = module;

        r = misc_register(&kvm_dev);
        if (r) {
                printk(KERN_ERR "kvm: misc device register failed\n");
                goto out_free;
        }

        kvm_preempt_ops.sched_in = kvm_sched_in;
        kvm_preempt_ops.sched_out = kvm_sched_out;

        return 0;

out_free:
        kmem_cache_destroy(kvm_vcpu_cache);
out_free_5:
        sysdev_unregister(&kvm_sysdev);
out_free_4:
        sysdev_class_unregister(&kvm_sysdev_class);
out_free_3:
        unregister_reboot_notifier(&kvm_reboot_notifier);
        unregister_cpu_notifier(&kvm_cpu_notifier);
out_free_2:
        on_each_cpu(hardware_disable, NULL, 0, 1);
out_free_1:
        kvm_arch_hardware_unsetup();
out_free_0:
        __free_page(bad_page);
out:
        kvm_arch_exit();
out_fail:
        /* Debugfs entries were created first, so tear them down last,
         * even when kvm_arch_init() itself fails. */
        kvm_exit_debug();
        return r;
}
EXPORT_SYMBOL_GPL(kvm_init);

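/*
 * Module teardown, essentially the reverse of kvm_init().
 */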
void kvm_exit(void)
{
        misc_deregister(&kvm_dev);
        kmem_cache_destroy(kvm_vcpu_cache);
        sysdev_unregister(&kvm_sysdev);
        sysdev_class_unregister(&kvm_sysdev_class);
        unregister_reboot_notifier(&kvm_reboot_notifier);
        unregister_cpu_notifier(&kvm_cpu_notifier);
        on_each_cpu(hardware_disable, NULL, 0, 1);
        kvm_arch_hardware_unsetup();
        kvm_arch_exit();
        kvm_exit_debug();
        __free_page(bad_page);
}
EXPORT_SYMBOL_GPL(kvm_exit);