diff options
Diffstat (limited to 'arch/x86')
-rw-r--r-- | arch/x86/Kconfig | 19 | ||||
-rw-r--r-- | arch/x86/kernel/Makefile | 2 | ||||
-rw-r--r-- | arch/x86/kernel/crash.c | 3 | ||||
-rw-r--r-- | arch/x86/kernel/kvm.c | 248 | ||||
-rw-r--r-- | arch/x86/kernel/kvmclock.c | 187 | ||||
-rw-r--r-- | arch/x86/kernel/reboot.c | 13 | ||||
-rw-r--r-- | arch/x86/kernel/setup_32.c | 6 | ||||
-rw-r--r-- | arch/x86/kernel/setup_64.c | 7 | ||||
-rw-r--r-- | arch/x86/kvm/Kconfig | 13 | ||||
-rw-r--r-- | arch/x86/kvm/Makefile | 6 | ||||
-rw-r--r-- | arch/x86/kvm/i8254.c | 611 | ||||
-rw-r--r-- | arch/x86/kvm/i8254.h | 63 | ||||
-rw-r--r-- | arch/x86/kvm/irq.c | 18 | ||||
-rw-r--r-- | arch/x86/kvm/irq.h | 3 | ||||
-rw-r--r-- | arch/x86/kvm/kvm_svm.h | 2 | ||||
-rw-r--r-- | arch/x86/kvm/lapic.c | 35 | ||||
-rw-r--r-- | arch/x86/kvm/mmu.c | 672 | ||||
-rw-r--r-- | arch/x86/kvm/mmu.h | 6 | ||||
-rw-r--r-- | arch/x86/kvm/paging_tmpl.h | 86 | ||||
-rw-r--r-- | arch/x86/kvm/segment_descriptor.h | 29 | ||||
-rw-r--r-- | arch/x86/kvm/svm.c | 352 | ||||
-rw-r--r-- | arch/x86/kvm/svm.h | 3 | ||||
-rw-r--r-- | arch/x86/kvm/tss.h | 59 | ||||
-rw-r--r-- | arch/x86/kvm/vmx.c | 278 | ||||
-rw-r--r-- | arch/x86/kvm/vmx.h | 10 | ||||
-rw-r--r-- | arch/x86/kvm/x86.c | 897 | ||||
-rw-r--r-- | arch/x86/kvm/x86_emulate.c | 285 |
27 files changed, 3365 insertions, 548 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 2fadf794483d..e5790fe9e330 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig | |||
@@ -373,6 +373,25 @@ config VMI | |||
373 | at the moment), by linking the kernel to a GPL-ed ROM module | 373 | at the moment), by linking the kernel to a GPL-ed ROM module |
374 | provided by the hypervisor. | 374 | provided by the hypervisor. |
375 | 375 | ||
376 | config KVM_CLOCK | ||
377 | bool "KVM paravirtualized clock" | ||
378 | select PARAVIRT | ||
379 | depends on !(X86_VISWS || X86_VOYAGER) | ||
380 | help | ||
381 | Turning on this option will allow you to run a paravirtualized clock | ||
382 | when running over the KVM hypervisor. Instead of relying on a PIT | ||
383 | (or probably other) emulation by the underlying device model, the host | ||
384 | provides the guest with timing infrastructure such as time of day, and | ||
385 | system time. | ||
386 | |||
387 | config KVM_GUEST | ||
388 | bool "KVM Guest support" | ||
389 | select PARAVIRT | ||
390 | depends on !(X86_VISWS || X86_VOYAGER) | ||
391 | help | ||
392 | This option enables various optimizations for running under the KVM | ||
393 | hypervisor. | ||
394 | |||
376 | source "arch/x86/lguest/Kconfig" | 395 | source "arch/x86/lguest/Kconfig" |
377 | 396 | ||
378 | config PARAVIRT | 397 | config PARAVIRT |
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 90e092d0af0c..fa19c3819540 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile | |||
@@ -80,6 +80,8 @@ obj-$(CONFIG_DEBUG_RODATA_TEST) += test_rodata.o | |||
80 | obj-$(CONFIG_DEBUG_NX_TEST) += test_nx.o | 80 | obj-$(CONFIG_DEBUG_NX_TEST) += test_nx.o |
81 | 81 | ||
82 | obj-$(CONFIG_VMI) += vmi_32.o vmiclock_32.o | 82 | obj-$(CONFIG_VMI) += vmi_32.o vmiclock_32.o |
83 | obj-$(CONFIG_KVM_GUEST) += kvm.o | ||
84 | obj-$(CONFIG_KVM_CLOCK) += kvmclock.o | ||
83 | obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o | 85 | obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o |
84 | 86 | ||
85 | ifdef CONFIG_INPUT_PCSPKR | 87 | ifdef CONFIG_INPUT_PCSPKR |
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index 2251d0ae9570..268553817909 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c | |||
@@ -25,6 +25,7 @@ | |||
25 | #include <asm/hpet.h> | 25 | #include <asm/hpet.h> |
26 | #include <linux/kdebug.h> | 26 | #include <linux/kdebug.h> |
27 | #include <asm/smp.h> | 27 | #include <asm/smp.h> |
28 | #include <asm/reboot.h> | ||
28 | 29 | ||
29 | #include <mach_ipi.h> | 30 | #include <mach_ipi.h> |
30 | 31 | ||
@@ -117,7 +118,7 @@ static void nmi_shootdown_cpus(void) | |||
117 | } | 118 | } |
118 | #endif | 119 | #endif |
119 | 120 | ||
120 | void machine_crash_shutdown(struct pt_regs *regs) | 121 | void native_machine_crash_shutdown(struct pt_regs *regs) |
121 | { | 122 | { |
122 | /* This function is only called after the system | 123 | /* This function is only called after the system |
123 | * has panicked or is otherwise in a critical state. | 124 | * has panicked or is otherwise in a critical state. |
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c new file mode 100644 index 000000000000..8b7a3cf37d2b --- /dev/null +++ b/arch/x86/kernel/kvm.c | |||
@@ -0,0 +1,248 @@ | |||
1 | /* | ||
2 | * KVM paravirt_ops implementation | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify | ||
5 | * it under the terms of the GNU General Public License as published by | ||
6 | * the Free Software Foundation; either version 2 of the License, or | ||
7 | * (at your option) any later version. | ||
8 | * | ||
9 | * This program is distributed in the hope that it will be useful, | ||
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
12 | * GNU General Public License for more details. | ||
13 | * | ||
14 | * You should have received a copy of the GNU General Public License | ||
15 | * along with this program; if not, write to the Free Software | ||
16 | * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. | ||
17 | * | ||
18 | * Copyright (C) 2007, Red Hat, Inc., Ingo Molnar <mingo@redhat.com> | ||
19 | * Copyright IBM Corporation, 2007 | ||
20 | * Authors: Anthony Liguori <aliguori@us.ibm.com> | ||
21 | */ | ||
22 | |||
23 | #include <linux/module.h> | ||
24 | #include <linux/kernel.h> | ||
25 | #include <linux/kvm_para.h> | ||
26 | #include <linux/cpu.h> | ||
27 | #include <linux/mm.h> | ||
28 | #include <linux/highmem.h> | ||
29 | #include <linux/hardirq.h> | ||
30 | |||
31 | #define MMU_QUEUE_SIZE 1024 | ||
32 | |||
33 | struct kvm_para_state { | ||
34 | u8 mmu_queue[MMU_QUEUE_SIZE]; | ||
35 | int mmu_queue_len; | ||
36 | enum paravirt_lazy_mode mode; | ||
37 | }; | ||
38 | |||
39 | static DEFINE_PER_CPU(struct kvm_para_state, para_state); | ||
40 | |||
41 | static struct kvm_para_state *kvm_para_state(void) | ||
42 | { | ||
43 | return &per_cpu(para_state, raw_smp_processor_id()); | ||
44 | } | ||
45 | |||
46 | /* | ||
47 | * No need for any "IO delay" on KVM | ||
48 | */ | ||
49 | static void kvm_io_delay(void) | ||
50 | { | ||
51 | } | ||
52 | |||
53 | static void kvm_mmu_op(void *buffer, unsigned len) | ||
54 | { | ||
55 | int r; | ||
56 | unsigned long a1, a2; | ||
57 | |||
58 | do { | ||
59 | a1 = __pa(buffer); | ||
60 | a2 = 0; /* on i386 __pa() always returns <4G */ | ||
61 | r = kvm_hypercall3(KVM_HC_MMU_OP, len, a1, a2); | ||
62 | buffer += r; | ||
63 | len -= r; | ||
64 | } while (len); | ||
65 | } | ||
66 | |||
67 | static void mmu_queue_flush(struct kvm_para_state *state) | ||
68 | { | ||
69 | if (state->mmu_queue_len) { | ||
70 | kvm_mmu_op(state->mmu_queue, state->mmu_queue_len); | ||
71 | state->mmu_queue_len = 0; | ||
72 | } | ||
73 | } | ||
74 | |||
75 | static void kvm_deferred_mmu_op(void *buffer, int len) | ||
76 | { | ||
77 | struct kvm_para_state *state = kvm_para_state(); | ||
78 | |||
79 | if (state->mode != PARAVIRT_LAZY_MMU) { | ||
80 | kvm_mmu_op(buffer, len); | ||
81 | return; | ||
82 | } | ||
83 | if (state->mmu_queue_len + len > sizeof state->mmu_queue) | ||
84 | mmu_queue_flush(state); | ||
85 | memcpy(state->mmu_queue + state->mmu_queue_len, buffer, len); | ||
86 | state->mmu_queue_len += len; | ||
87 | } | ||
88 | |||
89 | static void kvm_mmu_write(void *dest, u64 val) | ||
90 | { | ||
91 | __u64 pte_phys; | ||
92 | struct kvm_mmu_op_write_pte wpte; | ||
93 | |||
94 | #ifdef CONFIG_HIGHPTE | ||
95 | struct page *page; | ||
96 | unsigned long dst = (unsigned long) dest; | ||
97 | |||
98 | page = kmap_atomic_to_page(dest); | ||
99 | pte_phys = page_to_pfn(page); | ||
100 | pte_phys <<= PAGE_SHIFT; | ||
101 | pte_phys += (dst & ~(PAGE_MASK)); | ||
102 | #else | ||
103 | pte_phys = (unsigned long)__pa(dest); | ||
104 | #endif | ||
105 | wpte.header.op = KVM_MMU_OP_WRITE_PTE; | ||
106 | wpte.pte_val = val; | ||
107 | wpte.pte_phys = pte_phys; | ||
108 | |||
109 | kvm_deferred_mmu_op(&wpte, sizeof wpte); | ||
110 | } | ||
111 | |||
112 | /* | ||
113 | * We only need to hook operations that are MMU writes. We hook these so that | ||
114 | * we can use lazy MMU mode to batch these operations. We could probably | ||
115 | * improve the performance of the host code if we used some of the information | ||
116 | * here to simplify processing of batched writes. | ||
117 | */ | ||
118 | static void kvm_set_pte(pte_t *ptep, pte_t pte) | ||
119 | { | ||
120 | kvm_mmu_write(ptep, pte_val(pte)); | ||
121 | } | ||
122 | |||
123 | static void kvm_set_pte_at(struct mm_struct *mm, unsigned long addr, | ||
124 | pte_t *ptep, pte_t pte) | ||
125 | { | ||
126 | kvm_mmu_write(ptep, pte_val(pte)); | ||
127 | } | ||
128 | |||
129 | static void kvm_set_pmd(pmd_t *pmdp, pmd_t pmd) | ||
130 | { | ||
131 | kvm_mmu_write(pmdp, pmd_val(pmd)); | ||
132 | } | ||
133 | |||
134 | #if PAGETABLE_LEVELS >= 3 | ||
135 | #ifdef CONFIG_X86_PAE | ||
136 | static void kvm_set_pte_atomic(pte_t *ptep, pte_t pte) | ||
137 | { | ||
138 | kvm_mmu_write(ptep, pte_val(pte)); | ||
139 | } | ||
140 | |||
141 | static void kvm_set_pte_present(struct mm_struct *mm, unsigned long addr, | ||
142 | pte_t *ptep, pte_t pte) | ||
143 | { | ||
144 | kvm_mmu_write(ptep, pte_val(pte)); | ||
145 | } | ||
146 | |||
147 | static void kvm_pte_clear(struct mm_struct *mm, | ||
148 | unsigned long addr, pte_t *ptep) | ||
149 | { | ||
150 | kvm_mmu_write(ptep, 0); | ||
151 | } | ||
152 | |||
153 | static void kvm_pmd_clear(pmd_t *pmdp) | ||
154 | { | ||
155 | kvm_mmu_write(pmdp, 0); | ||
156 | } | ||
157 | #endif | ||
158 | |||
159 | static void kvm_set_pud(pud_t *pudp, pud_t pud) | ||
160 | { | ||
161 | kvm_mmu_write(pudp, pud_val(pud)); | ||
162 | } | ||
163 | |||
164 | #if PAGETABLE_LEVELS == 4 | ||
165 | static void kvm_set_pgd(pgd_t *pgdp, pgd_t pgd) | ||
166 | { | ||
167 | kvm_mmu_write(pgdp, pgd_val(pgd)); | ||
168 | } | ||
169 | #endif | ||
170 | #endif /* PAGETABLE_LEVELS >= 3 */ | ||
171 | |||
172 | static void kvm_flush_tlb(void) | ||
173 | { | ||
174 | struct kvm_mmu_op_flush_tlb ftlb = { | ||
175 | .header.op = KVM_MMU_OP_FLUSH_TLB, | ||
176 | }; | ||
177 | |||
178 | kvm_deferred_mmu_op(&ftlb, sizeof ftlb); | ||
179 | } | ||
180 | |||
181 | static void kvm_release_pt(u32 pfn) | ||
182 | { | ||
183 | struct kvm_mmu_op_release_pt rpt = { | ||
184 | .header.op = KVM_MMU_OP_RELEASE_PT, | ||
185 | .pt_phys = (u64)pfn << PAGE_SHIFT, | ||
186 | }; | ||
187 | |||
188 | kvm_mmu_op(&rpt, sizeof rpt); | ||
189 | } | ||
190 | |||
191 | static void kvm_enter_lazy_mmu(void) | ||
192 | { | ||
193 | struct kvm_para_state *state = kvm_para_state(); | ||
194 | |||
195 | paravirt_enter_lazy_mmu(); | ||
196 | state->mode = paravirt_get_lazy_mode(); | ||
197 | } | ||
198 | |||
199 | static void kvm_leave_lazy_mmu(void) | ||
200 | { | ||
201 | struct kvm_para_state *state = kvm_para_state(); | ||
202 | |||
203 | mmu_queue_flush(state); | ||
204 | paravirt_leave_lazy(paravirt_get_lazy_mode()); | ||
205 | state->mode = paravirt_get_lazy_mode(); | ||
206 | } | ||
207 | |||
208 | static void paravirt_ops_setup(void) | ||
209 | { | ||
210 | pv_info.name = "KVM"; | ||
211 | pv_info.paravirt_enabled = 1; | ||
212 | |||
213 | if (kvm_para_has_feature(KVM_FEATURE_NOP_IO_DELAY)) | ||
214 | pv_cpu_ops.io_delay = kvm_io_delay; | ||
215 | |||
216 | if (kvm_para_has_feature(KVM_FEATURE_MMU_OP)) { | ||
217 | pv_mmu_ops.set_pte = kvm_set_pte; | ||
218 | pv_mmu_ops.set_pte_at = kvm_set_pte_at; | ||
219 | pv_mmu_ops.set_pmd = kvm_set_pmd; | ||
220 | #if PAGETABLE_LEVELS >= 3 | ||
221 | #ifdef CONFIG_X86_PAE | ||
222 | pv_mmu_ops.set_pte_atomic = kvm_set_pte_atomic; | ||
223 | pv_mmu_ops.set_pte_present = kvm_set_pte_present; | ||
224 | pv_mmu_ops.pte_clear = kvm_pte_clear; | ||
225 | pv_mmu_ops.pmd_clear = kvm_pmd_clear; | ||
226 | #endif | ||
227 | pv_mmu_ops.set_pud = kvm_set_pud; | ||
228 | #if PAGETABLE_LEVELS == 4 | ||
229 | pv_mmu_ops.set_pgd = kvm_set_pgd; | ||
230 | #endif | ||
231 | #endif | ||
232 | pv_mmu_ops.flush_tlb_user = kvm_flush_tlb; | ||
233 | pv_mmu_ops.release_pte = kvm_release_pt; | ||
234 | pv_mmu_ops.release_pmd = kvm_release_pt; | ||
235 | pv_mmu_ops.release_pud = kvm_release_pt; | ||
236 | |||
237 | pv_mmu_ops.lazy_mode.enter = kvm_enter_lazy_mmu; | ||
238 | pv_mmu_ops.lazy_mode.leave = kvm_leave_lazy_mmu; | ||
239 | } | ||
240 | } | ||
241 | |||
242 | void __init kvm_guest_init(void) | ||
243 | { | ||
244 | if (!kvm_para_available()) | ||
245 | return; | ||
246 | |||
247 | paravirt_ops_setup(); | ||
248 | } | ||
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c new file mode 100644 index 000000000000..ddee04043aeb --- /dev/null +++ b/arch/x86/kernel/kvmclock.c | |||
@@ -0,0 +1,187 @@ | |||
1 | /* KVM paravirtual clock driver. A clocksource implementation | ||
2 | Copyright (C) 2008 Glauber de Oliveira Costa, Red Hat Inc. | ||
3 | |||
4 | This program is free software; you can redistribute it and/or modify | ||
5 | it under the terms of the GNU General Public License as published by | ||
6 | the Free Software Foundation; either version 2 of the License, or | ||
7 | (at your option) any later version. | ||
8 | |||
9 | This program is distributed in the hope that it will be useful, | ||
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
12 | GNU General Public License for more details. | ||
13 | |||
14 | You should have received a copy of the GNU General Public License | ||
15 | along with this program; if not, write to the Free Software | ||
16 | Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA | ||
17 | */ | ||
18 | |||
19 | #include <linux/clocksource.h> | ||
20 | #include <linux/kvm_para.h> | ||
21 | #include <asm/arch_hooks.h> | ||
22 | #include <asm/msr.h> | ||
23 | #include <asm/apic.h> | ||
24 | #include <linux/percpu.h> | ||
25 | #include <asm/reboot.h> | ||
26 | |||
27 | #define KVM_SCALE 22 | ||
28 | |||
29 | static int kvmclock = 1; | ||
30 | |||
31 | static int parse_no_kvmclock(char *arg) | ||
32 | { | ||
33 | kvmclock = 0; | ||
34 | return 0; | ||
35 | } | ||
36 | early_param("no-kvmclock", parse_no_kvmclock); | ||
37 | |||
38 | /* The hypervisor will put information about time periodically here */ | ||
39 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct kvm_vcpu_time_info, hv_clock); | ||
40 | #define get_clock(cpu, field) per_cpu(hv_clock, cpu).field | ||
41 | |||
42 | static inline u64 kvm_get_delta(u64 last_tsc) | ||
43 | { | ||
44 | int cpu = smp_processor_id(); | ||
45 | u64 delta = native_read_tsc() - last_tsc; | ||
46 | return (delta * get_clock(cpu, tsc_to_system_mul)) >> KVM_SCALE; | ||
47 | } | ||
48 | |||
49 | static struct kvm_wall_clock wall_clock; | ||
50 | static cycle_t kvm_clock_read(void); | ||
51 | /* | ||
52 | * The wallclock is the time of day when we booted. Some time may have | ||
53 | * elapsed since the hypervisor wrote the data, so we try to account for | ||
54 | * that using the system time. | ||
55 | */ | ||
56 | unsigned long kvm_get_wallclock(void) | ||
57 | { | ||
58 | u32 wc_sec, wc_nsec; | ||
59 | u64 delta; | ||
60 | struct timespec ts; | ||
61 | int version, nsec; | ||
62 | int low, high; | ||
63 | |||
64 | low = (int)__pa(&wall_clock); | ||
65 | high = ((u64)__pa(&wall_clock) >> 32); | ||
66 | |||
67 | delta = kvm_clock_read(); | ||
68 | |||
69 | native_write_msr(MSR_KVM_WALL_CLOCK, low, high); | ||
70 | do { | ||
71 | version = wall_clock.wc_version; | ||
72 | rmb(); | ||
73 | wc_sec = wall_clock.wc_sec; | ||
74 | wc_nsec = wall_clock.wc_nsec; | ||
75 | rmb(); | ||
76 | } while ((wall_clock.wc_version != version) || (version & 1)); | ||
77 | |||
78 | delta = kvm_clock_read() - delta; | ||
79 | delta += wc_nsec; | ||
80 | nsec = do_div(delta, NSEC_PER_SEC); | ||
81 | set_normalized_timespec(&ts, wc_sec + delta, nsec); | ||
82 | /* | ||
83 | * Of all mechanisms of time adjustment I've tested, this one | ||
84 | * was the champion! | ||
85 | */ | ||
86 | return ts.tv_sec + 1; | ||
87 | } | ||
88 | |||
89 | int kvm_set_wallclock(unsigned long now) | ||
90 | { | ||
91 | return 0; | ||
92 | } | ||
93 | |||
94 | /* | ||
95 | * This is our read_clock function. The host puts a tsc timestamp each time | ||
96 | * it updates a new time. Without the tsc adjustment, we can have a situation | ||
97 | * in which a vcpu starts to run earlier (smaller system_time), but probes | ||
98 | * time later (compared to another vcpu), leading to backwards time | ||
99 | */ | ||
100 | static cycle_t kvm_clock_read(void) | ||
101 | { | ||
102 | u64 last_tsc, now; | ||
103 | int cpu; | ||
104 | |||
105 | preempt_disable(); | ||
106 | cpu = smp_processor_id(); | ||
107 | |||
108 | last_tsc = get_clock(cpu, tsc_timestamp); | ||
109 | now = get_clock(cpu, system_time); | ||
110 | |||
111 | now += kvm_get_delta(last_tsc); | ||
112 | preempt_enable(); | ||
113 | |||
114 | return now; | ||
115 | } | ||
116 | static struct clocksource kvm_clock = { | ||
117 | .name = "kvm-clock", | ||
118 | .read = kvm_clock_read, | ||
119 | .rating = 400, | ||
120 | .mask = CLOCKSOURCE_MASK(64), | ||
121 | .mult = 1 << KVM_SCALE, | ||
122 | .shift = KVM_SCALE, | ||
123 | .flags = CLOCK_SOURCE_IS_CONTINUOUS, | ||
124 | }; | ||
125 | |||
126 | static int kvm_register_clock(void) | ||
127 | { | ||
128 | int cpu = smp_processor_id(); | ||
129 | int low, high; | ||
130 | low = (int)__pa(&per_cpu(hv_clock, cpu)) | 1; | ||
131 | high = ((u64)__pa(&per_cpu(hv_clock, cpu)) >> 32); | ||
132 | |||
133 | return native_write_msr_safe(MSR_KVM_SYSTEM_TIME, low, high); | ||
134 | } | ||
135 | |||
136 | static void kvm_setup_secondary_clock(void) | ||
137 | { | ||
138 | /* | ||
139 | * Now that the first cpu already had this clocksource initialized, | ||
140 | * we shouldn't fail. | ||
141 | */ | ||
142 | WARN_ON(kvm_register_clock()); | ||
143 | /* ok, done with our trickery, call native */ | ||
144 | setup_secondary_APIC_clock(); | ||
145 | } | ||
146 | |||
147 | /* | ||
148 | * After the clock is registered, the host will keep writing to the | ||
149 | * registered memory location. If the guest happens to shut down, this memory | ||
150 | * won't be valid. In cases like kexec, in which you install a new kernel, this | ||
151 | * means a random memory location will keep being written to. So before any | ||
152 | * kind of shutdown from our side, we unregister the clock by writing anything | ||
153 | * that does not have the 'enable' bit set in the msr. | ||
154 | */ | ||
155 | #ifdef CONFIG_KEXEC | ||
156 | static void kvm_crash_shutdown(struct pt_regs *regs) | ||
157 | { | ||
158 | native_write_msr_safe(MSR_KVM_SYSTEM_TIME, 0, 0); | ||
159 | native_machine_crash_shutdown(regs); | ||
160 | } | ||
161 | #endif | ||
162 | |||
163 | static void kvm_shutdown(void) | ||
164 | { | ||
165 | native_write_msr_safe(MSR_KVM_SYSTEM_TIME, 0, 0); | ||
166 | native_machine_shutdown(); | ||
167 | } | ||
168 | |||
169 | void __init kvmclock_init(void) | ||
170 | { | ||
171 | if (!kvm_para_available()) | ||
172 | return; | ||
173 | |||
174 | if (kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)) { | ||
175 | if (kvm_register_clock()) | ||
176 | return; | ||
177 | pv_time_ops.get_wallclock = kvm_get_wallclock; | ||
178 | pv_time_ops.set_wallclock = kvm_set_wallclock; | ||
179 | pv_time_ops.sched_clock = kvm_clock_read; | ||
180 | pv_apic_ops.setup_secondary_clock = kvm_setup_secondary_clock; | ||
181 | machine_ops.shutdown = kvm_shutdown; | ||
182 | #ifdef CONFIG_KEXEC | ||
183 | machine_ops.crash_shutdown = kvm_crash_shutdown; | ||
184 | #endif | ||
185 | clocksource_register(&kvm_clock); | ||
186 | } | ||
187 | } | ||
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 1791a751a772..a4a838306b2c 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c | |||
@@ -399,7 +399,7 @@ static void native_machine_emergency_restart(void) | |||
399 | } | 399 | } |
400 | } | 400 | } |
401 | 401 | ||
402 | static void native_machine_shutdown(void) | 402 | void native_machine_shutdown(void) |
403 | { | 403 | { |
404 | /* Stop the cpus and apics */ | 404 | /* Stop the cpus and apics */ |
405 | #ifdef CONFIG_SMP | 405 | #ifdef CONFIG_SMP |
@@ -470,7 +470,10 @@ struct machine_ops machine_ops = { | |||
470 | .shutdown = native_machine_shutdown, | 470 | .shutdown = native_machine_shutdown, |
471 | .emergency_restart = native_machine_emergency_restart, | 471 | .emergency_restart = native_machine_emergency_restart, |
472 | .restart = native_machine_restart, | 472 | .restart = native_machine_restart, |
473 | .halt = native_machine_halt | 473 | .halt = native_machine_halt, |
474 | #ifdef CONFIG_KEXEC | ||
475 | .crash_shutdown = native_machine_crash_shutdown, | ||
476 | #endif | ||
474 | }; | 477 | }; |
475 | 478 | ||
476 | void machine_power_off(void) | 479 | void machine_power_off(void) |
@@ -498,3 +501,9 @@ void machine_halt(void) | |||
498 | machine_ops.halt(); | 501 | machine_ops.halt(); |
499 | } | 502 | } |
500 | 503 | ||
504 | #ifdef CONFIG_KEXEC | ||
505 | void machine_crash_shutdown(struct pt_regs *regs) | ||
506 | { | ||
507 | machine_ops.crash_shutdown(regs); | ||
508 | } | ||
509 | #endif | ||
diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c index 44cc9b933932..2283422af794 100644 --- a/arch/x86/kernel/setup_32.c +++ b/arch/x86/kernel/setup_32.c | |||
@@ -47,6 +47,7 @@ | |||
47 | #include <linux/pfn.h> | 47 | #include <linux/pfn.h> |
48 | #include <linux/pci.h> | 48 | #include <linux/pci.h> |
49 | #include <linux/init_ohci1394_dma.h> | 49 | #include <linux/init_ohci1394_dma.h> |
50 | #include <linux/kvm_para.h> | ||
50 | 51 | ||
51 | #include <video/edid.h> | 52 | #include <video/edid.h> |
52 | 53 | ||
@@ -820,6 +821,10 @@ void __init setup_arch(char **cmdline_p) | |||
820 | 821 | ||
821 | max_low_pfn = setup_memory(); | 822 | max_low_pfn = setup_memory(); |
822 | 823 | ||
824 | #ifdef CONFIG_KVM_CLOCK | ||
825 | kvmclock_init(); | ||
826 | #endif | ||
827 | |||
823 | #ifdef CONFIG_VMI | 828 | #ifdef CONFIG_VMI |
824 | /* | 829 | /* |
825 | * Must be after max_low_pfn is determined, and before kernel | 830 | * Must be after max_low_pfn is determined, and before kernel |
@@ -827,6 +832,7 @@ void __init setup_arch(char **cmdline_p) | |||
827 | */ | 832 | */ |
828 | vmi_init(); | 833 | vmi_init(); |
829 | #endif | 834 | #endif |
835 | kvm_guest_init(); | ||
830 | 836 | ||
831 | /* | 837 | /* |
832 | * NOTE: before this point _nobody_ is allowed to allocate | 838 | * NOTE: before this point _nobody_ is allowed to allocate |
diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c index 60e64c8eee92..a94fb959a87a 100644 --- a/arch/x86/kernel/setup_64.c +++ b/arch/x86/kernel/setup_64.c | |||
@@ -42,6 +42,7 @@ | |||
42 | #include <linux/ctype.h> | 42 | #include <linux/ctype.h> |
43 | #include <linux/uaccess.h> | 43 | #include <linux/uaccess.h> |
44 | #include <linux/init_ohci1394_dma.h> | 44 | #include <linux/init_ohci1394_dma.h> |
45 | #include <linux/kvm_para.h> | ||
45 | 46 | ||
46 | #include <asm/mtrr.h> | 47 | #include <asm/mtrr.h> |
47 | #include <asm/uaccess.h> | 48 | #include <asm/uaccess.h> |
@@ -384,6 +385,10 @@ void __init setup_arch(char **cmdline_p) | |||
384 | 385 | ||
385 | io_delay_init(); | 386 | io_delay_init(); |
386 | 387 | ||
388 | #ifdef CONFIG_KVM_CLOCK | ||
389 | kvmclock_init(); | ||
390 | #endif | ||
391 | |||
387 | #ifdef CONFIG_SMP | 392 | #ifdef CONFIG_SMP |
388 | /* setup to use the early static init tables during kernel startup */ | 393 | /* setup to use the early static init tables during kernel startup */ |
389 | x86_cpu_to_apicid_early_ptr = (void *)x86_cpu_to_apicid_init; | 394 | x86_cpu_to_apicid_early_ptr = (void *)x86_cpu_to_apicid_init; |
@@ -488,6 +493,8 @@ void __init setup_arch(char **cmdline_p) | |||
488 | init_apic_mappings(); | 493 | init_apic_mappings(); |
489 | ioapic_init_mappings(); | 494 | ioapic_init_mappings(); |
490 | 495 | ||
496 | kvm_guest_init(); | ||
497 | |||
491 | /* | 498 | /* |
492 | * We trust e820 completely. No explicit ROM probing in memory. | 499 | * We trust e820 completely. No explicit ROM probing in memory. |
493 | */ | 500 | */ |
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 41962e793c0f..8d45fabc5f3b 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig | |||
@@ -19,7 +19,7 @@ if VIRTUALIZATION | |||
19 | 19 | ||
20 | config KVM | 20 | config KVM |
21 | tristate "Kernel-based Virtual Machine (KVM) support" | 21 | tristate "Kernel-based Virtual Machine (KVM) support" |
22 | depends on HAVE_KVM && EXPERIMENTAL | 22 | depends on HAVE_KVM |
23 | select PREEMPT_NOTIFIERS | 23 | select PREEMPT_NOTIFIERS |
24 | select ANON_INODES | 24 | select ANON_INODES |
25 | ---help--- | 25 | ---help--- |
@@ -50,6 +50,17 @@ config KVM_AMD | |||
50 | Provides support for KVM on AMD processors equipped with the AMD-V | 50 | Provides support for KVM on AMD processors equipped with the AMD-V |
51 | (SVM) extensions. | 51 | (SVM) extensions. |
52 | 52 | ||
53 | config KVM_TRACE | ||
54 | bool "KVM trace support" | ||
55 | depends on KVM && MARKERS && SYSFS | ||
56 | select RELAY | ||
57 | select DEBUG_FS | ||
58 | default n | ||
59 | ---help--- | ||
60 | This option allows reading a trace of kvm-related events through | ||
61 | relayfs. Note the ABI is not considered stable and will be | ||
62 | modified in future updates. | ||
63 | |||
53 | # OK, it's a little counter-intuitive to do this, but it puts it neatly under | 64 | # OK, it's a little counter-intuitive to do this, but it puts it neatly under |
54 | # the virtualization menu. | 65 | # the virtualization menu. |
55 | source drivers/lguest/Kconfig | 66 | source drivers/lguest/Kconfig |
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index ffdd0b310784..c97d35c218db 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile | |||
@@ -3,10 +3,14 @@ | |||
3 | # | 3 | # |
4 | 4 | ||
5 | common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o) | 5 | common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o) |
6 | ifeq ($(CONFIG_KVM_TRACE),y) | ||
7 | common-objs += $(addprefix ../../../virt/kvm/, kvm_trace.o) | ||
8 | endif | ||
6 | 9 | ||
7 | EXTRA_CFLAGS += -Ivirt/kvm -Iarch/x86/kvm | 10 | EXTRA_CFLAGS += -Ivirt/kvm -Iarch/x86/kvm |
8 | 11 | ||
9 | kvm-objs := $(common-objs) x86.o mmu.o x86_emulate.o i8259.o irq.o lapic.o | 12 | kvm-objs := $(common-objs) x86.o mmu.o x86_emulate.o i8259.o irq.o lapic.o \ |
13 | i8254.o | ||
10 | obj-$(CONFIG_KVM) += kvm.o | 14 | obj-$(CONFIG_KVM) += kvm.o |
11 | kvm-intel-objs = vmx.o | 15 | kvm-intel-objs = vmx.o |
12 | obj-$(CONFIG_KVM_INTEL) += kvm-intel.o | 16 | obj-$(CONFIG_KVM_INTEL) += kvm-intel.o |
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c new file mode 100644 index 000000000000..361e31611276 --- /dev/null +++ b/arch/x86/kvm/i8254.c | |||
@@ -0,0 +1,611 @@ | |||
1 | /* | ||
2 | * 8253/8254 interval timer emulation | ||
3 | * | ||
4 | * Copyright (c) 2003-2004 Fabrice Bellard | ||
5 | * Copyright (c) 2006 Intel Corporation | ||
6 | * Copyright (c) 2007 Keir Fraser, XenSource Inc | ||
7 | * Copyright (c) 2008 Intel Corporation | ||
8 | * | ||
9 | * Permission is hereby granted, free of charge, to any person obtaining a copy | ||
10 | * of this software and associated documentation files (the "Software"), to deal | ||
11 | * in the Software without restriction, including without limitation the rights | ||
12 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | ||
13 | * copies of the Software, and to permit persons to whom the Software is | ||
14 | * furnished to do so, subject to the following conditions: | ||
15 | * | ||
16 | * The above copyright notice and this permission notice shall be included in | ||
17 | * all copies or substantial portions of the Software. | ||
18 | * | ||
19 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
20 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
21 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL | ||
22 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
23 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | ||
24 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN | ||
25 | * THE SOFTWARE. | ||
26 | * | ||
27 | * Authors: | ||
28 | * Sheng Yang <sheng.yang@intel.com> | ||
29 | * Based on QEMU and Xen. | ||
30 | */ | ||
31 | |||
32 | #include <linux/kvm_host.h> | ||
33 | |||
34 | #include "irq.h" | ||
35 | #include "i8254.h" | ||
36 | |||
37 | #ifndef CONFIG_X86_64 | ||
38 | #define mod_64(x, y) ((x) - (y) * div64_64(x, y)) | ||
39 | #else | ||
40 | #define mod_64(x, y) ((x) % (y)) | ||
41 | #endif | ||
42 | |||
43 | #define RW_STATE_LSB 1 | ||
44 | #define RW_STATE_MSB 2 | ||
45 | #define RW_STATE_WORD0 3 | ||
46 | #define RW_STATE_WORD1 4 | ||
47 | |||
48 | /* Compute with 96 bit intermediate result: (a*b)/c */ | ||
49 | static u64 muldiv64(u64 a, u32 b, u32 c) | ||
50 | { | ||
51 | union { | ||
52 | u64 ll; | ||
53 | struct { | ||
54 | u32 low, high; | ||
55 | } l; | ||
56 | } u, res; | ||
57 | u64 rl, rh; | ||
58 | |||
59 | u.ll = a; | ||
60 | rl = (u64)u.l.low * (u64)b; | ||
61 | rh = (u64)u.l.high * (u64)b; | ||
62 | rh += (rl >> 32); | ||
63 | res.l.high = div64_64(rh, c); | ||
64 | res.l.low = div64_64(((mod_64(rh, c) << 32) + (rl & 0xffffffff)), c); | ||
65 | return res.ll; | ||
66 | } | ||
67 | |||
68 | static void pit_set_gate(struct kvm *kvm, int channel, u32 val) | ||
69 | { | ||
70 | struct kvm_kpit_channel_state *c = | ||
71 | &kvm->arch.vpit->pit_state.channels[channel]; | ||
72 | |||
73 | WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock)); | ||
74 | |||
75 | switch (c->mode) { | ||
76 | default: | ||
77 | case 0: | ||
78 | case 4: | ||
79 | /* XXX: just disable/enable counting */ | ||
80 | break; | ||
81 | case 1: | ||
82 | case 2: | ||
83 | case 3: | ||
84 | case 5: | ||
85 | /* Restart counting on rising edge. */ | ||
86 | if (c->gate < val) | ||
87 | c->count_load_time = ktime_get(); | ||
88 | break; | ||
89 | } | ||
90 | |||
91 | c->gate = val; | ||
92 | } | ||
93 | |||
94 | int pit_get_gate(struct kvm *kvm, int channel) | ||
95 | { | ||
96 | WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock)); | ||
97 | |||
98 | return kvm->arch.vpit->pit_state.channels[channel].gate; | ||
99 | } | ||
100 | |||
101 | static int pit_get_count(struct kvm *kvm, int channel) | ||
102 | { | ||
103 | struct kvm_kpit_channel_state *c = | ||
104 | &kvm->arch.vpit->pit_state.channels[channel]; | ||
105 | s64 d, t; | ||
106 | int counter; | ||
107 | |||
108 | WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock)); | ||
109 | |||
110 | t = ktime_to_ns(ktime_sub(ktime_get(), c->count_load_time)); | ||
111 | d = muldiv64(t, KVM_PIT_FREQ, NSEC_PER_SEC); | ||
112 | |||
113 | switch (c->mode) { | ||
114 | case 0: | ||
115 | case 1: | ||
116 | case 4: | ||
117 | case 5: | ||
118 | counter = (c->count - d) & 0xffff; | ||
119 | break; | ||
120 | case 3: | ||
121 | /* XXX: may be incorrect for odd counts */ | ||
122 | counter = c->count - (mod_64((2 * d), c->count)); | ||
123 | break; | ||
124 | default: | ||
125 | counter = c->count - mod_64(d, c->count); | ||
126 | break; | ||
127 | } | ||
128 | return counter; | ||
129 | } | ||
130 | |||
131 | static int pit_get_out(struct kvm *kvm, int channel) | ||
132 | { | ||
133 | struct kvm_kpit_channel_state *c = | ||
134 | &kvm->arch.vpit->pit_state.channels[channel]; | ||
135 | s64 d, t; | ||
136 | int out; | ||
137 | |||
138 | WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock)); | ||
139 | |||
140 | t = ktime_to_ns(ktime_sub(ktime_get(), c->count_load_time)); | ||
141 | d = muldiv64(t, KVM_PIT_FREQ, NSEC_PER_SEC); | ||
142 | |||
143 | switch (c->mode) { | ||
144 | default: | ||
145 | case 0: | ||
146 | out = (d >= c->count); | ||
147 | break; | ||
148 | case 1: | ||
149 | out = (d < c->count); | ||
150 | break; | ||
151 | case 2: | ||
152 | out = ((mod_64(d, c->count) == 0) && (d != 0)); | ||
153 | break; | ||
154 | case 3: | ||
155 | out = (mod_64(d, c->count) < ((c->count + 1) >> 1)); | ||
156 | break; | ||
157 | case 4: | ||
158 | case 5: | ||
159 | out = (d == c->count); | ||
160 | break; | ||
161 | } | ||
162 | |||
163 | return out; | ||
164 | } | ||
165 | |||
166 | static void pit_latch_count(struct kvm *kvm, int channel) | ||
167 | { | ||
168 | struct kvm_kpit_channel_state *c = | ||
169 | &kvm->arch.vpit->pit_state.channels[channel]; | ||
170 | |||
171 | WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock)); | ||
172 | |||
173 | if (!c->count_latched) { | ||
174 | c->latched_count = pit_get_count(kvm, channel); | ||
175 | c->count_latched = c->rw_mode; | ||
176 | } | ||
177 | } | ||
178 | |||
179 | static void pit_latch_status(struct kvm *kvm, int channel) | ||
180 | { | ||
181 | struct kvm_kpit_channel_state *c = | ||
182 | &kvm->arch.vpit->pit_state.channels[channel]; | ||
183 | |||
184 | WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock)); | ||
185 | |||
186 | if (!c->status_latched) { | ||
187 | /* TODO: Return NULL COUNT (bit 6). */ | ||
188 | c->status = ((pit_get_out(kvm, channel) << 7) | | ||
189 | (c->rw_mode << 4) | | ||
190 | (c->mode << 1) | | ||
191 | c->bcd); | ||
192 | c->status_latched = 1; | ||
193 | } | ||
194 | } | ||
195 | |||
196 | int __pit_timer_fn(struct kvm_kpit_state *ps) | ||
197 | { | ||
198 | struct kvm_vcpu *vcpu0 = ps->pit->kvm->vcpus[0]; | ||
199 | struct kvm_kpit_timer *pt = &ps->pit_timer; | ||
200 | |||
201 | atomic_inc(&pt->pending); | ||
202 | smp_mb__after_atomic_inc(); | ||
203 | /* FIXME: handle case where the guest is in guest mode */ | ||
204 | if (vcpu0 && waitqueue_active(&vcpu0->wq)) { | ||
205 | vcpu0->arch.mp_state = KVM_MP_STATE_RUNNABLE; | ||
206 | wake_up_interruptible(&vcpu0->wq); | ||
207 | } | ||
208 | |||
209 | pt->timer.expires = ktime_add_ns(pt->timer.expires, pt->period); | ||
210 | pt->scheduled = ktime_to_ns(pt->timer.expires); | ||
211 | |||
212 | return (pt->period == 0 ? 0 : 1); | ||
213 | } | ||
214 | |||
215 | int pit_has_pending_timer(struct kvm_vcpu *vcpu) | ||
216 | { | ||
217 | struct kvm_pit *pit = vcpu->kvm->arch.vpit; | ||
218 | |||
219 | if (pit && vcpu->vcpu_id == 0) | ||
220 | return atomic_read(&pit->pit_state.pit_timer.pending); | ||
221 | |||
222 | return 0; | ||
223 | } | ||
224 | |||
225 | static enum hrtimer_restart pit_timer_fn(struct hrtimer *data) | ||
226 | { | ||
227 | struct kvm_kpit_state *ps; | ||
228 | int restart_timer = 0; | ||
229 | |||
230 | ps = container_of(data, struct kvm_kpit_state, pit_timer.timer); | ||
231 | |||
232 | restart_timer = __pit_timer_fn(ps); | ||
233 | |||
234 | if (restart_timer) | ||
235 | return HRTIMER_RESTART; | ||
236 | else | ||
237 | return HRTIMER_NORESTART; | ||
238 | } | ||
239 | |||
240 | static void destroy_pit_timer(struct kvm_kpit_timer *pt) | ||
241 | { | ||
242 | pr_debug("pit: execute del timer!\n"); | ||
243 | hrtimer_cancel(&pt->timer); | ||
244 | } | ||
245 | |||
246 | static void create_pit_timer(struct kvm_kpit_timer *pt, u32 val, int is_period) | ||
247 | { | ||
248 | s64 interval; | ||
249 | |||
250 | interval = muldiv64(val, NSEC_PER_SEC, KVM_PIT_FREQ); | ||
251 | |||
252 | pr_debug("pit: create pit timer, interval is %llu nsec\n", interval); | ||
253 | |||
254 | /* TODO The new value only affected after the retriggered */ | ||
255 | hrtimer_cancel(&pt->timer); | ||
256 | pt->period = (is_period == 0) ? 0 : interval; | ||
257 | pt->timer.function = pit_timer_fn; | ||
258 | atomic_set(&pt->pending, 0); | ||
259 | |||
260 | hrtimer_start(&pt->timer, ktime_add_ns(ktime_get(), interval), | ||
261 | HRTIMER_MODE_ABS); | ||
262 | } | ||
263 | |||
264 | static void pit_load_count(struct kvm *kvm, int channel, u32 val) | ||
265 | { | ||
266 | struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state; | ||
267 | |||
268 | WARN_ON(!mutex_is_locked(&ps->lock)); | ||
269 | |||
270 | pr_debug("pit: load_count val is %d, channel is %d\n", val, channel); | ||
271 | |||
272 | /* | ||
273 | * Though spec said the state of 8254 is undefined after power-up, | ||
274 | * seems some tricky OS like Windows XP depends on IRQ0 interrupt | ||
275 | * when booting up. | ||
276 | * So here setting initialize rate for it, and not a specific number | ||
277 | */ | ||
278 | if (val == 0) | ||
279 | val = 0x10000; | ||
280 | |||
281 | ps->channels[channel].count_load_time = ktime_get(); | ||
282 | ps->channels[channel].count = val; | ||
283 | |||
284 | if (channel != 0) | ||
285 | return; | ||
286 | |||
287 | /* Two types of timer | ||
288 | * mode 1 is one shot, mode 2 is period, otherwise del timer */ | ||
289 | switch (ps->channels[0].mode) { | ||
290 | case 1: | ||
291 | create_pit_timer(&ps->pit_timer, val, 0); | ||
292 | break; | ||
293 | case 2: | ||
294 | create_pit_timer(&ps->pit_timer, val, 1); | ||
295 | break; | ||
296 | default: | ||
297 | destroy_pit_timer(&ps->pit_timer); | ||
298 | } | ||
299 | } | ||
300 | |||
301 | void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val) | ||
302 | { | ||
303 | mutex_lock(&kvm->arch.vpit->pit_state.lock); | ||
304 | pit_load_count(kvm, channel, val); | ||
305 | mutex_unlock(&kvm->arch.vpit->pit_state.lock); | ||
306 | } | ||
307 | |||
308 | static void pit_ioport_write(struct kvm_io_device *this, | ||
309 | gpa_t addr, int len, const void *data) | ||
310 | { | ||
311 | struct kvm_pit *pit = (struct kvm_pit *)this->private; | ||
312 | struct kvm_kpit_state *pit_state = &pit->pit_state; | ||
313 | struct kvm *kvm = pit->kvm; | ||
314 | int channel, access; | ||
315 | struct kvm_kpit_channel_state *s; | ||
316 | u32 val = *(u32 *) data; | ||
317 | |||
318 | val &= 0xff; | ||
319 | addr &= KVM_PIT_CHANNEL_MASK; | ||
320 | |||
321 | mutex_lock(&pit_state->lock); | ||
322 | |||
323 | if (val != 0) | ||
324 | pr_debug("pit: write addr is 0x%x, len is %d, val is 0x%x\n", | ||
325 | (unsigned int)addr, len, val); | ||
326 | |||
327 | if (addr == 3) { | ||
328 | channel = val >> 6; | ||
329 | if (channel == 3) { | ||
330 | /* Read-Back Command. */ | ||
331 | for (channel = 0; channel < 3; channel++) { | ||
332 | s = &pit_state->channels[channel]; | ||
333 | if (val & (2 << channel)) { | ||
334 | if (!(val & 0x20)) | ||
335 | pit_latch_count(kvm, channel); | ||
336 | if (!(val & 0x10)) | ||
337 | pit_latch_status(kvm, channel); | ||
338 | } | ||
339 | } | ||
340 | } else { | ||
341 | /* Select Counter <channel>. */ | ||
342 | s = &pit_state->channels[channel]; | ||
343 | access = (val >> 4) & KVM_PIT_CHANNEL_MASK; | ||
344 | if (access == 0) { | ||
345 | pit_latch_count(kvm, channel); | ||
346 | } else { | ||
347 | s->rw_mode = access; | ||
348 | s->read_state = access; | ||
349 | s->write_state = access; | ||
350 | s->mode = (val >> 1) & 7; | ||
351 | if (s->mode > 5) | ||
352 | s->mode -= 4; | ||
353 | s->bcd = val & 1; | ||
354 | } | ||
355 | } | ||
356 | } else { | ||
357 | /* Write Count. */ | ||
358 | s = &pit_state->channels[addr]; | ||
359 | switch (s->write_state) { | ||
360 | default: | ||
361 | case RW_STATE_LSB: | ||
362 | pit_load_count(kvm, addr, val); | ||
363 | break; | ||
364 | case RW_STATE_MSB: | ||
365 | pit_load_count(kvm, addr, val << 8); | ||
366 | break; | ||
367 | case RW_STATE_WORD0: | ||
368 | s->write_latch = val; | ||
369 | s->write_state = RW_STATE_WORD1; | ||
370 | break; | ||
371 | case RW_STATE_WORD1: | ||
372 | pit_load_count(kvm, addr, s->write_latch | (val << 8)); | ||
373 | s->write_state = RW_STATE_WORD0; | ||
374 | break; | ||
375 | } | ||
376 | } | ||
377 | |||
378 | mutex_unlock(&pit_state->lock); | ||
379 | } | ||
380 | |||
381 | static void pit_ioport_read(struct kvm_io_device *this, | ||
382 | gpa_t addr, int len, void *data) | ||
383 | { | ||
384 | struct kvm_pit *pit = (struct kvm_pit *)this->private; | ||
385 | struct kvm_kpit_state *pit_state = &pit->pit_state; | ||
386 | struct kvm *kvm = pit->kvm; | ||
387 | int ret, count; | ||
388 | struct kvm_kpit_channel_state *s; | ||
389 | |||
390 | addr &= KVM_PIT_CHANNEL_MASK; | ||
391 | s = &pit_state->channels[addr]; | ||
392 | |||
393 | mutex_lock(&pit_state->lock); | ||
394 | |||
395 | if (s->status_latched) { | ||
396 | s->status_latched = 0; | ||
397 | ret = s->status; | ||
398 | } else if (s->count_latched) { | ||
399 | switch (s->count_latched) { | ||
400 | default: | ||
401 | case RW_STATE_LSB: | ||
402 | ret = s->latched_count & 0xff; | ||
403 | s->count_latched = 0; | ||
404 | break; | ||
405 | case RW_STATE_MSB: | ||
406 | ret = s->latched_count >> 8; | ||
407 | s->count_latched = 0; | ||
408 | break; | ||
409 | case RW_STATE_WORD0: | ||
410 | ret = s->latched_count & 0xff; | ||
411 | s->count_latched = RW_STATE_MSB; | ||
412 | break; | ||
413 | } | ||
414 | } else { | ||
415 | switch (s->read_state) { | ||
416 | default: | ||
417 | case RW_STATE_LSB: | ||
418 | count = pit_get_count(kvm, addr); | ||
419 | ret = count & 0xff; | ||
420 | break; | ||
421 | case RW_STATE_MSB: | ||
422 | count = pit_get_count(kvm, addr); | ||
423 | ret = (count >> 8) & 0xff; | ||
424 | break; | ||
425 | case RW_STATE_WORD0: | ||
426 | count = pit_get_count(kvm, addr); | ||
427 | ret = count & 0xff; | ||
428 | s->read_state = RW_STATE_WORD1; | ||
429 | break; | ||
430 | case RW_STATE_WORD1: | ||
431 | count = pit_get_count(kvm, addr); | ||
432 | ret = (count >> 8) & 0xff; | ||
433 | s->read_state = RW_STATE_WORD0; | ||
434 | break; | ||
435 | } | ||
436 | } | ||
437 | |||
438 | if (len > sizeof(ret)) | ||
439 | len = sizeof(ret); | ||
440 | memcpy(data, (char *)&ret, len); | ||
441 | |||
442 | mutex_unlock(&pit_state->lock); | ||
443 | } | ||
444 | |||
445 | static int pit_in_range(struct kvm_io_device *this, gpa_t addr) | ||
446 | { | ||
447 | return ((addr >= KVM_PIT_BASE_ADDRESS) && | ||
448 | (addr < KVM_PIT_BASE_ADDRESS + KVM_PIT_MEM_LENGTH)); | ||
449 | } | ||
450 | |||
451 | static void speaker_ioport_write(struct kvm_io_device *this, | ||
452 | gpa_t addr, int len, const void *data) | ||
453 | { | ||
454 | struct kvm_pit *pit = (struct kvm_pit *)this->private; | ||
455 | struct kvm_kpit_state *pit_state = &pit->pit_state; | ||
456 | struct kvm *kvm = pit->kvm; | ||
457 | u32 val = *(u32 *) data; | ||
458 | |||
459 | mutex_lock(&pit_state->lock); | ||
460 | pit_state->speaker_data_on = (val >> 1) & 1; | ||
461 | pit_set_gate(kvm, 2, val & 1); | ||
462 | mutex_unlock(&pit_state->lock); | ||
463 | } | ||
464 | |||
465 | static void speaker_ioport_read(struct kvm_io_device *this, | ||
466 | gpa_t addr, int len, void *data) | ||
467 | { | ||
468 | struct kvm_pit *pit = (struct kvm_pit *)this->private; | ||
469 | struct kvm_kpit_state *pit_state = &pit->pit_state; | ||
470 | struct kvm *kvm = pit->kvm; | ||
471 | unsigned int refresh_clock; | ||
472 | int ret; | ||
473 | |||
474 | /* Refresh clock toggles at about 15us. We approximate as 2^14ns. */ | ||
475 | refresh_clock = ((unsigned int)ktime_to_ns(ktime_get()) >> 14) & 1; | ||
476 | |||
477 | mutex_lock(&pit_state->lock); | ||
478 | ret = ((pit_state->speaker_data_on << 1) | pit_get_gate(kvm, 2) | | ||
479 | (pit_get_out(kvm, 2) << 5) | (refresh_clock << 4)); | ||
480 | if (len > sizeof(ret)) | ||
481 | len = sizeof(ret); | ||
482 | memcpy(data, (char *)&ret, len); | ||
483 | mutex_unlock(&pit_state->lock); | ||
484 | } | ||
485 | |||
486 | static int speaker_in_range(struct kvm_io_device *this, gpa_t addr) | ||
487 | { | ||
488 | return (addr == KVM_SPEAKER_BASE_ADDRESS); | ||
489 | } | ||
490 | |||
491 | void kvm_pit_reset(struct kvm_pit *pit) | ||
492 | { | ||
493 | int i; | ||
494 | struct kvm_kpit_channel_state *c; | ||
495 | |||
496 | mutex_lock(&pit->pit_state.lock); | ||
497 | for (i = 0; i < 3; i++) { | ||
498 | c = &pit->pit_state.channels[i]; | ||
499 | c->mode = 0xff; | ||
500 | c->gate = (i != 2); | ||
501 | pit_load_count(pit->kvm, i, 0); | ||
502 | } | ||
503 | mutex_unlock(&pit->pit_state.lock); | ||
504 | |||
505 | atomic_set(&pit->pit_state.pit_timer.pending, 0); | ||
506 | pit->pit_state.inject_pending = 1; | ||
507 | } | ||
508 | |||
509 | struct kvm_pit *kvm_create_pit(struct kvm *kvm) | ||
510 | { | ||
511 | struct kvm_pit *pit; | ||
512 | struct kvm_kpit_state *pit_state; | ||
513 | |||
514 | pit = kzalloc(sizeof(struct kvm_pit), GFP_KERNEL); | ||
515 | if (!pit) | ||
516 | return NULL; | ||
517 | |||
518 | mutex_init(&pit->pit_state.lock); | ||
519 | mutex_lock(&pit->pit_state.lock); | ||
520 | |||
521 | /* Initialize PIO device */ | ||
522 | pit->dev.read = pit_ioport_read; | ||
523 | pit->dev.write = pit_ioport_write; | ||
524 | pit->dev.in_range = pit_in_range; | ||
525 | pit->dev.private = pit; | ||
526 | kvm_io_bus_register_dev(&kvm->pio_bus, &pit->dev); | ||
527 | |||
528 | pit->speaker_dev.read = speaker_ioport_read; | ||
529 | pit->speaker_dev.write = speaker_ioport_write; | ||
530 | pit->speaker_dev.in_range = speaker_in_range; | ||
531 | pit->speaker_dev.private = pit; | ||
532 | kvm_io_bus_register_dev(&kvm->pio_bus, &pit->speaker_dev); | ||
533 | |||
534 | kvm->arch.vpit = pit; | ||
535 | pit->kvm = kvm; | ||
536 | |||
537 | pit_state = &pit->pit_state; | ||
538 | pit_state->pit = pit; | ||
539 | hrtimer_init(&pit_state->pit_timer.timer, | ||
540 | CLOCK_MONOTONIC, HRTIMER_MODE_ABS); | ||
541 | mutex_unlock(&pit->pit_state.lock); | ||
542 | |||
543 | kvm_pit_reset(pit); | ||
544 | |||
545 | return pit; | ||
546 | } | ||
547 | |||
548 | void kvm_free_pit(struct kvm *kvm) | ||
549 | { | ||
550 | struct hrtimer *timer; | ||
551 | |||
552 | if (kvm->arch.vpit) { | ||
553 | mutex_lock(&kvm->arch.vpit->pit_state.lock); | ||
554 | timer = &kvm->arch.vpit->pit_state.pit_timer.timer; | ||
555 | hrtimer_cancel(timer); | ||
556 | mutex_unlock(&kvm->arch.vpit->pit_state.lock); | ||
557 | kfree(kvm->arch.vpit); | ||
558 | } | ||
559 | } | ||
560 | |||
561 | void __inject_pit_timer_intr(struct kvm *kvm) | ||
562 | { | ||
563 | mutex_lock(&kvm->lock); | ||
564 | kvm_ioapic_set_irq(kvm->arch.vioapic, 0, 1); | ||
565 | kvm_ioapic_set_irq(kvm->arch.vioapic, 0, 0); | ||
566 | kvm_pic_set_irq(pic_irqchip(kvm), 0, 1); | ||
567 | kvm_pic_set_irq(pic_irqchip(kvm), 0, 0); | ||
568 | mutex_unlock(&kvm->lock); | ||
569 | } | ||
570 | |||
571 | void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu) | ||
572 | { | ||
573 | struct kvm_pit *pit = vcpu->kvm->arch.vpit; | ||
574 | struct kvm *kvm = vcpu->kvm; | ||
575 | struct kvm_kpit_state *ps; | ||
576 | |||
577 | if (vcpu && pit) { | ||
578 | ps = &pit->pit_state; | ||
579 | |||
580 | /* Try to inject pending interrupts when: | ||
581 | * 1. Pending exists | ||
582 | * 2. Last interrupt was accepted or waited for too long time*/ | ||
583 | if (atomic_read(&ps->pit_timer.pending) && | ||
584 | (ps->inject_pending || | ||
585 | (jiffies - ps->last_injected_time | ||
586 | >= KVM_MAX_PIT_INTR_INTERVAL))) { | ||
587 | ps->inject_pending = 0; | ||
588 | __inject_pit_timer_intr(kvm); | ||
589 | ps->last_injected_time = jiffies; | ||
590 | } | ||
591 | } | ||
592 | } | ||
593 | |||
594 | void kvm_pit_timer_intr_post(struct kvm_vcpu *vcpu, int vec) | ||
595 | { | ||
596 | struct kvm_arch *arch = &vcpu->kvm->arch; | ||
597 | struct kvm_kpit_state *ps; | ||
598 | |||
599 | if (vcpu && arch->vpit) { | ||
600 | ps = &arch->vpit->pit_state; | ||
601 | if (atomic_read(&ps->pit_timer.pending) && | ||
602 | (((arch->vpic->pics[0].imr & 1) == 0 && | ||
603 | arch->vpic->pics[0].irq_base == vec) || | ||
604 | (arch->vioapic->redirtbl[0].fields.vector == vec && | ||
605 | arch->vioapic->redirtbl[0].fields.mask != 1))) { | ||
606 | ps->inject_pending = 1; | ||
607 | atomic_dec(&ps->pit_timer.pending); | ||
608 | ps->channels[0].count_load_time = ktime_get(); | ||
609 | } | ||
610 | } | ||
611 | } | ||
diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h new file mode 100644 index 000000000000..db25c2a6c8c4 --- /dev/null +++ b/arch/x86/kvm/i8254.h | |||
@@ -0,0 +1,63 @@ | |||
1 | #ifndef __I8254_H | ||
2 | #define __I8254_H | ||
3 | |||
4 | #include "iodev.h" | ||
5 | |||
6 | struct kvm_kpit_timer { | ||
7 | struct hrtimer timer; | ||
8 | int irq; | ||
9 | s64 period; /* unit: ns */ | ||
10 | s64 scheduled; | ||
11 | ktime_t last_update; | ||
12 | atomic_t pending; | ||
13 | }; | ||
14 | |||
15 | struct kvm_kpit_channel_state { | ||
16 | u32 count; /* can be 65536 */ | ||
17 | u16 latched_count; | ||
18 | u8 count_latched; | ||
19 | u8 status_latched; | ||
20 | u8 status; | ||
21 | u8 read_state; | ||
22 | u8 write_state; | ||
23 | u8 write_latch; | ||
24 | u8 rw_mode; | ||
25 | u8 mode; | ||
26 | u8 bcd; /* not supported */ | ||
27 | u8 gate; /* timer start */ | ||
28 | ktime_t count_load_time; | ||
29 | }; | ||
30 | |||
31 | struct kvm_kpit_state { | ||
32 | struct kvm_kpit_channel_state channels[3]; | ||
33 | struct kvm_kpit_timer pit_timer; | ||
34 | u32 speaker_data_on; | ||
35 | struct mutex lock; | ||
36 | struct kvm_pit *pit; | ||
37 | bool inject_pending; /* if inject pending interrupts */ | ||
38 | unsigned long last_injected_time; | ||
39 | }; | ||
40 | |||
41 | struct kvm_pit { | ||
42 | unsigned long base_addresss; | ||
43 | struct kvm_io_device dev; | ||
44 | struct kvm_io_device speaker_dev; | ||
45 | struct kvm *kvm; | ||
46 | struct kvm_kpit_state pit_state; | ||
47 | }; | ||
48 | |||
49 | #define KVM_PIT_BASE_ADDRESS 0x40 | ||
50 | #define KVM_SPEAKER_BASE_ADDRESS 0x61 | ||
51 | #define KVM_PIT_MEM_LENGTH 4 | ||
52 | #define KVM_PIT_FREQ 1193181 | ||
53 | #define KVM_MAX_PIT_INTR_INTERVAL HZ / 100 | ||
54 | #define KVM_PIT_CHANNEL_MASK 0x3 | ||
55 | |||
56 | void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu); | ||
57 | void kvm_pit_timer_intr_post(struct kvm_vcpu *vcpu, int vec); | ||
58 | void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val); | ||
59 | struct kvm_pit *kvm_create_pit(struct kvm *kvm); | ||
60 | void kvm_free_pit(struct kvm *kvm); | ||
61 | void kvm_pit_reset(struct kvm_pit *pit); | ||
62 | |||
63 | #endif | ||
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index e5714759e97f..ce1f583459b1 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c | |||
@@ -23,6 +23,22 @@ | |||
23 | #include <linux/kvm_host.h> | 23 | #include <linux/kvm_host.h> |
24 | 24 | ||
25 | #include "irq.h" | 25 | #include "irq.h" |
26 | #include "i8254.h" | ||
27 | |||
/*
 * Report whether @vcpu has timer events waiting to be processed.
 * The result is the bitwise OR of the PIT and local APIC pending values,
 * so it is non-zero when either source has something pending.
 */
int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
{
	int pit_pending = pit_has_pending_timer(vcpu);
	int apic_pending = apic_has_pending_timer(vcpu);

	return pit_pending | apic_pending;
}
41 | EXPORT_SYMBOL(kvm_cpu_has_pending_timer); | ||
26 | 42 | ||
27 | /* | 43 | /* |
28 | * check if there is pending interrupt without | 44 | * check if there is pending interrupt without |
@@ -66,6 +82,7 @@ EXPORT_SYMBOL_GPL(kvm_cpu_get_interrupt); | |||
66 | void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu) | 82 | void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu) |
67 | { | 83 | { |
68 | kvm_inject_apic_timer_irqs(vcpu); | 84 | kvm_inject_apic_timer_irqs(vcpu); |
85 | kvm_inject_pit_timer_irqs(vcpu); | ||
69 | /* TODO: PIT, RTC etc. */ | 86 | /* TODO: PIT, RTC etc. */ |
70 | } | 87 | } |
71 | EXPORT_SYMBOL_GPL(kvm_inject_pending_timer_irqs); | 88 | EXPORT_SYMBOL_GPL(kvm_inject_pending_timer_irqs); |
@@ -73,6 +90,7 @@ EXPORT_SYMBOL_GPL(kvm_inject_pending_timer_irqs); | |||
73 | void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec) | 90 | void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec) |
74 | { | 91 | { |
75 | kvm_apic_timer_intr_post(vcpu, vec); | 92 | kvm_apic_timer_intr_post(vcpu, vec); |
93 | kvm_pit_timer_intr_post(vcpu, vec); | ||
76 | /* TODO: PIT, RTC etc. */ | 94 | /* TODO: PIT, RTC etc. */ |
77 | } | 95 | } |
78 | EXPORT_SYMBOL_GPL(kvm_timer_intr_post); | 96 | EXPORT_SYMBOL_GPL(kvm_timer_intr_post); |
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index fa5ed5d59b5d..1802134b836f 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h | |||
@@ -85,4 +85,7 @@ void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu); | |||
85 | void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu); | 85 | void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu); |
86 | void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu); | 86 | void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu); |
87 | 87 | ||
88 | int pit_has_pending_timer(struct kvm_vcpu *vcpu); | ||
89 | int apic_has_pending_timer(struct kvm_vcpu *vcpu); | ||
90 | |||
88 | #endif | 91 | #endif |
diff --git a/arch/x86/kvm/kvm_svm.h b/arch/x86/kvm/kvm_svm.h index ecdfe97e4635..65ef0fc2c036 100644 --- a/arch/x86/kvm/kvm_svm.h +++ b/arch/x86/kvm/kvm_svm.h | |||
@@ -39,6 +39,8 @@ struct vcpu_svm { | |||
39 | unsigned long host_db_regs[NUM_DB_REGS]; | 39 | unsigned long host_db_regs[NUM_DB_REGS]; |
40 | unsigned long host_dr6; | 40 | unsigned long host_dr6; |
41 | unsigned long host_dr7; | 41 | unsigned long host_dr7; |
42 | |||
43 | u32 *msrpm; | ||
42 | }; | 44 | }; |
43 | 45 | ||
44 | #endif | 46 | #endif |
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 68a6b1511934..57ac4e4c556a 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c | |||
@@ -338,10 +338,10 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, | |||
338 | } else | 338 | } else |
339 | apic_clear_vector(vector, apic->regs + APIC_TMR); | 339 | apic_clear_vector(vector, apic->regs + APIC_TMR); |
340 | 340 | ||
341 | if (vcpu->arch.mp_state == VCPU_MP_STATE_RUNNABLE) | 341 | if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE) |
342 | kvm_vcpu_kick(vcpu); | 342 | kvm_vcpu_kick(vcpu); |
343 | else if (vcpu->arch.mp_state == VCPU_MP_STATE_HALTED) { | 343 | else if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED) { |
344 | vcpu->arch.mp_state = VCPU_MP_STATE_RUNNABLE; | 344 | vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; |
345 | if (waitqueue_active(&vcpu->wq)) | 345 | if (waitqueue_active(&vcpu->wq)) |
346 | wake_up_interruptible(&vcpu->wq); | 346 | wake_up_interruptible(&vcpu->wq); |
347 | } | 347 | } |
@@ -362,11 +362,11 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, | |||
362 | 362 | ||
363 | case APIC_DM_INIT: | 363 | case APIC_DM_INIT: |
364 | if (level) { | 364 | if (level) { |
365 | if (vcpu->arch.mp_state == VCPU_MP_STATE_RUNNABLE) | 365 | if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE) |
366 | printk(KERN_DEBUG | 366 | printk(KERN_DEBUG |
367 | "INIT on a runnable vcpu %d\n", | 367 | "INIT on a runnable vcpu %d\n", |
368 | vcpu->vcpu_id); | 368 | vcpu->vcpu_id); |
369 | vcpu->arch.mp_state = VCPU_MP_STATE_INIT_RECEIVED; | 369 | vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; |
370 | kvm_vcpu_kick(vcpu); | 370 | kvm_vcpu_kick(vcpu); |
371 | } else { | 371 | } else { |
372 | printk(KERN_DEBUG | 372 | printk(KERN_DEBUG |
@@ -379,9 +379,9 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, | |||
379 | case APIC_DM_STARTUP: | 379 | case APIC_DM_STARTUP: |
380 | printk(KERN_DEBUG "SIPI to vcpu %d vector 0x%02x\n", | 380 | printk(KERN_DEBUG "SIPI to vcpu %d vector 0x%02x\n", |
381 | vcpu->vcpu_id, vector); | 381 | vcpu->vcpu_id, vector); |
382 | if (vcpu->arch.mp_state == VCPU_MP_STATE_INIT_RECEIVED) { | 382 | if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) { |
383 | vcpu->arch.sipi_vector = vector; | 383 | vcpu->arch.sipi_vector = vector; |
384 | vcpu->arch.mp_state = VCPU_MP_STATE_SIPI_RECEIVED; | 384 | vcpu->arch.mp_state = KVM_MP_STATE_SIPI_RECEIVED; |
385 | if (waitqueue_active(&vcpu->wq)) | 385 | if (waitqueue_active(&vcpu->wq)) |
386 | wake_up_interruptible(&vcpu->wq); | 386 | wake_up_interruptible(&vcpu->wq); |
387 | } | 387 | } |
@@ -658,7 +658,7 @@ static void start_apic_timer(struct kvm_lapic *apic) | |||
658 | apic_debug("%s: bus cycle is %" PRId64 "ns, now 0x%016" | 658 | apic_debug("%s: bus cycle is %" PRId64 "ns, now 0x%016" |
659 | PRIx64 ", " | 659 | PRIx64 ", " |
660 | "timer initial count 0x%x, period %lldns, " | 660 | "timer initial count 0x%x, period %lldns, " |
661 | "expire @ 0x%016" PRIx64 ".\n", __FUNCTION__, | 661 | "expire @ 0x%016" PRIx64 ".\n", __func__, |
662 | APIC_BUS_CYCLE_NS, ktime_to_ns(now), | 662 | APIC_BUS_CYCLE_NS, ktime_to_ns(now), |
663 | apic_get_reg(apic, APIC_TMICT), | 663 | apic_get_reg(apic, APIC_TMICT), |
664 | apic->timer.period, | 664 | apic->timer.period, |
@@ -691,7 +691,7 @@ static void apic_mmio_write(struct kvm_io_device *this, | |||
691 | /* too common printing */ | 691 | /* too common printing */ |
692 | if (offset != APIC_EOI) | 692 | if (offset != APIC_EOI) |
693 | apic_debug("%s: offset 0x%x with length 0x%x, and value is " | 693 | apic_debug("%s: offset 0x%x with length 0x%x, and value is " |
694 | "0x%x\n", __FUNCTION__, offset, len, val); | 694 | "0x%x\n", __func__, offset, len, val); |
695 | 695 | ||
696 | offset &= 0xff0; | 696 | offset &= 0xff0; |
697 | 697 | ||
@@ -822,6 +822,7 @@ void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8) | |||
822 | apic_set_tpr(apic, ((cr8 & 0x0f) << 4) | 822 | apic_set_tpr(apic, ((cr8 & 0x0f) << 4) |
823 | | (apic_get_reg(apic, APIC_TASKPRI) & 4)); | 823 | | (apic_get_reg(apic, APIC_TASKPRI) & 4)); |
824 | } | 824 | } |
825 | EXPORT_SYMBOL_GPL(kvm_lapic_set_tpr); | ||
825 | 826 | ||
826 | u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu) | 827 | u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu) |
827 | { | 828 | { |
@@ -869,7 +870,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu) | |||
869 | struct kvm_lapic *apic; | 870 | struct kvm_lapic *apic; |
870 | int i; | 871 | int i; |
871 | 872 | ||
872 | apic_debug("%s\n", __FUNCTION__); | 873 | apic_debug("%s\n", __func__); |
873 | 874 | ||
874 | ASSERT(vcpu); | 875 | ASSERT(vcpu); |
875 | apic = vcpu->arch.apic; | 876 | apic = vcpu->arch.apic; |
@@ -907,7 +908,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu) | |||
907 | apic_update_ppr(apic); | 908 | apic_update_ppr(apic); |
908 | 909 | ||
909 | apic_debug(KERN_INFO "%s: vcpu=%p, id=%d, base_msr=" | 910 | apic_debug(KERN_INFO "%s: vcpu=%p, id=%d, base_msr=" |
910 | "0x%016" PRIx64 ", base_address=0x%0lx.\n", __FUNCTION__, | 911 | "0x%016" PRIx64 ", base_address=0x%0lx.\n", __func__, |
911 | vcpu, kvm_apic_id(apic), | 912 | vcpu, kvm_apic_id(apic), |
912 | vcpu->arch.apic_base, apic->base_address); | 913 | vcpu->arch.apic_base, apic->base_address); |
913 | } | 914 | } |
@@ -940,7 +941,7 @@ static int __apic_timer_fn(struct kvm_lapic *apic) | |||
940 | 941 | ||
941 | atomic_inc(&apic->timer.pending); | 942 | atomic_inc(&apic->timer.pending); |
942 | if (waitqueue_active(q)) { | 943 | if (waitqueue_active(q)) { |
943 | apic->vcpu->arch.mp_state = VCPU_MP_STATE_RUNNABLE; | 944 | apic->vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; |
944 | wake_up_interruptible(q); | 945 | wake_up_interruptible(q); |
945 | } | 946 | } |
946 | if (apic_lvtt_period(apic)) { | 947 | if (apic_lvtt_period(apic)) { |
@@ -952,6 +953,16 @@ static int __apic_timer_fn(struct kvm_lapic *apic) | |||
952 | return result; | 953 | return result; |
953 | } | 954 | } |
954 | 955 | ||
956 | int apic_has_pending_timer(struct kvm_vcpu *vcpu) | ||
957 | { | ||
958 | struct kvm_lapic *lapic = vcpu->arch.apic; | ||
959 | |||
960 | if (lapic) | ||
961 | return atomic_read(&lapic->timer.pending); | ||
962 | |||
963 | return 0; | ||
964 | } | ||
965 | |||
955 | static int __inject_apic_timer_irq(struct kvm_lapic *apic) | 966 | static int __inject_apic_timer_irq(struct kvm_lapic *apic) |
956 | { | 967 | { |
957 | int vector; | 968 | int vector; |
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index e55af12e11b7..2ad6f5481671 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c | |||
@@ -27,11 +27,22 @@ | |||
27 | #include <linux/highmem.h> | 27 | #include <linux/highmem.h> |
28 | #include <linux/module.h> | 28 | #include <linux/module.h> |
29 | #include <linux/swap.h> | 29 | #include <linux/swap.h> |
30 | #include <linux/hugetlb.h> | ||
31 | #include <linux/compiler.h> | ||
30 | 32 | ||
31 | #include <asm/page.h> | 33 | #include <asm/page.h> |
32 | #include <asm/cmpxchg.h> | 34 | #include <asm/cmpxchg.h> |
33 | #include <asm/io.h> | 35 | #include <asm/io.h> |
34 | 36 | ||
37 | /* | ||
38 | * When setting this variable to true it enables Two-Dimensional-Paging | ||
39 | * where the hardware walks 2 page tables: | ||
40 | * 1. the guest-virtual to guest-physical | ||
41 | * 2. while doing 1. it walks guest-physical to host-physical | ||
42 | * If the hardware supports that we don't need to do shadow paging. | ||
43 | */ | ||
44 | bool tdp_enabled = false; | ||
45 | |||
35 | #undef MMU_DEBUG | 46 | #undef MMU_DEBUG |
36 | 47 | ||
37 | #undef AUDIT | 48 | #undef AUDIT |
@@ -101,8 +112,6 @@ static int dbg = 1; | |||
101 | #define PT_FIRST_AVAIL_BITS_SHIFT 9 | 112 | #define PT_FIRST_AVAIL_BITS_SHIFT 9 |
102 | #define PT64_SECOND_AVAIL_BITS_SHIFT 52 | 113 | #define PT64_SECOND_AVAIL_BITS_SHIFT 52 |
103 | 114 | ||
104 | #define PT_SHADOW_IO_MARK (1ULL << PT_FIRST_AVAIL_BITS_SHIFT) | ||
105 | |||
106 | #define VALID_PAGE(x) ((x) != INVALID_PAGE) | 115 | #define VALID_PAGE(x) ((x) != INVALID_PAGE) |
107 | 116 | ||
108 | #define PT64_LEVEL_BITS 9 | 117 | #define PT64_LEVEL_BITS 9 |
@@ -159,6 +168,13 @@ static int dbg = 1; | |||
159 | #define ACC_USER_MASK PT_USER_MASK | 168 | #define ACC_USER_MASK PT_USER_MASK |
160 | #define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK) | 169 | #define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK) |
161 | 170 | ||
/*
 * Buffer descriptor for paravirtual MMU operations.
 * NOTE(review): the users of this structure are outside the visible hunk;
 * field roles below are inferred from the names only -- confirm.
 */
struct kvm_pv_mmu_op_buffer {
	void *ptr;		/* presumably the current position in buf */
	unsigned len;		/* presumably the bytes available at ptr */
	unsigned processed;	/* presumably the bytes consumed so far */
	char buf[512] __aligned(sizeof(long));	/* long-aligned storage */
};
177 | |||
162 | struct kvm_rmap_desc { | 178 | struct kvm_rmap_desc { |
163 | u64 *shadow_ptes[RMAP_EXT]; | 179 | u64 *shadow_ptes[RMAP_EXT]; |
164 | struct kvm_rmap_desc *more; | 180 | struct kvm_rmap_desc *more; |
@@ -200,11 +216,15 @@ static int is_present_pte(unsigned long pte) | |||
200 | 216 | ||
201 | static int is_shadow_present_pte(u64 pte) | 217 | static int is_shadow_present_pte(u64 pte) |
202 | { | 218 | { |
203 | pte &= ~PT_SHADOW_IO_MARK; | ||
204 | return pte != shadow_trap_nonpresent_pte | 219 | return pte != shadow_trap_nonpresent_pte |
205 | && pte != shadow_notrap_nonpresent_pte; | 220 | && pte != shadow_notrap_nonpresent_pte; |
206 | } | 221 | } |
207 | 222 | ||
223 | static int is_large_pte(u64 pte) | ||
224 | { | ||
225 | return pte & PT_PAGE_SIZE_MASK; | ||
226 | } | ||
227 | |||
208 | static int is_writeble_pte(unsigned long pte) | 228 | static int is_writeble_pte(unsigned long pte) |
209 | { | 229 | { |
210 | return pte & PT_WRITABLE_MASK; | 230 | return pte & PT_WRITABLE_MASK; |
@@ -215,14 +235,14 @@ static int is_dirty_pte(unsigned long pte) | |||
215 | return pte & PT_DIRTY_MASK; | 235 | return pte & PT_DIRTY_MASK; |
216 | } | 236 | } |
217 | 237 | ||
218 | static int is_io_pte(unsigned long pte) | 238 | static int is_rmap_pte(u64 pte) |
219 | { | 239 | { |
220 | return pte & PT_SHADOW_IO_MARK; | 240 | return is_shadow_present_pte(pte); |
221 | } | 241 | } |
222 | 242 | ||
223 | static int is_rmap_pte(u64 pte) | 243 | static pfn_t spte_to_pfn(u64 pte) |
224 | { | 244 | { |
225 | return is_shadow_present_pte(pte); | 245 | return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; |
226 | } | 246 | } |
227 | 247 | ||
228 | static gfn_t pse36_gfn_delta(u32 gpte) | 248 | static gfn_t pse36_gfn_delta(u32 gpte) |
@@ -349,16 +369,100 @@ static void mmu_free_rmap_desc(struct kvm_rmap_desc *rd) | |||
349 | } | 369 | } |
350 | 370 | ||
351 | /* | 371 | /* |
372 | * Return the pointer to the largepage write count for a given | ||
373 | * gfn, handling slots that are not large page aligned. | ||
374 | */ | ||
375 | static int *slot_largepage_idx(gfn_t gfn, struct kvm_memory_slot *slot) | ||
376 | { | ||
377 | unsigned long idx; | ||
378 | |||
379 | idx = (gfn / KVM_PAGES_PER_HPAGE) - | ||
380 | (slot->base_gfn / KVM_PAGES_PER_HPAGE); | ||
381 | return &slot->lpage_info[idx].write_count; | ||
382 | } | ||
383 | |||
384 | static void account_shadowed(struct kvm *kvm, gfn_t gfn) | ||
385 | { | ||
386 | int *write_count; | ||
387 | |||
388 | write_count = slot_largepage_idx(gfn, gfn_to_memslot(kvm, gfn)); | ||
389 | *write_count += 1; | ||
390 | WARN_ON(*write_count > KVM_PAGES_PER_HPAGE); | ||
391 | } | ||
392 | |||
393 | static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn) | ||
394 | { | ||
395 | int *write_count; | ||
396 | |||
397 | write_count = slot_largepage_idx(gfn, gfn_to_memslot(kvm, gfn)); | ||
398 | *write_count -= 1; | ||
399 | WARN_ON(*write_count < 0); | ||
400 | } | ||
401 | |||
402 | static int has_wrprotected_page(struct kvm *kvm, gfn_t gfn) | ||
403 | { | ||
404 | struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn); | ||
405 | int *largepage_idx; | ||
406 | |||
407 | if (slot) { | ||
408 | largepage_idx = slot_largepage_idx(gfn, slot); | ||
409 | return *largepage_idx; | ||
410 | } | ||
411 | |||
412 | return 1; | ||
413 | } | ||
414 | |||
415 | static int host_largepage_backed(struct kvm *kvm, gfn_t gfn) | ||
416 | { | ||
417 | struct vm_area_struct *vma; | ||
418 | unsigned long addr; | ||
419 | |||
420 | addr = gfn_to_hva(kvm, gfn); | ||
421 | if (kvm_is_error_hva(addr)) | ||
422 | return 0; | ||
423 | |||
424 | vma = find_vma(current->mm, addr); | ||
425 | if (vma && is_vm_hugetlb_page(vma)) | ||
426 | return 1; | ||
427 | |||
428 | return 0; | ||
429 | } | ||
430 | |||
431 | static int is_largepage_backed(struct kvm_vcpu *vcpu, gfn_t large_gfn) | ||
432 | { | ||
433 | struct kvm_memory_slot *slot; | ||
434 | |||
435 | if (has_wrprotected_page(vcpu->kvm, large_gfn)) | ||
436 | return 0; | ||
437 | |||
438 | if (!host_largepage_backed(vcpu->kvm, large_gfn)) | ||
439 | return 0; | ||
440 | |||
441 | slot = gfn_to_memslot(vcpu->kvm, large_gfn); | ||
442 | if (slot && slot->dirty_bitmap) | ||
443 | return 0; | ||
444 | |||
445 | return 1; | ||
446 | } | ||
447 | |||
448 | /* | ||
352 | * Take gfn and return the reverse mapping to it. | 449 | * Take gfn and return the reverse mapping to it. |
353 | * Note: gfn must be unaliased before this function get called | 450 | * Note: gfn must be unaliased before this function get called |
354 | */ | 451 | */ |
355 | 452 | ||
356 | static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn) | 453 | static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int lpage) |
357 | { | 454 | { |
358 | struct kvm_memory_slot *slot; | 455 | struct kvm_memory_slot *slot; |
456 | unsigned long idx; | ||
359 | 457 | ||
360 | slot = gfn_to_memslot(kvm, gfn); | 458 | slot = gfn_to_memslot(kvm, gfn); |
361 | return &slot->rmap[gfn - slot->base_gfn]; | 459 | if (!lpage) |
460 | return &slot->rmap[gfn - slot->base_gfn]; | ||
461 | |||
462 | idx = (gfn / KVM_PAGES_PER_HPAGE) - | ||
463 | (slot->base_gfn / KVM_PAGES_PER_HPAGE); | ||
464 | |||
465 | return &slot->lpage_info[idx].rmap_pde; | ||
362 | } | 466 | } |
363 | 467 | ||
364 | /* | 468 | /* |
@@ -370,7 +474,7 @@ static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn) | |||
370 | * If rmapp bit zero is one, (then rmap & ~1) points to a struct kvm_rmap_desc | 474 | * If rmapp bit zero is one, (then rmap & ~1) points to a struct kvm_rmap_desc |
371 | * containing more mappings. | 475 | * containing more mappings. |
372 | */ | 476 | */ |
373 | static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) | 477 | static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn, int lpage) |
374 | { | 478 | { |
375 | struct kvm_mmu_page *sp; | 479 | struct kvm_mmu_page *sp; |
376 | struct kvm_rmap_desc *desc; | 480 | struct kvm_rmap_desc *desc; |
@@ -382,7 +486,7 @@ static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) | |||
382 | gfn = unalias_gfn(vcpu->kvm, gfn); | 486 | gfn = unalias_gfn(vcpu->kvm, gfn); |
383 | sp = page_header(__pa(spte)); | 487 | sp = page_header(__pa(spte)); |
384 | sp->gfns[spte - sp->spt] = gfn; | 488 | sp->gfns[spte - sp->spt] = gfn; |
385 | rmapp = gfn_to_rmap(vcpu->kvm, gfn); | 489 | rmapp = gfn_to_rmap(vcpu->kvm, gfn, lpage); |
386 | if (!*rmapp) { | 490 | if (!*rmapp) { |
387 | rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte); | 491 | rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte); |
388 | *rmapp = (unsigned long)spte; | 492 | *rmapp = (unsigned long)spte; |
@@ -435,20 +539,21 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) | |||
435 | struct kvm_rmap_desc *desc; | 539 | struct kvm_rmap_desc *desc; |
436 | struct kvm_rmap_desc *prev_desc; | 540 | struct kvm_rmap_desc *prev_desc; |
437 | struct kvm_mmu_page *sp; | 541 | struct kvm_mmu_page *sp; |
438 | struct page *page; | 542 | pfn_t pfn; |
439 | unsigned long *rmapp; | 543 | unsigned long *rmapp; |
440 | int i; | 544 | int i; |
441 | 545 | ||
442 | if (!is_rmap_pte(*spte)) | 546 | if (!is_rmap_pte(*spte)) |
443 | return; | 547 | return; |
444 | sp = page_header(__pa(spte)); | 548 | sp = page_header(__pa(spte)); |
445 | page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT); | 549 | pfn = spte_to_pfn(*spte); |
446 | mark_page_accessed(page); | 550 | if (*spte & PT_ACCESSED_MASK) |
551 | kvm_set_pfn_accessed(pfn); | ||
447 | if (is_writeble_pte(*spte)) | 552 | if (is_writeble_pte(*spte)) |
448 | kvm_release_page_dirty(page); | 553 | kvm_release_pfn_dirty(pfn); |
449 | else | 554 | else |
450 | kvm_release_page_clean(page); | 555 | kvm_release_pfn_clean(pfn); |
451 | rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt]); | 556 | rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt], is_large_pte(*spte)); |
452 | if (!*rmapp) { | 557 | if (!*rmapp) { |
453 | printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte); | 558 | printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte); |
454 | BUG(); | 559 | BUG(); |
@@ -514,7 +619,7 @@ static void rmap_write_protect(struct kvm *kvm, u64 gfn) | |||
514 | int write_protected = 0; | 619 | int write_protected = 0; |
515 | 620 | ||
516 | gfn = unalias_gfn(kvm, gfn); | 621 | gfn = unalias_gfn(kvm, gfn); |
517 | rmapp = gfn_to_rmap(kvm, gfn); | 622 | rmapp = gfn_to_rmap(kvm, gfn, 0); |
518 | 623 | ||
519 | spte = rmap_next(kvm, rmapp, NULL); | 624 | spte = rmap_next(kvm, rmapp, NULL); |
520 | while (spte) { | 625 | while (spte) { |
@@ -527,8 +632,35 @@ static void rmap_write_protect(struct kvm *kvm, u64 gfn) | |||
527 | } | 632 | } |
528 | spte = rmap_next(kvm, rmapp, spte); | 633 | spte = rmap_next(kvm, rmapp, spte); |
529 | } | 634 | } |
635 | if (write_protected) { | ||
636 | pfn_t pfn; | ||
637 | |||
638 | spte = rmap_next(kvm, rmapp, NULL); | ||
639 | pfn = spte_to_pfn(*spte); | ||
640 | kvm_set_pfn_dirty(pfn); | ||
641 | } | ||
642 | |||
643 | /* check for huge page mappings */ | ||
644 | rmapp = gfn_to_rmap(kvm, gfn, 1); | ||
645 | spte = rmap_next(kvm, rmapp, NULL); | ||
646 | while (spte) { | ||
647 | BUG_ON(!spte); | ||
648 | BUG_ON(!(*spte & PT_PRESENT_MASK)); | ||
649 | BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)); | ||
650 | pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn); | ||
651 | if (is_writeble_pte(*spte)) { | ||
652 | rmap_remove(kvm, spte); | ||
653 | --kvm->stat.lpages; | ||
654 | set_shadow_pte(spte, shadow_trap_nonpresent_pte); | ||
655 | write_protected = 1; | ||
656 | } | ||
657 | spte = rmap_next(kvm, rmapp, spte); | ||
658 | } | ||
659 | |||
530 | if (write_protected) | 660 | if (write_protected) |
531 | kvm_flush_remote_tlbs(kvm); | 661 | kvm_flush_remote_tlbs(kvm); |
662 | |||
663 | account_shadowed(kvm, gfn); | ||
532 | } | 664 | } |
533 | 665 | ||
534 | #ifdef MMU_DEBUG | 666 | #ifdef MMU_DEBUG |
@@ -538,8 +670,8 @@ static int is_empty_shadow_page(u64 *spt) | |||
538 | u64 *end; | 670 | u64 *end; |
539 | 671 | ||
540 | for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++) | 672 | for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++) |
541 | if ((*pos & ~PT_SHADOW_IO_MARK) != shadow_trap_nonpresent_pte) { | 673 | if (*pos != shadow_trap_nonpresent_pte) { |
542 | printk(KERN_ERR "%s: %p %llx\n", __FUNCTION__, | 674 | printk(KERN_ERR "%s: %p %llx\n", __func__, |
543 | pos, *pos); | 675 | pos, *pos); |
544 | return 0; | 676 | return 0; |
545 | } | 677 | } |
@@ -559,7 +691,7 @@ static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp) | |||
559 | 691 | ||
560 | static unsigned kvm_page_table_hashfn(gfn_t gfn) | 692 | static unsigned kvm_page_table_hashfn(gfn_t gfn) |
561 | { | 693 | { |
562 | return gfn; | 694 | return gfn & ((1 << KVM_MMU_HASH_SHIFT) - 1); |
563 | } | 695 | } |
564 | 696 | ||
565 | static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, | 697 | static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, |
@@ -662,13 +794,14 @@ static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn) | |||
662 | struct kvm_mmu_page *sp; | 794 | struct kvm_mmu_page *sp; |
663 | struct hlist_node *node; | 795 | struct hlist_node *node; |
664 | 796 | ||
665 | pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn); | 797 | pgprintk("%s: looking for gfn %lx\n", __func__, gfn); |
666 | index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES; | 798 | index = kvm_page_table_hashfn(gfn); |
667 | bucket = &kvm->arch.mmu_page_hash[index]; | 799 | bucket = &kvm->arch.mmu_page_hash[index]; |
668 | hlist_for_each_entry(sp, node, bucket, hash_link) | 800 | hlist_for_each_entry(sp, node, bucket, hash_link) |
669 | if (sp->gfn == gfn && !sp->role.metaphysical) { | 801 | if (sp->gfn == gfn && !sp->role.metaphysical |
802 | && !sp->role.invalid) { | ||
670 | pgprintk("%s: found role %x\n", | 803 | pgprintk("%s: found role %x\n", |
671 | __FUNCTION__, sp->role.word); | 804 | __func__, sp->role.word); |
672 | return sp; | 805 | return sp; |
673 | } | 806 | } |
674 | return NULL; | 807 | return NULL; |
@@ -699,27 +832,27 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, | |||
699 | quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1; | 832 | quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1; |
700 | role.quadrant = quadrant; | 833 | role.quadrant = quadrant; |
701 | } | 834 | } |
702 | pgprintk("%s: looking gfn %lx role %x\n", __FUNCTION__, | 835 | pgprintk("%s: looking gfn %lx role %x\n", __func__, |
703 | gfn, role.word); | 836 | gfn, role.word); |
704 | index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES; | 837 | index = kvm_page_table_hashfn(gfn); |
705 | bucket = &vcpu->kvm->arch.mmu_page_hash[index]; | 838 | bucket = &vcpu->kvm->arch.mmu_page_hash[index]; |
706 | hlist_for_each_entry(sp, node, bucket, hash_link) | 839 | hlist_for_each_entry(sp, node, bucket, hash_link) |
707 | if (sp->gfn == gfn && sp->role.word == role.word) { | 840 | if (sp->gfn == gfn && sp->role.word == role.word) { |
708 | mmu_page_add_parent_pte(vcpu, sp, parent_pte); | 841 | mmu_page_add_parent_pte(vcpu, sp, parent_pte); |
709 | pgprintk("%s: found\n", __FUNCTION__); | 842 | pgprintk("%s: found\n", __func__); |
710 | return sp; | 843 | return sp; |
711 | } | 844 | } |
712 | ++vcpu->kvm->stat.mmu_cache_miss; | 845 | ++vcpu->kvm->stat.mmu_cache_miss; |
713 | sp = kvm_mmu_alloc_page(vcpu, parent_pte); | 846 | sp = kvm_mmu_alloc_page(vcpu, parent_pte); |
714 | if (!sp) | 847 | if (!sp) |
715 | return sp; | 848 | return sp; |
716 | pgprintk("%s: adding gfn %lx role %x\n", __FUNCTION__, gfn, role.word); | 849 | pgprintk("%s: adding gfn %lx role %x\n", __func__, gfn, role.word); |
717 | sp->gfn = gfn; | 850 | sp->gfn = gfn; |
718 | sp->role = role; | 851 | sp->role = role; |
719 | hlist_add_head(&sp->hash_link, bucket); | 852 | hlist_add_head(&sp->hash_link, bucket); |
720 | vcpu->arch.mmu.prefetch_page(vcpu, sp); | ||
721 | if (!metaphysical) | 853 | if (!metaphysical) |
722 | rmap_write_protect(vcpu->kvm, gfn); | 854 | rmap_write_protect(vcpu->kvm, gfn); |
855 | vcpu->arch.mmu.prefetch_page(vcpu, sp); | ||
723 | return sp; | 856 | return sp; |
724 | } | 857 | } |
725 | 858 | ||
@@ -745,11 +878,17 @@ static void kvm_mmu_page_unlink_children(struct kvm *kvm, | |||
745 | for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { | 878 | for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { |
746 | ent = pt[i]; | 879 | ent = pt[i]; |
747 | 880 | ||
881 | if (is_shadow_present_pte(ent)) { | ||
882 | if (!is_large_pte(ent)) { | ||
883 | ent &= PT64_BASE_ADDR_MASK; | ||
884 | mmu_page_remove_parent_pte(page_header(ent), | ||
885 | &pt[i]); | ||
886 | } else { | ||
887 | --kvm->stat.lpages; | ||
888 | rmap_remove(kvm, &pt[i]); | ||
889 | } | ||
890 | } | ||
748 | pt[i] = shadow_trap_nonpresent_pte; | 891 | pt[i] = shadow_trap_nonpresent_pte; |
749 | if (!is_shadow_present_pte(ent)) | ||
750 | continue; | ||
751 | ent &= PT64_BASE_ADDR_MASK; | ||
752 | mmu_page_remove_parent_pte(page_header(ent), &pt[i]); | ||
753 | } | 892 | } |
754 | kvm_flush_remote_tlbs(kvm); | 893 | kvm_flush_remote_tlbs(kvm); |
755 | } | 894 | } |
@@ -789,10 +928,15 @@ static void kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp) | |||
789 | } | 928 | } |
790 | kvm_mmu_page_unlink_children(kvm, sp); | 929 | kvm_mmu_page_unlink_children(kvm, sp); |
791 | if (!sp->root_count) { | 930 | if (!sp->root_count) { |
931 | if (!sp->role.metaphysical) | ||
932 | unaccount_shadowed(kvm, sp->gfn); | ||
792 | hlist_del(&sp->hash_link); | 933 | hlist_del(&sp->hash_link); |
793 | kvm_mmu_free_page(kvm, sp); | 934 | kvm_mmu_free_page(kvm, sp); |
794 | } else | 935 | } else { |
795 | list_move(&sp->link, &kvm->arch.active_mmu_pages); | 936 | list_move(&sp->link, &kvm->arch.active_mmu_pages); |
937 | sp->role.invalid = 1; | ||
938 | kvm_reload_remote_mmus(kvm); | ||
939 | } | ||
796 | kvm_mmu_reset_last_pte_updated(kvm); | 940 | kvm_mmu_reset_last_pte_updated(kvm); |
797 | } | 941 | } |
798 | 942 | ||
@@ -838,13 +982,13 @@ static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) | |||
838 | struct hlist_node *node, *n; | 982 | struct hlist_node *node, *n; |
839 | int r; | 983 | int r; |
840 | 984 | ||
841 | pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn); | 985 | pgprintk("%s: looking for gfn %lx\n", __func__, gfn); |
842 | r = 0; | 986 | r = 0; |
843 | index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES; | 987 | index = kvm_page_table_hashfn(gfn); |
844 | bucket = &kvm->arch.mmu_page_hash[index]; | 988 | bucket = &kvm->arch.mmu_page_hash[index]; |
845 | hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) | 989 | hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) |
846 | if (sp->gfn == gfn && !sp->role.metaphysical) { | 990 | if (sp->gfn == gfn && !sp->role.metaphysical) { |
847 | pgprintk("%s: gfn %lx role %x\n", __FUNCTION__, gfn, | 991 | pgprintk("%s: gfn %lx role %x\n", __func__, gfn, |
848 | sp->role.word); | 992 | sp->role.word); |
849 | kvm_mmu_zap_page(kvm, sp); | 993 | kvm_mmu_zap_page(kvm, sp); |
850 | r = 1; | 994 | r = 1; |
@@ -857,7 +1001,7 @@ static void mmu_unshadow(struct kvm *kvm, gfn_t gfn) | |||
857 | struct kvm_mmu_page *sp; | 1001 | struct kvm_mmu_page *sp; |
858 | 1002 | ||
859 | while ((sp = kvm_mmu_lookup_page(kvm, gfn)) != NULL) { | 1003 | while ((sp = kvm_mmu_lookup_page(kvm, gfn)) != NULL) { |
860 | pgprintk("%s: zap %lx %x\n", __FUNCTION__, gfn, sp->role.word); | 1004 | pgprintk("%s: zap %lx %x\n", __func__, gfn, sp->role.word); |
861 | kvm_mmu_zap_page(kvm, sp); | 1005 | kvm_mmu_zap_page(kvm, sp); |
862 | } | 1006 | } |
863 | } | 1007 | } |
@@ -889,26 +1033,39 @@ struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva) | |||
889 | static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, | 1033 | static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, |
890 | unsigned pt_access, unsigned pte_access, | 1034 | unsigned pt_access, unsigned pte_access, |
891 | int user_fault, int write_fault, int dirty, | 1035 | int user_fault, int write_fault, int dirty, |
892 | int *ptwrite, gfn_t gfn, struct page *page) | 1036 | int *ptwrite, int largepage, gfn_t gfn, |
1037 | pfn_t pfn, bool speculative) | ||
893 | { | 1038 | { |
894 | u64 spte; | 1039 | u64 spte; |
895 | int was_rmapped = 0; | 1040 | int was_rmapped = 0; |
896 | int was_writeble = is_writeble_pte(*shadow_pte); | 1041 | int was_writeble = is_writeble_pte(*shadow_pte); |
897 | hfn_t host_pfn = (*shadow_pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; | ||
898 | 1042 | ||
899 | pgprintk("%s: spte %llx access %x write_fault %d" | 1043 | pgprintk("%s: spte %llx access %x write_fault %d" |
900 | " user_fault %d gfn %lx\n", | 1044 | " user_fault %d gfn %lx\n", |
901 | __FUNCTION__, *shadow_pte, pt_access, | 1045 | __func__, *shadow_pte, pt_access, |
902 | write_fault, user_fault, gfn); | 1046 | write_fault, user_fault, gfn); |
903 | 1047 | ||
904 | if (is_rmap_pte(*shadow_pte)) { | 1048 | if (is_rmap_pte(*shadow_pte)) { |
905 | if (host_pfn != page_to_pfn(page)) { | 1049 | /* |
1050 | * If we overwrite a PTE page pointer with a 2MB PMD, unlink | ||
1051 | * the parent of the now unreachable PTE. | ||
1052 | */ | ||
1053 | if (largepage && !is_large_pte(*shadow_pte)) { | ||
1054 | struct kvm_mmu_page *child; | ||
1055 | u64 pte = *shadow_pte; | ||
1056 | |||
1057 | child = page_header(pte & PT64_BASE_ADDR_MASK); | ||
1058 | mmu_page_remove_parent_pte(child, shadow_pte); | ||
1059 | } else if (pfn != spte_to_pfn(*shadow_pte)) { | ||
906 | pgprintk("hfn old %lx new %lx\n", | 1060 | pgprintk("hfn old %lx new %lx\n", |
907 | host_pfn, page_to_pfn(page)); | 1061 | spte_to_pfn(*shadow_pte), pfn); |
908 | rmap_remove(vcpu->kvm, shadow_pte); | 1062 | rmap_remove(vcpu->kvm, shadow_pte); |
1063 | } else { | ||
1064 | if (largepage) | ||
1065 | was_rmapped = is_large_pte(*shadow_pte); | ||
1066 | else | ||
1067 | was_rmapped = 1; | ||
909 | } | 1068 | } |
910 | else | ||
911 | was_rmapped = 1; | ||
912 | } | 1069 | } |
913 | 1070 | ||
914 | /* | 1071 | /* |
@@ -917,6 +1074,8 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, | |||
917 | * demand paging). | 1074 | * demand paging). |
918 | */ | 1075 | */ |
919 | spte = PT_PRESENT_MASK | PT_DIRTY_MASK; | 1076 | spte = PT_PRESENT_MASK | PT_DIRTY_MASK; |
1077 | if (!speculative) | ||
1078 | pte_access |= PT_ACCESSED_MASK; | ||
920 | if (!dirty) | 1079 | if (!dirty) |
921 | pte_access &= ~ACC_WRITE_MASK; | 1080 | pte_access &= ~ACC_WRITE_MASK; |
922 | if (!(pte_access & ACC_EXEC_MASK)) | 1081 | if (!(pte_access & ACC_EXEC_MASK)) |
@@ -925,15 +1084,10 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, | |||
925 | spte |= PT_PRESENT_MASK; | 1084 | spte |= PT_PRESENT_MASK; |
926 | if (pte_access & ACC_USER_MASK) | 1085 | if (pte_access & ACC_USER_MASK) |
927 | spte |= PT_USER_MASK; | 1086 | spte |= PT_USER_MASK; |
1087 | if (largepage) | ||
1088 | spte |= PT_PAGE_SIZE_MASK; | ||
928 | 1089 | ||
929 | if (is_error_page(page)) { | 1090 | spte |= (u64)pfn << PAGE_SHIFT; |
930 | set_shadow_pte(shadow_pte, | ||
931 | shadow_trap_nonpresent_pte | PT_SHADOW_IO_MARK); | ||
932 | kvm_release_page_clean(page); | ||
933 | return; | ||
934 | } | ||
935 | |||
936 | spte |= page_to_phys(page); | ||
937 | 1091 | ||
938 | if ((pte_access & ACC_WRITE_MASK) | 1092 | if ((pte_access & ACC_WRITE_MASK) |
939 | || (write_fault && !is_write_protection(vcpu) && !user_fault)) { | 1093 | || (write_fault && !is_write_protection(vcpu) && !user_fault)) { |
@@ -946,9 +1100,10 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, | |||
946 | } | 1100 | } |
947 | 1101 | ||
948 | shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn); | 1102 | shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn); |
949 | if (shadow) { | 1103 | if (shadow || |
1104 | (largepage && has_wrprotected_page(vcpu->kvm, gfn))) { | ||
950 | pgprintk("%s: found shadow page for %lx, marking ro\n", | 1105 | pgprintk("%s: found shadow page for %lx, marking ro\n", |
951 | __FUNCTION__, gfn); | 1106 | __func__, gfn); |
952 | pte_access &= ~ACC_WRITE_MASK; | 1107 | pte_access &= ~ACC_WRITE_MASK; |
953 | if (is_writeble_pte(spte)) { | 1108 | if (is_writeble_pte(spte)) { |
954 | spte &= ~PT_WRITABLE_MASK; | 1109 | spte &= ~PT_WRITABLE_MASK; |
@@ -964,18 +1119,25 @@ unshadowed: | |||
964 | if (pte_access & ACC_WRITE_MASK) | 1119 | if (pte_access & ACC_WRITE_MASK) |
965 | mark_page_dirty(vcpu->kvm, gfn); | 1120 | mark_page_dirty(vcpu->kvm, gfn); |
966 | 1121 | ||
967 | pgprintk("%s: setting spte %llx\n", __FUNCTION__, spte); | 1122 | pgprintk("%s: setting spte %llx\n", __func__, spte); |
1123 | pgprintk("instantiating %s PTE (%s) at %d (%llx) addr %llx\n", | ||
1124 | (spte&PT_PAGE_SIZE_MASK)? "2MB" : "4kB", | ||
1125 | (spte&PT_WRITABLE_MASK)?"RW":"R", gfn, spte, shadow_pte); | ||
968 | set_shadow_pte(shadow_pte, spte); | 1126 | set_shadow_pte(shadow_pte, spte); |
1127 | if (!was_rmapped && (spte & PT_PAGE_SIZE_MASK) | ||
1128 | && (spte & PT_PRESENT_MASK)) | ||
1129 | ++vcpu->kvm->stat.lpages; | ||
1130 | |||
969 | page_header_update_slot(vcpu->kvm, shadow_pte, gfn); | 1131 | page_header_update_slot(vcpu->kvm, shadow_pte, gfn); |
970 | if (!was_rmapped) { | 1132 | if (!was_rmapped) { |
971 | rmap_add(vcpu, shadow_pte, gfn); | 1133 | rmap_add(vcpu, shadow_pte, gfn, largepage); |
972 | if (!is_rmap_pte(*shadow_pte)) | 1134 | if (!is_rmap_pte(*shadow_pte)) |
973 | kvm_release_page_clean(page); | 1135 | kvm_release_pfn_clean(pfn); |
974 | } else { | 1136 | } else { |
975 | if (was_writeble) | 1137 | if (was_writeble) |
976 | kvm_release_page_dirty(page); | 1138 | kvm_release_pfn_dirty(pfn); |
977 | else | 1139 | else |
978 | kvm_release_page_clean(page); | 1140 | kvm_release_pfn_clean(pfn); |
979 | } | 1141 | } |
980 | if (!ptwrite || !*ptwrite) | 1142 | if (!ptwrite || !*ptwrite) |
981 | vcpu->arch.last_pte_updated = shadow_pte; | 1143 | vcpu->arch.last_pte_updated = shadow_pte; |
@@ -985,10 +1147,10 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu) | |||
985 | { | 1147 | { |
986 | } | 1148 | } |
987 | 1149 | ||
988 | static int __nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, | 1150 | static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, |
989 | gfn_t gfn, struct page *page) | 1151 | int largepage, gfn_t gfn, pfn_t pfn, |
1152 | int level) | ||
990 | { | 1153 | { |
991 | int level = PT32E_ROOT_LEVEL; | ||
992 | hpa_t table_addr = vcpu->arch.mmu.root_hpa; | 1154 | hpa_t table_addr = vcpu->arch.mmu.root_hpa; |
993 | int pt_write = 0; | 1155 | int pt_write = 0; |
994 | 1156 | ||
@@ -1001,8 +1163,14 @@ static int __nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, | |||
1001 | 1163 | ||
1002 | if (level == 1) { | 1164 | if (level == 1) { |
1003 | mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL, | 1165 | mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL, |
1004 | 0, write, 1, &pt_write, gfn, page); | 1166 | 0, write, 1, &pt_write, 0, gfn, pfn, false); |
1005 | return pt_write || is_io_pte(table[index]); | 1167 | return pt_write; |
1168 | } | ||
1169 | |||
1170 | if (largepage && level == 2) { | ||
1171 | mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL, | ||
1172 | 0, write, 1, &pt_write, 1, gfn, pfn, false); | ||
1173 | return pt_write; | ||
1006 | } | 1174 | } |
1007 | 1175 | ||
1008 | if (table[index] == shadow_trap_nonpresent_pte) { | 1176 | if (table[index] == shadow_trap_nonpresent_pte) { |
@@ -1016,7 +1184,7 @@ static int __nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, | |||
1016 | 1, ACC_ALL, &table[index]); | 1184 | 1, ACC_ALL, &table[index]); |
1017 | if (!new_table) { | 1185 | if (!new_table) { |
1018 | pgprintk("nonpaging_map: ENOMEM\n"); | 1186 | pgprintk("nonpaging_map: ENOMEM\n"); |
1019 | kvm_release_page_clean(page); | 1187 | kvm_release_pfn_clean(pfn); |
1020 | return -ENOMEM; | 1188 | return -ENOMEM; |
1021 | } | 1189 | } |
1022 | 1190 | ||
@@ -1030,21 +1198,30 @@ static int __nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, | |||
1030 | static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) | 1198 | static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) |
1031 | { | 1199 | { |
1032 | int r; | 1200 | int r; |
1033 | 1201 | int largepage = 0; | |
1034 | struct page *page; | 1202 | pfn_t pfn; |
1035 | |||
1036 | down_read(&vcpu->kvm->slots_lock); | ||
1037 | 1203 | ||
1038 | down_read(¤t->mm->mmap_sem); | 1204 | down_read(¤t->mm->mmap_sem); |
1039 | page = gfn_to_page(vcpu->kvm, gfn); | 1205 | if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) { |
1206 | gfn &= ~(KVM_PAGES_PER_HPAGE-1); | ||
1207 | largepage = 1; | ||
1208 | } | ||
1209 | |||
1210 | pfn = gfn_to_pfn(vcpu->kvm, gfn); | ||
1040 | up_read(¤t->mm->mmap_sem); | 1211 | up_read(¤t->mm->mmap_sem); |
1041 | 1212 | ||
1213 | /* mmio */ | ||
1214 | if (is_error_pfn(pfn)) { | ||
1215 | kvm_release_pfn_clean(pfn); | ||
1216 | return 1; | ||
1217 | } | ||
1218 | |||
1042 | spin_lock(&vcpu->kvm->mmu_lock); | 1219 | spin_lock(&vcpu->kvm->mmu_lock); |
1043 | kvm_mmu_free_some_pages(vcpu); | 1220 | kvm_mmu_free_some_pages(vcpu); |
1044 | r = __nonpaging_map(vcpu, v, write, gfn, page); | 1221 | r = __direct_map(vcpu, v, write, largepage, gfn, pfn, |
1222 | PT32E_ROOT_LEVEL); | ||
1045 | spin_unlock(&vcpu->kvm->mmu_lock); | 1223 | spin_unlock(&vcpu->kvm->mmu_lock); |
1046 | 1224 | ||
1047 | up_read(&vcpu->kvm->slots_lock); | ||
1048 | 1225 | ||
1049 | return r; | 1226 | return r; |
1050 | } | 1227 | } |
@@ -1073,6 +1250,8 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu) | |||
1073 | 1250 | ||
1074 | sp = page_header(root); | 1251 | sp = page_header(root); |
1075 | --sp->root_count; | 1252 | --sp->root_count; |
1253 | if (!sp->root_count && sp->role.invalid) | ||
1254 | kvm_mmu_zap_page(vcpu->kvm, sp); | ||
1076 | vcpu->arch.mmu.root_hpa = INVALID_PAGE; | 1255 | vcpu->arch.mmu.root_hpa = INVALID_PAGE; |
1077 | spin_unlock(&vcpu->kvm->mmu_lock); | 1256 | spin_unlock(&vcpu->kvm->mmu_lock); |
1078 | return; | 1257 | return; |
@@ -1085,6 +1264,8 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu) | |||
1085 | root &= PT64_BASE_ADDR_MASK; | 1264 | root &= PT64_BASE_ADDR_MASK; |
1086 | sp = page_header(root); | 1265 | sp = page_header(root); |
1087 | --sp->root_count; | 1266 | --sp->root_count; |
1267 | if (!sp->root_count && sp->role.invalid) | ||
1268 | kvm_mmu_zap_page(vcpu->kvm, sp); | ||
1088 | } | 1269 | } |
1089 | vcpu->arch.mmu.pae_root[i] = INVALID_PAGE; | 1270 | vcpu->arch.mmu.pae_root[i] = INVALID_PAGE; |
1090 | } | 1271 | } |
@@ -1097,6 +1278,7 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu) | |||
1097 | int i; | 1278 | int i; |
1098 | gfn_t root_gfn; | 1279 | gfn_t root_gfn; |
1099 | struct kvm_mmu_page *sp; | 1280 | struct kvm_mmu_page *sp; |
1281 | int metaphysical = 0; | ||
1100 | 1282 | ||
1101 | root_gfn = vcpu->arch.cr3 >> PAGE_SHIFT; | 1283 | root_gfn = vcpu->arch.cr3 >> PAGE_SHIFT; |
1102 | 1284 | ||
@@ -1105,14 +1287,20 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu) | |||
1105 | hpa_t root = vcpu->arch.mmu.root_hpa; | 1287 | hpa_t root = vcpu->arch.mmu.root_hpa; |
1106 | 1288 | ||
1107 | ASSERT(!VALID_PAGE(root)); | 1289 | ASSERT(!VALID_PAGE(root)); |
1290 | if (tdp_enabled) | ||
1291 | metaphysical = 1; | ||
1108 | sp = kvm_mmu_get_page(vcpu, root_gfn, 0, | 1292 | sp = kvm_mmu_get_page(vcpu, root_gfn, 0, |
1109 | PT64_ROOT_LEVEL, 0, ACC_ALL, NULL); | 1293 | PT64_ROOT_LEVEL, metaphysical, |
1294 | ACC_ALL, NULL); | ||
1110 | root = __pa(sp->spt); | 1295 | root = __pa(sp->spt); |
1111 | ++sp->root_count; | 1296 | ++sp->root_count; |
1112 | vcpu->arch.mmu.root_hpa = root; | 1297 | vcpu->arch.mmu.root_hpa = root; |
1113 | return; | 1298 | return; |
1114 | } | 1299 | } |
1115 | #endif | 1300 | #endif |
1301 | metaphysical = !is_paging(vcpu); | ||
1302 | if (tdp_enabled) | ||
1303 | metaphysical = 1; | ||
1116 | for (i = 0; i < 4; ++i) { | 1304 | for (i = 0; i < 4; ++i) { |
1117 | hpa_t root = vcpu->arch.mmu.pae_root[i]; | 1305 | hpa_t root = vcpu->arch.mmu.pae_root[i]; |
1118 | 1306 | ||
@@ -1126,7 +1314,7 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu) | |||
1126 | } else if (vcpu->arch.mmu.root_level == 0) | 1314 | } else if (vcpu->arch.mmu.root_level == 0) |
1127 | root_gfn = 0; | 1315 | root_gfn = 0; |
1128 | sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, | 1316 | sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, |
1129 | PT32_ROOT_LEVEL, !is_paging(vcpu), | 1317 | PT32_ROOT_LEVEL, metaphysical, |
1130 | ACC_ALL, NULL); | 1318 | ACC_ALL, NULL); |
1131 | root = __pa(sp->spt); | 1319 | root = __pa(sp->spt); |
1132 | ++sp->root_count; | 1320 | ++sp->root_count; |
@@ -1146,7 +1334,7 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, | |||
1146 | gfn_t gfn; | 1334 | gfn_t gfn; |
1147 | int r; | 1335 | int r; |
1148 | 1336 | ||
1149 | pgprintk("%s: gva %lx error %x\n", __FUNCTION__, gva, error_code); | 1337 | pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code); |
1150 | r = mmu_topup_memory_caches(vcpu); | 1338 | r = mmu_topup_memory_caches(vcpu); |
1151 | if (r) | 1339 | if (r) |
1152 | return r; | 1340 | return r; |
@@ -1160,6 +1348,41 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, | |||
1160 | error_code & PFERR_WRITE_MASK, gfn); | 1348 | error_code & PFERR_WRITE_MASK, gfn); |
1161 | } | 1349 | } |
1162 | 1350 | ||
1351 | static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, | ||
1352 | u32 error_code) | ||
1353 | { | ||
1354 | pfn_t pfn; | ||
1355 | int r; | ||
1356 | int largepage = 0; | ||
1357 | gfn_t gfn = gpa >> PAGE_SHIFT; | ||
1358 | |||
1359 | ASSERT(vcpu); | ||
1360 | ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa)); | ||
1361 | |||
1362 | r = mmu_topup_memory_caches(vcpu); | ||
1363 | if (r) | ||
1364 | return r; | ||
1365 | |||
1366 | down_read(¤t->mm->mmap_sem); | ||
1367 | if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) { | ||
1368 | gfn &= ~(KVM_PAGES_PER_HPAGE-1); | ||
1369 | largepage = 1; | ||
1370 | } | ||
1371 | pfn = gfn_to_pfn(vcpu->kvm, gfn); | ||
1372 | up_read(¤t->mm->mmap_sem); | ||
1373 | if (is_error_pfn(pfn)) { | ||
1374 | kvm_release_pfn_clean(pfn); | ||
1375 | return 1; | ||
1376 | } | ||
1377 | spin_lock(&vcpu->kvm->mmu_lock); | ||
1378 | kvm_mmu_free_some_pages(vcpu); | ||
1379 | r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK, | ||
1380 | largepage, gfn, pfn, TDP_ROOT_LEVEL); | ||
1381 | spin_unlock(&vcpu->kvm->mmu_lock); | ||
1382 | |||
1383 | return r; | ||
1384 | } | ||
1385 | |||
1163 | static void nonpaging_free(struct kvm_vcpu *vcpu) | 1386 | static void nonpaging_free(struct kvm_vcpu *vcpu) |
1164 | { | 1387 | { |
1165 | mmu_free_roots(vcpu); | 1388 | mmu_free_roots(vcpu); |
@@ -1188,7 +1411,7 @@ void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu) | |||
1188 | 1411 | ||
1189 | static void paging_new_cr3(struct kvm_vcpu *vcpu) | 1412 | static void paging_new_cr3(struct kvm_vcpu *vcpu) |
1190 | { | 1413 | { |
1191 | pgprintk("%s: cr3 %lx\n", __FUNCTION__, vcpu->arch.cr3); | 1414 | pgprintk("%s: cr3 %lx\n", __func__, vcpu->arch.cr3); |
1192 | mmu_free_roots(vcpu); | 1415 | mmu_free_roots(vcpu); |
1193 | } | 1416 | } |
1194 | 1417 | ||
@@ -1253,7 +1476,35 @@ static int paging32E_init_context(struct kvm_vcpu *vcpu) | |||
1253 | return paging64_init_context_common(vcpu, PT32E_ROOT_LEVEL); | 1476 | return paging64_init_context_common(vcpu, PT32E_ROOT_LEVEL); |
1254 | } | 1477 | } |
1255 | 1478 | ||
1256 | static int init_kvm_mmu(struct kvm_vcpu *vcpu) | 1479 | static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) |
1480 | { | ||
1481 | struct kvm_mmu *context = &vcpu->arch.mmu; | ||
1482 | |||
1483 | context->new_cr3 = nonpaging_new_cr3; | ||
1484 | context->page_fault = tdp_page_fault; | ||
1485 | context->free = nonpaging_free; | ||
1486 | context->prefetch_page = nonpaging_prefetch_page; | ||
1487 | context->shadow_root_level = TDP_ROOT_LEVEL; | ||
1488 | context->root_hpa = INVALID_PAGE; | ||
1489 | |||
1490 | if (!is_paging(vcpu)) { | ||
1491 | context->gva_to_gpa = nonpaging_gva_to_gpa; | ||
1492 | context->root_level = 0; | ||
1493 | } else if (is_long_mode(vcpu)) { | ||
1494 | context->gva_to_gpa = paging64_gva_to_gpa; | ||
1495 | context->root_level = PT64_ROOT_LEVEL; | ||
1496 | } else if (is_pae(vcpu)) { | ||
1497 | context->gva_to_gpa = paging64_gva_to_gpa; | ||
1498 | context->root_level = PT32E_ROOT_LEVEL; | ||
1499 | } else { | ||
1500 | context->gva_to_gpa = paging32_gva_to_gpa; | ||
1501 | context->root_level = PT32_ROOT_LEVEL; | ||
1502 | } | ||
1503 | |||
1504 | return 0; | ||
1505 | } | ||
1506 | |||
1507 | static int init_kvm_softmmu(struct kvm_vcpu *vcpu) | ||
1257 | { | 1508 | { |
1258 | ASSERT(vcpu); | 1509 | ASSERT(vcpu); |
1259 | ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); | 1510 | ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); |
@@ -1268,6 +1519,16 @@ static int init_kvm_mmu(struct kvm_vcpu *vcpu) | |||
1268 | return paging32_init_context(vcpu); | 1519 | return paging32_init_context(vcpu); |
1269 | } | 1520 | } |
1270 | 1521 | ||
1522 | static int init_kvm_mmu(struct kvm_vcpu *vcpu) | ||
1523 | { | ||
1524 | vcpu->arch.update_pte.pfn = bad_pfn; | ||
1525 | |||
1526 | if (tdp_enabled) | ||
1527 | return init_kvm_tdp_mmu(vcpu); | ||
1528 | else | ||
1529 | return init_kvm_softmmu(vcpu); | ||
1530 | } | ||
1531 | |||
1271 | static void destroy_kvm_mmu(struct kvm_vcpu *vcpu) | 1532 | static void destroy_kvm_mmu(struct kvm_vcpu *vcpu) |
1272 | { | 1533 | { |
1273 | ASSERT(vcpu); | 1534 | ASSERT(vcpu); |
@@ -1316,7 +1577,8 @@ static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu, | |||
1316 | 1577 | ||
1317 | pte = *spte; | 1578 | pte = *spte; |
1318 | if (is_shadow_present_pte(pte)) { | 1579 | if (is_shadow_present_pte(pte)) { |
1319 | if (sp->role.level == PT_PAGE_TABLE_LEVEL) | 1580 | if (sp->role.level == PT_PAGE_TABLE_LEVEL || |
1581 | is_large_pte(pte)) | ||
1320 | rmap_remove(vcpu->kvm, spte); | 1582 | rmap_remove(vcpu->kvm, spte); |
1321 | else { | 1583 | else { |
1322 | child = page_header(pte & PT64_BASE_ADDR_MASK); | 1584 | child = page_header(pte & PT64_BASE_ADDR_MASK); |
@@ -1324,24 +1586,26 @@ static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu, | |||
1324 | } | 1586 | } |
1325 | } | 1587 | } |
1326 | set_shadow_pte(spte, shadow_trap_nonpresent_pte); | 1588 | set_shadow_pte(spte, shadow_trap_nonpresent_pte); |
1589 | if (is_large_pte(pte)) | ||
1590 | --vcpu->kvm->stat.lpages; | ||
1327 | } | 1591 | } |
1328 | 1592 | ||
1329 | static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu, | 1593 | static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu, |
1330 | struct kvm_mmu_page *sp, | 1594 | struct kvm_mmu_page *sp, |
1331 | u64 *spte, | 1595 | u64 *spte, |
1332 | const void *new, int bytes, | 1596 | const void *new) |
1333 | int offset_in_pte) | ||
1334 | { | 1597 | { |
1335 | if (sp->role.level != PT_PAGE_TABLE_LEVEL) { | 1598 | if ((sp->role.level != PT_PAGE_TABLE_LEVEL) |
1599 | && !vcpu->arch.update_pte.largepage) { | ||
1336 | ++vcpu->kvm->stat.mmu_pde_zapped; | 1600 | ++vcpu->kvm->stat.mmu_pde_zapped; |
1337 | return; | 1601 | return; |
1338 | } | 1602 | } |
1339 | 1603 | ||
1340 | ++vcpu->kvm->stat.mmu_pte_updated; | 1604 | ++vcpu->kvm->stat.mmu_pte_updated; |
1341 | if (sp->role.glevels == PT32_ROOT_LEVEL) | 1605 | if (sp->role.glevels == PT32_ROOT_LEVEL) |
1342 | paging32_update_pte(vcpu, sp, spte, new, bytes, offset_in_pte); | 1606 | paging32_update_pte(vcpu, sp, spte, new); |
1343 | else | 1607 | else |
1344 | paging64_update_pte(vcpu, sp, spte, new, bytes, offset_in_pte); | 1608 | paging64_update_pte(vcpu, sp, spte, new); |
1345 | } | 1609 | } |
1346 | 1610 | ||
1347 | static bool need_remote_flush(u64 old, u64 new) | 1611 | static bool need_remote_flush(u64 old, u64 new) |
@@ -1378,7 +1642,9 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, | |||
1378 | gfn_t gfn; | 1642 | gfn_t gfn; |
1379 | int r; | 1643 | int r; |
1380 | u64 gpte = 0; | 1644 | u64 gpte = 0; |
1381 | struct page *page; | 1645 | pfn_t pfn; |
1646 | |||
1647 | vcpu->arch.update_pte.largepage = 0; | ||
1382 | 1648 | ||
1383 | if (bytes != 4 && bytes != 8) | 1649 | if (bytes != 4 && bytes != 8) |
1384 | return; | 1650 | return; |
@@ -1408,11 +1674,19 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, | |||
1408 | gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; | 1674 | gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; |
1409 | 1675 | ||
1410 | down_read(&current->mm->mmap_sem); | 1676 | down_read(&current->mm->mmap_sem); |
1411 | page = gfn_to_page(vcpu->kvm, gfn); | 1677 | if (is_large_pte(gpte) && is_largepage_backed(vcpu, gfn)) { |
1678 | gfn &= ~(KVM_PAGES_PER_HPAGE-1); | ||
1679 | vcpu->arch.update_pte.largepage = 1; | ||
1680 | } | ||
1681 | pfn = gfn_to_pfn(vcpu->kvm, gfn); | ||
1412 | up_read(&current->mm->mmap_sem); | 1682 | up_read(&current->mm->mmap_sem); |
1413 | 1683 | ||
1684 | if (is_error_pfn(pfn)) { | ||
1685 | kvm_release_pfn_clean(pfn); | ||
1686 | return; | ||
1687 | } | ||
1414 | vcpu->arch.update_pte.gfn = gfn; | 1688 | vcpu->arch.update_pte.gfn = gfn; |
1415 | vcpu->arch.update_pte.page = page; | 1689 | vcpu->arch.update_pte.pfn = pfn; |
1416 | } | 1690 | } |
1417 | 1691 | ||
1418 | void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, | 1692 | void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, |
@@ -1423,7 +1697,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, | |||
1423 | struct hlist_node *node, *n; | 1697 | struct hlist_node *node, *n; |
1424 | struct hlist_head *bucket; | 1698 | struct hlist_head *bucket; |
1425 | unsigned index; | 1699 | unsigned index; |
1426 | u64 entry; | 1700 | u64 entry, gentry; |
1427 | u64 *spte; | 1701 | u64 *spte; |
1428 | unsigned offset = offset_in_page(gpa); | 1702 | unsigned offset = offset_in_page(gpa); |
1429 | unsigned pte_size; | 1703 | unsigned pte_size; |
@@ -1433,8 +1707,9 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, | |||
1433 | int level; | 1707 | int level; |
1434 | int flooded = 0; | 1708 | int flooded = 0; |
1435 | int npte; | 1709 | int npte; |
1710 | int r; | ||
1436 | 1711 | ||
1437 | pgprintk("%s: gpa %llx bytes %d\n", __FUNCTION__, gpa, bytes); | 1712 | pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes); |
1438 | mmu_guess_page_from_pte_write(vcpu, gpa, new, bytes); | 1713 | mmu_guess_page_from_pte_write(vcpu, gpa, new, bytes); |
1439 | spin_lock(&vcpu->kvm->mmu_lock); | 1714 | spin_lock(&vcpu->kvm->mmu_lock); |
1440 | kvm_mmu_free_some_pages(vcpu); | 1715 | kvm_mmu_free_some_pages(vcpu); |
@@ -1450,7 +1725,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, | |||
1450 | vcpu->arch.last_pt_write_count = 1; | 1725 | vcpu->arch.last_pt_write_count = 1; |
1451 | vcpu->arch.last_pte_updated = NULL; | 1726 | vcpu->arch.last_pte_updated = NULL; |
1452 | } | 1727 | } |
1453 | index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES; | 1728 | index = kvm_page_table_hashfn(gfn); |
1454 | bucket = &vcpu->kvm->arch.mmu_page_hash[index]; | 1729 | bucket = &vcpu->kvm->arch.mmu_page_hash[index]; |
1455 | hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) { | 1730 | hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) { |
1456 | if (sp->gfn != gfn || sp->role.metaphysical) | 1731 | if (sp->gfn != gfn || sp->role.metaphysical) |
@@ -1496,20 +1771,29 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, | |||
1496 | continue; | 1771 | continue; |
1497 | } | 1772 | } |
1498 | spte = &sp->spt[page_offset / sizeof(*spte)]; | 1773 | spte = &sp->spt[page_offset / sizeof(*spte)]; |
1774 | if ((gpa & (pte_size - 1)) || (bytes < pte_size)) { | ||
1775 | gentry = 0; | ||
1776 | r = kvm_read_guest_atomic(vcpu->kvm, | ||
1777 | gpa & ~(u64)(pte_size - 1), | ||
1778 | &gentry, pte_size); | ||
1779 | new = (const void *)&gentry; | ||
1780 | if (r < 0) | ||
1781 | new = NULL; | ||
1782 | } | ||
1499 | while (npte--) { | 1783 | while (npte--) { |
1500 | entry = *spte; | 1784 | entry = *spte; |
1501 | mmu_pte_write_zap_pte(vcpu, sp, spte); | 1785 | mmu_pte_write_zap_pte(vcpu, sp, spte); |
1502 | mmu_pte_write_new_pte(vcpu, sp, spte, new, bytes, | 1786 | if (new) |
1503 | page_offset & (pte_size - 1)); | 1787 | mmu_pte_write_new_pte(vcpu, sp, spte, new); |
1504 | mmu_pte_write_flush_tlb(vcpu, entry, *spte); | 1788 | mmu_pte_write_flush_tlb(vcpu, entry, *spte); |
1505 | ++spte; | 1789 | ++spte; |
1506 | } | 1790 | } |
1507 | } | 1791 | } |
1508 | kvm_mmu_audit(vcpu, "post pte write"); | 1792 | kvm_mmu_audit(vcpu, "post pte write"); |
1509 | spin_unlock(&vcpu->kvm->mmu_lock); | 1793 | spin_unlock(&vcpu->kvm->mmu_lock); |
1510 | if (vcpu->arch.update_pte.page) { | 1794 | if (!is_error_pfn(vcpu->arch.update_pte.pfn)) { |
1511 | kvm_release_page_clean(vcpu->arch.update_pte.page); | 1795 | kvm_release_pfn_clean(vcpu->arch.update_pte.pfn); |
1512 | vcpu->arch.update_pte.page = NULL; | 1796 | vcpu->arch.update_pte.pfn = bad_pfn; |
1513 | } | 1797 | } |
1514 | } | 1798 | } |
1515 | 1799 | ||
@@ -1518,9 +1802,7 @@ int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva) | |||
1518 | gpa_t gpa; | 1802 | gpa_t gpa; |
1519 | int r; | 1803 | int r; |
1520 | 1804 | ||
1521 | down_read(&vcpu->kvm->slots_lock); | ||
1522 | gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva); | 1805 | gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva); |
1523 | up_read(&vcpu->kvm->slots_lock); | ||
1524 | 1806 | ||
1525 | spin_lock(&vcpu->kvm->mmu_lock); | 1807 | spin_lock(&vcpu->kvm->mmu_lock); |
1526 | r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT); | 1808 | r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT); |
@@ -1577,6 +1859,12 @@ out: | |||
1577 | } | 1859 | } |
1578 | EXPORT_SYMBOL_GPL(kvm_mmu_page_fault); | 1860 | EXPORT_SYMBOL_GPL(kvm_mmu_page_fault); |
1579 | 1861 | ||
1862 | void kvm_enable_tdp(void) | ||
1863 | { | ||
1864 | tdp_enabled = true; | ||
1865 | } | ||
1866 | EXPORT_SYMBOL_GPL(kvm_enable_tdp); | ||
1867 | |||
1580 | static void free_mmu_pages(struct kvm_vcpu *vcpu) | 1868 | static void free_mmu_pages(struct kvm_vcpu *vcpu) |
1581 | { | 1869 | { |
1582 | struct kvm_mmu_page *sp; | 1870 | struct kvm_mmu_page *sp; |
@@ -1677,7 +1965,53 @@ void kvm_mmu_zap_all(struct kvm *kvm) | |||
1677 | kvm_flush_remote_tlbs(kvm); | 1965 | kvm_flush_remote_tlbs(kvm); |
1678 | } | 1966 | } |
1679 | 1967 | ||
1680 | void kvm_mmu_module_exit(void) | 1968 | void kvm_mmu_remove_one_alloc_mmu_page(struct kvm *kvm) |
1969 | { | ||
1970 | struct kvm_mmu_page *page; | ||
1971 | |||
1972 | page = container_of(kvm->arch.active_mmu_pages.prev, | ||
1973 | struct kvm_mmu_page, link); | ||
1974 | kvm_mmu_zap_page(kvm, page); | ||
1975 | } | ||
1976 | |||
1977 | static int mmu_shrink(int nr_to_scan, gfp_t gfp_mask) | ||
1978 | { | ||
1979 | struct kvm *kvm; | ||
1980 | struct kvm *kvm_freed = NULL; | ||
1981 | int cache_count = 0; | ||
1982 | |||
1983 | spin_lock(&kvm_lock); | ||
1984 | |||
1985 | list_for_each_entry(kvm, &vm_list, vm_list) { | ||
1986 | int npages; | ||
1987 | |||
1988 | spin_lock(&kvm->mmu_lock); | ||
1989 | npages = kvm->arch.n_alloc_mmu_pages - | ||
1990 | kvm->arch.n_free_mmu_pages; | ||
1991 | cache_count += npages; | ||
1992 | if (!kvm_freed && nr_to_scan > 0 && npages > 0) { | ||
1993 | kvm_mmu_remove_one_alloc_mmu_page(kvm); | ||
1994 | cache_count--; | ||
1995 | kvm_freed = kvm; | ||
1996 | } | ||
1997 | nr_to_scan--; | ||
1998 | |||
1999 | spin_unlock(&kvm->mmu_lock); | ||
2000 | } | ||
2001 | if (kvm_freed) | ||
2002 | list_move_tail(&kvm_freed->vm_list, &vm_list); | ||
2003 | |||
2004 | spin_unlock(&kvm_lock); | ||
2005 | |||
2006 | return cache_count; | ||
2007 | } | ||
2008 | |||
2009 | static struct shrinker mmu_shrinker = { | ||
2010 | .shrink = mmu_shrink, | ||
2011 | .seeks = DEFAULT_SEEKS * 10, | ||
2012 | }; | ||
2013 | |||
2014 | void mmu_destroy_caches(void) | ||
1681 | { | 2015 | { |
1682 | if (pte_chain_cache) | 2016 | if (pte_chain_cache) |
1683 | kmem_cache_destroy(pte_chain_cache); | 2017 | kmem_cache_destroy(pte_chain_cache); |
@@ -1687,6 +2021,12 @@ void kvm_mmu_module_exit(void) | |||
1687 | kmem_cache_destroy(mmu_page_header_cache); | 2021 | kmem_cache_destroy(mmu_page_header_cache); |
1688 | } | 2022 | } |
1689 | 2023 | ||
2024 | void kvm_mmu_module_exit(void) | ||
2025 | { | ||
2026 | mmu_destroy_caches(); | ||
2027 | unregister_shrinker(&mmu_shrinker); | ||
2028 | } | ||
2029 | |||
1690 | int kvm_mmu_module_init(void) | 2030 | int kvm_mmu_module_init(void) |
1691 | { | 2031 | { |
1692 | pte_chain_cache = kmem_cache_create("kvm_pte_chain", | 2032 | pte_chain_cache = kmem_cache_create("kvm_pte_chain", |
@@ -1706,10 +2046,12 @@ int kvm_mmu_module_init(void) | |||
1706 | if (!mmu_page_header_cache) | 2046 | if (!mmu_page_header_cache) |
1707 | goto nomem; | 2047 | goto nomem; |
1708 | 2048 | ||
2049 | register_shrinker(&mmu_shrinker); | ||
2050 | |||
1709 | return 0; | 2051 | return 0; |
1710 | 2052 | ||
1711 | nomem: | 2053 | nomem: |
1712 | kvm_mmu_module_exit(); | 2054 | mmu_destroy_caches(); |
1713 | return -ENOMEM; | 2055 | return -ENOMEM; |
1714 | } | 2056 | } |
1715 | 2057 | ||
@@ -1732,6 +2074,127 @@ unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm) | |||
1732 | return nr_mmu_pages; | 2074 | return nr_mmu_pages; |
1733 | } | 2075 | } |
1734 | 2076 | ||
2077 | static void *pv_mmu_peek_buffer(struct kvm_pv_mmu_op_buffer *buffer, | ||
2078 | unsigned len) | ||
2079 | { | ||
2080 | if (len > buffer->len) | ||
2081 | return NULL; | ||
2082 | return buffer->ptr; | ||
2083 | } | ||
2084 | |||
2085 | static void *pv_mmu_read_buffer(struct kvm_pv_mmu_op_buffer *buffer, | ||
2086 | unsigned len) | ||
2087 | { | ||
2088 | void *ret; | ||
2089 | |||
2090 | ret = pv_mmu_peek_buffer(buffer, len); | ||
2091 | if (!ret) | ||
2092 | return ret; | ||
2093 | buffer->ptr += len; | ||
2094 | buffer->len -= len; | ||
2095 | buffer->processed += len; | ||
2096 | return ret; | ||
2097 | } | ||
2098 | |||
2099 | static int kvm_pv_mmu_write(struct kvm_vcpu *vcpu, | ||
2100 | gpa_t addr, gpa_t value) | ||
2101 | { | ||
2102 | int bytes = 8; | ||
2103 | int r; | ||
2104 | |||
2105 | if (!is_long_mode(vcpu) && !is_pae(vcpu)) | ||
2106 | bytes = 4; | ||
2107 | |||
2108 | r = mmu_topup_memory_caches(vcpu); | ||
2109 | if (r) | ||
2110 | return r; | ||
2111 | |||
2112 | if (!emulator_write_phys(vcpu, addr, &value, bytes)) | ||
2113 | return -EFAULT; | ||
2114 | |||
2115 | return 1; | ||
2116 | } | ||
2117 | |||
2118 | static int kvm_pv_mmu_flush_tlb(struct kvm_vcpu *vcpu) | ||
2119 | { | ||
2120 | kvm_x86_ops->tlb_flush(vcpu); | ||
2121 | return 1; | ||
2122 | } | ||
2123 | |||
2124 | static int kvm_pv_mmu_release_pt(struct kvm_vcpu *vcpu, gpa_t addr) | ||
2125 | { | ||
2126 | spin_lock(&vcpu->kvm->mmu_lock); | ||
2127 | mmu_unshadow(vcpu->kvm, addr >> PAGE_SHIFT); | ||
2128 | spin_unlock(&vcpu->kvm->mmu_lock); | ||
2129 | return 1; | ||
2130 | } | ||
2131 | |||
2132 | static int kvm_pv_mmu_op_one(struct kvm_vcpu *vcpu, | ||
2133 | struct kvm_pv_mmu_op_buffer *buffer) | ||
2134 | { | ||
2135 | struct kvm_mmu_op_header *header; | ||
2136 | |||
2137 | header = pv_mmu_peek_buffer(buffer, sizeof *header); | ||
2138 | if (!header) | ||
2139 | return 0; | ||
2140 | switch (header->op) { | ||
2141 | case KVM_MMU_OP_WRITE_PTE: { | ||
2142 | struct kvm_mmu_op_write_pte *wpte; | ||
2143 | |||
2144 | wpte = pv_mmu_read_buffer(buffer, sizeof *wpte); | ||
2145 | if (!wpte) | ||
2146 | return 0; | ||
2147 | return kvm_pv_mmu_write(vcpu, wpte->pte_phys, | ||
2148 | wpte->pte_val); | ||
2149 | } | ||
2150 | case KVM_MMU_OP_FLUSH_TLB: { | ||
2151 | struct kvm_mmu_op_flush_tlb *ftlb; | ||
2152 | |||
2153 | ftlb = pv_mmu_read_buffer(buffer, sizeof *ftlb); | ||
2154 | if (!ftlb) | ||
2155 | return 0; | ||
2156 | return kvm_pv_mmu_flush_tlb(vcpu); | ||
2157 | } | ||
2158 | case KVM_MMU_OP_RELEASE_PT: { | ||
2159 | struct kvm_mmu_op_release_pt *rpt; | ||
2160 | |||
2161 | rpt = pv_mmu_read_buffer(buffer, sizeof *rpt); | ||
2162 | if (!rpt) | ||
2163 | return 0; | ||
2164 | return kvm_pv_mmu_release_pt(vcpu, rpt->pt_phys); | ||
2165 | } | ||
2166 | default: return 0; | ||
2167 | } | ||
2168 | } | ||
2169 | |||
2170 | int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes, | ||
2171 | gpa_t addr, unsigned long *ret) | ||
2172 | { | ||
2173 | int r; | ||
2174 | struct kvm_pv_mmu_op_buffer buffer; | ||
2175 | |||
2176 | buffer.ptr = buffer.buf; | ||
2177 | buffer.len = min_t(unsigned long, bytes, sizeof buffer.buf); | ||
2178 | buffer.processed = 0; | ||
2179 | |||
2180 | r = kvm_read_guest(vcpu->kvm, addr, buffer.buf, buffer.len); | ||
2181 | if (r) | ||
2182 | goto out; | ||
2183 | |||
2184 | while (buffer.len) { | ||
2185 | r = kvm_pv_mmu_op_one(vcpu, &buffer); | ||
2186 | if (r < 0) | ||
2187 | goto out; | ||
2188 | if (r == 0) | ||
2189 | break; | ||
2190 | } | ||
2191 | |||
2192 | r = 1; | ||
2193 | out: | ||
2194 | *ret = buffer.processed; | ||
2195 | return r; | ||
2196 | } | ||
2197 | |||
1735 | #ifdef AUDIT | 2198 | #ifdef AUDIT |
1736 | 2199 | ||
1737 | static const char *audit_msg; | 2200 | static const char *audit_msg; |
@@ -1768,8 +2231,7 @@ static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte, | |||
1768 | audit_mappings_page(vcpu, ent, va, level - 1); | 2231 | audit_mappings_page(vcpu, ent, va, level - 1); |
1769 | } else { | 2232 | } else { |
1770 | gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, va); | 2233 | gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, va); |
1771 | struct page *page = gpa_to_page(vcpu, gpa); | 2234 | hpa_t hpa = (hpa_t)gpa_to_pfn(vcpu, gpa) << PAGE_SHIFT; |
1772 | hpa_t hpa = page_to_phys(page); | ||
1773 | 2235 | ||
1774 | if (is_shadow_present_pte(ent) | 2236 | if (is_shadow_present_pte(ent) |
1775 | && (ent & PT64_BASE_ADDR_MASK) != hpa) | 2237 | && (ent & PT64_BASE_ADDR_MASK) != hpa) |
@@ -1782,7 +2244,7 @@ static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte, | |||
1782 | && !is_error_hpa(hpa)) | 2244 | && !is_error_hpa(hpa)) |
1783 | printk(KERN_ERR "audit: (%s) notrap shadow," | 2245 | printk(KERN_ERR "audit: (%s) notrap shadow," |
1784 | " valid guest gva %lx\n", audit_msg, va); | 2246 | " valid guest gva %lx\n", audit_msg, va); |
1785 | kvm_release_page_clean(page); | 2247 | kvm_release_pfn_clean(pfn); |
1786 | 2248 | ||
1787 | } | 2249 | } |
1788 | } | 2250 | } |
@@ -1867,7 +2329,7 @@ static void audit_rmap(struct kvm_vcpu *vcpu) | |||
1867 | 2329 | ||
1868 | if (n_rmap != n_actual) | 2330 | if (n_rmap != n_actual) |
1869 | printk(KERN_ERR "%s: (%s) rmap %d actual %d\n", | 2331 | printk(KERN_ERR "%s: (%s) rmap %d actual %d\n", |
1870 | __FUNCTION__, audit_msg, n_rmap, n_actual); | 2332 | __func__, audit_msg, n_rmap, n_actual); |
1871 | } | 2333 | } |
1872 | 2334 | ||
1873 | static void audit_write_protection(struct kvm_vcpu *vcpu) | 2335 | static void audit_write_protection(struct kvm_vcpu *vcpu) |
@@ -1887,7 +2349,7 @@ static void audit_write_protection(struct kvm_vcpu *vcpu) | |||
1887 | if (*rmapp) | 2349 | if (*rmapp) |
1888 | printk(KERN_ERR "%s: (%s) shadow page has writable" | 2350 | printk(KERN_ERR "%s: (%s) shadow page has writable" |
1889 | " mappings: gfn %lx role %x\n", | 2351 | " mappings: gfn %lx role %x\n", |
1890 | __FUNCTION__, audit_msg, sp->gfn, | 2352 | __func__, audit_msg, sp->gfn, |
1891 | sp->role.word); | 2353 | sp->role.word); |
1892 | } | 2354 | } |
1893 | } | 2355 | } |
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 1fce19ec7a23..e64e9f56a65e 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h | |||
@@ -3,6 +3,12 @@ | |||
3 | 3 | ||
4 | #include <linux/kvm_host.h> | 4 | #include <linux/kvm_host.h> |
5 | 5 | ||
6 | #ifdef CONFIG_X86_64 | ||
7 | #define TDP_ROOT_LEVEL PT64_ROOT_LEVEL | ||
8 | #else | ||
9 | #define TDP_ROOT_LEVEL PT32E_ROOT_LEVEL | ||
10 | #endif | ||
11 | |||
6 | static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) | 12 | static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) |
7 | { | 13 | { |
8 | if (unlikely(vcpu->kvm->arch.n_free_mmu_pages < KVM_MIN_FREE_MMU_PAGES)) | 14 | if (unlikely(vcpu->kvm->arch.n_free_mmu_pages < KVM_MIN_FREE_MMU_PAGES)) |
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index ecc0856268c4..156fe10288ae 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h | |||
@@ -130,7 +130,7 @@ static int FNAME(walk_addr)(struct guest_walker *walker, | |||
130 | unsigned index, pt_access, pte_access; | 130 | unsigned index, pt_access, pte_access; |
131 | gpa_t pte_gpa; | 131 | gpa_t pte_gpa; |
132 | 132 | ||
133 | pgprintk("%s: addr %lx\n", __FUNCTION__, addr); | 133 | pgprintk("%s: addr %lx\n", __func__, addr); |
134 | walk: | 134 | walk: |
135 | walker->level = vcpu->arch.mmu.root_level; | 135 | walker->level = vcpu->arch.mmu.root_level; |
136 | pte = vcpu->arch.cr3; | 136 | pte = vcpu->arch.cr3; |
@@ -155,7 +155,7 @@ walk: | |||
155 | pte_gpa += index * sizeof(pt_element_t); | 155 | pte_gpa += index * sizeof(pt_element_t); |
156 | walker->table_gfn[walker->level - 1] = table_gfn; | 156 | walker->table_gfn[walker->level - 1] = table_gfn; |
157 | walker->pte_gpa[walker->level - 1] = pte_gpa; | 157 | walker->pte_gpa[walker->level - 1] = pte_gpa; |
158 | pgprintk("%s: table_gfn[%d] %lx\n", __FUNCTION__, | 158 | pgprintk("%s: table_gfn[%d] %lx\n", __func__, |
159 | walker->level - 1, table_gfn); | 159 | walker->level - 1, table_gfn); |
160 | 160 | ||
161 | kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte)); | 161 | kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte)); |
@@ -222,7 +222,7 @@ walk: | |||
222 | walker->pt_access = pt_access; | 222 | walker->pt_access = pt_access; |
223 | walker->pte_access = pte_access; | 223 | walker->pte_access = pte_access; |
224 | pgprintk("%s: pte %llx pte_access %x pt_access %x\n", | 224 | pgprintk("%s: pte %llx pte_access %x pt_access %x\n", |
225 | __FUNCTION__, (u64)pte, pt_access, pte_access); | 225 | __func__, (u64)pte, pt_access, pte_access); |
226 | return 1; | 226 | return 1; |
227 | 227 | ||
228 | not_present: | 228 | not_present: |
@@ -243,31 +243,30 @@ err: | |||
243 | } | 243 | } |
244 | 244 | ||
245 | static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, | 245 | static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, |
246 | u64 *spte, const void *pte, int bytes, | 246 | u64 *spte, const void *pte) |
247 | int offset_in_pte) | ||
248 | { | 247 | { |
249 | pt_element_t gpte; | 248 | pt_element_t gpte; |
250 | unsigned pte_access; | 249 | unsigned pte_access; |
251 | struct page *npage; | 250 | pfn_t pfn; |
251 | int largepage = vcpu->arch.update_pte.largepage; | ||
252 | 252 | ||
253 | gpte = *(const pt_element_t *)pte; | 253 | gpte = *(const pt_element_t *)pte; |
254 | if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) { | 254 | if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) { |
255 | if (!offset_in_pte && !is_present_pte(gpte)) | 255 | if (!is_present_pte(gpte)) |
256 | set_shadow_pte(spte, shadow_notrap_nonpresent_pte); | 256 | set_shadow_pte(spte, shadow_notrap_nonpresent_pte); |
257 | return; | 257 | return; |
258 | } | 258 | } |
259 | if (bytes < sizeof(pt_element_t)) | 259 | pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte); |
260 | return; | ||
261 | pgprintk("%s: gpte %llx spte %p\n", __FUNCTION__, (u64)gpte, spte); | ||
262 | pte_access = page->role.access & FNAME(gpte_access)(vcpu, gpte); | 260 | pte_access = page->role.access & FNAME(gpte_access)(vcpu, gpte); |
263 | if (gpte_to_gfn(gpte) != vcpu->arch.update_pte.gfn) | 261 | if (gpte_to_gfn(gpte) != vcpu->arch.update_pte.gfn) |
264 | return; | 262 | return; |
265 | npage = vcpu->arch.update_pte.page; | 263 | pfn = vcpu->arch.update_pte.pfn; |
266 | if (!npage) | 264 | if (is_error_pfn(pfn)) |
267 | return; | 265 | return; |
268 | get_page(npage); | 266 | kvm_get_pfn(pfn); |
269 | mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0, | 267 | mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0, |
270 | gpte & PT_DIRTY_MASK, NULL, gpte_to_gfn(gpte), npage); | 268 | gpte & PT_DIRTY_MASK, NULL, largepage, gpte_to_gfn(gpte), |
269 | pfn, true); | ||
271 | } | 270 | } |
272 | 271 | ||
273 | /* | 272 | /* |
@@ -275,8 +274,8 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, | |||
275 | */ | 274 | */ |
276 | static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, | 275 | static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, |
277 | struct guest_walker *walker, | 276 | struct guest_walker *walker, |
278 | int user_fault, int write_fault, int *ptwrite, | 277 | int user_fault, int write_fault, int largepage, |
279 | struct page *page) | 278 | int *ptwrite, pfn_t pfn) |
280 | { | 279 | { |
281 | hpa_t shadow_addr; | 280 | hpa_t shadow_addr; |
282 | int level; | 281 | int level; |
@@ -304,11 +303,19 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, | |||
304 | shadow_ent = ((u64 *)__va(shadow_addr)) + index; | 303 | shadow_ent = ((u64 *)__va(shadow_addr)) + index; |
305 | if (level == PT_PAGE_TABLE_LEVEL) | 304 | if (level == PT_PAGE_TABLE_LEVEL) |
306 | break; | 305 | break; |
307 | if (is_shadow_present_pte(*shadow_ent)) { | 306 | |
307 | if (largepage && level == PT_DIRECTORY_LEVEL) | ||
308 | break; | ||
309 | |||
310 | if (is_shadow_present_pte(*shadow_ent) | ||
311 | && !is_large_pte(*shadow_ent)) { | ||
308 | shadow_addr = *shadow_ent & PT64_BASE_ADDR_MASK; | 312 | shadow_addr = *shadow_ent & PT64_BASE_ADDR_MASK; |
309 | continue; | 313 | continue; |
310 | } | 314 | } |
311 | 315 | ||
316 | if (is_large_pte(*shadow_ent)) | ||
317 | rmap_remove(vcpu->kvm, shadow_ent); | ||
318 | |||
312 | if (level - 1 == PT_PAGE_TABLE_LEVEL | 319 | if (level - 1 == PT_PAGE_TABLE_LEVEL |
313 | && walker->level == PT_DIRECTORY_LEVEL) { | 320 | && walker->level == PT_DIRECTORY_LEVEL) { |
314 | metaphysical = 1; | 321 | metaphysical = 1; |
@@ -329,7 +336,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, | |||
329 | walker->pte_gpa[level - 2], | 336 | walker->pte_gpa[level - 2], |
330 | &curr_pte, sizeof(curr_pte)); | 337 | &curr_pte, sizeof(curr_pte)); |
331 | if (r || curr_pte != walker->ptes[level - 2]) { | 338 | if (r || curr_pte != walker->ptes[level - 2]) { |
332 | kvm_release_page_clean(page); | 339 | kvm_release_pfn_clean(pfn); |
333 | return NULL; | 340 | return NULL; |
334 | } | 341 | } |
335 | } | 342 | } |
@@ -342,7 +349,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, | |||
342 | mmu_set_spte(vcpu, shadow_ent, access, walker->pte_access & access, | 349 | mmu_set_spte(vcpu, shadow_ent, access, walker->pte_access & access, |
343 | user_fault, write_fault, | 350 | user_fault, write_fault, |
344 | walker->ptes[walker->level-1] & PT_DIRTY_MASK, | 351 | walker->ptes[walker->level-1] & PT_DIRTY_MASK, |
345 | ptwrite, walker->gfn, page); | 352 | ptwrite, largepage, walker->gfn, pfn, false); |
346 | 353 | ||
347 | return shadow_ent; | 354 | return shadow_ent; |
348 | } | 355 | } |
@@ -371,16 +378,16 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, | |||
371 | u64 *shadow_pte; | 378 | u64 *shadow_pte; |
372 | int write_pt = 0; | 379 | int write_pt = 0; |
373 | int r; | 380 | int r; |
374 | struct page *page; | 381 | pfn_t pfn; |
382 | int largepage = 0; | ||
375 | 383 | ||
376 | pgprintk("%s: addr %lx err %x\n", __FUNCTION__, addr, error_code); | 384 | pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code); |
377 | kvm_mmu_audit(vcpu, "pre page fault"); | 385 | kvm_mmu_audit(vcpu, "pre page fault"); |
378 | 386 | ||
379 | r = mmu_topup_memory_caches(vcpu); | 387 | r = mmu_topup_memory_caches(vcpu); |
380 | if (r) | 388 | if (r) |
381 | return r; | 389 | return r; |
382 | 390 | ||
383 | down_read(&vcpu->kvm->slots_lock); | ||
384 | /* | 391 | /* |
385 | * Look up the shadow pte for the faulting address. | 392 | * Look up the shadow pte for the faulting address. |
386 | */ | 393 | */ |
@@ -391,40 +398,45 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, | |||
391 | * The page is not mapped by the guest. Let the guest handle it. | 398 | * The page is not mapped by the guest. Let the guest handle it. |
392 | */ | 399 | */ |
393 | if (!r) { | 400 | if (!r) { |
394 | pgprintk("%s: guest page fault\n", __FUNCTION__); | 401 | pgprintk("%s: guest page fault\n", __func__); |
395 | inject_page_fault(vcpu, addr, walker.error_code); | 402 | inject_page_fault(vcpu, addr, walker.error_code); |
396 | vcpu->arch.last_pt_write_count = 0; /* reset fork detector */ | 403 | vcpu->arch.last_pt_write_count = 0; /* reset fork detector */ |
397 | up_read(&vcpu->kvm->slots_lock); | ||
398 | return 0; | 404 | return 0; |
399 | } | 405 | } |
400 | 406 | ||
401 | down_read(&current->mm->mmap_sem); | 407 | down_read(&current->mm->mmap_sem); |
402 | page = gfn_to_page(vcpu->kvm, walker.gfn); | 408 | if (walker.level == PT_DIRECTORY_LEVEL) { |
409 | gfn_t large_gfn; | ||
410 | large_gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE-1); | ||
411 | if (is_largepage_backed(vcpu, large_gfn)) { | ||
412 | walker.gfn = large_gfn; | ||
413 | largepage = 1; | ||
414 | } | ||
415 | } | ||
416 | pfn = gfn_to_pfn(vcpu->kvm, walker.gfn); | ||
403 | up_read(&current->mm->mmap_sem); | 417 | up_read(&current->mm->mmap_sem); |
404 | 418 | ||
419 | /* mmio */ | ||
420 | if (is_error_pfn(pfn)) { | ||
421 | pgprintk("gfn %x is mmio\n", walker.gfn); | ||
422 | kvm_release_pfn_clean(pfn); | ||
423 | return 1; | ||
424 | } | ||
425 | |||
405 | spin_lock(&vcpu->kvm->mmu_lock); | 426 | spin_lock(&vcpu->kvm->mmu_lock); |
406 | kvm_mmu_free_some_pages(vcpu); | 427 | kvm_mmu_free_some_pages(vcpu); |
407 | shadow_pte = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault, | 428 | shadow_pte = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault, |
408 | &write_pt, page); | 429 | largepage, &write_pt, pfn); |
409 | pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __FUNCTION__, | 430 | |
431 | pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __func__, | ||
410 | shadow_pte, *shadow_pte, write_pt); | 432 | shadow_pte, *shadow_pte, write_pt); |
411 | 433 | ||
412 | if (!write_pt) | 434 | if (!write_pt) |
413 | vcpu->arch.last_pt_write_count = 0; /* reset fork detector */ | 435 | vcpu->arch.last_pt_write_count = 0; /* reset fork detector */ |
414 | 436 | ||
415 | /* | ||
416 | * mmio: emulate if accessible, otherwise its a guest fault. | ||
417 | */ | ||
418 | if (shadow_pte && is_io_pte(*shadow_pte)) { | ||
419 | spin_unlock(&vcpu->kvm->mmu_lock); | ||
420 | up_read(&vcpu->kvm->slots_lock); | ||
421 | return 1; | ||
422 | } | ||
423 | |||
424 | ++vcpu->stat.pf_fixed; | 437 | ++vcpu->stat.pf_fixed; |
425 | kvm_mmu_audit(vcpu, "post page fault (fixed)"); | 438 | kvm_mmu_audit(vcpu, "post page fault (fixed)"); |
426 | spin_unlock(&vcpu->kvm->mmu_lock); | 439 | spin_unlock(&vcpu->kvm->mmu_lock); |
427 | up_read(&vcpu->kvm->slots_lock); | ||
428 | 440 | ||
429 | return write_pt; | 441 | return write_pt; |
430 | } | 442 | } |
diff --git a/arch/x86/kvm/segment_descriptor.h b/arch/x86/kvm/segment_descriptor.h deleted file mode 100644 index 56fc4c873389..000000000000 --- a/arch/x86/kvm/segment_descriptor.h +++ /dev/null | |||
@@ -1,29 +0,0 @@ | |||
1 | #ifndef __SEGMENT_DESCRIPTOR_H | ||
2 | #define __SEGMENT_DESCRIPTOR_H | ||
3 | |||
4 | struct segment_descriptor { | ||
5 | u16 limit_low; | ||
6 | u16 base_low; | ||
7 | u8 base_mid; | ||
8 | u8 type : 4; | ||
9 | u8 system : 1; | ||
10 | u8 dpl : 2; | ||
11 | u8 present : 1; | ||
12 | u8 limit_high : 4; | ||
13 | u8 avl : 1; | ||
14 | u8 long_mode : 1; | ||
15 | u8 default_op : 1; | ||
16 | u8 granularity : 1; | ||
17 | u8 base_high; | ||
18 | } __attribute__((packed)); | ||
19 | |||
20 | #ifdef CONFIG_X86_64 | ||
21 | /* LDT or TSS descriptor in the GDT. 16 bytes. */ | ||
22 | struct segment_descriptor_64 { | ||
23 | struct segment_descriptor s; | ||
24 | u32 base_higher; | ||
25 | u32 pad_zero; | ||
26 | }; | ||
27 | |||
28 | #endif | ||
29 | #endif | ||
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 1a582f1090e8..89e0be2c10d0 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c | |||
@@ -47,6 +47,18 @@ MODULE_LICENSE("GPL"); | |||
47 | #define SVM_FEATURE_LBRV (1 << 1) | 47 | #define SVM_FEATURE_LBRV (1 << 1) |
48 | #define SVM_DEATURE_SVML (1 << 2) | 48 | #define SVM_DEATURE_SVML (1 << 2) |
49 | 49 | ||
50 | #define DEBUGCTL_RESERVED_BITS (~(0x3fULL)) | ||
51 | |||
52 | /* enable NPT for AMD64 and X86 with PAE */ | ||
53 | #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) | ||
54 | static bool npt_enabled = true; | ||
55 | #else | ||
56 | static bool npt_enabled = false; | ||
57 | #endif | ||
58 | static int npt = 1; | ||
59 | |||
60 | module_param(npt, int, S_IRUGO); | ||
61 | |||
50 | static void kvm_reput_irq(struct vcpu_svm *svm); | 62 | static void kvm_reput_irq(struct vcpu_svm *svm); |
51 | 63 | ||
52 | static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu) | 64 | static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu) |
@@ -54,8 +66,7 @@ static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu) | |||
54 | return container_of(vcpu, struct vcpu_svm, vcpu); | 66 | return container_of(vcpu, struct vcpu_svm, vcpu); |
55 | } | 67 | } |
56 | 68 | ||
57 | unsigned long iopm_base; | 69 | static unsigned long iopm_base; |
58 | unsigned long msrpm_base; | ||
59 | 70 | ||
60 | struct kvm_ldttss_desc { | 71 | struct kvm_ldttss_desc { |
61 | u16 limit0; | 72 | u16 limit0; |
@@ -182,7 +193,7 @@ static inline void flush_guest_tlb(struct kvm_vcpu *vcpu) | |||
182 | 193 | ||
183 | static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer) | 194 | static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer) |
184 | { | 195 | { |
185 | if (!(efer & EFER_LMA)) | 196 | if (!npt_enabled && !(efer & EFER_LMA)) |
186 | efer &= ~EFER_LME; | 197 | efer &= ~EFER_LME; |
187 | 198 | ||
188 | to_svm(vcpu)->vmcb->save.efer = efer | MSR_EFER_SVME_MASK; | 199 | to_svm(vcpu)->vmcb->save.efer = efer | MSR_EFER_SVME_MASK; |
@@ -219,12 +230,12 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu) | |||
219 | struct vcpu_svm *svm = to_svm(vcpu); | 230 | struct vcpu_svm *svm = to_svm(vcpu); |
220 | 231 | ||
221 | if (!svm->next_rip) { | 232 | if (!svm->next_rip) { |
222 | printk(KERN_DEBUG "%s: NOP\n", __FUNCTION__); | 233 | printk(KERN_DEBUG "%s: NOP\n", __func__); |
223 | return; | 234 | return; |
224 | } | 235 | } |
225 | if (svm->next_rip - svm->vmcb->save.rip > MAX_INST_SIZE) | 236 | if (svm->next_rip - svm->vmcb->save.rip > MAX_INST_SIZE) |
226 | printk(KERN_ERR "%s: ip 0x%llx next 0x%llx\n", | 237 | printk(KERN_ERR "%s: ip 0x%llx next 0x%llx\n", |
227 | __FUNCTION__, | 238 | __func__, |
228 | svm->vmcb->save.rip, | 239 | svm->vmcb->save.rip, |
229 | svm->next_rip); | 240 | svm->next_rip); |
230 | 241 | ||
@@ -279,11 +290,7 @@ static void svm_hardware_enable(void *garbage) | |||
279 | 290 | ||
280 | struct svm_cpu_data *svm_data; | 291 | struct svm_cpu_data *svm_data; |
281 | uint64_t efer; | 292 | uint64_t efer; |
282 | #ifdef CONFIG_X86_64 | ||
283 | struct desc_ptr gdt_descr; | ||
284 | #else | ||
285 | struct desc_ptr gdt_descr; | 293 | struct desc_ptr gdt_descr; |
286 | #endif | ||
287 | struct desc_struct *gdt; | 294 | struct desc_struct *gdt; |
288 | int me = raw_smp_processor_id(); | 295 | int me = raw_smp_processor_id(); |
289 | 296 | ||
@@ -302,7 +309,6 @@ static void svm_hardware_enable(void *garbage) | |||
302 | svm_data->asid_generation = 1; | 309 | svm_data->asid_generation = 1; |
303 | svm_data->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1; | 310 | svm_data->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1; |
304 | svm_data->next_asid = svm_data->max_asid + 1; | 311 | svm_data->next_asid = svm_data->max_asid + 1; |
305 | svm_features = cpuid_edx(SVM_CPUID_FUNC); | ||
306 | 312 | ||
307 | asm volatile ("sgdt %0" : "=m"(gdt_descr)); | 313 | asm volatile ("sgdt %0" : "=m"(gdt_descr)); |
308 | gdt = (struct desc_struct *)gdt_descr.address; | 314 | gdt = (struct desc_struct *)gdt_descr.address; |
@@ -361,12 +367,51 @@ static void set_msr_interception(u32 *msrpm, unsigned msr, | |||
361 | BUG(); | 367 | BUG(); |
362 | } | 368 | } |
363 | 369 | ||
370 | static void svm_vcpu_init_msrpm(u32 *msrpm) | ||
371 | { | ||
372 | memset(msrpm, 0xff, PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER)); | ||
373 | |||
374 | #ifdef CONFIG_X86_64 | ||
375 | set_msr_interception(msrpm, MSR_GS_BASE, 1, 1); | ||
376 | set_msr_interception(msrpm, MSR_FS_BASE, 1, 1); | ||
377 | set_msr_interception(msrpm, MSR_KERNEL_GS_BASE, 1, 1); | ||
378 | set_msr_interception(msrpm, MSR_LSTAR, 1, 1); | ||
379 | set_msr_interception(msrpm, MSR_CSTAR, 1, 1); | ||
380 | set_msr_interception(msrpm, MSR_SYSCALL_MASK, 1, 1); | ||
381 | #endif | ||
382 | set_msr_interception(msrpm, MSR_K6_STAR, 1, 1); | ||
383 | set_msr_interception(msrpm, MSR_IA32_SYSENTER_CS, 1, 1); | ||
384 | set_msr_interception(msrpm, MSR_IA32_SYSENTER_ESP, 1, 1); | ||
385 | set_msr_interception(msrpm, MSR_IA32_SYSENTER_EIP, 1, 1); | ||
386 | } | ||
387 | |||
388 | static void svm_enable_lbrv(struct vcpu_svm *svm) | ||
389 | { | ||
390 | u32 *msrpm = svm->msrpm; | ||
391 | |||
392 | svm->vmcb->control.lbr_ctl = 1; | ||
393 | set_msr_interception(msrpm, MSR_IA32_LASTBRANCHFROMIP, 1, 1); | ||
394 | set_msr_interception(msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1); | ||
395 | set_msr_interception(msrpm, MSR_IA32_LASTINTFROMIP, 1, 1); | ||
396 | set_msr_interception(msrpm, MSR_IA32_LASTINTTOIP, 1, 1); | ||
397 | } | ||
398 | |||
399 | static void svm_disable_lbrv(struct vcpu_svm *svm) | ||
400 | { | ||
401 | u32 *msrpm = svm->msrpm; | ||
402 | |||
403 | svm->vmcb->control.lbr_ctl = 0; | ||
404 | set_msr_interception(msrpm, MSR_IA32_LASTBRANCHFROMIP, 0, 0); | ||
405 | set_msr_interception(msrpm, MSR_IA32_LASTBRANCHTOIP, 0, 0); | ||
406 | set_msr_interception(msrpm, MSR_IA32_LASTINTFROMIP, 0, 0); | ||
407 | set_msr_interception(msrpm, MSR_IA32_LASTINTTOIP, 0, 0); | ||
408 | } | ||
409 | |||
364 | static __init int svm_hardware_setup(void) | 410 | static __init int svm_hardware_setup(void) |
365 | { | 411 | { |
366 | int cpu; | 412 | int cpu; |
367 | struct page *iopm_pages; | 413 | struct page *iopm_pages; |
368 | struct page *msrpm_pages; | 414 | void *iopm_va; |
369 | void *iopm_va, *msrpm_va; | ||
370 | int r; | 415 | int r; |
371 | 416 | ||
372 | iopm_pages = alloc_pages(GFP_KERNEL, IOPM_ALLOC_ORDER); | 417 | iopm_pages = alloc_pages(GFP_KERNEL, IOPM_ALLOC_ORDER); |
@@ -379,41 +424,33 @@ static __init int svm_hardware_setup(void) | |||
379 | clear_bit(0x80, iopm_va); /* allow direct access to PC debug port */ | 424 | clear_bit(0x80, iopm_va); /* allow direct access to PC debug port */ |
380 | iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT; | 425 | iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT; |
381 | 426 | ||
427 | if (boot_cpu_has(X86_FEATURE_NX)) | ||
428 | kvm_enable_efer_bits(EFER_NX); | ||
382 | 429 | ||
383 | msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER); | 430 | for_each_online_cpu(cpu) { |
431 | r = svm_cpu_init(cpu); | ||
432 | if (r) | ||
433 | goto err; | ||
434 | } | ||
384 | 435 | ||
385 | r = -ENOMEM; | 436 | svm_features = cpuid_edx(SVM_CPUID_FUNC); |
386 | if (!msrpm_pages) | ||
387 | goto err_1; | ||
388 | 437 | ||
389 | msrpm_va = page_address(msrpm_pages); | 438 | if (!svm_has(SVM_FEATURE_NPT)) |
390 | memset(msrpm_va, 0xff, PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER)); | 439 | npt_enabled = false; |
391 | msrpm_base = page_to_pfn(msrpm_pages) << PAGE_SHIFT; | ||
392 | 440 | ||
393 | #ifdef CONFIG_X86_64 | 441 | if (npt_enabled && !npt) { |
394 | set_msr_interception(msrpm_va, MSR_GS_BASE, 1, 1); | 442 | printk(KERN_INFO "kvm: Nested Paging disabled\n"); |
395 | set_msr_interception(msrpm_va, MSR_FS_BASE, 1, 1); | 443 | npt_enabled = false; |
396 | set_msr_interception(msrpm_va, MSR_KERNEL_GS_BASE, 1, 1); | 444 | } |
397 | set_msr_interception(msrpm_va, MSR_LSTAR, 1, 1); | ||
398 | set_msr_interception(msrpm_va, MSR_CSTAR, 1, 1); | ||
399 | set_msr_interception(msrpm_va, MSR_SYSCALL_MASK, 1, 1); | ||
400 | #endif | ||
401 | set_msr_interception(msrpm_va, MSR_K6_STAR, 1, 1); | ||
402 | set_msr_interception(msrpm_va, MSR_IA32_SYSENTER_CS, 1, 1); | ||
403 | set_msr_interception(msrpm_va, MSR_IA32_SYSENTER_ESP, 1, 1); | ||
404 | set_msr_interception(msrpm_va, MSR_IA32_SYSENTER_EIP, 1, 1); | ||
405 | 445 | ||
406 | for_each_online_cpu(cpu) { | 446 | if (npt_enabled) { |
407 | r = svm_cpu_init(cpu); | 447 | printk(KERN_INFO "kvm: Nested Paging enabled\n"); |
408 | if (r) | 448 | kvm_enable_tdp(); |
409 | goto err_2; | ||
410 | } | 449 | } |
450 | |||
411 | return 0; | 451 | return 0; |
412 | 452 | ||
413 | err_2: | 453 | err: |
414 | __free_pages(msrpm_pages, MSRPM_ALLOC_ORDER); | ||
415 | msrpm_base = 0; | ||
416 | err_1: | ||
417 | __free_pages(iopm_pages, IOPM_ALLOC_ORDER); | 454 | __free_pages(iopm_pages, IOPM_ALLOC_ORDER); |
418 | iopm_base = 0; | 455 | iopm_base = 0; |
419 | return r; | 456 | return r; |
@@ -421,9 +458,8 @@ err_1: | |||
421 | 458 | ||
422 | static __exit void svm_hardware_unsetup(void) | 459 | static __exit void svm_hardware_unsetup(void) |
423 | { | 460 | { |
424 | __free_pages(pfn_to_page(msrpm_base >> PAGE_SHIFT), MSRPM_ALLOC_ORDER); | ||
425 | __free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT), IOPM_ALLOC_ORDER); | 461 | __free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT), IOPM_ALLOC_ORDER); |
426 | iopm_base = msrpm_base = 0; | 462 | iopm_base = 0; |
427 | } | 463 | } |
428 | 464 | ||
429 | static void init_seg(struct vmcb_seg *seg) | 465 | static void init_seg(struct vmcb_seg *seg) |
@@ -443,15 +479,14 @@ static void init_sys_seg(struct vmcb_seg *seg, uint32_t type) | |||
443 | seg->base = 0; | 479 | seg->base = 0; |
444 | } | 480 | } |
445 | 481 | ||
446 | static void init_vmcb(struct vmcb *vmcb) | 482 | static void init_vmcb(struct vcpu_svm *svm) |
447 | { | 483 | { |
448 | struct vmcb_control_area *control = &vmcb->control; | 484 | struct vmcb_control_area *control = &svm->vmcb->control; |
449 | struct vmcb_save_area *save = &vmcb->save; | 485 | struct vmcb_save_area *save = &svm->vmcb->save; |
450 | 486 | ||
451 | control->intercept_cr_read = INTERCEPT_CR0_MASK | | 487 | control->intercept_cr_read = INTERCEPT_CR0_MASK | |
452 | INTERCEPT_CR3_MASK | | 488 | INTERCEPT_CR3_MASK | |
453 | INTERCEPT_CR4_MASK | | 489 | INTERCEPT_CR4_MASK; |
454 | INTERCEPT_CR8_MASK; | ||
455 | 490 | ||
456 | control->intercept_cr_write = INTERCEPT_CR0_MASK | | 491 | control->intercept_cr_write = INTERCEPT_CR0_MASK | |
457 | INTERCEPT_CR3_MASK | | 492 | INTERCEPT_CR3_MASK | |
@@ -471,23 +506,13 @@ static void init_vmcb(struct vmcb *vmcb) | |||
471 | INTERCEPT_DR7_MASK; | 506 | INTERCEPT_DR7_MASK; |
472 | 507 | ||
473 | control->intercept_exceptions = (1 << PF_VECTOR) | | 508 | control->intercept_exceptions = (1 << PF_VECTOR) | |
474 | (1 << UD_VECTOR); | 509 | (1 << UD_VECTOR) | |
510 | (1 << MC_VECTOR); | ||
475 | 511 | ||
476 | 512 | ||
477 | control->intercept = (1ULL << INTERCEPT_INTR) | | 513 | control->intercept = (1ULL << INTERCEPT_INTR) | |
478 | (1ULL << INTERCEPT_NMI) | | 514 | (1ULL << INTERCEPT_NMI) | |
479 | (1ULL << INTERCEPT_SMI) | | 515 | (1ULL << INTERCEPT_SMI) | |
480 | /* | ||
481 | * selective cr0 intercept bug? | ||
482 | * 0: 0f 22 d8 mov %eax,%cr3 | ||
483 | * 3: 0f 20 c0 mov %cr0,%eax | ||
484 | * 6: 0d 00 00 00 80 or $0x80000000,%eax | ||
485 | * b: 0f 22 c0 mov %eax,%cr0 | ||
486 | * set cr3 ->interception | ||
487 | * get cr0 ->interception | ||
488 | * set cr0 -> no interception | ||
489 | */ | ||
490 | /* (1ULL << INTERCEPT_SELECTIVE_CR0) | */ | ||
491 | (1ULL << INTERCEPT_CPUID) | | 516 | (1ULL << INTERCEPT_CPUID) | |
492 | (1ULL << INTERCEPT_INVD) | | 517 | (1ULL << INTERCEPT_INVD) | |
493 | (1ULL << INTERCEPT_HLT) | | 518 | (1ULL << INTERCEPT_HLT) | |
@@ -508,7 +533,7 @@ static void init_vmcb(struct vmcb *vmcb) | |||
508 | (1ULL << INTERCEPT_MWAIT); | 533 | (1ULL << INTERCEPT_MWAIT); |
509 | 534 | ||
510 | control->iopm_base_pa = iopm_base; | 535 | control->iopm_base_pa = iopm_base; |
511 | control->msrpm_base_pa = msrpm_base; | 536 | control->msrpm_base_pa = __pa(svm->msrpm); |
512 | control->tsc_offset = 0; | 537 | control->tsc_offset = 0; |
513 | control->int_ctl = V_INTR_MASKING_MASK; | 538 | control->int_ctl = V_INTR_MASKING_MASK; |
514 | 539 | ||
@@ -550,13 +575,30 @@ static void init_vmcb(struct vmcb *vmcb) | |||
550 | save->cr0 = 0x00000010 | X86_CR0_PG | X86_CR0_WP; | 575 | save->cr0 = 0x00000010 | X86_CR0_PG | X86_CR0_WP; |
551 | save->cr4 = X86_CR4_PAE; | 576 | save->cr4 = X86_CR4_PAE; |
552 | /* rdx = ?? */ | 577 | /* rdx = ?? */ |
578 | |||
579 | if (npt_enabled) { | ||
580 | /* Setup VMCB for Nested Paging */ | ||
581 | control->nested_ctl = 1; | ||
582 | control->intercept &= ~(1ULL << INTERCEPT_TASK_SWITCH); | ||
583 | control->intercept_exceptions &= ~(1 << PF_VECTOR); | ||
584 | control->intercept_cr_read &= ~(INTERCEPT_CR0_MASK| | ||
585 | INTERCEPT_CR3_MASK); | ||
586 | control->intercept_cr_write &= ~(INTERCEPT_CR0_MASK| | ||
587 | INTERCEPT_CR3_MASK); | ||
588 | save->g_pat = 0x0007040600070406ULL; | ||
589 | /* enable caching because the QEMU Bios doesn't enable it */ | ||
590 | save->cr0 = X86_CR0_ET; | ||
591 | save->cr3 = 0; | ||
592 | save->cr4 = 0; | ||
593 | } | ||
594 | force_new_asid(&svm->vcpu); | ||
553 | } | 595 | } |
554 | 596 | ||
555 | static int svm_vcpu_reset(struct kvm_vcpu *vcpu) | 597 | static int svm_vcpu_reset(struct kvm_vcpu *vcpu) |
556 | { | 598 | { |
557 | struct vcpu_svm *svm = to_svm(vcpu); | 599 | struct vcpu_svm *svm = to_svm(vcpu); |
558 | 600 | ||
559 | init_vmcb(svm->vmcb); | 601 | init_vmcb(svm); |
560 | 602 | ||
561 | if (vcpu->vcpu_id != 0) { | 603 | if (vcpu->vcpu_id != 0) { |
562 | svm->vmcb->save.rip = 0; | 604 | svm->vmcb->save.rip = 0; |
@@ -571,6 +613,7 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) | |||
571 | { | 613 | { |
572 | struct vcpu_svm *svm; | 614 | struct vcpu_svm *svm; |
573 | struct page *page; | 615 | struct page *page; |
616 | struct page *msrpm_pages; | ||
574 | int err; | 617 | int err; |
575 | 618 | ||
576 | svm = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); | 619 | svm = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); |
@@ -589,12 +632,19 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) | |||
589 | goto uninit; | 632 | goto uninit; |
590 | } | 633 | } |
591 | 634 | ||
635 | err = -ENOMEM; | ||
636 | msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER); | ||
637 | if (!msrpm_pages) | ||
638 | goto uninit; | ||
639 | svm->msrpm = page_address(msrpm_pages); | ||
640 | svm_vcpu_init_msrpm(svm->msrpm); | ||
641 | |||
592 | svm->vmcb = page_address(page); | 642 | svm->vmcb = page_address(page); |
593 | clear_page(svm->vmcb); | 643 | clear_page(svm->vmcb); |
594 | svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT; | 644 | svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT; |
595 | svm->asid_generation = 0; | 645 | svm->asid_generation = 0; |
596 | memset(svm->db_regs, 0, sizeof(svm->db_regs)); | 646 | memset(svm->db_regs, 0, sizeof(svm->db_regs)); |
597 | init_vmcb(svm->vmcb); | 647 | init_vmcb(svm); |
598 | 648 | ||
599 | fx_init(&svm->vcpu); | 649 | fx_init(&svm->vcpu); |
600 | svm->vcpu.fpu_active = 1; | 650 | svm->vcpu.fpu_active = 1; |
@@ -617,6 +667,7 @@ static void svm_free_vcpu(struct kvm_vcpu *vcpu) | |||
617 | struct vcpu_svm *svm = to_svm(vcpu); | 667 | struct vcpu_svm *svm = to_svm(vcpu); |
618 | 668 | ||
619 | __free_page(pfn_to_page(svm->vmcb_pa >> PAGE_SHIFT)); | 669 | __free_page(pfn_to_page(svm->vmcb_pa >> PAGE_SHIFT)); |
670 | __free_pages(virt_to_page(svm->msrpm), MSRPM_ALLOC_ORDER); | ||
620 | kvm_vcpu_uninit(vcpu); | 671 | kvm_vcpu_uninit(vcpu); |
621 | kmem_cache_free(kvm_vcpu_cache, svm); | 672 | kmem_cache_free(kvm_vcpu_cache, svm); |
622 | } | 673 | } |
@@ -731,6 +782,13 @@ static void svm_get_segment(struct kvm_vcpu *vcpu, | |||
731 | var->unusable = !var->present; | 782 | var->unusable = !var->present; |
732 | } | 783 | } |
733 | 784 | ||
785 | static int svm_get_cpl(struct kvm_vcpu *vcpu) | ||
786 | { | ||
787 | struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save; | ||
788 | |||
789 | return save->cpl; | ||
790 | } | ||
791 | |||
734 | static void svm_get_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) | 792 | static void svm_get_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) |
735 | { | 793 | { |
736 | struct vcpu_svm *svm = to_svm(vcpu); | 794 | struct vcpu_svm *svm = to_svm(vcpu); |
@@ -784,6 +842,9 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) | |||
784 | } | 842 | } |
785 | } | 843 | } |
786 | #endif | 844 | #endif |
845 | if (npt_enabled) | ||
846 | goto set; | ||
847 | |||
787 | if ((vcpu->arch.cr0 & X86_CR0_TS) && !(cr0 & X86_CR0_TS)) { | 848 | if ((vcpu->arch.cr0 & X86_CR0_TS) && !(cr0 & X86_CR0_TS)) { |
788 | svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR); | 849 | svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR); |
789 | vcpu->fpu_active = 1; | 850 | vcpu->fpu_active = 1; |
@@ -791,18 +852,29 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) | |||
791 | 852 | ||
792 | vcpu->arch.cr0 = cr0; | 853 | vcpu->arch.cr0 = cr0; |
793 | cr0 |= X86_CR0_PG | X86_CR0_WP; | 854 | cr0 |= X86_CR0_PG | X86_CR0_WP; |
794 | cr0 &= ~(X86_CR0_CD | X86_CR0_NW); | ||
795 | if (!vcpu->fpu_active) { | 855 | if (!vcpu->fpu_active) { |
796 | svm->vmcb->control.intercept_exceptions |= (1 << NM_VECTOR); | 856 | svm->vmcb->control.intercept_exceptions |= (1 << NM_VECTOR); |
797 | cr0 |= X86_CR0_TS; | 857 | cr0 |= X86_CR0_TS; |
798 | } | 858 | } |
859 | set: | ||
860 | /* | ||
861 | * re-enable caching here because the QEMU bios | ||
862 | * does not do it - this results in some delay at | ||
863 | * reboot | ||
864 | */ | ||
865 | cr0 &= ~(X86_CR0_CD | X86_CR0_NW); | ||
799 | svm->vmcb->save.cr0 = cr0; | 866 | svm->vmcb->save.cr0 = cr0; |
800 | } | 867 | } |
801 | 868 | ||
802 | static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) | 869 | static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) |
803 | { | 870 | { |
804 | vcpu->arch.cr4 = cr4; | 871 | unsigned long host_cr4_mce = read_cr4() & X86_CR4_MCE; |
805 | to_svm(vcpu)->vmcb->save.cr4 = cr4 | X86_CR4_PAE; | 872 | |
873 | vcpu->arch.cr4 = cr4; | ||
874 | if (!npt_enabled) | ||
875 | cr4 |= X86_CR4_PAE; | ||
876 | cr4 |= host_cr4_mce; | ||
877 | to_svm(vcpu)->vmcb->save.cr4 = cr4; | ||
806 | } | 878 | } |
807 | 879 | ||
808 | static void svm_set_segment(struct kvm_vcpu *vcpu, | 880 | static void svm_set_segment(struct kvm_vcpu *vcpu, |
@@ -833,13 +905,6 @@ static void svm_set_segment(struct kvm_vcpu *vcpu, | |||
833 | 905 | ||
834 | } | 906 | } |
835 | 907 | ||
836 | /* FIXME: | ||
837 | |||
838 | svm(vcpu)->vmcb->control.int_ctl &= ~V_TPR_MASK; | ||
839 | svm(vcpu)->vmcb->control.int_ctl |= (sregs->cr8 & V_TPR_MASK); | ||
840 | |||
841 | */ | ||
842 | |||
843 | static int svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg) | 908 | static int svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg) |
844 | { | 909 | { |
845 | return -EOPNOTSUPP; | 910 | return -EOPNOTSUPP; |
@@ -920,7 +985,7 @@ static void svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value, | |||
920 | } | 985 | } |
921 | default: | 986 | default: |
922 | printk(KERN_DEBUG "%s: unexpected dr %u\n", | 987 | printk(KERN_DEBUG "%s: unexpected dr %u\n", |
923 | __FUNCTION__, dr); | 988 | __func__, dr); |
924 | *exception = UD_VECTOR; | 989 | *exception = UD_VECTOR; |
925 | return; | 990 | return; |
926 | } | 991 | } |
@@ -962,6 +1027,19 @@ static int nm_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) | |||
962 | return 1; | 1027 | return 1; |
963 | } | 1028 | } |
964 | 1029 | ||
1030 | static int mc_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) | ||
1031 | { | ||
1032 | /* | ||
1033 | * On an #MC intercept the MCE handler is not called automatically in | ||
1034 | * the host. So do it by hand here. | ||
1035 | */ | ||
1036 | asm volatile ( | ||
1037 | "int $0x12\n"); | ||
1038 | /* not sure if we ever come back to this point */ | ||
1039 | |||
1040 | return 1; | ||
1041 | } | ||
1042 | |||
965 | static int shutdown_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) | 1043 | static int shutdown_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) |
966 | { | 1044 | { |
967 | /* | 1045 | /* |
@@ -969,7 +1047,7 @@ static int shutdown_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) | |||
969 | * so reinitialize it. | 1047 | * so reinitialize it. |
970 | */ | 1048 | */ |
971 | clear_page(svm->vmcb); | 1049 | clear_page(svm->vmcb); |
972 | init_vmcb(svm->vmcb); | 1050 | init_vmcb(svm); |
973 | 1051 | ||
974 | kvm_run->exit_reason = KVM_EXIT_SHUTDOWN; | 1052 | kvm_run->exit_reason = KVM_EXIT_SHUTDOWN; |
975 | return 0; | 1053 | return 0; |
@@ -1033,9 +1111,18 @@ static int invalid_op_interception(struct vcpu_svm *svm, | |||
1033 | static int task_switch_interception(struct vcpu_svm *svm, | 1111 | static int task_switch_interception(struct vcpu_svm *svm, |
1034 | struct kvm_run *kvm_run) | 1112 | struct kvm_run *kvm_run) |
1035 | { | 1113 | { |
1036 | pr_unimpl(&svm->vcpu, "%s: task switch is unsupported\n", __FUNCTION__); | 1114 | u16 tss_selector; |
1037 | kvm_run->exit_reason = KVM_EXIT_UNKNOWN; | 1115 | |
1038 | return 0; | 1116 | tss_selector = (u16)svm->vmcb->control.exit_info_1; |
1117 | if (svm->vmcb->control.exit_info_2 & | ||
1118 | (1ULL << SVM_EXITINFOSHIFT_TS_REASON_IRET)) | ||
1119 | return kvm_task_switch(&svm->vcpu, tss_selector, | ||
1120 | TASK_SWITCH_IRET); | ||
1121 | if (svm->vmcb->control.exit_info_2 & | ||
1122 | (1ULL << SVM_EXITINFOSHIFT_TS_REASON_JMP)) | ||
1123 | return kvm_task_switch(&svm->vcpu, tss_selector, | ||
1124 | TASK_SWITCH_JMP); | ||
1125 | return kvm_task_switch(&svm->vcpu, tss_selector, TASK_SWITCH_CALL); | ||
1039 | } | 1126 | } |
1040 | 1127 | ||
1041 | static int cpuid_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) | 1128 | static int cpuid_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) |
@@ -1049,7 +1136,7 @@ static int emulate_on_interception(struct vcpu_svm *svm, | |||
1049 | struct kvm_run *kvm_run) | 1136 | struct kvm_run *kvm_run) |
1050 | { | 1137 | { |
1051 | if (emulate_instruction(&svm->vcpu, NULL, 0, 0, 0) != EMULATE_DONE) | 1138 | if (emulate_instruction(&svm->vcpu, NULL, 0, 0, 0) != EMULATE_DONE) |
1052 | pr_unimpl(&svm->vcpu, "%s: failed\n", __FUNCTION__); | 1139 | pr_unimpl(&svm->vcpu, "%s: failed\n", __func__); |
1053 | return 1; | 1140 | return 1; |
1054 | } | 1141 | } |
1055 | 1142 | ||
@@ -1179,8 +1266,19 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data) | |||
1179 | svm->vmcb->save.sysenter_esp = data; | 1266 | svm->vmcb->save.sysenter_esp = data; |
1180 | break; | 1267 | break; |
1181 | case MSR_IA32_DEBUGCTLMSR: | 1268 | case MSR_IA32_DEBUGCTLMSR: |
1182 | pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n", | 1269 | if (!svm_has(SVM_FEATURE_LBRV)) { |
1183 | __FUNCTION__, data); | 1270 | pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n", |
1271 | __func__, data); | ||
1272 | break; | ||
1273 | } | ||
1274 | if (data & DEBUGCTL_RESERVED_BITS) | ||
1275 | return 1; | ||
1276 | |||
1277 | svm->vmcb->save.dbgctl = data; | ||
1278 | if (data & (1ULL<<0)) | ||
1279 | svm_enable_lbrv(svm); | ||
1280 | else | ||
1281 | svm_disable_lbrv(svm); | ||
1184 | break; | 1282 | break; |
1185 | case MSR_K7_EVNTSEL0: | 1283 | case MSR_K7_EVNTSEL0: |
1186 | case MSR_K7_EVNTSEL1: | 1284 | case MSR_K7_EVNTSEL1: |
@@ -1265,6 +1363,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm, | |||
1265 | [SVM_EXIT_EXCP_BASE + UD_VECTOR] = ud_interception, | 1363 | [SVM_EXIT_EXCP_BASE + UD_VECTOR] = ud_interception, |
1266 | [SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception, | 1364 | [SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception, |
1267 | [SVM_EXIT_EXCP_BASE + NM_VECTOR] = nm_interception, | 1365 | [SVM_EXIT_EXCP_BASE + NM_VECTOR] = nm_interception, |
1366 | [SVM_EXIT_EXCP_BASE + MC_VECTOR] = mc_interception, | ||
1268 | [SVM_EXIT_INTR] = nop_on_interception, | 1367 | [SVM_EXIT_INTR] = nop_on_interception, |
1269 | [SVM_EXIT_NMI] = nop_on_interception, | 1368 | [SVM_EXIT_NMI] = nop_on_interception, |
1270 | [SVM_EXIT_SMI] = nop_on_interception, | 1369 | [SVM_EXIT_SMI] = nop_on_interception, |
@@ -1290,14 +1389,34 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm, | |||
1290 | [SVM_EXIT_WBINVD] = emulate_on_interception, | 1389 | [SVM_EXIT_WBINVD] = emulate_on_interception, |
1291 | [SVM_EXIT_MONITOR] = invalid_op_interception, | 1390 | [SVM_EXIT_MONITOR] = invalid_op_interception, |
1292 | [SVM_EXIT_MWAIT] = invalid_op_interception, | 1391 | [SVM_EXIT_MWAIT] = invalid_op_interception, |
1392 | [SVM_EXIT_NPF] = pf_interception, | ||
1293 | }; | 1393 | }; |
1294 | 1394 | ||
1295 | |||
1296 | static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) | 1395 | static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) |
1297 | { | 1396 | { |
1298 | struct vcpu_svm *svm = to_svm(vcpu); | 1397 | struct vcpu_svm *svm = to_svm(vcpu); |
1299 | u32 exit_code = svm->vmcb->control.exit_code; | 1398 | u32 exit_code = svm->vmcb->control.exit_code; |
1300 | 1399 | ||
1400 | if (npt_enabled) { | ||
1401 | int mmu_reload = 0; | ||
1402 | if ((vcpu->arch.cr0 ^ svm->vmcb->save.cr0) & X86_CR0_PG) { | ||
1403 | svm_set_cr0(vcpu, svm->vmcb->save.cr0); | ||
1404 | mmu_reload = 1; | ||
1405 | } | ||
1406 | vcpu->arch.cr0 = svm->vmcb->save.cr0; | ||
1407 | vcpu->arch.cr3 = svm->vmcb->save.cr3; | ||
1408 | if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) { | ||
1409 | if (!load_pdptrs(vcpu, vcpu->arch.cr3)) { | ||
1410 | kvm_inject_gp(vcpu, 0); | ||
1411 | return 1; | ||
1412 | } | ||
1413 | } | ||
1414 | if (mmu_reload) { | ||
1415 | kvm_mmu_reset_context(vcpu); | ||
1416 | kvm_mmu_load(vcpu); | ||
1417 | } | ||
1418 | } | ||
1419 | |||
1301 | kvm_reput_irq(svm); | 1420 | kvm_reput_irq(svm); |
1302 | 1421 | ||
1303 | if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) { | 1422 | if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) { |
@@ -1308,10 +1427,11 @@ static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) | |||
1308 | } | 1427 | } |
1309 | 1428 | ||
1310 | if (is_external_interrupt(svm->vmcb->control.exit_int_info) && | 1429 | if (is_external_interrupt(svm->vmcb->control.exit_int_info) && |
1311 | exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR) | 1430 | exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR && |
1431 | exit_code != SVM_EXIT_NPF) | ||
1312 | printk(KERN_ERR "%s: unexpected exit_ini_info 0x%x " | 1432 | printk(KERN_ERR "%s: unexpected exit_ini_info 0x%x " |
1313 | "exit_code 0x%x\n", | 1433 | "exit_code 0x%x\n", |
1314 | __FUNCTION__, svm->vmcb->control.exit_int_info, | 1434 | __func__, svm->vmcb->control.exit_int_info, |
1315 | exit_code); | 1435 | exit_code); |
1316 | 1436 | ||
1317 | if (exit_code >= ARRAY_SIZE(svm_exit_handlers) | 1437 | if (exit_code >= ARRAY_SIZE(svm_exit_handlers) |
@@ -1364,6 +1484,27 @@ static void svm_set_irq(struct kvm_vcpu *vcpu, int irq) | |||
1364 | svm_inject_irq(svm, irq); | 1484 | svm_inject_irq(svm, irq); |
1365 | } | 1485 | } |
1366 | 1486 | ||
1487 | static void update_cr8_intercept(struct kvm_vcpu *vcpu) | ||
1488 | { | ||
1489 | struct vcpu_svm *svm = to_svm(vcpu); | ||
1490 | struct vmcb *vmcb = svm->vmcb; | ||
1491 | int max_irr, tpr; | ||
1492 | |||
1493 | if (!irqchip_in_kernel(vcpu->kvm) || vcpu->arch.apic->vapic_addr) | ||
1494 | return; | ||
1495 | |||
1496 | vmcb->control.intercept_cr_write &= ~INTERCEPT_CR8_MASK; | ||
1497 | |||
1498 | max_irr = kvm_lapic_find_highest_irr(vcpu); | ||
1499 | if (max_irr == -1) | ||
1500 | return; | ||
1501 | |||
1502 | tpr = kvm_lapic_get_cr8(vcpu) << 4; | ||
1503 | |||
1504 | if (tpr >= (max_irr & 0xf0)) | ||
1505 | vmcb->control.intercept_cr_write |= INTERCEPT_CR8_MASK; | ||
1506 | } | ||
1507 | |||
1367 | static void svm_intr_assist(struct kvm_vcpu *vcpu) | 1508 | static void svm_intr_assist(struct kvm_vcpu *vcpu) |
1368 | { | 1509 | { |
1369 | struct vcpu_svm *svm = to_svm(vcpu); | 1510 | struct vcpu_svm *svm = to_svm(vcpu); |
@@ -1376,14 +1517,14 @@ static void svm_intr_assist(struct kvm_vcpu *vcpu) | |||
1376 | SVM_EVTINJ_VEC_MASK; | 1517 | SVM_EVTINJ_VEC_MASK; |
1377 | vmcb->control.exit_int_info = 0; | 1518 | vmcb->control.exit_int_info = 0; |
1378 | svm_inject_irq(svm, intr_vector); | 1519 | svm_inject_irq(svm, intr_vector); |
1379 | return; | 1520 | goto out; |
1380 | } | 1521 | } |
1381 | 1522 | ||
1382 | if (vmcb->control.int_ctl & V_IRQ_MASK) | 1523 | if (vmcb->control.int_ctl & V_IRQ_MASK) |
1383 | return; | 1524 | goto out; |
1384 | 1525 | ||
1385 | if (!kvm_cpu_has_interrupt(vcpu)) | 1526 | if (!kvm_cpu_has_interrupt(vcpu)) |
1386 | return; | 1527 | goto out; |
1387 | 1528 | ||
1388 | if (!(vmcb->save.rflags & X86_EFLAGS_IF) || | 1529 | if (!(vmcb->save.rflags & X86_EFLAGS_IF) || |
1389 | (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) || | 1530 | (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) || |
@@ -1391,12 +1532,14 @@ static void svm_intr_assist(struct kvm_vcpu *vcpu) | |||
1391 | /* unable to deliver irq, set pending irq */ | 1532 | /* unable to deliver irq, set pending irq */ |
1392 | vmcb->control.intercept |= (1ULL << INTERCEPT_VINTR); | 1533 | vmcb->control.intercept |= (1ULL << INTERCEPT_VINTR); |
1393 | svm_inject_irq(svm, 0x0); | 1534 | svm_inject_irq(svm, 0x0); |
1394 | return; | 1535 | goto out; |
1395 | } | 1536 | } |
1396 | /* Okay, we can deliver the interrupt: grab it and update PIC state. */ | 1537 | /* Okay, we can deliver the interrupt: grab it and update PIC state. */ |
1397 | intr_vector = kvm_cpu_get_interrupt(vcpu); | 1538 | intr_vector = kvm_cpu_get_interrupt(vcpu); |
1398 | svm_inject_irq(svm, intr_vector); | 1539 | svm_inject_irq(svm, intr_vector); |
1399 | kvm_timer_intr_post(vcpu, intr_vector); | 1540 | kvm_timer_intr_post(vcpu, intr_vector); |
1541 | out: | ||
1542 | update_cr8_intercept(vcpu); | ||
1400 | } | 1543 | } |
1401 | 1544 | ||
1402 | static void kvm_reput_irq(struct vcpu_svm *svm) | 1545 | static void kvm_reput_irq(struct vcpu_svm *svm) |
@@ -1482,6 +1625,29 @@ static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu) | |||
1482 | { | 1625 | { |
1483 | } | 1626 | } |
1484 | 1627 | ||
1628 | static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu) | ||
1629 | { | ||
1630 | struct vcpu_svm *svm = to_svm(vcpu); | ||
1631 | |||
1632 | if (!(svm->vmcb->control.intercept_cr_write & INTERCEPT_CR8_MASK)) { | ||
1633 | int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK; | ||
1634 | kvm_lapic_set_tpr(vcpu, cr8); | ||
1635 | } | ||
1636 | } | ||
1637 | |||
1638 | static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu) | ||
1639 | { | ||
1640 | struct vcpu_svm *svm = to_svm(vcpu); | ||
1641 | u64 cr8; | ||
1642 | |||
1643 | if (!irqchip_in_kernel(vcpu->kvm)) | ||
1644 | return; | ||
1645 | |||
1646 | cr8 = kvm_get_cr8(vcpu); | ||
1647 | svm->vmcb->control.int_ctl &= ~V_TPR_MASK; | ||
1648 | svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK; | ||
1649 | } | ||
1650 | |||
1485 | static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | 1651 | static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) |
1486 | { | 1652 | { |
1487 | struct vcpu_svm *svm = to_svm(vcpu); | 1653 | struct vcpu_svm *svm = to_svm(vcpu); |
@@ -1491,6 +1657,8 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
1491 | 1657 | ||
1492 | pre_svm_run(svm); | 1658 | pre_svm_run(svm); |
1493 | 1659 | ||
1660 | sync_lapic_to_cr8(vcpu); | ||
1661 | |||
1494 | save_host_msrs(vcpu); | 1662 | save_host_msrs(vcpu); |
1495 | fs_selector = read_fs(); | 1663 | fs_selector = read_fs(); |
1496 | gs_selector = read_gs(); | 1664 | gs_selector = read_gs(); |
@@ -1499,6 +1667,9 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
1499 | svm->host_dr6 = read_dr6(); | 1667 | svm->host_dr6 = read_dr6(); |
1500 | svm->host_dr7 = read_dr7(); | 1668 | svm->host_dr7 = read_dr7(); |
1501 | svm->vmcb->save.cr2 = vcpu->arch.cr2; | 1669 | svm->vmcb->save.cr2 = vcpu->arch.cr2; |
1670 | /* required for live migration with NPT */ | ||
1671 | if (npt_enabled) | ||
1672 | svm->vmcb->save.cr3 = vcpu->arch.cr3; | ||
1502 | 1673 | ||
1503 | if (svm->vmcb->save.dr7 & 0xff) { | 1674 | if (svm->vmcb->save.dr7 & 0xff) { |
1504 | write_dr7(0); | 1675 | write_dr7(0); |
@@ -1635,6 +1806,8 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
1635 | 1806 | ||
1636 | stgi(); | 1807 | stgi(); |
1637 | 1808 | ||
1809 | sync_cr8_to_lapic(vcpu); | ||
1810 | |||
1638 | svm->next_rip = 0; | 1811 | svm->next_rip = 0; |
1639 | } | 1812 | } |
1640 | 1813 | ||
@@ -1642,6 +1815,12 @@ static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root) | |||
1642 | { | 1815 | { |
1643 | struct vcpu_svm *svm = to_svm(vcpu); | 1816 | struct vcpu_svm *svm = to_svm(vcpu); |
1644 | 1817 | ||
1818 | if (npt_enabled) { | ||
1819 | svm->vmcb->control.nested_cr3 = root; | ||
1820 | force_new_asid(vcpu); | ||
1821 | return; | ||
1822 | } | ||
1823 | |||
1645 | svm->vmcb->save.cr3 = root; | 1824 | svm->vmcb->save.cr3 = root; |
1646 | force_new_asid(vcpu); | 1825 | force_new_asid(vcpu); |
1647 | 1826 | ||
@@ -1709,6 +1888,7 @@ static struct kvm_x86_ops svm_x86_ops = { | |||
1709 | .get_segment_base = svm_get_segment_base, | 1888 | .get_segment_base = svm_get_segment_base, |
1710 | .get_segment = svm_get_segment, | 1889 | .get_segment = svm_get_segment, |
1711 | .set_segment = svm_set_segment, | 1890 | .set_segment = svm_set_segment, |
1891 | .get_cpl = svm_get_cpl, | ||
1712 | .get_cs_db_l_bits = kvm_get_cs_db_l_bits, | 1892 | .get_cs_db_l_bits = kvm_get_cs_db_l_bits, |
1713 | .decache_cr4_guest_bits = svm_decache_cr4_guest_bits, | 1893 | .decache_cr4_guest_bits = svm_decache_cr4_guest_bits, |
1714 | .set_cr0 = svm_set_cr0, | 1894 | .set_cr0 = svm_set_cr0, |
diff --git a/arch/x86/kvm/svm.h b/arch/x86/kvm/svm.h index 5fd50491b555..1b8afa78e869 100644 --- a/arch/x86/kvm/svm.h +++ b/arch/x86/kvm/svm.h | |||
@@ -238,6 +238,9 @@ struct __attribute__ ((__packed__)) vmcb { | |||
238 | #define SVM_EXITINTINFO_VALID SVM_EVTINJ_VALID | 238 | #define SVM_EXITINTINFO_VALID SVM_EVTINJ_VALID |
239 | #define SVM_EXITINTINFO_VALID_ERR SVM_EVTINJ_VALID_ERR | 239 | #define SVM_EXITINTINFO_VALID_ERR SVM_EVTINJ_VALID_ERR |
240 | 240 | ||
241 | #define SVM_EXITINFOSHIFT_TS_REASON_IRET 36 | ||
242 | #define SVM_EXITINFOSHIFT_TS_REASON_JMP 38 | ||
243 | |||
241 | #define SVM_EXIT_READ_CR0 0x000 | 244 | #define SVM_EXIT_READ_CR0 0x000 |
242 | #define SVM_EXIT_READ_CR3 0x003 | 245 | #define SVM_EXIT_READ_CR3 0x003 |
243 | #define SVM_EXIT_READ_CR4 0x004 | 246 | #define SVM_EXIT_READ_CR4 0x004 |
diff --git a/arch/x86/kvm/tss.h b/arch/x86/kvm/tss.h new file mode 100644 index 000000000000..622aa10f692f --- /dev/null +++ b/arch/x86/kvm/tss.h | |||
@@ -0,0 +1,59 @@ | |||
1 | #ifndef __TSS_SEGMENT_H | ||
2 | #define __TSS_SEGMENT_H | ||
3 | |||
4 | struct tss_segment_32 { | ||
5 | u32 prev_task_link; | ||
6 | u32 esp0; | ||
7 | u32 ss0; | ||
8 | u32 esp1; | ||
9 | u32 ss1; | ||
10 | u32 esp2; | ||
11 | u32 ss2; | ||
12 | u32 cr3; | ||
13 | u32 eip; | ||
14 | u32 eflags; | ||
15 | u32 eax; | ||
16 | u32 ecx; | ||
17 | u32 edx; | ||
18 | u32 ebx; | ||
19 | u32 esp; | ||
20 | u32 ebp; | ||
21 | u32 esi; | ||
22 | u32 edi; | ||
23 | u32 es; | ||
24 | u32 cs; | ||
25 | u32 ss; | ||
26 | u32 ds; | ||
27 | u32 fs; | ||
28 | u32 gs; | ||
29 | u32 ldt_selector; | ||
30 | u16 t; | ||
31 | u16 io_map; | ||
32 | }; | ||
33 | |||
34 | struct tss_segment_16 { | ||
35 | u16 prev_task_link; | ||
36 | u16 sp0; | ||
37 | u16 ss0; | ||
38 | u16 sp1; | ||
39 | u16 ss1; | ||
40 | u16 sp2; | ||
41 | u16 ss2; | ||
42 | u16 ip; | ||
43 | u16 flag; | ||
44 | u16 ax; | ||
45 | u16 cx; | ||
46 | u16 dx; | ||
47 | u16 bx; | ||
48 | u16 sp; | ||
49 | u16 bp; | ||
50 | u16 si; | ||
51 | u16 di; | ||
52 | u16 es; | ||
53 | u16 cs; | ||
54 | u16 ss; | ||
55 | u16 ds; | ||
56 | u16 ldt; | ||
57 | }; | ||
58 | |||
59 | #endif | ||
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 8e1462880d1f..8e5d6645b90d 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c | |||
@@ -17,7 +17,6 @@ | |||
17 | 17 | ||
18 | #include "irq.h" | 18 | #include "irq.h" |
19 | #include "vmx.h" | 19 | #include "vmx.h" |
20 | #include "segment_descriptor.h" | ||
21 | #include "mmu.h" | 20 | #include "mmu.h" |
22 | 21 | ||
23 | #include <linux/kvm_host.h> | 22 | #include <linux/kvm_host.h> |
@@ -37,6 +36,12 @@ MODULE_LICENSE("GPL"); | |||
37 | static int bypass_guest_pf = 1; | 36 | static int bypass_guest_pf = 1; |
38 | module_param(bypass_guest_pf, bool, 0); | 37 | module_param(bypass_guest_pf, bool, 0); |
39 | 38 | ||
39 | static int enable_vpid = 1; | ||
40 | module_param(enable_vpid, bool, 0); | ||
41 | |||
42 | static int flexpriority_enabled = 1; | ||
43 | module_param(flexpriority_enabled, bool, 0); | ||
44 | |||
40 | struct vmcs { | 45 | struct vmcs { |
41 | u32 revision_id; | 46 | u32 revision_id; |
42 | u32 abort; | 47 | u32 abort; |
@@ -71,6 +76,7 @@ struct vcpu_vmx { | |||
71 | unsigned rip; | 76 | unsigned rip; |
72 | } irq; | 77 | } irq; |
73 | } rmode; | 78 | } rmode; |
79 | int vpid; | ||
74 | }; | 80 | }; |
75 | 81 | ||
76 | static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) | 82 | static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) |
@@ -85,6 +91,10 @@ static DEFINE_PER_CPU(struct vmcs *, current_vmcs); | |||
85 | 91 | ||
86 | static struct page *vmx_io_bitmap_a; | 92 | static struct page *vmx_io_bitmap_a; |
87 | static struct page *vmx_io_bitmap_b; | 93 | static struct page *vmx_io_bitmap_b; |
94 | static struct page *vmx_msr_bitmap; | ||
95 | |||
96 | static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS); | ||
97 | static DEFINE_SPINLOCK(vmx_vpid_lock); | ||
88 | 98 | ||
89 | static struct vmcs_config { | 99 | static struct vmcs_config { |
90 | int size; | 100 | int size; |
@@ -176,6 +186,11 @@ static inline int is_external_interrupt(u32 intr_info) | |||
176 | == (INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK); | 186 | == (INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK); |
177 | } | 187 | } |
178 | 188 | ||
189 | static inline int cpu_has_vmx_msr_bitmap(void) | ||
190 | { | ||
191 | return (vmcs_config.cpu_based_exec_ctrl & CPU_BASED_USE_MSR_BITMAPS); | ||
192 | } | ||
193 | |||
179 | static inline int cpu_has_vmx_tpr_shadow(void) | 194 | static inline int cpu_has_vmx_tpr_shadow(void) |
180 | { | 195 | { |
181 | return (vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW); | 196 | return (vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW); |
@@ -194,8 +209,9 @@ static inline int cpu_has_secondary_exec_ctrls(void) | |||
194 | 209 | ||
195 | static inline bool cpu_has_vmx_virtualize_apic_accesses(void) | 210 | static inline bool cpu_has_vmx_virtualize_apic_accesses(void) |
196 | { | 211 | { |
197 | return (vmcs_config.cpu_based_2nd_exec_ctrl & | 212 | return flexpriority_enabled |
198 | SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES); | 213 | && (vmcs_config.cpu_based_2nd_exec_ctrl & |
214 | SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES); | ||
199 | } | 215 | } |
200 | 216 | ||
201 | static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm) | 217 | static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm) |
@@ -204,6 +220,12 @@ static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm) | |||
204 | (irqchip_in_kernel(kvm))); | 220 | (irqchip_in_kernel(kvm))); |
205 | } | 221 | } |
206 | 222 | ||
223 | static inline int cpu_has_vmx_vpid(void) | ||
224 | { | ||
225 | return (vmcs_config.cpu_based_2nd_exec_ctrl & | ||
226 | SECONDARY_EXEC_ENABLE_VPID); | ||
227 | } | ||
228 | |||
207 | static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr) | 229 | static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr) |
208 | { | 230 | { |
209 | int i; | 231 | int i; |
@@ -214,6 +236,20 @@ static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr) | |||
214 | return -1; | 236 | return -1; |
215 | } | 237 | } |
216 | 238 | ||
239 | static inline void __invvpid(int ext, u16 vpid, gva_t gva) | ||
240 | { | ||
241 | struct { | ||
242 | u64 vpid : 16; | ||
243 | u64 rsvd : 48; | ||
244 | u64 gva; | ||
245 | } operand = { vpid, 0, gva }; | ||
246 | |||
247 | asm volatile (ASM_VMX_INVVPID | ||
248 | /* CF==1 or ZF==1 --> rc = -1 */ | ||
249 | "; ja 1f ; ud2 ; 1:" | ||
250 | : : "a"(&operand), "c"(ext) : "cc", "memory"); | ||
251 | } | ||
252 | |||
217 | static struct kvm_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr) | 253 | static struct kvm_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr) |
218 | { | 254 | { |
219 | int i; | 255 | int i; |
@@ -257,6 +293,14 @@ static void vcpu_clear(struct vcpu_vmx *vmx) | |||
257 | vmx->launched = 0; | 293 | vmx->launched = 0; |
258 | } | 294 | } |
259 | 295 | ||
296 | static inline void vpid_sync_vcpu_all(struct vcpu_vmx *vmx) | ||
297 | { | ||
298 | if (vmx->vpid == 0) | ||
299 | return; | ||
300 | |||
301 | __invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vmx->vpid, 0); | ||
302 | } | ||
303 | |||
260 | static unsigned long vmcs_readl(unsigned long field) | 304 | static unsigned long vmcs_readl(unsigned long field) |
261 | { | 305 | { |
262 | unsigned long value; | 306 | unsigned long value; |
@@ -353,7 +397,7 @@ static void reload_tss(void) | |||
353 | * VT restores TR but not its size. Useless. | 397 | * VT restores TR but not its size. Useless. |
354 | */ | 398 | */ |
355 | struct descriptor_table gdt; | 399 | struct descriptor_table gdt; |
356 | struct segment_descriptor *descs; | 400 | struct desc_struct *descs; |
357 | 401 | ||
358 | get_gdt(&gdt); | 402 | get_gdt(&gdt); |
359 | descs = (void *)gdt.base; | 403 | descs = (void *)gdt.base; |
@@ -485,11 +529,12 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) | |||
485 | { | 529 | { |
486 | struct vcpu_vmx *vmx = to_vmx(vcpu); | 530 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
487 | u64 phys_addr = __pa(vmx->vmcs); | 531 | u64 phys_addr = __pa(vmx->vmcs); |
488 | u64 tsc_this, delta; | 532 | u64 tsc_this, delta, new_offset; |
489 | 533 | ||
490 | if (vcpu->cpu != cpu) { | 534 | if (vcpu->cpu != cpu) { |
491 | vcpu_clear(vmx); | 535 | vcpu_clear(vmx); |
492 | kvm_migrate_apic_timer(vcpu); | 536 | kvm_migrate_apic_timer(vcpu); |
537 | vpid_sync_vcpu_all(vmx); | ||
493 | } | 538 | } |
494 | 539 | ||
495 | if (per_cpu(current_vmcs, cpu) != vmx->vmcs) { | 540 | if (per_cpu(current_vmcs, cpu) != vmx->vmcs) { |
@@ -524,8 +569,11 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) | |||
524 | * Make sure the time stamp counter is monotonous. | 569 | * Make sure the time stamp counter is monotonous. |
525 | */ | 570 | */ |
526 | rdtscll(tsc_this); | 571 | rdtscll(tsc_this); |
527 | delta = vcpu->arch.host_tsc - tsc_this; | 572 | if (tsc_this < vcpu->arch.host_tsc) { |
528 | vmcs_write64(TSC_OFFSET, vmcs_read64(TSC_OFFSET) + delta); | 573 | delta = vcpu->arch.host_tsc - tsc_this; |
574 | new_offset = vmcs_read64(TSC_OFFSET) + delta; | ||
575 | vmcs_write64(TSC_OFFSET, new_offset); | ||
576 | } | ||
529 | } | 577 | } |
530 | } | 578 | } |
531 | 579 | ||
@@ -596,7 +644,7 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, | |||
596 | { | 644 | { |
597 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, | 645 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, |
598 | nr | INTR_TYPE_EXCEPTION | 646 | nr | INTR_TYPE_EXCEPTION |
599 | | (has_error_code ? INTR_INFO_DELIEVER_CODE_MASK : 0) | 647 | | (has_error_code ? INTR_INFO_DELIVER_CODE_MASK : 0) |
600 | | INTR_INFO_VALID_MASK); | 648 | | INTR_INFO_VALID_MASK); |
601 | if (has_error_code) | 649 | if (has_error_code) |
602 | vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code); | 650 | vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code); |
@@ -959,6 +1007,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) | |||
959 | CPU_BASED_MOV_DR_EXITING | | 1007 | CPU_BASED_MOV_DR_EXITING | |
960 | CPU_BASED_USE_TSC_OFFSETING; | 1008 | CPU_BASED_USE_TSC_OFFSETING; |
961 | opt = CPU_BASED_TPR_SHADOW | | 1009 | opt = CPU_BASED_TPR_SHADOW | |
1010 | CPU_BASED_USE_MSR_BITMAPS | | ||
962 | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; | 1011 | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; |
963 | if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS, | 1012 | if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS, |
964 | &_cpu_based_exec_control) < 0) | 1013 | &_cpu_based_exec_control) < 0) |
@@ -971,7 +1020,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) | |||
971 | if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) { | 1020 | if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) { |
972 | min = 0; | 1021 | min = 0; |
973 | opt = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | | 1022 | opt = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | |
974 | SECONDARY_EXEC_WBINVD_EXITING; | 1023 | SECONDARY_EXEC_WBINVD_EXITING | |
1024 | SECONDARY_EXEC_ENABLE_VPID; | ||
975 | if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS2, | 1025 | if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS2, |
976 | &_cpu_based_2nd_exec_control) < 0) | 1026 | &_cpu_based_2nd_exec_control) < 0) |
977 | return -EIO; | 1027 | return -EIO; |
@@ -1080,6 +1130,10 @@ static __init int hardware_setup(void) | |||
1080 | { | 1130 | { |
1081 | if (setup_vmcs_config(&vmcs_config) < 0) | 1131 | if (setup_vmcs_config(&vmcs_config) < 0) |
1082 | return -EIO; | 1132 | return -EIO; |
1133 | |||
1134 | if (boot_cpu_has(X86_FEATURE_NX)) | ||
1135 | kvm_enable_efer_bits(EFER_NX); | ||
1136 | |||
1083 | return alloc_kvm_area(); | 1137 | return alloc_kvm_area(); |
1084 | } | 1138 | } |
1085 | 1139 | ||
@@ -1214,7 +1268,7 @@ static void enter_lmode(struct kvm_vcpu *vcpu) | |||
1214 | guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES); | 1268 | guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES); |
1215 | if ((guest_tr_ar & AR_TYPE_MASK) != AR_TYPE_BUSY_64_TSS) { | 1269 | if ((guest_tr_ar & AR_TYPE_MASK) != AR_TYPE_BUSY_64_TSS) { |
1216 | printk(KERN_DEBUG "%s: tss fixup for long mode. \n", | 1270 | printk(KERN_DEBUG "%s: tss fixup for long mode. \n", |
1217 | __FUNCTION__); | 1271 | __func__); |
1218 | vmcs_write32(GUEST_TR_AR_BYTES, | 1272 | vmcs_write32(GUEST_TR_AR_BYTES, |
1219 | (guest_tr_ar & ~AR_TYPE_MASK) | 1273 | (guest_tr_ar & ~AR_TYPE_MASK) |
1220 | | AR_TYPE_BUSY_64_TSS); | 1274 | | AR_TYPE_BUSY_64_TSS); |
@@ -1239,6 +1293,11 @@ static void exit_lmode(struct kvm_vcpu *vcpu) | |||
1239 | 1293 | ||
1240 | #endif | 1294 | #endif |
1241 | 1295 | ||
/*
 * TLB flush for a VMX vcpu: delegates to vpid_sync_vcpu_all(), which
 * returns without doing anything when the vcpu's vpid is 0.
 */
static void vmx_flush_tlb(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	vpid_sync_vcpu_all(vmx);
}
1300 | |||
1242 | static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) | 1301 | static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) |
1243 | { | 1302 | { |
1244 | vcpu->arch.cr4 &= KVM_GUEST_CR4_MASK; | 1303 | vcpu->arch.cr4 &= KVM_GUEST_CR4_MASK; |
@@ -1275,6 +1334,7 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) | |||
1275 | 1334 | ||
1276 | static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) | 1335 | static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) |
1277 | { | 1336 | { |
1337 | vmx_flush_tlb(vcpu); | ||
1278 | vmcs_writel(GUEST_CR3, cr3); | 1338 | vmcs_writel(GUEST_CR3, cr3); |
1279 | if (vcpu->arch.cr0 & X86_CR0_PE) | 1339 | if (vcpu->arch.cr0 & X86_CR0_PE) |
1280 | vmx_fpu_deactivate(vcpu); | 1340 | vmx_fpu_deactivate(vcpu); |
@@ -1288,14 +1348,14 @@ static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) | |||
1288 | vcpu->arch.cr4 = cr4; | 1348 | vcpu->arch.cr4 = cr4; |
1289 | } | 1349 | } |
1290 | 1350 | ||
1291 | #ifdef CONFIG_X86_64 | ||
1292 | |||
1293 | static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) | 1351 | static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) |
1294 | { | 1352 | { |
1295 | struct vcpu_vmx *vmx = to_vmx(vcpu); | 1353 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
1296 | struct kvm_msr_entry *msr = find_msr_entry(vmx, MSR_EFER); | 1354 | struct kvm_msr_entry *msr = find_msr_entry(vmx, MSR_EFER); |
1297 | 1355 | ||
1298 | vcpu->arch.shadow_efer = efer; | 1356 | vcpu->arch.shadow_efer = efer; |
1357 | if (!msr) | ||
1358 | return; | ||
1299 | if (efer & EFER_LMA) { | 1359 | if (efer & EFER_LMA) { |
1300 | vmcs_write32(VM_ENTRY_CONTROLS, | 1360 | vmcs_write32(VM_ENTRY_CONTROLS, |
1301 | vmcs_read32(VM_ENTRY_CONTROLS) | | 1361 | vmcs_read32(VM_ENTRY_CONTROLS) | |
@@ -1312,8 +1372,6 @@ static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) | |||
1312 | setup_msrs(vmx); | 1372 | setup_msrs(vmx); |
1313 | } | 1373 | } |
1314 | 1374 | ||
1315 | #endif | ||
1316 | |||
1317 | static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg) | 1375 | static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg) |
1318 | { | 1376 | { |
1319 | struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; | 1377 | struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; |
@@ -1344,6 +1402,20 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu, | |||
1344 | var->unusable = (ar >> 16) & 1; | 1402 | var->unusable = (ar >> 16) & 1; |
1345 | } | 1403 | } |
1346 | 1404 | ||
1405 | static int vmx_get_cpl(struct kvm_vcpu *vcpu) | ||
1406 | { | ||
1407 | struct kvm_segment kvm_seg; | ||
1408 | |||
1409 | if (!(vcpu->arch.cr0 & X86_CR0_PE)) /* if real mode */ | ||
1410 | return 0; | ||
1411 | |||
1412 | if (vmx_get_rflags(vcpu) & X86_EFLAGS_VM) /* if virtual 8086 */ | ||
1413 | return 3; | ||
1414 | |||
1415 | vmx_get_segment(vcpu, &kvm_seg, VCPU_SREG_CS); | ||
1416 | return kvm_seg.selector & 3; | ||
1417 | } | ||
1418 | |||
1347 | static u32 vmx_segment_access_rights(struct kvm_segment *var) | 1419 | static u32 vmx_segment_access_rights(struct kvm_segment *var) |
1348 | { | 1420 | { |
1349 | u32 ar; | 1421 | u32 ar; |
@@ -1433,7 +1505,6 @@ static int init_rmode_tss(struct kvm *kvm) | |||
1433 | int ret = 0; | 1505 | int ret = 0; |
1434 | int r; | 1506 | int r; |
1435 | 1507 | ||
1436 | down_read(&kvm->slots_lock); | ||
1437 | r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE); | 1508 | r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE); |
1438 | if (r < 0) | 1509 | if (r < 0) |
1439 | goto out; | 1510 | goto out; |
@@ -1456,7 +1527,6 @@ static int init_rmode_tss(struct kvm *kvm) | |||
1456 | 1527 | ||
1457 | ret = 1; | 1528 | ret = 1; |
1458 | out: | 1529 | out: |
1459 | up_read(&kvm->slots_lock); | ||
1460 | return ret; | 1530 | return ret; |
1461 | } | 1531 | } |
1462 | 1532 | ||
@@ -1494,6 +1564,46 @@ out: | |||
1494 | return r; | 1564 | return r; |
1495 | } | 1565 | } |
1496 | 1566 | ||
1567 | static void allocate_vpid(struct vcpu_vmx *vmx) | ||
1568 | { | ||
1569 | int vpid; | ||
1570 | |||
1571 | vmx->vpid = 0; | ||
1572 | if (!enable_vpid || !cpu_has_vmx_vpid()) | ||
1573 | return; | ||
1574 | spin_lock(&vmx_vpid_lock); | ||
1575 | vpid = find_first_zero_bit(vmx_vpid_bitmap, VMX_NR_VPIDS); | ||
1576 | if (vpid < VMX_NR_VPIDS) { | ||
1577 | vmx->vpid = vpid; | ||
1578 | __set_bit(vpid, vmx_vpid_bitmap); | ||
1579 | } | ||
1580 | spin_unlock(&vmx_vpid_lock); | ||
1581 | } | ||
1582 | |||
1583 | void vmx_disable_intercept_for_msr(struct page *msr_bitmap, u32 msr) | ||
1584 | { | ||
1585 | void *va; | ||
1586 | |||
1587 | if (!cpu_has_vmx_msr_bitmap()) | ||
1588 | return; | ||
1589 | |||
1590 | /* | ||
1591 | * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals | ||
1592 | * have the write-low and read-high bitmap offsets the wrong way round. | ||
1593 | * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff. | ||
1594 | */ | ||
1595 | va = kmap(msr_bitmap); | ||
1596 | if (msr <= 0x1fff) { | ||
1597 | __clear_bit(msr, va + 0x000); /* read-low */ | ||
1598 | __clear_bit(msr, va + 0x800); /* write-low */ | ||
1599 | } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) { | ||
1600 | msr &= 0x1fff; | ||
1601 | __clear_bit(msr, va + 0x400); /* read-high */ | ||
1602 | __clear_bit(msr, va + 0xc00); /* write-high */ | ||
1603 | } | ||
1604 | kunmap(msr_bitmap); | ||
1605 | } | ||
1606 | |||
1497 | /* | 1607 | /* |
1498 | * Sets up the vmcs for emulated real mode. | 1608 | * Sets up the vmcs for emulated real mode. |
1499 | */ | 1609 | */ |
@@ -1511,6 +1621,9 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) | |||
1511 | vmcs_write64(IO_BITMAP_A, page_to_phys(vmx_io_bitmap_a)); | 1621 | vmcs_write64(IO_BITMAP_A, page_to_phys(vmx_io_bitmap_a)); |
1512 | vmcs_write64(IO_BITMAP_B, page_to_phys(vmx_io_bitmap_b)); | 1622 | vmcs_write64(IO_BITMAP_B, page_to_phys(vmx_io_bitmap_b)); |
1513 | 1623 | ||
1624 | if (cpu_has_vmx_msr_bitmap()) | ||
1625 | vmcs_write64(MSR_BITMAP, page_to_phys(vmx_msr_bitmap)); | ||
1626 | |||
1514 | vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */ | 1627 | vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */ |
1515 | 1628 | ||
1516 | /* Control */ | 1629 | /* Control */ |
@@ -1532,6 +1645,8 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) | |||
1532 | if (!vm_need_virtualize_apic_accesses(vmx->vcpu.kvm)) | 1645 | if (!vm_need_virtualize_apic_accesses(vmx->vcpu.kvm)) |
1533 | exec_control &= | 1646 | exec_control &= |
1534 | ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; | 1647 | ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; |
1648 | if (vmx->vpid == 0) | ||
1649 | exec_control &= ~SECONDARY_EXEC_ENABLE_VPID; | ||
1535 | vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); | 1650 | vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); |
1536 | } | 1651 | } |
1537 | 1652 | ||
@@ -1613,6 +1728,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) | |||
1613 | u64 msr; | 1728 | u64 msr; |
1614 | int ret; | 1729 | int ret; |
1615 | 1730 | ||
1731 | down_read(&vcpu->kvm->slots_lock); | ||
1616 | if (!init_rmode_tss(vmx->vcpu.kvm)) { | 1732 | if (!init_rmode_tss(vmx->vcpu.kvm)) { |
1617 | ret = -ENOMEM; | 1733 | ret = -ENOMEM; |
1618 | goto out; | 1734 | goto out; |
@@ -1621,7 +1737,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) | |||
1621 | vmx->vcpu.arch.rmode.active = 0; | 1737 | vmx->vcpu.arch.rmode.active = 0; |
1622 | 1738 | ||
1623 | vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val(); | 1739 | vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val(); |
1624 | set_cr8(&vmx->vcpu, 0); | 1740 | kvm_set_cr8(&vmx->vcpu, 0); |
1625 | msr = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; | 1741 | msr = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; |
1626 | if (vmx->vcpu.vcpu_id == 0) | 1742 | if (vmx->vcpu.vcpu_id == 0) |
1627 | msr |= MSR_IA32_APICBASE_BSP; | 1743 | msr |= MSR_IA32_APICBASE_BSP; |
@@ -1704,18 +1820,22 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) | |||
1704 | vmcs_write64(APIC_ACCESS_ADDR, | 1820 | vmcs_write64(APIC_ACCESS_ADDR, |
1705 | page_to_phys(vmx->vcpu.kvm->arch.apic_access_page)); | 1821 | page_to_phys(vmx->vcpu.kvm->arch.apic_access_page)); |
1706 | 1822 | ||
1823 | if (vmx->vpid != 0) | ||
1824 | vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); | ||
1825 | |||
1707 | vmx->vcpu.arch.cr0 = 0x60000010; | 1826 | vmx->vcpu.arch.cr0 = 0x60000010; |
1708 | vmx_set_cr0(&vmx->vcpu, vmx->vcpu.arch.cr0); /* enter rmode */ | 1827 | vmx_set_cr0(&vmx->vcpu, vmx->vcpu.arch.cr0); /* enter rmode */ |
1709 | vmx_set_cr4(&vmx->vcpu, 0); | 1828 | vmx_set_cr4(&vmx->vcpu, 0); |
1710 | #ifdef CONFIG_X86_64 | ||
1711 | vmx_set_efer(&vmx->vcpu, 0); | 1829 | vmx_set_efer(&vmx->vcpu, 0); |
1712 | #endif | ||
1713 | vmx_fpu_activate(&vmx->vcpu); | 1830 | vmx_fpu_activate(&vmx->vcpu); |
1714 | update_exception_bitmap(&vmx->vcpu); | 1831 | update_exception_bitmap(&vmx->vcpu); |
1715 | 1832 | ||
1716 | return 0; | 1833 | vpid_sync_vcpu_all(vmx); |
1834 | |||
1835 | ret = 0; | ||
1717 | 1836 | ||
1718 | out: | 1837 | out: |
1838 | up_read(&vcpu->kvm->slots_lock); | ||
1719 | return ret; | 1839 | return ret; |
1720 | } | 1840 | } |
1721 | 1841 | ||
@@ -1723,6 +1843,8 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq) | |||
1723 | { | 1843 | { |
1724 | struct vcpu_vmx *vmx = to_vmx(vcpu); | 1844 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
1725 | 1845 | ||
1846 | KVMTRACE_1D(INJ_VIRQ, vcpu, (u32)irq, handler); | ||
1847 | |||
1726 | if (vcpu->arch.rmode.active) { | 1848 | if (vcpu->arch.rmode.active) { |
1727 | vmx->rmode.irq.pending = true; | 1849 | vmx->rmode.irq.pending = true; |
1728 | vmx->rmode.irq.vector = irq; | 1850 | vmx->rmode.irq.vector = irq; |
@@ -1844,7 +1966,7 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
1844 | if ((vect_info & VECTORING_INFO_VALID_MASK) && | 1966 | if ((vect_info & VECTORING_INFO_VALID_MASK) && |
1845 | !is_page_fault(intr_info)) | 1967 | !is_page_fault(intr_info)) |
1846 | printk(KERN_ERR "%s: unexpected, vectoring info 0x%x " | 1968 | printk(KERN_ERR "%s: unexpected, vectoring info 0x%x " |
1847 | "intr info 0x%x\n", __FUNCTION__, vect_info, intr_info); | 1969 | "intr info 0x%x\n", __func__, vect_info, intr_info); |
1848 | 1970 | ||
1849 | if (!irqchip_in_kernel(vcpu->kvm) && is_external_interrupt(vect_info)) { | 1971 | if (!irqchip_in_kernel(vcpu->kvm) && is_external_interrupt(vect_info)) { |
1850 | int irq = vect_info & VECTORING_INFO_VECTOR_MASK; | 1972 | int irq = vect_info & VECTORING_INFO_VECTOR_MASK; |
@@ -1869,10 +1991,12 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
1869 | 1991 | ||
1870 | error_code = 0; | 1992 | error_code = 0; |
1871 | rip = vmcs_readl(GUEST_RIP); | 1993 | rip = vmcs_readl(GUEST_RIP); |
1872 | if (intr_info & INTR_INFO_DELIEVER_CODE_MASK) | 1994 | if (intr_info & INTR_INFO_DELIVER_CODE_MASK) |
1873 | error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); | 1995 | error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); |
1874 | if (is_page_fault(intr_info)) { | 1996 | if (is_page_fault(intr_info)) { |
1875 | cr2 = vmcs_readl(EXIT_QUALIFICATION); | 1997 | cr2 = vmcs_readl(EXIT_QUALIFICATION); |
1998 | KVMTRACE_3D(PAGE_FAULT, vcpu, error_code, (u32)cr2, | ||
1999 | (u32)((u64)cr2 >> 32), handler); | ||
1876 | return kvm_mmu_page_fault(vcpu, cr2, error_code); | 2000 | return kvm_mmu_page_fault(vcpu, cr2, error_code); |
1877 | } | 2001 | } |
1878 | 2002 | ||
@@ -1901,6 +2025,7 @@ static int handle_external_interrupt(struct kvm_vcpu *vcpu, | |||
1901 | struct kvm_run *kvm_run) | 2025 | struct kvm_run *kvm_run) |
1902 | { | 2026 | { |
1903 | ++vcpu->stat.irq_exits; | 2027 | ++vcpu->stat.irq_exits; |
2028 | KVMTRACE_1D(INTR, vcpu, vmcs_read32(VM_EXIT_INTR_INFO), handler); | ||
1904 | return 1; | 2029 | return 1; |
1905 | } | 2030 | } |
1906 | 2031 | ||
@@ -1958,25 +2083,27 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
1958 | reg = (exit_qualification >> 8) & 15; | 2083 | reg = (exit_qualification >> 8) & 15; |
1959 | switch ((exit_qualification >> 4) & 3) { | 2084 | switch ((exit_qualification >> 4) & 3) { |
1960 | case 0: /* mov to cr */ | 2085 | case 0: /* mov to cr */ |
2086 | KVMTRACE_3D(CR_WRITE, vcpu, (u32)cr, (u32)vcpu->arch.regs[reg], | ||
2087 | (u32)((u64)vcpu->arch.regs[reg] >> 32), handler); | ||
1961 | switch (cr) { | 2088 | switch (cr) { |
1962 | case 0: | 2089 | case 0: |
1963 | vcpu_load_rsp_rip(vcpu); | 2090 | vcpu_load_rsp_rip(vcpu); |
1964 | set_cr0(vcpu, vcpu->arch.regs[reg]); | 2091 | kvm_set_cr0(vcpu, vcpu->arch.regs[reg]); |
1965 | skip_emulated_instruction(vcpu); | 2092 | skip_emulated_instruction(vcpu); |
1966 | return 1; | 2093 | return 1; |
1967 | case 3: | 2094 | case 3: |
1968 | vcpu_load_rsp_rip(vcpu); | 2095 | vcpu_load_rsp_rip(vcpu); |
1969 | set_cr3(vcpu, vcpu->arch.regs[reg]); | 2096 | kvm_set_cr3(vcpu, vcpu->arch.regs[reg]); |
1970 | skip_emulated_instruction(vcpu); | 2097 | skip_emulated_instruction(vcpu); |
1971 | return 1; | 2098 | return 1; |
1972 | case 4: | 2099 | case 4: |
1973 | vcpu_load_rsp_rip(vcpu); | 2100 | vcpu_load_rsp_rip(vcpu); |
1974 | set_cr4(vcpu, vcpu->arch.regs[reg]); | 2101 | kvm_set_cr4(vcpu, vcpu->arch.regs[reg]); |
1975 | skip_emulated_instruction(vcpu); | 2102 | skip_emulated_instruction(vcpu); |
1976 | return 1; | 2103 | return 1; |
1977 | case 8: | 2104 | case 8: |
1978 | vcpu_load_rsp_rip(vcpu); | 2105 | vcpu_load_rsp_rip(vcpu); |
1979 | set_cr8(vcpu, vcpu->arch.regs[reg]); | 2106 | kvm_set_cr8(vcpu, vcpu->arch.regs[reg]); |
1980 | skip_emulated_instruction(vcpu); | 2107 | skip_emulated_instruction(vcpu); |
1981 | if (irqchip_in_kernel(vcpu->kvm)) | 2108 | if (irqchip_in_kernel(vcpu->kvm)) |
1982 | return 1; | 2109 | return 1; |
@@ -1990,6 +2117,7 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
1990 | vcpu->arch.cr0 &= ~X86_CR0_TS; | 2117 | vcpu->arch.cr0 &= ~X86_CR0_TS; |
1991 | vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0); | 2118 | vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0); |
1992 | vmx_fpu_activate(vcpu); | 2119 | vmx_fpu_activate(vcpu); |
2120 | KVMTRACE_0D(CLTS, vcpu, handler); | ||
1993 | skip_emulated_instruction(vcpu); | 2121 | skip_emulated_instruction(vcpu); |
1994 | return 1; | 2122 | return 1; |
1995 | case 1: /*mov from cr*/ | 2123 | case 1: /*mov from cr*/ |
@@ -1998,18 +2126,24 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
1998 | vcpu_load_rsp_rip(vcpu); | 2126 | vcpu_load_rsp_rip(vcpu); |
1999 | vcpu->arch.regs[reg] = vcpu->arch.cr3; | 2127 | vcpu->arch.regs[reg] = vcpu->arch.cr3; |
2000 | vcpu_put_rsp_rip(vcpu); | 2128 | vcpu_put_rsp_rip(vcpu); |
2129 | KVMTRACE_3D(CR_READ, vcpu, (u32)cr, | ||
2130 | (u32)vcpu->arch.regs[reg], | ||
2131 | (u32)((u64)vcpu->arch.regs[reg] >> 32), | ||
2132 | handler); | ||
2001 | skip_emulated_instruction(vcpu); | 2133 | skip_emulated_instruction(vcpu); |
2002 | return 1; | 2134 | return 1; |
2003 | case 8: | 2135 | case 8: |
2004 | vcpu_load_rsp_rip(vcpu); | 2136 | vcpu_load_rsp_rip(vcpu); |
2005 | vcpu->arch.regs[reg] = get_cr8(vcpu); | 2137 | vcpu->arch.regs[reg] = kvm_get_cr8(vcpu); |
2006 | vcpu_put_rsp_rip(vcpu); | 2138 | vcpu_put_rsp_rip(vcpu); |
2139 | KVMTRACE_2D(CR_READ, vcpu, (u32)cr, | ||
2140 | (u32)vcpu->arch.regs[reg], handler); | ||
2007 | skip_emulated_instruction(vcpu); | 2141 | skip_emulated_instruction(vcpu); |
2008 | return 1; | 2142 | return 1; |
2009 | } | 2143 | } |
2010 | break; | 2144 | break; |
2011 | case 3: /* lmsw */ | 2145 | case 3: /* lmsw */ |
2012 | lmsw(vcpu, (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f); | 2146 | kvm_lmsw(vcpu, (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f); |
2013 | 2147 | ||
2014 | skip_emulated_instruction(vcpu); | 2148 | skip_emulated_instruction(vcpu); |
2015 | return 1; | 2149 | return 1; |
@@ -2049,6 +2183,7 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
2049 | val = 0; | 2183 | val = 0; |
2050 | } | 2184 | } |
2051 | vcpu->arch.regs[reg] = val; | 2185 | vcpu->arch.regs[reg] = val; |
2186 | KVMTRACE_2D(DR_READ, vcpu, (u32)dr, (u32)val, handler); | ||
2052 | } else { | 2187 | } else { |
2053 | /* mov to dr */ | 2188 | /* mov to dr */ |
2054 | } | 2189 | } |
@@ -2073,6 +2208,9 @@ static int handle_rdmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
2073 | return 1; | 2208 | return 1; |
2074 | } | 2209 | } |
2075 | 2210 | ||
2211 | KVMTRACE_3D(MSR_READ, vcpu, ecx, (u32)data, (u32)(data >> 32), | ||
2212 | handler); | ||
2213 | |||
2076 | /* FIXME: handling of bits 32:63 of rax, rdx */ | 2214 | /* FIXME: handling of bits 32:63 of rax, rdx */ |
2077 | vcpu->arch.regs[VCPU_REGS_RAX] = data & -1u; | 2215 | vcpu->arch.regs[VCPU_REGS_RAX] = data & -1u; |
2078 | vcpu->arch.regs[VCPU_REGS_RDX] = (data >> 32) & -1u; | 2216 | vcpu->arch.regs[VCPU_REGS_RDX] = (data >> 32) & -1u; |
@@ -2086,6 +2224,9 @@ static int handle_wrmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
2086 | u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u) | 2224 | u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u) |
2087 | | ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32); | 2225 | | ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32); |
2088 | 2226 | ||
2227 | KVMTRACE_3D(MSR_WRITE, vcpu, ecx, (u32)data, (u32)(data >> 32), | ||
2228 | handler); | ||
2229 | |||
2089 | if (vmx_set_msr(vcpu, ecx, data) != 0) { | 2230 | if (vmx_set_msr(vcpu, ecx, data) != 0) { |
2090 | kvm_inject_gp(vcpu, 0); | 2231 | kvm_inject_gp(vcpu, 0); |
2091 | return 1; | 2232 | return 1; |
@@ -2110,6 +2251,9 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu, | |||
2110 | cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); | 2251 | cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); |
2111 | cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING; | 2252 | cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING; |
2112 | vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); | 2253 | vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); |
2254 | |||
2255 | KVMTRACE_0D(PEND_INTR, vcpu, handler); | ||
2256 | |||
2113 | /* | 2257 | /* |
2114 | * If the user space waits to inject interrupts, exit as soon as | 2258 | * If the user space waits to inject interrupts, exit as soon as |
2115 | * possible | 2259 | * possible |
@@ -2152,6 +2296,8 @@ static int handle_apic_access(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
2152 | exit_qualification = vmcs_read64(EXIT_QUALIFICATION); | 2296 | exit_qualification = vmcs_read64(EXIT_QUALIFICATION); |
2153 | offset = exit_qualification & 0xffful; | 2297 | offset = exit_qualification & 0xffful; |
2154 | 2298 | ||
2299 | KVMTRACE_1D(APIC_ACCESS, vcpu, (u32)offset, handler); | ||
2300 | |||
2155 | er = emulate_instruction(vcpu, kvm_run, 0, 0, 0); | 2301 | er = emulate_instruction(vcpu, kvm_run, 0, 0, 0); |
2156 | 2302 | ||
2157 | if (er != EMULATE_DONE) { | 2303 | if (er != EMULATE_DONE) { |
@@ -2163,6 +2309,20 @@ static int handle_apic_access(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
2163 | return 1; | 2309 | return 1; |
2164 | } | 2310 | } |
2165 | 2311 | ||
/*
 * handle_task_switch - exit handler installed for EXIT_REASON_TASK_SWITCH.
 *
 * Reads the exit qualification from the VMCS, takes bits 31:30 of its low
 * 32 bits as the switch reason and its low 16 bits as the TSS selector,
 * and returns whatever kvm_task_switch() returns. kvm_run is not used.
 */
2312 | static int handle_task_switch(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | ||
2313 | { | ||
2314 | unsigned long exit_qualification; | ||
2315 | u16 tss_selector; | ||
2316 | int reason; | ||
2317 | |||
2318 | exit_qualification = vmcs_readl(EXIT_QUALIFICATION); | ||
2319 | |||
2320 | reason = (u32)exit_qualification >> 30; | ||
/* Assignment to a u16 keeps only the low 16 bits of the qualification. */
2321 | tss_selector = exit_qualification; | ||
2322 | |||
2323 | return kvm_task_switch(vcpu, tss_selector, reason); | ||
2324 | } | ||
2325 | |||
2166 | /* | 2326 | /* |
2167 | * The exit handlers return 1 if the exit was handled fully and guest execution | 2327 | * The exit handlers return 1 if the exit was handled fully and guest execution |
2168 | * may resume. Otherwise they set the kvm_run parameter to indicate what needs | 2328 | * may resume. Otherwise they set the kvm_run parameter to indicate what needs |
@@ -2185,6 +2345,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu, | |||
2185 | [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold, | 2345 | [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold, |
2186 | [EXIT_REASON_APIC_ACCESS] = handle_apic_access, | 2346 | [EXIT_REASON_APIC_ACCESS] = handle_apic_access, |
2187 | [EXIT_REASON_WBINVD] = handle_wbinvd, | 2347 | [EXIT_REASON_WBINVD] = handle_wbinvd, |
2348 | [EXIT_REASON_TASK_SWITCH] = handle_task_switch, | ||
2188 | }; | 2349 | }; |
2189 | 2350 | ||
2190 | static const int kvm_vmx_max_exit_handlers = | 2351 | static const int kvm_vmx_max_exit_handlers = |
@@ -2200,6 +2361,9 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) | |||
2200 | struct vcpu_vmx *vmx = to_vmx(vcpu); | 2361 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
2201 | u32 vectoring_info = vmx->idt_vectoring_info; | 2362 | u32 vectoring_info = vmx->idt_vectoring_info; |
2202 | 2363 | ||
2364 | KVMTRACE_3D(VMEXIT, vcpu, exit_reason, (u32)vmcs_readl(GUEST_RIP), | ||
2365 | (u32)((u64)vmcs_readl(GUEST_RIP) >> 32), entryexit); | ||
2366 | |||
2203 | if (unlikely(vmx->fail)) { | 2367 | if (unlikely(vmx->fail)) { |
2204 | kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; | 2368 | kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; |
2205 | kvm_run->fail_entry.hardware_entry_failure_reason | 2369 | kvm_run->fail_entry.hardware_entry_failure_reason |
@@ -2210,7 +2374,7 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) | |||
2210 | if ((vectoring_info & VECTORING_INFO_VALID_MASK) && | 2374 | if ((vectoring_info & VECTORING_INFO_VALID_MASK) && |
2211 | exit_reason != EXIT_REASON_EXCEPTION_NMI) | 2375 | exit_reason != EXIT_REASON_EXCEPTION_NMI) |
2212 | printk(KERN_WARNING "%s: unexpected, valid vectoring info and " | 2376 | printk(KERN_WARNING "%s: unexpected, valid vectoring info and " |
2213 | "exit reason is 0x%x\n", __FUNCTION__, exit_reason); | 2377 | "exit reason is 0x%x\n", __func__, exit_reason); |
2214 | if (exit_reason < kvm_vmx_max_exit_handlers | 2378 | if (exit_reason < kvm_vmx_max_exit_handlers |
2215 | && kvm_vmx_exit_handlers[exit_reason]) | 2379 | && kvm_vmx_exit_handlers[exit_reason]) |
2216 | return kvm_vmx_exit_handlers[exit_reason](vcpu, kvm_run); | 2380 | return kvm_vmx_exit_handlers[exit_reason](vcpu, kvm_run); |
@@ -2221,10 +2385,6 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) | |||
2221 | return 0; | 2385 | return 0; |
2222 | } | 2386 | } |
2223 | 2387 | ||
2224 | static void vmx_flush_tlb(struct kvm_vcpu *vcpu) | ||
2225 | { | ||
2226 | } | ||
2227 | |||
2228 | static void update_tpr_threshold(struct kvm_vcpu *vcpu) | 2388 | static void update_tpr_threshold(struct kvm_vcpu *vcpu) |
2229 | { | 2389 | { |
2230 | int max_irr, tpr; | 2390 | int max_irr, tpr; |
@@ -2285,11 +2445,13 @@ static void vmx_intr_assist(struct kvm_vcpu *vcpu) | |||
2285 | return; | 2445 | return; |
2286 | } | 2446 | } |
2287 | 2447 | ||
2448 | KVMTRACE_1D(REDELIVER_EVT, vcpu, idtv_info_field, handler); | ||
2449 | |||
2288 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, idtv_info_field); | 2450 | vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, idtv_info_field); |
2289 | vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, | 2451 | vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, |
2290 | vmcs_read32(VM_EXIT_INSTRUCTION_LEN)); | 2452 | vmcs_read32(VM_EXIT_INSTRUCTION_LEN)); |
2291 | 2453 | ||
2292 | if (unlikely(idtv_info_field & INTR_INFO_DELIEVER_CODE_MASK)) | 2454 | if (unlikely(idtv_info_field & INTR_INFO_DELIVER_CODE_MASK)) |
2293 | vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, | 2455 | vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, |
2294 | vmcs_read32(IDT_VECTORING_ERROR_CODE)); | 2456 | vmcs_read32(IDT_VECTORING_ERROR_CODE)); |
2295 | if (unlikely(has_ext_irq)) | 2457 | if (unlikely(has_ext_irq)) |
@@ -2470,8 +2632,10 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
2470 | intr_info = vmcs_read32(VM_EXIT_INTR_INFO); | 2632 | intr_info = vmcs_read32(VM_EXIT_INTR_INFO); |
2471 | 2633 | ||
2472 | /* We need to handle NMIs before interrupts are enabled */ | 2634 | /* We need to handle NMIs before interrupts are enabled */ |
2473 | if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200) /* nmi */ | 2635 | if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200) { /* nmi */ |
2636 | KVMTRACE_0D(NMI, vcpu, handler); | ||
2474 | asm("int $2"); | 2637 | asm("int $2"); |
2638 | } | ||
2475 | } | 2639 | } |
2476 | 2640 | ||
2477 | static void vmx_free_vmcs(struct kvm_vcpu *vcpu) | 2641 | static void vmx_free_vmcs(struct kvm_vcpu *vcpu) |
@@ -2489,6 +2653,10 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu) | |||
2489 | { | 2653 | { |
2490 | struct vcpu_vmx *vmx = to_vmx(vcpu); | 2654 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
2491 | 2655 | ||
2656 | spin_lock(&vmx_vpid_lock); | ||
2657 | if (vmx->vpid != 0) | ||
2658 | __clear_bit(vmx->vpid, vmx_vpid_bitmap); | ||
2659 | spin_unlock(&vmx_vpid_lock); | ||
2492 | vmx_free_vmcs(vcpu); | 2660 | vmx_free_vmcs(vcpu); |
2493 | kfree(vmx->host_msrs); | 2661 | kfree(vmx->host_msrs); |
2494 | kfree(vmx->guest_msrs); | 2662 | kfree(vmx->guest_msrs); |
@@ -2505,6 +2673,8 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) | |||
2505 | if (!vmx) | 2673 | if (!vmx) |
2506 | return ERR_PTR(-ENOMEM); | 2674 | return ERR_PTR(-ENOMEM); |
2507 | 2675 | ||
2676 | allocate_vpid(vmx); | ||
2677 | |||
2508 | err = kvm_vcpu_init(&vmx->vcpu, kvm, id); | 2678 | err = kvm_vcpu_init(&vmx->vcpu, kvm, id); |
2509 | if (err) | 2679 | if (err) |
2510 | goto free_vcpu; | 2680 | goto free_vcpu; |
@@ -2591,14 +2761,13 @@ static struct kvm_x86_ops vmx_x86_ops = { | |||
2591 | .get_segment_base = vmx_get_segment_base, | 2761 | .get_segment_base = vmx_get_segment_base, |
2592 | .get_segment = vmx_get_segment, | 2762 | .get_segment = vmx_get_segment, |
2593 | .set_segment = vmx_set_segment, | 2763 | .set_segment = vmx_set_segment, |
2764 | .get_cpl = vmx_get_cpl, | ||
2594 | .get_cs_db_l_bits = vmx_get_cs_db_l_bits, | 2765 | .get_cs_db_l_bits = vmx_get_cs_db_l_bits, |
2595 | .decache_cr4_guest_bits = vmx_decache_cr4_guest_bits, | 2766 | .decache_cr4_guest_bits = vmx_decache_cr4_guest_bits, |
2596 | .set_cr0 = vmx_set_cr0, | 2767 | .set_cr0 = vmx_set_cr0, |
2597 | .set_cr3 = vmx_set_cr3, | 2768 | .set_cr3 = vmx_set_cr3, |
2598 | .set_cr4 = vmx_set_cr4, | 2769 | .set_cr4 = vmx_set_cr4, |
2599 | #ifdef CONFIG_X86_64 | ||
2600 | .set_efer = vmx_set_efer, | 2770 | .set_efer = vmx_set_efer, |
2601 | #endif | ||
2602 | .get_idt = vmx_get_idt, | 2771 | .get_idt = vmx_get_idt, |
2603 | .set_idt = vmx_set_idt, | 2772 | .set_idt = vmx_set_idt, |
2604 | .get_gdt = vmx_get_gdt, | 2773 | .get_gdt = vmx_get_gdt, |
@@ -2626,7 +2795,7 @@ static struct kvm_x86_ops vmx_x86_ops = { | |||
2626 | 2795 | ||
2627 | static int __init vmx_init(void) | 2796 | static int __init vmx_init(void) |
2628 | { | 2797 | { |
2629 | void *iova; | 2798 | void *va; |
2630 | int r; | 2799 | int r; |
2631 | 2800 | ||
2632 | vmx_io_bitmap_a = alloc_page(GFP_KERNEL | __GFP_HIGHMEM); | 2801 | vmx_io_bitmap_a = alloc_page(GFP_KERNEL | __GFP_HIGHMEM); |
@@ -2639,28 +2808,48 @@ static int __init vmx_init(void) | |||
2639 | goto out; | 2808 | goto out; |
2640 | } | 2809 | } |
2641 | 2810 | ||
2811 | vmx_msr_bitmap = alloc_page(GFP_KERNEL | __GFP_HIGHMEM); | ||
2812 | if (!vmx_msr_bitmap) { | ||
2813 | r = -ENOMEM; | ||
2814 | goto out1; | ||
2815 | } | ||
2816 | |||
2642 | /* | 2817 | /* |
2643 | * Allow direct access to the PC debug port (it is often used for I/O | 2818 | * Allow direct access to the PC debug port (it is often used for I/O |
2644 | * delays, but the vmexits simply slow things down). | 2819 | * delays, but the vmexits simply slow things down). |
2645 | */ | 2820 | */ |
2646 | iova = kmap(vmx_io_bitmap_a); | 2821 | va = kmap(vmx_io_bitmap_a); |
2647 | memset(iova, 0xff, PAGE_SIZE); | 2822 | memset(va, 0xff, PAGE_SIZE); |
2648 | clear_bit(0x80, iova); | 2823 | clear_bit(0x80, va); |
2649 | kunmap(vmx_io_bitmap_a); | 2824 | kunmap(vmx_io_bitmap_a); |
2650 | 2825 | ||
2651 | iova = kmap(vmx_io_bitmap_b); | 2826 | va = kmap(vmx_io_bitmap_b); |
2652 | memset(iova, 0xff, PAGE_SIZE); | 2827 | memset(va, 0xff, PAGE_SIZE); |
2653 | kunmap(vmx_io_bitmap_b); | 2828 | kunmap(vmx_io_bitmap_b); |
2654 | 2829 | ||
2830 | va = kmap(vmx_msr_bitmap); | ||
2831 | memset(va, 0xff, PAGE_SIZE); | ||
2832 | kunmap(vmx_msr_bitmap); | ||
2833 | |||
2834 | set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */ | ||
2835 | |||
2655 | r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx), THIS_MODULE); | 2836 | r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx), THIS_MODULE); |
2656 | if (r) | 2837 | if (r) |
2657 | goto out1; | 2838 | goto out2; |
2839 | |||
2840 | vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_FS_BASE); | ||
2841 | vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_GS_BASE); | ||
2842 | vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_IA32_SYSENTER_CS); | ||
2843 | vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_IA32_SYSENTER_ESP); | ||
2844 | vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_IA32_SYSENTER_EIP); | ||
2658 | 2845 | ||
2659 | if (bypass_guest_pf) | 2846 | if (bypass_guest_pf) |
2660 | kvm_mmu_set_nonpresent_ptes(~0xffeull, 0ull); | 2847 | kvm_mmu_set_nonpresent_ptes(~0xffeull, 0ull); |
2661 | 2848 | ||
2662 | return 0; | 2849 | return 0; |
2663 | 2850 | ||
2851 | out2: | ||
2852 | __free_page(vmx_msr_bitmap); | ||
2664 | out1: | 2853 | out1: |
2665 | __free_page(vmx_io_bitmap_b); | 2854 | __free_page(vmx_io_bitmap_b); |
2666 | out: | 2855 | out: |
@@ -2670,6 +2859,7 @@ out: | |||
2670 | 2859 | ||
2671 | static void __exit vmx_exit(void) | 2860 | static void __exit vmx_exit(void) |
2672 | { | 2861 | { |
2862 | __free_page(vmx_msr_bitmap); | ||
2673 | __free_page(vmx_io_bitmap_b); | 2863 | __free_page(vmx_io_bitmap_b); |
2674 | __free_page(vmx_io_bitmap_a); | 2864 | __free_page(vmx_io_bitmap_a); |
2675 | 2865 | ||
diff --git a/arch/x86/kvm/vmx.h b/arch/x86/kvm/vmx.h index d52ae8d7303d..5dff4606b988 100644 --- a/arch/x86/kvm/vmx.h +++ b/arch/x86/kvm/vmx.h | |||
@@ -49,6 +49,7 @@ | |||
49 | * Definitions of Secondary Processor-Based VM-Execution Controls. | 49 | * Definitions of Secondary Processor-Based VM-Execution Controls. |
50 | */ | 50 | */ |
51 | #define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001 | 51 | #define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001 |
52 | #define SECONDARY_EXEC_ENABLE_VPID 0x00000020 | ||
52 | #define SECONDARY_EXEC_WBINVD_EXITING 0x00000040 | 53 | #define SECONDARY_EXEC_WBINVD_EXITING 0x00000040 |
53 | 54 | ||
54 | 55 | ||
@@ -65,6 +66,7 @@ | |||
65 | 66 | ||
66 | /* VMCS Encodings */ | 67 | /* VMCS Encodings */ |
67 | enum vmcs_field { | 68 | enum vmcs_field { |
69 | VIRTUAL_PROCESSOR_ID = 0x00000000, | ||
68 | GUEST_ES_SELECTOR = 0x00000800, | 70 | GUEST_ES_SELECTOR = 0x00000800, |
69 | GUEST_CS_SELECTOR = 0x00000802, | 71 | GUEST_CS_SELECTOR = 0x00000802, |
70 | GUEST_SS_SELECTOR = 0x00000804, | 72 | GUEST_SS_SELECTOR = 0x00000804, |
@@ -231,12 +233,12 @@ enum vmcs_field { | |||
231 | */ | 233 | */ |
232 | #define INTR_INFO_VECTOR_MASK 0xff /* 7:0 */ | 234 | #define INTR_INFO_VECTOR_MASK 0xff /* 7:0 */ |
233 | #define INTR_INFO_INTR_TYPE_MASK 0x700 /* 10:8 */ | 235 | #define INTR_INFO_INTR_TYPE_MASK 0x700 /* 10:8 */ |
234 | #define INTR_INFO_DELIEVER_CODE_MASK 0x800 /* 11 */ | 236 | #define INTR_INFO_DELIVER_CODE_MASK 0x800 /* 11 */ |
235 | #define INTR_INFO_VALID_MASK 0x80000000 /* 31 */ | 237 | #define INTR_INFO_VALID_MASK 0x80000000 /* 31 */ |
236 | 238 | ||
237 | #define VECTORING_INFO_VECTOR_MASK INTR_INFO_VECTOR_MASK | 239 | #define VECTORING_INFO_VECTOR_MASK INTR_INFO_VECTOR_MASK |
238 | #define VECTORING_INFO_TYPE_MASK INTR_INFO_INTR_TYPE_MASK | 240 | #define VECTORING_INFO_TYPE_MASK INTR_INFO_INTR_TYPE_MASK |
239 | #define VECTORING_INFO_DELIEVER_CODE_MASK INTR_INFO_DELIEVER_CODE_MASK | 241 | #define VECTORING_INFO_DELIVER_CODE_MASK INTR_INFO_DELIVER_CODE_MASK |
240 | #define VECTORING_INFO_VALID_MASK INTR_INFO_VALID_MASK | 242 | #define VECTORING_INFO_VALID_MASK INTR_INFO_VALID_MASK |
241 | 243 | ||
242 | #define INTR_TYPE_EXT_INTR (0 << 8) /* external interrupt */ | 244 | #define INTR_TYPE_EXT_INTR (0 << 8) /* external interrupt */ |
@@ -321,4 +323,8 @@ enum vmcs_field { | |||
321 | 323 | ||
322 | #define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT 9 | 324 | #define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT 9 |
323 | 325 | ||
326 | #define VMX_NR_VPIDS (1 << 16) | ||
327 | #define VMX_VPID_EXTENT_SINGLE_CONTEXT 1 | ||
328 | #define VMX_VPID_EXTENT_ALL_CONTEXT 2 | ||
329 | |||
324 | #endif | 330 | #endif |
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 6b01552bd1f1..0ce556372a4d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c | |||
@@ -15,10 +15,12 @@ | |||
15 | */ | 15 | */ |
16 | 16 | ||
17 | #include <linux/kvm_host.h> | 17 | #include <linux/kvm_host.h> |
18 | #include "segment_descriptor.h" | ||
19 | #include "irq.h" | 18 | #include "irq.h" |
20 | #include "mmu.h" | 19 | #include "mmu.h" |
20 | #include "i8254.h" | ||
21 | #include "tss.h" | ||
21 | 22 | ||
23 | #include <linux/clocksource.h> | ||
22 | #include <linux/kvm.h> | 24 | #include <linux/kvm.h> |
23 | #include <linux/fs.h> | 25 | #include <linux/fs.h> |
24 | #include <linux/vmalloc.h> | 26 | #include <linux/vmalloc.h> |
@@ -28,6 +30,7 @@ | |||
28 | 30 | ||
29 | #include <asm/uaccess.h> | 31 | #include <asm/uaccess.h> |
30 | #include <asm/msr.h> | 32 | #include <asm/msr.h> |
33 | #include <asm/desc.h> | ||
31 | 34 | ||
32 | #define MAX_IO_MSRS 256 | 35 | #define MAX_IO_MSRS 256 |
33 | #define CR0_RESERVED_BITS \ | 36 | #define CR0_RESERVED_BITS \ |
@@ -41,7 +44,15 @@ | |||
41 | | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE)) | 44 | | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE)) |
42 | 45 | ||
43 | #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR) | 46 | #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR) |
44 | #define EFER_RESERVED_BITS 0xfffffffffffff2fe | 47 | /* EFER defaults: |
48 | * - enable syscall per default because its emulated by KVM | ||
49 | * - enable LME and LMA per default on 64 bit KVM | ||
50 | */ | ||
51 | #ifdef CONFIG_X86_64 | ||
52 | static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffafeULL; | ||
53 | #else | ||
54 | static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffffeULL; | ||
55 | #endif | ||
45 | 56 | ||
46 | #define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM | 57 | #define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM |
47 | #define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU | 58 | #define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU |
@@ -63,6 +74,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { | |||
63 | { "irq_window", VCPU_STAT(irq_window_exits) }, | 74 | { "irq_window", VCPU_STAT(irq_window_exits) }, |
64 | { "halt_exits", VCPU_STAT(halt_exits) }, | 75 | { "halt_exits", VCPU_STAT(halt_exits) }, |
65 | { "halt_wakeup", VCPU_STAT(halt_wakeup) }, | 76 | { "halt_wakeup", VCPU_STAT(halt_wakeup) }, |
77 | { "hypercalls", VCPU_STAT(hypercalls) }, | ||
66 | { "request_irq", VCPU_STAT(request_irq_exits) }, | 78 | { "request_irq", VCPU_STAT(request_irq_exits) }, |
67 | { "irq_exits", VCPU_STAT(irq_exits) }, | 79 | { "irq_exits", VCPU_STAT(irq_exits) }, |
68 | { "host_state_reload", VCPU_STAT(host_state_reload) }, | 80 | { "host_state_reload", VCPU_STAT(host_state_reload) }, |
@@ -78,6 +90,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { | |||
78 | { "mmu_recycled", VM_STAT(mmu_recycled) }, | 90 | { "mmu_recycled", VM_STAT(mmu_recycled) }, |
79 | { "mmu_cache_miss", VM_STAT(mmu_cache_miss) }, | 91 | { "mmu_cache_miss", VM_STAT(mmu_cache_miss) }, |
80 | { "remote_tlb_flush", VM_STAT(remote_tlb_flush) }, | 92 | { "remote_tlb_flush", VM_STAT(remote_tlb_flush) }, |
93 | { "largepages", VM_STAT(lpages) }, | ||
81 | { NULL } | 94 | { NULL } |
82 | }; | 95 | }; |
83 | 96 | ||
@@ -85,7 +98,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { | |||
85 | unsigned long segment_base(u16 selector) | 98 | unsigned long segment_base(u16 selector) |
86 | { | 99 | { |
87 | struct descriptor_table gdt; | 100 | struct descriptor_table gdt; |
88 | struct segment_descriptor *d; | 101 | struct desc_struct *d; |
89 | unsigned long table_base; | 102 | unsigned long table_base; |
90 | unsigned long v; | 103 | unsigned long v; |
91 | 104 | ||
@@ -101,13 +114,12 @@ unsigned long segment_base(u16 selector) | |||
101 | asm("sldt %0" : "=g"(ldt_selector)); | 114 | asm("sldt %0" : "=g"(ldt_selector)); |
102 | table_base = segment_base(ldt_selector); | 115 | table_base = segment_base(ldt_selector); |
103 | } | 116 | } |
104 | d = (struct segment_descriptor *)(table_base + (selector & ~7)); | 117 | d = (struct desc_struct *)(table_base + (selector & ~7)); |
105 | v = d->base_low | ((unsigned long)d->base_mid << 16) | | 118 | v = d->base0 | ((unsigned long)d->base1 << 16) | |
106 | ((unsigned long)d->base_high << 24); | 119 | ((unsigned long)d->base2 << 24); |
107 | #ifdef CONFIG_X86_64 | 120 | #ifdef CONFIG_X86_64 |
108 | if (d->system == 0 && (d->type == 2 || d->type == 9 || d->type == 11)) | 121 | if (d->s == 0 && (d->type == 2 || d->type == 9 || d->type == 11)) |
109 | v |= ((unsigned long) \ | 122 | v |= ((unsigned long)((struct ldttss_desc64 *)d)->base3) << 32; |
110 | ((struct segment_descriptor_64 *)d)->base_higher) << 32; | ||
111 | #endif | 123 | #endif |
112 | return v; | 124 | return v; |
113 | } | 125 | } |
@@ -145,11 +157,16 @@ void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr, | |||
145 | u32 error_code) | 157 | u32 error_code) |
146 | { | 158 | { |
147 | ++vcpu->stat.pf_guest; | 159 | ++vcpu->stat.pf_guest; |
148 | if (vcpu->arch.exception.pending && vcpu->arch.exception.nr == PF_VECTOR) { | 160 | if (vcpu->arch.exception.pending) { |
149 | printk(KERN_DEBUG "kvm: inject_page_fault:" | 161 | if (vcpu->arch.exception.nr == PF_VECTOR) { |
150 | " double fault 0x%lx\n", addr); | 162 | printk(KERN_DEBUG "kvm: inject_page_fault:" |
151 | vcpu->arch.exception.nr = DF_VECTOR; | 163 | " double fault 0x%lx\n", addr); |
152 | vcpu->arch.exception.error_code = 0; | 164 | vcpu->arch.exception.nr = DF_VECTOR; |
165 | vcpu->arch.exception.error_code = 0; | ||
166 | } else if (vcpu->arch.exception.nr == DF_VECTOR) { | ||
167 | /* triple fault -> shutdown */ | ||
168 | set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests); | ||
169 | } | ||
153 | return; | 170 | return; |
154 | } | 171 | } |
155 | vcpu->arch.cr2 = addr; | 172 | vcpu->arch.cr2 = addr; |
@@ -184,7 +201,6 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3) | |||
184 | int ret; | 201 | int ret; |
185 | u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)]; | 202 | u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)]; |
186 | 203 | ||
187 | down_read(&vcpu->kvm->slots_lock); | ||
188 | ret = kvm_read_guest_page(vcpu->kvm, pdpt_gfn, pdpte, | 204 | ret = kvm_read_guest_page(vcpu->kvm, pdpt_gfn, pdpte, |
189 | offset * sizeof(u64), sizeof(pdpte)); | 205 | offset * sizeof(u64), sizeof(pdpte)); |
190 | if (ret < 0) { | 206 | if (ret < 0) { |
@@ -201,10 +217,10 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3) | |||
201 | 217 | ||
202 | memcpy(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs)); | 218 | memcpy(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs)); |
203 | out: | 219 | out: |
204 | up_read(&vcpu->kvm->slots_lock); | ||
205 | 220 | ||
206 | return ret; | 221 | return ret; |
207 | } | 222 | } |
223 | EXPORT_SYMBOL_GPL(load_pdptrs); | ||
208 | 224 | ||
209 | static bool pdptrs_changed(struct kvm_vcpu *vcpu) | 225 | static bool pdptrs_changed(struct kvm_vcpu *vcpu) |
210 | { | 226 | { |
@@ -215,18 +231,16 @@ static bool pdptrs_changed(struct kvm_vcpu *vcpu) | |||
215 | if (is_long_mode(vcpu) || !is_pae(vcpu)) | 231 | if (is_long_mode(vcpu) || !is_pae(vcpu)) |
216 | return false; | 232 | return false; |
217 | 233 | ||
218 | down_read(&vcpu->kvm->slots_lock); | ||
219 | r = kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u, pdpte, sizeof(pdpte)); | 234 | r = kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u, pdpte, sizeof(pdpte)); |
220 | if (r < 0) | 235 | if (r < 0) |
221 | goto out; | 236 | goto out; |
222 | changed = memcmp(pdpte, vcpu->arch.pdptrs, sizeof(pdpte)) != 0; | 237 | changed = memcmp(pdpte, vcpu->arch.pdptrs, sizeof(pdpte)) != 0; |
223 | out: | 238 | out: |
224 | up_read(&vcpu->kvm->slots_lock); | ||
225 | 239 | ||
226 | return changed; | 240 | return changed; |
227 | } | 241 | } |
228 | 242 | ||
229 | void set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) | 243 | void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) |
230 | { | 244 | { |
231 | if (cr0 & CR0_RESERVED_BITS) { | 245 | if (cr0 & CR0_RESERVED_BITS) { |
232 | printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n", | 246 | printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n", |
@@ -284,15 +298,18 @@ void set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) | |||
284 | kvm_mmu_reset_context(vcpu); | 298 | kvm_mmu_reset_context(vcpu); |
285 | return; | 299 | return; |
286 | } | 300 | } |
287 | EXPORT_SYMBOL_GPL(set_cr0); | 301 | EXPORT_SYMBOL_GPL(kvm_set_cr0); |
288 | 302 | ||
/*
 * kvm_lmsw (lmsw on the old side of the diff) - replace the low four bits
 * of the vcpu's CR0 with the low four bits of @msw by calling the CR0
 * setter with the merged value.
 *
 * The new side additionally emits an LMSW trace record. Its value is
 * recomputed from vcpu->arch.cr0 after the CR0 setter has returned.
 */
289 | void lmsw(struct kvm_vcpu *vcpu, unsigned long msw) | 303 | void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw) |
290 | { | 304 | { |
291 | set_cr0(vcpu, (vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f)); | 305 | kvm_set_cr0(vcpu, (vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f)); |
306 | KVMTRACE_1D(LMSW, vcpu, | ||
307 | (u32)((vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f)), | ||
308 | handler); | ||
292 | } | 309 | } |
293 | EXPORT_SYMBOL_GPL(lmsw); | 310 | EXPORT_SYMBOL_GPL(kvm_lmsw); |
294 | 311 | ||
295 | void set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) | 312 | void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) |
296 | { | 313 | { |
297 | if (cr4 & CR4_RESERVED_BITS) { | 314 | if (cr4 & CR4_RESERVED_BITS) { |
298 | printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n"); | 315 | printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n"); |
@@ -323,9 +340,9 @@ void set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) | |||
323 | vcpu->arch.cr4 = cr4; | 340 | vcpu->arch.cr4 = cr4; |
324 | kvm_mmu_reset_context(vcpu); | 341 | kvm_mmu_reset_context(vcpu); |
325 | } | 342 | } |
326 | EXPORT_SYMBOL_GPL(set_cr4); | 343 | EXPORT_SYMBOL_GPL(kvm_set_cr4); |
327 | 344 | ||
328 | void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) | 345 | void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) |
329 | { | 346 | { |
330 | if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) { | 347 | if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) { |
331 | kvm_mmu_flush_tlb(vcpu); | 348 | kvm_mmu_flush_tlb(vcpu); |
@@ -359,7 +376,6 @@ void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) | |||
359 | */ | 376 | */ |
360 | } | 377 | } |
361 | 378 | ||
362 | down_read(&vcpu->kvm->slots_lock); | ||
363 | /* | 379 | /* |
364 | * Does the new cr3 value map to physical memory? (Note, we | 380 | * Does the new cr3 value map to physical memory? (Note, we |
365 | * catch an invalid cr3 even in real-mode, because it would | 381 | * catch an invalid cr3 even in real-mode, because it would |
@@ -375,11 +391,10 @@ void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) | |||
375 | vcpu->arch.cr3 = cr3; | 391 | vcpu->arch.cr3 = cr3; |
376 | vcpu->arch.mmu.new_cr3(vcpu); | 392 | vcpu->arch.mmu.new_cr3(vcpu); |
377 | } | 393 | } |
378 | up_read(&vcpu->kvm->slots_lock); | ||
379 | } | 394 | } |
380 | EXPORT_SYMBOL_GPL(set_cr3); | 395 | EXPORT_SYMBOL_GPL(kvm_set_cr3); |
381 | 396 | ||
382 | void set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) | 397 | void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) |
383 | { | 398 | { |
384 | if (cr8 & CR8_RESERVED_BITS) { | 399 | if (cr8 & CR8_RESERVED_BITS) { |
385 | printk(KERN_DEBUG "set_cr8: #GP, reserved bits 0x%lx\n", cr8); | 400 | printk(KERN_DEBUG "set_cr8: #GP, reserved bits 0x%lx\n", cr8); |
@@ -391,16 +406,16 @@ void set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) | |||
391 | else | 406 | else |
392 | vcpu->arch.cr8 = cr8; | 407 | vcpu->arch.cr8 = cr8; |
393 | } | 408 | } |
394 | EXPORT_SYMBOL_GPL(set_cr8); | 409 | EXPORT_SYMBOL_GPL(kvm_set_cr8); |
395 | 410 | ||
/*
 * kvm_get_cr8 (get_cr8 on the old side of the diff) - return the vcpu's
 * CR8 value: taken from kvm_lapic_get_cr8() when irqchip_in_kernel() is
 * true for this VM, otherwise from the cached vcpu->arch.cr8.
 */
396 | unsigned long get_cr8(struct kvm_vcpu *vcpu) | 411 | unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu) |
397 | { | 412 | { |
398 | if (irqchip_in_kernel(vcpu->kvm)) | 413 | if (irqchip_in_kernel(vcpu->kvm)) |
399 | return kvm_lapic_get_cr8(vcpu); | 414 | return kvm_lapic_get_cr8(vcpu); |
400 | else | 415 | else |
401 | return vcpu->arch.cr8; | 416 | return vcpu->arch.cr8; |
402 | } | 417 | } |
403 | EXPORT_SYMBOL_GPL(get_cr8); | 418 | EXPORT_SYMBOL_GPL(kvm_get_cr8); |
404 | 419 | ||
405 | /* | 420 | /* |
406 | * List of msr numbers which we expose to userspace through KVM_GET_MSRS | 421 | * List of msr numbers which we expose to userspace through KVM_GET_MSRS |
@@ -415,7 +430,8 @@ static u32 msrs_to_save[] = { | |||
415 | #ifdef CONFIG_X86_64 | 430 | #ifdef CONFIG_X86_64 |
416 | MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, | 431 | MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, |
417 | #endif | 432 | #endif |
418 | MSR_IA32_TIME_STAMP_COUNTER, | 433 | MSR_IA32_TIME_STAMP_COUNTER, MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, |
434 | MSR_IA32_PERF_STATUS, | ||
419 | }; | 435 | }; |
420 | 436 | ||
421 | static unsigned num_msrs_to_save; | 437 | static unsigned num_msrs_to_save; |
@@ -424,11 +440,9 @@ static u32 emulated_msrs[] = { | |||
424 | MSR_IA32_MISC_ENABLE, | 440 | MSR_IA32_MISC_ENABLE, |
425 | }; | 441 | }; |
426 | 442 | ||
427 | #ifdef CONFIG_X86_64 | ||
428 | |||
429 | static void set_efer(struct kvm_vcpu *vcpu, u64 efer) | 443 | static void set_efer(struct kvm_vcpu *vcpu, u64 efer) |
430 | { | 444 | { |
431 | if (efer & EFER_RESERVED_BITS) { | 445 | if (efer & efer_reserved_bits) { |
432 | printk(KERN_DEBUG "set_efer: 0x%llx #GP, reserved bits\n", | 446 | printk(KERN_DEBUG "set_efer: 0x%llx #GP, reserved bits\n", |
433 | efer); | 447 | efer); |
434 | kvm_inject_gp(vcpu, 0); | 448 | kvm_inject_gp(vcpu, 0); |
@@ -450,7 +464,12 @@ static void set_efer(struct kvm_vcpu *vcpu, u64 efer) | |||
450 | vcpu->arch.shadow_efer = efer; | 464 | vcpu->arch.shadow_efer = efer; |
451 | } | 465 | } |
452 | 466 | ||
/*
 * kvm_enable_efer_bits (new side of the diff) - clear the bits in @mask
 * from efer_reserved_bits, the mask set_efer() tests before injecting #GP
 * for an EFER write.
 */
453 | #endif | 467 | void kvm_enable_efer_bits(u64 mask) |
468 | { | ||
469 | efer_reserved_bits &= ~mask; | ||
470 | } | ||
471 | EXPORT_SYMBOL_GPL(kvm_enable_efer_bits); | ||
472 | |||
454 | 473 | ||
455 | /* | 474 | /* |
456 | * Writes msr value into into the appropriate "register". | 475 | * Writes msr value into into the appropriate "register". |
@@ -470,26 +489,86 @@ static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) | |||
470 | return kvm_set_msr(vcpu, index, *data); | 489 | return kvm_set_msr(vcpu, index, *data); |
471 | } | 490 | } |
472 | 491 | ||
/*
 * kvm_write_wall_clock - publish the host wall-clock time at guest physical
 * address @wall_clock.
 *
 * Does nothing when @wall_clock is 0. Otherwise performs three guest
 * writes: the incremented version counter, then the whole kvm_wall_clock
 * record (seconds and nanoseconds from current_kernel_time() plus that same
 * version), then the version incremented once more.
 *
 * NOTE(review): the version counter is a function-level static, so it is
 * shared by every caller and updated without any locking visible here.
 * NOTE(review): the return values of kvm_write_guest() are ignored.
 * NOTE(review): the bare version is written to the same address as the
 * record, which presumably relies on wc_version being the first field of
 * struct kvm_wall_clock -- confirm against its definition.
 */
492 | static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) | ||
493 | { | ||
494 | static int version; | ||
495 | struct kvm_wall_clock wc; | ||
496 | struct timespec wc_ts; | ||
497 | |||
498 | if (!wall_clock) | ||
499 | return; | ||
500 | |||
501 | version++; | ||
502 | |||
503 | kvm_write_guest(kvm, wall_clock, &version, sizeof(version)); | ||
504 | |||
505 | wc_ts = current_kernel_time(); | ||
506 | wc.wc_sec = wc_ts.tv_sec; | ||
507 | wc.wc_nsec = wc_ts.tv_nsec; | ||
508 | wc.wc_version = version; | ||
509 | |||
510 | kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc)); | ||
511 | |||
512 | version++; | ||
513 | kvm_write_guest(kvm, wall_clock, &version, sizeof(version)); | ||
514 | } | ||
515 | |||
/*
 * kvm_write_guest_time - refresh the hv_clock record in the vcpu's time page.
 *
 * Returns immediately when no time page is set. Otherwise, with local
 * interrupts disabled, it reads MSR_IA32_TIME_STAMP_COUNTER through
 * kvm_get_msr() into hv_clock.tsc_timestamp and samples ktime_get_ts().
 * It then stores that time in nanoseconds as system_time, sets version
 * to 2, copies hv_clock into the page at time_offset through a temporary
 * kmap_atomic() mapping, and marks guest frame (time >> PAGE_SHIFT) dirty.
 */
516 | static void kvm_write_guest_time(struct kvm_vcpu *v) | ||
517 | { | ||
518 | struct timespec ts; | ||
519 | unsigned long flags; | ||
520 | struct kvm_vcpu_arch *vcpu = &v->arch; | ||
521 | void *shared_kaddr; | ||
522 | |||
523 | if ((!vcpu->time_page)) | ||
524 | return; | ||
525 | |||
526 | /* Keep irq disabled to prevent changes to the clock */ | ||
527 | local_irq_save(flags); | ||
528 | kvm_get_msr(v, MSR_IA32_TIME_STAMP_COUNTER, | ||
529 | &vcpu->hv_clock.tsc_timestamp); | ||
530 | ktime_get_ts(&ts); | ||
531 | local_irq_restore(flags); | ||
532 | |||
533 | /* With all the info we got, fill in the values */ | ||
534 | |||
535 | vcpu->hv_clock.system_time = ts.tv_nsec + | ||
536 | (NSEC_PER_SEC * (u64)ts.tv_sec); | ||
537 | /* | ||
538 | * The interface expects us to write an even number signaling that the | ||
539 | * update is finished. Since the guest won't see the intermediate | ||
540 | * state, we just write "2" at the end | ||
541 | */ | ||
542 | vcpu->hv_clock.version = 2; | ||
543 | |||
544 | shared_kaddr = kmap_atomic(vcpu->time_page, KM_USER0); | ||
545 | |||
546 | memcpy(shared_kaddr + vcpu->time_offset, &vcpu->hv_clock, | ||
547 | sizeof(vcpu->hv_clock)); | ||
548 | |||
549 | kunmap_atomic(shared_kaddr, KM_USER0); | ||
550 | |||
551 | mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT); | ||
552 | } | ||
553 | |||
473 | 554 | ||
474 | int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) | 555 | int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) |
475 | { | 556 | { |
476 | switch (msr) { | 557 | switch (msr) { |
477 | #ifdef CONFIG_X86_64 | ||
478 | case MSR_EFER: | 558 | case MSR_EFER: |
479 | set_efer(vcpu, data); | 559 | set_efer(vcpu, data); |
480 | break; | 560 | break; |
481 | #endif | ||
482 | case MSR_IA32_MC0_STATUS: | 561 | case MSR_IA32_MC0_STATUS: |
483 | pr_unimpl(vcpu, "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n", | 562 | pr_unimpl(vcpu, "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n", |
484 | __FUNCTION__, data); | 563 | __func__, data); |
485 | break; | 564 | break; |
486 | case MSR_IA32_MCG_STATUS: | 565 | case MSR_IA32_MCG_STATUS: |
487 | pr_unimpl(vcpu, "%s: MSR_IA32_MCG_STATUS 0x%llx, nop\n", | 566 | pr_unimpl(vcpu, "%s: MSR_IA32_MCG_STATUS 0x%llx, nop\n", |
488 | __FUNCTION__, data); | 567 | __func__, data); |
489 | break; | 568 | break; |
490 | case MSR_IA32_MCG_CTL: | 569 | case MSR_IA32_MCG_CTL: |
491 | pr_unimpl(vcpu, "%s: MSR_IA32_MCG_CTL 0x%llx, nop\n", | 570 | pr_unimpl(vcpu, "%s: MSR_IA32_MCG_CTL 0x%llx, nop\n", |
492 | __FUNCTION__, data); | 571 | __func__, data); |
493 | break; | 572 | break; |
494 | case MSR_IA32_UCODE_REV: | 573 | case MSR_IA32_UCODE_REV: |
495 | case MSR_IA32_UCODE_WRITE: | 574 | case MSR_IA32_UCODE_WRITE: |
@@ -501,6 +580,42 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) | |||
501 | case MSR_IA32_MISC_ENABLE: | 580 | case MSR_IA32_MISC_ENABLE: |
502 | vcpu->arch.ia32_misc_enable_msr = data; | 581 | vcpu->arch.ia32_misc_enable_msr = data; |
503 | break; | 582 | break; |
583 | case MSR_KVM_WALL_CLOCK: | ||
584 | vcpu->kvm->arch.wall_clock = data; | ||
585 | kvm_write_wall_clock(vcpu->kvm, data); | ||
586 | break; | ||
587 | case MSR_KVM_SYSTEM_TIME: { | ||
588 | if (vcpu->arch.time_page) { | ||
589 | kvm_release_page_dirty(vcpu->arch.time_page); | ||
590 | vcpu->arch.time_page = NULL; | ||
591 | } | ||
592 | |||
593 | vcpu->arch.time = data; | ||
594 | |||
595 | /* we verify if the enable bit is set... */ | ||
596 | if (!(data & 1)) | ||
597 | break; | ||
598 | |||
599 | /* ...but clean it before doing the actual write */ | ||
600 | vcpu->arch.time_offset = data & ~(PAGE_MASK | 1); | ||
601 | |||
602 | vcpu->arch.hv_clock.tsc_to_system_mul = | ||
603 | clocksource_khz2mult(tsc_khz, 22); | ||
604 | vcpu->arch.hv_clock.tsc_shift = 22; | ||
605 | |||
606 | down_read(&current->mm->mmap_sem); | ||
607 | vcpu->arch.time_page = | ||
608 | gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT); | ||
609 | up_read(&current->mm->mmap_sem); | ||
610 | |||
611 | if (is_error_page(vcpu->arch.time_page)) { | ||
612 | kvm_release_page_clean(vcpu->arch.time_page); | ||
613 | vcpu->arch.time_page = NULL; | ||
614 | } | ||
615 | |||
616 | kvm_write_guest_time(vcpu); | ||
617 | break; | ||
618 | } | ||
504 | default: | 619 | default: |
505 | pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", msr, data); | 620 | pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", msr, data); |
506 | return 1; | 621 | return 1; |
@@ -540,7 +655,6 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) | |||
540 | case MSR_IA32_MC0_MISC+12: | 655 | case MSR_IA32_MC0_MISC+12: |
541 | case MSR_IA32_MC0_MISC+16: | 656 | case MSR_IA32_MC0_MISC+16: |
542 | case MSR_IA32_UCODE_REV: | 657 | case MSR_IA32_UCODE_REV: |
543 | case MSR_IA32_PERF_STATUS: | ||
544 | case MSR_IA32_EBL_CR_POWERON: | 658 | case MSR_IA32_EBL_CR_POWERON: |
545 | /* MTRR registers */ | 659 | /* MTRR registers */ |
546 | case 0xfe: | 660 | case 0xfe: |
@@ -556,11 +670,21 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) | |||
556 | case MSR_IA32_MISC_ENABLE: | 670 | case MSR_IA32_MISC_ENABLE: |
557 | data = vcpu->arch.ia32_misc_enable_msr; | 671 | data = vcpu->arch.ia32_misc_enable_msr; |
558 | break; | 672 | break; |
559 | #ifdef CONFIG_X86_64 | 673 | case MSR_IA32_PERF_STATUS: |
674 | /* TSC increment by tick */ | ||
675 | data = 1000ULL; | ||
676 | /* CPU multiplier */ | ||
677 | data |= (((uint64_t)4ULL) << 40); | ||
678 | break; | ||
560 | case MSR_EFER: | 679 | case MSR_EFER: |
561 | data = vcpu->arch.shadow_efer; | 680 | data = vcpu->arch.shadow_efer; |
562 | break; | 681 | break; |
563 | #endif | 682 | case MSR_KVM_WALL_CLOCK: |
683 | data = vcpu->kvm->arch.wall_clock; | ||
684 | break; | ||
685 | case MSR_KVM_SYSTEM_TIME: | ||
686 | data = vcpu->arch.time; | ||
687 | break; | ||
564 | default: | 688 | default: |
565 | pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr); | 689 | pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr); |
566 | return 1; | 690 | return 1; |
@@ -584,9 +708,11 @@ static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs, | |||
584 | 708 | ||
585 | vcpu_load(vcpu); | 709 | vcpu_load(vcpu); |
586 | 710 | ||
711 | down_read(&vcpu->kvm->slots_lock); | ||
587 | for (i = 0; i < msrs->nmsrs; ++i) | 712 | for (i = 0; i < msrs->nmsrs; ++i) |
588 | if (do_msr(vcpu, entries[i].index, &entries[i].data)) | 713 | if (do_msr(vcpu, entries[i].index, &entries[i].data)) |
589 | break; | 714 | break; |
715 | up_read(&vcpu->kvm->slots_lock); | ||
590 | 716 | ||
591 | vcpu_put(vcpu); | 717 | vcpu_put(vcpu); |
592 | 718 | ||
@@ -688,11 +814,24 @@ int kvm_dev_ioctl_check_extension(long ext) | |||
688 | case KVM_CAP_USER_MEMORY: | 814 | case KVM_CAP_USER_MEMORY: |
689 | case KVM_CAP_SET_TSS_ADDR: | 815 | case KVM_CAP_SET_TSS_ADDR: |
690 | case KVM_CAP_EXT_CPUID: | 816 | case KVM_CAP_EXT_CPUID: |
817 | case KVM_CAP_CLOCKSOURCE: | ||
818 | case KVM_CAP_PIT: | ||
819 | case KVM_CAP_NOP_IO_DELAY: | ||
820 | case KVM_CAP_MP_STATE: | ||
691 | r = 1; | 821 | r = 1; |
692 | break; | 822 | break; |
693 | case KVM_CAP_VAPIC: | 823 | case KVM_CAP_VAPIC: |
694 | r = !kvm_x86_ops->cpu_has_accelerated_tpr(); | 824 | r = !kvm_x86_ops->cpu_has_accelerated_tpr(); |
695 | break; | 825 | break; |
826 | case KVM_CAP_NR_VCPUS: | ||
827 | r = KVM_MAX_VCPUS; | ||
828 | break; | ||
829 | case KVM_CAP_NR_MEMSLOTS: | ||
830 | r = KVM_MEMORY_SLOTS; | ||
831 | break; | ||
832 | case KVM_CAP_PV_MMU: | ||
833 | r = !tdp_enabled; | ||
834 | break; | ||
696 | default: | 835 | default: |
697 | r = 0; | 836 | r = 0; |
698 | break; | 837 | break; |
@@ -763,6 +902,7 @@ out: | |||
763 | void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) | 902 | void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) |
764 | { | 903 | { |
765 | kvm_x86_ops->vcpu_load(vcpu, cpu); | 904 | kvm_x86_ops->vcpu_load(vcpu, cpu); |
905 | kvm_write_guest_time(vcpu); | ||
766 | } | 906 | } |
767 | 907 | ||
768 | void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) | 908 | void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) |
@@ -958,32 +1098,32 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, | |||
958 | } | 1098 | } |
959 | /* function 4 and 0xb have additional index. */ | 1099 | /* function 4 and 0xb have additional index. */ |
960 | case 4: { | 1100 | case 4: { |
961 | int index, cache_type; | 1101 | int i, cache_type; |
962 | 1102 | ||
963 | entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; | 1103 | entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; |
964 | /* read more entries until cache_type is zero */ | 1104 | /* read more entries until cache_type is zero */ |
965 | for (index = 1; *nent < maxnent; ++index) { | 1105 | for (i = 1; *nent < maxnent; ++i) { |
966 | cache_type = entry[index - 1].eax & 0x1f; | 1106 | cache_type = entry[i - 1].eax & 0x1f; |
967 | if (!cache_type) | 1107 | if (!cache_type) |
968 | break; | 1108 | break; |
969 | do_cpuid_1_ent(&entry[index], function, index); | 1109 | do_cpuid_1_ent(&entry[i], function, i); |
970 | entry[index].flags |= | 1110 | entry[i].flags |= |
971 | KVM_CPUID_FLAG_SIGNIFCANT_INDEX; | 1111 | KVM_CPUID_FLAG_SIGNIFCANT_INDEX; |
972 | ++*nent; | 1112 | ++*nent; |
973 | } | 1113 | } |
974 | break; | 1114 | break; |
975 | } | 1115 | } |
976 | case 0xb: { | 1116 | case 0xb: { |
977 | int index, level_type; | 1117 | int i, level_type; |
978 | 1118 | ||
979 | entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; | 1119 | entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; |
980 | /* read more entries until level_type is zero */ | 1120 | /* read more entries until level_type is zero */ |
981 | for (index = 1; *nent < maxnent; ++index) { | 1121 | for (i = 1; *nent < maxnent; ++i) { |
982 | level_type = entry[index - 1].ecx & 0xff; | 1122 | level_type = entry[i - 1].ecx & 0xff; |
983 | if (!level_type) | 1123 | if (!level_type) |
984 | break; | 1124 | break; |
985 | do_cpuid_1_ent(&entry[index], function, index); | 1125 | do_cpuid_1_ent(&entry[i], function, i); |
986 | entry[index].flags |= | 1126 | entry[i].flags |= |
987 | KVM_CPUID_FLAG_SIGNIFCANT_INDEX; | 1127 | KVM_CPUID_FLAG_SIGNIFCANT_INDEX; |
988 | ++*nent; | 1128 | ++*nent; |
989 | } | 1129 | } |
@@ -1365,6 +1505,23 @@ static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) | |||
1365 | return r; | 1505 | return r; |
1366 | } | 1506 | } |
1367 | 1507 | ||
1508 | static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps) | ||
1509 | { | ||
1510 | int r = 0; | ||
1511 | |||
1512 | memcpy(ps, &kvm->arch.vpit->pit_state, sizeof(struct kvm_pit_state)); | ||
1513 | return r; | ||
1514 | } | ||
1515 | |||
1516 | static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps) | ||
1517 | { | ||
1518 | int r = 0; | ||
1519 | |||
1520 | memcpy(&kvm->arch.vpit->pit_state, ps, sizeof(struct kvm_pit_state)); | ||
1521 | kvm_pit_load_count(kvm, 0, ps->channels[0].count); | ||
1522 | return r; | ||
1523 | } | ||
1524 | |||
1368 | /* | 1525 | /* |
1369 | * Get (and clear) the dirty memory log for a memory slot. | 1526 | * Get (and clear) the dirty memory log for a memory slot. |
1370 | */ | 1527 | */ |
@@ -1457,6 +1614,12 @@ long kvm_arch_vm_ioctl(struct file *filp, | |||
1457 | } else | 1614 | } else |
1458 | goto out; | 1615 | goto out; |
1459 | break; | 1616 | break; |
1617 | case KVM_CREATE_PIT: | ||
1618 | r = -ENOMEM; | ||
1619 | kvm->arch.vpit = kvm_create_pit(kvm); | ||
1620 | if (kvm->arch.vpit) | ||
1621 | r = 0; | ||
1622 | break; | ||
1460 | case KVM_IRQ_LINE: { | 1623 | case KVM_IRQ_LINE: { |
1461 | struct kvm_irq_level irq_event; | 1624 | struct kvm_irq_level irq_event; |
1462 | 1625 | ||
@@ -1512,6 +1675,37 @@ long kvm_arch_vm_ioctl(struct file *filp, | |||
1512 | r = 0; | 1675 | r = 0; |
1513 | break; | 1676 | break; |
1514 | } | 1677 | } |
1678 | case KVM_GET_PIT: { | ||
1679 | struct kvm_pit_state ps; | ||
1680 | r = -EFAULT; | ||
1681 | if (copy_from_user(&ps, argp, sizeof ps)) | ||
1682 | goto out; | ||
1683 | r = -ENXIO; | ||
1684 | if (!kvm->arch.vpit) | ||
1685 | goto out; | ||
1686 | r = kvm_vm_ioctl_get_pit(kvm, &ps); | ||
1687 | if (r) | ||
1688 | goto out; | ||
1689 | r = -EFAULT; | ||
1690 | if (copy_to_user(argp, &ps, sizeof ps)) | ||
1691 | goto out; | ||
1692 | r = 0; | ||
1693 | break; | ||
1694 | } | ||
1695 | case KVM_SET_PIT: { | ||
1696 | struct kvm_pit_state ps; | ||
1697 | r = -EFAULT; | ||
1698 | if (copy_from_user(&ps, argp, sizeof ps)) | ||
1699 | goto out; | ||
1700 | r = -ENXIO; | ||
1701 | if (!kvm->arch.vpit) | ||
1702 | goto out; | ||
1703 | r = kvm_vm_ioctl_set_pit(kvm, &ps); | ||
1704 | if (r) | ||
1705 | goto out; | ||
1706 | r = 0; | ||
1707 | break; | ||
1708 | } | ||
1515 | default: | 1709 | default: |
1516 | ; | 1710 | ; |
1517 | } | 1711 | } |
@@ -1570,7 +1764,6 @@ int emulator_read_std(unsigned long addr, | |||
1570 | void *data = val; | 1764 | void *data = val; |
1571 | int r = X86EMUL_CONTINUE; | 1765 | int r = X86EMUL_CONTINUE; |
1572 | 1766 | ||
1573 | down_read(&vcpu->kvm->slots_lock); | ||
1574 | while (bytes) { | 1767 | while (bytes) { |
1575 | gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); | 1768 | gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); |
1576 | unsigned offset = addr & (PAGE_SIZE-1); | 1769 | unsigned offset = addr & (PAGE_SIZE-1); |
@@ -1592,7 +1785,6 @@ int emulator_read_std(unsigned long addr, | |||
1592 | addr += tocopy; | 1785 | addr += tocopy; |
1593 | } | 1786 | } |
1594 | out: | 1787 | out: |
1595 | up_read(&vcpu->kvm->slots_lock); | ||
1596 | return r; | 1788 | return r; |
1597 | } | 1789 | } |
1598 | EXPORT_SYMBOL_GPL(emulator_read_std); | 1790 | EXPORT_SYMBOL_GPL(emulator_read_std); |
@@ -1611,9 +1803,7 @@ static int emulator_read_emulated(unsigned long addr, | |||
1611 | return X86EMUL_CONTINUE; | 1803 | return X86EMUL_CONTINUE; |
1612 | } | 1804 | } |
1613 | 1805 | ||
1614 | down_read(&vcpu->kvm->slots_lock); | ||
1615 | gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); | 1806 | gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); |
1616 | up_read(&vcpu->kvm->slots_lock); | ||
1617 | 1807 | ||
1618 | /* For APIC access vmexit */ | 1808 | /* For APIC access vmexit */ |
1619 | if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) | 1809 | if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) |
@@ -1646,19 +1836,15 @@ mmio: | |||
1646 | return X86EMUL_UNHANDLEABLE; | 1836 | return X86EMUL_UNHANDLEABLE; |
1647 | } | 1837 | } |
1648 | 1838 | ||
1649 | static int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, | 1839 | int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, |
1650 | const void *val, int bytes) | 1840 | const void *val, int bytes) |
1651 | { | 1841 | { |
1652 | int ret; | 1842 | int ret; |
1653 | 1843 | ||
1654 | down_read(&vcpu->kvm->slots_lock); | ||
1655 | ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes); | 1844 | ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes); |
1656 | if (ret < 0) { | 1845 | if (ret < 0) |
1657 | up_read(&vcpu->kvm->slots_lock); | ||
1658 | return 0; | 1846 | return 0; |
1659 | } | ||
1660 | kvm_mmu_pte_write(vcpu, gpa, val, bytes); | 1847 | kvm_mmu_pte_write(vcpu, gpa, val, bytes); |
1661 | up_read(&vcpu->kvm->slots_lock); | ||
1662 | return 1; | 1848 | return 1; |
1663 | } | 1849 | } |
1664 | 1850 | ||
@@ -1670,9 +1856,7 @@ static int emulator_write_emulated_onepage(unsigned long addr, | |||
1670 | struct kvm_io_device *mmio_dev; | 1856 | struct kvm_io_device *mmio_dev; |
1671 | gpa_t gpa; | 1857 | gpa_t gpa; |
1672 | 1858 | ||
1673 | down_read(&vcpu->kvm->slots_lock); | ||
1674 | gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); | 1859 | gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); |
1675 | up_read(&vcpu->kvm->slots_lock); | ||
1676 | 1860 | ||
1677 | if (gpa == UNMAPPED_GVA) { | 1861 | if (gpa == UNMAPPED_GVA) { |
1678 | kvm_inject_page_fault(vcpu, addr, 2); | 1862 | kvm_inject_page_fault(vcpu, addr, 2); |
@@ -1749,7 +1933,6 @@ static int emulator_cmpxchg_emulated(unsigned long addr, | |||
1749 | char *kaddr; | 1933 | char *kaddr; |
1750 | u64 val; | 1934 | u64 val; |
1751 | 1935 | ||
1752 | down_read(&vcpu->kvm->slots_lock); | ||
1753 | gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); | 1936 | gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); |
1754 | 1937 | ||
1755 | if (gpa == UNMAPPED_GVA || | 1938 | if (gpa == UNMAPPED_GVA || |
@@ -1769,9 +1952,8 @@ static int emulator_cmpxchg_emulated(unsigned long addr, | |||
1769 | set_64bit((u64 *)(kaddr + offset_in_page(gpa)), val); | 1952 | set_64bit((u64 *)(kaddr + offset_in_page(gpa)), val); |
1770 | kunmap_atomic(kaddr, KM_USER0); | 1953 | kunmap_atomic(kaddr, KM_USER0); |
1771 | kvm_release_page_dirty(page); | 1954 | kvm_release_page_dirty(page); |
1772 | emul_write: | ||
1773 | up_read(&vcpu->kvm->slots_lock); | ||
1774 | } | 1955 | } |
1956 | emul_write: | ||
1775 | #endif | 1957 | #endif |
1776 | 1958 | ||
1777 | return emulator_write_emulated(addr, new, bytes, vcpu); | 1959 | return emulator_write_emulated(addr, new, bytes, vcpu); |
@@ -1802,7 +1984,7 @@ int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest) | |||
1802 | *dest = kvm_x86_ops->get_dr(vcpu, dr); | 1984 | *dest = kvm_x86_ops->get_dr(vcpu, dr); |
1803 | return X86EMUL_CONTINUE; | 1985 | return X86EMUL_CONTINUE; |
1804 | default: | 1986 | default: |
1805 | pr_unimpl(vcpu, "%s: unexpected dr %u\n", __FUNCTION__, dr); | 1987 | pr_unimpl(vcpu, "%s: unexpected dr %u\n", __func__, dr); |
1806 | return X86EMUL_UNHANDLEABLE; | 1988 | return X86EMUL_UNHANDLEABLE; |
1807 | } | 1989 | } |
1808 | } | 1990 | } |
@@ -1840,7 +2022,7 @@ void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context) | |||
1840 | } | 2022 | } |
1841 | EXPORT_SYMBOL_GPL(kvm_report_emulation_failure); | 2023 | EXPORT_SYMBOL_GPL(kvm_report_emulation_failure); |
1842 | 2024 | ||
1843 | struct x86_emulate_ops emulate_ops = { | 2025 | static struct x86_emulate_ops emulate_ops = { |
1844 | .read_std = emulator_read_std, | 2026 | .read_std = emulator_read_std, |
1845 | .read_emulated = emulator_read_emulated, | 2027 | .read_emulated = emulator_read_emulated, |
1846 | .write_emulated = emulator_write_emulated, | 2028 | .write_emulated = emulator_write_emulated, |
@@ -2091,6 +2273,13 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, | |||
2091 | vcpu->arch.pio.guest_page_offset = 0; | 2273 | vcpu->arch.pio.guest_page_offset = 0; |
2092 | vcpu->arch.pio.rep = 0; | 2274 | vcpu->arch.pio.rep = 0; |
2093 | 2275 | ||
2276 | if (vcpu->run->io.direction == KVM_EXIT_IO_IN) | ||
2277 | KVMTRACE_2D(IO_READ, vcpu, vcpu->run->io.port, (u32)size, | ||
2278 | handler); | ||
2279 | else | ||
2280 | KVMTRACE_2D(IO_WRITE, vcpu, vcpu->run->io.port, (u32)size, | ||
2281 | handler); | ||
2282 | |||
2094 | kvm_x86_ops->cache_regs(vcpu); | 2283 | kvm_x86_ops->cache_regs(vcpu); |
2095 | memcpy(vcpu->arch.pio_data, &vcpu->arch.regs[VCPU_REGS_RAX], 4); | 2284 | memcpy(vcpu->arch.pio_data, &vcpu->arch.regs[VCPU_REGS_RAX], 4); |
2096 | kvm_x86_ops->decache_regs(vcpu); | 2285 | kvm_x86_ops->decache_regs(vcpu); |
@@ -2129,6 +2318,13 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, | |||
2129 | vcpu->arch.pio.guest_page_offset = offset_in_page(address); | 2318 | vcpu->arch.pio.guest_page_offset = offset_in_page(address); |
2130 | vcpu->arch.pio.rep = rep; | 2319 | vcpu->arch.pio.rep = rep; |
2131 | 2320 | ||
2321 | if (vcpu->run->io.direction == KVM_EXIT_IO_IN) | ||
2322 | KVMTRACE_2D(IO_READ, vcpu, vcpu->run->io.port, (u32)size, | ||
2323 | handler); | ||
2324 | else | ||
2325 | KVMTRACE_2D(IO_WRITE, vcpu, vcpu->run->io.port, (u32)size, | ||
2326 | handler); | ||
2327 | |||
2132 | if (!count) { | 2328 | if (!count) { |
2133 | kvm_x86_ops->skip_emulated_instruction(vcpu); | 2329 | kvm_x86_ops->skip_emulated_instruction(vcpu); |
2134 | return 1; | 2330 | return 1; |
@@ -2163,10 +2359,8 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, | |||
2163 | kvm_x86_ops->skip_emulated_instruction(vcpu); | 2359 | kvm_x86_ops->skip_emulated_instruction(vcpu); |
2164 | 2360 | ||
2165 | for (i = 0; i < nr_pages; ++i) { | 2361 | for (i = 0; i < nr_pages; ++i) { |
2166 | down_read(&vcpu->kvm->slots_lock); | ||
2167 | page = gva_to_page(vcpu, address + i * PAGE_SIZE); | 2362 | page = gva_to_page(vcpu, address + i * PAGE_SIZE); |
2168 | vcpu->arch.pio.guest_pages[i] = page; | 2363 | vcpu->arch.pio.guest_pages[i] = page; |
2169 | up_read(&vcpu->kvm->slots_lock); | ||
2170 | if (!page) { | 2364 | if (!page) { |
2171 | kvm_inject_gp(vcpu, 0); | 2365 | kvm_inject_gp(vcpu, 0); |
2172 | free_pio_guest_pages(vcpu); | 2366 | free_pio_guest_pages(vcpu); |
@@ -2238,10 +2432,13 @@ void kvm_arch_exit(void) | |||
2238 | int kvm_emulate_halt(struct kvm_vcpu *vcpu) | 2432 | int kvm_emulate_halt(struct kvm_vcpu *vcpu) |
2239 | { | 2433 | { |
2240 | ++vcpu->stat.halt_exits; | 2434 | ++vcpu->stat.halt_exits; |
2435 | KVMTRACE_0D(HLT, vcpu, handler); | ||
2241 | if (irqchip_in_kernel(vcpu->kvm)) { | 2436 | if (irqchip_in_kernel(vcpu->kvm)) { |
2242 | vcpu->arch.mp_state = VCPU_MP_STATE_HALTED; | 2437 | vcpu->arch.mp_state = KVM_MP_STATE_HALTED; |
2438 | up_read(&vcpu->kvm->slots_lock); | ||
2243 | kvm_vcpu_block(vcpu); | 2439 | kvm_vcpu_block(vcpu); |
2244 | if (vcpu->arch.mp_state != VCPU_MP_STATE_RUNNABLE) | 2440 | down_read(&vcpu->kvm->slots_lock); |
2441 | if (vcpu->arch.mp_state != KVM_MP_STATE_RUNNABLE) | ||
2245 | return -EINTR; | 2442 | return -EINTR; |
2246 | return 1; | 2443 | return 1; |
2247 | } else { | 2444 | } else { |
@@ -2251,9 +2448,19 @@ int kvm_emulate_halt(struct kvm_vcpu *vcpu) | |||
2251 | } | 2448 | } |
2252 | EXPORT_SYMBOL_GPL(kvm_emulate_halt); | 2449 | EXPORT_SYMBOL_GPL(kvm_emulate_halt); |
2253 | 2450 | ||
2451 | static inline gpa_t hc_gpa(struct kvm_vcpu *vcpu, unsigned long a0, | ||
2452 | unsigned long a1) | ||
2453 | { | ||
2454 | if (is_long_mode(vcpu)) | ||
2455 | return a0; | ||
2456 | else | ||
2457 | return a0 | ((gpa_t)a1 << 32); | ||
2458 | } | ||
2459 | |||
2254 | int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) | 2460 | int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) |
2255 | { | 2461 | { |
2256 | unsigned long nr, a0, a1, a2, a3, ret; | 2462 | unsigned long nr, a0, a1, a2, a3, ret; |
2463 | int r = 1; | ||
2257 | 2464 | ||
2258 | kvm_x86_ops->cache_regs(vcpu); | 2465 | kvm_x86_ops->cache_regs(vcpu); |
2259 | 2466 | ||
@@ -2263,6 +2470,8 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) | |||
2263 | a2 = vcpu->arch.regs[VCPU_REGS_RDX]; | 2470 | a2 = vcpu->arch.regs[VCPU_REGS_RDX]; |
2264 | a3 = vcpu->arch.regs[VCPU_REGS_RSI]; | 2471 | a3 = vcpu->arch.regs[VCPU_REGS_RSI]; |
2265 | 2472 | ||
2473 | KVMTRACE_1D(VMMCALL, vcpu, (u32)nr, handler); | ||
2474 | |||
2266 | if (!is_long_mode(vcpu)) { | 2475 | if (!is_long_mode(vcpu)) { |
2267 | nr &= 0xFFFFFFFF; | 2476 | nr &= 0xFFFFFFFF; |
2268 | a0 &= 0xFFFFFFFF; | 2477 | a0 &= 0xFFFFFFFF; |
@@ -2275,13 +2484,17 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) | |||
2275 | case KVM_HC_VAPIC_POLL_IRQ: | 2484 | case KVM_HC_VAPIC_POLL_IRQ: |
2276 | ret = 0; | 2485 | ret = 0; |
2277 | break; | 2486 | break; |
2487 | case KVM_HC_MMU_OP: | ||
2488 | r = kvm_pv_mmu_op(vcpu, a0, hc_gpa(vcpu, a1, a2), &ret); | ||
2489 | break; | ||
2278 | default: | 2490 | default: |
2279 | ret = -KVM_ENOSYS; | 2491 | ret = -KVM_ENOSYS; |
2280 | break; | 2492 | break; |
2281 | } | 2493 | } |
2282 | vcpu->arch.regs[VCPU_REGS_RAX] = ret; | 2494 | vcpu->arch.regs[VCPU_REGS_RAX] = ret; |
2283 | kvm_x86_ops->decache_regs(vcpu); | 2495 | kvm_x86_ops->decache_regs(vcpu); |
2284 | return 0; | 2496 | ++vcpu->stat.hypercalls; |
2497 | return r; | ||
2285 | } | 2498 | } |
2286 | EXPORT_SYMBOL_GPL(kvm_emulate_hypercall); | 2499 | EXPORT_SYMBOL_GPL(kvm_emulate_hypercall); |
2287 | 2500 | ||
@@ -2329,7 +2542,7 @@ void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base) | |||
2329 | void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw, | 2542 | void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw, |
2330 | unsigned long *rflags) | 2543 | unsigned long *rflags) |
2331 | { | 2544 | { |
2332 | lmsw(vcpu, msw); | 2545 | kvm_lmsw(vcpu, msw); |
2333 | *rflags = kvm_x86_ops->get_rflags(vcpu); | 2546 | *rflags = kvm_x86_ops->get_rflags(vcpu); |
2334 | } | 2547 | } |
2335 | 2548 | ||
@@ -2346,9 +2559,9 @@ unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr) | |||
2346 | case 4: | 2559 | case 4: |
2347 | return vcpu->arch.cr4; | 2560 | return vcpu->arch.cr4; |
2348 | case 8: | 2561 | case 8: |
2349 | return get_cr8(vcpu); | 2562 | return kvm_get_cr8(vcpu); |
2350 | default: | 2563 | default: |
2351 | vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr); | 2564 | vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr); |
2352 | return 0; | 2565 | return 0; |
2353 | } | 2566 | } |
2354 | } | 2567 | } |
@@ -2358,23 +2571,23 @@ void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val, | |||
2358 | { | 2571 | { |
2359 | switch (cr) { | 2572 | switch (cr) { |
2360 | case 0: | 2573 | case 0: |
2361 | set_cr0(vcpu, mk_cr_64(vcpu->arch.cr0, val)); | 2574 | kvm_set_cr0(vcpu, mk_cr_64(vcpu->arch.cr0, val)); |
2362 | *rflags = kvm_x86_ops->get_rflags(vcpu); | 2575 | *rflags = kvm_x86_ops->get_rflags(vcpu); |
2363 | break; | 2576 | break; |
2364 | case 2: | 2577 | case 2: |
2365 | vcpu->arch.cr2 = val; | 2578 | vcpu->arch.cr2 = val; |
2366 | break; | 2579 | break; |
2367 | case 3: | 2580 | case 3: |
2368 | set_cr3(vcpu, val); | 2581 | kvm_set_cr3(vcpu, val); |
2369 | break; | 2582 | break; |
2370 | case 4: | 2583 | case 4: |
2371 | set_cr4(vcpu, mk_cr_64(vcpu->arch.cr4, val)); | 2584 | kvm_set_cr4(vcpu, mk_cr_64(vcpu->arch.cr4, val)); |
2372 | break; | 2585 | break; |
2373 | case 8: | 2586 | case 8: |
2374 | set_cr8(vcpu, val & 0xfUL); | 2587 | kvm_set_cr8(vcpu, val & 0xfUL); |
2375 | break; | 2588 | break; |
2376 | default: | 2589 | default: |
2377 | vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr); | 2590 | vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr); |
2378 | } | 2591 | } |
2379 | } | 2592 | } |
2380 | 2593 | ||
@@ -2447,6 +2660,11 @@ void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) | |||
2447 | } | 2660 | } |
2448 | kvm_x86_ops->decache_regs(vcpu); | 2661 | kvm_x86_ops->decache_regs(vcpu); |
2449 | kvm_x86_ops->skip_emulated_instruction(vcpu); | 2662 | kvm_x86_ops->skip_emulated_instruction(vcpu); |
2663 | KVMTRACE_5D(CPUID, vcpu, function, | ||
2664 | (u32)vcpu->arch.regs[VCPU_REGS_RAX], | ||
2665 | (u32)vcpu->arch.regs[VCPU_REGS_RBX], | ||
2666 | (u32)vcpu->arch.regs[VCPU_REGS_RCX], | ||
2667 | (u32)vcpu->arch.regs[VCPU_REGS_RDX], handler); | ||
2450 | } | 2668 | } |
2451 | EXPORT_SYMBOL_GPL(kvm_emulate_cpuid); | 2669 | EXPORT_SYMBOL_GPL(kvm_emulate_cpuid); |
2452 | 2670 | ||
@@ -2469,7 +2687,7 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu, | |||
2469 | struct kvm_run *kvm_run) | 2687 | struct kvm_run *kvm_run) |
2470 | { | 2688 | { |
2471 | kvm_run->if_flag = (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF) != 0; | 2689 | kvm_run->if_flag = (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF) != 0; |
2472 | kvm_run->cr8 = get_cr8(vcpu); | 2690 | kvm_run->cr8 = kvm_get_cr8(vcpu); |
2473 | kvm_run->apic_base = kvm_get_apic_base(vcpu); | 2691 | kvm_run->apic_base = kvm_get_apic_base(vcpu); |
2474 | if (irqchip_in_kernel(vcpu->kvm)) | 2692 | if (irqchip_in_kernel(vcpu->kvm)) |
2475 | kvm_run->ready_for_interrupt_injection = 1; | 2693 | kvm_run->ready_for_interrupt_injection = 1; |
@@ -2509,16 +2727,17 @@ static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
2509 | { | 2727 | { |
2510 | int r; | 2728 | int r; |
2511 | 2729 | ||
2512 | if (unlikely(vcpu->arch.mp_state == VCPU_MP_STATE_SIPI_RECEIVED)) { | 2730 | if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) { |
2513 | pr_debug("vcpu %d received sipi with vector # %x\n", | 2731 | pr_debug("vcpu %d received sipi with vector # %x\n", |
2514 | vcpu->vcpu_id, vcpu->arch.sipi_vector); | 2732 | vcpu->vcpu_id, vcpu->arch.sipi_vector); |
2515 | kvm_lapic_reset(vcpu); | 2733 | kvm_lapic_reset(vcpu); |
2516 | r = kvm_x86_ops->vcpu_reset(vcpu); | 2734 | r = kvm_x86_ops->vcpu_reset(vcpu); |
2517 | if (r) | 2735 | if (r) |
2518 | return r; | 2736 | return r; |
2519 | vcpu->arch.mp_state = VCPU_MP_STATE_RUNNABLE; | 2737 | vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; |
2520 | } | 2738 | } |
2521 | 2739 | ||
2740 | down_read(&vcpu->kvm->slots_lock); | ||
2522 | vapic_enter(vcpu); | 2741 | vapic_enter(vcpu); |
2523 | 2742 | ||
2524 | preempted: | 2743 | preempted: |
@@ -2526,6 +2745,10 @@ preempted: | |||
2526 | kvm_x86_ops->guest_debug_pre(vcpu); | 2745 | kvm_x86_ops->guest_debug_pre(vcpu); |
2527 | 2746 | ||
2528 | again: | 2747 | again: |
2748 | if (vcpu->requests) | ||
2749 | if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests)) | ||
2750 | kvm_mmu_unload(vcpu); | ||
2751 | |||
2529 | r = kvm_mmu_reload(vcpu); | 2752 | r = kvm_mmu_reload(vcpu); |
2530 | if (unlikely(r)) | 2753 | if (unlikely(r)) |
2531 | goto out; | 2754 | goto out; |
@@ -2539,6 +2762,11 @@ again: | |||
2539 | r = 0; | 2762 | r = 0; |
2540 | goto out; | 2763 | goto out; |
2541 | } | 2764 | } |
2765 | if (test_and_clear_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests)) { | ||
2766 | kvm_run->exit_reason = KVM_EXIT_SHUTDOWN; | ||
2767 | r = 0; | ||
2768 | goto out; | ||
2769 | } | ||
2542 | } | 2770 | } |
2543 | 2771 | ||
2544 | kvm_inject_pending_timer_irqs(vcpu); | 2772 | kvm_inject_pending_timer_irqs(vcpu); |
@@ -2557,6 +2785,14 @@ again: | |||
2557 | goto out; | 2785 | goto out; |
2558 | } | 2786 | } |
2559 | 2787 | ||
2788 | if (vcpu->requests) | ||
2789 | if (test_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests)) { | ||
2790 | local_irq_enable(); | ||
2791 | preempt_enable(); | ||
2792 | r = 1; | ||
2793 | goto out; | ||
2794 | } | ||
2795 | |||
2560 | if (signal_pending(current)) { | 2796 | if (signal_pending(current)) { |
2561 | local_irq_enable(); | 2797 | local_irq_enable(); |
2562 | preempt_enable(); | 2798 | preempt_enable(); |
@@ -2566,6 +2802,13 @@ again: | |||
2566 | goto out; | 2802 | goto out; |
2567 | } | 2803 | } |
2568 | 2804 | ||
2805 | vcpu->guest_mode = 1; | ||
2806 | /* | ||
2807 | * Make sure that guest_mode assignment won't happen after | ||
2808 | * testing the pending IRQ vector bitmap. | ||
2809 | */ | ||
2810 | smp_wmb(); | ||
2811 | |||
2569 | if (vcpu->arch.exception.pending) | 2812 | if (vcpu->arch.exception.pending) |
2570 | __queue_exception(vcpu); | 2813 | __queue_exception(vcpu); |
2571 | else if (irqchip_in_kernel(vcpu->kvm)) | 2814 | else if (irqchip_in_kernel(vcpu->kvm)) |
@@ -2575,13 +2818,15 @@ again: | |||
2575 | 2818 | ||
2576 | kvm_lapic_sync_to_vapic(vcpu); | 2819 | kvm_lapic_sync_to_vapic(vcpu); |
2577 | 2820 | ||
2578 | vcpu->guest_mode = 1; | 2821 | up_read(&vcpu->kvm->slots_lock); |
2822 | |||
2579 | kvm_guest_enter(); | 2823 | kvm_guest_enter(); |
2580 | 2824 | ||
2581 | if (vcpu->requests) | 2825 | if (vcpu->requests) |
2582 | if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests)) | 2826 | if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests)) |
2583 | kvm_x86_ops->tlb_flush(vcpu); | 2827 | kvm_x86_ops->tlb_flush(vcpu); |
2584 | 2828 | ||
2829 | KVMTRACE_0D(VMENTRY, vcpu, entryexit); | ||
2585 | kvm_x86_ops->run(vcpu, kvm_run); | 2830 | kvm_x86_ops->run(vcpu, kvm_run); |
2586 | 2831 | ||
2587 | vcpu->guest_mode = 0; | 2832 | vcpu->guest_mode = 0; |
@@ -2601,6 +2846,8 @@ again: | |||
2601 | 2846 | ||
2602 | preempt_enable(); | 2847 | preempt_enable(); |
2603 | 2848 | ||
2849 | down_read(&vcpu->kvm->slots_lock); | ||
2850 | |||
2604 | /* | 2851 | /* |
2605 | * Profile KVM exit RIPs: | 2852 | * Profile KVM exit RIPs: |
2606 | */ | 2853 | */ |
@@ -2628,14 +2875,18 @@ again: | |||
2628 | } | 2875 | } |
2629 | 2876 | ||
2630 | out: | 2877 | out: |
2878 | up_read(&vcpu->kvm->slots_lock); | ||
2631 | if (r > 0) { | 2879 | if (r > 0) { |
2632 | kvm_resched(vcpu); | 2880 | kvm_resched(vcpu); |
2881 | down_read(&vcpu->kvm->slots_lock); | ||
2633 | goto preempted; | 2882 | goto preempted; |
2634 | } | 2883 | } |
2635 | 2884 | ||
2636 | post_kvm_run_save(vcpu, kvm_run); | 2885 | post_kvm_run_save(vcpu, kvm_run); |
2637 | 2886 | ||
2887 | down_read(&vcpu->kvm->slots_lock); | ||
2638 | vapic_exit(vcpu); | 2888 | vapic_exit(vcpu); |
2889 | up_read(&vcpu->kvm->slots_lock); | ||
2639 | 2890 | ||
2640 | return r; | 2891 | return r; |
2641 | } | 2892 | } |
@@ -2647,7 +2898,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
2647 | 2898 | ||
2648 | vcpu_load(vcpu); | 2899 | vcpu_load(vcpu); |
2649 | 2900 | ||
2650 | if (unlikely(vcpu->arch.mp_state == VCPU_MP_STATE_UNINITIALIZED)) { | 2901 | if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) { |
2651 | kvm_vcpu_block(vcpu); | 2902 | kvm_vcpu_block(vcpu); |
2652 | vcpu_put(vcpu); | 2903 | vcpu_put(vcpu); |
2653 | return -EAGAIN; | 2904 | return -EAGAIN; |
@@ -2658,7 +2909,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
2658 | 2909 | ||
2659 | /* re-sync apic's tpr */ | 2910 | /* re-sync apic's tpr */ |
2660 | if (!irqchip_in_kernel(vcpu->kvm)) | 2911 | if (!irqchip_in_kernel(vcpu->kvm)) |
2661 | set_cr8(vcpu, kvm_run->cr8); | 2912 | kvm_set_cr8(vcpu, kvm_run->cr8); |
2662 | 2913 | ||
2663 | if (vcpu->arch.pio.cur_count) { | 2914 | if (vcpu->arch.pio.cur_count) { |
2664 | r = complete_pio(vcpu); | 2915 | r = complete_pio(vcpu); |
@@ -2670,9 +2921,12 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) | |||
2670 | memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8); | 2921 | memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8); |
2671 | vcpu->mmio_read_completed = 1; | 2922 | vcpu->mmio_read_completed = 1; |
2672 | vcpu->mmio_needed = 0; | 2923 | vcpu->mmio_needed = 0; |
2924 | |||
2925 | down_read(&vcpu->kvm->slots_lock); | ||
2673 | r = emulate_instruction(vcpu, kvm_run, | 2926 | r = emulate_instruction(vcpu, kvm_run, |
2674 | vcpu->arch.mmio_fault_cr2, 0, | 2927 | vcpu->arch.mmio_fault_cr2, 0, |
2675 | EMULTYPE_NO_DECODE); | 2928 | EMULTYPE_NO_DECODE); |
2929 | up_read(&vcpu->kvm->slots_lock); | ||
2676 | if (r == EMULATE_DO_MMIO) { | 2930 | if (r == EMULATE_DO_MMIO) { |
2677 | /* | 2931 | /* |
2678 | * Read-modify-write. Back to userspace. | 2932 | * Read-modify-write. Back to userspace. |
@@ -2773,7 +3027,7 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) | |||
2773 | static void get_segment(struct kvm_vcpu *vcpu, | 3027 | static void get_segment(struct kvm_vcpu *vcpu, |
2774 | struct kvm_segment *var, int seg) | 3028 | struct kvm_segment *var, int seg) |
2775 | { | 3029 | { |
2776 | return kvm_x86_ops->get_segment(vcpu, var, seg); | 3030 | kvm_x86_ops->get_segment(vcpu, var, seg); |
2777 | } | 3031 | } |
2778 | 3032 | ||
2779 | void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) | 3033 | void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) |
@@ -2816,7 +3070,7 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, | |||
2816 | sregs->cr2 = vcpu->arch.cr2; | 3070 | sregs->cr2 = vcpu->arch.cr2; |
2817 | sregs->cr3 = vcpu->arch.cr3; | 3071 | sregs->cr3 = vcpu->arch.cr3; |
2818 | sregs->cr4 = vcpu->arch.cr4; | 3072 | sregs->cr4 = vcpu->arch.cr4; |
2819 | sregs->cr8 = get_cr8(vcpu); | 3073 | sregs->cr8 = kvm_get_cr8(vcpu); |
2820 | sregs->efer = vcpu->arch.shadow_efer; | 3074 | sregs->efer = vcpu->arch.shadow_efer; |
2821 | sregs->apic_base = kvm_get_apic_base(vcpu); | 3075 | sregs->apic_base = kvm_get_apic_base(vcpu); |
2822 | 3076 | ||
@@ -2836,12 +3090,438 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, | |||
2836 | return 0; | 3090 | return 0; |
2837 | } | 3091 | } |
2838 | 3092 | ||
3093 | int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, | ||
3094 | struct kvm_mp_state *mp_state) | ||
3095 | { | ||
3096 | vcpu_load(vcpu); | ||
3097 | mp_state->mp_state = vcpu->arch.mp_state; | ||
3098 | vcpu_put(vcpu); | ||
3099 | return 0; | ||
3100 | } | ||
3101 | |||
3102 | int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, | ||
3103 | struct kvm_mp_state *mp_state) | ||
3104 | { | ||
3105 | vcpu_load(vcpu); | ||
3106 | vcpu->arch.mp_state = mp_state->mp_state; | ||
3107 | vcpu_put(vcpu); | ||
3108 | return 0; | ||
3109 | } | ||
3110 | |||
2839 | static void set_segment(struct kvm_vcpu *vcpu, | 3111 | static void set_segment(struct kvm_vcpu *vcpu, |
2840 | struct kvm_segment *var, int seg) | 3112 | struct kvm_segment *var, int seg) |
2841 | { | 3113 | { |
2842 | return kvm_x86_ops->set_segment(vcpu, var, seg); | 3114 | kvm_x86_ops->set_segment(vcpu, var, seg); |
3115 | } | ||
3116 | |||
3117 | static void seg_desct_to_kvm_desct(struct desc_struct *seg_desc, u16 selector, | ||
3118 | struct kvm_segment *kvm_desct) | ||
3119 | { | ||
3120 | kvm_desct->base = seg_desc->base0; | ||
3121 | kvm_desct->base |= seg_desc->base1 << 16; | ||
3122 | kvm_desct->base |= seg_desc->base2 << 24; | ||
3123 | kvm_desct->limit = seg_desc->limit0; | ||
3124 | kvm_desct->limit |= seg_desc->limit << 16; | ||
3125 | kvm_desct->selector = selector; | ||
3126 | kvm_desct->type = seg_desc->type; | ||
3127 | kvm_desct->present = seg_desc->p; | ||
3128 | kvm_desct->dpl = seg_desc->dpl; | ||
3129 | kvm_desct->db = seg_desc->d; | ||
3130 | kvm_desct->s = seg_desc->s; | ||
3131 | kvm_desct->l = seg_desc->l; | ||
3132 | kvm_desct->g = seg_desc->g; | ||
3133 | kvm_desct->avl = seg_desc->avl; | ||
3134 | if (!selector) | ||
3135 | kvm_desct->unusable = 1; | ||
3136 | else | ||
3137 | kvm_desct->unusable = 0; | ||
3138 | kvm_desct->padding = 0; | ||
3139 | } | ||
3140 | |||
3141 | static void get_segment_descritptor_dtable(struct kvm_vcpu *vcpu, | ||
3142 | u16 selector, | ||
3143 | struct descriptor_table *dtable) | ||
3144 | { | ||
3145 | if (selector & 1 << 2) { | ||
3146 | struct kvm_segment kvm_seg; | ||
3147 | |||
3148 | get_segment(vcpu, &kvm_seg, VCPU_SREG_LDTR); | ||
3149 | |||
3150 | if (kvm_seg.unusable) | ||
3151 | dtable->limit = 0; | ||
3152 | else | ||
3153 | dtable->limit = kvm_seg.limit; | ||
3154 | dtable->base = kvm_seg.base; | ||
3155 | } | ||
3156 | else | ||
3157 | kvm_x86_ops->get_gdt(vcpu, dtable); | ||
3158 | } | ||
3159 | |||
3160 | /* allowed just for 8 bytes segments */ | ||
3161 | static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, | ||
3162 | struct desc_struct *seg_desc) | ||
3163 | { | ||
3164 | struct descriptor_table dtable; | ||
3165 | u16 index = selector >> 3; | ||
3166 | |||
3167 | get_segment_descritptor_dtable(vcpu, selector, &dtable); | ||
3168 | |||
3169 | if (dtable.limit < index * 8 + 7) { | ||
3170 | kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc); | ||
3171 | return 1; | ||
3172 | } | ||
3173 | return kvm_read_guest(vcpu->kvm, dtable.base + index * 8, seg_desc, 8); | ||
3174 | } | ||
3175 | |||
3176 | /* allowed just for 8 bytes segments */ | ||
3177 | static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, | ||
3178 | struct desc_struct *seg_desc) | ||
3179 | { | ||
3180 | struct descriptor_table dtable; | ||
3181 | u16 index = selector >> 3; | ||
3182 | |||
3183 | get_segment_descritptor_dtable(vcpu, selector, &dtable); | ||
3184 | |||
3185 | if (dtable.limit < index * 8 + 7) | ||
3186 | return 1; | ||
3187 | return kvm_write_guest(vcpu->kvm, dtable.base + index * 8, seg_desc, 8); | ||
3188 | } | ||
3189 | |||
3190 | static u32 get_tss_base_addr(struct kvm_vcpu *vcpu, | ||
3191 | struct desc_struct *seg_desc) | ||
3192 | { | ||
3193 | u32 base_addr; | ||
3194 | |||
3195 | base_addr = seg_desc->base0; | ||
3196 | base_addr |= (seg_desc->base1 << 16); | ||
3197 | base_addr |= (seg_desc->base2 << 24); | ||
3198 | |||
3199 | return base_addr; | ||
3200 | } | ||
3201 | |||
3202 | static int load_tss_segment32(struct kvm_vcpu *vcpu, | ||
3203 | struct desc_struct *seg_desc, | ||
3204 | struct tss_segment_32 *tss) | ||
3205 | { | ||
3206 | u32 base_addr; | ||
3207 | |||
3208 | base_addr = get_tss_base_addr(vcpu, seg_desc); | ||
3209 | |||
3210 | return kvm_read_guest(vcpu->kvm, base_addr, tss, | ||
3211 | sizeof(struct tss_segment_32)); | ||
3212 | } | ||
3213 | |||
3214 | static int save_tss_segment32(struct kvm_vcpu *vcpu, | ||
3215 | struct desc_struct *seg_desc, | ||
3216 | struct tss_segment_32 *tss) | ||
3217 | { | ||
3218 | u32 base_addr; | ||
3219 | |||
3220 | base_addr = get_tss_base_addr(vcpu, seg_desc); | ||
3221 | |||
3222 | return kvm_write_guest(vcpu->kvm, base_addr, tss, | ||
3223 | sizeof(struct tss_segment_32)); | ||
3224 | } | ||
3225 | |||
3226 | static int load_tss_segment16(struct kvm_vcpu *vcpu, | ||
3227 | struct desc_struct *seg_desc, | ||
3228 | struct tss_segment_16 *tss) | ||
3229 | { | ||
3230 | u32 base_addr; | ||
3231 | |||
3232 | base_addr = get_tss_base_addr(vcpu, seg_desc); | ||
3233 | |||
3234 | return kvm_read_guest(vcpu->kvm, base_addr, tss, | ||
3235 | sizeof(struct tss_segment_16)); | ||
3236 | } | ||
3237 | |||
3238 | static int save_tss_segment16(struct kvm_vcpu *vcpu, | ||
3239 | struct desc_struct *seg_desc, | ||
3240 | struct tss_segment_16 *tss) | ||
3241 | { | ||
3242 | u32 base_addr; | ||
3243 | |||
3244 | base_addr = get_tss_base_addr(vcpu, seg_desc); | ||
3245 | |||
3246 | return kvm_write_guest(vcpu->kvm, base_addr, tss, | ||
3247 | sizeof(struct tss_segment_16)); | ||
3248 | } | ||
3249 | |||
3250 | static u16 get_segment_selector(struct kvm_vcpu *vcpu, int seg) | ||
3251 | { | ||
3252 | struct kvm_segment kvm_seg; | ||
3253 | |||
3254 | get_segment(vcpu, &kvm_seg, seg); | ||
3255 | return kvm_seg.selector; | ||
3256 | } | ||
3257 | |||
3258 | static int load_segment_descriptor_to_kvm_desct(struct kvm_vcpu *vcpu, | ||
3259 | u16 selector, | ||
3260 | struct kvm_segment *kvm_seg) | ||
3261 | { | ||
3262 | struct desc_struct seg_desc; | ||
3263 | |||
3264 | if (load_guest_segment_descriptor(vcpu, selector, &seg_desc)) | ||
3265 | return 1; | ||
3266 | seg_desct_to_kvm_desct(&seg_desc, selector, kvm_seg); | ||
3267 | return 0; | ||
3268 | } | ||
3269 | |||
3270 | static int load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, | ||
3271 | int type_bits, int seg) | ||
3272 | { | ||
3273 | struct kvm_segment kvm_seg; | ||
3274 | |||
3275 | if (load_segment_descriptor_to_kvm_desct(vcpu, selector, &kvm_seg)) | ||
3276 | return 1; | ||
3277 | kvm_seg.type |= type_bits; | ||
3278 | |||
3279 | if (seg != VCPU_SREG_SS && seg != VCPU_SREG_CS && | ||
3280 | seg != VCPU_SREG_LDTR) | ||
3281 | if (!kvm_seg.s) | ||
3282 | kvm_seg.unusable = 1; | ||
3283 | |||
3284 | set_segment(vcpu, &kvm_seg, seg); | ||
3285 | return 0; | ||
3286 | } | ||
3287 | |||
3288 | static void save_state_to_tss32(struct kvm_vcpu *vcpu, | ||
3289 | struct tss_segment_32 *tss) | ||
3290 | { | ||
3291 | tss->cr3 = vcpu->arch.cr3; | ||
3292 | tss->eip = vcpu->arch.rip; | ||
3293 | tss->eflags = kvm_x86_ops->get_rflags(vcpu); | ||
3294 | tss->eax = vcpu->arch.regs[VCPU_REGS_RAX]; | ||
3295 | tss->ecx = vcpu->arch.regs[VCPU_REGS_RCX]; | ||
3296 | tss->edx = vcpu->arch.regs[VCPU_REGS_RDX]; | ||
3297 | tss->ebx = vcpu->arch.regs[VCPU_REGS_RBX]; | ||
3298 | tss->esp = vcpu->arch.regs[VCPU_REGS_RSP]; | ||
3299 | tss->ebp = vcpu->arch.regs[VCPU_REGS_RBP]; | ||
3300 | tss->esi = vcpu->arch.regs[VCPU_REGS_RSI]; | ||
3301 | tss->edi = vcpu->arch.regs[VCPU_REGS_RDI]; | ||
3302 | |||
3303 | tss->es = get_segment_selector(vcpu, VCPU_SREG_ES); | ||
3304 | tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS); | ||
3305 | tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS); | ||
3306 | tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS); | ||
3307 | tss->fs = get_segment_selector(vcpu, VCPU_SREG_FS); | ||
3308 | tss->gs = get_segment_selector(vcpu, VCPU_SREG_GS); | ||
3309 | tss->ldt_selector = get_segment_selector(vcpu, VCPU_SREG_LDTR); | ||
3310 | tss->prev_task_link = get_segment_selector(vcpu, VCPU_SREG_TR); | ||
3311 | } | ||
3312 | |||
3313 | static int load_state_from_tss32(struct kvm_vcpu *vcpu, | ||
3314 | struct tss_segment_32 *tss) | ||
3315 | { | ||
3316 | kvm_set_cr3(vcpu, tss->cr3); | ||
3317 | |||
3318 | vcpu->arch.rip = tss->eip; | ||
3319 | kvm_x86_ops->set_rflags(vcpu, tss->eflags | 2); | ||
3320 | |||
3321 | vcpu->arch.regs[VCPU_REGS_RAX] = tss->eax; | ||
3322 | vcpu->arch.regs[VCPU_REGS_RCX] = tss->ecx; | ||
3323 | vcpu->arch.regs[VCPU_REGS_RDX] = tss->edx; | ||
3324 | vcpu->arch.regs[VCPU_REGS_RBX] = tss->ebx; | ||
3325 | vcpu->arch.regs[VCPU_REGS_RSP] = tss->esp; | ||
3326 | vcpu->arch.regs[VCPU_REGS_RBP] = tss->ebp; | ||
3327 | vcpu->arch.regs[VCPU_REGS_RSI] = tss->esi; | ||
3328 | vcpu->arch.regs[VCPU_REGS_RDI] = tss->edi; | ||
3329 | |||
3330 | if (load_segment_descriptor(vcpu, tss->ldt_selector, 0, VCPU_SREG_LDTR)) | ||
3331 | return 1; | ||
3332 | |||
3333 | if (load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES)) | ||
3334 | return 1; | ||
3335 | |||
3336 | if (load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS)) | ||
3337 | return 1; | ||
3338 | |||
3339 | if (load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS)) | ||
3340 | return 1; | ||
3341 | |||
3342 | if (load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS)) | ||
3343 | return 1; | ||
3344 | |||
3345 | if (load_segment_descriptor(vcpu, tss->fs, 1, VCPU_SREG_FS)) | ||
3346 | return 1; | ||
3347 | |||
3348 | if (load_segment_descriptor(vcpu, tss->gs, 1, VCPU_SREG_GS)) | ||
3349 | return 1; | ||
3350 | return 0; | ||
3351 | } | ||
3352 | |||
3353 | static void save_state_to_tss16(struct kvm_vcpu *vcpu, | ||
3354 | struct tss_segment_16 *tss) | ||
3355 | { | ||
3356 | tss->ip = vcpu->arch.rip; | ||
3357 | tss->flag = kvm_x86_ops->get_rflags(vcpu); | ||
3358 | tss->ax = vcpu->arch.regs[VCPU_REGS_RAX]; | ||
3359 | tss->cx = vcpu->arch.regs[VCPU_REGS_RCX]; | ||
3360 | tss->dx = vcpu->arch.regs[VCPU_REGS_RDX]; | ||
3361 | tss->bx = vcpu->arch.regs[VCPU_REGS_RBX]; | ||
3362 | tss->sp = vcpu->arch.regs[VCPU_REGS_RSP]; | ||
3363 | tss->bp = vcpu->arch.regs[VCPU_REGS_RBP]; | ||
3364 | tss->si = vcpu->arch.regs[VCPU_REGS_RSI]; | ||
3365 | tss->di = vcpu->arch.regs[VCPU_REGS_RDI]; | ||
3366 | |||
3367 | tss->es = get_segment_selector(vcpu, VCPU_SREG_ES); | ||
3368 | tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS); | ||
3369 | tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS); | ||
3370 | tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS); | ||
3371 | tss->ldt = get_segment_selector(vcpu, VCPU_SREG_LDTR); | ||
3372 | tss->prev_task_link = get_segment_selector(vcpu, VCPU_SREG_TR); | ||
3373 | } | ||
3374 | |||
3375 | static int load_state_from_tss16(struct kvm_vcpu *vcpu, | ||
3376 | struct tss_segment_16 *tss) | ||
3377 | { | ||
3378 | vcpu->arch.rip = tss->ip; | ||
3379 | kvm_x86_ops->set_rflags(vcpu, tss->flag | 2); | ||
3380 | vcpu->arch.regs[VCPU_REGS_RAX] = tss->ax; | ||
3381 | vcpu->arch.regs[VCPU_REGS_RCX] = tss->cx; | ||
3382 | vcpu->arch.regs[VCPU_REGS_RDX] = tss->dx; | ||
3383 | vcpu->arch.regs[VCPU_REGS_RBX] = tss->bx; | ||
3384 | vcpu->arch.regs[VCPU_REGS_RSP] = tss->sp; | ||
3385 | vcpu->arch.regs[VCPU_REGS_RBP] = tss->bp; | ||
3386 | vcpu->arch.regs[VCPU_REGS_RSI] = tss->si; | ||
3387 | vcpu->arch.regs[VCPU_REGS_RDI] = tss->di; | ||
3388 | |||
3389 | if (load_segment_descriptor(vcpu, tss->ldt, 0, VCPU_SREG_LDTR)) | ||
3390 | return 1; | ||
3391 | |||
3392 | if (load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES)) | ||
3393 | return 1; | ||
3394 | |||
3395 | if (load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS)) | ||
3396 | return 1; | ||
3397 | |||
3398 | if (load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS)) | ||
3399 | return 1; | ||
3400 | |||
3401 | if (load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS)) | ||
3402 | return 1; | ||
3403 | return 0; | ||
3404 | } | ||
3405 | |||
3406 | int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector, | ||
3407 | struct desc_struct *cseg_desc, | ||
3408 | struct desc_struct *nseg_desc) | ||
3409 | { | ||
3410 | struct tss_segment_16 tss_segment_16; | ||
3411 | int ret = 0; | ||
3412 | |||
3413 | if (load_tss_segment16(vcpu, cseg_desc, &tss_segment_16)) | ||
3414 | goto out; | ||
3415 | |||
3416 | save_state_to_tss16(vcpu, &tss_segment_16); | ||
3417 | save_tss_segment16(vcpu, cseg_desc, &tss_segment_16); | ||
3418 | |||
3419 | if (load_tss_segment16(vcpu, nseg_desc, &tss_segment_16)) | ||
3420 | goto out; | ||
3421 | if (load_state_from_tss16(vcpu, &tss_segment_16)) | ||
3422 | goto out; | ||
3423 | |||
3424 | ret = 1; | ||
3425 | out: | ||
3426 | return ret; | ||
3427 | } | ||
3428 | |||
3429 | int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector, | ||
3430 | struct desc_struct *cseg_desc, | ||
3431 | struct desc_struct *nseg_desc) | ||
3432 | { | ||
3433 | struct tss_segment_32 tss_segment_32; | ||
3434 | int ret = 0; | ||
3435 | |||
3436 | if (load_tss_segment32(vcpu, cseg_desc, &tss_segment_32)) | ||
3437 | goto out; | ||
3438 | |||
3439 | save_state_to_tss32(vcpu, &tss_segment_32); | ||
3440 | save_tss_segment32(vcpu, cseg_desc, &tss_segment_32); | ||
3441 | |||
3442 | if (load_tss_segment32(vcpu, nseg_desc, &tss_segment_32)) | ||
3443 | goto out; | ||
3444 | if (load_state_from_tss32(vcpu, &tss_segment_32)) | ||
3445 | goto out; | ||
3446 | |||
3447 | ret = 1; | ||
3448 | out: | ||
3449 | return ret; | ||
2843 | } | 3450 | } |
2844 | 3451 | ||
3452 | int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason) | ||
3453 | { | ||
3454 | struct kvm_segment tr_seg; | ||
3455 | struct desc_struct cseg_desc; | ||
3456 | struct desc_struct nseg_desc; | ||
3457 | int ret = 0; | ||
3458 | |||
3459 | get_segment(vcpu, &tr_seg, VCPU_SREG_TR); | ||
3460 | |||
3461 | if (load_guest_segment_descriptor(vcpu, tss_selector, &nseg_desc)) | ||
3462 | goto out; | ||
3463 | |||
3464 | if (load_guest_segment_descriptor(vcpu, tr_seg.selector, &cseg_desc)) | ||
3465 | goto out; | ||
3466 | |||
3467 | |||
3468 | if (reason != TASK_SWITCH_IRET) { | ||
3469 | int cpl; | ||
3470 | |||
3471 | cpl = kvm_x86_ops->get_cpl(vcpu); | ||
3472 | if ((tss_selector & 3) > nseg_desc.dpl || cpl > nseg_desc.dpl) { | ||
3473 | kvm_queue_exception_e(vcpu, GP_VECTOR, 0); | ||
3474 | return 1; | ||
3475 | } | ||
3476 | } | ||
3477 | |||
3478 | if (!nseg_desc.p || (nseg_desc.limit0 | nseg_desc.limit << 16) < 0x67) { | ||
3479 | kvm_queue_exception_e(vcpu, TS_VECTOR, tss_selector & 0xfffc); | ||
3480 | return 1; | ||
3481 | } | ||
3482 | |||
3483 | if (reason == TASK_SWITCH_IRET || reason == TASK_SWITCH_JMP) { | ||
3484 | cseg_desc.type &= ~(1 << 8); //clear the B flag | ||
3485 | save_guest_segment_descriptor(vcpu, tr_seg.selector, | ||
3486 | &cseg_desc); | ||
3487 | } | ||
3488 | |||
3489 | if (reason == TASK_SWITCH_IRET) { | ||
3490 | u32 eflags = kvm_x86_ops->get_rflags(vcpu); | ||
3491 | kvm_x86_ops->set_rflags(vcpu, eflags & ~X86_EFLAGS_NT); | ||
3492 | } | ||
3493 | |||
3494 | kvm_x86_ops->skip_emulated_instruction(vcpu); | ||
3495 | kvm_x86_ops->cache_regs(vcpu); | ||
3496 | |||
3497 | if (nseg_desc.type & 8) | ||
3498 | ret = kvm_task_switch_32(vcpu, tss_selector, &cseg_desc, | ||
3499 | &nseg_desc); | ||
3500 | else | ||
3501 | ret = kvm_task_switch_16(vcpu, tss_selector, &cseg_desc, | ||
3502 | &nseg_desc); | ||
3503 | |||
3504 | if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE) { | ||
3505 | u32 eflags = kvm_x86_ops->get_rflags(vcpu); | ||
3506 | kvm_x86_ops->set_rflags(vcpu, eflags | X86_EFLAGS_NT); | ||
3507 | } | ||
3508 | |||
3509 | if (reason != TASK_SWITCH_IRET) { | ||
3510 | nseg_desc.type |= (1 << 8); | ||
3511 | save_guest_segment_descriptor(vcpu, tss_selector, | ||
3512 | &nseg_desc); | ||
3513 | } | ||
3514 | |||
3515 | kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 | X86_CR0_TS); | ||
3516 | seg_desct_to_kvm_desct(&nseg_desc, tss_selector, &tr_seg); | ||
3517 | tr_seg.type = 11; | ||
3518 | set_segment(vcpu, &tr_seg, VCPU_SREG_TR); | ||
3519 | out: | ||
3520 | kvm_x86_ops->decache_regs(vcpu); | ||
3521 | return ret; | ||
3522 | } | ||
3523 | EXPORT_SYMBOL_GPL(kvm_task_switch); | ||
3524 | |||
2845 | int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, | 3525 | int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, |
2846 | struct kvm_sregs *sregs) | 3526 | struct kvm_sregs *sregs) |
2847 | { | 3527 | { |
@@ -2862,12 +3542,10 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, | |||
2862 | mmu_reset_needed |= vcpu->arch.cr3 != sregs->cr3; | 3542 | mmu_reset_needed |= vcpu->arch.cr3 != sregs->cr3; |
2863 | vcpu->arch.cr3 = sregs->cr3; | 3543 | vcpu->arch.cr3 = sregs->cr3; |
2864 | 3544 | ||
2865 | set_cr8(vcpu, sregs->cr8); | 3545 | kvm_set_cr8(vcpu, sregs->cr8); |
2866 | 3546 | ||
2867 | mmu_reset_needed |= vcpu->arch.shadow_efer != sregs->efer; | 3547 | mmu_reset_needed |= vcpu->arch.shadow_efer != sregs->efer; |
2868 | #ifdef CONFIG_X86_64 | ||
2869 | kvm_x86_ops->set_efer(vcpu, sregs->efer); | 3548 | kvm_x86_ops->set_efer(vcpu, sregs->efer); |
2870 | #endif | ||
2871 | kvm_set_apic_base(vcpu, sregs->apic_base); | 3549 | kvm_set_apic_base(vcpu, sregs->apic_base); |
2872 | 3550 | ||
2873 | kvm_x86_ops->decache_cr4_guest_bits(vcpu); | 3551 | kvm_x86_ops->decache_cr4_guest_bits(vcpu); |
@@ -3141,9 +3819,9 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) | |||
3141 | 3819 | ||
3142 | vcpu->arch.mmu.root_hpa = INVALID_PAGE; | 3820 | vcpu->arch.mmu.root_hpa = INVALID_PAGE; |
3143 | if (!irqchip_in_kernel(kvm) || vcpu->vcpu_id == 0) | 3821 | if (!irqchip_in_kernel(kvm) || vcpu->vcpu_id == 0) |
3144 | vcpu->arch.mp_state = VCPU_MP_STATE_RUNNABLE; | 3822 | vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; |
3145 | else | 3823 | else |
3146 | vcpu->arch.mp_state = VCPU_MP_STATE_UNINITIALIZED; | 3824 | vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED; |
3147 | 3825 | ||
3148 | page = alloc_page(GFP_KERNEL | __GFP_ZERO); | 3826 | page = alloc_page(GFP_KERNEL | __GFP_ZERO); |
3149 | if (!page) { | 3827 | if (!page) { |
@@ -3175,7 +3853,9 @@ fail: | |||
3175 | void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) | 3853 | void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) |
3176 | { | 3854 | { |
3177 | kvm_free_lapic(vcpu); | 3855 | kvm_free_lapic(vcpu); |
3856 | down_read(&vcpu->kvm->slots_lock); | ||
3178 | kvm_mmu_destroy(vcpu); | 3857 | kvm_mmu_destroy(vcpu); |
3858 | up_read(&vcpu->kvm->slots_lock); | ||
3179 | free_page((unsigned long)vcpu->arch.pio_data); | 3859 | free_page((unsigned long)vcpu->arch.pio_data); |
3180 | } | 3860 | } |
3181 | 3861 | ||
@@ -3219,10 +3899,13 @@ static void kvm_free_vcpus(struct kvm *kvm) | |||
3219 | 3899 | ||
3220 | void kvm_arch_destroy_vm(struct kvm *kvm) | 3900 | void kvm_arch_destroy_vm(struct kvm *kvm) |
3221 | { | 3901 | { |
3902 | kvm_free_pit(kvm); | ||
3222 | kfree(kvm->arch.vpic); | 3903 | kfree(kvm->arch.vpic); |
3223 | kfree(kvm->arch.vioapic); | 3904 | kfree(kvm->arch.vioapic); |
3224 | kvm_free_vcpus(kvm); | 3905 | kvm_free_vcpus(kvm); |
3225 | kvm_free_physmem(kvm); | 3906 | kvm_free_physmem(kvm); |
3907 | if (kvm->arch.apic_access_page) | ||
3908 | put_page(kvm->arch.apic_access_page); | ||
3226 | kfree(kvm); | 3909 | kfree(kvm); |
3227 | } | 3910 | } |
3228 | 3911 | ||
@@ -3278,8 +3961,8 @@ int kvm_arch_set_memory_region(struct kvm *kvm, | |||
3278 | 3961 | ||
3279 | int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) | 3962 | int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) |
3280 | { | 3963 | { |
3281 | return vcpu->arch.mp_state == VCPU_MP_STATE_RUNNABLE | 3964 | return vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE |
3282 | || vcpu->arch.mp_state == VCPU_MP_STATE_SIPI_RECEIVED; | 3965 | || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED; |
3283 | } | 3966 | } |
3284 | 3967 | ||
3285 | static void vcpu_kick_intr(void *info) | 3968 | static void vcpu_kick_intr(void *info) |
@@ -3293,11 +3976,17 @@ static void vcpu_kick_intr(void *info) | |||
3293 | void kvm_vcpu_kick(struct kvm_vcpu *vcpu) | 3976 | void kvm_vcpu_kick(struct kvm_vcpu *vcpu) |
3294 | { | 3977 | { |
3295 | int ipi_pcpu = vcpu->cpu; | 3978 | int ipi_pcpu = vcpu->cpu; |
3979 | int cpu = get_cpu(); | ||
3296 | 3980 | ||
3297 | if (waitqueue_active(&vcpu->wq)) { | 3981 | if (waitqueue_active(&vcpu->wq)) { |
3298 | wake_up_interruptible(&vcpu->wq); | 3982 | wake_up_interruptible(&vcpu->wq); |
3299 | ++vcpu->stat.halt_wakeup; | 3983 | ++vcpu->stat.halt_wakeup; |
3300 | } | 3984 | } |
3301 | if (vcpu->guest_mode) | 3985 | /* |
3986 | * We may be called synchronously with irqs disabled in guest mode, | ||
3987 | * So need not to call smp_call_function_single() in that case. | ||
3988 | */ | ||
3989 | if (vcpu->guest_mode && vcpu->cpu != cpu) | ||
3302 | smp_call_function_single(ipi_pcpu, vcpu_kick_intr, vcpu, 0, 0); | 3990 | smp_call_function_single(ipi_pcpu, vcpu_kick_intr, vcpu, 0, 0); |
3991 | put_cpu(); | ||
3303 | } | 3992 | } |
diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c index 79586003397a..2ca08386f993 100644 --- a/arch/x86/kvm/x86_emulate.c +++ b/arch/x86/kvm/x86_emulate.c | |||
@@ -65,6 +65,14 @@ | |||
65 | #define MemAbs (1<<9) /* Memory operand is absolute displacement */ | 65 | #define MemAbs (1<<9) /* Memory operand is absolute displacement */ |
66 | #define String (1<<10) /* String instruction (rep capable) */ | 66 | #define String (1<<10) /* String instruction (rep capable) */ |
67 | #define Stack (1<<11) /* Stack instruction (push/pop) */ | 67 | #define Stack (1<<11) /* Stack instruction (push/pop) */ |
68 | #define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */ | ||
69 | #define GroupDual (1<<15) /* Alternate decoding of mod == 3 */ | ||
70 | #define GroupMask 0xff /* Group number stored in bits 0:7 */ | ||
71 | |||
72 | enum { | ||
73 | Group1_80, Group1_81, Group1_82, Group1_83, | ||
74 | Group1A, Group3_Byte, Group3, Group4, Group5, Group7, | ||
75 | }; | ||
68 | 76 | ||
69 | static u16 opcode_table[256] = { | 77 | static u16 opcode_table[256] = { |
70 | /* 0x00 - 0x07 */ | 78 | /* 0x00 - 0x07 */ |
@@ -123,14 +131,14 @@ static u16 opcode_table[256] = { | |||
123 | ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, | 131 | ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, |
124 | ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, | 132 | ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, |
125 | /* 0x80 - 0x87 */ | 133 | /* 0x80 - 0x87 */ |
126 | ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM, | 134 | Group | Group1_80, Group | Group1_81, |
127 | ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM, | 135 | Group | Group1_82, Group | Group1_83, |
128 | ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, | 136 | ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, |
129 | ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, | 137 | ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, |
130 | /* 0x88 - 0x8F */ | 138 | /* 0x88 - 0x8F */ |
131 | ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov, | 139 | ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov, |
132 | ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, | 140 | ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, |
133 | 0, ModRM | DstReg, 0, DstMem | SrcNone | ModRM | Mov | Stack, | 141 | 0, ModRM | DstReg, 0, Group | Group1A, |
134 | /* 0x90 - 0x9F */ | 142 | /* 0x90 - 0x9F */ |
135 | 0, 0, 0, 0, 0, 0, 0, 0, | 143 | 0, 0, 0, 0, 0, 0, 0, 0, |
136 | 0, 0, 0, 0, ImplicitOps | Stack, ImplicitOps | Stack, 0, 0, | 144 | 0, 0, 0, 0, ImplicitOps | Stack, ImplicitOps | Stack, 0, 0, |
@@ -164,16 +172,15 @@ static u16 opcode_table[256] = { | |||
164 | 0, 0, 0, 0, | 172 | 0, 0, 0, 0, |
165 | /* 0xF0 - 0xF7 */ | 173 | /* 0xF0 - 0xF7 */ |
166 | 0, 0, 0, 0, | 174 | 0, 0, 0, 0, |
167 | ImplicitOps, ImplicitOps, | 175 | ImplicitOps, ImplicitOps, Group | Group3_Byte, Group | Group3, |
168 | ByteOp | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM, | ||
169 | /* 0xF8 - 0xFF */ | 176 | /* 0xF8 - 0xFF */ |
170 | ImplicitOps, 0, ImplicitOps, ImplicitOps, | 177 | ImplicitOps, 0, ImplicitOps, ImplicitOps, |
171 | 0, 0, ByteOp | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM | 178 | 0, 0, Group | Group4, Group | Group5, |
172 | }; | 179 | }; |
173 | 180 | ||
174 | static u16 twobyte_table[256] = { | 181 | static u16 twobyte_table[256] = { |
175 | /* 0x00 - 0x0F */ | 182 | /* 0x00 - 0x0F */ |
176 | 0, SrcMem | ModRM | DstReg, 0, 0, 0, 0, ImplicitOps, 0, | 183 | 0, Group | GroupDual | Group7, 0, 0, 0, 0, ImplicitOps, 0, |
177 | ImplicitOps, ImplicitOps, 0, 0, 0, ImplicitOps | ModRM, 0, 0, | 184 | ImplicitOps, ImplicitOps, 0, 0, 0, ImplicitOps | ModRM, 0, 0, |
178 | /* 0x10 - 0x1F */ | 185 | /* 0x10 - 0x1F */ |
179 | 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, 0, | 186 | 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, 0, |
@@ -229,6 +236,56 @@ static u16 twobyte_table[256] = { | |||
229 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 | 236 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 |
230 | }; | 237 | }; |
231 | 238 | ||
239 | static u16 group_table[] = { | ||
240 | [Group1_80*8] = | ||
241 | ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, | ||
242 | ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, | ||
243 | ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, | ||
244 | ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, | ||
245 | [Group1_81*8] = | ||
246 | DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM, | ||
247 | DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM, | ||
248 | DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM, | ||
249 | DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM, | ||
250 | [Group1_82*8] = | ||
251 | ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, | ||
252 | ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, | ||
253 | ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, | ||
254 | ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, | ||
255 | [Group1_83*8] = | ||
256 | DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM, | ||
257 | DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM, | ||
258 | DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM, | ||
259 | DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM, | ||
260 | [Group1A*8] = | ||
261 | DstMem | SrcNone | ModRM | Mov | Stack, 0, 0, 0, 0, 0, 0, 0, | ||
262 | [Group3_Byte*8] = | ||
263 | ByteOp | SrcImm | DstMem | ModRM, 0, | ||
264 | ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM, | ||
265 | 0, 0, 0, 0, | ||
266 | [Group3*8] = | ||
267 | DstMem | SrcImm | ModRM | SrcImm, 0, | ||
268 | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM, | ||
269 | 0, 0, 0, 0, | ||
270 | [Group4*8] = | ||
271 | ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM, | ||
272 | 0, 0, 0, 0, 0, 0, | ||
273 | [Group5*8] = | ||
274 | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM, 0, 0, | ||
275 | SrcMem | ModRM, 0, SrcMem | ModRM | Stack, 0, | ||
276 | [Group7*8] = | ||
277 | 0, 0, ModRM | SrcMem, ModRM | SrcMem, | ||
278 | SrcNone | ModRM | DstMem | Mov, 0, | ||
279 | SrcMem16 | ModRM | Mov, SrcMem | ModRM | ByteOp, | ||
280 | }; | ||
281 | |||
282 | static u16 group2_table[] = { | ||
283 | [Group7*8] = | ||
284 | SrcNone | ModRM, 0, 0, 0, | ||
285 | SrcNone | ModRM | DstMem | Mov, 0, | ||
286 | SrcMem16 | ModRM | Mov, 0, | ||
287 | }; | ||
288 | |||
232 | /* EFLAGS bit definitions. */ | 289 | /* EFLAGS bit definitions. */ |
233 | #define EFLG_OF (1<<11) | 290 | #define EFLG_OF (1<<11) |
234 | #define EFLG_DF (1<<10) | 291 | #define EFLG_DF (1<<10) |
@@ -317,7 +374,7 @@ static u16 twobyte_table[256] = { | |||
317 | 374 | ||
318 | #define __emulate_2op(_op,_src,_dst,_eflags,_bx,_by,_wx,_wy,_lx,_ly,_qx,_qy) \ | 375 | #define __emulate_2op(_op,_src,_dst,_eflags,_bx,_by,_wx,_wy,_lx,_ly,_qx,_qy) \ |
319 | do { \ | 376 | do { \ |
320 | unsigned long _tmp; \ | 377 | unsigned long __tmp; \ |
321 | switch ((_dst).bytes) { \ | 378 | switch ((_dst).bytes) { \ |
322 | case 1: \ | 379 | case 1: \ |
323 | __asm__ __volatile__ ( \ | 380 | __asm__ __volatile__ ( \ |
@@ -325,7 +382,7 @@ static u16 twobyte_table[256] = { | |||
325 | _op"b %"_bx"3,%1; " \ | 382 | _op"b %"_bx"3,%1; " \ |
326 | _POST_EFLAGS("0", "4", "2") \ | 383 | _POST_EFLAGS("0", "4", "2") \ |
327 | : "=m" (_eflags), "=m" ((_dst).val), \ | 384 | : "=m" (_eflags), "=m" ((_dst).val), \ |
328 | "=&r" (_tmp) \ | 385 | "=&r" (__tmp) \ |
329 | : _by ((_src).val), "i" (EFLAGS_MASK)); \ | 386 | : _by ((_src).val), "i" (EFLAGS_MASK)); \ |
330 | break; \ | 387 | break; \ |
331 | default: \ | 388 | default: \ |
@@ -426,29 +483,40 @@ static u16 twobyte_table[256] = { | |||
426 | (_type)_x; \ | 483 | (_type)_x; \ |
427 | }) | 484 | }) |
428 | 485 | ||
486 | static inline unsigned long ad_mask(struct decode_cache *c) | ||
487 | { | ||
488 | return (1UL << (c->ad_bytes << 3)) - 1; | ||
489 | } | ||
490 | |||
429 | /* Access/update address held in a register, based on addressing mode. */ | 491 | /* Access/update address held in a register, based on addressing mode. */ |
430 | #define address_mask(reg) \ | 492 | static inline unsigned long |
431 | ((c->ad_bytes == sizeof(unsigned long)) ? \ | 493 | address_mask(struct decode_cache *c, unsigned long reg) |
432 | (reg) : ((reg) & ((1UL << (c->ad_bytes << 3)) - 1))) | 494 | { |
433 | #define register_address(base, reg) \ | 495 | if (c->ad_bytes == sizeof(unsigned long)) |
434 | ((base) + address_mask(reg)) | 496 | return reg; |
435 | #define register_address_increment(reg, inc) \ | 497 | else |
436 | do { \ | 498 | return reg & ad_mask(c); |
437 | /* signed type ensures sign extension to long */ \ | 499 | } |
438 | int _inc = (inc); \ | ||
439 | if (c->ad_bytes == sizeof(unsigned long)) \ | ||
440 | (reg) += _inc; \ | ||
441 | else \ | ||
442 | (reg) = ((reg) & \ | ||
443 | ~((1UL << (c->ad_bytes << 3)) - 1)) | \ | ||
444 | (((reg) + _inc) & \ | ||
445 | ((1UL << (c->ad_bytes << 3)) - 1)); \ | ||
446 | } while (0) | ||
447 | 500 | ||
448 | #define JMP_REL(rel) \ | 501 | static inline unsigned long |
449 | do { \ | 502 | register_address(struct decode_cache *c, unsigned long base, unsigned long reg) |
450 | register_address_increment(c->eip, rel); \ | 503 | { |
451 | } while (0) | 504 | return base + address_mask(c, reg); |
505 | } | ||
506 | |||
507 | static inline void | ||
508 | register_address_increment(struct decode_cache *c, unsigned long *reg, int inc) | ||
509 | { | ||
510 | if (c->ad_bytes == sizeof(unsigned long)) | ||
511 | *reg += inc; | ||
512 | else | ||
513 | *reg = (*reg & ~ad_mask(c)) | ((*reg + inc) & ad_mask(c)); | ||
514 | } | ||
515 | |||
516 | static inline void jmp_rel(struct decode_cache *c, int rel) | ||
517 | { | ||
518 | register_address_increment(c, &c->eip, rel); | ||
519 | } | ||
452 | 520 | ||
453 | static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt, | 521 | static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt, |
454 | struct x86_emulate_ops *ops, | 522 | struct x86_emulate_ops *ops, |
@@ -763,7 +831,7 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) | |||
763 | struct decode_cache *c = &ctxt->decode; | 831 | struct decode_cache *c = &ctxt->decode; |
764 | int rc = 0; | 832 | int rc = 0; |
765 | int mode = ctxt->mode; | 833 | int mode = ctxt->mode; |
766 | int def_op_bytes, def_ad_bytes; | 834 | int def_op_bytes, def_ad_bytes, group; |
767 | 835 | ||
768 | /* Shadow copy of register state. Committed on successful emulation. */ | 836 | /* Shadow copy of register state. Committed on successful emulation. */ |
769 | 837 | ||
@@ -864,12 +932,24 @@ done_prefixes: | |||
864 | c->b = insn_fetch(u8, 1, c->eip); | 932 | c->b = insn_fetch(u8, 1, c->eip); |
865 | c->d = twobyte_table[c->b]; | 933 | c->d = twobyte_table[c->b]; |
866 | } | 934 | } |
935 | } | ||
867 | 936 | ||
868 | /* Unrecognised? */ | 937 | if (c->d & Group) { |
869 | if (c->d == 0) { | 938 | group = c->d & GroupMask; |
870 | DPRINTF("Cannot emulate %02x\n", c->b); | 939 | c->modrm = insn_fetch(u8, 1, c->eip); |
871 | return -1; | 940 | --c->eip; |
872 | } | 941 | |
942 | group = (group << 3) + ((c->modrm >> 3) & 7); | ||
943 | if ((c->d & GroupDual) && (c->modrm >> 6) == 3) | ||
944 | c->d = group2_table[group]; | ||
945 | else | ||
946 | c->d = group_table[group]; | ||
947 | } | ||
948 | |||
949 | /* Unrecognised? */ | ||
950 | if (c->d == 0) { | ||
951 | DPRINTF("Cannot emulate %02x\n", c->b); | ||
952 | return -1; | ||
873 | } | 953 | } |
874 | 954 | ||
875 | if (mode == X86EMUL_MODE_PROT64 && (c->d & Stack)) | 955 | if (mode == X86EMUL_MODE_PROT64 && (c->d & Stack)) |
@@ -924,6 +1004,7 @@ done_prefixes: | |||
924 | */ | 1004 | */ |
925 | if ((c->d & ModRM) && c->modrm_mod == 3) { | 1005 | if ((c->d & ModRM) && c->modrm_mod == 3) { |
926 | c->src.type = OP_REG; | 1006 | c->src.type = OP_REG; |
1007 | c->src.val = c->modrm_val; | ||
927 | break; | 1008 | break; |
928 | } | 1009 | } |
929 | c->src.type = OP_MEM; | 1010 | c->src.type = OP_MEM; |
@@ -967,6 +1048,7 @@ done_prefixes: | |||
967 | case DstMem: | 1048 | case DstMem: |
968 | if ((c->d & ModRM) && c->modrm_mod == 3) { | 1049 | if ((c->d & ModRM) && c->modrm_mod == 3) { |
969 | c->dst.type = OP_REG; | 1050 | c->dst.type = OP_REG; |
1051 | c->dst.val = c->dst.orig_val = c->modrm_val; | ||
970 | break; | 1052 | break; |
971 | } | 1053 | } |
972 | c->dst.type = OP_MEM; | 1054 | c->dst.type = OP_MEM; |
@@ -984,8 +1066,8 @@ static inline void emulate_push(struct x86_emulate_ctxt *ctxt) | |||
984 | c->dst.type = OP_MEM; | 1066 | c->dst.type = OP_MEM; |
985 | c->dst.bytes = c->op_bytes; | 1067 | c->dst.bytes = c->op_bytes; |
986 | c->dst.val = c->src.val; | 1068 | c->dst.val = c->src.val; |
987 | register_address_increment(c->regs[VCPU_REGS_RSP], -c->op_bytes); | 1069 | register_address_increment(c, &c->regs[VCPU_REGS_RSP], -c->op_bytes); |
988 | c->dst.ptr = (void *) register_address(ctxt->ss_base, | 1070 | c->dst.ptr = (void *) register_address(c, ctxt->ss_base, |
989 | c->regs[VCPU_REGS_RSP]); | 1071 | c->regs[VCPU_REGS_RSP]); |
990 | } | 1072 | } |
991 | 1073 | ||
@@ -995,13 +1077,13 @@ static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt, | |||
995 | struct decode_cache *c = &ctxt->decode; | 1077 | struct decode_cache *c = &ctxt->decode; |
996 | int rc; | 1078 | int rc; |
997 | 1079 | ||
998 | rc = ops->read_std(register_address(ctxt->ss_base, | 1080 | rc = ops->read_std(register_address(c, ctxt->ss_base, |
999 | c->regs[VCPU_REGS_RSP]), | 1081 | c->regs[VCPU_REGS_RSP]), |
1000 | &c->dst.val, c->dst.bytes, ctxt->vcpu); | 1082 | &c->dst.val, c->dst.bytes, ctxt->vcpu); |
1001 | if (rc != 0) | 1083 | if (rc != 0) |
1002 | return rc; | 1084 | return rc; |
1003 | 1085 | ||
1004 | register_address_increment(c->regs[VCPU_REGS_RSP], c->dst.bytes); | 1086 | register_address_increment(c, &c->regs[VCPU_REGS_RSP], c->dst.bytes); |
1005 | 1087 | ||
1006 | return 0; | 1088 | return 0; |
1007 | } | 1089 | } |
@@ -1043,26 +1125,6 @@ static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt, | |||
1043 | 1125 | ||
1044 | switch (c->modrm_reg) { | 1126 | switch (c->modrm_reg) { |
1045 | case 0 ... 1: /* test */ | 1127 | case 0 ... 1: /* test */ |
1046 | /* | ||
1047 | * Special case in Grp3: test has an immediate | ||
1048 | * source operand. | ||
1049 | */ | ||
1050 | c->src.type = OP_IMM; | ||
1051 | c->src.ptr = (unsigned long *)c->eip; | ||
1052 | c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; | ||
1053 | if (c->src.bytes == 8) | ||
1054 | c->src.bytes = 4; | ||
1055 | switch (c->src.bytes) { | ||
1056 | case 1: | ||
1057 | c->src.val = insn_fetch(s8, 1, c->eip); | ||
1058 | break; | ||
1059 | case 2: | ||
1060 | c->src.val = insn_fetch(s16, 2, c->eip); | ||
1061 | break; | ||
1062 | case 4: | ||
1063 | c->src.val = insn_fetch(s32, 4, c->eip); | ||
1064 | break; | ||
1065 | } | ||
1066 | emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags); | 1128 | emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags); |
1067 | break; | 1129 | break; |
1068 | case 2: /* not */ | 1130 | case 2: /* not */ |
@@ -1076,7 +1138,6 @@ static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt, | |||
1076 | rc = X86EMUL_UNHANDLEABLE; | 1138 | rc = X86EMUL_UNHANDLEABLE; |
1077 | break; | 1139 | break; |
1078 | } | 1140 | } |
1079 | done: | ||
1080 | return rc; | 1141 | return rc; |
1081 | } | 1142 | } |
1082 | 1143 | ||
@@ -1084,7 +1145,6 @@ static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt, | |||
1084 | struct x86_emulate_ops *ops) | 1145 | struct x86_emulate_ops *ops) |
1085 | { | 1146 | { |
1086 | struct decode_cache *c = &ctxt->decode; | 1147 | struct decode_cache *c = &ctxt->decode; |
1087 | int rc; | ||
1088 | 1148 | ||
1089 | switch (c->modrm_reg) { | 1149 | switch (c->modrm_reg) { |
1090 | case 0: /* inc */ | 1150 | case 0: /* inc */ |
@@ -1094,36 +1154,11 @@ static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt, | |||
1094 | emulate_1op("dec", c->dst, ctxt->eflags); | 1154 | emulate_1op("dec", c->dst, ctxt->eflags); |
1095 | break; | 1155 | break; |
1096 | case 4: /* jmp abs */ | 1156 | case 4: /* jmp abs */ |
1097 | if (c->b == 0xff) | 1157 | c->eip = c->src.val; |
1098 | c->eip = c->dst.val; | ||
1099 | else { | ||
1100 | DPRINTF("Cannot emulate %02x\n", c->b); | ||
1101 | return X86EMUL_UNHANDLEABLE; | ||
1102 | } | ||
1103 | break; | 1158 | break; |
1104 | case 6: /* push */ | 1159 | case 6: /* push */ |
1105 | 1160 | emulate_push(ctxt); | |
1106 | /* 64-bit mode: PUSH always pushes a 64-bit operand. */ | ||
1107 | |||
1108 | if (ctxt->mode == X86EMUL_MODE_PROT64) { | ||
1109 | c->dst.bytes = 8; | ||
1110 | rc = ops->read_std((unsigned long)c->dst.ptr, | ||
1111 | &c->dst.val, 8, ctxt->vcpu); | ||
1112 | if (rc != 0) | ||
1113 | return rc; | ||
1114 | } | ||
1115 | register_address_increment(c->regs[VCPU_REGS_RSP], | ||
1116 | -c->dst.bytes); | ||
1117 | rc = ops->write_emulated(register_address(ctxt->ss_base, | ||
1118 | c->regs[VCPU_REGS_RSP]), &c->dst.val, | ||
1119 | c->dst.bytes, ctxt->vcpu); | ||
1120 | if (rc != 0) | ||
1121 | return rc; | ||
1122 | c->dst.type = OP_NONE; | ||
1123 | break; | 1161 | break; |
1124 | default: | ||
1125 | DPRINTF("Cannot emulate %02x\n", c->b); | ||
1126 | return X86EMUL_UNHANDLEABLE; | ||
1127 | } | 1162 | } |
1128 | return 0; | 1163 | return 0; |
1129 | } | 1164 | } |
@@ -1361,19 +1396,19 @@ special_insn: | |||
1361 | c->dst.type = OP_MEM; | 1396 | c->dst.type = OP_MEM; |
1362 | c->dst.bytes = c->op_bytes; | 1397 | c->dst.bytes = c->op_bytes; |
1363 | c->dst.val = c->src.val; | 1398 | c->dst.val = c->src.val; |
1364 | register_address_increment(c->regs[VCPU_REGS_RSP], | 1399 | register_address_increment(c, &c->regs[VCPU_REGS_RSP], |
1365 | -c->op_bytes); | 1400 | -c->op_bytes); |
1366 | c->dst.ptr = (void *) register_address( | 1401 | c->dst.ptr = (void *) register_address( |
1367 | ctxt->ss_base, c->regs[VCPU_REGS_RSP]); | 1402 | c, ctxt->ss_base, c->regs[VCPU_REGS_RSP]); |
1368 | break; | 1403 | break; |
1369 | case 0x58 ... 0x5f: /* pop reg */ | 1404 | case 0x58 ... 0x5f: /* pop reg */ |
1370 | pop_instruction: | 1405 | pop_instruction: |
1371 | if ((rc = ops->read_std(register_address(ctxt->ss_base, | 1406 | if ((rc = ops->read_std(register_address(c, ctxt->ss_base, |
1372 | c->regs[VCPU_REGS_RSP]), c->dst.ptr, | 1407 | c->regs[VCPU_REGS_RSP]), c->dst.ptr, |
1373 | c->op_bytes, ctxt->vcpu)) != 0) | 1408 | c->op_bytes, ctxt->vcpu)) != 0) |
1374 | goto done; | 1409 | goto done; |
1375 | 1410 | ||
1376 | register_address_increment(c->regs[VCPU_REGS_RSP], | 1411 | register_address_increment(c, &c->regs[VCPU_REGS_RSP], |
1377 | c->op_bytes); | 1412 | c->op_bytes); |
1378 | c->dst.type = OP_NONE; /* Disable writeback. */ | 1413 | c->dst.type = OP_NONE; /* Disable writeback. */ |
1379 | break; | 1414 | break; |
@@ -1393,9 +1428,9 @@ special_insn: | |||
1393 | 1, | 1428 | 1, |
1394 | (c->d & ByteOp) ? 1 : c->op_bytes, | 1429 | (c->d & ByteOp) ? 1 : c->op_bytes, |
1395 | c->rep_prefix ? | 1430 | c->rep_prefix ? |
1396 | address_mask(c->regs[VCPU_REGS_RCX]) : 1, | 1431 | address_mask(c, c->regs[VCPU_REGS_RCX]) : 1, |
1397 | (ctxt->eflags & EFLG_DF), | 1432 | (ctxt->eflags & EFLG_DF), |
1398 | register_address(ctxt->es_base, | 1433 | register_address(c, ctxt->es_base, |
1399 | c->regs[VCPU_REGS_RDI]), | 1434 | c->regs[VCPU_REGS_RDI]), |
1400 | c->rep_prefix, | 1435 | c->rep_prefix, |
1401 | c->regs[VCPU_REGS_RDX]) == 0) { | 1436 | c->regs[VCPU_REGS_RDX]) == 0) { |
@@ -1409,9 +1444,9 @@ special_insn: | |||
1409 | 0, | 1444 | 0, |
1410 | (c->d & ByteOp) ? 1 : c->op_bytes, | 1445 | (c->d & ByteOp) ? 1 : c->op_bytes, |
1411 | c->rep_prefix ? | 1446 | c->rep_prefix ? |
1412 | address_mask(c->regs[VCPU_REGS_RCX]) : 1, | 1447 | address_mask(c, c->regs[VCPU_REGS_RCX]) : 1, |
1413 | (ctxt->eflags & EFLG_DF), | 1448 | (ctxt->eflags & EFLG_DF), |
1414 | register_address(c->override_base ? | 1449 | register_address(c, c->override_base ? |
1415 | *c->override_base : | 1450 | *c->override_base : |
1416 | ctxt->ds_base, | 1451 | ctxt->ds_base, |
1417 | c->regs[VCPU_REGS_RSI]), | 1452 | c->regs[VCPU_REGS_RSI]), |
@@ -1425,7 +1460,7 @@ special_insn: | |||
1425 | int rel = insn_fetch(s8, 1, c->eip); | 1460 | int rel = insn_fetch(s8, 1, c->eip); |
1426 | 1461 | ||
1427 | if (test_cc(c->b, ctxt->eflags)) | 1462 | if (test_cc(c->b, ctxt->eflags)) |
1428 | JMP_REL(rel); | 1463 | jmp_rel(c, rel); |
1429 | break; | 1464 | break; |
1430 | } | 1465 | } |
1431 | case 0x80 ... 0x83: /* Grp1 */ | 1466 | case 0x80 ... 0x83: /* Grp1 */ |
@@ -1477,7 +1512,7 @@ special_insn: | |||
1477 | case 0x88 ... 0x8b: /* mov */ | 1512 | case 0x88 ... 0x8b: /* mov */ |
1478 | goto mov; | 1513 | goto mov; |
1479 | case 0x8d: /* lea r16/r32, m */ | 1514 | case 0x8d: /* lea r16/r32, m */ |
1480 | c->dst.val = c->modrm_val; | 1515 | c->dst.val = c->modrm_ea; |
1481 | break; | 1516 | break; |
1482 | case 0x8f: /* pop (sole member of Grp1a) */ | 1517 | case 0x8f: /* pop (sole member of Grp1a) */ |
1483 | rc = emulate_grp1a(ctxt, ops); | 1518 | rc = emulate_grp1a(ctxt, ops); |
@@ -1501,27 +1536,27 @@ special_insn: | |||
1501 | case 0xa4 ... 0xa5: /* movs */ | 1536 | case 0xa4 ... 0xa5: /* movs */ |
1502 | c->dst.type = OP_MEM; | 1537 | c->dst.type = OP_MEM; |
1503 | c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; | 1538 | c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; |
1504 | c->dst.ptr = (unsigned long *)register_address( | 1539 | c->dst.ptr = (unsigned long *)register_address(c, |
1505 | ctxt->es_base, | 1540 | ctxt->es_base, |
1506 | c->regs[VCPU_REGS_RDI]); | 1541 | c->regs[VCPU_REGS_RDI]); |
1507 | if ((rc = ops->read_emulated(register_address( | 1542 | if ((rc = ops->read_emulated(register_address(c, |
1508 | c->override_base ? *c->override_base : | 1543 | c->override_base ? *c->override_base : |
1509 | ctxt->ds_base, | 1544 | ctxt->ds_base, |
1510 | c->regs[VCPU_REGS_RSI]), | 1545 | c->regs[VCPU_REGS_RSI]), |
1511 | &c->dst.val, | 1546 | &c->dst.val, |
1512 | c->dst.bytes, ctxt->vcpu)) != 0) | 1547 | c->dst.bytes, ctxt->vcpu)) != 0) |
1513 | goto done; | 1548 | goto done; |
1514 | register_address_increment(c->regs[VCPU_REGS_RSI], | 1549 | register_address_increment(c, &c->regs[VCPU_REGS_RSI], |
1515 | (ctxt->eflags & EFLG_DF) ? -c->dst.bytes | 1550 | (ctxt->eflags & EFLG_DF) ? -c->dst.bytes |
1516 | : c->dst.bytes); | 1551 | : c->dst.bytes); |
1517 | register_address_increment(c->regs[VCPU_REGS_RDI], | 1552 | register_address_increment(c, &c->regs[VCPU_REGS_RDI], |
1518 | (ctxt->eflags & EFLG_DF) ? -c->dst.bytes | 1553 | (ctxt->eflags & EFLG_DF) ? -c->dst.bytes |
1519 | : c->dst.bytes); | 1554 | : c->dst.bytes); |
1520 | break; | 1555 | break; |
1521 | case 0xa6 ... 0xa7: /* cmps */ | 1556 | case 0xa6 ... 0xa7: /* cmps */ |
1522 | c->src.type = OP_NONE; /* Disable writeback. */ | 1557 | c->src.type = OP_NONE; /* Disable writeback. */ |
1523 | c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; | 1558 | c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; |
1524 | c->src.ptr = (unsigned long *)register_address( | 1559 | c->src.ptr = (unsigned long *)register_address(c, |
1525 | c->override_base ? *c->override_base : | 1560 | c->override_base ? *c->override_base : |
1526 | ctxt->ds_base, | 1561 | ctxt->ds_base, |
1527 | c->regs[VCPU_REGS_RSI]); | 1562 | c->regs[VCPU_REGS_RSI]); |
@@ -1533,7 +1568,7 @@ special_insn: | |||
1533 | 1568 | ||
1534 | c->dst.type = OP_NONE; /* Disable writeback. */ | 1569 | c->dst.type = OP_NONE; /* Disable writeback. */ |
1535 | c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; | 1570 | c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; |
1536 | c->dst.ptr = (unsigned long *)register_address( | 1571 | c->dst.ptr = (unsigned long *)register_address(c, |
1537 | ctxt->es_base, | 1572 | ctxt->es_base, |
1538 | c->regs[VCPU_REGS_RDI]); | 1573 | c->regs[VCPU_REGS_RDI]); |
1539 | if ((rc = ops->read_emulated((unsigned long)c->dst.ptr, | 1574 | if ((rc = ops->read_emulated((unsigned long)c->dst.ptr, |
@@ -1546,10 +1581,10 @@ special_insn: | |||
1546 | 1581 | ||
1547 | emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags); | 1582 | emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags); |
1548 | 1583 | ||
1549 | register_address_increment(c->regs[VCPU_REGS_RSI], | 1584 | register_address_increment(c, &c->regs[VCPU_REGS_RSI], |
1550 | (ctxt->eflags & EFLG_DF) ? -c->src.bytes | 1585 | (ctxt->eflags & EFLG_DF) ? -c->src.bytes |
1551 | : c->src.bytes); | 1586 | : c->src.bytes); |
1552 | register_address_increment(c->regs[VCPU_REGS_RDI], | 1587 | register_address_increment(c, &c->regs[VCPU_REGS_RDI], |
1553 | (ctxt->eflags & EFLG_DF) ? -c->dst.bytes | 1588 | (ctxt->eflags & EFLG_DF) ? -c->dst.bytes |
1554 | : c->dst.bytes); | 1589 | : c->dst.bytes); |
1555 | 1590 | ||
@@ -1557,11 +1592,11 @@ special_insn: | |||
1557 | case 0xaa ... 0xab: /* stos */ | 1592 | case 0xaa ... 0xab: /* stos */ |
1558 | c->dst.type = OP_MEM; | 1593 | c->dst.type = OP_MEM; |
1559 | c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; | 1594 | c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; |
1560 | c->dst.ptr = (unsigned long *)register_address( | 1595 | c->dst.ptr = (unsigned long *)register_address(c, |
1561 | ctxt->es_base, | 1596 | ctxt->es_base, |
1562 | c->regs[VCPU_REGS_RDI]); | 1597 | c->regs[VCPU_REGS_RDI]); |
1563 | c->dst.val = c->regs[VCPU_REGS_RAX]; | 1598 | c->dst.val = c->regs[VCPU_REGS_RAX]; |
1564 | register_address_increment(c->regs[VCPU_REGS_RDI], | 1599 | register_address_increment(c, &c->regs[VCPU_REGS_RDI], |
1565 | (ctxt->eflags & EFLG_DF) ? -c->dst.bytes | 1600 | (ctxt->eflags & EFLG_DF) ? -c->dst.bytes |
1566 | : c->dst.bytes); | 1601 | : c->dst.bytes); |
1567 | break; | 1602 | break; |
@@ -1569,7 +1604,7 @@ special_insn: | |||
1569 | c->dst.type = OP_REG; | 1604 | c->dst.type = OP_REG; |
1570 | c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; | 1605 | c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; |
1571 | c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX]; | 1606 | c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX]; |
1572 | if ((rc = ops->read_emulated(register_address( | 1607 | if ((rc = ops->read_emulated(register_address(c, |
1573 | c->override_base ? *c->override_base : | 1608 | c->override_base ? *c->override_base : |
1574 | ctxt->ds_base, | 1609 | ctxt->ds_base, |
1575 | c->regs[VCPU_REGS_RSI]), | 1610 | c->regs[VCPU_REGS_RSI]), |
@@ -1577,7 +1612,7 @@ special_insn: | |||
1577 | c->dst.bytes, | 1612 | c->dst.bytes, |
1578 | ctxt->vcpu)) != 0) | 1613 | ctxt->vcpu)) != 0) |
1579 | goto done; | 1614 | goto done; |
1580 | register_address_increment(c->regs[VCPU_REGS_RSI], | 1615 | register_address_increment(c, &c->regs[VCPU_REGS_RSI], |
1581 | (ctxt->eflags & EFLG_DF) ? -c->dst.bytes | 1616 | (ctxt->eflags & EFLG_DF) ? -c->dst.bytes |
1582 | : c->dst.bytes); | 1617 | : c->dst.bytes); |
1583 | break; | 1618 | break; |
@@ -1616,14 +1651,14 @@ special_insn: | |||
1616 | goto cannot_emulate; | 1651 | goto cannot_emulate; |
1617 | } | 1652 | } |
1618 | c->src.val = (unsigned long) c->eip; | 1653 | c->src.val = (unsigned long) c->eip; |
1619 | JMP_REL(rel); | 1654 | jmp_rel(c, rel); |
1620 | c->op_bytes = c->ad_bytes; | 1655 | c->op_bytes = c->ad_bytes; |
1621 | emulate_push(ctxt); | 1656 | emulate_push(ctxt); |
1622 | break; | 1657 | break; |
1623 | } | 1658 | } |
1624 | case 0xe9: /* jmp rel */ | 1659 | case 0xe9: /* jmp rel */ |
1625 | case 0xeb: /* jmp rel short */ | 1660 | case 0xeb: /* jmp rel short */ |
1626 | JMP_REL(c->src.val); | 1661 | jmp_rel(c, c->src.val); |
1627 | c->dst.type = OP_NONE; /* Disable writeback. */ | 1662 | c->dst.type = OP_NONE; /* Disable writeback. */ |
1628 | break; | 1663 | break; |
1629 | case 0xf4: /* hlt */ | 1664 | case 0xf4: /* hlt */ |
@@ -1690,6 +1725,8 @@ twobyte_insn: | |||
1690 | goto done; | 1725 | goto done; |
1691 | 1726 | ||
1692 | kvm_emulate_hypercall(ctxt->vcpu); | 1727 | kvm_emulate_hypercall(ctxt->vcpu); |
1728 | /* Disable writeback. */ | ||
1729 | c->dst.type = OP_NONE; | ||
1693 | break; | 1730 | break; |
1694 | case 2: /* lgdt */ | 1731 | case 2: /* lgdt */ |
1695 | rc = read_descriptor(ctxt, ops, c->src.ptr, | 1732 | rc = read_descriptor(ctxt, ops, c->src.ptr, |
@@ -1697,6 +1734,8 @@ twobyte_insn: | |||
1697 | if (rc) | 1734 | if (rc) |
1698 | goto done; | 1735 | goto done; |
1699 | realmode_lgdt(ctxt->vcpu, size, address); | 1736 | realmode_lgdt(ctxt->vcpu, size, address); |
1737 | /* Disable writeback. */ | ||
1738 | c->dst.type = OP_NONE; | ||
1700 | break; | 1739 | break; |
1701 | case 3: /* lidt/vmmcall */ | 1740 | case 3: /* lidt/vmmcall */ |
1702 | if (c->modrm_mod == 3 && c->modrm_rm == 1) { | 1741 | if (c->modrm_mod == 3 && c->modrm_rm == 1) { |
@@ -1712,27 +1751,25 @@ twobyte_insn: | |||
1712 | goto done; | 1751 | goto done; |
1713 | realmode_lidt(ctxt->vcpu, size, address); | 1752 | realmode_lidt(ctxt->vcpu, size, address); |
1714 | } | 1753 | } |
1754 | /* Disable writeback. */ | ||
1755 | c->dst.type = OP_NONE; | ||
1715 | break; | 1756 | break; |
1716 | case 4: /* smsw */ | 1757 | case 4: /* smsw */ |
1717 | if (c->modrm_mod != 3) | 1758 | c->dst.bytes = 2; |
1718 | goto cannot_emulate; | 1759 | c->dst.val = realmode_get_cr(ctxt->vcpu, 0); |
1719 | *(u16 *)&c->regs[c->modrm_rm] | ||
1720 | = realmode_get_cr(ctxt->vcpu, 0); | ||
1721 | break; | 1760 | break; |
1722 | case 6: /* lmsw */ | 1761 | case 6: /* lmsw */ |
1723 | if (c->modrm_mod != 3) | 1762 | realmode_lmsw(ctxt->vcpu, (u16)c->src.val, |
1724 | goto cannot_emulate; | 1763 | &ctxt->eflags); |
1725 | realmode_lmsw(ctxt->vcpu, (u16)c->modrm_val, | ||
1726 | &ctxt->eflags); | ||
1727 | break; | 1764 | break; |
1728 | case 7: /* invlpg*/ | 1765 | case 7: /* invlpg*/ |
1729 | emulate_invlpg(ctxt->vcpu, memop); | 1766 | emulate_invlpg(ctxt->vcpu, memop); |
1767 | /* Disable writeback. */ | ||
1768 | c->dst.type = OP_NONE; | ||
1730 | break; | 1769 | break; |
1731 | default: | 1770 | default: |
1732 | goto cannot_emulate; | 1771 | goto cannot_emulate; |
1733 | } | 1772 | } |
1734 | /* Disable writeback. */ | ||
1735 | c->dst.type = OP_NONE; | ||
1736 | break; | 1773 | break; |
1737 | case 0x06: | 1774 | case 0x06: |
1738 | emulate_clts(ctxt->vcpu); | 1775 | emulate_clts(ctxt->vcpu); |
@@ -1823,7 +1860,7 @@ twobyte_insn: | |||
1823 | goto cannot_emulate; | 1860 | goto cannot_emulate; |
1824 | } | 1861 | } |
1825 | if (test_cc(c->b, ctxt->eflags)) | 1862 | if (test_cc(c->b, ctxt->eflags)) |
1826 | JMP_REL(rel); | 1863 | jmp_rel(c, rel); |
1827 | c->dst.type = OP_NONE; | 1864 | c->dst.type = OP_NONE; |
1828 | break; | 1865 | break; |
1829 | } | 1866 | } |