aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2008-04-27 13:13:52 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2008-04-27 13:13:52 -0400
commit42cadc86008aae0fd9ff31642dc01ed50723cf32 (patch)
treeb05d4c8f0561bad5a0183a89fb23ce4c8ee1653c /arch/x86
parentfba5c1af5c4fd6645fe62ea84ccde0981282cf66 (diff)
parent66c0b394f08fd89236515c1c84485ea712a157be (diff)
Merge branch 'kvm-updates-2.6.26' of git://git.kernel.org/pub/scm/linux/kernel/git/avi/kvm
* 'kvm-updates-2.6.26' of git://git.kernel.org/pub/scm/linux/kernel/git/avi/kvm: (147 commits) KVM: kill file->f_count abuse in kvm KVM: MMU: kvm_pv_mmu_op should not take mmap_sem KVM: SVM: remove selective CR0 comment KVM: SVM: remove now obsolete FIXME comment KVM: SVM: disable CR8 intercept when tpr is not masking interrupts KVM: SVM: sync V_TPR with LAPIC.TPR if CR8 write intercept is disabled KVM: export kvm_lapic_set_tpr() to modules KVM: SVM: sync TPR value to V_TPR field in the VMCB KVM: ppc: PowerPC 440 KVM implementation KVM: Add MAINTAINERS entry for PowerPC KVM KVM: ppc: Add DCR access information to struct kvm_run ppc: Export tlb_44x_hwater for KVM KVM: Rename debugfs_dir to kvm_debugfs_dir KVM: x86 emulator: fix lea to really get the effective address KVM: x86 emulator: fix smsw and lmsw with a memory operand KVM: x86 emulator: initialize src.val and dst.val for register operands KVM: SVM: force a new asid when initializing the vmcb KVM: fix kvm_vcpu_kick vs __vcpu_run race KVM: add ioctls to save/store mpstate KVM: Rename VCPU_MP_STATE_* to KVM_MP_STATE_* ...
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig19
-rw-r--r--arch/x86/kernel/Makefile2
-rw-r--r--arch/x86/kernel/crash.c3
-rw-r--r--arch/x86/kernel/kvm.c248
-rw-r--r--arch/x86/kernel/kvmclock.c187
-rw-r--r--arch/x86/kernel/reboot.c13
-rw-r--r--arch/x86/kernel/setup_32.c6
-rw-r--r--arch/x86/kernel/setup_64.c7
-rw-r--r--arch/x86/kvm/Kconfig13
-rw-r--r--arch/x86/kvm/Makefile6
-rw-r--r--arch/x86/kvm/i8254.c611
-rw-r--r--arch/x86/kvm/i8254.h63
-rw-r--r--arch/x86/kvm/irq.c18
-rw-r--r--arch/x86/kvm/irq.h3
-rw-r--r--arch/x86/kvm/kvm_svm.h2
-rw-r--r--arch/x86/kvm/lapic.c35
-rw-r--r--arch/x86/kvm/mmu.c672
-rw-r--r--arch/x86/kvm/mmu.h6
-rw-r--r--arch/x86/kvm/paging_tmpl.h86
-rw-r--r--arch/x86/kvm/segment_descriptor.h29
-rw-r--r--arch/x86/kvm/svm.c352
-rw-r--r--arch/x86/kvm/svm.h3
-rw-r--r--arch/x86/kvm/tss.h59
-rw-r--r--arch/x86/kvm/vmx.c278
-rw-r--r--arch/x86/kvm/vmx.h10
-rw-r--r--arch/x86/kvm/x86.c897
-rw-r--r--arch/x86/kvm/x86_emulate.c285
27 files changed, 3365 insertions, 548 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 2fadf794483d..e5790fe9e330 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -373,6 +373,25 @@ config VMI
373 at the moment), by linking the kernel to a GPL-ed ROM module 373 at the moment), by linking the kernel to a GPL-ed ROM module
374 provided by the hypervisor. 374 provided by the hypervisor.
375 375
376config KVM_CLOCK
377 bool "KVM paravirtualized clock"
378 select PARAVIRT
379 depends on !(X86_VISWS || X86_VOYAGER)
380 help
381 Turning on this option will allow you to run a paravirtualized clock
382 when running over the KVM hypervisor. Instead of relying on a PIT
383 (or probably other) emulation by the underlying device model, the host
384 provides the guest with timing infrastructure such as time of day, and
385 system time
386
387config KVM_GUEST
388 bool "KVM Guest support"
389 select PARAVIRT
390 depends on !(X86_VISWS || X86_VOYAGER)
391 help
392 This option enables various optimizations for running under the KVM
393 hypervisor.
394
376source "arch/x86/lguest/Kconfig" 395source "arch/x86/lguest/Kconfig"
377 396
378config PARAVIRT 397config PARAVIRT
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 90e092d0af0c..fa19c3819540 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -80,6 +80,8 @@ obj-$(CONFIG_DEBUG_RODATA_TEST) += test_rodata.o
80obj-$(CONFIG_DEBUG_NX_TEST) += test_nx.o 80obj-$(CONFIG_DEBUG_NX_TEST) += test_nx.o
81 81
82obj-$(CONFIG_VMI) += vmi_32.o vmiclock_32.o 82obj-$(CONFIG_VMI) += vmi_32.o vmiclock_32.o
83obj-$(CONFIG_KVM_GUEST) += kvm.o
84obj-$(CONFIG_KVM_CLOCK) += kvmclock.o
83obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o 85obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o
84 86
85ifdef CONFIG_INPUT_PCSPKR 87ifdef CONFIG_INPUT_PCSPKR
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index 2251d0ae9570..268553817909 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -25,6 +25,7 @@
25#include <asm/hpet.h> 25#include <asm/hpet.h>
26#include <linux/kdebug.h> 26#include <linux/kdebug.h>
27#include <asm/smp.h> 27#include <asm/smp.h>
28#include <asm/reboot.h>
28 29
29#include <mach_ipi.h> 30#include <mach_ipi.h>
30 31
@@ -117,7 +118,7 @@ static void nmi_shootdown_cpus(void)
117} 118}
118#endif 119#endif
119 120
120void machine_crash_shutdown(struct pt_regs *regs) 121void native_machine_crash_shutdown(struct pt_regs *regs)
121{ 122{
122 /* This function is only called after the system 123 /* This function is only called after the system
123 * has panicked or is otherwise in a critical state. 124 * has panicked or is otherwise in a critical state.
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
new file mode 100644
index 000000000000..8b7a3cf37d2b
--- /dev/null
+++ b/arch/x86/kernel/kvm.c
@@ -0,0 +1,248 @@
1/*
2 * KVM paravirt_ops implementation
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
17 *
18 * Copyright (C) 2007, Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
19 * Copyright IBM Corporation, 2007
20 * Authors: Anthony Liguori <aliguori@us.ibm.com>
21 */
22
23#include <linux/module.h>
24#include <linux/kernel.h>
25#include <linux/kvm_para.h>
26#include <linux/cpu.h>
27#include <linux/mm.h>
28#include <linux/highmem.h>
29#include <linux/hardirq.h>
30
31#define MMU_QUEUE_SIZE 1024
32
33struct kvm_para_state {
34 u8 mmu_queue[MMU_QUEUE_SIZE];
35 int mmu_queue_len;
36 enum paravirt_lazy_mode mode;
37};
38
39static DEFINE_PER_CPU(struct kvm_para_state, para_state);
40
41static struct kvm_para_state *kvm_para_state(void)
42{
43 return &per_cpu(para_state, raw_smp_processor_id());
44}
45
46/*
47 * No need for any "IO delay" on KVM
48 */
49static void kvm_io_delay(void)
50{
51}
52
53static void kvm_mmu_op(void *buffer, unsigned len)
54{
55 int r;
56 unsigned long a1, a2;
57
58 do {
59 a1 = __pa(buffer);
60 a2 = 0; /* on i386 __pa() always returns <4G */
61 r = kvm_hypercall3(KVM_HC_MMU_OP, len, a1, a2);
62 buffer += r;
63 len -= r;
64 } while (len);
65}
66
67static void mmu_queue_flush(struct kvm_para_state *state)
68{
69 if (state->mmu_queue_len) {
70 kvm_mmu_op(state->mmu_queue, state->mmu_queue_len);
71 state->mmu_queue_len = 0;
72 }
73}
74
75static void kvm_deferred_mmu_op(void *buffer, int len)
76{
77 struct kvm_para_state *state = kvm_para_state();
78
79 if (state->mode != PARAVIRT_LAZY_MMU) {
80 kvm_mmu_op(buffer, len);
81 return;
82 }
83 if (state->mmu_queue_len + len > sizeof state->mmu_queue)
84 mmu_queue_flush(state);
85 memcpy(state->mmu_queue + state->mmu_queue_len, buffer, len);
86 state->mmu_queue_len += len;
87}
88
89static void kvm_mmu_write(void *dest, u64 val)
90{
91 __u64 pte_phys;
92 struct kvm_mmu_op_write_pte wpte;
93
94#ifdef CONFIG_HIGHPTE
95 struct page *page;
96 unsigned long dst = (unsigned long) dest;
97
98 page = kmap_atomic_to_page(dest);
99 pte_phys = page_to_pfn(page);
100 pte_phys <<= PAGE_SHIFT;
101 pte_phys += (dst & ~(PAGE_MASK));
102#else
103 pte_phys = (unsigned long)__pa(dest);
104#endif
105 wpte.header.op = KVM_MMU_OP_WRITE_PTE;
106 wpte.pte_val = val;
107 wpte.pte_phys = pte_phys;
108
109 kvm_deferred_mmu_op(&wpte, sizeof wpte);
110}
111
112/*
113 * We only need to hook operations that are MMU writes. We hook these so that
114 * we can use lazy MMU mode to batch these operations. We could probably
115 * improve the performance of the host code if we used some of the information
116 * here to simplify processing of batched writes.
117 */
118static void kvm_set_pte(pte_t *ptep, pte_t pte)
119{
120 kvm_mmu_write(ptep, pte_val(pte));
121}
122
123static void kvm_set_pte_at(struct mm_struct *mm, unsigned long addr,
124 pte_t *ptep, pte_t pte)
125{
126 kvm_mmu_write(ptep, pte_val(pte));
127}
128
129static void kvm_set_pmd(pmd_t *pmdp, pmd_t pmd)
130{
131 kvm_mmu_write(pmdp, pmd_val(pmd));
132}
133
134#if PAGETABLE_LEVELS >= 3
135#ifdef CONFIG_X86_PAE
136static void kvm_set_pte_atomic(pte_t *ptep, pte_t pte)
137{
138 kvm_mmu_write(ptep, pte_val(pte));
139}
140
141static void kvm_set_pte_present(struct mm_struct *mm, unsigned long addr,
142 pte_t *ptep, pte_t pte)
143{
144 kvm_mmu_write(ptep, pte_val(pte));
145}
146
147static void kvm_pte_clear(struct mm_struct *mm,
148 unsigned long addr, pte_t *ptep)
149{
150 kvm_mmu_write(ptep, 0);
151}
152
153static void kvm_pmd_clear(pmd_t *pmdp)
154{
155 kvm_mmu_write(pmdp, 0);
156}
157#endif
158
159static void kvm_set_pud(pud_t *pudp, pud_t pud)
160{
161 kvm_mmu_write(pudp, pud_val(pud));
162}
163
164#if PAGETABLE_LEVELS == 4
165static void kvm_set_pgd(pgd_t *pgdp, pgd_t pgd)
166{
167 kvm_mmu_write(pgdp, pgd_val(pgd));
168}
169#endif
170#endif /* PAGETABLE_LEVELS >= 3 */
171
172static void kvm_flush_tlb(void)
173{
174 struct kvm_mmu_op_flush_tlb ftlb = {
175 .header.op = KVM_MMU_OP_FLUSH_TLB,
176 };
177
178 kvm_deferred_mmu_op(&ftlb, sizeof ftlb);
179}
180
181static void kvm_release_pt(u32 pfn)
182{
183 struct kvm_mmu_op_release_pt rpt = {
184 .header.op = KVM_MMU_OP_RELEASE_PT,
185 .pt_phys = (u64)pfn << PAGE_SHIFT,
186 };
187
188 kvm_mmu_op(&rpt, sizeof rpt);
189}
190
191static void kvm_enter_lazy_mmu(void)
192{
193 struct kvm_para_state *state = kvm_para_state();
194
195 paravirt_enter_lazy_mmu();
196 state->mode = paravirt_get_lazy_mode();
197}
198
199static void kvm_leave_lazy_mmu(void)
200{
201 struct kvm_para_state *state = kvm_para_state();
202
203 mmu_queue_flush(state);
204 paravirt_leave_lazy(paravirt_get_lazy_mode());
205 state->mode = paravirt_get_lazy_mode();
206}
207
208static void paravirt_ops_setup(void)
209{
210 pv_info.name = "KVM";
211 pv_info.paravirt_enabled = 1;
212
213 if (kvm_para_has_feature(KVM_FEATURE_NOP_IO_DELAY))
214 pv_cpu_ops.io_delay = kvm_io_delay;
215
216 if (kvm_para_has_feature(KVM_FEATURE_MMU_OP)) {
217 pv_mmu_ops.set_pte = kvm_set_pte;
218 pv_mmu_ops.set_pte_at = kvm_set_pte_at;
219 pv_mmu_ops.set_pmd = kvm_set_pmd;
220#if PAGETABLE_LEVELS >= 3
221#ifdef CONFIG_X86_PAE
222 pv_mmu_ops.set_pte_atomic = kvm_set_pte_atomic;
223 pv_mmu_ops.set_pte_present = kvm_set_pte_present;
224 pv_mmu_ops.pte_clear = kvm_pte_clear;
225 pv_mmu_ops.pmd_clear = kvm_pmd_clear;
226#endif
227 pv_mmu_ops.set_pud = kvm_set_pud;
228#if PAGETABLE_LEVELS == 4
229 pv_mmu_ops.set_pgd = kvm_set_pgd;
230#endif
231#endif
232 pv_mmu_ops.flush_tlb_user = kvm_flush_tlb;
233 pv_mmu_ops.release_pte = kvm_release_pt;
234 pv_mmu_ops.release_pmd = kvm_release_pt;
235 pv_mmu_ops.release_pud = kvm_release_pt;
236
237 pv_mmu_ops.lazy_mode.enter = kvm_enter_lazy_mmu;
238 pv_mmu_ops.lazy_mode.leave = kvm_leave_lazy_mmu;
239 }
240}
241
242void __init kvm_guest_init(void)
243{
244 if (!kvm_para_available())
245 return;
246
247 paravirt_ops_setup();
248}
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
new file mode 100644
index 000000000000..ddee04043aeb
--- /dev/null
+++ b/arch/x86/kernel/kvmclock.c
@@ -0,0 +1,187 @@
1/* KVM paravirtual clock driver. A clocksource implementation
2 Copyright (C) 2008 Glauber de Oliveira Costa, Red Hat Inc.
3
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2 of the License, or
7 (at your option) any later version.
8
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17*/
18
19#include <linux/clocksource.h>
20#include <linux/kvm_para.h>
21#include <asm/arch_hooks.h>
22#include <asm/msr.h>
23#include <asm/apic.h>
24#include <linux/percpu.h>
25#include <asm/reboot.h>
26
27#define KVM_SCALE 22
28
29static int kvmclock = 1;
30
31static int parse_no_kvmclock(char *arg)
32{
33 kvmclock = 0;
34 return 0;
35}
36early_param("no-kvmclock", parse_no_kvmclock);
37
38/* The hypervisor will put information about time periodically here */
39static DEFINE_PER_CPU_SHARED_ALIGNED(struct kvm_vcpu_time_info, hv_clock);
40#define get_clock(cpu, field) per_cpu(hv_clock, cpu).field
41
42static inline u64 kvm_get_delta(u64 last_tsc)
43{
44 int cpu = smp_processor_id();
45 u64 delta = native_read_tsc() - last_tsc;
46 return (delta * get_clock(cpu, tsc_to_system_mul)) >> KVM_SCALE;
47}
48
49static struct kvm_wall_clock wall_clock;
50static cycle_t kvm_clock_read(void);
51/*
52 * The wallclock is the time of day when we booted. Since then, some time may
53 * have elapsed since the hypervisor wrote the data. So we try to account for
54 * that with system time
55 */
56unsigned long kvm_get_wallclock(void)
57{
58 u32 wc_sec, wc_nsec;
59 u64 delta;
60 struct timespec ts;
61 int version, nsec;
62 int low, high;
63
64 low = (int)__pa(&wall_clock);
65 high = ((u64)__pa(&wall_clock) >> 32);
66
67 delta = kvm_clock_read();
68
69 native_write_msr(MSR_KVM_WALL_CLOCK, low, high);
70 do {
71 version = wall_clock.wc_version;
72 rmb();
73 wc_sec = wall_clock.wc_sec;
74 wc_nsec = wall_clock.wc_nsec;
75 rmb();
76 } while ((wall_clock.wc_version != version) || (version & 1));
77
78 delta = kvm_clock_read() - delta;
79 delta += wc_nsec;
80 nsec = do_div(delta, NSEC_PER_SEC);
81 set_normalized_timespec(&ts, wc_sec + delta, nsec);
82 /*
83 * Of all mechanisms of time adjustment I've tested, this one
84 * was the champion!
85 */
86 return ts.tv_sec + 1;
87}
88
89int kvm_set_wallclock(unsigned long now)
90{
91 return 0;
92}
93
94/*
95 * This is our read_clock function. The host puts an tsc timestamp each time
96 * it updates a new time. Without the tsc adjustment, we can have a situation
97 * in which a vcpu starts to run earlier (smaller system_time), but probes
98 * time later (compared to another vcpu), leading to backwards time
99 */
100static cycle_t kvm_clock_read(void)
101{
102 u64 last_tsc, now;
103 int cpu;
104
105 preempt_disable();
106 cpu = smp_processor_id();
107
108 last_tsc = get_clock(cpu, tsc_timestamp);
109 now = get_clock(cpu, system_time);
110
111 now += kvm_get_delta(last_tsc);
112 preempt_enable();
113
114 return now;
115}
116static struct clocksource kvm_clock = {
117 .name = "kvm-clock",
118 .read = kvm_clock_read,
119 .rating = 400,
120 .mask = CLOCKSOURCE_MASK(64),
121 .mult = 1 << KVM_SCALE,
122 .shift = KVM_SCALE,
123 .flags = CLOCK_SOURCE_IS_CONTINUOUS,
124};
125
126static int kvm_register_clock(void)
127{
128 int cpu = smp_processor_id();
129 int low, high;
130 low = (int)__pa(&per_cpu(hv_clock, cpu)) | 1;
131 high = ((u64)__pa(&per_cpu(hv_clock, cpu)) >> 32);
132
133 return native_write_msr_safe(MSR_KVM_SYSTEM_TIME, low, high);
134}
135
136static void kvm_setup_secondary_clock(void)
137{
138 /*
139 * Now that the first cpu already had this clocksource initialized,
140 * we shouldn't fail.
141 */
142 WARN_ON(kvm_register_clock());
143 /* ok, done with our trickery, call native */
144 setup_secondary_APIC_clock();
145}
146
147/*
148 * After the clock is registered, the host will keep writing to the
149 * registered memory location. If the guest happens to shutdown, this memory
150 * won't be valid. In cases like kexec, in which you install a new kernel, this
151 * means a random memory location will be kept being written. So before any
152 * kind of shutdown from our side, we unregister the clock by writting anything
153 * that does not have the 'enable' bit set in the msr
154 */
155#ifdef CONFIG_KEXEC
156static void kvm_crash_shutdown(struct pt_regs *regs)
157{
158 native_write_msr_safe(MSR_KVM_SYSTEM_TIME, 0, 0);
159 native_machine_crash_shutdown(regs);
160}
161#endif
162
163static void kvm_shutdown(void)
164{
165 native_write_msr_safe(MSR_KVM_SYSTEM_TIME, 0, 0);
166 native_machine_shutdown();
167}
168
169void __init kvmclock_init(void)
170{
171 if (!kvm_para_available())
172 return;
173
174 if (kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)) {
175 if (kvm_register_clock())
176 return;
177 pv_time_ops.get_wallclock = kvm_get_wallclock;
178 pv_time_ops.set_wallclock = kvm_set_wallclock;
179 pv_time_ops.sched_clock = kvm_clock_read;
180 pv_apic_ops.setup_secondary_clock = kvm_setup_secondary_clock;
181 machine_ops.shutdown = kvm_shutdown;
182#ifdef CONFIG_KEXEC
183 machine_ops.crash_shutdown = kvm_crash_shutdown;
184#endif
185 clocksource_register(&kvm_clock);
186 }
187}
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 1791a751a772..a4a838306b2c 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -399,7 +399,7 @@ static void native_machine_emergency_restart(void)
399 } 399 }
400} 400}
401 401
402static void native_machine_shutdown(void) 402void native_machine_shutdown(void)
403{ 403{
404 /* Stop the cpus and apics */ 404 /* Stop the cpus and apics */
405#ifdef CONFIG_SMP 405#ifdef CONFIG_SMP
@@ -470,7 +470,10 @@ struct machine_ops machine_ops = {
470 .shutdown = native_machine_shutdown, 470 .shutdown = native_machine_shutdown,
471 .emergency_restart = native_machine_emergency_restart, 471 .emergency_restart = native_machine_emergency_restart,
472 .restart = native_machine_restart, 472 .restart = native_machine_restart,
473 .halt = native_machine_halt 473 .halt = native_machine_halt,
474#ifdef CONFIG_KEXEC
475 .crash_shutdown = native_machine_crash_shutdown,
476#endif
474}; 477};
475 478
476void machine_power_off(void) 479void machine_power_off(void)
@@ -498,3 +501,9 @@ void machine_halt(void)
498 machine_ops.halt(); 501 machine_ops.halt();
499} 502}
500 503
504#ifdef CONFIG_KEXEC
505void machine_crash_shutdown(struct pt_regs *regs)
506{
507 machine_ops.crash_shutdown(regs);
508}
509#endif
diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c
index 44cc9b933932..2283422af794 100644
--- a/arch/x86/kernel/setup_32.c
+++ b/arch/x86/kernel/setup_32.c
@@ -47,6 +47,7 @@
47#include <linux/pfn.h> 47#include <linux/pfn.h>
48#include <linux/pci.h> 48#include <linux/pci.h>
49#include <linux/init_ohci1394_dma.h> 49#include <linux/init_ohci1394_dma.h>
50#include <linux/kvm_para.h>
50 51
51#include <video/edid.h> 52#include <video/edid.h>
52 53
@@ -820,6 +821,10 @@ void __init setup_arch(char **cmdline_p)
820 821
821 max_low_pfn = setup_memory(); 822 max_low_pfn = setup_memory();
822 823
824#ifdef CONFIG_KVM_CLOCK
825 kvmclock_init();
826#endif
827
823#ifdef CONFIG_VMI 828#ifdef CONFIG_VMI
824 /* 829 /*
825 * Must be after max_low_pfn is determined, and before kernel 830 * Must be after max_low_pfn is determined, and before kernel
@@ -827,6 +832,7 @@ void __init setup_arch(char **cmdline_p)
827 */ 832 */
828 vmi_init(); 833 vmi_init();
829#endif 834#endif
835 kvm_guest_init();
830 836
831 /* 837 /*
832 * NOTE: before this point _nobody_ is allowed to allocate 838 * NOTE: before this point _nobody_ is allowed to allocate
diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c
index 60e64c8eee92..a94fb959a87a 100644
--- a/arch/x86/kernel/setup_64.c
+++ b/arch/x86/kernel/setup_64.c
@@ -42,6 +42,7 @@
42#include <linux/ctype.h> 42#include <linux/ctype.h>
43#include <linux/uaccess.h> 43#include <linux/uaccess.h>
44#include <linux/init_ohci1394_dma.h> 44#include <linux/init_ohci1394_dma.h>
45#include <linux/kvm_para.h>
45 46
46#include <asm/mtrr.h> 47#include <asm/mtrr.h>
47#include <asm/uaccess.h> 48#include <asm/uaccess.h>
@@ -384,6 +385,10 @@ void __init setup_arch(char **cmdline_p)
384 385
385 io_delay_init(); 386 io_delay_init();
386 387
388#ifdef CONFIG_KVM_CLOCK
389 kvmclock_init();
390#endif
391
387#ifdef CONFIG_SMP 392#ifdef CONFIG_SMP
388 /* setup to use the early static init tables during kernel startup */ 393 /* setup to use the early static init tables during kernel startup */
389 x86_cpu_to_apicid_early_ptr = (void *)x86_cpu_to_apicid_init; 394 x86_cpu_to_apicid_early_ptr = (void *)x86_cpu_to_apicid_init;
@@ -488,6 +493,8 @@ void __init setup_arch(char **cmdline_p)
488 init_apic_mappings(); 493 init_apic_mappings();
489 ioapic_init_mappings(); 494 ioapic_init_mappings();
490 495
496 kvm_guest_init();
497
491 /* 498 /*
492 * We trust e820 completely. No explicit ROM probing in memory. 499 * We trust e820 completely. No explicit ROM probing in memory.
493 */ 500 */
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 41962e793c0f..8d45fabc5f3b 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -19,7 +19,7 @@ if VIRTUALIZATION
19 19
20config KVM 20config KVM
21 tristate "Kernel-based Virtual Machine (KVM) support" 21 tristate "Kernel-based Virtual Machine (KVM) support"
22 depends on HAVE_KVM && EXPERIMENTAL 22 depends on HAVE_KVM
23 select PREEMPT_NOTIFIERS 23 select PREEMPT_NOTIFIERS
24 select ANON_INODES 24 select ANON_INODES
25 ---help--- 25 ---help---
@@ -50,6 +50,17 @@ config KVM_AMD
50 Provides support for KVM on AMD processors equipped with the AMD-V 50 Provides support for KVM on AMD processors equipped with the AMD-V
51 (SVM) extensions. 51 (SVM) extensions.
52 52
53config KVM_TRACE
54 bool "KVM trace support"
55 depends on KVM && MARKERS && SYSFS
56 select RELAY
57 select DEBUG_FS
58 default n
59 ---help---
60 This option allows reading a trace of kvm-related events through
61 relayfs. Note the ABI is not considered stable and will be
62 modified in future updates.
63
53# OK, it's a little counter-intuitive to do this, but it puts it neatly under 64# OK, it's a little counter-intuitive to do this, but it puts it neatly under
54# the virtualization menu. 65# the virtualization menu.
55source drivers/lguest/Kconfig 66source drivers/lguest/Kconfig
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index ffdd0b310784..c97d35c218db 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -3,10 +3,14 @@
3# 3#
4 4
5common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o) 5common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o)
6ifeq ($(CONFIG_KVM_TRACE),y)
7common-objs += $(addprefix ../../../virt/kvm/, kvm_trace.o)
8endif
6 9
7EXTRA_CFLAGS += -Ivirt/kvm -Iarch/x86/kvm 10EXTRA_CFLAGS += -Ivirt/kvm -Iarch/x86/kvm
8 11
9kvm-objs := $(common-objs) x86.o mmu.o x86_emulate.o i8259.o irq.o lapic.o 12kvm-objs := $(common-objs) x86.o mmu.o x86_emulate.o i8259.o irq.o lapic.o \
13 i8254.o
10obj-$(CONFIG_KVM) += kvm.o 14obj-$(CONFIG_KVM) += kvm.o
11kvm-intel-objs = vmx.o 15kvm-intel-objs = vmx.o
12obj-$(CONFIG_KVM_INTEL) += kvm-intel.o 16obj-$(CONFIG_KVM_INTEL) += kvm-intel.o
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
new file mode 100644
index 000000000000..361e31611276
--- /dev/null
+++ b/arch/x86/kvm/i8254.c
@@ -0,0 +1,611 @@
1/*
2 * 8253/8254 interval timer emulation
3 *
4 * Copyright (c) 2003-2004 Fabrice Bellard
5 * Copyright (c) 2006 Intel Corporation
6 * Copyright (c) 2007 Keir Fraser, XenSource Inc
7 * Copyright (c) 2008 Intel Corporation
8 *
9 * Permission is hereby granted, free of charge, to any person obtaining a copy
10 * of this software and associated documentation files (the "Software"), to deal
11 * in the Software without restriction, including without limitation the rights
12 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
13 * copies of the Software, and to permit persons to whom the Software is
14 * furnished to do so, subject to the following conditions:
15 *
16 * The above copyright notice and this permission notice shall be included in
17 * all copies or substantial portions of the Software.
18 *
19 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
22 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
24 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
25 * THE SOFTWARE.
26 *
27 * Authors:
28 * Sheng Yang <sheng.yang@intel.com>
29 * Based on QEMU and Xen.
30 */
31
32#include <linux/kvm_host.h>
33
34#include "irq.h"
35#include "i8254.h"
36
37#ifndef CONFIG_X86_64
38#define mod_64(x, y) ((x) - (y) * div64_64(x, y))
39#else
40#define mod_64(x, y) ((x) % (y))
41#endif
42
43#define RW_STATE_LSB 1
44#define RW_STATE_MSB 2
45#define RW_STATE_WORD0 3
46#define RW_STATE_WORD1 4
47
48/* Compute with 96 bit intermediate result: (a*b)/c */
49static u64 muldiv64(u64 a, u32 b, u32 c)
50{
51 union {
52 u64 ll;
53 struct {
54 u32 low, high;
55 } l;
56 } u, res;
57 u64 rl, rh;
58
59 u.ll = a;
60 rl = (u64)u.l.low * (u64)b;
61 rh = (u64)u.l.high * (u64)b;
62 rh += (rl >> 32);
63 res.l.high = div64_64(rh, c);
64 res.l.low = div64_64(((mod_64(rh, c) << 32) + (rl & 0xffffffff)), c);
65 return res.ll;
66}
67
68static void pit_set_gate(struct kvm *kvm, int channel, u32 val)
69{
70 struct kvm_kpit_channel_state *c =
71 &kvm->arch.vpit->pit_state.channels[channel];
72
73 WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock));
74
75 switch (c->mode) {
76 default:
77 case 0:
78 case 4:
79 /* XXX: just disable/enable counting */
80 break;
81 case 1:
82 case 2:
83 case 3:
84 case 5:
85 /* Restart counting on rising edge. */
86 if (c->gate < val)
87 c->count_load_time = ktime_get();
88 break;
89 }
90
91 c->gate = val;
92}
93
94int pit_get_gate(struct kvm *kvm, int channel)
95{
96 WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock));
97
98 return kvm->arch.vpit->pit_state.channels[channel].gate;
99}
100
101static int pit_get_count(struct kvm *kvm, int channel)
102{
103 struct kvm_kpit_channel_state *c =
104 &kvm->arch.vpit->pit_state.channels[channel];
105 s64 d, t;
106 int counter;
107
108 WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock));
109
110 t = ktime_to_ns(ktime_sub(ktime_get(), c->count_load_time));
111 d = muldiv64(t, KVM_PIT_FREQ, NSEC_PER_SEC);
112
113 switch (c->mode) {
114 case 0:
115 case 1:
116 case 4:
117 case 5:
118 counter = (c->count - d) & 0xffff;
119 break;
120 case 3:
121 /* XXX: may be incorrect for odd counts */
122 counter = c->count - (mod_64((2 * d), c->count));
123 break;
124 default:
125 counter = c->count - mod_64(d, c->count);
126 break;
127 }
128 return counter;
129}
130
131static int pit_get_out(struct kvm *kvm, int channel)
132{
133 struct kvm_kpit_channel_state *c =
134 &kvm->arch.vpit->pit_state.channels[channel];
135 s64 d, t;
136 int out;
137
138 WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock));
139
140 t = ktime_to_ns(ktime_sub(ktime_get(), c->count_load_time));
141 d = muldiv64(t, KVM_PIT_FREQ, NSEC_PER_SEC);
142
143 switch (c->mode) {
144 default:
145 case 0:
146 out = (d >= c->count);
147 break;
148 case 1:
149 out = (d < c->count);
150 break;
151 case 2:
152 out = ((mod_64(d, c->count) == 0) && (d != 0));
153 break;
154 case 3:
155 out = (mod_64(d, c->count) < ((c->count + 1) >> 1));
156 break;
157 case 4:
158 case 5:
159 out = (d == c->count);
160 break;
161 }
162
163 return out;
164}
165
166static void pit_latch_count(struct kvm *kvm, int channel)
167{
168 struct kvm_kpit_channel_state *c =
169 &kvm->arch.vpit->pit_state.channels[channel];
170
171 WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock));
172
173 if (!c->count_latched) {
174 c->latched_count = pit_get_count(kvm, channel);
175 c->count_latched = c->rw_mode;
176 }
177}
178
179static void pit_latch_status(struct kvm *kvm, int channel)
180{
181 struct kvm_kpit_channel_state *c =
182 &kvm->arch.vpit->pit_state.channels[channel];
183
184 WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock));
185
186 if (!c->status_latched) {
187 /* TODO: Return NULL COUNT (bit 6). */
188 c->status = ((pit_get_out(kvm, channel) << 7) |
189 (c->rw_mode << 4) |
190 (c->mode << 1) |
191 c->bcd);
192 c->status_latched = 1;
193 }
194}
195
196int __pit_timer_fn(struct kvm_kpit_state *ps)
197{
198 struct kvm_vcpu *vcpu0 = ps->pit->kvm->vcpus[0];
199 struct kvm_kpit_timer *pt = &ps->pit_timer;
200
201 atomic_inc(&pt->pending);
202 smp_mb__after_atomic_inc();
203 /* FIXME: handle case where the guest is in guest mode */
204 if (vcpu0 && waitqueue_active(&vcpu0->wq)) {
205 vcpu0->arch.mp_state = KVM_MP_STATE_RUNNABLE;
206 wake_up_interruptible(&vcpu0->wq);
207 }
208
209 pt->timer.expires = ktime_add_ns(pt->timer.expires, pt->period);
210 pt->scheduled = ktime_to_ns(pt->timer.expires);
211
212 return (pt->period == 0 ? 0 : 1);
213}
214
215int pit_has_pending_timer(struct kvm_vcpu *vcpu)
216{
217 struct kvm_pit *pit = vcpu->kvm->arch.vpit;
218
219 if (pit && vcpu->vcpu_id == 0)
220 return atomic_read(&pit->pit_state.pit_timer.pending);
221
222 return 0;
223}
224
225static enum hrtimer_restart pit_timer_fn(struct hrtimer *data)
226{
227 struct kvm_kpit_state *ps;
228 int restart_timer = 0;
229
230 ps = container_of(data, struct kvm_kpit_state, pit_timer.timer);
231
232 restart_timer = __pit_timer_fn(ps);
233
234 if (restart_timer)
235 return HRTIMER_RESTART;
236 else
237 return HRTIMER_NORESTART;
238}
239
240static void destroy_pit_timer(struct kvm_kpit_timer *pt)
241{
242 pr_debug("pit: execute del timer!\n");
243 hrtimer_cancel(&pt->timer);
244}
245
246static void create_pit_timer(struct kvm_kpit_timer *pt, u32 val, int is_period)
247{
248 s64 interval;
249
250 interval = muldiv64(val, NSEC_PER_SEC, KVM_PIT_FREQ);
251
252 pr_debug("pit: create pit timer, interval is %llu nsec\n", interval);
253
254 /* TODO The new value only affected after the retriggered */
255 hrtimer_cancel(&pt->timer);
256 pt->period = (is_period == 0) ? 0 : interval;
257 pt->timer.function = pit_timer_fn;
258 atomic_set(&pt->pending, 0);
259
260 hrtimer_start(&pt->timer, ktime_add_ns(ktime_get(), interval),
261 HRTIMER_MODE_ABS);
262}
263
264static void pit_load_count(struct kvm *kvm, int channel, u32 val)
265{
266 struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state;
267
268 WARN_ON(!mutex_is_locked(&ps->lock));
269
270 pr_debug("pit: load_count val is %d, channel is %d\n", val, channel);
271
272 /*
273 * Though spec said the state of 8254 is undefined after power-up,
274 * seems some tricky OS like Windows XP depends on IRQ0 interrupt
275 * when booting up.
276 * So here setting initialize rate for it, and not a specific number
277 */
278 if (val == 0)
279 val = 0x10000;
280
281 ps->channels[channel].count_load_time = ktime_get();
282 ps->channels[channel].count = val;
283
284 if (channel != 0)
285 return;
286
287 /* Two types of timer
288 * mode 1 is one shot, mode 2 is period, otherwise del timer */
289 switch (ps->channels[0].mode) {
290 case 1:
291 create_pit_timer(&ps->pit_timer, val, 0);
292 break;
293 case 2:
294 create_pit_timer(&ps->pit_timer, val, 1);
295 break;
296 default:
297 destroy_pit_timer(&ps->pit_timer);
298 }
299}
300
301void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val)
302{
303 mutex_lock(&kvm->arch.vpit->pit_state.lock);
304 pit_load_count(kvm, channel, val);
305 mutex_unlock(&kvm->arch.vpit->pit_state.lock);
306}
307
308static void pit_ioport_write(struct kvm_io_device *this,
309 gpa_t addr, int len, const void *data)
310{
311 struct kvm_pit *pit = (struct kvm_pit *)this->private;
312 struct kvm_kpit_state *pit_state = &pit->pit_state;
313 struct kvm *kvm = pit->kvm;
314 int channel, access;
315 struct kvm_kpit_channel_state *s;
316 u32 val = *(u32 *) data;
317
318 val &= 0xff;
319 addr &= KVM_PIT_CHANNEL_MASK;
320
321 mutex_lock(&pit_state->lock);
322
323 if (val != 0)
324 pr_debug("pit: write addr is 0x%x, len is %d, val is 0x%x\n",
325 (unsigned int)addr, len, val);
326
327 if (addr == 3) {
328 channel = val >> 6;
329 if (channel == 3) {
330 /* Read-Back Command. */
331 for (channel = 0; channel < 3; channel++) {
332 s = &pit_state->channels[channel];
333 if (val & (2 << channel)) {
334 if (!(val & 0x20))
335 pit_latch_count(kvm, channel);
336 if (!(val & 0x10))
337 pit_latch_status(kvm, channel);
338 }
339 }
340 } else {
341 /* Select Counter <channel>. */
342 s = &pit_state->channels[channel];
343 access = (val >> 4) & KVM_PIT_CHANNEL_MASK;
344 if (access == 0) {
345 pit_latch_count(kvm, channel);
346 } else {
347 s->rw_mode = access;
348 s->read_state = access;
349 s->write_state = access;
350 s->mode = (val >> 1) & 7;
351 if (s->mode > 5)
352 s->mode -= 4;
353 s->bcd = val & 1;
354 }
355 }
356 } else {
357 /* Write Count. */
358 s = &pit_state->channels[addr];
359 switch (s->write_state) {
360 default:
361 case RW_STATE_LSB:
362 pit_load_count(kvm, addr, val);
363 break;
364 case RW_STATE_MSB:
365 pit_load_count(kvm, addr, val << 8);
366 break;
367 case RW_STATE_WORD0:
368 s->write_latch = val;
369 s->write_state = RW_STATE_WORD1;
370 break;
371 case RW_STATE_WORD1:
372 pit_load_count(kvm, addr, s->write_latch | (val << 8));
373 s->write_state = RW_STATE_WORD0;
374 break;
375 }
376 }
377
378 mutex_unlock(&pit_state->lock);
379}
380
381static void pit_ioport_read(struct kvm_io_device *this,
382 gpa_t addr, int len, void *data)
383{
384 struct kvm_pit *pit = (struct kvm_pit *)this->private;
385 struct kvm_kpit_state *pit_state = &pit->pit_state;
386 struct kvm *kvm = pit->kvm;
387 int ret, count;
388 struct kvm_kpit_channel_state *s;
389
390 addr &= KVM_PIT_CHANNEL_MASK;
391 s = &pit_state->channels[addr];
392
393 mutex_lock(&pit_state->lock);
394
395 if (s->status_latched) {
396 s->status_latched = 0;
397 ret = s->status;
398 } else if (s->count_latched) {
399 switch (s->count_latched) {
400 default:
401 case RW_STATE_LSB:
402 ret = s->latched_count & 0xff;
403 s->count_latched = 0;
404 break;
405 case RW_STATE_MSB:
406 ret = s->latched_count >> 8;
407 s->count_latched = 0;
408 break;
409 case RW_STATE_WORD0:
410 ret = s->latched_count & 0xff;
411 s->count_latched = RW_STATE_MSB;
412 break;
413 }
414 } else {
415 switch (s->read_state) {
416 default:
417 case RW_STATE_LSB:
418 count = pit_get_count(kvm, addr);
419 ret = count & 0xff;
420 break;
421 case RW_STATE_MSB:
422 count = pit_get_count(kvm, addr);
423 ret = (count >> 8) & 0xff;
424 break;
425 case RW_STATE_WORD0:
426 count = pit_get_count(kvm, addr);
427 ret = count & 0xff;
428 s->read_state = RW_STATE_WORD1;
429 break;
430 case RW_STATE_WORD1:
431 count = pit_get_count(kvm, addr);
432 ret = (count >> 8) & 0xff;
433 s->read_state = RW_STATE_WORD0;
434 break;
435 }
436 }
437
438 if (len > sizeof(ret))
439 len = sizeof(ret);
440 memcpy(data, (char *)&ret, len);
441
442 mutex_unlock(&pit_state->lock);
443}
444
445static int pit_in_range(struct kvm_io_device *this, gpa_t addr)
446{
447 return ((addr >= KVM_PIT_BASE_ADDRESS) &&
448 (addr < KVM_PIT_BASE_ADDRESS + KVM_PIT_MEM_LENGTH));
449}
450
451static void speaker_ioport_write(struct kvm_io_device *this,
452 gpa_t addr, int len, const void *data)
453{
454 struct kvm_pit *pit = (struct kvm_pit *)this->private;
455 struct kvm_kpit_state *pit_state = &pit->pit_state;
456 struct kvm *kvm = pit->kvm;
457 u32 val = *(u32 *) data;
458
459 mutex_lock(&pit_state->lock);
460 pit_state->speaker_data_on = (val >> 1) & 1;
461 pit_set_gate(kvm, 2, val & 1);
462 mutex_unlock(&pit_state->lock);
463}
464
465static void speaker_ioport_read(struct kvm_io_device *this,
466 gpa_t addr, int len, void *data)
467{
468 struct kvm_pit *pit = (struct kvm_pit *)this->private;
469 struct kvm_kpit_state *pit_state = &pit->pit_state;
470 struct kvm *kvm = pit->kvm;
471 unsigned int refresh_clock;
472 int ret;
473
474 /* Refresh clock toggles at about 15us. We approximate as 2^14ns. */
475 refresh_clock = ((unsigned int)ktime_to_ns(ktime_get()) >> 14) & 1;
476
477 mutex_lock(&pit_state->lock);
478 ret = ((pit_state->speaker_data_on << 1) | pit_get_gate(kvm, 2) |
479 (pit_get_out(kvm, 2) << 5) | (refresh_clock << 4));
480 if (len > sizeof(ret))
481 len = sizeof(ret);
482 memcpy(data, (char *)&ret, len);
483 mutex_unlock(&pit_state->lock);
484}
485
486static int speaker_in_range(struct kvm_io_device *this, gpa_t addr)
487{
488 return (addr == KVM_SPEAKER_BASE_ADDRESS);
489}
490
491void kvm_pit_reset(struct kvm_pit *pit)
492{
493 int i;
494 struct kvm_kpit_channel_state *c;
495
496 mutex_lock(&pit->pit_state.lock);
497 for (i = 0; i < 3; i++) {
498 c = &pit->pit_state.channels[i];
499 c->mode = 0xff;
500 c->gate = (i != 2);
501 pit_load_count(pit->kvm, i, 0);
502 }
503 mutex_unlock(&pit->pit_state.lock);
504
505 atomic_set(&pit->pit_state.pit_timer.pending, 0);
506 pit->pit_state.inject_pending = 1;
507}
508
509struct kvm_pit *kvm_create_pit(struct kvm *kvm)
510{
511 struct kvm_pit *pit;
512 struct kvm_kpit_state *pit_state;
513
514 pit = kzalloc(sizeof(struct kvm_pit), GFP_KERNEL);
515 if (!pit)
516 return NULL;
517
518 mutex_init(&pit->pit_state.lock);
519 mutex_lock(&pit->pit_state.lock);
520
521 /* Initialize PIO device */
522 pit->dev.read = pit_ioport_read;
523 pit->dev.write = pit_ioport_write;
524 pit->dev.in_range = pit_in_range;
525 pit->dev.private = pit;
526 kvm_io_bus_register_dev(&kvm->pio_bus, &pit->dev);
527
528 pit->speaker_dev.read = speaker_ioport_read;
529 pit->speaker_dev.write = speaker_ioport_write;
530 pit->speaker_dev.in_range = speaker_in_range;
531 pit->speaker_dev.private = pit;
532 kvm_io_bus_register_dev(&kvm->pio_bus, &pit->speaker_dev);
533
534 kvm->arch.vpit = pit;
535 pit->kvm = kvm;
536
537 pit_state = &pit->pit_state;
538 pit_state->pit = pit;
539 hrtimer_init(&pit_state->pit_timer.timer,
540 CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
541 mutex_unlock(&pit->pit_state.lock);
542
543 kvm_pit_reset(pit);
544
545 return pit;
546}
547
548void kvm_free_pit(struct kvm *kvm)
549{
550 struct hrtimer *timer;
551
552 if (kvm->arch.vpit) {
553 mutex_lock(&kvm->arch.vpit->pit_state.lock);
554 timer = &kvm->arch.vpit->pit_state.pit_timer.timer;
555 hrtimer_cancel(timer);
556 mutex_unlock(&kvm->arch.vpit->pit_state.lock);
557 kfree(kvm->arch.vpit);
558 }
559}
560
561void __inject_pit_timer_intr(struct kvm *kvm)
562{
563 mutex_lock(&kvm->lock);
564 kvm_ioapic_set_irq(kvm->arch.vioapic, 0, 1);
565 kvm_ioapic_set_irq(kvm->arch.vioapic, 0, 0);
566 kvm_pic_set_irq(pic_irqchip(kvm), 0, 1);
567 kvm_pic_set_irq(pic_irqchip(kvm), 0, 0);
568 mutex_unlock(&kvm->lock);
569}
570
571void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu)
572{
573 struct kvm_pit *pit = vcpu->kvm->arch.vpit;
574 struct kvm *kvm = vcpu->kvm;
575 struct kvm_kpit_state *ps;
576
577 if (vcpu && pit) {
578 ps = &pit->pit_state;
579
580 /* Try to inject pending interrupts when:
581 * 1. Pending exists
582 * 2. Last interrupt was accepted or waited for too long time*/
583 if (atomic_read(&ps->pit_timer.pending) &&
584 (ps->inject_pending ||
585 (jiffies - ps->last_injected_time
586 >= KVM_MAX_PIT_INTR_INTERVAL))) {
587 ps->inject_pending = 0;
588 __inject_pit_timer_intr(kvm);
589 ps->last_injected_time = jiffies;
590 }
591 }
592}
593
594void kvm_pit_timer_intr_post(struct kvm_vcpu *vcpu, int vec)
595{
596 struct kvm_arch *arch = &vcpu->kvm->arch;
597 struct kvm_kpit_state *ps;
598
599 if (vcpu && arch->vpit) {
600 ps = &arch->vpit->pit_state;
601 if (atomic_read(&ps->pit_timer.pending) &&
602 (((arch->vpic->pics[0].imr & 1) == 0 &&
603 arch->vpic->pics[0].irq_base == vec) ||
604 (arch->vioapic->redirtbl[0].fields.vector == vec &&
605 arch->vioapic->redirtbl[0].fields.mask != 1))) {
606 ps->inject_pending = 1;
607 atomic_dec(&ps->pit_timer.pending);
608 ps->channels[0].count_load_time = ktime_get();
609 }
610 }
611}
diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h
new file mode 100644
index 000000000000..db25c2a6c8c4
--- /dev/null
+++ b/arch/x86/kvm/i8254.h
@@ -0,0 +1,63 @@
1#ifndef __I8254_H
2#define __I8254_H
3
4#include "iodev.h"
5
6struct kvm_kpit_timer {
7 struct hrtimer timer;
8 int irq;
9 s64 period; /* unit: ns */
10 s64 scheduled;
11 ktime_t last_update;
12 atomic_t pending;
13};
14
15struct kvm_kpit_channel_state {
16 u32 count; /* can be 65536 */
17 u16 latched_count;
18 u8 count_latched;
19 u8 status_latched;
20 u8 status;
21 u8 read_state;
22 u8 write_state;
23 u8 write_latch;
24 u8 rw_mode;
25 u8 mode;
26 u8 bcd; /* not supported */
27 u8 gate; /* timer start */
28 ktime_t count_load_time;
29};
30
31struct kvm_kpit_state {
32 struct kvm_kpit_channel_state channels[3];
33 struct kvm_kpit_timer pit_timer;
34 u32 speaker_data_on;
35 struct mutex lock;
36 struct kvm_pit *pit;
37 bool inject_pending; /* if inject pending interrupts */
38 unsigned long last_injected_time;
39};
40
41struct kvm_pit {
42 unsigned long base_addresss;
43 struct kvm_io_device dev;
44 struct kvm_io_device speaker_dev;
45 struct kvm *kvm;
46 struct kvm_kpit_state pit_state;
47};
48
49#define KVM_PIT_BASE_ADDRESS 0x40
50#define KVM_SPEAKER_BASE_ADDRESS 0x61
51#define KVM_PIT_MEM_LENGTH 4
52#define KVM_PIT_FREQ 1193181
53#define KVM_MAX_PIT_INTR_INTERVAL HZ / 100
54#define KVM_PIT_CHANNEL_MASK 0x3
55
56void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu);
57void kvm_pit_timer_intr_post(struct kvm_vcpu *vcpu, int vec);
58void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val);
59struct kvm_pit *kvm_create_pit(struct kvm *kvm);
60void kvm_free_pit(struct kvm *kvm);
61void kvm_pit_reset(struct kvm_pit *pit);
62
63#endif
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index e5714759e97f..ce1f583459b1 100644
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -23,6 +23,22 @@
23#include <linux/kvm_host.h> 23#include <linux/kvm_host.h>
24 24
25#include "irq.h" 25#include "irq.h"
26#include "i8254.h"
27
28/*
29 * check if there are pending timer events
30 * to be processed.
31 */
32int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
33{
34 int ret;
35
36 ret = pit_has_pending_timer(vcpu);
37 ret |= apic_has_pending_timer(vcpu);
38
39 return ret;
40}
41EXPORT_SYMBOL(kvm_cpu_has_pending_timer);
26 42
27/* 43/*
28 * check if there is pending interrupt without 44 * check if there is pending interrupt without
@@ -66,6 +82,7 @@ EXPORT_SYMBOL_GPL(kvm_cpu_get_interrupt);
66void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu) 82void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu)
67{ 83{
68 kvm_inject_apic_timer_irqs(vcpu); 84 kvm_inject_apic_timer_irqs(vcpu);
85 kvm_inject_pit_timer_irqs(vcpu);
69 /* TODO: PIT, RTC etc. */ 86 /* TODO: PIT, RTC etc. */
70} 87}
71EXPORT_SYMBOL_GPL(kvm_inject_pending_timer_irqs); 88EXPORT_SYMBOL_GPL(kvm_inject_pending_timer_irqs);
@@ -73,6 +90,7 @@ EXPORT_SYMBOL_GPL(kvm_inject_pending_timer_irqs);
73void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec) 90void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec)
74{ 91{
75 kvm_apic_timer_intr_post(vcpu, vec); 92 kvm_apic_timer_intr_post(vcpu, vec);
93 kvm_pit_timer_intr_post(vcpu, vec);
76 /* TODO: PIT, RTC etc. */ 94 /* TODO: PIT, RTC etc. */
77} 95}
78EXPORT_SYMBOL_GPL(kvm_timer_intr_post); 96EXPORT_SYMBOL_GPL(kvm_timer_intr_post);
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index fa5ed5d59b5d..1802134b836f 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -85,4 +85,7 @@ void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu);
85void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu); 85void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu);
86void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu); 86void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu);
87 87
88int pit_has_pending_timer(struct kvm_vcpu *vcpu);
89int apic_has_pending_timer(struct kvm_vcpu *vcpu);
90
88#endif 91#endif
diff --git a/arch/x86/kvm/kvm_svm.h b/arch/x86/kvm/kvm_svm.h
index ecdfe97e4635..65ef0fc2c036 100644
--- a/arch/x86/kvm/kvm_svm.h
+++ b/arch/x86/kvm/kvm_svm.h
@@ -39,6 +39,8 @@ struct vcpu_svm {
39 unsigned long host_db_regs[NUM_DB_REGS]; 39 unsigned long host_db_regs[NUM_DB_REGS];
40 unsigned long host_dr6; 40 unsigned long host_dr6;
41 unsigned long host_dr7; 41 unsigned long host_dr7;
42
43 u32 *msrpm;
42}; 44};
43 45
44#endif 46#endif
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 68a6b1511934..57ac4e4c556a 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -338,10 +338,10 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
338 } else 338 } else
339 apic_clear_vector(vector, apic->regs + APIC_TMR); 339 apic_clear_vector(vector, apic->regs + APIC_TMR);
340 340
341 if (vcpu->arch.mp_state == VCPU_MP_STATE_RUNNABLE) 341 if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE)
342 kvm_vcpu_kick(vcpu); 342 kvm_vcpu_kick(vcpu);
343 else if (vcpu->arch.mp_state == VCPU_MP_STATE_HALTED) { 343 else if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED) {
344 vcpu->arch.mp_state = VCPU_MP_STATE_RUNNABLE; 344 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
345 if (waitqueue_active(&vcpu->wq)) 345 if (waitqueue_active(&vcpu->wq))
346 wake_up_interruptible(&vcpu->wq); 346 wake_up_interruptible(&vcpu->wq);
347 } 347 }
@@ -362,11 +362,11 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
362 362
363 case APIC_DM_INIT: 363 case APIC_DM_INIT:
364 if (level) { 364 if (level) {
365 if (vcpu->arch.mp_state == VCPU_MP_STATE_RUNNABLE) 365 if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE)
366 printk(KERN_DEBUG 366 printk(KERN_DEBUG
367 "INIT on a runnable vcpu %d\n", 367 "INIT on a runnable vcpu %d\n",
368 vcpu->vcpu_id); 368 vcpu->vcpu_id);
369 vcpu->arch.mp_state = VCPU_MP_STATE_INIT_RECEIVED; 369 vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
370 kvm_vcpu_kick(vcpu); 370 kvm_vcpu_kick(vcpu);
371 } else { 371 } else {
372 printk(KERN_DEBUG 372 printk(KERN_DEBUG
@@ -379,9 +379,9 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
379 case APIC_DM_STARTUP: 379 case APIC_DM_STARTUP:
380 printk(KERN_DEBUG "SIPI to vcpu %d vector 0x%02x\n", 380 printk(KERN_DEBUG "SIPI to vcpu %d vector 0x%02x\n",
381 vcpu->vcpu_id, vector); 381 vcpu->vcpu_id, vector);
382 if (vcpu->arch.mp_state == VCPU_MP_STATE_INIT_RECEIVED) { 382 if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
383 vcpu->arch.sipi_vector = vector; 383 vcpu->arch.sipi_vector = vector;
384 vcpu->arch.mp_state = VCPU_MP_STATE_SIPI_RECEIVED; 384 vcpu->arch.mp_state = KVM_MP_STATE_SIPI_RECEIVED;
385 if (waitqueue_active(&vcpu->wq)) 385 if (waitqueue_active(&vcpu->wq))
386 wake_up_interruptible(&vcpu->wq); 386 wake_up_interruptible(&vcpu->wq);
387 } 387 }
@@ -658,7 +658,7 @@ static void start_apic_timer(struct kvm_lapic *apic)
658 apic_debug("%s: bus cycle is %" PRId64 "ns, now 0x%016" 658 apic_debug("%s: bus cycle is %" PRId64 "ns, now 0x%016"
659 PRIx64 ", " 659 PRIx64 ", "
660 "timer initial count 0x%x, period %lldns, " 660 "timer initial count 0x%x, period %lldns, "
661 "expire @ 0x%016" PRIx64 ".\n", __FUNCTION__, 661 "expire @ 0x%016" PRIx64 ".\n", __func__,
662 APIC_BUS_CYCLE_NS, ktime_to_ns(now), 662 APIC_BUS_CYCLE_NS, ktime_to_ns(now),
663 apic_get_reg(apic, APIC_TMICT), 663 apic_get_reg(apic, APIC_TMICT),
664 apic->timer.period, 664 apic->timer.period,
@@ -691,7 +691,7 @@ static void apic_mmio_write(struct kvm_io_device *this,
691 /* too common printing */ 691 /* too common printing */
692 if (offset != APIC_EOI) 692 if (offset != APIC_EOI)
693 apic_debug("%s: offset 0x%x with length 0x%x, and value is " 693 apic_debug("%s: offset 0x%x with length 0x%x, and value is "
694 "0x%x\n", __FUNCTION__, offset, len, val); 694 "0x%x\n", __func__, offset, len, val);
695 695
696 offset &= 0xff0; 696 offset &= 0xff0;
697 697
@@ -822,6 +822,7 @@ void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8)
822 apic_set_tpr(apic, ((cr8 & 0x0f) << 4) 822 apic_set_tpr(apic, ((cr8 & 0x0f) << 4)
823 | (apic_get_reg(apic, APIC_TASKPRI) & 4)); 823 | (apic_get_reg(apic, APIC_TASKPRI) & 4));
824} 824}
825EXPORT_SYMBOL_GPL(kvm_lapic_set_tpr);
825 826
826u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu) 827u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu)
827{ 828{
@@ -869,7 +870,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
869 struct kvm_lapic *apic; 870 struct kvm_lapic *apic;
870 int i; 871 int i;
871 872
872 apic_debug("%s\n", __FUNCTION__); 873 apic_debug("%s\n", __func__);
873 874
874 ASSERT(vcpu); 875 ASSERT(vcpu);
875 apic = vcpu->arch.apic; 876 apic = vcpu->arch.apic;
@@ -907,7 +908,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
907 apic_update_ppr(apic); 908 apic_update_ppr(apic);
908 909
909 apic_debug(KERN_INFO "%s: vcpu=%p, id=%d, base_msr=" 910 apic_debug(KERN_INFO "%s: vcpu=%p, id=%d, base_msr="
910 "0x%016" PRIx64 ", base_address=0x%0lx.\n", __FUNCTION__, 911 "0x%016" PRIx64 ", base_address=0x%0lx.\n", __func__,
911 vcpu, kvm_apic_id(apic), 912 vcpu, kvm_apic_id(apic),
912 vcpu->arch.apic_base, apic->base_address); 913 vcpu->arch.apic_base, apic->base_address);
913} 914}
@@ -940,7 +941,7 @@ static int __apic_timer_fn(struct kvm_lapic *apic)
940 941
941 atomic_inc(&apic->timer.pending); 942 atomic_inc(&apic->timer.pending);
942 if (waitqueue_active(q)) { 943 if (waitqueue_active(q)) {
943 apic->vcpu->arch.mp_state = VCPU_MP_STATE_RUNNABLE; 944 apic->vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
944 wake_up_interruptible(q); 945 wake_up_interruptible(q);
945 } 946 }
946 if (apic_lvtt_period(apic)) { 947 if (apic_lvtt_period(apic)) {
@@ -952,6 +953,16 @@ static int __apic_timer_fn(struct kvm_lapic *apic)
952 return result; 953 return result;
953} 954}
954 955
956int apic_has_pending_timer(struct kvm_vcpu *vcpu)
957{
958 struct kvm_lapic *lapic = vcpu->arch.apic;
959
960 if (lapic)
961 return atomic_read(&lapic->timer.pending);
962
963 return 0;
964}
965
955static int __inject_apic_timer_irq(struct kvm_lapic *apic) 966static int __inject_apic_timer_irq(struct kvm_lapic *apic)
956{ 967{
957 int vector; 968 int vector;
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index e55af12e11b7..2ad6f5481671 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -27,11 +27,22 @@
27#include <linux/highmem.h> 27#include <linux/highmem.h>
28#include <linux/module.h> 28#include <linux/module.h>
29#include <linux/swap.h> 29#include <linux/swap.h>
30#include <linux/hugetlb.h>
31#include <linux/compiler.h>
30 32
31#include <asm/page.h> 33#include <asm/page.h>
32#include <asm/cmpxchg.h> 34#include <asm/cmpxchg.h>
33#include <asm/io.h> 35#include <asm/io.h>
34 36
37/*
38 * When setting this variable to true it enables Two-Dimensional-Paging
39 * where the hardware walks 2 page tables:
40 * 1. the guest-virtual to guest-physical
41 * 2. while doing 1. it walks guest-physical to host-physical
42 * If the hardware supports that we don't need to do shadow paging.
43 */
44bool tdp_enabled = false;
45
35#undef MMU_DEBUG 46#undef MMU_DEBUG
36 47
37#undef AUDIT 48#undef AUDIT
@@ -101,8 +112,6 @@ static int dbg = 1;
101#define PT_FIRST_AVAIL_BITS_SHIFT 9 112#define PT_FIRST_AVAIL_BITS_SHIFT 9
102#define PT64_SECOND_AVAIL_BITS_SHIFT 52 113#define PT64_SECOND_AVAIL_BITS_SHIFT 52
103 114
104#define PT_SHADOW_IO_MARK (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
105
106#define VALID_PAGE(x) ((x) != INVALID_PAGE) 115#define VALID_PAGE(x) ((x) != INVALID_PAGE)
107 116
108#define PT64_LEVEL_BITS 9 117#define PT64_LEVEL_BITS 9
@@ -159,6 +168,13 @@ static int dbg = 1;
159#define ACC_USER_MASK PT_USER_MASK 168#define ACC_USER_MASK PT_USER_MASK
160#define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK) 169#define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)
161 170
171struct kvm_pv_mmu_op_buffer {
172 void *ptr;
173 unsigned len;
174 unsigned processed;
175 char buf[512] __aligned(sizeof(long));
176};
177
162struct kvm_rmap_desc { 178struct kvm_rmap_desc {
163 u64 *shadow_ptes[RMAP_EXT]; 179 u64 *shadow_ptes[RMAP_EXT];
164 struct kvm_rmap_desc *more; 180 struct kvm_rmap_desc *more;
@@ -200,11 +216,15 @@ static int is_present_pte(unsigned long pte)
200 216
201static int is_shadow_present_pte(u64 pte) 217static int is_shadow_present_pte(u64 pte)
202{ 218{
203 pte &= ~PT_SHADOW_IO_MARK;
204 return pte != shadow_trap_nonpresent_pte 219 return pte != shadow_trap_nonpresent_pte
205 && pte != shadow_notrap_nonpresent_pte; 220 && pte != shadow_notrap_nonpresent_pte;
206} 221}
207 222
223static int is_large_pte(u64 pte)
224{
225 return pte & PT_PAGE_SIZE_MASK;
226}
227
208static int is_writeble_pte(unsigned long pte) 228static int is_writeble_pte(unsigned long pte)
209{ 229{
210 return pte & PT_WRITABLE_MASK; 230 return pte & PT_WRITABLE_MASK;
@@ -215,14 +235,14 @@ static int is_dirty_pte(unsigned long pte)
215 return pte & PT_DIRTY_MASK; 235 return pte & PT_DIRTY_MASK;
216} 236}
217 237
218static int is_io_pte(unsigned long pte) 238static int is_rmap_pte(u64 pte)
219{ 239{
220 return pte & PT_SHADOW_IO_MARK; 240 return is_shadow_present_pte(pte);
221} 241}
222 242
223static int is_rmap_pte(u64 pte) 243static pfn_t spte_to_pfn(u64 pte)
224{ 244{
225 return is_shadow_present_pte(pte); 245 return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
226} 246}
227 247
228static gfn_t pse36_gfn_delta(u32 gpte) 248static gfn_t pse36_gfn_delta(u32 gpte)
@@ -349,16 +369,100 @@ static void mmu_free_rmap_desc(struct kvm_rmap_desc *rd)
349} 369}
350 370
351/* 371/*
372 * Return the pointer to the largepage write count for a given
373 * gfn, handling slots that are not large page aligned.
374 */
375static int *slot_largepage_idx(gfn_t gfn, struct kvm_memory_slot *slot)
376{
377 unsigned long idx;
378
379 idx = (gfn / KVM_PAGES_PER_HPAGE) -
380 (slot->base_gfn / KVM_PAGES_PER_HPAGE);
381 return &slot->lpage_info[idx].write_count;
382}
383
384static void account_shadowed(struct kvm *kvm, gfn_t gfn)
385{
386 int *write_count;
387
388 write_count = slot_largepage_idx(gfn, gfn_to_memslot(kvm, gfn));
389 *write_count += 1;
390 WARN_ON(*write_count > KVM_PAGES_PER_HPAGE);
391}
392
393static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn)
394{
395 int *write_count;
396
397 write_count = slot_largepage_idx(gfn, gfn_to_memslot(kvm, gfn));
398 *write_count -= 1;
399 WARN_ON(*write_count < 0);
400}
401
402static int has_wrprotected_page(struct kvm *kvm, gfn_t gfn)
403{
404 struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
405 int *largepage_idx;
406
407 if (slot) {
408 largepage_idx = slot_largepage_idx(gfn, slot);
409 return *largepage_idx;
410 }
411
412 return 1;
413}
414
415static int host_largepage_backed(struct kvm *kvm, gfn_t gfn)
416{
417 struct vm_area_struct *vma;
418 unsigned long addr;
419
420 addr = gfn_to_hva(kvm, gfn);
421 if (kvm_is_error_hva(addr))
422 return 0;
423
424 vma = find_vma(current->mm, addr);
425 if (vma && is_vm_hugetlb_page(vma))
426 return 1;
427
428 return 0;
429}
430
431static int is_largepage_backed(struct kvm_vcpu *vcpu, gfn_t large_gfn)
432{
433 struct kvm_memory_slot *slot;
434
435 if (has_wrprotected_page(vcpu->kvm, large_gfn))
436 return 0;
437
438 if (!host_largepage_backed(vcpu->kvm, large_gfn))
439 return 0;
440
441 slot = gfn_to_memslot(vcpu->kvm, large_gfn);
442 if (slot && slot->dirty_bitmap)
443 return 0;
444
445 return 1;
446}
447
448/*
352 * Take gfn and return the reverse mapping to it. 449 * Take gfn and return the reverse mapping to it.
353 * Note: gfn must be unaliased before this function get called 450 * Note: gfn must be unaliased before this function get called
354 */ 451 */
355 452
356static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn) 453static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int lpage)
357{ 454{
358 struct kvm_memory_slot *slot; 455 struct kvm_memory_slot *slot;
456 unsigned long idx;
359 457
360 slot = gfn_to_memslot(kvm, gfn); 458 slot = gfn_to_memslot(kvm, gfn);
361 return &slot->rmap[gfn - slot->base_gfn]; 459 if (!lpage)
460 return &slot->rmap[gfn - slot->base_gfn];
461
462 idx = (gfn / KVM_PAGES_PER_HPAGE) -
463 (slot->base_gfn / KVM_PAGES_PER_HPAGE);
464
465 return &slot->lpage_info[idx].rmap_pde;
362} 466}
363 467
364/* 468/*
@@ -370,7 +474,7 @@ static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn)
370 * If rmapp bit zero is one, (then rmap & ~1) points to a struct kvm_rmap_desc 474 * If rmapp bit zero is one, (then rmap & ~1) points to a struct kvm_rmap_desc
371 * containing more mappings. 475 * containing more mappings.
372 */ 476 */
373static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) 477static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn, int lpage)
374{ 478{
375 struct kvm_mmu_page *sp; 479 struct kvm_mmu_page *sp;
376 struct kvm_rmap_desc *desc; 480 struct kvm_rmap_desc *desc;
@@ -382,7 +486,7 @@ static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
382 gfn = unalias_gfn(vcpu->kvm, gfn); 486 gfn = unalias_gfn(vcpu->kvm, gfn);
383 sp = page_header(__pa(spte)); 487 sp = page_header(__pa(spte));
384 sp->gfns[spte - sp->spt] = gfn; 488 sp->gfns[spte - sp->spt] = gfn;
385 rmapp = gfn_to_rmap(vcpu->kvm, gfn); 489 rmapp = gfn_to_rmap(vcpu->kvm, gfn, lpage);
386 if (!*rmapp) { 490 if (!*rmapp) {
387 rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte); 491 rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte);
388 *rmapp = (unsigned long)spte; 492 *rmapp = (unsigned long)spte;
@@ -435,20 +539,21 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
435 struct kvm_rmap_desc *desc; 539 struct kvm_rmap_desc *desc;
436 struct kvm_rmap_desc *prev_desc; 540 struct kvm_rmap_desc *prev_desc;
437 struct kvm_mmu_page *sp; 541 struct kvm_mmu_page *sp;
438 struct page *page; 542 pfn_t pfn;
439 unsigned long *rmapp; 543 unsigned long *rmapp;
440 int i; 544 int i;
441 545
442 if (!is_rmap_pte(*spte)) 546 if (!is_rmap_pte(*spte))
443 return; 547 return;
444 sp = page_header(__pa(spte)); 548 sp = page_header(__pa(spte));
445 page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT); 549 pfn = spte_to_pfn(*spte);
446 mark_page_accessed(page); 550 if (*spte & PT_ACCESSED_MASK)
551 kvm_set_pfn_accessed(pfn);
447 if (is_writeble_pte(*spte)) 552 if (is_writeble_pte(*spte))
448 kvm_release_page_dirty(page); 553 kvm_release_pfn_dirty(pfn);
449 else 554 else
450 kvm_release_page_clean(page); 555 kvm_release_pfn_clean(pfn);
451 rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt]); 556 rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt], is_large_pte(*spte));
452 if (!*rmapp) { 557 if (!*rmapp) {
453 printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte); 558 printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte);
454 BUG(); 559 BUG();
@@ -514,7 +619,7 @@ static void rmap_write_protect(struct kvm *kvm, u64 gfn)
514 int write_protected = 0; 619 int write_protected = 0;
515 620
516 gfn = unalias_gfn(kvm, gfn); 621 gfn = unalias_gfn(kvm, gfn);
517 rmapp = gfn_to_rmap(kvm, gfn); 622 rmapp = gfn_to_rmap(kvm, gfn, 0);
518 623
519 spte = rmap_next(kvm, rmapp, NULL); 624 spte = rmap_next(kvm, rmapp, NULL);
520 while (spte) { 625 while (spte) {
@@ -527,8 +632,35 @@ static void rmap_write_protect(struct kvm *kvm, u64 gfn)
527 } 632 }
528 spte = rmap_next(kvm, rmapp, spte); 633 spte = rmap_next(kvm, rmapp, spte);
529 } 634 }
635 if (write_protected) {
636 pfn_t pfn;
637
638 spte = rmap_next(kvm, rmapp, NULL);
639 pfn = spte_to_pfn(*spte);
640 kvm_set_pfn_dirty(pfn);
641 }
642
643 /* check for huge page mappings */
644 rmapp = gfn_to_rmap(kvm, gfn, 1);
645 spte = rmap_next(kvm, rmapp, NULL);
646 while (spte) {
647 BUG_ON(!spte);
648 BUG_ON(!(*spte & PT_PRESENT_MASK));
649 BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK));
650 pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn);
651 if (is_writeble_pte(*spte)) {
652 rmap_remove(kvm, spte);
653 --kvm->stat.lpages;
654 set_shadow_pte(spte, shadow_trap_nonpresent_pte);
655 write_protected = 1;
656 }
657 spte = rmap_next(kvm, rmapp, spte);
658 }
659
530 if (write_protected) 660 if (write_protected)
531 kvm_flush_remote_tlbs(kvm); 661 kvm_flush_remote_tlbs(kvm);
662
663 account_shadowed(kvm, gfn);
532} 664}
533 665
534#ifdef MMU_DEBUG 666#ifdef MMU_DEBUG
@@ -538,8 +670,8 @@ static int is_empty_shadow_page(u64 *spt)
538 u64 *end; 670 u64 *end;
539 671
540 for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++) 672 for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++)
541 if ((*pos & ~PT_SHADOW_IO_MARK) != shadow_trap_nonpresent_pte) { 673 if (*pos != shadow_trap_nonpresent_pte) {
542 printk(KERN_ERR "%s: %p %llx\n", __FUNCTION__, 674 printk(KERN_ERR "%s: %p %llx\n", __func__,
543 pos, *pos); 675 pos, *pos);
544 return 0; 676 return 0;
545 } 677 }
@@ -559,7 +691,7 @@ static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp)
559 691
560static unsigned kvm_page_table_hashfn(gfn_t gfn) 692static unsigned kvm_page_table_hashfn(gfn_t gfn)
561{ 693{
562 return gfn; 694 return gfn & ((1 << KVM_MMU_HASH_SHIFT) - 1);
563} 695}
564 696
565static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, 697static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
@@ -662,13 +794,14 @@ static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn)
662 struct kvm_mmu_page *sp; 794 struct kvm_mmu_page *sp;
663 struct hlist_node *node; 795 struct hlist_node *node;
664 796
665 pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn); 797 pgprintk("%s: looking for gfn %lx\n", __func__, gfn);
666 index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES; 798 index = kvm_page_table_hashfn(gfn);
667 bucket = &kvm->arch.mmu_page_hash[index]; 799 bucket = &kvm->arch.mmu_page_hash[index];
668 hlist_for_each_entry(sp, node, bucket, hash_link) 800 hlist_for_each_entry(sp, node, bucket, hash_link)
669 if (sp->gfn == gfn && !sp->role.metaphysical) { 801 if (sp->gfn == gfn && !sp->role.metaphysical
802 && !sp->role.invalid) {
670 pgprintk("%s: found role %x\n", 803 pgprintk("%s: found role %x\n",
671 __FUNCTION__, sp->role.word); 804 __func__, sp->role.word);
672 return sp; 805 return sp;
673 } 806 }
674 return NULL; 807 return NULL;
@@ -699,27 +832,27 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
699 quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1; 832 quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
700 role.quadrant = quadrant; 833 role.quadrant = quadrant;
701 } 834 }
702 pgprintk("%s: looking gfn %lx role %x\n", __FUNCTION__, 835 pgprintk("%s: looking gfn %lx role %x\n", __func__,
703 gfn, role.word); 836 gfn, role.word);
704 index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES; 837 index = kvm_page_table_hashfn(gfn);
705 bucket = &vcpu->kvm->arch.mmu_page_hash[index]; 838 bucket = &vcpu->kvm->arch.mmu_page_hash[index];
706 hlist_for_each_entry(sp, node, bucket, hash_link) 839 hlist_for_each_entry(sp, node, bucket, hash_link)
707 if (sp->gfn == gfn && sp->role.word == role.word) { 840 if (sp->gfn == gfn && sp->role.word == role.word) {
708 mmu_page_add_parent_pte(vcpu, sp, parent_pte); 841 mmu_page_add_parent_pte(vcpu, sp, parent_pte);
709 pgprintk("%s: found\n", __FUNCTION__); 842 pgprintk("%s: found\n", __func__);
710 return sp; 843 return sp;
711 } 844 }
712 ++vcpu->kvm->stat.mmu_cache_miss; 845 ++vcpu->kvm->stat.mmu_cache_miss;
713 sp = kvm_mmu_alloc_page(vcpu, parent_pte); 846 sp = kvm_mmu_alloc_page(vcpu, parent_pte);
714 if (!sp) 847 if (!sp)
715 return sp; 848 return sp;
716 pgprintk("%s: adding gfn %lx role %x\n", __FUNCTION__, gfn, role.word); 849 pgprintk("%s: adding gfn %lx role %x\n", __func__, gfn, role.word);
717 sp->gfn = gfn; 850 sp->gfn = gfn;
718 sp->role = role; 851 sp->role = role;
719 hlist_add_head(&sp->hash_link, bucket); 852 hlist_add_head(&sp->hash_link, bucket);
720 vcpu->arch.mmu.prefetch_page(vcpu, sp);
721 if (!metaphysical) 853 if (!metaphysical)
722 rmap_write_protect(vcpu->kvm, gfn); 854 rmap_write_protect(vcpu->kvm, gfn);
855 vcpu->arch.mmu.prefetch_page(vcpu, sp);
723 return sp; 856 return sp;
724} 857}
725 858
@@ -745,11 +878,17 @@ static void kvm_mmu_page_unlink_children(struct kvm *kvm,
745 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { 878 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
746 ent = pt[i]; 879 ent = pt[i];
747 880
881 if (is_shadow_present_pte(ent)) {
882 if (!is_large_pte(ent)) {
883 ent &= PT64_BASE_ADDR_MASK;
884 mmu_page_remove_parent_pte(page_header(ent),
885 &pt[i]);
886 } else {
887 --kvm->stat.lpages;
888 rmap_remove(kvm, &pt[i]);
889 }
890 }
748 pt[i] = shadow_trap_nonpresent_pte; 891 pt[i] = shadow_trap_nonpresent_pte;
749 if (!is_shadow_present_pte(ent))
750 continue;
751 ent &= PT64_BASE_ADDR_MASK;
752 mmu_page_remove_parent_pte(page_header(ent), &pt[i]);
753 } 892 }
754 kvm_flush_remote_tlbs(kvm); 893 kvm_flush_remote_tlbs(kvm);
755} 894}
@@ -789,10 +928,15 @@ static void kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
789 } 928 }
790 kvm_mmu_page_unlink_children(kvm, sp); 929 kvm_mmu_page_unlink_children(kvm, sp);
791 if (!sp->root_count) { 930 if (!sp->root_count) {
931 if (!sp->role.metaphysical)
932 unaccount_shadowed(kvm, sp->gfn);
792 hlist_del(&sp->hash_link); 933 hlist_del(&sp->hash_link);
793 kvm_mmu_free_page(kvm, sp); 934 kvm_mmu_free_page(kvm, sp);
794 } else 935 } else {
795 list_move(&sp->link, &kvm->arch.active_mmu_pages); 936 list_move(&sp->link, &kvm->arch.active_mmu_pages);
937 sp->role.invalid = 1;
938 kvm_reload_remote_mmus(kvm);
939 }
796 kvm_mmu_reset_last_pte_updated(kvm); 940 kvm_mmu_reset_last_pte_updated(kvm);
797} 941}
798 942
@@ -838,13 +982,13 @@ static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
838 struct hlist_node *node, *n; 982 struct hlist_node *node, *n;
839 int r; 983 int r;
840 984
841 pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn); 985 pgprintk("%s: looking for gfn %lx\n", __func__, gfn);
842 r = 0; 986 r = 0;
843 index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES; 987 index = kvm_page_table_hashfn(gfn);
844 bucket = &kvm->arch.mmu_page_hash[index]; 988 bucket = &kvm->arch.mmu_page_hash[index];
845 hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) 989 hlist_for_each_entry_safe(sp, node, n, bucket, hash_link)
846 if (sp->gfn == gfn && !sp->role.metaphysical) { 990 if (sp->gfn == gfn && !sp->role.metaphysical) {
847 pgprintk("%s: gfn %lx role %x\n", __FUNCTION__, gfn, 991 pgprintk("%s: gfn %lx role %x\n", __func__, gfn,
848 sp->role.word); 992 sp->role.word);
849 kvm_mmu_zap_page(kvm, sp); 993 kvm_mmu_zap_page(kvm, sp);
850 r = 1; 994 r = 1;
@@ -857,7 +1001,7 @@ static void mmu_unshadow(struct kvm *kvm, gfn_t gfn)
857 struct kvm_mmu_page *sp; 1001 struct kvm_mmu_page *sp;
858 1002
859 while ((sp = kvm_mmu_lookup_page(kvm, gfn)) != NULL) { 1003 while ((sp = kvm_mmu_lookup_page(kvm, gfn)) != NULL) {
860 pgprintk("%s: zap %lx %x\n", __FUNCTION__, gfn, sp->role.word); 1004 pgprintk("%s: zap %lx %x\n", __func__, gfn, sp->role.word);
861 kvm_mmu_zap_page(kvm, sp); 1005 kvm_mmu_zap_page(kvm, sp);
862 } 1006 }
863} 1007}
@@ -889,26 +1033,39 @@ struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva)
889static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, 1033static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
890 unsigned pt_access, unsigned pte_access, 1034 unsigned pt_access, unsigned pte_access,
891 int user_fault, int write_fault, int dirty, 1035 int user_fault, int write_fault, int dirty,
892 int *ptwrite, gfn_t gfn, struct page *page) 1036 int *ptwrite, int largepage, gfn_t gfn,
1037 pfn_t pfn, bool speculative)
893{ 1038{
894 u64 spte; 1039 u64 spte;
895 int was_rmapped = 0; 1040 int was_rmapped = 0;
896 int was_writeble = is_writeble_pte(*shadow_pte); 1041 int was_writeble = is_writeble_pte(*shadow_pte);
897 hfn_t host_pfn = (*shadow_pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
898 1042
899 pgprintk("%s: spte %llx access %x write_fault %d" 1043 pgprintk("%s: spte %llx access %x write_fault %d"
900 " user_fault %d gfn %lx\n", 1044 " user_fault %d gfn %lx\n",
901 __FUNCTION__, *shadow_pte, pt_access, 1045 __func__, *shadow_pte, pt_access,
902 write_fault, user_fault, gfn); 1046 write_fault, user_fault, gfn);
903 1047
904 if (is_rmap_pte(*shadow_pte)) { 1048 if (is_rmap_pte(*shadow_pte)) {
905 if (host_pfn != page_to_pfn(page)) { 1049 /*
1050 * If we overwrite a PTE page pointer with a 2MB PMD, unlink
1051 * the parent of the now unreachable PTE.
1052 */
1053 if (largepage && !is_large_pte(*shadow_pte)) {
1054 struct kvm_mmu_page *child;
1055 u64 pte = *shadow_pte;
1056
1057 child = page_header(pte & PT64_BASE_ADDR_MASK);
1058 mmu_page_remove_parent_pte(child, shadow_pte);
1059 } else if (pfn != spte_to_pfn(*shadow_pte)) {
906 pgprintk("hfn old %lx new %lx\n", 1060 pgprintk("hfn old %lx new %lx\n",
907 host_pfn, page_to_pfn(page)); 1061 spte_to_pfn(*shadow_pte), pfn);
908 rmap_remove(vcpu->kvm, shadow_pte); 1062 rmap_remove(vcpu->kvm, shadow_pte);
1063 } else {
1064 if (largepage)
1065 was_rmapped = is_large_pte(*shadow_pte);
1066 else
1067 was_rmapped = 1;
909 } 1068 }
910 else
911 was_rmapped = 1;
912 } 1069 }
913 1070
914 /* 1071 /*
@@ -917,6 +1074,8 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
917 * demand paging). 1074 * demand paging).
918 */ 1075 */
919 spte = PT_PRESENT_MASK | PT_DIRTY_MASK; 1076 spte = PT_PRESENT_MASK | PT_DIRTY_MASK;
1077 if (!speculative)
1078 pte_access |= PT_ACCESSED_MASK;
920 if (!dirty) 1079 if (!dirty)
921 pte_access &= ~ACC_WRITE_MASK; 1080 pte_access &= ~ACC_WRITE_MASK;
922 if (!(pte_access & ACC_EXEC_MASK)) 1081 if (!(pte_access & ACC_EXEC_MASK))
@@ -925,15 +1084,10 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
925 spte |= PT_PRESENT_MASK; 1084 spte |= PT_PRESENT_MASK;
926 if (pte_access & ACC_USER_MASK) 1085 if (pte_access & ACC_USER_MASK)
927 spte |= PT_USER_MASK; 1086 spte |= PT_USER_MASK;
1087 if (largepage)
1088 spte |= PT_PAGE_SIZE_MASK;
928 1089
929 if (is_error_page(page)) { 1090 spte |= (u64)pfn << PAGE_SHIFT;
930 set_shadow_pte(shadow_pte,
931 shadow_trap_nonpresent_pte | PT_SHADOW_IO_MARK);
932 kvm_release_page_clean(page);
933 return;
934 }
935
936 spte |= page_to_phys(page);
937 1091
938 if ((pte_access & ACC_WRITE_MASK) 1092 if ((pte_access & ACC_WRITE_MASK)
939 || (write_fault && !is_write_protection(vcpu) && !user_fault)) { 1093 || (write_fault && !is_write_protection(vcpu) && !user_fault)) {
@@ -946,9 +1100,10 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
946 } 1100 }
947 1101
948 shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn); 1102 shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn);
949 if (shadow) { 1103 if (shadow ||
1104 (largepage && has_wrprotected_page(vcpu->kvm, gfn))) {
950 pgprintk("%s: found shadow page for %lx, marking ro\n", 1105 pgprintk("%s: found shadow page for %lx, marking ro\n",
951 __FUNCTION__, gfn); 1106 __func__, gfn);
952 pte_access &= ~ACC_WRITE_MASK; 1107 pte_access &= ~ACC_WRITE_MASK;
953 if (is_writeble_pte(spte)) { 1108 if (is_writeble_pte(spte)) {
954 spte &= ~PT_WRITABLE_MASK; 1109 spte &= ~PT_WRITABLE_MASK;
@@ -964,18 +1119,25 @@ unshadowed:
964 if (pte_access & ACC_WRITE_MASK) 1119 if (pte_access & ACC_WRITE_MASK)
965 mark_page_dirty(vcpu->kvm, gfn); 1120 mark_page_dirty(vcpu->kvm, gfn);
966 1121
967 pgprintk("%s: setting spte %llx\n", __FUNCTION__, spte); 1122 pgprintk("%s: setting spte %llx\n", __func__, spte);
1123 pgprintk("instantiating %s PTE (%s) at %d (%llx) addr %llx\n",
1124 (spte&PT_PAGE_SIZE_MASK)? "2MB" : "4kB",
1125 (spte&PT_WRITABLE_MASK)?"RW":"R", gfn, spte, shadow_pte);
968 set_shadow_pte(shadow_pte, spte); 1126 set_shadow_pte(shadow_pte, spte);
1127 if (!was_rmapped && (spte & PT_PAGE_SIZE_MASK)
1128 && (spte & PT_PRESENT_MASK))
1129 ++vcpu->kvm->stat.lpages;
1130
969 page_header_update_slot(vcpu->kvm, shadow_pte, gfn); 1131 page_header_update_slot(vcpu->kvm, shadow_pte, gfn);
970 if (!was_rmapped) { 1132 if (!was_rmapped) {
971 rmap_add(vcpu, shadow_pte, gfn); 1133 rmap_add(vcpu, shadow_pte, gfn, largepage);
972 if (!is_rmap_pte(*shadow_pte)) 1134 if (!is_rmap_pte(*shadow_pte))
973 kvm_release_page_clean(page); 1135 kvm_release_pfn_clean(pfn);
974 } else { 1136 } else {
975 if (was_writeble) 1137 if (was_writeble)
976 kvm_release_page_dirty(page); 1138 kvm_release_pfn_dirty(pfn);
977 else 1139 else
978 kvm_release_page_clean(page); 1140 kvm_release_pfn_clean(pfn);
979 } 1141 }
980 if (!ptwrite || !*ptwrite) 1142 if (!ptwrite || !*ptwrite)
981 vcpu->arch.last_pte_updated = shadow_pte; 1143 vcpu->arch.last_pte_updated = shadow_pte;
@@ -985,10 +1147,10 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
985{ 1147{
986} 1148}
987 1149
988static int __nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, 1150static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
989 gfn_t gfn, struct page *page) 1151 int largepage, gfn_t gfn, pfn_t pfn,
1152 int level)
990{ 1153{
991 int level = PT32E_ROOT_LEVEL;
992 hpa_t table_addr = vcpu->arch.mmu.root_hpa; 1154 hpa_t table_addr = vcpu->arch.mmu.root_hpa;
993 int pt_write = 0; 1155 int pt_write = 0;
994 1156
@@ -1001,8 +1163,14 @@ static int __nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write,
1001 1163
1002 if (level == 1) { 1164 if (level == 1) {
1003 mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL, 1165 mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL,
1004 0, write, 1, &pt_write, gfn, page); 1166 0, write, 1, &pt_write, 0, gfn, pfn, false);
1005 return pt_write || is_io_pte(table[index]); 1167 return pt_write;
1168 }
1169
1170 if (largepage && level == 2) {
1171 mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL,
1172 0, write, 1, &pt_write, 1, gfn, pfn, false);
1173 return pt_write;
1006 } 1174 }
1007 1175
1008 if (table[index] == shadow_trap_nonpresent_pte) { 1176 if (table[index] == shadow_trap_nonpresent_pte) {
@@ -1016,7 +1184,7 @@ static int __nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write,
1016 1, ACC_ALL, &table[index]); 1184 1, ACC_ALL, &table[index]);
1017 if (!new_table) { 1185 if (!new_table) {
1018 pgprintk("nonpaging_map: ENOMEM\n"); 1186 pgprintk("nonpaging_map: ENOMEM\n");
1019 kvm_release_page_clean(page); 1187 kvm_release_pfn_clean(pfn);
1020 return -ENOMEM; 1188 return -ENOMEM;
1021 } 1189 }
1022 1190
@@ -1030,21 +1198,30 @@ static int __nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write,
1030static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) 1198static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
1031{ 1199{
1032 int r; 1200 int r;
1033 1201 int largepage = 0;
1034 struct page *page; 1202 pfn_t pfn;
1035
1036 down_read(&vcpu->kvm->slots_lock);
1037 1203
1038 down_read(&current->mm->mmap_sem); 1204 down_read(&current->mm->mmap_sem);
1039 page = gfn_to_page(vcpu->kvm, gfn); 1205 if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) {
1206 gfn &= ~(KVM_PAGES_PER_HPAGE-1);
1207 largepage = 1;
1208 }
1209
1210 pfn = gfn_to_pfn(vcpu->kvm, gfn);
1040 up_read(&current->mm->mmap_sem); 1211 up_read(&current->mm->mmap_sem);
1041 1212
1213 /* mmio */
1214 if (is_error_pfn(pfn)) {
1215 kvm_release_pfn_clean(pfn);
1216 return 1;
1217 }
1218
1042 spin_lock(&vcpu->kvm->mmu_lock); 1219 spin_lock(&vcpu->kvm->mmu_lock);
1043 kvm_mmu_free_some_pages(vcpu); 1220 kvm_mmu_free_some_pages(vcpu);
1044 r = __nonpaging_map(vcpu, v, write, gfn, page); 1221 r = __direct_map(vcpu, v, write, largepage, gfn, pfn,
1222 PT32E_ROOT_LEVEL);
1045 spin_unlock(&vcpu->kvm->mmu_lock); 1223 spin_unlock(&vcpu->kvm->mmu_lock);
1046 1224
1047 up_read(&vcpu->kvm->slots_lock);
1048 1225
1049 return r; 1226 return r;
1050} 1227}
@@ -1073,6 +1250,8 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
1073 1250
1074 sp = page_header(root); 1251 sp = page_header(root);
1075 --sp->root_count; 1252 --sp->root_count;
1253 if (!sp->root_count && sp->role.invalid)
1254 kvm_mmu_zap_page(vcpu->kvm, sp);
1076 vcpu->arch.mmu.root_hpa = INVALID_PAGE; 1255 vcpu->arch.mmu.root_hpa = INVALID_PAGE;
1077 spin_unlock(&vcpu->kvm->mmu_lock); 1256 spin_unlock(&vcpu->kvm->mmu_lock);
1078 return; 1257 return;
@@ -1085,6 +1264,8 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
1085 root &= PT64_BASE_ADDR_MASK; 1264 root &= PT64_BASE_ADDR_MASK;
1086 sp = page_header(root); 1265 sp = page_header(root);
1087 --sp->root_count; 1266 --sp->root_count;
1267 if (!sp->root_count && sp->role.invalid)
1268 kvm_mmu_zap_page(vcpu->kvm, sp);
1088 } 1269 }
1089 vcpu->arch.mmu.pae_root[i] = INVALID_PAGE; 1270 vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
1090 } 1271 }
@@ -1097,6 +1278,7 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
1097 int i; 1278 int i;
1098 gfn_t root_gfn; 1279 gfn_t root_gfn;
1099 struct kvm_mmu_page *sp; 1280 struct kvm_mmu_page *sp;
1281 int metaphysical = 0;
1100 1282
1101 root_gfn = vcpu->arch.cr3 >> PAGE_SHIFT; 1283 root_gfn = vcpu->arch.cr3 >> PAGE_SHIFT;
1102 1284
@@ -1105,14 +1287,20 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
1105 hpa_t root = vcpu->arch.mmu.root_hpa; 1287 hpa_t root = vcpu->arch.mmu.root_hpa;
1106 1288
1107 ASSERT(!VALID_PAGE(root)); 1289 ASSERT(!VALID_PAGE(root));
1290 if (tdp_enabled)
1291 metaphysical = 1;
1108 sp = kvm_mmu_get_page(vcpu, root_gfn, 0, 1292 sp = kvm_mmu_get_page(vcpu, root_gfn, 0,
1109 PT64_ROOT_LEVEL, 0, ACC_ALL, NULL); 1293 PT64_ROOT_LEVEL, metaphysical,
1294 ACC_ALL, NULL);
1110 root = __pa(sp->spt); 1295 root = __pa(sp->spt);
1111 ++sp->root_count; 1296 ++sp->root_count;
1112 vcpu->arch.mmu.root_hpa = root; 1297 vcpu->arch.mmu.root_hpa = root;
1113 return; 1298 return;
1114 } 1299 }
1115#endif 1300#endif
1301 metaphysical = !is_paging(vcpu);
1302 if (tdp_enabled)
1303 metaphysical = 1;
1116 for (i = 0; i < 4; ++i) { 1304 for (i = 0; i < 4; ++i) {
1117 hpa_t root = vcpu->arch.mmu.pae_root[i]; 1305 hpa_t root = vcpu->arch.mmu.pae_root[i];
1118 1306
@@ -1126,7 +1314,7 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
1126 } else if (vcpu->arch.mmu.root_level == 0) 1314 } else if (vcpu->arch.mmu.root_level == 0)
1127 root_gfn = 0; 1315 root_gfn = 0;
1128 sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, 1316 sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
1129 PT32_ROOT_LEVEL, !is_paging(vcpu), 1317 PT32_ROOT_LEVEL, metaphysical,
1130 ACC_ALL, NULL); 1318 ACC_ALL, NULL);
1131 root = __pa(sp->spt); 1319 root = __pa(sp->spt);
1132 ++sp->root_count; 1320 ++sp->root_count;
@@ -1146,7 +1334,7 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
1146 gfn_t gfn; 1334 gfn_t gfn;
1147 int r; 1335 int r;
1148 1336
1149 pgprintk("%s: gva %lx error %x\n", __FUNCTION__, gva, error_code); 1337 pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code);
1150 r = mmu_topup_memory_caches(vcpu); 1338 r = mmu_topup_memory_caches(vcpu);
1151 if (r) 1339 if (r)
1152 return r; 1340 return r;
@@ -1160,6 +1348,41 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
1160 error_code & PFERR_WRITE_MASK, gfn); 1348 error_code & PFERR_WRITE_MASK, gfn);
1161} 1349}
1162 1350
1351static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
1352 u32 error_code)
1353{
1354 pfn_t pfn;
1355 int r;
1356 int largepage = 0;
1357 gfn_t gfn = gpa >> PAGE_SHIFT;
1358
1359 ASSERT(vcpu);
1360 ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
1361
1362 r = mmu_topup_memory_caches(vcpu);
1363 if (r)
1364 return r;
1365
1366 down_read(&current->mm->mmap_sem);
1367 if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) {
1368 gfn &= ~(KVM_PAGES_PER_HPAGE-1);
1369 largepage = 1;
1370 }
1371 pfn = gfn_to_pfn(vcpu->kvm, gfn);
1372 up_read(&current->mm->mmap_sem);
1373 if (is_error_pfn(pfn)) {
1374 kvm_release_pfn_clean(pfn);
1375 return 1;
1376 }
1377 spin_lock(&vcpu->kvm->mmu_lock);
1378 kvm_mmu_free_some_pages(vcpu);
1379 r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK,
1380 largepage, gfn, pfn, TDP_ROOT_LEVEL);
1381 spin_unlock(&vcpu->kvm->mmu_lock);
1382
1383 return r;
1384}
1385
1163static void nonpaging_free(struct kvm_vcpu *vcpu) 1386static void nonpaging_free(struct kvm_vcpu *vcpu)
1164{ 1387{
1165 mmu_free_roots(vcpu); 1388 mmu_free_roots(vcpu);
@@ -1188,7 +1411,7 @@ void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
1188 1411
1189static void paging_new_cr3(struct kvm_vcpu *vcpu) 1412static void paging_new_cr3(struct kvm_vcpu *vcpu)
1190{ 1413{
1191 pgprintk("%s: cr3 %lx\n", __FUNCTION__, vcpu->arch.cr3); 1414 pgprintk("%s: cr3 %lx\n", __func__, vcpu->arch.cr3);
1192 mmu_free_roots(vcpu); 1415 mmu_free_roots(vcpu);
1193} 1416}
1194 1417
@@ -1253,7 +1476,35 @@ static int paging32E_init_context(struct kvm_vcpu *vcpu)
1253 return paging64_init_context_common(vcpu, PT32E_ROOT_LEVEL); 1476 return paging64_init_context_common(vcpu, PT32E_ROOT_LEVEL);
1254} 1477}
1255 1478
1256static int init_kvm_mmu(struct kvm_vcpu *vcpu) 1479static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
1480{
1481 struct kvm_mmu *context = &vcpu->arch.mmu;
1482
1483 context->new_cr3 = nonpaging_new_cr3;
1484 context->page_fault = tdp_page_fault;
1485 context->free = nonpaging_free;
1486 context->prefetch_page = nonpaging_prefetch_page;
1487 context->shadow_root_level = TDP_ROOT_LEVEL;
1488 context->root_hpa = INVALID_PAGE;
1489
1490 if (!is_paging(vcpu)) {
1491 context->gva_to_gpa = nonpaging_gva_to_gpa;
1492 context->root_level = 0;
1493 } else if (is_long_mode(vcpu)) {
1494 context->gva_to_gpa = paging64_gva_to_gpa;
1495 context->root_level = PT64_ROOT_LEVEL;
1496 } else if (is_pae(vcpu)) {
1497 context->gva_to_gpa = paging64_gva_to_gpa;
1498 context->root_level = PT32E_ROOT_LEVEL;
1499 } else {
1500 context->gva_to_gpa = paging32_gva_to_gpa;
1501 context->root_level = PT32_ROOT_LEVEL;
1502 }
1503
1504 return 0;
1505}
1506
1507static int init_kvm_softmmu(struct kvm_vcpu *vcpu)
1257{ 1508{
1258 ASSERT(vcpu); 1509 ASSERT(vcpu);
1259 ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); 1510 ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
@@ -1268,6 +1519,16 @@ static int init_kvm_mmu(struct kvm_vcpu *vcpu)
1268 return paging32_init_context(vcpu); 1519 return paging32_init_context(vcpu);
1269} 1520}
1270 1521
1522static int init_kvm_mmu(struct kvm_vcpu *vcpu)
1523{
1524 vcpu->arch.update_pte.pfn = bad_pfn;
1525
1526 if (tdp_enabled)
1527 return init_kvm_tdp_mmu(vcpu);
1528 else
1529 return init_kvm_softmmu(vcpu);
1530}
1531
1271static void destroy_kvm_mmu(struct kvm_vcpu *vcpu) 1532static void destroy_kvm_mmu(struct kvm_vcpu *vcpu)
1272{ 1533{
1273 ASSERT(vcpu); 1534 ASSERT(vcpu);
@@ -1316,7 +1577,8 @@ static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
1316 1577
1317 pte = *spte; 1578 pte = *spte;
1318 if (is_shadow_present_pte(pte)) { 1579 if (is_shadow_present_pte(pte)) {
1319 if (sp->role.level == PT_PAGE_TABLE_LEVEL) 1580 if (sp->role.level == PT_PAGE_TABLE_LEVEL ||
1581 is_large_pte(pte))
1320 rmap_remove(vcpu->kvm, spte); 1582 rmap_remove(vcpu->kvm, spte);
1321 else { 1583 else {
1322 child = page_header(pte & PT64_BASE_ADDR_MASK); 1584 child = page_header(pte & PT64_BASE_ADDR_MASK);
@@ -1324,24 +1586,26 @@ static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
1324 } 1586 }
1325 } 1587 }
1326 set_shadow_pte(spte, shadow_trap_nonpresent_pte); 1588 set_shadow_pte(spte, shadow_trap_nonpresent_pte);
1589 if (is_large_pte(pte))
1590 --vcpu->kvm->stat.lpages;
1327} 1591}
1328 1592
1329static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu, 1593static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
1330 struct kvm_mmu_page *sp, 1594 struct kvm_mmu_page *sp,
1331 u64 *spte, 1595 u64 *spte,
1332 const void *new, int bytes, 1596 const void *new)
1333 int offset_in_pte)
1334{ 1597{
1335 if (sp->role.level != PT_PAGE_TABLE_LEVEL) { 1598 if ((sp->role.level != PT_PAGE_TABLE_LEVEL)
1599 && !vcpu->arch.update_pte.largepage) {
1336 ++vcpu->kvm->stat.mmu_pde_zapped; 1600 ++vcpu->kvm->stat.mmu_pde_zapped;
1337 return; 1601 return;
1338 } 1602 }
1339 1603
1340 ++vcpu->kvm->stat.mmu_pte_updated; 1604 ++vcpu->kvm->stat.mmu_pte_updated;
1341 if (sp->role.glevels == PT32_ROOT_LEVEL) 1605 if (sp->role.glevels == PT32_ROOT_LEVEL)
1342 paging32_update_pte(vcpu, sp, spte, new, bytes, offset_in_pte); 1606 paging32_update_pte(vcpu, sp, spte, new);
1343 else 1607 else
1344 paging64_update_pte(vcpu, sp, spte, new, bytes, offset_in_pte); 1608 paging64_update_pte(vcpu, sp, spte, new);
1345} 1609}
1346 1610
1347static bool need_remote_flush(u64 old, u64 new) 1611static bool need_remote_flush(u64 old, u64 new)
@@ -1378,7 +1642,9 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
1378 gfn_t gfn; 1642 gfn_t gfn;
1379 int r; 1643 int r;
1380 u64 gpte = 0; 1644 u64 gpte = 0;
1381 struct page *page; 1645 pfn_t pfn;
1646
1647 vcpu->arch.update_pte.largepage = 0;
1382 1648
1383 if (bytes != 4 && bytes != 8) 1649 if (bytes != 4 && bytes != 8)
1384 return; 1650 return;
@@ -1408,11 +1674,19 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
1408 gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; 1674 gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
1409 1675
1410 down_read(&current->mm->mmap_sem); 1676 down_read(&current->mm->mmap_sem);
1411 page = gfn_to_page(vcpu->kvm, gfn); 1677 if (is_large_pte(gpte) && is_largepage_backed(vcpu, gfn)) {
1678 gfn &= ~(KVM_PAGES_PER_HPAGE-1);
1679 vcpu->arch.update_pte.largepage = 1;
1680 }
1681 pfn = gfn_to_pfn(vcpu->kvm, gfn);
1412 up_read(&current->mm->mmap_sem); 1682 up_read(&current->mm->mmap_sem);
1413 1683
1684 if (is_error_pfn(pfn)) {
1685 kvm_release_pfn_clean(pfn);
1686 return;
1687 }
1414 vcpu->arch.update_pte.gfn = gfn; 1688 vcpu->arch.update_pte.gfn = gfn;
1415 vcpu->arch.update_pte.page = page; 1689 vcpu->arch.update_pte.pfn = pfn;
1416} 1690}
1417 1691
1418void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, 1692void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
@@ -1423,7 +1697,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
1423 struct hlist_node *node, *n; 1697 struct hlist_node *node, *n;
1424 struct hlist_head *bucket; 1698 struct hlist_head *bucket;
1425 unsigned index; 1699 unsigned index;
1426 u64 entry; 1700 u64 entry, gentry;
1427 u64 *spte; 1701 u64 *spte;
1428 unsigned offset = offset_in_page(gpa); 1702 unsigned offset = offset_in_page(gpa);
1429 unsigned pte_size; 1703 unsigned pte_size;
@@ -1433,8 +1707,9 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
1433 int level; 1707 int level;
1434 int flooded = 0; 1708 int flooded = 0;
1435 int npte; 1709 int npte;
1710 int r;
1436 1711
1437 pgprintk("%s: gpa %llx bytes %d\n", __FUNCTION__, gpa, bytes); 1712 pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);
1438 mmu_guess_page_from_pte_write(vcpu, gpa, new, bytes); 1713 mmu_guess_page_from_pte_write(vcpu, gpa, new, bytes);
1439 spin_lock(&vcpu->kvm->mmu_lock); 1714 spin_lock(&vcpu->kvm->mmu_lock);
1440 kvm_mmu_free_some_pages(vcpu); 1715 kvm_mmu_free_some_pages(vcpu);
@@ -1450,7 +1725,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
1450 vcpu->arch.last_pt_write_count = 1; 1725 vcpu->arch.last_pt_write_count = 1;
1451 vcpu->arch.last_pte_updated = NULL; 1726 vcpu->arch.last_pte_updated = NULL;
1452 } 1727 }
1453 index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES; 1728 index = kvm_page_table_hashfn(gfn);
1454 bucket = &vcpu->kvm->arch.mmu_page_hash[index]; 1729 bucket = &vcpu->kvm->arch.mmu_page_hash[index];
1455 hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) { 1730 hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) {
1456 if (sp->gfn != gfn || sp->role.metaphysical) 1731 if (sp->gfn != gfn || sp->role.metaphysical)
@@ -1496,20 +1771,29 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
1496 continue; 1771 continue;
1497 } 1772 }
1498 spte = &sp->spt[page_offset / sizeof(*spte)]; 1773 spte = &sp->spt[page_offset / sizeof(*spte)];
1774 if ((gpa & (pte_size - 1)) || (bytes < pte_size)) {
1775 gentry = 0;
1776 r = kvm_read_guest_atomic(vcpu->kvm,
1777 gpa & ~(u64)(pte_size - 1),
1778 &gentry, pte_size);
1779 new = (const void *)&gentry;
1780 if (r < 0)
1781 new = NULL;
1782 }
1499 while (npte--) { 1783 while (npte--) {
1500 entry = *spte; 1784 entry = *spte;
1501 mmu_pte_write_zap_pte(vcpu, sp, spte); 1785 mmu_pte_write_zap_pte(vcpu, sp, spte);
1502 mmu_pte_write_new_pte(vcpu, sp, spte, new, bytes, 1786 if (new)
1503 page_offset & (pte_size - 1)); 1787 mmu_pte_write_new_pte(vcpu, sp, spte, new);
1504 mmu_pte_write_flush_tlb(vcpu, entry, *spte); 1788 mmu_pte_write_flush_tlb(vcpu, entry, *spte);
1505 ++spte; 1789 ++spte;
1506 } 1790 }
1507 } 1791 }
1508 kvm_mmu_audit(vcpu, "post pte write"); 1792 kvm_mmu_audit(vcpu, "post pte write");
1509 spin_unlock(&vcpu->kvm->mmu_lock); 1793 spin_unlock(&vcpu->kvm->mmu_lock);
1510 if (vcpu->arch.update_pte.page) { 1794 if (!is_error_pfn(vcpu->arch.update_pte.pfn)) {
1511 kvm_release_page_clean(vcpu->arch.update_pte.page); 1795 kvm_release_pfn_clean(vcpu->arch.update_pte.pfn);
1512 vcpu->arch.update_pte.page = NULL; 1796 vcpu->arch.update_pte.pfn = bad_pfn;
1513 } 1797 }
1514} 1798}
1515 1799
@@ -1518,9 +1802,7 @@ int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
1518 gpa_t gpa; 1802 gpa_t gpa;
1519 int r; 1803 int r;
1520 1804
1521 down_read(&vcpu->kvm->slots_lock);
1522 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva); 1805 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva);
1523 up_read(&vcpu->kvm->slots_lock);
1524 1806
1525 spin_lock(&vcpu->kvm->mmu_lock); 1807 spin_lock(&vcpu->kvm->mmu_lock);
1526 r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT); 1808 r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
@@ -1577,6 +1859,12 @@ out:
1577} 1859}
1578EXPORT_SYMBOL_GPL(kvm_mmu_page_fault); 1860EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
1579 1861
1862void kvm_enable_tdp(void)
1863{
1864 tdp_enabled = true;
1865}
1866EXPORT_SYMBOL_GPL(kvm_enable_tdp);
1867
1580static void free_mmu_pages(struct kvm_vcpu *vcpu) 1868static void free_mmu_pages(struct kvm_vcpu *vcpu)
1581{ 1869{
1582 struct kvm_mmu_page *sp; 1870 struct kvm_mmu_page *sp;
@@ -1677,7 +1965,53 @@ void kvm_mmu_zap_all(struct kvm *kvm)
1677 kvm_flush_remote_tlbs(kvm); 1965 kvm_flush_remote_tlbs(kvm);
1678} 1966}
1679 1967
1680void kvm_mmu_module_exit(void) 1968void kvm_mmu_remove_one_alloc_mmu_page(struct kvm *kvm)
1969{
1970 struct kvm_mmu_page *page;
1971
1972 page = container_of(kvm->arch.active_mmu_pages.prev,
1973 struct kvm_mmu_page, link);
1974 kvm_mmu_zap_page(kvm, page);
1975}
1976
1977static int mmu_shrink(int nr_to_scan, gfp_t gfp_mask)
1978{
1979 struct kvm *kvm;
1980 struct kvm *kvm_freed = NULL;
1981 int cache_count = 0;
1982
1983 spin_lock(&kvm_lock);
1984
1985 list_for_each_entry(kvm, &vm_list, vm_list) {
1986 int npages;
1987
1988 spin_lock(&kvm->mmu_lock);
1989 npages = kvm->arch.n_alloc_mmu_pages -
1990 kvm->arch.n_free_mmu_pages;
1991 cache_count += npages;
1992 if (!kvm_freed && nr_to_scan > 0 && npages > 0) {
1993 kvm_mmu_remove_one_alloc_mmu_page(kvm);
1994 cache_count--;
1995 kvm_freed = kvm;
1996 }
1997 nr_to_scan--;
1998
1999 spin_unlock(&kvm->mmu_lock);
2000 }
2001 if (kvm_freed)
2002 list_move_tail(&kvm_freed->vm_list, &vm_list);
2003
2004 spin_unlock(&kvm_lock);
2005
2006 return cache_count;
2007}
2008
2009static struct shrinker mmu_shrinker = {
2010 .shrink = mmu_shrink,
2011 .seeks = DEFAULT_SEEKS * 10,
2012};
2013
2014void mmu_destroy_caches(void)
1681{ 2015{
1682 if (pte_chain_cache) 2016 if (pte_chain_cache)
1683 kmem_cache_destroy(pte_chain_cache); 2017 kmem_cache_destroy(pte_chain_cache);
@@ -1687,6 +2021,12 @@ void kvm_mmu_module_exit(void)
1687 kmem_cache_destroy(mmu_page_header_cache); 2021 kmem_cache_destroy(mmu_page_header_cache);
1688} 2022}
1689 2023
2024void kvm_mmu_module_exit(void)
2025{
2026 mmu_destroy_caches();
2027 unregister_shrinker(&mmu_shrinker);
2028}
2029
1690int kvm_mmu_module_init(void) 2030int kvm_mmu_module_init(void)
1691{ 2031{
1692 pte_chain_cache = kmem_cache_create("kvm_pte_chain", 2032 pte_chain_cache = kmem_cache_create("kvm_pte_chain",
@@ -1706,10 +2046,12 @@ int kvm_mmu_module_init(void)
1706 if (!mmu_page_header_cache) 2046 if (!mmu_page_header_cache)
1707 goto nomem; 2047 goto nomem;
1708 2048
2049 register_shrinker(&mmu_shrinker);
2050
1709 return 0; 2051 return 0;
1710 2052
1711nomem: 2053nomem:
1712 kvm_mmu_module_exit(); 2054 mmu_destroy_caches();
1713 return -ENOMEM; 2055 return -ENOMEM;
1714} 2056}
1715 2057
@@ -1732,6 +2074,127 @@ unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm)
1732 return nr_mmu_pages; 2074 return nr_mmu_pages;
1733} 2075}
1734 2076
2077static void *pv_mmu_peek_buffer(struct kvm_pv_mmu_op_buffer *buffer,
2078 unsigned len)
2079{
2080 if (len > buffer->len)
2081 return NULL;
2082 return buffer->ptr;
2083}
2084
2085static void *pv_mmu_read_buffer(struct kvm_pv_mmu_op_buffer *buffer,
2086 unsigned len)
2087{
2088 void *ret;
2089
2090 ret = pv_mmu_peek_buffer(buffer, len);
2091 if (!ret)
2092 return ret;
2093 buffer->ptr += len;
2094 buffer->len -= len;
2095 buffer->processed += len;
2096 return ret;
2097}
2098
2099static int kvm_pv_mmu_write(struct kvm_vcpu *vcpu,
2100 gpa_t addr, gpa_t value)
2101{
2102 int bytes = 8;
2103 int r;
2104
2105 if (!is_long_mode(vcpu) && !is_pae(vcpu))
2106 bytes = 4;
2107
2108 r = mmu_topup_memory_caches(vcpu);
2109 if (r)
2110 return r;
2111
2112 if (!emulator_write_phys(vcpu, addr, &value, bytes))
2113 return -EFAULT;
2114
2115 return 1;
2116}
2117
2118static int kvm_pv_mmu_flush_tlb(struct kvm_vcpu *vcpu)
2119{
2120 kvm_x86_ops->tlb_flush(vcpu);
2121 return 1;
2122}
2123
2124static int kvm_pv_mmu_release_pt(struct kvm_vcpu *vcpu, gpa_t addr)
2125{
2126 spin_lock(&vcpu->kvm->mmu_lock);
2127 mmu_unshadow(vcpu->kvm, addr >> PAGE_SHIFT);
2128 spin_unlock(&vcpu->kvm->mmu_lock);
2129 return 1;
2130}
2131
2132static int kvm_pv_mmu_op_one(struct kvm_vcpu *vcpu,
2133 struct kvm_pv_mmu_op_buffer *buffer)
2134{
2135 struct kvm_mmu_op_header *header;
2136
2137 header = pv_mmu_peek_buffer(buffer, sizeof *header);
2138 if (!header)
2139 return 0;
2140 switch (header->op) {
2141 case KVM_MMU_OP_WRITE_PTE: {
2142 struct kvm_mmu_op_write_pte *wpte;
2143
2144 wpte = pv_mmu_read_buffer(buffer, sizeof *wpte);
2145 if (!wpte)
2146 return 0;
2147 return kvm_pv_mmu_write(vcpu, wpte->pte_phys,
2148 wpte->pte_val);
2149 }
2150 case KVM_MMU_OP_FLUSH_TLB: {
2151 struct kvm_mmu_op_flush_tlb *ftlb;
2152
2153 ftlb = pv_mmu_read_buffer(buffer, sizeof *ftlb);
2154 if (!ftlb)
2155 return 0;
2156 return kvm_pv_mmu_flush_tlb(vcpu);
2157 }
2158 case KVM_MMU_OP_RELEASE_PT: {
2159 struct kvm_mmu_op_release_pt *rpt;
2160
2161 rpt = pv_mmu_read_buffer(buffer, sizeof *rpt);
2162 if (!rpt)
2163 return 0;
2164 return kvm_pv_mmu_release_pt(vcpu, rpt->pt_phys);
2165 }
2166 default: return 0;
2167 }
2168}
2169
2170int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes,
2171 gpa_t addr, unsigned long *ret)
2172{
2173 int r;
2174 struct kvm_pv_mmu_op_buffer buffer;
2175
2176 buffer.ptr = buffer.buf;
2177 buffer.len = min_t(unsigned long, bytes, sizeof buffer.buf);
2178 buffer.processed = 0;
2179
2180 r = kvm_read_guest(vcpu->kvm, addr, buffer.buf, buffer.len);
2181 if (r)
2182 goto out;
2183
2184 while (buffer.len) {
2185 r = kvm_pv_mmu_op_one(vcpu, &buffer);
2186 if (r < 0)
2187 goto out;
2188 if (r == 0)
2189 break;
2190 }
2191
2192 r = 1;
2193out:
2194 *ret = buffer.processed;
2195 return r;
2196}
2197
1735#ifdef AUDIT 2198#ifdef AUDIT
1736 2199
1737static const char *audit_msg; 2200static const char *audit_msg;
@@ -1768,8 +2231,7 @@ static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
1768 audit_mappings_page(vcpu, ent, va, level - 1); 2231 audit_mappings_page(vcpu, ent, va, level - 1);
1769 } else { 2232 } else {
1770 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, va); 2233 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, va);
1771 struct page *page = gpa_to_page(vcpu, gpa); 2234 hpa_t hpa = (hpa_t)gpa_to_pfn(vcpu, gpa) << PAGE_SHIFT;
1772 hpa_t hpa = page_to_phys(page);
1773 2235
1774 if (is_shadow_present_pte(ent) 2236 if (is_shadow_present_pte(ent)
1775 && (ent & PT64_BASE_ADDR_MASK) != hpa) 2237 && (ent & PT64_BASE_ADDR_MASK) != hpa)
@@ -1782,7 +2244,7 @@ static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
1782 && !is_error_hpa(hpa)) 2244 && !is_error_hpa(hpa))
1783 printk(KERN_ERR "audit: (%s) notrap shadow," 2245 printk(KERN_ERR "audit: (%s) notrap shadow,"
1784 " valid guest gva %lx\n", audit_msg, va); 2246 " valid guest gva %lx\n", audit_msg, va);
1785 kvm_release_page_clean(page); 2247 kvm_release_pfn_clean(pfn);
1786 2248
1787 } 2249 }
1788 } 2250 }
@@ -1867,7 +2329,7 @@ static void audit_rmap(struct kvm_vcpu *vcpu)
1867 2329
1868 if (n_rmap != n_actual) 2330 if (n_rmap != n_actual)
1869 printk(KERN_ERR "%s: (%s) rmap %d actual %d\n", 2331 printk(KERN_ERR "%s: (%s) rmap %d actual %d\n",
1870 __FUNCTION__, audit_msg, n_rmap, n_actual); 2332 __func__, audit_msg, n_rmap, n_actual);
1871} 2333}
1872 2334
1873static void audit_write_protection(struct kvm_vcpu *vcpu) 2335static void audit_write_protection(struct kvm_vcpu *vcpu)
@@ -1887,7 +2349,7 @@ static void audit_write_protection(struct kvm_vcpu *vcpu)
1887 if (*rmapp) 2349 if (*rmapp)
1888 printk(KERN_ERR "%s: (%s) shadow page has writable" 2350 printk(KERN_ERR "%s: (%s) shadow page has writable"
1889 " mappings: gfn %lx role %x\n", 2351 " mappings: gfn %lx role %x\n",
1890 __FUNCTION__, audit_msg, sp->gfn, 2352 __func__, audit_msg, sp->gfn,
1891 sp->role.word); 2353 sp->role.word);
1892 } 2354 }
1893} 2355}
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 1fce19ec7a23..e64e9f56a65e 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -3,6 +3,12 @@
3 3
4#include <linux/kvm_host.h> 4#include <linux/kvm_host.h>
5 5
6#ifdef CONFIG_X86_64
7#define TDP_ROOT_LEVEL PT64_ROOT_LEVEL
8#else
9#define TDP_ROOT_LEVEL PT32E_ROOT_LEVEL
10#endif
11
6static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) 12static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
7{ 13{
8 if (unlikely(vcpu->kvm->arch.n_free_mmu_pages < KVM_MIN_FREE_MMU_PAGES)) 14 if (unlikely(vcpu->kvm->arch.n_free_mmu_pages < KVM_MIN_FREE_MMU_PAGES))
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index ecc0856268c4..156fe10288ae 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -130,7 +130,7 @@ static int FNAME(walk_addr)(struct guest_walker *walker,
130 unsigned index, pt_access, pte_access; 130 unsigned index, pt_access, pte_access;
131 gpa_t pte_gpa; 131 gpa_t pte_gpa;
132 132
133 pgprintk("%s: addr %lx\n", __FUNCTION__, addr); 133 pgprintk("%s: addr %lx\n", __func__, addr);
134walk: 134walk:
135 walker->level = vcpu->arch.mmu.root_level; 135 walker->level = vcpu->arch.mmu.root_level;
136 pte = vcpu->arch.cr3; 136 pte = vcpu->arch.cr3;
@@ -155,7 +155,7 @@ walk:
155 pte_gpa += index * sizeof(pt_element_t); 155 pte_gpa += index * sizeof(pt_element_t);
156 walker->table_gfn[walker->level - 1] = table_gfn; 156 walker->table_gfn[walker->level - 1] = table_gfn;
157 walker->pte_gpa[walker->level - 1] = pte_gpa; 157 walker->pte_gpa[walker->level - 1] = pte_gpa;
158 pgprintk("%s: table_gfn[%d] %lx\n", __FUNCTION__, 158 pgprintk("%s: table_gfn[%d] %lx\n", __func__,
159 walker->level - 1, table_gfn); 159 walker->level - 1, table_gfn);
160 160
161 kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte)); 161 kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte));
@@ -222,7 +222,7 @@ walk:
222 walker->pt_access = pt_access; 222 walker->pt_access = pt_access;
223 walker->pte_access = pte_access; 223 walker->pte_access = pte_access;
224 pgprintk("%s: pte %llx pte_access %x pt_access %x\n", 224 pgprintk("%s: pte %llx pte_access %x pt_access %x\n",
225 __FUNCTION__, (u64)pte, pt_access, pte_access); 225 __func__, (u64)pte, pt_access, pte_access);
226 return 1; 226 return 1;
227 227
228not_present: 228not_present:
@@ -243,31 +243,30 @@ err:
243} 243}
244 244
245static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, 245static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
246 u64 *spte, const void *pte, int bytes, 246 u64 *spte, const void *pte)
247 int offset_in_pte)
248{ 247{
249 pt_element_t gpte; 248 pt_element_t gpte;
250 unsigned pte_access; 249 unsigned pte_access;
251 struct page *npage; 250 pfn_t pfn;
251 int largepage = vcpu->arch.update_pte.largepage;
252 252
253 gpte = *(const pt_element_t *)pte; 253 gpte = *(const pt_element_t *)pte;
254 if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) { 254 if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) {
255 if (!offset_in_pte && !is_present_pte(gpte)) 255 if (!is_present_pte(gpte))
256 set_shadow_pte(spte, shadow_notrap_nonpresent_pte); 256 set_shadow_pte(spte, shadow_notrap_nonpresent_pte);
257 return; 257 return;
258 } 258 }
259 if (bytes < sizeof(pt_element_t)) 259 pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);
260 return;
261 pgprintk("%s: gpte %llx spte %p\n", __FUNCTION__, (u64)gpte, spte);
262 pte_access = page->role.access & FNAME(gpte_access)(vcpu, gpte); 260 pte_access = page->role.access & FNAME(gpte_access)(vcpu, gpte);
263 if (gpte_to_gfn(gpte) != vcpu->arch.update_pte.gfn) 261 if (gpte_to_gfn(gpte) != vcpu->arch.update_pte.gfn)
264 return; 262 return;
265 npage = vcpu->arch.update_pte.page; 263 pfn = vcpu->arch.update_pte.pfn;
266 if (!npage) 264 if (is_error_pfn(pfn))
267 return; 265 return;
268 get_page(npage); 266 kvm_get_pfn(pfn);
269 mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0, 267 mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0,
270 gpte & PT_DIRTY_MASK, NULL, gpte_to_gfn(gpte), npage); 268 gpte & PT_DIRTY_MASK, NULL, largepage, gpte_to_gfn(gpte),
269 pfn, true);
271} 270}
272 271
273/* 272/*
@@ -275,8 +274,8 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
275 */ 274 */
276static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, 275static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
277 struct guest_walker *walker, 276 struct guest_walker *walker,
278 int user_fault, int write_fault, int *ptwrite, 277 int user_fault, int write_fault, int largepage,
279 struct page *page) 278 int *ptwrite, pfn_t pfn)
280{ 279{
281 hpa_t shadow_addr; 280 hpa_t shadow_addr;
282 int level; 281 int level;
@@ -304,11 +303,19 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
304 shadow_ent = ((u64 *)__va(shadow_addr)) + index; 303 shadow_ent = ((u64 *)__va(shadow_addr)) + index;
305 if (level == PT_PAGE_TABLE_LEVEL) 304 if (level == PT_PAGE_TABLE_LEVEL)
306 break; 305 break;
307 if (is_shadow_present_pte(*shadow_ent)) { 306
307 if (largepage && level == PT_DIRECTORY_LEVEL)
308 break;
309
310 if (is_shadow_present_pte(*shadow_ent)
311 && !is_large_pte(*shadow_ent)) {
308 shadow_addr = *shadow_ent & PT64_BASE_ADDR_MASK; 312 shadow_addr = *shadow_ent & PT64_BASE_ADDR_MASK;
309 continue; 313 continue;
310 } 314 }
311 315
316 if (is_large_pte(*shadow_ent))
317 rmap_remove(vcpu->kvm, shadow_ent);
318
312 if (level - 1 == PT_PAGE_TABLE_LEVEL 319 if (level - 1 == PT_PAGE_TABLE_LEVEL
313 && walker->level == PT_DIRECTORY_LEVEL) { 320 && walker->level == PT_DIRECTORY_LEVEL) {
314 metaphysical = 1; 321 metaphysical = 1;
@@ -329,7 +336,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
329 walker->pte_gpa[level - 2], 336 walker->pte_gpa[level - 2],
330 &curr_pte, sizeof(curr_pte)); 337 &curr_pte, sizeof(curr_pte));
331 if (r || curr_pte != walker->ptes[level - 2]) { 338 if (r || curr_pte != walker->ptes[level - 2]) {
332 kvm_release_page_clean(page); 339 kvm_release_pfn_clean(pfn);
333 return NULL; 340 return NULL;
334 } 341 }
335 } 342 }
@@ -342,7 +349,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
342 mmu_set_spte(vcpu, shadow_ent, access, walker->pte_access & access, 349 mmu_set_spte(vcpu, shadow_ent, access, walker->pte_access & access,
343 user_fault, write_fault, 350 user_fault, write_fault,
344 walker->ptes[walker->level-1] & PT_DIRTY_MASK, 351 walker->ptes[walker->level-1] & PT_DIRTY_MASK,
345 ptwrite, walker->gfn, page); 352 ptwrite, largepage, walker->gfn, pfn, false);
346 353
347 return shadow_ent; 354 return shadow_ent;
348} 355}
@@ -371,16 +378,16 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
371 u64 *shadow_pte; 378 u64 *shadow_pte;
372 int write_pt = 0; 379 int write_pt = 0;
373 int r; 380 int r;
374 struct page *page; 381 pfn_t pfn;
382 int largepage = 0;
375 383
376 pgprintk("%s: addr %lx err %x\n", __FUNCTION__, addr, error_code); 384 pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
377 kvm_mmu_audit(vcpu, "pre page fault"); 385 kvm_mmu_audit(vcpu, "pre page fault");
378 386
379 r = mmu_topup_memory_caches(vcpu); 387 r = mmu_topup_memory_caches(vcpu);
380 if (r) 388 if (r)
381 return r; 389 return r;
382 390
383 down_read(&vcpu->kvm->slots_lock);
384 /* 391 /*
385 * Look up the shadow pte for the faulting address. 392 * Look up the shadow pte for the faulting address.
386 */ 393 */
@@ -391,40 +398,45 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
391 * The page is not mapped by the guest. Let the guest handle it. 398 * The page is not mapped by the guest. Let the guest handle it.
392 */ 399 */
393 if (!r) { 400 if (!r) {
394 pgprintk("%s: guest page fault\n", __FUNCTION__); 401 pgprintk("%s: guest page fault\n", __func__);
395 inject_page_fault(vcpu, addr, walker.error_code); 402 inject_page_fault(vcpu, addr, walker.error_code);
396 vcpu->arch.last_pt_write_count = 0; /* reset fork detector */ 403 vcpu->arch.last_pt_write_count = 0; /* reset fork detector */
397 up_read(&vcpu->kvm->slots_lock);
398 return 0; 404 return 0;
399 } 405 }
400 406
401 down_read(&current->mm->mmap_sem); 407 down_read(&current->mm->mmap_sem);
402 page = gfn_to_page(vcpu->kvm, walker.gfn); 408 if (walker.level == PT_DIRECTORY_LEVEL) {
409 gfn_t large_gfn;
410 large_gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE-1);
411 if (is_largepage_backed(vcpu, large_gfn)) {
412 walker.gfn = large_gfn;
413 largepage = 1;
414 }
415 }
416 pfn = gfn_to_pfn(vcpu->kvm, walker.gfn);
403 up_read(&current->mm->mmap_sem); 417 up_read(&current->mm->mmap_sem);
404 418
419 /* mmio */
420 if (is_error_pfn(pfn)) {
421 pgprintk("gfn %x is mmio\n", walker.gfn);
422 kvm_release_pfn_clean(pfn);
423 return 1;
424 }
425
405 spin_lock(&vcpu->kvm->mmu_lock); 426 spin_lock(&vcpu->kvm->mmu_lock);
406 kvm_mmu_free_some_pages(vcpu); 427 kvm_mmu_free_some_pages(vcpu);
407 shadow_pte = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault, 428 shadow_pte = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
408 &write_pt, page); 429 largepage, &write_pt, pfn);
409 pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __FUNCTION__, 430
431 pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __func__,
410 shadow_pte, *shadow_pte, write_pt); 432 shadow_pte, *shadow_pte, write_pt);
411 433
412 if (!write_pt) 434 if (!write_pt)
413 vcpu->arch.last_pt_write_count = 0; /* reset fork detector */ 435 vcpu->arch.last_pt_write_count = 0; /* reset fork detector */
414 436
415 /*
416 * mmio: emulate if accessible, otherwise its a guest fault.
417 */
418 if (shadow_pte && is_io_pte(*shadow_pte)) {
419 spin_unlock(&vcpu->kvm->mmu_lock);
420 up_read(&vcpu->kvm->slots_lock);
421 return 1;
422 }
423
424 ++vcpu->stat.pf_fixed; 437 ++vcpu->stat.pf_fixed;
425 kvm_mmu_audit(vcpu, "post page fault (fixed)"); 438 kvm_mmu_audit(vcpu, "post page fault (fixed)");
426 spin_unlock(&vcpu->kvm->mmu_lock); 439 spin_unlock(&vcpu->kvm->mmu_lock);
427 up_read(&vcpu->kvm->slots_lock);
428 440
429 return write_pt; 441 return write_pt;
430} 442}
diff --git a/arch/x86/kvm/segment_descriptor.h b/arch/x86/kvm/segment_descriptor.h
deleted file mode 100644
index 56fc4c873389..000000000000
--- a/arch/x86/kvm/segment_descriptor.h
+++ /dev/null
@@ -1,29 +0,0 @@
1#ifndef __SEGMENT_DESCRIPTOR_H
2#define __SEGMENT_DESCRIPTOR_H
3
4struct segment_descriptor {
5 u16 limit_low;
6 u16 base_low;
7 u8 base_mid;
8 u8 type : 4;
9 u8 system : 1;
10 u8 dpl : 2;
11 u8 present : 1;
12 u8 limit_high : 4;
13 u8 avl : 1;
14 u8 long_mode : 1;
15 u8 default_op : 1;
16 u8 granularity : 1;
17 u8 base_high;
18} __attribute__((packed));
19
20#ifdef CONFIG_X86_64
21/* LDT or TSS descriptor in the GDT. 16 bytes. */
22struct segment_descriptor_64 {
23 struct segment_descriptor s;
24 u32 base_higher;
25 u32 pad_zero;
26};
27
28#endif
29#endif
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 1a582f1090e8..89e0be2c10d0 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -47,6 +47,18 @@ MODULE_LICENSE("GPL");
47#define SVM_FEATURE_LBRV (1 << 1) 47#define SVM_FEATURE_LBRV (1 << 1)
48#define SVM_DEATURE_SVML (1 << 2) 48#define SVM_DEATURE_SVML (1 << 2)
49 49
50#define DEBUGCTL_RESERVED_BITS (~(0x3fULL))
51
52/* enable NPT for AMD64 and X86 with PAE */
53#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
54static bool npt_enabled = true;
55#else
56static bool npt_enabled = false;
57#endif
58static int npt = 1;
59
60module_param(npt, int, S_IRUGO);
61
50static void kvm_reput_irq(struct vcpu_svm *svm); 62static void kvm_reput_irq(struct vcpu_svm *svm);
51 63
52static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu) 64static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu)
@@ -54,8 +66,7 @@ static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu)
54 return container_of(vcpu, struct vcpu_svm, vcpu); 66 return container_of(vcpu, struct vcpu_svm, vcpu);
55} 67}
56 68
57unsigned long iopm_base; 69static unsigned long iopm_base;
58unsigned long msrpm_base;
59 70
60struct kvm_ldttss_desc { 71struct kvm_ldttss_desc {
61 u16 limit0; 72 u16 limit0;
@@ -182,7 +193,7 @@ static inline void flush_guest_tlb(struct kvm_vcpu *vcpu)
182 193
183static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer) 194static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
184{ 195{
185 if (!(efer & EFER_LMA)) 196 if (!npt_enabled && !(efer & EFER_LMA))
186 efer &= ~EFER_LME; 197 efer &= ~EFER_LME;
187 198
188 to_svm(vcpu)->vmcb->save.efer = efer | MSR_EFER_SVME_MASK; 199 to_svm(vcpu)->vmcb->save.efer = efer | MSR_EFER_SVME_MASK;
@@ -219,12 +230,12 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
219 struct vcpu_svm *svm = to_svm(vcpu); 230 struct vcpu_svm *svm = to_svm(vcpu);
220 231
221 if (!svm->next_rip) { 232 if (!svm->next_rip) {
222 printk(KERN_DEBUG "%s: NOP\n", __FUNCTION__); 233 printk(KERN_DEBUG "%s: NOP\n", __func__);
223 return; 234 return;
224 } 235 }
225 if (svm->next_rip - svm->vmcb->save.rip > MAX_INST_SIZE) 236 if (svm->next_rip - svm->vmcb->save.rip > MAX_INST_SIZE)
226 printk(KERN_ERR "%s: ip 0x%llx next 0x%llx\n", 237 printk(KERN_ERR "%s: ip 0x%llx next 0x%llx\n",
227 __FUNCTION__, 238 __func__,
228 svm->vmcb->save.rip, 239 svm->vmcb->save.rip,
229 svm->next_rip); 240 svm->next_rip);
230 241
@@ -279,11 +290,7 @@ static void svm_hardware_enable(void *garbage)
279 290
280 struct svm_cpu_data *svm_data; 291 struct svm_cpu_data *svm_data;
281 uint64_t efer; 292 uint64_t efer;
282#ifdef CONFIG_X86_64
283 struct desc_ptr gdt_descr;
284#else
285 struct desc_ptr gdt_descr; 293 struct desc_ptr gdt_descr;
286#endif
287 struct desc_struct *gdt; 294 struct desc_struct *gdt;
288 int me = raw_smp_processor_id(); 295 int me = raw_smp_processor_id();
289 296
@@ -302,7 +309,6 @@ static void svm_hardware_enable(void *garbage)
302 svm_data->asid_generation = 1; 309 svm_data->asid_generation = 1;
303 svm_data->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1; 310 svm_data->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1;
304 svm_data->next_asid = svm_data->max_asid + 1; 311 svm_data->next_asid = svm_data->max_asid + 1;
305 svm_features = cpuid_edx(SVM_CPUID_FUNC);
306 312
307 asm volatile ("sgdt %0" : "=m"(gdt_descr)); 313 asm volatile ("sgdt %0" : "=m"(gdt_descr));
308 gdt = (struct desc_struct *)gdt_descr.address; 314 gdt = (struct desc_struct *)gdt_descr.address;
@@ -361,12 +367,51 @@ static void set_msr_interception(u32 *msrpm, unsigned msr,
361 BUG(); 367 BUG();
362} 368}
363 369
370static void svm_vcpu_init_msrpm(u32 *msrpm)
371{
372 memset(msrpm, 0xff, PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER));
373
374#ifdef CONFIG_X86_64
375 set_msr_interception(msrpm, MSR_GS_BASE, 1, 1);
376 set_msr_interception(msrpm, MSR_FS_BASE, 1, 1);
377 set_msr_interception(msrpm, MSR_KERNEL_GS_BASE, 1, 1);
378 set_msr_interception(msrpm, MSR_LSTAR, 1, 1);
379 set_msr_interception(msrpm, MSR_CSTAR, 1, 1);
380 set_msr_interception(msrpm, MSR_SYSCALL_MASK, 1, 1);
381#endif
382 set_msr_interception(msrpm, MSR_K6_STAR, 1, 1);
383 set_msr_interception(msrpm, MSR_IA32_SYSENTER_CS, 1, 1);
384 set_msr_interception(msrpm, MSR_IA32_SYSENTER_ESP, 1, 1);
385 set_msr_interception(msrpm, MSR_IA32_SYSENTER_EIP, 1, 1);
386}
387
388static void svm_enable_lbrv(struct vcpu_svm *svm)
389{
390 u32 *msrpm = svm->msrpm;
391
392 svm->vmcb->control.lbr_ctl = 1;
393 set_msr_interception(msrpm, MSR_IA32_LASTBRANCHFROMIP, 1, 1);
394 set_msr_interception(msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1);
395 set_msr_interception(msrpm, MSR_IA32_LASTINTFROMIP, 1, 1);
396 set_msr_interception(msrpm, MSR_IA32_LASTINTTOIP, 1, 1);
397}
398
399static void svm_disable_lbrv(struct vcpu_svm *svm)
400{
401 u32 *msrpm = svm->msrpm;
402
403 svm->vmcb->control.lbr_ctl = 0;
404 set_msr_interception(msrpm, MSR_IA32_LASTBRANCHFROMIP, 0, 0);
405 set_msr_interception(msrpm, MSR_IA32_LASTBRANCHTOIP, 0, 0);
406 set_msr_interception(msrpm, MSR_IA32_LASTINTFROMIP, 0, 0);
407 set_msr_interception(msrpm, MSR_IA32_LASTINTTOIP, 0, 0);
408}
409
364static __init int svm_hardware_setup(void) 410static __init int svm_hardware_setup(void)
365{ 411{
366 int cpu; 412 int cpu;
367 struct page *iopm_pages; 413 struct page *iopm_pages;
368 struct page *msrpm_pages; 414 void *iopm_va;
369 void *iopm_va, *msrpm_va;
370 int r; 415 int r;
371 416
372 iopm_pages = alloc_pages(GFP_KERNEL, IOPM_ALLOC_ORDER); 417 iopm_pages = alloc_pages(GFP_KERNEL, IOPM_ALLOC_ORDER);
@@ -379,41 +424,33 @@ static __init int svm_hardware_setup(void)
379 clear_bit(0x80, iopm_va); /* allow direct access to PC debug port */ 424 clear_bit(0x80, iopm_va); /* allow direct access to PC debug port */
380 iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT; 425 iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT;
381 426
427 if (boot_cpu_has(X86_FEATURE_NX))
428 kvm_enable_efer_bits(EFER_NX);
382 429
383 msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER); 430 for_each_online_cpu(cpu) {
431 r = svm_cpu_init(cpu);
432 if (r)
433 goto err;
434 }
384 435
385 r = -ENOMEM; 436 svm_features = cpuid_edx(SVM_CPUID_FUNC);
386 if (!msrpm_pages)
387 goto err_1;
388 437
389 msrpm_va = page_address(msrpm_pages); 438 if (!svm_has(SVM_FEATURE_NPT))
390 memset(msrpm_va, 0xff, PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER)); 439 npt_enabled = false;
391 msrpm_base = page_to_pfn(msrpm_pages) << PAGE_SHIFT;
392 440
393#ifdef CONFIG_X86_64 441 if (npt_enabled && !npt) {
394 set_msr_interception(msrpm_va, MSR_GS_BASE, 1, 1); 442 printk(KERN_INFO "kvm: Nested Paging disabled\n");
395 set_msr_interception(msrpm_va, MSR_FS_BASE, 1, 1); 443 npt_enabled = false;
396 set_msr_interception(msrpm_va, MSR_KERNEL_GS_BASE, 1, 1); 444 }
397 set_msr_interception(msrpm_va, MSR_LSTAR, 1, 1);
398 set_msr_interception(msrpm_va, MSR_CSTAR, 1, 1);
399 set_msr_interception(msrpm_va, MSR_SYSCALL_MASK, 1, 1);
400#endif
401 set_msr_interception(msrpm_va, MSR_K6_STAR, 1, 1);
402 set_msr_interception(msrpm_va, MSR_IA32_SYSENTER_CS, 1, 1);
403 set_msr_interception(msrpm_va, MSR_IA32_SYSENTER_ESP, 1, 1);
404 set_msr_interception(msrpm_va, MSR_IA32_SYSENTER_EIP, 1, 1);
405 445
406 for_each_online_cpu(cpu) { 446 if (npt_enabled) {
407 r = svm_cpu_init(cpu); 447 printk(KERN_INFO "kvm: Nested Paging enabled\n");
408 if (r) 448 kvm_enable_tdp();
409 goto err_2;
410 } 449 }
450
411 return 0; 451 return 0;
412 452
413err_2: 453err:
414 __free_pages(msrpm_pages, MSRPM_ALLOC_ORDER);
415 msrpm_base = 0;
416err_1:
417 __free_pages(iopm_pages, IOPM_ALLOC_ORDER); 454 __free_pages(iopm_pages, IOPM_ALLOC_ORDER);
418 iopm_base = 0; 455 iopm_base = 0;
419 return r; 456 return r;
@@ -421,9 +458,8 @@ err_1:
421 458
422static __exit void svm_hardware_unsetup(void) 459static __exit void svm_hardware_unsetup(void)
423{ 460{
424 __free_pages(pfn_to_page(msrpm_base >> PAGE_SHIFT), MSRPM_ALLOC_ORDER);
425 __free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT), IOPM_ALLOC_ORDER); 461 __free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT), IOPM_ALLOC_ORDER);
426 iopm_base = msrpm_base = 0; 462 iopm_base = 0;
427} 463}
428 464
429static void init_seg(struct vmcb_seg *seg) 465static void init_seg(struct vmcb_seg *seg)
@@ -443,15 +479,14 @@ static void init_sys_seg(struct vmcb_seg *seg, uint32_t type)
443 seg->base = 0; 479 seg->base = 0;
444} 480}
445 481
446static void init_vmcb(struct vmcb *vmcb) 482static void init_vmcb(struct vcpu_svm *svm)
447{ 483{
448 struct vmcb_control_area *control = &vmcb->control; 484 struct vmcb_control_area *control = &svm->vmcb->control;
449 struct vmcb_save_area *save = &vmcb->save; 485 struct vmcb_save_area *save = &svm->vmcb->save;
450 486
451 control->intercept_cr_read = INTERCEPT_CR0_MASK | 487 control->intercept_cr_read = INTERCEPT_CR0_MASK |
452 INTERCEPT_CR3_MASK | 488 INTERCEPT_CR3_MASK |
453 INTERCEPT_CR4_MASK | 489 INTERCEPT_CR4_MASK;
454 INTERCEPT_CR8_MASK;
455 490
456 control->intercept_cr_write = INTERCEPT_CR0_MASK | 491 control->intercept_cr_write = INTERCEPT_CR0_MASK |
457 INTERCEPT_CR3_MASK | 492 INTERCEPT_CR3_MASK |
@@ -471,23 +506,13 @@ static void init_vmcb(struct vmcb *vmcb)
471 INTERCEPT_DR7_MASK; 506 INTERCEPT_DR7_MASK;
472 507
473 control->intercept_exceptions = (1 << PF_VECTOR) | 508 control->intercept_exceptions = (1 << PF_VECTOR) |
474 (1 << UD_VECTOR); 509 (1 << UD_VECTOR) |
510 (1 << MC_VECTOR);
475 511
476 512
477 control->intercept = (1ULL << INTERCEPT_INTR) | 513 control->intercept = (1ULL << INTERCEPT_INTR) |
478 (1ULL << INTERCEPT_NMI) | 514 (1ULL << INTERCEPT_NMI) |
479 (1ULL << INTERCEPT_SMI) | 515 (1ULL << INTERCEPT_SMI) |
480 /*
481 * selective cr0 intercept bug?
482 * 0: 0f 22 d8 mov %eax,%cr3
483 * 3: 0f 20 c0 mov %cr0,%eax
484 * 6: 0d 00 00 00 80 or $0x80000000,%eax
485 * b: 0f 22 c0 mov %eax,%cr0
486 * set cr3 ->interception
487 * get cr0 ->interception
488 * set cr0 -> no interception
489 */
490 /* (1ULL << INTERCEPT_SELECTIVE_CR0) | */
491 (1ULL << INTERCEPT_CPUID) | 516 (1ULL << INTERCEPT_CPUID) |
492 (1ULL << INTERCEPT_INVD) | 517 (1ULL << INTERCEPT_INVD) |
493 (1ULL << INTERCEPT_HLT) | 518 (1ULL << INTERCEPT_HLT) |
@@ -508,7 +533,7 @@ static void init_vmcb(struct vmcb *vmcb)
508 (1ULL << INTERCEPT_MWAIT); 533 (1ULL << INTERCEPT_MWAIT);
509 534
510 control->iopm_base_pa = iopm_base; 535 control->iopm_base_pa = iopm_base;
511 control->msrpm_base_pa = msrpm_base; 536 control->msrpm_base_pa = __pa(svm->msrpm);
512 control->tsc_offset = 0; 537 control->tsc_offset = 0;
513 control->int_ctl = V_INTR_MASKING_MASK; 538 control->int_ctl = V_INTR_MASKING_MASK;
514 539
@@ -550,13 +575,30 @@ static void init_vmcb(struct vmcb *vmcb)
550 save->cr0 = 0x00000010 | X86_CR0_PG | X86_CR0_WP; 575 save->cr0 = 0x00000010 | X86_CR0_PG | X86_CR0_WP;
551 save->cr4 = X86_CR4_PAE; 576 save->cr4 = X86_CR4_PAE;
552 /* rdx = ?? */ 577 /* rdx = ?? */
578
579 if (npt_enabled) {
580 /* Setup VMCB for Nested Paging */
581 control->nested_ctl = 1;
582 control->intercept &= ~(1ULL << INTERCEPT_TASK_SWITCH);
583 control->intercept_exceptions &= ~(1 << PF_VECTOR);
584 control->intercept_cr_read &= ~(INTERCEPT_CR0_MASK|
585 INTERCEPT_CR3_MASK);
586 control->intercept_cr_write &= ~(INTERCEPT_CR0_MASK|
587 INTERCEPT_CR3_MASK);
588 save->g_pat = 0x0007040600070406ULL;
589 /* enable caching because the QEMU Bios doesn't enable it */
590 save->cr0 = X86_CR0_ET;
591 save->cr3 = 0;
592 save->cr4 = 0;
593 }
594 force_new_asid(&svm->vcpu);
553} 595}
554 596
555static int svm_vcpu_reset(struct kvm_vcpu *vcpu) 597static int svm_vcpu_reset(struct kvm_vcpu *vcpu)
556{ 598{
557 struct vcpu_svm *svm = to_svm(vcpu); 599 struct vcpu_svm *svm = to_svm(vcpu);
558 600
559 init_vmcb(svm->vmcb); 601 init_vmcb(svm);
560 602
561 if (vcpu->vcpu_id != 0) { 603 if (vcpu->vcpu_id != 0) {
562 svm->vmcb->save.rip = 0; 604 svm->vmcb->save.rip = 0;
@@ -571,6 +613,7 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
571{ 613{
572 struct vcpu_svm *svm; 614 struct vcpu_svm *svm;
573 struct page *page; 615 struct page *page;
616 struct page *msrpm_pages;
574 int err; 617 int err;
575 618
576 svm = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); 619 svm = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
@@ -589,12 +632,19 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
589 goto uninit; 632 goto uninit;
590 } 633 }
591 634
635 err = -ENOMEM;
636 msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER);
637 if (!msrpm_pages)
638 goto uninit;
639 svm->msrpm = page_address(msrpm_pages);
640 svm_vcpu_init_msrpm(svm->msrpm);
641
592 svm->vmcb = page_address(page); 642 svm->vmcb = page_address(page);
593 clear_page(svm->vmcb); 643 clear_page(svm->vmcb);
594 svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT; 644 svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT;
595 svm->asid_generation = 0; 645 svm->asid_generation = 0;
596 memset(svm->db_regs, 0, sizeof(svm->db_regs)); 646 memset(svm->db_regs, 0, sizeof(svm->db_regs));
597 init_vmcb(svm->vmcb); 647 init_vmcb(svm);
598 648
599 fx_init(&svm->vcpu); 649 fx_init(&svm->vcpu);
600 svm->vcpu.fpu_active = 1; 650 svm->vcpu.fpu_active = 1;
@@ -617,6 +667,7 @@ static void svm_free_vcpu(struct kvm_vcpu *vcpu)
617 struct vcpu_svm *svm = to_svm(vcpu); 667 struct vcpu_svm *svm = to_svm(vcpu);
618 668
619 __free_page(pfn_to_page(svm->vmcb_pa >> PAGE_SHIFT)); 669 __free_page(pfn_to_page(svm->vmcb_pa >> PAGE_SHIFT));
670 __free_pages(virt_to_page(svm->msrpm), MSRPM_ALLOC_ORDER);
620 kvm_vcpu_uninit(vcpu); 671 kvm_vcpu_uninit(vcpu);
621 kmem_cache_free(kvm_vcpu_cache, svm); 672 kmem_cache_free(kvm_vcpu_cache, svm);
622} 673}
@@ -731,6 +782,13 @@ static void svm_get_segment(struct kvm_vcpu *vcpu,
731 var->unusable = !var->present; 782 var->unusable = !var->present;
732} 783}
733 784
785static int svm_get_cpl(struct kvm_vcpu *vcpu)
786{
787 struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;
788
789 return save->cpl;
790}
791
734static void svm_get_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) 792static void svm_get_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
735{ 793{
736 struct vcpu_svm *svm = to_svm(vcpu); 794 struct vcpu_svm *svm = to_svm(vcpu);
@@ -784,6 +842,9 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
784 } 842 }
785 } 843 }
786#endif 844#endif
845 if (npt_enabled)
846 goto set;
847
787 if ((vcpu->arch.cr0 & X86_CR0_TS) && !(cr0 & X86_CR0_TS)) { 848 if ((vcpu->arch.cr0 & X86_CR0_TS) && !(cr0 & X86_CR0_TS)) {
788 svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR); 849 svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR);
789 vcpu->fpu_active = 1; 850 vcpu->fpu_active = 1;
@@ -791,18 +852,29 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
791 852
792 vcpu->arch.cr0 = cr0; 853 vcpu->arch.cr0 = cr0;
793 cr0 |= X86_CR0_PG | X86_CR0_WP; 854 cr0 |= X86_CR0_PG | X86_CR0_WP;
794 cr0 &= ~(X86_CR0_CD | X86_CR0_NW);
795 if (!vcpu->fpu_active) { 855 if (!vcpu->fpu_active) {
796 svm->vmcb->control.intercept_exceptions |= (1 << NM_VECTOR); 856 svm->vmcb->control.intercept_exceptions |= (1 << NM_VECTOR);
797 cr0 |= X86_CR0_TS; 857 cr0 |= X86_CR0_TS;
798 } 858 }
859set:
860 /*
861 * re-enable caching here because the QEMU bios
862 * does not do it - this results in some delay at
863 * reboot
864 */
865 cr0 &= ~(X86_CR0_CD | X86_CR0_NW);
799 svm->vmcb->save.cr0 = cr0; 866 svm->vmcb->save.cr0 = cr0;
800} 867}
801 868
802static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) 869static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
803{ 870{
804 vcpu->arch.cr4 = cr4; 871 unsigned long host_cr4_mce = read_cr4() & X86_CR4_MCE;
805 to_svm(vcpu)->vmcb->save.cr4 = cr4 | X86_CR4_PAE; 872
873 vcpu->arch.cr4 = cr4;
874 if (!npt_enabled)
875 cr4 |= X86_CR4_PAE;
876 cr4 |= host_cr4_mce;
877 to_svm(vcpu)->vmcb->save.cr4 = cr4;
806} 878}
807 879
808static void svm_set_segment(struct kvm_vcpu *vcpu, 880static void svm_set_segment(struct kvm_vcpu *vcpu,
@@ -833,13 +905,6 @@ static void svm_set_segment(struct kvm_vcpu *vcpu,
833 905
834} 906}
835 907
836/* FIXME:
837
838 svm(vcpu)->vmcb->control.int_ctl &= ~V_TPR_MASK;
839 svm(vcpu)->vmcb->control.int_ctl |= (sregs->cr8 & V_TPR_MASK);
840
841*/
842
843static int svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg) 908static int svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg)
844{ 909{
845 return -EOPNOTSUPP; 910 return -EOPNOTSUPP;
@@ -920,7 +985,7 @@ static void svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value,
920 } 985 }
921 default: 986 default:
922 printk(KERN_DEBUG "%s: unexpected dr %u\n", 987 printk(KERN_DEBUG "%s: unexpected dr %u\n",
923 __FUNCTION__, dr); 988 __func__, dr);
924 *exception = UD_VECTOR; 989 *exception = UD_VECTOR;
925 return; 990 return;
926 } 991 }
@@ -962,6 +1027,19 @@ static int nm_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
962 return 1; 1027 return 1;
963} 1028}
964 1029
1030static int mc_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1031{
1032 /*
1033 * On an #MC intercept the MCE handler is not called automatically in
1034 * the host. So do it by hand here.
1035 */
1036 asm volatile (
1037 "int $0x12\n");
1038 /* not sure if we ever come back to this point */
1039
1040 return 1;
1041}
1042
965static int shutdown_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1043static int shutdown_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
966{ 1044{
967 /* 1045 /*
@@ -969,7 +1047,7 @@ static int shutdown_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
969 * so reinitialize it. 1047 * so reinitialize it.
970 */ 1048 */
971 clear_page(svm->vmcb); 1049 clear_page(svm->vmcb);
972 init_vmcb(svm->vmcb); 1050 init_vmcb(svm);
973 1051
974 kvm_run->exit_reason = KVM_EXIT_SHUTDOWN; 1052 kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
975 return 0; 1053 return 0;
@@ -1033,9 +1111,18 @@ static int invalid_op_interception(struct vcpu_svm *svm,
1033static int task_switch_interception(struct vcpu_svm *svm, 1111static int task_switch_interception(struct vcpu_svm *svm,
1034 struct kvm_run *kvm_run) 1112 struct kvm_run *kvm_run)
1035{ 1113{
1036 pr_unimpl(&svm->vcpu, "%s: task switch is unsupported\n", __FUNCTION__); 1114 u16 tss_selector;
1037 kvm_run->exit_reason = KVM_EXIT_UNKNOWN; 1115
1038 return 0; 1116 tss_selector = (u16)svm->vmcb->control.exit_info_1;
1117 if (svm->vmcb->control.exit_info_2 &
1118 (1ULL << SVM_EXITINFOSHIFT_TS_REASON_IRET))
1119 return kvm_task_switch(&svm->vcpu, tss_selector,
1120 TASK_SWITCH_IRET);
1121 if (svm->vmcb->control.exit_info_2 &
1122 (1ULL << SVM_EXITINFOSHIFT_TS_REASON_JMP))
1123 return kvm_task_switch(&svm->vcpu, tss_selector,
1124 TASK_SWITCH_JMP);
1125 return kvm_task_switch(&svm->vcpu, tss_selector, TASK_SWITCH_CALL);
1039} 1126}
1040 1127
1041static int cpuid_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1128static int cpuid_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
@@ -1049,7 +1136,7 @@ static int emulate_on_interception(struct vcpu_svm *svm,
1049 struct kvm_run *kvm_run) 1136 struct kvm_run *kvm_run)
1050{ 1137{
1051 if (emulate_instruction(&svm->vcpu, NULL, 0, 0, 0) != EMULATE_DONE) 1138 if (emulate_instruction(&svm->vcpu, NULL, 0, 0, 0) != EMULATE_DONE)
1052 pr_unimpl(&svm->vcpu, "%s: failed\n", __FUNCTION__); 1139 pr_unimpl(&svm->vcpu, "%s: failed\n", __func__);
1053 return 1; 1140 return 1;
1054} 1141}
1055 1142
@@ -1179,8 +1266,19 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
1179 svm->vmcb->save.sysenter_esp = data; 1266 svm->vmcb->save.sysenter_esp = data;
1180 break; 1267 break;
1181 case MSR_IA32_DEBUGCTLMSR: 1268 case MSR_IA32_DEBUGCTLMSR:
1182 pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n", 1269 if (!svm_has(SVM_FEATURE_LBRV)) {
1183 __FUNCTION__, data); 1270 pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n",
1271 __func__, data);
1272 break;
1273 }
1274 if (data & DEBUGCTL_RESERVED_BITS)
1275 return 1;
1276
1277 svm->vmcb->save.dbgctl = data;
1278 if (data & (1ULL<<0))
1279 svm_enable_lbrv(svm);
1280 else
1281 svm_disable_lbrv(svm);
1184 break; 1282 break;
1185 case MSR_K7_EVNTSEL0: 1283 case MSR_K7_EVNTSEL0:
1186 case MSR_K7_EVNTSEL1: 1284 case MSR_K7_EVNTSEL1:
@@ -1265,6 +1363,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm,
1265 [SVM_EXIT_EXCP_BASE + UD_VECTOR] = ud_interception, 1363 [SVM_EXIT_EXCP_BASE + UD_VECTOR] = ud_interception,
1266 [SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception, 1364 [SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception,
1267 [SVM_EXIT_EXCP_BASE + NM_VECTOR] = nm_interception, 1365 [SVM_EXIT_EXCP_BASE + NM_VECTOR] = nm_interception,
1366 [SVM_EXIT_EXCP_BASE + MC_VECTOR] = mc_interception,
1268 [SVM_EXIT_INTR] = nop_on_interception, 1367 [SVM_EXIT_INTR] = nop_on_interception,
1269 [SVM_EXIT_NMI] = nop_on_interception, 1368 [SVM_EXIT_NMI] = nop_on_interception,
1270 [SVM_EXIT_SMI] = nop_on_interception, 1369 [SVM_EXIT_SMI] = nop_on_interception,
@@ -1290,14 +1389,34 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm,
1290 [SVM_EXIT_WBINVD] = emulate_on_interception, 1389 [SVM_EXIT_WBINVD] = emulate_on_interception,
1291 [SVM_EXIT_MONITOR] = invalid_op_interception, 1390 [SVM_EXIT_MONITOR] = invalid_op_interception,
1292 [SVM_EXIT_MWAIT] = invalid_op_interception, 1391 [SVM_EXIT_MWAIT] = invalid_op_interception,
1392 [SVM_EXIT_NPF] = pf_interception,
1293}; 1393};
1294 1394
1295
1296static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) 1395static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
1297{ 1396{
1298 struct vcpu_svm *svm = to_svm(vcpu); 1397 struct vcpu_svm *svm = to_svm(vcpu);
1299 u32 exit_code = svm->vmcb->control.exit_code; 1398 u32 exit_code = svm->vmcb->control.exit_code;
1300 1399
1400 if (npt_enabled) {
1401 int mmu_reload = 0;
1402 if ((vcpu->arch.cr0 ^ svm->vmcb->save.cr0) & X86_CR0_PG) {
1403 svm_set_cr0(vcpu, svm->vmcb->save.cr0);
1404 mmu_reload = 1;
1405 }
1406 vcpu->arch.cr0 = svm->vmcb->save.cr0;
1407 vcpu->arch.cr3 = svm->vmcb->save.cr3;
1408 if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) {
1409 if (!load_pdptrs(vcpu, vcpu->arch.cr3)) {
1410 kvm_inject_gp(vcpu, 0);
1411 return 1;
1412 }
1413 }
1414 if (mmu_reload) {
1415 kvm_mmu_reset_context(vcpu);
1416 kvm_mmu_load(vcpu);
1417 }
1418 }
1419
1301 kvm_reput_irq(svm); 1420 kvm_reput_irq(svm);
1302 1421
1303 if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) { 1422 if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) {
@@ -1308,10 +1427,11 @@ static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
1308 } 1427 }
1309 1428
1310 if (is_external_interrupt(svm->vmcb->control.exit_int_info) && 1429 if (is_external_interrupt(svm->vmcb->control.exit_int_info) &&
1311 exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR) 1430 exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR &&
1431 exit_code != SVM_EXIT_NPF)
1312 printk(KERN_ERR "%s: unexpected exit_ini_info 0x%x " 1432 printk(KERN_ERR "%s: unexpected exit_ini_info 0x%x "
1313 "exit_code 0x%x\n", 1433 "exit_code 0x%x\n",
1314 __FUNCTION__, svm->vmcb->control.exit_int_info, 1434 __func__, svm->vmcb->control.exit_int_info,
1315 exit_code); 1435 exit_code);
1316 1436
1317 if (exit_code >= ARRAY_SIZE(svm_exit_handlers) 1437 if (exit_code >= ARRAY_SIZE(svm_exit_handlers)
@@ -1364,6 +1484,27 @@ static void svm_set_irq(struct kvm_vcpu *vcpu, int irq)
1364 svm_inject_irq(svm, irq); 1484 svm_inject_irq(svm, irq);
1365} 1485}
1366 1486
1487static void update_cr8_intercept(struct kvm_vcpu *vcpu)
1488{
1489 struct vcpu_svm *svm = to_svm(vcpu);
1490 struct vmcb *vmcb = svm->vmcb;
1491 int max_irr, tpr;
1492
1493 if (!irqchip_in_kernel(vcpu->kvm) || vcpu->arch.apic->vapic_addr)
1494 return;
1495
1496 vmcb->control.intercept_cr_write &= ~INTERCEPT_CR8_MASK;
1497
1498 max_irr = kvm_lapic_find_highest_irr(vcpu);
1499 if (max_irr == -1)
1500 return;
1501
1502 tpr = kvm_lapic_get_cr8(vcpu) << 4;
1503
1504 if (tpr >= (max_irr & 0xf0))
1505 vmcb->control.intercept_cr_write |= INTERCEPT_CR8_MASK;
1506}
1507
1367static void svm_intr_assist(struct kvm_vcpu *vcpu) 1508static void svm_intr_assist(struct kvm_vcpu *vcpu)
1368{ 1509{
1369 struct vcpu_svm *svm = to_svm(vcpu); 1510 struct vcpu_svm *svm = to_svm(vcpu);
@@ -1376,14 +1517,14 @@ static void svm_intr_assist(struct kvm_vcpu *vcpu)
1376 SVM_EVTINJ_VEC_MASK; 1517 SVM_EVTINJ_VEC_MASK;
1377 vmcb->control.exit_int_info = 0; 1518 vmcb->control.exit_int_info = 0;
1378 svm_inject_irq(svm, intr_vector); 1519 svm_inject_irq(svm, intr_vector);
1379 return; 1520 goto out;
1380 } 1521 }
1381 1522
1382 if (vmcb->control.int_ctl & V_IRQ_MASK) 1523 if (vmcb->control.int_ctl & V_IRQ_MASK)
1383 return; 1524 goto out;
1384 1525
1385 if (!kvm_cpu_has_interrupt(vcpu)) 1526 if (!kvm_cpu_has_interrupt(vcpu))
1386 return; 1527 goto out;
1387 1528
1388 if (!(vmcb->save.rflags & X86_EFLAGS_IF) || 1529 if (!(vmcb->save.rflags & X86_EFLAGS_IF) ||
1389 (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) || 1530 (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) ||
@@ -1391,12 +1532,14 @@ static void svm_intr_assist(struct kvm_vcpu *vcpu)
1391 /* unable to deliver irq, set pending irq */ 1532 /* unable to deliver irq, set pending irq */
1392 vmcb->control.intercept |= (1ULL << INTERCEPT_VINTR); 1533 vmcb->control.intercept |= (1ULL << INTERCEPT_VINTR);
1393 svm_inject_irq(svm, 0x0); 1534 svm_inject_irq(svm, 0x0);
1394 return; 1535 goto out;
1395 } 1536 }
1396 /* Okay, we can deliver the interrupt: grab it and update PIC state. */ 1537 /* Okay, we can deliver the interrupt: grab it and update PIC state. */
1397 intr_vector = kvm_cpu_get_interrupt(vcpu); 1538 intr_vector = kvm_cpu_get_interrupt(vcpu);
1398 svm_inject_irq(svm, intr_vector); 1539 svm_inject_irq(svm, intr_vector);
1399 kvm_timer_intr_post(vcpu, intr_vector); 1540 kvm_timer_intr_post(vcpu, intr_vector);
1541out:
1542 update_cr8_intercept(vcpu);
1400} 1543}
1401 1544
1402static void kvm_reput_irq(struct vcpu_svm *svm) 1545static void kvm_reput_irq(struct vcpu_svm *svm)
@@ -1482,6 +1625,29 @@ static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu)
1482{ 1625{
1483} 1626}
1484 1627
1628static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu)
1629{
1630 struct vcpu_svm *svm = to_svm(vcpu);
1631
1632 if (!(svm->vmcb->control.intercept_cr_write & INTERCEPT_CR8_MASK)) {
1633 int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK;
1634 kvm_lapic_set_tpr(vcpu, cr8);
1635 }
1636}
1637
1638static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu)
1639{
1640 struct vcpu_svm *svm = to_svm(vcpu);
1641 u64 cr8;
1642
1643 if (!irqchip_in_kernel(vcpu->kvm))
1644 return;
1645
1646 cr8 = kvm_get_cr8(vcpu);
1647 svm->vmcb->control.int_ctl &= ~V_TPR_MASK;
1648 svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK;
1649}
1650
1485static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 1651static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1486{ 1652{
1487 struct vcpu_svm *svm = to_svm(vcpu); 1653 struct vcpu_svm *svm = to_svm(vcpu);
@@ -1491,6 +1657,8 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1491 1657
1492 pre_svm_run(svm); 1658 pre_svm_run(svm);
1493 1659
1660 sync_lapic_to_cr8(vcpu);
1661
1494 save_host_msrs(vcpu); 1662 save_host_msrs(vcpu);
1495 fs_selector = read_fs(); 1663 fs_selector = read_fs();
1496 gs_selector = read_gs(); 1664 gs_selector = read_gs();
@@ -1499,6 +1667,9 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1499 svm->host_dr6 = read_dr6(); 1667 svm->host_dr6 = read_dr6();
1500 svm->host_dr7 = read_dr7(); 1668 svm->host_dr7 = read_dr7();
1501 svm->vmcb->save.cr2 = vcpu->arch.cr2; 1669 svm->vmcb->save.cr2 = vcpu->arch.cr2;
1670 /* required for live migration with NPT */
1671 if (npt_enabled)
1672 svm->vmcb->save.cr3 = vcpu->arch.cr3;
1502 1673
1503 if (svm->vmcb->save.dr7 & 0xff) { 1674 if (svm->vmcb->save.dr7 & 0xff) {
1504 write_dr7(0); 1675 write_dr7(0);
@@ -1635,6 +1806,8 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1635 1806
1636 stgi(); 1807 stgi();
1637 1808
1809 sync_cr8_to_lapic(vcpu);
1810
1638 svm->next_rip = 0; 1811 svm->next_rip = 0;
1639} 1812}
1640 1813
@@ -1642,6 +1815,12 @@ static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root)
1642{ 1815{
1643 struct vcpu_svm *svm = to_svm(vcpu); 1816 struct vcpu_svm *svm = to_svm(vcpu);
1644 1817
1818 if (npt_enabled) {
1819 svm->vmcb->control.nested_cr3 = root;
1820 force_new_asid(vcpu);
1821 return;
1822 }
1823
1645 svm->vmcb->save.cr3 = root; 1824 svm->vmcb->save.cr3 = root;
1646 force_new_asid(vcpu); 1825 force_new_asid(vcpu);
1647 1826
@@ -1709,6 +1888,7 @@ static struct kvm_x86_ops svm_x86_ops = {
1709 .get_segment_base = svm_get_segment_base, 1888 .get_segment_base = svm_get_segment_base,
1710 .get_segment = svm_get_segment, 1889 .get_segment = svm_get_segment,
1711 .set_segment = svm_set_segment, 1890 .set_segment = svm_set_segment,
1891 .get_cpl = svm_get_cpl,
1712 .get_cs_db_l_bits = kvm_get_cs_db_l_bits, 1892 .get_cs_db_l_bits = kvm_get_cs_db_l_bits,
1713 .decache_cr4_guest_bits = svm_decache_cr4_guest_bits, 1893 .decache_cr4_guest_bits = svm_decache_cr4_guest_bits,
1714 .set_cr0 = svm_set_cr0, 1894 .set_cr0 = svm_set_cr0,
diff --git a/arch/x86/kvm/svm.h b/arch/x86/kvm/svm.h
index 5fd50491b555..1b8afa78e869 100644
--- a/arch/x86/kvm/svm.h
+++ b/arch/x86/kvm/svm.h
@@ -238,6 +238,9 @@ struct __attribute__ ((__packed__)) vmcb {
238#define SVM_EXITINTINFO_VALID SVM_EVTINJ_VALID 238#define SVM_EXITINTINFO_VALID SVM_EVTINJ_VALID
239#define SVM_EXITINTINFO_VALID_ERR SVM_EVTINJ_VALID_ERR 239#define SVM_EXITINTINFO_VALID_ERR SVM_EVTINJ_VALID_ERR
240 240
241#define SVM_EXITINFOSHIFT_TS_REASON_IRET 36
242#define SVM_EXITINFOSHIFT_TS_REASON_JMP 38
243
241#define SVM_EXIT_READ_CR0 0x000 244#define SVM_EXIT_READ_CR0 0x000
242#define SVM_EXIT_READ_CR3 0x003 245#define SVM_EXIT_READ_CR3 0x003
243#define SVM_EXIT_READ_CR4 0x004 246#define SVM_EXIT_READ_CR4 0x004
diff --git a/arch/x86/kvm/tss.h b/arch/x86/kvm/tss.h
new file mode 100644
index 000000000000..622aa10f692f
--- /dev/null
+++ b/arch/x86/kvm/tss.h
@@ -0,0 +1,59 @@
1#ifndef __TSS_SEGMENT_H
2#define __TSS_SEGMENT_H
3
4struct tss_segment_32 {
5 u32 prev_task_link;
6 u32 esp0;
7 u32 ss0;
8 u32 esp1;
9 u32 ss1;
10 u32 esp2;
11 u32 ss2;
12 u32 cr3;
13 u32 eip;
14 u32 eflags;
15 u32 eax;
16 u32 ecx;
17 u32 edx;
18 u32 ebx;
19 u32 esp;
20 u32 ebp;
21 u32 esi;
22 u32 edi;
23 u32 es;
24 u32 cs;
25 u32 ss;
26 u32 ds;
27 u32 fs;
28 u32 gs;
29 u32 ldt_selector;
30 u16 t;
31 u16 io_map;
32};
33
34struct tss_segment_16 {
35 u16 prev_task_link;
36 u16 sp0;
37 u16 ss0;
38 u16 sp1;
39 u16 ss1;
40 u16 sp2;
41 u16 ss2;
42 u16 ip;
43 u16 flag;
44 u16 ax;
45 u16 cx;
46 u16 dx;
47 u16 bx;
48 u16 sp;
49 u16 bp;
50 u16 si;
51 u16 di;
52 u16 es;
53 u16 cs;
54 u16 ss;
55 u16 ds;
56 u16 ldt;
57};
58
59#endif
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 8e1462880d1f..8e5d6645b90d 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -17,7 +17,6 @@
17 17
18#include "irq.h" 18#include "irq.h"
19#include "vmx.h" 19#include "vmx.h"
20#include "segment_descriptor.h"
21#include "mmu.h" 20#include "mmu.h"
22 21
23#include <linux/kvm_host.h> 22#include <linux/kvm_host.h>
@@ -37,6 +36,12 @@ MODULE_LICENSE("GPL");
37static int bypass_guest_pf = 1; 36static int bypass_guest_pf = 1;
38module_param(bypass_guest_pf, bool, 0); 37module_param(bypass_guest_pf, bool, 0);
39 38
39static int enable_vpid = 1;
40module_param(enable_vpid, bool, 0);
41
42static int flexpriority_enabled = 1;
43module_param(flexpriority_enabled, bool, 0);
44
40struct vmcs { 45struct vmcs {
41 u32 revision_id; 46 u32 revision_id;
42 u32 abort; 47 u32 abort;
@@ -71,6 +76,7 @@ struct vcpu_vmx {
71 unsigned rip; 76 unsigned rip;
72 } irq; 77 } irq;
73 } rmode; 78 } rmode;
79 int vpid;
74}; 80};
75 81
76static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) 82static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
@@ -85,6 +91,10 @@ static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
85 91
86static struct page *vmx_io_bitmap_a; 92static struct page *vmx_io_bitmap_a;
87static struct page *vmx_io_bitmap_b; 93static struct page *vmx_io_bitmap_b;
94static struct page *vmx_msr_bitmap;
95
96static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS);
97static DEFINE_SPINLOCK(vmx_vpid_lock);
88 98
89static struct vmcs_config { 99static struct vmcs_config {
90 int size; 100 int size;
@@ -176,6 +186,11 @@ static inline int is_external_interrupt(u32 intr_info)
176 == (INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK); 186 == (INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK);
177} 187}
178 188
189static inline int cpu_has_vmx_msr_bitmap(void)
190{
191 return (vmcs_config.cpu_based_exec_ctrl & CPU_BASED_USE_MSR_BITMAPS);
192}
193
179static inline int cpu_has_vmx_tpr_shadow(void) 194static inline int cpu_has_vmx_tpr_shadow(void)
180{ 195{
181 return (vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW); 196 return (vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW);
@@ -194,8 +209,9 @@ static inline int cpu_has_secondary_exec_ctrls(void)
194 209
195static inline bool cpu_has_vmx_virtualize_apic_accesses(void) 210static inline bool cpu_has_vmx_virtualize_apic_accesses(void)
196{ 211{
197 return (vmcs_config.cpu_based_2nd_exec_ctrl & 212 return flexpriority_enabled
198 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES); 213 && (vmcs_config.cpu_based_2nd_exec_ctrl &
214 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
199} 215}
200 216
201static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm) 217static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm)
@@ -204,6 +220,12 @@ static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm)
204 (irqchip_in_kernel(kvm))); 220 (irqchip_in_kernel(kvm)));
205} 221}
206 222
223static inline int cpu_has_vmx_vpid(void)
224{
225 return (vmcs_config.cpu_based_2nd_exec_ctrl &
226 SECONDARY_EXEC_ENABLE_VPID);
227}
228
207static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr) 229static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
208{ 230{
209 int i; 231 int i;
@@ -214,6 +236,20 @@ static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
214 return -1; 236 return -1;
215} 237}
216 238
239static inline void __invvpid(int ext, u16 vpid, gva_t gva)
240{
241 struct {
242 u64 vpid : 16;
243 u64 rsvd : 48;
244 u64 gva;
245 } operand = { vpid, 0, gva };
246
247 asm volatile (ASM_VMX_INVVPID
248 /* CF==1 or ZF==1 --> rc = -1 */
249 "; ja 1f ; ud2 ; 1:"
250 : : "a"(&operand), "c"(ext) : "cc", "memory");
251}
252
217static struct kvm_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr) 253static struct kvm_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr)
218{ 254{
219 int i; 255 int i;
@@ -257,6 +293,14 @@ static void vcpu_clear(struct vcpu_vmx *vmx)
257 vmx->launched = 0; 293 vmx->launched = 0;
258} 294}
259 295
296static inline void vpid_sync_vcpu_all(struct vcpu_vmx *vmx)
297{
298 if (vmx->vpid == 0)
299 return;
300
301 __invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vmx->vpid, 0);
302}
303
260static unsigned long vmcs_readl(unsigned long field) 304static unsigned long vmcs_readl(unsigned long field)
261{ 305{
262 unsigned long value; 306 unsigned long value;
@@ -353,7 +397,7 @@ static void reload_tss(void)
353 * VT restores TR but not its size. Useless. 397 * VT restores TR but not its size. Useless.
354 */ 398 */
355 struct descriptor_table gdt; 399 struct descriptor_table gdt;
356 struct segment_descriptor *descs; 400 struct desc_struct *descs;
357 401
358 get_gdt(&gdt); 402 get_gdt(&gdt);
359 descs = (void *)gdt.base; 403 descs = (void *)gdt.base;
@@ -485,11 +529,12 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
485{ 529{
486 struct vcpu_vmx *vmx = to_vmx(vcpu); 530 struct vcpu_vmx *vmx = to_vmx(vcpu);
487 u64 phys_addr = __pa(vmx->vmcs); 531 u64 phys_addr = __pa(vmx->vmcs);
488 u64 tsc_this, delta; 532 u64 tsc_this, delta, new_offset;
489 533
490 if (vcpu->cpu != cpu) { 534 if (vcpu->cpu != cpu) {
491 vcpu_clear(vmx); 535 vcpu_clear(vmx);
492 kvm_migrate_apic_timer(vcpu); 536 kvm_migrate_apic_timer(vcpu);
537 vpid_sync_vcpu_all(vmx);
493 } 538 }
494 539
495 if (per_cpu(current_vmcs, cpu) != vmx->vmcs) { 540 if (per_cpu(current_vmcs, cpu) != vmx->vmcs) {
@@ -524,8 +569,11 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
524 * Make sure the time stamp counter is monotonous. 569 * Make sure the time stamp counter is monotonous.
525 */ 570 */
526 rdtscll(tsc_this); 571 rdtscll(tsc_this);
527 delta = vcpu->arch.host_tsc - tsc_this; 572 if (tsc_this < vcpu->arch.host_tsc) {
528 vmcs_write64(TSC_OFFSET, vmcs_read64(TSC_OFFSET) + delta); 573 delta = vcpu->arch.host_tsc - tsc_this;
574 new_offset = vmcs_read64(TSC_OFFSET) + delta;
575 vmcs_write64(TSC_OFFSET, new_offset);
576 }
529 } 577 }
530} 578}
531 579
@@ -596,7 +644,7 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
596{ 644{
597 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 645 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
598 nr | INTR_TYPE_EXCEPTION 646 nr | INTR_TYPE_EXCEPTION
599 | (has_error_code ? INTR_INFO_DELIEVER_CODE_MASK : 0) 647 | (has_error_code ? INTR_INFO_DELIVER_CODE_MASK : 0)
600 | INTR_INFO_VALID_MASK); 648 | INTR_INFO_VALID_MASK);
601 if (has_error_code) 649 if (has_error_code)
602 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code); 650 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
@@ -959,6 +1007,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
959 CPU_BASED_MOV_DR_EXITING | 1007 CPU_BASED_MOV_DR_EXITING |
960 CPU_BASED_USE_TSC_OFFSETING; 1008 CPU_BASED_USE_TSC_OFFSETING;
961 opt = CPU_BASED_TPR_SHADOW | 1009 opt = CPU_BASED_TPR_SHADOW |
1010 CPU_BASED_USE_MSR_BITMAPS |
962 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; 1011 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
963 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS, 1012 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS,
964 &_cpu_based_exec_control) < 0) 1013 &_cpu_based_exec_control) < 0)
@@ -971,7 +1020,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
971 if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) { 1020 if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) {
972 min = 0; 1021 min = 0;
973 opt = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | 1022 opt = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
974 SECONDARY_EXEC_WBINVD_EXITING; 1023 SECONDARY_EXEC_WBINVD_EXITING |
1024 SECONDARY_EXEC_ENABLE_VPID;
975 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS2, 1025 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS2,
976 &_cpu_based_2nd_exec_control) < 0) 1026 &_cpu_based_2nd_exec_control) < 0)
977 return -EIO; 1027 return -EIO;
@@ -1080,6 +1130,10 @@ static __init int hardware_setup(void)
1080{ 1130{
1081 if (setup_vmcs_config(&vmcs_config) < 0) 1131 if (setup_vmcs_config(&vmcs_config) < 0)
1082 return -EIO; 1132 return -EIO;
1133
1134 if (boot_cpu_has(X86_FEATURE_NX))
1135 kvm_enable_efer_bits(EFER_NX);
1136
1083 return alloc_kvm_area(); 1137 return alloc_kvm_area();
1084} 1138}
1085 1139
@@ -1214,7 +1268,7 @@ static void enter_lmode(struct kvm_vcpu *vcpu)
1214 guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES); 1268 guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES);
1215 if ((guest_tr_ar & AR_TYPE_MASK) != AR_TYPE_BUSY_64_TSS) { 1269 if ((guest_tr_ar & AR_TYPE_MASK) != AR_TYPE_BUSY_64_TSS) {
1216 printk(KERN_DEBUG "%s: tss fixup for long mode. \n", 1270 printk(KERN_DEBUG "%s: tss fixup for long mode. \n",
1217 __FUNCTION__); 1271 __func__);
1218 vmcs_write32(GUEST_TR_AR_BYTES, 1272 vmcs_write32(GUEST_TR_AR_BYTES,
1219 (guest_tr_ar & ~AR_TYPE_MASK) 1273 (guest_tr_ar & ~AR_TYPE_MASK)
1220 | AR_TYPE_BUSY_64_TSS); 1274 | AR_TYPE_BUSY_64_TSS);
@@ -1239,6 +1293,11 @@ static void exit_lmode(struct kvm_vcpu *vcpu)
1239 1293
1240#endif 1294#endif
1241 1295
1296static void vmx_flush_tlb(struct kvm_vcpu *vcpu)
1297{
1298 vpid_sync_vcpu_all(to_vmx(vcpu));
1299}
1300
1242static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) 1301static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
1243{ 1302{
1244 vcpu->arch.cr4 &= KVM_GUEST_CR4_MASK; 1303 vcpu->arch.cr4 &= KVM_GUEST_CR4_MASK;
@@ -1275,6 +1334,7 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
1275 1334
1276static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) 1335static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
1277{ 1336{
1337 vmx_flush_tlb(vcpu);
1278 vmcs_writel(GUEST_CR3, cr3); 1338 vmcs_writel(GUEST_CR3, cr3);
1279 if (vcpu->arch.cr0 & X86_CR0_PE) 1339 if (vcpu->arch.cr0 & X86_CR0_PE)
1280 vmx_fpu_deactivate(vcpu); 1340 vmx_fpu_deactivate(vcpu);
@@ -1288,14 +1348,14 @@ static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
1288 vcpu->arch.cr4 = cr4; 1348 vcpu->arch.cr4 = cr4;
1289} 1349}
1290 1350
1291#ifdef CONFIG_X86_64
1292
1293static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) 1351static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
1294{ 1352{
1295 struct vcpu_vmx *vmx = to_vmx(vcpu); 1353 struct vcpu_vmx *vmx = to_vmx(vcpu);
1296 struct kvm_msr_entry *msr = find_msr_entry(vmx, MSR_EFER); 1354 struct kvm_msr_entry *msr = find_msr_entry(vmx, MSR_EFER);
1297 1355
1298 vcpu->arch.shadow_efer = efer; 1356 vcpu->arch.shadow_efer = efer;
1357 if (!msr)
1358 return;
1299 if (efer & EFER_LMA) { 1359 if (efer & EFER_LMA) {
1300 vmcs_write32(VM_ENTRY_CONTROLS, 1360 vmcs_write32(VM_ENTRY_CONTROLS,
1301 vmcs_read32(VM_ENTRY_CONTROLS) | 1361 vmcs_read32(VM_ENTRY_CONTROLS) |
@@ -1312,8 +1372,6 @@ static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
1312 setup_msrs(vmx); 1372 setup_msrs(vmx);
1313} 1373}
1314 1374
1315#endif
1316
1317static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg) 1375static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg)
1318{ 1376{
1319 struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; 1377 struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
@@ -1344,6 +1402,20 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu,
1344 var->unusable = (ar >> 16) & 1; 1402 var->unusable = (ar >> 16) & 1;
1345} 1403}
1346 1404
1405static int vmx_get_cpl(struct kvm_vcpu *vcpu)
1406{
1407 struct kvm_segment kvm_seg;
1408
1409 if (!(vcpu->arch.cr0 & X86_CR0_PE)) /* if real mode */
1410 return 0;
1411
1412 if (vmx_get_rflags(vcpu) & X86_EFLAGS_VM) /* if virtual 8086 */
1413 return 3;
1414
1415 vmx_get_segment(vcpu, &kvm_seg, VCPU_SREG_CS);
1416 return kvm_seg.selector & 3;
1417}
1418
1347static u32 vmx_segment_access_rights(struct kvm_segment *var) 1419static u32 vmx_segment_access_rights(struct kvm_segment *var)
1348{ 1420{
1349 u32 ar; 1421 u32 ar;
@@ -1433,7 +1505,6 @@ static int init_rmode_tss(struct kvm *kvm)
1433 int ret = 0; 1505 int ret = 0;
1434 int r; 1506 int r;
1435 1507
1436 down_read(&kvm->slots_lock);
1437 r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE); 1508 r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
1438 if (r < 0) 1509 if (r < 0)
1439 goto out; 1510 goto out;
@@ -1456,7 +1527,6 @@ static int init_rmode_tss(struct kvm *kvm)
1456 1527
1457 ret = 1; 1528 ret = 1;
1458out: 1529out:
1459 up_read(&kvm->slots_lock);
1460 return ret; 1530 return ret;
1461} 1531}
1462 1532
@@ -1494,6 +1564,46 @@ out:
1494 return r; 1564 return r;
1495} 1565}
1496 1566
1567static void allocate_vpid(struct vcpu_vmx *vmx)
1568{
1569 int vpid;
1570
1571 vmx->vpid = 0;
1572 if (!enable_vpid || !cpu_has_vmx_vpid())
1573 return;
1574 spin_lock(&vmx_vpid_lock);
1575 vpid = find_first_zero_bit(vmx_vpid_bitmap, VMX_NR_VPIDS);
1576 if (vpid < VMX_NR_VPIDS) {
1577 vmx->vpid = vpid;
1578 __set_bit(vpid, vmx_vpid_bitmap);
1579 }
1580 spin_unlock(&vmx_vpid_lock);
1581}
1582
1583void vmx_disable_intercept_for_msr(struct page *msr_bitmap, u32 msr)
1584{
1585 void *va;
1586
1587 if (!cpu_has_vmx_msr_bitmap())
1588 return;
1589
1590 /*
1591 * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
1592 * have the write-low and read-high bitmap offsets the wrong way round.
1593 * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
1594 */
1595 va = kmap(msr_bitmap);
1596 if (msr <= 0x1fff) {
1597 __clear_bit(msr, va + 0x000); /* read-low */
1598 __clear_bit(msr, va + 0x800); /* write-low */
1599 } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
1600 msr &= 0x1fff;
1601 __clear_bit(msr, va + 0x400); /* read-high */
1602 __clear_bit(msr, va + 0xc00); /* write-high */
1603 }
1604 kunmap(msr_bitmap);
1605}
1606
1497/* 1607/*
1498 * Sets up the vmcs for emulated real mode. 1608 * Sets up the vmcs for emulated real mode.
1499 */ 1609 */
@@ -1511,6 +1621,9 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
1511 vmcs_write64(IO_BITMAP_A, page_to_phys(vmx_io_bitmap_a)); 1621 vmcs_write64(IO_BITMAP_A, page_to_phys(vmx_io_bitmap_a));
1512 vmcs_write64(IO_BITMAP_B, page_to_phys(vmx_io_bitmap_b)); 1622 vmcs_write64(IO_BITMAP_B, page_to_phys(vmx_io_bitmap_b));
1513 1623
1624 if (cpu_has_vmx_msr_bitmap())
1625 vmcs_write64(MSR_BITMAP, page_to_phys(vmx_msr_bitmap));
1626
1514 vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */ 1627 vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */
1515 1628
1516 /* Control */ 1629 /* Control */
@@ -1532,6 +1645,8 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
1532 if (!vm_need_virtualize_apic_accesses(vmx->vcpu.kvm)) 1645 if (!vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))
1533 exec_control &= 1646 exec_control &=
1534 ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; 1647 ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
1648 if (vmx->vpid == 0)
1649 exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
1535 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); 1650 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
1536 } 1651 }
1537 1652
@@ -1613,6 +1728,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
1613 u64 msr; 1728 u64 msr;
1614 int ret; 1729 int ret;
1615 1730
1731 down_read(&vcpu->kvm->slots_lock);
1616 if (!init_rmode_tss(vmx->vcpu.kvm)) { 1732 if (!init_rmode_tss(vmx->vcpu.kvm)) {
1617 ret = -ENOMEM; 1733 ret = -ENOMEM;
1618 goto out; 1734 goto out;
@@ -1621,7 +1737,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
1621 vmx->vcpu.arch.rmode.active = 0; 1737 vmx->vcpu.arch.rmode.active = 0;
1622 1738
1623 vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val(); 1739 vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val();
1624 set_cr8(&vmx->vcpu, 0); 1740 kvm_set_cr8(&vmx->vcpu, 0);
1625 msr = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; 1741 msr = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
1626 if (vmx->vcpu.vcpu_id == 0) 1742 if (vmx->vcpu.vcpu_id == 0)
1627 msr |= MSR_IA32_APICBASE_BSP; 1743 msr |= MSR_IA32_APICBASE_BSP;
@@ -1704,18 +1820,22 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
1704 vmcs_write64(APIC_ACCESS_ADDR, 1820 vmcs_write64(APIC_ACCESS_ADDR,
1705 page_to_phys(vmx->vcpu.kvm->arch.apic_access_page)); 1821 page_to_phys(vmx->vcpu.kvm->arch.apic_access_page));
1706 1822
1823 if (vmx->vpid != 0)
1824 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
1825
1707 vmx->vcpu.arch.cr0 = 0x60000010; 1826 vmx->vcpu.arch.cr0 = 0x60000010;
1708 vmx_set_cr0(&vmx->vcpu, vmx->vcpu.arch.cr0); /* enter rmode */ 1827 vmx_set_cr0(&vmx->vcpu, vmx->vcpu.arch.cr0); /* enter rmode */
1709 vmx_set_cr4(&vmx->vcpu, 0); 1828 vmx_set_cr4(&vmx->vcpu, 0);
1710#ifdef CONFIG_X86_64
1711 vmx_set_efer(&vmx->vcpu, 0); 1829 vmx_set_efer(&vmx->vcpu, 0);
1712#endif
1713 vmx_fpu_activate(&vmx->vcpu); 1830 vmx_fpu_activate(&vmx->vcpu);
1714 update_exception_bitmap(&vmx->vcpu); 1831 update_exception_bitmap(&vmx->vcpu);
1715 1832
1716 return 0; 1833 vpid_sync_vcpu_all(vmx);
1834
1835 ret = 0;
1717 1836
1718out: 1837out:
1838 up_read(&vcpu->kvm->slots_lock);
1719 return ret; 1839 return ret;
1720} 1840}
1721 1841
@@ -1723,6 +1843,8 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq)
1723{ 1843{
1724 struct vcpu_vmx *vmx = to_vmx(vcpu); 1844 struct vcpu_vmx *vmx = to_vmx(vcpu);
1725 1845
1846 KVMTRACE_1D(INJ_VIRQ, vcpu, (u32)irq, handler);
1847
1726 if (vcpu->arch.rmode.active) { 1848 if (vcpu->arch.rmode.active) {
1727 vmx->rmode.irq.pending = true; 1849 vmx->rmode.irq.pending = true;
1728 vmx->rmode.irq.vector = irq; 1850 vmx->rmode.irq.vector = irq;
@@ -1844,7 +1966,7 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1844 if ((vect_info & VECTORING_INFO_VALID_MASK) && 1966 if ((vect_info & VECTORING_INFO_VALID_MASK) &&
1845 !is_page_fault(intr_info)) 1967 !is_page_fault(intr_info))
1846 printk(KERN_ERR "%s: unexpected, vectoring info 0x%x " 1968 printk(KERN_ERR "%s: unexpected, vectoring info 0x%x "
1847 "intr info 0x%x\n", __FUNCTION__, vect_info, intr_info); 1969 "intr info 0x%x\n", __func__, vect_info, intr_info);
1848 1970
1849 if (!irqchip_in_kernel(vcpu->kvm) && is_external_interrupt(vect_info)) { 1971 if (!irqchip_in_kernel(vcpu->kvm) && is_external_interrupt(vect_info)) {
1850 int irq = vect_info & VECTORING_INFO_VECTOR_MASK; 1972 int irq = vect_info & VECTORING_INFO_VECTOR_MASK;
@@ -1869,10 +1991,12 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1869 1991
1870 error_code = 0; 1992 error_code = 0;
1871 rip = vmcs_readl(GUEST_RIP); 1993 rip = vmcs_readl(GUEST_RIP);
1872 if (intr_info & INTR_INFO_DELIEVER_CODE_MASK) 1994 if (intr_info & INTR_INFO_DELIVER_CODE_MASK)
1873 error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); 1995 error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
1874 if (is_page_fault(intr_info)) { 1996 if (is_page_fault(intr_info)) {
1875 cr2 = vmcs_readl(EXIT_QUALIFICATION); 1997 cr2 = vmcs_readl(EXIT_QUALIFICATION);
1998 KVMTRACE_3D(PAGE_FAULT, vcpu, error_code, (u32)cr2,
1999 (u32)((u64)cr2 >> 32), handler);
1876 return kvm_mmu_page_fault(vcpu, cr2, error_code); 2000 return kvm_mmu_page_fault(vcpu, cr2, error_code);
1877 } 2001 }
1878 2002
@@ -1901,6 +2025,7 @@ static int handle_external_interrupt(struct kvm_vcpu *vcpu,
1901 struct kvm_run *kvm_run) 2025 struct kvm_run *kvm_run)
1902{ 2026{
1903 ++vcpu->stat.irq_exits; 2027 ++vcpu->stat.irq_exits;
2028 KVMTRACE_1D(INTR, vcpu, vmcs_read32(VM_EXIT_INTR_INFO), handler);
1904 return 1; 2029 return 1;
1905} 2030}
1906 2031
@@ -1958,25 +2083,27 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1958 reg = (exit_qualification >> 8) & 15; 2083 reg = (exit_qualification >> 8) & 15;
1959 switch ((exit_qualification >> 4) & 3) { 2084 switch ((exit_qualification >> 4) & 3) {
1960 case 0: /* mov to cr */ 2085 case 0: /* mov to cr */
2086 KVMTRACE_3D(CR_WRITE, vcpu, (u32)cr, (u32)vcpu->arch.regs[reg],
2087 (u32)((u64)vcpu->arch.regs[reg] >> 32), handler);
1961 switch (cr) { 2088 switch (cr) {
1962 case 0: 2089 case 0:
1963 vcpu_load_rsp_rip(vcpu); 2090 vcpu_load_rsp_rip(vcpu);
1964 set_cr0(vcpu, vcpu->arch.regs[reg]); 2091 kvm_set_cr0(vcpu, vcpu->arch.regs[reg]);
1965 skip_emulated_instruction(vcpu); 2092 skip_emulated_instruction(vcpu);
1966 return 1; 2093 return 1;
1967 case 3: 2094 case 3:
1968 vcpu_load_rsp_rip(vcpu); 2095 vcpu_load_rsp_rip(vcpu);
1969 set_cr3(vcpu, vcpu->arch.regs[reg]); 2096 kvm_set_cr3(vcpu, vcpu->arch.regs[reg]);
1970 skip_emulated_instruction(vcpu); 2097 skip_emulated_instruction(vcpu);
1971 return 1; 2098 return 1;
1972 case 4: 2099 case 4:
1973 vcpu_load_rsp_rip(vcpu); 2100 vcpu_load_rsp_rip(vcpu);
1974 set_cr4(vcpu, vcpu->arch.regs[reg]); 2101 kvm_set_cr4(vcpu, vcpu->arch.regs[reg]);
1975 skip_emulated_instruction(vcpu); 2102 skip_emulated_instruction(vcpu);
1976 return 1; 2103 return 1;
1977 case 8: 2104 case 8:
1978 vcpu_load_rsp_rip(vcpu); 2105 vcpu_load_rsp_rip(vcpu);
1979 set_cr8(vcpu, vcpu->arch.regs[reg]); 2106 kvm_set_cr8(vcpu, vcpu->arch.regs[reg]);
1980 skip_emulated_instruction(vcpu); 2107 skip_emulated_instruction(vcpu);
1981 if (irqchip_in_kernel(vcpu->kvm)) 2108 if (irqchip_in_kernel(vcpu->kvm))
1982 return 1; 2109 return 1;
@@ -1990,6 +2117,7 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1990 vcpu->arch.cr0 &= ~X86_CR0_TS; 2117 vcpu->arch.cr0 &= ~X86_CR0_TS;
1991 vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0); 2118 vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0);
1992 vmx_fpu_activate(vcpu); 2119 vmx_fpu_activate(vcpu);
2120 KVMTRACE_0D(CLTS, vcpu, handler);
1993 skip_emulated_instruction(vcpu); 2121 skip_emulated_instruction(vcpu);
1994 return 1; 2122 return 1;
1995 case 1: /*mov from cr*/ 2123 case 1: /*mov from cr*/
@@ -1998,18 +2126,24 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1998 vcpu_load_rsp_rip(vcpu); 2126 vcpu_load_rsp_rip(vcpu);
1999 vcpu->arch.regs[reg] = vcpu->arch.cr3; 2127 vcpu->arch.regs[reg] = vcpu->arch.cr3;
2000 vcpu_put_rsp_rip(vcpu); 2128 vcpu_put_rsp_rip(vcpu);
2129 KVMTRACE_3D(CR_READ, vcpu, (u32)cr,
2130 (u32)vcpu->arch.regs[reg],
2131 (u32)((u64)vcpu->arch.regs[reg] >> 32),
2132 handler);
2001 skip_emulated_instruction(vcpu); 2133 skip_emulated_instruction(vcpu);
2002 return 1; 2134 return 1;
2003 case 8: 2135 case 8:
2004 vcpu_load_rsp_rip(vcpu); 2136 vcpu_load_rsp_rip(vcpu);
2005 vcpu->arch.regs[reg] = get_cr8(vcpu); 2137 vcpu->arch.regs[reg] = kvm_get_cr8(vcpu);
2006 vcpu_put_rsp_rip(vcpu); 2138 vcpu_put_rsp_rip(vcpu);
2139 KVMTRACE_2D(CR_READ, vcpu, (u32)cr,
2140 (u32)vcpu->arch.regs[reg], handler);
2007 skip_emulated_instruction(vcpu); 2141 skip_emulated_instruction(vcpu);
2008 return 1; 2142 return 1;
2009 } 2143 }
2010 break; 2144 break;
2011 case 3: /* lmsw */ 2145 case 3: /* lmsw */
2012 lmsw(vcpu, (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f); 2146 kvm_lmsw(vcpu, (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f);
2013 2147
2014 skip_emulated_instruction(vcpu); 2148 skip_emulated_instruction(vcpu);
2015 return 1; 2149 return 1;
@@ -2049,6 +2183,7 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2049 val = 0; 2183 val = 0;
2050 } 2184 }
2051 vcpu->arch.regs[reg] = val; 2185 vcpu->arch.regs[reg] = val;
2186 KVMTRACE_2D(DR_READ, vcpu, (u32)dr, (u32)val, handler);
2052 } else { 2187 } else {
2053 /* mov to dr */ 2188 /* mov to dr */
2054 } 2189 }
@@ -2073,6 +2208,9 @@ static int handle_rdmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2073 return 1; 2208 return 1;
2074 } 2209 }
2075 2210
2211 KVMTRACE_3D(MSR_READ, vcpu, ecx, (u32)data, (u32)(data >> 32),
2212 handler);
2213
2076 /* FIXME: handling of bits 32:63 of rax, rdx */ 2214 /* FIXME: handling of bits 32:63 of rax, rdx */
2077 vcpu->arch.regs[VCPU_REGS_RAX] = data & -1u; 2215 vcpu->arch.regs[VCPU_REGS_RAX] = data & -1u;
2078 vcpu->arch.regs[VCPU_REGS_RDX] = (data >> 32) & -1u; 2216 vcpu->arch.regs[VCPU_REGS_RDX] = (data >> 32) & -1u;
@@ -2086,6 +2224,9 @@ static int handle_wrmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2086 u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u) 2224 u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u)
2087 | ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32); 2225 | ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32);
2088 2226
2227 KVMTRACE_3D(MSR_WRITE, vcpu, ecx, (u32)data, (u32)(data >> 32),
2228 handler);
2229
2089 if (vmx_set_msr(vcpu, ecx, data) != 0) { 2230 if (vmx_set_msr(vcpu, ecx, data) != 0) {
2090 kvm_inject_gp(vcpu, 0); 2231 kvm_inject_gp(vcpu, 0);
2091 return 1; 2232 return 1;
@@ -2110,6 +2251,9 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu,
2110 cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); 2251 cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
2111 cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING; 2252 cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
2112 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); 2253 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
2254
2255 KVMTRACE_0D(PEND_INTR, vcpu, handler);
2256
2113 /* 2257 /*
2114 * If the user space waits to inject interrupts, exit as soon as 2258 * If the user space waits to inject interrupts, exit as soon as
2115 * possible 2259 * possible
@@ -2152,6 +2296,8 @@ static int handle_apic_access(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2152 exit_qualification = vmcs_read64(EXIT_QUALIFICATION); 2296 exit_qualification = vmcs_read64(EXIT_QUALIFICATION);
2153 offset = exit_qualification & 0xffful; 2297 offset = exit_qualification & 0xffful;
2154 2298
2299 KVMTRACE_1D(APIC_ACCESS, vcpu, (u32)offset, handler);
2300
2155 er = emulate_instruction(vcpu, kvm_run, 0, 0, 0); 2301 er = emulate_instruction(vcpu, kvm_run, 0, 0, 0);
2156 2302
2157 if (er != EMULATE_DONE) { 2303 if (er != EMULATE_DONE) {
@@ -2163,6 +2309,20 @@ static int handle_apic_access(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2163 return 1; 2309 return 1;
2164} 2310}
2165 2311
2312static int handle_task_switch(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2313{
2314 unsigned long exit_qualification;
2315 u16 tss_selector;
2316 int reason;
2317
2318 exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
2319
2320 reason = (u32)exit_qualification >> 30;
2321 tss_selector = exit_qualification;
2322
2323 return kvm_task_switch(vcpu, tss_selector, reason);
2324}
2325
2166/* 2326/*
2167 * The exit handlers return 1 if the exit was handled fully and guest execution 2327 * The exit handlers return 1 if the exit was handled fully and guest execution
2168 * may resume. Otherwise they set the kvm_run parameter to indicate what needs 2328 * may resume. Otherwise they set the kvm_run parameter to indicate what needs
@@ -2185,6 +2345,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu,
2185 [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold, 2345 [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold,
2186 [EXIT_REASON_APIC_ACCESS] = handle_apic_access, 2346 [EXIT_REASON_APIC_ACCESS] = handle_apic_access,
2187 [EXIT_REASON_WBINVD] = handle_wbinvd, 2347 [EXIT_REASON_WBINVD] = handle_wbinvd,
2348 [EXIT_REASON_TASK_SWITCH] = handle_task_switch,
2188}; 2349};
2189 2350
2190static const int kvm_vmx_max_exit_handlers = 2351static const int kvm_vmx_max_exit_handlers =
@@ -2200,6 +2361,9 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
2200 struct vcpu_vmx *vmx = to_vmx(vcpu); 2361 struct vcpu_vmx *vmx = to_vmx(vcpu);
2201 u32 vectoring_info = vmx->idt_vectoring_info; 2362 u32 vectoring_info = vmx->idt_vectoring_info;
2202 2363
2364 KVMTRACE_3D(VMEXIT, vcpu, exit_reason, (u32)vmcs_readl(GUEST_RIP),
2365 (u32)((u64)vmcs_readl(GUEST_RIP) >> 32), entryexit);
2366
2203 if (unlikely(vmx->fail)) { 2367 if (unlikely(vmx->fail)) {
2204 kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; 2368 kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
2205 kvm_run->fail_entry.hardware_entry_failure_reason 2369 kvm_run->fail_entry.hardware_entry_failure_reason
@@ -2210,7 +2374,7 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
2210 if ((vectoring_info & VECTORING_INFO_VALID_MASK) && 2374 if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
2211 exit_reason != EXIT_REASON_EXCEPTION_NMI) 2375 exit_reason != EXIT_REASON_EXCEPTION_NMI)
2212 printk(KERN_WARNING "%s: unexpected, valid vectoring info and " 2376 printk(KERN_WARNING "%s: unexpected, valid vectoring info and "
2213 "exit reason is 0x%x\n", __FUNCTION__, exit_reason); 2377 "exit reason is 0x%x\n", __func__, exit_reason);
2214 if (exit_reason < kvm_vmx_max_exit_handlers 2378 if (exit_reason < kvm_vmx_max_exit_handlers
2215 && kvm_vmx_exit_handlers[exit_reason]) 2379 && kvm_vmx_exit_handlers[exit_reason])
2216 return kvm_vmx_exit_handlers[exit_reason](vcpu, kvm_run); 2380 return kvm_vmx_exit_handlers[exit_reason](vcpu, kvm_run);
@@ -2221,10 +2385,6 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
2221 return 0; 2385 return 0;
2222} 2386}
2223 2387
2224static void vmx_flush_tlb(struct kvm_vcpu *vcpu)
2225{
2226}
2227
2228static void update_tpr_threshold(struct kvm_vcpu *vcpu) 2388static void update_tpr_threshold(struct kvm_vcpu *vcpu)
2229{ 2389{
2230 int max_irr, tpr; 2390 int max_irr, tpr;
@@ -2285,11 +2445,13 @@ static void vmx_intr_assist(struct kvm_vcpu *vcpu)
2285 return; 2445 return;
2286 } 2446 }
2287 2447
2448 KVMTRACE_1D(REDELIVER_EVT, vcpu, idtv_info_field, handler);
2449
2288 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, idtv_info_field); 2450 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, idtv_info_field);
2289 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 2451 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
2290 vmcs_read32(VM_EXIT_INSTRUCTION_LEN)); 2452 vmcs_read32(VM_EXIT_INSTRUCTION_LEN));
2291 2453
2292 if (unlikely(idtv_info_field & INTR_INFO_DELIEVER_CODE_MASK)) 2454 if (unlikely(idtv_info_field & INTR_INFO_DELIVER_CODE_MASK))
2293 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, 2455 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
2294 vmcs_read32(IDT_VECTORING_ERROR_CODE)); 2456 vmcs_read32(IDT_VECTORING_ERROR_CODE));
2295 if (unlikely(has_ext_irq)) 2457 if (unlikely(has_ext_irq))
@@ -2470,8 +2632,10 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2470 intr_info = vmcs_read32(VM_EXIT_INTR_INFO); 2632 intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
2471 2633
2472 /* We need to handle NMIs before interrupts are enabled */ 2634 /* We need to handle NMIs before interrupts are enabled */
2473 if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200) /* nmi */ 2635 if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200) { /* nmi */
2636 KVMTRACE_0D(NMI, vcpu, handler);
2474 asm("int $2"); 2637 asm("int $2");
2638 }
2475} 2639}
2476 2640
2477static void vmx_free_vmcs(struct kvm_vcpu *vcpu) 2641static void vmx_free_vmcs(struct kvm_vcpu *vcpu)
@@ -2489,6 +2653,10 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
2489{ 2653{
2490 struct vcpu_vmx *vmx = to_vmx(vcpu); 2654 struct vcpu_vmx *vmx = to_vmx(vcpu);
2491 2655
2656 spin_lock(&vmx_vpid_lock);
2657 if (vmx->vpid != 0)
2658 __clear_bit(vmx->vpid, vmx_vpid_bitmap);
2659 spin_unlock(&vmx_vpid_lock);
2492 vmx_free_vmcs(vcpu); 2660 vmx_free_vmcs(vcpu);
2493 kfree(vmx->host_msrs); 2661 kfree(vmx->host_msrs);
2494 kfree(vmx->guest_msrs); 2662 kfree(vmx->guest_msrs);
@@ -2505,6 +2673,8 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
2505 if (!vmx) 2673 if (!vmx)
2506 return ERR_PTR(-ENOMEM); 2674 return ERR_PTR(-ENOMEM);
2507 2675
2676 allocate_vpid(vmx);
2677
2508 err = kvm_vcpu_init(&vmx->vcpu, kvm, id); 2678 err = kvm_vcpu_init(&vmx->vcpu, kvm, id);
2509 if (err) 2679 if (err)
2510 goto free_vcpu; 2680 goto free_vcpu;
@@ -2591,14 +2761,13 @@ static struct kvm_x86_ops vmx_x86_ops = {
2591 .get_segment_base = vmx_get_segment_base, 2761 .get_segment_base = vmx_get_segment_base,
2592 .get_segment = vmx_get_segment, 2762 .get_segment = vmx_get_segment,
2593 .set_segment = vmx_set_segment, 2763 .set_segment = vmx_set_segment,
2764 .get_cpl = vmx_get_cpl,
2594 .get_cs_db_l_bits = vmx_get_cs_db_l_bits, 2765 .get_cs_db_l_bits = vmx_get_cs_db_l_bits,
2595 .decache_cr4_guest_bits = vmx_decache_cr4_guest_bits, 2766 .decache_cr4_guest_bits = vmx_decache_cr4_guest_bits,
2596 .set_cr0 = vmx_set_cr0, 2767 .set_cr0 = vmx_set_cr0,
2597 .set_cr3 = vmx_set_cr3, 2768 .set_cr3 = vmx_set_cr3,
2598 .set_cr4 = vmx_set_cr4, 2769 .set_cr4 = vmx_set_cr4,
2599#ifdef CONFIG_X86_64
2600 .set_efer = vmx_set_efer, 2770 .set_efer = vmx_set_efer,
2601#endif
2602 .get_idt = vmx_get_idt, 2771 .get_idt = vmx_get_idt,
2603 .set_idt = vmx_set_idt, 2772 .set_idt = vmx_set_idt,
2604 .get_gdt = vmx_get_gdt, 2773 .get_gdt = vmx_get_gdt,
@@ -2626,7 +2795,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
2626 2795
2627static int __init vmx_init(void) 2796static int __init vmx_init(void)
2628{ 2797{
2629 void *iova; 2798 void *va;
2630 int r; 2799 int r;
2631 2800
2632 vmx_io_bitmap_a = alloc_page(GFP_KERNEL | __GFP_HIGHMEM); 2801 vmx_io_bitmap_a = alloc_page(GFP_KERNEL | __GFP_HIGHMEM);
@@ -2639,28 +2808,48 @@ static int __init vmx_init(void)
2639 goto out; 2808 goto out;
2640 } 2809 }
2641 2810
2811 vmx_msr_bitmap = alloc_page(GFP_KERNEL | __GFP_HIGHMEM);
2812 if (!vmx_msr_bitmap) {
2813 r = -ENOMEM;
2814 goto out1;
2815 }
2816
2642 /* 2817 /*
2643 * Allow direct access to the PC debug port (it is often used for I/O 2818 * Allow direct access to the PC debug port (it is often used for I/O
2644 * delays, but the vmexits simply slow things down). 2819 * delays, but the vmexits simply slow things down).
2645 */ 2820 */
2646 iova = kmap(vmx_io_bitmap_a); 2821 va = kmap(vmx_io_bitmap_a);
2647 memset(iova, 0xff, PAGE_SIZE); 2822 memset(va, 0xff, PAGE_SIZE);
2648 clear_bit(0x80, iova); 2823 clear_bit(0x80, va);
2649 kunmap(vmx_io_bitmap_a); 2824 kunmap(vmx_io_bitmap_a);
2650 2825
2651 iova = kmap(vmx_io_bitmap_b); 2826 va = kmap(vmx_io_bitmap_b);
2652 memset(iova, 0xff, PAGE_SIZE); 2827 memset(va, 0xff, PAGE_SIZE);
2653 kunmap(vmx_io_bitmap_b); 2828 kunmap(vmx_io_bitmap_b);
2654 2829
2830 va = kmap(vmx_msr_bitmap);
2831 memset(va, 0xff, PAGE_SIZE);
2832 kunmap(vmx_msr_bitmap);
2833
2834 set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */
2835
2655 r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx), THIS_MODULE); 2836 r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx), THIS_MODULE);
2656 if (r) 2837 if (r)
2657 goto out1; 2838 goto out2;
2839
2840 vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_FS_BASE);
2841 vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_GS_BASE);
2842 vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_IA32_SYSENTER_CS);
2843 vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_IA32_SYSENTER_ESP);
2844 vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_IA32_SYSENTER_EIP);
2658 2845
2659 if (bypass_guest_pf) 2846 if (bypass_guest_pf)
2660 kvm_mmu_set_nonpresent_ptes(~0xffeull, 0ull); 2847 kvm_mmu_set_nonpresent_ptes(~0xffeull, 0ull);
2661 2848
2662 return 0; 2849 return 0;
2663 2850
2851out2:
2852 __free_page(vmx_msr_bitmap);
2664out1: 2853out1:
2665 __free_page(vmx_io_bitmap_b); 2854 __free_page(vmx_io_bitmap_b);
2666out: 2855out:
@@ -2670,6 +2859,7 @@ out:
2670 2859
2671static void __exit vmx_exit(void) 2860static void __exit vmx_exit(void)
2672{ 2861{
2862 __free_page(vmx_msr_bitmap);
2673 __free_page(vmx_io_bitmap_b); 2863 __free_page(vmx_io_bitmap_b);
2674 __free_page(vmx_io_bitmap_a); 2864 __free_page(vmx_io_bitmap_a);
2675 2865
diff --git a/arch/x86/kvm/vmx.h b/arch/x86/kvm/vmx.h
index d52ae8d7303d..5dff4606b988 100644
--- a/arch/x86/kvm/vmx.h
+++ b/arch/x86/kvm/vmx.h
@@ -49,6 +49,7 @@
49 * Definitions of Secondary Processor-Based VM-Execution Controls. 49 * Definitions of Secondary Processor-Based VM-Execution Controls.
50 */ 50 */
51#define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001 51#define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001
52#define SECONDARY_EXEC_ENABLE_VPID 0x00000020
52#define SECONDARY_EXEC_WBINVD_EXITING 0x00000040 53#define SECONDARY_EXEC_WBINVD_EXITING 0x00000040
53 54
54 55
@@ -65,6 +66,7 @@
65 66
66/* VMCS Encodings */ 67/* VMCS Encodings */
67enum vmcs_field { 68enum vmcs_field {
69 VIRTUAL_PROCESSOR_ID = 0x00000000,
68 GUEST_ES_SELECTOR = 0x00000800, 70 GUEST_ES_SELECTOR = 0x00000800,
69 GUEST_CS_SELECTOR = 0x00000802, 71 GUEST_CS_SELECTOR = 0x00000802,
70 GUEST_SS_SELECTOR = 0x00000804, 72 GUEST_SS_SELECTOR = 0x00000804,
@@ -231,12 +233,12 @@ enum vmcs_field {
231 */ 233 */
232#define INTR_INFO_VECTOR_MASK 0xff /* 7:0 */ 234#define INTR_INFO_VECTOR_MASK 0xff /* 7:0 */
233#define INTR_INFO_INTR_TYPE_MASK 0x700 /* 10:8 */ 235#define INTR_INFO_INTR_TYPE_MASK 0x700 /* 10:8 */
234#define INTR_INFO_DELIEVER_CODE_MASK 0x800 /* 11 */ 236#define INTR_INFO_DELIVER_CODE_MASK 0x800 /* 11 */
235#define INTR_INFO_VALID_MASK 0x80000000 /* 31 */ 237#define INTR_INFO_VALID_MASK 0x80000000 /* 31 */
236 238
237#define VECTORING_INFO_VECTOR_MASK INTR_INFO_VECTOR_MASK 239#define VECTORING_INFO_VECTOR_MASK INTR_INFO_VECTOR_MASK
238#define VECTORING_INFO_TYPE_MASK INTR_INFO_INTR_TYPE_MASK 240#define VECTORING_INFO_TYPE_MASK INTR_INFO_INTR_TYPE_MASK
239#define VECTORING_INFO_DELIEVER_CODE_MASK INTR_INFO_DELIEVER_CODE_MASK 241#define VECTORING_INFO_DELIVER_CODE_MASK INTR_INFO_DELIVER_CODE_MASK
240#define VECTORING_INFO_VALID_MASK INTR_INFO_VALID_MASK 242#define VECTORING_INFO_VALID_MASK INTR_INFO_VALID_MASK
241 243
242#define INTR_TYPE_EXT_INTR (0 << 8) /* external interrupt */ 244#define INTR_TYPE_EXT_INTR (0 << 8) /* external interrupt */
@@ -321,4 +323,8 @@ enum vmcs_field {
321 323
322#define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT 9 324#define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT 9
323 325
326#define VMX_NR_VPIDS (1 << 16)
327#define VMX_VPID_EXTENT_SINGLE_CONTEXT 1
328#define VMX_VPID_EXTENT_ALL_CONTEXT 2
329
324#endif 330#endif
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 6b01552bd1f1..0ce556372a4d 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -15,10 +15,12 @@
15 */ 15 */
16 16
17#include <linux/kvm_host.h> 17#include <linux/kvm_host.h>
18#include "segment_descriptor.h"
19#include "irq.h" 18#include "irq.h"
20#include "mmu.h" 19#include "mmu.h"
20#include "i8254.h"
21#include "tss.h"
21 22
23#include <linux/clocksource.h>
22#include <linux/kvm.h> 24#include <linux/kvm.h>
23#include <linux/fs.h> 25#include <linux/fs.h>
24#include <linux/vmalloc.h> 26#include <linux/vmalloc.h>
@@ -28,6 +30,7 @@
28 30
29#include <asm/uaccess.h> 31#include <asm/uaccess.h>
30#include <asm/msr.h> 32#include <asm/msr.h>
33#include <asm/desc.h>
31 34
32#define MAX_IO_MSRS 256 35#define MAX_IO_MSRS 256
33#define CR0_RESERVED_BITS \ 36#define CR0_RESERVED_BITS \
@@ -41,7 +44,15 @@
41 | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE)) 44 | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE))
42 45
43#define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR) 46#define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
44#define EFER_RESERVED_BITS 0xfffffffffffff2fe 47/* EFER defaults:
48 * - enable syscall per default because its emulated by KVM
49 * - enable LME and LMA per default on 64 bit KVM
50 */
51#ifdef CONFIG_X86_64
52static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffafeULL;
53#else
54static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffffeULL;
55#endif
45 56
46#define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM 57#define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM
47#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU 58#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
@@ -63,6 +74,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
63 { "irq_window", VCPU_STAT(irq_window_exits) }, 74 { "irq_window", VCPU_STAT(irq_window_exits) },
64 { "halt_exits", VCPU_STAT(halt_exits) }, 75 { "halt_exits", VCPU_STAT(halt_exits) },
65 { "halt_wakeup", VCPU_STAT(halt_wakeup) }, 76 { "halt_wakeup", VCPU_STAT(halt_wakeup) },
77 { "hypercalls", VCPU_STAT(hypercalls) },
66 { "request_irq", VCPU_STAT(request_irq_exits) }, 78 { "request_irq", VCPU_STAT(request_irq_exits) },
67 { "irq_exits", VCPU_STAT(irq_exits) }, 79 { "irq_exits", VCPU_STAT(irq_exits) },
68 { "host_state_reload", VCPU_STAT(host_state_reload) }, 80 { "host_state_reload", VCPU_STAT(host_state_reload) },
@@ -78,6 +90,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
78 { "mmu_recycled", VM_STAT(mmu_recycled) }, 90 { "mmu_recycled", VM_STAT(mmu_recycled) },
79 { "mmu_cache_miss", VM_STAT(mmu_cache_miss) }, 91 { "mmu_cache_miss", VM_STAT(mmu_cache_miss) },
80 { "remote_tlb_flush", VM_STAT(remote_tlb_flush) }, 92 { "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
93 { "largepages", VM_STAT(lpages) },
81 { NULL } 94 { NULL }
82}; 95};
83 96
@@ -85,7 +98,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
85unsigned long segment_base(u16 selector) 98unsigned long segment_base(u16 selector)
86{ 99{
87 struct descriptor_table gdt; 100 struct descriptor_table gdt;
88 struct segment_descriptor *d; 101 struct desc_struct *d;
89 unsigned long table_base; 102 unsigned long table_base;
90 unsigned long v; 103 unsigned long v;
91 104
@@ -101,13 +114,12 @@ unsigned long segment_base(u16 selector)
101 asm("sldt %0" : "=g"(ldt_selector)); 114 asm("sldt %0" : "=g"(ldt_selector));
102 table_base = segment_base(ldt_selector); 115 table_base = segment_base(ldt_selector);
103 } 116 }
104 d = (struct segment_descriptor *)(table_base + (selector & ~7)); 117 d = (struct desc_struct *)(table_base + (selector & ~7));
105 v = d->base_low | ((unsigned long)d->base_mid << 16) | 118 v = d->base0 | ((unsigned long)d->base1 << 16) |
106 ((unsigned long)d->base_high << 24); 119 ((unsigned long)d->base2 << 24);
107#ifdef CONFIG_X86_64 120#ifdef CONFIG_X86_64
108 if (d->system == 0 && (d->type == 2 || d->type == 9 || d->type == 11)) 121 if (d->s == 0 && (d->type == 2 || d->type == 9 || d->type == 11))
109 v |= ((unsigned long) \ 122 v |= ((unsigned long)((struct ldttss_desc64 *)d)->base3) << 32;
110 ((struct segment_descriptor_64 *)d)->base_higher) << 32;
111#endif 123#endif
112 return v; 124 return v;
113} 125}
@@ -145,11 +157,16 @@ void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr,
145 u32 error_code) 157 u32 error_code)
146{ 158{
147 ++vcpu->stat.pf_guest; 159 ++vcpu->stat.pf_guest;
148 if (vcpu->arch.exception.pending && vcpu->arch.exception.nr == PF_VECTOR) { 160 if (vcpu->arch.exception.pending) {
149 printk(KERN_DEBUG "kvm: inject_page_fault:" 161 if (vcpu->arch.exception.nr == PF_VECTOR) {
150 " double fault 0x%lx\n", addr); 162 printk(KERN_DEBUG "kvm: inject_page_fault:"
151 vcpu->arch.exception.nr = DF_VECTOR; 163 " double fault 0x%lx\n", addr);
152 vcpu->arch.exception.error_code = 0; 164 vcpu->arch.exception.nr = DF_VECTOR;
165 vcpu->arch.exception.error_code = 0;
166 } else if (vcpu->arch.exception.nr == DF_VECTOR) {
167 /* triple fault -> shutdown */
168 set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
169 }
153 return; 170 return;
154 } 171 }
155 vcpu->arch.cr2 = addr; 172 vcpu->arch.cr2 = addr;
@@ -184,7 +201,6 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
184 int ret; 201 int ret;
185 u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)]; 202 u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)];
186 203
187 down_read(&vcpu->kvm->slots_lock);
188 ret = kvm_read_guest_page(vcpu->kvm, pdpt_gfn, pdpte, 204 ret = kvm_read_guest_page(vcpu->kvm, pdpt_gfn, pdpte,
189 offset * sizeof(u64), sizeof(pdpte)); 205 offset * sizeof(u64), sizeof(pdpte));
190 if (ret < 0) { 206 if (ret < 0) {
@@ -201,10 +217,10 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
201 217
202 memcpy(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs)); 218 memcpy(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs));
203out: 219out:
204 up_read(&vcpu->kvm->slots_lock);
205 220
206 return ret; 221 return ret;
207} 222}
223EXPORT_SYMBOL_GPL(load_pdptrs);
208 224
209static bool pdptrs_changed(struct kvm_vcpu *vcpu) 225static bool pdptrs_changed(struct kvm_vcpu *vcpu)
210{ 226{
@@ -215,18 +231,16 @@ static bool pdptrs_changed(struct kvm_vcpu *vcpu)
215 if (is_long_mode(vcpu) || !is_pae(vcpu)) 231 if (is_long_mode(vcpu) || !is_pae(vcpu))
216 return false; 232 return false;
217 233
218 down_read(&vcpu->kvm->slots_lock);
219 r = kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u, pdpte, sizeof(pdpte)); 234 r = kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u, pdpte, sizeof(pdpte));
220 if (r < 0) 235 if (r < 0)
221 goto out; 236 goto out;
222 changed = memcmp(pdpte, vcpu->arch.pdptrs, sizeof(pdpte)) != 0; 237 changed = memcmp(pdpte, vcpu->arch.pdptrs, sizeof(pdpte)) != 0;
223out: 238out:
224 up_read(&vcpu->kvm->slots_lock);
225 239
226 return changed; 240 return changed;
227} 241}
228 242
229void set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) 243void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
230{ 244{
231 if (cr0 & CR0_RESERVED_BITS) { 245 if (cr0 & CR0_RESERVED_BITS) {
232 printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n", 246 printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n",
@@ -284,15 +298,18 @@ void set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
284 kvm_mmu_reset_context(vcpu); 298 kvm_mmu_reset_context(vcpu);
285 return; 299 return;
286} 300}
287EXPORT_SYMBOL_GPL(set_cr0); 301EXPORT_SYMBOL_GPL(kvm_set_cr0);
288 302
289void lmsw(struct kvm_vcpu *vcpu, unsigned long msw) 303void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
290{ 304{
291 set_cr0(vcpu, (vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f)); 305 kvm_set_cr0(vcpu, (vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f));
306 KVMTRACE_1D(LMSW, vcpu,
307 (u32)((vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f)),
308 handler);
292} 309}
293EXPORT_SYMBOL_GPL(lmsw); 310EXPORT_SYMBOL_GPL(kvm_lmsw);
294 311
295void set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) 312void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
296{ 313{
297 if (cr4 & CR4_RESERVED_BITS) { 314 if (cr4 & CR4_RESERVED_BITS) {
298 printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n"); 315 printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n");
@@ -323,9 +340,9 @@ void set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
323 vcpu->arch.cr4 = cr4; 340 vcpu->arch.cr4 = cr4;
324 kvm_mmu_reset_context(vcpu); 341 kvm_mmu_reset_context(vcpu);
325} 342}
326EXPORT_SYMBOL_GPL(set_cr4); 343EXPORT_SYMBOL_GPL(kvm_set_cr4);
327 344
328void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) 345void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
329{ 346{
330 if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) { 347 if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) {
331 kvm_mmu_flush_tlb(vcpu); 348 kvm_mmu_flush_tlb(vcpu);
@@ -359,7 +376,6 @@ void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
359 */ 376 */
360 } 377 }
361 378
362 down_read(&vcpu->kvm->slots_lock);
363 /* 379 /*
364 * Does the new cr3 value map to physical memory? (Note, we 380 * Does the new cr3 value map to physical memory? (Note, we
365 * catch an invalid cr3 even in real-mode, because it would 381 * catch an invalid cr3 even in real-mode, because it would
@@ -375,11 +391,10 @@ void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
375 vcpu->arch.cr3 = cr3; 391 vcpu->arch.cr3 = cr3;
376 vcpu->arch.mmu.new_cr3(vcpu); 392 vcpu->arch.mmu.new_cr3(vcpu);
377 } 393 }
378 up_read(&vcpu->kvm->slots_lock);
379} 394}
380EXPORT_SYMBOL_GPL(set_cr3); 395EXPORT_SYMBOL_GPL(kvm_set_cr3);
381 396
382void set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) 397void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
383{ 398{
384 if (cr8 & CR8_RESERVED_BITS) { 399 if (cr8 & CR8_RESERVED_BITS) {
385 printk(KERN_DEBUG "set_cr8: #GP, reserved bits 0x%lx\n", cr8); 400 printk(KERN_DEBUG "set_cr8: #GP, reserved bits 0x%lx\n", cr8);
@@ -391,16 +406,16 @@ void set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
391 else 406 else
392 vcpu->arch.cr8 = cr8; 407 vcpu->arch.cr8 = cr8;
393} 408}
394EXPORT_SYMBOL_GPL(set_cr8); 409EXPORT_SYMBOL_GPL(kvm_set_cr8);
395 410
396unsigned long get_cr8(struct kvm_vcpu *vcpu) 411unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
397{ 412{
398 if (irqchip_in_kernel(vcpu->kvm)) 413 if (irqchip_in_kernel(vcpu->kvm))
399 return kvm_lapic_get_cr8(vcpu); 414 return kvm_lapic_get_cr8(vcpu);
400 else 415 else
401 return vcpu->arch.cr8; 416 return vcpu->arch.cr8;
402} 417}
403EXPORT_SYMBOL_GPL(get_cr8); 418EXPORT_SYMBOL_GPL(kvm_get_cr8);
404 419
405/* 420/*
406 * List of msr numbers which we expose to userspace through KVM_GET_MSRS 421 * List of msr numbers which we expose to userspace through KVM_GET_MSRS
@@ -415,7 +430,8 @@ static u32 msrs_to_save[] = {
415#ifdef CONFIG_X86_64 430#ifdef CONFIG_X86_64
416 MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, 431 MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
417#endif 432#endif
418 MSR_IA32_TIME_STAMP_COUNTER, 433 MSR_IA32_TIME_STAMP_COUNTER, MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
434 MSR_IA32_PERF_STATUS,
419}; 435};
420 436
421static unsigned num_msrs_to_save; 437static unsigned num_msrs_to_save;
@@ -424,11 +440,9 @@ static u32 emulated_msrs[] = {
424 MSR_IA32_MISC_ENABLE, 440 MSR_IA32_MISC_ENABLE,
425}; 441};
426 442
427#ifdef CONFIG_X86_64
428
429static void set_efer(struct kvm_vcpu *vcpu, u64 efer) 443static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
430{ 444{
431 if (efer & EFER_RESERVED_BITS) { 445 if (efer & efer_reserved_bits) {
432 printk(KERN_DEBUG "set_efer: 0x%llx #GP, reserved bits\n", 446 printk(KERN_DEBUG "set_efer: 0x%llx #GP, reserved bits\n",
433 efer); 447 efer);
434 kvm_inject_gp(vcpu, 0); 448 kvm_inject_gp(vcpu, 0);
@@ -450,7 +464,12 @@ static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
450 vcpu->arch.shadow_efer = efer; 464 vcpu->arch.shadow_efer = efer;
451} 465}
452 466
453#endif 467void kvm_enable_efer_bits(u64 mask)
468{
469 efer_reserved_bits &= ~mask;
470}
471EXPORT_SYMBOL_GPL(kvm_enable_efer_bits);
472
454 473
455/* 474/*
456 * Writes msr value into into the appropriate "register". 475 * Writes msr value into into the appropriate "register".
@@ -470,26 +489,86 @@ static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
470 return kvm_set_msr(vcpu, index, *data); 489 return kvm_set_msr(vcpu, index, *data);
471} 490}
472 491
492static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
493{
494 static int version;
495 struct kvm_wall_clock wc;
496 struct timespec wc_ts;
497
498 if (!wall_clock)
499 return;
500
501 version++;
502
503 kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
504
505 wc_ts = current_kernel_time();
506 wc.wc_sec = wc_ts.tv_sec;
507 wc.wc_nsec = wc_ts.tv_nsec;
508 wc.wc_version = version;
509
510 kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc));
511
512 version++;
513 kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
514}
515
516static void kvm_write_guest_time(struct kvm_vcpu *v)
517{
518 struct timespec ts;
519 unsigned long flags;
520 struct kvm_vcpu_arch *vcpu = &v->arch;
521 void *shared_kaddr;
522
523 if ((!vcpu->time_page))
524 return;
525
526 /* Keep irq disabled to prevent changes to the clock */
527 local_irq_save(flags);
528 kvm_get_msr(v, MSR_IA32_TIME_STAMP_COUNTER,
529 &vcpu->hv_clock.tsc_timestamp);
530 ktime_get_ts(&ts);
531 local_irq_restore(flags);
532
533 /* With all the info we got, fill in the values */
534
535 vcpu->hv_clock.system_time = ts.tv_nsec +
536 (NSEC_PER_SEC * (u64)ts.tv_sec);
537 /*
538 * The interface expects us to write an even number signaling that the
539 * update is finished. Since the guest won't see the intermediate
540 * state, we just write "2" at the end
541 */
542 vcpu->hv_clock.version = 2;
543
544 shared_kaddr = kmap_atomic(vcpu->time_page, KM_USER0);
545
546 memcpy(shared_kaddr + vcpu->time_offset, &vcpu->hv_clock,
547 sizeof(vcpu->hv_clock));
548
549 kunmap_atomic(shared_kaddr, KM_USER0);
550
551 mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT);
552}
553
473 554
474int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) 555int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
475{ 556{
476 switch (msr) { 557 switch (msr) {
477#ifdef CONFIG_X86_64
478 case MSR_EFER: 558 case MSR_EFER:
479 set_efer(vcpu, data); 559 set_efer(vcpu, data);
480 break; 560 break;
481#endif
482 case MSR_IA32_MC0_STATUS: 561 case MSR_IA32_MC0_STATUS:
483 pr_unimpl(vcpu, "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n", 562 pr_unimpl(vcpu, "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n",
484 __FUNCTION__, data); 563 __func__, data);
485 break; 564 break;
486 case MSR_IA32_MCG_STATUS: 565 case MSR_IA32_MCG_STATUS:
487 pr_unimpl(vcpu, "%s: MSR_IA32_MCG_STATUS 0x%llx, nop\n", 566 pr_unimpl(vcpu, "%s: MSR_IA32_MCG_STATUS 0x%llx, nop\n",
488 __FUNCTION__, data); 567 __func__, data);
489 break; 568 break;
490 case MSR_IA32_MCG_CTL: 569 case MSR_IA32_MCG_CTL:
491 pr_unimpl(vcpu, "%s: MSR_IA32_MCG_CTL 0x%llx, nop\n", 570 pr_unimpl(vcpu, "%s: MSR_IA32_MCG_CTL 0x%llx, nop\n",
492 __FUNCTION__, data); 571 __func__, data);
493 break; 572 break;
494 case MSR_IA32_UCODE_REV: 573 case MSR_IA32_UCODE_REV:
495 case MSR_IA32_UCODE_WRITE: 574 case MSR_IA32_UCODE_WRITE:
@@ -501,6 +580,42 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
501 case MSR_IA32_MISC_ENABLE: 580 case MSR_IA32_MISC_ENABLE:
502 vcpu->arch.ia32_misc_enable_msr = data; 581 vcpu->arch.ia32_misc_enable_msr = data;
503 break; 582 break;
583 case MSR_KVM_WALL_CLOCK:
584 vcpu->kvm->arch.wall_clock = data;
585 kvm_write_wall_clock(vcpu->kvm, data);
586 break;
587 case MSR_KVM_SYSTEM_TIME: {
588 if (vcpu->arch.time_page) {
589 kvm_release_page_dirty(vcpu->arch.time_page);
590 vcpu->arch.time_page = NULL;
591 }
592
593 vcpu->arch.time = data;
594
595 /* we verify if the enable bit is set... */
596 if (!(data & 1))
597 break;
598
599 /* ...but clean it before doing the actual write */
600 vcpu->arch.time_offset = data & ~(PAGE_MASK | 1);
601
602 vcpu->arch.hv_clock.tsc_to_system_mul =
603 clocksource_khz2mult(tsc_khz, 22);
604 vcpu->arch.hv_clock.tsc_shift = 22;
605
606 down_read(&current->mm->mmap_sem);
607 vcpu->arch.time_page =
608 gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT);
609 up_read(&current->mm->mmap_sem);
610
611 if (is_error_page(vcpu->arch.time_page)) {
612 kvm_release_page_clean(vcpu->arch.time_page);
613 vcpu->arch.time_page = NULL;
614 }
615
616 kvm_write_guest_time(vcpu);
617 break;
618 }
504 default: 619 default:
505 pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", msr, data); 620 pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", msr, data);
506 return 1; 621 return 1;
@@ -540,7 +655,6 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
540 case MSR_IA32_MC0_MISC+12: 655 case MSR_IA32_MC0_MISC+12:
541 case MSR_IA32_MC0_MISC+16: 656 case MSR_IA32_MC0_MISC+16:
542 case MSR_IA32_UCODE_REV: 657 case MSR_IA32_UCODE_REV:
543 case MSR_IA32_PERF_STATUS:
544 case MSR_IA32_EBL_CR_POWERON: 658 case MSR_IA32_EBL_CR_POWERON:
545 /* MTRR registers */ 659 /* MTRR registers */
546 case 0xfe: 660 case 0xfe:
@@ -556,11 +670,21 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
556 case MSR_IA32_MISC_ENABLE: 670 case MSR_IA32_MISC_ENABLE:
557 data = vcpu->arch.ia32_misc_enable_msr; 671 data = vcpu->arch.ia32_misc_enable_msr;
558 break; 672 break;
559#ifdef CONFIG_X86_64 673 case MSR_IA32_PERF_STATUS:
674 /* TSC increment by tick */
675 data = 1000ULL;
676 /* CPU multiplier */
677 data |= (((uint64_t)4ULL) << 40);
678 break;
560 case MSR_EFER: 679 case MSR_EFER:
561 data = vcpu->arch.shadow_efer; 680 data = vcpu->arch.shadow_efer;
562 break; 681 break;
563#endif 682 case MSR_KVM_WALL_CLOCK:
683 data = vcpu->kvm->arch.wall_clock;
684 break;
685 case MSR_KVM_SYSTEM_TIME:
686 data = vcpu->arch.time;
687 break;
564 default: 688 default:
565 pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr); 689 pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
566 return 1; 690 return 1;
@@ -584,9 +708,11 @@ static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,
584 708
585 vcpu_load(vcpu); 709 vcpu_load(vcpu);
586 710
711 down_read(&vcpu->kvm->slots_lock);
587 for (i = 0; i < msrs->nmsrs; ++i) 712 for (i = 0; i < msrs->nmsrs; ++i)
588 if (do_msr(vcpu, entries[i].index, &entries[i].data)) 713 if (do_msr(vcpu, entries[i].index, &entries[i].data))
589 break; 714 break;
715 up_read(&vcpu->kvm->slots_lock);
590 716
591 vcpu_put(vcpu); 717 vcpu_put(vcpu);
592 718
@@ -688,11 +814,24 @@ int kvm_dev_ioctl_check_extension(long ext)
688 case KVM_CAP_USER_MEMORY: 814 case KVM_CAP_USER_MEMORY:
689 case KVM_CAP_SET_TSS_ADDR: 815 case KVM_CAP_SET_TSS_ADDR:
690 case KVM_CAP_EXT_CPUID: 816 case KVM_CAP_EXT_CPUID:
817 case KVM_CAP_CLOCKSOURCE:
818 case KVM_CAP_PIT:
819 case KVM_CAP_NOP_IO_DELAY:
820 case KVM_CAP_MP_STATE:
691 r = 1; 821 r = 1;
692 break; 822 break;
693 case KVM_CAP_VAPIC: 823 case KVM_CAP_VAPIC:
694 r = !kvm_x86_ops->cpu_has_accelerated_tpr(); 824 r = !kvm_x86_ops->cpu_has_accelerated_tpr();
695 break; 825 break;
826 case KVM_CAP_NR_VCPUS:
827 r = KVM_MAX_VCPUS;
828 break;
829 case KVM_CAP_NR_MEMSLOTS:
830 r = KVM_MEMORY_SLOTS;
831 break;
832 case KVM_CAP_PV_MMU:
833 r = !tdp_enabled;
834 break;
696 default: 835 default:
697 r = 0; 836 r = 0;
698 break; 837 break;
@@ -763,6 +902,7 @@ out:
763void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) 902void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
764{ 903{
765 kvm_x86_ops->vcpu_load(vcpu, cpu); 904 kvm_x86_ops->vcpu_load(vcpu, cpu);
905 kvm_write_guest_time(vcpu);
766} 906}
767 907
768void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) 908void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
@@ -958,32 +1098,32 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
958 } 1098 }
959 /* function 4 and 0xb have additional index. */ 1099 /* function 4 and 0xb have additional index. */
960 case 4: { 1100 case 4: {
961 int index, cache_type; 1101 int i, cache_type;
962 1102
963 entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; 1103 entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
964 /* read more entries until cache_type is zero */ 1104 /* read more entries until cache_type is zero */
965 for (index = 1; *nent < maxnent; ++index) { 1105 for (i = 1; *nent < maxnent; ++i) {
966 cache_type = entry[index - 1].eax & 0x1f; 1106 cache_type = entry[i - 1].eax & 0x1f;
967 if (!cache_type) 1107 if (!cache_type)
968 break; 1108 break;
969 do_cpuid_1_ent(&entry[index], function, index); 1109 do_cpuid_1_ent(&entry[i], function, i);
970 entry[index].flags |= 1110 entry[i].flags |=
971 KVM_CPUID_FLAG_SIGNIFCANT_INDEX; 1111 KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
972 ++*nent; 1112 ++*nent;
973 } 1113 }
974 break; 1114 break;
975 } 1115 }
976 case 0xb: { 1116 case 0xb: {
977 int index, level_type; 1117 int i, level_type;
978 1118
979 entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; 1119 entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
980 /* read more entries until level_type is zero */ 1120 /* read more entries until level_type is zero */
981 for (index = 1; *nent < maxnent; ++index) { 1121 for (i = 1; *nent < maxnent; ++i) {
982 level_type = entry[index - 1].ecx & 0xff; 1122 level_type = entry[i - 1].ecx & 0xff;
983 if (!level_type) 1123 if (!level_type)
984 break; 1124 break;
985 do_cpuid_1_ent(&entry[index], function, index); 1125 do_cpuid_1_ent(&entry[i], function, i);
986 entry[index].flags |= 1126 entry[i].flags |=
987 KVM_CPUID_FLAG_SIGNIFCANT_INDEX; 1127 KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
988 ++*nent; 1128 ++*nent;
989 } 1129 }
@@ -1365,6 +1505,23 @@ static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
1365 return r; 1505 return r;
1366} 1506}
1367 1507
1508static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps)
1509{
1510 int r = 0;
1511
1512 memcpy(ps, &kvm->arch.vpit->pit_state, sizeof(struct kvm_pit_state));
1513 return r;
1514}
1515
1516static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps)
1517{
1518 int r = 0;
1519
1520 memcpy(&kvm->arch.vpit->pit_state, ps, sizeof(struct kvm_pit_state));
1521 kvm_pit_load_count(kvm, 0, ps->channels[0].count);
1522 return r;
1523}
1524
1368/* 1525/*
1369 * Get (and clear) the dirty memory log for a memory slot. 1526 * Get (and clear) the dirty memory log for a memory slot.
1370 */ 1527 */
@@ -1457,6 +1614,12 @@ long kvm_arch_vm_ioctl(struct file *filp,
1457 } else 1614 } else
1458 goto out; 1615 goto out;
1459 break; 1616 break;
1617 case KVM_CREATE_PIT:
1618 r = -ENOMEM;
1619 kvm->arch.vpit = kvm_create_pit(kvm);
1620 if (kvm->arch.vpit)
1621 r = 0;
1622 break;
1460 case KVM_IRQ_LINE: { 1623 case KVM_IRQ_LINE: {
1461 struct kvm_irq_level irq_event; 1624 struct kvm_irq_level irq_event;
1462 1625
@@ -1512,6 +1675,37 @@ long kvm_arch_vm_ioctl(struct file *filp,
1512 r = 0; 1675 r = 0;
1513 break; 1676 break;
1514 } 1677 }
1678 case KVM_GET_PIT: {
1679 struct kvm_pit_state ps;
1680 r = -EFAULT;
1681 if (copy_from_user(&ps, argp, sizeof ps))
1682 goto out;
1683 r = -ENXIO;
1684 if (!kvm->arch.vpit)
1685 goto out;
1686 r = kvm_vm_ioctl_get_pit(kvm, &ps);
1687 if (r)
1688 goto out;
1689 r = -EFAULT;
1690 if (copy_to_user(argp, &ps, sizeof ps))
1691 goto out;
1692 r = 0;
1693 break;
1694 }
1695 case KVM_SET_PIT: {
1696 struct kvm_pit_state ps;
1697 r = -EFAULT;
1698 if (copy_from_user(&ps, argp, sizeof ps))
1699 goto out;
1700 r = -ENXIO;
1701 if (!kvm->arch.vpit)
1702 goto out;
1703 r = kvm_vm_ioctl_set_pit(kvm, &ps);
1704 if (r)
1705 goto out;
1706 r = 0;
1707 break;
1708 }
1515 default: 1709 default:
1516 ; 1710 ;
1517 } 1711 }
@@ -1570,7 +1764,6 @@ int emulator_read_std(unsigned long addr,
1570 void *data = val; 1764 void *data = val;
1571 int r = X86EMUL_CONTINUE; 1765 int r = X86EMUL_CONTINUE;
1572 1766
1573 down_read(&vcpu->kvm->slots_lock);
1574 while (bytes) { 1767 while (bytes) {
1575 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); 1768 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
1576 unsigned offset = addr & (PAGE_SIZE-1); 1769 unsigned offset = addr & (PAGE_SIZE-1);
@@ -1592,7 +1785,6 @@ int emulator_read_std(unsigned long addr,
1592 addr += tocopy; 1785 addr += tocopy;
1593 } 1786 }
1594out: 1787out:
1595 up_read(&vcpu->kvm->slots_lock);
1596 return r; 1788 return r;
1597} 1789}
1598EXPORT_SYMBOL_GPL(emulator_read_std); 1790EXPORT_SYMBOL_GPL(emulator_read_std);
@@ -1611,9 +1803,7 @@ static int emulator_read_emulated(unsigned long addr,
1611 return X86EMUL_CONTINUE; 1803 return X86EMUL_CONTINUE;
1612 } 1804 }
1613 1805
1614 down_read(&vcpu->kvm->slots_lock);
1615 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); 1806 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
1616 up_read(&vcpu->kvm->slots_lock);
1617 1807
1618 /* For APIC access vmexit */ 1808 /* For APIC access vmexit */
1619 if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) 1809 if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
@@ -1646,19 +1836,15 @@ mmio:
1646 return X86EMUL_UNHANDLEABLE; 1836 return X86EMUL_UNHANDLEABLE;
1647} 1837}
1648 1838
1649static int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, 1839int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
1650 const void *val, int bytes) 1840 const void *val, int bytes)
1651{ 1841{
1652 int ret; 1842 int ret;
1653 1843
1654 down_read(&vcpu->kvm->slots_lock);
1655 ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes); 1844 ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes);
1656 if (ret < 0) { 1845 if (ret < 0)
1657 up_read(&vcpu->kvm->slots_lock);
1658 return 0; 1846 return 0;
1659 }
1660 kvm_mmu_pte_write(vcpu, gpa, val, bytes); 1847 kvm_mmu_pte_write(vcpu, gpa, val, bytes);
1661 up_read(&vcpu->kvm->slots_lock);
1662 return 1; 1848 return 1;
1663} 1849}
1664 1850
@@ -1670,9 +1856,7 @@ static int emulator_write_emulated_onepage(unsigned long addr,
1670 struct kvm_io_device *mmio_dev; 1856 struct kvm_io_device *mmio_dev;
1671 gpa_t gpa; 1857 gpa_t gpa;
1672 1858
1673 down_read(&vcpu->kvm->slots_lock);
1674 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); 1859 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
1675 up_read(&vcpu->kvm->slots_lock);
1676 1860
1677 if (gpa == UNMAPPED_GVA) { 1861 if (gpa == UNMAPPED_GVA) {
1678 kvm_inject_page_fault(vcpu, addr, 2); 1862 kvm_inject_page_fault(vcpu, addr, 2);
@@ -1749,7 +1933,6 @@ static int emulator_cmpxchg_emulated(unsigned long addr,
1749 char *kaddr; 1933 char *kaddr;
1750 u64 val; 1934 u64 val;
1751 1935
1752 down_read(&vcpu->kvm->slots_lock);
1753 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); 1936 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
1754 1937
1755 if (gpa == UNMAPPED_GVA || 1938 if (gpa == UNMAPPED_GVA ||
@@ -1769,9 +1952,8 @@ static int emulator_cmpxchg_emulated(unsigned long addr,
1769 set_64bit((u64 *)(kaddr + offset_in_page(gpa)), val); 1952 set_64bit((u64 *)(kaddr + offset_in_page(gpa)), val);
1770 kunmap_atomic(kaddr, KM_USER0); 1953 kunmap_atomic(kaddr, KM_USER0);
1771 kvm_release_page_dirty(page); 1954 kvm_release_page_dirty(page);
1772 emul_write:
1773 up_read(&vcpu->kvm->slots_lock);
1774 } 1955 }
1956emul_write:
1775#endif 1957#endif
1776 1958
1777 return emulator_write_emulated(addr, new, bytes, vcpu); 1959 return emulator_write_emulated(addr, new, bytes, vcpu);
@@ -1802,7 +1984,7 @@ int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest)
1802 *dest = kvm_x86_ops->get_dr(vcpu, dr); 1984 *dest = kvm_x86_ops->get_dr(vcpu, dr);
1803 return X86EMUL_CONTINUE; 1985 return X86EMUL_CONTINUE;
1804 default: 1986 default:
1805 pr_unimpl(vcpu, "%s: unexpected dr %u\n", __FUNCTION__, dr); 1987 pr_unimpl(vcpu, "%s: unexpected dr %u\n", __func__, dr);
1806 return X86EMUL_UNHANDLEABLE; 1988 return X86EMUL_UNHANDLEABLE;
1807 } 1989 }
1808} 1990}
@@ -1840,7 +2022,7 @@ void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
1840} 2022}
1841EXPORT_SYMBOL_GPL(kvm_report_emulation_failure); 2023EXPORT_SYMBOL_GPL(kvm_report_emulation_failure);
1842 2024
1843struct x86_emulate_ops emulate_ops = { 2025static struct x86_emulate_ops emulate_ops = {
1844 .read_std = emulator_read_std, 2026 .read_std = emulator_read_std,
1845 .read_emulated = emulator_read_emulated, 2027 .read_emulated = emulator_read_emulated,
1846 .write_emulated = emulator_write_emulated, 2028 .write_emulated = emulator_write_emulated,
@@ -2091,6 +2273,13 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
2091 vcpu->arch.pio.guest_page_offset = 0; 2273 vcpu->arch.pio.guest_page_offset = 0;
2092 vcpu->arch.pio.rep = 0; 2274 vcpu->arch.pio.rep = 0;
2093 2275
2276 if (vcpu->run->io.direction == KVM_EXIT_IO_IN)
2277 KVMTRACE_2D(IO_READ, vcpu, vcpu->run->io.port, (u32)size,
2278 handler);
2279 else
2280 KVMTRACE_2D(IO_WRITE, vcpu, vcpu->run->io.port, (u32)size,
2281 handler);
2282
2094 kvm_x86_ops->cache_regs(vcpu); 2283 kvm_x86_ops->cache_regs(vcpu);
2095 memcpy(vcpu->arch.pio_data, &vcpu->arch.regs[VCPU_REGS_RAX], 4); 2284 memcpy(vcpu->arch.pio_data, &vcpu->arch.regs[VCPU_REGS_RAX], 4);
2096 kvm_x86_ops->decache_regs(vcpu); 2285 kvm_x86_ops->decache_regs(vcpu);
@@ -2129,6 +2318,13 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
2129 vcpu->arch.pio.guest_page_offset = offset_in_page(address); 2318 vcpu->arch.pio.guest_page_offset = offset_in_page(address);
2130 vcpu->arch.pio.rep = rep; 2319 vcpu->arch.pio.rep = rep;
2131 2320
2321 if (vcpu->run->io.direction == KVM_EXIT_IO_IN)
2322 KVMTRACE_2D(IO_READ, vcpu, vcpu->run->io.port, (u32)size,
2323 handler);
2324 else
2325 KVMTRACE_2D(IO_WRITE, vcpu, vcpu->run->io.port, (u32)size,
2326 handler);
2327
2132 if (!count) { 2328 if (!count) {
2133 kvm_x86_ops->skip_emulated_instruction(vcpu); 2329 kvm_x86_ops->skip_emulated_instruction(vcpu);
2134 return 1; 2330 return 1;
@@ -2163,10 +2359,8 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
2163 kvm_x86_ops->skip_emulated_instruction(vcpu); 2359 kvm_x86_ops->skip_emulated_instruction(vcpu);
2164 2360
2165 for (i = 0; i < nr_pages; ++i) { 2361 for (i = 0; i < nr_pages; ++i) {
2166 down_read(&vcpu->kvm->slots_lock);
2167 page = gva_to_page(vcpu, address + i * PAGE_SIZE); 2362 page = gva_to_page(vcpu, address + i * PAGE_SIZE);
2168 vcpu->arch.pio.guest_pages[i] = page; 2363 vcpu->arch.pio.guest_pages[i] = page;
2169 up_read(&vcpu->kvm->slots_lock);
2170 if (!page) { 2364 if (!page) {
2171 kvm_inject_gp(vcpu, 0); 2365 kvm_inject_gp(vcpu, 0);
2172 free_pio_guest_pages(vcpu); 2366 free_pio_guest_pages(vcpu);
@@ -2238,10 +2432,13 @@ void kvm_arch_exit(void)
2238int kvm_emulate_halt(struct kvm_vcpu *vcpu) 2432int kvm_emulate_halt(struct kvm_vcpu *vcpu)
2239{ 2433{
2240 ++vcpu->stat.halt_exits; 2434 ++vcpu->stat.halt_exits;
2435 KVMTRACE_0D(HLT, vcpu, handler);
2241 if (irqchip_in_kernel(vcpu->kvm)) { 2436 if (irqchip_in_kernel(vcpu->kvm)) {
2242 vcpu->arch.mp_state = VCPU_MP_STATE_HALTED; 2437 vcpu->arch.mp_state = KVM_MP_STATE_HALTED;
2438 up_read(&vcpu->kvm->slots_lock);
2243 kvm_vcpu_block(vcpu); 2439 kvm_vcpu_block(vcpu);
2244 if (vcpu->arch.mp_state != VCPU_MP_STATE_RUNNABLE) 2440 down_read(&vcpu->kvm->slots_lock);
2441 if (vcpu->arch.mp_state != KVM_MP_STATE_RUNNABLE)
2245 return -EINTR; 2442 return -EINTR;
2246 return 1; 2443 return 1;
2247 } else { 2444 } else {
@@ -2251,9 +2448,19 @@ int kvm_emulate_halt(struct kvm_vcpu *vcpu)
2251} 2448}
2252EXPORT_SYMBOL_GPL(kvm_emulate_halt); 2449EXPORT_SYMBOL_GPL(kvm_emulate_halt);
2253 2450
2451static inline gpa_t hc_gpa(struct kvm_vcpu *vcpu, unsigned long a0,
2452 unsigned long a1)
2453{
2454 if (is_long_mode(vcpu))
2455 return a0;
2456 else
2457 return a0 | ((gpa_t)a1 << 32);
2458}
2459
2254int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) 2460int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
2255{ 2461{
2256 unsigned long nr, a0, a1, a2, a3, ret; 2462 unsigned long nr, a0, a1, a2, a3, ret;
2463 int r = 1;
2257 2464
2258 kvm_x86_ops->cache_regs(vcpu); 2465 kvm_x86_ops->cache_regs(vcpu);
2259 2466
@@ -2263,6 +2470,8 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
2263 a2 = vcpu->arch.regs[VCPU_REGS_RDX]; 2470 a2 = vcpu->arch.regs[VCPU_REGS_RDX];
2264 a3 = vcpu->arch.regs[VCPU_REGS_RSI]; 2471 a3 = vcpu->arch.regs[VCPU_REGS_RSI];
2265 2472
2473 KVMTRACE_1D(VMMCALL, vcpu, (u32)nr, handler);
2474
2266 if (!is_long_mode(vcpu)) { 2475 if (!is_long_mode(vcpu)) {
2267 nr &= 0xFFFFFFFF; 2476 nr &= 0xFFFFFFFF;
2268 a0 &= 0xFFFFFFFF; 2477 a0 &= 0xFFFFFFFF;
@@ -2275,13 +2484,17 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
2275 case KVM_HC_VAPIC_POLL_IRQ: 2484 case KVM_HC_VAPIC_POLL_IRQ:
2276 ret = 0; 2485 ret = 0;
2277 break; 2486 break;
2487 case KVM_HC_MMU_OP:
2488 r = kvm_pv_mmu_op(vcpu, a0, hc_gpa(vcpu, a1, a2), &ret);
2489 break;
2278 default: 2490 default:
2279 ret = -KVM_ENOSYS; 2491 ret = -KVM_ENOSYS;
2280 break; 2492 break;
2281 } 2493 }
2282 vcpu->arch.regs[VCPU_REGS_RAX] = ret; 2494 vcpu->arch.regs[VCPU_REGS_RAX] = ret;
2283 kvm_x86_ops->decache_regs(vcpu); 2495 kvm_x86_ops->decache_regs(vcpu);
2284 return 0; 2496 ++vcpu->stat.hypercalls;
2497 return r;
2285} 2498}
2286EXPORT_SYMBOL_GPL(kvm_emulate_hypercall); 2499EXPORT_SYMBOL_GPL(kvm_emulate_hypercall);
2287 2500
@@ -2329,7 +2542,7 @@ void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
2329void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw, 2542void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
2330 unsigned long *rflags) 2543 unsigned long *rflags)
2331{ 2544{
2332 lmsw(vcpu, msw); 2545 kvm_lmsw(vcpu, msw);
2333 *rflags = kvm_x86_ops->get_rflags(vcpu); 2546 *rflags = kvm_x86_ops->get_rflags(vcpu);
2334} 2547}
2335 2548
@@ -2346,9 +2559,9 @@ unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
2346 case 4: 2559 case 4:
2347 return vcpu->arch.cr4; 2560 return vcpu->arch.cr4;
2348 case 8: 2561 case 8:
2349 return get_cr8(vcpu); 2562 return kvm_get_cr8(vcpu);
2350 default: 2563 default:
2351 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr); 2564 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
2352 return 0; 2565 return 0;
2353 } 2566 }
2354} 2567}
@@ -2358,23 +2571,23 @@ void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val,
2358{ 2571{
2359 switch (cr) { 2572 switch (cr) {
2360 case 0: 2573 case 0:
2361 set_cr0(vcpu, mk_cr_64(vcpu->arch.cr0, val)); 2574 kvm_set_cr0(vcpu, mk_cr_64(vcpu->arch.cr0, val));
2362 *rflags = kvm_x86_ops->get_rflags(vcpu); 2575 *rflags = kvm_x86_ops->get_rflags(vcpu);
2363 break; 2576 break;
2364 case 2: 2577 case 2:
2365 vcpu->arch.cr2 = val; 2578 vcpu->arch.cr2 = val;
2366 break; 2579 break;
2367 case 3: 2580 case 3:
2368 set_cr3(vcpu, val); 2581 kvm_set_cr3(vcpu, val);
2369 break; 2582 break;
2370 case 4: 2583 case 4:
2371 set_cr4(vcpu, mk_cr_64(vcpu->arch.cr4, val)); 2584 kvm_set_cr4(vcpu, mk_cr_64(vcpu->arch.cr4, val));
2372 break; 2585 break;
2373 case 8: 2586 case 8:
2374 set_cr8(vcpu, val & 0xfUL); 2587 kvm_set_cr8(vcpu, val & 0xfUL);
2375 break; 2588 break;
2376 default: 2589 default:
2377 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr); 2590 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
2378 } 2591 }
2379} 2592}
2380 2593
@@ -2447,6 +2660,11 @@ void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
2447 } 2660 }
2448 kvm_x86_ops->decache_regs(vcpu); 2661 kvm_x86_ops->decache_regs(vcpu);
2449 kvm_x86_ops->skip_emulated_instruction(vcpu); 2662 kvm_x86_ops->skip_emulated_instruction(vcpu);
2663 KVMTRACE_5D(CPUID, vcpu, function,
2664 (u32)vcpu->arch.regs[VCPU_REGS_RAX],
2665 (u32)vcpu->arch.regs[VCPU_REGS_RBX],
2666 (u32)vcpu->arch.regs[VCPU_REGS_RCX],
2667 (u32)vcpu->arch.regs[VCPU_REGS_RDX], handler);
2450} 2668}
2451EXPORT_SYMBOL_GPL(kvm_emulate_cpuid); 2669EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
2452 2670
@@ -2469,7 +2687,7 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu,
2469 struct kvm_run *kvm_run) 2687 struct kvm_run *kvm_run)
2470{ 2688{
2471 kvm_run->if_flag = (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF) != 0; 2689 kvm_run->if_flag = (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF) != 0;
2472 kvm_run->cr8 = get_cr8(vcpu); 2690 kvm_run->cr8 = kvm_get_cr8(vcpu);
2473 kvm_run->apic_base = kvm_get_apic_base(vcpu); 2691 kvm_run->apic_base = kvm_get_apic_base(vcpu);
2474 if (irqchip_in_kernel(vcpu->kvm)) 2692 if (irqchip_in_kernel(vcpu->kvm))
2475 kvm_run->ready_for_interrupt_injection = 1; 2693 kvm_run->ready_for_interrupt_injection = 1;
@@ -2509,16 +2727,17 @@ static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2509{ 2727{
2510 int r; 2728 int r;
2511 2729
2512 if (unlikely(vcpu->arch.mp_state == VCPU_MP_STATE_SIPI_RECEIVED)) { 2730 if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) {
2513 pr_debug("vcpu %d received sipi with vector # %x\n", 2731 pr_debug("vcpu %d received sipi with vector # %x\n",
2514 vcpu->vcpu_id, vcpu->arch.sipi_vector); 2732 vcpu->vcpu_id, vcpu->arch.sipi_vector);
2515 kvm_lapic_reset(vcpu); 2733 kvm_lapic_reset(vcpu);
2516 r = kvm_x86_ops->vcpu_reset(vcpu); 2734 r = kvm_x86_ops->vcpu_reset(vcpu);
2517 if (r) 2735 if (r)
2518 return r; 2736 return r;
2519 vcpu->arch.mp_state = VCPU_MP_STATE_RUNNABLE; 2737 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
2520 } 2738 }
2521 2739
2740 down_read(&vcpu->kvm->slots_lock);
2522 vapic_enter(vcpu); 2741 vapic_enter(vcpu);
2523 2742
2524preempted: 2743preempted:
@@ -2526,6 +2745,10 @@ preempted:
2526 kvm_x86_ops->guest_debug_pre(vcpu); 2745 kvm_x86_ops->guest_debug_pre(vcpu);
2527 2746
2528again: 2747again:
2748 if (vcpu->requests)
2749 if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests))
2750 kvm_mmu_unload(vcpu);
2751
2529 r = kvm_mmu_reload(vcpu); 2752 r = kvm_mmu_reload(vcpu);
2530 if (unlikely(r)) 2753 if (unlikely(r))
2531 goto out; 2754 goto out;
@@ -2539,6 +2762,11 @@ again:
2539 r = 0; 2762 r = 0;
2540 goto out; 2763 goto out;
2541 } 2764 }
2765 if (test_and_clear_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests)) {
2766 kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
2767 r = 0;
2768 goto out;
2769 }
2542 } 2770 }
2543 2771
2544 kvm_inject_pending_timer_irqs(vcpu); 2772 kvm_inject_pending_timer_irqs(vcpu);
@@ -2557,6 +2785,14 @@ again:
2557 goto out; 2785 goto out;
2558 } 2786 }
2559 2787
2788 if (vcpu->requests)
2789 if (test_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests)) {
2790 local_irq_enable();
2791 preempt_enable();
2792 r = 1;
2793 goto out;
2794 }
2795
2560 if (signal_pending(current)) { 2796 if (signal_pending(current)) {
2561 local_irq_enable(); 2797 local_irq_enable();
2562 preempt_enable(); 2798 preempt_enable();
@@ -2566,6 +2802,13 @@ again:
2566 goto out; 2802 goto out;
2567 } 2803 }
2568 2804
2805 vcpu->guest_mode = 1;
2806 /*
2807 * Make sure that guest_mode assignment won't happen after
2808 * testing the pending IRQ vector bitmap.
2809 */
2810 smp_wmb();
2811
2569 if (vcpu->arch.exception.pending) 2812 if (vcpu->arch.exception.pending)
2570 __queue_exception(vcpu); 2813 __queue_exception(vcpu);
2571 else if (irqchip_in_kernel(vcpu->kvm)) 2814 else if (irqchip_in_kernel(vcpu->kvm))
@@ -2575,13 +2818,15 @@ again:
2575 2818
2576 kvm_lapic_sync_to_vapic(vcpu); 2819 kvm_lapic_sync_to_vapic(vcpu);
2577 2820
2578 vcpu->guest_mode = 1; 2821 up_read(&vcpu->kvm->slots_lock);
2822
2579 kvm_guest_enter(); 2823 kvm_guest_enter();
2580 2824
2581 if (vcpu->requests) 2825 if (vcpu->requests)
2582 if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests)) 2826 if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests))
2583 kvm_x86_ops->tlb_flush(vcpu); 2827 kvm_x86_ops->tlb_flush(vcpu);
2584 2828
2829 KVMTRACE_0D(VMENTRY, vcpu, entryexit);
2585 kvm_x86_ops->run(vcpu, kvm_run); 2830 kvm_x86_ops->run(vcpu, kvm_run);
2586 2831
2587 vcpu->guest_mode = 0; 2832 vcpu->guest_mode = 0;
@@ -2601,6 +2846,8 @@ again:
2601 2846
2602 preempt_enable(); 2847 preempt_enable();
2603 2848
2849 down_read(&vcpu->kvm->slots_lock);
2850
2604 /* 2851 /*
2605 * Profile KVM exit RIPs: 2852 * Profile KVM exit RIPs:
2606 */ 2853 */
@@ -2628,14 +2875,18 @@ again:
2628 } 2875 }
2629 2876
2630out: 2877out:
2878 up_read(&vcpu->kvm->slots_lock);
2631 if (r > 0) { 2879 if (r > 0) {
2632 kvm_resched(vcpu); 2880 kvm_resched(vcpu);
2881 down_read(&vcpu->kvm->slots_lock);
2633 goto preempted; 2882 goto preempted;
2634 } 2883 }
2635 2884
2636 post_kvm_run_save(vcpu, kvm_run); 2885 post_kvm_run_save(vcpu, kvm_run);
2637 2886
2887 down_read(&vcpu->kvm->slots_lock);
2638 vapic_exit(vcpu); 2888 vapic_exit(vcpu);
2889 up_read(&vcpu->kvm->slots_lock);
2639 2890
2640 return r; 2891 return r;
2641} 2892}
@@ -2647,7 +2898,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2647 2898
2648 vcpu_load(vcpu); 2899 vcpu_load(vcpu);
2649 2900
2650 if (unlikely(vcpu->arch.mp_state == VCPU_MP_STATE_UNINITIALIZED)) { 2901 if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) {
2651 kvm_vcpu_block(vcpu); 2902 kvm_vcpu_block(vcpu);
2652 vcpu_put(vcpu); 2903 vcpu_put(vcpu);
2653 return -EAGAIN; 2904 return -EAGAIN;
@@ -2658,7 +2909,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2658 2909
2659 /* re-sync apic's tpr */ 2910 /* re-sync apic's tpr */
2660 if (!irqchip_in_kernel(vcpu->kvm)) 2911 if (!irqchip_in_kernel(vcpu->kvm))
2661 set_cr8(vcpu, kvm_run->cr8); 2912 kvm_set_cr8(vcpu, kvm_run->cr8);
2662 2913
2663 if (vcpu->arch.pio.cur_count) { 2914 if (vcpu->arch.pio.cur_count) {
2664 r = complete_pio(vcpu); 2915 r = complete_pio(vcpu);
@@ -2670,9 +2921,12 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2670 memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8); 2921 memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
2671 vcpu->mmio_read_completed = 1; 2922 vcpu->mmio_read_completed = 1;
2672 vcpu->mmio_needed = 0; 2923 vcpu->mmio_needed = 0;
2924
2925 down_read(&vcpu->kvm->slots_lock);
2673 r = emulate_instruction(vcpu, kvm_run, 2926 r = emulate_instruction(vcpu, kvm_run,
2674 vcpu->arch.mmio_fault_cr2, 0, 2927 vcpu->arch.mmio_fault_cr2, 0,
2675 EMULTYPE_NO_DECODE); 2928 EMULTYPE_NO_DECODE);
2929 up_read(&vcpu->kvm->slots_lock);
2676 if (r == EMULATE_DO_MMIO) { 2930 if (r == EMULATE_DO_MMIO) {
2677 /* 2931 /*
2678 * Read-modify-write. Back to userspace. 2932 * Read-modify-write. Back to userspace.
@@ -2773,7 +3027,7 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
2773static void get_segment(struct kvm_vcpu *vcpu, 3027static void get_segment(struct kvm_vcpu *vcpu,
2774 struct kvm_segment *var, int seg) 3028 struct kvm_segment *var, int seg)
2775{ 3029{
2776 return kvm_x86_ops->get_segment(vcpu, var, seg); 3030 kvm_x86_ops->get_segment(vcpu, var, seg);
2777} 3031}
2778 3032
2779void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) 3033void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
@@ -2816,7 +3070,7 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
2816 sregs->cr2 = vcpu->arch.cr2; 3070 sregs->cr2 = vcpu->arch.cr2;
2817 sregs->cr3 = vcpu->arch.cr3; 3071 sregs->cr3 = vcpu->arch.cr3;
2818 sregs->cr4 = vcpu->arch.cr4; 3072 sregs->cr4 = vcpu->arch.cr4;
2819 sregs->cr8 = get_cr8(vcpu); 3073 sregs->cr8 = kvm_get_cr8(vcpu);
2820 sregs->efer = vcpu->arch.shadow_efer; 3074 sregs->efer = vcpu->arch.shadow_efer;
2821 sregs->apic_base = kvm_get_apic_base(vcpu); 3075 sregs->apic_base = kvm_get_apic_base(vcpu);
2822 3076
@@ -2836,12 +3090,438 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
2836 return 0; 3090 return 0;
2837} 3091}
2838 3092
3093int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
3094 struct kvm_mp_state *mp_state)
3095{
3096 vcpu_load(vcpu);
3097 mp_state->mp_state = vcpu->arch.mp_state;
3098 vcpu_put(vcpu);
3099 return 0;
3100}
3101
3102int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
3103 struct kvm_mp_state *mp_state)
3104{
3105 vcpu_load(vcpu);
3106 vcpu->arch.mp_state = mp_state->mp_state;
3107 vcpu_put(vcpu);
3108 return 0;
3109}
3110
2839static void set_segment(struct kvm_vcpu *vcpu, 3111static void set_segment(struct kvm_vcpu *vcpu,
2840 struct kvm_segment *var, int seg) 3112 struct kvm_segment *var, int seg)
2841{ 3113{
2842 return kvm_x86_ops->set_segment(vcpu, var, seg); 3114 kvm_x86_ops->set_segment(vcpu, var, seg);
3115}
3116
3117static void seg_desct_to_kvm_desct(struct desc_struct *seg_desc, u16 selector,
3118 struct kvm_segment *kvm_desct)
3119{
3120 kvm_desct->base = seg_desc->base0;
3121 kvm_desct->base |= seg_desc->base1 << 16;
3122 kvm_desct->base |= seg_desc->base2 << 24;
3123 kvm_desct->limit = seg_desc->limit0;
3124 kvm_desct->limit |= seg_desc->limit << 16;
3125 kvm_desct->selector = selector;
3126 kvm_desct->type = seg_desc->type;
3127 kvm_desct->present = seg_desc->p;
3128 kvm_desct->dpl = seg_desc->dpl;
3129 kvm_desct->db = seg_desc->d;
3130 kvm_desct->s = seg_desc->s;
3131 kvm_desct->l = seg_desc->l;
3132 kvm_desct->g = seg_desc->g;
3133 kvm_desct->avl = seg_desc->avl;
3134 if (!selector)
3135 kvm_desct->unusable = 1;
3136 else
3137 kvm_desct->unusable = 0;
3138 kvm_desct->padding = 0;
3139}
3140
3141static void get_segment_descritptor_dtable(struct kvm_vcpu *vcpu,
3142 u16 selector,
3143 struct descriptor_table *dtable)
3144{
3145 if (selector & 1 << 2) {
3146 struct kvm_segment kvm_seg;
3147
3148 get_segment(vcpu, &kvm_seg, VCPU_SREG_LDTR);
3149
3150 if (kvm_seg.unusable)
3151 dtable->limit = 0;
3152 else
3153 dtable->limit = kvm_seg.limit;
3154 dtable->base = kvm_seg.base;
3155 }
3156 else
3157 kvm_x86_ops->get_gdt(vcpu, dtable);
3158}
3159
3160/* allowed just for 8 bytes segments */
3161static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
3162 struct desc_struct *seg_desc)
3163{
3164 struct descriptor_table dtable;
3165 u16 index = selector >> 3;
3166
3167 get_segment_descritptor_dtable(vcpu, selector, &dtable);
3168
3169 if (dtable.limit < index * 8 + 7) {
3170 kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc);
3171 return 1;
3172 }
3173 return kvm_read_guest(vcpu->kvm, dtable.base + index * 8, seg_desc, 8);
3174}
3175
3176/* allowed just for 8 bytes segments */
3177static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
3178 struct desc_struct *seg_desc)
3179{
3180 struct descriptor_table dtable;
3181 u16 index = selector >> 3;
3182
3183 get_segment_descritptor_dtable(vcpu, selector, &dtable);
3184
3185 if (dtable.limit < index * 8 + 7)
3186 return 1;
3187 return kvm_write_guest(vcpu->kvm, dtable.base + index * 8, seg_desc, 8);
3188}
3189
3190static u32 get_tss_base_addr(struct kvm_vcpu *vcpu,
3191 struct desc_struct *seg_desc)
3192{
3193 u32 base_addr;
3194
3195 base_addr = seg_desc->base0;
3196 base_addr |= (seg_desc->base1 << 16);
3197 base_addr |= (seg_desc->base2 << 24);
3198
3199 return base_addr;
3200}
3201
3202static int load_tss_segment32(struct kvm_vcpu *vcpu,
3203 struct desc_struct *seg_desc,
3204 struct tss_segment_32 *tss)
3205{
3206 u32 base_addr;
3207
3208 base_addr = get_tss_base_addr(vcpu, seg_desc);
3209
3210 return kvm_read_guest(vcpu->kvm, base_addr, tss,
3211 sizeof(struct tss_segment_32));
3212}
3213
3214static int save_tss_segment32(struct kvm_vcpu *vcpu,
3215 struct desc_struct *seg_desc,
3216 struct tss_segment_32 *tss)
3217{
3218 u32 base_addr;
3219
3220 base_addr = get_tss_base_addr(vcpu, seg_desc);
3221
3222 return kvm_write_guest(vcpu->kvm, base_addr, tss,
3223 sizeof(struct tss_segment_32));
3224}
3225
3226static int load_tss_segment16(struct kvm_vcpu *vcpu,
3227 struct desc_struct *seg_desc,
3228 struct tss_segment_16 *tss)
3229{
3230 u32 base_addr;
3231
3232 base_addr = get_tss_base_addr(vcpu, seg_desc);
3233
3234 return kvm_read_guest(vcpu->kvm, base_addr, tss,
3235 sizeof(struct tss_segment_16));
3236}
3237
3238static int save_tss_segment16(struct kvm_vcpu *vcpu,
3239 struct desc_struct *seg_desc,
3240 struct tss_segment_16 *tss)
3241{
3242 u32 base_addr;
3243
3244 base_addr = get_tss_base_addr(vcpu, seg_desc);
3245
3246 return kvm_write_guest(vcpu->kvm, base_addr, tss,
3247 sizeof(struct tss_segment_16));
3248}
3249
3250static u16 get_segment_selector(struct kvm_vcpu *vcpu, int seg)
3251{
3252 struct kvm_segment kvm_seg;
3253
3254 get_segment(vcpu, &kvm_seg, seg);
3255 return kvm_seg.selector;
3256}
3257
3258static int load_segment_descriptor_to_kvm_desct(struct kvm_vcpu *vcpu,
3259 u16 selector,
3260 struct kvm_segment *kvm_seg)
3261{
3262 struct desc_struct seg_desc;
3263
3264 if (load_guest_segment_descriptor(vcpu, selector, &seg_desc))
3265 return 1;
3266 seg_desct_to_kvm_desct(&seg_desc, selector, kvm_seg);
3267 return 0;
3268}
3269
3270static int load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
3271 int type_bits, int seg)
3272{
3273 struct kvm_segment kvm_seg;
3274
3275 if (load_segment_descriptor_to_kvm_desct(vcpu, selector, &kvm_seg))
3276 return 1;
3277 kvm_seg.type |= type_bits;
3278
3279 if (seg != VCPU_SREG_SS && seg != VCPU_SREG_CS &&
3280 seg != VCPU_SREG_LDTR)
3281 if (!kvm_seg.s)
3282 kvm_seg.unusable = 1;
3283
3284 set_segment(vcpu, &kvm_seg, seg);
3285 return 0;
3286}
3287
3288static void save_state_to_tss32(struct kvm_vcpu *vcpu,
3289 struct tss_segment_32 *tss)
3290{
3291 tss->cr3 = vcpu->arch.cr3;
3292 tss->eip = vcpu->arch.rip;
3293 tss->eflags = kvm_x86_ops->get_rflags(vcpu);
3294 tss->eax = vcpu->arch.regs[VCPU_REGS_RAX];
3295 tss->ecx = vcpu->arch.regs[VCPU_REGS_RCX];
3296 tss->edx = vcpu->arch.regs[VCPU_REGS_RDX];
3297 tss->ebx = vcpu->arch.regs[VCPU_REGS_RBX];
3298 tss->esp = vcpu->arch.regs[VCPU_REGS_RSP];
3299 tss->ebp = vcpu->arch.regs[VCPU_REGS_RBP];
3300 tss->esi = vcpu->arch.regs[VCPU_REGS_RSI];
3301 tss->edi = vcpu->arch.regs[VCPU_REGS_RDI];
3302
3303 tss->es = get_segment_selector(vcpu, VCPU_SREG_ES);
3304 tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS);
3305 tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS);
3306 tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS);
3307 tss->fs = get_segment_selector(vcpu, VCPU_SREG_FS);
3308 tss->gs = get_segment_selector(vcpu, VCPU_SREG_GS);
3309 tss->ldt_selector = get_segment_selector(vcpu, VCPU_SREG_LDTR);
3310 tss->prev_task_link = get_segment_selector(vcpu, VCPU_SREG_TR);
3311}
3312
3313static int load_state_from_tss32(struct kvm_vcpu *vcpu,
3314 struct tss_segment_32 *tss)
3315{
3316 kvm_set_cr3(vcpu, tss->cr3);
3317
3318 vcpu->arch.rip = tss->eip;
3319 kvm_x86_ops->set_rflags(vcpu, tss->eflags | 2);
3320
3321 vcpu->arch.regs[VCPU_REGS_RAX] = tss->eax;
3322 vcpu->arch.regs[VCPU_REGS_RCX] = tss->ecx;
3323 vcpu->arch.regs[VCPU_REGS_RDX] = tss->edx;
3324 vcpu->arch.regs[VCPU_REGS_RBX] = tss->ebx;
3325 vcpu->arch.regs[VCPU_REGS_RSP] = tss->esp;
3326 vcpu->arch.regs[VCPU_REGS_RBP] = tss->ebp;
3327 vcpu->arch.regs[VCPU_REGS_RSI] = tss->esi;
3328 vcpu->arch.regs[VCPU_REGS_RDI] = tss->edi;
3329
3330 if (load_segment_descriptor(vcpu, tss->ldt_selector, 0, VCPU_SREG_LDTR))
3331 return 1;
3332
3333 if (load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES))
3334 return 1;
3335
3336 if (load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS))
3337 return 1;
3338
3339 if (load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS))
3340 return 1;
3341
3342 if (load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS))
3343 return 1;
3344
3345 if (load_segment_descriptor(vcpu, tss->fs, 1, VCPU_SREG_FS))
3346 return 1;
3347
3348 if (load_segment_descriptor(vcpu, tss->gs, 1, VCPU_SREG_GS))
3349 return 1;
3350 return 0;
3351}
3352
3353static void save_state_to_tss16(struct kvm_vcpu *vcpu,
3354 struct tss_segment_16 *tss)
3355{
3356 tss->ip = vcpu->arch.rip;
3357 tss->flag = kvm_x86_ops->get_rflags(vcpu);
3358 tss->ax = vcpu->arch.regs[VCPU_REGS_RAX];
3359 tss->cx = vcpu->arch.regs[VCPU_REGS_RCX];
3360 tss->dx = vcpu->arch.regs[VCPU_REGS_RDX];
3361 tss->bx = vcpu->arch.regs[VCPU_REGS_RBX];
3362 tss->sp = vcpu->arch.regs[VCPU_REGS_RSP];
3363 tss->bp = vcpu->arch.regs[VCPU_REGS_RBP];
3364 tss->si = vcpu->arch.regs[VCPU_REGS_RSI];
3365 tss->di = vcpu->arch.regs[VCPU_REGS_RDI];
3366
3367 tss->es = get_segment_selector(vcpu, VCPU_SREG_ES);
3368 tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS);
3369 tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS);
3370 tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS);
3371 tss->ldt = get_segment_selector(vcpu, VCPU_SREG_LDTR);
3372 tss->prev_task_link = get_segment_selector(vcpu, VCPU_SREG_TR);
3373}
3374
3375static int load_state_from_tss16(struct kvm_vcpu *vcpu,
3376 struct tss_segment_16 *tss)
3377{
3378 vcpu->arch.rip = tss->ip;
3379 kvm_x86_ops->set_rflags(vcpu, tss->flag | 2);
3380 vcpu->arch.regs[VCPU_REGS_RAX] = tss->ax;
3381 vcpu->arch.regs[VCPU_REGS_RCX] = tss->cx;
3382 vcpu->arch.regs[VCPU_REGS_RDX] = tss->dx;
3383 vcpu->arch.regs[VCPU_REGS_RBX] = tss->bx;
3384 vcpu->arch.regs[VCPU_REGS_RSP] = tss->sp;
3385 vcpu->arch.regs[VCPU_REGS_RBP] = tss->bp;
3386 vcpu->arch.regs[VCPU_REGS_RSI] = tss->si;
3387 vcpu->arch.regs[VCPU_REGS_RDI] = tss->di;
3388
3389 if (load_segment_descriptor(vcpu, tss->ldt, 0, VCPU_SREG_LDTR))
3390 return 1;
3391
3392 if (load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES))
3393 return 1;
3394
3395 if (load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS))
3396 return 1;
3397
3398 if (load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS))
3399 return 1;
3400
3401 if (load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS))
3402 return 1;
3403 return 0;
3404}
3405
3406int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector,
3407 struct desc_struct *cseg_desc,
3408 struct desc_struct *nseg_desc)
3409{
3410 struct tss_segment_16 tss_segment_16;
3411 int ret = 0;
3412
3413 if (load_tss_segment16(vcpu, cseg_desc, &tss_segment_16))
3414 goto out;
3415
3416 save_state_to_tss16(vcpu, &tss_segment_16);
3417 save_tss_segment16(vcpu, cseg_desc, &tss_segment_16);
3418
3419 if (load_tss_segment16(vcpu, nseg_desc, &tss_segment_16))
3420 goto out;
3421 if (load_state_from_tss16(vcpu, &tss_segment_16))
3422 goto out;
3423
3424 ret = 1;
3425out:
3426 return ret;
3427}
3428
3429int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector,
3430 struct desc_struct *cseg_desc,
3431 struct desc_struct *nseg_desc)
3432{
3433 struct tss_segment_32 tss_segment_32;
3434 int ret = 0;
3435
3436 if (load_tss_segment32(vcpu, cseg_desc, &tss_segment_32))
3437 goto out;
3438
3439 save_state_to_tss32(vcpu, &tss_segment_32);
3440 save_tss_segment32(vcpu, cseg_desc, &tss_segment_32);
3441
3442 if (load_tss_segment32(vcpu, nseg_desc, &tss_segment_32))
3443 goto out;
3444 if (load_state_from_tss32(vcpu, &tss_segment_32))
3445 goto out;
3446
3447 ret = 1;
3448out:
3449 return ret;
2843} 3450}
2844 3451
3452int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
3453{
3454 struct kvm_segment tr_seg;
3455 struct desc_struct cseg_desc;
3456 struct desc_struct nseg_desc;
3457 int ret = 0;
3458
3459 get_segment(vcpu, &tr_seg, VCPU_SREG_TR);
3460
3461 if (load_guest_segment_descriptor(vcpu, tss_selector, &nseg_desc))
3462 goto out;
3463
3464 if (load_guest_segment_descriptor(vcpu, tr_seg.selector, &cseg_desc))
3465 goto out;
3466
3467
3468 if (reason != TASK_SWITCH_IRET) {
3469 int cpl;
3470
3471 cpl = kvm_x86_ops->get_cpl(vcpu);
3472 if ((tss_selector & 3) > nseg_desc.dpl || cpl > nseg_desc.dpl) {
3473 kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
3474 return 1;
3475 }
3476 }
3477
3478 if (!nseg_desc.p || (nseg_desc.limit0 | nseg_desc.limit << 16) < 0x67) {
3479 kvm_queue_exception_e(vcpu, TS_VECTOR, tss_selector & 0xfffc);
3480 return 1;
3481 }
3482
3483 if (reason == TASK_SWITCH_IRET || reason == TASK_SWITCH_JMP) {
3484 cseg_desc.type &= ~(1 << 8); //clear the B flag
3485 save_guest_segment_descriptor(vcpu, tr_seg.selector,
3486 &cseg_desc);
3487 }
3488
3489 if (reason == TASK_SWITCH_IRET) {
3490 u32 eflags = kvm_x86_ops->get_rflags(vcpu);
3491 kvm_x86_ops->set_rflags(vcpu, eflags & ~X86_EFLAGS_NT);
3492 }
3493
3494 kvm_x86_ops->skip_emulated_instruction(vcpu);
3495 kvm_x86_ops->cache_regs(vcpu);
3496
3497 if (nseg_desc.type & 8)
3498 ret = kvm_task_switch_32(vcpu, tss_selector, &cseg_desc,
3499 &nseg_desc);
3500 else
3501 ret = kvm_task_switch_16(vcpu, tss_selector, &cseg_desc,
3502 &nseg_desc);
3503
3504 if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE) {
3505 u32 eflags = kvm_x86_ops->get_rflags(vcpu);
3506 kvm_x86_ops->set_rflags(vcpu, eflags | X86_EFLAGS_NT);
3507 }
3508
3509 if (reason != TASK_SWITCH_IRET) {
3510 nseg_desc.type |= (1 << 8);
3511 save_guest_segment_descriptor(vcpu, tss_selector,
3512 &nseg_desc);
3513 }
3514
3515 kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 | X86_CR0_TS);
3516 seg_desct_to_kvm_desct(&nseg_desc, tss_selector, &tr_seg);
3517 tr_seg.type = 11;
3518 set_segment(vcpu, &tr_seg, VCPU_SREG_TR);
3519out:
3520 kvm_x86_ops->decache_regs(vcpu);
3521 return ret;
3522}
3523EXPORT_SYMBOL_GPL(kvm_task_switch);
3524
2845int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, 3525int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
2846 struct kvm_sregs *sregs) 3526 struct kvm_sregs *sregs)
2847{ 3527{
@@ -2862,12 +3542,10 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
2862 mmu_reset_needed |= vcpu->arch.cr3 != sregs->cr3; 3542 mmu_reset_needed |= vcpu->arch.cr3 != sregs->cr3;
2863 vcpu->arch.cr3 = sregs->cr3; 3543 vcpu->arch.cr3 = sregs->cr3;
2864 3544
2865 set_cr8(vcpu, sregs->cr8); 3545 kvm_set_cr8(vcpu, sregs->cr8);
2866 3546
2867 mmu_reset_needed |= vcpu->arch.shadow_efer != sregs->efer; 3547 mmu_reset_needed |= vcpu->arch.shadow_efer != sregs->efer;
2868#ifdef CONFIG_X86_64
2869 kvm_x86_ops->set_efer(vcpu, sregs->efer); 3548 kvm_x86_ops->set_efer(vcpu, sregs->efer);
2870#endif
2871 kvm_set_apic_base(vcpu, sregs->apic_base); 3549 kvm_set_apic_base(vcpu, sregs->apic_base);
2872 3550
2873 kvm_x86_ops->decache_cr4_guest_bits(vcpu); 3551 kvm_x86_ops->decache_cr4_guest_bits(vcpu);
@@ -3141,9 +3819,9 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
3141 3819
3142 vcpu->arch.mmu.root_hpa = INVALID_PAGE; 3820 vcpu->arch.mmu.root_hpa = INVALID_PAGE;
3143 if (!irqchip_in_kernel(kvm) || vcpu->vcpu_id == 0) 3821 if (!irqchip_in_kernel(kvm) || vcpu->vcpu_id == 0)
3144 vcpu->arch.mp_state = VCPU_MP_STATE_RUNNABLE; 3822 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
3145 else 3823 else
3146 vcpu->arch.mp_state = VCPU_MP_STATE_UNINITIALIZED; 3824 vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED;
3147 3825
3148 page = alloc_page(GFP_KERNEL | __GFP_ZERO); 3826 page = alloc_page(GFP_KERNEL | __GFP_ZERO);
3149 if (!page) { 3827 if (!page) {
@@ -3175,7 +3853,9 @@ fail:
3175void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) 3853void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
3176{ 3854{
3177 kvm_free_lapic(vcpu); 3855 kvm_free_lapic(vcpu);
3856 down_read(&vcpu->kvm->slots_lock);
3178 kvm_mmu_destroy(vcpu); 3857 kvm_mmu_destroy(vcpu);
3858 up_read(&vcpu->kvm->slots_lock);
3179 free_page((unsigned long)vcpu->arch.pio_data); 3859 free_page((unsigned long)vcpu->arch.pio_data);
3180} 3860}
3181 3861
@@ -3219,10 +3899,13 @@ static void kvm_free_vcpus(struct kvm *kvm)
3219 3899
3220void kvm_arch_destroy_vm(struct kvm *kvm) 3900void kvm_arch_destroy_vm(struct kvm *kvm)
3221{ 3901{
3902 kvm_free_pit(kvm);
3222 kfree(kvm->arch.vpic); 3903 kfree(kvm->arch.vpic);
3223 kfree(kvm->arch.vioapic); 3904 kfree(kvm->arch.vioapic);
3224 kvm_free_vcpus(kvm); 3905 kvm_free_vcpus(kvm);
3225 kvm_free_physmem(kvm); 3906 kvm_free_physmem(kvm);
3907 if (kvm->arch.apic_access_page)
3908 put_page(kvm->arch.apic_access_page);
3226 kfree(kvm); 3909 kfree(kvm);
3227} 3910}
3228 3911
@@ -3278,8 +3961,8 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
3278 3961
3279int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) 3962int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
3280{ 3963{
3281 return vcpu->arch.mp_state == VCPU_MP_STATE_RUNNABLE 3964 return vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE
3282 || vcpu->arch.mp_state == VCPU_MP_STATE_SIPI_RECEIVED; 3965 || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED;
3283} 3966}
3284 3967
3285static void vcpu_kick_intr(void *info) 3968static void vcpu_kick_intr(void *info)
@@ -3293,11 +3976,17 @@ static void vcpu_kick_intr(void *info)
3293void kvm_vcpu_kick(struct kvm_vcpu *vcpu) 3976void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
3294{ 3977{
3295 int ipi_pcpu = vcpu->cpu; 3978 int ipi_pcpu = vcpu->cpu;
3979 int cpu = get_cpu();
3296 3980
3297 if (waitqueue_active(&vcpu->wq)) { 3981 if (waitqueue_active(&vcpu->wq)) {
3298 wake_up_interruptible(&vcpu->wq); 3982 wake_up_interruptible(&vcpu->wq);
3299 ++vcpu->stat.halt_wakeup; 3983 ++vcpu->stat.halt_wakeup;
3300 } 3984 }
3301 if (vcpu->guest_mode) 3985 /*
3986 * We may be called synchronously with irqs disabled in guest mode,
3987 * So need not to call smp_call_function_single() in that case.
3988 */
3989 if (vcpu->guest_mode && vcpu->cpu != cpu)
3302 smp_call_function_single(ipi_pcpu, vcpu_kick_intr, vcpu, 0, 0); 3990 smp_call_function_single(ipi_pcpu, vcpu_kick_intr, vcpu, 0, 0);
3991 put_cpu();
3303} 3992}
diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c
index 79586003397a..2ca08386f993 100644
--- a/arch/x86/kvm/x86_emulate.c
+++ b/arch/x86/kvm/x86_emulate.c
@@ -65,6 +65,14 @@
65#define MemAbs (1<<9) /* Memory operand is absolute displacement */ 65#define MemAbs (1<<9) /* Memory operand is absolute displacement */
66#define String (1<<10) /* String instruction (rep capable) */ 66#define String (1<<10) /* String instruction (rep capable) */
67#define Stack (1<<11) /* Stack instruction (push/pop) */ 67#define Stack (1<<11) /* Stack instruction (push/pop) */
68#define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */
69#define GroupDual (1<<15) /* Alternate decoding of mod == 3 */
70#define GroupMask 0xff /* Group number stored in bits 0:7 */
71
72enum {
73 Group1_80, Group1_81, Group1_82, Group1_83,
74 Group1A, Group3_Byte, Group3, Group4, Group5, Group7,
75};
68 76
69static u16 opcode_table[256] = { 77static u16 opcode_table[256] = {
70 /* 0x00 - 0x07 */ 78 /* 0x00 - 0x07 */
@@ -123,14 +131,14 @@ static u16 opcode_table[256] = {
123 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, 131 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
124 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, 132 ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
125 /* 0x80 - 0x87 */ 133 /* 0x80 - 0x87 */
126 ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM, 134 Group | Group1_80, Group | Group1_81,
127 ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM, 135 Group | Group1_82, Group | Group1_83,
128 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 136 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
129 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 137 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
130 /* 0x88 - 0x8F */ 138 /* 0x88 - 0x8F */
131 ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov, 139 ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov,
132 ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, 140 ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
133 0, ModRM | DstReg, 0, DstMem | SrcNone | ModRM | Mov | Stack, 141 0, ModRM | DstReg, 0, Group | Group1A,
134 /* 0x90 - 0x9F */ 142 /* 0x90 - 0x9F */
135 0, 0, 0, 0, 0, 0, 0, 0, 143 0, 0, 0, 0, 0, 0, 0, 0,
136 0, 0, 0, 0, ImplicitOps | Stack, ImplicitOps | Stack, 0, 0, 144 0, 0, 0, 0, ImplicitOps | Stack, ImplicitOps | Stack, 0, 0,
@@ -164,16 +172,15 @@ static u16 opcode_table[256] = {
164 0, 0, 0, 0, 172 0, 0, 0, 0,
165 /* 0xF0 - 0xF7 */ 173 /* 0xF0 - 0xF7 */
166 0, 0, 0, 0, 174 0, 0, 0, 0,
167 ImplicitOps, ImplicitOps, 175 ImplicitOps, ImplicitOps, Group | Group3_Byte, Group | Group3,
168 ByteOp | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM,
169 /* 0xF8 - 0xFF */ 176 /* 0xF8 - 0xFF */
170 ImplicitOps, 0, ImplicitOps, ImplicitOps, 177 ImplicitOps, 0, ImplicitOps, ImplicitOps,
171 0, 0, ByteOp | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM 178 0, 0, Group | Group4, Group | Group5,
172}; 179};
173 180
174static u16 twobyte_table[256] = { 181static u16 twobyte_table[256] = {
175 /* 0x00 - 0x0F */ 182 /* 0x00 - 0x0F */
176 0, SrcMem | ModRM | DstReg, 0, 0, 0, 0, ImplicitOps, 0, 183 0, Group | GroupDual | Group7, 0, 0, 0, 0, ImplicitOps, 0,
177 ImplicitOps, ImplicitOps, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 184 ImplicitOps, ImplicitOps, 0, 0, 0, ImplicitOps | ModRM, 0, 0,
178 /* 0x10 - 0x1F */ 185 /* 0x10 - 0x1F */
179 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, 0, 186 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, 0,
@@ -229,6 +236,56 @@ static u16 twobyte_table[256] = {
229 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 236 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
230}; 237};
231 238
239static u16 group_table[] = {
240 [Group1_80*8] =
241 ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM,
242 ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM,
243 ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM,
244 ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM,
245 [Group1_81*8] =
246 DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM,
247 DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM,
248 DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM,
249 DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM,
250 [Group1_82*8] =
251 ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM,
252 ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM,
253 ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM,
254 ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM,
255 [Group1_83*8] =
256 DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM,
257 DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM,
258 DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM,
259 DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM,
260 [Group1A*8] =
261 DstMem | SrcNone | ModRM | Mov | Stack, 0, 0, 0, 0, 0, 0, 0,
262 [Group3_Byte*8] =
263 ByteOp | SrcImm | DstMem | ModRM, 0,
264 ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM,
265 0, 0, 0, 0,
266 [Group3*8] =
267 DstMem | SrcImm | ModRM | SrcImm, 0,
268 DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM,
269 0, 0, 0, 0,
270 [Group4*8] =
271 ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM,
272 0, 0, 0, 0, 0, 0,
273 [Group5*8] =
274 DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM, 0, 0,
275 SrcMem | ModRM, 0, SrcMem | ModRM | Stack, 0,
276 [Group7*8] =
277 0, 0, ModRM | SrcMem, ModRM | SrcMem,
278 SrcNone | ModRM | DstMem | Mov, 0,
279 SrcMem16 | ModRM | Mov, SrcMem | ModRM | ByteOp,
280};
281
282static u16 group2_table[] = {
283 [Group7*8] =
284 SrcNone | ModRM, 0, 0, 0,
285 SrcNone | ModRM | DstMem | Mov, 0,
286 SrcMem16 | ModRM | Mov, 0,
287};
288
232/* EFLAGS bit definitions. */ 289/* EFLAGS bit definitions. */
233#define EFLG_OF (1<<11) 290#define EFLG_OF (1<<11)
234#define EFLG_DF (1<<10) 291#define EFLG_DF (1<<10)
@@ -317,7 +374,7 @@ static u16 twobyte_table[256] = {
317 374
318#define __emulate_2op(_op,_src,_dst,_eflags,_bx,_by,_wx,_wy,_lx,_ly,_qx,_qy) \ 375#define __emulate_2op(_op,_src,_dst,_eflags,_bx,_by,_wx,_wy,_lx,_ly,_qx,_qy) \
319 do { \ 376 do { \
320 unsigned long _tmp; \ 377 unsigned long __tmp; \
321 switch ((_dst).bytes) { \ 378 switch ((_dst).bytes) { \
322 case 1: \ 379 case 1: \
323 __asm__ __volatile__ ( \ 380 __asm__ __volatile__ ( \
@@ -325,7 +382,7 @@ static u16 twobyte_table[256] = {
325 _op"b %"_bx"3,%1; " \ 382 _op"b %"_bx"3,%1; " \
326 _POST_EFLAGS("0", "4", "2") \ 383 _POST_EFLAGS("0", "4", "2") \
327 : "=m" (_eflags), "=m" ((_dst).val), \ 384 : "=m" (_eflags), "=m" ((_dst).val), \
328 "=&r" (_tmp) \ 385 "=&r" (__tmp) \
329 : _by ((_src).val), "i" (EFLAGS_MASK)); \ 386 : _by ((_src).val), "i" (EFLAGS_MASK)); \
330 break; \ 387 break; \
331 default: \ 388 default: \
@@ -426,29 +483,40 @@ static u16 twobyte_table[256] = {
426 (_type)_x; \ 483 (_type)_x; \
427}) 484})
428 485
486static inline unsigned long ad_mask(struct decode_cache *c)
487{
488 return (1UL << (c->ad_bytes << 3)) - 1;
489}
490
429/* Access/update address held in a register, based on addressing mode. */ 491/* Access/update address held in a register, based on addressing mode. */
430#define address_mask(reg) \ 492static inline unsigned long
431 ((c->ad_bytes == sizeof(unsigned long)) ? \ 493address_mask(struct decode_cache *c, unsigned long reg)
432 (reg) : ((reg) & ((1UL << (c->ad_bytes << 3)) - 1))) 494{
433#define register_address(base, reg) \ 495 if (c->ad_bytes == sizeof(unsigned long))
434 ((base) + address_mask(reg)) 496 return reg;
435#define register_address_increment(reg, inc) \ 497 else
436 do { \ 498 return reg & ad_mask(c);
437 /* signed type ensures sign extension to long */ \ 499}
438 int _inc = (inc); \
439 if (c->ad_bytes == sizeof(unsigned long)) \
440 (reg) += _inc; \
441 else \
442 (reg) = ((reg) & \
443 ~((1UL << (c->ad_bytes << 3)) - 1)) | \
444 (((reg) + _inc) & \
445 ((1UL << (c->ad_bytes << 3)) - 1)); \
446 } while (0)
447 500
448#define JMP_REL(rel) \ 501static inline unsigned long
449 do { \ 502register_address(struct decode_cache *c, unsigned long base, unsigned long reg)
450 register_address_increment(c->eip, rel); \ 503{
451 } while (0) 504 return base + address_mask(c, reg);
505}
506
507static inline void
508register_address_increment(struct decode_cache *c, unsigned long *reg, int inc)
509{
510 if (c->ad_bytes == sizeof(unsigned long))
511 *reg += inc;
512 else
513 *reg = (*reg & ~ad_mask(c)) | ((*reg + inc) & ad_mask(c));
514}
515
516static inline void jmp_rel(struct decode_cache *c, int rel)
517{
518 register_address_increment(c, &c->eip, rel);
519}
452 520
453static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt, 521static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt,
454 struct x86_emulate_ops *ops, 522 struct x86_emulate_ops *ops,
@@ -763,7 +831,7 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
763 struct decode_cache *c = &ctxt->decode; 831 struct decode_cache *c = &ctxt->decode;
764 int rc = 0; 832 int rc = 0;
765 int mode = ctxt->mode; 833 int mode = ctxt->mode;
766 int def_op_bytes, def_ad_bytes; 834 int def_op_bytes, def_ad_bytes, group;
767 835
768 /* Shadow copy of register state. Committed on successful emulation. */ 836 /* Shadow copy of register state. Committed on successful emulation. */
769 837
@@ -864,12 +932,24 @@ done_prefixes:
864 c->b = insn_fetch(u8, 1, c->eip); 932 c->b = insn_fetch(u8, 1, c->eip);
865 c->d = twobyte_table[c->b]; 933 c->d = twobyte_table[c->b];
866 } 934 }
935 }
867 936
868 /* Unrecognised? */ 937 if (c->d & Group) {
869 if (c->d == 0) { 938 group = c->d & GroupMask;
870 DPRINTF("Cannot emulate %02x\n", c->b); 939 c->modrm = insn_fetch(u8, 1, c->eip);
871 return -1; 940 --c->eip;
872 } 941
942 group = (group << 3) + ((c->modrm >> 3) & 7);
943 if ((c->d & GroupDual) && (c->modrm >> 6) == 3)
944 c->d = group2_table[group];
945 else
946 c->d = group_table[group];
947 }
948
949 /* Unrecognised? */
950 if (c->d == 0) {
951 DPRINTF("Cannot emulate %02x\n", c->b);
952 return -1;
873 } 953 }
874 954
875 if (mode == X86EMUL_MODE_PROT64 && (c->d & Stack)) 955 if (mode == X86EMUL_MODE_PROT64 && (c->d & Stack))
@@ -924,6 +1004,7 @@ done_prefixes:
924 */ 1004 */
925 if ((c->d & ModRM) && c->modrm_mod == 3) { 1005 if ((c->d & ModRM) && c->modrm_mod == 3) {
926 c->src.type = OP_REG; 1006 c->src.type = OP_REG;
1007 c->src.val = c->modrm_val;
927 break; 1008 break;
928 } 1009 }
929 c->src.type = OP_MEM; 1010 c->src.type = OP_MEM;
@@ -967,6 +1048,7 @@ done_prefixes:
967 case DstMem: 1048 case DstMem:
968 if ((c->d & ModRM) && c->modrm_mod == 3) { 1049 if ((c->d & ModRM) && c->modrm_mod == 3) {
969 c->dst.type = OP_REG; 1050 c->dst.type = OP_REG;
1051 c->dst.val = c->dst.orig_val = c->modrm_val;
970 break; 1052 break;
971 } 1053 }
972 c->dst.type = OP_MEM; 1054 c->dst.type = OP_MEM;
@@ -984,8 +1066,8 @@ static inline void emulate_push(struct x86_emulate_ctxt *ctxt)
984 c->dst.type = OP_MEM; 1066 c->dst.type = OP_MEM;
985 c->dst.bytes = c->op_bytes; 1067 c->dst.bytes = c->op_bytes;
986 c->dst.val = c->src.val; 1068 c->dst.val = c->src.val;
987 register_address_increment(c->regs[VCPU_REGS_RSP], -c->op_bytes); 1069 register_address_increment(c, &c->regs[VCPU_REGS_RSP], -c->op_bytes);
988 c->dst.ptr = (void *) register_address(ctxt->ss_base, 1070 c->dst.ptr = (void *) register_address(c, ctxt->ss_base,
989 c->regs[VCPU_REGS_RSP]); 1071 c->regs[VCPU_REGS_RSP]);
990} 1072}
991 1073
@@ -995,13 +1077,13 @@ static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt,
995 struct decode_cache *c = &ctxt->decode; 1077 struct decode_cache *c = &ctxt->decode;
996 int rc; 1078 int rc;
997 1079
998 rc = ops->read_std(register_address(ctxt->ss_base, 1080 rc = ops->read_std(register_address(c, ctxt->ss_base,
999 c->regs[VCPU_REGS_RSP]), 1081 c->regs[VCPU_REGS_RSP]),
1000 &c->dst.val, c->dst.bytes, ctxt->vcpu); 1082 &c->dst.val, c->dst.bytes, ctxt->vcpu);
1001 if (rc != 0) 1083 if (rc != 0)
1002 return rc; 1084 return rc;
1003 1085
1004 register_address_increment(c->regs[VCPU_REGS_RSP], c->dst.bytes); 1086 register_address_increment(c, &c->regs[VCPU_REGS_RSP], c->dst.bytes);
1005 1087
1006 return 0; 1088 return 0;
1007} 1089}
@@ -1043,26 +1125,6 @@ static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt,
1043 1125
1044 switch (c->modrm_reg) { 1126 switch (c->modrm_reg) {
1045 case 0 ... 1: /* test */ 1127 case 0 ... 1: /* test */
1046 /*
1047 * Special case in Grp3: test has an immediate
1048 * source operand.
1049 */
1050 c->src.type = OP_IMM;
1051 c->src.ptr = (unsigned long *)c->eip;
1052 c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1053 if (c->src.bytes == 8)
1054 c->src.bytes = 4;
1055 switch (c->src.bytes) {
1056 case 1:
1057 c->src.val = insn_fetch(s8, 1, c->eip);
1058 break;
1059 case 2:
1060 c->src.val = insn_fetch(s16, 2, c->eip);
1061 break;
1062 case 4:
1063 c->src.val = insn_fetch(s32, 4, c->eip);
1064 break;
1065 }
1066 emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags); 1128 emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags);
1067 break; 1129 break;
1068 case 2: /* not */ 1130 case 2: /* not */
@@ -1076,7 +1138,6 @@ static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt,
1076 rc = X86EMUL_UNHANDLEABLE; 1138 rc = X86EMUL_UNHANDLEABLE;
1077 break; 1139 break;
1078 } 1140 }
1079done:
1080 return rc; 1141 return rc;
1081} 1142}
1082 1143
@@ -1084,7 +1145,6 @@ static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt,
1084 struct x86_emulate_ops *ops) 1145 struct x86_emulate_ops *ops)
1085{ 1146{
1086 struct decode_cache *c = &ctxt->decode; 1147 struct decode_cache *c = &ctxt->decode;
1087 int rc;
1088 1148
1089 switch (c->modrm_reg) { 1149 switch (c->modrm_reg) {
1090 case 0: /* inc */ 1150 case 0: /* inc */
@@ -1094,36 +1154,11 @@ static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt,
1094 emulate_1op("dec", c->dst, ctxt->eflags); 1154 emulate_1op("dec", c->dst, ctxt->eflags);
1095 break; 1155 break;
1096 case 4: /* jmp abs */ 1156 case 4: /* jmp abs */
1097 if (c->b == 0xff) 1157 c->eip = c->src.val;
1098 c->eip = c->dst.val;
1099 else {
1100 DPRINTF("Cannot emulate %02x\n", c->b);
1101 return X86EMUL_UNHANDLEABLE;
1102 }
1103 break; 1158 break;
1104 case 6: /* push */ 1159 case 6: /* push */
1105 1160 emulate_push(ctxt);
1106 /* 64-bit mode: PUSH always pushes a 64-bit operand. */
1107
1108 if (ctxt->mode == X86EMUL_MODE_PROT64) {
1109 c->dst.bytes = 8;
1110 rc = ops->read_std((unsigned long)c->dst.ptr,
1111 &c->dst.val, 8, ctxt->vcpu);
1112 if (rc != 0)
1113 return rc;
1114 }
1115 register_address_increment(c->regs[VCPU_REGS_RSP],
1116 -c->dst.bytes);
1117 rc = ops->write_emulated(register_address(ctxt->ss_base,
1118 c->regs[VCPU_REGS_RSP]), &c->dst.val,
1119 c->dst.bytes, ctxt->vcpu);
1120 if (rc != 0)
1121 return rc;
1122 c->dst.type = OP_NONE;
1123 break; 1161 break;
1124 default:
1125 DPRINTF("Cannot emulate %02x\n", c->b);
1126 return X86EMUL_UNHANDLEABLE;
1127 } 1162 }
1128 return 0; 1163 return 0;
1129} 1164}
@@ -1361,19 +1396,19 @@ special_insn:
1361 c->dst.type = OP_MEM; 1396 c->dst.type = OP_MEM;
1362 c->dst.bytes = c->op_bytes; 1397 c->dst.bytes = c->op_bytes;
1363 c->dst.val = c->src.val; 1398 c->dst.val = c->src.val;
1364 register_address_increment(c->regs[VCPU_REGS_RSP], 1399 register_address_increment(c, &c->regs[VCPU_REGS_RSP],
1365 -c->op_bytes); 1400 -c->op_bytes);
1366 c->dst.ptr = (void *) register_address( 1401 c->dst.ptr = (void *) register_address(
1367 ctxt->ss_base, c->regs[VCPU_REGS_RSP]); 1402 c, ctxt->ss_base, c->regs[VCPU_REGS_RSP]);
1368 break; 1403 break;
1369 case 0x58 ... 0x5f: /* pop reg */ 1404 case 0x58 ... 0x5f: /* pop reg */
1370 pop_instruction: 1405 pop_instruction:
1371 if ((rc = ops->read_std(register_address(ctxt->ss_base, 1406 if ((rc = ops->read_std(register_address(c, ctxt->ss_base,
1372 c->regs[VCPU_REGS_RSP]), c->dst.ptr, 1407 c->regs[VCPU_REGS_RSP]), c->dst.ptr,
1373 c->op_bytes, ctxt->vcpu)) != 0) 1408 c->op_bytes, ctxt->vcpu)) != 0)
1374 goto done; 1409 goto done;
1375 1410
1376 register_address_increment(c->regs[VCPU_REGS_RSP], 1411 register_address_increment(c, &c->regs[VCPU_REGS_RSP],
1377 c->op_bytes); 1412 c->op_bytes);
1378 c->dst.type = OP_NONE; /* Disable writeback. */ 1413 c->dst.type = OP_NONE; /* Disable writeback. */
1379 break; 1414 break;
@@ -1393,9 +1428,9 @@ special_insn:
1393 1, 1428 1,
1394 (c->d & ByteOp) ? 1 : c->op_bytes, 1429 (c->d & ByteOp) ? 1 : c->op_bytes,
1395 c->rep_prefix ? 1430 c->rep_prefix ?
1396 address_mask(c->regs[VCPU_REGS_RCX]) : 1, 1431 address_mask(c, c->regs[VCPU_REGS_RCX]) : 1,
1397 (ctxt->eflags & EFLG_DF), 1432 (ctxt->eflags & EFLG_DF),
1398 register_address(ctxt->es_base, 1433 register_address(c, ctxt->es_base,
1399 c->regs[VCPU_REGS_RDI]), 1434 c->regs[VCPU_REGS_RDI]),
1400 c->rep_prefix, 1435 c->rep_prefix,
1401 c->regs[VCPU_REGS_RDX]) == 0) { 1436 c->regs[VCPU_REGS_RDX]) == 0) {
@@ -1409,9 +1444,9 @@ special_insn:
1409 0, 1444 0,
1410 (c->d & ByteOp) ? 1 : c->op_bytes, 1445 (c->d & ByteOp) ? 1 : c->op_bytes,
1411 c->rep_prefix ? 1446 c->rep_prefix ?
1412 address_mask(c->regs[VCPU_REGS_RCX]) : 1, 1447 address_mask(c, c->regs[VCPU_REGS_RCX]) : 1,
1413 (ctxt->eflags & EFLG_DF), 1448 (ctxt->eflags & EFLG_DF),
1414 register_address(c->override_base ? 1449 register_address(c, c->override_base ?
1415 *c->override_base : 1450 *c->override_base :
1416 ctxt->ds_base, 1451 ctxt->ds_base,
1417 c->regs[VCPU_REGS_RSI]), 1452 c->regs[VCPU_REGS_RSI]),
@@ -1425,7 +1460,7 @@ special_insn:
1425 int rel = insn_fetch(s8, 1, c->eip); 1460 int rel = insn_fetch(s8, 1, c->eip);
1426 1461
1427 if (test_cc(c->b, ctxt->eflags)) 1462 if (test_cc(c->b, ctxt->eflags))
1428 JMP_REL(rel); 1463 jmp_rel(c, rel);
1429 break; 1464 break;
1430 } 1465 }
1431 case 0x80 ... 0x83: /* Grp1 */ 1466 case 0x80 ... 0x83: /* Grp1 */
@@ -1477,7 +1512,7 @@ special_insn:
1477 case 0x88 ... 0x8b: /* mov */ 1512 case 0x88 ... 0x8b: /* mov */
1478 goto mov; 1513 goto mov;
1479 case 0x8d: /* lea r16/r32, m */ 1514 case 0x8d: /* lea r16/r32, m */
1480 c->dst.val = c->modrm_val; 1515 c->dst.val = c->modrm_ea;
1481 break; 1516 break;
1482 case 0x8f: /* pop (sole member of Grp1a) */ 1517 case 0x8f: /* pop (sole member of Grp1a) */
1483 rc = emulate_grp1a(ctxt, ops); 1518 rc = emulate_grp1a(ctxt, ops);
@@ -1501,27 +1536,27 @@ special_insn:
1501 case 0xa4 ... 0xa5: /* movs */ 1536 case 0xa4 ... 0xa5: /* movs */
1502 c->dst.type = OP_MEM; 1537 c->dst.type = OP_MEM;
1503 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; 1538 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1504 c->dst.ptr = (unsigned long *)register_address( 1539 c->dst.ptr = (unsigned long *)register_address(c,
1505 ctxt->es_base, 1540 ctxt->es_base,
1506 c->regs[VCPU_REGS_RDI]); 1541 c->regs[VCPU_REGS_RDI]);
1507 if ((rc = ops->read_emulated(register_address( 1542 if ((rc = ops->read_emulated(register_address(c,
1508 c->override_base ? *c->override_base : 1543 c->override_base ? *c->override_base :
1509 ctxt->ds_base, 1544 ctxt->ds_base,
1510 c->regs[VCPU_REGS_RSI]), 1545 c->regs[VCPU_REGS_RSI]),
1511 &c->dst.val, 1546 &c->dst.val,
1512 c->dst.bytes, ctxt->vcpu)) != 0) 1547 c->dst.bytes, ctxt->vcpu)) != 0)
1513 goto done; 1548 goto done;
1514 register_address_increment(c->regs[VCPU_REGS_RSI], 1549 register_address_increment(c, &c->regs[VCPU_REGS_RSI],
1515 (ctxt->eflags & EFLG_DF) ? -c->dst.bytes 1550 (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
1516 : c->dst.bytes); 1551 : c->dst.bytes);
1517 register_address_increment(c->regs[VCPU_REGS_RDI], 1552 register_address_increment(c, &c->regs[VCPU_REGS_RDI],
1518 (ctxt->eflags & EFLG_DF) ? -c->dst.bytes 1553 (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
1519 : c->dst.bytes); 1554 : c->dst.bytes);
1520 break; 1555 break;
1521 case 0xa6 ... 0xa7: /* cmps */ 1556 case 0xa6 ... 0xa7: /* cmps */
1522 c->src.type = OP_NONE; /* Disable writeback. */ 1557 c->src.type = OP_NONE; /* Disable writeback. */
1523 c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; 1558 c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1524 c->src.ptr = (unsigned long *)register_address( 1559 c->src.ptr = (unsigned long *)register_address(c,
1525 c->override_base ? *c->override_base : 1560 c->override_base ? *c->override_base :
1526 ctxt->ds_base, 1561 ctxt->ds_base,
1527 c->regs[VCPU_REGS_RSI]); 1562 c->regs[VCPU_REGS_RSI]);
@@ -1533,7 +1568,7 @@ special_insn:
1533 1568
1534 c->dst.type = OP_NONE; /* Disable writeback. */ 1569 c->dst.type = OP_NONE; /* Disable writeback. */
1535 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; 1570 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1536 c->dst.ptr = (unsigned long *)register_address( 1571 c->dst.ptr = (unsigned long *)register_address(c,
1537 ctxt->es_base, 1572 ctxt->es_base,
1538 c->regs[VCPU_REGS_RDI]); 1573 c->regs[VCPU_REGS_RDI]);
1539 if ((rc = ops->read_emulated((unsigned long)c->dst.ptr, 1574 if ((rc = ops->read_emulated((unsigned long)c->dst.ptr,
@@ -1546,10 +1581,10 @@ special_insn:
1546 1581
1547 emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags); 1582 emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags);
1548 1583
1549 register_address_increment(c->regs[VCPU_REGS_RSI], 1584 register_address_increment(c, &c->regs[VCPU_REGS_RSI],
1550 (ctxt->eflags & EFLG_DF) ? -c->src.bytes 1585 (ctxt->eflags & EFLG_DF) ? -c->src.bytes
1551 : c->src.bytes); 1586 : c->src.bytes);
1552 register_address_increment(c->regs[VCPU_REGS_RDI], 1587 register_address_increment(c, &c->regs[VCPU_REGS_RDI],
1553 (ctxt->eflags & EFLG_DF) ? -c->dst.bytes 1588 (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
1554 : c->dst.bytes); 1589 : c->dst.bytes);
1555 1590
@@ -1557,11 +1592,11 @@ special_insn:
1557 case 0xaa ... 0xab: /* stos */ 1592 case 0xaa ... 0xab: /* stos */
1558 c->dst.type = OP_MEM; 1593 c->dst.type = OP_MEM;
1559 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; 1594 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1560 c->dst.ptr = (unsigned long *)register_address( 1595 c->dst.ptr = (unsigned long *)register_address(c,
1561 ctxt->es_base, 1596 ctxt->es_base,
1562 c->regs[VCPU_REGS_RDI]); 1597 c->regs[VCPU_REGS_RDI]);
1563 c->dst.val = c->regs[VCPU_REGS_RAX]; 1598 c->dst.val = c->regs[VCPU_REGS_RAX];
1564 register_address_increment(c->regs[VCPU_REGS_RDI], 1599 register_address_increment(c, &c->regs[VCPU_REGS_RDI],
1565 (ctxt->eflags & EFLG_DF) ? -c->dst.bytes 1600 (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
1566 : c->dst.bytes); 1601 : c->dst.bytes);
1567 break; 1602 break;
@@ -1569,7 +1604,7 @@ special_insn:
1569 c->dst.type = OP_REG; 1604 c->dst.type = OP_REG;
1570 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; 1605 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1571 c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX]; 1606 c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX];
1572 if ((rc = ops->read_emulated(register_address( 1607 if ((rc = ops->read_emulated(register_address(c,
1573 c->override_base ? *c->override_base : 1608 c->override_base ? *c->override_base :
1574 ctxt->ds_base, 1609 ctxt->ds_base,
1575 c->regs[VCPU_REGS_RSI]), 1610 c->regs[VCPU_REGS_RSI]),
@@ -1577,7 +1612,7 @@ special_insn:
1577 c->dst.bytes, 1612 c->dst.bytes,
1578 ctxt->vcpu)) != 0) 1613 ctxt->vcpu)) != 0)
1579 goto done; 1614 goto done;
1580 register_address_increment(c->regs[VCPU_REGS_RSI], 1615 register_address_increment(c, &c->regs[VCPU_REGS_RSI],
1581 (ctxt->eflags & EFLG_DF) ? -c->dst.bytes 1616 (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
1582 : c->dst.bytes); 1617 : c->dst.bytes);
1583 break; 1618 break;
@@ -1616,14 +1651,14 @@ special_insn:
1616 goto cannot_emulate; 1651 goto cannot_emulate;
1617 } 1652 }
1618 c->src.val = (unsigned long) c->eip; 1653 c->src.val = (unsigned long) c->eip;
1619 JMP_REL(rel); 1654 jmp_rel(c, rel);
1620 c->op_bytes = c->ad_bytes; 1655 c->op_bytes = c->ad_bytes;
1621 emulate_push(ctxt); 1656 emulate_push(ctxt);
1622 break; 1657 break;
1623 } 1658 }
1624 case 0xe9: /* jmp rel */ 1659 case 0xe9: /* jmp rel */
1625 case 0xeb: /* jmp rel short */ 1660 case 0xeb: /* jmp rel short */
1626 JMP_REL(c->src.val); 1661 jmp_rel(c, c->src.val);
1627 c->dst.type = OP_NONE; /* Disable writeback. */ 1662 c->dst.type = OP_NONE; /* Disable writeback. */
1628 break; 1663 break;
1629 case 0xf4: /* hlt */ 1664 case 0xf4: /* hlt */
@@ -1690,6 +1725,8 @@ twobyte_insn:
1690 goto done; 1725 goto done;
1691 1726
1692 kvm_emulate_hypercall(ctxt->vcpu); 1727 kvm_emulate_hypercall(ctxt->vcpu);
1728 /* Disable writeback. */
1729 c->dst.type = OP_NONE;
1693 break; 1730 break;
1694 case 2: /* lgdt */ 1731 case 2: /* lgdt */
1695 rc = read_descriptor(ctxt, ops, c->src.ptr, 1732 rc = read_descriptor(ctxt, ops, c->src.ptr,
@@ -1697,6 +1734,8 @@ twobyte_insn:
1697 if (rc) 1734 if (rc)
1698 goto done; 1735 goto done;
1699 realmode_lgdt(ctxt->vcpu, size, address); 1736 realmode_lgdt(ctxt->vcpu, size, address);
1737 /* Disable writeback. */
1738 c->dst.type = OP_NONE;
1700 break; 1739 break;
1701 case 3: /* lidt/vmmcall */ 1740 case 3: /* lidt/vmmcall */
1702 if (c->modrm_mod == 3 && c->modrm_rm == 1) { 1741 if (c->modrm_mod == 3 && c->modrm_rm == 1) {
@@ -1712,27 +1751,25 @@ twobyte_insn:
1712 goto done; 1751 goto done;
1713 realmode_lidt(ctxt->vcpu, size, address); 1752 realmode_lidt(ctxt->vcpu, size, address);
1714 } 1753 }
1754 /* Disable writeback. */
1755 c->dst.type = OP_NONE;
1715 break; 1756 break;
1716 case 4: /* smsw */ 1757 case 4: /* smsw */
1717 if (c->modrm_mod != 3) 1758 c->dst.bytes = 2;
1718 goto cannot_emulate; 1759 c->dst.val = realmode_get_cr(ctxt->vcpu, 0);
1719 *(u16 *)&c->regs[c->modrm_rm]
1720 = realmode_get_cr(ctxt->vcpu, 0);
1721 break; 1760 break;
1722 case 6: /* lmsw */ 1761 case 6: /* lmsw */
1723 if (c->modrm_mod != 3) 1762 realmode_lmsw(ctxt->vcpu, (u16)c->src.val,
1724 goto cannot_emulate; 1763 &ctxt->eflags);
1725 realmode_lmsw(ctxt->vcpu, (u16)c->modrm_val,
1726 &ctxt->eflags);
1727 break; 1764 break;
1728 case 7: /* invlpg*/ 1765 case 7: /* invlpg*/
1729 emulate_invlpg(ctxt->vcpu, memop); 1766 emulate_invlpg(ctxt->vcpu, memop);
1767 /* Disable writeback. */
1768 c->dst.type = OP_NONE;
1730 break; 1769 break;
1731 default: 1770 default:
1732 goto cannot_emulate; 1771 goto cannot_emulate;
1733 } 1772 }
1734 /* Disable writeback. */
1735 c->dst.type = OP_NONE;
1736 break; 1773 break;
1737 case 0x06: 1774 case 0x06:
1738 emulate_clts(ctxt->vcpu); 1775 emulate_clts(ctxt->vcpu);
@@ -1823,7 +1860,7 @@ twobyte_insn:
1823 goto cannot_emulate; 1860 goto cannot_emulate;
1824 } 1861 }
1825 if (test_cc(c->b, ctxt->eflags)) 1862 if (test_cc(c->b, ctxt->eflags))
1826 JMP_REL(rel); 1863 jmp_rel(c, rel);
1827 c->dst.type = OP_NONE; 1864 c->dst.type = OP_NONE;
1828 break; 1865 break;
1829 } 1866 }