aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2009-09-14 20:43:43 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2009-09-14 20:43:43 -0400
commit69def9f05dfce3281bb06599057e6b8097385d39 (patch)
tree7d826b22924268ddbfad101993b248996d40e2ec
parent353f6dd2dec992ddd34620a94b051b0f76227379 (diff)
parent8e616fc8d343bd7f0f0a0c22407fdcb77f6d22b1 (diff)
Merge branch 'kvm-updates/2.6.32' of git://git.kernel.org/pub/scm/virt/kvm/kvm
* 'kvm-updates/2.6.32' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (202 commits) MAINTAINERS: update KVM entry KVM: correct error-handling code KVM: fix compile warnings on s390 KVM: VMX: Check cpl before emulating debug register access KVM: fix misreporting of coalesced interrupts by kvm tracer KVM: x86: drop duplicate kvm_flush_remote_tlb calls KVM: VMX: call vmx_load_host_state() only if msr is cached KVM: VMX: Conditionally reload debug register 6 KVM: Use thread debug register storage instead of kvm specific data KVM guest: do not batch pte updates from interrupt context KVM: Fix coalesced interrupt reporting in IOAPIC KVM guest: fix bogus wallclock physical address calculation KVM: VMX: Fix cr8 exiting control clobbering by EPT KVM: Optimize kvm_mmu_unprotect_page_virt() for tdp KVM: Document KVM_CAP_IRQCHIP KVM: Protect update_cr8_intercept() when running without an apic KVM: VMX: Fix EPT with WP bit change during paging KVM: Use kvm_{read,write}_guest_virt() to read and write segment descriptors KVM: x86 emulator: Add adc and sbb missing decoder flags KVM: Add missing #include ...
-rw-r--r--Documentation/ioctl/ioctl-number.txt2
-rw-r--r--Documentation/kernel-parameters.txt39
-rw-r--r--Documentation/kvm/api.txt759
-rw-r--r--MAINTAINERS1
-rw-r--r--arch/ia64/include/asm/kvm_host.h4
-rw-r--r--arch/ia64/include/asm/kvm_para.h4
-rw-r--r--arch/ia64/kvm/Kconfig11
-rw-r--r--arch/ia64/kvm/kvm-ia64.c85
-rw-r--r--arch/ia64/kvm/vcpu.c4
-rw-r--r--arch/powerpc/include/asm/kvm_host.h4
-rw-r--r--arch/powerpc/kvm/44x.c4
-rw-r--r--arch/powerpc/kvm/44x_tlb.c11
-rw-r--r--arch/powerpc/kvm/Kconfig14
-rw-r--r--arch/powerpc/kvm/Makefile4
-rw-r--r--arch/powerpc/kvm/booke.c2
-rw-r--r--arch/powerpc/kvm/e500.c7
-rw-r--r--arch/powerpc/kvm/e500_emulate.c3
-rw-r--r--arch/powerpc/kvm/e500_tlb.c26
-rw-r--r--arch/powerpc/kvm/e500_tlb.h6
-rw-r--r--arch/powerpc/kvm/emulate.c7
-rw-r--r--arch/powerpc/kvm/powerpc.c32
-rw-r--r--arch/powerpc/kvm/trace.h104
-rw-r--r--arch/s390/include/asm/kvm.h9
-rw-r--r--arch/s390/include/asm/kvm_host.h15
-rw-r--r--arch/s390/include/asm/kvm_para.h4
-rw-r--r--arch/s390/kvm/Kconfig9
-rw-r--r--arch/s390/kvm/gaccess.h23
-rw-r--r--arch/s390/kvm/intercept.c18
-rw-r--r--arch/s390/kvm/interrupt.c8
-rw-r--r--arch/s390/kvm/kvm-s390.c78
-rw-r--r--arch/s390/kvm/kvm-s390.h32
-rw-r--r--arch/s390/kvm/sigp.c60
-rw-r--r--arch/x86/include/asm/apicdef.h2
-rw-r--r--arch/x86/include/asm/kvm.h10
-rw-r--r--arch/x86/include/asm/kvm_emulate.h (renamed from arch/x86/include/asm/kvm_x86_emulate.h)0
-rw-r--r--arch/x86/include/asm/kvm_host.h60
-rw-r--r--arch/x86/include/asm/kvm_para.h2
-rw-r--r--arch/x86/include/asm/msr-index.h1
-rw-r--r--arch/x86/include/asm/vmx.h8
-rw-r--r--arch/x86/kernel/kvm.c7
-rw-r--r--arch/x86/kernel/kvmclock.c4
-rw-r--r--arch/x86/kvm/Kconfig21
-rw-r--r--arch/x86/kvm/Makefile35
-rw-r--r--arch/x86/kvm/emulate.c (renamed from arch/x86/kvm/x86_emulate.c)265
-rw-r--r--arch/x86/kvm/i8254.c160
-rw-r--r--arch/x86/kvm/i8254.h5
-rw-r--r--arch/x86/kvm/i8259.c116
-rw-r--r--arch/x86/kvm/irq.h1
-rw-r--r--arch/x86/kvm/kvm_cache_regs.h9
-rw-r--r--arch/x86/kvm/kvm_svm.h51
-rw-r--r--arch/x86/kvm/kvm_timer.h2
-rw-r--r--arch/x86/kvm/lapic.c334
-rw-r--r--arch/x86/kvm/lapic.h4
-rw-r--r--arch/x86/kvm/mmu.c587
-rw-r--r--arch/x86/kvm/mmu.h4
-rw-r--r--arch/x86/kvm/mmutrace.h220
-rw-r--r--arch/x86/kvm/paging_tmpl.h141
-rw-r--r--arch/x86/kvm/svm.c889
-rw-r--r--arch/x86/kvm/timer.c16
-rw-r--r--arch/x86/kvm/trace.h355
-rw-r--r--arch/x86/kvm/vmx.c497
-rw-r--r--arch/x86/kvm/x86.c815
-rw-r--r--arch/x86/kvm/x86.h4
-rw-r--r--arch/x86/mm/highmem_32.c1
-rw-r--r--include/asm-generic/Kbuild.asm5
-rw-r--r--include/linux/Kbuild4
-rw-r--r--include/linux/kvm.h127
-rw-r--r--include/linux/kvm_host.h114
-rw-r--r--include/linux/kvm_para.h1
-rw-r--r--include/trace/events/kvm.h151
-rw-r--r--mm/hugetlb.c1
-rw-r--r--virt/kvm/Kconfig14
-rw-r--r--virt/kvm/coalesced_mmio.c74
-rw-r--r--virt/kvm/coalesced_mmio.h1
-rw-r--r--virt/kvm/eventfd.c578
-rw-r--r--virt/kvm/ioapic.c78
-rw-r--r--virt/kvm/iodev.h55
-rw-r--r--virt/kvm/irq_comm.c51
-rw-r--r--virt/kvm/kvm_main.c298
-rw-r--r--virt/kvm/kvm_trace.c285
80 files changed, 5692 insertions, 2160 deletions
diff --git a/Documentation/ioctl/ioctl-number.txt b/Documentation/ioctl/ioctl-number.txt
index 1c058b552e93..aafca0a8f66a 100644
--- a/Documentation/ioctl/ioctl-number.txt
+++ b/Documentation/ioctl/ioctl-number.txt
@@ -193,7 +193,7 @@ Code Seq# Include File Comments
1930xAD 00 Netfilter device in development: 1930xAD 00 Netfilter device in development:
194 <mailto:rusty@rustcorp.com.au> 194 <mailto:rusty@rustcorp.com.au>
1950xAE all linux/kvm.h Kernel-based Virtual Machine 1950xAE all linux/kvm.h Kernel-based Virtual Machine
196 <mailto:kvm-devel@lists.sourceforge.net> 196 <mailto:kvm@vger.kernel.org>
1970xB0 all RATIO devices in development: 1970xB0 all RATIO devices in development:
198 <mailto:vgo@ratio.de> 198 <mailto:vgo@ratio.de>
1990xB1 00-1F PPPoX <mailto:mostrows@styx.uwaterloo.ca> 1990xB1 00-1F PPPoX <mailto:mostrows@styx.uwaterloo.ca>
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index cb3a169e372a..3a238644c811 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -57,6 +57,7 @@ parameter is applicable:
57 ISAPNP ISA PnP code is enabled. 57 ISAPNP ISA PnP code is enabled.
58 ISDN Appropriate ISDN support is enabled. 58 ISDN Appropriate ISDN support is enabled.
59 JOY Appropriate joystick support is enabled. 59 JOY Appropriate joystick support is enabled.
60 KVM Kernel Virtual Machine support is enabled.
60 LIBATA Libata driver is enabled 61 LIBATA Libata driver is enabled
61 LP Printer support is enabled. 62 LP Printer support is enabled.
62 LOOP Loopback device support is enabled. 63 LOOP Loopback device support is enabled.
@@ -1098,6 +1099,44 @@ and is between 256 and 4096 characters. It is defined in the file
1098 kstack=N [X86] Print N words from the kernel stack 1099 kstack=N [X86] Print N words from the kernel stack
1099 in oops dumps. 1100 in oops dumps.
1100 1101
1102 kvm.ignore_msrs=[KVM] Ignore guest accesses to unhandled MSRs.
1103 Default is 0 (don't ignore, but inject #GP)
1104
1105 kvm.oos_shadow= [KVM] Disable out-of-sync shadow paging.
1106 Default is 1 (enabled)
1107
1108 kvm-amd.nested= [KVM,AMD] Allow nested virtualization in KVM/SVM.
1109 Default is 0 (off)
1110
1111 kvm-amd.npt= [KVM,AMD] Disable nested paging (virtualized MMU)
1112 for all guests.
1113 Default is 1 (enabled) if in 64bit or 32bit-PAE mode
1114
1115 kvm-intel.bypass_guest_pf=
1116 [KVM,Intel] Disables bypassing of guest page faults
1117 on Intel chips. Default is 1 (enabled)
1118
1119 kvm-intel.ept= [KVM,Intel] Disable extended page tables
1120 (virtualized MMU) support on capable Intel chips.
1121 Default is 1 (enabled)
1122
1123 kvm-intel.emulate_invalid_guest_state=
1124 [KVM,Intel] Enable emulation of invalid guest states
1125 Default is 0 (disabled)
1126
1127 kvm-intel.flexpriority=
1128 [KVM,Intel] Disable FlexPriority feature (TPR shadow).
1129 Default is 1 (enabled)
1130
1131 kvm-intel.unrestricted_guest=
1132 [KVM,Intel] Disable unrestricted guest feature
1133 (virtualized real and unpaged mode) on capable
1134 Intel chips. Default is 1 (enabled)
1135
1136 kvm-intel.vpid= [KVM,Intel] Disable Virtual Processor Identification
1137 feature (tagged TLBs) on capable Intel chips.
1138 Default is 1 (enabled)
1139
1101 l2cr= [PPC] 1140 l2cr= [PPC]
1102 1141
1103 l3cr= [PPC] 1142 l3cr= [PPC]
diff --git a/Documentation/kvm/api.txt b/Documentation/kvm/api.txt
new file mode 100644
index 000000000000..5a4bc8cf6d04
--- /dev/null
+++ b/Documentation/kvm/api.txt
@@ -0,0 +1,759 @@
1The Definitive KVM (Kernel-based Virtual Machine) API Documentation
2===================================================================
3
41. General description
5
6The kvm API is a set of ioctls that are issued to control various aspects
7of a virtual machine. The ioctls belong to three classes
8
9 - System ioctls: These query and set global attributes which affect the
10 whole kvm subsystem. In addition a system ioctl is used to create
11 virtual machines
12
13 - VM ioctls: These query and set attributes that affect an entire virtual
14 machine, for example memory layout. In addition a VM ioctl is used to
15 create virtual cpus (vcpus).
16
17 Only run VM ioctls from the same process (address space) that was used
18 to create the VM.
19
20 - vcpu ioctls: These query and set attributes that control the operation
21 of a single virtual cpu.
22
23 Only run vcpu ioctls from the same thread that was used to create the
24 vcpu.
25
262. File descriptors
27
28The kvm API is centered around file descriptors. An initial
29open("/dev/kvm") obtains a handle to the kvm subsystem; this handle
30can be used to issue system ioctls. A KVM_CREATE_VM ioctl on this
31handle will create a VM file descriptor which can be used to issue VM
32ioctls. A KVM_CREATE_VCPU ioctl on a VM fd will create a virtual cpu
33and return a file descriptor pointing to it. Finally, ioctls on a vcpu
34fd can be used to control the vcpu, including the important task of
35actually running guest code.
36
37In general file descriptors can be migrated among processes by means
38of fork() and the SCM_RIGHTS facility of unix domain socket. These
39kinds of tricks are explicitly not supported by kvm. While they will
40not cause harm to the host, their actual behavior is not guaranteed by
41the API. The only supported use is one virtual machine per process,
42and one vcpu per thread.
43
443. Extensions
45
46As of Linux 2.6.22, the KVM ABI has been stabilized: no backward
47incompatible changes are allowed. However, there is an extension
48facility that allows backward-compatible extensions to the API to be
49queried and used.
50
51The extension mechanism is not based on the Linux version number.
52Instead, kvm defines extension identifiers and a facility to query
53whether a particular extension identifier is available. If it is, a
54set of ioctls is available for application use.
55
564. API description
57
58This section describes ioctls that can be used to control kvm guests.
59For each ioctl, the following information is provided along with a
60description:
61
62 Capability: which KVM extension provides this ioctl. Can be 'basic',
63 which means that it will be provided by any kernel that supports
64 API version 12 (see section 4.1), or a KVM_CAP_xyz constant, which
65 means availability needs to be checked with KVM_CHECK_EXTENSION
66 (see section 4.4).
67
68 Architectures: which instruction set architectures provide this ioctl.
69 x86 includes both i386 and x86_64.
70
71 Type: system, vm, or vcpu.
72
73 Parameters: what parameters are accepted by the ioctl.
74
75 Returns: the return value. General error numbers (EBADF, ENOMEM, EINVAL)
76 are not detailed, but errors with specific meanings are.
77
784.1 KVM_GET_API_VERSION
79
80Capability: basic
81Architectures: all
82Type: system ioctl
83Parameters: none
84Returns: the constant KVM_API_VERSION (=12)
85
86This identifies the API version as the stable kvm API. It is not
87expected that this number will change. However, Linux 2.6.20 and
882.6.21 report earlier versions; these are not documented and not
89supported. Applications should refuse to run if KVM_GET_API_VERSION
90returns a value other than 12. If this check passes, all ioctls
91described as 'basic' will be available.
92
934.2 KVM_CREATE_VM
94
95Capability: basic
96Architectures: all
97Type: system ioctl
98Parameters: none
99Returns: a VM fd that can be used to control the new virtual machine.
100
101The new VM has no virtual cpus and no memory. An mmap() of a VM fd
102will access the virtual machine's physical address space; offset zero
103corresponds to guest physical address zero. Use of mmap() on a VM fd
104is discouraged if userspace memory allocation (KVM_CAP_USER_MEMORY) is
105available.
106
1074.3 KVM_GET_MSR_INDEX_LIST
108
109Capability: basic
110Architectures: x86
111Type: system
112Parameters: struct kvm_msr_list (in/out)
113Returns: 0 on success; -1 on error
114Errors:
115 E2BIG: the msr index list is too big to fit in the array specified by
116 the user.
117
118struct kvm_msr_list {
119 __u32 nmsrs; /* number of msrs in entries */
120 __u32 indices[0];
121};
122
123This ioctl returns the guest msrs that are supported. The list varies
124by kvm version and host processor, but does not change otherwise. The
125user fills in the size of the indices array in nmsrs, and in return
126kvm adjusts nmsrs to reflect the actual number of msrs and fills in
127the indices array with their numbers.
128
1294.4 KVM_CHECK_EXTENSION
130
131Capability: basic
132Architectures: all
133Type: system ioctl
134Parameters: extension identifier (KVM_CAP_*)
135Returns: 0 if unsupported; 1 (or some other positive integer) if supported
136
137The API allows the application to query about extensions to the core
138kvm API. Userspace passes an extension identifier (an integer) and
139receives an integer that describes the extension availability.
140Generally 0 means no and 1 means yes, but some extensions may report
141additional information in the integer return value.
142
1434.5 KVM_GET_VCPU_MMAP_SIZE
144
145Capability: basic
146Architectures: all
147Type: system ioctl
148Parameters: none
149Returns: size of vcpu mmap area, in bytes
150
151The KVM_RUN ioctl (cf.) communicates with userspace via a shared
152memory region. This ioctl returns the size of that region. See the
153KVM_RUN documentation for details.
154
1554.6 KVM_SET_MEMORY_REGION
156
157Capability: basic
158Architectures: all
159Type: vm ioctl
160Parameters: struct kvm_memory_region (in)
161Returns: 0 on success, -1 on error
162
163struct kvm_memory_region {
164 __u32 slot;
165 __u32 flags;
166 __u64 guest_phys_addr;
167 __u64 memory_size; /* bytes */
168};
169
170/* for kvm_memory_region::flags */
171#define KVM_MEM_LOG_DIRTY_PAGES 1UL
172
173This ioctl allows the user to create or modify a guest physical memory
174slot. When changing an existing slot, it may be moved in the guest
175physical memory space, or its flags may be modified. It may not be
176resized. Slots may not overlap.
177
178The flags field supports just one flag, KVM_MEM_LOG_DIRTY_PAGES, which
179instructs kvm to keep track of writes to memory within the slot. See
180the KVM_GET_DIRTY_LOG ioctl.
181
182It is recommended to use the KVM_SET_USER_MEMORY_REGION ioctl instead
183of this API, if available. This newer API allows placing guest memory
184at specified locations in the host address space, yielding better
185control and easy access.
186
1874.6 KVM_CREATE_VCPU
188
189Capability: basic
190Architectures: all
191Type: vm ioctl
192Parameters: vcpu id (apic id on x86)
193Returns: vcpu fd on success, -1 on error
194
195This API adds a vcpu to a virtual machine. The vcpu id is a small integer
196in the range [0, max_vcpus).
197
1984.7 KVM_GET_DIRTY_LOG (vm ioctl)
199
200Capability: basic
201Architectures: x86
202Type: vm ioctl
203Parameters: struct kvm_dirty_log (in/out)
204Returns: 0 on success, -1 on error
205
206/* for KVM_GET_DIRTY_LOG */
207struct kvm_dirty_log {
208 __u32 slot;
209 __u32 padding;
210 union {
211 void __user *dirty_bitmap; /* one bit per page */
212 __u64 padding;
213 };
214};
215
216Given a memory slot, return a bitmap containing any pages dirtied
217since the last call to this ioctl. Bit 0 is the first page in the
218memory slot. Ensure the entire structure is cleared to avoid padding
219issues.
220
2214.8 KVM_SET_MEMORY_ALIAS
222
223Capability: basic
224Architectures: x86
225Type: vm ioctl
226Parameters: struct kvm_memory_alias (in)
227Returns: 0 (success), -1 (error)
228
229struct kvm_memory_alias {
230 __u32 slot; /* this has a different namespace than memory slots */
231 __u32 flags;
232 __u64 guest_phys_addr;
233 __u64 memory_size;
234 __u64 target_phys_addr;
235};
236
237Defines a guest physical address space region as an alias to another
238region. Useful for aliased address, for example the VGA low memory
239window. Should not be used with userspace memory.
240
2414.9 KVM_RUN
242
243Capability: basic
244Architectures: all
245Type: vcpu ioctl
246Parameters: none
247Returns: 0 on success, -1 on error
248Errors:
249 EINTR: an unmasked signal is pending
250
251This ioctl is used to run a guest virtual cpu. While there are no
252explicit parameters, there is an implicit parameter block that can be
253obtained by mmap()ing the vcpu fd at offset 0, with the size given by
254KVM_GET_VCPU_MMAP_SIZE. The parameter block is formatted as a 'struct
255kvm_run' (see below).
256
2574.10 KVM_GET_REGS
258
259Capability: basic
260Architectures: all
261Type: vcpu ioctl
262Parameters: struct kvm_regs (out)
263Returns: 0 on success, -1 on error
264
265Reads the general purpose registers from the vcpu.
266
267/* x86 */
268struct kvm_regs {
269 /* out (KVM_GET_REGS) / in (KVM_SET_REGS) */
270 __u64 rax, rbx, rcx, rdx;
271 __u64 rsi, rdi, rsp, rbp;
272 __u64 r8, r9, r10, r11;
273 __u64 r12, r13, r14, r15;
274 __u64 rip, rflags;
275};
276
2774.11 KVM_SET_REGS
278
279Capability: basic
280Architectures: all
281Type: vcpu ioctl
282Parameters: struct kvm_regs (in)
283Returns: 0 on success, -1 on error
284
285Writes the general purpose registers into the vcpu.
286
287See KVM_GET_REGS for the data structure.
288
2894.12 KVM_GET_SREGS
290
291Capability: basic
292Architectures: x86
293Type: vcpu ioctl
294Parameters: struct kvm_sregs (out)
295Returns: 0 on success, -1 on error
296
297Reads special registers from the vcpu.
298
299/* x86 */
300struct kvm_sregs {
301 struct kvm_segment cs, ds, es, fs, gs, ss;
302 struct kvm_segment tr, ldt;
303 struct kvm_dtable gdt, idt;
304 __u64 cr0, cr2, cr3, cr4, cr8;
305 __u64 efer;
306 __u64 apic_base;
307 __u64 interrupt_bitmap[(KVM_NR_INTERRUPTS + 63) / 64];
308};
309
310interrupt_bitmap is a bitmap of pending external interrupts. At most
311one bit may be set. This interrupt has been acknowledged by the APIC
312but not yet injected into the cpu core.
313
3144.13 KVM_SET_SREGS
315
316Capability: basic
317Architectures: x86
318Type: vcpu ioctl
319Parameters: struct kvm_sregs (in)
320Returns: 0 on success, -1 on error
321
322Writes special registers into the vcpu. See KVM_GET_SREGS for the
323data structures.
324
3254.14 KVM_TRANSLATE
326
327Capability: basic
328Architectures: x86
329Type: vcpu ioctl
330Parameters: struct kvm_translation (in/out)
331Returns: 0 on success, -1 on error
332
333Translates a virtual address according to the vcpu's current address
334translation mode.
335
336struct kvm_translation {
337 /* in */
338 __u64 linear_address;
339
340 /* out */
341 __u64 physical_address;
342 __u8 valid;
343 __u8 writeable;
344 __u8 usermode;
345 __u8 pad[5];
346};
347
3484.15 KVM_INTERRUPT
349
350Capability: basic
351Architectures: x86
352Type: vcpu ioctl
353Parameters: struct kvm_interrupt (in)
354Returns: 0 on success, -1 on error
355
356Queues a hardware interrupt vector to be injected. This is only
357useful if in-kernel local APIC is not used.
358
359/* for KVM_INTERRUPT */
360struct kvm_interrupt {
361 /* in */
362 __u32 irq;
363};
364
365Note 'irq' is an interrupt vector, not an interrupt pin or line.
366
3674.16 KVM_DEBUG_GUEST
368
369Capability: basic
370Architectures: none
371Type: vcpu ioctl
372Parameters: none
373Returns: -1 on error
374
375Support for this has been removed. Use KVM_SET_GUEST_DEBUG instead.
376
3774.17 KVM_GET_MSRS
378
379Capability: basic
380Architectures: x86
381Type: vcpu ioctl
382Parameters: struct kvm_msrs (in/out)
383Returns: 0 on success, -1 on error
384
385Reads model-specific registers from the vcpu. Supported msr indices can
386be obtained using KVM_GET_MSR_INDEX_LIST.
387
388struct kvm_msrs {
389 __u32 nmsrs; /* number of msrs in entries */
390 __u32 pad;
391
392 struct kvm_msr_entry entries[0];
393};
394
395struct kvm_msr_entry {
396 __u32 index;
397 __u32 reserved;
398 __u64 data;
399};
400
401Application code should set the 'nmsrs' member (which indicates the
402size of the entries array) and the 'index' member of each array entry.
403kvm will fill in the 'data' member.
404
4054.18 KVM_SET_MSRS
406
407Capability: basic
408Architectures: x86
409Type: vcpu ioctl
410Parameters: struct kvm_msrs (in)
411Returns: 0 on success, -1 on error
412
413Writes model-specific registers to the vcpu. See KVM_GET_MSRS for the
414data structures.
415
416Application code should set the 'nmsrs' member (which indicates the
417size of the entries array), and the 'index' and 'data' members of each
418array entry.
419
4204.19 KVM_SET_CPUID
421
422Capability: basic
423Architectures: x86
424Type: vcpu ioctl
425Parameters: struct kvm_cpuid (in)
426Returns: 0 on success, -1 on error
427
428Defines the vcpu responses to the cpuid instruction. Applications
429should use the KVM_SET_CPUID2 ioctl if available.
430
431
432struct kvm_cpuid_entry {
433 __u32 function;
434 __u32 eax;
435 __u32 ebx;
436 __u32 ecx;
437 __u32 edx;
438 __u32 padding;
439};
440
441/* for KVM_SET_CPUID */
442struct kvm_cpuid {
443 __u32 nent;
444 __u32 padding;
445 struct kvm_cpuid_entry entries[0];
446};
447
4484.20 KVM_SET_SIGNAL_MASK
449
450Capability: basic
451Architectures: x86
452Type: vcpu ioctl
453Parameters: struct kvm_signal_mask (in)
454Returns: 0 on success, -1 on error
455
456Defines which signals are blocked during execution of KVM_RUN. This
457signal mask temporarily overrides the thread's signal mask. Any
458unblocked signal received (except SIGKILL and SIGSTOP, which retain
459their traditional behaviour) will cause KVM_RUN to return with -EINTR.
460
461Note the signal will only be delivered if not blocked by the original
462signal mask.
463
464/* for KVM_SET_SIGNAL_MASK */
465struct kvm_signal_mask {
466 __u32 len;
467 __u8 sigset[0];
468};
469
4704.21 KVM_GET_FPU
471
472Capability: basic
473Architectures: x86
474Type: vcpu ioctl
475Parameters: struct kvm_fpu (out)
476Returns: 0 on success, -1 on error
477
478Reads the floating point state from the vcpu.
479
480/* for KVM_GET_FPU and KVM_SET_FPU */
481struct kvm_fpu {
482 __u8 fpr[8][16];
483 __u16 fcw;
484 __u16 fsw;
485 __u8 ftwx; /* in fxsave format */
486 __u8 pad1;
487 __u16 last_opcode;
488 __u64 last_ip;
489 __u64 last_dp;
490 __u8 xmm[16][16];
491 __u32 mxcsr;
492 __u32 pad2;
493};
494
4954.22 KVM_SET_FPU
496
497Capability: basic
498Architectures: x86
499Type: vcpu ioctl
500Parameters: struct kvm_fpu (in)
501Returns: 0 on success, -1 on error
502
503Writes the floating point state to the vcpu.
504
505/* for KVM_GET_FPU and KVM_SET_FPU */
506struct kvm_fpu {
507 __u8 fpr[8][16];
508 __u16 fcw;
509 __u16 fsw;
510 __u8 ftwx; /* in fxsave format */
511 __u8 pad1;
512 __u16 last_opcode;
513 __u64 last_ip;
514 __u64 last_dp;
515 __u8 xmm[16][16];
516 __u32 mxcsr;
517 __u32 pad2;
518};
519
5204.23 KVM_CREATE_IRQCHIP
521
522Capability: KVM_CAP_IRQCHIP
523Architectures: x86, ia64
524Type: vm ioctl
525Parameters: none
526Returns: 0 on success, -1 on error
527
528Creates an interrupt controller model in the kernel. On x86, creates a virtual
529ioapic, a virtual PIC (two PICs, nested), and sets up future vcpus to have a
530local APIC. IRQ routing for GSIs 0-15 is set to both PIC and IOAPIC; GSI 16-23
531only go to the IOAPIC. On ia64, an IOSAPIC is created.
532
5334.24 KVM_IRQ_LINE
534
535Capability: KVM_CAP_IRQCHIP
536Architectures: x86, ia64
537Type: vm ioctl
538Parameters: struct kvm_irq_level
539Returns: 0 on success, -1 on error
540
541Sets the level of a GSI input to the interrupt controller model in the kernel.
542Requires that an interrupt controller model has been previously created with
543KVM_CREATE_IRQCHIP. Note that edge-triggered interrupts require the level
544to be set to 1 and then back to 0.
545
546struct kvm_irq_level {
547 union {
548 __u32 irq; /* GSI */
549 __s32 status; /* not used for KVM_IRQ_LEVEL */
550 };
551 __u32 level; /* 0 or 1 */
552};
553
5544.25 KVM_GET_IRQCHIP
555
556Capability: KVM_CAP_IRQCHIP
557Architectures: x86, ia64
558Type: vm ioctl
559Parameters: struct kvm_irqchip (in/out)
560Returns: 0 on success, -1 on error
561
562Reads the state of a kernel interrupt controller created with
563KVM_CREATE_IRQCHIP into a buffer provided by the caller.
564
565struct kvm_irqchip {
566 __u32 chip_id; /* 0 = PIC1, 1 = PIC2, 2 = IOAPIC */
567 __u32 pad;
568 union {
569 char dummy[512]; /* reserving space */
570 struct kvm_pic_state pic;
571 struct kvm_ioapic_state ioapic;
572 } chip;
573};
574
5754.26 KVM_SET_IRQCHIP
576
577Capability: KVM_CAP_IRQCHIP
578Architectures: x86, ia64
579Type: vm ioctl
580Parameters: struct kvm_irqchip (in)
581Returns: 0 on success, -1 on error
582
583Sets the state of a kernel interrupt controller created with
584KVM_CREATE_IRQCHIP from a buffer provided by the caller.
585
586struct kvm_irqchip {
587 __u32 chip_id; /* 0 = PIC1, 1 = PIC2, 2 = IOAPIC */
588 __u32 pad;
589 union {
590 char dummy[512]; /* reserving space */
591 struct kvm_pic_state pic;
592 struct kvm_ioapic_state ioapic;
593 } chip;
594};
595
5965. The kvm_run structure
597
598Application code obtains a pointer to the kvm_run structure by
599mmap()ing a vcpu fd. From that point, application code can control
600execution by changing fields in kvm_run prior to calling the KVM_RUN
601ioctl, and obtain information about the reason KVM_RUN returned by
602looking up structure members.
603
604struct kvm_run {
605 /* in */
606 __u8 request_interrupt_window;
607
608Request that KVM_RUN return when it becomes possible to inject external
609interrupts into the guest. Useful in conjunction with KVM_INTERRUPT.
610
611 __u8 padding1[7];
612
613 /* out */
614 __u32 exit_reason;
615
616When KVM_RUN has returned successfully (return value 0), this informs
617application code why KVM_RUN has returned. Allowable values for this
618field are detailed below.
619
620 __u8 ready_for_interrupt_injection;
621
622If request_interrupt_window has been specified, this field indicates
623an interrupt can be injected now with KVM_INTERRUPT.
624
625 __u8 if_flag;
626
627The value of the current interrupt flag. Only valid if in-kernel
628local APIC is not used.
629
630 __u8 padding2[2];
631
632 /* in (pre_kvm_run), out (post_kvm_run) */
633 __u64 cr8;
634
635The value of the cr8 register. Only valid if in-kernel local APIC is
636not used. Both input and output.
637
638 __u64 apic_base;
639
640The value of the APIC BASE msr. Only valid if in-kernel local
641APIC is not used. Both input and output.
642
643 union {
644 /* KVM_EXIT_UNKNOWN */
645 struct {
646 __u64 hardware_exit_reason;
647 } hw;
648
649If exit_reason is KVM_EXIT_UNKNOWN, the vcpu has exited due to unknown
650reasons. Further architecture-specific information is available in
651hardware_exit_reason.
652
653 /* KVM_EXIT_FAIL_ENTRY */
654 struct {
655 __u64 hardware_entry_failure_reason;
656 } fail_entry;
657
658If exit_reason is KVM_EXIT_FAIL_ENTRY, the vcpu could not be run due
659to unknown reasons. Further architecture-specific information is
660available in hardware_entry_failure_reason.
661
662 /* KVM_EXIT_EXCEPTION */
663 struct {
664 __u32 exception;
665 __u32 error_code;
666 } ex;
667
668Unused.
669
670 /* KVM_EXIT_IO */
671 struct {
672#define KVM_EXIT_IO_IN 0
673#define KVM_EXIT_IO_OUT 1
674 __u8 direction;
675 __u8 size; /* bytes */
676 __u16 port;
677 __u32 count;
678 __u64 data_offset; /* relative to kvm_run start */
679 } io;
680
681If exit_reason is KVM_EXIT_IO, then the vcpu has
682executed a port I/O instruction which could not be satisfied by kvm.
683data_offset describes where the data is located (KVM_EXIT_IO_OUT) or
684where kvm expects application code to place the data for the next
685KVM_RUN invocation (KVM_EXIT_IO_IN). Data format is a packed array.
686
687 struct {
688 struct kvm_debug_exit_arch arch;
689 } debug;
690
691Unused.
692
693 /* KVM_EXIT_MMIO */
694 struct {
695 __u64 phys_addr;
696 __u8 data[8];
697 __u32 len;
698 __u8 is_write;
699 } mmio;
700
701If exit_reason is KVM_EXIT_MMIO, then the vcpu has
702executed a memory-mapped I/O instruction which could not be satisfied
703by kvm. The 'data' member contains the written data if 'is_write' is
704true, and should be filled by application code otherwise.
705
706 /* KVM_EXIT_HYPERCALL */
707 struct {
708 __u64 nr;
709 __u64 args[6];
710 __u64 ret;
711 __u32 longmode;
712 __u32 pad;
713 } hypercall;
714
715Unused.
716
717 /* KVM_EXIT_TPR_ACCESS */
718 struct {
719 __u64 rip;
720 __u32 is_write;
721 __u32 pad;
722 } tpr_access;
723
724To be documented (KVM_TPR_ACCESS_REPORTING).
725
726 /* KVM_EXIT_S390_SIEIC */
727 struct {
728 __u8 icptcode;
729 __u64 mask; /* psw upper half */
730 __u64 addr; /* psw lower half */
731 __u16 ipa;
732 __u32 ipb;
733 } s390_sieic;
734
735s390 specific.
736
737 /* KVM_EXIT_S390_RESET */
738#define KVM_S390_RESET_POR 1
739#define KVM_S390_RESET_CLEAR 2
740#define KVM_S390_RESET_SUBSYSTEM 4
741#define KVM_S390_RESET_CPU_INIT 8
742#define KVM_S390_RESET_IPL 16
743 __u64 s390_reset_flags;
744
745s390 specific.
746
747 /* KVM_EXIT_DCR */
748 struct {
749 __u32 dcrn;
750 __u32 data;
751 __u8 is_write;
752 } dcr;
753
754powerpc specific.
755
756 /* Fix the size of the union. */
757 char padding[256];
758 };
759};
diff --git a/MAINTAINERS b/MAINTAINERS
index e95cb772f931..15169365c339 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2926,6 +2926,7 @@ F: include/linux/sunrpc/
2926 2926
2927KERNEL VIRTUAL MACHINE (KVM) 2927KERNEL VIRTUAL MACHINE (KVM)
2928M: Avi Kivity <avi@redhat.com> 2928M: Avi Kivity <avi@redhat.com>
2929M: Marcelo Tosatti <mtosatti@redhat.com>
2929L: kvm@vger.kernel.org 2930L: kvm@vger.kernel.org
2930W: http://kvm.qumranet.com 2931W: http://kvm.qumranet.com
2931S: Supported 2932S: Supported
diff --git a/arch/ia64/include/asm/kvm_host.h b/arch/ia64/include/asm/kvm_host.h
index 5f43697aed30..d9b6325a9328 100644
--- a/arch/ia64/include/asm/kvm_host.h
+++ b/arch/ia64/include/asm/kvm_host.h
@@ -235,7 +235,8 @@ struct kvm_vm_data {
235#define KVM_REQ_PTC_G 32 235#define KVM_REQ_PTC_G 32
236#define KVM_REQ_RESUME 33 236#define KVM_REQ_RESUME 33
237 237
238#define KVM_PAGES_PER_HPAGE 1 238#define KVM_NR_PAGE_SIZES 1
239#define KVM_PAGES_PER_HPAGE(x) 1
239 240
240struct kvm; 241struct kvm;
241struct kvm_vcpu; 242struct kvm_vcpu;
@@ -465,7 +466,6 @@ struct kvm_arch {
465 unsigned long metaphysical_rr4; 466 unsigned long metaphysical_rr4;
466 unsigned long vmm_init_rr; 467 unsigned long vmm_init_rr;
467 468
468 int online_vcpus;
469 int is_sn2; 469 int is_sn2;
470 470
471 struct kvm_ioapic *vioapic; 471 struct kvm_ioapic *vioapic;
diff --git a/arch/ia64/include/asm/kvm_para.h b/arch/ia64/include/asm/kvm_para.h
index 0d6d8ca07b8c..1588aee781a2 100644
--- a/arch/ia64/include/asm/kvm_para.h
+++ b/arch/ia64/include/asm/kvm_para.h
@@ -19,9 +19,13 @@
19 * 19 *
20 */ 20 */
21 21
22#ifdef __KERNEL__
23
22static inline unsigned int kvm_arch_para_features(void) 24static inline unsigned int kvm_arch_para_features(void)
23{ 25{
24 return 0; 26 return 0;
25} 27}
26 28
27#endif 29#endif
30
31#endif
diff --git a/arch/ia64/kvm/Kconfig b/arch/ia64/kvm/Kconfig
index 64d520937874..ef3e7be29caf 100644
--- a/arch/ia64/kvm/Kconfig
+++ b/arch/ia64/kvm/Kconfig
@@ -1,12 +1,8 @@
1# 1#
2# KVM configuration 2# KVM configuration
3# 3#
4config HAVE_KVM
5 bool
6 4
7config HAVE_KVM_IRQCHIP 5source "virt/kvm/Kconfig"
8 bool
9 default y
10 6
11menuconfig VIRTUALIZATION 7menuconfig VIRTUALIZATION
12 bool "Virtualization" 8 bool "Virtualization"
@@ -28,6 +24,8 @@ config KVM
28 depends on PCI 24 depends on PCI
29 select PREEMPT_NOTIFIERS 25 select PREEMPT_NOTIFIERS
30 select ANON_INODES 26 select ANON_INODES
27 select HAVE_KVM_IRQCHIP
28 select KVM_APIC_ARCHITECTURE
31 ---help--- 29 ---help---
32 Support hosting fully virtualized guest machines using hardware 30 Support hosting fully virtualized guest machines using hardware
33 virtualization extensions. You will need a fairly recent 31 virtualization extensions. You will need a fairly recent
@@ -49,9 +47,6 @@ config KVM_INTEL
49 Provides support for KVM on Itanium 2 processors equipped with the VT 47 Provides support for KVM on Itanium 2 processors equipped with the VT
50 extensions. 48 extensions.
51 49
52config KVM_TRACE
53 bool
54
55source drivers/virtio/Kconfig 50source drivers/virtio/Kconfig
56 51
57endif # VIRTUALIZATION 52endif # VIRTUALIZATION
diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index 80c57b0a21c4..0ad09f05efa9 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -210,16 +210,6 @@ int kvm_dev_ioctl_check_extension(long ext)
210 210
211} 211}
212 212
213static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu,
214 gpa_t addr, int len, int is_write)
215{
216 struct kvm_io_device *dev;
217
218 dev = kvm_io_bus_find_dev(&vcpu->kvm->mmio_bus, addr, len, is_write);
219
220 return dev;
221}
222
223static int handle_vm_error(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 213static int handle_vm_error(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
224{ 214{
225 kvm_run->exit_reason = KVM_EXIT_UNKNOWN; 215 kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
@@ -231,6 +221,7 @@ static int handle_mmio(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
231{ 221{
232 struct kvm_mmio_req *p; 222 struct kvm_mmio_req *p;
233 struct kvm_io_device *mmio_dev; 223 struct kvm_io_device *mmio_dev;
224 int r;
234 225
235 p = kvm_get_vcpu_ioreq(vcpu); 226 p = kvm_get_vcpu_ioreq(vcpu);
236 227
@@ -247,16 +238,13 @@ static int handle_mmio(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
247 kvm_run->exit_reason = KVM_EXIT_MMIO; 238 kvm_run->exit_reason = KVM_EXIT_MMIO;
248 return 0; 239 return 0;
249mmio: 240mmio:
250 mmio_dev = vcpu_find_mmio_dev(vcpu, p->addr, p->size, !p->dir); 241 if (p->dir)
251 if (mmio_dev) { 242 r = kvm_io_bus_read(&vcpu->kvm->mmio_bus, p->addr,
252 if (!p->dir) 243 p->size, &p->data);
253 kvm_iodevice_write(mmio_dev, p->addr, p->size, 244 else
254 &p->data); 245 r = kvm_io_bus_write(&vcpu->kvm->mmio_bus, p->addr,
255 else 246 p->size, &p->data);
256 kvm_iodevice_read(mmio_dev, p->addr, p->size, 247 if (r)
257 &p->data);
258
259 } else
260 printk(KERN_ERR"kvm: No iodevice found! addr:%lx\n", p->addr); 248 printk(KERN_ERR"kvm: No iodevice found! addr:%lx\n", p->addr);
261 p->state = STATE_IORESP_READY; 249 p->state = STATE_IORESP_READY;
262 250
@@ -337,13 +325,12 @@ static struct kvm_vcpu *lid_to_vcpu(struct kvm *kvm, unsigned long id,
337{ 325{
338 union ia64_lid lid; 326 union ia64_lid lid;
339 int i; 327 int i;
328 struct kvm_vcpu *vcpu;
340 329
341 for (i = 0; i < kvm->arch.online_vcpus; i++) { 330 kvm_for_each_vcpu(i, vcpu, kvm) {
342 if (kvm->vcpus[i]) { 331 lid.val = VCPU_LID(vcpu);
343 lid.val = VCPU_LID(kvm->vcpus[i]); 332 if (lid.id == id && lid.eid == eid)
344 if (lid.id == id && lid.eid == eid) 333 return vcpu;
345 return kvm->vcpus[i];
346 }
347 } 334 }
348 335
349 return NULL; 336 return NULL;
@@ -409,21 +396,21 @@ static int handle_global_purge(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
409 struct kvm *kvm = vcpu->kvm; 396 struct kvm *kvm = vcpu->kvm;
410 struct call_data call_data; 397 struct call_data call_data;
411 int i; 398 int i;
399 struct kvm_vcpu *vcpui;
412 400
413 call_data.ptc_g_data = p->u.ptc_g_data; 401 call_data.ptc_g_data = p->u.ptc_g_data;
414 402
415 for (i = 0; i < kvm->arch.online_vcpus; i++) { 403 kvm_for_each_vcpu(i, vcpui, kvm) {
416 if (!kvm->vcpus[i] || kvm->vcpus[i]->arch.mp_state == 404 if (vcpui->arch.mp_state == KVM_MP_STATE_UNINITIALIZED ||
417 KVM_MP_STATE_UNINITIALIZED || 405 vcpu == vcpui)
418 vcpu == kvm->vcpus[i])
419 continue; 406 continue;
420 407
421 if (waitqueue_active(&kvm->vcpus[i]->wq)) 408 if (waitqueue_active(&vcpui->wq))
422 wake_up_interruptible(&kvm->vcpus[i]->wq); 409 wake_up_interruptible(&vcpui->wq);
423 410
424 if (kvm->vcpus[i]->cpu != -1) { 411 if (vcpui->cpu != -1) {
425 call_data.vcpu = kvm->vcpus[i]; 412 call_data.vcpu = vcpui;
426 smp_call_function_single(kvm->vcpus[i]->cpu, 413 smp_call_function_single(vcpui->cpu,
427 vcpu_global_purge, &call_data, 1); 414 vcpu_global_purge, &call_data, 1);
428 } else 415 } else
429 printk(KERN_WARNING"kvm: Uninit vcpu received ipi!\n"); 416 printk(KERN_WARNING"kvm: Uninit vcpu received ipi!\n");
@@ -852,8 +839,6 @@ struct kvm *kvm_arch_create_vm(void)
852 839
853 kvm_init_vm(kvm); 840 kvm_init_vm(kvm);
854 841
855 kvm->arch.online_vcpus = 0;
856
857 return kvm; 842 return kvm;
858 843
859} 844}
@@ -1000,10 +985,10 @@ long kvm_arch_vm_ioctl(struct file *filp,
1000 goto out; 985 goto out;
1001 if (irqchip_in_kernel(kvm)) { 986 if (irqchip_in_kernel(kvm)) {
1002 __s32 status; 987 __s32 status;
1003 mutex_lock(&kvm->lock); 988 mutex_lock(&kvm->irq_lock);
1004 status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, 989 status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
1005 irq_event.irq, irq_event.level); 990 irq_event.irq, irq_event.level);
1006 mutex_unlock(&kvm->lock); 991 mutex_unlock(&kvm->irq_lock);
1007 if (ioctl == KVM_IRQ_LINE_STATUS) { 992 if (ioctl == KVM_IRQ_LINE_STATUS) {
1008 irq_event.status = status; 993 irq_event.status = status;
1009 if (copy_to_user(argp, &irq_event, 994 if (copy_to_user(argp, &irq_event,
@@ -1216,7 +1201,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
1216 if (IS_ERR(vmm_vcpu)) 1201 if (IS_ERR(vmm_vcpu))
1217 return PTR_ERR(vmm_vcpu); 1202 return PTR_ERR(vmm_vcpu);
1218 1203
1219 if (vcpu->vcpu_id == 0) { 1204 if (kvm_vcpu_is_bsp(vcpu)) {
1220 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 1205 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
1221 1206
1222 /*Set entry address for first run.*/ 1207 /*Set entry address for first run.*/
@@ -1224,7 +1209,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
1224 1209
1225 /*Initialize itc offset for vcpus*/ 1210 /*Initialize itc offset for vcpus*/
1226 itc_offset = 0UL - kvm_get_itc(vcpu); 1211 itc_offset = 0UL - kvm_get_itc(vcpu);
1227 for (i = 0; i < kvm->arch.online_vcpus; i++) { 1212 for (i = 0; i < KVM_MAX_VCPUS; i++) {
1228 v = (struct kvm_vcpu *)((char *)vcpu + 1213 v = (struct kvm_vcpu *)((char *)vcpu +
1229 sizeof(struct kvm_vcpu_data) * i); 1214 sizeof(struct kvm_vcpu_data) * i);
1230 v->arch.itc_offset = itc_offset; 1215 v->arch.itc_offset = itc_offset;
@@ -1356,8 +1341,6 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
1356 goto fail; 1341 goto fail;
1357 } 1342 }
1358 1343
1359 kvm->arch.online_vcpus++;
1360
1361 return vcpu; 1344 return vcpu;
1362fail: 1345fail:
1363 return ERR_PTR(r); 1346 return ERR_PTR(r);
@@ -1952,19 +1935,6 @@ int kvm_highest_pending_irq(struct kvm_vcpu *vcpu)
1952 return find_highest_bits((int *)&vpd->irr[0]); 1935 return find_highest_bits((int *)&vpd->irr[0]);
1953} 1936}
1954 1937
1955int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu)
1956{
1957 if (kvm_highest_pending_irq(vcpu) != -1)
1958 return 1;
1959 return 0;
1960}
1961
1962int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu)
1963{
1964 /* do real check here */
1965 return 1;
1966}
1967
1968int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu) 1938int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
1969{ 1939{
1970 return vcpu->arch.timer_fired; 1940 return vcpu->arch.timer_fired;
@@ -1977,7 +1947,8 @@ gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
1977 1947
1978int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) 1948int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
1979{ 1949{
1980 return vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE; 1950 return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE) ||
1951 (kvm_highest_pending_irq(vcpu) != -1);
1981} 1952}
1982 1953
1983int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, 1954int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
diff --git a/arch/ia64/kvm/vcpu.c b/arch/ia64/kvm/vcpu.c
index cc406d064a09..dce75b70cdd5 100644
--- a/arch/ia64/kvm/vcpu.c
+++ b/arch/ia64/kvm/vcpu.c
@@ -830,8 +830,8 @@ static void vcpu_set_itc(struct kvm_vcpu *vcpu, u64 val)
830 830
831 kvm = (struct kvm *)KVM_VM_BASE; 831 kvm = (struct kvm *)KVM_VM_BASE;
832 832
833 if (vcpu->vcpu_id == 0) { 833 if (kvm_vcpu_is_bsp(vcpu)) {
834 for (i = 0; i < kvm->arch.online_vcpus; i++) { 834 for (i = 0; i < atomic_read(&kvm->online_vcpus); i++) {
835 v = (struct kvm_vcpu *)((char *)vcpu + 835 v = (struct kvm_vcpu *)((char *)vcpu +
836 sizeof(struct kvm_vcpu_data) * i); 836 sizeof(struct kvm_vcpu_data) * i);
837 VMX(v, itc_offset) = itc_offset; 837 VMX(v, itc_offset) = itc_offset;
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index fddc3ed715fa..c9c930ed11d7 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -34,7 +34,8 @@
34#define KVM_COALESCED_MMIO_PAGE_OFFSET 1 34#define KVM_COALESCED_MMIO_PAGE_OFFSET 1
35 35
36/* We don't currently support large pages. */ 36/* We don't currently support large pages. */
37#define KVM_PAGES_PER_HPAGE (1UL << 31) 37#define KVM_NR_PAGE_SIZES 1
38#define KVM_PAGES_PER_HPAGE(x) (1UL<<31)
38 39
39struct kvm; 40struct kvm;
40struct kvm_run; 41struct kvm_run;
@@ -153,7 +154,6 @@ struct kvm_vcpu_arch {
153 u32 pid; 154 u32 pid;
154 u32 swap_pid; 155 u32 swap_pid;
155 156
156 u32 pvr;
157 u32 ccr0; 157 u32 ccr0;
158 u32 ccr1; 158 u32 ccr1;
159 u32 dbcr0; 159 u32 dbcr0;
diff --git a/arch/powerpc/kvm/44x.c b/arch/powerpc/kvm/44x.c
index 0cef809cec21..f4d1b55aa70b 100644
--- a/arch/powerpc/kvm/44x.c
+++ b/arch/powerpc/kvm/44x.c
@@ -138,7 +138,7 @@ void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu)
138 kmem_cache_free(kvm_vcpu_cache, vcpu_44x); 138 kmem_cache_free(kvm_vcpu_cache, vcpu_44x);
139} 139}
140 140
141static int kvmppc_44x_init(void) 141static int __init kvmppc_44x_init(void)
142{ 142{
143 int r; 143 int r;
144 144
@@ -149,7 +149,7 @@ static int kvmppc_44x_init(void)
149 return kvm_init(NULL, sizeof(struct kvmppc_vcpu_44x), THIS_MODULE); 149 return kvm_init(NULL, sizeof(struct kvmppc_vcpu_44x), THIS_MODULE);
150} 150}
151 151
152static void kvmppc_44x_exit(void) 152static void __exit kvmppc_44x_exit(void)
153{ 153{
154 kvmppc_booke_exit(); 154 kvmppc_booke_exit();
155} 155}
diff --git a/arch/powerpc/kvm/44x_tlb.c b/arch/powerpc/kvm/44x_tlb.c
index 4a16f472cc18..ff3cb63b8117 100644
--- a/arch/powerpc/kvm/44x_tlb.c
+++ b/arch/powerpc/kvm/44x_tlb.c
@@ -30,6 +30,7 @@
30#include "timing.h" 30#include "timing.h"
31 31
32#include "44x_tlb.h" 32#include "44x_tlb.h"
33#include "trace.h"
33 34
34#ifndef PPC44x_TLBE_SIZE 35#ifndef PPC44x_TLBE_SIZE
35#define PPC44x_TLBE_SIZE PPC44x_TLB_4K 36#define PPC44x_TLBE_SIZE PPC44x_TLB_4K
@@ -263,7 +264,7 @@ static void kvmppc_44x_shadow_release(struct kvmppc_vcpu_44x *vcpu_44x,
263 264
264 /* XXX set tlb_44x_index to stlb_index? */ 265 /* XXX set tlb_44x_index to stlb_index? */
265 266
266 KVMTRACE_1D(STLB_INVAL, &vcpu_44x->vcpu, stlb_index, handler); 267 trace_kvm_stlb_inval(stlb_index);
267} 268}
268 269
269void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu) 270void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu)
@@ -365,8 +366,8 @@ void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 gvaddr, gpa_t gpaddr,
365 /* Insert shadow mapping into hardware TLB. */ 366 /* Insert shadow mapping into hardware TLB. */
366 kvmppc_44x_tlbe_set_modified(vcpu_44x, victim); 367 kvmppc_44x_tlbe_set_modified(vcpu_44x, victim);
367 kvmppc_44x_tlbwe(victim, &stlbe); 368 kvmppc_44x_tlbwe(victim, &stlbe);
368 KVMTRACE_5D(STLB_WRITE, vcpu, victim, stlbe.tid, stlbe.word0, stlbe.word1, 369 trace_kvm_stlb_write(victim, stlbe.tid, stlbe.word0, stlbe.word1,
369 stlbe.word2, handler); 370 stlbe.word2);
370} 371}
371 372
372/* For a particular guest TLB entry, invalidate the corresponding host TLB 373/* For a particular guest TLB entry, invalidate the corresponding host TLB
@@ -485,8 +486,8 @@ int kvmppc_44x_emul_tlbwe(struct kvm_vcpu *vcpu, u8 ra, u8 rs, u8 ws)
485 kvmppc_mmu_map(vcpu, eaddr, gpaddr, gtlb_index); 486 kvmppc_mmu_map(vcpu, eaddr, gpaddr, gtlb_index);
486 } 487 }
487 488
488 KVMTRACE_5D(GTLB_WRITE, vcpu, gtlb_index, tlbe->tid, tlbe->word0, 489 trace_kvm_gtlb_write(gtlb_index, tlbe->tid, tlbe->word0, tlbe->word1,
489 tlbe->word1, tlbe->word2, handler); 490 tlbe->word2);
490 491
491 kvmppc_set_exit_type(vcpu, EMULATED_TLBWE_EXITS); 492 kvmppc_set_exit_type(vcpu, EMULATED_TLBWE_EXITS);
492 return EMULATE_DONE; 493 return EMULATE_DONE;
diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig
index 5a152a52796f..c29926846613 100644
--- a/arch/powerpc/kvm/Kconfig
+++ b/arch/powerpc/kvm/Kconfig
@@ -2,8 +2,7 @@
2# KVM configuration 2# KVM configuration
3# 3#
4 4
5config HAVE_KVM_IRQCHIP 5source "virt/kvm/Kconfig"
6 bool
7 6
8menuconfig VIRTUALIZATION 7menuconfig VIRTUALIZATION
9 bool "Virtualization" 8 bool "Virtualization"
@@ -59,17 +58,6 @@ config KVM_E500
59 58
60 If unsure, say N. 59 If unsure, say N.
61 60
62config KVM_TRACE
63 bool "KVM trace support"
64 depends on KVM && MARKERS && SYSFS
65 select RELAY
66 select DEBUG_FS
67 default n
68 ---help---
69 This option allows reading a trace of kvm-related events through
70 relayfs. Note the ABI is not considered stable and will be
71 modified in future updates.
72
73source drivers/virtio/Kconfig 61source drivers/virtio/Kconfig
74 62
75endif # VIRTUALIZATION 63endif # VIRTUALIZATION
diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile
index 459c7ee580f7..37655fe19f2f 100644
--- a/arch/powerpc/kvm/Makefile
+++ b/arch/powerpc/kvm/Makefile
@@ -8,7 +8,9 @@ EXTRA_CFLAGS += -Ivirt/kvm -Iarch/powerpc/kvm
8 8
9common-objs-y = $(addprefix ../../../virt/kvm/, kvm_main.o coalesced_mmio.o) 9common-objs-y = $(addprefix ../../../virt/kvm/, kvm_main.o coalesced_mmio.o)
10 10
11common-objs-$(CONFIG_KVM_TRACE) += $(addprefix ../../../virt/kvm/, kvm_trace.o) 11CFLAGS_44x_tlb.o := -I.
12CFLAGS_e500_tlb.o := -I.
13CFLAGS_emulate.o := -I.
12 14
13kvm-objs := $(common-objs-y) powerpc.o emulate.o 15kvm-objs := $(common-objs-y) powerpc.o emulate.o
14obj-$(CONFIG_KVM_EXIT_TIMING) += timing.o 16obj-$(CONFIG_KVM_EXIT_TIMING) += timing.o
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index 642e4204cf25..e7bf4d029484 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -520,7 +520,7 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
520 return kvmppc_core_vcpu_translate(vcpu, tr); 520 return kvmppc_core_vcpu_translate(vcpu, tr);
521} 521}
522 522
523int kvmppc_booke_init(void) 523int __init kvmppc_booke_init(void)
524{ 524{
525 unsigned long ivor[16]; 525 unsigned long ivor[16];
526 unsigned long max_ivor = 0; 526 unsigned long max_ivor = 0;
diff --git a/arch/powerpc/kvm/e500.c b/arch/powerpc/kvm/e500.c
index d8067fd81cdd..64949eef43f1 100644
--- a/arch/powerpc/kvm/e500.c
+++ b/arch/powerpc/kvm/e500.c
@@ -60,9 +60,6 @@ int kvmppc_core_vcpu_setup(struct kvm_vcpu *vcpu)
60 60
61 kvmppc_e500_tlb_setup(vcpu_e500); 61 kvmppc_e500_tlb_setup(vcpu_e500);
62 62
63 /* Use the same core vertion as host's */
64 vcpu->arch.pvr = mfspr(SPRN_PVR);
65
66 return 0; 63 return 0;
67} 64}
68 65
@@ -132,7 +129,7 @@ void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu)
132 kmem_cache_free(kvm_vcpu_cache, vcpu_e500); 129 kmem_cache_free(kvm_vcpu_cache, vcpu_e500);
133} 130}
134 131
135static int kvmppc_e500_init(void) 132static int __init kvmppc_e500_init(void)
136{ 133{
137 int r, i; 134 int r, i;
138 unsigned long ivor[3]; 135 unsigned long ivor[3];
@@ -160,7 +157,7 @@ static int kvmppc_e500_init(void)
160 return kvm_init(NULL, sizeof(struct kvmppc_vcpu_e500), THIS_MODULE); 157 return kvm_init(NULL, sizeof(struct kvmppc_vcpu_e500), THIS_MODULE);
161} 158}
162 159
163static void kvmppc_e500_exit(void) 160static void __init kvmppc_e500_exit(void)
164{ 161{
165 kvmppc_booke_exit(); 162 kvmppc_booke_exit();
166} 163}
diff --git a/arch/powerpc/kvm/e500_emulate.c b/arch/powerpc/kvm/e500_emulate.c
index 3f760414b9f8..be95b8d8e3b7 100644
--- a/arch/powerpc/kvm/e500_emulate.c
+++ b/arch/powerpc/kvm/e500_emulate.c
@@ -180,6 +180,9 @@ int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt)
180 case SPRN_MMUCSR0: 180 case SPRN_MMUCSR0:
181 vcpu->arch.gpr[rt] = 0; break; 181 vcpu->arch.gpr[rt] = 0; break;
182 182
183 case SPRN_MMUCFG:
184 vcpu->arch.gpr[rt] = mfspr(SPRN_MMUCFG); break;
185
183 /* extra exceptions */ 186 /* extra exceptions */
184 case SPRN_IVOR32: 187 case SPRN_IVOR32:
185 vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_UNAVAIL]; 188 vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_UNAVAIL];
diff --git a/arch/powerpc/kvm/e500_tlb.c b/arch/powerpc/kvm/e500_tlb.c
index 0e773fc2d5e4..fb1e1dc11ba5 100644
--- a/arch/powerpc/kvm/e500_tlb.c
+++ b/arch/powerpc/kvm/e500_tlb.c
@@ -22,6 +22,7 @@
22 22
23#include "../mm/mmu_decl.h" 23#include "../mm/mmu_decl.h"
24#include "e500_tlb.h" 24#include "e500_tlb.h"
25#include "trace.h"
25 26
26#define to_htlb1_esel(esel) (tlb1_entry_num - (esel) - 1) 27#define to_htlb1_esel(esel) (tlb1_entry_num - (esel) - 1)
27 28
@@ -224,9 +225,8 @@ static void kvmppc_e500_stlbe_invalidate(struct kvmppc_vcpu_e500 *vcpu_e500,
224 225
225 kvmppc_e500_shadow_release(vcpu_e500, tlbsel, esel); 226 kvmppc_e500_shadow_release(vcpu_e500, tlbsel, esel);
226 stlbe->mas1 = 0; 227 stlbe->mas1 = 0;
227 KVMTRACE_5D(STLB_INVAL, &vcpu_e500->vcpu, index_of(tlbsel, esel), 228 trace_kvm_stlb_inval(index_of(tlbsel, esel), stlbe->mas1, stlbe->mas2,
228 stlbe->mas1, stlbe->mas2, stlbe->mas3, stlbe->mas7, 229 stlbe->mas3, stlbe->mas7);
229 handler);
230} 230}
231 231
232static void kvmppc_e500_tlb1_invalidate(struct kvmppc_vcpu_e500 *vcpu_e500, 232static void kvmppc_e500_tlb1_invalidate(struct kvmppc_vcpu_e500 *vcpu_e500,
@@ -269,7 +269,7 @@ static inline void kvmppc_e500_deliver_tlb_miss(struct kvm_vcpu *vcpu,
269 tlbsel = (vcpu_e500->mas4 >> 28) & 0x1; 269 tlbsel = (vcpu_e500->mas4 >> 28) & 0x1;
270 victim = (tlbsel == 0) ? tlb0_get_next_victim(vcpu_e500) : 0; 270 victim = (tlbsel == 0) ? tlb0_get_next_victim(vcpu_e500) : 0;
271 pidsel = (vcpu_e500->mas4 >> 16) & 0xf; 271 pidsel = (vcpu_e500->mas4 >> 16) & 0xf;
272 tsized = (vcpu_e500->mas4 >> 8) & 0xf; 272 tsized = (vcpu_e500->mas4 >> 7) & 0x1f;
273 273
274 vcpu_e500->mas0 = MAS0_TLBSEL(tlbsel) | MAS0_ESEL(victim) 274 vcpu_e500->mas0 = MAS0_TLBSEL(tlbsel) | MAS0_ESEL(victim)
275 | MAS0_NV(vcpu_e500->guest_tlb_nv[tlbsel]); 275 | MAS0_NV(vcpu_e500->guest_tlb_nv[tlbsel]);
@@ -309,7 +309,7 @@ static inline void kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
309 vcpu_e500->shadow_pages[tlbsel][esel] = new_page; 309 vcpu_e500->shadow_pages[tlbsel][esel] = new_page;
310 310
311 /* Force TS=1 IPROT=0 TSIZE=4KB for all guest mappings. */ 311 /* Force TS=1 IPROT=0 TSIZE=4KB for all guest mappings. */
312 stlbe->mas1 = MAS1_TSIZE(BOOKE_PAGESZ_4K) 312 stlbe->mas1 = MAS1_TSIZE(BOOK3E_PAGESZ_4K)
313 | MAS1_TID(get_tlb_tid(gtlbe)) | MAS1_TS | MAS1_VALID; 313 | MAS1_TID(get_tlb_tid(gtlbe)) | MAS1_TS | MAS1_VALID;
314 stlbe->mas2 = (gvaddr & MAS2_EPN) 314 stlbe->mas2 = (gvaddr & MAS2_EPN)
315 | e500_shadow_mas2_attrib(gtlbe->mas2, 315 | e500_shadow_mas2_attrib(gtlbe->mas2,
@@ -319,9 +319,8 @@ static inline void kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
319 vcpu_e500->vcpu.arch.msr & MSR_PR); 319 vcpu_e500->vcpu.arch.msr & MSR_PR);
320 stlbe->mas7 = (hpaddr >> 32) & MAS7_RPN; 320 stlbe->mas7 = (hpaddr >> 32) & MAS7_RPN;
321 321
322 KVMTRACE_5D(STLB_WRITE, &vcpu_e500->vcpu, index_of(tlbsel, esel), 322 trace_kvm_stlb_write(index_of(tlbsel, esel), stlbe->mas1, stlbe->mas2,
323 stlbe->mas1, stlbe->mas2, stlbe->mas3, stlbe->mas7, 323 stlbe->mas3, stlbe->mas7);
324 handler);
325} 324}
326 325
327/* XXX only map the one-one case, for now use TLB0 */ 326/* XXX only map the one-one case, for now use TLB0 */
@@ -535,9 +534,8 @@ int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *vcpu)
535 gtlbe->mas3 = vcpu_e500->mas3; 534 gtlbe->mas3 = vcpu_e500->mas3;
536 gtlbe->mas7 = vcpu_e500->mas7; 535 gtlbe->mas7 = vcpu_e500->mas7;
537 536
538 KVMTRACE_5D(GTLB_WRITE, vcpu, vcpu_e500->mas0, 537 trace_kvm_gtlb_write(vcpu_e500->mas0, gtlbe->mas1, gtlbe->mas2,
539 gtlbe->mas1, gtlbe->mas2, gtlbe->mas3, gtlbe->mas7, 538 gtlbe->mas3, gtlbe->mas7);
540 handler);
541 539
542 /* Invalidate shadow mappings for the about-to-be-clobbered TLBE. */ 540 /* Invalidate shadow mappings for the about-to-be-clobbered TLBE. */
543 if (tlbe_is_host_safe(vcpu, gtlbe)) { 541 if (tlbe_is_host_safe(vcpu, gtlbe)) {
@@ -545,7 +543,7 @@ int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *vcpu)
545 case 0: 543 case 0:
546 /* TLB0 */ 544 /* TLB0 */
547 gtlbe->mas1 &= ~MAS1_TSIZE(~0); 545 gtlbe->mas1 &= ~MAS1_TSIZE(~0);
548 gtlbe->mas1 |= MAS1_TSIZE(BOOKE_PAGESZ_4K); 546 gtlbe->mas1 |= MAS1_TSIZE(BOOK3E_PAGESZ_4K);
549 547
550 stlbsel = 0; 548 stlbsel = 0;
551 sesel = kvmppc_e500_stlbe_map(vcpu_e500, 0, esel); 549 sesel = kvmppc_e500_stlbe_map(vcpu_e500, 0, esel);
@@ -679,14 +677,14 @@ void kvmppc_e500_tlb_setup(struct kvmppc_vcpu_e500 *vcpu_e500)
679 677
680 /* Insert large initial mapping for guest. */ 678 /* Insert large initial mapping for guest. */
681 tlbe = &vcpu_e500->guest_tlb[1][0]; 679 tlbe = &vcpu_e500->guest_tlb[1][0];
682 tlbe->mas1 = MAS1_VALID | MAS1_TSIZE(BOOKE_PAGESZ_256M); 680 tlbe->mas1 = MAS1_VALID | MAS1_TSIZE(BOOK3E_PAGESZ_256M);
683 tlbe->mas2 = 0; 681 tlbe->mas2 = 0;
684 tlbe->mas3 = E500_TLB_SUPER_PERM_MASK; 682 tlbe->mas3 = E500_TLB_SUPER_PERM_MASK;
685 tlbe->mas7 = 0; 683 tlbe->mas7 = 0;
686 684
687 /* 4K map for serial output. Used by kernel wrapper. */ 685 /* 4K map for serial output. Used by kernel wrapper. */
688 tlbe = &vcpu_e500->guest_tlb[1][1]; 686 tlbe = &vcpu_e500->guest_tlb[1][1];
689 tlbe->mas1 = MAS1_VALID | MAS1_TSIZE(BOOKE_PAGESZ_4K); 687 tlbe->mas1 = MAS1_VALID | MAS1_TSIZE(BOOK3E_PAGESZ_4K);
690 tlbe->mas2 = (0xe0004500 & 0xFFFFF000) | MAS2_I | MAS2_G; 688 tlbe->mas2 = (0xe0004500 & 0xFFFFF000) | MAS2_I | MAS2_G;
691 tlbe->mas3 = (0xe0004500 & 0xFFFFF000) | E500_TLB_SUPER_PERM_MASK; 689 tlbe->mas3 = (0xe0004500 & 0xFFFFF000) | E500_TLB_SUPER_PERM_MASK;
692 tlbe->mas7 = 0; 690 tlbe->mas7 = 0;
diff --git a/arch/powerpc/kvm/e500_tlb.h b/arch/powerpc/kvm/e500_tlb.h
index 45b064b76906..d28e3010a5e2 100644
--- a/arch/powerpc/kvm/e500_tlb.h
+++ b/arch/powerpc/kvm/e500_tlb.h
@@ -16,7 +16,7 @@
16#define __KVM_E500_TLB_H__ 16#define __KVM_E500_TLB_H__
17 17
18#include <linux/kvm_host.h> 18#include <linux/kvm_host.h>
19#include <asm/mmu-fsl-booke.h> 19#include <asm/mmu-book3e.h>
20#include <asm/tlb.h> 20#include <asm/tlb.h>
21#include <asm/kvm_e500.h> 21#include <asm/kvm_e500.h>
22 22
@@ -59,7 +59,7 @@ extern void kvmppc_e500_tlb_setup(struct kvmppc_vcpu_e500 *);
59/* TLB helper functions */ 59/* TLB helper functions */
60static inline unsigned int get_tlb_size(const struct tlbe *tlbe) 60static inline unsigned int get_tlb_size(const struct tlbe *tlbe)
61{ 61{
62 return (tlbe->mas1 >> 8) & 0xf; 62 return (tlbe->mas1 >> 7) & 0x1f;
63} 63}
64 64
65static inline gva_t get_tlb_eaddr(const struct tlbe *tlbe) 65static inline gva_t get_tlb_eaddr(const struct tlbe *tlbe)
@@ -70,7 +70,7 @@ static inline gva_t get_tlb_eaddr(const struct tlbe *tlbe)
70static inline u64 get_tlb_bytes(const struct tlbe *tlbe) 70static inline u64 get_tlb_bytes(const struct tlbe *tlbe)
71{ 71{
72 unsigned int pgsize = get_tlb_size(tlbe); 72 unsigned int pgsize = get_tlb_size(tlbe);
73 return 1ULL << 10 << (pgsize << 1); 73 return 1ULL << 10 << pgsize;
74} 74}
75 75
76static inline gva_t get_tlb_end(const struct tlbe *tlbe) 76static inline gva_t get_tlb_end(const struct tlbe *tlbe)
diff --git a/arch/powerpc/kvm/emulate.c b/arch/powerpc/kvm/emulate.c
index a561d6e8da1c..7737146af3fb 100644
--- a/arch/powerpc/kvm/emulate.c
+++ b/arch/powerpc/kvm/emulate.c
@@ -29,6 +29,7 @@
29#include <asm/kvm_ppc.h> 29#include <asm/kvm_ppc.h>
30#include <asm/disassemble.h> 30#include <asm/disassemble.h>
31#include "timing.h" 31#include "timing.h"
32#include "trace.h"
32 33
33#define OP_TRAP 3 34#define OP_TRAP 3
34 35
@@ -187,7 +188,9 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
187 case SPRN_SRR1: 188 case SPRN_SRR1:
188 vcpu->arch.gpr[rt] = vcpu->arch.srr1; break; 189 vcpu->arch.gpr[rt] = vcpu->arch.srr1; break;
189 case SPRN_PVR: 190 case SPRN_PVR:
190 vcpu->arch.gpr[rt] = vcpu->arch.pvr; break; 191 vcpu->arch.gpr[rt] = mfspr(SPRN_PVR); break;
192 case SPRN_PIR:
193 vcpu->arch.gpr[rt] = mfspr(SPRN_PIR); break;
191 194
192 /* Note: mftb and TBRL/TBWL are user-accessible, so 195 /* Note: mftb and TBRL/TBWL are user-accessible, so
193 * the guest can always access the real TB anyways. 196 * the guest can always access the real TB anyways.
@@ -417,7 +420,7 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
417 } 420 }
418 } 421 }
419 422
420 KVMTRACE_3D(PPC_INSTR, vcpu, inst, (int)vcpu->arch.pc, emulated, entryexit); 423 trace_kvm_ppc_instr(inst, vcpu->arch.pc, emulated);
421 424
422 if (advance) 425 if (advance)
423 vcpu->arch.pc += 4; /* Advance past emulated instruction. */ 426 vcpu->arch.pc += 4; /* Advance past emulated instruction. */
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 2cf915e51e7e..2a4551f78f60 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -31,25 +31,17 @@
31#include "timing.h" 31#include "timing.h"
32#include "../mm/mmu_decl.h" 32#include "../mm/mmu_decl.h"
33 33
34#define CREATE_TRACE_POINTS
35#include "trace.h"
36
34gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn) 37gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
35{ 38{
36 return gfn; 39 return gfn;
37} 40}
38 41
39int kvm_cpu_has_interrupt(struct kvm_vcpu *v)
40{
41 return !!(v->arch.pending_exceptions);
42}
43
44int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu)
45{
46 /* do real check here */
47 return 1;
48}
49
50int kvm_arch_vcpu_runnable(struct kvm_vcpu *v) 42int kvm_arch_vcpu_runnable(struct kvm_vcpu *v)
51{ 43{
52 return !(v->arch.msr & MSR_WE); 44 return !(v->arch.msr & MSR_WE) || !!(v->arch.pending_exceptions);
53} 45}
54 46
55 47
@@ -122,13 +114,17 @@ struct kvm *kvm_arch_create_vm(void)
122static void kvmppc_free_vcpus(struct kvm *kvm) 114static void kvmppc_free_vcpus(struct kvm *kvm)
123{ 115{
124 unsigned int i; 116 unsigned int i;
117 struct kvm_vcpu *vcpu;
125 118
126 for (i = 0; i < KVM_MAX_VCPUS; ++i) { 119 kvm_for_each_vcpu(i, vcpu, kvm)
127 if (kvm->vcpus[i]) { 120 kvm_arch_vcpu_free(vcpu);
128 kvm_arch_vcpu_free(kvm->vcpus[i]); 121
129 kvm->vcpus[i] = NULL; 122 mutex_lock(&kvm->lock);
130 } 123 for (i = 0; i < atomic_read(&kvm->online_vcpus); i++)
131 } 124 kvm->vcpus[i] = NULL;
125
126 atomic_set(&kvm->online_vcpus, 0);
127 mutex_unlock(&kvm->lock);
132} 128}
133 129
134void kvm_arch_sync_events(struct kvm *kvm) 130void kvm_arch_sync_events(struct kvm *kvm)
diff --git a/arch/powerpc/kvm/trace.h b/arch/powerpc/kvm/trace.h
new file mode 100644
index 000000000000..67f219de0455
--- /dev/null
+++ b/arch/powerpc/kvm/trace.h
@@ -0,0 +1,104 @@
1#if !defined(_TRACE_KVM_H) || defined(TRACE_HEADER_MULTI_READ)
2#define _TRACE_KVM_H
3
4#include <linux/tracepoint.h>
5
6#undef TRACE_SYSTEM
7#define TRACE_SYSTEM kvm
8#define TRACE_INCLUDE_PATH .
9#define TRACE_INCLUDE_FILE trace
10
11/*
12 * Tracepoint for guest mode entry.
13 */
14TRACE_EVENT(kvm_ppc_instr,
15 TP_PROTO(unsigned int inst, unsigned long pc, unsigned int emulate),
16 TP_ARGS(inst, pc, emulate),
17
18 TP_STRUCT__entry(
19 __field( unsigned int, inst )
20 __field( unsigned long, pc )
21 __field( unsigned int, emulate )
22 ),
23
24 TP_fast_assign(
25 __entry->inst = inst;
26 __entry->pc = pc;
27 __entry->emulate = emulate;
28 ),
29
30 TP_printk("inst %u pc 0x%lx emulate %u\n",
31 __entry->inst, __entry->pc, __entry->emulate)
32);
33
34TRACE_EVENT(kvm_stlb_inval,
35 TP_PROTO(unsigned int stlb_index),
36 TP_ARGS(stlb_index),
37
38 TP_STRUCT__entry(
39 __field( unsigned int, stlb_index )
40 ),
41
42 TP_fast_assign(
43 __entry->stlb_index = stlb_index;
44 ),
45
46 TP_printk("stlb_index %u", __entry->stlb_index)
47);
48
49TRACE_EVENT(kvm_stlb_write,
50 TP_PROTO(unsigned int victim, unsigned int tid, unsigned int word0,
51 unsigned int word1, unsigned int word2),
52 TP_ARGS(victim, tid, word0, word1, word2),
53
54 TP_STRUCT__entry(
55 __field( unsigned int, victim )
56 __field( unsigned int, tid )
57 __field( unsigned int, word0 )
58 __field( unsigned int, word1 )
59 __field( unsigned int, word2 )
60 ),
61
62 TP_fast_assign(
63 __entry->victim = victim;
64 __entry->tid = tid;
65 __entry->word0 = word0;
66 __entry->word1 = word1;
67 __entry->word2 = word2;
68 ),
69
70 TP_printk("victim %u tid %u w0 %u w1 %u w2 %u",
71 __entry->victim, __entry->tid, __entry->word0,
72 __entry->word1, __entry->word2)
73);
74
75TRACE_EVENT(kvm_gtlb_write,
76 TP_PROTO(unsigned int gtlb_index, unsigned int tid, unsigned int word0,
77 unsigned int word1, unsigned int word2),
78 TP_ARGS(gtlb_index, tid, word0, word1, word2),
79
80 TP_STRUCT__entry(
81 __field( unsigned int, gtlb_index )
82 __field( unsigned int, tid )
83 __field( unsigned int, word0 )
84 __field( unsigned int, word1 )
85 __field( unsigned int, word2 )
86 ),
87
88 TP_fast_assign(
89 __entry->gtlb_index = gtlb_index;
90 __entry->tid = tid;
91 __entry->word0 = word0;
92 __entry->word1 = word1;
93 __entry->word2 = word2;
94 ),
95
96 TP_printk("gtlb_index %u tid %u w0 %u w1 %u w2 %u",
97 __entry->gtlb_index, __entry->tid, __entry->word0,
98 __entry->word1, __entry->word2)
99);
100
101#endif /* _TRACE_KVM_H */
102
103/* This part must be outside protection */
104#include <trace/define_trace.h>
diff --git a/arch/s390/include/asm/kvm.h b/arch/s390/include/asm/kvm.h
index 0b2f829f6d50..3dfcaeb5d7f4 100644
--- a/arch/s390/include/asm/kvm.h
+++ b/arch/s390/include/asm/kvm.h
@@ -15,15 +15,6 @@
15 */ 15 */
16#include <linux/types.h> 16#include <linux/types.h>
17 17
18/* for KVM_GET_IRQCHIP and KVM_SET_IRQCHIP */
19struct kvm_pic_state {
20 /* no PIC for s390 */
21};
22
23struct kvm_ioapic_state {
24 /* no IOAPIC for s390 */
25};
26
27/* for KVM_GET_REGS and KVM_SET_REGS */ 18/* for KVM_GET_REGS and KVM_SET_REGS */
28struct kvm_regs { 19struct kvm_regs {
29 /* general purpose regs for s390 */ 20 /* general purpose regs for s390 */
diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
index 698988f69403..27605b62b980 100644
--- a/arch/s390/include/asm/kvm_host.h
+++ b/arch/s390/include/asm/kvm_host.h
@@ -1,7 +1,7 @@
1/* 1/*
2 * asm-s390/kvm_host.h - definition for kernel virtual machines on s390 2 * asm-s390/kvm_host.h - definition for kernel virtual machines on s390
3 * 3 *
4 * Copyright IBM Corp. 2008 4 * Copyright IBM Corp. 2008,2009
5 * 5 *
6 * This program is free software; you can redistribute it and/or modify 6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License (version 2 only) 7 * it under the terms of the GNU General Public License (version 2 only)
@@ -40,7 +40,11 @@ struct sca_block {
40 struct sca_entry cpu[64]; 40 struct sca_entry cpu[64];
41} __attribute__((packed)); 41} __attribute__((packed));
42 42
43#define KVM_PAGES_PER_HPAGE 256 43#define KVM_NR_PAGE_SIZES 2
44#define KVM_HPAGE_SHIFT(x) (PAGE_SHIFT + ((x) - 1) * 8)
45#define KVM_HPAGE_SIZE(x) (1UL << KVM_HPAGE_SHIFT(x))
46#define KVM_HPAGE_MASK(x) (~(KVM_HPAGE_SIZE(x) - 1))
47#define KVM_PAGES_PER_HPAGE(x) (KVM_HPAGE_SIZE(x) / PAGE_SIZE)
44 48
45#define CPUSTAT_HOST 0x80000000 49#define CPUSTAT_HOST 0x80000000
46#define CPUSTAT_WAIT 0x10000000 50#define CPUSTAT_WAIT 0x10000000
@@ -182,8 +186,9 @@ struct kvm_s390_interrupt_info {
182}; 186};
183 187
184/* for local_interrupt.action_flags */ 188/* for local_interrupt.action_flags */
185#define ACTION_STORE_ON_STOP 1 189#define ACTION_STORE_ON_STOP (1<<0)
186#define ACTION_STOP_ON_STOP 2 190#define ACTION_STOP_ON_STOP (1<<1)
191#define ACTION_RELOADVCPU_ON_STOP (1<<2)
187 192
188struct kvm_s390_local_interrupt { 193struct kvm_s390_local_interrupt {
189 spinlock_t lock; 194 spinlock_t lock;
@@ -227,8 +232,6 @@ struct kvm_vm_stat {
227}; 232};
228 233
229struct kvm_arch{ 234struct kvm_arch{
230 unsigned long guest_origin;
231 unsigned long guest_memsize;
232 struct sca_block *sca; 235 struct sca_block *sca;
233 debug_info_t *dbf; 236 debug_info_t *dbf;
234 struct kvm_s390_float_interrupt float_int; 237 struct kvm_s390_float_interrupt float_int;
diff --git a/arch/s390/include/asm/kvm_para.h b/arch/s390/include/asm/kvm_para.h
index 2c503796b619..6964db226f83 100644
--- a/arch/s390/include/asm/kvm_para.h
+++ b/arch/s390/include/asm/kvm_para.h
@@ -13,6 +13,8 @@
13#ifndef __S390_KVM_PARA_H 13#ifndef __S390_KVM_PARA_H
14#define __S390_KVM_PARA_H 14#define __S390_KVM_PARA_H
15 15
16#ifdef __KERNEL__
17
16/* 18/*
17 * Hypercalls for KVM on s390. The calling convention is similar to the 19 * Hypercalls for KVM on s390. The calling convention is similar to the
18 * s390 ABI, so we use R2-R6 for parameters 1-5. In addition we use R1 20 * s390 ABI, so we use R2-R6 for parameters 1-5. In addition we use R1
@@ -147,4 +149,6 @@ static inline unsigned int kvm_arch_para_features(void)
147 return 0; 149 return 0;
148} 150}
149 151
152#endif
153
150#endif /* __S390_KVM_PARA_H */ 154#endif /* __S390_KVM_PARA_H */
diff --git a/arch/s390/kvm/Kconfig b/arch/s390/kvm/Kconfig
index 3e260b7e37b2..bf164fc21864 100644
--- a/arch/s390/kvm/Kconfig
+++ b/arch/s390/kvm/Kconfig
@@ -1,11 +1,7 @@
1# 1#
2# KVM configuration 2# KVM configuration
3# 3#
4config HAVE_KVM 4source "virt/kvm/Kconfig"
5 bool
6
7config HAVE_KVM_IRQCHIP
8 bool
9 5
10menuconfig VIRTUALIZATION 6menuconfig VIRTUALIZATION
11 bool "Virtualization" 7 bool "Virtualization"
@@ -38,9 +34,6 @@ config KVM
38 34
39 If unsure, say N. 35 If unsure, say N.
40 36
41config KVM_TRACE
42 bool
43
44# OK, it's a little counter-intuitive to do this, but it puts it neatly under 37# OK, it's a little counter-intuitive to do this, but it puts it neatly under
45# the virtualization menu. 38# the virtualization menu.
46source drivers/virtio/Kconfig 39source drivers/virtio/Kconfig
diff --git a/arch/s390/kvm/gaccess.h b/arch/s390/kvm/gaccess.h
index ed60f3a74a85..03c716a0f01f 100644
--- a/arch/s390/kvm/gaccess.h
+++ b/arch/s390/kvm/gaccess.h
@@ -1,7 +1,7 @@
1/* 1/*
2 * gaccess.h - access guest memory 2 * gaccess.h - access guest memory
3 * 3 *
4 * Copyright IBM Corp. 2008 4 * Copyright IBM Corp. 2008,2009
5 * 5 *
6 * This program is free software; you can redistribute it and/or modify 6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License (version 2 only) 7 * it under the terms of the GNU General Public License (version 2 only)
@@ -16,13 +16,14 @@
16#include <linux/compiler.h> 16#include <linux/compiler.h>
17#include <linux/kvm_host.h> 17#include <linux/kvm_host.h>
18#include <asm/uaccess.h> 18#include <asm/uaccess.h>
19#include "kvm-s390.h"
19 20
20static inline void __user *__guestaddr_to_user(struct kvm_vcpu *vcpu, 21static inline void __user *__guestaddr_to_user(struct kvm_vcpu *vcpu,
21 unsigned long guestaddr) 22 unsigned long guestaddr)
22{ 23{
23 unsigned long prefix = vcpu->arch.sie_block->prefix; 24 unsigned long prefix = vcpu->arch.sie_block->prefix;
24 unsigned long origin = vcpu->kvm->arch.guest_origin; 25 unsigned long origin = vcpu->arch.sie_block->gmsor;
25 unsigned long memsize = vcpu->kvm->arch.guest_memsize; 26 unsigned long memsize = kvm_s390_vcpu_get_memsize(vcpu);
26 27
27 if (guestaddr < 2 * PAGE_SIZE) 28 if (guestaddr < 2 * PAGE_SIZE)
28 guestaddr += prefix; 29 guestaddr += prefix;
@@ -158,8 +159,8 @@ static inline int copy_to_guest(struct kvm_vcpu *vcpu, unsigned long guestdest,
158 const void *from, unsigned long n) 159 const void *from, unsigned long n)
159{ 160{
160 unsigned long prefix = vcpu->arch.sie_block->prefix; 161 unsigned long prefix = vcpu->arch.sie_block->prefix;
161 unsigned long origin = vcpu->kvm->arch.guest_origin; 162 unsigned long origin = vcpu->arch.sie_block->gmsor;
162 unsigned long memsize = vcpu->kvm->arch.guest_memsize; 163 unsigned long memsize = kvm_s390_vcpu_get_memsize(vcpu);
163 164
164 if ((guestdest < 2 * PAGE_SIZE) && (guestdest + n > 2 * PAGE_SIZE)) 165 if ((guestdest < 2 * PAGE_SIZE) && (guestdest + n > 2 * PAGE_SIZE))
165 goto slowpath; 166 goto slowpath;
@@ -209,8 +210,8 @@ static inline int copy_from_guest(struct kvm_vcpu *vcpu, void *to,
209 unsigned long guestsrc, unsigned long n) 210 unsigned long guestsrc, unsigned long n)
210{ 211{
211 unsigned long prefix = vcpu->arch.sie_block->prefix; 212 unsigned long prefix = vcpu->arch.sie_block->prefix;
212 unsigned long origin = vcpu->kvm->arch.guest_origin; 213 unsigned long origin = vcpu->arch.sie_block->gmsor;
213 unsigned long memsize = vcpu->kvm->arch.guest_memsize; 214 unsigned long memsize = kvm_s390_vcpu_get_memsize(vcpu);
214 215
215 if ((guestsrc < 2 * PAGE_SIZE) && (guestsrc + n > 2 * PAGE_SIZE)) 216 if ((guestsrc < 2 * PAGE_SIZE) && (guestsrc + n > 2 * PAGE_SIZE))
216 goto slowpath; 217 goto slowpath;
@@ -244,8 +245,8 @@ static inline int copy_to_guest_absolute(struct kvm_vcpu *vcpu,
244 unsigned long guestdest, 245 unsigned long guestdest,
245 const void *from, unsigned long n) 246 const void *from, unsigned long n)
246{ 247{
247 unsigned long origin = vcpu->kvm->arch.guest_origin; 248 unsigned long origin = vcpu->arch.sie_block->gmsor;
248 unsigned long memsize = vcpu->kvm->arch.guest_memsize; 249 unsigned long memsize = kvm_s390_vcpu_get_memsize(vcpu);
249 250
250 if (guestdest + n > memsize) 251 if (guestdest + n > memsize)
251 return -EFAULT; 252 return -EFAULT;
@@ -262,8 +263,8 @@ static inline int copy_from_guest_absolute(struct kvm_vcpu *vcpu, void *to,
262 unsigned long guestsrc, 263 unsigned long guestsrc,
263 unsigned long n) 264 unsigned long n)
264{ 265{
265 unsigned long origin = vcpu->kvm->arch.guest_origin; 266 unsigned long origin = vcpu->arch.sie_block->gmsor;
266 unsigned long memsize = vcpu->kvm->arch.guest_memsize; 267 unsigned long memsize = kvm_s390_vcpu_get_memsize(vcpu);
267 268
268 if (guestsrc + n > memsize) 269 if (guestsrc + n > memsize)
269 return -EFAULT; 270 return -EFAULT;
diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c
index 98997ccba501..ba9d8a7bc1ac 100644
--- a/arch/s390/kvm/intercept.c
+++ b/arch/s390/kvm/intercept.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * intercept.c - in-kernel handling for sie intercepts 2 * intercept.c - in-kernel handling for sie intercepts
3 * 3 *
4 * Copyright IBM Corp. 2008 4 * Copyright IBM Corp. 2008,2009
5 * 5 *
6 * This program is free software; you can redistribute it and/or modify 6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License (version 2 only) 7 * it under the terms of the GNU General Public License (version 2 only)
@@ -128,7 +128,7 @@ static int handle_noop(struct kvm_vcpu *vcpu)
128 128
129static int handle_stop(struct kvm_vcpu *vcpu) 129static int handle_stop(struct kvm_vcpu *vcpu)
130{ 130{
131 int rc; 131 int rc = 0;
132 132
133 vcpu->stat.exit_stop_request++; 133 vcpu->stat.exit_stop_request++;
134 atomic_clear_mask(CPUSTAT_RUNNING, &vcpu->arch.sie_block->cpuflags); 134 atomic_clear_mask(CPUSTAT_RUNNING, &vcpu->arch.sie_block->cpuflags);
@@ -141,12 +141,18 @@ static int handle_stop(struct kvm_vcpu *vcpu)
141 rc = -ENOTSUPP; 141 rc = -ENOTSUPP;
142 } 142 }
143 143
144 if (vcpu->arch.local_int.action_bits & ACTION_RELOADVCPU_ON_STOP) {
145 vcpu->arch.local_int.action_bits &= ~ACTION_RELOADVCPU_ON_STOP;
146 rc = SIE_INTERCEPT_RERUNVCPU;
147 vcpu->run->exit_reason = KVM_EXIT_INTR;
148 }
149
144 if (vcpu->arch.local_int.action_bits & ACTION_STOP_ON_STOP) { 150 if (vcpu->arch.local_int.action_bits & ACTION_STOP_ON_STOP) {
145 vcpu->arch.local_int.action_bits &= ~ACTION_STOP_ON_STOP; 151 vcpu->arch.local_int.action_bits &= ~ACTION_STOP_ON_STOP;
146 VCPU_EVENT(vcpu, 3, "%s", "cpu stopped"); 152 VCPU_EVENT(vcpu, 3, "%s", "cpu stopped");
147 rc = -ENOTSUPP; 153 rc = -ENOTSUPP;
148 } else 154 }
149 rc = 0; 155
150 spin_unlock_bh(&vcpu->arch.local_int.lock); 156 spin_unlock_bh(&vcpu->arch.local_int.lock);
151 return rc; 157 return rc;
152} 158}
@@ -158,9 +164,9 @@ static int handle_validity(struct kvm_vcpu *vcpu)
158 164
159 vcpu->stat.exit_validity++; 165 vcpu->stat.exit_validity++;
160 if ((viwhy == 0x37) && (vcpu->arch.sie_block->prefix 166 if ((viwhy == 0x37) && (vcpu->arch.sie_block->prefix
161 <= vcpu->kvm->arch.guest_memsize - 2*PAGE_SIZE)){ 167 <= kvm_s390_vcpu_get_memsize(vcpu) - 2*PAGE_SIZE)) {
162 rc = fault_in_pages_writeable((char __user *) 168 rc = fault_in_pages_writeable((char __user *)
163 vcpu->kvm->arch.guest_origin + 169 vcpu->arch.sie_block->gmsor +
164 vcpu->arch.sie_block->prefix, 170 vcpu->arch.sie_block->prefix,
165 2*PAGE_SIZE); 171 2*PAGE_SIZE);
166 if (rc) 172 if (rc)
diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
index 4d613415c435..2c2f98353415 100644
--- a/arch/s390/kvm/interrupt.c
+++ b/arch/s390/kvm/interrupt.c
@@ -283,7 +283,7 @@ static int __try_deliver_ckc_interrupt(struct kvm_vcpu *vcpu)
283 return 1; 283 return 1;
284} 284}
285 285
286int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu) 286static int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu)
287{ 287{
288 struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; 288 struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
289 struct kvm_s390_float_interrupt *fi = vcpu->arch.local_int.float_int; 289 struct kvm_s390_float_interrupt *fi = vcpu->arch.local_int.float_int;
@@ -320,12 +320,6 @@ int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu)
320 return rc; 320 return rc;
321} 321}
322 322
323int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu)
324{
325 /* do real check here */
326 return 1;
327}
328
329int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu) 323int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
330{ 324{
331 return 0; 325 return 0;
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 90d9d1ba258b..07ced89740d7 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * s390host.c -- hosting zSeries kernel virtual machines 2 * s390host.c -- hosting zSeries kernel virtual machines
3 * 3 *
4 * Copyright IBM Corp. 2008 4 * Copyright IBM Corp. 2008,2009
5 * 5 *
6 * This program is free software; you can redistribute it and/or modify 6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License (version 2 only) 7 * it under the terms of the GNU General Public License (version 2 only)
@@ -10,6 +10,7 @@
10 * Author(s): Carsten Otte <cotte@de.ibm.com> 10 * Author(s): Carsten Otte <cotte@de.ibm.com>
11 * Christian Borntraeger <borntraeger@de.ibm.com> 11 * Christian Borntraeger <borntraeger@de.ibm.com>
12 * Heiko Carstens <heiko.carstens@de.ibm.com> 12 * Heiko Carstens <heiko.carstens@de.ibm.com>
13 * Christian Ehrhardt <ehrhardt@de.ibm.com>
13 */ 14 */
14 15
15#include <linux/compiler.h> 16#include <linux/compiler.h>
@@ -210,13 +211,17 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
210static void kvm_free_vcpus(struct kvm *kvm) 211static void kvm_free_vcpus(struct kvm *kvm)
211{ 212{
212 unsigned int i; 213 unsigned int i;
214 struct kvm_vcpu *vcpu;
213 215
214 for (i = 0; i < KVM_MAX_VCPUS; ++i) { 216 kvm_for_each_vcpu(i, vcpu, kvm)
215 if (kvm->vcpus[i]) { 217 kvm_arch_vcpu_destroy(vcpu);
216 kvm_arch_vcpu_destroy(kvm->vcpus[i]); 218
217 kvm->vcpus[i] = NULL; 219 mutex_lock(&kvm->lock);
218 } 220 for (i = 0; i < atomic_read(&kvm->online_vcpus); i++)
219 } 221 kvm->vcpus[i] = NULL;
222
223 atomic_set(&kvm->online_vcpus, 0);
224 mutex_unlock(&kvm->lock);
220} 225}
221 226
222void kvm_arch_sync_events(struct kvm *kvm) 227void kvm_arch_sync_events(struct kvm *kvm)
@@ -278,16 +283,10 @@ static void kvm_s390_vcpu_initial_reset(struct kvm_vcpu *vcpu)
278 vcpu->arch.sie_block->gbea = 1; 283 vcpu->arch.sie_block->gbea = 1;
279} 284}
280 285
281/* The current code can have up to 256 pages for virtio */
282#define VIRTIODESCSPACE (256ul * 4096ul)
283
284int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) 286int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
285{ 287{
286 atomic_set(&vcpu->arch.sie_block->cpuflags, CPUSTAT_ZARCH); 288 atomic_set(&vcpu->arch.sie_block->cpuflags, CPUSTAT_ZARCH);
287 vcpu->arch.sie_block->gmslm = vcpu->kvm->arch.guest_memsize + 289 set_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests);
288 vcpu->kvm->arch.guest_origin +
289 VIRTIODESCSPACE - 1ul;
290 vcpu->arch.sie_block->gmsor = vcpu->kvm->arch.guest_origin;
291 vcpu->arch.sie_block->ecb = 2; 290 vcpu->arch.sie_block->ecb = 2;
292 vcpu->arch.sie_block->eca = 0xC1002001U; 291 vcpu->arch.sie_block->eca = 0xC1002001U;
293 vcpu->arch.sie_block->fac = (int) (long) facilities; 292 vcpu->arch.sie_block->fac = (int) (long) facilities;
@@ -319,8 +318,6 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
319 BUG_ON(!kvm->arch.sca); 318 BUG_ON(!kvm->arch.sca);
320 if (!kvm->arch.sca->cpu[id].sda) 319 if (!kvm->arch.sca->cpu[id].sda)
321 kvm->arch.sca->cpu[id].sda = (__u64) vcpu->arch.sie_block; 320 kvm->arch.sca->cpu[id].sda = (__u64) vcpu->arch.sie_block;
322 else
323 BUG_ON(!kvm->vcpus[id]); /* vcpu does already exist */
324 vcpu->arch.sie_block->scaoh = (__u32)(((__u64)kvm->arch.sca) >> 32); 321 vcpu->arch.sie_block->scaoh = (__u32)(((__u64)kvm->arch.sca) >> 32);
325 vcpu->arch.sie_block->scaol = (__u32)(__u64)kvm->arch.sca; 322 vcpu->arch.sie_block->scaol = (__u32)(__u64)kvm->arch.sca;
326 323
@@ -490,9 +487,15 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
490 487
491 vcpu_load(vcpu); 488 vcpu_load(vcpu);
492 489
490rerun_vcpu:
491 if (vcpu->requests)
492 if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests))
493 kvm_s390_vcpu_set_mem(vcpu);
494
493 /* verify, that memory has been registered */ 495 /* verify, that memory has been registered */
494 if (!vcpu->kvm->arch.guest_memsize) { 496 if (!vcpu->arch.sie_block->gmslm) {
495 vcpu_put(vcpu); 497 vcpu_put(vcpu);
498 VCPU_EVENT(vcpu, 3, "%s", "no memory registered to run vcpu");
496 return -EINVAL; 499 return -EINVAL;
497 } 500 }
498 501
@@ -509,6 +512,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
509 vcpu->arch.sie_block->gpsw.addr = kvm_run->s390_sieic.addr; 512 vcpu->arch.sie_block->gpsw.addr = kvm_run->s390_sieic.addr;
510 break; 513 break;
511 case KVM_EXIT_UNKNOWN: 514 case KVM_EXIT_UNKNOWN:
515 case KVM_EXIT_INTR:
512 case KVM_EXIT_S390_RESET: 516 case KVM_EXIT_S390_RESET:
513 break; 517 break;
514 default: 518 default:
@@ -522,8 +526,13 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
522 rc = kvm_handle_sie_intercept(vcpu); 526 rc = kvm_handle_sie_intercept(vcpu);
523 } while (!signal_pending(current) && !rc); 527 } while (!signal_pending(current) && !rc);
524 528
525 if (signal_pending(current) && !rc) 529 if (rc == SIE_INTERCEPT_RERUNVCPU)
530 goto rerun_vcpu;
531
532 if (signal_pending(current) && !rc) {
533 kvm_run->exit_reason = KVM_EXIT_INTR;
526 rc = -EINTR; 534 rc = -EINTR;
535 }
527 536
528 if (rc == -ENOTSUPP) { 537 if (rc == -ENOTSUPP) {
529 /* intercept cannot be handled in-kernel, prepare kvm-run */ 538 /* intercept cannot be handled in-kernel, prepare kvm-run */
@@ -676,6 +685,7 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
676 int user_alloc) 685 int user_alloc)
677{ 686{
678 int i; 687 int i;
688 struct kvm_vcpu *vcpu;
679 689
680 /* A few sanity checks. We can have exactly one memory slot which has 690 /* A few sanity checks. We can have exactly one memory slot which has
681 to start at guest virtual zero and which has to be located at a 691 to start at guest virtual zero and which has to be located at a
@@ -684,7 +694,7 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
684 vmas. It is okay to mmap() and munmap() stuff in this slot after 694 vmas. It is okay to mmap() and munmap() stuff in this slot after
685 doing this call at any time */ 695 doing this call at any time */
686 696
687 if (mem->slot || kvm->arch.guest_memsize) 697 if (mem->slot)
688 return -EINVAL; 698 return -EINVAL;
689 699
690 if (mem->guest_phys_addr) 700 if (mem->guest_phys_addr)
@@ -699,36 +709,14 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
699 if (!user_alloc) 709 if (!user_alloc)
700 return -EINVAL; 710 return -EINVAL;
701 711
702 /* lock all vcpus */ 712 /* request update of sie control block for all available vcpus */
703 for (i = 0; i < KVM_MAX_VCPUS; ++i) { 713 kvm_for_each_vcpu(i, vcpu, kvm) {
704 if (!kvm->vcpus[i]) 714 if (test_and_set_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests))
705 continue; 715 continue;
706 if (!mutex_trylock(&kvm->vcpus[i]->mutex)) 716 kvm_s390_inject_sigp_stop(vcpu, ACTION_RELOADVCPU_ON_STOP);
707 goto fail_out;
708 }
709
710 kvm->arch.guest_origin = mem->userspace_addr;
711 kvm->arch.guest_memsize = mem->memory_size;
712
713 /* update sie control blocks, and unlock all vcpus */
714 for (i = 0; i < KVM_MAX_VCPUS; ++i) {
715 if (kvm->vcpus[i]) {
716 kvm->vcpus[i]->arch.sie_block->gmsor =
717 kvm->arch.guest_origin;
718 kvm->vcpus[i]->arch.sie_block->gmslm =
719 kvm->arch.guest_memsize +
720 kvm->arch.guest_origin +
721 VIRTIODESCSPACE - 1ul;
722 mutex_unlock(&kvm->vcpus[i]->mutex);
723 }
724 } 717 }
725 718
726 return 0; 719 return 0;
727
728fail_out:
729 for (; i >= 0; i--)
730 mutex_unlock(&kvm->vcpus[i]->mutex);
731 return -EINVAL;
732} 720}
733 721
734void kvm_arch_flush_shadow(struct kvm *kvm) 722void kvm_arch_flush_shadow(struct kvm *kvm)
diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h
index 748fee872323..ec5eee7c25d8 100644
--- a/arch/s390/kvm/kvm-s390.h
+++ b/arch/s390/kvm/kvm-s390.h
@@ -1,7 +1,7 @@
1/* 1/*
2 * kvm_s390.h - definition for kvm on s390 2 * kvm_s390.h - definition for kvm on s390
3 * 3 *
4 * Copyright IBM Corp. 2008 4 * Copyright IBM Corp. 2008,2009
5 * 5 *
6 * This program is free software; you can redistribute it and/or modify 6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License (version 2 only) 7 * it under the terms of the GNU General Public License (version 2 only)
@@ -9,6 +9,7 @@
9 * 9 *
10 * Author(s): Carsten Otte <cotte@de.ibm.com> 10 * Author(s): Carsten Otte <cotte@de.ibm.com>
11 * Christian Borntraeger <borntraeger@de.ibm.com> 11 * Christian Borntraeger <borntraeger@de.ibm.com>
12 * Christian Ehrhardt <ehrhardt@de.ibm.com>
12 */ 13 */
13 14
14#ifndef ARCH_S390_KVM_S390_H 15#ifndef ARCH_S390_KVM_S390_H
@@ -18,8 +19,13 @@
18#include <linux/kvm.h> 19#include <linux/kvm.h>
19#include <linux/kvm_host.h> 20#include <linux/kvm_host.h>
20 21
22/* The current code can have up to 256 pages for virtio */
23#define VIRTIODESCSPACE (256ul * 4096ul)
24
21typedef int (*intercept_handler_t)(struct kvm_vcpu *vcpu); 25typedef int (*intercept_handler_t)(struct kvm_vcpu *vcpu);
22 26
27/* negativ values are error codes, positive values for internal conditions */
28#define SIE_INTERCEPT_RERUNVCPU (1<<0)
23int kvm_handle_sie_intercept(struct kvm_vcpu *vcpu); 29int kvm_handle_sie_intercept(struct kvm_vcpu *vcpu);
24 30
25#define VM_EVENT(d_kvm, d_loglevel, d_string, d_args...)\ 31#define VM_EVENT(d_kvm, d_loglevel, d_string, d_args...)\
@@ -50,6 +56,30 @@ int kvm_s390_inject_vm(struct kvm *kvm,
50int kvm_s390_inject_vcpu(struct kvm_vcpu *vcpu, 56int kvm_s390_inject_vcpu(struct kvm_vcpu *vcpu,
51 struct kvm_s390_interrupt *s390int); 57 struct kvm_s390_interrupt *s390int);
52int kvm_s390_inject_program_int(struct kvm_vcpu *vcpu, u16 code); 58int kvm_s390_inject_program_int(struct kvm_vcpu *vcpu, u16 code);
59int kvm_s390_inject_sigp_stop(struct kvm_vcpu *vcpu, int action);
60
61static inline int kvm_s390_vcpu_get_memsize(struct kvm_vcpu *vcpu)
62{
63 return vcpu->arch.sie_block->gmslm
64 - vcpu->arch.sie_block->gmsor
65 - VIRTIODESCSPACE + 1ul;
66}
67
68static inline void kvm_s390_vcpu_set_mem(struct kvm_vcpu *vcpu)
69{
70 struct kvm_memory_slot *mem;
71
72 down_read(&vcpu->kvm->slots_lock);
73 mem = &vcpu->kvm->memslots[0];
74
75 vcpu->arch.sie_block->gmsor = mem->userspace_addr;
76 vcpu->arch.sie_block->gmslm =
77 mem->userspace_addr +
78 (mem->npages << PAGE_SHIFT) +
79 VIRTIODESCSPACE - 1ul;
80
81 up_read(&vcpu->kvm->slots_lock);
82}
53 83
54/* implemented in priv.c */ 84/* implemented in priv.c */
55int kvm_s390_handle_b2(struct kvm_vcpu *vcpu); 85int kvm_s390_handle_b2(struct kvm_vcpu *vcpu);
diff --git a/arch/s390/kvm/sigp.c b/arch/s390/kvm/sigp.c
index 0ef81d6776e9..40c8c6748cfe 100644
--- a/arch/s390/kvm/sigp.c
+++ b/arch/s390/kvm/sigp.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * sigp.c - handlinge interprocessor communication 2 * sigp.c - handlinge interprocessor communication
3 * 3 *
4 * Copyright IBM Corp. 2008 4 * Copyright IBM Corp. 2008,2009
5 * 5 *
6 * This program is free software; you can redistribute it and/or modify 6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License (version 2 only) 7 * it under the terms of the GNU General Public License (version 2 only)
@@ -9,6 +9,7 @@
9 * 9 *
10 * Author(s): Carsten Otte <cotte@de.ibm.com> 10 * Author(s): Carsten Otte <cotte@de.ibm.com>
11 * Christian Borntraeger <borntraeger@de.ibm.com> 11 * Christian Borntraeger <borntraeger@de.ibm.com>
12 * Christian Ehrhardt <ehrhardt@de.ibm.com>
12 */ 13 */
13 14
14#include <linux/kvm.h> 15#include <linux/kvm.h>
@@ -107,46 +108,57 @@ unlock:
107 return rc; 108 return rc;
108} 109}
109 110
110static int __sigp_stop(struct kvm_vcpu *vcpu, u16 cpu_addr, int store) 111static int __inject_sigp_stop(struct kvm_s390_local_interrupt *li, int action)
111{ 112{
112 struct kvm_s390_float_interrupt *fi = &vcpu->kvm->arch.float_int;
113 struct kvm_s390_local_interrupt *li;
114 struct kvm_s390_interrupt_info *inti; 113 struct kvm_s390_interrupt_info *inti;
115 int rc;
116
117 if (cpu_addr >= KVM_MAX_VCPUS)
118 return 3; /* not operational */
119 114
120 inti = kzalloc(sizeof(*inti), GFP_KERNEL); 115 inti = kzalloc(sizeof(*inti), GFP_KERNEL);
121 if (!inti) 116 if (!inti)
122 return -ENOMEM; 117 return -ENOMEM;
123
124 inti->type = KVM_S390_SIGP_STOP; 118 inti->type = KVM_S390_SIGP_STOP;
125 119
126 spin_lock(&fi->lock);
127 li = fi->local_int[cpu_addr];
128 if (li == NULL) {
129 rc = 3; /* not operational */
130 kfree(inti);
131 goto unlock;
132 }
133 spin_lock_bh(&li->lock); 120 spin_lock_bh(&li->lock);
134 list_add_tail(&inti->list, &li->list); 121 list_add_tail(&inti->list, &li->list);
135 atomic_set(&li->active, 1); 122 atomic_set(&li->active, 1);
136 atomic_set_mask(CPUSTAT_STOP_INT, li->cpuflags); 123 atomic_set_mask(CPUSTAT_STOP_INT, li->cpuflags);
137 if (store) 124 li->action_bits |= action;
138 li->action_bits |= ACTION_STORE_ON_STOP;
139 li->action_bits |= ACTION_STOP_ON_STOP;
140 if (waitqueue_active(&li->wq)) 125 if (waitqueue_active(&li->wq))
141 wake_up_interruptible(&li->wq); 126 wake_up_interruptible(&li->wq);
142 spin_unlock_bh(&li->lock); 127 spin_unlock_bh(&li->lock);
143 rc = 0; /* order accepted */ 128
129 return 0; /* order accepted */
130}
131
132static int __sigp_stop(struct kvm_vcpu *vcpu, u16 cpu_addr, int action)
133{
134 struct kvm_s390_float_interrupt *fi = &vcpu->kvm->arch.float_int;
135 struct kvm_s390_local_interrupt *li;
136 int rc;
137
138 if (cpu_addr >= KVM_MAX_VCPUS)
139 return 3; /* not operational */
140
141 spin_lock(&fi->lock);
142 li = fi->local_int[cpu_addr];
143 if (li == NULL) {
144 rc = 3; /* not operational */
145 goto unlock;
146 }
147
148 rc = __inject_sigp_stop(li, action);
149
144unlock: 150unlock:
145 spin_unlock(&fi->lock); 151 spin_unlock(&fi->lock);
146 VCPU_EVENT(vcpu, 4, "sent sigp stop to cpu %x", cpu_addr); 152 VCPU_EVENT(vcpu, 4, "sent sigp stop to cpu %x", cpu_addr);
147 return rc; 153 return rc;
148} 154}
149 155
156int kvm_s390_inject_sigp_stop(struct kvm_vcpu *vcpu, int action)
157{
158 struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
159 return __inject_sigp_stop(li, action);
160}
161
150static int __sigp_set_arch(struct kvm_vcpu *vcpu, u32 parameter) 162static int __sigp_set_arch(struct kvm_vcpu *vcpu, u32 parameter)
151{ 163{
152 int rc; 164 int rc;
@@ -177,9 +189,9 @@ static int __sigp_set_prefix(struct kvm_vcpu *vcpu, u16 cpu_addr, u32 address,
177 /* make sure that the new value is valid memory */ 189 /* make sure that the new value is valid memory */
178 address = address & 0x7fffe000u; 190 address = address & 0x7fffe000u;
179 if ((copy_from_guest(vcpu, &tmp, 191 if ((copy_from_guest(vcpu, &tmp,
180 (u64) (address + vcpu->kvm->arch.guest_origin) , 1)) || 192 (u64) (address + vcpu->arch.sie_block->gmsor) , 1)) ||
181 (copy_from_guest(vcpu, &tmp, (u64) (address + 193 (copy_from_guest(vcpu, &tmp, (u64) (address +
182 vcpu->kvm->arch.guest_origin + PAGE_SIZE), 1))) { 194 vcpu->arch.sie_block->gmsor + PAGE_SIZE), 1))) {
183 *reg |= SIGP_STAT_INVALID_PARAMETER; 195 *reg |= SIGP_STAT_INVALID_PARAMETER;
184 return 1; /* invalid parameter */ 196 return 1; /* invalid parameter */
185 } 197 }
@@ -262,11 +274,11 @@ int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu)
262 break; 274 break;
263 case SIGP_STOP: 275 case SIGP_STOP:
264 vcpu->stat.instruction_sigp_stop++; 276 vcpu->stat.instruction_sigp_stop++;
265 rc = __sigp_stop(vcpu, cpu_addr, 0); 277 rc = __sigp_stop(vcpu, cpu_addr, ACTION_STOP_ON_STOP);
266 break; 278 break;
267 case SIGP_STOP_STORE_STATUS: 279 case SIGP_STOP_STORE_STATUS:
268 vcpu->stat.instruction_sigp_stop++; 280 vcpu->stat.instruction_sigp_stop++;
269 rc = __sigp_stop(vcpu, cpu_addr, 1); 281 rc = __sigp_stop(vcpu, cpu_addr, ACTION_STORE_ON_STOP);
270 break; 282 break;
271 case SIGP_SET_ARCH: 283 case SIGP_SET_ARCH:
272 vcpu->stat.instruction_sigp_arch++; 284 vcpu->stat.instruction_sigp_arch++;
diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h
index 7386bfa4f4bc..3b62da926de9 100644
--- a/arch/x86/include/asm/apicdef.h
+++ b/arch/x86/include/asm/apicdef.h
@@ -15,6 +15,7 @@
15 15
16#define APIC_LVR 0x30 16#define APIC_LVR 0x30
17#define APIC_LVR_MASK 0xFF00FF 17#define APIC_LVR_MASK 0xFF00FF
18#define APIC_LVR_DIRECTED_EOI (1 << 24)
18#define GET_APIC_VERSION(x) ((x) & 0xFFu) 19#define GET_APIC_VERSION(x) ((x) & 0xFFu)
19#define GET_APIC_MAXLVT(x) (((x) >> 16) & 0xFFu) 20#define GET_APIC_MAXLVT(x) (((x) >> 16) & 0xFFu)
20#ifdef CONFIG_X86_32 21#ifdef CONFIG_X86_32
@@ -41,6 +42,7 @@
41#define APIC_DFR_CLUSTER 0x0FFFFFFFul 42#define APIC_DFR_CLUSTER 0x0FFFFFFFul
42#define APIC_DFR_FLAT 0xFFFFFFFFul 43#define APIC_DFR_FLAT 0xFFFFFFFFul
43#define APIC_SPIV 0xF0 44#define APIC_SPIV 0xF0
45#define APIC_SPIV_DIRECTED_EOI (1 << 12)
44#define APIC_SPIV_FOCUS_DISABLED (1 << 9) 46#define APIC_SPIV_FOCUS_DISABLED (1 << 9)
45#define APIC_SPIV_APIC_ENABLED (1 << 8) 47#define APIC_SPIV_APIC_ENABLED (1 << 8)
46#define APIC_ISR 0x100 48#define APIC_ISR 0x100
diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h
index 125be8b19568..4a5fe914dc59 100644
--- a/arch/x86/include/asm/kvm.h
+++ b/arch/x86/include/asm/kvm.h
@@ -17,6 +17,8 @@
17#define __KVM_HAVE_USER_NMI 17#define __KVM_HAVE_USER_NMI
18#define __KVM_HAVE_GUEST_DEBUG 18#define __KVM_HAVE_GUEST_DEBUG
19#define __KVM_HAVE_MSIX 19#define __KVM_HAVE_MSIX
20#define __KVM_HAVE_MCE
21#define __KVM_HAVE_PIT_STATE2
20 22
21/* Architectural interrupt line count. */ 23/* Architectural interrupt line count. */
22#define KVM_NR_INTERRUPTS 256 24#define KVM_NR_INTERRUPTS 256
@@ -236,6 +238,14 @@ struct kvm_pit_state {
236 struct kvm_pit_channel_state channels[3]; 238 struct kvm_pit_channel_state channels[3];
237}; 239};
238 240
241#define KVM_PIT_FLAGS_HPET_LEGACY 0x00000001
242
243struct kvm_pit_state2 {
244 struct kvm_pit_channel_state channels[3];
245 __u32 flags;
246 __u32 reserved[9];
247};
248
239struct kvm_reinject_control { 249struct kvm_reinject_control {
240 __u8 pit_reinject; 250 __u8 pit_reinject;
241 __u8 reserved[31]; 251 __u8 reserved[31];
diff --git a/arch/x86/include/asm/kvm_x86_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index b7ed2c423116..b7ed2c423116 100644
--- a/arch/x86/include/asm/kvm_x86_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index eabdc1cfab5c..3be000435fad 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -14,6 +14,7 @@
14#include <linux/types.h> 14#include <linux/types.h>
15#include <linux/mm.h> 15#include <linux/mm.h>
16#include <linux/mmu_notifier.h> 16#include <linux/mmu_notifier.h>
17#include <linux/tracepoint.h>
17 18
18#include <linux/kvm.h> 19#include <linux/kvm.h>
19#include <linux/kvm_para.h> 20#include <linux/kvm_para.h>
@@ -37,12 +38,14 @@
37#define CR3_L_MODE_RESERVED_BITS (CR3_NONPAE_RESERVED_BITS | \ 38#define CR3_L_MODE_RESERVED_BITS (CR3_NONPAE_RESERVED_BITS | \
38 0xFFFFFF0000000000ULL) 39 0xFFFFFF0000000000ULL)
39 40
40#define KVM_GUEST_CR0_MASK \ 41#define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST \
41 (X86_CR0_PG | X86_CR0_PE | X86_CR0_WP | X86_CR0_NE \ 42 (X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD)
42 | X86_CR0_NW | X86_CR0_CD) 43#define KVM_GUEST_CR0_MASK \
44 (KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE)
45#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST \
46 (X86_CR0_WP | X86_CR0_NE | X86_CR0_TS | X86_CR0_MP)
43#define KVM_VM_CR0_ALWAYS_ON \ 47#define KVM_VM_CR0_ALWAYS_ON \
44 (X86_CR0_PG | X86_CR0_PE | X86_CR0_WP | X86_CR0_NE | X86_CR0_TS \ 48 (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE)
45 | X86_CR0_MP)
46#define KVM_GUEST_CR4_MASK \ 49#define KVM_GUEST_CR4_MASK \
47 (X86_CR4_VME | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_PGE | X86_CR4_VMXE) 50 (X86_CR4_VME | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_PGE | X86_CR4_VMXE)
48#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE) 51#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE)
@@ -51,12 +54,12 @@
51#define INVALID_PAGE (~(hpa_t)0) 54#define INVALID_PAGE (~(hpa_t)0)
52#define UNMAPPED_GVA (~(gpa_t)0) 55#define UNMAPPED_GVA (~(gpa_t)0)
53 56
54/* shadow tables are PAE even on non-PAE hosts */ 57/* KVM Hugepage definitions for x86 */
55#define KVM_HPAGE_SHIFT 21 58#define KVM_NR_PAGE_SIZES 3
56#define KVM_HPAGE_SIZE (1UL << KVM_HPAGE_SHIFT) 59#define KVM_HPAGE_SHIFT(x) (PAGE_SHIFT + (((x) - 1) * 9))
57#define KVM_HPAGE_MASK (~(KVM_HPAGE_SIZE - 1)) 60#define KVM_HPAGE_SIZE(x) (1UL << KVM_HPAGE_SHIFT(x))
58 61#define KVM_HPAGE_MASK(x) (~(KVM_HPAGE_SIZE(x) - 1))
59#define KVM_PAGES_PER_HPAGE (KVM_HPAGE_SIZE / PAGE_SIZE) 62#define KVM_PAGES_PER_HPAGE(x) (KVM_HPAGE_SIZE(x) / PAGE_SIZE)
60 63
61#define DE_VECTOR 0 64#define DE_VECTOR 0
62#define DB_VECTOR 1 65#define DB_VECTOR 1
@@ -120,6 +123,10 @@ enum kvm_reg {
120 NR_VCPU_REGS 123 NR_VCPU_REGS
121}; 124};
122 125
126enum kvm_reg_ex {
127 VCPU_EXREG_PDPTR = NR_VCPU_REGS,
128};
129
123enum { 130enum {
124 VCPU_SREG_ES, 131 VCPU_SREG_ES,
125 VCPU_SREG_CS, 132 VCPU_SREG_CS,
@@ -131,7 +138,7 @@ enum {
131 VCPU_SREG_LDTR, 138 VCPU_SREG_LDTR,
132}; 139};
133 140
134#include <asm/kvm_x86_emulate.h> 141#include <asm/kvm_emulate.h>
135 142
136#define KVM_NR_MEM_OBJS 40 143#define KVM_NR_MEM_OBJS 40
137 144
@@ -308,7 +315,6 @@ struct kvm_vcpu_arch {
308 struct { 315 struct {
309 gfn_t gfn; /* presumed gfn during guest pte update */ 316 gfn_t gfn; /* presumed gfn during guest pte update */
310 pfn_t pfn; /* pfn corresponding to that gfn */ 317 pfn_t pfn; /* pfn corresponding to that gfn */
311 int largepage;
312 unsigned long mmu_seq; 318 unsigned long mmu_seq;
313 } update_pte; 319 } update_pte;
314 320
@@ -334,16 +340,6 @@ struct kvm_vcpu_arch {
334 u8 nr; 340 u8 nr;
335 } interrupt; 341 } interrupt;
336 342
337 struct {
338 int vm86_active;
339 u8 save_iopl;
340 struct kvm_save_segment {
341 u16 selector;
342 unsigned long base;
343 u32 limit;
344 u32 ar;
345 } tr, es, ds, fs, gs;
346 } rmode;
347 int halt_request; /* real mode on Intel only */ 343 int halt_request; /* real mode on Intel only */
348 344
349 int cpuid_nent; 345 int cpuid_nent;
@@ -366,13 +362,15 @@ struct kvm_vcpu_arch {
366 u32 pat; 362 u32 pat;
367 363
368 int switch_db_regs; 364 int switch_db_regs;
369 unsigned long host_db[KVM_NR_DB_REGS];
370 unsigned long host_dr6;
371 unsigned long host_dr7;
372 unsigned long db[KVM_NR_DB_REGS]; 365 unsigned long db[KVM_NR_DB_REGS];
373 unsigned long dr6; 366 unsigned long dr6;
374 unsigned long dr7; 367 unsigned long dr7;
375 unsigned long eff_db[KVM_NR_DB_REGS]; 368 unsigned long eff_db[KVM_NR_DB_REGS];
369
370 u64 mcg_cap;
371 u64 mcg_status;
372 u64 mcg_ctl;
373 u64 *mce_banks;
376}; 374};
377 375
378struct kvm_mem_alias { 376struct kvm_mem_alias {
@@ -409,6 +407,7 @@ struct kvm_arch{
409 407
410 struct page *ept_identity_pagetable; 408 struct page *ept_identity_pagetable;
411 bool ept_identity_pagetable_done; 409 bool ept_identity_pagetable_done;
410 gpa_t ept_identity_map_addr;
412 411
413 unsigned long irq_sources_bitmap; 412 unsigned long irq_sources_bitmap;
414 unsigned long irq_states[KVM_IOAPIC_NUM_PINS]; 413 unsigned long irq_states[KVM_IOAPIC_NUM_PINS];
@@ -526,6 +525,9 @@ struct kvm_x86_ops {
526 int (*set_tss_addr)(struct kvm *kvm, unsigned int addr); 525 int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
527 int (*get_tdp_level)(void); 526 int (*get_tdp_level)(void);
528 u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio); 527 u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
528 bool (*gb_page_enable)(void);
529
530 const struct trace_print_flags *exit_reasons_str;
529}; 531};
530 532
531extern struct kvm_x86_ops *kvm_x86_ops; 533extern struct kvm_x86_ops *kvm_x86_ops;
@@ -618,6 +620,7 @@ void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr);
618void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code); 620void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code);
619void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long cr2, 621void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long cr2,
620 u32 error_code); 622 u32 error_code);
623bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl);
621 624
622int kvm_pic_set_irq(void *opaque, int irq, int level); 625int kvm_pic_set_irq(void *opaque, int irq, int level);
623 626
@@ -752,8 +755,6 @@ static inline void kvm_inject_gp(struct kvm_vcpu *vcpu, u32 error_code)
752 kvm_queue_exception_e(vcpu, GP_VECTOR, error_code); 755 kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
753} 756}
754 757
755#define MSR_IA32_TIME_STAMP_COUNTER 0x010
756
757#define TSS_IOPB_BASE_OFFSET 0x66 758#define TSS_IOPB_BASE_OFFSET 0x66
758#define TSS_BASE_SIZE 0x68 759#define TSS_BASE_SIZE 0x68
759#define TSS_IOPB_SIZE (65536 / 8) 760#define TSS_IOPB_SIZE (65536 / 8)
@@ -796,5 +797,8 @@ asmlinkage void kvm_handle_fault_on_reboot(void);
796int kvm_unmap_hva(struct kvm *kvm, unsigned long hva); 797int kvm_unmap_hva(struct kvm *kvm, unsigned long hva);
797int kvm_age_hva(struct kvm *kvm, unsigned long hva); 798int kvm_age_hva(struct kvm *kvm, unsigned long hva);
798int cpuid_maxphyaddr(struct kvm_vcpu *vcpu); 799int cpuid_maxphyaddr(struct kvm_vcpu *vcpu);
800int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu);
801int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu);
802int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
799 803
800#endif /* _ASM_X86_KVM_HOST_H */ 804#endif /* _ASM_X86_KVM_HOST_H */
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index b8a3305ae093..c584076a47f4 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -1,6 +1,8 @@
1#ifndef _ASM_X86_KVM_PARA_H 1#ifndef _ASM_X86_KVM_PARA_H
2#define _ASM_X86_KVM_PARA_H 2#define _ASM_X86_KVM_PARA_H
3 3
4#include <linux/types.h>
5
4/* This CPUID returns the signature 'KVMKVMKVM' in ebx, ecx, and edx. It 6/* This CPUID returns the signature 'KVMKVMKVM' in ebx, ecx, and edx. It
5 * should be used to determine that a VM is running under KVM. 7 * should be used to determine that a VM is running under KVM.
6 */ 8 */
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 6be7fc254b59..bd5549034a95 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -374,6 +374,7 @@
374/* AMD-V MSRs */ 374/* AMD-V MSRs */
375 375
376#define MSR_VM_CR 0xc0010114 376#define MSR_VM_CR 0xc0010114
377#define MSR_VM_IGNNE 0xc0010115
377#define MSR_VM_HSAVE_PA 0xc0010117 378#define MSR_VM_HSAVE_PA 0xc0010117
378 379
379#endif /* _ASM_X86_MSR_INDEX_H */ 380#endif /* _ASM_X86_MSR_INDEX_H */
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 11be5ad2e0e9..272514c2d456 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -55,6 +55,7 @@
55#define SECONDARY_EXEC_ENABLE_EPT 0x00000002 55#define SECONDARY_EXEC_ENABLE_EPT 0x00000002
56#define SECONDARY_EXEC_ENABLE_VPID 0x00000020 56#define SECONDARY_EXEC_ENABLE_VPID 0x00000020
57#define SECONDARY_EXEC_WBINVD_EXITING 0x00000040 57#define SECONDARY_EXEC_WBINVD_EXITING 0x00000040
58#define SECONDARY_EXEC_UNRESTRICTED_GUEST 0x00000080
58 59
59 60
60#define PIN_BASED_EXT_INTR_MASK 0x00000001 61#define PIN_BASED_EXT_INTR_MASK 0x00000001
@@ -351,9 +352,16 @@ enum vmcs_field {
351#define VMX_EPT_EXTENT_INDIVIDUAL_ADDR 0 352#define VMX_EPT_EXTENT_INDIVIDUAL_ADDR 0
352#define VMX_EPT_EXTENT_CONTEXT 1 353#define VMX_EPT_EXTENT_CONTEXT 1
353#define VMX_EPT_EXTENT_GLOBAL 2 354#define VMX_EPT_EXTENT_GLOBAL 2
355
356#define VMX_EPT_EXECUTE_ONLY_BIT (1ull)
357#define VMX_EPT_PAGE_WALK_4_BIT (1ull << 6)
358#define VMX_EPTP_UC_BIT (1ull << 8)
359#define VMX_EPTP_WB_BIT (1ull << 14)
360#define VMX_EPT_2MB_PAGE_BIT (1ull << 16)
354#define VMX_EPT_EXTENT_INDIVIDUAL_BIT (1ull << 24) 361#define VMX_EPT_EXTENT_INDIVIDUAL_BIT (1ull << 24)
355#define VMX_EPT_EXTENT_CONTEXT_BIT (1ull << 25) 362#define VMX_EPT_EXTENT_CONTEXT_BIT (1ull << 25)
356#define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26) 363#define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26)
364
357#define VMX_EPT_DEFAULT_GAW 3 365#define VMX_EPT_DEFAULT_GAW 3
358#define VMX_EPT_MAX_GAW 0x4 366#define VMX_EPT_MAX_GAW 0x4
359#define VMX_EPT_MT_EPTE_SHIFT 3 367#define VMX_EPT_MT_EPTE_SHIFT 3
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index c664d515f613..63b0ec8d3d4a 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -34,7 +34,6 @@
34struct kvm_para_state { 34struct kvm_para_state {
35 u8 mmu_queue[MMU_QUEUE_SIZE]; 35 u8 mmu_queue[MMU_QUEUE_SIZE];
36 int mmu_queue_len; 36 int mmu_queue_len;
37 enum paravirt_lazy_mode mode;
38}; 37};
39 38
40static DEFINE_PER_CPU(struct kvm_para_state, para_state); 39static DEFINE_PER_CPU(struct kvm_para_state, para_state);
@@ -77,7 +76,7 @@ static void kvm_deferred_mmu_op(void *buffer, int len)
77{ 76{
78 struct kvm_para_state *state = kvm_para_state(); 77 struct kvm_para_state *state = kvm_para_state();
79 78
80 if (state->mode != PARAVIRT_LAZY_MMU) { 79 if (paravirt_get_lazy_mode() != PARAVIRT_LAZY_MMU) {
81 kvm_mmu_op(buffer, len); 80 kvm_mmu_op(buffer, len);
82 return; 81 return;
83 } 82 }
@@ -185,10 +184,7 @@ static void kvm_release_pt(unsigned long pfn)
185 184
186static void kvm_enter_lazy_mmu(void) 185static void kvm_enter_lazy_mmu(void)
187{ 186{
188 struct kvm_para_state *state = kvm_para_state();
189
190 paravirt_enter_lazy_mmu(); 187 paravirt_enter_lazy_mmu();
191 state->mode = paravirt_get_lazy_mode();
192} 188}
193 189
194static void kvm_leave_lazy_mmu(void) 190static void kvm_leave_lazy_mmu(void)
@@ -197,7 +193,6 @@ static void kvm_leave_lazy_mmu(void)
197 193
198 mmu_queue_flush(state); 194 mmu_queue_flush(state);
199 paravirt_leave_lazy_mmu(); 195 paravirt_leave_lazy_mmu();
200 state->mode = paravirt_get_lazy_mode();
201} 196}
202 197
203static void __init paravirt_ops_setup(void) 198static void __init paravirt_ops_setup(void)
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 223af43f1526..e5efcdcca31b 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -50,8 +50,8 @@ static unsigned long kvm_get_wallclock(void)
50 struct timespec ts; 50 struct timespec ts;
51 int low, high; 51 int low, high;
52 52
53 low = (int)__pa(&wall_clock); 53 low = (int)__pa_symbol(&wall_clock);
54 high = ((u64)__pa(&wall_clock) >> 32); 54 high = ((u64)__pa_symbol(&wall_clock) >> 32);
55 native_write_msr(MSR_KVM_WALL_CLOCK, low, high); 55 native_write_msr(MSR_KVM_WALL_CLOCK, low, high);
56 56
57 vcpu_time = &get_cpu_var(hv_clock); 57 vcpu_time = &get_cpu_var(hv_clock);
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 8600a09e0c6c..b84e571f4175 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -1,12 +1,8 @@
1# 1#
2# KVM configuration 2# KVM configuration
3# 3#
4config HAVE_KVM
5 bool
6 4
7config HAVE_KVM_IRQCHIP 5source "virt/kvm/Kconfig"
8 bool
9 default y
10 6
11menuconfig VIRTUALIZATION 7menuconfig VIRTUALIZATION
12 bool "Virtualization" 8 bool "Virtualization"
@@ -29,6 +25,9 @@ config KVM
29 select PREEMPT_NOTIFIERS 25 select PREEMPT_NOTIFIERS
30 select MMU_NOTIFIER 26 select MMU_NOTIFIER
31 select ANON_INODES 27 select ANON_INODES
28 select HAVE_KVM_IRQCHIP
29 select HAVE_KVM_EVENTFD
30 select KVM_APIC_ARCHITECTURE
32 ---help--- 31 ---help---
33 Support hosting fully virtualized guest machines using hardware 32 Support hosting fully virtualized guest machines using hardware
34 virtualization extensions. You will need a fairly recent 33 virtualization extensions. You will need a fairly recent
@@ -63,18 +62,6 @@ config KVM_AMD
63 To compile this as a module, choose M here: the module 62 To compile this as a module, choose M here: the module
64 will be called kvm-amd. 63 will be called kvm-amd.
65 64
66config KVM_TRACE
67 bool "KVM trace support"
68 depends on KVM && SYSFS
69 select MARKERS
70 select RELAY
71 select DEBUG_FS
72 default n
73 ---help---
74 This option allows reading a trace of kvm-related events through
75 relayfs. Note the ABI is not considered stable and will be
76 modified in future updates.
77
78# OK, it's a little counter-intuitive to do this, but it puts it neatly under 65# OK, it's a little counter-intuitive to do this, but it puts it neatly under
79# the virtualization menu. 66# the virtualization menu.
80source drivers/lguest/Kconfig 67source drivers/lguest/Kconfig
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index b43c4efafe80..0e7fe78d0f74 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -1,22 +1,19 @@
1#
2# Makefile for Kernel-based Virtual Machine module
3#
4
5common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
6 coalesced_mmio.o irq_comm.o)
7ifeq ($(CONFIG_KVM_TRACE),y)
8common-objs += $(addprefix ../../../virt/kvm/, kvm_trace.o)
9endif
10ifeq ($(CONFIG_IOMMU_API),y)
11common-objs += $(addprefix ../../../virt/kvm/, iommu.o)
12endif
13 1
14EXTRA_CFLAGS += -Ivirt/kvm -Iarch/x86/kvm 2EXTRA_CFLAGS += -Ivirt/kvm -Iarch/x86/kvm
15 3
16kvm-objs := $(common-objs) x86.o mmu.o x86_emulate.o i8259.o irq.o lapic.o \ 4CFLAGS_x86.o := -I.
17 i8254.o timer.o 5CFLAGS_svm.o := -I.
18obj-$(CONFIG_KVM) += kvm.o 6CFLAGS_vmx.o := -I.
19kvm-intel-objs = vmx.o 7
20obj-$(CONFIG_KVM_INTEL) += kvm-intel.o 8kvm-y += $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
21kvm-amd-objs = svm.o 9 coalesced_mmio.o irq_comm.o eventfd.o)
22obj-$(CONFIG_KVM_AMD) += kvm-amd.o 10kvm-$(CONFIG_IOMMU_API) += $(addprefix ../../../virt/kvm/, iommu.o)
11
12kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \
13 i8254.o timer.o
14kvm-intel-y += vmx.o
15kvm-amd-y += svm.o
16
17obj-$(CONFIG_KVM) += kvm.o
18obj-$(CONFIG_KVM_INTEL) += kvm-intel.o
19obj-$(CONFIG_KVM_AMD) += kvm-amd.o
diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/emulate.c
index 616de4628d60..1be5cd640e93 100644
--- a/arch/x86/kvm/x86_emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -1,5 +1,5 @@
1/****************************************************************************** 1/******************************************************************************
2 * x86_emulate.c 2 * emulate.c
3 * 3 *
4 * Generic x86 (32-bit and 64-bit) instruction decoder and emulator. 4 * Generic x86 (32-bit and 64-bit) instruction decoder and emulator.
5 * 5 *
@@ -30,7 +30,9 @@
30#define DPRINTF(x...) do {} while (0) 30#define DPRINTF(x...) do {} while (0)
31#endif 31#endif
32#include <linux/module.h> 32#include <linux/module.h>
33#include <asm/kvm_x86_emulate.h> 33#include <asm/kvm_emulate.h>
34
35#include "mmu.h" /* for is_long_mode() */
34 36
35/* 37/*
36 * Opcode effective-address decode tables. 38 * Opcode effective-address decode tables.
@@ -60,6 +62,7 @@
60#define SrcImmByte (6<<4) /* 8-bit sign-extended immediate operand. */ 62#define SrcImmByte (6<<4) /* 8-bit sign-extended immediate operand. */
61#define SrcOne (7<<4) /* Implied '1' */ 63#define SrcOne (7<<4) /* Implied '1' */
62#define SrcImmUByte (8<<4) /* 8-bit unsigned immediate operand. */ 64#define SrcImmUByte (8<<4) /* 8-bit unsigned immediate operand. */
65#define SrcImmU (9<<4) /* Immediate operand, unsigned */
63#define SrcMask (0xf<<4) 66#define SrcMask (0xf<<4)
64/* Generic ModRM decode. */ 67/* Generic ModRM decode. */
65#define ModRM (1<<8) 68#define ModRM (1<<8)
@@ -97,11 +100,11 @@ static u32 opcode_table[256] = {
97 /* 0x10 - 0x17 */ 100 /* 0x10 - 0x17 */
98 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 101 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
99 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, 102 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
100 0, 0, 0, 0, 103 ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, 0, 0,
101 /* 0x18 - 0x1F */ 104 /* 0x18 - 0x1F */
102 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 105 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
103 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, 106 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
104 0, 0, 0, 0, 107 ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, 0, 0,
105 /* 0x20 - 0x27 */ 108 /* 0x20 - 0x27 */
106 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 109 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
107 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, 110 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
@@ -195,7 +198,7 @@ static u32 opcode_table[256] = {
195 ByteOp | SrcImmUByte, SrcImmUByte, 198 ByteOp | SrcImmUByte, SrcImmUByte,
196 /* 0xE8 - 0xEF */ 199 /* 0xE8 - 0xEF */
197 SrcImm | Stack, SrcImm | ImplicitOps, 200 SrcImm | Stack, SrcImm | ImplicitOps,
198 SrcImm | Src2Imm16, SrcImmByte | ImplicitOps, 201 SrcImmU | Src2Imm16, SrcImmByte | ImplicitOps,
199 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, 202 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps,
200 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, 203 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps,
201 /* 0xF0 - 0xF7 */ 204 /* 0xF0 - 0xF7 */
@@ -208,7 +211,7 @@ static u32 opcode_table[256] = {
208 211
209static u32 twobyte_table[256] = { 212static u32 twobyte_table[256] = {
210 /* 0x00 - 0x0F */ 213 /* 0x00 - 0x0F */
211 0, Group | GroupDual | Group7, 0, 0, 0, 0, ImplicitOps, 0, 214 0, Group | GroupDual | Group7, 0, 0, 0, ImplicitOps, ImplicitOps, 0,
212 ImplicitOps, ImplicitOps, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 215 ImplicitOps, ImplicitOps, 0, 0, 0, ImplicitOps | ModRM, 0, 0,
213 /* 0x10 - 0x1F */ 216 /* 0x10 - 0x1F */
214 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, 0, 217 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, 0,
@@ -216,7 +219,9 @@ static u32 twobyte_table[256] = {
216 ModRM | ImplicitOps, ModRM, ModRM | ImplicitOps, ModRM, 0, 0, 0, 0, 219 ModRM | ImplicitOps, ModRM, ModRM | ImplicitOps, ModRM, 0, 0, 0, 0,
217 0, 0, 0, 0, 0, 0, 0, 0, 220 0, 0, 0, 0, 0, 0, 0, 0,
218 /* 0x30 - 0x3F */ 221 /* 0x30 - 0x3F */
219 ImplicitOps, 0, ImplicitOps, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 222 ImplicitOps, 0, ImplicitOps, 0,
223 ImplicitOps, ImplicitOps, 0, 0,
224 0, 0, 0, 0, 0, 0, 0, 0,
220 /* 0x40 - 0x47 */ 225 /* 0x40 - 0x47 */
221 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, 226 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
222 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, 227 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
@@ -319,8 +324,11 @@ static u32 group2_table[] = {
319}; 324};
320 325
321/* EFLAGS bit definitions. */ 326/* EFLAGS bit definitions. */
327#define EFLG_VM (1<<17)
328#define EFLG_RF (1<<16)
322#define EFLG_OF (1<<11) 329#define EFLG_OF (1<<11)
323#define EFLG_DF (1<<10) 330#define EFLG_DF (1<<10)
331#define EFLG_IF (1<<9)
324#define EFLG_SF (1<<7) 332#define EFLG_SF (1<<7)
325#define EFLG_ZF (1<<6) 333#define EFLG_ZF (1<<6)
326#define EFLG_AF (1<<4) 334#define EFLG_AF (1<<4)
@@ -1027,6 +1035,7 @@ done_prefixes:
1027 c->src.type = OP_MEM; 1035 c->src.type = OP_MEM;
1028 break; 1036 break;
1029 case SrcImm: 1037 case SrcImm:
1038 case SrcImmU:
1030 c->src.type = OP_IMM; 1039 c->src.type = OP_IMM;
1031 c->src.ptr = (unsigned long *)c->eip; 1040 c->src.ptr = (unsigned long *)c->eip;
1032 c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; 1041 c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
@@ -1044,6 +1053,19 @@ done_prefixes:
1044 c->src.val = insn_fetch(s32, 4, c->eip); 1053 c->src.val = insn_fetch(s32, 4, c->eip);
1045 break; 1054 break;
1046 } 1055 }
1056 if ((c->d & SrcMask) == SrcImmU) {
1057 switch (c->src.bytes) {
1058 case 1:
1059 c->src.val &= 0xff;
1060 break;
1061 case 2:
1062 c->src.val &= 0xffff;
1063 break;
1064 case 4:
1065 c->src.val &= 0xffffffff;
1066 break;
1067 }
1068 }
1047 break; 1069 break;
1048 case SrcImmByte: 1070 case SrcImmByte:
1049 case SrcImmUByte: 1071 case SrcImmUByte:
@@ -1375,6 +1397,217 @@ static void toggle_interruptibility(struct x86_emulate_ctxt *ctxt, u32 mask)
1375 ctxt->interruptibility = mask; 1397 ctxt->interruptibility = mask;
1376} 1398}
1377 1399
1400static inline void
1401setup_syscalls_segments(struct x86_emulate_ctxt *ctxt,
1402 struct kvm_segment *cs, struct kvm_segment *ss)
1403{
1404 memset(cs, 0, sizeof(struct kvm_segment));
1405 kvm_x86_ops->get_segment(ctxt->vcpu, cs, VCPU_SREG_CS);
1406 memset(ss, 0, sizeof(struct kvm_segment));
1407
1408 cs->l = 0; /* will be adjusted later */
1409 cs->base = 0; /* flat segment */
1410 cs->g = 1; /* 4kb granularity */
1411 cs->limit = 0xffffffff; /* 4GB limit */
1412 cs->type = 0x0b; /* Read, Execute, Accessed */
1413 cs->s = 1;
1414 cs->dpl = 0; /* will be adjusted later */
1415 cs->present = 1;
1416 cs->db = 1;
1417
1418 ss->unusable = 0;
1419 ss->base = 0; /* flat segment */
1420 ss->limit = 0xffffffff; /* 4GB limit */
1421 ss->g = 1; /* 4kb granularity */
1422 ss->s = 1;
1423 ss->type = 0x03; /* Read/Write, Accessed */
1424 ss->db = 1; /* 32bit stack segment */
1425 ss->dpl = 0;
1426 ss->present = 1;
1427}
1428
1429static int
1430emulate_syscall(struct x86_emulate_ctxt *ctxt)
1431{
1432 struct decode_cache *c = &ctxt->decode;
1433 struct kvm_segment cs, ss;
1434 u64 msr_data;
1435
1436 /* syscall is not available in real mode */
1437 if (c->lock_prefix || ctxt->mode == X86EMUL_MODE_REAL
1438 || !(ctxt->vcpu->arch.cr0 & X86_CR0_PE))
1439 return -1;
1440
1441 setup_syscalls_segments(ctxt, &cs, &ss);
1442
1443 kvm_x86_ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data);
1444 msr_data >>= 32;
1445 cs.selector = (u16)(msr_data & 0xfffc);
1446 ss.selector = (u16)(msr_data + 8);
1447
1448 if (is_long_mode(ctxt->vcpu)) {
1449 cs.db = 0;
1450 cs.l = 1;
1451 }
1452 kvm_x86_ops->set_segment(ctxt->vcpu, &cs, VCPU_SREG_CS);
1453 kvm_x86_ops->set_segment(ctxt->vcpu, &ss, VCPU_SREG_SS);
1454
1455 c->regs[VCPU_REGS_RCX] = c->eip;
1456 if (is_long_mode(ctxt->vcpu)) {
1457#ifdef CONFIG_X86_64
1458 c->regs[VCPU_REGS_R11] = ctxt->eflags & ~EFLG_RF;
1459
1460 kvm_x86_ops->get_msr(ctxt->vcpu,
1461 ctxt->mode == X86EMUL_MODE_PROT64 ?
1462 MSR_LSTAR : MSR_CSTAR, &msr_data);
1463 c->eip = msr_data;
1464
1465 kvm_x86_ops->get_msr(ctxt->vcpu, MSR_SYSCALL_MASK, &msr_data);
1466 ctxt->eflags &= ~(msr_data | EFLG_RF);
1467#endif
1468 } else {
1469 /* legacy mode */
1470 kvm_x86_ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data);
1471 c->eip = (u32)msr_data;
1472
1473 ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF);
1474 }
1475
1476 return 0;
1477}
1478
1479static int
1480emulate_sysenter(struct x86_emulate_ctxt *ctxt)
1481{
1482 struct decode_cache *c = &ctxt->decode;
1483 struct kvm_segment cs, ss;
1484 u64 msr_data;
1485
1486 /* inject #UD if LOCK prefix is used */
1487 if (c->lock_prefix)
1488 return -1;
1489
1490 /* inject #GP if in real mode or paging is disabled */
1491 if (ctxt->mode == X86EMUL_MODE_REAL ||
1492 !(ctxt->vcpu->arch.cr0 & X86_CR0_PE)) {
1493 kvm_inject_gp(ctxt->vcpu, 0);
1494 return -1;
1495 }
1496
1497 /* XXX sysenter/sysexit have not been tested in 64bit mode.
1498 * Therefore, we inject an #UD.
1499 */
1500 if (ctxt->mode == X86EMUL_MODE_PROT64)
1501 return -1;
1502
1503 setup_syscalls_segments(ctxt, &cs, &ss);
1504
1505 kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data);
1506 switch (ctxt->mode) {
1507 case X86EMUL_MODE_PROT32:
1508 if ((msr_data & 0xfffc) == 0x0) {
1509 kvm_inject_gp(ctxt->vcpu, 0);
1510 return -1;
1511 }
1512 break;
1513 case X86EMUL_MODE_PROT64:
1514 if (msr_data == 0x0) {
1515 kvm_inject_gp(ctxt->vcpu, 0);
1516 return -1;
1517 }
1518 break;
1519 }
1520
1521 ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF);
1522 cs.selector = (u16)msr_data;
1523 cs.selector &= ~SELECTOR_RPL_MASK;
1524 ss.selector = cs.selector + 8;
1525 ss.selector &= ~SELECTOR_RPL_MASK;
1526 if (ctxt->mode == X86EMUL_MODE_PROT64
1527 || is_long_mode(ctxt->vcpu)) {
1528 cs.db = 0;
1529 cs.l = 1;
1530 }
1531
1532 kvm_x86_ops->set_segment(ctxt->vcpu, &cs, VCPU_SREG_CS);
1533 kvm_x86_ops->set_segment(ctxt->vcpu, &ss, VCPU_SREG_SS);
1534
1535 kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_EIP, &msr_data);
1536 c->eip = msr_data;
1537
1538 kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_ESP, &msr_data);
1539 c->regs[VCPU_REGS_RSP] = msr_data;
1540
1541 return 0;
1542}
1543
1544static int
1545emulate_sysexit(struct x86_emulate_ctxt *ctxt)
1546{
1547 struct decode_cache *c = &ctxt->decode;
1548 struct kvm_segment cs, ss;
1549 u64 msr_data;
1550 int usermode;
1551
1552 /* inject #UD if LOCK prefix is used */
1553 if (c->lock_prefix)
1554 return -1;
1555
1556 /* inject #GP if in real mode or paging is disabled */
1557 if (ctxt->mode == X86EMUL_MODE_REAL
1558 || !(ctxt->vcpu->arch.cr0 & X86_CR0_PE)) {
1559 kvm_inject_gp(ctxt->vcpu, 0);
1560 return -1;
1561 }
1562
1563 /* sysexit must be called from CPL 0 */
1564 if (kvm_x86_ops->get_cpl(ctxt->vcpu) != 0) {
1565 kvm_inject_gp(ctxt->vcpu, 0);
1566 return -1;
1567 }
1568
1569 setup_syscalls_segments(ctxt, &cs, &ss);
1570
1571 if ((c->rex_prefix & 0x8) != 0x0)
1572 usermode = X86EMUL_MODE_PROT64;
1573 else
1574 usermode = X86EMUL_MODE_PROT32;
1575
1576 cs.dpl = 3;
1577 ss.dpl = 3;
1578 kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data);
1579 switch (usermode) {
1580 case X86EMUL_MODE_PROT32:
1581 cs.selector = (u16)(msr_data + 16);
1582 if ((msr_data & 0xfffc) == 0x0) {
1583 kvm_inject_gp(ctxt->vcpu, 0);
1584 return -1;
1585 }
1586 ss.selector = (u16)(msr_data + 24);
1587 break;
1588 case X86EMUL_MODE_PROT64:
1589 cs.selector = (u16)(msr_data + 32);
1590 if (msr_data == 0x0) {
1591 kvm_inject_gp(ctxt->vcpu, 0);
1592 return -1;
1593 }
1594 ss.selector = cs.selector + 8;
1595 cs.db = 0;
1596 cs.l = 1;
1597 break;
1598 }
1599 cs.selector |= SELECTOR_RPL_MASK;
1600 ss.selector |= SELECTOR_RPL_MASK;
1601
1602 kvm_x86_ops->set_segment(ctxt->vcpu, &cs, VCPU_SREG_CS);
1603 kvm_x86_ops->set_segment(ctxt->vcpu, &ss, VCPU_SREG_SS);
1604
1605 c->eip = ctxt->vcpu->arch.regs[VCPU_REGS_RDX];
1606 c->regs[VCPU_REGS_RSP] = ctxt->vcpu->arch.regs[VCPU_REGS_RCX];
1607
1608 return 0;
1609}
1610
1378int 1611int
1379x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) 1612x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1380{ 1613{
@@ -1970,6 +2203,12 @@ twobyte_insn:
1970 goto cannot_emulate; 2203 goto cannot_emulate;
1971 } 2204 }
1972 break; 2205 break;
2206 case 0x05: /* syscall */
2207 if (emulate_syscall(ctxt) == -1)
2208 goto cannot_emulate;
2209 else
2210 goto writeback;
2211 break;
1973 case 0x06: 2212 case 0x06:
1974 emulate_clts(ctxt->vcpu); 2213 emulate_clts(ctxt->vcpu);
1975 c->dst.type = OP_NONE; 2214 c->dst.type = OP_NONE;
@@ -2036,6 +2275,18 @@ twobyte_insn:
2036 rc = X86EMUL_CONTINUE; 2275 rc = X86EMUL_CONTINUE;
2037 c->dst.type = OP_NONE; 2276 c->dst.type = OP_NONE;
2038 break; 2277 break;
2278 case 0x34: /* sysenter */
2279 if (emulate_sysenter(ctxt) == -1)
2280 goto cannot_emulate;
2281 else
2282 goto writeback;
2283 break;
2284 case 0x35: /* sysexit */
2285 if (emulate_sysexit(ctxt) == -1)
2286 goto cannot_emulate;
2287 else
2288 goto writeback;
2289 break;
2039 case 0x40 ... 0x4f: /* cmov */ 2290 case 0x40 ... 0x4f: /* cmov */
2040 c->dst.val = c->dst.orig_val = c->src.val; 2291 c->dst.val = c->dst.orig_val = c->src.val;
2041 if (!test_cc(c->b, ctxt->eflags)) 2292 if (!test_cc(c->b, ctxt->eflags))
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 21f68e00524f..82ad523b4901 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -231,7 +231,7 @@ int pit_has_pending_timer(struct kvm_vcpu *vcpu)
231{ 231{
232 struct kvm_pit *pit = vcpu->kvm->arch.vpit; 232 struct kvm_pit *pit = vcpu->kvm->arch.vpit;
233 233
234 if (pit && vcpu->vcpu_id == 0 && pit->pit_state.irq_ack) 234 if (pit && kvm_vcpu_is_bsp(vcpu) && pit->pit_state.irq_ack)
235 return atomic_read(&pit->pit_state.pit_timer.pending); 235 return atomic_read(&pit->pit_state.pit_timer.pending);
236 return 0; 236 return 0;
237} 237}
@@ -252,7 +252,7 @@ void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu)
252 struct kvm_pit *pit = vcpu->kvm->arch.vpit; 252 struct kvm_pit *pit = vcpu->kvm->arch.vpit;
253 struct hrtimer *timer; 253 struct hrtimer *timer;
254 254
255 if (vcpu->vcpu_id != 0 || !pit) 255 if (!kvm_vcpu_is_bsp(vcpu) || !pit)
256 return; 256 return;
257 257
258 timer = &pit->pit_state.pit_timer.timer; 258 timer = &pit->pit_state.pit_timer.timer;
@@ -294,7 +294,7 @@ static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period)
294 pt->timer.function = kvm_timer_fn; 294 pt->timer.function = kvm_timer_fn;
295 pt->t_ops = &kpit_ops; 295 pt->t_ops = &kpit_ops;
296 pt->kvm = ps->pit->kvm; 296 pt->kvm = ps->pit->kvm;
297 pt->vcpu_id = 0; 297 pt->vcpu = pt->kvm->bsp_vcpu;
298 298
299 atomic_set(&pt->pending, 0); 299 atomic_set(&pt->pending, 0);
300 ps->irq_ack = 1; 300 ps->irq_ack = 1;
@@ -332,33 +332,62 @@ static void pit_load_count(struct kvm *kvm, int channel, u32 val)
332 case 1: 332 case 1:
333 /* FIXME: enhance mode 4 precision */ 333 /* FIXME: enhance mode 4 precision */
334 case 4: 334 case 4:
335 create_pit_timer(ps, val, 0); 335 if (!(ps->flags & KVM_PIT_FLAGS_HPET_LEGACY)) {
336 create_pit_timer(ps, val, 0);
337 }
336 break; 338 break;
337 case 2: 339 case 2:
338 case 3: 340 case 3:
339 create_pit_timer(ps, val, 1); 341 if (!(ps->flags & KVM_PIT_FLAGS_HPET_LEGACY)){
342 create_pit_timer(ps, val, 1);
343 }
340 break; 344 break;
341 default: 345 default:
342 destroy_pit_timer(&ps->pit_timer); 346 destroy_pit_timer(&ps->pit_timer);
343 } 347 }
344} 348}
345 349
346void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val) 350void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val, int hpet_legacy_start)
351{
352 u8 saved_mode;
353 if (hpet_legacy_start) {
354 /* save existing mode for later reenablement */
355 saved_mode = kvm->arch.vpit->pit_state.channels[0].mode;
356 kvm->arch.vpit->pit_state.channels[0].mode = 0xff; /* disable timer */
357 pit_load_count(kvm, channel, val);
358 kvm->arch.vpit->pit_state.channels[0].mode = saved_mode;
359 } else {
360 pit_load_count(kvm, channel, val);
361 }
362}
363
364static inline struct kvm_pit *dev_to_pit(struct kvm_io_device *dev)
365{
366 return container_of(dev, struct kvm_pit, dev);
367}
368
369static inline struct kvm_pit *speaker_to_pit(struct kvm_io_device *dev)
347{ 370{
348 mutex_lock(&kvm->arch.vpit->pit_state.lock); 371 return container_of(dev, struct kvm_pit, speaker_dev);
349 pit_load_count(kvm, channel, val);
350 mutex_unlock(&kvm->arch.vpit->pit_state.lock);
351} 372}
352 373
353static void pit_ioport_write(struct kvm_io_device *this, 374static inline int pit_in_range(gpa_t addr)
354 gpa_t addr, int len, const void *data)
355{ 375{
356 struct kvm_pit *pit = (struct kvm_pit *)this->private; 376 return ((addr >= KVM_PIT_BASE_ADDRESS) &&
377 (addr < KVM_PIT_BASE_ADDRESS + KVM_PIT_MEM_LENGTH));
378}
379
380static int pit_ioport_write(struct kvm_io_device *this,
381 gpa_t addr, int len, const void *data)
382{
383 struct kvm_pit *pit = dev_to_pit(this);
357 struct kvm_kpit_state *pit_state = &pit->pit_state; 384 struct kvm_kpit_state *pit_state = &pit->pit_state;
358 struct kvm *kvm = pit->kvm; 385 struct kvm *kvm = pit->kvm;
359 int channel, access; 386 int channel, access;
360 struct kvm_kpit_channel_state *s; 387 struct kvm_kpit_channel_state *s;
361 u32 val = *(u32 *) data; 388 u32 val = *(u32 *) data;
389 if (!pit_in_range(addr))
390 return -EOPNOTSUPP;
362 391
363 val &= 0xff; 392 val &= 0xff;
364 addr &= KVM_PIT_CHANNEL_MASK; 393 addr &= KVM_PIT_CHANNEL_MASK;
@@ -421,16 +450,19 @@ static void pit_ioport_write(struct kvm_io_device *this,
421 } 450 }
422 451
423 mutex_unlock(&pit_state->lock); 452 mutex_unlock(&pit_state->lock);
453 return 0;
424} 454}
425 455
426static void pit_ioport_read(struct kvm_io_device *this, 456static int pit_ioport_read(struct kvm_io_device *this,
427 gpa_t addr, int len, void *data) 457 gpa_t addr, int len, void *data)
428{ 458{
429 struct kvm_pit *pit = (struct kvm_pit *)this->private; 459 struct kvm_pit *pit = dev_to_pit(this);
430 struct kvm_kpit_state *pit_state = &pit->pit_state; 460 struct kvm_kpit_state *pit_state = &pit->pit_state;
431 struct kvm *kvm = pit->kvm; 461 struct kvm *kvm = pit->kvm;
432 int ret, count; 462 int ret, count;
433 struct kvm_kpit_channel_state *s; 463 struct kvm_kpit_channel_state *s;
464 if (!pit_in_range(addr))
465 return -EOPNOTSUPP;
434 466
435 addr &= KVM_PIT_CHANNEL_MASK; 467 addr &= KVM_PIT_CHANNEL_MASK;
436 s = &pit_state->channels[addr]; 468 s = &pit_state->channels[addr];
@@ -485,37 +517,36 @@ static void pit_ioport_read(struct kvm_io_device *this,
485 memcpy(data, (char *)&ret, len); 517 memcpy(data, (char *)&ret, len);
486 518
487 mutex_unlock(&pit_state->lock); 519 mutex_unlock(&pit_state->lock);
520 return 0;
488} 521}
489 522
490static int pit_in_range(struct kvm_io_device *this, gpa_t addr, 523static int speaker_ioport_write(struct kvm_io_device *this,
491 int len, int is_write) 524 gpa_t addr, int len, const void *data)
492{
493 return ((addr >= KVM_PIT_BASE_ADDRESS) &&
494 (addr < KVM_PIT_BASE_ADDRESS + KVM_PIT_MEM_LENGTH));
495}
496
497static void speaker_ioport_write(struct kvm_io_device *this,
498 gpa_t addr, int len, const void *data)
499{ 525{
500 struct kvm_pit *pit = (struct kvm_pit *)this->private; 526 struct kvm_pit *pit = speaker_to_pit(this);
501 struct kvm_kpit_state *pit_state = &pit->pit_state; 527 struct kvm_kpit_state *pit_state = &pit->pit_state;
502 struct kvm *kvm = pit->kvm; 528 struct kvm *kvm = pit->kvm;
503 u32 val = *(u32 *) data; 529 u32 val = *(u32 *) data;
530 if (addr != KVM_SPEAKER_BASE_ADDRESS)
531 return -EOPNOTSUPP;
504 532
505 mutex_lock(&pit_state->lock); 533 mutex_lock(&pit_state->lock);
506 pit_state->speaker_data_on = (val >> 1) & 1; 534 pit_state->speaker_data_on = (val >> 1) & 1;
507 pit_set_gate(kvm, 2, val & 1); 535 pit_set_gate(kvm, 2, val & 1);
508 mutex_unlock(&pit_state->lock); 536 mutex_unlock(&pit_state->lock);
537 return 0;
509} 538}
510 539
511static void speaker_ioport_read(struct kvm_io_device *this, 540static int speaker_ioport_read(struct kvm_io_device *this,
512 gpa_t addr, int len, void *data) 541 gpa_t addr, int len, void *data)
513{ 542{
514 struct kvm_pit *pit = (struct kvm_pit *)this->private; 543 struct kvm_pit *pit = speaker_to_pit(this);
515 struct kvm_kpit_state *pit_state = &pit->pit_state; 544 struct kvm_kpit_state *pit_state = &pit->pit_state;
516 struct kvm *kvm = pit->kvm; 545 struct kvm *kvm = pit->kvm;
517 unsigned int refresh_clock; 546 unsigned int refresh_clock;
518 int ret; 547 int ret;
548 if (addr != KVM_SPEAKER_BASE_ADDRESS)
549 return -EOPNOTSUPP;
519 550
520 /* Refresh clock toggles at about 15us. We approximate as 2^14ns. */ 551 /* Refresh clock toggles at about 15us. We approximate as 2^14ns. */
521 refresh_clock = ((unsigned int)ktime_to_ns(ktime_get()) >> 14) & 1; 552 refresh_clock = ((unsigned int)ktime_to_ns(ktime_get()) >> 14) & 1;
@@ -527,12 +558,7 @@ static void speaker_ioport_read(struct kvm_io_device *this,
527 len = sizeof(ret); 558 len = sizeof(ret);
528 memcpy(data, (char *)&ret, len); 559 memcpy(data, (char *)&ret, len);
529 mutex_unlock(&pit_state->lock); 560 mutex_unlock(&pit_state->lock);
530} 561 return 0;
531
532static int speaker_in_range(struct kvm_io_device *this, gpa_t addr,
533 int len, int is_write)
534{
535 return (addr == KVM_SPEAKER_BASE_ADDRESS);
536} 562}
537 563
538void kvm_pit_reset(struct kvm_pit *pit) 564void kvm_pit_reset(struct kvm_pit *pit)
@@ -541,6 +567,7 @@ void kvm_pit_reset(struct kvm_pit *pit)
541 struct kvm_kpit_channel_state *c; 567 struct kvm_kpit_channel_state *c;
542 568
543 mutex_lock(&pit->pit_state.lock); 569 mutex_lock(&pit->pit_state.lock);
570 pit->pit_state.flags = 0;
544 for (i = 0; i < 3; i++) { 571 for (i = 0; i < 3; i++) {
545 c = &pit->pit_state.channels[i]; 572 c = &pit->pit_state.channels[i];
546 c->mode = 0xff; 573 c->mode = 0xff;
@@ -563,10 +590,22 @@ static void pit_mask_notifer(struct kvm_irq_mask_notifier *kimn, bool mask)
563 } 590 }
564} 591}
565 592
566struct kvm_pit *kvm_create_pit(struct kvm *kvm) 593static const struct kvm_io_device_ops pit_dev_ops = {
594 .read = pit_ioport_read,
595 .write = pit_ioport_write,
596};
597
598static const struct kvm_io_device_ops speaker_dev_ops = {
599 .read = speaker_ioport_read,
600 .write = speaker_ioport_write,
601};
602
603/* Caller must have writers lock on slots_lock */
604struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
567{ 605{
568 struct kvm_pit *pit; 606 struct kvm_pit *pit;
569 struct kvm_kpit_state *pit_state; 607 struct kvm_kpit_state *pit_state;
608 int ret;
570 609
571 pit = kzalloc(sizeof(struct kvm_pit), GFP_KERNEL); 610 pit = kzalloc(sizeof(struct kvm_pit), GFP_KERNEL);
572 if (!pit) 611 if (!pit)
@@ -582,19 +621,6 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm)
582 mutex_lock(&pit->pit_state.lock); 621 mutex_lock(&pit->pit_state.lock);
583 spin_lock_init(&pit->pit_state.inject_lock); 622 spin_lock_init(&pit->pit_state.inject_lock);
584 623
585 /* Initialize PIO device */
586 pit->dev.read = pit_ioport_read;
587 pit->dev.write = pit_ioport_write;
588 pit->dev.in_range = pit_in_range;
589 pit->dev.private = pit;
590 kvm_io_bus_register_dev(&kvm->pio_bus, &pit->dev);
591
592 pit->speaker_dev.read = speaker_ioport_read;
593 pit->speaker_dev.write = speaker_ioport_write;
594 pit->speaker_dev.in_range = speaker_in_range;
595 pit->speaker_dev.private = pit;
596 kvm_io_bus_register_dev(&kvm->pio_bus, &pit->speaker_dev);
597
598 kvm->arch.vpit = pit; 624 kvm->arch.vpit = pit;
599 pit->kvm = kvm; 625 pit->kvm = kvm;
600 626
@@ -613,7 +639,30 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm)
613 pit->mask_notifier.func = pit_mask_notifer; 639 pit->mask_notifier.func = pit_mask_notifer;
614 kvm_register_irq_mask_notifier(kvm, 0, &pit->mask_notifier); 640 kvm_register_irq_mask_notifier(kvm, 0, &pit->mask_notifier);
615 641
642 kvm_iodevice_init(&pit->dev, &pit_dev_ops);
643 ret = __kvm_io_bus_register_dev(&kvm->pio_bus, &pit->dev);
644 if (ret < 0)
645 goto fail;
646
647 if (flags & KVM_PIT_SPEAKER_DUMMY) {
648 kvm_iodevice_init(&pit->speaker_dev, &speaker_dev_ops);
649 ret = __kvm_io_bus_register_dev(&kvm->pio_bus,
650 &pit->speaker_dev);
651 if (ret < 0)
652 goto fail_unregister;
653 }
654
616 return pit; 655 return pit;
656
657fail_unregister:
658 __kvm_io_bus_unregister_dev(&kvm->pio_bus, &pit->dev);
659
660fail:
661 if (pit->irq_source_id >= 0)
662 kvm_free_irq_source_id(kvm, pit->irq_source_id);
663
664 kfree(pit);
665 return NULL;
617} 666}
618 667
619void kvm_free_pit(struct kvm *kvm) 668void kvm_free_pit(struct kvm *kvm)
@@ -623,6 +672,8 @@ void kvm_free_pit(struct kvm *kvm)
623 if (kvm->arch.vpit) { 672 if (kvm->arch.vpit) {
624 kvm_unregister_irq_mask_notifier(kvm, 0, 673 kvm_unregister_irq_mask_notifier(kvm, 0,
625 &kvm->arch.vpit->mask_notifier); 674 &kvm->arch.vpit->mask_notifier);
675 kvm_unregister_irq_ack_notifier(kvm,
676 &kvm->arch.vpit->pit_state.irq_ack_notifier);
626 mutex_lock(&kvm->arch.vpit->pit_state.lock); 677 mutex_lock(&kvm->arch.vpit->pit_state.lock);
627 timer = &kvm->arch.vpit->pit_state.pit_timer.timer; 678 timer = &kvm->arch.vpit->pit_state.pit_timer.timer;
628 hrtimer_cancel(timer); 679 hrtimer_cancel(timer);
@@ -637,10 +688,10 @@ static void __inject_pit_timer_intr(struct kvm *kvm)
637 struct kvm_vcpu *vcpu; 688 struct kvm_vcpu *vcpu;
638 int i; 689 int i;
639 690
640 mutex_lock(&kvm->lock); 691 mutex_lock(&kvm->irq_lock);
641 kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1); 692 kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1);
642 kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0); 693 kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0);
643 mutex_unlock(&kvm->lock); 694 mutex_unlock(&kvm->irq_lock);
644 695
645 /* 696 /*
646 * Provides NMI watchdog support via Virtual Wire mode. 697 * Provides NMI watchdog support via Virtual Wire mode.
@@ -652,11 +703,8 @@ static void __inject_pit_timer_intr(struct kvm *kvm)
652 * VCPU0, and only if its LVT0 is in EXTINT mode. 703 * VCPU0, and only if its LVT0 is in EXTINT mode.
653 */ 704 */
654 if (kvm->arch.vapics_in_nmi_mode > 0) 705 if (kvm->arch.vapics_in_nmi_mode > 0)
655 for (i = 0; i < KVM_MAX_VCPUS; ++i) { 706 kvm_for_each_vcpu(i, vcpu, kvm)
656 vcpu = kvm->vcpus[i]; 707 kvm_apic_nmi_wd_deliver(vcpu);
657 if (vcpu)
658 kvm_apic_nmi_wd_deliver(vcpu);
659 }
660} 708}
661 709
662void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu) 710void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu)
@@ -665,7 +713,7 @@ void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu)
665 struct kvm *kvm = vcpu->kvm; 713 struct kvm *kvm = vcpu->kvm;
666 struct kvm_kpit_state *ps; 714 struct kvm_kpit_state *ps;
667 715
668 if (vcpu && pit) { 716 if (pit) {
669 int inject = 0; 717 int inject = 0;
670 ps = &pit->pit_state; 718 ps = &pit->pit_state;
671 719
diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h
index bbd863ff60b7..d4c1c7ffdc09 100644
--- a/arch/x86/kvm/i8254.h
+++ b/arch/x86/kvm/i8254.h
@@ -21,6 +21,7 @@ struct kvm_kpit_channel_state {
21 21
22struct kvm_kpit_state { 22struct kvm_kpit_state {
23 struct kvm_kpit_channel_state channels[3]; 23 struct kvm_kpit_channel_state channels[3];
24 u32 flags;
24 struct kvm_timer pit_timer; 25 struct kvm_timer pit_timer;
25 bool is_periodic; 26 bool is_periodic;
26 u32 speaker_data_on; 27 u32 speaker_data_on;
@@ -49,8 +50,8 @@ struct kvm_pit {
49#define KVM_PIT_CHANNEL_MASK 0x3 50#define KVM_PIT_CHANNEL_MASK 0x3
50 51
51void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu); 52void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu);
52void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val); 53void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val, int hpet_legacy_start);
53struct kvm_pit *kvm_create_pit(struct kvm *kvm); 54struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags);
54void kvm_free_pit(struct kvm *kvm); 55void kvm_free_pit(struct kvm *kvm);
55void kvm_pit_reset(struct kvm_pit *pit); 56void kvm_pit_reset(struct kvm_pit *pit);
56 57
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index 1ccb50c74f18..01f151682802 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -30,50 +30,24 @@
30#include "irq.h" 30#include "irq.h"
31 31
32#include <linux/kvm_host.h> 32#include <linux/kvm_host.h>
33 33#include "trace.h"
34static void pic_lock(struct kvm_pic *s)
35 __acquires(&s->lock)
36{
37 spin_lock(&s->lock);
38}
39
40static void pic_unlock(struct kvm_pic *s)
41 __releases(&s->lock)
42{
43 struct kvm *kvm = s->kvm;
44 unsigned acks = s->pending_acks;
45 bool wakeup = s->wakeup_needed;
46 struct kvm_vcpu *vcpu;
47
48 s->pending_acks = 0;
49 s->wakeup_needed = false;
50
51 spin_unlock(&s->lock);
52
53 while (acks) {
54 kvm_notify_acked_irq(kvm, SELECT_PIC(__ffs(acks)),
55 __ffs(acks));
56 acks &= acks - 1;
57 }
58
59 if (wakeup) {
60 vcpu = s->kvm->vcpus[0];
61 if (vcpu)
62 kvm_vcpu_kick(vcpu);
63 }
64}
65 34
66static void pic_clear_isr(struct kvm_kpic_state *s, int irq) 35static void pic_clear_isr(struct kvm_kpic_state *s, int irq)
67{ 36{
68 s->isr &= ~(1 << irq); 37 s->isr &= ~(1 << irq);
69 s->isr_ack |= (1 << irq); 38 s->isr_ack |= (1 << irq);
39 if (s != &s->pics_state->pics[0])
40 irq += 8;
41 kvm_notify_acked_irq(s->pics_state->kvm, SELECT_PIC(irq), irq);
70} 42}
71 43
72void kvm_pic_clear_isr_ack(struct kvm *kvm) 44void kvm_pic_clear_isr_ack(struct kvm *kvm)
73{ 45{
74 struct kvm_pic *s = pic_irqchip(kvm); 46 struct kvm_pic *s = pic_irqchip(kvm);
47 spin_lock(&s->lock);
75 s->pics[0].isr_ack = 0xff; 48 s->pics[0].isr_ack = 0xff;
76 s->pics[1].isr_ack = 0xff; 49 s->pics[1].isr_ack = 0xff;
50 spin_unlock(&s->lock);
77} 51}
78 52
79/* 53/*
@@ -174,9 +148,9 @@ static void pic_update_irq(struct kvm_pic *s)
174 148
175void kvm_pic_update_irq(struct kvm_pic *s) 149void kvm_pic_update_irq(struct kvm_pic *s)
176{ 150{
177 pic_lock(s); 151 spin_lock(&s->lock);
178 pic_update_irq(s); 152 pic_update_irq(s);
179 pic_unlock(s); 153 spin_unlock(&s->lock);
180} 154}
181 155
182int kvm_pic_set_irq(void *opaque, int irq, int level) 156int kvm_pic_set_irq(void *opaque, int irq, int level)
@@ -184,12 +158,14 @@ int kvm_pic_set_irq(void *opaque, int irq, int level)
184 struct kvm_pic *s = opaque; 158 struct kvm_pic *s = opaque;
185 int ret = -1; 159 int ret = -1;
186 160
187 pic_lock(s); 161 spin_lock(&s->lock);
188 if (irq >= 0 && irq < PIC_NUM_PINS) { 162 if (irq >= 0 && irq < PIC_NUM_PINS) {
189 ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, level); 163 ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, level);
190 pic_update_irq(s); 164 pic_update_irq(s);
165 trace_kvm_pic_set_irq(irq >> 3, irq & 7, s->pics[irq >> 3].elcr,
166 s->pics[irq >> 3].imr, ret == 0);
191 } 167 }
192 pic_unlock(s); 168 spin_unlock(&s->lock);
193 169
194 return ret; 170 return ret;
195} 171}
@@ -217,7 +193,7 @@ int kvm_pic_read_irq(struct kvm *kvm)
217 int irq, irq2, intno; 193 int irq, irq2, intno;
218 struct kvm_pic *s = pic_irqchip(kvm); 194 struct kvm_pic *s = pic_irqchip(kvm);
219 195
220 pic_lock(s); 196 spin_lock(&s->lock);
221 irq = pic_get_irq(&s->pics[0]); 197 irq = pic_get_irq(&s->pics[0]);
222 if (irq >= 0) { 198 if (irq >= 0) {
223 pic_intack(&s->pics[0], irq); 199 pic_intack(&s->pics[0], irq);
@@ -242,8 +218,7 @@ int kvm_pic_read_irq(struct kvm *kvm)
242 intno = s->pics[0].irq_base + irq; 218 intno = s->pics[0].irq_base + irq;
243 } 219 }
244 pic_update_irq(s); 220 pic_update_irq(s);
245 pic_unlock(s); 221 spin_unlock(&s->lock);
246 kvm_notify_acked_irq(kvm, SELECT_PIC(irq), irq);
247 222
248 return intno; 223 return intno;
249} 224}
@@ -252,7 +227,7 @@ void kvm_pic_reset(struct kvm_kpic_state *s)
252{ 227{
253 int irq, irqbase, n; 228 int irq, irqbase, n;
254 struct kvm *kvm = s->pics_state->irq_request_opaque; 229 struct kvm *kvm = s->pics_state->irq_request_opaque;
255 struct kvm_vcpu *vcpu0 = kvm->vcpus[0]; 230 struct kvm_vcpu *vcpu0 = kvm->bsp_vcpu;
256 231
257 if (s == &s->pics_state->pics[0]) 232 if (s == &s->pics_state->pics[0])
258 irqbase = 0; 233 irqbase = 0;
@@ -263,7 +238,7 @@ void kvm_pic_reset(struct kvm_kpic_state *s)
263 if (vcpu0 && kvm_apic_accept_pic_intr(vcpu0)) 238 if (vcpu0 && kvm_apic_accept_pic_intr(vcpu0))
264 if (s->irr & (1 << irq) || s->isr & (1 << irq)) { 239 if (s->irr & (1 << irq) || s->isr & (1 << irq)) {
265 n = irq + irqbase; 240 n = irq + irqbase;
266 s->pics_state->pending_acks |= 1 << n; 241 kvm_notify_acked_irq(kvm, SELECT_PIC(n), n);
267 } 242 }
268 } 243 }
269 s->last_irr = 0; 244 s->last_irr = 0;
@@ -428,8 +403,7 @@ static u32 elcr_ioport_read(void *opaque, u32 addr1)
428 return s->elcr; 403 return s->elcr;
429} 404}
430 405
431static int picdev_in_range(struct kvm_io_device *this, gpa_t addr, 406static int picdev_in_range(gpa_t addr)
432 int len, int is_write)
433{ 407{
434 switch (addr) { 408 switch (addr) {
435 case 0x20: 409 case 0x20:
@@ -444,18 +418,25 @@ static int picdev_in_range(struct kvm_io_device *this, gpa_t addr,
444 } 418 }
445} 419}
446 420
447static void picdev_write(struct kvm_io_device *this, 421static inline struct kvm_pic *to_pic(struct kvm_io_device *dev)
422{
423 return container_of(dev, struct kvm_pic, dev);
424}
425
426static int picdev_write(struct kvm_io_device *this,
448 gpa_t addr, int len, const void *val) 427 gpa_t addr, int len, const void *val)
449{ 428{
450 struct kvm_pic *s = this->private; 429 struct kvm_pic *s = to_pic(this);
451 unsigned char data = *(unsigned char *)val; 430 unsigned char data = *(unsigned char *)val;
431 if (!picdev_in_range(addr))
432 return -EOPNOTSUPP;
452 433
453 if (len != 1) { 434 if (len != 1) {
454 if (printk_ratelimit()) 435 if (printk_ratelimit())
455 printk(KERN_ERR "PIC: non byte write\n"); 436 printk(KERN_ERR "PIC: non byte write\n");
456 return; 437 return 0;
457 } 438 }
458 pic_lock(s); 439 spin_lock(&s->lock);
459 switch (addr) { 440 switch (addr) {
460 case 0x20: 441 case 0x20:
461 case 0x21: 442 case 0x21:
@@ -468,21 +449,24 @@ static void picdev_write(struct kvm_io_device *this,
468 elcr_ioport_write(&s->pics[addr & 1], addr, data); 449 elcr_ioport_write(&s->pics[addr & 1], addr, data);
469 break; 450 break;
470 } 451 }
471 pic_unlock(s); 452 spin_unlock(&s->lock);
453 return 0;
472} 454}
473 455
474static void picdev_read(struct kvm_io_device *this, 456static int picdev_read(struct kvm_io_device *this,
475 gpa_t addr, int len, void *val) 457 gpa_t addr, int len, void *val)
476{ 458{
477 struct kvm_pic *s = this->private; 459 struct kvm_pic *s = to_pic(this);
478 unsigned char data = 0; 460 unsigned char data = 0;
461 if (!picdev_in_range(addr))
462 return -EOPNOTSUPP;
479 463
480 if (len != 1) { 464 if (len != 1) {
481 if (printk_ratelimit()) 465 if (printk_ratelimit())
482 printk(KERN_ERR "PIC: non byte read\n"); 466 printk(KERN_ERR "PIC: non byte read\n");
483 return; 467 return 0;
484 } 468 }
485 pic_lock(s); 469 spin_lock(&s->lock);
486 switch (addr) { 470 switch (addr) {
487 case 0x20: 471 case 0x20:
488 case 0x21: 472 case 0x21:
@@ -496,7 +480,8 @@ static void picdev_read(struct kvm_io_device *this,
496 break; 480 break;
497 } 481 }
498 *(unsigned char *)val = data; 482 *(unsigned char *)val = data;
499 pic_unlock(s); 483 spin_unlock(&s->lock);
484 return 0;
500} 485}
501 486
502/* 487/*
@@ -505,20 +490,27 @@ static void picdev_read(struct kvm_io_device *this,
505static void pic_irq_request(void *opaque, int level) 490static void pic_irq_request(void *opaque, int level)
506{ 491{
507 struct kvm *kvm = opaque; 492 struct kvm *kvm = opaque;
508 struct kvm_vcpu *vcpu = kvm->vcpus[0]; 493 struct kvm_vcpu *vcpu = kvm->bsp_vcpu;
509 struct kvm_pic *s = pic_irqchip(kvm); 494 struct kvm_pic *s = pic_irqchip(kvm);
510 int irq = pic_get_irq(&s->pics[0]); 495 int irq = pic_get_irq(&s->pics[0]);
511 496
512 s->output = level; 497 s->output = level;
513 if (vcpu && level && (s->pics[0].isr_ack & (1 << irq))) { 498 if (vcpu && level && (s->pics[0].isr_ack & (1 << irq))) {
514 s->pics[0].isr_ack &= ~(1 << irq); 499 s->pics[0].isr_ack &= ~(1 << irq);
515 s->wakeup_needed = true; 500 kvm_vcpu_kick(vcpu);
516 } 501 }
517} 502}
518 503
504static const struct kvm_io_device_ops picdev_ops = {
505 .read = picdev_read,
506 .write = picdev_write,
507};
508
519struct kvm_pic *kvm_create_pic(struct kvm *kvm) 509struct kvm_pic *kvm_create_pic(struct kvm *kvm)
520{ 510{
521 struct kvm_pic *s; 511 struct kvm_pic *s;
512 int ret;
513
522 s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL); 514 s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL);
523 if (!s) 515 if (!s)
524 return NULL; 516 return NULL;
@@ -534,10 +526,12 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm)
534 /* 526 /*
535 * Initialize PIO device 527 * Initialize PIO device
536 */ 528 */
537 s->dev.read = picdev_read; 529 kvm_iodevice_init(&s->dev, &picdev_ops);
538 s->dev.write = picdev_write; 530 ret = kvm_io_bus_register_dev(kvm, &kvm->pio_bus, &s->dev);
539 s->dev.in_range = picdev_in_range; 531 if (ret < 0) {
540 s->dev.private = s; 532 kfree(s);
541 kvm_io_bus_register_dev(&kvm->pio_bus, &s->dev); 533 return NULL;
534 }
535
542 return s; 536 return s;
543} 537}
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index 9f593188129e..7d6058a2fd38 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -63,7 +63,6 @@ struct kvm_kpic_state {
63 63
64struct kvm_pic { 64struct kvm_pic {
65 spinlock_t lock; 65 spinlock_t lock;
66 bool wakeup_needed;
67 unsigned pending_acks; 66 unsigned pending_acks;
68 struct kvm *kvm; 67 struct kvm *kvm;
69 struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */ 68 struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */
diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h
index 1ff819dce7d3..7bcc5b6a4403 100644
--- a/arch/x86/kvm/kvm_cache_regs.h
+++ b/arch/x86/kvm/kvm_cache_regs.h
@@ -29,4 +29,13 @@ static inline void kvm_rip_write(struct kvm_vcpu *vcpu, unsigned long val)
29 kvm_register_write(vcpu, VCPU_REGS_RIP, val); 29 kvm_register_write(vcpu, VCPU_REGS_RIP, val);
30} 30}
31 31
32static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index)
33{
34 if (!test_bit(VCPU_EXREG_PDPTR,
35 (unsigned long *)&vcpu->arch.regs_avail))
36 kvm_x86_ops->cache_reg(vcpu, VCPU_EXREG_PDPTR);
37
38 return vcpu->arch.pdptrs[index];
39}
40
32#endif 41#endif
diff --git a/arch/x86/kvm/kvm_svm.h b/arch/x86/kvm/kvm_svm.h
deleted file mode 100644
index ed66e4c078dc..000000000000
--- a/arch/x86/kvm/kvm_svm.h
+++ /dev/null
@@ -1,51 +0,0 @@
1#ifndef __KVM_SVM_H
2#define __KVM_SVM_H
3
4#include <linux/kernel.h>
5#include <linux/types.h>
6#include <linux/list.h>
7#include <linux/kvm_host.h>
8#include <asm/msr.h>
9
10#include <asm/svm.h>
11
12static const u32 host_save_user_msrs[] = {
13#ifdef CONFIG_X86_64
14 MSR_STAR, MSR_LSTAR, MSR_CSTAR, MSR_SYSCALL_MASK, MSR_KERNEL_GS_BASE,
15 MSR_FS_BASE,
16#endif
17 MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
18};
19
20#define NR_HOST_SAVE_USER_MSRS ARRAY_SIZE(host_save_user_msrs)
21
22struct kvm_vcpu;
23
24struct vcpu_svm {
25 struct kvm_vcpu vcpu;
26 struct vmcb *vmcb;
27 unsigned long vmcb_pa;
28 struct svm_cpu_data *svm_data;
29 uint64_t asid_generation;
30
31 u64 next_rip;
32
33 u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS];
34 u64 host_gs_base;
35 unsigned long host_cr2;
36
37 u32 *msrpm;
38 struct vmcb *hsave;
39 u64 hsave_msr;
40
41 u64 nested_vmcb;
42
43 /* These are the merged vectors */
44 u32 *nested_msrpm;
45
46 /* gpa pointers to the real vectors */
47 u64 nested_vmcb_msrpm;
48};
49
50#endif
51
diff --git a/arch/x86/kvm/kvm_timer.h b/arch/x86/kvm/kvm_timer.h
index 26bd6ba74e1c..55c7524dda54 100644
--- a/arch/x86/kvm/kvm_timer.h
+++ b/arch/x86/kvm/kvm_timer.h
@@ -6,7 +6,7 @@ struct kvm_timer {
6 bool reinject; 6 bool reinject;
7 struct kvm_timer_ops *t_ops; 7 struct kvm_timer_ops *t_ops;
8 struct kvm *kvm; 8 struct kvm *kvm;
9 int vcpu_id; 9 struct kvm_vcpu *vcpu;
10}; 10};
11 11
12struct kvm_timer_ops { 12struct kvm_timer_ops {
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index ae99d83f81a3..1ae5ceba7eb2 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -32,8 +32,11 @@
32#include <asm/current.h> 32#include <asm/current.h>
33#include <asm/apicdef.h> 33#include <asm/apicdef.h>
34#include <asm/atomic.h> 34#include <asm/atomic.h>
35#include <asm/apicdef.h>
35#include "kvm_cache_regs.h" 36#include "kvm_cache_regs.h"
36#include "irq.h" 37#include "irq.h"
38#include "trace.h"
39#include "x86.h"
37 40
38#ifndef CONFIG_X86_64 41#ifndef CONFIG_X86_64
39#define mod_64(x, y) ((x) - (y) * div64_u64(x, y)) 42#define mod_64(x, y) ((x) - (y) * div64_u64(x, y))
@@ -141,6 +144,26 @@ static inline int apic_lvt_nmi_mode(u32 lvt_val)
141 return (lvt_val & (APIC_MODE_MASK | APIC_LVT_MASKED)) == APIC_DM_NMI; 144 return (lvt_val & (APIC_MODE_MASK | APIC_LVT_MASKED)) == APIC_DM_NMI;
142} 145}
143 146
147void kvm_apic_set_version(struct kvm_vcpu *vcpu)
148{
149 struct kvm_lapic *apic = vcpu->arch.apic;
150 struct kvm_cpuid_entry2 *feat;
151 u32 v = APIC_VERSION;
152
153 if (!irqchip_in_kernel(vcpu->kvm))
154 return;
155
156 feat = kvm_find_cpuid_entry(apic->vcpu, 0x1, 0);
157 if (feat && (feat->ecx & (1 << (X86_FEATURE_X2APIC & 31))))
158 v |= APIC_LVR_DIRECTED_EOI;
159 apic_set_reg(apic, APIC_LVR, v);
160}
161
162static inline int apic_x2apic_mode(struct kvm_lapic *apic)
163{
164 return apic->vcpu->arch.apic_base & X2APIC_ENABLE;
165}
166
144static unsigned int apic_lvt_mask[APIC_LVT_NUM] = { 167static unsigned int apic_lvt_mask[APIC_LVT_NUM] = {
145 LVT_MASK | APIC_LVT_TIMER_PERIODIC, /* LVTT */ 168 LVT_MASK | APIC_LVT_TIMER_PERIODIC, /* LVTT */
146 LVT_MASK | APIC_MODE_MASK, /* LVTTHMR */ 169 LVT_MASK | APIC_MODE_MASK, /* LVTTHMR */
@@ -165,36 +188,52 @@ static int find_highest_vector(void *bitmap)
165 188
166static inline int apic_test_and_set_irr(int vec, struct kvm_lapic *apic) 189static inline int apic_test_and_set_irr(int vec, struct kvm_lapic *apic)
167{ 190{
191 apic->irr_pending = true;
168 return apic_test_and_set_vector(vec, apic->regs + APIC_IRR); 192 return apic_test_and_set_vector(vec, apic->regs + APIC_IRR);
169} 193}
170 194
171static inline void apic_clear_irr(int vec, struct kvm_lapic *apic) 195static inline int apic_search_irr(struct kvm_lapic *apic)
172{ 196{
173 apic_clear_vector(vec, apic->regs + APIC_IRR); 197 return find_highest_vector(apic->regs + APIC_IRR);
174} 198}
175 199
176static inline int apic_find_highest_irr(struct kvm_lapic *apic) 200static inline int apic_find_highest_irr(struct kvm_lapic *apic)
177{ 201{
178 int result; 202 int result;
179 203
180 result = find_highest_vector(apic->regs + APIC_IRR); 204 if (!apic->irr_pending)
205 return -1;
206
207 result = apic_search_irr(apic);
181 ASSERT(result == -1 || result >= 16); 208 ASSERT(result == -1 || result >= 16);
182 209
183 return result; 210 return result;
184} 211}
185 212
213static inline void apic_clear_irr(int vec, struct kvm_lapic *apic)
214{
215 apic->irr_pending = false;
216 apic_clear_vector(vec, apic->regs + APIC_IRR);
217 if (apic_search_irr(apic) != -1)
218 apic->irr_pending = true;
219}
220
186int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu) 221int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu)
187{ 222{
188 struct kvm_lapic *apic = vcpu->arch.apic; 223 struct kvm_lapic *apic = vcpu->arch.apic;
189 int highest_irr; 224 int highest_irr;
190 225
226 /* This may race with setting of irr in __apic_accept_irq() and
227 * value returned may be wrong, but kvm_vcpu_kick() in __apic_accept_irq
228 * will cause vmexit immediately and the value will be recalculated
229 * on the next vmentry.
230 */
191 if (!apic) 231 if (!apic)
192 return 0; 232 return 0;
193 highest_irr = apic_find_highest_irr(apic); 233 highest_irr = apic_find_highest_irr(apic);
194 234
195 return highest_irr; 235 return highest_irr;
196} 236}
197EXPORT_SYMBOL_GPL(kvm_lapic_find_highest_irr);
198 237
199static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, 238static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
200 int vector, int level, int trig_mode); 239 int vector, int level, int trig_mode);
@@ -251,7 +290,12 @@ int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest)
251int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda) 290int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda)
252{ 291{
253 int result = 0; 292 int result = 0;
254 u8 logical_id; 293 u32 logical_id;
294
295 if (apic_x2apic_mode(apic)) {
296 logical_id = apic_get_reg(apic, APIC_LDR);
297 return logical_id & mda;
298 }
255 299
256 logical_id = GET_APIC_LOGICAL_ID(apic_get_reg(apic, APIC_LDR)); 300 logical_id = GET_APIC_LOGICAL_ID(apic_get_reg(apic, APIC_LDR));
257 301
@@ -331,6 +375,8 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
331 break; 375 break;
332 376
333 result = !apic_test_and_set_irr(vector, apic); 377 result = !apic_test_and_set_irr(vector, apic);
378 trace_kvm_apic_accept_irq(vcpu->vcpu_id, delivery_mode,
379 trig_mode, vector, !result);
334 if (!result) { 380 if (!result) {
335 if (trig_mode) 381 if (trig_mode)
336 apic_debug("level trig mode repeatedly for " 382 apic_debug("level trig mode repeatedly for "
@@ -425,7 +471,11 @@ static void apic_set_eoi(struct kvm_lapic *apic)
425 trigger_mode = IOAPIC_LEVEL_TRIG; 471 trigger_mode = IOAPIC_LEVEL_TRIG;
426 else 472 else
427 trigger_mode = IOAPIC_EDGE_TRIG; 473 trigger_mode = IOAPIC_EDGE_TRIG;
428 kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode); 474 if (!(apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI)) {
475 mutex_lock(&apic->vcpu->kvm->irq_lock);
476 kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode);
477 mutex_unlock(&apic->vcpu->kvm->irq_lock);
478 }
429} 479}
430 480
431static void apic_send_ipi(struct kvm_lapic *apic) 481static void apic_send_ipi(struct kvm_lapic *apic)
@@ -440,7 +490,12 @@ static void apic_send_ipi(struct kvm_lapic *apic)
440 irq.level = icr_low & APIC_INT_ASSERT; 490 irq.level = icr_low & APIC_INT_ASSERT;
441 irq.trig_mode = icr_low & APIC_INT_LEVELTRIG; 491 irq.trig_mode = icr_low & APIC_INT_LEVELTRIG;
442 irq.shorthand = icr_low & APIC_SHORT_MASK; 492 irq.shorthand = icr_low & APIC_SHORT_MASK;
443 irq.dest_id = GET_APIC_DEST_FIELD(icr_high); 493 if (apic_x2apic_mode(apic))
494 irq.dest_id = icr_high;
495 else
496 irq.dest_id = GET_APIC_DEST_FIELD(icr_high);
497
498 trace_kvm_apic_ipi(icr_low, irq.dest_id);
444 499
445 apic_debug("icr_high 0x%x, icr_low 0x%x, " 500 apic_debug("icr_high 0x%x, icr_low 0x%x, "
446 "short_hand 0x%x, dest 0x%x, trig_mode 0x%x, level 0x%x, " 501 "short_hand 0x%x, dest 0x%x, trig_mode 0x%x, level 0x%x, "
@@ -449,7 +504,9 @@ static void apic_send_ipi(struct kvm_lapic *apic)
449 irq.trig_mode, irq.level, irq.dest_mode, irq.delivery_mode, 504 irq.trig_mode, irq.level, irq.dest_mode, irq.delivery_mode,
450 irq.vector); 505 irq.vector);
451 506
507 mutex_lock(&apic->vcpu->kvm->irq_lock);
452 kvm_irq_delivery_to_apic(apic->vcpu->kvm, apic, &irq); 508 kvm_irq_delivery_to_apic(apic->vcpu->kvm, apic, &irq);
509 mutex_unlock(&apic->vcpu->kvm->irq_lock);
453} 510}
454 511
455static u32 apic_get_tmcct(struct kvm_lapic *apic) 512static u32 apic_get_tmcct(struct kvm_lapic *apic)
@@ -495,12 +552,16 @@ static u32 __apic_read(struct kvm_lapic *apic, unsigned int offset)
495{ 552{
496 u32 val = 0; 553 u32 val = 0;
497 554
498 KVMTRACE_1D(APIC_ACCESS, apic->vcpu, (u32)offset, handler);
499
500 if (offset >= LAPIC_MMIO_LENGTH) 555 if (offset >= LAPIC_MMIO_LENGTH)
501 return 0; 556 return 0;
502 557
503 switch (offset) { 558 switch (offset) {
559 case APIC_ID:
560 if (apic_x2apic_mode(apic))
561 val = kvm_apic_id(apic);
562 else
563 val = kvm_apic_id(apic) << 24;
564 break;
504 case APIC_ARBPRI: 565 case APIC_ARBPRI:
505 printk(KERN_WARNING "Access APIC ARBPRI register " 566 printk(KERN_WARNING "Access APIC ARBPRI register "
506 "which is for P6\n"); 567 "which is for P6\n");
@@ -522,21 +583,35 @@ static u32 __apic_read(struct kvm_lapic *apic, unsigned int offset)
522 return val; 583 return val;
523} 584}
524 585
525static void apic_mmio_read(struct kvm_io_device *this, 586static inline struct kvm_lapic *to_lapic(struct kvm_io_device *dev)
526 gpa_t address, int len, void *data) 587{
588 return container_of(dev, struct kvm_lapic, dev);
589}
590
591static int apic_reg_read(struct kvm_lapic *apic, u32 offset, int len,
592 void *data)
527{ 593{
528 struct kvm_lapic *apic = (struct kvm_lapic *)this->private;
529 unsigned int offset = address - apic->base_address;
530 unsigned char alignment = offset & 0xf; 594 unsigned char alignment = offset & 0xf;
531 u32 result; 595 u32 result;
596 /* this bitmask has a bit cleared for each reserver register */
597 static const u64 rmask = 0x43ff01ffffffe70cULL;
532 598
533 if ((alignment + len) > 4) { 599 if ((alignment + len) > 4) {
534 printk(KERN_ERR "KVM_APIC_READ: alignment error %lx %d", 600 apic_debug("KVM_APIC_READ: alignment error %x %d\n",
535 (unsigned long)address, len); 601 offset, len);
536 return; 602 return 1;
537 } 603 }
604
605 if (offset > 0x3f0 || !(rmask & (1ULL << (offset >> 4)))) {
606 apic_debug("KVM_APIC_READ: read reserved register %x\n",
607 offset);
608 return 1;
609 }
610
538 result = __apic_read(apic, offset & ~0xf); 611 result = __apic_read(apic, offset & ~0xf);
539 612
613 trace_kvm_apic_read(offset, result);
614
540 switch (len) { 615 switch (len) {
541 case 1: 616 case 1:
542 case 2: 617 case 2:
@@ -548,6 +623,28 @@ static void apic_mmio_read(struct kvm_io_device *this,
548 "should be 1,2, or 4 instead\n", len); 623 "should be 1,2, or 4 instead\n", len);
549 break; 624 break;
550 } 625 }
626 return 0;
627}
628
629static int apic_mmio_in_range(struct kvm_lapic *apic, gpa_t addr)
630{
631 return apic_hw_enabled(apic) &&
632 addr >= apic->base_address &&
633 addr < apic->base_address + LAPIC_MMIO_LENGTH;
634}
635
636static int apic_mmio_read(struct kvm_io_device *this,
637 gpa_t address, int len, void *data)
638{
639 struct kvm_lapic *apic = to_lapic(this);
640 u32 offset = address - apic->base_address;
641
642 if (!apic_mmio_in_range(apic, address))
643 return -EOPNOTSUPP;
644
645 apic_reg_read(apic, offset, len, data);
646
647 return 0;
551} 648}
552 649
553static void update_divide_count(struct kvm_lapic *apic) 650static void update_divide_count(struct kvm_lapic *apic)
@@ -573,6 +670,15 @@ static void start_apic_timer(struct kvm_lapic *apic)
573 670
574 if (!apic->lapic_timer.period) 671 if (!apic->lapic_timer.period)
575 return; 672 return;
673 /*
674 * Do not allow the guest to program periodic timers with small
675 * interval, since the hrtimers are not throttled by the host
676 * scheduler.
677 */
678 if (apic_lvtt_period(apic)) {
679 if (apic->lapic_timer.period < NSEC_PER_MSEC/2)
680 apic->lapic_timer.period = NSEC_PER_MSEC/2;
681 }
576 682
577 hrtimer_start(&apic->lapic_timer.timer, 683 hrtimer_start(&apic->lapic_timer.timer,
578 ktime_add_ns(now, apic->lapic_timer.period), 684 ktime_add_ns(now, apic->lapic_timer.period),
@@ -603,40 +709,18 @@ static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val)
603 apic->vcpu->kvm->arch.vapics_in_nmi_mode--; 709 apic->vcpu->kvm->arch.vapics_in_nmi_mode--;
604} 710}
605 711
606static void apic_mmio_write(struct kvm_io_device *this, 712static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
607 gpa_t address, int len, const void *data)
608{ 713{
609 struct kvm_lapic *apic = (struct kvm_lapic *)this->private; 714 int ret = 0;
610 unsigned int offset = address - apic->base_address;
611 unsigned char alignment = offset & 0xf;
612 u32 val;
613
614 /*
615 * APIC register must be aligned on 128-bits boundary.
616 * 32/64/128 bits registers must be accessed thru 32 bits.
617 * Refer SDM 8.4.1
618 */
619 if (len != 4 || alignment) {
620 /* Don't shout loud, $infamous_os would cause only noise. */
621 apic_debug("apic write: bad size=%d %lx\n",
622 len, (long)address);
623 return;
624 }
625
626 val = *(u32 *) data;
627
628 /* too common printing */
629 if (offset != APIC_EOI)
630 apic_debug("%s: offset 0x%x with length 0x%x, and value is "
631 "0x%x\n", __func__, offset, len, val);
632
633 offset &= 0xff0;
634 715
635 KVMTRACE_1D(APIC_ACCESS, apic->vcpu, (u32)offset, handler); 716 trace_kvm_apic_write(reg, val);
636 717
637 switch (offset) { 718 switch (reg) {
638 case APIC_ID: /* Local APIC ID */ 719 case APIC_ID: /* Local APIC ID */
639 apic_set_reg(apic, APIC_ID, val); 720 if (!apic_x2apic_mode(apic))
721 apic_set_reg(apic, APIC_ID, val);
722 else
723 ret = 1;
640 break; 724 break;
641 725
642 case APIC_TASKPRI: 726 case APIC_TASKPRI:
@@ -649,15 +733,24 @@ static void apic_mmio_write(struct kvm_io_device *this,
649 break; 733 break;
650 734
651 case APIC_LDR: 735 case APIC_LDR:
652 apic_set_reg(apic, APIC_LDR, val & APIC_LDR_MASK); 736 if (!apic_x2apic_mode(apic))
737 apic_set_reg(apic, APIC_LDR, val & APIC_LDR_MASK);
738 else
739 ret = 1;
653 break; 740 break;
654 741
655 case APIC_DFR: 742 case APIC_DFR:
656 apic_set_reg(apic, APIC_DFR, val | 0x0FFFFFFF); 743 if (!apic_x2apic_mode(apic))
744 apic_set_reg(apic, APIC_DFR, val | 0x0FFFFFFF);
745 else
746 ret = 1;
657 break; 747 break;
658 748
659 case APIC_SPIV: 749 case APIC_SPIV: {
660 apic_set_reg(apic, APIC_SPIV, val & 0x3ff); 750 u32 mask = 0x3ff;
751 if (apic_get_reg(apic, APIC_LVR) & APIC_LVR_DIRECTED_EOI)
752 mask |= APIC_SPIV_DIRECTED_EOI;
753 apic_set_reg(apic, APIC_SPIV, val & mask);
661 if (!(val & APIC_SPIV_APIC_ENABLED)) { 754 if (!(val & APIC_SPIV_APIC_ENABLED)) {
662 int i; 755 int i;
663 u32 lvt_val; 756 u32 lvt_val;
@@ -672,7 +765,7 @@ static void apic_mmio_write(struct kvm_io_device *this,
672 765
673 } 766 }
674 break; 767 break;
675 768 }
676 case APIC_ICR: 769 case APIC_ICR:
677 /* No delay here, so we always clear the pending bit */ 770 /* No delay here, so we always clear the pending bit */
678 apic_set_reg(apic, APIC_ICR, val & ~(1 << 12)); 771 apic_set_reg(apic, APIC_ICR, val & ~(1 << 12));
@@ -680,7 +773,9 @@ static void apic_mmio_write(struct kvm_io_device *this,
680 break; 773 break;
681 774
682 case APIC_ICR2: 775 case APIC_ICR2:
683 apic_set_reg(apic, APIC_ICR2, val & 0xff000000); 776 if (!apic_x2apic_mode(apic))
777 val &= 0xff000000;
778 apic_set_reg(apic, APIC_ICR2, val);
684 break; 779 break;
685 780
686 case APIC_LVT0: 781 case APIC_LVT0:
@@ -694,8 +789,8 @@ static void apic_mmio_write(struct kvm_io_device *this,
694 if (!apic_sw_enabled(apic)) 789 if (!apic_sw_enabled(apic))
695 val |= APIC_LVT_MASKED; 790 val |= APIC_LVT_MASKED;
696 791
697 val &= apic_lvt_mask[(offset - APIC_LVTT) >> 4]; 792 val &= apic_lvt_mask[(reg - APIC_LVTT) >> 4];
698 apic_set_reg(apic, offset, val); 793 apic_set_reg(apic, reg, val);
699 794
700 break; 795 break;
701 796
@@ -703,7 +798,7 @@ static void apic_mmio_write(struct kvm_io_device *this,
703 hrtimer_cancel(&apic->lapic_timer.timer); 798 hrtimer_cancel(&apic->lapic_timer.timer);
704 apic_set_reg(apic, APIC_TMICT, val); 799 apic_set_reg(apic, APIC_TMICT, val);
705 start_apic_timer(apic); 800 start_apic_timer(apic);
706 return; 801 break;
707 802
708 case APIC_TDCR: 803 case APIC_TDCR:
709 if (val & 4) 804 if (val & 4)
@@ -712,27 +807,59 @@ static void apic_mmio_write(struct kvm_io_device *this,
712 update_divide_count(apic); 807 update_divide_count(apic);
713 break; 808 break;
714 809
810 case APIC_ESR:
811 if (apic_x2apic_mode(apic) && val != 0) {
812 printk(KERN_ERR "KVM_WRITE:ESR not zero %x\n", val);
813 ret = 1;
814 }
815 break;
816
817 case APIC_SELF_IPI:
818 if (apic_x2apic_mode(apic)) {
819 apic_reg_write(apic, APIC_ICR, 0x40000 | (val & 0xff));
820 } else
821 ret = 1;
822 break;
715 default: 823 default:
716 apic_debug("Local APIC Write to read-only register %x\n", 824 ret = 1;
717 offset);
718 break; 825 break;
719 } 826 }
720 827 if (ret)
828 apic_debug("Local APIC Write to read-only register %x\n", reg);
829 return ret;
721} 830}
722 831
723static int apic_mmio_range(struct kvm_io_device *this, gpa_t addr, 832static int apic_mmio_write(struct kvm_io_device *this,
724 int len, int size) 833 gpa_t address, int len, const void *data)
725{ 834{
726 struct kvm_lapic *apic = (struct kvm_lapic *)this->private; 835 struct kvm_lapic *apic = to_lapic(this);
727 int ret = 0; 836 unsigned int offset = address - apic->base_address;
837 u32 val;
728 838
839 if (!apic_mmio_in_range(apic, address))
840 return -EOPNOTSUPP;
729 841
730 if (apic_hw_enabled(apic) && 842 /*
731 (addr >= apic->base_address) && 843 * APIC register must be aligned on 128-bits boundary.
732 (addr < (apic->base_address + LAPIC_MMIO_LENGTH))) 844 * 32/64/128 bits registers must be accessed thru 32 bits.
733 ret = 1; 845 * Refer SDM 8.4.1
846 */
847 if (len != 4 || (offset & 0xf)) {
848 /* Don't shout loud, $infamous_os would cause only noise. */
849 apic_debug("apic write: bad size=%d %lx\n", len, (long)address);
850 return 0;
851 }
734 852
735 return ret; 853 val = *(u32*)data;
854
855 /* too common printing */
856 if (offset != APIC_EOI)
857 apic_debug("%s: offset 0x%x with length 0x%x, and value is "
858 "0x%x\n", __func__, offset, len, val);
859
860 apic_reg_write(apic, offset & 0xff0, val);
861
862 return 0;
736} 863}
737 864
738void kvm_free_lapic(struct kvm_vcpu *vcpu) 865void kvm_free_lapic(struct kvm_vcpu *vcpu)
@@ -763,7 +890,6 @@ void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8)
763 apic_set_tpr(apic, ((cr8 & 0x0f) << 4) 890 apic_set_tpr(apic, ((cr8 & 0x0f) << 4)
764 | (apic_get_reg(apic, APIC_TASKPRI) & 4)); 891 | (apic_get_reg(apic, APIC_TASKPRI) & 4));
765} 892}
766EXPORT_SYMBOL_GPL(kvm_lapic_set_tpr);
767 893
768u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu) 894u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu)
769{ 895{
@@ -776,7 +902,6 @@ u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu)
776 902
777 return (tpr & 0xf0) >> 4; 903 return (tpr & 0xf0) >> 4;
778} 904}
779EXPORT_SYMBOL_GPL(kvm_lapic_get_cr8);
780 905
781void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) 906void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
782{ 907{
@@ -787,10 +912,16 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
787 vcpu->arch.apic_base = value; 912 vcpu->arch.apic_base = value;
788 return; 913 return;
789 } 914 }
790 if (apic->vcpu->vcpu_id) 915
916 if (!kvm_vcpu_is_bsp(apic->vcpu))
791 value &= ~MSR_IA32_APICBASE_BSP; 917 value &= ~MSR_IA32_APICBASE_BSP;
792 918
793 vcpu->arch.apic_base = value; 919 vcpu->arch.apic_base = value;
920 if (apic_x2apic_mode(apic)) {
921 u32 id = kvm_apic_id(apic);
922 u32 ldr = ((id & ~0xf) << 16) | (1 << (id & 0xf));
923 apic_set_reg(apic, APIC_LDR, ldr);
924 }
794 apic->base_address = apic->vcpu->arch.apic_base & 925 apic->base_address = apic->vcpu->arch.apic_base &
795 MSR_IA32_APICBASE_BASE; 926 MSR_IA32_APICBASE_BASE;
796 927
@@ -800,12 +931,6 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
800 931
801} 932}
802 933
803u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu)
804{
805 return vcpu->arch.apic_base;
806}
807EXPORT_SYMBOL_GPL(kvm_lapic_get_base);
808
809void kvm_lapic_reset(struct kvm_vcpu *vcpu) 934void kvm_lapic_reset(struct kvm_vcpu *vcpu)
810{ 935{
811 struct kvm_lapic *apic; 936 struct kvm_lapic *apic;
@@ -821,7 +946,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
821 hrtimer_cancel(&apic->lapic_timer.timer); 946 hrtimer_cancel(&apic->lapic_timer.timer);
822 947
823 apic_set_reg(apic, APIC_ID, vcpu->vcpu_id << 24); 948 apic_set_reg(apic, APIC_ID, vcpu->vcpu_id << 24);
824 apic_set_reg(apic, APIC_LVR, APIC_VERSION); 949 kvm_apic_set_version(apic->vcpu);
825 950
826 for (i = 0; i < APIC_LVT_NUM; i++) 951 for (i = 0; i < APIC_LVT_NUM; i++)
827 apic_set_reg(apic, APIC_LVTT + 0x10 * i, APIC_LVT_MASKED); 952 apic_set_reg(apic, APIC_LVTT + 0x10 * i, APIC_LVT_MASKED);
@@ -842,9 +967,10 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
842 apic_set_reg(apic, APIC_ISR + 0x10 * i, 0); 967 apic_set_reg(apic, APIC_ISR + 0x10 * i, 0);
843 apic_set_reg(apic, APIC_TMR + 0x10 * i, 0); 968 apic_set_reg(apic, APIC_TMR + 0x10 * i, 0);
844 } 969 }
970 apic->irr_pending = false;
845 update_divide_count(apic); 971 update_divide_count(apic);
846 atomic_set(&apic->lapic_timer.pending, 0); 972 atomic_set(&apic->lapic_timer.pending, 0);
847 if (vcpu->vcpu_id == 0) 973 if (kvm_vcpu_is_bsp(vcpu))
848 vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP; 974 vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP;
849 apic_update_ppr(apic); 975 apic_update_ppr(apic);
850 976
@@ -855,7 +981,6 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
855 vcpu, kvm_apic_id(apic), 981 vcpu, kvm_apic_id(apic),
856 vcpu->arch.apic_base, apic->base_address); 982 vcpu->arch.apic_base, apic->base_address);
857} 983}
858EXPORT_SYMBOL_GPL(kvm_lapic_reset);
859 984
860bool kvm_apic_present(struct kvm_vcpu *vcpu) 985bool kvm_apic_present(struct kvm_vcpu *vcpu)
861{ 986{
@@ -866,7 +991,6 @@ int kvm_lapic_enabled(struct kvm_vcpu *vcpu)
866{ 991{
867 return kvm_apic_present(vcpu) && apic_sw_enabled(vcpu->arch.apic); 992 return kvm_apic_present(vcpu) && apic_sw_enabled(vcpu->arch.apic);
868} 993}
869EXPORT_SYMBOL_GPL(kvm_lapic_enabled);
870 994
871/* 995/*
872 *---------------------------------------------------------------------- 996 *----------------------------------------------------------------------
@@ -917,6 +1041,11 @@ static struct kvm_timer_ops lapic_timer_ops = {
917 .is_periodic = lapic_is_periodic, 1041 .is_periodic = lapic_is_periodic,
918}; 1042};
919 1043
1044static const struct kvm_io_device_ops apic_mmio_ops = {
1045 .read = apic_mmio_read,
1046 .write = apic_mmio_write,
1047};
1048
920int kvm_create_lapic(struct kvm_vcpu *vcpu) 1049int kvm_create_lapic(struct kvm_vcpu *vcpu)
921{ 1050{
922 struct kvm_lapic *apic; 1051 struct kvm_lapic *apic;
@@ -945,16 +1074,13 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu)
945 apic->lapic_timer.timer.function = kvm_timer_fn; 1074 apic->lapic_timer.timer.function = kvm_timer_fn;
946 apic->lapic_timer.t_ops = &lapic_timer_ops; 1075 apic->lapic_timer.t_ops = &lapic_timer_ops;
947 apic->lapic_timer.kvm = vcpu->kvm; 1076 apic->lapic_timer.kvm = vcpu->kvm;
948 apic->lapic_timer.vcpu_id = vcpu->vcpu_id; 1077 apic->lapic_timer.vcpu = vcpu;
949 1078
950 apic->base_address = APIC_DEFAULT_PHYS_BASE; 1079 apic->base_address = APIC_DEFAULT_PHYS_BASE;
951 vcpu->arch.apic_base = APIC_DEFAULT_PHYS_BASE; 1080 vcpu->arch.apic_base = APIC_DEFAULT_PHYS_BASE;
952 1081
953 kvm_lapic_reset(vcpu); 1082 kvm_lapic_reset(vcpu);
954 apic->dev.read = apic_mmio_read; 1083 kvm_iodevice_init(&apic->dev, &apic_mmio_ops);
955 apic->dev.write = apic_mmio_write;
956 apic->dev.in_range = apic_mmio_range;
957 apic->dev.private = apic;
958 1084
959 return 0; 1085 return 0;
960nomem_free_apic: 1086nomem_free_apic:
@@ -962,7 +1088,6 @@ nomem_free_apic:
962nomem: 1088nomem:
963 return -ENOMEM; 1089 return -ENOMEM;
964} 1090}
965EXPORT_SYMBOL_GPL(kvm_create_lapic);
966 1091
967int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu) 1092int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu)
968{ 1093{
@@ -985,7 +1110,7 @@ int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu)
985 u32 lvt0 = apic_get_reg(vcpu->arch.apic, APIC_LVT0); 1110 u32 lvt0 = apic_get_reg(vcpu->arch.apic, APIC_LVT0);
986 int r = 0; 1111 int r = 0;
987 1112
988 if (vcpu->vcpu_id == 0) { 1113 if (kvm_vcpu_is_bsp(vcpu)) {
989 if (!apic_hw_enabled(vcpu->arch.apic)) 1114 if (!apic_hw_enabled(vcpu->arch.apic))
990 r = 1; 1115 r = 1;
991 if ((lvt0 & APIC_LVT_MASKED) == 0 && 1116 if ((lvt0 & APIC_LVT_MASKED) == 0 &&
@@ -1025,7 +1150,8 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu)
1025 1150
1026 apic->base_address = vcpu->arch.apic_base & 1151 apic->base_address = vcpu->arch.apic_base &
1027 MSR_IA32_APICBASE_BASE; 1152 MSR_IA32_APICBASE_BASE;
1028 apic_set_reg(apic, APIC_LVR, APIC_VERSION); 1153 kvm_apic_set_version(vcpu);
1154
1029 apic_update_ppr(apic); 1155 apic_update_ppr(apic);
1030 hrtimer_cancel(&apic->lapic_timer.timer); 1156 hrtimer_cancel(&apic->lapic_timer.timer);
1031 update_divide_count(apic); 1157 update_divide_count(apic);
@@ -1092,3 +1218,35 @@ void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr)
1092 1218
1093 vcpu->arch.apic->vapic_addr = vapic_addr; 1219 vcpu->arch.apic->vapic_addr = vapic_addr;
1094} 1220}
1221
1222int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1223{
1224 struct kvm_lapic *apic = vcpu->arch.apic;
1225 u32 reg = (msr - APIC_BASE_MSR) << 4;
1226
1227 if (!irqchip_in_kernel(vcpu->kvm) || !apic_x2apic_mode(apic))
1228 return 1;
1229
1230 /* if this is ICR write vector before command */
1231 if (msr == 0x830)
1232 apic_reg_write(apic, APIC_ICR2, (u32)(data >> 32));
1233 return apic_reg_write(apic, reg, (u32)data);
1234}
1235
1236int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data)
1237{
1238 struct kvm_lapic *apic = vcpu->arch.apic;
1239 u32 reg = (msr - APIC_BASE_MSR) << 4, low, high = 0;
1240
1241 if (!irqchip_in_kernel(vcpu->kvm) || !apic_x2apic_mode(apic))
1242 return 1;
1243
1244 if (apic_reg_read(apic, reg, 4, &low))
1245 return 1;
1246 if (msr == 0x830)
1247 apic_reg_read(apic, APIC_ICR2, 4, &high);
1248
1249 *data = (((u64)high) << 32) | low;
1250
1251 return 0;
1252}
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index a587f8349c46..40010b09c4aa 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -12,6 +12,7 @@ struct kvm_lapic {
12 struct kvm_timer lapic_timer; 12 struct kvm_timer lapic_timer;
13 u32 divide_count; 13 u32 divide_count;
14 struct kvm_vcpu *vcpu; 14 struct kvm_vcpu *vcpu;
15 bool irr_pending;
15 struct page *regs_page; 16 struct page *regs_page;
16 void *regs; 17 void *regs;
17 gpa_t vapic_addr; 18 gpa_t vapic_addr;
@@ -28,6 +29,7 @@ u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu);
28void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8); 29void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8);
29void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value); 30void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value);
30u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu); 31u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu);
32void kvm_apic_set_version(struct kvm_vcpu *vcpu);
31 33
32int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest); 34int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest);
33int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda); 35int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda);
@@ -44,4 +46,6 @@ void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr);
44void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu); 46void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu);
45void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu); 47void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu);
46 48
49int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data);
50int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data);
47#endif 51#endif
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 0ef5bb2b4043..eca41ae9f453 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -18,6 +18,7 @@
18 */ 18 */
19 19
20#include "mmu.h" 20#include "mmu.h"
21#include "kvm_cache_regs.h"
21 22
22#include <linux/kvm_host.h> 23#include <linux/kvm_host.h>
23#include <linux/types.h> 24#include <linux/types.h>
@@ -107,6 +108,9 @@ module_param(oos_shadow, bool, 0644);
107 108
108#define PT32_LEVEL_MASK(level) \ 109#define PT32_LEVEL_MASK(level) \
109 (((1ULL << PT32_LEVEL_BITS) - 1) << PT32_LEVEL_SHIFT(level)) 110 (((1ULL << PT32_LEVEL_BITS) - 1) << PT32_LEVEL_SHIFT(level))
111#define PT32_LVL_OFFSET_MASK(level) \
112 (PT32_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
113 * PT32_LEVEL_BITS))) - 1))
110 114
111#define PT32_INDEX(address, level)\ 115#define PT32_INDEX(address, level)\
112 (((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1)) 116 (((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1))
@@ -115,10 +119,19 @@ module_param(oos_shadow, bool, 0644);
115#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1)) 119#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))
116#define PT64_DIR_BASE_ADDR_MASK \ 120#define PT64_DIR_BASE_ADDR_MASK \
117 (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + PT64_LEVEL_BITS)) - 1)) 121 (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + PT64_LEVEL_BITS)) - 1))
122#define PT64_LVL_ADDR_MASK(level) \
123 (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
124 * PT64_LEVEL_BITS))) - 1))
125#define PT64_LVL_OFFSET_MASK(level) \
126 (PT64_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
127 * PT64_LEVEL_BITS))) - 1))
118 128
119#define PT32_BASE_ADDR_MASK PAGE_MASK 129#define PT32_BASE_ADDR_MASK PAGE_MASK
120#define PT32_DIR_BASE_ADDR_MASK \ 130#define PT32_DIR_BASE_ADDR_MASK \
121 (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1)) 131 (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1))
132#define PT32_LVL_ADDR_MASK(level) \
133 (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
134 * PT32_LEVEL_BITS))) - 1))
122 135
123#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \ 136#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \
124 | PT64_NX_MASK) 137 | PT64_NX_MASK)
@@ -129,6 +142,7 @@ module_param(oos_shadow, bool, 0644);
129#define PFERR_RSVD_MASK (1U << 3) 142#define PFERR_RSVD_MASK (1U << 3)
130#define PFERR_FETCH_MASK (1U << 4) 143#define PFERR_FETCH_MASK (1U << 4)
131 144
145#define PT_PDPE_LEVEL 3
132#define PT_DIRECTORY_LEVEL 2 146#define PT_DIRECTORY_LEVEL 2
133#define PT_PAGE_TABLE_LEVEL 1 147#define PT_PAGE_TABLE_LEVEL 1
134 148
@@ -139,10 +153,13 @@ module_param(oos_shadow, bool, 0644);
139#define ACC_USER_MASK PT_USER_MASK 153#define ACC_USER_MASK PT_USER_MASK
140#define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK) 154#define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)
141 155
156#define CREATE_TRACE_POINTS
157#include "mmutrace.h"
158
142#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) 159#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
143 160
144struct kvm_rmap_desc { 161struct kvm_rmap_desc {
145 u64 *shadow_ptes[RMAP_EXT]; 162 u64 *sptes[RMAP_EXT];
146 struct kvm_rmap_desc *more; 163 struct kvm_rmap_desc *more;
147}; 164};
148 165
@@ -239,16 +256,25 @@ static int is_writeble_pte(unsigned long pte)
239 return pte & PT_WRITABLE_MASK; 256 return pte & PT_WRITABLE_MASK;
240} 257}
241 258
242static int is_dirty_pte(unsigned long pte) 259static int is_dirty_gpte(unsigned long pte)
243{ 260{
244 return pte & shadow_dirty_mask; 261 return pte & PT_DIRTY_MASK;
245} 262}
246 263
247static int is_rmap_pte(u64 pte) 264static int is_rmap_spte(u64 pte)
248{ 265{
249 return is_shadow_present_pte(pte); 266 return is_shadow_present_pte(pte);
250} 267}
251 268
269static int is_last_spte(u64 pte, int level)
270{
271 if (level == PT_PAGE_TABLE_LEVEL)
272 return 1;
273 if (is_large_pte(pte))
274 return 1;
275 return 0;
276}
277
252static pfn_t spte_to_pfn(u64 pte) 278static pfn_t spte_to_pfn(u64 pte)
253{ 279{
254 return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; 280 return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
@@ -261,7 +287,7 @@ static gfn_t pse36_gfn_delta(u32 gpte)
261 return (gpte & PT32_DIR_PSE36_MASK) << shift; 287 return (gpte & PT32_DIR_PSE36_MASK) << shift;
262} 288}
263 289
264static void set_shadow_pte(u64 *sptep, u64 spte) 290static void __set_spte(u64 *sptep, u64 spte)
265{ 291{
266#ifdef CONFIG_X86_64 292#ifdef CONFIG_X86_64
267 set_64bit((unsigned long *)sptep, spte); 293 set_64bit((unsigned long *)sptep, spte);
@@ -380,37 +406,52 @@ static void mmu_free_rmap_desc(struct kvm_rmap_desc *rd)
380 * Return the pointer to the largepage write count for a given 406 * Return the pointer to the largepage write count for a given
381 * gfn, handling slots that are not large page aligned. 407 * gfn, handling slots that are not large page aligned.
382 */ 408 */
383static int *slot_largepage_idx(gfn_t gfn, struct kvm_memory_slot *slot) 409static int *slot_largepage_idx(gfn_t gfn,
410 struct kvm_memory_slot *slot,
411 int level)
384{ 412{
385 unsigned long idx; 413 unsigned long idx;
386 414
387 idx = (gfn / KVM_PAGES_PER_HPAGE) - 415 idx = (gfn / KVM_PAGES_PER_HPAGE(level)) -
388 (slot->base_gfn / KVM_PAGES_PER_HPAGE); 416 (slot->base_gfn / KVM_PAGES_PER_HPAGE(level));
389 return &slot->lpage_info[idx].write_count; 417 return &slot->lpage_info[level - 2][idx].write_count;
390} 418}
391 419
392static void account_shadowed(struct kvm *kvm, gfn_t gfn) 420static void account_shadowed(struct kvm *kvm, gfn_t gfn)
393{ 421{
422 struct kvm_memory_slot *slot;
394 int *write_count; 423 int *write_count;
424 int i;
395 425
396 gfn = unalias_gfn(kvm, gfn); 426 gfn = unalias_gfn(kvm, gfn);
397 write_count = slot_largepage_idx(gfn, 427
398 gfn_to_memslot_unaliased(kvm, gfn)); 428 slot = gfn_to_memslot_unaliased(kvm, gfn);
399 *write_count += 1; 429 for (i = PT_DIRECTORY_LEVEL;
430 i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
431 write_count = slot_largepage_idx(gfn, slot, i);
432 *write_count += 1;
433 }
400} 434}
401 435
402static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn) 436static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn)
403{ 437{
438 struct kvm_memory_slot *slot;
404 int *write_count; 439 int *write_count;
440 int i;
405 441
406 gfn = unalias_gfn(kvm, gfn); 442 gfn = unalias_gfn(kvm, gfn);
407 write_count = slot_largepage_idx(gfn, 443 for (i = PT_DIRECTORY_LEVEL;
408 gfn_to_memslot_unaliased(kvm, gfn)); 444 i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
409 *write_count -= 1; 445 slot = gfn_to_memslot_unaliased(kvm, gfn);
410 WARN_ON(*write_count < 0); 446 write_count = slot_largepage_idx(gfn, slot, i);
447 *write_count -= 1;
448 WARN_ON(*write_count < 0);
449 }
411} 450}
412 451
413static int has_wrprotected_page(struct kvm *kvm, gfn_t gfn) 452static int has_wrprotected_page(struct kvm *kvm,
453 gfn_t gfn,
454 int level)
414{ 455{
415 struct kvm_memory_slot *slot; 456 struct kvm_memory_slot *slot;
416 int *largepage_idx; 457 int *largepage_idx;
@@ -418,47 +459,67 @@ static int has_wrprotected_page(struct kvm *kvm, gfn_t gfn)
418 gfn = unalias_gfn(kvm, gfn); 459 gfn = unalias_gfn(kvm, gfn);
419 slot = gfn_to_memslot_unaliased(kvm, gfn); 460 slot = gfn_to_memslot_unaliased(kvm, gfn);
420 if (slot) { 461 if (slot) {
421 largepage_idx = slot_largepage_idx(gfn, slot); 462 largepage_idx = slot_largepage_idx(gfn, slot, level);
422 return *largepage_idx; 463 return *largepage_idx;
423 } 464 }
424 465
425 return 1; 466 return 1;
426} 467}
427 468
428static int host_largepage_backed(struct kvm *kvm, gfn_t gfn) 469static int host_mapping_level(struct kvm *kvm, gfn_t gfn)
429{ 470{
471 unsigned long page_size = PAGE_SIZE;
430 struct vm_area_struct *vma; 472 struct vm_area_struct *vma;
431 unsigned long addr; 473 unsigned long addr;
432 int ret = 0; 474 int i, ret = 0;
433 475
434 addr = gfn_to_hva(kvm, gfn); 476 addr = gfn_to_hva(kvm, gfn);
435 if (kvm_is_error_hva(addr)) 477 if (kvm_is_error_hva(addr))
436 return ret; 478 return page_size;
437 479
438 down_read(&current->mm->mmap_sem); 480 down_read(&current->mm->mmap_sem);
439 vma = find_vma(current->mm, addr); 481 vma = find_vma(current->mm, addr);
440 if (vma && is_vm_hugetlb_page(vma)) 482 if (!vma)
441 ret = 1; 483 goto out;
484
485 page_size = vma_kernel_pagesize(vma);
486
487out:
442 up_read(&current->mm->mmap_sem); 488 up_read(&current->mm->mmap_sem);
443 489
490 for (i = PT_PAGE_TABLE_LEVEL;
491 i < (PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES); ++i) {
492 if (page_size >= KVM_HPAGE_SIZE(i))
493 ret = i;
494 else
495 break;
496 }
497
444 return ret; 498 return ret;
445} 499}
446 500
447static int is_largepage_backed(struct kvm_vcpu *vcpu, gfn_t large_gfn) 501static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
448{ 502{
449 struct kvm_memory_slot *slot; 503 struct kvm_memory_slot *slot;
450 504 int host_level;
451 if (has_wrprotected_page(vcpu->kvm, large_gfn)) 505 int level = PT_PAGE_TABLE_LEVEL;
452 return 0;
453
454 if (!host_largepage_backed(vcpu->kvm, large_gfn))
455 return 0;
456 506
457 slot = gfn_to_memslot(vcpu->kvm, large_gfn); 507 slot = gfn_to_memslot(vcpu->kvm, large_gfn);
458 if (slot && slot->dirty_bitmap) 508 if (slot && slot->dirty_bitmap)
459 return 0; 509 return PT_PAGE_TABLE_LEVEL;
460 510
461 return 1; 511 host_level = host_mapping_level(vcpu->kvm, large_gfn);
512
513 if (host_level == PT_PAGE_TABLE_LEVEL)
514 return host_level;
515
516 for (level = PT_DIRECTORY_LEVEL; level <= host_level; ++level) {
517
518 if (has_wrprotected_page(vcpu->kvm, large_gfn, level))
519 break;
520 }
521
522 return level - 1;
462} 523}
463 524
464/* 525/*
@@ -466,19 +527,19 @@ static int is_largepage_backed(struct kvm_vcpu *vcpu, gfn_t large_gfn)
466 * Note: gfn must be unaliased before this function get called 527 * Note: gfn must be unaliased before this function get called
467 */ 528 */
468 529
469static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int lpage) 530static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level)
470{ 531{
471 struct kvm_memory_slot *slot; 532 struct kvm_memory_slot *slot;
472 unsigned long idx; 533 unsigned long idx;
473 534
474 slot = gfn_to_memslot(kvm, gfn); 535 slot = gfn_to_memslot(kvm, gfn);
475 if (!lpage) 536 if (likely(level == PT_PAGE_TABLE_LEVEL))
476 return &slot->rmap[gfn - slot->base_gfn]; 537 return &slot->rmap[gfn - slot->base_gfn];
477 538
478 idx = (gfn / KVM_PAGES_PER_HPAGE) - 539 idx = (gfn / KVM_PAGES_PER_HPAGE(level)) -
479 (slot->base_gfn / KVM_PAGES_PER_HPAGE); 540 (slot->base_gfn / KVM_PAGES_PER_HPAGE(level));
480 541
481 return &slot->lpage_info[idx].rmap_pde; 542 return &slot->lpage_info[level - 2][idx].rmap_pde;
482} 543}
483 544
484/* 545/*
@@ -494,42 +555,42 @@ static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int lpage)
494 * the spte was not added. 555 * the spte was not added.
495 * 556 *
496 */ 557 */
497static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn, int lpage) 558static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
498{ 559{
499 struct kvm_mmu_page *sp; 560 struct kvm_mmu_page *sp;
500 struct kvm_rmap_desc *desc; 561 struct kvm_rmap_desc *desc;
501 unsigned long *rmapp; 562 unsigned long *rmapp;
502 int i, count = 0; 563 int i, count = 0;
503 564
504 if (!is_rmap_pte(*spte)) 565 if (!is_rmap_spte(*spte))
505 return count; 566 return count;
506 gfn = unalias_gfn(vcpu->kvm, gfn); 567 gfn = unalias_gfn(vcpu->kvm, gfn);
507 sp = page_header(__pa(spte)); 568 sp = page_header(__pa(spte));
508 sp->gfns[spte - sp->spt] = gfn; 569 sp->gfns[spte - sp->spt] = gfn;
509 rmapp = gfn_to_rmap(vcpu->kvm, gfn, lpage); 570 rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);
510 if (!*rmapp) { 571 if (!*rmapp) {
511 rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte); 572 rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte);
512 *rmapp = (unsigned long)spte; 573 *rmapp = (unsigned long)spte;
513 } else if (!(*rmapp & 1)) { 574 } else if (!(*rmapp & 1)) {
514 rmap_printk("rmap_add: %p %llx 1->many\n", spte, *spte); 575 rmap_printk("rmap_add: %p %llx 1->many\n", spte, *spte);
515 desc = mmu_alloc_rmap_desc(vcpu); 576 desc = mmu_alloc_rmap_desc(vcpu);
516 desc->shadow_ptes[0] = (u64 *)*rmapp; 577 desc->sptes[0] = (u64 *)*rmapp;
517 desc->shadow_ptes[1] = spte; 578 desc->sptes[1] = spte;
518 *rmapp = (unsigned long)desc | 1; 579 *rmapp = (unsigned long)desc | 1;
519 } else { 580 } else {
520 rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte); 581 rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte);
521 desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); 582 desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
522 while (desc->shadow_ptes[RMAP_EXT-1] && desc->more) { 583 while (desc->sptes[RMAP_EXT-1] && desc->more) {
523 desc = desc->more; 584 desc = desc->more;
524 count += RMAP_EXT; 585 count += RMAP_EXT;
525 } 586 }
526 if (desc->shadow_ptes[RMAP_EXT-1]) { 587 if (desc->sptes[RMAP_EXT-1]) {
527 desc->more = mmu_alloc_rmap_desc(vcpu); 588 desc->more = mmu_alloc_rmap_desc(vcpu);
528 desc = desc->more; 589 desc = desc->more;
529 } 590 }
530 for (i = 0; desc->shadow_ptes[i]; ++i) 591 for (i = 0; desc->sptes[i]; ++i)
531 ; 592 ;
532 desc->shadow_ptes[i] = spte; 593 desc->sptes[i] = spte;
533 } 594 }
534 return count; 595 return count;
535} 596}
@@ -541,14 +602,14 @@ static void rmap_desc_remove_entry(unsigned long *rmapp,
541{ 602{
542 int j; 603 int j;
543 604
544 for (j = RMAP_EXT - 1; !desc->shadow_ptes[j] && j > i; --j) 605 for (j = RMAP_EXT - 1; !desc->sptes[j] && j > i; --j)
545 ; 606 ;
546 desc->shadow_ptes[i] = desc->shadow_ptes[j]; 607 desc->sptes[i] = desc->sptes[j];
547 desc->shadow_ptes[j] = NULL; 608 desc->sptes[j] = NULL;
548 if (j != 0) 609 if (j != 0)
549 return; 610 return;
550 if (!prev_desc && !desc->more) 611 if (!prev_desc && !desc->more)
551 *rmapp = (unsigned long)desc->shadow_ptes[0]; 612 *rmapp = (unsigned long)desc->sptes[0];
552 else 613 else
553 if (prev_desc) 614 if (prev_desc)
554 prev_desc->more = desc->more; 615 prev_desc->more = desc->more;
@@ -566,7 +627,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
566 unsigned long *rmapp; 627 unsigned long *rmapp;
567 int i; 628 int i;
568 629
569 if (!is_rmap_pte(*spte)) 630 if (!is_rmap_spte(*spte))
570 return; 631 return;
571 sp = page_header(__pa(spte)); 632 sp = page_header(__pa(spte));
572 pfn = spte_to_pfn(*spte); 633 pfn = spte_to_pfn(*spte);
@@ -576,7 +637,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
576 kvm_release_pfn_dirty(pfn); 637 kvm_release_pfn_dirty(pfn);
577 else 638 else
578 kvm_release_pfn_clean(pfn); 639 kvm_release_pfn_clean(pfn);
579 rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt], is_large_pte(*spte)); 640 rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt], sp->role.level);
580 if (!*rmapp) { 641 if (!*rmapp) {
581 printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte); 642 printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte);
582 BUG(); 643 BUG();
@@ -593,8 +654,8 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
593 desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); 654 desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
594 prev_desc = NULL; 655 prev_desc = NULL;
595 while (desc) { 656 while (desc) {
596 for (i = 0; i < RMAP_EXT && desc->shadow_ptes[i]; ++i) 657 for (i = 0; i < RMAP_EXT && desc->sptes[i]; ++i)
597 if (desc->shadow_ptes[i] == spte) { 658 if (desc->sptes[i] == spte) {
598 rmap_desc_remove_entry(rmapp, 659 rmap_desc_remove_entry(rmapp,
599 desc, i, 660 desc, i,
600 prev_desc); 661 prev_desc);
@@ -625,10 +686,10 @@ static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte)
625 prev_desc = NULL; 686 prev_desc = NULL;
626 prev_spte = NULL; 687 prev_spte = NULL;
627 while (desc) { 688 while (desc) {
628 for (i = 0; i < RMAP_EXT && desc->shadow_ptes[i]; ++i) { 689 for (i = 0; i < RMAP_EXT && desc->sptes[i]; ++i) {
629 if (prev_spte == spte) 690 if (prev_spte == spte)
630 return desc->shadow_ptes[i]; 691 return desc->sptes[i];
631 prev_spte = desc->shadow_ptes[i]; 692 prev_spte = desc->sptes[i];
632 } 693 }
633 desc = desc->more; 694 desc = desc->more;
634 } 695 }
@@ -639,10 +700,10 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
639{ 700{
640 unsigned long *rmapp; 701 unsigned long *rmapp;
641 u64 *spte; 702 u64 *spte;
642 int write_protected = 0; 703 int i, write_protected = 0;
643 704
644 gfn = unalias_gfn(kvm, gfn); 705 gfn = unalias_gfn(kvm, gfn);
645 rmapp = gfn_to_rmap(kvm, gfn, 0); 706 rmapp = gfn_to_rmap(kvm, gfn, PT_PAGE_TABLE_LEVEL);
646 707
647 spte = rmap_next(kvm, rmapp, NULL); 708 spte = rmap_next(kvm, rmapp, NULL);
648 while (spte) { 709 while (spte) {
@@ -650,7 +711,7 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
650 BUG_ON(!(*spte & PT_PRESENT_MASK)); 711 BUG_ON(!(*spte & PT_PRESENT_MASK));
651 rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte); 712 rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
652 if (is_writeble_pte(*spte)) { 713 if (is_writeble_pte(*spte)) {
653 set_shadow_pte(spte, *spte & ~PT_WRITABLE_MASK); 714 __set_spte(spte, *spte & ~PT_WRITABLE_MASK);
654 write_protected = 1; 715 write_protected = 1;
655 } 716 }
656 spte = rmap_next(kvm, rmapp, spte); 717 spte = rmap_next(kvm, rmapp, spte);
@@ -664,21 +725,24 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
664 } 725 }
665 726
666 /* check for huge page mappings */ 727 /* check for huge page mappings */
667 rmapp = gfn_to_rmap(kvm, gfn, 1); 728 for (i = PT_DIRECTORY_LEVEL;
668 spte = rmap_next(kvm, rmapp, NULL); 729 i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
669 while (spte) { 730 rmapp = gfn_to_rmap(kvm, gfn, i);
670 BUG_ON(!spte); 731 spte = rmap_next(kvm, rmapp, NULL);
671 BUG_ON(!(*spte & PT_PRESENT_MASK)); 732 while (spte) {
672 BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)); 733 BUG_ON(!spte);
673 pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn); 734 BUG_ON(!(*spte & PT_PRESENT_MASK));
674 if (is_writeble_pte(*spte)) { 735 BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK));
675 rmap_remove(kvm, spte); 736 pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn);
676 --kvm->stat.lpages; 737 if (is_writeble_pte(*spte)) {
677 set_shadow_pte(spte, shadow_trap_nonpresent_pte); 738 rmap_remove(kvm, spte);
678 spte = NULL; 739 --kvm->stat.lpages;
679 write_protected = 1; 740 __set_spte(spte, shadow_trap_nonpresent_pte);
741 spte = NULL;
742 write_protected = 1;
743 }
744 spte = rmap_next(kvm, rmapp, spte);
680 } 745 }
681 spte = rmap_next(kvm, rmapp, spte);
682 } 746 }
683 747
684 return write_protected; 748 return write_protected;
@@ -693,7 +757,7 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp)
693 BUG_ON(!(*spte & PT_PRESENT_MASK)); 757 BUG_ON(!(*spte & PT_PRESENT_MASK));
694 rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte); 758 rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte);
695 rmap_remove(kvm, spte); 759 rmap_remove(kvm, spte);
696 set_shadow_pte(spte, shadow_trap_nonpresent_pte); 760 __set_spte(spte, shadow_trap_nonpresent_pte);
697 need_tlb_flush = 1; 761 need_tlb_flush = 1;
698 } 762 }
699 return need_tlb_flush; 763 return need_tlb_flush;
@@ -702,7 +766,7 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp)
702static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, 766static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
703 int (*handler)(struct kvm *kvm, unsigned long *rmapp)) 767 int (*handler)(struct kvm *kvm, unsigned long *rmapp))
704{ 768{
705 int i; 769 int i, j;
706 int retval = 0; 770 int retval = 0;
707 771
708 /* 772 /*
@@ -721,11 +785,15 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
721 end = start + (memslot->npages << PAGE_SHIFT); 785 end = start + (memslot->npages << PAGE_SHIFT);
722 if (hva >= start && hva < end) { 786 if (hva >= start && hva < end) {
723 gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT; 787 gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT;
788
724 retval |= handler(kvm, &memslot->rmap[gfn_offset]); 789 retval |= handler(kvm, &memslot->rmap[gfn_offset]);
725 retval |= handler(kvm, 790
726 &memslot->lpage_info[ 791 for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) {
727 gfn_offset / 792 int idx = gfn_offset;
728 KVM_PAGES_PER_HPAGE].rmap_pde); 793 idx /= KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL + j);
794 retval |= handler(kvm,
795 &memslot->lpage_info[j][idx].rmap_pde);
796 }
729 } 797 }
730 } 798 }
731 799
@@ -763,12 +831,15 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp)
763 831
764#define RMAP_RECYCLE_THRESHOLD 1000 832#define RMAP_RECYCLE_THRESHOLD 1000
765 833
766static void rmap_recycle(struct kvm_vcpu *vcpu, gfn_t gfn, int lpage) 834static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
767{ 835{
768 unsigned long *rmapp; 836 unsigned long *rmapp;
837 struct kvm_mmu_page *sp;
838
839 sp = page_header(__pa(spte));
769 840
770 gfn = unalias_gfn(vcpu->kvm, gfn); 841 gfn = unalias_gfn(vcpu->kvm, gfn);
771 rmapp = gfn_to_rmap(vcpu->kvm, gfn, lpage); 842 rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);
772 843
773 kvm_unmap_rmapp(vcpu->kvm, rmapp); 844 kvm_unmap_rmapp(vcpu->kvm, rmapp);
774 kvm_flush_remote_tlbs(vcpu->kvm); 845 kvm_flush_remote_tlbs(vcpu->kvm);
@@ -1109,6 +1180,7 @@ static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
1109 return 1; 1180 return 1;
1110 } 1181 }
1111 1182
1183 trace_kvm_mmu_sync_page(sp);
1112 if (rmap_write_protect(vcpu->kvm, sp->gfn)) 1184 if (rmap_write_protect(vcpu->kvm, sp->gfn))
1113 kvm_flush_remote_tlbs(vcpu->kvm); 1185 kvm_flush_remote_tlbs(vcpu->kvm);
1114 kvm_unlink_unsync_page(vcpu->kvm, sp); 1186 kvm_unlink_unsync_page(vcpu->kvm, sp);
@@ -1231,8 +1303,6 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
1231 quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1; 1303 quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
1232 role.quadrant = quadrant; 1304 role.quadrant = quadrant;
1233 } 1305 }
1234 pgprintk("%s: looking gfn %lx role %x\n", __func__,
1235 gfn, role.word);
1236 index = kvm_page_table_hashfn(gfn); 1306 index = kvm_page_table_hashfn(gfn);
1237 bucket = &vcpu->kvm->arch.mmu_page_hash[index]; 1307 bucket = &vcpu->kvm->arch.mmu_page_hash[index];
1238 hlist_for_each_entry_safe(sp, node, tmp, bucket, hash_link) 1308 hlist_for_each_entry_safe(sp, node, tmp, bucket, hash_link)
@@ -1249,14 +1319,13 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
1249 set_bit(KVM_REQ_MMU_SYNC, &vcpu->requests); 1319 set_bit(KVM_REQ_MMU_SYNC, &vcpu->requests);
1250 kvm_mmu_mark_parents_unsync(vcpu, sp); 1320 kvm_mmu_mark_parents_unsync(vcpu, sp);
1251 } 1321 }
1252 pgprintk("%s: found\n", __func__); 1322 trace_kvm_mmu_get_page(sp, false);
1253 return sp; 1323 return sp;
1254 } 1324 }
1255 ++vcpu->kvm->stat.mmu_cache_miss; 1325 ++vcpu->kvm->stat.mmu_cache_miss;
1256 sp = kvm_mmu_alloc_page(vcpu, parent_pte); 1326 sp = kvm_mmu_alloc_page(vcpu, parent_pte);
1257 if (!sp) 1327 if (!sp)
1258 return sp; 1328 return sp;
1259 pgprintk("%s: adding gfn %lx role %x\n", __func__, gfn, role.word);
1260 sp->gfn = gfn; 1329 sp->gfn = gfn;
1261 sp->role = role; 1330 sp->role = role;
1262 hlist_add_head(&sp->hash_link, bucket); 1331 hlist_add_head(&sp->hash_link, bucket);
@@ -1269,6 +1338,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
1269 vcpu->arch.mmu.prefetch_page(vcpu, sp); 1338 vcpu->arch.mmu.prefetch_page(vcpu, sp);
1270 else 1339 else
1271 nonpaging_prefetch_page(vcpu, sp); 1340 nonpaging_prefetch_page(vcpu, sp);
1341 trace_kvm_mmu_get_page(sp, true);
1272 return sp; 1342 return sp;
1273} 1343}
1274 1344
@@ -1292,6 +1362,11 @@ static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator)
1292{ 1362{
1293 if (iterator->level < PT_PAGE_TABLE_LEVEL) 1363 if (iterator->level < PT_PAGE_TABLE_LEVEL)
1294 return false; 1364 return false;
1365
1366 if (iterator->level == PT_PAGE_TABLE_LEVEL)
1367 if (is_large_pte(*iterator->sptep))
1368 return false;
1369
1295 iterator->index = SHADOW_PT_INDEX(iterator->addr, iterator->level); 1370 iterator->index = SHADOW_PT_INDEX(iterator->addr, iterator->level);
1296 iterator->sptep = ((u64 *)__va(iterator->shadow_addr)) + iterator->index; 1371 iterator->sptep = ((u64 *)__va(iterator->shadow_addr)) + iterator->index;
1297 return true; 1372 return true;
@@ -1312,25 +1387,17 @@ static void kvm_mmu_page_unlink_children(struct kvm *kvm,
1312 1387
1313 pt = sp->spt; 1388 pt = sp->spt;
1314 1389
1315 if (sp->role.level == PT_PAGE_TABLE_LEVEL) {
1316 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
1317 if (is_shadow_present_pte(pt[i]))
1318 rmap_remove(kvm, &pt[i]);
1319 pt[i] = shadow_trap_nonpresent_pte;
1320 }
1321 return;
1322 }
1323
1324 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { 1390 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
1325 ent = pt[i]; 1391 ent = pt[i];
1326 1392
1327 if (is_shadow_present_pte(ent)) { 1393 if (is_shadow_present_pte(ent)) {
1328 if (!is_large_pte(ent)) { 1394 if (!is_last_spte(ent, sp->role.level)) {
1329 ent &= PT64_BASE_ADDR_MASK; 1395 ent &= PT64_BASE_ADDR_MASK;
1330 mmu_page_remove_parent_pte(page_header(ent), 1396 mmu_page_remove_parent_pte(page_header(ent),
1331 &pt[i]); 1397 &pt[i]);
1332 } else { 1398 } else {
1333 --kvm->stat.lpages; 1399 if (is_large_pte(ent))
1400 --kvm->stat.lpages;
1334 rmap_remove(kvm, &pt[i]); 1401 rmap_remove(kvm, &pt[i]);
1335 } 1402 }
1336 } 1403 }
@@ -1346,10 +1413,10 @@ static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte)
1346static void kvm_mmu_reset_last_pte_updated(struct kvm *kvm) 1413static void kvm_mmu_reset_last_pte_updated(struct kvm *kvm)
1347{ 1414{
1348 int i; 1415 int i;
1416 struct kvm_vcpu *vcpu;
1349 1417
1350 for (i = 0; i < KVM_MAX_VCPUS; ++i) 1418 kvm_for_each_vcpu(i, vcpu, kvm)
1351 if (kvm->vcpus[i]) 1419 vcpu->arch.last_pte_updated = NULL;
1352 kvm->vcpus[i]->arch.last_pte_updated = NULL;
1353} 1420}
1354 1421
1355static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp) 1422static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
@@ -1368,7 +1435,7 @@ static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
1368 } 1435 }
1369 BUG_ON(!parent_pte); 1436 BUG_ON(!parent_pte);
1370 kvm_mmu_put_page(sp, parent_pte); 1437 kvm_mmu_put_page(sp, parent_pte);
1371 set_shadow_pte(parent_pte, shadow_trap_nonpresent_pte); 1438 __set_spte(parent_pte, shadow_trap_nonpresent_pte);
1372 } 1439 }
1373} 1440}
1374 1441
@@ -1400,6 +1467,8 @@ static int mmu_zap_unsync_children(struct kvm *kvm,
1400static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp) 1467static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
1401{ 1468{
1402 int ret; 1469 int ret;
1470
1471 trace_kvm_mmu_zap_page(sp);
1403 ++kvm->stat.mmu_shadow_zapped; 1472 ++kvm->stat.mmu_shadow_zapped;
1404 ret = mmu_zap_unsync_children(kvm, sp); 1473 ret = mmu_zap_unsync_children(kvm, sp);
1405 kvm_mmu_page_unlink_children(kvm, sp); 1474 kvm_mmu_page_unlink_children(kvm, sp);
@@ -1516,7 +1585,7 @@ static void mmu_convert_notrap(struct kvm_mmu_page *sp)
1516 1585
1517 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { 1586 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
1518 if (pt[i] == shadow_notrap_nonpresent_pte) 1587 if (pt[i] == shadow_notrap_nonpresent_pte)
1519 set_shadow_pte(&pt[i], shadow_trap_nonpresent_pte); 1588 __set_spte(&pt[i], shadow_trap_nonpresent_pte);
1520 } 1589 }
1521} 1590}
1522 1591
@@ -1646,6 +1715,7 @@ static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
1646 struct kvm_mmu_page *s; 1715 struct kvm_mmu_page *s;
1647 struct hlist_node *node, *n; 1716 struct hlist_node *node, *n;
1648 1717
1718 trace_kvm_mmu_unsync_page(sp);
1649 index = kvm_page_table_hashfn(sp->gfn); 1719 index = kvm_page_table_hashfn(sp->gfn);
1650 bucket = &vcpu->kvm->arch.mmu_page_hash[index]; 1720 bucket = &vcpu->kvm->arch.mmu_page_hash[index];
1651 /* don't unsync if pagetable is shadowed with multiple roles */ 1721 /* don't unsync if pagetable is shadowed with multiple roles */
@@ -1682,9 +1752,9 @@ static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
1682 return 0; 1752 return 0;
1683} 1753}
1684 1754
1685static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, 1755static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1686 unsigned pte_access, int user_fault, 1756 unsigned pte_access, int user_fault,
1687 int write_fault, int dirty, int largepage, 1757 int write_fault, int dirty, int level,
1688 gfn_t gfn, pfn_t pfn, bool speculative, 1758 gfn_t gfn, pfn_t pfn, bool speculative,
1689 bool can_unsync) 1759 bool can_unsync)
1690{ 1760{
@@ -1707,7 +1777,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
1707 spte |= shadow_nx_mask; 1777 spte |= shadow_nx_mask;
1708 if (pte_access & ACC_USER_MASK) 1778 if (pte_access & ACC_USER_MASK)
1709 spte |= shadow_user_mask; 1779 spte |= shadow_user_mask;
1710 if (largepage) 1780 if (level > PT_PAGE_TABLE_LEVEL)
1711 spte |= PT_PAGE_SIZE_MASK; 1781 spte |= PT_PAGE_SIZE_MASK;
1712 if (tdp_enabled) 1782 if (tdp_enabled)
1713 spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn, 1783 spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn,
@@ -1718,7 +1788,8 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
1718 if ((pte_access & ACC_WRITE_MASK) 1788 if ((pte_access & ACC_WRITE_MASK)
1719 || (write_fault && !is_write_protection(vcpu) && !user_fault)) { 1789 || (write_fault && !is_write_protection(vcpu) && !user_fault)) {
1720 1790
1721 if (largepage && has_wrprotected_page(vcpu->kvm, gfn)) { 1791 if (level > PT_PAGE_TABLE_LEVEL &&
1792 has_wrprotected_page(vcpu->kvm, gfn, level)) {
1722 ret = 1; 1793 ret = 1;
1723 spte = shadow_trap_nonpresent_pte; 1794 spte = shadow_trap_nonpresent_pte;
1724 goto set_pte; 1795 goto set_pte;
@@ -1732,7 +1803,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
1732 * is responsibility of mmu_get_page / kvm_sync_page. 1803 * is responsibility of mmu_get_page / kvm_sync_page.
1733 * Same reasoning can be applied to dirty page accounting. 1804 * Same reasoning can be applied to dirty page accounting.
1734 */ 1805 */
1735 if (!can_unsync && is_writeble_pte(*shadow_pte)) 1806 if (!can_unsync && is_writeble_pte(*sptep))
1736 goto set_pte; 1807 goto set_pte;
1737 1808
1738 if (mmu_need_write_protect(vcpu, gfn, can_unsync)) { 1809 if (mmu_need_write_protect(vcpu, gfn, can_unsync)) {
@@ -1749,65 +1820,67 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
1749 mark_page_dirty(vcpu->kvm, gfn); 1820 mark_page_dirty(vcpu->kvm, gfn);
1750 1821
1751set_pte: 1822set_pte:
1752 set_shadow_pte(shadow_pte, spte); 1823 __set_spte(sptep, spte);
1753 return ret; 1824 return ret;
1754} 1825}
1755 1826
1756static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, 1827static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1757 unsigned pt_access, unsigned pte_access, 1828 unsigned pt_access, unsigned pte_access,
1758 int user_fault, int write_fault, int dirty, 1829 int user_fault, int write_fault, int dirty,
1759 int *ptwrite, int largepage, gfn_t gfn, 1830 int *ptwrite, int level, gfn_t gfn,
1760 pfn_t pfn, bool speculative) 1831 pfn_t pfn, bool speculative)
1761{ 1832{
1762 int was_rmapped = 0; 1833 int was_rmapped = 0;
1763 int was_writeble = is_writeble_pte(*shadow_pte); 1834 int was_writeble = is_writeble_pte(*sptep);
1764 int rmap_count; 1835 int rmap_count;
1765 1836
1766 pgprintk("%s: spte %llx access %x write_fault %d" 1837 pgprintk("%s: spte %llx access %x write_fault %d"
1767 " user_fault %d gfn %lx\n", 1838 " user_fault %d gfn %lx\n",
1768 __func__, *shadow_pte, pt_access, 1839 __func__, *sptep, pt_access,
1769 write_fault, user_fault, gfn); 1840 write_fault, user_fault, gfn);
1770 1841
1771 if (is_rmap_pte(*shadow_pte)) { 1842 if (is_rmap_spte(*sptep)) {
1772 /* 1843 /*
1773 * If we overwrite a PTE page pointer with a 2MB PMD, unlink 1844 * If we overwrite a PTE page pointer with a 2MB PMD, unlink
1774 * the parent of the now unreachable PTE. 1845 * the parent of the now unreachable PTE.
1775 */ 1846 */
1776 if (largepage && !is_large_pte(*shadow_pte)) { 1847 if (level > PT_PAGE_TABLE_LEVEL &&
1848 !is_large_pte(*sptep)) {
1777 struct kvm_mmu_page *child; 1849 struct kvm_mmu_page *child;
1778 u64 pte = *shadow_pte; 1850 u64 pte = *sptep;
1779 1851
1780 child = page_header(pte & PT64_BASE_ADDR_MASK); 1852 child = page_header(pte & PT64_BASE_ADDR_MASK);
1781 mmu_page_remove_parent_pte(child, shadow_pte); 1853 mmu_page_remove_parent_pte(child, sptep);
1782 } else if (pfn != spte_to_pfn(*shadow_pte)) { 1854 } else if (pfn != spte_to_pfn(*sptep)) {
1783 pgprintk("hfn old %lx new %lx\n", 1855 pgprintk("hfn old %lx new %lx\n",
1784 spte_to_pfn(*shadow_pte), pfn); 1856 spte_to_pfn(*sptep), pfn);
1785 rmap_remove(vcpu->kvm, shadow_pte); 1857 rmap_remove(vcpu->kvm, sptep);
1786 } else 1858 } else
1787 was_rmapped = 1; 1859 was_rmapped = 1;
1788 } 1860 }
1789 if (set_spte(vcpu, shadow_pte, pte_access, user_fault, write_fault, 1861
1790 dirty, largepage, gfn, pfn, speculative, true)) { 1862 if (set_spte(vcpu, sptep, pte_access, user_fault, write_fault,
1863 dirty, level, gfn, pfn, speculative, true)) {
1791 if (write_fault) 1864 if (write_fault)
1792 *ptwrite = 1; 1865 *ptwrite = 1;
1793 kvm_x86_ops->tlb_flush(vcpu); 1866 kvm_x86_ops->tlb_flush(vcpu);
1794 } 1867 }
1795 1868
1796 pgprintk("%s: setting spte %llx\n", __func__, *shadow_pte); 1869 pgprintk("%s: setting spte %llx\n", __func__, *sptep);
1797 pgprintk("instantiating %s PTE (%s) at %ld (%llx) addr %p\n", 1870 pgprintk("instantiating %s PTE (%s) at %ld (%llx) addr %p\n",
1798 is_large_pte(*shadow_pte)? "2MB" : "4kB", 1871 is_large_pte(*sptep)? "2MB" : "4kB",
1799 is_present_pte(*shadow_pte)?"RW":"R", gfn, 1872 *sptep & PT_PRESENT_MASK ?"RW":"R", gfn,
1800 *shadow_pte, shadow_pte); 1873 *sptep, sptep);
1801 if (!was_rmapped && is_large_pte(*shadow_pte)) 1874 if (!was_rmapped && is_large_pte(*sptep))
1802 ++vcpu->kvm->stat.lpages; 1875 ++vcpu->kvm->stat.lpages;
1803 1876
1804 page_header_update_slot(vcpu->kvm, shadow_pte, gfn); 1877 page_header_update_slot(vcpu->kvm, sptep, gfn);
1805 if (!was_rmapped) { 1878 if (!was_rmapped) {
1806 rmap_count = rmap_add(vcpu, shadow_pte, gfn, largepage); 1879 rmap_count = rmap_add(vcpu, sptep, gfn);
1807 if (!is_rmap_pte(*shadow_pte)) 1880 if (!is_rmap_spte(*sptep))
1808 kvm_release_pfn_clean(pfn); 1881 kvm_release_pfn_clean(pfn);
1809 if (rmap_count > RMAP_RECYCLE_THRESHOLD) 1882 if (rmap_count > RMAP_RECYCLE_THRESHOLD)
1810 rmap_recycle(vcpu, gfn, largepage); 1883 rmap_recycle(vcpu, sptep, gfn);
1811 } else { 1884 } else {
1812 if (was_writeble) 1885 if (was_writeble)
1813 kvm_release_pfn_dirty(pfn); 1886 kvm_release_pfn_dirty(pfn);
@@ -1815,7 +1888,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
1815 kvm_release_pfn_clean(pfn); 1888 kvm_release_pfn_clean(pfn);
1816 } 1889 }
1817 if (speculative) { 1890 if (speculative) {
1818 vcpu->arch.last_pte_updated = shadow_pte; 1891 vcpu->arch.last_pte_updated = sptep;
1819 vcpu->arch.last_pte_gfn = gfn; 1892 vcpu->arch.last_pte_gfn = gfn;
1820 } 1893 }
1821} 1894}
@@ -1825,7 +1898,7 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
1825} 1898}
1826 1899
1827static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, 1900static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
1828 int largepage, gfn_t gfn, pfn_t pfn) 1901 int level, gfn_t gfn, pfn_t pfn)
1829{ 1902{
1830 struct kvm_shadow_walk_iterator iterator; 1903 struct kvm_shadow_walk_iterator iterator;
1831 struct kvm_mmu_page *sp; 1904 struct kvm_mmu_page *sp;
@@ -1833,11 +1906,10 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
1833 gfn_t pseudo_gfn; 1906 gfn_t pseudo_gfn;
1834 1907
1835 for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) { 1908 for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) {
1836 if (iterator.level == PT_PAGE_TABLE_LEVEL 1909 if (iterator.level == level) {
1837 || (largepage && iterator.level == PT_DIRECTORY_LEVEL)) {
1838 mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, ACC_ALL, 1910 mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, ACC_ALL,
1839 0, write, 1, &pt_write, 1911 0, write, 1, &pt_write,
1840 largepage, gfn, pfn, false); 1912 level, gfn, pfn, false);
1841 ++vcpu->stat.pf_fixed; 1913 ++vcpu->stat.pf_fixed;
1842 break; 1914 break;
1843 } 1915 }
@@ -1853,10 +1925,10 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
1853 return -ENOMEM; 1925 return -ENOMEM;
1854 } 1926 }
1855 1927
1856 set_shadow_pte(iterator.sptep, 1928 __set_spte(iterator.sptep,
1857 __pa(sp->spt) 1929 __pa(sp->spt)
1858 | PT_PRESENT_MASK | PT_WRITABLE_MASK 1930 | PT_PRESENT_MASK | PT_WRITABLE_MASK
1859 | shadow_user_mask | shadow_x_mask); 1931 | shadow_user_mask | shadow_x_mask);
1860 } 1932 }
1861 } 1933 }
1862 return pt_write; 1934 return pt_write;
@@ -1865,14 +1937,20 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
1865static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) 1937static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
1866{ 1938{
1867 int r; 1939 int r;
1868 int largepage = 0; 1940 int level;
1869 pfn_t pfn; 1941 pfn_t pfn;
1870 unsigned long mmu_seq; 1942 unsigned long mmu_seq;
1871 1943
1872 if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) { 1944 level = mapping_level(vcpu, gfn);
1873 gfn &= ~(KVM_PAGES_PER_HPAGE-1); 1945
1874 largepage = 1; 1946 /*
1875 } 1947 * This path builds a PAE pagetable - so we can map 2mb pages at
1948 * maximum. Therefore check if the level is larger than that.
1949 */
1950 if (level > PT_DIRECTORY_LEVEL)
1951 level = PT_DIRECTORY_LEVEL;
1952
1953 gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
1876 1954
1877 mmu_seq = vcpu->kvm->mmu_notifier_seq; 1955 mmu_seq = vcpu->kvm->mmu_notifier_seq;
1878 smp_rmb(); 1956 smp_rmb();
@@ -1888,7 +1966,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
1888 if (mmu_notifier_retry(vcpu, mmu_seq)) 1966 if (mmu_notifier_retry(vcpu, mmu_seq))
1889 goto out_unlock; 1967 goto out_unlock;
1890 kvm_mmu_free_some_pages(vcpu); 1968 kvm_mmu_free_some_pages(vcpu);
1891 r = __direct_map(vcpu, v, write, largepage, gfn, pfn); 1969 r = __direct_map(vcpu, v, write, level, gfn, pfn);
1892 spin_unlock(&vcpu->kvm->mmu_lock); 1970 spin_unlock(&vcpu->kvm->mmu_lock);
1893 1971
1894 1972
@@ -1954,6 +2032,7 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
1954 gfn_t root_gfn; 2032 gfn_t root_gfn;
1955 struct kvm_mmu_page *sp; 2033 struct kvm_mmu_page *sp;
1956 int direct = 0; 2034 int direct = 0;
2035 u64 pdptr;
1957 2036
1958 root_gfn = vcpu->arch.cr3 >> PAGE_SHIFT; 2037 root_gfn = vcpu->arch.cr3 >> PAGE_SHIFT;
1959 2038
@@ -1981,11 +2060,12 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
1981 2060
1982 ASSERT(!VALID_PAGE(root)); 2061 ASSERT(!VALID_PAGE(root));
1983 if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) { 2062 if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) {
1984 if (!is_present_pte(vcpu->arch.pdptrs[i])) { 2063 pdptr = kvm_pdptr_read(vcpu, i);
2064 if (!is_present_gpte(pdptr)) {
1985 vcpu->arch.mmu.pae_root[i] = 0; 2065 vcpu->arch.mmu.pae_root[i] = 0;
1986 continue; 2066 continue;
1987 } 2067 }
1988 root_gfn = vcpu->arch.pdptrs[i] >> PAGE_SHIFT; 2068 root_gfn = pdptr >> PAGE_SHIFT;
1989 } else if (vcpu->arch.mmu.root_level == 0) 2069 } else if (vcpu->arch.mmu.root_level == 0)
1990 root_gfn = 0; 2070 root_gfn = 0;
1991 if (mmu_check_root(vcpu, root_gfn)) 2071 if (mmu_check_root(vcpu, root_gfn))
@@ -2062,7 +2142,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
2062{ 2142{
2063 pfn_t pfn; 2143 pfn_t pfn;
2064 int r; 2144 int r;
2065 int largepage = 0; 2145 int level;
2066 gfn_t gfn = gpa >> PAGE_SHIFT; 2146 gfn_t gfn = gpa >> PAGE_SHIFT;
2067 unsigned long mmu_seq; 2147 unsigned long mmu_seq;
2068 2148
@@ -2073,10 +2153,10 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
2073 if (r) 2153 if (r)
2074 return r; 2154 return r;
2075 2155
2076 if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) { 2156 level = mapping_level(vcpu, gfn);
2077 gfn &= ~(KVM_PAGES_PER_HPAGE-1); 2157
2078 largepage = 1; 2158 gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
2079 } 2159
2080 mmu_seq = vcpu->kvm->mmu_notifier_seq; 2160 mmu_seq = vcpu->kvm->mmu_notifier_seq;
2081 smp_rmb(); 2161 smp_rmb();
2082 pfn = gfn_to_pfn(vcpu->kvm, gfn); 2162 pfn = gfn_to_pfn(vcpu->kvm, gfn);
@@ -2089,7 +2169,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
2089 goto out_unlock; 2169 goto out_unlock;
2090 kvm_mmu_free_some_pages(vcpu); 2170 kvm_mmu_free_some_pages(vcpu);
2091 r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK, 2171 r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK,
2092 largepage, gfn, pfn); 2172 level, gfn, pfn);
2093 spin_unlock(&vcpu->kvm->mmu_lock); 2173 spin_unlock(&vcpu->kvm->mmu_lock);
2094 2174
2095 return r; 2175 return r;
@@ -2206,7 +2286,9 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level)
2206 context->rsvd_bits_mask[0][0] = exb_bit_rsvd | 2286 context->rsvd_bits_mask[0][0] = exb_bit_rsvd |
2207 rsvd_bits(maxphyaddr, 51); 2287 rsvd_bits(maxphyaddr, 51);
2208 context->rsvd_bits_mask[1][3] = context->rsvd_bits_mask[0][3]; 2288 context->rsvd_bits_mask[1][3] = context->rsvd_bits_mask[0][3];
2209 context->rsvd_bits_mask[1][2] = context->rsvd_bits_mask[0][2]; 2289 context->rsvd_bits_mask[1][2] = exb_bit_rsvd |
2290 rsvd_bits(maxphyaddr, 51) |
2291 rsvd_bits(13, 29);
2210 context->rsvd_bits_mask[1][1] = exb_bit_rsvd | 2292 context->rsvd_bits_mask[1][1] = exb_bit_rsvd |
2211 rsvd_bits(maxphyaddr, 51) | 2293 rsvd_bits(maxphyaddr, 51) |
2212 rsvd_bits(13, 20); /* large page */ 2294 rsvd_bits(13, 20); /* large page */
@@ -2357,8 +2439,8 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
2357 spin_unlock(&vcpu->kvm->mmu_lock); 2439 spin_unlock(&vcpu->kvm->mmu_lock);
2358 if (r) 2440 if (r)
2359 goto out; 2441 goto out;
2442 /* set_cr3() should ensure TLB has been flushed */
2360 kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa); 2443 kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa);
2361 kvm_mmu_flush_tlb(vcpu);
2362out: 2444out:
2363 return r; 2445 return r;
2364} 2446}
@@ -2378,15 +2460,14 @@ static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
2378 2460
2379 pte = *spte; 2461 pte = *spte;
2380 if (is_shadow_present_pte(pte)) { 2462 if (is_shadow_present_pte(pte)) {
2381 if (sp->role.level == PT_PAGE_TABLE_LEVEL || 2463 if (is_last_spte(pte, sp->role.level))
2382 is_large_pte(pte))
2383 rmap_remove(vcpu->kvm, spte); 2464 rmap_remove(vcpu->kvm, spte);
2384 else { 2465 else {
2385 child = page_header(pte & PT64_BASE_ADDR_MASK); 2466 child = page_header(pte & PT64_BASE_ADDR_MASK);
2386 mmu_page_remove_parent_pte(child, spte); 2467 mmu_page_remove_parent_pte(child, spte);
2387 } 2468 }
2388 } 2469 }
2389 set_shadow_pte(spte, shadow_trap_nonpresent_pte); 2470 __set_spte(spte, shadow_trap_nonpresent_pte);
2390 if (is_large_pte(pte)) 2471 if (is_large_pte(pte))
2391 --vcpu->kvm->stat.lpages; 2472 --vcpu->kvm->stat.lpages;
2392} 2473}
@@ -2397,11 +2478,8 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
2397 const void *new) 2478 const void *new)
2398{ 2479{
2399 if (sp->role.level != PT_PAGE_TABLE_LEVEL) { 2480 if (sp->role.level != PT_PAGE_TABLE_LEVEL) {
2400 if (!vcpu->arch.update_pte.largepage || 2481 ++vcpu->kvm->stat.mmu_pde_zapped;
2401 sp->role.glevels == PT32_ROOT_LEVEL) { 2482 return;
2402 ++vcpu->kvm->stat.mmu_pde_zapped;
2403 return;
2404 }
2405 } 2483 }
2406 2484
2407 ++vcpu->kvm->stat.mmu_pte_updated; 2485 ++vcpu->kvm->stat.mmu_pte_updated;
@@ -2447,8 +2525,6 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
2447 u64 gpte = 0; 2525 u64 gpte = 0;
2448 pfn_t pfn; 2526 pfn_t pfn;
2449 2527
2450 vcpu->arch.update_pte.largepage = 0;
2451
2452 if (bytes != 4 && bytes != 8) 2528 if (bytes != 4 && bytes != 8)
2453 return; 2529 return;
2454 2530
@@ -2472,14 +2548,10 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
2472 if ((bytes == 4) && (gpa % 4 == 0)) 2548 if ((bytes == 4) && (gpa % 4 == 0))
2473 memcpy((void *)&gpte, new, 4); 2549 memcpy((void *)&gpte, new, 4);
2474 } 2550 }
2475 if (!is_present_pte(gpte)) 2551 if (!is_present_gpte(gpte))
2476 return; 2552 return;
2477 gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; 2553 gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
2478 2554
2479 if (is_large_pte(gpte) && is_largepage_backed(vcpu, gfn)) {
2480 gfn &= ~(KVM_PAGES_PER_HPAGE-1);
2481 vcpu->arch.update_pte.largepage = 1;
2482 }
2483 vcpu->arch.update_pte.mmu_seq = vcpu->kvm->mmu_notifier_seq; 2555 vcpu->arch.update_pte.mmu_seq = vcpu->kvm->mmu_notifier_seq;
2484 smp_rmb(); 2556 smp_rmb();
2485 pfn = gfn_to_pfn(vcpu->kvm, gfn); 2557 pfn = gfn_to_pfn(vcpu->kvm, gfn);
@@ -2622,6 +2694,9 @@ int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
2622 gpa_t gpa; 2694 gpa_t gpa;
2623 int r; 2695 int r;
2624 2696
2697 if (tdp_enabled)
2698 return 0;
2699
2625 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva); 2700 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva);
2626 2701
2627 spin_lock(&vcpu->kvm->mmu_lock); 2702 spin_lock(&vcpu->kvm->mmu_lock);
@@ -2633,7 +2708,8 @@ EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt);
2633 2708
2634void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) 2709void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
2635{ 2710{
2636 while (vcpu->kvm->arch.n_free_mmu_pages < KVM_REFILL_PAGES) { 2711 while (vcpu->kvm->arch.n_free_mmu_pages < KVM_REFILL_PAGES &&
2712 !list_empty(&vcpu->kvm->arch.active_mmu_pages)) {
2637 struct kvm_mmu_page *sp; 2713 struct kvm_mmu_page *sp;
2638 2714
2639 sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev, 2715 sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev,
@@ -2670,8 +2746,9 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
2670 ++vcpu->stat.mmio_exits; 2746 ++vcpu->stat.mmio_exits;
2671 return 0; 2747 return 0;
2672 case EMULATE_FAIL: 2748 case EMULATE_FAIL:
2673 kvm_report_emulation_failure(vcpu, "pagetable"); 2749 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
2674 return 1; 2750 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
2751 return 0;
2675 default: 2752 default:
2676 BUG(); 2753 BUG();
2677 } 2754 }
@@ -2712,12 +2789,6 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
2712 2789
2713 ASSERT(vcpu); 2790 ASSERT(vcpu);
2714 2791
2715 if (vcpu->kvm->arch.n_requested_mmu_pages)
2716 vcpu->kvm->arch.n_free_mmu_pages =
2717 vcpu->kvm->arch.n_requested_mmu_pages;
2718 else
2719 vcpu->kvm->arch.n_free_mmu_pages =
2720 vcpu->kvm->arch.n_alloc_mmu_pages;
2721 /* 2792 /*
2722 * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64. 2793 * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64.
2723 * Therefore we need to allocate shadow page tables in the first 2794 * Therefore we need to allocate shadow page tables in the first
@@ -3029,6 +3100,24 @@ out:
3029 return r; 3100 return r;
3030} 3101}
3031 3102
3103int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4])
3104{
3105 struct kvm_shadow_walk_iterator iterator;
3106 int nr_sptes = 0;
3107
3108 spin_lock(&vcpu->kvm->mmu_lock);
3109 for_each_shadow_entry(vcpu, addr, iterator) {
3110 sptes[iterator.level-1] = *iterator.sptep;
3111 nr_sptes++;
3112 if (!is_shadow_present_pte(*iterator.sptep))
3113 break;
3114 }
3115 spin_unlock(&vcpu->kvm->mmu_lock);
3116
3117 return nr_sptes;
3118}
3119EXPORT_SYMBOL_GPL(kvm_mmu_get_spte_hierarchy);
3120
3032#ifdef AUDIT 3121#ifdef AUDIT
3033 3122
3034static const char *audit_msg; 3123static const char *audit_msg;
@@ -3041,6 +3130,54 @@ static gva_t canonicalize(gva_t gva)
3041 return gva; 3130 return gva;
3042} 3131}
3043 3132
3133
3134typedef void (*inspect_spte_fn) (struct kvm *kvm, struct kvm_mmu_page *sp,
3135 u64 *sptep);
3136
3137static void __mmu_spte_walk(struct kvm *kvm, struct kvm_mmu_page *sp,
3138 inspect_spte_fn fn)
3139{
3140 int i;
3141
3142 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
3143 u64 ent = sp->spt[i];
3144
3145 if (is_shadow_present_pte(ent)) {
3146 if (!is_last_spte(ent, sp->role.level)) {
3147 struct kvm_mmu_page *child;
3148 child = page_header(ent & PT64_BASE_ADDR_MASK);
3149 __mmu_spte_walk(kvm, child, fn);
3150 } else
3151 fn(kvm, sp, &sp->spt[i]);
3152 }
3153 }
3154}
3155
3156static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn)
3157{
3158 int i;
3159 struct kvm_mmu_page *sp;
3160
3161 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
3162 return;
3163 if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
3164 hpa_t root = vcpu->arch.mmu.root_hpa;
3165 sp = page_header(root);
3166 __mmu_spte_walk(vcpu->kvm, sp, fn);
3167 return;
3168 }
3169 for (i = 0; i < 4; ++i) {
3170 hpa_t root = vcpu->arch.mmu.pae_root[i];
3171
3172 if (root && VALID_PAGE(root)) {
3173 root &= PT64_BASE_ADDR_MASK;
3174 sp = page_header(root);
3175 __mmu_spte_walk(vcpu->kvm, sp, fn);
3176 }
3177 }
3178 return;
3179}
3180
3044static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte, 3181static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
3045 gva_t va, int level) 3182 gva_t va, int level)
3046{ 3183{
@@ -3055,20 +3192,19 @@ static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
3055 continue; 3192 continue;
3056 3193
3057 va = canonicalize(va); 3194 va = canonicalize(va);
3058 if (level > 1) { 3195 if (is_shadow_present_pte(ent) && !is_last_spte(ent, level))
3059 if (ent == shadow_notrap_nonpresent_pte) 3196 audit_mappings_page(vcpu, ent, va, level - 1);
3060 printk(KERN_ERR "audit: (%s) nontrapping pte" 3197 else {
3061 " in nonleaf level: levels %d gva %lx"
3062 " level %d pte %llx\n", audit_msg,
3063 vcpu->arch.mmu.root_level, va, level, ent);
3064 else
3065 audit_mappings_page(vcpu, ent, va, level - 1);
3066 } else {
3067 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, va); 3198 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, va);
3068 gfn_t gfn = gpa >> PAGE_SHIFT; 3199 gfn_t gfn = gpa >> PAGE_SHIFT;
3069 pfn_t pfn = gfn_to_pfn(vcpu->kvm, gfn); 3200 pfn_t pfn = gfn_to_pfn(vcpu->kvm, gfn);
3070 hpa_t hpa = (hpa_t)pfn << PAGE_SHIFT; 3201 hpa_t hpa = (hpa_t)pfn << PAGE_SHIFT;
3071 3202
3203 if (is_error_pfn(pfn)) {
3204 kvm_release_pfn_clean(pfn);
3205 continue;
3206 }
3207
3072 if (is_shadow_present_pte(ent) 3208 if (is_shadow_present_pte(ent)
3073 && (ent & PT64_BASE_ADDR_MASK) != hpa) 3209 && (ent & PT64_BASE_ADDR_MASK) != hpa)
3074 printk(KERN_ERR "xx audit error: (%s) levels %d" 3210 printk(KERN_ERR "xx audit error: (%s) levels %d"
@@ -3122,7 +3258,7 @@ static int count_rmaps(struct kvm_vcpu *vcpu)
3122 d = (struct kvm_rmap_desc *)(*rmapp & ~1ul); 3258 d = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
3123 while (d) { 3259 while (d) {
3124 for (k = 0; k < RMAP_EXT; ++k) 3260 for (k = 0; k < RMAP_EXT; ++k)
3125 if (d->shadow_ptes[k]) 3261 if (d->sptes[k])
3126 ++nmaps; 3262 ++nmaps;
3127 else 3263 else
3128 break; 3264 break;
@@ -3133,9 +3269,48 @@ static int count_rmaps(struct kvm_vcpu *vcpu)
3133 return nmaps; 3269 return nmaps;
3134} 3270}
3135 3271
3136static int count_writable_mappings(struct kvm_vcpu *vcpu) 3272void inspect_spte_has_rmap(struct kvm *kvm, struct kvm_mmu_page *sp, u64 *sptep)
3273{
3274 unsigned long *rmapp;
3275 struct kvm_mmu_page *rev_sp;
3276 gfn_t gfn;
3277
3278 if (*sptep & PT_WRITABLE_MASK) {
3279 rev_sp = page_header(__pa(sptep));
3280 gfn = rev_sp->gfns[sptep - rev_sp->spt];
3281
3282 if (!gfn_to_memslot(kvm, gfn)) {
3283 if (!printk_ratelimit())
3284 return;
3285 printk(KERN_ERR "%s: no memslot for gfn %ld\n",
3286 audit_msg, gfn);
3287 printk(KERN_ERR "%s: index %ld of sp (gfn=%lx)\n",
3288 audit_msg, sptep - rev_sp->spt,
3289 rev_sp->gfn);
3290 dump_stack();
3291 return;
3292 }
3293
3294 rmapp = gfn_to_rmap(kvm, rev_sp->gfns[sptep - rev_sp->spt],
3295 is_large_pte(*sptep));
3296 if (!*rmapp) {
3297 if (!printk_ratelimit())
3298 return;
3299 printk(KERN_ERR "%s: no rmap for writable spte %llx\n",
3300 audit_msg, *sptep);
3301 dump_stack();
3302 }
3303 }
3304
3305}
3306
3307void audit_writable_sptes_have_rmaps(struct kvm_vcpu *vcpu)
3308{
3309 mmu_spte_walk(vcpu, inspect_spte_has_rmap);
3310}
3311
3312static void check_writable_mappings_rmap(struct kvm_vcpu *vcpu)
3137{ 3313{
3138 int nmaps = 0;
3139 struct kvm_mmu_page *sp; 3314 struct kvm_mmu_page *sp;
3140 int i; 3315 int i;
3141 3316
@@ -3152,20 +3327,16 @@ static int count_writable_mappings(struct kvm_vcpu *vcpu)
3152 continue; 3327 continue;
3153 if (!(ent & PT_WRITABLE_MASK)) 3328 if (!(ent & PT_WRITABLE_MASK))
3154 continue; 3329 continue;
3155 ++nmaps; 3330 inspect_spte_has_rmap(vcpu->kvm, sp, &pt[i]);
3156 } 3331 }
3157 } 3332 }
3158 return nmaps; 3333 return;
3159} 3334}
3160 3335
3161static void audit_rmap(struct kvm_vcpu *vcpu) 3336static void audit_rmap(struct kvm_vcpu *vcpu)
3162{ 3337{
3163 int n_rmap = count_rmaps(vcpu); 3338 check_writable_mappings_rmap(vcpu);
3164 int n_actual = count_writable_mappings(vcpu); 3339 count_rmaps(vcpu);
3165
3166 if (n_rmap != n_actual)
3167 printk(KERN_ERR "%s: (%s) rmap %d actual %d\n",
3168 __func__, audit_msg, n_rmap, n_actual);
3169} 3340}
3170 3341
3171static void audit_write_protection(struct kvm_vcpu *vcpu) 3342static void audit_write_protection(struct kvm_vcpu *vcpu)
@@ -3173,20 +3344,28 @@ static void audit_write_protection(struct kvm_vcpu *vcpu)
3173 struct kvm_mmu_page *sp; 3344 struct kvm_mmu_page *sp;
3174 struct kvm_memory_slot *slot; 3345 struct kvm_memory_slot *slot;
3175 unsigned long *rmapp; 3346 unsigned long *rmapp;
3347 u64 *spte;
3176 gfn_t gfn; 3348 gfn_t gfn;
3177 3349
3178 list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) { 3350 list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) {
3179 if (sp->role.direct) 3351 if (sp->role.direct)
3180 continue; 3352 continue;
3353 if (sp->unsync)
3354 continue;
3181 3355
3182 gfn = unalias_gfn(vcpu->kvm, sp->gfn); 3356 gfn = unalias_gfn(vcpu->kvm, sp->gfn);
3183 slot = gfn_to_memslot_unaliased(vcpu->kvm, sp->gfn); 3357 slot = gfn_to_memslot_unaliased(vcpu->kvm, sp->gfn);
3184 rmapp = &slot->rmap[gfn - slot->base_gfn]; 3358 rmapp = &slot->rmap[gfn - slot->base_gfn];
3185 if (*rmapp) 3359
3186 printk(KERN_ERR "%s: (%s) shadow page has writable" 3360 spte = rmap_next(vcpu->kvm, rmapp, NULL);
3187 " mappings: gfn %lx role %x\n", 3361 while (spte) {
3362 if (*spte & PT_WRITABLE_MASK)
3363 printk(KERN_ERR "%s: (%s) shadow page has "
3364 "writable mappings: gfn %lx role %x\n",
3188 __func__, audit_msg, sp->gfn, 3365 __func__, audit_msg, sp->gfn,
3189 sp->role.word); 3366 sp->role.word);
3367 spte = rmap_next(vcpu->kvm, rmapp, spte);
3368 }
3190 } 3369 }
3191} 3370}
3192 3371
@@ -3198,7 +3377,9 @@ static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg)
3198 audit_msg = msg; 3377 audit_msg = msg;
3199 audit_rmap(vcpu); 3378 audit_rmap(vcpu);
3200 audit_write_protection(vcpu); 3379 audit_write_protection(vcpu);
3201 audit_mappings(vcpu); 3380 if (strcmp("pre pte write", audit_msg) != 0)
3381 audit_mappings(vcpu);
3382 audit_writable_sptes_have_rmaps(vcpu);
3202 dbg = olddbg; 3383 dbg = olddbg;
3203} 3384}
3204 3385
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 3494a2fb136e..61a1b3884b49 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -37,6 +37,8 @@
37#define PT32_ROOT_LEVEL 2 37#define PT32_ROOT_LEVEL 2
38#define PT32E_ROOT_LEVEL 3 38#define PT32E_ROOT_LEVEL 3
39 39
40int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]);
41
40static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) 42static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
41{ 43{
42 if (unlikely(vcpu->kvm->arch.n_free_mmu_pages < KVM_MIN_FREE_MMU_PAGES)) 44 if (unlikely(vcpu->kvm->arch.n_free_mmu_pages < KVM_MIN_FREE_MMU_PAGES))
@@ -75,7 +77,7 @@ static inline int is_paging(struct kvm_vcpu *vcpu)
75 return vcpu->arch.cr0 & X86_CR0_PG; 77 return vcpu->arch.cr0 & X86_CR0_PG;
76} 78}
77 79
78static inline int is_present_pte(unsigned long pte) 80static inline int is_present_gpte(unsigned long pte)
79{ 81{
80 return pte & PT_PRESENT_MASK; 82 return pte & PT_PRESENT_MASK;
81} 83}
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
new file mode 100644
index 000000000000..3e4a5c6ca2a9
--- /dev/null
+++ b/arch/x86/kvm/mmutrace.h
@@ -0,0 +1,220 @@
1#if !defined(_TRACE_KVMMMU_H) || defined(TRACE_HEADER_MULTI_READ)
2#define _TRACE_KVMMMU_H
3
4#include <linux/tracepoint.h>
5#include <linux/ftrace_event.h>
6
7#undef TRACE_SYSTEM
8#define TRACE_SYSTEM kvmmmu
9#define TRACE_INCLUDE_PATH .
10#define TRACE_INCLUDE_FILE mmutrace
11
12#define KVM_MMU_PAGE_FIELDS \
13 __field(__u64, gfn) \
14 __field(__u32, role) \
15 __field(__u32, root_count) \
16 __field(__u32, unsync)
17
18#define KVM_MMU_PAGE_ASSIGN(sp) \
19 __entry->gfn = sp->gfn; \
20 __entry->role = sp->role.word; \
21 __entry->root_count = sp->root_count; \
22 __entry->unsync = sp->unsync;
23
24#define KVM_MMU_PAGE_PRINTK() ({ \
25 const char *ret = p->buffer + p->len; \
26 static const char *access_str[] = { \
27 "---", "--x", "w--", "w-x", "-u-", "-ux", "wu-", "wux" \
28 }; \
29 union kvm_mmu_page_role role; \
30 \
31 role.word = __entry->role; \
32 \
33 trace_seq_printf(p, "sp gfn %llx %u/%u q%u%s %s%s %spge" \
34 " %snxe root %u %s%c", \
35 __entry->gfn, role.level, role.glevels, \
36 role.quadrant, \
37 role.direct ? " direct" : "", \
38 access_str[role.access], \
39 role.invalid ? " invalid" : "", \
40 role.cr4_pge ? "" : "!", \
41 role.nxe ? "" : "!", \
42 __entry->root_count, \
43 __entry->unsync ? "unsync" : "sync", 0); \
44 ret; \
45 })
46
47#define kvm_mmu_trace_pferr_flags \
48 { PFERR_PRESENT_MASK, "P" }, \
49 { PFERR_WRITE_MASK, "W" }, \
50 { PFERR_USER_MASK, "U" }, \
51 { PFERR_RSVD_MASK, "RSVD" }, \
52 { PFERR_FETCH_MASK, "F" }
53
54/*
55 * A pagetable walk has started
56 */
57TRACE_EVENT(
58 kvm_mmu_pagetable_walk,
59 TP_PROTO(u64 addr, int write_fault, int user_fault, int fetch_fault),
60 TP_ARGS(addr, write_fault, user_fault, fetch_fault),
61
62 TP_STRUCT__entry(
63 __field(__u64, addr)
64 __field(__u32, pferr)
65 ),
66
67 TP_fast_assign(
68 __entry->addr = addr;
69 __entry->pferr = (!!write_fault << 1) | (!!user_fault << 2)
70 | (!!fetch_fault << 4);
71 ),
72
73 TP_printk("addr %llx pferr %x %s", __entry->addr, __entry->pferr,
74 __print_flags(__entry->pferr, "|", kvm_mmu_trace_pferr_flags))
75);
76
77
78/* We just walked a paging element */
79TRACE_EVENT(
80 kvm_mmu_paging_element,
81 TP_PROTO(u64 pte, int level),
82 TP_ARGS(pte, level),
83
84 TP_STRUCT__entry(
85 __field(__u64, pte)
86 __field(__u32, level)
87 ),
88
89 TP_fast_assign(
90 __entry->pte = pte;
91 __entry->level = level;
92 ),
93
94 TP_printk("pte %llx level %u", __entry->pte, __entry->level)
95);
96
97/* We set a pte accessed bit */
98TRACE_EVENT(
99 kvm_mmu_set_accessed_bit,
100 TP_PROTO(unsigned long table_gfn, unsigned index, unsigned size),
101 TP_ARGS(table_gfn, index, size),
102
103 TP_STRUCT__entry(
104 __field(__u64, gpa)
105 ),
106
107 TP_fast_assign(
108 __entry->gpa = ((u64)table_gfn << PAGE_SHIFT)
109 + index * size;
110 ),
111
112 TP_printk("gpa %llx", __entry->gpa)
113);
114
115/* We set a pte dirty bit */
116TRACE_EVENT(
117 kvm_mmu_set_dirty_bit,
118 TP_PROTO(unsigned long table_gfn, unsigned index, unsigned size),
119 TP_ARGS(table_gfn, index, size),
120
121 TP_STRUCT__entry(
122 __field(__u64, gpa)
123 ),
124
125 TP_fast_assign(
126 __entry->gpa = ((u64)table_gfn << PAGE_SHIFT)
127 + index * size;
128 ),
129
130 TP_printk("gpa %llx", __entry->gpa)
131);
132
133TRACE_EVENT(
134 kvm_mmu_walker_error,
135 TP_PROTO(u32 pferr),
136 TP_ARGS(pferr),
137
138 TP_STRUCT__entry(
139 __field(__u32, pferr)
140 ),
141
142 TP_fast_assign(
143 __entry->pferr = pferr;
144 ),
145
146 TP_printk("pferr %x %s", __entry->pferr,
147 __print_flags(__entry->pferr, "|", kvm_mmu_trace_pferr_flags))
148);
149
150TRACE_EVENT(
151 kvm_mmu_get_page,
152 TP_PROTO(struct kvm_mmu_page *sp, bool created),
153 TP_ARGS(sp, created),
154
155 TP_STRUCT__entry(
156 KVM_MMU_PAGE_FIELDS
157 __field(bool, created)
158 ),
159
160 TP_fast_assign(
161 KVM_MMU_PAGE_ASSIGN(sp)
162 __entry->created = created;
163 ),
164
165 TP_printk("%s %s", KVM_MMU_PAGE_PRINTK(),
166 __entry->created ? "new" : "existing")
167);
168
169TRACE_EVENT(
170 kvm_mmu_sync_page,
171 TP_PROTO(struct kvm_mmu_page *sp),
172 TP_ARGS(sp),
173
174 TP_STRUCT__entry(
175 KVM_MMU_PAGE_FIELDS
176 ),
177
178 TP_fast_assign(
179 KVM_MMU_PAGE_ASSIGN(sp)
180 ),
181
182 TP_printk("%s", KVM_MMU_PAGE_PRINTK())
183);
184
185TRACE_EVENT(
186 kvm_mmu_unsync_page,
187 TP_PROTO(struct kvm_mmu_page *sp),
188 TP_ARGS(sp),
189
190 TP_STRUCT__entry(
191 KVM_MMU_PAGE_FIELDS
192 ),
193
194 TP_fast_assign(
195 KVM_MMU_PAGE_ASSIGN(sp)
196 ),
197
198 TP_printk("%s", KVM_MMU_PAGE_PRINTK())
199);
200
201TRACE_EVENT(
202 kvm_mmu_zap_page,
203 TP_PROTO(struct kvm_mmu_page *sp),
204 TP_ARGS(sp),
205
206 TP_STRUCT__entry(
207 KVM_MMU_PAGE_FIELDS
208 ),
209
210 TP_fast_assign(
211 KVM_MMU_PAGE_ASSIGN(sp)
212 ),
213
214 TP_printk("%s", KVM_MMU_PAGE_PRINTK())
215);
216
217#endif /* _TRACE_KVMMMU_H */
218
219/* This part must be outside protection */
220#include <trace/define_trace.h>
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 67785f635399..d2fec9c12d22 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -27,7 +27,8 @@
27 #define guest_walker guest_walker64 27 #define guest_walker guest_walker64
28 #define FNAME(name) paging##64_##name 28 #define FNAME(name) paging##64_##name
29 #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK 29 #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK
30 #define PT_DIR_BASE_ADDR_MASK PT64_DIR_BASE_ADDR_MASK 30 #define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl)
31 #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl)
31 #define PT_INDEX(addr, level) PT64_INDEX(addr, level) 32 #define PT_INDEX(addr, level) PT64_INDEX(addr, level)
32 #define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level) 33 #define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level)
33 #define PT_LEVEL_BITS PT64_LEVEL_BITS 34 #define PT_LEVEL_BITS PT64_LEVEL_BITS
@@ -43,7 +44,8 @@
43 #define guest_walker guest_walker32 44 #define guest_walker guest_walker32
44 #define FNAME(name) paging##32_##name 45 #define FNAME(name) paging##32_##name
45 #define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK 46 #define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK
46 #define PT_DIR_BASE_ADDR_MASK PT32_DIR_BASE_ADDR_MASK 47 #define PT_LVL_ADDR_MASK(lvl) PT32_LVL_ADDR_MASK(lvl)
48 #define PT_LVL_OFFSET_MASK(lvl) PT32_LVL_OFFSET_MASK(lvl)
47 #define PT_INDEX(addr, level) PT32_INDEX(addr, level) 49 #define PT_INDEX(addr, level) PT32_INDEX(addr, level)
48 #define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level) 50 #define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level)
49 #define PT_LEVEL_BITS PT32_LEVEL_BITS 51 #define PT_LEVEL_BITS PT32_LEVEL_BITS
@@ -53,8 +55,8 @@
53 #error Invalid PTTYPE value 55 #error Invalid PTTYPE value
54#endif 56#endif
55 57
56#define gpte_to_gfn FNAME(gpte_to_gfn) 58#define gpte_to_gfn_lvl FNAME(gpte_to_gfn_lvl)
57#define gpte_to_gfn_pde FNAME(gpte_to_gfn_pde) 59#define gpte_to_gfn(pte) gpte_to_gfn_lvl((pte), PT_PAGE_TABLE_LEVEL)
58 60
59/* 61/*
60 * The guest_walker structure emulates the behavior of the hardware page 62 * The guest_walker structure emulates the behavior of the hardware page
@@ -71,14 +73,9 @@ struct guest_walker {
71 u32 error_code; 73 u32 error_code;
72}; 74};
73 75
74static gfn_t gpte_to_gfn(pt_element_t gpte) 76static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl)
75{ 77{
76 return (gpte & PT_BASE_ADDR_MASK) >> PAGE_SHIFT; 78 return (gpte & PT_LVL_ADDR_MASK(lvl)) >> PAGE_SHIFT;
77}
78
79static gfn_t gpte_to_gfn_pde(pt_element_t gpte)
80{
81 return (gpte & PT_DIR_BASE_ADDR_MASK) >> PAGE_SHIFT;
82} 79}
83 80
84static bool FNAME(cmpxchg_gpte)(struct kvm *kvm, 81static bool FNAME(cmpxchg_gpte)(struct kvm *kvm,
@@ -125,14 +122,16 @@ static int FNAME(walk_addr)(struct guest_walker *walker,
125 gpa_t pte_gpa; 122 gpa_t pte_gpa;
126 int rsvd_fault = 0; 123 int rsvd_fault = 0;
127 124
128 pgprintk("%s: addr %lx\n", __func__, addr); 125 trace_kvm_mmu_pagetable_walk(addr, write_fault, user_fault,
126 fetch_fault);
129walk: 127walk:
130 walker->level = vcpu->arch.mmu.root_level; 128 walker->level = vcpu->arch.mmu.root_level;
131 pte = vcpu->arch.cr3; 129 pte = vcpu->arch.cr3;
132#if PTTYPE == 64 130#if PTTYPE == 64
133 if (!is_long_mode(vcpu)) { 131 if (!is_long_mode(vcpu)) {
134 pte = vcpu->arch.pdptrs[(addr >> 30) & 3]; 132 pte = kvm_pdptr_read(vcpu, (addr >> 30) & 3);
135 if (!is_present_pte(pte)) 133 trace_kvm_mmu_paging_element(pte, walker->level);
134 if (!is_present_gpte(pte))
136 goto not_present; 135 goto not_present;
137 --walker->level; 136 --walker->level;
138 } 137 }
@@ -150,12 +149,11 @@ walk:
150 pte_gpa += index * sizeof(pt_element_t); 149 pte_gpa += index * sizeof(pt_element_t);
151 walker->table_gfn[walker->level - 1] = table_gfn; 150 walker->table_gfn[walker->level - 1] = table_gfn;
152 walker->pte_gpa[walker->level - 1] = pte_gpa; 151 walker->pte_gpa[walker->level - 1] = pte_gpa;
153 pgprintk("%s: table_gfn[%d] %lx\n", __func__,
154 walker->level - 1, table_gfn);
155 152
156 kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte)); 153 kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte));
154 trace_kvm_mmu_paging_element(pte, walker->level);
157 155
158 if (!is_present_pte(pte)) 156 if (!is_present_gpte(pte))
159 goto not_present; 157 goto not_present;
160 158
161 rsvd_fault = is_rsvd_bits_set(vcpu, pte, walker->level); 159 rsvd_fault = is_rsvd_bits_set(vcpu, pte, walker->level);
@@ -175,6 +173,8 @@ walk:
175#endif 173#endif
176 174
177 if (!(pte & PT_ACCESSED_MASK)) { 175 if (!(pte & PT_ACCESSED_MASK)) {
176 trace_kvm_mmu_set_accessed_bit(table_gfn, index,
177 sizeof(pte));
178 mark_page_dirty(vcpu->kvm, table_gfn); 178 mark_page_dirty(vcpu->kvm, table_gfn);
179 if (FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, 179 if (FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn,
180 index, pte, pte|PT_ACCESSED_MASK)) 180 index, pte, pte|PT_ACCESSED_MASK))
@@ -186,18 +186,24 @@ walk:
186 186
187 walker->ptes[walker->level - 1] = pte; 187 walker->ptes[walker->level - 1] = pte;
188 188
189 if (walker->level == PT_PAGE_TABLE_LEVEL) { 189 if ((walker->level == PT_PAGE_TABLE_LEVEL) ||
190 walker->gfn = gpte_to_gfn(pte); 190 ((walker->level == PT_DIRECTORY_LEVEL) &&
191 break; 191 (pte & PT_PAGE_SIZE_MASK) &&
192 } 192 (PTTYPE == 64 || is_pse(vcpu))) ||
193 193 ((walker->level == PT_PDPE_LEVEL) &&
194 if (walker->level == PT_DIRECTORY_LEVEL 194 (pte & PT_PAGE_SIZE_MASK) &&
195 && (pte & PT_PAGE_SIZE_MASK) 195 is_long_mode(vcpu))) {
196 && (PTTYPE == 64 || is_pse(vcpu))) { 196 int lvl = walker->level;
197 walker->gfn = gpte_to_gfn_pde(pte); 197
198 walker->gfn += PT_INDEX(addr, PT_PAGE_TABLE_LEVEL); 198 walker->gfn = gpte_to_gfn_lvl(pte, lvl);
199 if (PTTYPE == 32 && is_cpuid_PSE36()) 199 walker->gfn += (addr & PT_LVL_OFFSET_MASK(lvl))
200 >> PAGE_SHIFT;
201
202 if (PTTYPE == 32 &&
203 walker->level == PT_DIRECTORY_LEVEL &&
204 is_cpuid_PSE36())
200 walker->gfn += pse36_gfn_delta(pte); 205 walker->gfn += pse36_gfn_delta(pte);
206
201 break; 207 break;
202 } 208 }
203 209
@@ -205,9 +211,10 @@ walk:
205 --walker->level; 211 --walker->level;
206 } 212 }
207 213
208 if (write_fault && !is_dirty_pte(pte)) { 214 if (write_fault && !is_dirty_gpte(pte)) {
209 bool ret; 215 bool ret;
210 216
217 trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte));
211 mark_page_dirty(vcpu->kvm, table_gfn); 218 mark_page_dirty(vcpu->kvm, table_gfn);
212 ret = FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, index, pte, 219 ret = FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, index, pte,
213 pte|PT_DIRTY_MASK); 220 pte|PT_DIRTY_MASK);
@@ -239,6 +246,7 @@ err:
239 walker->error_code |= PFERR_FETCH_MASK; 246 walker->error_code |= PFERR_FETCH_MASK;
240 if (rsvd_fault) 247 if (rsvd_fault)
241 walker->error_code |= PFERR_RSVD_MASK; 248 walker->error_code |= PFERR_RSVD_MASK;
249 trace_kvm_mmu_walker_error(walker->error_code);
242 return 0; 250 return 0;
243} 251}
244 252
@@ -248,12 +256,11 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
248 pt_element_t gpte; 256 pt_element_t gpte;
249 unsigned pte_access; 257 unsigned pte_access;
250 pfn_t pfn; 258 pfn_t pfn;
251 int largepage = vcpu->arch.update_pte.largepage;
252 259
253 gpte = *(const pt_element_t *)pte; 260 gpte = *(const pt_element_t *)pte;
254 if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) { 261 if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) {
255 if (!is_present_pte(gpte)) 262 if (!is_present_gpte(gpte))
256 set_shadow_pte(spte, shadow_notrap_nonpresent_pte); 263 __set_spte(spte, shadow_notrap_nonpresent_pte);
257 return; 264 return;
258 } 265 }
259 pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte); 266 pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);
@@ -267,7 +274,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
267 return; 274 return;
268 kvm_get_pfn(pfn); 275 kvm_get_pfn(pfn);
269 mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0, 276 mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0,
270 gpte & PT_DIRTY_MASK, NULL, largepage, 277 gpte & PT_DIRTY_MASK, NULL, PT_PAGE_TABLE_LEVEL,
271 gpte_to_gfn(gpte), pfn, true); 278 gpte_to_gfn(gpte), pfn, true);
272} 279}
273 280
@@ -276,7 +283,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
276 */ 283 */
277static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, 284static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
278 struct guest_walker *gw, 285 struct guest_walker *gw,
279 int user_fault, int write_fault, int largepage, 286 int user_fault, int write_fault, int hlevel,
280 int *ptwrite, pfn_t pfn) 287 int *ptwrite, pfn_t pfn)
281{ 288{
282 unsigned access = gw->pt_access; 289 unsigned access = gw->pt_access;
@@ -289,19 +296,18 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
289 pt_element_t curr_pte; 296 pt_element_t curr_pte;
290 struct kvm_shadow_walk_iterator iterator; 297 struct kvm_shadow_walk_iterator iterator;
291 298
292 if (!is_present_pte(gw->ptes[gw->level - 1])) 299 if (!is_present_gpte(gw->ptes[gw->level - 1]))
293 return NULL; 300 return NULL;
294 301
295 for_each_shadow_entry(vcpu, addr, iterator) { 302 for_each_shadow_entry(vcpu, addr, iterator) {
296 level = iterator.level; 303 level = iterator.level;
297 sptep = iterator.sptep; 304 sptep = iterator.sptep;
298 if (level == PT_PAGE_TABLE_LEVEL 305 if (iterator.level == hlevel) {
299 || (largepage && level == PT_DIRECTORY_LEVEL)) {
300 mmu_set_spte(vcpu, sptep, access, 306 mmu_set_spte(vcpu, sptep, access,
301 gw->pte_access & access, 307 gw->pte_access & access,
302 user_fault, write_fault, 308 user_fault, write_fault,
303 gw->ptes[gw->level-1] & PT_DIRTY_MASK, 309 gw->ptes[gw->level-1] & PT_DIRTY_MASK,
304 ptwrite, largepage, 310 ptwrite, level,
305 gw->gfn, pfn, false); 311 gw->gfn, pfn, false);
306 break; 312 break;
307 } 313 }
@@ -311,16 +317,19 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
311 317
312 if (is_large_pte(*sptep)) { 318 if (is_large_pte(*sptep)) {
313 rmap_remove(vcpu->kvm, sptep); 319 rmap_remove(vcpu->kvm, sptep);
314 set_shadow_pte(sptep, shadow_trap_nonpresent_pte); 320 __set_spte(sptep, shadow_trap_nonpresent_pte);
315 kvm_flush_remote_tlbs(vcpu->kvm); 321 kvm_flush_remote_tlbs(vcpu->kvm);
316 } 322 }
317 323
318 if (level == PT_DIRECTORY_LEVEL 324 if (level <= gw->level) {
319 && gw->level == PT_DIRECTORY_LEVEL) { 325 int delta = level - gw->level + 1;
320 direct = 1; 326 direct = 1;
321 if (!is_dirty_pte(gw->ptes[level - 1])) 327 if (!is_dirty_gpte(gw->ptes[level - delta]))
322 access &= ~ACC_WRITE_MASK; 328 access &= ~ACC_WRITE_MASK;
323 table_gfn = gpte_to_gfn(gw->ptes[level - 1]); 329 table_gfn = gpte_to_gfn(gw->ptes[level - delta]);
330 /* advance table_gfn when emulating 1gb pages with 4k */
331 if (delta == 0)
332 table_gfn += PT_INDEX(addr, level);
324 } else { 333 } else {
325 direct = 0; 334 direct = 0;
326 table_gfn = gw->table_gfn[level - 2]; 335 table_gfn = gw->table_gfn[level - 2];
@@ -369,11 +378,11 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
369 int user_fault = error_code & PFERR_USER_MASK; 378 int user_fault = error_code & PFERR_USER_MASK;
370 int fetch_fault = error_code & PFERR_FETCH_MASK; 379 int fetch_fault = error_code & PFERR_FETCH_MASK;
371 struct guest_walker walker; 380 struct guest_walker walker;
372 u64 *shadow_pte; 381 u64 *sptep;
373 int write_pt = 0; 382 int write_pt = 0;
374 int r; 383 int r;
375 pfn_t pfn; 384 pfn_t pfn;
376 int largepage = 0; 385 int level = PT_PAGE_TABLE_LEVEL;
377 unsigned long mmu_seq; 386 unsigned long mmu_seq;
378 387
379 pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code); 388 pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
@@ -399,14 +408,11 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
399 return 0; 408 return 0;
400 } 409 }
401 410
402 if (walker.level == PT_DIRECTORY_LEVEL) { 411 if (walker.level >= PT_DIRECTORY_LEVEL) {
403 gfn_t large_gfn; 412 level = min(walker.level, mapping_level(vcpu, walker.gfn));
404 large_gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE-1); 413 walker.gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1);
405 if (is_largepage_backed(vcpu, large_gfn)) {
406 walker.gfn = large_gfn;
407 largepage = 1;
408 }
409 } 414 }
415
410 mmu_seq = vcpu->kvm->mmu_notifier_seq; 416 mmu_seq = vcpu->kvm->mmu_notifier_seq;
411 smp_rmb(); 417 smp_rmb();
412 pfn = gfn_to_pfn(vcpu->kvm, walker.gfn); 418 pfn = gfn_to_pfn(vcpu->kvm, walker.gfn);
@@ -422,11 +428,10 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
422 if (mmu_notifier_retry(vcpu, mmu_seq)) 428 if (mmu_notifier_retry(vcpu, mmu_seq))
423 goto out_unlock; 429 goto out_unlock;
424 kvm_mmu_free_some_pages(vcpu); 430 kvm_mmu_free_some_pages(vcpu);
425 shadow_pte = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault, 431 sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
426 largepage, &write_pt, pfn); 432 level, &write_pt, pfn);
427
428 pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __func__, 433 pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __func__,
429 shadow_pte, *shadow_pte, write_pt); 434 sptep, *sptep, write_pt);
430 435
431 if (!write_pt) 436 if (!write_pt)
432 vcpu->arch.last_pt_write_count = 0; /* reset fork detector */ 437 vcpu->arch.last_pt_write_count = 0; /* reset fork detector */
@@ -459,8 +464,9 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
459 sptep = iterator.sptep; 464 sptep = iterator.sptep;
460 465
461 /* FIXME: properly handle invlpg on large guest pages */ 466 /* FIXME: properly handle invlpg on large guest pages */
462 if (level == PT_PAGE_TABLE_LEVEL || 467 if (level == PT_PAGE_TABLE_LEVEL ||
463 ((level == PT_DIRECTORY_LEVEL) && is_large_pte(*sptep))) { 468 ((level == PT_DIRECTORY_LEVEL && is_large_pte(*sptep))) ||
469 ((level == PT_PDPE_LEVEL && is_large_pte(*sptep)))) {
464 struct kvm_mmu_page *sp = page_header(__pa(sptep)); 470 struct kvm_mmu_page *sp = page_header(__pa(sptep));
465 471
466 pte_gpa = (sp->gfn << PAGE_SHIFT); 472 pte_gpa = (sp->gfn << PAGE_SHIFT);
@@ -472,7 +478,7 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
472 --vcpu->kvm->stat.lpages; 478 --vcpu->kvm->stat.lpages;
473 need_flush = 1; 479 need_flush = 1;
474 } 480 }
475 set_shadow_pte(sptep, shadow_trap_nonpresent_pte); 481 __set_spte(sptep, shadow_trap_nonpresent_pte);
476 break; 482 break;
477 } 483 }
478 484
@@ -489,7 +495,7 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
489 if (kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &gpte, 495 if (kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &gpte,
490 sizeof(pt_element_t))) 496 sizeof(pt_element_t)))
491 return; 497 return;
492 if (is_present_pte(gpte) && (gpte & PT_ACCESSED_MASK)) { 498 if (is_present_gpte(gpte) && (gpte & PT_ACCESSED_MASK)) {
493 if (mmu_topup_memory_caches(vcpu)) 499 if (mmu_topup_memory_caches(vcpu))
494 return; 500 return;
495 kvm_mmu_pte_write(vcpu, pte_gpa, (const u8 *)&gpte, 501 kvm_mmu_pte_write(vcpu, pte_gpa, (const u8 *)&gpte,
@@ -536,7 +542,7 @@ static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu,
536 r = kvm_read_guest_atomic(vcpu->kvm, pte_gpa, pt, sizeof pt); 542 r = kvm_read_guest_atomic(vcpu->kvm, pte_gpa, pt, sizeof pt);
537 pte_gpa += ARRAY_SIZE(pt) * sizeof(pt_element_t); 543 pte_gpa += ARRAY_SIZE(pt) * sizeof(pt_element_t);
538 for (j = 0; j < ARRAY_SIZE(pt); ++j) 544 for (j = 0; j < ARRAY_SIZE(pt); ++j)
539 if (r || is_present_pte(pt[j])) 545 if (r || is_present_gpte(pt[j]))
540 sp->spt[i+j] = shadow_trap_nonpresent_pte; 546 sp->spt[i+j] = shadow_trap_nonpresent_pte;
541 else 547 else
542 sp->spt[i+j] = shadow_notrap_nonpresent_pte; 548 sp->spt[i+j] = shadow_notrap_nonpresent_pte;
@@ -574,23 +580,23 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
574 sizeof(pt_element_t))) 580 sizeof(pt_element_t)))
575 return -EINVAL; 581 return -EINVAL;
576 582
577 if (gpte_to_gfn(gpte) != gfn || !is_present_pte(gpte) || 583 if (gpte_to_gfn(gpte) != gfn || !is_present_gpte(gpte) ||
578 !(gpte & PT_ACCESSED_MASK)) { 584 !(gpte & PT_ACCESSED_MASK)) {
579 u64 nonpresent; 585 u64 nonpresent;
580 586
581 rmap_remove(vcpu->kvm, &sp->spt[i]); 587 rmap_remove(vcpu->kvm, &sp->spt[i]);
582 if (is_present_pte(gpte)) 588 if (is_present_gpte(gpte))
583 nonpresent = shadow_trap_nonpresent_pte; 589 nonpresent = shadow_trap_nonpresent_pte;
584 else 590 else
585 nonpresent = shadow_notrap_nonpresent_pte; 591 nonpresent = shadow_notrap_nonpresent_pte;
586 set_shadow_pte(&sp->spt[i], nonpresent); 592 __set_spte(&sp->spt[i], nonpresent);
587 continue; 593 continue;
588 } 594 }
589 595
590 nr_present++; 596 nr_present++;
591 pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); 597 pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
592 set_spte(vcpu, &sp->spt[i], pte_access, 0, 0, 598 set_spte(vcpu, &sp->spt[i], pte_access, 0, 0,
593 is_dirty_pte(gpte), 0, gfn, 599 is_dirty_gpte(gpte), PT_PAGE_TABLE_LEVEL, gfn,
594 spte_to_pfn(sp->spt[i]), true, false); 600 spte_to_pfn(sp->spt[i]), true, false);
595 } 601 }
596 602
@@ -603,9 +609,10 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
603#undef PT_BASE_ADDR_MASK 609#undef PT_BASE_ADDR_MASK
604#undef PT_INDEX 610#undef PT_INDEX
605#undef PT_LEVEL_MASK 611#undef PT_LEVEL_MASK
606#undef PT_DIR_BASE_ADDR_MASK 612#undef PT_LVL_ADDR_MASK
613#undef PT_LVL_OFFSET_MASK
607#undef PT_LEVEL_BITS 614#undef PT_LEVEL_BITS
608#undef PT_MAX_FULL_LEVELS 615#undef PT_MAX_FULL_LEVELS
609#undef gpte_to_gfn 616#undef gpte_to_gfn
610#undef gpte_to_gfn_pde 617#undef gpte_to_gfn_lvl
611#undef CMPXCHG 618#undef CMPXCHG
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index b1f658ad2f06..944cc9c04b3c 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -15,7 +15,6 @@
15 */ 15 */
16#include <linux/kvm_host.h> 16#include <linux/kvm_host.h>
17 17
18#include "kvm_svm.h"
19#include "irq.h" 18#include "irq.h"
20#include "mmu.h" 19#include "mmu.h"
21#include "kvm_cache_regs.h" 20#include "kvm_cache_regs.h"
@@ -26,10 +25,12 @@
26#include <linux/vmalloc.h> 25#include <linux/vmalloc.h>
27#include <linux/highmem.h> 26#include <linux/highmem.h>
28#include <linux/sched.h> 27#include <linux/sched.h>
28#include <linux/ftrace_event.h>
29 29
30#include <asm/desc.h> 30#include <asm/desc.h>
31 31
32#include <asm/virtext.h> 32#include <asm/virtext.h>
33#include "trace.h"
33 34
34#define __ex(x) __kvm_handle_fault_on_reboot(x) 35#define __ex(x) __kvm_handle_fault_on_reboot(x)
35 36
@@ -46,6 +47,10 @@ MODULE_LICENSE("GPL");
46#define SVM_FEATURE_LBRV (1 << 1) 47#define SVM_FEATURE_LBRV (1 << 1)
47#define SVM_FEATURE_SVML (1 << 2) 48#define SVM_FEATURE_SVML (1 << 2)
48 49
50#define NESTED_EXIT_HOST 0 /* Exit handled on host level */
51#define NESTED_EXIT_DONE 1 /* Exit caused nested vmexit */
52#define NESTED_EXIT_CONTINUE 2 /* Further checks needed */
53
49#define DEBUGCTL_RESERVED_BITS (~(0x3fULL)) 54#define DEBUGCTL_RESERVED_BITS (~(0x3fULL))
50 55
51/* Turn on to get debugging output*/ 56/* Turn on to get debugging output*/
@@ -57,6 +62,58 @@ MODULE_LICENSE("GPL");
57#define nsvm_printk(fmt, args...) do {} while(0) 62#define nsvm_printk(fmt, args...) do {} while(0)
58#endif 63#endif
59 64
65static const u32 host_save_user_msrs[] = {
66#ifdef CONFIG_X86_64
67 MSR_STAR, MSR_LSTAR, MSR_CSTAR, MSR_SYSCALL_MASK, MSR_KERNEL_GS_BASE,
68 MSR_FS_BASE,
69#endif
70 MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
71};
72
73#define NR_HOST_SAVE_USER_MSRS ARRAY_SIZE(host_save_user_msrs)
74
75struct kvm_vcpu;
76
77struct nested_state {
78 struct vmcb *hsave;
79 u64 hsave_msr;
80 u64 vmcb;
81
82 /* These are the merged vectors */
83 u32 *msrpm;
84
85 /* gpa pointers to the real vectors */
86 u64 vmcb_msrpm;
87
88 /* cache for intercepts of the guest */
89 u16 intercept_cr_read;
90 u16 intercept_cr_write;
91 u16 intercept_dr_read;
92 u16 intercept_dr_write;
93 u32 intercept_exceptions;
94 u64 intercept;
95
96};
97
98struct vcpu_svm {
99 struct kvm_vcpu vcpu;
100 struct vmcb *vmcb;
101 unsigned long vmcb_pa;
102 struct svm_cpu_data *svm_data;
103 uint64_t asid_generation;
104 uint64_t sysenter_esp;
105 uint64_t sysenter_eip;
106
107 u64 next_rip;
108
109 u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS];
110 u64 host_gs_base;
111
112 u32 *msrpm;
113
114 struct nested_state nested;
115};
116
60/* enable NPT for AMD64 and X86 with PAE */ 117/* enable NPT for AMD64 and X86 with PAE */
61#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) 118#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
62static bool npt_enabled = true; 119static bool npt_enabled = true;
@@ -67,15 +124,14 @@ static int npt = 1;
67 124
68module_param(npt, int, S_IRUGO); 125module_param(npt, int, S_IRUGO);
69 126
70static int nested = 0; 127static int nested = 1;
71module_param(nested, int, S_IRUGO); 128module_param(nested, int, S_IRUGO);
72 129
73static void svm_flush_tlb(struct kvm_vcpu *vcpu); 130static void svm_flush_tlb(struct kvm_vcpu *vcpu);
131static void svm_complete_interrupts(struct vcpu_svm *svm);
74 132
75static int nested_svm_exit_handled(struct vcpu_svm *svm, bool kvm_override); 133static int nested_svm_exit_handled(struct vcpu_svm *svm);
76static int nested_svm_vmexit(struct vcpu_svm *svm); 134static int nested_svm_vmexit(struct vcpu_svm *svm);
77static int nested_svm_vmsave(struct vcpu_svm *svm, void *nested_vmcb,
78 void *arg2, void *opaque);
79static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, 135static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
80 bool has_error_code, u32 error_code); 136 bool has_error_code, u32 error_code);
81 137
@@ -86,7 +142,22 @@ static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu)
86 142
87static inline bool is_nested(struct vcpu_svm *svm) 143static inline bool is_nested(struct vcpu_svm *svm)
88{ 144{
89 return svm->nested_vmcb; 145 return svm->nested.vmcb;
146}
147
148static inline void enable_gif(struct vcpu_svm *svm)
149{
150 svm->vcpu.arch.hflags |= HF_GIF_MASK;
151}
152
153static inline void disable_gif(struct vcpu_svm *svm)
154{
155 svm->vcpu.arch.hflags &= ~HF_GIF_MASK;
156}
157
158static inline bool gif_set(struct vcpu_svm *svm)
159{
160 return !!(svm->vcpu.arch.hflags & HF_GIF_MASK);
90} 161}
91 162
92static unsigned long iopm_base; 163static unsigned long iopm_base;
@@ -147,19 +218,6 @@ static inline void invlpga(unsigned long addr, u32 asid)
147 asm volatile (__ex(SVM_INVLPGA) :: "a"(addr), "c"(asid)); 218 asm volatile (__ex(SVM_INVLPGA) :: "a"(addr), "c"(asid));
148} 219}
149 220
150static inline unsigned long kvm_read_cr2(void)
151{
152 unsigned long cr2;
153
154 asm volatile ("mov %%cr2, %0" : "=r" (cr2));
155 return cr2;
156}
157
158static inline void kvm_write_cr2(unsigned long val)
159{
160 asm volatile ("mov %0, %%cr2" :: "r" (val));
161}
162
163static inline void force_new_asid(struct kvm_vcpu *vcpu) 221static inline void force_new_asid(struct kvm_vcpu *vcpu)
164{ 222{
165 to_svm(vcpu)->asid_generation--; 223 to_svm(vcpu)->asid_generation--;
@@ -263,7 +321,7 @@ static void svm_hardware_enable(void *garbage)
263 321
264 struct svm_cpu_data *svm_data; 322 struct svm_cpu_data *svm_data;
265 uint64_t efer; 323 uint64_t efer;
266 struct desc_ptr gdt_descr; 324 struct descriptor_table gdt_descr;
267 struct desc_struct *gdt; 325 struct desc_struct *gdt;
268 int me = raw_smp_processor_id(); 326 int me = raw_smp_processor_id();
269 327
@@ -283,8 +341,8 @@ static void svm_hardware_enable(void *garbage)
283 svm_data->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1; 341 svm_data->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1;
284 svm_data->next_asid = svm_data->max_asid + 1; 342 svm_data->next_asid = svm_data->max_asid + 1;
285 343
286 asm volatile ("sgdt %0" : "=m"(gdt_descr)); 344 kvm_get_gdt(&gdt_descr);
287 gdt = (struct desc_struct *)gdt_descr.address; 345 gdt = (struct desc_struct *)gdt_descr.base;
288 svm_data->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS); 346 svm_data->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS);
289 347
290 rdmsrl(MSR_EFER, efer); 348 rdmsrl(MSR_EFER, efer);
@@ -367,8 +425,6 @@ static void svm_vcpu_init_msrpm(u32 *msrpm)
367#endif 425#endif
368 set_msr_interception(msrpm, MSR_K6_STAR, 1, 1); 426 set_msr_interception(msrpm, MSR_K6_STAR, 1, 1);
369 set_msr_interception(msrpm, MSR_IA32_SYSENTER_CS, 1, 1); 427 set_msr_interception(msrpm, MSR_IA32_SYSENTER_CS, 1, 1);
370 set_msr_interception(msrpm, MSR_IA32_SYSENTER_ESP, 1, 1);
371 set_msr_interception(msrpm, MSR_IA32_SYSENTER_EIP, 1, 1);
372} 428}
373 429
374static void svm_enable_lbrv(struct vcpu_svm *svm) 430static void svm_enable_lbrv(struct vcpu_svm *svm)
@@ -595,8 +651,10 @@ static void init_vmcb(struct vcpu_svm *svm)
595 } 651 }
596 force_new_asid(&svm->vcpu); 652 force_new_asid(&svm->vcpu);
597 653
598 svm->nested_vmcb = 0; 654 svm->nested.vmcb = 0;
599 svm->vcpu.arch.hflags = HF_GIF_MASK; 655 svm->vcpu.arch.hflags = 0;
656
657 enable_gif(svm);
600} 658}
601 659
602static int svm_vcpu_reset(struct kvm_vcpu *vcpu) 660static int svm_vcpu_reset(struct kvm_vcpu *vcpu)
@@ -605,7 +663,7 @@ static int svm_vcpu_reset(struct kvm_vcpu *vcpu)
605 663
606 init_vmcb(svm); 664 init_vmcb(svm);
607 665
608 if (vcpu->vcpu_id != 0) { 666 if (!kvm_vcpu_is_bsp(vcpu)) {
609 kvm_rip_write(vcpu, 0); 667 kvm_rip_write(vcpu, 0);
610 svm->vmcb->save.cs.base = svm->vcpu.arch.sipi_vector << 12; 668 svm->vmcb->save.cs.base = svm->vcpu.arch.sipi_vector << 12;
611 svm->vmcb->save.cs.selector = svm->vcpu.arch.sipi_vector << 8; 669 svm->vmcb->save.cs.selector = svm->vcpu.arch.sipi_vector << 8;
@@ -656,9 +714,9 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
656 hsave_page = alloc_page(GFP_KERNEL); 714 hsave_page = alloc_page(GFP_KERNEL);
657 if (!hsave_page) 715 if (!hsave_page)
658 goto uninit; 716 goto uninit;
659 svm->hsave = page_address(hsave_page); 717 svm->nested.hsave = page_address(hsave_page);
660 718
661 svm->nested_msrpm = page_address(nested_msrpm_pages); 719 svm->nested.msrpm = page_address(nested_msrpm_pages);
662 720
663 svm->vmcb = page_address(page); 721 svm->vmcb = page_address(page);
664 clear_page(svm->vmcb); 722 clear_page(svm->vmcb);
@@ -669,7 +727,7 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
669 fx_init(&svm->vcpu); 727 fx_init(&svm->vcpu);
670 svm->vcpu.fpu_active = 1; 728 svm->vcpu.fpu_active = 1;
671 svm->vcpu.arch.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; 729 svm->vcpu.arch.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
672 if (svm->vcpu.vcpu_id == 0) 730 if (kvm_vcpu_is_bsp(&svm->vcpu))
673 svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP; 731 svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP;
674 732
675 return &svm->vcpu; 733 return &svm->vcpu;
@@ -688,8 +746,8 @@ static void svm_free_vcpu(struct kvm_vcpu *vcpu)
688 746
689 __free_page(pfn_to_page(svm->vmcb_pa >> PAGE_SHIFT)); 747 __free_page(pfn_to_page(svm->vmcb_pa >> PAGE_SHIFT));
690 __free_pages(virt_to_page(svm->msrpm), MSRPM_ALLOC_ORDER); 748 __free_pages(virt_to_page(svm->msrpm), MSRPM_ALLOC_ORDER);
691 __free_page(virt_to_page(svm->hsave)); 749 __free_page(virt_to_page(svm->nested.hsave));
692 __free_pages(virt_to_page(svm->nested_msrpm), MSRPM_ALLOC_ORDER); 750 __free_pages(virt_to_page(svm->nested.msrpm), MSRPM_ALLOC_ORDER);
693 kvm_vcpu_uninit(vcpu); 751 kvm_vcpu_uninit(vcpu);
694 kmem_cache_free(kvm_vcpu_cache, svm); 752 kmem_cache_free(kvm_vcpu_cache, svm);
695} 753}
@@ -740,6 +798,18 @@ static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
740 to_svm(vcpu)->vmcb->save.rflags = rflags; 798 to_svm(vcpu)->vmcb->save.rflags = rflags;
741} 799}
742 800
801static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
802{
803 switch (reg) {
804 case VCPU_EXREG_PDPTR:
805 BUG_ON(!npt_enabled);
806 load_pdptrs(vcpu, vcpu->arch.cr3);
807 break;
808 default:
809 BUG();
810 }
811}
812
743static void svm_set_vintr(struct vcpu_svm *svm) 813static void svm_set_vintr(struct vcpu_svm *svm)
744{ 814{
745 svm->vmcb->control.intercept |= 1ULL << INTERCEPT_VINTR; 815 svm->vmcb->control.intercept |= 1ULL << INTERCEPT_VINTR;
@@ -1061,7 +1131,6 @@ static unsigned long svm_get_dr(struct kvm_vcpu *vcpu, int dr)
1061 val = 0; 1131 val = 0;
1062 } 1132 }
1063 1133
1064 KVMTRACE_2D(DR_READ, vcpu, (u32)dr, (u32)val, handler);
1065 return val; 1134 return val;
1066} 1135}
1067 1136
@@ -1070,8 +1139,6 @@ static void svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value,
1070{ 1139{
1071 struct vcpu_svm *svm = to_svm(vcpu); 1140 struct vcpu_svm *svm = to_svm(vcpu);
1072 1141
1073 KVMTRACE_2D(DR_WRITE, vcpu, (u32)dr, (u32)value, handler);
1074
1075 *exception = 0; 1142 *exception = 0;
1076 1143
1077 switch (dr) { 1144 switch (dr) {
@@ -1119,25 +1186,9 @@ static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1119 fault_address = svm->vmcb->control.exit_info_2; 1186 fault_address = svm->vmcb->control.exit_info_2;
1120 error_code = svm->vmcb->control.exit_info_1; 1187 error_code = svm->vmcb->control.exit_info_1;
1121 1188
1122 if (!npt_enabled) 1189 trace_kvm_page_fault(fault_address, error_code);
1123 KVMTRACE_3D(PAGE_FAULT, &svm->vcpu, error_code, 1190 if (!npt_enabled && kvm_event_needs_reinjection(&svm->vcpu))
1124 (u32)fault_address, (u32)(fault_address >> 32), 1191 kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address);
1125 handler);
1126 else
1127 KVMTRACE_3D(TDP_FAULT, &svm->vcpu, error_code,
1128 (u32)fault_address, (u32)(fault_address >> 32),
1129 handler);
1130 /*
1131 * FIXME: Tis shouldn't be necessary here, but there is a flush
1132 * missing in the MMU code. Until we find this bug, flush the
1133 * complete TLB here on an NPF
1134 */
1135 if (npt_enabled)
1136 svm_flush_tlb(&svm->vcpu);
1137 else {
1138 if (kvm_event_needs_reinjection(&svm->vcpu))
1139 kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address);
1140 }
1141 return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code); 1192 return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code);
1142} 1193}
1143 1194
@@ -1253,14 +1304,12 @@ static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1253 1304
1254static int nmi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1305static int nmi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1255{ 1306{
1256 KVMTRACE_0D(NMI, &svm->vcpu, handler);
1257 return 1; 1307 return 1;
1258} 1308}
1259 1309
1260static int intr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1310static int intr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1261{ 1311{
1262 ++svm->vcpu.stat.irq_exits; 1312 ++svm->vcpu.stat.irq_exits;
1263 KVMTRACE_0D(INTR, &svm->vcpu, handler);
1264 return 1; 1313 return 1;
1265} 1314}
1266 1315
@@ -1303,44 +1352,39 @@ static int nested_svm_check_permissions(struct vcpu_svm *svm)
1303static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, 1352static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
1304 bool has_error_code, u32 error_code) 1353 bool has_error_code, u32 error_code)
1305{ 1354{
1306 if (is_nested(svm)) { 1355 if (!is_nested(svm))
1307 svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + nr; 1356 return 0;
1308 svm->vmcb->control.exit_code_hi = 0;
1309 svm->vmcb->control.exit_info_1 = error_code;
1310 svm->vmcb->control.exit_info_2 = svm->vcpu.arch.cr2;
1311 if (nested_svm_exit_handled(svm, false)) {
1312 nsvm_printk("VMexit -> EXCP 0x%x\n", nr);
1313
1314 nested_svm_vmexit(svm);
1315 return 1;
1316 }
1317 }
1318 1357
1319 return 0; 1358 svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + nr;
1359 svm->vmcb->control.exit_code_hi = 0;
1360 svm->vmcb->control.exit_info_1 = error_code;
1361 svm->vmcb->control.exit_info_2 = svm->vcpu.arch.cr2;
1362
1363 return nested_svm_exit_handled(svm);
1320} 1364}
1321 1365
1322static inline int nested_svm_intr(struct vcpu_svm *svm) 1366static inline int nested_svm_intr(struct vcpu_svm *svm)
1323{ 1367{
1324 if (is_nested(svm)) { 1368 if (!is_nested(svm))
1325 if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK)) 1369 return 0;
1326 return 0;
1327 1370
1328 if (!(svm->vcpu.arch.hflags & HF_HIF_MASK)) 1371 if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK))
1329 return 0; 1372 return 0;
1330 1373
1331 svm->vmcb->control.exit_code = SVM_EXIT_INTR; 1374 if (!(svm->vcpu.arch.hflags & HF_HIF_MASK))
1375 return 0;
1332 1376
1333 if (nested_svm_exit_handled(svm, false)) { 1377 svm->vmcb->control.exit_code = SVM_EXIT_INTR;
1334 nsvm_printk("VMexit -> INTR\n"); 1378
1335 nested_svm_vmexit(svm); 1379 if (nested_svm_exit_handled(svm)) {
1336 return 1; 1380 nsvm_printk("VMexit -> INTR\n");
1337 } 1381 return 1;
1338 } 1382 }
1339 1383
1340 return 0; 1384 return 0;
1341} 1385}
1342 1386
1343static struct page *nested_svm_get_page(struct vcpu_svm *svm, u64 gpa) 1387static void *nested_svm_map(struct vcpu_svm *svm, u64 gpa, enum km_type idx)
1344{ 1388{
1345 struct page *page; 1389 struct page *page;
1346 1390
@@ -1348,236 +1392,246 @@ static struct page *nested_svm_get_page(struct vcpu_svm *svm, u64 gpa)
1348 page = gfn_to_page(svm->vcpu.kvm, gpa >> PAGE_SHIFT); 1392 page = gfn_to_page(svm->vcpu.kvm, gpa >> PAGE_SHIFT);
1349 up_read(&current->mm->mmap_sem); 1393 up_read(&current->mm->mmap_sem);
1350 1394
1351 if (is_error_page(page)) { 1395 if (is_error_page(page))
1352 printk(KERN_INFO "%s: could not find page at 0x%llx\n", 1396 goto error;
1353 __func__, gpa); 1397
1354 kvm_release_page_clean(page); 1398 return kmap_atomic(page, idx);
1355 kvm_inject_gp(&svm->vcpu, 0); 1399
1356 return NULL; 1400error:
1357 } 1401 kvm_release_page_clean(page);
1358 return page; 1402 kvm_inject_gp(&svm->vcpu, 0);
1403
1404 return NULL;
1359} 1405}
1360 1406
1361static int nested_svm_do(struct vcpu_svm *svm, 1407static void nested_svm_unmap(void *addr, enum km_type idx)
1362 u64 arg1_gpa, u64 arg2_gpa, void *opaque,
1363 int (*handler)(struct vcpu_svm *svm,
1364 void *arg1,
1365 void *arg2,
1366 void *opaque))
1367{ 1408{
1368 struct page *arg1_page; 1409 struct page *page;
1369 struct page *arg2_page = NULL;
1370 void *arg1;
1371 void *arg2 = NULL;
1372 int retval;
1373 1410
1374 arg1_page = nested_svm_get_page(svm, arg1_gpa); 1411 if (!addr)
1375 if(arg1_page == NULL) 1412 return;
1376 return 1;
1377 1413
1378 if (arg2_gpa) { 1414 page = kmap_atomic_to_page(addr);
1379 arg2_page = nested_svm_get_page(svm, arg2_gpa); 1415
1380 if(arg2_page == NULL) { 1416 kunmap_atomic(addr, idx);
1381 kvm_release_page_clean(arg1_page); 1417 kvm_release_page_dirty(page);
1382 return 1; 1418}
1383 } 1419
1384 } 1420static bool nested_svm_exit_handled_msr(struct vcpu_svm *svm)
1421{
1422 u32 param = svm->vmcb->control.exit_info_1 & 1;
1423 u32 msr = svm->vcpu.arch.regs[VCPU_REGS_RCX];
1424 bool ret = false;
1425 u32 t0, t1;
1426 u8 *msrpm;
1385 1427
1386 arg1 = kmap_atomic(arg1_page, KM_USER0); 1428 if (!(svm->nested.intercept & (1ULL << INTERCEPT_MSR_PROT)))
1387 if (arg2_gpa) 1429 return false;
1388 arg2 = kmap_atomic(arg2_page, KM_USER1);
1389 1430
1390 retval = handler(svm, arg1, arg2, opaque); 1431 msrpm = nested_svm_map(svm, svm->nested.vmcb_msrpm, KM_USER0);
1432
1433 if (!msrpm)
1434 goto out;
1435
1436 switch (msr) {
1437 case 0 ... 0x1fff:
1438 t0 = (msr * 2) % 8;
1439 t1 = msr / 8;
1440 break;
1441 case 0xc0000000 ... 0xc0001fff:
1442 t0 = (8192 + msr - 0xc0000000) * 2;
1443 t1 = (t0 / 8);
1444 t0 %= 8;
1445 break;
1446 case 0xc0010000 ... 0xc0011fff:
1447 t0 = (16384 + msr - 0xc0010000) * 2;
1448 t1 = (t0 / 8);
1449 t0 %= 8;
1450 break;
1451 default:
1452 ret = true;
1453 goto out;
1454 }
1391 1455
1392 kunmap_atomic(arg1, KM_USER0); 1456 ret = msrpm[t1] & ((1 << param) << t0);
1393 if (arg2_gpa)
1394 kunmap_atomic(arg2, KM_USER1);
1395 1457
1396 kvm_release_page_dirty(arg1_page); 1458out:
1397 if (arg2_gpa) 1459 nested_svm_unmap(msrpm, KM_USER0);
1398 kvm_release_page_dirty(arg2_page);
1399 1460
1400 return retval; 1461 return ret;
1401} 1462}
1402 1463
1403static int nested_svm_exit_handled_real(struct vcpu_svm *svm, 1464static int nested_svm_exit_special(struct vcpu_svm *svm)
1404 void *arg1,
1405 void *arg2,
1406 void *opaque)
1407{ 1465{
1408 struct vmcb *nested_vmcb = (struct vmcb *)arg1;
1409 bool kvm_overrides = *(bool *)opaque;
1410 u32 exit_code = svm->vmcb->control.exit_code; 1466 u32 exit_code = svm->vmcb->control.exit_code;
1411 1467
1412 if (kvm_overrides) { 1468 switch (exit_code) {
1413 switch (exit_code) { 1469 case SVM_EXIT_INTR:
1414 case SVM_EXIT_INTR: 1470 case SVM_EXIT_NMI:
1415 case SVM_EXIT_NMI: 1471 return NESTED_EXIT_HOST;
1416 return 0;
1417 /* For now we are always handling NPFs when using them */ 1472 /* For now we are always handling NPFs when using them */
1418 case SVM_EXIT_NPF: 1473 case SVM_EXIT_NPF:
1419 if (npt_enabled) 1474 if (npt_enabled)
1420 return 0; 1475 return NESTED_EXIT_HOST;
1421 break; 1476 break;
1422 /* When we're shadowing, trap PFs */ 1477 /* When we're shadowing, trap PFs */
1423 case SVM_EXIT_EXCP_BASE + PF_VECTOR: 1478 case SVM_EXIT_EXCP_BASE + PF_VECTOR:
1424 if (!npt_enabled) 1479 if (!npt_enabled)
1425 return 0; 1480 return NESTED_EXIT_HOST;
1426 break; 1481 break;
1427 default: 1482 default:
1428 break; 1483 break;
1429 }
1430 } 1484 }
1431 1485
1486 return NESTED_EXIT_CONTINUE;
1487}
1488
1489/*
1490 * If this function returns true, this #vmexit was already handled
1491 */
1492static int nested_svm_exit_handled(struct vcpu_svm *svm)
1493{
1494 u32 exit_code = svm->vmcb->control.exit_code;
1495 int vmexit = NESTED_EXIT_HOST;
1496
1432 switch (exit_code) { 1497 switch (exit_code) {
1498 case SVM_EXIT_MSR:
1499 vmexit = nested_svm_exit_handled_msr(svm);
1500 break;
1433 case SVM_EXIT_READ_CR0 ... SVM_EXIT_READ_CR8: { 1501 case SVM_EXIT_READ_CR0 ... SVM_EXIT_READ_CR8: {
1434 u32 cr_bits = 1 << (exit_code - SVM_EXIT_READ_CR0); 1502 u32 cr_bits = 1 << (exit_code - SVM_EXIT_READ_CR0);
1435 if (nested_vmcb->control.intercept_cr_read & cr_bits) 1503 if (svm->nested.intercept_cr_read & cr_bits)
1436 return 1; 1504 vmexit = NESTED_EXIT_DONE;
1437 break; 1505 break;
1438 } 1506 }
1439 case SVM_EXIT_WRITE_CR0 ... SVM_EXIT_WRITE_CR8: { 1507 case SVM_EXIT_WRITE_CR0 ... SVM_EXIT_WRITE_CR8: {
1440 u32 cr_bits = 1 << (exit_code - SVM_EXIT_WRITE_CR0); 1508 u32 cr_bits = 1 << (exit_code - SVM_EXIT_WRITE_CR0);
1441 if (nested_vmcb->control.intercept_cr_write & cr_bits) 1509 if (svm->nested.intercept_cr_write & cr_bits)
1442 return 1; 1510 vmexit = NESTED_EXIT_DONE;
1443 break; 1511 break;
1444 } 1512 }
1445 case SVM_EXIT_READ_DR0 ... SVM_EXIT_READ_DR7: { 1513 case SVM_EXIT_READ_DR0 ... SVM_EXIT_READ_DR7: {
1446 u32 dr_bits = 1 << (exit_code - SVM_EXIT_READ_DR0); 1514 u32 dr_bits = 1 << (exit_code - SVM_EXIT_READ_DR0);
1447 if (nested_vmcb->control.intercept_dr_read & dr_bits) 1515 if (svm->nested.intercept_dr_read & dr_bits)
1448 return 1; 1516 vmexit = NESTED_EXIT_DONE;
1449 break; 1517 break;
1450 } 1518 }
1451 case SVM_EXIT_WRITE_DR0 ... SVM_EXIT_WRITE_DR7: { 1519 case SVM_EXIT_WRITE_DR0 ... SVM_EXIT_WRITE_DR7: {
1452 u32 dr_bits = 1 << (exit_code - SVM_EXIT_WRITE_DR0); 1520 u32 dr_bits = 1 << (exit_code - SVM_EXIT_WRITE_DR0);
1453 if (nested_vmcb->control.intercept_dr_write & dr_bits) 1521 if (svm->nested.intercept_dr_write & dr_bits)
1454 return 1; 1522 vmexit = NESTED_EXIT_DONE;
1455 break; 1523 break;
1456 } 1524 }
1457 case SVM_EXIT_EXCP_BASE ... SVM_EXIT_EXCP_BASE + 0x1f: { 1525 case SVM_EXIT_EXCP_BASE ... SVM_EXIT_EXCP_BASE + 0x1f: {
1458 u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE); 1526 u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE);
1459 if (nested_vmcb->control.intercept_exceptions & excp_bits) 1527 if (svm->nested.intercept_exceptions & excp_bits)
1460 return 1; 1528 vmexit = NESTED_EXIT_DONE;
1461 break; 1529 break;
1462 } 1530 }
1463 default: { 1531 default: {
1464 u64 exit_bits = 1ULL << (exit_code - SVM_EXIT_INTR); 1532 u64 exit_bits = 1ULL << (exit_code - SVM_EXIT_INTR);
1465 nsvm_printk("exit code: 0x%x\n", exit_code); 1533 nsvm_printk("exit code: 0x%x\n", exit_code);
1466 if (nested_vmcb->control.intercept & exit_bits) 1534 if (svm->nested.intercept & exit_bits)
1467 return 1; 1535 vmexit = NESTED_EXIT_DONE;
1468 } 1536 }
1469 } 1537 }
1470 1538
1471 return 0; 1539 if (vmexit == NESTED_EXIT_DONE) {
1472} 1540 nsvm_printk("#VMEXIT reason=%04x\n", exit_code);
1473 1541 nested_svm_vmexit(svm);
1474static int nested_svm_exit_handled_msr(struct vcpu_svm *svm,
1475 void *arg1, void *arg2,
1476 void *opaque)
1477{
1478 struct vmcb *nested_vmcb = (struct vmcb *)arg1;
1479 u8 *msrpm = (u8 *)arg2;
1480 u32 t0, t1;
1481 u32 msr = svm->vcpu.arch.regs[VCPU_REGS_RCX];
1482 u32 param = svm->vmcb->control.exit_info_1 & 1;
1483
1484 if (!(nested_vmcb->control.intercept & (1ULL << INTERCEPT_MSR_PROT)))
1485 return 0;
1486
1487 switch(msr) {
1488 case 0 ... 0x1fff:
1489 t0 = (msr * 2) % 8;
1490 t1 = msr / 8;
1491 break;
1492 case 0xc0000000 ... 0xc0001fff:
1493 t0 = (8192 + msr - 0xc0000000) * 2;
1494 t1 = (t0 / 8);
1495 t0 %= 8;
1496 break;
1497 case 0xc0010000 ... 0xc0011fff:
1498 t0 = (16384 + msr - 0xc0010000) * 2;
1499 t1 = (t0 / 8);
1500 t0 %= 8;
1501 break;
1502 default:
1503 return 1;
1504 break;
1505 } 1542 }
1506 if (msrpm[t1] & ((1 << param) << t0))
1507 return 1;
1508 1543
1509 return 0; 1544 return vmexit;
1545}
1546
1547static inline void copy_vmcb_control_area(struct vmcb *dst_vmcb, struct vmcb *from_vmcb)
1548{
1549 struct vmcb_control_area *dst = &dst_vmcb->control;
1550 struct vmcb_control_area *from = &from_vmcb->control;
1551
1552 dst->intercept_cr_read = from->intercept_cr_read;
1553 dst->intercept_cr_write = from->intercept_cr_write;
1554 dst->intercept_dr_read = from->intercept_dr_read;
1555 dst->intercept_dr_write = from->intercept_dr_write;
1556 dst->intercept_exceptions = from->intercept_exceptions;
1557 dst->intercept = from->intercept;
1558 dst->iopm_base_pa = from->iopm_base_pa;
1559 dst->msrpm_base_pa = from->msrpm_base_pa;
1560 dst->tsc_offset = from->tsc_offset;
1561 dst->asid = from->asid;
1562 dst->tlb_ctl = from->tlb_ctl;
1563 dst->int_ctl = from->int_ctl;
1564 dst->int_vector = from->int_vector;
1565 dst->int_state = from->int_state;
1566 dst->exit_code = from->exit_code;
1567 dst->exit_code_hi = from->exit_code_hi;
1568 dst->exit_info_1 = from->exit_info_1;
1569 dst->exit_info_2 = from->exit_info_2;
1570 dst->exit_int_info = from->exit_int_info;
1571 dst->exit_int_info_err = from->exit_int_info_err;
1572 dst->nested_ctl = from->nested_ctl;
1573 dst->event_inj = from->event_inj;
1574 dst->event_inj_err = from->event_inj_err;
1575 dst->nested_cr3 = from->nested_cr3;
1576 dst->lbr_ctl = from->lbr_ctl;
1510} 1577}
1511 1578
1512static int nested_svm_exit_handled(struct vcpu_svm *svm, bool kvm_override) 1579static int nested_svm_vmexit(struct vcpu_svm *svm)
1513{ 1580{
1514 bool k = kvm_override; 1581 struct vmcb *nested_vmcb;
1515 1582 struct vmcb *hsave = svm->nested.hsave;
1516 switch (svm->vmcb->control.exit_code) { 1583 struct vmcb *vmcb = svm->vmcb;
1517 case SVM_EXIT_MSR:
1518 return nested_svm_do(svm, svm->nested_vmcb,
1519 svm->nested_vmcb_msrpm, NULL,
1520 nested_svm_exit_handled_msr);
1521 default: break;
1522 }
1523 1584
1524 return nested_svm_do(svm, svm->nested_vmcb, 0, &k, 1585 nested_vmcb = nested_svm_map(svm, svm->nested.vmcb, KM_USER0);
1525 nested_svm_exit_handled_real); 1586 if (!nested_vmcb)
1526} 1587 return 1;
1527
1528static int nested_svm_vmexit_real(struct vcpu_svm *svm, void *arg1,
1529 void *arg2, void *opaque)
1530{
1531 struct vmcb *nested_vmcb = (struct vmcb *)arg1;
1532 struct vmcb *hsave = svm->hsave;
1533 u64 nested_save[] = { nested_vmcb->save.cr0,
1534 nested_vmcb->save.cr3,
1535 nested_vmcb->save.cr4,
1536 nested_vmcb->save.efer,
1537 nested_vmcb->control.intercept_cr_read,
1538 nested_vmcb->control.intercept_cr_write,
1539 nested_vmcb->control.intercept_dr_read,
1540 nested_vmcb->control.intercept_dr_write,
1541 nested_vmcb->control.intercept_exceptions,
1542 nested_vmcb->control.intercept,
1543 nested_vmcb->control.msrpm_base_pa,
1544 nested_vmcb->control.iopm_base_pa,
1545 nested_vmcb->control.tsc_offset };
1546 1588
1547 /* Give the current vmcb to the guest */ 1589 /* Give the current vmcb to the guest */
1548 memcpy(nested_vmcb, svm->vmcb, sizeof(struct vmcb)); 1590 disable_gif(svm);
1549 nested_vmcb->save.cr0 = nested_save[0]; 1591
1550 if (!npt_enabled) 1592 nested_vmcb->save.es = vmcb->save.es;
1551 nested_vmcb->save.cr3 = nested_save[1]; 1593 nested_vmcb->save.cs = vmcb->save.cs;
1552 nested_vmcb->save.cr4 = nested_save[2]; 1594 nested_vmcb->save.ss = vmcb->save.ss;
1553 nested_vmcb->save.efer = nested_save[3]; 1595 nested_vmcb->save.ds = vmcb->save.ds;
1554 nested_vmcb->control.intercept_cr_read = nested_save[4]; 1596 nested_vmcb->save.gdtr = vmcb->save.gdtr;
1555 nested_vmcb->control.intercept_cr_write = nested_save[5]; 1597 nested_vmcb->save.idtr = vmcb->save.idtr;
1556 nested_vmcb->control.intercept_dr_read = nested_save[6]; 1598 if (npt_enabled)
1557 nested_vmcb->control.intercept_dr_write = nested_save[7]; 1599 nested_vmcb->save.cr3 = vmcb->save.cr3;
1558 nested_vmcb->control.intercept_exceptions = nested_save[8]; 1600 nested_vmcb->save.cr2 = vmcb->save.cr2;
1559 nested_vmcb->control.intercept = nested_save[9]; 1601 nested_vmcb->save.rflags = vmcb->save.rflags;
1560 nested_vmcb->control.msrpm_base_pa = nested_save[10]; 1602 nested_vmcb->save.rip = vmcb->save.rip;
1561 nested_vmcb->control.iopm_base_pa = nested_save[11]; 1603 nested_vmcb->save.rsp = vmcb->save.rsp;
1562 nested_vmcb->control.tsc_offset = nested_save[12]; 1604 nested_vmcb->save.rax = vmcb->save.rax;
1605 nested_vmcb->save.dr7 = vmcb->save.dr7;
1606 nested_vmcb->save.dr6 = vmcb->save.dr6;
1607 nested_vmcb->save.cpl = vmcb->save.cpl;
1608
1609 nested_vmcb->control.int_ctl = vmcb->control.int_ctl;
1610 nested_vmcb->control.int_vector = vmcb->control.int_vector;
1611 nested_vmcb->control.int_state = vmcb->control.int_state;
1612 nested_vmcb->control.exit_code = vmcb->control.exit_code;
1613 nested_vmcb->control.exit_code_hi = vmcb->control.exit_code_hi;
1614 nested_vmcb->control.exit_info_1 = vmcb->control.exit_info_1;
1615 nested_vmcb->control.exit_info_2 = vmcb->control.exit_info_2;
1616 nested_vmcb->control.exit_int_info = vmcb->control.exit_int_info;
1617 nested_vmcb->control.exit_int_info_err = vmcb->control.exit_int_info_err;
1618 nested_vmcb->control.tlb_ctl = 0;
1619 nested_vmcb->control.event_inj = 0;
1620 nested_vmcb->control.event_inj_err = 0;
1563 1621
1564 /* We always set V_INTR_MASKING and remember the old value in hflags */ 1622 /* We always set V_INTR_MASKING and remember the old value in hflags */
1565 if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK)) 1623 if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK))
1566 nested_vmcb->control.int_ctl &= ~V_INTR_MASKING_MASK; 1624 nested_vmcb->control.int_ctl &= ~V_INTR_MASKING_MASK;
1567 1625
1568 if ((nested_vmcb->control.int_ctl & V_IRQ_MASK) &&
1569 (nested_vmcb->control.int_vector)) {
1570 nsvm_printk("WARNING: IRQ 0x%x still enabled on #VMEXIT\n",
1571 nested_vmcb->control.int_vector);
1572 }
1573
1574 /* Restore the original control entries */ 1626 /* Restore the original control entries */
1575 svm->vmcb->control = hsave->control; 1627 copy_vmcb_control_area(vmcb, hsave);
1576 1628
1577 /* Kill any pending exceptions */ 1629 /* Kill any pending exceptions */
1578 if (svm->vcpu.arch.exception.pending == true) 1630 if (svm->vcpu.arch.exception.pending == true)
1579 nsvm_printk("WARNING: Pending Exception\n"); 1631 nsvm_printk("WARNING: Pending Exception\n");
1580 svm->vcpu.arch.exception.pending = false; 1632
1633 kvm_clear_exception_queue(&svm->vcpu);
1634 kvm_clear_interrupt_queue(&svm->vcpu);
1581 1635
1582 /* Restore selected save entries */ 1636 /* Restore selected save entries */
1583 svm->vmcb->save.es = hsave->save.es; 1637 svm->vmcb->save.es = hsave->save.es;
@@ -1603,19 +1657,10 @@ static int nested_svm_vmexit_real(struct vcpu_svm *svm, void *arg1,
1603 svm->vmcb->save.cpl = 0; 1657 svm->vmcb->save.cpl = 0;
1604 svm->vmcb->control.exit_int_info = 0; 1658 svm->vmcb->control.exit_int_info = 0;
1605 1659
1606 svm->vcpu.arch.hflags &= ~HF_GIF_MASK;
1607 /* Exit nested SVM mode */ 1660 /* Exit nested SVM mode */
1608 svm->nested_vmcb = 0; 1661 svm->nested.vmcb = 0;
1609 1662
1610 return 0; 1663 nested_svm_unmap(nested_vmcb, KM_USER0);
1611}
1612
1613static int nested_svm_vmexit(struct vcpu_svm *svm)
1614{
1615 nsvm_printk("VMexit\n");
1616 if (nested_svm_do(svm, svm->nested_vmcb, 0,
1617 NULL, nested_svm_vmexit_real))
1618 return 1;
1619 1664
1620 kvm_mmu_reset_context(&svm->vcpu); 1665 kvm_mmu_reset_context(&svm->vcpu);
1621 kvm_mmu_load(&svm->vcpu); 1666 kvm_mmu_load(&svm->vcpu);
@@ -1623,38 +1668,63 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
1623 return 0; 1668 return 0;
1624} 1669}
1625 1670
1626static int nested_svm_vmrun_msrpm(struct vcpu_svm *svm, void *arg1, 1671static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
1627 void *arg2, void *opaque)
1628{ 1672{
1673 u32 *nested_msrpm;
1629 int i; 1674 int i;
1630 u32 *nested_msrpm = (u32*)arg1; 1675
1676 nested_msrpm = nested_svm_map(svm, svm->nested.vmcb_msrpm, KM_USER0);
1677 if (!nested_msrpm)
1678 return false;
1679
1631 for (i=0; i< PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER) / 4; i++) 1680 for (i=0; i< PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER) / 4; i++)
1632 svm->nested_msrpm[i] = svm->msrpm[i] | nested_msrpm[i]; 1681 svm->nested.msrpm[i] = svm->msrpm[i] | nested_msrpm[i];
1633 svm->vmcb->control.msrpm_base_pa = __pa(svm->nested_msrpm);
1634 1682
1635 return 0; 1683 svm->vmcb->control.msrpm_base_pa = __pa(svm->nested.msrpm);
1684
1685 nested_svm_unmap(nested_msrpm, KM_USER0);
1686
1687 return true;
1636} 1688}
1637 1689
1638static int nested_svm_vmrun(struct vcpu_svm *svm, void *arg1, 1690static bool nested_svm_vmrun(struct vcpu_svm *svm)
1639 void *arg2, void *opaque)
1640{ 1691{
1641 struct vmcb *nested_vmcb = (struct vmcb *)arg1; 1692 struct vmcb *nested_vmcb;
1642 struct vmcb *hsave = svm->hsave; 1693 struct vmcb *hsave = svm->nested.hsave;
1694 struct vmcb *vmcb = svm->vmcb;
1695
1696 nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, KM_USER0);
1697 if (!nested_vmcb)
1698 return false;
1643 1699
1644 /* nested_vmcb is our indicator if nested SVM is activated */ 1700 /* nested_vmcb is our indicator if nested SVM is activated */
1645 svm->nested_vmcb = svm->vmcb->save.rax; 1701 svm->nested.vmcb = svm->vmcb->save.rax;
1646 1702
1647 /* Clear internal status */ 1703 /* Clear internal status */
1648 svm->vcpu.arch.exception.pending = false; 1704 kvm_clear_exception_queue(&svm->vcpu);
1705 kvm_clear_interrupt_queue(&svm->vcpu);
1649 1706
1650 /* Save the old vmcb, so we don't need to pick what we save, but 1707 /* Save the old vmcb, so we don't need to pick what we save, but
1651 can restore everything when a VMEXIT occurs */ 1708 can restore everything when a VMEXIT occurs */
1652 memcpy(hsave, svm->vmcb, sizeof(struct vmcb)); 1709 hsave->save.es = vmcb->save.es;
1653 /* We need to remember the original CR3 in the SPT case */ 1710 hsave->save.cs = vmcb->save.cs;
1654 if (!npt_enabled) 1711 hsave->save.ss = vmcb->save.ss;
1655 hsave->save.cr3 = svm->vcpu.arch.cr3; 1712 hsave->save.ds = vmcb->save.ds;
1656 hsave->save.cr4 = svm->vcpu.arch.cr4; 1713 hsave->save.gdtr = vmcb->save.gdtr;
1657 hsave->save.rip = svm->next_rip; 1714 hsave->save.idtr = vmcb->save.idtr;
1715 hsave->save.efer = svm->vcpu.arch.shadow_efer;
1716 hsave->save.cr0 = svm->vcpu.arch.cr0;
1717 hsave->save.cr4 = svm->vcpu.arch.cr4;
1718 hsave->save.rflags = vmcb->save.rflags;
1719 hsave->save.rip = svm->next_rip;
1720 hsave->save.rsp = vmcb->save.rsp;
1721 hsave->save.rax = vmcb->save.rax;
1722 if (npt_enabled)
1723 hsave->save.cr3 = vmcb->save.cr3;
1724 else
1725 hsave->save.cr3 = svm->vcpu.arch.cr3;
1726
1727 copy_vmcb_control_area(hsave, vmcb);
1658 1728
1659 if (svm->vmcb->save.rflags & X86_EFLAGS_IF) 1729 if (svm->vmcb->save.rflags & X86_EFLAGS_IF)
1660 svm->vcpu.arch.hflags |= HF_HIF_MASK; 1730 svm->vcpu.arch.hflags |= HF_HIF_MASK;
@@ -1679,7 +1749,7 @@ static int nested_svm_vmrun(struct vcpu_svm *svm, void *arg1,
1679 kvm_set_cr3(&svm->vcpu, nested_vmcb->save.cr3); 1749 kvm_set_cr3(&svm->vcpu, nested_vmcb->save.cr3);
1680 kvm_mmu_reset_context(&svm->vcpu); 1750 kvm_mmu_reset_context(&svm->vcpu);
1681 } 1751 }
1682 svm->vmcb->save.cr2 = nested_vmcb->save.cr2; 1752 svm->vmcb->save.cr2 = svm->vcpu.arch.cr2 = nested_vmcb->save.cr2;
1683 kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, nested_vmcb->save.rax); 1753 kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, nested_vmcb->save.rax);
1684 kvm_register_write(&svm->vcpu, VCPU_REGS_RSP, nested_vmcb->save.rsp); 1754 kvm_register_write(&svm->vcpu, VCPU_REGS_RSP, nested_vmcb->save.rsp);
1685 kvm_register_write(&svm->vcpu, VCPU_REGS_RIP, nested_vmcb->save.rip); 1755 kvm_register_write(&svm->vcpu, VCPU_REGS_RIP, nested_vmcb->save.rip);
@@ -1706,7 +1776,15 @@ static int nested_svm_vmrun(struct vcpu_svm *svm, void *arg1,
1706 1776
1707 svm->vmcb->control.intercept |= nested_vmcb->control.intercept; 1777 svm->vmcb->control.intercept |= nested_vmcb->control.intercept;
1708 1778
1709 svm->nested_vmcb_msrpm = nested_vmcb->control.msrpm_base_pa; 1779 svm->nested.vmcb_msrpm = nested_vmcb->control.msrpm_base_pa;
1780
1781 /* cache intercepts */
1782 svm->nested.intercept_cr_read = nested_vmcb->control.intercept_cr_read;
1783 svm->nested.intercept_cr_write = nested_vmcb->control.intercept_cr_write;
1784 svm->nested.intercept_dr_read = nested_vmcb->control.intercept_dr_read;
1785 svm->nested.intercept_dr_write = nested_vmcb->control.intercept_dr_write;
1786 svm->nested.intercept_exceptions = nested_vmcb->control.intercept_exceptions;
1787 svm->nested.intercept = nested_vmcb->control.intercept;
1710 1788
1711 force_new_asid(&svm->vcpu); 1789 force_new_asid(&svm->vcpu);
1712 svm->vmcb->control.exit_int_info = nested_vmcb->control.exit_int_info; 1790 svm->vmcb->control.exit_int_info = nested_vmcb->control.exit_int_info;
@@ -1734,12 +1812,14 @@ static int nested_svm_vmrun(struct vcpu_svm *svm, void *arg1,
1734 svm->vmcb->control.event_inj = nested_vmcb->control.event_inj; 1812 svm->vmcb->control.event_inj = nested_vmcb->control.event_inj;
1735 svm->vmcb->control.event_inj_err = nested_vmcb->control.event_inj_err; 1813 svm->vmcb->control.event_inj_err = nested_vmcb->control.event_inj_err;
1736 1814
1737 svm->vcpu.arch.hflags |= HF_GIF_MASK; 1815 nested_svm_unmap(nested_vmcb, KM_USER0);
1738 1816
1739 return 0; 1817 enable_gif(svm);
1818
1819 return true;
1740} 1820}
1741 1821
1742static int nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb) 1822static void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb)
1743{ 1823{
1744 to_vmcb->save.fs = from_vmcb->save.fs; 1824 to_vmcb->save.fs = from_vmcb->save.fs;
1745 to_vmcb->save.gs = from_vmcb->save.gs; 1825 to_vmcb->save.gs = from_vmcb->save.gs;
@@ -1753,44 +1833,44 @@ static int nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb)
1753 to_vmcb->save.sysenter_cs = from_vmcb->save.sysenter_cs; 1833 to_vmcb->save.sysenter_cs = from_vmcb->save.sysenter_cs;
1754 to_vmcb->save.sysenter_esp = from_vmcb->save.sysenter_esp; 1834 to_vmcb->save.sysenter_esp = from_vmcb->save.sysenter_esp;
1755 to_vmcb->save.sysenter_eip = from_vmcb->save.sysenter_eip; 1835 to_vmcb->save.sysenter_eip = from_vmcb->save.sysenter_eip;
1756
1757 return 1;
1758}
1759
1760static int nested_svm_vmload(struct vcpu_svm *svm, void *nested_vmcb,
1761 void *arg2, void *opaque)
1762{
1763 return nested_svm_vmloadsave((struct vmcb *)nested_vmcb, svm->vmcb);
1764}
1765
1766static int nested_svm_vmsave(struct vcpu_svm *svm, void *nested_vmcb,
1767 void *arg2, void *opaque)
1768{
1769 return nested_svm_vmloadsave(svm->vmcb, (struct vmcb *)nested_vmcb);
1770} 1836}
1771 1837
1772static int vmload_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1838static int vmload_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1773{ 1839{
1840 struct vmcb *nested_vmcb;
1841
1774 if (nested_svm_check_permissions(svm)) 1842 if (nested_svm_check_permissions(svm))
1775 return 1; 1843 return 1;
1776 1844
1777 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; 1845 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
1778 skip_emulated_instruction(&svm->vcpu); 1846 skip_emulated_instruction(&svm->vcpu);
1779 1847
1780 nested_svm_do(svm, svm->vmcb->save.rax, 0, NULL, nested_svm_vmload); 1848 nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, KM_USER0);
1849 if (!nested_vmcb)
1850 return 1;
1851
1852 nested_svm_vmloadsave(nested_vmcb, svm->vmcb);
1853 nested_svm_unmap(nested_vmcb, KM_USER0);
1781 1854
1782 return 1; 1855 return 1;
1783} 1856}
1784 1857
1785static int vmsave_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1858static int vmsave_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1786{ 1859{
1860 struct vmcb *nested_vmcb;
1861
1787 if (nested_svm_check_permissions(svm)) 1862 if (nested_svm_check_permissions(svm))
1788 return 1; 1863 return 1;
1789 1864
1790 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; 1865 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
1791 skip_emulated_instruction(&svm->vcpu); 1866 skip_emulated_instruction(&svm->vcpu);
1792 1867
1793 nested_svm_do(svm, svm->vmcb->save.rax, 0, NULL, nested_svm_vmsave); 1868 nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, KM_USER0);
1869 if (!nested_vmcb)
1870 return 1;
1871
1872 nested_svm_vmloadsave(svm->vmcb, nested_vmcb);
1873 nested_svm_unmap(nested_vmcb, KM_USER0);
1794 1874
1795 return 1; 1875 return 1;
1796} 1876}
@@ -1798,19 +1878,29 @@ static int vmsave_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1798static int vmrun_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1878static int vmrun_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1799{ 1879{
1800 nsvm_printk("VMrun\n"); 1880 nsvm_printk("VMrun\n");
1881
1801 if (nested_svm_check_permissions(svm)) 1882 if (nested_svm_check_permissions(svm))
1802 return 1; 1883 return 1;
1803 1884
1804 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; 1885 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
1805 skip_emulated_instruction(&svm->vcpu); 1886 skip_emulated_instruction(&svm->vcpu);
1806 1887
1807 if (nested_svm_do(svm, svm->vmcb->save.rax, 0, 1888 if (!nested_svm_vmrun(svm))
1808 NULL, nested_svm_vmrun))
1809 return 1; 1889 return 1;
1810 1890
1811 if (nested_svm_do(svm, svm->nested_vmcb_msrpm, 0, 1891 if (!nested_svm_vmrun_msrpm(svm))
1812 NULL, nested_svm_vmrun_msrpm)) 1892 goto failed;
1813 return 1; 1893
1894 return 1;
1895
1896failed:
1897
1898 svm->vmcb->control.exit_code = SVM_EXIT_ERR;
1899 svm->vmcb->control.exit_code_hi = 0;
1900 svm->vmcb->control.exit_info_1 = 0;
1901 svm->vmcb->control.exit_info_2 = 0;
1902
1903 nested_svm_vmexit(svm);
1814 1904
1815 return 1; 1905 return 1;
1816} 1906}
@@ -1823,7 +1913,7 @@ static int stgi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1823 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; 1913 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
1824 skip_emulated_instruction(&svm->vcpu); 1914 skip_emulated_instruction(&svm->vcpu);
1825 1915
1826 svm->vcpu.arch.hflags |= HF_GIF_MASK; 1916 enable_gif(svm);
1827 1917
1828 return 1; 1918 return 1;
1829} 1919}
@@ -1836,7 +1926,7 @@ static int clgi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1836 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; 1926 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
1837 skip_emulated_instruction(&svm->vcpu); 1927 skip_emulated_instruction(&svm->vcpu);
1838 1928
1839 svm->vcpu.arch.hflags &= ~HF_GIF_MASK; 1929 disable_gif(svm);
1840 1930
1841 /* After a CLGI no interrupts should come */ 1931 /* After a CLGI no interrupts should come */
1842 svm_clear_vintr(svm); 1932 svm_clear_vintr(svm);
@@ -1845,6 +1935,19 @@ static int clgi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1845 return 1; 1935 return 1;
1846} 1936}
1847 1937
1938static int invlpga_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1939{
1940 struct kvm_vcpu *vcpu = &svm->vcpu;
1941 nsvm_printk("INVLPGA\n");
1942
1943 /* Let's treat INVLPGA the same as INVLPG (can be optimized!) */
1944 kvm_mmu_invlpg(vcpu, vcpu->arch.regs[VCPU_REGS_RAX]);
1945
1946 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
1947 skip_emulated_instruction(&svm->vcpu);
1948 return 1;
1949}
1950
1848static int invalid_op_interception(struct vcpu_svm *svm, 1951static int invalid_op_interception(struct vcpu_svm *svm,
1849 struct kvm_run *kvm_run) 1952 struct kvm_run *kvm_run)
1850{ 1953{
@@ -1953,7 +2056,7 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
1953 struct vcpu_svm *svm = to_svm(vcpu); 2056 struct vcpu_svm *svm = to_svm(vcpu);
1954 2057
1955 switch (ecx) { 2058 switch (ecx) {
1956 case MSR_IA32_TIME_STAMP_COUNTER: { 2059 case MSR_IA32_TSC: {
1957 u64 tsc; 2060 u64 tsc;
1958 2061
1959 rdtscll(tsc); 2062 rdtscll(tsc);
@@ -1981,10 +2084,10 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
1981 *data = svm->vmcb->save.sysenter_cs; 2084 *data = svm->vmcb->save.sysenter_cs;
1982 break; 2085 break;
1983 case MSR_IA32_SYSENTER_EIP: 2086 case MSR_IA32_SYSENTER_EIP:
1984 *data = svm->vmcb->save.sysenter_eip; 2087 *data = svm->sysenter_eip;
1985 break; 2088 break;
1986 case MSR_IA32_SYSENTER_ESP: 2089 case MSR_IA32_SYSENTER_ESP:
1987 *data = svm->vmcb->save.sysenter_esp; 2090 *data = svm->sysenter_esp;
1988 break; 2091 break;
1989 /* Nobody will change the following 5 values in the VMCB so 2092 /* Nobody will change the following 5 values in the VMCB so
1990 we can safely return them on rdmsr. They will always be 0 2093 we can safely return them on rdmsr. They will always be 0
@@ -2005,7 +2108,7 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
2005 *data = svm->vmcb->save.last_excp_to; 2108 *data = svm->vmcb->save.last_excp_to;
2006 break; 2109 break;
2007 case MSR_VM_HSAVE_PA: 2110 case MSR_VM_HSAVE_PA:
2008 *data = svm->hsave_msr; 2111 *data = svm->nested.hsave_msr;
2009 break; 2112 break;
2010 case MSR_VM_CR: 2113 case MSR_VM_CR:
2011 *data = 0; 2114 *data = 0;
@@ -2027,8 +2130,7 @@ static int rdmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
2027 if (svm_get_msr(&svm->vcpu, ecx, &data)) 2130 if (svm_get_msr(&svm->vcpu, ecx, &data))
2028 kvm_inject_gp(&svm->vcpu, 0); 2131 kvm_inject_gp(&svm->vcpu, 0);
2029 else { 2132 else {
2030 KVMTRACE_3D(MSR_READ, &svm->vcpu, ecx, (u32)data, 2133 trace_kvm_msr_read(ecx, data);
2031 (u32)(data >> 32), handler);
2032 2134
2033 svm->vcpu.arch.regs[VCPU_REGS_RAX] = data & 0xffffffff; 2135 svm->vcpu.arch.regs[VCPU_REGS_RAX] = data & 0xffffffff;
2034 svm->vcpu.arch.regs[VCPU_REGS_RDX] = data >> 32; 2136 svm->vcpu.arch.regs[VCPU_REGS_RDX] = data >> 32;
@@ -2043,7 +2145,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
2043 struct vcpu_svm *svm = to_svm(vcpu); 2145 struct vcpu_svm *svm = to_svm(vcpu);
2044 2146
2045 switch (ecx) { 2147 switch (ecx) {
2046 case MSR_IA32_TIME_STAMP_COUNTER: { 2148 case MSR_IA32_TSC: {
2047 u64 tsc; 2149 u64 tsc;
2048 2150
2049 rdtscll(tsc); 2151 rdtscll(tsc);
@@ -2071,9 +2173,11 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
2071 svm->vmcb->save.sysenter_cs = data; 2173 svm->vmcb->save.sysenter_cs = data;
2072 break; 2174 break;
2073 case MSR_IA32_SYSENTER_EIP: 2175 case MSR_IA32_SYSENTER_EIP:
2176 svm->sysenter_eip = data;
2074 svm->vmcb->save.sysenter_eip = data; 2177 svm->vmcb->save.sysenter_eip = data;
2075 break; 2178 break;
2076 case MSR_IA32_SYSENTER_ESP: 2179 case MSR_IA32_SYSENTER_ESP:
2180 svm->sysenter_esp = data;
2077 svm->vmcb->save.sysenter_esp = data; 2181 svm->vmcb->save.sysenter_esp = data;
2078 break; 2182 break;
2079 case MSR_IA32_DEBUGCTLMSR: 2183 case MSR_IA32_DEBUGCTLMSR:
@@ -2091,24 +2195,12 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
2091 else 2195 else
2092 svm_disable_lbrv(svm); 2196 svm_disable_lbrv(svm);
2093 break; 2197 break;
2094 case MSR_K7_EVNTSEL0:
2095 case MSR_K7_EVNTSEL1:
2096 case MSR_K7_EVNTSEL2:
2097 case MSR_K7_EVNTSEL3:
2098 case MSR_K7_PERFCTR0:
2099 case MSR_K7_PERFCTR1:
2100 case MSR_K7_PERFCTR2:
2101 case MSR_K7_PERFCTR3:
2102 /*
2103 * Just discard all writes to the performance counters; this
2104 * should keep both older linux and windows 64-bit guests
2105 * happy
2106 */
2107 pr_unimpl(vcpu, "unimplemented perfctr wrmsr: 0x%x data 0x%llx\n", ecx, data);
2108
2109 break;
2110 case MSR_VM_HSAVE_PA: 2198 case MSR_VM_HSAVE_PA:
2111 svm->hsave_msr = data; 2199 svm->nested.hsave_msr = data;
2200 break;
2201 case MSR_VM_CR:
2202 case MSR_VM_IGNNE:
2203 pr_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data);
2112 break; 2204 break;
2113 default: 2205 default:
2114 return kvm_set_msr_common(vcpu, ecx, data); 2206 return kvm_set_msr_common(vcpu, ecx, data);
@@ -2122,8 +2214,7 @@ static int wrmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
2122 u64 data = (svm->vcpu.arch.regs[VCPU_REGS_RAX] & -1u) 2214 u64 data = (svm->vcpu.arch.regs[VCPU_REGS_RAX] & -1u)
2123 | ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32); 2215 | ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32);
2124 2216
2125 KVMTRACE_3D(MSR_WRITE, &svm->vcpu, ecx, (u32)data, (u32)(data >> 32), 2217 trace_kvm_msr_write(ecx, data);
2126 handler);
2127 2218
2128 svm->next_rip = kvm_rip_read(&svm->vcpu) + 2; 2219 svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
2129 if (svm_set_msr(&svm->vcpu, ecx, data)) 2220 if (svm_set_msr(&svm->vcpu, ecx, data))
@@ -2144,8 +2235,6 @@ static int msr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
2144static int interrupt_window_interception(struct vcpu_svm *svm, 2235static int interrupt_window_interception(struct vcpu_svm *svm,
2145 struct kvm_run *kvm_run) 2236 struct kvm_run *kvm_run)
2146{ 2237{
2147 KVMTRACE_0D(PEND_INTR, &svm->vcpu, handler);
2148
2149 svm_clear_vintr(svm); 2238 svm_clear_vintr(svm);
2150 svm->vmcb->control.int_ctl &= ~V_IRQ_MASK; 2239 svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
2151 /* 2240 /*
@@ -2201,7 +2290,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm,
2201 [SVM_EXIT_INVD] = emulate_on_interception, 2290 [SVM_EXIT_INVD] = emulate_on_interception,
2202 [SVM_EXIT_HLT] = halt_interception, 2291 [SVM_EXIT_HLT] = halt_interception,
2203 [SVM_EXIT_INVLPG] = invlpg_interception, 2292 [SVM_EXIT_INVLPG] = invlpg_interception,
2204 [SVM_EXIT_INVLPGA] = invalid_op_interception, 2293 [SVM_EXIT_INVLPGA] = invlpga_interception,
2205 [SVM_EXIT_IOIO] = io_interception, 2294 [SVM_EXIT_IOIO] = io_interception,
2206 [SVM_EXIT_MSR] = msr_interception, 2295 [SVM_EXIT_MSR] = msr_interception,
2207 [SVM_EXIT_TASK_SWITCH] = task_switch_interception, 2296 [SVM_EXIT_TASK_SWITCH] = task_switch_interception,
@@ -2224,20 +2313,26 @@ static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
2224 struct vcpu_svm *svm = to_svm(vcpu); 2313 struct vcpu_svm *svm = to_svm(vcpu);
2225 u32 exit_code = svm->vmcb->control.exit_code; 2314 u32 exit_code = svm->vmcb->control.exit_code;
2226 2315
2227 KVMTRACE_3D(VMEXIT, vcpu, exit_code, (u32)svm->vmcb->save.rip, 2316 trace_kvm_exit(exit_code, svm->vmcb->save.rip);
2228 (u32)((u64)svm->vmcb->save.rip >> 32), entryexit);
2229 2317
2230 if (is_nested(svm)) { 2318 if (is_nested(svm)) {
2319 int vmexit;
2320
2231 nsvm_printk("nested handle_exit: 0x%x | 0x%lx | 0x%lx | 0x%lx\n", 2321 nsvm_printk("nested handle_exit: 0x%x | 0x%lx | 0x%lx | 0x%lx\n",
2232 exit_code, svm->vmcb->control.exit_info_1, 2322 exit_code, svm->vmcb->control.exit_info_1,
2233 svm->vmcb->control.exit_info_2, svm->vmcb->save.rip); 2323 svm->vmcb->control.exit_info_2, svm->vmcb->save.rip);
2234 if (nested_svm_exit_handled(svm, true)) { 2324
2235 nested_svm_vmexit(svm); 2325 vmexit = nested_svm_exit_special(svm);
2236 nsvm_printk("-> #VMEXIT\n"); 2326
2327 if (vmexit == NESTED_EXIT_CONTINUE)
2328 vmexit = nested_svm_exit_handled(svm);
2329
2330 if (vmexit == NESTED_EXIT_DONE)
2237 return 1; 2331 return 1;
2238 }
2239 } 2332 }
2240 2333
2334 svm_complete_interrupts(svm);
2335
2241 if (npt_enabled) { 2336 if (npt_enabled) {
2242 int mmu_reload = 0; 2337 int mmu_reload = 0;
2243 if ((vcpu->arch.cr0 ^ svm->vmcb->save.cr0) & X86_CR0_PG) { 2338 if ((vcpu->arch.cr0 ^ svm->vmcb->save.cr0) & X86_CR0_PG) {
@@ -2246,12 +2341,6 @@ static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
2246 } 2341 }
2247 vcpu->arch.cr0 = svm->vmcb->save.cr0; 2342 vcpu->arch.cr0 = svm->vmcb->save.cr0;
2248 vcpu->arch.cr3 = svm->vmcb->save.cr3; 2343 vcpu->arch.cr3 = svm->vmcb->save.cr3;
2249 if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) {
2250 if (!load_pdptrs(vcpu, vcpu->arch.cr3)) {
2251 kvm_inject_gp(vcpu, 0);
2252 return 1;
2253 }
2254 }
2255 if (mmu_reload) { 2344 if (mmu_reload) {
2256 kvm_mmu_reset_context(vcpu); 2345 kvm_mmu_reset_context(vcpu);
2257 kvm_mmu_load(vcpu); 2346 kvm_mmu_load(vcpu);
@@ -2319,7 +2408,7 @@ static inline void svm_inject_irq(struct vcpu_svm *svm, int irq)
2319{ 2408{
2320 struct vmcb_control_area *control; 2409 struct vmcb_control_area *control;
2321 2410
2322 KVMTRACE_1D(INJ_VIRQ, &svm->vcpu, (u32)irq, handler); 2411 trace_kvm_inj_virq(irq);
2323 2412
2324 ++svm->vcpu.stat.irq_injections; 2413 ++svm->vcpu.stat.irq_injections;
2325 control = &svm->vmcb->control; 2414 control = &svm->vmcb->control;
@@ -2329,21 +2418,14 @@ static inline void svm_inject_irq(struct vcpu_svm *svm, int irq)
2329 ((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT); 2418 ((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT);
2330} 2419}
2331 2420
2332static void svm_queue_irq(struct kvm_vcpu *vcpu, unsigned nr)
2333{
2334 struct vcpu_svm *svm = to_svm(vcpu);
2335
2336 svm->vmcb->control.event_inj = nr |
2337 SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR;
2338}
2339
2340static void svm_set_irq(struct kvm_vcpu *vcpu) 2421static void svm_set_irq(struct kvm_vcpu *vcpu)
2341{ 2422{
2342 struct vcpu_svm *svm = to_svm(vcpu); 2423 struct vcpu_svm *svm = to_svm(vcpu);
2343 2424
2344 nested_svm_intr(svm); 2425 BUG_ON(!(gif_set(svm)));
2345 2426
2346 svm_queue_irq(vcpu, vcpu->arch.interrupt.nr); 2427 svm->vmcb->control.event_inj = vcpu->arch.interrupt.nr |
2428 SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR;
2347} 2429}
2348 2430
2349static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) 2431static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
@@ -2371,13 +2453,25 @@ static int svm_interrupt_allowed(struct kvm_vcpu *vcpu)
2371 struct vmcb *vmcb = svm->vmcb; 2453 struct vmcb *vmcb = svm->vmcb;
2372 return (vmcb->save.rflags & X86_EFLAGS_IF) && 2454 return (vmcb->save.rflags & X86_EFLAGS_IF) &&
2373 !(vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) && 2455 !(vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) &&
2374 (svm->vcpu.arch.hflags & HF_GIF_MASK); 2456 gif_set(svm) &&
2457 !(is_nested(svm) && (svm->vcpu.arch.hflags & HF_VINTR_MASK));
2375} 2458}
2376 2459
2377static void enable_irq_window(struct kvm_vcpu *vcpu) 2460static void enable_irq_window(struct kvm_vcpu *vcpu)
2378{ 2461{
2379 svm_set_vintr(to_svm(vcpu)); 2462 struct vcpu_svm *svm = to_svm(vcpu);
2380 svm_inject_irq(to_svm(vcpu), 0x0); 2463 nsvm_printk("Trying to open IRQ window\n");
2464
2465 nested_svm_intr(svm);
2466
2467 /* In case GIF=0 we can't rely on the CPU to tell us when
2468 * GIF becomes 1, because that's a separate STGI/VMRUN intercept.
2469 * The next time we get that intercept, this function will be
2470 * called again though and we'll get the vintr intercept. */
2471 if (gif_set(svm)) {
2472 svm_set_vintr(svm);
2473 svm_inject_irq(svm, 0x0);
2474 }
2381} 2475}
2382 2476
2383static void enable_nmi_window(struct kvm_vcpu *vcpu) 2477static void enable_nmi_window(struct kvm_vcpu *vcpu)
@@ -2456,6 +2550,8 @@ static void svm_complete_interrupts(struct vcpu_svm *svm)
2456 case SVM_EXITINTINFO_TYPE_EXEPT: 2550 case SVM_EXITINTINFO_TYPE_EXEPT:
2457 /* In case of software exception do not reinject an exception 2551 /* In case of software exception do not reinject an exception
2458 vector, but re-execute the instruction instead */ 2552 vector, but re-execute the instruction instead */
2553 if (is_nested(svm))
2554 break;
2459 if (kvm_exception_is_soft(vector)) 2555 if (kvm_exception_is_soft(vector))
2460 break; 2556 break;
2461 if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) { 2557 if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) {
@@ -2498,9 +2594,7 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2498 fs_selector = kvm_read_fs(); 2594 fs_selector = kvm_read_fs();
2499 gs_selector = kvm_read_gs(); 2595 gs_selector = kvm_read_gs();
2500 ldt_selector = kvm_read_ldt(); 2596 ldt_selector = kvm_read_ldt();
2501 svm->host_cr2 = kvm_read_cr2(); 2597 svm->vmcb->save.cr2 = vcpu->arch.cr2;
2502 if (!is_nested(svm))
2503 svm->vmcb->save.cr2 = vcpu->arch.cr2;
2504 /* required for live migration with NPT */ 2598 /* required for live migration with NPT */
2505 if (npt_enabled) 2599 if (npt_enabled)
2506 svm->vmcb->save.cr3 = vcpu->arch.cr3; 2600 svm->vmcb->save.cr3 = vcpu->arch.cr3;
@@ -2585,8 +2679,6 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2585 vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp; 2679 vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
2586 vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip; 2680 vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip;
2587 2681
2588 kvm_write_cr2(svm->host_cr2);
2589
2590 kvm_load_fs(fs_selector); 2682 kvm_load_fs(fs_selector);
2591 kvm_load_gs(gs_selector); 2683 kvm_load_gs(gs_selector);
2592 kvm_load_ldt(ldt_selector); 2684 kvm_load_ldt(ldt_selector);
@@ -2602,7 +2694,10 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2602 2694
2603 svm->next_rip = 0; 2695 svm->next_rip = 0;
2604 2696
2605 svm_complete_interrupts(svm); 2697 if (npt_enabled) {
2698 vcpu->arch.regs_avail &= ~(1 << VCPU_EXREG_PDPTR);
2699 vcpu->arch.regs_dirty &= ~(1 << VCPU_EXREG_PDPTR);
2700 }
2606} 2701}
2607 2702
2608#undef R 2703#undef R
@@ -2673,6 +2768,64 @@ static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
2673 return 0; 2768 return 0;
2674} 2769}
2675 2770
2771static const struct trace_print_flags svm_exit_reasons_str[] = {
2772 { SVM_EXIT_READ_CR0, "read_cr0" },
2773 { SVM_EXIT_READ_CR3, "read_cr3" },
2774 { SVM_EXIT_READ_CR4, "read_cr4" },
2775 { SVM_EXIT_READ_CR8, "read_cr8" },
2776 { SVM_EXIT_WRITE_CR0, "write_cr0" },
2777 { SVM_EXIT_WRITE_CR3, "write_cr3" },
2778 { SVM_EXIT_WRITE_CR4, "write_cr4" },
2779 { SVM_EXIT_WRITE_CR8, "write_cr8" },
2780 { SVM_EXIT_READ_DR0, "read_dr0" },
2781 { SVM_EXIT_READ_DR1, "read_dr1" },
2782 { SVM_EXIT_READ_DR2, "read_dr2" },
2783 { SVM_EXIT_READ_DR3, "read_dr3" },
2784 { SVM_EXIT_WRITE_DR0, "write_dr0" },
2785 { SVM_EXIT_WRITE_DR1, "write_dr1" },
2786 { SVM_EXIT_WRITE_DR2, "write_dr2" },
2787 { SVM_EXIT_WRITE_DR3, "write_dr3" },
2788 { SVM_EXIT_WRITE_DR5, "write_dr5" },
2789 { SVM_EXIT_WRITE_DR7, "write_dr7" },
2790 { SVM_EXIT_EXCP_BASE + DB_VECTOR, "DB excp" },
2791 { SVM_EXIT_EXCP_BASE + BP_VECTOR, "BP excp" },
2792 { SVM_EXIT_EXCP_BASE + UD_VECTOR, "UD excp" },
2793 { SVM_EXIT_EXCP_BASE + PF_VECTOR, "PF excp" },
2794 { SVM_EXIT_EXCP_BASE + NM_VECTOR, "NM excp" },
2795 { SVM_EXIT_EXCP_BASE + MC_VECTOR, "MC excp" },
2796 { SVM_EXIT_INTR, "interrupt" },
2797 { SVM_EXIT_NMI, "nmi" },
2798 { SVM_EXIT_SMI, "smi" },
2799 { SVM_EXIT_INIT, "init" },
2800 { SVM_EXIT_VINTR, "vintr" },
2801 { SVM_EXIT_CPUID, "cpuid" },
2802 { SVM_EXIT_INVD, "invd" },
2803 { SVM_EXIT_HLT, "hlt" },
2804 { SVM_EXIT_INVLPG, "invlpg" },
2805 { SVM_EXIT_INVLPGA, "invlpga" },
2806 { SVM_EXIT_IOIO, "io" },
2807 { SVM_EXIT_MSR, "msr" },
2808 { SVM_EXIT_TASK_SWITCH, "task_switch" },
2809 { SVM_EXIT_SHUTDOWN, "shutdown" },
2810 { SVM_EXIT_VMRUN, "vmrun" },
2811 { SVM_EXIT_VMMCALL, "hypercall" },
2812 { SVM_EXIT_VMLOAD, "vmload" },
2813 { SVM_EXIT_VMSAVE, "vmsave" },
2814 { SVM_EXIT_STGI, "stgi" },
2815 { SVM_EXIT_CLGI, "clgi" },
2816 { SVM_EXIT_SKINIT, "skinit" },
2817 { SVM_EXIT_WBINVD, "wbinvd" },
2818 { SVM_EXIT_MONITOR, "monitor" },
2819 { SVM_EXIT_MWAIT, "mwait" },
2820 { SVM_EXIT_NPF, "npf" },
2821 { -1, NULL }
2822};
2823
2824static bool svm_gb_page_enable(void)
2825{
2826 return true;
2827}
2828
2676static struct kvm_x86_ops svm_x86_ops = { 2829static struct kvm_x86_ops svm_x86_ops = {
2677 .cpu_has_kvm_support = has_svm, 2830 .cpu_has_kvm_support = has_svm,
2678 .disabled_by_bios = is_disabled, 2831 .disabled_by_bios = is_disabled,
@@ -2710,6 +2863,7 @@ static struct kvm_x86_ops svm_x86_ops = {
2710 .set_gdt = svm_set_gdt, 2863 .set_gdt = svm_set_gdt,
2711 .get_dr = svm_get_dr, 2864 .get_dr = svm_get_dr,
2712 .set_dr = svm_set_dr, 2865 .set_dr = svm_set_dr,
2866 .cache_reg = svm_cache_reg,
2713 .get_rflags = svm_get_rflags, 2867 .get_rflags = svm_get_rflags,
2714 .set_rflags = svm_set_rflags, 2868 .set_rflags = svm_set_rflags,
2715 2869
@@ -2733,6 +2887,9 @@ static struct kvm_x86_ops svm_x86_ops = {
2733 .set_tss_addr = svm_set_tss_addr, 2887 .set_tss_addr = svm_set_tss_addr,
2734 .get_tdp_level = get_npt_level, 2888 .get_tdp_level = get_npt_level,
2735 .get_mt_mask = svm_get_mt_mask, 2889 .get_mt_mask = svm_get_mt_mask,
2890
2891 .exit_reasons_str = svm_exit_reasons_str,
2892 .gb_page_enable = svm_gb_page_enable,
2736}; 2893};
2737 2894
2738static int __init svm_init(void) 2895static int __init svm_init(void)
diff --git a/arch/x86/kvm/timer.c b/arch/x86/kvm/timer.c
index 86dbac072d0c..eea40439066c 100644
--- a/arch/x86/kvm/timer.c
+++ b/arch/x86/kvm/timer.c
@@ -9,12 +9,16 @@ static int __kvm_timer_fn(struct kvm_vcpu *vcpu, struct kvm_timer *ktimer)
9 int restart_timer = 0; 9 int restart_timer = 0;
10 wait_queue_head_t *q = &vcpu->wq; 10 wait_queue_head_t *q = &vcpu->wq;
11 11
12 /* FIXME: this code should not know anything about vcpus */ 12 /*
13 if (!atomic_inc_and_test(&ktimer->pending)) 13 * There is a race window between reading and incrementing, but we do
14 * not care about potentially losing timer events in the !reinject
15 * case anyway.
16 */
17 if (ktimer->reinject || !atomic_read(&ktimer->pending)) {
18 atomic_inc(&ktimer->pending);
19 /* FIXME: this code should not know anything about vcpus */
14 set_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests); 20 set_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests);
15 21 }
16 if (!ktimer->reinject)
17 atomic_set(&ktimer->pending, 1);
18 22
19 if (waitqueue_active(q)) 23 if (waitqueue_active(q))
20 wake_up_interruptible(q); 24 wake_up_interruptible(q);
@@ -33,7 +37,7 @@ enum hrtimer_restart kvm_timer_fn(struct hrtimer *data)
33 struct kvm_vcpu *vcpu; 37 struct kvm_vcpu *vcpu;
34 struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer); 38 struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer);
35 39
36 vcpu = ktimer->kvm->vcpus[ktimer->vcpu_id]; 40 vcpu = ktimer->vcpu;
37 if (!vcpu) 41 if (!vcpu)
38 return HRTIMER_NORESTART; 42 return HRTIMER_NORESTART;
39 43
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
new file mode 100644
index 000000000000..0d480e77eacf
--- /dev/null
+++ b/arch/x86/kvm/trace.h
@@ -0,0 +1,355 @@
1#if !defined(_TRACE_KVM_H) || defined(TRACE_HEADER_MULTI_READ)
2#define _TRACE_KVM_H
3
4#include <linux/tracepoint.h>
5
6#undef TRACE_SYSTEM
7#define TRACE_SYSTEM kvm
8#define TRACE_INCLUDE_PATH arch/x86/kvm
9#define TRACE_INCLUDE_FILE trace
10
11/*
12 * Tracepoint for guest mode entry.
13 */
14TRACE_EVENT(kvm_entry,
15 TP_PROTO(unsigned int vcpu_id),
16 TP_ARGS(vcpu_id),
17
18 TP_STRUCT__entry(
19 __field( unsigned int, vcpu_id )
20 ),
21
22 TP_fast_assign(
23 __entry->vcpu_id = vcpu_id;
24 ),
25
26 TP_printk("vcpu %u", __entry->vcpu_id)
27);
28
29/*
30 * Tracepoint for hypercall.
31 */
32TRACE_EVENT(kvm_hypercall,
33 TP_PROTO(unsigned long nr, unsigned long a0, unsigned long a1,
34 unsigned long a2, unsigned long a3),
35 TP_ARGS(nr, a0, a1, a2, a3),
36
37 TP_STRUCT__entry(
38 __field( unsigned long, nr )
39 __field( unsigned long, a0 )
40 __field( unsigned long, a1 )
41 __field( unsigned long, a2 )
42 __field( unsigned long, a3 )
43 ),
44
45 TP_fast_assign(
46 __entry->nr = nr;
47 __entry->a0 = a0;
48 __entry->a1 = a1;
49 __entry->a2 = a2;
50 __entry->a3 = a3;
51 ),
52
53 TP_printk("nr 0x%lx a0 0x%lx a1 0x%lx a2 0x%lx a3 0x%lx",
54 __entry->nr, __entry->a0, __entry->a1, __entry->a2,
55 __entry->a3)
56);
57
58/*
59 * Tracepoint for PIO.
60 */
61TRACE_EVENT(kvm_pio,
62 TP_PROTO(unsigned int rw, unsigned int port, unsigned int size,
63 unsigned int count),
64 TP_ARGS(rw, port, size, count),
65
66 TP_STRUCT__entry(
67 __field( unsigned int, rw )
68 __field( unsigned int, port )
69 __field( unsigned int, size )
70 __field( unsigned int, count )
71 ),
72
73 TP_fast_assign(
74 __entry->rw = rw;
75 __entry->port = port;
76 __entry->size = size;
77 __entry->count = count;
78 ),
79
80 TP_printk("pio_%s at 0x%x size %d count %d",
81 __entry->rw ? "write" : "read",
82 __entry->port, __entry->size, __entry->count)
83);
84
85/*
86 * Tracepoint for cpuid.
87 */
88TRACE_EVENT(kvm_cpuid,
89 TP_PROTO(unsigned int function, unsigned long rax, unsigned long rbx,
90 unsigned long rcx, unsigned long rdx),
91 TP_ARGS(function, rax, rbx, rcx, rdx),
92
93 TP_STRUCT__entry(
94 __field( unsigned int, function )
95 __field( unsigned long, rax )
96 __field( unsigned long, rbx )
97 __field( unsigned long, rcx )
98 __field( unsigned long, rdx )
99 ),
100
101 TP_fast_assign(
102 __entry->function = function;
103 __entry->rax = rax;
104 __entry->rbx = rbx;
105 __entry->rcx = rcx;
106 __entry->rdx = rdx;
107 ),
108
109 TP_printk("func %x rax %lx rbx %lx rcx %lx rdx %lx",
110 __entry->function, __entry->rax,
111 __entry->rbx, __entry->rcx, __entry->rdx)
112);
113
114#define AREG(x) { APIC_##x, "APIC_" #x }
115
116#define kvm_trace_symbol_apic \
117 AREG(ID), AREG(LVR), AREG(TASKPRI), AREG(ARBPRI), AREG(PROCPRI), \
118 AREG(EOI), AREG(RRR), AREG(LDR), AREG(DFR), AREG(SPIV), AREG(ISR), \
119 AREG(TMR), AREG(IRR), AREG(ESR), AREG(ICR), AREG(ICR2), AREG(LVTT), \
120 AREG(LVTTHMR), AREG(LVTPC), AREG(LVT0), AREG(LVT1), AREG(LVTERR), \
121 AREG(TMICT), AREG(TMCCT), AREG(TDCR), AREG(SELF_IPI), AREG(EFEAT), \
122 AREG(ECTRL)
123/*
124 * Tracepoint for apic access.
125 */
126TRACE_EVENT(kvm_apic,
127 TP_PROTO(unsigned int rw, unsigned int reg, unsigned int val),
128 TP_ARGS(rw, reg, val),
129
130 TP_STRUCT__entry(
131 __field( unsigned int, rw )
132 __field( unsigned int, reg )
133 __field( unsigned int, val )
134 ),
135
136 TP_fast_assign(
137 __entry->rw = rw;
138 __entry->reg = reg;
139 __entry->val = val;
140 ),
141
142 TP_printk("apic_%s %s = 0x%x",
143 __entry->rw ? "write" : "read",
144 __print_symbolic(__entry->reg, kvm_trace_symbol_apic),
145 __entry->val)
146);
147
148#define trace_kvm_apic_read(reg, val) trace_kvm_apic(0, reg, val)
149#define trace_kvm_apic_write(reg, val) trace_kvm_apic(1, reg, val)
150
151/*
152 * Tracepoint for kvm guest exit:
153 */
154TRACE_EVENT(kvm_exit,
155 TP_PROTO(unsigned int exit_reason, unsigned long guest_rip),
156 TP_ARGS(exit_reason, guest_rip),
157
158 TP_STRUCT__entry(
159 __field( unsigned int, exit_reason )
160 __field( unsigned long, guest_rip )
161 ),
162
163 TP_fast_assign(
164 __entry->exit_reason = exit_reason;
165 __entry->guest_rip = guest_rip;
166 ),
167
168 TP_printk("reason %s rip 0x%lx",
169 ftrace_print_symbols_seq(p, __entry->exit_reason,
170 kvm_x86_ops->exit_reasons_str),
171 __entry->guest_rip)
172);
173
174/*
175 * Tracepoint for kvm interrupt injection:
176 */
177TRACE_EVENT(kvm_inj_virq,
178 TP_PROTO(unsigned int irq),
179 TP_ARGS(irq),
180
181 TP_STRUCT__entry(
182 __field( unsigned int, irq )
183 ),
184
185 TP_fast_assign(
186 __entry->irq = irq;
187 ),
188
189 TP_printk("irq %u", __entry->irq)
190);
191
192/*
193 * Tracepoint for page fault.
194 */
195TRACE_EVENT(kvm_page_fault,
196 TP_PROTO(unsigned long fault_address, unsigned int error_code),
197 TP_ARGS(fault_address, error_code),
198
199 TP_STRUCT__entry(
200 __field( unsigned long, fault_address )
201 __field( unsigned int, error_code )
202 ),
203
204 TP_fast_assign(
205 __entry->fault_address = fault_address;
206 __entry->error_code = error_code;
207 ),
208
209 TP_printk("address %lx error_code %x",
210 __entry->fault_address, __entry->error_code)
211);
212
213/*
214 * Tracepoint for guest MSR access.
215 */
216TRACE_EVENT(kvm_msr,
217 TP_PROTO(unsigned int rw, unsigned int ecx, unsigned long data),
218 TP_ARGS(rw, ecx, data),
219
220 TP_STRUCT__entry(
221 __field( unsigned int, rw )
222 __field( unsigned int, ecx )
223 __field( unsigned long, data )
224 ),
225
226 TP_fast_assign(
227 __entry->rw = rw;
228 __entry->ecx = ecx;
229 __entry->data = data;
230 ),
231
232 TP_printk("msr_%s %x = 0x%lx",
233 __entry->rw ? "write" : "read",
234 __entry->ecx, __entry->data)
235);
236
237#define trace_kvm_msr_read(ecx, data) trace_kvm_msr(0, ecx, data)
238#define trace_kvm_msr_write(ecx, data) trace_kvm_msr(1, ecx, data)
239
240/*
241 * Tracepoint for guest CR access.
242 */
243TRACE_EVENT(kvm_cr,
244 TP_PROTO(unsigned int rw, unsigned int cr, unsigned long val),
245 TP_ARGS(rw, cr, val),
246
247 TP_STRUCT__entry(
248 __field( unsigned int, rw )
249 __field( unsigned int, cr )
250 __field( unsigned long, val )
251 ),
252
253 TP_fast_assign(
254 __entry->rw = rw;
255 __entry->cr = cr;
256 __entry->val = val;
257 ),
258
259 TP_printk("cr_%s %x = 0x%lx",
260 __entry->rw ? "write" : "read",
261 __entry->cr, __entry->val)
262);
263
264#define trace_kvm_cr_read(cr, val) trace_kvm_cr(0, cr, val)
265#define trace_kvm_cr_write(cr, val) trace_kvm_cr(1, cr, val)
266
267TRACE_EVENT(kvm_pic_set_irq,
268 TP_PROTO(__u8 chip, __u8 pin, __u8 elcr, __u8 imr, bool coalesced),
269 TP_ARGS(chip, pin, elcr, imr, coalesced),
270
271 TP_STRUCT__entry(
272 __field( __u8, chip )
273 __field( __u8, pin )
274 __field( __u8, elcr )
275 __field( __u8, imr )
276 __field( bool, coalesced )
277 ),
278
279 TP_fast_assign(
280 __entry->chip = chip;
281 __entry->pin = pin;
282 __entry->elcr = elcr;
283 __entry->imr = imr;
284 __entry->coalesced = coalesced;
285 ),
286
287 TP_printk("chip %u pin %u (%s%s)%s",
288 __entry->chip, __entry->pin,
289 (__entry->elcr & (1 << __entry->pin)) ? "level":"edge",
290 (__entry->imr & (1 << __entry->pin)) ? "|masked":"",
291 __entry->coalesced ? " (coalesced)" : "")
292);
293
294#define kvm_apic_dst_shorthand \
295 {0x0, "dst"}, \
296 {0x1, "self"}, \
297 {0x2, "all"}, \
298 {0x3, "all-but-self"}
299
300TRACE_EVENT(kvm_apic_ipi,
301 TP_PROTO(__u32 icr_low, __u32 dest_id),
302 TP_ARGS(icr_low, dest_id),
303
304 TP_STRUCT__entry(
305 __field( __u32, icr_low )
306 __field( __u32, dest_id )
307 ),
308
309 TP_fast_assign(
310 __entry->icr_low = icr_low;
311 __entry->dest_id = dest_id;
312 ),
313
314 TP_printk("dst %x vec %u (%s|%s|%s|%s|%s)",
315 __entry->dest_id, (u8)__entry->icr_low,
316 __print_symbolic((__entry->icr_low >> 8 & 0x7),
317 kvm_deliver_mode),
318 (__entry->icr_low & (1<<11)) ? "logical" : "physical",
319 (__entry->icr_low & (1<<14)) ? "assert" : "de-assert",
320 (__entry->icr_low & (1<<15)) ? "level" : "edge",
321 __print_symbolic((__entry->icr_low >> 18 & 0x3),
322 kvm_apic_dst_shorthand))
323);
324
325TRACE_EVENT(kvm_apic_accept_irq,
326 TP_PROTO(__u32 apicid, __u16 dm, __u8 tm, __u8 vec, bool coalesced),
327 TP_ARGS(apicid, dm, tm, vec, coalesced),
328
329 TP_STRUCT__entry(
330 __field( __u32, apicid )
331 __field( __u16, dm )
332 __field( __u8, tm )
333 __field( __u8, vec )
334 __field( bool, coalesced )
335 ),
336
337 TP_fast_assign(
338 __entry->apicid = apicid;
339 __entry->dm = dm;
340 __entry->tm = tm;
341 __entry->vec = vec;
342 __entry->coalesced = coalesced;
343 ),
344
345 TP_printk("apicid %x vec %u (%s|%s)%s",
346 __entry->apicid, __entry->vec,
347 __print_symbolic((__entry->dm >> 8 & 0x7), kvm_deliver_mode),
348 __entry->tm ? "level" : "edge",
349 __entry->coalesced ? " (coalesced)" : "")
350);
351
352#endif /* _TRACE_KVM_H */
353
354/* This part must be outside protection */
355#include <trace/define_trace.h>
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 29f912927a58..f3812014bd0b 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -25,6 +25,7 @@
25#include <linux/highmem.h> 25#include <linux/highmem.h>
26#include <linux/sched.h> 26#include <linux/sched.h>
27#include <linux/moduleparam.h> 27#include <linux/moduleparam.h>
28#include <linux/ftrace_event.h>
28#include "kvm_cache_regs.h" 29#include "kvm_cache_regs.h"
29#include "x86.h" 30#include "x86.h"
30 31
@@ -34,6 +35,8 @@
34#include <asm/virtext.h> 35#include <asm/virtext.h>
35#include <asm/mce.h> 36#include <asm/mce.h>
36 37
38#include "trace.h"
39
37#define __ex(x) __kvm_handle_fault_on_reboot(x) 40#define __ex(x) __kvm_handle_fault_on_reboot(x)
38 41
39MODULE_AUTHOR("Qumranet"); 42MODULE_AUTHOR("Qumranet");
@@ -51,6 +54,10 @@ module_param_named(flexpriority, flexpriority_enabled, bool, S_IRUGO);
51static int __read_mostly enable_ept = 1; 54static int __read_mostly enable_ept = 1;
52module_param_named(ept, enable_ept, bool, S_IRUGO); 55module_param_named(ept, enable_ept, bool, S_IRUGO);
53 56
57static int __read_mostly enable_unrestricted_guest = 1;
58module_param_named(unrestricted_guest,
59 enable_unrestricted_guest, bool, S_IRUGO);
60
54static int __read_mostly emulate_invalid_guest_state = 0; 61static int __read_mostly emulate_invalid_guest_state = 0;
55module_param(emulate_invalid_guest_state, bool, S_IRUGO); 62module_param(emulate_invalid_guest_state, bool, S_IRUGO);
56 63
@@ -84,6 +91,14 @@ struct vcpu_vmx {
84 int guest_efer_loaded; 91 int guest_efer_loaded;
85 } host_state; 92 } host_state;
86 struct { 93 struct {
94 int vm86_active;
95 u8 save_iopl;
96 struct kvm_save_segment {
97 u16 selector;
98 unsigned long base;
99 u32 limit;
100 u32 ar;
101 } tr, es, ds, fs, gs;
87 struct { 102 struct {
88 bool pending; 103 bool pending;
89 u8 vector; 104 u8 vector;
@@ -161,6 +176,8 @@ static struct kvm_vmx_segment_field {
161 VMX_SEGMENT_FIELD(LDTR), 176 VMX_SEGMENT_FIELD(LDTR),
162}; 177};
163 178
179static void ept_save_pdptrs(struct kvm_vcpu *vcpu);
180
164/* 181/*
165 * Keep MSR_K6_STAR at the end, as setup_msrs() will try to optimize it 182 * Keep MSR_K6_STAR at the end, as setup_msrs() will try to optimize it
166 * away by decrementing the array size. 183 * away by decrementing the array size.
@@ -256,6 +273,26 @@ static inline bool cpu_has_vmx_flexpriority(void)
256 cpu_has_vmx_virtualize_apic_accesses(); 273 cpu_has_vmx_virtualize_apic_accesses();
257} 274}
258 275
276static inline bool cpu_has_vmx_ept_execute_only(void)
277{
278 return !!(vmx_capability.ept & VMX_EPT_EXECUTE_ONLY_BIT);
279}
280
281static inline bool cpu_has_vmx_eptp_uncacheable(void)
282{
283 return !!(vmx_capability.ept & VMX_EPTP_UC_BIT);
284}
285
286static inline bool cpu_has_vmx_eptp_writeback(void)
287{
288 return !!(vmx_capability.ept & VMX_EPTP_WB_BIT);
289}
290
291static inline bool cpu_has_vmx_ept_2m_page(void)
292{
293 return !!(vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT);
294}
295
259static inline int cpu_has_vmx_invept_individual_addr(void) 296static inline int cpu_has_vmx_invept_individual_addr(void)
260{ 297{
261 return !!(vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT); 298 return !!(vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT);
@@ -277,6 +314,12 @@ static inline int cpu_has_vmx_ept(void)
277 SECONDARY_EXEC_ENABLE_EPT; 314 SECONDARY_EXEC_ENABLE_EPT;
278} 315}
279 316
317static inline int cpu_has_vmx_unrestricted_guest(void)
318{
319 return vmcs_config.cpu_based_2nd_exec_ctrl &
320 SECONDARY_EXEC_UNRESTRICTED_GUEST;
321}
322
280static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm) 323static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm)
281{ 324{
282 return flexpriority_enabled && 325 return flexpriority_enabled &&
@@ -497,14 +540,16 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
497 eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR); 540 eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR);
498 if (!vcpu->fpu_active) 541 if (!vcpu->fpu_active)
499 eb |= 1u << NM_VECTOR; 542 eb |= 1u << NM_VECTOR;
543 /*
544 * Unconditionally intercept #DB so we can maintain dr6 without
545 * reading it every exit.
546 */
547 eb |= 1u << DB_VECTOR;
500 if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) { 548 if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) {
501 if (vcpu->guest_debug &
502 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
503 eb |= 1u << DB_VECTOR;
504 if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) 549 if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
505 eb |= 1u << BP_VECTOR; 550 eb |= 1u << BP_VECTOR;
506 } 551 }
507 if (vcpu->arch.rmode.vm86_active) 552 if (to_vmx(vcpu)->rmode.vm86_active)
508 eb = ~0; 553 eb = ~0;
509 if (enable_ept) 554 if (enable_ept)
510 eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */ 555 eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */
@@ -528,12 +573,15 @@ static void reload_tss(void)
528static void load_transition_efer(struct vcpu_vmx *vmx) 573static void load_transition_efer(struct vcpu_vmx *vmx)
529{ 574{
530 int efer_offset = vmx->msr_offset_efer; 575 int efer_offset = vmx->msr_offset_efer;
531 u64 host_efer = vmx->host_msrs[efer_offset].data; 576 u64 host_efer;
532 u64 guest_efer = vmx->guest_msrs[efer_offset].data; 577 u64 guest_efer;
533 u64 ignore_bits; 578 u64 ignore_bits;
534 579
535 if (efer_offset < 0) 580 if (efer_offset < 0)
536 return; 581 return;
582 host_efer = vmx->host_msrs[efer_offset].data;
583 guest_efer = vmx->guest_msrs[efer_offset].data;
584
537 /* 585 /*
538 * NX is emulated; LMA and LME handled by hardware; SCE meaningless 586 * NX is emulated; LMA and LME handled by hardware; SCE meaningless
539 * outside long mode 587 * outside long mode
@@ -735,12 +783,17 @@ static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu)
735 783
736static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) 784static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
737{ 785{
738 return vmcs_readl(GUEST_RFLAGS); 786 unsigned long rflags;
787
788 rflags = vmcs_readl(GUEST_RFLAGS);
789 if (to_vmx(vcpu)->rmode.vm86_active)
790 rflags &= ~(unsigned long)(X86_EFLAGS_IOPL | X86_EFLAGS_VM);
791 return rflags;
739} 792}
740 793
741static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) 794static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
742{ 795{
743 if (vcpu->arch.rmode.vm86_active) 796 if (to_vmx(vcpu)->rmode.vm86_active)
744 rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; 797 rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
745 vmcs_writel(GUEST_RFLAGS, rflags); 798 vmcs_writel(GUEST_RFLAGS, rflags);
746} 799}
@@ -797,12 +850,13 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
797 intr_info |= INTR_INFO_DELIVER_CODE_MASK; 850 intr_info |= INTR_INFO_DELIVER_CODE_MASK;
798 } 851 }
799 852
800 if (vcpu->arch.rmode.vm86_active) { 853 if (vmx->rmode.vm86_active) {
801 vmx->rmode.irq.pending = true; 854 vmx->rmode.irq.pending = true;
802 vmx->rmode.irq.vector = nr; 855 vmx->rmode.irq.vector = nr;
803 vmx->rmode.irq.rip = kvm_rip_read(vcpu); 856 vmx->rmode.irq.rip = kvm_rip_read(vcpu);
804 if (nr == BP_VECTOR || nr == OF_VECTOR) 857 if (kvm_exception_is_soft(nr))
805 vmx->rmode.irq.rip++; 858 vmx->rmode.irq.rip +=
859 vmx->vcpu.arch.event_exit_inst_len;
806 intr_info |= INTR_TYPE_SOFT_INTR; 860 intr_info |= INTR_TYPE_SOFT_INTR;
807 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info); 861 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
808 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1); 862 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1);
@@ -940,7 +994,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
940 case MSR_EFER: 994 case MSR_EFER:
941 return kvm_get_msr_common(vcpu, msr_index, pdata); 995 return kvm_get_msr_common(vcpu, msr_index, pdata);
942#endif 996#endif
943 case MSR_IA32_TIME_STAMP_COUNTER: 997 case MSR_IA32_TSC:
944 data = guest_read_tsc(); 998 data = guest_read_tsc();
945 break; 999 break;
946 case MSR_IA32_SYSENTER_CS: 1000 case MSR_IA32_SYSENTER_CS:
@@ -953,9 +1007,9 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
953 data = vmcs_readl(GUEST_SYSENTER_ESP); 1007 data = vmcs_readl(GUEST_SYSENTER_ESP);
954 break; 1008 break;
955 default: 1009 default:
956 vmx_load_host_state(to_vmx(vcpu));
957 msr = find_msr_entry(to_vmx(vcpu), msr_index); 1010 msr = find_msr_entry(to_vmx(vcpu), msr_index);
958 if (msr) { 1011 if (msr) {
1012 vmx_load_host_state(to_vmx(vcpu));
959 data = msr->data; 1013 data = msr->data;
960 break; 1014 break;
961 } 1015 }
@@ -1000,22 +1054,10 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
1000 case MSR_IA32_SYSENTER_ESP: 1054 case MSR_IA32_SYSENTER_ESP:
1001 vmcs_writel(GUEST_SYSENTER_ESP, data); 1055 vmcs_writel(GUEST_SYSENTER_ESP, data);
1002 break; 1056 break;
1003 case MSR_IA32_TIME_STAMP_COUNTER: 1057 case MSR_IA32_TSC:
1004 rdtscll(host_tsc); 1058 rdtscll(host_tsc);
1005 guest_write_tsc(data, host_tsc); 1059 guest_write_tsc(data, host_tsc);
1006 break; 1060 break;
1007 case MSR_P6_PERFCTR0:
1008 case MSR_P6_PERFCTR1:
1009 case MSR_P6_EVNTSEL0:
1010 case MSR_P6_EVNTSEL1:
1011 /*
1012 * Just discard all writes to the performance counters; this
1013 * should keep both older linux and windows 64-bit guests
1014 * happy
1015 */
1016 pr_unimpl(vcpu, "unimplemented perfctr wrmsr: 0x%x data 0x%llx\n", msr_index, data);
1017
1018 break;
1019 case MSR_IA32_CR_PAT: 1061 case MSR_IA32_CR_PAT:
1020 if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { 1062 if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
1021 vmcs_write64(GUEST_IA32_PAT, data); 1063 vmcs_write64(GUEST_IA32_PAT, data);
@@ -1024,9 +1066,9 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
1024 } 1066 }
1025 /* Otherwise falls through to kvm_set_msr_common */ 1067 /* Otherwise falls through to kvm_set_msr_common */
1026 default: 1068 default:
1027 vmx_load_host_state(vmx);
1028 msr = find_msr_entry(vmx, msr_index); 1069 msr = find_msr_entry(vmx, msr_index);
1029 if (msr) { 1070 if (msr) {
1071 vmx_load_host_state(vmx);
1030 msr->data = data; 1072 msr->data = data;
1031 break; 1073 break;
1032 } 1074 }
@@ -1046,6 +1088,10 @@ static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
1046 case VCPU_REGS_RIP: 1088 case VCPU_REGS_RIP:
1047 vcpu->arch.regs[VCPU_REGS_RIP] = vmcs_readl(GUEST_RIP); 1089 vcpu->arch.regs[VCPU_REGS_RIP] = vmcs_readl(GUEST_RIP);
1048 break; 1090 break;
1091 case VCPU_EXREG_PDPTR:
1092 if (enable_ept)
1093 ept_save_pdptrs(vcpu);
1094 break;
1049 default: 1095 default:
1050 break; 1096 break;
1051 } 1097 }
@@ -1203,7 +1249,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
1203 opt2 = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | 1249 opt2 = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
1204 SECONDARY_EXEC_WBINVD_EXITING | 1250 SECONDARY_EXEC_WBINVD_EXITING |
1205 SECONDARY_EXEC_ENABLE_VPID | 1251 SECONDARY_EXEC_ENABLE_VPID |
1206 SECONDARY_EXEC_ENABLE_EPT; 1252 SECONDARY_EXEC_ENABLE_EPT |
1253 SECONDARY_EXEC_UNRESTRICTED_GUEST;
1207 if (adjust_vmx_controls(min2, opt2, 1254 if (adjust_vmx_controls(min2, opt2,
1208 MSR_IA32_VMX_PROCBASED_CTLS2, 1255 MSR_IA32_VMX_PROCBASED_CTLS2,
1209 &_cpu_based_2nd_exec_control) < 0) 1256 &_cpu_based_2nd_exec_control) < 0)
@@ -1217,12 +1264,9 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
1217 if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) { 1264 if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) {
1218 /* CR3 accesses and invlpg don't need to cause VM Exits when EPT 1265 /* CR3 accesses and invlpg don't need to cause VM Exits when EPT
1219 enabled */ 1266 enabled */
1220 min &= ~(CPU_BASED_CR3_LOAD_EXITING | 1267 _cpu_based_exec_control &= ~(CPU_BASED_CR3_LOAD_EXITING |
1221 CPU_BASED_CR3_STORE_EXITING | 1268 CPU_BASED_CR3_STORE_EXITING |
1222 CPU_BASED_INVLPG_EXITING); 1269 CPU_BASED_INVLPG_EXITING);
1223 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS,
1224 &_cpu_based_exec_control) < 0)
1225 return -EIO;
1226 rdmsr(MSR_IA32_VMX_EPT_VPID_CAP, 1270 rdmsr(MSR_IA32_VMX_EPT_VPID_CAP,
1227 vmx_capability.ept, vmx_capability.vpid); 1271 vmx_capability.ept, vmx_capability.vpid);
1228 } 1272 }
@@ -1333,8 +1377,13 @@ static __init int hardware_setup(void)
1333 if (!cpu_has_vmx_vpid()) 1377 if (!cpu_has_vmx_vpid())
1334 enable_vpid = 0; 1378 enable_vpid = 0;
1335 1379
1336 if (!cpu_has_vmx_ept()) 1380 if (!cpu_has_vmx_ept()) {
1337 enable_ept = 0; 1381 enable_ept = 0;
1382 enable_unrestricted_guest = 0;
1383 }
1384
1385 if (!cpu_has_vmx_unrestricted_guest())
1386 enable_unrestricted_guest = 0;
1338 1387
1339 if (!cpu_has_vmx_flexpriority()) 1388 if (!cpu_has_vmx_flexpriority())
1340 flexpriority_enabled = 0; 1389 flexpriority_enabled = 0;
@@ -1342,6 +1391,9 @@ static __init int hardware_setup(void)
1342 if (!cpu_has_vmx_tpr_shadow()) 1391 if (!cpu_has_vmx_tpr_shadow())
1343 kvm_x86_ops->update_cr8_intercept = NULL; 1392 kvm_x86_ops->update_cr8_intercept = NULL;
1344 1393
1394 if (enable_ept && !cpu_has_vmx_ept_2m_page())
1395 kvm_disable_largepages();
1396
1345 return alloc_kvm_area(); 1397 return alloc_kvm_area();
1346} 1398}
1347 1399
@@ -1372,15 +1424,15 @@ static void enter_pmode(struct kvm_vcpu *vcpu)
1372 struct vcpu_vmx *vmx = to_vmx(vcpu); 1424 struct vcpu_vmx *vmx = to_vmx(vcpu);
1373 1425
1374 vmx->emulation_required = 1; 1426 vmx->emulation_required = 1;
1375 vcpu->arch.rmode.vm86_active = 0; 1427 vmx->rmode.vm86_active = 0;
1376 1428
1377 vmcs_writel(GUEST_TR_BASE, vcpu->arch.rmode.tr.base); 1429 vmcs_writel(GUEST_TR_BASE, vmx->rmode.tr.base);
1378 vmcs_write32(GUEST_TR_LIMIT, vcpu->arch.rmode.tr.limit); 1430 vmcs_write32(GUEST_TR_LIMIT, vmx->rmode.tr.limit);
1379 vmcs_write32(GUEST_TR_AR_BYTES, vcpu->arch.rmode.tr.ar); 1431 vmcs_write32(GUEST_TR_AR_BYTES, vmx->rmode.tr.ar);
1380 1432
1381 flags = vmcs_readl(GUEST_RFLAGS); 1433 flags = vmcs_readl(GUEST_RFLAGS);
1382 flags &= ~(X86_EFLAGS_IOPL | X86_EFLAGS_VM); 1434 flags &= ~(X86_EFLAGS_IOPL | X86_EFLAGS_VM);
1383 flags |= (vcpu->arch.rmode.save_iopl << IOPL_SHIFT); 1435 flags |= (vmx->rmode.save_iopl << IOPL_SHIFT);
1384 vmcs_writel(GUEST_RFLAGS, flags); 1436 vmcs_writel(GUEST_RFLAGS, flags);
1385 1437
1386 vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) | 1438 vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) |
@@ -1391,10 +1443,10 @@ static void enter_pmode(struct kvm_vcpu *vcpu)
1391 if (emulate_invalid_guest_state) 1443 if (emulate_invalid_guest_state)
1392 return; 1444 return;
1393 1445
1394 fix_pmode_dataseg(VCPU_SREG_ES, &vcpu->arch.rmode.es); 1446 fix_pmode_dataseg(VCPU_SREG_ES, &vmx->rmode.es);
1395 fix_pmode_dataseg(VCPU_SREG_DS, &vcpu->arch.rmode.ds); 1447 fix_pmode_dataseg(VCPU_SREG_DS, &vmx->rmode.ds);
1396 fix_pmode_dataseg(VCPU_SREG_GS, &vcpu->arch.rmode.gs); 1448 fix_pmode_dataseg(VCPU_SREG_GS, &vmx->rmode.gs);
1397 fix_pmode_dataseg(VCPU_SREG_FS, &vcpu->arch.rmode.fs); 1449 fix_pmode_dataseg(VCPU_SREG_FS, &vmx->rmode.fs);
1398 1450
1399 vmcs_write16(GUEST_SS_SELECTOR, 0); 1451 vmcs_write16(GUEST_SS_SELECTOR, 0);
1400 vmcs_write32(GUEST_SS_AR_BYTES, 0x93); 1452 vmcs_write32(GUEST_SS_AR_BYTES, 0x93);
@@ -1433,20 +1485,23 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
1433 unsigned long flags; 1485 unsigned long flags;
1434 struct vcpu_vmx *vmx = to_vmx(vcpu); 1486 struct vcpu_vmx *vmx = to_vmx(vcpu);
1435 1487
1488 if (enable_unrestricted_guest)
1489 return;
1490
1436 vmx->emulation_required = 1; 1491 vmx->emulation_required = 1;
1437 vcpu->arch.rmode.vm86_active = 1; 1492 vmx->rmode.vm86_active = 1;
1438 1493
1439 vcpu->arch.rmode.tr.base = vmcs_readl(GUEST_TR_BASE); 1494 vmx->rmode.tr.base = vmcs_readl(GUEST_TR_BASE);
1440 vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm)); 1495 vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm));
1441 1496
1442 vcpu->arch.rmode.tr.limit = vmcs_read32(GUEST_TR_LIMIT); 1497 vmx->rmode.tr.limit = vmcs_read32(GUEST_TR_LIMIT);
1443 vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1); 1498 vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1);
1444 1499
1445 vcpu->arch.rmode.tr.ar = vmcs_read32(GUEST_TR_AR_BYTES); 1500 vmx->rmode.tr.ar = vmcs_read32(GUEST_TR_AR_BYTES);
1446 vmcs_write32(GUEST_TR_AR_BYTES, 0x008b); 1501 vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
1447 1502
1448 flags = vmcs_readl(GUEST_RFLAGS); 1503 flags = vmcs_readl(GUEST_RFLAGS);
1449 vcpu->arch.rmode.save_iopl 1504 vmx->rmode.save_iopl
1450 = (flags & X86_EFLAGS_IOPL) >> IOPL_SHIFT; 1505 = (flags & X86_EFLAGS_IOPL) >> IOPL_SHIFT;
1451 1506
1452 flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; 1507 flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
@@ -1468,10 +1523,10 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
1468 vmcs_writel(GUEST_CS_BASE, 0xf0000); 1523 vmcs_writel(GUEST_CS_BASE, 0xf0000);
1469 vmcs_write16(GUEST_CS_SELECTOR, vmcs_readl(GUEST_CS_BASE) >> 4); 1524 vmcs_write16(GUEST_CS_SELECTOR, vmcs_readl(GUEST_CS_BASE) >> 4);
1470 1525
1471 fix_rmode_seg(VCPU_SREG_ES, &vcpu->arch.rmode.es); 1526 fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.es);
1472 fix_rmode_seg(VCPU_SREG_DS, &vcpu->arch.rmode.ds); 1527 fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.ds);
1473 fix_rmode_seg(VCPU_SREG_GS, &vcpu->arch.rmode.gs); 1528 fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.gs);
1474 fix_rmode_seg(VCPU_SREG_FS, &vcpu->arch.rmode.fs); 1529 fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.fs);
1475 1530
1476continue_rmode: 1531continue_rmode:
1477 kvm_mmu_reset_context(vcpu); 1532 kvm_mmu_reset_context(vcpu);
@@ -1545,11 +1600,11 @@ static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
1545 1600
1546static void ept_load_pdptrs(struct kvm_vcpu *vcpu) 1601static void ept_load_pdptrs(struct kvm_vcpu *vcpu)
1547{ 1602{
1603 if (!test_bit(VCPU_EXREG_PDPTR,
1604 (unsigned long *)&vcpu->arch.regs_dirty))
1605 return;
1606
1548 if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) { 1607 if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) {
1549 if (!load_pdptrs(vcpu, vcpu->arch.cr3)) {
1550 printk(KERN_ERR "EPT: Fail to load pdptrs!\n");
1551 return;
1552 }
1553 vmcs_write64(GUEST_PDPTR0, vcpu->arch.pdptrs[0]); 1608 vmcs_write64(GUEST_PDPTR0, vcpu->arch.pdptrs[0]);
1554 vmcs_write64(GUEST_PDPTR1, vcpu->arch.pdptrs[1]); 1609 vmcs_write64(GUEST_PDPTR1, vcpu->arch.pdptrs[1]);
1555 vmcs_write64(GUEST_PDPTR2, vcpu->arch.pdptrs[2]); 1610 vmcs_write64(GUEST_PDPTR2, vcpu->arch.pdptrs[2]);
@@ -1557,6 +1612,21 @@ static void ept_load_pdptrs(struct kvm_vcpu *vcpu)
1557 } 1612 }
1558} 1613}
1559 1614
1615static void ept_save_pdptrs(struct kvm_vcpu *vcpu)
1616{
1617 if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) {
1618 vcpu->arch.pdptrs[0] = vmcs_read64(GUEST_PDPTR0);
1619 vcpu->arch.pdptrs[1] = vmcs_read64(GUEST_PDPTR1);
1620 vcpu->arch.pdptrs[2] = vmcs_read64(GUEST_PDPTR2);
1621 vcpu->arch.pdptrs[3] = vmcs_read64(GUEST_PDPTR3);
1622 }
1623
1624 __set_bit(VCPU_EXREG_PDPTR,
1625 (unsigned long *)&vcpu->arch.regs_avail);
1626 __set_bit(VCPU_EXREG_PDPTR,
1627 (unsigned long *)&vcpu->arch.regs_dirty);
1628}
1629
1560static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); 1630static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
1561 1631
1562static void ept_update_paging_mode_cr0(unsigned long *hw_cr0, 1632static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
@@ -1571,8 +1641,6 @@ static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
1571 CPU_BASED_CR3_STORE_EXITING)); 1641 CPU_BASED_CR3_STORE_EXITING));
1572 vcpu->arch.cr0 = cr0; 1642 vcpu->arch.cr0 = cr0;
1573 vmx_set_cr4(vcpu, vcpu->arch.cr4); 1643 vmx_set_cr4(vcpu, vcpu->arch.cr4);
1574 *hw_cr0 |= X86_CR0_PE | X86_CR0_PG;
1575 *hw_cr0 &= ~X86_CR0_WP;
1576 } else if (!is_paging(vcpu)) { 1644 } else if (!is_paging(vcpu)) {
1577 /* From nonpaging to paging */ 1645 /* From nonpaging to paging */
1578 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, 1646 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
@@ -1581,9 +1649,10 @@ static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
1581 CPU_BASED_CR3_STORE_EXITING)); 1649 CPU_BASED_CR3_STORE_EXITING));
1582 vcpu->arch.cr0 = cr0; 1650 vcpu->arch.cr0 = cr0;
1583 vmx_set_cr4(vcpu, vcpu->arch.cr4); 1651 vmx_set_cr4(vcpu, vcpu->arch.cr4);
1584 if (!(vcpu->arch.cr0 & X86_CR0_WP))
1585 *hw_cr0 &= ~X86_CR0_WP;
1586 } 1652 }
1653
1654 if (!(cr0 & X86_CR0_WP))
1655 *hw_cr0 &= ~X86_CR0_WP;
1587} 1656}
1588 1657
1589static void ept_update_paging_mode_cr4(unsigned long *hw_cr4, 1658static void ept_update_paging_mode_cr4(unsigned long *hw_cr4,
@@ -1598,15 +1667,21 @@ static void ept_update_paging_mode_cr4(unsigned long *hw_cr4,
1598 1667
1599static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) 1668static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
1600{ 1669{
1601 unsigned long hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK) | 1670 struct vcpu_vmx *vmx = to_vmx(vcpu);
1602 KVM_VM_CR0_ALWAYS_ON; 1671 unsigned long hw_cr0;
1672
1673 if (enable_unrestricted_guest)
1674 hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST)
1675 | KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST;
1676 else
1677 hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK) | KVM_VM_CR0_ALWAYS_ON;
1603 1678
1604 vmx_fpu_deactivate(vcpu); 1679 vmx_fpu_deactivate(vcpu);
1605 1680
1606 if (vcpu->arch.rmode.vm86_active && (cr0 & X86_CR0_PE)) 1681 if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE))
1607 enter_pmode(vcpu); 1682 enter_pmode(vcpu);
1608 1683
1609 if (!vcpu->arch.rmode.vm86_active && !(cr0 & X86_CR0_PE)) 1684 if (!vmx->rmode.vm86_active && !(cr0 & X86_CR0_PE))
1610 enter_rmode(vcpu); 1685 enter_rmode(vcpu);
1611 1686
1612#ifdef CONFIG_X86_64 1687#ifdef CONFIG_X86_64
@@ -1650,10 +1725,8 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
1650 if (enable_ept) { 1725 if (enable_ept) {
1651 eptp = construct_eptp(cr3); 1726 eptp = construct_eptp(cr3);
1652 vmcs_write64(EPT_POINTER, eptp); 1727 vmcs_write64(EPT_POINTER, eptp);
1653 ept_sync_context(eptp);
1654 ept_load_pdptrs(vcpu);
1655 guest_cr3 = is_paging(vcpu) ? vcpu->arch.cr3 : 1728 guest_cr3 = is_paging(vcpu) ? vcpu->arch.cr3 :
1656 VMX_EPT_IDENTITY_PAGETABLE_ADDR; 1729 vcpu->kvm->arch.ept_identity_map_addr;
1657 } 1730 }
1658 1731
1659 vmx_flush_tlb(vcpu); 1732 vmx_flush_tlb(vcpu);
@@ -1664,7 +1737,7 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
1664 1737
1665static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) 1738static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
1666{ 1739{
1667 unsigned long hw_cr4 = cr4 | (vcpu->arch.rmode.vm86_active ? 1740 unsigned long hw_cr4 = cr4 | (to_vmx(vcpu)->rmode.vm86_active ?
1668 KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON); 1741 KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON);
1669 1742
1670 vcpu->arch.cr4 = cr4; 1743 vcpu->arch.cr4 = cr4;
@@ -1707,16 +1780,13 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu,
1707 1780
1708static int vmx_get_cpl(struct kvm_vcpu *vcpu) 1781static int vmx_get_cpl(struct kvm_vcpu *vcpu)
1709{ 1782{
1710 struct kvm_segment kvm_seg;
1711
1712 if (!(vcpu->arch.cr0 & X86_CR0_PE)) /* if real mode */ 1783 if (!(vcpu->arch.cr0 & X86_CR0_PE)) /* if real mode */
1713 return 0; 1784 return 0;
1714 1785
1715 if (vmx_get_rflags(vcpu) & X86_EFLAGS_VM) /* if virtual 8086 */ 1786 if (vmx_get_rflags(vcpu) & X86_EFLAGS_VM) /* if virtual 8086 */
1716 return 3; 1787 return 3;
1717 1788
1718 vmx_get_segment(vcpu, &kvm_seg, VCPU_SREG_CS); 1789 return vmcs_read16(GUEST_CS_SELECTOR) & 3;
1719 return kvm_seg.selector & 3;
1720} 1790}
1721 1791
1722static u32 vmx_segment_access_rights(struct kvm_segment *var) 1792static u32 vmx_segment_access_rights(struct kvm_segment *var)
@@ -1744,20 +1814,21 @@ static u32 vmx_segment_access_rights(struct kvm_segment *var)
1744static void vmx_set_segment(struct kvm_vcpu *vcpu, 1814static void vmx_set_segment(struct kvm_vcpu *vcpu,
1745 struct kvm_segment *var, int seg) 1815 struct kvm_segment *var, int seg)
1746{ 1816{
1817 struct vcpu_vmx *vmx = to_vmx(vcpu);
1747 struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; 1818 struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
1748 u32 ar; 1819 u32 ar;
1749 1820
1750 if (vcpu->arch.rmode.vm86_active && seg == VCPU_SREG_TR) { 1821 if (vmx->rmode.vm86_active && seg == VCPU_SREG_TR) {
1751 vcpu->arch.rmode.tr.selector = var->selector; 1822 vmx->rmode.tr.selector = var->selector;
1752 vcpu->arch.rmode.tr.base = var->base; 1823 vmx->rmode.tr.base = var->base;
1753 vcpu->arch.rmode.tr.limit = var->limit; 1824 vmx->rmode.tr.limit = var->limit;
1754 vcpu->arch.rmode.tr.ar = vmx_segment_access_rights(var); 1825 vmx->rmode.tr.ar = vmx_segment_access_rights(var);
1755 return; 1826 return;
1756 } 1827 }
1757 vmcs_writel(sf->base, var->base); 1828 vmcs_writel(sf->base, var->base);
1758 vmcs_write32(sf->limit, var->limit); 1829 vmcs_write32(sf->limit, var->limit);
1759 vmcs_write16(sf->selector, var->selector); 1830 vmcs_write16(sf->selector, var->selector);
1760 if (vcpu->arch.rmode.vm86_active && var->s) { 1831 if (vmx->rmode.vm86_active && var->s) {
1761 /* 1832 /*
1762 * Hack real-mode segments into vm86 compatibility. 1833 * Hack real-mode segments into vm86 compatibility.
1763 */ 1834 */
@@ -1766,6 +1837,21 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu,
1766 ar = 0xf3; 1837 ar = 0xf3;
1767 } else 1838 } else
1768 ar = vmx_segment_access_rights(var); 1839 ar = vmx_segment_access_rights(var);
1840
1841 /*
1842 * Fix the "Accessed" bit in AR field of segment registers for older
1843 * qemu binaries.
1844 * IA32 arch specifies that at the time of processor reset the
1845 * "Accessed" bit in the AR field of segment registers is 1. And qemu
1846 * is setting it to 0 in the userland code. This causes invalid guest
1847 * state vmexit when "unrestricted guest" mode is turned on.
1848 * Fix for this setup issue in cpu_reset is being pushed in the qemu
1849 * tree. Newer qemu binaries with that qemu fix would not need this
1850 * kvm hack.
1851 */
1852 if (enable_unrestricted_guest && (seg != VCPU_SREG_LDTR))
1853 ar |= 0x1; /* Accessed */
1854
1769 vmcs_write32(sf->ar_bytes, ar); 1855 vmcs_write32(sf->ar_bytes, ar);
1770} 1856}
1771 1857
@@ -2040,7 +2126,7 @@ static int init_rmode_identity_map(struct kvm *kvm)
2040 if (likely(kvm->arch.ept_identity_pagetable_done)) 2126 if (likely(kvm->arch.ept_identity_pagetable_done))
2041 return 1; 2127 return 1;
2042 ret = 0; 2128 ret = 0;
2043 identity_map_pfn = VMX_EPT_IDENTITY_PAGETABLE_ADDR >> PAGE_SHIFT; 2129 identity_map_pfn = kvm->arch.ept_identity_map_addr >> PAGE_SHIFT;
2044 r = kvm_clear_guest_page(kvm, identity_map_pfn, 0, PAGE_SIZE); 2130 r = kvm_clear_guest_page(kvm, identity_map_pfn, 0, PAGE_SIZE);
2045 if (r < 0) 2131 if (r < 0)
2046 goto out; 2132 goto out;
@@ -2062,11 +2148,19 @@ out:
2062static void seg_setup(int seg) 2148static void seg_setup(int seg)
2063{ 2149{
2064 struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; 2150 struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
2151 unsigned int ar;
2065 2152
2066 vmcs_write16(sf->selector, 0); 2153 vmcs_write16(sf->selector, 0);
2067 vmcs_writel(sf->base, 0); 2154 vmcs_writel(sf->base, 0);
2068 vmcs_write32(sf->limit, 0xffff); 2155 vmcs_write32(sf->limit, 0xffff);
2069 vmcs_write32(sf->ar_bytes, 0xf3); 2156 if (enable_unrestricted_guest) {
2157 ar = 0x93;
2158 if (seg == VCPU_SREG_CS)
2159 ar |= 0x08; /* code segment */
2160 } else
2161 ar = 0xf3;
2162
2163 vmcs_write32(sf->ar_bytes, ar);
2070} 2164}
2071 2165
2072static int alloc_apic_access_page(struct kvm *kvm) 2166static int alloc_apic_access_page(struct kvm *kvm)
@@ -2101,14 +2195,15 @@ static int alloc_identity_pagetable(struct kvm *kvm)
2101 goto out; 2195 goto out;
2102 kvm_userspace_mem.slot = IDENTITY_PAGETABLE_PRIVATE_MEMSLOT; 2196 kvm_userspace_mem.slot = IDENTITY_PAGETABLE_PRIVATE_MEMSLOT;
2103 kvm_userspace_mem.flags = 0; 2197 kvm_userspace_mem.flags = 0;
2104 kvm_userspace_mem.guest_phys_addr = VMX_EPT_IDENTITY_PAGETABLE_ADDR; 2198 kvm_userspace_mem.guest_phys_addr =
2199 kvm->arch.ept_identity_map_addr;
2105 kvm_userspace_mem.memory_size = PAGE_SIZE; 2200 kvm_userspace_mem.memory_size = PAGE_SIZE;
2106 r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, 0); 2201 r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, 0);
2107 if (r) 2202 if (r)
2108 goto out; 2203 goto out;
2109 2204
2110 kvm->arch.ept_identity_pagetable = gfn_to_page(kvm, 2205 kvm->arch.ept_identity_pagetable = gfn_to_page(kvm,
2111 VMX_EPT_IDENTITY_PAGETABLE_ADDR >> PAGE_SHIFT); 2206 kvm->arch.ept_identity_map_addr >> PAGE_SHIFT);
2112out: 2207out:
2113 up_write(&kvm->slots_lock); 2208 up_write(&kvm->slots_lock);
2114 return r; 2209 return r;
@@ -2209,6 +2304,8 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
2209 exec_control &= ~SECONDARY_EXEC_ENABLE_VPID; 2304 exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
2210 if (!enable_ept) 2305 if (!enable_ept)
2211 exec_control &= ~SECONDARY_EXEC_ENABLE_EPT; 2306 exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
2307 if (!enable_unrestricted_guest)
2308 exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
2212 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); 2309 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
2213 } 2310 }
2214 2311
@@ -2326,14 +2423,14 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
2326 goto out; 2423 goto out;
2327 } 2424 }
2328 2425
2329 vmx->vcpu.arch.rmode.vm86_active = 0; 2426 vmx->rmode.vm86_active = 0;
2330 2427
2331 vmx->soft_vnmi_blocked = 0; 2428 vmx->soft_vnmi_blocked = 0;
2332 2429
2333 vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val(); 2430 vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val();
2334 kvm_set_cr8(&vmx->vcpu, 0); 2431 kvm_set_cr8(&vmx->vcpu, 0);
2335 msr = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; 2432 msr = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
2336 if (vmx->vcpu.vcpu_id == 0) 2433 if (kvm_vcpu_is_bsp(&vmx->vcpu))
2337 msr |= MSR_IA32_APICBASE_BSP; 2434 msr |= MSR_IA32_APICBASE_BSP;
2338 kvm_set_apic_base(&vmx->vcpu, msr); 2435 kvm_set_apic_base(&vmx->vcpu, msr);
2339 2436
@@ -2344,7 +2441,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
2344 * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode 2441 * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode
2345 * insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4. Sigh. 2442 * insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4. Sigh.
2346 */ 2443 */
2347 if (vmx->vcpu.vcpu_id == 0) { 2444 if (kvm_vcpu_is_bsp(&vmx->vcpu)) {
2348 vmcs_write16(GUEST_CS_SELECTOR, 0xf000); 2445 vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
2349 vmcs_writel(GUEST_CS_BASE, 0x000f0000); 2446 vmcs_writel(GUEST_CS_BASE, 0x000f0000);
2350 } else { 2447 } else {
@@ -2373,7 +2470,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
2373 vmcs_writel(GUEST_SYSENTER_EIP, 0); 2470 vmcs_writel(GUEST_SYSENTER_EIP, 0);
2374 2471
2375 vmcs_writel(GUEST_RFLAGS, 0x02); 2472 vmcs_writel(GUEST_RFLAGS, 0x02);
2376 if (vmx->vcpu.vcpu_id == 0) 2473 if (kvm_vcpu_is_bsp(&vmx->vcpu))
2377 kvm_rip_write(vcpu, 0xfff0); 2474 kvm_rip_write(vcpu, 0xfff0);
2378 else 2475 else
2379 kvm_rip_write(vcpu, 0); 2476 kvm_rip_write(vcpu, 0);
@@ -2461,13 +2558,16 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu)
2461 uint32_t intr; 2558 uint32_t intr;
2462 int irq = vcpu->arch.interrupt.nr; 2559 int irq = vcpu->arch.interrupt.nr;
2463 2560
2464 KVMTRACE_1D(INJ_VIRQ, vcpu, (u32)irq, handler); 2561 trace_kvm_inj_virq(irq);
2465 2562
2466 ++vcpu->stat.irq_injections; 2563 ++vcpu->stat.irq_injections;
2467 if (vcpu->arch.rmode.vm86_active) { 2564 if (vmx->rmode.vm86_active) {
2468 vmx->rmode.irq.pending = true; 2565 vmx->rmode.irq.pending = true;
2469 vmx->rmode.irq.vector = irq; 2566 vmx->rmode.irq.vector = irq;
2470 vmx->rmode.irq.rip = kvm_rip_read(vcpu); 2567 vmx->rmode.irq.rip = kvm_rip_read(vcpu);
2568 if (vcpu->arch.interrupt.soft)
2569 vmx->rmode.irq.rip +=
2570 vmx->vcpu.arch.event_exit_inst_len;
2471 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 2571 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
2472 irq | INTR_TYPE_SOFT_INTR | INTR_INFO_VALID_MASK); 2572 irq | INTR_TYPE_SOFT_INTR | INTR_INFO_VALID_MASK);
2473 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1); 2573 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1);
@@ -2502,7 +2602,7 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
2502 } 2602 }
2503 2603
2504 ++vcpu->stat.nmi_injections; 2604 ++vcpu->stat.nmi_injections;
2505 if (vcpu->arch.rmode.vm86_active) { 2605 if (vmx->rmode.vm86_active) {
2506 vmx->rmode.irq.pending = true; 2606 vmx->rmode.irq.pending = true;
2507 vmx->rmode.irq.vector = NMI_VECTOR; 2607 vmx->rmode.irq.vector = NMI_VECTOR;
2508 vmx->rmode.irq.rip = kvm_rip_read(vcpu); 2608 vmx->rmode.irq.rip = kvm_rip_read(vcpu);
@@ -2659,14 +2759,14 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2659 if (enable_ept) 2759 if (enable_ept)
2660 BUG(); 2760 BUG();
2661 cr2 = vmcs_readl(EXIT_QUALIFICATION); 2761 cr2 = vmcs_readl(EXIT_QUALIFICATION);
2662 KVMTRACE_3D(PAGE_FAULT, vcpu, error_code, (u32)cr2, 2762 trace_kvm_page_fault(cr2, error_code);
2663 (u32)((u64)cr2 >> 32), handler); 2763
2664 if (kvm_event_needs_reinjection(vcpu)) 2764 if (kvm_event_needs_reinjection(vcpu))
2665 kvm_mmu_unprotect_page_virt(vcpu, cr2); 2765 kvm_mmu_unprotect_page_virt(vcpu, cr2);
2666 return kvm_mmu_page_fault(vcpu, cr2, error_code); 2766 return kvm_mmu_page_fault(vcpu, cr2, error_code);
2667 } 2767 }
2668 2768
2669 if (vcpu->arch.rmode.vm86_active && 2769 if (vmx->rmode.vm86_active &&
2670 handle_rmode_exception(vcpu, intr_info & INTR_INFO_VECTOR_MASK, 2770 handle_rmode_exception(vcpu, intr_info & INTR_INFO_VECTOR_MASK,
2671 error_code)) { 2771 error_code)) {
2672 if (vcpu->arch.halt_request) { 2772 if (vcpu->arch.halt_request) {
@@ -2707,7 +2807,6 @@ static int handle_external_interrupt(struct kvm_vcpu *vcpu,
2707 struct kvm_run *kvm_run) 2807 struct kvm_run *kvm_run)
2708{ 2808{
2709 ++vcpu->stat.irq_exits; 2809 ++vcpu->stat.irq_exits;
2710 KVMTRACE_1D(INTR, vcpu, vmcs_read32(VM_EXIT_INTR_INFO), handler);
2711 return 1; 2810 return 1;
2712} 2811}
2713 2812
@@ -2755,7 +2854,7 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
2755 2854
2756static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 2855static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2757{ 2856{
2758 unsigned long exit_qualification; 2857 unsigned long exit_qualification, val;
2759 int cr; 2858 int cr;
2760 int reg; 2859 int reg;
2761 2860
@@ -2764,21 +2863,19 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2764 reg = (exit_qualification >> 8) & 15; 2863 reg = (exit_qualification >> 8) & 15;
2765 switch ((exit_qualification >> 4) & 3) { 2864 switch ((exit_qualification >> 4) & 3) {
2766 case 0: /* mov to cr */ 2865 case 0: /* mov to cr */
2767 KVMTRACE_3D(CR_WRITE, vcpu, (u32)cr, 2866 val = kvm_register_read(vcpu, reg);
2768 (u32)kvm_register_read(vcpu, reg), 2867 trace_kvm_cr_write(cr, val);
2769 (u32)((u64)kvm_register_read(vcpu, reg) >> 32),
2770 handler);
2771 switch (cr) { 2868 switch (cr) {
2772 case 0: 2869 case 0:
2773 kvm_set_cr0(vcpu, kvm_register_read(vcpu, reg)); 2870 kvm_set_cr0(vcpu, val);
2774 skip_emulated_instruction(vcpu); 2871 skip_emulated_instruction(vcpu);
2775 return 1; 2872 return 1;
2776 case 3: 2873 case 3:
2777 kvm_set_cr3(vcpu, kvm_register_read(vcpu, reg)); 2874 kvm_set_cr3(vcpu, val);
2778 skip_emulated_instruction(vcpu); 2875 skip_emulated_instruction(vcpu);
2779 return 1; 2876 return 1;
2780 case 4: 2877 case 4:
2781 kvm_set_cr4(vcpu, kvm_register_read(vcpu, reg)); 2878 kvm_set_cr4(vcpu, val);
2782 skip_emulated_instruction(vcpu); 2879 skip_emulated_instruction(vcpu);
2783 return 1; 2880 return 1;
2784 case 8: { 2881 case 8: {
@@ -2800,23 +2897,19 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2800 vcpu->arch.cr0 &= ~X86_CR0_TS; 2897 vcpu->arch.cr0 &= ~X86_CR0_TS;
2801 vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0); 2898 vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0);
2802 vmx_fpu_activate(vcpu); 2899 vmx_fpu_activate(vcpu);
2803 KVMTRACE_0D(CLTS, vcpu, handler);
2804 skip_emulated_instruction(vcpu); 2900 skip_emulated_instruction(vcpu);
2805 return 1; 2901 return 1;
2806 case 1: /*mov from cr*/ 2902 case 1: /*mov from cr*/
2807 switch (cr) { 2903 switch (cr) {
2808 case 3: 2904 case 3:
2809 kvm_register_write(vcpu, reg, vcpu->arch.cr3); 2905 kvm_register_write(vcpu, reg, vcpu->arch.cr3);
2810 KVMTRACE_3D(CR_READ, vcpu, (u32)cr, 2906 trace_kvm_cr_read(cr, vcpu->arch.cr3);
2811 (u32)kvm_register_read(vcpu, reg),
2812 (u32)((u64)kvm_register_read(vcpu, reg) >> 32),
2813 handler);
2814 skip_emulated_instruction(vcpu); 2907 skip_emulated_instruction(vcpu);
2815 return 1; 2908 return 1;
2816 case 8: 2909 case 8:
2817 kvm_register_write(vcpu, reg, kvm_get_cr8(vcpu)); 2910 val = kvm_get_cr8(vcpu);
2818 KVMTRACE_2D(CR_READ, vcpu, (u32)cr, 2911 kvm_register_write(vcpu, reg, val);
2819 (u32)kvm_register_read(vcpu, reg), handler); 2912 trace_kvm_cr_read(cr, val);
2820 skip_emulated_instruction(vcpu); 2913 skip_emulated_instruction(vcpu);
2821 return 1; 2914 return 1;
2822 } 2915 }
@@ -2841,6 +2934,8 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2841 unsigned long val; 2934 unsigned long val;
2842 int dr, reg; 2935 int dr, reg;
2843 2936
2937 if (!kvm_require_cpl(vcpu, 0))
2938 return 1;
2844 dr = vmcs_readl(GUEST_DR7); 2939 dr = vmcs_readl(GUEST_DR7);
2845 if (dr & DR7_GD) { 2940 if (dr & DR7_GD) {
2846 /* 2941 /*
@@ -2884,7 +2979,6 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2884 val = 0; 2979 val = 0;
2885 } 2980 }
2886 kvm_register_write(vcpu, reg, val); 2981 kvm_register_write(vcpu, reg, val);
2887 KVMTRACE_2D(DR_READ, vcpu, (u32)dr, (u32)val, handler);
2888 } else { 2982 } else {
2889 val = vcpu->arch.regs[reg]; 2983 val = vcpu->arch.regs[reg];
2890 switch (dr) { 2984 switch (dr) {
@@ -2917,7 +3011,6 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2917 } 3011 }
2918 break; 3012 break;
2919 } 3013 }
2920 KVMTRACE_2D(DR_WRITE, vcpu, (u32)dr, (u32)val, handler);
2921 } 3014 }
2922 skip_emulated_instruction(vcpu); 3015 skip_emulated_instruction(vcpu);
2923 return 1; 3016 return 1;
@@ -2939,8 +3032,7 @@ static int handle_rdmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2939 return 1; 3032 return 1;
2940 } 3033 }
2941 3034
2942 KVMTRACE_3D(MSR_READ, vcpu, ecx, (u32)data, (u32)(data >> 32), 3035 trace_kvm_msr_read(ecx, data);
2943 handler);
2944 3036
2945 /* FIXME: handling of bits 32:63 of rax, rdx */ 3037 /* FIXME: handling of bits 32:63 of rax, rdx */
2946 vcpu->arch.regs[VCPU_REGS_RAX] = data & -1u; 3038 vcpu->arch.regs[VCPU_REGS_RAX] = data & -1u;
@@ -2955,8 +3047,7 @@ static int handle_wrmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2955 u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u) 3047 u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u)
2956 | ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32); 3048 | ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32);
2957 3049
2958 KVMTRACE_3D(MSR_WRITE, vcpu, ecx, (u32)data, (u32)(data >> 32), 3050 trace_kvm_msr_write(ecx, data);
2959 handler);
2960 3051
2961 if (vmx_set_msr(vcpu, ecx, data) != 0) { 3052 if (vmx_set_msr(vcpu, ecx, data) != 0) {
2962 kvm_inject_gp(vcpu, 0); 3053 kvm_inject_gp(vcpu, 0);
@@ -2983,7 +3074,6 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu,
2983 cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING; 3074 cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
2984 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); 3075 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
2985 3076
2986 KVMTRACE_0D(PEND_INTR, vcpu, handler);
2987 ++vcpu->stat.irq_window_exits; 3077 ++vcpu->stat.irq_window_exits;
2988 3078
2989 /* 3079 /*
@@ -3049,7 +3139,7 @@ static int handle_apic_access(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3049 printk(KERN_ERR 3139 printk(KERN_ERR
3050 "Fail to handle apic access vmexit! Offset is 0x%lx\n", 3140 "Fail to handle apic access vmexit! Offset is 0x%lx\n",
3051 offset); 3141 offset);
3052 return -ENOTSUPP; 3142 return -ENOEXEC;
3053 } 3143 }
3054 return 1; 3144 return 1;
3055} 3145}
@@ -3118,7 +3208,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3118 3208
3119 if (exit_qualification & (1 << 6)) { 3209 if (exit_qualification & (1 << 6)) {
3120 printk(KERN_ERR "EPT: GPA exceeds GAW!\n"); 3210 printk(KERN_ERR "EPT: GPA exceeds GAW!\n");
3121 return -ENOTSUPP; 3211 return -EINVAL;
3122 } 3212 }
3123 3213
3124 gla_validity = (exit_qualification >> 7) & 0x3; 3214 gla_validity = (exit_qualification >> 7) & 0x3;
@@ -3130,14 +3220,98 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3130 printk(KERN_ERR "EPT: Exit qualification is 0x%lx\n", 3220 printk(KERN_ERR "EPT: Exit qualification is 0x%lx\n",
3131 (long unsigned int)exit_qualification); 3221 (long unsigned int)exit_qualification);
3132 kvm_run->exit_reason = KVM_EXIT_UNKNOWN; 3222 kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
3133 kvm_run->hw.hardware_exit_reason = 0; 3223 kvm_run->hw.hardware_exit_reason = EXIT_REASON_EPT_VIOLATION;
3134 return -ENOTSUPP; 3224 return 0;
3135 } 3225 }
3136 3226
3137 gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); 3227 gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
3228 trace_kvm_page_fault(gpa, exit_qualification);
3138 return kvm_mmu_page_fault(vcpu, gpa & PAGE_MASK, 0); 3229 return kvm_mmu_page_fault(vcpu, gpa & PAGE_MASK, 0);
3139} 3230}
3140 3231
3232static u64 ept_rsvd_mask(u64 spte, int level)
3233{
3234 int i;
3235 u64 mask = 0;
3236
3237 for (i = 51; i > boot_cpu_data.x86_phys_bits; i--)
3238 mask |= (1ULL << i);
3239
3240 if (level > 2)
3241 /* bits 7:3 reserved */
3242 mask |= 0xf8;
3243 else if (level == 2) {
3244 if (spte & (1ULL << 7))
3245 /* 2MB ref, bits 20:12 reserved */
3246 mask |= 0x1ff000;
3247 else
3248 /* bits 6:3 reserved */
3249 mask |= 0x78;
3250 }
3251
3252 return mask;
3253}
3254
3255static void ept_misconfig_inspect_spte(struct kvm_vcpu *vcpu, u64 spte,
3256 int level)
3257{
3258 printk(KERN_ERR "%s: spte 0x%llx level %d\n", __func__, spte, level);
3259
3260 /* 010b (write-only) */
3261 WARN_ON((spte & 0x7) == 0x2);
3262
3263 /* 110b (write/execute) */
3264 WARN_ON((spte & 0x7) == 0x6);
3265
3266 /* 100b (execute-only) and value not supported by logical processor */
3267 if (!cpu_has_vmx_ept_execute_only())
3268 WARN_ON((spte & 0x7) == 0x4);
3269
3270 /* not 000b */
3271 if ((spte & 0x7)) {
3272 u64 rsvd_bits = spte & ept_rsvd_mask(spte, level);
3273
3274 if (rsvd_bits != 0) {
3275 printk(KERN_ERR "%s: rsvd_bits = 0x%llx\n",
3276 __func__, rsvd_bits);
3277 WARN_ON(1);
3278 }
3279
3280 if (level == 1 || (level == 2 && (spte & (1ULL << 7)))) {
3281 u64 ept_mem_type = (spte & 0x38) >> 3;
3282
3283 if (ept_mem_type == 2 || ept_mem_type == 3 ||
3284 ept_mem_type == 7) {
3285 printk(KERN_ERR "%s: ept_mem_type=0x%llx\n",
3286 __func__, ept_mem_type);
3287 WARN_ON(1);
3288 }
3289 }
3290 }
3291}
3292
3293static int handle_ept_misconfig(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3294{
3295 u64 sptes[4];
3296 int nr_sptes, i;
3297 gpa_t gpa;
3298
3299 gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
3300
3301 printk(KERN_ERR "EPT: Misconfiguration.\n");
3302 printk(KERN_ERR "EPT: GPA: 0x%llx\n", gpa);
3303
3304 nr_sptes = kvm_mmu_get_spte_hierarchy(vcpu, gpa, sptes);
3305
3306 for (i = PT64_ROOT_LEVEL; i > PT64_ROOT_LEVEL - nr_sptes; --i)
3307 ept_misconfig_inspect_spte(vcpu, sptes[i-1], i);
3308
3309 kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
3310 kvm_run->hw.hardware_exit_reason = EXIT_REASON_EPT_MISCONFIG;
3311
3312 return 0;
3313}
3314
3141static int handle_nmi_window(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3315static int handle_nmi_window(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3142{ 3316{
3143 u32 cpu_based_vm_exec_control; 3317 u32 cpu_based_vm_exec_control;
@@ -3217,8 +3391,9 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu,
3217 [EXIT_REASON_APIC_ACCESS] = handle_apic_access, 3391 [EXIT_REASON_APIC_ACCESS] = handle_apic_access,
3218 [EXIT_REASON_WBINVD] = handle_wbinvd, 3392 [EXIT_REASON_WBINVD] = handle_wbinvd,
3219 [EXIT_REASON_TASK_SWITCH] = handle_task_switch, 3393 [EXIT_REASON_TASK_SWITCH] = handle_task_switch,
3220 [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation,
3221 [EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check, 3394 [EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check,
3395 [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation,
3396 [EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig,
3222}; 3397};
3223 3398
3224static const int kvm_vmx_max_exit_handlers = 3399static const int kvm_vmx_max_exit_handlers =
@@ -3234,8 +3409,7 @@ static int vmx_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
3234 u32 exit_reason = vmx->exit_reason; 3409 u32 exit_reason = vmx->exit_reason;
3235 u32 vectoring_info = vmx->idt_vectoring_info; 3410 u32 vectoring_info = vmx->idt_vectoring_info;
3236 3411
3237 KVMTRACE_3D(VMEXIT, vcpu, exit_reason, (u32)kvm_rip_read(vcpu), 3412 trace_kvm_exit(exit_reason, kvm_rip_read(vcpu));
3238 (u32)((u64)kvm_rip_read(vcpu) >> 32), entryexit);
3239 3413
3240 /* If we need to emulate an MMIO from handle_invalid_guest_state 3414 /* If we need to emulate an MMIO from handle_invalid_guest_state
3241 * we just return 0 */ 3415 * we just return 0 */
@@ -3247,10 +3421,8 @@ static int vmx_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
3247 3421
3248 /* Access CR3 don't cause VMExit in paging mode, so we need 3422 /* Access CR3 don't cause VMExit in paging mode, so we need
3249 * to sync with guest real CR3. */ 3423 * to sync with guest real CR3. */
3250 if (enable_ept && is_paging(vcpu)) { 3424 if (enable_ept && is_paging(vcpu))
3251 vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); 3425 vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
3252 ept_load_pdptrs(vcpu);
3253 }
3254 3426
3255 if (unlikely(vmx->fail)) { 3427 if (unlikely(vmx->fail)) {
3256 kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; 3428 kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
@@ -3326,10 +3498,8 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
3326 3498
3327 /* We need to handle NMIs before interrupts are enabled */ 3499 /* We need to handle NMIs before interrupts are enabled */
3328 if ((exit_intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR && 3500 if ((exit_intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR &&
3329 (exit_intr_info & INTR_INFO_VALID_MASK)) { 3501 (exit_intr_info & INTR_INFO_VALID_MASK))
3330 KVMTRACE_0D(NMI, &vmx->vcpu, handler);
3331 asm("int $2"); 3502 asm("int $2");
3332 }
3333 3503
3334 idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK; 3504 idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK;
3335 3505
@@ -3434,6 +3604,10 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3434{ 3604{
3435 struct vcpu_vmx *vmx = to_vmx(vcpu); 3605 struct vcpu_vmx *vmx = to_vmx(vcpu);
3436 3606
3607 if (enable_ept && is_paging(vcpu)) {
3608 vmcs_writel(GUEST_CR3, vcpu->arch.cr3);
3609 ept_load_pdptrs(vcpu);
3610 }
3437 /* Record the guest's net vcpu time for enforced NMI injections. */ 3611 /* Record the guest's net vcpu time for enforced NMI injections. */
3438 if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) 3612 if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked))
3439 vmx->entry_time = ktime_get(); 3613 vmx->entry_time = ktime_get();
@@ -3449,12 +3623,21 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3449 if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty)) 3623 if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty))
3450 vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]); 3624 vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
3451 3625
3626 /* When single-stepping over STI and MOV SS, we must clear the
3627 * corresponding interruptibility bits in the guest state. Otherwise
3628 * vmentry fails as it then expects bit 14 (BS) in pending debug
3629 * exceptions being set, but that's not correct for the guest debugging
3630 * case. */
3631 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
3632 vmx_set_interrupt_shadow(vcpu, 0);
3633
3452 /* 3634 /*
3453 * Loading guest fpu may have cleared host cr0.ts 3635 * Loading guest fpu may have cleared host cr0.ts
3454 */ 3636 */
3455 vmcs_writel(HOST_CR0, read_cr0()); 3637 vmcs_writel(HOST_CR0, read_cr0());
3456 3638
3457 set_debugreg(vcpu->arch.dr6, 6); 3639 if (vcpu->arch.switch_db_regs)
3640 set_debugreg(vcpu->arch.dr6, 6);
3458 3641
3459 asm( 3642 asm(
3460 /* Store host registers */ 3643 /* Store host registers */
@@ -3465,11 +3648,16 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3465 "mov %%"R"sp, %c[host_rsp](%0) \n\t" 3648 "mov %%"R"sp, %c[host_rsp](%0) \n\t"
3466 __ex(ASM_VMX_VMWRITE_RSP_RDX) "\n\t" 3649 __ex(ASM_VMX_VMWRITE_RSP_RDX) "\n\t"
3467 "1: \n\t" 3650 "1: \n\t"
3651 /* Reload cr2 if changed */
3652 "mov %c[cr2](%0), %%"R"ax \n\t"
3653 "mov %%cr2, %%"R"dx \n\t"
3654 "cmp %%"R"ax, %%"R"dx \n\t"
3655 "je 2f \n\t"
3656 "mov %%"R"ax, %%cr2 \n\t"
3657 "2: \n\t"
3468 /* Check if vmlaunch of vmresume is needed */ 3658 /* Check if vmlaunch of vmresume is needed */
3469 "cmpl $0, %c[launched](%0) \n\t" 3659 "cmpl $0, %c[launched](%0) \n\t"
3470 /* Load guest registers. Don't clobber flags. */ 3660 /* Load guest registers. Don't clobber flags. */
3471 "mov %c[cr2](%0), %%"R"ax \n\t"
3472 "mov %%"R"ax, %%cr2 \n\t"
3473 "mov %c[rax](%0), %%"R"ax \n\t" 3661 "mov %c[rax](%0), %%"R"ax \n\t"
3474 "mov %c[rbx](%0), %%"R"bx \n\t" 3662 "mov %c[rbx](%0), %%"R"bx \n\t"
3475 "mov %c[rdx](%0), %%"R"dx \n\t" 3663 "mov %c[rdx](%0), %%"R"dx \n\t"
@@ -3547,10 +3735,12 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3547#endif 3735#endif
3548 ); 3736 );
3549 3737
3550 vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)); 3738 vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)
3739 | (1 << VCPU_EXREG_PDPTR));
3551 vcpu->arch.regs_dirty = 0; 3740 vcpu->arch.regs_dirty = 0;
3552 3741
3553 get_debugreg(vcpu->arch.dr6, 6); 3742 if (vcpu->arch.switch_db_regs)
3743 get_debugreg(vcpu->arch.dr6, 6);
3554 3744
3555 vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); 3745 vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
3556 if (vmx->rmode.irq.pending) 3746 if (vmx->rmode.irq.pending)
@@ -3633,9 +3823,13 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
3633 if (alloc_apic_access_page(kvm) != 0) 3823 if (alloc_apic_access_page(kvm) != 0)
3634 goto free_vmcs; 3824 goto free_vmcs;
3635 3825
3636 if (enable_ept) 3826 if (enable_ept) {
3827 if (!kvm->arch.ept_identity_map_addr)
3828 kvm->arch.ept_identity_map_addr =
3829 VMX_EPT_IDENTITY_PAGETABLE_ADDR;
3637 if (alloc_identity_pagetable(kvm) != 0) 3830 if (alloc_identity_pagetable(kvm) != 0)
3638 goto free_vmcs; 3831 goto free_vmcs;
3832 }
3639 3833
3640 return &vmx->vcpu; 3834 return &vmx->vcpu;
3641 3835
@@ -3699,6 +3893,34 @@ static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
3699 return ret; 3893 return ret;
3700} 3894}
3701 3895
3896static const struct trace_print_flags vmx_exit_reasons_str[] = {
3897 { EXIT_REASON_EXCEPTION_NMI, "exception" },
3898 { EXIT_REASON_EXTERNAL_INTERRUPT, "ext_irq" },
3899 { EXIT_REASON_TRIPLE_FAULT, "triple_fault" },
3900 { EXIT_REASON_NMI_WINDOW, "nmi_window" },
3901 { EXIT_REASON_IO_INSTRUCTION, "io_instruction" },
3902 { EXIT_REASON_CR_ACCESS, "cr_access" },
3903 { EXIT_REASON_DR_ACCESS, "dr_access" },
3904 { EXIT_REASON_CPUID, "cpuid" },
3905 { EXIT_REASON_MSR_READ, "rdmsr" },
3906 { EXIT_REASON_MSR_WRITE, "wrmsr" },
3907 { EXIT_REASON_PENDING_INTERRUPT, "interrupt_window" },
3908 { EXIT_REASON_HLT, "halt" },
3909 { EXIT_REASON_INVLPG, "invlpg" },
3910 { EXIT_REASON_VMCALL, "hypercall" },
3911 { EXIT_REASON_TPR_BELOW_THRESHOLD, "tpr_below_thres" },
3912 { EXIT_REASON_APIC_ACCESS, "apic_access" },
3913 { EXIT_REASON_WBINVD, "wbinvd" },
3914 { EXIT_REASON_TASK_SWITCH, "task_switch" },
3915 { EXIT_REASON_EPT_VIOLATION, "ept_violation" },
3916 { -1, NULL }
3917};
3918
3919static bool vmx_gb_page_enable(void)
3920{
3921 return false;
3922}
3923
3702static struct kvm_x86_ops vmx_x86_ops = { 3924static struct kvm_x86_ops vmx_x86_ops = {
3703 .cpu_has_kvm_support = cpu_has_kvm_support, 3925 .cpu_has_kvm_support = cpu_has_kvm_support,
3704 .disabled_by_bios = vmx_disabled_by_bios, 3926 .disabled_by_bios = vmx_disabled_by_bios,
@@ -3758,6 +3980,9 @@ static struct kvm_x86_ops vmx_x86_ops = {
3758 .set_tss_addr = vmx_set_tss_addr, 3980 .set_tss_addr = vmx_set_tss_addr,
3759 .get_tdp_level = get_ept_level, 3981 .get_tdp_level = get_ept_level,
3760 .get_mt_mask = vmx_get_mt_mask, 3982 .get_mt_mask = vmx_get_mt_mask,
3983
3984 .exit_reasons_str = vmx_exit_reasons_str,
3985 .gb_page_enable = vmx_gb_page_enable,
3761}; 3986};
3762 3987
3763static int __init vmx_init(void) 3988static int __init vmx_init(void)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 633ccc7400a4..be451ee44249 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -37,11 +37,16 @@
37#include <linux/iommu.h> 37#include <linux/iommu.h>
38#include <linux/intel-iommu.h> 38#include <linux/intel-iommu.h>
39#include <linux/cpufreq.h> 39#include <linux/cpufreq.h>
40#include <trace/events/kvm.h>
41#undef TRACE_INCLUDE_FILE
42#define CREATE_TRACE_POINTS
43#include "trace.h"
40 44
41#include <asm/uaccess.h> 45#include <asm/uaccess.h>
42#include <asm/msr.h> 46#include <asm/msr.h>
43#include <asm/desc.h> 47#include <asm/desc.h>
44#include <asm/mtrr.h> 48#include <asm/mtrr.h>
49#include <asm/mce.h>
45 50
46#define MAX_IO_MSRS 256 51#define MAX_IO_MSRS 256
47#define CR0_RESERVED_BITS \ 52#define CR0_RESERVED_BITS \
@@ -55,6 +60,10 @@
55 | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE)) 60 | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE))
56 61
57#define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR) 62#define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
63
64#define KVM_MAX_MCE_BANKS 32
65#define KVM_MCE_CAP_SUPPORTED MCG_CTL_P
66
58/* EFER defaults: 67/* EFER defaults:
59 * - enable syscall per default because its emulated by KVM 68 * - enable syscall per default because its emulated by KVM
60 * - enable LME and LMA per default on 64 bit KVM 69 * - enable LME and LMA per default on 64 bit KVM
@@ -68,14 +77,16 @@ static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffffeULL;
68#define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM 77#define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM
69#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU 78#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
70 79
80static void update_cr8_intercept(struct kvm_vcpu *vcpu);
71static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, 81static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
72 struct kvm_cpuid_entry2 __user *entries); 82 struct kvm_cpuid_entry2 __user *entries);
73struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
74 u32 function, u32 index);
75 83
76struct kvm_x86_ops *kvm_x86_ops; 84struct kvm_x86_ops *kvm_x86_ops;
77EXPORT_SYMBOL_GPL(kvm_x86_ops); 85EXPORT_SYMBOL_GPL(kvm_x86_ops);
78 86
87int ignore_msrs = 0;
88module_param_named(ignore_msrs, ignore_msrs, bool, S_IRUGO | S_IWUSR);
89
79struct kvm_stats_debugfs_item debugfs_entries[] = { 90struct kvm_stats_debugfs_item debugfs_entries[] = {
80 { "pf_fixed", VCPU_STAT(pf_fixed) }, 91 { "pf_fixed", VCPU_STAT(pf_fixed) },
81 { "pf_guest", VCPU_STAT(pf_guest) }, 92 { "pf_guest", VCPU_STAT(pf_guest) },
@@ -122,18 +133,16 @@ unsigned long segment_base(u16 selector)
122 if (selector == 0) 133 if (selector == 0)
123 return 0; 134 return 0;
124 135
125 asm("sgdt %0" : "=m"(gdt)); 136 kvm_get_gdt(&gdt);
126 table_base = gdt.base; 137 table_base = gdt.base;
127 138
128 if (selector & 4) { /* from ldt */ 139 if (selector & 4) { /* from ldt */
129 u16 ldt_selector; 140 u16 ldt_selector = kvm_read_ldt();
130 141
131 asm("sldt %0" : "=g"(ldt_selector));
132 table_base = segment_base(ldt_selector); 142 table_base = segment_base(ldt_selector);
133 } 143 }
134 d = (struct desc_struct *)(table_base + (selector & ~7)); 144 d = (struct desc_struct *)(table_base + (selector & ~7));
135 v = d->base0 | ((unsigned long)d->base1 << 16) | 145 v = get_desc_base(d);
136 ((unsigned long)d->base2 << 24);
137#ifdef CONFIG_X86_64 146#ifdef CONFIG_X86_64
138 if (d->s == 0 && (d->type == 2 || d->type == 9 || d->type == 11)) 147 if (d->s == 0 && (d->type == 2 || d->type == 9 || d->type == 11))
139 v |= ((unsigned long)((struct ldttss_desc64 *)d)->base3) << 32; 148 v |= ((unsigned long)((struct ldttss_desc64 *)d)->base3) << 32;
@@ -176,16 +185,22 @@ void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr,
176 ++vcpu->stat.pf_guest; 185 ++vcpu->stat.pf_guest;
177 186
178 if (vcpu->arch.exception.pending) { 187 if (vcpu->arch.exception.pending) {
179 if (vcpu->arch.exception.nr == PF_VECTOR) { 188 switch(vcpu->arch.exception.nr) {
180 printk(KERN_DEBUG "kvm: inject_page_fault:" 189 case DF_VECTOR:
181 " double fault 0x%lx\n", addr);
182 vcpu->arch.exception.nr = DF_VECTOR;
183 vcpu->arch.exception.error_code = 0;
184 } else if (vcpu->arch.exception.nr == DF_VECTOR) {
185 /* triple fault -> shutdown */ 190 /* triple fault -> shutdown */
186 set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests); 191 set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
192 return;
193 case PF_VECTOR:
194 vcpu->arch.exception.nr = DF_VECTOR;
195 vcpu->arch.exception.error_code = 0;
196 return;
197 default:
198 /* replace previous exception with a new one in a hope
199 that instruction re-execution will regenerate lost
200 exception */
201 vcpu->arch.exception.pending = false;
202 break;
187 } 203 }
188 return;
189 } 204 }
190 vcpu->arch.cr2 = addr; 205 vcpu->arch.cr2 = addr;
191 kvm_queue_exception_e(vcpu, PF_VECTOR, error_code); 206 kvm_queue_exception_e(vcpu, PF_VECTOR, error_code);
@@ -207,12 +222,18 @@ void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
207} 222}
208EXPORT_SYMBOL_GPL(kvm_queue_exception_e); 223EXPORT_SYMBOL_GPL(kvm_queue_exception_e);
209 224
210static void __queue_exception(struct kvm_vcpu *vcpu) 225/*
226 * Checks if cpl <= required_cpl; if true, return true. Otherwise queue
227 * a #GP and return false.
228 */
229bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl)
211{ 230{
212 kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr, 231 if (kvm_x86_ops->get_cpl(vcpu) <= required_cpl)
213 vcpu->arch.exception.has_error_code, 232 return true;
214 vcpu->arch.exception.error_code); 233 kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
234 return false;
215} 235}
236EXPORT_SYMBOL_GPL(kvm_require_cpl);
216 237
217/* 238/*
218 * Load the pae pdptrs. Return true is they are all valid. 239 * Load the pae pdptrs. Return true is they are all valid.
@@ -232,7 +253,7 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
232 goto out; 253 goto out;
233 } 254 }
234 for (i = 0; i < ARRAY_SIZE(pdpte); ++i) { 255 for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
235 if (is_present_pte(pdpte[i]) && 256 if (is_present_gpte(pdpte[i]) &&
236 (pdpte[i] & vcpu->arch.mmu.rsvd_bits_mask[0][2])) { 257 (pdpte[i] & vcpu->arch.mmu.rsvd_bits_mask[0][2])) {
237 ret = 0; 258 ret = 0;
238 goto out; 259 goto out;
@@ -241,6 +262,10 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
241 ret = 1; 262 ret = 1;
242 263
243 memcpy(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs)); 264 memcpy(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs));
265 __set_bit(VCPU_EXREG_PDPTR,
266 (unsigned long *)&vcpu->arch.regs_avail);
267 __set_bit(VCPU_EXREG_PDPTR,
268 (unsigned long *)&vcpu->arch.regs_dirty);
244out: 269out:
245 270
246 return ret; 271 return ret;
@@ -256,6 +281,10 @@ static bool pdptrs_changed(struct kvm_vcpu *vcpu)
256 if (is_long_mode(vcpu) || !is_pae(vcpu)) 281 if (is_long_mode(vcpu) || !is_pae(vcpu))
257 return false; 282 return false;
258 283
284 if (!test_bit(VCPU_EXREG_PDPTR,
285 (unsigned long *)&vcpu->arch.regs_avail))
286 return true;
287
259 r = kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u, pdpte, sizeof(pdpte)); 288 r = kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u, pdpte, sizeof(pdpte));
260 if (r < 0) 289 if (r < 0)
261 goto out; 290 goto out;
@@ -328,9 +357,6 @@ EXPORT_SYMBOL_GPL(kvm_set_cr0);
328void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw) 357void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
329{ 358{
330 kvm_set_cr0(vcpu, (vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f)); 359 kvm_set_cr0(vcpu, (vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f));
331 KVMTRACE_1D(LMSW, vcpu,
332 (u32)((vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f)),
333 handler);
334} 360}
335EXPORT_SYMBOL_GPL(kvm_lmsw); 361EXPORT_SYMBOL_GPL(kvm_lmsw);
336 362
@@ -466,7 +492,7 @@ static u32 msrs_to_save[] = {
466#ifdef CONFIG_X86_64 492#ifdef CONFIG_X86_64
467 MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, 493 MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
468#endif 494#endif
469 MSR_IA32_TIME_STAMP_COUNTER, MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, 495 MSR_IA32_TSC, MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
470 MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA 496 MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA
471}; 497};
472 498
@@ -644,8 +670,7 @@ static void kvm_write_guest_time(struct kvm_vcpu *v)
644 670
645 /* Keep irq disabled to prevent changes to the clock */ 671 /* Keep irq disabled to prevent changes to the clock */
646 local_irq_save(flags); 672 local_irq_save(flags);
647 kvm_get_msr(v, MSR_IA32_TIME_STAMP_COUNTER, 673 kvm_get_msr(v, MSR_IA32_TSC, &vcpu->hv_clock.tsc_timestamp);
648 &vcpu->hv_clock.tsc_timestamp);
649 ktime_get_ts(&ts); 674 ktime_get_ts(&ts);
650 local_irq_restore(flags); 675 local_irq_restore(flags);
651 676
@@ -778,23 +803,60 @@ static int set_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
778 return 0; 803 return 0;
779} 804}
780 805
806static int set_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 data)
807{
808 u64 mcg_cap = vcpu->arch.mcg_cap;
809 unsigned bank_num = mcg_cap & 0xff;
810
811 switch (msr) {
812 case MSR_IA32_MCG_STATUS:
813 vcpu->arch.mcg_status = data;
814 break;
815 case MSR_IA32_MCG_CTL:
816 if (!(mcg_cap & MCG_CTL_P))
817 return 1;
818 if (data != 0 && data != ~(u64)0)
819 return -1;
820 vcpu->arch.mcg_ctl = data;
821 break;
822 default:
823 if (msr >= MSR_IA32_MC0_CTL &&
824 msr < MSR_IA32_MC0_CTL + 4 * bank_num) {
825 u32 offset = msr - MSR_IA32_MC0_CTL;
826 /* only 0 or all 1s can be written to IA32_MCi_CTL */
827 if ((offset & 0x3) == 0 &&
828 data != 0 && data != ~(u64)0)
829 return -1;
830 vcpu->arch.mce_banks[offset] = data;
831 break;
832 }
833 return 1;
834 }
835 return 0;
836}
837
781int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) 838int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
782{ 839{
783 switch (msr) { 840 switch (msr) {
784 case MSR_EFER: 841 case MSR_EFER:
785 set_efer(vcpu, data); 842 set_efer(vcpu, data);
786 break; 843 break;
787 case MSR_IA32_MC0_STATUS: 844 case MSR_K7_HWCR:
788 pr_unimpl(vcpu, "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n", 845 data &= ~(u64)0x40; /* ignore flush filter disable */
789 __func__, data); 846 if (data != 0) {
847 pr_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n",
848 data);
849 return 1;
850 }
790 break; 851 break;
791 case MSR_IA32_MCG_STATUS: 852 case MSR_FAM10H_MMIO_CONF_BASE:
792 pr_unimpl(vcpu, "%s: MSR_IA32_MCG_STATUS 0x%llx, nop\n", 853 if (data != 0) {
793 __func__, data); 854 pr_unimpl(vcpu, "unimplemented MMIO_CONF_BASE wrmsr: "
855 "0x%llx\n", data);
856 return 1;
857 }
794 break; 858 break;
795 case MSR_IA32_MCG_CTL: 859 case MSR_AMD64_NB_CFG:
796 pr_unimpl(vcpu, "%s: MSR_IA32_MCG_CTL 0x%llx, nop\n",
797 __func__, data);
798 break; 860 break;
799 case MSR_IA32_DEBUGCTLMSR: 861 case MSR_IA32_DEBUGCTLMSR:
800 if (!data) { 862 if (!data) {
@@ -811,12 +873,15 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
811 case MSR_IA32_UCODE_REV: 873 case MSR_IA32_UCODE_REV:
812 case MSR_IA32_UCODE_WRITE: 874 case MSR_IA32_UCODE_WRITE:
813 case MSR_VM_HSAVE_PA: 875 case MSR_VM_HSAVE_PA:
876 case MSR_AMD64_PATCH_LOADER:
814 break; 877 break;
815 case 0x200 ... 0x2ff: 878 case 0x200 ... 0x2ff:
816 return set_msr_mtrr(vcpu, msr, data); 879 return set_msr_mtrr(vcpu, msr, data);
817 case MSR_IA32_APICBASE: 880 case MSR_IA32_APICBASE:
818 kvm_set_apic_base(vcpu, data); 881 kvm_set_apic_base(vcpu, data);
819 break; 882 break;
883 case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff:
884 return kvm_x2apic_msr_write(vcpu, msr, data);
820 case MSR_IA32_MISC_ENABLE: 885 case MSR_IA32_MISC_ENABLE:
821 vcpu->arch.ia32_misc_enable_msr = data; 886 vcpu->arch.ia32_misc_enable_msr = data;
822 break; 887 break;
@@ -850,9 +915,50 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
850 kvm_request_guest_time_update(vcpu); 915 kvm_request_guest_time_update(vcpu);
851 break; 916 break;
852 } 917 }
918 case MSR_IA32_MCG_CTL:
919 case MSR_IA32_MCG_STATUS:
920 case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1:
921 return set_msr_mce(vcpu, msr, data);
922
923 /* Performance counters are not protected by a CPUID bit,
924 * so we should check all of them in the generic path for the sake of
925 * cross vendor migration.
926 * Writing a zero into the event select MSRs disables them,
927 * which we perfectly emulate ;-). Any other value should be at least
928 * reported, some guests depend on them.
929 */
930 case MSR_P6_EVNTSEL0:
931 case MSR_P6_EVNTSEL1:
932 case MSR_K7_EVNTSEL0:
933 case MSR_K7_EVNTSEL1:
934 case MSR_K7_EVNTSEL2:
935 case MSR_K7_EVNTSEL3:
936 if (data != 0)
937 pr_unimpl(vcpu, "unimplemented perfctr wrmsr: "
938 "0x%x data 0x%llx\n", msr, data);
939 break;
940 /* at least RHEL 4 unconditionally writes to the perfctr registers,
941 * so we ignore writes to make it happy.
942 */
943 case MSR_P6_PERFCTR0:
944 case MSR_P6_PERFCTR1:
945 case MSR_K7_PERFCTR0:
946 case MSR_K7_PERFCTR1:
947 case MSR_K7_PERFCTR2:
948 case MSR_K7_PERFCTR3:
949 pr_unimpl(vcpu, "unimplemented perfctr wrmsr: "
950 "0x%x data 0x%llx\n", msr, data);
951 break;
853 default: 952 default:
854 pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", msr, data); 953 if (!ignore_msrs) {
855 return 1; 954 pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n",
955 msr, data);
956 return 1;
957 } else {
958 pr_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n",
959 msr, data);
960 break;
961 }
856 } 962 }
857 return 0; 963 return 0;
858} 964}
@@ -905,26 +1011,47 @@ static int get_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
905 return 0; 1011 return 0;
906} 1012}
907 1013
908int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) 1014static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
909{ 1015{
910 u64 data; 1016 u64 data;
1017 u64 mcg_cap = vcpu->arch.mcg_cap;
1018 unsigned bank_num = mcg_cap & 0xff;
911 1019
912 switch (msr) { 1020 switch (msr) {
913 case 0xc0010010: /* SYSCFG */
914 case 0xc0010015: /* HWCR */
915 case MSR_IA32_PLATFORM_ID:
916 case MSR_IA32_P5_MC_ADDR: 1021 case MSR_IA32_P5_MC_ADDR:
917 case MSR_IA32_P5_MC_TYPE: 1022 case MSR_IA32_P5_MC_TYPE:
918 case MSR_IA32_MC0_CTL: 1023 data = 0;
919 case MSR_IA32_MCG_STATUS: 1024 break;
920 case MSR_IA32_MCG_CAP: 1025 case MSR_IA32_MCG_CAP:
1026 data = vcpu->arch.mcg_cap;
1027 break;
921 case MSR_IA32_MCG_CTL: 1028 case MSR_IA32_MCG_CTL:
922 case MSR_IA32_MC0_MISC: 1029 if (!(mcg_cap & MCG_CTL_P))
923 case MSR_IA32_MC0_MISC+4: 1030 return 1;
924 case MSR_IA32_MC0_MISC+8: 1031 data = vcpu->arch.mcg_ctl;
925 case MSR_IA32_MC0_MISC+12: 1032 break;
926 case MSR_IA32_MC0_MISC+16: 1033 case MSR_IA32_MCG_STATUS:
927 case MSR_IA32_MC0_MISC+20: 1034 data = vcpu->arch.mcg_status;
1035 break;
1036 default:
1037 if (msr >= MSR_IA32_MC0_CTL &&
1038 msr < MSR_IA32_MC0_CTL + 4 * bank_num) {
1039 u32 offset = msr - MSR_IA32_MC0_CTL;
1040 data = vcpu->arch.mce_banks[offset];
1041 break;
1042 }
1043 return 1;
1044 }
1045 *pdata = data;
1046 return 0;
1047}
1048
1049int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1050{
1051 u64 data;
1052
1053 switch (msr) {
1054 case MSR_IA32_PLATFORM_ID:
928 case MSR_IA32_UCODE_REV: 1055 case MSR_IA32_UCODE_REV:
929 case MSR_IA32_EBL_CR_POWERON: 1056 case MSR_IA32_EBL_CR_POWERON:
930 case MSR_IA32_DEBUGCTLMSR: 1057 case MSR_IA32_DEBUGCTLMSR:
@@ -932,10 +1059,18 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
932 case MSR_IA32_LASTBRANCHTOIP: 1059 case MSR_IA32_LASTBRANCHTOIP:
933 case MSR_IA32_LASTINTFROMIP: 1060 case MSR_IA32_LASTINTFROMIP:
934 case MSR_IA32_LASTINTTOIP: 1061 case MSR_IA32_LASTINTTOIP:
1062 case MSR_K8_SYSCFG:
1063 case MSR_K7_HWCR:
935 case MSR_VM_HSAVE_PA: 1064 case MSR_VM_HSAVE_PA:
1065 case MSR_P6_PERFCTR0:
1066 case MSR_P6_PERFCTR1:
936 case MSR_P6_EVNTSEL0: 1067 case MSR_P6_EVNTSEL0:
937 case MSR_P6_EVNTSEL1: 1068 case MSR_P6_EVNTSEL1:
938 case MSR_K7_EVNTSEL0: 1069 case MSR_K7_EVNTSEL0:
1070 case MSR_K7_PERFCTR0:
1071 case MSR_K8_INT_PENDING_MSG:
1072 case MSR_AMD64_NB_CFG:
1073 case MSR_FAM10H_MMIO_CONF_BASE:
939 data = 0; 1074 data = 0;
940 break; 1075 break;
941 case MSR_MTRRcap: 1076 case MSR_MTRRcap:
@@ -949,6 +1084,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
949 case MSR_IA32_APICBASE: 1084 case MSR_IA32_APICBASE:
950 data = kvm_get_apic_base(vcpu); 1085 data = kvm_get_apic_base(vcpu);
951 break; 1086 break;
1087 case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff:
1088 return kvm_x2apic_msr_read(vcpu, msr, pdata);
1089 break;
952 case MSR_IA32_MISC_ENABLE: 1090 case MSR_IA32_MISC_ENABLE:
953 data = vcpu->arch.ia32_misc_enable_msr; 1091 data = vcpu->arch.ia32_misc_enable_msr;
954 break; 1092 break;
@@ -967,9 +1105,22 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
967 case MSR_KVM_SYSTEM_TIME: 1105 case MSR_KVM_SYSTEM_TIME:
968 data = vcpu->arch.time; 1106 data = vcpu->arch.time;
969 break; 1107 break;
1108 case MSR_IA32_P5_MC_ADDR:
1109 case MSR_IA32_P5_MC_TYPE:
1110 case MSR_IA32_MCG_CAP:
1111 case MSR_IA32_MCG_CTL:
1112 case MSR_IA32_MCG_STATUS:
1113 case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1:
1114 return get_msr_mce(vcpu, msr, pdata);
970 default: 1115 default:
971 pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr); 1116 if (!ignore_msrs) {
972 return 1; 1117 pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
1118 return 1;
1119 } else {
1120 pr_unimpl(vcpu, "ignored rdmsr: 0x%x\n", msr);
1121 data = 0;
1122 }
1123 break;
973 } 1124 }
974 *pdata = data; 1125 *pdata = data;
975 return 0; 1126 return 0;
@@ -1068,6 +1219,11 @@ int kvm_dev_ioctl_check_extension(long ext)
1068 case KVM_CAP_REINJECT_CONTROL: 1219 case KVM_CAP_REINJECT_CONTROL:
1069 case KVM_CAP_IRQ_INJECT_STATUS: 1220 case KVM_CAP_IRQ_INJECT_STATUS:
1070 case KVM_CAP_ASSIGN_DEV_IRQ: 1221 case KVM_CAP_ASSIGN_DEV_IRQ:
1222 case KVM_CAP_IRQFD:
1223 case KVM_CAP_IOEVENTFD:
1224 case KVM_CAP_PIT2:
1225 case KVM_CAP_PIT_STATE2:
1226 case KVM_CAP_SET_IDENTITY_MAP_ADDR:
1071 r = 1; 1227 r = 1;
1072 break; 1228 break;
1073 case KVM_CAP_COALESCED_MMIO: 1229 case KVM_CAP_COALESCED_MMIO:
@@ -1088,6 +1244,9 @@ int kvm_dev_ioctl_check_extension(long ext)
1088 case KVM_CAP_IOMMU: 1244 case KVM_CAP_IOMMU:
1089 r = iommu_found(); 1245 r = iommu_found();
1090 break; 1246 break;
1247 case KVM_CAP_MCE:
1248 r = KVM_MAX_MCE_BANKS;
1249 break;
1091 default: 1250 default:
1092 r = 0; 1251 r = 0;
1093 break; 1252 break;
@@ -1147,6 +1306,16 @@ long kvm_arch_dev_ioctl(struct file *filp,
1147 r = 0; 1306 r = 0;
1148 break; 1307 break;
1149 } 1308 }
1309 case KVM_X86_GET_MCE_CAP_SUPPORTED: {
1310 u64 mce_cap;
1311
1312 mce_cap = KVM_MCE_CAP_SUPPORTED;
1313 r = -EFAULT;
1314 if (copy_to_user(argp, &mce_cap, sizeof mce_cap))
1315 goto out;
1316 r = 0;
1317 break;
1318 }
1150 default: 1319 default:
1151 r = -EINVAL; 1320 r = -EINVAL;
1152 } 1321 }
@@ -1227,6 +1396,7 @@ static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
1227 vcpu->arch.cpuid_nent = cpuid->nent; 1396 vcpu->arch.cpuid_nent = cpuid->nent;
1228 cpuid_fix_nx_cap(vcpu); 1397 cpuid_fix_nx_cap(vcpu);
1229 r = 0; 1398 r = 0;
1399 kvm_apic_set_version(vcpu);
1230 1400
1231out_free: 1401out_free:
1232 vfree(cpuid_entries); 1402 vfree(cpuid_entries);
@@ -1248,6 +1418,7 @@ static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
1248 cpuid->nent * sizeof(struct kvm_cpuid_entry2))) 1418 cpuid->nent * sizeof(struct kvm_cpuid_entry2)))
1249 goto out; 1419 goto out;
1250 vcpu->arch.cpuid_nent = cpuid->nent; 1420 vcpu->arch.cpuid_nent = cpuid->nent;
1421 kvm_apic_set_version(vcpu);
1251 return 0; 1422 return 0;
1252 1423
1253out: 1424out:
@@ -1290,6 +1461,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
1290 u32 index, int *nent, int maxnent) 1461 u32 index, int *nent, int maxnent)
1291{ 1462{
1292 unsigned f_nx = is_efer_nx() ? F(NX) : 0; 1463 unsigned f_nx = is_efer_nx() ? F(NX) : 0;
1464 unsigned f_gbpages = kvm_x86_ops->gb_page_enable() ? F(GBPAGES) : 0;
1293#ifdef CONFIG_X86_64 1465#ifdef CONFIG_X86_64
1294 unsigned f_lm = F(LM); 1466 unsigned f_lm = F(LM);
1295#else 1467#else
@@ -1314,7 +1486,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
1314 F(MTRR) | F(PGE) | F(MCA) | F(CMOV) | 1486 F(MTRR) | F(PGE) | F(MCA) | F(CMOV) |
1315 F(PAT) | F(PSE36) | 0 /* Reserved */ | 1487 F(PAT) | F(PSE36) | 0 /* Reserved */ |
1316 f_nx | 0 /* Reserved */ | F(MMXEXT) | F(MMX) | 1488 f_nx | 0 /* Reserved */ | F(MMXEXT) | F(MMX) |
1317 F(FXSR) | F(FXSR_OPT) | 0 /* GBPAGES */ | 0 /* RDTSCP */ | 1489 F(FXSR) | F(FXSR_OPT) | f_gbpages | 0 /* RDTSCP */ |
1318 0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW); 1490 0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW);
1319 /* cpuid 1.ecx */ 1491 /* cpuid 1.ecx */
1320 const u32 kvm_supported_word4_x86_features = 1492 const u32 kvm_supported_word4_x86_features =
@@ -1323,7 +1495,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
1323 0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ | 1495 0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ |
1324 0 /* Reserved */ | F(CX16) | 0 /* xTPR Update, PDCM */ | 1496 0 /* Reserved */ | F(CX16) | 0 /* xTPR Update, PDCM */ |
1325 0 /* Reserved, DCA */ | F(XMM4_1) | 1497 0 /* Reserved, DCA */ | F(XMM4_1) |
1326 F(XMM4_2) | 0 /* x2APIC */ | F(MOVBE) | F(POPCNT) | 1498 F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) |
1327 0 /* Reserved, XSAVE, OSXSAVE */; 1499 0 /* Reserved, XSAVE, OSXSAVE */;
1328 /* cpuid 0x80000001.ecx */ 1500 /* cpuid 0x80000001.ecx */
1329 const u32 kvm_supported_word6_x86_features = 1501 const u32 kvm_supported_word6_x86_features =
@@ -1344,6 +1516,9 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
1344 case 1: 1516 case 1:
1345 entry->edx &= kvm_supported_word0_x86_features; 1517 entry->edx &= kvm_supported_word0_x86_features;
1346 entry->ecx &= kvm_supported_word4_x86_features; 1518 entry->ecx &= kvm_supported_word4_x86_features;
1519 /* we support x2apic emulation even if host does not support
1520 * it since we emulate x2apic in software */
1521 entry->ecx |= F(X2APIC);
1347 break; 1522 break;
1348 /* function 2 entries are STATEFUL. That is, repeated cpuid commands 1523 /* function 2 entries are STATEFUL. That is, repeated cpuid commands
1349 * may return different values. This forces us to get_cpu() before 1524 * may return different values. This forces us to get_cpu() before
@@ -1435,6 +1610,10 @@ static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
1435 for (func = 0x80000001; func <= limit && nent < cpuid->nent; ++func) 1610 for (func = 0x80000001; func <= limit && nent < cpuid->nent; ++func)
1436 do_cpuid_ent(&cpuid_entries[nent], func, 0, 1611 do_cpuid_ent(&cpuid_entries[nent], func, 0,
1437 &nent, cpuid->nent); 1612 &nent, cpuid->nent);
1613 r = -E2BIG;
1614 if (nent >= cpuid->nent)
1615 goto out_free;
1616
1438 r = -EFAULT; 1617 r = -EFAULT;
1439 if (copy_to_user(entries, cpuid_entries, 1618 if (copy_to_user(entries, cpuid_entries,
1440 nent * sizeof(struct kvm_cpuid_entry2))) 1619 nent * sizeof(struct kvm_cpuid_entry2)))
@@ -1464,6 +1643,7 @@ static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
1464 vcpu_load(vcpu); 1643 vcpu_load(vcpu);
1465 memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s); 1644 memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s);
1466 kvm_apic_post_state_restore(vcpu); 1645 kvm_apic_post_state_restore(vcpu);
1646 update_cr8_intercept(vcpu);
1467 vcpu_put(vcpu); 1647 vcpu_put(vcpu);
1468 1648
1469 return 0; 1649 return 0;
@@ -1503,6 +1683,80 @@ static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu,
1503 return 0; 1683 return 0;
1504} 1684}
1505 1685
1686static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
1687 u64 mcg_cap)
1688{
1689 int r;
1690 unsigned bank_num = mcg_cap & 0xff, bank;
1691
1692 r = -EINVAL;
1693 if (!bank_num)
1694 goto out;
1695 if (mcg_cap & ~(KVM_MCE_CAP_SUPPORTED | 0xff | 0xff0000))
1696 goto out;
1697 r = 0;
1698 vcpu->arch.mcg_cap = mcg_cap;
1699 /* Init IA32_MCG_CTL to all 1s */
1700 if (mcg_cap & MCG_CTL_P)
1701 vcpu->arch.mcg_ctl = ~(u64)0;
1702 /* Init IA32_MCi_CTL to all 1s */
1703 for (bank = 0; bank < bank_num; bank++)
1704 vcpu->arch.mce_banks[bank*4] = ~(u64)0;
1705out:
1706 return r;
1707}
1708
1709static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
1710 struct kvm_x86_mce *mce)
1711{
1712 u64 mcg_cap = vcpu->arch.mcg_cap;
1713 unsigned bank_num = mcg_cap & 0xff;
1714 u64 *banks = vcpu->arch.mce_banks;
1715
1716 if (mce->bank >= bank_num || !(mce->status & MCI_STATUS_VAL))
1717 return -EINVAL;
1718 /*
1719 * if IA32_MCG_CTL is not all 1s, the uncorrected error
1720 * reporting is disabled
1721 */
1722 if ((mce->status & MCI_STATUS_UC) && (mcg_cap & MCG_CTL_P) &&
1723 vcpu->arch.mcg_ctl != ~(u64)0)
1724 return 0;
1725 banks += 4 * mce->bank;
1726 /*
1727 * if IA32_MCi_CTL is not all 1s, the uncorrected error
1728 * reporting is disabled for the bank
1729 */
1730 if ((mce->status & MCI_STATUS_UC) && banks[0] != ~(u64)0)
1731 return 0;
1732 if (mce->status & MCI_STATUS_UC) {
1733 if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) ||
1734 !(vcpu->arch.cr4 & X86_CR4_MCE)) {
1735 printk(KERN_DEBUG "kvm: set_mce: "
1736 "injects mce exception while "
1737 "previous one is in progress!\n");
1738 set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
1739 return 0;
1740 }
1741 if (banks[1] & MCI_STATUS_VAL)
1742 mce->status |= MCI_STATUS_OVER;
1743 banks[2] = mce->addr;
1744 banks[3] = mce->misc;
1745 vcpu->arch.mcg_status = mce->mcg_status;
1746 banks[1] = mce->status;
1747 kvm_queue_exception(vcpu, MC_VECTOR);
1748 } else if (!(banks[1] & MCI_STATUS_VAL)
1749 || !(banks[1] & MCI_STATUS_UC)) {
1750 if (banks[1] & MCI_STATUS_VAL)
1751 mce->status |= MCI_STATUS_OVER;
1752 banks[2] = mce->addr;
1753 banks[3] = mce->misc;
1754 banks[1] = mce->status;
1755 } else
1756 banks[1] |= MCI_STATUS_OVER;
1757 return 0;
1758}
1759
1506long kvm_arch_vcpu_ioctl(struct file *filp, 1760long kvm_arch_vcpu_ioctl(struct file *filp,
1507 unsigned int ioctl, unsigned long arg) 1761 unsigned int ioctl, unsigned long arg)
1508{ 1762{
@@ -1636,6 +1890,24 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
1636 kvm_lapic_set_vapic_addr(vcpu, va.vapic_addr); 1890 kvm_lapic_set_vapic_addr(vcpu, va.vapic_addr);
1637 break; 1891 break;
1638 } 1892 }
1893 case KVM_X86_SETUP_MCE: {
1894 u64 mcg_cap;
1895
1896 r = -EFAULT;
1897 if (copy_from_user(&mcg_cap, argp, sizeof mcg_cap))
1898 goto out;
1899 r = kvm_vcpu_ioctl_x86_setup_mce(vcpu, mcg_cap);
1900 break;
1901 }
1902 case KVM_X86_SET_MCE: {
1903 struct kvm_x86_mce mce;
1904
1905 r = -EFAULT;
1906 if (copy_from_user(&mce, argp, sizeof mce))
1907 goto out;
1908 r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce);
1909 break;
1910 }
1639 default: 1911 default:
1640 r = -EINVAL; 1912 r = -EINVAL;
1641 } 1913 }
@@ -1654,6 +1926,13 @@ static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr)
1654 return ret; 1926 return ret;
1655} 1927}
1656 1928
1929static int kvm_vm_ioctl_set_identity_map_addr(struct kvm *kvm,
1930 u64 ident_addr)
1931{
1932 kvm->arch.ept_identity_map_addr = ident_addr;
1933 return 0;
1934}
1935
1657static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm, 1936static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
1658 u32 kvm_nr_mmu_pages) 1937 u32 kvm_nr_mmu_pages)
1659{ 1938{
@@ -1775,19 +2054,25 @@ static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
1775 r = 0; 2054 r = 0;
1776 switch (chip->chip_id) { 2055 switch (chip->chip_id) {
1777 case KVM_IRQCHIP_PIC_MASTER: 2056 case KVM_IRQCHIP_PIC_MASTER:
2057 spin_lock(&pic_irqchip(kvm)->lock);
1778 memcpy(&pic_irqchip(kvm)->pics[0], 2058 memcpy(&pic_irqchip(kvm)->pics[0],
1779 &chip->chip.pic, 2059 &chip->chip.pic,
1780 sizeof(struct kvm_pic_state)); 2060 sizeof(struct kvm_pic_state));
2061 spin_unlock(&pic_irqchip(kvm)->lock);
1781 break; 2062 break;
1782 case KVM_IRQCHIP_PIC_SLAVE: 2063 case KVM_IRQCHIP_PIC_SLAVE:
2064 spin_lock(&pic_irqchip(kvm)->lock);
1783 memcpy(&pic_irqchip(kvm)->pics[1], 2065 memcpy(&pic_irqchip(kvm)->pics[1],
1784 &chip->chip.pic, 2066 &chip->chip.pic,
1785 sizeof(struct kvm_pic_state)); 2067 sizeof(struct kvm_pic_state));
2068 spin_unlock(&pic_irqchip(kvm)->lock);
1786 break; 2069 break;
1787 case KVM_IRQCHIP_IOAPIC: 2070 case KVM_IRQCHIP_IOAPIC:
2071 mutex_lock(&kvm->irq_lock);
1788 memcpy(ioapic_irqchip(kvm), 2072 memcpy(ioapic_irqchip(kvm),
1789 &chip->chip.ioapic, 2073 &chip->chip.ioapic,
1790 sizeof(struct kvm_ioapic_state)); 2074 sizeof(struct kvm_ioapic_state));
2075 mutex_unlock(&kvm->irq_lock);
1791 break; 2076 break;
1792 default: 2077 default:
1793 r = -EINVAL; 2078 r = -EINVAL;
@@ -1801,7 +2086,9 @@ static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps)
1801{ 2086{
1802 int r = 0; 2087 int r = 0;
1803 2088
2089 mutex_lock(&kvm->arch.vpit->pit_state.lock);
1804 memcpy(ps, &kvm->arch.vpit->pit_state, sizeof(struct kvm_pit_state)); 2090 memcpy(ps, &kvm->arch.vpit->pit_state, sizeof(struct kvm_pit_state));
2091 mutex_unlock(&kvm->arch.vpit->pit_state.lock);
1805 return r; 2092 return r;
1806} 2093}
1807 2094
@@ -1809,8 +2096,39 @@ static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps)
1809{ 2096{
1810 int r = 0; 2097 int r = 0;
1811 2098
2099 mutex_lock(&kvm->arch.vpit->pit_state.lock);
1812 memcpy(&kvm->arch.vpit->pit_state, ps, sizeof(struct kvm_pit_state)); 2100 memcpy(&kvm->arch.vpit->pit_state, ps, sizeof(struct kvm_pit_state));
1813 kvm_pit_load_count(kvm, 0, ps->channels[0].count); 2101 kvm_pit_load_count(kvm, 0, ps->channels[0].count, 0);
2102 mutex_unlock(&kvm->arch.vpit->pit_state.lock);
2103 return r;
2104}
2105
2106static int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
2107{
2108 int r = 0;
2109
2110 mutex_lock(&kvm->arch.vpit->pit_state.lock);
2111 memcpy(ps->channels, &kvm->arch.vpit->pit_state.channels,
2112 sizeof(ps->channels));
2113 ps->flags = kvm->arch.vpit->pit_state.flags;
2114 mutex_unlock(&kvm->arch.vpit->pit_state.lock);
2115 return r;
2116}
2117
2118static int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
2119{
2120 int r = 0, start = 0;
2121 u32 prev_legacy, cur_legacy;
2122 mutex_lock(&kvm->arch.vpit->pit_state.lock);
2123 prev_legacy = kvm->arch.vpit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY;
2124 cur_legacy = ps->flags & KVM_PIT_FLAGS_HPET_LEGACY;
2125 if (!prev_legacy && cur_legacy)
2126 start = 1;
2127 memcpy(&kvm->arch.vpit->pit_state.channels, &ps->channels,
2128 sizeof(kvm->arch.vpit->pit_state.channels));
2129 kvm->arch.vpit->pit_state.flags = ps->flags;
2130 kvm_pit_load_count(kvm, 0, kvm->arch.vpit->pit_state.channels[0].count, start);
2131 mutex_unlock(&kvm->arch.vpit->pit_state.lock);
1814 return r; 2132 return r;
1815} 2133}
1816 2134
@@ -1819,7 +2137,9 @@ static int kvm_vm_ioctl_reinject(struct kvm *kvm,
1819{ 2137{
1820 if (!kvm->arch.vpit) 2138 if (!kvm->arch.vpit)
1821 return -ENXIO; 2139 return -ENXIO;
2140 mutex_lock(&kvm->arch.vpit->pit_state.lock);
1822 kvm->arch.vpit->pit_state.pit_timer.reinject = control->pit_reinject; 2141 kvm->arch.vpit->pit_state.pit_timer.reinject = control->pit_reinject;
2142 mutex_unlock(&kvm->arch.vpit->pit_state.lock);
1823 return 0; 2143 return 0;
1824} 2144}
1825 2145
@@ -1845,7 +2165,6 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
1845 spin_lock(&kvm->mmu_lock); 2165 spin_lock(&kvm->mmu_lock);
1846 kvm_mmu_slot_remove_write_access(kvm, log->slot); 2166 kvm_mmu_slot_remove_write_access(kvm, log->slot);
1847 spin_unlock(&kvm->mmu_lock); 2167 spin_unlock(&kvm->mmu_lock);
1848 kvm_flush_remote_tlbs(kvm);
1849 memslot = &kvm->memslots[log->slot]; 2168 memslot = &kvm->memslots[log->slot];
1850 n = ALIGN(memslot->npages, BITS_PER_LONG) / 8; 2169 n = ALIGN(memslot->npages, BITS_PER_LONG) / 8;
1851 memset(memslot->dirty_bitmap, 0, n); 2170 memset(memslot->dirty_bitmap, 0, n);
@@ -1869,7 +2188,9 @@ long kvm_arch_vm_ioctl(struct file *filp,
1869 */ 2188 */
1870 union { 2189 union {
1871 struct kvm_pit_state ps; 2190 struct kvm_pit_state ps;
2191 struct kvm_pit_state2 ps2;
1872 struct kvm_memory_alias alias; 2192 struct kvm_memory_alias alias;
2193 struct kvm_pit_config pit_config;
1873 } u; 2194 } u;
1874 2195
1875 switch (ioctl) { 2196 switch (ioctl) {
@@ -1878,6 +2199,17 @@ long kvm_arch_vm_ioctl(struct file *filp,
1878 if (r < 0) 2199 if (r < 0)
1879 goto out; 2200 goto out;
1880 break; 2201 break;
2202 case KVM_SET_IDENTITY_MAP_ADDR: {
2203 u64 ident_addr;
2204
2205 r = -EFAULT;
2206 if (copy_from_user(&ident_addr, argp, sizeof ident_addr))
2207 goto out;
2208 r = kvm_vm_ioctl_set_identity_map_addr(kvm, ident_addr);
2209 if (r < 0)
2210 goto out;
2211 break;
2212 }
1881 case KVM_SET_MEMORY_REGION: { 2213 case KVM_SET_MEMORY_REGION: {
1882 struct kvm_memory_region kvm_mem; 2214 struct kvm_memory_region kvm_mem;
1883 struct kvm_userspace_memory_region kvm_userspace_mem; 2215 struct kvm_userspace_memory_region kvm_userspace_mem;
@@ -1930,16 +2262,24 @@ long kvm_arch_vm_ioctl(struct file *filp,
1930 } 2262 }
1931 break; 2263 break;
1932 case KVM_CREATE_PIT: 2264 case KVM_CREATE_PIT:
1933 mutex_lock(&kvm->lock); 2265 u.pit_config.flags = KVM_PIT_SPEAKER_DUMMY;
2266 goto create_pit;
2267 case KVM_CREATE_PIT2:
2268 r = -EFAULT;
2269 if (copy_from_user(&u.pit_config, argp,
2270 sizeof(struct kvm_pit_config)))
2271 goto out;
2272 create_pit:
2273 down_write(&kvm->slots_lock);
1934 r = -EEXIST; 2274 r = -EEXIST;
1935 if (kvm->arch.vpit) 2275 if (kvm->arch.vpit)
1936 goto create_pit_unlock; 2276 goto create_pit_unlock;
1937 r = -ENOMEM; 2277 r = -ENOMEM;
1938 kvm->arch.vpit = kvm_create_pit(kvm); 2278 kvm->arch.vpit = kvm_create_pit(kvm, u.pit_config.flags);
1939 if (kvm->arch.vpit) 2279 if (kvm->arch.vpit)
1940 r = 0; 2280 r = 0;
1941 create_pit_unlock: 2281 create_pit_unlock:
1942 mutex_unlock(&kvm->lock); 2282 up_write(&kvm->slots_lock);
1943 break; 2283 break;
1944 case KVM_IRQ_LINE_STATUS: 2284 case KVM_IRQ_LINE_STATUS:
1945 case KVM_IRQ_LINE: { 2285 case KVM_IRQ_LINE: {
@@ -1950,10 +2290,10 @@ long kvm_arch_vm_ioctl(struct file *filp,
1950 goto out; 2290 goto out;
1951 if (irqchip_in_kernel(kvm)) { 2291 if (irqchip_in_kernel(kvm)) {
1952 __s32 status; 2292 __s32 status;
1953 mutex_lock(&kvm->lock); 2293 mutex_lock(&kvm->irq_lock);
1954 status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, 2294 status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
1955 irq_event.irq, irq_event.level); 2295 irq_event.irq, irq_event.level);
1956 mutex_unlock(&kvm->lock); 2296 mutex_unlock(&kvm->irq_lock);
1957 if (ioctl == KVM_IRQ_LINE_STATUS) { 2297 if (ioctl == KVM_IRQ_LINE_STATUS) {
1958 irq_event.status = status; 2298 irq_event.status = status;
1959 if (copy_to_user(argp, &irq_event, 2299 if (copy_to_user(argp, &irq_event,
@@ -2042,6 +2382,32 @@ long kvm_arch_vm_ioctl(struct file *filp,
2042 r = 0; 2382 r = 0;
2043 break; 2383 break;
2044 } 2384 }
2385 case KVM_GET_PIT2: {
2386 r = -ENXIO;
2387 if (!kvm->arch.vpit)
2388 goto out;
2389 r = kvm_vm_ioctl_get_pit2(kvm, &u.ps2);
2390 if (r)
2391 goto out;
2392 r = -EFAULT;
2393 if (copy_to_user(argp, &u.ps2, sizeof(u.ps2)))
2394 goto out;
2395 r = 0;
2396 break;
2397 }
2398 case KVM_SET_PIT2: {
2399 r = -EFAULT;
2400 if (copy_from_user(&u.ps2, argp, sizeof(u.ps2)))
2401 goto out;
2402 r = -ENXIO;
2403 if (!kvm->arch.vpit)
2404 goto out;
2405 r = kvm_vm_ioctl_set_pit2(kvm, &u.ps2);
2406 if (r)
2407 goto out;
2408 r = 0;
2409 break;
2410 }
2045 case KVM_REINJECT_CONTROL: { 2411 case KVM_REINJECT_CONTROL: {
2046 struct kvm_reinject_control control; 2412 struct kvm_reinject_control control;
2047 r = -EFAULT; 2413 r = -EFAULT;
@@ -2075,35 +2441,23 @@ static void kvm_init_msr_list(void)
2075 num_msrs_to_save = j; 2441 num_msrs_to_save = j;
2076} 2442}
2077 2443
2078/* 2444static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len,
2079 * Only apic need an MMIO device hook, so shortcut now.. 2445 const void *v)
2080 */
2081static struct kvm_io_device *vcpu_find_pervcpu_dev(struct kvm_vcpu *vcpu,
2082 gpa_t addr, int len,
2083 int is_write)
2084{ 2446{
2085 struct kvm_io_device *dev; 2447 if (vcpu->arch.apic &&
2448 !kvm_iodevice_write(&vcpu->arch.apic->dev, addr, len, v))
2449 return 0;
2086 2450
2087 if (vcpu->arch.apic) { 2451 return kvm_io_bus_write(&vcpu->kvm->mmio_bus, addr, len, v);
2088 dev = &vcpu->arch.apic->dev;
2089 if (dev->in_range(dev, addr, len, is_write))
2090 return dev;
2091 }
2092 return NULL;
2093} 2452}
2094 2453
2095 2454static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v)
2096static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu,
2097 gpa_t addr, int len,
2098 int is_write)
2099{ 2455{
2100 struct kvm_io_device *dev; 2456 if (vcpu->arch.apic &&
2457 !kvm_iodevice_read(&vcpu->arch.apic->dev, addr, len, v))
2458 return 0;
2101 2459
2102 dev = vcpu_find_pervcpu_dev(vcpu, addr, len, is_write); 2460 return kvm_io_bus_read(&vcpu->kvm->mmio_bus, addr, len, v);
2103 if (dev == NULL)
2104 dev = kvm_io_bus_find_dev(&vcpu->kvm->mmio_bus, addr, len,
2105 is_write);
2106 return dev;
2107} 2461}
2108 2462
2109static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes, 2463static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes,
@@ -2172,11 +2526,12 @@ static int emulator_read_emulated(unsigned long addr,
2172 unsigned int bytes, 2526 unsigned int bytes,
2173 struct kvm_vcpu *vcpu) 2527 struct kvm_vcpu *vcpu)
2174{ 2528{
2175 struct kvm_io_device *mmio_dev;
2176 gpa_t gpa; 2529 gpa_t gpa;
2177 2530
2178 if (vcpu->mmio_read_completed) { 2531 if (vcpu->mmio_read_completed) {
2179 memcpy(val, vcpu->mmio_data, bytes); 2532 memcpy(val, vcpu->mmio_data, bytes);
2533 trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes,
2534 vcpu->mmio_phys_addr, *(u64 *)val);
2180 vcpu->mmio_read_completed = 0; 2535 vcpu->mmio_read_completed = 0;
2181 return X86EMUL_CONTINUE; 2536 return X86EMUL_CONTINUE;
2182 } 2537 }
@@ -2197,14 +2552,12 @@ mmio:
2197 /* 2552 /*
2198 * Is this MMIO handled locally? 2553 * Is this MMIO handled locally?
2199 */ 2554 */
2200 mutex_lock(&vcpu->kvm->lock); 2555 if (!vcpu_mmio_read(vcpu, gpa, bytes, val)) {
2201 mmio_dev = vcpu_find_mmio_dev(vcpu, gpa, bytes, 0); 2556 trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes, gpa, *(u64 *)val);
2202 if (mmio_dev) {
2203 kvm_iodevice_read(mmio_dev, gpa, bytes, val);
2204 mutex_unlock(&vcpu->kvm->lock);
2205 return X86EMUL_CONTINUE; 2557 return X86EMUL_CONTINUE;
2206 } 2558 }
2207 mutex_unlock(&vcpu->kvm->lock); 2559
2560 trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, 0);
2208 2561
2209 vcpu->mmio_needed = 1; 2562 vcpu->mmio_needed = 1;
2210 vcpu->mmio_phys_addr = gpa; 2563 vcpu->mmio_phys_addr = gpa;
@@ -2231,7 +2584,6 @@ static int emulator_write_emulated_onepage(unsigned long addr,
2231 unsigned int bytes, 2584 unsigned int bytes,
2232 struct kvm_vcpu *vcpu) 2585 struct kvm_vcpu *vcpu)
2233{ 2586{
2234 struct kvm_io_device *mmio_dev;
2235 gpa_t gpa; 2587 gpa_t gpa;
2236 2588
2237 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); 2589 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
@@ -2249,17 +2601,12 @@ static int emulator_write_emulated_onepage(unsigned long addr,
2249 return X86EMUL_CONTINUE; 2601 return X86EMUL_CONTINUE;
2250 2602
2251mmio: 2603mmio:
2604 trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val);
2252 /* 2605 /*
2253 * Is this MMIO handled locally? 2606 * Is this MMIO handled locally?
2254 */ 2607 */
2255 mutex_lock(&vcpu->kvm->lock); 2608 if (!vcpu_mmio_write(vcpu, gpa, bytes, val))
2256 mmio_dev = vcpu_find_mmio_dev(vcpu, gpa, bytes, 1);
2257 if (mmio_dev) {
2258 kvm_iodevice_write(mmio_dev, gpa, bytes, val);
2259 mutex_unlock(&vcpu->kvm->lock);
2260 return X86EMUL_CONTINUE; 2609 return X86EMUL_CONTINUE;
2261 }
2262 mutex_unlock(&vcpu->kvm->lock);
2263 2610
2264 vcpu->mmio_needed = 1; 2611 vcpu->mmio_needed = 1;
2265 vcpu->mmio_phys_addr = gpa; 2612 vcpu->mmio_phys_addr = gpa;
@@ -2343,7 +2690,6 @@ int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)
2343 2690
2344int emulate_clts(struct kvm_vcpu *vcpu) 2691int emulate_clts(struct kvm_vcpu *vcpu)
2345{ 2692{
2346 KVMTRACE_0D(CLTS, vcpu, handler);
2347 kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 & ~X86_CR0_TS); 2693 kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 & ~X86_CR0_TS);
2348 return X86EMUL_CONTINUE; 2694 return X86EMUL_CONTINUE;
2349} 2695}
@@ -2420,7 +2766,7 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
2420 kvm_clear_exception_queue(vcpu); 2766 kvm_clear_exception_queue(vcpu);
2421 vcpu->arch.mmio_fault_cr2 = cr2; 2767 vcpu->arch.mmio_fault_cr2 = cr2;
2422 /* 2768 /*
2423 * TODO: fix x86_emulate.c to use guest_read/write_register 2769 * TODO: fix emulate.c to use guest_read/write_register
2424 * instead of direct ->regs accesses, can save hundred cycles 2770 * instead of direct ->regs accesses, can save hundred cycles
2425 * on Intel for instructions that don't read/change RSP, for 2771 * on Intel for instructions that don't read/change RSP, for
2426 * for example. 2772 * for example.
@@ -2444,14 +2790,33 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
2444 2790
2445 r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); 2791 r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
2446 2792
2447 /* Reject the instructions other than VMCALL/VMMCALL when 2793 /* Only allow emulation of specific instructions on #UD
2448 * try to emulate invalid opcode */ 2794 * (namely VMMCALL, sysenter, sysexit, syscall)*/
2449 c = &vcpu->arch.emulate_ctxt.decode; 2795 c = &vcpu->arch.emulate_ctxt.decode;
2450 if ((emulation_type & EMULTYPE_TRAP_UD) && 2796 if (emulation_type & EMULTYPE_TRAP_UD) {
2451 (!(c->twobyte && c->b == 0x01 && 2797 if (!c->twobyte)
2452 (c->modrm_reg == 0 || c->modrm_reg == 3) && 2798 return EMULATE_FAIL;
2453 c->modrm_mod == 3 && c->modrm_rm == 1))) 2799 switch (c->b) {
2454 return EMULATE_FAIL; 2800 case 0x01: /* VMMCALL */
2801 if (c->modrm_mod != 3 || c->modrm_rm != 1)
2802 return EMULATE_FAIL;
2803 break;
2804 case 0x34: /* sysenter */
2805 case 0x35: /* sysexit */
2806 if (c->modrm_mod != 0 || c->modrm_rm != 0)
2807 return EMULATE_FAIL;
2808 break;
2809 case 0x05: /* syscall */
2810 if (c->modrm_mod != 0 || c->modrm_rm != 0)
2811 return EMULATE_FAIL;
2812 break;
2813 default:
2814 return EMULATE_FAIL;
2815 }
2816
2817 if (!(c->modrm_reg == 0 || c->modrm_reg == 3))
2818 return EMULATE_FAIL;
2819 }
2455 2820
2456 ++vcpu->stat.insn_emulation; 2821 ++vcpu->stat.insn_emulation;
2457 if (r) { 2822 if (r) {
@@ -2571,52 +2936,40 @@ int complete_pio(struct kvm_vcpu *vcpu)
2571 return 0; 2936 return 0;
2572} 2937}
2573 2938
2574static void kernel_pio(struct kvm_io_device *pio_dev, 2939static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
2575 struct kvm_vcpu *vcpu,
2576 void *pd)
2577{ 2940{
2578 /* TODO: String I/O for in kernel device */ 2941 /* TODO: String I/O for in kernel device */
2942 int r;
2579 2943
2580 mutex_lock(&vcpu->kvm->lock);
2581 if (vcpu->arch.pio.in) 2944 if (vcpu->arch.pio.in)
2582 kvm_iodevice_read(pio_dev, vcpu->arch.pio.port, 2945 r = kvm_io_bus_read(&vcpu->kvm->pio_bus, vcpu->arch.pio.port,
2583 vcpu->arch.pio.size, 2946 vcpu->arch.pio.size, pd);
2584 pd);
2585 else 2947 else
2586 kvm_iodevice_write(pio_dev, vcpu->arch.pio.port, 2948 r = kvm_io_bus_write(&vcpu->kvm->pio_bus, vcpu->arch.pio.port,
2587 vcpu->arch.pio.size, 2949 vcpu->arch.pio.size, pd);
2588 pd); 2950 return r;
2589 mutex_unlock(&vcpu->kvm->lock);
2590} 2951}
2591 2952
2592static void pio_string_write(struct kvm_io_device *pio_dev, 2953static int pio_string_write(struct kvm_vcpu *vcpu)
2593 struct kvm_vcpu *vcpu)
2594{ 2954{
2595 struct kvm_pio_request *io = &vcpu->arch.pio; 2955 struct kvm_pio_request *io = &vcpu->arch.pio;
2596 void *pd = vcpu->arch.pio_data; 2956 void *pd = vcpu->arch.pio_data;
2597 int i; 2957 int i, r = 0;
2598 2958
2599 mutex_lock(&vcpu->kvm->lock);
2600 for (i = 0; i < io->cur_count; i++) { 2959 for (i = 0; i < io->cur_count; i++) {
2601 kvm_iodevice_write(pio_dev, io->port, 2960 if (kvm_io_bus_write(&vcpu->kvm->pio_bus,
2602 io->size, 2961 io->port, io->size, pd)) {
2603 pd); 2962 r = -EOPNOTSUPP;
2963 break;
2964 }
2604 pd += io->size; 2965 pd += io->size;
2605 } 2966 }
2606 mutex_unlock(&vcpu->kvm->lock); 2967 return r;
2607}
2608
2609static struct kvm_io_device *vcpu_find_pio_dev(struct kvm_vcpu *vcpu,
2610 gpa_t addr, int len,
2611 int is_write)
2612{
2613 return kvm_io_bus_find_dev(&vcpu->kvm->pio_bus, addr, len, is_write);
2614} 2968}
2615 2969
2616int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, 2970int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
2617 int size, unsigned port) 2971 int size, unsigned port)
2618{ 2972{
2619 struct kvm_io_device *pio_dev;
2620 unsigned long val; 2973 unsigned long val;
2621 2974
2622 vcpu->run->exit_reason = KVM_EXIT_IO; 2975 vcpu->run->exit_reason = KVM_EXIT_IO;
@@ -2630,19 +2983,13 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
2630 vcpu->arch.pio.down = 0; 2983 vcpu->arch.pio.down = 0;
2631 vcpu->arch.pio.rep = 0; 2984 vcpu->arch.pio.rep = 0;
2632 2985
2633 if (vcpu->run->io.direction == KVM_EXIT_IO_IN) 2986 trace_kvm_pio(vcpu->run->io.direction == KVM_EXIT_IO_OUT, port,
2634 KVMTRACE_2D(IO_READ, vcpu, vcpu->run->io.port, (u32)size, 2987 size, 1);
2635 handler);
2636 else
2637 KVMTRACE_2D(IO_WRITE, vcpu, vcpu->run->io.port, (u32)size,
2638 handler);
2639 2988
2640 val = kvm_register_read(vcpu, VCPU_REGS_RAX); 2989 val = kvm_register_read(vcpu, VCPU_REGS_RAX);
2641 memcpy(vcpu->arch.pio_data, &val, 4); 2990 memcpy(vcpu->arch.pio_data, &val, 4);
2642 2991
2643 pio_dev = vcpu_find_pio_dev(vcpu, port, size, !in); 2992 if (!kernel_pio(vcpu, vcpu->arch.pio_data)) {
2644 if (pio_dev) {
2645 kernel_pio(pio_dev, vcpu, vcpu->arch.pio_data);
2646 complete_pio(vcpu); 2993 complete_pio(vcpu);
2647 return 1; 2994 return 1;
2648 } 2995 }
@@ -2656,7 +3003,6 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
2656{ 3003{
2657 unsigned now, in_page; 3004 unsigned now, in_page;
2658 int ret = 0; 3005 int ret = 0;
2659 struct kvm_io_device *pio_dev;
2660 3006
2661 vcpu->run->exit_reason = KVM_EXIT_IO; 3007 vcpu->run->exit_reason = KVM_EXIT_IO;
2662 vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; 3008 vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
@@ -2669,12 +3015,8 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
2669 vcpu->arch.pio.down = down; 3015 vcpu->arch.pio.down = down;
2670 vcpu->arch.pio.rep = rep; 3016 vcpu->arch.pio.rep = rep;
2671 3017
2672 if (vcpu->run->io.direction == KVM_EXIT_IO_IN) 3018 trace_kvm_pio(vcpu->run->io.direction == KVM_EXIT_IO_OUT, port,
2673 KVMTRACE_2D(IO_READ, vcpu, vcpu->run->io.port, (u32)size, 3019 size, count);
2674 handler);
2675 else
2676 KVMTRACE_2D(IO_WRITE, vcpu, vcpu->run->io.port, (u32)size,
2677 handler);
2678 3020
2679 if (!count) { 3021 if (!count) {
2680 kvm_x86_ops->skip_emulated_instruction(vcpu); 3022 kvm_x86_ops->skip_emulated_instruction(vcpu);
@@ -2704,9 +3046,6 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
2704 3046
2705 vcpu->arch.pio.guest_gva = address; 3047 vcpu->arch.pio.guest_gva = address;
2706 3048
2707 pio_dev = vcpu_find_pio_dev(vcpu, port,
2708 vcpu->arch.pio.cur_count,
2709 !vcpu->arch.pio.in);
2710 if (!vcpu->arch.pio.in) { 3049 if (!vcpu->arch.pio.in) {
2711 /* string PIO write */ 3050 /* string PIO write */
2712 ret = pio_copy_data(vcpu); 3051 ret = pio_copy_data(vcpu);
@@ -2714,16 +3053,13 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
2714 kvm_inject_gp(vcpu, 0); 3053 kvm_inject_gp(vcpu, 0);
2715 return 1; 3054 return 1;
2716 } 3055 }
2717 if (ret == 0 && pio_dev) { 3056 if (ret == 0 && !pio_string_write(vcpu)) {
2718 pio_string_write(pio_dev, vcpu);
2719 complete_pio(vcpu); 3057 complete_pio(vcpu);
2720 if (vcpu->arch.pio.count == 0) 3058 if (vcpu->arch.pio.count == 0)
2721 ret = 1; 3059 ret = 1;
2722 } 3060 }
2723 } else if (pio_dev) 3061 }
2724 pr_unimpl(vcpu, "no string pio read support yet, " 3062 /* no string PIO read support yet */
2725 "port %x size %d count %ld\n",
2726 port, size, count);
2727 3063
2728 return ret; 3064 return ret;
2729} 3065}
@@ -2756,10 +3092,7 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va
2756 3092
2757 spin_lock(&kvm_lock); 3093 spin_lock(&kvm_lock);
2758 list_for_each_entry(kvm, &vm_list, vm_list) { 3094 list_for_each_entry(kvm, &vm_list, vm_list) {
2759 for (i = 0; i < KVM_MAX_VCPUS; ++i) { 3095 kvm_for_each_vcpu(i, vcpu, kvm) {
2760 vcpu = kvm->vcpus[i];
2761 if (!vcpu)
2762 continue;
2763 if (vcpu->cpu != freq->cpu) 3096 if (vcpu->cpu != freq->cpu)
2764 continue; 3097 continue;
2765 if (!kvm_request_guest_time_update(vcpu)) 3098 if (!kvm_request_guest_time_update(vcpu))
@@ -2852,7 +3185,6 @@ void kvm_arch_exit(void)
2852int kvm_emulate_halt(struct kvm_vcpu *vcpu) 3185int kvm_emulate_halt(struct kvm_vcpu *vcpu)
2853{ 3186{
2854 ++vcpu->stat.halt_exits; 3187 ++vcpu->stat.halt_exits;
2855 KVMTRACE_0D(HLT, vcpu, handler);
2856 if (irqchip_in_kernel(vcpu->kvm)) { 3188 if (irqchip_in_kernel(vcpu->kvm)) {
2857 vcpu->arch.mp_state = KVM_MP_STATE_HALTED; 3189 vcpu->arch.mp_state = KVM_MP_STATE_HALTED;
2858 return 1; 3190 return 1;
@@ -2883,7 +3215,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
2883 a2 = kvm_register_read(vcpu, VCPU_REGS_RDX); 3215 a2 = kvm_register_read(vcpu, VCPU_REGS_RDX);
2884 a3 = kvm_register_read(vcpu, VCPU_REGS_RSI); 3216 a3 = kvm_register_read(vcpu, VCPU_REGS_RSI);
2885 3217
2886 KVMTRACE_1D(VMMCALL, vcpu, (u32)nr, handler); 3218 trace_kvm_hypercall(nr, a0, a1, a2, a3);
2887 3219
2888 if (!is_long_mode(vcpu)) { 3220 if (!is_long_mode(vcpu)) {
2889 nr &= 0xFFFFFFFF; 3221 nr &= 0xFFFFFFFF;
@@ -2893,6 +3225,11 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
2893 a3 &= 0xFFFFFFFF; 3225 a3 &= 0xFFFFFFFF;
2894 } 3226 }
2895 3227
3228 if (kvm_x86_ops->get_cpl(vcpu) != 0) {
3229 ret = -KVM_EPERM;
3230 goto out;
3231 }
3232
2896 switch (nr) { 3233 switch (nr) {
2897 case KVM_HC_VAPIC_POLL_IRQ: 3234 case KVM_HC_VAPIC_POLL_IRQ:
2898 ret = 0; 3235 ret = 0;
@@ -2904,6 +3241,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
2904 ret = -KVM_ENOSYS; 3241 ret = -KVM_ENOSYS;
2905 break; 3242 break;
2906 } 3243 }
3244out:
2907 kvm_register_write(vcpu, VCPU_REGS_RAX, ret); 3245 kvm_register_write(vcpu, VCPU_REGS_RAX, ret);
2908 ++vcpu->stat.hypercalls; 3246 ++vcpu->stat.hypercalls;
2909 return r; 3247 return r;
@@ -2983,8 +3321,6 @@ unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
2983 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr); 3321 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
2984 return 0; 3322 return 0;
2985 } 3323 }
2986 KVMTRACE_3D(CR_READ, vcpu, (u32)cr, (u32)value,
2987 (u32)((u64)value >> 32), handler);
2988 3324
2989 return value; 3325 return value;
2990} 3326}
@@ -2992,9 +3328,6 @@ unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
2992void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val, 3328void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val,
2993 unsigned long *rflags) 3329 unsigned long *rflags)
2994{ 3330{
2995 KVMTRACE_3D(CR_WRITE, vcpu, (u32)cr, (u32)val,
2996 (u32)((u64)val >> 32), handler);
2997
2998 switch (cr) { 3331 switch (cr) {
2999 case 0: 3332 case 0:
3000 kvm_set_cr0(vcpu, mk_cr_64(vcpu->arch.cr0, val)); 3333 kvm_set_cr0(vcpu, mk_cr_64(vcpu->arch.cr0, val));
@@ -3104,11 +3437,11 @@ void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
3104 kvm_register_write(vcpu, VCPU_REGS_RDX, best->edx); 3437 kvm_register_write(vcpu, VCPU_REGS_RDX, best->edx);
3105 } 3438 }
3106 kvm_x86_ops->skip_emulated_instruction(vcpu); 3439 kvm_x86_ops->skip_emulated_instruction(vcpu);
3107 KVMTRACE_5D(CPUID, vcpu, function, 3440 trace_kvm_cpuid(function,
3108 (u32)kvm_register_read(vcpu, VCPU_REGS_RAX), 3441 kvm_register_read(vcpu, VCPU_REGS_RAX),
3109 (u32)kvm_register_read(vcpu, VCPU_REGS_RBX), 3442 kvm_register_read(vcpu, VCPU_REGS_RBX),
3110 (u32)kvm_register_read(vcpu, VCPU_REGS_RCX), 3443 kvm_register_read(vcpu, VCPU_REGS_RCX),
3111 (u32)kvm_register_read(vcpu, VCPU_REGS_RDX), handler); 3444 kvm_register_read(vcpu, VCPU_REGS_RDX));
3112} 3445}
3113EXPORT_SYMBOL_GPL(kvm_emulate_cpuid); 3446EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
3114 3447
@@ -3174,6 +3507,9 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu)
3174 if (!kvm_x86_ops->update_cr8_intercept) 3507 if (!kvm_x86_ops->update_cr8_intercept)
3175 return; 3508 return;
3176 3509
3510 if (!vcpu->arch.apic)
3511 return;
3512
3177 if (!vcpu->arch.apic->vapic_addr) 3513 if (!vcpu->arch.apic->vapic_addr)
3178 max_irr = kvm_lapic_find_highest_irr(vcpu); 3514 max_irr = kvm_lapic_find_highest_irr(vcpu);
3179 else 3515 else
@@ -3187,12 +3523,16 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu)
3187 kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr); 3523 kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr);
3188} 3524}
3189 3525
3190static void inject_pending_irq(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3526static void inject_pending_event(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3191{ 3527{
3192 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
3193 kvm_x86_ops->set_interrupt_shadow(vcpu, 0);
3194
3195 /* try to reinject previous events if any */ 3528 /* try to reinject previous events if any */
3529 if (vcpu->arch.exception.pending) {
3530 kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr,
3531 vcpu->arch.exception.has_error_code,
3532 vcpu->arch.exception.error_code);
3533 return;
3534 }
3535
3196 if (vcpu->arch.nmi_injected) { 3536 if (vcpu->arch.nmi_injected) {
3197 kvm_x86_ops->set_nmi(vcpu); 3537 kvm_x86_ops->set_nmi(vcpu);
3198 return; 3538 return;
@@ -3266,16 +3606,14 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3266 smp_mb__after_clear_bit(); 3606 smp_mb__after_clear_bit();
3267 3607
3268 if (vcpu->requests || need_resched() || signal_pending(current)) { 3608 if (vcpu->requests || need_resched() || signal_pending(current)) {
3609 set_bit(KVM_REQ_KICK, &vcpu->requests);
3269 local_irq_enable(); 3610 local_irq_enable();
3270 preempt_enable(); 3611 preempt_enable();
3271 r = 1; 3612 r = 1;
3272 goto out; 3613 goto out;
3273 } 3614 }
3274 3615
3275 if (vcpu->arch.exception.pending) 3616 inject_pending_event(vcpu, kvm_run);
3276 __queue_exception(vcpu);
3277 else
3278 inject_pending_irq(vcpu, kvm_run);
3279 3617
3280 /* enable NMI/IRQ window open exits if needed */ 3618 /* enable NMI/IRQ window open exits if needed */
3281 if (vcpu->arch.nmi_pending) 3619 if (vcpu->arch.nmi_pending)
@@ -3292,14 +3630,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3292 3630
3293 kvm_guest_enter(); 3631 kvm_guest_enter();
3294 3632
3295 get_debugreg(vcpu->arch.host_dr6, 6);
3296 get_debugreg(vcpu->arch.host_dr7, 7);
3297 if (unlikely(vcpu->arch.switch_db_regs)) { 3633 if (unlikely(vcpu->arch.switch_db_regs)) {
3298 get_debugreg(vcpu->arch.host_db[0], 0);
3299 get_debugreg(vcpu->arch.host_db[1], 1);
3300 get_debugreg(vcpu->arch.host_db[2], 2);
3301 get_debugreg(vcpu->arch.host_db[3], 3);
3302
3303 set_debugreg(0, 7); 3634 set_debugreg(0, 7);
3304 set_debugreg(vcpu->arch.eff_db[0], 0); 3635 set_debugreg(vcpu->arch.eff_db[0], 0);
3305 set_debugreg(vcpu->arch.eff_db[1], 1); 3636 set_debugreg(vcpu->arch.eff_db[1], 1);
@@ -3307,18 +3638,17 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3307 set_debugreg(vcpu->arch.eff_db[3], 3); 3638 set_debugreg(vcpu->arch.eff_db[3], 3);
3308 } 3639 }
3309 3640
3310 KVMTRACE_0D(VMENTRY, vcpu, entryexit); 3641 trace_kvm_entry(vcpu->vcpu_id);
3311 kvm_x86_ops->run(vcpu, kvm_run); 3642 kvm_x86_ops->run(vcpu, kvm_run);
3312 3643
3313 if (unlikely(vcpu->arch.switch_db_regs)) { 3644 if (unlikely(vcpu->arch.switch_db_regs || test_thread_flag(TIF_DEBUG))) {
3314 set_debugreg(0, 7); 3645 set_debugreg(current->thread.debugreg0, 0);
3315 set_debugreg(vcpu->arch.host_db[0], 0); 3646 set_debugreg(current->thread.debugreg1, 1);
3316 set_debugreg(vcpu->arch.host_db[1], 1); 3647 set_debugreg(current->thread.debugreg2, 2);
3317 set_debugreg(vcpu->arch.host_db[2], 2); 3648 set_debugreg(current->thread.debugreg3, 3);
3318 set_debugreg(vcpu->arch.host_db[3], 3); 3649 set_debugreg(current->thread.debugreg6, 6);
3650 set_debugreg(current->thread.debugreg7, 7);
3319 } 3651 }
3320 set_debugreg(vcpu->arch.host_dr6, 6);
3321 set_debugreg(vcpu->arch.host_dr7, 7);
3322 3652
3323 set_bit(KVM_REQ_KICK, &vcpu->requests); 3653 set_bit(KVM_REQ_KICK, &vcpu->requests);
3324 local_irq_enable(); 3654 local_irq_enable();
@@ -3648,11 +3978,8 @@ static void kvm_set_segment(struct kvm_vcpu *vcpu,
3648static void seg_desct_to_kvm_desct(struct desc_struct *seg_desc, u16 selector, 3978static void seg_desct_to_kvm_desct(struct desc_struct *seg_desc, u16 selector,
3649 struct kvm_segment *kvm_desct) 3979 struct kvm_segment *kvm_desct)
3650{ 3980{
3651 kvm_desct->base = seg_desc->base0; 3981 kvm_desct->base = get_desc_base(seg_desc);
3652 kvm_desct->base |= seg_desc->base1 << 16; 3982 kvm_desct->limit = get_desc_limit(seg_desc);
3653 kvm_desct->base |= seg_desc->base2 << 24;
3654 kvm_desct->limit = seg_desc->limit0;
3655 kvm_desct->limit |= seg_desc->limit << 16;
3656 if (seg_desc->g) { 3983 if (seg_desc->g) {
3657 kvm_desct->limit <<= 12; 3984 kvm_desct->limit <<= 12;
3658 kvm_desct->limit |= 0xfff; 3985 kvm_desct->limit |= 0xfff;
@@ -3696,7 +4023,6 @@ static void get_segment_descriptor_dtable(struct kvm_vcpu *vcpu,
3696static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, 4023static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
3697 struct desc_struct *seg_desc) 4024 struct desc_struct *seg_desc)
3698{ 4025{
3699 gpa_t gpa;
3700 struct descriptor_table dtable; 4026 struct descriptor_table dtable;
3701 u16 index = selector >> 3; 4027 u16 index = selector >> 3;
3702 4028
@@ -3706,16 +4032,13 @@ static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
3706 kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc); 4032 kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc);
3707 return 1; 4033 return 1;
3708 } 4034 }
3709 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, dtable.base); 4035 return kvm_read_guest_virt(dtable.base + index*8, seg_desc, sizeof(*seg_desc), vcpu);
3710 gpa += index * 8;
3711 return kvm_read_guest(vcpu->kvm, gpa, seg_desc, 8);
3712} 4036}
3713 4037
3714/* allowed just for 8 bytes segments */ 4038/* allowed just for 8 bytes segments */
3715static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, 4039static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
3716 struct desc_struct *seg_desc) 4040 struct desc_struct *seg_desc)
3717{ 4041{
3718 gpa_t gpa;
3719 struct descriptor_table dtable; 4042 struct descriptor_table dtable;
3720 u16 index = selector >> 3; 4043 u16 index = selector >> 3;
3721 4044
@@ -3723,19 +4046,13 @@ static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
3723 4046
3724 if (dtable.limit < index * 8 + 7) 4047 if (dtable.limit < index * 8 + 7)
3725 return 1; 4048 return 1;
3726 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, dtable.base); 4049 return kvm_write_guest_virt(dtable.base + index*8, seg_desc, sizeof(*seg_desc), vcpu);
3727 gpa += index * 8;
3728 return kvm_write_guest(vcpu->kvm, gpa, seg_desc, 8);
3729} 4050}
3730 4051
3731static u32 get_tss_base_addr(struct kvm_vcpu *vcpu, 4052static u32 get_tss_base_addr(struct kvm_vcpu *vcpu,
3732 struct desc_struct *seg_desc) 4053 struct desc_struct *seg_desc)
3733{ 4054{
3734 u32 base_addr; 4055 u32 base_addr = get_desc_base(seg_desc);
3735
3736 base_addr = seg_desc->base0;
3737 base_addr |= (seg_desc->base1 << 16);
3738 base_addr |= (seg_desc->base2 << 24);
3739 4056
3740 return vcpu->arch.mmu.gva_to_gpa(vcpu, base_addr); 4057 return vcpu->arch.mmu.gva_to_gpa(vcpu, base_addr);
3741} 4058}
@@ -3780,12 +4097,19 @@ static int kvm_load_realmode_segment(struct kvm_vcpu *vcpu, u16 selector, int se
3780 return 0; 4097 return 0;
3781} 4098}
3782 4099
4100static int is_vm86_segment(struct kvm_vcpu *vcpu, int seg)
4101{
4102 return (seg != VCPU_SREG_LDTR) &&
4103 (seg != VCPU_SREG_TR) &&
4104 (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_VM);
4105}
4106
3783int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, 4107int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
3784 int type_bits, int seg) 4108 int type_bits, int seg)
3785{ 4109{
3786 struct kvm_segment kvm_seg; 4110 struct kvm_segment kvm_seg;
3787 4111
3788 if (!(vcpu->arch.cr0 & X86_CR0_PE)) 4112 if (is_vm86_segment(vcpu, seg) || !(vcpu->arch.cr0 & X86_CR0_PE))
3789 return kvm_load_realmode_segment(vcpu, selector, seg); 4113 return kvm_load_realmode_segment(vcpu, selector, seg);
3790 if (load_segment_descriptor_to_kvm_desct(vcpu, selector, &kvm_seg)) 4114 if (load_segment_descriptor_to_kvm_desct(vcpu, selector, &kvm_seg))
3791 return 1; 4115 return 1;
@@ -4024,7 +4348,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
4024 } 4348 }
4025 } 4349 }
4026 4350
4027 if (!nseg_desc.p || (nseg_desc.limit0 | nseg_desc.limit << 16) < 0x67) { 4351 if (!nseg_desc.p || get_desc_limit(&nseg_desc) < 0x67) {
4028 kvm_queue_exception_e(vcpu, TS_VECTOR, tss_selector & 0xfffc); 4352 kvm_queue_exception_e(vcpu, TS_VECTOR, tss_selector & 0xfffc);
4029 return 1; 4353 return 1;
4030 } 4354 }
@@ -4094,13 +4418,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
4094 4418
4095 vcpu->arch.cr2 = sregs->cr2; 4419 vcpu->arch.cr2 = sregs->cr2;
4096 mmu_reset_needed |= vcpu->arch.cr3 != sregs->cr3; 4420 mmu_reset_needed |= vcpu->arch.cr3 != sregs->cr3;
4097 4421 vcpu->arch.cr3 = sregs->cr3;
4098 down_read(&vcpu->kvm->slots_lock);
4099 if (gfn_to_memslot(vcpu->kvm, sregs->cr3 >> PAGE_SHIFT))
4100 vcpu->arch.cr3 = sregs->cr3;
4101 else
4102 set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
4103 up_read(&vcpu->kvm->slots_lock);
4104 4422
4105 kvm_set_cr8(vcpu, sregs->cr8); 4423 kvm_set_cr8(vcpu, sregs->cr8);
4106 4424
@@ -4142,8 +4460,10 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
4142 kvm_set_segment(vcpu, &sregs->tr, VCPU_SREG_TR); 4460 kvm_set_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
4143 kvm_set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); 4461 kvm_set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
4144 4462
4463 update_cr8_intercept(vcpu);
4464
4145 /* Older userspace won't unhalt the vcpu on reset. */ 4465 /* Older userspace won't unhalt the vcpu on reset. */
4146 if (vcpu->vcpu_id == 0 && kvm_rip_read(vcpu) == 0xfff0 && 4466 if (kvm_vcpu_is_bsp(vcpu) && kvm_rip_read(vcpu) == 0xfff0 &&
4147 sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 && 4467 sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 &&
4148 !(vcpu->arch.cr0 & X86_CR0_PE)) 4468 !(vcpu->arch.cr0 & X86_CR0_PE))
4149 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 4469 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
@@ -4414,7 +4734,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
4414 kvm = vcpu->kvm; 4734 kvm = vcpu->kvm;
4415 4735
4416 vcpu->arch.mmu.root_hpa = INVALID_PAGE; 4736 vcpu->arch.mmu.root_hpa = INVALID_PAGE;
4417 if (!irqchip_in_kernel(kvm) || vcpu->vcpu_id == 0) 4737 if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_bsp(vcpu))
4418 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 4738 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
4419 else 4739 else
4420 vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED; 4740 vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED;
@@ -4436,6 +4756,14 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
4436 goto fail_mmu_destroy; 4756 goto fail_mmu_destroy;
4437 } 4757 }
4438 4758
4759 vcpu->arch.mce_banks = kzalloc(KVM_MAX_MCE_BANKS * sizeof(u64) * 4,
4760 GFP_KERNEL);
4761 if (!vcpu->arch.mce_banks) {
4762 r = -ENOMEM;
4763 goto fail_mmu_destroy;
4764 }
4765 vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS;
4766
4439 return 0; 4767 return 0;
4440 4768
4441fail_mmu_destroy: 4769fail_mmu_destroy:
@@ -4483,20 +4811,22 @@ static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
4483static void kvm_free_vcpus(struct kvm *kvm) 4811static void kvm_free_vcpus(struct kvm *kvm)
4484{ 4812{
4485 unsigned int i; 4813 unsigned int i;
4814 struct kvm_vcpu *vcpu;
4486 4815
4487 /* 4816 /*
4488 * Unpin any mmu pages first. 4817 * Unpin any mmu pages first.
4489 */ 4818 */
4490 for (i = 0; i < KVM_MAX_VCPUS; ++i) 4819 kvm_for_each_vcpu(i, vcpu, kvm)
4491 if (kvm->vcpus[i]) 4820 kvm_unload_vcpu_mmu(vcpu);
4492 kvm_unload_vcpu_mmu(kvm->vcpus[i]); 4821 kvm_for_each_vcpu(i, vcpu, kvm)
4493 for (i = 0; i < KVM_MAX_VCPUS; ++i) { 4822 kvm_arch_vcpu_free(vcpu);
4494 if (kvm->vcpus[i]) { 4823
4495 kvm_arch_vcpu_free(kvm->vcpus[i]); 4824 mutex_lock(&kvm->lock);
4496 kvm->vcpus[i] = NULL; 4825 for (i = 0; i < atomic_read(&kvm->online_vcpus); i++)
4497 } 4826 kvm->vcpus[i] = NULL;
4498 }
4499 4827
4828 atomic_set(&kvm->online_vcpus, 0);
4829 mutex_unlock(&kvm->lock);
4500} 4830}
4501 4831
4502void kvm_arch_sync_events(struct kvm *kvm) 4832void kvm_arch_sync_events(struct kvm *kvm)
@@ -4573,7 +4903,6 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
4573 4903
4574 kvm_mmu_slot_remove_write_access(kvm, mem->slot); 4904 kvm_mmu_slot_remove_write_access(kvm, mem->slot);
4575 spin_unlock(&kvm->mmu_lock); 4905 spin_unlock(&kvm->mmu_lock);
4576 kvm_flush_remote_tlbs(kvm);
4577 4906
4578 return 0; 4907 return 0;
4579} 4908}
@@ -4587,8 +4916,10 @@ void kvm_arch_flush_shadow(struct kvm *kvm)
4587int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) 4916int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
4588{ 4917{
4589 return vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE 4918 return vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE
4590 || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED 4919 || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED
4591 || vcpu->arch.nmi_pending; 4920 || vcpu->arch.nmi_pending ||
4921 (kvm_arch_interrupt_allowed(vcpu) &&
4922 kvm_cpu_has_interrupt(vcpu));
4592} 4923}
4593 4924
4594void kvm_vcpu_kick(struct kvm_vcpu *vcpu) 4925void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
@@ -4612,3 +4943,9 @@ int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu)
4612{ 4943{
4613 return kvm_x86_ops->interrupt_allowed(vcpu); 4944 return kvm_x86_ops->interrupt_allowed(vcpu);
4614} 4945}
4946
4947EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
4948EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq);
4949EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault);
4950EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_msr);
4951EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_cr);
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 4c8e10af78e8..5eadea585d2a 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -31,4 +31,8 @@ static inline bool kvm_exception_is_soft(unsigned int nr)
31{ 31{
32 return (nr == BP_VECTOR) || (nr == OF_VECTOR); 32 return (nr == BP_VECTOR) || (nr == OF_VECTOR);
33} 33}
34
35struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
36 u32 function, u32 index);
37
34#endif 38#endif
diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c
index 1617958a3805..63a6ba66cbe0 100644
--- a/arch/x86/mm/highmem_32.c
+++ b/arch/x86/mm/highmem_32.c
@@ -104,6 +104,7 @@ EXPORT_SYMBOL(kunmap);
104EXPORT_SYMBOL(kmap_atomic); 104EXPORT_SYMBOL(kmap_atomic);
105EXPORT_SYMBOL(kunmap_atomic); 105EXPORT_SYMBOL(kunmap_atomic);
106EXPORT_SYMBOL(kmap_atomic_prot); 106EXPORT_SYMBOL(kmap_atomic_prot);
107EXPORT_SYMBOL(kmap_atomic_to_page);
107 108
108void __init set_highmem_pages_init(void) 109void __init set_highmem_pages_init(void)
109{ 110{
diff --git a/include/asm-generic/Kbuild.asm b/include/asm-generic/Kbuild.asm
index 290910e4ede4..96d7c9804dc1 100644
--- a/include/asm-generic/Kbuild.asm
+++ b/include/asm-generic/Kbuild.asm
@@ -3,6 +3,11 @@ ifneq ($(wildcard $(srctree)/arch/$(SRCARCH)/include/asm/kvm.h \
3header-y += kvm.h 3header-y += kvm.h
4endif 4endif
5 5
6ifneq ($(wildcard $(srctree)/arch/$(SRCARCH)/include/asm/kvm_para.h \
7 $(srctree)/include/asm-$(SRCARCH)/kvm_para.h),)
8header-y += kvm_para.h
9endif
10
6ifneq ($(wildcard $(srctree)/arch/$(SRCARCH)/include/asm/a.out.h \ 11ifneq ($(wildcard $(srctree)/arch/$(SRCARCH)/include/asm/a.out.h \
7 $(srctree)/include/asm-$(SRCARCH)/a.out.h),) 12 $(srctree)/include/asm-$(SRCARCH)/a.out.h),)
8unifdef-y += a.out.h 13unifdef-y += a.out.h
diff --git a/include/linux/Kbuild b/include/linux/Kbuild
index 334a3593cdfd..cff4a101f266 100644
--- a/include/linux/Kbuild
+++ b/include/linux/Kbuild
@@ -268,6 +268,10 @@ ifneq ($(wildcard $(srctree)/arch/$(SRCARCH)/include/asm/kvm.h \
268 $(srctree)/include/asm-$(SRCARCH)/kvm.h),) 268 $(srctree)/include/asm-$(SRCARCH)/kvm.h),)
269unifdef-y += kvm.h 269unifdef-y += kvm.h
270endif 270endif
271ifneq ($(wildcard $(srctree)/arch/$(SRCARCH)/include/asm/kvm_para.h \
272 $(srctree)/include/asm-$(SRCARCH)/kvm_para.h),)
273unifdef-y += kvm_para.h
274endif
271unifdef-y += llc.h 275unifdef-y += llc.h
272unifdef-y += loop.h 276unifdef-y += loop.h
273unifdef-y += lp.h 277unifdef-y += lp.h
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 3db5d8d37485..f8f8900fc5ec 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -14,7 +14,7 @@
14 14
15#define KVM_API_VERSION 12 15#define KVM_API_VERSION 12
16 16
17/* for KVM_TRACE_ENABLE */ 17/* for KVM_TRACE_ENABLE, deprecated */
18struct kvm_user_trace_setup { 18struct kvm_user_trace_setup {
19 __u32 buf_size; /* sub_buffer size of each per-cpu */ 19 __u32 buf_size; /* sub_buffer size of each per-cpu */
20 __u32 buf_nr; /* the number of sub_buffers of each per-cpu */ 20 __u32 buf_nr; /* the number of sub_buffers of each per-cpu */
@@ -70,6 +70,14 @@ struct kvm_irqchip {
70 } chip; 70 } chip;
71}; 71};
72 72
73/* for KVM_CREATE_PIT2 */
74struct kvm_pit_config {
75 __u32 flags;
76 __u32 pad[15];
77};
78
79#define KVM_PIT_SPEAKER_DUMMY 1
80
73#define KVM_EXIT_UNKNOWN 0 81#define KVM_EXIT_UNKNOWN 0
74#define KVM_EXIT_EXCEPTION 1 82#define KVM_EXIT_EXCEPTION 1
75#define KVM_EXIT_IO 2 83#define KVM_EXIT_IO 2
@@ -87,6 +95,10 @@ struct kvm_irqchip {
87#define KVM_EXIT_S390_RESET 14 95#define KVM_EXIT_S390_RESET 14
88#define KVM_EXIT_DCR 15 96#define KVM_EXIT_DCR 15
89#define KVM_EXIT_NMI 16 97#define KVM_EXIT_NMI 16
98#define KVM_EXIT_INTERNAL_ERROR 17
99
100/* For KVM_EXIT_INTERNAL_ERROR */
101#define KVM_INTERNAL_ERROR_EMULATION 1
90 102
91/* for KVM_RUN, returned by mmap(vcpu_fd, offset=0) */ 103/* for KVM_RUN, returned by mmap(vcpu_fd, offset=0) */
92struct kvm_run { 104struct kvm_run {
@@ -173,6 +185,9 @@ struct kvm_run {
173 __u32 data; 185 __u32 data;
174 __u8 is_write; 186 __u8 is_write;
175 } dcr; 187 } dcr;
188 struct {
189 __u32 suberror;
190 } internal;
176 /* Fix the size of the union. */ 191 /* Fix the size of the union. */
177 char padding[256]; 192 char padding[256];
178 }; 193 };
@@ -292,6 +307,28 @@ struct kvm_guest_debug {
292 struct kvm_guest_debug_arch arch; 307 struct kvm_guest_debug_arch arch;
293}; 308};
294 309
310enum {
311 kvm_ioeventfd_flag_nr_datamatch,
312 kvm_ioeventfd_flag_nr_pio,
313 kvm_ioeventfd_flag_nr_deassign,
314 kvm_ioeventfd_flag_nr_max,
315};
316
317#define KVM_IOEVENTFD_FLAG_DATAMATCH (1 << kvm_ioeventfd_flag_nr_datamatch)
318#define KVM_IOEVENTFD_FLAG_PIO (1 << kvm_ioeventfd_flag_nr_pio)
319#define KVM_IOEVENTFD_FLAG_DEASSIGN (1 << kvm_ioeventfd_flag_nr_deassign)
320
321#define KVM_IOEVENTFD_VALID_FLAG_MASK ((1 << kvm_ioeventfd_flag_nr_max) - 1)
322
323struct kvm_ioeventfd {
324 __u64 datamatch;
325 __u64 addr; /* legal pio/mmio address */
326 __u32 len; /* 1, 2, 4, or 8 bytes */
327 __s32 fd;
328 __u32 flags;
329 __u8 pad[36];
330};
331
295#define KVM_TRC_SHIFT 16 332#define KVM_TRC_SHIFT 16
296/* 333/*
297 * kvm trace categories 334 * kvm trace categories
@@ -310,35 +347,6 @@ struct kvm_guest_debug {
310#define KVM_TRC_CYCLE_SIZE 8 347#define KVM_TRC_CYCLE_SIZE 8
311#define KVM_TRC_EXTRA_MAX 7 348#define KVM_TRC_EXTRA_MAX 7
312 349
313/* This structure represents a single trace buffer record. */
314struct kvm_trace_rec {
315 /* variable rec_val
316 * is split into:
317 * bits 0 - 27 -> event id
318 * bits 28 -30 -> number of extra data args of size u32
319 * bits 31 -> binary indicator for if tsc is in record
320 */
321 __u32 rec_val;
322 __u32 pid;
323 __u32 vcpu_id;
324 union {
325 struct {
326 __u64 timestamp;
327 __u32 extra_u32[KVM_TRC_EXTRA_MAX];
328 } __attribute__((packed)) timestamp;
329 struct {
330 __u32 extra_u32[KVM_TRC_EXTRA_MAX];
331 } notimestamp;
332 } u;
333};
334
335#define TRACE_REC_EVENT_ID(val) \
336 (0x0fffffff & (val))
337#define TRACE_REC_NUM_DATA_ARGS(val) \
338 (0x70000000 & ((val) << 28))
339#define TRACE_REC_TCS(val) \
340 (0x80000000 & ((val) << 31))
341
342#define KVMIO 0xAE 350#define KVMIO 0xAE
343 351
344/* 352/*
@@ -415,6 +423,19 @@ struct kvm_trace_rec {
415#define KVM_CAP_ASSIGN_DEV_IRQ 29 423#define KVM_CAP_ASSIGN_DEV_IRQ 29
416/* Another bug in KVM_SET_USER_MEMORY_REGION fixed: */ 424/* Another bug in KVM_SET_USER_MEMORY_REGION fixed: */
417#define KVM_CAP_JOIN_MEMORY_REGIONS_WORKS 30 425#define KVM_CAP_JOIN_MEMORY_REGIONS_WORKS 30
426#ifdef __KVM_HAVE_MCE
427#define KVM_CAP_MCE 31
428#endif
429#define KVM_CAP_IRQFD 32
430#ifdef __KVM_HAVE_PIT
431#define KVM_CAP_PIT2 33
432#endif
433#define KVM_CAP_SET_BOOT_CPU_ID 34
434#ifdef __KVM_HAVE_PIT_STATE2
435#define KVM_CAP_PIT_STATE2 35
436#endif
437#define KVM_CAP_IOEVENTFD 36
438#define KVM_CAP_SET_IDENTITY_MAP_ADDR 37
418 439
419#ifdef KVM_CAP_IRQ_ROUTING 440#ifdef KVM_CAP_IRQ_ROUTING
420 441
@@ -454,15 +475,32 @@ struct kvm_irq_routing {
454 475
455#endif 476#endif
456 477
478#ifdef KVM_CAP_MCE
479/* x86 MCE */
480struct kvm_x86_mce {
481 __u64 status;
482 __u64 addr;
483 __u64 misc;
484 __u64 mcg_status;
485 __u8 bank;
486 __u8 pad1[7];
487 __u64 pad2[3];
488};
489#endif
490
491#define KVM_IRQFD_FLAG_DEASSIGN (1 << 0)
492
493struct kvm_irqfd {
494 __u32 fd;
495 __u32 gsi;
496 __u32 flags;
497 __u8 pad[20];
498};
499
457/* 500/*
458 * ioctls for VM fds 501 * ioctls for VM fds
459 */ 502 */
460#define KVM_SET_MEMORY_REGION _IOW(KVMIO, 0x40, struct kvm_memory_region) 503#define KVM_SET_MEMORY_REGION _IOW(KVMIO, 0x40, struct kvm_memory_region)
461#define KVM_SET_NR_MMU_PAGES _IO(KVMIO, 0x44)
462#define KVM_GET_NR_MMU_PAGES _IO(KVMIO, 0x45)
463#define KVM_SET_USER_MEMORY_REGION _IOW(KVMIO, 0x46,\
464 struct kvm_userspace_memory_region)
465#define KVM_SET_TSS_ADDR _IO(KVMIO, 0x47)
466/* 504/*
467 * KVM_CREATE_VCPU receives as a parameter the vcpu slot, and returns 505 * KVM_CREATE_VCPU receives as a parameter the vcpu slot, and returns
468 * a vcpu fd. 506 * a vcpu fd.
@@ -470,6 +508,12 @@ struct kvm_irq_routing {
470#define KVM_CREATE_VCPU _IO(KVMIO, 0x41) 508#define KVM_CREATE_VCPU _IO(KVMIO, 0x41)
471#define KVM_GET_DIRTY_LOG _IOW(KVMIO, 0x42, struct kvm_dirty_log) 509#define KVM_GET_DIRTY_LOG _IOW(KVMIO, 0x42, struct kvm_dirty_log)
472#define KVM_SET_MEMORY_ALIAS _IOW(KVMIO, 0x43, struct kvm_memory_alias) 510#define KVM_SET_MEMORY_ALIAS _IOW(KVMIO, 0x43, struct kvm_memory_alias)
511#define KVM_SET_NR_MMU_PAGES _IO(KVMIO, 0x44)
512#define KVM_GET_NR_MMU_PAGES _IO(KVMIO, 0x45)
513#define KVM_SET_USER_MEMORY_REGION _IOW(KVMIO, 0x46,\
514 struct kvm_userspace_memory_region)
515#define KVM_SET_TSS_ADDR _IO(KVMIO, 0x47)
516#define KVM_SET_IDENTITY_MAP_ADDR _IOW(KVMIO, 0x48, __u64)
473/* Device model IOC */ 517/* Device model IOC */
474#define KVM_CREATE_IRQCHIP _IO(KVMIO, 0x60) 518#define KVM_CREATE_IRQCHIP _IO(KVMIO, 0x60)
475#define KVM_IRQ_LINE _IOW(KVMIO, 0x61, struct kvm_irq_level) 519#define KVM_IRQ_LINE _IOW(KVMIO, 0x61, struct kvm_irq_level)
@@ -498,6 +542,10 @@ struct kvm_irq_routing {
498#define KVM_ASSIGN_SET_MSIX_ENTRY \ 542#define KVM_ASSIGN_SET_MSIX_ENTRY \
499 _IOW(KVMIO, 0x74, struct kvm_assigned_msix_entry) 543 _IOW(KVMIO, 0x74, struct kvm_assigned_msix_entry)
500#define KVM_DEASSIGN_DEV_IRQ _IOW(KVMIO, 0x75, struct kvm_assigned_irq) 544#define KVM_DEASSIGN_DEV_IRQ _IOW(KVMIO, 0x75, struct kvm_assigned_irq)
545#define KVM_IRQFD _IOW(KVMIO, 0x76, struct kvm_irqfd)
546#define KVM_CREATE_PIT2 _IOW(KVMIO, 0x77, struct kvm_pit_config)
547#define KVM_SET_BOOT_CPU_ID _IO(KVMIO, 0x78)
548#define KVM_IOEVENTFD _IOW(KVMIO, 0x79, struct kvm_ioeventfd)
501 549
502/* 550/*
503 * ioctls for vcpu fds 551 * ioctls for vcpu fds
@@ -541,6 +589,10 @@ struct kvm_irq_routing {
541#define KVM_NMI _IO(KVMIO, 0x9a) 589#define KVM_NMI _IO(KVMIO, 0x9a)
542/* Available with KVM_CAP_SET_GUEST_DEBUG */ 590/* Available with KVM_CAP_SET_GUEST_DEBUG */
543#define KVM_SET_GUEST_DEBUG _IOW(KVMIO, 0x9b, struct kvm_guest_debug) 591#define KVM_SET_GUEST_DEBUG _IOW(KVMIO, 0x9b, struct kvm_guest_debug)
592/* MCE for x86 */
593#define KVM_X86_SETUP_MCE _IOW(KVMIO, 0x9c, __u64)
594#define KVM_X86_GET_MCE_CAP_SUPPORTED _IOR(KVMIO, 0x9d, __u64)
595#define KVM_X86_SET_MCE _IOW(KVMIO, 0x9e, struct kvm_x86_mce)
544 596
545/* 597/*
546 * Deprecated interfaces 598 * Deprecated interfaces
@@ -563,6 +615,9 @@ struct kvm_debug_guest {
563#define KVM_IA64_VCPU_GET_STACK _IOR(KVMIO, 0x9a, void *) 615#define KVM_IA64_VCPU_GET_STACK _IOR(KVMIO, 0x9a, void *)
564#define KVM_IA64_VCPU_SET_STACK _IOW(KVMIO, 0x9b, void *) 616#define KVM_IA64_VCPU_SET_STACK _IOW(KVMIO, 0x9b, void *)
565 617
618#define KVM_GET_PIT2 _IOR(KVMIO, 0x9f, struct kvm_pit_state2)
619#define KVM_SET_PIT2 _IOW(KVMIO, 0xa0, struct kvm_pit_state2)
620
566#define KVM_TRC_INJ_VIRQ (KVM_TRC_HANDLER + 0x02) 621#define KVM_TRC_INJ_VIRQ (KVM_TRC_HANDLER + 0x02)
567#define KVM_TRC_REDELIVER_EVT (KVM_TRC_HANDLER + 0x03) 622#define KVM_TRC_REDELIVER_EVT (KVM_TRC_HANDLER + 0x03)
568#define KVM_TRC_PEND_INTR (KVM_TRC_HANDLER + 0x04) 623#define KVM_TRC_PEND_INTR (KVM_TRC_HANDLER + 0x04)
@@ -633,7 +688,7 @@ struct kvm_assigned_msix_nr {
633 __u16 padding; 688 __u16 padding;
634}; 689};
635 690
636#define KVM_MAX_MSIX_PER_DEV 512 691#define KVM_MAX_MSIX_PER_DEV 256
637struct kvm_assigned_msix_entry { 692struct kvm_assigned_msix_entry {
638 __u32 assigned_dev_id; 693 __u32 assigned_dev_id;
639 __u32 gsi; 694 __u32 gsi;
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 3060bdc35ffe..4af56036a6bf 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -42,6 +42,7 @@
42 42
43#define KVM_USERSPACE_IRQ_SOURCE_ID 0 43#define KVM_USERSPACE_IRQ_SOURCE_ID 0
44 44
45struct kvm;
45struct kvm_vcpu; 46struct kvm_vcpu;
46extern struct kmem_cache *kvm_vcpu_cache; 47extern struct kmem_cache *kvm_vcpu_cache;
47 48
@@ -59,10 +60,18 @@ struct kvm_io_bus {
59 60
60void kvm_io_bus_init(struct kvm_io_bus *bus); 61void kvm_io_bus_init(struct kvm_io_bus *bus);
61void kvm_io_bus_destroy(struct kvm_io_bus *bus); 62void kvm_io_bus_destroy(struct kvm_io_bus *bus);
62struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus, 63int kvm_io_bus_write(struct kvm_io_bus *bus, gpa_t addr, int len,
63 gpa_t addr, int len, int is_write); 64 const void *val);
64void kvm_io_bus_register_dev(struct kvm_io_bus *bus, 65int kvm_io_bus_read(struct kvm_io_bus *bus, gpa_t addr, int len,
65 struct kvm_io_device *dev); 66 void *val);
67int __kvm_io_bus_register_dev(struct kvm_io_bus *bus,
68 struct kvm_io_device *dev);
69int kvm_io_bus_register_dev(struct kvm *kvm, struct kvm_io_bus *bus,
70 struct kvm_io_device *dev);
71void __kvm_io_bus_unregister_dev(struct kvm_io_bus *bus,
72 struct kvm_io_device *dev);
73void kvm_io_bus_unregister_dev(struct kvm *kvm, struct kvm_io_bus *bus,
74 struct kvm_io_device *dev);
66 75
67struct kvm_vcpu { 76struct kvm_vcpu {
68 struct kvm *kvm; 77 struct kvm *kvm;
@@ -103,7 +112,7 @@ struct kvm_memory_slot {
103 struct { 112 struct {
104 unsigned long rmap_pde; 113 unsigned long rmap_pde;
105 int write_count; 114 int write_count;
106 } *lpage_info; 115 } *lpage_info[KVM_NR_PAGE_SIZES - 1];
107 unsigned long userspace_addr; 116 unsigned long userspace_addr;
108 int user_alloc; 117 int user_alloc;
109}; 118};
@@ -124,7 +133,6 @@ struct kvm_kernel_irq_routing_entry {
124}; 133};
125 134
126struct kvm { 135struct kvm {
127 struct mutex lock; /* protects the vcpus array and APIC accesses */
128 spinlock_t mmu_lock; 136 spinlock_t mmu_lock;
129 spinlock_t requests_lock; 137 spinlock_t requests_lock;
130 struct rw_semaphore slots_lock; 138 struct rw_semaphore slots_lock;
@@ -132,10 +140,23 @@ struct kvm {
132 int nmemslots; 140 int nmemslots;
133 struct kvm_memory_slot memslots[KVM_MEMORY_SLOTS + 141 struct kvm_memory_slot memslots[KVM_MEMORY_SLOTS +
134 KVM_PRIVATE_MEM_SLOTS]; 142 KVM_PRIVATE_MEM_SLOTS];
143#ifdef CONFIG_KVM_APIC_ARCHITECTURE
144 u32 bsp_vcpu_id;
145 struct kvm_vcpu *bsp_vcpu;
146#endif
135 struct kvm_vcpu *vcpus[KVM_MAX_VCPUS]; 147 struct kvm_vcpu *vcpus[KVM_MAX_VCPUS];
148 atomic_t online_vcpus;
136 struct list_head vm_list; 149 struct list_head vm_list;
150 struct mutex lock;
137 struct kvm_io_bus mmio_bus; 151 struct kvm_io_bus mmio_bus;
138 struct kvm_io_bus pio_bus; 152 struct kvm_io_bus pio_bus;
153#ifdef CONFIG_HAVE_KVM_EVENTFD
154 struct {
155 spinlock_t lock;
156 struct list_head items;
157 } irqfds;
158 struct list_head ioeventfds;
159#endif
139 struct kvm_vm_stat stat; 160 struct kvm_vm_stat stat;
140 struct kvm_arch arch; 161 struct kvm_arch arch;
141 atomic_t users_count; 162 atomic_t users_count;
@@ -144,6 +165,7 @@ struct kvm {
144 struct kvm_coalesced_mmio_ring *coalesced_mmio_ring; 165 struct kvm_coalesced_mmio_ring *coalesced_mmio_ring;
145#endif 166#endif
146 167
168 struct mutex irq_lock;
147#ifdef CONFIG_HAVE_KVM_IRQCHIP 169#ifdef CONFIG_HAVE_KVM_IRQCHIP
148 struct list_head irq_routing; /* of kvm_kernel_irq_routing_entry */ 170 struct list_head irq_routing; /* of kvm_kernel_irq_routing_entry */
149 struct hlist_head mask_notifier_list; 171 struct hlist_head mask_notifier_list;
@@ -167,6 +189,17 @@ struct kvm {
167#define kvm_printf(kvm, fmt ...) printk(KERN_DEBUG fmt) 189#define kvm_printf(kvm, fmt ...) printk(KERN_DEBUG fmt)
168#define vcpu_printf(vcpu, fmt...) kvm_printf(vcpu->kvm, fmt) 190#define vcpu_printf(vcpu, fmt...) kvm_printf(vcpu->kvm, fmt)
169 191
192static inline struct kvm_vcpu *kvm_get_vcpu(struct kvm *kvm, int i)
193{
194 smp_rmb();
195 return kvm->vcpus[i];
196}
197
198#define kvm_for_each_vcpu(idx, vcpup, kvm) \
199 for (idx = 0, vcpup = kvm_get_vcpu(kvm, idx); \
200 idx < atomic_read(&kvm->online_vcpus) && vcpup; \
201 vcpup = kvm_get_vcpu(kvm, ++idx))
202
170int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id); 203int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id);
171void kvm_vcpu_uninit(struct kvm_vcpu *vcpu); 204void kvm_vcpu_uninit(struct kvm_vcpu *vcpu);
172 205
@@ -201,6 +234,7 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
201 struct kvm_userspace_memory_region *mem, 234 struct kvm_userspace_memory_region *mem,
202 struct kvm_memory_slot old, 235 struct kvm_memory_slot old,
203 int user_alloc); 236 int user_alloc);
237void kvm_disable_largepages(void);
204void kvm_arch_flush_shadow(struct kvm *kvm); 238void kvm_arch_flush_shadow(struct kvm *kvm);
205gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn); 239gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn);
206struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn); 240struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn);
@@ -243,8 +277,6 @@ long kvm_arch_dev_ioctl(struct file *filp,
243 unsigned int ioctl, unsigned long arg); 277 unsigned int ioctl, unsigned long arg);
244long kvm_arch_vcpu_ioctl(struct file *filp, 278long kvm_arch_vcpu_ioctl(struct file *filp,
245 unsigned int ioctl, unsigned long arg); 279 unsigned int ioctl, unsigned long arg);
246void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
247void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu);
248 280
249int kvm_dev_ioctl_check_extension(long ext); 281int kvm_dev_ioctl_check_extension(long ext);
250 282
@@ -300,7 +332,6 @@ int kvm_arch_hardware_setup(void);
300void kvm_arch_hardware_unsetup(void); 332void kvm_arch_hardware_unsetup(void);
301void kvm_arch_check_processor_compat(void *rtn); 333void kvm_arch_check_processor_compat(void *rtn);
302int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu); 334int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu);
303int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu);
304 335
305void kvm_free_physmem(struct kvm *kvm); 336void kvm_free_physmem(struct kvm *kvm);
306 337
@@ -309,8 +340,6 @@ void kvm_arch_destroy_vm(struct kvm *kvm);
309void kvm_free_all_assigned_devices(struct kvm *kvm); 340void kvm_free_all_assigned_devices(struct kvm *kvm);
310void kvm_arch_sync_events(struct kvm *kvm); 341void kvm_arch_sync_events(struct kvm *kvm);
311 342
312int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
313int kvm_cpu_has_interrupt(struct kvm_vcpu *v);
314int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu); 343int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu);
315void kvm_vcpu_kick(struct kvm_vcpu *vcpu); 344void kvm_vcpu_kick(struct kvm_vcpu *vcpu);
316 345
@@ -366,7 +395,8 @@ int kvm_set_irq(struct kvm *kvm, int irq_source_id, int irq, int level);
366void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin); 395void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin);
367void kvm_register_irq_ack_notifier(struct kvm *kvm, 396void kvm_register_irq_ack_notifier(struct kvm *kvm,
368 struct kvm_irq_ack_notifier *kian); 397 struct kvm_irq_ack_notifier *kian);
369void kvm_unregister_irq_ack_notifier(struct kvm_irq_ack_notifier *kian); 398void kvm_unregister_irq_ack_notifier(struct kvm *kvm,
399 struct kvm_irq_ack_notifier *kian);
370int kvm_request_irq_source_id(struct kvm *kvm); 400int kvm_request_irq_source_id(struct kvm *kvm);
371void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id); 401void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id);
372 402
@@ -459,37 +489,6 @@ struct kvm_stats_debugfs_item {
459extern struct kvm_stats_debugfs_item debugfs_entries[]; 489extern struct kvm_stats_debugfs_item debugfs_entries[];
460extern struct dentry *kvm_debugfs_dir; 490extern struct dentry *kvm_debugfs_dir;
461 491
462#define KVMTRACE_5D(evt, vcpu, d1, d2, d3, d4, d5, name) \
463 trace_mark(kvm_trace_##name, "%u %p %u %u %u %u %u %u", KVM_TRC_##evt, \
464 vcpu, 5, d1, d2, d3, d4, d5)
465#define KVMTRACE_4D(evt, vcpu, d1, d2, d3, d4, name) \
466 trace_mark(kvm_trace_##name, "%u %p %u %u %u %u %u %u", KVM_TRC_##evt, \
467 vcpu, 4, d1, d2, d3, d4, 0)
468#define KVMTRACE_3D(evt, vcpu, d1, d2, d3, name) \
469 trace_mark(kvm_trace_##name, "%u %p %u %u %u %u %u %u", KVM_TRC_##evt, \
470 vcpu, 3, d1, d2, d3, 0, 0)
471#define KVMTRACE_2D(evt, vcpu, d1, d2, name) \
472 trace_mark(kvm_trace_##name, "%u %p %u %u %u %u %u %u", KVM_TRC_##evt, \
473 vcpu, 2, d1, d2, 0, 0, 0)
474#define KVMTRACE_1D(evt, vcpu, d1, name) \
475 trace_mark(kvm_trace_##name, "%u %p %u %u %u %u %u %u", KVM_TRC_##evt, \
476 vcpu, 1, d1, 0, 0, 0, 0)
477#define KVMTRACE_0D(evt, vcpu, name) \
478 trace_mark(kvm_trace_##name, "%u %p %u %u %u %u %u %u", KVM_TRC_##evt, \
479 vcpu, 0, 0, 0, 0, 0, 0)
480
481#ifdef CONFIG_KVM_TRACE
482int kvm_trace_ioctl(unsigned int ioctl, unsigned long arg);
483void kvm_trace_cleanup(void);
484#else
485static inline
486int kvm_trace_ioctl(unsigned int ioctl, unsigned long arg)
487{
488 return -EINVAL;
489}
490#define kvm_trace_cleanup() ((void)0)
491#endif
492
493#ifdef KVM_ARCH_WANT_MMU_NOTIFIER 492#ifdef KVM_ARCH_WANT_MMU_NOTIFIER
494static inline int mmu_notifier_retry(struct kvm_vcpu *vcpu, unsigned long mmu_seq) 493static inline int mmu_notifier_retry(struct kvm_vcpu *vcpu, unsigned long mmu_seq)
495{ 494{
@@ -525,4 +524,33 @@ static inline void kvm_free_irq_routing(struct kvm *kvm) {}
525 524
526#endif 525#endif
527 526
527#ifdef CONFIG_HAVE_KVM_EVENTFD
528
529void kvm_eventfd_init(struct kvm *kvm);
530int kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags);
531void kvm_irqfd_release(struct kvm *kvm);
532int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args);
533
534#else
535
536static inline void kvm_eventfd_init(struct kvm *kvm) {}
537static inline int kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags)
538{
539 return -EINVAL;
540}
541
542static inline void kvm_irqfd_release(struct kvm *kvm) {}
543static inline int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
544{
545 return -ENOSYS;
546}
547
548#endif /* CONFIG_HAVE_KVM_EVENTFD */
549
550#ifdef CONFIG_KVM_APIC_ARCHITECTURE
551static inline bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu)
552{
553 return vcpu->kvm->bsp_vcpu_id == vcpu->vcpu_id;
554}
555#endif
528#endif 556#endif
diff --git a/include/linux/kvm_para.h b/include/linux/kvm_para.h
index 3ddce03766ca..d73109243fda 100644
--- a/include/linux/kvm_para.h
+++ b/include/linux/kvm_para.h
@@ -13,6 +13,7 @@
13#define KVM_ENOSYS 1000 13#define KVM_ENOSYS 1000
14#define KVM_EFAULT EFAULT 14#define KVM_EFAULT EFAULT
15#define KVM_E2BIG E2BIG 15#define KVM_E2BIG E2BIG
16#define KVM_EPERM EPERM
16 17
17#define KVM_HC_VAPIC_POLL_IRQ 1 18#define KVM_HC_VAPIC_POLL_IRQ 1
18#define KVM_HC_MMU_OP 2 19#define KVM_HC_MMU_OP 2
diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h
new file mode 100644
index 000000000000..dbe108455275
--- /dev/null
+++ b/include/trace/events/kvm.h
@@ -0,0 +1,151 @@
1#if !defined(_TRACE_KVM_MAIN_H) || defined(TRACE_HEADER_MULTI_READ)
2#define _TRACE_KVM_MAIN_H
3
4#include <linux/tracepoint.h>
5
6#undef TRACE_SYSTEM
7#define TRACE_SYSTEM kvm
8#define TRACE_INCLUDE_FILE kvm
9
10#if defined(__KVM_HAVE_IOAPIC)
11TRACE_EVENT(kvm_set_irq,
12 TP_PROTO(unsigned int gsi, int level, int irq_source_id),
13 TP_ARGS(gsi, level, irq_source_id),
14
15 TP_STRUCT__entry(
16 __field( unsigned int, gsi )
17 __field( int, level )
18 __field( int, irq_source_id )
19 ),
20
21 TP_fast_assign(
22 __entry->gsi = gsi;
23 __entry->level = level;
24 __entry->irq_source_id = irq_source_id;
25 ),
26
27 TP_printk("gsi %u level %d source %d",
28 __entry->gsi, __entry->level, __entry->irq_source_id)
29);
30
31#define kvm_deliver_mode \
32 {0x0, "Fixed"}, \
33 {0x1, "LowPrio"}, \
34 {0x2, "SMI"}, \
35 {0x3, "Res3"}, \
36 {0x4, "NMI"}, \
37 {0x5, "INIT"}, \
38 {0x6, "SIPI"}, \
39 {0x7, "ExtINT"}
40
41TRACE_EVENT(kvm_ioapic_set_irq,
42 TP_PROTO(__u64 e, int pin, bool coalesced),
43 TP_ARGS(e, pin, coalesced),
44
45 TP_STRUCT__entry(
46 __field( __u64, e )
47 __field( int, pin )
48 __field( bool, coalesced )
49 ),
50
51 TP_fast_assign(
52 __entry->e = e;
53 __entry->pin = pin;
54 __entry->coalesced = coalesced;
55 ),
56
57 TP_printk("pin %u dst %x vec=%u (%s|%s|%s%s)%s",
58 __entry->pin, (u8)(__entry->e >> 56), (u8)__entry->e,
59 __print_symbolic((__entry->e >> 8 & 0x7), kvm_deliver_mode),
60 (__entry->e & (1<<11)) ? "logical" : "physical",
61 (__entry->e & (1<<15)) ? "level" : "edge",
62 (__entry->e & (1<<16)) ? "|masked" : "",
63 __entry->coalesced ? " (coalesced)" : "")
64);
65
66TRACE_EVENT(kvm_msi_set_irq,
67 TP_PROTO(__u64 address, __u64 data),
68 TP_ARGS(address, data),
69
70 TP_STRUCT__entry(
71 __field( __u64, address )
72 __field( __u64, data )
73 ),
74
75 TP_fast_assign(
76 __entry->address = address;
77 __entry->data = data;
78 ),
79
80 TP_printk("dst %u vec %x (%s|%s|%s%s)",
81 (u8)(__entry->address >> 12), (u8)__entry->data,
82 __print_symbolic((__entry->data >> 8 & 0x7), kvm_deliver_mode),
83 (__entry->address & (1<<2)) ? "logical" : "physical",
84 (__entry->data & (1<<15)) ? "level" : "edge",
85 (__entry->address & (1<<3)) ? "|rh" : "")
86);
87
88#define kvm_irqchips \
89 {KVM_IRQCHIP_PIC_MASTER, "PIC master"}, \
90 {KVM_IRQCHIP_PIC_SLAVE, "PIC slave"}, \
91 {KVM_IRQCHIP_IOAPIC, "IOAPIC"}
92
93TRACE_EVENT(kvm_ack_irq,
94 TP_PROTO(unsigned int irqchip, unsigned int pin),
95 TP_ARGS(irqchip, pin),
96
97 TP_STRUCT__entry(
98 __field( unsigned int, irqchip )
99 __field( unsigned int, pin )
100 ),
101
102 TP_fast_assign(
103 __entry->irqchip = irqchip;
104 __entry->pin = pin;
105 ),
106
107 TP_printk("irqchip %s pin %u",
108 __print_symbolic(__entry->irqchip, kvm_irqchips),
109 __entry->pin)
110);
111
112
113
114#endif /* defined(__KVM_HAVE_IOAPIC) */
115
116#define KVM_TRACE_MMIO_READ_UNSATISFIED 0
117#define KVM_TRACE_MMIO_READ 1
118#define KVM_TRACE_MMIO_WRITE 2
119
120#define kvm_trace_symbol_mmio \
121 { KVM_TRACE_MMIO_READ_UNSATISFIED, "unsatisfied-read" }, \
122 { KVM_TRACE_MMIO_READ, "read" }, \
123 { KVM_TRACE_MMIO_WRITE, "write" }
124
125TRACE_EVENT(kvm_mmio,
126 TP_PROTO(int type, int len, u64 gpa, u64 val),
127 TP_ARGS(type, len, gpa, val),
128
129 TP_STRUCT__entry(
130 __field( u32, type )
131 __field( u32, len )
132 __field( u64, gpa )
133 __field( u64, val )
134 ),
135
136 TP_fast_assign(
137 __entry->type = type;
138 __entry->len = len;
139 __entry->gpa = gpa;
140 __entry->val = val;
141 ),
142
143 TP_printk("mmio %s len %u gpa 0x%llx val 0x%llx",
144 __print_symbolic(__entry->type, kvm_trace_symbol_mmio),
145 __entry->len, __entry->gpa, __entry->val)
146);
147
148#endif /* _TRACE_KVM_MAIN_H */
149
150/* This part must be outside protection */
151#include <trace/define_trace.h>
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index cafdcee154e8..b16d63634777 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -234,6 +234,7 @@ unsigned long vma_kernel_pagesize(struct vm_area_struct *vma)
234 234
235 return 1UL << (hstate->order + PAGE_SHIFT); 235 return 1UL << (hstate->order + PAGE_SHIFT);
236} 236}
237EXPORT_SYMBOL_GPL(vma_kernel_pagesize);
237 238
238/* 239/*
239 * Return the page size being used by the MMU to back a VMA. In the majority 240 * Return the page size being used by the MMU to back a VMA. In the majority
diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
new file mode 100644
index 000000000000..daece36c0a57
--- /dev/null
+++ b/virt/kvm/Kconfig
@@ -0,0 +1,14 @@
1# KVM common configuration items and defaults
2
3config HAVE_KVM
4 bool
5
6config HAVE_KVM_IRQCHIP
7 bool
8
9config HAVE_KVM_EVENTFD
10 bool
11 select EVENTFD
12
13config KVM_APIC_ARCHITECTURE
14 bool
diff --git a/virt/kvm/coalesced_mmio.c b/virt/kvm/coalesced_mmio.c
index 5ae620d32fac..04d69cd7049b 100644
--- a/virt/kvm/coalesced_mmio.c
+++ b/virt/kvm/coalesced_mmio.c
@@ -14,32 +14,28 @@
14 14
15#include "coalesced_mmio.h" 15#include "coalesced_mmio.h"
16 16
17static int coalesced_mmio_in_range(struct kvm_io_device *this, 17static inline struct kvm_coalesced_mmio_dev *to_mmio(struct kvm_io_device *dev)
18 gpa_t addr, int len, int is_write) 18{
19 return container_of(dev, struct kvm_coalesced_mmio_dev, dev);
20}
21
22static int coalesced_mmio_in_range(struct kvm_coalesced_mmio_dev *dev,
23 gpa_t addr, int len)
19{ 24{
20 struct kvm_coalesced_mmio_dev *dev =
21 (struct kvm_coalesced_mmio_dev*)this->private;
22 struct kvm_coalesced_mmio_zone *zone; 25 struct kvm_coalesced_mmio_zone *zone;
23 int next; 26 struct kvm_coalesced_mmio_ring *ring;
27 unsigned avail;
24 int i; 28 int i;
25 29
26 if (!is_write)
27 return 0;
28
29 /* kvm->lock is taken by the caller and must be not released before
30 * dev.read/write
31 */
32
33 /* Are we able to batch it ? */ 30 /* Are we able to batch it ? */
34 31
35 /* last is the first free entry 32 /* last is the first free entry
36 * check if we don't meet the first used entry 33 * check if we don't meet the first used entry
37 * there is always one unused entry in the buffer 34 * there is always one unused entry in the buffer
38 */ 35 */
39 36 ring = dev->kvm->coalesced_mmio_ring;
40 next = (dev->kvm->coalesced_mmio_ring->last + 1) % 37 avail = (ring->first - ring->last - 1) % KVM_COALESCED_MMIO_MAX;
41 KVM_COALESCED_MMIO_MAX; 38 if (avail < KVM_MAX_VCPUS) {
42 if (next == dev->kvm->coalesced_mmio_ring->first) {
43 /* full */ 39 /* full */
44 return 0; 40 return 0;
45 } 41 }
@@ -60,14 +56,15 @@ static int coalesced_mmio_in_range(struct kvm_io_device *this,
60 return 0; 56 return 0;
61} 57}
62 58
63static void coalesced_mmio_write(struct kvm_io_device *this, 59static int coalesced_mmio_write(struct kvm_io_device *this,
64 gpa_t addr, int len, const void *val) 60 gpa_t addr, int len, const void *val)
65{ 61{
66 struct kvm_coalesced_mmio_dev *dev = 62 struct kvm_coalesced_mmio_dev *dev = to_mmio(this);
67 (struct kvm_coalesced_mmio_dev*)this->private;
68 struct kvm_coalesced_mmio_ring *ring = dev->kvm->coalesced_mmio_ring; 63 struct kvm_coalesced_mmio_ring *ring = dev->kvm->coalesced_mmio_ring;
64 if (!coalesced_mmio_in_range(dev, addr, len))
65 return -EOPNOTSUPP;
69 66
70 /* kvm->lock must be taken by caller before call to in_range()*/ 67 spin_lock(&dev->lock);
71 68
72 /* copy data in first free entry of the ring */ 69 /* copy data in first free entry of the ring */
73 70
@@ -76,29 +73,40 @@ static void coalesced_mmio_write(struct kvm_io_device *this,
76 memcpy(ring->coalesced_mmio[ring->last].data, val, len); 73 memcpy(ring->coalesced_mmio[ring->last].data, val, len);
77 smp_wmb(); 74 smp_wmb();
78 ring->last = (ring->last + 1) % KVM_COALESCED_MMIO_MAX; 75 ring->last = (ring->last + 1) % KVM_COALESCED_MMIO_MAX;
76 spin_unlock(&dev->lock);
77 return 0;
79} 78}
80 79
81static void coalesced_mmio_destructor(struct kvm_io_device *this) 80static void coalesced_mmio_destructor(struct kvm_io_device *this)
82{ 81{
83 kfree(this); 82 struct kvm_coalesced_mmio_dev *dev = to_mmio(this);
83
84 kfree(dev);
84} 85}
85 86
87static const struct kvm_io_device_ops coalesced_mmio_ops = {
88 .write = coalesced_mmio_write,
89 .destructor = coalesced_mmio_destructor,
90};
91
86int kvm_coalesced_mmio_init(struct kvm *kvm) 92int kvm_coalesced_mmio_init(struct kvm *kvm)
87{ 93{
88 struct kvm_coalesced_mmio_dev *dev; 94 struct kvm_coalesced_mmio_dev *dev;
95 int ret;
89 96
90 dev = kzalloc(sizeof(struct kvm_coalesced_mmio_dev), GFP_KERNEL); 97 dev = kzalloc(sizeof(struct kvm_coalesced_mmio_dev), GFP_KERNEL);
91 if (!dev) 98 if (!dev)
92 return -ENOMEM; 99 return -ENOMEM;
93 dev->dev.write = coalesced_mmio_write; 100 spin_lock_init(&dev->lock);
94 dev->dev.in_range = coalesced_mmio_in_range; 101 kvm_iodevice_init(&dev->dev, &coalesced_mmio_ops);
95 dev->dev.destructor = coalesced_mmio_destructor;
96 dev->dev.private = dev;
97 dev->kvm = kvm; 102 dev->kvm = kvm;
98 kvm->coalesced_mmio_dev = dev; 103 kvm->coalesced_mmio_dev = dev;
99 kvm_io_bus_register_dev(&kvm->mmio_bus, &dev->dev);
100 104
101 return 0; 105 ret = kvm_io_bus_register_dev(kvm, &kvm->mmio_bus, &dev->dev);
106 if (ret < 0)
107 kfree(dev);
108
109 return ret;
102} 110}
103 111
104int kvm_vm_ioctl_register_coalesced_mmio(struct kvm *kvm, 112int kvm_vm_ioctl_register_coalesced_mmio(struct kvm *kvm,
@@ -109,16 +117,16 @@ int kvm_vm_ioctl_register_coalesced_mmio(struct kvm *kvm,
109 if (dev == NULL) 117 if (dev == NULL)
110 return -EINVAL; 118 return -EINVAL;
111 119
112 mutex_lock(&kvm->lock); 120 down_write(&kvm->slots_lock);
113 if (dev->nb_zones >= KVM_COALESCED_MMIO_ZONE_MAX) { 121 if (dev->nb_zones >= KVM_COALESCED_MMIO_ZONE_MAX) {
114 mutex_unlock(&kvm->lock); 122 up_write(&kvm->slots_lock);
115 return -ENOBUFS; 123 return -ENOBUFS;
116 } 124 }
117 125
118 dev->zone[dev->nb_zones] = *zone; 126 dev->zone[dev->nb_zones] = *zone;
119 dev->nb_zones++; 127 dev->nb_zones++;
120 128
121 mutex_unlock(&kvm->lock); 129 up_write(&kvm->slots_lock);
122 return 0; 130 return 0;
123} 131}
124 132
@@ -132,7 +140,7 @@ int kvm_vm_ioctl_unregister_coalesced_mmio(struct kvm *kvm,
132 if (dev == NULL) 140 if (dev == NULL)
133 return -EINVAL; 141 return -EINVAL;
134 142
135 mutex_lock(&kvm->lock); 143 down_write(&kvm->slots_lock);
136 144
137 i = dev->nb_zones; 145 i = dev->nb_zones;
138 while(i) { 146 while(i) {
@@ -150,7 +158,7 @@ int kvm_vm_ioctl_unregister_coalesced_mmio(struct kvm *kvm,
150 i--; 158 i--;
151 } 159 }
152 160
153 mutex_unlock(&kvm->lock); 161 up_write(&kvm->slots_lock);
154 162
155 return 0; 163 return 0;
156} 164}
diff --git a/virt/kvm/coalesced_mmio.h b/virt/kvm/coalesced_mmio.h
index 5ac0ec628461..4b49f27fa31e 100644
--- a/virt/kvm/coalesced_mmio.h
+++ b/virt/kvm/coalesced_mmio.h
@@ -12,6 +12,7 @@
12struct kvm_coalesced_mmio_dev { 12struct kvm_coalesced_mmio_dev {
13 struct kvm_io_device dev; 13 struct kvm_io_device dev;
14 struct kvm *kvm; 14 struct kvm *kvm;
15 spinlock_t lock;
15 int nb_zones; 16 int nb_zones;
16 struct kvm_coalesced_mmio_zone zone[KVM_COALESCED_MMIO_ZONE_MAX]; 17 struct kvm_coalesced_mmio_zone zone[KVM_COALESCED_MMIO_ZONE_MAX];
17}; 18};
diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
new file mode 100644
index 000000000000..bb4ebd89b9ff
--- /dev/null
+++ b/virt/kvm/eventfd.c
@@ -0,0 +1,578 @@
1/*
2 * kvm eventfd support - use eventfd objects to signal various KVM events
3 *
4 * Copyright 2009 Novell. All Rights Reserved.
5 *
6 * Author:
7 * Gregory Haskins <ghaskins@novell.com>
8 *
9 * This file is free software; you can redistribute it and/or modify
10 * it under the terms of version 2 of the GNU General Public License
11 * as published by the Free Software Foundation.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software Foundation,
20 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
21 */
22
23#include <linux/kvm_host.h>
24#include <linux/kvm.h>
25#include <linux/workqueue.h>
26#include <linux/syscalls.h>
27#include <linux/wait.h>
28#include <linux/poll.h>
29#include <linux/file.h>
30#include <linux/list.h>
31#include <linux/eventfd.h>
32#include <linux/kernel.h>
33
34#include "iodev.h"
35
36/*
37 * --------------------------------------------------------------------
38 * irqfd: Allows an fd to be used to inject an interrupt to the guest
39 *
40 * Credit goes to Avi Kivity for the original idea.
41 * --------------------------------------------------------------------
42 */
43
44struct _irqfd {
45 struct kvm *kvm;
46 struct eventfd_ctx *eventfd;
47 int gsi;
48 struct list_head list;
49 poll_table pt;
50 wait_queue_head_t *wqh;
51 wait_queue_t wait;
52 struct work_struct inject;
53 struct work_struct shutdown;
54};
55
56static struct workqueue_struct *irqfd_cleanup_wq;
57
58static void
59irqfd_inject(struct work_struct *work)
60{
61 struct _irqfd *irqfd = container_of(work, struct _irqfd, inject);
62 struct kvm *kvm = irqfd->kvm;
63
64 mutex_lock(&kvm->irq_lock);
65 kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 1);
66 kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 0);
67 mutex_unlock(&kvm->irq_lock);
68}
69
70/*
71 * Race-free decouple logic (ordering is critical)
72 */
73static void
74irqfd_shutdown(struct work_struct *work)
75{
76 struct _irqfd *irqfd = container_of(work, struct _irqfd, shutdown);
77
78 /*
79 * Synchronize with the wait-queue and unhook ourselves to prevent
80 * further events.
81 */
82 remove_wait_queue(irqfd->wqh, &irqfd->wait);
83
84 /*
85 * We know no new events will be scheduled at this point, so block
86 * until all previously outstanding events have completed
87 */
88 flush_work(&irqfd->inject);
89
90 /*
91 * It is now safe to release the object's resources
92 */
93 eventfd_ctx_put(irqfd->eventfd);
94 kfree(irqfd);
95}
96
97
98/* assumes kvm->irqfds.lock is held */
99static bool
100irqfd_is_active(struct _irqfd *irqfd)
101{
102 return list_empty(&irqfd->list) ? false : true;
103}
104
105/*
106 * Mark the irqfd as inactive and schedule it for removal
107 *
108 * assumes kvm->irqfds.lock is held
109 */
110static void
111irqfd_deactivate(struct _irqfd *irqfd)
112{
113 BUG_ON(!irqfd_is_active(irqfd));
114
115 list_del_init(&irqfd->list);
116
117 queue_work(irqfd_cleanup_wq, &irqfd->shutdown);
118}
119
120/*
121 * Called with wqh->lock held and interrupts disabled
122 */
123static int
124irqfd_wakeup(wait_queue_t *wait, unsigned mode, int sync, void *key)
125{
126 struct _irqfd *irqfd = container_of(wait, struct _irqfd, wait);
127 unsigned long flags = (unsigned long)key;
128
129 if (flags & POLLIN)
130 /* An event has been signaled, inject an interrupt */
131 schedule_work(&irqfd->inject);
132
133 if (flags & POLLHUP) {
134 /* The eventfd is closing, detach from KVM */
135 struct kvm *kvm = irqfd->kvm;
136 unsigned long flags;
137
138 spin_lock_irqsave(&kvm->irqfds.lock, flags);
139
140 /*
141 * We must check if someone deactivated the irqfd before
142 * we could acquire the irqfds.lock since the item is
143 * deactivated from the KVM side before it is unhooked from
144 * the wait-queue. If it is already deactivated, we can
145 * simply return knowing the other side will cleanup for us.
146 * We cannot race against the irqfd going away since the
147 * other side is required to acquire wqh->lock, which we hold
148 */
149 if (irqfd_is_active(irqfd))
150 irqfd_deactivate(irqfd);
151
152 spin_unlock_irqrestore(&kvm->irqfds.lock, flags);
153 }
154
155 return 0;
156}
157
158static void
159irqfd_ptable_queue_proc(struct file *file, wait_queue_head_t *wqh,
160 poll_table *pt)
161{
162 struct _irqfd *irqfd = container_of(pt, struct _irqfd, pt);
163
164 irqfd->wqh = wqh;
165 add_wait_queue(wqh, &irqfd->wait);
166}
167
168static int
169kvm_irqfd_assign(struct kvm *kvm, int fd, int gsi)
170{
171 struct _irqfd *irqfd;
172 struct file *file = NULL;
173 struct eventfd_ctx *eventfd = NULL;
174 int ret;
175 unsigned int events;
176
177 irqfd = kzalloc(sizeof(*irqfd), GFP_KERNEL);
178 if (!irqfd)
179 return -ENOMEM;
180
181 irqfd->kvm = kvm;
182 irqfd->gsi = gsi;
183 INIT_LIST_HEAD(&irqfd->list);
184 INIT_WORK(&irqfd->inject, irqfd_inject);
185 INIT_WORK(&irqfd->shutdown, irqfd_shutdown);
186
187 file = eventfd_fget(fd);
188 if (IS_ERR(file)) {
189 ret = PTR_ERR(file);
190 goto fail;
191 }
192
193 eventfd = eventfd_ctx_fileget(file);
194 if (IS_ERR(eventfd)) {
195 ret = PTR_ERR(eventfd);
196 goto fail;
197 }
198
199 irqfd->eventfd = eventfd;
200
201 /*
202 * Install our own custom wake-up handling so we are notified via
203 * a callback whenever someone signals the underlying eventfd
204 */
205 init_waitqueue_func_entry(&irqfd->wait, irqfd_wakeup);
206 init_poll_funcptr(&irqfd->pt, irqfd_ptable_queue_proc);
207
208 events = file->f_op->poll(file, &irqfd->pt);
209
210 spin_lock_irq(&kvm->irqfds.lock);
211 list_add_tail(&irqfd->list, &kvm->irqfds.items);
212 spin_unlock_irq(&kvm->irqfds.lock);
213
214 /*
215 * Check if there was an event already pending on the eventfd
216 * before we registered, and trigger it as if we didn't miss it.
217 */
218 if (events & POLLIN)
219 schedule_work(&irqfd->inject);
220
221 /*
222 * do not drop the file until the irqfd is fully initialized, otherwise
223 * we might race against the POLLHUP
224 */
225 fput(file);
226
227 return 0;
228
229fail:
230 if (eventfd && !IS_ERR(eventfd))
231 eventfd_ctx_put(eventfd);
232
233 if (!IS_ERR(file))
234 fput(file);
235
236 kfree(irqfd);
237 return ret;
238}
239
240void
241kvm_eventfd_init(struct kvm *kvm)
242{
243 spin_lock_init(&kvm->irqfds.lock);
244 INIT_LIST_HEAD(&kvm->irqfds.items);
245 INIT_LIST_HEAD(&kvm->ioeventfds);
246}
247
248/*
249 * shutdown any irqfd's that match fd+gsi
250 */
251static int
252kvm_irqfd_deassign(struct kvm *kvm, int fd, int gsi)
253{
254 struct _irqfd *irqfd, *tmp;
255 struct eventfd_ctx *eventfd;
256
257 eventfd = eventfd_ctx_fdget(fd);
258 if (IS_ERR(eventfd))
259 return PTR_ERR(eventfd);
260
261 spin_lock_irq(&kvm->irqfds.lock);
262
263 list_for_each_entry_safe(irqfd, tmp, &kvm->irqfds.items, list) {
264 if (irqfd->eventfd == eventfd && irqfd->gsi == gsi)
265 irqfd_deactivate(irqfd);
266 }
267
268 spin_unlock_irq(&kvm->irqfds.lock);
269 eventfd_ctx_put(eventfd);
270
271 /*
272 * Block until we know all outstanding shutdown jobs have completed
273 * so that we guarantee there will not be any more interrupts on this
274 * gsi once this deassign function returns.
275 */
276 flush_workqueue(irqfd_cleanup_wq);
277
278 return 0;
279}
280
281int
282kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags)
283{
284 if (flags & KVM_IRQFD_FLAG_DEASSIGN)
285 return kvm_irqfd_deassign(kvm, fd, gsi);
286
287 return kvm_irqfd_assign(kvm, fd, gsi);
288}
289
290/*
291 * This function is called as the kvm VM fd is being released. Shutdown all
292 * irqfds that still remain open
293 */
294void
295kvm_irqfd_release(struct kvm *kvm)
296{
297 struct _irqfd *irqfd, *tmp;
298
299 spin_lock_irq(&kvm->irqfds.lock);
300
301 list_for_each_entry_safe(irqfd, tmp, &kvm->irqfds.items, list)
302 irqfd_deactivate(irqfd);
303
304 spin_unlock_irq(&kvm->irqfds.lock);
305
306 /*
307 * Block until we know all outstanding shutdown jobs have completed
308 * since we do not take a kvm* reference.
309 */
310 flush_workqueue(irqfd_cleanup_wq);
311
312}
313
314/*
315 * create a host-wide workqueue for issuing deferred shutdown requests
316 * aggregated from all vm* instances. We need our own isolated single-thread
317 * queue to prevent deadlock against flushing the normal work-queue.
318 */
319static int __init irqfd_module_init(void)
320{
321 irqfd_cleanup_wq = create_singlethread_workqueue("kvm-irqfd-cleanup");
322 if (!irqfd_cleanup_wq)
323 return -ENOMEM;
324
325 return 0;
326}
327
328static void __exit irqfd_module_exit(void)
329{
330 destroy_workqueue(irqfd_cleanup_wq);
331}
332
333module_init(irqfd_module_init);
334module_exit(irqfd_module_exit);
335
336/*
337 * --------------------------------------------------------------------
338 * ioeventfd: translate a PIO/MMIO memory write to an eventfd signal.
339 *
340 * userspace can register a PIO/MMIO address with an eventfd for receiving
341 * notification when the memory has been touched.
342 * --------------------------------------------------------------------
343 */
344
345struct _ioeventfd {
346 struct list_head list;
347 u64 addr;
348 int length;
349 struct eventfd_ctx *eventfd;
350 u64 datamatch;
351 struct kvm_io_device dev;
352 bool wildcard;
353};
354
355static inline struct _ioeventfd *
356to_ioeventfd(struct kvm_io_device *dev)
357{
358 return container_of(dev, struct _ioeventfd, dev);
359}
360
361static void
362ioeventfd_release(struct _ioeventfd *p)
363{
364 eventfd_ctx_put(p->eventfd);
365 list_del(&p->list);
366 kfree(p);
367}
368
369static bool
370ioeventfd_in_range(struct _ioeventfd *p, gpa_t addr, int len, const void *val)
371{
372 u64 _val;
373
374 if (!(addr == p->addr && len == p->length))
375 /* address-range must be precise for a hit */
376 return false;
377
378 if (p->wildcard)
379 /* all else equal, wildcard is always a hit */
380 return true;
381
382 /* otherwise, we have to actually compare the data */
383
384 BUG_ON(!IS_ALIGNED((unsigned long)val, len));
385
386 switch (len) {
387 case 1:
388 _val = *(u8 *)val;
389 break;
390 case 2:
391 _val = *(u16 *)val;
392 break;
393 case 4:
394 _val = *(u32 *)val;
395 break;
396 case 8:
397 _val = *(u64 *)val;
398 break;
399 default:
400 return false;
401 }
402
403 return _val == p->datamatch ? true : false;
404}
405
406/* MMIO/PIO writes trigger an event if the addr/val match */
407static int
408ioeventfd_write(struct kvm_io_device *this, gpa_t addr, int len,
409 const void *val)
410{
411 struct _ioeventfd *p = to_ioeventfd(this);
412
413 if (!ioeventfd_in_range(p, addr, len, val))
414 return -EOPNOTSUPP;
415
416 eventfd_signal(p->eventfd, 1);
417 return 0;
418}
419
/*
 * Destructor hook, called while KVM is shutting down completely.  There is
 * no need to worry about locking at that point; simply release the entry.
 */
static void
ioeventfd_destructor(struct kvm_io_device *this)
{
	ioeventfd_release(to_ioeventfd(this));
}
431
432static const struct kvm_io_device_ops ioeventfd_ops = {
433 .write = ioeventfd_write,
434 .destructor = ioeventfd_destructor,
435};
436
437/* assumes kvm->slots_lock held */
438static bool
439ioeventfd_check_collision(struct kvm *kvm, struct _ioeventfd *p)
440{
441 struct _ioeventfd *_p;
442
443 list_for_each_entry(_p, &kvm->ioeventfds, list)
444 if (_p->addr == p->addr && _p->length == p->length &&
445 (_p->wildcard || p->wildcard ||
446 _p->datamatch == p->datamatch))
447 return true;
448
449 return false;
450}
451
452static int
453kvm_assign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
454{
455 int pio = args->flags & KVM_IOEVENTFD_FLAG_PIO;
456 struct kvm_io_bus *bus = pio ? &kvm->pio_bus : &kvm->mmio_bus;
457 struct _ioeventfd *p;
458 struct eventfd_ctx *eventfd;
459 int ret;
460
461 /* must be natural-word sized */
462 switch (args->len) {
463 case 1:
464 case 2:
465 case 4:
466 case 8:
467 break;
468 default:
469 return -EINVAL;
470 }
471
472 /* check for range overflow */
473 if (args->addr + args->len < args->addr)
474 return -EINVAL;
475
476 /* check for extra flags that we don't understand */
477 if (args->flags & ~KVM_IOEVENTFD_VALID_FLAG_MASK)
478 return -EINVAL;
479
480 eventfd = eventfd_ctx_fdget(args->fd);
481 if (IS_ERR(eventfd))
482 return PTR_ERR(eventfd);
483
484 p = kzalloc(sizeof(*p), GFP_KERNEL);
485 if (!p) {
486 ret = -ENOMEM;
487 goto fail;
488 }
489
490 INIT_LIST_HEAD(&p->list);
491 p->addr = args->addr;
492 p->length = args->len;
493 p->eventfd = eventfd;
494
495 /* The datamatch feature is optional, otherwise this is a wildcard */
496 if (args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH)
497 p->datamatch = args->datamatch;
498 else
499 p->wildcard = true;
500
501 down_write(&kvm->slots_lock);
502
503 /* Verify that there isnt a match already */
504 if (ioeventfd_check_collision(kvm, p)) {
505 ret = -EEXIST;
506 goto unlock_fail;
507 }
508
509 kvm_iodevice_init(&p->dev, &ioeventfd_ops);
510
511 ret = __kvm_io_bus_register_dev(bus, &p->dev);
512 if (ret < 0)
513 goto unlock_fail;
514
515 list_add_tail(&p->list, &kvm->ioeventfds);
516
517 up_write(&kvm->slots_lock);
518
519 return 0;
520
521unlock_fail:
522 up_write(&kvm->slots_lock);
523
524fail:
525 kfree(p);
526 eventfd_ctx_put(eventfd);
527
528 return ret;
529}
530
531static int
532kvm_deassign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
533{
534 int pio = args->flags & KVM_IOEVENTFD_FLAG_PIO;
535 struct kvm_io_bus *bus = pio ? &kvm->pio_bus : &kvm->mmio_bus;
536 struct _ioeventfd *p, *tmp;
537 struct eventfd_ctx *eventfd;
538 int ret = -ENOENT;
539
540 eventfd = eventfd_ctx_fdget(args->fd);
541 if (IS_ERR(eventfd))
542 return PTR_ERR(eventfd);
543
544 down_write(&kvm->slots_lock);
545
546 list_for_each_entry_safe(p, tmp, &kvm->ioeventfds, list) {
547 bool wildcard = !(args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH);
548
549 if (p->eventfd != eventfd ||
550 p->addr != args->addr ||
551 p->length != args->len ||
552 p->wildcard != wildcard)
553 continue;
554
555 if (!p->wildcard && p->datamatch != args->datamatch)
556 continue;
557
558 __kvm_io_bus_unregister_dev(bus, &p->dev);
559 ioeventfd_release(p);
560 ret = 0;
561 break;
562 }
563
564 up_write(&kvm->slots_lock);
565
566 eventfd_ctx_put(eventfd);
567
568 return ret;
569}
570
571int
572kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
573{
574 if (args->flags & KVM_IOEVENTFD_FLAG_DEASSIGN)
575 return kvm_deassign_ioeventfd(kvm, args);
576
577 return kvm_assign_ioeventfd(kvm, args);
578}
diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c
index 1150c6d5c7b8..9fe140bb38ec 100644
--- a/virt/kvm/ioapic.c
+++ b/virt/kvm/ioapic.c
@@ -36,6 +36,7 @@
36#include <asm/processor.h> 36#include <asm/processor.h>
37#include <asm/page.h> 37#include <asm/page.h>
38#include <asm/current.h> 38#include <asm/current.h>
39#include <trace/events/kvm.h>
39 40
40#include "ioapic.h" 41#include "ioapic.h"
41#include "lapic.h" 42#include "lapic.h"
@@ -103,6 +104,7 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val)
103{ 104{
104 unsigned index; 105 unsigned index;
105 bool mask_before, mask_after; 106 bool mask_before, mask_after;
107 union kvm_ioapic_redirect_entry *e;
106 108
107 switch (ioapic->ioregsel) { 109 switch (ioapic->ioregsel) {
108 case IOAPIC_REG_VERSION: 110 case IOAPIC_REG_VERSION:
@@ -122,19 +124,20 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val)
122 ioapic_debug("change redir index %x val %x\n", index, val); 124 ioapic_debug("change redir index %x val %x\n", index, val);
123 if (index >= IOAPIC_NUM_PINS) 125 if (index >= IOAPIC_NUM_PINS)
124 return; 126 return;
125 mask_before = ioapic->redirtbl[index].fields.mask; 127 e = &ioapic->redirtbl[index];
128 mask_before = e->fields.mask;
126 if (ioapic->ioregsel & 1) { 129 if (ioapic->ioregsel & 1) {
127 ioapic->redirtbl[index].bits &= 0xffffffff; 130 e->bits &= 0xffffffff;
128 ioapic->redirtbl[index].bits |= (u64) val << 32; 131 e->bits |= (u64) val << 32;
129 } else { 132 } else {
130 ioapic->redirtbl[index].bits &= ~0xffffffffULL; 133 e->bits &= ~0xffffffffULL;
131 ioapic->redirtbl[index].bits |= (u32) val; 134 e->bits |= (u32) val;
132 ioapic->redirtbl[index].fields.remote_irr = 0; 135 e->fields.remote_irr = 0;
133 } 136 }
134 mask_after = ioapic->redirtbl[index].fields.mask; 137 mask_after = e->fields.mask;
135 if (mask_before != mask_after) 138 if (mask_before != mask_after)
136 kvm_fire_mask_notifiers(ioapic->kvm, index, mask_after); 139 kvm_fire_mask_notifiers(ioapic->kvm, index, mask_after);
137 if (ioapic->redirtbl[index].fields.trig_mode == IOAPIC_LEVEL_TRIG 140 if (e->fields.trig_mode == IOAPIC_LEVEL_TRIG
138 && ioapic->irr & (1 << index)) 141 && ioapic->irr & (1 << index))
139 ioapic_service(ioapic, index); 142 ioapic_service(ioapic, index);
140 break; 143 break;
@@ -164,7 +167,9 @@ static int ioapic_deliver(struct kvm_ioapic *ioapic, int irq)
164 /* Always delivery PIT interrupt to vcpu 0 */ 167 /* Always delivery PIT interrupt to vcpu 0 */
165 if (irq == 0) { 168 if (irq == 0) {
166 irqe.dest_mode = 0; /* Physical mode. */ 169 irqe.dest_mode = 0; /* Physical mode. */
167 irqe.dest_id = ioapic->kvm->vcpus[0]->vcpu_id; 170 /* need to read apic_id from apic regiest since
171 * it can be rewritten */
172 irqe.dest_id = ioapic->kvm->bsp_vcpu->vcpu_id;
168 } 173 }
169#endif 174#endif
170 return kvm_irq_delivery_to_apic(ioapic->kvm, NULL, &irqe); 175 return kvm_irq_delivery_to_apic(ioapic->kvm, NULL, &irqe);
@@ -188,7 +193,10 @@ int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level)
188 if ((edge && old_irr != ioapic->irr) || 193 if ((edge && old_irr != ioapic->irr) ||
189 (!edge && !entry.fields.remote_irr)) 194 (!edge && !entry.fields.remote_irr))
190 ret = ioapic_service(ioapic, irq); 195 ret = ioapic_service(ioapic, irq);
196 else
197 ret = 0; /* report coalesced interrupt */
191 } 198 }
199 trace_kvm_ioapic_set_irq(entry.bits, irq, ret == 0);
192 } 200 }
193 return ret; 201 return ret;
194} 202}
@@ -220,24 +228,29 @@ void kvm_ioapic_update_eoi(struct kvm *kvm, int vector, int trigger_mode)
220 __kvm_ioapic_update_eoi(ioapic, i, trigger_mode); 228 __kvm_ioapic_update_eoi(ioapic, i, trigger_mode);
221} 229}
222 230
223static int ioapic_in_range(struct kvm_io_device *this, gpa_t addr, 231static inline struct kvm_ioapic *to_ioapic(struct kvm_io_device *dev)
224 int len, int is_write)
225{ 232{
226 struct kvm_ioapic *ioapic = (struct kvm_ioapic *)this->private; 233 return container_of(dev, struct kvm_ioapic, dev);
234}
227 235
236static inline int ioapic_in_range(struct kvm_ioapic *ioapic, gpa_t addr)
237{
228 return ((addr >= ioapic->base_address && 238 return ((addr >= ioapic->base_address &&
229 (addr < ioapic->base_address + IOAPIC_MEM_LENGTH))); 239 (addr < ioapic->base_address + IOAPIC_MEM_LENGTH)));
230} 240}
231 241
232static void ioapic_mmio_read(struct kvm_io_device *this, gpa_t addr, int len, 242static int ioapic_mmio_read(struct kvm_io_device *this, gpa_t addr, int len,
233 void *val) 243 void *val)
234{ 244{
235 struct kvm_ioapic *ioapic = (struct kvm_ioapic *)this->private; 245 struct kvm_ioapic *ioapic = to_ioapic(this);
236 u32 result; 246 u32 result;
247 if (!ioapic_in_range(ioapic, addr))
248 return -EOPNOTSUPP;
237 249
238 ioapic_debug("addr %lx\n", (unsigned long)addr); 250 ioapic_debug("addr %lx\n", (unsigned long)addr);
239 ASSERT(!(addr & 0xf)); /* check alignment */ 251 ASSERT(!(addr & 0xf)); /* check alignment */
240 252
253 mutex_lock(&ioapic->kvm->irq_lock);
241 addr &= 0xff; 254 addr &= 0xff;
242 switch (addr) { 255 switch (addr) {
243 case IOAPIC_REG_SELECT: 256 case IOAPIC_REG_SELECT:
@@ -264,22 +277,28 @@ static void ioapic_mmio_read(struct kvm_io_device *this, gpa_t addr, int len,
264 default: 277 default:
265 printk(KERN_WARNING "ioapic: wrong length %d\n", len); 278 printk(KERN_WARNING "ioapic: wrong length %d\n", len);
266 } 279 }
280 mutex_unlock(&ioapic->kvm->irq_lock);
281 return 0;
267} 282}
268 283
269static void ioapic_mmio_write(struct kvm_io_device *this, gpa_t addr, int len, 284static int ioapic_mmio_write(struct kvm_io_device *this, gpa_t addr, int len,
270 const void *val) 285 const void *val)
271{ 286{
272 struct kvm_ioapic *ioapic = (struct kvm_ioapic *)this->private; 287 struct kvm_ioapic *ioapic = to_ioapic(this);
273 u32 data; 288 u32 data;
289 if (!ioapic_in_range(ioapic, addr))
290 return -EOPNOTSUPP;
274 291
275 ioapic_debug("ioapic_mmio_write addr=%p len=%d val=%p\n", 292 ioapic_debug("ioapic_mmio_write addr=%p len=%d val=%p\n",
276 (void*)addr, len, val); 293 (void*)addr, len, val);
277 ASSERT(!(addr & 0xf)); /* check alignment */ 294 ASSERT(!(addr & 0xf)); /* check alignment */
295
296 mutex_lock(&ioapic->kvm->irq_lock);
278 if (len == 4 || len == 8) 297 if (len == 4 || len == 8)
279 data = *(u32 *) val; 298 data = *(u32 *) val;
280 else { 299 else {
281 printk(KERN_WARNING "ioapic: Unsupported size %d\n", len); 300 printk(KERN_WARNING "ioapic: Unsupported size %d\n", len);
282 return; 301 goto unlock;
283 } 302 }
284 303
285 addr &= 0xff; 304 addr &= 0xff;
@@ -300,6 +319,9 @@ static void ioapic_mmio_write(struct kvm_io_device *this, gpa_t addr, int len,
300 default: 319 default:
301 break; 320 break;
302 } 321 }
322unlock:
323 mutex_unlock(&ioapic->kvm->irq_lock);
324 return 0;
303} 325}
304 326
305void kvm_ioapic_reset(struct kvm_ioapic *ioapic) 327void kvm_ioapic_reset(struct kvm_ioapic *ioapic)
@@ -314,21 +336,27 @@ void kvm_ioapic_reset(struct kvm_ioapic *ioapic)
314 ioapic->id = 0; 336 ioapic->id = 0;
315} 337}
316 338
339static const struct kvm_io_device_ops ioapic_mmio_ops = {
340 .read = ioapic_mmio_read,
341 .write = ioapic_mmio_write,
342};
343
317int kvm_ioapic_init(struct kvm *kvm) 344int kvm_ioapic_init(struct kvm *kvm)
318{ 345{
319 struct kvm_ioapic *ioapic; 346 struct kvm_ioapic *ioapic;
347 int ret;
320 348
321 ioapic = kzalloc(sizeof(struct kvm_ioapic), GFP_KERNEL); 349 ioapic = kzalloc(sizeof(struct kvm_ioapic), GFP_KERNEL);
322 if (!ioapic) 350 if (!ioapic)
323 return -ENOMEM; 351 return -ENOMEM;
324 kvm->arch.vioapic = ioapic; 352 kvm->arch.vioapic = ioapic;
325 kvm_ioapic_reset(ioapic); 353 kvm_ioapic_reset(ioapic);
326 ioapic->dev.read = ioapic_mmio_read; 354 kvm_iodevice_init(&ioapic->dev, &ioapic_mmio_ops);
327 ioapic->dev.write = ioapic_mmio_write;
328 ioapic->dev.in_range = ioapic_in_range;
329 ioapic->dev.private = ioapic;
330 ioapic->kvm = kvm; 355 ioapic->kvm = kvm;
331 kvm_io_bus_register_dev(&kvm->mmio_bus, &ioapic->dev); 356 ret = kvm_io_bus_register_dev(kvm, &kvm->mmio_bus, &ioapic->dev);
332 return 0; 357 if (ret < 0)
358 kfree(ioapic);
359
360 return ret;
333} 361}
334 362
diff --git a/virt/kvm/iodev.h b/virt/kvm/iodev.h
index 55e8846ac3a6..12fd3caffd2b 100644
--- a/virt/kvm/iodev.h
+++ b/virt/kvm/iodev.h
@@ -17,49 +17,54 @@
17#define __KVM_IODEV_H__ 17#define __KVM_IODEV_H__
18 18
19#include <linux/kvm_types.h> 19#include <linux/kvm_types.h>
20#include <asm/errno.h>
20 21
21struct kvm_io_device { 22struct kvm_io_device;
22 void (*read)(struct kvm_io_device *this, 23
24/**
25 * kvm_io_device_ops are called under kvm slots_lock.
26 * read and write handlers return 0 if the transaction has been handled,
27 * or non-zero to have it passed to the next device.
28 **/
29struct kvm_io_device_ops {
30 int (*read)(struct kvm_io_device *this,
31 gpa_t addr,
32 int len,
33 void *val);
34 int (*write)(struct kvm_io_device *this,
23 gpa_t addr, 35 gpa_t addr,
24 int len, 36 int len,
25 void *val); 37 const void *val);
26 void (*write)(struct kvm_io_device *this,
27 gpa_t addr,
28 int len,
29 const void *val);
30 int (*in_range)(struct kvm_io_device *this, gpa_t addr, int len,
31 int is_write);
32 void (*destructor)(struct kvm_io_device *this); 38 void (*destructor)(struct kvm_io_device *this);
39};
33 40
34 void *private; 41
42struct kvm_io_device {
43 const struct kvm_io_device_ops *ops;
35}; 44};
36 45
37static inline void kvm_iodevice_read(struct kvm_io_device *dev, 46static inline void kvm_iodevice_init(struct kvm_io_device *dev,
38 gpa_t addr, 47 const struct kvm_io_device_ops *ops)
39 int len,
40 void *val)
41{ 48{
42 dev->read(dev, addr, len, val); 49 dev->ops = ops;
43} 50}
44 51
45static inline void kvm_iodevice_write(struct kvm_io_device *dev, 52static inline int kvm_iodevice_read(struct kvm_io_device *dev,
46 gpa_t addr, 53 gpa_t addr, int l, void *v)
47 int len,
48 const void *val)
49{ 54{
50 dev->write(dev, addr, len, val); 55 return dev->ops->read ? dev->ops->read(dev, addr, l, v) : -EOPNOTSUPP;
51} 56}
52 57
53static inline int kvm_iodevice_inrange(struct kvm_io_device *dev, 58static inline int kvm_iodevice_write(struct kvm_io_device *dev,
54 gpa_t addr, int len, int is_write) 59 gpa_t addr, int l, const void *v)
55{ 60{
56 return dev->in_range(dev, addr, len, is_write); 61 return dev->ops->write ? dev->ops->write(dev, addr, l, v) : -EOPNOTSUPP;
57} 62}
58 63
59static inline void kvm_iodevice_destructor(struct kvm_io_device *dev) 64static inline void kvm_iodevice_destructor(struct kvm_io_device *dev)
60{ 65{
61 if (dev->destructor) 66 if (dev->ops->destructor)
62 dev->destructor(dev); 67 dev->ops->destructor(dev);
63} 68}
64 69
65#endif /* __KVM_IODEV_H__ */ 70#endif /* __KVM_IODEV_H__ */
diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
index ddc17f0e2f35..001663ff401a 100644
--- a/virt/kvm/irq_comm.c
+++ b/virt/kvm/irq_comm.c
@@ -20,6 +20,7 @@
20 */ 20 */
21 21
22#include <linux/kvm_host.h> 22#include <linux/kvm_host.h>
23#include <trace/events/kvm.h>
23 24
24#include <asm/msidef.h> 25#include <asm/msidef.h>
25#ifdef CONFIG_IA64 26#ifdef CONFIG_IA64
@@ -62,14 +63,14 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
62 int i, r = -1; 63 int i, r = -1;
63 struct kvm_vcpu *vcpu, *lowest = NULL; 64 struct kvm_vcpu *vcpu, *lowest = NULL;
64 65
66 WARN_ON(!mutex_is_locked(&kvm->irq_lock));
67
65 if (irq->dest_mode == 0 && irq->dest_id == 0xff && 68 if (irq->dest_mode == 0 && irq->dest_id == 0xff &&
66 kvm_is_dm_lowest_prio(irq)) 69 kvm_is_dm_lowest_prio(irq))
67 printk(KERN_INFO "kvm: apic: phys broadcast and lowest prio\n"); 70 printk(KERN_INFO "kvm: apic: phys broadcast and lowest prio\n");
68 71
69 for (i = 0; i < KVM_MAX_VCPUS; i++) { 72 kvm_for_each_vcpu(i, vcpu, kvm) {
70 vcpu = kvm->vcpus[i]; 73 if (!kvm_apic_present(vcpu))
71
72 if (!vcpu || !kvm_apic_present(vcpu))
73 continue; 74 continue;
74 75
75 if (!kvm_apic_match_dest(vcpu, src, irq->shorthand, 76 if (!kvm_apic_match_dest(vcpu, src, irq->shorthand,
@@ -99,6 +100,8 @@ static int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
99{ 100{
100 struct kvm_lapic_irq irq; 101 struct kvm_lapic_irq irq;
101 102
103 trace_kvm_msi_set_irq(e->msi.address_lo, e->msi.data);
104
102 irq.dest_id = (e->msi.address_lo & 105 irq.dest_id = (e->msi.address_lo &
103 MSI_ADDR_DEST_ID_MASK) >> MSI_ADDR_DEST_ID_SHIFT; 106 MSI_ADDR_DEST_ID_MASK) >> MSI_ADDR_DEST_ID_SHIFT;
104 irq.vector = (e->msi.data & 107 irq.vector = (e->msi.data &
@@ -113,7 +116,7 @@ static int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
113 return kvm_irq_delivery_to_apic(kvm, NULL, &irq); 116 return kvm_irq_delivery_to_apic(kvm, NULL, &irq);
114} 117}
115 118
116/* This should be called with the kvm->lock mutex held 119/* This should be called with the kvm->irq_lock mutex held
117 * Return value: 120 * Return value:
118 * < 0 Interrupt was ignored (masked or not delivered for other reasons) 121 * < 0 Interrupt was ignored (masked or not delivered for other reasons)
119 * = 0 Interrupt was coalesced (previous irq is still pending) 122 * = 0 Interrupt was coalesced (previous irq is still pending)
@@ -125,6 +128,10 @@ int kvm_set_irq(struct kvm *kvm, int irq_source_id, int irq, int level)
125 unsigned long *irq_state, sig_level; 128 unsigned long *irq_state, sig_level;
126 int ret = -1; 129 int ret = -1;
127 130
131 trace_kvm_set_irq(irq, level, irq_source_id);
132
133 WARN_ON(!mutex_is_locked(&kvm->irq_lock));
134
128 if (irq < KVM_IOAPIC_NUM_PINS) { 135 if (irq < KVM_IOAPIC_NUM_PINS) {
129 irq_state = (unsigned long *)&kvm->arch.irq_states[irq]; 136 irq_state = (unsigned long *)&kvm->arch.irq_states[irq];
130 137
@@ -134,7 +141,9 @@ int kvm_set_irq(struct kvm *kvm, int irq_source_id, int irq, int level)
134 else 141 else
135 clear_bit(irq_source_id, irq_state); 142 clear_bit(irq_source_id, irq_state);
136 sig_level = !!(*irq_state); 143 sig_level = !!(*irq_state);
137 } else /* Deal with MSI/MSI-X */ 144 } else if (!level)
145 return ret;
146 else /* Deal with MSI/MSI-X */
138 sig_level = 1; 147 sig_level = 1;
139 148
140 /* Not possible to detect if the guest uses the PIC or the 149 /* Not possible to detect if the guest uses the PIC or the
@@ -159,6 +168,8 @@ void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin)
159 struct hlist_node *n; 168 struct hlist_node *n;
160 unsigned gsi = pin; 169 unsigned gsi = pin;
161 170
171 trace_kvm_ack_irq(irqchip, pin);
172
162 list_for_each_entry(e, &kvm->irq_routing, link) 173 list_for_each_entry(e, &kvm->irq_routing, link)
163 if (e->type == KVM_IRQ_ROUTING_IRQCHIP && 174 if (e->type == KVM_IRQ_ROUTING_IRQCHIP &&
164 e->irqchip.irqchip == irqchip && 175 e->irqchip.irqchip == irqchip &&
@@ -175,19 +186,26 @@ void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin)
175void kvm_register_irq_ack_notifier(struct kvm *kvm, 186void kvm_register_irq_ack_notifier(struct kvm *kvm,
176 struct kvm_irq_ack_notifier *kian) 187 struct kvm_irq_ack_notifier *kian)
177{ 188{
189 mutex_lock(&kvm->irq_lock);
178 hlist_add_head(&kian->link, &kvm->arch.irq_ack_notifier_list); 190 hlist_add_head(&kian->link, &kvm->arch.irq_ack_notifier_list);
191 mutex_unlock(&kvm->irq_lock);
179} 192}
180 193
181void kvm_unregister_irq_ack_notifier(struct kvm_irq_ack_notifier *kian) 194void kvm_unregister_irq_ack_notifier(struct kvm *kvm,
195 struct kvm_irq_ack_notifier *kian)
182{ 196{
197 mutex_lock(&kvm->irq_lock);
183 hlist_del_init(&kian->link); 198 hlist_del_init(&kian->link);
199 mutex_unlock(&kvm->irq_lock);
184} 200}
185 201
186/* The caller must hold kvm->lock mutex */
187int kvm_request_irq_source_id(struct kvm *kvm) 202int kvm_request_irq_source_id(struct kvm *kvm)
188{ 203{
189 unsigned long *bitmap = &kvm->arch.irq_sources_bitmap; 204 unsigned long *bitmap = &kvm->arch.irq_sources_bitmap;
190 int irq_source_id = find_first_zero_bit(bitmap, 205 int irq_source_id;
206
207 mutex_lock(&kvm->irq_lock);
208 irq_source_id = find_first_zero_bit(bitmap,
191 sizeof(kvm->arch.irq_sources_bitmap)); 209 sizeof(kvm->arch.irq_sources_bitmap));
192 210
193 if (irq_source_id >= sizeof(kvm->arch.irq_sources_bitmap)) { 211 if (irq_source_id >= sizeof(kvm->arch.irq_sources_bitmap)) {
@@ -197,6 +215,7 @@ int kvm_request_irq_source_id(struct kvm *kvm)
197 215
198 ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID); 216 ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID);
199 set_bit(irq_source_id, bitmap); 217 set_bit(irq_source_id, bitmap);
218 mutex_unlock(&kvm->irq_lock);
200 219
201 return irq_source_id; 220 return irq_source_id;
202} 221}
@@ -207,6 +226,7 @@ void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id)
207 226
208 ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID); 227 ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID);
209 228
229 mutex_lock(&kvm->irq_lock);
210 if (irq_source_id < 0 || 230 if (irq_source_id < 0 ||
211 irq_source_id >= sizeof(kvm->arch.irq_sources_bitmap)) { 231 irq_source_id >= sizeof(kvm->arch.irq_sources_bitmap)) {
212 printk(KERN_ERR "kvm: IRQ source ID out of range!\n"); 232 printk(KERN_ERR "kvm: IRQ source ID out of range!\n");
@@ -215,19 +235,24 @@ void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id)
215 for (i = 0; i < KVM_IOAPIC_NUM_PINS; i++) 235 for (i = 0; i < KVM_IOAPIC_NUM_PINS; i++)
216 clear_bit(irq_source_id, &kvm->arch.irq_states[i]); 236 clear_bit(irq_source_id, &kvm->arch.irq_states[i]);
217 clear_bit(irq_source_id, &kvm->arch.irq_sources_bitmap); 237 clear_bit(irq_source_id, &kvm->arch.irq_sources_bitmap);
238 mutex_unlock(&kvm->irq_lock);
218} 239}
219 240
220void kvm_register_irq_mask_notifier(struct kvm *kvm, int irq, 241void kvm_register_irq_mask_notifier(struct kvm *kvm, int irq,
221 struct kvm_irq_mask_notifier *kimn) 242 struct kvm_irq_mask_notifier *kimn)
222{ 243{
244 mutex_lock(&kvm->irq_lock);
223 kimn->irq = irq; 245 kimn->irq = irq;
224 hlist_add_head(&kimn->link, &kvm->mask_notifier_list); 246 hlist_add_head(&kimn->link, &kvm->mask_notifier_list);
247 mutex_unlock(&kvm->irq_lock);
225} 248}
226 249
227void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq, 250void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq,
228 struct kvm_irq_mask_notifier *kimn) 251 struct kvm_irq_mask_notifier *kimn)
229{ 252{
253 mutex_lock(&kvm->irq_lock);
230 hlist_del(&kimn->link); 254 hlist_del(&kimn->link);
255 mutex_unlock(&kvm->irq_lock);
231} 256}
232 257
233void kvm_fire_mask_notifiers(struct kvm *kvm, int irq, bool mask) 258void kvm_fire_mask_notifiers(struct kvm *kvm, int irq, bool mask)
@@ -235,6 +260,8 @@ void kvm_fire_mask_notifiers(struct kvm *kvm, int irq, bool mask)
235 struct kvm_irq_mask_notifier *kimn; 260 struct kvm_irq_mask_notifier *kimn;
236 struct hlist_node *n; 261 struct hlist_node *n;
237 262
263 WARN_ON(!mutex_is_locked(&kvm->irq_lock));
264
238 hlist_for_each_entry(kimn, n, &kvm->mask_notifier_list, link) 265 hlist_for_each_entry(kimn, n, &kvm->mask_notifier_list, link)
239 if (kimn->irq == irq) 266 if (kimn->irq == irq)
240 kimn->func(kimn, mask); 267 kimn->func(kimn, mask);
@@ -250,7 +277,9 @@ static void __kvm_free_irq_routing(struct list_head *irq_routing)
250 277
251void kvm_free_irq_routing(struct kvm *kvm) 278void kvm_free_irq_routing(struct kvm *kvm)
252{ 279{
280 mutex_lock(&kvm->irq_lock);
253 __kvm_free_irq_routing(&kvm->irq_routing); 281 __kvm_free_irq_routing(&kvm->irq_routing);
282 mutex_unlock(&kvm->irq_lock);
254} 283}
255 284
256static int setup_routing_entry(struct kvm_kernel_irq_routing_entry *e, 285static int setup_routing_entry(struct kvm_kernel_irq_routing_entry *e,
@@ -325,13 +354,13 @@ int kvm_set_irq_routing(struct kvm *kvm,
325 e = NULL; 354 e = NULL;
326 } 355 }
327 356
328 mutex_lock(&kvm->lock); 357 mutex_lock(&kvm->irq_lock);
329 list_splice(&kvm->irq_routing, &tmp); 358 list_splice(&kvm->irq_routing, &tmp);
330 INIT_LIST_HEAD(&kvm->irq_routing); 359 INIT_LIST_HEAD(&kvm->irq_routing);
331 list_splice(&irq_list, &kvm->irq_routing); 360 list_splice(&irq_list, &kvm->irq_routing);
332 INIT_LIST_HEAD(&irq_list); 361 INIT_LIST_HEAD(&irq_list);
333 list_splice(&tmp, &irq_list); 362 list_splice(&tmp, &irq_list);
334 mutex_unlock(&kvm->lock); 363 mutex_unlock(&kvm->irq_lock);
335 364
336 r = 0; 365 r = 0;
337 366
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 2884baf1d5f9..897bff3b7df9 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -59,9 +59,18 @@
59#include "irq.h" 59#include "irq.h"
60#endif 60#endif
61 61
62#define CREATE_TRACE_POINTS
63#include <trace/events/kvm.h>
64
62MODULE_AUTHOR("Qumranet"); 65MODULE_AUTHOR("Qumranet");
63MODULE_LICENSE("GPL"); 66MODULE_LICENSE("GPL");
64 67
68/*
69 * Ordering of locks:
70 *
71 * kvm->slots_lock --> kvm->lock --> kvm->irq_lock
72 */
73
65DEFINE_SPINLOCK(kvm_lock); 74DEFINE_SPINLOCK(kvm_lock);
66LIST_HEAD(vm_list); 75LIST_HEAD(vm_list);
67 76
@@ -79,6 +88,8 @@ static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
79 88
80static bool kvm_rebooting; 89static bool kvm_rebooting;
81 90
91static bool largepages_enabled = true;
92
82#ifdef KVM_CAP_DEVICE_ASSIGNMENT 93#ifdef KVM_CAP_DEVICE_ASSIGNMENT
83static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *head, 94static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *head,
84 int assigned_dev_id) 95 int assigned_dev_id)
@@ -120,17 +131,13 @@ static void kvm_assigned_dev_interrupt_work_handler(struct work_struct *work)
120{ 131{
121 struct kvm_assigned_dev_kernel *assigned_dev; 132 struct kvm_assigned_dev_kernel *assigned_dev;
122 struct kvm *kvm; 133 struct kvm *kvm;
123 int irq, i; 134 int i;
124 135
125 assigned_dev = container_of(work, struct kvm_assigned_dev_kernel, 136 assigned_dev = container_of(work, struct kvm_assigned_dev_kernel,
126 interrupt_work); 137 interrupt_work);
127 kvm = assigned_dev->kvm; 138 kvm = assigned_dev->kvm;
128 139
129 /* This is taken to safely inject irq inside the guest. When 140 mutex_lock(&kvm->irq_lock);
130 * the interrupt injection (or the ioapic code) uses a
131 * finer-grained lock, update this
132 */
133 mutex_lock(&kvm->lock);
134 spin_lock_irq(&assigned_dev->assigned_dev_lock); 141 spin_lock_irq(&assigned_dev->assigned_dev_lock);
135 if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) { 142 if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) {
136 struct kvm_guest_msix_entry *guest_entries = 143 struct kvm_guest_msix_entry *guest_entries =
@@ -143,23 +150,13 @@ static void kvm_assigned_dev_interrupt_work_handler(struct work_struct *work)
143 kvm_set_irq(assigned_dev->kvm, 150 kvm_set_irq(assigned_dev->kvm,
144 assigned_dev->irq_source_id, 151 assigned_dev->irq_source_id,
145 guest_entries[i].vector, 1); 152 guest_entries[i].vector, 1);
146 irq = assigned_dev->host_msix_entries[i].vector;
147 if (irq != 0)
148 enable_irq(irq);
149 assigned_dev->host_irq_disabled = false;
150 } 153 }
151 } else { 154 } else
152 kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id, 155 kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id,
153 assigned_dev->guest_irq, 1); 156 assigned_dev->guest_irq, 1);
154 if (assigned_dev->irq_requested_type &
155 KVM_DEV_IRQ_GUEST_MSI) {
156 enable_irq(assigned_dev->host_irq);
157 assigned_dev->host_irq_disabled = false;
158 }
159 }
160 157
161 spin_unlock_irq(&assigned_dev->assigned_dev_lock); 158 spin_unlock_irq(&assigned_dev->assigned_dev_lock);
162 mutex_unlock(&assigned_dev->kvm->lock); 159 mutex_unlock(&assigned_dev->kvm->irq_lock);
163} 160}
164 161
165static irqreturn_t kvm_assigned_dev_intr(int irq, void *dev_id) 162static irqreturn_t kvm_assigned_dev_intr(int irq, void *dev_id)
@@ -179,8 +176,10 @@ static irqreturn_t kvm_assigned_dev_intr(int irq, void *dev_id)
179 176
180 schedule_work(&assigned_dev->interrupt_work); 177 schedule_work(&assigned_dev->interrupt_work);
181 178
182 disable_irq_nosync(irq); 179 if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_GUEST_INTX) {
183 assigned_dev->host_irq_disabled = true; 180 disable_irq_nosync(irq);
181 assigned_dev->host_irq_disabled = true;
182 }
184 183
185out: 184out:
186 spin_unlock_irqrestore(&assigned_dev->assigned_dev_lock, flags); 185 spin_unlock_irqrestore(&assigned_dev->assigned_dev_lock, flags);
@@ -215,7 +214,7 @@ static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian)
215static void deassign_guest_irq(struct kvm *kvm, 214static void deassign_guest_irq(struct kvm *kvm,
216 struct kvm_assigned_dev_kernel *assigned_dev) 215 struct kvm_assigned_dev_kernel *assigned_dev)
217{ 216{
218 kvm_unregister_irq_ack_notifier(&assigned_dev->ack_notifier); 217 kvm_unregister_irq_ack_notifier(kvm, &assigned_dev->ack_notifier);
219 assigned_dev->ack_notifier.gsi = -1; 218 assigned_dev->ack_notifier.gsi = -1;
220 219
221 if (assigned_dev->irq_source_id != -1) 220 if (assigned_dev->irq_source_id != -1)
@@ -417,6 +416,7 @@ static int assigned_device_enable_guest_msi(struct kvm *kvm,
417{ 416{
418 dev->guest_irq = irq->guest_irq; 417 dev->guest_irq = irq->guest_irq;
419 dev->ack_notifier.gsi = -1; 418 dev->ack_notifier.gsi = -1;
419 dev->host_irq_disabled = false;
420 return 0; 420 return 0;
421} 421}
422#endif 422#endif
@@ -427,6 +427,7 @@ static int assigned_device_enable_guest_msix(struct kvm *kvm,
427{ 427{
428 dev->guest_irq = irq->guest_irq; 428 dev->guest_irq = irq->guest_irq;
429 dev->ack_notifier.gsi = -1; 429 dev->ack_notifier.gsi = -1;
430 dev->host_irq_disabled = false;
430 return 0; 431 return 0;
431} 432}
432#endif 433#endif
@@ -693,11 +694,6 @@ out:
693} 694}
694#endif 695#endif
695 696
696static inline int valid_vcpu(int n)
697{
698 return likely(n >= 0 && n < KVM_MAX_VCPUS);
699}
700
701inline int kvm_is_mmio_pfn(pfn_t pfn) 697inline int kvm_is_mmio_pfn(pfn_t pfn)
702{ 698{
703 if (pfn_valid(pfn)) { 699 if (pfn_valid(pfn)) {
@@ -745,12 +741,9 @@ static bool make_all_cpus_request(struct kvm *kvm, unsigned int req)
745 if (alloc_cpumask_var(&cpus, GFP_ATOMIC)) 741 if (alloc_cpumask_var(&cpus, GFP_ATOMIC))
746 cpumask_clear(cpus); 742 cpumask_clear(cpus);
747 743
748 me = get_cpu();
749 spin_lock(&kvm->requests_lock); 744 spin_lock(&kvm->requests_lock);
750 for (i = 0; i < KVM_MAX_VCPUS; ++i) { 745 me = smp_processor_id();
751 vcpu = kvm->vcpus[i]; 746 kvm_for_each_vcpu(i, vcpu, kvm) {
752 if (!vcpu)
753 continue;
754 if (test_and_set_bit(req, &vcpu->requests)) 747 if (test_and_set_bit(req, &vcpu->requests))
755 continue; 748 continue;
756 cpu = vcpu->cpu; 749 cpu = vcpu->cpu;
@@ -764,7 +757,6 @@ static bool make_all_cpus_request(struct kvm *kvm, unsigned int req)
764 else 757 else
765 called = false; 758 called = false;
766 spin_unlock(&kvm->requests_lock); 759 spin_unlock(&kvm->requests_lock);
767 put_cpu();
768 free_cpumask_var(cpus); 760 free_cpumask_var(cpus);
769 return called; 761 return called;
770} 762}
@@ -986,7 +978,9 @@ static struct kvm *kvm_create_vm(void)
986 spin_lock_init(&kvm->mmu_lock); 978 spin_lock_init(&kvm->mmu_lock);
987 spin_lock_init(&kvm->requests_lock); 979 spin_lock_init(&kvm->requests_lock);
988 kvm_io_bus_init(&kvm->pio_bus); 980 kvm_io_bus_init(&kvm->pio_bus);
981 kvm_eventfd_init(kvm);
989 mutex_init(&kvm->lock); 982 mutex_init(&kvm->lock);
983 mutex_init(&kvm->irq_lock);
990 kvm_io_bus_init(&kvm->mmio_bus); 984 kvm_io_bus_init(&kvm->mmio_bus);
991 init_rwsem(&kvm->slots_lock); 985 init_rwsem(&kvm->slots_lock);
992 atomic_set(&kvm->users_count, 1); 986 atomic_set(&kvm->users_count, 1);
@@ -1006,19 +1000,25 @@ out:
1006static void kvm_free_physmem_slot(struct kvm_memory_slot *free, 1000static void kvm_free_physmem_slot(struct kvm_memory_slot *free,
1007 struct kvm_memory_slot *dont) 1001 struct kvm_memory_slot *dont)
1008{ 1002{
1003 int i;
1004
1009 if (!dont || free->rmap != dont->rmap) 1005 if (!dont || free->rmap != dont->rmap)
1010 vfree(free->rmap); 1006 vfree(free->rmap);
1011 1007
1012 if (!dont || free->dirty_bitmap != dont->dirty_bitmap) 1008 if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
1013 vfree(free->dirty_bitmap); 1009 vfree(free->dirty_bitmap);
1014 1010
1015 if (!dont || free->lpage_info != dont->lpage_info) 1011
1016 vfree(free->lpage_info); 1012 for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
1013 if (!dont || free->lpage_info[i] != dont->lpage_info[i]) {
1014 vfree(free->lpage_info[i]);
1015 free->lpage_info[i] = NULL;
1016 }
1017 }
1017 1018
1018 free->npages = 0; 1019 free->npages = 0;
1019 free->dirty_bitmap = NULL; 1020 free->dirty_bitmap = NULL;
1020 free->rmap = NULL; 1021 free->rmap = NULL;
1021 free->lpage_info = NULL;
1022} 1022}
1023 1023
1024void kvm_free_physmem(struct kvm *kvm) 1024void kvm_free_physmem(struct kvm *kvm)
@@ -1071,6 +1071,8 @@ static int kvm_vm_release(struct inode *inode, struct file *filp)
1071{ 1071{
1072 struct kvm *kvm = filp->private_data; 1072 struct kvm *kvm = filp->private_data;
1073 1073
1074 kvm_irqfd_release(kvm);
1075
1074 kvm_put_kvm(kvm); 1076 kvm_put_kvm(kvm);
1075 return 0; 1077 return 0;
1076} 1078}
@@ -1089,8 +1091,8 @@ int __kvm_set_memory_region(struct kvm *kvm,
1089{ 1091{
1090 int r; 1092 int r;
1091 gfn_t base_gfn; 1093 gfn_t base_gfn;
1092 unsigned long npages, ugfn; 1094 unsigned long npages;
1093 unsigned long largepages, i; 1095 unsigned long i;
1094 struct kvm_memory_slot *memslot; 1096 struct kvm_memory_slot *memslot;
1095 struct kvm_memory_slot old, new; 1097 struct kvm_memory_slot old, new;
1096 1098
@@ -1164,31 +1166,51 @@ int __kvm_set_memory_region(struct kvm *kvm,
1164 else 1166 else
1165 new.userspace_addr = 0; 1167 new.userspace_addr = 0;
1166 } 1168 }
1167 if (npages && !new.lpage_info) { 1169 if (!npages)
1168 largepages = 1 + (base_gfn + npages - 1) / KVM_PAGES_PER_HPAGE; 1170 goto skip_lpage;
1169 largepages -= base_gfn / KVM_PAGES_PER_HPAGE;
1170 1171
1171 new.lpage_info = vmalloc(largepages * sizeof(*new.lpage_info)); 1172 for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
1173 unsigned long ugfn;
1174 unsigned long j;
1175 int lpages;
1176 int level = i + 2;
1172 1177
1173 if (!new.lpage_info) 1178 /* Avoid unused variable warning if no large pages */
1179 (void)level;
1180
1181 if (new.lpage_info[i])
1182 continue;
1183
1184 lpages = 1 + (base_gfn + npages - 1) /
1185 KVM_PAGES_PER_HPAGE(level);
1186 lpages -= base_gfn / KVM_PAGES_PER_HPAGE(level);
1187
1188 new.lpage_info[i] = vmalloc(lpages * sizeof(*new.lpage_info[i]));
1189
1190 if (!new.lpage_info[i])
1174 goto out_free; 1191 goto out_free;
1175 1192
1176 memset(new.lpage_info, 0, largepages * sizeof(*new.lpage_info)); 1193 memset(new.lpage_info[i], 0,
1194 lpages * sizeof(*new.lpage_info[i]));
1177 1195
1178 if (base_gfn % KVM_PAGES_PER_HPAGE) 1196 if (base_gfn % KVM_PAGES_PER_HPAGE(level))
1179 new.lpage_info[0].write_count = 1; 1197 new.lpage_info[i][0].write_count = 1;
1180 if ((base_gfn+npages) % KVM_PAGES_PER_HPAGE) 1198 if ((base_gfn+npages) % KVM_PAGES_PER_HPAGE(level))
1181 new.lpage_info[largepages-1].write_count = 1; 1199 new.lpage_info[i][lpages - 1].write_count = 1;
1182 ugfn = new.userspace_addr >> PAGE_SHIFT; 1200 ugfn = new.userspace_addr >> PAGE_SHIFT;
1183 /* 1201 /*
1184 * If the gfn and userspace address are not aligned wrt each 1202 * If the gfn and userspace address are not aligned wrt each
1185 * other, disable large page support for this slot 1203 * other, or if explicitly asked to, disable large page
1204 * support for this slot
1186 */ 1205 */
1187 if ((base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE - 1)) 1206 if ((base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1) ||
1188 for (i = 0; i < largepages; ++i) 1207 !largepages_enabled)
1189 new.lpage_info[i].write_count = 1; 1208 for (j = 0; j < lpages; ++j)
1209 new.lpage_info[i][j].write_count = 1;
1190 } 1210 }
1191 1211
1212skip_lpage:
1213
1192 /* Allocate page dirty bitmap if needed */ 1214 /* Allocate page dirty bitmap if needed */
1193 if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) { 1215 if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) {
1194 unsigned dirty_bytes = ALIGN(npages, BITS_PER_LONG) / 8; 1216 unsigned dirty_bytes = ALIGN(npages, BITS_PER_LONG) / 8;
@@ -1200,6 +1222,10 @@ int __kvm_set_memory_region(struct kvm *kvm,
1200 if (old.npages) 1222 if (old.npages)
1201 kvm_arch_flush_shadow(kvm); 1223 kvm_arch_flush_shadow(kvm);
1202 } 1224 }
1225#else /* not defined CONFIG_S390 */
1226 new.user_alloc = user_alloc;
1227 if (user_alloc)
1228 new.userspace_addr = mem->userspace_addr;
1203#endif /* not defined CONFIG_S390 */ 1229#endif /* not defined CONFIG_S390 */
1204 1230
1205 if (!npages) 1231 if (!npages)
@@ -1299,6 +1325,12 @@ out:
1299 return r; 1325 return r;
1300} 1326}
1301 1327
1328void kvm_disable_largepages(void)
1329{
1330 largepages_enabled = false;
1331}
1332EXPORT_SYMBOL_GPL(kvm_disable_largepages);
1333
1302int is_error_page(struct page *page) 1334int is_error_page(struct page *page)
1303{ 1335{
1304 return page == bad_page; 1336 return page == bad_page;
@@ -1635,9 +1667,7 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
1635 for (;;) { 1667 for (;;) {
1636 prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE); 1668 prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE);
1637 1669
1638 if ((kvm_arch_interrupt_allowed(vcpu) && 1670 if (kvm_arch_vcpu_runnable(vcpu)) {
1639 kvm_cpu_has_interrupt(vcpu)) ||
1640 kvm_arch_vcpu_runnable(vcpu)) {
1641 set_bit(KVM_REQ_UNHALT, &vcpu->requests); 1671 set_bit(KVM_REQ_UNHALT, &vcpu->requests);
1642 break; 1672 break;
1643 } 1673 }
@@ -1714,24 +1744,18 @@ static struct file_operations kvm_vcpu_fops = {
1714 */ 1744 */
1715static int create_vcpu_fd(struct kvm_vcpu *vcpu) 1745static int create_vcpu_fd(struct kvm_vcpu *vcpu)
1716{ 1746{
1717 int fd = anon_inode_getfd("kvm-vcpu", &kvm_vcpu_fops, vcpu, 0); 1747 return anon_inode_getfd("kvm-vcpu", &kvm_vcpu_fops, vcpu, 0);
1718 if (fd < 0)
1719 kvm_put_kvm(vcpu->kvm);
1720 return fd;
1721} 1748}
1722 1749
1723/* 1750/*
1724 * Creates some virtual cpus. Good luck creating more than one. 1751 * Creates some virtual cpus. Good luck creating more than one.
1725 */ 1752 */
1726static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int n) 1753static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
1727{ 1754{
1728 int r; 1755 int r;
1729 struct kvm_vcpu *vcpu; 1756 struct kvm_vcpu *vcpu, *v;
1730
1731 if (!valid_vcpu(n))
1732 return -EINVAL;
1733 1757
1734 vcpu = kvm_arch_vcpu_create(kvm, n); 1758 vcpu = kvm_arch_vcpu_create(kvm, id);
1735 if (IS_ERR(vcpu)) 1759 if (IS_ERR(vcpu))
1736 return PTR_ERR(vcpu); 1760 return PTR_ERR(vcpu);
1737 1761
@@ -1742,23 +1766,38 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int n)
1742 return r; 1766 return r;
1743 1767
1744 mutex_lock(&kvm->lock); 1768 mutex_lock(&kvm->lock);
1745 if (kvm->vcpus[n]) { 1769 if (atomic_read(&kvm->online_vcpus) == KVM_MAX_VCPUS) {
1746 r = -EEXIST; 1770 r = -EINVAL;
1747 goto vcpu_destroy; 1771 goto vcpu_destroy;
1748 } 1772 }
1749 kvm->vcpus[n] = vcpu; 1773
1750 mutex_unlock(&kvm->lock); 1774 kvm_for_each_vcpu(r, v, kvm)
1775 if (v->vcpu_id == id) {
1776 r = -EEXIST;
1777 goto vcpu_destroy;
1778 }
1779
1780 BUG_ON(kvm->vcpus[atomic_read(&kvm->online_vcpus)]);
1751 1781
1752 /* Now it's all set up, let userspace reach it */ 1782 /* Now it's all set up, let userspace reach it */
1753 kvm_get_kvm(kvm); 1783 kvm_get_kvm(kvm);
1754 r = create_vcpu_fd(vcpu); 1784 r = create_vcpu_fd(vcpu);
1755 if (r < 0) 1785 if (r < 0) {
1756 goto unlink; 1786 kvm_put_kvm(kvm);
1787 goto vcpu_destroy;
1788 }
1789
1790 kvm->vcpus[atomic_read(&kvm->online_vcpus)] = vcpu;
1791 smp_wmb();
1792 atomic_inc(&kvm->online_vcpus);
1793
1794#ifdef CONFIG_KVM_APIC_ARCHITECTURE
1795 if (kvm->bsp_vcpu_id == id)
1796 kvm->bsp_vcpu = vcpu;
1797#endif
1798 mutex_unlock(&kvm->lock);
1757 return r; 1799 return r;
1758 1800
1759unlink:
1760 mutex_lock(&kvm->lock);
1761 kvm->vcpus[n] = NULL;
1762vcpu_destroy: 1801vcpu_destroy:
1763 mutex_unlock(&kvm->lock); 1802 mutex_unlock(&kvm->lock);
1764 kvm_arch_vcpu_destroy(vcpu); 1803 kvm_arch_vcpu_destroy(vcpu);
@@ -2199,6 +2238,7 @@ static long kvm_vm_ioctl(struct file *filp,
2199 vfree(entries); 2238 vfree(entries);
2200 break; 2239 break;
2201 } 2240 }
2241#endif /* KVM_CAP_IRQ_ROUTING */
2202#ifdef __KVM_HAVE_MSIX 2242#ifdef __KVM_HAVE_MSIX
2203 case KVM_ASSIGN_SET_MSIX_NR: { 2243 case KVM_ASSIGN_SET_MSIX_NR: {
2204 struct kvm_assigned_msix_nr entry_nr; 2244 struct kvm_assigned_msix_nr entry_nr;
@@ -2221,7 +2261,35 @@ static long kvm_vm_ioctl(struct file *filp,
2221 break; 2261 break;
2222 } 2262 }
2223#endif 2263#endif
2224#endif /* KVM_CAP_IRQ_ROUTING */ 2264 case KVM_IRQFD: {
2265 struct kvm_irqfd data;
2266
2267 r = -EFAULT;
2268 if (copy_from_user(&data, argp, sizeof data))
2269 goto out;
2270 r = kvm_irqfd(kvm, data.fd, data.gsi, data.flags);
2271 break;
2272 }
2273 case KVM_IOEVENTFD: {
2274 struct kvm_ioeventfd data;
2275
2276 r = -EFAULT;
2277 if (copy_from_user(&data, argp, sizeof data))
2278 goto out;
2279 r = kvm_ioeventfd(kvm, &data);
2280 break;
2281 }
2282#ifdef CONFIG_KVM_APIC_ARCHITECTURE
2283 case KVM_SET_BOOT_CPU_ID:
2284 r = 0;
2285 mutex_lock(&kvm->lock);
2286 if (atomic_read(&kvm->online_vcpus) != 0)
2287 r = -EBUSY;
2288 else
2289 kvm->bsp_vcpu_id = arg;
2290 mutex_unlock(&kvm->lock);
2291 break;
2292#endif
2225 default: 2293 default:
2226 r = kvm_arch_vm_ioctl(filp, ioctl, arg); 2294 r = kvm_arch_vm_ioctl(filp, ioctl, arg);
2227 } 2295 }
@@ -2288,6 +2356,9 @@ static long kvm_dev_ioctl_check_extension_generic(long arg)
2288 case KVM_CAP_USER_MEMORY: 2356 case KVM_CAP_USER_MEMORY:
2289 case KVM_CAP_DESTROY_MEMORY_REGION_WORKS: 2357 case KVM_CAP_DESTROY_MEMORY_REGION_WORKS:
2290 case KVM_CAP_JOIN_MEMORY_REGIONS_WORKS: 2358 case KVM_CAP_JOIN_MEMORY_REGIONS_WORKS:
2359#ifdef CONFIG_KVM_APIC_ARCHITECTURE
2360 case KVM_CAP_SET_BOOT_CPU_ID:
2361#endif
2291 return 1; 2362 return 1;
2292#ifdef CONFIG_HAVE_KVM_IRQCHIP 2363#ifdef CONFIG_HAVE_KVM_IRQCHIP
2293 case KVM_CAP_IRQ_ROUTING: 2364 case KVM_CAP_IRQ_ROUTING:
@@ -2335,7 +2406,7 @@ static long kvm_dev_ioctl(struct file *filp,
2335 case KVM_TRACE_ENABLE: 2406 case KVM_TRACE_ENABLE:
2336 case KVM_TRACE_PAUSE: 2407 case KVM_TRACE_PAUSE:
2337 case KVM_TRACE_DISABLE: 2408 case KVM_TRACE_DISABLE:
2338 r = kvm_trace_ioctl(ioctl, arg); 2409 r = -EOPNOTSUPP;
2339 break; 2410 break;
2340 default: 2411 default:
2341 return kvm_arch_dev_ioctl(filp, ioctl, arg); 2412 return kvm_arch_dev_ioctl(filp, ioctl, arg);
@@ -2449,26 +2520,71 @@ void kvm_io_bus_destroy(struct kvm_io_bus *bus)
2449 } 2520 }
2450} 2521}
2451 2522
2452struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus, 2523/* kvm_io_bus_write - called under kvm->slots_lock */
2453 gpa_t addr, int len, int is_write) 2524int kvm_io_bus_write(struct kvm_io_bus *bus, gpa_t addr,
2525 int len, const void *val)
2454{ 2526{
2455 int i; 2527 int i;
2528 for (i = 0; i < bus->dev_count; i++)
2529 if (!kvm_iodevice_write(bus->devs[i], addr, len, val))
2530 return 0;
2531 return -EOPNOTSUPP;
2532}
2456 2533
2457 for (i = 0; i < bus->dev_count; i++) { 2534/* kvm_io_bus_read - called under kvm->slots_lock */
2458 struct kvm_io_device *pos = bus->devs[i]; 2535int kvm_io_bus_read(struct kvm_io_bus *bus, gpa_t addr, int len, void *val)
2536{
2537 int i;
2538 for (i = 0; i < bus->dev_count; i++)
2539 if (!kvm_iodevice_read(bus->devs[i], addr, len, val))
2540 return 0;
2541 return -EOPNOTSUPP;
2542}
2459 2543
2460 if (pos->in_range(pos, addr, len, is_write)) 2544int kvm_io_bus_register_dev(struct kvm *kvm, struct kvm_io_bus *bus,
2461 return pos; 2545 struct kvm_io_device *dev)
2462 } 2546{
2547 int ret;
2463 2548
2464 return NULL; 2549 down_write(&kvm->slots_lock);
2550 ret = __kvm_io_bus_register_dev(bus, dev);
2551 up_write(&kvm->slots_lock);
2552
2553 return ret;
2465} 2554}
2466 2555
2467void kvm_io_bus_register_dev(struct kvm_io_bus *bus, struct kvm_io_device *dev) 2556/* An unlocked version. Caller must have write lock on slots_lock. */
2557int __kvm_io_bus_register_dev(struct kvm_io_bus *bus,
2558 struct kvm_io_device *dev)
2468{ 2559{
2469 BUG_ON(bus->dev_count > (NR_IOBUS_DEVS-1)); 2560 if (bus->dev_count > NR_IOBUS_DEVS-1)
2561 return -ENOSPC;
2470 2562
2471 bus->devs[bus->dev_count++] = dev; 2563 bus->devs[bus->dev_count++] = dev;
2564
2565 return 0;
2566}
2567
2568void kvm_io_bus_unregister_dev(struct kvm *kvm,
2569 struct kvm_io_bus *bus,
2570 struct kvm_io_device *dev)
2571{
2572 down_write(&kvm->slots_lock);
2573 __kvm_io_bus_unregister_dev(bus, dev);
2574 up_write(&kvm->slots_lock);
2575}
2576
2577/* An unlocked version. Caller must have write lock on slots_lock. */
2578void __kvm_io_bus_unregister_dev(struct kvm_io_bus *bus,
2579 struct kvm_io_device *dev)
2580{
2581 int i;
2582
2583 for (i = 0; i < bus->dev_count; i++)
2584 if (bus->devs[i] == dev) {
2585 bus->devs[i] = bus->devs[--bus->dev_count];
2586 break;
2587 }
2472} 2588}
2473 2589
2474static struct notifier_block kvm_cpu_notifier = { 2590static struct notifier_block kvm_cpu_notifier = {
@@ -2501,11 +2617,9 @@ static int vcpu_stat_get(void *_offset, u64 *val)
2501 *val = 0; 2617 *val = 0;
2502 spin_lock(&kvm_lock); 2618 spin_lock(&kvm_lock);
2503 list_for_each_entry(kvm, &vm_list, vm_list) 2619 list_for_each_entry(kvm, &vm_list, vm_list)
2504 for (i = 0; i < KVM_MAX_VCPUS; ++i) { 2620 kvm_for_each_vcpu(i, vcpu, kvm)
2505 vcpu = kvm->vcpus[i]; 2621 *val += *(u32 *)((void *)vcpu + offset);
2506 if (vcpu) 2622
2507 *val += *(u32 *)((void *)vcpu + offset);
2508 }
2509 spin_unlock(&kvm_lock); 2623 spin_unlock(&kvm_lock);
2510 return 0; 2624 return 0;
2511} 2625}
@@ -2679,15 +2793,15 @@ out_free_0:
2679 __free_page(bad_page); 2793 __free_page(bad_page);
2680out: 2794out:
2681 kvm_arch_exit(); 2795 kvm_arch_exit();
2682 kvm_exit_debug();
2683out_fail: 2796out_fail:
2797 kvm_exit_debug();
2684 return r; 2798 return r;
2685} 2799}
2686EXPORT_SYMBOL_GPL(kvm_init); 2800EXPORT_SYMBOL_GPL(kvm_init);
2687 2801
2688void kvm_exit(void) 2802void kvm_exit(void)
2689{ 2803{
2690 kvm_trace_cleanup(); 2804 tracepoint_synchronize_unregister();
2691 misc_deregister(&kvm_dev); 2805 misc_deregister(&kvm_dev);
2692 kmem_cache_destroy(kvm_vcpu_cache); 2806 kmem_cache_destroy(kvm_vcpu_cache);
2693 sysdev_unregister(&kvm_sysdev); 2807 sysdev_unregister(&kvm_sysdev);
diff --git a/virt/kvm/kvm_trace.c b/virt/kvm/kvm_trace.c
deleted file mode 100644
index f59874446440..000000000000
--- a/virt/kvm/kvm_trace.c
+++ /dev/null
@@ -1,285 +0,0 @@
1/*
2 * kvm trace
3 *
4 * It is designed to allow debugging traces of kvm to be generated
5 * on UP / SMP machines. Each trace entry can be timestamped so that
6 * it's possible to reconstruct a chronological record of trace events.
7 * The implementation refers to blktrace kernel support.
8 *
9 * Copyright (c) 2008 Intel Corporation
10 * Copyright (C) 2006 Jens Axboe <axboe@kernel.dk>
11 *
12 * Authors: Feng(Eric) Liu, eric.e.liu@intel.com
13 *
14 * Date: Feb 2008
15 */
16
17#include <linux/module.h>
18#include <linux/relay.h>
19#include <linux/debugfs.h>
20#include <linux/ktime.h>
21
22#include <linux/kvm_host.h>
23
24#define KVM_TRACE_STATE_RUNNING (1 << 0)
25#define KVM_TRACE_STATE_PAUSE (1 << 1)
26#define KVM_TRACE_STATE_CLEARUP (1 << 2)
27
28struct kvm_trace {
29 int trace_state;
30 struct rchan *rchan;
31 struct dentry *lost_file;
32 atomic_t lost_records;
33};
34static struct kvm_trace *kvm_trace;
35
36struct kvm_trace_probe {
37 const char *name;
38 const char *format;
39 u32 timestamp_in;
40 marker_probe_func *probe_func;
41};
42
43static inline int calc_rec_size(int timestamp, int extra)
44{
45 int rec_size = KVM_TRC_HEAD_SIZE;
46
47 rec_size += extra;
48 return timestamp ? rec_size += KVM_TRC_CYCLE_SIZE : rec_size;
49}
50
51static void kvm_add_trace(void *probe_private, void *call_data,
52 const char *format, va_list *args)
53{
54 struct kvm_trace_probe *p = probe_private;
55 struct kvm_trace *kt = kvm_trace;
56 struct kvm_trace_rec rec;
57 struct kvm_vcpu *vcpu;
58 int i, size;
59 u32 extra;
60
61 if (unlikely(kt->trace_state != KVM_TRACE_STATE_RUNNING))
62 return;
63
64 rec.rec_val = TRACE_REC_EVENT_ID(va_arg(*args, u32));
65 vcpu = va_arg(*args, struct kvm_vcpu *);
66 rec.pid = current->tgid;
67 rec.vcpu_id = vcpu->vcpu_id;
68
69 extra = va_arg(*args, u32);
70 WARN_ON(!(extra <= KVM_TRC_EXTRA_MAX));
71 extra = min_t(u32, extra, KVM_TRC_EXTRA_MAX);
72
73 rec.rec_val |= TRACE_REC_TCS(p->timestamp_in)
74 | TRACE_REC_NUM_DATA_ARGS(extra);
75
76 if (p->timestamp_in) {
77 rec.u.timestamp.timestamp = ktime_to_ns(ktime_get());
78
79 for (i = 0; i < extra; i++)
80 rec.u.timestamp.extra_u32[i] = va_arg(*args, u32);
81 } else {
82 for (i = 0; i < extra; i++)
83 rec.u.notimestamp.extra_u32[i] = va_arg(*args, u32);
84 }
85
86 size = calc_rec_size(p->timestamp_in, extra * sizeof(u32));
87 relay_write(kt->rchan, &rec, size);
88}
89
90static struct kvm_trace_probe kvm_trace_probes[] = {
91 { "kvm_trace_entryexit", "%u %p %u %u %u %u %u %u", 1, kvm_add_trace },
92 { "kvm_trace_handler", "%u %p %u %u %u %u %u %u", 0, kvm_add_trace },
93};
94
95static int lost_records_get(void *data, u64 *val)
96{
97 struct kvm_trace *kt = data;
98
99 *val = atomic_read(&kt->lost_records);
100 return 0;
101}
102
103DEFINE_SIMPLE_ATTRIBUTE(kvm_trace_lost_ops, lost_records_get, NULL, "%llu\n");
104
105/*
106 * The relay channel is used in "no-overwrite" mode, it keeps trace of how
107 * many times we encountered a full subbuffer, to tell user space app the
108 * lost records there were.
109 */
110static int kvm_subbuf_start_callback(struct rchan_buf *buf, void *subbuf,
111 void *prev_subbuf, size_t prev_padding)
112{
113 struct kvm_trace *kt;
114
115 if (!relay_buf_full(buf)) {
116 if (!prev_subbuf) {
117 /*
118 * executed only once when the channel is opened
119 * save metadata as first record
120 */
121 subbuf_start_reserve(buf, sizeof(u32));
122 *(u32 *)subbuf = 0x12345678;
123 }
124
125 return 1;
126 }
127
128 kt = buf->chan->private_data;
129 atomic_inc(&kt->lost_records);
130
131 return 0;
132}
133
134static struct dentry *kvm_create_buf_file_callack(const char *filename,
135 struct dentry *parent,
136 int mode,
137 struct rchan_buf *buf,
138 int *is_global)
139{
140 return debugfs_create_file(filename, mode, parent, buf,
141 &relay_file_operations);
142}
143
/* relay "remove_buf_file" hook: remove the debugfs entry; always returns 0. */
static int kvm_remove_buf_file_callback(struct dentry *dentry)
{
	debugfs_remove(dentry);

	return 0;
}
149
150static struct rchan_callbacks kvm_relay_callbacks = {
151 .subbuf_start = kvm_subbuf_start_callback,
152 .create_buf_file = kvm_create_buf_file_callack,
153 .remove_buf_file = kvm_remove_buf_file_callback,
154};
155
156static int do_kvm_trace_enable(struct kvm_user_trace_setup *kuts)
157{
158 struct kvm_trace *kt;
159 int i, r = -ENOMEM;
160
161 if (!kuts->buf_size || !kuts->buf_nr)
162 return -EINVAL;
163
164 kt = kzalloc(sizeof(*kt), GFP_KERNEL);
165 if (!kt)
166 goto err;
167
168 r = -EIO;
169 atomic_set(&kt->lost_records, 0);
170 kt->lost_file = debugfs_create_file("lost_records", 0444, kvm_debugfs_dir,
171 kt, &kvm_trace_lost_ops);
172 if (!kt->lost_file)
173 goto err;
174
175 kt->rchan = relay_open("trace", kvm_debugfs_dir, kuts->buf_size,
176 kuts->buf_nr, &kvm_relay_callbacks, kt);
177 if (!kt->rchan)
178 goto err;
179
180 kvm_trace = kt;
181
182 for (i = 0; i < ARRAY_SIZE(kvm_trace_probes); i++) {
183 struct kvm_trace_probe *p = &kvm_trace_probes[i];
184
185 r = marker_probe_register(p->name, p->format, p->probe_func, p);
186 if (r)
187 printk(KERN_INFO "Unable to register probe %s\n",
188 p->name);
189 }
190
191 kvm_trace->trace_state = KVM_TRACE_STATE_RUNNING;
192
193 return 0;
194err:
195 if (kt) {
196 if (kt->lost_file)
197 debugfs_remove(kt->lost_file);
198 if (kt->rchan)
199 relay_close(kt->rchan);
200 kfree(kt);
201 }
202 return r;
203}
204
205static int kvm_trace_enable(char __user *arg)
206{
207 struct kvm_user_trace_setup kuts;
208 int ret;
209
210 ret = copy_from_user(&kuts, arg, sizeof(kuts));
211 if (ret)
212 return -EFAULT;
213
214 ret = do_kvm_trace_enable(&kuts);
215 if (ret)
216 return ret;
217
218 return 0;
219}
220
221static int kvm_trace_pause(void)
222{
223 struct kvm_trace *kt = kvm_trace;
224 int r = -EINVAL;
225
226 if (kt == NULL)
227 return r;
228
229 if (kt->trace_state == KVM_TRACE_STATE_RUNNING) {
230 kt->trace_state = KVM_TRACE_STATE_PAUSE;
231 relay_flush(kt->rchan);
232 r = 0;
233 }
234
235 return r;
236}
237
238void kvm_trace_cleanup(void)
239{
240 struct kvm_trace *kt = kvm_trace;
241 int i;
242
243 if (kt == NULL)
244 return;
245
246 if (kt->trace_state == KVM_TRACE_STATE_RUNNING ||
247 kt->trace_state == KVM_TRACE_STATE_PAUSE) {
248
249 kt->trace_state = KVM_TRACE_STATE_CLEARUP;
250
251 for (i = 0; i < ARRAY_SIZE(kvm_trace_probes); i++) {
252 struct kvm_trace_probe *p = &kvm_trace_probes[i];
253 marker_probe_unregister(p->name, p->probe_func, p);
254 }
255 marker_synchronize_unregister();
256
257 relay_close(kt->rchan);
258 debugfs_remove(kt->lost_file);
259 kfree(kt);
260 }
261}
262
263int kvm_trace_ioctl(unsigned int ioctl, unsigned long arg)
264{
265 void __user *argp = (void __user *)arg;
266 long r = -EINVAL;
267
268 if (!capable(CAP_SYS_ADMIN))
269 return -EPERM;
270
271 switch (ioctl) {
272 case KVM_TRACE_ENABLE:
273 r = kvm_trace_enable(argp);
274 break;
275 case KVM_TRACE_PAUSE:
276 r = kvm_trace_pause();
277 break;
278 case KVM_TRACE_DISABLE:
279 r = 0;
280 kvm_trace_cleanup();
281 break;
282 }
283
284 return r;
285}