aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2010-10-24 15:47:25 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2010-10-24 15:47:25 -0400
commit1765a1fe5d6f82c0eceb1ad10594cfc83759b6d0 (patch)
treea701020f0fa3a1932a36d174c5fffd20496303a9
parentbdaf12b41235b0c59949914de022341e77907461 (diff)
parent2a31339aa014c0d0b97c57d3ebc997732f8f47fc (diff)
Merge branch 'kvm-updates/2.6.37' of git://git.kernel.org/pub/scm/virt/kvm/kvm
* 'kvm-updates/2.6.37' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (321 commits) KVM: Drop CONFIG_DMAR dependency around kvm_iommu_map_pages KVM: Fix signature of kvm_iommu_map_pages stub KVM: MCE: Send SRAR SIGBUS directly KVM: MCE: Add MCG_SER_P into KVM_MCE_CAP_SUPPORTED KVM: fix typo in copyright notice KVM: Disable interrupts around get_kernel_ns() KVM: MMU: Avoid sign extension in mmu_alloc_direct_roots() pae root address KVM: MMU: move access code parsing to FNAME(walk_addr) function KVM: MMU: audit: check whether have unsync sps after root sync KVM: MMU: audit: introduce audit_printk to cleanup audit code KVM: MMU: audit: unregister audit tracepoints before module unloaded KVM: MMU: audit: fix vcpu's spte walking KVM: MMU: set access bit for direct mapping KVM: MMU: cleanup for error mask set while walk guest page table KVM: MMU: update 'root_hpa' out of loop in PAE shadow path KVM: x86 emulator: Eliminate compilation warning in x86_decode_insn() KVM: x86: Fix constant type in kvm_get_time_scale KVM: VMX: Add AX to list of registers clobbered by guest switch KVM guest: Move a printk that's using the clock before it's ready KVM: x86: TSC catchup mode ...
-rw-r--r--Documentation/kernel-parameters.txt8
-rw-r--r--Documentation/kvm/api.txt61
-rw-r--r--Documentation/kvm/ppc-pv.txt196
-rw-r--r--Documentation/kvm/timekeeping.txt612
-rw-r--r--arch/ia64/kvm/lapic.h1
-rw-r--r--arch/powerpc/include/asm/kvm.h1
-rw-r--r--arch/powerpc/include/asm/kvm_asm.h4
-rw-r--r--arch/powerpc/include/asm/kvm_book3s.h31
-rw-r--r--arch/powerpc/include/asm/kvm_host.h21
-rw-r--r--arch/powerpc/include/asm/kvm_para.h139
-rw-r--r--arch/powerpc/include/asm/kvm_ppc.h1
-rw-r--r--arch/powerpc/kernel/Makefile2
-rw-r--r--arch/powerpc/kernel/asm-offsets.c25
-rw-r--r--arch/powerpc/kernel/exceptions-64s.S6
-rw-r--r--arch/powerpc/kernel/head_64.S6
-rw-r--r--arch/powerpc/kernel/kvm.c596
-rw-r--r--arch/powerpc/kernel/kvm_emul.S302
-rw-r--r--arch/powerpc/kvm/44x.c10
-rw-r--r--arch/powerpc/kvm/44x_tlb.c9
-rw-r--r--arch/powerpc/kvm/book3s.c272
-rw-r--r--arch/powerpc/kvm/book3s_32_mmu.c111
-rw-r--r--arch/powerpc/kvm/book3s_32_mmu_host.c75
-rw-r--r--arch/powerpc/kvm/book3s_64_mmu.c42
-rw-r--r--arch/powerpc/kvm/book3s_64_mmu_host.c74
-rw-r--r--arch/powerpc/kvm/book3s_emulate.c73
-rw-r--r--arch/powerpc/kvm/book3s_mmu_hpte.c140
-rw-r--r--arch/powerpc/kvm/book3s_paired_singles.c11
-rw-r--r--arch/powerpc/kvm/book3s_rmhandlers.S32
-rw-r--r--arch/powerpc/kvm/booke.c108
-rw-r--r--arch/powerpc/kvm/booke.h10
-rw-r--r--arch/powerpc/kvm/booke_emulate.c14
-rw-r--r--arch/powerpc/kvm/booke_interrupts.S3
-rw-r--r--arch/powerpc/kvm/e500.c7
-rw-r--r--arch/powerpc/kvm/e500_tlb.c18
-rw-r--r--arch/powerpc/kvm/e500_tlb.h2
-rw-r--r--arch/powerpc/kvm/emulate.c36
-rw-r--r--arch/powerpc/kvm/powerpc.c88
-rw-r--r--arch/powerpc/kvm/trace.h239
-rw-r--r--arch/powerpc/platforms/Kconfig10
-rw-r--r--arch/s390/include/asm/Kbuild1
-rw-r--r--arch/s390/include/asm/kvm_virtio.h7
-rw-r--r--arch/x86/include/asm/kvm_emulate.h30
-rw-r--r--arch/x86/include/asm/kvm_host.h81
-rw-r--r--arch/x86/include/asm/kvm_para.h6
-rw-r--r--arch/x86/include/asm/msr-index.h1
-rw-r--r--arch/x86/include/asm/pvclock.h38
-rw-r--r--arch/x86/kernel/kvmclock.c6
-rw-r--r--arch/x86/kernel/pvclock.c3
-rw-r--r--arch/x86/kvm/Kconfig7
-rw-r--r--arch/x86/kvm/emulate.c2262
-rw-r--r--arch/x86/kvm/i8254.c11
-rw-r--r--arch/x86/kvm/i8259.c25
-rw-r--r--arch/x86/kvm/irq.c9
-rw-r--r--arch/x86/kvm/irq.h2
-rw-r--r--arch/x86/kvm/kvm_cache_regs.h9
-rw-r--r--arch/x86/kvm/lapic.c15
-rw-r--r--arch/x86/kvm/mmu.c918
-rw-r--r--arch/x86/kvm/mmu.h9
-rw-r--r--arch/x86/kvm/mmu_audit.c299
-rw-r--r--arch/x86/kvm/mmutrace.h19
-rw-r--r--arch/x86/kvm/paging_tmpl.h202
-rw-r--r--arch/x86/kvm/svm.c283
-rw-r--r--arch/x86/kvm/timer.c2
-rw-r--r--arch/x86/kvm/vmx.c219
-rw-r--r--arch/x86/kvm/x86.c780
-rw-r--r--arch/x86/kvm/x86.h8
-rw-r--r--drivers/s390/kvm/kvm_virtio.c66
-rw-r--r--include/linux/kvm.h12
-rw-r--r--include/linux/kvm_host.h22
-rw-r--r--include/linux/kvm_para.h7
-rw-r--r--mm/util.c13
-rw-r--r--virt/kvm/irq_comm.c2
-rw-r--r--virt/kvm/kvm_main.c84
73 files changed, 6583 insertions, 2271 deletions
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 4cd8b86e00ea..9533af74a127 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1131,9 +1131,13 @@ and is between 256 and 4096 characters. It is defined in the file
1131 kvm.oos_shadow= [KVM] Disable out-of-sync shadow paging. 1131 kvm.oos_shadow= [KVM] Disable out-of-sync shadow paging.
1132 Default is 1 (enabled) 1132 Default is 1 (enabled)
1133 1133
1134 kvm-amd.nested= [KVM,AMD] Allow nested virtualization in KVM/SVM. 1134 kvm.mmu_audit= [KVM] This is a R/W parameter which allows audit
1135 KVM MMU at runtime.
1135 Default is 0 (off) 1136 Default is 0 (off)
1136 1137
1138 kvm-amd.nested= [KVM,AMD] Allow nested virtualization in KVM/SVM.
1139 Default is 1 (enabled)
1140
1137 kvm-amd.npt= [KVM,AMD] Disable nested paging (virtualized MMU) 1141 kvm-amd.npt= [KVM,AMD] Disable nested paging (virtualized MMU)
1138 for all guests. 1142 for all guests.
1139 Default is 1 (enabled) if in 64bit or 32bit-PAE mode 1143 Default is 1 (enabled) if in 64bit or 32bit-PAE mode
@@ -1698,6 +1702,8 @@ and is between 256 and 4096 characters. It is defined in the file
1698 1702
1699 nojitter [IA64] Disables jitter checking for ITC timers. 1703 nojitter [IA64] Disables jitter checking for ITC timers.
1700 1704
1705 no-kvmclock [X86,KVM] Disable paravirtualized KVM clock driver
1706
1701 nolapic [X86-32,APIC] Do not enable or use the local APIC. 1707 nolapic [X86-32,APIC] Do not enable or use the local APIC.
1702 1708
1703 nolapic_timer [X86-32,APIC] Do not use the local APIC timer. 1709 nolapic_timer [X86-32,APIC] Do not use the local APIC timer.
diff --git a/Documentation/kvm/api.txt b/Documentation/kvm/api.txt
index 5f5b64982b1a..b336266bea5e 100644
--- a/Documentation/kvm/api.txt
+++ b/Documentation/kvm/api.txt
@@ -320,13 +320,13 @@ struct kvm_translation {
3204.15 KVM_INTERRUPT 3204.15 KVM_INTERRUPT
321 321
322Capability: basic 322Capability: basic
323Architectures: x86 323Architectures: x86, ppc
324Type: vcpu ioctl 324Type: vcpu ioctl
325Parameters: struct kvm_interrupt (in) 325Parameters: struct kvm_interrupt (in)
326Returns: 0 on success, -1 on error 326Returns: 0 on success, -1 on error
327 327
328Queues a hardware interrupt vector to be injected. This is only 328Queues a hardware interrupt vector to be injected. This is only
329useful if in-kernel local APIC is not used. 329useful if in-kernel local APIC or equivalent is not used.
330 330
331/* for KVM_INTERRUPT */ 331/* for KVM_INTERRUPT */
332struct kvm_interrupt { 332struct kvm_interrupt {
@@ -334,8 +334,37 @@ struct kvm_interrupt {
334 __u32 irq; 334 __u32 irq;
335}; 335};
336 336
337X86:
338
337Note 'irq' is an interrupt vector, not an interrupt pin or line. 339Note 'irq' is an interrupt vector, not an interrupt pin or line.
338 340
341PPC:
342
343Queues an external interrupt to be injected. This ioctl is overleaded
344with 3 different irq values:
345
346a) KVM_INTERRUPT_SET
347
348 This injects an edge type external interrupt into the guest once it's ready
349 to receive interrupts. When injected, the interrupt is done.
350
351b) KVM_INTERRUPT_UNSET
352
353 This unsets any pending interrupt.
354
355 Only available with KVM_CAP_PPC_UNSET_IRQ.
356
357c) KVM_INTERRUPT_SET_LEVEL
358
359 This injects a level type external interrupt into the guest context. The
360 interrupt stays pending until a specific ioctl with KVM_INTERRUPT_UNSET
361 is triggered.
362
363 Only available with KVM_CAP_PPC_IRQ_LEVEL.
364
365Note that any value for 'irq' other than the ones stated above is invalid
366and incurs unexpected behavior.
367
3394.16 KVM_DEBUG_GUEST 3684.16 KVM_DEBUG_GUEST
340 369
341Capability: basic 370Capability: basic
@@ -1013,8 +1042,9 @@ number is just right, the 'nent' field is adjusted to the number of valid
1013entries in the 'entries' array, which is then filled. 1042entries in the 'entries' array, which is then filled.
1014 1043
1015The entries returned are the host cpuid as returned by the cpuid instruction, 1044The entries returned are the host cpuid as returned by the cpuid instruction,
1016with unknown or unsupported features masked out. The fields in each entry 1045with unknown or unsupported features masked out. Some features (for example,
1017are defined as follows: 1046x2apic), may not be present in the host cpu, but are exposed by kvm if it can
1047emulate them efficiently. The fields in each entry are defined as follows:
1018 1048
1019 function: the eax value used to obtain the entry 1049 function: the eax value used to obtain the entry
1020 index: the ecx value used to obtain the entry (for entries that are 1050 index: the ecx value used to obtain the entry (for entries that are
@@ -1032,6 +1062,29 @@ are defined as follows:
1032 eax, ebx, ecx, edx: the values returned by the cpuid instruction for 1062 eax, ebx, ecx, edx: the values returned by the cpuid instruction for
1033 this function/index combination 1063 this function/index combination
1034 1064
10654.46 KVM_PPC_GET_PVINFO
1066
1067Capability: KVM_CAP_PPC_GET_PVINFO
1068Architectures: ppc
1069Type: vm ioctl
1070Parameters: struct kvm_ppc_pvinfo (out)
1071Returns: 0 on success, !0 on error
1072
1073struct kvm_ppc_pvinfo {
1074 __u32 flags;
1075 __u32 hcall[4];
1076 __u8 pad[108];
1077};
1078
1079This ioctl fetches PV specific information that need to be passed to the guest
1080using the device tree or other means from vm context.
1081
1082For now the only implemented piece of information distributed here is an array
1083of 4 instructions that make up a hypercall.
1084
1085If any additional field gets added to this structure later on, a bit for that
1086additional piece of information will be set in the flags bitmap.
1087
10355. The kvm_run structure 10885. The kvm_run structure
1036 1089
1037Application code obtains a pointer to the kvm_run structure by 1090Application code obtains a pointer to the kvm_run structure by
diff --git a/Documentation/kvm/ppc-pv.txt b/Documentation/kvm/ppc-pv.txt
new file mode 100644
index 000000000000..a7f2244b3be9
--- /dev/null
+++ b/Documentation/kvm/ppc-pv.txt
@@ -0,0 +1,196 @@
1The PPC KVM paravirtual interface
2=================================
3
4The basic execution principle by which KVM on PowerPC works is to run all kernel
5space code in PR=1 which is user space. This way we trap all privileged
6instructions and can emulate them accordingly.
7
8Unfortunately that is also the downfall. There are quite some privileged
9instructions that needlessly return us to the hypervisor even though they
10could be handled differently.
11
12This is what the PPC PV interface helps with. It takes privileged instructions
13and transforms them into unprivileged ones with some help from the hypervisor.
14This cuts down virtualization costs by about 50% on some of my benchmarks.
15
16The code for that interface can be found in arch/powerpc/kernel/kvm*
17
18Querying for existence
19======================
20
21To find out if we're running on KVM or not, we leverage the device tree. When
22Linux is running on KVM, a node /hypervisor exists. That node contains a
23compatible property with the value "linux,kvm".
24
25Once you determined you're running under a PV capable KVM, you can now use
26hypercalls as described below.
27
28KVM hypercalls
29==============
30
31Inside the device tree's /hypervisor node there's a property called
32'hypercall-instructions'. This property contains at most 4 opcodes that make
33up the hypercall. To call a hypercall, just call these instructions.
34
35The parameters are as follows:
36
37 Register IN OUT
38
39 r0 - volatile
40 r3 1st parameter Return code
41 r4 2nd parameter 1st output value
42 r5 3rd parameter 2nd output value
43 r6 4th parameter 3rd output value
44 r7 5th parameter 4th output value
45 r8 6th parameter 5th output value
46 r9 7th parameter 6th output value
47 r10 8th parameter 7th output value
48 r11 hypercall number 8th output value
49 r12 - volatile
50
51Hypercall definitions are shared in generic code, so the same hypercall numbers
52apply for x86 and powerpc alike with the exception that each KVM hypercall
53also needs to be ORed with the KVM vendor code which is (42 << 16).
54
55Return codes can be as follows:
56
57 Code Meaning
58
59 0 Success
60 12 Hypercall not implemented
61 <0 Error
62
63The magic page
64==============
65
66To enable communication between the hypervisor and guest there is a new shared
67page that contains parts of supervisor visible register state. The guest can
68map this shared page using the KVM hypercall KVM_HC_PPC_MAP_MAGIC_PAGE.
69
70With this hypercall issued the guest always gets the magic page mapped at the
71desired location in effective and physical address space. For now, we always
72map the page to -4096. This way we can access it using absolute load and store
73functions. The following instruction reads the first field of the magic page:
74
75 ld rX, -4096(0)
76
77The interface is designed to be extensible should there be need later to add
78additional registers to the magic page. If you add fields to the magic page,
79also define a new hypercall feature to indicate that the host can give you more
80registers. Only if the host supports the additional features, make use of them.
81
82The magic page has the following layout as described in
83arch/powerpc/include/asm/kvm_para.h:
84
85struct kvm_vcpu_arch_shared {
86 __u64 scratch1;
87 __u64 scratch2;
88 __u64 scratch3;
89 __u64 critical; /* Guest may not get interrupts if == r1 */
90 __u64 sprg0;
91 __u64 sprg1;
92 __u64 sprg2;
93 __u64 sprg3;
94 __u64 srr0;
95 __u64 srr1;
96 __u64 dar;
97 __u64 msr;
98 __u32 dsisr;
99 __u32 int_pending; /* Tells the guest if we have an interrupt */
100};
101
102Additions to the page must only occur at the end. Struct fields are always 32
103or 64 bit aligned, depending on them being 32 or 64 bit wide respectively.
104
105Magic page features
106===================
107
108When mapping the magic page using the KVM hypercall KVM_HC_PPC_MAP_MAGIC_PAGE,
109a second return value is passed to the guest. This second return value contains
110a bitmap of available features inside the magic page.
111
112The following enhancements to the magic page are currently available:
113
114 KVM_MAGIC_FEAT_SR Maps SR registers r/w in the magic page
115
116For enhanced features in the magic page, please check for the existence of the
117feature before using them!
118
119MSR bits
120========
121
122The MSR contains bits that require hypervisor intervention and bits that do
123not require direct hypervisor intervention because they only get interpreted
124when entering the guest or don't have any impact on the hypervisor's behavior.
125
126The following bits are safe to be set inside the guest:
127
128 MSR_EE
129 MSR_RI
130 MSR_CR
131 MSR_ME
132
133If any other bit changes in the MSR, please still use mtmsr(d).
134
135Patched instructions
136====================
137
138The "ld" and "std" instructions are transormed to "lwz" and "stw" instructions
139respectively on 32 bit systems with an added offset of 4 to accomodate for big
140endianness.
141
142The following is a list of mapping the Linux kernel performs when running as
143guest. Implementing any of those mappings is optional, as the instruction traps
144also act on the shared page. So calling privileged instructions still works as
145before.
146
147From To
148==== ==
149
150mfmsr rX ld rX, magic_page->msr
151mfsprg rX, 0 ld rX, magic_page->sprg0
152mfsprg rX, 1 ld rX, magic_page->sprg1
153mfsprg rX, 2 ld rX, magic_page->sprg2
154mfsprg rX, 3 ld rX, magic_page->sprg3
155mfsrr0 rX ld rX, magic_page->srr0
156mfsrr1 rX ld rX, magic_page->srr1
157mfdar rX ld rX, magic_page->dar
158mfdsisr rX lwz rX, magic_page->dsisr
159
160mtmsr rX std rX, magic_page->msr
161mtsprg 0, rX std rX, magic_page->sprg0
162mtsprg 1, rX std rX, magic_page->sprg1
163mtsprg 2, rX std rX, magic_page->sprg2
164mtsprg 3, rX std rX, magic_page->sprg3
165mtsrr0 rX std rX, magic_page->srr0
166mtsrr1 rX std rX, magic_page->srr1
167mtdar rX std rX, magic_page->dar
168mtdsisr rX stw rX, magic_page->dsisr
169
170tlbsync nop
171
172mtmsrd rX, 0 b <special mtmsr section>
173mtmsr rX b <special mtmsr section>
174
175mtmsrd rX, 1 b <special mtmsrd section>
176
177[Book3S only]
178mtsrin rX, rY b <special mtsrin section>
179
180[BookE only]
181wrteei [0|1] b <special wrteei section>
182
183
184Some instructions require more logic to determine what's going on than a load
185or store instruction can deliver. To enable patching of those, we keep some
186RAM around where we can live translate instructions to. What happens is the
187following:
188
189 1) copy emulation code to memory
190 2) patch that code to fit the emulated instruction
191 3) patch that code to return to the original pc + 4
192 4) patch the original instruction to branch to the new code
193
194That way we can inject an arbitrary amount of code as replacement for a single
195instruction. This allows us to check for pending interrupts when setting EE=1
196for example.
diff --git a/Documentation/kvm/timekeeping.txt b/Documentation/kvm/timekeeping.txt
new file mode 100644
index 000000000000..0c5033a58c9e
--- /dev/null
+++ b/Documentation/kvm/timekeeping.txt
@@ -0,0 +1,612 @@
1
2 Timekeeping Virtualization for X86-Based Architectures
3
4 Zachary Amsden <zamsden@redhat.com>
5 Copyright (c) 2010, Red Hat. All rights reserved.
6
71) Overview
82) Timing Devices
93) TSC Hardware
104) Virtualization Problems
11
12=========================================================================
13
141) Overview
15
16One of the most complicated parts of the X86 platform, and specifically,
17the virtualization of this platform is the plethora of timing devices available
18and the complexity of emulating those devices. In addition, virtualization of
19time introduces a new set of challenges because it introduces a multiplexed
20division of time beyond the control of the guest CPU.
21
22First, we will describe the various timekeeping hardware available, then
23present some of the problems which arise and solutions available, giving
24specific recommendations for certain classes of KVM guests.
25
26The purpose of this document is to collect data and information relevant to
27timekeeping which may be difficult to find elsewhere, specifically,
28information relevant to KVM and hardware-based virtualization.
29
30=========================================================================
31
322) Timing Devices
33
34First we discuss the basic hardware devices available. TSC and the related
35KVM clock are special enough to warrant a full exposition and are described in
36the following section.
37
382.1) i8254 - PIT
39
40One of the first timer devices available is the programmable interrupt timer,
41or PIT. The PIT has a fixed frequency 1.193182 MHz base clock and three
42channels which can be programmed to deliver periodic or one-shot interrupts.
43These three channels can be configured in different modes and have individual
44counters. Channel 1 and 2 were not available for general use in the original
45IBM PC, and historically were connected to control RAM refresh and the PC
46speaker. Now the PIT is typically integrated as part of an emulated chipset
47and a separate physical PIT is not used.
48
49The PIT uses I/O ports 0x40 - 0x43. Access to the 16-bit counters is done
50using single or multiple byte access to the I/O ports. There are 6 modes
51available, but not all modes are available to all timers, as only timer 2
52has a connected gate input, required for modes 1 and 5. The gate line is
53controlled by port 61h, bit 0, as illustrated in the following diagram.
54
55 -------------- ----------------
56| | | |
57| 1.1932 MHz |---------->| CLOCK OUT | ---------> IRQ 0
58| Clock | | | |
59 -------------- | +->| GATE TIMER 0 |
60 | ----------------
61 |
62 | ----------------
63 | | |
64 |------>| CLOCK OUT | ---------> 66.3 KHZ DRAM
65 | | | (aka /dev/null)
66 | +->| GATE TIMER 1 |
67 | ----------------
68 |
69 | ----------------
70 | | |
71 |------>| CLOCK OUT | ---------> Port 61h, bit 5
72 | | |
73Port 61h, bit 0 ---------->| GATE TIMER 2 | \_.---- ____
74 ---------------- _| )--|LPF|---Speaker
75 / *---- \___/
76Port 61h, bit 1 -----------------------------------/
77
78The timer modes are now described.
79
80Mode 0: Single Timeout. This is a one-shot software timeout that counts down
81 when the gate is high (always true for timers 0 and 1). When the count
82 reaches zero, the output goes high.
83
84Mode 1: Triggered One-shot. The output is intially set high. When the gate
85 line is set high, a countdown is initiated (which does not stop if the gate is
86 lowered), during which the output is set low. When the count reaches zero,
87 the output goes high.
88
89Mode 2: Rate Generator. The output is initially set high. When the countdown
90 reaches 1, the output goes low for one count and then returns high. The value
91 is reloaded and the countdown automatically resumes. If the gate line goes
92 low, the count is halted. If the output is low when the gate is lowered, the
93 output automatically goes high (this only affects timer 2).
94
95Mode 3: Square Wave. This generates a high / low square wave. The count
96 determines the length of the pulse, which alternates between high and low
97 when zero is reached. The count only proceeds when gate is high and is
98 automatically reloaded on reaching zero. The count is decremented twice at
99 each clock to generate a full high / low cycle at the full periodic rate.
100 If the count is even, the clock remains high for N/2 counts and low for N/2
101 counts; if the clock is odd, the clock is high for (N+1)/2 counts and low
102 for (N-1)/2 counts. Only even values are latched by the counter, so odd
103 values are not observed when reading. This is the intended mode for timer 2,
104 which generates sine-like tones by low-pass filtering the square wave output.
105
106Mode 4: Software Strobe. After programming this mode and loading the counter,
107 the output remains high until the counter reaches zero. Then the output
108 goes low for 1 clock cycle and returns high. The counter is not reloaded.
109 Counting only occurs when gate is high.
110
111Mode 5: Hardware Strobe. After programming and loading the counter, the
112 output remains high. When the gate is raised, a countdown is initiated
113 (which does not stop if the gate is lowered). When the counter reaches zero,
114 the output goes low for 1 clock cycle and then returns high. The counter is
115 not reloaded.
116
117In addition to normal binary counting, the PIT supports BCD counting. The
118command port, 0x43 is used to set the counter and mode for each of the three
119timers.
120
121PIT commands, issued to port 0x43, using the following bit encoding:
122
123Bit 7-4: Command (See table below)
124Bit 3-1: Mode (000 = Mode 0, 101 = Mode 5, 11X = undefined)
125Bit 0 : Binary (0) / BCD (1)
126
127Command table:
128
1290000 - Latch Timer 0 count for port 0x40
130 sample and hold the count to be read in port 0x40;
131 additional commands ignored until counter is read;
132 mode bits ignored.
133
1340001 - Set Timer 0 LSB mode for port 0x40
135 set timer to read LSB only and force MSB to zero;
136 mode bits set timer mode
137
1380010 - Set Timer 0 MSB mode for port 0x40
139 set timer to read MSB only and force LSB to zero;
140 mode bits set timer mode
141
1420011 - Set Timer 0 16-bit mode for port 0x40
143 set timer to read / write LSB first, then MSB;
144 mode bits set timer mode
145
1460100 - Latch Timer 1 count for port 0x41 - as described above
1470101 - Set Timer 1 LSB mode for port 0x41 - as described above
1480110 - Set Timer 1 MSB mode for port 0x41 - as described above
1490111 - Set Timer 1 16-bit mode for port 0x41 - as described above
150
1511000 - Latch Timer 2 count for port 0x42 - as described above
1521001 - Set Timer 2 LSB mode for port 0x42 - as described above
1531010 - Set Timer 2 MSB mode for port 0x42 - as described above
1541011 - Set Timer 2 16-bit mode for port 0x42 as described above
155
1561101 - General counter latch
157 Latch combination of counters into corresponding ports
158 Bit 3 = Counter 2
159 Bit 2 = Counter 1
160 Bit 1 = Counter 0
161 Bit 0 = Unused
162
1631110 - Latch timer status
164 Latch combination of counter mode into corresponding ports
165 Bit 3 = Counter 2
166 Bit 2 = Counter 1
167 Bit 1 = Counter 0
168
169 The output of ports 0x40-0x42 following this command will be:
170
171 Bit 7 = Output pin
172 Bit 6 = Count loaded (0 if timer has expired)
173 Bit 5-4 = Read / Write mode
174 01 = MSB only
175 10 = LSB only
176 11 = LSB / MSB (16-bit)
177 Bit 3-1 = Mode
178 Bit 0 = Binary (0) / BCD mode (1)
179
1802.2) RTC
181
182The second device which was available in the original PC was the MC146818 real
183time clock. The original device is now obsolete, and usually emulated by the
184system chipset, sometimes by an HPET and some frankenstein IRQ routing.
185
186The RTC is accessed through CMOS variables, which uses an index register to
187control which bytes are read. Since there is only one index register, read
188of the CMOS and read of the RTC require lock protection (in addition, it is
189dangerous to allow userspace utilities such as hwclock to have direct RTC
190access, as they could corrupt kernel reads and writes of CMOS memory).
191
192The RTC generates an interrupt which is usually routed to IRQ 8. The interrupt
193can function as a periodic timer, an additional once a day alarm, and can issue
194interrupts after an update of the CMOS registers by the MC146818 is complete.
195The type of interrupt is signalled in the RTC status registers.
196
197The RTC will update the current time fields by battery power even while the
198system is off. The current time fields should not be read while an update is
199in progress, as indicated in the status register.
200
201The clock uses a 32.768kHz crystal, so bits 6-4 of register A should be
202programmed to a 32kHz divider if the RTC is to count seconds.
203
204This is the RAM map originally used for the RTC/CMOS:
205
206Location Size Description
207------------------------------------------
20800h byte Current second (BCD)
20901h byte Seconds alarm (BCD)
21002h byte Current minute (BCD)
21103h byte Minutes alarm (BCD)
21204h byte Current hour (BCD)
21305h byte Hours alarm (BCD)
21406h byte Current day of week (BCD)
21507h byte Current day of month (BCD)
21608h byte Current month (BCD)
21709h byte Current year (BCD)
2180Ah byte Register A
219 bit 7 = Update in progress
220 bit 6-4 = Divider for clock
221 000 = 4.194 MHz
222 001 = 1.049 MHz
223 010 = 32 kHz
224 10X = test modes
225 110 = reset / disable
226 111 = reset / disable
227 bit 3-0 = Rate selection for periodic interrupt
228 000 = periodic timer disabled
229 001 = 3.90625 uS
230 010 = 7.8125 uS
231 011 = .122070 mS
232 100 = .244141 mS
233 ...
234 1101 = 125 mS
235 1110 = 250 mS
236 1111 = 500 mS
2370Bh byte Register B
238 bit 7 = Run (0) / Halt (1)
239 bit 6 = Periodic interrupt enable
240 bit 5 = Alarm interrupt enable
241 bit 4 = Update-ended interrupt enable
242 bit 3 = Square wave interrupt enable
243 bit 2 = BCD calendar (0) / Binary (1)
244 bit 1 = 12-hour mode (0) / 24-hour mode (1)
245 bit 0 = 0 (DST off) / 1 (DST enabled)
246OCh byte Register C (read only)
247 bit 7 = interrupt request flag (IRQF)
248 bit 6 = periodic interrupt flag (PF)
249 bit 5 = alarm interrupt flag (AF)
250 bit 4 = update interrupt flag (UF)
251 bit 3-0 = reserved
252ODh byte Register D (read only)
253 bit 7 = RTC has power
254 bit 6-0 = reserved
25532h byte Current century BCD (*)
256 (*) location vendor specific and now determined from ACPI global tables
257
2582.3) APIC
259
260On Pentium and later processors, an on-board timer is available to each CPU
261as part of the Advanced Programmable Interrupt Controller. The APIC is
262accessed through memory-mapped registers and provides interrupt service to each
263CPU, used for IPIs and local timer interrupts.
264
265Although in theory the APIC is a safe and stable source for local interrupts,
266in practice, many bugs and glitches have occurred due to the special nature of
267the APIC CPU-local memory-mapped hardware. Beware that CPU errata may affect
268the use of the APIC and that workarounds may be required. In addition, some of
269these workarounds pose unique constraints for virtualization - requiring either
270extra overhead incurred from extra reads of memory-mapped I/O or additional
271functionality that may be more computationally expensive to implement.
272
273Since the APIC is documented quite well in the Intel and AMD manuals, we will
274avoid repetition of the detail here. It should be pointed out that the APIC
275timer is programmed through the LVT (local vector timer) register, is capable
276of one-shot or periodic operation, and is based on the bus clock divided down
277by the programmable divider register.
278
2792.4) HPET
280
281HPET is quite complex, and was originally intended to replace the PIT / RTC
282support of the X86 PC. It remains to be seen whether that will be the case, as
283the de facto standard of PC hardware is to emulate these older devices. Some
284systems designated as legacy free may support only the HPET as a hardware timer
285device.
286
287The HPET spec is rather loose and vague, requiring at least 3 hardware timers,
288but allowing implementation freedom to support many more. It also imposes no
289fixed rate on the timer frequency, but does impose some extremal values on
290frequency, error and slew.
291
292In general, the HPET is recommended as a high precision (compared to PIT /RTC)
293time source which is independent of local variation (as there is only one HPET
294in any given system). The HPET is also memory-mapped, and its presence is
295indicated through ACPI tables by the BIOS.
296
297Detailed specification of the HPET is beyond the current scope of this
298document, as it is also very well documented elsewhere.
299
3002.5) Offboard Timers
301
302Several cards, both proprietary (watchdog boards) and commonplace (e1000) have
303timing chips built into the cards which may have registers which are accessible
304to kernel or user drivers. To the author's knowledge, using these to generate
305a clocksource for a Linux or other kernel has not yet been attempted and is in
306general frowned upon as not playing by the agreed rules of the game. Such a
307timer device would require additional support to be virtualized properly and is
308not considered important at this time as no known operating system does this.
309
310=========================================================================
311
3123) TSC Hardware
313
314The TSC or time stamp counter is relatively simple in theory; it counts
315instruction cycles issued by the processor, which can be used as a measure of
316time. In practice, due to a number of problems, it is the most complicated
317timekeeping device to use.
318
319The TSC is represented internally as a 64-bit MSR which can be read with the
320RDMSR, RDTSC, or RDTSCP (when available) instructions. In the past, hardware
321limitations made it possible to write the TSC, but generally on old hardware it
322was only possible to write the low 32-bits of the 64-bit counter, and the upper
32332-bits of the counter were cleared. Now, however, on Intel processors family
3240Fh, for models 3, 4 and 6, and family 06h, models e and f, this restriction
325has been lifted and all 64-bits are writable. On AMD systems, the ability to
326write the TSC MSR is not an architectural guarantee.
327
328The TSC is accessible from CPL-0 and conditionally, for CPL > 0 software by
329means of the CR4.TSD bit, which when enabled, disables CPL > 0 TSC access.
330
331Some vendors have implemented an additional instruction, RDTSCP, which returns
332atomically not just the TSC, but an indicator which corresponds to the
333processor number. This can be used to index into an array of TSC variables to
334determine offset information in SMP systems where TSCs are not synchronized.
335The presence of this instruction must be determined by consulting CPUID feature
336bits.
337
338Both VMX and SVM provide extension fields in the virtualization hardware which
339allows the guest visible TSC to be offset by a constant. Newer implementations
340promise to allow the TSC to additionally be scaled, but this hardware is not
341yet widely available.
342
3433.1) TSC synchronization
344
345The TSC is a CPU-local clock in most implementations. This means, on SMP
346platforms, the TSCs of different CPUs may start at different times depending
347on when the CPUs are powered on. Generally, CPUs on the same die will share
348the same clock, however, this is not always the case.
349
350The BIOS may attempt to resynchronize the TSCs during the poweron process and
351the operating system or other system software may attempt to do this as well.
352Several hardware limitations make the problem worse - if it is not possible to
353write the full 64-bits of the TSC, it may be impossible to match the TSC in
354newly arriving CPUs to that of the rest of the system, resulting in
355unsynchronized TSCs. This may be done by BIOS or system software, but in
356practice, getting a perfectly synchronized TSC will not be possible unless all
357values are read from the same clock, which generally only is possible on single
358socket systems or those with special hardware support.
359
3603.2) TSC and CPU hotplug
361
362As touched on already, CPUs which arrive later than the boot time of the system
363may not have a TSC value that is synchronized with the rest of the system.
364Either system software, BIOS, or SMM code may actually try to establish the TSC
365to a value matching the rest of the system, but a perfect match is usually not
366a guarantee. This can have the effect of bringing a system from a state where
367TSC is synchronized back to a state where TSC synchronization flaws, however
368small, may be exposed to the OS and any virtualization environment.
369
3703.3) TSC and multi-socket / NUMA
371
372Multi-socket systems, especially large multi-socket systems are likely to have
373individual clocksources rather than a single, universally distributed clock.
374Since these clocks are driven by different crystals, they will not have
375perfectly matched frequency, and temperature and electrical variations will
376cause the CPU clocks, and thus the TSCs to drift over time. Depending on the
377exact clock and bus design, the drift may or may not be fixed in absolute
378error, and may accumulate over time.
379
380In addition, very large systems may deliberately slew the clocks of individual
381cores. This technique, known as spread-spectrum clocking, reduces EMI at the
382clock frequency and harmonics of it, which may be required to pass FCC
383standards for telecommunications and computer equipment.
384
385It is recommended not to trust the TSCs to remain synchronized on NUMA or
386multiple socket systems for these reasons.
387
3883.4) TSC and C-states
389
390C-states, or idling states of the processor, especially C1E and deeper sleep
391states may be problematic for TSC as well. The TSC may stop advancing in such
392a state, resulting in a TSC which is behind that of other CPUs when execution
393is resumed. Such CPUs must be detected and flagged by the operating system
394based on CPU and chipset identifications.
395
396The TSC in such a case may be corrected by catching it up to a known external
397clocksource.
398
3993.5) TSC frequency change / P-states
400
401To make things slightly more interesting, some CPUs may change frequency. They
402may or may not run the TSC at the same rate, and because the frequency change
403may be staggered or slewed, at some points in time, the TSC rate may not be
404known other than falling within a range of values. In this case, the TSC will
405not be a stable time source, and must be calibrated against a known, stable,
406external clock to be a usable source of time.
407
408Whether the TSC runs at a constant rate or scales with the P-state is model
409dependent and must be determined by inspecting CPUID, chipset or vendor
410specific MSR fields.
411
412In addition, some vendors have known bugs where the P-state is actually
413compensated for properly during normal operation, but when the processor is
414inactive, the P-state may be raised temporarily to service cache misses from
415other processors. In such cases, the TSC on halted CPUs could advance faster
416than that of non-halted processors. AMD Turion processors are known to have
417this problem.
418
4193.6) TSC and STPCLK / T-states
420
421External signals given to the processor may also have the effect of stopping
422the TSC. This is typically done for thermal emergency power control to prevent
423an overheating condition, and typically, there is no way to detect that this
424condition has happened.
425
4263.7) TSC virtualization - VMX
427
428VMX provides conditional trapping of RDTSC, RDMSR, WRMSR and RDTSCP
429instructions, which is enough for full virtualization of TSC in any manner. In
430addition, VMX allows passing through the host TSC plus an additional TSC_OFFSET
431field specified in the VMCS. Special instructions must be used to read and
432write the VMCS field.
433
4343.8) TSC virtualization - SVM
435
436SVM provides conditional trapping of RDTSC, RDMSR, WRMSR and RDTSCP
437instructions, which is enough for full virtualization of TSC in any manner. In
438addition, SVM allows passing through the host TSC plus an additional offset
439field specified in the SVM control block.
440
4413.9) TSC feature bits in Linux
442
443In summary, there is no way to guarantee the TSC remains in perfect
444synchronization unless it is explicitly guaranteed by the architecture. Even
445if so, the TSCs in multi-sockets or NUMA systems may still run independently
446despite being locally consistent.
447
448The following feature bits are used by Linux to signal various TSC attributes,
449but they can only be taken to be meaningful for UP or single node systems.
450
451X86_FEATURE_TSC : The TSC is available in hardware
452X86_FEATURE_RDTSCP : The RDTSCP instruction is available
453X86_FEATURE_CONSTANT_TSC : The TSC rate is unchanged with P-states
454X86_FEATURE_NONSTOP_TSC : The TSC does not stop in C-states
455X86_FEATURE_TSC_RELIABLE : TSC sync checks are skipped (VMware)
456
4574) Virtualization Problems
458
459Timekeeping is especially problematic for virtualization because a number of
460challenges arise. The most obvious problem is that time is now shared between
461the host and, potentially, a number of virtual machines. Thus the virtual
462operating system does not run with 100% usage of the CPU, despite the fact that
463it may very well make that assumption. It may expect it to remain true to very
464exacting bounds when interrupt sources are disabled, but in reality only its
465virtual interrupt sources are disabled, and the machine may still be preempted
466at any time. This causes problems as the passage of real time, the injection
467of machine interrupts and the associated clock sources are no longer completely
468synchronized with real time.
469
470This same problem can occur on native harware to a degree, as SMM mode may
471steal cycles from the naturally on X86 systems when SMM mode is used by the
472BIOS, but not in such an extreme fashion. However, the fact that SMM mode may
473cause similar problems to virtualization makes it a good justification for
474solving many of these problems on bare metal.
475
4764.1) Interrupt clocking
477
478One of the most immediate problems that occurs with legacy operating systems
479is that the system timekeeping routines are often designed to keep track of
480time by counting periodic interrupts. These interrupts may come from the PIT
481or the RTC, but the problem is the same: the host virtualization engine may not
482be able to deliver the proper number of interrupts per second, and so guest
483time may fall behind. This is especially problematic if a high interrupt rate
484is selected, such as 1000 HZ, which is unfortunately the default for many Linux
485guests.
486
487There are three approaches to solving this problem; first, it may be possible
488to simply ignore it. Guests which have a separate time source for tracking
489'wall clock' or 'real time' may not need any adjustment of their interrupts to
490maintain proper time. If this is not sufficient, it may be necessary to inject
491additional interrupts into the guest in order to increase the effective
492interrupt rate. This approach leads to complications in extreme conditions,
493where host load or guest lag is too much to compensate for, and thus another
494solution to the problem has risen: the guest may need to become aware of lost
495ticks and compensate for them internally. Although promising in theory, the
496implementation of this policy in Linux has been extremely error prone, and a
497number of buggy variants of lost tick compensation are distributed across
498commonly used Linux systems.
499
500Windows uses periodic RTC clocking as a means of keeping time internally, and
501thus requires interrupt slewing to keep proper time. It does use a low enough
502rate (ed: is it 18.2 Hz?) however that it has not yet been a problem in
503practice.
504
5054.2) TSC sampling and serialization
506
507As the highest precision time source available, the cycle counter of the CPU
508has aroused much interest from developers. As explained above, this timer has
509many problems unique to its nature as a local, potentially unstable and
510potentially unsynchronized source. One issue which is not unique to the TSC,
511but is highlighted because of its very precise nature is sampling delay. By
512definition, the counter, once read is already old. However, it is also
513possible for the counter to be read ahead of the actual use of the result.
514This is a consequence of the superscalar execution of the instruction stream,
515which may execute instructions out of order. Such execution is called
516non-serialized. Forcing serialized execution is necessary for precise
517measurement with the TSC, and requires a serializing instruction, such as CPUID
518or an MSR read.
519
520Since CPUID may actually be virtualized by a trap and emulate mechanism, this
521serialization can pose a performance issue for hardware virtualization. An
522accurate time stamp counter reading may therefore not always be available, and
523it may be necessary for an implementation to guard against "backwards" reads of
524the TSC as seen from other CPUs, even in an otherwise perfectly synchronized
525system.
526
5274.3) Timespec aliasing
528
529Additionally, this lack of serialization from the TSC poses another challenge
530when using results of the TSC when measured against another time source. As
531the TSC is much higher precision, many possible values of the TSC may be read
532while another clock is still expressing the same value.
533
534That is, you may read (T,T+10) while external clock C maintains the same value.
535Due to non-serialized reads, you may actually end up with a range which
536fluctuates - from (T-1.. T+10). Thus, any time calculated from a TSC, but
537calibrated against an external value may have a range of valid values.
538Re-calibrating this computation may actually cause time, as computed after the
539calibration, to go backwards, compared with time computed before the
540calibration.
541
542This problem is particularly pronounced with an internal time source in Linux,
543the kernel time, which is expressed in the theoretically high resolution
544timespec - but which advances in much larger granularity intervals, sometimes
545at the rate of jiffies, and possibly in catchup modes, at a much larger step.
546
547This aliasing requires care in the computation and recalibration of kvmclock
548and any other values derived from TSC computation (such as TSC virtualization
549itself).
550
5514.4) Migration
552
553Migration of a virtual machine raises problems for timekeeping in two ways.
554First, the migration itself may take time, during which interrupts cannot be
555delivered, and after which, the guest time may need to be caught up. NTP may
556be able to help to some degree here, as the clock correction required is
557typically small enough to fall in the NTP-correctable window.
558
559An additional concern is that timers based off the TSC (or HPET, if the raw bus
560clock is exposed) may now be running at different rates, requiring compensation
561in some way in the hypervisor by virtualizing these timers. In addition,
562migrating to a faster machine may preclude the use of a passthrough TSC, as a
563faster clock cannot be made visible to a guest without the potential of time
564advancing faster than usual. A slower clock is less of a problem, as it can
565always be caught up to the original rate. KVM clock avoids these problems by
566simply storing multipliers and offsets against the TSC for the guest to convert
567back into nanosecond resolution values.
568
5694.5) Scheduling
570
571Since scheduling may be based on precise timing and firing of interrupts, the
572scheduling algorithms of an operating system may be adversely affected by
573virtualization. In theory, the effect is random and should be universally
574distributed, but in contrived as well as real scenarios (guest device access,
575causes of virtualization exits, possible context switch), this may not always
576be the case. The effect of this has not been well studied.
577
578In an attempt to work around this, several implementations have provided a
579paravirtualized scheduler clock, which reveals the true amount of CPU time for
580which a virtual machine has been running.
581
5824.6) Watchdogs
583
584Watchdog timers, such as the lock detector in Linux may fire accidentally when
585running under hardware virtualization due to timer interrupts being delayed or
586misinterpretation of the passage of real time. Usually, these warnings are
587spurious and can be ignored, but in some circumstances it may be necessary to
588disable such detection.
589
5904.7) Delays and precision timing
591
592Precise timing and delays may not be possible in a virtualized system. This
593can happen if the system is controlling physical hardware, or issues delays to
594compensate for slower I/O to and from devices. The first issue is not solvable
595in general for a virtualized system; hardware control software can't be
596adequately virtualized without a full real-time operating system, which would
597require an RT aware virtualization platform.
598
599The second issue may cause performance problems, but this is unlikely to be a
600significant issue. In many cases these delays may be eliminated through
601configuration or paravirtualization.
602
6034.8) Covert channels and leaks
604
605In addition to the above problems, time information will inevitably leak to the
606guest about the host in anything but a perfect implementation of virtualized
607time. This may allow the guest to infer the presence of a hypervisor (as in a
608red-pill type detection), and it may allow information to leak between guests
609by using CPU utilization itself as a signalling channel. Preventing such
610problems would require completely isolated virtual time which may not track
611real time any longer. This may be useful in certain security or QA contexts,
612but in general isn't recommended for real-world deployment scenarios.
diff --git a/arch/ia64/kvm/lapic.h b/arch/ia64/kvm/lapic.h
index ee541cebcd78..c5f92a926a9a 100644
--- a/arch/ia64/kvm/lapic.h
+++ b/arch/ia64/kvm/lapic.h
@@ -25,5 +25,6 @@ int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
25int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2); 25int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2);
26int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq); 26int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq);
27#define kvm_apic_present(x) (true) 27#define kvm_apic_present(x) (true)
28#define kvm_lapic_enabled(x) (true)
28 29
29#endif 30#endif
diff --git a/arch/powerpc/include/asm/kvm.h b/arch/powerpc/include/asm/kvm.h
index 6c5547d82bbe..18ea6963ad77 100644
--- a/arch/powerpc/include/asm/kvm.h
+++ b/arch/powerpc/include/asm/kvm.h
@@ -86,5 +86,6 @@ struct kvm_guest_debug_arch {
86 86
87#define KVM_INTERRUPT_SET -1U 87#define KVM_INTERRUPT_SET -1U
88#define KVM_INTERRUPT_UNSET -2U 88#define KVM_INTERRUPT_UNSET -2U
89#define KVM_INTERRUPT_SET_LEVEL -3U
89 90
90#endif /* __LINUX_KVM_POWERPC_H */ 91#endif /* __LINUX_KVM_POWERPC_H */
diff --git a/arch/powerpc/include/asm/kvm_asm.h b/arch/powerpc/include/asm/kvm_asm.h
index c5ea4cda34b3..5b7504674397 100644
--- a/arch/powerpc/include/asm/kvm_asm.h
+++ b/arch/powerpc/include/asm/kvm_asm.h
@@ -58,6 +58,7 @@
58#define BOOK3S_INTERRUPT_INST_STORAGE 0x400 58#define BOOK3S_INTERRUPT_INST_STORAGE 0x400
59#define BOOK3S_INTERRUPT_INST_SEGMENT 0x480 59#define BOOK3S_INTERRUPT_INST_SEGMENT 0x480
60#define BOOK3S_INTERRUPT_EXTERNAL 0x500 60#define BOOK3S_INTERRUPT_EXTERNAL 0x500
61#define BOOK3S_INTERRUPT_EXTERNAL_LEVEL 0x501
61#define BOOK3S_INTERRUPT_ALIGNMENT 0x600 62#define BOOK3S_INTERRUPT_ALIGNMENT 0x600
62#define BOOK3S_INTERRUPT_PROGRAM 0x700 63#define BOOK3S_INTERRUPT_PROGRAM 0x700
63#define BOOK3S_INTERRUPT_FP_UNAVAIL 0x800 64#define BOOK3S_INTERRUPT_FP_UNAVAIL 0x800
@@ -84,7 +85,8 @@
84#define BOOK3S_IRQPRIO_EXTERNAL 13 85#define BOOK3S_IRQPRIO_EXTERNAL 13
85#define BOOK3S_IRQPRIO_DECREMENTER 14 86#define BOOK3S_IRQPRIO_DECREMENTER 14
86#define BOOK3S_IRQPRIO_PERFORMANCE_MONITOR 15 87#define BOOK3S_IRQPRIO_PERFORMANCE_MONITOR 15
87#define BOOK3S_IRQPRIO_MAX 16 88#define BOOK3S_IRQPRIO_EXTERNAL_LEVEL 16
89#define BOOK3S_IRQPRIO_MAX 17
88 90
89#define BOOK3S_HFLAG_DCBZ32 0x1 91#define BOOK3S_HFLAG_DCBZ32 0x1
90#define BOOK3S_HFLAG_SLB 0x2 92#define BOOK3S_HFLAG_SLB 0x2
diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
index 8274a2d43925..d62e703f1214 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -38,15 +38,6 @@ struct kvmppc_slb {
38 bool class : 1; 38 bool class : 1;
39}; 39};
40 40
41struct kvmppc_sr {
42 u32 raw;
43 u32 vsid;
44 bool Ks : 1;
45 bool Kp : 1;
46 bool nx : 1;
47 bool valid : 1;
48};
49
50struct kvmppc_bat { 41struct kvmppc_bat {
51 u64 raw; 42 u64 raw;
52 u32 bepi; 43 u32 bepi;
@@ -69,6 +60,13 @@ struct kvmppc_sid_map {
69#define SID_MAP_NUM (1 << SID_MAP_BITS) 60#define SID_MAP_NUM (1 << SID_MAP_BITS)
70#define SID_MAP_MASK (SID_MAP_NUM - 1) 61#define SID_MAP_MASK (SID_MAP_NUM - 1)
71 62
63#ifdef CONFIG_PPC_BOOK3S_64
64#define SID_CONTEXTS 1
65#else
66#define SID_CONTEXTS 128
67#define VSID_POOL_SIZE (SID_CONTEXTS * 16)
68#endif
69
72struct kvmppc_vcpu_book3s { 70struct kvmppc_vcpu_book3s {
73 struct kvm_vcpu vcpu; 71 struct kvm_vcpu vcpu;
74 struct kvmppc_book3s_shadow_vcpu *shadow_vcpu; 72 struct kvmppc_book3s_shadow_vcpu *shadow_vcpu;
@@ -79,20 +77,22 @@ struct kvmppc_vcpu_book3s {
79 u64 vsid; 77 u64 vsid;
80 } slb_shadow[64]; 78 } slb_shadow[64];
81 u8 slb_shadow_max; 79 u8 slb_shadow_max;
82 struct kvmppc_sr sr[16];
83 struct kvmppc_bat ibat[8]; 80 struct kvmppc_bat ibat[8];
84 struct kvmppc_bat dbat[8]; 81 struct kvmppc_bat dbat[8];
85 u64 hid[6]; 82 u64 hid[6];
86 u64 gqr[8]; 83 u64 gqr[8];
87 int slb_nr; 84 int slb_nr;
88 u32 dsisr;
89 u64 sdr1; 85 u64 sdr1;
90 u64 hior; 86 u64 hior;
91 u64 msr_mask; 87 u64 msr_mask;
92 u64 vsid_first;
93 u64 vsid_next; 88 u64 vsid_next;
89#ifdef CONFIG_PPC_BOOK3S_32
90 u32 vsid_pool[VSID_POOL_SIZE];
91#else
92 u64 vsid_first;
94 u64 vsid_max; 93 u64 vsid_max;
95 int context_id; 94#endif
95 int context_id[SID_CONTEXTS];
96 ulong prog_flags; /* flags to inject when giving a 700 trap */ 96 ulong prog_flags; /* flags to inject when giving a 700 trap */
97}; 97};
98 98
@@ -131,9 +131,10 @@ extern void kvmppc_set_bat(struct kvm_vcpu *vcpu, struct kvmppc_bat *bat,
131 bool upper, u32 val); 131 bool upper, u32 val);
132extern void kvmppc_giveup_ext(struct kvm_vcpu *vcpu, ulong msr); 132extern void kvmppc_giveup_ext(struct kvm_vcpu *vcpu, ulong msr);
133extern int kvmppc_emulate_paired_single(struct kvm_run *run, struct kvm_vcpu *vcpu); 133extern int kvmppc_emulate_paired_single(struct kvm_run *run, struct kvm_vcpu *vcpu);
134extern pfn_t kvmppc_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn);
134 135
135extern u32 kvmppc_trampoline_lowmem; 136extern ulong kvmppc_trampoline_lowmem;
136extern u32 kvmppc_trampoline_enter; 137extern ulong kvmppc_trampoline_enter;
137extern void kvmppc_rmcall(ulong srr0, ulong srr1); 138extern void kvmppc_rmcall(ulong srr0, ulong srr1);
138extern void kvmppc_load_up_fpu(void); 139extern void kvmppc_load_up_fpu(void);
139extern void kvmppc_load_up_altivec(void); 140extern void kvmppc_load_up_altivec(void);
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index b0b23c007d6e..bba3b9b72a39 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -25,6 +25,7 @@
25#include <linux/interrupt.h> 25#include <linux/interrupt.h>
26#include <linux/types.h> 26#include <linux/types.h>
27#include <linux/kvm_types.h> 27#include <linux/kvm_types.h>
28#include <linux/kvm_para.h>
28#include <asm/kvm_asm.h> 29#include <asm/kvm_asm.h>
29 30
30#define KVM_MAX_VCPUS 1 31#define KVM_MAX_VCPUS 1
@@ -41,12 +42,17 @@
41 42
42#define HPTEG_CACHE_NUM (1 << 15) 43#define HPTEG_CACHE_NUM (1 << 15)
43#define HPTEG_HASH_BITS_PTE 13 44#define HPTEG_HASH_BITS_PTE 13
45#define HPTEG_HASH_BITS_PTE_LONG 12
44#define HPTEG_HASH_BITS_VPTE 13 46#define HPTEG_HASH_BITS_VPTE 13
45#define HPTEG_HASH_BITS_VPTE_LONG 5 47#define HPTEG_HASH_BITS_VPTE_LONG 5
46#define HPTEG_HASH_NUM_PTE (1 << HPTEG_HASH_BITS_PTE) 48#define HPTEG_HASH_NUM_PTE (1 << HPTEG_HASH_BITS_PTE)
49#define HPTEG_HASH_NUM_PTE_LONG (1 << HPTEG_HASH_BITS_PTE_LONG)
47#define HPTEG_HASH_NUM_VPTE (1 << HPTEG_HASH_BITS_VPTE) 50#define HPTEG_HASH_NUM_VPTE (1 << HPTEG_HASH_BITS_VPTE)
48#define HPTEG_HASH_NUM_VPTE_LONG (1 << HPTEG_HASH_BITS_VPTE_LONG) 51#define HPTEG_HASH_NUM_VPTE_LONG (1 << HPTEG_HASH_BITS_VPTE_LONG)
49 52
53/* Physical Address Mask - allowed range of real mode RAM access */
54#define KVM_PAM 0x0fffffffffffffffULL
55
50struct kvm; 56struct kvm;
51struct kvm_run; 57struct kvm_run;
52struct kvm_vcpu; 58struct kvm_vcpu;
@@ -159,8 +165,10 @@ struct kvmppc_mmu {
159 165
160struct hpte_cache { 166struct hpte_cache {
161 struct hlist_node list_pte; 167 struct hlist_node list_pte;
168 struct hlist_node list_pte_long;
162 struct hlist_node list_vpte; 169 struct hlist_node list_vpte;
163 struct hlist_node list_vpte_long; 170 struct hlist_node list_vpte_long;
171 struct rcu_head rcu_head;
164 u64 host_va; 172 u64 host_va;
165 u64 pfn; 173 u64 pfn;
166 ulong slot; 174 ulong slot;
@@ -210,28 +218,20 @@ struct kvm_vcpu_arch {
210 u32 cr; 218 u32 cr;
211#endif 219#endif
212 220
213 ulong msr;
214#ifdef CONFIG_PPC_BOOK3S 221#ifdef CONFIG_PPC_BOOK3S
215 ulong shadow_msr; 222 ulong shadow_msr;
216 ulong hflags; 223 ulong hflags;
217 ulong guest_owned_ext; 224 ulong guest_owned_ext;
218#endif 225#endif
219 u32 mmucr; 226 u32 mmucr;
220 ulong sprg0;
221 ulong sprg1;
222 ulong sprg2;
223 ulong sprg3;
224 ulong sprg4; 227 ulong sprg4;
225 ulong sprg5; 228 ulong sprg5;
226 ulong sprg6; 229 ulong sprg6;
227 ulong sprg7; 230 ulong sprg7;
228 ulong srr0;
229 ulong srr1;
230 ulong csrr0; 231 ulong csrr0;
231 ulong csrr1; 232 ulong csrr1;
232 ulong dsrr0; 233 ulong dsrr0;
233 ulong dsrr1; 234 ulong dsrr1;
234 ulong dear;
235 ulong esr; 235 ulong esr;
236 u32 dec; 236 u32 dec;
237 u32 decar; 237 u32 decar;
@@ -290,12 +290,17 @@ struct kvm_vcpu_arch {
290 struct tasklet_struct tasklet; 290 struct tasklet_struct tasklet;
291 u64 dec_jiffies; 291 u64 dec_jiffies;
292 unsigned long pending_exceptions; 292 unsigned long pending_exceptions;
293 struct kvm_vcpu_arch_shared *shared;
294 unsigned long magic_page_pa; /* phys addr to map the magic page to */
295 unsigned long magic_page_ea; /* effect. addr to map the magic page to */
293 296
294#ifdef CONFIG_PPC_BOOK3S 297#ifdef CONFIG_PPC_BOOK3S
295 struct hlist_head hpte_hash_pte[HPTEG_HASH_NUM_PTE]; 298 struct hlist_head hpte_hash_pte[HPTEG_HASH_NUM_PTE];
299 struct hlist_head hpte_hash_pte_long[HPTEG_HASH_NUM_PTE_LONG];
296 struct hlist_head hpte_hash_vpte[HPTEG_HASH_NUM_VPTE]; 300 struct hlist_head hpte_hash_vpte[HPTEG_HASH_NUM_VPTE];
297 struct hlist_head hpte_hash_vpte_long[HPTEG_HASH_NUM_VPTE_LONG]; 301 struct hlist_head hpte_hash_vpte_long[HPTEG_HASH_NUM_VPTE_LONG];
298 int hpte_cache_count; 302 int hpte_cache_count;
303 spinlock_t mmu_lock;
299#endif 304#endif
300}; 305};
301 306
diff --git a/arch/powerpc/include/asm/kvm_para.h b/arch/powerpc/include/asm/kvm_para.h
index 2d48f6a63d0b..50533f9adf40 100644
--- a/arch/powerpc/include/asm/kvm_para.h
+++ b/arch/powerpc/include/asm/kvm_para.h
@@ -20,16 +20,153 @@
20#ifndef __POWERPC_KVM_PARA_H__ 20#ifndef __POWERPC_KVM_PARA_H__
21#define __POWERPC_KVM_PARA_H__ 21#define __POWERPC_KVM_PARA_H__
22 22
23#include <linux/types.h>
24
25struct kvm_vcpu_arch_shared {
26 __u64 scratch1;
27 __u64 scratch2;
28 __u64 scratch3;
29 __u64 critical; /* Guest may not get interrupts if == r1 */
30 __u64 sprg0;
31 __u64 sprg1;
32 __u64 sprg2;
33 __u64 sprg3;
34 __u64 srr0;
35 __u64 srr1;
36 __u64 dar;
37 __u64 msr;
38 __u32 dsisr;
39 __u32 int_pending; /* Tells the guest if we have an interrupt */
40 __u32 sr[16];
41};
42
43#define KVM_SC_MAGIC_R0 0x4b564d21 /* "KVM!" */
44#define HC_VENDOR_KVM (42 << 16)
45#define HC_EV_SUCCESS 0
46#define HC_EV_UNIMPLEMENTED 12
47
48#define KVM_FEATURE_MAGIC_PAGE 1
49
50#define KVM_MAGIC_FEAT_SR (1 << 0)
51
23#ifdef __KERNEL__ 52#ifdef __KERNEL__
24 53
54#ifdef CONFIG_KVM_GUEST
55
56#include <linux/of.h>
57
58static inline int kvm_para_available(void)
59{
60 struct device_node *hyper_node;
61
62 hyper_node = of_find_node_by_path("/hypervisor");
63 if (!hyper_node)
64 return 0;
65
66 if (!of_device_is_compatible(hyper_node, "linux,kvm"))
67 return 0;
68
69 return 1;
70}
71
72extern unsigned long kvm_hypercall(unsigned long *in,
73 unsigned long *out,
74 unsigned long nr);
75
76#else
77
25static inline int kvm_para_available(void) 78static inline int kvm_para_available(void)
26{ 79{
27 return 0; 80 return 0;
28} 81}
29 82
83static unsigned long kvm_hypercall(unsigned long *in,
84 unsigned long *out,
85 unsigned long nr)
86{
87 return HC_EV_UNIMPLEMENTED;
88}
89
90#endif
91
92static inline long kvm_hypercall0_1(unsigned int nr, unsigned long *r2)
93{
94 unsigned long in[8];
95 unsigned long out[8];
96 unsigned long r;
97
98 r = kvm_hypercall(in, out, nr | HC_VENDOR_KVM);
99 *r2 = out[0];
100
101 return r;
102}
103
104static inline long kvm_hypercall0(unsigned int nr)
105{
106 unsigned long in[8];
107 unsigned long out[8];
108
109 return kvm_hypercall(in, out, nr | HC_VENDOR_KVM);
110}
111
112static inline long kvm_hypercall1(unsigned int nr, unsigned long p1)
113{
114 unsigned long in[8];
115 unsigned long out[8];
116
117 in[0] = p1;
118 return kvm_hypercall(in, out, nr | HC_VENDOR_KVM);
119}
120
121static inline long kvm_hypercall2(unsigned int nr, unsigned long p1,
122 unsigned long p2)
123{
124 unsigned long in[8];
125 unsigned long out[8];
126
127 in[0] = p1;
128 in[1] = p2;
129 return kvm_hypercall(in, out, nr | HC_VENDOR_KVM);
130}
131
132static inline long kvm_hypercall3(unsigned int nr, unsigned long p1,
133 unsigned long p2, unsigned long p3)
134{
135 unsigned long in[8];
136 unsigned long out[8];
137
138 in[0] = p1;
139 in[1] = p2;
140 in[2] = p3;
141 return kvm_hypercall(in, out, nr | HC_VENDOR_KVM);
142}
143
144static inline long kvm_hypercall4(unsigned int nr, unsigned long p1,
145 unsigned long p2, unsigned long p3,
146 unsigned long p4)
147{
148 unsigned long in[8];
149 unsigned long out[8];
150
151 in[0] = p1;
152 in[1] = p2;
153 in[2] = p3;
154 in[3] = p4;
155 return kvm_hypercall(in, out, nr | HC_VENDOR_KVM);
156}
157
158
30static inline unsigned int kvm_arch_para_features(void) 159static inline unsigned int kvm_arch_para_features(void)
31{ 160{
32 return 0; 161 unsigned long r;
162
163 if (!kvm_para_available())
164 return 0;
165
166 if(kvm_hypercall0_1(KVM_HC_FEATURES, &r))
167 return 0;
168
169 return r;
33} 170}
34 171
35#endif /* __KERNEL__ */ 172#endif /* __KERNEL__ */
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index 18d139ec2d22..ecb3bc74c344 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -107,6 +107,7 @@ extern int kvmppc_booke_init(void);
107extern void kvmppc_booke_exit(void); 107extern void kvmppc_booke_exit(void);
108 108
109extern void kvmppc_core_destroy_mmu(struct kvm_vcpu *vcpu); 109extern void kvmppc_core_destroy_mmu(struct kvm_vcpu *vcpu);
110extern int kvmppc_kvm_pv(struct kvm_vcpu *vcpu);
110 111
111/* 112/*
112 * Cuts out inst bits with ordering according to spec. 113 * Cuts out inst bits with ordering according to spec.
diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index 4ed076a4db24..36c30f31ec93 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -129,6 +129,8 @@ ifneq ($(CONFIG_XMON)$(CONFIG_KEXEC),)
129obj-y += ppc_save_regs.o 129obj-y += ppc_save_regs.o
130endif 130endif
131 131
132obj-$(CONFIG_KVM_GUEST) += kvm.o kvm_emul.o
133
132# Disable GCOV in odd or sensitive code 134# Disable GCOV in odd or sensitive code
133GCOV_PROFILE_prom_init.o := n 135GCOV_PROFILE_prom_init.o := n
134GCOV_PROFILE_ftrace.o := n 136GCOV_PROFILE_ftrace.o := n
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index c3e01945ad4f..bd0df2e6aa8f 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -48,11 +48,11 @@
48#ifdef CONFIG_PPC_ISERIES 48#ifdef CONFIG_PPC_ISERIES
49#include <asm/iseries/alpaca.h> 49#include <asm/iseries/alpaca.h>
50#endif 50#endif
51#ifdef CONFIG_KVM 51#if defined(CONFIG_KVM) || defined(CONFIG_KVM_GUEST)
52#include <linux/kvm_host.h> 52#include <linux/kvm_host.h>
53#ifndef CONFIG_BOOKE
54#include <asm/kvm_book3s.h>
55#endif 53#endif
54#if defined(CONFIG_KVM) && defined(CONFIG_PPC_BOOK3S)
55#include <asm/kvm_book3s.h>
56#endif 56#endif
57 57
58#ifdef CONFIG_PPC32 58#ifdef CONFIG_PPC32
@@ -396,12 +396,13 @@ int main(void)
396 DEFINE(VCPU_HOST_STACK, offsetof(struct kvm_vcpu, arch.host_stack)); 396 DEFINE(VCPU_HOST_STACK, offsetof(struct kvm_vcpu, arch.host_stack));
397 DEFINE(VCPU_HOST_PID, offsetof(struct kvm_vcpu, arch.host_pid)); 397 DEFINE(VCPU_HOST_PID, offsetof(struct kvm_vcpu, arch.host_pid));
398 DEFINE(VCPU_GPRS, offsetof(struct kvm_vcpu, arch.gpr)); 398 DEFINE(VCPU_GPRS, offsetof(struct kvm_vcpu, arch.gpr));
399 DEFINE(VCPU_MSR, offsetof(struct kvm_vcpu, arch.msr));
400 DEFINE(VCPU_SPRG4, offsetof(struct kvm_vcpu, arch.sprg4)); 399 DEFINE(VCPU_SPRG4, offsetof(struct kvm_vcpu, arch.sprg4));
401 DEFINE(VCPU_SPRG5, offsetof(struct kvm_vcpu, arch.sprg5)); 400 DEFINE(VCPU_SPRG5, offsetof(struct kvm_vcpu, arch.sprg5));
402 DEFINE(VCPU_SPRG6, offsetof(struct kvm_vcpu, arch.sprg6)); 401 DEFINE(VCPU_SPRG6, offsetof(struct kvm_vcpu, arch.sprg6));
403 DEFINE(VCPU_SPRG7, offsetof(struct kvm_vcpu, arch.sprg7)); 402 DEFINE(VCPU_SPRG7, offsetof(struct kvm_vcpu, arch.sprg7));
404 DEFINE(VCPU_SHADOW_PID, offsetof(struct kvm_vcpu, arch.shadow_pid)); 403 DEFINE(VCPU_SHADOW_PID, offsetof(struct kvm_vcpu, arch.shadow_pid));
404 DEFINE(VCPU_SHARED, offsetof(struct kvm_vcpu, arch.shared));
405 DEFINE(VCPU_SHARED_MSR, offsetof(struct kvm_vcpu_arch_shared, msr));
405 406
406 /* book3s */ 407 /* book3s */
407#ifdef CONFIG_PPC_BOOK3S 408#ifdef CONFIG_PPC_BOOK3S
@@ -466,6 +467,22 @@ int main(void)
466 DEFINE(VCPU_FAULT_ESR, offsetof(struct kvm_vcpu, arch.fault_esr)); 467 DEFINE(VCPU_FAULT_ESR, offsetof(struct kvm_vcpu, arch.fault_esr));
467#endif /* CONFIG_PPC_BOOK3S */ 468#endif /* CONFIG_PPC_BOOK3S */
468#endif 469#endif
470
471#ifdef CONFIG_KVM_GUEST
472 DEFINE(KVM_MAGIC_SCRATCH1, offsetof(struct kvm_vcpu_arch_shared,
473 scratch1));
474 DEFINE(KVM_MAGIC_SCRATCH2, offsetof(struct kvm_vcpu_arch_shared,
475 scratch2));
476 DEFINE(KVM_MAGIC_SCRATCH3, offsetof(struct kvm_vcpu_arch_shared,
477 scratch3));
478 DEFINE(KVM_MAGIC_INT, offsetof(struct kvm_vcpu_arch_shared,
479 int_pending));
480 DEFINE(KVM_MAGIC_MSR, offsetof(struct kvm_vcpu_arch_shared, msr));
481 DEFINE(KVM_MAGIC_CRITICAL, offsetof(struct kvm_vcpu_arch_shared,
482 critical));
483 DEFINE(KVM_MAGIC_SR, offsetof(struct kvm_vcpu_arch_shared, sr));
484#endif
485
469#ifdef CONFIG_44x 486#ifdef CONFIG_44x
470 DEFINE(PGD_T_LOG2, PGD_T_LOG2); 487 DEFINE(PGD_T_LOG2, PGD_T_LOG2);
471 DEFINE(PTE_T_LOG2, PTE_T_LOG2); 488 DEFINE(PTE_T_LOG2, PTE_T_LOG2);
diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index 39b0c48872d2..9f8b01d6466f 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -299,6 +299,12 @@ slb_miss_user_pseries:
299 b . /* prevent spec. execution */ 299 b . /* prevent spec. execution */
300#endif /* __DISABLED__ */ 300#endif /* __DISABLED__ */
301 301
302/* KVM's trampoline code needs to be close to the interrupt handlers */
303
304#ifdef CONFIG_KVM_BOOK3S_64_HANDLER
305#include "../kvm/book3s_rmhandlers.S"
306#endif
307
302 .align 7 308 .align 7
303 .globl __end_interrupts 309 .globl __end_interrupts
304__end_interrupts: 310__end_interrupts:
diff --git a/arch/powerpc/kernel/head_64.S b/arch/powerpc/kernel/head_64.S
index c571cd3c1453..f0dd577e4a5b 100644
--- a/arch/powerpc/kernel/head_64.S
+++ b/arch/powerpc/kernel/head_64.S
@@ -166,12 +166,6 @@ exception_marker:
166#include "exceptions-64s.S" 166#include "exceptions-64s.S"
167#endif 167#endif
168 168
169/* KVM trampoline code needs to be close to the interrupt handlers */
170
171#ifdef CONFIG_KVM_BOOK3S_64_HANDLER
172#include "../kvm/book3s_rmhandlers.S"
173#endif
174
175_GLOBAL(generic_secondary_thread_init) 169_GLOBAL(generic_secondary_thread_init)
176 mr r24,r3 170 mr r24,r3
177 171
diff --git a/arch/powerpc/kernel/kvm.c b/arch/powerpc/kernel/kvm.c
new file mode 100644
index 000000000000..428d0e538aec
--- /dev/null
+++ b/arch/powerpc/kernel/kvm.c
@@ -0,0 +1,596 @@
1/*
2 * Copyright (C) 2010 SUSE Linux Products GmbH. All rights reserved.
3 *
4 * Authors:
5 * Alexander Graf <agraf@suse.de>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License, version 2, as
9 * published by the Free Software Foundation.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
19 */
20
21#include <linux/kvm_host.h>
22#include <linux/init.h>
23#include <linux/kvm_para.h>
24#include <linux/slab.h>
25#include <linux/of.h>
26
27#include <asm/reg.h>
28#include <asm/sections.h>
29#include <asm/cacheflush.h>
30#include <asm/disassemble.h>
31
32#define KVM_MAGIC_PAGE (-4096L)
33#define magic_var(x) KVM_MAGIC_PAGE + offsetof(struct kvm_vcpu_arch_shared, x)
34
35#define KVM_INST_LWZ 0x80000000
36#define KVM_INST_STW 0x90000000
37#define KVM_INST_LD 0xe8000000
38#define KVM_INST_STD 0xf8000000
39#define KVM_INST_NOP 0x60000000
40#define KVM_INST_B 0x48000000
41#define KVM_INST_B_MASK 0x03ffffff
42#define KVM_INST_B_MAX 0x01ffffff
43
44#define KVM_MASK_RT 0x03e00000
45#define KVM_RT_30 0x03c00000
46#define KVM_MASK_RB 0x0000f800
47#define KVM_INST_MFMSR 0x7c0000a6
48#define KVM_INST_MFSPR_SPRG0 0x7c1042a6
49#define KVM_INST_MFSPR_SPRG1 0x7c1142a6
50#define KVM_INST_MFSPR_SPRG2 0x7c1242a6
51#define KVM_INST_MFSPR_SPRG3 0x7c1342a6
52#define KVM_INST_MFSPR_SRR0 0x7c1a02a6
53#define KVM_INST_MFSPR_SRR1 0x7c1b02a6
54#define KVM_INST_MFSPR_DAR 0x7c1302a6
55#define KVM_INST_MFSPR_DSISR 0x7c1202a6
56
57#define KVM_INST_MTSPR_SPRG0 0x7c1043a6
58#define KVM_INST_MTSPR_SPRG1 0x7c1143a6
59#define KVM_INST_MTSPR_SPRG2 0x7c1243a6
60#define KVM_INST_MTSPR_SPRG3 0x7c1343a6
61#define KVM_INST_MTSPR_SRR0 0x7c1a03a6
62#define KVM_INST_MTSPR_SRR1 0x7c1b03a6
63#define KVM_INST_MTSPR_DAR 0x7c1303a6
64#define KVM_INST_MTSPR_DSISR 0x7c1203a6
65
66#define KVM_INST_TLBSYNC 0x7c00046c
67#define KVM_INST_MTMSRD_L0 0x7c000164
68#define KVM_INST_MTMSRD_L1 0x7c010164
69#define KVM_INST_MTMSR 0x7c000124
70
71#define KVM_INST_WRTEEI_0 0x7c000146
72#define KVM_INST_WRTEEI_1 0x7c008146
73
74#define KVM_INST_MTSRIN 0x7c0001e4
75
76static bool kvm_patching_worked = true;
77static char kvm_tmp[1024 * 1024];
78static int kvm_tmp_index;
79
80static inline void kvm_patch_ins(u32 *inst, u32 new_inst)
81{
82 *inst = new_inst;
83 flush_icache_range((ulong)inst, (ulong)inst + 4);
84}
85
86static void kvm_patch_ins_ll(u32 *inst, long addr, u32 rt)
87{
88#ifdef CONFIG_64BIT
89 kvm_patch_ins(inst, KVM_INST_LD | rt | (addr & 0x0000fffc));
90#else
91 kvm_patch_ins(inst, KVM_INST_LWZ | rt | (addr & 0x0000fffc));
92#endif
93}
94
95static void kvm_patch_ins_ld(u32 *inst, long addr, u32 rt)
96{
97#ifdef CONFIG_64BIT
98 kvm_patch_ins(inst, KVM_INST_LD | rt | (addr & 0x0000fffc));
99#else
100 kvm_patch_ins(inst, KVM_INST_LWZ | rt | ((addr + 4) & 0x0000fffc));
101#endif
102}
103
104static void kvm_patch_ins_lwz(u32 *inst, long addr, u32 rt)
105{
106 kvm_patch_ins(inst, KVM_INST_LWZ | rt | (addr & 0x0000ffff));
107}
108
109static void kvm_patch_ins_std(u32 *inst, long addr, u32 rt)
110{
111#ifdef CONFIG_64BIT
112 kvm_patch_ins(inst, KVM_INST_STD | rt | (addr & 0x0000fffc));
113#else
114 kvm_patch_ins(inst, KVM_INST_STW | rt | ((addr + 4) & 0x0000fffc));
115#endif
116}
117
118static void kvm_patch_ins_stw(u32 *inst, long addr, u32 rt)
119{
120 kvm_patch_ins(inst, KVM_INST_STW | rt | (addr & 0x0000fffc));
121}
122
123static void kvm_patch_ins_nop(u32 *inst)
124{
125 kvm_patch_ins(inst, KVM_INST_NOP);
126}
127
128static void kvm_patch_ins_b(u32 *inst, int addr)
129{
130#ifdef CONFIG_RELOCATABLE
131 /* On relocatable kernels interrupts handlers and our code
132 can be in different regions, so we don't patch them */
133
134 extern u32 __end_interrupts;
135 if ((ulong)inst < (ulong)&__end_interrupts)
136 return;
137#endif
138
139 kvm_patch_ins(inst, KVM_INST_B | (addr & KVM_INST_B_MASK));
140}
141
142static u32 *kvm_alloc(int len)
143{
144 u32 *p;
145
146 if ((kvm_tmp_index + len) > ARRAY_SIZE(kvm_tmp)) {
147 printk(KERN_ERR "KVM: No more space (%d + %d)\n",
148 kvm_tmp_index, len);
149 kvm_patching_worked = false;
150 return NULL;
151 }
152
153 p = (void*)&kvm_tmp[kvm_tmp_index];
154 kvm_tmp_index += len;
155
156 return p;
157}
158
159extern u32 kvm_emulate_mtmsrd_branch_offs;
160extern u32 kvm_emulate_mtmsrd_reg_offs;
161extern u32 kvm_emulate_mtmsrd_orig_ins_offs;
162extern u32 kvm_emulate_mtmsrd_len;
163extern u32 kvm_emulate_mtmsrd[];
164
165static void kvm_patch_ins_mtmsrd(u32 *inst, u32 rt)
166{
167 u32 *p;
168 int distance_start;
169 int distance_end;
170 ulong next_inst;
171
172 p = kvm_alloc(kvm_emulate_mtmsrd_len * 4);
173 if (!p)
174 return;
175
176 /* Find out where we are and put everything there */
177 distance_start = (ulong)p - (ulong)inst;
178 next_inst = ((ulong)inst + 4);
179 distance_end = next_inst - (ulong)&p[kvm_emulate_mtmsrd_branch_offs];
180
181 /* Make sure we only write valid b instructions */
182 if (distance_start > KVM_INST_B_MAX) {
183 kvm_patching_worked = false;
184 return;
185 }
186
187 /* Modify the chunk to fit the invocation */
188 memcpy(p, kvm_emulate_mtmsrd, kvm_emulate_mtmsrd_len * 4);
189 p[kvm_emulate_mtmsrd_branch_offs] |= distance_end & KVM_INST_B_MASK;
190 switch (get_rt(rt)) {
191 case 30:
192 kvm_patch_ins_ll(&p[kvm_emulate_mtmsrd_reg_offs],
193 magic_var(scratch2), KVM_RT_30);
194 break;
195 case 31:
196 kvm_patch_ins_ll(&p[kvm_emulate_mtmsrd_reg_offs],
197 magic_var(scratch1), KVM_RT_30);
198 break;
199 default:
200 p[kvm_emulate_mtmsrd_reg_offs] |= rt;
201 break;
202 }
203
204 p[kvm_emulate_mtmsrd_orig_ins_offs] = *inst;
205 flush_icache_range((ulong)p, (ulong)p + kvm_emulate_mtmsrd_len * 4);
206
207 /* Patch the invocation */
208 kvm_patch_ins_b(inst, distance_start);
209}
210
211extern u32 kvm_emulate_mtmsr_branch_offs;
212extern u32 kvm_emulate_mtmsr_reg1_offs;
213extern u32 kvm_emulate_mtmsr_reg2_offs;
214extern u32 kvm_emulate_mtmsr_orig_ins_offs;
215extern u32 kvm_emulate_mtmsr_len;
216extern u32 kvm_emulate_mtmsr[];
217
218static void kvm_patch_ins_mtmsr(u32 *inst, u32 rt)
219{
220 u32 *p;
221 int distance_start;
222 int distance_end;
223 ulong next_inst;
224
225 p = kvm_alloc(kvm_emulate_mtmsr_len * 4);
226 if (!p)
227 return;
228
229 /* Find out where we are and put everything there */
230 distance_start = (ulong)p - (ulong)inst;
231 next_inst = ((ulong)inst + 4);
232 distance_end = next_inst - (ulong)&p[kvm_emulate_mtmsr_branch_offs];
233
234 /* Make sure we only write valid b instructions */
235 if (distance_start > KVM_INST_B_MAX) {
236 kvm_patching_worked = false;
237 return;
238 }
239
240 /* Modify the chunk to fit the invocation */
241 memcpy(p, kvm_emulate_mtmsr, kvm_emulate_mtmsr_len * 4);
242 p[kvm_emulate_mtmsr_branch_offs] |= distance_end & KVM_INST_B_MASK;
243
244 /* Make clobbered registers work too */
245 switch (get_rt(rt)) {
246 case 30:
247 kvm_patch_ins_ll(&p[kvm_emulate_mtmsr_reg1_offs],
248 magic_var(scratch2), KVM_RT_30);
249 kvm_patch_ins_ll(&p[kvm_emulate_mtmsr_reg2_offs],
250 magic_var(scratch2), KVM_RT_30);
251 break;
252 case 31:
253 kvm_patch_ins_ll(&p[kvm_emulate_mtmsr_reg1_offs],
254 magic_var(scratch1), KVM_RT_30);
255 kvm_patch_ins_ll(&p[kvm_emulate_mtmsr_reg2_offs],
256 magic_var(scratch1), KVM_RT_30);
257 break;
258 default:
259 p[kvm_emulate_mtmsr_reg1_offs] |= rt;
260 p[kvm_emulate_mtmsr_reg2_offs] |= rt;
261 break;
262 }
263
264 p[kvm_emulate_mtmsr_orig_ins_offs] = *inst;
265 flush_icache_range((ulong)p, (ulong)p + kvm_emulate_mtmsr_len * 4);
266
267 /* Patch the invocation */
268 kvm_patch_ins_b(inst, distance_start);
269}
270
271#ifdef CONFIG_BOOKE
272
273extern u32 kvm_emulate_wrteei_branch_offs;
274extern u32 kvm_emulate_wrteei_ee_offs;
275extern u32 kvm_emulate_wrteei_len;
276extern u32 kvm_emulate_wrteei[];
277
278static void kvm_patch_ins_wrteei(u32 *inst)
279{
280 u32 *p;
281 int distance_start;
282 int distance_end;
283 ulong next_inst;
284
285 p = kvm_alloc(kvm_emulate_wrteei_len * 4);
286 if (!p)
287 return;
288
289 /* Find out where we are and put everything there */
290 distance_start = (ulong)p - (ulong)inst;
291 next_inst = ((ulong)inst + 4);
292 distance_end = next_inst - (ulong)&p[kvm_emulate_wrteei_branch_offs];
293
294 /* Make sure we only write valid b instructions */
295 if (distance_start > KVM_INST_B_MAX) {
296 kvm_patching_worked = false;
297 return;
298 }
299
300 /* Modify the chunk to fit the invocation */
301 memcpy(p, kvm_emulate_wrteei, kvm_emulate_wrteei_len * 4);
302 p[kvm_emulate_wrteei_branch_offs] |= distance_end & KVM_INST_B_MASK;
303 p[kvm_emulate_wrteei_ee_offs] |= (*inst & MSR_EE);
304 flush_icache_range((ulong)p, (ulong)p + kvm_emulate_wrteei_len * 4);
305
306 /* Patch the invocation */
307 kvm_patch_ins_b(inst, distance_start);
308}
309
310#endif
311
312#ifdef CONFIG_PPC_BOOK3S_32
313
314extern u32 kvm_emulate_mtsrin_branch_offs;
315extern u32 kvm_emulate_mtsrin_reg1_offs;
316extern u32 kvm_emulate_mtsrin_reg2_offs;
317extern u32 kvm_emulate_mtsrin_orig_ins_offs;
318extern u32 kvm_emulate_mtsrin_len;
319extern u32 kvm_emulate_mtsrin[];
320
321static void kvm_patch_ins_mtsrin(u32 *inst, u32 rt, u32 rb)
322{
323 u32 *p;
324 int distance_start;
325 int distance_end;
326 ulong next_inst;
327
328 p = kvm_alloc(kvm_emulate_mtsrin_len * 4);
329 if (!p)
330 return;
331
332 /* Find out where we are and put everything there */
333 distance_start = (ulong)p - (ulong)inst;
334 next_inst = ((ulong)inst + 4);
335 distance_end = next_inst - (ulong)&p[kvm_emulate_mtsrin_branch_offs];
336
337 /* Make sure we only write valid b instructions */
338 if (distance_start > KVM_INST_B_MAX) {
339 kvm_patching_worked = false;
340 return;
341 }
342
343 /* Modify the chunk to fit the invocation */
344 memcpy(p, kvm_emulate_mtsrin, kvm_emulate_mtsrin_len * 4);
345 p[kvm_emulate_mtsrin_branch_offs] |= distance_end & KVM_INST_B_MASK;
346 p[kvm_emulate_mtsrin_reg1_offs] |= (rb << 10);
347 p[kvm_emulate_mtsrin_reg2_offs] |= rt;
348 p[kvm_emulate_mtsrin_orig_ins_offs] = *inst;
349 flush_icache_range((ulong)p, (ulong)p + kvm_emulate_mtsrin_len * 4);
350
351 /* Patch the invocation */
352 kvm_patch_ins_b(inst, distance_start);
353}
354
355#endif
356
357static void kvm_map_magic_page(void *data)
358{
359 u32 *features = data;
360
361 ulong in[8];
362 ulong out[8];
363
364 in[0] = KVM_MAGIC_PAGE;
365 in[1] = KVM_MAGIC_PAGE;
366
367 kvm_hypercall(in, out, HC_VENDOR_KVM | KVM_HC_PPC_MAP_MAGIC_PAGE);
368
369 *features = out[0];
370}
371
372static void kvm_check_ins(u32 *inst, u32 features)
373{
374 u32 _inst = *inst;
375 u32 inst_no_rt = _inst & ~KVM_MASK_RT;
376 u32 inst_rt = _inst & KVM_MASK_RT;
377
378 switch (inst_no_rt) {
379 /* Loads */
380 case KVM_INST_MFMSR:
381 kvm_patch_ins_ld(inst, magic_var(msr), inst_rt);
382 break;
383 case KVM_INST_MFSPR_SPRG0:
384 kvm_patch_ins_ld(inst, magic_var(sprg0), inst_rt);
385 break;
386 case KVM_INST_MFSPR_SPRG1:
387 kvm_patch_ins_ld(inst, magic_var(sprg1), inst_rt);
388 break;
389 case KVM_INST_MFSPR_SPRG2:
390 kvm_patch_ins_ld(inst, magic_var(sprg2), inst_rt);
391 break;
392 case KVM_INST_MFSPR_SPRG3:
393 kvm_patch_ins_ld(inst, magic_var(sprg3), inst_rt);
394 break;
395 case KVM_INST_MFSPR_SRR0:
396 kvm_patch_ins_ld(inst, magic_var(srr0), inst_rt);
397 break;
398 case KVM_INST_MFSPR_SRR1:
399 kvm_patch_ins_ld(inst, magic_var(srr1), inst_rt);
400 break;
401 case KVM_INST_MFSPR_DAR:
402 kvm_patch_ins_ld(inst, magic_var(dar), inst_rt);
403 break;
404 case KVM_INST_MFSPR_DSISR:
405 kvm_patch_ins_lwz(inst, magic_var(dsisr), inst_rt);
406 break;
407
408 /* Stores */
409 case KVM_INST_MTSPR_SPRG0:
410 kvm_patch_ins_std(inst, magic_var(sprg0), inst_rt);
411 break;
412 case KVM_INST_MTSPR_SPRG1:
413 kvm_patch_ins_std(inst, magic_var(sprg1), inst_rt);
414 break;
415 case KVM_INST_MTSPR_SPRG2:
416 kvm_patch_ins_std(inst, magic_var(sprg2), inst_rt);
417 break;
418 case KVM_INST_MTSPR_SPRG3:
419 kvm_patch_ins_std(inst, magic_var(sprg3), inst_rt);
420 break;
421 case KVM_INST_MTSPR_SRR0:
422 kvm_patch_ins_std(inst, magic_var(srr0), inst_rt);
423 break;
424 case KVM_INST_MTSPR_SRR1:
425 kvm_patch_ins_std(inst, magic_var(srr1), inst_rt);
426 break;
427 case KVM_INST_MTSPR_DAR:
428 kvm_patch_ins_std(inst, magic_var(dar), inst_rt);
429 break;
430 case KVM_INST_MTSPR_DSISR:
431 kvm_patch_ins_stw(inst, magic_var(dsisr), inst_rt);
432 break;
433
434 /* Nops */
435 case KVM_INST_TLBSYNC:
436 kvm_patch_ins_nop(inst);
437 break;
438
439 /* Rewrites */
440 case KVM_INST_MTMSRD_L1:
441 kvm_patch_ins_mtmsrd(inst, inst_rt);
442 break;
443 case KVM_INST_MTMSR:
444 case KVM_INST_MTMSRD_L0:
445 kvm_patch_ins_mtmsr(inst, inst_rt);
446 break;
447 }
448
449 switch (inst_no_rt & ~KVM_MASK_RB) {
450#ifdef CONFIG_PPC_BOOK3S_32
451 case KVM_INST_MTSRIN:
452 if (features & KVM_MAGIC_FEAT_SR) {
453 u32 inst_rb = _inst & KVM_MASK_RB;
454 kvm_patch_ins_mtsrin(inst, inst_rt, inst_rb);
455 }
456 break;
457 break;
458#endif
459 }
460
461 switch (_inst) {
462#ifdef CONFIG_BOOKE
463 case KVM_INST_WRTEEI_0:
464 case KVM_INST_WRTEEI_1:
465 kvm_patch_ins_wrteei(inst);
466 break;
467#endif
468 }
469}
470
471static void kvm_use_magic_page(void)
472{
473 u32 *p;
474 u32 *start, *end;
475 u32 tmp;
476 u32 features;
477
478 /* Tell the host to map the magic page to -4096 on all CPUs */
479 on_each_cpu(kvm_map_magic_page, &features, 1);
480
481 /* Quick self-test to see if the mapping works */
482 if (__get_user(tmp, (u32*)KVM_MAGIC_PAGE)) {
483 kvm_patching_worked = false;
484 return;
485 }
486
487 /* Now loop through all code and find instructions */
488 start = (void*)_stext;
489 end = (void*)_etext;
490
491 for (p = start; p < end; p++)
492 kvm_check_ins(p, features);
493
494 printk(KERN_INFO "KVM: Live patching for a fast VM %s\n",
495 kvm_patching_worked ? "worked" : "failed");
496}
497
498unsigned long kvm_hypercall(unsigned long *in,
499 unsigned long *out,
500 unsigned long nr)
501{
502 unsigned long register r0 asm("r0");
503 unsigned long register r3 asm("r3") = in[0];
504 unsigned long register r4 asm("r4") = in[1];
505 unsigned long register r5 asm("r5") = in[2];
506 unsigned long register r6 asm("r6") = in[3];
507 unsigned long register r7 asm("r7") = in[4];
508 unsigned long register r8 asm("r8") = in[5];
509 unsigned long register r9 asm("r9") = in[6];
510 unsigned long register r10 asm("r10") = in[7];
511 unsigned long register r11 asm("r11") = nr;
512 unsigned long register r12 asm("r12");
513
514 asm volatile("bl kvm_hypercall_start"
515 : "=r"(r0), "=r"(r3), "=r"(r4), "=r"(r5), "=r"(r6),
516 "=r"(r7), "=r"(r8), "=r"(r9), "=r"(r10), "=r"(r11),
517 "=r"(r12)
518 : "r"(r3), "r"(r4), "r"(r5), "r"(r6), "r"(r7), "r"(r8),
519 "r"(r9), "r"(r10), "r"(r11)
520 : "memory", "cc", "xer", "ctr", "lr");
521
522 out[0] = r4;
523 out[1] = r5;
524 out[2] = r6;
525 out[3] = r7;
526 out[4] = r8;
527 out[5] = r9;
528 out[6] = r10;
529 out[7] = r11;
530
531 return r3;
532}
533EXPORT_SYMBOL_GPL(kvm_hypercall);
534
535static int kvm_para_setup(void)
536{
537 extern u32 kvm_hypercall_start;
538 struct device_node *hyper_node;
539 u32 *insts;
540 int len, i;
541
542 hyper_node = of_find_node_by_path("/hypervisor");
543 if (!hyper_node)
544 return -1;
545
546 insts = (u32*)of_get_property(hyper_node, "hcall-instructions", &len);
547 if (len % 4)
548 return -1;
549 if (len > (4 * 4))
550 return -1;
551
552 for (i = 0; i < (len / 4); i++)
553 kvm_patch_ins(&(&kvm_hypercall_start)[i], insts[i]);
554
555 return 0;
556}
557
558static __init void kvm_free_tmp(void)
559{
560 unsigned long start, end;
561
562 start = (ulong)&kvm_tmp[kvm_tmp_index + (PAGE_SIZE - 1)] & PAGE_MASK;
563 end = (ulong)&kvm_tmp[ARRAY_SIZE(kvm_tmp)] & PAGE_MASK;
564
565 /* Free the tmp space we don't need */
566 for (; start < end; start += PAGE_SIZE) {
567 ClearPageReserved(virt_to_page(start));
568 init_page_count(virt_to_page(start));
569 free_page(start);
570 totalram_pages++;
571 }
572}
573
574static int __init kvm_guest_init(void)
575{
576 if (!kvm_para_available())
577 goto free_tmp;
578
579 if (kvm_para_setup())
580 goto free_tmp;
581
582 if (kvm_para_has_feature(KVM_FEATURE_MAGIC_PAGE))
583 kvm_use_magic_page();
584
585#ifdef CONFIG_PPC_BOOK3S_64
586 /* Enable napping */
587 powersave_nap = 1;
588#endif
589
590free_tmp:
591 kvm_free_tmp();
592
593 return 0;
594}
595
596postcore_initcall(kvm_guest_init);
diff --git a/arch/powerpc/kernel/kvm_emul.S b/arch/powerpc/kernel/kvm_emul.S
new file mode 100644
index 000000000000..f2b1b2523e61
--- /dev/null
+++ b/arch/powerpc/kernel/kvm_emul.S
@@ -0,0 +1,302 @@
1/*
2 * This program is free software; you can redistribute it and/or modify
3 * it under the terms of the GNU General Public License, version 2, as
4 * published by the Free Software Foundation.
5 *
6 * This program is distributed in the hope that it will be useful,
7 * but WITHOUT ANY WARRANTY; without even the implied warranty of
8 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
9 * GNU General Public License for more details.
10 *
11 * You should have received a copy of the GNU General Public License
12 * along with this program; if not, write to the Free Software
13 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
14 *
15 * Copyright SUSE Linux Products GmbH 2010
16 *
17 * Authors: Alexander Graf <agraf@suse.de>
18 */
19
20#include <asm/ppc_asm.h>
21#include <asm/kvm_asm.h>
22#include <asm/reg.h>
23#include <asm/page.h>
24#include <asm/asm-offsets.h>
25
26/* Hypercall entry point. Will be patched with device tree instructions. */
27
28.global kvm_hypercall_start
29kvm_hypercall_start:
30 li r3, -1
31 nop
32 nop
33 nop
34 blr
35
36#define KVM_MAGIC_PAGE (-4096)
37
38#ifdef CONFIG_64BIT
39#define LL64(reg, offs, reg2) ld reg, (offs)(reg2)
40#define STL64(reg, offs, reg2) std reg, (offs)(reg2)
41#else
42#define LL64(reg, offs, reg2) lwz reg, (offs + 4)(reg2)
43#define STL64(reg, offs, reg2) stw reg, (offs + 4)(reg2)
44#endif
45
46#define SCRATCH_SAVE \
47 /* Enable critical section. We are critical if \
48 shared->critical == r1 */ \
49 STL64(r1, KVM_MAGIC_PAGE + KVM_MAGIC_CRITICAL, 0); \
50 \
51 /* Save state */ \
52 PPC_STL r31, (KVM_MAGIC_PAGE + KVM_MAGIC_SCRATCH1)(0); \
53 PPC_STL r30, (KVM_MAGIC_PAGE + KVM_MAGIC_SCRATCH2)(0); \
54 mfcr r31; \
55 stw r31, (KVM_MAGIC_PAGE + KVM_MAGIC_SCRATCH3)(0);
56
57#define SCRATCH_RESTORE \
58 /* Restore state */ \
59 PPC_LL r31, (KVM_MAGIC_PAGE + KVM_MAGIC_SCRATCH1)(0); \
60 lwz r30, (KVM_MAGIC_PAGE + KVM_MAGIC_SCRATCH3)(0); \
61 mtcr r30; \
62 PPC_LL r30, (KVM_MAGIC_PAGE + KVM_MAGIC_SCRATCH2)(0); \
63 \
64 /* Disable critical section. We are critical if \
65 shared->critical == r1 and r2 is always != r1 */ \
66 STL64(r2, KVM_MAGIC_PAGE + KVM_MAGIC_CRITICAL, 0);
67
68.global kvm_emulate_mtmsrd
69kvm_emulate_mtmsrd:
70
71 SCRATCH_SAVE
72
73 /* Put MSR & ~(MSR_EE|MSR_RI) in r31 */
74 LL64(r31, KVM_MAGIC_PAGE + KVM_MAGIC_MSR, 0)
75 lis r30, (~(MSR_EE | MSR_RI))@h
76 ori r30, r30, (~(MSR_EE | MSR_RI))@l
77 and r31, r31, r30
78
79 /* OR the register's (MSR_EE|MSR_RI) on MSR */
80kvm_emulate_mtmsrd_reg:
81 ori r30, r0, 0
82 andi. r30, r30, (MSR_EE|MSR_RI)
83 or r31, r31, r30
84
85 /* Put MSR back into magic page */
86 STL64(r31, KVM_MAGIC_PAGE + KVM_MAGIC_MSR, 0)
87
88 /* Check if we have to fetch an interrupt */
89 lwz r31, (KVM_MAGIC_PAGE + KVM_MAGIC_INT)(0)
90 cmpwi r31, 0
91 beq+ no_check
92
93 /* Check if we may trigger an interrupt */
94 andi. r30, r30, MSR_EE
95 beq no_check
96
97 SCRATCH_RESTORE
98
99 /* Nag hypervisor */
100kvm_emulate_mtmsrd_orig_ins:
101 tlbsync
102
103 b kvm_emulate_mtmsrd_branch
104
105no_check:
106
107 SCRATCH_RESTORE
108
109 /* Go back to caller */
110kvm_emulate_mtmsrd_branch:
111 b .
112kvm_emulate_mtmsrd_end:
113
114.global kvm_emulate_mtmsrd_branch_offs
115kvm_emulate_mtmsrd_branch_offs:
116 .long (kvm_emulate_mtmsrd_branch - kvm_emulate_mtmsrd) / 4
117
118.global kvm_emulate_mtmsrd_reg_offs
119kvm_emulate_mtmsrd_reg_offs:
120 .long (kvm_emulate_mtmsrd_reg - kvm_emulate_mtmsrd) / 4
121
122.global kvm_emulate_mtmsrd_orig_ins_offs
123kvm_emulate_mtmsrd_orig_ins_offs:
124 .long (kvm_emulate_mtmsrd_orig_ins - kvm_emulate_mtmsrd) / 4
125
126.global kvm_emulate_mtmsrd_len
127kvm_emulate_mtmsrd_len:
128 .long (kvm_emulate_mtmsrd_end - kvm_emulate_mtmsrd) / 4
129
130
131#define MSR_SAFE_BITS (MSR_EE | MSR_CE | MSR_ME | MSR_RI)
132#define MSR_CRITICAL_BITS ~MSR_SAFE_BITS
133
134.global kvm_emulate_mtmsr
135kvm_emulate_mtmsr:
136
137 SCRATCH_SAVE
138
139 /* Fetch old MSR in r31 */
140 LL64(r31, KVM_MAGIC_PAGE + KVM_MAGIC_MSR, 0)
141
142 /* Find the changed bits between old and new MSR */
143kvm_emulate_mtmsr_reg1:
144 ori r30, r0, 0
145 xor r31, r30, r31
146
147 /* Check if we need to really do mtmsr */
148 LOAD_REG_IMMEDIATE(r30, MSR_CRITICAL_BITS)
149 and. r31, r31, r30
150
151 /* No critical bits changed? Maybe we can stay in the guest. */
152 beq maybe_stay_in_guest
153
154do_mtmsr:
155
156 SCRATCH_RESTORE
157
158 /* Just fire off the mtmsr if it's critical */
159kvm_emulate_mtmsr_orig_ins:
160 mtmsr r0
161
162 b kvm_emulate_mtmsr_branch
163
164maybe_stay_in_guest:
165
166 /* Get the target register in r30 */
167kvm_emulate_mtmsr_reg2:
168 ori r30, r0, 0
169
170 /* Check if we have to fetch an interrupt */
171 lwz r31, (KVM_MAGIC_PAGE + KVM_MAGIC_INT)(0)
172 cmpwi r31, 0
173 beq+ no_mtmsr
174
175 /* Check if we may trigger an interrupt */
176 andi. r31, r30, MSR_EE
177 beq no_mtmsr
178
179 b do_mtmsr
180
181no_mtmsr:
182
183 /* Put MSR into magic page because we don't call mtmsr */
184 STL64(r30, KVM_MAGIC_PAGE + KVM_MAGIC_MSR, 0)
185
186 SCRATCH_RESTORE
187
188 /* Go back to caller */
189kvm_emulate_mtmsr_branch:
190 b .
191kvm_emulate_mtmsr_end:
192
193.global kvm_emulate_mtmsr_branch_offs
194kvm_emulate_mtmsr_branch_offs:
195 .long (kvm_emulate_mtmsr_branch - kvm_emulate_mtmsr) / 4
196
197.global kvm_emulate_mtmsr_reg1_offs
198kvm_emulate_mtmsr_reg1_offs:
199 .long (kvm_emulate_mtmsr_reg1 - kvm_emulate_mtmsr) / 4
200
201.global kvm_emulate_mtmsr_reg2_offs
202kvm_emulate_mtmsr_reg2_offs:
203 .long (kvm_emulate_mtmsr_reg2 - kvm_emulate_mtmsr) / 4
204
205.global kvm_emulate_mtmsr_orig_ins_offs
206kvm_emulate_mtmsr_orig_ins_offs:
207 .long (kvm_emulate_mtmsr_orig_ins - kvm_emulate_mtmsr) / 4
208
209.global kvm_emulate_mtmsr_len
210kvm_emulate_mtmsr_len:
211 .long (kvm_emulate_mtmsr_end - kvm_emulate_mtmsr) / 4
212
213
214
215.global kvm_emulate_wrteei
216kvm_emulate_wrteei:
217
218 SCRATCH_SAVE
219
220 /* Fetch old MSR in r31 */
221 LL64(r31, KVM_MAGIC_PAGE + KVM_MAGIC_MSR, 0)
222
223 /* Remove MSR_EE from old MSR */
224 li r30, 0
225 ori r30, r30, MSR_EE
226 andc r31, r31, r30
227
228 /* OR new MSR_EE onto the old MSR */
229kvm_emulate_wrteei_ee:
230 ori r31, r31, 0
231
232 /* Write new MSR value back */
233 STL64(r31, KVM_MAGIC_PAGE + KVM_MAGIC_MSR, 0)
234
235 SCRATCH_RESTORE
236
237 /* Go back to caller */
238kvm_emulate_wrteei_branch:
239 b .
240kvm_emulate_wrteei_end:
241
242.global kvm_emulate_wrteei_branch_offs
243kvm_emulate_wrteei_branch_offs:
244 .long (kvm_emulate_wrteei_branch - kvm_emulate_wrteei) / 4
245
246.global kvm_emulate_wrteei_ee_offs
247kvm_emulate_wrteei_ee_offs:
248 .long (kvm_emulate_wrteei_ee - kvm_emulate_wrteei) / 4
249
250.global kvm_emulate_wrteei_len
251kvm_emulate_wrteei_len:
252 .long (kvm_emulate_wrteei_end - kvm_emulate_wrteei) / 4
253
254
255.global kvm_emulate_mtsrin
256kvm_emulate_mtsrin:
257
258 SCRATCH_SAVE
259
260 LL64(r31, KVM_MAGIC_PAGE + KVM_MAGIC_MSR, 0)
261 andi. r31, r31, MSR_DR | MSR_IR
262 beq kvm_emulate_mtsrin_reg1
263
264 SCRATCH_RESTORE
265
266kvm_emulate_mtsrin_orig_ins:
267 nop
268 b kvm_emulate_mtsrin_branch
269
270kvm_emulate_mtsrin_reg1:
271 /* rX >> 26 */
272 rlwinm r30,r0,6,26,29
273
274kvm_emulate_mtsrin_reg2:
275 stw r0, (KVM_MAGIC_PAGE + KVM_MAGIC_SR)(r30)
276
277 SCRATCH_RESTORE
278
279 /* Go back to caller */
280kvm_emulate_mtsrin_branch:
281 b .
282kvm_emulate_mtsrin_end:
283
284.global kvm_emulate_mtsrin_branch_offs
285kvm_emulate_mtsrin_branch_offs:
286 .long (kvm_emulate_mtsrin_branch - kvm_emulate_mtsrin) / 4
287
288.global kvm_emulate_mtsrin_reg1_offs
289kvm_emulate_mtsrin_reg1_offs:
290 .long (kvm_emulate_mtsrin_reg1 - kvm_emulate_mtsrin) / 4
291
292.global kvm_emulate_mtsrin_reg2_offs
293kvm_emulate_mtsrin_reg2_offs:
294 .long (kvm_emulate_mtsrin_reg2 - kvm_emulate_mtsrin) / 4
295
296.global kvm_emulate_mtsrin_orig_ins_offs
297kvm_emulate_mtsrin_orig_ins_offs:
298 .long (kvm_emulate_mtsrin_orig_ins - kvm_emulate_mtsrin) / 4
299
300.global kvm_emulate_mtsrin_len
301kvm_emulate_mtsrin_len:
302 .long (kvm_emulate_mtsrin_end - kvm_emulate_mtsrin) / 4
diff --git a/arch/powerpc/kvm/44x.c b/arch/powerpc/kvm/44x.c
index 73c0a3f64ed1..74d0e7421143 100644
--- a/arch/powerpc/kvm/44x.c
+++ b/arch/powerpc/kvm/44x.c
@@ -43,7 +43,7 @@ int kvmppc_core_check_processor_compat(void)
43{ 43{
44 int r; 44 int r;
45 45
46 if (strcmp(cur_cpu_spec->platform, "ppc440") == 0) 46 if (strncmp(cur_cpu_spec->platform, "ppc440", 6) == 0)
47 r = 0; 47 r = 0;
48 else 48 else
49 r = -ENOTSUPP; 49 r = -ENOTSUPP;
@@ -72,6 +72,7 @@ int kvmppc_core_vcpu_setup(struct kvm_vcpu *vcpu)
72 /* Since the guest can directly access the timebase, it must know the 72 /* Since the guest can directly access the timebase, it must know the
73 * real timebase frequency. Accordingly, it must see the state of 73 * real timebase frequency. Accordingly, it must see the state of
74 * CCR1[TCS]. */ 74 * CCR1[TCS]. */
75 /* XXX CCR1 doesn't exist on all 440 SoCs. */
75 vcpu->arch.ccr1 = mfspr(SPRN_CCR1); 76 vcpu->arch.ccr1 = mfspr(SPRN_CCR1);
76 77
77 for (i = 0; i < ARRAY_SIZE(vcpu_44x->shadow_refs); i++) 78 for (i = 0; i < ARRAY_SIZE(vcpu_44x->shadow_refs); i++)
@@ -123,8 +124,14 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
123 if (err) 124 if (err)
124 goto free_vcpu; 125 goto free_vcpu;
125 126
127 vcpu->arch.shared = (void*)__get_free_page(GFP_KERNEL|__GFP_ZERO);
128 if (!vcpu->arch.shared)
129 goto uninit_vcpu;
130
126 return vcpu; 131 return vcpu;
127 132
133uninit_vcpu:
134 kvm_vcpu_uninit(vcpu);
128free_vcpu: 135free_vcpu:
129 kmem_cache_free(kvm_vcpu_cache, vcpu_44x); 136 kmem_cache_free(kvm_vcpu_cache, vcpu_44x);
130out: 137out:
@@ -135,6 +142,7 @@ void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu)
135{ 142{
136 struct kvmppc_vcpu_44x *vcpu_44x = to_44x(vcpu); 143 struct kvmppc_vcpu_44x *vcpu_44x = to_44x(vcpu);
137 144
145 free_page((unsigned long)vcpu->arch.shared);
138 kvm_vcpu_uninit(vcpu); 146 kvm_vcpu_uninit(vcpu);
139 kmem_cache_free(kvm_vcpu_cache, vcpu_44x); 147 kmem_cache_free(kvm_vcpu_cache, vcpu_44x);
140} 148}
diff --git a/arch/powerpc/kvm/44x_tlb.c b/arch/powerpc/kvm/44x_tlb.c
index 9b9b5cdea840..5f3cff83e089 100644
--- a/arch/powerpc/kvm/44x_tlb.c
+++ b/arch/powerpc/kvm/44x_tlb.c
@@ -47,6 +47,7 @@
47#ifdef DEBUG 47#ifdef DEBUG
48void kvmppc_dump_tlbs(struct kvm_vcpu *vcpu) 48void kvmppc_dump_tlbs(struct kvm_vcpu *vcpu)
49{ 49{
50 struct kvmppc_vcpu_44x *vcpu_44x = to_44x(vcpu);
50 struct kvmppc_44x_tlbe *tlbe; 51 struct kvmppc_44x_tlbe *tlbe;
51 int i; 52 int i;
52 53
@@ -221,14 +222,14 @@ gpa_t kvmppc_mmu_xlate(struct kvm_vcpu *vcpu, unsigned int gtlb_index,
221 222
222int kvmppc_mmu_itlb_index(struct kvm_vcpu *vcpu, gva_t eaddr) 223int kvmppc_mmu_itlb_index(struct kvm_vcpu *vcpu, gva_t eaddr)
223{ 224{
224 unsigned int as = !!(vcpu->arch.msr & MSR_IS); 225 unsigned int as = !!(vcpu->arch.shared->msr & MSR_IS);
225 226
226 return kvmppc_44x_tlb_index(vcpu, eaddr, vcpu->arch.pid, as); 227 return kvmppc_44x_tlb_index(vcpu, eaddr, vcpu->arch.pid, as);
227} 228}
228 229
229int kvmppc_mmu_dtlb_index(struct kvm_vcpu *vcpu, gva_t eaddr) 230int kvmppc_mmu_dtlb_index(struct kvm_vcpu *vcpu, gva_t eaddr)
230{ 231{
231 unsigned int as = !!(vcpu->arch.msr & MSR_DS); 232 unsigned int as = !!(vcpu->arch.shared->msr & MSR_DS);
232 233
233 return kvmppc_44x_tlb_index(vcpu, eaddr, vcpu->arch.pid, as); 234 return kvmppc_44x_tlb_index(vcpu, eaddr, vcpu->arch.pid, as);
234} 235}
@@ -354,7 +355,7 @@ void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 gvaddr, gpa_t gpaddr,
354 355
355 stlbe.word1 = (hpaddr & 0xfffffc00) | ((hpaddr >> 32) & 0xf); 356 stlbe.word1 = (hpaddr & 0xfffffc00) | ((hpaddr >> 32) & 0xf);
356 stlbe.word2 = kvmppc_44x_tlb_shadow_attrib(flags, 357 stlbe.word2 = kvmppc_44x_tlb_shadow_attrib(flags,
357 vcpu->arch.msr & MSR_PR); 358 vcpu->arch.shared->msr & MSR_PR);
358 stlbe.tid = !(asid & 0xff); 359 stlbe.tid = !(asid & 0xff);
359 360
360 /* Keep track of the reference so we can properly release it later. */ 361 /* Keep track of the reference so we can properly release it later. */
@@ -423,7 +424,7 @@ static int tlbe_is_host_safe(const struct kvm_vcpu *vcpu,
423 424
424 /* Does it match current guest AS? */ 425 /* Does it match current guest AS? */
425 /* XXX what about IS != DS? */ 426 /* XXX what about IS != DS? */
426 if (get_tlb_ts(tlbe) != !!(vcpu->arch.msr & MSR_IS)) 427 if (get_tlb_ts(tlbe) != !!(vcpu->arch.shared->msr & MSR_IS))
427 return 0; 428 return 0;
428 429
429 gpa = get_tlb_raddr(tlbe); 430 gpa = get_tlb_raddr(tlbe);
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index a3cef30d1d42..e316847c08c0 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -17,6 +17,7 @@
17#include <linux/kvm_host.h> 17#include <linux/kvm_host.h>
18#include <linux/err.h> 18#include <linux/err.h>
19#include <linux/slab.h> 19#include <linux/slab.h>
20#include "trace.h"
20 21
21#include <asm/reg.h> 22#include <asm/reg.h>
22#include <asm/cputable.h> 23#include <asm/cputable.h>
@@ -35,7 +36,6 @@
35#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU 36#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
36 37
37/* #define EXIT_DEBUG */ 38/* #define EXIT_DEBUG */
38/* #define EXIT_DEBUG_SIMPLE */
39/* #define DEBUG_EXT */ 39/* #define DEBUG_EXT */
40 40
41static int kvmppc_handle_ext(struct kvm_vcpu *vcpu, unsigned int exit_nr, 41static int kvmppc_handle_ext(struct kvm_vcpu *vcpu, unsigned int exit_nr,
@@ -105,65 +105,71 @@ void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu)
105 kvmppc_giveup_ext(vcpu, MSR_VSX); 105 kvmppc_giveup_ext(vcpu, MSR_VSX);
106} 106}
107 107
108#if defined(EXIT_DEBUG)
109static u32 kvmppc_get_dec(struct kvm_vcpu *vcpu)
110{
111 u64 jd = mftb() - vcpu->arch.dec_jiffies;
112 return vcpu->arch.dec - jd;
113}
114#endif
115
116static void kvmppc_recalc_shadow_msr(struct kvm_vcpu *vcpu) 108static void kvmppc_recalc_shadow_msr(struct kvm_vcpu *vcpu)
117{ 109{
118 vcpu->arch.shadow_msr = vcpu->arch.msr; 110 ulong smsr = vcpu->arch.shared->msr;
111
119 /* Guest MSR values */ 112 /* Guest MSR values */
120 vcpu->arch.shadow_msr &= MSR_FE0 | MSR_FE1 | MSR_SF | MSR_SE | 113 smsr &= MSR_FE0 | MSR_FE1 | MSR_SF | MSR_SE | MSR_BE | MSR_DE;
121 MSR_BE | MSR_DE;
122 /* Process MSR values */ 114 /* Process MSR values */
123 vcpu->arch.shadow_msr |= MSR_ME | MSR_RI | MSR_IR | MSR_DR | MSR_PR | 115 smsr |= MSR_ME | MSR_RI | MSR_IR | MSR_DR | MSR_PR | MSR_EE;
124 MSR_EE;
125 /* External providers the guest reserved */ 116 /* External providers the guest reserved */
126 vcpu->arch.shadow_msr |= (vcpu->arch.msr & vcpu->arch.guest_owned_ext); 117 smsr |= (vcpu->arch.shared->msr & vcpu->arch.guest_owned_ext);
127 /* 64-bit Process MSR values */ 118 /* 64-bit Process MSR values */
128#ifdef CONFIG_PPC_BOOK3S_64 119#ifdef CONFIG_PPC_BOOK3S_64
129 vcpu->arch.shadow_msr |= MSR_ISF | MSR_HV; 120 smsr |= MSR_ISF | MSR_HV;
130#endif 121#endif
122 vcpu->arch.shadow_msr = smsr;
131} 123}
132 124
133void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 msr) 125void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 msr)
134{ 126{
135 ulong old_msr = vcpu->arch.msr; 127 ulong old_msr = vcpu->arch.shared->msr;
136 128
137#ifdef EXIT_DEBUG 129#ifdef EXIT_DEBUG
138 printk(KERN_INFO "KVM: Set MSR to 0x%llx\n", msr); 130 printk(KERN_INFO "KVM: Set MSR to 0x%llx\n", msr);
139#endif 131#endif
140 132
141 msr &= to_book3s(vcpu)->msr_mask; 133 msr &= to_book3s(vcpu)->msr_mask;
142 vcpu->arch.msr = msr; 134 vcpu->arch.shared->msr = msr;
143 kvmppc_recalc_shadow_msr(vcpu); 135 kvmppc_recalc_shadow_msr(vcpu);
144 136
145 if (msr & (MSR_WE|MSR_POW)) { 137 if (msr & MSR_POW) {
146 if (!vcpu->arch.pending_exceptions) { 138 if (!vcpu->arch.pending_exceptions) {
147 kvm_vcpu_block(vcpu); 139 kvm_vcpu_block(vcpu);
148 vcpu->stat.halt_wakeup++; 140 vcpu->stat.halt_wakeup++;
141
142 /* Unset POW bit after we woke up */
143 msr &= ~MSR_POW;
144 vcpu->arch.shared->msr = msr;
149 } 145 }
150 } 146 }
151 147
152 if ((vcpu->arch.msr & (MSR_PR|MSR_IR|MSR_DR)) != 148 if ((vcpu->arch.shared->msr & (MSR_PR|MSR_IR|MSR_DR)) !=
153 (old_msr & (MSR_PR|MSR_IR|MSR_DR))) { 149 (old_msr & (MSR_PR|MSR_IR|MSR_DR))) {
154 kvmppc_mmu_flush_segments(vcpu); 150 kvmppc_mmu_flush_segments(vcpu);
155 kvmppc_mmu_map_segment(vcpu, kvmppc_get_pc(vcpu)); 151 kvmppc_mmu_map_segment(vcpu, kvmppc_get_pc(vcpu));
152
153 /* Preload magic page segment when in kernel mode */
154 if (!(msr & MSR_PR) && vcpu->arch.magic_page_pa) {
155 struct kvm_vcpu_arch *a = &vcpu->arch;
156
157 if (msr & MSR_DR)
158 kvmppc_mmu_map_segment(vcpu, a->magic_page_ea);
159 else
160 kvmppc_mmu_map_segment(vcpu, a->magic_page_pa);
161 }
156 } 162 }
157 163
158 /* Preload FPU if it's enabled */ 164 /* Preload FPU if it's enabled */
159 if (vcpu->arch.msr & MSR_FP) 165 if (vcpu->arch.shared->msr & MSR_FP)
160 kvmppc_handle_ext(vcpu, BOOK3S_INTERRUPT_FP_UNAVAIL, MSR_FP); 166 kvmppc_handle_ext(vcpu, BOOK3S_INTERRUPT_FP_UNAVAIL, MSR_FP);
161} 167}
162 168
163void kvmppc_inject_interrupt(struct kvm_vcpu *vcpu, int vec, u64 flags) 169void kvmppc_inject_interrupt(struct kvm_vcpu *vcpu, int vec, u64 flags)
164{ 170{
165 vcpu->arch.srr0 = kvmppc_get_pc(vcpu); 171 vcpu->arch.shared->srr0 = kvmppc_get_pc(vcpu);
166 vcpu->arch.srr1 = vcpu->arch.msr | flags; 172 vcpu->arch.shared->srr1 = vcpu->arch.shared->msr | flags;
167 kvmppc_set_pc(vcpu, to_book3s(vcpu)->hior + vec); 173 kvmppc_set_pc(vcpu, to_book3s(vcpu)->hior + vec);
168 vcpu->arch.mmu.reset_msr(vcpu); 174 vcpu->arch.mmu.reset_msr(vcpu);
169} 175}
@@ -180,6 +186,7 @@ static int kvmppc_book3s_vec2irqprio(unsigned int vec)
180 case 0x400: prio = BOOK3S_IRQPRIO_INST_STORAGE; break; 186 case 0x400: prio = BOOK3S_IRQPRIO_INST_STORAGE; break;
181 case 0x480: prio = BOOK3S_IRQPRIO_INST_SEGMENT; break; 187 case 0x480: prio = BOOK3S_IRQPRIO_INST_SEGMENT; break;
182 case 0x500: prio = BOOK3S_IRQPRIO_EXTERNAL; break; 188 case 0x500: prio = BOOK3S_IRQPRIO_EXTERNAL; break;
189 case 0x501: prio = BOOK3S_IRQPRIO_EXTERNAL_LEVEL; break;
183 case 0x600: prio = BOOK3S_IRQPRIO_ALIGNMENT; break; 190 case 0x600: prio = BOOK3S_IRQPRIO_ALIGNMENT; break;
184 case 0x700: prio = BOOK3S_IRQPRIO_PROGRAM; break; 191 case 0x700: prio = BOOK3S_IRQPRIO_PROGRAM; break;
185 case 0x800: prio = BOOK3S_IRQPRIO_FP_UNAVAIL; break; 192 case 0x800: prio = BOOK3S_IRQPRIO_FP_UNAVAIL; break;
@@ -199,6 +206,9 @@ static void kvmppc_book3s_dequeue_irqprio(struct kvm_vcpu *vcpu,
199{ 206{
200 clear_bit(kvmppc_book3s_vec2irqprio(vec), 207 clear_bit(kvmppc_book3s_vec2irqprio(vec),
201 &vcpu->arch.pending_exceptions); 208 &vcpu->arch.pending_exceptions);
209
210 if (!vcpu->arch.pending_exceptions)
211 vcpu->arch.shared->int_pending = 0;
202} 212}
203 213
204void kvmppc_book3s_queue_irqprio(struct kvm_vcpu *vcpu, unsigned int vec) 214void kvmppc_book3s_queue_irqprio(struct kvm_vcpu *vcpu, unsigned int vec)
@@ -237,13 +247,19 @@ void kvmppc_core_dequeue_dec(struct kvm_vcpu *vcpu)
237void kvmppc_core_queue_external(struct kvm_vcpu *vcpu, 247void kvmppc_core_queue_external(struct kvm_vcpu *vcpu,
238 struct kvm_interrupt *irq) 248 struct kvm_interrupt *irq)
239{ 249{
240 kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_EXTERNAL); 250 unsigned int vec = BOOK3S_INTERRUPT_EXTERNAL;
251
252 if (irq->irq == KVM_INTERRUPT_SET_LEVEL)
253 vec = BOOK3S_INTERRUPT_EXTERNAL_LEVEL;
254
255 kvmppc_book3s_queue_irqprio(vcpu, vec);
241} 256}
242 257
243void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu, 258void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu,
244 struct kvm_interrupt *irq) 259 struct kvm_interrupt *irq)
245{ 260{
246 kvmppc_book3s_dequeue_irqprio(vcpu, BOOK3S_INTERRUPT_EXTERNAL); 261 kvmppc_book3s_dequeue_irqprio(vcpu, BOOK3S_INTERRUPT_EXTERNAL);
262 kvmppc_book3s_dequeue_irqprio(vcpu, BOOK3S_INTERRUPT_EXTERNAL_LEVEL);
247} 263}
248 264
249int kvmppc_book3s_irqprio_deliver(struct kvm_vcpu *vcpu, unsigned int priority) 265int kvmppc_book3s_irqprio_deliver(struct kvm_vcpu *vcpu, unsigned int priority)
@@ -251,14 +267,29 @@ int kvmppc_book3s_irqprio_deliver(struct kvm_vcpu *vcpu, unsigned int priority)
251 int deliver = 1; 267 int deliver = 1;
252 int vec = 0; 268 int vec = 0;
253 ulong flags = 0ULL; 269 ulong flags = 0ULL;
270 ulong crit_raw = vcpu->arch.shared->critical;
271 ulong crit_r1 = kvmppc_get_gpr(vcpu, 1);
272 bool crit;
273
274 /* Truncate crit indicators in 32 bit mode */
275 if (!(vcpu->arch.shared->msr & MSR_SF)) {
276 crit_raw &= 0xffffffff;
277 crit_r1 &= 0xffffffff;
278 }
279
280 /* Critical section when crit == r1 */
281 crit = (crit_raw == crit_r1);
282 /* ... and we're in supervisor mode */
283 crit = crit && !(vcpu->arch.shared->msr & MSR_PR);
254 284
255 switch (priority) { 285 switch (priority) {
256 case BOOK3S_IRQPRIO_DECREMENTER: 286 case BOOK3S_IRQPRIO_DECREMENTER:
257 deliver = vcpu->arch.msr & MSR_EE; 287 deliver = (vcpu->arch.shared->msr & MSR_EE) && !crit;
258 vec = BOOK3S_INTERRUPT_DECREMENTER; 288 vec = BOOK3S_INTERRUPT_DECREMENTER;
259 break; 289 break;
260 case BOOK3S_IRQPRIO_EXTERNAL: 290 case BOOK3S_IRQPRIO_EXTERNAL:
261 deliver = vcpu->arch.msr & MSR_EE; 291 case BOOK3S_IRQPRIO_EXTERNAL_LEVEL:
292 deliver = (vcpu->arch.shared->msr & MSR_EE) && !crit;
262 vec = BOOK3S_INTERRUPT_EXTERNAL; 293 vec = BOOK3S_INTERRUPT_EXTERNAL;
263 break; 294 break;
264 case BOOK3S_IRQPRIO_SYSTEM_RESET: 295 case BOOK3S_IRQPRIO_SYSTEM_RESET:
@@ -320,9 +351,27 @@ int kvmppc_book3s_irqprio_deliver(struct kvm_vcpu *vcpu, unsigned int priority)
320 return deliver; 351 return deliver;
321} 352}
322 353
354/*
355 * This function determines if an irqprio should be cleared once issued.
356 */
357static bool clear_irqprio(struct kvm_vcpu *vcpu, unsigned int priority)
358{
359 switch (priority) {
360 case BOOK3S_IRQPRIO_DECREMENTER:
361 /* DEC interrupts get cleared by mtdec */
362 return false;
363 case BOOK3S_IRQPRIO_EXTERNAL_LEVEL:
364 /* External interrupts get cleared by userspace */
365 return false;
366 }
367
368 return true;
369}
370
323void kvmppc_core_deliver_interrupts(struct kvm_vcpu *vcpu) 371void kvmppc_core_deliver_interrupts(struct kvm_vcpu *vcpu)
324{ 372{
325 unsigned long *pending = &vcpu->arch.pending_exceptions; 373 unsigned long *pending = &vcpu->arch.pending_exceptions;
374 unsigned long old_pending = vcpu->arch.pending_exceptions;
326 unsigned int priority; 375 unsigned int priority;
327 376
328#ifdef EXIT_DEBUG 377#ifdef EXIT_DEBUG
@@ -332,8 +381,7 @@ void kvmppc_core_deliver_interrupts(struct kvm_vcpu *vcpu)
332 priority = __ffs(*pending); 381 priority = __ffs(*pending);
333 while (priority < BOOK3S_IRQPRIO_MAX) { 382 while (priority < BOOK3S_IRQPRIO_MAX) {
334 if (kvmppc_book3s_irqprio_deliver(vcpu, priority) && 383 if (kvmppc_book3s_irqprio_deliver(vcpu, priority) &&
335 (priority != BOOK3S_IRQPRIO_DECREMENTER)) { 384 clear_irqprio(vcpu, priority)) {
336 /* DEC interrupts get cleared by mtdec */
337 clear_bit(priority, &vcpu->arch.pending_exceptions); 385 clear_bit(priority, &vcpu->arch.pending_exceptions);
338 break; 386 break;
339 } 387 }
@@ -342,6 +390,12 @@ void kvmppc_core_deliver_interrupts(struct kvm_vcpu *vcpu)
342 BITS_PER_BYTE * sizeof(*pending), 390 BITS_PER_BYTE * sizeof(*pending),
343 priority + 1); 391 priority + 1);
344 } 392 }
393
394 /* Tell the guest about our interrupt status */
395 if (*pending)
396 vcpu->arch.shared->int_pending = 1;
397 else if (old_pending)
398 vcpu->arch.shared->int_pending = 0;
345} 399}
346 400
347void kvmppc_set_pvr(struct kvm_vcpu *vcpu, u32 pvr) 401void kvmppc_set_pvr(struct kvm_vcpu *vcpu, u32 pvr)
@@ -398,6 +452,25 @@ void kvmppc_set_pvr(struct kvm_vcpu *vcpu, u32 pvr)
398 } 452 }
399} 453}
400 454
455pfn_t kvmppc_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn)
456{
457 ulong mp_pa = vcpu->arch.magic_page_pa;
458
459 /* Magic page override */
460 if (unlikely(mp_pa) &&
461 unlikely(((gfn << PAGE_SHIFT) & KVM_PAM) ==
462 ((mp_pa & PAGE_MASK) & KVM_PAM))) {
463 ulong shared_page = ((ulong)vcpu->arch.shared) & PAGE_MASK;
464 pfn_t pfn;
465
466 pfn = (pfn_t)virt_to_phys((void*)shared_page) >> PAGE_SHIFT;
467 get_page(pfn_to_page(pfn));
468 return pfn;
469 }
470
471 return gfn_to_pfn(vcpu->kvm, gfn);
472}
473
401/* Book3s_32 CPUs always have 32 bytes cache line size, which Linux assumes. To 474/* Book3s_32 CPUs always have 32 bytes cache line size, which Linux assumes. To
402 * make Book3s_32 Linux work on Book3s_64, we have to make sure we trap dcbz to 475 * make Book3s_32 Linux work on Book3s_64, we have to make sure we trap dcbz to
403 * emulate 32 bytes dcbz length. 476 * emulate 32 bytes dcbz length.
@@ -415,8 +488,10 @@ static void kvmppc_patch_dcbz(struct kvm_vcpu *vcpu, struct kvmppc_pte *pte)
415 int i; 488 int i;
416 489
417 hpage = gfn_to_page(vcpu->kvm, pte->raddr >> PAGE_SHIFT); 490 hpage = gfn_to_page(vcpu->kvm, pte->raddr >> PAGE_SHIFT);
418 if (is_error_page(hpage)) 491 if (is_error_page(hpage)) {
492 kvm_release_page_clean(hpage);
419 return; 493 return;
494 }
420 495
421 hpage_offset = pte->raddr & ~PAGE_MASK; 496 hpage_offset = pte->raddr & ~PAGE_MASK;
422 hpage_offset &= ~0xFFFULL; 497 hpage_offset &= ~0xFFFULL;
@@ -437,14 +512,14 @@ static void kvmppc_patch_dcbz(struct kvm_vcpu *vcpu, struct kvmppc_pte *pte)
437static int kvmppc_xlate(struct kvm_vcpu *vcpu, ulong eaddr, bool data, 512static int kvmppc_xlate(struct kvm_vcpu *vcpu, ulong eaddr, bool data,
438 struct kvmppc_pte *pte) 513 struct kvmppc_pte *pte)
439{ 514{
440 int relocated = (vcpu->arch.msr & (data ? MSR_DR : MSR_IR)); 515 int relocated = (vcpu->arch.shared->msr & (data ? MSR_DR : MSR_IR));
441 int r; 516 int r;
442 517
443 if (relocated) { 518 if (relocated) {
444 r = vcpu->arch.mmu.xlate(vcpu, eaddr, pte, data); 519 r = vcpu->arch.mmu.xlate(vcpu, eaddr, pte, data);
445 } else { 520 } else {
446 pte->eaddr = eaddr; 521 pte->eaddr = eaddr;
447 pte->raddr = eaddr & 0xffffffff; 522 pte->raddr = eaddr & KVM_PAM;
448 pte->vpage = VSID_REAL | eaddr >> 12; 523 pte->vpage = VSID_REAL | eaddr >> 12;
449 pte->may_read = true; 524 pte->may_read = true;
450 pte->may_write = true; 525 pte->may_write = true;
@@ -533,6 +608,13 @@ mmio:
533 608
534static int kvmppc_visible_gfn(struct kvm_vcpu *vcpu, gfn_t gfn) 609static int kvmppc_visible_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
535{ 610{
611 ulong mp_pa = vcpu->arch.magic_page_pa;
612
613 if (unlikely(mp_pa) &&
614 unlikely((mp_pa & KVM_PAM) >> PAGE_SHIFT == gfn)) {
615 return 1;
616 }
617
536 return kvm_is_visible_gfn(vcpu->kvm, gfn); 618 return kvm_is_visible_gfn(vcpu->kvm, gfn);
537} 619}
538 620
@@ -545,8 +627,8 @@ int kvmppc_handle_pagefault(struct kvm_run *run, struct kvm_vcpu *vcpu,
545 int page_found = 0; 627 int page_found = 0;
546 struct kvmppc_pte pte; 628 struct kvmppc_pte pte;
547 bool is_mmio = false; 629 bool is_mmio = false;
548 bool dr = (vcpu->arch.msr & MSR_DR) ? true : false; 630 bool dr = (vcpu->arch.shared->msr & MSR_DR) ? true : false;
549 bool ir = (vcpu->arch.msr & MSR_IR) ? true : false; 631 bool ir = (vcpu->arch.shared->msr & MSR_IR) ? true : false;
550 u64 vsid; 632 u64 vsid;
551 633
552 relocated = data ? dr : ir; 634 relocated = data ? dr : ir;
@@ -558,12 +640,12 @@ int kvmppc_handle_pagefault(struct kvm_run *run, struct kvm_vcpu *vcpu,
558 pte.may_execute = true; 640 pte.may_execute = true;
559 pte.may_read = true; 641 pte.may_read = true;
560 pte.may_write = true; 642 pte.may_write = true;
561 pte.raddr = eaddr & 0xffffffff; 643 pte.raddr = eaddr & KVM_PAM;
562 pte.eaddr = eaddr; 644 pte.eaddr = eaddr;
563 pte.vpage = eaddr >> 12; 645 pte.vpage = eaddr >> 12;
564 } 646 }
565 647
566 switch (vcpu->arch.msr & (MSR_DR|MSR_IR)) { 648 switch (vcpu->arch.shared->msr & (MSR_DR|MSR_IR)) {
567 case 0: 649 case 0:
568 pte.vpage |= ((u64)VSID_REAL << (SID_SHIFT - 12)); 650 pte.vpage |= ((u64)VSID_REAL << (SID_SHIFT - 12));
569 break; 651 break;
@@ -571,7 +653,7 @@ int kvmppc_handle_pagefault(struct kvm_run *run, struct kvm_vcpu *vcpu,
571 case MSR_IR: 653 case MSR_IR:
572 vcpu->arch.mmu.esid_to_vsid(vcpu, eaddr >> SID_SHIFT, &vsid); 654 vcpu->arch.mmu.esid_to_vsid(vcpu, eaddr >> SID_SHIFT, &vsid);
573 655
574 if ((vcpu->arch.msr & (MSR_DR|MSR_IR)) == MSR_DR) 656 if ((vcpu->arch.shared->msr & (MSR_DR|MSR_IR)) == MSR_DR)
575 pte.vpage |= ((u64)VSID_REAL_DR << (SID_SHIFT - 12)); 657 pte.vpage |= ((u64)VSID_REAL_DR << (SID_SHIFT - 12));
576 else 658 else
577 pte.vpage |= ((u64)VSID_REAL_IR << (SID_SHIFT - 12)); 659 pte.vpage |= ((u64)VSID_REAL_IR << (SID_SHIFT - 12));
@@ -594,20 +676,23 @@ int kvmppc_handle_pagefault(struct kvm_run *run, struct kvm_vcpu *vcpu,
594 676
595 if (page_found == -ENOENT) { 677 if (page_found == -ENOENT) {
596 /* Page not found in guest PTE entries */ 678 /* Page not found in guest PTE entries */
597 vcpu->arch.dear = kvmppc_get_fault_dar(vcpu); 679 vcpu->arch.shared->dar = kvmppc_get_fault_dar(vcpu);
598 to_book3s(vcpu)->dsisr = to_svcpu(vcpu)->fault_dsisr; 680 vcpu->arch.shared->dsisr = to_svcpu(vcpu)->fault_dsisr;
599 vcpu->arch.msr |= (to_svcpu(vcpu)->shadow_srr1 & 0x00000000f8000000ULL); 681 vcpu->arch.shared->msr |=
682 (to_svcpu(vcpu)->shadow_srr1 & 0x00000000f8000000ULL);
600 kvmppc_book3s_queue_irqprio(vcpu, vec); 683 kvmppc_book3s_queue_irqprio(vcpu, vec);
601 } else if (page_found == -EPERM) { 684 } else if (page_found == -EPERM) {
602 /* Storage protection */ 685 /* Storage protection */
603 vcpu->arch.dear = kvmppc_get_fault_dar(vcpu); 686 vcpu->arch.shared->dar = kvmppc_get_fault_dar(vcpu);
604 to_book3s(vcpu)->dsisr = to_svcpu(vcpu)->fault_dsisr & ~DSISR_NOHPTE; 687 vcpu->arch.shared->dsisr =
605 to_book3s(vcpu)->dsisr |= DSISR_PROTFAULT; 688 to_svcpu(vcpu)->fault_dsisr & ~DSISR_NOHPTE;
606 vcpu->arch.msr |= (to_svcpu(vcpu)->shadow_srr1 & 0x00000000f8000000ULL); 689 vcpu->arch.shared->dsisr |= DSISR_PROTFAULT;
690 vcpu->arch.shared->msr |=
691 (to_svcpu(vcpu)->shadow_srr1 & 0x00000000f8000000ULL);
607 kvmppc_book3s_queue_irqprio(vcpu, vec); 692 kvmppc_book3s_queue_irqprio(vcpu, vec);
608 } else if (page_found == -EINVAL) { 693 } else if (page_found == -EINVAL) {
609 /* Page not found in guest SLB */ 694 /* Page not found in guest SLB */
610 vcpu->arch.dear = kvmppc_get_fault_dar(vcpu); 695 vcpu->arch.shared->dar = kvmppc_get_fault_dar(vcpu);
611 kvmppc_book3s_queue_irqprio(vcpu, vec + 0x80); 696 kvmppc_book3s_queue_irqprio(vcpu, vec + 0x80);
612 } else if (!is_mmio && 697 } else if (!is_mmio &&
613 kvmppc_visible_gfn(vcpu, pte.raddr >> PAGE_SHIFT)) { 698 kvmppc_visible_gfn(vcpu, pte.raddr >> PAGE_SHIFT)) {
@@ -695,9 +780,11 @@ static int kvmppc_read_inst(struct kvm_vcpu *vcpu)
695 780
696 ret = kvmppc_ld(vcpu, &srr0, sizeof(u32), &last_inst, false); 781 ret = kvmppc_ld(vcpu, &srr0, sizeof(u32), &last_inst, false);
697 if (ret == -ENOENT) { 782 if (ret == -ENOENT) {
698 vcpu->arch.msr = kvmppc_set_field(vcpu->arch.msr, 33, 33, 1); 783 ulong msr = vcpu->arch.shared->msr;
699 vcpu->arch.msr = kvmppc_set_field(vcpu->arch.msr, 34, 36, 0); 784
700 vcpu->arch.msr = kvmppc_set_field(vcpu->arch.msr, 42, 47, 0); 785 msr = kvmppc_set_field(msr, 33, 33, 1);
786 msr = kvmppc_set_field(msr, 34, 36, 0);
787 vcpu->arch.shared->msr = kvmppc_set_field(msr, 42, 47, 0);
701 kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_INST_STORAGE); 788 kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_INST_STORAGE);
702 return EMULATE_AGAIN; 789 return EMULATE_AGAIN;
703 } 790 }
@@ -736,7 +823,7 @@ static int kvmppc_handle_ext(struct kvm_vcpu *vcpu, unsigned int exit_nr,
736 if (vcpu->arch.hflags & BOOK3S_HFLAG_PAIRED_SINGLE) 823 if (vcpu->arch.hflags & BOOK3S_HFLAG_PAIRED_SINGLE)
737 return RESUME_GUEST; 824 return RESUME_GUEST;
738 825
739 if (!(vcpu->arch.msr & msr)) { 826 if (!(vcpu->arch.shared->msr & msr)) {
740 kvmppc_book3s_queue_irqprio(vcpu, exit_nr); 827 kvmppc_book3s_queue_irqprio(vcpu, exit_nr);
741 return RESUME_GUEST; 828 return RESUME_GUEST;
742 } 829 }
@@ -796,16 +883,8 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
796 883
797 run->exit_reason = KVM_EXIT_UNKNOWN; 884 run->exit_reason = KVM_EXIT_UNKNOWN;
798 run->ready_for_interrupt_injection = 1; 885 run->ready_for_interrupt_injection = 1;
799#ifdef EXIT_DEBUG 886
800 printk(KERN_EMERG "exit_nr=0x%x | pc=0x%lx | dar=0x%lx | dec=0x%x | msr=0x%lx\n", 887 trace_kvm_book3s_exit(exit_nr, vcpu);
801 exit_nr, kvmppc_get_pc(vcpu), kvmppc_get_fault_dar(vcpu),
802 kvmppc_get_dec(vcpu), to_svcpu(vcpu)->shadow_srr1);
803#elif defined (EXIT_DEBUG_SIMPLE)
804 if ((exit_nr != 0x900) && (exit_nr != 0x500))
805 printk(KERN_EMERG "exit_nr=0x%x | pc=0x%lx | dar=0x%lx | msr=0x%lx\n",
806 exit_nr, kvmppc_get_pc(vcpu), kvmppc_get_fault_dar(vcpu),
807 vcpu->arch.msr);
808#endif
809 kvm_resched(vcpu); 888 kvm_resched(vcpu);
810 switch (exit_nr) { 889 switch (exit_nr) {
811 case BOOK3S_INTERRUPT_INST_STORAGE: 890 case BOOK3S_INTERRUPT_INST_STORAGE:
@@ -836,9 +915,9 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
836 kvmppc_mmu_pte_flush(vcpu, kvmppc_get_pc(vcpu), ~0xFFFUL); 915 kvmppc_mmu_pte_flush(vcpu, kvmppc_get_pc(vcpu), ~0xFFFUL);
837 r = RESUME_GUEST; 916 r = RESUME_GUEST;
838 } else { 917 } else {
839 vcpu->arch.msr |= to_svcpu(vcpu)->shadow_srr1 & 0x58000000; 918 vcpu->arch.shared->msr |=
919 to_svcpu(vcpu)->shadow_srr1 & 0x58000000;
840 kvmppc_book3s_queue_irqprio(vcpu, exit_nr); 920 kvmppc_book3s_queue_irqprio(vcpu, exit_nr);
841 kvmppc_mmu_pte_flush(vcpu, kvmppc_get_pc(vcpu), ~0xFFFUL);
842 r = RESUME_GUEST; 921 r = RESUME_GUEST;
843 } 922 }
844 break; 923 break;
@@ -861,17 +940,16 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
861 if (to_svcpu(vcpu)->fault_dsisr & DSISR_NOHPTE) { 940 if (to_svcpu(vcpu)->fault_dsisr & DSISR_NOHPTE) {
862 r = kvmppc_handle_pagefault(run, vcpu, dar, exit_nr); 941 r = kvmppc_handle_pagefault(run, vcpu, dar, exit_nr);
863 } else { 942 } else {
864 vcpu->arch.dear = dar; 943 vcpu->arch.shared->dar = dar;
865 to_book3s(vcpu)->dsisr = to_svcpu(vcpu)->fault_dsisr; 944 vcpu->arch.shared->dsisr = to_svcpu(vcpu)->fault_dsisr;
866 kvmppc_book3s_queue_irqprio(vcpu, exit_nr); 945 kvmppc_book3s_queue_irqprio(vcpu, exit_nr);
867 kvmppc_mmu_pte_flush(vcpu, vcpu->arch.dear, ~0xFFFUL);
868 r = RESUME_GUEST; 946 r = RESUME_GUEST;
869 } 947 }
870 break; 948 break;
871 } 949 }
872 case BOOK3S_INTERRUPT_DATA_SEGMENT: 950 case BOOK3S_INTERRUPT_DATA_SEGMENT:
873 if (kvmppc_mmu_map_segment(vcpu, kvmppc_get_fault_dar(vcpu)) < 0) { 951 if (kvmppc_mmu_map_segment(vcpu, kvmppc_get_fault_dar(vcpu)) < 0) {
874 vcpu->arch.dear = kvmppc_get_fault_dar(vcpu); 952 vcpu->arch.shared->dar = kvmppc_get_fault_dar(vcpu);
875 kvmppc_book3s_queue_irqprio(vcpu, 953 kvmppc_book3s_queue_irqprio(vcpu,
876 BOOK3S_INTERRUPT_DATA_SEGMENT); 954 BOOK3S_INTERRUPT_DATA_SEGMENT);
877 } 955 }
@@ -904,7 +982,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
904program_interrupt: 982program_interrupt:
905 flags = to_svcpu(vcpu)->shadow_srr1 & 0x1f0000ull; 983 flags = to_svcpu(vcpu)->shadow_srr1 & 0x1f0000ull;
906 984
907 if (vcpu->arch.msr & MSR_PR) { 985 if (vcpu->arch.shared->msr & MSR_PR) {
908#ifdef EXIT_DEBUG 986#ifdef EXIT_DEBUG
909 printk(KERN_INFO "Userspace triggered 0x700 exception at 0x%lx (0x%x)\n", kvmppc_get_pc(vcpu), kvmppc_get_last_inst(vcpu)); 987 printk(KERN_INFO "Userspace triggered 0x700 exception at 0x%lx (0x%x)\n", kvmppc_get_pc(vcpu), kvmppc_get_last_inst(vcpu));
910#endif 988#endif
@@ -941,10 +1019,10 @@ program_interrupt:
941 break; 1019 break;
942 } 1020 }
943 case BOOK3S_INTERRUPT_SYSCALL: 1021 case BOOK3S_INTERRUPT_SYSCALL:
944 // XXX make user settable
945 if (vcpu->arch.osi_enabled && 1022 if (vcpu->arch.osi_enabled &&
946 (((u32)kvmppc_get_gpr(vcpu, 3)) == OSI_SC_MAGIC_R3) && 1023 (((u32)kvmppc_get_gpr(vcpu, 3)) == OSI_SC_MAGIC_R3) &&
947 (((u32)kvmppc_get_gpr(vcpu, 4)) == OSI_SC_MAGIC_R4)) { 1024 (((u32)kvmppc_get_gpr(vcpu, 4)) == OSI_SC_MAGIC_R4)) {
1025 /* MOL hypercalls */
948 u64 *gprs = run->osi.gprs; 1026 u64 *gprs = run->osi.gprs;
949 int i; 1027 int i;
950 1028
@@ -953,8 +1031,13 @@ program_interrupt:
953 gprs[i] = kvmppc_get_gpr(vcpu, i); 1031 gprs[i] = kvmppc_get_gpr(vcpu, i);
954 vcpu->arch.osi_needed = 1; 1032 vcpu->arch.osi_needed = 1;
955 r = RESUME_HOST_NV; 1033 r = RESUME_HOST_NV;
956 1034 } else if (!(vcpu->arch.shared->msr & MSR_PR) &&
1035 (((u32)kvmppc_get_gpr(vcpu, 0)) == KVM_SC_MAGIC_R0)) {
1036 /* KVM PV hypercalls */
1037 kvmppc_set_gpr(vcpu, 3, kvmppc_kvm_pv(vcpu));
1038 r = RESUME_GUEST;
957 } else { 1039 } else {
1040 /* Guest syscalls */
958 vcpu->stat.syscall_exits++; 1041 vcpu->stat.syscall_exits++;
959 kvmppc_book3s_queue_irqprio(vcpu, exit_nr); 1042 kvmppc_book3s_queue_irqprio(vcpu, exit_nr);
960 r = RESUME_GUEST; 1043 r = RESUME_GUEST;
@@ -989,9 +1072,9 @@ program_interrupt:
989 } 1072 }
990 case BOOK3S_INTERRUPT_ALIGNMENT: 1073 case BOOK3S_INTERRUPT_ALIGNMENT:
991 if (kvmppc_read_inst(vcpu) == EMULATE_DONE) { 1074 if (kvmppc_read_inst(vcpu) == EMULATE_DONE) {
992 to_book3s(vcpu)->dsisr = kvmppc_alignment_dsisr(vcpu, 1075 vcpu->arch.shared->dsisr = kvmppc_alignment_dsisr(vcpu,
993 kvmppc_get_last_inst(vcpu)); 1076 kvmppc_get_last_inst(vcpu));
994 vcpu->arch.dear = kvmppc_alignment_dar(vcpu, 1077 vcpu->arch.shared->dar = kvmppc_alignment_dar(vcpu,
995 kvmppc_get_last_inst(vcpu)); 1078 kvmppc_get_last_inst(vcpu));
996 kvmppc_book3s_queue_irqprio(vcpu, exit_nr); 1079 kvmppc_book3s_queue_irqprio(vcpu, exit_nr);
997 } 1080 }
@@ -1031,9 +1114,7 @@ program_interrupt:
1031 } 1114 }
1032 } 1115 }
1033 1116
1034#ifdef EXIT_DEBUG 1117 trace_kvm_book3s_reenter(r, vcpu);
1035 printk(KERN_EMERG "KVM exit: vcpu=0x%p pc=0x%lx r=0x%x\n", vcpu, kvmppc_get_pc(vcpu), r);
1036#endif
1037 1118
1038 return r; 1119 return r;
1039} 1120}
@@ -1052,14 +1133,14 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
1052 regs->ctr = kvmppc_get_ctr(vcpu); 1133 regs->ctr = kvmppc_get_ctr(vcpu);
1053 regs->lr = kvmppc_get_lr(vcpu); 1134 regs->lr = kvmppc_get_lr(vcpu);
1054 regs->xer = kvmppc_get_xer(vcpu); 1135 regs->xer = kvmppc_get_xer(vcpu);
1055 regs->msr = vcpu->arch.msr; 1136 regs->msr = vcpu->arch.shared->msr;
1056 regs->srr0 = vcpu->arch.srr0; 1137 regs->srr0 = vcpu->arch.shared->srr0;
1057 regs->srr1 = vcpu->arch.srr1; 1138 regs->srr1 = vcpu->arch.shared->srr1;
1058 regs->pid = vcpu->arch.pid; 1139 regs->pid = vcpu->arch.pid;
1059 regs->sprg0 = vcpu->arch.sprg0; 1140 regs->sprg0 = vcpu->arch.shared->sprg0;
1060 regs->sprg1 = vcpu->arch.sprg1; 1141 regs->sprg1 = vcpu->arch.shared->sprg1;
1061 regs->sprg2 = vcpu->arch.sprg2; 1142 regs->sprg2 = vcpu->arch.shared->sprg2;
1062 regs->sprg3 = vcpu->arch.sprg3; 1143 regs->sprg3 = vcpu->arch.shared->sprg3;
1063 regs->sprg5 = vcpu->arch.sprg4; 1144 regs->sprg5 = vcpu->arch.sprg4;
1064 regs->sprg6 = vcpu->arch.sprg5; 1145 regs->sprg6 = vcpu->arch.sprg5;
1065 regs->sprg7 = vcpu->arch.sprg6; 1146 regs->sprg7 = vcpu->arch.sprg6;
@@ -1080,12 +1161,12 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
1080 kvmppc_set_lr(vcpu, regs->lr); 1161 kvmppc_set_lr(vcpu, regs->lr);
1081 kvmppc_set_xer(vcpu, regs->xer); 1162 kvmppc_set_xer(vcpu, regs->xer);
1082 kvmppc_set_msr(vcpu, regs->msr); 1163 kvmppc_set_msr(vcpu, regs->msr);
1083 vcpu->arch.srr0 = regs->srr0; 1164 vcpu->arch.shared->srr0 = regs->srr0;
1084 vcpu->arch.srr1 = regs->srr1; 1165 vcpu->arch.shared->srr1 = regs->srr1;
1085 vcpu->arch.sprg0 = regs->sprg0; 1166 vcpu->arch.shared->sprg0 = regs->sprg0;
1086 vcpu->arch.sprg1 = regs->sprg1; 1167 vcpu->arch.shared->sprg1 = regs->sprg1;
1087 vcpu->arch.sprg2 = regs->sprg2; 1168 vcpu->arch.shared->sprg2 = regs->sprg2;
1088 vcpu->arch.sprg3 = regs->sprg3; 1169 vcpu->arch.shared->sprg3 = regs->sprg3;
1089 vcpu->arch.sprg5 = regs->sprg4; 1170 vcpu->arch.sprg5 = regs->sprg4;
1090 vcpu->arch.sprg6 = regs->sprg5; 1171 vcpu->arch.sprg6 = regs->sprg5;
1091 vcpu->arch.sprg7 = regs->sprg6; 1172 vcpu->arch.sprg7 = regs->sprg6;
@@ -1111,10 +1192,9 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
1111 sregs->u.s.ppc64.slb[i].slbv = vcpu3s->slb[i].origv; 1192 sregs->u.s.ppc64.slb[i].slbv = vcpu3s->slb[i].origv;
1112 } 1193 }
1113 } else { 1194 } else {
1114 for (i = 0; i < 16; i++) { 1195 for (i = 0; i < 16; i++)
1115 sregs->u.s.ppc32.sr[i] = vcpu3s->sr[i].raw; 1196 sregs->u.s.ppc32.sr[i] = vcpu->arch.shared->sr[i];
1116 sregs->u.s.ppc32.sr[i] = vcpu3s->sr[i].raw; 1197
1117 }
1118 for (i = 0; i < 8; i++) { 1198 for (i = 0; i < 8; i++) {
1119 sregs->u.s.ppc32.ibat[i] = vcpu3s->ibat[i].raw; 1199 sregs->u.s.ppc32.ibat[i] = vcpu3s->ibat[i].raw;
1120 sregs->u.s.ppc32.dbat[i] = vcpu3s->dbat[i].raw; 1200 sregs->u.s.ppc32.dbat[i] = vcpu3s->dbat[i].raw;
@@ -1225,6 +1305,7 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
1225 struct kvmppc_vcpu_book3s *vcpu_book3s; 1305 struct kvmppc_vcpu_book3s *vcpu_book3s;
1226 struct kvm_vcpu *vcpu; 1306 struct kvm_vcpu *vcpu;
1227 int err = -ENOMEM; 1307 int err = -ENOMEM;
1308 unsigned long p;
1228 1309
1229 vcpu_book3s = vmalloc(sizeof(struct kvmppc_vcpu_book3s)); 1310 vcpu_book3s = vmalloc(sizeof(struct kvmppc_vcpu_book3s));
1230 if (!vcpu_book3s) 1311 if (!vcpu_book3s)
@@ -1242,6 +1323,12 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
1242 if (err) 1323 if (err)
1243 goto free_shadow_vcpu; 1324 goto free_shadow_vcpu;
1244 1325
1326 p = __get_free_page(GFP_KERNEL|__GFP_ZERO);
1327 /* the real shared page fills the last 4k of our page */
1328 vcpu->arch.shared = (void*)(p + PAGE_SIZE - 4096);
1329 if (!p)
1330 goto uninit_vcpu;
1331
1245 vcpu->arch.host_retip = kvm_return_point; 1332 vcpu->arch.host_retip = kvm_return_point;
1246 vcpu->arch.host_msr = mfmsr(); 1333 vcpu->arch.host_msr = mfmsr();
1247#ifdef CONFIG_PPC_BOOK3S_64 1334#ifdef CONFIG_PPC_BOOK3S_64
@@ -1268,10 +1355,12 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
1268 1355
1269 err = kvmppc_mmu_init(vcpu); 1356 err = kvmppc_mmu_init(vcpu);
1270 if (err < 0) 1357 if (err < 0)
1271 goto free_shadow_vcpu; 1358 goto uninit_vcpu;
1272 1359
1273 return vcpu; 1360 return vcpu;
1274 1361
1362uninit_vcpu:
1363 kvm_vcpu_uninit(vcpu);
1275free_shadow_vcpu: 1364free_shadow_vcpu:
1276 kfree(vcpu_book3s->shadow_vcpu); 1365 kfree(vcpu_book3s->shadow_vcpu);
1277free_vcpu: 1366free_vcpu:
@@ -1284,6 +1373,7 @@ void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu)
1284{ 1373{
1285 struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu); 1374 struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu);
1286 1375
1376 free_page((unsigned long)vcpu->arch.shared & PAGE_MASK);
1287 kvm_vcpu_uninit(vcpu); 1377 kvm_vcpu_uninit(vcpu);
1288 kfree(vcpu_book3s->shadow_vcpu); 1378 kfree(vcpu_book3s->shadow_vcpu);
1289 vfree(vcpu_book3s); 1379 vfree(vcpu_book3s);
@@ -1346,7 +1436,7 @@ int __kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
1346 local_irq_enable(); 1436 local_irq_enable();
1347 1437
1348 /* Preload FPU if it's enabled */ 1438 /* Preload FPU if it's enabled */
1349 if (vcpu->arch.msr & MSR_FP) 1439 if (vcpu->arch.shared->msr & MSR_FP)
1350 kvmppc_handle_ext(vcpu, BOOK3S_INTERRUPT_FP_UNAVAIL, MSR_FP); 1440 kvmppc_handle_ext(vcpu, BOOK3S_INTERRUPT_FP_UNAVAIL, MSR_FP);
1351 1441
1352 ret = __kvmppc_vcpu_entry(kvm_run, vcpu); 1442 ret = __kvmppc_vcpu_entry(kvm_run, vcpu);
diff --git a/arch/powerpc/kvm/book3s_32_mmu.c b/arch/powerpc/kvm/book3s_32_mmu.c
index 3292d76101d2..c8cefdd15fd8 100644
--- a/arch/powerpc/kvm/book3s_32_mmu.c
+++ b/arch/powerpc/kvm/book3s_32_mmu.c
@@ -58,14 +58,39 @@ static inline bool check_debug_ip(struct kvm_vcpu *vcpu)
58#endif 58#endif
59} 59}
60 60
61static inline u32 sr_vsid(u32 sr_raw)
62{
63 return sr_raw & 0x0fffffff;
64}
65
66static inline bool sr_valid(u32 sr_raw)
67{
68 return (sr_raw & 0x80000000) ? false : true;
69}
70
71static inline bool sr_ks(u32 sr_raw)
72{
73 return (sr_raw & 0x40000000) ? true: false;
74}
75
76static inline bool sr_kp(u32 sr_raw)
77{
78 return (sr_raw & 0x20000000) ? true: false;
79}
80
81static inline bool sr_nx(u32 sr_raw)
82{
83 return (sr_raw & 0x10000000) ? true: false;
84}
85
61static int kvmppc_mmu_book3s_32_xlate_bat(struct kvm_vcpu *vcpu, gva_t eaddr, 86static int kvmppc_mmu_book3s_32_xlate_bat(struct kvm_vcpu *vcpu, gva_t eaddr,
62 struct kvmppc_pte *pte, bool data); 87 struct kvmppc_pte *pte, bool data);
63static int kvmppc_mmu_book3s_32_esid_to_vsid(struct kvm_vcpu *vcpu, ulong esid, 88static int kvmppc_mmu_book3s_32_esid_to_vsid(struct kvm_vcpu *vcpu, ulong esid,
64 u64 *vsid); 89 u64 *vsid);
65 90
66static struct kvmppc_sr *find_sr(struct kvmppc_vcpu_book3s *vcpu_book3s, gva_t eaddr) 91static u32 find_sr(struct kvm_vcpu *vcpu, gva_t eaddr)
67{ 92{
68 return &vcpu_book3s->sr[(eaddr >> 28) & 0xf]; 93 return vcpu->arch.shared->sr[(eaddr >> 28) & 0xf];
69} 94}
70 95
71static u64 kvmppc_mmu_book3s_32_ea_to_vp(struct kvm_vcpu *vcpu, gva_t eaddr, 96static u64 kvmppc_mmu_book3s_32_ea_to_vp(struct kvm_vcpu *vcpu, gva_t eaddr,
@@ -87,7 +112,7 @@ static void kvmppc_mmu_book3s_32_reset_msr(struct kvm_vcpu *vcpu)
87} 112}
88 113
89static hva_t kvmppc_mmu_book3s_32_get_pteg(struct kvmppc_vcpu_book3s *vcpu_book3s, 114static hva_t kvmppc_mmu_book3s_32_get_pteg(struct kvmppc_vcpu_book3s *vcpu_book3s,
90 struct kvmppc_sr *sre, gva_t eaddr, 115 u32 sre, gva_t eaddr,
91 bool primary) 116 bool primary)
92{ 117{
93 u32 page, hash, pteg, htabmask; 118 u32 page, hash, pteg, htabmask;
@@ -96,7 +121,7 @@ static hva_t kvmppc_mmu_book3s_32_get_pteg(struct kvmppc_vcpu_book3s *vcpu_book3
96 page = (eaddr & 0x0FFFFFFF) >> 12; 121 page = (eaddr & 0x0FFFFFFF) >> 12;
97 htabmask = ((vcpu_book3s->sdr1 & 0x1FF) << 16) | 0xFFC0; 122 htabmask = ((vcpu_book3s->sdr1 & 0x1FF) << 16) | 0xFFC0;
98 123
99 hash = ((sre->vsid ^ page) << 6); 124 hash = ((sr_vsid(sre) ^ page) << 6);
100 if (!primary) 125 if (!primary)
101 hash = ~hash; 126 hash = ~hash;
102 hash &= htabmask; 127 hash &= htabmask;
@@ -104,8 +129,8 @@ static hva_t kvmppc_mmu_book3s_32_get_pteg(struct kvmppc_vcpu_book3s *vcpu_book3
104 pteg = (vcpu_book3s->sdr1 & 0xffff0000) | hash; 129 pteg = (vcpu_book3s->sdr1 & 0xffff0000) | hash;
105 130
106 dprintk("MMU: pc=0x%lx eaddr=0x%lx sdr1=0x%llx pteg=0x%x vsid=0x%x\n", 131 dprintk("MMU: pc=0x%lx eaddr=0x%lx sdr1=0x%llx pteg=0x%x vsid=0x%x\n",
107 vcpu_book3s->vcpu.arch.pc, eaddr, vcpu_book3s->sdr1, pteg, 132 kvmppc_get_pc(&vcpu_book3s->vcpu), eaddr, vcpu_book3s->sdr1, pteg,
108 sre->vsid); 133 sr_vsid(sre));
109 134
110 r = gfn_to_hva(vcpu_book3s->vcpu.kvm, pteg >> PAGE_SHIFT); 135 r = gfn_to_hva(vcpu_book3s->vcpu.kvm, pteg >> PAGE_SHIFT);
111 if (kvm_is_error_hva(r)) 136 if (kvm_is_error_hva(r))
@@ -113,10 +138,9 @@ static hva_t kvmppc_mmu_book3s_32_get_pteg(struct kvmppc_vcpu_book3s *vcpu_book3
113 return r | (pteg & ~PAGE_MASK); 138 return r | (pteg & ~PAGE_MASK);
114} 139}
115 140
116static u32 kvmppc_mmu_book3s_32_get_ptem(struct kvmppc_sr *sre, gva_t eaddr, 141static u32 kvmppc_mmu_book3s_32_get_ptem(u32 sre, gva_t eaddr, bool primary)
117 bool primary)
118{ 142{
119 return ((eaddr & 0x0fffffff) >> 22) | (sre->vsid << 7) | 143 return ((eaddr & 0x0fffffff) >> 22) | (sr_vsid(sre) << 7) |
120 (primary ? 0 : 0x40) | 0x80000000; 144 (primary ? 0 : 0x40) | 0x80000000;
121} 145}
122 146
@@ -133,7 +157,7 @@ static int kvmppc_mmu_book3s_32_xlate_bat(struct kvm_vcpu *vcpu, gva_t eaddr,
133 else 157 else
134 bat = &vcpu_book3s->ibat[i]; 158 bat = &vcpu_book3s->ibat[i];
135 159
136 if (vcpu->arch.msr & MSR_PR) { 160 if (vcpu->arch.shared->msr & MSR_PR) {
137 if (!bat->vp) 161 if (!bat->vp)
138 continue; 162 continue;
139 } else { 163 } else {
@@ -180,17 +204,17 @@ static int kvmppc_mmu_book3s_32_xlate_pte(struct kvm_vcpu *vcpu, gva_t eaddr,
180 bool primary) 204 bool primary)
181{ 205{
182 struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu); 206 struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu);
183 struct kvmppc_sr *sre; 207 u32 sre;
184 hva_t ptegp; 208 hva_t ptegp;
185 u32 pteg[16]; 209 u32 pteg[16];
186 u32 ptem = 0; 210 u32 ptem = 0;
187 int i; 211 int i;
188 int found = 0; 212 int found = 0;
189 213
190 sre = find_sr(vcpu_book3s, eaddr); 214 sre = find_sr(vcpu, eaddr);
191 215
192 dprintk_pte("SR 0x%lx: vsid=0x%x, raw=0x%x\n", eaddr >> 28, 216 dprintk_pte("SR 0x%lx: vsid=0x%x, raw=0x%x\n", eaddr >> 28,
193 sre->vsid, sre->raw); 217 sr_vsid(sre), sre);
194 218
195 pte->vpage = kvmppc_mmu_book3s_32_ea_to_vp(vcpu, eaddr, data); 219 pte->vpage = kvmppc_mmu_book3s_32_ea_to_vp(vcpu, eaddr, data);
196 220
@@ -214,8 +238,8 @@ static int kvmppc_mmu_book3s_32_xlate_pte(struct kvm_vcpu *vcpu, gva_t eaddr,
214 pte->raddr = (pteg[i+1] & ~(0xFFFULL)) | (eaddr & 0xFFF); 238 pte->raddr = (pteg[i+1] & ~(0xFFFULL)) | (eaddr & 0xFFF);
215 pp = pteg[i+1] & 3; 239 pp = pteg[i+1] & 3;
216 240
217 if ((sre->Kp && (vcpu->arch.msr & MSR_PR)) || 241 if ((sr_kp(sre) && (vcpu->arch.shared->msr & MSR_PR)) ||
218 (sre->Ks && !(vcpu->arch.msr & MSR_PR))) 242 (sr_ks(sre) && !(vcpu->arch.shared->msr & MSR_PR)))
219 pp |= 4; 243 pp |= 4;
220 244
221 pte->may_write = false; 245 pte->may_write = false;
@@ -269,7 +293,7 @@ no_page_found:
269 dprintk_pte("KVM MMU: No PTE found (sdr1=0x%llx ptegp=0x%lx)\n", 293 dprintk_pte("KVM MMU: No PTE found (sdr1=0x%llx ptegp=0x%lx)\n",
270 to_book3s(vcpu)->sdr1, ptegp); 294 to_book3s(vcpu)->sdr1, ptegp);
271 for (i=0; i<16; i+=2) { 295 for (i=0; i<16; i+=2) {
272 dprintk_pte(" %02d: 0x%x - 0x%x (0x%llx)\n", 296 dprintk_pte(" %02d: 0x%x - 0x%x (0x%x)\n",
273 i, pteg[i], pteg[i+1], ptem); 297 i, pteg[i], pteg[i+1], ptem);
274 } 298 }
275 } 299 }
@@ -281,8 +305,24 @@ static int kvmppc_mmu_book3s_32_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
281 struct kvmppc_pte *pte, bool data) 305 struct kvmppc_pte *pte, bool data)
282{ 306{
283 int r; 307 int r;
308 ulong mp_ea = vcpu->arch.magic_page_ea;
284 309
285 pte->eaddr = eaddr; 310 pte->eaddr = eaddr;
311
312 /* Magic page override */
313 if (unlikely(mp_ea) &&
314 unlikely((eaddr & ~0xfffULL) == (mp_ea & ~0xfffULL)) &&
315 !(vcpu->arch.shared->msr & MSR_PR)) {
316 pte->vpage = kvmppc_mmu_book3s_32_ea_to_vp(vcpu, eaddr, data);
317 pte->raddr = vcpu->arch.magic_page_pa | (pte->raddr & 0xfff);
318 pte->raddr &= KVM_PAM;
319 pte->may_execute = true;
320 pte->may_read = true;
321 pte->may_write = true;
322
323 return 0;
324 }
325
286 r = kvmppc_mmu_book3s_32_xlate_bat(vcpu, eaddr, pte, data); 326 r = kvmppc_mmu_book3s_32_xlate_bat(vcpu, eaddr, pte, data);
287 if (r < 0) 327 if (r < 0)
288 r = kvmppc_mmu_book3s_32_xlate_pte(vcpu, eaddr, pte, data, true); 328 r = kvmppc_mmu_book3s_32_xlate_pte(vcpu, eaddr, pte, data, true);
@@ -295,30 +335,13 @@ static int kvmppc_mmu_book3s_32_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
295 335
296static u32 kvmppc_mmu_book3s_32_mfsrin(struct kvm_vcpu *vcpu, u32 srnum) 336static u32 kvmppc_mmu_book3s_32_mfsrin(struct kvm_vcpu *vcpu, u32 srnum)
297{ 337{
298 return to_book3s(vcpu)->sr[srnum].raw; 338 return vcpu->arch.shared->sr[srnum];
299} 339}
300 340
301static void kvmppc_mmu_book3s_32_mtsrin(struct kvm_vcpu *vcpu, u32 srnum, 341static void kvmppc_mmu_book3s_32_mtsrin(struct kvm_vcpu *vcpu, u32 srnum,
302 ulong value) 342 ulong value)
303{ 343{
304 struct kvmppc_sr *sre; 344 vcpu->arch.shared->sr[srnum] = value;
305
306 sre = &to_book3s(vcpu)->sr[srnum];
307
308 /* Flush any left-over shadows from the previous SR */
309
310 /* XXX Not necessary? */
311 /* kvmppc_mmu_pte_flush(vcpu, ((u64)sre->vsid) << 28, 0xf0000000ULL); */
312
313 /* And then put in the new SR */
314 sre->raw = value;
315 sre->vsid = (value & 0x0fffffff);
316 sre->valid = (value & 0x80000000) ? false : true;
317 sre->Ks = (value & 0x40000000) ? true : false;
318 sre->Kp = (value & 0x20000000) ? true : false;
319 sre->nx = (value & 0x10000000) ? true : false;
320
321 /* Map the new segment */
322 kvmppc_mmu_map_segment(vcpu, srnum << SID_SHIFT); 345 kvmppc_mmu_map_segment(vcpu, srnum << SID_SHIFT);
323} 346}
324 347
@@ -331,19 +354,19 @@ static int kvmppc_mmu_book3s_32_esid_to_vsid(struct kvm_vcpu *vcpu, ulong esid,
331 u64 *vsid) 354 u64 *vsid)
332{ 355{
333 ulong ea = esid << SID_SHIFT; 356 ulong ea = esid << SID_SHIFT;
334 struct kvmppc_sr *sr; 357 u32 sr;
335 u64 gvsid = esid; 358 u64 gvsid = esid;
336 359
337 if (vcpu->arch.msr & (MSR_DR|MSR_IR)) { 360 if (vcpu->arch.shared->msr & (MSR_DR|MSR_IR)) {
338 sr = find_sr(to_book3s(vcpu), ea); 361 sr = find_sr(vcpu, ea);
339 if (sr->valid) 362 if (sr_valid(sr))
340 gvsid = sr->vsid; 363 gvsid = sr_vsid(sr);
341 } 364 }
342 365
343 /* In case we only have one of MSR_IR or MSR_DR set, let's put 366 /* In case we only have one of MSR_IR or MSR_DR set, let's put
344 that in the real-mode context (and hope RM doesn't access 367 that in the real-mode context (and hope RM doesn't access
345 high memory) */ 368 high memory) */
346 switch (vcpu->arch.msr & (MSR_DR|MSR_IR)) { 369 switch (vcpu->arch.shared->msr & (MSR_DR|MSR_IR)) {
347 case 0: 370 case 0:
348 *vsid = VSID_REAL | esid; 371 *vsid = VSID_REAL | esid;
349 break; 372 break;
@@ -354,8 +377,8 @@ static int kvmppc_mmu_book3s_32_esid_to_vsid(struct kvm_vcpu *vcpu, ulong esid,
354 *vsid = VSID_REAL_DR | gvsid; 377 *vsid = VSID_REAL_DR | gvsid;
355 break; 378 break;
356 case MSR_DR|MSR_IR: 379 case MSR_DR|MSR_IR:
357 if (sr->valid) 380 if (sr_valid(sr))
358 *vsid = sr->vsid; 381 *vsid = sr_vsid(sr);
359 else 382 else
360 *vsid = VSID_BAT | gvsid; 383 *vsid = VSID_BAT | gvsid;
361 break; 384 break;
@@ -363,7 +386,7 @@ static int kvmppc_mmu_book3s_32_esid_to_vsid(struct kvm_vcpu *vcpu, ulong esid,
363 BUG(); 386 BUG();
364 } 387 }
365 388
366 if (vcpu->arch.msr & MSR_PR) 389 if (vcpu->arch.shared->msr & MSR_PR)
367 *vsid |= VSID_PR; 390 *vsid |= VSID_PR;
368 391
369 return 0; 392 return 0;
diff --git a/arch/powerpc/kvm/book3s_32_mmu_host.c b/arch/powerpc/kvm/book3s_32_mmu_host.c
index 0b51ef872c1e..9fecbfbce773 100644
--- a/arch/powerpc/kvm/book3s_32_mmu_host.c
+++ b/arch/powerpc/kvm/book3s_32_mmu_host.c
@@ -19,7 +19,6 @@
19 */ 19 */
20 20
21#include <linux/kvm_host.h> 21#include <linux/kvm_host.h>
22#include <linux/hash.h>
23 22
24#include <asm/kvm_ppc.h> 23#include <asm/kvm_ppc.h>
25#include <asm/kvm_book3s.h> 24#include <asm/kvm_book3s.h>
@@ -77,7 +76,14 @@ void kvmppc_mmu_invalidate_pte(struct kvm_vcpu *vcpu, struct hpte_cache *pte)
77 * a hash, so we don't waste cycles on looping */ 76 * a hash, so we don't waste cycles on looping */
78static u16 kvmppc_sid_hash(struct kvm_vcpu *vcpu, u64 gvsid) 77static u16 kvmppc_sid_hash(struct kvm_vcpu *vcpu, u64 gvsid)
79{ 78{
80 return hash_64(gvsid, SID_MAP_BITS); 79 return (u16)(((gvsid >> (SID_MAP_BITS * 7)) & SID_MAP_MASK) ^
80 ((gvsid >> (SID_MAP_BITS * 6)) & SID_MAP_MASK) ^
81 ((gvsid >> (SID_MAP_BITS * 5)) & SID_MAP_MASK) ^
82 ((gvsid >> (SID_MAP_BITS * 4)) & SID_MAP_MASK) ^
83 ((gvsid >> (SID_MAP_BITS * 3)) & SID_MAP_MASK) ^
84 ((gvsid >> (SID_MAP_BITS * 2)) & SID_MAP_MASK) ^
85 ((gvsid >> (SID_MAP_BITS * 1)) & SID_MAP_MASK) ^
86 ((gvsid >> (SID_MAP_BITS * 0)) & SID_MAP_MASK));
81} 87}
82 88
83 89
@@ -86,7 +92,7 @@ static struct kvmppc_sid_map *find_sid_vsid(struct kvm_vcpu *vcpu, u64 gvsid)
86 struct kvmppc_sid_map *map; 92 struct kvmppc_sid_map *map;
87 u16 sid_map_mask; 93 u16 sid_map_mask;
88 94
89 if (vcpu->arch.msr & MSR_PR) 95 if (vcpu->arch.shared->msr & MSR_PR)
90 gvsid |= VSID_PR; 96 gvsid |= VSID_PR;
91 97
92 sid_map_mask = kvmppc_sid_hash(vcpu, gvsid); 98 sid_map_mask = kvmppc_sid_hash(vcpu, gvsid);
@@ -147,8 +153,8 @@ int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte)
147 struct hpte_cache *pte; 153 struct hpte_cache *pte;
148 154
149 /* Get host physical address for gpa */ 155 /* Get host physical address for gpa */
150 hpaddr = gfn_to_pfn(vcpu->kvm, orig_pte->raddr >> PAGE_SHIFT); 156 hpaddr = kvmppc_gfn_to_pfn(vcpu, orig_pte->raddr >> PAGE_SHIFT);
151 if (kvm_is_error_hva(hpaddr)) { 157 if (is_error_pfn(hpaddr)) {
152 printk(KERN_INFO "Couldn't get guest page for gfn %lx!\n", 158 printk(KERN_INFO "Couldn't get guest page for gfn %lx!\n",
153 orig_pte->eaddr); 159 orig_pte->eaddr);
154 return -EINVAL; 160 return -EINVAL;
@@ -253,7 +259,7 @@ static struct kvmppc_sid_map *create_sid_map(struct kvm_vcpu *vcpu, u64 gvsid)
253 u16 sid_map_mask; 259 u16 sid_map_mask;
254 static int backwards_map = 0; 260 static int backwards_map = 0;
255 261
256 if (vcpu->arch.msr & MSR_PR) 262 if (vcpu->arch.shared->msr & MSR_PR)
257 gvsid |= VSID_PR; 263 gvsid |= VSID_PR;
258 264
259 /* We might get collisions that trap in preceding order, so let's 265 /* We might get collisions that trap in preceding order, so let's
@@ -269,18 +275,15 @@ static struct kvmppc_sid_map *create_sid_map(struct kvm_vcpu *vcpu, u64 gvsid)
269 backwards_map = !backwards_map; 275 backwards_map = !backwards_map;
270 276
271 /* Uh-oh ... out of mappings. Let's flush! */ 277 /* Uh-oh ... out of mappings. Let's flush! */
272 if (vcpu_book3s->vsid_next >= vcpu_book3s->vsid_max) { 278 if (vcpu_book3s->vsid_next >= VSID_POOL_SIZE) {
273 vcpu_book3s->vsid_next = vcpu_book3s->vsid_first; 279 vcpu_book3s->vsid_next = 0;
274 memset(vcpu_book3s->sid_map, 0, 280 memset(vcpu_book3s->sid_map, 0,
275 sizeof(struct kvmppc_sid_map) * SID_MAP_NUM); 281 sizeof(struct kvmppc_sid_map) * SID_MAP_NUM);
276 kvmppc_mmu_pte_flush(vcpu, 0, 0); 282 kvmppc_mmu_pte_flush(vcpu, 0, 0);
277 kvmppc_mmu_flush_segments(vcpu); 283 kvmppc_mmu_flush_segments(vcpu);
278 } 284 }
279 map->host_vsid = vcpu_book3s->vsid_next; 285 map->host_vsid = vcpu_book3s->vsid_pool[vcpu_book3s->vsid_next];
280 286 vcpu_book3s->vsid_next++;
281 /* Would have to be 111 to be completely aligned with the rest of
282 Linux, but that is just way too little space! */
283 vcpu_book3s->vsid_next+=1;
284 287
285 map->guest_vsid = gvsid; 288 map->guest_vsid = gvsid;
286 map->valid = true; 289 map->valid = true;
@@ -327,40 +330,38 @@ void kvmppc_mmu_flush_segments(struct kvm_vcpu *vcpu)
327 330
328void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu) 331void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu)
329{ 332{
333 int i;
334
330 kvmppc_mmu_hpte_destroy(vcpu); 335 kvmppc_mmu_hpte_destroy(vcpu);
331 preempt_disable(); 336 preempt_disable();
332 __destroy_context(to_book3s(vcpu)->context_id); 337 for (i = 0; i < SID_CONTEXTS; i++)
338 __destroy_context(to_book3s(vcpu)->context_id[i]);
333 preempt_enable(); 339 preempt_enable();
334} 340}
335 341
336/* From mm/mmu_context_hash32.c */ 342/* From mm/mmu_context_hash32.c */
337#define CTX_TO_VSID(ctx) (((ctx) * (897 * 16)) & 0xffffff) 343#define CTX_TO_VSID(c, id) ((((c) * (897 * 16)) + (id * 0x111)) & 0xffffff)
338 344
339int kvmppc_mmu_init(struct kvm_vcpu *vcpu) 345int kvmppc_mmu_init(struct kvm_vcpu *vcpu)
340{ 346{
341 struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu); 347 struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu);
342 int err; 348 int err;
343 ulong sdr1; 349 ulong sdr1;
350 int i;
351 int j;
344 352
345 err = __init_new_context(); 353 for (i = 0; i < SID_CONTEXTS; i++) {
346 if (err < 0) 354 err = __init_new_context();
347 return -1; 355 if (err < 0)
348 vcpu3s->context_id = err; 356 goto init_fail;
349 357 vcpu3s->context_id[i] = err;
350 vcpu3s->vsid_max = CTX_TO_VSID(vcpu3s->context_id + 1) - 1;
351 vcpu3s->vsid_first = CTX_TO_VSID(vcpu3s->context_id);
352
353#if 0 /* XXX still doesn't guarantee uniqueness */
354 /* We could collide with the Linux vsid space because the vsid
355 * wraps around at 24 bits. We're safe if we do our own space
356 * though, so let's always set the highest bit. */
357 358
358 vcpu3s->vsid_max |= 0x00800000; 359 /* Remember context id for this combination */
359 vcpu3s->vsid_first |= 0x00800000; 360 for (j = 0; j < 16; j++)
360#endif 361 vcpu3s->vsid_pool[(i * 16) + j] = CTX_TO_VSID(err, j);
361 BUG_ON(vcpu3s->vsid_max < vcpu3s->vsid_first); 362 }
362 363
363 vcpu3s->vsid_next = vcpu3s->vsid_first; 364 vcpu3s->vsid_next = 0;
364 365
365 /* Remember where the HTAB is */ 366 /* Remember where the HTAB is */
366 asm ( "mfsdr1 %0" : "=r"(sdr1) ); 367 asm ( "mfsdr1 %0" : "=r"(sdr1) );
@@ -370,4 +371,14 @@ int kvmppc_mmu_init(struct kvm_vcpu *vcpu)
370 kvmppc_mmu_hpte_init(vcpu); 371 kvmppc_mmu_hpte_init(vcpu);
371 372
372 return 0; 373 return 0;
374
375init_fail:
376 for (j = 0; j < i; j++) {
377 if (!vcpu3s->context_id[j])
378 continue;
379
380 __destroy_context(to_book3s(vcpu)->context_id[j]);
381 }
382
383 return -1;
373} 384}
diff --git a/arch/powerpc/kvm/book3s_64_mmu.c b/arch/powerpc/kvm/book3s_64_mmu.c
index 4025ea26b3c1..d7889ef3211e 100644
--- a/arch/powerpc/kvm/book3s_64_mmu.c
+++ b/arch/powerpc/kvm/book3s_64_mmu.c
@@ -163,6 +163,22 @@ static int kvmppc_mmu_book3s_64_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
163 bool found = false; 163 bool found = false;
164 bool perm_err = false; 164 bool perm_err = false;
165 int second = 0; 165 int second = 0;
166 ulong mp_ea = vcpu->arch.magic_page_ea;
167
168 /* Magic page override */
169 if (unlikely(mp_ea) &&
170 unlikely((eaddr & ~0xfffULL) == (mp_ea & ~0xfffULL)) &&
171 !(vcpu->arch.shared->msr & MSR_PR)) {
172 gpte->eaddr = eaddr;
173 gpte->vpage = kvmppc_mmu_book3s_64_ea_to_vp(vcpu, eaddr, data);
174 gpte->raddr = vcpu->arch.magic_page_pa | (gpte->raddr & 0xfff);
175 gpte->raddr &= KVM_PAM;
176 gpte->may_execute = true;
177 gpte->may_read = true;
178 gpte->may_write = true;
179
180 return 0;
181 }
166 182
167 slbe = kvmppc_mmu_book3s_64_find_slbe(vcpu_book3s, eaddr); 183 slbe = kvmppc_mmu_book3s_64_find_slbe(vcpu_book3s, eaddr);
168 if (!slbe) 184 if (!slbe)
@@ -180,9 +196,9 @@ do_second:
180 goto no_page_found; 196 goto no_page_found;
181 } 197 }
182 198
183 if ((vcpu->arch.msr & MSR_PR) && slbe->Kp) 199 if ((vcpu->arch.shared->msr & MSR_PR) && slbe->Kp)
184 key = 4; 200 key = 4;
185 else if (!(vcpu->arch.msr & MSR_PR) && slbe->Ks) 201 else if (!(vcpu->arch.shared->msr & MSR_PR) && slbe->Ks)
186 key = 4; 202 key = 4;
187 203
188 for (i=0; i<16; i+=2) { 204 for (i=0; i<16; i+=2) {
@@ -381,7 +397,7 @@ static void kvmppc_mmu_book3s_64_slbia(struct kvm_vcpu *vcpu)
381 for (i = 1; i < vcpu_book3s->slb_nr; i++) 397 for (i = 1; i < vcpu_book3s->slb_nr; i++)
382 vcpu_book3s->slb[i].valid = false; 398 vcpu_book3s->slb[i].valid = false;
383 399
384 if (vcpu->arch.msr & MSR_IR) { 400 if (vcpu->arch.shared->msr & MSR_IR) {
385 kvmppc_mmu_flush_segments(vcpu); 401 kvmppc_mmu_flush_segments(vcpu);
386 kvmppc_mmu_map_segment(vcpu, kvmppc_get_pc(vcpu)); 402 kvmppc_mmu_map_segment(vcpu, kvmppc_get_pc(vcpu));
387 } 403 }
@@ -445,14 +461,15 @@ static int kvmppc_mmu_book3s_64_esid_to_vsid(struct kvm_vcpu *vcpu, ulong esid,
445 ulong ea = esid << SID_SHIFT; 461 ulong ea = esid << SID_SHIFT;
446 struct kvmppc_slb *slb; 462 struct kvmppc_slb *slb;
447 u64 gvsid = esid; 463 u64 gvsid = esid;
464 ulong mp_ea = vcpu->arch.magic_page_ea;
448 465
449 if (vcpu->arch.msr & (MSR_DR|MSR_IR)) { 466 if (vcpu->arch.shared->msr & (MSR_DR|MSR_IR)) {
450 slb = kvmppc_mmu_book3s_64_find_slbe(to_book3s(vcpu), ea); 467 slb = kvmppc_mmu_book3s_64_find_slbe(to_book3s(vcpu), ea);
451 if (slb) 468 if (slb)
452 gvsid = slb->vsid; 469 gvsid = slb->vsid;
453 } 470 }
454 471
455 switch (vcpu->arch.msr & (MSR_DR|MSR_IR)) { 472 switch (vcpu->arch.shared->msr & (MSR_DR|MSR_IR)) {
456 case 0: 473 case 0:
457 *vsid = VSID_REAL | esid; 474 *vsid = VSID_REAL | esid;
458 break; 475 break;
@@ -464,7 +481,7 @@ static int kvmppc_mmu_book3s_64_esid_to_vsid(struct kvm_vcpu *vcpu, ulong esid,
464 break; 481 break;
465 case MSR_DR|MSR_IR: 482 case MSR_DR|MSR_IR:
466 if (!slb) 483 if (!slb)
467 return -ENOENT; 484 goto no_slb;
468 485
469 *vsid = gvsid; 486 *vsid = gvsid;
470 break; 487 break;
@@ -473,10 +490,21 @@ static int kvmppc_mmu_book3s_64_esid_to_vsid(struct kvm_vcpu *vcpu, ulong esid,
473 break; 490 break;
474 } 491 }
475 492
476 if (vcpu->arch.msr & MSR_PR) 493 if (vcpu->arch.shared->msr & MSR_PR)
477 *vsid |= VSID_PR; 494 *vsid |= VSID_PR;
478 495
479 return 0; 496 return 0;
497
498no_slb:
499 /* Catch magic page case */
500 if (unlikely(mp_ea) &&
501 unlikely(esid == (mp_ea >> SID_SHIFT)) &&
502 !(vcpu->arch.shared->msr & MSR_PR)) {
503 *vsid = VSID_REAL | esid;
504 return 0;
505 }
506
507 return -EINVAL;
480} 508}
481 509
482static bool kvmppc_mmu_book3s_64_is_dcbz32(struct kvm_vcpu *vcpu) 510static bool kvmppc_mmu_book3s_64_is_dcbz32(struct kvm_vcpu *vcpu)
diff --git a/arch/powerpc/kvm/book3s_64_mmu_host.c b/arch/powerpc/kvm/book3s_64_mmu_host.c
index 384179a5002b..fa2f08434ba5 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_host.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_host.c
@@ -20,7 +20,6 @@
20 */ 20 */
21 21
22#include <linux/kvm_host.h> 22#include <linux/kvm_host.h>
23#include <linux/hash.h>
24 23
25#include <asm/kvm_ppc.h> 24#include <asm/kvm_ppc.h>
26#include <asm/kvm_book3s.h> 25#include <asm/kvm_book3s.h>
@@ -28,24 +27,9 @@
28#include <asm/machdep.h> 27#include <asm/machdep.h>
29#include <asm/mmu_context.h> 28#include <asm/mmu_context.h>
30#include <asm/hw_irq.h> 29#include <asm/hw_irq.h>
30#include "trace.h"
31 31
32#define PTE_SIZE 12 32#define PTE_SIZE 12
33#define VSID_ALL 0
34
35/* #define DEBUG_MMU */
36/* #define DEBUG_SLB */
37
38#ifdef DEBUG_MMU
39#define dprintk_mmu(a, ...) printk(KERN_INFO a, __VA_ARGS__)
40#else
41#define dprintk_mmu(a, ...) do { } while(0)
42#endif
43
44#ifdef DEBUG_SLB
45#define dprintk_slb(a, ...) printk(KERN_INFO a, __VA_ARGS__)
46#else
47#define dprintk_slb(a, ...) do { } while(0)
48#endif
49 33
50void kvmppc_mmu_invalidate_pte(struct kvm_vcpu *vcpu, struct hpte_cache *pte) 34void kvmppc_mmu_invalidate_pte(struct kvm_vcpu *vcpu, struct hpte_cache *pte)
51{ 35{
@@ -58,34 +42,39 @@ void kvmppc_mmu_invalidate_pte(struct kvm_vcpu *vcpu, struct hpte_cache *pte)
58 * a hash, so we don't waste cycles on looping */ 42 * a hash, so we don't waste cycles on looping */
59static u16 kvmppc_sid_hash(struct kvm_vcpu *vcpu, u64 gvsid) 43static u16 kvmppc_sid_hash(struct kvm_vcpu *vcpu, u64 gvsid)
60{ 44{
61 return hash_64(gvsid, SID_MAP_BITS); 45 return (u16)(((gvsid >> (SID_MAP_BITS * 7)) & SID_MAP_MASK) ^
46 ((gvsid >> (SID_MAP_BITS * 6)) & SID_MAP_MASK) ^
47 ((gvsid >> (SID_MAP_BITS * 5)) & SID_MAP_MASK) ^
48 ((gvsid >> (SID_MAP_BITS * 4)) & SID_MAP_MASK) ^
49 ((gvsid >> (SID_MAP_BITS * 3)) & SID_MAP_MASK) ^
50 ((gvsid >> (SID_MAP_BITS * 2)) & SID_MAP_MASK) ^
51 ((gvsid >> (SID_MAP_BITS * 1)) & SID_MAP_MASK) ^
52 ((gvsid >> (SID_MAP_BITS * 0)) & SID_MAP_MASK));
62} 53}
63 54
55
64static struct kvmppc_sid_map *find_sid_vsid(struct kvm_vcpu *vcpu, u64 gvsid) 56static struct kvmppc_sid_map *find_sid_vsid(struct kvm_vcpu *vcpu, u64 gvsid)
65{ 57{
66 struct kvmppc_sid_map *map; 58 struct kvmppc_sid_map *map;
67 u16 sid_map_mask; 59 u16 sid_map_mask;
68 60
69 if (vcpu->arch.msr & MSR_PR) 61 if (vcpu->arch.shared->msr & MSR_PR)
70 gvsid |= VSID_PR; 62 gvsid |= VSID_PR;
71 63
72 sid_map_mask = kvmppc_sid_hash(vcpu, gvsid); 64 sid_map_mask = kvmppc_sid_hash(vcpu, gvsid);
73 map = &to_book3s(vcpu)->sid_map[sid_map_mask]; 65 map = &to_book3s(vcpu)->sid_map[sid_map_mask];
74 if (map->guest_vsid == gvsid) { 66 if (map->valid && (map->guest_vsid == gvsid)) {
75 dprintk_slb("SLB: Searching: 0x%llx -> 0x%llx\n", 67 trace_kvm_book3s_slb_found(gvsid, map->host_vsid);
76 gvsid, map->host_vsid);
77 return map; 68 return map;
78 } 69 }
79 70
80 map = &to_book3s(vcpu)->sid_map[SID_MAP_MASK - sid_map_mask]; 71 map = &to_book3s(vcpu)->sid_map[SID_MAP_MASK - sid_map_mask];
81 if (map->guest_vsid == gvsid) { 72 if (map->valid && (map->guest_vsid == gvsid)) {
82 dprintk_slb("SLB: Searching 0x%llx -> 0x%llx\n", 73 trace_kvm_book3s_slb_found(gvsid, map->host_vsid);
83 gvsid, map->host_vsid);
84 return map; 74 return map;
85 } 75 }
86 76
87 dprintk_slb("SLB: Searching %d/%d: 0x%llx -> not found\n", 77 trace_kvm_book3s_slb_fail(sid_map_mask, gvsid);
88 sid_map_mask, SID_MAP_MASK - sid_map_mask, gvsid);
89 return NULL; 78 return NULL;
90} 79}
91 80
@@ -101,18 +90,13 @@ int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte)
101 struct kvmppc_sid_map *map; 90 struct kvmppc_sid_map *map;
102 91
103 /* Get host physical address for gpa */ 92 /* Get host physical address for gpa */
104 hpaddr = gfn_to_pfn(vcpu->kvm, orig_pte->raddr >> PAGE_SHIFT); 93 hpaddr = kvmppc_gfn_to_pfn(vcpu, orig_pte->raddr >> PAGE_SHIFT);
105 if (kvm_is_error_hva(hpaddr)) { 94 if (is_error_pfn(hpaddr)) {
106 printk(KERN_INFO "Couldn't get guest page for gfn %lx!\n", orig_pte->eaddr); 95 printk(KERN_INFO "Couldn't get guest page for gfn %lx!\n", orig_pte->eaddr);
107 return -EINVAL; 96 return -EINVAL;
108 } 97 }
109 hpaddr <<= PAGE_SHIFT; 98 hpaddr <<= PAGE_SHIFT;
110#if PAGE_SHIFT == 12 99 hpaddr |= orig_pte->raddr & (~0xfffULL & ~PAGE_MASK);
111#elif PAGE_SHIFT == 16
112 hpaddr |= orig_pte->raddr & 0xf000;
113#else
114#error Unknown page size
115#endif
116 100
117 /* and write the mapping ea -> hpa into the pt */ 101 /* and write the mapping ea -> hpa into the pt */
118 vcpu->arch.mmu.esid_to_vsid(vcpu, orig_pte->eaddr >> SID_SHIFT, &vsid); 102 vcpu->arch.mmu.esid_to_vsid(vcpu, orig_pte->eaddr >> SID_SHIFT, &vsid);
@@ -161,10 +145,7 @@ map_again:
161 } else { 145 } else {
162 struct hpte_cache *pte = kvmppc_mmu_hpte_cache_next(vcpu); 146 struct hpte_cache *pte = kvmppc_mmu_hpte_cache_next(vcpu);
163 147
164 dprintk_mmu("KVM: %c%c Map 0x%lx: [%lx] 0x%lx (0x%llx) -> %lx\n", 148 trace_kvm_book3s_64_mmu_map(rflags, hpteg, va, hpaddr, orig_pte);
165 ((rflags & HPTE_R_PP) == 3) ? '-' : 'w',
166 (rflags & HPTE_R_N) ? '-' : 'x',
167 orig_pte->eaddr, hpteg, va, orig_pte->vpage, hpaddr);
168 149
169 /* The ppc_md code may give us a secondary entry even though we 150 /* The ppc_md code may give us a secondary entry even though we
170 asked for a primary. Fix up. */ 151 asked for a primary. Fix up. */
@@ -191,7 +172,7 @@ static struct kvmppc_sid_map *create_sid_map(struct kvm_vcpu *vcpu, u64 gvsid)
191 u16 sid_map_mask; 172 u16 sid_map_mask;
192 static int backwards_map = 0; 173 static int backwards_map = 0;
193 174
194 if (vcpu->arch.msr & MSR_PR) 175 if (vcpu->arch.shared->msr & MSR_PR)
195 gvsid |= VSID_PR; 176 gvsid |= VSID_PR;
196 177
197 /* We might get collisions that trap in preceding order, so let's 178 /* We might get collisions that trap in preceding order, so let's
@@ -219,8 +200,7 @@ static struct kvmppc_sid_map *create_sid_map(struct kvm_vcpu *vcpu, u64 gvsid)
219 map->guest_vsid = gvsid; 200 map->guest_vsid = gvsid;
220 map->valid = true; 201 map->valid = true;
221 202
222 dprintk_slb("SLB: New mapping at %d: 0x%llx -> 0x%llx\n", 203 trace_kvm_book3s_slb_map(sid_map_mask, gvsid, map->host_vsid);
223 sid_map_mask, gvsid, map->host_vsid);
224 204
225 return map; 205 return map;
226} 206}
@@ -292,7 +272,7 @@ int kvmppc_mmu_map_segment(struct kvm_vcpu *vcpu, ulong eaddr)
292 to_svcpu(vcpu)->slb[slb_index].esid = slb_esid; 272 to_svcpu(vcpu)->slb[slb_index].esid = slb_esid;
293 to_svcpu(vcpu)->slb[slb_index].vsid = slb_vsid; 273 to_svcpu(vcpu)->slb[slb_index].vsid = slb_vsid;
294 274
295 dprintk_slb("slbmte %#llx, %#llx\n", slb_vsid, slb_esid); 275 trace_kvm_book3s_slbmte(slb_vsid, slb_esid);
296 276
297 return 0; 277 return 0;
298} 278}
@@ -306,7 +286,7 @@ void kvmppc_mmu_flush_segments(struct kvm_vcpu *vcpu)
306void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu) 286void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu)
307{ 287{
308 kvmppc_mmu_hpte_destroy(vcpu); 288 kvmppc_mmu_hpte_destroy(vcpu);
309 __destroy_context(to_book3s(vcpu)->context_id); 289 __destroy_context(to_book3s(vcpu)->context_id[0]);
310} 290}
311 291
312int kvmppc_mmu_init(struct kvm_vcpu *vcpu) 292int kvmppc_mmu_init(struct kvm_vcpu *vcpu)
@@ -317,10 +297,10 @@ int kvmppc_mmu_init(struct kvm_vcpu *vcpu)
317 err = __init_new_context(); 297 err = __init_new_context();
318 if (err < 0) 298 if (err < 0)
319 return -1; 299 return -1;
320 vcpu3s->context_id = err; 300 vcpu3s->context_id[0] = err;
321 301
322 vcpu3s->vsid_max = ((vcpu3s->context_id + 1) << USER_ESID_BITS) - 1; 302 vcpu3s->vsid_max = ((vcpu3s->context_id[0] + 1) << USER_ESID_BITS) - 1;
323 vcpu3s->vsid_first = vcpu3s->context_id << USER_ESID_BITS; 303 vcpu3s->vsid_first = vcpu3s->context_id[0] << USER_ESID_BITS;
324 vcpu3s->vsid_next = vcpu3s->vsid_first; 304 vcpu3s->vsid_next = vcpu3s->vsid_first;
325 305
326 kvmppc_mmu_hpte_init(vcpu); 306 kvmppc_mmu_hpte_init(vcpu);
diff --git a/arch/powerpc/kvm/book3s_emulate.c b/arch/powerpc/kvm/book3s_emulate.c
index c85f906038ce..466846557089 100644
--- a/arch/powerpc/kvm/book3s_emulate.c
+++ b/arch/powerpc/kvm/book3s_emulate.c
@@ -73,8 +73,8 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
73 switch (get_xop(inst)) { 73 switch (get_xop(inst)) {
74 case OP_19_XOP_RFID: 74 case OP_19_XOP_RFID:
75 case OP_19_XOP_RFI: 75 case OP_19_XOP_RFI:
76 kvmppc_set_pc(vcpu, vcpu->arch.srr0); 76 kvmppc_set_pc(vcpu, vcpu->arch.shared->srr0);
77 kvmppc_set_msr(vcpu, vcpu->arch.srr1); 77 kvmppc_set_msr(vcpu, vcpu->arch.shared->srr1);
78 *advance = 0; 78 *advance = 0;
79 break; 79 break;
80 80
@@ -86,14 +86,15 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
86 case 31: 86 case 31:
87 switch (get_xop(inst)) { 87 switch (get_xop(inst)) {
88 case OP_31_XOP_MFMSR: 88 case OP_31_XOP_MFMSR:
89 kvmppc_set_gpr(vcpu, get_rt(inst), vcpu->arch.msr); 89 kvmppc_set_gpr(vcpu, get_rt(inst),
90 vcpu->arch.shared->msr);
90 break; 91 break;
91 case OP_31_XOP_MTMSRD: 92 case OP_31_XOP_MTMSRD:
92 { 93 {
93 ulong rs = kvmppc_get_gpr(vcpu, get_rs(inst)); 94 ulong rs = kvmppc_get_gpr(vcpu, get_rs(inst));
94 if (inst & 0x10000) { 95 if (inst & 0x10000) {
95 vcpu->arch.msr &= ~(MSR_RI | MSR_EE); 96 vcpu->arch.shared->msr &= ~(MSR_RI | MSR_EE);
96 vcpu->arch.msr |= rs & (MSR_RI | MSR_EE); 97 vcpu->arch.shared->msr |= rs & (MSR_RI | MSR_EE);
97 } else 98 } else
98 kvmppc_set_msr(vcpu, rs); 99 kvmppc_set_msr(vcpu, rs);
99 break; 100 break;
@@ -204,14 +205,14 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
204 ra = kvmppc_get_gpr(vcpu, get_ra(inst)); 205 ra = kvmppc_get_gpr(vcpu, get_ra(inst));
205 206
206 addr = (ra + rb) & ~31ULL; 207 addr = (ra + rb) & ~31ULL;
207 if (!(vcpu->arch.msr & MSR_SF)) 208 if (!(vcpu->arch.shared->msr & MSR_SF))
208 addr &= 0xffffffff; 209 addr &= 0xffffffff;
209 vaddr = addr; 210 vaddr = addr;
210 211
211 r = kvmppc_st(vcpu, &addr, 32, zeros, true); 212 r = kvmppc_st(vcpu, &addr, 32, zeros, true);
212 if ((r == -ENOENT) || (r == -EPERM)) { 213 if ((r == -ENOENT) || (r == -EPERM)) {
213 *advance = 0; 214 *advance = 0;
214 vcpu->arch.dear = vaddr; 215 vcpu->arch.shared->dar = vaddr;
215 to_svcpu(vcpu)->fault_dar = vaddr; 216 to_svcpu(vcpu)->fault_dar = vaddr;
216 217
217 dsisr = DSISR_ISSTORE; 218 dsisr = DSISR_ISSTORE;
@@ -220,7 +221,7 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
220 else if (r == -EPERM) 221 else if (r == -EPERM)
221 dsisr |= DSISR_PROTFAULT; 222 dsisr |= DSISR_PROTFAULT;
222 223
223 to_book3s(vcpu)->dsisr = dsisr; 224 vcpu->arch.shared->dsisr = dsisr;
224 to_svcpu(vcpu)->fault_dsisr = dsisr; 225 to_svcpu(vcpu)->fault_dsisr = dsisr;
225 226
226 kvmppc_book3s_queue_irqprio(vcpu, 227 kvmppc_book3s_queue_irqprio(vcpu,
@@ -263,7 +264,7 @@ void kvmppc_set_bat(struct kvm_vcpu *vcpu, struct kvmppc_bat *bat, bool upper,
263 } 264 }
264} 265}
265 266
266static u32 kvmppc_read_bat(struct kvm_vcpu *vcpu, int sprn) 267static struct kvmppc_bat *kvmppc_find_bat(struct kvm_vcpu *vcpu, int sprn)
267{ 268{
268 struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu); 269 struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu);
269 struct kvmppc_bat *bat; 270 struct kvmppc_bat *bat;
@@ -285,35 +286,7 @@ static u32 kvmppc_read_bat(struct kvm_vcpu *vcpu, int sprn)
285 BUG(); 286 BUG();
286 } 287 }
287 288
288 if (sprn % 2) 289 return bat;
289 return bat->raw >> 32;
290 else
291 return bat->raw;
292}
293
294static void kvmppc_write_bat(struct kvm_vcpu *vcpu, int sprn, u32 val)
295{
296 struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu);
297 struct kvmppc_bat *bat;
298
299 switch (sprn) {
300 case SPRN_IBAT0U ... SPRN_IBAT3L:
301 bat = &vcpu_book3s->ibat[(sprn - SPRN_IBAT0U) / 2];
302 break;
303 case SPRN_IBAT4U ... SPRN_IBAT7L:
304 bat = &vcpu_book3s->ibat[4 + ((sprn - SPRN_IBAT4U) / 2)];
305 break;
306 case SPRN_DBAT0U ... SPRN_DBAT3L:
307 bat = &vcpu_book3s->dbat[(sprn - SPRN_DBAT0U) / 2];
308 break;
309 case SPRN_DBAT4U ... SPRN_DBAT7L:
310 bat = &vcpu_book3s->dbat[4 + ((sprn - SPRN_DBAT4U) / 2)];
311 break;
312 default:
313 BUG();
314 }
315
316 kvmppc_set_bat(vcpu, bat, !(sprn % 2), val);
317} 290}
318 291
319int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs) 292int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs)
@@ -326,10 +299,10 @@ int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs)
326 to_book3s(vcpu)->sdr1 = spr_val; 299 to_book3s(vcpu)->sdr1 = spr_val;
327 break; 300 break;
328 case SPRN_DSISR: 301 case SPRN_DSISR:
329 to_book3s(vcpu)->dsisr = spr_val; 302 vcpu->arch.shared->dsisr = spr_val;
330 break; 303 break;
331 case SPRN_DAR: 304 case SPRN_DAR:
332 vcpu->arch.dear = spr_val; 305 vcpu->arch.shared->dar = spr_val;
333 break; 306 break;
334 case SPRN_HIOR: 307 case SPRN_HIOR:
335 to_book3s(vcpu)->hior = spr_val; 308 to_book3s(vcpu)->hior = spr_val;
@@ -338,12 +311,16 @@ int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs)
338 case SPRN_IBAT4U ... SPRN_IBAT7L: 311 case SPRN_IBAT4U ... SPRN_IBAT7L:
339 case SPRN_DBAT0U ... SPRN_DBAT3L: 312 case SPRN_DBAT0U ... SPRN_DBAT3L:
340 case SPRN_DBAT4U ... SPRN_DBAT7L: 313 case SPRN_DBAT4U ... SPRN_DBAT7L:
341 kvmppc_write_bat(vcpu, sprn, (u32)spr_val); 314 {
315 struct kvmppc_bat *bat = kvmppc_find_bat(vcpu, sprn);
316
317 kvmppc_set_bat(vcpu, bat, !(sprn % 2), (u32)spr_val);
342 /* BAT writes happen so rarely that we're ok to flush 318 /* BAT writes happen so rarely that we're ok to flush
343 * everything here */ 319 * everything here */
344 kvmppc_mmu_pte_flush(vcpu, 0, 0); 320 kvmppc_mmu_pte_flush(vcpu, 0, 0);
345 kvmppc_mmu_flush_segments(vcpu); 321 kvmppc_mmu_flush_segments(vcpu);
346 break; 322 break;
323 }
347 case SPRN_HID0: 324 case SPRN_HID0:
348 to_book3s(vcpu)->hid[0] = spr_val; 325 to_book3s(vcpu)->hid[0] = spr_val;
349 break; 326 break;
@@ -433,16 +410,24 @@ int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt)
433 case SPRN_IBAT4U ... SPRN_IBAT7L: 410 case SPRN_IBAT4U ... SPRN_IBAT7L:
434 case SPRN_DBAT0U ... SPRN_DBAT3L: 411 case SPRN_DBAT0U ... SPRN_DBAT3L:
435 case SPRN_DBAT4U ... SPRN_DBAT7L: 412 case SPRN_DBAT4U ... SPRN_DBAT7L:
436 kvmppc_set_gpr(vcpu, rt, kvmppc_read_bat(vcpu, sprn)); 413 {
414 struct kvmppc_bat *bat = kvmppc_find_bat(vcpu, sprn);
415
416 if (sprn % 2)
417 kvmppc_set_gpr(vcpu, rt, bat->raw >> 32);
418 else
419 kvmppc_set_gpr(vcpu, rt, bat->raw);
420
437 break; 421 break;
422 }
438 case SPRN_SDR1: 423 case SPRN_SDR1:
439 kvmppc_set_gpr(vcpu, rt, to_book3s(vcpu)->sdr1); 424 kvmppc_set_gpr(vcpu, rt, to_book3s(vcpu)->sdr1);
440 break; 425 break;
441 case SPRN_DSISR: 426 case SPRN_DSISR:
442 kvmppc_set_gpr(vcpu, rt, to_book3s(vcpu)->dsisr); 427 kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->dsisr);
443 break; 428 break;
444 case SPRN_DAR: 429 case SPRN_DAR:
445 kvmppc_set_gpr(vcpu, rt, vcpu->arch.dear); 430 kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->dar);
446 break; 431 break;
447 case SPRN_HIOR: 432 case SPRN_HIOR:
448 kvmppc_set_gpr(vcpu, rt, to_book3s(vcpu)->hior); 433 kvmppc_set_gpr(vcpu, rt, to_book3s(vcpu)->hior);
diff --git a/arch/powerpc/kvm/book3s_mmu_hpte.c b/arch/powerpc/kvm/book3s_mmu_hpte.c
index 4868d4a7ebc5..79751d8dd131 100644
--- a/arch/powerpc/kvm/book3s_mmu_hpte.c
+++ b/arch/powerpc/kvm/book3s_mmu_hpte.c
@@ -21,6 +21,7 @@
21#include <linux/kvm_host.h> 21#include <linux/kvm_host.h>
22#include <linux/hash.h> 22#include <linux/hash.h>
23#include <linux/slab.h> 23#include <linux/slab.h>
24#include "trace.h"
24 25
25#include <asm/kvm_ppc.h> 26#include <asm/kvm_ppc.h>
26#include <asm/kvm_book3s.h> 27#include <asm/kvm_book3s.h>
@@ -30,14 +31,6 @@
30 31
31#define PTE_SIZE 12 32#define PTE_SIZE 12
32 33
33/* #define DEBUG_MMU */
34
35#ifdef DEBUG_MMU
36#define dprintk_mmu(a, ...) printk(KERN_INFO a, __VA_ARGS__)
37#else
38#define dprintk_mmu(a, ...) do { } while(0)
39#endif
40
41static struct kmem_cache *hpte_cache; 34static struct kmem_cache *hpte_cache;
42 35
43static inline u64 kvmppc_mmu_hash_pte(u64 eaddr) 36static inline u64 kvmppc_mmu_hash_pte(u64 eaddr)
@@ -45,6 +38,12 @@ static inline u64 kvmppc_mmu_hash_pte(u64 eaddr)
45 return hash_64(eaddr >> PTE_SIZE, HPTEG_HASH_BITS_PTE); 38 return hash_64(eaddr >> PTE_SIZE, HPTEG_HASH_BITS_PTE);
46} 39}
47 40
41static inline u64 kvmppc_mmu_hash_pte_long(u64 eaddr)
42{
43 return hash_64((eaddr & 0x0ffff000) >> PTE_SIZE,
44 HPTEG_HASH_BITS_PTE_LONG);
45}
46
48static inline u64 kvmppc_mmu_hash_vpte(u64 vpage) 47static inline u64 kvmppc_mmu_hash_vpte(u64 vpage)
49{ 48{
50 return hash_64(vpage & 0xfffffffffULL, HPTEG_HASH_BITS_VPTE); 49 return hash_64(vpage & 0xfffffffffULL, HPTEG_HASH_BITS_VPTE);
@@ -60,77 +59,128 @@ void kvmppc_mmu_hpte_cache_map(struct kvm_vcpu *vcpu, struct hpte_cache *pte)
60{ 59{
61 u64 index; 60 u64 index;
62 61
62 trace_kvm_book3s_mmu_map(pte);
63
64 spin_lock(&vcpu->arch.mmu_lock);
65
63 /* Add to ePTE list */ 66 /* Add to ePTE list */
64 index = kvmppc_mmu_hash_pte(pte->pte.eaddr); 67 index = kvmppc_mmu_hash_pte(pte->pte.eaddr);
65 hlist_add_head(&pte->list_pte, &vcpu->arch.hpte_hash_pte[index]); 68 hlist_add_head_rcu(&pte->list_pte, &vcpu->arch.hpte_hash_pte[index]);
69
70 /* Add to ePTE_long list */
71 index = kvmppc_mmu_hash_pte_long(pte->pte.eaddr);
72 hlist_add_head_rcu(&pte->list_pte_long,
73 &vcpu->arch.hpte_hash_pte_long[index]);
66 74
67 /* Add to vPTE list */ 75 /* Add to vPTE list */
68 index = kvmppc_mmu_hash_vpte(pte->pte.vpage); 76 index = kvmppc_mmu_hash_vpte(pte->pte.vpage);
69 hlist_add_head(&pte->list_vpte, &vcpu->arch.hpte_hash_vpte[index]); 77 hlist_add_head_rcu(&pte->list_vpte, &vcpu->arch.hpte_hash_vpte[index]);
70 78
71 /* Add to vPTE_long list */ 79 /* Add to vPTE_long list */
72 index = kvmppc_mmu_hash_vpte_long(pte->pte.vpage); 80 index = kvmppc_mmu_hash_vpte_long(pte->pte.vpage);
73 hlist_add_head(&pte->list_vpte_long, 81 hlist_add_head_rcu(&pte->list_vpte_long,
74 &vcpu->arch.hpte_hash_vpte_long[index]); 82 &vcpu->arch.hpte_hash_vpte_long[index]);
83
84 spin_unlock(&vcpu->arch.mmu_lock);
85}
86
87static void free_pte_rcu(struct rcu_head *head)
88{
89 struct hpte_cache *pte = container_of(head, struct hpte_cache, rcu_head);
90 kmem_cache_free(hpte_cache, pte);
75} 91}
76 92
77static void invalidate_pte(struct kvm_vcpu *vcpu, struct hpte_cache *pte) 93static void invalidate_pte(struct kvm_vcpu *vcpu, struct hpte_cache *pte)
78{ 94{
79 dprintk_mmu("KVM: Flushing SPT: 0x%lx (0x%llx) -> 0x%llx\n", 95 trace_kvm_book3s_mmu_invalidate(pte);
80 pte->pte.eaddr, pte->pte.vpage, pte->host_va);
81 96
82 /* Different for 32 and 64 bit */ 97 /* Different for 32 and 64 bit */
83 kvmppc_mmu_invalidate_pte(vcpu, pte); 98 kvmppc_mmu_invalidate_pte(vcpu, pte);
84 99
100 spin_lock(&vcpu->arch.mmu_lock);
101
102 /* pte already invalidated in between? */
103 if (hlist_unhashed(&pte->list_pte)) {
104 spin_unlock(&vcpu->arch.mmu_lock);
105 return;
106 }
107
108 hlist_del_init_rcu(&pte->list_pte);
109 hlist_del_init_rcu(&pte->list_pte_long);
110 hlist_del_init_rcu(&pte->list_vpte);
111 hlist_del_init_rcu(&pte->list_vpte_long);
112
85 if (pte->pte.may_write) 113 if (pte->pte.may_write)
86 kvm_release_pfn_dirty(pte->pfn); 114 kvm_release_pfn_dirty(pte->pfn);
87 else 115 else
88 kvm_release_pfn_clean(pte->pfn); 116 kvm_release_pfn_clean(pte->pfn);
89 117
90 hlist_del(&pte->list_pte); 118 spin_unlock(&vcpu->arch.mmu_lock);
91 hlist_del(&pte->list_vpte);
92 hlist_del(&pte->list_vpte_long);
93 119
94 vcpu->arch.hpte_cache_count--; 120 vcpu->arch.hpte_cache_count--;
95 kmem_cache_free(hpte_cache, pte); 121 call_rcu(&pte->rcu_head, free_pte_rcu);
96} 122}
97 123
98static void kvmppc_mmu_pte_flush_all(struct kvm_vcpu *vcpu) 124static void kvmppc_mmu_pte_flush_all(struct kvm_vcpu *vcpu)
99{ 125{
100 struct hpte_cache *pte; 126 struct hpte_cache *pte;
101 struct hlist_node *node, *tmp; 127 struct hlist_node *node;
102 int i; 128 int i;
103 129
130 rcu_read_lock();
131
104 for (i = 0; i < HPTEG_HASH_NUM_VPTE_LONG; i++) { 132 for (i = 0; i < HPTEG_HASH_NUM_VPTE_LONG; i++) {
105 struct hlist_head *list = &vcpu->arch.hpte_hash_vpte_long[i]; 133 struct hlist_head *list = &vcpu->arch.hpte_hash_vpte_long[i];
106 134
107 hlist_for_each_entry_safe(pte, node, tmp, list, list_vpte_long) 135 hlist_for_each_entry_rcu(pte, node, list, list_vpte_long)
108 invalidate_pte(vcpu, pte); 136 invalidate_pte(vcpu, pte);
109 } 137 }
138
139 rcu_read_unlock();
110} 140}
111 141
112static void kvmppc_mmu_pte_flush_page(struct kvm_vcpu *vcpu, ulong guest_ea) 142static void kvmppc_mmu_pte_flush_page(struct kvm_vcpu *vcpu, ulong guest_ea)
113{ 143{
114 struct hlist_head *list; 144 struct hlist_head *list;
115 struct hlist_node *node, *tmp; 145 struct hlist_node *node;
116 struct hpte_cache *pte; 146 struct hpte_cache *pte;
117 147
118 /* Find the list of entries in the map */ 148 /* Find the list of entries in the map */
119 list = &vcpu->arch.hpte_hash_pte[kvmppc_mmu_hash_pte(guest_ea)]; 149 list = &vcpu->arch.hpte_hash_pte[kvmppc_mmu_hash_pte(guest_ea)];
120 150
151 rcu_read_lock();
152
121 /* Check the list for matching entries and invalidate */ 153 /* Check the list for matching entries and invalidate */
122 hlist_for_each_entry_safe(pte, node, tmp, list, list_pte) 154 hlist_for_each_entry_rcu(pte, node, list, list_pte)
123 if ((pte->pte.eaddr & ~0xfffUL) == guest_ea) 155 if ((pte->pte.eaddr & ~0xfffUL) == guest_ea)
124 invalidate_pte(vcpu, pte); 156 invalidate_pte(vcpu, pte);
157
158 rcu_read_unlock();
125} 159}
126 160
127void kvmppc_mmu_pte_flush(struct kvm_vcpu *vcpu, ulong guest_ea, ulong ea_mask) 161static void kvmppc_mmu_pte_flush_long(struct kvm_vcpu *vcpu, ulong guest_ea)
128{ 162{
129 u64 i; 163 struct hlist_head *list;
164 struct hlist_node *node;
165 struct hpte_cache *pte;
130 166
131 dprintk_mmu("KVM: Flushing %d Shadow PTEs: 0x%lx & 0x%lx\n", 167 /* Find the list of entries in the map */
132 vcpu->arch.hpte_cache_count, guest_ea, ea_mask); 168 list = &vcpu->arch.hpte_hash_pte_long[
169 kvmppc_mmu_hash_pte_long(guest_ea)];
133 170
171 rcu_read_lock();
172
173 /* Check the list for matching entries and invalidate */
174 hlist_for_each_entry_rcu(pte, node, list, list_pte_long)
175 if ((pte->pte.eaddr & 0x0ffff000UL) == guest_ea)
176 invalidate_pte(vcpu, pte);
177
178 rcu_read_unlock();
179}
180
181void kvmppc_mmu_pte_flush(struct kvm_vcpu *vcpu, ulong guest_ea, ulong ea_mask)
182{
183 trace_kvm_book3s_mmu_flush("", vcpu, guest_ea, ea_mask);
134 guest_ea &= ea_mask; 184 guest_ea &= ea_mask;
135 185
136 switch (ea_mask) { 186 switch (ea_mask) {
@@ -138,9 +188,7 @@ void kvmppc_mmu_pte_flush(struct kvm_vcpu *vcpu, ulong guest_ea, ulong ea_mask)
138 kvmppc_mmu_pte_flush_page(vcpu, guest_ea); 188 kvmppc_mmu_pte_flush_page(vcpu, guest_ea);
139 break; 189 break;
140 case 0x0ffff000: 190 case 0x0ffff000:
141 /* 32-bit flush w/o segment, go through all possible segments */ 191 kvmppc_mmu_pte_flush_long(vcpu, guest_ea);
142 for (i = 0; i < 0x100000000ULL; i += 0x10000000ULL)
143 kvmppc_mmu_pte_flush(vcpu, guest_ea | i, ~0xfffUL);
144 break; 192 break;
145 case 0: 193 case 0:
146 /* Doing a complete flush -> start from scratch */ 194 /* Doing a complete flush -> start from scratch */
@@ -156,39 +204,46 @@ void kvmppc_mmu_pte_flush(struct kvm_vcpu *vcpu, ulong guest_ea, ulong ea_mask)
156static void kvmppc_mmu_pte_vflush_short(struct kvm_vcpu *vcpu, u64 guest_vp) 204static void kvmppc_mmu_pte_vflush_short(struct kvm_vcpu *vcpu, u64 guest_vp)
157{ 205{
158 struct hlist_head *list; 206 struct hlist_head *list;
159 struct hlist_node *node, *tmp; 207 struct hlist_node *node;
160 struct hpte_cache *pte; 208 struct hpte_cache *pte;
161 u64 vp_mask = 0xfffffffffULL; 209 u64 vp_mask = 0xfffffffffULL;
162 210
163 list = &vcpu->arch.hpte_hash_vpte[kvmppc_mmu_hash_vpte(guest_vp)]; 211 list = &vcpu->arch.hpte_hash_vpte[kvmppc_mmu_hash_vpte(guest_vp)];
164 212
213 rcu_read_lock();
214
165 /* Check the list for matching entries and invalidate */ 215 /* Check the list for matching entries and invalidate */
166 hlist_for_each_entry_safe(pte, node, tmp, list, list_vpte) 216 hlist_for_each_entry_rcu(pte, node, list, list_vpte)
167 if ((pte->pte.vpage & vp_mask) == guest_vp) 217 if ((pte->pte.vpage & vp_mask) == guest_vp)
168 invalidate_pte(vcpu, pte); 218 invalidate_pte(vcpu, pte);
219
220 rcu_read_unlock();
169} 221}
170 222
171/* Flush with mask 0xffffff000 */ 223/* Flush with mask 0xffffff000 */
172static void kvmppc_mmu_pte_vflush_long(struct kvm_vcpu *vcpu, u64 guest_vp) 224static void kvmppc_mmu_pte_vflush_long(struct kvm_vcpu *vcpu, u64 guest_vp)
173{ 225{
174 struct hlist_head *list; 226 struct hlist_head *list;
175 struct hlist_node *node, *tmp; 227 struct hlist_node *node;
176 struct hpte_cache *pte; 228 struct hpte_cache *pte;
177 u64 vp_mask = 0xffffff000ULL; 229 u64 vp_mask = 0xffffff000ULL;
178 230
179 list = &vcpu->arch.hpte_hash_vpte_long[ 231 list = &vcpu->arch.hpte_hash_vpte_long[
180 kvmppc_mmu_hash_vpte_long(guest_vp)]; 232 kvmppc_mmu_hash_vpte_long(guest_vp)];
181 233
234 rcu_read_lock();
235
182 /* Check the list for matching entries and invalidate */ 236 /* Check the list for matching entries and invalidate */
183 hlist_for_each_entry_safe(pte, node, tmp, list, list_vpte_long) 237 hlist_for_each_entry_rcu(pte, node, list, list_vpte_long)
184 if ((pte->pte.vpage & vp_mask) == guest_vp) 238 if ((pte->pte.vpage & vp_mask) == guest_vp)
185 invalidate_pte(vcpu, pte); 239 invalidate_pte(vcpu, pte);
240
241 rcu_read_unlock();
186} 242}
187 243
188void kvmppc_mmu_pte_vflush(struct kvm_vcpu *vcpu, u64 guest_vp, u64 vp_mask) 244void kvmppc_mmu_pte_vflush(struct kvm_vcpu *vcpu, u64 guest_vp, u64 vp_mask)
189{ 245{
190 dprintk_mmu("KVM: Flushing %d Shadow vPTEs: 0x%llx & 0x%llx\n", 246 trace_kvm_book3s_mmu_flush("v", vcpu, guest_vp, vp_mask);
191 vcpu->arch.hpte_cache_count, guest_vp, vp_mask);
192 guest_vp &= vp_mask; 247 guest_vp &= vp_mask;
193 248
194 switch(vp_mask) { 249 switch(vp_mask) {
@@ -206,21 +261,24 @@ void kvmppc_mmu_pte_vflush(struct kvm_vcpu *vcpu, u64 guest_vp, u64 vp_mask)
206 261
207void kvmppc_mmu_pte_pflush(struct kvm_vcpu *vcpu, ulong pa_start, ulong pa_end) 262void kvmppc_mmu_pte_pflush(struct kvm_vcpu *vcpu, ulong pa_start, ulong pa_end)
208{ 263{
209 struct hlist_node *node, *tmp; 264 struct hlist_node *node;
210 struct hpte_cache *pte; 265 struct hpte_cache *pte;
211 int i; 266 int i;
212 267
213 dprintk_mmu("KVM: Flushing %d Shadow pPTEs: 0x%lx - 0x%lx\n", 268 trace_kvm_book3s_mmu_flush("p", vcpu, pa_start, pa_end);
214 vcpu->arch.hpte_cache_count, pa_start, pa_end); 269
270 rcu_read_lock();
215 271
216 for (i = 0; i < HPTEG_HASH_NUM_VPTE_LONG; i++) { 272 for (i = 0; i < HPTEG_HASH_NUM_VPTE_LONG; i++) {
217 struct hlist_head *list = &vcpu->arch.hpte_hash_vpte_long[i]; 273 struct hlist_head *list = &vcpu->arch.hpte_hash_vpte_long[i];
218 274
219 hlist_for_each_entry_safe(pte, node, tmp, list, list_vpte_long) 275 hlist_for_each_entry_rcu(pte, node, list, list_vpte_long)
220 if ((pte->pte.raddr >= pa_start) && 276 if ((pte->pte.raddr >= pa_start) &&
221 (pte->pte.raddr < pa_end)) 277 (pte->pte.raddr < pa_end))
222 invalidate_pte(vcpu, pte); 278 invalidate_pte(vcpu, pte);
223 } 279 }
280
281 rcu_read_unlock();
224} 282}
225 283
226struct hpte_cache *kvmppc_mmu_hpte_cache_next(struct kvm_vcpu *vcpu) 284struct hpte_cache *kvmppc_mmu_hpte_cache_next(struct kvm_vcpu *vcpu)
@@ -254,11 +312,15 @@ int kvmppc_mmu_hpte_init(struct kvm_vcpu *vcpu)
254 /* init hpte lookup hashes */ 312 /* init hpte lookup hashes */
255 kvmppc_mmu_hpte_init_hash(vcpu->arch.hpte_hash_pte, 313 kvmppc_mmu_hpte_init_hash(vcpu->arch.hpte_hash_pte,
256 ARRAY_SIZE(vcpu->arch.hpte_hash_pte)); 314 ARRAY_SIZE(vcpu->arch.hpte_hash_pte));
315 kvmppc_mmu_hpte_init_hash(vcpu->arch.hpte_hash_pte_long,
316 ARRAY_SIZE(vcpu->arch.hpte_hash_pte_long));
257 kvmppc_mmu_hpte_init_hash(vcpu->arch.hpte_hash_vpte, 317 kvmppc_mmu_hpte_init_hash(vcpu->arch.hpte_hash_vpte,
258 ARRAY_SIZE(vcpu->arch.hpte_hash_vpte)); 318 ARRAY_SIZE(vcpu->arch.hpte_hash_vpte));
259 kvmppc_mmu_hpte_init_hash(vcpu->arch.hpte_hash_vpte_long, 319 kvmppc_mmu_hpte_init_hash(vcpu->arch.hpte_hash_vpte_long,
260 ARRAY_SIZE(vcpu->arch.hpte_hash_vpte_long)); 320 ARRAY_SIZE(vcpu->arch.hpte_hash_vpte_long));
261 321
322 spin_lock_init(&vcpu->arch.mmu_lock);
323
262 return 0; 324 return 0;
263} 325}
264 326
diff --git a/arch/powerpc/kvm/book3s_paired_singles.c b/arch/powerpc/kvm/book3s_paired_singles.c
index 35a701f3ece4..7b0ee96c1bed 100644
--- a/arch/powerpc/kvm/book3s_paired_singles.c
+++ b/arch/powerpc/kvm/book3s_paired_singles.c
@@ -165,14 +165,15 @@ static inline void kvmppc_sync_qpr(struct kvm_vcpu *vcpu, int rt)
165static void kvmppc_inject_pf(struct kvm_vcpu *vcpu, ulong eaddr, bool is_store) 165static void kvmppc_inject_pf(struct kvm_vcpu *vcpu, ulong eaddr, bool is_store)
166{ 166{
167 u64 dsisr; 167 u64 dsisr;
168 struct kvm_vcpu_arch_shared *shared = vcpu->arch.shared;
168 169
169 vcpu->arch.msr = kvmppc_set_field(vcpu->arch.msr, 33, 36, 0); 170 shared->msr = kvmppc_set_field(shared->msr, 33, 36, 0);
170 vcpu->arch.msr = kvmppc_set_field(vcpu->arch.msr, 42, 47, 0); 171 shared->msr = kvmppc_set_field(shared->msr, 42, 47, 0);
171 vcpu->arch.dear = eaddr; 172 shared->dar = eaddr;
172 /* Page Fault */ 173 /* Page Fault */
173 dsisr = kvmppc_set_field(0, 33, 33, 1); 174 dsisr = kvmppc_set_field(0, 33, 33, 1);
174 if (is_store) 175 if (is_store)
175 to_book3s(vcpu)->dsisr = kvmppc_set_field(dsisr, 38, 38, 1); 176 shared->dsisr = kvmppc_set_field(dsisr, 38, 38, 1);
176 kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_DATA_STORAGE); 177 kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_DATA_STORAGE);
177} 178}
178 179
@@ -658,7 +659,7 @@ int kvmppc_emulate_paired_single(struct kvm_run *run, struct kvm_vcpu *vcpu)
658 if (!kvmppc_inst_is_paired_single(vcpu, inst)) 659 if (!kvmppc_inst_is_paired_single(vcpu, inst))
659 return EMULATE_FAIL; 660 return EMULATE_FAIL;
660 661
661 if (!(vcpu->arch.msr & MSR_FP)) { 662 if (!(vcpu->arch.shared->msr & MSR_FP)) {
662 kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_FP_UNAVAIL); 663 kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_FP_UNAVAIL);
663 return EMULATE_AGAIN; 664 return EMULATE_AGAIN;
664 } 665 }
diff --git a/arch/powerpc/kvm/book3s_rmhandlers.S b/arch/powerpc/kvm/book3s_rmhandlers.S
index 506d5c316c96..2b9c9088d00e 100644
--- a/arch/powerpc/kvm/book3s_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_rmhandlers.S
@@ -202,8 +202,25 @@ _GLOBAL(kvmppc_rmcall)
202 202
203#if defined(CONFIG_PPC_BOOK3S_32) 203#if defined(CONFIG_PPC_BOOK3S_32)
204#define STACK_LR INT_FRAME_SIZE+4 204#define STACK_LR INT_FRAME_SIZE+4
205
206/* load_up_xxx have to run with MSR_DR=0 on Book3S_32 */
207#define MSR_EXT_START \
208 PPC_STL r20, _NIP(r1); \
209 mfmsr r20; \
210 LOAD_REG_IMMEDIATE(r3, MSR_DR|MSR_EE); \
211 andc r3,r20,r3; /* Disable DR,EE */ \
212 mtmsr r3; \
213 sync
214
215#define MSR_EXT_END \
216 mtmsr r20; /* Enable DR,EE */ \
217 sync; \
218 PPC_LL r20, _NIP(r1)
219
205#elif defined(CONFIG_PPC_BOOK3S_64) 220#elif defined(CONFIG_PPC_BOOK3S_64)
206#define STACK_LR _LINK 221#define STACK_LR _LINK
222#define MSR_EXT_START
223#define MSR_EXT_END
207#endif 224#endif
208 225
209/* 226/*
@@ -215,19 +232,12 @@ _GLOBAL(kvmppc_load_up_ ## what); \
215 PPC_STLU r1, -INT_FRAME_SIZE(r1); \ 232 PPC_STLU r1, -INT_FRAME_SIZE(r1); \
216 mflr r3; \ 233 mflr r3; \
217 PPC_STL r3, STACK_LR(r1); \ 234 PPC_STL r3, STACK_LR(r1); \
218 PPC_STL r20, _NIP(r1); \ 235 MSR_EXT_START; \
219 mfmsr r20; \
220 LOAD_REG_IMMEDIATE(r3, MSR_DR|MSR_EE); \
221 andc r3,r20,r3; /* Disable DR,EE */ \
222 mtmsr r3; \
223 sync; \
224 \ 236 \
225 bl FUNC(load_up_ ## what); \ 237 bl FUNC(load_up_ ## what); \
226 \ 238 \
227 mtmsr r20; /* Enable DR,EE */ \ 239 MSR_EXT_END; \
228 sync; \
229 PPC_LL r3, STACK_LR(r1); \ 240 PPC_LL r3, STACK_LR(r1); \
230 PPC_LL r20, _NIP(r1); \
231 mtlr r3; \ 241 mtlr r3; \
232 addi r1, r1, INT_FRAME_SIZE; \ 242 addi r1, r1, INT_FRAME_SIZE; \
233 blr 243 blr
@@ -242,10 +252,10 @@ define_load_up(vsx)
242 252
243.global kvmppc_trampoline_lowmem 253.global kvmppc_trampoline_lowmem
244kvmppc_trampoline_lowmem: 254kvmppc_trampoline_lowmem:
245 .long kvmppc_handler_lowmem_trampoline - CONFIG_KERNEL_START 255 PPC_LONG kvmppc_handler_lowmem_trampoline - CONFIG_KERNEL_START
246 256
247.global kvmppc_trampoline_enter 257.global kvmppc_trampoline_enter
248kvmppc_trampoline_enter: 258kvmppc_trampoline_enter:
249 .long kvmppc_handler_trampoline_enter - CONFIG_KERNEL_START 259 PPC_LONG kvmppc_handler_trampoline_enter - CONFIG_KERNEL_START
250 260
251#include "book3s_segment.S" 261#include "book3s_segment.S"
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index 8d4e35f5372c..77575d08c818 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -62,9 +62,10 @@ void kvmppc_dump_vcpu(struct kvm_vcpu *vcpu)
62{ 62{
63 int i; 63 int i;
64 64
65 printk("pc: %08lx msr: %08lx\n", vcpu->arch.pc, vcpu->arch.msr); 65 printk("pc: %08lx msr: %08llx\n", vcpu->arch.pc, vcpu->arch.shared->msr);
66 printk("lr: %08lx ctr: %08lx\n", vcpu->arch.lr, vcpu->arch.ctr); 66 printk("lr: %08lx ctr: %08lx\n", vcpu->arch.lr, vcpu->arch.ctr);
67 printk("srr0: %08lx srr1: %08lx\n", vcpu->arch.srr0, vcpu->arch.srr1); 67 printk("srr0: %08llx srr1: %08llx\n", vcpu->arch.shared->srr0,
68 vcpu->arch.shared->srr1);
68 69
69 printk("exceptions: %08lx\n", vcpu->arch.pending_exceptions); 70 printk("exceptions: %08lx\n", vcpu->arch.pending_exceptions);
70 71
@@ -130,13 +131,19 @@ void kvmppc_core_dequeue_dec(struct kvm_vcpu *vcpu)
130void kvmppc_core_queue_external(struct kvm_vcpu *vcpu, 131void kvmppc_core_queue_external(struct kvm_vcpu *vcpu,
131 struct kvm_interrupt *irq) 132 struct kvm_interrupt *irq)
132{ 133{
133 kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_EXTERNAL); 134 unsigned int prio = BOOKE_IRQPRIO_EXTERNAL;
135
136 if (irq->irq == KVM_INTERRUPT_SET_LEVEL)
137 prio = BOOKE_IRQPRIO_EXTERNAL_LEVEL;
138
139 kvmppc_booke_queue_irqprio(vcpu, prio);
134} 140}
135 141
136void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu, 142void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu,
137 struct kvm_interrupt *irq) 143 struct kvm_interrupt *irq)
138{ 144{
139 clear_bit(BOOKE_IRQPRIO_EXTERNAL, &vcpu->arch.pending_exceptions); 145 clear_bit(BOOKE_IRQPRIO_EXTERNAL, &vcpu->arch.pending_exceptions);
146 clear_bit(BOOKE_IRQPRIO_EXTERNAL_LEVEL, &vcpu->arch.pending_exceptions);
140} 147}
141 148
142/* Deliver the interrupt of the corresponding priority, if possible. */ 149/* Deliver the interrupt of the corresponding priority, if possible. */
@@ -146,6 +153,26 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu,
146 int allowed = 0; 153 int allowed = 0;
147 ulong uninitialized_var(msr_mask); 154 ulong uninitialized_var(msr_mask);
148 bool update_esr = false, update_dear = false; 155 bool update_esr = false, update_dear = false;
156 ulong crit_raw = vcpu->arch.shared->critical;
157 ulong crit_r1 = kvmppc_get_gpr(vcpu, 1);
158 bool crit;
159 bool keep_irq = false;
160
161 /* Truncate crit indicators in 32 bit mode */
162 if (!(vcpu->arch.shared->msr & MSR_SF)) {
163 crit_raw &= 0xffffffff;
164 crit_r1 &= 0xffffffff;
165 }
166
167 /* Critical section when crit == r1 */
168 crit = (crit_raw == crit_r1);
169 /* ... and we're in supervisor mode */
170 crit = crit && !(vcpu->arch.shared->msr & MSR_PR);
171
172 if (priority == BOOKE_IRQPRIO_EXTERNAL_LEVEL) {
173 priority = BOOKE_IRQPRIO_EXTERNAL;
174 keep_irq = true;
175 }
149 176
150 switch (priority) { 177 switch (priority) {
151 case BOOKE_IRQPRIO_DTLB_MISS: 178 case BOOKE_IRQPRIO_DTLB_MISS:
@@ -169,36 +196,38 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu,
169 break; 196 break;
170 case BOOKE_IRQPRIO_CRITICAL: 197 case BOOKE_IRQPRIO_CRITICAL:
171 case BOOKE_IRQPRIO_WATCHDOG: 198 case BOOKE_IRQPRIO_WATCHDOG:
172 allowed = vcpu->arch.msr & MSR_CE; 199 allowed = vcpu->arch.shared->msr & MSR_CE;
173 msr_mask = MSR_ME; 200 msr_mask = MSR_ME;
174 break; 201 break;
175 case BOOKE_IRQPRIO_MACHINE_CHECK: 202 case BOOKE_IRQPRIO_MACHINE_CHECK:
176 allowed = vcpu->arch.msr & MSR_ME; 203 allowed = vcpu->arch.shared->msr & MSR_ME;
177 msr_mask = 0; 204 msr_mask = 0;
178 break; 205 break;
179 case BOOKE_IRQPRIO_EXTERNAL: 206 case BOOKE_IRQPRIO_EXTERNAL:
180 case BOOKE_IRQPRIO_DECREMENTER: 207 case BOOKE_IRQPRIO_DECREMENTER:
181 case BOOKE_IRQPRIO_FIT: 208 case BOOKE_IRQPRIO_FIT:
182 allowed = vcpu->arch.msr & MSR_EE; 209 allowed = vcpu->arch.shared->msr & MSR_EE;
210 allowed = allowed && !crit;
183 msr_mask = MSR_CE|MSR_ME|MSR_DE; 211 msr_mask = MSR_CE|MSR_ME|MSR_DE;
184 break; 212 break;
185 case BOOKE_IRQPRIO_DEBUG: 213 case BOOKE_IRQPRIO_DEBUG:
186 allowed = vcpu->arch.msr & MSR_DE; 214 allowed = vcpu->arch.shared->msr & MSR_DE;
187 msr_mask = MSR_ME; 215 msr_mask = MSR_ME;
188 break; 216 break;
189 } 217 }
190 218
191 if (allowed) { 219 if (allowed) {
192 vcpu->arch.srr0 = vcpu->arch.pc; 220 vcpu->arch.shared->srr0 = vcpu->arch.pc;
193 vcpu->arch.srr1 = vcpu->arch.msr; 221 vcpu->arch.shared->srr1 = vcpu->arch.shared->msr;
194 vcpu->arch.pc = vcpu->arch.ivpr | vcpu->arch.ivor[priority]; 222 vcpu->arch.pc = vcpu->arch.ivpr | vcpu->arch.ivor[priority];
195 if (update_esr == true) 223 if (update_esr == true)
196 vcpu->arch.esr = vcpu->arch.queued_esr; 224 vcpu->arch.esr = vcpu->arch.queued_esr;
197 if (update_dear == true) 225 if (update_dear == true)
198 vcpu->arch.dear = vcpu->arch.queued_dear; 226 vcpu->arch.shared->dar = vcpu->arch.queued_dear;
199 kvmppc_set_msr(vcpu, vcpu->arch.msr & msr_mask); 227 kvmppc_set_msr(vcpu, vcpu->arch.shared->msr & msr_mask);
200 228
201 clear_bit(priority, &vcpu->arch.pending_exceptions); 229 if (!keep_irq)
230 clear_bit(priority, &vcpu->arch.pending_exceptions);
202 } 231 }
203 232
204 return allowed; 233 return allowed;
@@ -208,6 +237,7 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu,
208void kvmppc_core_deliver_interrupts(struct kvm_vcpu *vcpu) 237void kvmppc_core_deliver_interrupts(struct kvm_vcpu *vcpu)
209{ 238{
210 unsigned long *pending = &vcpu->arch.pending_exceptions; 239 unsigned long *pending = &vcpu->arch.pending_exceptions;
240 unsigned long old_pending = vcpu->arch.pending_exceptions;
211 unsigned int priority; 241 unsigned int priority;
212 242
213 priority = __ffs(*pending); 243 priority = __ffs(*pending);
@@ -219,6 +249,12 @@ void kvmppc_core_deliver_interrupts(struct kvm_vcpu *vcpu)
219 BITS_PER_BYTE * sizeof(*pending), 249 BITS_PER_BYTE * sizeof(*pending),
220 priority + 1); 250 priority + 1);
221 } 251 }
252
253 /* Tell the guest about our interrupt status */
254 if (*pending)
255 vcpu->arch.shared->int_pending = 1;
256 else if (old_pending)
257 vcpu->arch.shared->int_pending = 0;
222} 258}
223 259
224/** 260/**
@@ -265,7 +301,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
265 break; 301 break;
266 302
267 case BOOKE_INTERRUPT_PROGRAM: 303 case BOOKE_INTERRUPT_PROGRAM:
268 if (vcpu->arch.msr & MSR_PR) { 304 if (vcpu->arch.shared->msr & MSR_PR) {
269 /* Program traps generated by user-level software must be handled 305 /* Program traps generated by user-level software must be handled
270 * by the guest kernel. */ 306 * by the guest kernel. */
271 kvmppc_core_queue_program(vcpu, vcpu->arch.fault_esr); 307 kvmppc_core_queue_program(vcpu, vcpu->arch.fault_esr);
@@ -337,7 +373,15 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
337 break; 373 break;
338 374
339 case BOOKE_INTERRUPT_SYSCALL: 375 case BOOKE_INTERRUPT_SYSCALL:
340 kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_SYSCALL); 376 if (!(vcpu->arch.shared->msr & MSR_PR) &&
377 (((u32)kvmppc_get_gpr(vcpu, 0)) == KVM_SC_MAGIC_R0)) {
378 /* KVM PV hypercalls */
379 kvmppc_set_gpr(vcpu, 3, kvmppc_kvm_pv(vcpu));
380 r = RESUME_GUEST;
381 } else {
382 /* Guest syscalls */
383 kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_SYSCALL);
384 }
341 kvmppc_account_exit(vcpu, SYSCALL_EXITS); 385 kvmppc_account_exit(vcpu, SYSCALL_EXITS);
342 r = RESUME_GUEST; 386 r = RESUME_GUEST;
343 break; 387 break;
@@ -466,15 +510,19 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
466/* Initial guest state: 16MB mapping 0 -> 0, PC = 0, MSR = 0, R1 = 16MB */ 510/* Initial guest state: 16MB mapping 0 -> 0, PC = 0, MSR = 0, R1 = 16MB */
467int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) 511int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
468{ 512{
513 int i;
514
469 vcpu->arch.pc = 0; 515 vcpu->arch.pc = 0;
470 vcpu->arch.msr = 0; 516 vcpu->arch.shared->msr = 0;
471 kvmppc_set_gpr(vcpu, 1, (16<<20) - 8); /* -8 for the callee-save LR slot */ 517 kvmppc_set_gpr(vcpu, 1, (16<<20) - 8); /* -8 for the callee-save LR slot */
472 518
473 vcpu->arch.shadow_pid = 1; 519 vcpu->arch.shadow_pid = 1;
474 520
475 /* Eye-catching number so we know if the guest takes an interrupt 521 /* Eye-catching numbers so we know if the guest takes an interrupt
476 * before it's programmed its own IVPR. */ 522 * before it's programmed its own IVPR/IVORs. */
477 vcpu->arch.ivpr = 0x55550000; 523 vcpu->arch.ivpr = 0x55550000;
524 for (i = 0; i < BOOKE_IRQPRIO_MAX; i++)
525 vcpu->arch.ivor[i] = 0x7700 | i * 4;
478 526
479 kvmppc_init_timing_stats(vcpu); 527 kvmppc_init_timing_stats(vcpu);
480 528
@@ -490,14 +538,14 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
490 regs->ctr = vcpu->arch.ctr; 538 regs->ctr = vcpu->arch.ctr;
491 regs->lr = vcpu->arch.lr; 539 regs->lr = vcpu->arch.lr;
492 regs->xer = kvmppc_get_xer(vcpu); 540 regs->xer = kvmppc_get_xer(vcpu);
493 regs->msr = vcpu->arch.msr; 541 regs->msr = vcpu->arch.shared->msr;
494 regs->srr0 = vcpu->arch.srr0; 542 regs->srr0 = vcpu->arch.shared->srr0;
495 regs->srr1 = vcpu->arch.srr1; 543 regs->srr1 = vcpu->arch.shared->srr1;
496 regs->pid = vcpu->arch.pid; 544 regs->pid = vcpu->arch.pid;
497 regs->sprg0 = vcpu->arch.sprg0; 545 regs->sprg0 = vcpu->arch.shared->sprg0;
498 regs->sprg1 = vcpu->arch.sprg1; 546 regs->sprg1 = vcpu->arch.shared->sprg1;
499 regs->sprg2 = vcpu->arch.sprg2; 547 regs->sprg2 = vcpu->arch.shared->sprg2;
500 regs->sprg3 = vcpu->arch.sprg3; 548 regs->sprg3 = vcpu->arch.shared->sprg3;
501 regs->sprg5 = vcpu->arch.sprg4; 549 regs->sprg5 = vcpu->arch.sprg4;
502 regs->sprg6 = vcpu->arch.sprg5; 550 regs->sprg6 = vcpu->arch.sprg5;
503 regs->sprg7 = vcpu->arch.sprg6; 551 regs->sprg7 = vcpu->arch.sprg6;
@@ -518,12 +566,12 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
518 vcpu->arch.lr = regs->lr; 566 vcpu->arch.lr = regs->lr;
519 kvmppc_set_xer(vcpu, regs->xer); 567 kvmppc_set_xer(vcpu, regs->xer);
520 kvmppc_set_msr(vcpu, regs->msr); 568 kvmppc_set_msr(vcpu, regs->msr);
521 vcpu->arch.srr0 = regs->srr0; 569 vcpu->arch.shared->srr0 = regs->srr0;
522 vcpu->arch.srr1 = regs->srr1; 570 vcpu->arch.shared->srr1 = regs->srr1;
523 vcpu->arch.sprg0 = regs->sprg0; 571 vcpu->arch.shared->sprg0 = regs->sprg0;
524 vcpu->arch.sprg1 = regs->sprg1; 572 vcpu->arch.shared->sprg1 = regs->sprg1;
525 vcpu->arch.sprg2 = regs->sprg2; 573 vcpu->arch.shared->sprg2 = regs->sprg2;
526 vcpu->arch.sprg3 = regs->sprg3; 574 vcpu->arch.shared->sprg3 = regs->sprg3;
527 vcpu->arch.sprg5 = regs->sprg4; 575 vcpu->arch.sprg5 = regs->sprg4;
528 vcpu->arch.sprg6 = regs->sprg5; 576 vcpu->arch.sprg6 = regs->sprg5;
529 vcpu->arch.sprg7 = regs->sprg6; 577 vcpu->arch.sprg7 = regs->sprg6;
diff --git a/arch/powerpc/kvm/booke.h b/arch/powerpc/kvm/booke.h
index d59bcca1f9d8..492bb7030358 100644
--- a/arch/powerpc/kvm/booke.h
+++ b/arch/powerpc/kvm/booke.h
@@ -46,7 +46,9 @@
46#define BOOKE_IRQPRIO_FIT 17 46#define BOOKE_IRQPRIO_FIT 17
47#define BOOKE_IRQPRIO_DECREMENTER 18 47#define BOOKE_IRQPRIO_DECREMENTER 18
48#define BOOKE_IRQPRIO_PERFORMANCE_MONITOR 19 48#define BOOKE_IRQPRIO_PERFORMANCE_MONITOR 19
49#define BOOKE_IRQPRIO_MAX 19 49/* Internal pseudo-irqprio for level triggered externals */
50#define BOOKE_IRQPRIO_EXTERNAL_LEVEL 20
51#define BOOKE_IRQPRIO_MAX 20
50 52
51extern unsigned long kvmppc_booke_handlers; 53extern unsigned long kvmppc_booke_handlers;
52 54
@@ -54,12 +56,12 @@ extern unsigned long kvmppc_booke_handlers;
54 * changing. */ 56 * changing. */
55static inline void kvmppc_set_msr(struct kvm_vcpu *vcpu, u32 new_msr) 57static inline void kvmppc_set_msr(struct kvm_vcpu *vcpu, u32 new_msr)
56{ 58{
57 if ((new_msr & MSR_PR) != (vcpu->arch.msr & MSR_PR)) 59 if ((new_msr & MSR_PR) != (vcpu->arch.shared->msr & MSR_PR))
58 kvmppc_mmu_priv_switch(vcpu, new_msr & MSR_PR); 60 kvmppc_mmu_priv_switch(vcpu, new_msr & MSR_PR);
59 61
60 vcpu->arch.msr = new_msr; 62 vcpu->arch.shared->msr = new_msr;
61 63
62 if (vcpu->arch.msr & MSR_WE) { 64 if (vcpu->arch.shared->msr & MSR_WE) {
63 kvm_vcpu_block(vcpu); 65 kvm_vcpu_block(vcpu);
64 kvmppc_set_exit_type(vcpu, EMULATED_MTMSRWE_EXITS); 66 kvmppc_set_exit_type(vcpu, EMULATED_MTMSRWE_EXITS);
65 }; 67 };
diff --git a/arch/powerpc/kvm/booke_emulate.c b/arch/powerpc/kvm/booke_emulate.c
index cbc790ee1928..1260f5f24c0c 100644
--- a/arch/powerpc/kvm/booke_emulate.c
+++ b/arch/powerpc/kvm/booke_emulate.c
@@ -31,8 +31,8 @@
31 31
32static void kvmppc_emul_rfi(struct kvm_vcpu *vcpu) 32static void kvmppc_emul_rfi(struct kvm_vcpu *vcpu)
33{ 33{
34 vcpu->arch.pc = vcpu->arch.srr0; 34 vcpu->arch.pc = vcpu->arch.shared->srr0;
35 kvmppc_set_msr(vcpu, vcpu->arch.srr1); 35 kvmppc_set_msr(vcpu, vcpu->arch.shared->srr1);
36} 36}
37 37
38int kvmppc_booke_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, 38int kvmppc_booke_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
@@ -62,7 +62,7 @@ int kvmppc_booke_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
62 62
63 case OP_31_XOP_MFMSR: 63 case OP_31_XOP_MFMSR:
64 rt = get_rt(inst); 64 rt = get_rt(inst);
65 kvmppc_set_gpr(vcpu, rt, vcpu->arch.msr); 65 kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->msr);
66 kvmppc_set_exit_type(vcpu, EMULATED_MFMSR_EXITS); 66 kvmppc_set_exit_type(vcpu, EMULATED_MFMSR_EXITS);
67 break; 67 break;
68 68
@@ -74,13 +74,13 @@ int kvmppc_booke_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
74 74
75 case OP_31_XOP_WRTEE: 75 case OP_31_XOP_WRTEE:
76 rs = get_rs(inst); 76 rs = get_rs(inst);
77 vcpu->arch.msr = (vcpu->arch.msr & ~MSR_EE) 77 vcpu->arch.shared->msr = (vcpu->arch.shared->msr & ~MSR_EE)
78 | (kvmppc_get_gpr(vcpu, rs) & MSR_EE); 78 | (kvmppc_get_gpr(vcpu, rs) & MSR_EE);
79 kvmppc_set_exit_type(vcpu, EMULATED_WRTEE_EXITS); 79 kvmppc_set_exit_type(vcpu, EMULATED_WRTEE_EXITS);
80 break; 80 break;
81 81
82 case OP_31_XOP_WRTEEI: 82 case OP_31_XOP_WRTEEI:
83 vcpu->arch.msr = (vcpu->arch.msr & ~MSR_EE) 83 vcpu->arch.shared->msr = (vcpu->arch.shared->msr & ~MSR_EE)
84 | (inst & MSR_EE); 84 | (inst & MSR_EE);
85 kvmppc_set_exit_type(vcpu, EMULATED_WRTEE_EXITS); 85 kvmppc_set_exit_type(vcpu, EMULATED_WRTEE_EXITS);
86 break; 86 break;
@@ -105,7 +105,7 @@ int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs)
105 105
106 switch (sprn) { 106 switch (sprn) {
107 case SPRN_DEAR: 107 case SPRN_DEAR:
108 vcpu->arch.dear = spr_val; break; 108 vcpu->arch.shared->dar = spr_val; break;
109 case SPRN_ESR: 109 case SPRN_ESR:
110 vcpu->arch.esr = spr_val; break; 110 vcpu->arch.esr = spr_val; break;
111 case SPRN_DBCR0: 111 case SPRN_DBCR0:
@@ -200,7 +200,7 @@ int kvmppc_booke_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt)
200 case SPRN_IVPR: 200 case SPRN_IVPR:
201 kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivpr); break; 201 kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivpr); break;
202 case SPRN_DEAR: 202 case SPRN_DEAR:
203 kvmppc_set_gpr(vcpu, rt, vcpu->arch.dear); break; 203 kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->dar); break;
204 case SPRN_ESR: 204 case SPRN_ESR:
205 kvmppc_set_gpr(vcpu, rt, vcpu->arch.esr); break; 205 kvmppc_set_gpr(vcpu, rt, vcpu->arch.esr); break;
206 case SPRN_DBCR0: 206 case SPRN_DBCR0:
diff --git a/arch/powerpc/kvm/booke_interrupts.S b/arch/powerpc/kvm/booke_interrupts.S
index 380a78cf484d..049846911ce4 100644
--- a/arch/powerpc/kvm/booke_interrupts.S
+++ b/arch/powerpc/kvm/booke_interrupts.S
@@ -415,7 +415,8 @@ lightweight_exit:
415 lwz r8, VCPU_GPR(r8)(r4) 415 lwz r8, VCPU_GPR(r8)(r4)
416 lwz r3, VCPU_PC(r4) 416 lwz r3, VCPU_PC(r4)
417 mtsrr0 r3 417 mtsrr0 r3
418 lwz r3, VCPU_MSR(r4) 418 lwz r3, VCPU_SHARED(r4)
419 lwz r3, VCPU_SHARED_MSR(r3)
419 oris r3, r3, KVMPPC_MSR_MASK@h 420 oris r3, r3, KVMPPC_MSR_MASK@h
420 ori r3, r3, KVMPPC_MSR_MASK@l 421 ori r3, r3, KVMPPC_MSR_MASK@l
421 mtsrr1 r3 422 mtsrr1 r3
diff --git a/arch/powerpc/kvm/e500.c b/arch/powerpc/kvm/e500.c
index e8a00b0c4449..71750f2dd5d3 100644
--- a/arch/powerpc/kvm/e500.c
+++ b/arch/powerpc/kvm/e500.c
@@ -117,8 +117,14 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
117 if (err) 117 if (err)
118 goto uninit_vcpu; 118 goto uninit_vcpu;
119 119
120 vcpu->arch.shared = (void*)__get_free_page(GFP_KERNEL|__GFP_ZERO);
121 if (!vcpu->arch.shared)
122 goto uninit_tlb;
123
120 return vcpu; 124 return vcpu;
121 125
126uninit_tlb:
127 kvmppc_e500_tlb_uninit(vcpu_e500);
122uninit_vcpu: 128uninit_vcpu:
123 kvm_vcpu_uninit(vcpu); 129 kvm_vcpu_uninit(vcpu);
124free_vcpu: 130free_vcpu:
@@ -131,6 +137,7 @@ void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu)
131{ 137{
132 struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); 138 struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
133 139
140 free_page((unsigned long)vcpu->arch.shared);
134 kvmppc_e500_tlb_uninit(vcpu_e500); 141 kvmppc_e500_tlb_uninit(vcpu_e500);
135 kvm_vcpu_uninit(vcpu); 142 kvm_vcpu_uninit(vcpu);
136 kmem_cache_free(kvm_vcpu_cache, vcpu_e500); 143 kmem_cache_free(kvm_vcpu_cache, vcpu_e500);
diff --git a/arch/powerpc/kvm/e500_tlb.c b/arch/powerpc/kvm/e500_tlb.c
index 21011e12caeb..d6d6d47a75a9 100644
--- a/arch/powerpc/kvm/e500_tlb.c
+++ b/arch/powerpc/kvm/e500_tlb.c
@@ -226,8 +226,7 @@ static void kvmppc_e500_stlbe_invalidate(struct kvmppc_vcpu_e500 *vcpu_e500,
226 226
227 kvmppc_e500_shadow_release(vcpu_e500, tlbsel, esel); 227 kvmppc_e500_shadow_release(vcpu_e500, tlbsel, esel);
228 stlbe->mas1 = 0; 228 stlbe->mas1 = 0;
229 trace_kvm_stlb_inval(index_of(tlbsel, esel), stlbe->mas1, stlbe->mas2, 229 trace_kvm_stlb_inval(index_of(tlbsel, esel));
230 stlbe->mas3, stlbe->mas7);
231} 230}
232 231
233static void kvmppc_e500_tlb1_invalidate(struct kvmppc_vcpu_e500 *vcpu_e500, 232static void kvmppc_e500_tlb1_invalidate(struct kvmppc_vcpu_e500 *vcpu_e500,
@@ -298,7 +297,8 @@ static inline void kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
298 /* Get reference to new page. */ 297 /* Get reference to new page. */
299 new_page = gfn_to_page(vcpu_e500->vcpu.kvm, gfn); 298 new_page = gfn_to_page(vcpu_e500->vcpu.kvm, gfn);
300 if (is_error_page(new_page)) { 299 if (is_error_page(new_page)) {
301 printk(KERN_ERR "Couldn't get guest page for gfn %lx!\n", gfn); 300 printk(KERN_ERR "Couldn't get guest page for gfn %lx!\n",
301 (long)gfn);
302 kvm_release_page_clean(new_page); 302 kvm_release_page_clean(new_page);
303 return; 303 return;
304 } 304 }
@@ -314,10 +314,10 @@ static inline void kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
314 | MAS1_TID(get_tlb_tid(gtlbe)) | MAS1_TS | MAS1_VALID; 314 | MAS1_TID(get_tlb_tid(gtlbe)) | MAS1_TS | MAS1_VALID;
315 stlbe->mas2 = (gvaddr & MAS2_EPN) 315 stlbe->mas2 = (gvaddr & MAS2_EPN)
316 | e500_shadow_mas2_attrib(gtlbe->mas2, 316 | e500_shadow_mas2_attrib(gtlbe->mas2,
317 vcpu_e500->vcpu.arch.msr & MSR_PR); 317 vcpu_e500->vcpu.arch.shared->msr & MSR_PR);
318 stlbe->mas3 = (hpaddr & MAS3_RPN) 318 stlbe->mas3 = (hpaddr & MAS3_RPN)
319 | e500_shadow_mas3_attrib(gtlbe->mas3, 319 | e500_shadow_mas3_attrib(gtlbe->mas3,
320 vcpu_e500->vcpu.arch.msr & MSR_PR); 320 vcpu_e500->vcpu.arch.shared->msr & MSR_PR);
321 stlbe->mas7 = (hpaddr >> 32) & MAS7_RPN; 321 stlbe->mas7 = (hpaddr >> 32) & MAS7_RPN;
322 322
323 trace_kvm_stlb_write(index_of(tlbsel, esel), stlbe->mas1, stlbe->mas2, 323 trace_kvm_stlb_write(index_of(tlbsel, esel), stlbe->mas1, stlbe->mas2,
@@ -576,28 +576,28 @@ int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *vcpu)
576 576
577int kvmppc_mmu_itlb_index(struct kvm_vcpu *vcpu, gva_t eaddr) 577int kvmppc_mmu_itlb_index(struct kvm_vcpu *vcpu, gva_t eaddr)
578{ 578{
579 unsigned int as = !!(vcpu->arch.msr & MSR_IS); 579 unsigned int as = !!(vcpu->arch.shared->msr & MSR_IS);
580 580
581 return kvmppc_e500_tlb_search(vcpu, eaddr, get_cur_pid(vcpu), as); 581 return kvmppc_e500_tlb_search(vcpu, eaddr, get_cur_pid(vcpu), as);
582} 582}
583 583
584int kvmppc_mmu_dtlb_index(struct kvm_vcpu *vcpu, gva_t eaddr) 584int kvmppc_mmu_dtlb_index(struct kvm_vcpu *vcpu, gva_t eaddr)
585{ 585{
586 unsigned int as = !!(vcpu->arch.msr & MSR_DS); 586 unsigned int as = !!(vcpu->arch.shared->msr & MSR_DS);
587 587
588 return kvmppc_e500_tlb_search(vcpu, eaddr, get_cur_pid(vcpu), as); 588 return kvmppc_e500_tlb_search(vcpu, eaddr, get_cur_pid(vcpu), as);
589} 589}
590 590
591void kvmppc_mmu_itlb_miss(struct kvm_vcpu *vcpu) 591void kvmppc_mmu_itlb_miss(struct kvm_vcpu *vcpu)
592{ 592{
593 unsigned int as = !!(vcpu->arch.msr & MSR_IS); 593 unsigned int as = !!(vcpu->arch.shared->msr & MSR_IS);
594 594
595 kvmppc_e500_deliver_tlb_miss(vcpu, vcpu->arch.pc, as); 595 kvmppc_e500_deliver_tlb_miss(vcpu, vcpu->arch.pc, as);
596} 596}
597 597
598void kvmppc_mmu_dtlb_miss(struct kvm_vcpu *vcpu) 598void kvmppc_mmu_dtlb_miss(struct kvm_vcpu *vcpu)
599{ 599{
600 unsigned int as = !!(vcpu->arch.msr & MSR_DS); 600 unsigned int as = !!(vcpu->arch.shared->msr & MSR_DS);
601 601
602 kvmppc_e500_deliver_tlb_miss(vcpu, vcpu->arch.fault_dear, as); 602 kvmppc_e500_deliver_tlb_miss(vcpu, vcpu->arch.fault_dear, as);
603} 603}
diff --git a/arch/powerpc/kvm/e500_tlb.h b/arch/powerpc/kvm/e500_tlb.h
index d28e3010a5e2..458946b4775d 100644
--- a/arch/powerpc/kvm/e500_tlb.h
+++ b/arch/powerpc/kvm/e500_tlb.h
@@ -171,7 +171,7 @@ static inline int tlbe_is_host_safe(const struct kvm_vcpu *vcpu,
171 171
172 /* Does it match current guest AS? */ 172 /* Does it match current guest AS? */
173 /* XXX what about IS != DS? */ 173 /* XXX what about IS != DS? */
174 if (get_tlb_ts(tlbe) != !!(vcpu->arch.msr & MSR_IS)) 174 if (get_tlb_ts(tlbe) != !!(vcpu->arch.shared->msr & MSR_IS))
175 return 0; 175 return 0;
176 176
177 gpa = get_tlb_raddr(tlbe); 177 gpa = get_tlb_raddr(tlbe);
diff --git a/arch/powerpc/kvm/emulate.c b/arch/powerpc/kvm/emulate.c
index b83ba581fd8e..c64fd2909bb2 100644
--- a/arch/powerpc/kvm/emulate.c
+++ b/arch/powerpc/kvm/emulate.c
@@ -242,9 +242,11 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
242 242
243 switch (sprn) { 243 switch (sprn) {
244 case SPRN_SRR0: 244 case SPRN_SRR0:
245 kvmppc_set_gpr(vcpu, rt, vcpu->arch.srr0); break; 245 kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->srr0);
246 break;
246 case SPRN_SRR1: 247 case SPRN_SRR1:
247 kvmppc_set_gpr(vcpu, rt, vcpu->arch.srr1); break; 248 kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->srr1);
249 break;
248 case SPRN_PVR: 250 case SPRN_PVR:
249 kvmppc_set_gpr(vcpu, rt, vcpu->arch.pvr); break; 251 kvmppc_set_gpr(vcpu, rt, vcpu->arch.pvr); break;
250 case SPRN_PIR: 252 case SPRN_PIR:
@@ -261,13 +263,17 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
261 kvmppc_set_gpr(vcpu, rt, get_tb()); break; 263 kvmppc_set_gpr(vcpu, rt, get_tb()); break;
262 264
263 case SPRN_SPRG0: 265 case SPRN_SPRG0:
264 kvmppc_set_gpr(vcpu, rt, vcpu->arch.sprg0); break; 266 kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->sprg0);
267 break;
265 case SPRN_SPRG1: 268 case SPRN_SPRG1:
266 kvmppc_set_gpr(vcpu, rt, vcpu->arch.sprg1); break; 269 kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->sprg1);
270 break;
267 case SPRN_SPRG2: 271 case SPRN_SPRG2:
268 kvmppc_set_gpr(vcpu, rt, vcpu->arch.sprg2); break; 272 kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->sprg2);
273 break;
269 case SPRN_SPRG3: 274 case SPRN_SPRG3:
270 kvmppc_set_gpr(vcpu, rt, vcpu->arch.sprg3); break; 275 kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->sprg3);
276 break;
271 /* Note: SPRG4-7 are user-readable, so we don't get 277 /* Note: SPRG4-7 are user-readable, so we don't get
272 * a trap. */ 278 * a trap. */
273 279
@@ -320,9 +326,11 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
320 rs = get_rs(inst); 326 rs = get_rs(inst);
321 switch (sprn) { 327 switch (sprn) {
322 case SPRN_SRR0: 328 case SPRN_SRR0:
323 vcpu->arch.srr0 = kvmppc_get_gpr(vcpu, rs); break; 329 vcpu->arch.shared->srr0 = kvmppc_get_gpr(vcpu, rs);
330 break;
324 case SPRN_SRR1: 331 case SPRN_SRR1:
325 vcpu->arch.srr1 = kvmppc_get_gpr(vcpu, rs); break; 332 vcpu->arch.shared->srr1 = kvmppc_get_gpr(vcpu, rs);
333 break;
326 334
327 /* XXX We need to context-switch the timebase for 335 /* XXX We need to context-switch the timebase for
328 * watchdog and FIT. */ 336 * watchdog and FIT. */
@@ -337,13 +345,17 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
337 break; 345 break;
338 346
339 case SPRN_SPRG0: 347 case SPRN_SPRG0:
340 vcpu->arch.sprg0 = kvmppc_get_gpr(vcpu, rs); break; 348 vcpu->arch.shared->sprg0 = kvmppc_get_gpr(vcpu, rs);
349 break;
341 case SPRN_SPRG1: 350 case SPRN_SPRG1:
342 vcpu->arch.sprg1 = kvmppc_get_gpr(vcpu, rs); break; 351 vcpu->arch.shared->sprg1 = kvmppc_get_gpr(vcpu, rs);
352 break;
343 case SPRN_SPRG2: 353 case SPRN_SPRG2:
344 vcpu->arch.sprg2 = kvmppc_get_gpr(vcpu, rs); break; 354 vcpu->arch.shared->sprg2 = kvmppc_get_gpr(vcpu, rs);
355 break;
345 case SPRN_SPRG3: 356 case SPRN_SPRG3:
346 vcpu->arch.sprg3 = kvmppc_get_gpr(vcpu, rs); break; 357 vcpu->arch.shared->sprg3 = kvmppc_get_gpr(vcpu, rs);
358 break;
347 359
348 default: 360 default:
349 emulated = kvmppc_core_emulate_mtspr(vcpu, sprn, rs); 361 emulated = kvmppc_core_emulate_mtspr(vcpu, sprn, rs);
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 72a4ad86ee91..2f87a1627f6c 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -38,9 +38,56 @@
38 38
39int kvm_arch_vcpu_runnable(struct kvm_vcpu *v) 39int kvm_arch_vcpu_runnable(struct kvm_vcpu *v)
40{ 40{
41 return !(v->arch.msr & MSR_WE) || !!(v->arch.pending_exceptions); 41 return !(v->arch.shared->msr & MSR_WE) ||
42 !!(v->arch.pending_exceptions);
42} 43}
43 44
45int kvmppc_kvm_pv(struct kvm_vcpu *vcpu)
46{
47 int nr = kvmppc_get_gpr(vcpu, 11);
48 int r;
49 unsigned long __maybe_unused param1 = kvmppc_get_gpr(vcpu, 3);
50 unsigned long __maybe_unused param2 = kvmppc_get_gpr(vcpu, 4);
51 unsigned long __maybe_unused param3 = kvmppc_get_gpr(vcpu, 5);
52 unsigned long __maybe_unused param4 = kvmppc_get_gpr(vcpu, 6);
53 unsigned long r2 = 0;
54
55 if (!(vcpu->arch.shared->msr & MSR_SF)) {
56 /* 32 bit mode */
57 param1 &= 0xffffffff;
58 param2 &= 0xffffffff;
59 param3 &= 0xffffffff;
60 param4 &= 0xffffffff;
61 }
62
63 switch (nr) {
64 case HC_VENDOR_KVM | KVM_HC_PPC_MAP_MAGIC_PAGE:
65 {
66 vcpu->arch.magic_page_pa = param1;
67 vcpu->arch.magic_page_ea = param2;
68
69 r2 = KVM_MAGIC_FEAT_SR;
70
71 r = HC_EV_SUCCESS;
72 break;
73 }
74 case HC_VENDOR_KVM | KVM_HC_FEATURES:
75 r = HC_EV_SUCCESS;
76#if defined(CONFIG_PPC_BOOK3S) /* XXX Missing magic page on BookE */
77 r2 |= (1 << KVM_FEATURE_MAGIC_PAGE);
78#endif
79
80 /* Second return value is in r4 */
81 break;
82 default:
83 r = HC_EV_UNIMPLEMENTED;
84 break;
85 }
86
87 kvmppc_set_gpr(vcpu, 4, r2);
88
89 return r;
90}
44 91
45int kvmppc_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu) 92int kvmppc_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu)
46{ 93{
@@ -145,8 +192,10 @@ int kvm_dev_ioctl_check_extension(long ext)
145 case KVM_CAP_PPC_SEGSTATE: 192 case KVM_CAP_PPC_SEGSTATE:
146 case KVM_CAP_PPC_PAIRED_SINGLES: 193 case KVM_CAP_PPC_PAIRED_SINGLES:
147 case KVM_CAP_PPC_UNSET_IRQ: 194 case KVM_CAP_PPC_UNSET_IRQ:
195 case KVM_CAP_PPC_IRQ_LEVEL:
148 case KVM_CAP_ENABLE_CAP: 196 case KVM_CAP_ENABLE_CAP:
149 case KVM_CAP_PPC_OSI: 197 case KVM_CAP_PPC_OSI:
198 case KVM_CAP_PPC_GET_PVINFO:
150 r = 1; 199 r = 1;
151 break; 200 break;
152 case KVM_CAP_COALESCED_MMIO: 201 case KVM_CAP_COALESCED_MMIO:
@@ -534,16 +583,53 @@ out:
534 return r; 583 return r;
535} 584}
536 585
586static int kvm_vm_ioctl_get_pvinfo(struct kvm_ppc_pvinfo *pvinfo)
587{
588 u32 inst_lis = 0x3c000000;
589 u32 inst_ori = 0x60000000;
590 u32 inst_nop = 0x60000000;
591 u32 inst_sc = 0x44000002;
592 u32 inst_imm_mask = 0xffff;
593
594 /*
595 * The hypercall to get into KVM from within guest context is as
596 * follows:
597 *
598 * lis r0, r0, KVM_SC_MAGIC_R0@h
599 * ori r0, KVM_SC_MAGIC_R0@l
600 * sc
601 * nop
602 */
603 pvinfo->hcall[0] = inst_lis | ((KVM_SC_MAGIC_R0 >> 16) & inst_imm_mask);
604 pvinfo->hcall[1] = inst_ori | (KVM_SC_MAGIC_R0 & inst_imm_mask);
605 pvinfo->hcall[2] = inst_sc;
606 pvinfo->hcall[3] = inst_nop;
607
608 return 0;
609}
610
537long kvm_arch_vm_ioctl(struct file *filp, 611long kvm_arch_vm_ioctl(struct file *filp,
538 unsigned int ioctl, unsigned long arg) 612 unsigned int ioctl, unsigned long arg)
539{ 613{
614 void __user *argp = (void __user *)arg;
540 long r; 615 long r;
541 616
542 switch (ioctl) { 617 switch (ioctl) {
618 case KVM_PPC_GET_PVINFO: {
619 struct kvm_ppc_pvinfo pvinfo;
620 r = kvm_vm_ioctl_get_pvinfo(&pvinfo);
621 if (copy_to_user(argp, &pvinfo, sizeof(pvinfo))) {
622 r = -EFAULT;
623 goto out;
624 }
625
626 break;
627 }
543 default: 628 default:
544 r = -ENOTTY; 629 r = -ENOTTY;
545 } 630 }
546 631
632out:
547 return r; 633 return r;
548} 634}
549 635
diff --git a/arch/powerpc/kvm/trace.h b/arch/powerpc/kvm/trace.h
index a8e840018052..3aca1b042b8c 100644
--- a/arch/powerpc/kvm/trace.h
+++ b/arch/powerpc/kvm/trace.h
@@ -98,6 +98,245 @@ TRACE_EVENT(kvm_gtlb_write,
98 __entry->word1, __entry->word2) 98 __entry->word1, __entry->word2)
99); 99);
100 100
101
102/*************************************************************************
103 * Book3S trace points *
104 *************************************************************************/
105
106#ifdef CONFIG_PPC_BOOK3S
107
108TRACE_EVENT(kvm_book3s_exit,
109 TP_PROTO(unsigned int exit_nr, struct kvm_vcpu *vcpu),
110 TP_ARGS(exit_nr, vcpu),
111
112 TP_STRUCT__entry(
113 __field( unsigned int, exit_nr )
114 __field( unsigned long, pc )
115 __field( unsigned long, msr )
116 __field( unsigned long, dar )
117 __field( unsigned long, srr1 )
118 ),
119
120 TP_fast_assign(
121 __entry->exit_nr = exit_nr;
122 __entry->pc = kvmppc_get_pc(vcpu);
123 __entry->dar = kvmppc_get_fault_dar(vcpu);
124 __entry->msr = vcpu->arch.shared->msr;
125 __entry->srr1 = to_svcpu(vcpu)->shadow_srr1;
126 ),
127
128 TP_printk("exit=0x%x | pc=0x%lx | msr=0x%lx | dar=0x%lx | srr1=0x%lx",
129 __entry->exit_nr, __entry->pc, __entry->msr, __entry->dar,
130 __entry->srr1)
131);
132
133TRACE_EVENT(kvm_book3s_reenter,
134 TP_PROTO(int r, struct kvm_vcpu *vcpu),
135 TP_ARGS(r, vcpu),
136
137 TP_STRUCT__entry(
138 __field( unsigned int, r )
139 __field( unsigned long, pc )
140 ),
141
142 TP_fast_assign(
143 __entry->r = r;
144 __entry->pc = kvmppc_get_pc(vcpu);
145 ),
146
147 TP_printk("reentry r=%d | pc=0x%lx", __entry->r, __entry->pc)
148);
149
150#ifdef CONFIG_PPC_BOOK3S_64
151
152TRACE_EVENT(kvm_book3s_64_mmu_map,
153 TP_PROTO(int rflags, ulong hpteg, ulong va, pfn_t hpaddr,
154 struct kvmppc_pte *orig_pte),
155 TP_ARGS(rflags, hpteg, va, hpaddr, orig_pte),
156
157 TP_STRUCT__entry(
158 __field( unsigned char, flag_w )
159 __field( unsigned char, flag_x )
160 __field( unsigned long, eaddr )
161 __field( unsigned long, hpteg )
162 __field( unsigned long, va )
163 __field( unsigned long long, vpage )
164 __field( unsigned long, hpaddr )
165 ),
166
167 TP_fast_assign(
168 __entry->flag_w = ((rflags & HPTE_R_PP) == 3) ? '-' : 'w';
169 __entry->flag_x = (rflags & HPTE_R_N) ? '-' : 'x';
170 __entry->eaddr = orig_pte->eaddr;
171 __entry->hpteg = hpteg;
172 __entry->va = va;
173 __entry->vpage = orig_pte->vpage;
174 __entry->hpaddr = hpaddr;
175 ),
176
177 TP_printk("KVM: %c%c Map 0x%lx: [%lx] 0x%lx (0x%llx) -> %lx",
178 __entry->flag_w, __entry->flag_x, __entry->eaddr,
179 __entry->hpteg, __entry->va, __entry->vpage, __entry->hpaddr)
180);
181
182#endif /* CONFIG_PPC_BOOK3S_64 */
183
184TRACE_EVENT(kvm_book3s_mmu_map,
185 TP_PROTO(struct hpte_cache *pte),
186 TP_ARGS(pte),
187
188 TP_STRUCT__entry(
189 __field( u64, host_va )
190 __field( u64, pfn )
191 __field( ulong, eaddr )
192 __field( u64, vpage )
193 __field( ulong, raddr )
194 __field( int, flags )
195 ),
196
197 TP_fast_assign(
198 __entry->host_va = pte->host_va;
199 __entry->pfn = pte->pfn;
200 __entry->eaddr = pte->pte.eaddr;
201 __entry->vpage = pte->pte.vpage;
202 __entry->raddr = pte->pte.raddr;
203 __entry->flags = (pte->pte.may_read ? 0x4 : 0) |
204 (pte->pte.may_write ? 0x2 : 0) |
205 (pte->pte.may_execute ? 0x1 : 0);
206 ),
207
208 TP_printk("Map: hva=%llx pfn=%llx ea=%lx vp=%llx ra=%lx [%x]",
209 __entry->host_va, __entry->pfn, __entry->eaddr,
210 __entry->vpage, __entry->raddr, __entry->flags)
211);
212
213TRACE_EVENT(kvm_book3s_mmu_invalidate,
214 TP_PROTO(struct hpte_cache *pte),
215 TP_ARGS(pte),
216
217 TP_STRUCT__entry(
218 __field( u64, host_va )
219 __field( u64, pfn )
220 __field( ulong, eaddr )
221 __field( u64, vpage )
222 __field( ulong, raddr )
223 __field( int, flags )
224 ),
225
226 TP_fast_assign(
227 __entry->host_va = pte->host_va;
228 __entry->pfn = pte->pfn;
229 __entry->eaddr = pte->pte.eaddr;
230 __entry->vpage = pte->pte.vpage;
231 __entry->raddr = pte->pte.raddr;
232 __entry->flags = (pte->pte.may_read ? 0x4 : 0) |
233 (pte->pte.may_write ? 0x2 : 0) |
234 (pte->pte.may_execute ? 0x1 : 0);
235 ),
236
237 TP_printk("Flush: hva=%llx pfn=%llx ea=%lx vp=%llx ra=%lx [%x]",
238 __entry->host_va, __entry->pfn, __entry->eaddr,
239 __entry->vpage, __entry->raddr, __entry->flags)
240);
241
242TRACE_EVENT(kvm_book3s_mmu_flush,
243 TP_PROTO(const char *type, struct kvm_vcpu *vcpu, unsigned long long p1,
244 unsigned long long p2),
245 TP_ARGS(type, vcpu, p1, p2),
246
247 TP_STRUCT__entry(
248 __field( int, count )
249 __field( unsigned long long, p1 )
250 __field( unsigned long long, p2 )
251 __field( const char *, type )
252 ),
253
254 TP_fast_assign(
255 __entry->count = vcpu->arch.hpte_cache_count;
256 __entry->p1 = p1;
257 __entry->p2 = p2;
258 __entry->type = type;
259 ),
260
261 TP_printk("Flush %d %sPTEs: %llx - %llx",
262 __entry->count, __entry->type, __entry->p1, __entry->p2)
263);
264
265TRACE_EVENT(kvm_book3s_slb_found,
266 TP_PROTO(unsigned long long gvsid, unsigned long long hvsid),
267 TP_ARGS(gvsid, hvsid),
268
269 TP_STRUCT__entry(
270 __field( unsigned long long, gvsid )
271 __field( unsigned long long, hvsid )
272 ),
273
274 TP_fast_assign(
275 __entry->gvsid = gvsid;
276 __entry->hvsid = hvsid;
277 ),
278
279 TP_printk("%llx -> %llx", __entry->gvsid, __entry->hvsid)
280);
281
282TRACE_EVENT(kvm_book3s_slb_fail,
283 TP_PROTO(u16 sid_map_mask, unsigned long long gvsid),
284 TP_ARGS(sid_map_mask, gvsid),
285
286 TP_STRUCT__entry(
287 __field( unsigned short, sid_map_mask )
288 __field( unsigned long long, gvsid )
289 ),
290
291 TP_fast_assign(
292 __entry->sid_map_mask = sid_map_mask;
293 __entry->gvsid = gvsid;
294 ),
295
296 TP_printk("%x/%x: %llx", __entry->sid_map_mask,
297 SID_MAP_MASK - __entry->sid_map_mask, __entry->gvsid)
298);
299
300TRACE_EVENT(kvm_book3s_slb_map,
301 TP_PROTO(u16 sid_map_mask, unsigned long long gvsid,
302 unsigned long long hvsid),
303 TP_ARGS(sid_map_mask, gvsid, hvsid),
304
305 TP_STRUCT__entry(
306 __field( unsigned short, sid_map_mask )
307 __field( unsigned long long, guest_vsid )
308 __field( unsigned long long, host_vsid )
309 ),
310
311 TP_fast_assign(
312 __entry->sid_map_mask = sid_map_mask;
313 __entry->guest_vsid = gvsid;
314 __entry->host_vsid = hvsid;
315 ),
316
317 TP_printk("%x: %llx -> %llx", __entry->sid_map_mask,
318 __entry->guest_vsid, __entry->host_vsid)
319);
320
321TRACE_EVENT(kvm_book3s_slbmte,
322 TP_PROTO(u64 slb_vsid, u64 slb_esid),
323 TP_ARGS(slb_vsid, slb_esid),
324
325 TP_STRUCT__entry(
326 __field( u64, slb_vsid )
327 __field( u64, slb_esid )
328 ),
329
330 TP_fast_assign(
331 __entry->slb_vsid = slb_vsid;
332 __entry->slb_esid = slb_esid;
333 ),
334
335 TP_printk("%llx, %llx", __entry->slb_vsid, __entry->slb_esid)
336);
337
338#endif /* CONFIG_PPC_BOOK3S */
339
101#endif /* _TRACE_KVM_H */ 340#endif /* _TRACE_KVM_H */
102 341
103/* This part must be outside protection */ 342/* This part must be outside protection */
diff --git a/arch/powerpc/platforms/Kconfig b/arch/powerpc/platforms/Kconfig
index 81c9208025fa..956154f32cfe 100644
--- a/arch/powerpc/platforms/Kconfig
+++ b/arch/powerpc/platforms/Kconfig
@@ -21,6 +21,16 @@ source "arch/powerpc/platforms/44x/Kconfig"
21source "arch/powerpc/platforms/40x/Kconfig" 21source "arch/powerpc/platforms/40x/Kconfig"
22source "arch/powerpc/platforms/amigaone/Kconfig" 22source "arch/powerpc/platforms/amigaone/Kconfig"
23 23
24config KVM_GUEST
25 bool "KVM Guest support"
26 default y
27 ---help---
28 This option enables various optimizations for running under the KVM
29 hypervisor. Overhead for the kernel when not running inside KVM should
30 be minimal.
31
32 In case of doubt, say Y
33
24config PPC_NATIVE 34config PPC_NATIVE
25 bool 35 bool
26 depends on 6xx || PPC64 36 depends on 6xx || PPC64
diff --git a/arch/s390/include/asm/Kbuild b/arch/s390/include/asm/Kbuild
index 42e512ba8b43..287d7bbb6d36 100644
--- a/arch/s390/include/asm/Kbuild
+++ b/arch/s390/include/asm/Kbuild
@@ -5,6 +5,7 @@ header-y += chsc.h
5header-y += cmb.h 5header-y += cmb.h
6header-y += dasd.h 6header-y += dasd.h
7header-y += debug.h 7header-y += debug.h
8header-y += kvm_virtio.h
8header-y += monwriter.h 9header-y += monwriter.h
9header-y += qeth.h 10header-y += qeth.h
10header-y += schid.h 11header-y += schid.h
diff --git a/arch/s390/include/asm/kvm_virtio.h b/arch/s390/include/asm/kvm_virtio.h
index acdfdff26611..72f614181eff 100644
--- a/arch/s390/include/asm/kvm_virtio.h
+++ b/arch/s390/include/asm/kvm_virtio.h
@@ -54,4 +54,11 @@ struct kvm_vqconfig {
54 * This is pagesize for historical reasons. */ 54 * This is pagesize for historical reasons. */
55#define KVM_S390_VIRTIO_RING_ALIGN 4096 55#define KVM_S390_VIRTIO_RING_ALIGN 4096
56 56
57
58/* These values are supposed to be in ext_params on an interrupt */
59#define VIRTIO_PARAM_MASK 0xff
60#define VIRTIO_PARAM_VRING_INTERRUPT 0x0
61#define VIRTIO_PARAM_CONFIG_CHANGED 0x1
62#define VIRTIO_PARAM_DEV_ADD 0x2
63
57#endif 64#endif
diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index 1f99ecfc48e1..b36c6b3fe144 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -139,6 +139,7 @@ struct x86_emulate_ops {
139 void (*set_segment_selector)(u16 sel, int seg, struct kvm_vcpu *vcpu); 139 void (*set_segment_selector)(u16 sel, int seg, struct kvm_vcpu *vcpu);
140 unsigned long (*get_cached_segment_base)(int seg, struct kvm_vcpu *vcpu); 140 unsigned long (*get_cached_segment_base)(int seg, struct kvm_vcpu *vcpu);
141 void (*get_gdt)(struct desc_ptr *dt, struct kvm_vcpu *vcpu); 141 void (*get_gdt)(struct desc_ptr *dt, struct kvm_vcpu *vcpu);
142 void (*get_idt)(struct desc_ptr *dt, struct kvm_vcpu *vcpu);
142 ulong (*get_cr)(int cr, struct kvm_vcpu *vcpu); 143 ulong (*get_cr)(int cr, struct kvm_vcpu *vcpu);
143 int (*set_cr)(int cr, ulong val, struct kvm_vcpu *vcpu); 144 int (*set_cr)(int cr, ulong val, struct kvm_vcpu *vcpu);
144 int (*cpl)(struct kvm_vcpu *vcpu); 145 int (*cpl)(struct kvm_vcpu *vcpu);
@@ -156,7 +157,10 @@ struct operand {
156 unsigned long orig_val; 157 unsigned long orig_val;
157 u64 orig_val64; 158 u64 orig_val64;
158 }; 159 };
159 unsigned long *ptr; 160 union {
161 unsigned long *reg;
162 unsigned long mem;
163 } addr;
160 union { 164 union {
161 unsigned long val; 165 unsigned long val;
162 u64 val64; 166 u64 val64;
@@ -190,6 +194,7 @@ struct decode_cache {
190 bool has_seg_override; 194 bool has_seg_override;
191 u8 seg_override; 195 u8 seg_override;
192 unsigned int d; 196 unsigned int d;
197 int (*execute)(struct x86_emulate_ctxt *ctxt);
193 unsigned long regs[NR_VCPU_REGS]; 198 unsigned long regs[NR_VCPU_REGS];
194 unsigned long eip; 199 unsigned long eip;
195 /* modrm */ 200 /* modrm */
@@ -197,17 +202,16 @@ struct decode_cache {
197 u8 modrm_mod; 202 u8 modrm_mod;
198 u8 modrm_reg; 203 u8 modrm_reg;
199 u8 modrm_rm; 204 u8 modrm_rm;
200 u8 use_modrm_ea; 205 u8 modrm_seg;
201 bool rip_relative; 206 bool rip_relative;
202 unsigned long modrm_ea;
203 void *modrm_ptr;
204 unsigned long modrm_val;
205 struct fetch_cache fetch; 207 struct fetch_cache fetch;
206 struct read_cache io_read; 208 struct read_cache io_read;
207 struct read_cache mem_read; 209 struct read_cache mem_read;
208}; 210};
209 211
210struct x86_emulate_ctxt { 212struct x86_emulate_ctxt {
213 struct x86_emulate_ops *ops;
214
211 /* Register state before/after emulation. */ 215 /* Register state before/after emulation. */
212 struct kvm_vcpu *vcpu; 216 struct kvm_vcpu *vcpu;
213 217
@@ -220,12 +224,11 @@ struct x86_emulate_ctxt {
220 /* interruptibility state, as a result of execution of STI or MOV SS */ 224 /* interruptibility state, as a result of execution of STI or MOV SS */
221 int interruptibility; 225 int interruptibility;
222 226
223 bool restart; /* restart string instruction after writeback */ 227 bool perm_ok; /* do not check permissions if true */
224 228
225 int exception; /* exception that happens during emulation or -1 */ 229 int exception; /* exception that happens during emulation or -1 */
226 u32 error_code; /* error code for exception */ 230 u32 error_code; /* error code for exception */
227 bool error_code_valid; 231 bool error_code_valid;
228 unsigned long cr2; /* faulted address in case of #PF */
229 232
230 /* decode cache */ 233 /* decode cache */
231 struct decode_cache decode; 234 struct decode_cache decode;
@@ -249,13 +252,14 @@ struct x86_emulate_ctxt {
249#define X86EMUL_MODE_HOST X86EMUL_MODE_PROT64 252#define X86EMUL_MODE_HOST X86EMUL_MODE_PROT64
250#endif 253#endif
251 254
252int x86_decode_insn(struct x86_emulate_ctxt *ctxt, 255int x86_decode_insn(struct x86_emulate_ctxt *ctxt);
253 struct x86_emulate_ops *ops); 256#define EMULATION_FAILED -1
254int x86_emulate_insn(struct x86_emulate_ctxt *ctxt, 257#define EMULATION_OK 0
255 struct x86_emulate_ops *ops); 258#define EMULATION_RESTART 1
259int x86_emulate_insn(struct x86_emulate_ctxt *ctxt);
256int emulator_task_switch(struct x86_emulate_ctxt *ctxt, 260int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
257 struct x86_emulate_ops *ops,
258 u16 tss_selector, int reason, 261 u16 tss_selector, int reason,
259 bool has_error_code, u32 error_code); 262 bool has_error_code, u32 error_code);
260 263int emulate_int_real(struct x86_emulate_ctxt *ctxt,
264 struct x86_emulate_ops *ops, int irq);
261#endif /* _ASM_X86_KVM_X86_EMULATE_H */ 265#endif /* _ASM_X86_KVM_X86_EMULATE_H */
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index c52e2eb40a1e..9e6fe391094e 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -236,10 +236,14 @@ struct kvm_pio_request {
236 */ 236 */
237struct kvm_mmu { 237struct kvm_mmu {
238 void (*new_cr3)(struct kvm_vcpu *vcpu); 238 void (*new_cr3)(struct kvm_vcpu *vcpu);
239 void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long root);
240 unsigned long (*get_cr3)(struct kvm_vcpu *vcpu);
239 int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err); 241 int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err);
242 void (*inject_page_fault)(struct kvm_vcpu *vcpu);
240 void (*free)(struct kvm_vcpu *vcpu); 243 void (*free)(struct kvm_vcpu *vcpu);
241 gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva, u32 access, 244 gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva, u32 access,
242 u32 *error); 245 u32 *error);
246 gpa_t (*translate_gpa)(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access);
243 void (*prefetch_page)(struct kvm_vcpu *vcpu, 247 void (*prefetch_page)(struct kvm_vcpu *vcpu,
244 struct kvm_mmu_page *page); 248 struct kvm_mmu_page *page);
245 int (*sync_page)(struct kvm_vcpu *vcpu, 249 int (*sync_page)(struct kvm_vcpu *vcpu,
@@ -249,13 +253,18 @@ struct kvm_mmu {
249 int root_level; 253 int root_level;
250 int shadow_root_level; 254 int shadow_root_level;
251 union kvm_mmu_page_role base_role; 255 union kvm_mmu_page_role base_role;
256 bool direct_map;
252 257
253 u64 *pae_root; 258 u64 *pae_root;
259 u64 *lm_root;
254 u64 rsvd_bits_mask[2][4]; 260 u64 rsvd_bits_mask[2][4];
261
262 bool nx;
263
264 u64 pdptrs[4]; /* pae */
255}; 265};
256 266
257struct kvm_vcpu_arch { 267struct kvm_vcpu_arch {
258 u64 host_tsc;
259 /* 268 /*
260 * rip and regs accesses must go through 269 * rip and regs accesses must go through
261 * kvm_{register,rip}_{read,write} functions. 270 * kvm_{register,rip}_{read,write} functions.
@@ -272,7 +281,6 @@ struct kvm_vcpu_arch {
272 unsigned long cr4_guest_owned_bits; 281 unsigned long cr4_guest_owned_bits;
273 unsigned long cr8; 282 unsigned long cr8;
274 u32 hflags; 283 u32 hflags;
275 u64 pdptrs[4]; /* pae */
276 u64 efer; 284 u64 efer;
277 u64 apic_base; 285 u64 apic_base;
278 struct kvm_lapic *apic; /* kernel irqchip context */ 286 struct kvm_lapic *apic; /* kernel irqchip context */
@@ -282,7 +290,41 @@ struct kvm_vcpu_arch {
282 u64 ia32_misc_enable_msr; 290 u64 ia32_misc_enable_msr;
283 bool tpr_access_reporting; 291 bool tpr_access_reporting;
284 292
293 /*
294 * Paging state of the vcpu
295 *
296 * If the vcpu runs in guest mode with two level paging this still saves
297 * the paging mode of the l1 guest. This context is always used to
298 * handle faults.
299 */
285 struct kvm_mmu mmu; 300 struct kvm_mmu mmu;
301
302 /*
303 * Paging state of an L2 guest (used for nested npt)
304 *
305 * This context will save all necessary information to walk page tables
306 * of the an L2 guest. This context is only initialized for page table
307 * walking and not for faulting since we never handle l2 page faults on
308 * the host.
309 */
310 struct kvm_mmu nested_mmu;
311
312 /*
313 * Pointer to the mmu context currently used for
314 * gva_to_gpa translations.
315 */
316 struct kvm_mmu *walk_mmu;
317
318 /*
319 * This struct is filled with the necessary information to propagate a
320 * page fault into the guest
321 */
322 struct {
323 u64 address;
324 unsigned error_code;
325 bool nested;
326 } fault;
327
286 /* only needed in kvm_pv_mmu_op() path, but it's hot so 328 /* only needed in kvm_pv_mmu_op() path, but it's hot so
287 * put it here to avoid allocation */ 329 * put it here to avoid allocation */
288 struct kvm_pv_mmu_op_buffer mmu_op_buffer; 330 struct kvm_pv_mmu_op_buffer mmu_op_buffer;
@@ -336,9 +378,15 @@ struct kvm_vcpu_arch {
336 378
337 gpa_t time; 379 gpa_t time;
338 struct pvclock_vcpu_time_info hv_clock; 380 struct pvclock_vcpu_time_info hv_clock;
339 unsigned int hv_clock_tsc_khz; 381 unsigned int hw_tsc_khz;
340 unsigned int time_offset; 382 unsigned int time_offset;
341 struct page *time_page; 383 struct page *time_page;
384 u64 last_host_tsc;
385 u64 last_guest_tsc;
386 u64 last_kernel_ns;
387 u64 last_tsc_nsec;
388 u64 last_tsc_write;
389 bool tsc_catchup;
342 390
343 bool nmi_pending; 391 bool nmi_pending;
344 bool nmi_injected; 392 bool nmi_injected;
@@ -367,9 +415,9 @@ struct kvm_vcpu_arch {
367}; 415};
368 416
369struct kvm_arch { 417struct kvm_arch {
370 unsigned int n_free_mmu_pages; 418 unsigned int n_used_mmu_pages;
371 unsigned int n_requested_mmu_pages; 419 unsigned int n_requested_mmu_pages;
372 unsigned int n_alloc_mmu_pages; 420 unsigned int n_max_mmu_pages;
373 atomic_t invlpg_counter; 421 atomic_t invlpg_counter;
374 struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES]; 422 struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
375 /* 423 /*
@@ -394,8 +442,14 @@ struct kvm_arch {
394 gpa_t ept_identity_map_addr; 442 gpa_t ept_identity_map_addr;
395 443
396 unsigned long irq_sources_bitmap; 444 unsigned long irq_sources_bitmap;
397 u64 vm_init_tsc;
398 s64 kvmclock_offset; 445 s64 kvmclock_offset;
446 spinlock_t tsc_write_lock;
447 u64 last_tsc_nsec;
448 u64 last_tsc_offset;
449 u64 last_tsc_write;
450 u32 virtual_tsc_khz;
451 u32 virtual_tsc_mult;
452 s8 virtual_tsc_shift;
399 453
400 struct kvm_xen_hvm_config xen_hvm_config; 454 struct kvm_xen_hvm_config xen_hvm_config;
401 455
@@ -505,6 +559,7 @@ struct kvm_x86_ops {
505 void (*queue_exception)(struct kvm_vcpu *vcpu, unsigned nr, 559 void (*queue_exception)(struct kvm_vcpu *vcpu, unsigned nr,
506 bool has_error_code, u32 error_code, 560 bool has_error_code, u32 error_code,
507 bool reinject); 561 bool reinject);
562 void (*cancel_injection)(struct kvm_vcpu *vcpu);
508 int (*interrupt_allowed)(struct kvm_vcpu *vcpu); 563 int (*interrupt_allowed)(struct kvm_vcpu *vcpu);
509 int (*nmi_allowed)(struct kvm_vcpu *vcpu); 564 int (*nmi_allowed)(struct kvm_vcpu *vcpu);
510 bool (*get_nmi_mask)(struct kvm_vcpu *vcpu); 565 bool (*get_nmi_mask)(struct kvm_vcpu *vcpu);
@@ -517,11 +572,16 @@ struct kvm_x86_ops {
517 u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio); 572 u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
518 int (*get_lpage_level)(void); 573 int (*get_lpage_level)(void);
519 bool (*rdtscp_supported)(void); 574 bool (*rdtscp_supported)(void);
575 void (*adjust_tsc_offset)(struct kvm_vcpu *vcpu, s64 adjustment);
576
577 void (*set_tdp_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3);
520 578
521 void (*set_supported_cpuid)(u32 func, struct kvm_cpuid_entry2 *entry); 579 void (*set_supported_cpuid)(u32 func, struct kvm_cpuid_entry2 *entry);
522 580
523 bool (*has_wbinvd_exit)(void); 581 bool (*has_wbinvd_exit)(void);
524 582
583 void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset);
584
525 const struct trace_print_flags *exit_reasons_str; 585 const struct trace_print_flags *exit_reasons_str;
526}; 586};
527 587
@@ -544,7 +604,7 @@ void kvm_mmu_zap_all(struct kvm *kvm);
544unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm); 604unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm);
545void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages); 605void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages);
546 606
547int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3); 607int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3);
548 608
549int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, 609int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
550 const void *val, int bytes); 610 const void *val, int bytes);
@@ -608,8 +668,11 @@ void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr);
608void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code); 668void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code);
609void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr); 669void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr);
610void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code); 670void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code);
611void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long cr2, 671void kvm_inject_page_fault(struct kvm_vcpu *vcpu);
612 u32 error_code); 672int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
673 gfn_t gfn, void *data, int offset, int len,
674 u32 access);
675void kvm_propagate_fault(struct kvm_vcpu *vcpu);
613bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl); 676bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl);
614 677
615int kvm_pic_set_irq(void *opaque, int irq, int level); 678int kvm_pic_set_irq(void *opaque, int irq, int level);
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index 05eba5e9a8e8..7b562b6184bc 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -158,6 +158,12 @@ static inline unsigned int kvm_arch_para_features(void)
158 return cpuid_eax(KVM_CPUID_FEATURES); 158 return cpuid_eax(KVM_CPUID_FEATURES);
159} 159}
160 160
161#ifdef CONFIG_KVM_GUEST
162void __init kvm_guest_init(void);
163#else
164#define kvm_guest_init() do { } while (0)
161#endif 165#endif
162 166
167#endif /* __KERNEL__ */
168
163#endif /* _ASM_X86_KVM_PARA_H */ 169#endif /* _ASM_X86_KVM_PARA_H */
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 986f7790fdb2..83c4bb1d917d 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -198,6 +198,7 @@
198#define MSR_IA32_TSC 0x00000010 198#define MSR_IA32_TSC 0x00000010
199#define MSR_IA32_PLATFORM_ID 0x00000017 199#define MSR_IA32_PLATFORM_ID 0x00000017
200#define MSR_IA32_EBL_CR_POWERON 0x0000002a 200#define MSR_IA32_EBL_CR_POWERON 0x0000002a
201#define MSR_EBC_FREQUENCY_ID 0x0000002c
201#define MSR_IA32_FEATURE_CONTROL 0x0000003a 202#define MSR_IA32_FEATURE_CONTROL 0x0000003a
202 203
203#define FEATURE_CONTROL_LOCKED (1<<0) 204#define FEATURE_CONTROL_LOCKED (1<<0)
diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h
index cd02f324aa6b..7f7e577a0e39 100644
--- a/arch/x86/include/asm/pvclock.h
+++ b/arch/x86/include/asm/pvclock.h
@@ -12,4 +12,42 @@ void pvclock_read_wallclock(struct pvclock_wall_clock *wall,
12 struct pvclock_vcpu_time_info *vcpu, 12 struct pvclock_vcpu_time_info *vcpu,
13 struct timespec *ts); 13 struct timespec *ts);
14 14
15/*
16 * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction,
17 * yielding a 64-bit result.
18 */
19static inline u64 pvclock_scale_delta(u64 delta, u32 mul_frac, int shift)
20{
21 u64 product;
22#ifdef __i386__
23 u32 tmp1, tmp2;
24#endif
25
26 if (shift < 0)
27 delta >>= -shift;
28 else
29 delta <<= shift;
30
31#ifdef __i386__
32 __asm__ (
33 "mul %5 ; "
34 "mov %4,%%eax ; "
35 "mov %%edx,%4 ; "
36 "mul %5 ; "
37 "xor %5,%5 ; "
38 "add %4,%%eax ; "
39 "adc %5,%%edx ; "
40 : "=A" (product), "=r" (tmp1), "=r" (tmp2)
41 : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) );
42#elif defined(__x86_64__)
43 __asm__ (
44 "mul %%rdx ; shrd $32,%%rdx,%%rax"
45 : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) );
46#else
47#error implement me!
48#endif
49
50 return product;
51}
52
15#endif /* _ASM_X86_PVCLOCK_H */ 53#endif /* _ASM_X86_PVCLOCK_H */
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index eb9b76c716c2..ca43ce31a19c 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -128,13 +128,15 @@ static struct clocksource kvm_clock = {
128static int kvm_register_clock(char *txt) 128static int kvm_register_clock(char *txt)
129{ 129{
130 int cpu = smp_processor_id(); 130 int cpu = smp_processor_id();
131 int low, high; 131 int low, high, ret;
132
132 low = (int)__pa(&per_cpu(hv_clock, cpu)) | 1; 133 low = (int)__pa(&per_cpu(hv_clock, cpu)) | 1;
133 high = ((u64)__pa(&per_cpu(hv_clock, cpu)) >> 32); 134 high = ((u64)__pa(&per_cpu(hv_clock, cpu)) >> 32);
135 ret = native_write_msr_safe(msr_kvm_system_time, low, high);
134 printk(KERN_INFO "kvm-clock: cpu %d, msr %x:%x, %s\n", 136 printk(KERN_INFO "kvm-clock: cpu %d, msr %x:%x, %s\n",
135 cpu, high, low, txt); 137 cpu, high, low, txt);
136 138
137 return native_write_msr_safe(msr_kvm_system_time, low, high); 139 return ret;
138} 140}
139 141
140#ifdef CONFIG_X86_LOCAL_APIC 142#ifdef CONFIG_X86_LOCAL_APIC
diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
index 239427ca02af..bab3b9e6f66d 100644
--- a/arch/x86/kernel/pvclock.c
+++ b/arch/x86/kernel/pvclock.c
@@ -82,7 +82,8 @@ static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift)
82static u64 pvclock_get_nsec_offset(struct pvclock_shadow_time *shadow) 82static u64 pvclock_get_nsec_offset(struct pvclock_shadow_time *shadow)
83{ 83{
84 u64 delta = native_read_tsc() - shadow->tsc_timestamp; 84 u64 delta = native_read_tsc() - shadow->tsc_timestamp;
85 return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift); 85 return pvclock_scale_delta(delta, shadow->tsc_to_nsec_mul,
86 shadow->tsc_shift);
86} 87}
87 88
88/* 89/*
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 970bbd479516..ddc131ff438f 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -64,6 +64,13 @@ config KVM_AMD
64 To compile this as a module, choose M here: the module 64 To compile this as a module, choose M here: the module
65 will be called kvm-amd. 65 will be called kvm-amd.
66 66
67config KVM_MMU_AUDIT
68 bool "Audit KVM MMU"
69 depends on KVM && TRACEPOINTS
70 ---help---
71 This option adds a R/W kVM module parameter 'mmu_audit', which allows
72 audit KVM MMU at runtime.
73
67# OK, it's a little counter-intuitive to do this, but it puts it neatly under 74# OK, it's a little counter-intuitive to do this, but it puts it neatly under
68# the virtualization menu. 75# the virtualization menu.
69source drivers/vhost/Kconfig 76source drivers/vhost/Kconfig
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 66ca98aafdd6..38b6e8dafaff 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -9,7 +9,7 @@
9 * privileged instructions: 9 * privileged instructions:
10 * 10 *
11 * Copyright (C) 2006 Qumranet 11 * Copyright (C) 2006 Qumranet
12 * Copyright 2010 Red Hat, Inc. and/or its affilates. 12 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
13 * 13 *
14 * Avi Kivity <avi@qumranet.com> 14 * Avi Kivity <avi@qumranet.com>
15 * Yaniv Kamay <yaniv@qumranet.com> 15 * Yaniv Kamay <yaniv@qumranet.com>
@@ -51,13 +51,13 @@
51#define ImplicitOps (1<<1) /* Implicit in opcode. No generic decode. */ 51#define ImplicitOps (1<<1) /* Implicit in opcode. No generic decode. */
52#define DstReg (2<<1) /* Register operand. */ 52#define DstReg (2<<1) /* Register operand. */
53#define DstMem (3<<1) /* Memory operand. */ 53#define DstMem (3<<1) /* Memory operand. */
54#define DstAcc (4<<1) /* Destination Accumulator */ 54#define DstAcc (4<<1) /* Destination Accumulator */
55#define DstDI (5<<1) /* Destination is in ES:(E)DI */ 55#define DstDI (5<<1) /* Destination is in ES:(E)DI */
56#define DstMem64 (6<<1) /* 64bit memory operand */ 56#define DstMem64 (6<<1) /* 64bit memory operand */
57#define DstImmUByte (7<<1) /* 8-bit unsigned immediate operand */
57#define DstMask (7<<1) 58#define DstMask (7<<1)
58/* Source operand type. */ 59/* Source operand type. */
59#define SrcNone (0<<4) /* No source operand. */ 60#define SrcNone (0<<4) /* No source operand. */
60#define SrcImplicit (0<<4) /* Source operand is implicit in the opcode. */
61#define SrcReg (1<<4) /* Register operand. */ 61#define SrcReg (1<<4) /* Register operand. */
62#define SrcMem (2<<4) /* Memory operand. */ 62#define SrcMem (2<<4) /* Memory operand. */
63#define SrcMem16 (3<<4) /* Memory operand (16-bit). */ 63#define SrcMem16 (3<<4) /* Memory operand (16-bit). */
@@ -71,6 +71,7 @@
71#define SrcImmFAddr (0xb<<4) /* Source is immediate far address */ 71#define SrcImmFAddr (0xb<<4) /* Source is immediate far address */
72#define SrcMemFAddr (0xc<<4) /* Source is far address in memory */ 72#define SrcMemFAddr (0xc<<4) /* Source is far address in memory */
73#define SrcAcc (0xd<<4) /* Source Accumulator */ 73#define SrcAcc (0xd<<4) /* Source Accumulator */
74#define SrcImmU16 (0xe<<4) /* Immediate operand, unsigned, 16 bits */
74#define SrcMask (0xf<<4) 75#define SrcMask (0xf<<4)
75/* Generic ModRM decode. */ 76/* Generic ModRM decode. */
76#define ModRM (1<<8) 77#define ModRM (1<<8)
@@ -82,8 +83,10 @@
82#define Stack (1<<13) /* Stack instruction (push/pop) */ 83#define Stack (1<<13) /* Stack instruction (push/pop) */
83#define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */ 84#define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */
84#define GroupDual (1<<15) /* Alternate decoding of mod == 3 */ 85#define GroupDual (1<<15) /* Alternate decoding of mod == 3 */
85#define GroupMask 0xff /* Group number stored in bits 0:7 */
86/* Misc flags */ 86/* Misc flags */
87#define NoAccess (1<<23) /* Don't access memory (lea/invlpg/verr etc) */
88#define Op3264 (1<<24) /* Operand is 64b in long mode, 32b otherwise */
89#define Undefined (1<<25) /* No Such Instruction */
87#define Lock (1<<26) /* lock prefix is allowed for the instruction */ 90#define Lock (1<<26) /* lock prefix is allowed for the instruction */
88#define Priv (1<<27) /* instruction generates #GP if current CPL != 0 */ 91#define Priv (1<<27) /* instruction generates #GP if current CPL != 0 */
89#define No64 (1<<28) 92#define No64 (1<<28)
@@ -92,285 +95,30 @@
92#define Src2CL (1<<29) 95#define Src2CL (1<<29)
93#define Src2ImmByte (2<<29) 96#define Src2ImmByte (2<<29)
94#define Src2One (3<<29) 97#define Src2One (3<<29)
98#define Src2Imm (4<<29)
95#define Src2Mask (7<<29) 99#define Src2Mask (7<<29)
96 100
97enum { 101#define X2(x...) x, x
98 Group1_80, Group1_81, Group1_82, Group1_83, 102#define X3(x...) X2(x), x
99 Group1A, Group3_Byte, Group3, Group4, Group5, Group7, 103#define X4(x...) X2(x), X2(x)
100 Group8, Group9, 104#define X5(x...) X4(x), x
105#define X6(x...) X4(x), X2(x)
106#define X7(x...) X4(x), X3(x)
107#define X8(x...) X4(x), X4(x)
108#define X16(x...) X8(x), X8(x)
109
110struct opcode {
111 u32 flags;
112 union {
113 int (*execute)(struct x86_emulate_ctxt *ctxt);
114 struct opcode *group;
115 struct group_dual *gdual;
116 } u;
101}; 117};
102 118
103static u32 opcode_table[256] = { 119struct group_dual {
104 /* 0x00 - 0x07 */ 120 struct opcode mod012[8];
105 ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, 121 struct opcode mod3[8];
106 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
107 ByteOp | DstAcc | SrcImm, DstAcc | SrcImm,
108 ImplicitOps | Stack | No64, ImplicitOps | Stack | No64,
109 /* 0x08 - 0x0F */
110 ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
111 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
112 ByteOp | DstAcc | SrcImm, DstAcc | SrcImm,
113 ImplicitOps | Stack | No64, 0,
114 /* 0x10 - 0x17 */
115 ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
116 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
117 ByteOp | DstAcc | SrcImm, DstAcc | SrcImm,
118 ImplicitOps | Stack | No64, ImplicitOps | Stack | No64,
119 /* 0x18 - 0x1F */
120 ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
121 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
122 ByteOp | DstAcc | SrcImm, DstAcc | SrcImm,
123 ImplicitOps | Stack | No64, ImplicitOps | Stack | No64,
124 /* 0x20 - 0x27 */
125 ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
126 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
127 ByteOp | DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0,
128 /* 0x28 - 0x2F */
129 ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
130 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
131 ByteOp | DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0,
132 /* 0x30 - 0x37 */
133 ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
134 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
135 ByteOp | DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0,
136 /* 0x38 - 0x3F */
137 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
138 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
139 ByteOp | DstAcc | SrcImm, DstAcc | SrcImm,
140 0, 0,
141 /* 0x40 - 0x47 */
142 DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg,
143 /* 0x48 - 0x4F */
144 DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg,
145 /* 0x50 - 0x57 */
146 SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, SrcReg | Stack,
147 SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, SrcReg | Stack,
148 /* 0x58 - 0x5F */
149 DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack,
150 DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack,
151 /* 0x60 - 0x67 */
152 ImplicitOps | Stack | No64, ImplicitOps | Stack | No64,
153 0, DstReg | SrcMem32 | ModRM | Mov /* movsxd (x86/64) */ ,
154 0, 0, 0, 0,
155 /* 0x68 - 0x6F */
156 SrcImm | Mov | Stack, 0, SrcImmByte | Mov | Stack, 0,
157 DstDI | ByteOp | Mov | String, DstDI | Mov | String, /* insb, insw/insd */
158 SrcSI | ByteOp | ImplicitOps | String, SrcSI | ImplicitOps | String, /* outsb, outsw/outsd */
159 /* 0x70 - 0x77 */
160 SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte,
161 SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte,
162 /* 0x78 - 0x7F */
163 SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte,
164 SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte,
165 /* 0x80 - 0x87 */
166 Group | Group1_80, Group | Group1_81,
167 Group | Group1_82, Group | Group1_83,
168 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
169 ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
170 /* 0x88 - 0x8F */
171 ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov,
172 ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
173 DstMem | SrcNone | ModRM | Mov, ModRM | DstReg,
174 ImplicitOps | SrcMem16 | ModRM, Group | Group1A,
175 /* 0x90 - 0x97 */
176 DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg,
177 /* 0x98 - 0x9F */
178 0, 0, SrcImmFAddr | No64, 0,
179 ImplicitOps | Stack, ImplicitOps | Stack, 0, 0,
180 /* 0xA0 - 0xA7 */
181 ByteOp | DstAcc | SrcMem | Mov | MemAbs, DstAcc | SrcMem | Mov | MemAbs,
182 ByteOp | DstMem | SrcAcc | Mov | MemAbs, DstMem | SrcAcc | Mov | MemAbs,
183 ByteOp | SrcSI | DstDI | Mov | String, SrcSI | DstDI | Mov | String,
184 ByteOp | SrcSI | DstDI | String, SrcSI | DstDI | String,
185 /* 0xA8 - 0xAF */
186 DstAcc | SrcImmByte | ByteOp, DstAcc | SrcImm, ByteOp | DstDI | Mov | String, DstDI | Mov | String,
187 ByteOp | SrcSI | DstAcc | Mov | String, SrcSI | DstAcc | Mov | String,
188 ByteOp | DstDI | String, DstDI | String,
189 /* 0xB0 - 0xB7 */
190 ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov,
191 ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov,
192 ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov,
193 ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov,
194 /* 0xB8 - 0xBF */
195 DstReg | SrcImm | Mov, DstReg | SrcImm | Mov,
196 DstReg | SrcImm | Mov, DstReg | SrcImm | Mov,
197 DstReg | SrcImm | Mov, DstReg | SrcImm | Mov,
198 DstReg | SrcImm | Mov, DstReg | SrcImm | Mov,
199 /* 0xC0 - 0xC7 */
200 ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM,
201 0, ImplicitOps | Stack, 0, 0,
202 ByteOp | DstMem | SrcImm | ModRM | Mov, DstMem | SrcImm | ModRM | Mov,
203 /* 0xC8 - 0xCF */
204 0, 0, 0, ImplicitOps | Stack,
205 ImplicitOps, SrcImmByte, ImplicitOps | No64, ImplicitOps,
206 /* 0xD0 - 0xD7 */
207 ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM,
208 ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM,
209 0, 0, 0, 0,
210 /* 0xD8 - 0xDF */
211 0, 0, 0, 0, 0, 0, 0, 0,
212 /* 0xE0 - 0xE7 */
213 0, 0, 0, 0,
214 ByteOp | SrcImmUByte | DstAcc, SrcImmUByte | DstAcc,
215 ByteOp | SrcImmUByte | DstAcc, SrcImmUByte | DstAcc,
216 /* 0xE8 - 0xEF */
217 SrcImm | Stack, SrcImm | ImplicitOps,
218 SrcImmFAddr | No64, SrcImmByte | ImplicitOps,
219 SrcNone | ByteOp | DstAcc, SrcNone | DstAcc,
220 SrcNone | ByteOp | DstAcc, SrcNone | DstAcc,
221 /* 0xF0 - 0xF7 */
222 0, 0, 0, 0,
223 ImplicitOps | Priv, ImplicitOps, Group | Group3_Byte, Group | Group3,
224 /* 0xF8 - 0xFF */
225 ImplicitOps, 0, ImplicitOps, ImplicitOps,
226 ImplicitOps, ImplicitOps, Group | Group4, Group | Group5,
227};
228
229static u32 twobyte_table[256] = {
230 /* 0x00 - 0x0F */
231 0, Group | GroupDual | Group7, 0, 0,
232 0, ImplicitOps, ImplicitOps | Priv, 0,
233 ImplicitOps | Priv, ImplicitOps | Priv, 0, 0,
234 0, ImplicitOps | ModRM, 0, 0,
235 /* 0x10 - 0x1F */
236 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, 0,
237 /* 0x20 - 0x2F */
238 ModRM | ImplicitOps | Priv, ModRM | Priv,
239 ModRM | ImplicitOps | Priv, ModRM | Priv,
240 0, 0, 0, 0,
241 0, 0, 0, 0, 0, 0, 0, 0,
242 /* 0x30 - 0x3F */
243 ImplicitOps | Priv, 0, ImplicitOps | Priv, 0,
244 ImplicitOps, ImplicitOps | Priv, 0, 0,
245 0, 0, 0, 0, 0, 0, 0, 0,
246 /* 0x40 - 0x47 */
247 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
248 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
249 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
250 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
251 /* 0x48 - 0x4F */
252 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
253 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
254 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
255 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
256 /* 0x50 - 0x5F */
257 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
258 /* 0x60 - 0x6F */
259 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
260 /* 0x70 - 0x7F */
261 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
262 /* 0x80 - 0x8F */
263 SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm,
264 SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm,
265 /* 0x90 - 0x9F */
266 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
267 /* 0xA0 - 0xA7 */
268 ImplicitOps | Stack, ImplicitOps | Stack,
269 0, DstMem | SrcReg | ModRM | BitOp,
270 DstMem | SrcReg | Src2ImmByte | ModRM,
271 DstMem | SrcReg | Src2CL | ModRM, 0, 0,
272 /* 0xA8 - 0xAF */
273 ImplicitOps | Stack, ImplicitOps | Stack,
274 0, DstMem | SrcReg | ModRM | BitOp | Lock,
275 DstMem | SrcReg | Src2ImmByte | ModRM,
276 DstMem | SrcReg | Src2CL | ModRM,
277 ModRM, 0,
278 /* 0xB0 - 0xB7 */
279 ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
280 0, DstMem | SrcReg | ModRM | BitOp | Lock,
281 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov,
282 DstReg | SrcMem16 | ModRM | Mov,
283 /* 0xB8 - 0xBF */
284 0, 0,
285 Group | Group8, DstMem | SrcReg | ModRM | BitOp | Lock,
286 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov,
287 DstReg | SrcMem16 | ModRM | Mov,
288 /* 0xC0 - 0xCF */
289 0, 0, 0, DstMem | SrcReg | ModRM | Mov,
290 0, 0, 0, Group | GroupDual | Group9,
291 0, 0, 0, 0, 0, 0, 0, 0,
292 /* 0xD0 - 0xDF */
293 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
294 /* 0xE0 - 0xEF */
295 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
296 /* 0xF0 - 0xFF */
297 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
298};
299
300static u32 group_table[] = {
301 [Group1_80*8] =
302 ByteOp | DstMem | SrcImm | ModRM | Lock,
303 ByteOp | DstMem | SrcImm | ModRM | Lock,
304 ByteOp | DstMem | SrcImm | ModRM | Lock,
305 ByteOp | DstMem | SrcImm | ModRM | Lock,
306 ByteOp | DstMem | SrcImm | ModRM | Lock,
307 ByteOp | DstMem | SrcImm | ModRM | Lock,
308 ByteOp | DstMem | SrcImm | ModRM | Lock,
309 ByteOp | DstMem | SrcImm | ModRM,
310 [Group1_81*8] =
311 DstMem | SrcImm | ModRM | Lock,
312 DstMem | SrcImm | ModRM | Lock,
313 DstMem | SrcImm | ModRM | Lock,
314 DstMem | SrcImm | ModRM | Lock,
315 DstMem | SrcImm | ModRM | Lock,
316 DstMem | SrcImm | ModRM | Lock,
317 DstMem | SrcImm | ModRM | Lock,
318 DstMem | SrcImm | ModRM,
319 [Group1_82*8] =
320 ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
321 ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
322 ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
323 ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
324 ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
325 ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
326 ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
327 ByteOp | DstMem | SrcImm | ModRM | No64,
328 [Group1_83*8] =
329 DstMem | SrcImmByte | ModRM | Lock,
330 DstMem | SrcImmByte | ModRM | Lock,
331 DstMem | SrcImmByte | ModRM | Lock,
332 DstMem | SrcImmByte | ModRM | Lock,
333 DstMem | SrcImmByte | ModRM | Lock,
334 DstMem | SrcImmByte | ModRM | Lock,
335 DstMem | SrcImmByte | ModRM | Lock,
336 DstMem | SrcImmByte | ModRM,
337 [Group1A*8] =
338 DstMem | SrcNone | ModRM | Mov | Stack, 0, 0, 0, 0, 0, 0, 0,
339 [Group3_Byte*8] =
340 ByteOp | SrcImm | DstMem | ModRM, ByteOp | SrcImm | DstMem | ModRM,
341 ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM,
342 0, 0, 0, 0,
343 [Group3*8] =
344 DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM,
345 DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM,
346 0, 0, 0, 0,
347 [Group4*8] =
348 ByteOp | DstMem | SrcNone | ModRM | Lock, ByteOp | DstMem | SrcNone | ModRM | Lock,
349 0, 0, 0, 0, 0, 0,
350 [Group5*8] =
351 DstMem | SrcNone | ModRM | Lock, DstMem | SrcNone | ModRM | Lock,
352 SrcMem | ModRM | Stack, 0,
353 SrcMem | ModRM | Stack, SrcMemFAddr | ModRM | ImplicitOps,
354 SrcMem | ModRM | Stack, 0,
355 [Group7*8] =
356 0, 0, ModRM | SrcMem | Priv, ModRM | SrcMem | Priv,
357 SrcNone | ModRM | DstMem | Mov, 0,
358 SrcMem16 | ModRM | Mov | Priv, SrcMem | ModRM | ByteOp | Priv,
359 [Group8*8] =
360 0, 0, 0, 0,
361 DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM | Lock,
362 DstMem | SrcImmByte | ModRM | Lock, DstMem | SrcImmByte | ModRM | Lock,
363 [Group9*8] =
364 0, DstMem64 | ModRM | Lock, 0, 0, 0, 0, 0, 0,
365};
366
367static u32 group2_table[] = {
368 [Group7*8] =
369 SrcNone | ModRM | Priv, 0, 0, SrcNone | ModRM | Priv,
370 SrcNone | ModRM | DstMem | Mov, 0,
371 SrcMem16 | ModRM | Mov | Priv, 0,
372 [Group9*8] =
373 0, 0, 0, 0, 0, 0, 0, 0,
374}; 122};
375 123
376/* EFLAGS bit definitions. */ 124/* EFLAGS bit definitions. */
@@ -392,6 +140,9 @@ static u32 group2_table[] = {
392#define EFLG_PF (1<<2) 140#define EFLG_PF (1<<2)
393#define EFLG_CF (1<<0) 141#define EFLG_CF (1<<0)
394 142
143#define EFLG_RESERVED_ZEROS_MASK 0xffc0802a
144#define EFLG_RESERVED_ONE_MASK 2
145
395/* 146/*
396 * Instruction emulation: 147 * Instruction emulation:
397 * Most instructions are emulated directly via a fragment of inline assembly 148 * Most instructions are emulated directly via a fragment of inline assembly
@@ -444,13 +195,13 @@ static u32 group2_table[] = {
444#define ON64(x) 195#define ON64(x)
445#endif 196#endif
446 197
447#define ____emulate_2op(_op, _src, _dst, _eflags, _x, _y, _suffix) \ 198#define ____emulate_2op(_op, _src, _dst, _eflags, _x, _y, _suffix, _dsttype) \
448 do { \ 199 do { \
449 __asm__ __volatile__ ( \ 200 __asm__ __volatile__ ( \
450 _PRE_EFLAGS("0", "4", "2") \ 201 _PRE_EFLAGS("0", "4", "2") \
451 _op _suffix " %"_x"3,%1; " \ 202 _op _suffix " %"_x"3,%1; " \
452 _POST_EFLAGS("0", "4", "2") \ 203 _POST_EFLAGS("0", "4", "2") \
453 : "=m" (_eflags), "=m" ((_dst).val), \ 204 : "=m" (_eflags), "+q" (*(_dsttype*)&(_dst).val),\
454 "=&r" (_tmp) \ 205 "=&r" (_tmp) \
455 : _y ((_src).val), "i" (EFLAGS_MASK)); \ 206 : _y ((_src).val), "i" (EFLAGS_MASK)); \
456 } while (0) 207 } while (0)
@@ -463,13 +214,13 @@ static u32 group2_table[] = {
463 \ 214 \
464 switch ((_dst).bytes) { \ 215 switch ((_dst).bytes) { \
465 case 2: \ 216 case 2: \
466 ____emulate_2op(_op,_src,_dst,_eflags,_wx,_wy,"w"); \ 217 ____emulate_2op(_op,_src,_dst,_eflags,_wx,_wy,"w",u16);\
467 break; \ 218 break; \
468 case 4: \ 219 case 4: \
469 ____emulate_2op(_op,_src,_dst,_eflags,_lx,_ly,"l"); \ 220 ____emulate_2op(_op,_src,_dst,_eflags,_lx,_ly,"l",u32);\
470 break; \ 221 break; \
471 case 8: \ 222 case 8: \
472 ON64(____emulate_2op(_op,_src,_dst,_eflags,_qx,_qy,"q")); \ 223 ON64(____emulate_2op(_op,_src,_dst,_eflags,_qx,_qy,"q",u64)); \
473 break; \ 224 break; \
474 } \ 225 } \
475 } while (0) 226 } while (0)
@@ -479,7 +230,7 @@ static u32 group2_table[] = {
479 unsigned long _tmp; \ 230 unsigned long _tmp; \
480 switch ((_dst).bytes) { \ 231 switch ((_dst).bytes) { \
481 case 1: \ 232 case 1: \
482 ____emulate_2op(_op,_src,_dst,_eflags,_bx,_by,"b"); \ 233 ____emulate_2op(_op,_src,_dst,_eflags,_bx,_by,"b",u8); \
483 break; \ 234 break; \
484 default: \ 235 default: \
485 __emulate_2op_nobyte(_op, _src, _dst, _eflags, \ 236 __emulate_2op_nobyte(_op, _src, _dst, _eflags, \
@@ -566,6 +317,74 @@ static u32 group2_table[] = {
566 } \ 317 } \
567 } while (0) 318 } while (0)
568 319
320#define __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags, _suffix) \
321 do { \
322 unsigned long _tmp; \
323 \
324 __asm__ __volatile__ ( \
325 _PRE_EFLAGS("0", "4", "1") \
326 _op _suffix " %5; " \
327 _POST_EFLAGS("0", "4", "1") \
328 : "=m" (_eflags), "=&r" (_tmp), \
329 "+a" (_rax), "+d" (_rdx) \
330 : "i" (EFLAGS_MASK), "m" ((_src).val), \
331 "a" (_rax), "d" (_rdx)); \
332 } while (0)
333
334#define __emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, _eflags, _suffix, _ex) \
335 do { \
336 unsigned long _tmp; \
337 \
338 __asm__ __volatile__ ( \
339 _PRE_EFLAGS("0", "5", "1") \
340 "1: \n\t" \
341 _op _suffix " %6; " \
342 "2: \n\t" \
343 _POST_EFLAGS("0", "5", "1") \
344 ".pushsection .fixup,\"ax\" \n\t" \
345 "3: movb $1, %4 \n\t" \
346 "jmp 2b \n\t" \
347 ".popsection \n\t" \
348 _ASM_EXTABLE(1b, 3b) \
349 : "=m" (_eflags), "=&r" (_tmp), \
350 "+a" (_rax), "+d" (_rdx), "+qm"(_ex) \
351 : "i" (EFLAGS_MASK), "m" ((_src).val), \
352 "a" (_rax), "d" (_rdx)); \
353 } while (0)
354
355/* instruction has only one source operand, destination is implicit (e.g. mul, div, imul, idiv) */
356#define emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags) \
357 do { \
358 switch((_src).bytes) { \
359 case 1: __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags, "b"); break; \
360 case 2: __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags, "w"); break; \
361 case 4: __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags, "l"); break; \
362 case 8: ON64(__emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags, "q")); break; \
363 } \
364 } while (0)
365
366#define emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, _eflags, _ex) \
367 do { \
368 switch((_src).bytes) { \
369 case 1: \
370 __emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, \
371 _eflags, "b", _ex); \
372 break; \
373 case 2: \
374 __emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, \
375 _eflags, "w", _ex); \
376 break; \
377 case 4: \
378 __emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, \
379 _eflags, "l", _ex); \
380 break; \
381 case 8: ON64( \
382 __emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, \
383 _eflags, "q", _ex)); \
384 break; \
385 } \
386 } while (0)
387
569/* Fetch next part of the instruction being emulated. */ 388/* Fetch next part of the instruction being emulated. */
570#define insn_fetch(_type, _size, _eip) \ 389#define insn_fetch(_type, _size, _eip) \
571({ unsigned long _x; \ 390({ unsigned long _x; \
@@ -661,7 +480,6 @@ static void emulate_exception(struct x86_emulate_ctxt *ctxt, int vec,
661 ctxt->exception = vec; 480 ctxt->exception = vec;
662 ctxt->error_code = error; 481 ctxt->error_code = error;
663 ctxt->error_code_valid = valid; 482 ctxt->error_code_valid = valid;
664 ctxt->restart = false;
665} 483}
666 484
667static void emulate_gp(struct x86_emulate_ctxt *ctxt, int err) 485static void emulate_gp(struct x86_emulate_ctxt *ctxt, int err)
@@ -669,11 +487,9 @@ static void emulate_gp(struct x86_emulate_ctxt *ctxt, int err)
669 emulate_exception(ctxt, GP_VECTOR, err, true); 487 emulate_exception(ctxt, GP_VECTOR, err, true);
670} 488}
671 489
672static void emulate_pf(struct x86_emulate_ctxt *ctxt, unsigned long addr, 490static void emulate_pf(struct x86_emulate_ctxt *ctxt)
673 int err)
674{ 491{
675 ctxt->cr2 = addr; 492 emulate_exception(ctxt, PF_VECTOR, 0, true);
676 emulate_exception(ctxt, PF_VECTOR, err, true);
677} 493}
678 494
679static void emulate_ud(struct x86_emulate_ctxt *ctxt) 495static void emulate_ud(struct x86_emulate_ctxt *ctxt)
@@ -686,6 +502,12 @@ static void emulate_ts(struct x86_emulate_ctxt *ctxt, int err)
686 emulate_exception(ctxt, TS_VECTOR, err, true); 502 emulate_exception(ctxt, TS_VECTOR, err, true);
687} 503}
688 504
505static int emulate_de(struct x86_emulate_ctxt *ctxt)
506{
507 emulate_exception(ctxt, DE_VECTOR, 0, false);
508 return X86EMUL_PROPAGATE_FAULT;
509}
510
689static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt, 511static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt,
690 struct x86_emulate_ops *ops, 512 struct x86_emulate_ops *ops,
691 unsigned long eip, u8 *dest) 513 unsigned long eip, u8 *dest)
@@ -742,7 +564,7 @@ static void *decode_register(u8 modrm_reg, unsigned long *regs,
742 564
743static int read_descriptor(struct x86_emulate_ctxt *ctxt, 565static int read_descriptor(struct x86_emulate_ctxt *ctxt,
744 struct x86_emulate_ops *ops, 566 struct x86_emulate_ops *ops,
745 void *ptr, 567 ulong addr,
746 u16 *size, unsigned long *address, int op_bytes) 568 u16 *size, unsigned long *address, int op_bytes)
747{ 569{
748 int rc; 570 int rc;
@@ -750,12 +572,10 @@ static int read_descriptor(struct x86_emulate_ctxt *ctxt,
750 if (op_bytes == 2) 572 if (op_bytes == 2)
751 op_bytes = 3; 573 op_bytes = 3;
752 *address = 0; 574 *address = 0;
753 rc = ops->read_std((unsigned long)ptr, (unsigned long *)size, 2, 575 rc = ops->read_std(addr, (unsigned long *)size, 2, ctxt->vcpu, NULL);
754 ctxt->vcpu, NULL);
755 if (rc != X86EMUL_CONTINUE) 576 if (rc != X86EMUL_CONTINUE)
756 return rc; 577 return rc;
757 rc = ops->read_std((unsigned long)ptr + 2, address, op_bytes, 578 rc = ops->read_std(addr + 2, address, op_bytes, ctxt->vcpu, NULL);
758 ctxt->vcpu, NULL);
759 return rc; 579 return rc;
760} 580}
761 581
@@ -794,6 +614,24 @@ static int test_cc(unsigned int condition, unsigned int flags)
794 return (!!rc ^ (condition & 1)); 614 return (!!rc ^ (condition & 1));
795} 615}
796 616
617static void fetch_register_operand(struct operand *op)
618{
619 switch (op->bytes) {
620 case 1:
621 op->val = *(u8 *)op->addr.reg;
622 break;
623 case 2:
624 op->val = *(u16 *)op->addr.reg;
625 break;
626 case 4:
627 op->val = *(u32 *)op->addr.reg;
628 break;
629 case 8:
630 op->val = *(u64 *)op->addr.reg;
631 break;
632 }
633}
634
797static void decode_register_operand(struct operand *op, 635static void decode_register_operand(struct operand *op,
798 struct decode_cache *c, 636 struct decode_cache *c,
799 int inhibit_bytereg) 637 int inhibit_bytereg)
@@ -805,34 +643,25 @@ static void decode_register_operand(struct operand *op,
805 reg = (c->b & 7) | ((c->rex_prefix & 1) << 3); 643 reg = (c->b & 7) | ((c->rex_prefix & 1) << 3);
806 op->type = OP_REG; 644 op->type = OP_REG;
807 if ((c->d & ByteOp) && !inhibit_bytereg) { 645 if ((c->d & ByteOp) && !inhibit_bytereg) {
808 op->ptr = decode_register(reg, c->regs, highbyte_regs); 646 op->addr.reg = decode_register(reg, c->regs, highbyte_regs);
809 op->val = *(u8 *)op->ptr;
810 op->bytes = 1; 647 op->bytes = 1;
811 } else { 648 } else {
812 op->ptr = decode_register(reg, c->regs, 0); 649 op->addr.reg = decode_register(reg, c->regs, 0);
813 op->bytes = c->op_bytes; 650 op->bytes = c->op_bytes;
814 switch (op->bytes) {
815 case 2:
816 op->val = *(u16 *)op->ptr;
817 break;
818 case 4:
819 op->val = *(u32 *)op->ptr;
820 break;
821 case 8:
822 op->val = *(u64 *) op->ptr;
823 break;
824 }
825 } 651 }
652 fetch_register_operand(op);
826 op->orig_val = op->val; 653 op->orig_val = op->val;
827} 654}
828 655
829static int decode_modrm(struct x86_emulate_ctxt *ctxt, 656static int decode_modrm(struct x86_emulate_ctxt *ctxt,
830 struct x86_emulate_ops *ops) 657 struct x86_emulate_ops *ops,
658 struct operand *op)
831{ 659{
832 struct decode_cache *c = &ctxt->decode; 660 struct decode_cache *c = &ctxt->decode;
833 u8 sib; 661 u8 sib;
834 int index_reg = 0, base_reg = 0, scale; 662 int index_reg = 0, base_reg = 0, scale;
835 int rc = X86EMUL_CONTINUE; 663 int rc = X86EMUL_CONTINUE;
664 ulong modrm_ea = 0;
836 665
837 if (c->rex_prefix) { 666 if (c->rex_prefix) {
838 c->modrm_reg = (c->rex_prefix & 4) << 1; /* REX.R */ 667 c->modrm_reg = (c->rex_prefix & 4) << 1; /* REX.R */
@@ -844,16 +673,19 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
844 c->modrm_mod |= (c->modrm & 0xc0) >> 6; 673 c->modrm_mod |= (c->modrm & 0xc0) >> 6;
845 c->modrm_reg |= (c->modrm & 0x38) >> 3; 674 c->modrm_reg |= (c->modrm & 0x38) >> 3;
846 c->modrm_rm |= (c->modrm & 0x07); 675 c->modrm_rm |= (c->modrm & 0x07);
847 c->modrm_ea = 0; 676 c->modrm_seg = VCPU_SREG_DS;
848 c->use_modrm_ea = 1;
849 677
850 if (c->modrm_mod == 3) { 678 if (c->modrm_mod == 3) {
851 c->modrm_ptr = decode_register(c->modrm_rm, 679 op->type = OP_REG;
680 op->bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
681 op->addr.reg = decode_register(c->modrm_rm,
852 c->regs, c->d & ByteOp); 682 c->regs, c->d & ByteOp);
853 c->modrm_val = *(unsigned long *)c->modrm_ptr; 683 fetch_register_operand(op);
854 return rc; 684 return rc;
855 } 685 }
856 686
687 op->type = OP_MEM;
688
857 if (c->ad_bytes == 2) { 689 if (c->ad_bytes == 2) {
858 unsigned bx = c->regs[VCPU_REGS_RBX]; 690 unsigned bx = c->regs[VCPU_REGS_RBX];
859 unsigned bp = c->regs[VCPU_REGS_RBP]; 691 unsigned bp = c->regs[VCPU_REGS_RBP];
@@ -864,47 +696,46 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
864 switch (c->modrm_mod) { 696 switch (c->modrm_mod) {
865 case 0: 697 case 0:
866 if (c->modrm_rm == 6) 698 if (c->modrm_rm == 6)
867 c->modrm_ea += insn_fetch(u16, 2, c->eip); 699 modrm_ea += insn_fetch(u16, 2, c->eip);
868 break; 700 break;
869 case 1: 701 case 1:
870 c->modrm_ea += insn_fetch(s8, 1, c->eip); 702 modrm_ea += insn_fetch(s8, 1, c->eip);
871 break; 703 break;
872 case 2: 704 case 2:
873 c->modrm_ea += insn_fetch(u16, 2, c->eip); 705 modrm_ea += insn_fetch(u16, 2, c->eip);
874 break; 706 break;
875 } 707 }
876 switch (c->modrm_rm) { 708 switch (c->modrm_rm) {
877 case 0: 709 case 0:
878 c->modrm_ea += bx + si; 710 modrm_ea += bx + si;
879 break; 711 break;
880 case 1: 712 case 1:
881 c->modrm_ea += bx + di; 713 modrm_ea += bx + di;
882 break; 714 break;
883 case 2: 715 case 2:
884 c->modrm_ea += bp + si; 716 modrm_ea += bp + si;
885 break; 717 break;
886 case 3: 718 case 3:
887 c->modrm_ea += bp + di; 719 modrm_ea += bp + di;
888 break; 720 break;
889 case 4: 721 case 4:
890 c->modrm_ea += si; 722 modrm_ea += si;
891 break; 723 break;
892 case 5: 724 case 5:
893 c->modrm_ea += di; 725 modrm_ea += di;
894 break; 726 break;
895 case 6: 727 case 6:
896 if (c->modrm_mod != 0) 728 if (c->modrm_mod != 0)
897 c->modrm_ea += bp; 729 modrm_ea += bp;
898 break; 730 break;
899 case 7: 731 case 7:
900 c->modrm_ea += bx; 732 modrm_ea += bx;
901 break; 733 break;
902 } 734 }
903 if (c->modrm_rm == 2 || c->modrm_rm == 3 || 735 if (c->modrm_rm == 2 || c->modrm_rm == 3 ||
904 (c->modrm_rm == 6 && c->modrm_mod != 0)) 736 (c->modrm_rm == 6 && c->modrm_mod != 0))
905 if (!c->has_seg_override) 737 c->modrm_seg = VCPU_SREG_SS;
906 set_seg_override(c, VCPU_SREG_SS); 738 modrm_ea = (u16)modrm_ea;
907 c->modrm_ea = (u16)c->modrm_ea;
908 } else { 739 } else {
909 /* 32/64-bit ModR/M decode. */ 740 /* 32/64-bit ModR/M decode. */
910 if ((c->modrm_rm & 7) == 4) { 741 if ((c->modrm_rm & 7) == 4) {
@@ -914,410 +745,74 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
914 scale = sib >> 6; 745 scale = sib >> 6;
915 746
916 if ((base_reg & 7) == 5 && c->modrm_mod == 0) 747 if ((base_reg & 7) == 5 && c->modrm_mod == 0)
917 c->modrm_ea += insn_fetch(s32, 4, c->eip); 748 modrm_ea += insn_fetch(s32, 4, c->eip);
918 else 749 else
919 c->modrm_ea += c->regs[base_reg]; 750 modrm_ea += c->regs[base_reg];
920 if (index_reg != 4) 751 if (index_reg != 4)
921 c->modrm_ea += c->regs[index_reg] << scale; 752 modrm_ea += c->regs[index_reg] << scale;
922 } else if ((c->modrm_rm & 7) == 5 && c->modrm_mod == 0) { 753 } else if ((c->modrm_rm & 7) == 5 && c->modrm_mod == 0) {
923 if (ctxt->mode == X86EMUL_MODE_PROT64) 754 if (ctxt->mode == X86EMUL_MODE_PROT64)
924 c->rip_relative = 1; 755 c->rip_relative = 1;
925 } else 756 } else
926 c->modrm_ea += c->regs[c->modrm_rm]; 757 modrm_ea += c->regs[c->modrm_rm];
927 switch (c->modrm_mod) { 758 switch (c->modrm_mod) {
928 case 0: 759 case 0:
929 if (c->modrm_rm == 5) 760 if (c->modrm_rm == 5)
930 c->modrm_ea += insn_fetch(s32, 4, c->eip); 761 modrm_ea += insn_fetch(s32, 4, c->eip);
931 break; 762 break;
932 case 1: 763 case 1:
933 c->modrm_ea += insn_fetch(s8, 1, c->eip); 764 modrm_ea += insn_fetch(s8, 1, c->eip);
934 break; 765 break;
935 case 2: 766 case 2:
936 c->modrm_ea += insn_fetch(s32, 4, c->eip); 767 modrm_ea += insn_fetch(s32, 4, c->eip);
937 break; 768 break;
938 } 769 }
939 } 770 }
771 op->addr.mem = modrm_ea;
940done: 772done:
941 return rc; 773 return rc;
942} 774}
943 775
944static int decode_abs(struct x86_emulate_ctxt *ctxt, 776static int decode_abs(struct x86_emulate_ctxt *ctxt,
945 struct x86_emulate_ops *ops) 777 struct x86_emulate_ops *ops,
778 struct operand *op)
946{ 779{
947 struct decode_cache *c = &ctxt->decode; 780 struct decode_cache *c = &ctxt->decode;
948 int rc = X86EMUL_CONTINUE; 781 int rc = X86EMUL_CONTINUE;
949 782
783 op->type = OP_MEM;
950 switch (c->ad_bytes) { 784 switch (c->ad_bytes) {
951 case 2: 785 case 2:
952 c->modrm_ea = insn_fetch(u16, 2, c->eip); 786 op->addr.mem = insn_fetch(u16, 2, c->eip);
953 break; 787 break;
954 case 4: 788 case 4:
955 c->modrm_ea = insn_fetch(u32, 4, c->eip); 789 op->addr.mem = insn_fetch(u32, 4, c->eip);
956 break; 790 break;
957 case 8: 791 case 8:
958 c->modrm_ea = insn_fetch(u64, 8, c->eip); 792 op->addr.mem = insn_fetch(u64, 8, c->eip);
959 break; 793 break;
960 } 794 }
961done: 795done:
962 return rc; 796 return rc;
963} 797}
964 798
965int 799static void fetch_bit_operand(struct decode_cache *c)
966x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
967{ 800{
968 struct decode_cache *c = &ctxt->decode; 801 long sv = 0, mask;
969 int rc = X86EMUL_CONTINUE;
970 int mode = ctxt->mode;
971 int def_op_bytes, def_ad_bytes, group;
972
973
974 /* we cannot decode insn before we complete previous rep insn */
975 WARN_ON(ctxt->restart);
976
977 c->eip = ctxt->eip;
978 c->fetch.start = c->fetch.end = c->eip;
979 ctxt->cs_base = seg_base(ctxt, ops, VCPU_SREG_CS);
980
981 switch (mode) {
982 case X86EMUL_MODE_REAL:
983 case X86EMUL_MODE_VM86:
984 case X86EMUL_MODE_PROT16:
985 def_op_bytes = def_ad_bytes = 2;
986 break;
987 case X86EMUL_MODE_PROT32:
988 def_op_bytes = def_ad_bytes = 4;
989 break;
990#ifdef CONFIG_X86_64
991 case X86EMUL_MODE_PROT64:
992 def_op_bytes = 4;
993 def_ad_bytes = 8;
994 break;
995#endif
996 default:
997 return -1;
998 }
999
1000 c->op_bytes = def_op_bytes;
1001 c->ad_bytes = def_ad_bytes;
1002
1003 /* Legacy prefixes. */
1004 for (;;) {
1005 switch (c->b = insn_fetch(u8, 1, c->eip)) {
1006 case 0x66: /* operand-size override */
1007 /* switch between 2/4 bytes */
1008 c->op_bytes = def_op_bytes ^ 6;
1009 break;
1010 case 0x67: /* address-size override */
1011 if (mode == X86EMUL_MODE_PROT64)
1012 /* switch between 4/8 bytes */
1013 c->ad_bytes = def_ad_bytes ^ 12;
1014 else
1015 /* switch between 2/4 bytes */
1016 c->ad_bytes = def_ad_bytes ^ 6;
1017 break;
1018 case 0x26: /* ES override */
1019 case 0x2e: /* CS override */
1020 case 0x36: /* SS override */
1021 case 0x3e: /* DS override */
1022 set_seg_override(c, (c->b >> 3) & 3);
1023 break;
1024 case 0x64: /* FS override */
1025 case 0x65: /* GS override */
1026 set_seg_override(c, c->b & 7);
1027 break;
1028 case 0x40 ... 0x4f: /* REX */
1029 if (mode != X86EMUL_MODE_PROT64)
1030 goto done_prefixes;
1031 c->rex_prefix = c->b;
1032 continue;
1033 case 0xf0: /* LOCK */
1034 c->lock_prefix = 1;
1035 break;
1036 case 0xf2: /* REPNE/REPNZ */
1037 c->rep_prefix = REPNE_PREFIX;
1038 break;
1039 case 0xf3: /* REP/REPE/REPZ */
1040 c->rep_prefix = REPE_PREFIX;
1041 break;
1042 default:
1043 goto done_prefixes;
1044 }
1045
1046 /* Any legacy prefix after a REX prefix nullifies its effect. */
1047
1048 c->rex_prefix = 0;
1049 }
1050
1051done_prefixes:
1052
1053 /* REX prefix. */
1054 if (c->rex_prefix)
1055 if (c->rex_prefix & 8)
1056 c->op_bytes = 8; /* REX.W */
1057
1058 /* Opcode byte(s). */
1059 c->d = opcode_table[c->b];
1060 if (c->d == 0) {
1061 /* Two-byte opcode? */
1062 if (c->b == 0x0f) {
1063 c->twobyte = 1;
1064 c->b = insn_fetch(u8, 1, c->eip);
1065 c->d = twobyte_table[c->b];
1066 }
1067 }
1068
1069 if (c->d & Group) {
1070 group = c->d & GroupMask;
1071 c->modrm = insn_fetch(u8, 1, c->eip);
1072 --c->eip;
1073
1074 group = (group << 3) + ((c->modrm >> 3) & 7);
1075 if ((c->d & GroupDual) && (c->modrm >> 6) == 3)
1076 c->d = group2_table[group];
1077 else
1078 c->d = group_table[group];
1079 }
1080
1081 /* Unrecognised? */
1082 if (c->d == 0) {
1083 DPRINTF("Cannot emulate %02x\n", c->b);
1084 return -1;
1085 }
1086
1087 if (mode == X86EMUL_MODE_PROT64 && (c->d & Stack))
1088 c->op_bytes = 8;
1089
1090 /* ModRM and SIB bytes. */
1091 if (c->d & ModRM)
1092 rc = decode_modrm(ctxt, ops);
1093 else if (c->d & MemAbs)
1094 rc = decode_abs(ctxt, ops);
1095 if (rc != X86EMUL_CONTINUE)
1096 goto done;
1097
1098 if (!c->has_seg_override)
1099 set_seg_override(c, VCPU_SREG_DS);
1100
1101 if (!(!c->twobyte && c->b == 0x8d))
1102 c->modrm_ea += seg_override_base(ctxt, ops, c);
1103
1104 if (c->ad_bytes != 8)
1105 c->modrm_ea = (u32)c->modrm_ea;
1106
1107 if (c->rip_relative)
1108 c->modrm_ea += c->eip;
1109
1110 /*
1111 * Decode and fetch the source operand: register, memory
1112 * or immediate.
1113 */
1114 switch (c->d & SrcMask) {
1115 case SrcNone:
1116 break;
1117 case SrcReg:
1118 decode_register_operand(&c->src, c, 0);
1119 break;
1120 case SrcMem16:
1121 c->src.bytes = 2;
1122 goto srcmem_common;
1123 case SrcMem32:
1124 c->src.bytes = 4;
1125 goto srcmem_common;
1126 case SrcMem:
1127 c->src.bytes = (c->d & ByteOp) ? 1 :
1128 c->op_bytes;
1129 /* Don't fetch the address for invlpg: it could be unmapped. */
1130 if (c->twobyte && c->b == 0x01 && c->modrm_reg == 7)
1131 break;
1132 srcmem_common:
1133 /*
1134 * For instructions with a ModR/M byte, switch to register
1135 * access if Mod = 3.
1136 */
1137 if ((c->d & ModRM) && c->modrm_mod == 3) {
1138 c->src.type = OP_REG;
1139 c->src.val = c->modrm_val;
1140 c->src.ptr = c->modrm_ptr;
1141 break;
1142 }
1143 c->src.type = OP_MEM;
1144 c->src.ptr = (unsigned long *)c->modrm_ea;
1145 c->src.val = 0;
1146 break;
1147 case SrcImm:
1148 case SrcImmU:
1149 c->src.type = OP_IMM;
1150 c->src.ptr = (unsigned long *)c->eip;
1151 c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1152 if (c->src.bytes == 8)
1153 c->src.bytes = 4;
1154 /* NB. Immediates are sign-extended as necessary. */
1155 switch (c->src.bytes) {
1156 case 1:
1157 c->src.val = insn_fetch(s8, 1, c->eip);
1158 break;
1159 case 2:
1160 c->src.val = insn_fetch(s16, 2, c->eip);
1161 break;
1162 case 4:
1163 c->src.val = insn_fetch(s32, 4, c->eip);
1164 break;
1165 }
1166 if ((c->d & SrcMask) == SrcImmU) {
1167 switch (c->src.bytes) {
1168 case 1:
1169 c->src.val &= 0xff;
1170 break;
1171 case 2:
1172 c->src.val &= 0xffff;
1173 break;
1174 case 4:
1175 c->src.val &= 0xffffffff;
1176 break;
1177 }
1178 }
1179 break;
1180 case SrcImmByte:
1181 case SrcImmUByte:
1182 c->src.type = OP_IMM;
1183 c->src.ptr = (unsigned long *)c->eip;
1184 c->src.bytes = 1;
1185 if ((c->d & SrcMask) == SrcImmByte)
1186 c->src.val = insn_fetch(s8, 1, c->eip);
1187 else
1188 c->src.val = insn_fetch(u8, 1, c->eip);
1189 break;
1190 case SrcAcc:
1191 c->src.type = OP_REG;
1192 c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1193 c->src.ptr = &c->regs[VCPU_REGS_RAX];
1194 switch (c->src.bytes) {
1195 case 1:
1196 c->src.val = *(u8 *)c->src.ptr;
1197 break;
1198 case 2:
1199 c->src.val = *(u16 *)c->src.ptr;
1200 break;
1201 case 4:
1202 c->src.val = *(u32 *)c->src.ptr;
1203 break;
1204 case 8:
1205 c->src.val = *(u64 *)c->src.ptr;
1206 break;
1207 }
1208 break;
1209 case SrcOne:
1210 c->src.bytes = 1;
1211 c->src.val = 1;
1212 break;
1213 case SrcSI:
1214 c->src.type = OP_MEM;
1215 c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1216 c->src.ptr = (unsigned long *)
1217 register_address(c, seg_override_base(ctxt, ops, c),
1218 c->regs[VCPU_REGS_RSI]);
1219 c->src.val = 0;
1220 break;
1221 case SrcImmFAddr:
1222 c->src.type = OP_IMM;
1223 c->src.ptr = (unsigned long *)c->eip;
1224 c->src.bytes = c->op_bytes + 2;
1225 insn_fetch_arr(c->src.valptr, c->src.bytes, c->eip);
1226 break;
1227 case SrcMemFAddr:
1228 c->src.type = OP_MEM;
1229 c->src.ptr = (unsigned long *)c->modrm_ea;
1230 c->src.bytes = c->op_bytes + 2;
1231 break;
1232 }
1233 802
1234 /* 803 if (c->dst.type == OP_MEM && c->src.type == OP_REG) {
1235 * Decode and fetch the second source operand: register, memory 804 mask = ~(c->dst.bytes * 8 - 1);
1236 * or immediate.
1237 */
1238 switch (c->d & Src2Mask) {
1239 case Src2None:
1240 break;
1241 case Src2CL:
1242 c->src2.bytes = 1;
1243 c->src2.val = c->regs[VCPU_REGS_RCX] & 0x8;
1244 break;
1245 case Src2ImmByte:
1246 c->src2.type = OP_IMM;
1247 c->src2.ptr = (unsigned long *)c->eip;
1248 c->src2.bytes = 1;
1249 c->src2.val = insn_fetch(u8, 1, c->eip);
1250 break;
1251 case Src2One:
1252 c->src2.bytes = 1;
1253 c->src2.val = 1;
1254 break;
1255 }
1256 805
1257 /* Decode and fetch the destination operand: register or memory. */ 806 if (c->src.bytes == 2)
1258 switch (c->d & DstMask) { 807 sv = (s16)c->src.val & (s16)mask;
1259 case ImplicitOps: 808 else if (c->src.bytes == 4)
1260 /* Special instructions do their own operand decoding. */ 809 sv = (s32)c->src.val & (s32)mask;
1261 return 0;
1262 case DstReg:
1263 decode_register_operand(&c->dst, c,
1264 c->twobyte && (c->b == 0xb6 || c->b == 0xb7));
1265 break;
1266 case DstMem:
1267 case DstMem64:
1268 if ((c->d & ModRM) && c->modrm_mod == 3) {
1269 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1270 c->dst.type = OP_REG;
1271 c->dst.val = c->dst.orig_val = c->modrm_val;
1272 c->dst.ptr = c->modrm_ptr;
1273 break;
1274 }
1275 c->dst.type = OP_MEM;
1276 c->dst.ptr = (unsigned long *)c->modrm_ea;
1277 if ((c->d & DstMask) == DstMem64)
1278 c->dst.bytes = 8;
1279 else
1280 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1281 c->dst.val = 0;
1282 if (c->d & BitOp) {
1283 unsigned long mask = ~(c->dst.bytes * 8 - 1);
1284 810
1285 c->dst.ptr = (void *)c->dst.ptr + 811 c->dst.addr.mem += (sv >> 3);
1286 (c->src.val & mask) / 8;
1287 }
1288 break;
1289 case DstAcc:
1290 c->dst.type = OP_REG;
1291 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1292 c->dst.ptr = &c->regs[VCPU_REGS_RAX];
1293 switch (c->dst.bytes) {
1294 case 1:
1295 c->dst.val = *(u8 *)c->dst.ptr;
1296 break;
1297 case 2:
1298 c->dst.val = *(u16 *)c->dst.ptr;
1299 break;
1300 case 4:
1301 c->dst.val = *(u32 *)c->dst.ptr;
1302 break;
1303 case 8:
1304 c->dst.val = *(u64 *)c->dst.ptr;
1305 break;
1306 }
1307 c->dst.orig_val = c->dst.val;
1308 break;
1309 case DstDI:
1310 c->dst.type = OP_MEM;
1311 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1312 c->dst.ptr = (unsigned long *)
1313 register_address(c, es_base(ctxt, ops),
1314 c->regs[VCPU_REGS_RDI]);
1315 c->dst.val = 0;
1316 break;
1317 } 812 }
1318 813
1319done: 814 /* only subword offset */
1320 return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0; 815 c->src.val &= (c->dst.bytes << 3) - 1;
1321} 816}
1322 817
1323static int read_emulated(struct x86_emulate_ctxt *ctxt, 818static int read_emulated(struct x86_emulate_ctxt *ctxt,
@@ -1337,7 +832,7 @@ static int read_emulated(struct x86_emulate_ctxt *ctxt,
1337 rc = ops->read_emulated(addr, mc->data + mc->end, n, &err, 832 rc = ops->read_emulated(addr, mc->data + mc->end, n, &err,
1338 ctxt->vcpu); 833 ctxt->vcpu);
1339 if (rc == X86EMUL_PROPAGATE_FAULT) 834 if (rc == X86EMUL_PROPAGATE_FAULT)
1340 emulate_pf(ctxt, addr, err); 835 emulate_pf(ctxt);
1341 if (rc != X86EMUL_CONTINUE) 836 if (rc != X86EMUL_CONTINUE)
1342 return rc; 837 return rc;
1343 mc->end += n; 838 mc->end += n;
@@ -1424,7 +919,7 @@ static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt,
1424 addr = dt.address + index * 8; 919 addr = dt.address + index * 8;
1425 ret = ops->read_std(addr, desc, sizeof *desc, ctxt->vcpu, &err); 920 ret = ops->read_std(addr, desc, sizeof *desc, ctxt->vcpu, &err);
1426 if (ret == X86EMUL_PROPAGATE_FAULT) 921 if (ret == X86EMUL_PROPAGATE_FAULT)
1427 emulate_pf(ctxt, addr, err); 922 emulate_pf(ctxt);
1428 923
1429 return ret; 924 return ret;
1430} 925}
@@ -1450,7 +945,7 @@ static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt,
1450 addr = dt.address + index * 8; 945 addr = dt.address + index * 8;
1451 ret = ops->write_std(addr, desc, sizeof *desc, ctxt->vcpu, &err); 946 ret = ops->write_std(addr, desc, sizeof *desc, ctxt->vcpu, &err);
1452 if (ret == X86EMUL_PROPAGATE_FAULT) 947 if (ret == X86EMUL_PROPAGATE_FAULT)
1453 emulate_pf(ctxt, addr, err); 948 emulate_pf(ctxt);
1454 949
1455 return ret; 950 return ret;
1456} 951}
@@ -1573,6 +1068,25 @@ exception:
1573 return X86EMUL_PROPAGATE_FAULT; 1068 return X86EMUL_PROPAGATE_FAULT;
1574} 1069}
1575 1070
1071static void write_register_operand(struct operand *op)
1072{
1073 /* The 4-byte case *is* correct: in 64-bit mode we zero-extend. */
1074 switch (op->bytes) {
1075 case 1:
1076 *(u8 *)op->addr.reg = (u8)op->val;
1077 break;
1078 case 2:
1079 *(u16 *)op->addr.reg = (u16)op->val;
1080 break;
1081 case 4:
1082 *op->addr.reg = (u32)op->val;
1083 break; /* 64b: zero-extend */
1084 case 8:
1085 *op->addr.reg = op->val;
1086 break;
1087 }
1088}
1089
1576static inline int writeback(struct x86_emulate_ctxt *ctxt, 1090static inline int writeback(struct x86_emulate_ctxt *ctxt,
1577 struct x86_emulate_ops *ops) 1091 struct x86_emulate_ops *ops)
1578{ 1092{
@@ -1582,28 +1096,12 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt,
1582 1096
1583 switch (c->dst.type) { 1097 switch (c->dst.type) {
1584 case OP_REG: 1098 case OP_REG:
1585 /* The 4-byte case *is* correct: 1099 write_register_operand(&c->dst);
1586 * in 64-bit mode we zero-extend.
1587 */
1588 switch (c->dst.bytes) {
1589 case 1:
1590 *(u8 *)c->dst.ptr = (u8)c->dst.val;
1591 break;
1592 case 2:
1593 *(u16 *)c->dst.ptr = (u16)c->dst.val;
1594 break;
1595 case 4:
1596 *c->dst.ptr = (u32)c->dst.val;
1597 break; /* 64b: zero-ext */
1598 case 8:
1599 *c->dst.ptr = c->dst.val;
1600 break;
1601 }
1602 break; 1100 break;
1603 case OP_MEM: 1101 case OP_MEM:
1604 if (c->lock_prefix) 1102 if (c->lock_prefix)
1605 rc = ops->cmpxchg_emulated( 1103 rc = ops->cmpxchg_emulated(
1606 (unsigned long)c->dst.ptr, 1104 c->dst.addr.mem,
1607 &c->dst.orig_val, 1105 &c->dst.orig_val,
1608 &c->dst.val, 1106 &c->dst.val,
1609 c->dst.bytes, 1107 c->dst.bytes,
@@ -1611,14 +1109,13 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt,
1611 ctxt->vcpu); 1109 ctxt->vcpu);
1612 else 1110 else
1613 rc = ops->write_emulated( 1111 rc = ops->write_emulated(
1614 (unsigned long)c->dst.ptr, 1112 c->dst.addr.mem,
1615 &c->dst.val, 1113 &c->dst.val,
1616 c->dst.bytes, 1114 c->dst.bytes,
1617 &err, 1115 &err,
1618 ctxt->vcpu); 1116 ctxt->vcpu);
1619 if (rc == X86EMUL_PROPAGATE_FAULT) 1117 if (rc == X86EMUL_PROPAGATE_FAULT)
1620 emulate_pf(ctxt, 1118 emulate_pf(ctxt);
1621 (unsigned long)c->dst.ptr, err);
1622 if (rc != X86EMUL_CONTINUE) 1119 if (rc != X86EMUL_CONTINUE)
1623 return rc; 1120 return rc;
1624 break; 1121 break;
@@ -1640,8 +1137,8 @@ static inline void emulate_push(struct x86_emulate_ctxt *ctxt,
1640 c->dst.bytes = c->op_bytes; 1137 c->dst.bytes = c->op_bytes;
1641 c->dst.val = c->src.val; 1138 c->dst.val = c->src.val;
1642 register_address_increment(c, &c->regs[VCPU_REGS_RSP], -c->op_bytes); 1139 register_address_increment(c, &c->regs[VCPU_REGS_RSP], -c->op_bytes);
1643 c->dst.ptr = (void *) register_address(c, ss_base(ctxt, ops), 1140 c->dst.addr.mem = register_address(c, ss_base(ctxt, ops),
1644 c->regs[VCPU_REGS_RSP]); 1141 c->regs[VCPU_REGS_RSP]);
1645} 1142}
1646 1143
1647static int emulate_pop(struct x86_emulate_ctxt *ctxt, 1144static int emulate_pop(struct x86_emulate_ctxt *ctxt,
@@ -1701,6 +1198,9 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt,
1701 *(unsigned long *)dest = 1198 *(unsigned long *)dest =
1702 (ctxt->eflags & ~change_mask) | (val & change_mask); 1199 (ctxt->eflags & ~change_mask) | (val & change_mask);
1703 1200
1201 if (rc == X86EMUL_PROPAGATE_FAULT)
1202 emulate_pf(ctxt);
1203
1704 return rc; 1204 return rc;
1705} 1205}
1706 1206
@@ -1778,6 +1278,150 @@ static int emulate_popa(struct x86_emulate_ctxt *ctxt,
1778 return rc; 1278 return rc;
1779} 1279}
1780 1280
1281int emulate_int_real(struct x86_emulate_ctxt *ctxt,
1282 struct x86_emulate_ops *ops, int irq)
1283{
1284 struct decode_cache *c = &ctxt->decode;
1285 int rc;
1286 struct desc_ptr dt;
1287 gva_t cs_addr;
1288 gva_t eip_addr;
1289 u16 cs, eip;
1290 u32 err;
1291
1292 /* TODO: Add limit checks */
1293 c->src.val = ctxt->eflags;
1294 emulate_push(ctxt, ops);
1295 rc = writeback(ctxt, ops);
1296 if (rc != X86EMUL_CONTINUE)
1297 return rc;
1298
1299 ctxt->eflags &= ~(EFLG_IF | EFLG_TF | EFLG_AC);
1300
1301 c->src.val = ops->get_segment_selector(VCPU_SREG_CS, ctxt->vcpu);
1302 emulate_push(ctxt, ops);
1303 rc = writeback(ctxt, ops);
1304 if (rc != X86EMUL_CONTINUE)
1305 return rc;
1306
1307 c->src.val = c->eip;
1308 emulate_push(ctxt, ops);
1309 rc = writeback(ctxt, ops);
1310 if (rc != X86EMUL_CONTINUE)
1311 return rc;
1312
1313 c->dst.type = OP_NONE;
1314
1315 ops->get_idt(&dt, ctxt->vcpu);
1316
1317 eip_addr = dt.address + (irq << 2);
1318 cs_addr = dt.address + (irq << 2) + 2;
1319
1320 rc = ops->read_std(cs_addr, &cs, 2, ctxt->vcpu, &err);
1321 if (rc != X86EMUL_CONTINUE)
1322 return rc;
1323
1324 rc = ops->read_std(eip_addr, &eip, 2, ctxt->vcpu, &err);
1325 if (rc != X86EMUL_CONTINUE)
1326 return rc;
1327
1328 rc = load_segment_descriptor(ctxt, ops, cs, VCPU_SREG_CS);
1329 if (rc != X86EMUL_CONTINUE)
1330 return rc;
1331
1332 c->eip = eip;
1333
1334 return rc;
1335}
1336
1337static int emulate_int(struct x86_emulate_ctxt *ctxt,
1338 struct x86_emulate_ops *ops, int irq)
1339{
1340 switch(ctxt->mode) {
1341 case X86EMUL_MODE_REAL:
1342 return emulate_int_real(ctxt, ops, irq);
1343 case X86EMUL_MODE_VM86:
1344 case X86EMUL_MODE_PROT16:
1345 case X86EMUL_MODE_PROT32:
1346 case X86EMUL_MODE_PROT64:
1347 default:
1348 /* Protected mode interrupts unimplemented yet */
1349 return X86EMUL_UNHANDLEABLE;
1350 }
1351}
1352
1353static int emulate_iret_real(struct x86_emulate_ctxt *ctxt,
1354 struct x86_emulate_ops *ops)
1355{
1356 struct decode_cache *c = &ctxt->decode;
1357 int rc = X86EMUL_CONTINUE;
1358 unsigned long temp_eip = 0;
1359 unsigned long temp_eflags = 0;
1360 unsigned long cs = 0;
1361 unsigned long mask = EFLG_CF | EFLG_PF | EFLG_AF | EFLG_ZF | EFLG_SF | EFLG_TF |
1362 EFLG_IF | EFLG_DF | EFLG_OF | EFLG_IOPL | EFLG_NT | EFLG_RF |
1363 EFLG_AC | EFLG_ID | (1 << 1); /* Last one is the reserved bit */
1364 unsigned long vm86_mask = EFLG_VM | EFLG_VIF | EFLG_VIP;
1365
1366 /* TODO: Add stack limit check */
1367
1368 rc = emulate_pop(ctxt, ops, &temp_eip, c->op_bytes);
1369
1370 if (rc != X86EMUL_CONTINUE)
1371 return rc;
1372
1373 if (temp_eip & ~0xffff) {
1374 emulate_gp(ctxt, 0);
1375 return X86EMUL_PROPAGATE_FAULT;
1376 }
1377
1378 rc = emulate_pop(ctxt, ops, &cs, c->op_bytes);
1379
1380 if (rc != X86EMUL_CONTINUE)
1381 return rc;
1382
1383 rc = emulate_pop(ctxt, ops, &temp_eflags, c->op_bytes);
1384
1385 if (rc != X86EMUL_CONTINUE)
1386 return rc;
1387
1388 rc = load_segment_descriptor(ctxt, ops, (u16)cs, VCPU_SREG_CS);
1389
1390 if (rc != X86EMUL_CONTINUE)
1391 return rc;
1392
1393 c->eip = temp_eip;
1394
1395
1396 if (c->op_bytes == 4)
1397 ctxt->eflags = ((temp_eflags & mask) | (ctxt->eflags & vm86_mask));
1398 else if (c->op_bytes == 2) {
1399 ctxt->eflags &= ~0xffff;
1400 ctxt->eflags |= temp_eflags;
1401 }
1402
1403 ctxt->eflags &= ~EFLG_RESERVED_ZEROS_MASK; /* Clear reserved zeros */
1404 ctxt->eflags |= EFLG_RESERVED_ONE_MASK;
1405
1406 return rc;
1407}
1408
1409static inline int emulate_iret(struct x86_emulate_ctxt *ctxt,
1410 struct x86_emulate_ops* ops)
1411{
1412 switch(ctxt->mode) {
1413 case X86EMUL_MODE_REAL:
1414 return emulate_iret_real(ctxt, ops);
1415 case X86EMUL_MODE_VM86:
1416 case X86EMUL_MODE_PROT16:
1417 case X86EMUL_MODE_PROT32:
1418 case X86EMUL_MODE_PROT64:
1419 default:
1420 /* iret from protected mode unimplemented yet */
1421 return X86EMUL_UNHANDLEABLE;
1422 }
1423}
1424
1781static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt, 1425static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt,
1782 struct x86_emulate_ops *ops) 1426 struct x86_emulate_ops *ops)
1783{ 1427{
@@ -1819,6 +1463,9 @@ static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt,
1819 struct x86_emulate_ops *ops) 1463 struct x86_emulate_ops *ops)
1820{ 1464{
1821 struct decode_cache *c = &ctxt->decode; 1465 struct decode_cache *c = &ctxt->decode;
1466 unsigned long *rax = &c->regs[VCPU_REGS_RAX];
1467 unsigned long *rdx = &c->regs[VCPU_REGS_RDX];
1468 u8 de = 0;
1822 1469
1823 switch (c->modrm_reg) { 1470 switch (c->modrm_reg) {
1824 case 0 ... 1: /* test */ 1471 case 0 ... 1: /* test */
@@ -1830,10 +1477,26 @@ static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt,
1830 case 3: /* neg */ 1477 case 3: /* neg */
1831 emulate_1op("neg", c->dst, ctxt->eflags); 1478 emulate_1op("neg", c->dst, ctxt->eflags);
1832 break; 1479 break;
1480 case 4: /* mul */
1481 emulate_1op_rax_rdx("mul", c->src, *rax, *rdx, ctxt->eflags);
1482 break;
1483 case 5: /* imul */
1484 emulate_1op_rax_rdx("imul", c->src, *rax, *rdx, ctxt->eflags);
1485 break;
1486 case 6: /* div */
1487 emulate_1op_rax_rdx_ex("div", c->src, *rax, *rdx,
1488 ctxt->eflags, de);
1489 break;
1490 case 7: /* idiv */
1491 emulate_1op_rax_rdx_ex("idiv", c->src, *rax, *rdx,
1492 ctxt->eflags, de);
1493 break;
1833 default: 1494 default:
1834 return 0; 1495 return X86EMUL_UNHANDLEABLE;
1835 } 1496 }
1836 return 1; 1497 if (de)
1498 return emulate_de(ctxt);
1499 return X86EMUL_CONTINUE;
1837} 1500}
1838 1501
1839static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt, 1502static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt,
@@ -1905,6 +1568,23 @@ static int emulate_ret_far(struct x86_emulate_ctxt *ctxt,
1905 return rc; 1568 return rc;
1906} 1569}
1907 1570
1571static int emulate_load_segment(struct x86_emulate_ctxt *ctxt,
1572 struct x86_emulate_ops *ops, int seg)
1573{
1574 struct decode_cache *c = &ctxt->decode;
1575 unsigned short sel;
1576 int rc;
1577
1578 memcpy(&sel, c->src.valptr + c->op_bytes, 2);
1579
1580 rc = load_segment_descriptor(ctxt, ops, sel, seg);
1581 if (rc != X86EMUL_CONTINUE)
1582 return rc;
1583
1584 c->dst.val = c->src.val;
1585 return rc;
1586}
1587
1908static inline void 1588static inline void
1909setup_syscalls_segments(struct x86_emulate_ctxt *ctxt, 1589setup_syscalls_segments(struct x86_emulate_ctxt *ctxt,
1910 struct x86_emulate_ops *ops, struct desc_struct *cs, 1590 struct x86_emulate_ops *ops, struct desc_struct *cs,
@@ -2160,9 +1840,15 @@ static bool emulator_io_permited(struct x86_emulate_ctxt *ctxt,
2160 struct x86_emulate_ops *ops, 1840 struct x86_emulate_ops *ops,
2161 u16 port, u16 len) 1841 u16 port, u16 len)
2162{ 1842{
1843 if (ctxt->perm_ok)
1844 return true;
1845
2163 if (emulator_bad_iopl(ctxt, ops)) 1846 if (emulator_bad_iopl(ctxt, ops))
2164 if (!emulator_io_port_access_allowed(ctxt, ops, port, len)) 1847 if (!emulator_io_port_access_allowed(ctxt, ops, port, len))
2165 return false; 1848 return false;
1849
1850 ctxt->perm_ok = true;
1851
2166 return true; 1852 return true;
2167} 1853}
2168 1854
@@ -2254,7 +1940,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
2254 &err); 1940 &err);
2255 if (ret == X86EMUL_PROPAGATE_FAULT) { 1941 if (ret == X86EMUL_PROPAGATE_FAULT) {
2256 /* FIXME: need to provide precise fault address */ 1942 /* FIXME: need to provide precise fault address */
2257 emulate_pf(ctxt, old_tss_base, err); 1943 emulate_pf(ctxt);
2258 return ret; 1944 return ret;
2259 } 1945 }
2260 1946
@@ -2264,7 +1950,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
2264 &err); 1950 &err);
2265 if (ret == X86EMUL_PROPAGATE_FAULT) { 1951 if (ret == X86EMUL_PROPAGATE_FAULT) {
2266 /* FIXME: need to provide precise fault address */ 1952 /* FIXME: need to provide precise fault address */
2267 emulate_pf(ctxt, old_tss_base, err); 1953 emulate_pf(ctxt);
2268 return ret; 1954 return ret;
2269 } 1955 }
2270 1956
@@ -2272,7 +1958,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
2272 &err); 1958 &err);
2273 if (ret == X86EMUL_PROPAGATE_FAULT) { 1959 if (ret == X86EMUL_PROPAGATE_FAULT) {
2274 /* FIXME: need to provide precise fault address */ 1960 /* FIXME: need to provide precise fault address */
2275 emulate_pf(ctxt, new_tss_base, err); 1961 emulate_pf(ctxt);
2276 return ret; 1962 return ret;
2277 } 1963 }
2278 1964
@@ -2285,7 +1971,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
2285 ctxt->vcpu, &err); 1971 ctxt->vcpu, &err);
2286 if (ret == X86EMUL_PROPAGATE_FAULT) { 1972 if (ret == X86EMUL_PROPAGATE_FAULT) {
2287 /* FIXME: need to provide precise fault address */ 1973 /* FIXME: need to provide precise fault address */
2288 emulate_pf(ctxt, new_tss_base, err); 1974 emulate_pf(ctxt);
2289 return ret; 1975 return ret;
2290 } 1976 }
2291 } 1977 }
@@ -2396,7 +2082,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
2396 &err); 2082 &err);
2397 if (ret == X86EMUL_PROPAGATE_FAULT) { 2083 if (ret == X86EMUL_PROPAGATE_FAULT) {
2398 /* FIXME: need to provide precise fault address */ 2084 /* FIXME: need to provide precise fault address */
2399 emulate_pf(ctxt, old_tss_base, err); 2085 emulate_pf(ctxt);
2400 return ret; 2086 return ret;
2401 } 2087 }
2402 2088
@@ -2406,7 +2092,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
2406 &err); 2092 &err);
2407 if (ret == X86EMUL_PROPAGATE_FAULT) { 2093 if (ret == X86EMUL_PROPAGATE_FAULT) {
2408 /* FIXME: need to provide precise fault address */ 2094 /* FIXME: need to provide precise fault address */
2409 emulate_pf(ctxt, old_tss_base, err); 2095 emulate_pf(ctxt);
2410 return ret; 2096 return ret;
2411 } 2097 }
2412 2098
@@ -2414,7 +2100,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
2414 &err); 2100 &err);
2415 if (ret == X86EMUL_PROPAGATE_FAULT) { 2101 if (ret == X86EMUL_PROPAGATE_FAULT) {
2416 /* FIXME: need to provide precise fault address */ 2102 /* FIXME: need to provide precise fault address */
2417 emulate_pf(ctxt, new_tss_base, err); 2103 emulate_pf(ctxt);
2418 return ret; 2104 return ret;
2419 } 2105 }
2420 2106
@@ -2427,7 +2113,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
2427 ctxt->vcpu, &err); 2113 ctxt->vcpu, &err);
2428 if (ret == X86EMUL_PROPAGATE_FAULT) { 2114 if (ret == X86EMUL_PROPAGATE_FAULT) {
2429 /* FIXME: need to provide precise fault address */ 2115 /* FIXME: need to provide precise fault address */
2430 emulate_pf(ctxt, new_tss_base, err); 2116 emulate_pf(ctxt);
2431 return ret; 2117 return ret;
2432 } 2118 }
2433 } 2119 }
@@ -2523,10 +2209,10 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
2523} 2209}
2524 2210
2525int emulator_task_switch(struct x86_emulate_ctxt *ctxt, 2211int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
2526 struct x86_emulate_ops *ops,
2527 u16 tss_selector, int reason, 2212 u16 tss_selector, int reason,
2528 bool has_error_code, u32 error_code) 2213 bool has_error_code, u32 error_code)
2529{ 2214{
2215 struct x86_emulate_ops *ops = ctxt->ops;
2530 struct decode_cache *c = &ctxt->decode; 2216 struct decode_cache *c = &ctxt->decode;
2531 int rc; 2217 int rc;
2532 2218
@@ -2552,16 +2238,784 @@ static void string_addr_inc(struct x86_emulate_ctxt *ctxt, unsigned long base,
2552 int df = (ctxt->eflags & EFLG_DF) ? -1 : 1; 2238 int df = (ctxt->eflags & EFLG_DF) ? -1 : 1;
2553 2239
2554 register_address_increment(c, &c->regs[reg], df * op->bytes); 2240 register_address_increment(c, &c->regs[reg], df * op->bytes);
2555 op->ptr = (unsigned long *)register_address(c, base, c->regs[reg]); 2241 op->addr.mem = register_address(c, base, c->regs[reg]);
2242}
2243
2244static int em_push(struct x86_emulate_ctxt *ctxt)
2245{
2246 emulate_push(ctxt, ctxt->ops);
2247 return X86EMUL_CONTINUE;
2248}
2249
2250static int em_das(struct x86_emulate_ctxt *ctxt)
2251{
2252 struct decode_cache *c = &ctxt->decode;
2253 u8 al, old_al;
2254 bool af, cf, old_cf;
2255
2256 cf = ctxt->eflags & X86_EFLAGS_CF;
2257 al = c->dst.val;
2258
2259 old_al = al;
2260 old_cf = cf;
2261 cf = false;
2262 af = ctxt->eflags & X86_EFLAGS_AF;
2263 if ((al & 0x0f) > 9 || af) {
2264 al -= 6;
2265 cf = old_cf | (al >= 250);
2266 af = true;
2267 } else {
2268 af = false;
2269 }
2270 if (old_al > 0x99 || old_cf) {
2271 al -= 0x60;
2272 cf = true;
2273 }
2274
2275 c->dst.val = al;
2276 /* Set PF, ZF, SF */
2277 c->src.type = OP_IMM;
2278 c->src.val = 0;
2279 c->src.bytes = 1;
2280 emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags);
2281 ctxt->eflags &= ~(X86_EFLAGS_AF | X86_EFLAGS_CF);
2282 if (cf)
2283 ctxt->eflags |= X86_EFLAGS_CF;
2284 if (af)
2285 ctxt->eflags |= X86_EFLAGS_AF;
2286 return X86EMUL_CONTINUE;
2287}
2288
2289static int em_call_far(struct x86_emulate_ctxt *ctxt)
2290{
2291 struct decode_cache *c = &ctxt->decode;
2292 u16 sel, old_cs;
2293 ulong old_eip;
2294 int rc;
2295
2296 old_cs = ctxt->ops->get_segment_selector(VCPU_SREG_CS, ctxt->vcpu);
2297 old_eip = c->eip;
2298
2299 memcpy(&sel, c->src.valptr + c->op_bytes, 2);
2300 if (load_segment_descriptor(ctxt, ctxt->ops, sel, VCPU_SREG_CS))
2301 return X86EMUL_CONTINUE;
2302
2303 c->eip = 0;
2304 memcpy(&c->eip, c->src.valptr, c->op_bytes);
2305
2306 c->src.val = old_cs;
2307 emulate_push(ctxt, ctxt->ops);
2308 rc = writeback(ctxt, ctxt->ops);
2309 if (rc != X86EMUL_CONTINUE)
2310 return rc;
2311
2312 c->src.val = old_eip;
2313 emulate_push(ctxt, ctxt->ops);
2314 rc = writeback(ctxt, ctxt->ops);
2315 if (rc != X86EMUL_CONTINUE)
2316 return rc;
2317
2318 c->dst.type = OP_NONE;
2319
2320 return X86EMUL_CONTINUE;
2321}
2322
2323static int em_ret_near_imm(struct x86_emulate_ctxt *ctxt)
2324{
2325 struct decode_cache *c = &ctxt->decode;
2326 int rc;
2327
2328 c->dst.type = OP_REG;
2329 c->dst.addr.reg = &c->eip;
2330 c->dst.bytes = c->op_bytes;
2331 rc = emulate_pop(ctxt, ctxt->ops, &c->dst.val, c->op_bytes);
2332 if (rc != X86EMUL_CONTINUE)
2333 return rc;
2334 register_address_increment(c, &c->regs[VCPU_REGS_RSP], c->src.val);
2335 return X86EMUL_CONTINUE;
2336}
2337
2338static int em_imul(struct x86_emulate_ctxt *ctxt)
2339{
2340 struct decode_cache *c = &ctxt->decode;
2341
2342 emulate_2op_SrcV_nobyte("imul", c->src, c->dst, ctxt->eflags);
2343 return X86EMUL_CONTINUE;
2344}
2345
2346static int em_imul_3op(struct x86_emulate_ctxt *ctxt)
2347{
2348 struct decode_cache *c = &ctxt->decode;
2349
2350 c->dst.val = c->src2.val;
2351 return em_imul(ctxt);
2352}
2353
2354static int em_cwd(struct x86_emulate_ctxt *ctxt)
2355{
2356 struct decode_cache *c = &ctxt->decode;
2357
2358 c->dst.type = OP_REG;
2359 c->dst.bytes = c->src.bytes;
2360 c->dst.addr.reg = &c->regs[VCPU_REGS_RDX];
2361 c->dst.val = ~((c->src.val >> (c->src.bytes * 8 - 1)) - 1);
2362
2363 return X86EMUL_CONTINUE;
2364}
2365
2366static int em_rdtsc(struct x86_emulate_ctxt *ctxt)
2367{
2368 unsigned cpl = ctxt->ops->cpl(ctxt->vcpu);
2369 struct decode_cache *c = &ctxt->decode;
2370 u64 tsc = 0;
2371
2372 if (cpl > 0 && (ctxt->ops->get_cr(4, ctxt->vcpu) & X86_CR4_TSD)) {
2373 emulate_gp(ctxt, 0);
2374 return X86EMUL_PROPAGATE_FAULT;
2375 }
2376 ctxt->ops->get_msr(ctxt->vcpu, MSR_IA32_TSC, &tsc);
2377 c->regs[VCPU_REGS_RAX] = (u32)tsc;
2378 c->regs[VCPU_REGS_RDX] = tsc >> 32;
2379 return X86EMUL_CONTINUE;
2380}
2381
2382static int em_mov(struct x86_emulate_ctxt *ctxt)
2383{
2384 struct decode_cache *c = &ctxt->decode;
2385 c->dst.val = c->src.val;
2386 return X86EMUL_CONTINUE;
2387}
2388
2389#define D(_y) { .flags = (_y) }
2390#define N D(0)
2391#define G(_f, _g) { .flags = ((_f) | Group), .u.group = (_g) }
2392#define GD(_f, _g) { .flags = ((_f) | Group | GroupDual), .u.gdual = (_g) }
2393#define I(_f, _e) { .flags = (_f), .u.execute = (_e) }
2394
2395#define D2bv(_f) D((_f) | ByteOp), D(_f)
2396#define I2bv(_f, _e) I((_f) | ByteOp, _e), I(_f, _e)
2397
2398#define D6ALU(_f) D2bv((_f) | DstMem | SrcReg | ModRM), \
2399 D2bv(((_f) | DstReg | SrcMem | ModRM) & ~Lock), \
2400 D2bv(((_f) & ~Lock) | DstAcc | SrcImm)
2401
2402
2403static struct opcode group1[] = {
2404 X7(D(Lock)), N
2405};
2406
2407static struct opcode group1A[] = {
2408 D(DstMem | SrcNone | ModRM | Mov | Stack), N, N, N, N, N, N, N,
2409};
2410
2411static struct opcode group3[] = {
2412 D(DstMem | SrcImm | ModRM), D(DstMem | SrcImm | ModRM),
2413 D(DstMem | SrcNone | ModRM | Lock), D(DstMem | SrcNone | ModRM | Lock),
2414 X4(D(SrcMem | ModRM)),
2415};
2416
2417static struct opcode group4[] = {
2418 D(ByteOp | DstMem | SrcNone | ModRM | Lock), D(ByteOp | DstMem | SrcNone | ModRM | Lock),
2419 N, N, N, N, N, N,
2420};
2421
2422static struct opcode group5[] = {
2423 D(DstMem | SrcNone | ModRM | Lock), D(DstMem | SrcNone | ModRM | Lock),
2424 D(SrcMem | ModRM | Stack),
2425 I(SrcMemFAddr | ModRM | ImplicitOps | Stack, em_call_far),
2426 D(SrcMem | ModRM | Stack), D(SrcMemFAddr | ModRM | ImplicitOps),
2427 D(SrcMem | ModRM | Stack), N,
2428};
2429
2430static struct group_dual group7 = { {
2431 N, N, D(ModRM | SrcMem | Priv), D(ModRM | SrcMem | Priv),
2432 D(SrcNone | ModRM | DstMem | Mov), N,
2433 D(SrcMem16 | ModRM | Mov | Priv),
2434 D(SrcMem | ModRM | ByteOp | Priv | NoAccess),
2435}, {
2436 D(SrcNone | ModRM | Priv), N, N, D(SrcNone | ModRM | Priv),
2437 D(SrcNone | ModRM | DstMem | Mov), N,
2438 D(SrcMem16 | ModRM | Mov | Priv), N,
2439} };
2440
2441static struct opcode group8[] = {
2442 N, N, N, N,
2443 D(DstMem | SrcImmByte | ModRM), D(DstMem | SrcImmByte | ModRM | Lock),
2444 D(DstMem | SrcImmByte | ModRM | Lock), D(DstMem | SrcImmByte | ModRM | Lock),
2445};
2446
2447static struct group_dual group9 = { {
2448 N, D(DstMem64 | ModRM | Lock), N, N, N, N, N, N,
2449}, {
2450 N, N, N, N, N, N, N, N,
2451} };
2452
2453static struct opcode group11[] = {
2454 I(DstMem | SrcImm | ModRM | Mov, em_mov), X7(D(Undefined)),
2455};
2456
2457static struct opcode opcode_table[256] = {
2458 /* 0x00 - 0x07 */
2459 D6ALU(Lock),
2460 D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64),
2461 /* 0x08 - 0x0F */
2462 D6ALU(Lock),
2463 D(ImplicitOps | Stack | No64), N,
2464 /* 0x10 - 0x17 */
2465 D6ALU(Lock),
2466 D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64),
2467 /* 0x18 - 0x1F */
2468 D6ALU(Lock),
2469 D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64),
2470 /* 0x20 - 0x27 */
2471 D6ALU(Lock), N, N,
2472 /* 0x28 - 0x2F */
2473 D6ALU(Lock), N, I(ByteOp | DstAcc | No64, em_das),
2474 /* 0x30 - 0x37 */
2475 D6ALU(Lock), N, N,
2476 /* 0x38 - 0x3F */
2477 D6ALU(0), N, N,
2478 /* 0x40 - 0x4F */
2479 X16(D(DstReg)),
2480 /* 0x50 - 0x57 */
2481 X8(I(SrcReg | Stack, em_push)),
2482 /* 0x58 - 0x5F */
2483 X8(D(DstReg | Stack)),
2484 /* 0x60 - 0x67 */
2485 D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64),
2486 N, D(DstReg | SrcMem32 | ModRM | Mov) /* movsxd (x86/64) */ ,
2487 N, N, N, N,
2488 /* 0x68 - 0x6F */
2489 I(SrcImm | Mov | Stack, em_push),
2490 I(DstReg | SrcMem | ModRM | Src2Imm, em_imul_3op),
2491 I(SrcImmByte | Mov | Stack, em_push),
2492 I(DstReg | SrcMem | ModRM | Src2ImmByte, em_imul_3op),
2493 D2bv(DstDI | Mov | String), /* insb, insw/insd */
2494 D2bv(SrcSI | ImplicitOps | String), /* outsb, outsw/outsd */
2495 /* 0x70 - 0x7F */
2496 X16(D(SrcImmByte)),
2497 /* 0x80 - 0x87 */
2498 G(ByteOp | DstMem | SrcImm | ModRM | Group, group1),
2499 G(DstMem | SrcImm | ModRM | Group, group1),
2500 G(ByteOp | DstMem | SrcImm | ModRM | No64 | Group, group1),
2501 G(DstMem | SrcImmByte | ModRM | Group, group1),
2502 D2bv(DstMem | SrcReg | ModRM), D2bv(DstMem | SrcReg | ModRM | Lock),
2503 /* 0x88 - 0x8F */
2504 I2bv(DstMem | SrcReg | ModRM | Mov, em_mov),
2505 I2bv(DstReg | SrcMem | ModRM | Mov, em_mov),
2506 D(DstMem | SrcNone | ModRM | Mov), D(ModRM | SrcMem | NoAccess | DstReg),
2507 D(ImplicitOps | SrcMem16 | ModRM), G(0, group1A),
2508 /* 0x90 - 0x97 */
2509 X8(D(SrcAcc | DstReg)),
2510 /* 0x98 - 0x9F */
2511 D(DstAcc | SrcNone), I(ImplicitOps | SrcAcc, em_cwd),
2512 I(SrcImmFAddr | No64, em_call_far), N,
2513 D(ImplicitOps | Stack), D(ImplicitOps | Stack), N, N,
2514 /* 0xA0 - 0xA7 */
2515 I2bv(DstAcc | SrcMem | Mov | MemAbs, em_mov),
2516 I2bv(DstMem | SrcAcc | Mov | MemAbs, em_mov),
2517 I2bv(SrcSI | DstDI | Mov | String, em_mov),
2518 D2bv(SrcSI | DstDI | String),
2519 /* 0xA8 - 0xAF */
2520 D2bv(DstAcc | SrcImm),
2521 I2bv(SrcAcc | DstDI | Mov | String, em_mov),
2522 I2bv(SrcSI | DstAcc | Mov | String, em_mov),
2523 D2bv(SrcAcc | DstDI | String),
2524 /* 0xB0 - 0xB7 */
2525 X8(I(ByteOp | DstReg | SrcImm | Mov, em_mov)),
2526 /* 0xB8 - 0xBF */
2527 X8(I(DstReg | SrcImm | Mov, em_mov)),
2528 /* 0xC0 - 0xC7 */
2529 D2bv(DstMem | SrcImmByte | ModRM),
2530 I(ImplicitOps | Stack | SrcImmU16, em_ret_near_imm),
2531 D(ImplicitOps | Stack),
2532 D(DstReg | SrcMemFAddr | ModRM | No64), D(DstReg | SrcMemFAddr | ModRM | No64),
2533 G(ByteOp, group11), G(0, group11),
2534 /* 0xC8 - 0xCF */
2535 N, N, N, D(ImplicitOps | Stack),
2536 D(ImplicitOps), D(SrcImmByte), D(ImplicitOps | No64), D(ImplicitOps),
2537 /* 0xD0 - 0xD7 */
2538 D2bv(DstMem | SrcOne | ModRM), D2bv(DstMem | ModRM),
2539 N, N, N, N,
2540 /* 0xD8 - 0xDF */
2541 N, N, N, N, N, N, N, N,
2542 /* 0xE0 - 0xE7 */
2543 X4(D(SrcImmByte)),
2544 D2bv(SrcImmUByte | DstAcc), D2bv(SrcAcc | DstImmUByte),
2545 /* 0xE8 - 0xEF */
2546 D(SrcImm | Stack), D(SrcImm | ImplicitOps),
2547 D(SrcImmFAddr | No64), D(SrcImmByte | ImplicitOps),
2548 D2bv(SrcNone | DstAcc), D2bv(SrcAcc | ImplicitOps),
2549 /* 0xF0 - 0xF7 */
2550 N, N, N, N,
2551 D(ImplicitOps | Priv), D(ImplicitOps), G(ByteOp, group3), G(0, group3),
2552 /* 0xF8 - 0xFF */
2553 D(ImplicitOps), D(ImplicitOps), D(ImplicitOps), D(ImplicitOps),
2554 D(ImplicitOps), D(ImplicitOps), G(0, group4), G(0, group5),
2555};
2556
2557static struct opcode twobyte_table[256] = {
2558 /* 0x00 - 0x0F */
2559 N, GD(0, &group7), N, N,
2560 N, D(ImplicitOps), D(ImplicitOps | Priv), N,
2561 D(ImplicitOps | Priv), D(ImplicitOps | Priv), N, N,
2562 N, D(ImplicitOps | ModRM), N, N,
2563 /* 0x10 - 0x1F */
2564 N, N, N, N, N, N, N, N, D(ImplicitOps | ModRM), N, N, N, N, N, N, N,
2565 /* 0x20 - 0x2F */
2566 D(ModRM | DstMem | Priv | Op3264), D(ModRM | DstMem | Priv | Op3264),
2567 D(ModRM | SrcMem | Priv | Op3264), D(ModRM | SrcMem | Priv | Op3264),
2568 N, N, N, N,
2569 N, N, N, N, N, N, N, N,
2570 /* 0x30 - 0x3F */
2571 D(ImplicitOps | Priv), I(ImplicitOps, em_rdtsc),
2572 D(ImplicitOps | Priv), N,
2573 D(ImplicitOps), D(ImplicitOps | Priv), N, N,
2574 N, N, N, N, N, N, N, N,
2575 /* 0x40 - 0x4F */
2576 X16(D(DstReg | SrcMem | ModRM | Mov)),
2577 /* 0x50 - 0x5F */
2578 N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N,
2579 /* 0x60 - 0x6F */
2580 N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N,
2581 /* 0x70 - 0x7F */
2582 N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N,
2583 /* 0x80 - 0x8F */
2584 X16(D(SrcImm)),
2585 /* 0x90 - 0x9F */
2586 X16(D(ByteOp | DstMem | SrcNone | ModRM| Mov)),
2587 /* 0xA0 - 0xA7 */
2588 D(ImplicitOps | Stack), D(ImplicitOps | Stack),
2589 N, D(DstMem | SrcReg | ModRM | BitOp),
2590 D(DstMem | SrcReg | Src2ImmByte | ModRM),
2591 D(DstMem | SrcReg | Src2CL | ModRM), N, N,
2592 /* 0xA8 - 0xAF */
2593 D(ImplicitOps | Stack), D(ImplicitOps | Stack),
2594 N, D(DstMem | SrcReg | ModRM | BitOp | Lock),
2595 D(DstMem | SrcReg | Src2ImmByte | ModRM),
2596 D(DstMem | SrcReg | Src2CL | ModRM),
2597 D(ModRM), I(DstReg | SrcMem | ModRM, em_imul),
2598 /* 0xB0 - 0xB7 */
2599 D2bv(DstMem | SrcReg | ModRM | Lock),
2600 D(DstReg | SrcMemFAddr | ModRM), D(DstMem | SrcReg | ModRM | BitOp | Lock),
2601 D(DstReg | SrcMemFAddr | ModRM), D(DstReg | SrcMemFAddr | ModRM),
2602 D(ByteOp | DstReg | SrcMem | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov),
2603 /* 0xB8 - 0xBF */
2604 N, N,
2605 G(BitOp, group8), D(DstMem | SrcReg | ModRM | BitOp | Lock),
2606 D(DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM),
2607 D(ByteOp | DstReg | SrcMem | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov),
2608 /* 0xC0 - 0xCF */
2609 D2bv(DstMem | SrcReg | ModRM | Lock),
2610 N, D(DstMem | SrcReg | ModRM | Mov),
2611 N, N, N, GD(0, &group9),
2612 N, N, N, N, N, N, N, N,
2613 /* 0xD0 - 0xDF */
2614 N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N,
2615 /* 0xE0 - 0xEF */
2616 N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N,
2617 /* 0xF0 - 0xFF */
2618 N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N
2619};
2620
2621#undef D
2622#undef N
2623#undef G
2624#undef GD
2625#undef I
2626
2627#undef D2bv
2628#undef I2bv
2629#undef D6ALU
2630
2631static unsigned imm_size(struct decode_cache *c)
2632{
2633 unsigned size;
2634
2635 size = (c->d & ByteOp) ? 1 : c->op_bytes;
2636 if (size == 8)
2637 size = 4;
2638 return size;
2639}
2640
2641static int decode_imm(struct x86_emulate_ctxt *ctxt, struct operand *op,
2642 unsigned size, bool sign_extension)
2643{
2644 struct decode_cache *c = &ctxt->decode;
2645 struct x86_emulate_ops *ops = ctxt->ops;
2646 int rc = X86EMUL_CONTINUE;
2647
2648 op->type = OP_IMM;
2649 op->bytes = size;
2650 op->addr.mem = c->eip;
2651 /* NB. Immediates are sign-extended as necessary. */
2652 switch (op->bytes) {
2653 case 1:
2654 op->val = insn_fetch(s8, 1, c->eip);
2655 break;
2656 case 2:
2657 op->val = insn_fetch(s16, 2, c->eip);
2658 break;
2659 case 4:
2660 op->val = insn_fetch(s32, 4, c->eip);
2661 break;
2662 }
2663 if (!sign_extension) {
2664 switch (op->bytes) {
2665 case 1:
2666 op->val &= 0xff;
2667 break;
2668 case 2:
2669 op->val &= 0xffff;
2670 break;
2671 case 4:
2672 op->val &= 0xffffffff;
2673 break;
2674 }
2675 }
2676done:
2677 return rc;
2556} 2678}
2557 2679
2558int 2680int
2559x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) 2681x86_decode_insn(struct x86_emulate_ctxt *ctxt)
2560{ 2682{
2683 struct x86_emulate_ops *ops = ctxt->ops;
2684 struct decode_cache *c = &ctxt->decode;
2685 int rc = X86EMUL_CONTINUE;
2686 int mode = ctxt->mode;
2687 int def_op_bytes, def_ad_bytes, dual, goffset;
2688 struct opcode opcode, *g_mod012, *g_mod3;
2689 struct operand memop = { .type = OP_NONE };
2690
2691 c->eip = ctxt->eip;
2692 c->fetch.start = c->fetch.end = c->eip;
2693 ctxt->cs_base = seg_base(ctxt, ops, VCPU_SREG_CS);
2694
2695 switch (mode) {
2696 case X86EMUL_MODE_REAL:
2697 case X86EMUL_MODE_VM86:
2698 case X86EMUL_MODE_PROT16:
2699 def_op_bytes = def_ad_bytes = 2;
2700 break;
2701 case X86EMUL_MODE_PROT32:
2702 def_op_bytes = def_ad_bytes = 4;
2703 break;
2704#ifdef CONFIG_X86_64
2705 case X86EMUL_MODE_PROT64:
2706 def_op_bytes = 4;
2707 def_ad_bytes = 8;
2708 break;
2709#endif
2710 default:
2711 return -1;
2712 }
2713
2714 c->op_bytes = def_op_bytes;
2715 c->ad_bytes = def_ad_bytes;
2716
2717 /* Legacy prefixes. */
2718 for (;;) {
2719 switch (c->b = insn_fetch(u8, 1, c->eip)) {
2720 case 0x66: /* operand-size override */
2721 /* switch between 2/4 bytes */
2722 c->op_bytes = def_op_bytes ^ 6;
2723 break;
2724 case 0x67: /* address-size override */
2725 if (mode == X86EMUL_MODE_PROT64)
2726 /* switch between 4/8 bytes */
2727 c->ad_bytes = def_ad_bytes ^ 12;
2728 else
2729 /* switch between 2/4 bytes */
2730 c->ad_bytes = def_ad_bytes ^ 6;
2731 break;
2732 case 0x26: /* ES override */
2733 case 0x2e: /* CS override */
2734 case 0x36: /* SS override */
2735 case 0x3e: /* DS override */
2736 set_seg_override(c, (c->b >> 3) & 3);
2737 break;
2738 case 0x64: /* FS override */
2739 case 0x65: /* GS override */
2740 set_seg_override(c, c->b & 7);
2741 break;
2742 case 0x40 ... 0x4f: /* REX */
2743 if (mode != X86EMUL_MODE_PROT64)
2744 goto done_prefixes;
2745 c->rex_prefix = c->b;
2746 continue;
2747 case 0xf0: /* LOCK */
2748 c->lock_prefix = 1;
2749 break;
2750 case 0xf2: /* REPNE/REPNZ */
2751 c->rep_prefix = REPNE_PREFIX;
2752 break;
2753 case 0xf3: /* REP/REPE/REPZ */
2754 c->rep_prefix = REPE_PREFIX;
2755 break;
2756 default:
2757 goto done_prefixes;
2758 }
2759
2760 /* Any legacy prefix after a REX prefix nullifies its effect. */
2761
2762 c->rex_prefix = 0;
2763 }
2764
2765done_prefixes:
2766
2767 /* REX prefix. */
2768 if (c->rex_prefix & 8)
2769 c->op_bytes = 8; /* REX.W */
2770
2771 /* Opcode byte(s). */
2772 opcode = opcode_table[c->b];
2773 /* Two-byte opcode? */
2774 if (c->b == 0x0f) {
2775 c->twobyte = 1;
2776 c->b = insn_fetch(u8, 1, c->eip);
2777 opcode = twobyte_table[c->b];
2778 }
2779 c->d = opcode.flags;
2780
2781 if (c->d & Group) {
2782 dual = c->d & GroupDual;
2783 c->modrm = insn_fetch(u8, 1, c->eip);
2784 --c->eip;
2785
2786 if (c->d & GroupDual) {
2787 g_mod012 = opcode.u.gdual->mod012;
2788 g_mod3 = opcode.u.gdual->mod3;
2789 } else
2790 g_mod012 = g_mod3 = opcode.u.group;
2791
2792 c->d &= ~(Group | GroupDual);
2793
2794 goffset = (c->modrm >> 3) & 7;
2795
2796 if ((c->modrm >> 6) == 3)
2797 opcode = g_mod3[goffset];
2798 else
2799 opcode = g_mod012[goffset];
2800 c->d |= opcode.flags;
2801 }
2802
2803 c->execute = opcode.u.execute;
2804
2805 /* Unrecognised? */
2806 if (c->d == 0 || (c->d & Undefined)) {
2807 DPRINTF("Cannot emulate %02x\n", c->b);
2808 return -1;
2809 }
2810
2811 if (mode == X86EMUL_MODE_PROT64 && (c->d & Stack))
2812 c->op_bytes = 8;
2813
2814 if (c->d & Op3264) {
2815 if (mode == X86EMUL_MODE_PROT64)
2816 c->op_bytes = 8;
2817 else
2818 c->op_bytes = 4;
2819 }
2820
2821 /* ModRM and SIB bytes. */
2822 if (c->d & ModRM) {
2823 rc = decode_modrm(ctxt, ops, &memop);
2824 if (!c->has_seg_override)
2825 set_seg_override(c, c->modrm_seg);
2826 } else if (c->d & MemAbs)
2827 rc = decode_abs(ctxt, ops, &memop);
2828 if (rc != X86EMUL_CONTINUE)
2829 goto done;
2830
2831 if (!c->has_seg_override)
2832 set_seg_override(c, VCPU_SREG_DS);
2833
2834 if (memop.type == OP_MEM && !(!c->twobyte && c->b == 0x8d))
2835 memop.addr.mem += seg_override_base(ctxt, ops, c);
2836
2837 if (memop.type == OP_MEM && c->ad_bytes != 8)
2838 memop.addr.mem = (u32)memop.addr.mem;
2839
2840 if (memop.type == OP_MEM && c->rip_relative)
2841 memop.addr.mem += c->eip;
2842
2843 /*
2844 * Decode and fetch the source operand: register, memory
2845 * or immediate.
2846 */
2847 switch (c->d & SrcMask) {
2848 case SrcNone:
2849 break;
2850 case SrcReg:
2851 decode_register_operand(&c->src, c, 0);
2852 break;
2853 case SrcMem16:
2854 memop.bytes = 2;
2855 goto srcmem_common;
2856 case SrcMem32:
2857 memop.bytes = 4;
2858 goto srcmem_common;
2859 case SrcMem:
2860 memop.bytes = (c->d & ByteOp) ? 1 :
2861 c->op_bytes;
2862 srcmem_common:
2863 c->src = memop;
2864 break;
2865 case SrcImmU16:
2866 rc = decode_imm(ctxt, &c->src, 2, false);
2867 break;
2868 case SrcImm:
2869 rc = decode_imm(ctxt, &c->src, imm_size(c), true);
2870 break;
2871 case SrcImmU:
2872 rc = decode_imm(ctxt, &c->src, imm_size(c), false);
2873 break;
2874 case SrcImmByte:
2875 rc = decode_imm(ctxt, &c->src, 1, true);
2876 break;
2877 case SrcImmUByte:
2878 rc = decode_imm(ctxt, &c->src, 1, false);
2879 break;
2880 case SrcAcc:
2881 c->src.type = OP_REG;
2882 c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
2883 c->src.addr.reg = &c->regs[VCPU_REGS_RAX];
2884 fetch_register_operand(&c->src);
2885 break;
2886 case SrcOne:
2887 c->src.bytes = 1;
2888 c->src.val = 1;
2889 break;
2890 case SrcSI:
2891 c->src.type = OP_MEM;
2892 c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
2893 c->src.addr.mem =
2894 register_address(c, seg_override_base(ctxt, ops, c),
2895 c->regs[VCPU_REGS_RSI]);
2896 c->src.val = 0;
2897 break;
2898 case SrcImmFAddr:
2899 c->src.type = OP_IMM;
2900 c->src.addr.mem = c->eip;
2901 c->src.bytes = c->op_bytes + 2;
2902 insn_fetch_arr(c->src.valptr, c->src.bytes, c->eip);
2903 break;
2904 case SrcMemFAddr:
2905 memop.bytes = c->op_bytes + 2;
2906 goto srcmem_common;
2907 break;
2908 }
2909
2910 if (rc != X86EMUL_CONTINUE)
2911 goto done;
2912
2913 /*
2914 * Decode and fetch the second source operand: register, memory
2915 * or immediate.
2916 */
2917 switch (c->d & Src2Mask) {
2918 case Src2None:
2919 break;
2920 case Src2CL:
2921 c->src2.bytes = 1;
2922 c->src2.val = c->regs[VCPU_REGS_RCX] & 0x8;
2923 break;
2924 case Src2ImmByte:
2925 rc = decode_imm(ctxt, &c->src2, 1, true);
2926 break;
2927 case Src2One:
2928 c->src2.bytes = 1;
2929 c->src2.val = 1;
2930 break;
2931 case Src2Imm:
2932 rc = decode_imm(ctxt, &c->src2, imm_size(c), true);
2933 break;
2934 }
2935
2936 if (rc != X86EMUL_CONTINUE)
2937 goto done;
2938
2939 /* Decode and fetch the destination operand: register or memory. */
2940 switch (c->d & DstMask) {
2941 case DstReg:
2942 decode_register_operand(&c->dst, c,
2943 c->twobyte && (c->b == 0xb6 || c->b == 0xb7));
2944 break;
2945 case DstImmUByte:
2946 c->dst.type = OP_IMM;
2947 c->dst.addr.mem = c->eip;
2948 c->dst.bytes = 1;
2949 c->dst.val = insn_fetch(u8, 1, c->eip);
2950 break;
2951 case DstMem:
2952 case DstMem64:
2953 c->dst = memop;
2954 if ((c->d & DstMask) == DstMem64)
2955 c->dst.bytes = 8;
2956 else
2957 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
2958 if (c->d & BitOp)
2959 fetch_bit_operand(c);
2960 c->dst.orig_val = c->dst.val;
2961 break;
2962 case DstAcc:
2963 c->dst.type = OP_REG;
2964 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
2965 c->dst.addr.reg = &c->regs[VCPU_REGS_RAX];
2966 fetch_register_operand(&c->dst);
2967 c->dst.orig_val = c->dst.val;
2968 break;
2969 case DstDI:
2970 c->dst.type = OP_MEM;
2971 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
2972 c->dst.addr.mem =
2973 register_address(c, es_base(ctxt, ops),
2974 c->regs[VCPU_REGS_RDI]);
2975 c->dst.val = 0;
2976 break;
2977 case ImplicitOps:
2978 /* Special instructions do their own operand decoding. */
2979 default:
2980 c->dst.type = OP_NONE; /* Disable writeback. */
2981 return 0;
2982 }
2983
2984done:
2985 return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
2986}
2987
2988static bool string_insn_completed(struct x86_emulate_ctxt *ctxt)
2989{
2990 struct decode_cache *c = &ctxt->decode;
2991
2992 /* The second termination condition only applies for REPE
2993 * and REPNE. Test if the repeat string operation prefix is
2994 * REPE/REPZ or REPNE/REPNZ and if it's the case it tests the
2995 * corresponding termination condition according to:
2996 * - if REPE/REPZ and ZF = 0 then done
2997 * - if REPNE/REPNZ and ZF = 1 then done
2998 */
2999 if (((c->b == 0xa6) || (c->b == 0xa7) ||
3000 (c->b == 0xae) || (c->b == 0xaf))
3001 && (((c->rep_prefix == REPE_PREFIX) &&
3002 ((ctxt->eflags & EFLG_ZF) == 0))
3003 || ((c->rep_prefix == REPNE_PREFIX) &&
3004 ((ctxt->eflags & EFLG_ZF) == EFLG_ZF))))
3005 return true;
3006
3007 return false;
3008}
3009
3010int
3011x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
3012{
3013 struct x86_emulate_ops *ops = ctxt->ops;
2561 u64 msr_data; 3014 u64 msr_data;
2562 struct decode_cache *c = &ctxt->decode; 3015 struct decode_cache *c = &ctxt->decode;
2563 int rc = X86EMUL_CONTINUE; 3016 int rc = X86EMUL_CONTINUE;
2564 int saved_dst_type = c->dst.type; 3017 int saved_dst_type = c->dst.type;
3018 int irq; /* Used for int 3, int, and into */
2565 3019
2566 ctxt->decode.mem_read.pos = 0; 3020 ctxt->decode.mem_read.pos = 0;
2567 3021
@@ -2576,6 +3030,11 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
2576 goto done; 3030 goto done;
2577 } 3031 }
2578 3032
3033 if ((c->d & SrcMask) == SrcMemFAddr && c->src.type != OP_MEM) {
3034 emulate_ud(ctxt);
3035 goto done;
3036 }
3037
2579 /* Privileged instruction can be executed only in CPL=0 */ 3038 /* Privileged instruction can be executed only in CPL=0 */
2580 if ((c->d & Priv) && ops->cpl(ctxt->vcpu)) { 3039 if ((c->d & Priv) && ops->cpl(ctxt->vcpu)) {
2581 emulate_gp(ctxt, 0); 3040 emulate_gp(ctxt, 0);
@@ -2583,35 +3042,15 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
2583 } 3042 }
2584 3043
2585 if (c->rep_prefix && (c->d & String)) { 3044 if (c->rep_prefix && (c->d & String)) {
2586 ctxt->restart = true;
2587 /* All REP prefixes have the same first termination condition */ 3045 /* All REP prefixes have the same first termination condition */
2588 if (address_mask(c, c->regs[VCPU_REGS_RCX]) == 0) { 3046 if (address_mask(c, c->regs[VCPU_REGS_RCX]) == 0) {
2589 string_done:
2590 ctxt->restart = false;
2591 ctxt->eip = c->eip; 3047 ctxt->eip = c->eip;
2592 goto done; 3048 goto done;
2593 } 3049 }
2594 /* The second termination condition only applies for REPE
2595 * and REPNE. Test if the repeat string operation prefix is
2596 * REPE/REPZ or REPNE/REPNZ and if it's the case it tests the
2597 * corresponding termination condition according to:
2598 * - if REPE/REPZ and ZF = 0 then done
2599 * - if REPNE/REPNZ and ZF = 1 then done
2600 */
2601 if ((c->b == 0xa6) || (c->b == 0xa7) ||
2602 (c->b == 0xae) || (c->b == 0xaf)) {
2603 if ((c->rep_prefix == REPE_PREFIX) &&
2604 ((ctxt->eflags & EFLG_ZF) == 0))
2605 goto string_done;
2606 if ((c->rep_prefix == REPNE_PREFIX) &&
2607 ((ctxt->eflags & EFLG_ZF) == EFLG_ZF))
2608 goto string_done;
2609 }
2610 c->eip = ctxt->eip;
2611 } 3050 }
2612 3051
2613 if (c->src.type == OP_MEM) { 3052 if ((c->src.type == OP_MEM) && !(c->d & NoAccess)) {
2614 rc = read_emulated(ctxt, ops, (unsigned long)c->src.ptr, 3053 rc = read_emulated(ctxt, ops, c->src.addr.mem,
2615 c->src.valptr, c->src.bytes); 3054 c->src.valptr, c->src.bytes);
2616 if (rc != X86EMUL_CONTINUE) 3055 if (rc != X86EMUL_CONTINUE)
2617 goto done; 3056 goto done;
@@ -2619,7 +3058,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
2619 } 3058 }
2620 3059
2621 if (c->src2.type == OP_MEM) { 3060 if (c->src2.type == OP_MEM) {
2622 rc = read_emulated(ctxt, ops, (unsigned long)c->src2.ptr, 3061 rc = read_emulated(ctxt, ops, c->src2.addr.mem,
2623 &c->src2.val, c->src2.bytes); 3062 &c->src2.val, c->src2.bytes);
2624 if (rc != X86EMUL_CONTINUE) 3063 if (rc != X86EMUL_CONTINUE)
2625 goto done; 3064 goto done;
@@ -2631,7 +3070,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
2631 3070
2632 if ((c->dst.type == OP_MEM) && !(c->d & Mov)) { 3071 if ((c->dst.type == OP_MEM) && !(c->d & Mov)) {
2633 /* optimisation - avoid slow emulated read if Mov */ 3072 /* optimisation - avoid slow emulated read if Mov */
2634 rc = read_emulated(ctxt, ops, (unsigned long)c->dst.ptr, 3073 rc = read_emulated(ctxt, ops, c->dst.addr.mem,
2635 &c->dst.val, c->dst.bytes); 3074 &c->dst.val, c->dst.bytes);
2636 if (rc != X86EMUL_CONTINUE) 3075 if (rc != X86EMUL_CONTINUE)
2637 goto done; 3076 goto done;
@@ -2640,6 +3079,13 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
2640 3079
2641special_insn: 3080special_insn:
2642 3081
3082 if (c->execute) {
3083 rc = c->execute(ctxt);
3084 if (rc != X86EMUL_CONTINUE)
3085 goto done;
3086 goto writeback;
3087 }
3088
2643 if (c->twobyte) 3089 if (c->twobyte)
2644 goto twobyte_insn; 3090 goto twobyte_insn;
2645 3091
@@ -2653,8 +3099,6 @@ special_insn:
2653 break; 3099 break;
2654 case 0x07: /* pop es */ 3100 case 0x07: /* pop es */
2655 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_ES); 3101 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_ES);
2656 if (rc != X86EMUL_CONTINUE)
2657 goto done;
2658 break; 3102 break;
2659 case 0x08 ... 0x0d: 3103 case 0x08 ... 0x0d:
2660 or: /* or */ 3104 or: /* or */
@@ -2672,8 +3116,6 @@ special_insn:
2672 break; 3116 break;
2673 case 0x17: /* pop ss */ 3117 case 0x17: /* pop ss */
2674 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_SS); 3118 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_SS);
2675 if (rc != X86EMUL_CONTINUE)
2676 goto done;
2677 break; 3119 break;
2678 case 0x18 ... 0x1d: 3120 case 0x18 ... 0x1d:
2679 sbb: /* sbb */ 3121 sbb: /* sbb */
@@ -2684,8 +3126,6 @@ special_insn:
2684 break; 3126 break;
2685 case 0x1f: /* pop ds */ 3127 case 0x1f: /* pop ds */
2686 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_DS); 3128 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_DS);
2687 if (rc != X86EMUL_CONTINUE)
2688 goto done;
2689 break; 3129 break;
2690 case 0x20 ... 0x25: 3130 case 0x20 ... 0x25:
2691 and: /* and */ 3131 and: /* and */
@@ -2709,58 +3149,29 @@ special_insn:
2709 case 0x48 ... 0x4f: /* dec r16/r32 */ 3149 case 0x48 ... 0x4f: /* dec r16/r32 */
2710 emulate_1op("dec", c->dst, ctxt->eflags); 3150 emulate_1op("dec", c->dst, ctxt->eflags);
2711 break; 3151 break;
2712 case 0x50 ... 0x57: /* push reg */
2713 emulate_push(ctxt, ops);
2714 break;
2715 case 0x58 ... 0x5f: /* pop reg */ 3152 case 0x58 ... 0x5f: /* pop reg */
2716 pop_instruction: 3153 pop_instruction:
2717 rc = emulate_pop(ctxt, ops, &c->dst.val, c->op_bytes); 3154 rc = emulate_pop(ctxt, ops, &c->dst.val, c->op_bytes);
2718 if (rc != X86EMUL_CONTINUE)
2719 goto done;
2720 break; 3155 break;
2721 case 0x60: /* pusha */ 3156 case 0x60: /* pusha */
2722 rc = emulate_pusha(ctxt, ops); 3157 rc = emulate_pusha(ctxt, ops);
2723 if (rc != X86EMUL_CONTINUE)
2724 goto done;
2725 break; 3158 break;
2726 case 0x61: /* popa */ 3159 case 0x61: /* popa */
2727 rc = emulate_popa(ctxt, ops); 3160 rc = emulate_popa(ctxt, ops);
2728 if (rc != X86EMUL_CONTINUE)
2729 goto done;
2730 break; 3161 break;
2731 case 0x63: /* movsxd */ 3162 case 0x63: /* movsxd */
2732 if (ctxt->mode != X86EMUL_MODE_PROT64) 3163 if (ctxt->mode != X86EMUL_MODE_PROT64)
2733 goto cannot_emulate; 3164 goto cannot_emulate;
2734 c->dst.val = (s32) c->src.val; 3165 c->dst.val = (s32) c->src.val;
2735 break; 3166 break;
2736 case 0x68: /* push imm */
2737 case 0x6a: /* push imm8 */
2738 emulate_push(ctxt, ops);
2739 break;
2740 case 0x6c: /* insb */ 3167 case 0x6c: /* insb */
2741 case 0x6d: /* insw/insd */ 3168 case 0x6d: /* insw/insd */
2742 c->dst.bytes = min(c->dst.bytes, 4u); 3169 c->src.val = c->regs[VCPU_REGS_RDX];
2743 if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX], 3170 goto do_io_in;
2744 c->dst.bytes)) {
2745 emulate_gp(ctxt, 0);
2746 goto done;
2747 }
2748 if (!pio_in_emulated(ctxt, ops, c->dst.bytes,
2749 c->regs[VCPU_REGS_RDX], &c->dst.val))
2750 goto done; /* IO is needed, skip writeback */
2751 break;
2752 case 0x6e: /* outsb */ 3171 case 0x6e: /* outsb */
2753 case 0x6f: /* outsw/outsd */ 3172 case 0x6f: /* outsw/outsd */
2754 c->src.bytes = min(c->src.bytes, 4u); 3173 c->dst.val = c->regs[VCPU_REGS_RDX];
2755 if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX], 3174 goto do_io_out;
2756 c->src.bytes)) {
2757 emulate_gp(ctxt, 0);
2758 goto done;
2759 }
2760 ops->pio_out_emulated(c->src.bytes, c->regs[VCPU_REGS_RDX],
2761 &c->src.val, 1, ctxt->vcpu);
2762
2763 c->dst.type = OP_NONE; /* nothing to writeback */
2764 break; 3175 break;
2765 case 0x70 ... 0x7f: /* jcc (short) */ 3176 case 0x70 ... 0x7f: /* jcc (short) */
2766 if (test_cc(c->b, ctxt->eflags)) 3177 if (test_cc(c->b, ctxt->eflags))
@@ -2793,29 +3204,15 @@ special_insn:
2793 case 0x86 ... 0x87: /* xchg */ 3204 case 0x86 ... 0x87: /* xchg */
2794 xchg: 3205 xchg:
2795 /* Write back the register source. */ 3206 /* Write back the register source. */
2796 switch (c->dst.bytes) { 3207 c->src.val = c->dst.val;
2797 case 1: 3208 write_register_operand(&c->src);
2798 *(u8 *) c->src.ptr = (u8) c->dst.val;
2799 break;
2800 case 2:
2801 *(u16 *) c->src.ptr = (u16) c->dst.val;
2802 break;
2803 case 4:
2804 *c->src.ptr = (u32) c->dst.val;
2805 break; /* 64b reg: zero-extend */
2806 case 8:
2807 *c->src.ptr = c->dst.val;
2808 break;
2809 }
2810 /* 3209 /*
2811 * Write back the memory destination with implicit LOCK 3210 * Write back the memory destination with implicit LOCK
2812 * prefix. 3211 * prefix.
2813 */ 3212 */
2814 c->dst.val = c->src.val; 3213 c->dst.val = c->src.orig_val;
2815 c->lock_prefix = 1; 3214 c->lock_prefix = 1;
2816 break; 3215 break;
2817 case 0x88 ... 0x8b: /* mov */
2818 goto mov;
2819 case 0x8c: /* mov r/m, sreg */ 3216 case 0x8c: /* mov r/m, sreg */
2820 if (c->modrm_reg > VCPU_SREG_GS) { 3217 if (c->modrm_reg > VCPU_SREG_GS) {
2821 emulate_ud(ctxt); 3218 emulate_ud(ctxt);
@@ -2824,7 +3221,7 @@ special_insn:
2824 c->dst.val = ops->get_segment_selector(c->modrm_reg, ctxt->vcpu); 3221 c->dst.val = ops->get_segment_selector(c->modrm_reg, ctxt->vcpu);
2825 break; 3222 break;
2826 case 0x8d: /* lea r16/r32, m */ 3223 case 0x8d: /* lea r16/r32, m */
2827 c->dst.val = c->modrm_ea; 3224 c->dst.val = c->src.addr.mem;
2828 break; 3225 break;
2829 case 0x8e: { /* mov seg, r/m16 */ 3226 case 0x8e: { /* mov seg, r/m16 */
2830 uint16_t sel; 3227 uint16_t sel;
@@ -2847,76 +3244,87 @@ special_insn:
2847 } 3244 }
2848 case 0x8f: /* pop (sole member of Grp1a) */ 3245 case 0x8f: /* pop (sole member of Grp1a) */
2849 rc = emulate_grp1a(ctxt, ops); 3246 rc = emulate_grp1a(ctxt, ops);
2850 if (rc != X86EMUL_CONTINUE)
2851 goto done;
2852 break; 3247 break;
2853 case 0x90: /* nop / xchg r8,rax */ 3248 case 0x90 ... 0x97: /* nop / xchg reg, rax */
2854 if (c->dst.ptr == (unsigned long *)&c->regs[VCPU_REGS_RAX]) { 3249 if (c->dst.addr.reg == &c->regs[VCPU_REGS_RAX])
2855 c->dst.type = OP_NONE; /* nop */
2856 break; 3250 break;
2857 }
2858 case 0x91 ... 0x97: /* xchg reg,rax */
2859 c->src.type = OP_REG;
2860 c->src.bytes = c->op_bytes;
2861 c->src.ptr = (unsigned long *) &c->regs[VCPU_REGS_RAX];
2862 c->src.val = *(c->src.ptr);
2863 goto xchg; 3251 goto xchg;
3252 case 0x98: /* cbw/cwde/cdqe */
3253 switch (c->op_bytes) {
3254 case 2: c->dst.val = (s8)c->dst.val; break;
3255 case 4: c->dst.val = (s16)c->dst.val; break;
3256 case 8: c->dst.val = (s32)c->dst.val; break;
3257 }
3258 break;
2864 case 0x9c: /* pushf */ 3259 case 0x9c: /* pushf */
2865 c->src.val = (unsigned long) ctxt->eflags; 3260 c->src.val = (unsigned long) ctxt->eflags;
2866 emulate_push(ctxt, ops); 3261 emulate_push(ctxt, ops);
2867 break; 3262 break;
2868 case 0x9d: /* popf */ 3263 case 0x9d: /* popf */
2869 c->dst.type = OP_REG; 3264 c->dst.type = OP_REG;
2870 c->dst.ptr = (unsigned long *) &ctxt->eflags; 3265 c->dst.addr.reg = &ctxt->eflags;
2871 c->dst.bytes = c->op_bytes; 3266 c->dst.bytes = c->op_bytes;
2872 rc = emulate_popf(ctxt, ops, &c->dst.val, c->op_bytes); 3267 rc = emulate_popf(ctxt, ops, &c->dst.val, c->op_bytes);
2873 if (rc != X86EMUL_CONTINUE)
2874 goto done;
2875 break; 3268 break;
2876 case 0xa0 ... 0xa3: /* mov */
2877 case 0xa4 ... 0xa5: /* movs */
2878 goto mov;
2879 case 0xa6 ... 0xa7: /* cmps */ 3269 case 0xa6 ... 0xa7: /* cmps */
2880 c->dst.type = OP_NONE; /* Disable writeback. */ 3270 c->dst.type = OP_NONE; /* Disable writeback. */
2881 DPRINTF("cmps: mem1=0x%p mem2=0x%p\n", c->src.ptr, c->dst.ptr); 3271 DPRINTF("cmps: mem1=0x%p mem2=0x%p\n", c->src.addr.mem, c->dst.addr.mem);
2882 goto cmp; 3272 goto cmp;
2883 case 0xa8 ... 0xa9: /* test ax, imm */ 3273 case 0xa8 ... 0xa9: /* test ax, imm */
2884 goto test; 3274 goto test;
2885 case 0xaa ... 0xab: /* stos */
2886 c->dst.val = c->regs[VCPU_REGS_RAX];
2887 break;
2888 case 0xac ... 0xad: /* lods */
2889 goto mov;
2890 case 0xae ... 0xaf: /* scas */ 3275 case 0xae ... 0xaf: /* scas */
2891 DPRINTF("Urk! I don't handle SCAS.\n"); 3276 goto cmp;
2892 goto cannot_emulate;
2893 case 0xb0 ... 0xbf: /* mov r, imm */
2894 goto mov;
2895 case 0xc0 ... 0xc1: 3277 case 0xc0 ... 0xc1:
2896 emulate_grp2(ctxt); 3278 emulate_grp2(ctxt);
2897 break; 3279 break;
2898 case 0xc3: /* ret */ 3280 case 0xc3: /* ret */
2899 c->dst.type = OP_REG; 3281 c->dst.type = OP_REG;
2900 c->dst.ptr = &c->eip; 3282 c->dst.addr.reg = &c->eip;
2901 c->dst.bytes = c->op_bytes; 3283 c->dst.bytes = c->op_bytes;
2902 goto pop_instruction; 3284 goto pop_instruction;
2903 case 0xc6 ... 0xc7: /* mov (sole member of Grp11) */ 3285 case 0xc4: /* les */
2904 mov: 3286 rc = emulate_load_segment(ctxt, ops, VCPU_SREG_ES);
2905 c->dst.val = c->src.val; 3287 break;
3288 case 0xc5: /* lds */
3289 rc = emulate_load_segment(ctxt, ops, VCPU_SREG_DS);
2906 break; 3290 break;
2907 case 0xcb: /* ret far */ 3291 case 0xcb: /* ret far */
2908 rc = emulate_ret_far(ctxt, ops); 3292 rc = emulate_ret_far(ctxt, ops);
2909 if (rc != X86EMUL_CONTINUE) 3293 break;
2910 goto done; 3294 case 0xcc: /* int3 */
3295 irq = 3;
3296 goto do_interrupt;
3297 case 0xcd: /* int n */
3298 irq = c->src.val;
3299 do_interrupt:
3300 rc = emulate_int(ctxt, ops, irq);
3301 break;
3302 case 0xce: /* into */
3303 if (ctxt->eflags & EFLG_OF) {
3304 irq = 4;
3305 goto do_interrupt;
3306 }
3307 break;
3308 case 0xcf: /* iret */
3309 rc = emulate_iret(ctxt, ops);
2911 break; 3310 break;
2912 case 0xd0 ... 0xd1: /* Grp2 */ 3311 case 0xd0 ... 0xd1: /* Grp2 */
2913 c->src.val = 1;
2914 emulate_grp2(ctxt); 3312 emulate_grp2(ctxt);
2915 break; 3313 break;
2916 case 0xd2 ... 0xd3: /* Grp2 */ 3314 case 0xd2 ... 0xd3: /* Grp2 */
2917 c->src.val = c->regs[VCPU_REGS_RCX]; 3315 c->src.val = c->regs[VCPU_REGS_RCX];
2918 emulate_grp2(ctxt); 3316 emulate_grp2(ctxt);
2919 break; 3317 break;
3318 case 0xe0 ... 0xe2: /* loop/loopz/loopnz */
3319 register_address_increment(c, &c->regs[VCPU_REGS_RCX], -1);
3320 if (address_mask(c, c->regs[VCPU_REGS_RCX]) != 0 &&
3321 (c->b == 0xe2 || test_cc(c->b ^ 0x5, ctxt->eflags)))
3322 jmp_rel(c, c->src.val);
3323 break;
3324 case 0xe3: /* jcxz/jecxz/jrcxz */
3325 if (address_mask(c, c->regs[VCPU_REGS_RCX]) == 0)
3326 jmp_rel(c, c->src.val);
3327 break;
2920 case 0xe4: /* inb */ 3328 case 0xe4: /* inb */
2921 case 0xe5: /* in */ 3329 case 0xe5: /* in */
2922 goto do_io_in; 3330 goto do_io_in;
@@ -2964,15 +3372,16 @@ special_insn:
2964 break; 3372 break;
2965 case 0xee: /* out dx,al */ 3373 case 0xee: /* out dx,al */
2966 case 0xef: /* out dx,(e/r)ax */ 3374 case 0xef: /* out dx,(e/r)ax */
2967 c->src.val = c->regs[VCPU_REGS_RDX]; 3375 c->dst.val = c->regs[VCPU_REGS_RDX];
2968 do_io_out: 3376 do_io_out:
2969 c->dst.bytes = min(c->dst.bytes, 4u); 3377 c->src.bytes = min(c->src.bytes, 4u);
2970 if (!emulator_io_permited(ctxt, ops, c->src.val, c->dst.bytes)) { 3378 if (!emulator_io_permited(ctxt, ops, c->dst.val,
3379 c->src.bytes)) {
2971 emulate_gp(ctxt, 0); 3380 emulate_gp(ctxt, 0);
2972 goto done; 3381 goto done;
2973 } 3382 }
2974 ops->pio_out_emulated(c->dst.bytes, c->src.val, &c->dst.val, 1, 3383 ops->pio_out_emulated(c->src.bytes, c->dst.val,
2975 ctxt->vcpu); 3384 &c->src.val, 1, ctxt->vcpu);
2976 c->dst.type = OP_NONE; /* Disable writeback. */ 3385 c->dst.type = OP_NONE; /* Disable writeback. */
2977 break; 3386 break;
2978 case 0xf4: /* hlt */ 3387 case 0xf4: /* hlt */
@@ -2981,24 +3390,22 @@ special_insn:
2981 case 0xf5: /* cmc */ 3390 case 0xf5: /* cmc */
2982 /* complement carry flag from eflags reg */ 3391 /* complement carry flag from eflags reg */
2983 ctxt->eflags ^= EFLG_CF; 3392 ctxt->eflags ^= EFLG_CF;
2984 c->dst.type = OP_NONE; /* Disable writeback. */
2985 break; 3393 break;
2986 case 0xf6 ... 0xf7: /* Grp3 */ 3394 case 0xf6 ... 0xf7: /* Grp3 */
2987 if (!emulate_grp3(ctxt, ops)) 3395 rc = emulate_grp3(ctxt, ops);
2988 goto cannot_emulate;
2989 break; 3396 break;
2990 case 0xf8: /* clc */ 3397 case 0xf8: /* clc */
2991 ctxt->eflags &= ~EFLG_CF; 3398 ctxt->eflags &= ~EFLG_CF;
2992 c->dst.type = OP_NONE; /* Disable writeback. */ 3399 break;
3400 case 0xf9: /* stc */
3401 ctxt->eflags |= EFLG_CF;
2993 break; 3402 break;
2994 case 0xfa: /* cli */ 3403 case 0xfa: /* cli */
2995 if (emulator_bad_iopl(ctxt, ops)) { 3404 if (emulator_bad_iopl(ctxt, ops)) {
2996 emulate_gp(ctxt, 0); 3405 emulate_gp(ctxt, 0);
2997 goto done; 3406 goto done;
2998 } else { 3407 } else
2999 ctxt->eflags &= ~X86_EFLAGS_IF; 3408 ctxt->eflags &= ~X86_EFLAGS_IF;
3000 c->dst.type = OP_NONE; /* Disable writeback. */
3001 }
3002 break; 3409 break;
3003 case 0xfb: /* sti */ 3410 case 0xfb: /* sti */
3004 if (emulator_bad_iopl(ctxt, ops)) { 3411 if (emulator_bad_iopl(ctxt, ops)) {
@@ -3007,29 +3414,29 @@ special_insn:
3007 } else { 3414 } else {
3008 ctxt->interruptibility = KVM_X86_SHADOW_INT_STI; 3415 ctxt->interruptibility = KVM_X86_SHADOW_INT_STI;
3009 ctxt->eflags |= X86_EFLAGS_IF; 3416 ctxt->eflags |= X86_EFLAGS_IF;
3010 c->dst.type = OP_NONE; /* Disable writeback. */
3011 } 3417 }
3012 break; 3418 break;
3013 case 0xfc: /* cld */ 3419 case 0xfc: /* cld */
3014 ctxt->eflags &= ~EFLG_DF; 3420 ctxt->eflags &= ~EFLG_DF;
3015 c->dst.type = OP_NONE; /* Disable writeback. */
3016 break; 3421 break;
3017 case 0xfd: /* std */ 3422 case 0xfd: /* std */
3018 ctxt->eflags |= EFLG_DF; 3423 ctxt->eflags |= EFLG_DF;
3019 c->dst.type = OP_NONE; /* Disable writeback. */
3020 break; 3424 break;
3021 case 0xfe: /* Grp4 */ 3425 case 0xfe: /* Grp4 */
3022 grp45: 3426 grp45:
3023 rc = emulate_grp45(ctxt, ops); 3427 rc = emulate_grp45(ctxt, ops);
3024 if (rc != X86EMUL_CONTINUE)
3025 goto done;
3026 break; 3428 break;
3027 case 0xff: /* Grp5 */ 3429 case 0xff: /* Grp5 */
3028 if (c->modrm_reg == 5) 3430 if (c->modrm_reg == 5)
3029 goto jump_far; 3431 goto jump_far;
3030 goto grp45; 3432 goto grp45;
3433 default:
3434 goto cannot_emulate;
3031 } 3435 }
3032 3436
3437 if (rc != X86EMUL_CONTINUE)
3438 goto done;
3439
3033writeback: 3440writeback:
3034 rc = writeback(ctxt, ops); 3441 rc = writeback(ctxt, ops);
3035 if (rc != X86EMUL_CONTINUE) 3442 if (rc != X86EMUL_CONTINUE)
@@ -3050,25 +3457,32 @@ writeback:
3050 &c->dst); 3457 &c->dst);
3051 3458
3052 if (c->rep_prefix && (c->d & String)) { 3459 if (c->rep_prefix && (c->d & String)) {
3053 struct read_cache *rc = &ctxt->decode.io_read; 3460 struct read_cache *r = &ctxt->decode.io_read;
3054 register_address_increment(c, &c->regs[VCPU_REGS_RCX], -1); 3461 register_address_increment(c, &c->regs[VCPU_REGS_RCX], -1);
3055 /* 3462
3056 * Re-enter guest when pio read ahead buffer is empty or, 3463 if (!string_insn_completed(ctxt)) {
3057 * if it is not used, after each 1024 iteration. 3464 /*
3058 */ 3465 * Re-enter guest when pio read ahead buffer is empty
3059 if ((rc->end == 0 && !(c->regs[VCPU_REGS_RCX] & 0x3ff)) || 3466 * or, if it is not used, after each 1024 iteration.
3060 (rc->end != 0 && rc->end == rc->pos)) 3467 */
3061 ctxt->restart = false; 3468 if ((r->end != 0 || c->regs[VCPU_REGS_RCX] & 0x3ff) &&
3469 (r->end == 0 || r->end != r->pos)) {
3470 /*
3471 * Reset read cache. Usually happens before
3472 * decode, but since instruction is restarted
3473 * we have to do it here.
3474 */
3475 ctxt->decode.mem_read.end = 0;
3476 return EMULATION_RESTART;
3477 }
3478 goto done; /* skip rip writeback */
3479 }
3062 } 3480 }
3063 /* 3481
3064 * reset read cache here in case string instruction is restared
3065 * without decoding
3066 */
3067 ctxt->decode.mem_read.end = 0;
3068 ctxt->eip = c->eip; 3482 ctxt->eip = c->eip;
3069 3483
3070done: 3484done:
3071 return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0; 3485 return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK;
3072 3486
3073twobyte_insn: 3487twobyte_insn:
3074 switch (c->b) { 3488 switch (c->b) {
@@ -3091,7 +3505,7 @@ twobyte_insn:
3091 c->dst.type = OP_NONE; 3505 c->dst.type = OP_NONE;
3092 break; 3506 break;
3093 case 2: /* lgdt */ 3507 case 2: /* lgdt */
3094 rc = read_descriptor(ctxt, ops, c->src.ptr, 3508 rc = read_descriptor(ctxt, ops, c->src.addr.mem,
3095 &size, &address, c->op_bytes); 3509 &size, &address, c->op_bytes);
3096 if (rc != X86EMUL_CONTINUE) 3510 if (rc != X86EMUL_CONTINUE)
3097 goto done; 3511 goto done;
@@ -3104,14 +3518,12 @@ twobyte_insn:
3104 switch (c->modrm_rm) { 3518 switch (c->modrm_rm) {
3105 case 1: 3519 case 1:
3106 rc = kvm_fix_hypercall(ctxt->vcpu); 3520 rc = kvm_fix_hypercall(ctxt->vcpu);
3107 if (rc != X86EMUL_CONTINUE)
3108 goto done;
3109 break; 3521 break;
3110 default: 3522 default:
3111 goto cannot_emulate; 3523 goto cannot_emulate;
3112 } 3524 }
3113 } else { 3525 } else {
3114 rc = read_descriptor(ctxt, ops, c->src.ptr, 3526 rc = read_descriptor(ctxt, ops, c->src.addr.mem,
3115 &size, &address, 3527 &size, &address,
3116 c->op_bytes); 3528 c->op_bytes);
3117 if (rc != X86EMUL_CONTINUE) 3529 if (rc != X86EMUL_CONTINUE)
@@ -3126,7 +3538,7 @@ twobyte_insn:
3126 c->dst.val = ops->get_cr(0, ctxt->vcpu); 3538 c->dst.val = ops->get_cr(0, ctxt->vcpu);
3127 break; 3539 break;
3128 case 6: /* lmsw */ 3540 case 6: /* lmsw */
3129 ops->set_cr(0, (ops->get_cr(0, ctxt->vcpu) & ~0x0ful) | 3541 ops->set_cr(0, (ops->get_cr(0, ctxt->vcpu) & ~0x0eul) |
3130 (c->src.val & 0x0f), ctxt->vcpu); 3542 (c->src.val & 0x0f), ctxt->vcpu);
3131 c->dst.type = OP_NONE; 3543 c->dst.type = OP_NONE;
3132 break; 3544 break;
@@ -3134,7 +3546,7 @@ twobyte_insn:
3134 emulate_ud(ctxt); 3546 emulate_ud(ctxt);
3135 goto done; 3547 goto done;
3136 case 7: /* invlpg*/ 3548 case 7: /* invlpg*/
3137 emulate_invlpg(ctxt->vcpu, c->modrm_ea); 3549 emulate_invlpg(ctxt->vcpu, c->src.addr.mem);
3138 /* Disable writeback. */ 3550 /* Disable writeback. */
3139 c->dst.type = OP_NONE; 3551 c->dst.type = OP_NONE;
3140 break; 3552 break;
@@ -3144,23 +3556,16 @@ twobyte_insn:
3144 break; 3556 break;
3145 case 0x05: /* syscall */ 3557 case 0x05: /* syscall */
3146 rc = emulate_syscall(ctxt, ops); 3558 rc = emulate_syscall(ctxt, ops);
3147 if (rc != X86EMUL_CONTINUE)
3148 goto done;
3149 else
3150 goto writeback;
3151 break; 3559 break;
3152 case 0x06: 3560 case 0x06:
3153 emulate_clts(ctxt->vcpu); 3561 emulate_clts(ctxt->vcpu);
3154 c->dst.type = OP_NONE;
3155 break; 3562 break;
3156 case 0x09: /* wbinvd */ 3563 case 0x09: /* wbinvd */
3157 kvm_emulate_wbinvd(ctxt->vcpu); 3564 kvm_emulate_wbinvd(ctxt->vcpu);
3158 c->dst.type = OP_NONE;
3159 break; 3565 break;
3160 case 0x08: /* invd */ 3566 case 0x08: /* invd */
3161 case 0x0d: /* GrpP (prefetch) */ 3567 case 0x0d: /* GrpP (prefetch) */
3162 case 0x18: /* Grp16 (prefetch/nop) */ 3568 case 0x18: /* Grp16 (prefetch/nop) */
3163 c->dst.type = OP_NONE;
3164 break; 3569 break;
3165 case 0x20: /* mov cr, reg */ 3570 case 0x20: /* mov cr, reg */
3166 switch (c->modrm_reg) { 3571 switch (c->modrm_reg) {
@@ -3170,8 +3575,7 @@ twobyte_insn:
3170 emulate_ud(ctxt); 3575 emulate_ud(ctxt);
3171 goto done; 3576 goto done;
3172 } 3577 }
3173 c->regs[c->modrm_rm] = ops->get_cr(c->modrm_reg, ctxt->vcpu); 3578 c->dst.val = ops->get_cr(c->modrm_reg, ctxt->vcpu);
3174 c->dst.type = OP_NONE; /* no writeback */
3175 break; 3579 break;
3176 case 0x21: /* mov from dr to reg */ 3580 case 0x21: /* mov from dr to reg */
3177 if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) && 3581 if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) &&
@@ -3179,11 +3583,10 @@ twobyte_insn:
3179 emulate_ud(ctxt); 3583 emulate_ud(ctxt);
3180 goto done; 3584 goto done;
3181 } 3585 }
3182 ops->get_dr(c->modrm_reg, &c->regs[c->modrm_rm], ctxt->vcpu); 3586 ops->get_dr(c->modrm_reg, &c->dst.val, ctxt->vcpu);
3183 c->dst.type = OP_NONE; /* no writeback */
3184 break; 3587 break;
3185 case 0x22: /* mov reg, cr */ 3588 case 0x22: /* mov reg, cr */
3186 if (ops->set_cr(c->modrm_reg, c->modrm_val, ctxt->vcpu)) { 3589 if (ops->set_cr(c->modrm_reg, c->src.val, ctxt->vcpu)) {
3187 emulate_gp(ctxt, 0); 3590 emulate_gp(ctxt, 0);
3188 goto done; 3591 goto done;
3189 } 3592 }
@@ -3196,7 +3599,7 @@ twobyte_insn:
3196 goto done; 3599 goto done;
3197 } 3600 }
3198 3601
3199 if (ops->set_dr(c->modrm_reg, c->regs[c->modrm_rm] & 3602 if (ops->set_dr(c->modrm_reg, c->src.val &
3200 ((ctxt->mode == X86EMUL_MODE_PROT64) ? 3603 ((ctxt->mode == X86EMUL_MODE_PROT64) ?
3201 ~0ULL : ~0U), ctxt->vcpu) < 0) { 3604 ~0ULL : ~0U), ctxt->vcpu) < 0) {
3202 /* #UD condition is already handled by the code above */ 3605 /* #UD condition is already handled by the code above */
@@ -3215,7 +3618,6 @@ twobyte_insn:
3215 goto done; 3618 goto done;
3216 } 3619 }
3217 rc = X86EMUL_CONTINUE; 3620 rc = X86EMUL_CONTINUE;
3218 c->dst.type = OP_NONE;
3219 break; 3621 break;
3220 case 0x32: 3622 case 0x32:
3221 /* rdmsr */ 3623 /* rdmsr */
@@ -3227,21 +3629,12 @@ twobyte_insn:
3227 c->regs[VCPU_REGS_RDX] = msr_data >> 32; 3629 c->regs[VCPU_REGS_RDX] = msr_data >> 32;
3228 } 3630 }
3229 rc = X86EMUL_CONTINUE; 3631 rc = X86EMUL_CONTINUE;
3230 c->dst.type = OP_NONE;
3231 break; 3632 break;
3232 case 0x34: /* sysenter */ 3633 case 0x34: /* sysenter */
3233 rc = emulate_sysenter(ctxt, ops); 3634 rc = emulate_sysenter(ctxt, ops);
3234 if (rc != X86EMUL_CONTINUE)
3235 goto done;
3236 else
3237 goto writeback;
3238 break; 3635 break;
3239 case 0x35: /* sysexit */ 3636 case 0x35: /* sysexit */
3240 rc = emulate_sysexit(ctxt, ops); 3637 rc = emulate_sysexit(ctxt, ops);
3241 if (rc != X86EMUL_CONTINUE)
3242 goto done;
3243 else
3244 goto writeback;
3245 break; 3638 break;
3246 case 0x40 ... 0x4f: /* cmov */ 3639 case 0x40 ... 0x4f: /* cmov */
3247 c->dst.val = c->dst.orig_val = c->src.val; 3640 c->dst.val = c->dst.orig_val = c->src.val;
@@ -3251,15 +3644,15 @@ twobyte_insn:
3251 case 0x80 ... 0x8f: /* jnz rel, etc*/ 3644 case 0x80 ... 0x8f: /* jnz rel, etc*/
3252 if (test_cc(c->b, ctxt->eflags)) 3645 if (test_cc(c->b, ctxt->eflags))
3253 jmp_rel(c, c->src.val); 3646 jmp_rel(c, c->src.val);
3254 c->dst.type = OP_NONE; 3647 break;
3648 case 0x90 ... 0x9f: /* setcc r/m8 */
3649 c->dst.val = test_cc(c->b, ctxt->eflags);
3255 break; 3650 break;
3256 case 0xa0: /* push fs */ 3651 case 0xa0: /* push fs */
3257 emulate_push_sreg(ctxt, ops, VCPU_SREG_FS); 3652 emulate_push_sreg(ctxt, ops, VCPU_SREG_FS);
3258 break; 3653 break;
3259 case 0xa1: /* pop fs */ 3654 case 0xa1: /* pop fs */
3260 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_FS); 3655 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_FS);
3261 if (rc != X86EMUL_CONTINUE)
3262 goto done;
3263 break; 3656 break;
3264 case 0xa3: 3657 case 0xa3:
3265 bt: /* bt */ 3658 bt: /* bt */
@@ -3277,13 +3670,9 @@ twobyte_insn:
3277 break; 3670 break;
3278 case 0xa9: /* pop gs */ 3671 case 0xa9: /* pop gs */
3279 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_GS); 3672 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_GS);
3280 if (rc != X86EMUL_CONTINUE)
3281 goto done;
3282 break; 3673 break;
3283 case 0xab: 3674 case 0xab:
3284 bts: /* bts */ 3675 bts: /* bts */
3285 /* only subword offset */
3286 c->src.val &= (c->dst.bytes << 3) - 1;
3287 emulate_2op_SrcV_nobyte("bts", c->src, c->dst, ctxt->eflags); 3676 emulate_2op_SrcV_nobyte("bts", c->src, c->dst, ctxt->eflags);
3288 break; 3677 break;
3289 case 0xac: /* shrd imm8, r, r/m */ 3678 case 0xac: /* shrd imm8, r, r/m */
@@ -3306,15 +3695,22 @@ twobyte_insn:
3306 } else { 3695 } else {
3307 /* Failure: write the value we saw to EAX. */ 3696 /* Failure: write the value we saw to EAX. */
3308 c->dst.type = OP_REG; 3697 c->dst.type = OP_REG;
3309 c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX]; 3698 c->dst.addr.reg = (unsigned long *)&c->regs[VCPU_REGS_RAX];
3310 } 3699 }
3311 break; 3700 break;
3701 case 0xb2: /* lss */
3702 rc = emulate_load_segment(ctxt, ops, VCPU_SREG_SS);
3703 break;
3312 case 0xb3: 3704 case 0xb3:
3313 btr: /* btr */ 3705 btr: /* btr */
3314 /* only subword offset */
3315 c->src.val &= (c->dst.bytes << 3) - 1;
3316 emulate_2op_SrcV_nobyte("btr", c->src, c->dst, ctxt->eflags); 3706 emulate_2op_SrcV_nobyte("btr", c->src, c->dst, ctxt->eflags);
3317 break; 3707 break;
3708 case 0xb4: /* lfs */
3709 rc = emulate_load_segment(ctxt, ops, VCPU_SREG_FS);
3710 break;
3711 case 0xb5: /* lgs */
3712 rc = emulate_load_segment(ctxt, ops, VCPU_SREG_GS);
3713 break;
3318 case 0xb6 ... 0xb7: /* movzx */ 3714 case 0xb6 ... 0xb7: /* movzx */
3319 c->dst.bytes = c->op_bytes; 3715 c->dst.bytes = c->op_bytes;
3320 c->dst.val = (c->d & ByteOp) ? (u8) c->src.val 3716 c->dst.val = (c->d & ByteOp) ? (u8) c->src.val
@@ -3334,15 +3730,43 @@ twobyte_insn:
3334 break; 3730 break;
3335 case 0xbb: 3731 case 0xbb:
3336 btc: /* btc */ 3732 btc: /* btc */
3337 /* only subword offset */
3338 c->src.val &= (c->dst.bytes << 3) - 1;
3339 emulate_2op_SrcV_nobyte("btc", c->src, c->dst, ctxt->eflags); 3733 emulate_2op_SrcV_nobyte("btc", c->src, c->dst, ctxt->eflags);
3340 break; 3734 break;
3735 case 0xbc: { /* bsf */
3736 u8 zf;
3737 __asm__ ("bsf %2, %0; setz %1"
3738 : "=r"(c->dst.val), "=q"(zf)
3739 : "r"(c->src.val));
3740 ctxt->eflags &= ~X86_EFLAGS_ZF;
3741 if (zf) {
3742 ctxt->eflags |= X86_EFLAGS_ZF;
3743 c->dst.type = OP_NONE; /* Disable writeback. */
3744 }
3745 break;
3746 }
3747 case 0xbd: { /* bsr */
3748 u8 zf;
3749 __asm__ ("bsr %2, %0; setz %1"
3750 : "=r"(c->dst.val), "=q"(zf)
3751 : "r"(c->src.val));
3752 ctxt->eflags &= ~X86_EFLAGS_ZF;
3753 if (zf) {
3754 ctxt->eflags |= X86_EFLAGS_ZF;
3755 c->dst.type = OP_NONE; /* Disable writeback. */
3756 }
3757 break;
3758 }
3341 case 0xbe ... 0xbf: /* movsx */ 3759 case 0xbe ... 0xbf: /* movsx */
3342 c->dst.bytes = c->op_bytes; 3760 c->dst.bytes = c->op_bytes;
3343 c->dst.val = (c->d & ByteOp) ? (s8) c->src.val : 3761 c->dst.val = (c->d & ByteOp) ? (s8) c->src.val :
3344 (s16) c->src.val; 3762 (s16) c->src.val;
3345 break; 3763 break;
3764 case 0xc0 ... 0xc1: /* xadd */
3765 emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags);
3766 /* Write back the register source. */
3767 c->src.val = c->dst.orig_val;
3768 write_register_operand(&c->src);
3769 break;
3346 case 0xc3: /* movnti */ 3770 case 0xc3: /* movnti */
3347 c->dst.bytes = c->op_bytes; 3771 c->dst.bytes = c->op_bytes;
3348 c->dst.val = (c->op_bytes == 4) ? (u32) c->src.val : 3772 c->dst.val = (c->op_bytes == 4) ? (u32) c->src.val :
@@ -3350,10 +3774,14 @@ twobyte_insn:
3350 break; 3774 break;
3351 case 0xc7: /* Grp9 (cmpxchg8b) */ 3775 case 0xc7: /* Grp9 (cmpxchg8b) */
3352 rc = emulate_grp9(ctxt, ops); 3776 rc = emulate_grp9(ctxt, ops);
3353 if (rc != X86EMUL_CONTINUE)
3354 goto done;
3355 break; 3777 break;
3778 default:
3779 goto cannot_emulate;
3356 } 3780 }
3781
3782 if (rc != X86EMUL_CONTINUE)
3783 goto done;
3784
3357 goto writeback; 3785 goto writeback;
3358 3786
3359cannot_emulate: 3787cannot_emulate:
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index ddeb2314b522..efad72385058 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -5,7 +5,7 @@
5 * Copyright (c) 2006 Intel Corporation 5 * Copyright (c) 2006 Intel Corporation
6 * Copyright (c) 2007 Keir Fraser, XenSource Inc 6 * Copyright (c) 2007 Keir Fraser, XenSource Inc
7 * Copyright (c) 2008 Intel Corporation 7 * Copyright (c) 2008 Intel Corporation
8 * Copyright 2009 Red Hat, Inc. and/or its affilates. 8 * Copyright 2009 Red Hat, Inc. and/or its affiliates.
9 * 9 *
10 * Permission is hereby granted, free of charge, to any person obtaining a copy 10 * Permission is hereby granted, free of charge, to any person obtaining a copy
11 * of this software and associated documentation files (the "Software"), to deal 11 * of this software and associated documentation files (the "Software"), to deal
@@ -232,15 +232,6 @@ static void pit_latch_status(struct kvm *kvm, int channel)
232 } 232 }
233} 233}
234 234
235int pit_has_pending_timer(struct kvm_vcpu *vcpu)
236{
237 struct kvm_pit *pit = vcpu->kvm->arch.vpit;
238
239 if (pit && kvm_vcpu_is_bsp(vcpu) && pit->pit_state.irq_ack)
240 return atomic_read(&pit->pit_state.pit_timer.pending);
241 return 0;
242}
243
244static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian) 235static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian)
245{ 236{
246 struct kvm_kpit_state *ps = container_of(kian, struct kvm_kpit_state, 237 struct kvm_kpit_state *ps = container_of(kian, struct kvm_kpit_state,
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index 4b7b73ce2098..f628234fbeca 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -3,7 +3,7 @@
3 * 3 *
4 * Copyright (c) 2003-2004 Fabrice Bellard 4 * Copyright (c) 2003-2004 Fabrice Bellard
5 * Copyright (c) 2007 Intel Corporation 5 * Copyright (c) 2007 Intel Corporation
6 * Copyright 2009 Red Hat, Inc. and/or its affilates. 6 * Copyright 2009 Red Hat, Inc. and/or its affiliates.
7 * 7 *
8 * Permission is hereby granted, free of charge, to any person obtaining a copy 8 * Permission is hereby granted, free of charge, to any person obtaining a copy
9 * of this software and associated documentation files (the "Software"), to deal 9 * of this software and associated documentation files (the "Software"), to deal
@@ -39,7 +39,7 @@ static void pic_irq_request(struct kvm *kvm, int level);
39static void pic_lock(struct kvm_pic *s) 39static void pic_lock(struct kvm_pic *s)
40 __acquires(&s->lock) 40 __acquires(&s->lock)
41{ 41{
42 raw_spin_lock(&s->lock); 42 spin_lock(&s->lock);
43} 43}
44 44
45static void pic_unlock(struct kvm_pic *s) 45static void pic_unlock(struct kvm_pic *s)
@@ -51,7 +51,7 @@ static void pic_unlock(struct kvm_pic *s)
51 51
52 s->wakeup_needed = false; 52 s->wakeup_needed = false;
53 53
54 raw_spin_unlock(&s->lock); 54 spin_unlock(&s->lock);
55 55
56 if (wakeup) { 56 if (wakeup) {
57 kvm_for_each_vcpu(i, vcpu, s->kvm) { 57 kvm_for_each_vcpu(i, vcpu, s->kvm) {
@@ -67,6 +67,7 @@ static void pic_unlock(struct kvm_pic *s)
67 if (!found) 67 if (!found)
68 return; 68 return;
69 69
70 kvm_make_request(KVM_REQ_EVENT, found);
70 kvm_vcpu_kick(found); 71 kvm_vcpu_kick(found);
71 } 72 }
72} 73}
@@ -308,13 +309,17 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val)
308 addr &= 1; 309 addr &= 1;
309 if (addr == 0) { 310 if (addr == 0) {
310 if (val & 0x10) { 311 if (val & 0x10) {
311 kvm_pic_reset(s); /* init */
312 /*
313 * deassert a pending interrupt
314 */
315 pic_irq_request(s->pics_state->kvm, 0);
316 s->init_state = 1;
317 s->init4 = val & 1; 312 s->init4 = val & 1;
313 s->last_irr = 0;
314 s->imr = 0;
315 s->priority_add = 0;
316 s->special_mask = 0;
317 s->read_reg_select = 0;
318 if (!s->init4) {
319 s->special_fully_nested_mode = 0;
320 s->auto_eoi = 0;
321 }
322 s->init_state = 1;
318 if (val & 0x02) 323 if (val & 0x02)
319 printk(KERN_ERR "single mode not supported"); 324 printk(KERN_ERR "single mode not supported");
320 if (val & 0x08) 325 if (val & 0x08)
@@ -564,7 +569,7 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm)
564 s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL); 569 s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL);
565 if (!s) 570 if (!s)
566 return NULL; 571 return NULL;
567 raw_spin_lock_init(&s->lock); 572 spin_lock_init(&s->lock);
568 s->kvm = kvm; 573 s->kvm = kvm;
569 s->pics[0].elcr_mask = 0xf8; 574 s->pics[0].elcr_mask = 0xf8;
570 s->pics[1].elcr_mask = 0xde; 575 s->pics[1].elcr_mask = 0xde;
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index 2095a049835e..7e06ba1618bd 100644
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * irq.c: API for in kernel interrupt controller 2 * irq.c: API for in kernel interrupt controller
3 * Copyright (c) 2007, Intel Corporation. 3 * Copyright (c) 2007, Intel Corporation.
4 * Copyright 2009 Red Hat, Inc. and/or its affilates. 4 * Copyright 2009 Red Hat, Inc. and/or its affiliates.
5 * 5 *
6 * This program is free software; you can redistribute it and/or modify it 6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms and conditions of the GNU General Public License, 7 * under the terms and conditions of the GNU General Public License,
@@ -33,12 +33,7 @@
33 */ 33 */
34int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu) 34int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
35{ 35{
36 int ret; 36 return apic_has_pending_timer(vcpu);
37
38 ret = pit_has_pending_timer(vcpu);
39 ret |= apic_has_pending_timer(vcpu);
40
41 return ret;
42} 37}
43EXPORT_SYMBOL(kvm_cpu_has_pending_timer); 38EXPORT_SYMBOL(kvm_cpu_has_pending_timer);
44 39
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index 63c314502993..ba910d149410 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -60,7 +60,7 @@ struct kvm_kpic_state {
60}; 60};
61 61
62struct kvm_pic { 62struct kvm_pic {
63 raw_spinlock_t lock; 63 spinlock_t lock;
64 bool wakeup_needed; 64 bool wakeup_needed;
65 unsigned pending_acks; 65 unsigned pending_acks;
66 struct kvm *kvm; 66 struct kvm *kvm;
diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h
index 6491ac8e755b..975bb45329a1 100644
--- a/arch/x86/kvm/kvm_cache_regs.h
+++ b/arch/x86/kvm/kvm_cache_regs.h
@@ -42,7 +42,14 @@ static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index)
42 (unsigned long *)&vcpu->arch.regs_avail)) 42 (unsigned long *)&vcpu->arch.regs_avail))
43 kvm_x86_ops->cache_reg(vcpu, VCPU_EXREG_PDPTR); 43 kvm_x86_ops->cache_reg(vcpu, VCPU_EXREG_PDPTR);
44 44
45 return vcpu->arch.pdptrs[index]; 45 return vcpu->arch.walk_mmu->pdptrs[index];
46}
47
48static inline u64 kvm_pdptr_read_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, int index)
49{
50 load_pdptrs(vcpu, mmu, mmu->get_cr3(vcpu));
51
52 return mmu->pdptrs[index];
46} 53}
47 54
48static inline ulong kvm_read_cr0_bits(struct kvm_vcpu *vcpu, ulong mask) 55static inline ulong kvm_read_cr0_bits(struct kvm_vcpu *vcpu, ulong mask)
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 22b06f7660f4..413f8973a855 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -5,7 +5,7 @@
5 * Copyright (C) 2006 Qumranet, Inc. 5 * Copyright (C) 2006 Qumranet, Inc.
6 * Copyright (C) 2007 Novell 6 * Copyright (C) 2007 Novell
7 * Copyright (C) 2007 Intel 7 * Copyright (C) 2007 Intel
8 * Copyright 2009 Red Hat, Inc. and/or its affilates. 8 * Copyright 2009 Red Hat, Inc. and/or its affiliates.
9 * 9 *
10 * Authors: 10 * Authors:
11 * Dor Laor <dor.laor@qumranet.com> 11 * Dor Laor <dor.laor@qumranet.com>
@@ -259,9 +259,10 @@ static inline int apic_find_highest_isr(struct kvm_lapic *apic)
259 259
260static void apic_update_ppr(struct kvm_lapic *apic) 260static void apic_update_ppr(struct kvm_lapic *apic)
261{ 261{
262 u32 tpr, isrv, ppr; 262 u32 tpr, isrv, ppr, old_ppr;
263 int isr; 263 int isr;
264 264
265 old_ppr = apic_get_reg(apic, APIC_PROCPRI);
265 tpr = apic_get_reg(apic, APIC_TASKPRI); 266 tpr = apic_get_reg(apic, APIC_TASKPRI);
266 isr = apic_find_highest_isr(apic); 267 isr = apic_find_highest_isr(apic);
267 isrv = (isr != -1) ? isr : 0; 268 isrv = (isr != -1) ? isr : 0;
@@ -274,7 +275,10 @@ static void apic_update_ppr(struct kvm_lapic *apic)
274 apic_debug("vlapic %p, ppr 0x%x, isr 0x%x, isrv 0x%x", 275 apic_debug("vlapic %p, ppr 0x%x, isr 0x%x, isrv 0x%x",
275 apic, ppr, isr, isrv); 276 apic, ppr, isr, isrv);
276 277
277 apic_set_reg(apic, APIC_PROCPRI, ppr); 278 if (old_ppr != ppr) {
279 apic_set_reg(apic, APIC_PROCPRI, ppr);
280 kvm_make_request(KVM_REQ_EVENT, apic->vcpu);
281 }
278} 282}
279 283
280static void apic_set_tpr(struct kvm_lapic *apic, u32 tpr) 284static void apic_set_tpr(struct kvm_lapic *apic, u32 tpr)
@@ -391,6 +395,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
391 break; 395 break;
392 } 396 }
393 397
398 kvm_make_request(KVM_REQ_EVENT, vcpu);
394 kvm_vcpu_kick(vcpu); 399 kvm_vcpu_kick(vcpu);
395 break; 400 break;
396 401
@@ -416,6 +421,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
416 "INIT on a runnable vcpu %d\n", 421 "INIT on a runnable vcpu %d\n",
417 vcpu->vcpu_id); 422 vcpu->vcpu_id);
418 vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; 423 vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
424 kvm_make_request(KVM_REQ_EVENT, vcpu);
419 kvm_vcpu_kick(vcpu); 425 kvm_vcpu_kick(vcpu);
420 } else { 426 } else {
421 apic_debug("Ignoring de-assert INIT to vcpu %d\n", 427 apic_debug("Ignoring de-assert INIT to vcpu %d\n",
@@ -430,6 +436,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
430 result = 1; 436 result = 1;
431 vcpu->arch.sipi_vector = vector; 437 vcpu->arch.sipi_vector = vector;
432 vcpu->arch.mp_state = KVM_MP_STATE_SIPI_RECEIVED; 438 vcpu->arch.mp_state = KVM_MP_STATE_SIPI_RECEIVED;
439 kvm_make_request(KVM_REQ_EVENT, vcpu);
433 kvm_vcpu_kick(vcpu); 440 kvm_vcpu_kick(vcpu);
434 } 441 }
435 break; 442 break;
@@ -475,6 +482,7 @@ static void apic_set_eoi(struct kvm_lapic *apic)
475 trigger_mode = IOAPIC_EDGE_TRIG; 482 trigger_mode = IOAPIC_EDGE_TRIG;
476 if (!(apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI)) 483 if (!(apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI))
477 kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode); 484 kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode);
485 kvm_make_request(KVM_REQ_EVENT, apic->vcpu);
478} 486}
479 487
480static void apic_send_ipi(struct kvm_lapic *apic) 488static void apic_send_ipi(struct kvm_lapic *apic)
@@ -1151,6 +1159,7 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu)
1151 update_divide_count(apic); 1159 update_divide_count(apic);
1152 start_apic_timer(apic); 1160 start_apic_timer(apic);
1153 apic->irr_pending = true; 1161 apic->irr_pending = true;
1162 kvm_make_request(KVM_REQ_EVENT, vcpu);
1154} 1163}
1155 1164
1156void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu) 1165void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 311f6dad8951..908ea5464a51 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -7,7 +7,7 @@
7 * MMU support 7 * MMU support
8 * 8 *
9 * Copyright (C) 2006 Qumranet, Inc. 9 * Copyright (C) 2006 Qumranet, Inc.
10 * Copyright 2010 Red Hat, Inc. and/or its affilates. 10 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
11 * 11 *
12 * Authors: 12 * Authors:
13 * Yaniv Kamay <yaniv@qumranet.com> 13 * Yaniv Kamay <yaniv@qumranet.com>
@@ -49,15 +49,25 @@
49 */ 49 */
50bool tdp_enabled = false; 50bool tdp_enabled = false;
51 51
52#undef MMU_DEBUG 52enum {
53 AUDIT_PRE_PAGE_FAULT,
54 AUDIT_POST_PAGE_FAULT,
55 AUDIT_PRE_PTE_WRITE,
56 AUDIT_POST_PTE_WRITE,
57 AUDIT_PRE_SYNC,
58 AUDIT_POST_SYNC
59};
53 60
54#undef AUDIT 61char *audit_point_name[] = {
62 "pre page fault",
63 "post page fault",
64 "pre pte write",
65 "post pte write",
66 "pre sync",
67 "post sync"
68};
55 69
56#ifdef AUDIT 70#undef MMU_DEBUG
57static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg);
58#else
59static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) {}
60#endif
61 71
62#ifdef MMU_DEBUG 72#ifdef MMU_DEBUG
63 73
@@ -71,7 +81,7 @@ static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) {}
71 81
72#endif 82#endif
73 83
74#if defined(MMU_DEBUG) || defined(AUDIT) 84#ifdef MMU_DEBUG
75static int dbg = 0; 85static int dbg = 0;
76module_param(dbg, bool, 0644); 86module_param(dbg, bool, 0644);
77#endif 87#endif
@@ -89,6 +99,8 @@ module_param(oos_shadow, bool, 0644);
89 } 99 }
90#endif 100#endif
91 101
102#define PTE_PREFETCH_NUM 8
103
92#define PT_FIRST_AVAIL_BITS_SHIFT 9 104#define PT_FIRST_AVAIL_BITS_SHIFT 9
93#define PT64_SECOND_AVAIL_BITS_SHIFT 52 105#define PT64_SECOND_AVAIL_BITS_SHIFT 52
94 106
@@ -178,6 +190,7 @@ typedef void (*mmu_parent_walk_fn) (struct kvm_mmu_page *sp, u64 *spte);
178static struct kmem_cache *pte_chain_cache; 190static struct kmem_cache *pte_chain_cache;
179static struct kmem_cache *rmap_desc_cache; 191static struct kmem_cache *rmap_desc_cache;
180static struct kmem_cache *mmu_page_header_cache; 192static struct kmem_cache *mmu_page_header_cache;
193static struct percpu_counter kvm_total_used_mmu_pages;
181 194
182static u64 __read_mostly shadow_trap_nonpresent_pte; 195static u64 __read_mostly shadow_trap_nonpresent_pte;
183static u64 __read_mostly shadow_notrap_nonpresent_pte; 196static u64 __read_mostly shadow_notrap_nonpresent_pte;
@@ -299,18 +312,50 @@ static u64 __xchg_spte(u64 *sptep, u64 new_spte)
299#endif 312#endif
300} 313}
301 314
315static bool spte_has_volatile_bits(u64 spte)
316{
317 if (!shadow_accessed_mask)
318 return false;
319
320 if (!is_shadow_present_pte(spte))
321 return false;
322
323 if ((spte & shadow_accessed_mask) &&
324 (!is_writable_pte(spte) || (spte & shadow_dirty_mask)))
325 return false;
326
327 return true;
328}
329
330static bool spte_is_bit_cleared(u64 old_spte, u64 new_spte, u64 bit_mask)
331{
332 return (old_spte & bit_mask) && !(new_spte & bit_mask);
333}
334
302static void update_spte(u64 *sptep, u64 new_spte) 335static void update_spte(u64 *sptep, u64 new_spte)
303{ 336{
304 u64 old_spte; 337 u64 mask, old_spte = *sptep;
338
339 WARN_ON(!is_rmap_spte(new_spte));
340
341 new_spte |= old_spte & shadow_dirty_mask;
305 342
306 if (!shadow_accessed_mask || (new_spte & shadow_accessed_mask) || 343 mask = shadow_accessed_mask;
307 !is_rmap_spte(*sptep)) 344 if (is_writable_pte(old_spte))
345 mask |= shadow_dirty_mask;
346
347 if (!spte_has_volatile_bits(old_spte) || (new_spte & mask) == mask)
308 __set_spte(sptep, new_spte); 348 __set_spte(sptep, new_spte);
309 else { 349 else
310 old_spte = __xchg_spte(sptep, new_spte); 350 old_spte = __xchg_spte(sptep, new_spte);
311 if (old_spte & shadow_accessed_mask) 351
312 mark_page_accessed(pfn_to_page(spte_to_pfn(old_spte))); 352 if (!shadow_accessed_mask)
313 } 353 return;
354
355 if (spte_is_bit_cleared(old_spte, new_spte, shadow_accessed_mask))
356 kvm_set_pfn_accessed(spte_to_pfn(old_spte));
357 if (spte_is_bit_cleared(old_spte, new_spte, shadow_dirty_mask))
358 kvm_set_pfn_dirty(spte_to_pfn(old_spte));
314} 359}
315 360
316static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, 361static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
@@ -367,7 +412,7 @@ static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu)
367 if (r) 412 if (r)
368 goto out; 413 goto out;
369 r = mmu_topup_memory_cache(&vcpu->arch.mmu_rmap_desc_cache, 414 r = mmu_topup_memory_cache(&vcpu->arch.mmu_rmap_desc_cache,
370 rmap_desc_cache, 4); 415 rmap_desc_cache, 4 + PTE_PREFETCH_NUM);
371 if (r) 416 if (r)
372 goto out; 417 goto out;
373 r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8); 418 r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8);
@@ -591,6 +636,7 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
591 desc->sptes[0] = (u64 *)*rmapp; 636 desc->sptes[0] = (u64 *)*rmapp;
592 desc->sptes[1] = spte; 637 desc->sptes[1] = spte;
593 *rmapp = (unsigned long)desc | 1; 638 *rmapp = (unsigned long)desc | 1;
639 ++count;
594 } else { 640 } else {
595 rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte); 641 rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte);
596 desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); 642 desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
@@ -603,7 +649,7 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
603 desc = desc->more; 649 desc = desc->more;
604 } 650 }
605 for (i = 0; desc->sptes[i]; ++i) 651 for (i = 0; desc->sptes[i]; ++i)
606 ; 652 ++count;
607 desc->sptes[i] = spte; 653 desc->sptes[i] = spte;
608 } 654 }
609 return count; 655 return count;
@@ -645,18 +691,17 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
645 gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt); 691 gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt);
646 rmapp = gfn_to_rmap(kvm, gfn, sp->role.level); 692 rmapp = gfn_to_rmap(kvm, gfn, sp->role.level);
647 if (!*rmapp) { 693 if (!*rmapp) {
648 printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte); 694 printk(KERN_ERR "rmap_remove: %p 0->BUG\n", spte);
649 BUG(); 695 BUG();
650 } else if (!(*rmapp & 1)) { 696 } else if (!(*rmapp & 1)) {
651 rmap_printk("rmap_remove: %p %llx 1->0\n", spte, *spte); 697 rmap_printk("rmap_remove: %p 1->0\n", spte);
652 if ((u64 *)*rmapp != spte) { 698 if ((u64 *)*rmapp != spte) {
653 printk(KERN_ERR "rmap_remove: %p %llx 1->BUG\n", 699 printk(KERN_ERR "rmap_remove: %p 1->BUG\n", spte);
654 spte, *spte);
655 BUG(); 700 BUG();
656 } 701 }
657 *rmapp = 0; 702 *rmapp = 0;
658 } else { 703 } else {
659 rmap_printk("rmap_remove: %p %llx many->many\n", spte, *spte); 704 rmap_printk("rmap_remove: %p many->many\n", spte);
660 desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); 705 desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
661 prev_desc = NULL; 706 prev_desc = NULL;
662 while (desc) { 707 while (desc) {
@@ -670,7 +715,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
670 prev_desc = desc; 715 prev_desc = desc;
671 desc = desc->more; 716 desc = desc->more;
672 } 717 }
673 pr_err("rmap_remove: %p %llx many->many\n", spte, *spte); 718 pr_err("rmap_remove: %p many->many\n", spte);
674 BUG(); 719 BUG();
675 } 720 }
676} 721}
@@ -680,18 +725,18 @@ static void set_spte_track_bits(u64 *sptep, u64 new_spte)
680 pfn_t pfn; 725 pfn_t pfn;
681 u64 old_spte = *sptep; 726 u64 old_spte = *sptep;
682 727
683 if (!shadow_accessed_mask || !is_shadow_present_pte(old_spte) || 728 if (!spte_has_volatile_bits(old_spte))
684 old_spte & shadow_accessed_mask) {
685 __set_spte(sptep, new_spte); 729 __set_spte(sptep, new_spte);
686 } else 730 else
687 old_spte = __xchg_spte(sptep, new_spte); 731 old_spte = __xchg_spte(sptep, new_spte);
688 732
689 if (!is_rmap_spte(old_spte)) 733 if (!is_rmap_spte(old_spte))
690 return; 734 return;
735
691 pfn = spte_to_pfn(old_spte); 736 pfn = spte_to_pfn(old_spte);
692 if (!shadow_accessed_mask || old_spte & shadow_accessed_mask) 737 if (!shadow_accessed_mask || old_spte & shadow_accessed_mask)
693 kvm_set_pfn_accessed(pfn); 738 kvm_set_pfn_accessed(pfn);
694 if (is_writable_pte(old_spte)) 739 if (!shadow_dirty_mask || (old_spte & shadow_dirty_mask))
695 kvm_set_pfn_dirty(pfn); 740 kvm_set_pfn_dirty(pfn);
696} 741}
697 742
@@ -746,13 +791,6 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
746 } 791 }
747 spte = rmap_next(kvm, rmapp, spte); 792 spte = rmap_next(kvm, rmapp, spte);
748 } 793 }
749 if (write_protected) {
750 pfn_t pfn;
751
752 spte = rmap_next(kvm, rmapp, NULL);
753 pfn = spte_to_pfn(*spte);
754 kvm_set_pfn_dirty(pfn);
755 }
756 794
757 /* check for huge page mappings */ 795 /* check for huge page mappings */
758 for (i = PT_DIRECTORY_LEVEL; 796 for (i = PT_DIRECTORY_LEVEL;
@@ -947,6 +985,18 @@ static int is_empty_shadow_page(u64 *spt)
947} 985}
948#endif 986#endif
949 987
988/*
989 * This value is the sum of all of the kvm instances's
990 * kvm->arch.n_used_mmu_pages values. We need a global,
991 * aggregate version in order to make the slab shrinker
992 * faster
993 */
994static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, int nr)
995{
996 kvm->arch.n_used_mmu_pages += nr;
997 percpu_counter_add(&kvm_total_used_mmu_pages, nr);
998}
999
950static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp) 1000static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp)
951{ 1001{
952 ASSERT(is_empty_shadow_page(sp->spt)); 1002 ASSERT(is_empty_shadow_page(sp->spt));
@@ -956,7 +1006,7 @@ static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp)
956 if (!sp->role.direct) 1006 if (!sp->role.direct)
957 __free_page(virt_to_page(sp->gfns)); 1007 __free_page(virt_to_page(sp->gfns));
958 kmem_cache_free(mmu_page_header_cache, sp); 1008 kmem_cache_free(mmu_page_header_cache, sp);
959 ++kvm->arch.n_free_mmu_pages; 1009 kvm_mod_used_mmu_pages(kvm, -1);
960} 1010}
961 1011
962static unsigned kvm_page_table_hashfn(gfn_t gfn) 1012static unsigned kvm_page_table_hashfn(gfn_t gfn)
@@ -979,7 +1029,7 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
979 bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS); 1029 bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS);
980 sp->multimapped = 0; 1030 sp->multimapped = 0;
981 sp->parent_pte = parent_pte; 1031 sp->parent_pte = parent_pte;
982 --vcpu->kvm->arch.n_free_mmu_pages; 1032 kvm_mod_used_mmu_pages(vcpu->kvm, +1);
983 return sp; 1033 return sp;
984} 1034}
985 1035
@@ -1403,7 +1453,8 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
1403 if (role.direct) 1453 if (role.direct)
1404 role.cr4_pae = 0; 1454 role.cr4_pae = 0;
1405 role.access = access; 1455 role.access = access;
1406 if (!tdp_enabled && vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) { 1456 if (!vcpu->arch.mmu.direct_map
1457 && vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) {
1407 quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level)); 1458 quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
1408 quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1; 1459 quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
1409 role.quadrant = quadrant; 1460 role.quadrant = quadrant;
@@ -1458,6 +1509,12 @@ static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator,
1458 iterator->addr = addr; 1509 iterator->addr = addr;
1459 iterator->shadow_addr = vcpu->arch.mmu.root_hpa; 1510 iterator->shadow_addr = vcpu->arch.mmu.root_hpa;
1460 iterator->level = vcpu->arch.mmu.shadow_root_level; 1511 iterator->level = vcpu->arch.mmu.shadow_root_level;
1512
1513 if (iterator->level == PT64_ROOT_LEVEL &&
1514 vcpu->arch.mmu.root_level < PT64_ROOT_LEVEL &&
1515 !vcpu->arch.mmu.direct_map)
1516 --iterator->level;
1517
1461 if (iterator->level == PT32E_ROOT_LEVEL) { 1518 if (iterator->level == PT32E_ROOT_LEVEL) {
1462 iterator->shadow_addr 1519 iterator->shadow_addr
1463 = vcpu->arch.mmu.pae_root[(addr >> 30) & 3]; 1520 = vcpu->arch.mmu.pae_root[(addr >> 30) & 3];
@@ -1665,41 +1722,31 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
1665 1722
1666/* 1723/*
1667 * Changing the number of mmu pages allocated to the vm 1724 * Changing the number of mmu pages allocated to the vm
1668 * Note: if kvm_nr_mmu_pages is too small, you will get dead lock 1725 * Note: if goal_nr_mmu_pages is too small, you will get dead lock
1669 */ 1726 */
1670void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages) 1727void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int goal_nr_mmu_pages)
1671{ 1728{
1672 int used_pages;
1673 LIST_HEAD(invalid_list); 1729 LIST_HEAD(invalid_list);
1674
1675 used_pages = kvm->arch.n_alloc_mmu_pages - kvm->arch.n_free_mmu_pages;
1676 used_pages = max(0, used_pages);
1677
1678 /* 1730 /*
1679 * If we set the number of mmu pages to be smaller be than the 1731 * If we set the number of mmu pages to be smaller be than the
1680 * number of actived pages , we must to free some mmu pages before we 1732 * number of actived pages , we must to free some mmu pages before we
1681 * change the value 1733 * change the value
1682 */ 1734 */
1683 1735
1684 if (used_pages > kvm_nr_mmu_pages) { 1736 if (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) {
1685 while (used_pages > kvm_nr_mmu_pages && 1737 while (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages &&
1686 !list_empty(&kvm->arch.active_mmu_pages)) { 1738 !list_empty(&kvm->arch.active_mmu_pages)) {
1687 struct kvm_mmu_page *page; 1739 struct kvm_mmu_page *page;
1688 1740
1689 page = container_of(kvm->arch.active_mmu_pages.prev, 1741 page = container_of(kvm->arch.active_mmu_pages.prev,
1690 struct kvm_mmu_page, link); 1742 struct kvm_mmu_page, link);
1691 used_pages -= kvm_mmu_prepare_zap_page(kvm, page, 1743 kvm_mmu_prepare_zap_page(kvm, page, &invalid_list);
1692 &invalid_list); 1744 kvm_mmu_commit_zap_page(kvm, &invalid_list);
1693 } 1745 }
1694 kvm_mmu_commit_zap_page(kvm, &invalid_list); 1746 goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages;
1695 kvm_nr_mmu_pages = used_pages;
1696 kvm->arch.n_free_mmu_pages = 0;
1697 } 1747 }
1698 else
1699 kvm->arch.n_free_mmu_pages += kvm_nr_mmu_pages
1700 - kvm->arch.n_alloc_mmu_pages;
1701 1748
1702 kvm->arch.n_alloc_mmu_pages = kvm_nr_mmu_pages; 1749 kvm->arch.n_max_mmu_pages = goal_nr_mmu_pages;
1703} 1750}
1704 1751
1705static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) 1752static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
@@ -1709,11 +1756,11 @@ static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
1709 LIST_HEAD(invalid_list); 1756 LIST_HEAD(invalid_list);
1710 int r; 1757 int r;
1711 1758
1712 pgprintk("%s: looking for gfn %lx\n", __func__, gfn); 1759 pgprintk("%s: looking for gfn %llx\n", __func__, gfn);
1713 r = 0; 1760 r = 0;
1714 1761
1715 for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) { 1762 for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) {
1716 pgprintk("%s: gfn %lx role %x\n", __func__, gfn, 1763 pgprintk("%s: gfn %llx role %x\n", __func__, gfn,
1717 sp->role.word); 1764 sp->role.word);
1718 r = 1; 1765 r = 1;
1719 kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); 1766 kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
@@ -1729,7 +1776,7 @@ static void mmu_unshadow(struct kvm *kvm, gfn_t gfn)
1729 LIST_HEAD(invalid_list); 1776 LIST_HEAD(invalid_list);
1730 1777
1731 for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) { 1778 for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) {
1732 pgprintk("%s: zap %lx %x\n", 1779 pgprintk("%s: zap %llx %x\n",
1733 __func__, gfn, sp->role.word); 1780 __func__, gfn, sp->role.word);
1734 kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); 1781 kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
1735 } 1782 }
@@ -1925,7 +1972,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1925 * whether the guest actually used the pte (in order to detect 1972 * whether the guest actually used the pte (in order to detect
1926 * demand paging). 1973 * demand paging).
1927 */ 1974 */
1928 spte = shadow_base_present_pte | shadow_dirty_mask; 1975 spte = shadow_base_present_pte;
1929 if (!speculative) 1976 if (!speculative)
1930 spte |= shadow_accessed_mask; 1977 spte |= shadow_accessed_mask;
1931 if (!dirty) 1978 if (!dirty)
@@ -1948,8 +1995,8 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1948 spte |= (u64)pfn << PAGE_SHIFT; 1995 spte |= (u64)pfn << PAGE_SHIFT;
1949 1996
1950 if ((pte_access & ACC_WRITE_MASK) 1997 if ((pte_access & ACC_WRITE_MASK)
1951 || (!tdp_enabled && write_fault && !is_write_protection(vcpu) 1998 || (!vcpu->arch.mmu.direct_map && write_fault
1952 && !user_fault)) { 1999 && !is_write_protection(vcpu) && !user_fault)) {
1953 2000
1954 if (level > PT_PAGE_TABLE_LEVEL && 2001 if (level > PT_PAGE_TABLE_LEVEL &&
1955 has_wrprotected_page(vcpu->kvm, gfn, level)) { 2002 has_wrprotected_page(vcpu->kvm, gfn, level)) {
@@ -1960,7 +2007,8 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1960 2007
1961 spte |= PT_WRITABLE_MASK; 2008 spte |= PT_WRITABLE_MASK;
1962 2009
1963 if (!tdp_enabled && !(pte_access & ACC_WRITE_MASK)) 2010 if (!vcpu->arch.mmu.direct_map
2011 && !(pte_access & ACC_WRITE_MASK))
1964 spte &= ~PT_USER_MASK; 2012 spte &= ~PT_USER_MASK;
1965 2013
1966 /* 2014 /*
@@ -1973,7 +2021,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1973 goto set_pte; 2021 goto set_pte;
1974 2022
1975 if (mmu_need_write_protect(vcpu, gfn, can_unsync)) { 2023 if (mmu_need_write_protect(vcpu, gfn, can_unsync)) {
1976 pgprintk("%s: found shadow page for %lx, marking ro\n", 2024 pgprintk("%s: found shadow page for %llx, marking ro\n",
1977 __func__, gfn); 2025 __func__, gfn);
1978 ret = 1; 2026 ret = 1;
1979 pte_access &= ~ACC_WRITE_MASK; 2027 pte_access &= ~ACC_WRITE_MASK;
@@ -1986,8 +2034,6 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1986 mark_page_dirty(vcpu->kvm, gfn); 2034 mark_page_dirty(vcpu->kvm, gfn);
1987 2035
1988set_pte: 2036set_pte:
1989 if (is_writable_pte(*sptep) && !is_writable_pte(spte))
1990 kvm_set_pfn_dirty(pfn);
1991 update_spte(sptep, spte); 2037 update_spte(sptep, spte);
1992done: 2038done:
1993 return ret; 2039 return ret;
@@ -2004,7 +2050,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2004 int rmap_count; 2050 int rmap_count;
2005 2051
2006 pgprintk("%s: spte %llx access %x write_fault %d" 2052 pgprintk("%s: spte %llx access %x write_fault %d"
2007 " user_fault %d gfn %lx\n", 2053 " user_fault %d gfn %llx\n",
2008 __func__, *sptep, pt_access, 2054 __func__, *sptep, pt_access,
2009 write_fault, user_fault, gfn); 2055 write_fault, user_fault, gfn);
2010 2056
@@ -2023,7 +2069,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2023 __set_spte(sptep, shadow_trap_nonpresent_pte); 2069 __set_spte(sptep, shadow_trap_nonpresent_pte);
2024 kvm_flush_remote_tlbs(vcpu->kvm); 2070 kvm_flush_remote_tlbs(vcpu->kvm);
2025 } else if (pfn != spte_to_pfn(*sptep)) { 2071 } else if (pfn != spte_to_pfn(*sptep)) {
2026 pgprintk("hfn old %lx new %lx\n", 2072 pgprintk("hfn old %llx new %llx\n",
2027 spte_to_pfn(*sptep), pfn); 2073 spte_to_pfn(*sptep), pfn);
2028 drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte); 2074 drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
2029 kvm_flush_remote_tlbs(vcpu->kvm); 2075 kvm_flush_remote_tlbs(vcpu->kvm);
@@ -2040,7 +2086,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2040 } 2086 }
2041 2087
2042 pgprintk("%s: setting spte %llx\n", __func__, *sptep); 2088 pgprintk("%s: setting spte %llx\n", __func__, *sptep);
2043 pgprintk("instantiating %s PTE (%s) at %ld (%llx) addr %p\n", 2089 pgprintk("instantiating %s PTE (%s) at %llx (%llx) addr %p\n",
2044 is_large_pte(*sptep)? "2MB" : "4kB", 2090 is_large_pte(*sptep)? "2MB" : "4kB",
2045 *sptep & PT_PRESENT_MASK ?"RW":"R", gfn, 2091 *sptep & PT_PRESENT_MASK ?"RW":"R", gfn,
2046 *sptep, sptep); 2092 *sptep, sptep);
@@ -2064,6 +2110,105 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
2064{ 2110{
2065} 2111}
2066 2112
2113static struct kvm_memory_slot *
2114pte_prefetch_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn, bool no_dirty_log)
2115{
2116 struct kvm_memory_slot *slot;
2117
2118 slot = gfn_to_memslot(vcpu->kvm, gfn);
2119 if (!slot || slot->flags & KVM_MEMSLOT_INVALID ||
2120 (no_dirty_log && slot->dirty_bitmap))
2121 slot = NULL;
2122
2123 return slot;
2124}
2125
2126static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
2127 bool no_dirty_log)
2128{
2129 struct kvm_memory_slot *slot;
2130 unsigned long hva;
2131
2132 slot = pte_prefetch_gfn_to_memslot(vcpu, gfn, no_dirty_log);
2133 if (!slot) {
2134 get_page(bad_page);
2135 return page_to_pfn(bad_page);
2136 }
2137
2138 hva = gfn_to_hva_memslot(slot, gfn);
2139
2140 return hva_to_pfn_atomic(vcpu->kvm, hva);
2141}
2142
2143static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
2144 struct kvm_mmu_page *sp,
2145 u64 *start, u64 *end)
2146{
2147 struct page *pages[PTE_PREFETCH_NUM];
2148 unsigned access = sp->role.access;
2149 int i, ret;
2150 gfn_t gfn;
2151
2152 gfn = kvm_mmu_page_get_gfn(sp, start - sp->spt);
2153 if (!pte_prefetch_gfn_to_memslot(vcpu, gfn, access & ACC_WRITE_MASK))
2154 return -1;
2155
2156 ret = gfn_to_page_many_atomic(vcpu->kvm, gfn, pages, end - start);
2157 if (ret <= 0)
2158 return -1;
2159
2160 for (i = 0; i < ret; i++, gfn++, start++)
2161 mmu_set_spte(vcpu, start, ACC_ALL,
2162 access, 0, 0, 1, NULL,
2163 sp->role.level, gfn,
2164 page_to_pfn(pages[i]), true, true);
2165
2166 return 0;
2167}
2168
2169static void __direct_pte_prefetch(struct kvm_vcpu *vcpu,
2170 struct kvm_mmu_page *sp, u64 *sptep)
2171{
2172 u64 *spte, *start = NULL;
2173 int i;
2174
2175 WARN_ON(!sp->role.direct);
2176
2177 i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1);
2178 spte = sp->spt + i;
2179
2180 for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) {
2181 if (*spte != shadow_trap_nonpresent_pte || spte == sptep) {
2182 if (!start)
2183 continue;
2184 if (direct_pte_prefetch_many(vcpu, sp, start, spte) < 0)
2185 break;
2186 start = NULL;
2187 } else if (!start)
2188 start = spte;
2189 }
2190}
2191
2192static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
2193{
2194 struct kvm_mmu_page *sp;
2195
2196 /*
2197 * Since it's no accessed bit on EPT, it's no way to
2198 * distinguish between actually accessed translations
2199 * and prefetched, so disable pte prefetch if EPT is
2200 * enabled.
2201 */
2202 if (!shadow_accessed_mask)
2203 return;
2204
2205 sp = page_header(__pa(sptep));
2206 if (sp->role.level > PT_PAGE_TABLE_LEVEL)
2207 return;
2208
2209 __direct_pte_prefetch(vcpu, sp, sptep);
2210}
2211
2067static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, 2212static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
2068 int level, gfn_t gfn, pfn_t pfn) 2213 int level, gfn_t gfn, pfn_t pfn)
2069{ 2214{
@@ -2077,6 +2222,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
2077 mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, ACC_ALL, 2222 mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, ACC_ALL,
2078 0, write, 1, &pt_write, 2223 0, write, 1, &pt_write,
2079 level, gfn, pfn, false, true); 2224 level, gfn, pfn, false, true);
2225 direct_pte_prefetch(vcpu, iterator.sptep);
2080 ++vcpu->stat.pf_fixed; 2226 ++vcpu->stat.pf_fixed;
2081 break; 2227 break;
2082 } 2228 }
@@ -2098,28 +2244,31 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
2098 __set_spte(iterator.sptep, 2244 __set_spte(iterator.sptep,
2099 __pa(sp->spt) 2245 __pa(sp->spt)
2100 | PT_PRESENT_MASK | PT_WRITABLE_MASK 2246 | PT_PRESENT_MASK | PT_WRITABLE_MASK
2101 | shadow_user_mask | shadow_x_mask); 2247 | shadow_user_mask | shadow_x_mask
2248 | shadow_accessed_mask);
2102 } 2249 }
2103 } 2250 }
2104 return pt_write; 2251 return pt_write;
2105} 2252}
2106 2253
2107static void kvm_send_hwpoison_signal(struct kvm *kvm, gfn_t gfn) 2254static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *tsk)
2108{ 2255{
2109 char buf[1]; 2256 siginfo_t info;
2110 void __user *hva; 2257
2111 int r; 2258 info.si_signo = SIGBUS;
2259 info.si_errno = 0;
2260 info.si_code = BUS_MCEERR_AR;
2261 info.si_addr = (void __user *)address;
2262 info.si_addr_lsb = PAGE_SHIFT;
2112 2263
2113 /* Touch the page, so send SIGBUS */ 2264 send_sig_info(SIGBUS, &info, tsk);
2114 hva = (void __user *)gfn_to_hva(kvm, gfn);
2115 r = copy_from_user(buf, hva, 1);
2116} 2265}
2117 2266
2118static int kvm_handle_bad_page(struct kvm *kvm, gfn_t gfn, pfn_t pfn) 2267static int kvm_handle_bad_page(struct kvm *kvm, gfn_t gfn, pfn_t pfn)
2119{ 2268{
2120 kvm_release_pfn_clean(pfn); 2269 kvm_release_pfn_clean(pfn);
2121 if (is_hwpoison_pfn(pfn)) { 2270 if (is_hwpoison_pfn(pfn)) {
2122 kvm_send_hwpoison_signal(kvm, gfn); 2271 kvm_send_hwpoison_signal(gfn_to_hva(kvm, gfn), current);
2123 return 0; 2272 return 0;
2124 } else if (is_fault_pfn(pfn)) 2273 } else if (is_fault_pfn(pfn))
2125 return -EFAULT; 2274 return -EFAULT;
@@ -2179,7 +2328,9 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
2179 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) 2328 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
2180 return; 2329 return;
2181 spin_lock(&vcpu->kvm->mmu_lock); 2330 spin_lock(&vcpu->kvm->mmu_lock);
2182 if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { 2331 if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL &&
2332 (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL ||
2333 vcpu->arch.mmu.direct_map)) {
2183 hpa_t root = vcpu->arch.mmu.root_hpa; 2334 hpa_t root = vcpu->arch.mmu.root_hpa;
2184 2335
2185 sp = page_header(root); 2336 sp = page_header(root);
@@ -2222,80 +2373,158 @@ static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn)
2222 return ret; 2373 return ret;
2223} 2374}
2224 2375
2225static int mmu_alloc_roots(struct kvm_vcpu *vcpu) 2376static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
2226{ 2377{
2227 int i;
2228 gfn_t root_gfn;
2229 struct kvm_mmu_page *sp; 2378 struct kvm_mmu_page *sp;
2230 int direct = 0; 2379 unsigned i;
2231 u64 pdptr;
2232
2233 root_gfn = vcpu->arch.cr3 >> PAGE_SHIFT;
2234 2380
2235 if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { 2381 if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
2382 spin_lock(&vcpu->kvm->mmu_lock);
2383 kvm_mmu_free_some_pages(vcpu);
2384 sp = kvm_mmu_get_page(vcpu, 0, 0, PT64_ROOT_LEVEL,
2385 1, ACC_ALL, NULL);
2386 ++sp->root_count;
2387 spin_unlock(&vcpu->kvm->mmu_lock);
2388 vcpu->arch.mmu.root_hpa = __pa(sp->spt);
2389 } else if (vcpu->arch.mmu.shadow_root_level == PT32E_ROOT_LEVEL) {
2390 for (i = 0; i < 4; ++i) {
2391 hpa_t root = vcpu->arch.mmu.pae_root[i];
2392
2393 ASSERT(!VALID_PAGE(root));
2394 spin_lock(&vcpu->kvm->mmu_lock);
2395 kvm_mmu_free_some_pages(vcpu);
2396 sp = kvm_mmu_get_page(vcpu, i << 30, i << 30,
2397 PT32_ROOT_LEVEL, 1, ACC_ALL,
2398 NULL);
2399 root = __pa(sp->spt);
2400 ++sp->root_count;
2401 spin_unlock(&vcpu->kvm->mmu_lock);
2402 vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK;
2403 }
2404 vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
2405 } else
2406 BUG();
2407
2408 return 0;
2409}
2410
2411static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
2412{
2413 struct kvm_mmu_page *sp;
2414 u64 pdptr, pm_mask;
2415 gfn_t root_gfn;
2416 int i;
2417
2418 root_gfn = vcpu->arch.mmu.get_cr3(vcpu) >> PAGE_SHIFT;
2419
2420 if (mmu_check_root(vcpu, root_gfn))
2421 return 1;
2422
2423 /*
2424 * Do we shadow a long mode page table? If so we need to
2425 * write-protect the guests page table root.
2426 */
2427 if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) {
2236 hpa_t root = vcpu->arch.mmu.root_hpa; 2428 hpa_t root = vcpu->arch.mmu.root_hpa;
2237 2429
2238 ASSERT(!VALID_PAGE(root)); 2430 ASSERT(!VALID_PAGE(root));
2239 if (mmu_check_root(vcpu, root_gfn)) 2431
2240 return 1;
2241 if (tdp_enabled) {
2242 direct = 1;
2243 root_gfn = 0;
2244 }
2245 spin_lock(&vcpu->kvm->mmu_lock); 2432 spin_lock(&vcpu->kvm->mmu_lock);
2246 kvm_mmu_free_some_pages(vcpu); 2433 kvm_mmu_free_some_pages(vcpu);
2247 sp = kvm_mmu_get_page(vcpu, root_gfn, 0, 2434 sp = kvm_mmu_get_page(vcpu, root_gfn, 0, PT64_ROOT_LEVEL,
2248 PT64_ROOT_LEVEL, direct, 2435 0, ACC_ALL, NULL);
2249 ACC_ALL, NULL);
2250 root = __pa(sp->spt); 2436 root = __pa(sp->spt);
2251 ++sp->root_count; 2437 ++sp->root_count;
2252 spin_unlock(&vcpu->kvm->mmu_lock); 2438 spin_unlock(&vcpu->kvm->mmu_lock);
2253 vcpu->arch.mmu.root_hpa = root; 2439 vcpu->arch.mmu.root_hpa = root;
2254 return 0; 2440 return 0;
2255 } 2441 }
2256 direct = !is_paging(vcpu); 2442
2443 /*
2444 * We shadow a 32 bit page table. This may be a legacy 2-level
2445 * or a PAE 3-level page table. In either case we need to be aware that
2446 * the shadow page table may be a PAE or a long mode page table.
2447 */
2448 pm_mask = PT_PRESENT_MASK;
2449 if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL)
2450 pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK;
2451
2257 for (i = 0; i < 4; ++i) { 2452 for (i = 0; i < 4; ++i) {
2258 hpa_t root = vcpu->arch.mmu.pae_root[i]; 2453 hpa_t root = vcpu->arch.mmu.pae_root[i];
2259 2454
2260 ASSERT(!VALID_PAGE(root)); 2455 ASSERT(!VALID_PAGE(root));
2261 if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) { 2456 if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) {
2262 pdptr = kvm_pdptr_read(vcpu, i); 2457 pdptr = kvm_pdptr_read_mmu(vcpu, &vcpu->arch.mmu, i);
2263 if (!is_present_gpte(pdptr)) { 2458 if (!is_present_gpte(pdptr)) {
2264 vcpu->arch.mmu.pae_root[i] = 0; 2459 vcpu->arch.mmu.pae_root[i] = 0;
2265 continue; 2460 continue;
2266 } 2461 }
2267 root_gfn = pdptr >> PAGE_SHIFT; 2462 root_gfn = pdptr >> PAGE_SHIFT;
2268 } else if (vcpu->arch.mmu.root_level == 0) 2463 if (mmu_check_root(vcpu, root_gfn))
2269 root_gfn = 0; 2464 return 1;
2270 if (mmu_check_root(vcpu, root_gfn))
2271 return 1;
2272 if (tdp_enabled) {
2273 direct = 1;
2274 root_gfn = i << 30;
2275 } 2465 }
2276 spin_lock(&vcpu->kvm->mmu_lock); 2466 spin_lock(&vcpu->kvm->mmu_lock);
2277 kvm_mmu_free_some_pages(vcpu); 2467 kvm_mmu_free_some_pages(vcpu);
2278 sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, 2468 sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
2279 PT32_ROOT_LEVEL, direct, 2469 PT32_ROOT_LEVEL, 0,
2280 ACC_ALL, NULL); 2470 ACC_ALL, NULL);
2281 root = __pa(sp->spt); 2471 root = __pa(sp->spt);
2282 ++sp->root_count; 2472 ++sp->root_count;
2283 spin_unlock(&vcpu->kvm->mmu_lock); 2473 spin_unlock(&vcpu->kvm->mmu_lock);
2284 2474
2285 vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK; 2475 vcpu->arch.mmu.pae_root[i] = root | pm_mask;
2286 } 2476 }
2287 vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root); 2477 vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
2478
2479 /*
2480 * If we shadow a 32 bit page table with a long mode page
2481 * table we enter this path.
2482 */
2483 if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
2484 if (vcpu->arch.mmu.lm_root == NULL) {
2485 /*
2486 * The additional page necessary for this is only
2487 * allocated on demand.
2488 */
2489
2490 u64 *lm_root;
2491
2492 lm_root = (void*)get_zeroed_page(GFP_KERNEL);
2493 if (lm_root == NULL)
2494 return 1;
2495
2496 lm_root[0] = __pa(vcpu->arch.mmu.pae_root) | pm_mask;
2497
2498 vcpu->arch.mmu.lm_root = lm_root;
2499 }
2500
2501 vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.lm_root);
2502 }
2503
2288 return 0; 2504 return 0;
2289} 2505}
2290 2506
2507static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
2508{
2509 if (vcpu->arch.mmu.direct_map)
2510 return mmu_alloc_direct_roots(vcpu);
2511 else
2512 return mmu_alloc_shadow_roots(vcpu);
2513}
2514
2291static void mmu_sync_roots(struct kvm_vcpu *vcpu) 2515static void mmu_sync_roots(struct kvm_vcpu *vcpu)
2292{ 2516{
2293 int i; 2517 int i;
2294 struct kvm_mmu_page *sp; 2518 struct kvm_mmu_page *sp;
2295 2519
2520 if (vcpu->arch.mmu.direct_map)
2521 return;
2522
2296 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) 2523 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
2297 return; 2524 return;
2298 if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { 2525
2526 trace_kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
2527 if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) {
2299 hpa_t root = vcpu->arch.mmu.root_hpa; 2528 hpa_t root = vcpu->arch.mmu.root_hpa;
2300 sp = page_header(root); 2529 sp = page_header(root);
2301 mmu_sync_children(vcpu, sp); 2530 mmu_sync_children(vcpu, sp);
@@ -2310,6 +2539,7 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu)
2310 mmu_sync_children(vcpu, sp); 2539 mmu_sync_children(vcpu, sp);
2311 } 2540 }
2312 } 2541 }
2542 trace_kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
2313} 2543}
2314 2544
2315void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) 2545void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
@@ -2327,6 +2557,14 @@ static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr,
2327 return vaddr; 2557 return vaddr;
2328} 2558}
2329 2559
2560static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gva_t vaddr,
2561 u32 access, u32 *error)
2562{
2563 if (error)
2564 *error = 0;
2565 return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access);
2566}
2567
2330static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, 2568static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
2331 u32 error_code) 2569 u32 error_code)
2332{ 2570{
@@ -2393,10 +2631,9 @@ static void nonpaging_free(struct kvm_vcpu *vcpu)
2393 mmu_free_roots(vcpu); 2631 mmu_free_roots(vcpu);
2394} 2632}
2395 2633
2396static int nonpaging_init_context(struct kvm_vcpu *vcpu) 2634static int nonpaging_init_context(struct kvm_vcpu *vcpu,
2635 struct kvm_mmu *context)
2397{ 2636{
2398 struct kvm_mmu *context = &vcpu->arch.mmu;
2399
2400 context->new_cr3 = nonpaging_new_cr3; 2637 context->new_cr3 = nonpaging_new_cr3;
2401 context->page_fault = nonpaging_page_fault; 2638 context->page_fault = nonpaging_page_fault;
2402 context->gva_to_gpa = nonpaging_gva_to_gpa; 2639 context->gva_to_gpa = nonpaging_gva_to_gpa;
@@ -2407,6 +2644,8 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu)
2407 context->root_level = 0; 2644 context->root_level = 0;
2408 context->shadow_root_level = PT32E_ROOT_LEVEL; 2645 context->shadow_root_level = PT32E_ROOT_LEVEL;
2409 context->root_hpa = INVALID_PAGE; 2646 context->root_hpa = INVALID_PAGE;
2647 context->direct_map = true;
2648 context->nx = false;
2410 return 0; 2649 return 0;
2411} 2650}
2412 2651
@@ -2422,11 +2661,14 @@ static void paging_new_cr3(struct kvm_vcpu *vcpu)
2422 mmu_free_roots(vcpu); 2661 mmu_free_roots(vcpu);
2423} 2662}
2424 2663
2425static void inject_page_fault(struct kvm_vcpu *vcpu, 2664static unsigned long get_cr3(struct kvm_vcpu *vcpu)
2426 u64 addr, 2665{
2427 u32 err_code) 2666 return vcpu->arch.cr3;
2667}
2668
2669static void inject_page_fault(struct kvm_vcpu *vcpu)
2428{ 2670{
2429 kvm_inject_page_fault(vcpu, addr, err_code); 2671 vcpu->arch.mmu.inject_page_fault(vcpu);
2430} 2672}
2431 2673
2432static void paging_free(struct kvm_vcpu *vcpu) 2674static void paging_free(struct kvm_vcpu *vcpu)
@@ -2434,12 +2676,12 @@ static void paging_free(struct kvm_vcpu *vcpu)
2434 nonpaging_free(vcpu); 2676 nonpaging_free(vcpu);
2435} 2677}
2436 2678
2437static bool is_rsvd_bits_set(struct kvm_vcpu *vcpu, u64 gpte, int level) 2679static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level)
2438{ 2680{
2439 int bit7; 2681 int bit7;
2440 2682
2441 bit7 = (gpte >> 7) & 1; 2683 bit7 = (gpte >> 7) & 1;
2442 return (gpte & vcpu->arch.mmu.rsvd_bits_mask[bit7][level-1]) != 0; 2684 return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) != 0;
2443} 2685}
2444 2686
2445#define PTTYPE 64 2687#define PTTYPE 64
@@ -2450,13 +2692,14 @@ static bool is_rsvd_bits_set(struct kvm_vcpu *vcpu, u64 gpte, int level)
2450#include "paging_tmpl.h" 2692#include "paging_tmpl.h"
2451#undef PTTYPE 2693#undef PTTYPE
2452 2694
2453static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level) 2695static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
2696 struct kvm_mmu *context,
2697 int level)
2454{ 2698{
2455 struct kvm_mmu *context = &vcpu->arch.mmu;
2456 int maxphyaddr = cpuid_maxphyaddr(vcpu); 2699 int maxphyaddr = cpuid_maxphyaddr(vcpu);
2457 u64 exb_bit_rsvd = 0; 2700 u64 exb_bit_rsvd = 0;
2458 2701
2459 if (!is_nx(vcpu)) 2702 if (!context->nx)
2460 exb_bit_rsvd = rsvd_bits(63, 63); 2703 exb_bit_rsvd = rsvd_bits(63, 63);
2461 switch (level) { 2704 switch (level) {
2462 case PT32_ROOT_LEVEL: 2705 case PT32_ROOT_LEVEL:
@@ -2511,9 +2754,13 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level)
2511 } 2754 }
2512} 2755}
2513 2756
2514static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level) 2757static int paging64_init_context_common(struct kvm_vcpu *vcpu,
2758 struct kvm_mmu *context,
2759 int level)
2515{ 2760{
2516 struct kvm_mmu *context = &vcpu->arch.mmu; 2761 context->nx = is_nx(vcpu);
2762
2763 reset_rsvds_bits_mask(vcpu, context, level);
2517 2764
2518 ASSERT(is_pae(vcpu)); 2765 ASSERT(is_pae(vcpu));
2519 context->new_cr3 = paging_new_cr3; 2766 context->new_cr3 = paging_new_cr3;
@@ -2526,20 +2773,23 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level)
2526 context->root_level = level; 2773 context->root_level = level;
2527 context->shadow_root_level = level; 2774 context->shadow_root_level = level;
2528 context->root_hpa = INVALID_PAGE; 2775 context->root_hpa = INVALID_PAGE;
2776 context->direct_map = false;
2529 return 0; 2777 return 0;
2530} 2778}
2531 2779
2532static int paging64_init_context(struct kvm_vcpu *vcpu) 2780static int paging64_init_context(struct kvm_vcpu *vcpu,
2781 struct kvm_mmu *context)
2533{ 2782{
2534 reset_rsvds_bits_mask(vcpu, PT64_ROOT_LEVEL); 2783 return paging64_init_context_common(vcpu, context, PT64_ROOT_LEVEL);
2535 return paging64_init_context_common(vcpu, PT64_ROOT_LEVEL);
2536} 2784}
2537 2785
2538static int paging32_init_context(struct kvm_vcpu *vcpu) 2786static int paging32_init_context(struct kvm_vcpu *vcpu,
2787 struct kvm_mmu *context)
2539{ 2788{
2540 struct kvm_mmu *context = &vcpu->arch.mmu; 2789 context->nx = false;
2790
2791 reset_rsvds_bits_mask(vcpu, context, PT32_ROOT_LEVEL);
2541 2792
2542 reset_rsvds_bits_mask(vcpu, PT32_ROOT_LEVEL);
2543 context->new_cr3 = paging_new_cr3; 2793 context->new_cr3 = paging_new_cr3;
2544 context->page_fault = paging32_page_fault; 2794 context->page_fault = paging32_page_fault;
2545 context->gva_to_gpa = paging32_gva_to_gpa; 2795 context->gva_to_gpa = paging32_gva_to_gpa;
@@ -2550,18 +2800,19 @@ static int paging32_init_context(struct kvm_vcpu *vcpu)
2550 context->root_level = PT32_ROOT_LEVEL; 2800 context->root_level = PT32_ROOT_LEVEL;
2551 context->shadow_root_level = PT32E_ROOT_LEVEL; 2801 context->shadow_root_level = PT32E_ROOT_LEVEL;
2552 context->root_hpa = INVALID_PAGE; 2802 context->root_hpa = INVALID_PAGE;
2803 context->direct_map = false;
2553 return 0; 2804 return 0;
2554} 2805}
2555 2806
2556static int paging32E_init_context(struct kvm_vcpu *vcpu) 2807static int paging32E_init_context(struct kvm_vcpu *vcpu,
2808 struct kvm_mmu *context)
2557{ 2809{
2558 reset_rsvds_bits_mask(vcpu, PT32E_ROOT_LEVEL); 2810 return paging64_init_context_common(vcpu, context, PT32E_ROOT_LEVEL);
2559 return paging64_init_context_common(vcpu, PT32E_ROOT_LEVEL);
2560} 2811}
2561 2812
2562static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) 2813static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
2563{ 2814{
2564 struct kvm_mmu *context = &vcpu->arch.mmu; 2815 struct kvm_mmu *context = vcpu->arch.walk_mmu;
2565 2816
2566 context->new_cr3 = nonpaging_new_cr3; 2817 context->new_cr3 = nonpaging_new_cr3;
2567 context->page_fault = tdp_page_fault; 2818 context->page_fault = tdp_page_fault;
@@ -2571,20 +2822,29 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
2571 context->invlpg = nonpaging_invlpg; 2822 context->invlpg = nonpaging_invlpg;
2572 context->shadow_root_level = kvm_x86_ops->get_tdp_level(); 2823 context->shadow_root_level = kvm_x86_ops->get_tdp_level();
2573 context->root_hpa = INVALID_PAGE; 2824 context->root_hpa = INVALID_PAGE;
2825 context->direct_map = true;
2826 context->set_cr3 = kvm_x86_ops->set_tdp_cr3;
2827 context->get_cr3 = get_cr3;
2828 context->inject_page_fault = kvm_inject_page_fault;
2829 context->nx = is_nx(vcpu);
2574 2830
2575 if (!is_paging(vcpu)) { 2831 if (!is_paging(vcpu)) {
2832 context->nx = false;
2576 context->gva_to_gpa = nonpaging_gva_to_gpa; 2833 context->gva_to_gpa = nonpaging_gva_to_gpa;
2577 context->root_level = 0; 2834 context->root_level = 0;
2578 } else if (is_long_mode(vcpu)) { 2835 } else if (is_long_mode(vcpu)) {
2579 reset_rsvds_bits_mask(vcpu, PT64_ROOT_LEVEL); 2836 context->nx = is_nx(vcpu);
2837 reset_rsvds_bits_mask(vcpu, context, PT64_ROOT_LEVEL);
2580 context->gva_to_gpa = paging64_gva_to_gpa; 2838 context->gva_to_gpa = paging64_gva_to_gpa;
2581 context->root_level = PT64_ROOT_LEVEL; 2839 context->root_level = PT64_ROOT_LEVEL;
2582 } else if (is_pae(vcpu)) { 2840 } else if (is_pae(vcpu)) {
2583 reset_rsvds_bits_mask(vcpu, PT32E_ROOT_LEVEL); 2841 context->nx = is_nx(vcpu);
2842 reset_rsvds_bits_mask(vcpu, context, PT32E_ROOT_LEVEL);
2584 context->gva_to_gpa = paging64_gva_to_gpa; 2843 context->gva_to_gpa = paging64_gva_to_gpa;
2585 context->root_level = PT32E_ROOT_LEVEL; 2844 context->root_level = PT32E_ROOT_LEVEL;
2586 } else { 2845 } else {
2587 reset_rsvds_bits_mask(vcpu, PT32_ROOT_LEVEL); 2846 context->nx = false;
2847 reset_rsvds_bits_mask(vcpu, context, PT32_ROOT_LEVEL);
2588 context->gva_to_gpa = paging32_gva_to_gpa; 2848 context->gva_to_gpa = paging32_gva_to_gpa;
2589 context->root_level = PT32_ROOT_LEVEL; 2849 context->root_level = PT32_ROOT_LEVEL;
2590 } 2850 }
@@ -2592,33 +2852,83 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
2592 return 0; 2852 return 0;
2593} 2853}
2594 2854
2595static int init_kvm_softmmu(struct kvm_vcpu *vcpu) 2855int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
2596{ 2856{
2597 int r; 2857 int r;
2598
2599 ASSERT(vcpu); 2858 ASSERT(vcpu);
2600 ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); 2859 ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
2601 2860
2602 if (!is_paging(vcpu)) 2861 if (!is_paging(vcpu))
2603 r = nonpaging_init_context(vcpu); 2862 r = nonpaging_init_context(vcpu, context);
2604 else if (is_long_mode(vcpu)) 2863 else if (is_long_mode(vcpu))
2605 r = paging64_init_context(vcpu); 2864 r = paging64_init_context(vcpu, context);
2606 else if (is_pae(vcpu)) 2865 else if (is_pae(vcpu))
2607 r = paging32E_init_context(vcpu); 2866 r = paging32E_init_context(vcpu, context);
2608 else 2867 else
2609 r = paging32_init_context(vcpu); 2868 r = paging32_init_context(vcpu, context);
2610 2869
2611 vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu); 2870 vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu);
2612 vcpu->arch.mmu.base_role.cr0_wp = is_write_protection(vcpu); 2871 vcpu->arch.mmu.base_role.cr0_wp = is_write_protection(vcpu);
2613 2872
2614 return r; 2873 return r;
2615} 2874}
2875EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu);
2876
2877static int init_kvm_softmmu(struct kvm_vcpu *vcpu)
2878{
2879 int r = kvm_init_shadow_mmu(vcpu, vcpu->arch.walk_mmu);
2880
2881 vcpu->arch.walk_mmu->set_cr3 = kvm_x86_ops->set_cr3;
2882 vcpu->arch.walk_mmu->get_cr3 = get_cr3;
2883 vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault;
2884
2885 return r;
2886}
2887
2888static int init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
2889{
2890 struct kvm_mmu *g_context = &vcpu->arch.nested_mmu;
2891
2892 g_context->get_cr3 = get_cr3;
2893 g_context->inject_page_fault = kvm_inject_page_fault;
2894
2895 /*
2896 * Note that arch.mmu.gva_to_gpa translates l2_gva to l1_gpa. The
2897 * translation of l2_gpa to l1_gpa addresses is done using the
2898 * arch.nested_mmu.gva_to_gpa function. Basically the gva_to_gpa
2899 * functions between mmu and nested_mmu are swapped.
2900 */
2901 if (!is_paging(vcpu)) {
2902 g_context->nx = false;
2903 g_context->root_level = 0;
2904 g_context->gva_to_gpa = nonpaging_gva_to_gpa_nested;
2905 } else if (is_long_mode(vcpu)) {
2906 g_context->nx = is_nx(vcpu);
2907 reset_rsvds_bits_mask(vcpu, g_context, PT64_ROOT_LEVEL);
2908 g_context->root_level = PT64_ROOT_LEVEL;
2909 g_context->gva_to_gpa = paging64_gva_to_gpa_nested;
2910 } else if (is_pae(vcpu)) {
2911 g_context->nx = is_nx(vcpu);
2912 reset_rsvds_bits_mask(vcpu, g_context, PT32E_ROOT_LEVEL);
2913 g_context->root_level = PT32E_ROOT_LEVEL;
2914 g_context->gva_to_gpa = paging64_gva_to_gpa_nested;
2915 } else {
2916 g_context->nx = false;
2917 reset_rsvds_bits_mask(vcpu, g_context, PT32_ROOT_LEVEL);
2918 g_context->root_level = PT32_ROOT_LEVEL;
2919 g_context->gva_to_gpa = paging32_gva_to_gpa_nested;
2920 }
2921
2922 return 0;
2923}
2616 2924
2617static int init_kvm_mmu(struct kvm_vcpu *vcpu) 2925static int init_kvm_mmu(struct kvm_vcpu *vcpu)
2618{ 2926{
2619 vcpu->arch.update_pte.pfn = bad_pfn; 2927 vcpu->arch.update_pte.pfn = bad_pfn;
2620 2928
2621 if (tdp_enabled) 2929 if (mmu_is_nested(vcpu))
2930 return init_kvm_nested_mmu(vcpu);
2931 else if (tdp_enabled)
2622 return init_kvm_tdp_mmu(vcpu); 2932 return init_kvm_tdp_mmu(vcpu);
2623 else 2933 else
2624 return init_kvm_softmmu(vcpu); 2934 return init_kvm_softmmu(vcpu);
@@ -2653,7 +2963,7 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
2653 if (r) 2963 if (r)
2654 goto out; 2964 goto out;
2655 /* set_cr3() should ensure TLB has been flushed */ 2965 /* set_cr3() should ensure TLB has been flushed */
2656 kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa); 2966 vcpu->arch.mmu.set_cr3(vcpu, vcpu->arch.mmu.root_hpa);
2657out: 2967out:
2658 return r; 2968 return r;
2659} 2969}
@@ -2663,6 +2973,7 @@ void kvm_mmu_unload(struct kvm_vcpu *vcpu)
2663{ 2973{
2664 mmu_free_roots(vcpu); 2974 mmu_free_roots(vcpu);
2665} 2975}
2976EXPORT_SYMBOL_GPL(kvm_mmu_unload);
2666 2977
2667static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu, 2978static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
2668 struct kvm_mmu_page *sp, 2979 struct kvm_mmu_page *sp,
@@ -2695,7 +3006,7 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
2695 return; 3006 return;
2696 } 3007 }
2697 3008
2698 if (is_rsvd_bits_set(vcpu, *(u64 *)new, PT_PAGE_TABLE_LEVEL)) 3009 if (is_rsvd_bits_set(&vcpu->arch.mmu, *(u64 *)new, PT_PAGE_TABLE_LEVEL))
2699 return; 3010 return;
2700 3011
2701 ++vcpu->kvm->stat.mmu_pte_updated; 3012 ++vcpu->kvm->stat.mmu_pte_updated;
@@ -2837,7 +3148,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
2837 kvm_mmu_access_page(vcpu, gfn); 3148 kvm_mmu_access_page(vcpu, gfn);
2838 kvm_mmu_free_some_pages(vcpu); 3149 kvm_mmu_free_some_pages(vcpu);
2839 ++vcpu->kvm->stat.mmu_pte_write; 3150 ++vcpu->kvm->stat.mmu_pte_write;
2840 kvm_mmu_audit(vcpu, "pre pte write"); 3151 trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE);
2841 if (guest_initiated) { 3152 if (guest_initiated) {
2842 if (gfn == vcpu->arch.last_pt_write_gfn 3153 if (gfn == vcpu->arch.last_pt_write_gfn
2843 && !last_updated_pte_accessed(vcpu)) { 3154 && !last_updated_pte_accessed(vcpu)) {
@@ -2910,7 +3221,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
2910 } 3221 }
2911 mmu_pte_write_flush_tlb(vcpu, zap_page, remote_flush, local_flush); 3222 mmu_pte_write_flush_tlb(vcpu, zap_page, remote_flush, local_flush);
2912 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); 3223 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
2913 kvm_mmu_audit(vcpu, "post pte write"); 3224 trace_kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE);
2914 spin_unlock(&vcpu->kvm->mmu_lock); 3225 spin_unlock(&vcpu->kvm->mmu_lock);
2915 if (!is_error_pfn(vcpu->arch.update_pte.pfn)) { 3226 if (!is_error_pfn(vcpu->arch.update_pte.pfn)) {
2916 kvm_release_pfn_clean(vcpu->arch.update_pte.pfn); 3227 kvm_release_pfn_clean(vcpu->arch.update_pte.pfn);
@@ -2923,7 +3234,7 @@ int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
2923 gpa_t gpa; 3234 gpa_t gpa;
2924 int r; 3235 int r;
2925 3236
2926 if (tdp_enabled) 3237 if (vcpu->arch.mmu.direct_map)
2927 return 0; 3238 return 0;
2928 3239
2929 gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL); 3240 gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL);
@@ -2937,21 +3248,18 @@ EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt);
2937 3248
2938void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) 3249void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
2939{ 3250{
2940 int free_pages;
2941 LIST_HEAD(invalid_list); 3251 LIST_HEAD(invalid_list);
2942 3252
2943 free_pages = vcpu->kvm->arch.n_free_mmu_pages; 3253 while (kvm_mmu_available_pages(vcpu->kvm) < KVM_REFILL_PAGES &&
2944 while (free_pages < KVM_REFILL_PAGES &&
2945 !list_empty(&vcpu->kvm->arch.active_mmu_pages)) { 3254 !list_empty(&vcpu->kvm->arch.active_mmu_pages)) {
2946 struct kvm_mmu_page *sp; 3255 struct kvm_mmu_page *sp;
2947 3256
2948 sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev, 3257 sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev,
2949 struct kvm_mmu_page, link); 3258 struct kvm_mmu_page, link);
2950 free_pages += kvm_mmu_prepare_zap_page(vcpu->kvm, sp, 3259 kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list);
2951 &invalid_list); 3260 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
2952 ++vcpu->kvm->stat.mmu_recycled; 3261 ++vcpu->kvm->stat.mmu_recycled;
2953 } 3262 }
2954 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
2955} 3263}
2956 3264
2957int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code) 3265int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
@@ -3013,6 +3321,8 @@ EXPORT_SYMBOL_GPL(kvm_disable_tdp);
3013static void free_mmu_pages(struct kvm_vcpu *vcpu) 3321static void free_mmu_pages(struct kvm_vcpu *vcpu)
3014{ 3322{
3015 free_page((unsigned long)vcpu->arch.mmu.pae_root); 3323 free_page((unsigned long)vcpu->arch.mmu.pae_root);
3324 if (vcpu->arch.mmu.lm_root != NULL)
3325 free_page((unsigned long)vcpu->arch.mmu.lm_root);
3016} 3326}
3017 3327
3018static int alloc_mmu_pages(struct kvm_vcpu *vcpu) 3328static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
@@ -3054,15 +3364,6 @@ int kvm_mmu_setup(struct kvm_vcpu *vcpu)
3054 return init_kvm_mmu(vcpu); 3364 return init_kvm_mmu(vcpu);
3055} 3365}
3056 3366
3057void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
3058{
3059 ASSERT(vcpu);
3060
3061 destroy_kvm_mmu(vcpu);
3062 free_mmu_pages(vcpu);
3063 mmu_free_memory_caches(vcpu);
3064}
3065
3066void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) 3367void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
3067{ 3368{
3068 struct kvm_mmu_page *sp; 3369 struct kvm_mmu_page *sp;
@@ -3112,23 +3413,22 @@ static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
3112{ 3413{
3113 struct kvm *kvm; 3414 struct kvm *kvm;
3114 struct kvm *kvm_freed = NULL; 3415 struct kvm *kvm_freed = NULL;
3115 int cache_count = 0; 3416
3417 if (nr_to_scan == 0)
3418 goto out;
3116 3419
3117 spin_lock(&kvm_lock); 3420 spin_lock(&kvm_lock);
3118 3421
3119 list_for_each_entry(kvm, &vm_list, vm_list) { 3422 list_for_each_entry(kvm, &vm_list, vm_list) {
3120 int npages, idx, freed_pages; 3423 int idx, freed_pages;
3121 LIST_HEAD(invalid_list); 3424 LIST_HEAD(invalid_list);
3122 3425
3123 idx = srcu_read_lock(&kvm->srcu); 3426 idx = srcu_read_lock(&kvm->srcu);
3124 spin_lock(&kvm->mmu_lock); 3427 spin_lock(&kvm->mmu_lock);
3125 npages = kvm->arch.n_alloc_mmu_pages - 3428 if (!kvm_freed && nr_to_scan > 0 &&
3126 kvm->arch.n_free_mmu_pages; 3429 kvm->arch.n_used_mmu_pages > 0) {
3127 cache_count += npages;
3128 if (!kvm_freed && nr_to_scan > 0 && npages > 0) {
3129 freed_pages = kvm_mmu_remove_some_alloc_mmu_pages(kvm, 3430 freed_pages = kvm_mmu_remove_some_alloc_mmu_pages(kvm,
3130 &invalid_list); 3431 &invalid_list);
3131 cache_count -= freed_pages;
3132 kvm_freed = kvm; 3432 kvm_freed = kvm;
3133 } 3433 }
3134 nr_to_scan--; 3434 nr_to_scan--;
@@ -3142,7 +3442,8 @@ static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
3142 3442
3143 spin_unlock(&kvm_lock); 3443 spin_unlock(&kvm_lock);
3144 3444
3145 return cache_count; 3445out:
3446 return percpu_counter_read_positive(&kvm_total_used_mmu_pages);
3146} 3447}
3147 3448
3148static struct shrinker mmu_shrinker = { 3449static struct shrinker mmu_shrinker = {
@@ -3163,6 +3464,7 @@ static void mmu_destroy_caches(void)
3163void kvm_mmu_module_exit(void) 3464void kvm_mmu_module_exit(void)
3164{ 3465{
3165 mmu_destroy_caches(); 3466 mmu_destroy_caches();
3467 percpu_counter_destroy(&kvm_total_used_mmu_pages);
3166 unregister_shrinker(&mmu_shrinker); 3468 unregister_shrinker(&mmu_shrinker);
3167} 3469}
3168 3470
@@ -3185,6 +3487,9 @@ int kvm_mmu_module_init(void)
3185 if (!mmu_page_header_cache) 3487 if (!mmu_page_header_cache)
3186 goto nomem; 3488 goto nomem;
3187 3489
3490 if (percpu_counter_init(&kvm_total_used_mmu_pages, 0))
3491 goto nomem;
3492
3188 register_shrinker(&mmu_shrinker); 3493 register_shrinker(&mmu_shrinker);
3189 3494
3190 return 0; 3495 return 0;
@@ -3355,271 +3660,18 @@ int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4])
3355} 3660}
3356EXPORT_SYMBOL_GPL(kvm_mmu_get_spte_hierarchy); 3661EXPORT_SYMBOL_GPL(kvm_mmu_get_spte_hierarchy);
3357 3662
3358#ifdef AUDIT 3663#ifdef CONFIG_KVM_MMU_AUDIT
3359 3664#include "mmu_audit.c"
3360static const char *audit_msg; 3665#else
3361 3666static void mmu_audit_disable(void) { }
3362static gva_t canonicalize(gva_t gva)
3363{
3364#ifdef CONFIG_X86_64
3365 gva = (long long)(gva << 16) >> 16;
3366#endif 3667#endif
3367 return gva;
3368}
3369
3370
3371typedef void (*inspect_spte_fn) (struct kvm *kvm, u64 *sptep);
3372
3373static void __mmu_spte_walk(struct kvm *kvm, struct kvm_mmu_page *sp,
3374 inspect_spte_fn fn)
3375{
3376 int i;
3377
3378 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
3379 u64 ent = sp->spt[i];
3380
3381 if (is_shadow_present_pte(ent)) {
3382 if (!is_last_spte(ent, sp->role.level)) {
3383 struct kvm_mmu_page *child;
3384 child = page_header(ent & PT64_BASE_ADDR_MASK);
3385 __mmu_spte_walk(kvm, child, fn);
3386 } else
3387 fn(kvm, &sp->spt[i]);
3388 }
3389 }
3390}
3391
3392static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn)
3393{
3394 int i;
3395 struct kvm_mmu_page *sp;
3396
3397 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
3398 return;
3399 if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
3400 hpa_t root = vcpu->arch.mmu.root_hpa;
3401 sp = page_header(root);
3402 __mmu_spte_walk(vcpu->kvm, sp, fn);
3403 return;
3404 }
3405 for (i = 0; i < 4; ++i) {
3406 hpa_t root = vcpu->arch.mmu.pae_root[i];
3407
3408 if (root && VALID_PAGE(root)) {
3409 root &= PT64_BASE_ADDR_MASK;
3410 sp = page_header(root);
3411 __mmu_spte_walk(vcpu->kvm, sp, fn);
3412 }
3413 }
3414 return;
3415}
3416
3417static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
3418 gva_t va, int level)
3419{
3420 u64 *pt = __va(page_pte & PT64_BASE_ADDR_MASK);
3421 int i;
3422 gva_t va_delta = 1ul << (PAGE_SHIFT + 9 * (level - 1));
3423
3424 for (i = 0; i < PT64_ENT_PER_PAGE; ++i, va += va_delta) {
3425 u64 ent = pt[i];
3426
3427 if (ent == shadow_trap_nonpresent_pte)
3428 continue;
3429
3430 va = canonicalize(va);
3431 if (is_shadow_present_pte(ent) && !is_last_spte(ent, level))
3432 audit_mappings_page(vcpu, ent, va, level - 1);
3433 else {
3434 gpa_t gpa = kvm_mmu_gva_to_gpa_read(vcpu, va, NULL);
3435 gfn_t gfn = gpa >> PAGE_SHIFT;
3436 pfn_t pfn = gfn_to_pfn(vcpu->kvm, gfn);
3437 hpa_t hpa = (hpa_t)pfn << PAGE_SHIFT;
3438 3668
3439 if (is_error_pfn(pfn)) { 3669void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
3440 kvm_release_pfn_clean(pfn);
3441 continue;
3442 }
3443
3444 if (is_shadow_present_pte(ent)
3445 && (ent & PT64_BASE_ADDR_MASK) != hpa)
3446 printk(KERN_ERR "xx audit error: (%s) levels %d"
3447 " gva %lx gpa %llx hpa %llx ent %llx %d\n",
3448 audit_msg, vcpu->arch.mmu.root_level,
3449 va, gpa, hpa, ent,
3450 is_shadow_present_pte(ent));
3451 else if (ent == shadow_notrap_nonpresent_pte
3452 && !is_error_hpa(hpa))
3453 printk(KERN_ERR "audit: (%s) notrap shadow,"
3454 " valid guest gva %lx\n", audit_msg, va);
3455 kvm_release_pfn_clean(pfn);
3456
3457 }
3458 }
3459}
3460
3461static void audit_mappings(struct kvm_vcpu *vcpu)
3462{
3463 unsigned i;
3464
3465 if (vcpu->arch.mmu.root_level == 4)
3466 audit_mappings_page(vcpu, vcpu->arch.mmu.root_hpa, 0, 4);
3467 else
3468 for (i = 0; i < 4; ++i)
3469 if (vcpu->arch.mmu.pae_root[i] & PT_PRESENT_MASK)
3470 audit_mappings_page(vcpu,
3471 vcpu->arch.mmu.pae_root[i],
3472 i << 30,
3473 2);
3474}
3475
3476static int count_rmaps(struct kvm_vcpu *vcpu)
3477{
3478 struct kvm *kvm = vcpu->kvm;
3479 struct kvm_memslots *slots;
3480 int nmaps = 0;
3481 int i, j, k, idx;
3482
3483 idx = srcu_read_lock(&kvm->srcu);
3484 slots = kvm_memslots(kvm);
3485 for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
3486 struct kvm_memory_slot *m = &slots->memslots[i];
3487 struct kvm_rmap_desc *d;
3488
3489 for (j = 0; j < m->npages; ++j) {
3490 unsigned long *rmapp = &m->rmap[j];
3491
3492 if (!*rmapp)
3493 continue;
3494 if (!(*rmapp & 1)) {
3495 ++nmaps;
3496 continue;
3497 }
3498 d = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
3499 while (d) {
3500 for (k = 0; k < RMAP_EXT; ++k)
3501 if (d->sptes[k])
3502 ++nmaps;
3503 else
3504 break;
3505 d = d->more;
3506 }
3507 }
3508 }
3509 srcu_read_unlock(&kvm->srcu, idx);
3510 return nmaps;
3511}
3512
3513void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
3514{
3515 unsigned long *rmapp;
3516 struct kvm_mmu_page *rev_sp;
3517 gfn_t gfn;
3518
3519 if (is_writable_pte(*sptep)) {
3520 rev_sp = page_header(__pa(sptep));
3521 gfn = kvm_mmu_page_get_gfn(rev_sp, sptep - rev_sp->spt);
3522
3523 if (!gfn_to_memslot(kvm, gfn)) {
3524 if (!printk_ratelimit())
3525 return;
3526 printk(KERN_ERR "%s: no memslot for gfn %ld\n",
3527 audit_msg, gfn);
3528 printk(KERN_ERR "%s: index %ld of sp (gfn=%lx)\n",
3529 audit_msg, (long int)(sptep - rev_sp->spt),
3530 rev_sp->gfn);
3531 dump_stack();
3532 return;
3533 }
3534
3535 rmapp = gfn_to_rmap(kvm, gfn, rev_sp->role.level);
3536 if (!*rmapp) {
3537 if (!printk_ratelimit())
3538 return;
3539 printk(KERN_ERR "%s: no rmap for writable spte %llx\n",
3540 audit_msg, *sptep);
3541 dump_stack();
3542 }
3543 }
3544
3545}
3546
3547void audit_writable_sptes_have_rmaps(struct kvm_vcpu *vcpu)
3548{
3549 mmu_spte_walk(vcpu, inspect_spte_has_rmap);
3550}
3551
3552static void check_writable_mappings_rmap(struct kvm_vcpu *vcpu)
3553{
3554 struct kvm_mmu_page *sp;
3555 int i;
3556
3557 list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) {
3558 u64 *pt = sp->spt;
3559
3560 if (sp->role.level != PT_PAGE_TABLE_LEVEL)
3561 continue;
3562
3563 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
3564 u64 ent = pt[i];
3565
3566 if (!(ent & PT_PRESENT_MASK))
3567 continue;
3568 if (!is_writable_pte(ent))
3569 continue;
3570 inspect_spte_has_rmap(vcpu->kvm, &pt[i]);
3571 }
3572 }
3573 return;
3574}
3575
3576static void audit_rmap(struct kvm_vcpu *vcpu)
3577{
3578 check_writable_mappings_rmap(vcpu);
3579 count_rmaps(vcpu);
3580}
3581
3582static void audit_write_protection(struct kvm_vcpu *vcpu)
3583{
3584 struct kvm_mmu_page *sp;
3585 struct kvm_memory_slot *slot;
3586 unsigned long *rmapp;
3587 u64 *spte;
3588 gfn_t gfn;
3589
3590 list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) {
3591 if (sp->role.direct)
3592 continue;
3593 if (sp->unsync)
3594 continue;
3595
3596 slot = gfn_to_memslot(vcpu->kvm, sp->gfn);
3597 rmapp = &slot->rmap[gfn - slot->base_gfn];
3598
3599 spte = rmap_next(vcpu->kvm, rmapp, NULL);
3600 while (spte) {
3601 if (is_writable_pte(*spte))
3602 printk(KERN_ERR "%s: (%s) shadow page has "
3603 "writable mappings: gfn %lx role %x\n",
3604 __func__, audit_msg, sp->gfn,
3605 sp->role.word);
3606 spte = rmap_next(vcpu->kvm, rmapp, spte);
3607 }
3608 }
3609}
3610
3611static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg)
3612{ 3670{
3613 int olddbg = dbg; 3671 ASSERT(vcpu);
3614 3672
3615 dbg = 0; 3673 destroy_kvm_mmu(vcpu);
3616 audit_msg = msg; 3674 free_mmu_pages(vcpu);
3617 audit_rmap(vcpu); 3675 mmu_free_memory_caches(vcpu);
3618 audit_write_protection(vcpu); 3676 mmu_audit_disable();
3619 if (strcmp("pre pte write", audit_msg) != 0)
3620 audit_mappings(vcpu);
3621 audit_writable_sptes_have_rmaps(vcpu);
3622 dbg = olddbg;
3623} 3677}
3624
3625#endif
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index be66759321a5..7086ca85d3e7 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -49,10 +49,17 @@
49#define PFERR_FETCH_MASK (1U << 4) 49#define PFERR_FETCH_MASK (1U << 4)
50 50
51int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]); 51int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]);
52int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context);
53
54static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm)
55{
56 return kvm->arch.n_max_mmu_pages -
57 kvm->arch.n_used_mmu_pages;
58}
52 59
53static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) 60static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
54{ 61{
55 if (unlikely(vcpu->kvm->arch.n_free_mmu_pages < KVM_MIN_FREE_MMU_PAGES)) 62 if (unlikely(kvm_mmu_available_pages(vcpu->kvm)< KVM_MIN_FREE_MMU_PAGES))
56 __kvm_mmu_free_some_pages(vcpu); 63 __kvm_mmu_free_some_pages(vcpu);
57} 64}
58 65
diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c
new file mode 100644
index 000000000000..ba2bcdde6221
--- /dev/null
+++ b/arch/x86/kvm/mmu_audit.c
@@ -0,0 +1,299 @@
1/*
2 * mmu_audit.c:
3 *
4 * Audit code for KVM MMU
5 *
6 * Copyright (C) 2006 Qumranet, Inc.
7 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
8 *
9 * Authors:
10 * Yaniv Kamay <yaniv@qumranet.com>
11 * Avi Kivity <avi@qumranet.com>
12 * Marcelo Tosatti <mtosatti@redhat.com>
13 * Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
14 *
15 * This work is licensed under the terms of the GNU GPL, version 2. See
16 * the COPYING file in the top-level directory.
17 *
18 */
19
20#include <linux/ratelimit.h>
21
22static int audit_point;
23
24#define audit_printk(fmt, args...) \
25 printk(KERN_ERR "audit: (%s) error: " \
26 fmt, audit_point_name[audit_point], ##args)
27
28typedef void (*inspect_spte_fn) (struct kvm_vcpu *vcpu, u64 *sptep, int level);
29
30static void __mmu_spte_walk(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
31 inspect_spte_fn fn, int level)
32{
33 int i;
34
35 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
36 u64 *ent = sp->spt;
37
38 fn(vcpu, ent + i, level);
39
40 if (is_shadow_present_pte(ent[i]) &&
41 !is_last_spte(ent[i], level)) {
42 struct kvm_mmu_page *child;
43
44 child = page_header(ent[i] & PT64_BASE_ADDR_MASK);
45 __mmu_spte_walk(vcpu, child, fn, level - 1);
46 }
47 }
48}
49
50static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn)
51{
52 int i;
53 struct kvm_mmu_page *sp;
54
55 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
56 return;
57
58 if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) {
59 hpa_t root = vcpu->arch.mmu.root_hpa;
60
61 sp = page_header(root);
62 __mmu_spte_walk(vcpu, sp, fn, PT64_ROOT_LEVEL);
63 return;
64 }
65
66 for (i = 0; i < 4; ++i) {
67 hpa_t root = vcpu->arch.mmu.pae_root[i];
68
69 if (root && VALID_PAGE(root)) {
70 root &= PT64_BASE_ADDR_MASK;
71 sp = page_header(root);
72 __mmu_spte_walk(vcpu, sp, fn, 2);
73 }
74 }
75
76 return;
77}
78
79typedef void (*sp_handler) (struct kvm *kvm, struct kvm_mmu_page *sp);
80
81static void walk_all_active_sps(struct kvm *kvm, sp_handler fn)
82{
83 struct kvm_mmu_page *sp;
84
85 list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link)
86 fn(kvm, sp);
87}
88
89static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level)
90{
91 struct kvm_mmu_page *sp;
92 gfn_t gfn;
93 pfn_t pfn;
94 hpa_t hpa;
95
96 sp = page_header(__pa(sptep));
97
98 if (sp->unsync) {
99 if (level != PT_PAGE_TABLE_LEVEL) {
100 audit_printk("unsync sp: %p level = %d\n", sp, level);
101 return;
102 }
103
104 if (*sptep == shadow_notrap_nonpresent_pte) {
105 audit_printk("notrap spte in unsync sp: %p\n", sp);
106 return;
107 }
108 }
109
110 if (sp->role.direct && *sptep == shadow_notrap_nonpresent_pte) {
111 audit_printk("notrap spte in direct sp: %p\n", sp);
112 return;
113 }
114
115 if (!is_shadow_present_pte(*sptep) || !is_last_spte(*sptep, level))
116 return;
117
118 gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
119 pfn = gfn_to_pfn_atomic(vcpu->kvm, gfn);
120
121 if (is_error_pfn(pfn)) {
122 kvm_release_pfn_clean(pfn);
123 return;
124 }
125
126 hpa = pfn << PAGE_SHIFT;
127 if ((*sptep & PT64_BASE_ADDR_MASK) != hpa)
128 audit_printk("levels %d pfn %llx hpa %llx ent %llxn",
129 vcpu->arch.mmu.root_level, pfn, hpa, *sptep);
130}
131
132static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
133{
134 unsigned long *rmapp;
135 struct kvm_mmu_page *rev_sp;
136 gfn_t gfn;
137
138
139 rev_sp = page_header(__pa(sptep));
140 gfn = kvm_mmu_page_get_gfn(rev_sp, sptep - rev_sp->spt);
141
142 if (!gfn_to_memslot(kvm, gfn)) {
143 if (!printk_ratelimit())
144 return;
145 audit_printk("no memslot for gfn %llx\n", gfn);
146 audit_printk("index %ld of sp (gfn=%llx)\n",
147 (long int)(sptep - rev_sp->spt), rev_sp->gfn);
148 dump_stack();
149 return;
150 }
151
152 rmapp = gfn_to_rmap(kvm, gfn, rev_sp->role.level);
153 if (!*rmapp) {
154 if (!printk_ratelimit())
155 return;
156 audit_printk("no rmap for writable spte %llx\n", *sptep);
157 dump_stack();
158 }
159}
160
161static void audit_sptes_have_rmaps(struct kvm_vcpu *vcpu, u64 *sptep, int level)
162{
163 if (is_shadow_present_pte(*sptep) && is_last_spte(*sptep, level))
164 inspect_spte_has_rmap(vcpu->kvm, sptep);
165}
166
167static void audit_spte_after_sync(struct kvm_vcpu *vcpu, u64 *sptep, int level)
168{
169 struct kvm_mmu_page *sp = page_header(__pa(sptep));
170
171 if (audit_point == AUDIT_POST_SYNC && sp->unsync)
172 audit_printk("meet unsync sp(%p) after sync root.\n", sp);
173}
174
175static void check_mappings_rmap(struct kvm *kvm, struct kvm_mmu_page *sp)
176{
177 int i;
178
179 if (sp->role.level != PT_PAGE_TABLE_LEVEL)
180 return;
181
182 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
183 if (!is_rmap_spte(sp->spt[i]))
184 continue;
185
186 inspect_spte_has_rmap(kvm, sp->spt + i);
187 }
188}
189
190static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp)
191{
192 struct kvm_memory_slot *slot;
193 unsigned long *rmapp;
194 u64 *spte;
195
196 if (sp->role.direct || sp->unsync || sp->role.invalid)
197 return;
198
199 slot = gfn_to_memslot(kvm, sp->gfn);
200 rmapp = &slot->rmap[sp->gfn - slot->base_gfn];
201
202 spte = rmap_next(kvm, rmapp, NULL);
203 while (spte) {
204 if (is_writable_pte(*spte))
205 audit_printk("shadow page has writable mappings: gfn "
206 "%llx role %x\n", sp->gfn, sp->role.word);
207 spte = rmap_next(kvm, rmapp, spte);
208 }
209}
210
211static void audit_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
212{
213 check_mappings_rmap(kvm, sp);
214 audit_write_protection(kvm, sp);
215}
216
217static void audit_all_active_sps(struct kvm *kvm)
218{
219 walk_all_active_sps(kvm, audit_sp);
220}
221
222static void audit_spte(struct kvm_vcpu *vcpu, u64 *sptep, int level)
223{
224 audit_sptes_have_rmaps(vcpu, sptep, level);
225 audit_mappings(vcpu, sptep, level);
226 audit_spte_after_sync(vcpu, sptep, level);
227}
228
229static void audit_vcpu_spte(struct kvm_vcpu *vcpu)
230{
231 mmu_spte_walk(vcpu, audit_spte);
232}
233
234static void kvm_mmu_audit(void *ignore, struct kvm_vcpu *vcpu, int point)
235{
236 static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 10);
237
238 if (!__ratelimit(&ratelimit_state))
239 return;
240
241 audit_point = point;
242 audit_all_active_sps(vcpu->kvm);
243 audit_vcpu_spte(vcpu);
244}
245
246static bool mmu_audit;
247
248static void mmu_audit_enable(void)
249{
250 int ret;
251
252 if (mmu_audit)
253 return;
254
255 ret = register_trace_kvm_mmu_audit(kvm_mmu_audit, NULL);
256 WARN_ON(ret);
257
258 mmu_audit = true;
259}
260
261static void mmu_audit_disable(void)
262{
263 if (!mmu_audit)
264 return;
265
266 unregister_trace_kvm_mmu_audit(kvm_mmu_audit, NULL);
267 tracepoint_synchronize_unregister();
268 mmu_audit = false;
269}
270
271static int mmu_audit_set(const char *val, const struct kernel_param *kp)
272{
273 int ret;
274 unsigned long enable;
275
276 ret = strict_strtoul(val, 10, &enable);
277 if (ret < 0)
278 return -EINVAL;
279
280 switch (enable) {
281 case 0:
282 mmu_audit_disable();
283 break;
284 case 1:
285 mmu_audit_enable();
286 break;
287 default:
288 return -EINVAL;
289 }
290
291 return 0;
292}
293
294static struct kernel_param_ops audit_param_ops = {
295 .set = mmu_audit_set,
296 .get = param_get_bool,
297};
298
299module_param_cb(mmu_audit, &audit_param_ops, &mmu_audit, 0644);
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
index 3aab0f0930ef..b60b4fdb3eda 100644
--- a/arch/x86/kvm/mmutrace.h
+++ b/arch/x86/kvm/mmutrace.h
@@ -195,6 +195,25 @@ DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_prepare_zap_page,
195 195
196 TP_ARGS(sp) 196 TP_ARGS(sp)
197); 197);
198
199TRACE_EVENT(
200 kvm_mmu_audit,
201 TP_PROTO(struct kvm_vcpu *vcpu, int audit_point),
202 TP_ARGS(vcpu, audit_point),
203
204 TP_STRUCT__entry(
205 __field(struct kvm_vcpu *, vcpu)
206 __field(int, audit_point)
207 ),
208
209 TP_fast_assign(
210 __entry->vcpu = vcpu;
211 __entry->audit_point = audit_point;
212 ),
213
214 TP_printk("vcpu:%d %s", __entry->vcpu->cpu,
215 audit_point_name[__entry->audit_point])
216);
198#endif /* _TRACE_KVMMMU_H */ 217#endif /* _TRACE_KVMMMU_H */
199 218
200#undef TRACE_INCLUDE_PATH 219#undef TRACE_INCLUDE_PATH
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 51ef9097960d..cd7a833a3b52 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -7,7 +7,7 @@
7 * MMU support 7 * MMU support
8 * 8 *
9 * Copyright (C) 2006 Qumranet, Inc. 9 * Copyright (C) 2006 Qumranet, Inc.
10 * Copyright 2010 Red Hat, Inc. and/or its affilates. 10 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
11 * 11 *
12 * Authors: 12 * Authors:
13 * Yaniv Kamay <yaniv@qumranet.com> 13 * Yaniv Kamay <yaniv@qumranet.com>
@@ -67,6 +67,7 @@ struct guest_walker {
67 int level; 67 int level;
68 gfn_t table_gfn[PT_MAX_FULL_LEVELS]; 68 gfn_t table_gfn[PT_MAX_FULL_LEVELS];
69 pt_element_t ptes[PT_MAX_FULL_LEVELS]; 69 pt_element_t ptes[PT_MAX_FULL_LEVELS];
70 pt_element_t prefetch_ptes[PTE_PREFETCH_NUM];
70 gpa_t pte_gpa[PT_MAX_FULL_LEVELS]; 71 gpa_t pte_gpa[PT_MAX_FULL_LEVELS];
71 unsigned pt_access; 72 unsigned pt_access;
72 unsigned pte_access; 73 unsigned pte_access;
@@ -104,7 +105,7 @@ static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte)
104 105
105 access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK; 106 access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK;
106#if PTTYPE == 64 107#if PTTYPE == 64
107 if (is_nx(vcpu)) 108 if (vcpu->arch.mmu.nx)
108 access &= ~(gpte >> PT64_NX_SHIFT); 109 access &= ~(gpte >> PT64_NX_SHIFT);
109#endif 110#endif
110 return access; 111 return access;
@@ -113,26 +114,32 @@ static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte)
113/* 114/*
114 * Fetch a guest pte for a guest virtual address 115 * Fetch a guest pte for a guest virtual address
115 */ 116 */
116static int FNAME(walk_addr)(struct guest_walker *walker, 117static int FNAME(walk_addr_generic)(struct guest_walker *walker,
117 struct kvm_vcpu *vcpu, gva_t addr, 118 struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
118 int write_fault, int user_fault, int fetch_fault) 119 gva_t addr, u32 access)
119{ 120{
120 pt_element_t pte; 121 pt_element_t pte;
121 gfn_t table_gfn; 122 gfn_t table_gfn;
122 unsigned index, pt_access, uninitialized_var(pte_access); 123 unsigned index, pt_access, uninitialized_var(pte_access);
123 gpa_t pte_gpa; 124 gpa_t pte_gpa;
124 bool eperm, present, rsvd_fault; 125 bool eperm, present, rsvd_fault;
126 int offset, write_fault, user_fault, fetch_fault;
127
128 write_fault = access & PFERR_WRITE_MASK;
129 user_fault = access & PFERR_USER_MASK;
130 fetch_fault = access & PFERR_FETCH_MASK;
125 131
126 trace_kvm_mmu_pagetable_walk(addr, write_fault, user_fault, 132 trace_kvm_mmu_pagetable_walk(addr, write_fault, user_fault,
127 fetch_fault); 133 fetch_fault);
128walk: 134walk:
129 present = true; 135 present = true;
130 eperm = rsvd_fault = false; 136 eperm = rsvd_fault = false;
131 walker->level = vcpu->arch.mmu.root_level; 137 walker->level = mmu->root_level;
132 pte = vcpu->arch.cr3; 138 pte = mmu->get_cr3(vcpu);
139
133#if PTTYPE == 64 140#if PTTYPE == 64
134 if (!is_long_mode(vcpu)) { 141 if (walker->level == PT32E_ROOT_LEVEL) {
135 pte = kvm_pdptr_read(vcpu, (addr >> 30) & 3); 142 pte = kvm_pdptr_read_mmu(vcpu, mmu, (addr >> 30) & 3);
136 trace_kvm_mmu_paging_element(pte, walker->level); 143 trace_kvm_mmu_paging_element(pte, walker->level);
137 if (!is_present_gpte(pte)) { 144 if (!is_present_gpte(pte)) {
138 present = false; 145 present = false;
@@ -142,7 +149,7 @@ walk:
142 } 149 }
143#endif 150#endif
144 ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) || 151 ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) ||
145 (vcpu->arch.cr3 & CR3_NONPAE_RESERVED_BITS) == 0); 152 (mmu->get_cr3(vcpu) & CR3_NONPAE_RESERVED_BITS) == 0);
146 153
147 pt_access = ACC_ALL; 154 pt_access = ACC_ALL;
148 155
@@ -150,12 +157,14 @@ walk:
150 index = PT_INDEX(addr, walker->level); 157 index = PT_INDEX(addr, walker->level);
151 158
152 table_gfn = gpte_to_gfn(pte); 159 table_gfn = gpte_to_gfn(pte);
153 pte_gpa = gfn_to_gpa(table_gfn); 160 offset = index * sizeof(pt_element_t);
154 pte_gpa += index * sizeof(pt_element_t); 161 pte_gpa = gfn_to_gpa(table_gfn) + offset;
155 walker->table_gfn[walker->level - 1] = table_gfn; 162 walker->table_gfn[walker->level - 1] = table_gfn;
156 walker->pte_gpa[walker->level - 1] = pte_gpa; 163 walker->pte_gpa[walker->level - 1] = pte_gpa;
157 164
158 if (kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte))) { 165 if (kvm_read_guest_page_mmu(vcpu, mmu, table_gfn, &pte,
166 offset, sizeof(pte),
167 PFERR_USER_MASK|PFERR_WRITE_MASK)) {
159 present = false; 168 present = false;
160 break; 169 break;
161 } 170 }
@@ -167,7 +176,7 @@ walk:
167 break; 176 break;
168 } 177 }
169 178
170 if (is_rsvd_bits_set(vcpu, pte, walker->level)) { 179 if (is_rsvd_bits_set(&vcpu->arch.mmu, pte, walker->level)) {
171 rsvd_fault = true; 180 rsvd_fault = true;
172 break; 181 break;
173 } 182 }
@@ -204,17 +213,28 @@ walk:
204 (PTTYPE == 64 || is_pse(vcpu))) || 213 (PTTYPE == 64 || is_pse(vcpu))) ||
205 ((walker->level == PT_PDPE_LEVEL) && 214 ((walker->level == PT_PDPE_LEVEL) &&
206 is_large_pte(pte) && 215 is_large_pte(pte) &&
207 is_long_mode(vcpu))) { 216 mmu->root_level == PT64_ROOT_LEVEL)) {
208 int lvl = walker->level; 217 int lvl = walker->level;
218 gpa_t real_gpa;
219 gfn_t gfn;
220 u32 ac;
209 221
210 walker->gfn = gpte_to_gfn_lvl(pte, lvl); 222 gfn = gpte_to_gfn_lvl(pte, lvl);
211 walker->gfn += (addr & PT_LVL_OFFSET_MASK(lvl)) 223 gfn += (addr & PT_LVL_OFFSET_MASK(lvl)) >> PAGE_SHIFT;
212 >> PAGE_SHIFT;
213 224
214 if (PTTYPE == 32 && 225 if (PTTYPE == 32 &&
215 walker->level == PT_DIRECTORY_LEVEL && 226 walker->level == PT_DIRECTORY_LEVEL &&
216 is_cpuid_PSE36()) 227 is_cpuid_PSE36())
217 walker->gfn += pse36_gfn_delta(pte); 228 gfn += pse36_gfn_delta(pte);
229
230 ac = write_fault | fetch_fault | user_fault;
231
232 real_gpa = mmu->translate_gpa(vcpu, gfn_to_gpa(gfn),
233 ac);
234 if (real_gpa == UNMAPPED_GVA)
235 return 0;
236
237 walker->gfn = real_gpa >> PAGE_SHIFT;
218 238
219 break; 239 break;
220 } 240 }
@@ -249,18 +269,36 @@ error:
249 walker->error_code = 0; 269 walker->error_code = 0;
250 if (present) 270 if (present)
251 walker->error_code |= PFERR_PRESENT_MASK; 271 walker->error_code |= PFERR_PRESENT_MASK;
252 if (write_fault) 272
253 walker->error_code |= PFERR_WRITE_MASK; 273 walker->error_code |= write_fault | user_fault;
254 if (user_fault) 274
255 walker->error_code |= PFERR_USER_MASK; 275 if (fetch_fault && mmu->nx)
256 if (fetch_fault && is_nx(vcpu))
257 walker->error_code |= PFERR_FETCH_MASK; 276 walker->error_code |= PFERR_FETCH_MASK;
258 if (rsvd_fault) 277 if (rsvd_fault)
259 walker->error_code |= PFERR_RSVD_MASK; 278 walker->error_code |= PFERR_RSVD_MASK;
279
280 vcpu->arch.fault.address = addr;
281 vcpu->arch.fault.error_code = walker->error_code;
282
260 trace_kvm_mmu_walker_error(walker->error_code); 283 trace_kvm_mmu_walker_error(walker->error_code);
261 return 0; 284 return 0;
262} 285}
263 286
287static int FNAME(walk_addr)(struct guest_walker *walker,
288 struct kvm_vcpu *vcpu, gva_t addr, u32 access)
289{
290 return FNAME(walk_addr_generic)(walker, vcpu, &vcpu->arch.mmu, addr,
291 access);
292}
293
294static int FNAME(walk_addr_nested)(struct guest_walker *walker,
295 struct kvm_vcpu *vcpu, gva_t addr,
296 u32 access)
297{
298 return FNAME(walk_addr_generic)(walker, vcpu, &vcpu->arch.nested_mmu,
299 addr, access);
300}
301
264static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, 302static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
265 u64 *spte, const void *pte) 303 u64 *spte, const void *pte)
266{ 304{
@@ -302,14 +340,87 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
302static bool FNAME(gpte_changed)(struct kvm_vcpu *vcpu, 340static bool FNAME(gpte_changed)(struct kvm_vcpu *vcpu,
303 struct guest_walker *gw, int level) 341 struct guest_walker *gw, int level)
304{ 342{
305 int r;
306 pt_element_t curr_pte; 343 pt_element_t curr_pte;
307 344 gpa_t base_gpa, pte_gpa = gw->pte_gpa[level - 1];
308 r = kvm_read_guest_atomic(vcpu->kvm, gw->pte_gpa[level - 1], 345 u64 mask;
346 int r, index;
347
348 if (level == PT_PAGE_TABLE_LEVEL) {
349 mask = PTE_PREFETCH_NUM * sizeof(pt_element_t) - 1;
350 base_gpa = pte_gpa & ~mask;
351 index = (pte_gpa - base_gpa) / sizeof(pt_element_t);
352
353 r = kvm_read_guest_atomic(vcpu->kvm, base_gpa,
354 gw->prefetch_ptes, sizeof(gw->prefetch_ptes));
355 curr_pte = gw->prefetch_ptes[index];
356 } else
357 r = kvm_read_guest_atomic(vcpu->kvm, pte_gpa,
309 &curr_pte, sizeof(curr_pte)); 358 &curr_pte, sizeof(curr_pte));
359
310 return r || curr_pte != gw->ptes[level - 1]; 360 return r || curr_pte != gw->ptes[level - 1];
311} 361}
312 362
363static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
364 u64 *sptep)
365{
366 struct kvm_mmu_page *sp;
367 struct kvm_mmu *mmu = &vcpu->arch.mmu;
368 pt_element_t *gptep = gw->prefetch_ptes;
369 u64 *spte;
370 int i;
371
372 sp = page_header(__pa(sptep));
373
374 if (sp->role.level > PT_PAGE_TABLE_LEVEL)
375 return;
376
377 if (sp->role.direct)
378 return __direct_pte_prefetch(vcpu, sp, sptep);
379
380 i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1);
381 spte = sp->spt + i;
382
383 for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) {
384 pt_element_t gpte;
385 unsigned pte_access;
386 gfn_t gfn;
387 pfn_t pfn;
388 bool dirty;
389
390 if (spte == sptep)
391 continue;
392
393 if (*spte != shadow_trap_nonpresent_pte)
394 continue;
395
396 gpte = gptep[i];
397
398 if (!is_present_gpte(gpte) ||
399 is_rsvd_bits_set(mmu, gpte, PT_PAGE_TABLE_LEVEL)) {
400 if (!sp->unsync)
401 __set_spte(spte, shadow_notrap_nonpresent_pte);
402 continue;
403 }
404
405 if (!(gpte & PT_ACCESSED_MASK))
406 continue;
407
408 pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
409 gfn = gpte_to_gfn(gpte);
410 dirty = is_dirty_gpte(gpte);
411 pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn,
412 (pte_access & ACC_WRITE_MASK) && dirty);
413 if (is_error_pfn(pfn)) {
414 kvm_release_pfn_clean(pfn);
415 break;
416 }
417
418 mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0,
419 dirty, NULL, PT_PAGE_TABLE_LEVEL, gfn,
420 pfn, true, true);
421 }
422}
423
313/* 424/*
314 * Fetch a shadow pte for a specific level in the paging hierarchy. 425 * Fetch a shadow pte for a specific level in the paging hierarchy.
315 */ 426 */
@@ -391,6 +502,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
391 mmu_set_spte(vcpu, it.sptep, access, gw->pte_access & access, 502 mmu_set_spte(vcpu, it.sptep, access, gw->pte_access & access,
392 user_fault, write_fault, dirty, ptwrite, it.level, 503 user_fault, write_fault, dirty, ptwrite, it.level,
393 gw->gfn, pfn, false, true); 504 gw->gfn, pfn, false, true);
505 FNAME(pte_prefetch)(vcpu, gw, it.sptep);
394 506
395 return it.sptep; 507 return it.sptep;
396 508
@@ -420,7 +532,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
420{ 532{
421 int write_fault = error_code & PFERR_WRITE_MASK; 533 int write_fault = error_code & PFERR_WRITE_MASK;
422 int user_fault = error_code & PFERR_USER_MASK; 534 int user_fault = error_code & PFERR_USER_MASK;
423 int fetch_fault = error_code & PFERR_FETCH_MASK;
424 struct guest_walker walker; 535 struct guest_walker walker;
425 u64 *sptep; 536 u64 *sptep;
426 int write_pt = 0; 537 int write_pt = 0;
@@ -430,7 +541,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
430 unsigned long mmu_seq; 541 unsigned long mmu_seq;
431 542
432 pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code); 543 pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
433 kvm_mmu_audit(vcpu, "pre page fault");
434 544
435 r = mmu_topup_memory_caches(vcpu); 545 r = mmu_topup_memory_caches(vcpu);
436 if (r) 546 if (r)
@@ -439,15 +549,14 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
439 /* 549 /*
440 * Look up the guest pte for the faulting address. 550 * Look up the guest pte for the faulting address.
441 */ 551 */
442 r = FNAME(walk_addr)(&walker, vcpu, addr, write_fault, user_fault, 552 r = FNAME(walk_addr)(&walker, vcpu, addr, error_code);
443 fetch_fault);
444 553
445 /* 554 /*
446 * The page is not mapped by the guest. Let the guest handle it. 555 * The page is not mapped by the guest. Let the guest handle it.
447 */ 556 */
448 if (!r) { 557 if (!r) {
449 pgprintk("%s: guest page fault\n", __func__); 558 pgprintk("%s: guest page fault\n", __func__);
450 inject_page_fault(vcpu, addr, walker.error_code); 559 inject_page_fault(vcpu);
451 vcpu->arch.last_pt_write_count = 0; /* reset fork detector */ 560 vcpu->arch.last_pt_write_count = 0; /* reset fork detector */
452 return 0; 561 return 0;
453 } 562 }
@@ -468,6 +577,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
468 spin_lock(&vcpu->kvm->mmu_lock); 577 spin_lock(&vcpu->kvm->mmu_lock);
469 if (mmu_notifier_retry(vcpu, mmu_seq)) 578 if (mmu_notifier_retry(vcpu, mmu_seq))
470 goto out_unlock; 579 goto out_unlock;
580
581 trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT);
471 kvm_mmu_free_some_pages(vcpu); 582 kvm_mmu_free_some_pages(vcpu);
472 sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault, 583 sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
473 level, &write_pt, pfn); 584 level, &write_pt, pfn);
@@ -479,7 +590,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
479 vcpu->arch.last_pt_write_count = 0; /* reset fork detector */ 590 vcpu->arch.last_pt_write_count = 0; /* reset fork detector */
480 591
481 ++vcpu->stat.pf_fixed; 592 ++vcpu->stat.pf_fixed;
482 kvm_mmu_audit(vcpu, "post page fault (fixed)"); 593 trace_kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT);
483 spin_unlock(&vcpu->kvm->mmu_lock); 594 spin_unlock(&vcpu->kvm->mmu_lock);
484 595
485 return write_pt; 596 return write_pt;
@@ -556,10 +667,25 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access,
556 gpa_t gpa = UNMAPPED_GVA; 667 gpa_t gpa = UNMAPPED_GVA;
557 int r; 668 int r;
558 669
559 r = FNAME(walk_addr)(&walker, vcpu, vaddr, 670 r = FNAME(walk_addr)(&walker, vcpu, vaddr, access);
560 !!(access & PFERR_WRITE_MASK), 671
561 !!(access & PFERR_USER_MASK), 672 if (r) {
562 !!(access & PFERR_FETCH_MASK)); 673 gpa = gfn_to_gpa(walker.gfn);
674 gpa |= vaddr & ~PAGE_MASK;
675 } else if (error)
676 *error = walker.error_code;
677
678 return gpa;
679}
680
681static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr,
682 u32 access, u32 *error)
683{
684 struct guest_walker walker;
685 gpa_t gpa = UNMAPPED_GVA;
686 int r;
687
688 r = FNAME(walk_addr_nested)(&walker, vcpu, vaddr, access);
563 689
564 if (r) { 690 if (r) {
565 gpa = gfn_to_gpa(walker.gfn); 691 gpa = gfn_to_gpa(walker.gfn);
@@ -638,7 +764,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
638 return -EINVAL; 764 return -EINVAL;
639 765
640 gfn = gpte_to_gfn(gpte); 766 gfn = gpte_to_gfn(gpte);
641 if (is_rsvd_bits_set(vcpu, gpte, PT_PAGE_TABLE_LEVEL) 767 if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL)
642 || gfn != sp->gfns[i] || !is_present_gpte(gpte) 768 || gfn != sp->gfns[i] || !is_present_gpte(gpte)
643 || !(gpte & PT_ACCESSED_MASK)) { 769 || !(gpte & PT_ACCESSED_MASK)) {
644 u64 nonpresent; 770 u64 nonpresent;
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 8a3f9f64f86f..82e144a4e514 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -4,7 +4,7 @@
4 * AMD SVM support 4 * AMD SVM support
5 * 5 *
6 * Copyright (C) 2006 Qumranet, Inc. 6 * Copyright (C) 2006 Qumranet, Inc.
7 * Copyright 2010 Red Hat, Inc. and/or its affilates. 7 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
8 * 8 *
9 * Authors: 9 * Authors:
10 * Yaniv Kamay <yaniv@qumranet.com> 10 * Yaniv Kamay <yaniv@qumranet.com>
@@ -88,6 +88,14 @@ struct nested_state {
88 /* A VMEXIT is required but not yet emulated */ 88 /* A VMEXIT is required but not yet emulated */
89 bool exit_required; 89 bool exit_required;
90 90
91 /*
92 * If we vmexit during an instruction emulation we need this to restore
93 * the l1 guest rip after the emulation
94 */
95 unsigned long vmexit_rip;
96 unsigned long vmexit_rsp;
97 unsigned long vmexit_rax;
98
91 /* cache for intercepts of the guest */ 99 /* cache for intercepts of the guest */
92 u16 intercept_cr_read; 100 u16 intercept_cr_read;
93 u16 intercept_cr_write; 101 u16 intercept_cr_write;
@@ -96,6 +104,8 @@ struct nested_state {
96 u32 intercept_exceptions; 104 u32 intercept_exceptions;
97 u64 intercept; 105 u64 intercept;
98 106
107 /* Nested Paging related state */
108 u64 nested_cr3;
99}; 109};
100 110
101#define MSRPM_OFFSETS 16 111#define MSRPM_OFFSETS 16
@@ -284,6 +294,15 @@ static inline void flush_guest_tlb(struct kvm_vcpu *vcpu)
284 force_new_asid(vcpu); 294 force_new_asid(vcpu);
285} 295}
286 296
297static int get_npt_level(void)
298{
299#ifdef CONFIG_X86_64
300 return PT64_ROOT_LEVEL;
301#else
302 return PT32E_ROOT_LEVEL;
303#endif
304}
305
287static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer) 306static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
288{ 307{
289 vcpu->arch.efer = efer; 308 vcpu->arch.efer = efer;
@@ -701,6 +720,29 @@ static void init_sys_seg(struct vmcb_seg *seg, uint32_t type)
701 seg->base = 0; 720 seg->base = 0;
702} 721}
703 722
723static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
724{
725 struct vcpu_svm *svm = to_svm(vcpu);
726 u64 g_tsc_offset = 0;
727
728 if (is_nested(svm)) {
729 g_tsc_offset = svm->vmcb->control.tsc_offset -
730 svm->nested.hsave->control.tsc_offset;
731 svm->nested.hsave->control.tsc_offset = offset;
732 }
733
734 svm->vmcb->control.tsc_offset = offset + g_tsc_offset;
735}
736
737static void svm_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment)
738{
739 struct vcpu_svm *svm = to_svm(vcpu);
740
741 svm->vmcb->control.tsc_offset += adjustment;
742 if (is_nested(svm))
743 svm->nested.hsave->control.tsc_offset += adjustment;
744}
745
704static void init_vmcb(struct vcpu_svm *svm) 746static void init_vmcb(struct vcpu_svm *svm)
705{ 747{
706 struct vmcb_control_area *control = &svm->vmcb->control; 748 struct vmcb_control_area *control = &svm->vmcb->control;
@@ -793,7 +835,7 @@ static void init_vmcb(struct vcpu_svm *svm)
793 init_sys_seg(&save->ldtr, SEG_TYPE_LDT); 835 init_sys_seg(&save->ldtr, SEG_TYPE_LDT);
794 init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16); 836 init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16);
795 837
796 save->efer = EFER_SVME; 838 svm_set_efer(&svm->vcpu, 0);
797 save->dr6 = 0xffff0ff0; 839 save->dr6 = 0xffff0ff0;
798 save->dr7 = 0x400; 840 save->dr7 = 0x400;
799 save->rflags = 2; 841 save->rflags = 2;
@@ -804,8 +846,8 @@ static void init_vmcb(struct vcpu_svm *svm)
804 * This is the guest-visible cr0 value. 846 * This is the guest-visible cr0 value.
805 * svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0. 847 * svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0.
806 */ 848 */
807 svm->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET; 849 svm->vcpu.arch.cr0 = 0;
808 (void)kvm_set_cr0(&svm->vcpu, svm->vcpu.arch.cr0); 850 (void)kvm_set_cr0(&svm->vcpu, X86_CR0_NW | X86_CR0_CD | X86_CR0_ET);
809 851
810 save->cr4 = X86_CR4_PAE; 852 save->cr4 = X86_CR4_PAE;
811 /* rdx = ?? */ 853 /* rdx = ?? */
@@ -901,7 +943,7 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
901 svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT; 943 svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT;
902 svm->asid_generation = 0; 944 svm->asid_generation = 0;
903 init_vmcb(svm); 945 init_vmcb(svm);
904 svm->vmcb->control.tsc_offset = 0-native_read_tsc(); 946 kvm_write_tsc(&svm->vcpu, 0);
905 947
906 err = fx_init(&svm->vcpu); 948 err = fx_init(&svm->vcpu);
907 if (err) 949 if (err)
@@ -947,20 +989,6 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
947 int i; 989 int i;
948 990
949 if (unlikely(cpu != vcpu->cpu)) { 991 if (unlikely(cpu != vcpu->cpu)) {
950 u64 delta;
951
952 if (check_tsc_unstable()) {
953 /*
954 * Make sure that the guest sees a monotonically
955 * increasing TSC.
956 */
957 delta = vcpu->arch.host_tsc - native_read_tsc();
958 svm->vmcb->control.tsc_offset += delta;
959 if (is_nested(svm))
960 svm->nested.hsave->control.tsc_offset += delta;
961 }
962 vcpu->cpu = cpu;
963 kvm_migrate_timers(vcpu);
964 svm->asid_generation = 0; 992 svm->asid_generation = 0;
965 } 993 }
966 994
@@ -976,8 +1004,6 @@ static void svm_vcpu_put(struct kvm_vcpu *vcpu)
976 ++vcpu->stat.host_state_reload; 1004 ++vcpu->stat.host_state_reload;
977 for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) 1005 for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
978 wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]); 1006 wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
979
980 vcpu->arch.host_tsc = native_read_tsc();
981} 1007}
982 1008
983static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu) 1009static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
@@ -995,7 +1021,7 @@ static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
995 switch (reg) { 1021 switch (reg) {
996 case VCPU_EXREG_PDPTR: 1022 case VCPU_EXREG_PDPTR:
997 BUG_ON(!npt_enabled); 1023 BUG_ON(!npt_enabled);
998 load_pdptrs(vcpu, vcpu->arch.cr3); 1024 load_pdptrs(vcpu, vcpu->arch.walk_mmu, vcpu->arch.cr3);
999 break; 1025 break;
1000 default: 1026 default:
1001 BUG(); 1027 BUG();
@@ -1206,8 +1232,12 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
1206 if (old == new) { 1232 if (old == new) {
1207 /* cr0 write with ts and mp unchanged */ 1233 /* cr0 write with ts and mp unchanged */
1208 svm->vmcb->control.exit_code = SVM_EXIT_CR0_SEL_WRITE; 1234 svm->vmcb->control.exit_code = SVM_EXIT_CR0_SEL_WRITE;
1209 if (nested_svm_exit_handled(svm) == NESTED_EXIT_DONE) 1235 if (nested_svm_exit_handled(svm) == NESTED_EXIT_DONE) {
1236 svm->nested.vmexit_rip = kvm_rip_read(vcpu);
1237 svm->nested.vmexit_rsp = kvm_register_read(vcpu, VCPU_REGS_RSP);
1238 svm->nested.vmexit_rax = kvm_register_read(vcpu, VCPU_REGS_RAX);
1210 return; 1239 return;
1240 }
1211 } 1241 }
1212 } 1242 }
1213 1243
@@ -1581,6 +1611,54 @@ static int vmmcall_interception(struct vcpu_svm *svm)
1581 return 1; 1611 return 1;
1582} 1612}
1583 1613
1614static unsigned long nested_svm_get_tdp_cr3(struct kvm_vcpu *vcpu)
1615{
1616 struct vcpu_svm *svm = to_svm(vcpu);
1617
1618 return svm->nested.nested_cr3;
1619}
1620
1621static void nested_svm_set_tdp_cr3(struct kvm_vcpu *vcpu,
1622 unsigned long root)
1623{
1624 struct vcpu_svm *svm = to_svm(vcpu);
1625
1626 svm->vmcb->control.nested_cr3 = root;
1627 force_new_asid(vcpu);
1628}
1629
1630static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu)
1631{
1632 struct vcpu_svm *svm = to_svm(vcpu);
1633
1634 svm->vmcb->control.exit_code = SVM_EXIT_NPF;
1635 svm->vmcb->control.exit_code_hi = 0;
1636 svm->vmcb->control.exit_info_1 = vcpu->arch.fault.error_code;
1637 svm->vmcb->control.exit_info_2 = vcpu->arch.fault.address;
1638
1639 nested_svm_vmexit(svm);
1640}
1641
1642static int nested_svm_init_mmu_context(struct kvm_vcpu *vcpu)
1643{
1644 int r;
1645
1646 r = kvm_init_shadow_mmu(vcpu, &vcpu->arch.mmu);
1647
1648 vcpu->arch.mmu.set_cr3 = nested_svm_set_tdp_cr3;
1649 vcpu->arch.mmu.get_cr3 = nested_svm_get_tdp_cr3;
1650 vcpu->arch.mmu.inject_page_fault = nested_svm_inject_npf_exit;
1651 vcpu->arch.mmu.shadow_root_level = get_npt_level();
1652 vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu;
1653
1654 return r;
1655}
1656
1657static void nested_svm_uninit_mmu_context(struct kvm_vcpu *vcpu)
1658{
1659 vcpu->arch.walk_mmu = &vcpu->arch.mmu;
1660}
1661
1584static int nested_svm_check_permissions(struct vcpu_svm *svm) 1662static int nested_svm_check_permissions(struct vcpu_svm *svm)
1585{ 1663{
1586 if (!(svm->vcpu.arch.efer & EFER_SVME) 1664 if (!(svm->vcpu.arch.efer & EFER_SVME)
@@ -1629,6 +1707,14 @@ static inline bool nested_svm_intr(struct vcpu_svm *svm)
1629 if (!(svm->vcpu.arch.hflags & HF_HIF_MASK)) 1707 if (!(svm->vcpu.arch.hflags & HF_HIF_MASK))
1630 return false; 1708 return false;
1631 1709
1710 /*
1711 * if vmexit was already requested (by intercepted exception
1712 * for instance) do not overwrite it with "external interrupt"
1713 * vmexit.
1714 */
1715 if (svm->nested.exit_required)
1716 return false;
1717
1632 svm->vmcb->control.exit_code = SVM_EXIT_INTR; 1718 svm->vmcb->control.exit_code = SVM_EXIT_INTR;
1633 svm->vmcb->control.exit_info_1 = 0; 1719 svm->vmcb->control.exit_info_1 = 0;
1634 svm->vmcb->control.exit_info_2 = 0; 1720 svm->vmcb->control.exit_info_2 = 0;
@@ -1896,6 +1982,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
1896 nested_vmcb->save.ds = vmcb->save.ds; 1982 nested_vmcb->save.ds = vmcb->save.ds;
1897 nested_vmcb->save.gdtr = vmcb->save.gdtr; 1983 nested_vmcb->save.gdtr = vmcb->save.gdtr;
1898 nested_vmcb->save.idtr = vmcb->save.idtr; 1984 nested_vmcb->save.idtr = vmcb->save.idtr;
1985 nested_vmcb->save.efer = svm->vcpu.arch.efer;
1899 nested_vmcb->save.cr0 = kvm_read_cr0(&svm->vcpu); 1986 nested_vmcb->save.cr0 = kvm_read_cr0(&svm->vcpu);
1900 nested_vmcb->save.cr3 = svm->vcpu.arch.cr3; 1987 nested_vmcb->save.cr3 = svm->vcpu.arch.cr3;
1901 nested_vmcb->save.cr2 = vmcb->save.cr2; 1988 nested_vmcb->save.cr2 = vmcb->save.cr2;
@@ -1917,6 +2004,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
1917 nested_vmcb->control.exit_info_2 = vmcb->control.exit_info_2; 2004 nested_vmcb->control.exit_info_2 = vmcb->control.exit_info_2;
1918 nested_vmcb->control.exit_int_info = vmcb->control.exit_int_info; 2005 nested_vmcb->control.exit_int_info = vmcb->control.exit_int_info;
1919 nested_vmcb->control.exit_int_info_err = vmcb->control.exit_int_info_err; 2006 nested_vmcb->control.exit_int_info_err = vmcb->control.exit_int_info_err;
2007 nested_vmcb->control.next_rip = vmcb->control.next_rip;
1920 2008
1921 /* 2009 /*
1922 * If we emulate a VMRUN/#VMEXIT in the same host #vmexit cycle we have 2010 * If we emulate a VMRUN/#VMEXIT in the same host #vmexit cycle we have
@@ -1947,6 +2035,8 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
1947 kvm_clear_exception_queue(&svm->vcpu); 2035 kvm_clear_exception_queue(&svm->vcpu);
1948 kvm_clear_interrupt_queue(&svm->vcpu); 2036 kvm_clear_interrupt_queue(&svm->vcpu);
1949 2037
2038 svm->nested.nested_cr3 = 0;
2039
1950 /* Restore selected save entries */ 2040 /* Restore selected save entries */
1951 svm->vmcb->save.es = hsave->save.es; 2041 svm->vmcb->save.es = hsave->save.es;
1952 svm->vmcb->save.cs = hsave->save.cs; 2042 svm->vmcb->save.cs = hsave->save.cs;
@@ -1973,6 +2063,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
1973 2063
1974 nested_svm_unmap(page); 2064 nested_svm_unmap(page);
1975 2065
2066 nested_svm_uninit_mmu_context(&svm->vcpu);
1976 kvm_mmu_reset_context(&svm->vcpu); 2067 kvm_mmu_reset_context(&svm->vcpu);
1977 kvm_mmu_load(&svm->vcpu); 2068 kvm_mmu_load(&svm->vcpu);
1978 2069
@@ -2012,6 +2103,20 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
2012 return true; 2103 return true;
2013} 2104}
2014 2105
2106static bool nested_vmcb_checks(struct vmcb *vmcb)
2107{
2108 if ((vmcb->control.intercept & (1ULL << INTERCEPT_VMRUN)) == 0)
2109 return false;
2110
2111 if (vmcb->control.asid == 0)
2112 return false;
2113
2114 if (vmcb->control.nested_ctl && !npt_enabled)
2115 return false;
2116
2117 return true;
2118}
2119
2015static bool nested_svm_vmrun(struct vcpu_svm *svm) 2120static bool nested_svm_vmrun(struct vcpu_svm *svm)
2016{ 2121{
2017 struct vmcb *nested_vmcb; 2122 struct vmcb *nested_vmcb;
@@ -2026,7 +2131,18 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
2026 if (!nested_vmcb) 2131 if (!nested_vmcb)
2027 return false; 2132 return false;
2028 2133
2029 trace_kvm_nested_vmrun(svm->vmcb->save.rip - 3, vmcb_gpa, 2134 if (!nested_vmcb_checks(nested_vmcb)) {
2135 nested_vmcb->control.exit_code = SVM_EXIT_ERR;
2136 nested_vmcb->control.exit_code_hi = 0;
2137 nested_vmcb->control.exit_info_1 = 0;
2138 nested_vmcb->control.exit_info_2 = 0;
2139
2140 nested_svm_unmap(page);
2141
2142 return false;
2143 }
2144
2145 trace_kvm_nested_vmrun(svm->vmcb->save.rip, vmcb_gpa,
2030 nested_vmcb->save.rip, 2146 nested_vmcb->save.rip,
2031 nested_vmcb->control.int_ctl, 2147 nested_vmcb->control.int_ctl,
2032 nested_vmcb->control.event_inj, 2148 nested_vmcb->control.event_inj,
@@ -2055,7 +2171,7 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
2055 hsave->save.cr0 = kvm_read_cr0(&svm->vcpu); 2171 hsave->save.cr0 = kvm_read_cr0(&svm->vcpu);
2056 hsave->save.cr4 = svm->vcpu.arch.cr4; 2172 hsave->save.cr4 = svm->vcpu.arch.cr4;
2057 hsave->save.rflags = vmcb->save.rflags; 2173 hsave->save.rflags = vmcb->save.rflags;
2058 hsave->save.rip = svm->next_rip; 2174 hsave->save.rip = kvm_rip_read(&svm->vcpu);
2059 hsave->save.rsp = vmcb->save.rsp; 2175 hsave->save.rsp = vmcb->save.rsp;
2060 hsave->save.rax = vmcb->save.rax; 2176 hsave->save.rax = vmcb->save.rax;
2061 if (npt_enabled) 2177 if (npt_enabled)
@@ -2070,6 +2186,12 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
2070 else 2186 else
2071 svm->vcpu.arch.hflags &= ~HF_HIF_MASK; 2187 svm->vcpu.arch.hflags &= ~HF_HIF_MASK;
2072 2188
2189 if (nested_vmcb->control.nested_ctl) {
2190 kvm_mmu_unload(&svm->vcpu);
2191 svm->nested.nested_cr3 = nested_vmcb->control.nested_cr3;
2192 nested_svm_init_mmu_context(&svm->vcpu);
2193 }
2194
2073 /* Load the nested guest state */ 2195 /* Load the nested guest state */
2074 svm->vmcb->save.es = nested_vmcb->save.es; 2196 svm->vmcb->save.es = nested_vmcb->save.es;
2075 svm->vmcb->save.cs = nested_vmcb->save.cs; 2197 svm->vmcb->save.cs = nested_vmcb->save.cs;
@@ -2227,8 +2349,8 @@ static int vmrun_interception(struct vcpu_svm *svm)
2227 if (nested_svm_check_permissions(svm)) 2349 if (nested_svm_check_permissions(svm))
2228 return 1; 2350 return 1;
2229 2351
2230 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; 2352 /* Save rip after vmrun instruction */
2231 skip_emulated_instruction(&svm->vcpu); 2353 kvm_rip_write(&svm->vcpu, kvm_rip_read(&svm->vcpu) + 3);
2232 2354
2233 if (!nested_svm_vmrun(svm)) 2355 if (!nested_svm_vmrun(svm))
2234 return 1; 2356 return 1;
@@ -2257,6 +2379,7 @@ static int stgi_interception(struct vcpu_svm *svm)
2257 2379
2258 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; 2380 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
2259 skip_emulated_instruction(&svm->vcpu); 2381 skip_emulated_instruction(&svm->vcpu);
2382 kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
2260 2383
2261 enable_gif(svm); 2384 enable_gif(svm);
2262 2385
@@ -2399,6 +2522,23 @@ static int emulate_on_interception(struct vcpu_svm *svm)
2399 return emulate_instruction(&svm->vcpu, 0, 0, 0) == EMULATE_DONE; 2522 return emulate_instruction(&svm->vcpu, 0, 0, 0) == EMULATE_DONE;
2400} 2523}
2401 2524
2525static int cr0_write_interception(struct vcpu_svm *svm)
2526{
2527 struct kvm_vcpu *vcpu = &svm->vcpu;
2528 int r;
2529
2530 r = emulate_instruction(&svm->vcpu, 0, 0, 0);
2531
2532 if (svm->nested.vmexit_rip) {
2533 kvm_register_write(vcpu, VCPU_REGS_RIP, svm->nested.vmexit_rip);
2534 kvm_register_write(vcpu, VCPU_REGS_RSP, svm->nested.vmexit_rsp);
2535 kvm_register_write(vcpu, VCPU_REGS_RAX, svm->nested.vmexit_rax);
2536 svm->nested.vmexit_rip = 0;
2537 }
2538
2539 return r == EMULATE_DONE;
2540}
2541
2402static int cr8_write_interception(struct vcpu_svm *svm) 2542static int cr8_write_interception(struct vcpu_svm *svm)
2403{ 2543{
2404 struct kvm_run *kvm_run = svm->vcpu.run; 2544 struct kvm_run *kvm_run = svm->vcpu.run;
@@ -2542,20 +2682,9 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
2542 struct vcpu_svm *svm = to_svm(vcpu); 2682 struct vcpu_svm *svm = to_svm(vcpu);
2543 2683
2544 switch (ecx) { 2684 switch (ecx) {
2545 case MSR_IA32_TSC: { 2685 case MSR_IA32_TSC:
2546 u64 tsc_offset = data - native_read_tsc(); 2686 kvm_write_tsc(vcpu, data);
2547 u64 g_tsc_offset = 0;
2548
2549 if (is_nested(svm)) {
2550 g_tsc_offset = svm->vmcb->control.tsc_offset -
2551 svm->nested.hsave->control.tsc_offset;
2552 svm->nested.hsave->control.tsc_offset = tsc_offset;
2553 }
2554
2555 svm->vmcb->control.tsc_offset = tsc_offset + g_tsc_offset;
2556
2557 break; 2687 break;
2558 }
2559 case MSR_STAR: 2688 case MSR_STAR:
2560 svm->vmcb->save.star = data; 2689 svm->vmcb->save.star = data;
2561 break; 2690 break;
@@ -2643,6 +2772,7 @@ static int interrupt_window_interception(struct vcpu_svm *svm)
2643{ 2772{
2644 struct kvm_run *kvm_run = svm->vcpu.run; 2773 struct kvm_run *kvm_run = svm->vcpu.run;
2645 2774
2775 kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
2646 svm_clear_vintr(svm); 2776 svm_clear_vintr(svm);
2647 svm->vmcb->control.int_ctl &= ~V_IRQ_MASK; 2777 svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
2648 /* 2778 /*
@@ -2672,7 +2802,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = {
2672 [SVM_EXIT_READ_CR4] = emulate_on_interception, 2802 [SVM_EXIT_READ_CR4] = emulate_on_interception,
2673 [SVM_EXIT_READ_CR8] = emulate_on_interception, 2803 [SVM_EXIT_READ_CR8] = emulate_on_interception,
2674 [SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception, 2804 [SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception,
2675 [SVM_EXIT_WRITE_CR0] = emulate_on_interception, 2805 [SVM_EXIT_WRITE_CR0] = cr0_write_interception,
2676 [SVM_EXIT_WRITE_CR3] = emulate_on_interception, 2806 [SVM_EXIT_WRITE_CR3] = emulate_on_interception,
2677 [SVM_EXIT_WRITE_CR4] = emulate_on_interception, 2807 [SVM_EXIT_WRITE_CR4] = emulate_on_interception,
2678 [SVM_EXIT_WRITE_CR8] = cr8_write_interception, 2808 [SVM_EXIT_WRITE_CR8] = cr8_write_interception,
@@ -2871,7 +3001,8 @@ static int handle_exit(struct kvm_vcpu *vcpu)
2871 3001
2872 if (is_external_interrupt(svm->vmcb->control.exit_int_info) && 3002 if (is_external_interrupt(svm->vmcb->control.exit_int_info) &&
2873 exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR && 3003 exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR &&
2874 exit_code != SVM_EXIT_NPF && exit_code != SVM_EXIT_TASK_SWITCH) 3004 exit_code != SVM_EXIT_NPF && exit_code != SVM_EXIT_TASK_SWITCH &&
3005 exit_code != SVM_EXIT_INTR && exit_code != SVM_EXIT_NMI)
2875 printk(KERN_ERR "%s: unexpected exit_ini_info 0x%x " 3006 printk(KERN_ERR "%s: unexpected exit_ini_info 0x%x "
2876 "exit_code 0x%x\n", 3007 "exit_code 0x%x\n",
2877 __func__, svm->vmcb->control.exit_int_info, 3008 __func__, svm->vmcb->control.exit_int_info,
@@ -3088,8 +3219,10 @@ static void svm_complete_interrupts(struct vcpu_svm *svm)
3088 3219
3089 svm->int3_injected = 0; 3220 svm->int3_injected = 0;
3090 3221
3091 if (svm->vcpu.arch.hflags & HF_IRET_MASK) 3222 if (svm->vcpu.arch.hflags & HF_IRET_MASK) {
3092 svm->vcpu.arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK); 3223 svm->vcpu.arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK);
3224 kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
3225 }
3093 3226
3094 svm->vcpu.arch.nmi_injected = false; 3227 svm->vcpu.arch.nmi_injected = false;
3095 kvm_clear_exception_queue(&svm->vcpu); 3228 kvm_clear_exception_queue(&svm->vcpu);
@@ -3098,6 +3231,8 @@ static void svm_complete_interrupts(struct vcpu_svm *svm)
3098 if (!(exitintinfo & SVM_EXITINTINFO_VALID)) 3231 if (!(exitintinfo & SVM_EXITINTINFO_VALID))
3099 return; 3232 return;
3100 3233
3234 kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
3235
3101 vector = exitintinfo & SVM_EXITINTINFO_VEC_MASK; 3236 vector = exitintinfo & SVM_EXITINTINFO_VEC_MASK;
3102 type = exitintinfo & SVM_EXITINTINFO_TYPE_MASK; 3237 type = exitintinfo & SVM_EXITINTINFO_TYPE_MASK;
3103 3238
@@ -3134,6 +3269,17 @@ static void svm_complete_interrupts(struct vcpu_svm *svm)
3134 } 3269 }
3135} 3270}
3136 3271
3272static void svm_cancel_injection(struct kvm_vcpu *vcpu)
3273{
3274 struct vcpu_svm *svm = to_svm(vcpu);
3275 struct vmcb_control_area *control = &svm->vmcb->control;
3276
3277 control->exit_int_info = control->event_inj;
3278 control->exit_int_info_err = control->event_inj_err;
3279 control->event_inj = 0;
3280 svm_complete_interrupts(svm);
3281}
3282
3137#ifdef CONFIG_X86_64 3283#ifdef CONFIG_X86_64
3138#define R "r" 3284#define R "r"
3139#else 3285#else
@@ -3167,9 +3313,6 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
3167 savesegment(gs, gs_selector); 3313 savesegment(gs, gs_selector);
3168 ldt_selector = kvm_read_ldt(); 3314 ldt_selector = kvm_read_ldt();
3169 svm->vmcb->save.cr2 = vcpu->arch.cr2; 3315 svm->vmcb->save.cr2 = vcpu->arch.cr2;
3170 /* required for live migration with NPT */
3171 if (npt_enabled)
3172 svm->vmcb->save.cr3 = vcpu->arch.cr3;
3173 3316
3174 clgi(); 3317 clgi();
3175 3318
@@ -3291,16 +3434,22 @@ static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root)
3291{ 3434{
3292 struct vcpu_svm *svm = to_svm(vcpu); 3435 struct vcpu_svm *svm = to_svm(vcpu);
3293 3436
3294 if (npt_enabled) {
3295 svm->vmcb->control.nested_cr3 = root;
3296 force_new_asid(vcpu);
3297 return;
3298 }
3299
3300 svm->vmcb->save.cr3 = root; 3437 svm->vmcb->save.cr3 = root;
3301 force_new_asid(vcpu); 3438 force_new_asid(vcpu);
3302} 3439}
3303 3440
3441static void set_tdp_cr3(struct kvm_vcpu *vcpu, unsigned long root)
3442{
3443 struct vcpu_svm *svm = to_svm(vcpu);
3444
3445 svm->vmcb->control.nested_cr3 = root;
3446
3447 /* Also sync guest cr3 here in case we live migrate */
3448 svm->vmcb->save.cr3 = vcpu->arch.cr3;
3449
3450 force_new_asid(vcpu);
3451}
3452
3304static int is_disabled(void) 3453static int is_disabled(void)
3305{ 3454{
3306 u64 vm_cr; 3455 u64 vm_cr;
@@ -3333,15 +3482,6 @@ static bool svm_cpu_has_accelerated_tpr(void)
3333 return false; 3482 return false;
3334} 3483}
3335 3484
3336static int get_npt_level(void)
3337{
3338#ifdef CONFIG_X86_64
3339 return PT64_ROOT_LEVEL;
3340#else
3341 return PT32E_ROOT_LEVEL;
3342#endif
3343}
3344
3345static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) 3485static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
3346{ 3486{
3347 return 0; 3487 return 0;
@@ -3354,12 +3494,25 @@ static void svm_cpuid_update(struct kvm_vcpu *vcpu)
3354static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) 3494static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
3355{ 3495{
3356 switch (func) { 3496 switch (func) {
3497 case 0x80000001:
3498 if (nested)
3499 entry->ecx |= (1 << 2); /* Set SVM bit */
3500 break;
3357 case 0x8000000A: 3501 case 0x8000000A:
3358 entry->eax = 1; /* SVM revision 1 */ 3502 entry->eax = 1; /* SVM revision 1 */
3359 entry->ebx = 8; /* Lets support 8 ASIDs in case we add proper 3503 entry->ebx = 8; /* Lets support 8 ASIDs in case we add proper
3360 ASID emulation to nested SVM */ 3504 ASID emulation to nested SVM */
3361 entry->ecx = 0; /* Reserved */ 3505 entry->ecx = 0; /* Reserved */
3362 entry->edx = 0; /* Do not support any additional features */ 3506 entry->edx = 0; /* Per default do not support any
3507 additional features */
3508
3509 /* Support next_rip if host supports it */
3510 if (svm_has(SVM_FEATURE_NRIP))
3511 entry->edx |= SVM_FEATURE_NRIP;
3512
3513 /* Support NPT for the guest if enabled */
3514 if (npt_enabled)
3515 entry->edx |= SVM_FEATURE_NPT;
3363 3516
3364 break; 3517 break;
3365 } 3518 }
@@ -3497,6 +3650,7 @@ static struct kvm_x86_ops svm_x86_ops = {
3497 .set_irq = svm_set_irq, 3650 .set_irq = svm_set_irq,
3498 .set_nmi = svm_inject_nmi, 3651 .set_nmi = svm_inject_nmi,
3499 .queue_exception = svm_queue_exception, 3652 .queue_exception = svm_queue_exception,
3653 .cancel_injection = svm_cancel_injection,
3500 .interrupt_allowed = svm_interrupt_allowed, 3654 .interrupt_allowed = svm_interrupt_allowed,
3501 .nmi_allowed = svm_nmi_allowed, 3655 .nmi_allowed = svm_nmi_allowed,
3502 .get_nmi_mask = svm_get_nmi_mask, 3656 .get_nmi_mask = svm_get_nmi_mask,
@@ -3519,6 +3673,11 @@ static struct kvm_x86_ops svm_x86_ops = {
3519 .set_supported_cpuid = svm_set_supported_cpuid, 3673 .set_supported_cpuid = svm_set_supported_cpuid,
3520 3674
3521 .has_wbinvd_exit = svm_has_wbinvd_exit, 3675 .has_wbinvd_exit = svm_has_wbinvd_exit,
3676
3677 .write_tsc_offset = svm_write_tsc_offset,
3678 .adjust_tsc_offset = svm_adjust_tsc_offset,
3679
3680 .set_tdp_cr3 = set_tdp_cr3,
3522}; 3681};
3523 3682
3524static int __init svm_init(void) 3683static int __init svm_init(void)
diff --git a/arch/x86/kvm/timer.c b/arch/x86/kvm/timer.c
index e16a0dbe74d8..fc7a101c4a35 100644
--- a/arch/x86/kvm/timer.c
+++ b/arch/x86/kvm/timer.c
@@ -6,7 +6,7 @@
6 * 6 *
7 * timer support 7 * timer support
8 * 8 *
9 * Copyright 2010 Red Hat, Inc. and/or its affilates. 9 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
10 * 10 *
11 * This work is licensed under the terms of the GNU GPL, version 2. See 11 * This work is licensed under the terms of the GNU GPL, version 2. See
12 * the COPYING file in the top-level directory. 12 * the COPYING file in the top-level directory.
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 7bddfab12013..8da0e45ff7c9 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -5,7 +5,7 @@
5 * machines without emulation or binary translation. 5 * machines without emulation or binary translation.
6 * 6 *
7 * Copyright (C) 2006 Qumranet, Inc. 7 * Copyright (C) 2006 Qumranet, Inc.
8 * Copyright 2010 Red Hat, Inc. and/or its affilates. 8 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
9 * 9 *
10 * Authors: 10 * Authors:
11 * Avi Kivity <avi@qumranet.com> 11 * Avi Kivity <avi@qumranet.com>
@@ -125,6 +125,7 @@ struct vcpu_vmx {
125 unsigned long host_rsp; 125 unsigned long host_rsp;
126 int launched; 126 int launched;
127 u8 fail; 127 u8 fail;
128 u32 exit_intr_info;
128 u32 idt_vectoring_info; 129 u32 idt_vectoring_info;
129 struct shared_msr_entry *guest_msrs; 130 struct shared_msr_entry *guest_msrs;
130 int nmsrs; 131 int nmsrs;
@@ -154,11 +155,6 @@ struct vcpu_vmx {
154 u32 limit; 155 u32 limit;
155 u32 ar; 156 u32 ar;
156 } tr, es, ds, fs, gs; 157 } tr, es, ds, fs, gs;
157 struct {
158 bool pending;
159 u8 vector;
160 unsigned rip;
161 } irq;
162 } rmode; 158 } rmode;
163 int vpid; 159 int vpid;
164 bool emulation_required; 160 bool emulation_required;
@@ -505,7 +501,6 @@ static void __vcpu_clear(void *arg)
505 vmcs_clear(vmx->vmcs); 501 vmcs_clear(vmx->vmcs);
506 if (per_cpu(current_vmcs, cpu) == vmx->vmcs) 502 if (per_cpu(current_vmcs, cpu) == vmx->vmcs)
507 per_cpu(current_vmcs, cpu) = NULL; 503 per_cpu(current_vmcs, cpu) = NULL;
508 rdtscll(vmx->vcpu.arch.host_tsc);
509 list_del(&vmx->local_vcpus_link); 504 list_del(&vmx->local_vcpus_link);
510 vmx->vcpu.cpu = -1; 505 vmx->vcpu.cpu = -1;
511 vmx->launched = 0; 506 vmx->launched = 0;
@@ -706,11 +701,10 @@ static void reload_tss(void)
706 /* 701 /*
707 * VT restores TR but not its size. Useless. 702 * VT restores TR but not its size. Useless.
708 */ 703 */
709 struct desc_ptr gdt; 704 struct desc_ptr *gdt = &__get_cpu_var(host_gdt);
710 struct desc_struct *descs; 705 struct desc_struct *descs;
711 706
712 native_store_gdt(&gdt); 707 descs = (void *)gdt->address;
713 descs = (void *)gdt.address;
714 descs[GDT_ENTRY_TSS].type = 9; /* available TSS */ 708 descs[GDT_ENTRY_TSS].type = 9; /* available TSS */
715 load_TR_desc(); 709 load_TR_desc();
716} 710}
@@ -753,7 +747,7 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
753 747
754static unsigned long segment_base(u16 selector) 748static unsigned long segment_base(u16 selector)
755{ 749{
756 struct desc_ptr gdt; 750 struct desc_ptr *gdt = &__get_cpu_var(host_gdt);
757 struct desc_struct *d; 751 struct desc_struct *d;
758 unsigned long table_base; 752 unsigned long table_base;
759 unsigned long v; 753 unsigned long v;
@@ -761,8 +755,7 @@ static unsigned long segment_base(u16 selector)
761 if (!(selector & ~3)) 755 if (!(selector & ~3))
762 return 0; 756 return 0;
763 757
764 native_store_gdt(&gdt); 758 table_base = gdt->address;
765 table_base = gdt.address;
766 759
767 if (selector & 4) { /* from ldt */ 760 if (selector & 4) { /* from ldt */
768 u16 ldt_selector = kvm_read_ldt(); 761 u16 ldt_selector = kvm_read_ldt();
@@ -883,7 +876,6 @@ static void vmx_load_host_state(struct vcpu_vmx *vmx)
883static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) 876static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
884{ 877{
885 struct vcpu_vmx *vmx = to_vmx(vcpu); 878 struct vcpu_vmx *vmx = to_vmx(vcpu);
886 u64 tsc_this, delta, new_offset;
887 u64 phys_addr = __pa(per_cpu(vmxarea, cpu)); 879 u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
888 880
889 if (!vmm_exclusive) 881 if (!vmm_exclusive)
@@ -897,37 +889,24 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
897 } 889 }
898 890
899 if (vcpu->cpu != cpu) { 891 if (vcpu->cpu != cpu) {
900 struct desc_ptr dt; 892 struct desc_ptr *gdt = &__get_cpu_var(host_gdt);
901 unsigned long sysenter_esp; 893 unsigned long sysenter_esp;
902 894
903 kvm_migrate_timers(vcpu);
904 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); 895 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
905 local_irq_disable(); 896 local_irq_disable();
906 list_add(&vmx->local_vcpus_link, 897 list_add(&vmx->local_vcpus_link,
907 &per_cpu(vcpus_on_cpu, cpu)); 898 &per_cpu(vcpus_on_cpu, cpu));
908 local_irq_enable(); 899 local_irq_enable();
909 900
910 vcpu->cpu = cpu;
911 /* 901 /*
912 * Linux uses per-cpu TSS and GDT, so set these when switching 902 * Linux uses per-cpu TSS and GDT, so set these when switching
913 * processors. 903 * processors.
914 */ 904 */
915 vmcs_writel(HOST_TR_BASE, kvm_read_tr_base()); /* 22.2.4 */ 905 vmcs_writel(HOST_TR_BASE, kvm_read_tr_base()); /* 22.2.4 */
916 native_store_gdt(&dt); 906 vmcs_writel(HOST_GDTR_BASE, gdt->address); /* 22.2.4 */
917 vmcs_writel(HOST_GDTR_BASE, dt.address); /* 22.2.4 */
918 907
919 rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp); 908 rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
920 vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */ 909 vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
921
922 /*
923 * Make sure the time stamp counter is monotonous.
924 */
925 rdtscll(tsc_this);
926 if (tsc_this < vcpu->arch.host_tsc) {
927 delta = vcpu->arch.host_tsc - tsc_this;
928 new_offset = vmcs_read64(TSC_OFFSET) + delta;
929 vmcs_write64(TSC_OFFSET, new_offset);
930 }
931 } 910 }
932} 911}
933 912
@@ -1044,16 +1023,8 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
1044 } 1023 }
1045 1024
1046 if (vmx->rmode.vm86_active) { 1025 if (vmx->rmode.vm86_active) {
1047 vmx->rmode.irq.pending = true; 1026 if (kvm_inject_realmode_interrupt(vcpu, nr) != EMULATE_DONE)
1048 vmx->rmode.irq.vector = nr; 1027 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
1049 vmx->rmode.irq.rip = kvm_rip_read(vcpu);
1050 if (kvm_exception_is_soft(nr))
1051 vmx->rmode.irq.rip +=
1052 vmx->vcpu.arch.event_exit_inst_len;
1053 intr_info |= INTR_TYPE_SOFT_INTR;
1054 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
1055 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1);
1056 kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1);
1057 return; 1028 return;
1058 } 1029 }
1059 1030
@@ -1149,12 +1120,17 @@ static u64 guest_read_tsc(void)
1149} 1120}
1150 1121
1151/* 1122/*
1152 * writes 'guest_tsc' into guest's timestamp counter "register" 1123 * writes 'offset' into guest's timestamp counter offset register
1153 * guest_tsc = host_tsc + tsc_offset ==> tsc_offset = guest_tsc - host_tsc
1154 */ 1124 */
1155static void guest_write_tsc(u64 guest_tsc, u64 host_tsc) 1125static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
1126{
1127 vmcs_write64(TSC_OFFSET, offset);
1128}
1129
1130static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment)
1156{ 1131{
1157 vmcs_write64(TSC_OFFSET, guest_tsc - host_tsc); 1132 u64 offset = vmcs_read64(TSC_OFFSET);
1133 vmcs_write64(TSC_OFFSET, offset + adjustment);
1158} 1134}
1159 1135
1160/* 1136/*
@@ -1227,7 +1203,6 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
1227{ 1203{
1228 struct vcpu_vmx *vmx = to_vmx(vcpu); 1204 struct vcpu_vmx *vmx = to_vmx(vcpu);
1229 struct shared_msr_entry *msr; 1205 struct shared_msr_entry *msr;
1230 u64 host_tsc;
1231 int ret = 0; 1206 int ret = 0;
1232 1207
1233 switch (msr_index) { 1208 switch (msr_index) {
@@ -1257,8 +1232,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
1257 vmcs_writel(GUEST_SYSENTER_ESP, data); 1232 vmcs_writel(GUEST_SYSENTER_ESP, data);
1258 break; 1233 break;
1259 case MSR_IA32_TSC: 1234 case MSR_IA32_TSC:
1260 rdtscll(host_tsc); 1235 kvm_write_tsc(vcpu, data);
1261 guest_write_tsc(data, host_tsc);
1262 break; 1236 break;
1263 case MSR_IA32_CR_PAT: 1237 case MSR_IA32_CR_PAT:
1264 if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { 1238 if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
@@ -1856,20 +1830,20 @@ static void ept_load_pdptrs(struct kvm_vcpu *vcpu)
1856 return; 1830 return;
1857 1831
1858 if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) { 1832 if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) {
1859 vmcs_write64(GUEST_PDPTR0, vcpu->arch.pdptrs[0]); 1833 vmcs_write64(GUEST_PDPTR0, vcpu->arch.mmu.pdptrs[0]);
1860 vmcs_write64(GUEST_PDPTR1, vcpu->arch.pdptrs[1]); 1834 vmcs_write64(GUEST_PDPTR1, vcpu->arch.mmu.pdptrs[1]);
1861 vmcs_write64(GUEST_PDPTR2, vcpu->arch.pdptrs[2]); 1835 vmcs_write64(GUEST_PDPTR2, vcpu->arch.mmu.pdptrs[2]);
1862 vmcs_write64(GUEST_PDPTR3, vcpu->arch.pdptrs[3]); 1836 vmcs_write64(GUEST_PDPTR3, vcpu->arch.mmu.pdptrs[3]);
1863 } 1837 }
1864} 1838}
1865 1839
1866static void ept_save_pdptrs(struct kvm_vcpu *vcpu) 1840static void ept_save_pdptrs(struct kvm_vcpu *vcpu)
1867{ 1841{
1868 if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) { 1842 if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) {
1869 vcpu->arch.pdptrs[0] = vmcs_read64(GUEST_PDPTR0); 1843 vcpu->arch.mmu.pdptrs[0] = vmcs_read64(GUEST_PDPTR0);
1870 vcpu->arch.pdptrs[1] = vmcs_read64(GUEST_PDPTR1); 1844 vcpu->arch.mmu.pdptrs[1] = vmcs_read64(GUEST_PDPTR1);
1871 vcpu->arch.pdptrs[2] = vmcs_read64(GUEST_PDPTR2); 1845 vcpu->arch.mmu.pdptrs[2] = vmcs_read64(GUEST_PDPTR2);
1872 vcpu->arch.pdptrs[3] = vmcs_read64(GUEST_PDPTR3); 1846 vcpu->arch.mmu.pdptrs[3] = vmcs_read64(GUEST_PDPTR3);
1873 } 1847 }
1874 1848
1875 __set_bit(VCPU_EXREG_PDPTR, 1849 __set_bit(VCPU_EXREG_PDPTR,
@@ -2515,7 +2489,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
2515{ 2489{
2516 u32 host_sysenter_cs, msr_low, msr_high; 2490 u32 host_sysenter_cs, msr_low, msr_high;
2517 u32 junk; 2491 u32 junk;
2518 u64 host_pat, tsc_this, tsc_base; 2492 u64 host_pat;
2519 unsigned long a; 2493 unsigned long a;
2520 struct desc_ptr dt; 2494 struct desc_ptr dt;
2521 int i; 2495 int i;
@@ -2656,12 +2630,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
2656 vmx->vcpu.arch.cr4_guest_owned_bits |= X86_CR4_PGE; 2630 vmx->vcpu.arch.cr4_guest_owned_bits |= X86_CR4_PGE;
2657 vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits); 2631 vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits);
2658 2632
2659 tsc_base = vmx->vcpu.kvm->arch.vm_init_tsc; 2633 kvm_write_tsc(&vmx->vcpu, 0);
2660 rdtscll(tsc_this);
2661 if (tsc_this < vmx->vcpu.kvm->arch.vm_init_tsc)
2662 tsc_base = tsc_this;
2663
2664 guest_write_tsc(0, tsc_base);
2665 2634
2666 return 0; 2635 return 0;
2667} 2636}
@@ -2834,16 +2803,8 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu)
2834 2803
2835 ++vcpu->stat.irq_injections; 2804 ++vcpu->stat.irq_injections;
2836 if (vmx->rmode.vm86_active) { 2805 if (vmx->rmode.vm86_active) {
2837 vmx->rmode.irq.pending = true; 2806 if (kvm_inject_realmode_interrupt(vcpu, irq) != EMULATE_DONE)
2838 vmx->rmode.irq.vector = irq; 2807 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
2839 vmx->rmode.irq.rip = kvm_rip_read(vcpu);
2840 if (vcpu->arch.interrupt.soft)
2841 vmx->rmode.irq.rip +=
2842 vmx->vcpu.arch.event_exit_inst_len;
2843 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
2844 irq | INTR_TYPE_SOFT_INTR | INTR_INFO_VALID_MASK);
2845 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1);
2846 kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1);
2847 return; 2808 return;
2848 } 2809 }
2849 intr = irq | INTR_INFO_VALID_MASK; 2810 intr = irq | INTR_INFO_VALID_MASK;
@@ -2875,14 +2836,8 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
2875 2836
2876 ++vcpu->stat.nmi_injections; 2837 ++vcpu->stat.nmi_injections;
2877 if (vmx->rmode.vm86_active) { 2838 if (vmx->rmode.vm86_active) {
2878 vmx->rmode.irq.pending = true; 2839 if (kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR) != EMULATE_DONE)
2879 vmx->rmode.irq.vector = NMI_VECTOR; 2840 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
2880 vmx->rmode.irq.rip = kvm_rip_read(vcpu);
2881 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
2882 NMI_VECTOR | INTR_TYPE_SOFT_INTR |
2883 INTR_INFO_VALID_MASK);
2884 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1);
2885 kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1);
2886 return; 2841 return;
2887 } 2842 }
2888 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 2843 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
@@ -3346,6 +3301,7 @@ static int handle_wrmsr(struct kvm_vcpu *vcpu)
3346 3301
3347static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu) 3302static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu)
3348{ 3303{
3304 kvm_make_request(KVM_REQ_EVENT, vcpu);
3349 return 1; 3305 return 1;
3350} 3306}
3351 3307
@@ -3358,6 +3314,8 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu)
3358 cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING; 3314 cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
3359 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); 3315 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
3360 3316
3317 kvm_make_request(KVM_REQ_EVENT, vcpu);
3318
3361 ++vcpu->stat.irq_window_exits; 3319 ++vcpu->stat.irq_window_exits;
3362 3320
3363 /* 3321 /*
@@ -3614,6 +3572,7 @@ static int handle_nmi_window(struct kvm_vcpu *vcpu)
3614 cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING; 3572 cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING;
3615 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); 3573 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
3616 ++vcpu->stat.nmi_window_exits; 3574 ++vcpu->stat.nmi_window_exits;
3575 kvm_make_request(KVM_REQ_EVENT, vcpu);
3617 3576
3618 return 1; 3577 return 1;
3619} 3578}
@@ -3623,8 +3582,17 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
3623 struct vcpu_vmx *vmx = to_vmx(vcpu); 3582 struct vcpu_vmx *vmx = to_vmx(vcpu);
3624 enum emulation_result err = EMULATE_DONE; 3583 enum emulation_result err = EMULATE_DONE;
3625 int ret = 1; 3584 int ret = 1;
3585 u32 cpu_exec_ctrl;
3586 bool intr_window_requested;
3587
3588 cpu_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
3589 intr_window_requested = cpu_exec_ctrl & CPU_BASED_VIRTUAL_INTR_PENDING;
3626 3590
3627 while (!guest_state_valid(vcpu)) { 3591 while (!guest_state_valid(vcpu)) {
3592 if (intr_window_requested
3593 && (kvm_get_rflags(&vmx->vcpu) & X86_EFLAGS_IF))
3594 return handle_interrupt_window(&vmx->vcpu);
3595
3628 err = emulate_instruction(vcpu, 0, 0, 0); 3596 err = emulate_instruction(vcpu, 0, 0, 0);
3629 3597
3630 if (err == EMULATE_DO_MMIO) { 3598 if (err == EMULATE_DO_MMIO) {
@@ -3790,18 +3758,9 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
3790 vmcs_write32(TPR_THRESHOLD, irr); 3758 vmcs_write32(TPR_THRESHOLD, irr);
3791} 3759}
3792 3760
3793static void vmx_complete_interrupts(struct vcpu_vmx *vmx) 3761static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx)
3794{ 3762{
3795 u32 exit_intr_info; 3763 u32 exit_intr_info = vmx->exit_intr_info;
3796 u32 idt_vectoring_info = vmx->idt_vectoring_info;
3797 bool unblock_nmi;
3798 u8 vector;
3799 int type;
3800 bool idtv_info_valid;
3801
3802 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
3803
3804 vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
3805 3764
3806 /* Handle machine checks before interrupts are enabled */ 3765 /* Handle machine checks before interrupts are enabled */
3807 if ((vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY) 3766 if ((vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY)
@@ -3816,8 +3775,16 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
3816 asm("int $2"); 3775 asm("int $2");
3817 kvm_after_handle_nmi(&vmx->vcpu); 3776 kvm_after_handle_nmi(&vmx->vcpu);
3818 } 3777 }
3778}
3819 3779
3820 idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK; 3780static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx)
3781{
3782 u32 exit_intr_info = vmx->exit_intr_info;
3783 bool unblock_nmi;
3784 u8 vector;
3785 bool idtv_info_valid;
3786
3787 idtv_info_valid = vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK;
3821 3788
3822 if (cpu_has_virtual_nmis()) { 3789 if (cpu_has_virtual_nmis()) {
3823 unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0; 3790 unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0;
@@ -3839,6 +3806,18 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
3839 } else if (unlikely(vmx->soft_vnmi_blocked)) 3806 } else if (unlikely(vmx->soft_vnmi_blocked))
3840 vmx->vnmi_blocked_time += 3807 vmx->vnmi_blocked_time +=
3841 ktime_to_ns(ktime_sub(ktime_get(), vmx->entry_time)); 3808 ktime_to_ns(ktime_sub(ktime_get(), vmx->entry_time));
3809}
3810
3811static void __vmx_complete_interrupts(struct vcpu_vmx *vmx,
3812 u32 idt_vectoring_info,
3813 int instr_len_field,
3814 int error_code_field)
3815{
3816 u8 vector;
3817 int type;
3818 bool idtv_info_valid;
3819
3820 idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK;
3842 3821
3843 vmx->vcpu.arch.nmi_injected = false; 3822 vmx->vcpu.arch.nmi_injected = false;
3844 kvm_clear_exception_queue(&vmx->vcpu); 3823 kvm_clear_exception_queue(&vmx->vcpu);
@@ -3847,6 +3826,8 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
3847 if (!idtv_info_valid) 3826 if (!idtv_info_valid)
3848 return; 3827 return;
3849 3828
3829 kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu);
3830
3850 vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK; 3831 vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK;
3851 type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK; 3832 type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK;
3852 3833
@@ -3863,18 +3844,18 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
3863 break; 3844 break;
3864 case INTR_TYPE_SOFT_EXCEPTION: 3845 case INTR_TYPE_SOFT_EXCEPTION:
3865 vmx->vcpu.arch.event_exit_inst_len = 3846 vmx->vcpu.arch.event_exit_inst_len =
3866 vmcs_read32(VM_EXIT_INSTRUCTION_LEN); 3847 vmcs_read32(instr_len_field);
3867 /* fall through */ 3848 /* fall through */
3868 case INTR_TYPE_HARD_EXCEPTION: 3849 case INTR_TYPE_HARD_EXCEPTION:
3869 if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) { 3850 if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) {
3870 u32 err = vmcs_read32(IDT_VECTORING_ERROR_CODE); 3851 u32 err = vmcs_read32(error_code_field);
3871 kvm_queue_exception_e(&vmx->vcpu, vector, err); 3852 kvm_queue_exception_e(&vmx->vcpu, vector, err);
3872 } else 3853 } else
3873 kvm_queue_exception(&vmx->vcpu, vector); 3854 kvm_queue_exception(&vmx->vcpu, vector);
3874 break; 3855 break;
3875 case INTR_TYPE_SOFT_INTR: 3856 case INTR_TYPE_SOFT_INTR:
3876 vmx->vcpu.arch.event_exit_inst_len = 3857 vmx->vcpu.arch.event_exit_inst_len =
3877 vmcs_read32(VM_EXIT_INSTRUCTION_LEN); 3858 vmcs_read32(instr_len_field);
3878 /* fall through */ 3859 /* fall through */
3879 case INTR_TYPE_EXT_INTR: 3860 case INTR_TYPE_EXT_INTR:
3880 kvm_queue_interrupt(&vmx->vcpu, vector, 3861 kvm_queue_interrupt(&vmx->vcpu, vector,
@@ -3885,27 +3866,21 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
3885 } 3866 }
3886} 3867}
3887 3868
3888/* 3869static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
3889 * Failure to inject an interrupt should give us the information
3890 * in IDT_VECTORING_INFO_FIELD. However, if the failure occurs
3891 * when fetching the interrupt redirection bitmap in the real-mode
3892 * tss, this doesn't happen. So we do it ourselves.
3893 */
3894static void fixup_rmode_irq(struct vcpu_vmx *vmx)
3895{ 3870{
3896 vmx->rmode.irq.pending = 0; 3871 __vmx_complete_interrupts(vmx, vmx->idt_vectoring_info,
3897 if (kvm_rip_read(&vmx->vcpu) + 1 != vmx->rmode.irq.rip) 3872 VM_EXIT_INSTRUCTION_LEN,
3898 return; 3873 IDT_VECTORING_ERROR_CODE);
3899 kvm_rip_write(&vmx->vcpu, vmx->rmode.irq.rip); 3874}
3900 if (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK) { 3875
3901 vmx->idt_vectoring_info &= ~VECTORING_INFO_TYPE_MASK; 3876static void vmx_cancel_injection(struct kvm_vcpu *vcpu)
3902 vmx->idt_vectoring_info |= INTR_TYPE_EXT_INTR; 3877{
3903 return; 3878 __vmx_complete_interrupts(to_vmx(vcpu),
3904 } 3879 vmcs_read32(VM_ENTRY_INTR_INFO_FIELD),
3905 vmx->idt_vectoring_info = 3880 VM_ENTRY_INSTRUCTION_LEN,
3906 VECTORING_INFO_VALID_MASK 3881 VM_ENTRY_EXCEPTION_ERROR_CODE);
3907 | INTR_TYPE_EXT_INTR 3882
3908 | vmx->rmode.irq.vector; 3883 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);
3909} 3884}
3910 3885
3911#ifdef CONFIG_X86_64 3886#ifdef CONFIG_X86_64
@@ -4032,7 +4007,7 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
4032#endif 4007#endif
4033 [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2)) 4008 [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2))
4034 : "cc", "memory" 4009 : "cc", "memory"
4035 , R"bx", R"di", R"si" 4010 , R"ax", R"bx", R"di", R"si"
4036#ifdef CONFIG_X86_64 4011#ifdef CONFIG_X86_64
4037 , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15" 4012 , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"
4038#endif 4013#endif
@@ -4043,12 +4018,15 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
4043 vcpu->arch.regs_dirty = 0; 4018 vcpu->arch.regs_dirty = 0;
4044 4019
4045 vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); 4020 vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
4046 if (vmx->rmode.irq.pending)
4047 fixup_rmode_irq(vmx);
4048 4021
4049 asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS)); 4022 asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS));
4050 vmx->launched = 1; 4023 vmx->launched = 1;
4051 4024
4025 vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
4026 vmx->exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
4027
4028 vmx_complete_atomic_exit(vmx);
4029 vmx_recover_nmi_blocking(vmx);
4052 vmx_complete_interrupts(vmx); 4030 vmx_complete_interrupts(vmx);
4053} 4031}
4054 4032
@@ -4119,6 +4097,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
4119 4097
4120 cpu = get_cpu(); 4098 cpu = get_cpu();
4121 vmx_vcpu_load(&vmx->vcpu, cpu); 4099 vmx_vcpu_load(&vmx->vcpu, cpu);
4100 vmx->vcpu.cpu = cpu;
4122 err = vmx_vcpu_setup(vmx); 4101 err = vmx_vcpu_setup(vmx);
4123 vmx_vcpu_put(&vmx->vcpu); 4102 vmx_vcpu_put(&vmx->vcpu);
4124 put_cpu(); 4103 put_cpu();
@@ -4334,6 +4313,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
4334 .set_irq = vmx_inject_irq, 4313 .set_irq = vmx_inject_irq,
4335 .set_nmi = vmx_inject_nmi, 4314 .set_nmi = vmx_inject_nmi,
4336 .queue_exception = vmx_queue_exception, 4315 .queue_exception = vmx_queue_exception,
4316 .cancel_injection = vmx_cancel_injection,
4337 .interrupt_allowed = vmx_interrupt_allowed, 4317 .interrupt_allowed = vmx_interrupt_allowed,
4338 .nmi_allowed = vmx_nmi_allowed, 4318 .nmi_allowed = vmx_nmi_allowed,
4339 .get_nmi_mask = vmx_get_nmi_mask, 4319 .get_nmi_mask = vmx_get_nmi_mask,
@@ -4356,6 +4336,11 @@ static struct kvm_x86_ops vmx_x86_ops = {
4356 .set_supported_cpuid = vmx_set_supported_cpuid, 4336 .set_supported_cpuid = vmx_set_supported_cpuid,
4357 4337
4358 .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit, 4338 .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
4339
4340 .write_tsc_offset = vmx_write_tsc_offset,
4341 .adjust_tsc_offset = vmx_adjust_tsc_offset,
4342
4343 .set_tdp_cr3 = vmx_set_cr3,
4359}; 4344};
4360 4345
4361static int __init vmx_init(void) 4346static int __init vmx_init(void)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 6c2ecf0a806d..2288ad829b32 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -6,7 +6,7 @@
6 * Copyright (C) 2006 Qumranet, Inc. 6 * Copyright (C) 2006 Qumranet, Inc.
7 * Copyright (C) 2008 Qumranet, Inc. 7 * Copyright (C) 2008 Qumranet, Inc.
8 * Copyright IBM Corporation, 2008 8 * Copyright IBM Corporation, 2008
9 * Copyright 2010 Red Hat, Inc. and/or its affilates. 9 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
10 * 10 *
11 * Authors: 11 * Authors:
12 * Avi Kivity <avi@qumranet.com> 12 * Avi Kivity <avi@qumranet.com>
@@ -55,6 +55,8 @@
55#include <asm/mce.h> 55#include <asm/mce.h>
56#include <asm/i387.h> 56#include <asm/i387.h>
57#include <asm/xcr.h> 57#include <asm/xcr.h>
58#include <asm/pvclock.h>
59#include <asm/div64.h>
58 60
59#define MAX_IO_MSRS 256 61#define MAX_IO_MSRS 256
60#define CR0_RESERVED_BITS \ 62#define CR0_RESERVED_BITS \
@@ -71,7 +73,7 @@
71#define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR) 73#define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
72 74
73#define KVM_MAX_MCE_BANKS 32 75#define KVM_MAX_MCE_BANKS 32
74#define KVM_MCE_CAP_SUPPORTED MCG_CTL_P 76#define KVM_MCE_CAP_SUPPORTED (MCG_CTL_P | MCG_SER_P)
75 77
76/* EFER defaults: 78/* EFER defaults:
77 * - enable syscall per default because its emulated by KVM 79 * - enable syscall per default because its emulated by KVM
@@ -282,6 +284,8 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
282 u32 prev_nr; 284 u32 prev_nr;
283 int class1, class2; 285 int class1, class2;
284 286
287 kvm_make_request(KVM_REQ_EVENT, vcpu);
288
285 if (!vcpu->arch.exception.pending) { 289 if (!vcpu->arch.exception.pending) {
286 queue: 290 queue:
287 vcpu->arch.exception.pending = true; 291 vcpu->arch.exception.pending = true;
@@ -327,16 +331,28 @@ void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr)
327} 331}
328EXPORT_SYMBOL_GPL(kvm_requeue_exception); 332EXPORT_SYMBOL_GPL(kvm_requeue_exception);
329 333
330void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr, 334void kvm_inject_page_fault(struct kvm_vcpu *vcpu)
331 u32 error_code)
332{ 335{
336 unsigned error_code = vcpu->arch.fault.error_code;
337
333 ++vcpu->stat.pf_guest; 338 ++vcpu->stat.pf_guest;
334 vcpu->arch.cr2 = addr; 339 vcpu->arch.cr2 = vcpu->arch.fault.address;
335 kvm_queue_exception_e(vcpu, PF_VECTOR, error_code); 340 kvm_queue_exception_e(vcpu, PF_VECTOR, error_code);
336} 341}
337 342
343void kvm_propagate_fault(struct kvm_vcpu *vcpu)
344{
345 if (mmu_is_nested(vcpu) && !vcpu->arch.fault.nested)
346 vcpu->arch.nested_mmu.inject_page_fault(vcpu);
347 else
348 vcpu->arch.mmu.inject_page_fault(vcpu);
349
350 vcpu->arch.fault.nested = false;
351}
352
338void kvm_inject_nmi(struct kvm_vcpu *vcpu) 353void kvm_inject_nmi(struct kvm_vcpu *vcpu)
339{ 354{
355 kvm_make_request(KVM_REQ_EVENT, vcpu);
340 vcpu->arch.nmi_pending = 1; 356 vcpu->arch.nmi_pending = 1;
341} 357}
342EXPORT_SYMBOL_GPL(kvm_inject_nmi); 358EXPORT_SYMBOL_GPL(kvm_inject_nmi);
@@ -367,18 +383,49 @@ bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl)
367EXPORT_SYMBOL_GPL(kvm_require_cpl); 383EXPORT_SYMBOL_GPL(kvm_require_cpl);
368 384
369/* 385/*
386 * This function will be used to read from the physical memory of the currently
387 * running guest. The difference to kvm_read_guest_page is that this function
388 * can read from guest physical or from the guest's guest physical memory.
389 */
390int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
391 gfn_t ngfn, void *data, int offset, int len,
392 u32 access)
393{
394 gfn_t real_gfn;
395 gpa_t ngpa;
396
397 ngpa = gfn_to_gpa(ngfn);
398 real_gfn = mmu->translate_gpa(vcpu, ngpa, access);
399 if (real_gfn == UNMAPPED_GVA)
400 return -EFAULT;
401
402 real_gfn = gpa_to_gfn(real_gfn);
403
404 return kvm_read_guest_page(vcpu->kvm, real_gfn, data, offset, len);
405}
406EXPORT_SYMBOL_GPL(kvm_read_guest_page_mmu);
407
408int kvm_read_nested_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn,
409 void *data, int offset, int len, u32 access)
410{
411 return kvm_read_guest_page_mmu(vcpu, vcpu->arch.walk_mmu, gfn,
412 data, offset, len, access);
413}
414
415/*
370 * Load the pae pdptrs. Return true is they are all valid. 416 * Load the pae pdptrs. Return true is they are all valid.
371 */ 417 */
372int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3) 418int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3)
373{ 419{
374 gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT; 420 gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
375 unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2; 421 unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2;
376 int i; 422 int i;
377 int ret; 423 int ret;
378 u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)]; 424 u64 pdpte[ARRAY_SIZE(mmu->pdptrs)];
379 425
380 ret = kvm_read_guest_page(vcpu->kvm, pdpt_gfn, pdpte, 426 ret = kvm_read_guest_page_mmu(vcpu, mmu, pdpt_gfn, pdpte,
381 offset * sizeof(u64), sizeof(pdpte)); 427 offset * sizeof(u64), sizeof(pdpte),
428 PFERR_USER_MASK|PFERR_WRITE_MASK);
382 if (ret < 0) { 429 if (ret < 0) {
383 ret = 0; 430 ret = 0;
384 goto out; 431 goto out;
@@ -392,7 +439,7 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
392 } 439 }
393 ret = 1; 440 ret = 1;
394 441
395 memcpy(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs)); 442 memcpy(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs));
396 __set_bit(VCPU_EXREG_PDPTR, 443 __set_bit(VCPU_EXREG_PDPTR,
397 (unsigned long *)&vcpu->arch.regs_avail); 444 (unsigned long *)&vcpu->arch.regs_avail);
398 __set_bit(VCPU_EXREG_PDPTR, 445 __set_bit(VCPU_EXREG_PDPTR,
@@ -405,8 +452,10 @@ EXPORT_SYMBOL_GPL(load_pdptrs);
405 452
406static bool pdptrs_changed(struct kvm_vcpu *vcpu) 453static bool pdptrs_changed(struct kvm_vcpu *vcpu)
407{ 454{
408 u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)]; 455 u64 pdpte[ARRAY_SIZE(vcpu->arch.walk_mmu->pdptrs)];
409 bool changed = true; 456 bool changed = true;
457 int offset;
458 gfn_t gfn;
410 int r; 459 int r;
411 460
412 if (is_long_mode(vcpu) || !is_pae(vcpu)) 461 if (is_long_mode(vcpu) || !is_pae(vcpu))
@@ -416,10 +465,13 @@ static bool pdptrs_changed(struct kvm_vcpu *vcpu)
416 (unsigned long *)&vcpu->arch.regs_avail)) 465 (unsigned long *)&vcpu->arch.regs_avail))
417 return true; 466 return true;
418 467
419 r = kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u, pdpte, sizeof(pdpte)); 468 gfn = (vcpu->arch.cr3 & ~31u) >> PAGE_SHIFT;
469 offset = (vcpu->arch.cr3 & ~31u) & (PAGE_SIZE - 1);
470 r = kvm_read_nested_guest_page(vcpu, gfn, pdpte, offset, sizeof(pdpte),
471 PFERR_USER_MASK | PFERR_WRITE_MASK);
420 if (r < 0) 472 if (r < 0)
421 goto out; 473 goto out;
422 changed = memcmp(pdpte, vcpu->arch.pdptrs, sizeof(pdpte)) != 0; 474 changed = memcmp(pdpte, vcpu->arch.walk_mmu->pdptrs, sizeof(pdpte)) != 0;
423out: 475out:
424 476
425 return changed; 477 return changed;
@@ -458,7 +510,8 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
458 return 1; 510 return 1;
459 } else 511 } else
460#endif 512#endif
461 if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3)) 513 if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.walk_mmu,
514 vcpu->arch.cr3))
462 return 1; 515 return 1;
463 } 516 }
464 517
@@ -547,7 +600,7 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
547 return 1; 600 return 1;
548 } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE) 601 } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE)
549 && ((cr4 ^ old_cr4) & pdptr_bits) 602 && ((cr4 ^ old_cr4) & pdptr_bits)
550 && !load_pdptrs(vcpu, vcpu->arch.cr3)) 603 && !load_pdptrs(vcpu, vcpu->arch.walk_mmu, vcpu->arch.cr3))
551 return 1; 604 return 1;
552 605
553 if (cr4 & X86_CR4_VMXE) 606 if (cr4 & X86_CR4_VMXE)
@@ -580,7 +633,8 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
580 if (is_pae(vcpu)) { 633 if (is_pae(vcpu)) {
581 if (cr3 & CR3_PAE_RESERVED_BITS) 634 if (cr3 & CR3_PAE_RESERVED_BITS)
582 return 1; 635 return 1;
583 if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) 636 if (is_paging(vcpu) &&
637 !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))
584 return 1; 638 return 1;
585 } 639 }
586 /* 640 /*
@@ -737,7 +791,7 @@ static u32 msrs_to_save[] = {
737#ifdef CONFIG_X86_64 791#ifdef CONFIG_X86_64
738 MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, 792 MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
739#endif 793#endif
740 MSR_IA32_TSC, MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA 794 MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA
741}; 795};
742 796
743static unsigned num_msrs_to_save; 797static unsigned num_msrs_to_save;
@@ -838,7 +892,7 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
838 892
839 /* 893 /*
840 * The guest calculates current wall clock time by adding 894 * The guest calculates current wall clock time by adding
841 * system time (updated by kvm_write_guest_time below) to the 895 * system time (updated by kvm_guest_time_update below) to the
842 * wall clock specified here. guest system time equals host 896 * wall clock specified here. guest system time equals host
843 * system time for us, thus we must fill in host boot time here. 897 * system time for us, thus we must fill in host boot time here.
844 */ 898 */
@@ -866,65 +920,229 @@ static uint32_t div_frac(uint32_t dividend, uint32_t divisor)
866 return quotient; 920 return quotient;
867} 921}
868 922
869static void kvm_set_time_scale(uint32_t tsc_khz, struct pvclock_vcpu_time_info *hv_clock) 923static void kvm_get_time_scale(uint32_t scaled_khz, uint32_t base_khz,
924 s8 *pshift, u32 *pmultiplier)
870{ 925{
871 uint64_t nsecs = 1000000000LL; 926 uint64_t scaled64;
872 int32_t shift = 0; 927 int32_t shift = 0;
873 uint64_t tps64; 928 uint64_t tps64;
874 uint32_t tps32; 929 uint32_t tps32;
875 930
876 tps64 = tsc_khz * 1000LL; 931 tps64 = base_khz * 1000LL;
877 while (tps64 > nsecs*2) { 932 scaled64 = scaled_khz * 1000LL;
933 while (tps64 > scaled64*2 || tps64 & 0xffffffff00000000ULL) {
878 tps64 >>= 1; 934 tps64 >>= 1;
879 shift--; 935 shift--;
880 } 936 }
881 937
882 tps32 = (uint32_t)tps64; 938 tps32 = (uint32_t)tps64;
883 while (tps32 <= (uint32_t)nsecs) { 939 while (tps32 <= scaled64 || scaled64 & 0xffffffff00000000ULL) {
884 tps32 <<= 1; 940 if (scaled64 & 0xffffffff00000000ULL || tps32 & 0x80000000)
941 scaled64 >>= 1;
942 else
943 tps32 <<= 1;
885 shift++; 944 shift++;
886 } 945 }
887 946
888 hv_clock->tsc_shift = shift; 947 *pshift = shift;
889 hv_clock->tsc_to_system_mul = div_frac(nsecs, tps32); 948 *pmultiplier = div_frac(scaled64, tps32);
890 949
891 pr_debug("%s: tsc_khz %u, tsc_shift %d, tsc_mul %u\n", 950 pr_debug("%s: base_khz %u => %u, shift %d, mul %u\n",
892 __func__, tsc_khz, hv_clock->tsc_shift, 951 __func__, base_khz, scaled_khz, shift, *pmultiplier);
893 hv_clock->tsc_to_system_mul); 952}
953
954static inline u64 get_kernel_ns(void)
955{
956 struct timespec ts;
957
958 WARN_ON(preemptible());
959 ktime_get_ts(&ts);
960 monotonic_to_bootbased(&ts);
961 return timespec_to_ns(&ts);
894} 962}
895 963
896static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz); 964static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);
965unsigned long max_tsc_khz;
897 966
898static void kvm_write_guest_time(struct kvm_vcpu *v) 967static inline int kvm_tsc_changes_freq(void)
968{
969 int cpu = get_cpu();
970 int ret = !boot_cpu_has(X86_FEATURE_CONSTANT_TSC) &&
971 cpufreq_quick_get(cpu) != 0;
972 put_cpu();
973 return ret;
974}
975
976static inline u64 nsec_to_cycles(u64 nsec)
977{
978 u64 ret;
979
980 WARN_ON(preemptible());
981 if (kvm_tsc_changes_freq())
982 printk_once(KERN_WARNING
983 "kvm: unreliable cycle conversion on adjustable rate TSC\n");
984 ret = nsec * __get_cpu_var(cpu_tsc_khz);
985 do_div(ret, USEC_PER_SEC);
986 return ret;
987}
988
989static void kvm_arch_set_tsc_khz(struct kvm *kvm, u32 this_tsc_khz)
990{
991 /* Compute a scale to convert nanoseconds in TSC cycles */
992 kvm_get_time_scale(this_tsc_khz, NSEC_PER_SEC / 1000,
993 &kvm->arch.virtual_tsc_shift,
994 &kvm->arch.virtual_tsc_mult);
995 kvm->arch.virtual_tsc_khz = this_tsc_khz;
996}
997
998static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns)
999{
1000 u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.last_tsc_nsec,
1001 vcpu->kvm->arch.virtual_tsc_mult,
1002 vcpu->kvm->arch.virtual_tsc_shift);
1003 tsc += vcpu->arch.last_tsc_write;
1004 return tsc;
1005}
1006
1007void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
1008{
1009 struct kvm *kvm = vcpu->kvm;
1010 u64 offset, ns, elapsed;
1011 unsigned long flags;
1012 s64 sdiff;
1013
1014 spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
1015 offset = data - native_read_tsc();
1016 ns = get_kernel_ns();
1017 elapsed = ns - kvm->arch.last_tsc_nsec;
1018 sdiff = data - kvm->arch.last_tsc_write;
1019 if (sdiff < 0)
1020 sdiff = -sdiff;
1021
1022 /*
1023 * Special case: close write to TSC within 5 seconds of
1024 * another CPU is interpreted as an attempt to synchronize
1025 * The 5 seconds is to accomodate host load / swapping as
1026 * well as any reset of TSC during the boot process.
1027 *
1028 * In that case, for a reliable TSC, we can match TSC offsets,
1029 * or make a best guest using elapsed value.
1030 */
1031 if (sdiff < nsec_to_cycles(5ULL * NSEC_PER_SEC) &&
1032 elapsed < 5ULL * NSEC_PER_SEC) {
1033 if (!check_tsc_unstable()) {
1034 offset = kvm->arch.last_tsc_offset;
1035 pr_debug("kvm: matched tsc offset for %llu\n", data);
1036 } else {
1037 u64 delta = nsec_to_cycles(elapsed);
1038 offset += delta;
1039 pr_debug("kvm: adjusted tsc offset by %llu\n", delta);
1040 }
1041 ns = kvm->arch.last_tsc_nsec;
1042 }
1043 kvm->arch.last_tsc_nsec = ns;
1044 kvm->arch.last_tsc_write = data;
1045 kvm->arch.last_tsc_offset = offset;
1046 kvm_x86_ops->write_tsc_offset(vcpu, offset);
1047 spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
1048
1049 /* Reset of TSC must disable overshoot protection below */
1050 vcpu->arch.hv_clock.tsc_timestamp = 0;
1051 vcpu->arch.last_tsc_write = data;
1052 vcpu->arch.last_tsc_nsec = ns;
1053}
1054EXPORT_SYMBOL_GPL(kvm_write_tsc);
1055
1056static int kvm_guest_time_update(struct kvm_vcpu *v)
899{ 1057{
900 struct timespec ts;
901 unsigned long flags; 1058 unsigned long flags;
902 struct kvm_vcpu_arch *vcpu = &v->arch; 1059 struct kvm_vcpu_arch *vcpu = &v->arch;
903 void *shared_kaddr; 1060 void *shared_kaddr;
904 unsigned long this_tsc_khz; 1061 unsigned long this_tsc_khz;
1062 s64 kernel_ns, max_kernel_ns;
1063 u64 tsc_timestamp;
905 1064
906 if ((!vcpu->time_page)) 1065 /* Keep irq disabled to prevent changes to the clock */
907 return; 1066 local_irq_save(flags);
1067 kvm_get_msr(v, MSR_IA32_TSC, &tsc_timestamp);
1068 kernel_ns = get_kernel_ns();
1069 this_tsc_khz = __get_cpu_var(cpu_tsc_khz);
908 1070
909 this_tsc_khz = get_cpu_var(cpu_tsc_khz); 1071 if (unlikely(this_tsc_khz == 0)) {
910 if (unlikely(vcpu->hv_clock_tsc_khz != this_tsc_khz)) { 1072 local_irq_restore(flags);
911 kvm_set_time_scale(this_tsc_khz, &vcpu->hv_clock); 1073 kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
912 vcpu->hv_clock_tsc_khz = this_tsc_khz; 1074 return 1;
1075 }
1076
1077 /*
1078 * We may have to catch up the TSC to match elapsed wall clock
1079 * time for two reasons, even if kvmclock is used.
1080 * 1) CPU could have been running below the maximum TSC rate
1081 * 2) Broken TSC compensation resets the base at each VCPU
1082 * entry to avoid unknown leaps of TSC even when running
1083 * again on the same CPU. This may cause apparent elapsed
1084 * time to disappear, and the guest to stand still or run
1085 * very slowly.
1086 */
1087 if (vcpu->tsc_catchup) {
1088 u64 tsc = compute_guest_tsc(v, kernel_ns);
1089 if (tsc > tsc_timestamp) {
1090 kvm_x86_ops->adjust_tsc_offset(v, tsc - tsc_timestamp);
1091 tsc_timestamp = tsc;
1092 }
913 } 1093 }
914 put_cpu_var(cpu_tsc_khz);
915 1094
916 /* Keep irq disabled to prevent changes to the clock */
917 local_irq_save(flags);
918 kvm_get_msr(v, MSR_IA32_TSC, &vcpu->hv_clock.tsc_timestamp);
919 ktime_get_ts(&ts);
920 monotonic_to_bootbased(&ts);
921 local_irq_restore(flags); 1095 local_irq_restore(flags);
922 1096
923 /* With all the info we got, fill in the values */ 1097 if (!vcpu->time_page)
1098 return 0;
924 1099
925 vcpu->hv_clock.system_time = ts.tv_nsec + 1100 /*
926 (NSEC_PER_SEC * (u64)ts.tv_sec) + v->kvm->arch.kvmclock_offset; 1101 * Time as measured by the TSC may go backwards when resetting the base
1102 * tsc_timestamp. The reason for this is that the TSC resolution is
1103 * higher than the resolution of the other clock scales. Thus, many
1104 * possible measurments of the TSC correspond to one measurement of any
1105 * other clock, and so a spread of values is possible. This is not a
1106 * problem for the computation of the nanosecond clock; with TSC rates
1107 * around 1GHZ, there can only be a few cycles which correspond to one
1108 * nanosecond value, and any path through this code will inevitably
1109 * take longer than that. However, with the kernel_ns value itself,
1110 * the precision may be much lower, down to HZ granularity. If the
1111 * first sampling of TSC against kernel_ns ends in the low part of the
1112 * range, and the second in the high end of the range, we can get:
1113 *
1114 * (TSC - offset_low) * S + kns_old > (TSC - offset_high) * S + kns_new
1115 *
1116 * As the sampling errors potentially range in the thousands of cycles,
1117 * it is possible such a time value has already been observed by the
1118 * guest. To protect against this, we must compute the system time as
1119 * observed by the guest and ensure the new system time is greater.
1120 */
1121 max_kernel_ns = 0;
1122 if (vcpu->hv_clock.tsc_timestamp && vcpu->last_guest_tsc) {
1123 max_kernel_ns = vcpu->last_guest_tsc -
1124 vcpu->hv_clock.tsc_timestamp;
1125 max_kernel_ns = pvclock_scale_delta(max_kernel_ns,
1126 vcpu->hv_clock.tsc_to_system_mul,
1127 vcpu->hv_clock.tsc_shift);
1128 max_kernel_ns += vcpu->last_kernel_ns;
1129 }
927 1130
1131 if (unlikely(vcpu->hw_tsc_khz != this_tsc_khz)) {
1132 kvm_get_time_scale(NSEC_PER_SEC / 1000, this_tsc_khz,
1133 &vcpu->hv_clock.tsc_shift,
1134 &vcpu->hv_clock.tsc_to_system_mul);
1135 vcpu->hw_tsc_khz = this_tsc_khz;
1136 }
1137
1138 if (max_kernel_ns > kernel_ns)
1139 kernel_ns = max_kernel_ns;
1140
1141 /* With all the info we got, fill in the values */
1142 vcpu->hv_clock.tsc_timestamp = tsc_timestamp;
1143 vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
1144 vcpu->last_kernel_ns = kernel_ns;
1145 vcpu->last_guest_tsc = tsc_timestamp;
928 vcpu->hv_clock.flags = 0; 1146 vcpu->hv_clock.flags = 0;
929 1147
930 /* 1148 /*
@@ -942,16 +1160,7 @@ static void kvm_write_guest_time(struct kvm_vcpu *v)
942 kunmap_atomic(shared_kaddr, KM_USER0); 1160 kunmap_atomic(shared_kaddr, KM_USER0);
943 1161
944 mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT); 1162 mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT);
945} 1163 return 0;
946
947static int kvm_request_guest_time_update(struct kvm_vcpu *v)
948{
949 struct kvm_vcpu_arch *vcpu = &v->arch;
950
951 if (!vcpu->time_page)
952 return 0;
953 kvm_make_request(KVM_REQ_KVMCLOCK_UPDATE, v);
954 return 1;
955} 1164}
956 1165
957static bool msr_mtrr_valid(unsigned msr) 1166static bool msr_mtrr_valid(unsigned msr)
@@ -1277,6 +1486,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1277 } 1486 }
1278 1487
1279 vcpu->arch.time = data; 1488 vcpu->arch.time = data;
1489 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
1280 1490
1281 /* we verify if the enable bit is set... */ 1491 /* we verify if the enable bit is set... */
1282 if (!(data & 1)) 1492 if (!(data & 1))
@@ -1292,8 +1502,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1292 kvm_release_page_clean(vcpu->arch.time_page); 1502 kvm_release_page_clean(vcpu->arch.time_page);
1293 vcpu->arch.time_page = NULL; 1503 vcpu->arch.time_page = NULL;
1294 } 1504 }
1295
1296 kvm_request_guest_time_update(vcpu);
1297 break; 1505 break;
1298 } 1506 }
1299 case MSR_IA32_MCG_CTL: 1507 case MSR_IA32_MCG_CTL:
@@ -1330,6 +1538,16 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1330 pr_unimpl(vcpu, "unimplemented perfctr wrmsr: " 1538 pr_unimpl(vcpu, "unimplemented perfctr wrmsr: "
1331 "0x%x data 0x%llx\n", msr, data); 1539 "0x%x data 0x%llx\n", msr, data);
1332 break; 1540 break;
1541 case MSR_K7_CLK_CTL:
1542 /*
1543 * Ignore all writes to this no longer documented MSR.
1544 * Writes are only relevant for old K7 processors,
1545 * all pre-dating SVM, but a recommended workaround from
1546 * AMD for these chips. It is possible to speicify the
1547 * affected processor models on the command line, hence
1548 * the need to ignore the workaround.
1549 */
1550 break;
1333 case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15: 1551 case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
1334 if (kvm_hv_msr_partition_wide(msr)) { 1552 if (kvm_hv_msr_partition_wide(msr)) {
1335 int r; 1553 int r;
@@ -1522,6 +1740,20 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1522 case 0xcd: /* fsb frequency */ 1740 case 0xcd: /* fsb frequency */
1523 data = 3; 1741 data = 3;
1524 break; 1742 break;
1743 /*
1744 * MSR_EBC_FREQUENCY_ID
1745 * Conservative value valid for even the basic CPU models.
1746 * Models 0,1: 000 in bits 23:21 indicating a bus speed of
1747 * 100MHz, model 2 000 in bits 18:16 indicating 100MHz,
1748 * and 266MHz for model 3, or 4. Set Core Clock
1749 * Frequency to System Bus Frequency Ratio to 1 (bits
1750 * 31:24) even though these are only valid for CPU
1751 * models > 2, however guests may end up dividing or
1752 * multiplying by zero otherwise.
1753 */
1754 case MSR_EBC_FREQUENCY_ID:
1755 data = 1 << 24;
1756 break;
1525 case MSR_IA32_APICBASE: 1757 case MSR_IA32_APICBASE:
1526 data = kvm_get_apic_base(vcpu); 1758 data = kvm_get_apic_base(vcpu);
1527 break; 1759 break;
@@ -1555,6 +1787,18 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1555 case MSR_IA32_MCG_STATUS: 1787 case MSR_IA32_MCG_STATUS:
1556 case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1: 1788 case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1:
1557 return get_msr_mce(vcpu, msr, pdata); 1789 return get_msr_mce(vcpu, msr, pdata);
1790 case MSR_K7_CLK_CTL:
1791 /*
1792 * Provide expected ramp-up count for K7. All other
1793 * are set to zero, indicating minimum divisors for
1794 * every field.
1795 *
1796 * This prevents guest kernels on AMD host with CPU
1797 * type 6, model 8 and higher from exploding due to
1798 * the rdmsr failing.
1799 */
1800 data = 0x20000000;
1801 break;
1558 case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15: 1802 case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
1559 if (kvm_hv_msr_partition_wide(msr)) { 1803 if (kvm_hv_msr_partition_wide(msr)) {
1560 int r; 1804 int r;
@@ -1808,19 +2052,28 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
1808 } 2052 }
1809 2053
1810 kvm_x86_ops->vcpu_load(vcpu, cpu); 2054 kvm_x86_ops->vcpu_load(vcpu, cpu);
1811 if (unlikely(per_cpu(cpu_tsc_khz, cpu) == 0)) { 2055 if (unlikely(vcpu->cpu != cpu) || check_tsc_unstable()) {
1812 unsigned long khz = cpufreq_quick_get(cpu); 2056 /* Make sure TSC doesn't go backwards */
1813 if (!khz) 2057 s64 tsc_delta = !vcpu->arch.last_host_tsc ? 0 :
1814 khz = tsc_khz; 2058 native_read_tsc() - vcpu->arch.last_host_tsc;
1815 per_cpu(cpu_tsc_khz, cpu) = khz; 2059 if (tsc_delta < 0)
2060 mark_tsc_unstable("KVM discovered backwards TSC");
2061 if (check_tsc_unstable()) {
2062 kvm_x86_ops->adjust_tsc_offset(vcpu, -tsc_delta);
2063 vcpu->arch.tsc_catchup = 1;
2064 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
2065 }
2066 if (vcpu->cpu != cpu)
2067 kvm_migrate_timers(vcpu);
2068 vcpu->cpu = cpu;
1816 } 2069 }
1817 kvm_request_guest_time_update(vcpu);
1818} 2070}
1819 2071
1820void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) 2072void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
1821{ 2073{
1822 kvm_x86_ops->vcpu_put(vcpu); 2074 kvm_x86_ops->vcpu_put(vcpu);
1823 kvm_put_guest_fpu(vcpu); 2075 kvm_put_guest_fpu(vcpu);
2076 vcpu->arch.last_host_tsc = native_read_tsc();
1824} 2077}
1825 2078
1826static int is_efer_nx(void) 2079static int is_efer_nx(void)
@@ -1995,7 +2248,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
1995 F(F16C); 2248 F(F16C);
1996 /* cpuid 0x80000001.ecx */ 2249 /* cpuid 0x80000001.ecx */
1997 const u32 kvm_supported_word6_x86_features = 2250 const u32 kvm_supported_word6_x86_features =
1998 F(LAHF_LM) | F(CMP_LEGACY) | F(SVM) | 0 /* ExtApicSpace */ | 2251 F(LAHF_LM) | F(CMP_LEGACY) | 0 /*SVM*/ | 0 /* ExtApicSpace */ |
1999 F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) | 2252 F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) |
2000 F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(XOP) | 2253 F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(XOP) |
2001 0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM); 2254 0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM);
@@ -2204,6 +2457,7 @@ static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
2204 return -ENXIO; 2457 return -ENXIO;
2205 2458
2206 kvm_queue_interrupt(vcpu, irq->irq, false); 2459 kvm_queue_interrupt(vcpu, irq->irq, false);
2460 kvm_make_request(KVM_REQ_EVENT, vcpu);
2207 2461
2208 return 0; 2462 return 0;
2209} 2463}
@@ -2357,6 +2611,8 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
2357 if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR) 2611 if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR)
2358 vcpu->arch.sipi_vector = events->sipi_vector; 2612 vcpu->arch.sipi_vector = events->sipi_vector;
2359 2613
2614 kvm_make_request(KVM_REQ_EVENT, vcpu);
2615
2360 return 0; 2616 return 0;
2361} 2617}
2362 2618
@@ -2760,7 +3016,7 @@ static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
2760 3016
2761static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm) 3017static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm)
2762{ 3018{
2763 return kvm->arch.n_alloc_mmu_pages; 3019 return kvm->arch.n_max_mmu_pages;
2764} 3020}
2765 3021
2766static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) 3022static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
@@ -2796,18 +3052,18 @@ static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
2796 r = 0; 3052 r = 0;
2797 switch (chip->chip_id) { 3053 switch (chip->chip_id) {
2798 case KVM_IRQCHIP_PIC_MASTER: 3054 case KVM_IRQCHIP_PIC_MASTER:
2799 raw_spin_lock(&pic_irqchip(kvm)->lock); 3055 spin_lock(&pic_irqchip(kvm)->lock);
2800 memcpy(&pic_irqchip(kvm)->pics[0], 3056 memcpy(&pic_irqchip(kvm)->pics[0],
2801 &chip->chip.pic, 3057 &chip->chip.pic,
2802 sizeof(struct kvm_pic_state)); 3058 sizeof(struct kvm_pic_state));
2803 raw_spin_unlock(&pic_irqchip(kvm)->lock); 3059 spin_unlock(&pic_irqchip(kvm)->lock);
2804 break; 3060 break;
2805 case KVM_IRQCHIP_PIC_SLAVE: 3061 case KVM_IRQCHIP_PIC_SLAVE:
2806 raw_spin_lock(&pic_irqchip(kvm)->lock); 3062 spin_lock(&pic_irqchip(kvm)->lock);
2807 memcpy(&pic_irqchip(kvm)->pics[1], 3063 memcpy(&pic_irqchip(kvm)->pics[1],
2808 &chip->chip.pic, 3064 &chip->chip.pic,
2809 sizeof(struct kvm_pic_state)); 3065 sizeof(struct kvm_pic_state));
2810 raw_spin_unlock(&pic_irqchip(kvm)->lock); 3066 spin_unlock(&pic_irqchip(kvm)->lock);
2811 break; 3067 break;
2812 case KVM_IRQCHIP_IOAPIC: 3068 case KVM_IRQCHIP_IOAPIC:
2813 r = kvm_set_ioapic(kvm, &chip->chip.ioapic); 3069 r = kvm_set_ioapic(kvm, &chip->chip.ioapic);
@@ -3201,7 +3457,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
3201 break; 3457 break;
3202 } 3458 }
3203 case KVM_SET_CLOCK: { 3459 case KVM_SET_CLOCK: {
3204 struct timespec now;
3205 struct kvm_clock_data user_ns; 3460 struct kvm_clock_data user_ns;
3206 u64 now_ns; 3461 u64 now_ns;
3207 s64 delta; 3462 s64 delta;
@@ -3215,20 +3470,21 @@ long kvm_arch_vm_ioctl(struct file *filp,
3215 goto out; 3470 goto out;
3216 3471
3217 r = 0; 3472 r = 0;
3218 ktime_get_ts(&now); 3473 local_irq_disable();
3219 now_ns = timespec_to_ns(&now); 3474 now_ns = get_kernel_ns();
3220 delta = user_ns.clock - now_ns; 3475 delta = user_ns.clock - now_ns;
3476 local_irq_enable();
3221 kvm->arch.kvmclock_offset = delta; 3477 kvm->arch.kvmclock_offset = delta;
3222 break; 3478 break;
3223 } 3479 }
3224 case KVM_GET_CLOCK: { 3480 case KVM_GET_CLOCK: {
3225 struct timespec now;
3226 struct kvm_clock_data user_ns; 3481 struct kvm_clock_data user_ns;
3227 u64 now_ns; 3482 u64 now_ns;
3228 3483
3229 ktime_get_ts(&now); 3484 local_irq_disable();
3230 now_ns = timespec_to_ns(&now); 3485 now_ns = get_kernel_ns();
3231 user_ns.clock = kvm->arch.kvmclock_offset + now_ns; 3486 user_ns.clock = kvm->arch.kvmclock_offset + now_ns;
3487 local_irq_enable();
3232 user_ns.flags = 0; 3488 user_ns.flags = 0;
3233 3489
3234 r = -EFAULT; 3490 r = -EFAULT;
@@ -3292,30 +3548,51 @@ void kvm_get_segment(struct kvm_vcpu *vcpu,
3292 kvm_x86_ops->get_segment(vcpu, var, seg); 3548 kvm_x86_ops->get_segment(vcpu, var, seg);
3293} 3549}
3294 3550
3551static gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access)
3552{
3553 return gpa;
3554}
3555
3556static gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access)
3557{
3558 gpa_t t_gpa;
3559 u32 error;
3560
3561 BUG_ON(!mmu_is_nested(vcpu));
3562
3563 /* NPT walks are always user-walks */
3564 access |= PFERR_USER_MASK;
3565 t_gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gpa, access, &error);
3566 if (t_gpa == UNMAPPED_GVA)
3567 vcpu->arch.fault.nested = true;
3568
3569 return t_gpa;
3570}
3571
3295gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) 3572gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, u32 *error)
3296{ 3573{
3297 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; 3574 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
3298 return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error); 3575 return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, error);
3299} 3576}
3300 3577
3301 gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) 3578 gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, u32 *error)
3302{ 3579{
3303 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; 3580 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
3304 access |= PFERR_FETCH_MASK; 3581 access |= PFERR_FETCH_MASK;
3305 return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error); 3582 return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, error);
3306} 3583}
3307 3584
3308gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) 3585gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, u32 *error)
3309{ 3586{
3310 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; 3587 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
3311 access |= PFERR_WRITE_MASK; 3588 access |= PFERR_WRITE_MASK;
3312 return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error); 3589 return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, error);
3313} 3590}
3314 3591
3315/* uses this to access any guest's mapped memory without checking CPL */ 3592/* uses this to access any guest's mapped memory without checking CPL */
3316gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) 3593gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, u32 *error)
3317{ 3594{
3318 return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, 0, error); 3595 return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, 0, error);
3319} 3596}
3320 3597
3321static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes, 3598static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes,
@@ -3326,7 +3603,8 @@ static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes,
3326 int r = X86EMUL_CONTINUE; 3603 int r = X86EMUL_CONTINUE;
3327 3604
3328 while (bytes) { 3605 while (bytes) {
3329 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr, access, error); 3606 gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, access,
3607 error);
3330 unsigned offset = addr & (PAGE_SIZE-1); 3608 unsigned offset = addr & (PAGE_SIZE-1);
3331 unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset); 3609 unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset);
3332 int ret; 3610 int ret;
@@ -3381,8 +3659,9 @@ static int kvm_write_guest_virt_system(gva_t addr, void *val,
3381 int r = X86EMUL_CONTINUE; 3659 int r = X86EMUL_CONTINUE;
3382 3660
3383 while (bytes) { 3661 while (bytes) {
3384 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr, 3662 gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr,
3385 PFERR_WRITE_MASK, error); 3663 PFERR_WRITE_MASK,
3664 error);
3386 unsigned offset = addr & (PAGE_SIZE-1); 3665 unsigned offset = addr & (PAGE_SIZE-1);
3387 unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset); 3666 unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset);
3388 int ret; 3667 int ret;
@@ -3624,7 +3903,7 @@ static int emulator_pio_in_emulated(int size, unsigned short port, void *val,
3624 if (vcpu->arch.pio.count) 3903 if (vcpu->arch.pio.count)
3625 goto data_avail; 3904 goto data_avail;
3626 3905
3627 trace_kvm_pio(1, port, size, 1); 3906 trace_kvm_pio(0, port, size, 1);
3628 3907
3629 vcpu->arch.pio.port = port; 3908 vcpu->arch.pio.port = port;
3630 vcpu->arch.pio.in = 1; 3909 vcpu->arch.pio.in = 1;
@@ -3652,7 +3931,7 @@ static int emulator_pio_out_emulated(int size, unsigned short port,
3652 const void *val, unsigned int count, 3931 const void *val, unsigned int count,
3653 struct kvm_vcpu *vcpu) 3932 struct kvm_vcpu *vcpu)
3654{ 3933{
3655 trace_kvm_pio(0, port, size, 1); 3934 trace_kvm_pio(1, port, size, 1);
3656 3935
3657 vcpu->arch.pio.port = port; 3936 vcpu->arch.pio.port = port;
3658 vcpu->arch.pio.in = 0; 3937 vcpu->arch.pio.in = 0;
@@ -3791,6 +4070,11 @@ static void emulator_get_gdt(struct desc_ptr *dt, struct kvm_vcpu *vcpu)
3791 kvm_x86_ops->get_gdt(vcpu, dt); 4070 kvm_x86_ops->get_gdt(vcpu, dt);
3792} 4071}
3793 4072
4073static void emulator_get_idt(struct desc_ptr *dt, struct kvm_vcpu *vcpu)
4074{
4075 kvm_x86_ops->get_idt(vcpu, dt);
4076}
4077
3794static unsigned long emulator_get_cached_segment_base(int seg, 4078static unsigned long emulator_get_cached_segment_base(int seg,
3795 struct kvm_vcpu *vcpu) 4079 struct kvm_vcpu *vcpu)
3796{ 4080{
@@ -3884,6 +4168,7 @@ static struct x86_emulate_ops emulate_ops = {
3884 .set_segment_selector = emulator_set_segment_selector, 4168 .set_segment_selector = emulator_set_segment_selector,
3885 .get_cached_segment_base = emulator_get_cached_segment_base, 4169 .get_cached_segment_base = emulator_get_cached_segment_base,
3886 .get_gdt = emulator_get_gdt, 4170 .get_gdt = emulator_get_gdt,
4171 .get_idt = emulator_get_idt,
3887 .get_cr = emulator_get_cr, 4172 .get_cr = emulator_get_cr,
3888 .set_cr = emulator_set_cr, 4173 .set_cr = emulator_set_cr,
3889 .cpl = emulator_get_cpl, 4174 .cpl = emulator_get_cpl,
@@ -3919,13 +4204,64 @@ static void inject_emulated_exception(struct kvm_vcpu *vcpu)
3919{ 4204{
3920 struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; 4205 struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
3921 if (ctxt->exception == PF_VECTOR) 4206 if (ctxt->exception == PF_VECTOR)
3922 kvm_inject_page_fault(vcpu, ctxt->cr2, ctxt->error_code); 4207 kvm_propagate_fault(vcpu);
3923 else if (ctxt->error_code_valid) 4208 else if (ctxt->error_code_valid)
3924 kvm_queue_exception_e(vcpu, ctxt->exception, ctxt->error_code); 4209 kvm_queue_exception_e(vcpu, ctxt->exception, ctxt->error_code);
3925 else 4210 else
3926 kvm_queue_exception(vcpu, ctxt->exception); 4211 kvm_queue_exception(vcpu, ctxt->exception);
3927} 4212}
3928 4213
4214static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
4215{
4216 struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode;
4217 int cs_db, cs_l;
4218
4219 cache_all_regs(vcpu);
4220
4221 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
4222
4223 vcpu->arch.emulate_ctxt.vcpu = vcpu;
4224 vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu);
4225 vcpu->arch.emulate_ctxt.eip = kvm_rip_read(vcpu);
4226 vcpu->arch.emulate_ctxt.mode =
4227 (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL :
4228 (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM)
4229 ? X86EMUL_MODE_VM86 : cs_l
4230 ? X86EMUL_MODE_PROT64 : cs_db
4231 ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
4232 memset(c, 0, sizeof(struct decode_cache));
4233 memcpy(c->regs, vcpu->arch.regs, sizeof c->regs);
4234}
4235
4236int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq)
4237{
4238 struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode;
4239 int ret;
4240
4241 init_emulate_ctxt(vcpu);
4242
4243 vcpu->arch.emulate_ctxt.decode.op_bytes = 2;
4244 vcpu->arch.emulate_ctxt.decode.ad_bytes = 2;
4245 vcpu->arch.emulate_ctxt.decode.eip = vcpu->arch.emulate_ctxt.eip;
4246 ret = emulate_int_real(&vcpu->arch.emulate_ctxt, &emulate_ops, irq);
4247
4248 if (ret != X86EMUL_CONTINUE)
4249 return EMULATE_FAIL;
4250
4251 vcpu->arch.emulate_ctxt.eip = c->eip;
4252 memcpy(vcpu->arch.regs, c->regs, sizeof c->regs);
4253 kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip);
4254 kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
4255
4256 if (irq == NMI_VECTOR)
4257 vcpu->arch.nmi_pending = false;
4258 else
4259 vcpu->arch.interrupt.pending = false;
4260
4261 return EMULATE_DONE;
4262}
4263EXPORT_SYMBOL_GPL(kvm_inject_realmode_interrupt);
4264
3929static int handle_emulation_failure(struct kvm_vcpu *vcpu) 4265static int handle_emulation_failure(struct kvm_vcpu *vcpu)
3930{ 4266{
3931 ++vcpu->stat.insn_emulation_fail; 4267 ++vcpu->stat.insn_emulation_fail;
@@ -3982,24 +4318,15 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
3982 cache_all_regs(vcpu); 4318 cache_all_regs(vcpu);
3983 4319
3984 if (!(emulation_type & EMULTYPE_NO_DECODE)) { 4320 if (!(emulation_type & EMULTYPE_NO_DECODE)) {
3985 int cs_db, cs_l; 4321 init_emulate_ctxt(vcpu);
3986 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
3987
3988 vcpu->arch.emulate_ctxt.vcpu = vcpu;
3989 vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu);
3990 vcpu->arch.emulate_ctxt.eip = kvm_rip_read(vcpu);
3991 vcpu->arch.emulate_ctxt.mode =
3992 (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL :
3993 (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM)
3994 ? X86EMUL_MODE_VM86 : cs_l
3995 ? X86EMUL_MODE_PROT64 : cs_db
3996 ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
3997 memset(c, 0, sizeof(struct decode_cache));
3998 memcpy(c->regs, vcpu->arch.regs, sizeof c->regs);
3999 vcpu->arch.emulate_ctxt.interruptibility = 0; 4322 vcpu->arch.emulate_ctxt.interruptibility = 0;
4000 vcpu->arch.emulate_ctxt.exception = -1; 4323 vcpu->arch.emulate_ctxt.exception = -1;
4324 vcpu->arch.emulate_ctxt.perm_ok = false;
4325
4326 r = x86_decode_insn(&vcpu->arch.emulate_ctxt);
4327 if (r == X86EMUL_PROPAGATE_FAULT)
4328 goto done;
4001 4329
4002 r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
4003 trace_kvm_emulate_insn_start(vcpu); 4330 trace_kvm_emulate_insn_start(vcpu);
4004 4331
4005 /* Only allow emulation of specific instructions on #UD 4332 /* Only allow emulation of specific instructions on #UD
@@ -4049,41 +4376,39 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
4049 memcpy(c->regs, vcpu->arch.regs, sizeof c->regs); 4376 memcpy(c->regs, vcpu->arch.regs, sizeof c->regs);
4050 4377
4051restart: 4378restart:
4052 r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); 4379 r = x86_emulate_insn(&vcpu->arch.emulate_ctxt);
4053 4380
4054 if (r) { /* emulation failed */ 4381 if (r == EMULATION_FAILED) {
4055 if (reexecute_instruction(vcpu, cr2)) 4382 if (reexecute_instruction(vcpu, cr2))
4056 return EMULATE_DONE; 4383 return EMULATE_DONE;
4057 4384
4058 return handle_emulation_failure(vcpu); 4385 return handle_emulation_failure(vcpu);
4059 } 4386 }
4060 4387
4061 toggle_interruptibility(vcpu, vcpu->arch.emulate_ctxt.interruptibility); 4388done:
4062 kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
4063 memcpy(vcpu->arch.regs, c->regs, sizeof c->regs);
4064 kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip);
4065
4066 if (vcpu->arch.emulate_ctxt.exception >= 0) { 4389 if (vcpu->arch.emulate_ctxt.exception >= 0) {
4067 inject_emulated_exception(vcpu); 4390 inject_emulated_exception(vcpu);
4068 return EMULATE_DONE; 4391 r = EMULATE_DONE;
4069 } 4392 } else if (vcpu->arch.pio.count) {
4070
4071 if (vcpu->arch.pio.count) {
4072 if (!vcpu->arch.pio.in) 4393 if (!vcpu->arch.pio.in)
4073 vcpu->arch.pio.count = 0; 4394 vcpu->arch.pio.count = 0;
4074 return EMULATE_DO_MMIO; 4395 r = EMULATE_DO_MMIO;
4075 } 4396 } else if (vcpu->mmio_needed) {
4076
4077 if (vcpu->mmio_needed) {
4078 if (vcpu->mmio_is_write) 4397 if (vcpu->mmio_is_write)
4079 vcpu->mmio_needed = 0; 4398 vcpu->mmio_needed = 0;
4080 return EMULATE_DO_MMIO; 4399 r = EMULATE_DO_MMIO;
4081 } 4400 } else if (r == EMULATION_RESTART)
4082
4083 if (vcpu->arch.emulate_ctxt.restart)
4084 goto restart; 4401 goto restart;
4402 else
4403 r = EMULATE_DONE;
4085 4404
4086 return EMULATE_DONE; 4405 toggle_interruptibility(vcpu, vcpu->arch.emulate_ctxt.interruptibility);
4406 kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
4407 kvm_make_request(KVM_REQ_EVENT, vcpu);
4408 memcpy(vcpu->arch.regs, c->regs, sizeof c->regs);
4409 kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip);
4410
4411 return r;
4087} 4412}
4088EXPORT_SYMBOL_GPL(emulate_instruction); 4413EXPORT_SYMBOL_GPL(emulate_instruction);
4089 4414
@@ -4097,9 +4422,23 @@ int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port)
4097} 4422}
4098EXPORT_SYMBOL_GPL(kvm_fast_pio_out); 4423EXPORT_SYMBOL_GPL(kvm_fast_pio_out);
4099 4424
4100static void bounce_off(void *info) 4425static void tsc_bad(void *info)
4426{
4427 __get_cpu_var(cpu_tsc_khz) = 0;
4428}
4429
4430static void tsc_khz_changed(void *data)
4101{ 4431{
4102 /* nothing */ 4432 struct cpufreq_freqs *freq = data;
4433 unsigned long khz = 0;
4434
4435 if (data)
4436 khz = freq->new;
4437 else if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
4438 khz = cpufreq_quick_get(raw_smp_processor_id());
4439 if (!khz)
4440 khz = tsc_khz;
4441 __get_cpu_var(cpu_tsc_khz) = khz;
4103} 4442}
4104 4443
4105static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val, 4444static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
@@ -4110,21 +4449,60 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va
4110 struct kvm_vcpu *vcpu; 4449 struct kvm_vcpu *vcpu;
4111 int i, send_ipi = 0; 4450 int i, send_ipi = 0;
4112 4451
4452 /*
4453 * We allow guests to temporarily run on slowing clocks,
4454 * provided we notify them after, or to run on accelerating
4455 * clocks, provided we notify them before. Thus time never
4456 * goes backwards.
4457 *
4458 * However, we have a problem. We can't atomically update
4459 * the frequency of a given CPU from this function; it is
4460 * merely a notifier, which can be called from any CPU.
4461 * Changing the TSC frequency at arbitrary points in time
4462 * requires a recomputation of local variables related to
4463 * the TSC for each VCPU. We must flag these local variables
4464 * to be updated and be sure the update takes place with the
4465 * new frequency before any guests proceed.
4466 *
4467 * Unfortunately, the combination of hotplug CPU and frequency
4468 * change creates an intractable locking scenario; the order
4469 * of when these callouts happen is undefined with respect to
4470 * CPU hotplug, and they can race with each other. As such,
4471 * merely setting per_cpu(cpu_tsc_khz) = X during a hotadd is
4472 * undefined; you can actually have a CPU frequency change take
4473 * place in between the computation of X and the setting of the
4474 * variable. To protect against this problem, all updates of
4475 * the per_cpu tsc_khz variable are done in an interrupt
4476 * protected IPI, and all callers wishing to update the value
4477 * must wait for a synchronous IPI to complete (which is trivial
4478 * if the caller is on the CPU already). This establishes the
4479 * necessary total order on variable updates.
4480 *
4481 * Note that because a guest time update may take place
4482 * anytime after the setting of the VCPU's request bit, the
4483 * correct TSC value must be set before the request. However,
4484 * to ensure the update actually makes it to any guest which
4485 * starts running in hardware virtualization between the set
4486 * and the acquisition of the spinlock, we must also ping the
4487 * CPU after setting the request bit.
4488 *
4489 */
4490
4113 if (val == CPUFREQ_PRECHANGE && freq->old > freq->new) 4491 if (val == CPUFREQ_PRECHANGE && freq->old > freq->new)
4114 return 0; 4492 return 0;
4115 if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new) 4493 if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new)
4116 return 0; 4494 return 0;
4117 per_cpu(cpu_tsc_khz, freq->cpu) = freq->new; 4495
4496 smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1);
4118 4497
4119 spin_lock(&kvm_lock); 4498 spin_lock(&kvm_lock);
4120 list_for_each_entry(kvm, &vm_list, vm_list) { 4499 list_for_each_entry(kvm, &vm_list, vm_list) {
4121 kvm_for_each_vcpu(i, vcpu, kvm) { 4500 kvm_for_each_vcpu(i, vcpu, kvm) {
4122 if (vcpu->cpu != freq->cpu) 4501 if (vcpu->cpu != freq->cpu)
4123 continue; 4502 continue;
4124 if (!kvm_request_guest_time_update(vcpu)) 4503 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
4125 continue;
4126 if (vcpu->cpu != smp_processor_id()) 4504 if (vcpu->cpu != smp_processor_id())
4127 send_ipi++; 4505 send_ipi = 1;
4128 } 4506 }
4129 } 4507 }
4130 spin_unlock(&kvm_lock); 4508 spin_unlock(&kvm_lock);
@@ -4142,32 +4520,57 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va
4142 * guest context is entered kvmclock will be updated, 4520 * guest context is entered kvmclock will be updated,
4143 * so the guest will not see stale values. 4521 * so the guest will not see stale values.
4144 */ 4522 */
4145 smp_call_function_single(freq->cpu, bounce_off, NULL, 1); 4523 smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1);
4146 } 4524 }
4147 return 0; 4525 return 0;
4148} 4526}
4149 4527
4150static struct notifier_block kvmclock_cpufreq_notifier_block = { 4528static struct notifier_block kvmclock_cpufreq_notifier_block = {
4151 .notifier_call = kvmclock_cpufreq_notifier 4529 .notifier_call = kvmclock_cpufreq_notifier
4530};
4531
4532static int kvmclock_cpu_notifier(struct notifier_block *nfb,
4533 unsigned long action, void *hcpu)
4534{
4535 unsigned int cpu = (unsigned long)hcpu;
4536
4537 switch (action) {
4538 case CPU_ONLINE:
4539 case CPU_DOWN_FAILED:
4540 smp_call_function_single(cpu, tsc_khz_changed, NULL, 1);
4541 break;
4542 case CPU_DOWN_PREPARE:
4543 smp_call_function_single(cpu, tsc_bad, NULL, 1);
4544 break;
4545 }
4546 return NOTIFY_OK;
4547}
4548
4549static struct notifier_block kvmclock_cpu_notifier_block = {
4550 .notifier_call = kvmclock_cpu_notifier,
4551 .priority = -INT_MAX
4152}; 4552};
4153 4553
4154static void kvm_timer_init(void) 4554static void kvm_timer_init(void)
4155{ 4555{
4156 int cpu; 4556 int cpu;
4157 4557
4558 max_tsc_khz = tsc_khz;
4559 register_hotcpu_notifier(&kvmclock_cpu_notifier_block);
4158 if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) { 4560 if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
4561#ifdef CONFIG_CPU_FREQ
4562 struct cpufreq_policy policy;
4563 memset(&policy, 0, sizeof(policy));
4564 cpufreq_get_policy(&policy, get_cpu());
4565 if (policy.cpuinfo.max_freq)
4566 max_tsc_khz = policy.cpuinfo.max_freq;
4567#endif
4159 cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block, 4568 cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block,
4160 CPUFREQ_TRANSITION_NOTIFIER); 4569 CPUFREQ_TRANSITION_NOTIFIER);
4161 for_each_online_cpu(cpu) {
4162 unsigned long khz = cpufreq_get(cpu);
4163 if (!khz)
4164 khz = tsc_khz;
4165 per_cpu(cpu_tsc_khz, cpu) = khz;
4166 }
4167 } else {
4168 for_each_possible_cpu(cpu)
4169 per_cpu(cpu_tsc_khz, cpu) = tsc_khz;
4170 } 4570 }
4571 pr_debug("kvm: max_tsc_khz = %ld\n", max_tsc_khz);
4572 for_each_online_cpu(cpu)
4573 smp_call_function_single(cpu, tsc_khz_changed, NULL, 1);
4171} 4574}
4172 4575
4173static DEFINE_PER_CPU(struct kvm_vcpu *, current_vcpu); 4576static DEFINE_PER_CPU(struct kvm_vcpu *, current_vcpu);
@@ -4269,6 +4672,7 @@ void kvm_arch_exit(void)
4269 if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) 4672 if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
4270 cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block, 4673 cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block,
4271 CPUFREQ_TRANSITION_NOTIFIER); 4674 CPUFREQ_TRANSITION_NOTIFIER);
4675 unregister_hotcpu_notifier(&kvmclock_cpu_notifier_block);
4272 kvm_x86_ops = NULL; 4676 kvm_x86_ops = NULL;
4273 kvm_mmu_module_exit(); 4677 kvm_mmu_module_exit();
4274} 4678}
@@ -4684,8 +5088,11 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
4684 kvm_mmu_unload(vcpu); 5088 kvm_mmu_unload(vcpu);
4685 if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu)) 5089 if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu))
4686 __kvm_migrate_timers(vcpu); 5090 __kvm_migrate_timers(vcpu);
4687 if (kvm_check_request(KVM_REQ_KVMCLOCK_UPDATE, vcpu)) 5091 if (kvm_check_request(KVM_REQ_CLOCK_UPDATE, vcpu)) {
4688 kvm_write_guest_time(vcpu); 5092 r = kvm_guest_time_update(vcpu);
5093 if (unlikely(r))
5094 goto out;
5095 }
4689 if (kvm_check_request(KVM_REQ_MMU_SYNC, vcpu)) 5096 if (kvm_check_request(KVM_REQ_MMU_SYNC, vcpu))
4690 kvm_mmu_sync_roots(vcpu); 5097 kvm_mmu_sync_roots(vcpu);
4691 if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) 5098 if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu))
@@ -4710,6 +5117,21 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
4710 if (unlikely(r)) 5117 if (unlikely(r))
4711 goto out; 5118 goto out;
4712 5119
5120 if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
5121 inject_pending_event(vcpu);
5122
5123 /* enable NMI/IRQ window open exits if needed */
5124 if (vcpu->arch.nmi_pending)
5125 kvm_x86_ops->enable_nmi_window(vcpu);
5126 else if (kvm_cpu_has_interrupt(vcpu) || req_int_win)
5127 kvm_x86_ops->enable_irq_window(vcpu);
5128
5129 if (kvm_lapic_enabled(vcpu)) {
5130 update_cr8_intercept(vcpu);
5131 kvm_lapic_sync_to_vapic(vcpu);
5132 }
5133 }
5134
4713 preempt_disable(); 5135 preempt_disable();
4714 5136
4715 kvm_x86_ops->prepare_guest_switch(vcpu); 5137 kvm_x86_ops->prepare_guest_switch(vcpu);
@@ -4728,23 +5150,11 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
4728 smp_wmb(); 5150 smp_wmb();
4729 local_irq_enable(); 5151 local_irq_enable();
4730 preempt_enable(); 5152 preempt_enable();
5153 kvm_x86_ops->cancel_injection(vcpu);
4731 r = 1; 5154 r = 1;
4732 goto out; 5155 goto out;
4733 } 5156 }
4734 5157
4735 inject_pending_event(vcpu);
4736
4737 /* enable NMI/IRQ window open exits if needed */
4738 if (vcpu->arch.nmi_pending)
4739 kvm_x86_ops->enable_nmi_window(vcpu);
4740 else if (kvm_cpu_has_interrupt(vcpu) || req_int_win)
4741 kvm_x86_ops->enable_irq_window(vcpu);
4742
4743 if (kvm_lapic_enabled(vcpu)) {
4744 update_cr8_intercept(vcpu);
4745 kvm_lapic_sync_to_vapic(vcpu);
4746 }
4747
4748 srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); 5158 srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
4749 5159
4750 kvm_guest_enter(); 5160 kvm_guest_enter();
@@ -4770,6 +5180,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
4770 if (hw_breakpoint_active()) 5180 if (hw_breakpoint_active())
4771 hw_breakpoint_restore(); 5181 hw_breakpoint_restore();
4772 5182
5183 kvm_get_msr(vcpu, MSR_IA32_TSC, &vcpu->arch.last_guest_tsc);
5184
4773 atomic_set(&vcpu->guest_mode, 0); 5185 atomic_set(&vcpu->guest_mode, 0);
4774 smp_wmb(); 5186 smp_wmb();
4775 local_irq_enable(); 5187 local_irq_enable();
@@ -4899,8 +5311,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
4899 if (!irqchip_in_kernel(vcpu->kvm)) 5311 if (!irqchip_in_kernel(vcpu->kvm))
4900 kvm_set_cr8(vcpu, kvm_run->cr8); 5312 kvm_set_cr8(vcpu, kvm_run->cr8);
4901 5313
4902 if (vcpu->arch.pio.count || vcpu->mmio_needed || 5314 if (vcpu->arch.pio.count || vcpu->mmio_needed) {
4903 vcpu->arch.emulate_ctxt.restart) {
4904 if (vcpu->mmio_needed) { 5315 if (vcpu->mmio_needed) {
4905 memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8); 5316 memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
4906 vcpu->mmio_read_completed = 1; 5317 vcpu->mmio_read_completed = 1;
@@ -4981,6 +5392,8 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
4981 5392
4982 vcpu->arch.exception.pending = false; 5393 vcpu->arch.exception.pending = false;
4983 5394
5395 kvm_make_request(KVM_REQ_EVENT, vcpu);
5396
4984 return 0; 5397 return 0;
4985} 5398}
4986 5399
@@ -5044,6 +5457,7 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
5044 struct kvm_mp_state *mp_state) 5457 struct kvm_mp_state *mp_state)
5045{ 5458{
5046 vcpu->arch.mp_state = mp_state->mp_state; 5459 vcpu->arch.mp_state = mp_state->mp_state;
5460 kvm_make_request(KVM_REQ_EVENT, vcpu);
5047 return 0; 5461 return 0;
5048} 5462}
5049 5463
@@ -5051,24 +5465,11 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason,
5051 bool has_error_code, u32 error_code) 5465 bool has_error_code, u32 error_code)
5052{ 5466{
5053 struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; 5467 struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode;
5054 int cs_db, cs_l, ret; 5468 int ret;
5055 cache_all_regs(vcpu);
5056
5057 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
5058 5469
5059 vcpu->arch.emulate_ctxt.vcpu = vcpu; 5470 init_emulate_ctxt(vcpu);
5060 vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu);
5061 vcpu->arch.emulate_ctxt.eip = kvm_rip_read(vcpu);
5062 vcpu->arch.emulate_ctxt.mode =
5063 (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL :
5064 (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM)
5065 ? X86EMUL_MODE_VM86 : cs_l
5066 ? X86EMUL_MODE_PROT64 : cs_db
5067 ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
5068 memset(c, 0, sizeof(struct decode_cache));
5069 memcpy(c->regs, vcpu->arch.regs, sizeof c->regs);
5070 5471
5071 ret = emulator_task_switch(&vcpu->arch.emulate_ctxt, &emulate_ops, 5472 ret = emulator_task_switch(&vcpu->arch.emulate_ctxt,
5072 tss_selector, reason, has_error_code, 5473 tss_selector, reason, has_error_code,
5073 error_code); 5474 error_code);
5074 5475
@@ -5078,6 +5479,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason,
5078 memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); 5479 memcpy(vcpu->arch.regs, c->regs, sizeof c->regs);
5079 kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip); 5480 kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip);
5080 kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); 5481 kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
5482 kvm_make_request(KVM_REQ_EVENT, vcpu);
5081 return EMULATE_DONE; 5483 return EMULATE_DONE;
5082} 5484}
5083EXPORT_SYMBOL_GPL(kvm_task_switch); 5485EXPORT_SYMBOL_GPL(kvm_task_switch);
@@ -5113,7 +5515,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
5113 mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4; 5515 mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4;
5114 kvm_x86_ops->set_cr4(vcpu, sregs->cr4); 5516 kvm_x86_ops->set_cr4(vcpu, sregs->cr4);
5115 if (!is_long_mode(vcpu) && is_pae(vcpu)) { 5517 if (!is_long_mode(vcpu) && is_pae(vcpu)) {
5116 load_pdptrs(vcpu, vcpu->arch.cr3); 5518 load_pdptrs(vcpu, vcpu->arch.walk_mmu, vcpu->arch.cr3);
5117 mmu_reset_needed = 1; 5519 mmu_reset_needed = 1;
5118 } 5520 }
5119 5521
@@ -5148,6 +5550,8 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
5148 !is_protmode(vcpu)) 5550 !is_protmode(vcpu))
5149 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 5551 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
5150 5552
5553 kvm_make_request(KVM_REQ_EVENT, vcpu);
5554
5151 return 0; 5555 return 0;
5152} 5556}
5153 5557
@@ -5334,6 +5738,10 @@ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
5334struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, 5738struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
5335 unsigned int id) 5739 unsigned int id)
5336{ 5740{
5741 if (check_tsc_unstable() && atomic_read(&kvm->online_vcpus) != 0)
5742 printk_once(KERN_WARNING
5743 "kvm: SMP vm created on host with unstable TSC; "
5744 "guest TSC will not be reliable\n");
5337 return kvm_x86_ops->vcpu_create(kvm, id); 5745 return kvm_x86_ops->vcpu_create(kvm, id);
5338} 5746}
5339 5747
@@ -5376,22 +5784,22 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
5376 vcpu->arch.dr6 = DR6_FIXED_1; 5784 vcpu->arch.dr6 = DR6_FIXED_1;
5377 vcpu->arch.dr7 = DR7_FIXED_1; 5785 vcpu->arch.dr7 = DR7_FIXED_1;
5378 5786
5787 kvm_make_request(KVM_REQ_EVENT, vcpu);
5788
5379 return kvm_x86_ops->vcpu_reset(vcpu); 5789 return kvm_x86_ops->vcpu_reset(vcpu);
5380} 5790}
5381 5791
5382int kvm_arch_hardware_enable(void *garbage) 5792int kvm_arch_hardware_enable(void *garbage)
5383{ 5793{
5384 /* 5794 struct kvm *kvm;
5385 * Since this may be called from a hotplug notifcation, 5795 struct kvm_vcpu *vcpu;
5386 * we can't get the CPU frequency directly. 5796 int i;
5387 */
5388 if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
5389 int cpu = raw_smp_processor_id();
5390 per_cpu(cpu_tsc_khz, cpu) = 0;
5391 }
5392 5797
5393 kvm_shared_msr_cpu_online(); 5798 kvm_shared_msr_cpu_online();
5394 5799 list_for_each_entry(kvm, &vm_list, vm_list)
5800 kvm_for_each_vcpu(i, vcpu, kvm)
5801 if (vcpu->cpu == smp_processor_id())
5802 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
5395 return kvm_x86_ops->hardware_enable(garbage); 5803 return kvm_x86_ops->hardware_enable(garbage);
5396} 5804}
5397 5805
@@ -5425,7 +5833,11 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
5425 BUG_ON(vcpu->kvm == NULL); 5833 BUG_ON(vcpu->kvm == NULL);
5426 kvm = vcpu->kvm; 5834 kvm = vcpu->kvm;
5427 5835
5836 vcpu->arch.emulate_ctxt.ops = &emulate_ops;
5837 vcpu->arch.walk_mmu = &vcpu->arch.mmu;
5428 vcpu->arch.mmu.root_hpa = INVALID_PAGE; 5838 vcpu->arch.mmu.root_hpa = INVALID_PAGE;
5839 vcpu->arch.mmu.translate_gpa = translate_gpa;
5840 vcpu->arch.nested_mmu.translate_gpa = translate_nested_gpa;
5429 if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_bsp(vcpu)) 5841 if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_bsp(vcpu))
5430 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 5842 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
5431 else 5843 else
@@ -5438,6 +5850,9 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
5438 } 5850 }
5439 vcpu->arch.pio_data = page_address(page); 5851 vcpu->arch.pio_data = page_address(page);
5440 5852
5853 if (!kvm->arch.virtual_tsc_khz)
5854 kvm_arch_set_tsc_khz(kvm, max_tsc_khz);
5855
5441 r = kvm_mmu_create(vcpu); 5856 r = kvm_mmu_create(vcpu);
5442 if (r < 0) 5857 if (r < 0)
5443 goto fail_free_pio_data; 5858 goto fail_free_pio_data;
@@ -5497,7 +5912,7 @@ struct kvm *kvm_arch_create_vm(void)
5497 /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */ 5912 /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */
5498 set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap); 5913 set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap);
5499 5914
5500 rdtscll(kvm->arch.vm_init_tsc); 5915 spin_lock_init(&kvm->arch.tsc_write_lock);
5501 5916
5502 return kvm; 5917 return kvm;
5503} 5918}
@@ -5684,6 +6099,7 @@ void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
5684 kvm_is_linear_rip(vcpu, vcpu->arch.singlestep_rip)) 6099 kvm_is_linear_rip(vcpu, vcpu->arch.singlestep_rip))
5685 rflags |= X86_EFLAGS_TF; 6100 rflags |= X86_EFLAGS_TF;
5686 kvm_x86_ops->set_rflags(vcpu, rflags); 6101 kvm_x86_ops->set_rflags(vcpu, rflags);
6102 kvm_make_request(KVM_REQ_EVENT, vcpu);
5687} 6103}
5688EXPORT_SYMBOL_GPL(kvm_set_rflags); 6104EXPORT_SYMBOL_GPL(kvm_set_rflags);
5689 6105
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index b7a404722d2b..2cea414489f3 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -50,6 +50,11 @@ static inline int is_long_mode(struct kvm_vcpu *vcpu)
50#endif 50#endif
51} 51}
52 52
53static inline bool mmu_is_nested(struct kvm_vcpu *vcpu)
54{
55 return vcpu->arch.walk_mmu == &vcpu->arch.nested_mmu;
56}
57
53static inline int is_pae(struct kvm_vcpu *vcpu) 58static inline int is_pae(struct kvm_vcpu *vcpu)
54{ 59{
55 return kvm_read_cr4_bits(vcpu, X86_CR4_PAE); 60 return kvm_read_cr4_bits(vcpu, X86_CR4_PAE);
@@ -67,5 +72,8 @@ static inline int is_paging(struct kvm_vcpu *vcpu)
67 72
68void kvm_before_handle_nmi(struct kvm_vcpu *vcpu); 73void kvm_before_handle_nmi(struct kvm_vcpu *vcpu);
69void kvm_after_handle_nmi(struct kvm_vcpu *vcpu); 74void kvm_after_handle_nmi(struct kvm_vcpu *vcpu);
75int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq);
76
77void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data);
70 78
71#endif 79#endif
diff --git a/drivers/s390/kvm/kvm_virtio.c b/drivers/s390/kvm/kvm_virtio.c
index 4e298bc8949d..5a46b8c5d68a 100644
--- a/drivers/s390/kvm/kvm_virtio.c
+++ b/drivers/s390/kvm/kvm_virtio.c
@@ -32,6 +32,7 @@
32 * The pointer to our (page) of device descriptions. 32 * The pointer to our (page) of device descriptions.
33 */ 33 */
34static void *kvm_devices; 34static void *kvm_devices;
35struct work_struct hotplug_work;
35 36
36struct kvm_device { 37struct kvm_device {
37 struct virtio_device vdev; 38 struct virtio_device vdev;
@@ -328,13 +329,54 @@ static void scan_devices(void)
328} 329}
329 330
330/* 331/*
332 * match for a kvm device with a specific desc pointer
333 */
334static int match_desc(struct device *dev, void *data)
335{
336 if ((ulong)to_kvmdev(dev_to_virtio(dev))->desc == (ulong)data)
337 return 1;
338
339 return 0;
340}
341
342/*
343 * hotplug_device tries to find changes in the device page.
344 */
345static void hotplug_devices(struct work_struct *dummy)
346{
347 unsigned int i;
348 struct kvm_device_desc *d;
349 struct device *dev;
350
351 for (i = 0; i < PAGE_SIZE; i += desc_size(d)) {
352 d = kvm_devices + i;
353
354 /* end of list */
355 if (d->type == 0)
356 break;
357
358 /* device already exists */
359 dev = device_find_child(kvm_root, d, match_desc);
360 if (dev) {
361 /* XXX check for hotplug remove */
362 put_device(dev);
363 continue;
364 }
365
366 /* new device */
367 printk(KERN_INFO "Adding new virtio device %p\n", d);
368 add_kvm_device(d, i);
369 }
370}
371
372/*
331 * we emulate the request_irq behaviour on top of s390 extints 373 * we emulate the request_irq behaviour on top of s390 extints
332 */ 374 */
333static void kvm_extint_handler(u16 code) 375static void kvm_extint_handler(u16 code)
334{ 376{
335 struct virtqueue *vq; 377 struct virtqueue *vq;
336 u16 subcode; 378 u16 subcode;
337 int config_changed; 379 u32 param;
338 380
339 subcode = S390_lowcore.cpu_addr; 381 subcode = S390_lowcore.cpu_addr;
340 if ((subcode & 0xff00) != VIRTIO_SUBCODE_64) 382 if ((subcode & 0xff00) != VIRTIO_SUBCODE_64)
@@ -343,18 +385,28 @@ static void kvm_extint_handler(u16 code)
343 /* The LSB might be overloaded, we have to mask it */ 385 /* The LSB might be overloaded, we have to mask it */
344 vq = (struct virtqueue *)(S390_lowcore.ext_params2 & ~1UL); 386 vq = (struct virtqueue *)(S390_lowcore.ext_params2 & ~1UL);
345 387
346 /* We use the LSB of extparam, to decide, if this interrupt is a config 388 /* We use ext_params to decide what this interrupt means */
347 * change or a "standard" interrupt */ 389 param = S390_lowcore.ext_params & VIRTIO_PARAM_MASK;
348 config_changed = S390_lowcore.ext_params & 1;
349 390
350 if (config_changed) { 391 switch (param) {
392 case VIRTIO_PARAM_CONFIG_CHANGED:
393 {
351 struct virtio_driver *drv; 394 struct virtio_driver *drv;
352 drv = container_of(vq->vdev->dev.driver, 395 drv = container_of(vq->vdev->dev.driver,
353 struct virtio_driver, driver); 396 struct virtio_driver, driver);
354 if (drv->config_changed) 397 if (drv->config_changed)
355 drv->config_changed(vq->vdev); 398 drv->config_changed(vq->vdev);
356 } else 399
400 break;
401 }
402 case VIRTIO_PARAM_DEV_ADD:
403 schedule_work(&hotplug_work);
404 break;
405 case VIRTIO_PARAM_VRING_INTERRUPT:
406 default:
357 vring_interrupt(0, vq); 407 vring_interrupt(0, vq);
408 break;
409 }
358} 410}
359 411
360/* 412/*
@@ -383,6 +435,8 @@ static int __init kvm_devices_init(void)
383 435
384 kvm_devices = (void *) real_memory_size; 436 kvm_devices = (void *) real_memory_size;
385 437
438 INIT_WORK(&hotplug_work, hotplug_devices);
439
386 ctl_set_bit(0, 9); 440 ctl_set_bit(0, 9);
387 register_external_interrupt(0x2603, kvm_extint_handler); 441 register_external_interrupt(0x2603, kvm_extint_handler);
388 442
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 636fc381c897..919ae53adc5c 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -414,6 +414,14 @@ struct kvm_enable_cap {
414 __u8 pad[64]; 414 __u8 pad[64];
415}; 415};
416 416
417/* for KVM_PPC_GET_PVINFO */
418struct kvm_ppc_pvinfo {
419 /* out */
420 __u32 flags;
421 __u32 hcall[4];
422 __u8 pad[108];
423};
424
417#define KVMIO 0xAE 425#define KVMIO 0xAE
418 426
419/* 427/*
@@ -530,6 +538,8 @@ struct kvm_enable_cap {
530#ifdef __KVM_HAVE_XCRS 538#ifdef __KVM_HAVE_XCRS
531#define KVM_CAP_XCRS 56 539#define KVM_CAP_XCRS 56
532#endif 540#endif
541#define KVM_CAP_PPC_GET_PVINFO 57
542#define KVM_CAP_PPC_IRQ_LEVEL 58
533 543
534#ifdef KVM_CAP_IRQ_ROUTING 544#ifdef KVM_CAP_IRQ_ROUTING
535 545
@@ -664,6 +674,8 @@ struct kvm_clock_data {
664/* Available with KVM_CAP_PIT_STATE2 */ 674/* Available with KVM_CAP_PIT_STATE2 */
665#define KVM_GET_PIT2 _IOR(KVMIO, 0x9f, struct kvm_pit_state2) 675#define KVM_GET_PIT2 _IOR(KVMIO, 0x9f, struct kvm_pit_state2)
666#define KVM_SET_PIT2 _IOW(KVMIO, 0xa0, struct kvm_pit_state2) 676#define KVM_SET_PIT2 _IOW(KVMIO, 0xa0, struct kvm_pit_state2)
677/* Available with KVM_CAP_PPC_GET_PVINFO */
678#define KVM_PPC_GET_PVINFO _IOW(KVMIO, 0xa1, struct kvm_ppc_pvinfo)
667 679
668/* 680/*
669 * ioctls for vcpu fds 681 * ioctls for vcpu fds
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index ac740b26eb10..a0557422715e 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -36,9 +36,10 @@
36#define KVM_REQ_PENDING_TIMER 5 36#define KVM_REQ_PENDING_TIMER 5
37#define KVM_REQ_UNHALT 6 37#define KVM_REQ_UNHALT 6
38#define KVM_REQ_MMU_SYNC 7 38#define KVM_REQ_MMU_SYNC 7
39#define KVM_REQ_KVMCLOCK_UPDATE 8 39#define KVM_REQ_CLOCK_UPDATE 8
40#define KVM_REQ_KICK 9 40#define KVM_REQ_KICK 9
41#define KVM_REQ_DEACTIVATE_FPU 10 41#define KVM_REQ_DEACTIVATE_FPU 10
42#define KVM_REQ_EVENT 11
42 43
43#define KVM_USERSPACE_IRQ_SOURCE_ID 0 44#define KVM_USERSPACE_IRQ_SOURCE_ID 0
44 45
@@ -289,6 +290,9 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
289void kvm_disable_largepages(void); 290void kvm_disable_largepages(void);
290void kvm_arch_flush_shadow(struct kvm *kvm); 291void kvm_arch_flush_shadow(struct kvm *kvm);
291 292
293int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages,
294 int nr_pages);
295
292struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn); 296struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn);
293unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn); 297unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn);
294void kvm_release_page_clean(struct page *page); 298void kvm_release_page_clean(struct page *page);
@@ -296,6 +300,8 @@ void kvm_release_page_dirty(struct page *page);
296void kvm_set_page_dirty(struct page *page); 300void kvm_set_page_dirty(struct page *page);
297void kvm_set_page_accessed(struct page *page); 301void kvm_set_page_accessed(struct page *page);
298 302
303pfn_t hva_to_pfn_atomic(struct kvm *kvm, unsigned long addr);
304pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn);
299pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn); 305pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn);
300pfn_t gfn_to_pfn_memslot(struct kvm *kvm, 306pfn_t gfn_to_pfn_memslot(struct kvm *kvm,
301 struct kvm_memory_slot *slot, gfn_t gfn); 307 struct kvm_memory_slot *slot, gfn_t gfn);
@@ -477,8 +483,7 @@ int kvm_deassign_device(struct kvm *kvm,
477 struct kvm_assigned_dev_kernel *assigned_dev); 483 struct kvm_assigned_dev_kernel *assigned_dev);
478#else /* CONFIG_IOMMU_API */ 484#else /* CONFIG_IOMMU_API */
479static inline int kvm_iommu_map_pages(struct kvm *kvm, 485static inline int kvm_iommu_map_pages(struct kvm *kvm,
480 gfn_t base_gfn, 486 struct kvm_memory_slot *slot)
481 unsigned long npages)
482{ 487{
483 return 0; 488 return 0;
484} 489}
@@ -518,11 +523,22 @@ static inline void kvm_guest_exit(void)
518 current->flags &= ~PF_VCPU; 523 current->flags &= ~PF_VCPU;
519} 524}
520 525
526static inline unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot,
527 gfn_t gfn)
528{
529 return slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE;
530}
531
521static inline gpa_t gfn_to_gpa(gfn_t gfn) 532static inline gpa_t gfn_to_gpa(gfn_t gfn)
522{ 533{
523 return (gpa_t)gfn << PAGE_SHIFT; 534 return (gpa_t)gfn << PAGE_SHIFT;
524} 535}
525 536
537static inline gfn_t gpa_to_gfn(gpa_t gpa)
538{
539 return (gfn_t)(gpa >> PAGE_SHIFT);
540}
541
526static inline hpa_t pfn_to_hpa(pfn_t pfn) 542static inline hpa_t pfn_to_hpa(pfn_t pfn)
527{ 543{
528 return (hpa_t)pfn << PAGE_SHIFT; 544 return (hpa_t)pfn << PAGE_SHIFT;
diff --git a/include/linux/kvm_para.h b/include/linux/kvm_para.h
index d73109243fda..47a070b0520e 100644
--- a/include/linux/kvm_para.h
+++ b/include/linux/kvm_para.h
@@ -17,6 +17,8 @@
17 17
18#define KVM_HC_VAPIC_POLL_IRQ 1 18#define KVM_HC_VAPIC_POLL_IRQ 1
19#define KVM_HC_MMU_OP 2 19#define KVM_HC_MMU_OP 2
20#define KVM_HC_FEATURES 3
21#define KVM_HC_PPC_MAP_MAGIC_PAGE 4
20 22
21/* 23/*
22 * hypercalls use architecture specific 24 * hypercalls use architecture specific
@@ -24,11 +26,6 @@
24#include <asm/kvm_para.h> 26#include <asm/kvm_para.h>
25 27
26#ifdef __KERNEL__ 28#ifdef __KERNEL__
27#ifdef CONFIG_KVM_GUEST
28void __init kvm_guest_init(void);
29#else
30#define kvm_guest_init() do { } while (0)
31#endif
32 29
33static inline int kvm_para_has_feature(unsigned int feature) 30static inline int kvm_para_has_feature(unsigned int feature)
34{ 31{
diff --git a/mm/util.c b/mm/util.c
index 4735ea481816..73dac81e9f78 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -245,6 +245,19 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
245} 245}
246#endif 246#endif
247 247
248/*
249 * Like get_user_pages_fast() except its IRQ-safe in that it won't fall
250 * back to the regular GUP.
251 * If the architecture not support this fucntion, simply return with no
252 * page pinned
253 */
254int __attribute__((weak)) __get_user_pages_fast(unsigned long start,
255 int nr_pages, int write, struct page **pages)
256{
257 return 0;
258}
259EXPORT_SYMBOL_GPL(__get_user_pages_fast);
260
248/** 261/**
249 * get_user_pages_fast() - pin user pages in memory 262 * get_user_pages_fast() - pin user pages in memory
250 * @start: starting user address 263 * @start: starting user address
diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
index 369e38010ad5..8edca9141b78 100644
--- a/virt/kvm/irq_comm.c
+++ b/virt/kvm/irq_comm.c
@@ -17,7 +17,7 @@
17 * Authors: 17 * Authors:
18 * Yaozu (Eddie) Dong <Eddie.dong@intel.com> 18 * Yaozu (Eddie) Dong <Eddie.dong@intel.com>
19 * 19 *
20 * Copyright 2010 Red Hat, Inc. and/or its affilates. 20 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
21 */ 21 */
22 22
23#include <linux/kvm_host.h> 23#include <linux/kvm_host.h>
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 60e5e4612b0b..5225052aebc1 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -5,7 +5,7 @@
5 * machines without emulation or binary translation. 5 * machines without emulation or binary translation.
6 * 6 *
7 * Copyright (C) 2006 Qumranet, Inc. 7 * Copyright (C) 2006 Qumranet, Inc.
8 * Copyright 2010 Red Hat, Inc. and/or its affilates. 8 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
9 * 9 *
10 * Authors: 10 * Authors:
11 * Avi Kivity <avi@qumranet.com> 11 * Avi Kivity <avi@qumranet.com>
@@ -705,14 +705,12 @@ skip_lpage:
705 if (r) 705 if (r)
706 goto out_free; 706 goto out_free;
707 707
708#ifdef CONFIG_DMAR
709 /* map the pages in iommu page table */ 708 /* map the pages in iommu page table */
710 if (npages) { 709 if (npages) {
711 r = kvm_iommu_map_pages(kvm, &new); 710 r = kvm_iommu_map_pages(kvm, &new);
712 if (r) 711 if (r)
713 goto out_free; 712 goto out_free;
714 } 713 }
715#endif
716 714
717 r = -ENOMEM; 715 r = -ENOMEM;
718 slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); 716 slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
@@ -927,35 +925,46 @@ int memslot_id(struct kvm *kvm, gfn_t gfn)
927 return memslot - slots->memslots; 925 return memslot - slots->memslots;
928} 926}
929 927
930static unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot, gfn_t gfn) 928static unsigned long gfn_to_hva_many(struct kvm *kvm, gfn_t gfn,
931{ 929 gfn_t *nr_pages)
932 return slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE;
933}
934
935unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
936{ 930{
937 struct kvm_memory_slot *slot; 931 struct kvm_memory_slot *slot;
938 932
939 slot = gfn_to_memslot(kvm, gfn); 933 slot = gfn_to_memslot(kvm, gfn);
940 if (!slot || slot->flags & KVM_MEMSLOT_INVALID) 934 if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
941 return bad_hva(); 935 return bad_hva();
936
937 if (nr_pages)
938 *nr_pages = slot->npages - (gfn - slot->base_gfn);
939
942 return gfn_to_hva_memslot(slot, gfn); 940 return gfn_to_hva_memslot(slot, gfn);
943} 941}
942
943unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
944{
945 return gfn_to_hva_many(kvm, gfn, NULL);
946}
944EXPORT_SYMBOL_GPL(gfn_to_hva); 947EXPORT_SYMBOL_GPL(gfn_to_hva);
945 948
946static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr) 949static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic)
947{ 950{
948 struct page *page[1]; 951 struct page *page[1];
949 int npages; 952 int npages;
950 pfn_t pfn; 953 pfn_t pfn;
951 954
952 might_sleep(); 955 if (atomic)
953 956 npages = __get_user_pages_fast(addr, 1, 1, page);
954 npages = get_user_pages_fast(addr, 1, 1, page); 957 else {
958 might_sleep();
959 npages = get_user_pages_fast(addr, 1, 1, page);
960 }
955 961
956 if (unlikely(npages != 1)) { 962 if (unlikely(npages != 1)) {
957 struct vm_area_struct *vma; 963 struct vm_area_struct *vma;
958 964
965 if (atomic)
966 goto return_fault_page;
967
959 down_read(&current->mm->mmap_sem); 968 down_read(&current->mm->mmap_sem);
960 if (is_hwpoison_address(addr)) { 969 if (is_hwpoison_address(addr)) {
961 up_read(&current->mm->mmap_sem); 970 up_read(&current->mm->mmap_sem);
@@ -968,6 +977,7 @@ static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr)
968 if (vma == NULL || addr < vma->vm_start || 977 if (vma == NULL || addr < vma->vm_start ||
969 !(vma->vm_flags & VM_PFNMAP)) { 978 !(vma->vm_flags & VM_PFNMAP)) {
970 up_read(&current->mm->mmap_sem); 979 up_read(&current->mm->mmap_sem);
980return_fault_page:
971 get_page(fault_page); 981 get_page(fault_page);
972 return page_to_pfn(fault_page); 982 return page_to_pfn(fault_page);
973 } 983 }
@@ -981,7 +991,13 @@ static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr)
981 return pfn; 991 return pfn;
982} 992}
983 993
984pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn) 994pfn_t hva_to_pfn_atomic(struct kvm *kvm, unsigned long addr)
995{
996 return hva_to_pfn(kvm, addr, true);
997}
998EXPORT_SYMBOL_GPL(hva_to_pfn_atomic);
999
1000static pfn_t __gfn_to_pfn(struct kvm *kvm, gfn_t gfn, bool atomic)
985{ 1001{
986 unsigned long addr; 1002 unsigned long addr;
987 1003
@@ -991,7 +1007,18 @@ pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
991 return page_to_pfn(bad_page); 1007 return page_to_pfn(bad_page);
992 } 1008 }
993 1009
994 return hva_to_pfn(kvm, addr); 1010 return hva_to_pfn(kvm, addr, atomic);
1011}
1012
1013pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn)
1014{
1015 return __gfn_to_pfn(kvm, gfn, true);
1016}
1017EXPORT_SYMBOL_GPL(gfn_to_pfn_atomic);
1018
1019pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
1020{
1021 return __gfn_to_pfn(kvm, gfn, false);
995} 1022}
996EXPORT_SYMBOL_GPL(gfn_to_pfn); 1023EXPORT_SYMBOL_GPL(gfn_to_pfn);
997 1024
@@ -999,9 +1026,26 @@ pfn_t gfn_to_pfn_memslot(struct kvm *kvm,
999 struct kvm_memory_slot *slot, gfn_t gfn) 1026 struct kvm_memory_slot *slot, gfn_t gfn)
1000{ 1027{
1001 unsigned long addr = gfn_to_hva_memslot(slot, gfn); 1028 unsigned long addr = gfn_to_hva_memslot(slot, gfn);
1002 return hva_to_pfn(kvm, addr); 1029 return hva_to_pfn(kvm, addr, false);
1003} 1030}
1004 1031
1032int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages,
1033 int nr_pages)
1034{
1035 unsigned long addr;
1036 gfn_t entry;
1037
1038 addr = gfn_to_hva_many(kvm, gfn, &entry);
1039 if (kvm_is_error_hva(addr))
1040 return -1;
1041
1042 if (entry < nr_pages)
1043 return 0;
1044
1045 return __get_user_pages_fast(addr, nr_pages, 1, pages);
1046}
1047EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic);
1048
1005struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn) 1049struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
1006{ 1050{
1007 pfn_t pfn; 1051 pfn_t pfn;
@@ -1964,7 +2008,9 @@ static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val,
1964 case CPU_STARTING: 2008 case CPU_STARTING:
1965 printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n", 2009 printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n",
1966 cpu); 2010 cpu);
2011 spin_lock(&kvm_lock);
1967 hardware_enable(NULL); 2012 hardware_enable(NULL);
2013 spin_unlock(&kvm_lock);
1968 break; 2014 break;
1969 } 2015 }
1970 return NOTIFY_OK; 2016 return NOTIFY_OK;
@@ -1977,7 +2023,7 @@ asmlinkage void kvm_handle_fault_on_reboot(void)
1977 /* spin while reset goes on */ 2023 /* spin while reset goes on */
1978 local_irq_enable(); 2024 local_irq_enable();
1979 while (true) 2025 while (true)
1980 ; 2026 cpu_relax();
1981 } 2027 }
1982 /* Fault while not rebooting. We want the trace. */ 2028 /* Fault while not rebooting. We want the trace. */
1983 BUG(); 2029 BUG();
@@ -2171,8 +2217,10 @@ static int kvm_suspend(struct sys_device *dev, pm_message_t state)
2171 2217
2172static int kvm_resume(struct sys_device *dev) 2218static int kvm_resume(struct sys_device *dev)
2173{ 2219{
2174 if (kvm_usage_count) 2220 if (kvm_usage_count) {
2221 WARN_ON(spin_is_locked(&kvm_lock));
2175 hardware_enable(NULL); 2222 hardware_enable(NULL);
2223 }
2176 return 0; 2224 return 0;
2177} 2225}
2178 2226